// Forked from Mirrors/privacore-open-source-search-engine (C++ source, 8008 lines, 220 KiB).
//
|
|
// Matt Wells, copyright Sep 2001
|
|
//
|
|
|
|
#include "gb-include.h"
|
|
|
|
#include <sched.h> // clone()
|
|
// declare this stuff up here so we can call pread() in our seek test below
|
|
//
|
|
// maybe we should put this in a common header file so we don't have
|
|
// certain files compiled with the platform default, and some not -partap
|
|
|
|
#include "Version.h" // getVersion()
|
|
#include "Mem.h"
|
|
#include "Conf.h"
|
|
#include "JobScheduler.h"
|
|
#include "Hostdb.h"
|
|
#include "Posdb.h"
|
|
#include "Titledb.h"
|
|
#include "Tagdb.h"
|
|
#include "Spider.h"
|
|
#include "SpiderColl.h"
|
|
#include "SpiderLoop.h"
|
|
#include "Doledb.h"
|
|
#include "Clusterdb.h"
|
|
#include "Sections.h"
|
|
#include "Statsdb.h"
|
|
#include "UdpServer.h"
|
|
#include "PingServer.h"
|
|
#include "Repair.h"
|
|
#include "DailyMerge.h"
|
|
#include "MsgC.h"
|
|
#include "HttpServer.h"
|
|
#include "Loop.h"
|
|
#include "HighFrequencyTermShortcuts.h"
|
|
#include "IPAddressChecks.h"
|
|
#include <sys/resource.h> // setrlimit
|
|
#include "Stats.h"
|
|
#include "Statistics.h"
|
|
#include "Speller.h" // g_speller
|
|
#include "Wiki.h" // g_wiki
|
|
#include "Wiktionary.h" // g_wiktionary
|
|
#include "CountryCode.h"
|
|
#include "Pos.h"
|
|
#include "Title.h"
|
|
#include "Speller.h"
|
|
#include "SummaryCache.h"
|
|
|
|
// include all msgs that have request handlers, cuz we register them with g_udp
|
|
#include "Msg0.h"
|
|
#include "Msg1.h"
|
|
#include "Msg4.h"
|
|
#include "Msg13.h"
|
|
#include "Msg20.h"
|
|
#include "Msg22.h"
|
|
#include "Msg39.h"
|
|
#include "Msg40.h" // g_resultsCache
|
|
#include "Parms.h"
|
|
#include "Pages.h"
|
|
#include "PageInject.h"
|
|
#include "Unicode.h"
|
|
|
|
#include "Msg1f.h"
|
|
#include "Profiler.h"
|
|
#include "Blaster.h"
|
|
#include "Proxy.h"
|
|
|
|
#include "linkspam.h"
|
|
#include "Process.h"
|
|
#include "sort.h"
|
|
#include "RdbBuckets.h"
|
|
#include "SpiderProxy.h"
|
|
#include "HashTable.h"
|
|
#include <sys/stat.h> //umask()
|
|
|
|
// Message-handler registration entry points. Only the prototypes are
// visible here; the bodies live elsewhere.
bool registerMsgHandlers();
bool registerMsgHandlers1();
bool registerMsgHandlers2();
bool registerMsgHandlers3();

void rmTest();
|
|
|
|
static void dumpTitledb (const char *coll, int32_t sfn, int32_t numFiles, bool includeTree,
|
|
int64_t docId , bool justPrintDups );
|
|
static int32_t dumpSpiderdb ( const char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
|
|
char printStats , int32_t firstIp );
|
|
|
|
static void dumpTagdb( const char *coll, int32_t sfn, int32_t numFiles, bool includeTree, char rec = 0,
|
|
int32_t rdbId = RDB_TAGDB, const char *site = NULL );
|
|
|
|
void dumpPosdb ( const char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
|
|
int64_t termId , bool justVerify ) ;
|
|
static void dumpWaitingTree( const char *coll );
|
|
static void dumpDoledb ( const char *coll, int32_t sfn, int32_t numFiles, bool includeTree);
|
|
|
|
void dumpClusterdb ( const char *coll,int32_t sfn,int32_t numFiles,bool includeTree);
|
|
|
|
//void dumpStatsdb ( int32_t startFileNum, int32_t numFiles, bool includeTree,
|
|
// int test );
|
|
|
|
void dumpLinkdb ( const char *coll, int32_t sfn, int32_t numFiles, bool includeTree,
|
|
const char *url );
|
|
|
|
// File and directory helpers.
int copyFiles(const char *dstDir);
const char *getcwd2(char *arg);
static int32_t checkDirPerms(char *dir);

// Benchmarks RdbTree::addRecord() for indexdb.
bool treetest();
bool hashtest();

// Measures how fast the content of the given docId can be parsed.
bool parseTest(const char *coll, int64_t docId, const char *query);
bool summaryTest1(char *rec, int32_t listSize, const char *coll,
		  int64_t docId, const char *query);

// Times a big write, a read and then seeks.
bool thrutest(char *testdir, int64_t fileSize);
void seektest(const char *testdir, int32_t numThreads, int32_t maxReadSize,
	      const char *filename);

// Remaining stand-alone tests and tools.
bool pingTest(int32_t hid, uint16_t clientPort);
bool memTest();
bool cacheTest();
bool ramdiskTest();
void countdomains(const char *coll, int32_t numRecs, int32_t verb,
		  int32_t output);
|
|
|
|
// File-local forwarder: simply calls wakeupPollLoop() on the global g_loop.
static void wakeupPollLoop ( void )
{
	g_loop.wakeupPollLoop();
}
|
|
|
|
static UdpProtocol g_dp; // default protocol object; its address is handed to g_udpServer.init() and g_proxy.initProxy() in main2()
|
|
|
|
// installFlag konstants: the value passed as the installFlag argument of
// install(). Enumerators are numbered consecutively starting at 1.
typedef enum {
	ifk_install       = 1,
	ifk_installgb     = 2,
	ifk_installconf   = 3,
	ifk_dsh           = 4,
	ifk_dsh2          = 5,
	ifk_backupcopy    = 6,
	ifk_backupmove    = 7,
	ifk_backuprestore = 8,
	ifk_installconf2  = 9,
	ifk_start         = 10,
	ifk_tmpstart      = 11,
	ifk_installtmpgb  = 12,
	ifk_proxy_start   = 13
} install_flag_konst_t;
|
|
|
|
static int install_file(const char *file);
|
|
static int install ( install_flag_konst_t installFlag, int32_t hostId, char *dir = NULL,
|
|
int32_t hostId2 = -1, char *cmd = NULL );
|
|
int scale ( char *newhostsconf , bool useShotgunIp );
|
|
int collinject ( char *newhostsconf );
|
|
int collcopy ( char *newHostsConf , char *coll , int32_t collnum ) ;
|
|
|
|
bool doCmd ( const char *cmd , int32_t hostId , const char *filename , bool sendToHosts,
|
|
bool sendToProxies, int32_t hostId2=-1 );
|
|
int injectFile ( const char *filename , char *ips , const char *coll );
|
|
int injectFileTest ( int32_t reqLen , int32_t hid ); // generates the file
|
|
void membustest ( int32_t nb , int32_t loops , bool readf ) ;
|
|
|
|
//void tryMergingWrapper ( int fd , void *state ) ;
|
|
|
|
// Callback taking an fd and an opaque state pointer.
void saveRdbs(int fd, void *state);

//void resetAll ( );
//void spamTest ( ) ;

// Reset hooks implemented in other translation units.
extern void resetPageAddUrl();
extern void resetHttpMime();
extern void reset_iana_charset();
extern void resetAdultBit();
extern void resetDomains();
extern void resetEntities();
extern void resetQuery();
extern void resetStopWords();
extern void resetUnicode();

extern void tryToSyncWrapper(int fd, void *state);

// Does the actual startup work for main(); main() treats a non-zero
// return as failure.
int main2(int argc, char *argv[]);
|
|
|
|
int main ( int argc , char *argv[] ) {
|
|
int ret = main2 ( argc , argv );
|
|
|
|
// returns 1 if failed, 0 on successful/graceful exit
|
|
if ( ret ) {
|
|
fprintf( stderr, "Failed to start gb. Exiting.\n" );
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
int main2 ( int argc , char *argv[] ) {
|
|
g_conf.m_runAsDaemon = false;
|
|
g_conf.m_logToFile = false;
|
|
|
|
// appears that linux 2.4.17 kernel would crash with this?
|
|
// let's try again on gk127 to make sure
|
|
// YES! gk0 cluster has run for months with this just fine!!
|
|
mlockall(MCL_CURRENT|MCL_FUTURE);
|
|
|
|
// record time for uptime
|
|
g_stats.m_uptimeStart = time(NULL);
|
|
|
|
if (argc < 0) {
|
|
printHelp:
|
|
SafeBuf sb;
|
|
sb.safePrintf(
|
|
"\n"
|
|
"Usage: gb [CMD]\n");
|
|
sb.safePrintf(
|
|
"\n"
|
|
"\tgb will first try to load "
|
|
"the hosts.conf in the same directory as the "
|
|
"gb binary. "
|
|
"Then it will determine its hostId based on "
|
|
"the directory and IP address listed in the "
|
|
"hosts.conf file it loaded. Things in []'s "
|
|
"are optional.");
|
|
sb.safePrintf(
|
|
"[CMD] can have the following values:\n\n"
|
|
|
|
"-h\tPrint this help.\n\n"
|
|
"-v\tPrint version and exit.\n\n"
|
|
|
|
//"<hostId>\n"
|
|
//"\tstart the gb process for this <hostId> locally."
|
|
//" <hostId> is 0 to run as host #0, for instance."
|
|
//"\n\n"
|
|
|
|
|
|
//"<hostId> -d\n\trun as daemon.\n\n"
|
|
"-d\tRun as daemon.\n\n"
|
|
|
|
//"-o\tprint the overview documentation in HTML. "
|
|
//"Contains the format of hosts.conf.\n\n"
|
|
|
|
// "<hostId> -r\n\tindicates recovery mode, "
|
|
// "sends email to addresses "
|
|
// "specified in Conf.h upon startup.\n\n"
|
|
// "-r\tindicates recovery mode, "
|
|
// "sends email to addresses "
|
|
// "specified in Conf.h upon startup.\n\n"
|
|
|
|
"start [hostId]\n"
|
|
"\tStart the gb process on all hosts or just on "
|
|
"[hostId], if specified, using an ssh command. Runs "
|
|
"each gb process in a keepalive loop under bash.\n\n"
|
|
|
|
"start <hostId1-hostId2>\n"
|
|
"\tLike above but just start gb on the supplied "
|
|
"range of hostIds.\n\n"
|
|
|
|
"stop [hostId]\n"
|
|
"\tSaves and exits for all gb hosts or "
|
|
"just on [hostId], if specified.\n\n"
|
|
|
|
"stop <hostId1-hostId2>\n"
|
|
"\tTell gb to save and exit on the given range of "
|
|
"hostIds.\n\n"
|
|
|
|
"save [hostId]\n"
|
|
"\tJust saves for all gb hosts or "
|
|
"just on [hostId], if specified.\n\n"
|
|
|
|
|
|
/*
|
|
"tmpstart [hostId]\n"
|
|
"\tstart the gb process on all hosts or just on "
|
|
"[hostId] if specified, but "
|
|
"use the ports specified in hosts.conf PLUS one. "
|
|
"Then you can switch the "
|
|
"proxy over to point to those and upgrade the "
|
|
"original cluster's gb. "
|
|
"That can be done in the Master Controls of the "
|
|
"proxy using the 'use "
|
|
"temporary cluster'. Also, this assumes the binary "
|
|
"name is tmpgb not gb.\n\n"
|
|
|
|
"tmpstop [hostId]\n"
|
|
"\tsaves and exits for all gb hosts or "
|
|
"just on [hostId] if specified, for the "
|
|
"tmpstart command.\n\n"
|
|
*/
|
|
|
|
"spidersoff [hostId]\n"
|
|
"\tDisables spidering for all gb hosts or "
|
|
"just on [hostId], if specified.\n\n"
|
|
|
|
"spiderson [hostId]\n"
|
|
"\tEnables spidering for all gb hosts or "
|
|
"just on [hostId], if specified.\n\n"
|
|
|
|
/*
|
|
"cacheoff [hostId]\n"
|
|
"\tdisables all disk PAGE caches on all hosts or "
|
|
"just on [hostId] if specified.\n\n"
|
|
|
|
"freecache [maxShmid]\n"
|
|
"\tfinds and frees all shared memory up to shmid "
|
|
"maxShmid, default is 3000000.\n\n"
|
|
*/
|
|
|
|
/*
|
|
"ddump [hostId]\n"
|
|
"\tdump all b-trees in memory to sorted files on "
|
|
"disk. "
|
|
"Will likely trigger merges on files on disk. "
|
|
"Restrict to just host [hostId] if given.\n\n"
|
|
*/
|
|
|
|
/*
|
|
"pmerge [hostId|hostId1-hostId2]\n"
|
|
"\tforce merge of posdb files "
|
|
"just on [hostId] if specified.\n\n"
|
|
|
|
"smerge [hostId|hostId1-hostId2]\n"
|
|
"\tforce merge of sectiondb files "
|
|
"just on [hostId] if specified.\n\n"
|
|
|
|
"tmerge [hostId|hostId1-hostId2]\n"
|
|
"\tforce merge of titledb files "
|
|
"just on [hostId] if specified.\n\n"
|
|
|
|
"merge [hostId|hostId1-hostId2]\n"
|
|
"\tforce merge of all rdb files "
|
|
"just on [hostId] if specified.\n\n"
|
|
*/
|
|
|
|
"dsh <CMD>\n"
|
|
"\tRun this command on the primary IPs of "
|
|
"all active hosts in hosts.conf. It will be "
|
|
"executed in the gigablast working directory on "
|
|
"each host. Example: "
|
|
"gb dsh 'ps auxw; uptime'\n\n"
|
|
|
|
/*
|
|
"dsh2 <CMD>\n"
|
|
"\trun this command on the secondary IPs of "
|
|
"all active hosts in hosts.conf. Example: "
|
|
"gb dsh2 'ps auxw; uptime'\n\n"
|
|
*/
|
|
|
|
"install [hostId]\n"
|
|
"\tInstall all required files for gb from "
|
|
"current working directory of the gb binary "
|
|
"to [hostId]. If no [hostId] is specified, install "
|
|
"to ALL hosts.\n\n"
|
|
|
|
/*
|
|
"install2 [hostId]\n"
|
|
"\tlike above, but use the secondary IPs in the "
|
|
"hosts.conf.\n\n"
|
|
*/
|
|
|
|
"installgb [hostId]\n"
|
|
"\tLike above, but install just the gb executable.\n\n"
|
|
|
|
"installfile <file>\n"
|
|
"\tInstalls the specified file on all hosts\n\n"
|
|
|
|
/*
|
|
"installtmpgb [hostId]\n"
|
|
"\tlike above, but install just the gb executable "
|
|
"as tmpgb (for tmpstart).\n\n"
|
|
*/
|
|
|
|
"installconf [hostId]\n"
|
|
"\tlike above, but install hosts.conf and gb.conf\n\n"
|
|
/*
|
|
"installconf2 [hostId]\n"
|
|
"\tlike above, but install hosts.conf and gbN.conf "
|
|
"to the secondary IPs.\n\n"
|
|
|
|
"backupcopy <backupSubdir>\n"
|
|
"\tsave a copy of all xml, config, data and map files "
|
|
"into <backupSubdir> which is relative "
|
|
"to the working dir. Done for all hosts.\n\n"
|
|
|
|
"backupmove <backupSubdir>\n"
|
|
"\tmove all all xml, config, data and map files "
|
|
"into <backupSubdir> which is relative "
|
|
"to the working dir. Done for all hosts.\n\n"
|
|
|
|
"backuprestore <backupSubdir>\n"
|
|
"\tmove all all xml, config, data and map files "
|
|
"in <backupSubdir>, which is relative "
|
|
"to the working dir, into the working dir. "
|
|
"Will NOT overwrite anything. Done for all "
|
|
"hosts.\n\n"
|
|
|
|
"proxy start [proxyId]\n"
|
|
"\tStart a proxy that acts as a frontend to gb "
|
|
"and passes on requests to random machines on "
|
|
"the cluster given in hosts.conf. Helps to "
|
|
"distribute the load evenly across all machines.\n\n"
|
|
|
|
"proxy load <proxyId>\n"
|
|
"\tStart a proxy process directly without calling "
|
|
"ssh. Called by 'gb proxy start'.\n\n"
|
|
|
|
"proxy stop [proxyId]\n"
|
|
"\tStop a proxy that acts as a frontend to gb.\n\n"
|
|
|
|
"blasterdiff [-v] [-j] [-p] <file1> <file2> "
|
|
"<maxNumThreads> <wait>\n"
|
|
"\tcompare search results between urls in file1 and"
|
|
"file2 and output the search results in the url"
|
|
" from file1 not found in the url from file2 "
|
|
"maxNumThreads is the number of concurrent "
|
|
"comparisons "
|
|
"that should be done at one time and wait is the"
|
|
"time to wait between comparisons. -v is for "
|
|
"verbose "
|
|
" and -j is to just display links not found and "
|
|
"not "
|
|
"search for them on server2. If you do not want to"
|
|
" use the proxy server "
|
|
"on gk10, use -p\n\n"
|
|
*/
|
|
|
|
/*
|
|
"blaster [-l|-u|-i] <file> <maxNumThreads> <wait>\n"
|
|
"\tget documents from the urls given in file. The "
|
|
"-l argument is to "
|
|
"automatically get documents "
|
|
"from the gigablast log file.\n"
|
|
"\t-u means to inject/index the url into gb.\n"
|
|
"\t-i means to inject/index the url into gb AND "
|
|
"add all of its outlinks to\n"
|
|
"\tspiderdb for spidering, "
|
|
"which also entails a DNS lookup on each outlink.\n"
|
|
"\tmaxNumThreads is the"
|
|
" number of concurrent threads at one time and wait "
|
|
" is the time to wait between threads.\n\n"
|
|
*/
|
|
|
|
/*
|
|
"scale <newHosts.conf>\n"
|
|
"\tGenerate a script to be called to migrate the "
|
|
"data to the new places. Remaining hosts will "
|
|
"keep the data they have, but it will be "
|
|
"filtered during the next merge operations.\n\n"
|
|
|
|
"collcopy <newHosts.conf> <coll> <collnum>\n"
|
|
"\tGenerate a script to copy the collection data on "
|
|
"the cluster defined by newHosts.conf to the "
|
|
"current cluster. Remote network must have "
|
|
"called \"gb ddump\" twice in a row just before to "
|
|
"ensure all of its data is on disk.\n\n"
|
|
*/
|
|
|
|
|
|
// gb inject <file> <ip:port> [startdocid]
|
|
// gb inject titledb <newhosts.conf> [startdocid]
|
|
"inject <filename> "
|
|
"<ip:port> [collection]\n"
|
|
"\tInject all documents in <filename> into the gb "
|
|
"host at ip:port. File must be in WARC format. "
|
|
"Uses collection of 'main' if not specified. If "
|
|
"ip:port is a hosts.conf file then a round-robin "
|
|
"approach will be used."
|
|
// "Each document listed in the file "
|
|
// "must be preceeded by a valid HTTP mime with "
|
|
// "a Content-Length: field. WARC files are also ok."
|
|
"\n\n"
|
|
|
|
/*
|
|
"inject titledb-<DIR> <newhosts.conf> [startdocid]\n"
|
|
"\tInject all pages from all the titledb "
|
|
"files in the <DIR> directory into the appropriate "
|
|
"host defined by the newhosts.conf config file. This "
|
|
"is useful for populating one search engine with "
|
|
"another. "
|
|
"\n\n"
|
|
|
|
"injecttest <requestLen> [hostId]\n"
|
|
"\tinject random documents into [hostId]. If [hostId] "
|
|
"not given 0 is assumed.\n\n"
|
|
|
|
"ping <hostId> [clientport]\n"
|
|
"\tperforms pings to <hostId>. [clientport] defaults "
|
|
"to 2050.\n\n"
|
|
*/
|
|
|
|
/*
|
|
"spellcheck <file>\n"
|
|
"\tspellchecks the the queries in <file>.\n\n"
|
|
|
|
"dictlookuptest <file>\n"
|
|
"\tgets the popularities of the entries in the "
|
|
"<file>. Used to only check performance of "
|
|
"getPhrasePopularity.\n\n"
|
|
|
|
// less common things
|
|
"gendict <coll> [numWordsToDump]\n\tgenerate "
|
|
"dictionary used for spellchecker "
|
|
"from titledb files in collection <coll>. Use "
|
|
"first [numWordsToDump] words.\n\n"
|
|
|
|
//"update\tupdate titledb0001.dat\n\n"
|
|
"treetest\n\ttree insertion speed test\n\n"
|
|
|
|
"hashtest\n\tadd and delete into hashtable test\n\n"
|
|
|
|
"parsetest <docIdToTest> [coll] [query]\n\t"
|
|
"parser speed tests\n\n"
|
|
*/
|
|
|
|
/*
|
|
"thrutest [dir] [fileSize]\n\tdisk write/read speed "
|
|
"test\n\n"
|
|
|
|
"seektest [dir] [numThreads] [maxReadSize] "
|
|
"[filename]\n"
|
|
"\tdisk seek speed test\n\n"
|
|
|
|
"memtest\n"
|
|
"\t Test how much memory we can use\n\n"
|
|
*/
|
|
|
|
/*
|
|
// Quality Tests
|
|
"countdomains <coll> <X>\n"
|
|
"\tCounts the domains and IPs in collection coll and "
|
|
"in the first X titledb records. Results are sorted"
|
|
"by popularity and stored in the log file. \n\n"
|
|
|
|
"cachetest\n\t"
|
|
"cache stability and speed tests\n\n"
|
|
|
|
"ramdisktest\n\t"
|
|
"test ramdisk functionality\n\n"
|
|
|
|
"dump e <coll> <UTCtimestamp>\n\tdump all events "
|
|
"as if the time is UTCtimestamp.\n\n"
|
|
|
|
"dump es <coll> <UTCtimestamp>\n\tdump stats for "
|
|
"all events as if the time is UTCtimestamp.\n\n"
|
|
*/
|
|
|
|
"dump <db> <collection>\n\tDump a db from disk. "
|
|
"Example: gb dump t main\n"
|
|
"\t<collection> is the name of the collection.\n"
|
|
|
|
"\t<db> is s to dump spiderdb."
|
|
//"set [T] to 1 to print "
|
|
//"new stats. 2 to print old stats. "
|
|
//"T is ip of firstip."
|
|
"\n"
|
|
|
|
"\t<db> is t to dump titledb. "
|
|
//"\tT is the first docId to dump. Applies only to "
|
|
//"titledb. "
|
|
"\n"
|
|
|
|
"\t<db> is p to dump posdb (the index)."
|
|
//"\tOptional: T is the termid to dump."
|
|
"\n"
|
|
|
|
"\t<db> is D to dump duplicate docids in titledb.\n"
|
|
"\t<db> is S to dump tagdb.\n"
|
|
"\t<db> is W to dump tagdb for wget.\n"
|
|
"\t<db> is x to dump doledb.\n"
|
|
"\t<db> is w to dump waiting tree.\n"
|
|
"\t<db> is l to dump clusterdb.\n"
|
|
"\t<db> is z to dump statsdb all keys.\n"
|
|
"\t<db> is Z to dump statsdb all keys and "
|
|
"data samples.\n"
|
|
"\t<db> is L to dump linkdb.\n"
|
|
);
|
|
SafeBuf sb2;
|
|
sb2.brify2 ( sb.getBufStart() , 60 , "\n\t" , false );
|
|
sb2.safeMemcpy("",1);
|
|
fprintf(stdout,"%s",sb2.getBufStart());
|
|
// disable printing of used memory
|
|
//g_mem.m_used = 0;
|
|
return 0;
|
|
}
|
|
|
|
int32_t cmdarg = 0;
|
|
|
|
// get command
|
|
|
|
// it might not be there, might be a simple "./gb"
|
|
const char *cmd = "";
|
|
if ( argc >= 2 ) {
|
|
cmdarg = 1;
|
|
cmd = argv[1];
|
|
}
|
|
|
|
const char *cmd2 = "";
|
|
if ( argc >= 3 )
|
|
cmd2 = argv[2];
|
|
|
|
int arch = 64;
|
|
if ( sizeof(char *) == 4 ) arch = 32;
|
|
|
|
// help
|
|
if ( strcmp ( cmd , "-h" ) == 0 ) {
|
|
goto printHelp;
|
|
}
|
|
|
|
// version
|
|
if ( strcmp ( cmd , "-v" ) == 0 ) {
|
|
printVersion();
|
|
return 0;
|
|
}
|
|
|
|
//send an email on startup for -r, like if we are recovering from an
|
|
//unclean shutdown.
|
|
g_recoveryMode = false;
|
|
const char *cc = NULL;
|
|
if ( strncmp ( cmd , "-r" ,2 ) == 0 ) cc = cmd;
|
|
if ( strncmp ( cmd2 , "-r",2 ) == 0 ) cc = cmd2;
|
|
if ( cc ) {
|
|
g_recoveryMode = true;
|
|
g_recoveryLevel = 1;
|
|
if ( cc[2] ) g_recoveryLevel = atoi(cc+2);
|
|
if ( g_recoveryLevel < 0 ) g_recoveryLevel = 0;
|
|
}
|
|
|
|
// run as daemon? then we have to fork
|
|
if ( ( strcmp ( cmd , "-d" ) == 0 ) || ( strcmp ( cmd2 , "-d" ) == 0 ) ) {
|
|
g_conf.m_runAsDaemon = true;
|
|
}
|
|
|
|
if ( ( strcmp ( cmd , "-l" ) == 0 ) || ( strcmp ( cmd2 , "-l" ) == 0 ) ) {
|
|
g_conf.m_logToFile = true;
|
|
}
|
|
|
|
if( (strcmp( cmd, "countdomains" ) == 0) && (argc >= (cmdarg + 2)) ) {
|
|
uint32_t tmp = atoi( argv[cmdarg+2] );
|
|
if( (tmp * 10) > g_mem.getMemTableSize() )
|
|
g_mem.setMemTableSize(tmp * 10);
|
|
}
|
|
|
|
// these tests do not need a hosts.conf
|
|
if ( strcmp ( cmd , "treetest" ) == 0 ) {
|
|
if ( argc > cmdarg+1 ) goto printHelp;
|
|
treetest();
|
|
return 0;
|
|
}
|
|
// these tests do not need a hosts.conf
|
|
if ( strcmp ( cmd , "hashtest" ) == 0 ) {
|
|
if ( argc > cmdarg+1 ) goto printHelp;
|
|
hashtest();
|
|
return 0;
|
|
}
|
|
// these tests do not need a hosts.conf
|
|
if ( strcmp ( cmd , "memtest" ) == 0 ) {
|
|
if ( argc > cmdarg+1 ) goto printHelp;
|
|
memTest();
|
|
return 0;
|
|
}
|
|
if ( strcmp ( cmd , "cachetest" ) == 0 ) {
|
|
if ( argc > cmdarg+1 ) goto printHelp;
|
|
cacheTest();
|
|
return 0;
|
|
}
|
|
if ( strcmp ( cmd , "ramdisktest" ) == 0 ) {
|
|
if ( argc > cmdarg+1 ) goto printHelp;
|
|
ramdiskTest();
|
|
return 0;
|
|
}
|
|
if ( strcmp ( cmd , "parsetest" ) == 0 ) {
|
|
if ( cmdarg+1 >= argc ) goto printHelp;
|
|
// load up hosts.conf
|
|
//if ( ! g_hostdb.init(hostId) ) {
|
|
// log("db: hostdb init failed." ); return 1; }
|
|
// init our table for doing zobrist hashing
|
|
if ( ! hashinit() ) {
|
|
log("db: Failed to init hashtable." ); return 1; }
|
|
|
|
int64_t docid = atoll1(argv[cmdarg+1]);
|
|
const char *coll = "";
|
|
const char *query = "";
|
|
if ( cmdarg+3 <= argc ) coll = argv[cmdarg+2];
|
|
if ( cmdarg+4 == argc ) query = argv[cmdarg+3];
|
|
parseTest( coll, docid, query );
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
if ( strcmp ( cmd , "querytest" ) == 0){
|
|
if ( ! g_hostdb.init(hostsConf, hostId) ) {
|
|
log("db: hostdb init failed." ); return 1; }
|
|
// init our table for doing zobrist hashing
|
|
if ( ! hashinit() ) {
|
|
log("db: Failed to init hashtable." ); return 1; }
|
|
if (!ucInit(g_hostdb.m_dir)) {
|
|
log("Unicode initialization failed!");
|
|
return 1;
|
|
}
|
|
queryTest();
|
|
return 0;
|
|
|
|
}
|
|
*/
|
|
|
|
if ( strcmp ( cmd ,"isportinuse") == 0 ) {
|
|
if ( cmdarg+1 >= argc ) goto printHelp;
|
|
int port = atol ( argv[cmdarg+1] );
|
|
// make sure port is available. returns false if in use.
|
|
if ( ! g_httpServer.m_tcp.testBind(port,false) )
|
|
// and we should return with 1 so the keep alive
|
|
// script will exit
|
|
exit (1);
|
|
// port is not in use, return 0
|
|
exit(0);
|
|
}
|
|
|
|
// need threads here for tests?
|
|
|
|
// gb thrutest <testDir> <fileSize>
|
|
if ( strcmp ( cmd , "thrutest" ) == 0 ) {
|
|
if ( cmdarg+2 >= argc ) goto printHelp;
|
|
char *testdir = argv[cmdarg+1];
|
|
int64_t fileSize = atoll1 ( argv[cmdarg+2] );
|
|
thrutest ( testdir , fileSize );
|
|
return 0;
|
|
}
|
|
// gb seektest <testdir> <numThreads> <maxReadSize>
|
|
if ( strcmp ( cmd , "seektest" ) == 0 ) {
|
|
const char *testdir = "/tmp/";
|
|
int32_t numThreads = 20; //30;
|
|
int64_t maxReadSize = 20000;
|
|
char *filename = NULL;
|
|
if ( cmdarg+1 < argc ) testdir = argv[cmdarg+1];
|
|
if ( cmdarg+2 < argc ) numThreads = atol(argv[cmdarg+2]);
|
|
if ( cmdarg+3 < argc ) maxReadSize = atoll1(argv[cmdarg+3]);
|
|
if ( cmdarg+4 < argc ) filename = argv[cmdarg+4];
|
|
seektest ( testdir , numThreads , maxReadSize , filename );
|
|
return 0;
|
|
}
|
|
|
|
// note the stack size for debug purposes
|
|
struct rlimit rl;
|
|
getrlimit(RLIMIT_STACK, &rl);
|
|
log(LOG_INFO,"db: Stack size is %" PRId64".", (int64_t)rl.rlim_cur);
|
|
|
|
|
|
struct rlimit lim;
|
|
// limit fds
|
|
// try to prevent core from systems where it is above 1024
|
|
// because our FD_ISSET() libc function will core! (it's older)
|
|
int32_t NOFILE = 1024;
|
|
lim.rlim_cur = lim.rlim_max = NOFILE;
|
|
if ( setrlimit(RLIMIT_NOFILE,&lim)) {
|
|
log("db: setrlimit RLIMIT_NOFILE %" PRId32": %s.",
|
|
NOFILE,mstrerror(errno) );
|
|
}
|
|
|
|
struct rlimit rlim;
|
|
getrlimit ( RLIMIT_NOFILE,&rlim);
|
|
if ( (int32_t)rlim.rlim_max > NOFILE || (int32_t)rlim.rlim_cur > NOFILE ) {
|
|
log("db: setrlimit RLIMIT_NOFILE failed!");
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
|
|
// set the s_pages array for print admin pages
|
|
g_pages.init ( );
|
|
|
|
bool isProxy = false;
|
|
if ( strcmp( cmd , "proxy" ) == 0 && strcmp( argv[cmdarg+1] , "load" ) == 0 ) {
|
|
isProxy = true;
|
|
}
|
|
|
|
// this is just like starting up a gb process, but we add one to
|
|
// each port, we are a dummy machine in the dummy cluster.
|
|
// gb -w <workingdir> tmpstart [hostId]
|
|
char useTmpCluster = 0;
|
|
if ( strcmp ( cmd , "tmpstart" ) == 0 ) {
|
|
useTmpCluster = 1;
|
|
}
|
|
|
|
// gb -w <workingdir> tmpstop [hostId]
|
|
if ( strcmp ( cmd , "tmpstop" ) == 0 ) {
|
|
useTmpCluster = 1;
|
|
}
|
|
|
|
// gb -w <workingdir> tmpstarthost
|
|
if ( strcmp ( cmd , "tmpstarthost" ) == 0 ) {
|
|
useTmpCluster = 1;
|
|
}
|
|
|
|
// gb inject <file> <ip:port> [startdocid]
|
|
// gb inject titledb-coll.main.0 <newhosts.conf> [startdocid]
|
|
// gb inject titledb-somedir <newhosts.conf> [startdocid]
|
|
// gb inject titledb-coll.foobar.5 <newhosts.conf> [startdocid]
|
|
if ( strcmp ( cmd , "inject" ) == 0 ) {
|
|
if ( argc != cmdarg+3 &&
|
|
argc != cmdarg+4 &&
|
|
argc != cmdarg+5 )
|
|
goto printHelp;
|
|
char *file = argv[cmdarg+1];
|
|
char *ips = argv[cmdarg+2];
|
|
char *coll = argv[cmdarg+3];
|
|
// int64_t startDocId = 0LL;
|
|
// int64_t endDocId = DOCID_MASK;
|
|
// if ( cmdarg+3 < argc ) startDocId = atoll(argv[cmdarg+3]);
|
|
// if ( cmdarg+4 < argc ) endDocId = atoll(argv[cmdarg+4]);
|
|
//injectFile ( file , ips , startDocId , endDocId , false );
|
|
injectFile ( file , ips , coll );
|
|
return 0;
|
|
}
|
|
|
|
//
|
|
// get the current working dir that the gb binary is in. all the data
// files should be in there too!!
|
|
const char *workingDir = getcwd2 ( argv[0] );
|
|
if ( ! workingDir ) {
|
|
fprintf(stderr,"could not get working dir. Exiting.\n");
|
|
return 1;
|
|
}
|
|
|
|
//log("host: working directory is %s",workingDir);
|
|
|
|
//initialize IP address checks
|
|
initialize_ip_address_checks();
|
|
|
|
// load up hosts.conf
|
|
// . it will determine our hostid based on the directory path of this
|
|
// gb binary and the ip address of this server
|
|
if ( ! g_hostdb.init(-1, NULL, isProxy, useTmpCluster, workingDir)) {
|
|
log( LOG_ERROR, "db: hostdb init failed." );
|
|
return 1;
|
|
}
|
|
|
|
Host *h9 = g_hostdb.m_myHost;
|
|
|
|
// set the clock file name so gettimeofdayInMillisecondsGlobal()
// sees g_clockInSync as true... unless clockadjust.dat is more
// than 2 days old, in which case it does not!
|
|
if ( g_hostdb.m_myHost->m_hostId != 0 ) {
|
|
// host #0 does not need this, everyone syncs with him
|
|
setTimeAdjustmentFilename(g_hostdb.m_dir , "clockadjust.dat");
|
|
|
|
// might as well load it i guess
|
|
loadTimeAdjustment();
|
|
}
|
|
|
|
// init our table for doing zobrist hashing
|
|
if ( ! hashinit() ) {
|
|
log( LOG_ERROR, "db: Failed to init hashtable." );
|
|
return 1;
|
|
}
|
|
|
|
// . hashinit() calls srand() w/ a fixed number
|
|
// . let's mix it up again
|
|
srand ( time(NULL) );
|
|
|
|
// do not save conf if any core dump occurs starting here
|
|
// down to where we set this back to true
|
|
g_conf.m_save = false;
|
|
|
|
//Put this here so that now we can log messages
|
|
if ( strcmp ( cmd , "proxy" ) == 0 ) {
|
|
if (argc < 3){
|
|
goto printHelp;
|
|
}
|
|
|
|
int32_t proxyId = -1;
|
|
if ( cmdarg+2 < argc ) proxyId = atoi ( argv[cmdarg+2] );
|
|
|
|
if ( strcmp ( argv[cmdarg+1] , "start" ) == 0 ) {
|
|
return install ( ifk_proxy_start , proxyId );
|
|
}
|
|
else if ( strcmp ( argv[cmdarg+1] , "stop" ) == 0 ) {
|
|
g_proxy.m_proxyRunning = true;
|
|
return doCmd ( "save=1" , proxyId , "master" , false, true );
|
|
}
|
|
|
|
else if ( strcmp ( argv[cmdarg+1] , "replacehost" ) == 0 ) {
|
|
g_proxy.m_proxyRunning = true;
|
|
int32_t hostId = -1;
|
|
int32_t spareId = -1;
|
|
if ( cmdarg + 2 < argc )
|
|
hostId = atoi ( argv[cmdarg+2] );
|
|
if ( cmdarg + 2 < argc )
|
|
spareId = atoi ( argv[cmdarg+3] );
|
|
char replaceCmd[256];
|
|
sprintf(replaceCmd, "replacehost=1&rhost=%" PRId32"&rspare=%" PRId32, hostId, spareId);
|
|
return doCmd ( replaceCmd, -1, "admin/hosts", false, true);
|
|
}
|
|
|
|
else if ( proxyId == -1 || strcmp ( argv[cmdarg+1] , "load" ) != 0 ) {
|
|
goto printHelp;
|
|
}
|
|
|
|
Host *h = g_hostdb.getProxy( proxyId );
|
|
uint16_t httpPort = h->m_httpPort;
|
|
uint16_t httpsPort = h->m_httpsPort;
|
|
//we need udpserver for addurl and udpserver2 for pingserver
|
|
uint16_t udpPort = h->m_port;
|
|
|
|
if ( ! g_conf.init ( h->m_dir ) ) { // , h->m_hostId ) ) {
|
|
log( LOG_ERROR, "db: Conf init failed." );
|
|
return 1;
|
|
}
|
|
|
|
// init the loop before g_process since g_process
|
|
// registers a sleep callback!
|
|
if ( ! g_loop.init() ) {
|
|
log( LOG_ERROR, "db: Loop init failed." );
|
|
return 1;
|
|
}
|
|
|
|
//if ( ! g_jobScheduler.initialize() ) {
|
|
// log("db: Threads init failed." ); return 1; }
|
|
|
|
g_process.init();
|
|
|
|
if ( ! g_process.checkNTPD() ) {
|
|
log( LOG_ERROR, "db: ntpd not running on proxy" );
|
|
return 1;
|
|
}
|
|
|
|
if ( !ucInit(g_hostdb.m_dir)) {
|
|
log( LOG_ERROR, "db: Unicode initialization failed!" );
|
|
return 1;
|
|
}
|
|
|
|
// load speller unifiedDict for spider compression proxy
|
|
//if ( g_hostdb.m_myHost->m_type & HT_SCPROXY )
|
|
// g_speller.init();
|
|
|
|
if ( ! g_udpServer.init( g_hostdb.getMyPort() ,
|
|
&g_dp,
|
|
20000000 , // readBufSIze
|
|
20000000 , // writeBufSize
|
|
20 , // pollTime in ms
|
|
3500 , // max udp slots
|
|
false )){ // is dns?
|
|
log( LOG_ERROR, "db: UdpServer init failed." );
|
|
return 1;
|
|
}
|
|
|
|
|
|
if (!g_proxy.initProxy (proxyId, udpPort, 0, &g_dp)) {
|
|
log( LOG_ERROR, "proxy: init failed" );
|
|
return 1;
|
|
}
|
|
|
|
// then statsdb
|
|
if ( ! g_statsdb.init() ) {
|
|
log( LOG_ERROR, "db: Statsdb init failed." );
|
|
return 1;
|
|
}
|
|
|
|
// init our table for doing zobrist hashing
|
|
if ( ! hashinit() ) {
|
|
log( LOG_ERROR, "db: Failed to init hashtable." );
|
|
return 1;
|
|
}
|
|
|
|
if ( ! g_proxy.initHttpServer( httpPort, httpsPort ) ) {
|
|
log( LOG_ERROR, "db: HttpServer init failed. Another gb "
|
|
"already running? If not, try editing "
|
|
"./hosts.conf to "
|
|
"change the port from %" PRId32" to something bigger. "
|
|
"Or stop gb by running 'gb stop' or by "
|
|
"clicking 'save & exit' in the master controls."
|
|
, (int32_t)httpPort );
|
|
// this is dangerous!!! do not do the shutdown thing
|
|
return 1;
|
|
}
|
|
|
|
//we should save gb.conf right ?
|
|
g_conf.m_save = true;
|
|
|
|
g_loop.runLoop();
|
|
}
|
|
|
|
if ( strcmp ( cmd , "blaster" ) == 0 ) {
|
|
int32_t i=cmdarg+1;
|
|
bool isLogFile=false;
|
|
bool injectUrlWithLinks=false;
|
|
bool injectUrl=false;
|
|
int32_t wait = 0;
|
|
|
|
if ( strcmp (argv[i],"-l") == 0 ){
|
|
isLogFile=true;
|
|
i++;
|
|
}
|
|
if ( strcmp (argv[i],"-i") == 0 ){
|
|
injectUrlWithLinks=true;
|
|
i++;
|
|
}
|
|
if ( strcmp (argv[i],"-u") == 0 ){
|
|
injectUrl=true;
|
|
i++;
|
|
}
|
|
|
|
char *filename = argv[i];
|
|
int32_t maxNumThreads=1;
|
|
if (argv[i+1]) maxNumThreads=atoi(argv[i+1]);
|
|
if (argv[i+2]) wait=atoi(argv[i+2]);
|
|
g_conf.m_maxMem = 2000000000;
|
|
//wait at least 10 msec before you start again.
|
|
if (wait<1000) wait=10;
|
|
g_blaster.runBlaster (filename,NULL,
|
|
maxNumThreads,wait,
|
|
isLogFile,false,false,false,
|
|
injectUrlWithLinks,
|
|
injectUrl);
|
|
// disable any further logging so final log msg is clear
|
|
g_log.m_disabled = true;
|
|
return 0;
|
|
}
|
|
|
|
if ( strcmp ( cmd , "blasterdiff" ) == 0 ) {
|
|
int32_t i=cmdarg+1;
|
|
bool verbose=false;
|
|
bool justDisplay=false;
|
|
bool useProxy=true;
|
|
//cycle through the arguments to check for -v,-j,-p
|
|
while (argv[i] && argv[i][0]=='-'){
|
|
if ( strcmp (argv[i],"-v") == 0 ){
|
|
verbose=true;
|
|
}
|
|
else if ( strcmp (argv[i],"-j") == 0 ){
|
|
justDisplay=true;
|
|
}
|
|
else if ( strcmp (argv[i],"-p") == 0){
|
|
useProxy=false;
|
|
}
|
|
i++;
|
|
}
|
|
|
|
char *file1 = argv[i];
|
|
char *file2 = argv[i+1];
|
|
int32_t maxNumThreads=1;
|
|
if (argv[i+2]) maxNumThreads=atoi(argv[i+2]);
|
|
int32_t wait;
|
|
if (argv[i+3]) wait=atoi(argv[i+3]);
|
|
//wait at least 1 sec before you start again.
|
|
if (wait<1000) wait=1000;
|
|
g_blaster.runBlaster(file1,file2,
|
|
maxNumThreads,wait,false,
|
|
verbose,justDisplay,useProxy);
|
|
// disable any further logging so final log msg is clear
|
|
g_log.m_disabled = true;
|
|
return 0;
|
|
}
|
|
|
|
// gb ping [hostId] [clientPort]
|
|
if ( strcmp ( cmd , "ping" ) == 0 ) {
|
|
int32_t hostId = 0;
|
|
if ( cmdarg + 1 < argc ) {
|
|
hostId = atoi ( argv[cmdarg+1] );
|
|
}
|
|
|
|
uint16_t port = 2050;
|
|
if ( cmdarg + 2 < argc ) {
|
|
port = (uint16_t)atoi ( argv[cmdarg+2] );
|
|
}
|
|
|
|
pingTest ( hostId , port );
|
|
|
|
return 0;
|
|
}
|
|
|
|
// gb injecttest <requestLen> [hostId]
|
|
if ( strcmp ( cmd , "injecttest" ) == 0 ) {
|
|
if ( cmdarg+1 >= argc ) {
|
|
goto printHelp;
|
|
}
|
|
|
|
int32_t hostId = 0;
|
|
if ( cmdarg + 2 < argc ) {
|
|
hostId = atoi ( argv[cmdarg+2] );
|
|
}
|
|
|
|
int32_t reqLen = atoi ( argv[cmdarg+1] );
|
|
if ( reqLen == 0 ) {
|
|
goto printHelp;
|
|
}
|
|
|
|
injectFileTest ( reqLen , hostId );
|
|
return 0;
|
|
}
|
|
|
|
// gb dsh
|
|
if ( strcmp ( cmd , "dsh" ) == 0 ) {
|
|
if ( cmdarg+1 >= argc ) {
|
|
goto printHelp;
|
|
}
|
|
|
|
char *cmd = argv[cmdarg+1];
|
|
return install ( ifk_dsh, -1, NULL, -1, cmd );
|
|
}
|
|
|
|
// gb dsh2
|
|
if ( strcmp ( cmd , "dsh2" ) == 0 ) {
|
|
if ( cmdarg+1 >= argc ) goto printHelp;
|
|
char *cmd = argv[cmdarg+1];
|
|
return install ( ifk_dsh2, -1, NULL, -1, cmd );
|
|
}
|
|
|
|
// gb copyfiles, like gb install but takes a dir not a host #
|
|
if ( strcmp ( cmd , "copyfiles" ) == 0 ) {
|
|
if ( cmdarg + 1 >= argc ) goto printHelp;
|
|
char *dir = argv[cmdarg+1];
|
|
return copyFiles ( dir );
|
|
}
|
|
|
|
// gb install
|
|
if ( strcmp ( cmd , "install" ) == 0 ) {
|
|
// get hostId to install TO (-1 means all)
|
|
int32_t h1 = -1;
|
|
int32_t h2 = -1;
|
|
if ( cmdarg + 1 < argc ) h1 = atoi ( argv[cmdarg+1] );
|
|
// might have a range
|
|
if (cmdarg + 1 < argc && strstr(argv[cmdarg+1],"-") )
|
|
sscanf ( argv[cmdarg+1],"%" PRId32"-%" PRId32,&h1,&h2);
|
|
return install ( ifk_install, h1, NULL, h2 );
|
|
}
|
|
|
|
// gb installgb
|
|
if ( strcmp ( cmd , "installgb" ) == 0 ) {
|
|
// get hostId to install TO (-1 means all)
|
|
int32_t hostId = -1;
|
|
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
|
|
return install ( ifk_installgb , hostId );
|
|
}
|
|
|
|
// gb installfile
|
|
if ( strcmp ( cmd , "installfile" ) == 0 ) {
|
|
if(cmdarg+1 < argc)
|
|
return install_file ( argv[cmdarg+1] );
|
|
}
|
|
|
|
// gb installtmpgb
|
|
if ( strcmp ( cmd , "installtmpgb" ) == 0 ) {
|
|
// get hostId to install TO (-1 means all)
|
|
int32_t hostId = -1;
|
|
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
|
|
return install ( ifk_installtmpgb , hostId );
|
|
}
|
|
|
|
// gb installconf
|
|
if ( strcmp ( cmd , "installconf" ) == 0 ) {
|
|
// get hostId to install TO (-1 means all)
|
|
int32_t hostId = -1;
|
|
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
|
|
return install ( ifk_installconf , hostId );
|
|
}
|
|
|
|
// gb installconf2
|
|
if ( strcmp ( cmd , "installconf2" ) == 0 ) {
|
|
// get hostId to install TO (-1 means all)
|
|
int32_t hostId = -1;
|
|
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
|
|
return install ( ifk_installconf2 , hostId );
|
|
}
|
|
|
|
// gb start [hostId]
|
|
if ( strcmp ( cmd , "start" ) == 0 ) {
|
|
// get hostId to install TO (-1 means all)
|
|
int32_t hostId = -1;
|
|
if ( cmdarg + 1 < argc ) {
|
|
hostId = atoi( argv[ cmdarg + 1 ] );
|
|
}
|
|
|
|
// might have a range
|
|
if ( cmdarg + 1 < argc ) {
|
|
int32_t h1 = -1;
|
|
int32_t h2 = -1;
|
|
sscanf( argv[cmdarg+1], "%" PRId32"-%" PRId32, &h1, &h2 );
|
|
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
|
|
return install ( ifk_start, h1, NULL, h2 );
|
|
}
|
|
|
|
// default to keepalive start for now!!
|
|
return install ( ifk_start , hostId );
|
|
}
|
|
|
|
// gb tmpstart [hostId]
|
|
if ( strcmp ( cmd , "tmpstart" ) == 0 ) {
|
|
// get hostId to install TO (-1 means all)
|
|
int32_t hostId = -1;
|
|
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
|
|
// might have a range
|
|
if ( cmdarg + 1 < argc ) {
|
|
int32_t h1 = -1;
|
|
int32_t h2 = -1;
|
|
sscanf ( argv[cmdarg+1],"%" PRId32"-%" PRId32,&h1,&h2);
|
|
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
|
|
return install ( ifk_tmpstart, h1, NULL, h2 );
|
|
}
|
|
return install ( ifk_tmpstart, hostId );
|
|
}
|
|
|
|
if ( strcmp ( cmd , "tmpstop" ) == 0 ) {
|
|
int32_t hostId = -1;
|
|
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
|
|
// might have a range
|
|
if ( cmdarg + 1 < argc ) {
|
|
int32_t h1 = -1;
|
|
int32_t h2 = -1;
|
|
sscanf ( argv[cmdarg+1],"%" PRId32"-%" PRId32,&h1,&h2);
|
|
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
|
|
return doCmd( "save=1", h1, "master", true, false, h2 );
|
|
}
|
|
return doCmd( "save=1", hostId, "master", true, false );
|
|
}
|
|
|
|
if ( strcmp ( cmd , "kstop" ) == 0 ) {
|
|
//same as stop, here for consistency
|
|
int32_t hostId = -1;
|
|
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
|
|
// might have a range
|
|
if ( cmdarg + 1 < argc ) {
|
|
int32_t h1 = -1;
|
|
int32_t h2 = -1;
|
|
sscanf ( argv[cmdarg+1],"%" PRId32"-%" PRId32,&h1,&h2);
|
|
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
|
|
return doCmd( "save=1", h1, "master", true, false, h2 );
|
|
}
|
|
return doCmd( "save=1", hostId, "master", true, false );
|
|
}
|
|
|
|
// gb backupcopy [hostId] <backupSubdirName>
|
|
if ( strcmp ( cmd , "backupcopy" ) == 0 ) {
|
|
if ( cmdarg + 1 >= argc ) goto printHelp;
|
|
return install( ifk_backupcopy , -1 , argv[cmdarg+1] );
|
|
}
|
|
|
|
// gb backupmove [hostId] <backupSubdirName>
|
|
if ( strcmp ( cmd , "backupmove" ) == 0 ) {
|
|
if ( cmdarg + 1 >= argc ) goto printHelp;
|
|
return install( ifk_backupmove , -1 , argv[cmdarg+1] );
|
|
}
|
|
|
|
// gb backuprestore [hostId] <backupSubdirName>
|
|
if ( strcmp ( cmd , "backuprestore" ) == 0 ) {
|
|
if ( cmdarg + 1 >= argc ) goto printHelp;
|
|
return install( ifk_backuprestore, -1 , argv[cmdarg+1] );
|
|
}
|
|
|
|
// gb scale <hosts.conf>
|
|
if ( strcmp ( cmd , "scale" ) == 0 ) {
|
|
if ( cmdarg + 1 >= argc ) goto printHelp;
|
|
return scale( argv[cmdarg+1] , true );
|
|
}
|
|
|
|
// gb collinject
|
|
if ( strcmp ( cmd , "collinject" ) == 0 ) {
|
|
if ( cmdarg + 1 >= argc ) goto printHelp;
|
|
return collinject( argv[cmdarg+1] );
|
|
}
|
|
|
|
// gb collcopy <hosts.conf> <coll> <collnum>
|
|
if ( strcmp ( cmd , "collcopy" ) == 0 ) {
|
|
if ( cmdarg + 4 != argc ) goto printHelp;
|
|
char *hostsconf = argv[cmdarg+1];
|
|
char *coll = argv[cmdarg+2];
|
|
int32_t collnum = atoi(argv[cmdarg+3]);
|
|
return collcopy ( hostsconf , coll , collnum );
|
|
}
|
|
|
|
// gb stop [hostId]
|
|
if ( strcmp ( cmd , "stop" ) == 0 ) {
|
|
int32_t hostId = -1;
|
|
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
|
|
// might have a range
|
|
if ( cmdarg + 1 < argc ) {
|
|
int32_t h1 = -1;
|
|
int32_t h2 = -1;
|
|
sscanf ( argv[cmdarg+1],"%" PRId32"-%" PRId32,&h1,&h2);
|
|
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
|
|
return doCmd ( "save=1" , h1 , "master" , true, false, h2 );
|
|
}
|
|
return doCmd( "save=1", hostId, "master", true, false );
|
|
}
|
|
|
|
// gb save [hostId]
|
|
if ( strcmp ( cmd , "save" ) == 0 ) {
|
|
int32_t hostId = -1;
|
|
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
|
|
// might have a range
|
|
if ( cmdarg + 1 < argc ) {
|
|
int32_t h1 = -1;
|
|
int32_t h2 = -1;
|
|
sscanf ( argv[cmdarg+1],"%" PRId32"-%" PRId32,&h1,&h2);
|
|
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
|
|
return doCmd ( "js=1", h1, "master", true, false, h2 );
|
|
}
|
|
return doCmd( "js=1", hostId, "master", true, false );
|
|
}
|
|
|
|
// gb spidersoff [hostId]
|
|
if ( strcmp ( cmd , "spidersoff" ) == 0 ) {
|
|
int32_t hostId = -1;
|
|
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
|
|
return doCmd( "se=0", hostId, "master", true, false );
|
|
}
|
|
|
|
// gb spiderson [hostid]
|
|
if ( strcmp ( cmd , "spiderson" ) == 0 ) {
|
|
int32_t hostId = -1;
|
|
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
|
|
return doCmd( "se=1", hostId, "master", true, false );
|
|
}
|
|
|
|
// gb cacheoff [hostId]
|
|
if ( strcmp ( cmd , "cacheoff" ) == 0 ) {
|
|
int32_t hostId = -1;
|
|
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
|
|
return doCmd( "dpco=1", hostId, "master", true, false );
|
|
}
|
|
|
|
// gb ddump [hostId]
|
|
if ( strcmp ( cmd , "ddump" ) == 0 ) {
|
|
int32_t hostId = -1;
|
|
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
|
|
return doCmd( "dump=1", hostId, "master", true, false );
|
|
}
|
|
|
|
// gb pmerge [hostId]
|
|
if ( strcmp ( cmd , "pmerge" ) == 0 ) {
|
|
int32_t hostId = -1;
|
|
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
|
|
// might have a range
|
|
if ( cmdarg + 1 < argc ) {
|
|
int32_t h1 = -1;
|
|
int32_t h2 = -1;
|
|
sscanf ( argv[cmdarg+1],"%" PRId32"-%" PRId32,&h1,&h2);
|
|
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
|
|
return doCmd( "pmerge=1", h1, "master", true, false, h2 );
|
|
}
|
|
return doCmd( "pmerge=1", hostId, "master", true, false );
|
|
}
|
|
|
|
// gb smerge [hostId]
|
|
if ( strcmp ( cmd , "smerge" ) == 0 ) {
|
|
int32_t hostId = -1;
|
|
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
|
|
// might have a range
|
|
if ( cmdarg + 1 < argc ) {
|
|
int32_t h1 = -1;
|
|
int32_t h2 = -1;
|
|
sscanf ( argv[cmdarg+1],"%" PRId32"-%" PRId32,&h1,&h2);
|
|
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
|
|
return doCmd( "smerge=1", h1, "master", true, false, h2 );
|
|
}
|
|
return doCmd( "smerge=1", hostId, "master", true, false );
|
|
}
|
|
|
|
// gb tmerge [hostId]
|
|
if ( strcmp ( cmd , "tmerge" ) == 0 ) {
|
|
int32_t hostId = -1;
|
|
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
|
|
// might have a range
|
|
if ( cmdarg + 1 < argc ) {
|
|
int32_t h1 = -1;
|
|
int32_t h2 = -1;
|
|
sscanf ( argv[cmdarg+1],"%" PRId32"-%" PRId32,&h1,&h2);
|
|
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
|
|
return doCmd( "tmerge=1", h1, "master", true, false, h2 );
|
|
}
|
|
return doCmd( "tmerge=1", hostId, "master", true, false );
|
|
}
|
|
|
|
// gb merge [hostId]
|
|
if ( strcmp ( cmd , "merge" ) == 0 ) {
|
|
int32_t hostId = -1;
|
|
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
|
|
// might have a range
|
|
if ( cmdarg + 1 < argc ) {
|
|
int32_t h1 = -1;
|
|
int32_t h2 = -1;
|
|
sscanf ( argv[cmdarg+1],"%" PRId32"-%" PRId32,&h1,&h2);
|
|
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
|
|
return doCmd( "merge=1", h1, "master", true, false, h2 );
|
|
}
|
|
return doCmd( "merge=1", hostId, "master", true, false );
|
|
}
|
|
|
|
// gb setnote <hostid> <note>
|
|
if ( strcmp ( cmd, "setnote" ) == 0 ) {
|
|
int32_t hostId;
|
|
char *note;
|
|
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
|
|
else return false;
|
|
if ( cmdarg + 2 < argc ) note = argv[cmdarg+2];
|
|
else return false;
|
|
char urlnote[1024];
|
|
urlEncode(urlnote, 1024, note, strlen(note));
|
|
log ( LOG_INIT, "conf: setnote %" PRId32": %s", hostId, urlnote );
|
|
char setnoteCmd[256];
|
|
sprintf(setnoteCmd, "setnote=1&host=%" PRId32"&note=%s",
|
|
hostId, urlnote);
|
|
return doCmd( setnoteCmd, -1, "admin/hosts", true, false );
|
|
}
|
|
|
|
// gb setsparenote <spareid> <note>
|
|
if ( strcmp ( cmd, "setsparenote" ) == 0 ) {
|
|
int32_t spareId;
|
|
char *note;
|
|
if ( cmdarg + 1 < argc ) spareId = atoi ( argv[cmdarg+1] );
|
|
else return false;
|
|
if ( cmdarg + 2 < argc ) note = argv[cmdarg+2];
|
|
else return false;
|
|
char urlnote[1024];
|
|
urlEncode(urlnote, 1024, note, strlen(note));
|
|
log(LOG_INIT, "conf: setsparenote %" PRId32": %s", spareId, urlnote);
|
|
char setnoteCmd[256];
|
|
sprintf(setnoteCmd, "setsparenote=1&spare=%" PRId32"&note=%s",
|
|
spareId, urlnote);
|
|
return doCmd( setnoteCmd, -1, "admin/hosts" , true, false );
|
|
}
|
|
|
|
// gb replacehost <hostid> <spareid>
|
|
if ( strcmp ( cmd, "replacehost" ) == 0 ) {
|
|
int32_t hostId = -1;
|
|
int32_t spareId = -1;
|
|
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
|
|
if ( cmdarg + 2 < argc ) spareId = atoi ( argv[cmdarg+2] );
|
|
char replaceCmd[256];
|
|
sprintf(replaceCmd, "replacehost=1&rhost=%" PRId32"&rspare=%" PRId32,
|
|
hostId, spareId);
|
|
return doCmd( replaceCmd, -1, "admin/hosts", true, true );
|
|
}
|
|
|
|
// HACK: enable logging for Conf.cpp, etc.
|
|
g_process.m_powerIsOn = true;
|
|
|
|
// . read in the conf file
|
|
// . this now initializes from a dir and hostId, they should all be
|
|
// named gbHID.conf
|
|
// . now that hosts.conf has more of the burden, all gbHID.conf files
|
|
// can be identical
|
|
if ( ! g_conf.init ( h9->m_dir ) ) {
|
|
log( LOG_ERROR, "db: Conf init failed." );
|
|
return 1;
|
|
}
|
|
|
|
if ( ! g_jobScheduler.initialize(g_conf.m_maxCpuThreads, g_conf.m_maxIOThreads, g_conf.m_maxExternalThreads, wakeupPollLoop)) {
|
|
log( LOG_ERROR, "db: JobScheduler init failed." );
|
|
return 1;
|
|
}
|
|
|
|
//if ( ! g_hostdb.validateIps ( &g_conf ) ) {
|
|
// log("db: Failed to validate ips." ); return 1;}
|
|
//if ( ! g_hostdb2.validateIps ( &g_conf ) ) {
|
|
// log("db: Failed to validate ips." ); return 1;}
|
|
|
|
// put in read only mode
|
|
if ( useTmpCluster ) {
|
|
g_conf.m_readOnlyMode = true;
|
|
g_conf.m_sendEmailAlerts = false;
|
|
}
|
|
|
|
// log how much mem we can use
|
|
//log(LOG_INIT,"conf: Max mem allowed to use is %" PRId64"\n",
|
|
//g_conf.m_maxMem);
|
|
|
|
// init the loop, needs g_conf
|
|
if ( ! g_loop.init() ) {
|
|
log( LOG_ERROR, "db: Loop init failed." );
|
|
return 1;
|
|
}
|
|
|
|
// the new way to save all rdbs and conf
|
|
// if g_process.m_powerIsOn is false, logging will not work, so init
|
|
// this up here. must call after Loop::init() so it can register
|
|
// its sleep callback
|
|
g_process.init();
|
|
|
|
// set up the threads, might need g_conf
|
|
|
|
// avoid logging threads msgs to stderr if not actually starting up
|
|
// a gb daemon...
|
|
//if(cmd && cmd[0] && ! is_digit(cmd[0]) && ! g_jobScheduler.initialize() ) {
|
|
//if ( ! g_threads.init() ) {
|
|
// log("db: Threads init failed." ); return 1; }
|
|
|
|
// gb gendict
|
|
if ( strcmp ( cmd , "gendict" ) == 0 ) {
|
|
// expects <coll> and an optional [numWordsToDump]
|
|
if ( argc != cmdarg + 2 &&
|
|
argc != cmdarg + 3 ) goto printHelp; // take no other args
|
|
char *coll = argv[cmdarg+1];
|
|
// get numWordsToDump
|
|
int32_t nn = 10000000;
|
|
if ( argc == cmdarg + 3 ) nn = atoi ( argv[cmdarg+2] );
|
|
// . generate the dict files
|
|
// . use the first nn words/phrases to make them (default 10,000,000)
|
|
g_speller.generateDicts ( nn , coll );
|
|
return 0;
|
|
}
|
|
|
|
if ( strcmp ( cmd , "rmtest" ) == 0 ) {
|
|
rmTest();
|
|
return 0;
|
|
}
|
|
|
|
// . gb dump [dbLetter][coll][fileNum] [numFiles] [includeTree][termId]
|
|
// . spiderdb is special:
|
|
// gb dump s [coll][fileNum] [numFiles] [includeTree] [0=old|1=new]
|
|
// [priority] [printStats?]
|
|
if ( strcmp ( cmd , "dump" ) == 0 ) {
|
|
//
|
|
// tell Collectiondb, not to verify each rdb's data
|
|
//
|
|
g_dumpMode = true;
|
|
|
|
if ( cmdarg+1 >= argc ) goto printHelp;
|
|
int32_t startFileNum = 0;
|
|
int32_t numFiles = -1;
|
|
int32_t includeTree = 1;
|
|
int64_t termId = -1;
|
|
const char *coll = "";
|
|
|
|
// so we do not log every collection coll.conf we load
|
|
g_conf.m_doingCommandLine = true;
|
|
|
|
// we have to init collection db because we need to know if
|
|
// the collnum is legit or not in the tree
|
|
if ( ! g_collectiondb.loadAllCollRecs() ) {
|
|
log("db: Collectiondb init failed." ); return 1; }
|
|
|
|
if ( cmdarg+2 < argc ) coll = argv[cmdarg+2];
|
|
if ( cmdarg+3 < argc ) startFileNum = atoi(argv[cmdarg+3]);
|
|
if ( cmdarg+4 < argc ) numFiles = atoi(argv[cmdarg+4]);
|
|
if ( cmdarg+5 < argc ) includeTree = atoi(argv[cmdarg+5]);
|
|
if ( cmdarg+6 < argc ) {
|
|
char *targ = argv[cmdarg+6];
|
|
if ( is_alpha_a(targ[0]) ) {
|
|
char *colon = strstr(targ,":");
|
|
int64_t prefix64 = 0LL;
|
|
if ( colon ) {
|
|
*colon = '\0';
|
|
prefix64 = hash64n(targ);
|
|
targ = colon + 1;
|
|
}
|
|
// hash the term itself
|
|
termId = hash64n(targ);
|
|
// hash prefix with termhash
|
|
if ( prefix64 )
|
|
termId = hash64(termId,prefix64);
|
|
termId &= TERMID_MASK;
|
|
}
|
|
else {
|
|
termId = atoll1(targ);
|
|
}
|
|
}
|
|
if ( argv[cmdarg+1][0] == 't' ) {
|
|
int64_t docId = 0LL;
|
|
if ( cmdarg+6 < argc ) {
|
|
docId = atoll1(argv[cmdarg+6]);
|
|
}
|
|
|
|
dumpTitledb (coll, startFileNum, numFiles, includeTree, docId, false);
|
|
|
|
}
|
|
else if ( argv[cmdarg+1][0] == 'D' ) {
|
|
int64_t docId = 0LL;
|
|
if ( cmdarg+6 < argc ) {
|
|
docId = atoll1(argv[cmdarg+6]);
|
|
}
|
|
|
|
dumpTitledb (coll, startFileNum, numFiles, includeTree, docId, true);
|
|
}
|
|
else if ( argv[cmdarg+1][0] == 'w' )
|
|
dumpWaitingTree(coll);
|
|
else if ( argv[cmdarg+1][0] == 'x' )
|
|
dumpDoledb (coll,startFileNum,numFiles,includeTree);
|
|
else if ( argv[cmdarg+1][0] == 's' ) {
|
|
char printStats = 0;
|
|
int32_t firstIp = 0;
|
|
if ( cmdarg+6 < argc ){
|
|
printStats= atol(argv[cmdarg+6]);
|
|
// it could be an ip instead of printstats
|
|
if ( strstr(argv[cmdarg+6],".") ) {
|
|
printStats = 0;
|
|
firstIp = atoip(argv[cmdarg+6]);
|
|
}
|
|
}
|
|
|
|
int32_t ret = dumpSpiderdb ( coll, startFileNum, numFiles, includeTree, printStats, firstIp );
|
|
if ( ret == -1 ) {
|
|
fprintf(stdout,"error dumping spiderdb\n");
|
|
}
|
|
}
|
|
else if ( argv[cmdarg+1][0] == 'S' ) {
|
|
char *site = NULL;
|
|
if ( cmdarg+6 < argc ) {
|
|
site = argv[ cmdarg + 6 ];
|
|
}
|
|
dumpTagdb( coll, startFileNum, numFiles, includeTree, 0, RDB_TAGDB, site );
|
|
} else if ( argv[cmdarg+1][0] == 'z' ) {
|
|
char *site = NULL;
|
|
if ( cmdarg+6 < argc ) {
|
|
site = argv[ cmdarg + 6 ];
|
|
}
|
|
dumpTagdb( coll, startFileNum, numFiles, includeTree, 'z', RDB_TAGDB, site );
|
|
} else if ( argv[cmdarg+1][0] == 'A' ) {
|
|
dumpTagdb( coll, startFileNum, numFiles, includeTree, 'A' );
|
|
} else if ( argv[cmdarg+1][0] == 'G' ) {
|
|
dumpTagdb( coll, startFileNum, numFiles, includeTree, 'G' );
|
|
} else if ( argv[cmdarg+1][0] == 'W' ) {
|
|
dumpTagdb( coll, startFileNum, numFiles, includeTree );
|
|
} else if ( argv[cmdarg+1][0] == 'l' )
|
|
dumpClusterdb (coll,startFileNum,numFiles,includeTree);
|
|
//else if ( argv[cmdarg+1][0] == 'z' )
|
|
// dumpStatsdb(startFileNum,numFiles,includeTree,2);
|
|
//else if ( argv[cmdarg+1][0] == 'Z' )
|
|
// dumpStatsdb(startFileNum,numFiles,includeTree,4);
|
|
else if ( argv[cmdarg+1][0] == 'L' ) {
|
|
char *url = NULL;
|
|
if ( cmdarg+6 < argc ) url = argv[cmdarg+6];
|
|
dumpLinkdb(coll,startFileNum,numFiles,includeTree,url);
|
|
} else if ( argv[cmdarg+1][0] == 'p' ) {
|
|
dumpPosdb( coll, startFileNum, numFiles, includeTree, termId, false );
|
|
} else {
|
|
goto printHelp;
|
|
}
|
|
// disable any further logging so final log msg is clear
|
|
g_log.m_disabled = true;
|
|
return 0;
|
|
}
|
|
|
|
if( strcmp( cmd, "countdomains" ) == 0 && argc >= (cmdarg + 2) ) {
|
|
const char *coll = "";
|
|
int32_t verb;
|
|
int32_t outpt;
|
|
coll = argv[cmdarg+1];
|
|
|
|
if( argv[cmdarg+2][0] < 0x30 || argv[cmdarg+2][0] > 0x39 ) {
|
|
goto printHelp;
|
|
}
|
|
|
|
int32_t numRecs = atoi( argv[cmdarg+2] );
|
|
|
|
if( argc > (cmdarg + 2) ) verb = atoi( argv[cmdarg+2] );
|
|
else verb = 0;
|
|
|
|
if( argc > (cmdarg + 3) ) outpt = atoi( argv[cmdarg+3] );
|
|
else outpt = 0;
|
|
|
|
log( LOG_INFO, "cntDm: Allocated Larger Mem Table for: %" PRId32,
|
|
g_mem.getMemTableSize() );
|
|
if (!ucInit(g_hostdb.m_dir)) {
|
|
log("Unicode initialization failed!");
|
|
return 1;
|
|
}
|
|
|
|
if ( ! g_collectiondb.loadAllCollRecs() ) {
|
|
log("db: Collectiondb init failed." ); return 1; }
|
|
|
|
countdomains( coll, numRecs, verb, outpt );
|
|
g_log.m_disabled = true;
|
|
return 0;
|
|
}
|
|
|
|
// temp merge test
|
|
//RdbList list;
|
|
//list.testIndexMerge();
|
|
|
|
// file creation test, make sure we have dir control
|
|
if ( checkDirPerms ( g_hostdb.m_dir ) < 0 ) {
|
|
return 1;
|
|
}
|
|
|
|
// . make sure we have critical files
|
|
if ( ! g_process.checkFiles ( g_hostdb.m_dir ) ) {
|
|
return 1;
|
|
}
|
|
|
|
// load the appropriate dictionaries
|
|
//g_speller.init();
|
|
//if ( !g_speller.init ( ) ) return 1;
|
|
|
|
g_errno = 0;
|
|
//g_speller.test ( );
|
|
//exit(-1);
|
|
/*
|
|
char dst[1024];
|
|
char test[1024];
|
|
spellLoop:
|
|
test[0] = '\0';
|
|
gets ( test );
|
|
if ( test[strlen(test)-1] == '\n' ) test[strlen(test)-1] = '\0';
|
|
Query qq;
|
|
qq.set ( test , strlen(test) , NULL , 0 , false );
|
|
if ( g_speller.getRecommendation ( &qq , dst , 1000 ) )
|
|
log("spelling suggestion: %s", dst );
|
|
goto spellLoop;
|
|
*/
|
|
|
|
// make sure port is available, no use loading everything up then
|
|
// failing because another process is already running using this port
|
|
if ( ! g_httpServer.m_tcp.testBind(g_hostdb.getMyHost()->m_httpPort, true)) {
|
|
// return 0 so keep alive bash loop exits
|
|
exit(0);
|
|
}
|
|
|
|
int32_t *ips;
|
|
|
|
// move the log file name logxxx to logxxx-2016_03_16-14:59:24
|
|
// we did the test bind so no gb process is bound on the port yet
|
|
// TODO: probably should bind on the port before doing this
|
|
if ( doesFileExist ( g_hostdb.m_logFilename ) ) {
|
|
char tmp2[128];
|
|
SafeBuf newName(tmp2,128);
|
|
time_t ts = getTimeLocal();
|
|
struct tm tm_buf;
|
|
struct tm *timeStruct = localtime_r(&ts,&tm_buf);
|
|
//struct tm *timeStruct = gmtime_r(&ts,&tm_buf);
|
|
char ppp[100];
|
|
strftime(ppp,100,"%Y%m%d-%H%M%S",timeStruct);
|
|
newName.safePrintf("%s-bak%s",g_hostdb.m_logFilename, ppp );
|
|
::rename ( g_hostdb.m_logFilename, newName.getBufStart() );
|
|
}
|
|
|
|
|
|
log("db: Logging to file %s.",
|
|
g_hostdb.m_logFilename );
|
|
|
|
if ( ! g_conf.m_runAsDaemon )
|
|
log("db: Use 'gb -d' to run as daemon. Example: "
|
|
"gb -d");
|
|
|
|
/*
|
|
// tmp stuff to generate new query log
|
|
if ( ! ucInit(g_hostdb.m_dir, true)) return 1;
|
|
if ( ! g_wiktionary.load() ) return 1;
|
|
if ( ! g_wiktionary.test() ) return 1;
|
|
if ( ! g_wiki.load() ) return 1;
|
|
if ( ! g_speller.init() && g_conf.m_isLive ) return 1;
|
|
return 0;
|
|
*/
|
|
|
|
|
|
// start up log file
|
|
if ( ! g_log.init( g_hostdb.m_logFilename ) ) {
|
|
fprintf (stderr,"db: Log file init failed. Exiting.\n" );
|
|
return 1;
|
|
}
|
|
|
|
g_log.m_logTimestamps = true;
|
|
g_log.m_logReadableTimestamps = true; // @todo BR: Should be configurable..
|
|
|
|
// in case we do not have one, we need it for Images.cpp
|
|
if ( ! makeTrashDir() ) {
|
|
fprintf (stderr,"db: failed to make trash dir. Exiting.\n" );
|
|
return 1;
|
|
}
|
|
|
|
|
|
g_errno = 0;
|
|
|
|
//
|
|
// run as daemon now
|
|
//
|
|
//fprintf(stderr,"running as daemon\n");
|
|
if ( g_conf.m_runAsDaemon ) {
|
|
pid_t pid, sid;
|
|
pid = fork();
|
|
if ( pid < 0 ) exit(EXIT_FAILURE);
|
|
// seems like we core unless parent sets this to NULL.
|
|
// it does not affect the child.
|
|
//if ( pid > 0 ) g_hostdb.m_myHost = NULL;
|
|
// child gets a 0, parent gets the child's pid, so exit
|
|
if ( pid > 0 ) exit(EXIT_SUCCESS);
|
|
// change file mode mask
|
|
umask(0);
|
|
sid = setsid();
|
|
if ( sid < 0 ) exit(EXIT_FAILURE);
|
|
//fprintf(stderr,"done\n");
|
|
|
|
// if we do not do this we don't get sigalarms or quickpolls
|
|
// when running as 'gb -d'
|
|
g_loop.init();
|
|
}
|
|
|
|
// initialize threads down here now so it logs to the logfile and
|
|
// not stderr
|
|
//if ( ( ! cmd || !cmd[0]) && ! g_jobScheduler.initialize() ) {
|
|
// log("db: Threads init failed." ); return 1; }
|
|
|
|
// log the version
|
|
log(LOG_INIT,"conf: Gigablast Version : %s", getVersion());
|
|
log(LOG_INIT,"conf: Gigablast Architecture : %d-bit", arch);
|
|
log(LOG_INIT,"conf: Gigablast Build config : %s", getBuildConfig());
|
|
log(LOG_INIT,"conf: Gigablast Git commit : %s", getCommitId());
|
|
|
|
|
|
// show current working dir
|
|
log("host: Working directory is %s",workingDir);
|
|
|
|
log("host: Using %shosts.conf",g_hostdb.m_dir);
|
|
|
|
{
|
|
pid_t pid = getpid();
|
|
log("host: Process ID is %" PRIu64,(int64_t)pid);
|
|
}
|
|
|
|
// from Hostdb.cpp
|
|
ips = getLocalIps();
|
|
for ( ; ips && *ips ; ips++ )
|
|
log("host: Detected local ip %s",iptoa(*ips));
|
|
|
|
// show it
|
|
log("host: Running as host id #%" PRId32,g_hostdb.m_hostId );
|
|
|
|
|
|
if (!ucInit(g_hostdb.m_dir)) {
|
|
log( LOG_ERROR, "Unicode initialization failed!" );
|
|
return 1;
|
|
}
|
|
|
|
// some tests. the greek letter alpha with an accent mark (decompose)
|
|
/*
|
|
{
|
|
char us[] = {0xe1,0xbe,0x80};
|
|
UChar32 uc = utf8Decode(us);//,&next);
|
|
UChar32 ttt[32];
|
|
int32_t klen = recursiveKDExpand(uc,ttt,256);
|
|
char obuf[64];
|
|
for ( int32_t i = 0 ; i < klen ; i++ ) {
|
|
UChar32 ui = ttt[i];
|
|
int32_t blen = utf8Encode(ui,obuf);
|
|
obuf[blen]=0;
|
|
int32_t an = ucIsAlpha(ui);
|
|
|
|
fprintf(stderr,"#%" PRId32"=%s (alnum=%" PRId32")\n",i,obuf,an);
|
|
}
|
|
fprintf(stderr,"hey\n");
|
|
exit(0);
|
|
}
|
|
*/
|
|
|
|
/*
|
|
|
|
PRINT OUT all Unicode characters and their decompositions
|
|
|
|
{
|
|
for ( int32_t uc = 0 ; uc < 0xe01ef ; uc++ ) {
|
|
//if ( ! ucIsAlnum(uc) ) continue;
|
|
UChar32 ttt[32];
|
|
int32_t klen = recursiveKDExpand(uc,ttt,256);
|
|
char obuf[64];
|
|
int32_t clen = utf8Encode(uc,obuf);
|
|
obuf[clen]=0;
|
|
// print utf8 char we are decomposing
|
|
fprintf(stderr,"%" PRIx32") %s --> ",uc,obuf);
|
|
// sanity
|
|
if ( klen > 1 && ttt[0] == (UChar32)uc ) {
|
|
fprintf(stderr,"SAME\n");
|
|
continue;
|
|
}
|
|
// print decomposition
|
|
for ( int32_t i = 0 ; i < klen ; i++ ) {
|
|
UChar32 ui = ttt[i];
|
|
char qbuf[64];
|
|
int32_t blen = utf8Encode(ui,qbuf);
|
|
qbuf[blen]=0;
|
|
fprintf(stderr,"%s",qbuf);
|
|
// show the #
|
|
fprintf(stderr,"{%" PRIx32"}",(int32_t)ui);
|
|
if ( i+1<klen ) fprintf(stderr,", ");
|
|
}
|
|
// show utf8 rep
|
|
fprintf(stderr," [");
|
|
for ( int32_t i = 0 ; i < clen ; i++ ) {
|
|
fprintf(stderr,"0x%hhx",(int)obuf[i]);
|
|
if ( i+1<clen) fprintf(stderr," ");
|
|
}
|
|
fprintf(stderr,"]");
|
|
fprintf(stderr,"\n");
|
|
}
|
|
exit(0);
|
|
}
|
|
*/
|
|
|
|
|
|
|
|
|
|
// the wiktionary for lang identification and alternate word forms/
|
|
// synonyms
|
|
if ( ! g_wiktionary.load() ) {
|
|
log( LOG_ERROR, "Wiktionary initialization failed!" );
|
|
return 1;
|
|
}
|
|
|
|
if ( ! g_wiktionary.test() ) {
|
|
log( LOG_ERROR, "Wiktionary test failed!" );
|
|
return 1;
|
|
}
|
|
|
|
// . load synonyms, synonym affinity, and stems
|
|
// . now we are using g_synonyms
|
|
//g_thesaurus.init();
|
|
//g_synonyms.init();
|
|
|
|
// the wiki titles
|
|
if ( ! g_wiki.load() ) {
|
|
log( LOG_ERROR, "Wiki initialization failed!" );
|
|
return 1;
|
|
}
|
|
|
|
// force give up on dead hosts to false
|
|
g_conf.m_giveupOnDeadHosts = 0;
|
|
|
|
// shout out if we're in read only mode
|
|
if ( g_conf.m_readOnlyMode )
|
|
log("db: -- Read Only Mode Set. Can Not Add New Data. --");
|
|
|
|
// . collectiondb, does not use rdb, loads directly from disk
|
|
// . do this up here so RdbTree::fixTree() can fix RdbTree::m_collnums
|
|
// . this is a fake init, cuz we pass in "true"
|
|
if ( ! g_collectiondb.loadAllCollRecs() ) {
|
|
log( LOG_ERROR, "db: Collectiondb load failed." );
|
|
return 1;
|
|
}
|
|
|
|
// then statsdb
|
|
if ( ! g_statsdb.init() ) {
|
|
log( LOG_ERROR, "db: Statsdb init failed." );
|
|
return 1;
|
|
}
|
|
|
|
// allow adds to statsdb rdb tree
|
|
g_process.m_powerIsOn = true;
|
|
|
|
if ( ! g_posdb.init() ) {
|
|
log( LOG_ERROR, "db: Posdb init failed." );
|
|
return 1;
|
|
}
|
|
|
|
// then titledb
|
|
if ( ! g_titledb.init() ) {
|
|
log( LOG_ERROR, "db: Titledb init failed." );
|
|
return 1;
|
|
}
|
|
|
|
// then tagdb
|
|
if ( ! g_tagdb.init() ) {
|
|
log( LOG_ERROR, "db: Tagdb init failed." );
|
|
return 1;
|
|
}
|
|
|
|
// then spiderdb
|
|
if ( ! g_spiderdb.init() ) {
|
|
log( LOG_ERROR, "db: Spiderdb init failed." );
|
|
return 1;
|
|
}
|
|
|
|
// then doledb
|
|
if ( ! g_doledb.init() ) {
|
|
log( LOG_ERROR, "db: Doledb init failed." );
|
|
return 1;
|
|
}
|
|
|
|
// the spider cache used by SpiderLoop
|
|
if ( ! g_spiderCache.init() ) {
|
|
log( LOG_ERROR, "db: SpiderCache init failed." );
|
|
return 1;
|
|
}
|
|
|
|
// site clusterdb
|
|
if ( ! g_clusterdb.init() ) {
|
|
log( LOG_ERROR, "db: Clusterdb init failed." );
|
|
return 1;
|
|
}
|
|
|
|
// linkdb
|
|
if ( ! g_linkdb.init() ) {
|
|
log( LOG_ERROR, "db: Linkdb init failed." );
|
|
return 1;
|
|
}
|
|
|
|
// now clean the trees since all rdbs have loaded their rdb trees
|
|
// from disk, we need to remove bogus collection data from the trees
|
|
// like if a collection was deleted but the tree was never saved right it'll
|
|
// still have the collection's data in it
|
|
if ( ! g_collectiondb.addRdbBaseToAllRdbsForEachCollRec ( ) ) {
|
|
log("db: Collectiondb init failed." );
|
|
return 1;
|
|
}
|
|
|
|
//Load the high-frequency term shortcuts (if they exist)
|
|
g_hfts.load();
|
|
|
|
// test all collection dirs for write permission
|
|
int32_t pcount = 0;
|
|
for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
|
|
CollectionRec *cr = g_collectiondb.m_recs[i];
|
|
if ( ! cr ) continue;
|
|
if ( ++pcount >= 100 ) {
|
|
log("rdb: not checking directory permission for more than first 100 collections to save time.");
|
|
break;
|
|
}
|
|
char tt[1024 + MAX_COLL_LEN ];
|
|
sprintf ( tt , "%scoll.%s.%" PRId32, g_hostdb.m_dir, cr->m_coll , (int32_t)cr->m_collnum );
|
|
checkDirPerms ( tt ) ;
|
|
}
|
|
|
|
//
|
|
// NOTE: ANYTHING THAT USES THE PARSER SHOULD GO BELOW HERE, UCINIT!
|
|
//
|
|
|
|
// load the appropriate dictionaries
|
|
if ( ! g_speller.init() && g_conf.m_isLive ) {
|
|
return 1;
|
|
}
|
|
|
|
// Load the category language table
|
|
g_countryCode.loadHashTable();
|
|
|
|
// init minsitenuminlinks buffer
|
|
if ( ! g_tagdb.loadMinSiteInlinksBuffer() ) {
|
|
log("db: failed to load sitelinks.txt data");
|
|
return 1;
|
|
}
|
|
|
|
// . then our main udp server
|
|
// . must pass defaults since g_dns uses its own port/instance of it
|
|
// . server should listen to a socket and register with g_loop
|
|
// . sock read buf size is 40000000 and write buf size is 20000000
|
|
// . poll time is 20ms
|
|
// . if the read/write bufs are too small it severely degrades
|
|
// transmission times for big messages. just use ACK_WINDOW *
|
|
// MAX_DGRAM_SIZE as the size so when sending you don't drop dgrams
|
|
// . the 400k size allows us to cover Sync.cpp's activity well
|
|
if ( ! g_udpServer.init( g_hostdb.getMyPort() ,&g_dp,
|
|
40000000 , // readBufSIze
|
|
20000000 , // writeBufSize
|
|
20 , // pollTime in ms
|
|
3500 , // max udp slots
|
|
false )){ // is dns?
|
|
log("db: UdpServer init failed." ); return 1; }
|
|
|
|
// start pinging right away
|
|
if ( ! g_pingServer.init() ) {
|
|
log("db: PingServer init failed." ); return 1; }
|
|
|
|
// start up repair loop
|
|
if ( ! g_repair.init() ) {
|
|
log("db: Repair init failed." ); return 1; }
|
|
|
|
// start up daily merge loop
|
|
if ( ! g_dailyMerge.init() ) {
|
|
log("db: Daily merge init failed." ); return 1; }
|
|
|
|
// . then dns Distributed client
|
|
// . server should listen to a socket and register with g_loop
|
|
// . Only the distributed cache shall call the dns server.
|
|
if ( ! g_dns.init( h9->m_dnsClientPort ) ) {
|
|
log("db: Dns distributed client init failed." ); return 1; }
|
|
|
|
g_stable_summary_cache.configure(g_conf.m_stableSummaryCacheMaxAge, g_conf.m_stableSummaryCacheSize);
|
|
g_unstable_summary_cache.configure(g_conf.m_unstableSummaryCacheMaxAge, g_conf.m_unstableSummaryCacheSize);
|
|
|
|
// . then webserver
|
|
// . server should listen to a socket and register with g_loop
|
|
if ( ! g_httpServer.init( h9->m_httpPort, h9->m_httpsPort ) ) {
|
|
log("db: HttpServer init failed. Another gb already running?" );
|
|
// this is dangerous!!! do not do the shutdown thing
|
|
return 1;
|
|
}
|
|
|
|
if(!Msg1f::init()) {
|
|
log("logviewer: init failed.");
|
|
return 1;
|
|
}
|
|
|
|
// . now register all msg handlers with g_udp server
|
|
if ( ! registerMsgHandlers() ) {
|
|
log("db: registerMsgHandlers failed" ); return 1; }
|
|
|
|
// gb spellcheck
|
|
if ( strcmp ( cmd , "spellcheck" ) == 0 ) {
|
|
if ( argc != cmdarg + 2 ) goto printHelp; // take no other args
|
|
g_speller.test ( argv[cmdarg + 1] );
|
|
return 0;
|
|
}
|
|
|
|
// gb dictLookupTest
|
|
if ( strcmp ( cmd , "dictlookuptest" ) == 0 ) {
|
|
if ( argc != cmdarg + 2 ) goto printHelp; // take no other args
|
|
g_speller.dictLookupTest ( argv[cmdarg + 1] );
|
|
return 0;
|
|
}
|
|
|
|
// . register a callback to try to merge everything every 60 seconds
|
|
// . do not exit if we couldn't do this, not a huge deal
|
|
// . put this in here instead of Rdb.cpp because we don't want generator commands merging on us
|
|
// . niceness is 1
|
|
// BR: Upped from 2 sec to 60. No need to check for merge every 2 seconds.
|
|
if ( !g_loop.registerSleepCallback( 60000, (void *)1, attemptMergeAllCallback, 1, true ) ) {
|
|
log( LOG_WARN, "db: Failed to init merge sleep callback." );
|
|
}
|
|
|
|
// try to sync parms (and collection recs) with host 0
|
|
if ( !g_loop.registerSleepCallback(1000, NULL, tryToSyncWrapper, 0 ) ) {
|
|
return false;
|
|
}
|
|
|
|
if ( !Statistics::initialize() ) {
|
|
return false;
|
|
}
|
|
|
|
if(g_recoveryMode) {
|
|
//now that everything is init-ed send the message.
|
|
char buf[256];
|
|
log("admin: Sending emails.");
|
|
sprintf(buf, "Host %" PRId32" respawning after crash.(%s)",
|
|
h9->m_hostId, iptoa(g_hostdb.getMyIp()));
|
|
g_pingServer.sendEmail(NULL, buf);
|
|
}
|
|
|
|
// . start the spiderloop
|
|
// . comment out when testing SpiderCache
|
|
g_spiderLoop.startLoop();
|
|
|
|
// allow saving of conf again
|
|
g_conf.m_save = true;
|
|
|
|
// flush stats
|
|
//g_statsdb.flush();
|
|
|
|
// ok, now activate statsdb
|
|
g_statsdb.m_disabled = false;
|
|
|
|
log("db: gb is now ready");
|
|
|
|
// . now start g_loops main interrupt handling loop
|
|
// . it should block forever
|
|
// . when it gets a signal it dispatches to a server or db to handle it
|
|
g_loop.runLoop();
|
|
}
|
|
|
|
/// @todo ALC wouldn't it be faster to actually check the dir permission instead of trying to write a tmp file?
|
|
int32_t checkDirPerms ( char *dir ) {
|
|
if ( g_conf.m_readOnlyMode ) {
|
|
return 0;
|
|
}
|
|
|
|
File f;
|
|
f.set ( dir , "tmpfile" );
|
|
if ( ! f.open ( O_RDWR | O_CREAT | O_TRUNC ) ) {
|
|
log( LOG_ERROR, "disk: Unable to create %stmpfile. Need write permission in this directory.", dir );
|
|
return -1;
|
|
}
|
|
if ( ! f.unlink() ) {
|
|
log( LOG_ERROR, "disk: Unable to delete %stmpfile. Need write permission in this directory.", dir );
|
|
return -1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
// save them all
|
|
static void doCmdAll ( int fd, void *state ) ;
|
|
static bool s_sendToHosts;
|
|
static bool s_sendToProxies;
|
|
static int32_t s_hostId;
|
|
static int32_t s_hostId2;
|
|
static char s_buffer[128];
|
|
static HttpRequest s_r;
|
|
bool doCmd ( const char *cmd , int32_t hostId , const char *filename ,
|
|
bool sendToHosts , bool sendToProxies , int32_t hostId2 ) {
|
|
// need loop to work
|
|
if ( ! g_loop.init() ) {
|
|
log(LOG_WARN, "db: Loop init failed." );
|
|
return false;
|
|
}
|
|
// save it
|
|
// we are no part of it
|
|
//g_hostdb.m_hostId = -1;
|
|
// pass it on
|
|
s_hostId = hostId;
|
|
s_sendToHosts = sendToHosts;
|
|
s_sendToProxies = sendToProxies;
|
|
s_hostId2 = hostId2;
|
|
// set stuff so http server client-side works right
|
|
g_conf.m_httpMaxSockets = 512;
|
|
sprintf ( g_conf.m_spiderUserAgent ,"GigablastOpenSource/1.0");
|
|
sprintf ( g_conf.m_spiderBotName ,"gigablastopensource");
|
|
|
|
|
|
// register sleep callback to get started
|
|
if ( ! g_loop.registerSleepCallback(1, NULL, doCmdAll , 0 ) ) {
|
|
log(LOG_WARN, "admin: Loop init failed.");
|
|
return false;
|
|
}
|
|
// not it
|
|
log(LOG_INFO,"admin: broadcasting %s",cmd);
|
|
// make a fake http request
|
|
sprintf ( s_buffer , "GET /%s?%s HTTP/1.0" , filename , cmd );
|
|
TcpSocket sock; sock.m_ip = 0;
|
|
// make it local loopback so it passes the permission test in
|
|
// doCmdAll()'s call to convertHttpRequestToParmList
|
|
sock.m_ip = atoip("127.0.0.1");
|
|
s_r.set ( s_buffer , strlen ( s_buffer ) , &sock );
|
|
// do not do sig alarms! for now just set this to null so
|
|
// the sigalarmhandler doesn't core
|
|
//g_hostdb.m_myHost = NULL;
|
|
|
|
// run the loop
|
|
g_loop.runLoop();
|
|
}
|
|
|
|
// . completion callback that doCmdAll() hands to g_parms.broadcastParmList()
// . logs that the command finished and ends the process with status 0
// . "state" is not used
[[ noreturn ]] void doneCmdAll ( void *state ) {
	(void)state;
	log("cmd: completed command");
	exit ( 0 );
}
|
|
|
|
|
|
void doCmdAll ( int fd, void *state ) {
|
|
|
|
// do not keep calling it!
|
|
g_loop.unregisterSleepCallback ( NULL, doCmdAll );
|
|
|
|
// make port -1 to indicate none to listen on
|
|
if ( ! g_udpServer.init( 18123 , // port to listen on
|
|
&g_dp,
|
|
20000000 , // readBufSIze
|
|
20000000 , // writeBufSize
|
|
20 , // pollTime in ms
|
|
3500 , // max udp slots
|
|
false )){ // is dns?
|
|
log("db: UdpServer init on port 18123 failed: %s" ,
|
|
mstrerror(g_errno));
|
|
exit(0);
|
|
}
|
|
|
|
// udpserver::sendRequest() checks we have a handle for msgs we send!
|
|
// so fake it out with this lest it cores
|
|
g_udpServer.registerHandler(msg_type_3f,handleRequest3f);
|
|
|
|
|
|
SafeBuf parmList;
|
|
// returns false and sets g_errno on error
|
|
if (!g_parms.convertHttpRequestToParmList(&s_r,&parmList,0,NULL)){
|
|
log("cmd: error converting command: %s",mstrerror(g_errno));
|
|
exit(0);
|
|
}
|
|
|
|
if ( parmList.length() <= 0 ) {
|
|
log("cmd: no parmlist to send");
|
|
exit(0);
|
|
}
|
|
|
|
// restrict broadcast to this hostid range!
|
|
|
|
// returns true with g_errno set on error. uses g_udpServer
|
|
if ( g_parms.broadcastParmList ( &parmList ,
|
|
NULL ,
|
|
doneCmdAll , // callback when done
|
|
s_sendToHosts ,
|
|
s_sendToProxies ,
|
|
s_hostId , // -1 means all
|
|
s_hostId2 ) ) { // -1 means all
|
|
log("cmd: error sending command: %s",mstrerror(g_errno));
|
|
exit(0);
|
|
}
|
|
// wait for it
|
|
log("cmd: sent command");
|
|
}
|
|
|
|
// copy a collection from one network to another (defined by 2 hosts.conf's)
|
|
int collcopy ( char *newHostsConf , char *coll , int32_t collnum ) {
|
|
Hostdb hdb;
|
|
//if ( ! hdb.init(newHostsConf, 0/*assume we're zero*/) ) {
|
|
if ( ! hdb.init( 0/*assume we're zero*/) ) {
|
|
log("clusterCopy failed. Could not init hostdb with %s",
|
|
newHostsConf);
|
|
return -1;
|
|
}
|
|
// sanity check
|
|
if ( hdb.getNumShards() != g_hostdb.getNumShards() ) {
|
|
log("Hosts.conf files do not have same number of groups.");
|
|
return -1;
|
|
}
|
|
if ( hdb.getNumHosts() != g_hostdb.getNumHosts() ) {
|
|
log("Hosts.conf files do not have same number of hosts.");
|
|
return -1;
|
|
}
|
|
// host checks
|
|
for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
|
|
Host *h = &g_hostdb.m_hosts[i];
|
|
fprintf(stderr,"ssh %s '",iptoa(h->m_ip));
|
|
fprintf(stderr,"du -skc %scoll.%s.%" PRId32" | tail -1 '\n",
|
|
h->m_dir,coll,collnum);
|
|
}
|
|
// loop over dst hosts
|
|
for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
|
|
Host *h = &g_hostdb.m_hosts[i];
|
|
// get the src host from the provided hosts.conf
|
|
Host *h2 = &hdb.m_hosts[i];
|
|
// print the copy
|
|
//fprintf(stderr,"rcp %s:%s*db*.dat* ",
|
|
// iptoa( h->m_ip), h->m_dir );
|
|
fprintf(stderr,"nohup ssh %s '",iptoa(h->m_ip));
|
|
fprintf(stderr,"rcp -r ");
|
|
fprintf(stderr,"%s:%scoll.%s.%" PRId32" ",
|
|
iptoa(h2->m_ip), h2->m_dir , coll, collnum );
|
|
fprintf(stderr,"%s' &\n", h->m_dir );
|
|
//fprintf(stderr," rcp -p %s*.map* ", h->m_dir );
|
|
//fprintf(stderr," rcp -r %scoll.* ", h->m_dir );
|
|
//fprintf(stderr,"%s:%s " ,iptoa(h2->m_ip), h2->m_dir );
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
// generate the copies that need to be done to scale from oldhosts.conf
|
|
// to newhosts.conf topology.
|
|
int scale ( char *newHostsConf , bool useShotgunIp) {
|
|
|
|
g_hostdb.resetPortTables();
|
|
|
|
Hostdb hdb;
|
|
//if ( ! hdb.init(newHostsConf, 0/*assume we're zero*/) ) {
|
|
if ( ! hdb.init( 0/*assume we're zero*/) ) {
|
|
log("Scale failed. Could not init hostdb with %s",
|
|
newHostsConf);
|
|
return -1;
|
|
}
|
|
|
|
// ptrs to the two hostdb's
|
|
Hostdb *hdb1 = &g_hostdb;
|
|
Hostdb *hdb2 = &hdb;
|
|
|
|
// this function was made to scale UP, but if scaling down
|
|
// then swap them!
|
|
if ( hdb1->getNumHosts() > hdb2->getNumHosts() ) {
|
|
Hostdb *tmp = hdb1;
|
|
hdb1 = hdb2;
|
|
hdb2 = tmp;
|
|
}
|
|
|
|
// . ensure old hosts in g_hostdb are in a derivate groupId in
|
|
// newHostsConf
|
|
// . old hosts may not even be present! consider them the same host,
|
|
// though, if have same ip and working dir, because that would
|
|
// interfere with a file copy.
|
|
for ( int32_t i = 0 ; i < hdb1->getNumHosts() ; i++ ) {
|
|
Host *h = &hdb1->m_hosts[i];
|
|
// look in new guy
|
|
for ( int32_t j = 0 ; j < hdb2->getNumHosts() ; j++ ) {
|
|
Host *h2 = &hdb2->m_hosts[j];
|
|
// if a match, ensure same group
|
|
if ( h2->m_ip != h->m_ip ) continue;
|
|
if ( strcmp ( h2->m_dir , h->m_dir ) != 0 ) continue;
|
|
}
|
|
}
|
|
|
|
// . ensure that:
|
|
// (h2->m_groupId & (hdb1->m_numGroups -1)) == h->m_groupId
|
|
// where h2 is in a derivative group of h.
|
|
// . do a quick monte carlo test to make sure that a key in old
|
|
// group #0 maps to groups 0,8,16,24 for all keys and all dbs
|
|
for ( int32_t i = 0 ; i < 1000 ; i++ ) {
|
|
//key_t k;
|
|
//k.n1 = rand(); k.n0 = rand(); k.n0 <<= 32; k.n0 |= rand();
|
|
//key128_t k16;
|
|
//k16.n0 = k.n0;
|
|
//k16.n1 = rand(); k16.n1 <<= 32; k16.n1 |= k.n1;
|
|
char k[MAX_KEY_BYTES];
|
|
for ( int32_t ki = 0 ; ki < MAX_KEY_BYTES ; ki++ )
|
|
k[ki] = rand() & 0xff;
|
|
}
|
|
|
|
// . now copy all titleRecs in old hosts to all derivatives
|
|
// . going from 8 (3bits) hosts to 32 (5bits), for instance, old
|
|
// group id #0 would copy to group ids 0,8,16 and 24.
|
|
// . 000 --> 00000(#0), 01000(#8), 10000(#16), 11000(#24)
|
|
// . titledb determine groupId by mod'ding the docid
|
|
// contained in their most significant key bits with the number
|
|
// of groups. see Titledb.h::getGroupId(docid)
|
|
// . indexdb and tagdb mask the hi bits of the key with
|
|
// hdb1->m_groupMask, which is like a reverse mod'ding:
|
|
// 000 --> 00000, 00001, 00010, 00011
|
|
char done [ 8196 ];
|
|
memset ( done , 0 , 8196 );
|
|
for ( int32_t i = 0 ; i < hdb1->getNumHosts() ; i++ ) {
|
|
Host *h = &hdb1->m_hosts[i];
|
|
char flag = 0;
|
|
// look in new guy
|
|
for ( int32_t j = 0 ; j < hdb2->getNumHosts() ; j++ ) {
|
|
Host *h2 = &hdb2->m_hosts[j];
|
|
// do not copy to oneself
|
|
if ( h2->m_ip == h->m_ip &&
|
|
strcmp ( h2->m_dir , h->m_dir ) == 0 ) continue;
|
|
// skip if not derivative groupId for titledb
|
|
//if ( (h2->m_groupId & hdb1->m_groupMask) !=
|
|
// h->m_groupId ) continue;
|
|
// continue if already copying to here
|
|
if ( done[j] ) continue;
|
|
// mark as done
|
|
done[j] = 1;
|
|
|
|
// skip local copies for now!!
|
|
//if ( h->m_ip == h2->m_ip ) continue;
|
|
|
|
// use ; separator
|
|
if ( flag ) fprintf(stderr,"; ");
|
|
//else fprintf(stderr,"ssh %s \"",iptoa(h->m_ip));
|
|
else fprintf(stderr,"ssh %s \"",h->m_hostname);
|
|
// flag
|
|
flag = 1;
|
|
// print the copy
|
|
//fprintf(stderr,"rcp %s:%s*db*.dat* ",
|
|
// iptoa( h->m_ip), h->m_dir );
|
|
// if same ip then do a 'cp' not rcp
|
|
const char *cmd = "rcp -r";
|
|
if ( h->m_ip == h2->m_ip ) cmd = "cp -pr";
|
|
|
|
fprintf(stderr,"%s %s*db*.dat* ", cmd, h->m_dir );
|
|
|
|
if ( h->m_ip == h2->m_ip )
|
|
fprintf(stderr,"%s ;", h2->m_dir );
|
|
else {
|
|
//int32_t ip = h2->m_ip;
|
|
//if ( useShotgunIp ) ip = h2->m_ipShotgun;
|
|
//fprintf(stderr,"%s:%s ;",iptoa(ip), h2->m_dir );
|
|
char *hn = h2->m_hostname;
|
|
if ( useShotgunIp ) hn = h2->m_hostname;//2
|
|
fprintf(stderr,"%s:%s ;",hn, h2->m_dir );
|
|
|
|
}
|
|
|
|
//fprintf(stderr," rcp -p %s*.map* ", h->m_dir );
|
|
fprintf(stderr," %s %scoll.* ", cmd, h->m_dir );
|
|
|
|
if ( h->m_ip == h2->m_ip )
|
|
fprintf(stderr,"%s " , h2->m_dir );
|
|
else {
|
|
//int32_t ip = h2->m_ip;
|
|
//if ( useShotgunIp ) ip = h2->m_ipShotgun;
|
|
//fprintf(stderr,"%s:%s " ,iptoa(ip), h2->m_dir );
|
|
char *hn = h2->m_hostname;
|
|
if ( useShotgunIp ) hn = h2->m_hostname;//2;
|
|
fprintf(stderr,"%s:%s " ,hn, h2->m_dir );
|
|
}
|
|
|
|
/*
|
|
fprintf(stderr,"scp %s:%s/titledb* %s:%s\n",
|
|
iptoa( h->m_ip), h->m_dir ,
|
|
iptoa(h2->m_ip), h2->m_dir );
|
|
fprintf(stderr,"scp %s:%s/indexdb* %s:%s\n",
|
|
iptoa( h->m_ip), h->m_dir ,
|
|
iptoa(h2->m_ip), h2->m_dir );
|
|
fprintf(stderr,"scp %s:%s/spiderdb* %s:%s\n",
|
|
iptoa( h->m_ip), h->m_dir ,
|
|
iptoa(h2->m_ip), h2->m_dir );
|
|
fprintf(stderr,"scp %s:%s/clusterdb* %s:%s\n",
|
|
iptoa( h->m_ip), h->m_dir ,
|
|
iptoa(h2->m_ip), h2->m_dir );
|
|
fprintf(stderr,"scp %s:%s/tagdb* %s:%s\n",
|
|
iptoa( h->m_ip), h->m_dir ,
|
|
iptoa(h2->m_ip), h2->m_dir );
|
|
*/
|
|
}
|
|
if ( flag ) fprintf(stderr,"\" &\n");
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
|
|
static int install_file(const char *dst_host, const char *src_file, const char *dst_file)
|
|
{
|
|
char cmd[1024];
|
|
sprintf(cmd, "scp -p %s %s:%s",
|
|
src_file,
|
|
dst_host,
|
|
dst_file);
|
|
log(LOG_INIT,"admin: %s", cmd);
|
|
int rc = system(cmd);
|
|
return rc;
|
|
}
|
|
|
|
|
|
static int install_file(const char *file)
|
|
{
|
|
for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
|
|
Host *h2 = g_hostdb.getHost(i);
|
|
if(h2==g_hostdb.getMyShard())
|
|
continue; //skip ourselves
|
|
char full_dst_file[1024];
|
|
sprintf(full_dst_file, "%s%s",h2->m_dir,file);
|
|
install_file(iptoa(h2->m_ip),
|
|
file,
|
|
full_dst_file);
|
|
}
|
|
return 0; //return value is unclear
|
|
}
|
|
|
|
|
|
// installFlag is 1 if we are really installing, 2 if just starting up gb's
|
|
// installFlag should be a member of the ifk_ enum defined above
|
|
static int install ( install_flag_konst_t installFlag, int32_t hostId, char *dir, int32_t hostId2, char *cmd ) {
|
|
|
|
// use hostId2 to indicate the range hostId-hostId2, but if it is -1
|
|
// then it was not given, so restrict to just hostId
|
|
if ( hostId2 == -1 ) {
|
|
hostId2 = hostId;
|
|
}
|
|
|
|
char tmp[1024];
|
|
if ( installFlag == ifk_proxy_start ) {
|
|
for ( int32_t i = 0; i < g_hostdb.m_numProxyHosts; i++ ) {
|
|
Host *h2 = g_hostdb.getProxy(i);
|
|
// limit install to this hostId if it is >= 0
|
|
if ( hostId >= 0 && h2->m_hostId != hostId ) continue;
|
|
|
|
// . assume conf file name gbHID.conf
|
|
// . assume working dir ends in a '/'
|
|
//to test add: ulimit -t 10; to the ssh cmd
|
|
sprintf(tmp,
|
|
"ssh %s \"cd %s ; "
|
|
"export MALLOC_CHECK_=0;"
|
|
"cp -f gb gb.oldsave ; "
|
|
"mv -f gb.installed gb ; "
|
|
"ADDARGS='' ; "
|
|
"EXITSTATUS=1 ; "
|
|
"while [ \\$EXITSTATUS != 0 ]; do "
|
|
"{ "
|
|
"./gb proxy load %" PRId32" " // mdw
|
|
"\\$ADDARGS "
|
|
" >& ./proxylog ;"
|
|
"EXITSTATUS=\\$? ; "
|
|
"ADDARGS='-r' ; "
|
|
"} "
|
|
"done >& /dev/null & \" & ",
|
|
iptoa(h2->m_ip),
|
|
h2->m_dir ,
|
|
h2->m_hostId );
|
|
// log it
|
|
log(LOG_INIT,"admin: %s", tmp);
|
|
// execute it
|
|
int32_t ret = system ( tmp );
|
|
if ( ret < 0 ) {
|
|
fprintf(stderr,"Error loading proxy: %s\n",
|
|
mstrerror(errno));
|
|
exit(-1);
|
|
}
|
|
fprintf(stderr,"If proxy does not start, make sure "
|
|
"its ip is correct in hosts.conf\n");
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
HashTableX iptab;
|
|
char tmpBuf[2048];
|
|
iptab.set(4,4,64,tmpBuf,2048,true,0,"iptsu");
|
|
|
|
int32_t maxOut = 500;
|
|
|
|
// this is a big scp so only do two at a time...
|
|
if ( installFlag == ifk_install ) maxOut = 1;
|
|
if ( installFlag == ifk_installgb ) maxOut = 4;
|
|
|
|
// go through each host
|
|
for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
|
|
Host *h2 = g_hostdb.getHost(i);
|
|
|
|
const char *amp = " ";
|
|
|
|
// if i is NOT multiple of maxOut then use '&'
|
|
// even if all all different machines (IPs) scp chokes and so
|
|
// does rcp a little. so restrict to maxOut at a time.
|
|
if ( (i+1) % maxOut ) {
|
|
amp = "&";
|
|
}
|
|
|
|
// limit install to this hostId if it is >= 0
|
|
//if ( hostId >= 0 && h2->m_hostId != hostId ) continue;
|
|
if ( hostId >= 0 && hostId2 == -1 ) {
|
|
if ( h2->m_hostId != hostId )
|
|
continue;
|
|
}
|
|
// if doing a range of hostid, hostId2 is >= 0
|
|
else if ( hostId >= 0 && hostId2 >= 0 ) {
|
|
if ( h2->m_hostId < hostId || h2->m_hostId > hostId2 )
|
|
continue;
|
|
}
|
|
|
|
// backupcopy
|
|
if ( installFlag == ifk_backupcopy ) {
|
|
sprintf(tmp,
|
|
"ssh %s \"cd %s ; "
|
|
"mkdir %s ; "
|
|
"cp -ai *.dat* *.map gb.conf "
|
|
"hosts.conf %s\" &",
|
|
iptoa(h2->m_ip), h2->m_dir , dir , dir );
|
|
// log it
|
|
log ( "%s", tmp);
|
|
// execute it
|
|
system ( tmp );
|
|
continue;
|
|
}
|
|
// backupmove
|
|
else if ( installFlag == ifk_backupmove ) {
|
|
sprintf(tmp,
|
|
"ssh %s \"cd %s ; "
|
|
"mkdir %s ; "
|
|
"mv -i *.dat* *.map "
|
|
"%s\" &",
|
|
iptoa(h2->m_ip), h2->m_dir , dir , dir );
|
|
// log it
|
|
log ( "%s", tmp);
|
|
// execute it
|
|
system ( tmp );
|
|
continue;
|
|
}
|
|
// backuprestore
|
|
else if ( installFlag == ifk_backuprestore ) {
|
|
sprintf(tmp,
|
|
"ssh %s \"cd %s ; cd %s ; "
|
|
"mv -i *.dat* *.map gb.conf "
|
|
"hosts.conf %s\" &",
|
|
iptoa(h2->m_ip), h2->m_dir , dir , h2->m_dir );
|
|
// log it
|
|
log ( "%s", tmp);
|
|
// execute it
|
|
system ( tmp );
|
|
continue;
|
|
}
|
|
|
|
const char *dir = "./";
|
|
// install to it
|
|
if ( installFlag == ifk_install ) {
|
|
const char *srcDir = "./";
|
|
SafeBuf fileListBuf;
|
|
g_process.getFilesToCopy ( srcDir , &fileListBuf );
|
|
|
|
fileListBuf.safePrintf(" %shosts.conf",srcDir);
|
|
fileListBuf.safePrintf(" %sgb.conf",srcDir);
|
|
|
|
char *ipStr = iptoa(h2->m_ip);
|
|
|
|
SafeBuf tmpBuf;
|
|
tmpBuf.safePrintf(
|
|
// ensure directory is there, if
|
|
// not then make it
|
|
"ssh %s 'mkdir %s' ; "
|
|
"scp -p -r %s %s:%s"
|
|
, ipStr
|
|
, h2->m_dir
|
|
|
|
, fileListBuf.getBufStart()
|
|
, iptoa(h2->m_ip)
|
|
, h2->m_dir
|
|
);
|
|
char *tmp = tmpBuf.getBufStart();
|
|
log(LOG_INIT,"admin: %s", tmp);
|
|
system ( tmp );
|
|
}
|
|
else if ( installFlag == ifk_installgb ) {
|
|
File f;
|
|
const char *target = "gb.new";
|
|
f.set(g_hostdb.m_myHost->m_dir,target);
|
|
if ( ! f.doesExist() ) target = "gb";
|
|
|
|
sprintf(tmp,
|
|
"scp -p " // blowfish is faster
|
|
"%s%s "
|
|
"%s:%s/gb.installed%s",
|
|
dir,
|
|
target,
|
|
iptoa(h2->m_ip),
|
|
h2->m_dir,
|
|
amp);
|
|
log(LOG_INIT,"admin: %s", tmp);
|
|
system ( tmp );
|
|
}
|
|
else if ( installFlag == ifk_installtmpgb ) {
|
|
sprintf(tmp,
|
|
"scp -p "
|
|
"%sgb.new "
|
|
"%s:%s/tmpgb.installed &",
|
|
dir,
|
|
iptoa(h2->m_ip),
|
|
h2->m_dir);
|
|
log(LOG_INIT,"admin: %s", tmp);
|
|
system ( tmp );
|
|
}
|
|
else if ( installFlag == ifk_installconf ) {
|
|
sprintf(tmp,
|
|
"scp -p %sgb.conf %shosts.conf %s:%s %s",
|
|
dir ,
|
|
dir ,
|
|
//h->m_hostId ,
|
|
iptoa(h2->m_ip),
|
|
h2->m_dir,
|
|
//h2->m_hostId);
|
|
amp);
|
|
|
|
log(LOG_INIT,"admin: %s", tmp);
|
|
system ( tmp );
|
|
}
|
|
// start up a dummy cluster using hosts.conf ports + 1
|
|
else if ( installFlag == ifk_tmpstart ) {
|
|
// . assume conf file name gbHID.conf
|
|
// . assume working dir ends in a '/'
|
|
sprintf(tmp,
|
|
"ssh %s \"cd %s ; "
|
|
"cp -f tmpgb tmpgb.oldsave ; "
|
|
"mv -f tmpgb.installed tmpgb ; "
|
|
"%s/tmpgb tmpstarthost "
|
|
"%" PRId32" >& ./tmplog%03" PRId32" &\" &",
|
|
iptoa(h2->m_ip),
|
|
h2->m_dir ,
|
|
h2->m_dir ,
|
|
h2->m_hostId ,
|
|
h2->m_hostId );
|
|
// log it
|
|
log(LOG_INIT,"admin: %s", tmp);
|
|
// execute it
|
|
system ( tmp );
|
|
}
|
|
else if ( installFlag == ifk_start ) {
|
|
sprintf( tmp, "ssh %s '%sgbstart.sh %" PRId32"' %s", iptoa(h2->m_ip), h2->m_dir, h2->m_hostId, amp );
|
|
|
|
// log it
|
|
fprintf(stdout,"admin: %s\n", tmp);
|
|
|
|
// execute it
|
|
system ( tmp );
|
|
}
|
|
// dsh
|
|
else if ( installFlag == ifk_dsh ) {
|
|
sprintf(tmp,
|
|
"ssh %s 'cd %s ; %s' %s",
|
|
iptoa(h2->m_ip),
|
|
h2->m_dir,
|
|
cmd ,
|
|
amp );
|
|
log(LOG_INIT,"admin: %s", tmp);
|
|
system ( tmp );
|
|
}
|
|
// dsh2
|
|
else if ( installFlag == ifk_dsh2 ) {
|
|
sprintf(tmp,
|
|
"ssh %s 'cd %s ; %s'",
|
|
iptoa(h2->m_ip),
|
|
h2->m_dir,
|
|
cmd );
|
|
log(LOG_INIT,"admin: %s", tmp);
|
|
system ( tmp );
|
|
}
|
|
// installconf2
|
|
else if ( installFlag == ifk_installconf2 ) {
|
|
sprintf(tmp,
|
|
"rcp %sgb.conf %shosts.conf %shosts2.conf "
|
|
"%s:%s &",
|
|
dir ,
|
|
dir ,
|
|
dir ,
|
|
iptoa(h2->m_ipShotgun),
|
|
h2->m_dir);
|
|
log(LOG_INIT,"admin: %s", tmp);
|
|
system ( tmp );
|
|
}
|
|
}
|
|
// return 0 on success
|
|
return 0;
|
|
}
|
|
|
|
bool registerMsgHandlers ( ) {
|
|
if (! registerMsgHandlers1()) return false;
|
|
if (! registerMsgHandlers2()) return false;
|
|
if (! registerMsgHandlers3()) return false;
|
|
if ( ! g_pingServer.registerHandler() ) return false;
|
|
|
|
// in SpiderProxy.cpp...
|
|
initSpiderProxyStuff();
|
|
return true;
|
|
}
|
|
|
|
bool registerMsgHandlers1(){
|
|
if ( ! Msg20::registerHandler()) return false;
|
|
if ( ! MsgC::registerHandler()) return false;
|
|
|
|
if ( ! Msg22::registerHandler() ) return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool registerMsgHandlers2(){
|
|
if ( ! Msg0::registerHandler()) return false;
|
|
if ( ! Msg1::registerHandler()) return false;
|
|
|
|
if ( ! Msg13::registerHandler() ) return false;
|
|
|
|
if ( ! g_udpServer.registerHandler(msg_type_c1,handleRequestc1)) return false;
|
|
if ( ! Msg39::registerHandler()) return false;
|
|
|
|
if ( ! Msg4::registerHandler() ) return false;
|
|
|
|
if(! g_udpServer.registerHandler(msg_type_3e,handleRequest3e)) return false;
|
|
if(! g_udpServer.registerHandler(msg_type_3f,handleRequest3f)) return false;
|
|
|
|
if ( ! g_udpServer.registerHandler(msg_type_25,handleRequest25)) return false;
|
|
if ( ! g_udpServer.registerHandler(msg_type_7,handleRequest7)) return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool registerMsgHandlers3(){
|
|
if ( ! Msg40::registerHandler() ) return false;
|
|
return true;
|
|
}
|
|
|
|
#include "Rdb.h"
|
|
#include "Xml.h"
|
|
|
|
//
|
|
// dump routines here now
|
|
//
|
|
|
|
void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree,
|
|
int64_t docid , bool justPrintDups) {
|
|
|
|
if(startFileNum!=0 && numFiles<0) {
|
|
//this may apply to all files, but I haven't checked into hash-based ones yet
|
|
fprintf(stderr,"If <startFileNum> is specified then <numFiles> must be too\n");
|
|
return;
|
|
}
|
|
if (!ucInit(g_hostdb.m_dir)) {
|
|
log("Unicode initialization failed!");
|
|
return;
|
|
}
|
|
// init our table for doing zobrist hashing
|
|
if ( ! hashinit() ) {
|
|
log("db: Failed to init hashtable." ); return ; }
|
|
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
|
|
//g_conf.m_spiderdbMaxDiskPageCacheMem = 0;
|
|
g_titledb.init ();
|
|
//g_collectiondb.init(true);
|
|
g_titledb.getRdb()->addRdbBase1(coll);
|
|
key_t startKey ;
|
|
key_t endKey ;
|
|
key_t lastKey ;
|
|
startKey.setMin();
|
|
endKey.setMax();
|
|
lastKey.setMin();
|
|
startKey = g_titledb.makeFirstKey ( docid );
|
|
// turn off threads
|
|
g_jobScheduler.disallow_new_jobs();
|
|
// get a meg at a time
|
|
int32_t minRecSizes = 1024*1024;
|
|
Msg5 msg5;
|
|
RdbList list;
|
|
int64_t prevId = 0LL;
|
|
int32_t count = 0;
|
|
char ttt[2048+MAX_URL_LEN];
|
|
HashTableX dedupTable;
|
|
dedupTable.set(4,0,10000,NULL,0,false,0,"maintitledb");
|
|
//g_synonyms.init();
|
|
// load the appropriate dictionaries -- why???
|
|
//g_speller.init();
|
|
|
|
// make this
|
|
XmlDoc *xd;
|
|
try { xd = new (XmlDoc); }
|
|
catch ( ... ) {
|
|
fprintf(stdout,"could not alloc for xmldoc\n");
|
|
exit(-1);
|
|
}
|
|
CollectionRec *cr = g_collectiondb.getRec(coll);
|
|
if(cr==NULL) {
|
|
fprintf(stderr,"Unknown collection '%s'\n", coll);
|
|
return;
|
|
}
|
|
|
|
loop:
|
|
// use msg5 to get the list, should ALWAYS block since no threads
|
|
if ( ! msg5.getList ( RDB_TITLEDB ,
|
|
cr->m_collnum ,
|
|
&list ,
|
|
startKey ,
|
|
endKey ,
|
|
minRecSizes ,
|
|
includeTree ,
|
|
0 , // max cache age
|
|
startFileNum ,
|
|
numFiles ,
|
|
NULL , // state
|
|
NULL , // callback
|
|
0 , // niceness
|
|
false , // err correction?
|
|
NULL , // cache key ptr
|
|
0 , // retry num
|
|
-1 , // maxRetries
|
|
true , // compensate for merge
|
|
-1LL, // sync point
|
|
false, // isRealMerge
|
|
true)) // allowPageCache
|
|
{
|
|
log(LOG_LOGIC,"db: getList did not block.");
|
|
return;
|
|
}
|
|
// all done if empty
|
|
if ( list.isEmpty() ) return;
|
|
|
|
// loop over entries in list
|
|
for ( list.resetListPtr() ; ! list.isExhausted() ;
|
|
list.skipCurrentRecord() ) {
|
|
key_t k = list.getCurrentKey();
|
|
char *rec = list.getCurrentRec();
|
|
int32_t recSize = list.getCurrentRecSize();
|
|
int64_t docId = g_titledb.getDocIdFromKey ( k );
|
|
if ( k <= lastKey )
|
|
log("key out of order. "
|
|
"lastKey.n1=%" PRIx32" n0=%" PRIx64" "
|
|
"currKey.n1=%" PRIx32" n0=%" PRIx64" ",
|
|
lastKey.n1,lastKey.n0,
|
|
k.n1,k.n0);
|
|
lastKey = k;
|
|
int32_t shard = g_hostdb.getShardNum ( RDB_TITLEDB , &k );
|
|
// print deletes
|
|
if ( (k.n0 & 0x01) == 0) {
|
|
fprintf(stdout,"n1=%08" PRIx32" n0=%016" PRIx64" docId=%012" PRId64" "
|
|
"shard=%" PRId32" (del)\n",
|
|
k.n1 , k.n0 , docId , shard );
|
|
continue;
|
|
}
|
|
// free the mem
|
|
xd->reset();
|
|
// uncompress the title rec
|
|
//TitleRec tr;
|
|
if ( ! xd->set2 ( rec , recSize , coll ,NULL , 0 ) )
|
|
continue;
|
|
|
|
// extract the url
|
|
Url *u = xd->getFirstUrl();
|
|
|
|
// get ip
|
|
char ipbuf [ 32 ];
|
|
strcpy ( ipbuf , iptoa(u->getIp() ) );
|
|
// pad with spaces
|
|
int32_t blen = strlen(ipbuf);
|
|
while ( blen < 15 ) ipbuf[blen++]=' ';
|
|
ipbuf[blen]='\0';
|
|
//int32_t nc = xd->size_catIds / 4;//tr.getNumCatids();
|
|
if ( justPrintDups ) {
|
|
// print into buf
|
|
if ( docId != prevId ) {
|
|
time_t ts = xd->m_spideredTime;//tr.getSpiderDa
|
|
struct tm tm_buf;
|
|
struct tm *timeStruct = localtime_r(&ts,&tm_buf);
|
|
//struct tm *timeStruct = gmtime_r(&ts,&tm_buf);
|
|
char ppp[100];
|
|
strftime(ppp,100,"%b-%d-%Y-%H:%M:%S",
|
|
timeStruct);
|
|
LinkInfo *info = xd->ptr_linkInfo1;//tr.ge
|
|
char foo[1024];
|
|
foo[0] = '\0';
|
|
//if ( tr.getVersion() >= 86 )
|
|
sprintf(foo,
|
|
//"tw=%" PRId32" hw=%" PRId32" upw=%" PRId32" "
|
|
"sni=%" PRId32" ",
|
|
//(int32_t)xd->m_titleWeight,
|
|
//(int32_t)xd->m_headerWeight,
|
|
//(int32_t)xd->m_urlPathWeight,
|
|
(int32_t)xd->m_siteNumInlinks);
|
|
const char *ru = xd->ptr_redirUrl;
|
|
if ( ! ru ) ru = "";
|
|
sprintf(ttt,
|
|
"n1=%08" PRIx32" n0=%016" PRIx64" docId=%012" PRId64" "
|
|
//hh=%07" PRIx32" ch=%08" PRIx32" "
|
|
"size=%07" PRId32" "
|
|
"ch32=%010" PRIu32" "
|
|
"clen=%07" PRId32" "
|
|
"cs=%04d "
|
|
"lang=%02d "
|
|
"sni=%03" PRId32" "
|
|
"usetimeaxis=%i "
|
|
//"cats=%" PRId32" "
|
|
"lastspidered=%s "
|
|
"ip=%s "
|
|
"numLinkTexts=%04" PRId32" "
|
|
"%s"
|
|
"version=%02" PRId32" "
|
|
//"maxLinkTextWeight=%06" PRIu32"%% "
|
|
"hc=%" PRId32" "
|
|
"redir=%s "
|
|
"url=%s "
|
|
"firstdup=1 "
|
|
"shard=%" PRId32" "
|
|
"\n",
|
|
k.n1 , k.n0 ,
|
|
//rec[0] ,
|
|
docId ,
|
|
//hostHash ,
|
|
//contentHash ,
|
|
recSize - 16 ,
|
|
xd->m_contentHash32,
|
|
xd->size_utf8Content,//tr.getContentLen
|
|
xd->m_charset,//tr.getCharset(),
|
|
xd->m_langId,//tr.getLanguage(),
|
|
(int32_t)xd->m_siteNumInlinks,//tr.getDo
|
|
xd->m_useTimeAxis,
|
|
//nc,
|
|
ppp,
|
|
iptoa(xd->m_ip),//ipbuf ,
|
|
info->getNumGoodInlinks(),
|
|
foo,
|
|
(int32_t)xd->m_version,
|
|
//ms,
|
|
(int32_t)xd->m_hopCount,
|
|
ru,
|
|
u->getUrl() ,
|
|
shard );
|
|
prevId = docId;
|
|
count = 0;
|
|
continue;
|
|
}
|
|
// print previous docid that is same as our
|
|
if ( count++ == 0 ) printf ( "\n%s" , ttt );
|
|
}
|
|
// nice, this is never 0 for a titlerec, so we can use 0 to signal
|
|
// that the following bytes are not compressed, and we can store
|
|
// out special checksum vector there for fuzzy deduping.
|
|
//if ( rec[0] != 0 ) continue;
|
|
// print it out
|
|
//printf("n1=%08" PRIx32" n0=%016" PRIx64" b=0x%02hhx docId=%012" PRId64" sh=%07" PRIx32" ch=%08" PRIx32" "
|
|
// date indexed as local time, not GMT/UTC
|
|
time_t ts = xd->m_spideredTime;//tr.getSpiderDate();
|
|
struct tm tm_buf;
|
|
struct tm *timeStruct = localtime_r(&ts,&tm_buf);
|
|
//struct tm *timeStruct = gmtime_r(&ts,&tm_buf);
|
|
char ppp[100];
|
|
strftime(ppp,100,"%b-%d-%Y-%H:%M:%S",timeStruct);
|
|
|
|
LinkInfo *info = xd->ptr_linkInfo1;//tr.getLinkInfo();
|
|
|
|
char foo[1024];
|
|
foo[0] = '\0';
|
|
sprintf(foo,
|
|
"sni=%" PRId32" ",
|
|
(int32_t)xd->m_siteNumInlinks);
|
|
|
|
const char *ru = xd->ptr_redirUrl;
|
|
if ( ! ru ) ru = "";
|
|
|
|
fprintf(stdout,
|
|
"n1=%08" PRIx32" n0=%016" PRIx64" docId=%012" PRId64" "
|
|
"size=%07" PRId32" "
|
|
"ch32=%010" PRIu32" "
|
|
"clen=%07" PRId32" "
|
|
"cs=%04d "
|
|
"ctype=%s "
|
|
"lang=%02d "
|
|
"sni=%03" PRId32" "
|
|
"usetimeaxis=%i "
|
|
"lastspidered=%s "
|
|
"ip=%s "
|
|
"numLinkTexts=%04" PRId32" "
|
|
"%s"
|
|
"version=%02" PRId32" "
|
|
"hc=%" PRId32" "
|
|
"shard=%" PRId32" "
|
|
"metadatasize=%" PRId32" "
|
|
"redir=%s "
|
|
"url=%s\n",
|
|
k.n1 , k.n0 ,
|
|
docId ,
|
|
recSize - 16 ,
|
|
xd->m_contentHash32,
|
|
xd->size_utf8Content,//tr.getContentLen() ,
|
|
xd->m_charset,//tr.getCharset(),
|
|
g_contentTypeStrings[xd->m_contentType],
|
|
xd->m_langId,//tr.getLanguage(),
|
|
(int32_t)xd->m_siteNumInlinks,//tr.getDocQuality(),
|
|
xd->m_useTimeAxis,
|
|
ppp,
|
|
iptoa(xd->m_ip),//ipbuf ,
|
|
info->getNumGoodInlinks(),
|
|
foo,
|
|
(int32_t)xd->m_version,
|
|
(int32_t)xd->m_hopCount,
|
|
shard,
|
|
0,
|
|
ru,
|
|
u->getUrl() );
|
|
// free the mem
|
|
xd->reset();
|
|
}
|
|
startKey = *(key_t *)list.getLastKey();
|
|
startKey += (uint32_t) 1;
|
|
// watch out for wrap around
|
|
if ( startKey < *(key_t *)list.getLastKey() ) return;
|
|
goto loop;
|
|
}
|
|
|
|
void dumpWaitingTree (const char *coll ) {
|
|
RdbTree wt;
|
|
if (!wt.set(0,-1,20000000,true,"waittree2", false,"waitingtree",sizeof(key_t))) {
|
|
return;
|
|
}
|
|
|
|
collnum_t collnum = g_collectiondb.getCollnum ( coll );
|
|
// make dir
|
|
char dir[500];
|
|
sprintf(dir, "%scoll.%s.%" PRId32, g_hostdb.m_dir, coll, (int32_t)collnum);
|
|
|
|
// load in the waiting tree, IPs waiting to get into doledb
|
|
BigFile file;
|
|
file.set ( dir , "waitingtree-saved.dat" , NULL );
|
|
bool treeExists = file.doesExist() > 0;
|
|
// load the table with file named "THISDIR/saved"
|
|
RdbMem wm;
|
|
if ( treeExists && ! wt.fastLoad(&file,&wm) ) return;
|
|
// the the waiting tree
|
|
int32_t node = wt.getFirstNode();
|
|
for ( ; node >= 0 ; node = wt.getNextNode(node) ) {
|
|
// breathe
|
|
QUICKPOLL(MAX_NICENESS);
|
|
// get key
|
|
key_t *key = (key_t *)wt.getKey(node);
|
|
// get ip from that
|
|
int32_t firstIp = (key->n0) & 0xffffffff;
|
|
// get the time
|
|
uint64_t spiderTimeMS = key->n1;
|
|
// shift upp
|
|
spiderTimeMS <<= 32;
|
|
// or in
|
|
spiderTimeMS |= (key->n0 >> 32);
|
|
// get the rest of the data
|
|
fprintf(stdout,"time=%" PRIu64" firstip=%s\n",
|
|
spiderTimeMS,
|
|
iptoa(firstIp));
|
|
}
|
|
}
|
|
|
|
|
|
void dumpDoledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree){
|
|
g_doledb.init ();
|
|
g_doledb.getRdb()->addRdbBase1(coll );
|
|
key_t startKey ;
|
|
key_t endKey ;
|
|
startKey.setMin();
|
|
endKey.setMax();
|
|
// turn off threads
|
|
g_jobScheduler.disallow_new_jobs();
|
|
// get a meg at a time
|
|
int32_t minRecSizes = 1024*1024;
|
|
Msg5 msg5;
|
|
RdbList list;
|
|
key_t oldk; oldk.setMin();
|
|
CollectionRec *cr = g_collectiondb.getRec(coll);
|
|
loop:
|
|
// use msg5 to get the list, should ALWAYS block since no threads
|
|
if ( ! msg5.getList ( RDB_DOLEDB ,
|
|
cr->m_collnum ,
|
|
&list ,
|
|
startKey ,
|
|
endKey ,
|
|
minRecSizes ,
|
|
includeTree ,
|
|
0 , // max cache age
|
|
startFileNum ,
|
|
numFiles ,
|
|
NULL , // state
|
|
NULL , // callback
|
|
0 , // niceness
|
|
false , // err correction?
|
|
NULL, // cacheKeyPtr
|
|
0, // retryNum
|
|
-1, // maxRetries
|
|
true, // compensateForMerge
|
|
-1, // syncPoint
|
|
false, // isRealMerge
|
|
true)) // allowPageCache
|
|
{
|
|
log(LOG_LOGIC,"db: getList did not block.");
|
|
return;
|
|
}
|
|
// all done if empty
|
|
if ( list.isEmpty() ) return;
|
|
// loop over entries in list
|
|
for ( list.resetListPtr() ; ! list.isExhausted() ;
|
|
list.skipCurrentRecord() ) {
|
|
key_t k = list.getCurrentKey();
|
|
if ( oldk > k )
|
|
fprintf(stdout,"got bad key order. "
|
|
"%" PRIx32"/%" PRIx64" > %" PRIx32"/%" PRIx64"\n",
|
|
oldk.n1,oldk.n0,k.n1,k.n0);
|
|
oldk = k;
|
|
// get it
|
|
char *drec = list.getCurrentRec();
|
|
// sanity check
|
|
if ( (drec[0] & 0x01) == 0x00 ) {g_process.shutdownAbort(true); }
|
|
// get spider rec in it
|
|
char *srec = drec + 12 + 4;
|
|
// print doledb info first then spider request
|
|
fprintf(stdout,"dolekey=%s (n1=%" PRIu32" n0=%" PRIu64") "
|
|
"pri=%" PRId32" "
|
|
"spidertime=%" PRIu32" "
|
|
"uh48=0x%" PRIx64"\n",
|
|
KEYSTR(&k,12),
|
|
k.n1,
|
|
k.n0,
|
|
(int32_t)g_doledb.getPriority(&k),
|
|
g_doledb.getSpiderTime(&k),
|
|
g_doledb.getUrlHash48(&k));
|
|
fprintf(stdout,"spiderkey=");
|
|
// print it
|
|
g_spiderdb.print ( srec );
|
|
// the \n
|
|
printf("\n");
|
|
// must be a request -- for now, for stats
|
|
if ( ! g_spiderdb.isSpiderRequest((key128_t *)srec) ) {
|
|
// error!
|
|
continue;
|
|
}
|
|
// cast it
|
|
SpiderRequest *sreq = (SpiderRequest *)srec;
|
|
// skip negatives
|
|
if ( (sreq->m_key.n0 & 0x01) == 0x00 ) {
|
|
g_process.shutdownAbort(true); }
|
|
}
|
|
startKey = *(key_t *)list.getLastKey();
|
|
startKey += (uint32_t) 1;
|
|
// watch out for wrap around
|
|
if ( startKey < *(key_t *)list.getLastKey() ) return;
|
|
goto loop;
|
|
}
|
|
|
|
|
|
// . data slot of the hashtable (g_ut) holding spider stats in dumpSpiderdb
// . the hashtable key is the firstip
// . every member is a plain int32_t and instances are zeroed with memset()
//   before being added to the table
struct UStat {
	//
	// counters updated per spider request (see addUStat1)
	//
	int32_t m_numRequests;
	int32_t m_numRequestsWithReplies;
	// hop count 0, split by whether the request is on a www subdomain
	int32_t m_numWWWRoots;
	int32_t m_numNonWWWRoots;
	// hop count 1, 2 and 3-or-more
	int32_t m_numHops1;
	int32_t m_numHops2;
	int32_t m_numHops3orMore;
	// ages are "now - m_addedTime"; 0 doubles as "not set yet"
	int32_t m_ageOfYoungestSpideredRequest;
	int32_t m_ageOfOldestUnspideredRequest;
	int32_t m_ageOfOldestUnspideredWWWRootRequest;

	//
	// counters updated per spider reply (see addUStat2)
	//
	int32_t m_numGoodReplies;
	int32_t m_numErrorReplies;
};
|
|
|
|
// per-firstip spider stats: 4-byte firstip key -> UStat value.
// sized by dumpSpiderdb() when printStats is 1, filled by addUStat1/2().
static HashTableX g_ut;
|
|
|
|
void addUStat1 ( SpiderRequest *sreq, bool hadReply , int32_t now ) {
|
|
int32_t firstIp = sreq->m_firstIp;
|
|
// lookup
|
|
int32_t n = g_ut.getSlot ( &firstIp );
|
|
UStat *us = NULL;
|
|
UStat tmp;
|
|
if ( n < 0 ) {
|
|
us = &tmp;
|
|
memset(us,0,sizeof(UStat));
|
|
g_ut.addKey(&firstIp,us);
|
|
us = (UStat *)g_ut.getValue ( &firstIp );
|
|
}
|
|
else {
|
|
us = (UStat *)g_ut.getValueFromSlot ( n );
|
|
}
|
|
int32_t age = now - sreq->m_addedTime;
|
|
// inc the counts
|
|
us->m_numRequests++;
|
|
if ( hadReply) us->m_numRequestsWithReplies++;
|
|
if ( sreq->m_hopCount == 0 ) {
|
|
if ( sreq->m_isWWWSubdomain ) us->m_numWWWRoots++;
|
|
else us->m_numNonWWWRoots++;
|
|
}
|
|
else if ( sreq->m_hopCount == 1 ) us->m_numHops1++;
|
|
else if ( sreq->m_hopCount == 2 ) us->m_numHops2++;
|
|
else if ( sreq->m_hopCount >= 3 ) us->m_numHops3orMore++;
|
|
if ( hadReply ) {
|
|
if (age < us->m_ageOfYoungestSpideredRequest ||
|
|
us->m_ageOfYoungestSpideredRequest == 0 )
|
|
us->m_ageOfYoungestSpideredRequest = age;
|
|
}
|
|
if ( ! hadReply ) {
|
|
if (age > us->m_ageOfOldestUnspideredRequest ||
|
|
us->m_ageOfOldestUnspideredRequest == 0 )
|
|
us->m_ageOfOldestUnspideredRequest = age;
|
|
}
|
|
if ( ! hadReply && sreq->m_hopCount == 0 && sreq->m_isWWWSubdomain ) {
|
|
if (age > us->m_ageOfOldestUnspideredWWWRootRequest ||
|
|
us->m_ageOfOldestUnspideredWWWRootRequest == 0 )
|
|
us->m_ageOfOldestUnspideredWWWRootRequest = age;
|
|
}
|
|
}
|
|
|
|
void addUStat2 ( SpiderReply *srep , int32_t now ) {
|
|
int32_t firstIp = srep->m_firstIp;
|
|
// lookup
|
|
int32_t n = g_ut.getSlot ( &firstIp );
|
|
UStat *us = NULL;
|
|
UStat tmp;
|
|
if ( n < 0 ) {
|
|
us = &tmp;
|
|
memset(us,0,sizeof(UStat));
|
|
g_ut.addKey(&firstIp,us);
|
|
us = (UStat *)g_ut.getValue ( &firstIp );
|
|
}
|
|
else {
|
|
us = (UStat *)g_ut.getValueFromSlot ( n );
|
|
}
|
|
//int32_t age = now - srep->m_spideredTime;
|
|
// inc the counts
|
|
if ( srep->m_errCode )
|
|
us->m_numErrorReplies++;
|
|
else
|
|
us->m_numGoodReplies++;
|
|
|
|
}
|
|
|
|
|
|
int32_t dumpSpiderdb ( const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, char printStats,
|
|
int32_t firstIp ) {
|
|
if ( startFileNum < 0 ) {
|
|
log(LOG_LOGIC,"db: Start file number is < 0. Must be >= 0.");
|
|
return -1;
|
|
}
|
|
|
|
if ( printStats == 1 ) {
|
|
if ( ! g_ut.set ( 4, sizeof(UStat), 10000000, NULL, 0, 0, false, "utttt") ) {
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
g_spiderdb.init ();
|
|
g_spiderdb.getRdb()->addRdbBase1(coll );
|
|
|
|
key128_t startKey;
|
|
key128_t endKey;
|
|
|
|
// start based on firstip if non-zero
|
|
if ( firstIp ) {
|
|
startKey = g_spiderdb.makeFirstKey ( firstIp );
|
|
endKey = g_spiderdb.makeLastKey ( firstIp );
|
|
} else {
|
|
startKey.setMin();
|
|
endKey.setMax();
|
|
}
|
|
|
|
// turn off threads
|
|
g_jobScheduler.disallow_new_jobs();
|
|
|
|
// get a meg at a time
|
|
int32_t minRecSizes = 1024*1024;
|
|
|
|
Msg5 msg5;
|
|
RdbList list;
|
|
|
|
// clear before calling Msg5
|
|
g_errno = 0;
|
|
|
|
// init stats vars
|
|
int32_t negRecs = 0;
|
|
int32_t emptyRecs = 0;
|
|
int32_t uniqDoms = 0;
|
|
|
|
// count urls per domain in "domTable"
|
|
HashTable domTable;
|
|
domTable.set ( 1024*1024 );
|
|
|
|
// count every uniq domain per ip in ipDomTable (uses dup keys)
|
|
HashTableX ipDomTable;
|
|
|
|
// allow dups? true!
|
|
ipDomTable.set ( 4,4,5000000 , NULL, 0, true ,0, "ipdomtbl");
|
|
|
|
// count how many unique domains per ip
|
|
HashTable ipDomCntTable;
|
|
ipDomCntTable.set ( 1024*1024 );
|
|
|
|
// buffer for holding the domains
|
|
int32_t bufSize = 1024*1024;
|
|
char *buf = (char *)mmalloc(bufSize,"spiderstats");
|
|
int32_t bufOff = 0;
|
|
int32_t count = 0;
|
|
int32_t countReplies = 0;
|
|
int32_t countRequests = 0;
|
|
int64_t offset = 0LL;
|
|
int32_t now;
|
|
static int64_t s_lastRepUh48 = 0LL;
|
|
static int32_t s_lastErrCode = 0;
|
|
static int32_t s_lastErrCount = 0;
|
|
CollectionRec *cr = g_collectiondb.getRec(coll);
|
|
|
|
loop:
|
|
// use msg5 to get the list, should ALWAYS block since no threads
|
|
if ( ! msg5.getList ( RDB_SPIDERDB ,
|
|
cr->m_collnum ,
|
|
&list ,
|
|
(char *)&startKey ,
|
|
(char *)&endKey ,
|
|
minRecSizes ,
|
|
includeTree ,
|
|
0 , // max cache age
|
|
startFileNum ,
|
|
numFiles ,
|
|
NULL , // state
|
|
NULL , // callback
|
|
0 , // niceness
|
|
false , // err correction?
|
|
NULL, // cacheKeyPtr
|
|
0, // retryNum
|
|
-1, // maxRetries
|
|
true, // compensateForMerge
|
|
-1, // syncPoint
|
|
false, // isRealMerge
|
|
true)) // allowPageCache
|
|
{
|
|
log(LOG_LOGIC,"db: getList did not block.");
|
|
return -1;
|
|
}
|
|
// all done if empty
|
|
if ( list.isEmpty() ) goto done;
|
|
|
|
// this may not be in sync with host #0!!!
|
|
now = getTimeLocal();
|
|
|
|
// loop over entries in list
|
|
for ( list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord() ) {
|
|
// print a counter
|
|
if ( ((count++) % 100000) == 0 ) {
|
|
fprintf( stderr, "Processed %" PRId32" records.\n", count - 1 );
|
|
}
|
|
|
|
// get it
|
|
char *srec = list.getCurrentRec();
|
|
|
|
// save it
|
|
int64_t curOff = offset;
|
|
|
|
// and advance
|
|
offset += list.getCurrentRecSize();
|
|
|
|
// must be a request -- for now, for stats
|
|
if ( g_spiderdb.isSpiderReply((key128_t *)srec) ) {
|
|
// print it
|
|
if ( ! printStats ) {
|
|
printf( "offset=%" PRId64" ",curOff);
|
|
g_spiderdb.print ( srec );
|
|
printf("\n");
|
|
}
|
|
|
|
// its a spider reply
|
|
SpiderReply *srep = (SpiderReply *)srec;
|
|
++countReplies;
|
|
|
|
// store it
|
|
s_lastRepUh48 = srep->getUrlHash48();
|
|
s_lastErrCode = srep->m_errCode;
|
|
s_lastErrCount = srep->m_errCount;
|
|
|
|
// get firstip
|
|
if ( printStats == 1 ) {
|
|
addUStat2( srep, now );
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// cast it
|
|
SpiderRequest *sreq = (SpiderRequest *)srec;
|
|
++countRequests;
|
|
|
|
int64_t uh48 = sreq->getUrlHash48();
|
|
// count how many requests had replies and how many did not
|
|
bool hadReply = ( uh48 == s_lastRepUh48 );
|
|
|
|
// get firstip
|
|
if ( printStats == 1 ) {
|
|
addUStat1( sreq, hadReply, now );
|
|
}
|
|
|
|
// print it
|
|
if ( ! printStats ) {
|
|
printf( "offset=%" PRId64" ",curOff);
|
|
g_spiderdb.print ( srec );
|
|
|
|
printf(" requestage=%" PRId32"s",now-sreq->m_addedTime);
|
|
printf(" hadReply=%" PRId32,(int32_t)hadReply);
|
|
|
|
printf(" errcount=%" PRId32,(int32_t)s_lastErrCount);
|
|
|
|
if ( s_lastErrCode ) {
|
|
printf( " errcode=%" PRId32"(%s)", ( int32_t ) s_lastErrCode, mstrerror( s_lastErrCode ) );
|
|
} else {
|
|
printf( " errcode=%" PRId32, ( int32_t ) s_lastErrCode );
|
|
}
|
|
|
|
printf("\n");
|
|
}
|
|
|
|
if ( printStats != 2 ) {
|
|
continue;
|
|
}
|
|
|
|
// skip negatives
|
|
if ( (sreq->m_key.n0 & 0x01) == 0x00 ) {
|
|
++negRecs;
|
|
continue;
|
|
}
|
|
|
|
// skip bogus shit
|
|
if ( sreq->m_firstIp == 0 || sreq->m_firstIp==-1 ) continue;
|
|
|
|
// shortcut
|
|
int32_t domHash = sreq->m_domHash32;
|
|
// . is it in the domain table?
|
|
// . keeps count of how many urls per domain
|
|
int32_t slot = domTable.getSlot ( domHash );
|
|
if ( slot >= 0 ) {
|
|
int32_t off = domTable.getValueFromSlot ( slot );
|
|
// just inc the count for this domain
|
|
*(int32_t *)(buf + off) = *(int32_t *)(buf + off) + 1;
|
|
continue;
|
|
}
|
|
|
|
// get the domain
|
|
int32_t domLen = 0;
|
|
const char *dom = getDomFast ( sreq->m_url , &domLen );
|
|
|
|
// always need enough room...
|
|
if ( bufOff + 4 + domLen + 1 >= bufSize ) {
|
|
int32_t growth = bufSize * 2 - bufSize;
|
|
// limit growth to 10MB each time
|
|
if ( growth > 10*1024*1024 ) growth = 10*1024*1024;
|
|
int32_t newBufSize = bufSize + growth;
|
|
char *newBuf = (char *)mrealloc( buf , bufSize ,
|
|
newBufSize,
|
|
"spiderstats");
|
|
if ( ! newBuf ) return -1;
|
|
// re-assign
|
|
buf = newBuf;
|
|
bufSize = newBufSize;
|
|
}
|
|
|
|
// store the count of urls followed by the domain
|
|
char *ptr = buf + bufOff;
|
|
*(int32_t *)ptr = 1;
|
|
ptr += 4;
|
|
gbmemcpy ( ptr , dom , domLen );
|
|
ptr += domLen;
|
|
*ptr = '\0';
|
|
// use an ip of 1 if it is 0 so it hashes right
|
|
int32_t useip = sreq->m_firstIp; // ip;
|
|
|
|
// this table counts how many urls per domain, as
|
|
// well as stores the domain
|
|
if ( ! domTable.addKey (domHash , bufOff) ) return -1;
|
|
|
|
// . if this is the first time we've seen this domain,
|
|
// add it to the ipDomTable
|
|
// . this hash table must support dups.
|
|
// . we need to print out all the domains for each ip
|
|
if ( ! ipDomTable.addKey ( &useip , &bufOff ) ) return -1;
|
|
|
|
// . this table counts how many unique domains per ip
|
|
// . it is kind of redundant since we have ipDomTable
|
|
int32_t ipCnt = ipDomCntTable.getValue ( useip );
|
|
|
|
if ( ipCnt < 0 ) {
|
|
ipCnt = 0;
|
|
}
|
|
|
|
if ( ! ipDomCntTable.addKey ( useip, ipCnt+1) ) {
|
|
return -1;
|
|
}
|
|
|
|
// advance to next empty spot
|
|
bufOff += 4 + domLen + 1;
|
|
|
|
// count unque domains
|
|
++uniqDoms;
|
|
}
|
|
|
|
startKey = *(key128_t *)list.getLastKey();
|
|
startKey += (uint32_t) 1;
|
|
|
|
// watch out for wrap around
|
|
if ( startKey >= *(key128_t *)list.getLastKey() ) {
|
|
goto loop;
|
|
}
|
|
|
|
done:
|
|
// print out the stats
|
|
if ( ! printStats ) {
|
|
return 0;
|
|
}
|
|
|
|
// print UStats now
|
|
if ( printStats == 1 ) {
|
|
for ( int32_t i = 0 ; i < g_ut.getNumSlots();i++ ) {
|
|
if ( g_ut.m_flags[i] == 0 ) continue;
|
|
UStat *us = (UStat *)g_ut.getValueFromSlot(i);
|
|
int32_t firstIp = *(int32_t *)g_ut.getKeyFromSlot(i);
|
|
fprintf(stdout,"%s ",
|
|
iptoa(firstIp));
|
|
fprintf(stdout,"requests=%" PRId32" ",
|
|
us->m_numRequests);
|
|
fprintf(stdout,"wwwroots=%" PRId32" ",
|
|
us->m_numWWWRoots);
|
|
fprintf(stdout,"nonwwwroots=%" PRId32" ",
|
|
us->m_numNonWWWRoots);
|
|
fprintf(stdout,"1hop=%" PRId32" ",
|
|
us->m_numHops1);
|
|
fprintf(stdout,"2hop=%" PRId32" ",
|
|
us->m_numHops2);
|
|
fprintf(stdout,"3hop+=%" PRId32" ",
|
|
us->m_numHops3orMore);
|
|
fprintf(stdout,"mostrecentspider=%" PRId32"s ",
|
|
us->m_ageOfYoungestSpideredRequest);
|
|
fprintf(stdout,"oldestunspidered=%" PRId32"s ",
|
|
us->m_ageOfOldestUnspideredRequest);
|
|
fprintf(stdout,"oldestunspideredwwwroot=%" PRId32" ",
|
|
us->m_ageOfOldestUnspideredWWWRootRequest);
|
|
fprintf(stdout,"spidered=%" PRId32" ",
|
|
us->m_numRequestsWithReplies);
|
|
fprintf(stdout,"goodspiders=%" PRId32" ",
|
|
us->m_numGoodReplies);
|
|
fprintf(stdout,"errorspiders=%" PRId32,
|
|
us->m_numErrorReplies);
|
|
fprintf(stdout,"\n");
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
|
|
int32_t uniqIps = ipDomCntTable.getNumSlotsUsed();
|
|
|
|
// print out all ips, and # of domains they have and list of their
|
|
// domains
|
|
int32_t nn = ipDomTable.getNumSlots();
|
|
// i is the bucket to start at, must be EMPTY!
|
|
int32_t i = 0;
|
|
// count how many buckets we visit
|
|
int32_t visited = 0;
|
|
// find the empty bucket
|
|
for ( i = 0 ; i < nn ; i++ )
|
|
if ( ipDomTable.m_flags[i] == 0 ) break;
|
|
//if ( ipDomTable.getKey(i) == 0 ) break;
|
|
// now we can do our scan of the ips. there can be dup ips in the
|
|
// table so we must chain for each one we find
|
|
for ( ; visited++ < nn ; i++ ) {
|
|
// wrap it
|
|
if ( i == nn ) i = 0;
|
|
// skip empty buckets
|
|
if ( ipDomTable.m_flags[i] == 0 ) continue;
|
|
// get ip of the ith slot
|
|
int32_t ip = *(int32_t *)ipDomTable.getKeyFromSlot(i);
|
|
// get it in the ip table, if not there, skip it
|
|
int32_t domCount = ipDomCntTable.getValue ( ip ) ;
|
|
if ( domCount == 0 ) continue;
|
|
// log the count
|
|
int32_t useip = ip;
|
|
if ( ip == 1 ) useip = 0;
|
|
fprintf(stderr,"%s has %" PRId32" domains.\n",iptoa(useip),domCount);
|
|
// . how many domains on that ip, print em out
|
|
// . use j for the inner loop
|
|
int32_t j = i;
|
|
// buf for printing ip
|
|
char ipbuf[64];
|
|
sprintf (ipbuf,"%s",iptoa(useip) );
|
|
jloop:
|
|
int32_t ip2 = *(int32_t *)ipDomTable.getKeyFromSlot ( j ) ;
|
|
if ( ip2 == ip ) {
|
|
// get count
|
|
int32_t off = *(int32_t *)ipDomTable.getValueFromSlot ( j );
|
|
char *ptr = buf + off;
|
|
int32_t cnt = *(int32_t *)ptr;
|
|
char *dom = buf + off + 4;
|
|
// print: "IP Domain urlCountInDomain"
|
|
fprintf(stderr,"%s %s %" PRId32"\n",ipbuf,dom,cnt);
|
|
// advance && wrap
|
|
if ( ++j >= nn ) j = 0;
|
|
// keep going
|
|
goto jloop;
|
|
}
|
|
// not an empty bucket, so keep chaining
|
|
if ( ip2 != 0 ) {
|
|
// advance & wrap
|
|
if ( ++j >= nn ) j = 0;
|
|
// keep going
|
|
goto jloop;
|
|
}
|
|
// ok, we are done, do not do this ip any more
|
|
ipDomCntTable.removeKey(ip);
|
|
}
|
|
|
|
if ( negRecs ) {
|
|
fprintf( stderr, "There are %" PRId32" total negative records.\n", negRecs );
|
|
}
|
|
|
|
if ( emptyRecs ) {
|
|
fprintf( stderr, "There are %" PRId32" total negative records.\n", emptyRecs );
|
|
}
|
|
|
|
fprintf(stderr,"There are %" PRId32" total records.\n", count);
|
|
fprintf(stderr,"There are %" PRId32" total request records.\n", countRequests);
|
|
fprintf(stderr,"There are %" PRId32" total replies records.\n", countReplies);
|
|
|
|
// end with total uniq domains
|
|
fprintf(stderr,"There are %" PRId32" unique domains.\n", uniqDoms);
|
|
|
|
// and with total uniq ips in this priority
|
|
fprintf(stderr,"There are %" PRId32" unique IPs.\n", uniqIps);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int keycmp(const void *, const void *);
|
|
int keycmp ( const void *p1 , const void *p2 ) {
|
|
// returns 0 if equal, -1 if p1 < p2, +1 if p1 > p2
|
|
if ( *(key_t *)p1 < *(key_t *)p2 ) return -1;
|
|
if ( *(key_t *)p1 > *(key_t *)p2 ) return 1;
|
|
return 0;
|
|
}
|
|
|
|
// time speed of inserts into RdbTree for indexdb
|
|
bool treetest ( ) {
|
|
int32_t numKeys = 500000;
|
|
log("db: speedtest: generating %" PRId32" random keys.",numKeys);
|
|
// seed randomizer
|
|
srand ( (int32_t)gettimeofdayInMilliseconds() );
|
|
// make list of one million random keys
|
|
key_t *k = (key_t *)mmalloc ( sizeof(key_t) * numKeys , "main" );
|
|
if ( ! k ) {
|
|
log(LOG_WARN, "speedtest: malloc failed");
|
|
return false;
|
|
}
|
|
int32_t *r = (int32_t *)(void*)k;
|
|
int32_t size = 0;
|
|
int32_t first = 0;
|
|
for ( int32_t i = 0 ; i < numKeys * 3 ; i++ ) {
|
|
if ( (i % 3) == 2 && first++ < 50000 ) {
|
|
r[i] = 1234567;
|
|
size++;
|
|
}
|
|
else
|
|
r[i] = rand();
|
|
}
|
|
// init the tree
|
|
RdbTree rt;
|
|
if (!rt.set(0, numKeys + 1000, numKeys * 28, false, "tree-test")) {
|
|
log(LOG_WARN, "speedTest: tree init failed.");
|
|
return false;
|
|
}
|
|
// add to regular tree
|
|
int64_t t = gettimeofdayInMilliseconds();
|
|
for ( int32_t i = 0 ; i < numKeys ; i++ ) {
|
|
//if ( k[i].n1 == 1234567 )
|
|
// fprintf(stderr,"i=%" PRId32"\n",i);
|
|
if ( rt.addNode ( (collnum_t)0 , k[i] , NULL , 0 ) < 0 ) {
|
|
log(LOG_WARN, "speedTest: rdb tree addNode failed");
|
|
return false;
|
|
}
|
|
}
|
|
// print time it took
|
|
int64_t e = gettimeofdayInMilliseconds();
|
|
log("db: added %" PRId32" keys to rdb tree in %" PRId64" ms",numKeys,e - t);
|
|
|
|
// sort the list of keys
|
|
t = gettimeofdayInMilliseconds();
|
|
gbsort ( k , numKeys , sizeof(key_t) , keycmp );
|
|
// print time it took
|
|
e = gettimeofdayInMilliseconds();
|
|
log("db: sorted %" PRId32" in %" PRId64" ms",numKeys,e - t);
|
|
|
|
// get the list
|
|
key_t kk;
|
|
kk.n0 = 0LL;
|
|
kk.n1 = 0;
|
|
kk.n1 = 1234567;
|
|
int32_t n = rt.getNextNode ( (collnum_t)0, (char *)&kk );
|
|
// loop it
|
|
t = gettimeofdayInMilliseconds();
|
|
int32_t count = 0;
|
|
while ( n >= 0 && --first >= 0 ) {
|
|
n = rt.getNextNode ( n );
|
|
count++;
|
|
}
|
|
e = gettimeofdayInMilliseconds();
|
|
log("db: getList for %" PRId32" nodes in %" PRId64" ms",count,e - t);
|
|
return true;
|
|
}
|
|
|
|
|
|
// time speed of inserts into RdbTree for indexdb
|
|
bool hashtest ( ) {
|
|
// load em up
|
|
int32_t numKeys = 1000000;
|
|
log("db: speedtest: generating %" PRId32" random keys.",numKeys);
|
|
// seed randomizer
|
|
srand ( (int32_t)gettimeofdayInMilliseconds() );
|
|
// make list of one million random keys
|
|
key_t *k = (key_t *)mmalloc ( sizeof(key_t) * numKeys , "main" );
|
|
if ( ! k ) {
|
|
log(LOG_WARN, "hashtest: malloc failed");
|
|
return false;
|
|
}
|
|
int32_t *r = (int32_t *)(void*)k;
|
|
for ( int32_t i = 0 ; i < numKeys * 3 ; i++ ) r[i] = rand();
|
|
// init the tree
|
|
//HashTableT<int32_t,int32_t> ht;
|
|
HashTable ht;
|
|
ht.set ( (int32_t)(1.1 * numKeys) );
|
|
// add to regular tree
|
|
int64_t t = gettimeofdayInMilliseconds();
|
|
for ( int32_t i = 0 ; i < numKeys ; i++ )
|
|
if ( ! ht.addKey ( r[i] , 1 ) ) {
|
|
log(LOG_WARN, "hashtest: add key failed.");
|
|
return false;
|
|
}
|
|
// print time it took
|
|
int64_t e = gettimeofdayInMilliseconds();
|
|
// add times
|
|
log("db: added %" PRId32" keys in %" PRId64" ms",numKeys,e - t);
|
|
|
|
// do the delete test
|
|
t = gettimeofdayInMilliseconds();
|
|
for ( int32_t i = 0 ; i < numKeys ; i++ )
|
|
if ( ! ht.removeKey ( r[i] ) ) {
|
|
log(LOG_WARN, "hashtest: add key failed.");
|
|
return false;
|
|
}
|
|
// print time it took
|
|
e = gettimeofdayInMilliseconds();
|
|
// add times
|
|
log("db: deleted %" PRId32" keys in %" PRId64" ms",numKeys,e - t);
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
// time speed of big write, read and the seeks
|
|
bool thrutest ( char *testdir , int64_t fileSize ) {
|
|
|
|
// always block
|
|
g_jobScheduler.disallow_new_jobs();
|
|
|
|
// a read/write buffer of 30M
|
|
int32_t bufSize = 30000000; // 30M
|
|
//int64_t fileSize = 4000000000LL; // 4G
|
|
char *buf = (char *) malloc ( bufSize );
|
|
if ( ! buf ) {
|
|
log(LOG_WARN, "speedtestdisk: %s",strerror(errno));
|
|
return false;
|
|
}
|
|
// store stuff in there
|
|
for ( int32_t i = 0 ; i < bufSize ; i++ ) buf[i] = (char)i;
|
|
|
|
BigFile f;
|
|
// try a read test from speedtest*.dat*
|
|
f.set (testdir,"speedtest");
|
|
if ( f.doesExist() ) {
|
|
if ( ! f.open ( O_RDONLY ) ) {
|
|
log(LOG_WARN, "speedtestdisk: cannot open %s/%s", testdir, "speedtest");
|
|
return false;
|
|
}
|
|
// ensure big enough
|
|
if ( f.getFileSize() < fileSize ) {
|
|
log(LOG_WARN, "speedtestdisk: File %s/%s is too small for requested read size.", testdir, "speedtest");
|
|
return false;
|
|
}
|
|
log("db: reading from speedtest0001.dat");
|
|
f.setBlocking();
|
|
goto doreadtest;
|
|
}
|
|
// try a read test from indexdb*.dat*
|
|
f.set (testdir,"indexdb0001.dat");
|
|
if ( f.doesExist() ) {
|
|
if ( ! f.open ( O_RDONLY ) ) {
|
|
log(LOG_WARN, "speedtestdisk: cannot open %s/%s", testdir, "indexdb0001.dat");
|
|
return false;
|
|
}
|
|
log("db: reading from indexdb0001.dat");
|
|
f.setBlocking();
|
|
goto doreadtest;
|
|
}
|
|
// try a write test to speedtest*.dat*
|
|
f.set (testdir,"speedtest");
|
|
if ( ! f.doesExist() ) {
|
|
if ( ! f.open ( O_RDWR | O_CREAT | O_SYNC ) ) {
|
|
log(LOG_WARN, "speedtestdisk: cannot open %s/%s", testdir, "speedtest");
|
|
return false;
|
|
}
|
|
log("db: writing to speedtest0001.dat");
|
|
f.setBlocking();
|
|
}
|
|
|
|
// write 2 gigs to the file, 1M at a time
|
|
{
|
|
int64_t t1 = gettimeofdayInMilliseconds();
|
|
int32_t numLoops = fileSize / bufSize;
|
|
int64_t off = 0LL;
|
|
int32_t next = 0;
|
|
for ( int32_t i = 0 ; i < numLoops ; i++ ) {
|
|
f.write ( buf , bufSize , off );
|
|
sync(); // f.flush ( );
|
|
off += bufSize ;
|
|
next += bufSize;
|
|
//if ( i >= numLoops || next < 100000000 ) continue;
|
|
if ( i + 1 < numLoops && next < 100000000 ) continue;
|
|
next = 0;
|
|
// print speed every X seconds
|
|
int64_t t2 = gettimeofdayInMilliseconds();
|
|
float mBps = (float)off / (float)(t2-t1) / 1000.0 ;
|
|
fprintf(stderr,"wrote %" PRId64" bytes in %" PRId64" ms (%.1f MB/s)\n",
|
|
off,t2-t1,mBps);
|
|
}
|
|
}
|
|
|
|
doreadtest:
|
|
|
|
{
|
|
int64_t t1 = gettimeofdayInMilliseconds();
|
|
int32_t numLoops = fileSize / bufSize;
|
|
int64_t off = 0LL;
|
|
int32_t next = 0;
|
|
for ( int32_t i = 0 ; i < numLoops ; i++ ) {
|
|
f.read ( buf , bufSize , off );
|
|
//sync(); // f.flush ( );
|
|
off += bufSize ;
|
|
next += bufSize;
|
|
//if ( i >= numLoops || next < 100000000 ) continue;
|
|
if ( i + 1 < numLoops && next < 100000000 ) continue;
|
|
next = 0;
|
|
// print speed every X seconds
|
|
int64_t t2 = gettimeofdayInMilliseconds();
|
|
float mBps = (float)off / (float)(t2-t1) / 1000.0 ;
|
|
fprintf(stderr,"read %" PRId64" bytes in %" PRId64" ms (%.1f MB/s)\n",
|
|
off,t2-t1,mBps);
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
//
|
|
// SEEK TEST
|
|
//
|
|
|
|
#include <sys/time.h> // gettimeofday()
|
|
#include <sys/time.h>
|
|
#include <sys/resource.h>
|
|
//#include <pthread.h>
|
|
#include <time.h>
|
|
#include <sys/types.h>
|
|
#include <sys/stat.h>
|
|
#include <fcntl.h>
|
|
|
|
static void startUp ( void *state );
|
|
static int32_t s_count = 0;
|
|
static int64_t s_filesize = 0;
|
|
//static int32_t s_lock = 1;
|
|
//static int s_fd1 ; // , s_fd2;
|
|
static BigFile s_f;
|
|
static int32_t s_numThreads = 0;
|
|
static int64_t s_maxReadSize = 1;
|
|
static int64_t s_startTime = 0;
|
|
//#define MAX_READ_SIZE (2000000)
|
|
#include <sys/types.h>
|
|
#include <sys/wait.h>
|
|
|
|
void seektest ( const char *testdir, int32_t numThreads, int32_t maxReadSize , const char *filename ) {
|
|
|
|
g_loop.init();
|
|
g_jobScheduler.initialize(numThreads,numThreads,numThreads);
|
|
s_numThreads = numThreads;
|
|
s_maxReadSize = maxReadSize;
|
|
if ( s_maxReadSize <= 0 ) s_maxReadSize = 1;
|
|
//if ( s_maxReadSize > MAX_READ_SIZE ) s_maxReadSize = MAX_READ_SIZE;
|
|
|
|
log(LOG_INIT,"admin: dir=%s threads=%" PRId32" maxReadSize=%" PRId32" file=%s\n",
|
|
testdir,(int32_t)s_numThreads, (int32_t)s_maxReadSize , filename );
|
|
|
|
// maybe its a filename in the cwd
|
|
if ( filename ) {
|
|
s_f.set(testdir,filename);
|
|
if ( s_f.doesExist() ) {
|
|
log(LOG_INIT,"admin: reading from %s.",
|
|
s_f.getFilename());
|
|
goto skip;
|
|
}
|
|
log("admin: %s does not exists. Use ./gb thrutest ... "
|
|
"to create speedtest* files.",
|
|
s_f.getFilename());
|
|
return;
|
|
}
|
|
// check other defaults
|
|
s_f.set ( testdir , "speedtest" );
|
|
if ( s_f.doesExist() ) {
|
|
log(LOG_INIT,"admin: reading from speedtest*.dat.");
|
|
goto skip;
|
|
}
|
|
// try a read test from indexdb*.dat*
|
|
s_f.set (testdir,"indexdb0001.dat");
|
|
if ( s_f.doesExist() ) {
|
|
log(LOG_INIT,"admin: reading from indexdb0001.dat.");
|
|
goto skip;
|
|
}
|
|
|
|
log("admin: Neither speedtest* or indexdb0001.dat* "
|
|
"exist. Use ./gb thrutest ... to create speedtest* files.");
|
|
return;
|
|
skip:
|
|
s_f.open ( O_RDONLY );
|
|
s_filesize = s_f.getFileSize();
|
|
log ( LOG_INIT, "admin: file size = %" PRId64".",s_filesize);
|
|
// always block
|
|
//g_jobScheduler.disallow_new_jobs();
|
|
// seed rand
|
|
srand(time(NULL));
|
|
|
|
// set time
|
|
s_startTime = gettimeofdayInMilliseconds();
|
|
|
|
int32_t stksize = 1000000 ;
|
|
int32_t bufsize = stksize * s_numThreads ;
|
|
char *buf = (char *)malloc ( bufsize );
|
|
if ( ! buf ) { log("test: malloc of %" PRId32" failed.",bufsize); return; }
|
|
g_jobScheduler.allow_new_jobs();
|
|
//int pid;
|
|
for ( int32_t i = 0 ; i < s_numThreads ; i++ ) {
|
|
if ( !g_jobScheduler.submit(startUp, NULL, (void *)(PTRTYPE)i, thread_type_unspecified_io, 0)){
|
|
log("test: Thread launch failed."); return; }
|
|
log(LOG_INIT,"test: Launched thread #%" PRId32".",i);
|
|
}
|
|
// sleep til done
|
|
#undef sleep
|
|
while ( 1 == 1 ) sleep(1000);
|
|
#define sleep(a) { g_process.shutdownAbort(true); }
|
|
}
|
|
|
|
|
|
// Use of ThreadEntry parameter is NOT thread safe
|
|
void startUp ( void *state ) {
|
|
int32_t id = (int32_t) (PTRTYPE)state;
|
|
// read buf
|
|
char *buf = (char *) malloc ( s_maxReadSize );
|
|
if ( ! buf ) {
|
|
fprintf(stderr,"MALLOC FAILED in thread\n");
|
|
return;
|
|
}
|
|
// we got ourselves
|
|
// msg
|
|
fprintf(stderr,"id=%" PRId32" launched. Performing 100000 reads.\n",id);
|
|
// now do a stupid loop
|
|
int64_t off , size;
|
|
for ( int32_t i = 0 ; i < 100000 ; i++ ) {
|
|
uint64_t r = rand();
|
|
r <<= 32 ;
|
|
r |= rand();
|
|
off = r % (s_filesize - s_maxReadSize );
|
|
size = s_maxReadSize;
|
|
// time it
|
|
int64_t start = gettimeofdayInMilliseconds();
|
|
s_f.read ( buf , size , off );
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
#undef usleep
|
|
usleep(0);
|
|
#define usleep(a) { g_process.shutdownAbort(true); }
|
|
s_count++;
|
|
float sps = (float)((float)s_count * 1000.0) /
|
|
(float)(now - s_startTime);
|
|
fprintf(stderr,"count=%" PRId32" off=%012" PRId64" size=%" PRId32" time=%" PRId32"ms "
|
|
"(%.2f seeks/sec)\n",
|
|
(int32_t)s_count,
|
|
(int64_t)off,
|
|
(int32_t)size,
|
|
(int32_t)(now - start) ,
|
|
sps );
|
|
}
|
|
}
|
|
|
|
void dumpTagdb( const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, char req, int32_t rdbId,
|
|
const char *siteArg ) {
|
|
g_tagdb.init ();
|
|
|
|
if ( rdbId == RDB_TAGDB ) {
|
|
g_tagdb.getRdb()->addRdbBase1(coll );
|
|
}
|
|
|
|
key128_t startKey ;
|
|
key128_t endKey ;
|
|
startKey.setMin();
|
|
endKey.setMax();
|
|
if ( siteArg ) {
|
|
startKey = g_tagdb.makeStartKey ( siteArg, strlen(siteArg) );
|
|
endKey = g_tagdb.makeEndKey ( siteArg, strlen(siteArg) );
|
|
log("gb: using site %s for start key",siteArg );
|
|
}
|
|
|
|
// turn off threads
|
|
g_jobScheduler.disallow_new_jobs();
|
|
|
|
// get a meg at a time
|
|
int32_t minRecSizes = 1024*1024;
|
|
Msg5 msg5;
|
|
RdbList list;
|
|
|
|
CollectionRec *cr = g_collectiondb.getRec(coll);
|
|
|
|
int64_t hostHash = -1;
|
|
int64_t lastHostHash = -2;
|
|
char *site = NULL;
|
|
char sbuf[1024*2];
|
|
int32_t siteNumInlinks = -1;
|
|
int32_t typeSite = hash64Lower_a("site",4);
|
|
int32_t typeInlinks = hash64Lower_a("sitenuminlinks",14);
|
|
|
|
loop:
|
|
// use msg5 to get the list, should ALWAYS block since no threads
|
|
if ( ! msg5.getList ( (rdbid_t)rdbId,
|
|
cr->m_collnum ,
|
|
&list ,
|
|
(char *)&startKey ,
|
|
(char *)&endKey ,
|
|
minRecSizes ,
|
|
includeTree ,
|
|
0 , // max cache age
|
|
startFileNum ,
|
|
numFiles ,
|
|
NULL , // state
|
|
NULL , // callback
|
|
0 , // niceness
|
|
false , // err correction?
|
|
NULL, // cacheKeyPtr
|
|
0, // retryNum
|
|
-1, // maxRetries
|
|
true, // compensateForMerge
|
|
-1, // syncPoint
|
|
false, // isRealMerge
|
|
true)) // allowPageCache
|
|
{
|
|
log(LOG_LOGIC,"db: getList did not block.");
|
|
return;
|
|
}
|
|
// all done if empty
|
|
if ( list.isEmpty() ) return;
|
|
// loop over entries in list
|
|
for(list.resetListPtr();!list.isExhausted(); list.skipCurrentRecord()){
|
|
char *rec = list.getCurrentRec();
|
|
//key_t k = list.getCurrentKey();
|
|
key128_t k;
|
|
list.getCurrentKey ( &k );
|
|
char *data = list.getCurrentData();
|
|
int32_t size = list.getCurrentDataSize();
|
|
// is it a delete?
|
|
if ( (k.n0 & 0x01) == 0 ) {
|
|
if ( req == 'z' ) continue;
|
|
printf("k.n1=%016" PRIx64" "
|
|
"k.n0=%016" PRIx64" (delete)\n",
|
|
k.n1 , k.n0 | 0x01 ); // fix it!
|
|
continue;
|
|
}
|
|
// point to the data
|
|
char *p = data;
|
|
char *pend = data + size;
|
|
// breach check
|
|
if ( p >= pend ) {
|
|
printf("corrupt tagdb rec k.n0=%" PRIu64,k.n0);
|
|
continue;
|
|
}
|
|
|
|
// parse it up
|
|
Tag *tag = (Tag *)rec;
|
|
|
|
// print the version and site
|
|
char tmpBuf[1024];
|
|
SafeBuf sb(tmpBuf, 1024);
|
|
|
|
bool match = false;
|
|
|
|
hostHash = tag->m_key.n1;
|
|
|
|
if ( hostHash == lastHostHash ) {
|
|
match = true;
|
|
}
|
|
else {
|
|
site = NULL;
|
|
siteNumInlinks = -1;
|
|
}
|
|
|
|
lastHostHash = hostHash;
|
|
|
|
// making sitelist.txt?
|
|
if ( tag->m_type == typeSite && req == 'z' ) {
|
|
site = tag->getTagData();
|
|
// make it null if too many .'s
|
|
if ( site ) {
|
|
char *p = site;
|
|
int count = 0;
|
|
int alpha = 0;
|
|
int colons = 0;
|
|
// foo.bar.baz.com is ok
|
|
for ( ; *p ; p++ ) {
|
|
if ( *p == '.' ) count++;
|
|
if ( *p == ':' ) colons++;
|
|
if ( is_alpha_a(*p) || *p=='-' )
|
|
alpha++;
|
|
}
|
|
if ( count >= 4 )
|
|
site = NULL;
|
|
if ( colons > 1 )
|
|
site = NULL;
|
|
// no ip addresses allowed, need an alpha char
|
|
if ( alpha == 0 )
|
|
site = NULL;
|
|
}
|
|
// ends in :?
|
|
int slen = 0;
|
|
if ( site ) slen = strlen(site);
|
|
if ( site && site[slen-1] == ':' )
|
|
site = NULL;
|
|
// port bug
|
|
if ( site && site[slen-2] == ':' && site[slen-1]=='/')
|
|
site = NULL;
|
|
// remove heavy spammers to save space
|
|
if ( site && strstr(site,"daily-camshow-report") )
|
|
site = NULL;
|
|
if ( site && strstr(site,".livejasminhd.") )
|
|
site = NULL;
|
|
if ( site && strstr(site,".pornlivenews.") )
|
|
site = NULL;
|
|
if ( site && strstr(site,".isapornblog.") )
|
|
site = NULL;
|
|
if ( site && strstr(site,".teen-model-24.") )
|
|
site = NULL;
|
|
if ( site && ! is_ascii2_a ( site, strlen(site) ) ) {
|
|
site = NULL;
|
|
continue;
|
|
}
|
|
if ( match && siteNumInlinks>=0) {
|
|
// if we ask for 1 or 2 we end up with 100M
|
|
// entries, but with 3+ we get 27M
|
|
if ( siteNumInlinks > 2 && site )
|
|
printf("%i %s\n",siteNumInlinks,site);
|
|
siteNumInlinks = -1;
|
|
site = NULL;
|
|
}
|
|
// save it
|
|
if ( site ) strcpy ( sbuf , site );
|
|
continue;
|
|
}
|
|
|
|
if ( tag->m_type == typeInlinks && req == 'z' ) {
|
|
siteNumInlinks = atoi(tag->getTagData());
|
|
if ( match && site ) {
|
|
// if we ask for 1 or 2 we end up with 100M
|
|
// entries, but with 3+ we get 27M
|
|
if ( siteNumInlinks > 2 )
|
|
printf("%i %s\n",siteNumInlinks,sbuf);
|
|
siteNumInlinks = -1;
|
|
site = NULL;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
if ( req == 'z' )
|
|
continue;
|
|
|
|
// print as an add request or just normal
|
|
if ( req == 'A' ) tag->printToBufAsAddRequest ( &sb );
|
|
else tag->printToBuf ( &sb );
|
|
|
|
// dump it
|
|
printf("%s\n",sb.getBufStart());
|
|
|
|
}
|
|
|
|
startKey = *(key128_t *)list.getLastKey();
|
|
startKey += (uint32_t) 1;
|
|
// watch out for wrap around
|
|
if ( startKey < *(key128_t *)list.getLastKey() ){
|
|
printf("\n"); return;}
|
|
goto loop;
|
|
}
|
|
|
|
bool parseTest ( const char *coll, int64_t docId, const char *query ) {
|
|
g_conf.m_maxMem = 2000000000LL; // 2G
|
|
g_titledb.init ();
|
|
g_titledb.getRdb()->addRdbBase1 ( coll );
|
|
log(LOG_INIT, "build: Testing parse speed of html docId %" PRId64".",docId);
|
|
// get a title rec
|
|
g_jobScheduler.disallow_new_jobs();
|
|
RdbList tlist;
|
|
key_t startKey = g_titledb.makeFirstKey ( docId );
|
|
key_t endKey = g_titledb.makeLastKey ( docId );
|
|
// a niceness of 0 tells it to block until it gets results!!
|
|
Msg5 msg5;
|
|
|
|
CollectionRec *cr = g_collectiondb.getRec(coll);
|
|
if ( ! msg5.getList ( RDB_TITLEDB ,
|
|
cr->m_collnum ,
|
|
&tlist ,
|
|
startKey ,
|
|
endKey , // should be maxed!
|
|
9999999 , // min rec sizes
|
|
true , // include tree?
|
|
false , // includeCache
|
|
0 , // startFileNum
|
|
-1 , // m_numFiles
|
|
NULL , // state
|
|
NULL , // callback
|
|
0 , // niceness
|
|
false , // do error correction?
|
|
NULL , // cache key ptr
|
|
0 , // retry num
|
|
-1 , // maxRetries
|
|
true , // compensate for merge
|
|
-1LL, // sync point
|
|
false, // isRealMerge
|
|
true)) { // allowPageCache
|
|
log(LOG_LOGIC, "build: getList did not block.");
|
|
return false;
|
|
}
|
|
|
|
// get the title rec
|
|
if ( tlist.isEmpty() ) {
|
|
log(LOG_WARN, "build: speedtestxml: docId %" PRId64" not found.", docId);
|
|
return false;
|
|
}
|
|
if (!ucInit(g_hostdb.m_dir)) {
|
|
log(LOG_WARN, "Unicode initialization failed!");
|
|
return false;
|
|
}
|
|
|
|
// get raw rec from list
|
|
char *rec = tlist.getCurrentRec();
|
|
int32_t listSize = tlist.getListSize ();
|
|
XmlDoc xd;
|
|
if ( ! xd.set2 ( rec , listSize , coll , NULL , 0 ) ) {
|
|
log(LOG_WARN, "build: speedtestxml: Error setting xml doc.");
|
|
return false;
|
|
}
|
|
log("build: Doc url is %s",xd.ptr_firstUrl);//tr.getUrl()->getUrl());
|
|
log("build: Doc is %" PRId32" bytes long.",xd.size_utf8Content-1);
|
|
log("build: Doc charset is %s",get_charset_str(xd.m_charset));
|
|
|
|
|
|
// time the summary/title generation code
|
|
log("build: Using query %s",query);
|
|
summaryTest1 ( rec , listSize , coll , docId , query );
|
|
|
|
// for a 128k latin1 doc: (access time is probably 15-20ms)
|
|
// 1.18 ms to set title rec (6ms total)
|
|
// 1.58 ms to set Xml
|
|
// 1.71 ms to set Words (~50% from Words::countWords())
|
|
// 0.42 ms to set Pos
|
|
// 0.66 ms to set Bits
|
|
// 0.51 ms to set Scores
|
|
// 0.35 ms to getText()
|
|
|
|
// speed test
|
|
int64_t t = gettimeofdayInMilliseconds();
|
|
for ( int32_t k = 0 ; k < 100 ; k++ )
|
|
xd.set2 (rec, listSize, coll , NULL , 0 );
|
|
int64_t e = gettimeofdayInMilliseconds();
|
|
logf(LOG_DEBUG,"build: Took %.3f ms to set title rec.",
|
|
(float)(e-t)/100.0);
|
|
|
|
// speed test
|
|
t = gettimeofdayInMilliseconds();
|
|
for ( int32_t k = 0 ; k < 100 ; k++ ) {
|
|
char *mm = (char *)mmalloc ( 300*1024 , "ztest");
|
|
mfree ( mm , 300*1024 ,"ztest");
|
|
}
|
|
e = gettimeofdayInMilliseconds();
|
|
logf(LOG_DEBUG,"build: Took %.3f ms to do mallocs.",
|
|
(float)(e-t)/100.0);
|
|
|
|
// get content
|
|
char *content = xd.ptr_utf8Content;//tr.getContent();
|
|
int32_t contentLen = xd.size_utf8Content-1;//tr.getContentLen();
|
|
|
|
// loop parse
|
|
Xml xml;
|
|
t = gettimeofdayInMilliseconds();
|
|
for ( int32_t i = 0 ; i < 100 ; i++ ) {
|
|
if ( !xml.set( content, contentLen, xd.m_version, 0, CT_HTML ) ) {
|
|
log(LOG_WARN, "build: speedtestxml: xml set: %s", mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// print time it took
|
|
e = gettimeofdayInMilliseconds();
|
|
log("build: Xml::set() took %.3f ms to parse docId %" PRId64".",
|
|
(double)(e - t)/100.0,docId);
|
|
double bpms = contentLen/((double)(e-t)/100.0);
|
|
log("build: %.3f bytes/msec", bpms);
|
|
// get per char and per byte speeds
|
|
xml.reset();
|
|
|
|
// loop parse
|
|
t = gettimeofdayInMilliseconds();
|
|
for ( int32_t i = 0 ; i < 100 ; i++ ) {
|
|
if ( !xml.set( content, contentLen, xd.m_version, 0, CT_HTML ) ) {
|
|
log(LOG_WARN, "build: xml(setparents=false): %s", mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// print time it took
|
|
e = gettimeofdayInMilliseconds();
|
|
log("build: Xml::set(setparents=false) took %.3f ms to "
|
|
"parse docId %" PRId64".", (double)(e - t)/100.0,docId);
|
|
|
|
|
|
if (!ucInit(g_hostdb.m_dir)) {
|
|
log("Unicode initialization failed!");
|
|
return 1;
|
|
}
|
|
Words words;
|
|
|
|
t = gettimeofdayInMilliseconds();
|
|
for ( int32_t i = 0 ; i < 100 ; i++ )
|
|
if ( ! words.set ( &xml , true , true ) ) {
|
|
log(LOG_WARN, "build: speedtestxml: words set: %s", mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
// print time it took
|
|
e = gettimeofdayInMilliseconds();
|
|
log("build: Words::set(xml,computeIds=true) took %.3f ms for %" PRId32" words"
|
|
" (precount=%" PRId32") for docId %" PRId64".",
|
|
(double)(e - t)/100.0,words.getNumWords(),words.getPreCount(),docId);
|
|
|
|
|
|
t = gettimeofdayInMilliseconds();
|
|
for ( int32_t i = 0 ; i < 100 ; i++ )
|
|
if ( ! words.set ( &xml , true , false ) ) {
|
|
log(LOG_WARN, "build: speedtestxml: words set: %s", mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
// print time it took
|
|
e = gettimeofdayInMilliseconds();
|
|
log("build: Words::set(xml,computeIds=false) "
|
|
"took %.3f ms for %" PRId32" words"
|
|
" (precount=%" PRId32") for docId %" PRId64".",
|
|
(double)(e - t)/100.0,words.getNumWords(),words.getPreCount(),docId);
|
|
|
|
|
|
t = gettimeofdayInMilliseconds();
|
|
for ( int32_t i = 0 ; i < 100 ; i++ )
|
|
//if ( ! words.set ( &xml , true , true ) )
|
|
if ( ! words.set ( content , true, 0 ) ) {
|
|
log(LOG_WARN, "build: speedtestxml: words set: %s", mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
// print time it took
|
|
e = gettimeofdayInMilliseconds();
|
|
log("build: Words::set(content,computeIds=true) "
|
|
"took %.3f ms for %" PRId32" words "
|
|
"for docId %" PRId64".",
|
|
(double)(e - t)/100.0,words.getNumWords(),docId);
|
|
|
|
|
|
Pos pos;
|
|
// computeWordIds from xml
|
|
words.set ( &xml , true , true ) ;
|
|
t = gettimeofdayInMilliseconds();
|
|
for ( int32_t i = 0 ; i < 100 ; i++ )
|
|
//if ( ! words.set ( &xml , true , true ) )
|
|
if ( ! pos.set ( &words ) ) {
|
|
log(LOG_WARN, "build: speedtestxml: pos set: %s", mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
// print time it took
|
|
e = gettimeofdayInMilliseconds();
|
|
log("build: Pos::set() "
|
|
"took %.3f ms for %" PRId32" words "
|
|
"for docId %" PRId64".",
|
|
(double)(e - t)/100.0,words.getNumWords(),docId);
|
|
|
|
|
|
Bits bits;
|
|
// computeWordIds from xml
|
|
words.set ( &xml , true , true ) ;
|
|
t = gettimeofdayInMilliseconds();
|
|
for ( int32_t i = 0 ; i < 100 ; i++ )
|
|
//if ( ! words.set ( &xml , true , true ) )
|
|
if ( ! bits.setForSummary ( &words ) ) {
|
|
log(LOG_WARN, "build: speedtestxml: Bits set: %s", mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
// print time it took
|
|
e = gettimeofdayInMilliseconds();
|
|
log("build: Bits::setForSummary() "
|
|
"took %.3f ms for %" PRId32" words "
|
|
"for docId %" PRId64".",
|
|
(double)(e - t)/100.0,words.getNumWords(),docId);
|
|
|
|
|
|
Sections sections;
|
|
// computeWordIds from xml
|
|
words.set ( &xml , true , true ) ;
|
|
bits.set(&words, 0);
|
|
t = gettimeofdayInMilliseconds();
|
|
for ( int32_t i = 0 ; i < 100 ; i++ )
|
|
//if ( ! words.set ( &xml , true , true ) )
|
|
// do not supply xd so it will be set from scratch
|
|
if ( !sections.set( &words, &bits, NULL, NULL, 0, 0 ) ) {
|
|
log(LOG_WARN, "build: speedtestxml: sections set: %s", mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
|
|
// print time it took
|
|
e = gettimeofdayInMilliseconds();
|
|
log("build: Scores::set() "
|
|
"took %.3f ms for %" PRId32" words "
|
|
"for docId %" PRId64".",
|
|
(double)(e - t)/100.0,words.getNumWords(),docId);
|
|
|
|
|
|
|
|
//Phrases phrases;
|
|
Phrases phrases;
|
|
t = gettimeofdayInMilliseconds();
|
|
for ( int32_t i = 0 ; i < 100 ; i++ )
|
|
if ( !phrases.set( &words, &bits, 0 ) ) {
|
|
log(LOG_WARN, "build: speedtestxml: Phrases set: %s", mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
// print time it took
|
|
e = gettimeofdayInMilliseconds();
|
|
log("build: Phrases::set() "
|
|
"took %.3f ms for %" PRId32" words "
|
|
"for docId %" PRId64".",
|
|
(double)(e - t)/100.0,words.getNumWords(),docId);
|
|
|
|
char *buf = (char *)mmalloc(contentLen*2+1,"main");
|
|
t = gettimeofdayInMilliseconds();
|
|
for ( int32_t i = 0 ; i < 100 ; i++ )
|
|
if ( !xml.getText( buf, contentLen * 2 + 1, 0, 9999999, true ) ) {
|
|
log(LOG_WARN, "build: speedtestxml: getText: %s", mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
// print time it took
|
|
e = gettimeofdayInMilliseconds();
|
|
log("build: Xml::getText(computeIds=false) took %.3f ms for docId "
|
|
"%" PRId64".",(double)(e - t)/100.0,docId);
|
|
|
|
|
|
|
|
t = gettimeofdayInMilliseconds();
|
|
for ( int32_t i = 0 ; i < 100 ; i++ ) {
|
|
int32_t bufLen = xml.getText( buf, contentLen * 2 + 1, 0, 9999999, true );
|
|
if ( ! bufLen ) {
|
|
log(LOG_WARN, "build: speedtestxml: getText: %s", mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
if ( ! words.set ( buf,true,0) ) {
|
|
log(LOG_WARN, "build: speedtestxml: words set: %s", mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// print time it took
|
|
e = gettimeofdayInMilliseconds();
|
|
log("build: Xml::getText(computeIds=false) w/ word::set() "
|
|
"took %.3f ms for docId "
|
|
"%" PRId64".",(double)(e - t)/100.0,docId);
|
|
|
|
|
|
|
|
Matches matches;
|
|
Query q;
|
|
q.set2 ( query , langUnknown , false );
|
|
matches.setQuery ( &q );
|
|
words.set ( &xml , true , 0 ) ;
|
|
t = gettimeofdayInMilliseconds();
|
|
for ( int32_t i = 0 ; i < 100 ; i++ ) {
|
|
matches.reset();
|
|
if ( ! matches.addMatches ( &words ) ) {
|
|
log(LOG_WARN, "build: speedtestxml: matches set: %s", mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
}
|
|
// print time it took
|
|
e = gettimeofdayInMilliseconds();
|
|
log("build: Matches::set() took %.3f ms for %" PRId32" words"
|
|
" (precount=%" PRId32") for docId %" PRId64".",
|
|
(double)(e - t)/100.0,words.getNumWords(),words.getPreCount(),docId);
|
|
|
|
|
|
|
|
return true;
|
|
}
|
|
|
|
bool summaryTest1 ( char *rec, int32_t listSize, const char *coll, int64_t docId, const char *query ) {
|
|
|
|
// start the timer
|
|
int64_t t = gettimeofdayInMilliseconds();
|
|
|
|
Query q;
|
|
q.set2 ( query , langUnknown , false );
|
|
|
|
char *content ;
|
|
int32_t contentLen ;
|
|
|
|
// loop parse
|
|
for ( int32_t i = 0 ; i < 100 ; i++ ) {
|
|
XmlDoc xd;
|
|
xd.set2 (rec, listSize, coll,NULL,0);
|
|
// get content
|
|
content = xd.ptr_utf8Content;//tr.getContent();
|
|
contentLen = xd.size_utf8Content-1;//tr.getContentLen();
|
|
|
|
// now parse into xhtml (takes 15ms on lenny)
|
|
Xml xml;
|
|
xml.set( content, contentLen, xd.m_version, 0, CT_HTML );
|
|
|
|
xd.getSummary();
|
|
}
|
|
|
|
// print time it took
|
|
int64_t e = gettimeofdayInMilliseconds();
|
|
log("build: V1 Summary/Title/Gigabits generation took %.3f ms for docId "
|
|
"%" PRId64".",
|
|
(double)(e - t)/100.0,docId);
|
|
double bpms = contentLen/((double)(e-t)/100.0);
|
|
log("build: %.3f bytes/msec", bpms);
|
|
return true;
|
|
}
|
|
|
|
void dumpPosdb (const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree,
|
|
int64_t termId , bool justVerify ) {
|
|
if ( ! justVerify ) {
|
|
g_posdb.init ();
|
|
g_posdb.getRdb()->addRdbBase1(coll );
|
|
}
|
|
|
|
key144_t startKey ;
|
|
key144_t endKey ;
|
|
startKey.setMin();
|
|
endKey.setMax();
|
|
if ( termId >= 0 ) {
|
|
g_posdb.makeStartKey ( &startKey, termId );
|
|
g_posdb.makeEndKey ( &endKey, termId );
|
|
printf("termid=%" PRIu64"\n",termId);
|
|
printf("startkey=%s\n",KEYSTR(&startKey,sizeof(POSDBKEY)));
|
|
printf("endkey=%s\n",KEYSTR(&endKey,sizeof(POSDBKEY)));
|
|
}
|
|
// turn off threads
|
|
g_jobScheduler.disallow_new_jobs();
|
|
// get a meg at a time
|
|
int32_t minRecSizes = 1024*1024;
|
|
|
|
// bail if not
|
|
if ( g_posdb.getRdb()->getNumFiles() <= startFileNum && numFiles > 0 ) {
|
|
printf("Request file #%" PRId32" but there are only %" PRId32" "
|
|
"posdb files\n",startFileNum,
|
|
g_posdb.getRdb()->getNumFiles());
|
|
return;
|
|
}
|
|
|
|
key144_t lastKey;
|
|
lastKey.setMin();
|
|
|
|
Msg5 msg5;
|
|
RdbList list;
|
|
|
|
// set this flag so Msg5.cpp if it does error correction does not
|
|
// try to get the list from a twin...
|
|
g_isDumpingRdbFromMain = 1;
|
|
CollectionRec *cr = g_collectiondb.getRec(coll);
|
|
|
|
loop:
|
|
// use msg5 to get the list, should ALWAYS block since no threads
|
|
if ( ! msg5.getList ( RDB_POSDB ,
|
|
cr->m_collnum ,
|
|
&list ,
|
|
&startKey ,
|
|
&endKey ,
|
|
minRecSizes ,
|
|
includeTree ,
|
|
0 , // max cache age
|
|
startFileNum ,
|
|
numFiles ,
|
|
NULL , // state
|
|
NULL , // callback
|
|
0 , // niceness
|
|
true, // to debug RdbList::removeBadData_r()
|
|
NULL, // cacheKeyPtr
|
|
0, // retryNum
|
|
-1, // maxRetries
|
|
true, // compensateForMerge
|
|
-1, // syncPoint
|
|
false, // isRealMerge
|
|
true)) // allowPageCache
|
|
{
|
|
//false )){// err correction?
|
|
log(LOG_LOGIC,"db: getList did not block.");
|
|
return;
|
|
}
|
|
// all done if empty
|
|
if ( list.isEmpty() ) return;
|
|
|
|
// get last key in list
|
|
char *ek2 = list.m_endKey;
|
|
// print it
|
|
printf("ek=%s\n",KEYSTR(ek2,list.m_ks) );
|
|
|
|
// loop over entries in list
|
|
for ( list.resetListPtr() ; ! list.isExhausted() && ! justVerify ;
|
|
list.skipCurrentRecord() ) {
|
|
key144_t k; list.getCurrentKey(&k);
|
|
// compare to last
|
|
const char *err = "";
|
|
if ( KEYCMP((char *)&k,(char *)&lastKey,sizeof(key144_t))<0 )
|
|
err = " (out of order)";
|
|
lastKey = k;
|
|
// is it a delete?
|
|
const char *dd = "";
|
|
if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)";
|
|
int64_t d = g_posdb.getDocId(&k);
|
|
uint8_t dh = g_titledb.getDomHash8FromDocId(d);
|
|
char *rec = list.m_listPtr;
|
|
int32_t recSize = 18;
|
|
if ( rec[0] & 0x04 ) recSize = 6;
|
|
else if ( rec[0] & 0x02 ) recSize = 12;
|
|
// alignment bits check
|
|
if ( recSize == 6 && !(rec[1] & 0x02) ) {
|
|
int64_t nd1 = g_posdb.getDocId(rec+6);
|
|
err = " (alignerror1)";
|
|
if ( nd1 < d ) err = " (alignordererror1)";
|
|
//g_process.shutdownAbort(true);
|
|
}
|
|
if ( recSize == 12 && !(rec[1] & 0x02) ) {
|
|
// seems like nd2 is it, so it really is 12 bytes but
|
|
// does not have the alignment bit set...
|
|
int64_t nd2 = g_posdb.getDocId(rec+12);
|
|
err = " (alignerror2)";
|
|
if ( nd2 < d ) err = " (alignorderrror2)";
|
|
}
|
|
// if it
|
|
if ( recSize == 12 && (rec[7] & 0x02)) {
|
|
// seems like nd2 is it, so it really is 12 bytes but
|
|
// does not have the alignment bit set...
|
|
int64_t nd2 = g_posdb.getDocId(rec+12);
|
|
err = " (alignerror3)";
|
|
if ( nd2 < d ) err = " (alignordererror3)";
|
|
}
|
|
if ( KEYCMP((char *)&k,(char *)&startKey,list.m_ks)<0 ||
|
|
KEYCMP((char *)&k,ek2,list.m_ks)>0){
|
|
err = " (out of range)";
|
|
}
|
|
|
|
if ( termId < 0 )
|
|
printf(
|
|
"k=%s "
|
|
"tid=%015" PRIu64" "
|
|
"docId=%012" PRId64" "
|
|
|
|
"siterank=%02" PRId32" "
|
|
"langid=%02" PRId32" "
|
|
"pos=%06" PRId32" "
|
|
"hgrp=%02" PRId32" "
|
|
"spamrank=%02" PRId32" "
|
|
"divrank=%02" PRId32" "
|
|
"syn=%01" PRId32" "
|
|
"densrank=%02" PRId32" "
|
|
"mult=%02" PRId32" "
|
|
|
|
"dh=0x%02" PRIx32" "
|
|
"rs=%" PRId32 //recSize
|
|
"%s" // dd
|
|
"%s" // err
|
|
"\n" ,
|
|
KEYSTR(&k,sizeof(key144_t)),
|
|
(int64_t)g_posdb.getTermId(&k),
|
|
d ,
|
|
(int32_t)g_posdb.getSiteRank(&k),
|
|
(int32_t)g_posdb.getLangId(&k),
|
|
(int32_t)g_posdb.getWordPos(&k),
|
|
(int32_t)g_posdb.getHashGroup(&k),
|
|
(int32_t)g_posdb.getWordSpamRank(&k),
|
|
(int32_t)g_posdb.getDiversityRank(&k),
|
|
(int32_t)g_posdb.getIsSynonym(&k),
|
|
(int32_t)g_posdb.getDensityRank(&k),
|
|
(int32_t)g_posdb.getMultiplier(&k),
|
|
|
|
(int32_t)dh,
|
|
recSize,
|
|
dd ,
|
|
err );
|
|
else
|
|
printf(
|
|
"k=%s "
|
|
"tid=%015" PRIu64" "
|
|
"docId=%012" PRId64" "
|
|
"siterank=%02" PRId32" "
|
|
"langid=%02" PRId32" "
|
|
"pos=%06" PRId32" "
|
|
"hgrp=%02" PRId32" "
|
|
"spamrank=%02" PRId32" "
|
|
"divrank=%02" PRId32" "
|
|
"syn=%01" PRId32" "
|
|
"densrank=%02" PRId32" "
|
|
"mult=%02" PRId32" "
|
|
"recSize=%" PRId32" "
|
|
"dh=0x%02" PRIx32"%s%s\n" ,
|
|
KEYSTR(&k,sizeof(key144_t)),
|
|
(int64_t)g_posdb.getTermId(&k),
|
|
d ,
|
|
(int32_t)g_posdb.getSiteRank(&k),
|
|
(int32_t)g_posdb.getLangId(&k),
|
|
(int32_t)g_posdb.getWordPos(&k),
|
|
(int32_t)g_posdb.getHashGroup(&k),
|
|
(int32_t)g_posdb.getWordSpamRank(&k),
|
|
(int32_t)g_posdb.getDiversityRank(&k),
|
|
(int32_t)g_posdb.getIsSynonym(&k),
|
|
(int32_t)g_posdb.getDensityRank(&k),
|
|
(int32_t)g_posdb.getMultiplier(&k),
|
|
recSize,
|
|
|
|
(int32_t)dh,
|
|
dd ,
|
|
err );
|
|
continue;
|
|
}
|
|
|
|
startKey = *(key144_t *)list.getLastKey();
|
|
startKey += (uint32_t) 1;
|
|
// watch out for wrap around
|
|
if ( startKey < *(key144_t *)list.getLastKey() ) return;
|
|
goto loop;
|
|
}
|
|
|
|
void dumpClusterdb ( const char *coll,
|
|
int32_t startFileNum,
|
|
int32_t numFiles,
|
|
bool includeTree ) {
|
|
g_clusterdb.init ();
|
|
g_clusterdb.getRdb()->addRdbBase1(coll );
|
|
key_t startKey ;
|
|
key_t endKey ;
|
|
startKey.setMin();
|
|
endKey.setMax();
|
|
// turn off threads
|
|
g_jobScheduler.disallow_new_jobs();
|
|
// get a meg at a time
|
|
int32_t minRecSizes = 1024*1024;
|
|
|
|
// bail if not
|
|
if ( g_clusterdb.getRdb()->getNumFiles() <= startFileNum ) {
|
|
printf("Request file #%" PRId32" but there are only %" PRId32" "
|
|
"clusterdb files\n",startFileNum,
|
|
g_clusterdb.getRdb()->getNumFiles());
|
|
return;
|
|
}
|
|
|
|
Msg5 msg5;
|
|
RdbList list;
|
|
CollectionRec *cr = g_collectiondb.getRec(coll);
|
|
loop:
|
|
// use msg5 to get the list, should ALWAYS block since no threads
|
|
if ( ! msg5.getList ( RDB_CLUSTERDB ,
|
|
cr->m_collnum ,
|
|
&list ,
|
|
startKey ,
|
|
endKey ,
|
|
minRecSizes ,
|
|
includeTree ,
|
|
0 , // max cache age
|
|
startFileNum ,
|
|
numFiles ,
|
|
NULL , // state
|
|
NULL , // callback
|
|
0 , // niceness
|
|
false , // err correction?
|
|
NULL, // cacheKeyPtr
|
|
0, // retryNum
|
|
-1, // maxRetries
|
|
true, // compensateForMerge
|
|
-1, // syncPoint
|
|
false, // isRealMerge
|
|
true)) // allowPageCache
|
|
{
|
|
log(LOG_LOGIC,"db: getList did not block.");
|
|
return;
|
|
}
|
|
// all done if empty
|
|
if ( list.isEmpty() )
|
|
return;
|
|
// loop over entries in list
|
|
char strLanguage[256];
|
|
for ( list.resetListPtr() ; ! list.isExhausted() ;
|
|
list.skipCurrentRecord() ) {
|
|
key_t k = list.getCurrentKey();
|
|
// is it a delete?
|
|
const char *dd = "";
|
|
if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)";
|
|
// get the language string
|
|
languageToString ( g_clusterdb.getLanguage((char*)&k),
|
|
strLanguage );
|
|
//uint32_t gid = getGroupId ( RDB_CLUSTERDB , &k );
|
|
uint32_t shardNum = getShardNum( RDB_CLUSTERDB , &k );
|
|
Host *grp = g_hostdb.getShard ( shardNum );
|
|
Host *hh = &grp[0];
|
|
// print it
|
|
printf("k.n1=%08" PRIx32" k.n0=%016" PRIx64" "
|
|
"docId=%012" PRId64" family=%" PRIu32" "
|
|
"language=%" PRId32" (%s) siteHash26=%" PRIu32"%s "
|
|
"groupNum=%" PRIu32" "
|
|
"shardNum=%" PRIu32"\n",
|
|
k.n1, k.n0,
|
|
g_clusterdb.getDocId((char*)&k) ,
|
|
g_clusterdb.hasAdultContent((char*)&k) ,
|
|
(int32_t)g_clusterdb.getLanguage((char*)&k),
|
|
strLanguage,
|
|
g_clusterdb.getSiteHash26((char*)&k) ,
|
|
dd ,
|
|
hh->m_hostId ,
|
|
shardNum);
|
|
continue;
|
|
}
|
|
|
|
startKey = *(key_t *)list.getLastKey();
|
|
startKey += (uint32_t) 1;
|
|
// watch out for wrap around
|
|
if ( startKey < *(key_t *)list.getLastKey() )
|
|
return;
|
|
goto loop;
|
|
}
|
|
|
|
void dumpLinkdb ( const char *coll,
|
|
int32_t startFileNum,
|
|
int32_t numFiles,
|
|
bool includeTree ,
|
|
const char *url ) {
|
|
g_linkdb.init ();
|
|
g_linkdb.getRdb()->addRdbBase1(coll );
|
|
key224_t startKey ;
|
|
key224_t endKey ;
|
|
startKey.setMin();
|
|
endKey.setMax();
|
|
// set to docid
|
|
if ( url ) {
|
|
Url u;
|
|
u.set( url, strlen( url ), true, false );
|
|
uint32_t h32 = u.getHostHash32();//g_linkdb.getUrlHash(&u)
|
|
int64_t uh64 = hash64n(url,0);
|
|
startKey = g_linkdb.makeStartKey_uk ( h32 , uh64 );
|
|
endKey = g_linkdb.makeEndKey_uk ( h32 , uh64 );
|
|
}
|
|
// turn off threads
|
|
g_jobScheduler.disallow_new_jobs();
|
|
// get a meg at a time
|
|
int32_t minRecSizes = 1024*1024;
|
|
|
|
// bail if not
|
|
if ( g_linkdb.getRdb()->getNumFiles() <= startFileNum && !includeTree) {
|
|
printf("Request file #%" PRId32" but there are only %" PRId32" "
|
|
"linkdb files\n",startFileNum,
|
|
g_linkdb.getRdb()->getNumFiles());
|
|
return;
|
|
}
|
|
|
|
Msg5 msg5;
|
|
RdbList list;
|
|
CollectionRec *cr = g_collectiondb.getRec(coll);
|
|
|
|
loop:
|
|
// use msg5 to get the list, should ALWAYS block since no threads
|
|
if ( ! msg5.getList ( RDB_LINKDB ,
|
|
cr->m_collnum ,
|
|
&list ,
|
|
(char *)&startKey ,
|
|
(char *)&endKey ,
|
|
minRecSizes ,
|
|
includeTree ,
|
|
0 , // max cache age
|
|
startFileNum ,
|
|
numFiles ,
|
|
NULL , // state
|
|
NULL , // callback
|
|
0 , // niceness
|
|
false , // err correction?
|
|
NULL, // cacheKeyPtr
|
|
0, // retryNum
|
|
-1, // maxRetries
|
|
true, // compensateForMerge
|
|
-1, // syncPoint
|
|
false, // isRealMerge
|
|
true)) // allowPageCache
|
|
{
|
|
log(LOG_LOGIC,"db: getList did not block.");
|
|
return;
|
|
}
|
|
// all done if empty
|
|
if ( list.isEmpty() ) return;
|
|
// loop over entries in list
|
|
for ( list.resetListPtr() ; ! list.isExhausted() ;
|
|
list.skipCurrentRecord() ) {
|
|
key224_t k;
|
|
list.getCurrentKey((char *) &k);
|
|
// is it a delete?
|
|
const char *dd = "";
|
|
if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)";
|
|
int64_t docId = (int64_t)g_linkdb.getLinkerDocId_uk(&k);
|
|
int32_t shardNum = getShardNum(RDB_LINKDB,&k);
|
|
printf("k=%s "
|
|
"linkeesitehash32=0x%08" PRIx32" "
|
|
"linkeeurlhash=0x%012" PRIx64" "
|
|
"linkspam=%" PRId32" "
|
|
"siterank=%02" PRId32" "
|
|
//"hopcount=%03hhu "
|
|
"ip32=%s "
|
|
"docId=%012" PRIu64" "
|
|
"discovered=%" PRIu32" "
|
|
"lost=%" PRIu32" "
|
|
"sitehash32=0x%08" PRIx32" "
|
|
"shardNum=%" PRIu32" "
|
|
"%s\n",
|
|
KEYSTR(&k,sizeof(key224_t)),
|
|
(int32_t)g_linkdb.getLinkeeSiteHash32_uk(&k),
|
|
(int64_t)g_linkdb.getLinkeeUrlHash64_uk(&k),
|
|
(int32_t)g_linkdb.isLinkSpam_uk(&k),
|
|
(int32_t)g_linkdb.getLinkerSiteRank_uk(&k),
|
|
//hc,//g_linkdb.getLinkerHopCount_uk(&k),
|
|
iptoa((int32_t)g_linkdb.getLinkerIp_uk(&k)),
|
|
docId,
|
|
(int32_t)g_linkdb.getDiscoveryDate_uk(&k),
|
|
(int32_t)g_linkdb.getLostDate_uk(&k),
|
|
(int32_t)g_linkdb.getLinkerSiteHash32_uk(&k),
|
|
shardNum,
|
|
dd );
|
|
}
|
|
|
|
startKey = *(key224_t *)list.getLastKey();
|
|
startKey += (uint32_t) 1;
|
|
// watch out for wrap around
|
|
if ( startKey < *(key224_t *)list.getLastKey() ) return;
|
|
goto loop;
|
|
}
|
|
|
|
|
|
bool pingTest ( int32_t hid , uint16_t clientPort ) {
|
|
Host *h = g_hostdb.getHost ( hid );
|
|
if ( ! h ) {
|
|
log(LOG_WARN, "net: pingtest: hostId %" PRId32" is invalid.",hid);
|
|
return false;
|
|
}
|
|
// set up our socket
|
|
int sock = socket ( AF_INET, SOCK_DGRAM , 0 );
|
|
if ( sock < 0 ) {
|
|
log(LOG_WARN, "net: pingtest: socket: %s.", strerror(errno));
|
|
return false;
|
|
}
|
|
|
|
// sockaddr_in provides interface to sockaddr
|
|
struct sockaddr_in name;
|
|
// reset it all just to be safe
|
|
memset((char *)&name, 0,sizeof(name));
|
|
name.sin_family = AF_INET;
|
|
name.sin_addr.s_addr = INADDR_ANY;
|
|
name.sin_port = htons(clientPort);
|
|
// we want to re-use port it if we need to restart
|
|
int options = 1;
|
|
if ( setsockopt(sock, SOL_SOCKET, SO_REUSEADDR , &options,sizeof(options)) < 0 ) {
|
|
log(LOG_WARN, "net: pingtest: setsockopt: %s.", strerror(errno));
|
|
return false;
|
|
}
|
|
// bind this name to the socket
|
|
if ( bind ( sock, (struct sockaddr *)(void*)&name, sizeof(name)) < 0) {
|
|
close ( sock );
|
|
log(LOG_WARN, "net: pingtest: Bind on port %hu: %s.", clientPort,strerror(errno));
|
|
return false;
|
|
}
|
|
|
|
int fd = sock;
|
|
int flags = fcntl ( fd , F_GETFL ) ;
|
|
if ( flags < 0 ) {
|
|
log(LOG_WARN, "net: pingtest: fcntl(F_GETFL): %s.", strerror(errno));
|
|
return false;
|
|
}
|
|
|
|
char dgram[1450];
|
|
int n;
|
|
struct sockaddr_in to;
|
|
sockaddr_in from;
|
|
socklen_t fromLen;
|
|
|
|
// make the dgram
|
|
UdpProtocol *up = &g_dp; // udpServer2.getProtocol();
|
|
int32_t transId = 500000000 - 1 ;
|
|
int32_t dnum = 0; // dgramNum
|
|
|
|
int32_t sends = 0;
|
|
int32_t lost = 0;
|
|
int32_t recovered = 0;
|
|
int32_t acks = 0;
|
|
int32_t replies = 0;
|
|
|
|
memset(&to,0,sizeof(to));
|
|
to.sin_family = AF_INET;
|
|
to.sin_addr.s_addr = h->m_ip;
|
|
to.sin_port = ntohs(h->m_port);
|
|
log("net: pingtest: Testing hostId #%" PRId32" at %s:%hu from client "
|
|
"port %hu", hid,iptoa(h->m_ip),h->m_port,clientPort);
|
|
// if this is higher than number of avail slots UdpServer.cpp
|
|
// will not be able to free the slots and this will end up sticking,
|
|
// because the slots can only be freed in destroySlot() which
|
|
// is not async safe!
|
|
//int32_t count = 40000; // number of loops
|
|
int32_t count = 1000; // number of loops
|
|
int32_t avg = 0;
|
|
sendLoop:
|
|
if ( count-- <= 0 ) {
|
|
log("net: pingtest: Got %" PRId32" replies out of %" PRId32" sent (%" PRId32" lost)"
|
|
"(%" PRId32" recovered)", replies,sends,lost,recovered);
|
|
log("net: pingtest: Average reply time of %.03f ms.",
|
|
(double)avg/(double)replies);
|
|
return true;
|
|
}
|
|
transId++;
|
|
int32_t msgSize = 3; // indicates a debug ping packet to PingServer.cpp
|
|
up->setHeader ( dgram, msgSize, msg_type_11, dnum, transId, true, false , 0 );
|
|
int32_t size = up->getHeaderSize(0) + msgSize;
|
|
int64_t start = gettimeofdayInMilliseconds();
|
|
n = sendto(sock,dgram,size,0,(struct sockaddr *)(void*)&to,sizeof(to));
|
|
if ( n != size ) {
|
|
log(LOG_WARN, "net: pingtest: sendto returned %i (should have returned %" PRId32")",n,size);
|
|
return false;
|
|
}
|
|
sends++;
|
|
readLoop2:
|
|
// loop until we read something
|
|
n = recvfrom (sock,dgram,DGRAM_SIZE,0,(sockaddr *)(void*)&from, &fromLen);
|
|
if (gettimeofdayInMilliseconds() - start>2000) {lost++; goto sendLoop;}
|
|
if ( n <= 0 ) goto readLoop2; // { sched_yield(); goto readLoop2; }
|
|
// for what transId?
|
|
int32_t tid = up->getTransId ( dgram , n );
|
|
// -1 is error
|
|
if ( tid < 0 ) {
|
|
log(LOG_WARN, "net: pingtest: Bad transId.");
|
|
return false;
|
|
}
|
|
// if no match, it was recovered, keep reading
|
|
if ( tid != transId ) {
|
|
log("net: pingTest: Recovered tid=%" PRId32", current tid=%" PRId32". "
|
|
"Resend?",tid,transId);
|
|
recovered++;
|
|
goto readLoop2;
|
|
}
|
|
// an ack?
|
|
if ( up->isAck ( dgram , n ) ) {
|
|
acks++;
|
|
goto readLoop2;
|
|
}
|
|
// mark the time
|
|
int64_t took = gettimeofdayInMilliseconds()-start;
|
|
if ( took > 1 ) log("net: pingtest: got reply #%" PRId32" (tid=%" PRId32") "
|
|
"in %" PRId64" ms",replies,transId,took);
|
|
// make average
|
|
avg += took;
|
|
// the reply?
|
|
replies++;
|
|
// send back an ack
|
|
size = up->makeAck ( dgram, dnum, transId , true/*weinit?*/ , false );
|
|
n = sendto(sock,dgram,size,0,(struct sockaddr *)(void*)&to,sizeof(to));
|
|
// mark our first read
|
|
goto sendLoop;
|
|
}
|
|
|
|
int injectFileTest ( int32_t reqLen , int32_t hid ) {
|
|
|
|
// make a mime
|
|
char *req = (char *)mmalloc ( reqLen , "injecttest");
|
|
if ( ! req ) {
|
|
log(LOG_WARN, "build: injecttest: malloc(%" PRId32") failed", reqLen);
|
|
return -1;
|
|
}
|
|
char *p = req;
|
|
char *pend = req + reqLen;
|
|
sprintf ( p ,
|
|
"POST /inject HTTP/1.0\r\n"
|
|
"Content-Length: 000000000\r\n" // placeholder
|
|
"Content-Type: text/html\r\n"
|
|
"Connection: Close\r\n"
|
|
"\r\n" );
|
|
p += strlen(p);
|
|
char *content = p;
|
|
sprintf ( p ,
|
|
"u=%" PRIu32".injecttest.com&c=&"
|
|
"deleteurl=0&ip=4.5.6.7&iplookups=0&"
|
|
"dedup=1&rs=7&"
|
|
"quick=1&hasmime=1&ucontent="
|
|
"HTTP 200\r\n"
|
|
"Last-Modified: Sun, 06 Nov 1994 08:49:37 GMT\r\n"
|
|
"Connection: Close\r\n"
|
|
"Content-Type: text/html\r\n"
|
|
"\r\n" ,
|
|
(uint32_t)time(NULL) );
|
|
p += strlen(p);
|
|
// now store random words (just numbers of 8 digits each)
|
|
while ( p + 12 < pend ) {
|
|
int32_t r ; r = rand();
|
|
sprintf ( p , "%010" PRIu32" " , r );
|
|
p += strlen ( p );
|
|
}
|
|
// set content length
|
|
int32_t clen = p - content;
|
|
char *ptr = req ;
|
|
// find start of the 9 zeroes
|
|
while ( *ptr != '0' || ptr[1] !='0' ) ptr++;
|
|
// store length there
|
|
sprintf ( ptr , "%09" PRIu32 , clen );
|
|
// remove the \0
|
|
ptr += strlen(ptr); *ptr = '\r';
|
|
|
|
// what is total request length?
|
|
int32_t rlen = p - req;
|
|
|
|
// generate the filename
|
|
const char *filename = "/tmp/inject-test";
|
|
File f;
|
|
f.set ( filename );
|
|
f.unlink();
|
|
if ( ! f.open ( O_RDWR | O_CREAT ) ) {
|
|
log(LOG_WARN, "build: injecttest: Failed to create file %s for testing", filename);
|
|
return -1;
|
|
}
|
|
|
|
if ( rlen != f.write ( req , rlen , 0 ) ) {
|
|
log(LOG_WARN, "build: injecttest: Failed to write %" PRId32" bytes to %s", rlen, filename);
|
|
return -1;
|
|
}
|
|
f.close();
|
|
|
|
mfree ( req , reqLen , "injecttest" );
|
|
|
|
Host *h = g_hostdb.getHost(hid);
|
|
|
|
char *ips = iptoa(h->m_ip);
|
|
|
|
// now inject the file
|
|
return injectFile ( filename , ips , "main");
|
|
}
|
|
|
|
#define MAX_INJECT_SOCKETS 300
|
|
static void doInject ( int fd , void *state ) ;
|
|
static void doInjectWarc ( int64_t fsize );
|
|
static void doInjectArc ( int64_t fsize );
|
|
static void injectedWrapper ( void *state , TcpSocket *s ) ;
|
|
static TcpServer s_tcp;
|
|
static File s_file;
|
|
static int64_t s_off = 0; // offset into file
|
|
static int32_t s_ip;
|
|
static int16_t s_port;
|
|
static Hostdb s_hosts2;
|
|
static int32_t s_rrn = 0;
|
|
static int32_t s_registered = 1;
|
|
static int32_t s_maxSockets = MAX_INJECT_SOCKETS;
|
|
static int32_t s_outstanding = 0;
|
|
static bool s_isDelete;
|
|
static int32_t s_injectTitledb;
|
|
static int32_t s_injectWarc;
|
|
static int32_t s_injectArc;
|
|
static const char *s_coll = NULL;
|
|
static key_t s_titledbKey;
|
|
static char *s_req [MAX_INJECT_SOCKETS];
|
|
static int64_t s_docId[MAX_INJECT_SOCKETS];
|
|
static char s_init5 = false;
|
|
static int64_t s_endDocId;
|
|
|
|
int injectFile ( const char *filename , char *ips , const char *coll ) {
|
|
// or part of an itemlist.txt-N
|
|
int flen2 = strlen(filename);
|
|
if ( flen2>=14 && strncmp(filename,"itemlist.txt",12)==0 ) {
|
|
// must have -N
|
|
int split = atoi(filename+13);
|
|
log("inject: using part file of itemlist.txt of %i",split);
|
|
// open it
|
|
SafeBuf sb;
|
|
sb.load("./itemlist.txt");
|
|
// scan the lines
|
|
char *p = sb.getBufStart();
|
|
char *pend = p + sb.length();
|
|
int count = 0;
|
|
char *nextLine = NULL;
|
|
for ( ; p && p < pend ; p = nextLine ) {
|
|
nextLine = strstr(p,"\n");
|
|
if ( nextLine ) nextLine++;
|
|
// this is how many hosts we are using!!
|
|
// TODO: make this get from hosts.conf!!!
|
|
if ( count >= 40 ) count = 0;
|
|
if ( count++ != split ) continue;
|
|
// get line
|
|
char *archiveDirName = p;
|
|
if ( nextLine ) nextLine[-1] = '\0';
|
|
// download the archive
|
|
SafeBuf cmd;
|
|
cmd.safePrintf("./ia download "
|
|
//"--format=\"Web ARChive GZ\" "
|
|
"--glob='*arc.gz' "
|
|
"%s"
|
|
,archiveDirName);
|
|
gbsystem(cmd.getBufStart());
|
|
// now inject the warc gz files in there
|
|
Dir dir;
|
|
dir.set ( p );
|
|
dir.open();
|
|
log("setting dir to %s",p);
|
|
subloop:
|
|
const char *xarcFilename = dir.getNextFilename("*arc.gz");
|
|
// get next archive
|
|
if ( ! xarcFilename ) {
|
|
cmd.reset();
|
|
// remove the archive dir when done if
|
|
// no more warc.gz files in it
|
|
cmd.safePrintf("rm -rf %s",archiveDirName);
|
|
gbsystem(cmd.getBufStart());
|
|
// download the next archive using 'ia'
|
|
continue;
|
|
}
|
|
int32_t flen = strlen(xarcFilename);
|
|
const char *ext = xarcFilename + flen -7;
|
|
// gunzip to foo.warc or foo.arc depending!
|
|
const char *es = "";
|
|
if ( ext[0] == 'w' ) es = "w";
|
|
// inject the warc.gz files
|
|
cmd.reset();
|
|
cmd.safePrintf("gunzip -c %s/%s > ./foo%i.%sarc"
|
|
,archiveDirName,xarcFilename,split,es);
|
|
gbsystem(cmd.getBufStart());
|
|
// now inject it
|
|
cmd.reset();
|
|
cmd.safePrintf("./gbi inject ./foo%i.%sarc hosts.conf"
|
|
,split,es);
|
|
gbsystem(cmd.getBufStart());
|
|
goto subloop;
|
|
}
|
|
log("cmd: done injecting archives for split %i",split);
|
|
exit(0);
|
|
}
|
|
|
|
bool isDelete = false;
|
|
int64_t startDocId = 0LL;
|
|
int64_t endDocId = MAX_DOCID;
|
|
|
|
g_conf.m_maxMem = 4000000000LL;
|
|
g_mem.init ( );//4000000000LL );
|
|
|
|
// set up the loop
|
|
if ( ! g_loop.init() ) {
|
|
log(LOG_WARN, "build: inject: Loop init failed.");
|
|
return false;
|
|
}
|
|
// init the tcp server, client side only
|
|
if ( ! s_tcp.init( NULL , // requestHandlerWrapper ,
|
|
getMsgSize,
|
|
NULL , // getMsgPiece ,
|
|
0 , // port, only needed for server ,
|
|
&s_maxSockets ) ) return false;
|
|
|
|
s_tcp.m_doReadRateTimeouts = false;
|
|
|
|
s_isDelete = isDelete;
|
|
|
|
if ( ! s_init5 ) {
|
|
s_init5 = true;
|
|
for ( int32_t i = 0; i < MAX_INJECT_SOCKETS ; i++ )
|
|
s_req[i] = NULL;
|
|
}
|
|
|
|
char *colon = strstr(ips,":");
|
|
int32_t port = 8000;
|
|
if ( colon ) {
|
|
*colon = '\0';
|
|
port = atoi(colon+1);
|
|
}
|
|
int32_t ip = 0;
|
|
// is ip field a hosts.conf instead? that means to round robin.
|
|
if ( strstr(ips,".conf") ) {
|
|
if ( ! s_hosts2.init ( -1 ) ) { // ips , 0 ) ) {
|
|
fprintf(stderr,"failed to load %s",ips);
|
|
exit(0);
|
|
}
|
|
s_ip = 0;
|
|
s_port = 0;
|
|
}
|
|
else {
|
|
ip = atoip(ips,strlen(ips));
|
|
if ( ip == 0 || ip == -1 ) {
|
|
log("provided ip \"%s\" is a bad ip. "
|
|
"exiting\n",ips);
|
|
exit(0);
|
|
}
|
|
if ( port == 0 || port == -1 ) {
|
|
log("bad port. exiting\n");
|
|
exit(0);
|
|
}
|
|
s_ip = ip;//h->m_ip;
|
|
s_port = port;//h->m_httpPort;
|
|
}
|
|
|
|
s_injectTitledb = false;
|
|
|
|
//char *coll = "main";
|
|
if ( strncmp(filename,"titledb",7) == 0 ) {
|
|
// a new thing, titledb-gk144 or titledb-coll.main.0
|
|
// init the loop, needs g_conf
|
|
if ( ! g_loop.init() ) {
|
|
log("db: Loop init failed." ); exit(0); }
|
|
// set up the threads, might need g_conf
|
|
if ( ! g_jobScheduler.initialize(2,4,2) ) {
|
|
log("db: Threads init failed." ); exit(0); }
|
|
s_injectTitledb = true;
|
|
s_titledbKey.setMin();
|
|
|
|
// read where we left off from file if possible
|
|
char fname[256];
|
|
sprintf(fname,"./lastinjectdocid.dat");
|
|
SafeBuf ff;
|
|
ff.fillFromFile(fname);
|
|
if ( ff.length() > 1 ) {
|
|
int64_t ffdocId = atoll(ff.getBufStart() );
|
|
// if process got killed in the middle of write
|
|
// i guess the stored docid could be corrupted!
|
|
// so make sure its in startDocId,endDocId range
|
|
if ( ffdocId > 0 &&
|
|
ffdocId >= startDocId &&
|
|
ffdocId < endDocId )
|
|
startDocId = ffdocId;
|
|
else
|
|
log("build: saved docid %" PRId64" not "
|
|
"in [%" PRId64",%" PRId64"]",
|
|
ffdocId,
|
|
startDocId,
|
|
endDocId );
|
|
}
|
|
|
|
if ( startDocId != 0LL )
|
|
s_titledbKey = g_titledb.makeFirstKey(startDocId);
|
|
|
|
s_endDocId = endDocId;
|
|
|
|
// so we do not try to merge files, or write any data:
|
|
g_dumpMode = true;
|
|
|
|
CollectionRec *cr = new (CollectionRec);
|
|
SafeBuf *rb = &g_collectiondb.m_recPtrBuf;
|
|
rb->reserve(4);
|
|
g_collectiondb.m_recs = (CollectionRec **)rb->getBufStart();
|
|
g_collectiondb.m_recs[0] = cr;
|
|
|
|
// right now this is just for the main collection
|
|
const char *coll = "main";
|
|
addCollToTable ( coll , (collnum_t) 0 );
|
|
|
|
// force RdbTree.cpp not to bitch about corruption
|
|
// assume we are only getting out collnum 0 recs i guess
|
|
g_collectiondb.m_numRecs = 1;
|
|
g_titledb.init ();
|
|
// msg5::readList() requires the RdbBase for collnum 0
|
|
// which holds the array of files and the tree
|
|
Rdb *rdb = g_titledb.getRdb();
|
|
static RdbBase *s_base = new ( RdbBase );
|
|
// so getRdbBase always returns
|
|
rdb->m_collectionlessBase = s_base;
|
|
rdb->m_isCollectionLess = true;
|
|
// dir for tree loading
|
|
sprintf(g_hostdb.m_dir , "./" );
|
|
rdb->loadTree();
|
|
// titledb-
|
|
if ( strlen(filename)<=8 ) {
|
|
log(LOG_WARN, "build: need titledb-coll.main.0 or titledb-gk144 not just 'titledb'");
|
|
return false;
|
|
}
|
|
const char *coll2 = filename + 8;
|
|
|
|
char tmp[1024];
|
|
sprintf(tmp,"./%s",coll2);
|
|
s_base->m_dir.set(tmp);
|
|
strcpy(s_base->m_dbname,rdb->getDbname());
|
|
s_base->m_dbnameLen = strlen(rdb->getDbname());
|
|
s_base->m_coll = "main";
|
|
s_base->m_collnum = (collnum_t)0;
|
|
s_base->m_rdb = rdb;
|
|
s_base->m_fixedDataSize = rdb->getFixedDataSize();
|
|
s_base->m_useHalfKeys = rdb->useHalfKeys();
|
|
s_base->m_ks = rdb->getKeySize();
|
|
s_base->m_pageSize = rdb->getPageSize();
|
|
s_base->m_isTitledb = rdb->m_isTitledb;
|
|
s_base->m_minToMerge = 99999;
|
|
// try to set the file info now!
|
|
s_base->setFiles();
|
|
}
|
|
else {
|
|
// open file
|
|
s_file.set ( filename );
|
|
if ( ! s_file.open ( O_RDONLY ) ) {
|
|
log(LOG_WARN, "build: inject: Failed to open file %s for reading.", filename);
|
|
return -1;
|
|
}
|
|
s_off = 0;
|
|
}
|
|
|
|
// this might be a compressed warc like .warc.gz
|
|
s_injectWarc = false;
|
|
s_injectArc = false;
|
|
int flen = strlen(filename);
|
|
if ( flen>5 && strcasecmp(filename+flen-5,".warc")==0 ) {
|
|
s_injectWarc = true;
|
|
}
|
|
if ( flen>5 && strcasecmp(filename+flen-4,".arc")==0 ) {
|
|
s_injectArc = true;
|
|
}
|
|
|
|
|
|
s_coll = coll;
|
|
|
|
if ( ! s_coll ) s_coll = "main";
|
|
|
|
// register sleep callback to get started
|
|
if ( ! g_loop.registerSleepCallback(1, NULL, doInject) ) {
|
|
log( "build: inject: Loop init failed." );
|
|
return -1;
|
|
}
|
|
|
|
// run the loop
|
|
g_loop.runLoop();
|
|
}
|
|
|
|
void doInject ( int fd , void *state ) {
|
|
|
|
if ( s_registered ) {
|
|
s_registered = 0;
|
|
g_loop.unregisterSleepCallback ( NULL, doInject );
|
|
}
|
|
|
|
// turn off threads so this happens right away
|
|
g_jobScheduler.disallow_new_jobs();
|
|
|
|
int64_t fsize ;
|
|
if ( ! s_injectTitledb ) fsize = s_file.getFileSize();
|
|
|
|
// just repeat the function separately. i guess we'd repeat
|
|
// some code but for simplicity i think it is worth it. and we
|
|
// should probably phase out the ++++URL: format thing.
|
|
if ( s_injectWarc ) {
|
|
doInjectWarc ( fsize );
|
|
return;
|
|
}
|
|
|
|
if ( s_injectArc ) {
|
|
doInjectArc ( fsize );
|
|
return;
|
|
}
|
|
|
|
loop:
|
|
|
|
int32_t reqLen;
|
|
int32_t reqAlloc;
|
|
char *req;
|
|
|
|
// if reading from our titledb and injecting into another cluster
|
|
if ( s_injectTitledb ) {
|
|
// turn off threads so this happens right away
|
|
g_jobScheduler.disallow_new_jobs();
|
|
key_t endKey; //endKey.setMax();
|
|
endKey = g_titledb.makeFirstKey(s_endDocId);
|
|
RdbList list;
|
|
Msg5 msg5;
|
|
const char *coll = "main";
|
|
CollectionRec *cr = g_collectiondb.getRec(coll);
|
|
msg5.getList ( RDB_TITLEDB ,
|
|
cr->m_collnum,
|
|
&list ,
|
|
(char *)&s_titledbKey ,
|
|
(char *)&endKey ,
|
|
100 , // minRecSizes ,
|
|
true , // includeTree ,
|
|
0 , // max cache age
|
|
0 , // startFileNum ,
|
|
-1, // numFiles ,
|
|
NULL , // state
|
|
NULL , // callback
|
|
0 , // niceness
|
|
false , // err correction?
|
|
NULL , // cache key ptr
|
|
0 , // retry num
|
|
-1 , // maxRetries
|
|
true , // compensate for merge
|
|
-1LL, // sync point
|
|
false, // isRealMerge
|
|
true); // allowPageCache
|
|
// all done if empty
|
|
if ( list.isEmpty() ) { g_loop.reset(); exit(0); }
|
|
// loop over entries in list
|
|
list.getCurrentKey((char *) &s_titledbKey);
|
|
// advance for next
|
|
s_titledbKey += 1;
|
|
// is it a delete?
|
|
char *rec = list.getCurrentRec ();
|
|
int32_t recSize = list.getCurrentRecSize();
|
|
// skip negative keys!
|
|
if ( (rec[0] & 0x01) == 0x00 ) goto loop;
|
|
// re-enable threads i guess
|
|
g_jobScheduler.allow_new_jobs();
|
|
// set and uncompress
|
|
XmlDoc xd;
|
|
if ( ! xd.set2 ( rec ,
|
|
recSize ,
|
|
coll ,
|
|
NULL , // safebuf
|
|
0 , // niceness
|
|
NULL ) ) { // spiderrequest
|
|
log("build: inject skipping corrupt title rec" );
|
|
goto loop;
|
|
}
|
|
// sanity!
|
|
if ( xd.size_utf8Content > 5000000 ) {
|
|
log("build: inject skipping huge title rec" );
|
|
goto loop;
|
|
}
|
|
// get the content length. uenc can be 2140 bytes! seen it!
|
|
reqAlloc = xd.size_utf8Content + 6000;
|
|
// make space for content
|
|
req = (char *)mmalloc ( reqAlloc , "maininject" );
|
|
if ( ! req ) {
|
|
log("build: inject: Could not allocate %" PRId32" bytes for "
|
|
"request at offset %" PRId64,reqAlloc,s_off);
|
|
exit(0);
|
|
}
|
|
char *ipStr = iptoa(xd.m_ip);
|
|
// encode the url
|
|
char *url = xd.getFirstUrl()->getUrl();
|
|
char uenc[5000];
|
|
urlEncode ( uenc , 4000 , url , strlen(url) , true );
|
|
char *content = xd.ptr_utf8Content;
|
|
int32_t contentLen = xd.size_utf8Content;
|
|
if ( contentLen > 0 ) contentLen--;
|
|
char c = content[contentLen];
|
|
content[contentLen] = '\0';
|
|
// form what we would read from disk
|
|
reqLen = sprintf(req,
|
|
// print as unencoded content for speed
|
|
"POST /inject HTTP/1.0\r\n"
|
|
"Content-Length: 000000000\r\n"//placeholder
|
|
"Content-Type: text/html\r\n"
|
|
"Connection: Close\r\n"
|
|
"\r\n"
|
|
// now the post cgi parms
|
|
"c=%s&"
|
|
// quick docid only reply
|
|
"quick=1&"
|
|
// url of injecting page
|
|
"u=%s&"
|
|
"ip=%s&"
|
|
"firstindexed=%" PRIu32"&"
|
|
"lastspidered=%" PRIu32"&"
|
|
// prevent looking up firstips
|
|
// on all outlinks for speed:
|
|
"spiderlinks=0&"
|
|
"hopcount=%" PRId32"&"
|
|
"newonly=2&" // only inject if new
|
|
"dontlog=1&"
|
|
"charset=%" PRId32"&"
|
|
"ucontent="
|
|
// first the mime
|
|
//"HTTP 200\r\n"
|
|
//"Connection: Close\r\n"
|
|
//"Content-Type: text/html\r\n"
|
|
//"Content-Length: %" PRId32"\r\n"
|
|
//"\r\n"
|
|
// then the content of the injecting page
|
|
"%s"
|
|
, coll
|
|
, uenc
|
|
, ipStr
|
|
, xd.m_firstIndexedDate
|
|
, xd.m_spideredTime
|
|
, (int32_t)*xd.getHopCount()
|
|
, (int32_t)xd.m_charset
|
|
, content
|
|
);
|
|
content[contentLen] = c;
|
|
if ( reqLen >= reqAlloc ) {
|
|
log("inject: bad engineer here");
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
// set content length
|
|
char *start = strstr(req,"c=");
|
|
int32_t realContentLen = strlen(start);
|
|
char *ptr = req ;
|
|
// find start of the 9 zeroes
|
|
while ( *ptr != '0' || ptr[1] !='0' ) ptr++;
|
|
// store length there
|
|
sprintf ( ptr , "%09" PRIu32 , realContentLen );
|
|
// remove the \0
|
|
ptr += strlen(ptr); *ptr = '\r';
|
|
// map it
|
|
int32_t i; for ( i = 0 ; i < MAX_INJECT_SOCKETS ; i++ ) {
|
|
// skip if occupied
|
|
if ( s_req[i] ) continue;
|
|
s_req [i] = req;
|
|
s_docId[i] = xd.m_docId;
|
|
break;
|
|
}
|
|
if ( i >= MAX_INJECT_SOCKETS )
|
|
log("build: could not add req to map");
|
|
}
|
|
else {
|
|
// are we done?
|
|
if ( s_off >= fsize ) {
|
|
log("inject: done parsing file");
|
|
g_loop.reset();
|
|
exit(0);
|
|
}
|
|
// read the mime
|
|
char buf [ 1000*1024 ];
|
|
int32_t maxToRead = 1000*1024;
|
|
int32_t toRead = maxToRead;
|
|
if ( s_off + toRead > fsize ) toRead = fsize - s_off;
|
|
int32_t bytesRead = s_file.read ( buf , toRead , s_off ) ;
|
|
if ( bytesRead != toRead ) {
|
|
log("build: inject: Read of %s failed at offset "
|
|
"%" PRId64, s_file.getFilename(), s_off);
|
|
exit(0);
|
|
}
|
|
|
|
char *fend = buf + toRead;
|
|
|
|
char *pbuf = buf;
|
|
// partap padding?
|
|
if ( pbuf[0] == '\n' ) pbuf++;
|
|
if ( pbuf[0] == '\n' ) pbuf++;
|
|
// need "++URL: "
|
|
for ( ; *pbuf && strncmp(pbuf,"+++URL: ",8) ; pbuf++ );
|
|
// none?
|
|
if ( ! *pbuf ) {
|
|
log("inject: done!");
|
|
exit(0);
|
|
}
|
|
// sometimes line starts with "URL: http://www.xxx.com/\n"
|
|
char *url = pbuf + 8; // NULL;
|
|
// skip over url
|
|
pbuf = strchr(pbuf,'\n');
|
|
// null term url
|
|
*pbuf = '\0';
|
|
// log it
|
|
log("inject: injecting url %s",url);
|
|
// skip to next line
|
|
pbuf++;
|
|
// get offset into "buf"
|
|
int32_t len = pbuf - buf;
|
|
// subtract that from toRead so it is the available bytes left
|
|
toRead -= len;
|
|
// advance this for next read
|
|
s_off += len;
|
|
|
|
// should be a mime that starts with GET or POST
|
|
HttpMime m;
|
|
if ( ! m.set ( pbuf , toRead , NULL ) ) {
|
|
if ( toRead > 128 ) toRead = 128;
|
|
pbuf [ toRead ] = '\0';
|
|
log("build: inject: Failed to set mime at offset "
|
|
"%" PRId64" where request=%s",s_off,buf);
|
|
exit(0);
|
|
}
|
|
// find the end of it, the next "URL: " line or
|
|
// end of file
|
|
char *p = pbuf;
|
|
char *contentPtrEnd = fend;
|
|
for ( ; p < fend ; p++ ) {
|
|
if ( p[0] == '+' &&
|
|
p[1] == '+' &&
|
|
p[2] == '+' &&
|
|
p[3] == 'U' &&
|
|
p[4] == 'R' &&
|
|
p[5] == 'L' &&
|
|
p[6] == ':' &&
|
|
p[7] == ' ' ) {
|
|
contentPtrEnd = p;
|
|
break;
|
|
}
|
|
}
|
|
// point to the content (NOW INCLUDE MIME!)
|
|
char *contentPtr = pbuf;// + m.getMimeLen();
|
|
int32_t contentPtrLen = contentPtrEnd - contentPtr;
|
|
if ( contentPtrEnd == fend && bytesRead == maxToRead ) {
|
|
log("inject: not reading enough content to inject "
|
|
"url %s . increase maxToRead from %" PRId32,url,
|
|
maxToRead);
|
|
exit(0);
|
|
}
|
|
// get the length of content (includes the submime for
|
|
// injection)
|
|
int32_t contentLen = m.getContentLen();
|
|
if ( ! url && contentLen == -1 ) {
|
|
log("build: inject: Mime at offset %" PRId64" does not "
|
|
"specify required Content-Length: XXX field.",
|
|
s_off);
|
|
exit(0);
|
|
}
|
|
// alloc space for mime and content
|
|
reqAlloc = contentPtrLen + 2 + 6000;
|
|
// make space for content
|
|
req = (char *)mmalloc ( reqAlloc , "maininject" );
|
|
if ( ! req ) {
|
|
log("build: inject: Could not allocate %" PRId32" bytes for "
|
|
"request at offset %" PRId64,reqAlloc,s_off);
|
|
exit(0);
|
|
}
|
|
char *rp = req;
|
|
// a different format?
|
|
const char *ipStr = "1.2.3.4";
|
|
rp += sprintf(rp,
|
|
"POST /inject HTTP/1.0\r\n"
|
|
"Content-Length: 000000000\r\n"//bookmrk
|
|
"Content-Type: text/html\r\n"
|
|
"Connection: Close\r\n"
|
|
"\r\n"
|
|
"c=main&"
|
|
// do parsing consistency testing (slower!)
|
|
//"dct=1&"
|
|
// mime is in the "&ucontent=" parm
|
|
"hasmime=1&"
|
|
// prevent looking up firstips
|
|
// on all outlinks for speed:
|
|
"spiderlinks=0&"
|
|
"quick=1&" // quick reply
|
|
"dontlog=1&"
|
|
"ip=%s&"
|
|
"deleteurl=%" PRId32"&"
|
|
"u=",
|
|
ipStr,
|
|
(int32_t)s_isDelete);
|
|
// url encode the url
|
|
rp += urlEncode ( rp , 4000 , url , strlen(url) );
|
|
// finish it up
|
|
rp += sprintf(rp,"&ucontent=");
|
|
|
|
if ( ! url ) {
|
|
// what is this?
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
|
|
// store the content after the &ucontent
|
|
gbmemcpy ( rp , contentPtr , contentPtrLen );
|
|
rp += contentPtrLen;
|
|
|
|
s_off += contentPtrLen;
|
|
|
|
// just for ease of display
|
|
*rp = '\0';
|
|
|
|
|
|
// set content length
|
|
char *start = strstr(req,"c=");
|
|
int32_t realContentLen = strlen(start);
|
|
char *ptr = req ;
|
|
// find start of the 9 zeroes
|
|
while ( *ptr != '0' || ptr[1] !='0' ) ptr++;
|
|
// store length there
|
|
sprintf ( ptr , "%09" PRIu32 , realContentLen );
|
|
// remove the \0
|
|
ptr += strlen(ptr); *ptr = '\r';
|
|
|
|
// set this
|
|
reqLen = rp - req;
|
|
// sanity
|
|
if ( reqLen > reqAlloc ) { g_process.shutdownAbort(true); }
|
|
}
|
|
|
|
int32_t ip = s_ip;
|
|
int32_t port = s_port;
|
|
|
|
// try hosts.conf
|
|
if ( ip == 0 ) {
|
|
// round robin over hosts in s_hosts2
|
|
if ( s_rrn >= s_hosts2.getNumHosts() ) s_rrn = 0;
|
|
Host *h = s_hosts2.getHost ( s_rrn );
|
|
ip = h->m_ip;
|
|
port = h->m_httpPort;
|
|
s_rrn++;
|
|
}
|
|
|
|
// now inject it
|
|
bool status = s_tcp.sendMsg( NULL, 0, ip, port, req, reqAlloc, reqLen, reqLen, NULL, injectedWrapper,
|
|
9999 * 60 * 1000, -1, -1 );
|
|
|
|
// launch another if blocked
|
|
if ( ! status ) {
|
|
if ( ++s_outstanding < MAX_INJECT_SOCKETS ) goto loop;
|
|
return;
|
|
}
|
|
|
|
if ( g_errno )
|
|
log("build: inject had error: %s.",mstrerror(g_errno));
|
|
// free if did not block, tcpserver frees on immediate error
|
|
else
|
|
mfree ( req , reqAlloc , "maininject" );
|
|
// loop if not
|
|
goto loop;
|
|
}
|
|
|
|
|
|
// 100MB per warc rec max. Also the size of the read buffer used by the
// WARC and ARC injectors. Parenthesized so the macro expands to a single
// value inside any surrounding expression.
#define MAXWARCRECSIZE (100*1024*1024)
|
|
|
|
void doInjectWarc ( int64_t fsize ) {
|
|
|
|
static char *s_buf = NULL;
|
|
|
|
static bool s_hasMoreToRead;
|
|
|
|
static char *s_pbuf = NULL;
|
|
static char *s_pbufEnd = NULL;
|
|
|
|
bool needReadMore = false;
|
|
if ( ! s_pbuf ) needReadMore = true;
|
|
|
|
|
|
readMore:
|
|
|
|
if ( needReadMore ) {
|
|
|
|
log("inject: reading %" PRId64" bytes more of warc file"
|
|
,(int64_t)MAXWARCRECSIZE);
|
|
|
|
// are we done?
|
|
if ( s_off >= fsize ) {
|
|
log("inject: done parsing warc file");
|
|
if ( s_outstanding ) {
|
|
log("inject: waiting for socks");return;}
|
|
g_loop.reset();
|
|
exit(0);
|
|
}
|
|
|
|
// read 1MB of data into this buf to get the first WARC record
|
|
// it must be < 1MB or we faulter.
|
|
if ( ! s_buf ) {
|
|
int64_t need = MAXWARCRECSIZE + 1;
|
|
s_buf = (char *)mmalloc ( need ,"sibuf");
|
|
}
|
|
if ( ! s_buf ) {
|
|
log("inject: failed to alloc buf");
|
|
exit(0);
|
|
}
|
|
|
|
int32_t maxToRead = MAXWARCRECSIZE;
|
|
int32_t toRead = maxToRead;
|
|
s_hasMoreToRead = true;
|
|
if ( s_off + toRead > fsize ) {
|
|
toRead = fsize - s_off;
|
|
s_hasMoreToRead = false;
|
|
}
|
|
int32_t bytesRead = s_file.read ( s_buf , toRead , s_off ) ;
|
|
if ( bytesRead != toRead ) {
|
|
log("inject: read of %s failed at offset "
|
|
"%" PRId64, s_file.getFilename(), s_off);
|
|
exit(0);
|
|
}
|
|
// null term what we read
|
|
s_buf[bytesRead] = '\0';
|
|
|
|
// if not enough to constitute a WARC record probably just new lines
|
|
if( toRead < 20 ) {
|
|
log("inject: done processing file");
|
|
if ( s_outstanding ) {
|
|
log("inject: waiting for socks");return;}
|
|
exit(0);
|
|
}
|
|
|
|
// mark the end of what we read
|
|
//char *fend = buf + toRead;
|
|
|
|
// point to what we read
|
|
s_pbuf = s_buf;
|
|
s_pbufEnd = s_buf + bytesRead;
|
|
}
|
|
|
|
loop:
|
|
|
|
char *realStart = s_pbuf;
|
|
|
|
// need at least say 100k for warc header
|
|
if ( s_pbuf + 100000 > s_pbufEnd && s_hasMoreToRead ) {
|
|
needReadMore = true;
|
|
goto readMore;
|
|
}
|
|
|
|
// find "WARC/1.0" or whatever
|
|
char *whp = s_pbuf;
|
|
for ( ; *whp && strncmp(whp,"WARC/",5) ; whp++ );
|
|
// none?
|
|
if ( ! *whp ) {
|
|
log("inject: could not find WARC/1 header start for file=%s",
|
|
s_file.getFilename());
|
|
if ( s_outstanding ) {
|
|
log("inject: waiting for socks");return;}
|
|
exit(0);
|
|
}
|
|
|
|
char *warcHeader = whp;
|
|
|
|
// find end of warc mime HEADER not the content
|
|
char *warcHeaderEnd = strstr(warcHeader,"\r\n\r\n");
|
|
if ( ! warcHeaderEnd ) {
|
|
log("inject: could not find end of WARC header for file=%s.",
|
|
s_file.getFilename());
|
|
if ( s_outstanding ) {
|
|
log("inject: waiting for socks");return;}
|
|
exit(0);
|
|
}
|
|
// \0 term for strstrs below
|
|
*warcHeaderEnd = '\0';
|
|
//warcHeaderEnd += 4;
|
|
|
|
char *warcContent = warcHeaderEnd + 4;
|
|
|
|
// get WARC-Type:
|
|
// revisit (if url was already done before)
|
|
// request (making a GET or DNS request)
|
|
// response (reponse to a GET or dns request)
|
|
// warcinfo (crawling parameters, robots: obey, etc)
|
|
// metadata (fetchTimeMs: 263, hopsFromSeed:P,outlink:)
|
|
char *warcType = strstr(warcHeader,"WARC-Type:");
|
|
if ( ! warcType ) {
|
|
log("inject: could not find WARC-Type:");
|
|
if ( s_outstanding ) {
|
|
log("inject: waiting for socks");return;}
|
|
exit(0);
|
|
}
|
|
warcType += 10;
|
|
for ( ; is_wspace_a(*warcType); warcType++ );
|
|
|
|
// get Content-Type:
|
|
// application/warc-fields (fetch time, hops from seed)
|
|
// application/http; msgtype=request (the GET request)
|
|
// application/http; msgtype=response (the GET reply)
|
|
char *warcConType = strstr(warcHeader,"Content-Type:");
|
|
if ( ! warcConType ) {
|
|
log("inject: could not find Content-Type:");
|
|
if ( s_outstanding ) {
|
|
log("inject: waiting for socks");return;}
|
|
exit(0);
|
|
}
|
|
warcConType += 13;
|
|
for ( ; is_wspace_a(*warcConType); warcConType++ );
|
|
|
|
|
|
// get Content-Length: of WARC header for its content
|
|
char *warcContentLenStr = strstr(warcHeader,"Content-Length:");
|
|
if ( ! warcContentLenStr ) {
|
|
log("inject: could not find WARC "
|
|
"Content-Length:");
|
|
if ( s_outstanding ) {
|
|
log("inject: waiting for socks");return;}
|
|
exit(0);
|
|
}
|
|
warcContentLenStr += 15;
|
|
for(;is_wspace_a(*warcContentLenStr);warcContentLenStr++);
|
|
|
|
// get warc content len
|
|
int64_t warcContentLen = atoll(warcContentLenStr);
|
|
|
|
char *warcContentEnd = warcContent + warcContentLen;
|
|
|
|
uint64_t oldOff = s_off;
|
|
|
|
uint64_t recSize = (warcContentEnd - realStart);
|
|
|
|
// point to end of this warc record
|
|
s_pbuf += recSize;
|
|
|
|
// if we fall outside of the current read buf then re-read
|
|
if ( s_pbuf > s_pbufEnd ) {
|
|
if ( ! s_hasMoreToRead ) {
|
|
log("inject: warc file exceeded file length.");
|
|
if ( s_outstanding ) {
|
|
log("inject: waiting for socks");return;}
|
|
exit(0);
|
|
}
|
|
if ( recSize > MAXWARCRECSIZE ) {
|
|
log("inject: skipping warc file of %" PRId64" "
|
|
"bytes which is too big",recSize);
|
|
s_off += recSize;
|
|
}
|
|
needReadMore = true;
|
|
goto readMore;
|
|
}
|
|
|
|
// advance this for next read from the file
|
|
s_off += recSize; // (warcContentEnd - realStart);//s_buf);
|
|
|
|
|
|
// if WARC-Type: is not response, skip it. so if it
|
|
// is a revisit then skip it i guess.
|
|
if ( strncmp ( warcType,"response", 8 ) ) {
|
|
// read another warc record
|
|
goto loop;
|
|
}
|
|
|
|
// warcConType needs to be
|
|
// application/http; msgtype=response
|
|
if ( strncmp(warcConType,"application/http; msgtype=response", 34) ) {
|
|
// read another warc record
|
|
goto loop;
|
|
}
|
|
|
|
// convert to timestamp
|
|
int64_t warcTime = 0;
|
|
char *warcDateStr = strstr(warcHeader,"WARC-Date:");
|
|
if( warcDateStr ) {
|
|
if ( warcDateStr ) warcDateStr += 10;
|
|
for(;warcDateStr && is_wspace_a(*warcDateStr);warcDateStr++);
|
|
if ( warcDateStr ) warcTime = atotime ( warcDateStr );
|
|
}
|
|
|
|
// set the url now
|
|
char *url = strstr(warcHeader,"WARC-Target-URI:");
|
|
if ( url ) url += 16;
|
|
// skip spaces
|
|
for ( ; url && is_wspace_a(*url) ; url++ );
|
|
if ( ! url ) {
|
|
log("inject: could not find WARC-Target-URI:");
|
|
if ( s_outstanding ) {
|
|
log("inject: waiting for socks");return;}
|
|
exit(0);
|
|
}
|
|
// find end of it
|
|
char *urlEnd = url;
|
|
for (;urlEnd&&*urlEnd&&is_urlchar(*urlEnd);urlEnd++);
|
|
|
|
// null term url
|
|
*urlEnd = '\0';
|
|
|
|
char *httpReply = warcContent;
|
|
int64_t httpReplySize = warcContentLen;
|
|
|
|
// sanity check
|
|
//char *bufEnd = s_buf + MAXWARCRECSIZE;
|
|
if ( httpReply + httpReplySize >= s_pbufEnd ) {
|
|
int needMore = httpReply + httpReplySize - s_pbufEnd;
|
|
log("inject: not reading enough content to inject "
|
|
"url %s . increase MAXWARCRECSIZE by %" PRId32" more",url,
|
|
needMore);
|
|
exit(0);
|
|
}
|
|
|
|
// should be a mime that starts with GET or POST
|
|
HttpMime m;
|
|
if ( ! m.set ( httpReply , httpReplySize , NULL ) ) {
|
|
log("inject: failed to set http mime at %" PRId64" in file"
|
|
,oldOff);
|
|
goto loop;
|
|
// exit(0);
|
|
}
|
|
|
|
// check content type
|
|
int ct = m.getContentType();
|
|
if ( ct != CT_HTML &&
|
|
ct != CT_TEXT &&
|
|
ct != CT_XML &&
|
|
ct != CT_JSON ) {
|
|
goto loop;
|
|
}
|
|
|
|
|
|
SafeBuf req;
|
|
|
|
// a different format?
|
|
const char *ipStr = "1.2.3.4";
|
|
req.safePrintf(
|
|
"POST /admin/inject HTTP/1.0\r\n"
|
|
"Content-Length: 000000000\r\n"//bookmrk
|
|
"Content-Type: text/html\r\n"
|
|
"Connection: Close\r\n"
|
|
"\r\n"
|
|
// we need this ?
|
|
"?"
|
|
"c=%s&"
|
|
// do parsing consistency testing (slower!)
|
|
//"dct=1&"
|
|
"hasmime=1&"
|
|
// prevent looking up firstips
|
|
// on all outlinks for speed:
|
|
"spiderlinks=0&"
|
|
"quick=1&" // quick reply
|
|
"dontlog=0&"
|
|
|
|
// do not do re-injects. should save a TON of time
|
|
"newonly=1&"
|
|
|
|
"lastspidered=%" PRId64"&"
|
|
"firstindexed=%" PRId64"&"
|
|
|
|
"deleteurl=0&"
|
|
"ip=%s&"
|
|
"u="
|
|
,s_coll
|
|
|
|
,warcTime
|
|
,warcTime
|
|
|
|
,ipStr
|
|
);
|
|
|
|
// url encode the url
|
|
req.urlEncode ( url );
|
|
// finish it up
|
|
req.safePrintf("&content=");
|
|
// store the content after the &ucontent
|
|
req.urlEncode ( httpReply , httpReplySize );
|
|
req.nullTerm();
|
|
|
|
|
|
// replace 00000 with the REAL content length
|
|
char *start = strstr(req.getBufStart(),"c=");
|
|
int32_t realContentLen = strlen(start);
|
|
char *ptr = req.getBufStart() ;
|
|
// find start of the 9 zeroes
|
|
while ( *ptr != '0' || ptr[1] !='0' ) ptr++;
|
|
// store length there
|
|
sprintf ( ptr , "%09" PRIu32 , realContentLen );
|
|
// remove the \0
|
|
ptr += strlen(ptr); *ptr = '\r';
|
|
|
|
int32_t ip = s_ip;
|
|
int32_t port = s_port;
|
|
|
|
// try hosts.conf
|
|
if ( ip == 0 ) {
|
|
// round robin over hosts in s_hosts2
|
|
if ( s_rrn >= s_hosts2.getNumHosts() ) s_rrn = 0;
|
|
Host *h = s_hosts2.getHost ( s_rrn );
|
|
ip = h->m_ip;
|
|
port = h->m_httpPort;
|
|
s_rrn++;
|
|
}
|
|
|
|
// log it
|
|
log("inject: injecting to %s:%i WARC url %s",iptoa(ip),(int)port,url);
|
|
|
|
// now inject it
|
|
bool status = s_tcp.sendMsg( NULL, 0, ip, port, req.getBufStart(), req.getCapacity(), req.length(),
|
|
req.length(), NULL, injectedWrapper,
|
|
// because it seems some sockets get stuck and
|
|
// they have no reply but the host they are
|
|
// connected to no longer has the connection
|
|
// open. and the readbuf is empty, but the send
|
|
// buf has been sent and it appears the inject
|
|
// when through. just the reply was never
|
|
// sent back for some reason.
|
|
5 * 60 * 1000, // timeout, 5 mins
|
|
-1, -1 );
|
|
|
|
int realMax = 10;
|
|
if ( s_hosts2.getNumHosts() > 1 )
|
|
realMax = s_hosts2.getNumHosts() * 2;
|
|
|
|
// launch another if blocked
|
|
if ( ! status ) {
|
|
// let injectedWrapper() below free it
|
|
req.detachBuf();
|
|
s_outstanding++;
|
|
if ( s_outstanding < MAX_INJECT_SOCKETS &&
|
|
s_outstanding < realMax )
|
|
goto loop;
|
|
return;
|
|
}
|
|
|
|
if ( g_errno ) {
|
|
// let tcpserver.cpp free it
|
|
req.detachBuf();
|
|
log("build: inject had error: %s.",mstrerror(g_errno));
|
|
}
|
|
// loop if not
|
|
goto loop;
|
|
}
|
|
|
|
|
|
void doInjectArc ( int64_t fsize ) {
|
|
|
|
static char *s_buf = NULL;
|
|
|
|
static bool s_hasMoreToRead;
|
|
|
|
static char *s_pbuf = NULL;
|
|
static char *s_pbufEnd = NULL;
|
|
|
|
bool needReadMore = false;
|
|
if ( ! s_pbuf ) needReadMore = true;
|
|
|
|
|
|
readMore:
|
|
|
|
if ( needReadMore ) {
|
|
|
|
log("inject: reading %" PRId64" bytes more of arc file"
|
|
,(int64_t)MAXWARCRECSIZE);
|
|
|
|
// are we done?
|
|
if ( s_off >= fsize ) {
|
|
log("inject: done parsing arc file");
|
|
if ( s_outstanding ) {
|
|
log("inject: waiting for socks");return;}
|
|
g_loop.reset();
|
|
exit(0);
|
|
}
|
|
|
|
// read 1MB of data into this buf to get the first WARC record
|
|
// it must be < 1MB or we faulter.
|
|
if ( ! s_buf ) {
|
|
int64_t need = MAXWARCRECSIZE + 1;
|
|
s_buf = (char *)mmalloc ( need ,"sibuf");
|
|
}
|
|
if ( ! s_buf ) {
|
|
log("inject: failed to alloc buf");
|
|
exit(0);
|
|
}
|
|
|
|
int32_t maxToRead = MAXWARCRECSIZE;
|
|
int32_t toRead = maxToRead;
|
|
s_hasMoreToRead = true;
|
|
if ( s_off + toRead > fsize ) {
|
|
toRead = fsize - s_off;
|
|
s_hasMoreToRead = false;
|
|
}
|
|
int32_t bytesRead = s_file.read ( s_buf , toRead , s_off ) ;
|
|
if ( bytesRead != toRead ) {
|
|
log("inject: read of %s failed at offset "
|
|
"%" PRId64, s_file.getFilename(), s_off);
|
|
exit(0);
|
|
}
|
|
// null term what we read
|
|
s_buf[bytesRead] = '\0';
|
|
|
|
// if not enough to constitute a ARC record probably just new
|
|
// lines
|
|
if( toRead < 20 ) {
|
|
log("inject: done processing file");
|
|
if ( s_outstanding ) {
|
|
log("inject: waiting for socks");return;}
|
|
exit(0);
|
|
}
|
|
|
|
// point to what we read
|
|
s_pbuf = s_buf;
|
|
s_pbufEnd = s_buf + bytesRead;
|
|
}
|
|
|
|
loop:
|
|
|
|
char *realStart = s_pbuf;
|
|
|
|
// need at least say 100k for arc header
|
|
if ( s_pbuf + 100000 > s_pbufEnd && s_hasMoreToRead ) {
|
|
needReadMore = true;
|
|
goto readMore;
|
|
}
|
|
|
|
// find \n\nhttp://
|
|
char *whp = s_pbuf;
|
|
for ( ; *whp ; whp++ ) {
|
|
if ( whp[0] != '\n' ) continue;
|
|
if ( strncmp(whp+1,"http://",7) ) continue;
|
|
break;
|
|
}
|
|
// none?
|
|
if ( ! *whp ) {
|
|
log("inject: could not find next \\nhttp:// in arc file");
|
|
if ( s_outstanding ) {log("inject: waiting for socks");return;}
|
|
exit(0);
|
|
}
|
|
|
|
char *arcHeader = whp;
|
|
|
|
// find end of arc header not the content
|
|
char *arcHeaderEnd = strstr(arcHeader+1,"\n");
|
|
if ( ! arcHeaderEnd ) {
|
|
log("inject: could not find end of ARC header.");
|
|
exit(0);
|
|
}
|
|
// \0 term for strstrs below
|
|
*arcHeaderEnd = '\0';
|
|
|
|
char *arcContent = arcHeaderEnd + 1;
|
|
|
|
// parse arc header line
|
|
char *url = arcHeader + 1;
|
|
char *hp = url;
|
|
|
|
for ( ; *hp && *hp != ' ' ; hp++ );
|
|
if ( ! *hp ) {log("inject: bad arc header 1.");exit(0);}
|
|
*hp++ = '\0';
|
|
char *ipStr = hp;
|
|
|
|
|
|
for ( ; *hp && *hp != ' ' ; hp++ );
|
|
if ( ! *hp ) {log("inject: bad arc header 2.");exit(0);}
|
|
*hp++ = '\0';
|
|
char *timeStr = hp;
|
|
|
|
|
|
for ( ; *hp && *hp != ' ' ; hp++ );
|
|
if ( ! *hp ) {log("inject: bad arc header 3.");exit(0);}
|
|
*hp++ = '\0'; // null term timeStr
|
|
char *arcConType = hp;
|
|
|
|
for ( ; *hp && *hp != ' ' ; hp++ );
|
|
if ( ! *hp ) {log("inject: bad arc header 4.");exit(0);}
|
|
*hp++ = '\0'; // null term arcContentType
|
|
|
|
char *arcContentLenStr = hp;
|
|
|
|
// get arc content len
|
|
int64_t arcContentLen = atoll(arcContentLenStr);
|
|
|
|
char *arcContentEnd = arcContent + arcContentLen;
|
|
|
|
//uint64_t oldOff = s_off;
|
|
|
|
uint64_t recSize = (arcContentEnd - realStart);
|
|
|
|
// point to end of this arc record
|
|
s_pbuf += recSize;
|
|
|
|
// if we fall outside of the current read buf then re-read
|
|
if ( s_pbuf > s_pbufEnd ) {
|
|
if ( ! s_hasMoreToRead ) {
|
|
log("inject: arc file exceeded file length.");
|
|
if ( s_outstanding ) {
|
|
log("inject: waiting for socks");return;}
|
|
exit(0);
|
|
}
|
|
if ( recSize > MAXWARCRECSIZE ) {
|
|
log("inject: skipping arc file of %" PRId64" "
|
|
"bytes which is too big",recSize);
|
|
s_off += recSize;
|
|
}
|
|
needReadMore = true;
|
|
goto readMore;
|
|
}
|
|
|
|
// advance this for next read from the file
|
|
s_off += recSize;
|
|
|
|
|
|
// arcConType needs to indexable
|
|
int32_t ct = getContentTypeFromStr ( arcConType );
|
|
if ( ct != CT_HTML &&
|
|
ct != CT_TEXT &&
|
|
ct != CT_XML &&
|
|
ct != CT_JSON ) {
|
|
// read another arc record
|
|
goto loop;
|
|
}
|
|
|
|
// convert to timestamp
|
|
int64_t arcTime = 0;
|
|
// this time structure, once filled, will help yield a time_t
|
|
struct tm t;
|
|
// DAY OF MONTH
|
|
t.tm_mday = atol2 ( timeStr + 6 , 2 );
|
|
// MONTH
|
|
t.tm_mon = atol2 ( timeStr + 4 , 2 );
|
|
// YEAR
|
|
t.tm_year = atol2 ( timeStr , 4 ) - 1900 ; // # of years since 1900
|
|
// TIME
|
|
t.tm_hour = atol2 ( timeStr + 8 , 2 );
|
|
t.tm_min = atol2 ( timeStr + 10 , 2 );
|
|
t.tm_sec = atol2 ( timeStr + 12 , 2 );
|
|
// unknown if we're in daylight savings time
|
|
t.tm_isdst = -1;
|
|
// translate using mktime
|
|
arcTime = timegm ( &t );
|
|
|
|
|
|
char *httpReply = arcContent;
|
|
int64_t httpReplySize = arcContentLen;
|
|
|
|
// sanity check
|
|
if ( httpReply + httpReplySize >= s_pbufEnd ) {
|
|
int needMore = httpReply + httpReplySize - s_pbufEnd;
|
|
log("inject: not reading enough content to inject "
|
|
"url %s . increase MAXWARCRECSIZE by %" PRId32" more",url,
|
|
needMore);
|
|
exit(0);
|
|
}
|
|
|
|
|
|
SafeBuf req;
|
|
|
|
// a different format?
|
|
req.safePrintf(
|
|
"POST /admin/inject HTTP/1.0\r\n"
|
|
"Content-Length: 000000000\r\n"//bookmrk
|
|
"Content-Type: text/html\r\n"
|
|
"Connection: Close\r\n"
|
|
"\r\n"
|
|
// we need this ?
|
|
"?"
|
|
"c=%s&"
|
|
// do parsing consistency testing (slower!)
|
|
//"dct=1&"
|
|
"hasmime=1&"
|
|
// prevent looking up firstips
|
|
// on all outlinks for speed:
|
|
"spiderlinks=0&"
|
|
"quick=1&" // quick reply
|
|
"dontlog=0&"
|
|
|
|
// do not do re-injects. should save a TON of time
|
|
"newonly=1&"
|
|
|
|
"lastspidered=%" PRId64"&"
|
|
"firstindexed=%" PRId64"&"
|
|
|
|
"deleteurl=0&"
|
|
"ip=%s&"
|
|
"u="
|
|
,s_coll
|
|
|
|
,arcTime
|
|
,arcTime
|
|
|
|
,ipStr
|
|
);
|
|
|
|
// url encode the url
|
|
req.urlEncode ( url );
|
|
// finish it up
|
|
req.safePrintf("&content=");
|
|
// store the content after the &ucontent
|
|
req.urlEncode ( httpReply , httpReplySize );
|
|
req.nullTerm();
|
|
|
|
|
|
// replace 00000 with the REAL content length
|
|
char *start = strstr(req.getBufStart(),"c=");
|
|
int32_t realContentLen = strlen(start);
|
|
char *ptr = req.getBufStart() ;
|
|
// find start of the 9 zeroes
|
|
while ( *ptr != '0' || ptr[1] !='0' ) ptr++;
|
|
// store length there
|
|
sprintf ( ptr , "%09" PRIu32 , realContentLen );
|
|
// remove the \0
|
|
ptr += strlen(ptr); *ptr = '\r';
|
|
|
|
|
|
int32_t ip = s_ip;
|
|
int32_t port = s_port;
|
|
|
|
// try hosts.conf
|
|
if ( ip == 0 ) {
|
|
// round robin over hosts in s_hosts2
|
|
if ( s_rrn >= s_hosts2.getNumHosts() ) s_rrn = 0;
|
|
Host *h = s_hosts2.getHost ( s_rrn );
|
|
ip = h->m_ip;
|
|
port = h->m_httpPort;
|
|
s_rrn++;
|
|
}
|
|
|
|
// log it
|
|
log("inject: injecting ARC %s to %s:%i contentLen=%" PRId64
|
|
,url
|
|
,iptoa(ip)
|
|
,(int)port
|
|
,arcContentLen);
|
|
|
|
// now inject it
|
|
bool status = s_tcp.sendMsg( NULL, 0, ip, port, req.getBufStart(), req.getCapacity(), req.length(),
|
|
req.length(), NULL, injectedWrapper,
|
|
// because it seems some sockets get stuck and
|
|
// they have no reply but the host they are
|
|
// connected to no longer has the connection
|
|
// open. and the readbuf is empty, but the send
|
|
// buf has been sent and it appears the inject
|
|
// when through. just the reply was never
|
|
// sent back for some reason.
|
|
5 * 60 * 1000, // timeout, 5 mins
|
|
-1, -1 );
|
|
|
|
int realMax = 10;
|
|
if ( s_hosts2.getNumHosts() > 1 )
|
|
realMax = s_hosts2.getNumHosts() * 3;
|
|
|
|
// launch another if blocked
|
|
if ( ! status ) {
|
|
// let injectedWrapper() below free it
|
|
req.detachBuf();
|
|
s_outstanding++;
|
|
if ( s_outstanding < MAX_INJECT_SOCKETS &&
|
|
s_outstanding < realMax )
|
|
goto loop;
|
|
return;
|
|
}
|
|
|
|
if ( g_errno ) {
|
|
// let tcpserver.cpp free it
|
|
req.detachBuf();
|
|
log("build: inject had error: %s.",mstrerror(g_errno));
|
|
}
|
|
// loop if not
|
|
goto loop;
|
|
}
|
|
|
|
|
|
void injectedWrapper ( void *state , TcpSocket *s ) {
|
|
s_outstanding--;
|
|
|
|
// wtf is this? s_tcp is counting THIS socket so say "== 1"
|
|
if ( s_tcp.m_numUsed == 1 && s_outstanding > 0 ) {
|
|
log("inject: resetting s_outstanding to 0");
|
|
s_outstanding = 0;
|
|
}
|
|
|
|
// debug note
|
|
logf(LOG_DEBUG,"inject: out=%i used=%i",(int)s_outstanding,(int)s_tcp.m_numUsed);
|
|
|
|
// errno?
|
|
if ( g_errno ) {
|
|
log("inject: Got server error: %s.",
|
|
mstrerror(g_errno));
|
|
doInject(0,NULL);
|
|
return;
|
|
}
|
|
// free send buf
|
|
char *req = s->m_sendBuf;
|
|
int32_t reqAlloc = s->m_sendBufSize;
|
|
mfree ( req , reqAlloc , "maininject");
|
|
s->m_sendBuf = NULL;
|
|
|
|
int32_t i;
|
|
static int32_t s_last = 0;
|
|
int32_t now = getTimeLocal();
|
|
|
|
// save docid every 10 seconds
|
|
if ( now - s_last > 10 ) {
|
|
int64_t minDocId = 0x0000ffffffffffffLL;
|
|
// get min outstanding docid inject request
|
|
for ( i = 0 ; i < MAX_INJECT_SOCKETS ; i++ ) {
|
|
// skip if occupied
|
|
if ( ! s_req[i] ) continue;
|
|
if ( s_docId[i] < minDocId ) minDocId = s_docId[i];
|
|
}
|
|
// map it
|
|
bool saveIt = false;
|
|
// are we the min?
|
|
int32_t i; for ( i = 0 ; i < MAX_INJECT_SOCKETS ; i++ ) {
|
|
// skip if occupied
|
|
if ( s_req[i] != req ) continue;
|
|
// we got our request
|
|
if ( s_docId[i] == minDocId ) saveIt = true;
|
|
break;
|
|
}
|
|
if ( saveIt ) {
|
|
s_last = now;
|
|
SafeBuf sb;
|
|
sb.safePrintf("%" PRId64"\n",minDocId);
|
|
char fname[256];
|
|
//sprintf(fname,"%s/lastinjectdocid.dat",g_hostdb.m_dir
|
|
sprintf(fname,"./lastinjectdocid.dat");
|
|
sb.dumpToFile(fname);
|
|
}
|
|
}
|
|
|
|
// remove ourselves from map
|
|
for ( i = 0 ; i < MAX_INJECT_SOCKETS ; i++ )
|
|
if ( s_req[i] == req ) s_req[i] = NULL;
|
|
|
|
// get return code
|
|
char *reply = s->m_readBuf;
|
|
logf(LOG_INFO,"inject: reply=\"%s\"",reply);
|
|
doInject(0,NULL);
|
|
}
|
|
|
|
void saveRdbs ( int fd , void *state ) {
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
int64_t last;
|
|
Rdb *rdb ;
|
|
// . try saving every 10 minutes from time of last write to disk
|
|
// . if nothing more added to tree since then, Rdb::close() return true
|
|
// . this is in MINUTES
|
|
int64_t delta = (int64_t)g_conf.m_autoSaveFrequency *60000LL;
|
|
if ( delta <= 0 ) return;
|
|
// jitter it up a bit so not all hostIds save at same time, 15 secs
|
|
delta += (int64_t)(g_hostdb.m_hostId % 10) * 15000LL + (rand()%7500);
|
|
rdb = g_tagdb.getRdb();
|
|
last = rdb->getLastWriteTime();
|
|
if ( now - last > delta )
|
|
if ( ! rdb->close(NULL,NULL,false,false)) return;
|
|
rdb = g_posdb.getRdb();
|
|
last = rdb->getLastWriteTime();
|
|
if ( now - last > delta )
|
|
if ( ! rdb->close(NULL,NULL,false,false)) return;
|
|
rdb = g_titledb.getRdb();
|
|
last = rdb->getLastWriteTime();
|
|
if ( now - last > delta )
|
|
if ( ! rdb->close(NULL,NULL,false,false)) return;
|
|
rdb = g_spiderdb.getRdb();
|
|
last = rdb->getLastWriteTime();
|
|
if ( now - last > delta )
|
|
if ( ! rdb->close(NULL,NULL,false,false)) return;
|
|
rdb = g_clusterdb.getRdb();
|
|
last = rdb->getLastWriteTime();
|
|
if ( now - last > delta )
|
|
if ( ! rdb->close(NULL,NULL,false,false)) return;
|
|
rdb = g_statsdb.getRdb();
|
|
last = rdb->getLastWriteTime();
|
|
if ( now - last > delta )
|
|
if ( ! rdb->close(NULL,NULL,false,false)) return;
|
|
}
|
|
|
|
bool memTest() {
|
|
// let's ensure our core file can dump
|
|
struct rlimit lim;
|
|
lim.rlim_cur = lim.rlim_max = RLIM_INFINITY;
|
|
if ( setrlimit(RLIMIT_CORE,&lim) )
|
|
log("db: setrlimit: %s.", mstrerror(errno) );
|
|
|
|
void *ptrs[4096];
|
|
int numPtrs=0;
|
|
int i;
|
|
g_conf.m_maxMem = 0xffffffffLL;
|
|
g_mem.init( );//g_mem.m_maxMem );
|
|
|
|
|
|
fprintf(stderr, "memtest: Testing memory bus bandwidth.\n");
|
|
// . read in 20MB 100 times (~2GB total)
|
|
// . tests main memory throughput
|
|
fprintf(stderr, "memtest: Testing main memory.\n");
|
|
membustest ( 20*1024*1024 , 100 , true );
|
|
// . read in 1MB 2,000 times (~2GB)
|
|
// . tests the L2 cache
|
|
fprintf(stderr, "memtest: Testing 1MB L2 cache.\n");
|
|
membustest ( 1024*1024 , 2000 , true );
|
|
// . read in 8000 200,000 times (~1.6GB)
|
|
// . tests the L1 cache
|
|
fprintf(stderr, "memtest: Testing 8KB L1 cache.\n");
|
|
membustest ( 8000 , 100000 , true );
|
|
|
|
fprintf(stderr, "memtest: Allocating up to %" PRId64" bytes\n",
|
|
g_conf.m_maxMem);
|
|
for (i=0;i<4096;i++) {
|
|
ptrs[numPtrs] = mmalloc(1024*1024, "memtest");
|
|
if (!ptrs[numPtrs]) break;
|
|
numPtrs++;
|
|
}
|
|
|
|
fprintf(stderr, "memtest: Was able to allocate %" PRId64" bytes of a "
|
|
"total of "
|
|
"%" PRId64" bytes of memory attempted.\n",
|
|
g_mem.getUsedMem(),g_conf.m_maxMem);
|
|
|
|
return true;
|
|
}
|
|
|
|
// . read in "nb" bytes, loops times,
|
|
// . if readf is false, do write test, not read test
|
|
void membustest ( int32_t nb , int32_t loops , bool readf ) {
|
|
int32_t count = loops;
|
|
|
|
// don't exceed 50NB
|
|
if ( nb > 50*1024*1024 ) {
|
|
fprintf(stderr,"memtest: truncating to 50 Megabytes.\n");
|
|
nb = 50*1024*1024;
|
|
}
|
|
|
|
int32_t n = nb ; //* 1024 * 1024 ;
|
|
|
|
int32_t bufSize = 50*1024*1024;
|
|
char *buf = (char *) mmalloc ( bufSize , "main" );
|
|
if ( ! buf ) return;
|
|
char *bufStart = buf;
|
|
char *bufEnd = buf + n;
|
|
|
|
// pre-read it so sbrk() can do its thing
|
|
for ( int32_t i = 0 ; i < n ; i++ ) buf[i] = 1;
|
|
|
|
// time stamp
|
|
int64_t t = gettimeofdayInMilliseconds();
|
|
|
|
fprintf(stderr,"memtest: start = %" PRId64"\n",t);
|
|
|
|
// . time the read loop
|
|
// . each read should only be 2 assenbly movl instructions:
|
|
// movl -52(%ebp), %eax
|
|
// movl (%eax), %eax
|
|
// movl -52(%ebp), %eax
|
|
// movl 4(%eax), %eax
|
|
// ...
|
|
loop:
|
|
int32_t c;
|
|
|
|
if ( readf ) {
|
|
while ( buf < bufEnd ) {
|
|
// repeat 16x for efficiency.limit comparison to bufEnd
|
|
c = *(int32_t *)(buf+ 0);
|
|
c = *(int32_t *)(buf+ 4);
|
|
c = *(int32_t *)(buf+ 8);
|
|
c = *(int32_t *)(buf+12);
|
|
c = *(int32_t *)(buf+16);
|
|
c = *(int32_t *)(buf+20);
|
|
c = *(int32_t *)(buf+24);
|
|
c = *(int32_t *)(buf+28);
|
|
c = *(int32_t *)(buf+32);
|
|
c = *(int32_t *)(buf+36);
|
|
c = *(int32_t *)(buf+40);
|
|
c = *(int32_t *)(buf+44);
|
|
c = *(int32_t *)(buf+48);
|
|
c = *(int32_t *)(buf+52);
|
|
c = *(int32_t *)(buf+56);
|
|
c = *(int32_t *)(buf+60);
|
|
buf += 64;
|
|
}
|
|
}
|
|
else {
|
|
while ( buf < bufEnd ) {
|
|
// repeat 8x for efficiency. limit comparison to bufEnd
|
|
*(int32_t *)(buf+ 0) = 0;
|
|
*(int32_t *)(buf+ 4) = 1;
|
|
*(int32_t *)(buf+ 8) = 2;
|
|
*(int32_t *)(buf+12) = 3;
|
|
*(int32_t *)(buf+16) = 4;
|
|
*(int32_t *)(buf+20) = 5;
|
|
*(int32_t *)(buf+24) = 6;
|
|
*(int32_t *)(buf+28) = 7;
|
|
buf += 32;
|
|
}
|
|
}
|
|
if ( --count > 0 ) {
|
|
buf = bufStart;
|
|
goto loop;
|
|
}
|
|
|
|
// completed
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
fprintf(stderr,"memtest: now = %" PRId64"\n",t);
|
|
// multiply by 4 since these are int32_ts
|
|
const char *op = "read";
|
|
if ( ! readf ) op = "wrote";
|
|
fprintf(stderr,"memtest: %s %" PRId32" bytes (x%" PRId32") in"
|
|
"%" PRIu64" ms.\n",
|
|
op , n , loops , now - t );
|
|
// stats
|
|
if ( now - t == 0 ) now++;
|
|
double d = (1000.0*(double)loops*(double)(n)) / ((double)(now - t));
|
|
fprintf(stderr,"memtest: we did %.2f MB/sec.\n" , d/(1024.0*1024.0));
|
|
|
|
mfree ( bufStart , bufSize , "main" );
|
|
|
|
return ;
|
|
}
|
|
|
|
|
|
bool cacheTest() {
|
|
|
|
g_conf.m_maxMem = 2000000000LL; // 2G
|
|
//g_mem.m_maxMem = 2000000000LL; // 2G
|
|
|
|
hashinit();
|
|
|
|
// use an rdb cache
|
|
RdbCache c;
|
|
// init, 50MB
|
|
int32_t maxMem = 50000000;
|
|
// . how many nodes in cache tree can we fit?
|
|
// . each rec is key (12) and ip(4)
|
|
// . overhead in cache is 56
|
|
// . that makes 56 + 4 = 60
|
|
// . not correct? stats suggest it's less than 25 bytes each
|
|
int32_t maxCacheNodes = maxMem / 25;
|
|
// set the cache
|
|
if ( ! c.init ( maxMem ,
|
|
4 , // fixed data size of rec
|
|
false , // support lists of recs?
|
|
maxCacheNodes ,
|
|
false , // use half keys?
|
|
"cachetest" , // dbname
|
|
false )) {// save cache to disk?
|
|
log(LOG_WARN, "test: Cache init failed.");
|
|
return false;
|
|
}
|
|
|
|
int32_t numRecs = 0 * maxCacheNodes;
|
|
logf(LOG_DEBUG,"test: Adding %" PRId32" recs to cache.",numRecs);
|
|
|
|
// timestamp
|
|
int32_t timestamp = 42;
|
|
// keep ring buffer of last 10 keys
|
|
key_t oldk[10];
|
|
int32_t oldip[10];
|
|
int32_t b = 0;
|
|
// fill with random recs
|
|
for ( int32_t i = 0 ; i < numRecs ; i++ ) {
|
|
if ( (i % 100000) == 0 )
|
|
logf(LOG_DEBUG,"test: Added %" PRId32" recs to cache.",i);
|
|
// random key
|
|
key_t k ;
|
|
k.n1 = rand();
|
|
k.n0 = rand();
|
|
k.n0 <<= 32;
|
|
k.n0 |= rand();
|
|
int32_t ip = rand();
|
|
// keep ring buffer
|
|
oldk [b] = k;
|
|
oldip[b] = ip;
|
|
if ( ++b >= 10 ) b = 0;
|
|
// make rec,size, like dns, will be 4 byte hash and 4 byte key?
|
|
c.addRecord((collnum_t)0,k,(char *)&ip,4,timestamp);
|
|
// reset g_errno in case it had an error (we don't care)
|
|
g_errno = 0;
|
|
// get a rec too!
|
|
if ( i < 10 ) continue;
|
|
int32_t next = b + 1;
|
|
if ( next >= 10 ) next = 0;
|
|
key_t back = oldk[next];
|
|
char *rec;
|
|
int32_t recSize;
|
|
if ( ! c.getRecord ( (collnum_t)0 ,
|
|
back ,
|
|
&rec ,
|
|
&recSize ,
|
|
false , // do copy?
|
|
-1 , // maxAge ,
|
|
true , // inc count?
|
|
NULL , // *cachedTime = NULL,
|
|
true )){ // promoteRecord?
|
|
g_process.shutdownAbort(true); }
|
|
if ( ! rec || recSize != 4 || *(int32_t *)rec != oldip[next] ) {
|
|
g_process.shutdownAbort(true); }
|
|
}
|
|
|
|
// now try variable sized recs
|
|
c.reset();
|
|
|
|
logf(LOG_DEBUG,"test: Testing variably-sized recs.");
|
|
|
|
// init, 300MB
|
|
maxMem = 300000000;
|
|
// . how many nodes in cache tree can we fit?
|
|
// . each rec is key (12) and ip(4)
|
|
// . overhead in cache is 56
|
|
// . that makes 56 + 4 = 60
|
|
// . not correct? stats suggest it's less than 25 bytes each
|
|
maxCacheNodes = maxMem / 5000;
|
|
//maxCacheNodes = 1200;
|
|
// set the cache
|
|
if ( ! c.init ( maxMem ,
|
|
-1 , // fixed data size of rec
|
|
false , // support lists of recs?
|
|
maxCacheNodes ,
|
|
false , // use half keys?
|
|
"cachetest" , // dbname
|
|
false )) { // save cache to disk?
|
|
log(LOG_WARN, "test: Cache init failed.");
|
|
return false;
|
|
}
|
|
|
|
numRecs = 30 * maxCacheNodes;
|
|
logf(LOG_DEBUG,"test: Adding %" PRId32" recs to cache.",numRecs);
|
|
|
|
// timestamp
|
|
timestamp = 42;
|
|
// keep ring buffer of last 10 keys
|
|
int32_t oldrs[10];
|
|
b = 0;
|
|
// rec to add
|
|
char *rec;
|
|
int32_t recSize;
|
|
int32_t maxRecSize = 40000000; // 40MB for termlists
|
|
int32_t numMisses = 0;
|
|
char *buf = (char *)mmalloc ( maxRecSize + 64 ,"cachetest" );
|
|
if ( ! buf ) return false;
|
|
// fill with random recs
|
|
for ( int32_t i = 0 ; i < numRecs ; i++ ) {
|
|
if ( (i % 100) == 0 )
|
|
logf(LOG_DEBUG,"test: Added %" PRId32" recs to cache. "
|
|
"Misses=%" PRId32".",i,numMisses);
|
|
// random key
|
|
key_t k ;
|
|
k.n1 = rand();
|
|
k.n0 = rand();
|
|
k.n0 <<= 32;
|
|
k.n0 |= rand();
|
|
// random size
|
|
recSize = rand()%maxRecSize;//100000;
|
|
// keep ring buffer
|
|
oldk [b] = k;
|
|
oldrs[b] = recSize;
|
|
if ( ++b >= 10 ) b = 0;
|
|
// make the rec
|
|
rec = buf;
|
|
memset ( rec , (char)k.n1, recSize );
|
|
// make rec,size, like dns, will be 4 byte hash and 4 byte key?
|
|
if ( ! c.addRecord((collnum_t)0,k,rec,recSize,timestamp) ) {
|
|
g_process.shutdownAbort(true); }
|
|
// do a dup add 1% of the time
|
|
if ( (i % 100) == 0 )
|
|
if(!c.addRecord((collnum_t)0,k,rec,recSize,timestamp)){
|
|
g_process.shutdownAbort(true); }
|
|
// reset g_errno in case it had an error (we don't care)
|
|
g_errno = 0;
|
|
// get a rec too!
|
|
if ( i < 10 ) continue;
|
|
int32_t next = b + 1;
|
|
if ( next >= 10 ) next = 0;
|
|
key_t back = oldk[next];
|
|
//log("cache: get rec");
|
|
if ( ! c.getRecord ( (collnum_t)0 ,
|
|
back ,
|
|
&rec ,
|
|
&recSize ,
|
|
false , // do copy?
|
|
-1 , // maxAge ,
|
|
true , // inc count?
|
|
NULL , // *cachedTime = NULL,
|
|
true) ) {//true )){ // promoteRecord?
|
|
numMisses++;
|
|
continue;
|
|
}
|
|
if ( recSize != oldrs[next] ) {
|
|
logf(LOG_DEBUG,"test: bad rec size.");
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
|
|
char r = (char)back.n1;
|
|
for ( int32_t j = 0 ; j < recSize ; j++ ) {
|
|
if ( rec[j] == r ) continue;
|
|
logf(LOG_DEBUG,"test: bad char in rec.");
|
|
g_process.shutdownAbort(true);
|
|
}
|
|
}
|
|
|
|
c.verify();
|
|
|
|
c.reset();
|
|
|
|
return true;
|
|
}
|
|
|
|
// Writes 1000 zero bytes at offset 0 of /dev/ram2 to check that the ramdisk
// device can be opened and written. Returns false if the open or the write
// fails, true otherwise.
bool ramdiskTest() {
	int fd = open ("/dev/ram2",O_RDWR);

	if ( fd < 0 ) {
		fprintf(stderr,"ramdisk: failed to open /dev/ram2\n");
		return false;
	}

	// . this was "char *buf[1000]": an uninitialized array of 1000
	//   POINTERS whose first 1000 bytes of stack garbage got written
	// . use a real, zeroed 1000 byte buffer instead
	char buf[1000] = { 0 };
	ssize_t written = pwrite ( fd , buf , sizeof(buf) , 0 );
	bool ok = ( written == (ssize_t)sizeof(buf) );
	if ( ! ok )
		fprintf(stderr,"ramdisk: failed to write to /dev/ram2\n");

	close ( fd);
	return ok;
}
|
|
|
|
// CountDomains Structures and function definitions
|
|
// Per-link record for countdomains(). NOTE(review): only referenced from
// commented-out code in the visible part of this file.
struct lnk_info {
	char    *dom;     // domain text
	int32_t  domLen;  // length of dom in bytes
	int32_t  pages;   // page count
};
|
|
|
|
// Everything countdomains() accumulates about one domain.
struct dom_info {
	char            *dom;        // copy of the domain text, domLen bytes, NOT NUL-terminated
	int32_t          domLen;     // number of bytes in dom
	int32_t          dHash;      // hash32() of the domain text; used as the lookup key
	int32_t          pages;      // title records seen for this domain
	struct ip_info **ip_list;    // IPs this domain was seen on
	int32_t          numIp;      // entries in ip_list
	int32_t         *lnk_table;  // hash32() of each distinct domain linked to
	int32_t          tableSize;  // capacity of lnk_table, in entries
	int32_t          lnkCnt;     // entries of lnk_table in use
	int32_t          lnkPages;   // bumped for every outlink, and again for each newly recorded domain
};
|
|
|
|
// Everything countdomains() accumulates about one IP address.
struct ip_info {
	uint32_t          ip;        // the address, as stored in the title record
	int32_t           pages;     // title records seen on this IP
	struct dom_info **dom_list;  // domains seen on this IP
	int32_t           numDom;    // entries in dom_list
};
|
|
|
|
// gbsort() comparators used by countdomains(). Each one is handed pointers to
// table slots; the definitions follow countdomains().
// ip_fcmp  : used for the "IPs Sorted By Pages" listing
// ip_dcmp  : used for the "IPs Sorted By Domains" listing
static int ip_fcmp  ( const void *p1 , const void *p2 );
static int ip_dcmp  ( const void *p1 , const void *p2 );

// dom_fcmp : used for the "Domains Sorted By Pages" listing
// dom_lcmp : used for the "Domains Sorted By Links" listing
static int dom_fcmp ( const void *p1 , const void *p2 );
static int dom_lcmp ( const void *p1 , const void *p2 );
|
|
|
|
void countdomains( const char* coll, int32_t numRecs, int32_t verbosity, int32_t output ) {
|
|
struct ip_info **ip_table;
|
|
struct dom_info **dom_table;
|
|
|
|
CollectionRec *cr = g_collectiondb.getRec(coll);
|
|
|
|
key_t startKey;
|
|
key_t endKey ;
|
|
key_t lastKey ;
|
|
startKey.setMin();
|
|
endKey.setMax();
|
|
lastKey.setMin();
|
|
|
|
g_titledb.init ();
|
|
g_titledb.getRdb()->addRdbBase1(coll );
|
|
|
|
log( LOG_INFO, "cntDm: parms: %s, %" PRId32, coll, numRecs );
|
|
int64_t time_start = gettimeofdayInMilliseconds();
|
|
|
|
// turn off threads
|
|
g_jobScheduler.disallow_new_jobs();
|
|
// get a meg at a time
|
|
int32_t minRecSizes = 1024*1024;
|
|
Msg5 msg5;
|
|
RdbList list;
|
|
int32_t countDocs = 0;
|
|
int32_t countIp = 0;
|
|
int32_t countDom = 0;
|
|
int32_t attempts = 0;
|
|
|
|
ip_table = (struct ip_info **)mmalloc(sizeof(struct ip_info *) * numRecs,
|
|
"main-dcit" );
|
|
dom_table = (struct dom_info **)mmalloc(sizeof(struct dom_info *) * numRecs,
|
|
"main-dcdt" );
|
|
|
|
for( int32_t i = 0; i < numRecs; i++ ) {
|
|
ip_table[i] = NULL;
|
|
dom_table[i] = NULL;
|
|
}
|
|
loop:
|
|
// use msg5 to get the list, should ALWAYS block since no threads
|
|
if ( ! msg5.getList ( RDB_TITLEDB ,
|
|
cr->m_collnum ,
|
|
&list ,
|
|
startKey ,
|
|
endKey ,
|
|
minRecSizes ,
|
|
true , // Do we need to include tree?
|
|
0 , // max cache age
|
|
0 ,
|
|
-1 ,
|
|
NULL , // state
|
|
NULL , // callback
|
|
0 , // niceness
|
|
false , // err correction?
|
|
NULL , // cache key ptr
|
|
0 , // retry num
|
|
-1 , // maxRetries
|
|
true , // compensate for merge
|
|
-1LL, // syncPoint
|
|
false, // isRealMerge
|
|
true)) // allowPageCache
|
|
{
|
|
log(LOG_LOGIC,"db: getList did not block.");
|
|
return;
|
|
}
|
|
// all done if empty
|
|
if ( list.isEmpty() ) goto freeInfo;
|
|
// loop over entries in list
|
|
for ( list.resetListPtr() ; ! list.isExhausted() ;
|
|
list.skipCurrentRecord() ) {
|
|
key_t k = list.getCurrentKey();
|
|
char *rec = list.getCurrentRec();
|
|
int32_t recSize = list.getCurrentRecSize();
|
|
int64_t docId = g_titledb.getDocId ( &k );
|
|
attempts++;
|
|
|
|
if ( k <= lastKey )
|
|
log("key out of order. "
|
|
"lastKey.n1=%" PRIx32" n0=%" PRIx64" "
|
|
"currKey.n1=%" PRIx32" n0=%" PRIx64" ",
|
|
lastKey.n1,lastKey.n0,
|
|
k.n1,k.n0);
|
|
lastKey = k;
|
|
// print deletes
|
|
if ( (k.n0 & 0x01) == 0) {
|
|
fprintf(stderr,"n1=%08" PRIx32" n0=%016" PRIx64" docId=%012" PRId64" "
|
|
"(del)\n",
|
|
k.n1 , k.n0 , docId );
|
|
continue;
|
|
}
|
|
|
|
if( (countIp >= numRecs) || (countDom >= numRecs) ) {
|
|
log( LOG_INFO, "cntDm: countIp | countDom, greater than"
|
|
"numRecs requested, should never happen!!!!" );
|
|
goto freeInfo;
|
|
}
|
|
|
|
XmlDoc xd;
|
|
if ( ! xd.set2 (rec, recSize, coll,NULL,0) )
|
|
continue;
|
|
|
|
struct ip_info *sipi ;
|
|
struct dom_info *sdomi;
|
|
|
|
int32_t i;
|
|
for( i = 0; i < countIp; i++ ) {
|
|
if( !ip_table[i] ) continue;
|
|
sipi = (struct ip_info *)ip_table[i];
|
|
if( sipi->ip == (uint32_t)xd.m_ip ) break;
|
|
}
|
|
|
|
if( i == countIp ) {
|
|
sipi = (struct ip_info *)mmalloc(sizeof(struct ip_info),
|
|
"main-dcip" );
|
|
if( !sipi ) { g_process.shutdownAbort(true); }
|
|
ip_table[countIp++] = sipi;
|
|
sipi->ip = xd.m_ip;//u->getIp();
|
|
sipi->pages = 1;
|
|
sipi->numDom = 0;
|
|
} else {
|
|
sipi->pages++;
|
|
}
|
|
|
|
char *fu = xd.ptr_firstUrl;
|
|
int32_t dlen;
|
|
const char *dom = getDomFast ( fu , &dlen );
|
|
int32_t dkey = hash32( dom , dlen );
|
|
|
|
for( i = 0; i < countDom; i++ ) {
|
|
if( !dom_table[i] ) continue;
|
|
sdomi = (struct dom_info *)dom_table[i];
|
|
if( sdomi->dHash == dkey ) break;
|
|
}
|
|
|
|
if( i == countDom ) {
|
|
sdomi =(struct dom_info*)mmalloc(sizeof(struct dom_info),
|
|
"main-dcdm" );
|
|
if( !sdomi ) { g_process.shutdownAbort(true); }
|
|
dom_table[countDom++] = sdomi;
|
|
sdomi->dom = (char *)mmalloc( dlen,"main-dcsdm" );
|
|
|
|
strncpy( sdomi->dom, dom , dlen );
|
|
sdomi->domLen = dlen;
|
|
sdomi->dHash = dkey;
|
|
sdomi->pages = 1;
|
|
sdomi->numIp = 0;
|
|
|
|
sdomi->tableSize = 0;
|
|
sdomi->lnkCnt = 0;
|
|
}
|
|
else {
|
|
sdomi->pages++;
|
|
}
|
|
|
|
Links *dlinks = xd.getLinks();
|
|
|
|
int32_t size = dlinks->getNumLinks();
|
|
if( !sdomi->tableSize ) {
|
|
sdomi->lnk_table = (int32_t *)mmalloc(size * sizeof(int32_t),
|
|
"main-dclt" );
|
|
sdomi->tableSize = size;
|
|
}
|
|
else {
|
|
if( size > (sdomi->tableSize - sdomi->lnkCnt) ) {
|
|
size += sdomi->lnkCnt;
|
|
sdomi->lnk_table = (int32_t *)
|
|
mrealloc(sdomi->lnk_table,
|
|
sdomi->tableSize*sizeof(int32_t),
|
|
size*sizeof(int32_t),
|
|
"main-dcrlt" );
|
|
sdomi->tableSize = size;
|
|
}
|
|
}
|
|
|
|
for( int32_t i = 0; i < dlinks->getNumLinks(); i++ ) {
|
|
//struct lnk_info *slink;
|
|
char *link = dlinks->getLink(i);
|
|
int32_t dlen;
|
|
const char *dom = getDomFast ( link , &dlen );
|
|
uint32_t lkey = hash32( dom , dlen );
|
|
int32_t j;
|
|
for( j = 0; j < sdomi->lnkCnt; j++ ) {
|
|
if( sdomi->lnk_table[j] == (int32_t)lkey ) break;
|
|
}
|
|
|
|
sdomi->lnkPages++;
|
|
if( j != sdomi->lnkCnt ) continue;
|
|
sdomi->lnk_table[sdomi->lnkCnt++] = lkey;
|
|
sdomi->lnkPages++;
|
|
}
|
|
|
|
// Handle lists
|
|
if( !sipi->numDom || !sdomi->numIp ){
|
|
sdomi->numIp++; sipi->numDom++;
|
|
//Add to IP list for Domain
|
|
sdomi->ip_list = (struct ip_info **)
|
|
mrealloc( sdomi->ip_list,
|
|
(sdomi->numIp-1)*sizeof(char *),
|
|
sdomi->numIp*sizeof(char *),
|
|
"main-dcldm" );
|
|
sdomi->ip_list[sdomi->numIp-1] = sipi;
|
|
|
|
//Add to domain list for IP
|
|
sipi->dom_list = (struct dom_info **)
|
|
mrealloc( sipi->dom_list,
|
|
(sipi->numDom-1)*sizeof(char *),
|
|
sipi->numDom*sizeof(char *),
|
|
"main-dclip" );
|
|
sipi->dom_list[sipi->numDom-1] = sdomi;
|
|
}
|
|
else {
|
|
int32_t i;
|
|
for( i = 0;
|
|
(i < sdomi->numIp)
|
|
&& (sdomi->ip_list[i] != sipi);
|
|
i++ );
|
|
if( sdomi->numIp != i ) goto updateIp;
|
|
|
|
sdomi->numIp++;
|
|
sdomi->ip_list = (struct ip_info **)
|
|
mrealloc( sdomi->ip_list,
|
|
(sdomi->numIp-1)*sizeof(int32_t),
|
|
sdomi->numIp*sizeof(int32_t),
|
|
"main-dcldm" );
|
|
sdomi->ip_list[sdomi->numIp-1] = sipi;
|
|
|
|
updateIp:
|
|
for( i = 0;
|
|
(i < sipi->numDom)
|
|
&& (sipi->dom_list[i] != sdomi);
|
|
i++ );
|
|
if( sipi->numDom != i ) goto endListUpdate;
|
|
|
|
sipi->numDom++;
|
|
sipi->dom_list = (struct dom_info **)
|
|
mrealloc( sipi->dom_list,
|
|
(sipi->numDom-1)*sizeof(int32_t),
|
|
sipi->numDom*sizeof(int32_t),
|
|
"main-dclip" );
|
|
sipi->dom_list[sipi->numDom-1] = sdomi;
|
|
|
|
endListUpdate:
|
|
i=0;
|
|
}
|
|
if( !((++countDocs) % 1000) )
|
|
log(LOG_INFO, "cntDm: %" PRId32" records searched.",countDocs);
|
|
if( countDocs == numRecs ) goto freeInfo;
|
|
//else countDocs++;
|
|
}
|
|
startKey = *(key_t *)list.getLastKey();
|
|
startKey += (uint32_t) 1;
|
|
// watch out for wrap around
|
|
if ( startKey < *(key_t *)list.getLastKey() ) {
|
|
log( LOG_INFO, "cntDm: Keys wrapped around! Exiting." );
|
|
goto freeInfo;
|
|
}
|
|
|
|
if ( countDocs >= numRecs ) {
|
|
freeInfo:
|
|
char buf[128];
|
|
//int32_t value ;
|
|
int32_t len ;
|
|
char loop ;
|
|
int32_t recsDisp;
|
|
struct ip_info *tmpipi ;
|
|
struct dom_info *tmpdomi ;
|
|
//struct lnk_info *tmplnk ;
|
|
loop = 0;
|
|
|
|
FILE *fhndl;
|
|
char out[128];
|
|
if( output != 9 ) goto printHtml;
|
|
// Dump raw data to a file to parse later
|
|
sprintf( out, "%scntdom.xml", g_hostdb.m_dir );
|
|
if( !(fhndl = fopen( out, "wb" )) ) {
|
|
log( LOG_INFO, "cntDm: File Open Failed." );
|
|
return;
|
|
}
|
|
|
|
gbsort( dom_table, countDom, sizeof(struct dom_info *), dom_fcmp );
|
|
for( int32_t i = 0; i < countDom; i++ ) {
|
|
if( !dom_table[i] ) continue;
|
|
tmpdomi = (struct dom_info *)dom_table[i];
|
|
len = tmpdomi->domLen;
|
|
if( tmpdomi->domLen > 127 ) len = 126;
|
|
strncpy( buf, tmpdomi->dom, len );
|
|
buf[len] = '\0';
|
|
fprintf(fhndl,
|
|
"<rec1>\n\t<domain>%s</domain>\n"
|
|
"\t<pages>%" PRId32"</pages>\n"
|
|
//"\t<quality>%" PRId64"</quality>\n"
|
|
"\t<block>\n",
|
|
buf, tmpdomi->pages
|
|
//,(tmpdomi->quality/tmpdomi->pages)
|
|
);
|
|
gbsort( tmpdomi->ip_list,tmpdomi->numIp, sizeof(int32_t),
|
|
ip_fcmp );
|
|
for( int32_t j = 0; j < tmpdomi->numIp; j++ ) {
|
|
if( !tmpdomi->ip_list[j] ) continue;
|
|
tmpipi = (struct ip_info *)tmpdomi->ip_list[j];
|
|
strcpy ( buf , iptoa( tmpipi->ip ) );
|
|
fprintf(fhndl,"\t\t<ip>%s</ip>\n",buf);
|
|
}
|
|
fprintf(fhndl,
|
|
"\t</block>\n"
|
|
"\t<links>\n");
|
|
}
|
|
gbsort( ip_table, countIp, sizeof(struct ip_info *), ip_fcmp );
|
|
for( int32_t i = 0; i < countIp; i++ ) {
|
|
if( !ip_table[i] ) continue;
|
|
tmpipi = (struct ip_info *)ip_table[i];
|
|
strcpy ( buf , iptoa( tmpipi->ip ) );
|
|
fprintf(fhndl,
|
|
"<rec2>\n\t<ip>%s</ip>\n"
|
|
"\t<pages>%" PRId32"</pages>\n"
|
|
"\t<block>\n",
|
|
buf, tmpipi->pages);
|
|
for( int32_t j = 0; j < tmpipi->numDom; j++ ) {
|
|
tmpdomi = (struct dom_info *)tmpipi->dom_list[j];
|
|
len = tmpdomi->domLen;
|
|
if( tmpdomi->domLen > 127 ) len = 126;
|
|
strncpy( buf, tmpdomi->dom, len );
|
|
buf[len] = '\0';
|
|
fprintf(fhndl,
|
|
"\t\t<domain>%s</domain>\n",
|
|
buf);
|
|
}
|
|
fprintf(fhndl,
|
|
"\t</block>\n"
|
|
"</rec2>\n");
|
|
}
|
|
|
|
if( fclose( fhndl ) < 0 ) {
|
|
log( LOG_INFO, "cntDm: File Close Failed." );
|
|
return;
|
|
}
|
|
fhndl = 0;
|
|
|
|
printHtml:
|
|
// HTML file Output
|
|
sprintf( out, "%scntdom.html", g_hostdb.m_dir );
|
|
if( !(fhndl = fopen( out, "wb" )) ) {
|
|
log( LOG_INFO, "cntDm: File Open Failed." );
|
|
return;
|
|
}
|
|
int64_t total = g_titledb.getGlobalNumDocs();
|
|
char link_ip[] = "http://www.gigablast.com/search?"
|
|
"code=gbmonitor&q=ip%3A";
|
|
char link_dom[] = "http://www.gigablast.com/search?"
|
|
"code=gbmonitor&q=site%3A";
|
|
char menu[] = "<table cellpadding=\"2\" cellspacing=\"2\">\n<tr>"
|
|
"<th bgcolor=\"#CCCC66\"><a href=\"#pid\">"
|
|
"Domains Sorted By Pages</a></th>"
|
|
"<th bgcolor=\"#CCCC66\"><a href=\"#lid\">"
|
|
"Domains Sorted By Links</a></th>"
|
|
"<th bgcolor=\"#CCCC66\"><a href=\"#pii\">"
|
|
"IPs Sorted By Pages</a></th>"
|
|
"<th bgcolor=\"#CCCC66\"><a href=\"#dii\">"
|
|
"IPs Sorted By Domains</a></th>"
|
|
"<th bgcolor=\"#CCCC66\"><a href=\"#stats\">"
|
|
"Stats</a></th>"
|
|
"</tr>\n</table>\n<br>\n";
|
|
|
|
char hdr[] = "<table cellpadding=\"5\" cellspacing=\"2\">"
|
|
"<tr bgcolor=\"AAAAAA\">"
|
|
"<th>Domain</th>"
|
|
"<th>Domains Linked</th>"
|
|
//"<th>Avg Quality</th>"
|
|
"<th># Pages</th>"
|
|
"<th>Extrap # Pages</th>"
|
|
"<th>IP</th>"
|
|
"</tr>\n";
|
|
|
|
char hdr2[] = "<table cellpadding=\"5\" cellspacing=\"2\">"
|
|
"<tr bgcolor=\"AAAAAA\">"
|
|
"<th>IP</th>"
|
|
"<th>Domain</th>"
|
|
"<th>Domains Linked</th>"
|
|
//"<th>Avg Quality</th>"
|
|
"<th># Pages</th>"
|
|
"<th>Extrap # Pages</th>"
|
|
"</tr>\n";
|
|
|
|
char clr1[] = "#FFFF00";//"yellow";
|
|
char clr2[] = "#FFFF66";//"orange";
|
|
char *color;
|
|
|
|
fprintf( fhndl,
|
|
"<html><head><title>Domain/IP Counter</title></head>\n"
|
|
"<body>"
|
|
"<h1>Domain/IP Counter</h1><br><br>"
|
|
"<a name=\"stats\">"
|
|
"<h2>Stats</h2>\n%s", menu );
|
|
|
|
// Stats
|
|
fprintf( fhndl, "<br>\n\n<table>\n"
|
|
"<tr><th align=\"left\">Total Number of Domains</th>"
|
|
"<td>%" PRId32"</td></tr>\n"
|
|
"<tr><th align=\"left\">Total Number of Ips</th>"
|
|
"<td>%" PRId32"</td></tr>\n"
|
|
"<tr><th align=\"left\">Number of Documents Searched"
|
|
"</th><td>%" PRId32"</td></tr>\n"
|
|
"<tr><th align=\"left\">Number of Failed Attempts</th>"
|
|
"<td>%" PRId32"</td></tr><tr></tr><tr>\n"
|
|
"<tr><th align=\"left\">Number of Documents in Index"
|
|
"</th><td>%" PRId64"</td></tr>\n"
|
|
"<tr><th align=\"left\">Estimated Domains in index</th>"
|
|
"<td>%" PRId64"</td></tr>"
|
|
"</table><br><br><br>\n"
|
|
,countDom,countIp,
|
|
countDocs, attempts-countDocs,total,
|
|
((countDom*total)/countDocs) );
|
|
|
|
|
|
fprintf( fhndl, "<a name=\"pid\">\n"
|
|
"<h2>Domains Sorted By Pages</h2>\n"
|
|
"%s", menu );
|
|
gbsort( dom_table, countDom, sizeof(struct dom_info *), dom_fcmp );
|
|
printDomLp:
|
|
|
|
fprintf( fhndl,"%s", hdr );
|
|
recsDisp = countDom;
|
|
if( countDom > 1000 ) recsDisp = 1000;
|
|
for( int32_t i = 0; i < recsDisp; i++ ) {
|
|
char buf[128];
|
|
int32_t len;
|
|
if( !dom_table[i] ) continue;
|
|
if( i%2 ) color = clr2;
|
|
else color = clr1;
|
|
tmpdomi = (struct dom_info *)dom_table[i];
|
|
len = tmpdomi->domLen;
|
|
if( tmpdomi->domLen > 127 ) len = 126;
|
|
strncpy( buf, tmpdomi->dom, len );
|
|
buf[len] = '\0';
|
|
fprintf( fhndl, "<tr bgcolor=\"%s\"><td>"
|
|
"<a href=\"%s%s\" target=\"_blank\">%s</a>"
|
|
"</td><td>%" PRId32"</td>"
|
|
//"<td>%" PRId64"</td>"
|
|
"<td>%" PRId32"</td>"
|
|
"<td>%" PRId64"</td><td>",
|
|
color, link_dom,
|
|
buf, buf, tmpdomi->lnkCnt,
|
|
//(tmpdomi->quality/tmpdomi->pages),
|
|
tmpdomi->pages,
|
|
((tmpdomi->pages*total)/countDocs) );
|
|
for( int32_t j = 0; j < tmpdomi->numIp; j++ ) {
|
|
tmpipi = (struct ip_info *)tmpdomi->ip_list[j];
|
|
strcpy ( buf , iptoa(tmpipi->ip) );
|
|
fprintf( fhndl, "<a href=\"%s%s\""
|
|
"target=\"_blank\">%s</a>\n",
|
|
link_ip, buf, buf );
|
|
}
|
|
fprintf( fhndl, "</td></tr>\n" );
|
|
fprintf( fhndl, "\n" );
|
|
}
|
|
|
|
fprintf( fhndl, "</table>\n<br><br><br>" );
|
|
if( loop == 0 ) {
|
|
loop = 1;
|
|
gbsort( dom_table, countDom, sizeof(struct dom_info *), dom_lcmp );
|
|
fprintf( fhndl, "<a name=\"lid\">"
|
|
"<h2>Domains Sorted By Links</h2>\n%s", menu );
|
|
|
|
goto printDomLp;
|
|
}
|
|
loop = 0;
|
|
|
|
fprintf( fhndl, "<a name=\"pii\">"
|
|
"<h2>IPs Sorted By Pages</h2>\n%s", menu );
|
|
|
|
|
|
gbsort( ip_table, countIp, sizeof(struct ip_info *), ip_fcmp );
|
|
printIpLp:
|
|
fprintf( fhndl,"%s", hdr2 );
|
|
recsDisp = countIp;
|
|
if( countIp > 1000 ) recsDisp = 1000;
|
|
for( int32_t i = 0; i < recsDisp; i++ ) {
|
|
char buf[128];
|
|
if( !ip_table[i] ) continue;
|
|
tmpipi = (struct ip_info *)ip_table[i];
|
|
strcpy ( buf , iptoa(tmpipi->ip) );
|
|
if( i%2 ) color = clr2;
|
|
else color = clr1;
|
|
int32_t linked = 0;
|
|
for( int32_t j = 0; j < tmpipi->numDom; j++ ) {
|
|
tmpdomi=(struct dom_info *)tmpipi->dom_list[j];
|
|
linked += tmpdomi->lnkCnt;
|
|
}
|
|
fprintf( fhndl, "\t<tr bgcolor=\"%s\"><td>"
|
|
"<a href=\"%s%s\" target=\"_blank\">%s</a>"
|
|
"</td>"
|
|
"<td>%" PRId32"</td>"
|
|
"<td>%" PRId32"</td>"
|
|
//"<td>%" PRId64"</td>"
|
|
"<td>%" PRId32"</td>"
|
|
"<td>%" PRId64"</td></tr>\n",
|
|
color,
|
|
link_ip, buf, buf, tmpipi->numDom, linked,
|
|
//(tmpipi->quality/tmpipi->pages),
|
|
tmpipi->pages,
|
|
((tmpipi->pages*total)/countDocs) );
|
|
fprintf( fhndl, "\n" );
|
|
}
|
|
|
|
fprintf( fhndl, "</table>\n<br><br><br>" );
|
|
if( loop == 0 ) {
|
|
loop = 1;
|
|
gbsort( ip_table, countIp, sizeof(struct ip_info *), ip_dcmp );
|
|
fprintf( fhndl, "<a name=\"dii\">"
|
|
"<h2>IPs Sorted By Domains</h2>\n%s", menu );
|
|
goto printIpLp;
|
|
}
|
|
|
|
if( fclose( fhndl ) < 0 ) {
|
|
log( LOG_INFO, "cntDm: File Close Failed." );
|
|
return;
|
|
}
|
|
fhndl = 0;
|
|
|
|
|
|
int32_t ima = 0;
|
|
int32_t dma = 0;
|
|
|
|
log( LOG_INFO, "cntDm: Freeing ip info struct..." );
|
|
for( int32_t i = 0; i < countIp; i++ ) {
|
|
if( !ip_table[i] ) continue;
|
|
//value = ipHT.getValue( ip_table[i] );
|
|
//if(value == 0) continue;
|
|
tmpipi = (struct ip_info *)ip_table[i];
|
|
mfree( tmpipi->dom_list, tmpipi->numDom*sizeof(tmpipi->dom_list[0]),
|
|
"main-dcflip" );
|
|
ima += tmpipi->numDom * sizeof(int32_t);
|
|
mfree( tmpipi, sizeof(struct ip_info), "main-dcfip" );
|
|
ima += sizeof(struct ip_info);
|
|
tmpipi = NULL;
|
|
}
|
|
mfree( ip_table, numRecs * sizeof(struct ip_info *), "main-dcfit" );
|
|
|
|
log( LOG_INFO, "cntDm: Freeing domain info struct..." );
|
|
for( int32_t i = 0; i < countDom; i++ ) {
|
|
if( !dom_table[i] ) continue;
|
|
tmpdomi = (struct dom_info *)dom_table[i];
|
|
mfree( tmpdomi->lnk_table,
|
|
tmpdomi->tableSize*sizeof(int32_t),
|
|
"main-dcfsdlt" );
|
|
dma += tmpdomi->tableSize * sizeof(int32_t);
|
|
mfree( tmpdomi->ip_list, tmpdomi->numIp*sizeof(tmpdomi->ip_list[0]),
|
|
"main-dcfldom" );
|
|
dma += tmpdomi->numIp * sizeof(int32_t);
|
|
mfree( tmpdomi->dom, tmpdomi->domLen, "main-dcfsdom" );
|
|
dma += tmpdomi->domLen;
|
|
mfree( tmpdomi, sizeof(struct dom_info), "main-dcfdom" );
|
|
dma+= sizeof(struct dom_info);
|
|
tmpdomi = NULL;
|
|
}
|
|
|
|
mfree( dom_table, numRecs * sizeof(struct dom_info *), "main-dcfdt" );
|
|
|
|
int64_t time_end = gettimeofdayInMilliseconds();
|
|
log( LOG_INFO, "cntDm: Took %" PRId64"ms to count domains in %" PRId32" recs.",
|
|
time_end-time_start, countDocs );
|
|
log( LOG_INFO, "cntDm: %" PRId32" bytes of Total Memory Used.",
|
|
ima + dma + (8 * numRecs) );
|
|
log( LOG_INFO, "cntDm: %" PRId32" bytes Total for IP.", ima );
|
|
log( LOG_INFO, "cntDm: %" PRId32" bytes Total for Dom.", dma );
|
|
log( LOG_INFO, "cntDm: %" PRId32" bytes Average for IP.", ima/countIp );
|
|
log( LOG_INFO, "cntDm: %" PRId32" bytes Average for Dom.", dma/countDom );
|
|
|
|
return;
|
|
}
|
|
goto loop;
|
|
}
|
|
|
|
// Sort by IP frequency in pages 9->0
|
|
int ip_fcmp (const void *p1, const void *p2) {
|
|
//int32_t n1, n2;
|
|
// break this! need to fix later MDW 11/12/14
|
|
char *n1 ;
|
|
char *n2 ;
|
|
struct ip_info *ii1;
|
|
struct ip_info *ii2;
|
|
|
|
*(((unsigned char *)(&n1))+0) = *(((char *)p1)+0);
|
|
*(((unsigned char *)(&n1))+1) = *(((char *)p1)+1);
|
|
*(((unsigned char *)(&n1))+2) = *(((char *)p1)+2);
|
|
*(((unsigned char *)(&n1))+3) = *(((char *)p1)+3);
|
|
|
|
*(((unsigned char *)(&n2))+0) = *(((char *)p2)+0);
|
|
*(((unsigned char *)(&n2))+1) = *(((char *)p2)+1);
|
|
*(((unsigned char *)(&n2))+2) = *(((char *)p2)+2);
|
|
*(((unsigned char *)(&n2))+3) = *(((char *)p2)+3);
|
|
|
|
ii1 = (struct ip_info *)n1;
|
|
ii2 = (struct ip_info *)n2;
|
|
|
|
return ii2->pages-ii1->pages;
|
|
}
|
|
|
|
// Sort by number of domains linked to IP, descending
|
|
int ip_dcmp (const void *p1, const void *p2) {
|
|
//int32_t n1, n2;
|
|
// break this! need to fix later MDW 11/12/14
|
|
char *n1 ;
|
|
char *n2 ;
|
|
|
|
struct ip_info *ii1;
|
|
struct ip_info *ii2;
|
|
|
|
*(((unsigned char *)(&n1))+0) = *(((char *)p1)+0);
|
|
*(((unsigned char *)(&n1))+1) = *(((char *)p1)+1);
|
|
*(((unsigned char *)(&n1))+2) = *(((char *)p1)+2);
|
|
*(((unsigned char *)(&n1))+3) = *(((char *)p1)+3);
|
|
|
|
*(((unsigned char *)(&n2))+0) = *(((char *)p2)+0);
|
|
*(((unsigned char *)(&n2))+1) = *(((char *)p2)+1);
|
|
*(((unsigned char *)(&n2))+2) = *(((char *)p2)+2);
|
|
*(((unsigned char *)(&n2))+3) = *(((char *)p2)+3);
|
|
|
|
ii1 = (struct ip_info *)n1;
|
|
ii2 = (struct ip_info *)n2;
|
|
|
|
return ii2->numDom-ii1->numDom;
|
|
}
|
|
|
|
// Sort by page frequency in titlerec 9->0
|
|
int dom_fcmp (const void *p1, const void *p2) {
|
|
//int32_t n1, n2;
|
|
// break this! need to fix later MDW 11/12/14
|
|
char *n1 ;
|
|
char *n2 ;
|
|
struct dom_info *di1;
|
|
struct dom_info *di2;
|
|
|
|
*(((unsigned char *)(&n1))+0) = *(((char *)p1)+0);
|
|
*(((unsigned char *)(&n1))+1) = *(((char *)p1)+1);
|
|
*(((unsigned char *)(&n1))+2) = *(((char *)p1)+2);
|
|
*(((unsigned char *)(&n1))+3) = *(((char *)p1)+3);
|
|
|
|
*(((unsigned char *)(&n2))+0) = *(((char *)p2)+0);
|
|
*(((unsigned char *)(&n2))+1) = *(((char *)p2)+1);
|
|
*(((unsigned char *)(&n2))+2) = *(((char *)p2)+2);
|
|
*(((unsigned char *)(&n2))+3) = *(((char *)p2)+3);
|
|
|
|
|
|
di1 = (struct dom_info *)n1;
|
|
di2 = (struct dom_info *)n2;
|
|
|
|
return di2->pages-di1->pages;
|
|
}
|
|
|
|
// Sort by quantity of outgoing links 9-0
|
|
int dom_lcmp (const void *p1, const void *p2) {
|
|
//int32_t n1, n2;
|
|
// break this! need to fix later MDW 11/12/14
|
|
char *n1 ;
|
|
char *n2 ;
|
|
struct dom_info *di1;
|
|
struct dom_info *di2;
|
|
|
|
*(((unsigned char *)(&n1))+0) = *(((char *)p1)+0);
|
|
*(((unsigned char *)(&n1))+1) = *(((char *)p1)+1);
|
|
*(((unsigned char *)(&n1))+2) = *(((char *)p1)+2);
|
|
*(((unsigned char *)(&n1))+3) = *(((char *)p1)+3);
|
|
|
|
*(((unsigned char *)(&n2))+0) = *(((char *)p2)+0);
|
|
*(((unsigned char *)(&n2))+1) = *(((char *)p2)+1);
|
|
*(((unsigned char *)(&n2))+2) = *(((char *)p2)+2);
|
|
*(((unsigned char *)(&n2))+3) = *(((char *)p2)+3);
|
|
|
|
|
|
di1 = (struct dom_info *)n1;
|
|
di2 = (struct dom_info *)n2;
|
|
|
|
return di2->lnkCnt-di1->lnkCnt;
|
|
}
|
|
|
|
// generate the copies that need to be done to scale from oldhosts.conf
|
|
// to newhosts.conf topology.
|
|
int collinject ( char *newHostsConf ) {
|
|
|
|
g_hostdb.resetPortTables();
|
|
|
|
Hostdb hdb;
|
|
//if ( ! hdb.init(newHostsConf, 0/*assume we're zero*/) ) {
|
|
if ( ! hdb.init( 0/*assume we're zero*/) ) {
|
|
log("collinject failed. Could not init hostdb with %s",
|
|
newHostsConf);
|
|
return -1;
|
|
}
|
|
|
|
// ptrs to the two hostdb's
|
|
Hostdb *hdb1 = &g_hostdb;
|
|
Hostdb *hdb2 = &hdb;
|
|
|
|
if ( hdb1->getNumHosts() != hdb2->getNumHosts() ) {
|
|
log("collinject: num hosts differ!");
|
|
return -1;
|
|
}
|
|
|
|
// . ensure old hosts in g_hostdb are in a derivate groupId in
|
|
// newHostsConf
|
|
// . old hosts may not even be present! consider them the same host,
|
|
// though, if have same ip and working dir, because that would
|
|
// interfere with a file copy.
|
|
for ( int32_t i = 0 ; i < hdb1->m_numShards ; i++ ) {
|
|
//Host *h1 = &hdb1->getHost(i);//m_hosts[i];
|
|
//int32_t gid = hdb1->getGroupId ( i ); // groupNum
|
|
uint32_t shardNum = (uint32_t)i;
|
|
|
|
Host *h1 = hdb1->getShard ( shardNum );
|
|
Host *h2 = hdb2->getShard ( shardNum );
|
|
|
|
printf("ssh %s 'nohup /w/gbi -w /w/ inject titledb "
|
|
"%s:%" PRId32" >& /w/ilog' &\n"
|
|
, h1->m_hostname
|
|
, iptoa(h2->m_ip)
|
|
//, h2->m_hostname
|
|
, (int32_t)h2->m_httpPort
|
|
);
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
const char *getcwd2 ( char *arg2 ) {
|
|
char argBuf[1026];
|
|
char *arg = argBuf;
|
|
|
|
//
|
|
// arg2 examples:
|
|
// ./gb
|
|
// /bin/gb (symlink to ../../var/gigablast/data0/gb)
|
|
// /usr/bin/gb (symlink to ../../var/gigablast/data0/gb)
|
|
//
|
|
|
|
//
|
|
// if it is a symbolic link...
|
|
// get real path (no symlinks symbolic links)
|
|
char tmp[1026];
|
|
int32_t tlen = readlink ( arg2 , tmp , 1020 );
|
|
// if we got the actual path, copy that over
|
|
if ( tlen != -1 ) {
|
|
//fprintf(stderr,"tmp=%s\n",tmp);
|
|
// if symbolic link is relative...
|
|
if ( tmp[0]=='.' && tmp[1]=='.') {
|
|
// store original path (/bin/gb --> ../../var/gigablast/data/gb)
|
|
strcpy(arg,arg2); // /bin/gb
|
|
// back up to /
|
|
while(arg[strlen(arg)-1] != '/' ) arg[strlen(arg)-1] = '\0';
|
|
int32_t len2 = strlen(arg);
|
|
strcpy(arg+len2,tmp);
|
|
}
|
|
else {
|
|
strcpy(arg,tmp);
|
|
}
|
|
}
|
|
else {
|
|
strcpy(arg,arg2);
|
|
}
|
|
|
|
again:
|
|
// now remove ..'s from path
|
|
char *p = arg;
|
|
// char *start = arg;
|
|
for ( ; *p ; p++ ) {
|
|
if (p[0] != '.' || p[1] !='.' ) continue;
|
|
// if .. is at start of string
|
|
if ( p == arg ) {
|
|
gbmemcpy ( arg , p+2,strlen(p+2)+1);
|
|
goto again;
|
|
}
|
|
// find previous /
|
|
char *slash = p-1;
|
|
if ( *slash !='/' ) { g_process.shutdownAbort(true); }
|
|
slash--;
|
|
for ( ; slash > arg && *slash != '/' ; slash-- );
|
|
if ( slash<arg) slash=arg;
|
|
gbmemcpy(slash,p+2,strlen(p+2)+1);
|
|
goto again;
|
|
// if can't back up anymore...
|
|
}
|
|
|
|
char *a = arg;
|
|
|
|
// remove "gb" from the end
|
|
int32_t alen = 0;
|
|
for ( ; *a ; a++ ) {
|
|
if ( *a != '/' ) continue;
|
|
alen = a - arg + 1;
|
|
}
|
|
if ( alen > 512 ) {
|
|
log("db: path is too long");
|
|
g_errno = EBADENGINEER;
|
|
return NULL;
|
|
}
|
|
// hack off the "gb" (seems to hack off the "/gb")
|
|
//*a = '\0';
|
|
// don't hack off the "/gb" just the "gb"
|
|
arg[alen] = '\0';
|
|
|
|
// get cwd which is only relevant to us if arg starts
|
|
// with . at this point
|
|
static char s_cwdBuf[1025];
|
|
getcwd ( s_cwdBuf , 1020 );
|
|
char *end = s_cwdBuf + strlen(s_cwdBuf);
|
|
// make sure that shit ends in /
|
|
if ( s_cwdBuf[strlen(s_cwdBuf)-1] != '/' ) {
|
|
int32_t len = strlen(s_cwdBuf);
|
|
s_cwdBuf[len] = '/';
|
|
s_cwdBuf[len+1] = '\0';
|
|
end++;
|
|
}
|
|
|
|
// if "arg" is a RELATIVE path then append it
|
|
if ( arg && arg[0]!='/' ) {
|
|
if ( arg[0]=='.' && arg[1]=='/' ) {
|
|
gbmemcpy ( end , arg+2 , alen -2 );
|
|
end += alen - 2;
|
|
}
|
|
else {
|
|
gbmemcpy ( end , arg , alen );
|
|
end += alen;
|
|
}
|
|
*end = '\0';
|
|
}
|
|
// if our path started with / then it was absolute...
|
|
else {
|
|
strncpy(s_cwdBuf,arg,alen);
|
|
s_cwdBuf[alen]='\0';
|
|
}
|
|
|
|
// make sure it ends in / for consistency
|
|
int32_t clen = strlen(s_cwdBuf);
|
|
if ( s_cwdBuf[clen-1] != '/' ) {
|
|
s_cwdBuf[clen-1] = '/';
|
|
s_cwdBuf[clen] = '\0';
|
|
clen--;
|
|
}
|
|
|
|
// ensure 'gb' binary exists in that dir.
|
|
// binaryCmd is usually gb but use this just in case
|
|
char *binaryCmd = arg2 + strlen(arg2) - 1;
|
|
for ( ; binaryCmd[-1] && binaryCmd[-1] != '/' ; binaryCmd-- );
|
|
File fff;
|
|
fff.set (s_cwdBuf,binaryCmd);
|
|
|
|
// assume it is in the usual spot
|
|
if ( fff.doesExist() ) return s_cwdBuf;
|
|
|
|
// try just "gb" as binary
|
|
fff.set(s_cwdBuf,"gb");
|
|
if ( fff.doesExist() ) return s_cwdBuf;
|
|
|
|
// if nothing is found resort to the default location
|
|
return "/var/gigablast/data0/";
|
|
}
|
|
|
|
///////
|
|
//
|
|
// used to make package to install files for the package
|
|
//
|
|
///////
|
|
int copyFiles ( const char *dstDir ) {
|
|
|
|
const char *srcDir = "./";
|
|
SafeBuf fileListBuf;
|
|
g_process.getFilesToCopy ( srcDir , &fileListBuf );
|
|
|
|
SafeBuf tmp;
|
|
tmp.safePrintf(
|
|
"cp -r %s %s"
|
|
, fileListBuf.getBufStart()
|
|
, dstDir
|
|
);
|
|
|
|
//log(LOG_INIT,"admin: %s", tmp.getBufStart());
|
|
fprintf(stderr,"\nRunning cmd: %s\n",tmp.getBufStart());
|
|
system ( tmp.getBufStart() );
|
|
return 0;
|
|
}
|
|
|
|
void rmTest() {
|
|
|
|
// make five files
|
|
int32_t max = 100;
|
|
|
|
for ( int32_t i = 0 ; i < max ; i++ ) {
|
|
SafeBuf fn;
|
|
fn.safePrintf("./tmpfile%" PRId32,i);
|
|
SafeBuf sb;
|
|
for ( int32_t j = 0 ; j < 100 ; j++ ) {
|
|
sb.safePrintf("%" PRId32"\n",(int32_t)rand());
|
|
}
|
|
sb.save ( fn.getBufStart() );
|
|
}
|
|
|
|
// now delete
|
|
fprintf(stderr,"Deleting files\n");
|
|
int64_t now = gettimeofdayInMilliseconds();
|
|
|
|
for ( int32_t i = 0 ; i < max ; i++ ) {
|
|
SafeBuf fn;
|
|
fn.safePrintf("./tmpfile%" PRId32,i);
|
|
File f;
|
|
f.set ( fn.getBufStart() );
|
|
f.unlink();
|
|
}
|
|
|
|
int64_t took = gettimeofdayInMilliseconds() - now;
|
|
|
|
fprintf(stderr,"Deleting files took %" PRId64" ms\n",took);
|
|
|
|
}
|