mirror of
synced 2025-03-13 09:21:11 -04:00
5566 lines
155 KiB
5566 lines
155 KiB
// Matt Wells, copyright Sep 2001
#include <sched.h> // clone()
// declare this stuff up here so we can call pread() in our seek test below
// maybe we should put this in a common header file so we don't have
// certain files compiled with the platform default, and some not -partap
#include "Version.h" // getVersion()
#include "Mem.h"
#include "Conf.h"
#include "JobScheduler.h"
#include "Hostdb.h"
#include "Posdb.h"
#include "Titledb.h"
#include "Tagdb.h"
#include "Spider.h"
#include "SpiderColl.h"
#include "SpiderLoop.h"
#include "SpiderCache.h"
#include "Doledb.h"
#include "Clusterdb.h"
#include "Collectiondb.h"
#include "Sections.h"
#include "UdpServer.h"
#include "Serialize.h"
#include "Repair.h"
#include "DailyMerge.h"
#include "MsgC.h"
#include "HttpServer.h"
#include "Loop.h"
#include "HighFrequencyTermShortcuts.h"
#include "PageTemperatureRegistry.h"
#include "Docid2Siteflags.h"
#include "SiteMedianPageTemperatureRegistry.h"
#include "UrlRealtimeClassification.h"
#include "IPAddressChecks.h"
#include <sys/resource.h> // setrlimit
#include "Stats.h"
#include "Statistics.h"
#include "Speller.h" // g_speller
#include "Wiki.h" // g_wiki
#include "Wiktionary.h" // g_wiktionary
#include "WordVariations.h"
#include "CountryCode.h"
#include "Domains.h"
#include "Pos.h"
#include "Title.h"
#include "Speller.h"
#include "SummaryCache.h"
#include "InstanceInfoExchange.h"
#include "WantedChecker.h"
#include "Dns.h"
#include "DumpSpiderdbSqlite.h"
// include all msgs that have request handlers, cuz we register them with g_udp
#include "Msg0.h"
#include "Msg4In.h"
#include "Msg4Out.h"
#include "Msg13.h"
#include "Msg20.h"
#include "Msg22.h"
#include "Msg25.h"
#include "Msg39.h"
#include "Msg40.h" // g_resultsCache
#include "Parms.h"
#include "Pages.h"
#include "PageInject.h"
#include "unicode/UCMaps.h"
#include "utf8_convert.h"
#include "Profiler.h"
#include "Proxy.h"
#include "linkspam.h"
#include "Process.h"
#include "sort.h"
#include "RdbBuckets.h"
#include "SpiderProxy.h"
#include "HashTable.h"
#include "GbUtil.h"
#include "Dir.h"
#include "File.h"
#include "DnsBlockList.h"
#include "ContentTypeBlockList.h"
#include "UrlMatchList.h"
#include "UrlBlockCheck.h"
#include "DocDelete.h"
#include "GbDns.h"
#include "ScopedLock.h"
#include "RobotsCheckList.h"
#include "ConvertSpiderdb.h"
#include "RobotsBlockedResultOverride.h"
#include "UrlResultOverride.h"
#include "FxCheckAdult.h"
#include "FxCheckSpam.h"
#include "GbCompress.h"
#include "DocRebuild.h"
#include "DocReindex.h"
#include "FxExplicitKeywords.h"
#include "IpBlockList.h"
#include "SpiderdbSqlite.h"
#include "QueryLanguage.h"
#include "SiteNumInlinks.h"
#include "ContentMatchList.h"
#include "SiteMedianPageTemperature.h"
#include "Lemma.h"
#include "ip.h"
#include "CountryLanguage.h"
#include "Errno.h"
#include "Docid.h"
#include <sys/stat.h> //umask()
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
// Valgrind client-request headers are only pulled in for builds that define _VALGRIND_.
// The conditional must be closed here; otherwise every declaration that follows
// would be swallowed by it in non-valgrind builds.
#ifdef _VALGRIND_
#include <valgrind/memcheck.h>
#include <valgrind/helgrind.h>
#endif
// ---- Forward declarations for helpers defined further down in this file ----

// Msg handler registration (presumably registers the Msg* request handlers
// whose headers are included above -- confirm against the definitions).
static bool registerMsgHandlers();
static bool registerMsgHandlers1();
static bool registerMsgHandlers2();
static const int32_t commandLineDumpdbRecSize = 10 * 1024 * 1024; //recSizes parameter for Msg5::getList() while dumping database from the command-line
static void printHelp();

// Back-ends for the "gb dump <dbLetter> ..." command handled in main2().
// The common arguments are the collection name, the first file number,
// the number of files and whether to include the in-memory tree.
static void dumpTitledb (const char *coll, int32_t sfn, int32_t numFiles, bool includeTree,
int64_t docId , bool justPrintDups );
static void dumpTagdb(const char *coll, int32_t sfn, int32_t numFiles, bool includeTree, char req,
const char *site);
// dumpPosdb() is not file-local because it is called directly by unit tests
void dumpPosdb(const char *coll, int32_t sfn, int32_t numFiles, bool includeTree, int64_t termId , bool justVerify);
static void dumpWaitingTree(const char *coll);
static void dumpRobotsTxtCache(const char *coll);
static void dumpDoledb(const char *coll, int32_t sfn, int32_t numFiles, bool includeTree);
static void dumpClusterdb(const char *coll, int32_t sfn, int32_t numFiles, bool includeTree);
static void dumpLinkdb(const char *coll, int32_t sfn, int32_t numFiles, bool includeTree, const char *url, bool urlhash);
static void dumpUnwantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
static void dumpWantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
static void dumpAdultTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
static void dumpSpamTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);

// "gb copyfiles <dir>": like "gb install" but takes a directory instead of a host number.
static int copyFiles(const char *dstDir);
// Resolves the working directory from argv[0]; main2() treats a NULL result as fatal.
static const char *getAbsoluteGbDir(const char *argv0);
// main2() treats a negative return value as a failed permission check.
static int32_t checkDirPerms(const char *dir);

// Stand-alone test commands ("hashtest", "parsetest", "cachetest").
static bool hashtest();
// how fast to parse the content of this docId?
static bool parseTest(const char *coll, int64_t docId, const char *query);
static bool summaryTest1(char *rec, int32_t listSize, const char *coll, int64_t docId, const char *query );
static bool cacheTest();

// "gb countdomains <coll> [numRecs]" back-end.
static void countdomains(const char* coll, int32_t numRecs, int32_t output);

// Command-line argument helpers.
static bool argToBoolean(const char *arg);
// NOTE(review): presumably fills *h1/*h2 from an optional "[hostrange]" argument
// starting at argv[rangearg] -- confirm against the definition.
static bool parseOptionalHostRange(int rangearg, int argc, char **argv, int *h1, int *h2);
static void wvg_log_function(WordVariationGenerator::log_class_t log_class, const char *fmt, va_list ap);
// Callback handed to g_jobScheduler.initialize() as its last argument in main2().
// NOTE(review): the body and the closing brace of this function are not present
// in this copy of the file -- restore them from the upstream source before building.
static void wakeupPollLoop() {
// Protocol object passed to g_proxy.initProxy() in the "gb proxy load" start-up path.
static UdpProtocol g_dp; // Default Proto
// installFlag constants: select which action install() performs.
// The trailing comment on each enumerator names the command-line verb in main2()
// that passes it. Values are assigned sequentially starting at 1; new
// enumerators must be appended at the end so existing values stay stable.
typedef enum {
	ifk_install = 1,      // "gb install [hostrange]"
	ifk_installgb ,       // "gb installgb [hostrange]"
	ifk_installconf ,     // "gb installconf [hostrange]"
	ifk_dsh ,             // "gb dsh cmd [hostrange]"
	ifk_dsh2 ,            // "gb dsh2 cmd [hostrange]"
	ifk_backupcopy ,      // "gb backupcopy <backupSubdirName>"
	ifk_backupmove ,      // "gb backupmove <backupSubdirName>"
	ifk_backuprestore ,   // "gb backuprestore <backupSubdirName>"
	ifk_installconf2 ,    // "gb installconf2 [hostrange]"
	ifk_start ,           // "gb start [hostId]"
	ifk_tmpstart ,        // "gb tmpstart [hostId]"
	ifk_installtmpgb ,    // "gb installtmpgb [hostrange]"
	ifk_proxy_start       // "gb proxy start [proxyId]" -- used by main2() but was missing here
} install_flag_konst_t;
// Remote-administration helpers used by the command-line verbs in main2().
// hostId/hostId2 carry the host range taken from the command line; hostId2
// defaults to -1 where a default is declared.
static int install_file(const char *file, int32_t hostId, int32_t hostId2);
static int install ( install_flag_konst_t installFlag, int32_t hostId, char *dir = NULL,
int32_t hostId2 = -1, char *cmd = NULL );
// Sends a command string (e.g. "save=1") for the given page name (e.g. "master")
// to hosts and/or proxies; main2() returns its result as the process exit code.
bool doCmd ( const char *cmd , int32_t hostId , const char *filename , bool sendToHosts,
bool sendToProxies, int32_t hostId2=-1 );
// Directory passed to UnicodeMaps::load_maps().
// NOTE(review): the size 2014 looks like a typo for 1024 or 2048 -- confirm.
static char unicode_data_dir[2014]; //filled in by main2() when hostdb has been initialized
//void tryMergingWrapper ( int fd , void *state ) ;
//void resetAll ( );
//void spamTest ( ) ;
// Reset hooks implemented in other translation units.
extern void resetPageAddUrl ( );
extern void resetHttpMime ( );
extern void reset_iana_charset ( );
extern void resetAdultBit ( );
extern void resetEntities ( );
extern void resetQuery ( );
extern bool g_recoveryMode; // HostFlags.cpp
// Hand-off between main() and the thread that runs main2(): main() stores its
// arguments here, main2_trampoline() reads them and stores main2()'s result in rc_copy.
static int argc_copy;
static char **argv_copy;
static int rc_copy;
static int main2(int argc, char *argv[]);
// pthread entry point used by main(): runs main2() with the arguments saved in
// argc_copy/argv_copy and stores its return value in rc_copy.
// The thread argument is unused and the thread result is always NULL.
static void *main2_trampoline(void *) {
	rc_copy = main2(argc_copy,argv_copy);
	return NULL;
}
// Process entry point.
// Saves argc/argv, runs main2() on a separate thread via main2_trampoline(),
// waits for that thread and returns main2()'s result.
// Returns 99 if the thread cannot be created or joined.
int main ( int argc , char *argv[] ) {
	//Run the main thread ... in a thread
	//The reason for this is so that 'htop', 'perf' and other tools show metrics
	//for the main thread instead of lumping it together with process-wide
	//aggregation (eg. linux kernel 4.4.x claims the main task/process does IO
	//even though it provably doesn't)
	argc_copy = argc;
	argv_copy = argv;

	pthread_t tid;
	int rc = pthread_create(&tid,NULL,main2_trampoline,NULL);
	// pthread_create() returns an error number (not -1/errno) on failure
	if(rc!=0) {
		fprintf(stderr,"pthread_create() failed with error %d (%s)",rc,strerror(rc));
		return 99;
	}

	rc = pthread_join(tid,NULL);
	if(rc!=0) {
		fprintf(stderr,"pthread_join() failed with error %d (%s)",rc,strerror(rc));
		return 99;
	}

	// only report a start-up failure when main2() actually returned one
	if(rc_copy!=0) {
		fprintf( stderr, "Failed to start gb. Exiting.\n" );
	}
	return rc_copy;
}
int main2 ( int argc , char *argv[] ) {
g_conf.m_runAsDaemon = false;
g_conf.m_logToFile = false;
#ifdef _VALGRIND_
//threads are incrementing the counters all over the place
// record time for uptime
g_stats.m_uptimeStart = time(NULL);
int32_t cmdarg = 0;
// get command
// it might not be there, might be a simple "./gb"
const char *cmd = "";
if ( argc >= 2 ) {
cmdarg = 1;
cmd = argv[1];
const char *cmd2 = "";
if ( argc >= 3 )
cmd2 = argv[2];
int arch = 64;
if ( sizeof(char *) == 4 ) arch = 32;
// help
if ( strcmp ( cmd , "-h" ) == 0 ) {
return 0;
// version
if ( strcmp ( cmd , "-v" ) == 0 ) {
return 0;
//send an email on startup for -r, like if we are recovering from an
//unclean shutdown.
g_recoveryMode = false;
if ( strncmp ( cmd , "-r" ,2 ) == 0 || strncmp ( cmd2 , "-r",2 ) == 0 ) {
g_recoveryMode = true;
// run as daemon? then we have to fork
if ( ( strcmp ( cmd , "-d" ) == 0 ) || ( strcmp ( cmd2 , "-d" ) == 0 ) ) {
g_conf.m_runAsDaemon = true;
if ( ( strcmp ( cmd , "-l" ) == 0 ) || ( strcmp ( cmd2 , "-l" ) == 0 ) ) {
g_conf.m_logToFile = true;
if( (strcmp( cmd, "countdomains" ) == 0) && (argc >= (cmdarg + 3)) ) {
uint32_t tmp = atoi( argv[cmdarg+1] );
if( (tmp * 10) > g_mem.getMemTableSize() )
g_mem.setMemTableSize(tmp * 10);
// these tests do not need a hosts.conf
if ( strcmp ( cmd , "hashtest" ) == 0 ) {
if ( argc > cmdarg+1 ) {
return 1;
return 0;
// these tests do not need a hosts.conf
if ( strcmp ( cmd , "cachetest" ) == 0 ) {
if ( argc > cmdarg+1 ) {
return 1;
return 0;
if ( strcmp ( cmd , "parsetest" ) == 0 ) {
if ( cmdarg+1 >= argc ) {
return 1;
// load up hosts.conf
//if ( ! g_hostdb.init(hostId) ) {
// log("db: hostdb init failed." ); return 1; }
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." ); return 1; }
int64_t docid = atoll1(argv[cmdarg+1]);
const char *coll = "";
const char *query = "";
if ( cmdarg+3 <= argc ) coll = argv[cmdarg+2];
if ( cmdarg+4 == argc ) query = argv[cmdarg+3];
parseTest( coll, docid, query );
return 0;
if ( strcmp ( cmd ,"isportinuse") == 0 ) {
if ( cmdarg+1 >= argc ) {
return 1;
int port = atol ( argv[cmdarg+1] );
// make sure port is available. returns false if in use.
if ( ! g_httpServer.m_tcp.testBind(port,false) )
// and we should return with 1 so the keep alive
// script will exit
exit (1);
// port is not in use, return 0
// need threads here for tests?
// note the stack size for debug purposes
struct rlimit rl;
getrlimit(RLIMIT_STACK, &rl);
log(LOG_INFO,"db: Stack size is %" PRId64".", (int64_t)rl.rlim_cur);
struct rlimit lim;
// limit fds
// try to prevent core from systems where it is above 1024
// because our FD_ISSET() libc function will core! (it's older)
int32_t NOFILE = 1024;
lim.rlim_cur = lim.rlim_max = NOFILE;
if ( setrlimit(RLIMIT_NOFILE,&lim)) {
log("db: setrlimit RLIMIT_NOFILE %" PRId32": %s.",
NOFILE,mstrerror(errno) );
struct rlimit rlim;
getrlimit ( RLIMIT_NOFILE,&rlim);
if ( (int32_t)rlim.rlim_max > NOFILE || (int32_t)rlim.rlim_cur > NOFILE ) {
log("db: setrlimit RLIMIT_NOFILE failed!");
// set the s_pages array for print admin pages
g_pages.init ( );
bool isProxy = false;
if ( strcmp( cmd , "proxy" ) == 0 && strcmp( argv[cmdarg+1] , "load" ) == 0 ) {
isProxy = true;
// this is just like starting up a gb process, but we add one to
// each port, we are a dummy machine in the dummy cluster.
// gb -w <workingdir> tmpstart [hostId]
bool useTmpCluster = false;
if ( strcmp ( cmd , "tmpstart" ) == 0 ) {
useTmpCluster = true;
// gb -w <workingdir> tmpstop [hostId]
if ( strcmp ( cmd , "tmpstop" ) == 0 ) {
useTmpCluster = true;
// gb -w <workingdir> tmpstarthost
if ( strcmp ( cmd , "tmpstarthost" ) == 0 ) {
useTmpCluster = true;
bool initMyHost = true;
if (strcmp(cmd, "install") == 0 ||
strcmp(cmd, "installfile") == 0) {
initMyHost = false;
// get current working dir that the gb binary is in. all the data
// files should in there too!!
const char *workingDir = getAbsoluteGbDir ( argv[0] );
if ( ! workingDir ) {
fprintf(stderr,"could not get working dir. Exiting.\n");
return 1;
//log("host: working directory is %s",workingDir);
//initialize IP address checks
// load up hosts.conf
// . it will determine our hostid based on the directory path of this
// gb binary and the ip address of this server
if ( ! g_hostdb.init(-1, isProxy, useTmpCluster, initMyHost, workingDir)) {
log( LOG_ERROR, "db: hostdb init failed." );
return 1;
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log( LOG_ERROR, "db: Failed to init hashtable." );
return 1;
// . hashinit() calls srand() w/ a fixed number
// . let's mix it up again
srand ( time(NULL) );
// Make sure TLD table is initializing before calling any URL handling function
if(!initializeDomains(g_hostdb.m_dir)) {
log( LOG_ERROR, "Domains initialization failed!" );
return 1;
// do not save conf if any core dump occurs starting here
// down to where we set this back to true
g_conf.m_save = false;
//Put this here so that now we can log messages
if ( strcmp ( cmd , "proxy" ) == 0 ) {
if (argc < 3){
return 1;
int32_t proxyId = -1;
if ( cmdarg+2 < argc ) proxyId = atoi ( argv[cmdarg+2] );
if ( strcmp ( argv[cmdarg+1] , "start" ) == 0 ) {
return install ( ifk_proxy_start , proxyId );
else if ( strcmp ( argv[cmdarg+1] , "stop" ) == 0 ) {
g_proxy.m_proxyRunning = true;
return doCmd ( "save=1" , proxyId , "master" , false, true );
else if ( strcmp ( argv[cmdarg+1] , "replacehost" ) == 0 ) {
g_proxy.m_proxyRunning = true;
int32_t hostId = -1;
int32_t spareId = -1;
if ( cmdarg + 2 < argc ) {
hostId = atoi ( argv[cmdarg+2] );
spareId = atoi ( argv[cmdarg+3] );
char replaceCmd[256];
sprintf(replaceCmd, "replacehost=1&rhost=%" PRId32"&rspare=%" PRId32, hostId, spareId);
return doCmd ( replaceCmd, -1, "admin/hosts", false, true);
else if ( proxyId == -1 || strcmp ( argv[cmdarg+1] , "load" ) != 0 ) {
return 1;
Host *h = g_hostdb.getProxy( proxyId );
uint16_t httpPort = h->getInternalHttpPort();
uint16_t httpsPort = h->getInternalHttpsPort();
//we need udpserver for addurl
uint16_t udpPort = h->m_port;
if ( ! g_conf.init ( h->m_dir ) ) {
log( LOG_ERROR, "db: Conf init failed." );
return 1;
// init the loop before g_process since g_process
// registers a sleep callback!
if ( ! g_loop.init() ) {
log( LOG_ERROR, "db: Loop init failed." );
return 1;
//if ( ! g_jobScheduler.initialize() ) {
// log("db: Threads init failed." ); return 1; }
if ( ! g_process.checkNTPD() ) {
log( LOG_ERROR, "db: ntpd not running on proxy" );
return 1;
const char *errmsg=NULL;
if ( !UnicodeMaps::load_maps(unicode_data_dir,&errmsg)) {
log( LOG_ERROR, "db: Unicode initialization failed! %s", errmsg);
return 1;
if(!utf8_convert_initialize()) {
log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
return 1;
// load speller unifiedDict for spider compression proxy
//if ( g_hostdb.m_myHost->m_type & HT_SCPROXY )
// g_speller.init();
if ( ! g_udpServer.init( g_hostdb.getMyPort() ,
20000000 , // readBufSIze
20000000 , // writeBufSize
20 , // pollTime in ms
g_conf.m_udpMaxSockets , // max udp slots
false )){ // is dns?
log( LOG_ERROR, "db: UdpServer init failed." );
return 1;
if (!g_proxy.initProxy (proxyId, udpPort, 0, &g_dp)) {
log( LOG_ERROR, "proxy: init failed" );
return 1;
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log( LOG_ERROR, "db: Failed to init hashtable." );
return 1;
if ( ! g_proxy.initHttpServer( httpPort, httpsPort ) ) {
log( LOG_ERROR, "db: HttpServer init failed. Another gb "
"already running? If not, try editing "
"./hosts.conf to "
"change the port from %" PRId32" to something bigger. "
"Or stop gb by running 'gb stop' or by "
"clicking 'save & exit' in the master controls."
, (int32_t)httpPort );
// this is dangerous!!! do not do the shutdown thing
return 1;
//we should save gb.conf right ?
g_conf.m_save = true;
// gb dsh cmd [hostrange]
if ( strcmp ( cmd , "dsh" ) == 0 ) {
if ( cmdarg+1 >= argc ) {
return 1;
char *cmd = argv[cmdarg+1];
int h1,h2;
return 1;
return install ( ifk_dsh, h1, NULL, h2, cmd );
// gb dsh2 cmd [hostrange]
if ( strcmp ( cmd , "dsh2" ) == 0 ) {
if ( cmdarg+1 >= argc ) {
return 1;
char *cmd = argv[cmdarg+1];
int h1,h2;
return 1;
return install ( ifk_dsh2, h1, NULL, h2, cmd );
// gb copyfiles, like gb install but takes a dir not a host #
if ( strcmp ( cmd , "copyfiles" ) == 0 ) {
if ( cmdarg + 1 >= argc ) {
return 1;
char *dir = argv[cmdarg+1];
return copyFiles ( dir );
// gb install [hostrange]
if ( strcmp ( cmd , "install" ) == 0 ) {
int h1,h2;
return 1;
return install ( ifk_install, h1, NULL, h2 );
// gb installgb [hostrange]
if ( strcmp ( cmd , "installgb" ) == 0 ) {
int h1,h2;
return 1;
return install(ifk_installgb, h1, NULL, h2);
// gb installfile filename [hostrange]
if ( strcmp ( cmd , "installfile" ) == 0 ) {
int h1,h2;
return 1;
return install_file(argv[cmdarg + 1], h1, h2);
// gb installtmpgb [hostrange]
if ( strcmp ( cmd , "installtmpgb" ) == 0 ) {
int h1,h2;
return 1;
return install(ifk_installtmpgb, h1, NULL, h2);
// gb installconf [hostrange]
if ( strcmp ( cmd , "installconf" ) == 0 ) {
int h1,h2;
return 1;
return install(ifk_installconf, h1, NULL, h2);
// gb installconf2 [hostrange]
if ( strcmp ( cmd , "installconf2" ) == 0 ) {
int h1,h2;
return 1;
return install(ifk_installconf2, h1, NULL, h2);
// gb start [hostId]
if ( strcmp ( cmd , "start" ) == 0 ) {
int h1,h2;
return 1;
return install(ifk_start, h1, NULL, h2);
// gb tmpstart [hostId]
if ( strcmp ( cmd , "tmpstart" ) == 0 ) {
int h1,h2;
return 1;
return install(ifk_tmpstart, h1, NULL, h2);
if ( strcmp ( cmd , "tmpstop" ) == 0 ) {
int h1,h2;
return 1;
return doCmd("save=1", h1, "master", true, false, h2);
if ( strcmp ( cmd , "kstop" ) == 0 ) {
int h1,h2;
return 1;
return doCmd("save=1", h1, "master", true, false, h2);
// gb backupcopy [hostId] <backupSubdirName>
if ( strcmp ( cmd , "backupcopy" ) == 0 ) {
if ( cmdarg + 1 >= argc ) {
return 1;
return install( ifk_backupcopy , -1 , argv[cmdarg+1] );
// gb backupmove [hostId] <backupSubdirName>
if ( strcmp ( cmd , "backupmove" ) == 0 ) {
if ( cmdarg + 1 >= argc ) {
return 1;
return install( ifk_backupmove , -1 , argv[cmdarg+1] );
// gb backupmove [hostId] <backupSubdirName>
if ( strcmp ( cmd , "backuprestore" ) == 0 ) {
if ( cmdarg + 1 >= argc ) {
return 1;
return install( ifk_backuprestore, -1 , argv[cmdarg+1] );
// gb stop [hostId]
if ( strcmp ( cmd , "stop" ) == 0 ) {
int h1,h2;
return 1;
return doCmd("save=1" , h1 , "master", true, false, h2);
// gb save [hostId]
if ( strcmp ( cmd , "save" ) == 0 ) {
int h1,h2;
return 1;
return doCmd("js=1", h1, "master", true, false, h2);
// gb spidersoff [hostId]
if ( strcmp ( cmd , "spidersoff" ) == 0 ) {
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
return doCmd( "se=0", hostId, "master", true, false );
// gb spiderson [hostid]
if ( strcmp ( cmd , "spiderson" ) == 0 ) {
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
return doCmd( "se=1", hostId, "master", true, false );
// gb cacheoff [hostId]
if ( strcmp ( cmd , "cacheoff" ) == 0 ) {
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
return doCmd( "dpco=1", hostId, "master", true, false );
// gb ddump [hostId]
if ( strcmp ( cmd , "ddump" ) == 0 ) {
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
return doCmd( "dump=1", hostId, "master", true, false );
// gb pmerge [hostId]
if ( strcmp ( cmd , "pmerge" ) == 0 ) {
int h1,h2;
return 1;
return doCmd("pmerge=1", h1, "master", true, false, h2);
// gb spmerge [hostId]
if ( strcmp ( cmd , "spmerge" ) == 0 ) {
int h1,h2;
return 1;
return doCmd("spmerge=1", h1, "master", true, false, h2);
// gb tmerge [hostId]
if ( strcmp ( cmd , "tmerge" ) == 0 ) {
int h1,h2;
return 1;
return doCmd("tmerge=1", h1, "master", true, false, h2);
// gb merge [hostId]
if ( strcmp ( cmd , "merge" ) == 0 ) {
int h1,h2;
return 1;
return doCmd("merge=1", h1, "master", true, false, h2);
// gb setnote <hostid> <note>
if ( strcmp ( cmd, "setnote" ) == 0 ) {
int32_t hostId;
char *note;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
else return 0;
if ( cmdarg + 2 < argc ) note = argv[cmdarg+2];
else return 0;
char urlnote[1024];
urlEncode(urlnote, 1024, note, strlen(note));
log ( LOG_INIT, "conf: setnote %" PRId32": %s", hostId, urlnote );
char setnoteCmd[256];
sprintf(setnoteCmd, "setnote=1&host=%" PRId32"¬e=%s",
hostId, urlnote);
return doCmd( setnoteCmd, -1, "admin/hosts", true, false );
// gb setsparenote <spareid> <note>
if ( strcmp ( cmd, "setsparenote" ) == 0 ) {
int32_t spareId;
char *note;
if ( cmdarg + 1 < argc ) spareId = atoi ( argv[cmdarg+1] );
else return 0;
if ( cmdarg + 2 < argc ) note = argv[cmdarg+2];
else return 0;
char urlnote[1024];
urlEncode(urlnote, 1024, note, strlen(note));
log(LOG_INIT, "conf: setsparenote %" PRId32": %s", spareId, urlnote);
char setnoteCmd[256];
sprintf(setnoteCmd, "setsparenote=1&spare=%" PRId32"¬e=%s",
spareId, urlnote);
return doCmd( setnoteCmd, -1, "admin/hosts" , true, false );
// gb replacehost <hostid> <spareid>
if ( strcmp ( cmd, "replacehost" ) == 0 ) {
int32_t hostId = -1;
int32_t spareId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
if ( cmdarg + 2 < argc ) spareId = atoi ( argv[cmdarg+2] );
char replaceCmd[256];
sprintf(replaceCmd, "replacehost=1&rhost=%" PRId32"&rspare=%" PRId32,
hostId, spareId);
return doCmd( replaceCmd, -1, "admin/hosts", true, true );
// . read in the conf file
// . this now initializes from a dir and hostId, they should all be
// name gbHID.conf
// . now that hosts.conf has more of the burden, all gbHID.conf files
// can be identical
if ( ! g_conf.init ( g_hostdb.m_myHost->m_dir ) ) {
log( LOG_ERROR, "db: Conf init failed." );
return 1;
if ( ! g_jobScheduler.initialize(g_conf.m_maxCoordinatorThreads, g_conf.m_maxCpuThreads, g_conf.m_maxSummaryThreads, g_conf.m_maxIOThreads, g_conf.m_maxExternalThreads, g_conf.m_maxFileMetaThreads, g_conf.m_maxMergeThreads, wakeupPollLoop)) {
log( LOG_ERROR, "db: JobScheduler init failed." );
return 1;
// put in read only mode
if ( useTmpCluster ) {
g_conf.m_readOnlyMode = true;
// init the loop, needs g_conf
if ( ! g_loop.init() ) {
log( LOG_ERROR, "db: Loop init failed." );
return 1;
// the new way to save all rdbs and conf
// must call after Loop::init() so it can register its sleep callback
// set up the threads, might need g_conf
// . gb dump [dbLetter][coll][fileNum] [numFiles] [includeTree][termId]
// . spiderdb is special:
// gb dump s [coll][fileNum] [numFiles] [includeTree] [0=old|1=new]
// [priority] [printStats?]
if ( strcmp ( cmd , "dump" ) == 0 ) {
// tell Collectiondb, not to verify each rdb's data
g_dumpMode = true;
if ( cmdarg+1 >= argc ) {
return 1;
int32_t startFileNum = 0;
int32_t numFiles = -1;
bool includeTree = true;
const char *coll = "";
// so we do not log every collection coll.conf we load
g_conf.m_doingCommandLine = true;
// we have to init collection db because we need to know if
// the collnum is legit or not in the tree
if ( ! g_collectiondb.loadAllCollRecs() ) {
log("db: Collectiondb init failed." ); return 1; }
if ( cmdarg+2 < argc ) coll = argv[cmdarg+2];
if ( cmdarg+3 < argc ) startFileNum = atoi(argv[cmdarg+3]);
if ( cmdarg+4 < argc ) numFiles = atoi(argv[cmdarg+4]);
if ( cmdarg+5 < argc ) includeTree = argToBoolean(argv[cmdarg+5]);
if ( argv[cmdarg+1][0] == 't' ) {
int64_t docId = 0LL;
if ( cmdarg+6 < argc ) {
docId = atoll1(argv[cmdarg+6]);
dumpTitledb (coll, startFileNum, numFiles, includeTree, docId, false);
else if ( argv[cmdarg+1][0] == 'D' ) {
int64_t docId = 0LL;
if ( cmdarg+6 < argc ) {
docId = atoll1(argv[cmdarg+6]);
dumpTitledb (coll, startFileNum, numFiles, includeTree, docId, true);
else if (strcmp(argv[cmdarg+1], "w") == 0) {
else if (strcmp(argv[cmdarg+1], "rtc") == 0) {
else if ( argv[cmdarg+1][0] == 'x' )
dumpDoledb (coll,startFileNum,numFiles,includeTree);
else if (strcmp(argv[cmdarg+1], "s") == 0) {
int32_t firstIp = 0;
if(cmdarg+3 < argc) {
firstIp = atoip(argv[cmdarg + 3]);
dumpSpiderdbSqlite(coll, firstIp);
else if ( argv[cmdarg+1][0] == 'S' ) {
char *site = NULL;
if ( cmdarg+6 < argc ) {
site = argv[ cmdarg + 6 ];
dumpTagdb( coll, startFileNum, numFiles, includeTree, 0, site );
} else if ( argv[cmdarg+1][0] == 'z' ) {
char *site = NULL;
if ( cmdarg+6 < argc ) {
site = argv[ cmdarg + 6 ];
dumpTagdb( coll, startFileNum, numFiles, includeTree, 'z', site );
} else if ( argv[cmdarg+1][0] == 'A' ) {
dumpTagdb( coll, startFileNum, numFiles, includeTree, 'A', NULL );
} else if ( argv[cmdarg+1][0] == 'W' ) {
dumpTagdb( coll, startFileNum, numFiles, includeTree, 0, NULL );
} else if ( argv[cmdarg+1][0] == 'l' )
dumpClusterdb (coll,startFileNum,numFiles,includeTree);
else if (strcmp(argv[cmdarg+1], "Lu") == 0) {
char *url = NULL;
if ( cmdarg+6 < argc ) url = argv[cmdarg+6];
else if (strcmp(argv[cmdarg+1], "Ls") == 0) {
char *url = NULL;
if ( cmdarg+6 < argc ) url = argv[cmdarg+6];
} else if ( argv[cmdarg+1][0] == 'p' ) {
int64_t termId = -1;
if ( cmdarg+6 < argc ) {
char *targ = argv[cmdarg+6];
if ( is_alpha_a(targ[0]) ) {
char *colon = strstr(targ,":");
int64_t prefix64 = 0LL;
if ( colon ) {
*colon = '\0';
prefix64 = hash64n(targ);
targ = colon + 1;
// hash the term itself
termId = hash64n(targ);
// hash prefix with termhash
if ( prefix64 )
termId = hash64(termId,prefix64);
termId &= TERMID_MASK;
printf("termId=%ld\n", termId);
else {
termId = atoll1(targ);
dumpPosdb( coll, startFileNum, numFiles, includeTree, termId, false );
} else if (strcmp(argv[cmdarg+1], "u") == 0) {
dumpUnwantedTitledbRecs(coll, startFileNum, numFiles, includeTree);
} else if (strcmp(argv[cmdarg+1], "wt") == 0) {
dumpWantedTitledbRecs(coll, startFileNum, numFiles, includeTree);
} else if (strcmp(argv[cmdarg+1], "at") == 0) {
dumpAdultTitledbRecs(coll, startFileNum, numFiles, includeTree);
} else if (strcmp(argv[cmdarg+1], "st") == 0) {
dumpSpamTitledbRecs(coll, startFileNum, numFiles, includeTree);
} else {
return 1;
// disable any further logging so final log msg is clear
g_log.m_disabled = true;
return 0;
// gb sitedeftemp prepare|switch [hostrange]
if(strcmp(cmd, "sitedeftemp") == 0) {
int h1,h2;
return 1;
return doCmd("sitedeftemp=prepare", h1, "master", true, false, h2);
else if(strcmp(argv[cmdarg+1],"switch")==0)
return doCmd("sitedeftemp=switch", h1, "master", true, false, h2);
else {
return 1;
if(strcmp(cmd, "dumpcsv") == 0) {
g_conf.m_readOnlyMode = true; //we don't need write access
g_conf.m_doingCommandLine = true; // so we do not log every collection coll.conf we load
if( !g_collectiondb.loadAllCollRecs()) {
log("db: Collectiondb init failed.");
return 1;
if(argv[cmdarg+1][0] == 's') {
bool interpret_values = argc>cmdarg+3 ? argToBoolean(argv[cmdarg+3]) : false;
g_log.m_disabled = true;
return 0;
if(strcmp(cmd, "convertspiderdb") == 0) {
g_conf.m_doingCommandLine = true; // so we do not log every collection coll.conf we load
if( !g_collectiondb.loadAllCollRecs()) {
log("db: Collectiondb init failed.");
return 1;
const char *collname = argc>cmdarg+1 ? argv[cmdarg+1] : "main";
g_log.m_disabled = true;
return 0;
if( strcmp( cmd, "countdomains" ) == 0 && argc >= (cmdarg + 2) ) {
const char *coll = "";
int32_t outpt;
coll = argv[cmdarg+1];
int32_t numRecs;
if(argc>cmdarg+2) {
if(!isdigit(argv[cmdarg+2][0])) {
return 1;
numRecs = atoi( argv[cmdarg+2] );
} else
numRecs = 1000000;
if( argc > (cmdarg + 2) ) outpt = atoi( argv[cmdarg+2] );
else outpt = 0;
log( LOG_INFO, "countdomains: Allocated Larger Mem Table for: %" PRId32,
g_mem.getMemTableSize() );
const char *errmsg=NULL;
if (!UnicodeMaps::load_maps(unicode_data_dir,&errmsg)) {
log("Unicode initialization failed! %s", errmsg);
return 1;
if(!utf8_convert_initialize()) {
log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
return 1;
if ( ! g_collectiondb.loadAllCollRecs() ) {
log("db: Collectiondb init failed." ); return 1; }
countdomains( coll, numRecs, outpt );
g_log.m_disabled = true;
return 0;
if(!load_lemma_lexicon()) {
log(LOG_WARN,"db: could not load lemma lexicon");
//but not fatal
// file creation test, make sure we have dir control
if ( checkDirPerms ( g_hostdb.m_dir ) < 0 ) {
return 1;
// . make sure we have critical files
if ( ! g_process.checkFiles ( g_hostdb.m_dir ) ) {
return 1;
g_errno = 0;
// make sure port is available, no use loading everything up then
// failing because another process is already running using this port
if ( ! g_httpServer.m_tcp.testBind(g_hostdb.getMyHost()->getInternalHttpPort(), true)) {
// return 0 so keep alive bash loop exits
int32_t *ips;
log("db: Logging to file %s.", g_hostdb.m_logFilename );
if ( ! g_conf.m_runAsDaemon )
log("db: Use 'gb -d' to run as daemon. Example: gb -d");
// start up log file
if ( ! g_log.init( g_hostdb.m_logFilename ) ) {
fprintf (stderr,"db: Log file init failed. Exiting.\n" );
return 1;
g_log.m_logTimestamps = true;
g_log.m_logReadableTimestamps = true; // @todo BR: Should be configurable..
// in case we do not have one, we need it for Images.cpp
if ( ! makeTrashDir() ) {
fprintf (stderr,"db: failed to make trash dir. Exiting.\n" );
return 1;
g_errno = 0;
// run as daemon now
//fprintf(stderr,"running as daemon\n");
if ( g_conf.m_runAsDaemon ) {
pid_t pid, sid;
pid = fork();
if ( pid < 0 ) exit(EXIT_FAILURE);
// seems like we core unless parent sets this to NULL.
// it does not affect the child.
//if ( pid > 0 ) g_hostdb.m_myHost = NULL;
// child gets a 0, parent gets the child's pid, so exit
if ( pid > 0 ) exit(EXIT_SUCCESS);
// change file mode mask
sid = setsid();
if ( sid < 0 ) exit(EXIT_FAILURE);
// if we do not do this we don't get sigalarms or quickpolls
// when running as 'gb -d'
// we register log rotation here because it's after g_loop is initialized
// log the version
log(LOG_INIT,"conf: Gigablast Version : %s", getVersion());
log(LOG_INIT,"conf: Gigablast Architecture : %d-bit", arch);
log(LOG_INIT,"conf: Gigablast Build config : %s", getBuildConfig());
log(LOG_INIT,"conf: Gigablast Git commit : %s", getCommitId());
// show current working dir
log("host: Working directory is %s",workingDir);
log("host: Using %shosts.conf",g_hostdb.m_dir);
pid_t pid = getpid();
log("host: Process ID is %" PRIu64,(int64_t)pid);
// from Hostdb.cpp
ips = getLocalIps();
for ( ; ips && *ips ; ips++ ) {
char ipbuf[16];
log("host: Detected local ip %s",iptoa(*ips,ipbuf));
// show it
log("host: Running as host id #%" PRId32,g_hostdb.m_myHostId );
const char *errmsg=NULL;
if (!UnicodeMaps::load_maps(unicode_data_dir,&errmsg)) {
log( LOG_ERROR, "Unicode initialization failed! %s", errmsg);
return 1;
if(!utf8_convert_initialize()) {
log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
return 1;
// the wiktionary for lang identification and alternate word forms/
// synonyms
if ( ! g_wiktionary.load() ) {
log( LOG_ERROR, "Wiktionary initialization failed!" );
return 1;
if ( ! g_wiktionary.test() ) {
log( LOG_ERROR, "Wiktionary test failed!" );
return 1;
log(LOG_DEBUG,"main: initializing word variations: Danish");
if(!initializeWordVariationGenerator_Danish()) {
log(LOG_WARN, "word-variation-danish initialization failed" );
//but not fatal
log(LOG_DEBUG,"main: initialized word variations: Danish");
// the wiki titles
if ( ! g_wiki.load() ) {
log( LOG_ERROR, "Wiki initialization failed!" );
return 1;
// shout out if we're in read only mode
if ( g_conf.m_readOnlyMode )
log("db: -- Read Only Mode Set. Can Not Add New Data. --");
if (!Rdb::initializeRdbDumpThread()) {
logError("Unable to initialize rdb dump thread");
return 1;
// . collectiondb, does not use rdb, loads directly from disk
// . do this up here so RdbTree::fixTree_unlocked() can fix RdbTree::m_collnums
// . this is a fake init, cuz we pass in "true"
if ( ! g_collectiondb.loadAllCollRecs() ) {
log( LOG_ERROR, "db: Collectiondb load failed." );
return 1;
return 1;
// the spider cache used by SpiderLoop
if ( ! g_spiderCache.init() ) {
log( LOG_ERROR, "db: SpiderCache init failed." );
return 1;
// now clean the trees since all rdbs have loaded their rdb trees
// from disk, we need to remove bogus collection data from teh trees
// like if a collection was delete but tree never saved right it'll
// still have the collection's data in it
if ( ! g_collectiondb.addRdbBaseToAllRdbsForEachCollRec ( ) ) {
log("db: Collectiondb init failed." );
// make sure the we have spiderdb sqlite if we still have spiderdb rdb files
for (collnum_t collNum = g_collectiondb.getFirstCollnum(); collNum < g_collectiondb.getNumRecs(); ++collNum) {
CollectionRec *collRec = g_collectiondb.getRec(collNum);
if (collRec != nullptr) {
RdbBase *base = collRec->getBase(RDB_SPIDERDB_DEPRECATED);
if (base->getNumFiles() != 0 && !g_spiderdb_sqlite.existDb(collNum)) {
// has rdb files but no sqlite file
log(LOG_ERROR, "Found spiderdb rdb files but no spiderdb sqlite files.");
log(LOG_ERROR, "Run ./gb convertspiderdb before starting up gb instances");
// initialize country languages
//Load the high-frequency term shortcuts (if they exist)
//Load the page temperature
//load docid->flags/sitehash map
//load sitehash32->default page temperature
// load block lists
// Initialize adult detection
g_checkAdultList.init("adultwords.txt", "adultphrases.txt");
// Initialize spam detection
if(!ExplicitKeywords::initialize()) {
log(LOG_ERROR,"Could not initialize explicit keywords file");
//but otherwise carry on
// initialize generate global index thread
if (!RdbBase::initializeGlobalIndexThread()) {
logError("Unable to initialize global index thread");
if (!Msg4In::initializeIncomingThread()) {
logError("Unable to initialize Msg4 incoming thread");
// test all collection dirs for write permission
int32_t pcount = 0;
for ( int32_t i = 0 ; i < g_collectiondb.getNumRecs(); i++ ) {
const CollectionRec *cr = g_collectiondb.getRec(i);
if ( ! cr ) continue;
if ( ++pcount >= 100 ) {
log("rdb: not checking directory permission for more than first 100 collections to save time.");
char tt[1024 + MAX_COLL_LEN ];
sprintf ( tt , "%scoll.%s.%" PRId32, g_hostdb.m_dir, cr->m_coll , (int32_t)cr->m_collnum );
checkDirPerms ( tt ) ;
// load the appropriate dictionaries
if ( ! g_speller.init() && g_conf.m_isLive ) {
// Load the category language table
// init minsitenuminlinks buffer
if ( ! g_tagdb.loadMinSiteInlinksBuffer() ) {
log("db: failed to load sitelinks.txt data");
// . then our main udp server
// . must pass defaults since g_dns uses it's own port/instance of it
// . server should listen to a socket and register with g_loop
// . sock read/write buf sizes are both 64000
// . poll time is 60ms
// . if the read/write bufs are too small it severely degrades
// transmission times for big messages. just use ACK_WINDOW *
// MAX_DGRAM_SIZE as the size so when sending you don't drop dgrams
// . the 400k size allows us to cover Sync.cpp's activity well
if ( ! g_udpServer.init( g_hostdb.getMyPort() ,&g_dp,
40000000 , // readBufSIze
20000000 , // writeBufSize
20 , // pollTime in ms
g_conf.m_udpMaxSockets , // max udp slots
false )){ // is dns?
log("db: UdpServer init failed." ); return 1; }
// start up repair loop
if ( ! g_repair.init() ) {
log("db: Repair init failed." ); return 1; }
// start up repair loop
if ( ! g_dailyMerge.init() ) {
log("db: Daily merge init failed." ); return 1; }
// . then dns Distributed client
// . server should listen to a socket and register with g_loop
// . Only the distributed cache shall call the dns server.
if ( ! g_dns.init( g_hostdb.m_myHost->m_dnsClientPort ) ) {
log("db: Dns distributed client init failed." ); return 1; }
// initialize dns client library
if (!GbDns::initialize()) {
log(LOG_ERROR, "Unable to initialize dns client");
g_stable_summary_cache.configure(g_conf.m_stableSummaryCacheMaxAge, g_conf.m_stableSummaryCacheSize);
g_unstable_summary_cache.configure(g_conf.m_unstableSummaryCacheMaxAge, g_conf.m_unstableSummaryCacheSize);
// . then webserver
// . server should listen to a socket and register with g_loop
if ( ! g_httpServer.init( g_hostdb.m_myHost->getInternalHttpPort(), g_hostdb.m_myHost->getInternalHttpsPort() ) ) {
log("db: HttpServer init failed. Another gb already running?" );
// this is dangerous!!! do not do the shutdown thing
// . now register all msg handlers with g_udp server
if ( ! registerMsgHandlers() ) {
log("db: registerMsgHandlers failed" ); return 1; }
// gb dictLookupTest
if ( strcmp ( cmd , "dictlookuptest" ) == 0 ) {
if ( argc != cmdarg + 2 ) {
return 1;
g_speller.dictLookupTest ( argv[cmdarg + 1] );
if(cmd[0] && cmd[0]!='-') {
log(LOG_ERROR, "Unknown command: '%s'", cmd);
// . register a callback to try to merge everything every 60 seconds
// . do not exit if we couldn't do this, not a huge deal
// . put this in here instead of Rdb.cpp because we don't want generator commands merging on us
// . niceness is 1
// BR: Upped from 2 sec to 60. No need to check for merge every 2 seconds.
if (!g_loop.registerSleepCallback(60000, NULL, attemptMergeAllCallback, "Rdb::attemptMergeAllCallback", 1)) {
log( LOG_WARN, "db: Failed to init merge sleep callback." );
// try to sync parms (and collection recs) with host 0
if (!g_loop.registerSleepCallback(1000, NULL, Parms::tryToSyncWrapper, "Parms::tryToSyncWrapper", 0)) {
return 0;
if ( !Statistics::initialize() ) {
return 0;
// initialize clients
return 0;
return 0;
// initialize doc process
if (!g_docDelete.init()) {
logError("Unwable to initialize doc delete");
return 0;
if (!g_docDeleteUrl.init()) {
logError("Unwable to initialize doc delete url");
return 0;
if (!g_docRebuild.init()) {
logError("Unwable to initialize doc rebuild");
return 0;
if (!g_docRebuildUrl.init()) {
logError("Unwable to initialize doc rebuild url");
return 0;
if (!g_docReindex.init()) {
logError("Unwable to initialize doc reindex");
return 0;
if (!g_docReindexUrl.init()) {
logError("Unwable to initialize doc reindex url");
return 0;
// . start the spiderloop
// . comment out when testing SpiderCache
// allow saving of conf again
g_conf.m_save = true;
if(g_conf.m_mlockAllCurrent || g_conf.m_mlockAllFuture) {
log(LOG_DEBUG,"Locking memory");
int rc;
if(g_conf.m_mlockAllCurrent && g_conf.m_mlockAllFuture)
rc = mlockall(MCL_CURRENT|MCL_FUTURE);
else if(g_conf.m_mlockAllCurrent)
rc = mlockall(MCL_CURRENT);
else //if(g_conf.m_mlockAllFuture) //doesn't make a lot of sense to me
rc = mlockall(MCL_FUTURE);
log(LOG_WARN, "mlockall() failed with errno=%d (%s)", errno, mstrerror(errno));
log("db: gb is now ready");
// . now start g_loops main interrupt handling loop
// . it should block forever
// . when it gets a signal it dispatches to a server or db to handle it
// Builds the command-line usage/help text for gb.
// The text comes from the string literals below and is collected in the
// local SafeBuf "sb". If the width of the terminal on stdout can be queried
// (the TIOCGWINSZ ioctl succeeds and reports ws_col > 0), the text is
// word-wrapped to that width with SafeBuf::brify2() into a second buffer.
// NOTE(review): the statements that append each group of literals to sb and
// the statements that finally emit the buffer are not visible in this
// excerpt -- confirm against the full file before editing the literals.
static void printHelp() {
SafeBuf sb;
"Usage: gb [CMD]\n");
"\tgb will first try to load "
"the hosts.conf in the same directory as the "
"gb binary. "
"Then it will determine its hostId based on "
"the directory and IP address listed in the "
"hosts.conf file it loaded. Things in []'s "
"are optional.");
" [CMD] can have the following values:\n\n"
"-h\tPrint this help.\n\n"
"-v\tPrint version and exit.\n\n"
//"\tstart the gb process for this <hostId> locally."
//" <hostId> is 0 to run as host #0, for instance."
//"<hostId> -d\n\trun as daemon.\n\n"
"-d\tRun as daemon.\n\n"
//"-o\tprint the overview documentation in HTML. "
//"Contains the format of hosts.conf.\n\n"
// "<hostId> -r\n\tindicates recovery mode, "
// "sends email to addresses "
// "specified in Conf.h upon startup.\n\n"
// "-r\tindicates recovery mode, "
// "sends email to addresses "
// "specified in Conf.h upon startup.\n\n"
"start [hostId]\n"
"\tStart the gb process on all hosts or just on "
"[hostId], if specified, using an ssh command. Runs "
"each gb process in a keepalive loop under bash.\n\n"
"start <hostId1-hostId2>\n"
"\tLike above but just start gb on the supplied "
"range of hostIds.\n\n"
"stop [hostId]\n"
"\tSaves and exits for all gb hosts or "
"just on [hostId], if specified.\n\n"
"stop <hostId1-hostId2>\n"
"\tTell gb to save and exit on the given range of "
"save [hostId]\n"
"\tJust saves for all gb hosts or "
"just on [hostId], if specified.\n\n"
"tmpstart [hostId]\n"
"\tstart the gb process on all hosts or just on "
"[hostId] if specified, but "
"use the ports specified in hosts.conf PLUS one. "
"Then you can switch the "
"proxy over to point to those and upgrade the "
"original cluster's gb. "
"That can be done in the Master Controls of the "
"proxy using the 'use "
"temporary cluster'. Also, this assumes the binary "
"name is tmpgb not gb.\n\n"
"tmpstop [hostId]\n"
"\tsaves and exits for all gb hosts or "
"just on [hostId] if specified, for the "
"tmpstart command.\n\n"
"spidersoff [hostId]\n"
"\tDisables spidering for all gb hosts or "
"just on [hostId], if specified.\n\n"
"spiderson [hostId]\n"
"\tEnables spidering for all gb hosts or "
"just on [hostId], if specified.\n\n"
"cacheoff [hostId]\n"
"\tdisables all disk PAGE caches on all hosts or "
"just on [hostId] if specified.\n\n"
"freecache [maxShmid]\n"
"\tfinds and frees all shared memory up to shmid "
"maxShmid, default is 3000000.\n\n"
"ddump [hostId]\n"
"\tdump all b-trees in memory to sorted files on "
"disk. "
"Will likely trigger merges on files on disk. "
"Restrict to just host [hostId] if given.\n\n"
"pmerge [hostId|hostId1-hostId2]\n"
"\tforce merge of posdb files "
"just on [hostId] if specified.\n\n"
"smerge [hostId|hostId1-hostId2]\n"
"\tforce merge of sectiondb files "
"just on [hostId] if specified.\n\n"
"tmerge [hostId|hostId1-hostId2]\n"
"\tforce merge of titledb files "
"just on [hostId] if specified.\n\n"
"merge [hostId|hostId1-hostId2]\n"
"\tforce merge of all rdb files "
"just on [hostId] if specified.\n\n"
"dsh <CMD>\n"
"\tRun this command on the primary IPs of "
"all active hosts in hosts.conf. It will be "
"executed in the gigablast working directory on "
"each host. Example: "
"gb dsh 'ps auxw; uptime'\n\n"
"dsh2 <CMD>\n"
"\trun this command on the secondary IPs of "
"all active hosts in hosts.conf. Example: "
"gb dsh2 'ps auxw; uptime'\n\n"
"install [hostId]\n"
"\tInstall all required files for gb from "
"current working directory of the gb binary "
"to [hostId]. If no [hostId] is specified, install "
"to ALL hosts.\n\n"
"install2 [hostId]\n"
"\tlike above, but use the secondary IPs in the "
"installgb [hostId]\n"
"\tLike above, but install just the gb executable.\n\n"
"installfile <file>\n"
"\tInstalls the specified file on all hosts\n\n"
"installtmpgb [hostId]\n"
"\tlike above, but install just the gb executable "
"as tmpgb (for tmpstart).\n\n"
"installconf [hostId]\n"
"\tlike above, but install hosts.conf and gb.conf\n\n"
"installconf2 [hostId]\n"
"\tlike above, but install hosts.conf and gbN.conf "
"to the secondary IPs.\n\n"
"backupcopy <backupSubdir>\n"
"\tsave a copy of all xml, config, data and map files "
"into <backupSubdir> which is relative "
"to the working dir. Done for all hosts.\n\n"
"backupmove <backupSubdir>\n"
"\tmove all all xml, config, data and map files "
"into <backupSubdir> which is relative "
"to the working dir. Done for all hosts.\n\n"
"backuprestore <backupSubdir>\n"
"\tmove all all xml, config, data and map files "
"in <backupSubdir>, which is relative "
"to the working dir, into the working dir. "
"Will NOT overwrite anything. Done for all "
"proxy start [proxyId]\n"
"\tStart a proxy that acts as a frontend to gb "
"and passes on requests to random machines on "
"the cluster given in hosts.conf. Helps to "
"distribute the load evenly across all machines.\n\n"
"proxy load <proxyId>\n"
"\tStart a proxy process directly without calling "
"ssh. Called by 'gb proxy start'.\n\n"
"proxy stop [proxyId]\n"
"\tStop a proxy that acts as a frontend to gb.\n\n"
"dictlookuptest <file>\n"
"\tgets the popularities of the entries in the "
"<file>. Used to only check performance of "
// less common things
"gendict <coll> [numWordsToDump]\n\tgenerate "
"dictionary used for spellchecker "
"from titledb files in collection <coll>. Use "
"first [numWordsToDump] words.\n\n"
//"update\tupdate titledb0001.dat\n\n"
"treetest\n\ttree insertion speed test\n\n"
"hashtest\n\tadd and delete into hashtable test\n\n"
"parsetest <docIdToTest> [coll] [query]\n\t"
"parser speed tests\n\n"
// Quality Tests
"countdomains <coll> <X>\n"
"\tCounts the domains and IPs in collection coll and "
"in the first X titledb records. Results are sorted"
"by popularity and stored in the log file. \n\n"
"cache stability and speed tests\n\n"
"dump e <coll> <UTCtimestamp>\n\tdump all events "
"as if the time is UTCtimestamp.\n\n"
"dump es <coll> <UTCtimestamp>\n\tdump stats for "
"all events as if the time is UTCtimestamp.\n\n"
"dump <db> <collection> <fileNum> <numFiles> <includeTree> [other stuff]\n\tDump a db from disk. "
"Example: gb dump t main\n"
"\t<collection> is the name of the collection.\n\n"
"\t\tdump l <collection> <fileNum> <numFiles> <includeTree>\n"
"\t\tdump x <collection> <fileNum> <numFiles> <includeTree>\n"
"\tlinkdb (site):\n"
"\t\tdump Ls <collection> <fileNum> <numFiles> <includeTree> <url>\n"
"\tlinkdb (url):\n"
"\t\tdump Lu <collection> <fileNum> <numFiles> <includeTree> <url>\n"
"\tposdb (the index):\n"
"\t\tdump p <collection> <fileNum> <numFiles> <includeTree> <term-or-termId>\n"
"\t\tdump s <collection> <firstIp>\n"
"\t\tdump S <collection> <fileNum> <numFiles> <includeTree> <site>\n"
"\ttagdb (for wget):\n"
"\t\tdump W <collection> <fileNum> <numFiles> <includeTree> <term-or-termId>\n"
"\ttagdb (make sitelist.txt):\n"
"\t\tdump z <collection> <fileNum> <numFiles> <includeTree> <site>\n"
"\ttagdb (output HTTP commands for adding tags):\n"
"\t\tdump A <collection> <fileNum> <numFiles> <includeTree> <term-or-termId>\n"
"\t\tdump t <collection> <fileNum> <numFiles> <includeTree> <docId>\n"
"\ttitledb (Unwanted documents, checked against blocklist, plugins):\n"
"\t\tdump u <collection> <fileNum> <numFiles> <includeTree>\n"
"\ttitledb (Wanted documents, checked against blocklist, plugins):\n"
"\t\tdump wt <collection> <fileNum> <numFiles> <includeTree>\n"
"\ttitledb (duplicates only):\n"
"\t\tdump at <collection> <fileNum> <numFiles> <includeTree>\n"
"\ttitledb (Adult titlerecs):\n"
"\t\tdump st <collection> <fileNum> <numFiles> <includeTree>\n"
"\ttitledb (Spam titlerecs):\n"
"\t\tdump D <collection> <fileNum> <numFiles> <includeTree> <docId>\n"
"\twaiting tree:\n"
"\t\tdump w <collection>\n"
"\t\tdump rtc <url>\n"
"\tPrepares or switches to a new site-default-page-temperature generation.\n"
"\tsitedeftemp prepare\n"
"\t\tPrepares a new site-default-page-temperature generation\n"
"\tsitedeftemp switch\n"
"\t\tSwitches to a new site-default-page-temperature generation previously prepared with 'sitedeftemp prepare'\n"
//word-wrap to screen width, if known
struct winsize w;
if(ioctl(STDOUT_FILENO,TIOCGWINSZ,&w)==0 && w.ws_col>0) {
SafeBuf sb2;
// wrap the accumulated text at ws_col columns, using "\n\t" as the break string
sb2.brify2(sb.getBufStart(), w.ws_col, "\n\t", false);
} else
// disable printing of used memory
//g_mem.m_used = 0;
/// Check that we are able to create and delete files in @p dir.
///
/// The check is done by creating a scratch file named "tmpfile" in the
/// directory and unlinking it again. In read-only mode
/// (g_conf.m_readOnlyMode) nothing is touched and the check always succeeds.
///
/// @param dir directory to probe
/// @return 0 if the directory is usable (or we are in read-only mode),
///         -1 if the scratch file could not be created or deleted
///
/// @todo ALC wouldn't it be faster to actually check the dir permission instead of trying to write a tmp file?
int32_t checkDirPerms(const char *dir) {
	if ( g_conf.m_readOnlyMode ) {
		return 0;
	}

	File f;
	f.set ( dir , "tmpfile" );

	// callers pass directories without a trailing '/' (e.g. "<dir>coll.<name>.<num>"),
	// so put an explicit separator into the message to log a usable path
	if ( ! f.open ( O_RDWR | O_CREAT | O_TRUNC ) ) {
		log( LOG_ERROR, "disk: Unable to create %s/tmpfile. Need write permission in this directory.", dir );
		return -1;
	}

	if ( ! f.unlink() ) {
		log( LOG_ERROR, "disk: Unable to delete %s/tmpfile. Need write permission in this directory.", dir );
		return -1;
	}

	return 0;
}
// Interprets a command-line argument as a boolean flag.
// The string "1" is accepted as true.
// NOTE(review): the expression continues past the last visible line, so the
// other accepted spellings are not shown in this excerpt.
static bool argToBoolean(const char *arg) {
return strcmp(arg,"1")==0 ||
// Parses an optional "<host1>" or "<host1>-<host2>" command-line argument.
//
//   rangearg   index in argv where the host range may appear
//   argc/argv  the argument vector being parsed
//   h1, h2     outputs. Both are set to -1 when there is no argument at
//              index rangearg. *h2 is set to -1 when only one host id is given.
//
// Returns true on success, including the "no argument present" case.
// Returns false, after printing a diagnostic to stderr, when the argument
// does not start with a number or when host2 < host1.
static bool parseOptionalHostRange(int rangearg, int argc, char **argv, int *h1, int *h2) {
	// no argument supplied: the caller should act on all hosts
	if(rangearg >= argc) {
		*h1 = -1;
		*h2 = -1;
		return true;
	}

	// %d matches the int outputs (the previous %u expected unsigned int*)
	int n = sscanf(argv[rangearg],"%d-%d", h1, h2);

	// sscanf() returns EOF, not 0, when the input ends before the first
	// conversion (e.g. an empty argument), so treat everything below 1 as
	// "nothing parsed" instead of going on to compare unset outputs
	if(n < 1) {
		fprintf(stderr,"Unrecognized host range: '%s'\n", argv[rangearg]);
		return false;
	}

	// a single host id, no range
	if(n == 1) {
		*h2 = -1;
		return true;
	}

	if(*h2 < *h1) {
		fprintf(stderr,"host2<host1 in host range: '%s'\n", argv[rangearg]);
		return false;
	}

	return true;
}
// save them all
// Forward declaration of the sleep callback that performs the actual broadcast.
static void doCmdAll ( int fd, void *state ) ;
// Arguments of doCmd(), stored here so that doCmdAll() can pass them on to
// g_parms.broadcastParmList() once the event loop has called it.
static bool s_sendToHosts;
static bool s_sendToProxies;
static int32_t s_hostId;
static int32_t s_hostId2;
// Holds the fake "GET /<filename>?<cmd> HTTP/1.0" request line built by doCmd().
static char s_buffer[128];
// Request object that doCmd() sets from s_buffer and doCmdAll() converts to a parm list.
static HttpRequest s_r;
// Sets up the broadcast of an admin command from the command line.
//
//   cmd            query-string part of the fake request ("GET /<filename>?<cmd>")
//   hostId         stored in s_hostId (first host of the range; "-1 means all"
//                  according to the comments in doCmdAll)
//   filename       path part of the fake request
//   sendToHosts    stored in s_sendToHosts
//   sendToProxies  stored in s_sendToProxies
//   hostId2        stored in s_hostId2 (last host of the range)
//
// Returns false if g_loop cannot be initialised or the sleep callback cannot
// be registered.
// NOTE(review): the closing braces and the statements following "run the
// loop" are not visible in this excerpt.
bool doCmd ( const char *cmd , int32_t hostId , const char *filename ,
bool sendToHosts , bool sendToProxies , int32_t hostId2 ) {
//so we don't suppress messages to dead hosts (we're not connected to vagus)
g_conf.m_doingCommandLine = true;
// need loop to work
if ( ! g_loop.init() ) {
log(LOG_WARN, "db: Loop init failed." );
return false;
// pass it on
s_hostId = hostId;
s_sendToHosts = sendToHosts;
s_sendToProxies = sendToProxies;
s_hostId2 = hostId2;
// set stuff so http server client-side works right
g_conf.m_httpMaxSockets = 512;
sprintf ( g_conf.m_spiderUserAgent ,"GigablastOpenSource/1.0");
sprintf ( g_conf.m_spiderBotName ,"gigablastopensource");
// register sleep callback to get started
// (doCmdAll unregisters itself on its first invocation, so it runs once)
if (!g_loop.registerSleepCallback(1, NULL, doCmdAll, "doCmdAll", 0)) {
log(LOG_WARN, "admin: Loop init failed.");
return false;
// not it
log(LOG_INFO,"admin: broadcasting %s",cmd);
// make a fake http request
// NOTE(review): s_buffer is only 128 bytes and this sprintf is unbounded, so
// a long filename/cmd combination would overflow it -- consider snprintf.
sprintf ( s_buffer , "GET /%s?%s HTTP/1.0" , filename , cmd );
TcpSocket sock;
// make it local loopback so it passes the permission test in
// doCmdAll()'s call to convertHttpRequestToParmList
// NOTE(review): the literal passed to atoip() is empty in this excerpt even
// though the comment above says loopback -- confirm the intended address.
sock.m_ip = atoip("");
s_r.set ( s_buffer , strlen ( s_buffer ) , &sock );
// do not do sig alarms! for now just set this to null so
// the sigalarmhandler doesn't core
//g_hostdb.m_myHost = NULL;
// run the loop
// Completion callback handed to g_parms.broadcastParmList() by doCmdAll().
// Logs that the command completed and terminates the process with exit
// status 0. The state argument is not used.
[[ noreturn ]] void doneCmdAll ( void *state ) {
log("cmd: completed command");
exit ( 0 );
// Sleep callback registered by doCmd(): sends the prepared command to the
// cluster.
// It unregisters itself, initialises g_udpServer, converts the fake http
// request in s_r into a parm list and hands that list to
// g_parms.broadcastParmList() with doneCmdAll as the completion callback.
// The fd and state arguments are not used in the visible code.
// NOTE(review): several lines (remaining arguments, early returns, closing
// braces) are not visible in this excerpt.
void doCmdAll ( int fd, void *state ) {
// do not keep calling it!
g_loop.unregisterSleepCallback ( NULL, doCmdAll );
// make port -1 to indicate none to listen on
// NOTE(review): the comment above does not match the call, which passes port 18123.
if ( ! g_udpServer.init( 18123 , // port to listen on
20000000 , // readBufSIze
20000000 , // writeBufSize
20 , // pollTime in ms
3500 , // max udp slots
false )){ // is dns?
log("db: UdpServer init on port 18123 failed: %s" ,
// udpserver::sendRequest() checks we have a handle for msgs we send!
// so fake it out with this lest it cores
SafeBuf parmList;
// returns false and sets g_errno on error
if (!g_parms.convertHttpRequestToParmList(&s_r,&parmList,0,NULL)){
log("cmd: error converting command: %s",mstrerror(g_errno));
// an empty list means there is nothing to broadcast
if ( parmList.length() <= 0 ) {
log("cmd: no parmlist to send");
// restrict broadcast to this hostid range!
// returns true with g_errno set on error. uses g_udpServer
if ( g_parms.broadcastParmList ( &parmList ,
doneCmdAll , // callback when done
s_sendToHosts ,
s_sendToProxies ,
s_hostId , // -1 means all
s_hostId2 ) ) { // -1 means all
log("cmd: error sending command: %s",mstrerror(g_errno));
// wait for it
log("cmd: sent command");
static int install_file(const char *dst_host, const char *src_file, const char *dst_file)
char cmd[1024];
sprintf(cmd, "scp -p %s %s:%s", src_file, dst_host, dst_file);
log(LOG_INIT,"admin: %s", cmd);
int rc = system(cmd);
return rc;
static int install_file(const char *file, int32_t hostId, int32_t hostId2) {
// use hostId2 to indicate the range hostId-hostId2, but if it is -1
// then it was not given, so restrict to just hostId
if ( hostId2 == -1 ) {
hostId2 = hostId;
for (int32_t i = 0; i < g_hostdb.getNumHosts(); i++) {
Host *h2 = g_hostdb.getHost(i);
if (h2 == g_hostdb.getMyHost()) {
continue; //skip ourselves
// if doing a range of hostid, hostId2 is >= 0
if (hostId >= 0 && hostId2 >= 0) {
if (h2->m_hostId < hostId || h2->m_hostId > hostId2) {
char full_dst_file[1024];
sprintf(full_dst_file, "%s%s", h2->m_dir, file);
char ipbuf[16];
install_file(iptoa(h2->m_ip, ipbuf), file, full_dst_file);
return 0; //return value is unclear
// Runs a cluster-wide administrative action by building shell command lines
// (ssh / scp / rcp) and executing them with system().
//
//   installFlag  which action to perform; compared against the ifk_* constants below
//   hostId       first host id to act on; filtering is applied only when it is >= 0
//   dir          backup sub-directory used by the backupcopy / backupmove /
//                backuprestore actions
//   hostId2      last host id of the range; -1 means "same as hostId"
//   cmd          shell command run on each host by the dsh / dsh2 actions
//
// Both visible return statements return 0.
// NOTE(review): many statements of this function are not visible in this
// excerpt (the calls that format into tmp / tmpBuf, several arguments,
// "continue" statements and closing braces) -- consult the full file before
// changing any of the command templates.
//
// installFlag is 1 if we are really installing, 2 if just starting up gb's
// installFlag should be a member of the ifk_ enum defined above
static int install ( install_flag_konst_t installFlag, int32_t hostId, char *dir, int32_t hostId2, char *cmd ) {
// use hostId2 to indicate the range hostId-hostId2, but if it is -1
// then it was not given, so restrict to just hostId
if ( hostId2 == -1 ) {
hostId2 = hostId;
char tmp[1024];
// proxies are listed separately from regular hosts, so they get their own loop
if ( installFlag == ifk_proxy_start ) {
for ( int32_t i = 0; i < g_hostdb.m_numProxyHosts; i++ ) {
Host *h2 = g_hostdb.getProxy(i);
// limit install to this hostId if it is >= 0
if ( hostId >= 0 && h2->m_hostId != hostId ) continue;
// . assume conf file name gbHID.conf
// . assume working dir ends in a '/'
//to test add: ulimit -t 10; to the ssh cmd
char ipbuf[16];
"ssh %s \"cd %s ; "
"export MALLOC_CHECK_=0;"
"cp -f gb gb.oldsave ; "
"mv -f gb.installed gb ; "
"ADDARGS='' ; "
"while [ \\$EXITSTATUS != 0 ]; do "
"{ "
"./gb proxy load %" PRId32" " // mdw
" >& ./proxylog ;"
"EXITSTATUS=\\$? ; "
"ADDARGS='-r' ; "
"} "
"done >& /dev/null & \" & ",
h2->m_dir ,
h2->m_hostId );
// log it
log(LOG_INIT,"admin: %s", tmp);
// execute it
int32_t ret = system ( tmp );
if ( ret < 0 ) {
fprintf(stderr,"Error loading proxy: %s\n",
fprintf(stderr,"If proxy does not start, make sure "
"its ip is correct in hosts.conf\n");
return 0;
HashTableX iptab;
char tmpBuf[2048];
// maxOut limits how many commands are started in the background in a row (see "amp" below)
int32_t maxOut = 500;
// this is a big scp so only do two at a time...
// NOTE(review): the comment above says two, but the value assigned for ifk_install is 1.
if ( installFlag == ifk_install ) maxOut = 1;
if ( installFlag == ifk_installgb ) maxOut = 4;
// go through each host
for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
Host *h2 = g_hostdb.getHost(i);
char ipbuf[16];
const char *amp = " ";
// if i is NOT multiple of maxOut then use '&'
// even if all all different machines (IPs) scp chokes and so
// does rcp a little. so restrict to maxOut at a time.
if ( (i+1) % maxOut ) {
amp = "&";
// if doing a range of hostid, hostId2 is >= 0
if ( hostId >= 0 && hostId2 >= 0 ) {
if ( h2->m_hostId < hostId || h2->m_hostId > hostId2 )
// backupcopy
if ( installFlag == ifk_backupcopy ) {
"ssh %s \"cd %s ; "
"mkdir %s ; "
"cp -ai *.dat* *.map gb.conf "
"hosts.conf %s\" &",
iptoa(h2->m_ip,ipbuf), h2->m_dir , dir , dir );
// log it
log ( "%s", tmp);
// execute it
system ( tmp );
// backupmove
else if ( installFlag == ifk_backupmove ) {
"ssh %s \"cd %s ; "
"mkdir %s ; "
"mv -i *.dat* *.map "
"%s\" &",
iptoa(h2->m_ip,ipbuf), h2->m_dir , dir , dir );
// log it
log ( "%s", tmp);
// execute it
system ( tmp );
// backuprestore
else if ( installFlag == ifk_backuprestore ) {
"ssh %s \"cd %s ; cd %s ; "
"mv -i *.dat* *.map gb.conf "
"hosts.conf %s\" &",
iptoa(h2->m_ip,ipbuf), h2->m_dir , dir , h2->m_dir );
// log it
log ( "%s", tmp);
// execute it
system ( tmp );
// NOTE(review): this declares a local with the same name as the "dir"
// parameter; the scope it lives in is not visible in this excerpt.
const char *dir = "./";
// install to it
if ( installFlag == ifk_install ) {
const char *srcDir = "./";
// build the space-separated list of files to copy, then add the two conf files
SafeBuf fileListBuf;
g_process.getFilesToCopy ( srcDir , &fileListBuf );
fileListBuf.safePrintf(" %shosts.conf",srcDir);
fileListBuf.safePrintf(" %sgb.conf",srcDir);
SafeBuf tmpBuf;
// ensure directory is there, if
// not then make it
"ssh %s 'mkdir -p %s' ; "
"scp -p -r %s %s:%s"
, ipbuf
, h2->m_dir
, fileListBuf.getBufStart()
, ipbuf
, h2->m_dir
char *tmp = tmpBuf.getBufStart();
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
else if ( installFlag == ifk_installgb ) {
File f;
// copy "gb.new" unless f.doesExist() reports it missing, then fall back to "gb"
const char *target = "gb.new";
if ( ! f.doesExist() ) target = "gb";
"scp -p " // blowfish is faster
"%s%s "
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
else if ( installFlag == ifk_installtmpgb ) {
"scp -p "
"%sgb.new "
"%s:%s/tmpgb.installed &",
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
else if ( installFlag == ifk_installconf ) {
"scp -p %sgb.conf %shosts.conf %s:%s %s",
dir ,
dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
// start up a dummy cluster using hosts.conf ports + 1
else if ( installFlag == ifk_tmpstart ) {
// . assume conf file name gbHID.conf
// . assume working dir ends in a '/'
"ssh %s \"cd %s ; "
"cp -f tmpgb tmpgb.oldsave ; "
"mv -f tmpgb.installed tmpgb ; "
"%s/tmpgb tmpstarthost "
"%" PRId32" >& ./tmplog%03" PRId32" &\" &",
h2->m_dir ,
h2->m_dir ,
h2->m_hostId ,
h2->m_hostId );
// log it
log(LOG_INIT,"admin: %s", tmp);
// execute it
system ( tmp );
else if ( installFlag == ifk_start ) {
// NOTE(review): unbounded sprintf into the 1024-byte tmp buffer.
sprintf( tmp, "ssh %s '%sgbstart.sh %" PRId32"' %s", iptoa(h2->m_ip,ipbuf), h2->m_dir, h2->m_hostId, amp );
// log it
fprintf(stdout,"admin: %s\n", tmp);
// execute it
system ( tmp );
// dsh
else if ( installFlag == ifk_dsh ) {
"ssh %s 'cd %s ; %s' %s",
cmd ,
amp );
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
// dsh2
else if ( installFlag == ifk_dsh2 ) {
"ssh %s 'cd %s ; %s'",
cmd );
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
// installconf2
else if ( installFlag == ifk_installconf2 ) {
"rcp %sgb.conf %shosts.conf %shosts2.conf "
"%s:%s &",
dir ,
dir ,
dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
// return 0 on success
return 0;
// Registers all message handlers by running both registration helpers.
// Returns false as soon as one of the helpers fails, true otherwise.
static bool registerMsgHandlers() {
if (! registerMsgHandlers1()) return false;
if (! registerMsgHandlers2()) return false;
// in SpiderProxy.cpp...
// NOTE(review): no statement follows the comment above in this excerpt --
// confirm against the full file whether a call belongs here.
return true;
// First group of handler registrations: Msg20, MsgC and Msg22, in that order.
// Evaluation stops at the first registration that fails, in which case false
// is returned; true means all three succeeded.
static bool registerMsgHandlers1() {
	return Msg20::registerHandler()
	    && MsgC::registerHandler()
	    && Msg22::registerHandler();
}
// Second group of handler registrations. The steps run strictly in the order
// listed and evaluation stops at the first one that fails (false is
// returned); true means every step succeeded.
static bool registerMsgHandlers2() {
	return Msg0::registerHandler()
	    && Msg13::registerHandler()
	    && Msg39::registerHandler()
	    && Msg4In::registerHandler()
	    && Msg4::initializeOutHandling()
	    && Parms::registerHandler3e()
	    && Parms::registerHandler3f()
	    // these two are registered directly with the udp server
	    && g_udpServer.registerHandler(msg_type_25,handleRequest25)
	    && g_udpServer.registerHandler(msg_type_7,handleRequest7);
}
#include "Rdb.h"
#include "Xml.h"
// dump routines here now
// Dumps titledb records of a collection in text form to stdout.
//
//   coll           collection name; resolved with g_collectiondb.getRec()
//   startFileNum   passed through to Msg5::getList()
//   numFiles       passed through to Msg5::getList(); must be given (>= 0)
//                  whenever startFileNum is non-zero
//   includeTree    passed through to Msg5::getList()
//   docid          the scan starts at Titledb::makeFirstKey(docid)
//   justPrintDups  selects the branch that tracks runs of records with the
//                  same docId (prevId / count) instead of printing every record
//
// Records are fetched list by list until an empty list is returned or the
// start key stops advancing.
// NOTE(review): many statements are not visible in this excerpt (early
// returns, the calls that format into ttt / ppp / foo, several arguments and
// closing braces) -- consult the full file before changing the format strings.
void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree,
int64_t docid , bool justPrintDups) {
if(startFileNum!=0 && numFiles<0) {
//this may apply to all files, but I haven't checked into hash-based ones yet
fprintf(stderr,"If <startFileNum> is specified then <numFiles> must be too\n");
const char *errmsg=NULL;
if (!UnicodeMaps::load_maps(unicode_data_dir,&errmsg)) {
log("Unicode initialization failed! %s", errmsg);
if(!utf8_convert_initialize()) {
log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." ); return ; }
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
//g_conf.m_spiderdbMaxDiskPageCacheMem = 0;
g_titledb.init ();
key96_t startKey ;
key96_t endKey ;
key96_t lastKey ;
startKey = Titledb::makeFirstKey ( docid );
Msg5 msg5;
RdbList list;
int64_t prevId = 0LL;
int32_t count = 0;
char ttt[2048+MAX_URL_LEN];
HashTableX dedupTable;
// load the appropriate dictionaries -- why???
// make this
XmlDoc *xd;
try { xd = new (XmlDoc); }
catch(std::bad_alloc&) {
fprintf(stdout,"could not alloc for xmldoc\n");
const CollectionRec *cr = g_collectiondb.getRec(coll);
if(cr==NULL) {
fprintf(stderr,"Unknown collection '%s'\n", coll);
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TITLEDB ,
cr->m_collnum ,
&list ,
&startKey ,
&endKey ,
includeTree ,
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1 , // maxRetries
false)) // isRealMerge
log(LOG_LOGIC,"db: getList did not block.");
// all done if empty
if ( list.isEmpty() ) return;
// loop over entries in list
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key96_t k = list.getCurrentKey();
char *rec = list.getCurrentRec();
int32_t recSize = list.getCurrentRecSize();
int64_t docId = Titledb::getDocIdFromKey ( &k );
// keys are expected to be strictly increasing; report any that are not
if ( k <= lastKey )
log("key out of order. "
"lastKey.n1=%" PRIx32" n0=%" PRIx64" "
"currKey.n1=%" PRIx32" n0=%" PRIx64" ",
lastKey = k;
int32_t shard = g_hostdb.getShardNum ( RDB_TITLEDB , &k );
// print deletes
// (a key whose lowest bit of n0 is clear is printed as a delete)
if ( (k.n0 & 0x01) == 0) {
fprintf(stdout,"n1=%08" PRIx32" n0=%016" PRIx64" docId=%012" PRId64" "
"shard=%" PRId32" (del)\n",
k.n1 , k.n0 , docId , shard );
// free the mem
// uncompress the title rec
//TitleRec tr;
if (!xd->set2(rec, recSize, coll, 0)) {
//set2() may have logged something but not the docid
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
// extract the url
Url *u = xd->getFirstUrl();
//int32_t nc = xd->size_catIds / 4;//tr.getNumCatids();
if ( justPrintDups ) {
// print into buf
if ( docId != prevId ) {
time_t ts = xd->m_spideredTime;//tr.getSpiderDa
struct tm tm_buf;
struct tm *timeStruct = localtime_r(&ts,&tm_buf);
//struct tm *timeStruct = gmtime_r(&ts,&tm_buf);
char ppp[100];
LinkInfo *info = xd->ptr_linkInfo1;//tr.ge
char foo[1024];
foo[0] = '\0';
//if ( tr.getVersion() >= 86 )
//"tw=%" PRId32" hw=%" PRId32" upw=%" PRId32" "
"sni=%" PRId32" ",
const char *ru = xd->ptr_redirUrl;
if ( ! ru ) ru = "";
char ipbuf2[16];
"n1=%08" PRIx32" n0=%016" PRIx64" docId=%012" PRId64" "
//hh=%07" PRIx32" ch=%08" PRIx32" "
"size=%07" PRId32" "
"ch32=%010" PRIu32" "
"clen=%07" PRId32" "
"cs=%04d "
"lang=%02d "
"sni=%03" PRId32" "
"lastspidered=%s "
"ip=%s "
"numLinkTexts=%04" PRId32" "
"version=%02" PRId32" "
//"maxLinkTextWeight=%06" PRIu32"%% "
"redir=%s "
"url=%s "
"firstdup=1 "
"shard=%" PRId32" "
k.n1 , k.n0 ,
//rec[0] ,
docId ,
//hostHash ,
//contentHash ,
recSize - 16 ,
u->getUrl() ,
shard );
prevId = docId;
count = 0;
// print previous docid that is same as our
if ( count++ == 0 ) printf ( "\n%s" , ttt );
// nice, this is never 0 for a titlerec, so we can use 0 to signal
// that the following bytes are not compressed, and we can store
// out special checksum vector there for fuzzy deduping.
//if ( rec[0] != 0 ) continue;
// print it out
//printf("n1=%08" PRIx32" n0=%016" PRIx64" b=0x%02hhx docId=%012" PRId64" sh=%07" PRIx32" ch=%08" PRIx32" "
// date indexed as local time, not GMT/UTC
time_t ts = xd->m_spideredTime;//tr.getSpiderDate();
struct tm tm_buf;
struct tm *timeStruct = localtime_r(&ts,&tm_buf);
//struct tm *timeStruct = gmtime_r(&ts,&tm_buf);
char ppp[100];
LinkInfo *info = xd->ptr_linkInfo1;//tr.getLinkInfo();
char foo[1024];
foo[0] = '\0';
"sni=%" PRId32" ",
const char *ru = xd->ptr_redirUrl;
if ( ! ru ) ru = "";
char ipbuf2[16];
"n1=%08" PRIx32" n0=%016" PRIx64" docId=%012" PRId64" "
"size=%07" PRId32" "
"ch32=%010" PRIu32" "
"clen=%07" PRId32" "
"cs=%04d "
"ctype=%s "
"lang=%02d "
"sni=%03" PRId32" "
"lastspidered=%s "
"ip=%s "
"numLinkTexts=%04" PRId32" "
"version=%02" PRId32" "
"shard=%" PRId32" "
"metadatasize=%" PRId32" "
"redir=%s "
k.n1 , k.n0 ,
docId ,
recSize - 16 ,
xd->size_utf8Content,//tr.getContentLen() ,
u->getUrl() );
// free the mem
// continue the scan from the last key of this list
startKey = *(key96_t *)list.getLastKey();
// watch out for wrap around
if ( startKey < *(key96_t *)list.getLastKey() ) return;
// Prints the saved waiting tree of a collection to stdout, one line per node.
//
//   coll  collection name; its collnum is used to build the collection
//         directory "<g_hostdb.m_dir>coll.<coll>.<collnum>"
//
// The tree is loaded from "waitingtree-saved.dat" in that directory when the
// file exists. Each node's 96-bit key is decoded as:
//   firstIp      = low 32 bits of n0
//   spiderTimeMS = (n1 << 32) | (n0 >> 32)
// The time is printed as the raw number and as a UTC timestamp with
// milliseconds.
// NOTE(review): the statements after the failed wt.set() check and the
// closing braces are not visible in this excerpt.
void dumpWaitingTree (const char *coll ) {
RdbTree wt;
if (!wt.set(0, -1, 20000000, true, "waittree2", "waitingtree", sizeof(key96_t))) {
collnum_t collnum = g_collectiondb.getCollnum ( coll );
// make dir
char dir[500];
sprintf(dir, "%scoll.%s.%" PRId32, g_hostdb.m_dir, coll, (int32_t)collnum);
// load in the waiting tree, IPs waiting to get into doledb
BigFile file;
file.set(dir, "waitingtree-saved.dat");
bool treeExists = file.doesExist() > 0;
// load the table with file named "THISDIR/saved"
RdbMem wm;
if ( treeExists && !wt.fastLoad(&file, &wm) ) return;
// hold the tree's lock while the *_unlocked accessors below are used
ScopedLock sl(wt.getLock());
// the the waiting tree
for (int32_t node = wt.getFirstNode_unlocked(); node >= 0; node = wt.getNextNode_unlocked(node)) {
// get key
const key96_t *key = reinterpret_cast<const key96_t*>(wt.getKey_unlocked(node));
// get ip from that
int32_t firstIp = (key->n0) & 0xffffffff;
// get the time
uint64_t spiderTimeMS = key->n1;
// shift upp
spiderTimeMS <<= 32;
// or in
spiderTimeMS |= (key->n0 >> 32);
// get the rest of the data
char ipbuf[16];
// whole seconds for gmtime_r(); the millisecond remainder is printed separately
time_t now_t = spiderTimeMS/1000;
struct tm tm_buf;
struct tm *stm = gmtime_r(&now_t,&tm_buf);
fprintf(stdout,"time=%" PRIu64" (%04d-%02d-%02dT%02d:%02d:%02d.%03dZ) firstip=%s\n", spiderTimeMS, stm->tm_year+1900,stm->tm_mon+1,stm->tm_mday,stm->tm_hour,stm->tm_min,stm->tm_sec,(int)(spiderTimeMS%1000), iptoa(firstIp,ipbuf));
// Dumps doledb records of a collection in text form to stdout.
//
//   coll          collection name; resolved with g_collectiondb.getRec()
//   startFileNum  passed through to Msg5::getList()
//   numFiles      passed through to Msg5::getList()
//   includeTree   passed through to Msg5::getList()
//
// For every record the doledb key information is printed first, followed by
// the spider record that starts 16 bytes (12 + 4) into the record, printed
// with Spiderdb::print(). Records are fetched list by list until an empty
// list is returned or the start key stops advancing.
// NOTE(review): cr is dereferenced without a visible NULL check, and several
// statements (key initialisation, fprintf arguments, closing braces) are not
// visible in this excerpt.
void dumpDoledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree){
g_doledb.init ();
g_doledb.getRdb()->addRdbBase1(coll );
key96_t startKey ;
key96_t endKey ;
Msg5 msg5;
RdbList list;
// previous key seen, used to check the key order
key96_t oldk; oldk.setMin();
const CollectionRec *cr = g_collectiondb.getRec(coll);
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_DOLEDB ,
cr->m_collnum ,
&list ,
&startKey ,
&endKey ,
includeTree ,
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1, // maxRetries
false)) // isRealMerge
log(LOG_LOGIC,"db: getList did not block.");
// all done if empty
if ( list.isEmpty() ) return;
// loop over entries in list
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key96_t k = list.getCurrentKey();
if ( oldk > k )
fprintf(stdout,"got bad key order. "
"%" PRIx32"/%" PRIx64" > %" PRIx32"/%" PRIx64"\n",
oldk = k;
// get it
const char *drec = list.getCurrentRec();
// sanity check
// (aborts the process if the lowest bit of the record's first byte is clear)
if ( (drec[0] & 0x01) == 0x00 ) {g_process.shutdownAbort(true); }
// get spider rec in it
const char *srec = drec + 12 + 4;
struct tm *timeStruct ;
char time[256];
time_t ts = (time_t)Doledb::getSpiderTime(&k);
struct tm tm_buf;
timeStruct = gmtime_r(&ts,&tm_buf);
strftime ( time , 256 , "%Y%m%d-%H%M%S UTC", timeStruct );
// print doledb info first then spider request
fprintf(stdout,"dolekey=%s (n1=%" PRIu32" n0=%" PRIu64") "
"pri=%" PRId32" "
"spidertime=%s(%" PRIu32") "
"uh48=0x%" PRIx64"\n",
// print it
Spiderdb::print ( srec );
// the \n
// must be a request -- for now, for stats
if ( ! Spiderdb::isSpiderRequest((key128_t *)srec) ) {
// error!
// cast it
const SpiderRequest *sreq = (const SpiderRequest *)srec;
// skip negatives
// NOTE(review): the code below aborts rather than skips when the low key bit is clear.
if ( (sreq->m_key.n0 & 0x01) == 0x00 ) { g_process.shutdownAbort(true); }
// continue the scan from the last key of this list
startKey = *(key96_t *)list.getLastKey();
// watch out for wrap around
if ( startKey < *(key96_t *)list.getLastKey() ) return;
// Looks up the cached robots.txt reply belonging to `url` in a local
// "robots.txt" RdbCache and prints the outcome to stdout.
// Visible steps: build "<scheme>://<host>[:port]/robots.txt", initialize the
// cache with load-from-disk enabled, derive a key96_t from hash64() of that
// string, fetch the record, deserialize it as HttpCacheData and uncompress
// the reply body with gbuncompress().
//   url - URL whose robots.txt entry is wanted; a NULL or empty string
//         triggers the "you must supply a url" message
// NOTE(review): closing braces, early returns and the statement that fills
// `u` from `url` are not visible in this copy -- confirm against upstream.
void dumpRobotsTxtCache(const char *url) {
// layout the cached record is cast to: error code, reply pointer, reply size
struct HttpCacheData {
int32_t m_errno;
char *ptr_reply;
int32_t size_reply;
} __attribute__((packed));
// reject a missing or empty url
if( !url || strlen(url) <= 0 ) {
fprintf(stdout, "robots.txt.cache lookup failed, you must supply a url as parameter\n");
// Generate robots.txt url
Url u;
// build robots.txt url
char urlRobots[MAX_URL_LEN+1];
char *p = urlRobots;
// with no scheme "http://" is written; the lines after it copy the scheme
// followed by "://"
// NOTE(review): the else separating those two paths is not visible here
if ( ! u.getScheme() )
p += sprintf ( p , "http://" );
memcpy ( p , u.getScheme() , u.getSchemeLen() );
p += u.getSchemeLen();
p += sprintf(p,"://");
// append the host name
memcpy ( p , u.getHost() , u.getHostLen() );
p += u.getHostLen();
// add port if not default
if ( u.getPort() != u.getDefaultPort() ) {
p += sprintf( p, ":%" PRId32, u.getPort() );
p += sprintf ( p , "/robots.txt" );
fprintf(stdout, "robots.txt.cache lookup of %s\n", urlRobots);
// set up a cache object with the parameters below and load it from disk
RdbCache httpCacheRobots;
int32_t memRobots = 3000000;
// node budget: memory divided by 106 (the origin of 106 is not shown here)
int32_t maxCacheNodesRobots = memRobots / 106;
if ( ! httpCacheRobots.init ( memRobots ,
-1 , // fixedDataSize
maxCacheNodesRobots ,
"robots.txt" , // dbname
true, // load from disk
12, // cachekeysize
-1)) { // numPtrsMax
fprintf(stdout, "Could not initialize local robots.txt.cache\n");
int32_t numElem = httpCacheRobots.getNumUsedNodes();
fprintf(stdout,"%" PRId32 " elements in cache.\n", numElem);
char *rec;
int32_t recSize;
// cache key: n1 is zero, n0 is hash64() of the robots.txt url with the low byte inverted
key96_t k;
k.n1 = 0;
k.n0 = hash64(urlRobots, strlen(urlRobots));
k.n0 ^= 0xff; // for compressed keys
// lower 48 bits of the key, printed for reference only
int64_t uh48 = k.n0 & 0x0000ffffffffffffLL;
fprintf(stdout, "Cache key=%" PRIu64 ", uh48=%" PRIu64 "\n", k.n0, uh48);
bool inCache = httpCacheRobots.getRecord ( (collnum_t)0 , // share btwn colls
k , // cacheKey
&rec ,
&recSize ,
true , // copy?
9999999, //r->m_maxCacheAge , // 24*60*60 ,
false); // stats?
fprintf(stdout, "Found: %s\n", inCache?"true":"false");
if( inCache ) {
// reinterpret the raw record and let deserializeMsg() fix up ptr_reply
HttpCacheData *httpCacheData = reinterpret_cast<HttpCacheData*>(rec);
if( deserializeMsg(sizeof(*httpCacheData), &httpCacheData->size_reply, &httpCacheData->size_reply, &httpCacheData->ptr_reply, ((char*)httpCacheData + sizeof(*httpCacheData))) != -1) {
fprintf(stdout, "deserializeMsg OK. errno=%" PRId32 ", size_reply=%" PRId32 "\n", httpCacheData->m_errno, httpCacheData->size_reply);
// get uncompressed size
// (read from the first 4 bytes of the reply; the compressed data starts at offset 4)
uint32_t unzippedLen = *(int32_t*)httpCacheData->ptr_reply;
// sanity checks
if ( unzippedLen > 10000000 ) {
fprintf(stdout, "Unzipped length appears too big: %" PRId32 "\n", unzippedLen);
// make buffer to hold uncompressed data
char *newBuf = (char*)mmalloc(unzippedLen, "DumpUnzip");
if( ! newBuf ) {
fprintf(stdout, "Could not allocate memory for uncompressed document: %" PRId32 "\n", unzippedLen);
// make another var to get mangled by gbuncompress
uint32_t uncompressedLen = unzippedLen;
// uncompress it
int zipErr = gbuncompress( (unsigned char*)newBuf, // dst
&uncompressedLen, // dstLen
(unsigned char*)httpCacheData->ptr_reply+4, // src
httpCacheData->size_reply-4); // srcLen
// NOTE(review): the message reads "should be X but is Y" yet the arguments
// are passed as (uncompressedLen, unzippedLen), i.e. actual first and
// expected second -- the two look swapped
if(zipErr != Z_OK || uncompressedLen != (uint32_t)unzippedLen) {
fprintf(stdout, "Error unzipping compressed robots.txt unzipped len should be %" PRId32" but is %" PRId32". ziperr=%" PRId32,
(int32_t)uncompressedLen, (int32_t)unzippedLen, (int32_t)zipErr);
mfree(newBuf, unzippedLen, "DumpUnzip");
// NOTE(review): newBuf holds exactly unzippedLen bytes and is printed with
// %s; nothing visible here writes a terminating NUL -- confirm
fprintf(stdout,"\n%s\n\n", newBuf);
mfree(newBuf, unzippedLen, "DumpUnzip");
else {
fprintf(stderr,"deserialize failed\n");
// dumpSpiderdbCsv() below is compiled out by this #if 0.
#if 0
// Walks spiderdb for collection `coll` list by list, tracking for each
// SpiderRequest whether the most recently seen SpiderReply carries the same
// 48-bit url hash, and skipping requests whose url hash equals that of the
// previous request.
// Returns -1 right after logging "getList did not block", 0 on the
// wrap-around exit at the bottom.
// NOTE(review): most getList() arguments, the print statements, the record
// counter increment and the closing braces are not visible in this copy.
static int32_t dumpSpiderdbCsv(const char *coll) {
key128_t startKey;
Msg5 msg5;
RdbList list;
// number of records handled; drives the progress message
unsigned count = 0;
// last reply seen and its url hash; the reply is copied into
// prevSpiderReplyBuf so it can be carried over to the next list
const SpiderReply *prevSpiderReply = NULL;
char prevSpiderReplyBuf[sizeof(SpiderReply)+MAX_URL_LEN+100];
int64_t prevSpiderReplyUrlHash48 = 0LL;
// url hash of the previous request, used to skip duplicates
int64_t prevRequestUh48 = 0;
const CollectionRec *cr = g_collectiondb.getRec(coll);
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if( ! msg5.getList(RDB_SPIDERDB,
true, //includeTree
0, //startFileNum
-1, //numFiles
NULL, // state
NULL, // callback
0, // niceness
false, // err correction?
-1, // maxRetries
false)) // isRealMerge
log(LOG_LOGIC,"db: getList did not block.");
return -1;
// all done if empty
// loop over entries in list
for(list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord()) {
// progress message every 100000 records
// NOTE(review): count is unsigned, so count - 1 wraps when count is 0
if((count % 100000) == 0) {
fprintf( stderr, "Processed %u records.\n", count - 1);
const char *srec = list.getCurrentRec();
// a reply is only remembered; output is driven by requests
if(Spiderdb::isSpiderReply((const key128_t *)srec)) {
const SpiderReply *srep = reinterpret_cast<const SpiderReply *>(srec);
prevSpiderReplyUrlHash48 = srep->getUrlHash48();
prevSpiderReply = srep;
} else if(prevRequestUh48==Spiderdb::getUrlHash48(reinterpret_cast<const key128_t*>(srec))) {
//skip duplicate
} else {
const SpiderRequest *spiderRequest = reinterpret_cast<const SpiderRequest*>(srec);
int64_t uh48 = spiderRequest->getUrlHash48();
// count how many requests had replies and how many did not
bool hadReply = prevSpiderReply && (uh48 == prevSpiderReplyUrlHash48);
if( !hadReply ) {
// Last reply and current request do not belong together
prevSpiderReply = NULL;
prevRequestUh48 = spiderRequest->getUrlHash48();
// print it
if(prevSpiderReply) {
// Only dump these values if last reply and current request belong together
} else {
//copy prevspiderreply to tmp buf, so we can remember the value for the next list
// (copied only when key plus data fit into the buffer, otherwise the reply is dropped)
if(prevSpiderReply && sizeof(key128_t)+prevSpiderReply->m_dataSize < sizeof(prevSpiderReplyBuf)) {
memcpy(prevSpiderReplyBuf, prevSpiderReply, sizeof(key128_t)+prevSpiderReply->m_dataSize);
prevSpiderReply = reinterpret_cast<const SpiderReply*>(prevSpiderReplyBuf);
} else
prevSpiderReply = NULL;
// resume the next getList() from the last key of this list
const key128_t *listLastKey = reinterpret_cast<const key128_t *>(list.getLastKey());
startKey = *listLastKey;
// watch out for wrap around
if ( startKey < *listLastKey)
return 0;
// time speed of inserts into RdbTree for indexdb
static bool hashtest() {
// load em up
int32_t numKeys = 1000000;
log("db: speedtest: generating %" PRId32" random keys.",numKeys);
// seed randomizer
srand ( (int32_t)gettimeofdayInMilliseconds() );
// make list of one million random keys
key96_t *k = (key96_t *)mmalloc ( sizeof(key96_t) * numKeys , "main" );
if ( ! k ) {
log(LOG_WARN, "hashtest: malloc failed");
return false;
int32_t *r = (int32_t *)(void*)k;
for ( int32_t i = 0 ; i < numKeys * 3 ; i++ ) r[i] = rand();
// init the tree
//HashTableT<int32_t,int32_t> ht;
HashTable ht;
ht.set ( (int32_t)(1.1 * numKeys) );
// add to regular tree
int64_t t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < numKeys ; i++ )
if ( ! ht.addKey ( r[i] , 1 ) ) {
log(LOG_WARN, "hashtest: add key failed.");
return false;
// print time it took
int64_t e = gettimeofdayInMilliseconds();
// add times
log("db: added %" PRId32" keys in %" PRId64" ms",numKeys,e - t);
// do the delete test
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < numKeys ; i++ )
if ( ! ht.removeKey ( r[i] ) ) {
log(LOG_WARN, "hashtest: add key failed.");
return false;
// print time it took
e = gettimeofdayInMilliseconds();
// add times
log("db: deleted %" PRId32" keys in %" PRId64" ms",numKeys,e - t);
return true;
// Dumps tagdb records of collection `coll` to stdout.
//   coll, startFileNum, numFiles, includeTree - used for the Msg5::getList() reads
//   req     - output mode: 'z' runs the site-list path that prints
//             "<siteNumInlinks> <site>" pairs and skips delete keys; 'A'
//             prints each tag through printToBufAsAddRequest(); any other
//             value uses printToBuf()
//   siteArg - when non-NULL the start/end keys come from
//             Tagdb::makeStartKey()/makeEndKey() for that site
// NOTE(review): many closing braces, `continue`/`return` statements and a few
// other lines are not visible in this copy of the function, so the nesting
// suggested by the remaining lines is unreliable -- confirm against upstream.
static void dumpTagdb(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, char req,
const char *siteArg) {
g_tagdb.init ();
g_tagdb.getRdb()->addRdbBase1(coll );
key128_t startKey ;
key128_t endKey ;
// restrict the key range to one site when requested
if ( siteArg ) {
startKey = Tagdb::makeStartKey ( siteArg );
endKey = Tagdb::makeEndKey ( siteArg );
log("gb: using site %s for start key",siteArg );
Msg5 msg5;
RdbList list;
// NOTE(review): cr is dereferenced below without a visible NULL check
const CollectionRec *cr = g_collectiondb.getRec(coll);
// state for the req == 'z' site-list path: host hash of the current and the
// previous tag, the pending site string (kept in sbuf) and its inlink count
int64_t hostHash = -1;
int64_t lastHostHash = -2;
const char *site = NULL;
char sbuf[1024*2];
int32_t siteNumInlinks = -1;
// tag type ids compared against tag->m_type below
// (the hash64Lower_a() results are stored in int32_t variables)
int32_t typeSite = hash64Lower_a("site",4);
int32_t typeInlinks = hash64Lower_a("sitenuminlinks",14);
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TAGDB,
cr->m_collnum ,
&list ,
(char *)&startKey ,
(char *)&endKey ,
includeTree ,
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1, // maxRetries
false)) // isRealMerge
log(LOG_LOGIC,"db: getList did not block.");
// all done if empty
if ( list.isEmpty() ) return;
// loop over entries in list
for(list.resetListPtr();!list.isExhausted(); list.skipCurrentRecord()){
char *rec = list.getCurrentRec();
//key96_t k = list.getCurrentKey();
key128_t k;
list.getCurrentKey ( &k );
char *data = list.getCurrentData();
int32_t size = list.getCurrentDataSize();
// is it a delete?
// (low bit of n0 clear; skipped in 'z' mode, otherwise printed with the bit set)
if ( (k.n0 & 0x01) == 0 ) {
if ( req == 'z' ) continue;
printf("k.n1=%016" PRIx64" "
"k.n0=%016" PRIx64" (delete)\n",
k.n1 , k.n0 | 0x01 ); // fix it!
// point to the data
const char *p = data;
const char *pend = data + size;
// breach check
// (an empty data section is reported as a corrupt record)
if ( p >= pend ) {
printf("corrupt tagdb rec k.n0=%" PRIu64,k.n0);
// parse it up
Tag *tag = (Tag *)rec;
// print the version and site
StackBuf<1024> sb;
// match: this tag has the same host hash (key n1) as the previous tag;
// otherwise the pending site/inlink state is reset
bool match = false;
hostHash = tag->m_key.n1;
if ( hostHash == lastHostHash ) {
match = true;
else {
site = NULL;
siteNumInlinks = -1;
lastHostHash = hostHash;
// making sitelist.txt?
if ( tag->m_type == typeSite && req == 'z' ) {
site = tag->getTagData();
// make it null if too many .'s
if ( site ) {
const char *p = site;
// counters for '.', alphabetic-or-'-' characters and ':' in the site string
int count = 0;
int alpha = 0;
int colons = 0;
// foo.bar.baz.com is ok
// NOTE(review): the statement that increments `alpha` is not visible in this copy
for ( ; *p ; p++ ) {
if ( *p == '.' ) count++;
if ( *p == ':' ) colons++;
if ( is_alpha_a(*p) || *p=='-' )
if ( count >= 4 )
site = NULL;
if ( colons > 1 )
site = NULL;
// no ip addresses allowed, need an alpha char
if ( alpha == 0 )
site = NULL;
// ends in :?
// NOTE(review): site[slen-1] and site[slen-2] index before the start of the
// string when site is empty or one character long -- confirm whether
// getTagData() can return such a string
int slen = 0;
if ( site ) slen = strlen(site);
if ( site && site[slen-1] == ':' )
site = NULL;
// port bug
if ( site && site[slen-2] == ':' && site[slen-1]=='/')
site = NULL;
// remove heavy spammers to save space
if ( site && strstr(site,"daily-camshow-report") )
site = NULL;
if ( site && strstr(site,".livejasminhd.") )
site = NULL;
if ( site && strstr(site,".pornlivenews.") )
site = NULL;
if ( site && strstr(site,".isapornblog.") )
site = NULL;
if ( site && strstr(site,".teen-model-24.") )
site = NULL;
// drop sites that fail the is_ascii2_a() check
if ( site && ! is_ascii2_a ( site, strlen(site) ) ) {
site = NULL;
// inlink count already seen for this host: emit the pair now
if ( match && siteNumInlinks>=0) {
// if we ask for 1 or 2 we end up with 100M
// entries, but with 3+ we get 27M
if ( siteNumInlinks > 2 && site )
printf("%i %s\n",siteNumInlinks,site);
siteNumInlinks = -1;
site = NULL;
// save it
// NOTE(review): unbounded strcpy into the 2048-byte sbuf -- confirm that
// tag data cannot exceed that size
if ( site ) strcpy ( sbuf , site );
// inlink-count tag: remember the value and, when the site of the same host
// is already saved in sbuf, emit the pair
if ( tag->m_type == typeInlinks && req == 'z' ) {
siteNumInlinks = atoi(tag->getTagData());
if ( match && site ) {
// if we ask for 1 or 2 we end up with 100M
// entries, but with 3+ we get 27M
if ( siteNumInlinks > 2 )
printf("%i %s\n",siteNumInlinks,sbuf);
siteNumInlinks = -1;
site = NULL;
// NOTE(review): the statement controlled by this `if` is not visible in this copy
if ( req == 'z' )
// print as an add request or just normal
if ( req == 'A' ) tag->printToBufAsAddRequest ( &sb );
else tag->printToBuf ( &sb );
// dump it
// resume the next getList() from the last key of this list
startKey = *(key128_t *)list.getLastKey();
// watch out for wrap around
if ( startKey < *(key128_t *)list.getLastKey() ){
printf("\n"); return;}
// Scans titledb for collection `coll` and writes one '|'-separated line to
// stdout for each finding:
//   docId|<reason>|url                  - first url flagged by isUrlUnwanted()
//   docId|redir <reason>|url|redirUrl   - redirect url flagged by isUrlUnwanted()
//   docId|blocked content type|url      - content type is one of the listed CT_* values
//   docId|blocked content|url           - WantedChecker::check_single_content() says not wanted
//   coll, startFileNum, numFiles, includeTree - used for the Msg5::getList() reads
// NOTE(review): closing braces, the early returns after the setup failures,
// the `continue`/`break` statements and the cleanup of `xd` are not visible in
// this copy of the function -- confirm against upstream.
static void dumpUnwantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree) {
// argument check: a non-zero startFileNum needs an explicit numFiles
if(startFileNum!=0 && numFiles<0) {
//this may apply to all files, but I haven't checked into hash-based ones yet
fprintf(stderr,"If <startFileNum> is specified then <numFiles> must be too\n");
// one-time setup: unicode maps, utf-8 conversion and the hash tables
const char *errmsg=NULL;
if (!UnicodeMaps::load_maps(unicode_data_dir,&errmsg)) {
log("Unicode initialization failed! %s", errmsg);
if(!utf8_convert_initialize()) {
log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." );
g_titledb.init ();
// NOTE(review): no initialization of endKey or lastKey is visible here -- confirm
key96_t startKey ;
key96_t endKey ;
key96_t lastKey ;
startKey = Titledb::makeFirstKey(0);
Msg5 msg5;
RdbList list;
// not referenced again in the lines visible here
HashTableX dedupTable;
// make this
// (one heap-allocated XmlDoc is reused for every record via set2())
XmlDoc *xd;
try {
xd = new (XmlDoc);
catch(std::bad_alloc&) {
fprintf(stdout,"could not alloc for xmldoc\n");
const CollectionRec *cr = g_collectiondb.getRec(coll);
if(cr==NULL) {
fprintf(stderr,"Unknown collection '%s'\n", coll);
// initialize shlib & blacklist
if (!WantedChecker::initialize()) {
fprintf(stderr, "Unable to initialize WantedChecker");
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TITLEDB ,
cr->m_collnum ,
&list ,
&startKey ,
&endKey ,
includeTree ,
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1 , // maxRetries
false)) // isRealMerge
log(LOG_LOGIC,"db: getList did not block.");
// all done if empty
if ( list.isEmpty() ) {
// loop over entries in list
for(list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord()) {
key96_t k = list.getCurrentKey();
char *rec = list.getCurrentRec();
int32_t recSize = list.getCurrentRecSize();
int64_t docId = Titledb::getDocIdFromKey(&k);
// log any key that is not strictly greater than the previous one
if ( k <= lastKey ) {
log("key out of order. lastKey.n1=%" PRIx32" n0=%" PRIx64" currKey.n1=%" PRIx32" n0=%" PRIx64" ",
lastKey.n1, lastKey.n0, k.n1, k.n0);
lastKey = k;
// low bit of n0 clear marks a delete key
if ( (k.n0 & 0x01) == 0) {
// delete key
// free the mem
// uncompress the title rec
if (!xd->set2(rec, recSize, coll, 0)) {
//set2() may have logged something but not the docid
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
// extract the url
Url *url = xd->getFirstUrl();
const char *reason = NULL;
// check 1: the document's first url
if (isUrlUnwanted(*url, &reason)) {
fprintf(stdout, "%" PRId64"|%s|%s\n", docId, reason, url->getUrl());
// check 2: the redirect url, when one is present
Url **redirUrlPtr = xd->getRedirUrl();
if (redirUrlPtr && *redirUrlPtr) {
Url *redirUrl = *redirUrlPtr;
if (isUrlUnwanted(*redirUrl, &reason)) {
fprintf(stdout, "%" PRId64"|redir %s|%s|%s\n", docId, reason, url->getUrl(), redirUrl->getUrl());
// check 3: content types reported as blocked
// NOTE(review): the pointer returned by getContentType() is dereferenced
// without a visible validity check
uint8_t *contentType = xd->getContentType();
switch (*contentType) {
case CT_GIF:
case CT_JPG:
case CT_PNG:
case CT_TIFF:
case CT_BMP:
case CT_JS:
case CT_CSS:
case CT_JSON:
case CT_IMAGE:
case CT_GZ:
case CT_ARC:
case CT_WARC:
fprintf(stdout, "%" PRId64"|blocked content type|%s\n", docId, url->getUrl());
// check content
// (length passed on is size_utf8Content minus one, or 0 when the size is not positive)
int32_t contentLen = xd->size_utf8Content > 0 ? (xd->size_utf8Content - 1) : 0;
if (contentLen > 0) {
if (!WantedChecker::check_single_content(url->getUrl(), xd->ptr_utf8Content, contentLen).wanted) {
fprintf(stdout, "%" PRId64"|blocked content|%s\n", docId, url->getUrl());
// resume the next getList() from the last key of this list
startKey = *(key96_t *)list.getLastKey();
// watch out for wrap around
if ( startKey < *(key96_t *)list.getLastKey() ) {
// Scans titledb for collection `coll` and prints "docId|url" lines to stdout.
// The visible filter calls isUrlUnwanted() on the first url and on the
// redirect url (when one is present).
//   coll, startFileNum, numFiles, includeTree - used for the Msg5::getList() reads
// NOTE(review): closing braces, early returns and `continue` statements are
// not visible in this copy of the function, so exactly which combination of
// the two checks leads to the fprintf() cannot be read off here -- confirm
// against upstream.
static void dumpWantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree) {
// argument check: a non-zero startFileNum needs an explicit numFiles
if(startFileNum!=0 && numFiles<0) {
//this may apply to all files, but I haven't checked into hash-based ones yet
fprintf(stderr,"If <startFileNum> is specified then <numFiles> must be too\n");
// one-time setup: unicode maps, utf-8 conversion and the hash tables
const char *errmsg=NULL;
if (!UnicodeMaps::load_maps(unicode_data_dir,&errmsg)) {
log("Unicode initialization failed! %s", errmsg);
if(!utf8_convert_initialize()) {
log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." );
g_titledb.init ();
// NOTE(review): no initialization of endKey or lastKey is visible here -- confirm
key96_t startKey ;
key96_t endKey ;
key96_t lastKey ;
startKey = Titledb::makeFirstKey(0);
Msg5 msg5;
RdbList list;
// not referenced again in the lines visible here
HashTableX dedupTable;
// make this
// (one heap-allocated XmlDoc is reused for every record via set2())
XmlDoc *xd;
try {
xd = new (XmlDoc);
catch(std::bad_alloc&) {
fprintf(stdout,"could not alloc for xmldoc\n");
const CollectionRec *cr = g_collectiondb.getRec(coll);
if(cr==NULL) {
fprintf(stderr,"Unknown collection '%s'\n", coll);
// initialize shlib & blacklist
if (!WantedChecker::initialize()) {
fprintf(stderr, "Unable to initialize WantedChecker");
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TITLEDB ,
cr->m_collnum ,
&list ,
&startKey ,
&endKey ,
includeTree ,
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1 , // maxRetries
false)) // isRealMerge
log(LOG_LOGIC,"db: getList did not block.");
// all done if empty
if ( list.isEmpty() ) {
// loop over entries in list
for(list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord()) {
key96_t k = list.getCurrentKey();
char *rec = list.getCurrentRec();
int32_t recSize = list.getCurrentRecSize();
int64_t docId = Titledb::getDocIdFromKey(&k);
// log any key that is not strictly greater than the previous one
if ( k <= lastKey ) {
log("key out of order. lastKey.n1=%" PRIx32" n0=%" PRIx64" currKey.n1=%" PRIx32" n0=%" PRIx64" ",
lastKey.n1, lastKey.n0, k.n1, k.n0);
lastKey = k;
// low bit of n0 clear marks a delete key
if ( (k.n0 & 0x01) == 0) {
// delete key
// free the mem
// uncompress the title rec
if (!xd->set2(rec, recSize, coll, 0)) {
//set2() may have logged something but not the docid
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
// extract the url
Url *url = xd->getFirstUrl();
const char *reason = NULL;
// first url must not be flagged; the redirect url is checked next
if( ! isUrlUnwanted(*url, &reason)) {
Url **redirUrlPtr = xd->getRedirUrl();
if (redirUrlPtr && *redirUrlPtr) {
Url *redirUrl = *redirUrlPtr;
if (isUrlUnwanted(*redirUrl, &reason)) {
// output line: docId and first url
fprintf(stdout, "%" PRId64 "|%s\n", docId, url->getUrl());
// resume the next getList() from the last key of this list
startKey = *(key96_t *)list.getLastKey();
// watch out for wrap around
if ( startKey < *(key96_t *)list.getLastKey() ) {
// Scans titledb for collection `coll`, runs CheckAdult on each document that
// reaches the check and prints one tab-separated line to stdout for every
// document CheckAdult::isDocAdult() reports as adult: docId, url, index time
// string, reason, score, unique matched words/phrases, words checked,
// EMPTYDOC/HASCONTENT and the debug info.
//   coll, startFileNum, numFiles, includeTree - used for the Msg5::getList() reads
// NOTE(review): closing braces, early returns, `continue` statements, the
// #endif of the #if 0 section and the statement that fills idxtim_s are not
// visible in this copy of the function -- confirm against upstream.
static void dumpAdultTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree) {
// argument check: a non-zero startFileNum needs an explicit numFiles
if(startFileNum!=0 && numFiles<0) {
//this may apply to all files, but I haven't checked into hash-based ones yet
fprintf(stderr,"If <startFileNum> is specified then <numFiles> must be too\n");
// one-time setup: unicode maps, utf-8 conversion and the hash tables
const char *errmsg=NULL;
if (!UnicodeMaps::load_maps(unicode_data_dir,&errmsg)) {
log("Unicode initialization failed! %s", errmsg);
if(!utf8_convert_initialize()) {
log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." );
g_titledb.init ();
// NOTE(review): no initialization of endKey or lastKey is visible here -- confirm
key96_t startKey ;
key96_t endKey ;
key96_t lastKey ;
startKey = Titledb::makeFirstKey(0);
Msg5 msg5;
RdbList list;
// make this
// (one heap-allocated XmlDoc is reused for every record via set2())
XmlDoc *xd;
try {
xd = new (XmlDoc);
catch(std::bad_alloc&) {
fprintf(stdout,"could not alloc for xmldoc\n");
const CollectionRec *cr = g_collectiondb.getRec(coll);
if(cr==NULL) {
fprintf(stderr,"Unknown collection '%s'\n", coll);
// initialize shlib & blacklist
if (!WantedChecker::initialize()) {
fprintf(stderr, "Unable to initialize WantedChecker");
// load the word and phrase lists used by the adult check
g_checkAdultList.init("adultwords.txt", "adultphrases.txt");
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TITLEDB ,
cr->m_collnum ,
&list ,
&startKey ,
&endKey ,
includeTree ,
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1 , // maxRetries
false)) // isRealMerge
log(LOG_LOGIC,"db: getList did not block.");
// all done if empty
if ( list.isEmpty() ) {
// loop over entries in list
for(list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord()) {
key96_t k = list.getCurrentKey();
char *rec = list.getCurrentRec();
int32_t recSize = list.getCurrentRecSize();
int64_t docId = Titledb::getDocIdFromKey(&k);
// log any key that is not strictly greater than the previous one
if ( k <= lastKey ) {
log("key out of order. lastKey.n1=%" PRIx32" n0=%" PRIx64" currKey.n1=%" PRIx32" n0=%" PRIx64" ",
lastKey.n1, lastKey.n0, k.n1, k.n0);
lastKey = k;
// low bit of n0 clear marks a delete key
if ( (k.n0 & 0x01) == 0) {
// delete key
// free the mem
// uncompress the title rec
if (!xd->set2(rec, recSize, coll, 0)) {
//set2() may have logged something but not the docid
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
// extract the url
// (a return value of (void *)-1 is treated as failure)
Url *url = xd->getFirstUrl();
if( url == (void *)-1 ) {
log(LOG_WARN, "dbdump: XmlDoc::getFirstUrl() failed for docid %" PRId64, docId);
const char *reason = NULL;
// Don't dump unwanted URLs
if( ! isUrlUnwanted(*url, &reason)) {
Url **redirUrlPtr = xd->getRedirUrl();
if (redirUrlPtr && *redirUrlPtr) {
Url *redirUrl = *redirUrlPtr;
if (isUrlUnwanted(*redirUrl, &reason)) {
// Get adult flag including generating debug info.
// Could just call xd->getIsAdult() to get the simple indicator
// without debug information.
CheckAdult achk(xd, true);
bool newblocked = achk.isDocAdult();
// disabled cross-check of the result against xd->getIsAdult()
#if 0
// Sanity check.
bool gbadult = false;
char *adultbit = xd->getIsAdult();
if( adultbit ) {
if( *adultbit != newblocked ) {
// Mismatch - should never happen
log(LOG_ERROR, "Adult check mismatch! docid=%" PRId64 ", url=%s, gbadult=%s, score=%" PRId32 ", newblock=%s",
docId, url->getUrl(), gbadult?"true":"false", achk.getScore(), newblocked?"true":"false");
// print the result line only for documents flagged as adult
if( newblocked ) {
// convert the indexed time to broken-down UTC
// NOTE(review): idxtim_s is printed below but no statement writing it is
// visible here, and tm1 is not used in the visible lines -- confirm
time_t idxtim = (time_t)xd->getIndexedTime();
struct tm tm_buf;
tm *tm1 = gmtime_r(&idxtim,&tm_buf);
char idxtim_s[64];
fprintf(stdout, "%" PRId64 "\t%s\t%s\t%s\tscore=%" PRId32 "\tunique dw=%" PRId32 "\tunique dp=%" PRId32 "\twords=%" PRId32 "\t%s\t%s\n",
docId, url->getUrl(), idxtim_s, achk.getReason(),
achk.getScore(), achk.getNumUniqueMatchedWords(), achk.getNumUniqueMatchedPhrases(),
achk.getNumWordsChecked(), achk.hasEmptyDocumentBody()?"EMPTYDOC":"HASCONTENT", achk.getDebugInfo());
// resume the next getList() from the last key of this list
startKey = *(key96_t *)list.getLastKey();
// watch out for wrap around
if ( startKey < *(key96_t *)list.getLastKey() ) {
// Scans titledb for collection `coll`, runs CheckSpam on each document that
// reaches the check and prints one tab-separated line to stdout for every
// document CheckSpam::isDocSpam() reports as spam: docId, url, index time
// string, reason, score, unique matched words/phrases, words checked,
// EMPTYDOC/HASCONTENT and the debug info.
//   coll, startFileNum, numFiles, includeTree - used for the Msg5::getList() reads
// NOTE(review): closing braces, early returns, `continue` statements and the
// statement that fills idxtim_s are not visible in this copy of the function
// -- confirm against upstream.
static void dumpSpamTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree) {
// argument check: a non-zero startFileNum needs an explicit numFiles
if(startFileNum!=0 && numFiles<0) {
//this may apply to all files, but I haven't checked into hash-based ones yet
fprintf(stderr,"If <startFileNum> is specified then <numFiles> must be too\n");
// one-time setup: unicode maps, utf-8 conversion and the hash tables
const char *errmsg=NULL;
if (!UnicodeMaps::load_maps(unicode_data_dir,&errmsg)) {
log("Unicode initialization failed! %s", errmsg);
if(!utf8_convert_initialize()) {
log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." );
g_titledb.init ();
// NOTE(review): no initialization of endKey or lastKey is visible here -- confirm
key96_t startKey ;
key96_t endKey ;
key96_t lastKey ;
startKey = Titledb::makeFirstKey(0);
Msg5 msg5;
RdbList list;
// make this
// (one heap-allocated XmlDoc is reused for every record via set2())
XmlDoc *xd;
try {
xd = new (XmlDoc);
catch(std::bad_alloc&) {
fprintf(stdout,"could not alloc for xmldoc\n");
const CollectionRec *cr = g_collectiondb.getRec(coll);
if(cr==NULL) {
fprintf(stderr,"Unknown collection '%s'\n", coll);
// initialize shlib & blacklist
if (!WantedChecker::initialize()) {
fprintf(stderr, "Unable to initialize WantedChecker");
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TITLEDB ,
cr->m_collnum ,
&list ,
&startKey ,
&endKey ,
includeTree ,
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1 , // maxRetries
false)) // isRealMerge
log(LOG_LOGIC,"db: getList did not block.");
// all done if empty
if ( list.isEmpty() ) {
// loop over entries in list
for(list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord()) {
key96_t k = list.getCurrentKey();
char *rec = list.getCurrentRec();
int32_t recSize = list.getCurrentRecSize();
int64_t docId = Titledb::getDocIdFromKey(&k);
// log any key that is not strictly greater than the previous one
if ( k <= lastKey ) {
log("key out of order. lastKey.n1=%" PRIx32" n0=%" PRIx64" currKey.n1=%" PRIx32" n0=%" PRIx64" ",
lastKey.n1, lastKey.n0, k.n1, k.n0);
lastKey = k;
// low bit of n0 clear marks a delete key
if ( (k.n0 & 0x01) == 0) {
// delete key
// free the mem
// uncompress the title rec
if (!xd->set2(rec, recSize, coll, 0)) {
//set2() may have logged something but not the docid
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
// extract the url
// (a return value of (void *)-1 is treated as failure)
Url *url = xd->getFirstUrl();
if( url == (void *)-1 ) {
log(LOG_WARN, "dbdump: XmlDoc::getFirstUrl() failed for docid %" PRId64, docId);
const char *reason = NULL;
// Don't dump unwanted URLs
if( ! isUrlUnwanted(*url, &reason)) {
Url **redirUrlPtr = xd->getRedirUrl();
if (redirUrlPtr && *redirUrlPtr) {
Url *redirUrl = *redirUrlPtr;
if (isUrlUnwanted(*redirUrl, &reason)) {
// Run the spam check on this document; the second constructor argument is
// presumably the "generate debug info" switch, since getDebugInfo() is
// printed below -- TODO confirm
CheckSpam schk(xd, true);
bool newblocked = schk.isDocSpam();
// print the result line only for documents flagged as spam
if( newblocked ) {
// convert the indexed time to broken-down UTC
// NOTE(review): idxtim_s is printed below but no statement writing it is
// visible here, and tm1 is not used in the visible lines -- confirm
time_t idxtim = (time_t)xd->getIndexedTime();
struct tm tm_buf;
tm *tm1 = gmtime_r(&idxtim,&tm_buf);
char idxtim_s[64];
fprintf(stdout, "%" PRId64 "\t%s\t%s\t%s\tscore=%" PRId32 "\tunique dw=%" PRId32 "\tunique dp=%" PRId32 "\twords=%" PRId32 "\t%s\t%s\n",
docId, url->getUrl(), idxtim_s, schk.getReason(),
schk.getScore(), schk.getNumUniqueMatchedWords(), schk.getNumUniqueMatchedPhrases(),
schk.getNumWordsChecked(), schk.hasEmptyDocumentBody()?"EMPTYDOC":"HASCONTENT", schk.getDebugInfo());
// resume the next getList() from the last key of this list
startKey = *(key96_t *)list.getLastKey();
// watch out for wrap around
if ( startKey < *(key96_t *)list.getLastKey() ) {
// Parse-speed benchmark for a single title record.
// Loads the titledb record of `docId` from collection `coll`, sets an XmlDoc
// from it, calls summaryTest1() and then times a series of stages in loops
// of 100 iterations each (XmlDoc::set2, mmalloc/mfree, Xml::set, tokenizing,
// Pos, Bits, Sections, Phrases, Xml::getText, Matches), logging the
// per-iteration average in milliseconds.
//   coll  - collection name
//   docId - document whose title record is benchmarked
//   query - query string handed to summaryTest1() and used for Matches
// Returns false after logging when a stage fails, true at the end.
// NOTE(review): closing braces, the bodies of the two tokenizer timing loops
// and the argument lines of two logf() calls are not visible in this copy of
// the function -- confirm against upstream.
static bool parseTest(const char *coll, int64_t docId, const char *query) {
// raise the configured memory limit for this run
g_conf.m_maxMem = 2000000000LL; // 2G
g_titledb.init ();
g_titledb.getRdb()->addRdbBase1 ( coll );
log(LOG_INIT, "build: Testing parse speed of html docId %" PRId64".",docId);
RdbList tlist;
// key range covering just this docId
key96_t startKey = Titledb::makeFirstKey ( docId );
key96_t endKey = Titledb::makeLastKey ( docId );
// a niceness of 0 tells it to block until it gets results!!
Msg5 msg5;
// NOTE(review): cr is dereferenced below without a visible NULL check
const CollectionRec *cr = g_collectiondb.getRec(coll);
if ( ! msg5.getList ( RDB_TITLEDB ,
cr->m_collnum ,
&tlist ,
(char *)&startKey ,
(char *)&endKey , // should be maxed!
9999999 , // min rec sizes
true , // include tree?
0 , // startFileNum
-1 , // m_numFiles
NULL , // state
NULL , // callback
0 , // niceness
false , // do error correction?
-1 , // maxRetries
false)) { // isRealMerge
log(LOG_LOGIC, "build: getList did not block.");
return false;
// get the title rec
if ( tlist.isEmpty() ) {
log(LOG_WARN, "build: speedtestxml: docId %" PRId64" not found.", docId);
return false;
// unicode maps and utf-8 conversion are initialized before the doc is set
const char *errmsg=NULL;
if (!UnicodeMaps::load_maps(unicode_data_dir,&errmsg)) {
log("Unicode initialization failed! %s", errmsg);
return false;
if(!utf8_convert_initialize()) {
log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
return false;
// get raw rec from list
// (the whole list size is passed as the record size to set2())
char *rec = tlist.getCurrentRec();
int32_t listSize = tlist.getListSize ();
XmlDoc xd;
if (!xd.set2(rec, listSize, coll, 0)) {
log(LOG_WARN, "build: speedtestxml: Error setting xml doc.");
return false;
log("build: Doc url is %s",xd.ptr_firstUrl);//tr.getUrl()->getUrl());
log("build: Doc is %" PRId32" bytes long.",xd.size_utf8Content-1);
log("build: Doc charset is %s",get_charset_str(xd.m_charset));
// time the summary/title generation code
log("build: Using query %s",query);
// (the return value of summaryTest1() is not checked)
summaryTest1 ( rec , listSize , coll , docId , query );
// for a 128k latin1 doc: (access time is probably 15-20ms)
// 1.18 ms to set title rec (6ms total)
// 1.58 ms to set Xml
// 1.71 ms to set Words (~50% from Words::countWords())
// 0.42 ms to set Pos
// 0.66 ms to set Bits
// 0.51 ms to set Scores
// 0.35 ms to getText()
// speed test
// stage: XmlDoc::set2() on the raw record
int64_t t = gettimeofdayInMilliseconds();
for ( int32_t k = 0 ; k < 100 ; k++ )
xd.set2(rec, listSize, coll, 0);
int64_t e = gettimeofdayInMilliseconds();
// NOTE(review): the argument line of this logf() is not visible in this copy
logf(LOG_DEBUG,"build: Took %.3f ms to set title rec.",
// speed test
// stage: 300KB mmalloc()/mfree() pairs
// NOTE(review): mm is handed to mfree() without a visible NULL check
t = gettimeofdayInMilliseconds();
for ( int32_t k = 0 ; k < 100 ; k++ ) {
char *mm = (char *)mmalloc ( 300*1024 , "ztest");
mfree ( mm , 300*1024 ,"ztest");
e = gettimeofdayInMilliseconds();
// NOTE(review): the argument line of this logf() is not visible in this copy
logf(LOG_DEBUG,"build: Took %.3f ms to do mallocs.",
// get content
char *content = xd.ptr_utf8Content;//tr.getContent();
int32_t contentLen = xd.size_utf8Content-1;//tr.getContentLen();
// loop parse
// stage: Xml::set() over the utf-8 content, parsed as CT_HTML
Xml xml;
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ ) {
if ( !xml.set( content, contentLen, xd.m_version, CT_HTML ) ) {
log(LOG_WARN, "build: speedtestxml: xml set: %s", mstrerror(g_errno));
return false;
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Xml::set() took %.3f ms to parse docId %" PRId64".",
(double)(e - t)/100.0,docId);
// throughput in bytes per millisecond for one parse
// NOTE(review): when e == t this divides by zero in floating point
double bpms = contentLen/((double)(e-t)/100.0);
log("build: %.3f bytes/msec", bpms);
// get per char and per byte speeds
// loop parse
// stage: a second Xml::set() timing with the same arguments as above
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ ) {
if ( !xml.set( content, contentLen, xd.m_version, CT_HTML ) ) {
log(LOG_WARN, "build: xml(setparents=false): %s", mstrerror(g_errno));
return false;
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Xml::set(setparents=false) took %.3f ms to "
"parse docId %" PRId64".", (double)(e - t)/100.0,docId);
// stage: tokenizing; tr is the token container used by all later stages
// NOTE(review): the loop body that fills tr is not visible in this copy
TokenizerResult tr;
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ ) {
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Words::set(xml) took %.3f ms for %zu words"
" for docId %" PRId64".",
(double)(e - t)/100.0,tr.size(),docId);
// stage: second tokenizing timing (loop body likewise not visible here)
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ ) {
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Words::set(content,computeIds=true) "
"took %.3f ms for %zu words "
"for docId %" PRId64".",
(double)(e - t)/100.0,tr.size(),docId);
// stage: Pos::set() on the tokens
Pos pos;
// computeWordIds from xml
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ )
if ( ! pos.set ( &tr ) ) {
log(LOG_WARN, "build: speedtestxml: pos set: %s", mstrerror(g_errno));
return false;
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Pos::set() "
"took %.3f ms for %zu words "
"for docId %" PRId64".",
(double)(e - t)/100.0,tr.size(),docId);
// stage: Bits::setForSummary() on the tokens
Bits bits;
// computeWordIds from xml
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ )
if ( ! bits.setForSummary ( &tr ) ) {
log(LOG_WARN, "build: speedtestxml: Bits set: %s", mstrerror(g_errno));
return false;
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Bits::setForSummary() "
"took %.3f ms for %zu words "
"for docId %" PRId64".",
(double)(e - t)/100.0,tr.size(),docId);
// stage: Sections::set() (its result is logged under the label "Scores::set()")
Sections sections;
// computeWordIds from xml
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ )
// do not supply xd so it will be set from scratch
if ( !sections.set( &tr, &bits, NULL, 0 ) ) {
log(LOG_WARN, "build: speedtestxml: sections set: %s", mstrerror(g_errno));
return false;
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Scores::set() "
"took %.3f ms for %zu words "
"for docId %" PRId64".",
(double)(e - t)/100.0,tr.size(),docId);
//Phrases phrases;
// stage: Phrases::set() from tokens and bits
Phrases phrases;
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ )
if ( !phrases.set( tr, bits ) ) {
log(LOG_WARN, "build: speedtestxml: Phrases set: %s", mstrerror(g_errno));
return false;
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Phrases::set() "
"took %.3f ms for %zu words "
"for docId %" PRId64".",
(double)(e - t)/100.0,tr.size(),docId);
// stage: Xml::getText() into a buffer of twice the content length plus one
// NOTE(review): buf is neither NULL-checked nor freed in the visible lines
char *buf = (char *)mmalloc(contentLen*2+1,"main");
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ )
if ( !xml.getText( buf, contentLen * 2 + 1, 0, 9999999, true ) ) {
log(LOG_WARN, "build: speedtestxml: getText: %s", mstrerror(g_errno));
return false;
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Xml::getText(computeIds=false) took %.3f ms for docId "
"%" PRId64".",(double)(e - t)/100.0,docId);
// stage: Xml::getText() again, keeping the returned length
// (a returned length of 0 is treated as failure)
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ ) {
int32_t bufLen = xml.getText( buf, contentLen * 2 + 1, 0, 9999999, true );
if ( ! bufLen ) {
log(LOG_WARN, "build: speedtestxml: getText: %s", mstrerror(g_errno));
return false;
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Xml::getText(computeIds=false) w/ word::set() "
"took %.3f ms for docId "
"%" PRId64".",(double)(e - t)/100.0,docId);
// stage: Matches::addMatches() for the supplied query against the tokens
Matches matches;
Query q;
q.set(query, langUnknown, 1.0, 1.0, NULL, false, true, ABS_MAX_QUERY_TERMS);
matches.setQuery ( &q );
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ ) {
if ( ! matches.addMatches ( &tr ) ) {
log(LOG_WARN, "build: speedtestxml: matches set: %s", mstrerror(g_errno));
return false;
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Matches::set() took %.3f ms for %zu words"
" for docId %" PRId64".",
(double)(e - t)/100.0,tr.size(),docId);
return true;
// Timing harness: 100 times over, builds an XmlDoc from the given title rec
// with set2() and parses its UTF-8 content with Xml::set(), then logs the
// average wall-clock time per pass and the resulting bytes/msec.
//   rec/listSize: raw record and its size, handed to XmlDoc::set2()
//   coll:         collection name, handed to XmlDoc::set2()
//   docId:        only used in the log message
//   query:        used to set up a Query object
// Returns false as soon as set2() fails, true otherwise.
// NOTE(review): closing braces (and possibly other statements) are absent from
// this copy of the file, so the exact extent of the loop body cannot be
// confirmed from here.
// NOTE(review): q is set from `query` but not used by any line visible here.
static bool summaryTest1(char *rec, int32_t listSize, const char *coll, int64_t docId, const char *query) {
// start the timer
int64_t t = gettimeofdayInMilliseconds();
Query q;
q.set(query, langUnknown, 1.0, 1.0, NULL, false, true, ABS_MAX_QUERY_TERMS);
// declared outside the loop because contentLen is read again for the
// bytes/msec figure after the loop
char *content ;
int32_t contentLen ;
// loop parse
for ( int32_t i = 0 ; i < 100 ; i++ ) {
XmlDoc xd;
if (!xd.set2(rec, listSize, coll, 0)) {
log(LOG_ERROR,"%s:%s: XmlDoc.set2 failed", __FILE__, __func__);
return false;
// get content
content = xd.ptr_utf8Content;//tr.getContent();
// size_utf8Content is reduced by one to get the length
// NOTE(review): presumably the size counts a trailing NUL - confirm
contentLen = xd.size_utf8Content-1;//tr.getContentLen();
// now parse into xhtml (takes 15ms on lenny)
Xml xml;
xml.set( content, contentLen, xd.m_version, CT_HTML );
// print time it took
int64_t e = gettimeofdayInMilliseconds();
// total elapsed ms divided by the 100 passes = average ms per pass
log("build: V1 Summary/Title/Gigabits generation took %.3f ms for docId "
"%" PRId64".",
(double)(e - t)/100.0,docId);
// throughput: content bytes per average millisecond
double bpms = contentLen/((double)(e-t)/100.0);
log("build: %.3f bytes/msec", bpms);
return true;
// Walks the posdb records of collection `coll` by repeatedly fetching lists
// through Msg5 and advancing startKey to the last key of each list, printing
// information about the keys to stdout.
//   startFileNum, numFiles: checked against the number of posdb files below
//   includeTree: NOTE(review): not referenced by any line visible here;
//                presumably forwarded to Msg5::getList() as in
//                dumpClusterdb() - confirm
//   termId:      when >= 0 the start/end keys are built for that term id
//   justVerify:  when true, posdb is not initialized here and the per-record
//                loop is skipped (its loop condition tests !justVerify)
// NOTE(review): closing braces and several statement lines (parts of the
// getList() argument list, the head of the record printf, ...) are absent from
// this copy of the file; the comments describe only the visible lines.
void dumpPosdb (const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, int64_t termId , bool justVerify ) {
// only set up posdb ourselves when we are not in verify-only mode
if ( ! justVerify ) {
g_posdb.init ();
g_posdb.getRdb()->addRdbBase1(coll );
key144_t startKey ;
key144_t endKey ;
// restrict the key range to a single term if one was requested
if ( termId >= 0 ) {
Posdb::makeStartKey ( &startKey, termId );
Posdb::makeEndKey ( &endKey, termId );
printf("termid=%" PRIu64"\n", (uint64_t)termId);
// bail if not
if ( g_posdb.getRdb()->getNumFiles() <= startFileNum && numFiles > 0 ) {
printf("Request file #%" PRId32" but there are only %" PRId32" "
"posdb files\n",startFileNum,
// previous key seen, used for the out-of-order check in the record loop
key144_t lastKey;
Msg5 msg5;
RdbList list;
// set this flag so Msg5.cpp if it does error correction does not
// try to get the list from a twin...
g_isDumpingRdbFromMain = true;
const CollectionRec *cr = g_collectiondb.getRec(coll);
for (;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if (!msg5.getList(RDB_POSDB,
NULL, // state
NULL, // callback
0, // niceness
true, // to debug RdbList::removeBadData_r()
-1, // maxRetries
false)) // isRealMerge
log(LOG_LOGIC, "db: getList did not block.");
// all done if empty
if (list.isEmpty()) return;
// get last key in list
const char *ek2 = list.getEndKey();
// print it
printf("ek=%s\n", KEYSTR(ek2, list.getKeySize()));
// loop over entries in list
for (list.resetListPtr(); !list.isExhausted() && !justVerify; list.skipCurrentRecord()) {
// NOTE(review): no line filling k from the list is visible in this copy
key144_t k;
// compare to last
const char *err = "";
if (KEYCMP((char *)&k, (char *)&lastKey, sizeof(key144_t)) < 0)
err = " (out of order)";
lastKey = k;
// is it a delete?
const char *dd = "";
// a clear low bit in n0 is reported as a delete
if ((k.n0 & 0x01) == 0x00) dd = " (delete)";
int64_t d = Posdb::getDocId(&k);
uint8_t dh = Docid::getDomHash8FromDocId(d);
char *rec = list.getCurrentRec();
// record size is derived from flag bits in the first byte:
// 0x04 set -> 6 bytes, else 0x02 set -> 12 bytes, else 18 bytes
int32_t recSize = 18;
if (rec[0] & 0x04) recSize = 6;
else if (rec[0] & 0x02) recSize = 12;
// alignment bits check
if (recSize == 6 && !(rec[1] & 0x02)) {
int64_t nd1 = g_posdb.getDocId(rec + 6);
err = " (alignerror1)";
if (nd1 < d) err = " (alignordererror1)";
if (recSize == 12 && !(rec[1] & 0x02)) {
// seems like nd2 is it, so it really is 12 bytes but
// does not have the alignment bit set...
int64_t nd2 = g_posdb.getDocId(rec + 12);
err = " (alignerror2)";
if (nd2 < d) err = " (alignorderrror2)";
// if it
if (recSize == 12 && (rec[7] & 0x02)) {
// seems like nd2 is it, so it really is 12 bytes but
// does not have the alignment bit set...
int64_t nd2 = g_posdb.getDocId(rec + 12);
err = " (alignerror3)";
if (nd2 < d) err = " (alignordererror3)";
// flag keys that fall outside [startKey, list end key]
if (KEYCMP((char *)&k, (char *)&startKey, list.getKeySize()) < 0 ||
KEYCMP((char *)&k, ek2, list.getKeySize()) > 0) {
err = " (out of range)";
" tid=%015" PRIu64
" docId=%012" PRId64
" siterank=%02" PRId32
" langid=%02" PRId32
" pos=%06" PRId32
" hgrp=%02" PRId32
" spamrank=%02" PRId32
" divrank=%02" PRId32
" syn=%01" PRId32
" densrank=%02" PRId32
" mult=%02" PRId32
" dh=0x%02" PRIx32
" rs=%" PRId32 //recSize
"%s" // dd
"%s" // err
KEYSTR(&k, sizeof(key144_t)),
// continue the scan from the last key of this list
startKey = *(key144_t *)list.getLastKey();
// watch out for wrap around
if (startKey < *(key144_t *)list.getLastKey()) return;
// Prints the clusterdb records of collection `coll` to stdout, one line per
// key, by repeatedly fetching lists through Msg5 and advancing startKey to the
// last key of each list.
//   startFileNum, numFiles, includeTree: forwarded to Msg5::getList();
//       startFileNum is also checked against the number of clusterdb files.
// NOTE(review): closing braces and some statement lines (e.g. the statement
// that follows the isEmpty() test and part of the printf argument list) are
// absent from this copy of the file.
static void dumpClusterdb(const char *coll,
int32_t startFileNum,
int32_t numFiles,
bool includeTree) {
g_clusterdb.init ();
g_clusterdb.getRdb()->addRdbBase1(coll );
key96_t startKey ;
key96_t endKey ;
// bail if not
if ( g_clusterdb.getRdb()->getNumFiles() <= startFileNum ) {
printf("Request file #%" PRId32" but there are only %" PRId32" "
"clusterdb files\n",startFileNum,
Msg5 msg5;
RdbList list;
// NOTE(review): cr is dereferenced below without a NULL check
const CollectionRec *cr = g_collectiondb.getRec(coll);
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_CLUSTERDB ,
cr->m_collnum ,
&list ,
&startKey ,
&endKey ,
includeTree ,
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1, // maxRetries
false)) // isRealMerge
log(LOG_LOGIC,"db: getList did not block.");
// all done if empty
if ( list.isEmpty() )
// loop over entries in list
// scratch buffer that languageToString() writes the language name into
char strLanguage[256];
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key96_t k = list.getCurrentKey();
// is it a delete?
const char *dd = "";
// a clear low bit in n0 is reported as a delete
if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)";
// get the language string
languageToString ( Clusterdb::getLanguage(&k),
strLanguage );
//uint32_t gid = getGroupId ( RDB_CLUSTERDB , &k );
// look up the shard for this key and take the first host of that shard
uint32_t shardNum = getShardNum( RDB_CLUSTERDB , &k );
Host *grp = g_hostdb.getShard ( shardNum );
Host *hh = &grp[0];
// print it
printf("k.n1=%08" PRIx32" k.n0=%016" PRIx64" "
"docId=%012" PRId64" family=%" PRIu32" "
"language=%" PRId32" (%s) siteHash26=%" PRIu32"%s "
"groupNum=%" PRIu32" "
"shardNum=%" PRIu32"\n",
k.n1, k.n0,
Clusterdb::getDocId(&k) ,
Clusterdb::hasAdultContent(&k) ,
Clusterdb::getSiteHash26(&k) ,
dd ,
(uint32_t)hh->m_hostId ,
// continue the scan from the last key of this list
startKey = *(key96_t *)list.getLastKey();
// watch out for wrap around
if ( startKey < *(key96_t *)list.getLastKey() )
// Prints the linkdb records of collection `coll` to stdout by repeatedly
// fetching lists through Msg5 and advancing startKey to the last key of each
// list.
//   startFileNum, numFiles, includeTree: forwarded to Msg5::getList()
//   url:     when non-NULL, the key range is limited to the site of this url
//   urlhash: with `url`, additionally limits the range to that url's hash;
//            otherwise the whole site (url hash 0..LDB_MAXURLHASH) is dumped
// NOTE(review): closing braces and some statement lines (most of the record
// printf argument list, for instance) are absent from this copy of the file.
static void dumpLinkdb(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, const char *url, bool urlhash) {
g_linkdb.init ();
g_linkdb.getRdb()->addRdbBase1(coll );
key224_t startKey ;
key224_t endKey ;
// set start/end key to url hash
if ( url ) {
Url u;
u.set( url, strlen( url ), false, false );
// derive the 32-bit site hash of the url; it is the first argument to
// the start/end key constructors below
SiteGetter sg;
sg.getSite(url, NULL, 0, 0);
uint32_t h32 = hash32(sg.getSite(), sg.getSiteLen(), 0);
if( urlhash ) {
startKey = Linkdb::makeStartKey_uk(h32, u.getUrlHash64());
endKey = Linkdb::makeEndKey_uk (h32, u.getUrlHash64());
else {
startKey = Linkdb::makeStartKey_uk(h32, 0);
endKey = Linkdb::makeEndKey_uk (h32, LDB_MAXURLHASH);
printf("URL=%.*s, sitehash32=0x%08" PRIx32 ", urlhash=0x%012" PRIx64 "\n",
u.getUrlLen(), u.getUrl(), h32, u.getUrlHash64());
printf("Startkey=%s\n", KEYSTR(&startKey,sizeof(key224_t)));
printf("Endkey =%s\n", KEYSTR(&endKey,sizeof(key224_t)));
// bail if not
if ( g_linkdb.getRdb()->getNumFiles() <= startFileNum && !includeTree) {
printf("Request file #%" PRId32" but there are only %" PRId32" "
"linkdb files\n",startFileNum,
Msg5 msg5;
RdbList list;
// NOTE(review): cr is dereferenced below without a NULL check
const CollectionRec *cr = g_collectiondb.getRec(coll);
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_LINKDB ,
cr->m_collnum ,
&list ,
(char *)&startKey ,
(char *)&endKey ,
includeTree ,
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1, // maxRetries
false)) // isRealMerge
log(LOG_LOGIC,"db: getList did not block.");
// all done if empty
if ( list.isEmpty() ) return;
// loop over entries in list
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key224_t k;
list.getCurrentKey((char *) &k);
// is it a delete?
const char *dd = "";
// a clear low bit in n0 is reported as a delete
if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)";
uint64_t docId = (uint64_t)Linkdb::getLinkerDocId_uk(&k);
int32_t shardNum = getShardNum(RDB_LINKDB,&k);
// NOTE(review): presumably scratch space for the "ip32=%s" field below;
// the call that fills it is not visible in this copy
char ipbuf[16];
printf("k=%s "
"linkeesitehash32=0x%08" PRIx32" "
"linkeeurlhash=0x%012" PRIx64" "
"linkspam=%" PRId32" "
"siterank=%02" PRId32" "
"ip32=%s "
"docId=%" PRIu64" "
"discovered=%" PRIu32" "
"lost=%" PRIu32" "
"sitehash32=0x%08" PRIx32" "
"shardNum=%" PRIu32" "
dd );
// continue the scan from the last key of this list
startKey = *(key224_t *)list.getLastKey();
// watch out for wrap around
if ( startKey < *(key224_t *)list.getLastKey() ) return;
// Stress test for RdbCache. The fixed-size-record part (4-byte records, 12-byte
// keys) sits behind "#if 0"; the variable-size part adds records of random
// size filled with a byte derived from the key, and reads back a key from a
// 10-entry ring buffer of recently added keys to check size and contents.
// Returns false when a cache init or the scratch-buffer allocation fails and
// true at the end of the test; failed cache operations abort the process via
// g_process.shutdownAbort().
// NOTE(review): closing braces, the matching "#endif" and several statement
// lines are absent from this copy of the file, so the comments describe only
// the visible lines.
static bool cacheTest() {
g_conf.m_maxMem = 2000000000LL; // 2G
//g_mem.m_maxMem = 2000000000LL; // 2G
if ( ! hashinit() ) {
log( LOG_ERROR, "db: Failed to init hashtable." );
// NOTE(review): 1 converts to true in this bool function, while the other
// failure paths below return false - confirm which is intended
return 1;
// use an rdb cache
RdbCache c;
// init, 50MB
int32_t maxMem = 50000000;
// . how many nodes in cache tree can we fit?
// . each rec is key (12) and ip(4)
// . overhead in cache is 56
// . that makes 56 + 4 = 60
// . not correct? stats suggest it's less than 25 bytes each
int32_t maxCacheNodes = maxMem / 25;
// set the cache
if ( ! c.init ( maxMem ,
4 , // fixed data size of rec
maxCacheNodes ,
"cachetest" , // dbname
false, // save cache to disk?
12, // cachekeysize
-1)) { // numptrsmax
log(LOG_WARN, "test: Cache init failed.");
return false;
#if 0
// disabled: fixed-size record test (numRecs is 0 * maxCacheNodes anyway)
int32_t numRecs = 0 * maxCacheNodes;
logf(LOG_DEBUG,"test: Adding %" PRId32" recs to cache.",numRecs);
// timestamp
int32_t timestamp = 42;
// keep ring buffer of last 10 keys
key96_t oldk[10];
int32_t oldip[10];
int32_t b = 0;
// fill with random recs
for ( int32_t i = 0 ; i < numRecs ; i++ ) {
if ( (i % 100000) == 0 )
logf(LOG_DEBUG,"test: Added %" PRId32" recs to cache.",i);
// random key
key96_t k ;
k.n1 = rand();
k.n0 = rand();
k.n0 <<= 32;
k.n0 |= rand();
int32_t ip = rand();
// keep ring buffer
oldk [b] = k;
oldip[b] = ip;
if ( ++b >= 10 ) b = 0;
// make rec,size, like dns, will be 4 byte hash and 4 byte key?
c.addRecord((collnum_t)0,k,(char *)&ip,4,timestamp);
// reset g_errno in case it had an error (we don't care)
g_errno = 0;
// get a rec too!
if ( i < 10 ) continue;
// pick a slot of the ring buffer relative to the write position b
int32_t next = b + 1;
if ( next >= 10 ) next = 0;
key96_t back = oldk[next];
char *rec;
int32_t recSize;
if ( ! c.getRecord ( (collnum_t)0 ,
back ,
&rec ,
&recSize ,
false , // do copy?
-1 , // maxAge ,
true , // inc count?
NULL , // *cachedTime = NULL,
true )){ // promoteRecord?
g_process.shutdownAbort(true); }
if ( ! rec || recSize != 4 || *(int32_t *)rec != oldip[next] ) {
g_process.shutdownAbort(true); }
// now try variable sized recs
logf(LOG_DEBUG,"test: Testing variably-sized recs.");
// init, 300MB
maxMem = 300000000;
// . how many nodes in cache tree can we fit?
// . each rec is key (12) and ip(4)
// . overhead in cache is 56
// . that makes 56 + 4 = 60
// . not correct? stats suggest it's less than 25 bytes each
maxCacheNodes = maxMem / 5000;
//maxCacheNodes = 1200;
// set the cache
if ( ! c.init ( maxMem ,
-1 , // fixed data size of rec
maxCacheNodes ,
"cachetest" , // dbname
false,12,-1 )) { // save cache to disk?
log(LOG_WARN, "test: Cache init failed.");
return false;
int32_t numRecs = 30 * maxCacheNodes;
logf(LOG_DEBUG,"test: Adding %" PRId32" recs to cache.",numRecs);
key96_t oldk[10];
// timestamp
int32_t timestamp = 42;
// keep ring buffer of last 10 keys
int32_t oldrs[10];
int32_t b = 0;
// rec to add
char *rec;
int32_t recSize;
int32_t maxRecSize = 40000000; // 40MB for termlists
int32_t numMisses = 0;
// one scratch buffer, reused as the payload of every record added below
char *buf = (char *)mmalloc ( maxRecSize + 64 ,"cachetest" );
if ( ! buf ) return false;
// fill with random recs
for ( int32_t i = 0 ; i < numRecs ; i++ ) {
if ( (i % 100) == 0 )
logf(LOG_DEBUG,"test: Added %" PRId32" recs to cache. "
"Misses=%" PRId32".",i,numMisses);
// random key
key96_t k ;
k.n1 = rand();
k.n0 = rand();
k.n0 <<= 32;
k.n0 |= rand();
// random size
recSize = rand()%maxRecSize;//100000;
// keep ring buffer
oldk [b] = k;
oldrs[b] = recSize;
if ( ++b >= 10 ) b = 0;
// make the rec
rec = buf;
// fill the payload with the low byte of k.n1 so that it can be
// re-derived from the key when the record is read back
memset ( rec , (char)k.n1, recSize );
// make rec,size, like dns, will be 4 byte hash and 4 byte key?
if ( ! c.addRecord((collnum_t)0,k,rec,recSize,timestamp) ) {
g_process.shutdownAbort(true); }
// do a dup add 1% of the time
if ( (i % 100) == 0 )
g_process.shutdownAbort(true); }
// reset g_errno in case it had an error (we don't care)
g_errno = 0;
// get a rec too!
if ( i < 10 ) continue;
// pick a slot of the ring buffer relative to the write position b
int32_t next = b + 1;
if ( next >= 10 ) next = 0;
key96_t back = oldk[next];
//log("cache: get rec");
if ( ! c.getRecord ( (collnum_t)0 ,
back ,
&rec ,
&recSize ,
false , // do copy?
-1 , // maxAge ,
true , // inc count?
NULL , // *cachedTime = NULL,
true) ) {//true )){ // promoteRecord?
// the size read back must match what was stored for that key
if ( recSize != oldrs[next] ) {
logf(LOG_DEBUG,"test: bad rec size.");
// every payload byte must equal the low byte of the key's n1
char r = (char)back.n1;
for ( int32_t j = 0 ; j < recSize ; j++ ) {
if ( rec[j] == r ) continue;
logf(LOG_DEBUG,"test: bad char in rec.");
return true;
// CountDomains Structures and function definitions
// Per-domain record built by countdomains().
// NOTE(review): the closing "};" of this struct is absent from this copy of
// the file.
struct dom_info {
// domain name bytes; countdomains() allocates exactly domLen bytes and copies
// with strncpy(), so the buffer is not NUL-terminated
char *dom;
// length of dom in bytes
int32_t domLen;
// hash32() of the domain name, used to look the domain up
int32_t dHash;
// page count for this domain (starts at 1 when the record is created)
int32_t pages;
// IPs this domain was seen on, and how many
struct ip_info **ip_list;
int32_t numIp;
// hashes of distinct linked-to domains: lnk_table has room for tableSize
// entries, of which lnkCnt are in use
int32_t *lnk_table;
int32_t tableSize;
int32_t lnkCnt;
// NOTE(review): not referenced by any line visible in this copy - confirm use
int32_t lnkPages;
// Per-IP record built by countdomains().
// NOTE(review): the closing "};" of this struct is absent from this copy of
// the file.
struct ip_info {
// the IP address as taken from XmlDoc::m_ip
uint32_t ip;
// page count for this IP (starts at 1 when the record is created)
int32_t pages;
// domains seen on this IP, and how many
struct dom_info **dom_list;
int32_t numDom;
// Comparators that countdomains() passes to gbsort(); each one is defined
// further down in this file.
static int ip_fcmp (const void *p1, const void *p2);
static int ip_dcmp (const void *p1, const void *p2);
static int dom_fcmp (const void *p1, const void *p2);
static int dom_lcmp (const void *p1, const void *p2);
// Scans titledb of collection `coll` through Msg5, builds one table of
// ip_info records and one of dom_info records from the documents found,
// writes a report into g_hostdb.m_dir (cntdom.xml when output == 9, otherwise
// cntdom.html), then frees the tables and logs timing and memory figures.
//   coll:    collection to scan
//   numRecs: capacity of both tables; the scan also stops once this many
//            documents have been counted
//   output:  9 selects the raw XML dump, anything else the HTML report
// NOTE(review): closing braces, goto labels (freeInfo, printHtml, printDomLp,
// printIpLp, updateIp, endListUpdate) and many statement/argument lines are
// absent from this copy of the file; the comments describe only the visible
// lines and the exact control flow cannot be confirmed from here.
static void countdomains(const char* coll, int32_t numRecs, int32_t output) {
struct ip_info **ip_table;
struct dom_info **dom_table;
// NOTE(review): cr is dereferenced below without a NULL check
const CollectionRec *cr = g_collectiondb.getRec(coll);
key96_t startKey;
key96_t endKey ;
key96_t lastKey ;
g_titledb.init ();
g_titledb.getRdb()->addRdbBase1(coll );
log( LOG_INFO, "countdomains: parms: coll=%s, numrec s=%d", coll, numRecs );
int64_t time_start = gettimeofdayInMilliseconds();
Msg5 msg5;
RdbList list;
int32_t countDocs = 0;
int32_t countIp = 0;
int32_t countDom = 0;
// NOTE(review): no line that increments attempts is visible in this copy
int32_t attempts = 0;
// both tables get numRecs pointer slots
// NOTE(review): neither allocation is checked for NULL before the loop below
ip_table = (struct ip_info **)mmalloc(sizeof(struct ip_info *) * numRecs,
"main-dcit" );
dom_table = (struct dom_info **)mmalloc(sizeof(struct dom_info *) * numRecs,
"main-dcdt" );
for( int32_t i = 0; i < numRecs; i++ ) {
ip_table[i] = NULL;
dom_table[i] = NULL;
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TITLEDB ,
cr->m_collnum ,
&list ,
&startKey ,
&endKey ,
true , // Do we need to include tree?
0 ,
-1 ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1 , // maxRetries
false)) // isRealMerge
log(LOG_LOGIC,"db: getList did not block.");
// all done if empty
if ( list.isEmpty() ) goto freeInfo;
// loop over entries in list
for ( list.resetListPtr(); ! list.isExhausted(); list.skipCurrentRecord() ) {
key96_t k = list.getCurrentKey();
char *rec = list.getCurrentRec();
int32_t recSize = list.getCurrentRecSize();
int64_t docId = Titledb::getDocId ( &k );
if ( k <= lastKey )
log("key out of order. "
"lastKey.n1=%" PRIx32" n0=%" PRIx64" "
"currKey.n1=%" PRIx32" n0=%" PRIx64" ",
lastKey = k;
// print deletes
if ( (k.n0 & 0x01) == 0) {
fprintf(stderr,"n1=%08" PRIx32" n0=%016" PRIx64" docId=%012" PRId64" "
k.n1 , k.n0 , docId );
// the tables only have numRecs slots each
if( (countIp >= numRecs) || (countDom >= numRecs) ) {
log( LOG_INFO, "countdomains: countIp | countDom, greater than"
"numRecs requested, should never happen!!!!" );
goto freeInfo;
XmlDoc xd;
if (!xd.set2(rec, recSize, coll, 0))
struct ip_info *sipi ;
struct dom_info *sdomi;
int32_t i;
// linear search for an existing record of this document's IP
for( i = 0; i < countIp; i++ ) {
if( !ip_table[i] ) continue;
sipi = (struct ip_info *)ip_table[i];
if( sipi->ip == (uint32_t)xd.m_ip ) break;
// not found: append a fresh record for this IP
if( i == countIp ) {
sipi = (struct ip_info *)mmalloc(sizeof(struct ip_info),
"main-dcip" );
if( !sipi ) { g_process.shutdownAbort(true); }
ip_table[countIp++] = sipi;
sipi->ip = xd.m_ip;//u->getIp();
sipi->pages = 1;
sipi->numDom = 0;
} else {
char *fu = xd.ptr_firstUrl;
int32_t dlen;
const char *dom = getDomFast ( fu , &dlen );
int32_t dkey = hash32( dom , dlen );
// linear search for an existing record of this domain, matched by hash
for( i = 0; i < countDom; i++ ) {
if( !dom_table[i] ) continue;
sdomi = (struct dom_info *)dom_table[i];
if( sdomi->dHash == dkey ) break;
// not found: append a fresh record for this domain
if( i == countDom ) {
sdomi =(struct dom_info*)mmalloc(sizeof(struct dom_info),
"main-dcdm" );
if( !sdomi ) { g_process.shutdownAbort(true); }
dom_table[countDom++] = sdomi;
// exactly dlen bytes are allocated and copied, so sdomi->dom carries no
// terminating NUL; readers below copy it out using domLen
sdomi->dom = (char *)mmalloc( dlen,"main-dcsdm" );
strncpy( sdomi->dom, dom , dlen );
sdomi->domLen = dlen;
sdomi->dHash = dkey;
sdomi->pages = 1;
sdomi->numIp = 0;
sdomi->tableSize = 0;
sdomi->lnkCnt = 0;
else {
// make room in the domain's link table for this document's links
Links *dlinks = xd.getLinks();
int32_t size = dlinks->getNumLinks();
if( !sdomi->tableSize ) {
sdomi->lnk_table = (int32_t *)mmalloc(size * sizeof(int32_t),
"main-dclt" );
sdomi->tableSize = size;
else {
if( size > (sdomi->tableSize - sdomi->lnkCnt) ) {
size += sdomi->lnkCnt;
sdomi->lnk_table = (int32_t *)
"main-dcrlt" );
sdomi->tableSize = size;
// record the hash of every linked-to domain once
for( int32_t i = 0; i < dlinks->getNumLinks(); i++ ) {
char *link = dlinks->getLinkPtr(i);
int32_t dlen;
const char *dom = getDomFast ( link , &dlen );
uint32_t lkey = hash32( dom , dlen );
int32_t j;
for( j = 0; j < sdomi->lnkCnt; j++ ) {
if( sdomi->lnk_table[j] == (int32_t)lkey ) break;
// already present, skip it
if( j != sdomi->lnkCnt ) continue;
sdomi->lnk_table[sdomi->lnkCnt++] = lkey;
// Handle lists
// if either side has an empty list the pair cannot be linked yet,
// so grow both lists by one and cross-link the records
if( !sipi->numDom || !sdomi->numIp ){
sdomi->numIp++; sipi->numDom++;
//Add to IP list for Domain
sdomi->ip_list = (struct ip_info **)
mrealloc( sdomi->ip_list,
(sdomi->numIp-1)*sizeof(char *),
sdomi->numIp*sizeof(char *),
"main-dcldm" );
sdomi->ip_list[sdomi->numIp-1] = sipi;
//Add to domain list for IP
sipi->dom_list = (struct dom_info **)
mrealloc( sipi->dom_list,
(sipi->numDom-1)*sizeof(char *),
sipi->numDom*sizeof(char *),
"main-dclip" );
sipi->dom_list[sipi->numDom-1] = sdomi;
else {
int32_t i;
// look for sipi in the domain's IP list; i == numIp means "not found"
for( i = 0;
(i < sdomi->numIp)
&& (sdomi->ip_list[i] != sipi);
i++ );
if( sdomi->numIp != i ) goto updateIp;
sdomi->ip_list = (struct ip_info **)
mrealloc( sdomi->ip_list,
"main-dcldm" );
sdomi->ip_list[sdomi->numIp-1] = sipi;
// look for sdomi in the IP's domain list; i == numDom means "not found"
for( i = 0;
(i < sipi->numDom)
&& (sipi->dom_list[i] != sdomi);
i++ );
if( sipi->numDom != i ) goto endListUpdate;
sipi->dom_list = (struct dom_info **)
mrealloc( sipi->dom_list,
"main-dclip" );
sipi->dom_list[sipi->numDom-1] = sdomi;
// progress message every 1000 documents
if( !((++countDocs) % 1000) )
log(LOG_INFO, "countdomains: %" PRId32" records searched.",countDocs);
if( countDocs == numRecs ) goto freeInfo;
//else countDocs++;
// continue the scan from the last key of this list
startKey = *(key96_t *)list.getLastKey();
// watch out for wrap around
if ( startKey < *(key96_t *)list.getLastKey() ) {
log( LOG_INFO, "countdomains: Keys wrapped around! Exiting." );
goto freeInfo;
if ( countDocs >= numRecs ) {
// scratch buffer for a NUL-terminated domain name or a dotted IP string
char buf[128];
//int32_t value ;
int32_t len ;
// pass counter: each HTML table section is printed twice with different
// sort orders
char loop ;
int32_t recsDisp;
struct ip_info *tmpipi ;
struct dom_info *tmpdomi ;
loop = 0;
FILE *fhndl;
char out[sizeof(g_hostdb.m_dir)+128];
if( output != 9 ) goto printHtml;
// Dump raw data to a file to parse later
snprintf( out, sizeof(out), "%scntdom.xml", g_hostdb.m_dir );
out[ sizeof(out)-1 ] = '\0';
if( !(fhndl = fopen( out, "wb" )) ) {
log( LOG_INFO, "countdomains: File Open Failed." );
gbsort( dom_table, countDom, sizeof(struct dom_info *), dom_fcmp );
for( int32_t i = 0; i < countDom; i++ ) {
if( !dom_table[i] ) continue;
tmpdomi = (struct dom_info *)dom_table[i];
// clamp the copy so that it fits buf (128 bytes) including the NUL
len = tmpdomi->domLen;
if( tmpdomi->domLen > 127 ) len = 126;
strncpy( buf, tmpdomi->dom, len );
buf[len] = '\0';
"\t<pages>%" PRId32"</pages>\n"
//"\t<quality>%" PRId64"</quality>\n"
buf, tmpdomi->pages
// NOTE(review): ip_list holds pointers but the element size passed here is
// sizeof(int32_t); the two differ where pointers are wider than 4 bytes -
// confirm
gbsort( tmpdomi->ip_list,tmpdomi->numIp, sizeof(int32_t),
ip_fcmp );
for( int32_t j = 0; j < tmpdomi->numIp; j++ ) {
if( !tmpdomi->ip_list[j] ) continue;
tmpipi = (struct ip_info *)tmpdomi->ip_list[j];
iptoa( tmpipi->ip,buf);
gbsort( ip_table, countIp, sizeof(struct ip_info *), ip_fcmp );
for( int32_t i = 0; i < countIp; i++ ) {
if( !ip_table[i] ) continue;
tmpipi = (struct ip_info *)ip_table[i];
iptoa( tmpipi->ip,buf);
"\t<pages>%" PRId32"</pages>\n"
buf, tmpipi->pages);
for( int32_t j = 0; j < tmpipi->numDom; j++ ) {
tmpdomi = (struct dom_info *)tmpipi->dom_list[j];
len = tmpdomi->domLen;
if( tmpdomi->domLen > 127 ) len = 126;
strncpy( buf, tmpdomi->dom, len );
buf[len] = '\0';
if( fclose( fhndl ) < 0 ) {
log( LOG_INFO, "countdomains: File Close Failed." );
fhndl = 0;
// HTML file Output
snprintf( out, sizeof(out), "%scntdom.html", g_hostdb.m_dir );
out[ sizeof(out)-1 ] = '\0';
if( !(fhndl = fopen( out, "wb" )) ) {
log( LOG_INFO, "countdomains: File Open Failed." );
// estimated number of documents in the index, used to extrapolate the
// sampled counts in the report
int64_t total = g_titledb.estimateGlobalNumDocs();
static const char link_ip[] = "http://www.gigablast.com/search?"
static const char link_dom[] = "http://www.gigablast.com/search?"
static const char menu[] = "<table cellpadding=\"2\" cellspacing=\"2\">\n<tr>"
"<th bgcolor=\"#CCCC66\"><a href=\"#pid\">"
"Domains Sorted By Pages</a></th>"
"<th bgcolor=\"#CCCC66\"><a href=\"#lid\">"
"Domains Sorted By Links</a></th>"
"<th bgcolor=\"#CCCC66\"><a href=\"#pii\">"
"IPs Sorted By Pages</a></th>"
"<th bgcolor=\"#CCCC66\"><a href=\"#dii\">"
"IPs Sorted By Domains</a></th>"
"<th bgcolor=\"#CCCC66\"><a href=\"#stats\">"
static const char hdr[] = "<table cellpadding=\"5\" cellspacing=\"2\">"
"<tr bgcolor=\"AAAAAA\">"
"<th>Domains Linked</th>"
//"<th>Avg Quality</th>"
"<th># Pages</th>"
"<th>Extrap # Pages</th>"
static const char hdr2[] = "<table cellpadding=\"5\" cellspacing=\"2\">"
"<tr bgcolor=\"AAAAAA\">"
"<th>Domains Linked</th>"
//"<th>Avg Quality</th>"
"<th># Pages</th>"
"<th>Extrap # Pages</th>"
// alternating row background colors
static const char clr1[] = "#FFFF00";//"yellow";
static const char clr2[] = "#FFFF66";//"orange";
const char *color;
fprintf( fhndl,
"<html><head><title>Domain/IP Counter</title></head>\n"
"<h1>Domain/IP Counter</h1><br><br>"
"<a name=\"stats\">"
"<h2>Stats</h2>\n%s", menu );
// Stats
fprintf( fhndl, "<br>\n\n<table>\n"
"<tr><th align=\"left\">Total Number of Domains</th>"
"<td>%" PRId32"</td></tr>\n"
"<tr><th align=\"left\">Total Number of Ips</th>"
"<td>%" PRId32"</td></tr>\n"
"<tr><th align=\"left\">Number of Documents Searched"
"</th><td>%" PRId32"</td></tr>\n"
"<tr><th align=\"left\">Number of Failed Attempts</th>"
"<td>%" PRId32"</td></tr><tr></tr><tr>\n"
"<tr><th align=\"left\">Number of Documents in Index"
"</th><td>%" PRId64"</td></tr>\n"
"<tr><th align=\"left\">Estimated Domains in index</th>"
"<td>%" PRId64"</td></tr>"
countDocs, attempts-countDocs,total,
countDocs ? ((countDom*total)/countDocs) : 0 );
fprintf( fhndl, "<a name=\"pid\">\n"
"<h2>Domains Sorted By Pages</h2>\n"
"%s", menu );
gbsort( dom_table, countDom, sizeof(struct dom_info *), dom_fcmp );
fprintf( fhndl,"%s", hdr );
// show at most the first 1000 domains
recsDisp = countDom;
if( countDom > 1000 ) recsDisp = 1000;
for( int32_t i = 0; i < recsDisp; i++ ) {
char buf[128];
int32_t len;
if( !dom_table[i] ) continue;
if( i%2 ) color = clr2;
else color = clr1;
tmpdomi = (struct dom_info *)dom_table[i];
len = tmpdomi->domLen;
if( tmpdomi->domLen > 127 ) len = 126;
strncpy( buf, tmpdomi->dom, len );
buf[len] = '\0';
// NOTE(review): unlike the stats table above, this division by countDocs has
// no zero guard on the visible lines - confirm countDocs cannot be 0 here
fprintf( fhndl, "<tr bgcolor=\"%s\"><td>"
"<a href=\"%s%s\" target=\"_blank\">%s</a>"
"</td><td>%" PRId32"</td>"
//"<td>%" PRId64"</td>"
"<td>%" PRId32"</td>"
"<td>%" PRId64"</td><td>",
color, link_dom,
buf, buf, tmpdomi->lnkCnt,
((tmpdomi->pages*total)/countDocs) );
for( int32_t j = 0; j < tmpdomi->numIp; j++ ) {
tmpipi = (struct ip_info *)tmpdomi->ip_list[j];
fprintf( fhndl, "<a href=\"%s%s\""
link_ip, buf, buf );
fprintf( fhndl, "</td></tr>\n" );
fprintf( fhndl, "\n" );
fprintf( fhndl, "</table>\n<br><br><br>" );
// second pass over the domain table, this time sorted by link count
if( loop == 0 ) {
loop = 1;
gbsort( dom_table, countDom, sizeof(struct dom_info *), dom_lcmp );
fprintf( fhndl, "<a name=\"lid\">"
"<h2>Domains Sorted By Links</h2>\n%s", menu );
goto printDomLp;
loop = 0;
fprintf( fhndl, "<a name=\"pii\">"
"<h2>IPs Sorted By Pages</h2>\n%s", menu );
gbsort( ip_table, countIp, sizeof(struct ip_info *), ip_fcmp );
fprintf( fhndl,"%s", hdr2 );
// show at most the first 1000 IPs
recsDisp = countIp;
if( countIp > 1000 ) recsDisp = 1000;
for( int32_t i = 0; i < recsDisp; i++ ) {
char buf[128];
if( !ip_table[i] ) continue;
tmpipi = (struct ip_info *)ip_table[i];
if( i%2 ) color = clr2;
else color = clr1;
// sum of the link counts of all domains on this IP
int32_t linked = 0;
for( int32_t j = 0; j < tmpipi->numDom; j++ ) {
tmpdomi=(struct dom_info *)tmpipi->dom_list[j];
linked += tmpdomi->lnkCnt;
fprintf( fhndl, "\t<tr bgcolor=\"%s\"><td>"
"<a href=\"%s%s\" target=\"_blank\">%s</a>"
"<td>%" PRId32"</td>"
"<td>%" PRId32"</td>"
//"<td>%" PRId64"</td>"
"<td>%" PRId32"</td>"
"<td>%" PRId64"</td></tr>\n",
link_ip, buf, buf, tmpipi->numDom, linked,
((tmpipi->pages*total)/countDocs) );
fprintf( fhndl, "\n" );
fprintf( fhndl, "</table>\n<br><br><br>" );
// second pass over the IP table, this time sorted by number of domains
if( loop == 0 ) {
loop = 1;
gbsort( ip_table, countIp, sizeof(struct ip_info *), ip_dcmp );
fprintf( fhndl, "<a name=\"dii\">"
"<h2>IPs Sorted By Domains</h2>\n%s", menu );
goto printIpLp;
if( fclose( fhndl ) < 0 ) {
log( LOG_INFO, "countdomains: File Close Failed." );
fhndl = 0;
// byte tallies for the memory report at the end: ima for IP records,
// dma for domain records
// NOTE(review): list entries are tallied at sizeof(int32_t) each while the
// matching mfree() calls use the pointer size, so the totals are estimates
int32_t ima = 0;
int32_t dma = 0;
log( LOG_INFO, "countdomains: Freeing ip info struct..." );
for( int32_t i = 0; i < countIp; i++ ) {
if( !ip_table[i] ) continue;
//value = ipHT.getValue( ip_table[i] );
//if(value == 0) continue;
tmpipi = (struct ip_info *)ip_table[i];
mfree( tmpipi->dom_list, tmpipi->numDom*sizeof(tmpipi->dom_list[0]),
"main-dcflip" );
ima += tmpipi->numDom * sizeof(int32_t);
mfree( tmpipi, sizeof(struct ip_info), "main-dcfip" );
ima += sizeof(struct ip_info);
tmpipi = NULL;
mfree( ip_table, numRecs * sizeof(struct ip_info *), "main-dcfit" );
log( LOG_INFO, "countdomains: Freeing domain info struct..." );
for( int32_t i = 0; i < countDom; i++ ) {
if( !dom_table[i] ) continue;
tmpdomi = (struct dom_info *)dom_table[i];
mfree( tmpdomi->lnk_table,
"main-dcfsdlt" );
dma += tmpdomi->tableSize * sizeof(int32_t);
mfree( tmpdomi->ip_list, tmpdomi->numIp*sizeof(tmpdomi->ip_list[0]),
"main-dcfldom" );
dma += tmpdomi->numIp * sizeof(int32_t);
mfree( tmpdomi->dom, tmpdomi->domLen, "main-dcfsdom" );
dma += tmpdomi->domLen;
mfree( tmpdomi, sizeof(struct dom_info), "main-dcfdom" );
dma+= sizeof(struct dom_info);
tmpdomi = NULL;
mfree( dom_table, numRecs * sizeof(struct dom_info *), "main-dcfdt" );
int64_t time_end = gettimeofdayInMilliseconds();
log( LOG_INFO, "countdomains: Took %" PRId64"ms to count domains in %" PRId32" recs.",
time_end-time_start, countDocs );
// NOTE(review): the 8 * numRecs term presumably stands for the two pointer
// tables at 4 bytes per slot each - confirm
log( LOG_INFO, "countdomains: %" PRId32" bytes of Total Memory Used.",
ima + dma + (8 * numRecs) );
log( LOG_INFO, "countdomains: %" PRId32" bytes Total for IP.", ima );
log( LOG_INFO, "countdomains: %" PRId32" bytes Total for Dom.", dma );
log( LOG_INFO, "countdomains: %" PRId32" bytes Average for IP.", countIp ? ima/countIp : 0 );
log( LOG_INFO, "countdomains: %" PRId32" bytes Average for Dom.", countDom ? dma/countDom : 0 );
// Sort by IP frequency in pages 9->0
int ip_fcmp (const void *p1, const void *p2) {
//int32_t n1, n2;
// break this! need to fix later MDW 11/12/14
char *n1 ;
char *n2 ;
struct ip_info *ii1;
struct ip_info *ii2;
*(((unsigned char *)(&n1))+0) = *(((char *)p1)+0);
*(((unsigned char *)(&n1))+1) = *(((char *)p1)+1);
*(((unsigned char *)(&n1))+2) = *(((char *)p1)+2);
*(((unsigned char *)(&n1))+3) = *(((char *)p1)+3);
*(((unsigned char *)(&n2))+0) = *(((char *)p2)+0);
*(((unsigned char *)(&n2))+1) = *(((char *)p2)+1);
*(((unsigned char *)(&n2))+2) = *(((char *)p2)+2);
*(((unsigned char *)(&n2))+3) = *(((char *)p2)+3);
ii1 = (struct ip_info *)n1;
ii2 = (struct ip_info *)n2;
return ii2->pages-ii1->pages;
// Sort by number of domains linked to IP, descending
int ip_dcmp (const void *p1, const void *p2) {
//int32_t n1, n2;
// break this! need to fix later MDW 11/12/14
char *n1 ;
char *n2 ;
struct ip_info *ii1;
struct ip_info *ii2;
*(((unsigned char *)(&n1))+0) = *(((char *)p1)+0);
*(((unsigned char *)(&n1))+1) = *(((char *)p1)+1);
*(((unsigned char *)(&n1))+2) = *(((char *)p1)+2);
*(((unsigned char *)(&n1))+3) = *(((char *)p1)+3);
*(((unsigned char *)(&n2))+0) = *(((char *)p2)+0);
*(((unsigned char *)(&n2))+1) = *(((char *)p2)+1);
*(((unsigned char *)(&n2))+2) = *(((char *)p2)+2);
*(((unsigned char *)(&n2))+3) = *(((char *)p2)+3);
ii1 = (struct ip_info *)n1;
ii2 = (struct ip_info *)n2;
return ii2->numDom-ii1->numDom;
// Sort by page frequency in titlerec 9->0
int dom_fcmp (const void *p1, const void *p2) {
//int32_t n1, n2;
// break this! need to fix later MDW 11/12/14
char *n1 ;
char *n2 ;
struct dom_info *di1;
struct dom_info *di2;
*(((unsigned char *)(&n1))+0) = *(((char *)p1)+0);
*(((unsigned char *)(&n1))+1) = *(((char *)p1)+1);
*(((unsigned char *)(&n1))+2) = *(((char *)p1)+2);
*(((unsigned char *)(&n1))+3) = *(((char *)p1)+3);
*(((unsigned char *)(&n2))+0) = *(((char *)p2)+0);
*(((unsigned char *)(&n2))+1) = *(((char *)p2)+1);
*(((unsigned char *)(&n2))+2) = *(((char *)p2)+2);
*(((unsigned char *)(&n2))+3) = *(((char *)p2)+3);
di1 = (struct dom_info *)n1;
di2 = (struct dom_info *)n2;
return di2->pages-di1->pages;
// Sort by quantity of outgoing links 9-0
int dom_lcmp (const void *p1, const void *p2) {
//int32_t n1, n2;
// break this! need to fix later MDW 11/12/14
char *n1 ;
char *n2 ;
struct dom_info *di1;
struct dom_info *di2;
*(((unsigned char *)(&n1))+0) = *(((char *)p1)+0);
*(((unsigned char *)(&n1))+1) = *(((char *)p1)+1);
*(((unsigned char *)(&n1))+2) = *(((char *)p1)+2);
*(((unsigned char *)(&n1))+3) = *(((char *)p1)+3);
*(((unsigned char *)(&n2))+0) = *(((char *)p2)+0);
*(((unsigned char *)(&n2))+1) = *(((char *)p2)+1);
*(((unsigned char *)(&n2))+2) = *(((char *)p2)+2);
*(((unsigned char *)(&n2))+3) = *(((char *)p2)+3);
di1 = (struct dom_info *)n1;
di2 = (struct dom_info *)n2;
return di2->lnkCnt-di1->lnkCnt;
// Resolves argv0 to an absolute path and returns the directory part of it,
// including the trailing '/'.
// Returns a pointer to a static buffer (overwritten by the next call), or NULL
// when realpath() fails, when the resolved path does not fit into the buffer,
// or when it contains no '/'.
static const char *getAbsoluteGbDir(const char *argv0) {
	static char s_buf[1024];

	// with a NULL second argument realpath() hands back malloc()ed storage
	// that we own and must free on every path
	char *s = realpath(argv0, NULL);
	if(s==NULL)
		return NULL;
	size_t slen = strlen(s);
	if(slen >= sizeof(s_buf)) {
		free(s);
		return NULL;
	}
	// copy including the terminating NUL, then release the temporary
	memcpy(s_buf, s, slen+1);
	free(s);

	//chop off last component, so we have just the directory (including a trailing / )
	char *slash = strrchr(s_buf, '/');
	if(slash==NULL) {
		//what? this is not supposed to happen that realpath returns an absolute path that doesn't contain a slash
		return NULL;
	}
	// slash[1] is at worst the existing NUL, so this write stays inside s_buf
	slash[1] = '\0';
	return s_buf;
}
// used to make package to install files for the package
// Asks g_process for the list of files to copy from "./", then runs a shell
// command through system() and echoes that command to stderr first.
// Always returns 0; the result of system() is not inspected.
// NOTE(review): the line that formats the "cp -r %s %s" command into tmp and
// the closing brace are absent from this copy of the file.
// NOTE(review): dstDir goes into a shell command line; presumably it is a
// trusted command-line argument - confirm
static int copyFiles(const char *dstDir) {
const char *srcDir = "./";
SafeBuf fileListBuf;
g_process.getFilesToCopy ( srcDir , &fileListBuf );
SafeBuf tmp;
"cp -r %s %s"
, fileListBuf.getBufStart()
, dstDir
//log(LOG_INIT,"admin: %s", tmp.getBufStart());
fprintf(stderr,"\nRunning cmd: %s\n",tmp.getBufStart());
system ( tmp.getBufStart() );
return 0;
static void wvg_log_function(WordVariationGenerator::log_class_t log_class, const char *fmt, va_list ap) {
char buf[2048];
vsnprintf(buf,sizeof(buf), fmt, ap);
int32_t type;
switch(log_class) {
case WordVariationGenerator::log_trace: type = LOG_TRACE; break;
case WordVariationGenerator::log_debug: type = LOG_DEBUG; break;
case WordVariationGenerator::log_info: type = LOG_INFO; break;
case WordVariationGenerator::log_warn: type = LOG_WARN; break;
case WordVariationGenerator::log_error: type = LOG_ERROR; break;