//
// Matt Wells, copyright Sep 2001
//
#include <sched.h> // clone()
// declare this stuff up here so we can call pread() in our seek test below
//
// maybe we should put this in a common header file so we don't have
// certain files compiled with the platform default, and some not -partap
#include "Version.h" // getVersion()
#include "Mem.h"
#include "Conf.h"
#include "JobScheduler.h"
#include "Hostdb.h"
#include "Posdb.h"
#include "Titledb.h"
#include "Tagdb.h"
#include "Spider.h"
#include "SpiderColl.h"
#include "SpiderLoop.h"
#include "SpiderCache.h"
#include "Doledb.h"
#include "Clusterdb.h"
#include "Collectiondb.h"
#include "Sections.h"
#include "UdpServer.h"
#include "Serialize.h"
#include "Repair.h"
#include "DailyMerge.h"
#include "MsgC.h"
#include "HttpServer.h"
#include "Loop.h"
#include "HighFrequencyTermShortcuts.h"
#include "PageTemperatureRegistry.h"
#include "Docid2Siteflags.h"
#include "SiteMedianPageTemperatureRegistry.h"
#include "UrlRealtimeClassification.h"
#include "IPAddressChecks.h"
#include <sys/resource.h> // setrlimit
#include "Stats.h"
#include "Statistics.h"
#include "Speller.h" // g_speller
#include "Wiki.h" // g_wiki
#include "Wiktionary.h" // g_wiktionary
#include "WordVariations.h"
#include "CountryCode.h"
#include "Domains.h"
#include "Pos.h"
#include "Title.h"
#include "Speller.h"
#include "SummaryCache.h"
#include "InstanceInfoExchange.h"
#include "WantedChecker.h"
#include "Dns.h"
#include "DumpSpiderdbSqlite.h"
// include all msgs that have request handlers, cuz we register them with g_udp
#include "Msg0.h"
#include "Msg4In.h"
#include "Msg4Out.h"
#include "Msg13.h"
#include "Msg20.h"
#include "Msg22.h"
#include "Msg25.h"
#include "Msg39.h"
#include "Msg40.h" // g_resultsCache
#include "Parms.h"
#include "Pages.h"
#include "PageInject.h"
#include "unicode/UCMaps.h"
#include "utf8_convert.h"
#include "Profiler.h"
#include "Proxy.h"
#include "linkspam.h"
#include "Process.h"
#include "sort.h"
#include "RdbBuckets.h"
#include "SpiderProxy.h"
#include "HashTable.h"
#include "GbUtil.h"
#include "Dir.h"
#include "File.h"
#include "DnsBlockList.h"
#include "ContentTypeBlockList.h"
#include "UrlMatchList.h"
#include "UrlBlockCheck.h"
#include "DocDelete.h"
#include "GbDns.h"
#include "ScopedLock.h"
#include "RobotsCheckList.h"
#include "ConvertSpiderdb.h"
#include "RobotsBlockedResultOverride.h"
#include "UrlResultOverride.h"
#include "FxCheckAdult.h"
#include "FxCheckSpam.h"
#include "GbCompress.h"
#include "DocRebuild.h"
#include "DocReindex.h"
#include "FxExplicitKeywords.h"
#include "IpBlockList.h"
#include "SpiderdbSqlite.h"
#include "QueryLanguage.h"
#include "SiteNumInlinks.h"
#include "ContentMatchList.h"
#include "SiteMedianPageTemperature.h"
#include "Lemma.h"
#include "ip.h"
#include "CountryLanguage.h"
#include "Errno.h"
#include "Docid.h"
#include <sys/stat.h> //umask()
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#ifdef _VALGRIND_
#include <valgrind/memcheck.h>
#include <valgrind/helgrind.h>
#endif
static bool registerMsgHandlers();
static bool registerMsgHandlers1();
static bool registerMsgHandlers2();
static const int32_t commandLineDumpdbRecSize = 10 * 1024 * 1024; //recSizes parameter for Msg5::getList() while dumping database from the command-line
static void printHelp();
static void dumpTitledb (const char *coll, int32_t sfn, int32_t numFiles, bool includeTree,
int64_t docId , bool justPrintDups );
static void dumpTagdb(const char *coll, int32_t sfn, int32_t numFiles, bool includeTree, char req,
const char *site);
//dumpPosdb() is not local because it is called directly by unittests
void dumpPosdb(const char *coll, int32_t sfn, int32_t numFiles, bool includeTree, int64_t termId , bool justVerify);
static void dumpWaitingTree(const char *coll);
static void dumpRobotsTxtCache(const char *coll);
static void dumpDoledb(const char *coll, int32_t sfn, int32_t numFiles, bool includeTree);
static void dumpClusterdb(const char *coll, int32_t sfn, int32_t numFiles, bool includeTree);
static void dumpLinkdb(const char *coll, int32_t sfn, int32_t numFiles, bool includeTree, const char *url, bool urlhash);
static void dumpUnwantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
static void dumpWantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
static void dumpAdultTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
static void dumpSpamTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
static int copyFiles(const char *dstDir);
static const char *getAbsoluteGbDir(const char *argv0);
static int32_t checkDirPerms(const char *dir);
static bool hashtest();
// how fast to parse the content of this docId?
static bool parseTest(const char *coll, int64_t docId, const char *query);
static bool summaryTest1(char *rec, int32_t listSize, const char *coll, int64_t docId, const char *query );
static bool cacheTest();
static void countdomains(const char* coll, int32_t numRecs, int32_t output);
static bool argToBoolean(const char *arg);
static bool parseOptionalHostRange(int rangearg, int argc, char **argv, int *h1, int *h2);
static void wvg_log_function(WordVariationGenerator::log_class_t log_class, const char *fmt, va_list ap);
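// wakes g_loop's poll loop; handed to g_jobScheduler.initialize() below so
// completed jobs can promptly wake the main poll loop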
static void wakeupPollLoop() {
g_loop.wakeupPollLoop();
}
static UdpProtocol g_dp; // Default Proto
// installFlag konstants
typedef enum {
ifk_install = 1,
ifk_installgb ,
ifk_installconf ,
ifk_dsh ,
ifk_dsh2 ,
ifk_backupcopy ,
ifk_backupmove ,
ifk_backuprestore ,
ifk_installconf2 ,
ifk_start ,
ifk_tmpstart ,
ifk_installtmpgb ,
ifk_proxy_start
} install_flag_konst_t;
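// each konstant corresponds to one of the gb subcommands dispatched in
// main2() and handled by install() below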
static int install_file(const char *file, int32_t hostId, int32_t hostId2);
static int install ( install_flag_konst_t installFlag, int32_t hostId, char *dir = NULL,
int32_t hostId2 = -1, char *cmd = NULL );
bool doCmd ( const char *cmd , int32_t hostId , const char *filename , bool sendToHosts,
bool sendToProxies, int32_t hostId2=-1 );
static char unicode_data_dir[1024]; //filled in by main2() when hostdb has been initialized
//void tryMergingWrapper ( int fd , void *state ) ;
//void resetAll ( );
//void spamTest ( ) ;
extern void resetPageAddUrl ( );
extern void resetHttpMime ( );
extern void reset_iana_charset ( );
extern void resetAdultBit ( );
extern void resetEntities ( );
extern void resetQuery ( );
extern bool g_recoveryMode; // HostFlags.cpp
static int argc_copy;
static char **argv_copy;
static int rc_copy;
static int main2(int argc, char *argv[]);
static void *main2_trampoline(void *) {
pthread_setname_np(pthread_self(),"main");
rc_copy = main2(argc_copy,argv_copy);
return NULL;
}
int main ( int argc , char *argv[] ) {
//Run the main thread ... in a thread
//The reason for this is so that 'htop', 'perf' and other tools show metrics
//for the main thread instead of lumping it into a process-wide
//aggregation (eg. linux kernel 4.4.x claims the main task/process does IO
//even though it provably doesn't)
argc_copy = argc;
argv_copy = argv;
pthread_t tid;
int rc = pthread_create(&tid,NULL,main2_trampoline,NULL);
if(rc!=0){
fprintf(stderr,"pthread_create() failed with error %d (%s)",rc,strerror(rc));
return 99;
}
rc = pthread_join(tid,NULL);
if(rc!=0) {
fprintf(stderr,"pthread_join() failed with error %d (%s)",rc,strerror(rc));
return 99;
}
if(rc_copy)
fprintf( stderr, "Failed to start gb. Exiting.\n" );
return rc_copy;
}
int main2 ( int argc , char *argv[] ) {
g_conf.m_runAsDaemon = false;
g_conf.m_logToFile = false;
#ifdef _VALGRIND_
//threads are incrementing the counters all over the place
VALGRIND_HG_DISABLE_CHECKING(&g_stats,sizeof(g_stats));
#endif
// record time for uptime
g_stats.m_uptimeStart = time(NULL);
int32_t cmdarg = 0;
// get command
// it might not be there, might be a simple "./gb"
const char *cmd = "";
if ( argc >= 2 ) {
cmdarg = 1;
cmd = argv[1];
}
const char *cmd2 = "";
if ( argc >= 3 )
cmd2 = argv[2];
int arch = 64;
if ( sizeof(char *) == 4 ) arch = 32;
// help
if ( strcmp ( cmd , "-h" ) == 0 ) {
printHelp();
return 0;
}
// version
if ( strcmp ( cmd , "-v" ) == 0 ) {
printVersion();
return 0;
}
//send an email on startup for -r, like if we are recovering from an
//unclean shutdown.
g_recoveryMode = false;
if ( strncmp ( cmd , "-r" ,2 ) == 0 || strncmp ( cmd2 , "-r",2 ) == 0 ) {
g_recoveryMode = true;
}
// run as daemon? then we have to fork
if ( ( strcmp ( cmd , "-d" ) == 0 ) || ( strcmp ( cmd2 , "-d" ) == 0 ) ) {
g_conf.m_runAsDaemon = true;
}
if ( ( strcmp ( cmd , "-l" ) == 0 ) || ( strcmp ( cmd2 , "-l" ) == 0 ) ) {
g_conf.m_logToFile = true;
}
if( (strcmp( cmd, "countdomains" ) == 0) && (argc >= (cmdarg + 3)) ) {
// argv[cmdarg+1] is the collection; argv[cmdarg+2] is numRecs
uint32_t tmp = atoi( argv[cmdarg+2] );
if( (tmp * 10) > g_mem.getMemTableSize() )
g_mem.setMemTableSize(tmp * 10);
}
// these tests do not need a hosts.conf
if ( strcmp ( cmd , "hashtest" ) == 0 ) {
if ( argc > cmdarg+1 ) {
printHelp();
return 1;
}
hashtest();
return 0;
}
// these tests do not need a hosts.conf
if ( strcmp ( cmd , "cachetest" ) == 0 ) {
if ( argc > cmdarg+1 ) {
printHelp();
return 1;
}
cacheTest();
return 0;
}
if ( strcmp ( cmd , "parsetest" ) == 0 ) {
if ( cmdarg+1 >= argc ) {
printHelp();
return 1;
}
// load up hosts.conf
//if ( ! g_hostdb.init(hostId) ) {
// log("db: hostdb init failed." ); return 1; }
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." ); return 1; }
int64_t docid = atoll1(argv[cmdarg+1]);
const char *coll = "";
const char *query = "";
if ( cmdarg+3 <= argc ) coll = argv[cmdarg+2];
if ( cmdarg+4 == argc ) query = argv[cmdarg+3];
parseTest( coll, docid, query );
return 0;
}
if ( strcmp ( cmd ,"isportinuse") == 0 ) {
if ( cmdarg+1 >= argc ) {
printHelp();
return 1;
}
int port = atol ( argv[cmdarg+1] );
// make sure port is available. returns false if in use.
if ( ! g_httpServer.m_tcp.testBind(port,false) )
// and we should return with 1 so the keep alive
// script will exit
exit (1);
// port is not in use, return 0
exit(0);
}
// need threads here for tests?
// note the stack size for debug purposes
struct rlimit rl;
getrlimit(RLIMIT_STACK, &rl);
log(LOG_INFO,"db: Stack size is %" PRId64".", (int64_t)rl.rlim_cur);
struct rlimit lim;
// limit fds
// try to prevent core from systems where it is above 1024
// because our FD_ISSET() libc function will core! (it's older)
int32_t NOFILE = 1024;
lim.rlim_cur = lim.rlim_max = NOFILE;
if ( setrlimit(RLIMIT_NOFILE,&lim)) {
log("db: setrlimit RLIMIT_NOFILE %" PRId32": %s.",
NOFILE,mstrerror(errno) );
}
struct rlimit rlim;
getrlimit ( RLIMIT_NOFILE,&rlim);
if ( (int32_t)rlim.rlim_max > NOFILE || (int32_t)rlim.rlim_cur > NOFILE ) {
log("db: setrlimit RLIMIT_NOFILE failed!");
g_process.shutdownAbort(true);
}
// set the s_pages array for print admin pages
g_pages.init ( );
bool isProxy = false;
if ( strcmp( cmd , "proxy" ) == 0 && strcmp( argv[cmdarg+1] , "load" ) == 0 ) {
isProxy = true;
}
// this is just like starting up a gb process, but we add one to
// each port, we are a dummy machine in the dummy cluster.
// gb -w <workingdir> tmpstart [hostId]
bool useTmpCluster = false;
if ( strcmp ( cmd , "tmpstart" ) == 0 ) {
useTmpCluster = true;
}
// gb -w <workingdir> tmpstop [hostId]
if ( strcmp ( cmd , "tmpstop" ) == 0 ) {
useTmpCluster = true;
}
// gb -w <workingdir> tmpstarthost
if ( strcmp ( cmd , "tmpstarthost" ) == 0 ) {
useTmpCluster = true;
}
bool initMyHost = true;
if (strcmp(cmd, "install") == 0 ||
strcmp(cmd, "installfile") == 0) {
initMyHost = false;
}
//
// get current working dir that the gb binary is in. all the data
// files should be in there too!!
const char *workingDir = getAbsoluteGbDir ( argv[0] );
if ( ! workingDir ) {
fprintf(stderr,"could not get working dir. Exiting.\n");
return 1;
}
//log("host: working directory is %s",workingDir);
//initialize IP address checks
initialize_ip_address_checks();
// load up hosts.conf
// . it will determine our hostid based on the directory path of this
// gb binary and the ip address of this server
if ( ! g_hostdb.init(-1, isProxy, useTmpCluster, initMyHost, workingDir)) {
log( LOG_ERROR, "db: hostdb init failed." );
return 1;
}
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log( LOG_ERROR, "db: Failed to init hashtable." );
return 1;
}
sprintf(unicode_data_dir,"%s/ucdata/",g_hostdb.m_dir);
// . hashinit() calls srand() w/ a fixed number
// . let's mix it up again
srand ( time(NULL) );
// Make sure the TLD table is initialized before calling any URL handling function
if(!initializeDomains(g_hostdb.m_dir)) {
log( LOG_ERROR, "Domains initialization failed!" );
return 1;
}
// do not save conf if any core dump occurs starting here
// down to where we set this back to true
g_conf.m_save = false;
//Put this here so that we can log messages from this point on
if ( strcmp ( cmd , "proxy" ) == 0 ) {
if (argc < 3){
printHelp();
return 1;
}
int32_t proxyId = -1;
if ( cmdarg+2 < argc ) proxyId = atoi ( argv[cmdarg+2] );
if ( strcmp ( argv[cmdarg+1] , "start" ) == 0 ) {
return install ( ifk_proxy_start , proxyId );
}
else if ( strcmp ( argv[cmdarg+1] , "stop" ) == 0 ) {
g_proxy.m_proxyRunning = true;
return doCmd ( "save=1" , proxyId , "master" , false, true );
}
else if ( strcmp ( argv[cmdarg+1] , "replacehost" ) == 0 ) {
g_proxy.m_proxyRunning = true;
int32_t hostId = -1;
int32_t spareId = -1;
if ( cmdarg + 2 < argc ) hostId = atoi ( argv[cmdarg+2] );
if ( cmdarg + 3 < argc ) spareId = atoi ( argv[cmdarg+3] );
char replaceCmd[256];
sprintf(replaceCmd, "replacehost=1&rhost=%" PRId32"&rspare=%" PRId32, hostId, spareId);
return doCmd ( replaceCmd, -1, "admin/hosts", false, true);
}
else if ( proxyId == -1 || strcmp ( argv[cmdarg+1] , "load" ) != 0 ) {
printHelp();
return 1;
}
Host *h = g_hostdb.getProxy( proxyId );
uint16_t httpPort = h->getInternalHttpPort();
uint16_t httpsPort = h->getInternalHttpsPort();
//we need udpserver for addurl
uint16_t udpPort = h->m_port;
if ( ! g_conf.init ( h->m_dir ) ) {
log( LOG_ERROR, "db: Conf init failed." );
return 1;
}
// init the loop before g_process since g_process
// registers a sleep callback!
if ( ! g_loop.init() ) {
log( LOG_ERROR, "db: Loop init failed." );
return 1;
}
//if ( ! g_jobScheduler.initialize() ) {
// log("db: Threads init failed." ); return 1; }
g_process.init();
if ( ! g_process.checkNTPD() ) {
log( LOG_ERROR, "db: ntpd not running on proxy" );
return 1;
}
const char *errmsg=NULL;
if ( !UnicodeMaps::load_maps(unicode_data_dir,&errmsg)) {
log( LOG_ERROR, "db: Unicode initialization failed! %s", errmsg);
return 1;
}
if(!utf8_convert_initialize()) {
log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
return 1;
}
// load speller unifiedDict for spider compression proxy
//if ( g_hostdb.m_myHost->m_type & HT_SCPROXY )
// g_speller.init();
if ( ! g_udpServer.init( g_hostdb.getMyPort() ,
&g_dp,
20000000 , // readBufSize
20000000 , // writeBufSize
20 , // pollTime in ms
g_conf.m_udpMaxSockets , // max udp slots
false )){ // is dns?
log( LOG_ERROR, "db: UdpServer init failed." );
return 1;
}
if (!g_proxy.initProxy (proxyId, udpPort, 0, &g_dp)) {
log( LOG_ERROR, "proxy: init failed" );
return 1;
}
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log( LOG_ERROR, "db: Failed to init hashtable." );
return 1;
}
if ( ! g_proxy.initHttpServer( httpPort, httpsPort ) ) {
log( LOG_ERROR, "db: HttpServer init failed. Another gb "
"already running? If not, try editing "
"./hosts.conf to "
"change the port from %" PRId32" to something bigger. "
"Or stop gb by running 'gb stop' or by "
"clicking 'save & exit' in the master controls."
, (int32_t)httpPort );
// this is dangerous!!! do not do the shutdown thing
return 1;
}
//we should save gb.conf right ?
g_conf.m_save = true;
g_loop.runLoop();
}
// gb dsh cmd [hostrange]
if ( strcmp ( cmd , "dsh" ) == 0 ) {
if ( cmdarg+1 >= argc ) {
printHelp();
return 1;
}
char *cmd = argv[cmdarg+1];
int h1,h2;
if(!parseOptionalHostRange(cmdarg+2,argc,argv,&h1,&h2))
return 1;
return install ( ifk_dsh, h1, NULL, h2, cmd );
}
// gb dsh2 cmd [hostrange]
if ( strcmp ( cmd , "dsh2" ) == 0 ) {
if ( cmdarg+1 >= argc ) {
printHelp();
return 1;
}
char *cmd = argv[cmdarg+1];
int h1,h2;
if(!parseOptionalHostRange(cmdarg+2,argc,argv,&h1,&h2))
return 1;
return install ( ifk_dsh2, h1, NULL, h2, cmd );
}
// gb copyfiles, like gb install but takes a dir not a host #
if ( strcmp ( cmd , "copyfiles" ) == 0 ) {
if ( cmdarg + 1 >= argc ) {
printHelp();
return 1;
}
char *dir = argv[cmdarg+1];
return copyFiles ( dir );
}
// gb install [hostrange]
if ( strcmp ( cmd , "install" ) == 0 ) {
int h1,h2;
if(!parseOptionalHostRange(cmdarg+1,argc,argv,&h1,&h2))
return 1;
return install ( ifk_install, h1, NULL, h2 );
}
// gb installgb [hostrange]
if ( strcmp ( cmd , "installgb" ) == 0 ) {
int h1,h2;
if(!parseOptionalHostRange(cmdarg+1,argc,argv,&h1,&h2))
return 1;
return install(ifk_installgb, h1, NULL, h2);
}
// gb installfile filename [hostrange]
if ( strcmp ( cmd , "installfile" ) == 0 ) {
int h1,h2;
if(!parseOptionalHostRange(cmdarg+2,argc,argv,&h1,&h2))
return 1;
return install_file(argv[cmdarg + 1], h1, h2);
}
// gb installtmpgb [hostrange]
if ( strcmp ( cmd , "installtmpgb" ) == 0 ) {
int h1,h2;
if(!parseOptionalHostRange(cmdarg+1,argc,argv,&h1,&h2))
return 1;
return install(ifk_installtmpgb, h1, NULL, h2);
}
// gb installconf [hostrange]
if ( strcmp ( cmd , "installconf" ) == 0 ) {
int h1,h2;
if(!parseOptionalHostRange(cmdarg+1,argc,argv,&h1,&h2))
return 1;
return install(ifk_installconf, h1, NULL, h2);
}
// gb installconf2 [hostrange]
if ( strcmp ( cmd , "installconf2" ) == 0 ) {
int h1,h2;
if(!parseOptionalHostRange(cmdarg+1,argc,argv,&h1,&h2))
return 1;
return install(ifk_installconf2, h1, NULL, h2);
}
// gb start [hostId]
if ( strcmp ( cmd , "start" ) == 0 ) {
int h1,h2;
if(!parseOptionalHostRange(cmdarg+1,argc,argv,&h1,&h2))
return 1;
return install(ifk_start, h1, NULL, h2);
}
// gb tmpstart [hostId]
if ( strcmp ( cmd , "tmpstart" ) == 0 ) {
int h1,h2;
if(!parseOptionalHostRange(cmdarg+1,argc,argv,&h1,&h2))
return 1;
return install(ifk_tmpstart, h1, NULL, h2);
}
if ( strcmp ( cmd , "tmpstop" ) == 0 ) {
int h1,h2;
if(!parseOptionalHostRange(cmdarg+1,argc,argv,&h1,&h2))
return 1;
return doCmd("save=1", h1, "master", true, false, h2);
}
if ( strcmp ( cmd , "kstop" ) == 0 ) {
int h1,h2;
if(!parseOptionalHostRange(cmdarg+1,argc,argv,&h1,&h2))
return 1;
return doCmd("save=1", h1, "master", true, false, h2);
}
// gb backupcopy [hostId] <backupSubdirName>
if ( strcmp ( cmd , "backupcopy" ) == 0 ) {
if ( cmdarg + 1 >= argc ) {
printHelp();
return 1;
}
return install( ifk_backupcopy , -1 , argv[cmdarg+1] );
}
// gb backupmove [hostId] <backupSubdirName>
if ( strcmp ( cmd , "backupmove" ) == 0 ) {
if ( cmdarg + 1 >= argc ) {
printHelp();
return 1;
}
return install( ifk_backupmove , -1 , argv[cmdarg+1] );
}
// gb backupmove [hostId] <backupSubdirName>
if ( strcmp ( cmd , "backuprestore" ) == 0 ) {
if ( cmdarg + 1 >= argc ) {
printHelp();
return 1;
}
return install( ifk_backuprestore, -1 , argv[cmdarg+1] );
}
// gb stop [hostId]
if ( strcmp ( cmd , "stop" ) == 0 ) {
int h1,h2;
if(!parseOptionalHostRange(cmdarg+1,argc,argv,&h1,&h2))
return 1;
return doCmd("save=1" , h1 , "master", true, false, h2);
}
// gb save [hostId]
if ( strcmp ( cmd , "save" ) == 0 ) {
int h1,h2;
if(!parseOptionalHostRange(cmdarg+1,argc,argv,&h1,&h2))
return 1;
return doCmd("js=1", h1, "master", true, false, h2);
}
// gb spidersoff [hostId]
if ( strcmp ( cmd , "spidersoff" ) == 0 ) {
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
return doCmd( "se=0", hostId, "master", true, false );
}
// gb spiderson [hostid]
if ( strcmp ( cmd , "spiderson" ) == 0 ) {
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
return doCmd( "se=1", hostId, "master", true, false );
}
// gb cacheoff [hostId]
if ( strcmp ( cmd , "cacheoff" ) == 0 ) {
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
return doCmd( "dpco=1", hostId, "master", true, false );
}
// gb ddump [hostId]
if ( strcmp ( cmd , "ddump" ) == 0 ) {
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
return doCmd( "dump=1", hostId, "master", true, false );
}
// gb pmerge [hostId]
if ( strcmp ( cmd , "pmerge" ) == 0 ) {
int h1,h2;
if(!parseOptionalHostRange(cmdarg+1,argc,argv,&h1,&h2))
return 1;
return doCmd("pmerge=1", h1, "master", true, false, h2);
}
// gb spmerge [hostId]
if ( strcmp ( cmd , "spmerge" ) == 0 ) {
int h1,h2;
if(!parseOptionalHostRange(cmdarg+1,argc,argv,&h1,&h2))
return 1;
return doCmd("spmerge=1", h1, "master", true, false, h2);
}
// gb tmerge [hostId]
if ( strcmp ( cmd , "tmerge" ) == 0 ) {
int h1,h2;
if(!parseOptionalHostRange(cmdarg+1,argc,argv,&h1,&h2))
return 1;
return doCmd("tmerge=1", h1, "master", true, false, h2);
}
// gb merge [hostId]
if ( strcmp ( cmd , "merge" ) == 0 ) {
int h1,h2;
if(!parseOptionalHostRange(cmdarg+1,argc,argv,&h1,&h2))
return 1;
return doCmd("merge=1", h1, "master", true, false, h2);
}
// gb setnote <hostid> <note>
if ( strcmp ( cmd, "setnote" ) == 0 ) {
int32_t hostId;
char *note;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
else return 0;
if ( cmdarg + 2 < argc ) note = argv[cmdarg+2];
else return 0;
char urlnote[1024];
urlEncode(urlnote, 1024, note, strlen(note));
log ( LOG_INIT, "conf: setnote %" PRId32": %s", hostId, urlnote );
char setnoteCmd[1100];
snprintf(setnoteCmd, sizeof(setnoteCmd), "setnote=1&host=%" PRId32"&note=%s",
hostId, urlnote);
return doCmd( setnoteCmd, -1, "admin/hosts", true, false );
}
// gb setsparenote <spareid> <note>
if ( strcmp ( cmd, "setsparenote" ) == 0 ) {
int32_t spareId;
char *note;
if ( cmdarg + 1 < argc ) spareId = atoi ( argv[cmdarg+1] );
else return 0;
if ( cmdarg + 2 < argc ) note = argv[cmdarg+2];
else return 0;
char urlnote[1024];
urlEncode(urlnote, 1024, note, strlen(note));
log(LOG_INIT, "conf: setsparenote %" PRId32": %s", spareId, urlnote);
char setnoteCmd[1100];
snprintf(setnoteCmd, sizeof(setnoteCmd), "setsparenote=1&spare=%" PRId32"&note=%s",
spareId, urlnote);
return doCmd( setnoteCmd, -1, "admin/hosts" , true, false );
}
// gb replacehost <hostid> <spareid>
if ( strcmp ( cmd, "replacehost" ) == 0 ) {
int32_t hostId = -1;
int32_t spareId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
if ( cmdarg + 2 < argc ) spareId = atoi ( argv[cmdarg+2] );
char replaceCmd[256];
sprintf(replaceCmd, "replacehost=1&rhost=%" PRId32"&rspare=%" PRId32,
hostId, spareId);
return doCmd( replaceCmd, -1, "admin/hosts", true, true );
}
// . read in the conf file
// . this now initializes from a dir and hostId; they should all be
// named gbHID.conf
// . now that hosts.conf has more of the burden, all gbHID.conf files
// can be identical
if ( ! g_conf.init ( g_hostdb.m_myHost->m_dir ) ) {
log( LOG_ERROR, "db: Conf init failed." );
return 1;
}
if ( ! g_jobScheduler.initialize(g_conf.m_maxCoordinatorThreads, g_conf.m_maxCpuThreads, g_conf.m_maxSummaryThreads, g_conf.m_maxIOThreads, g_conf.m_maxExternalThreads, g_conf.m_maxFileMetaThreads, g_conf.m_maxMergeThreads, wakeupPollLoop)) {
log( LOG_ERROR, "db: JobScheduler init failed." );
return 1;
}
// put in read only mode
if ( useTmpCluster ) {
g_conf.m_readOnlyMode = true;
}
// init the loop, needs g_conf
if ( ! g_loop.init() ) {
log( LOG_ERROR, "db: Loop init failed." );
return 1;
}
// the new way to save all rdbs and conf
// must call after Loop::init() so it can register its sleep callback
g_process.init();
// . gb dump [dbLetter][coll][fileNum] [numFiles] [includeTree][termId]
// . spiderdb is special:
// gb dump s [coll] [firstIp]
if ( strcmp ( cmd , "dump" ) == 0 ) {
//
// tell Collectiondb, not to verify each rdb's data
//
g_dumpMode = true;
if ( cmdarg+1 >= argc ) {
printHelp();
return 1;
}
int32_t startFileNum = 0;
int32_t numFiles = -1;
bool includeTree = true;
const char *coll = "";
// so we do not log every collection coll.conf we load
g_conf.m_doingCommandLine = true;
// we have to init collection db because we need to know if
// the collnum is legit or not in the tree
if ( ! g_collectiondb.loadAllCollRecs() ) {
log("db: Collectiondb init failed." ); return 1; }
if ( cmdarg+2 < argc ) coll = argv[cmdarg+2];
if ( cmdarg+3 < argc ) startFileNum = atoi(argv[cmdarg+3]);
if ( cmdarg+4 < argc ) numFiles = atoi(argv[cmdarg+4]);
if ( cmdarg+5 < argc ) includeTree = argToBoolean(argv[cmdarg+5]);
if ( argv[cmdarg+1][0] == 't' ) {
int64_t docId = 0LL;
if ( cmdarg+6 < argc ) {
docId = atoll1(argv[cmdarg+6]);
}
dumpTitledb (coll, startFileNum, numFiles, includeTree, docId, false);
}
else if ( argv[cmdarg+1][0] == 'D' ) {
int64_t docId = 0LL;
if ( cmdarg+6 < argc ) {
docId = atoll1(argv[cmdarg+6]);
}
dumpTitledb (coll, startFileNum, numFiles, includeTree, docId, true);
}
else if (strcmp(argv[cmdarg+1], "w") == 0) {
dumpWaitingTree(coll);
}
else if (strcmp(argv[cmdarg+1], "rtc") == 0) {
dumpRobotsTxtCache(coll);
}
else if ( argv[cmdarg+1][0] == 'x' )
dumpDoledb (coll,startFileNum,numFiles,includeTree);
else if (strcmp(argv[cmdarg+1], "s") == 0) {
int32_t firstIp = 0;
if(cmdarg+3 < argc) {
firstIp = atoip(argv[cmdarg + 3]);
}
dumpSpiderdbSqlite(coll, firstIp);
}
else if ( argv[cmdarg+1][0] == 'S' ) {
char *site = NULL;
if ( cmdarg+6 < argc ) {
site = argv[ cmdarg + 6 ];
}
dumpTagdb( coll, startFileNum, numFiles, includeTree, 0, site );
} else if ( argv[cmdarg+1][0] == 'z' ) {
char *site = NULL;
if ( cmdarg+6 < argc ) {
site = argv[ cmdarg + 6 ];
}
dumpTagdb( coll, startFileNum, numFiles, includeTree, 'z', site );
} else if ( argv[cmdarg+1][0] == 'A' ) {
dumpTagdb( coll, startFileNum, numFiles, includeTree, 'A', NULL );
} else if ( argv[cmdarg+1][0] == 'W' ) {
dumpTagdb( coll, startFileNum, numFiles, includeTree, 0, NULL );
} else if ( argv[cmdarg+1][0] == 'l' )
dumpClusterdb (coll,startFileNum,numFiles,includeTree);
else if (strcmp(argv[cmdarg+1], "Lu") == 0) {
char *url = NULL;
if ( cmdarg+6 < argc ) url = argv[cmdarg+6];
dumpLinkdb(coll,startFileNum,numFiles,includeTree,url,true);
}
else if (strcmp(argv[cmdarg+1], "Ls") == 0) {
char *url = NULL;
if ( cmdarg+6 < argc ) url = argv[cmdarg+6];
dumpLinkdb(coll,startFileNum,numFiles,includeTree,url,false);
} else if ( argv[cmdarg+1][0] == 'p' ) {
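// the extra argument may be a numeric termId or a term, optionally in
// "prefix:term" form (e.g. a field prefix); for a term we hash the
// prefix and the term and combine them, which should mirror how posdb
// termIds are formed at index time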
int64_t termId = -1;
if ( cmdarg+6 < argc ) {
char *targ = argv[cmdarg+6];
if ( is_alpha_a(targ[0]) ) {
char *colon = strstr(targ,":");
int64_t prefix64 = 0LL;
if ( colon ) {
*colon = '\0';
prefix64 = hash64n(targ);
targ = colon + 1;
}
// hash the term itself
termId = hash64n(targ);
// hash prefix with termhash
if ( prefix64 )
termId = hash64(termId,prefix64);
termId &= TERMID_MASK;
printf("termId=%ld\n", termId);
}
else {
termId = atoll1(targ);
}
}
dumpPosdb( coll, startFileNum, numFiles, includeTree, termId, false );
} else if (strcmp(argv[cmdarg+1], "u") == 0) {
dumpUnwantedTitledbRecs(coll, startFileNum, numFiles, includeTree);
} else if (strcmp(argv[cmdarg+1], "wt") == 0) {
dumpWantedTitledbRecs(coll, startFileNum, numFiles, includeTree);
} else if (strcmp(argv[cmdarg+1], "at") == 0) {
dumpAdultTitledbRecs(coll, startFileNum, numFiles, includeTree);
} else if (strcmp(argv[cmdarg+1], "st") == 0) {
dumpSpamTitledbRecs(coll, startFileNum, numFiles, includeTree);
} else {
printHelp();
return 1;
}
// disable any further logging so final log msg is clear
g_log.m_disabled = true;
g_collectiondb.reset();
return 0;
}
// gb sitedeftemp prepare|switch [hostrange]
if(strcmp(cmd, "sitedeftemp") == 0) {
int h1,h2;
if(!parseOptionalHostRange(cmdarg+2,argc,argv,&h1,&h2))
return 1;
if(strcmp(argv[cmdarg+1],"prepare")==0)
return doCmd("sitedeftemp=prepare", h1, "master", true, false, h2);
else if(strcmp(argv[cmdarg+1],"switch")==0)
return doCmd("sitedeftemp=switch", h1, "master", true, false, h2);
else {
printHelp();
return 1;
}
}
if(strcmp(cmd, "dumpcsv") == 0) {
g_conf.m_readOnlyMode = true; //we don't need write access
g_conf.m_doingCommandLine = true; // so we do not log every collection coll.conf we load
if( !g_collectiondb.loadAllCollRecs()) {
log("db: Collectiondb init failed.");
return 1;
}
if(cmdarg+2 >= argc) {
printHelp();
return 1;
}
if(argv[cmdarg+1][0] == 's') {
bool interpret_values = argc>cmdarg+3 ? argToBoolean(argv[cmdarg+3]) : false;
dumpSpiderdbSqliteCsv(argv[cmdarg+2],interpret_values);
}
g_log.m_disabled = true;
g_collectiondb.reset();
return 0;
}
if(strcmp(cmd, "convertspiderdb") == 0) {
g_conf.m_doingCommandLine = true; // so we do not log every collection coll.conf we load
if( !g_collectiondb.loadAllCollRecs()) {
log("db: Collectiondb init failed.");
return 1;
}
const char *collname = argc>cmdarg+1 ? argv[cmdarg+1] : "main";
convertSpiderDb(collname);
g_log.m_disabled = true;
g_collectiondb.reset();
return 0;
}
if( strcmp( cmd, "countdomains" ) == 0 && argc >= (cmdarg + 2) ) {
const char *coll = "";
int32_t outpt;
coll = argv[cmdarg+1];
int32_t numRecs;
if(argc>cmdarg+2) {
if(!isdigit(argv[cmdarg+2][0])) {
printHelp();
return 1;
}
numRecs = atoi( argv[cmdarg+2] );
} else
numRecs = 1000000;
if( argc > (cmdarg + 3) ) outpt = atoi( argv[cmdarg+3] );
else outpt = 0;
log( LOG_INFO, "countdomains: Allocated Larger Mem Table for: %" PRId32,
g_mem.getMemTableSize() );
const char *errmsg=NULL;
if (!UnicodeMaps::load_maps(unicode_data_dir,&errmsg)) {
log("Unicode initialization failed! %s", errmsg);
return 1;
}
if(!utf8_convert_initialize()) {
log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
return 1;
}
if ( ! g_collectiondb.loadAllCollRecs() ) {
log("db: Collectiondb init failed." ); return 1; }
countdomains( coll, numRecs, outpt );
g_log.m_disabled = true;
return 0;
}
if(!load_lemma_lexicon()) {
log(LOG_WARN,"db: could not load lemma lexicon");
//but not fatal
}
// file creation test, make sure we have dir control
if ( checkDirPerms ( g_hostdb.m_dir ) < 0 ) {
return 1;
}
// . make sure we have critical files
if ( ! g_process.checkFiles ( g_hostdb.m_dir ) ) {
return 1;
}
g_errno = 0;
// make sure port is available, no use loading everything up then
// failing because another process is already running using this port
if ( ! g_httpServer.m_tcp.testBind(g_hostdb.getMyHost()->getInternalHttpPort(), true)) {
// return 0 so keep alive bash loop exits
exit(0);
}
int32_t *ips;
log("db: Logging to file %s.", g_hostdb.m_logFilename );
if ( ! g_conf.m_runAsDaemon )
log("db: Use 'gb -d' to run as daemon. Example: gb -d");
// start up log file
if ( ! g_log.init( g_hostdb.m_logFilename ) ) {
fprintf (stderr,"db: Log file init failed. Exiting.\n" );
return 1;
}
g_log.m_logTimestamps = true;
g_log.m_logReadableTimestamps = true; // @todo BR: Should be configurable..
// in case we do not have one, we need it for Images.cpp
if ( ! makeTrashDir() ) {
fprintf (stderr,"db: failed to make trash dir. Exiting.\n" );
return 1;
}
g_errno = 0;
//
// run as daemon now
//
//fprintf(stderr,"running as daemon\n");
if ( g_conf.m_runAsDaemon ) {
pid_t pid, sid;
pid = fork();
if ( pid < 0 ) exit(EXIT_FAILURE);
// seems like we core unless parent sets this to NULL.
// it does not affect the child.
//if ( pid > 0 ) g_hostdb.m_myHost = NULL;
// child gets a 0, parent gets the child's pid, so exit
if ( pid > 0 ) exit(EXIT_SUCCESS);
// change file mode mask
umask(0);
sid = setsid();
if ( sid < 0 ) exit(EXIT_FAILURE);
//fprintf(stderr,"done\n");
// if we do not do this we don't get sigalarms or quickpolls
// when running as 'gb -d'
g_loop.init();
}
// we register log rotation here because it's after g_loop is initialized
g_log.registerLogRotation();
// log the version
log(LOG_INIT,"conf: Gigablast Version : %s", getVersion());
log(LOG_INIT,"conf: Gigablast Architecture : %d-bit", arch);
log(LOG_INIT,"conf: Gigablast Build config : %s", getBuildConfig());
log(LOG_INIT,"conf: Gigablast Git commit : %s", getCommitId());
// show current working dir
log("host: Working directory is %s",workingDir);
log("host: Using %shosts.conf",g_hostdb.m_dir);
{
pid_t pid = getpid();
log("host: Process ID is %" PRIu64,(int64_t)pid);
}
// from Hostdb.cpp
ips = getLocalIps();
for ( ; ips && *ips ; ips++ ) {
char ipbuf[16];
log("host: Detected local ip %s",iptoa(*ips,ipbuf));
}
// show it
log("host: Running as host id #%" PRId32,g_hostdb.m_myHostId );
const char *errmsg=NULL;
if (!UnicodeMaps::load_maps(unicode_data_dir,&errmsg)) {
log( LOG_ERROR, "Unicode initialization failed! %s", errmsg);
return 1;
}
if(!utf8_convert_initialize()) {
log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
return 1;
}
// the wiktionary for lang identification and alternate word forms/
// synonyms
if ( ! g_wiktionary.load() ) {
log( LOG_ERROR, "Wiktionary initialization failed!" );
return 1;
}
if ( ! g_wiktionary.test() ) {
log( LOG_ERROR, "Wiktionary test failed!" );
return 1;
}
WordVariationGenerator::set_log_function(wvg_log_function);
log(LOG_DEBUG,"main: initializing word variations: Danish");
if(!initializeWordVariationGenerator_Danish()) {
log(LOG_WARN, "word-variation-danish initialization failed" );
//but not fatal
}
log(LOG_DEBUG,"main: initialized word variations: Danish");
// the wiki titles
if ( ! g_wiki.load() ) {
log( LOG_ERROR, "Wiki initialization failed!" );
return 1;
}
// shout out if we're in read only mode
if ( g_conf.m_readOnlyMode )
log("db: -- Read Only Mode Set. Can Not Add New Data. --");
if (!Rdb::initializeRdbDumpThread()) {
logError("Unable to initialize rdb dump thread");
return 1;
}
// . collectiondb does not use rdb; it loads directly from disk
// . do this up here so RdbTree::fixTree_unlocked() can fix RdbTree::m_collnums
if ( ! g_collectiondb.loadAllCollRecs() ) {
log( LOG_ERROR, "db: Collectiondb load failed." );
return 1;
}
if(!initialiseAllPrimaryRdbs())
return 1;
// the spider cache used by SpiderLoop
if ( ! g_spiderCache.init() ) {
log( LOG_ERROR, "db: SpiderCache init failed." );
return 1;
}
// now clean the trees since all rdbs have loaded their rdb trees
// from disk. we need to remove bogus collection data from the trees,
// e.g. if a collection was deleted but the tree was never saved right,
// it'll still have the collection's data in it
if ( ! g_collectiondb.addRdbBaseToAllRdbsForEachCollRec ( ) ) {
log("db: Collectiondb init failed." );
_exit(1);
}
// make sure we have spiderdb sqlite if we still have spiderdb rdb files
for (collnum_t collNum = g_collectiondb.getFirstCollnum(); collNum < g_collectiondb.getNumRecs(); ++collNum) {
CollectionRec *collRec = g_collectiondb.getRec(collNum);
if (collRec != nullptr) {
RdbBase *base = collRec->getBase(RDB_SPIDERDB_DEPRECATED);
if (base->getNumFiles() != 0 && !g_spiderdb_sqlite.existDb(collNum)) {
// has rdb files but no sqlite file
log(LOG_ERROR, "Found spiderdb rdb files but no spiderdb sqlite files.");
log(LOG_ERROR, "Run ./gb convertspiderdb before starting up gb instances");
gbshutdownCorrupted();
}
}
}
// initialize country languages
CountryLanguage::init();
//Load the high-frequency term shortcuts (if they exist)
g_hfts.load();
//Load the page temperature
g_pageTemperatureRegistry.load();
//load docid->flags/sitehash map
g_d2fasm.load();
//load sitehash32->default page temperature
g_smptr.open();
// load block lists
g_dnsBlockList.init();
g_contentTypeBlockList.init();
g_ipBlockList.init();
g_contentRetryProxyList.init();
g_urlBlackList.init();
g_urlWhiteList.init();
g_urlProxyList.init();
g_urlRetryProxyList.init();
g_robotsCheckList.init();
g_robotsBlockedResultOverride.init();
g_urlResultOverride.init();
// Initialize adult detection
g_checkAdultList.init("adultwords.txt", "adultphrases.txt");
// Initialize spam detection
g_checkSpamList.init("spamphrases.txt");
if(!ExplicitKeywords::initialize()) {
log(LOG_ERROR,"Could not initialize explicit keywords file");
//but otherwise carry on
}
// initialize generate global index thread
if (!RdbBase::initializeGlobalIndexThread()) {
logError("Unable to initialize global index thread");
_exit(1);
}
if (!Msg4In::initializeIncomingThread()) {
logError("Unable to initialize Msg4 incoming thread");
_exit(1);
}
// test all collection dirs for write permission
int32_t pcount = 0;
for ( int32_t i = 0 ; i < g_collectiondb.getNumRecs(); i++ ) {
const CollectionRec *cr = g_collectiondb.getRec(i);
if ( ! cr ) continue;
if ( ++pcount >= 100 ) {
log("rdb: not checking directory permission for more than first 100 collections to save time.");
break;
}
char tt[1024 + MAX_COLL_LEN ];
sprintf ( tt , "%scoll.%s.%" PRId32, g_hostdb.m_dir, cr->m_coll , (int32_t)cr->m_collnum );
checkDirPerms ( tt ) ;
}
//
// NOTE: ANYTHING THAT USES THE PARSER SHOULD GO BELOW HERE, UCINIT!
//
// load the appropriate dictionaries
if ( ! g_speller.init() && g_conf.m_isLive ) {
_exit(1);
}
// Load the country code table
g_countryCode.loadHashTable();
// init minsitenuminlinks buffer
if ( ! g_tagdb.loadMinSiteInlinksBuffer() ) {
log("db: failed to load sitelinks.txt data");
_exit(1);
}
// . then our main udp server
// . must pass defaults since g_dns uses its own port/instance of it
// . server should listen to a socket and register with g_loop
// . read/write buf sizes and poll time are given in the call below
// . if the read/write bufs are too small it severely degrades
// transmission times for big messages. just use ACK_WINDOW *
// MAX_DGRAM_SIZE as the size so when sending you don't drop dgrams
if ( ! g_udpServer.init( g_hostdb.getMyPort() ,&g_dp,
40000000 , // readBufSize
20000000 , // writeBufSize
20 , // pollTime in ms
g_conf.m_udpMaxSockets , // max udp slots
false )){ // is dns?
log("db: UdpServer init failed." ); return 1; }
// start up repair loop
if ( ! g_repair.init() ) {
log("db: Repair init failed." ); return 1; }
// start up daily merge loop
if ( ! g_dailyMerge.init() ) {
log("db: Daily merge init failed." ); return 1; }
// . then dns Distributed client
// . server should listen to a socket and register with g_loop
// . Only the distributed cache shall call the dns server.
if ( ! g_dns.init( g_hostdb.m_myHost->m_dnsClientPort ) ) {
log("db: Dns distributed client init failed." ); return 1; }
// initialize dns client library
if (!GbDns::initialize()) {
log(LOG_ERROR, "Unable to initialize dns client");
_exit(1);
}
g_stable_summary_cache.configure(g_conf.m_stableSummaryCacheMaxAge, g_conf.m_stableSummaryCacheSize);
g_unstable_summary_cache.configure(g_conf.m_unstableSummaryCacheMaxAge, g_conf.m_unstableSummaryCacheSize);
// . then webserver
// . server should listen to a socket and register with g_loop
if ( ! g_httpServer.init( g_hostdb.m_myHost->getInternalHttpPort(), g_hostdb.m_myHost->getInternalHttpsPort() ) ) {
log("db: HttpServer init failed. Another gb already running?" );
// this is dangerous!!! do not do the shutdown thing
_exit(1);
}
// . now register all msg handlers with g_udp server
if ( ! registerMsgHandlers() ) {
log("db: registerMsgHandlers failed" ); return 1; }
// gb dictLookupTest
if ( strcmp ( cmd , "dictlookuptest" ) == 0 ) {
if ( argc != cmdarg + 2 ) {
printHelp();
return 1;
}
g_speller.dictLookupTest ( argv[cmdarg + 1] );
_exit(0);
}
if(cmd[0] && cmd[0]!='-') {
log(LOG_ERROR, "Unknown command: '%s'", cmd);
_exit(1);
}
// . register a callback to try to merge everything every 60 seconds
// . do not exit if we couldn't do this, not a huge deal
// . put this in here instead of Rdb.cpp because we don't want generator commands merging on us
// . niceness is 1
// BR: Upped from 2 sec to 60. No need to check for merge every 2 seconds.
if (!g_loop.registerSleepCallback(60000, NULL, attemptMergeAllCallback, "Rdb::attemptMergeAllCallback", 1)) {
log( LOG_WARN, "db: Failed to init merge sleep callback." );
}
// try to sync parms (and collection recs) with host 0
if (!g_loop.registerSleepCallback(1000, NULL, Parms::tryToSyncWrapper, "Parms::tryToSyncWrapper", 0)) {
return 0;
}
if ( !Statistics::initialize() ) {
return 0;
}
// initialize clients
g_urlRealtimeClassification.initialize();
g_queryLanguage.initialize();
g_siteNumInlinks.initialize();
g_siteMedianPageTemperature.initialize();
if(!WantedChecker::initialize())
return 0;
if(!InstanceInfoExchange::initialize())
return 0;
// initialize doc process
if (!g_docDelete.init()) {
logError("Unable to initialize doc delete");
return 0;
}
if (!g_docDeleteUrl.init()) {
logError("Unable to initialize doc delete url");
return 0;
}
if (!g_docRebuild.init()) {
logError("Unable to initialize doc rebuild");
return 0;
}
if (!g_docRebuildUrl.init()) {
logError("Unable to initialize doc rebuild url");
return 0;
}
if (!g_docReindex.init()) {
logError("Unable to initialize doc reindex");
return 0;
}
if (!g_docReindexUrl.init()) {
logError("Unable to initialize doc reindex url");
return 0;
}
// . start the spiderloop
// . comment out when testing SpiderCache
g_spiderLoop.init();
// allow saving of conf again
g_conf.m_save = true;
if(g_conf.m_mlockAllCurrent || g_conf.m_mlockAllFuture) {
log(LOG_DEBUG,"Locking memory");
int rc;
if(g_conf.m_mlockAllCurrent && g_conf.m_mlockAllFuture)
rc = mlockall(MCL_CURRENT|MCL_FUTURE);
else if(g_conf.m_mlockAllCurrent)
rc = mlockall(MCL_CURRENT);
else //if(g_conf.m_mlockAllFuture) //doesn't make a lot of sense to me
rc = mlockall(MCL_FUTURE);
if(rc!=0)
log(LOG_WARN, "mlockall() failed with errno=%d (%s)", errno, mstrerror(errno));
}
log("db: gb is now ready");
// . now start g_loop's main interrupt handling loop
// . it should block forever
// . when it gets a signal it dispatches to a server or db to handle it
g_loop.runLoop();
}
static void printHelp() {
SafeBuf sb;
sb.safePrintf(
"\n"
"Usage: gb [CMD]\n");
sb.safePrintf(
"\n"
"\tgb will first try to load "
"the hosts.conf in the same directory as the "
"gb binary. "
"Then it will determine its hostId based on "
"the directory and IP address listed in the "
"hosts.conf file it loaded. Things in []'s "
"are optional.");
sb.safePrintf(
" [CMD] can have the following values:\n\n"
"-h\tPrint this help.\n\n"
"-v\tPrint version and exit.\n\n"
//"<hostId>\n"
//"\tstart the gb process for this <hostId> locally."
//" <hostId> is 0 to run as host #0, for instance."
//"\n\n"
//"<hostId> -d\n\trun as daemon.\n\n"
"-d\tRun as daemon.\n\n"
//"-o\tprint the overview documentation in HTML. "
//"Contains the format of hosts.conf.\n\n"
// "<hostId> -r\n\tindicates recovery mode, "
// "sends email to addresses "
// "specified in Conf.h upon startup.\n\n"
// "-r\tindicates recovery mode, "
// "sends email to addresses "
// "specified in Conf.h upon startup.\n\n"
"start [hostId]\n"
"\tStart the gb process on all hosts or just on "
"[hostId], if specified, using an ssh command. Runs "
"each gb process in a keepalive loop under bash.\n\n"
"start <hostId1-hostId2>\n"
"\tLike above but just start gb on the supplied "
"range of hostIds.\n\n"
"stop [hostId]\n"
"\tSaves and exits for all gb hosts or "
"just on [hostId], if specified.\n\n"
"stop <hostId1-hostId2>\n"
"\tTell gb to save and exit on the given range of "
"hostIds.\n\n"
"save [hostId]\n"
"\tJust saves for all gb hosts or "
"just on [hostId], if specified.\n\n"
/*
"tmpstart [hostId]\n"
"\tstart the gb process on all hosts or just on "
"[hostId] if specified, but "
"use the ports specified in hosts.conf PLUS one. "
"Then you can switch the "
"proxy over to point to those and upgrade the "
"original cluster's gb. "
"That can be done in the Master Controls of the "
"proxy using the 'use "
"temporary cluster'. Also, this assumes the binary "
"name is tmpgb not gb.\n\n"
"tmpstop [hostId]\n"
"\tsaves and exits for all gb hosts or "
"just on [hostId] if specified, for the "
"tmpstart command.\n\n"
*/
"spidersoff [hostId]\n"
"\tDisables spidering for all gb hosts or "
"just on [hostId], if specified.\n\n"
"spiderson [hostId]\n"
"\tEnables spidering for all gb hosts or "
"just on [hostId], if specified.\n\n"
/*
"cacheoff [hostId]\n"
"\tdisables all disk PAGE caches on all hosts or "
"just on [hostId] if specified.\n\n"
"freecache [maxShmid]\n"
"\tfinds and frees all shared memory up to shmid "
"maxShmid, default is 3000000.\n\n"
*/
/*
"ddump [hostId]\n"
"\tdump all b-trees in memory to sorted files on "
"disk. "
"Will likely trigger merges on files on disk. "
"Restrict to just host [hostId] if given.\n\n"
*/
/*
"pmerge [hostId|hostId1-hostId2]\n"
"\tforce merge of posdb files "
"just on [hostId] if specified.\n\n"
"smerge [hostId|hostId1-hostId2]\n"
"\tforce merge of sectiondb files "
"just on [hostId] if specified.\n\n"
"tmerge [hostId|hostId1-hostId2]\n"
"\tforce merge of titledb files "
"just on [hostId] if specified.\n\n"
"merge [hostId|hostId1-hostId2]\n"
"\tforce merge of all rdb files "
"just on [hostId] if specified.\n\n"
*/
"dsh <CMD>\n"
"\tRun this command on the primary IPs of "
"all active hosts in hosts.conf. It will be "
"executed in the gigablast working directory on "
"each host. Example: "
"gb dsh 'ps auxw; uptime'\n\n"
/*
"dsh2 <CMD>\n"
"\trun this command on the secondary IPs of "
"all active hosts in hosts.conf. Example: "
"gb dsh2 'ps auxw; uptime'\n\n"
*/
"install [hostId]\n"
"\tInstall all required files for gb from "
"current working directory of the gb binary "
"to [hostId]. If no [hostId] is specified, install "
"to ALL hosts.\n\n"
/*
"install2 [hostId]\n"
"\tlike above, but use the secondary IPs in the "
"hosts.conf.\n\n"
*/
"installgb [hostId]\n"
"\tLike above, but install just the gb executable.\n\n"
"installfile <file>\n"
"\tInstalls the specified file on all hosts\n\n"
/*
"installtmpgb [hostId]\n"
"\tlike above, but install just the gb executable "
"as tmpgb (for tmpstart).\n\n"
*/
"installconf [hostId]\n"
"\tlike above, but install hosts.conf and gb.conf\n\n"
/*
"installconf2 [hostId]\n"
"\tlike above, but install hosts.conf and gbN.conf "
"to the secondary IPs.\n\n"
"backupcopy <backupSubdir>\n"
"\tsave a copy of all xml, config, data and map files "
"into <backupSubdir> which is relative "
"to the working dir. Done for all hosts.\n\n"
"backupmove <backupSubdir>\n"
"\tmove all all xml, config, data and map files "
"into <backupSubdir> which is relative "
"to the working dir. Done for all hosts.\n\n"
"backuprestore <backupSubdir>\n"
"\tmove all all xml, config, data and map files "
"in <backupSubdir>, which is relative "
"to the working dir, into the working dir. "
"Will NOT overwrite anything. Done for all "
"hosts.\n\n"
"proxy start [proxyId]\n"
"\tStart a proxy that acts as a frontend to gb "
"and passes on requests to random machines on "
"the cluster given in hosts.conf. Helps to "
"distribute the load evenly across all machines.\n\n"
"proxy load <proxyId>\n"
"\tStart a proxy process directly without calling "
"ssh. Called by 'gb proxy start'.\n\n"
"proxy stop [proxyId]\n"
"\tStop a proxy that acts as a frontend to gb.\n\n"
*/
/*
"dictlookuptest <file>\n"
"\tgets the popularities of the entries in the "
"<file>. Used to only check performance of "
"getPhrasePopularity.\n\n"
// less common things
"gendict <coll> [numWordsToDump]\n\tgenerate "
"dictionary used for spellchecker "
"from titledb files in collection <coll>. Use "
"first [numWordsToDump] words.\n\n"
//"update\tupdate titledb0001.dat\n\n"
"treetest\n\ttree insertion speed test\n\n"
"hashtest\n\tadd and delete into hashtable test\n\n"
"parsetest <docIdToTest> [coll] [query]\n\t"
"parser speed tests\n\n"
*/
/*
// Quality Tests
"countdomains <coll> <X>\n"
"\tCounts the domains and IPs in collection coll and "
"in the first X titledb records. Results are sorted"
"by popularity and stored in the log file. \n\n"
"cachetest\n\t"
"cache stability and speed tests\n\n"
"dump e <coll> <UTCtimestamp>\n\tdump all events "
"as if the time is UTCtimestamp.\n\n"
"dump es <coll> <UTCtimestamp>\n\tdump stats for "
"all events as if the time is UTCtimestamp.\n\n"
*/
"dump <db> <collection> <fileNum> <numFiles> <includeTree> [other stuff]\n\tDump a db from disk. "
"Example: gb dump t main\n"
"\t<collection> is the name of the collection.\n\n"
"\tclusterdb:\n"
"\t\tdump l <collection> <fileNum> <numFiles> <includeTree>\n"
"\tdoledb:\n"
"\t\tdump x <collection> <fileNum> <numFiles> <includeTree>\n"
"\tlinkdb (site):\n"
"\t\tdump Ls <collection> <fileNum> <numFiles> <includeTree> <url>\n"
"\tlinkdb (url):\n"
"\t\tdump Lu <collection> <fileNum> <numFiles> <includeTree> <url>\n"
"\tposdb (the index):\n"
"\t\tdump p <collection> <fileNum> <numFiles> <includeTree> <term-or-termId>\n"
"\tspiderdb:\n"
"\t\tdump s <collection> <firstIp>\n"
"\ttagdb:\n"
"\t\tdump S <collection> <fileNum> <numFiles> <includeTree> <site>\n"
"\ttagdb (for wget):\n"
"\t\tdump W <collection> <fileNum> <numFiles> <includeTree> <term-or-termId>\n"
"\ttagdb (make sitelist.txt):\n"
"\t\tdump z <collection> <fileNum> <numFiles> <includeTree> <site>\n"
"\ttagdb (output HTTP commands for adding tags):\n"
"\t\tdump A <collection> <fileNum> <numFiles> <includeTree> <term-or-termId>\n"
"\ttitledb:\n"
"\t\tdump t <collection> <fileNum> <numFiles> <includeTree> <docId>\n"
"\ttitledb (Unwanted documents, checked against blocklist, plugins):\n"
"\t\tdump u <collection> <fileNum> <numFiles> <includeTree>\n"
"\ttitledb (Wanted documents, checked against blocklist, plugins):\n"
"\t\tdump wt <collection> <fileNum> <numFiles> <includeTree>\n"
"\ttitledb (duplicates only):\n"
"\t\tdump at <collection> <fileNum> <numFiles> <includeTree>\n"
"\ttitledb (Adult titlerecs):\n"
"\t\tdump st <collection> <fileNum> <numFiles> <includeTree>\n"
"\ttitledb (Spam titlerecs):\n"
"\t\tdump D <collection> <fileNum> <numFiles> <includeTree> <docId>\n"
"\twaiting tree:\n"
"\t\tdump w <collection>\n"
"\trobots.txt.cache:\n"
"\t\tdump rtc <url>\n"
"\n"
"sitedeftemp\n"
"\tPrepares or switches to a new site-default-page-temperature generation.\n"
"\tsitedeftemp prepare\n"
"\t\tPrepares a new site-default-page-temperature generation\n"
"\tsitedeftemp switch\n"
"\t\tSwitches to a new site-default-page-temperature generation previously prepared with 'sitedeftemp prepare'\n"
"\n"
);
//word-wrap to screen width, if known
struct winsize w;
if(ioctl(STDOUT_FILENO,TIOCGWINSZ,&w)==0 && w.ws_col>0) {
SafeBuf sb2;
sb2.brify2(sb.getBufStart(), w.ws_col, "\n\t", false);
sb2.safeMemcpy("",1);
fprintf(stdout,"%s",sb2.getBufStart());
} else
fprintf(stdout,"%s",sb.getBufStart());
// disable printing of used memory
//g_mem.m_used = 0;
}
/// @todo ALC wouldn't it be faster to actually check the dir permission instead of trying to write a tmp file?
int32_t checkDirPerms(const char *dir) {
if ( g_conf.m_readOnlyMode ) {
return 0;
}
File f;
f.set ( dir , "tmpfile" );
if ( ! f.open ( O_RDWR | O_CREAT | O_TRUNC ) ) {
log( LOG_ERROR, "disk: Unable to create %stmpfile. Need write permission in this directory.", dir );
return -1;
}
if ( ! f.unlink() ) {
log( LOG_ERROR, "disk: Unable to delete %stmpfile. Need write permission in this directory.", dir );
return -1;
}
return 0;
}
static bool argToBoolean(const char *arg) {
return strcmp(arg,"1")==0 ||
strcmp(arg,"true")==0;
}
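// parse an optional host range at argv[rangearg]:
// "5-12" -> *h1=5,*h2=12; "5" -> *h1=5,*h2=-1 (that host only);
// absent -> *h1=*h2=-1 (all hosts). returns false on a malformed range.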
static bool parseOptionalHostRange(int rangearg, int argc, char **argv, int *h1, int *h2) {
if(rangearg < argc) {
int n = sscanf(argv[rangearg],"%u-%u", h1, h2);
if(n==0) {
fprintf(stderr,"Unrecognized host range: '%s'\n", argv[rangearg]);
return false;
} else if(n==1) {
*h2 = -1;
} else if(*h2<*h1) {
fprintf(stderr,"host2<host1 in host range: '%s'\n", argv[rangearg]);
return false;
}
} else {
*h1 = -1;
*h2 = -1;
}
return true;
}
// broadcast a command to the selected hosts/proxies
static void doCmdAll ( int fd, void *state ) ;
static bool s_sendToHosts;
static bool s_sendToProxies;
static int32_t s_hostId;
static int32_t s_hostId2;
static char s_buffer[2048];
static HttpRequest s_r;
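// doCmd() builds a fake HTTP request (marked as coming from local loopback
// so it passes the permission check in convertHttpRequestToParmList()),
// registers doCmdAll() as a one-shot sleep callback and enters the event
// loop. doCmdAll() converts the request into a parm list and broadcasts it
// over UDP; doneCmdAll() exits the process once the broadcast completes,
// so g_loop.runLoop() never returns here.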
bool doCmd ( const char *cmd , int32_t hostId , const char *filename ,
bool sendToHosts , bool sendToProxies , int32_t hostId2 ) {
//so we don't suppress messages to dead hosts (we're not connected to vagus)
g_conf.m_doingCommandLine = true;
// need loop to work
if ( ! g_loop.init() ) {
log(LOG_WARN, "db: Loop init failed." );
return false;
}
// pass it on
s_hostId = hostId;
s_sendToHosts = sendToHosts;
s_sendToProxies = sendToProxies;
s_hostId2 = hostId2;
// set stuff so http server client-side works right
g_conf.m_httpMaxSockets = 512;
sprintf ( g_conf.m_spiderUserAgent ,"GigablastOpenSource/1.0");
sprintf ( g_conf.m_spiderBotName ,"gigablastopensource");
// register sleep callback to get started
if (!g_loop.registerSleepCallback(1, NULL, doCmdAll, "doCmdAll", 0)) {
log(LOG_WARN, "admin: Loop init failed.");
return false;
}
// note it
log(LOG_INFO,"admin: broadcasting %s",cmd);
// make a fake http request
snprintf ( s_buffer , sizeof(s_buffer) , "GET /%s?%s HTTP/1.0" , filename , cmd );
TcpSocket sock;
// make it local loopback so it passes the permission test in
// doCmdAll()'s call to convertHttpRequestToParmList
sock.m_ip = atoip("127.0.0.1");
s_r.set ( s_buffer , strlen ( s_buffer ) , &sock );
// do not do sig alarms! for now just set this to null so
// the sigalarmhandler doesn't core
//g_hostdb.m_myHost = NULL;
// run the loop
g_loop.runLoop();
}
[[ noreturn ]] void doneCmdAll ( void *state ) {
log("cmd: completed command");
exit ( 0 );
}
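// one-shot sleep callback: brings up a client-side UdpServer, converts the
// fake HTTP request into a parm list and broadcasts it to the selected hosts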
void doCmdAll ( int fd, void *state ) {
// do not keep calling it!
g_loop.unregisterSleepCallback ( NULL, doCmdAll );
// listen on an arbitrary port (18123); we only need the client side here
if ( ! g_udpServer.init( 18123 , // port to listen on
&g_dp,
20000000 , // readBufSize
20000000 , // writeBufSize
20 , // pollTime in ms
3500 , // max udp slots
false )){ // is dns?
log("db: UdpServer init on port 18123 failed: %s" ,
mstrerror(g_errno));
exit(0);
}
// udpserver::sendRequest() checks we have a handle for msgs we send!
// so fake it out with this lest it cores
Parms::registerHandler3f();
SafeBuf parmList;
// returns false and sets g_errno on error
if (!g_parms.convertHttpRequestToParmList(&s_r,&parmList,0,NULL)){
log("cmd: error converting command: %s",mstrerror(g_errno));
exit(0);
}
if ( parmList.length() <= 0 ) {
log("cmd: no parmlist to send");
exit(0);
}
// restrict broadcast to this hostid range!
// returns true with g_errno set on error. uses g_udpServer
if ( g_parms.broadcastParmList ( &parmList ,
NULL ,
doneCmdAll , // callback when done
s_sendToHosts ,
s_sendToProxies ,
s_hostId , // -1 means all
s_hostId2 ) ) { // -1 means all
log("cmd: error sending command: %s",mstrerror(g_errno));
exit(0);
}
// wait for it
log("cmd: sent command");
}
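// copy src_file to dst_host:dst_file via scp (-p preserves timestamps);
// returns the system() exit status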
static int install_file(const char *dst_host, const char *src_file, const char *dst_file)
{
char cmd[1024];
sprintf(cmd, "scp -p %s %s:%s", src_file, dst_host, dst_file);
log(LOG_INIT,"admin: %s", cmd);
int rc = system(cmd);
return rc;
}
static int install_file(const char *file, int32_t hostId, int32_t hostId2) {
// use hostId2 to indicate the range hostId-hostId2, but if it is -1
// then it was not given, so restrict to just hostId
if ( hostId2 == -1 ) {
hostId2 = hostId;
}
for (int32_t i = 0; i < g_hostdb.getNumHosts(); i++) {
Host *h2 = g_hostdb.getHost(i);
if (h2 == g_hostdb.getMyHost()) {
continue; //skip ourselves
}
// if doing a range of hostid, hostId2 is >= 0
if (hostId >= 0 && hostId2 >= 0) {
if (h2->m_hostId < hostId || h2->m_hostId > hostId2) {
continue;
}
}
char full_dst_file[1024];
sprintf(full_dst_file, "%s%s", h2->m_dir, file);
char ipbuf[16];
install_file(iptoa(h2->m_ip, ipbuf), file, full_dst_file);
}
return 0; //return value is unclear
}
// installFlag selects what to install or run; it must be a member of the
// ifk_ enum defined above
static int install ( install_flag_konst_t installFlag, int32_t hostId, char *dir, int32_t hostId2, char *cmd ) {
// use hostId2 to indicate the range hostId-hostId2, but if it is -1
// then it was not given, so restrict to just hostId
if ( hostId2 == -1 ) {
hostId2 = hostId;
}
char tmp[1024];
if ( installFlag == ifk_proxy_start ) {
for ( int32_t i = 0; i < g_hostdb.m_numProxyHosts; i++ ) {
Host *h2 = g_hostdb.getProxy(i);
// limit install to this hostId if it is >= 0
if ( hostId >= 0 && h2->m_hostId != hostId ) continue;
// . assume conf file name gbHID.conf
// . assume working dir ends in a '/'
//to test add: ulimit -t 10; to the ssh cmd
char ipbuf[16];
sprintf(tmp,
"ssh %s \"cd %s ; "
"export MALLOC_CHECK_=0;"
"cp -f gb gb.oldsave ; "
"mv -f gb.installed gb ; "
"ADDARGS='' ; "
"EXITSTATUS=1 ; "
"while [ \\$EXITSTATUS != 0 ]; do "
"{ "
"./gb proxy load %" PRId32" " // mdw
"\\$ADDARGS "
" >& ./proxylog ;"
"EXITSTATUS=\\$? ; "
"ADDARGS='-r' ; "
"} "
"done >& /dev/null & \" & ",
iptoa(h2->m_ip,ipbuf),
h2->m_dir ,
h2->m_hostId );
// log it
log(LOG_INIT,"admin: %s", tmp);
// execute it
int32_t ret = system ( tmp );
if ( ret < 0 ) {
fprintf(stderr,"Error loading proxy: %s\n",
mstrerror(errno));
exit(-1);
}
fprintf(stderr,"If proxy does not start, make sure "
"its ip is correct in hosts.conf\n");
}
return 0;
}
HashTableX iptab;
char tmpBuf[2048];
iptab.set(4,4,64,tmpBuf,2048,true,"iptsu");
int32_t maxOut = 500;
// this is a big scp so only do two at a time...
if ( installFlag == ifk_install ) maxOut = 1;
if ( installFlag == ifk_installgb ) maxOut = 4;
// go through each host
for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
Host *h2 = g_hostdb.getHost(i);
char ipbuf[16];
const char *amp = " ";
// if (i+1) is NOT a multiple of maxOut then background with '&'.
// even across different machines (IPs) scp chokes, and so does rcp
// a little, so restrict to maxOut transfers at a time.
if ( (i+1) % maxOut ) {
amp = "&";
}
// if doing a range of hostid, hostId2 is >= 0
if ( hostId >= 0 && hostId2 >= 0 ) {
if ( h2->m_hostId < hostId || h2->m_hostId > hostId2 )
continue;
}
// backupcopy
if ( installFlag == ifk_backupcopy ) {
sprintf(tmp,
"ssh %s \"cd %s ; "
"mkdir %s ; "
"cp -ai *.dat* *.map gb.conf "
"hosts.conf %s\" &",
iptoa(h2->m_ip,ipbuf), h2->m_dir , dir , dir );
// log it
log ( "%s", tmp);
// execute it
system ( tmp );
continue;
}
// backupmove
else if ( installFlag == ifk_backupmove ) {
sprintf(tmp,
"ssh %s \"cd %s ; "
"mkdir %s ; "
"mv -i *.dat* *.map "
"%s\" &",
iptoa(h2->m_ip,ipbuf), h2->m_dir , dir , dir );
// log it
log ( "%s", tmp);
// execute it
system ( tmp );
continue;
}
// backuprestore
else if ( installFlag == ifk_backuprestore ) {
sprintf(tmp,
"ssh %s \"cd %s ; cd %s ; "
"mv -i *.dat* *.map gb.conf "
"hosts.conf %s\" &",
iptoa(h2->m_ip,ipbuf), h2->m_dir , dir , h2->m_dir );
// log it
log ( "%s", tmp);
// execute it
system ( tmp );
continue;
}
const char *dir = "./";
// install to it
if ( installFlag == ifk_install ) {
const char *srcDir = "./";
SafeBuf fileListBuf;
g_process.getFilesToCopy ( srcDir , &fileListBuf );
fileListBuf.safePrintf(" %shosts.conf",srcDir);
fileListBuf.safePrintf(" %sgb.conf",srcDir);
iptoa(h2->m_ip,ipbuf);
SafeBuf tmpBuf;
tmpBuf.safePrintf(
// ensure directory is there, if
// not then make it
"ssh %s 'mkdir -p %s' ; "
"scp -p -r %s %s:%s"
, ipbuf
, h2->m_dir
, fileListBuf.getBufStart()
, ipbuf
, h2->m_dir
);
char *tmp = tmpBuf.getBufStart();
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
}
else if ( installFlag == ifk_installgb ) {
File f;
const char *target = "gb.new";
f.set(g_hostdb.m_myHost->m_dir,target);
if ( ! f.doesExist() ) target = "gb";
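// prefer a freshly built gb.new if present, otherwise fall back to
// the current gb binary; either way it lands remotely as gb.installed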
sprintf(tmp,
"scp -p " // blowfish is faster
"%s%s "
"%s:%s/gb.installed%s",
dir,
target,
iptoa(h2->m_ip,ipbuf),
h2->m_dir,
amp);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
}
else if ( installFlag == ifk_installtmpgb ) {
sprintf(tmp,
"scp -p "
"%sgb.new "
"%s:%s/tmpgb.installed &",
dir,
iptoa(h2->m_ip,ipbuf),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
}
else if ( installFlag == ifk_installconf ) {
sprintf(tmp,
"scp -p %sgb.conf %shosts.conf %s:%s %s",
dir ,
dir ,
iptoa(h2->m_ip,ipbuf),
h2->m_dir,
amp);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
}
// start up a dummy cluster using hosts.conf ports + 1
else if ( installFlag == ifk_tmpstart ) {
// . assume conf file name gbHID.conf
// . assume working dir ends in a '/'
sprintf(tmp,
"ssh %s \"cd %s ; "
"cp -f tmpgb tmpgb.oldsave ; "
"mv -f tmpgb.installed tmpgb ; "
"%s/tmpgb tmpstarthost "
"%" PRId32" >& ./tmplog%03" PRId32" &\" &",
iptoa(h2->m_ip,ipbuf),
h2->m_dir ,
h2->m_dir ,
h2->m_hostId ,
h2->m_hostId );
// log it
log(LOG_INIT,"admin: %s", tmp);
// execute it
system ( tmp );
}
else if ( installFlag == ifk_start ) {
sprintf( tmp, "ssh %s '%sgbstart.sh %" PRId32"' %s", iptoa(h2->m_ip,ipbuf), h2->m_dir, h2->m_hostId, amp );
// log it
fprintf(stdout,"admin: %s\n", tmp);
// execute it
system ( tmp );
}
// dsh
else if ( installFlag == ifk_dsh ) {
sprintf(tmp,
"ssh %s 'cd %s ; %s' %s",
iptoa(h2->m_ip,ipbuf),
h2->m_dir,
cmd ,
amp );
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
}
// dsh2
else if ( installFlag == ifk_dsh2 ) {
sprintf(tmp,
"ssh %s 'cd %s ; %s'",
iptoa(h2->m_ip,ipbuf),
h2->m_dir,
cmd );
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
}
// installconf2
else if ( installFlag == ifk_installconf2 ) {
sprintf(tmp,
"rcp %sgb.conf %shosts.conf %shosts2.conf "
"%s:%s &",
dir ,
dir ,
dir ,
iptoa(h2->m_ipShotgun,ipbuf),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
}
}
// return 0 on success
return 0;
}
static bool registerMsgHandlers() {
if (! registerMsgHandlers1()) return false;
if (! registerMsgHandlers2()) return false;
// in SpiderProxy.cpp...
initSpiderProxyStuff();
return true;
}
static bool registerMsgHandlers1() {
if ( ! Msg20::registerHandler()) return false;
if ( ! MsgC::registerHandler()) return false;
if ( ! Msg22::registerHandler() ) return false;
return true;
}
static bool registerMsgHandlers2() {
if ( ! Msg0::registerHandler()) return false;
if ( ! Msg13::registerHandler() ) return false;
if ( ! Msg39::registerHandler()) return false;
if ( ! Msg4In::registerHandler() ) return false;
if ( ! Msg4::initializeOutHandling() ) return false;
if(! Parms::registerHandler3e()) return false;
if(! Parms::registerHandler3f()) return false;
if ( ! g_udpServer.registerHandler(msg_type_25,handleRequest25)) return false;
if ( ! g_udpServer.registerHandler(msg_type_7,handleRequest7)) return false;
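// msg_type_25 (link info) and msg_type_7 (inject; see PageInject.h)
// use free-standing handler functions, so register them directly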
return true;
}
#include "Rdb.h"
#include "Xml.h"
//
// dump routines here now
//
void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree,
int64_t docid , bool justPrintDups) {
if(startFileNum!=0 && numFiles<0) {
//this may apply to all files, but I haven't checked into hash-based ones yet
fprintf(stderr,"If <startFileNum> is specified then <numFiles> must be too\n");
return;
}
const char *errmsg=NULL;
if (!UnicodeMaps::load_maps(unicode_data_dir,&errmsg)) {
log("Unicode initialization failed! %s", errmsg);
return;
}
if(!utf8_convert_initialize()) {
log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
return;
}
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." ); return ; }
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
//g_conf.m_spiderdbMaxDiskPageCacheMem = 0;
g_titledb.init ();
//g_collectiondb.init(true);
g_titledb.getRdb()->addRdbBase1(coll);
key96_t startKey ;
key96_t endKey ;
key96_t lastKey ;
startKey.setMin();
endKey.setMax();
lastKey.setMin();
startKey = Titledb::makeFirstKey ( docid );
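// titledb keys lead with the docid, so a nonzero docid positions the
// scan at that document's rec; docid 0 starts at the beginning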
Msg5 msg5;
RdbList list;
int64_t prevId = 0LL;
int32_t count = 0;
char ttt[2048+MAX_URL_LEN];
HashTableX dedupTable;
dedupTable.set(4,0,10000,NULL,0,false,"maintitledb");
//g_synonyms.init();
// load the appropriate dictionaries -- why???
//g_speller.init();
// make this
XmlDoc *xd;
try { xd = new (XmlDoc); }
catch(std::bad_alloc&) {
fprintf(stdout,"could not alloc for xmldoc\n");
exit(-1);
}
const CollectionRec *cr = g_collectiondb.getRec(coll);
if(cr==NULL) {
fprintf(stderr,"Unknown collection '%s'\n", coll);
return;
}
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TITLEDB ,
cr->m_collnum ,
&list ,
&startKey ,
&endKey ,
commandLineDumpdbRecSize,
includeTree ,
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1 , // maxRetries
false)) // isRealMerge
{
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) return;
// loop over entries in list
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key96_t k = list.getCurrentKey();
char *rec = list.getCurrentRec();
int32_t recSize = list.getCurrentRecSize();
int64_t docId = Titledb::getDocIdFromKey ( &k );
if ( k <= lastKey )
log("key out of order. "
"lastKey.n1=%" PRIx32" n0=%" PRIx64" "
"currKey.n1=%" PRIx32" n0=%" PRIx64" ",
lastKey.n1,lastKey.n0,
k.n1,k.n0);
lastKey = k;
int32_t shard = g_hostdb.getShardNum ( RDB_TITLEDB , &k );
// print deletes
if ( (k.n0 & 0x01) == 0) {
fprintf(stdout,"n1=%08" PRIx32" n0=%016" PRIx64" docId=%012" PRId64" "
"shard=%" PRId32" (del)\n",
k.n1 , k.n0 , docId , shard );
continue;
}
// free the mem
xd->reset();
// uncompress the title rec
//TitleRec tr;
if (!xd->set2(rec, recSize, coll, 0)) {
//set2() may have logged something but not the docid
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
continue;
}
// extract the url
Url *u = xd->getFirstUrl();
//int32_t nc = xd->size_catIds / 4;//tr.getNumCatids();
if ( justPrintDups ) {
// print into buf
if ( docId != prevId ) {
time_t ts = xd->m_spideredTime;//tr.getSpiderDate()
struct tm tm_buf;
struct tm *timeStruct = localtime_r(&ts,&tm_buf);
//struct tm *timeStruct = gmtime_r(&ts,&tm_buf);
char ppp[100];
strftime(ppp,100,"%b-%d-%Y-%H:%M:%S",
timeStruct);
LinkInfo *info = xd->ptr_linkInfo1;//tr.getLinkInfo()
char foo[1024];
foo[0] = '\0';
//if ( tr.getVersion() >= 86 )
sprintf(foo,
//"tw=%" PRId32" hw=%" PRId32" upw=%" PRId32" "
"sni=%" PRId32" ",
//(int32_t)xd->m_titleWeight,
//(int32_t)xd->m_headerWeight,
//(int32_t)xd->m_urlPathWeight,
(int32_t)xd->m_siteNumInlinks);
const char *ru = xd->ptr_redirUrl;
if ( ! ru ) ru = "";
char ipbuf2[16];
sprintf(ttt,
"n1=%08" PRIx32" n0=%016" PRIx64" docId=%012" PRId64" "
//hh=%07" PRIx32" ch=%08" PRIx32" "
"size=%07" PRId32" "
"ch32=%010" PRIu32" "
"clen=%07" PRId32" "
"cs=%04d "
"lang=%02d "
"sni=%03" PRId32" "
"lastspidered=%s "
"ip=%s "
"numLinkTexts=%04" PRId32" "
"%s"
"version=%02" PRId32" "
//"maxLinkTextWeight=%06" PRIu32"%% "
"redir=%s "
"url=%s "
"firstdup=1 "
"shard=%" PRId32" "
"\n",
k.n1 , k.n0 ,
//rec[0] ,
docId ,
//hostHash ,
//contentHash ,
recSize - 16 ,
(uint32_t)xd->m_contentHash32,
xd->size_utf8Content,//tr.getContentLen
xd->m_charset,//tr.getCharset(),
xd->m_langId,//tr.getLanguage(),
(int32_t)xd->m_siteNumInlinks,//tr.getDocQuality()
//nc,
ppp,
iptoa(xd->m_ip,ipbuf2),
info->getNumGoodInlinks(),
foo,
(int32_t)xd->m_version,
//ms,
ru,
u->getUrl() ,
shard );
prevId = docId;
count = 0;
continue;
}
// print previous docid that is same as our
if ( count++ == 0 ) printf ( "\n%s" , ttt );
}
// nice, this is never 0 for a titlerec, so we can use 0 to signal
// that the following bytes are not compressed, and we can store
// our special checksum vector there for fuzzy deduping.
//if ( rec[0] != 0 ) continue;
// print it out
//printf("n1=%08" PRIx32" n0=%016" PRIx64" b=0x%02hhx docId=%012" PRId64" sh=%07" PRIx32" ch=%08" PRIx32" "
// date indexed as local time, not GMT/UTC
time_t ts = xd->m_spideredTime;//tr.getSpiderDate();
struct tm tm_buf;
struct tm *timeStruct = localtime_r(&ts,&tm_buf);
//struct tm *timeStruct = gmtime_r(&ts,&tm_buf);
char ppp[100];
strftime(ppp,100,"%b-%d-%Y-%H:%M:%S",timeStruct);
LinkInfo *info = xd->ptr_linkInfo1;//tr.getLinkInfo();
char foo[1024];
foo[0] = '\0';
sprintf(foo,
"sni=%" PRId32" ",
(int32_t)xd->m_siteNumInlinks);
const char *ru = xd->ptr_redirUrl;
if ( ! ru ) ru = "";
char ipbuf2[16];
fprintf(stdout,
"n1=%08" PRIx32" n0=%016" PRIx64" docId=%012" PRId64" "
"size=%07" PRId32" "
"ch32=%010" PRIu32" "
"clen=%07" PRId32" "
"cs=%04d "
"ctype=%s "
"lang=%02d "
"sni=%03" PRId32" "
"lastspidered=%s "
"ip=%s "
"numLinkTexts=%04" PRId32" "
"%s"
"version=%02" PRId32" "
"shard=%" PRId32" "
"metadatasize=%" PRId32" "
"redir=%s "
"url=%s\n",
k.n1 , k.n0 ,
docId ,
recSize - 16 ,
(uint32_t)xd->m_contentHash32,
xd->size_utf8Content,//tr.getContentLen() ,
xd->m_charset,//tr.getCharset(),
g_contentTypeStrings[xd->m_contentType],
xd->m_langId,//tr.getLanguage(),
(int32_t)xd->m_siteNumInlinks,//tr.getDocQuality(),
ppp,
iptoa(xd->m_ip,ipbuf2),
info->getNumGoodInlinks(),
foo,
(int32_t)xd->m_version,
shard,
0,
ru,
u->getUrl() );
// free the mem
xd->reset();
}
startKey = *(key96_t *)list.getLastKey();
startKey++;
// watch out for wrap around
if ( startKey < *(key96_t *)list.getLastKey() ) return;
}
}
void dumpWaitingTree (const char *coll ) {
RdbTree wt;
if (!wt.set(0, -1, 20000000, true, "waittree2", "waitingtree", sizeof(key96_t))) {
return;
}
collnum_t collnum = g_collectiondb.getCollnum ( coll );
// make dir
char dir[500];
sprintf(dir, "%scoll.%s.%" PRId32, g_hostdb.m_dir, coll, (int32_t)collnum);
// load in the waiting tree, IPs waiting to get into doledb
BigFile file;
file.set(dir, "waitingtree-saved.dat");
bool treeExists = file.doesExist() > 0;
// load the tree from that file
RdbMem wm;
if ( treeExists && !wt.fastLoad(&file, &wm) ) return;
ScopedLock sl(wt.getLock());
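// waiting-tree key layout (key96_t), as decoded below:
//   n1            = upper 32 bits of the spider time in ms
//   n0 bits 63-32 = lower 32 bits of the spider time in ms
//   n0 bits 31-0  = firstIp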
// walk the waiting tree
for (int32_t node = wt.getFirstNode_unlocked(); node >= 0; node = wt.getNextNode_unlocked(node)) {
// get key
const key96_t *key = reinterpret_cast<const key96_t*>(wt.getKey_unlocked(node));
// get ip from that
int32_t firstIp = (key->n0) & 0xffffffff;
// get the time
uint64_t spiderTimeMS = key->n1;
// shift up
spiderTimeMS <<= 32;
// or in
spiderTimeMS |= (key->n0 >> 32);
// get the rest of the data
char ipbuf[16];
time_t now_t = spiderTimeMS/1000;
struct tm tm_buf;
struct tm *stm = gmtime_r(&now_t,&tm_buf);
fprintf(stdout,"time=%" PRIu64" (%04d-%02d-%02dT%02d:%02d:%02d.%03dZ) firstip=%s\n", spiderTimeMS, stm->tm_year+1900,stm->tm_mon+1,stm->tm_mday,stm->tm_hour,stm->tm_min,stm->tm_sec,(int)(spiderTimeMS%1000), iptoa(firstIp,ipbuf));
}
}
void dumpDoledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree){
g_doledb.init ();
g_doledb.getRdb()->addRdbBase1(coll );
key96_t startKey ;
key96_t endKey ;
startKey.setMin();
endKey.setMax();
Msg5 msg5;
RdbList list;
key96_t oldk; oldk.setMin();
const CollectionRec *cr = g_collectiondb.getRec(coll);
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_DOLEDB ,
cr->m_collnum ,
&list ,
&startKey ,
&endKey ,
commandLineDumpdbRecSize,
includeTree ,
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1, // maxRetries
false)) // isRealMerge
{
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) return;
// loop over entries in list
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key96_t k = list.getCurrentKey();
if ( oldk > k )
fprintf(stdout,"got bad key order. "
"%" PRIx32"/%" PRIx64" > %" PRIx32"/%" PRIx64"\n",
oldk.n1,oldk.n0,k.n1,k.n0);
oldk = k;
// get it
const char *drec = list.getCurrentRec();
// sanity check
if ( (drec[0] & 0x01) == 0x00 ) {g_process.shutdownAbort(true); }
// get spider rec in it
const char *srec = drec + 12 + 4;
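// a doledb rec is the 12-byte doledb key, a 4-byte dataSize, then
// the embedded spiderdb request record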
struct tm *timeStruct ;
char time[256];
time_t ts = (time_t)Doledb::getSpiderTime(&k);
struct tm tm_buf;
timeStruct = gmtime_r(&ts,&tm_buf);
strftime ( time , 256 , "%Y%m%d-%H%M%S UTC", timeStruct );
// print doledb info first then spider request
fprintf(stdout,"dolekey=%s (n1=%" PRIu32" n0=%" PRIu64") "
"pri=%" PRId32" "
"spidertime=%s(%" PRIu32") "
"uh48=0x%" PRIx64"\n",
KEYSTR(&k,12),
k.n1,
k.n0,
(int32_t)Doledb::getPriority(&k),
time,
(uint32_t)Doledb::getSpiderTime(&k),
Doledb::getUrlHash48(&k));
fprintf(stdout,"spiderkey=");
// print it
Spiderdb::print ( srec );
// the \n
printf("\n");
// must be a request -- for now, for stats
if ( ! Spiderdb::isSpiderRequest((key128_t *)srec) ) {
// error!
continue;
}
// cast it
const SpiderRequest *sreq = (const SpiderRequest *)srec;
// skip negatives
if ( (sreq->m_key.n0 & 0x01) == 0x00 ) { g_process.shutdownAbort(true); }
}
startKey = *(key96_t *)list.getLastKey();
startKey++;
// watch out for wrap around
if ( startKey < *(key96_t *)list.getLastKey() ) return;
}
}
void dumpRobotsTxtCache(const char *url) {
struct HttpCacheData {
int32_t m_errno;
char *ptr_reply;
int32_t size_reply;
} __attribute__((packed));
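// this mirrors the serialized cache entry: m_errno followed by a
// ptr_reply/size_reply pair, deserialized with deserializeMsg() below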
if( !url || strlen(url) <= 0 ) {
fprintf(stdout, "robots.txt.cache lookup failed, you must supply a url as parameter\n");
return;
}
// Generate robots.txt url
Url u;
u.set(url);
// build robots.txt url
char urlRobots[MAX_URL_LEN+1];
char *p = urlRobots;
if ( ! u.getScheme() )
{
p += sprintf ( p , "http://" );
}
else
{
memcpy ( p , u.getScheme() , u.getSchemeLen() );
p += u.getSchemeLen();
p += sprintf(p,"://");
}
memcpy ( p , u.getHost() , u.getHostLen() );
p += u.getHostLen();
// add port if not default
if ( u.getPort() != u.getDefaultPort() ) {
p += sprintf( p, ":%" PRId32, u.getPort() );
}
p += sprintf ( p , "/robots.txt" );
fprintf(stdout, "robots.txt.cache lookup of %s\n", urlRobots);
RdbCache httpCacheRobots;
int32_t memRobots = 3000000;
int32_t maxCacheNodesRobots = memRobots / 106;
if ( ! httpCacheRobots.init ( memRobots ,
-1 , // fixedDataSize
maxCacheNodesRobots ,
"robots.txt" , // dbname
true, // load from disk
12, // cachekeysize
-1)) { // numPtrsMax
fprintf(stdout, "Could not initialize local robots.txt.cache\n");
return;
}
int32_t numElem = httpCacheRobots.getNumUsedNodes();
fprintf(stdout,"%" PRId32 " elements in cache.\n", numElem);
char *rec;
int32_t recSize;
key96_t k;
k.n1 = 0;
k.n0 = hash64(urlRobots, strlen(urlRobots));
k.n0 ^= 0xff; // for compressed keys
int64_t uh48 = k.n0 & 0x0000ffffffffffffLL;
fprintf(stdout, "Cache key=%" PRIu64 ", uh48=%" PRIu64 "\n", k.n0, uh48);
bool inCache = httpCacheRobots.getRecord ( (collnum_t)0 , // share btwn colls
k , // cacheKey
&rec ,
&recSize ,
true , // copy?
9999999, //r->m_maxCacheAge , // 24*60*60 ,
false); // stats?
fprintf(stdout, "Found: %s\n", inCache?"true":"false");
if( inCache ) {
HttpCacheData *httpCacheData = reinterpret_cast<HttpCacheData*>(rec);
if( deserializeMsg(sizeof(*httpCacheData), &httpCacheData->size_reply, &httpCacheData->size_reply, &httpCacheData->ptr_reply, ((char*)httpCacheData + sizeof(*httpCacheData))) != -1) {
fprintf(stdout, "deserializeMsg OK. errno=%" PRId32 ", size_reply=%" PRId32 "\n", httpCacheData->m_errno, httpCacheData->size_reply);
// get uncompressed size
uint32_t unzippedLen = *(int32_t*)httpCacheData->ptr_reply;
// sanity checks
if ( unzippedLen > 10000000 ) {
fprintf(stdout, "Unzipped length appears too big: %" PRId32 "\n", unzippedLen);
return;
}
// make buffer to hold uncompressed data
char *newBuf = (char*)mmalloc(unzippedLen, "DumpUnzip");
if( ! newBuf ) {
fprintf(stdout, "Could not allocate memory for uncompressed document: %" PRId32 "\n", unzippedLen);
return;
}
// make another var to get mangled by gbuncompress
uint32_t uncompressedLen = unzippedLen;
// uncompress it
int zipErr = gbuncompress( (unsigned char*)newBuf, // dst
&uncompressedLen, // dstLen
(unsigned char*)httpCacheData->ptr_reply+4, // src
httpCacheData->size_reply-4); // srcLen
if(zipErr != Z_OK || uncompressedLen != (uint32_t)unzippedLen) {
fprintf(stdout, "Error unzipping compressed robots.txt unzipped len should be %" PRId32" but is %" PRId32". ziperr=%" PRId32,
(int32_t)uncompressedLen, (int32_t)unzippedLen, (int32_t)zipErr);
mfree(newBuf, unzippedLen, "DumpUnzip");
return;
}
fprintf(stdout,"\n%s\n\n", newBuf);
mfree(newBuf, unzippedLen, "DumpUnzip");
}
else {
fprintf(stderr,"deserialize failed\n");
}
}
}
#if 0
static int32_t dumpSpiderdbCsv(const char *coll) {
g_spiderdb.init();
g_spiderdb.getRdb()->addRdbBase1(coll);
key128_t startKey;
startKey.setMin();
Msg5 msg5;
RdbList list;
unsigned count = 0;
const SpiderReply *prevSpiderReply = NULL;
char prevSpiderReplyBuf[sizeof(SpiderReply)+MAX_URL_LEN+100];
int64_t prevSpiderReplyUrlHash48 = 0LL;
int64_t prevRequestUh48 = 0;
const CollectionRec *cr = g_collectiondb.getRec(coll);
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if( ! msg5.getList(RDB_SPIDERDB,
cr->m_collnum,
&list,
&startKey,
KEYMAX(),
commandLineDumpdbRecSize,
true, //includeTree
0, //startFileNum
-1, //numFiles
NULL, // state
NULL, // callback
0, // niceness
false, // err correction?
-1, // maxRetries
false)) // isRealMerge
{
log(LOG_LOGIC,"db: getList did not block.");
return -1;
}
// all done if empty
if(list.isEmpty())
break;
// loop over entries in list
for(list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord()) {
count++;
if((count % 100000) == 0) {
fprintf( stderr, "Processed %u records.\n", count - 1);
}
const char *srec = list.getCurrentRec();
if(Spiderdb::isSpiderReply((const key128_t *)srec)) {
const SpiderReply *srep = reinterpret_cast<const SpiderReply *>(srec);
prevSpiderReplyUrlHash48 = srep->getUrlHash48();
prevSpiderReply = srep;
} else if(prevRequestUh48==Spiderdb::getUrlHash48(reinterpret_cast<const key128_t*>(srec))) {
//skip duplicate
} else {
const SpiderRequest *spiderRequest = reinterpret_cast<const SpiderRequest*>(srec);
int64_t uh48 = spiderRequest->getUrlHash48();
// count how many requests had replies and how many did not
bool hadReply = prevSpiderReply && (uh48 == prevSpiderReplyUrlHash48);
if( !hadReply ) {
// Last reply and current request do not belong together
prevSpiderReply = NULL;
}
prevRequestUh48 = spiderRequest->getUrlHash48();
// print it
printf("%u,",spiderRequest->m_firstIp);
printf("%lu,",spiderRequest->getUrlHash48());
printf("%u,",spiderRequest->m_hostHash32);
printf("%u,",spiderRequest->m_domHash32);
printf("%u,",spiderRequest->m_siteHash32);
printf("%d,",spiderRequest->m_siteNumInlinks);
printf("%d,",spiderRequest->m_addedTime);
printf("%d,",spiderRequest->m_pageNumInlinks);
printf("%d,",spiderRequest->m_sameErrCount);
printf("%d,",spiderRequest->m_version);
printf("%d,",spiderRequest->m_discoveryTime);
printf("%d,",spiderRequest->m_contentHash32);
printf("%d,",spiderRequest->m_hopCount);
printf("%d,",spiderRequest->m_recycleContent);
printf("%d,",spiderRequest->m_hopCountValid);
printf("%d,",spiderRequest->m_isAddUrl);
printf("%d,",spiderRequest->m_isPageReindex);
printf("%d,",spiderRequest->m_isUrlCanonical);
printf("%d,",spiderRequest->m_isPageParser);
printf("%d,",spiderRequest->m_urlIsDocId);
printf("%d,",spiderRequest->m_isRSSExt);
printf("%d,",spiderRequest->m_isUrlPermalinkFormat);
printf("%d,",spiderRequest->m_forceDelete);
printf("%d,",spiderRequest->m_isInjecting);
printf("%d,",spiderRequest->m_hadReply);
printf("%d,",spiderRequest->m_fakeFirstIp);
printf("%d,",spiderRequest->m_hasAuthorityInlink);
printf("%d,",spiderRequest->m_hasAuthorityInlinkValid);
printf("%d,",spiderRequest->m_siteNumInlinksValid);
printf("%d,",spiderRequest->m_avoidSpiderLinks);
printf("%d,",spiderRequest->m_ufn);
printf("%d,",spiderRequest->m_priority);
printf("%d,",spiderRequest->m_errCount);
printf("\"%s\",",spiderRequest->m_url);
if(prevSpiderReply) {
// Only dump these values if last reply and current request belong together
//printf("%d,",prevSpiderReply->m_firstIp);
//printf("%d,",prevSpiderReply->m_siteHash32);
//printf("%d,",prevSpiderReply->m_domHash32);
printf("%f,",prevSpiderReply->m_percentChangedPerDay);
printf("%d,",prevSpiderReply->m_spideredTime);
printf("%d,",prevSpiderReply->m_errCode);
//printf("%d,",prevSpiderReply->m_siteNumInlinks);
//printf("%d,",prevSpiderReply->m_sameErrCount);
//printf("%d,",prevSpiderReply->m_version);
//printf("%d,",prevSpiderReply->m_contentHash32);
printf("%d,",prevSpiderReply->m_crawlDelayMS);
printf("%ld,",prevSpiderReply->m_downloadEndTime);
printf("%d,",prevSpiderReply->m_httpStatus);
//printf("%d,",prevSpiderReply->m_errCount);
printf("%d,",prevSpiderReply->m_langId);
printf("%d,",prevSpiderReply->m_isRSS);
printf("%d,",prevSpiderReply->m_isPermalink);
printf("%d,",prevSpiderReply->m_isIndexed);
printf("%d,",prevSpiderReply->m_isIndexedINValid);
printf("%d,",prevSpiderReply->m_fromInjectionRequest);
} else {
printf(",,,,,,,,,,,");
}
printf("\n");
}
}
//copy prevSpiderReply to a tmp buf so we can remember the value for the next list
if(prevSpiderReply && sizeof(key128_t)+prevSpiderReply->m_dataSize < sizeof(prevSpiderReplyBuf)) {
memcpy(prevSpiderReplyBuf, prevSpiderReply, sizeof(key128_t)+prevSpiderReply->m_dataSize);
prevSpiderReply = reinterpret_cast<const SpiderReply*>(prevSpiderReplyBuf);
} else
prevSpiderReply = NULL;
const key128_t *listLastKey = reinterpret_cast<const key128_t *>(list.getLastKey());
startKey = *listLastKey;
startKey++;
// watch out for wrap around
if ( startKey < *listLastKey)
break;
}
return 0;
}
#endif
// time speed of inserts and deletes into a HashTable
static bool hashtest() {
// load em up
int32_t numKeys = 1000000;
log("db: speedtest: generating %" PRId32" random keys.",numKeys);
// seed randomizer
srand ( (int32_t)gettimeofdayInMilliseconds() );
// make list of one million random keys
key96_t *k = (key96_t *)mmalloc ( sizeof(key96_t) * numKeys , "main" );
if ( ! k ) {
log(LOG_WARN, "hashtest: malloc failed");
return false;
}
int32_t *r = (int32_t *)(void*)k;
for ( int32_t i = 0 ; i < numKeys * 3 ; i++ ) r[i] = rand();
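// fill the keys as raw 32-bit ints: three rand() values per
// 12-byte key96_t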
// init the tree
//HashTableT<int32_t,int32_t> ht;
HashTable ht;
ht.set ( (int32_t)(1.1 * numKeys) );
// add to regular tree
int64_t t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < numKeys ; i++ )
if ( ! ht.addKey ( r[i] , 1 ) ) {
log(LOG_WARN, "hashtest: add key failed.");
return false;
}
// print time it took
int64_t e = gettimeofdayInMilliseconds();
// add times
log("db: added %" PRId32" keys in %" PRId64" ms",numKeys,e - t);
// do the delete test
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < numKeys ; i++ )
if ( ! ht.removeKey ( r[i] ) ) {
log(LOG_WARN, "hashtest: add key failed.");
return false;
}
// print time it took
e = gettimeofdayInMilliseconds();
// add times
log("db: deleted %" PRId32" keys in %" PRId64" ms",numKeys,e - t);
return true;
}
static void dumpTagdb(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, char req,
const char *siteArg) {
g_tagdb.init ();
g_tagdb.getRdb()->addRdbBase1(coll );
key128_t startKey ;
key128_t endKey ;
startKey.setMin();
endKey.setMax();
if ( siteArg ) {
startKey = Tagdb::makeStartKey ( siteArg );
endKey = Tagdb::makeEndKey ( siteArg );
log("gb: using site %s for start key",siteArg );
}
Msg5 msg5;
RdbList list;
const CollectionRec *cr = g_collectiondb.getRec(coll);
int64_t hostHash = -1;
int64_t lastHostHash = -2;
const char *site = NULL;
char sbuf[1024*2];
int32_t siteNumInlinks = -1;
int32_t typeSite = hash64Lower_a("site",4);
int32_t typeInlinks = hash64Lower_a("sitenuminlinks",14);
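// tag type ids are (truncated) hashes of the lowercased tag names,
// compared against tag->m_type below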
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TAGDB,
cr->m_collnum ,
&list ,
(char *)&startKey ,
(char *)&endKey ,
commandLineDumpdbRecSize,
includeTree ,
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1, // maxRetries
false)) // isRealMerge
{
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) return;
// loop over entries in list
for(list.resetListPtr();!list.isExhausted(); list.skipCurrentRecord()){
char *rec = list.getCurrentRec();
//key96_t k = list.getCurrentKey();
key128_t k;
list.getCurrentKey ( &k );
char *data = list.getCurrentData();
int32_t size = list.getCurrentDataSize();
// is it a delete?
if ( (k.n0 & 0x01) == 0 ) {
if ( req == 'z' ) continue;
printf("k.n1=%016" PRIx64" "
"k.n0=%016" PRIx64" (delete)\n",
k.n1 , k.n0 | 0x01 ); // fix it!
continue;
}
// point to the data
const char *p = data;
const char *pend = data + size;
// breach check
if ( p >= pend ) {
printf("corrupt tagdb rec k.n0=%" PRIu64,k.n0);
continue;
}
// parse it up
Tag *tag = (Tag *)rec;
// print the version and site
StackBuf<1024> sb;
bool match = false;
hostHash = tag->m_key.n1;
if ( hostHash == lastHostHash ) {
match = true;
}
else {
site = NULL;
siteNumInlinks = -1;
}
lastHostHash = hostHash;
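// 'z' mode builds a sitelist: records for the same host share
// m_key.n1, so pair each "site" tag with its "sitenuminlinks" tag
// and print "<inlinks> <site>" lines (filtered below)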
// making sitelist.txt?
if ( tag->m_type == typeSite && req == 'z' ) {
site = tag->getTagData();
// make it null if too many .'s
if ( site ) {
const char *p = site;
int count = 0;
int alpha = 0;
int colons = 0;
// foo.bar.baz.com is ok
for ( ; *p ; p++ ) {
if ( *p == '.' ) count++;
if ( *p == ':' ) colons++;
if ( is_alpha_a(*p) || *p=='-' )
alpha++;
}
if ( count >= 4 )
site = NULL;
if ( colons > 1 )
site = NULL;
// no ip addresses allowed, need an alpha char
if ( alpha == 0 )
site = NULL;
}
// ends in :?
int slen = 0;
if ( site ) slen = strlen(site);
if ( site && slen > 0 && site[slen-1] == ':' )
site = NULL;
// port bug ("host:/" suffix)
if ( site && slen >= 2 && site[slen-2] == ':' && site[slen-1]=='/')
site = NULL;
// remove heavy spammers to save space
if ( site && strstr(site,"daily-camshow-report") )
site = NULL;
if ( site && strstr(site,".livejasminhd.") )
site = NULL;
if ( site && strstr(site,".pornlivenews.") )
site = NULL;
if ( site && strstr(site,".isapornblog.") )
site = NULL;
if ( site && strstr(site,".teen-model-24.") )
site = NULL;
if ( site && ! is_ascii2_a ( site, strlen(site) ) ) {
site = NULL;
continue;
}
if ( match && siteNumInlinks>=0) {
// if we ask for 1 or 2 we end up with 100M
// entries, but with 3+ we get 27M
if ( siteNumInlinks > 2 && site )
printf("%i %s\n",siteNumInlinks,site);
siteNumInlinks = -1;
site = NULL;
}
// save it
if ( site ) strcpy ( sbuf , site );
continue;
}
if ( tag->m_type == typeInlinks && req == 'z' ) {
siteNumInlinks = atoi(tag->getTagData());
if ( match && site ) {
// if we ask for 1 or 2 we end up with 100M
// entries, but with 3+ we get 27M
if ( siteNumInlinks > 2 )
printf("%i %s\n",siteNumInlinks,sbuf);
siteNumInlinks = -1;
site = NULL;
}
continue;
}
if ( req == 'z' )
continue;
// print as an add request or just normal
if ( req == 'A' ) tag->printToBufAsAddRequest ( &sb );
else tag->printToBuf ( &sb );
// dump it
printf("%s\n",sb.getBufStart());
}
startKey = *(key128_t *)list.getLastKey();
startKey++;
// watch out for wrap around
if ( startKey < *(key128_t *)list.getLastKey() ){
printf("\n"); return;}
}
}
static void dumpUnwantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree) {
if(startFileNum!=0 && numFiles<0) {
//this may apply to all files, but I haven't checked into hash-based ones yet
fprintf(stderr,"If <startFileNum> is specified then <numFiles> must be too\n");
return;
}
const char *errmsg=NULL;
if (!UnicodeMaps::load_maps(unicode_data_dir,&errmsg)) {
log("Unicode initialization failed! %s", errmsg);
return;
}
if(!utf8_convert_initialize()) {
log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
return;
}
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." );
return;
}
g_titledb.init ();
g_titledb.getRdb()->addRdbBase1(coll);
key96_t startKey ;
key96_t endKey ;
key96_t lastKey ;
startKey.setMin();
endKey.setMax();
lastKey.setMin();
startKey = Titledb::makeFirstKey(0);
Msg5 msg5;
RdbList list;
HashTableX dedupTable;
dedupTable.set(4,0,10000,NULL,0,false,"maintitledb");
// make this
XmlDoc *xd;
try {
xd = new (XmlDoc);
}
catch(std::bad_alloc&) {
fprintf(stdout,"could not alloc for xmldoc\n");
exit(-1);
}
const CollectionRec *cr = g_collectiondb.getRec(coll);
if(cr==NULL) {
fprintf(stderr,"Unknown collection '%s'\n", coll);
return;
}
// initialize shlib & blacklist
if (!WantedChecker::initialize()) {
fprintf(stderr, "Unable to initialize WantedChecker");
return;
}
g_urlBlackList.init();
g_urlWhiteList.init();
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TITLEDB ,
cr->m_collnum ,
&list ,
&startKey ,
&endKey ,
commandLineDumpdbRecSize,
includeTree ,
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1 , // maxRetries
false)) // isRealMerge
{
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) {
return;
}
// loop over entries in list
for(list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord()) {
key96_t k = list.getCurrentKey();
char *rec = list.getCurrentRec();
int32_t recSize = list.getCurrentRecSize();
int64_t docId = Titledb::getDocIdFromKey(&k);
if ( k <= lastKey ) {
log("key out of order. lastKey.n1=%" PRIx32" n0=%" PRIx64" currKey.n1=%" PRIx32" n0=%" PRIx64" ",
lastKey.n1, lastKey.n0, k.n1, k.n0);
}
lastKey = k;
if ( (k.n0 & 0x01) == 0) {
// delete key
continue;
}
// free the mem
xd->reset();
// uncompress the title rec
if (!xd->set2(rec, recSize, coll, 0)) {
//set2() may have logged something but not the docid
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
continue;
}
// extract the url
Url *url = xd->getFirstUrl();
const char *reason = NULL;
if (isUrlUnwanted(*url, &reason)) {
fprintf(stdout, "%" PRId64"|%s|%s\n", docId, reason, url->getUrl());
continue;
}
Url **redirUrlPtr = xd->getRedirUrl();
if (redirUrlPtr && *redirUrlPtr) {
Url *redirUrl = *redirUrlPtr;
if (isUrlUnwanted(*redirUrl, &reason)) {
fprintf(stdout, "%" PRId64"|redir %s|%s|%s\n", docId, reason, url->getUrl(), redirUrl->getUrl());
continue;
}
}
uint8_t *contentType = xd->getContentType();
switch (*contentType) {
case CT_GIF:
case CT_JPG:
case CT_PNG:
case CT_TIFF:
case CT_BMP:
case CT_JS:
case CT_CSS:
case CT_JSON:
case CT_IMAGE:
case CT_GZ:
case CT_ARC:
case CT_WARC:
fprintf(stdout, "%" PRId64"|blocked content type|%s\n", docId, url->getUrl());
continue;
default:
break;
}
// check content
int32_t contentLen = xd->size_utf8Content > 0 ? (xd->size_utf8Content - 1) : 0;
if (contentLen > 0) {
if (!WantedChecker::check_single_content(url->getUrl(), xd->ptr_utf8Content, contentLen).wanted) {
fprintf(stdout, "%" PRId64"|blocked content|%s\n", docId, url->getUrl());
continue;
}
}
}
startKey = *(key96_t *)list.getLastKey();
startKey++;
// watch out for wrap around
if ( startKey < *(key96_t *)list.getLastKey() ) {
return;
}
}
}
static void dumpWantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree) {
if(startFileNum!=0 && numFiles<0) {
//this may apply to all files, but I haven't checked into hash-based ones yet
fprintf(stderr,"If <startFileNum> is specified then <numFiles> must be too\n");
return;
}
const char *errmsg=NULL;
if (!UnicodeMaps::load_maps(unicode_data_dir,&errmsg)) {
log("Unicode initialization failed! %s", errmsg);
return;
}
if(!utf8_convert_initialize()) {
log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
return;
}
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." );
return;
}
g_titledb.init ();
g_titledb.getRdb()->addRdbBase1(coll);
key96_t startKey ;
key96_t endKey ;
key96_t lastKey ;
startKey.setMin();
endKey.setMax();
lastKey.setMin();
startKey = Titledb::makeFirstKey(0);
Msg5 msg5;
RdbList list;
HashTableX dedupTable;
dedupTable.set(4,0,10000,NULL,0,false,"maintitledb");
// make this
XmlDoc *xd;
try {
xd = new (XmlDoc);
}
catch(std::bad_alloc&) {
fprintf(stdout,"could not alloc for xmldoc\n");
exit(-1);
}
const CollectionRec *cr = g_collectiondb.getRec(coll);
if(cr==NULL) {
fprintf(stderr,"Unknown collection '%s'\n", coll);
return;
}
// initialize shlib & blacklist
if (!WantedChecker::initialize()) {
fprintf(stderr, "Unable to initialize WantedChecker");
return;
}
g_urlBlackList.init();
g_urlWhiteList.init();
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TITLEDB ,
cr->m_collnum ,
&list ,
&startKey ,
&endKey ,
commandLineDumpdbRecSize,
includeTree ,
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1 , // maxRetries
false)) // isRealMerge
{
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) {
return;
}
// loop over entries in list
for(list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord()) {
key96_t k = list.getCurrentKey();
char *rec = list.getCurrentRec();
int32_t recSize = list.getCurrentRecSize();
int64_t docId = Titledb::getDocIdFromKey(&k);
if ( k <= lastKey ) {
log("key out of order. lastKey.n1=%" PRIx32" n0=%" PRIx64" currKey.n1=%" PRIx32" n0=%" PRIx64" ",
lastKey.n1, lastKey.n0, k.n1, k.n0);
}
lastKey = k;
if ( (k.n0 & 0x01) == 0) {
// delete key
continue;
}
// free the mem
xd->reset();
// uncompress the title rec
if (!xd->set2(rec, recSize, coll, 0)) {
//set2() may have logged something but not the docid
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
continue;
}
// extract the url
Url *url = xd->getFirstUrl();
const char *reason = NULL;
if( ! isUrlUnwanted(*url, &reason)) {
Url **redirUrlPtr = xd->getRedirUrl();
if (redirUrlPtr && *redirUrlPtr) {
Url *redirUrl = *redirUrlPtr;
if (isUrlUnwanted(*redirUrl, &reason)) {
continue;
}
}
fprintf(stdout, "%" PRId64 "|%s\n", docId, url->getUrl());
}
}
startKey = *(key96_t *)list.getLastKey();
startKey++;
// watch out for wrap around
if ( startKey < *(key96_t *)list.getLastKey() ) {
return;
}
}
}
static void dumpAdultTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree) {
if(startFileNum!=0 && numFiles<0) {
//this may apply to all files, but I haven't checked into hash-based ones yet
fprintf(stderr,"If <startFileNum> is specified then <numFiles> must be too\n");
return;
}
const char *errmsg=NULL;
if (!UnicodeMaps::load_maps(unicode_data_dir,&errmsg)) {
log("Unicode initialization failed! %s", errmsg);
return;
}
if(!utf8_convert_initialize()) {
log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
return;
}
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." );
return;
}
g_titledb.init ();
g_titledb.getRdb()->addRdbBase1(coll);
key96_t startKey ;
key96_t endKey ;
key96_t lastKey ;
startKey.setMin();
endKey.setMax();
lastKey.setMin();
startKey = Titledb::makeFirstKey(0);
Msg5 msg5;
RdbList list;
// make this
XmlDoc *xd;
try {
xd = new (XmlDoc);
}
catch(std::bad_alloc&) {
fprintf(stdout,"could not alloc for xmldoc\n");
exit(-1);
}
const CollectionRec *cr = g_collectiondb.getRec(coll);
if(cr==NULL) {
fprintf(stderr,"Unknown collection '%s'\n", coll);
return;
}
// initialize shlib & blacklist
if (!WantedChecker::initialize()) {
fprintf(stderr, "Unable to initialize WantedChecker");
return;
}
g_urlBlackList.init();
g_urlWhiteList.init();
g_checkAdultList.init("adultwords.txt", "adultphrases.txt");
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TITLEDB ,
cr->m_collnum ,
&list ,
&startKey ,
&endKey ,
commandLineDumpdbRecSize,
includeTree ,
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1 , // maxRetries
false)) // isRealMerge
{
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) {
return;
}
// loop over entries in list
for(list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord()) {
key96_t k = list.getCurrentKey();
char *rec = list.getCurrentRec();
int32_t recSize = list.getCurrentRecSize();
int64_t docId = Titledb::getDocIdFromKey(&k);
if ( k <= lastKey ) {
log("key out of order. lastKey.n1=%" PRIx32" n0=%" PRIx64" currKey.n1=%" PRIx32" n0=%" PRIx64" ",
lastKey.n1, lastKey.n0, k.n1, k.n0);
}
lastKey = k;
if ( (k.n0 & 0x01) == 0) {
// delete key
continue;
}
// free the mem
xd->reset();
// uncompress the title rec
if (!xd->set2(rec, recSize, coll, 0)) {
//set2() may have logged something but not the docid
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
continue;
}
// extract the url
Url *url = xd->getFirstUrl();
if( url == (void *)-1 ) {
log(LOG_WARN, "dbdump: XmlDoc::getFirstUrl() failed for docid %" PRId64, docId);
continue;
}
const char *reason = NULL;
// Don't dump unwanted URLs
if( ! isUrlUnwanted(*url, &reason)) {
Url **redirUrlPtr = xd->getRedirUrl();
if (redirUrlPtr && *redirUrlPtr) {
Url *redirUrl = *redirUrlPtr;
if (isUrlUnwanted(*redirUrl, &reason)) {
continue;
}
}
// Get adult flag including generating debug info.
// Could just call xd->getIsAdult() to get the simple indicator
// without debug information.
CheckAdult achk(xd, true);
bool newblocked = achk.isDocAdult();
#if 0
// Sanity check.
bool gbadult = false;
char *adultbit = xd->getIsAdult();
if( adultbit ) {
gbadult = (*adultbit != 0);
if( gbadult != newblocked ) {
// Mismatch - should never happen
log(LOG_ERROR, "Adult check mismatch! docid=%" PRId64 ", url=%s, gbadult=%s, score=%" PRId32 ", newblock=%s",
docId, url->getUrl(), gbadult?"true":"false", achk.getScore(), newblocked?"true":"false");
gbshutdownLogicError();
}
}
#endif
if( newblocked ) {
time_t idxtim = (time_t)xd->getIndexedTime();
struct tm tm_buf;
tm *tm1 = gmtime_r(&idxtim,&tm_buf);
char idxtim_s[64];
strftime(idxtim_s,64,"%Y%m%d-%H%M%S",tm1);
fprintf(stdout, "%" PRId64 "\t%s\t%s\t%s\tscore=%" PRId32 "\tunique dw=%" PRId32 "\tunique dp=%" PRId32 "\twords=%" PRId32 "\t%s\t%s\n",
docId, url->getUrl(), idxtim_s, achk.getReason(),
achk.getScore(), achk.getNumUniqueMatchedWords(), achk.getNumUniqueMatchedPhrases(),
achk.getNumWordsChecked(), achk.hasEmptyDocumentBody()?"EMPTYDOC":"HASCONTENT", achk.getDebugInfo());
}
}
}
startKey = *(key96_t *)list.getLastKey();
startKey++;
// watch out for wrap around
if ( startKey < *(key96_t *)list.getLastKey() ) {
return;
}
}
}
static void dumpSpamTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree) {
if(startFileNum!=0 && numFiles<0) {
//this may apply to all files, but I haven't checked into hash-based ones yet
fprintf(stderr,"If <startFileNum> is specified then <numFiles> must be too\n");
return;
}
const char *errmsg=NULL;
if (!UnicodeMaps::load_maps(unicode_data_dir,&errmsg)) {
log("Unicode initialization failed! %s", errmsg);
return;
}
if(!utf8_convert_initialize()) {
log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
return;
}
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." );
return;
}
g_titledb.init ();
g_titledb.getRdb()->addRdbBase1(coll);
key96_t startKey ;
key96_t endKey ;
key96_t lastKey ;
startKey.setMin();
endKey.setMax();
lastKey.setMin();
startKey = Titledb::makeFirstKey(0);
Msg5 msg5;
RdbList list;
// make this
XmlDoc *xd;
try {
xd = new (XmlDoc);
}
catch(std::bad_alloc&) {
fprintf(stdout,"could not alloc for xmldoc\n");
exit(-1);
}
const CollectionRec *cr = g_collectiondb.getRec(coll);
if(cr==NULL) {
fprintf(stderr,"Unknown collection '%s'\n", coll);
return;
}
// initialize shlib & blacklist
if (!WantedChecker::initialize()) {
fprintf(stderr, "Unable to initialize WantedChecker");
return;
}
g_urlBlackList.init();
g_urlWhiteList.init();
g_checkSpamList.init("spamphrases.txt");
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TITLEDB ,
cr->m_collnum ,
&list ,
&startKey ,
&endKey ,
commandLineDumpdbRecSize,
includeTree ,
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1 , // maxRetries
false)) // isRealMerge
{
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) {
return;
}
// loop over entries in list
for(list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord()) {
key96_t k = list.getCurrentKey();
char *rec = list.getCurrentRec();
int32_t recSize = list.getCurrentRecSize();
int64_t docId = Titledb::getDocIdFromKey(&k);
if ( k <= lastKey ) {
log("key out of order. lastKey.n1=%" PRIx32" n0=%" PRIx64" currKey.n1=%" PRIx32" n0=%" PRIx64" ",
lastKey.n1, lastKey.n0, k.n1, k.n0);
}
lastKey = k;
if ( (k.n0 & 0x01) == 0) {
// delete key
continue;
}
// free the mem
xd->reset();
// uncompress the title rec
if (!xd->set2(rec, recSize, coll, 0)) {
//set2() may have logged something but not the docid
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
continue;
}
// extract the url
Url *url = xd->getFirstUrl();
if( url == (void *)-1 ) {
log(LOG_WARN, "dbdump: XmlDoc::getFirstUrl() failed for docid %" PRId64, docId);
continue;
}
const char *reason = NULL;
// Don't dump unwanted URLs
if( ! isUrlUnwanted(*url, &reason)) {
Url **redirUrlPtr = xd->getRedirUrl();
if (redirUrlPtr && *redirUrlPtr) {
Url *redirUrl = *redirUrlPtr;
if (isUrlUnwanted(*redirUrl, &reason)) {
continue;
}
}
// Get spam flag including generating debug info.
// Construct CheckSpam with the debug flag false to get just the
// simple indicator without debug information.
CheckSpam schk(xd, true);
bool newblocked = schk.isDocSpam();
if( newblocked ) {
time_t idxtim = (time_t)xd->getIndexedTime();
struct tm tm_buf;
tm *tm1 = gmtime_r(&idxtim,&tm_buf);
char idxtim_s[64];
strftime(idxtim_s,64,"%Y%m%d-%H%M%S",tm1);
fprintf(stdout, "%" PRId64 "\t%s\t%s\t%s\tscore=%" PRId32 "\tunique dw=%" PRId32 "\tunique dp=%" PRId32 "\twords=%" PRId32 "\t%s\t%s\n",
docId, url->getUrl(), idxtim_s, schk.getReason(),
schk.getScore(), schk.getNumUniqueMatchedWords(), schk.getNumUniqueMatchedPhrases(),
schk.getNumWordsChecked(), schk.hasEmptyDocumentBody()?"EMPTYDOC":"HASCONTENT", schk.getDebugInfo());
}
}
}
startKey = *(key96_t *)list.getLastKey();
startKey++;
// watch out for wrap around
if ( startKey < *(key96_t *)list.getLastKey() ) {
return;
}
}
}
static bool parseTest(const char *coll, int64_t docId, const char *query) {
g_conf.m_maxMem = 2000000000LL; // 2G
g_titledb.init ();
g_titledb.getRdb()->addRdbBase1 ( coll );
log(LOG_INIT, "build: Testing parse speed of html docId %" PRId64".",docId);
RdbList tlist;
key96_t startKey = Titledb::makeFirstKey ( docId );
key96_t endKey = Titledb::makeLastKey ( docId );
// a niceness of 0 tells it to block until it gets results!!
Msg5 msg5;
const CollectionRec *cr = g_collectiondb.getRec(coll);
if ( ! msg5.getList ( RDB_TITLEDB ,
cr->m_collnum ,
&tlist ,
(char *)&startKey ,
(char *)&endKey , // should be maxed!
9999999 , // min rec sizes
true , // include tree?
0 , // startFileNum
-1 , // m_numFiles
NULL , // state
NULL , // callback
0 , // niceness
false , // do error correction?
-1 , // maxRetries
false)) { // isRealMerge
log(LOG_LOGIC, "build: getList did not block.");
return false;
}
// get the title rec
if ( tlist.isEmpty() ) {
log(LOG_WARN, "build: speedtestxml: docId %" PRId64" not found.", docId);
return false;
}
const char *errmsg=NULL;
if (!UnicodeMaps::load_maps(unicode_data_dir,&errmsg)) {
log("Unicode initialization failed! %s", errmsg);
return false;
}
if(!utf8_convert_initialize()) {
log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
return false;
}
// get raw rec from list
char *rec = tlist.getCurrentRec();
int32_t listSize = tlist.getListSize ();
XmlDoc xd;
if (!xd.set2(rec, listSize, coll, 0)) {
log(LOG_WARN, "build: speedtestxml: Error setting xml doc.");
return false;
}
log("build: Doc url is %s",xd.ptr_firstUrl);//tr.getUrl()->getUrl());
log("build: Doc is %" PRId32" bytes long.",xd.size_utf8Content-1);
log("build: Doc charset is %s",get_charset_str(xd.m_charset));
// time the summary/title generation code
log("build: Using query %s",query);
summaryTest1 ( rec , listSize , coll , docId , query );
// for a 128k latin1 doc: (access time is probably 15-20ms)
// 1.18 ms to set title rec (6ms total)
// 1.58 ms to set Xml
// 1.71 ms to set Words (~50% from Words::countWords())
// 0.42 ms to set Pos
// 0.66 ms to set Bits
// 0.51 ms to set Scores
// 0.35 ms to getText()
// speed test
int64_t t = gettimeofdayInMilliseconds();
for ( int32_t k = 0 ; k < 100 ; k++ )
xd.set2(rec, listSize, coll, 0);
int64_t e = gettimeofdayInMilliseconds();
logf(LOG_DEBUG,"build: Took %.3f ms to set title rec.",
(float)(e-t)/100.0);
// speed test
t = gettimeofdayInMilliseconds();
for ( int32_t k = 0 ; k < 100 ; k++ ) {
char *mm = (char *)mmalloc ( 300*1024 , "ztest");
mfree ( mm , 300*1024 ,"ztest");
}
e = gettimeofdayInMilliseconds();
logf(LOG_DEBUG,"build: Took %.3f ms to do mallocs.",
(float)(e-t)/100.0);
// get content
char *content = xd.ptr_utf8Content;//tr.getContent();
int32_t contentLen = xd.size_utf8Content-1;//tr.getContentLen();
// loop parse
Xml xml;
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ ) {
if ( !xml.set( content, contentLen, xd.m_version, CT_HTML ) ) {
log(LOG_WARN, "build: speedtestxml: xml set: %s", mstrerror(g_errno));
return false;
}
}
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Xml::set() took %.3f ms to parse docId %" PRId64".",
(double)(e - t)/100.0,docId);
double bpms = contentLen/((double)(e-t)/100.0);
log("build: %.3f bytes/msec", bpms);
// get per char and per byte speeds
xml.reset();
// loop parse
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ ) {
if ( !xml.set( content, contentLen, xd.m_version, CT_HTML ) ) {
log(LOG_WARN, "build: xml(setparents=false): %s", mstrerror(g_errno));
return false;
}
}
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Xml::set(setparents=false) took %.3f ms to "
"parse docId %" PRId64".", (double)(e - t)/100.0,docId);
TokenizerResult tr;
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ ) {
xml_tokenizer_phase_1(&xml,&tr);
tr.clear();
}
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Words::set(xml) took %.3f ms for %zu words"
" for docId %" PRId64".",
(double)(e - t)/100.0,tr.size(),docId);
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ ) {
plain_tokenizer_phase_1(content,contentLen,&tr);
tr.clear();
}
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Words::set(content,computeIds=true) "
"took %.3f ms for %zu words "
"for docId %" PRId64".",
(double)(e - t)/100.0,tr.size(),docId);
Pos pos;
// computeWordIds from xml
tr.clear();
xml_tokenizer_phase_1(&xml,&tr);
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ )
if ( ! pos.set ( &tr ) ) {
log(LOG_WARN, "build: speedtestxml: pos set: %s", mstrerror(g_errno));
return false;
}
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Pos::set() "
"took %.3f ms for %zu words "
"for docId %" PRId64".",
(double)(e - t)/100.0,tr.size(),docId);
Bits bits;
// computeWordIds from xml
tr.clear();
xml_tokenizer_phase_1(&xml,&tr);
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ )
if ( ! bits.setForSummary ( &tr ) ) {
log(LOG_WARN, "build: speedtestxml: Bits set: %s", mstrerror(g_errno));
return false;
}
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Bits::setForSummary() "
"took %.3f ms for %zu words "
"for docId %" PRId64".",
(double)(e - t)/100.0,tr.size(),docId);
Sections sections;
// computeWordIds from xml
tr.clear();
xml_tokenizer_phase_1(&xml,&tr);
bits.set(&tr);
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ )
// do not supply xd so it will be set from scratch
if ( !sections.set( &tr, &bits, NULL, 0 ) ) {
log(LOG_WARN, "build: speedtestxml: sections set: %s", mstrerror(g_errno));
return false;
}
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Scores::set() "
"took %.3f ms for %zu words "
"for docId %" PRId64".",
(double)(e - t)/100.0,tr.size(),docId);
//Phrases phrases;
Phrases phrases;
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ )
if ( !phrases.set( tr, bits ) ) {
log(LOG_WARN, "build: speedtestxml: Phrases set: %s", mstrerror(g_errno));
return false;
}
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Phrases::set() "
"took %.3f ms for %zu words "
"for docId %" PRId64".",
(double)(e - t)/100.0,tr.size(),docId);
char *buf = (char *)mmalloc(contentLen*2+1,"main");
if ( ! buf ) {
log(LOG_WARN, "build: speedtestxml: malloc failed");
return false;
}
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ )
if ( !xml.getText( buf, contentLen * 2 + 1, 0, 9999999, true ) ) {
log(LOG_WARN, "build: speedtestxml: getText: %s", mstrerror(g_errno));
return false;
}
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Xml::getText(computeIds=false) took %.3f ms for docId "
"%" PRId64".",(double)(e - t)/100.0,docId);
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ ) {
int32_t bufLen = xml.getText( buf, contentLen * 2 + 1, 0, 9999999, true );
if ( ! bufLen ) {
log(LOG_WARN, "build: speedtestxml: getText: %s", mstrerror(g_errno));
return false;
}
plain_tokenizer_phase_1(buf,bufLen,&tr);
tr.clear();
}
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Xml::getText(computeIds=false) w/ word::set() "
"took %.3f ms for docId "
"%" PRId64".",(double)(e - t)/100.0,docId);
Matches matches;
Query q;
q.set(query, langUnknown, 1.0, 1.0, NULL, false, true, ABS_MAX_QUERY_TERMS);
matches.setQuery ( &q );
tr.clear();
xml_tokenizer_phase_1(&xml,&tr);
t = gettimeofdayInMilliseconds();
for ( int32_t i = 0 ; i < 100 ; i++ ) {
matches.reset();
if ( ! matches.addMatches ( &tr ) ) {
log(LOG_WARN, "build: speedtestxml: matches set: %s", mstrerror(g_errno));
return false;
}
}
// print time it took
e = gettimeofdayInMilliseconds();
log("build: Matches::set() took %.3f ms for %zu words"
" for docId %" PRId64".",
(double)(e - t)/100.0,tr.size(),docId);
return true;
}
static bool summaryTest1(char *rec, int32_t listSize, const char *coll, int64_t docId, const char *query) {
// start the timer
int64_t t = gettimeofdayInMilliseconds();
Query q;
q.set(query, langUnknown, 1.0, 1.0, NULL, false, true, ABS_MAX_QUERY_TERMS);
char *content ;
int32_t contentLen ;
// loop parse
for ( int32_t i = 0 ; i < 100 ; i++ ) {
XmlDoc xd;
if (!xd.set2(rec, listSize, coll, 0)) {
log(LOG_ERROR,"%s:%s: XmlDoc.set2 failed", __FILE__, __func__);
return false;
}
// get content
content = xd.ptr_utf8Content;//tr.getContent();
contentLen = xd.size_utf8Content-1;//tr.getContentLen();
// now parse into xhtml (takes 15ms on lenny)
Xml xml;
xml.set( content, contentLen, xd.m_version, CT_HTML );
xd.getSummary();
}
// print time it took
int64_t e = gettimeofdayInMilliseconds();
log("build: V1 Summary/Title/Gigabits generation took %.3f ms for docId "
"%" PRId64".",
(double)(e - t)/100.0,docId);
double bpms = contentLen/((double)(e-t)/100.0);
log("build: %.3f bytes/msec", bpms);
return true;
}
void dumpPosdb (const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, int64_t termId , bool justVerify ) {
if ( ! justVerify ) {
g_posdb.init ();
g_posdb.getRdb()->addRdbBase1(coll );
}
key144_t startKey ;
key144_t endKey ;
startKey.setMin();
endKey.setMax();
if ( termId >= 0 ) {
Posdb::makeStartKey ( &startKey, termId );
Posdb::makeEndKey ( &endKey, termId );
printf("termid=%" PRIu64"\n", (uint64_t)termId);
printf("startkey=%s\n",KEYSTR(&startKey,sizeof(posdbkey_t)));
printf("endkey=%s\n",KEYSTR(&endKey,sizeof(posdbkey_t)));
}
// bail if not
if ( g_posdb.getRdb()->getNumFiles() <= startFileNum && numFiles > 0 ) {
printf("Request file #%" PRId32" but there are only %" PRId32" "
"posdb files\n",startFileNum,
g_posdb.getRdb()->getNumFiles());
return;
}
key144_t lastKey;
lastKey.setMin();
Msg5 msg5;
RdbList list;
// set this flag so Msg5.cpp if it does error correction does not
// try to get the list from a twin...
g_isDumpingRdbFromMain = true;
const CollectionRec *cr = g_collectiondb.getRec(coll);
for (;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if (!msg5.getList(RDB_POSDB,
cr->m_collnum,
&list,
&startKey,
&endKey,
commandLineDumpdbRecSize,
includeTree,
startFileNum,
numFiles,
NULL, // state
NULL, // callback
0, // niceness
true, // to debug RdbList::removeBadData_r()
-1, // maxRetries
false)) // isRealMerge
{
log(LOG_LOGIC, "db: getList did not block.");
return;
}
// all done if empty
if (list.isEmpty()) return;
// get last key in list
const char *ek2 = list.getEndKey();
// print it
printf("ek=%s\n", KEYSTR(ek2, list.getKeySize()));
// loop over entries in list
for (list.resetListPtr(); !list.isExhausted() && !justVerify; list.skipCurrentRecord()) {
key144_t k;
list.getCurrentKey(&k);
// compare to last
const char *err = "";
if (KEYCMP((char *)&k, (char *)&lastKey, sizeof(key144_t)) < 0)
err = " (out of order)";
lastKey = k;
// is it a delete?
const char *dd = "";
if ((k.n0 & 0x01) == 0x00) dd = " (delete)";
int64_t d = Posdb::getDocId(&k);
uint8_t dh = Docid::getDomHash8FromDocId(d);
char *rec = list.getCurrentRec();
int32_t recSize = 18;
if (rec[0] & 0x04) recSize = 6;
else if (rec[0] & 0x02) recSize = 12;
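// posdb keys are compressed: 0x04 in the first byte marks a 6-byte
// key, 0x02 a 12-byte key (the shorter forms inherit their missing
// leading bytes from the previous key); otherwise the full 18 bytes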
// alignment bits check
if (recSize == 6 && !(rec[1] & 0x02)) {
int64_t nd1 = g_posdb.getDocId(rec + 6);
err = " (alignerror1)";
if (nd1 < d) err = " (alignordererror1)";
//g_process.shutdownAbort(true);
}
if (recSize == 12 && !(rec[1] & 0x02)) {
// seems like nd2 is it, so it really is 12 bytes but
// does not have the alignment bit set...
int64_t nd2 = g_posdb.getDocId(rec + 12);
err = " (alignerror2)";
if (nd2 < d) err = " (alignorderrror2)";
}
// 12-byte key whose second 6-byte half unexpectedly has the
// alignment bit set
if (recSize == 12 && (rec[7] & 0x02)) {
int64_t nd2 = g_posdb.getDocId(rec + 12);
err = " (alignerror3)";
if (nd2 < d) err = " (alignordererror3)";
}
if (KEYCMP((char *)&k, (char *)&startKey, list.getKeySize()) < 0 ||
KEYCMP((char *)&k, ek2, list.getKeySize()) > 0) {
err = " (out of range)";
}
printf("k=%s"
" tid=%015" PRIu64
" docId=%012" PRId64
" siterank=%02" PRId32
" langid=%02" PRId32
" pos=%06" PRId32
" hgrp=%02" PRId32
" spamrank=%02" PRId32
" divrank=%02" PRId32
" syn=%01" PRId32
" densrank=%02" PRId32
" mult=%02" PRId32
" dh=0x%02" PRIx32
" rs=%" PRId32 //recSize
"%s" // dd
"%s" // err
"\n",
KEYSTR(&k, sizeof(key144_t)),
(uint64_t)Posdb::getTermId(&k),
d,
(int32_t)Posdb::getSiteRank(&k),
(int32_t)Posdb::getLangId(&k),
(int32_t)Posdb::getWordPos(&k),
(int32_t)Posdb::getHashGroup(&k),
(int32_t)Posdb::getWordSpamRank(&k),
(int32_t)Posdb::getDiversityRank(&k),
(int32_t)Posdb::getIsSynonym(&k),
(int32_t)Posdb::getDensityRank(&k),
(int32_t)Posdb::getMultiplier(&k),
(int32_t)dh,
recSize,
dd,
err);
continue;
}
startKey = *(key144_t *)list.getLastKey();
startKey++;
// watch out for wrap around
if (startKey < *(key144_t *)list.getLastKey()) return;
}
}
static void dumpClusterdb(const char *coll,
int32_t startFileNum,
int32_t numFiles,
bool includeTree) {
g_clusterdb.init ();
g_clusterdb.getRdb()->addRdbBase1(coll );
key96_t startKey ;
key96_t endKey ;
startKey.setMin();
endKey.setMax();
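	// each clusterdb key packs the docId, an adult-content ("family")
	// flag, the document language and a 26-bit site hash; the loop below
	// decodes and prints those fields for every record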
	// bail if the requested start file is beyond what exists
if ( g_clusterdb.getRdb()->getNumFiles() <= startFileNum ) {
printf("Request file #%" PRId32" but there are only %" PRId32" "
"clusterdb files\n",startFileNum,
g_clusterdb.getRdb()->getNumFiles());
return;
}
Msg5 msg5;
RdbList list;
const CollectionRec *cr = g_collectiondb.getRec(coll);
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_CLUSTERDB ,
cr->m_collnum ,
&list ,
&startKey ,
&endKey ,
commandLineDumpdbRecSize,
includeTree ,
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1, // maxRetries
false)) // isRealMerge
{
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() )
return;
// loop over entries in list
char strLanguage[256];
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key96_t k = list.getCurrentKey();
// is it a delete?
const char *dd = "";
if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)";
// get the language string
languageToString ( Clusterdb::getLanguage(&k),
strLanguage );
//uint32_t gid = getGroupId ( RDB_CLUSTERDB , &k );
uint32_t shardNum = getShardNum( RDB_CLUSTERDB , &k );
Host *grp = g_hostdb.getShard ( shardNum );
Host *hh = &grp[0];
// print it
			printf("k.n1=%08" PRIx32" k.n0=%016" PRIx64" "
			       "docId=%012" PRId64" family=%" PRIu32" "
			       "language=%" PRId32" (%s) siteHash26=%" PRIu32"%s "
			       "firstHostId=%" PRIu32" "
			       "shardNum=%" PRIu32"\n",
k.n1, k.n0,
Clusterdb::getDocId(&k) ,
Clusterdb::hasAdultContent(&k) ,
(int32_t)Clusterdb::getLanguage(&k),
strLanguage,
Clusterdb::getSiteHash26(&k) ,
dd ,
(uint32_t)hh->m_hostId ,
shardNum);
continue;
}
startKey = *(key96_t *)list.getLastKey();
startKey++;
// watch out for wrap around
if ( startKey < *(key96_t *)list.getLastKey() )
return;
}
}
static void dumpLinkdb(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, const char *url, bool urlhash) {
g_linkdb.init ();
g_linkdb.getRdb()->addRdbBase1(coll );
key224_t startKey ;
key224_t endKey ;
startKey.setMin();
endKey.setMax();
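	// linkdb keys sort by linkee site hash, then linkee url hash, so all
	// inlinks to one site (or one url) form a contiguous key range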
// set start/end key to url hash
if ( url ) {
Url u;
u.set( url, strlen( url ), false, false );
SiteGetter sg;
sg.getSite(url, NULL, 0, 0);
uint32_t h32 = hash32(sg.getSite(), sg.getSiteLen(), 0);
if( urlhash ) {
startKey = Linkdb::makeStartKey_uk(h32, u.getUrlHash64());
endKey = Linkdb::makeEndKey_uk (h32, u.getUrlHash64());
}
else {
startKey = Linkdb::makeStartKey_uk(h32, 0);
endKey = Linkdb::makeEndKey_uk (h32, LDB_MAXURLHASH);
}
printf("URL=%.*s, sitehash32=0x%08" PRIx32 ", urlhash=0x%012" PRIx64 "\n",
u.getUrlLen(), u.getUrl(), h32, u.getUrlHash64());
printf("Startkey=%s\n", KEYSTR(&startKey,sizeof(key224_t)));
printf("Endkey =%s\n", KEYSTR(&endKey,sizeof(key224_t)));
}
	// bail if the requested start file is beyond what exists
if ( g_linkdb.getRdb()->getNumFiles() <= startFileNum && !includeTree) {
printf("Request file #%" PRId32" but there are only %" PRId32" "
"linkdb files\n",startFileNum,
g_linkdb.getRdb()->getNumFiles());
return;
}
Msg5 msg5;
RdbList list;
const CollectionRec *cr = g_collectiondb.getRec(coll);
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_LINKDB ,
cr->m_collnum ,
&list ,
(char *)&startKey ,
(char *)&endKey ,
commandLineDumpdbRecSize,
includeTree ,
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1, // maxRetries
false)) // isRealMerge
{
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) return;
// loop over entries in list
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key224_t k;
list.getCurrentKey((char *) &k);
// is it a delete?
const char *dd = "";
if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)";
uint64_t docId = (uint64_t)Linkdb::getLinkerDocId_uk(&k);
int32_t shardNum = getShardNum(RDB_LINKDB,&k);
char ipbuf[16];
printf("k=%s "
"linkeesitehash32=0x%08" PRIx32" "
"linkeeurlhash=0x%012" PRIx64" "
"linkspam=%" PRId32" "
"siterank=%02" PRId32" "
"ip32=%s "
"docId=%" PRIu64" "
"discovered=%" PRIu32" "
"lost=%" PRIu32" "
"sitehash32=0x%08" PRIx32" "
"shardNum=%" PRIu32" "
"%s\n",
KEYSTR(&k,sizeof(key224_t)),
(int32_t)Linkdb::getLinkeeSiteHash32_uk(&k),
(int64_t)Linkdb::getLinkeeUrlHash64_uk(&k),
(int32_t)Linkdb::isLinkSpam_uk(&k),
(int32_t)Linkdb::getLinkerSiteRank_uk(&k),
iptoa((int32_t)Linkdb::getLinkerIp_uk(&k),ipbuf),
docId,
(uint32_t)Linkdb::getDiscoveryDate_uk(&k),
(uint32_t)Linkdb::getLostDate_uk(&k),
(int32_t)Linkdb::getLinkerSiteHash32_uk(&k),
(uint32_t)shardNum,
dd );
}
startKey = *(key224_t *)list.getLastKey();
startKey++;
// watch out for wrap around
if ( startKey < *(key224_t *)list.getLastKey() ) return;
}
}
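// Command-line cache self-test: hammer an RdbCache with random records and
// verify that recently-added entries read back with the right size and bytes.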
static bool cacheTest() {
g_conf.m_maxMem = 2000000000LL; // 2G
//g_mem.m_maxMem = 2000000000LL; // 2G
	if ( ! hashinit() ) {
		log( LOG_ERROR, "db: Failed to init hashtable." );
		return false;
	}
// use an rdb cache
RdbCache c;
// init, 50MB
int32_t maxMem = 50000000;
	// . how many nodes can the cache tree fit?
	// . each rec is a key (12 bytes) plus an ip (4 bytes)
	// . the nominal per-node overhead in the cache is 56 bytes, so a
	//   node should cost about 56 + 4 = 60 bytes
	// . in practice stats suggest under 25 bytes per node, so size the
	//   node count off of that
	int32_t maxCacheNodes = maxMem / 25;
// set the cache
if ( ! c.init ( maxMem ,
4 , // fixed data size of rec
maxCacheNodes ,
"cachetest" , // dbname
false, // save cache to disk?
12, // cachekeysize
-1)) { // numptrsmax
log(LOG_WARN, "test: Cache init failed.");
return false;
}
#if 0
int32_t numRecs = 0 * maxCacheNodes;
logf(LOG_DEBUG,"test: Adding %" PRId32" recs to cache.",numRecs);
// timestamp
int32_t timestamp = 42;
// keep ring buffer of last 10 keys
key96_t oldk[10];
int32_t oldip[10];
int32_t b = 0;
// fill with random recs
for ( int32_t i = 0 ; i < numRecs ; i++ ) {
if ( (i % 100000) == 0 )
logf(LOG_DEBUG,"test: Added %" PRId32" recs to cache.",i);
// random key
key96_t k ;
k.n1 = rand();
k.n0 = rand();
k.n0 <<= 32;
k.n0 |= rand();
int32_t ip = rand();
// keep ring buffer
oldk [b] = k;
oldip[b] = ip;
if ( ++b >= 10 ) b = 0;
// make rec,size, like dns, will be 4 byte hash and 4 byte key?
c.addRecord((collnum_t)0,k,(char *)&ip,4,timestamp);
// reset g_errno in case it had an error (we don't care)
g_errno = 0;
// get a rec too!
if ( i < 10 ) continue;
int32_t next = b + 1;
if ( next >= 10 ) next = 0;
key96_t back = oldk[next];
char *rec;
int32_t recSize;
if ( ! c.getRecord ( (collnum_t)0 ,
back ,
&rec ,
&recSize ,
false , // do copy?
-1 , // maxAge ,
true , // inc count?
NULL , // *cachedTime = NULL,
true )){ // promoteRecord?
g_process.shutdownAbort(true); }
if ( ! rec || recSize != 4 || *(int32_t *)rec != oldip[next] ) {
g_process.shutdownAbort(true); }
}
#endif
// now try variable sized recs
c.reset();
logf(LOG_DEBUG,"test: Testing variably-sized recs.");
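	// strategy: each rec's payload is recSize copies of (char)k.n1, and a
	// 10-slot ring buffer remembers recently added keys so every iteration
	// can read one back and verify both its size and its contents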
// init, 300MB
maxMem = 300000000;
	// . how many nodes can the cache tree fit?
	// . recs are variable-sized in this phase, so assume an average of
	//   roughly 5000 bytes per rec when sizing the node count
	maxCacheNodes = maxMem / 5000;
//maxCacheNodes = 1200;
// set the cache
if ( ! c.init ( maxMem ,
-1 , // fixed data size of rec
maxCacheNodes ,
"cachetest" , // dbname
			false, // save cache to disk?
			12, // cachekeysize
			-1)) { // numptrsmax
log(LOG_WARN, "test: Cache init failed.");
return false;
}
int32_t numRecs = 30 * maxCacheNodes;
logf(LOG_DEBUG,"test: Adding %" PRId32" recs to cache.",numRecs);
key96_t oldk[10];
// timestamp
int32_t timestamp = 42;
// keep ring buffer of last 10 keys
int32_t oldrs[10];
int32_t b = 0;
// rec to add
char *rec;
int32_t recSize;
int32_t maxRecSize = 40000000; // 40MB for termlists
int32_t numMisses = 0;
char *buf = (char *)mmalloc ( maxRecSize + 64 ,"cachetest" );
if ( ! buf ) return false;
// fill with random recs
for ( int32_t i = 0 ; i < numRecs ; i++ ) {
if ( (i % 100) == 0 )
logf(LOG_DEBUG,"test: Added %" PRId32" recs to cache. "
"Misses=%" PRId32".",i,numMisses);
// random key
key96_t k ;
k.n1 = rand();
k.n0 = rand();
k.n0 <<= 32;
k.n0 |= rand();
// random size
recSize = rand()%maxRecSize;//100000;
// keep ring buffer
oldk [b] = k;
oldrs[b] = recSize;
if ( ++b >= 10 ) b = 0;
// make the rec
rec = buf;
memset ( rec , (char)k.n1, recSize );
// make rec,size, like dns, will be 4 byte hash and 4 byte key?
if ( ! c.addRecord((collnum_t)0,k,rec,recSize,timestamp) ) {
g_process.shutdownAbort(true); }
// do a dup add 1% of the time
if ( (i % 100) == 0 )
if(!c.addRecord((collnum_t)0,k,rec,recSize,timestamp)){
g_process.shutdownAbort(true); }
// reset g_errno in case it had an error (we don't care)
g_errno = 0;
// get a rec too!
if ( i < 10 ) continue;
int32_t next = b + 1;
if ( next >= 10 ) next = 0;
key96_t back = oldk[next];
//log("cache: get rec");
if ( ! c.getRecord ( (collnum_t)0 ,
back ,
&rec ,
&recSize ,
false , // do copy?
-1 , // maxAge ,
true , // inc count?
NULL , // *cachedTime = NULL,
true) ) {//true )){ // promoteRecord?
numMisses++;
continue;
}
if ( recSize != oldrs[next] ) {
logf(LOG_DEBUG,"test: bad rec size.");
g_process.shutdownAbort(true);
}
char r = (char)back.n1;
for ( int32_t j = 0 ; j < recSize ; j++ ) {
if ( rec[j] == r ) continue;
logf(LOG_DEBUG,"test: bad char in rec.");
g_process.shutdownAbort(true);
}
}
c.verify();
c.reset();
return true;
}
// CountDomains Structures and function definitions
struct dom_info {
	char *dom; // domain string (not NUL-terminated; see domLen)
	int32_t domLen;
	int32_t dHash; // hash32 of the domain
	int32_t pages; // titlerecs seen for this domain
	struct ip_info **ip_list; // distinct IPs this domain was served from
	int32_t numIp;
	int32_t *lnk_table; // hashes of domains this domain links out to
	int32_t tableSize; // allocated slots in lnk_table
	int32_t lnkCnt; // unique outlinked domains
	int32_t lnkPages; // total outlinks seen
};
struct ip_info {
	uint32_t ip;
	int32_t pages; // titlerecs seen for this IP
	struct dom_info **dom_list; // distinct domains hosted on this IP
	int32_t numDom;
};
static int ip_fcmp (const void *p1, const void *p2);
static int ip_dcmp (const void *p1, const void *p2);
static int dom_fcmp (const void *p1, const void *p2);
static int dom_lcmp (const void *p1, const void *p2);
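// Scan up to numRecs titledb records for the given collection, build
// cross-linked per-domain and per-IP tables, then write the tallies out as
// raw XML (output mode 9) and as an HTML report in the working directory.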
static void countdomains(const char* coll, int32_t numRecs, int32_t output) {
struct ip_info **ip_table;
struct dom_info **dom_table;
const CollectionRec *cr = g_collectiondb.getRec(coll);
key96_t startKey;
key96_t endKey ;
key96_t lastKey ;
startKey.setMin();
endKey.setMax();
lastKey.setMin();
g_titledb.init ();
g_titledb.getRdb()->addRdbBase1(coll );
	log( LOG_INFO, "countdomains: parms: coll=%s, numrecs=%" PRId32, coll, numRecs );
int64_t time_start = gettimeofdayInMilliseconds();
Msg5 msg5;
RdbList list;
int32_t countDocs = 0;
int32_t countIp = 0;
int32_t countDom = 0;
int32_t attempts = 0;
ip_table = (struct ip_info **)mmalloc(sizeof(struct ip_info *) * numRecs,
"main-dcit" );
dom_table = (struct dom_info **)mmalloc(sizeof(struct dom_info *) * numRecs,
"main-dcdt" );
for( int32_t i = 0; i < numRecs; i++ ) {
ip_table[i] = NULL;
dom_table[i] = NULL;
}
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TITLEDB ,
cr->m_collnum ,
&list ,
&startKey ,
&endKey ,
commandLineDumpdbRecSize,
true , // Do we need to include tree?
0 ,
-1 ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1 , // maxRetries
false)) // isRealMerge
{
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) goto freeInfo;
// loop over entries in list
for ( list.resetListPtr(); ! list.isExhausted(); list.skipCurrentRecord() ) {
key96_t k = list.getCurrentKey();
char *rec = list.getCurrentRec();
int32_t recSize = list.getCurrentRecSize();
int64_t docId = Titledb::getDocId ( &k );
attempts++;
if ( k <= lastKey )
log("key out of order. "
"lastKey.n1=%" PRIx32" n0=%" PRIx64" "
"currKey.n1=%" PRIx32" n0=%" PRIx64" ",
lastKey.n1,lastKey.n0,
k.n1,k.n0);
lastKey = k;
// print deletes
if ( (k.n0 & 0x01) == 0) {
fprintf(stderr,"n1=%08" PRIx32" n0=%016" PRIx64" docId=%012" PRId64" "
"(del)\n",
k.n1 , k.n0 , docId );
continue;
}
if( (countIp >= numRecs) || (countDom >= numRecs) ) {
			log( LOG_INFO, "countdomains: countIp | countDom greater than "
			     "numRecs requested; should never happen!" );
goto freeInfo;
}
XmlDoc xd;
if (!xd.set2(rec, recSize, coll, 0))
continue;
struct ip_info *sipi ;
struct dom_info *sdomi;
int32_t i;
for( i = 0; i < countIp; i++ ) {
if( !ip_table[i] ) continue;
sipi = (struct ip_info *)ip_table[i];
if( sipi->ip == (uint32_t)xd.m_ip ) break;
}
if( i == countIp ) {
sipi = (struct ip_info *)mmalloc(sizeof(struct ip_info),
"main-dcip" );
if( !sipi ) { g_process.shutdownAbort(true); }
			ip_table[countIp++] = sipi;
			sipi->ip = xd.m_ip;
			sipi->pages = 1;
			sipi->numDom = 0;
			sipi->dom_list = NULL; // grown on demand below
} else {
sipi->pages++;
}
char *fu = xd.ptr_firstUrl;
int32_t dlen;
const char *dom = getDomFast ( fu , &dlen );
int32_t dkey = hash32( dom , dlen );
for( i = 0; i < countDom; i++ ) {
if( !dom_table[i] ) continue;
sdomi = (struct dom_info *)dom_table[i];
if( sdomi->dHash == dkey ) break;
}
if( i == countDom ) {
sdomi =(struct dom_info*)mmalloc(sizeof(struct dom_info),
"main-dcdm" );
if( !sdomi ) { g_process.shutdownAbort(true); }
			dom_table[countDom++] = sdomi;
			sdomi->dom = (char *)mmalloc( dlen,"main-dcsdm" );
			strncpy( sdomi->dom, dom , dlen );
			sdomi->domLen = dlen;
			sdomi->dHash = dkey;
			sdomi->pages = 1;
			sdomi->numIp = 0;
			sdomi->ip_list = NULL; // grown on demand below
			sdomi->tableSize = 0;
			sdomi->lnk_table = NULL; // allocated below
			sdomi->lnkCnt = 0;
			sdomi->lnkPages = 0; // mmalloc does not zero memory
}
else {
sdomi->pages++;
}
Links *dlinks = xd.getLinks();
int32_t size = dlinks->getNumLinks();
if( !sdomi->tableSize ) {
sdomi->lnk_table = (int32_t *)mmalloc(size * sizeof(int32_t),
"main-dclt" );
sdomi->tableSize = size;
}
else {
if( size > (sdomi->tableSize - sdomi->lnkCnt) ) {
size += sdomi->lnkCnt;
sdomi->lnk_table = (int32_t *)
mrealloc(sdomi->lnk_table,
sdomi->tableSize*sizeof(int32_t),
size*sizeof(int32_t),
"main-dcrlt" );
sdomi->tableSize = size;
}
}
for( int32_t i = 0; i < dlinks->getNumLinks(); i++ ) {
char *link = dlinks->getLinkPtr(i);
int32_t dlen;
const char *dom = getDomFast ( link , &dlen );
uint32_t lkey = hash32( dom , dlen );
int32_t j;
for( j = 0; j < sdomi->lnkCnt; j++ ) {
if( sdomi->lnk_table[j] == (int32_t)lkey ) break;
}
			sdomi->lnkPages++;
			if( j != sdomi->lnkCnt ) continue;
			// first time this outlinked domain has been seen
			sdomi->lnk_table[sdomi->lnkCnt++] = lkey;
}
// Handle lists
if( !sipi->numDom || !sdomi->numIp ){
sdomi->numIp++; sipi->numDom++;
//Add to IP list for Domain
sdomi->ip_list = (struct ip_info **)
mrealloc( sdomi->ip_list,
(sdomi->numIp-1)*sizeof(char *),
sdomi->numIp*sizeof(char *),
"main-dcldm" );
sdomi->ip_list[sdomi->numIp-1] = sipi;
//Add to domain list for IP
sipi->dom_list = (struct dom_info **)
mrealloc( sipi->dom_list,
(sipi->numDom-1)*sizeof(char *),
sipi->numDom*sizeof(char *),
"main-dclip" );
sipi->dom_list[sipi->numDom-1] = sdomi;
}
else {
int32_t i;
for( i = 0;
(i < sdomi->numIp)
&& (sdomi->ip_list[i] != sipi);
i++ );
if( sdomi->numIp != i ) goto updateIp;
sdomi->numIp++;
			sdomi->ip_list = (struct ip_info **)
				mrealloc( sdomi->ip_list,
					  (sdomi->numIp-1)*sizeof(sdomi->ip_list[0]),
					  sdomi->numIp*sizeof(sdomi->ip_list[0]),
					  "main-dcldm" );
sdomi->ip_list[sdomi->numIp-1] = sipi;
updateIp:
for( i = 0;
(i < sipi->numDom)
&& (sipi->dom_list[i] != sdomi);
i++ );
if( sipi->numDom != i ) goto endListUpdate;
sipi->numDom++;
			sipi->dom_list = (struct dom_info **)
				mrealloc( sipi->dom_list,
					  (sipi->numDom-1)*sizeof(sipi->dom_list[0]),
					  sipi->numDom*sizeof(sipi->dom_list[0]),
					  "main-dclip" );
sipi->dom_list[sipi->numDom-1] = sdomi;
		endListUpdate:
			; // null statement; the label just needs something to attach to
}
if( !((++countDocs) % 1000) )
log(LOG_INFO, "countdomains: %" PRId32" records searched.",countDocs);
if( countDocs == numRecs ) goto freeInfo;
//else countDocs++;
}
startKey = *(key96_t *)list.getLastKey();
startKey++;
// watch out for wrap around
if ( startKey < *(key96_t *)list.getLastKey() ) {
log( LOG_INFO, "countdomains: Keys wrapped around! Exiting." );
goto freeInfo;
}
if ( countDocs >= numRecs ) {
freeInfo:
char buf[128];
//int32_t value ;
int32_t len ;
char loop ;
int32_t recsDisp;
struct ip_info *tmpipi ;
struct dom_info *tmpdomi ;
loop = 0;
FILE *fhndl;
char out[sizeof(g_hostdb.m_dir)+128];
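		// output mode 9 writes the raw per-domain/per-IP counts as XML
		// before the HTML report; any other mode skips straight to HTML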
if( output != 9 ) goto printHtml;
// Dump raw data to a file to parse later
snprintf( out, sizeof(out), "%scntdom.xml", g_hostdb.m_dir );
out[ sizeof(out)-1 ] = '\0';
if( !(fhndl = fopen( out, "wb" )) ) {
log( LOG_INFO, "countdomains: File Open Failed." );
return;
}
gbsort( dom_table, countDom, sizeof(struct dom_info *), dom_fcmp );
for( int32_t i = 0; i < countDom; i++ ) {
if( !dom_table[i] ) continue;
tmpdomi = (struct dom_info *)dom_table[i];
len = tmpdomi->domLen;
if( tmpdomi->domLen > 127 ) len = 126;
strncpy( buf, tmpdomi->dom, len );
buf[len] = '\0';
fprintf(fhndl,
"<rec1>\n\t<domain>%s</domain>\n"
"\t<pages>%" PRId32"</pages>\n"
//"\t<quality>%" PRId64"</quality>\n"
"\t<block>\n",
buf, tmpdomi->pages
//,(tmpdomi->quality/tmpdomi->pages)
);
		gbsort( tmpdomi->ip_list, tmpdomi->numIp,
			sizeof(tmpdomi->ip_list[0]), ip_fcmp );
for( int32_t j = 0; j < tmpdomi->numIp; j++ ) {
if( !tmpdomi->ip_list[j] ) continue;
tmpipi = (struct ip_info *)tmpdomi->ip_list[j];
iptoa( tmpipi->ip,buf);
fprintf(fhndl,"\t\t<ip>%s</ip>\n",buf);
}
fprintf(fhndl,
"\t</block>\n"
"\t<links>\n");
}
gbsort( ip_table, countIp, sizeof(struct ip_info *), ip_fcmp );
for( int32_t i = 0; i < countIp; i++ ) {
if( !ip_table[i] ) continue;
tmpipi = (struct ip_info *)ip_table[i];
iptoa( tmpipi->ip,buf);
fprintf(fhndl,
"<rec2>\n\t<ip>%s</ip>\n"
"\t<pages>%" PRId32"</pages>\n"
"\t<block>\n",
buf, tmpipi->pages);
for( int32_t j = 0; j < tmpipi->numDom; j++ ) {
tmpdomi = (struct dom_info *)tmpipi->dom_list[j];
len = tmpdomi->domLen;
if( tmpdomi->domLen > 127 ) len = 126;
strncpy( buf, tmpdomi->dom, len );
buf[len] = '\0';
fprintf(fhndl,
"\t\t<domain>%s</domain>\n",
buf);
}
fprintf(fhndl,
"\t</block>\n"
"</rec2>\n");
}
if( fclose( fhndl ) < 0 ) {
log( LOG_INFO, "countdomains: File Close Failed." );
return;
}
fhndl = 0;
printHtml:
// HTML file Output
snprintf( out, sizeof(out), "%scntdom.html", g_hostdb.m_dir );
out[ sizeof(out)-1 ] = '\0';
if( !(fhndl = fopen( out, "wb" )) ) {
log( LOG_INFO, "countdomains: File Open Failed." );
return;
}
int64_t total = g_titledb.estimateGlobalNumDocs();
static const char link_ip[] = "http://www.gigablast.com/search?"
"code=gbmonitor&q=ip%3A";
static const char link_dom[] = "http://www.gigablast.com/search?"
"code=gbmonitor&q=site%3A";
static const char menu[] = "<table cellpadding=\"2\" cellspacing=\"2\">\n<tr>"
"<th bgcolor=\"#CCCC66\"><a href=\"#pid\">"
"Domains Sorted By Pages</a></th>"
"<th bgcolor=\"#CCCC66\"><a href=\"#lid\">"
"Domains Sorted By Links</a></th>"
"<th bgcolor=\"#CCCC66\"><a href=\"#pii\">"
"IPs Sorted By Pages</a></th>"
"<th bgcolor=\"#CCCC66\"><a href=\"#dii\">"
"IPs Sorted By Domains</a></th>"
"<th bgcolor=\"#CCCC66\"><a href=\"#stats\">"
"Stats</a></th>"
"</tr>\n</table>\n<br>\n";
static const char hdr[] = "<table cellpadding=\"5\" cellspacing=\"2\">"
"<tr bgcolor=\"AAAAAA\">"
"<th>Domain</th>"
"<th>Domains Linked</th>"
//"<th>Avg Quality</th>"
"<th># Pages</th>"
"<th>Extrap # Pages</th>"
"<th>IP</th>"
"</tr>\n";
static const char hdr2[] = "<table cellpadding=\"5\" cellspacing=\"2\">"
"<tr bgcolor=\"AAAAAA\">"
"<th>IP</th>"
"<th>Domain</th>"
"<th>Domains Linked</th>"
//"<th>Avg Quality</th>"
"<th># Pages</th>"
"<th>Extrap # Pages</th>"
"</tr>\n";
static const char clr1[] = "#FFFF00";//"yellow";
static const char clr2[] = "#FFFF66";//"orange";
const char *color;
fprintf( fhndl,
"<html><head><title>Domain/IP Counter</title></head>\n"
"<body>"
"<h1>Domain/IP Counter</h1><br><br>"
"<a name=\"stats\">"
"<h2>Stats</h2>\n%s", menu );
// Stats
fprintf( fhndl, "<br>\n\n<table>\n"
"<tr><th align=\"left\">Total Number of Domains</th>"
"<td>%" PRId32"</td></tr>\n"
"<tr><th align=\"left\">Total Number of Ips</th>"
"<td>%" PRId32"</td></tr>\n"
"<tr><th align=\"left\">Number of Documents Searched"
"</th><td>%" PRId32"</td></tr>\n"
"<tr><th align=\"left\">Number of Failed Attempts</th>"
"<td>%" PRId32"</td></tr><tr></tr><tr>\n"
"<tr><th align=\"left\">Number of Documents in Index"
"</th><td>%" PRId64"</td></tr>\n"
"<tr><th align=\"left\">Estimated Domains in index</th>"
"<td>%" PRId64"</td></tr>"
"</table><br><br><br>\n"
,countDom,countIp,
countDocs, attempts-countDocs,total,
countDocs ? ((countDom*total)/countDocs) : 0 );
fprintf( fhndl, "<a name=\"pid\">\n"
"<h2>Domains Sorted By Pages</h2>\n"
"%s", menu );
gbsort( dom_table, countDom, sizeof(struct dom_info *), dom_fcmp );
printDomLp:
fprintf( fhndl,"%s", hdr );
recsDisp = countDom;
if( countDom > 1000 ) recsDisp = 1000;
for( int32_t i = 0; i < recsDisp; i++ ) {
char buf[128];
int32_t len;
if( !dom_table[i] ) continue;
if( i%2 ) color = clr2;
else color = clr1;
tmpdomi = (struct dom_info *)dom_table[i];
len = tmpdomi->domLen;
if( tmpdomi->domLen > 127 ) len = 126;
strncpy( buf, tmpdomi->dom, len );
buf[len] = '\0';
fprintf( fhndl, "<tr bgcolor=\"%s\"><td>"
"<a href=\"%s%s\" target=\"_blank\">%s</a>"
"</td><td>%" PRId32"</td>"
//"<td>%" PRId64"</td>"
"<td>%" PRId32"</td>"
"<td>%" PRId64"</td><td>",
color, link_dom,
buf, buf, tmpdomi->lnkCnt,
//(tmpdomi->quality/tmpdomi->pages),
tmpdomi->pages,
((tmpdomi->pages*total)/countDocs) );
for( int32_t j = 0; j < tmpdomi->numIp; j++ ) {
tmpipi = (struct ip_info *)tmpdomi->ip_list[j];
iptoa(tmpipi->ip,buf);
fprintf( fhndl, "<a href=\"%s%s\""
"target=\"_blank\">%s</a>\n",
link_ip, buf, buf );
}
fprintf( fhndl, "</td></tr>\n" );
fprintf( fhndl, "\n" );
}
fprintf( fhndl, "</table>\n<br><br><br>" );
if( loop == 0 ) {
loop = 1;
gbsort( dom_table, countDom, sizeof(struct dom_info *), dom_lcmp );
fprintf( fhndl, "<a name=\"lid\">"
"<h2>Domains Sorted By Links</h2>\n%s", menu );
goto printDomLp;
}
loop = 0;
fprintf( fhndl, "<a name=\"pii\">"
"<h2>IPs Sorted By Pages</h2>\n%s", menu );
gbsort( ip_table, countIp, sizeof(struct ip_info *), ip_fcmp );
printIpLp:
fprintf( fhndl,"%s", hdr2 );
recsDisp = countIp;
if( countIp > 1000 ) recsDisp = 1000;
for( int32_t i = 0; i < recsDisp; i++ ) {
char buf[128];
if( !ip_table[i] ) continue;
tmpipi = (struct ip_info *)ip_table[i];
iptoa(tmpipi->ip,buf);
if( i%2 ) color = clr2;
else color = clr1;
int32_t linked = 0;
for( int32_t j = 0; j < tmpipi->numDom; j++ ) {
tmpdomi=(struct dom_info *)tmpipi->dom_list[j];
linked += tmpdomi->lnkCnt;
}
fprintf( fhndl, "\t<tr bgcolor=\"%s\"><td>"
"<a href=\"%s%s\" target=\"_blank\">%s</a>"
"</td>"
"<td>%" PRId32"</td>"
"<td>%" PRId32"</td>"
//"<td>%" PRId64"</td>"
"<td>%" PRId32"</td>"
"<td>%" PRId64"</td></tr>\n",
color,
link_ip, buf, buf, tmpipi->numDom, linked,
//(tmpipi->quality/tmpipi->pages),
tmpipi->pages,
((tmpipi->pages*total)/countDocs) );
fprintf( fhndl, "\n" );
}
fprintf( fhndl, "</table>\n<br><br><br>" );
if( loop == 0 ) {
loop = 1;
gbsort( ip_table, countIp, sizeof(struct ip_info *), ip_dcmp );
fprintf( fhndl, "<a name=\"dii\">"
"<h2>IPs Sorted By Domains</h2>\n%s", menu );
goto printIpLp;
}
if( fclose( fhndl ) < 0 ) {
log( LOG_INFO, "countdomains: File Close Failed." );
return;
}
fhndl = 0;
int32_t ima = 0;
int32_t dma = 0;
log( LOG_INFO, "countdomains: Freeing ip info struct..." );
for( int32_t i = 0; i < countIp; i++ ) {
if( !ip_table[i] ) continue;
//value = ipHT.getValue( ip_table[i] );
//if(value == 0) continue;
tmpipi = (struct ip_info *)ip_table[i];
mfree( tmpipi->dom_list, tmpipi->numDom*sizeof(tmpipi->dom_list[0]),
"main-dcflip" );
		ima += tmpipi->numDom * sizeof(tmpipi->dom_list[0]);
mfree( tmpipi, sizeof(struct ip_info), "main-dcfip" );
ima += sizeof(struct ip_info);
tmpipi = NULL;
}
mfree( ip_table, numRecs * sizeof(struct ip_info *), "main-dcfit" );
log( LOG_INFO, "countdomains: Freeing domain info struct..." );
for( int32_t i = 0; i < countDom; i++ ) {
if( !dom_table[i] ) continue;
tmpdomi = (struct dom_info *)dom_table[i];
mfree( tmpdomi->lnk_table,
tmpdomi->tableSize*sizeof(int32_t),
"main-dcfsdlt" );
dma += tmpdomi->tableSize * sizeof(int32_t);
mfree( tmpdomi->ip_list, tmpdomi->numIp*sizeof(tmpdomi->ip_list[0]),
"main-dcfldom" );
		dma += tmpdomi->numIp * sizeof(tmpdomi->ip_list[0]);
mfree( tmpdomi->dom, tmpdomi->domLen, "main-dcfsdom" );
dma += tmpdomi->domLen;
mfree( tmpdomi, sizeof(struct dom_info), "main-dcfdom" );
dma+= sizeof(struct dom_info);
tmpdomi = NULL;
}
mfree( dom_table, numRecs * sizeof(struct dom_info *), "main-dcfdt" );
int64_t time_end = gettimeofdayInMilliseconds();
log( LOG_INFO, "countdomains: Took %" PRId64"ms to count domains in %" PRId32" recs.",
time_end-time_start, countDocs );
log( LOG_INFO, "countdomains: %" PRId32" bytes of Total Memory Used.",
ima + dma + (8 * numRecs) );
log( LOG_INFO, "countdomains: %" PRId32" bytes Total for IP.", ima );
log( LOG_INFO, "countdomains: %" PRId32" bytes Total for Dom.", dma );
log( LOG_INFO, "countdomains: %" PRId32" bytes Average for IP.", countIp ? ima/countIp : 0 );
log( LOG_INFO, "countdomains: %" PRId32" bytes Average for Dom.", countDom ? dma/countDom : 0 );
return;
}
}
}
// Sort by IP frequency in pages, descending
int ip_fcmp (const void *p1, const void *p2) {
	// gbsort passes pointers to the array elements, which are themselves
	// pointers to ip_info structs
	const struct ip_info *ii1 = *(struct ip_info * const *)p1;
	const struct ip_info *ii2 = *(struct ip_info * const *)p2;
	return ii2->pages - ii1->pages;
}
// Sort by number of domains hosted on the IP, descending
int ip_dcmp (const void *p1, const void *p2) {
	const struct ip_info *ii1 = *(struct ip_info * const *)p1;
	const struct ip_info *ii2 = *(struct ip_info * const *)p2;
	return ii2->numDom - ii1->numDom;
}
// Sort domains by page frequency in titlerecs, descending
int dom_fcmp (const void *p1, const void *p2) {
	const struct dom_info *di1 = *(struct dom_info * const *)p1;
	const struct dom_info *di2 = *(struct dom_info * const *)p2;
	return di2->pages - di1->pages;
}
// Sort domains by number of outlinked domains, descending
int dom_lcmp (const void *p1, const void *p2) {
	const struct dom_info *di1 = *(struct dom_info * const *)p1;
	const struct dom_info *di2 = *(struct dom_info * const *)p2;
	return di2->lnkCnt - di1->lnkCnt;
}
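// Resolve argv[0] to an absolute path and return just its directory,
// including the trailing slash; returns NULL if the path cannot be resolved
// or does not fit in the static buffer.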
static const char *getAbsoluteGbDir(const char *argv0) {
static char s_buf[1024];
char *s = realpath(argv0, NULL);
if(!s)
return NULL;
if(strlen(s) >= sizeof(s_buf))
return NULL;
strcpy(s_buf,s);
free(s);
	// chop off the last component so we have just the directory (including a trailing /)
char *slash = strrchr(s_buf, '/');
if(slash==NULL) {
//what? this is not supposed to happen that realpath returns an absolute path that doesn't contain a slash
return NULL;
}
slash[1] = '\0';
return s_buf;
}
///////
//
// used to make package to install files for the package
//
///////
static int copyFiles(const char *dstDir) {
const char *srcDir = "./";
SafeBuf fileListBuf;
g_process.getFilesToCopy ( srcDir , &fileListBuf );
SafeBuf tmp;
tmp.safePrintf(
"cp -r %s %s"
, fileListBuf.getBufStart()
, dstDir
);
//log(LOG_INIT,"admin: %s", tmp.getBufStart());
	fprintf(stderr,"\nRunning cmd: %s\n",tmp.getBufStart());
	if ( system ( tmp.getBufStart() ) != 0 ) {
		fprintf(stderr,"Copy command failed\n");
		return 1;
	}
	return 0;
}
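// Forward WordVariationGenerator log messages into gb's own log() levels.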
static void wvg_log_function(WordVariationGenerator::log_class_t log_class, const char *fmt, va_list ap) {
char buf[2048];
vsnprintf(buf,sizeof(buf), fmt, ap);
buf[sizeof(buf)-1]='\0';
	int32_t type;
	switch(log_class) {
		case WordVariationGenerator::log_trace: type = LOG_TRACE; break;
		case WordVariationGenerator::log_debug: type = LOG_DEBUG; break;
		case WordVariationGenerator::log_info: type = LOG_INFO; break;
		case WordVariationGenerator::log_warn: type = LOG_WARN; break;
		case WordVariationGenerator::log_error: type = LOG_ERROR; break;
		default: type = LOG_INFO; break; // unknown class: fall back to info
	}
log(type,"wordvar:%s",buf);
}