open-source-search-engine/Collectiondb.cpp

#include "gb-include.h"

#include "Collectiondb.h"
//#include "CollectionRec.h"
#include "Xml.h"
#include "Url.h"
#include "Loop.h"
#include "Spider.h"  // for calling SpiderLoop::collectionsUpdated()
#include "Posdb.h"
//#include "Indexdb.h"
#include "Datedb.h"
#include "Titledb.h"
//#include "Revdb.h"
//#include "Sections.h"
#include "Placedb.h"
#include "Tagdb.h"
#include "Catdb.h"
#include "Tfndb.h"
#include "Spider.h"
//#include "Checksumdb.h"
#include "Clusterdb.h"
#include "Spider.h"
#include "Repair.h"
#include "Users.h"
#include "Parms.h"

void testRegex ( ) ;

HashTableX g_collTable;

// a global class extern'd in .h file
Collectiondb g_collectiondb;

Collectiondb::Collectiondb ( ) {
	m_wrapped = 0;
	m_numRecs = 0;
	m_numRecsUsed = 0;
	m_numCollsSwappedOut = 0;
	m_initializing = false;
	//m_lastUpdateTime = 0LL;
	m_needsSave = false;
	// sanity
	if ( RDB_END2 >= RDB_END ) return;
	log("db: increase RDB_END2 to at least %" INT32 " in "
	    "Collectiondb.h",(int32_t)RDB_END);
	char *xx=NULL;*xx=0;
}

// reset rdb
void Collectiondb::reset() {
	log(LOG_INFO,"db: resetting collectiondb.");
	for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
		if ( ! m_recs[i] ) continue;
		mdelete ( m_recs[i], sizeof(CollectionRec), "CollectionRec" );
		delete ( m_recs[i] );
		m_recs[i] = NULL;
	}
	m_numRecs     = 0;
	m_numRecsUsed = 0;
	g_collTable.reset();
}

/*
bool Collectiondb::init ( bool isDump ) {
	reset();
	if ( g_isYippy ) return true;
	// reset # of recs
	//m_numRecs     = 0;
	//m_numRecsUsed = 0;
	// . now load ALL recs
	// . returns false and sets g_errno on error
	if ( ! load ( isDump ) ) return false;
	// update time
	updateTime();
	// so we don't save again
	m_needsSave = false;
	// sanity
	if ( RDB_END2 < RDB_END ) {
		log("db: increase RDB_END2 to at least %" INT32 " in "
		    "Collectiondb.h",(int32_t)RDB_END);
		char *xx=NULL;*xx=0;
	}
	// if it set g_errno, return false
	//if ( g_errno ) return log("admin: Had init error: %s.",
	//			  mstrerror(g_errno));
	g_errno = 0;
	// otherwise, true, even if reloadList() blocked
	return true;
}
*/

extern bool g_inAutoSave;

// . save to disk
// . returns false if blocked, true otherwise
bool Collectiondb::save ( ) {
	if ( g_conf.m_readOnlyMode ) return true;

	if ( g_inAutoSave && m_numRecsUsed > 20 && g_hostdb.m_hostId != 0 )
		return true;

	// which collection rec needs a save
	for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
		if ( ! m_recs[i]              ) continue;
		// temp debug message
		//logf(LOG_DEBUG,"admin: SAVING collection #%" INT32 " ANYWAY",i);
		if ( ! m_recs[i]->m_needsSave ) continue;

		// if we core in malloc we won't be able to save the
		// coll.conf files
		if ( m_recs[i]->m_isCustomCrawl &&
		     g_inMemFunction &&
		     g_hostdb.m_hostId != 0 )
			continue;

		//log(LOG_INFO,"admin: Saving collection #%" INT32 ".",i);
		m_recs[i]->save ( );
	}
	// oh well
	return true;
}

///////////
//
// fill up our m_recs[] array based on the coll.*.*/coll.conf files
//
///////////
bool Collectiondb::loadAllCollRecs ( ) {

	m_initializing = true;

	char dname[1024];
	// MDW: sprintf ( dname , "%s/collections/" , g_hostdb.m_dir );
	sprintf ( dname , "%s" , g_hostdb.m_dir );
	Dir d;
	d.set ( dname );
	if ( ! d.open ()) return log("admin: Could not load collection config "
				     "files.");
	int32_t count = 0;
	char *f;
	while ( ( f = d.getNextFilename ( "*" ) ) ) {
		// skip if first char not "coll."
		if ( strncmp ( f , "coll." , 5 ) != 0 ) continue;
		// must end on a digit (i.e. coll.main.0)
		if ( ! is_digit (f[gbstrlen(f)-1]) ) continue;
		// count them
		count++;
	}

	// reset directory for another scan
	d.set ( dname );
	if ( ! d.open ()) return log("admin: Could not load collection config "
				     "files.");

	// note it
	//log(LOG_INFO,"db: loading collection config files.");
	// . scan through all subdirs in the collections dir
	// . they should be like, "coll.main/" and "coll.mycollection/"
	while ( ( f = d.getNextFilename ( "*" ) ) ) {
		// skip if first char not "coll."
		if ( strncmp ( f , "coll." , 5 ) != 0 ) continue;
		// must end on a digit (i.e. coll.main.0)
		if ( ! is_digit (f[gbstrlen(f)-1]) ) continue;
		// point to collection
		char *coll = f + 5;
		// NULL terminate at .
		char *pp = strchr ( coll , '.' );
		if ( ! pp ) continue;
		*pp = '\0';
		// get collnum
		collnum_t collnum = atol ( pp + 1 );
		// add it
		if ( ! addExistingColl ( coll , collnum ) )
			return false;
		// swap it out if we got 100+ collections
		// if ( count < 100 ) continue;
		// CollectionRec *cr = getRec ( collnum );
		// if ( cr ) cr->swapOut();
	}
	// if no existing recs added... add coll.main.0 always at startup
	if ( m_numRecs == 0 ) {
		log("admin: adding main collection.");
		addNewColl ( "main",
			     0 , // customCrawl ,
			     NULL,
			     0 ,
			     true , // bool saveIt ,
			     // Parms.cpp reserves this so it can be sure
			     // to add the same collnum to every shard
			     0 );
	}

	m_initializing = false;

	// note it
	//log(LOG_INFO,"db: Loaded data for %" INT32 " collections. Ranging from "
	//    "collection #0 to #%" INT32 ".",m_numRecsUsed,m_numRecs-1);
	// update the time
	//updateTime();
	// don't clean the tree if just dumpin
	//if ( isDump ) return true;
	return true;
}

// after we've initialized all rdbs in main.cpp call this to clean out
// our rdb trees
bool Collectiondb::cleanTrees ( ) {

	// remove any nodes with illegal collnums
	Rdb *r;
	//r = g_indexdb.getRdb();
	//r->m_tree.cleanTree    ((char **)r->m_bases);
	r = g_posdb.getRdb();
	//r->m_tree.cleanTree    ();//(char **)r->m_bases);
	r->m_buckets.cleanBuckets();
	//r = g_datedb.getRdb();
	//r->m_tree.cleanTree    ((char **)r->m_bases);

	r = g_titledb.getRdb();
	r->m_tree.cleanTree    ();//(char **)r->m_bases);
	//r = g_revdb.getRdb();
	//r->m_tree.cleanTree    ((char **)r->m_bases);
	//r = g_sectiondb.getRdb();
	//r->m_tree.cleanTree    ((char **)r->m_bases);
	//r = g_checksumdb.getRdb();
	//r->m_tree.cleanTree    ((char **)r->m_bases);
	//r = g_tfndb.getRdb();
	//r->m_tree.cleanTree    ((char **)r->m_bases);
	r = g_spiderdb.getRdb();
	r->m_tree.cleanTree    ();//(char **)r->m_bases);
	r = g_doledb.getRdb();
	r->m_tree.cleanTree    ();//(char **)r->m_bases);
	// success
	return true;
}
/*
void Collectiondb::updateTime() {
	// get time now in milliseconds
	int64_t newTime = gettimeofdayInMilliseconds();
	// change it
	if ( m_lastUpdateTime == newTime ) newTime++;
	// update it
	m_lastUpdateTime = newTime;
	// we need a save
	m_needsSave = true;
}
*/

#include "Statsdb.h"
#include "Cachedb.h"
#include "Syncdb.h"

// same as addOldColl()
bool Collectiondb::addExistingColl ( char *coll, collnum_t collnum ) {

	int32_t i = collnum;

	// ensure does not already exist in memory
	collnum_t oldCollnum = getCollnum(coll);
	if ( oldCollnum >= 0 ) {
		g_errno = EEXIST;
		log("admin: Trying to create collection \"%s\" but "
		    "already exists in memory. Do an ls on "
		    "the working dir to see if there are two "
		    "collection dirs with the same coll name",coll);
		char *xx=NULL;*xx=0;
	}

	// also try by #, i've seen this happen too
	CollectionRec *ocr = getRec ( i );
	if ( ocr ) {
		g_errno = EEXIST;
		log("admin: Collection id %i is in use already by "
		    "%s, so we can not add %s. moving %s to trash."
		    ,(int)i,ocr->m_coll,coll,coll);
		SafeBuf cmd;
		int64_t now = gettimeofdayInMilliseconds();
		cmd.safePrintf ( "mv coll.%s.%i trash/coll.%s.%i.%" UINT64
				 , coll
				 ,(int)i
				 , coll
				 ,(int)i
				 , now );
		//log("admin: %s",cmd.getBufStart());
		gbsystem ( cmd.getBufStart() );
		return true;
	}

	// create the record in memory
	CollectionRec *cr = new (CollectionRec);
	if ( ! cr )
		return log("admin: Failed to allocated %" INT32 " bytes for new "
			   "collection record for \"%s\".",
			   (int32_t)sizeof(CollectionRec),coll);
	mnew ( cr , sizeof(CollectionRec) , "CollectionRec" );

	// set collnum right for g_parms.setToDefault() call just in case
	// because before it was calling CollectionRec::reset() which
	// was resetting the RdbBases for the m_collnum which was garbage
	// and ended up resetting random collections' rdb. but now
	// CollectionRec::CollectionRec() sets m_collnum to -1 so we should
	// not need this!
	//cr->m_collnum = oldCollnum;

	// get the default.conf from working dir if there
	g_parms.setToDefault( (char *)cr , OBJ_COLL , cr );

	strcpy ( cr->m_coll , coll );
	cr->m_collLen = gbstrlen ( coll );
	cr->m_collnum = i;

	// point to this, so Rdb and RdbBase can reference it
	coll = cr->m_coll;

	//log("admin: loaded old coll \"%s\"",coll);

	// load coll.conf file
	if ( ! cr->load ( coll , i ) ) {
		mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
		log("admin: Failed to load coll.%s.%" INT32 "/coll.conf",coll,i);
		delete ( cr );
		if ( m_recs ) m_recs[i] = NULL;
		return false;
	}

	if ( ! registerCollRec ( cr , false ) ) return false;

	// always index spider status docs now for custom crawls
	if ( cr->m_isCustomCrawl )
		cr->m_indexSpiderReplies = true;

	// and don't do link voting, will help speed up
	if ( cr->m_isCustomCrawl ) {
		cr->m_getLinkInfo = false;
		cr->m_computeSiteNumInlinks = false;
		// limit each shard to 5 spiders per collection to prevent
		// ppl from spidering the web and hogging up resources
		cr->m_maxNumSpiders = 5;
		// diffbot download docs up to 50MB so we don't truncate
		// things like sitemap.xml. but keep regular html pages
		// 1MB
		cr->m_maxTextDocLen  = 1024*1024;
		// xml, pdf, etc can be this. 50MB
		cr->m_maxOtherDocLen = 50000000;
	}

	// we need to compile the regular expressions or update the url
	// filters with new logic that maps crawlbot parms to url filters
	return cr->rebuildUrlFilters ( );
}

// . add a new rec
// . returns false and sets g_errno on error
// . was addRec()
// . "isDump" is true if we don't need to initialize all the rdbs etc
//   because we are doing a './gb dump ...' cmd to dump out data from
//   one Rdb which we will custom initialize in main.cpp where the dump
//   code is. like for instance, posdb.
// . "customCrawl" is 0 for a regular collection, 1 for a simple crawl
//   2 for a bulk job. diffbot terminology.
bool Collectiondb::addNewColl ( char *coll ,
				char customCrawl ,
				char *cpc ,
				int32_t cpclen ,
				bool saveIt ,
				// Parms.cpp reserves this so it can be sure
				// to add the same collnum to every shard
				collnum_t newCollnum ) {


	//do not send add/del coll request until we are in sync with shard!!
	// just return ETRYAGAIN for the parmlist...

	// ensure coll name is legit
	char *p = coll;
	for ( ; *p ; p++ ) {
		if ( is_alnum_a(*p) ) continue;
		if ( *p == '-' ) continue;
		if ( *p == '_' ) continue; // underscore now allowed
		break;
	}
	if ( *p ) {
		g_errno = EBADENGINEER;
		log("admin: \"%s\" is a malformed collection name because it "
		    "contains the '%c' character.",coll,*p);
		return false;
	}
	// . scan for holes
	// . i is also known as the collection id
	//int32_t i = (int32_t)newCollnum;
	// no longer fill empty slots because if they do a reset then
	// a new rec right away it will be filled with msg4 recs not
	// destined for it. Later we will have to recycle some how!!
	//else for ( i = 0 ; i < m_numRecs ; i++ ) if ( ! m_recs[i] ) break;
	// right now we #define collnum_t int16_t. so do not breach that!
	//if ( m_numRecs < 0x7fff ) {
	//	// set it
	//	i = m_numRecs;
	//	// claim it
	//	// we don't do it here, because we check i below and
	//	// increment m_numRecs below.
	//	//m_numRecs++;
	//}
	// TODO: scan for holes here...
	//else {
	if ( newCollnum < 0 ) { char *xx=NULL;*xx=0; }

	// ceiling?
	//int64_t maxColls = 1LL<<(sizeof(collnum_t)*8);
	//if ( i >= maxColls ) {
	//	g_errno = ENOBUFS;
	//	return log("admin: Limit of %" INT64 " collection reached. "
	//		   "Collection not created.",maxColls);
	//}
	// if empty... bail, no longer accepted, use "main"
	if ( ! coll || !coll[0] ) {
		g_errno = EBADENGINEER;
		return log("admin: Trying to create a new collection "
			   "but no collection name provided. Use the \"c\" "
			   "cgi parameter to specify it.");
	}
	// or if too big
	if ( gbstrlen(coll) > MAX_COLL_LEN ) {
		g_errno = ENOBUFS;
		return log("admin: Trying to create a new collection "
			   "whose name \"%s\" of %i chars is longer than the "
			   "max of %" INT32 " chars.",coll,gbstrlen(coll),
			   (int32_t)MAX_COLL_LEN);
	}

	// ensure does not already exist in memory
	if ( getCollnum ( coll ) >= 0 ) {
		g_errno = EEXIST;
		log("admin: Trying to create collection \"%s\" but "
		    "already exists in memory.",coll);
		// just let it pass...
		g_errno = 0 ;
		return true;
	}
	// MDW: ensure not created on disk since time of last load
	char dname[512];
	sprintf(dname, "%scoll.%s.%" INT32 "/",g_hostdb.m_dir,coll,(int32_t)newCollnum);
	DIR *dir = opendir ( dname );
	if ( dir ) closedir ( dir );
	if ( dir ) {
		g_errno = EEXIST;
		return log("admin: Trying to create collection %s but "
			   "directory %s already exists on disk.",coll,dname);
	}

	// create the record in memory
	CollectionRec *cr = new (CollectionRec);
	if ( ! cr )
		return log("admin: Failed to allocated %" INT32 " bytes for new "
			   "collection record for \"%s\".",
			   (int32_t)sizeof(CollectionRec),coll);
	// register the mem
	mnew ( cr , sizeof(CollectionRec) , "CollectionRec" );

	// get copy collection
	//CollectionRec *cpcrec = NULL;
	//if ( cpc && cpc[0] ) cpcrec = getRec ( cpc , cpclen );
	//if ( cpc && cpc[0] && ! cpcrec )
	//	log("admin: Collection \"%s\" to copy config from does not "
	//	    "exist.",cpc);

	// set collnum right for g_parms.setToDefault() call
	//cr->m_collnum = newCollnum;

	// . get the default.conf from working dir if there
	// . i think this calls CollectionRec::reset() which resets all of its
	//   rdbbase classes for its collnum so m_collnum needs to be right
	//g_parms.setToDefault( (char *)cr );
	// get the default.conf from working dir if there
	//g_parms.setToDefault( (char *)cr , OBJ_COLL );
	g_parms.setToDefault( (char *)cr , OBJ_COLL , cr );

	// put search results back so it doesn't mess up results in qatest123
	if ( strcmp(coll,"qatest123") == 0 )
		cr->m_sameLangWeight = 20.0;
	/*
	// the default conf file
	char tmp1[1024];
	sprintf ( tmp1 , "%sdefault.conf" , g_hostdb.m_dir );
	// . set our parms from the file.
	// . accepts OBJ_COLLECTIONREC or OBJ_CONF
	g_parms.setFromFile ( cr , NULL , tmp1 );
	*/

	// this will override all
	// if ( cpcrec ) {
	// 	// copy it, but not the timedb hashtable, etc.
	// 	int32_t size = (char *)&(cpcrec->m_END_COPY) - (char *)cpcrec;
	// 	// JAB: bad gbmemcpy - no donut!
	// 	// this is not how objects are supposed to be copied!!!
	// 	gbmemcpy ( cr , cpcrec , size);
	// }

	// set coll id and coll name for coll id #i
	strcpy ( cr->m_coll , coll );
	cr->m_collLen = gbstrlen ( coll );
	cr->m_collnum = newCollnum;

	// point to this, so Rdb and RdbBase can reference it
	coll = cr->m_coll;

	//
	// BEGIN NEW CODE
	//

	//
	// get token and crawlname if customCrawl is 1 or 2
	//
	char *token = NULL;
	char *crawl = NULL;
	SafeBuf tmp;
	// . return true with g_errno set on error
	// . if we fail to set a parm right we should force ourselves
	//   out sync
	if ( customCrawl ) {
		if ( ! tmp.safeStrcpy ( coll ) ) return true;
		token = tmp.getBufStart();
		// diffbot coll name format is <token>-<crawlname>
		char *h = strchr ( tmp.getBufStart() , '-' );
		if ( ! h ) {
			log("crawlbot: bad custom collname");
			g_errno = EBADENGINEER;
			mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
			delete ( cr );
			return true;
		}
		*h = '\0';
		crawl = h + 1;
		if ( ! crawl[0] ) {
			log("crawlbot: bad custom crawl name");
			mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
			delete ( cr );
			g_errno = EBADENGINEER;
			return true;
		}
		// or if too big!
		if ( gbstrlen(crawl) > 30 ) {
			log("crawlbot: crawlbot crawl NAME is over 30 chars");
			mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
			delete ( cr );
			g_errno = EBADENGINEER;
			return true;
		}
	}

	//log("parms: added new collection \"%s\"", collName );

	cr->m_maxToCrawl = -1;
	cr->m_maxToProcess = -1;


	if ( customCrawl ) {
		// always index spider status docs now
		cr->m_indexSpiderReplies = true;
		// remember the token
		cr->m_diffbotToken.set ( token );
		cr->m_diffbotCrawlName.set ( crawl );
		// bring this back
		cr->m_diffbotApiUrl.set ( "" );
		cr->m_diffbotUrlCrawlPattern.set ( "" );
		cr->m_diffbotUrlProcessPattern.set ( "" );
		cr->m_diffbotPageProcessPattern.set ( "" );
		cr->m_diffbotUrlCrawlRegEx.set ( "" );
		cr->m_diffbotUrlProcessRegEx.set ( "" );
    cr->m_diffbotMaxHops = -1;

    cr->m_spiderStatus = SP_INITIALIZING;
		// do not spider more than this many urls total.
		// -1 means no max.
		cr->m_maxToCrawl = 100000;
		// do not process more than this. -1 means no max.
		cr->m_maxToProcess = 100000;
		// -1 means no max
		cr->m_maxCrawlRounds = -1;
		// diffbot download docs up to 10MB so we don't truncate
		// things like sitemap.xml
		cr->m_maxTextDocLen  = 10000000;
		cr->m_maxOtherDocLen = 10000000;
		// john wants deduping on by default to avoid
		// processing similar pgs
		cr->m_dedupingEnabled = true;
		// show the ban links in the search results. the
		// collection name is cryptographic enough to show that
		cr->m_isCustomCrawl = customCrawl;
		cr->m_diffbotOnlyProcessIfNewUrl = true;
		// default respider to off
		cr->m_collectiveRespiderFrequency = 0.0;
		//cr->m_restrictDomain = true;
		// reset the crawl stats
		// always turn off gigabits so &s=1000 can do summary skipping
		cr->m_docsToScanForTopics = 0;
		// turn off link voting, etc. to speed up
		cr->m_getLinkInfo = false;
		cr->m_computeSiteNumInlinks = false;
	}

	// . this will core if a host was dead and then when it came
	//   back up host #0's parms.cpp told it to add a new coll
	cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
	cr->m_diffbotCrawlEndTime   = 0;

	// . just the basics on these for now
	// . if certain parms are changed then the url filters
	//   must be rebuilt, as well as possibly the waiting tree!!!
	// . need to set m_urlFiltersHavePageCounts etc.
	cr->rebuildUrlFilters ( );

	cr->m_useRobotsTxt = true;

	// reset crawler stats.they should be loaded from crawlinfo.txt
	memset ( &cr->m_localCrawlInfo , 0 , sizeof(CrawlInfo) );
	memset ( &cr->m_globalCrawlInfo , 0 , sizeof(CrawlInfo) );

	// note that
	log("colldb: initial revival for %s",cr->m_coll);

	// . assume we got some urls ready to spider
	// . Spider.cpp will wait SPIDER_DONE_TIME seconds and if it has no
	//   urls it spidered in that time these will get set to 0 and it
	//   will send out an email alert if m_sentCrawlDoneAlert is not true.
	cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 1;
	cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = 1;

	// set some defaults. max spiders for all priorities in this
	// collection. NO, default is in Parms.cpp.
	//cr->m_maxNumSpiders = 10;

	//cr->m_needsSave = 1;

	// start the spiders!
	cr->m_spideringEnabled = true;

	// override this?
	saveIt = true;

	//
	// END NEW CODE
	//

	//log("admin: adding coll \"%s\" (new=%" INT32 ")",coll,(int32_t)isNew);

	// MDW: create the new directory
 retry22:
	if ( ::mkdir ( dname ,
		       getDirCreationFlags() ) ) {
		       // S_IRUSR | S_IWUSR | S_IXUSR |
		       // S_IRGRP | S_IWGRP | S_IXGRP |
		       // S_IROTH | S_IXOTH ) ) {
		// valgrind?
		if ( errno == EINTR ) goto retry22;
		g_errno = errno;
		mdelete ( cr , sizeof(CollectionRec) , "CollectionRec" );
		delete ( cr );
		return log("admin: Creating directory %s had error: "
			   "%s.", dname,mstrerror(g_errno));
	}

	// save it into this dir... might fail!
	if ( saveIt && ! cr->save() ) {
		mdelete ( cr , sizeof(CollectionRec) , "CollectionRec" );
		delete ( cr );
		return log("admin: Failed to save file %s: %s",
			   dname,mstrerror(g_errno));
	}


	if ( ! registerCollRec ( cr , true ) )
		return false;

	// add the rdbbases for this coll, CollectionRec::m_bases[]
	if ( ! addRdbBasesForCollRec ( cr ) )
		return false;

	return true;
}

void CollectionRec::setBasePtr ( char rdbId , class RdbBase *base ) {
	// if in the process of swapping in, this will be false...
	//if ( m_swappedOut ) { char *xx=NULL;*xx=0; }
	if ( rdbId < 0 || rdbId >= RDB_END ) { char *xx=NULL;*xx=0; }
	// Rdb::deleteColl() will call this even though we are swapped in
	// but it calls it with "base" set to NULL after it nukes the RdbBase
	// so check if base is null here.
	if ( base && m_bases[ (unsigned char)rdbId ]){ char *xx=NULL;*xx=0; }
	m_bases [ (unsigned char)rdbId ] = base;
}

RdbBase *CollectionRec::getBasePtr ( char rdbId ) {
	if ( rdbId < 0 || rdbId >= RDB_END ) { char *xx=NULL;*xx=0; }
	return m_bases [ (unsigned char)rdbId ];
}

static bool s_inside = false;

// . returns NULL w/ g_errno set on error.
// . TODO: ensure not called from in thread, not thread safe
RdbBase *CollectionRec::getBase ( char rdbId ) {

	if ( s_inside ) { char *xx=NULL;*xx=0; }

	if ( ! m_swappedOut ) return m_bases[(unsigned char)rdbId];

	log("cdb: swapin collnum=%" INT32 "",(int32_t)m_collnum);

	// sanity!
	if ( g_threads.amThread() ) { char *xx=NULL;*xx=0; }

	s_inside = true;

	// turn off quickpoll to avoid getbase() being re-called and
	// coring from s_inside being true
	int32_t saved = g_conf.m_useQuickpoll;
	g_conf.m_useQuickpoll = false;

	// load them back in. return NULL w/ g_errno set on error.
	if ( ! g_collectiondb.addRdbBasesForCollRec ( this ) ) {
		log("coll: error swapin: %s",mstrerror(g_errno));
		g_conf.m_useQuickpoll = saved;
		s_inside = false;
		return NULL;
	}

	g_conf.m_useQuickpoll = saved;
	s_inside = false;

	g_collectiondb.m_numCollsSwappedOut--;
	m_swappedOut = false;

	log("coll: swapin was successful for collnum=%" INT32 "",(int32_t)m_collnum);

	return m_bases[(unsigned char)rdbId];
}

bool CollectionRec::swapOut ( ) {

	if ( m_swappedOut ) return true;

	log("cdb: swapout collnum=%" INT32 "",(int32_t)m_collnum);

	// free all RdbBases in each rdb
	for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
	     Rdb *rdb = g_process.m_rdbs[i];
	     // this frees all the RdbBase::m_files and m_maps for the base
	     rdb->resetBase ( m_collnum );
	}

	// now free each base itself
	for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
		RdbBase *base = m_bases[i];
		if ( ! base ) continue;
		mdelete (base, sizeof(RdbBase), "Rdb Coll");
		delete  (base);
		m_bases[i] = NULL;
	}

	m_swappedOut = true;
	g_collectiondb.m_numCollsSwappedOut++;

	return true;
}

// . called only by addNewColl() and by addExistingColl()
bool Collectiondb::registerCollRec ( CollectionRec *cr ,  bool isNew ) {

	// add m_recs[] and to hashtable
	if ( ! setRecPtr ( cr->m_collnum , cr ) )
		return false;

	return true;
}

// swap it in
bool Collectiondb::addRdbBaseToAllRdbsForEachCollRec ( ) {
	for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
		CollectionRec *cr = m_recs[i];
		if ( ! cr ) continue;
		// skip if swapped out
		if ( cr->m_swappedOut ) continue;
		// add rdb base files etc. for it
		addRdbBasesForCollRec ( cr );
	}

	// now clean the trees. moved this into here from
	// addRdbBasesForCollRec() since we call addRdbBasesForCollRec()
	// now from getBase() to load on-demand for saving memory
	cleanTrees();

	return true;
}

bool Collectiondb::addRdbBasesForCollRec ( CollectionRec *cr ) {

	char *coll = cr->m_coll;

	//////
	//
	// if we are doing a dump from the command line, skip this stuff
	//
	//////
	if ( g_dumpMode ) return true;

	// tell rdbs to add one, too
	//if ( ! g_indexdb.getRdb()->addRdbBase1    ( coll ) ) goto hadError;
	if ( ! g_posdb.getRdb()->addRdbBase1        ( coll ) ) goto hadError;
	//if ( ! g_datedb.getRdb()->addRdbBase1     ( coll ) ) goto hadError;

	if ( ! g_titledb.getRdb()->addRdbBase1      ( coll ) ) goto hadError;
	//if ( ! g_revdb.getRdb()->addRdbBase1      ( coll ) ) goto hadError;
	//if ( ! g_sectiondb.getRdb()->addRdbBase1  ( coll ) ) goto hadError;
	if ( ! g_tagdb.getRdb()->addRdbBase1        ( coll ) ) goto hadError;
	//if ( ! g_catdb.getRdb()->addRdbBase1      ( coll ) ) goto hadError;
	//if ( ! g_checksumdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
	//if ( ! g_tfndb.getRdb()->addRdbBase1      ( coll ) ) goto hadError;
	if ( ! g_clusterdb.getRdb()->addRdbBase1    ( coll ) ) goto hadError;
	if ( ! g_linkdb.getRdb()->addRdbBase1       ( coll ) ) goto hadError;
	if ( ! g_spiderdb.getRdb()->addRdbBase1     ( coll ) ) goto hadError;
	if ( ! g_doledb.getRdb()->addRdbBase1       ( coll ) ) goto hadError;

	// now clean the trees
	//cleanTrees();

	// debug message
	//log ( LOG_INFO, "db: verified collection \"%s\" (%" INT32 ").",
	//      coll,(int32_t)cr->m_collnum);

	// tell SpiderCache about this collection, it will create a
	// SpiderCollection class for it.
	//g_spiderCache.reset1();

	// success
	return true;

 hadError:
	log("db: error registering coll: %s",mstrerror(g_errno));
	return false;
}


/*
bool Collectiondb::isAdmin ( HttpRequest *r , TcpSocket *s ) {
	if ( r->getLong("admin",1) == 0 ) return false;
	if ( g_conf.isMasterAdmin ( s , r ) ) return true;
	char *c = r->getString ( "c" );
	CollectionRec *cr = getRec ( c );
	if ( ! cr ) return false;
	return g_users.hasPermission ( r , PAGE_SEARCH );
	//return cr->hasPermission ( r , s );
}

void savingCheckWrapper1 ( int fd , void *state ) {
	WaitEntry *we = (WaitEntry *)state;
	// no state?
	if ( ! we ) { log("colldb: we1 is null"); return; }
	// unregister too
	g_loop.unregisterSleepCallback ( state,savingCheckWrapper1 );
	// if it blocked again i guess tree is still saving
	if ( ! g_collectiondb.resetColl ( we->m_coll ,
					  we ,
					  we->m_purgeSeeds))
		return;
	// all done
	we->m_callback ( we->m_state );
}

void savingCheckWrapper2 ( int fd , void *state ) {
	WaitEntry *we = (WaitEntry *)state;
	// no state?
	if ( ! we ) { log("colldb: we2 is null"); return; }
	// unregister too
	g_loop.unregisterSleepCallback ( state,savingCheckWrapper2 );
	// if it blocked again i guess tree is still saving
	if ( ! g_collectiondb.deleteRec ( we->m_coll , we ) ) return;
	// all done
	we->m_callback ( we->m_state );
}
*/

/*
// delete all records checked in the list
bool Collectiondb::deleteRecs ( HttpRequest *r ) {
	for ( int32_t i = 0 ; i < r->getNumFields() ; i++ ) {
		char *f = r->getField ( i );
		if ( strncmp ( f , "del" , 3 ) != 0 ) continue;
		char *coll = f + 3;
		//if ( ! is_digit ( f[3] )            ) continue;
		//int32_t h = atol ( f + 3 );
		deleteRec ( coll , NULL );
	}
	return true;
}
*/

/*
// . delete a collection
// . this uses blocking unlinks, may make non-blocking later
// . returns false if blocked, true otherwise
bool Collectiondb::deleteRec ( char *coll , WaitEntry *we ) {
	// force on for now
	//deleteTurkdb = true;
	// no spiders can be out. they may be referencing the CollectionRec
	// in XmlDoc.cpp... quite likely.
	//if ( g_conf.m_spideringEnabled ||
	//     g_spiderLoop.m_numSpidersOut > 0 ) {
	//	log("admin: Can not delete collection while "
	//	    "spiders are enabled or active.");
	//	return false;
	//}
	// ensure it's not NULL
	if ( ! coll ) {
		log(LOG_LOGIC,"admin: Collection name to delete is NULL.");
		g_errno = ENOTFOUND;
		return true;
	}
	// find the rec for this collection
	collnum_t collnum = getCollnum ( coll );
	return deleteRec2 ( collnum , we );
}
*/

// if there is an outstanding disk read thread or merge thread then
// Spider.cpp will handle the delete in the callback.
// this is now tryToDeleteSpiderColl in Spider.cpp
/*
void Collectiondb::deleteSpiderColl ( SpiderColl *sc ) {

	sc->m_deleteMyself = true;

	// if not currently being accessed nuke it now
	if ( ! sc->m_msg5.m_waitingForList &&
	     ! sc->m_msg5b.m_waitingForList &&
	     ! sc->m_msg1.m_mcast.m_inUse ) {
		mdelete ( sc, sizeof(SpiderColl),"nukecr2");
		delete ( sc );
		return;
	}
}
*/

/// this deletes the collection, not just part of a reset.
bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
	// do not allow this if in repair mode
	if ( g_repair.isRepairActive() && g_repair.m_collnum == collnum ) {
		log("admin: Can not delete collection while in repair mode.");
		g_errno = EBADENGINEER;
		return true;
	}
	// bitch if not found
	if ( collnum < 0 ) {
		g_errno = ENOTFOUND;
		log(LOG_LOGIC,"admin: Collection #%" INT32 " is bad, "
		    "delete failed.",(int32_t)collnum);
		return true;
	}
	CollectionRec *cr = m_recs [ collnum ];
	if ( ! cr ) {
		log("admin: Collection id problem. Delete failed.");
		g_errno = ENOTFOUND;
		return true;
	}

	if ( g_process.isAnyTreeSaving() ) {
		// note it
		log("admin: tree is saving. waiting2.");
		// all done
		return false;
	}

	// spiders off
	//if ( cr->m_spiderColl &&
	//     cr->m_spiderColl->getTotalOutstandingSpiders() > 0 ) {
	//	log("admin: Can not delete collection while "
	//	    "spiders are outstanding for collection. Turn off "
	//	    "spiders and wait for them to exit.");
	//	return false;
	//}

	char *coll = cr->m_coll;

	// note it
	log(LOG_INFO,"db: deleting coll \"%s\" (%" INT32 ")",coll,
	    (int32_t)cr->m_collnum);

	// we need a save
	m_needsSave = true;

	// nuke doleiptable and waintree and waitingtable
	/*
	SpiderColl *sc = g_spiderCache.getSpiderColl ( collnum );
	sc->m_waitingTree.clear();
	sc->m_waitingTable.clear();
	sc->m_doleIpTable.clear();
	g_spiderLoop.m_lockTable.clear();
	g_spiderLoop.m_lockCache.clear(0);
	sc->m_lastDownloadCache.clear(collnum);
	*/

	// CAUTION: tree might be in the middle of saving
	// we deal with this in Process.cpp now

	// remove from spider cache, tell it to sync up with collectiondb
	//g_spiderCache.reset1();
	// . TODO: remove from g_sync
	// . remove from all rdbs
	//g_indexdb.getRdb()->delColl    ( coll );
	g_posdb.getRdb()->delColl    ( coll );
	//g_datedb.getRdb()->delColl     ( coll );

	g_titledb.getRdb()->delColl    ( coll );
	//g_revdb.getRdb()->delColl      ( coll );
	//g_sectiondb.getRdb()->delColl  ( coll );
	g_tagdb.getRdb()->delColl ( coll );
	// let's preserve the tags... they have all the turk votes in them
	//if ( deleteTurkdb ) {
	//}
	//g_catdb.getRdb()->delColl      ( coll );
	//g_checksumdb.getRdb()->delColl ( coll );
	g_spiderdb.getRdb()->delColl   ( coll );
	g_doledb.getRdb()->delColl     ( coll );
	//g_tfndb.getRdb()->delColl      ( coll );
	g_clusterdb.getRdb()->delColl  ( coll );
	g_linkdb.getRdb()->delColl     ( coll );

	// reset spider info
	SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(collnum);
	if ( sc ) {
		// remove locks from lock table:
		sc->clearLocks();
		//sc->m_collnum = newCollnum;
		//sc->reset();
		// you have to set this for tryToDeleteSpiderColl to
		// actually have a shot at deleting it
		sc->m_deleteMyself = true;
		// cr will be invalid int16_tly after this
		// MDW: this is causing the core...
		// use fake ptrs for easier debugging
		//sc->m_cr = (CollectionRec *)0x99999;//NULL;
		//sc->m_cr = NULL;
		sc->setCollectionRec ( NULL );
		// this will put it on "death row" so it will be deleted
		// once Msg5::m_waitingForList/Merge is NULL
		tryToDeleteSpiderColl ( sc ,"10");
		//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
		//delete ( sc );
		// don't let cr reference us anymore, sc is on deathrow
		// and "cr" is delete below!
		//cr->m_spiderColl = (SpiderColl *)0x8888;//NULL;
		cr->m_spiderColl = NULL;
	}


	// the bulk urls file too i guess
	if ( cr->m_isCustomCrawl == 2 && g_hostdb.m_hostId == 0 ) {
		SafeBuf bu;
		bu.safePrintf("%sbulkurls-%s.txt",
			      g_hostdb.m_dir , cr->m_coll );
		File bf;
		bf.set ( bu.getBufStart() );
		if ( bf.doesExist() ) bf.unlink();
	}

	// now remove from list of collections that might need a disk merge
	removeFromMergeLinkedList ( cr );

	//////
	//
	// remove from m_recs[]
	//
	//////
	setRecPtr ( cr->m_collnum , NULL );

	// free it
	mdelete ( cr, sizeof(CollectionRec),  "CollectionRec" );
	delete ( cr );

	// do not do this here in case spiders were outstanding
	// and they added a new coll right away and it ended up getting
	// recs from the deleted coll!!
	//while ( ! m_recs[m_numRecs-1] ) m_numRecs--;

	// update the time
	//updateTime();
	// done
	return true;
}

//#include "PageTurk.h"

/*
// . reset a collection
// . returns false if blocked and will call callback
bool Collectiondb::resetColl ( char *coll ,  bool purgeSeeds) {

	// ensure it's not NULL
	if ( ! coll ) {
		log(LOG_LOGIC,"admin: Collection name to delete is NULL.");
		g_errno = ENOCOLLREC;
		return true;
	}

	// get the CollectionRec for "qatest123"
	CollectionRec *cr = getRec ( coll ); // "qatest123" );

	// must be there. if not, we create test i guess
	if ( ! cr ) {
		log("db: could not get coll rec \"%s\" to reset", coll);
		char *xx=NULL;*xx=0;
	}

	return resetColl2 ( cr->m_collnum, purgeSeeds);
}
*/

// ensure m_recs[] is big enough for m_recs[collnum] to be a ptr
bool Collectiondb::growRecPtrBuf ( collnum_t collnum ) {

	// an add, make sure big enough
	int32_t need = ((int32_t)collnum+1)*sizeof(CollectionRec *);
	int32_t have = m_recPtrBuf.getLength();
	int32_t need2 = need - have;

	// if already big enough
	if ( need2 <= 0 ) {
		m_recs [ collnum ] = NULL;
		return true;
	}

	m_recPtrBuf.setLabel ("crecptrb");

	// . true here means to clear the new space to zeroes
	// . this shit works based on m_length not m_capacity
	if ( ! m_recPtrBuf.reserve ( need2 ,NULL, true ) ) {
		log("admin: error growing rec ptr buf2.");
		return false;
	}

	// sanity
	if ( m_recPtrBuf.getCapacity() < need ) { char *xx=NULL;*xx=0; }

	// set it
	m_recs = (CollectionRec **)m_recPtrBuf.getBufStart();

	// update length of used bytes in case we re-alloc
	m_recPtrBuf.setLength ( need );

	// re-max
	int32_t max = m_recPtrBuf.getCapacity() / sizeof(CollectionRec *);
	// sanity
	if ( collnum >= max ) { char *xx=NULL;*xx=0; }

	// initialize slot
	m_recs [ collnum ] = NULL;

	return true;
}


bool Collectiondb::setRecPtr ( collnum_t collnum , CollectionRec *cr ) {

	// first time init hashtable that maps coll to collnum
	if ( g_collTable.m_numSlots == 0 &&
	     ! g_collTable.set(8,sizeof(collnum_t), 256,NULL,0,
			       false,0,"nhshtbl"))
		return false;

	// sanity
	if ( collnum < 0 ) { char *xx=NULL;*xx=0; }

	// sanity
	int32_t max = m_recPtrBuf.getCapacity() / sizeof(CollectionRec *);

	// set it
	m_recs = (CollectionRec **)m_recPtrBuf.getBufStart();

	// tell spiders to re-upadted the active list
	g_spiderLoop.m_activeListValid = false;
	g_spiderLoop.m_activeListModified = true;

	// a delete?
	if ( ! cr ) {
		// sanity
		if ( collnum >= max ) { char *xx=NULL;*xx=0; }
		// get what's there
		CollectionRec *oc = m_recs[collnum];
		// let it go
		m_recs[collnum] = NULL;
		// if nothing already, done
		if ( ! oc ) return true;
		// tally it up
		m_numRecsUsed--;
		// delete key
		int64_t h64 = hash64n(oc->m_coll);
		// if in the hashtable UNDER OUR COLLNUM then nuke it
		// otherwise, we might be called from resetColl2()
		void *vp = g_collTable.getValue ( &h64 );
		if ( ! vp ) return true;
		collnum_t ct = *(collnum_t *)vp;
		if ( ct != collnum ) return true;
		g_collTable.removeKey ( &h64 );
		return true;
	}

	// ensure m_recs[] is big enough for m_recs[collnum] to be a ptr
	if ( ! growRecPtrBuf ( collnum ) )
		return false;

	// sanity
	if ( cr->m_collnum != collnum ) { char *xx=NULL;*xx=0; }

	// add to hash table to map name to collnum_t
	int64_t h64 = hash64n(cr->m_coll);
	// debug
	//log("coll: adding key %" INT64 " for %s",h64,cr->m_coll);
	if ( ! g_collTable.addKey ( &h64 , &collnum ) )
		return false;

	// ensure last is NULL
	m_recs[collnum] = cr;

	// count it
	m_numRecsUsed++;

	//log("coll: adding key4 %" UINT64 " for coll \"%s\" (%" INT32 ")",h64,cr->m_coll,
	//    (int32_t)i);

	// reserve it
	if ( collnum >= m_numRecs ) m_numRecs = collnum + 1;

	// sanity to make sure collectionrec ptrs are legit
	for ( int32_t j = 0 ; j < m_numRecs ; j++ ) {
		if ( ! m_recs[j] ) continue;
		if ( m_recs[j]->m_collnum == 1 ) continue;
	}

	// update the time
	//updateTime();

	return true;
}

// moves a file by first trying rename, then copying since cross device renaming doesn't work
// returns 0 on success
int mv(char* src, char* dest) {
    int status = rename( src , dest );

    if (status == 0)
        return 0;
    FILE *fsrc, *fdest;
    fsrc = fopen(src, "r");
    if (fsrc == NULL)
        return -1;
    fdest = fopen(dest, "w");
    if (fdest == NULL) {
        fclose(fsrc);
        return -1;
    }

    const int BUF_SIZE = 1024;
    char buf[BUF_SIZE];
    while (!ferror(fdest) && !ferror(fsrc) && !feof(fsrc)) {
        int read = fread(buf, 1, BUF_SIZE, fsrc);
        fwrite(buf, 1, read, fdest);
    }

    fclose(fsrc);
    fclose(fdest);
    if (ferror(fdest) || ferror(fsrc))
        return -1;

    remove(src);
    return 0;
}

// . returns false if we need a re-call, true if we completed
// . returns true with g_errno set on error
bool Collectiondb::resetColl2( collnum_t oldCollnum,
			       collnum_t newCollnum,
			       //WaitEntry *we,
			       bool purgeSeeds){

	// save parms in case we block
	//we->m_purgeSeeds = purgeSeeds;

	// now must be "qatest123" only for now
	//if ( strcmp(coll,"qatest123") ) { char *xx=NULL;*xx=0; }
	// no spiders can be out. they may be referencing the CollectionRec
	// in XmlDoc.cpp... quite likely.
	//if ( g_conf.m_spideringEnabled ||
	//     g_spiderLoop.m_numSpidersOut > 0 ) {
	//	log("admin: Can not delete collection while "
	//	    "spiders are enabled or active.");
	//	return false;
	//}

	// do not allow this if in repair mode
	if ( g_repair.isRepairActive() && g_repair.m_collnum == oldCollnum ) {
		log("admin: Can not delete collection while in repair mode.");
		g_errno = EBADENGINEER;
		return true;
	}

	//log("admin: resetting collnum %" INT32 "",(int32_t)oldCollnum);

	// CAUTION: tree might be in the middle of saving
	// we deal with this in Process.cpp now
	if ( g_process.isAnyTreeSaving() ) {
		// we could not complete...
		return false;
	}

	CollectionRec *cr = m_recs [ oldCollnum ];

	// let's reset crawlinfo crap
	cr->m_globalCrawlInfo.reset();
	cr->m_localCrawlInfo.reset();

	//collnum_t oldCollnum = cr->m_collnum;
	//collnum_t newCollnum = m_numRecs;

	// in case of bulk job, be sure to save list of spots
	// copy existing list to a /tmp, where they will later be transferred back to the new folder
	// now i just store in the root working dir... MDW
	/*
	char oldbulkurlsname[1036];
	snprintf(oldbulkurlsname, 1036, "%scoll.%s.%" INT32 "/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(int32_t)oldCollnum);
	char newbulkurlsname[1036];
	snprintf(newbulkurlsname, 1036, "%scoll.%s.%" INT32 "/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(int32_t)newCollnum);
	char tmpbulkurlsname[1036];
	snprintf(tmpbulkurlsname, 1036, "/tmp/coll.%s.%" INT32 ".bulkurls.txt",cr->m_coll,(int32_t)oldCollnum);

	if (cr->m_isCustomCrawl == 2)
	    mv( oldbulkurlsname , tmpbulkurlsname );
	*/

	// reset spider info
	SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
	if ( sc ) {
		// remove locks from lock table:
		sc->clearLocks();
		// don't do this anymore, just nuke it in case
		// m_populatingDoledb was true etc. there are too many
		// flags to worry about
		//sc->m_collnum = newCollnum;
		//sc->reset();
		// this will put it on "death row" so it will be deleted
		// once Msg5::m_waitingForList/Merge is NULL
		tryToDeleteSpiderColl ( sc,"11" );
		//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
		//delete ( sc );
		cr->m_spiderColl = NULL;
	}

	// reset spider round
	cr->m_spiderRoundNum = 0;
	cr->m_spiderRoundStartTime = 0;

	cr->m_spiderStatus = SP_INITIALIZING; // this is 0
	//cr->m_spiderStatusMsg = NULL;

	// reset seed buf
	if ( purgeSeeds ) {
		// free the buffer of seed urls
		cr->m_diffbotSeeds.purge();
		// reset seed dedup table
		HashTableX *ht = &cr->m_seedHashTable;
		ht->reset();
	}

	// so XmlDoc.cpp can detect if the collection was reset since it
	// launched its spider:
	cr->m_lastResetCount++;


	if ( newCollnum >= m_numRecs ) m_numRecs = (int32_t)newCollnum + 1;

	// advance sanity check. did we wrap around?
	// right now we #define collnum_t int16_t
	if ( m_numRecs > 0x7fff ) { char *xx=NULL;*xx=0; }

	// make a new collnum so records in transit will not be added
	// to any rdb...
	cr->m_collnum = newCollnum;

	// update the timestamps since we are restarting/resetting
	cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
	cr->m_diffbotCrawlEndTime   = 0;


	////////
	//
	// ALTER m_recs[] array
	//
	////////

	// Rdb::resetColl() needs to know the new cr so it can move
	// the RdbBase into cr->m_bases[rdbId] array. recycling.
	setRecPtr ( newCollnum , cr );

	// a new directory then since we changed the collnum
	char dname[512];
	sprintf(dname, "%scoll.%s.%" INT32 "/",
		g_hostdb.m_dir,
		cr->m_coll,
		(int32_t)newCollnum);
	DIR *dir = opendir ( dname );
	if ( dir )
	     closedir ( dir );
	if ( dir ) {
		//g_errno = EEXIST;
		log("admin: Trying to create collection %s but "
		    "directory %s already exists on disk.",cr->m_coll,dname);
	}
	if ( ::mkdir ( dname ,
		       getDirCreationFlags() ) ) {
		       // S_IRUSR | S_IWUSR | S_IXUSR |
		       // S_IRGRP | S_IWGRP | S_IXGRP |
		       // S_IROTH | S_IXOTH ) ) {
		// valgrind?
		//if ( errno == EINTR ) goto retry22;
		//g_errno = errno;
		log("admin: Creating directory %s had error: "
		    "%s.", dname,mstrerror(g_errno));
	}

	// be sure to copy back the bulk urls for bulk jobs
	// MDW: now i just store that file in the root working dir
	//if (cr->m_isCustomCrawl == 2)
	//	mv( tmpbulkurlsname, newbulkurlsname );

	// . unlink all the *.dat and *.map files for this coll in its subdir
	// . remove all recs from this collnum from m_tree/m_buckets
	// . updates RdbBase::m_collnum
	// . so for the tree it just needs to mark the old collnum recs
	//   with a collnum -1 in case it is saving...
	g_posdb.getRdb()->deleteColl     ( oldCollnum , newCollnum );
	g_titledb.getRdb()->deleteColl   ( oldCollnum , newCollnum );
	g_tagdb.getRdb()->deleteColl     ( oldCollnum , newCollnum );
	g_spiderdb.getRdb()->deleteColl  ( oldCollnum , newCollnum );
	g_doledb.getRdb()->deleteColl    ( oldCollnum , newCollnum );
	g_clusterdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
	g_linkdb.getRdb()->deleteColl    ( oldCollnum , newCollnum );

	// reset crawl status too!
	cr->m_spiderStatus = SP_INITIALIZING;

	// . set m_recs[oldCollnum] to NULL and remove from hash table
	// . do after calls to deleteColl() above so it won't crash
	setRecPtr ( oldCollnum , NULL );


	// save coll.conf to new directory
	cr->save();

	// and clear the robots.txt cache in case we recently spidered a
	// robots.txt, we don't want to use it, we want to use the one we
	// have in the test-parser subdir so we are consistent
	//RdbCache *robots = Msg13::getHttpCacheRobots();
	//RdbCache *others = Msg13::getHttpCacheOthers();
	// clear() was removed do to possible corruption
	//robots->clear ( oldCollnum );
	//others->clear ( oldCollnum );

	//g_templateTable.reset();
	//g_templateTable.save( g_hostdb.m_dir , "turkedtemplates.dat" );

	// repopulate CollectionRec::m_sortByDateTable. should be empty
	// since we are resetting here.
	//initSortByDateTable ( coll );

	// done
	return true;
}

// a hack function
bool addCollToTable ( char *coll , collnum_t collnum ) {
	// readd it to the hashtable that maps name to collnum too
	int64_t h64 = hash64n(coll);
	g_collTable.set(8,sizeof(collnum_t), 256,NULL,0,
			false,0,"nhshtbl");
	return g_collTable.addKey ( &h64 , &collnum );
}

// get coll rec specified in the HTTP request
CollectionRec *Collectiondb::getRec ( HttpRequest *r , bool useDefaultRec ) {
	char *coll = r->getString ( "c" );
	if ( coll && ! coll[0] ) coll = NULL;
	// maybe it is crawlbot?
	char *name = NULL;
	char *token = NULL;
	if ( ! coll ) {
		name = r->getString("name");
		token = r->getString("token");
	}
	char tmp[MAX_COLL_LEN+1];
	if ( ! coll && token && name ) {
		snprintf(tmp,MAX_COLL_LEN,"%s-%s",token,name);
		coll = tmp;
	}

	// default to main first
	if ( ! coll && useDefaultRec ) {
		CollectionRec *cr = g_collectiondb.getRec("main");
		if ( cr ) return cr;
	}

	// try next in line
	if ( ! coll && useDefaultRec ) {
		return getFirstRec ();
	}

	// give up?
	if ( ! coll ) return NULL;
	//if ( ! coll || ! coll[0] ) coll = g_conf.m_defaultColl;
	return g_collectiondb.getRec ( coll );
}

char *Collectiondb::getDefaultColl ( HttpRequest *r ) {
	char *coll = r->getString ( "c" );
	if ( coll && ! coll[0] ) coll = NULL;
	if ( coll ) return coll;
	CollectionRec *cr = NULL;
	// default to main first
	if ( ! coll ) {
		cr = g_collectiondb.getRec("main");
		// CAUTION: cr could be deleted so don't trust this ptr
		// if you give up control of the cpu
		if ( cr ) return cr->m_coll;
	}
	// try next in line
	if ( ! coll ) {
		cr = getFirstRec ();
		if ( cr ) return cr->m_coll;
	}
	// give up?
	return NULL;
}


//CollectionRec *Collectiondb::getRec2 ( HttpRequest *r , bool useDefaultRec) {
//	char *coll = getDefaultColl();
//	return g_collectiondb.getRec(coll);
//}

// . get collectionRec from name
// . returns NULL if not available
CollectionRec *Collectiondb::getRec ( char *coll ) {
	if ( ! coll ) coll = "";
	return getRec ( coll , gbstrlen(coll) );
}

CollectionRec *Collectiondb::getRec ( char *coll , int32_t collLen ) {
	if ( ! coll ) coll = "";
	collnum_t collnum = getCollnum ( coll , collLen );
	if ( collnum < 0 ) return NULL;
	return m_recs [ (int32_t)collnum ];
}

CollectionRec *Collectiondb::getRec ( collnum_t collnum) {
	if ( collnum >= m_numRecs || collnum < 0 ) {
		// Rdb::resetBase() gets here, so don't always log.
		// it is called from CollectionRec::reset() which is called
		// from the CollectionRec constructor and ::load() so
		// it won't have anything in rdb at that time
		//log("colldb: collnum %" INT32 " > numrecs = %" INT32 "",
		//    (int32_t)collnum,(int32_t)m_numRecs);
		return NULL;
	}
	return m_recs[collnum];
}


//CollectionRec *Collectiondb::getDefaultRec ( ) {
//	if ( ! g_conf.m_defaultColl[0] ) return NULL; // no default?
//	collnum_t collnum = getCollnum ( g_conf.m_defaultColl );
//	if ( collnum < (collnum_t)0 ) return NULL;
//	return m_recs[(int32_t)collnum];
//}

CollectionRec *Collectiondb::getFirstRec ( ) {
	for ( int32_t i = 0 ; i < m_numRecs ; i++ )
		if ( m_recs[i] ) return m_recs[i];
	return NULL;
}

collnum_t Collectiondb::getFirstCollnum ( ) {
	for ( int32_t i = 0 ; i < m_numRecs ; i++ )
		if ( m_recs[i] ) return i;
	return (collnum_t)-1;
}

char *Collectiondb::getFirstCollName ( ) {
	for ( int32_t i = 0 ; i < m_numRecs ; i++ )
		if ( m_recs[i] ) return m_recs[i]->m_coll;
	return NULL;
}

char *Collectiondb::getCollName ( collnum_t collnum ) {
	if ( collnum < 0 || collnum > m_numRecs ) return NULL;
	if ( ! m_recs[(int32_t)collnum] ) return NULL;
	return m_recs[collnum]->m_coll;
}

collnum_t Collectiondb::getCollnum ( char *coll ) {

	int32_t clen = 0;
	if ( coll ) clen = gbstrlen(coll );
	return getCollnum ( coll , clen );
	/*
	//if ( ! coll ) coll = "";

	// default empty collection names
	if ( coll && ! coll[0] ) coll = NULL;
	if ( ! coll ) coll = g_conf.m_defaultColl;
	if ( ! coll || ! coll[0] ) coll = "main";


	// This is necessary for Statsdb to work, as it is
	// not associated with any collection. Is this
	// necessary for Catdb?
	if ( coll[0]=='s' && coll[1] =='t' &&
	     strcmp ( "statsdb\0", coll ) == 0)
		return 0;
	if ( coll[0]=='f' && coll[1]=='a' &&
	     strcmp ( "facebookdb\0", coll ) == 0)
		return 0;
	if ( coll[0]=='a' && coll[1]=='c' &&
	     strcmp ( "accessdb\0", coll ) == 0)
		return 0;

	// because diffbot may have thousands of crawls/collections
	// let's improve the speed here. try hashing it...
	int64_t h64 = hash64n(coll);
	void *vp = g_collTable.getValue ( &h64 );
	if ( ! vp ) return -1; // not found
	return *(collnum_t *)vp;
	*/
	/*
	for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
		if ( ! m_recs[i] ) continue;
		if ( m_recs[i]->m_coll[0] != coll[0] ) continue;
		if ( strcmp ( m_recs[i]->m_coll , coll ) == 0 ) return i;
	}
	//if ( strcmp ( "catdb\0", coll ) == 0) return 0;
	return (collnum_t)-1; // not found
	*/
}

collnum_t Collectiondb::getCollnum ( char *coll , int32_t clen ) {

	// default empty collection names
	if ( coll && ! coll[0] ) coll = NULL;
	if ( ! coll ) {
		coll = g_conf.m_defaultColl;
		if ( coll ) clen = gbstrlen(coll);
		else clen = 0;
	}
	if ( ! coll || ! coll[0] ) {
		coll = "main";
		clen = gbstrlen(coll);
	}

	// This is necessary for Statsdb to work, as it is
	//if ( ! coll ) coll = "";

	// not associated with any collection. Is this
	// necessary for Catdb?
	if ( coll[0]=='s' && coll[1] =='t' &&
	     strcmp ( "statsdb\0", coll ) == 0)
		return 0;
	if ( coll[0]=='f' && coll[1]=='f' &&
	     strcmp ( "facebookdb\0", coll ) == 0)
		return 0;
	if ( coll[0]=='a' && coll[1]=='c' &&
	     strcmp ( "accessdb\0", coll ) == 0)
		return 0;


	// because diffbot may have thousands of crawls/collections
	// let's improve the speed here. try hashing it...
	int64_t h64 = hash64(coll,clen);
	void *vp = g_collTable.getValue ( &h64 );
	if ( ! vp ) return -1; // not found
	return *(collnum_t *)vp;

	/*
	for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
		if ( ! m_recs[i] ) continue;
		if ( m_recs[i]->m_collLen != clen ) continue;
		if ( strncmp(m_recs[i]->m_coll,coll,clen) == 0 ) return i;
	}

	//if ( strncmp ( "catdb\0", coll, clen ) == 0) return 0;
	return (collnum_t)-1; // not found
	*/
}

//collnum_t Collectiondb::getNextCollnum ( collnum_t collnum ) {
//	for ( int32_t i = (int32_t)collnum + 1 ; i < m_numRecs ; i++ )
//		if ( m_recs[i] ) return i;
//	// no next one, use -1
//	return (collnum_t) -1;
//}

// what collnum will be used the next time a coll is added?
collnum_t Collectiondb::reserveCollNum ( ) {

	if ( m_numRecs < 0x7fff ) {
		collnum_t next = m_numRecs;
		// make the ptr NULL at least to accommodate the
		// loop that scan up to m_numRecs lest we core
		growRecPtrBuf ( next );
		m_numRecs++;
		return next;
	}

	// collnum_t is signed right now because we use -1 to indicate a
	// bad collnum.
	int32_t scanned = 0;
	// search for an empty slot
	for ( int32_t i = m_wrapped ; ; i++ ) {
		// because collnum_t is 2 bytes, signed, limit this here
		if ( i > 0x7fff ) i = 0;
		// how can this happen?
		if ( i < 0      ) i = 0;
		// if we scanned the max # of recs we could have, we are done
		if ( ++scanned >= m_numRecs ) break;
		// skip if this is in use
		if ( m_recs[i] ) continue;
		// start after this one next time
		m_wrapped = i+1;
		// note it
		log("colldb: returning wrapped collnum "
		    "of %" INT32 "",(int32_t)i);
		return (collnum_t)i;
	}

	log("colldb: no new collnum available. consider upping collnum_t");
	// none available!!
	return -1;
}


///////////////
//
// COLLECTIONREC
//
///////////////

#include "gb-include.h"

//#include "CollectionRec.h"
//#include "Collectiondb.h"
#include "HttpServer.h"     // printColors2()
#include "Msg5.h"
#include "Threads.h"
#include "Datedb.h"
#include "Timedb.h"
#include "Spider.h"
#include "Process.h"

static CollectionRec g_default;


CollectionRec::CollectionRec() {
	m_nextLink = NULL;
	m_prevLink = NULL;
	m_spiderCorruptCount = 0;
	m_collnum = -1;
	m_coll[0] = '\0';
	m_updateRoundNum = 0;
	m_swappedOut = false;
	//m_numSearchPwds = 0;
	//m_numBanIps     = 0;
	//m_numSearchIps  = 0;
	//m_numSpamIps    = 0;
	//m_numAdminPwds  = 0;
	//m_numAdminIps   = 0;
	memset ( m_bases , 0 , sizeof(RdbBase *)*RDB_END );
	// how many keys in the tree of each rdb? we now store this stuff
	// here and not in RdbTree.cpp because we no longer have a maximum
	// # of collection recs... MAX_COLLS. each is a 32-bit "int32_t" so
	// it is 4 * RDB_END...
	memset ( m_numNegKeysInTree , 0 , 4*RDB_END );
	memset ( m_numPosKeysInTree , 0 , 4*RDB_END );
	m_spiderColl = NULL;
	m_overflow  = 0x12345678;
	m_overflow2 = 0x12345678;
	// the spiders are currently uninhibited i guess
	m_spiderStatus = SP_INITIALIZING; // this is 0
	//m_spiderStatusMsg = NULL;
	// for Url::getSite()
	m_updateSiteRulesTable = 1;
	//m_lastUpdateTime = 0LL;
	m_clickNScrollEnabled = false;
	// inits for sortbydatetable
	m_inProgress = false;
	m_msg5       = NULL;
	m_importState = NULL;
	// JAB - track which regex parsers have been initialized
	//log(LOG_DEBUG,"regex: %p initializing empty parsers", m_pRegExParser);

	// clear these out so Parms::calcChecksum can work:
	memset( m_spiderFreqs, 0, MAX_FILTERS*sizeof(*m_spiderFreqs) );
	//for ( int i = 0; i < MAX_FILTERS ; i++ )
	//	m_spiderQuotas[i] = -1;
	memset( m_spiderPriorities, 0,
		MAX_FILTERS*sizeof(*m_spiderPriorities) );
	memset ( m_harvestLinks,0,MAX_FILTERS);
	memset ( m_forceDelete,0,MAX_FILTERS);
	//memset( m_rulesets, 0, MAX_FILTERS*sizeof(*m_rulesets) );
	//for ( int i = 0; i < MAX_SEARCH_PASSWORDS; i++ ) {
	//	*(m_searchPwds[i]) = '\0';
	//}
	//for ( int i = 0; i < MAX_ADMIN_PASSWORDS; i++ ) {
	//	*(m_adminPwds[i]) = '\0';
	//}
	//memset( m_banIps, 0, MAX_BANNED_IPS*sizeof(*m_banIps) );
	//memset( m_searchIps, 0, MAX_SEARCH_IPS*sizeof(*m_searchIps) );
	//memset( m_spamIps, 0, MAX_SPAM_IPS*sizeof(*m_spamIps) );
	//memset( m_adminIps, 0, MAX_ADMIN_IPS*sizeof(*m_adminIps) );

	//for ( int i = 0; i < MAX_FILTERS; i++ ) {
	//	//m_pRegExParser[i] = NULL;
	//	*(m_regExs[i]) = '\0';
	//}
	m_numRegExs = 0;

	//m_requests = 0;
	//m_replies = 0;
	//m_doingCallbacks = false;

	m_lastResetCount = 0;

	// regex_t types
	m_hasucr = false;
	m_hasupr = false;

	// for diffbot caching the global spider stats
	reset();

	// add default reg ex if we do not have one
	//setUrlFiltersToDefaults();
	//rebuildUrlFilters();
}

CollectionRec::~CollectionRec() {
	//invalidateRegEx ();
        reset();
}

// new collection recs get this called on them
void CollectionRec::setToDefaults ( ) {
	g_parms.setFromFile ( this , NULL , NULL  , OBJ_COLL );
	// add default reg ex
	//setUrlFiltersToDefaults();
	rebuildUrlFilters();
}

void CollectionRec::reset() {

	//log("coll: resetting collnum=%" INT32 "",(int32_t)m_collnum);

	// . grows dynamically
	// . setting to 0 buckets should never have error
	//m_pageCountTable.set ( 4,4,0,NULL,0,false,MAX_NICENESS,"pctbl" );

	// regex_t types
	if ( m_hasucr ) regfree ( &m_ucr );
	if ( m_hasupr ) regfree ( &m_upr );

	m_hasucr = false;
	m_hasupr = false;

	m_sendingAlertInProgress = false;

	// make sure we do not leave spiders "hanging" waiting for their
	// callback to be called... and it never gets called
	//if ( m_callbackQueue.length() > 0 ) { char *xx=NULL;*xx=0; }
	//if ( m_doingCallbacks ) { char *xx=NULL;*xx=0; }
	//if ( m_replies != m_requests  ) { char *xx=NULL;*xx=0; }
	m_localCrawlInfo.reset();
	m_globalCrawlInfo.reset();
	//m_requests = 0;
	//m_replies = 0;
	// free all RdbBases in each rdb
	for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
	     Rdb *rdb = g_process.m_rdbs[i];
	     rdb->resetBase ( m_collnum );
	}

	for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
		RdbBase *base = m_bases[i];
		if ( ! base ) continue;
		mdelete (base, sizeof(RdbBase), "Rdb Coll");
		delete  (base);
	}

	SpiderColl *sc = m_spiderColl;
	// debug hack thing
	//if ( sc == (SpiderColl *)0x8888 ) return;
	// if never made one, we are done
	if ( ! sc ) return;

	// spider coll also!
	sc->m_deleteMyself = true;

	// if not currently being accessed nuke it now
	tryToDeleteSpiderColl ( sc ,"12");

	// if ( ! sc->m_msg5.m_waitingForList &&
	//      ! sc->m_msg5b.m_waitingForList &&
	//      ! sc->m_msg1.m_mcast.m_inUse ) {
	// 	mdelete ( sc, sizeof(SpiderColl),"nukecr2");
	// 	delete ( sc );
	// }
}

CollectionRec *g_cr = NULL;

// . load this data from a conf file
// . values we do not explicitly have will be taken from "default",
//   collection config file. if it does not have them then we use
//   the value we received from call to setToDefaults()
// . returns false and sets g_errno on load error
bool CollectionRec::load ( char *coll , int32_t i ) {
	// also reset some counts not included in parms list
	reset();
	// before we load, set to defaults in case some are not in xml file
	g_parms.setToDefault ( (char *)this , OBJ_COLL , this );
	// get the filename with that id
	File f;
	char tmp2[1024];
	sprintf ( tmp2 , "%scoll.%s.%" INT32 "/coll.conf", g_hostdb.m_dir , coll,i);
	f.set ( tmp2 );
	if ( ! f.doesExist () ) return log("admin: %s does not exist.",tmp2);
	// set our collection number
	m_collnum = i;
	// set our collection name
	m_collLen = gbstrlen ( coll );
	strcpy ( m_coll , coll );

	if ( ! g_conf.m_doingCommandLine )
		log(LOG_INFO,"db: Loading conf for collection %s (%" INT32 ")",coll,
		    (int32_t)m_collnum);

	// collection name HACK for backwards compatibility
	//if ( strcmp ( coll , "main" ) == 0 ) {
	//	m_coll[0] = '\0';
	//	m_collLen = 0;
	//}

	// the default conf file
	char tmp1[1024];
	snprintf ( tmp1 , 1023, "%sdefault.conf" , g_hostdb.m_dir );

	// . set our parms from the file.
	// . accepts OBJ_COLLECTIONREC or OBJ_CONF
	g_parms.setFromFile ( this , tmp2 , tmp1 , OBJ_COLL );

	// add default reg ex IFF there are no url filters there now
	//if(m_numRegExs == 0) rebuildUrlFilters();//setUrlFiltersToDefaults();

	// this only rebuild them if necessary
	rebuildUrlFilters();//setUrlFiltersToDefaults();

	// temp check
	//testRegex();

	//
	// LOAD the crawlinfo class in the collectionrec for diffbot
	//
	// LOAD LOCAL
	snprintf ( tmp1 , 1023, "%scoll.%s.%" INT32 "/localcrawlinfo.dat",
		  g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
	log(LOG_DEBUG,"db: Loading %s",tmp1);
	m_localCrawlInfo.reset();
	SafeBuf sb;
	// fillfromfile returns 0 if does not exist, -1 on read error
	if ( sb.fillFromFile ( tmp1 ) > 0 )
		//m_localCrawlInfo.setFromSafeBuf(&sb);
		// it is binary now
		gbmemcpy ( &m_localCrawlInfo , sb.getBufStart(),sb.length() );

	// if it had corrupted data from saving corrupted mem zero it out
	CrawlInfo *stats = &m_localCrawlInfo;
	// point to the stats for that host
	int64_t *ss = (int64_t *)stats;
	// are stats crazy?
	bool crazy = false;
	for ( int32_t j = 0 ; j < NUMCRAWLSTATS ; j++ ) {
		// crazy stat?
		if ( *ss > 1000000000LL ||
		     *ss < -1000000000LL ) {
			crazy = true;
			break;
		}
		ss++;
	}
	if ( m_localCrawlInfo.m_collnum != m_collnum )
		crazy = true;
	if ( crazy ) {
		log("coll: had crazy spider stats for coll %s. zeroing out.",
		    m_coll);
		m_localCrawlInfo.reset();
	}


	if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
		log("coll: Loaded %s (%" INT32 ") local hasurlsready=%" INT32 "",
		    m_coll,
		    (int32_t)m_collnum,
		    (int32_t)m_localCrawlInfo.m_hasUrlsReadyToSpider);


	// we introduced the this round counts, so don't start them at 0!!
	if ( m_spiderRoundNum == 0 &&
	     m_localCrawlInfo.m_pageDownloadSuccessesThisRound <
	     m_localCrawlInfo.m_pageDownloadSuccesses ) {
		log("coll: fixing process count this round for %s",m_coll);
		m_localCrawlInfo.m_pageDownloadSuccessesThisRound =
			m_localCrawlInfo.m_pageDownloadSuccesses;
	}

	// we introduced the this round counts, so don't start them at 0!!
	if ( m_spiderRoundNum == 0 &&
	     m_localCrawlInfo.m_pageProcessSuccessesThisRound <
	     m_localCrawlInfo.m_pageProcessSuccesses ) {
		log("coll: fixing process count this round for %s",m_coll);
		m_localCrawlInfo.m_pageProcessSuccessesThisRound =
			m_localCrawlInfo.m_pageProcessSuccesses;
	}

	// fix from old bug that was fixed
	//if ( m_spiderRoundNum == 0 &&
	//     m_collectiveRespiderFrequency > 0.0 &&
	//     m_localCrawlInfo.m_sentCrawlDoneAlert ) {
	//	log("coll: bug fix: resending email alert for coll %s (%" INT32 ") "
	//	    "of respider freq %f",m_coll,(int32_t)m_collnum,
	//	    m_collectiveRespiderFrequency);
	//	m_localCrawlInfo.m_sentCrawlDoneAlert = false;
	//}


	// LOAD GLOBAL
	snprintf ( tmp1 , 1023, "%scoll.%s.%" INT32 "/globalcrawlinfo.dat",
		  g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
	log(LOG_DEBUG,"db: Loading %s",tmp1);
	m_globalCrawlInfo.reset();
	sb.reset();
	if ( sb.fillFromFile ( tmp1 ) > 0 )
		//m_globalCrawlInfo.setFromSafeBuf(&sb);
		// it is binary now
		gbmemcpy ( &m_globalCrawlInfo , sb.getBufStart(),sb.length() );

	if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
		log("coll: Loaded %s (%" INT32 ") global hasurlsready=%" INT32 "",
		    m_coll,
		    (int32_t)m_collnum,
		    (int32_t)m_globalCrawlInfo.m_hasUrlsReadyToSpider);

	// the list of ip addresses that we have detected as being throttled
	// and therefore backoff and use proxies for
	if ( ! g_conf.m_doingCommandLine ) {
		sb.reset();
		sb.safePrintf("%scoll.%s.%" INT32 "/",
			      g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
		m_twitchyTable.m_allocName = "twittbl";
		m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
	}


	////////////
	//
	// PAGE COUNT TABLE for doing quotas in url filters
	//
	/////////////
	// log it up if there on disk
	//snprintf ( tmp1 , 1023, "/coll.%s.%" INT32 "/pagecounts.dat",
	//	   m_coll , (int32_t)m_collnum );
	//if ( ! m_pageCountTable.load ( g_hostdb.m_dir , tmp1 ) && g_errno )
	//	log("db: failed to load page count table: %s",
	//	    mstrerror(g_errno));

	// ignore errors i guess
	g_errno = 0;


	// fix for diffbot, spider time deduping
	if ( m_isCustomCrawl ) m_dedupingEnabled = true;

	// always turn off gigabits so &s=1000 can do summary skipping
	if ( m_isCustomCrawl ) m_docsToScanForTopics = 0;

	// make min to merge smaller than normal since most collections are
	// small and we want to reduce the # of vfds (files) we have
	if ( m_isCustomCrawl ) {
		m_posdbMinFilesToMerge   = 6;
		m_titledbMinFilesToMerge = 4;
		m_linkdbMinFilesToMerge  = 3;
		m_tagdbMinFilesToMerge   = 2;
	}

	// always turn on distributed spider locking because otherwise
	// we end up calling Msg50 which calls Msg25 for the same root url
	// at the same time, thereby wasting massive resources. it is also
	// dangerous to run without this because webmaster get pissed when
	// we slam their servers.
	// This is now deprecated...
	//m_useSpiderLocks      = false;
	// and all pages downloaded from a particular ip should be done
	// by the same host in our cluster to prevent webmaster rage
	//m_distributeSpiderGet = true;

	//initSortByDateTable(m_coll);

	return true;
}

/*
bool CollectionRec::countEvents ( ) {
	// set our m_numEventsOnHost value
	log("coll: loading event count termlist gbeventcount");
	// temporarily turn off threads
	bool enabled = g_threads.areThreadsEnabled();
	g_threads.disableThreads();
	// count them
	m_numEventsOnHost = 0;
	// 1MB at a time
	int32_t minRecSizes = 1000000;
	// look up this termlist, gbeventcount which we index in XmlDoc.cpp
	int64_t termId = hash64n("gbeventcount") & TERMID_MASK;
	// make datedb key from it
	key128_t startKey = g_datedb.makeStartKey ( termId , 0xffffffff );
	key128_t endKey   = g_datedb.makeEndKey ( termId , 0 );

	Msg5 msg5;
	RdbList list;

	// . init m_numEventsOnHost by getting the exact length of that
	//   termlist on this host
	// . send in the ping request packet so all hosts can total up
	// . Rdb.cpp should be added to incrementally so we should have no
	//   double positives.
	// . Rdb.cpp should inspect each datedb rec for this termid in
	//   a fast an efficient manner
 loop:
	// use msg5 to get the list, should ALWAYS block since no threads
	if ( ! msg5.getList ( RDB_DATEDB    ,
			      m_coll        ,
			      &list         ,
			      (char *)&startKey      ,
			      (char *)&endKey        ,
			      minRecSizes   ,
			      true          , // includeTree   ,
			      false         , // add to cache?
			      0             , // max cache age
			      0             , // startFileNum  ,
			      -1            , // numFiles      ,
			      NULL          , // state
			      NULL          , // callback
			      0             , // niceness
			      false         , // err correction?
			      NULL          , // cache key ptr
			      0             , // retry num
			      -1            , // maxRetries
			      true          , // compensate for merge
			      -1LL          , // sync point
			      NULL          )){// msg5b
		// not allowed to block!
		char *xx=NULL;*xx=0; }
	// scan the list, score is how many valid events from that docid
	uint32_t total = 0;
	for ( ; ! list.isExhausted() ; list.skipCurrentRec() ) {
		unsigned char *rec = (unsigned char *)list.getCurrentRec();
		// in datedb score is byte #5
		total += (255-rec[5]);
	}
	// declare
	char    *lastKeyPtr;
	key128_t  newStartKey;
	// add to count. datedb uses half keys so subtract 6 bytes
	// since the termids will be the same...
	//m_numEventsOnHost += list.getListSize() / (sizeof(key128_t)-6);
	m_numEventsOnHost += total;
	// bail if under limit
	if ( list.getListSize() < minRecSizes ) goto done;
	// update key
	lastKeyPtr = list.m_listEnd - 10;
	// we make a new start key
	list.getKey ( lastKeyPtr , (char *)&newStartKey );
	// maxxed out?
	if ( newStartKey.n0==0xffffffffffffffffLL &&
	     newStartKey.n1==0xffffffffffffffffLL )
		goto done;
	// sanity check
	if ( newStartKey < startKey ) { char *xx=NULL;*xx=0; }
	if ( newStartKey > endKey   ) { char *xx=NULL;*xx=0; }
	// inc it
	newStartKey.n0++;
	// in the top if the bottom wrapped
	if ( newStartKey.n0 == 0LL ) newStartKey.n1++;
	// assign
	startKey = newStartKey;
	// and loop back up for more now
	goto loop;

 done:

	// update all colls count
	g_collectiondb.m_numEventsAllColls += m_numEventsOnHost;

	if ( enabled ) g_threads.enableThreads();
	log("coll: got %" INT32 " local events in termlist",m_numEventsOnHost);

	// set "m_hasDocQualityFiler"
	//updateFilters();

	return true;
}
*/

bool CollectionRec::rebuildUrlFilters2 ( ) {

	// tell spider loop to update active list
	g_spiderLoop.m_activeListValid = false;


	bool rebuild = true;
	if ( m_numRegExs == 0 )
		rebuild = true;
	// don't touch it if not supposed to as int32_t as we have some already
	//if ( m_urlFiltersProfile != UFP_NONE )
	//	rebuild = true;
	// never for custom crawls however
	if ( m_isCustomCrawl )
		rebuild = false;

	char *s = m_urlFiltersProfile.getBufStart();


	// support the old UFP_CUSTOM, etc. numeric values
	if ( !strcmp(s,"0" ) )
		s = "custom";
	// UFP_WEB SUPPORT
	if ( !strcmp(s,"1" ) )
		s = "web";
	// UFP_NEWS
	if ( !strcmp(s,"2" ) )
		s = "shallow";


	// leave custom profiles alone
	if ( !strcmp(s,"custom" ) )
		rebuild = false;


	//if ( m_numRegExs > 0 && strcmp(m_regExs[m_numRegExs-1],"default") )
	//	addDefault = true;
	if ( ! rebuild ) return true;


	if ( !strcmp(s,"shallow" ) )
		return rebuildShallowRules();

	//if ( strcmp(s,"web") )
	// just fall through for that


	if ( !strcmp(s,"english") )
		return rebuildLangRules( "en","com,us,gov");

	if ( !strcmp(s,"german") )
		return rebuildLangRules( "de","de");

	if ( !strcmp(s,"french") )
		return rebuildLangRules( "fr","fr");

	if ( !strcmp(s,"norwegian") )
		return rebuildLangRules( "nl","nl");

	if ( !strcmp(s,"spanish") )
		return rebuildLangRules( "es","es");

	//if ( m_urlFiltersProfile == UFP_EURO )
	//	return rebuildLangRules( "de,fr,nl,es,sv,no,it",
	//				 "com,gov,org,de,fr,nl,es,sv,no,it");


	if ( !strcmp(s,"romantic") )
		return rebuildLangRules("en,de,fr,nl,es,sv,no,it,fi,pt",

					"de,fr,nl,es,sv,no,it,fi,pt,"

					"com,gov,org"
					);

	if ( !strcmp(s,"chinese") )
		return rebuildLangRules( "zh_cn,zh_tw","cn");


	int32_t n = 0;

	/*
	m_regExs[n].set("default");
	m_regExs[n].nullTerm();
	m_spiderFreqs     [n] = 30; // 30 days default
	m_spiderPriorities[n] = 0;
	m_maxSpidersPerRule[n] = 99;
	m_spiderIpWaits[n] = 1000;
	m_spiderIpMaxSpiders[n] = 7;
	m_harvestLinks[n] = 1;
	*/

	// max spiders per ip
	int32_t ipms = 7;

	m_regExs[n].set("isreindex");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 0; // 30 days default
	m_maxSpidersPerRule  [n] = 99; // max spiders
	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 80;
	n++;

	m_regExs[n].set("ismedia");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 0; // 30 days default
	m_maxSpidersPerRule  [n] = 99; // max spiders
	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 100; // delete!
	m_forceDelete        [n] = 1;
	n++;

	// if not in the site list then nuke it
	m_regExs[n].set("!ismanualadd && !insitelist");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 0; // 30 days default
	m_maxSpidersPerRule  [n] = 99; // max spiders
	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 100;
	m_forceDelete        [n] = 1;
	n++;

	m_regExs[n].set("errorcount>=3 && hastmperror");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 1; // 30 days default
	m_maxSpidersPerRule  [n] = 1; // max spiders
	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 100;
	m_forceDelete        [n] = 1;
	n++;

	m_regExs[n].set("errorcount>=1 && hastmperror");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 1; // 30 days default
	m_maxSpidersPerRule  [n] = 1; // max spiders
	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 45;
	if ( ! strcmp(s,"news") )
		m_spiderFreqs [n] = .00347; // 5 mins
	n++;

	// a non temporary error, like a 404? retry once per 3 months i guess
	m_regExs[n].set("errorcount>=1");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 5; // 5 day retry
	m_maxSpidersPerRule  [n] = 1; // max spiders
	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 2;
	m_forceDelete        [n] = 1;
	n++;

	m_regExs[n].set("isaddurl");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7; // 30 days default
	m_maxSpidersPerRule  [n] = 99; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 85;
	if ( ! strcmp(s,"news") )
		m_spiderFreqs [n] = .00347; // 5 mins
	n++;

	// 20+ unique c block parent request urls means it is important!
	m_regExs[n].set("numinlinks>7 && isnew");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7; // 30 days default
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 52;
	if ( ! strcmp(s,"news") )
		m_spiderFreqs [n] = .00347; // 5 mins
	n++;

	// 20+ unique c block parent request urls means it is important!
	m_regExs[n].set("numinlinks>7");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7; // 30 days default
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 51;
	if ( ! strcmp(s,"news") )
		m_spiderFreqs [n] = .00347; // 5 mins
	n++;


	m_regExs[n].set("hopcount==0 && iswww && isnew");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7; // 30 days default
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 50;
	if ( ! strcmp(s,"news") )
		m_spiderFreqs [n] = .00347; // 5 mins
	n++;

	m_regExs[n].set("hopcount==0 && iswww");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7.0; // days b4 respider
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 48;
	if ( ! strcmp(s,"news") )
		m_spiderFreqs [n] = .00347; // 5 mins
	n++;

	m_regExs[n].set("hopcount==0 && isnew");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 49;
	if ( ! strcmp(s,"news") )
		m_spiderFreqs [n] = .00347; // 5 mins
	n++;

	m_regExs[n].set("hopcount==0");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 10.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 47;
	if ( ! strcmp(s,"news") )
		m_spiderFreqs [n] = .00347; // 5 mins
	n++;


	m_regExs[n].set("isparentrss && isnew");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7; // 30 days default
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 45;
	if ( ! strcmp(s,"news") )
		m_spiderFreqs [n] = .00347; // 5 mins
	n++;

	m_regExs[n].set("isparentsitemap && isnew");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7; // 30 days default
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 44;
	if ( ! strcmp(s,"news") )
		m_spiderFreqs [n] = .00347; // 5 mins
	n++;


	m_regExs[n].set("isparentrss");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 20.0; // 30 days default
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 43;
	if ( ! strcmp(s,"news") )
		m_spiderFreqs [n] = .00347; // 5 mins
	n++;

	m_regExs[n].set("isparentsitemap");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 20.0; // 30 days default
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 42;
	if ( ! strcmp(s,"news") )
		m_spiderFreqs [n] = .00347; // 5 mins
	n++;


	m_regExs[n].set("hopcount==1 && isnew");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 20.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 40;
	if ( ! strcmp(s,"news") )
		m_spiderFreqs [n] = .04166; // 60 minutes
	n++;

	m_regExs[n].set("hopcount==1");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 20.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 39;
	if ( ! strcmp(s,"news") )
		m_spiderFreqs [n] = .04166; // 60 minutes
	n++;

	m_regExs[n].set("hopcount==2 && isnew");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 40;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 30;
	// do not harvest links if we are spiderings NEWS
	if ( ! strcmp(s,"news") ) {
		m_spiderFreqs  [n] = 5.0;
		m_harvestLinks [n] = 0;
	}
	n++;

	m_regExs[n].set("hopcount==2");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 40;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 29;
	// do not harvest links if we are spiderings NEWS
	if ( ! strcmp(s,"news") ) {
		m_spiderFreqs  [n] = 5.0;
		m_harvestLinks [n] = 0;
	}
	n++;

	m_regExs[n].set("hopcount>=3 && isnew");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 60;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 20;
	// turn off spidering if hopcount is too big and we are spiderings NEWS
	if ( ! strcmp(s,"news") ) {
		m_maxSpidersPerRule [n] = 0;
		m_harvestLinks      [n] = 0;
	}
	else {
		n++;
	}

	m_regExs[n].set("hopcount>=3");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 60;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 19;
	// turn off spidering if hopcount is too big and we are spiderings NEWS
	if ( ! strcmp(s,"news") ) {
		m_maxSpidersPerRule [n] = 0;
		m_harvestLinks      [n] = 0;
	}
	else {
		n++;
	}

	/*
	m_regExs[n].set("isnew");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = resp4;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 2;
	n++;
	*/

	m_regExs[n].set("default");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 60;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 1;
	if ( ! strcmp(s,"news") ) {
		m_maxSpidersPerRule [n] = 0;
		m_harvestLinks      [n] = 0;
	}
	n++;


	m_numRegExs   = n;
	m_numRegExs2  = n;
	m_numRegExs3  = n;
	m_numRegExs10 = n;
	m_numRegExs5  = n;
	m_numRegExs6  = n;
	m_numRegExs8  = n;
	m_numRegExs7  = n;

	// more rules


	//m_spiderDiffbotApiNum[n] = 1;
	//m_numRegExs11++;
	//m_spiderDiffbotApiUrl[n].set("");
	//m_spiderDiffbotApiUrl[n].nullTerm();
	//m_numRegExs11++;

	return true;
}


bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {

	// max spiders per ip
	int32_t ipms = 7;

	int32_t n = 0;

	m_regExs[n].set("isreindex");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 0; // 30 days default
	m_maxSpidersPerRule  [n] = 99; // max spiders
	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 80;
	n++;

	m_regExs[n].set("ismedia");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 0; // 30 days default
	m_maxSpidersPerRule  [n] = 99; // max spiders
	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 100; // delete!
	m_forceDelete        [n] = 1;
	n++;

	// if not in the site list then nuke it
	m_regExs[n].set("!ismanualadd && !insitelist");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 0; // 30 days default
	m_maxSpidersPerRule  [n] = 99; // max spiders
	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 100; // delete!
	m_forceDelete        [n] = 1;
	n++;

	m_regExs[n].set("errorcount>=3 && hastmperror");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 1; // 30 days default
	m_maxSpidersPerRule  [n] = 1; // max spiders
	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 100;
	m_forceDelete        [n] = 1;
	n++;

	m_regExs[n].set("errorcount>=1 && hastmperror");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 1; // 30 days default
	m_maxSpidersPerRule  [n] = 1; // max spiders
	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 45;
	n++;

	m_regExs[n].set("isaddurl");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7; // 30 days default
	m_maxSpidersPerRule  [n] = 99; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 85;
	n++;

	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount==0 && iswww && isnew && tld==%s",
			       tldStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7; // 30 days default
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 50;
	n++;

	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount==0 && iswww && isnew && "
			       "parentlang==%s,xx"
			       ,langStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7; // 30 days default
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 50;
	n++;

	// m_regExs[n].set("hopcount==0 && iswww && isnew");
	// m_harvestLinks       [n] = 1;
	// m_spiderFreqs        [n] = 7; // 30 days default
	// m_maxSpidersPerRule  [n] = 9; // max spiders
	// m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	// m_spiderIpWaits      [n] = 1000; // same ip wait
	// m_spiderPriorities   [n] = 20;
	// n++;


	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount==0 && iswww && tld==%s",tldStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7.0; // days b4 respider
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 48;
	n++;

	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount==0 && iswww && parentlang==%s,xx",
			       langStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7.0; // days b4 respider
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 48;
	n++;

	m_regExs[n].set("hopcount==0 && iswww");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7.0; // days b4 respider
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 19;
	n++;


	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount==0 && isnew && tld==%s",tldStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 49;
	n++;

	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount==0 && isnew && parentlang==%s,xx",
			       langStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 49;
	n++;

	m_regExs[n].set("hopcount==0 && isnew");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 18;
	n++;


	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount==0 && tld==%s",tldStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 10.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 47;
	n++;

	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount==0 && parentlang==%s,xx",langStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 10.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 47;
	n++;

	m_regExs[n].set("hopcount==0");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 10.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 17;
	n++;


	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount==1 && isnew && tld==%s",tldStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 20.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 40;
	n++;

	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount==1 && isnew && parentlang==%s,xx",
			       tldStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 20.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 40;
	n++;

	m_regExs[n].set("hopcount==1 && isnew");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 20.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 16;
	n++;


	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount==1 && tld==%s",tldStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 20.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 39;
	n++;

	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount==1 && parentlang==%s,xx",langStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 20.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 39;
	n++;

	m_regExs[n].set("hopcount==1");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 20.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 15;
	n++;


	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount==2 && isnew && tld==%s",tldStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 40;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 30;
	n++;

	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount==2 && isnew && parentlang==%s,xx",
			       langStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 40;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 30;
	n++;

	m_regExs[n].set("hopcount==2 && isnew");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 40;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 14;
	n++;


	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount==2 && tld==%s",tldStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 40;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 29;
	n++;

	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount==2 && parentlang==%s,xx",langStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 40;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 29;
	n++;

	m_regExs[n].set("hopcount==2");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 40;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 13;
	n++;


	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount>=3 && isnew && tld==%s",tldStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 60;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 22;
	n++;

	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount>=3 && isnew && parentlang==%s,xx",
			       langStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 60;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 22;
	n++;

	m_regExs[n].set("hopcount>=3 && isnew");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 60;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 12;
	n++;


	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount>=3 && tld==%s",tldStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 60;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 21;
	n++;

	m_regExs[n].reset();
	m_regExs[n].safePrintf("hopcount>=3 && parentlang==%s,xx",langStr);
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 60;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 21;
	n++;

	m_regExs[n].set("hopcount>=3");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 60;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 11;
	n++;


	m_regExs[n].set("default");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 60;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 1;
	n++;

	m_numRegExs   = n;
	m_numRegExs2  = n;
	m_numRegExs3  = n;
	m_numRegExs10 = n;
	m_numRegExs5  = n;
	m_numRegExs6  = n;
	m_numRegExs8  = n;
	m_numRegExs7  = n;

	// done rebuilding CHINESE rules
	return true;
}

bool CollectionRec::rebuildShallowRules ( ) {

	// max spiders per ip
	int32_t ipms = 7;

	int32_t n = 0;

	m_regExs[n].set("isreindex");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 0; // 30 days default
	m_maxSpidersPerRule  [n] = 99; // max spiders
	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 80;
	n++;

	m_regExs[n].set("ismedia");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 0; // 30 days default
	m_maxSpidersPerRule  [n] = 99; // max spiders
	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 100; // delete!
	m_forceDelete        [n] = 1;
	n++;

	// if not in the site list then nuke it
	m_regExs[n].set("!ismanualadd && !insitelist");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 0; // 30 days default
	m_maxSpidersPerRule  [n] = 99; // max spiders
	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 100; // delete!
	m_forceDelete        [n] = 1;
	n++;

	m_regExs[n].set("errorcount>=3 && hastmperror");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 1; // 30 days default
	m_maxSpidersPerRule  [n] = 1; // max spiders
	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 100;
	m_forceDelete        [n] = 1;
	n++;

	m_regExs[n].set("errorcount>=1 && hastmperror");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 1; // 30 days default
	m_maxSpidersPerRule  [n] = 1; // max spiders
	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 45;
	n++;

	m_regExs[n].set("isaddurl");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7; // 30 days default
	m_maxSpidersPerRule  [n] = 99; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 85;
	n++;


	//
	// stop if hopcount>=2 for things tagged shallow in sitelist
	//
	m_regExs[n].set("tag:shallow && hopcount>=2");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 40;
	m_maxSpidersPerRule  [n] = 0; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 30;
	n++;


	// if # of pages in this site indexed is >= 10 then stop as well...
	m_regExs[n].set("tag:shallow && sitepages>=10");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 40;
	m_maxSpidersPerRule  [n] = 0; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 30;
	n++;


	m_regExs[n].set("hopcount==0 && iswww && isnew");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7; // 30 days default
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 50;
	n++;

	m_regExs[n].set("hopcount==0 && iswww");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7.0; // days b4 respider
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 48;
	n++;


	m_regExs[n].set("hopcount==0 && isnew");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 7.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 49;
	n++;


	m_regExs[n].set("hopcount==0");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 10.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 47;
	n++;


	m_regExs[n].set("hopcount==1 && isnew");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 20.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 40;
	n++;


	m_regExs[n].set("hopcount==1");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 20.0;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 39;
	n++;


	m_regExs[n].set("hopcount==2 && isnew");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 40;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 30;
	n++;

	m_regExs[n].set("hopcount==2");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 40;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 29;
	n++;


	m_regExs[n].set("hopcount>=3 && isnew");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 60;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 22;
	n++;

	m_regExs[n].set("hopcount>=3");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 60;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 21;
	n++;


	m_regExs[n].set("default");
	m_harvestLinks       [n] = 1;
	m_spiderFreqs        [n] = 60;
	m_maxSpidersPerRule  [n] = 9; // max spiders
	m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
	m_spiderIpWaits      [n] = 1000; // same ip wait
	m_spiderPriorities   [n] = 1;
	n++;

	m_numRegExs   = n;
	m_numRegExs2  = n;
	m_numRegExs3  = n;
	m_numRegExs10 = n;
	m_numRegExs5  = n;
	m_numRegExs6  = n;
	m_numRegExs8  = n;
	m_numRegExs7  = n;

	// done rebuilding SHALLOW rules
	return true;
}

/*
bool CrawlInfo::print (SafeBuf *sb ) {
	return sb->safePrintf("objectsAdded:%" INT64 "\n"
			      "objectsDeleted:%" INT64 "\n"
			      "urlsConsidered:%" INT64 "\n"
			      "downloadAttempts:%" INT64 "\n"
			      "downloadSuccesses:%" INT64 "\n"
			      "processAttempts:%" INT64 "\n"
			      "processSuccesses:%" INT64 "\n"
			      "lastupdate:%" UINT32 "\n"
			      , m_objectsAdded
			      , m_objectsDeleted
			      , m_urlsConsidered
			      , m_pageDownloadAttempts
			      , m_pageDownloadSuccesses
			      , m_pageProcessAttempts
			      , m_pageProcessSuccesses
			      , m_lastUpdateTime
			      );
}

bool CrawlInfo::setFromSafeBuf (SafeBuf *sb ) {
	return sscanf(sb->getBufStart(),
		      "objectsAdded:%" INT64 "\n"
		      "objectsDeleted:%" INT64 "\n"
		      "urlsConsidered:%" INT64 "\n"
		      "downloadAttempts:%" INT64 "\n"
		      "downloadSuccesses:%" INT64 "\n"
		      "processAttempts:%" INT64 "\n"
		      "processSuccesses:%" INT64 "\n"
		      "lastupdate:%" UINT32 "\n"
		      , &m_objectsAdded
		      , &m_objectsDeleted
		      , &m_urlsConsidered
		      , &m_pageDownloadAttempts
		      , &m_pageDownloadSuccesses
		      , &m_pageProcessAttempts
		      , &m_pageProcessSuccesses
		      , &m_lastUpdateTime
		      );
}
*/

// returns false on failure and sets g_errno, true otherwise
bool CollectionRec::save ( ) {
	if ( g_conf.m_readOnlyMode ) return true;
	//File f;
	char tmp[1024];
	//sprintf ( tmp , "%scollections/%" INT32 ".%s/c.conf",
	//	  g_hostdb.m_dir,m_id,m_coll);
	// collection name HACK for backwards compatibility
	//if ( m_collLen == 0 )
	//	sprintf ( tmp , "%scoll.main/coll.conf", g_hostdb.m_dir);
	//else
	snprintf ( tmp , 1023, "%scoll.%s.%" INT32 "/coll.conf",
		  g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
	if ( ! g_parms.saveToXml ( (char *)this , tmp ,OBJ_COLL)) return false;
	// log msg
	//log (LOG_INFO,"db: Saved %s.",tmp);//f.getFilename());

	//
	// save the crawlinfo class in the collectionrec for diffbot
	//
	// SAVE LOCAL
	snprintf ( tmp , 1023, "%scoll.%s.%" INT32 "/localcrawlinfo.dat",
		  g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
	//log("coll: saving %s",tmp);
	// in case emergency save from malloc core, do not alloc
	char stack[1024];
	SafeBuf sb(stack,1024);
	//m_localCrawlInfo.print ( &sb );
	// binary now
	sb.safeMemcpy ( &m_localCrawlInfo , sizeof(CrawlInfo) );
	if ( sb.safeSave ( tmp ) == -1 ) {
		log("db: failed to save file %s : %s",
		    tmp,mstrerror(g_errno));
		g_errno = 0;
	}
	// SAVE GLOBAL
	snprintf ( tmp , 1023, "%scoll.%s.%" INT32 "/globalcrawlinfo.dat",
		  g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
	//log("coll: saving %s",tmp);
	sb.reset();
	//m_globalCrawlInfo.print ( &sb );
	// binary now
	sb.safeMemcpy ( &m_globalCrawlInfo , sizeof(CrawlInfo) );
	if ( sb.safeSave ( tmp ) == -1 ) {
		log("db: failed to save file %s : %s",
		    tmp,mstrerror(g_errno));
		g_errno = 0;
	}

	// the list of ip addresses that we have detected as being throttled
	// and therefore backoff and use proxies for
	sb.reset();
	sb.safePrintf("%scoll.%s.%" INT32 "/",
		      g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
	m_twitchyTable.save ( sb.getBufStart() , "ipstouseproxiesfor.dat" );

	// do not need a save now
	m_needsSave = false;

	// waiting tree is saved in SpiderCache::save() called by Process.cpp
	//SpiderColl *sc = m_spiderColl;
	//if ( ! sc ) return true;

	// save page count table which has # of pages indexed per
	// subdomain/site and firstip for doing quotas in url filters table
	//snprintf ( tmp , 1023, "coll.%s.%" INT32 "/pagecounts.dat",
	//	   m_coll , (int32_t)m_collnum );
	//if ( ! m_pageCountTable.save ( g_hostdb.m_dir , tmp ) ) {
	//	log("db: failed to save file %s : %s",tmp,mstrerror(g_errno));
	//	g_errno = 0;
	//}


	return true;
}

// calls hasPermissin() below
bool CollectionRec::hasPermission ( HttpRequest *r , TcpSocket *s ) {
	int32_t  plen;
	char *p     = r->getString ( "pwd" , &plen );
	int32_t  ip    = s->m_ip;
	return hasPermission ( p , plen , ip );
}


// . does this password work for this collection?
bool CollectionRec::isAssassin ( int32_t ip ) {
	// ok, make sure they came from an acceptable IP
	//for ( int32_t i = 0 ; i < m_numSpamIps ; i++ )
	//	// they also have a matching IP, so they now have permission
	//	if ( m_spamIps[i] == ip ) return true;
	return false;
}

// . does this password work for this collection?
bool CollectionRec::hasPermission ( char *p, int32_t plen , int32_t ip ) {
	// just return true
	// collection permission is checked from Users::verifyColl
	// in User::getUserType for every request
	return true;

	// scan the passwords
	// MDW: no longer, this is too vulnerable!!!
	/*
	for ( int32_t i = 0 ; i < m_numAdminPwds ; i++ ) {
		int32_t len = gbstrlen ( m_adminPwds[i] );
		if ( len != plen ) continue;
		if ( strncmp ( m_adminPwds[i] , p , plen ) != 0 ) continue;
		// otherwise it's a match!
		//goto checkIp;
		// . matching one password is good enough now, default OR
		// . because just matching an IP is good enough security,
		//   there is really no need for both IP AND passwd match
		return true;
	}
	*/
	// . if had passwords but the provided one didn't match, return false
	// . matching one password is good enough now, default OR
	//if ( m_numPasswords > 0 ) return false;
	// checkIp:
	// ok, make sure they came from an acceptable IP
	//for ( int32_t i = 0 ; i < m_numAdminIps ; i++ )
	//	// they also have a matching IP, so they now have permission
	//	if ( m_adminIps[i] == ip ) return true;
	// if no security, allow all NONONONONONONONONO!!!!!!!!!!!!!!
	//if ( m_numAdminPwds == 0 && m_numAdminIps == 0 ) return true;
	// if they did not match an ip or password, even if both lists
	// are empty, do not allow access... this prevents security breeches
	// by accident
	return false;
	// if there were IPs then they failed to get in
	//if ( m_numAdminIps > 0 ) return false;
	// otherwise, they made it
	//return true;
}

// can this ip perform a search or add url on this collection?
bool CollectionRec::hasSearchPermission ( TcpSocket *s , int32_t encapIp ) {
	// get the ip
	int32_t ip = 0; if ( s ) ip = s->m_ip;
	// and the ip domain
	int32_t ipd = 0; if ( s ) ipd = ipdom ( s->m_ip );
	// and top 2 bytes for the israel isp that has this huge block
	int32_t ipt = 0; if ( s ) ipt = iptop ( s->m_ip );
	// is it in the ban list?
	/*
	for ( int32_t i = 0 ; i < m_numBanIps ; i++ ) {
		if ( isIpTop ( m_banIps[i] ) ) {
			if ( m_banIps[i] == ipt ) return false;
			continue;
		}
		// check for ip domain match if this banned ip is an ip domain
		if ( isIpDom ( m_banIps[i] ) ) {
			if ( m_banIps[i] == ipd ) return false;
			continue;
		}
		// otherwise it's just a single banned ip
		if ( m_banIps[i] == ip ) return false;
	}
	*/
	// check the encapsulate ip if any
	// 1091771468731 0 Aug 05 23:51:08 63.236.25.77 GET
	// /search?code=mammaXbG&uip=65.87.190.39&n=15&raw=8&q=farm+insurance
	// +nj+state HTTP/1.0
	/*
	if ( encapIp ) {
		ipd = ipdom ( encapIp );
		ip  = encapIp;
		for ( int32_t i = 0 ; i < m_numBanIps ; i++ ) {
			if ( isIpDom ( m_banIps[i] ) ) {
				if ( m_banIps[i] == ipd ) return false;
				continue;
			}
			if ( isIpTop ( m_banIps[i] ) ) {
				if ( m_banIps[i] == ipt ) return false;
				continue;
			}
			if ( m_banIps[i] == ip ) return false;
		}
	}
	*/

	return true;
	/*
	// do we have an "only" list?
	if ( m_numSearchIps == 0 ) return true;
	// it must be in that list if we do
	for ( int32_t i = 0 ; i < m_numSearchIps ; i++ ) {
		// check for ip domain match if this banned ip is an ip domain
		if ( isIpDom ( m_searchIps[i] ) ) {
			if ( m_searchIps[i] == ipd ) return true;
			continue;
		}
		// otherwise it's just a single ip
		if ( m_searchIps[i] == ip ) return true;
	}
	*/

	// otherwise no permission
	return false;
}

bool expandRegExShortcuts ( SafeBuf *sb ) ;
void nukeDoledb ( collnum_t collnum );

// rebuild the regexes related to diffbot, such as the one for the URL pattern
bool CollectionRec::rebuildDiffbotRegexes() {
		//logf(LOG_DEBUG,"db: rebuilding url filters");
		char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
		if ( ucp && ! ucp[0] ) ucp = NULL;

		// get the regexes
		if ( ! ucp ) ucp = m_diffbotUrlCrawlRegEx.getBufStart();
		if ( ucp && ! ucp[0] ) ucp = NULL;
		char *upp = m_diffbotUrlProcessPattern.getBufStart();
		if ( upp && ! upp[0] ) upp = NULL;

		if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
		if ( upp && ! upp[0] ) upp = NULL;
		char *ppp = m_diffbotPageProcessPattern.getBufStart();
		if ( ppp && ! ppp[0] ) ppp = NULL;

		// recompiling regexes starts now
		if ( m_hasucr ) {
			regfree ( &m_ucr );
			m_hasucr = false;
		}
		if ( m_hasupr ) {
			regfree ( &m_upr );
			m_hasupr = false;
		}

		// copy into tmpbuf
		SafeBuf tmp;
		char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
		if ( rx && ! rx[0] ) rx = NULL;
		if ( rx ) {
			tmp.reset();
			tmp.safeStrcpy ( rx );
			expandRegExShortcuts ( &tmp );
			m_hasucr = true;
		}
		if ( rx && regcomp ( &m_ucr , tmp.getBufStart() ,
				     REG_EXTENDED| //REG_ICASE|
				     REG_NEWLINE ) ) { // |REG_NOSUB) ) {
			// error!
			log("coll: regcomp %s failed: %s. "
				   "Ignoring.",
				   rx,mstrerror(errno));
			regfree ( &m_ucr );
			m_hasucr = false;
		}


		rx = m_diffbotUrlProcessRegEx.getBufStart();
		if ( rx && ! rx[0] ) rx = NULL;
		if ( rx ) m_hasupr = true;
		if ( rx ) {
			tmp.reset();
			tmp.safeStrcpy ( rx );
			expandRegExShortcuts ( &tmp );
			m_hasupr = true;
		}
		if ( rx && regcomp ( &m_upr , tmp.getBufStart() ,
				     REG_EXTENDED| // REG_ICASE|
				     REG_NEWLINE ) ) { // |REG_NOSUB) ) {
			// error!
			log("coll: regcomp %s failed: %s. "
			    "Ignoring.",
			    rx,mstrerror(errno));
			regfree ( &m_upr );
			m_hasupr = false;
		}
		return true;

}

bool CollectionRec::rebuildUrlFiltersDiffbot() {

	//logf(LOG_DEBUG,"db: rebuilding url filters");
	char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
	if ( ucp && ! ucp[0] ) ucp = NULL;

	// if we had a regex, that works for this purpose as well
	if ( ! ucp ) ucp = m_diffbotUrlCrawlRegEx.getBufStart();
	if ( ucp && ! ucp[0] ) ucp = NULL;
	char *upp = m_diffbotUrlProcessPattern.getBufStart();
	if ( upp && ! upp[0] ) upp = NULL;

	// if we had a regex, that works for this purpose as well
	if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
	if ( upp && ! upp[0] ) upp = NULL;
	char *ppp = m_diffbotPageProcessPattern.getBufStart();
	if ( ppp && ! ppp[0] ) ppp = NULL;

	///////
	//
	// recompile regular expressions
	//
	///////


	if ( m_hasucr ) {
		regfree ( &m_ucr );
		m_hasucr = false;
	}

	if ( m_hasupr ) {
		regfree ( &m_upr );
		m_hasupr = false;
	}

	// copy into tmpbuf
	SafeBuf tmp;

	char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
	if ( rx && ! rx[0] ) rx = NULL;
	if ( rx ) {
		tmp.reset();
		tmp.safeStrcpy ( rx );
		expandRegExShortcuts ( &tmp );
		m_hasucr = true;
	}
	int32_t err;
	if ( rx && ( err = regcomp ( &m_ucr , tmp.getBufStart() ,
				     REG_EXTENDED| //REG_ICASE|
				     REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
		// error!
		char errbuf[1024];
		regerror(err,&m_ucr,errbuf,1000);
		log("coll: regcomp %s failed: %s. "
		    "Ignoring.",
		    rx,errbuf);
		regfree ( &m_ucr );
		m_hasucr = false;
	}


	rx = m_diffbotUrlProcessRegEx.getBufStart();
	if ( rx && ! rx[0] ) rx = NULL;
	if ( rx ) m_hasupr = true;
	if ( rx ) {
		tmp.reset();
		tmp.safeStrcpy ( rx );
		expandRegExShortcuts ( &tmp );
		m_hasupr = true;
	}
	if ( rx && ( err = regcomp ( &m_upr , tmp.getBufStart() ,
				     REG_EXTENDED| // REG_ICASE|
				     REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
		char errbuf[1024];
		regerror(err,&m_upr,errbuf,1000);
		// error!
		log("coll: regcomp %s failed: %s. "
		    "Ignoring.",
		    rx,errbuf);
		regfree ( &m_upr );
		m_hasupr = false;
	}

	// what diffbot url to use for processing
	char *api = m_diffbotApiUrl.getBufStart();
	if ( api && ! api[0] ) api = NULL;

	// convert from seconds to milliseconds. default is 250ms?
	int32_t wait = (int32_t)(m_collectiveCrawlDelay * 1000.0);
	// default to 250ms i guess. -1 means unset i think.
	if ( m_collectiveCrawlDelay < 0.0 ) wait = 250;

	bool isEthan = false;
	if (m_coll)isEthan=strstr(m_coll,"2b44a0e0bb91bbec920f7efd29ce3d5b");

	// it looks like we are assuming all crawls are repeating so that
	// &rountStart=<currenttime> or &roundStart=0 which is the same
	// thing, will trigger a re-crawl. so if collectiveRespiderFreq
	// is 0 assume it is like 999999.0 days. so that stuff works.
	// also i had to make the "default" rule below always have a respider
	// freq of 0.0 so it will respider right away if we make it past the
	// "lastspidertime>={roundstart}" rule which we will if they
	// set the roundstart time to the current time using &roundstart=0
	float respiderFreq = m_collectiveRespiderFrequency;
	if ( respiderFreq <= 0.0 ) respiderFreq = 3652.5;

	// lower from 7 to 1 since we have so many collections now
	// ok, now we have much less colls so raise back to 7
	int32_t diffbotipms = 7;//1; // 7

	// make the gigablast regex table just "default" so it does not
	// filtering, but accepts all urls. we will add code to pass the urls
	// through m_diffbotUrlCrawlPattern alternatively. if that itself
	// is empty, we will just restrict to the seed urls subdomain.
	for ( int32_t i = 0 ; i < MAX_FILTERS ; i++ ) {
		m_regExs[i].purge();
		m_spiderPriorities[i] = 0;
		m_maxSpidersPerRule [i] = 100;

		// when someone has a bulk job of thousands of different
		// domains it slows diffbot back-end down, so change this
		// from 100 to 7 if doing a bulk job
		if ( m_isCustomCrawl == 2 )
			m_maxSpidersPerRule[i] = 2;// try 2 not 1 to be faster

		m_spiderIpWaits     [i] = wait;
		m_spiderIpMaxSpiders[i] = diffbotipms; // keep it respectful
		// ethan wants some speed
		// if ( isEthan )
		// 	m_spiderIpMaxSpiders[i] = 30;
		//m_spidersEnabled    [i] = 1;
		m_spiderFreqs       [i] = respiderFreq;
		//m_spiderDiffbotApiUrl[i].purge();
		m_harvestLinks[i] = true;
		m_forceDelete [i] = false;
	}

	int32_t i = 0;

	// 1st one! for query reindex/ query delete
	m_regExs[i].set("isreindex");
	m_spiderIpMaxSpiders [i] = 10;
	m_spiderPriorities   [i] = 70;
	i++;

	// 2nd default url
	m_regExs[i].set("ismedia && !ismanualadd");
	m_maxSpidersPerRule  [i] = 0;
	m_spiderPriorities   [i] = 100; // delete!
	m_forceDelete        [i] = 1;
	i++;

	// de-prioritize fakefirstip urls so we don't give the impression our
	// spiders are slow. like if someone adds a bulk job with 100,000 urls
	// then we sit there and process to lookup their ips and add a real
	// spider request (if it falls onto the same shard) before we actually
	// do any real spidering. so keep the priority here low.
	m_regExs[i].set("isfakeip");
	m_maxSpidersPerRule  [i] = 7;
	m_spiderIpMaxSpiders [i] = 7;
	m_spiderPriorities   [i] = 20;
	m_spiderIpWaits      [i] = 0;
	i++;

	// hopcount filter if asked for
	if( m_diffbotMaxHops >= 0 ) {

		// transform long to string
		char numstr[21]; // enough to hold all numbers up to 64-bits
		sprintf(numstr, "%" INT32 "", (int32_t)m_diffbotMaxHops);

		// form regEx like: hopcount>3
		char hopcountStr[30];
		strcpy(hopcountStr, "hopcount>");
		strcat(hopcountStr, numstr);

		m_regExs[i].set(hopcountStr);

		// means DELETE :
		m_spiderPriorities   [i] = 0;//SPIDER_PRIORITY_FILTERED;

		//  just don't spider
		m_maxSpidersPerRule[i] = 0;

		// compatibility with m_spiderRoundStartTime:
		m_spiderFreqs[i] = 0.0;
		i++;
	}

	// 2nd default filter
	// always turn this on for now. they need to add domains they want
	// to crawl as seeds so they do not spider the web.
	// no because FTB seeds with link pages that link to another
	// domain. they just need to be sure to supply a crawl pattern
	// to avoid spidering the whole web.
	//
	// if they did not EXPLICITLY provide a url crawl pattern or
	// url crawl regex then restrict to seeds to prevent from spidering
	// the entire internet.
	//if ( ! ucp && ! m_hasucr ) { // m_restrictDomain ) {
	// MDW: even if they supplied a crawl pattern let's restrict to seed
	// domains 12/15/14
	m_regExs[i].set("!isonsamedomain && !ismanualadd");
	m_maxSpidersPerRule  [i] = 0;
	m_spiderPriorities   [i] = 100; // delete!
	m_forceDelete        [i] = 1;
	i++;
	//}

	bool ucpHasPositive = false;
	// . scan them to see if all patterns start with '!' or not
	// . if pattern starts with ! it is negative, otherwise positive
	if ( ucp ) ucpHasPositive = hasPositivePattern ( ucp );

	// if no crawl regex, and it has a crawl pattern consisting of
	// only negative patterns then restrict to domains of seeds
	if ( ucp && ! ucpHasPositive && ! m_hasucr ) {
		m_regExs[i].set("!isonsamedomain && !ismanualadd");
		m_maxSpidersPerRule  [i] = 0;
		m_spiderPriorities   [i] = 100; // delete!
		m_forceDelete        [i] = 1;
		i++;
	}

	// don't bother re-spidering old pages if hopcount == maxhopcount
	// and only process new urls is true. because we don't need to
	// harvest outlinks from them.
	if ( m_diffbotOnlyProcessIfNewUrl && m_diffbotMaxHops > 0 &&
	     // only crawls, not bulk jobs
	     m_isCustomCrawl == 1 ) {
		m_regExs[i].purge();
		m_regExs[i].safePrintf("isindexed && hopcount==%" INT32 ,
				       m_diffbotMaxHops );
		m_spiderPriorities   [i] = 14;
		m_spiderFreqs        [i] = 0.0;
		m_maxSpidersPerRule  [i] = 0; // turn off spiders
		m_harvestLinks       [i] = false;
		i++;
	}

	// 3rd rule for respidering
	// put this above the errocount>= rules below otherwise the crawl
	// may never advance its round because it keeps retrying a ton of
	// errored urls.
	if ( respiderFreq > 0.0 ) {
		m_regExs[i].set("lastspidertime>={roundstart}");
		// do not "remove" from index
		m_spiderPriorities   [i] = 10;
		// just turn off spidering. if we were to set priority to
		// filtered it would be removed from index!
		//m_spidersEnabled     [i] = 0;
		m_maxSpidersPerRule[i] = 0;
		// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
		// which has been obsoleted, but we are running old code now!
		//m_spiderDiffbotApiUrl[i].set ( api );
		i++;
	}
	// if doing a one-shot crawl limit error retries to 3 times or
	// if no urls currently available to spider, whichever comes first.
	else {
		m_regExs[i].set("errorcount>=3");
		m_spiderPriorities   [i] = 11;
		m_spiderFreqs        [i] = 0.0416;
		m_maxSpidersPerRule  [i] = 0; // turn off spiders
		i++;
	}

	// diffbot needs to retry even on 500 or 404 errors since sometimes
	// a seed url gets a 500 error mistakenly and it haults the crawl.
	// so take out "!hastmperror".

	m_regExs[i].set("errorcount>=1 && !hastmperror");
	m_spiderPriorities   [i] = 14;
	m_spiderFreqs        [i] = 0.0416; // every hour
	//m_maxSpidersPerRule  [i] = 0; // turn off spiders if not tmp error
	i++;

	// and for docs that have errors respider once every 5 hours
	m_regExs[i].set("errorcount==1 && hastmperror");
	m_spiderPriorities   [i] = 40;
	m_spiderFreqs        [i] = 0.001; // 86 seconds
	i++;

	// and for docs that have errors respider once every 5 hours
	m_regExs[i].set("errorcount==2 && hastmperror");
	m_spiderPriorities   [i] = 40;
	m_spiderFreqs        [i] = 0.003; // 3*86 seconds (was 24 hrs)
	i++;

	// excessive errors? (tcp/dns timed out, etc.) retry once per month?
	m_regExs[i].set("errorcount>=3 && hastmperror");
	m_spiderPriorities   [i] = 39;
	m_spiderFreqs        [i] = .25; // 1/4 day
	// if bulk job, do not download a url more than 3 times
	if ( m_isCustomCrawl == 2 ) m_maxSpidersPerRule [i] = 0;
	i++;

	// if collectiverespiderfreq is 0 or less then do not RE-spider
	// documents already indexed.
	if ( respiderFreq <= 0.0 ) { // else {
		// this does NOT work! error docs continuously respider
		// because they are never indexed!!! like EDOCSIMPLIFIEDREDIR
		//m_regExs[i].set("isindexed");
		m_regExs[i].set("hasreply");
		m_spiderPriorities   [i] = 10;
		// just turn off spidering. if we were to set priority to
		// filtered it would be removed from index!
		//m_spidersEnabled     [i] = 0;
		m_maxSpidersPerRule[i] = 0;
		// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
		// which has been obsoleted, but we are running old code now!
		//m_spiderDiffbotApiUrl[i].set ( api );
		i++;
	}

	// url crawl and PAGE process pattern
	if ( ucp && ! upp && ppp ) {
		// if just matches ucp, just crawl it, do not process
		m_regExs[i].set("matchesucp");
		m_spiderPriorities   [i] = 53;
		if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
		// let's always make this without delay because if we
		// restart the round we want these to process right away
		if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
		i++;
		// crawl everything else, but don't harvest links,
		// we have to see if the page content matches the "ppp"
		// to determine whether the page should be processed or not.
		m_regExs[i].set("default");
		m_spiderPriorities   [i] = 52;
		if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
		// let's always make this without delay because if we
		// restart the round we want these to process right away
		if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
		m_harvestLinks       [i] = false;
		i++;
		goto done;
	}

	// url crawl and process pattern
	if ( ucp && upp ) {
		m_regExs[i].set("matchesucp && matchesupp");
		m_spiderPriorities   [i] = 55;
		if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
		// let's always make this without delay because if we
		// restart the round we want these to process right away
		if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
		//m_spiderDiffbotApiUrl[i].set ( api );
		i++;
		// if just matches ucp, just crawl it, do not process
		m_regExs[i].set("matchesucp");
		m_spiderPriorities   [i] = 53;
		if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
		// let's always make this without delay because if we
		// restart the round we want these to process right away
		if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
		i++;
		// just process, do not spider links if does not match ucp
		m_regExs[i].set("matchesupp");
		m_spiderPriorities   [i] = 54;
		m_harvestLinks       [i] = false;
		if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
		// let's always make this without delay because if we
		// restart the round we want these to process right away
		if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
		//m_spiderDiffbotApiUrl[i].set ( api );
		i++;
		// do not crawl anything else
		m_regExs[i].set("default");
		m_spiderPriorities   [i] = 0;//SPIDER_PRIORITY_FILTERED;
		// don't spider
		m_maxSpidersPerRule[i] = 0;
		// this needs to be zero so &spiderRoundStart=0
		// functionality which sets m_spiderRoundStartTime
		// to the current time works
		// otherwise Spider.cpp's getSpiderTimeMS() returns a time
		// in the future and we can't force the round
		m_spiderFreqs[i] = 0.0;
		i++;
	}

	// harvest links if we should crawl it
	if ( ucp && ! upp ) {
		m_regExs[i].set("matchesucp");
		m_spiderPriorities   [i] = 53;
		if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
		// let's always make this without delay because if we
		// restart the round we want these to process right away.
		if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
		// process everything since upp is empty
		//m_spiderDiffbotApiUrl[i].set ( api );
		i++;
		// do not crawl anything else
		m_regExs[i].set("default");
		m_spiderPriorities   [i] = 0;//SPIDER_PRIORITY_FILTERED;
		// don't delete, just don't spider
		m_maxSpidersPerRule[i] = 0;
		// this needs to be zero so &spiderRoundStart=0
		// functionality which sets m_spiderRoundStartTime
		// to the current time works
		// otherwise Spider.cpp's getSpiderTimeMS() returns a time
		// in the future and we can't force the rounce
		m_spiderFreqs[i] = 0.0;
		i++;
	}

	// just process
	if ( upp && ! ucp ) {
		m_regExs[i].set("matchesupp");
		m_spiderPriorities   [i] = 54;
		if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
		// let's always make this without delay because if we
		// restart the round we want these to process right away
		if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
		//m_harvestLinks       [i] = false;
		//m_spiderDiffbotApiUrl[i].set ( api );
		i++;
		// crawl everything by default, no processing
		m_regExs[i].set("default");
		m_spiderPriorities   [i] = 50;
		// this needs to be zero so &spiderRoundStart=0
		// functionality which sets m_spiderRoundStartTime
		// to the current time works
		// otherwise Spider.cpp's getSpiderTimeMS() returns a time
		// in the future and we can't force the rounce
		m_spiderFreqs[i] = 0.0;
		i++;
	}

	// no restraints
	if ( ! upp && ! ucp ) {
		// crawl everything by default, no processing
		m_regExs[i].set("default");
		m_spiderPriorities   [i] = 50;
		// this needs to be zero so &spiderRoundStart=0
		// functionality which sets m_spiderRoundStartTime
		// to the current time works
		// otherwise Spider.cpp's getSpiderTimeMS() returns a time
		// in the future and we can't force the rounce
		m_spiderFreqs[i] = 0.0;
		//m_spiderDiffbotApiUrl[i].set ( api );
		i++;
	}

 done:
	m_numRegExs   = i;
	m_numRegExs2  = i;
	m_numRegExs3  = i;
	m_numRegExs10 = i;
	m_numRegExs5  = i;
	m_numRegExs6  = i;
	//m_numRegExs7  = i;
	m_numRegExs8  = i;
	m_numRegExs7  = i;
	//m_numRegExs11 = i;


	//char *x = "http://staticpages.diffbot.com/testCrawl/article1.html";
	//if(m_hasupr && regexec(&m_upr,x,0,NULL,0) ) { char *xx=NULL;*xx=0; }

	return true;


}


// . anytime the url filters are updated, this function is called
// . it is also called on load of the collection at startup
bool CollectionRec::rebuildUrlFilters ( ) {

	if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
		log("coll: Rebuilding url filters for %s ufp=%s",m_coll,
		    m_urlFiltersProfile.getBufStart());

	// if not a custom crawl, and no expressions, add a default one
	//if ( m_numRegExs == 0 && ! m_isCustomCrawl ) {
	//	setUrlFiltersToDefaults();
	//}

	// if not a custom crawl then set the url filters based on
	// the url filter profile, if any
	if ( ! m_isCustomCrawl )
		rebuildUrlFilters2();

	// set this so we know whether we have to keep track of page counts
	// per subdomain/site and per domain. if the url filters have
	// 'sitepages' 'domainpages' 'domainadds' or 'siteadds' we have to keep
	// the count table SpiderColl::m_pageCountTable.
	m_urlFiltersHavePageCounts = false;
	for ( int32_t i = 0 ; i < m_numRegExs ; i++ ) {
		// get the ith rule
		SafeBuf *sb = &m_regExs[i];
		char *p = sb->getBufStart();
		if ( strstr(p,"sitepages") ||
		     strstr(p,"domainpages") ||
		     strstr(p,"siteadds") ||
		     strstr(p,"domainadds") ) {
			m_urlFiltersHavePageCounts = true;
			break;
		}
	}

	// if collection is brand new being called from addNewColl()
	// then sc will be NULL
	SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(m_collnum);

	// . do not do this at startup
	// . this essentially resets doledb
	if ( g_doledb.m_rdb.m_initialized &&
	     // somehow this is initialized before we set m_recs[m_collnum]
	     // so we gotta do the two checks below...
	     sc &&
	     // must be a valid coll
	     m_collnum < g_collectiondb.m_numRecs &&
	     g_collectiondb.m_recs[m_collnum] ) {


		log("coll: resetting doledb for %s (%li)",m_coll,
		    (long)m_collnum);

		// clear doledb recs from tree
		//g_doledb.getRdb()->deleteAllRecs ( m_collnum );
		nukeDoledb ( m_collnum );

		// add it back
		//if ( ! g_doledb.getRdb()->addRdbBase2 ( m_collnum ) )
		//	log("coll: error re-adding doledb for %s",m_coll);

		// just start this over...
		// . MDW left off here
		//tryToDelete ( sc );
		// maybe this is good enough
		//if ( sc ) sc->m_waitingTreeNeedsRebuild = true;

		//CollectionRec *cr = sc->m_cr;

		// . rebuild sitetable? in PageBasic.cpp.
		// . re-adds seed spdierrequests using msg4
		// . true = addSeeds
		// . no, don't do this now because we call updateSiteList()
		//   when we have &sitelist=xxxx in the request which will
		//   handle updating those tables
		//updateSiteListTables ( m_collnum ,
		//		       true ,
		//		       cr->m_siteListBuf.getBufStart() );
	}


	// If the crawl is not generated by crawlbot, then we will just update
	// the regexes concerning the urls to process
	rebuildDiffbotRegexes();
	if ( ! m_isCustomCrawl ){
		return true;
	}

	// on the other hand, if it is a crawlbot job, then by convention the url filters are all set
	// to some default ones.
	return rebuildUrlFiltersDiffbot();
}

// for some reason the libc we use doesn't support these int16_tcuts,
// so expand them to something it does support
bool expandRegExShortcuts ( SafeBuf *sb ) {
	if ( ! sb->safeReplace3 ( "\\d" , "[0-9]" ) ) return false;
	if ( ! sb->safeReplace3 ( "\\D" , "[^0-9]" ) ) return false;
	if ( ! sb->safeReplace3 ( "\\l" , "[a-z]" ) ) return false;
	if ( ! sb->safeReplace3 ( "\\a" , "[A-Za-z]" ) ) return false;
	if ( ! sb->safeReplace3 ( "\\u" , "[A-Z]" ) ) return false;
	if ( ! sb->safeReplace3 ( "\\w" , "[A-Za-z0-9_]" ) ) return false;
	if ( ! sb->safeReplace3 ( "\\W" , "[^A-Za-z0-9_]" ) ) return false;
	return true;
}


void testRegex ( ) {

	//
	// TEST
	//

	char *rx;

	rx = "(http://)?(www.)?vault.com/rankings-reviews/company-rankings/law/vault-law-100/\\.aspx\\?pg=\\d";

	rx = "(http://)?(www.)?vault.com/rankings-reviews/company-rankings/law/vault-law-100/\\.aspx\\?pg=[0-9]";

	rx = ".*?article[0-9]*?.html";

	regex_t ucr;
	int32_t err;

	if ( ( err = regcomp ( &ucr , rx ,
			       REG_ICASE
			       |REG_EXTENDED
			       //|REG_NEWLINE
			       //|REG_NOSUB
			       ) ) ) {
		// error!
		char errbuf[1024];
		regerror(err,&ucr,errbuf,1000);
		log("xmldoc: regcomp %s failed: %s. "
		    "Ignoring.",
		    rx,errbuf);
	}

	logf(LOG_DEBUG,"db: compiled '%s' for crawl pattern",rx);

	//char *url = "http://www.vault.com/rankings-reviews/company-rankings/law/vault-law-100/.aspx?pg=2";
	char *url = "http://staticpages.diffbot.com/testCrawl/regex/article1.html";

	if ( regexec(&ucr,url,0,NULL,0) )
		logf(LOG_DEBUG,"db: failed to match %s on %s",
		     url,rx);
	else
		logf(LOG_DEBUG,"db: MATCHED %s on %s",
		     url,rx);
	exit(0);
}

int64_t CollectionRec::getNumDocsIndexed() {
	RdbBase *base = getBase(RDB_TITLEDB);//m_bases[RDB_TITLEDB];
	if ( ! base ) return 0LL;
	return base->getNumGlobalRecs();
}

// messes with m_spiderColl->m_sendLocalCrawlInfoToHost[MAX_HOSTS]
// so we do not have to keep sending this huge msg!
bool CollectionRec::shouldSendLocalCrawlInfoToHost ( int32_t hostId ) {
	if ( ! m_spiderColl ) return false;
	if ( hostId < 0 ) { char *xx=NULL;*xx=0; }
	if ( hostId >= g_hostdb.m_numHosts ) { char *xx=NULL;*xx=0; }
	// sanity
	return m_spiderColl->m_sendLocalCrawlInfoToHost[hostId];
}

void CollectionRec::localCrawlInfoUpdate() {
	if ( ! m_spiderColl ) return;
	// turn on all the flags
	memset(m_spiderColl->m_sendLocalCrawlInfoToHost,1,g_hostdb.m_numHosts);
}

// right after we send copy it for sending we set this so we do not send
// again unless localCrawlInfoUpdate() is called
void CollectionRec::sentLocalCrawlInfoToHost ( int32_t hostId ) {
	if ( ! m_spiderColl ) return;
	m_spiderColl->m_sendLocalCrawlInfoToHost[hostId] = 0;
}