Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot

mwells
2013-10-24 17:56:10 -07:00
12 changed files with 372 additions and 96 deletions

@ -101,6 +101,8 @@ void CollectionRec::reset() {
m_replies = 0;
}
CollectionRec *g_cr = NULL;
// . load this data from a conf file
// . values we do not explicitly have will be taken from "default",
// collection config file. if it does not have them then we use

@ -97,6 +97,11 @@ class CrawlInfo {
// this is non-zero if urls are available to be spidered right now.
long m_hasUrlsReadyToSpider;
// last time we launched a spider. 0 on startup.
time_t m_lastSpiderAttempt;
// time we had or might have had a url available for spidering
time_t m_lastSpiderCouldLaunch;
// have we sent out email/webhook notifications that the crawl has no urls
// currently in the ready queue (doledb) to spider?
char m_sentCrawlDoneAlert;

@ -774,9 +774,16 @@ bool Collectiondb::resetColl ( char *coll , bool resetTurkdb ) {
cr->m_spiderRoundNum = 0;
cr->m_spiderRoundStartTime = 0;
cr->m_spiderStatus = 0;
cr->m_spiderStatusMsg = NULL;
// reset seed buf
cr->m_diffbotSeeds.purge();
// reset seed dedup table
HashTableX *ht = &cr->m_seedHashTable;
ht->reset();
// so XmlDoc.cpp can detect if the collection was reset since it
// launched its spider:
cr->m_lastResetCount++;

@ -194,6 +194,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
// skip known directives
if ( ! strncmp(p,"port-offset:",12) ||
! strncmp(p,"index-splits:",13) ||
! strncmp(p,"num-mirrors:",12) ||
! strncmp(p,"working-dir:",12) )
p = p;
// check if this is a spare host
@ -243,13 +244,14 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
if ( ! m_hosts ) return log(
"conf: Memory allocation failed.");
unsigned long maxShard = 0;
//unsigned long maxShard = 0;
long numGrunts = 0;
// now fill up m_hosts
p = m_buf;
i = 0;
long line = 1;
unsigned long lastShard = 0;
//unsigned long lastShard = 0;
long proxyNum = 0;
// assume defaults
@ -257,6 +259,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
long indexSplits = 0;
char *wdir2 = NULL;
long wdirlen2 = 0;
long numMirrors = -1;
for ( ; *p ; p++ , line++ ) {
if ( is_wspace_a (*p) ) continue;
@ -273,6 +276,15 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
continue;
}
if ( ! strncmp(p,"num-mirrors:",12) ) {
p += 12;
// skip spaces after the colon
while ( is_wspace_a(*p) ) p++;
numMirrors = atol(p);
while ( *p && *p != '\n' ) p++;
continue;
}
// does the line say "working-dir: xxxx" ?
if ( ! strncmp(p,"working-dir:",12) ) {
p += 12;
@ -351,13 +363,6 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
// skip numeric hostid or "proxy" keyword
while ( ! is_wspace_a(*p) ) p++;
if ( indexSplits == 0 ) {
g_errno = EBADENGINEER;
log("admin: need index-splits: xxx directive "
"in hosts.conf");
return false;
}
// read in switch id
//h->m_switchId = atoi(p);
@ -590,7 +595,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
// our group is based on our split!
//h->m_group = i % g_hostdb.m_indexSplits; // # grps
//h->m_group = i % indexSplits; // # grps
h->m_shardNum = i % indexSplits;
//h->m_shardNum = i % indexSplits;
// i guess proxy and spares don't count
if ( h->m_type != HT_GRUNT ) h->m_shardNum = 0;
@ -665,9 +670,12 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
h->m_externalHttpsPort = h->m_httpsPort;
// get max group number
if ( h->m_shardNum > maxShard && h->m_type==HT_GRUNT )
maxShard = h->m_shardNum;
//if ( h->m_shardNum > maxShard && h->m_type==HT_GRUNT )
// maxShard = h->m_shardNum;
if ( h->m_type == HT_GRUNT )
numGrunts++;
/*
if ( h->m_shardNum <= lastShard && h->m_shardNum != 0
&& !(h->m_type&(HT_ALL_PROXIES)) ) {
g_errno = EBADENGINEER;
@ -678,6 +686,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
filename,line);
}
lastShard = h->m_shardNum;
*/
// skip line now
while ( *p && *p != '\n' )
@ -742,10 +751,46 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
//m_numHosts = i;
m_numTotalHosts = i;
// how many shards are we configured for?
m_numShards = maxShard + 1; // g_conf.m_numGroups;
//m_numShards = maxShard + 1; // g_conf.m_numGroups;
// # of mirrors is zero if no mirrors,
// if it is 1 then each host has ONE MIRROR host
if ( numMirrors == 0 )
indexSplits = numGrunts;
if ( numMirrors > 0 )
indexSplits = numGrunts / (numMirrors+1);
if ( indexSplits == 0 ) {
g_errno = EBADENGINEER;
log("admin: need num-mirrors: xxx or "
"index-splits: xxx directive "
"in hosts.conf");
return false;
}
numMirrors = (numGrunts / indexSplits) - 1 ;
if ( numMirrors < 0 ) {
g_errno = EBADENGINEER;
log("admin: need num-mirrors: xxx or "
"index-splits: xxx directive "
"in hosts.conf (2)");
return false;
}
m_indexSplits = indexSplits;
m_numShards = numGrunts / (numMirrors+1);
//
// set Host::m_shardNum
//
for ( long i = 0 ; i < numGrunts ; i++ ) {
Host *h = &m_hosts[i];
h->m_shardNum = i % indexSplits;
}
// assign spare hosts
if ( m_numSpareHosts > MAX_SPARES ) {
log ( "conf: Number of spares (%li) exceeds max of %i, "

@ -12,7 +12,7 @@
//#include "Stats.h"
// put me back
#define _EFENCE_
//#define _EFENCE_
// uncomment this for _EFENCE_ to do underflow checks instead of the
// default overflow checks

@ -16,6 +16,21 @@
#include "Multicast.h"
#include "Syncdb.h"
//////////////
//
// Send out our records to add every X ms here:
//
// Batching up the add requests saves udp traffic
// on large networks (100+ hosts).
//
// . currently: send out adds once every 500ms
// . when this was 5000ms (5s) it would wait like
// 5s to spider a url after adding it.
//
//////////////
#define MSG4_WAIT 500
// we have up to this many outstanding Multicasts to send add requests to hosts
#define MAX_MCASTS 128
Multicast s_mcasts[MAX_MCASTS];
@ -98,8 +113,11 @@ bool registerHandler4 ( ) {
}
// . register sleep handler every MSG4_WAIT ms
// . right now MSG4_WAIT is 500ms... i lowered it from 5s
// to speed up spidering so it would harvest outlinks
// faster and be able to spider them right away.
// . returns false on failure
return g_loop.registerSleepCallback ( 5000 , NULL , sleepCallback4 );
return g_loop.registerSleepCallback(MSG4_WAIT,NULL,sleepCallback4 );
}
static void flushLocal ( ) ;
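The batching idea in the comment block above can be summarized with a rough, hypothetical sketch (not the actual Msg4 buffering code): records queue up between timer ticks and are flushed together, so the worst-case extra latency for a newly added record is one MSG4_WAIT interval.

// illustrative batcher only; the names and types here are made up
#include <string>
#include <vector>

static std::vector<std::string> s_pending;      // records queued since last tick

void queueAdd ( const std::string &rec ) { s_pending.push_back ( rec ); }

// imagine this being invoked by a sleep callback every MSG4_WAIT (500) ms,
// the way sleepCallback4 is registered above
void flushPending ( ) {
	if ( s_pending.empty() ) return;
	// a real implementation would multicast the whole batch here
	s_pending.clear();
}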
@ -475,7 +493,8 @@ bool Msg4::addMetaList ( char *metaList ,
if ( metaListSize == 0 ) return true;
// sanity
if ( collnum < 0 || collnum > 1000 ) { char *xx=NULL;*xx=0; }
//if ( collnum < 0 || collnum > 1000 ) { char *xx=NULL;*xx=0; }
if ( collnum < 0 ) { char *xx=NULL;*xx=0; }
// if first time set this
m_currentPtr = metaList;
@ -547,7 +566,8 @@ bool Msg4::addMetaList2 ( ) {
char *pend = m_metaList + m_metaListSize;
if ( m_collnum < 0 || m_collnum > 1000 ) { char *xx=NULL;*xx=0; }
//if ( m_collnum < 0 || m_collnum > 1000 ) { char *xx=NULL;*xx=0; }
if ( m_collnum < 0 ) { char *xx=NULL;*xx=0; }
// store each record in the list into the send buffers
for ( ; p < pend ; ) {

@ -1256,6 +1256,10 @@ void StateCD::printSpiderdbList ( RdbList *list , SafeBuf *sb ) {
long prevReplyError = 0;
time_t prevReplyDownloadTime = 0LL;
long badCount = 0;
long nowGlobalMS = gettimeofdayInMillisecondsGlobal();
CollectionRec *cr = g_collectiondb.getRec(m_collnum);
// parse through it
for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) {
// this record is either a SpiderRequest or SpiderReply
@ -1316,8 +1320,36 @@ void StateCD::printSpiderdbList ( RdbList *list , SafeBuf *sb ) {
if ( status == 0 ) msg = "Unexamined";
if ( status == -1 ) msg = mstrerror(prevReplyError);
// matching url filter, print out the expression
long ufn ;
ufn = ::getUrlFilterNum(sreq,
srep,
nowGlobalMS,
false,
MAX_NICENESS,
cr);
char *expression = NULL;
long priority = -4;
// sanity check
if ( ufn >= 0 ) {
expression = cr->m_regExs[ufn].getBufStart();
priority = cr->m_spiderPriorities[ufn];
}
if ( ! expression ) {
expression = "error. matches no expression!";
priority = -4;
}
// when spidering rounds we use the
// lastspidertime>={roundstart} --> spiders disabled rule
// so that we do not spider a url twice in the same round
if ( ufn >= 0 && ! cr->m_spidersEnabled[ufn] ) {
priority = -5;
}
// "csv" is default if json not specified
if ( m_fmt == FMT_JSON )
if ( m_fmt == FMT_JSON )
sb->safePrintf("[{"
"{\"url\":"
"\"%s\"},"
@ -1338,18 +1370,35 @@ void StateCD::printSpiderdbList ( RdbList *list , SafeBuf *sb ) {
, msg
);
// but default to csv
else
sb->safePrintf("%s,%lu,%li,\"%s\""
else {
sb->safePrintf("\"%s\",%lu,\"%s\",\"%s\",\""
//",%s"
"\n"
//"\n"
, sreq->m_url
// when was it first added to spiderdb?
, sreq->m_addedTime
, status
//, status
, msg
// the url filter expression it matches
, expression
// the priority
//, priorityMsg
//, iptoa(sreq->m_firstIp)
);
// print priority
if ( priority == SPIDER_PRIORITY_FILTERED )
sb->safePrintf("url ignored");
else if ( priority == SPIDER_PRIORITY_BANNED )
sb->safePrintf("url banned");
else if ( priority == -4 )
sb->safePrintf("error");
else if ( priority == -5 )
sb->safePrintf("will spider next round");
else
sb->safePrintf("%li",priority);
sb->safePrintf("\""
"\n");
}
}
if ( ! badCount ) return;
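For reference, a hypothetical line of the csv output built by the else branch above (quoted url, unquoted add time, status message, matching filter expression, then the quoted priority column) could look like:

"http://example.com/page.html",1382659200,"Unexamined","!hasreply","50"

with the last column replaced by "url ignored", "url banned", "error", or "will spider next round" for the special priorities handled above.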
@ -2649,15 +2698,40 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//if ( cx->m_collectionNameAlias.length() > 0 )
// alias=cx->m_collectionNameAlias.getBufStart();
//long paused = 1;
char *ss = "Normal";
char *ss = "In progress.";
if ( cx->m_spiderStatusMsg )
ss = cx->m_spiderStatusMsg;
// 0 means not to RE-crawl
char tmp[256];
// indicate if we are WAITING for next round...
if ( cx->m_collectiveRespiderFrequency > 0.0 &&
getTimeGlobal() < cx->m_spiderRoundStartTime ) {
long now = getTimeGlobal();
sprintf(tmp,"Spidering next round in %li "
"seconds.",
cx->m_spiderRoundStartTime - now
);
ss = tmp;
}
// if we sent an email simply because no urls
// were left and we are not recrawling!
if ( cx->m_collectiveRespiderFrequency == 0.0 &&
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
ss = "Crawl has exhausted all urls and "
"repeatCrawl is set to 0.0.";
}
if ( ! cx->m_spideringEnabled )
ss = "Crawl paused.";
CrawlInfo *ci = &cx->m_localCrawlInfo;
long sentAlert = (long)ci->m_sentCrawlDoneAlert;
if ( sentAlert ) sentAlert = 1;
//if ( cx->m_spideringEnabled ) paused = 0;
sb.safePrintf("\n\n{"
"\"name\":\"%s\",\n"
//"\"alias\":\"%s\",\n"
"\"crawlingEnabled\":%li,\n"
"\"crawlingStatus\":\"%s\",\n"
//"\"crawlingEnabled\":%li,\n"
"\"crawlStatus\":\"%s\",\n"
"\"sentCrawlDoneNotification\":%li,\n"
//"\"crawlingPaused\":%li,\n"
"\"objectsFound\":%lli,\n"
"\"urlsHarvested\":%lli,\n"
@ -2676,8 +2750,9 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//,cx->m_coll
, cx->m_diffbotCrawlName.getBufStart()
//, alias
, (long)cx->m_spideringEnabled
//, (long)cx->m_spideringEnabled
, ss
, sentAlert
//, (long)paused
, cx->m_globalCrawlInfo.m_objectsAdded -
cx->m_globalCrawlInfo.m_objectsDeleted
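Pieced together from the format strings above (hypothetical values; the fields from the continuation of this safePrintf are elided), one crawl's status object would start out roughly like:

{"name":"myDiffbotCrawl",
"crawlStatus":"Spidering next round in 120 seconds.",
"sentCrawlDoneNotification":0,
"objectsFound":1204,
"urlsHarvested":5311,
...}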
@ -4085,7 +4160,10 @@ bool resetUrlFilters ( CollectionRec *cr ) {
// if collectiverespiderfreq is 0 or less then do not RE-spider
// documents already indexed.
else {
cr->m_regExs[i].set("isindexed");
// this does NOT work! error docs continuously respider
// because they are never indexed!!! like EDOCSIMPLIFIEDREDIR
//cr->m_regExs[i].set("isindexed");
cr->m_regExs[i].set("hasreply");
cr->m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
@ -4376,3 +4454,27 @@ bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,
return true;
}
///////////
//
// SUPPORT for getting the last 100 spidered urls
//
// . sends request to each node
// . each node returns top 100 after scanning spiderdb (cache for speed)
// . master node gets top 100 of the top 100s
// . sends pretty html or json back to socket
// . then user can see why their crawl isn't working
// . also since we are scanning spiderdb indicate how many urls are
// ignored because they match "ismedia" or "!isonsamedomain" etc. so
// show each url filter expression then show how many urls matched that.
// when doing this make the spiderReply null, b/c the purpose is to see
// what urls would match which expression.
// . BUT url may never be attempted because it matches "ismedia" so that kind
// of thing might have to be indicated on the spiderdb dump above, not here.
//
//////////
//bool sendPageLast100Urls ( TcpSocket *socket , HttpRequest *hr ) {

@ -697,12 +697,10 @@ bool Parms::sendPageGeneric ( TcpSocket *s , HttpRequest *r , long page ,
"it from."
"</td></tr>"
"<tr><td>isnew | !isnew</td>"
"<tr><td>hasreply | !hasreply</td>"
"<td>"
"This is true if we have never tried to spider "
"this url. If we have tried to spider it and "
"received an error, like a timeout or something, "
"then it will no longer match <i>isnew</i>."
"This is true if we have tried to spider "
"this url, even if we got an error while trying."
"</td></tr>"

@ -1000,8 +1000,8 @@ SpiderColl::SpiderColl () {
m_numAdded = 0;
m_numBytesScanned = 0;
m_lastPrintCount = 0;
m_lastSpiderAttempt = 0;
m_lastSpiderCouldLaunch = 0;
//m_lastSpiderAttempt = 0;
//m_lastSpiderCouldLaunch = 0;
//m_numRoundsDone = 0;
//m_lastDoledbReadEmpty = false; // over all priorities in this coll
// re-set this to min and set m_needsWaitingTreeRebuild to true
@ -3954,14 +3954,24 @@ void doneSendingNotification ( void *state ) {
// as false again! use LOCAL crawlInfo, since global is reset often.
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = cr->m_spiderStatus;//1;
// be sure to save state so we do not re-send emails
cr->m_needsSave = 1;
// sanity
if ( cr->m_spiderStatus == 0 ) { char *xx=NULL;*xx=0; }
// sanity check
if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
// if not round done we are done
if ( cr->m_spiderStatus != SP_ROUNDDONE ) return;
// advance round if that round has completed, or there are no
// more urls to spider. if we hit maxToProcess/maxToCrawl then
// do not increment the round #. otherwise we should increment it.
if ( cr->m_spiderStatus == SP_MAXTOCRAWL ) return;
if ( cr->m_spiderStatus == SP_MAXTOPROCESS ) return;
// this should have been set below
if ( cr->m_spiderRoundStartTime == 0 ) { char *xx=NULL;*xx=0; }
//if ( cr->m_spiderRoundStartTime == 0 ) { char *xx=NULL;*xx=0; }
// how is this possible
//if ( getTimeGlobal()
@ -3980,7 +3990,10 @@ void doneSendingNotification ( void *state ) {
break;
}
if ( respiderFreq == -1.0 ) return;
// if not REcrawling, set this to 0 so we at least update our
// round # and round start time...
if ( respiderFreq == -1.0 )
respiderFreq = 0.0;
if ( respiderFreq < 0.0 ) {
log("spider: bad respiderFreq of %f. making 0.",
@ -3989,6 +4002,9 @@ void doneSendingNotification ( void *state ) {
}
long seconds = respiderFreq * 24*3600;
// add 1 for lastspidertime round off errors so we can be assured
// all spiders have a lastspidertime LESS than the new
// m_spiderRoundStartTime we set below.
if ( seconds <= 0 ) seconds = 1;
// now update this round start time. all the other hosts should
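A quick worked example of the conversion above: respiderFreq appears to be expressed in days given the 24*3600 factor, so a hypothetical value of 0.5 yields seconds = 0.5 * 24 * 3600 = 43200, while the 0.0 forced above for the no-recrawl case yields 0, which the check just above bumps to 1 so the new m_spiderRoundStartTime ends up strictly greater than every existing lastspidertime.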
@ -4011,6 +4027,15 @@ void doneSendingNotification ( void *state ) {
bool sendNotificationForCollRec ( CollectionRec *cr ) {
// do not send email for maxrounds hit, it will send a round done
// email for that. otherwise we end up calling doneSendingEmail()
// twice and incrementing the round twice
if ( cr->m_spiderStatus == SP_MAXROUNDS ) {
log("spider: not sending email for max rounds limit "
"since already sent for round done.");
return true;
}
// . if already sent email for this, skip
// . localCrawlInfo stores this value on disk so it is persistent
// . we do it this way so SP_ROUNDDONE can be emailed and then
@ -4139,11 +4164,14 @@ void SpiderLoop::spiderDoledUrls ( ) {
if ( ! cr->m_spideringEnabled ) continue;
// hit crawl round max?
if ( //cr->m_maxCrawlRounds > 0 &&
if ( cr->m_maxCrawlRounds > 0 &&
cr->m_spiderRoundNum >= cr->m_maxCrawlRounds ) {
cr->m_spiderStatus = SP_MAXROUNDS;
cr->m_spiderStatusMsg = "Hit maxCrawlRounds limit.";
sendNotificationForCollRec ( cr );
// it'll send a SP_ROUNDDONE email first
// so no need to repeat it, but we do want to
// update the status msg
//sendNotificationForCollRec ( cr );
continue;
}
@ -4175,6 +4203,24 @@ void SpiderLoop::spiderDoledUrls ( ) {
// set current time, synced with host #0
nowGlobal = getTimeGlobal();
// shortcut
CrawlInfo *ci = &cr->m_localCrawlInfo;
// the last time we attempted to spider a url for this coll
//m_sc->m_lastSpiderAttempt = nowGlobal;
// now we save this so when we restart these two times
// are from where we left off so we do not end up setting
// hasUrlsReadyToSpider to true which in turn sets
// the sentEmailAlert flag to false, which makes us
// send ANOTHER email alert!!
ci->m_lastSpiderAttempt = nowGlobal;
// update this for the first time in case it is never updated.
// then after 60 seconds we assume the crawl is done and
// we send out notifications. see below.
if ( ci->m_lastSpiderCouldLaunch == 0 )
ci->m_lastSpiderCouldLaunch = nowGlobal;
//
// . if doing respider with roundstarttime....
// . roundstarttime is > 0 if m_collectiveRespiderFrequency
@ -4184,19 +4230,13 @@ void SpiderLoop::spiderDoledUrls ( ) {
//
if ( nowGlobal < cr->m_spiderRoundStartTime ) continue;
// the last time we attempted to spider a url for this coll
m_sc->m_lastSpiderAttempt = nowGlobal;
// update this for the first time in case it is never updated.
// then after 60 seconds we assume the crawl is done and
// we send out notifications. see below.
if ( m_sc->m_lastSpiderCouldLaunch == 0 )
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
// if populating this collection's waitingtree assume
// we would have found something to launch as well. it might
// mean the waitingtree-saved.dat file was deleted from disk
// so we need to rebuild it at startup.
if ( m_sc->m_waitingTreeNeedsRebuild )
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
ci->m_lastSpiderCouldLaunch = nowGlobal;
// get max spiders
long maxSpiders = cr->m_maxNumSpiders;
if ( m_sc->m_isTestColl ) {
@ -4215,7 +4255,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
// obey max spiders per collection too
if ( m_sc->m_spidersOut >= maxSpiders ) {
// assume we would have launched a spider
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
ci->m_lastSpiderCouldLaunch = nowGlobal;
// try next collection
continue;
}
@ -4279,10 +4319,13 @@ void SpiderLoop::spiderDoledUrls ( ) {
loop:
// shortcut
CrawlInfo *ci = &cr->m_localCrawlInfo;
// bail if waiting for lock reply, no point in reading more
if ( m_msg12.m_gettingLocks ) {
// assume we would have launched a spider for this coll
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
ci->m_lastSpiderCouldLaunch = nowGlobal;
// wait for sleep callback to re-call us in 10ms
return;
}
@ -4344,7 +4387,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
// skip?
if ( out >= max ) {
// assume we could have launched a spider
if ( max > 0 ) m_sc->m_lastSpiderCouldLaunch = nowGlobal;
if ( max > 0 ) ci->m_lastSpiderCouldLaunch = nowGlobal;
// count as non-empty then!
//m_sc->m_encounteredDoledbRecs = true;
// try the priority below us
@ -4464,6 +4507,10 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// unlock
m_gettingDoledbList = false;
// shortcuts
CollectionRec *cr = m_sc->m_cr;
CrawlInfo *ci = &cr->m_localCrawlInfo;
// update m_msg5StartKey for next read
if ( m_list.getListSize() > 0 ) {
m_list.getLastKey((char *)&m_sc->m_msg5StartKey);
@ -4495,7 +4542,7 @@ bool SpiderLoop::gotDoledbList2 ( ) {
if ( bail ) {
// assume we could have launched a spider
m_sc->m_lastSpiderCouldLaunch = getTimeGlobal();
ci->m_lastSpiderCouldLaunch = getTimeGlobal();
// return false to indicate to try another
return false;
}
@ -4623,7 +4670,6 @@ bool SpiderLoop::gotDoledbList2 ( ) {
if ( pri < 0 || pri >= MAX_SPIDER_PRIORITIES ) { char *xx=NULL;*xx=0; }
// skip the priority if we already have enough spiders on it
long out = m_sc->m_outstandingSpiders[pri];
CollectionRec *cr = m_sc->m_cr;
// get the first ufn that uses this priority
//long max = getMaxAllowableSpidersOut ( pri );
// how many spiders can we have out?
@ -4661,7 +4707,7 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// skip? and re-get another doledb list from next priority...
if ( out >= max ) {
// assume we could have launched a spider
if ( max > 0 ) m_sc->m_lastSpiderCouldLaunch = nowGlobal;
if ( max > 0 ) ci->m_lastSpiderCouldLaunch = nowGlobal;
// this priority is maxed out, try next
m_sc->devancePriority();
// assume not an empty read
@ -4850,12 +4896,22 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// assume we launch the spider below. really this timestamp indicates
// the last time we COULD HAVE LAUNCHED *OR* did actually launch
// a spider
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
ci->m_lastSpiderCouldLaunch = nowGlobal;
// set crawl done email sent flag so another email can be sent again
// in case the user upped the maxToCrawl limit, for instance,
// so that the crawl could continue.
m_sc->m_cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
//ci->m_sentCrawlDoneAlert = 0;
// there are urls ready to spider
ci->m_hasUrlsReadyToSpider = true;
// reset reason why crawl is not running, because we basically are now
cr->m_spiderStatus = 0;
cr->m_spiderStatusMsg = NULL;
// be sure to save state so we do not re-send emails
cr->m_needsSave = 1;
// assume not an empty read
//m_sc->m_encounteredDoledbRecs = true;
@ -8322,6 +8378,24 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}
if ( *p=='h' && strncmp(p,"hasreply",8) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// if we got a reply, we are not new!!
if ( (bool)srep == (bool)val ) continue;
// skip it for speed
p += 8;
// check for &&
p = strstr(p, "&&");
// if nothing else, then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// hastmperror, if while spidering, the last reply was
// like EDNSTIMEDOUT or ETCPTIMEDOUT or some kind of
// usually temporary condition that warrants a retry
@ -8802,7 +8876,6 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
p += 2;
goto checkNextRule;
}
// iswww, means url is like www.xyz.com/...
if ( strncmp(p,"iswww", 5) == 0 ) {
// now this is a bit
@ -9863,9 +9936,13 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// but only if it was a crawl round done alert,
// not a maxToCrawl or maxToProcess or maxRounds
// alert.
if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ==
SP_ROUNDDONE )
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
// we can't do this because on startup we end up
// setting hasUrlsReadyToSpider to true and we
// may have already sent an email, and it gets RESET
// here when it shouldn't be
//if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ==
// SP_ROUNDDONE )
// cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
}
}
// return if still waiting on more to come in
@ -9874,6 +9951,15 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// sanity check
if ( cr->m_replies > cr->m_requests ) { char *xx=NULL;*xx=0; }
//if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert == SP_ROUNDDONE )
// if we have urls ready to be spidered then prepare to send another
// email/webhook notification
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
// update cache time
cr->m_globalCrawlInfo.m_lastUpdateTime = getTime();
@ -9932,9 +10018,9 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// if urls were considered and roundstarttime is still 0 then
// set it to the current time...
if ( cr->m_spiderRoundStartTime == 0 )
// all hosts in the network should sync with host #0 on this
cr->m_spiderRoundStartTime = getTimeGlobal();
//if ( cr->m_spiderRoundStartTime == 0 )
// // all hosts in the network should sync with host #0 on this
// cr->m_spiderRoundStartTime = getTimeGlobal();
// but of course if it has urls ready to spider, do not send alert...
// or if this is -1, indicating "unknown".
@ -9987,20 +10073,23 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
//long now = getTimeGlobal();
SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
//SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
// shortcut
CrawlInfo *ci = &cr->m_localCrawlInfo;
// assume it does
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 1;
//ci->m_hasUrlsReadyToSpider = 1;
// if we haven't spidered anything in 1 min assume the
// queue is basically empty...
if ( sc->m_lastSpiderAttempt &&
sc->m_lastSpiderCouldLaunch &&
if ( ci->m_lastSpiderAttempt &&
ci->m_lastSpiderCouldLaunch &&
//cr->m_spideringEnabled &&
//g_conf.m_spideringEnabled &&
sc->m_lastSpiderAttempt - sc->m_lastSpiderCouldLaunch > 60 )
ci->m_lastSpiderAttempt - ci->m_lastSpiderCouldLaunch > 60 )
// assume our crawl on this host is completed i guess
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 0;
ci->m_hasUrlsReadyToSpider = 0;
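The idle test used above can be condensed into a small hedged sketch (the stand-in struct and helper below are hypothetical, not part of the commit): the crawl on this host is treated as out of urls once the last spider attempt happened more than 60 seconds after the last moment a spider could have launched.

// hypothetical condensation of the check in handleRequestc1 above
#include <ctime>

struct CrawlTimes {
	time_t lastSpiderAttempt;      // last time we tried to launch a spider
	time_t lastSpiderCouldLaunch;  // last time a url was (or may have been) available
};

static bool crawlLooksIdle ( const CrawlTimes &t ) {
	// both timestamps must have been set at least once
	if ( ! t.lastSpiderAttempt || ! t.lastSpiderCouldLaunch ) return false;
	// idle if attempts continued for 60+ seconds past the last launchable moment
	return t.lastSpiderAttempt - t.lastSpiderCouldLaunch > 60;
}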

@ -980,11 +980,6 @@ class SpiderColl {
bool m_useTree;
// last time we launched a spider. 0 on startup.
time_t m_lastSpiderAttempt;
// time we had or might have had a url available for spidering
time_t m_lastSpiderCouldLaunch;
//bool m_lastDoledbReadEmpty;
//bool m_encounteredDoledbRecs;
//long long m_numRoundsDone;

@ -12898,8 +12898,14 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// add a '?' if none
if ( ! strchr ( apiUrl.getUrl() , '?' ) )
diffbotUrl.pushChar('?');
else
diffbotUrl.pushChar('&');
//diffbotUrl.safePrintf("http://54.212.86.74/api/%s?token=%s&u="
diffbotUrl.safePrintf("&token=%s",cr->m_diffbotToken.getBufStart());
diffbotUrl.safePrintf("token=%s",cr->m_diffbotToken.getBufStart());
diffbotUrl.safePrintf("&url=");
// give diffbot the url to process
diffbotUrl.urlEncode ( m_firstUrl.getUrl() );
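Assuming a hypothetical apiUrl of http://api.diffbot.com/v2/article (no '?' yet) and a first url of http://example.com/a, the string building above would produce roughly:

http://api.diffbot.com/v2/article?token=<token>&url=http%3A%2F%2Fexample.com%2Fa

with <token> standing in for the collection's diffbot token.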
@ -21492,20 +21498,20 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
// so if your first X filters all map to a "FILTERED"
// priority and this url matches one of them we can
// confidently toss this guy out.
long ufn = ::getUrlFilterNum ( &ksr , NULL, m_spideredTime ,
false, m_niceness, cr);
//long ufn = ::getUrlFilterNum ( &ksr , NULL, m_spideredTime ,
// false, m_niceness, cr);
// bad?
if ( ufn < 0 ) {
log("build: link %s had bad url filter."
, ksr.m_url );
g_errno = EBADENGINEER;
return NULL;
}
//if ( ufn < 0 ) {
// log("build: link %s had bad url filter."
// , ksr.m_url );
// g_errno = EBADENGINEER;
// return NULL;
//}
long priority = -1;
if ( ufn >= 0 )
priority = cr->m_spiderPriorities[ufn];
//long priority = -1;
//if ( ufn >= 0 )
// priority = cr->m_spiderPriorities[ufn];
// debug
if ( g_conf.m_logDebugUrlAttempts || isScraping ) {
@ -21526,10 +21532,15 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
sb2.getBufStart());
}
// do not add if bad priority, SPIDER_PRIORITY_FILTERED, ...
if ( priority == SPIDER_PRIORITY_FILTERED ) {
linksFiltered++; continue; }
if ( priority == SPIDER_PRIORITY_BANNED ) {
linksBanned++; continue; }
// . mdw: oct 24, 2013. now i add them so the urls show up in
// the pagecrawlbot.cpp spiderdb dump, so you can examine
// exactly why a url was crawled or not. plus if you change
// your mind about banning/filtering then it'd be nice to
// have these urls readily available.
//if ( priority == SPIDER_PRIORITY_FILTERED ) {
// linksFiltered++; continue; }
//if ( priority == SPIDER_PRIORITY_BANNED ) {
// linksBanned++; continue; }
// serialize into the buffer
long need = ksr.getRecSize();

@ -2,12 +2,14 @@
# Tells us what hosts are participating in the distributed search engine.
# This is how many pieces you want the index split into.
# So if you have 64 machines, and you want a unique piece of index on
# each machine, then make this 64. But if you have 64 machines and you
# want one level of redundancy then make this 32.
# How many mirrors do you want? If this is 0 then your data
# will NOT be replicated. If it is 1 then each host listed
# below will have one host that mirrors it, thereby decreasing
# total index capacity, but increasing redundancy. If this is
# 1 then the first half of hosts will be replicated by the
# second half of the hosts listed below.
index-splits: 1
num-mirrors: 0
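A hedged worked example of the new directive, with a hypothetical host count, tying it back to the arithmetic in Hostdb::init above:

# e.g. with 8 grunt hosts listed below and num-mirrors: 1, Hostdb::init
# computes indexSplits = 8 / (1+1) = 4, so the index has 4 shards;
# host i gets shard i % 4, meaning hosts 0-3 hold shards 0-3 and
# hosts 4-7 mirror them. With num-mirrors: 0 every host gets its own
# unique shard and nothing is replicated.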