Fixes for sudden revitalization of dead crawls.

Matt Wells
2014-01-25 11:03:15 -08:00
parent c207c3c456
commit e3f769dffe
5 changed files with 123 additions and 16 deletions

@ -1525,7 +1525,8 @@ bool CollectionRec::load ( char *coll , long i ) {
m_collLen = gbstrlen ( coll );
strcpy ( m_coll , coll );
log(LOG_INFO,"db: loading conf for collection %s",coll);
log(LOG_INFO,"db: loading conf for collection %s (%li)",coll,
(long)m_collnum);
// collection name HACK for backwards compatibility
//if ( strcmp ( coll , "main" ) == 0 ) {
@ -1562,6 +1563,12 @@ bool CollectionRec::load ( char *coll , long i ) {
// it is binary now
memcpy ( &m_localCrawlInfo , sb.getBufStart(),sb.length() );
log("coll: loaded %s (%li) local hasurlsready=%li",
m_coll,
(long)m_collnum,
(long)m_localCrawlInfo.m_hasUrlsReadyToSpider);
// we introduced the 'this round' counts, so don't start them at 0!!
if ( m_spiderRoundNum == 0 &&
@ -1594,6 +1601,12 @@ bool CollectionRec::load ( char *coll , long i ) {
// it is binary now
memcpy ( &m_globalCrawlInfo , sb.getBufStart(),sb.length() );
log("coll: loaded %s (%li) global hasurlsready=%li",
m_coll,
(long)m_collnum,
(long)m_globalCrawlInfo.m_hasUrlsReadyToSpider);
////////////
//
// PAGE COUNT TABLE for doing quotas in url filters

@ -68,6 +68,16 @@ Hostdb::~Hostdb () {
}
void Hostdb::reset ( ) {
for ( long i = 0 ; i < m_numHosts ; i++ ) {
Host *h = &m_hosts[i];
if ( ! h->m_lastKnownGoodCrawlInfoReply ) continue;
mfree ( h->m_lastKnownGoodCrawlInfoReply ,
h->m_lastKnownGoodCrawlInfoReplyEnd -
h->m_lastKnownGoodCrawlInfoReply , "lknown" );
h->m_lastKnownGoodCrawlInfoReply = NULL;
}
if ( m_hosts )
mfree ( m_hosts, m_allocSize,"Hostdb" );
if ( m_ips ) mfree ( m_ips , m_numIps * 4, "Hostdb" );

@ -273,6 +273,9 @@ class Host {
char m_inSync ;
char m_isPermanentOutOfSync ;
char *m_lastKnownGoodCrawlInfoReply;
char *m_lastKnownGoodCrawlInfoReplyEnd;
// . used by Parms.cpp for broadcasting parm change requests
// . each parm change request has an id
// . this lets us know which id is in progress and what the last
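For orientation, here is a minimal standalone sketch of the caching members these two Hostdb hunks add and of the cleanup pattern in reset(): the reply is kept as a raw byte range, and its size is recovered as end minus start when it is freed. CachedHost and resetHosts are illustrative stand-ins, not the real Hostdb API, and plain free() stands in for the codebase's mfree(), which also takes the allocation size and a label.

#include <cstdlib>

// simplified stand-in for the two members added to the Host class
struct CachedHost {
    char *m_lastKnownGoodCrawlInfoReply    = nullptr;
    char *m_lastKnownGoodCrawlInfoReplyEnd = nullptr;
};

// mirrors the Hostdb::reset() hunk: free each cached reply, recovering its
// size from the end pointer the way the real code does for mfree()
static void resetHosts ( CachedHost *hosts , long numHosts ) {
    for ( long i = 0 ; i < numHosts ; i++ ) {
        CachedHost *h = &hosts[i];
        if ( ! h->m_lastKnownGoodCrawlInfoReply ) continue;
        long size = h->m_lastKnownGoodCrawlInfoReplyEnd -
                    h->m_lastKnownGoodCrawlInfoReply;
        (void)size;  // plain free() does not need the size; mfree() does
        free ( h->m_lastKnownGoodCrawlInfoReply );
        h->m_lastKnownGoodCrawlInfoReply    = nullptr;
        h->m_lastKnownGoodCrawlInfoReplyEnd = nullptr;
    }
}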

@ -38,6 +38,8 @@ SpiderRequest *g_sreq = NULL;
long g_corruptCount = 0;
static char s_countsAreValid = 1;
/////////////////////////
///////////////////////// SPIDEREC
/////////////////////////
@ -2307,8 +2309,9 @@ bool SpiderColl::addToWaitingTree ( uint64_t spiderTimeMS , long firstIp ,
if ( more < 10 ) more = 10;
if ( more > 100000 ) more = 100000;
long newNum = max + more;
log("spider: growing waiting tree to from %li to %li nodes",
max , newNum );
log("spider: growing waiting tree from %li to %li nodes "
"for collnum %li",
max , newNum , (long)m_collnum );
if ( ! m_waitingTree.growTree ( newNum , MAX_NICENESS ) )
return log("spider: failed to grow waiting tree to "
"add firstip %s",iptoa(firstIp) );
@ -3145,7 +3148,9 @@ bool SpiderColl::readListFromSpiderdb ( ) {
// sanity check
long wn = m_waitingTree.getNode(0,(char *)&m_waitingTreeKey);
if ( wn < 0 ) {
log("spider: waiting tree key removed while reading list");
log("spider: waiting tree key removed while reading list "
"for %s (%li)",
cr->m_coll,(long)m_collnum);
return true;
}
// sanity. if first time, this must be invalid
@ -4602,8 +4607,8 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
sc->m_waitingTreeNeedsRebuild = true;
log(LOG_INFO,
"spider: hit spider queue "
"rebuild timeout for %s",
cr->m_coll);
"rebuild timeout for %s (%li)",
cr->m_coll,(long)cr->m_collnum);
// flush the ufn table
//clearUfnTable();
}
@ -4853,6 +4858,9 @@ void SpiderLoop::spiderDoledUrls ( ) {
// must be spidering to dole out
if ( ! g_conf.m_spideringEnabled ) return;
// if we don't have all the url counts from all hosts, then wait.
// one host is probably down and was never up to begin with
if ( ! s_countsAreValid ) return;
//if ( ! g_conf.m_webSpideringEnabled ) return;
// if we do not overlap ourselves
if ( m_gettingDoledbList ) return;
@ -4927,6 +4935,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
cr->m_spiderRoundNum >= cr->m_maxCrawlRounds ) {
cr->m_spiderStatus = SP_MAXROUNDS;
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = false;
cr->m_needsSave = true;
continue;
}
@ -4939,6 +4948,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
// then the send email code will be called.
// do it this way for code simplicity.
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = false;
cr->m_needsSave = true;
continue;
}
@ -4948,6 +4958,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
cr->m_maxToProcess ) {
cr->m_spiderStatus = SP_MAXTOPROCESS;
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = false;
cr->m_needsSave = true;
continue;
}
@ -5735,7 +5746,8 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// if we thought we were done, note it if something comes back up
if ( ! ci->m_hasUrlsReadyToSpider )
log("spider: got a reviving url to crawl %s",sreq->m_url);
log("spider: got a reviving url for coll %s (%li) to crawl %s",
cr->m_coll,(long)cr->m_collnum,sreq->m_url);
// there are urls ready to spider
ci->m_hasUrlsReadyToSpider = true;
@ -11040,6 +11052,7 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot);
static long s_requests = 0;
static long s_replies = 0;
static long s_validReplies = 0;
static bool s_inUse = false;
// . just call this once per second for all collections
@ -11137,21 +11150,48 @@ void spiderRoundIncremented ( CollectionRec *cr ) {
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = true;
cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = true;
cr->m_localCrawlInfo.m_pageDownloadSuccessesThisRound = 0;
cr->m_localCrawlInfo.m_pageProcessSuccessesThisRound = 0;
cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound = 0;
cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound = 0;
cr->m_needsSave = true;
}
void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// reply is error?
// loop over each LOCAL crawlinfo we received from this host
CrawlInfo *ptr = (CrawlInfo *)(slot->m_readBuf);
CrawlInfo *end = (CrawlInfo *)(slot->m_readBuf+ slot->m_readBufSize);
// host sending us this reply
Host *h = slot->m_host;
// assume it is a valid reply, not an error, like a udptimedout
s_validReplies++;
// reply is error? then use the last known good reply we had from him
if ( ! slot->m_readBuf || g_errno ) {
log("spider: got crawlinfo reply error: %s",
mstrerror(g_errno));
// just clear it
g_errno = 0;
// just use his last known good reply
ptr = (CrawlInfo *)h->m_lastKnownGoodCrawlInfoReply;
end = (CrawlInfo *)h->m_lastKnownGoodCrawlInfoReplyEnd;
// if never had any reply... can't be valid then
if ( ! ptr ) s_validReplies--;
}
// otherwise, if reply was good it is the last known good now!
else {
// free the old good
long size =
h->m_lastKnownGoodCrawlInfoReplyEnd -
h->m_lastKnownGoodCrawlInfoReply;
mfree ( h->m_lastKnownGoodCrawlInfoReply , size , "lknown");
// add in the new good in case he goes down in the future
h->m_lastKnownGoodCrawlInfoReply = (char *)ptr;
h->m_lastKnownGoodCrawlInfoReplyEnd = (char *)end;
}
// inc it
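Condensed into one place, the fallback this hunk implements looks roughly like the sketch below: on a bad or missing reply, substitute the host's last known good reply (and do not count the host as having answered validly if there has never been one); on a good reply, free the previously cached buffer and keep the new one. HostStub and pickCrawlInfoRange are illustrative names, error handling is reduced to a bool where the real code checks g_errno and slot->m_readBuf, and the sketch assumes ownership of a heap-allocated readBuf, mirroring how the commit caches the slot's read buffer, with free() standing in for mfree().

#include <cstdlib>

struct HostStub {
    char *m_lastKnownGoodCrawlInfoReply    = nullptr;
    char *m_lastKnownGoodCrawlInfoReplyEnd = nullptr;
};

// returns false if this host contributed no usable crawl info at all
static bool pickCrawlInfoRange ( HostStub *h ,
                                 char *readBuf , long readBufSize , bool error ,
                                 char **ptrOut , char **endOut ) {
    if ( error || ! readBuf ) {
        // bad reply: fall back to the last known good one, if any
        *ptrOut = h->m_lastKnownGoodCrawlInfoReply;
        *endOut = h->m_lastKnownGoodCrawlInfoReplyEnd;
        return *ptrOut != nullptr;
    }
    // good reply: it becomes the last known good in case the host dies later
    free ( h->m_lastKnownGoodCrawlInfoReply );     // free(nullptr) is a no-op
    h->m_lastKnownGoodCrawlInfoReply    = readBuf;
    h->m_lastKnownGoodCrawlInfoReplyEnd = readBuf + readBufSize;
    *ptrOut = readBuf;
    *endOut = readBuf + readBufSize;
    return true;
}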
@ -11159,13 +11199,20 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
if ( s_replies > s_requests ) { char *xx=NULL;*xx=0; }
// crap, if any host is dead and not reporting its number then
// that seriously fucks us up because our global count will drop
// and something that had hit a max limit, like maxToCrawl, will
// now be under the limit and the crawl will resume.
// what's the best way to fix this?
//
// perhaps, let's just keep the dead host's counts the same
// as the last time we got them. or maybe the simplest way is to
// just not allow spidering if a host is dead
// the sendbuf should never be freed! it points into collrec
slot->m_sendBufAlloc = NULL;
// loop over each LOCAL crawlinfo we received from this host
CrawlInfo *ptr = (CrawlInfo *)(slot->m_readBuf);
CrawlInfo *end = (CrawlInfo *)(slot->m_readBuf+ slot->m_readBufSize);
/////
// SCAN the list of CrawlInfos we received from this host,
// one for each non-null collection
@ -11239,6 +11286,22 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// if not the last reply, skip this part
if ( s_replies < s_requests ) continue;
// if it's the last reply we are to receive, and 1 or more
// hosts did not have a valid reply, and not even a
// "last known good reply", then we can't do
// much, so do not spider, because our counts could be
// way off and cause us to start spidering again even though
// we hit a maxtocrawl limit!!!!!
if ( s_validReplies < s_replies ) {
// this will tell us to halt all spidering
// because a host is essentially down!
s_countsAreValid = false;
// might as well stop the loop here since we are
// not updating our crawlinfo states.
break;
}
// revival?
//if ( cr->m_tmpCrawlInfo.m_hasUrlsReadyToSpider &&
// ! cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
@ -11246,7 +11309,7 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// cr->m_tmpCrawlInfo.m_hasUrlsReadyToSpider);
//}
bool has = cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider;
//bool has = cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider;
// now copy over to global crawl info so things are not
// half-assed should we try to read globalcrawlinfo
@ -11257,8 +11320,8 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// turn not assume we are out of urls just yet if a host
// in the network has not reported...
if ( g_hostdb.hasDeadHost() && has )
cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = true;
//if ( g_hostdb.hasDeadHost() && has )
// cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = true;
// should we reset our "sent email" flag?
@ -11349,6 +11412,7 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// initialize
s_replies = 0;
s_requests = 0;
s_validReplies = 0;
s_inUse = false;
}
@ -11459,6 +11523,8 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
cr->m_coll);
// assume our crawl on this host is completed i guess
ci->m_hasUrlsReadyToSpider = 0;
// save that!
cr->m_needsSave = true;
}
// save it
@ -11577,6 +11643,16 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , long *status ) {
"maintenance.");
}
// our CollectionRec::m_globalCrawlInfo counts do not have a dead
// host's counts tallied into them, which could make a difference on
// whether we have exceeded a maxtocrawl limit or some such, so wait...
if ( ! s_countsAreValid ) {
*status = SP_ADMIN_PAUSED;
return msg->safePrintf("All crawling temporarily paused "
"because a shard is down.");
}
// if spiderdb is empty for this coll, then no url
// has been added to spiderdb yet.. either seed or spot
//CrawlInfo *cg = &cx->m_globalCrawlInfo;

@ -13896,6 +13896,7 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
// count it for stats
cr->m_localCrawlInfo.m_pageProcessAttempts++;
cr->m_globalCrawlInfo.m_pageProcessAttempts++;
cr->m_needsSave = true;
char *additionalHeaders = NULL;
if ( headers.length() > 0 )
@ -14405,6 +14406,7 @@ char **XmlDoc::gotHttpReply ( ) {
cr->m_localCrawlInfo.m_pageDownloadSuccessesThisRound++;
cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound++;
m_incrementedDownloadCount = true;
cr->m_needsSave = true;
}
// this means the spider compression proxy's reply got corrupted
@ -19279,6 +19281,7 @@ long *XmlDoc::nukeJSONObjects ( ) {
// count as deleted
cr->m_localCrawlInfo.m_objectsDeleted++;
cr->m_globalCrawlInfo.m_objectsDeleted++;
cr->m_needsSave = true;
// but gotta set this crap back
//log("diffbot: resetting %s",m_dx->m_firstUrl.m_url);
// clear for next guy if there is one. clears
@ -20202,6 +20205,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// count as added
cr->m_localCrawlInfo.m_objectsAdded++;
cr->m_globalCrawlInfo.m_objectsAdded++;
cr->m_needsSave = true;
// we successfully indexed the json object, skip to next one
m_diffbotObj += gbstrlen(m_diffbotObj) + 1;
// but gotta set this crap back
@ -22831,6 +22835,7 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
// this is just how many urls we tried to index
cr->m_localCrawlInfo.m_urlsHarvested += numAdded;
cr->m_globalCrawlInfo.m_urlsHarvested += numAdded;
cr->m_needsSave = true;
// save it
m_numOutlinksAdded = numAdded;
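Finally, the many `cr->m_needsSave = true;` additions across Spider.cpp and XmlDoc.cpp all follow the same dirty-flag pattern: every mutation of the crawl-info counters marks the collection record dirty so a later save pass persists it, instead of the change being lost on restart. A generic sketch of that pattern, with Record, bumpCounter, and saveDirtyRecords as illustrative names rather than the CollectionRec API:

#include <cstdio>
#include <vector>

struct Record {
    long m_pageDownloadSuccesses = 0;
    bool m_needsSave             = false;   // dirty flag
};

// every mutation marks the record dirty so the change cannot be lost
static void bumpCounter ( Record &r ) {
    r.m_pageDownloadSuccesses++;
    r.m_needsSave = true;
}

// a periodic pass persists only the records that actually changed
static void saveDirtyRecords ( std::vector<Record> &recs ) {
    for ( Record &r : recs ) {
        if ( ! r.m_needsSave ) continue;
        // stand-in for writing the record to disk
        printf ( "saving record with %ld successes\n",
                 r.m_pageDownloadSuccesses );
        r.m_needsSave = false;
    }
}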