fix critical spider issue of an IP corking an entire spider priority.
also exit faster on 'save & exit' if in evalIpLoop().
Matt Wells 2016-01-23 10:32:14 -08:00
parent 7e13b147e4
commit 4c7f969988
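
The heart of the change, in one picture: when a doledb record's firstIp already has the maximum spiders out, the old code bailed out of the entire priority (the old "goto hitMax"), while the new code skips just that one record and keeps scanning. A minimal sketch of the before/after control flow; the names (Rec, ipAtLimit, launchSpider) are hypothetical stand-ins for the real Spider.cpp machinery, not its API:

#include <cstdint>
#include <vector>

struct Rec { int32_t firstIp; };                          // stand-in for a doledb rec
static bool ipAtLimit    ( int32_t )     { return false; } // stub: ip at max spiders?
static void launchSpider ( const Rec & ) { }               // stub launcher

void scanPriority ( const std::vector<Rec> &list ) {
        for ( const Rec &r : list ) {
                // BEFORE: the first saturated ip aborted the whole priority:
                //     if ( ipAtLimit(r.firstIp) ) return; // "goto hitMax"
                // AFTER: skip just this rec and keep scanning the list
                if ( ipAtLimit ( r.firstIp ) ) continue;
                launchSpider ( r );
        }
}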

@@ -3198,6 +3198,13 @@ void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
return;
}
// are we trying to exit? some firstip lists can be quite long, so
// terminate here so all threads can return and we can exit properly
if ( g_process.m_mode == EXIT_MODE ) {
m_isPopulatingDoledb = false;
return;
}
// . get next IP that is due to be spidered from the waiting tree
// . also sets m_waitingTreeKey so we can delete it easily!
int32_t ip = getNextIpFromWaitingTree();
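
Both new EXIT_MODE checks are the same idiom: poll a process-wide shutdown flag at the top of a potentially long pass so 'save & exit' does not have to wait for a full firstip scan to finish. A generic sketch of the idiom; the atomic flag and doOneIp are assumptions standing in for g_process.m_mode and the real per-ip work:

#include <atomic>

static std::atomic<bool> g_exiting{false};      // set by the save-&-exit handler
static bool doOneIp ( ) { return false; }       // stub per-ip work unit

bool populateLoop ( ) {
        for ( ;; ) {
                // bail early so all threads can unwind and the process
                // can save its state and exit promptly
                if ( g_exiting.load() ) return false;
                if ( ! doOneIp() ) break;
        }
        return true;
}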
@@ -3564,6 +3571,10 @@ bool SpiderColl::evalIpLoop ( ) {
// sanity
if ( m_scanningIp == 0 || m_scanningIp == -1 ) { char *xx=NULL;*xx=0;}
// are we trying to exit? some firstip lists can be quite long, so
// terminate here so all threads can return and we can exit properly
if ( g_process.m_mode == EXIT_MODE ) return true;
// if this ip is in the winnerlistcache use that. it saves
// us a lot of time.
key_t cacheKey;
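
The winnerlistcache mentioned above is the usual check-the-cache-before-recomputing pattern, keyed on the ip being scanned. A rough sketch with std::unordered_map standing in for the real RdbCache; all names here are illustrative, not Gigablast's:

#include <cstdint>
#include <unordered_map>
#include <vector>

static bool useWinners      ( const std::vector<int64_t> & ) { return true; } // stub
static bool scanFirstIpList ( int32_t )                      { return true; } // stub

// stand-in for the winner-list cache, keyed by firstIp
static std::unordered_map<int32_t, std::vector<int64_t>> s_winnerCache;

bool evalIp ( int32_t ip ) {
        auto it = s_winnerCache.find ( ip );
        // cache hit: reuse the winners and skip the expensive scan
        if ( it != s_winnerCache.end() ) return useWinners ( it->second );
        // cache miss: do the full (possibly very long) firstip list scan
        return scanFirstIpList ( ip );
}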
@@ -7236,7 +7247,14 @@ void SpiderLoop::spiderDoledUrls ( ) {
// "goto listLoop" below if the url we want
// to dole is locked.
// seems like a ton of negative recs
2000 , // minRecSizes
// MDW: let's now read in 50k, not 2k, of doledb
// spiderrequests because often the first one
// has an ip already at its spider limit and then we'd
// just give up on the whole PRIORITY! which
// really freezes the spiders up.
// Also, if a spider request is corrupt in
// doledb it would cork us up too!
50000 , // minRecSizes
true , // includeTree
false , // addToCache
0 , // max cache age
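
Why bumping minRecSizes from 2000 to 50000 matters: a doledb rec is a key plus an embedded SpiderRequest, easily a few hundred bytes once the URL is included, so a 2k read may hold only a handful of recs, quite possibly all for the same saturated firstIp. A back-of-envelope check; the ~300-byte average is an assumption, real sizes vary with URL length:

#include <cstdint>
#include <cstdio>

int main ( ) {
        const int32_t avgRecSize = 300;   // assumed avg doledb rec size
        const int32_t oldRead    = 2000;  // old minRecSizes
        const int32_t newRead    = 50000; // new minRecSizes
        // ~6 recs per read before vs ~166 after, so the skip logic
        // almost always has other ips to fall through to
        printf ( "old: ~%d recs, new: ~%d recs\n",
                 (int)(oldRead / avgRecSize) ,
                 (int)(newRead / avgRecSize) );
        return 0;
}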
@@ -7575,7 +7593,7 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// skip? and re-get another doledb list from next priority...
if ( out >= max ) {
// come here if we hit our ip limit too
hitMax:
// hitMax:
// assume we could have launched a spider
if ( max > 0 ) ci->m_lastSpiderCouldLaunch = nowGlobal;
// this priority is maxed out, try next
@@ -7607,6 +7625,9 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// what is this? a dataless positive key?
if ( m_list.getCurrentRecSize() <= 16 ) { char *xx=NULL;*xx=0; }
int32_t ipOut = 0;
int32_t globalOut = 0;
// get the "spider rec" (SpiderRequest) (embedded in the doledb rec)
SpiderRequest *sreq = (SpiderRequest *)(rec + sizeof(key_t)+4);
// sanity check. check for http(s)://
@@ -7618,20 +7639,19 @@ bool SpiderLoop::gotDoledbList2 ( ) {
log("spider: got corrupt doledb record. ignoring. "
"pls fix!!! cn=%i",(int)m_collnum);
g_corruptCount++;
goto skipDoledbRec;
// skip for now....!! what is causing this???
m_list.skipCurrentRecord();
//m_list.skipCurrentRecord();
// if exhausted -- try another load with m_nextKey set
if ( m_list.isExhausted() ) return true;
//if ( m_list.isExhausted() ) return true;
// otherwise, try the next doledb rec in this list
goto listLoop;
//goto listLoop;
}
// . how many spiders out for this ip now?
// . TODO: count locks in case twin is spidering... but it did not seem
// to work right for some reason
int32_t ipOut = 0;
int32_t globalOut = 0;
for ( int32_t i = 0 ; i <= m_maxUsed ; i++ ) {
// get it
XmlDoc *xd = m_docs[i];
@@ -7648,13 +7668,40 @@ bool SpiderLoop::gotDoledbList2 ( ) {
if ( xd->m_collnum != cr->m_collnum ) continue;
if ( xd->m_sreq.m_firstIp == sreq->m_firstIp ) ipOut++;
}
if ( ipOut >= maxSpidersOutPerIp ) goto hitMax;
// don't give up on this priority, just try the next rec in the list.
// we now read 50k instead of 2k from doledb in order to keep
// one ip from bottle corking the whole priority!!
if ( ipOut >= maxSpidersOutPerIp ) {
// assume we could have launched a spider
if ( maxSpidersOutPerIp > 0 )
ci->m_lastSpiderCouldLaunch = nowGlobal;
//goto hitMax;
skipDoledbRec:
// skip
m_list.skipCurrentRecord();
// if not exhausted try the next doledb rec in this list
if ( ! m_list.isExhausted() ) goto listLoop;
// print a log msg if we corked things up even
// though we read 50k from doledb
static bool s_flag = true;
if ( m_list.m_listSize > 50000 && s_flag ) {
s_flag = false; // clear so we only log this once
log("spider: 50k not big enough");
}
// list is exhausted...
return true;
}
// but if the global count is high, only allow one out per coll so
// at least we don't starve and we don't build up a huge line of
// queued results just sitting there taking up mem and spider
// slots, preventing the crawlbot hourly from passing.
if ( globalOut >= maxSpidersOutPerIp && ipOut >= 1 ) goto hitMax;
if ( globalOut >= maxSpidersOutPerIp && ipOut >= 1 ) {
// assume we could have launched a spider
if ( maxSpidersOutPerIp > 0 )
ci->m_lastSpiderCouldLaunch = nowGlobal;
goto skipDoledbRec;
}
if ( g_conf.m_logDebugSpider )
log("spider: %"INT32" spiders out for %s for %s",