fix round-based spidering some more

Matt Wells
2014-01-23 15:03:37 -08:00
parent edb01b0abb
commit dd663eb9f7
3 changed files with 62 additions and 16 deletions

@@ -18266,11 +18266,9 @@ void handleRequest3fLoop ( void *weArg ) {
// reset page round counts
if ( we->m_updatedRound && cx ) {
log("parms: clearing this round page counts");
cx->m_localCrawlInfo.m_pageDownloadSuccessesThisRound = 0;
cx->m_localCrawlInfo.m_pageProcessSuccessesThisRound = 0;
cx->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound = 0;
cx->m_globalCrawlInfo.m_pageProcessSuccessesThisRound = 0;
// Spider.cpp will reset the *ThisRound page counts and
// the sent notification flag
spiderRoundIncremented ( cx );
}
// basically resetting the spider here...
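For context, a minimal sketch (handler and parameter names assumed, not from the tree) of the call flow this hunk sets up: the parm-update path no longer pokes the four *ThisRound counters itself but delegates to spiderRoundIncremented(), so every host resets the same state the same way.

class CollectionRec;                          // from the tree
void spiderRoundIncremented(CollectionRec *); // added by this commit (Spider.cpp)

// Hedged sketch, not the real Parms.cpp handler. "updatedRound" and "cx"
// stand in for we->m_updatedRound and the collection record above.
void onRoundParmUpdated(bool updatedRound, CollectionRec *cx) {
    if (!updatedRound || !cx) return;
    // one shared reset path instead of four scattered counter writes
    spiderRoundIncremented(cx);
}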

@@ -4685,6 +4685,9 @@ void doneSendingNotification ( void *state ) {
if ( cr->m_spiderStatus == SP_MAXTOCRAWL && respiderFreq <= 0.0)return;
if ( cr->m_spiderStatus == SP_MAXTOPROCESS && respiderFreq<=0.0)return;
// if we hit the max crawl rounds, then stop!!! do not
// increment the round...
if ( cr->m_spiderRoundNum >= cr->m_maxCrawlRounds ) return;
// this should have been set below
//if ( cr->m_spiderRoundStartTime == 0 ) { char *xx=NULL;*xx=0; }
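The order of these early returns matters: the respider-frequency checks keep a repeating crawl from stopping at a max-to-crawl/process cap, and the new max-rounds check fires before the round number is ever incremented. A sketch of that gate (SP_* values assumed; this diff only shows SP_ADMIN_PAUSED = 8 and SP_COMPLETED = 9):

// Sketch only: mirrors the guard order in doneSendingNotification().
enum { SK_MAXTOCRAWL = 1, SK_MAXTOPROCESS = 2 }; // values assumed

bool mayIncrementRound(int status, double respiderFreq,
                       long roundNum, long maxCrawlRounds) {
    // a non-repeating crawl that hit a crawl/process cap stays stopped
    if (status == SK_MAXTOCRAWL  && respiderFreq <= 0.0) return false;
    if (status == SK_MAXTOPROCESS && respiderFreq <= 0.0) return false;
    // the new check: never start a round past the configured cap
    if (roundNum >= maxCrawlRounds) return false;
    return true;
}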
@@ -4716,7 +4719,7 @@ void doneSendingNotification ( void *state ) {
//cr->m_spiderRoundStartTime += respiderFreq;
char roundTime[128];
sprintf(roundTime,"%lu", (long)(getTimeGlobal() + seconds));
// roundNum++ round++
char roundStr[128];
sprintf(roundStr,"%li", cr->m_spiderRoundNum + 1);
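The handoff itself is just two parm strings: the next round's start time (now plus the respider wait) and the incremented round number. A bounded snprintf version of the same formatting, with time(NULL) standing in for getTimeGlobal() (an assumption for the sketch):

#include <cstdio>
#include <ctime>

// Sketch: format the next round start time and round number the way
// the hunk does, but with explicit buffer bounds.
void formatRoundParms(long seconds, long roundNum,
                      char roundTime[128], char roundStr[128]) {
    long now = (long)time(NULL);  // getTimeGlobal() in the tree
    snprintf(roundTime, 128, "%lu", (unsigned long)(now + seconds));
    snprintf(roundStr,  128, "%li", roundNum + 1);
}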
@@ -4755,11 +4758,11 @@ bool sendNotificationForCollRec ( CollectionRec *cr ) {
// do not send email for maxrounds hit, it will send a round-done
// email for that. otherwise we end up calling doneSendingEmail()
// twice and incrementing the round twice
if ( cr->m_spiderStatus == SP_MAXROUNDS ) {
log("spider: not sending email for max rounds limit "
"since already sent for round done.");
return true;
}
//if ( cr->m_spiderStatus == SP_MAXROUNDS ) {
// log("spider: not sending email for max rounds limit "
// "since already sent for round done.");
// return true;
//}
// wtf? caller must set this
if ( ! cr->m_spiderStatus ) { char *xx=NULL; *xx=0; }
@@ -11101,6 +11104,32 @@ void updateAllCrawlInfosSleepWrapper ( int fd , void *state ) {
//return true;
}
// . Parms.cpp calls this when it receives our "spiderRoundNum" increment above
// . all hosts should get it at *about* the same time
void spiderRoundIncremented ( CollectionRec *cr ) {
log("spider: incrementing spider round for coll %s to %li (%lu)",
cr->m_coll,cr->m_spiderRoundNum,cr->m_spiderRoundStartTime);
// . need to send a notification for this round
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
// . if we set sentCrawlDoneAlert to 0 it will immediately
// trigger another round increment!! so we have to set these
// to true to prevent that.
// . if we learn that there really are no more urls ready to spider
// then we'll go to the next round. but that can take like
// SPIDER_DONE_TIMER seconds of getting nothing.
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = true;
cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = true;
cr->m_localCrawlInfo.m_pageDownloadSuccessesThisRound = 0;
cr->m_localCrawlInfo.m_pageProcessSuccessesThisRound = 0;
cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound = 0;
cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound = 0;
}
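The two hasUrlsReadyToSpider assignments are the subtle part: with the done-alert cleared, leaving either "urls ready" flag false would make the round-done check fire again immediately. A toy model of that re-fire condition (struct and predicate are illustrative, not code from the tree):

// Illustrative only: the state spiderRoundIncremented() must leave
// consistent. Once sentCrawlDoneAlert is cleared, the round should only
// be declared done again after hasUrlsReadyToSpider decays back to
// false, which takes ~SPIDER_DONE_TIMER seconds of observed inactivity.
struct RoundFlags {
    bool hasUrlsReadyToSpider;
    char sentCrawlDoneAlert;
};

bool roundDoneWouldRefire(const RoundFlags &f) {
    return !f.hasUrlsReadyToSpider && !f.sentCrawlDoneAlert;
}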
void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// reply is error?
@@ -11246,10 +11275,10 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// update status if not already SP_MAXTOCRAWL, etc. we might
// just be flat out of urls
if ( ! cr->m_spiderStatus ||
cr->m_spiderStatus == SP_INPROGRESS ||
cr->m_spiderStatus == SP_INITIALIZING )
cr->m_spiderStatus = SP_ROUNDDONE;
//if ( ! cr->m_spiderStatus ||
// cr->m_spiderStatus == SP_INPROGRESS ||
// cr->m_spiderStatus == SP_INITIALIZING )
// cr->m_spiderStatus = SP_ROUNDDONE;
// only host #0 sends emails
@@ -11332,6 +11361,8 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
//long now = getTimeGlobal();
SafeBuf replyBuf;
long now = getTimeGlobal();
//SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
@@ -11347,11 +11378,23 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
SpiderColl *sc = cr->m_spiderColl;
/////////
//
// ARE WE DONE SPIDERING?????
//
/////////
// if we haven't spidered anything in 1 min, assume the
// queue is basically empty...
if ( ci->m_lastSpiderAttempt &&
ci->m_lastSpiderCouldLaunch &&
ci->m_hasUrlsReadyToSpider &&
// the next round we are waiting for, if any, must
// have had some time to get urls! otherwise we
// will increment the round # and wait just
// SPIDER_DONE_TIMER seconds and end up setting
// hasUrlsReadyToSpider to false!
now > cr->m_spiderRoundStartTime + SPIDER_DONE_TIMER &&
// no spiders currently out. I've seen a couple out
// waiting for a diffbot reply. wait for them to
// return before ending the round...
@@ -11363,9 +11406,13 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
//cr->m_spideringEnabled &&
//g_conf.m_spideringEnabled &&
ci->m_lastSpiderAttempt - ci->m_lastSpiderCouldLaunch >
(long) SPIDER_DONE_TIMER )
(long) SPIDER_DONE_TIMER ) {
// this is the MOST IMPORTANT variable so note it
log("spider: coll %s has no more urls to spider",
cr->m_coll);
// assume our crawl on this host is completed, I guess
ci->m_hasUrlsReadyToSpider = 0;
}
// save it
replyBuf.safeMemcpy ( ci , sizeof(CrawlInfo) );
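Pulled together, the round-done test this hunk extends reads roughly like the predicate below. It is a sketch under the diff's own names: spidersOut is inferred from the "no spiders currently out" comment, and the SPIDER_DONE_TIMER value is not shown in this diff.

// Sketch of the check in handleRequestc1(): a host reports "no urls
// ready" only once the round has had doneTimer seconds to produce urls,
// no spiders are still in flight, and launch attempts have gone unused
// for more than doneTimer seconds.
bool roundLooksDone(long now, long roundStartTime, long spidersOut,
                    long lastSpiderAttempt, long lastSpiderCouldLaunch,
                    bool hasUrlsReadyToSpider, long doneTimer) {
    if (!lastSpiderAttempt || !lastSpiderCouldLaunch) return false;
    if (!hasUrlsReadyToSpider) return false;              // already done
    if (now <= roundStartTime + doneTimer) return false;  // give the round time
    if (spidersOut > 0) return false;                     // replies pending
    return lastSpiderAttempt - lastSpiderCouldLaunch > doneTimer;
}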

@@ -39,6 +39,7 @@
#define SP_ADMIN_PAUSED 8 // g_conf.m_spideringEnabled = false
#define SP_COMPLETED 9 // crawl is done, and no repeatCrawl is scheduled
void spiderRoundIncremented ( class CollectionRec *cr ) ;
bool testPatterns ( ) ;
bool doesStringContainPattern ( char *content , char *pattern ) ;