more respidering fixes

Matt Wells 2013-10-23 17:05:56 -07:00
parent 70d7f715df
commit 1b738466c1
3 changed files with 34 additions and 12 deletions

@@ -2649,18 +2649,18 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 //if ( cx->m_collectionNameAlias.length() > 0 )
 // alias=cx->m_collectionNameAlias.getBufStart();
 //long paused = 1;
-char *ss = "Normal";
+char *ss = "In progress.";
 if ( cx->m_spiderStatusMsg )
 ss = cx->m_spiderStatusMsg;
 // 0 means not to RE-crawl
 char tmp[256];
 // indicate if we are WAITING for next round...
 if ( cx->m_collectiveRespiderFrequency > 0.0 &&
-getTimeGlobal() < cr->m_spiderRoundStartTime ) {
+getTimeGlobal() < cx->m_spiderRoundStartTime ) {
 long now = getTimeGlobal();
 sprintf(tmp,"Spidering next round in %li "
 "seconds.",
-cr->m_spiderRoundStartTime - now
+cx->m_spiderRoundStartTime - now
 );
 ss = tmp;
 }
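For reference, the status string this hunk touches is chosen roughly as sketched below. This is an illustrative, standalone C++ sketch, not code from the repository: the CollectionRec field names come from the diff, while the struct layout, spiderStatusLine() and the main() driver are invented for the example.

// Minimal sketch of the status-message selection above (not repository code).
#include <cstdio>
#include <ctime>

struct CollectionRec {                        // hypothetical stand-in
	const char *m_spiderStatusMsg;            // optional override message
	double      m_collectiveRespiderFrequency;// > 0 means rounds repeat
	long        m_spiderRoundStartTime;       // unix time next round begins
};

// Pick the status line shown for collection cx.
static const char *spiderStatusLine ( CollectionRec *cx , char *tmp , long now ) {
	const char *ss = "In progress.";
	if ( cx->m_spiderStatusMsg ) ss = cx->m_spiderStatusMsg;
	// when respidering is on and the next round has not started yet,
	// show a countdown instead; the diff above switches these reads
	// from cr to cx so the right collection's round time is used
	if ( cx->m_collectiveRespiderFrequency > 0.0 &&
	     now < cx->m_spiderRoundStartTime ) {
		sprintf ( tmp , "Spidering next round in %li seconds." ,
		          cx->m_spiderRoundStartTime - now );
		ss = tmp;
	}
	return ss;
}

int main ( ) {
	char tmp[256];
	long now = (long)time(NULL);
	CollectionRec cx = { NULL , 1.0 , now + 90 };
	printf ( "%s\n" , spiderStatusLine ( &cx , tmp , now ) );
	return 0;
}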
@@ -4109,7 +4109,10 @@ bool resetUrlFilters ( CollectionRec *cr ) {
 // if collectiverespiderfreq is 0 or less then do not RE-spider
 // documents already indexed.
 else {
-cr->m_regExs[i].set("isindexed");
+// this does NOT work! error docs continuosly respider
+// because they are never indexed!!! like EDOCSIMPLIFIEDREDIR
+//cr->m_regExs[i].set("isindexed");
+cr->m_regExs[i].set("hasreply");
 cr->m_spiderPriorities [i] = 10;
 // just turn off spidering. if we were to set priority to
 // filtered it would be removed from index!
@@ -4119,14 +4122,14 @@ bool resetUrlFilters ( CollectionRec *cr ) {
 }
 // and for docs that have errors respider once every 5 hours
-cr->m_regExs[i].set("hastmperror && errorcount>0 && errcount<3");
+cr->m_regExs[i].set("errorcount>0 && errcount<3");
 cr->m_spiderPriorities [i] = 40;
 cr->m_spiderFreqs [i] = 0.2; // half a day
 cr->m_spiderDiffbotApiUrl[i].purge();
 i++;
 // excessive errors? (tcp/dns timed out, etc.) retry once per month?
-cr->m_regExs[i].set("hastmperror && errorcount>=3");
+cr->m_regExs[i].set("errorcount>=3");
 cr->m_spiderPriorities [i] = 30;
 cr->m_spiderFreqs [i] = 30; // 30 days
 cr->m_spiderDiffbotApiUrl[i].purge();
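As a reading aid, each url-filter row configured here pairs an expression with a spider priority and a respider frequency in days, so the 0.2 above works out to 4.8 hours, matching the "once every 5 hours" comment rather than the inline "half a day" note. Below is an illustrative sketch of the two error-retry rows as plain data; it is not repository code, and UrlFilterRow is an invented name.

// Illustrative only: the two error-retry rows from resetUrlFilters as data.
#include <cstdio>

struct UrlFilterRow {
	const char *expr;      // expression matched by getUrlFilterNum2()
	long        priority;  // spider priority for matching urls
	double      freqDays;  // respider frequency, in days
};

int main ( ) {
	UrlFilterRow rows[] = {
		// after this commit the "hastmperror &&" prefix is gone,
		// so these rows match on error count alone
		{ "errorcount>0 && errcount<3" , 40 , 0.2  },  // ~4.8 hours
		{ "errorcount>=3"              , 30 , 30.0 },  // about a month
	};
	for ( int i = 0 ; i < 2 ; i++ )
		printf ( "%-30s pri=%2li respider every %.1f hours\n" ,
		         rows[i].expr , rows[i].priority ,
		         rows[i].freqDays * 24.0 );
	return 0;
}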

@@ -697,12 +697,10 @@ bool Parms::sendPageGeneric ( TcpSocket *s , HttpRequest *r , long page ,
 "it from."
 "</td></tr>"
-"<tr><td>isnew | !isnew</td>"
+"<tr><td>hasreply | !hasreply</td>"
 "<td>"
-"This is true if we have never tried to spider "
-"this url. If we have tried to spider it and "
-"received an error, like a timeout or something, "
-"then it will no longer match <i>isnew</i>."
+"This is true if we have tried to spider "
+"this url, even if we got an error while trying."
 "</td></tr>"

@@ -4893,6 +4893,10 @@ bool SpiderLoop::gotDoledbList2 ( ) {
 // there are urls ready to spider
 ci->m_hasUrlsReadyToSpider = true;
+// reset reason why crawl is not running, because we basically are now
+cr->m_spiderStatus = 0;
+cr->m_spiderStatusMsg = NULL;
+// be sure to save state so we do not re-send emails
 cr->m_needsSave = 1;
@@ -8361,6 +8365,24 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
 goto checkNextRule;
 }
+if ( *p=='h' && strncmp(p,"hasreply",8) == 0 ) {
+// if we do not have enough info for outlink, all done
+if ( isOutlink ) return -1;
+// skip for msg20
+if ( isForMsg20 ) continue;
+// if we got a reply, we are not new!!
+if ( (bool)srep == (bool)val ) continue;
+// skip it for speed
+p += 8;
+// check for &&
+p = strstr(p, "&&");
+// if nothing, else then it is a match
+if ( ! p ) return i;
+// skip the '&&' and go to next rule
+p += 2;
+goto checkNextRule;
+}
 // hastmperror, if while spidering, the last reply was
 // like EDNSTIMEDOUT or ETCPTIMEDOUT or some kind of
 // usually temporary condition that warrants a retry
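The new hasreply branch follows the same hand-rolled scanning style as the neighbouring tokens: consume the token, and if no "&&" follows, the whole rule i matches (return i); otherwise skip past the "&&" and evaluate the next clause. The sketch below shows that pattern in isolation for just hasreply/!hasreply. It is illustrative only: exprMatches() and its simplifications are not the repository's parser, which also consults the SpiderReply record and several request flags.

// Simplified sketch of the token-scanning pattern used above (not repo code).
#include <cstring>
#include <cstdio>

// Returns true if the expression (e.g. "hasreply && errorcount>=3") matches,
// given whether the url already has a spider reply. Only the hasreply /
// !hasreply token is understood; any clause it does not recognize is treated
// as matching, to keep the sketch short.
static bool exprMatches ( const char *expr , bool hasReply ) {
	const char *p = expr;
	while ( *p ) {
		while ( *p == ' ' ) p++;              // skip spaces
		bool negate = false;
		if ( *p == '!' ) { negate = true; p++; }
		if ( strncmp ( p , "hasreply" , 8 ) == 0 ) {
			bool want = ! negate;
			if ( hasReply != want ) return false; // clause failed
			p += 8;                               // consume the token
		}
		// check for && ; if there is none the whole expression matched
		p = strstr ( p , "&&" );
		if ( ! p ) return true;
		p += 2;                                   // skip the '&&'
	}
	return true;
}

int main ( ) {
	printf ( "%d\n" , exprMatches ( "hasreply"  , true  ) ); // 1
	printf ( "%d\n" , exprMatches ( "hasreply"  , false ) ); // 0
	printf ( "%d\n" , exprMatches ( "!hasreply" , false ) ); // 1
	return 0;
}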
@@ -8841,7 +8863,6 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
 p += 2;
 goto checkNextRule;
 }
 // iswww, means url is like www.xyz.com/...
 if ( strncmp(p,"iswww", 5) == 0 ) {
 // now this is a bit