more respidering fixes

Matt Wells 2013-10-23 17:05:56 -07:00
parent 70d7f715df
commit 1b738466c1
3 changed files with 34 additions and 12 deletions

@@ -2649,18 +2649,18 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 //if ( cx->m_collectionNameAlias.length() > 0 )
 // alias=cx->m_collectionNameAlias.getBufStart();
 //long paused = 1;
-char *ss = "Normal";
+char *ss = "In progress.";
 if ( cx->m_spiderStatusMsg )
 ss = cx->m_spiderStatusMsg;
 // 0 means not to RE-crawl
 char tmp[256];
 // indicate if we are WAITING for next round...
 if ( cx->m_collectiveRespiderFrequency > 0.0 &&
-getTimeGlobal() < cr->m_spiderRoundStartTime ) {
+getTimeGlobal() < cx->m_spiderRoundStartTime ) {
 long now = getTimeGlobal();
 sprintf(tmp,"Spidering next round in %li "
 "seconds.",
-cr->m_spiderRoundStartTime - now
+cx->m_spiderRoundStartTime - now
 );
 ss = tmp;
 }
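For reference, the status string this hunk touches is chosen roughly as sketched below. This is an illustrative, standalone C++ sketch, not code from the repository: the CollectionRec field names come from the diff, while the struct layout, spiderStatusLine() and the main() driver are invented for the example.

// Minimal sketch of the status-message selection above (not repository code).
#include <cstdio>
#include <ctime>

struct CollectionRec {                        // hypothetical stand-in
	const char *m_spiderStatusMsg;            // optional override message
	double      m_collectiveRespiderFrequency;// > 0 means rounds repeat
	long        m_spiderRoundStartTime;       // unix time next round begins
};

// Pick the status line shown for collection cx.
static const char *spiderStatusLine ( CollectionRec *cx , char *tmp , long now ) {
	const char *ss = "In progress.";
	if ( cx->m_spiderStatusMsg ) ss = cx->m_spiderStatusMsg;
	// when respidering is on and the next round has not started yet,
	// show a countdown instead; the diff above switches these reads
	// from cr to cx so the right collection's round time is used
	if ( cx->m_collectiveRespiderFrequency > 0.0 &&
	     now < cx->m_spiderRoundStartTime ) {
		sprintf ( tmp , "Spidering next round in %li seconds." ,
		          cx->m_spiderRoundStartTime - now );
		ss = tmp;
	}
	return ss;
}

int main ( ) {
	char tmp[256];
	long now = (long)time(NULL);
	CollectionRec cx = { NULL , 1.0 , now + 90 };
	printf ( "%s\n" , spiderStatusLine ( &cx , tmp , now ) );
	return 0;
}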
@@ -4109,7 +4109,10 @@ bool resetUrlFilters ( CollectionRec *cr ) {
 // if collectiverespiderfreq is 0 or less then do not RE-spider
 // documents already indexed.
 else {
-cr->m_regExs[i].set("isindexed");
+// this does NOT work! error docs continuosly respider
+// because they are never indexed!!! like EDOCSIMPLIFIEDREDIR
+//cr->m_regExs[i].set("isindexed");
+cr->m_regExs[i].set("hasreply");
 cr->m_spiderPriorities [i] = 10;
 // just turn off spidering. if we were to set priority to
 // filtered it would be removed from index!
@@ -4119,14 +4122,14 @@ bool resetUrlFilters ( CollectionRec *cr ) {
 }
 // and for docs that have errors respider once every 5 hours
-cr->m_regExs[i].set("hastmperror && errorcount>0 && errcount<3");
+cr->m_regExs[i].set("errorcount>0 && errcount<3");
 cr->m_spiderPriorities [i] = 40;
 cr->m_spiderFreqs [i] = 0.2; // half a day
 cr->m_spiderDiffbotApiUrl[i].purge();
 i++;
 // excessive errors? (tcp/dns timed out, etc.) retry once per month?
-cr->m_regExs[i].set("hastmperror && errorcount>=3");
+cr->m_regExs[i].set("errorcount>=3");
 cr->m_spiderPriorities [i] = 30;
 cr->m_spiderFreqs [i] = 30; // 30 days
 cr->m_spiderDiffbotApiUrl[i].purge();
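As a reading aid, each url-filter row configured here pairs an expression with a spider priority and a respider frequency in days, so the 0.2 above works out to 4.8 hours, matching the "once every 5 hours" comment rather than the inline "half a day" note. Below is an illustrative sketch of the two error-retry rows as plain data; it is not repository code, and UrlFilterRow is an invented name.

// Illustrative only: the two error-retry rows from resetUrlFilters as data.
#include <cstdio>

struct UrlFilterRow {
	const char *expr;      // expression matched by getUrlFilterNum2()
	long        priority;  // spider priority for matching urls
	double      freqDays;  // respider frequency, in days
};

int main ( ) {
	UrlFilterRow rows[] = {
		// after this commit the "hastmperror &&" prefix is gone,
		// so these rows match on error count alone
		{ "errorcount>0 && errcount<3" , 40 , 0.2  },  // ~4.8 hours
		{ "errorcount>=3"              , 30 , 30.0 },  // about a month
	};
	for ( int i = 0 ; i < 2 ; i++ )
		printf ( "%-30s pri=%2li respider every %.1f hours\n" ,
		         rows[i].expr , rows[i].priority ,
		         rows[i].freqDays * 24.0 );
	return 0;
}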

@@ -697,12 +697,10 @@ bool Parms::sendPageGeneric ( TcpSocket *s , HttpRequest *r , long page ,
 "it from."
 "</td></tr>"
-"<tr><td>isnew | !isnew</td>"
+"<tr><td>hasreply | !hasreply</td>"
 "<td>"
-"This is true if we have never tried to spider "
-"this url. If we have tried to spider it and "
-"received an error, like a timeout or something, "
-"then it will no longer match <i>isnew</i>."
+"This is true if we have tried to spider "
+"this url, even if we got an error while trying."
 "</td></tr>"

@@ -4893,6 +4893,10 @@ bool SpiderLoop::gotDoledbList2 ( ) {
 // there are urls ready to spider
 ci->m_hasUrlsReadyToSpider = true;
+// reset reason why crawl is not running, because we basically are now
+cr->m_spiderStatus = 0;
+cr->m_spiderStatusMsg = NULL;
+// be sure to save state so we do not re-send emails
 cr->m_needsSave = 1;
@@ -8361,6 +8365,24 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
 goto checkNextRule;
 }
+if ( *p=='h' && strncmp(p,"hasreply",8) == 0 ) {
+// if we do not have enough info for outlink, all done
+if ( isOutlink ) return -1;
+// skip for msg20
+if ( isForMsg20 ) continue;
+// if we got a reply, we are not new!!
+if ( (bool)srep == (bool)val ) continue;
+// skip it for speed
+p += 8;
+// check for &&
+p = strstr(p, "&&");
+// if nothing, else then it is a match
+if ( ! p ) return i;
+// skip the '&&' and go to next rule
+p += 2;
+goto checkNextRule;
+}
 // hastmperror, if while spidering, the last reply was
 // like EDNSTIMEDOUT or ETCPTIMEDOUT or some kind of
 // usually temporary condition that warrants a retry
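The new hasreply branch follows the same hand-rolled scanning style as the neighbouring tokens: consume the token, and if no "&&" follows, the whole rule i matches (return i); otherwise skip past the "&&" and evaluate the next clause. The sketch below shows that pattern in isolation for just hasreply/!hasreply. It is illustrative only: exprMatches() and its simplifications are not the repository's parser, which also consults the SpiderReply record and several request flags.

// Simplified sketch of the token-scanning pattern used above (not repo code).
#include <cstring>
#include <cstdio>

// Returns true if the expression (e.g. "hasreply && errorcount>=3") matches,
// given whether the url already has a spider reply. Only the hasreply /
// !hasreply token is understood; any clause it does not recognize is treated
// as matching, to keep the sketch short.
static bool exprMatches ( const char *expr , bool hasReply ) {
	const char *p = expr;
	while ( *p ) {
		while ( *p == ' ' ) p++;              // skip spaces
		bool negate = false;
		if ( *p == '!' ) { negate = true; p++; }
		if ( strncmp ( p , "hasreply" , 8 ) == 0 ) {
			bool want = ! negate;
			if ( hasReply != want ) return false; // clause failed
			p += 8;                               // consume the token
		}
		// check for && ; if there is none the whole expression matched
		p = strstr ( p , "&&" );
		if ( ! p ) return true;
		p += 2;                                   // skip the '&&'
	}
	return true;
}

int main ( ) {
	printf ( "%d\n" , exprMatches ( "hasreply"  , true  ) ); // 1
	printf ( "%d\n" , exprMatches ( "hasreply"  , false ) ); // 0
	printf ( "%d\n" , exprMatches ( "!hasreply" , false ) ); // 1
	return 0;
}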
@@ -8841,7 +8863,6 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
 p += 2;
 goto checkNextRule;
 }
 // iswww, means url is like www.xyz.com/...
 if ( strncmp(p,"iswww", 5) == 0 ) {
 // now this is a bit