more respidering fixes

parent 70d7f715df
commit 1b738466c1
@@ -2649,18 +2649,18 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
         //if ( cx->m_collectionNameAlias.length() > 0 )
         //      alias=cx->m_collectionNameAlias.getBufStart();
         //long paused = 1;
-        char *ss = "Normal";
+        char *ss = "In progress.";
         if ( cx->m_spiderStatusMsg )
                 ss = cx->m_spiderStatusMsg;
         // 0 means not to RE-crawl
         char tmp[256];
         // indicate if we are WAITING for next round...
         if ( cx->m_collectiveRespiderFrequency > 0.0 &&
-             getTimeGlobal() < cr->m_spiderRoundStartTime ) {
+             getTimeGlobal() < cx->m_spiderRoundStartTime ) {
                 long now = getTimeGlobal();
                 sprintf(tmp,"Spidering next round in %li "
                         "seconds.",
-                        cr->m_spiderRoundStartTime - now
+                        cx->m_spiderRoundStartTime - now
                         );
                 ss = tmp;
         }
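The cr-to-cx changes point the countdown at the collection actually being listed (cx) rather than a different record (cr) that happened to be in scope. For reference, the corrected logic reduces to the standalone sketch below; the parameter names are stand-ins for the cx-> members in the hunk, not the actual printCrawlBotPage2() code.

    #include <cstddef>
    #include <cstdio>
    #include <ctime>

    // Sketch of the "waiting for next round" status string.
    // respiderFreq and roundStart stand in for
    // cx->m_collectiveRespiderFrequency and cx->m_spiderRoundStartTime.
    const char *spiderStatusLabel(double respiderFreq, long roundStart,
                                  char *tmp, std::size_t tmpSize) {
            const char *ss = "In progress.";
            long now = (long)std::time(NULL);
            // only report a countdown when respidering is enabled and
            // the next round has not started yet
            if (respiderFreq > 0.0 && now < roundStart) {
                    std::snprintf(tmp, tmpSize,
                                  "Spidering next round in %li seconds.",
                                  roundStart - now);
                    ss = tmp;
            }
            return ss;
    }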
@@ -4109,7 +4109,10 @@ bool resetUrlFilters ( CollectionRec *cr ) {
         // if collectiverespiderfreq is 0 or less then do not RE-spider
         // documents already indexed.
         else {
-                cr->m_regExs[i].set("isindexed");
+                // this does NOT work! error docs continuosly respider
+                // because they are never indexed!!! like EDOCSIMPLIFIEDREDIR
+                //cr->m_regExs[i].set("isindexed");
+                cr->m_regExs[i].set("hasreply");
                 cr->m_spiderPriorities [i] = 10;
                 // just turn off spidering. if we were to set priority to
                 // filtered it would be removed from index!

@@ -4119,14 +4122,14 @@ bool resetUrlFilters ( CollectionRec *cr ) {
         }

         // and for docs that have errors respider once every 5 hours
-        cr->m_regExs[i].set("hastmperror && errorcount>0 && errcount<3");
+        cr->m_regExs[i].set("errorcount>0 && errcount<3");
         cr->m_spiderPriorities [i] = 40;
         cr->m_spiderFreqs [i] = 0.2; // half a day
         cr->m_spiderDiffbotApiUrl[i].purge();
         i++;

         // excessive errors? (tcp/dns timed out, etc.) retry once per month?
-        cr->m_regExs[i].set("hastmperror && errorcount>=3");
+        cr->m_regExs[i].set("errorcount>=3");
         cr->m_spiderPriorities [i] = 30;
         cr->m_spiderFreqs [i] = 30; // 30 days
         cr->m_spiderDiffbotApiUrl[i].purge();
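Taken together, the two rewritten rules encode a simple error back-off: a url with one or two errors is retried at priority 40 every 0.2 days (about 5 hours, matching the comment above the rule; the inline "// half a day" note appears stale), while a url with three or more errors drops to priority 30 and is retried roughly once a month. A minimal sketch of that decision, assuming errorCount is the counter the errorcount/errcount terms test; the struct and function are illustrative stand-ins, not Gigablast types:

    // Back-off encoded by the two error rules above.
    struct RespiderRule {
            long   priority;  // spider priority for matching urls
            double freqDays;  // respider frequency in days
    };

    RespiderRule pickErrorRule(long errorCount) {
            if (errorCount >= 3) return { 30, 30.0 }; // chronic errors: ~monthly
            if (errorCount >  0) return { 40, 0.2 };  // a few errors: ~5 hours
            return { -1, 0.0 };                       // no error rule applies
    }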
@@ -697,12 +697,10 @@ bool Parms::sendPageGeneric ( TcpSocket *s , HttpRequest *r , long page ,
                 "it from."
                 "</td></tr>"

-                "<tr><td>isnew | !isnew</td>"
+                "<tr><td>hasreply | !hasreply</td>"
                 "<td>"
-                "This is true if we have never tried to spider "
-                "this url. If we have tried to spider it and "
-                "received an error, like a timeout or something, "
-                "then it will no longer match <i>isnew</i>."
+                "This is true if we have tried to spider "
+                "this url, even if we got an error while trying."
                 "</td></tr>"

Spider.cpp (23 changed lines)
@@ -4893,6 +4893,10 @@ bool SpiderLoop::gotDoledbList2 ( ) {
         // there are urls ready to spider
         ci->m_hasUrlsReadyToSpider = true;

+        // reset reason why crawl is not running, because we basically are now
+        cr->m_spiderStatus = 0;
+        cr->m_spiderStatusMsg = NULL;
+
         // be sure to save state so we do not re-send emails
         cr->m_needsSave = 1;

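A minimal sketch of the invariant the four added lines maintain: once doledb hands back urls to spider, any stored "why the crawl is not running" reason is cleared, and the record is flagged dirty so the cleared state is persisted. CollState is a hypothetical stand-in for the CollectionRec fields touched here, not a real Gigablast type:

    // Hypothetical stand-in for the CollectionRec fields in the hunk.
    struct CollState {
            long        spiderStatus;  // 0 == no stored stop reason
            const char *statusMsg;     // human-readable reason, or NULL
            char        needsSave;     // 1 == persist state to disk
    };

    void markUrlsReady(CollState &cs) {
            cs.spiderStatus = 0;       // crawl is effectively running again
            cs.statusMsg    = NULL;
            cs.needsSave    = 1;       // save so we do not re-send emails
    }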
@@ -8361,6 +8365,24 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
                 goto checkNextRule;
         }

+        if ( *p=='h' && strncmp(p,"hasreply",8) == 0 ) {
+                // if we do not have enough info for outlink, all done
+                if ( isOutlink ) return -1;
+                // skip for msg20
+                if ( isForMsg20 ) continue;
+                // if we got a reply, we are not new!!
+                if ( (bool)srep == (bool)val ) continue;
+                // skip it for speed
+                p += 8;
+                // check for &&
+                p = strstr(p, "&&");
+                // if nothing, else then it is a match
+                if ( ! p ) return i;
+                // skip the '&&' and go to next rule
+                p += 2;
+                goto checkNextRule;
+        }
+
         // hastmperror, if while spidering, the last reply was
         // like EDNSTIMEDOUT or ETCPTIMEDOUT or some kind of
         // usually temporary condition that warrants a retry
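The new hasreply handler follows the same scan-and-advance pattern as the other url-filter tokens: consume the token, test the condition, then either declare the rule matched (no "&&" left) or jump past the "&&" to the next conjunct. Below is a self-contained sketch of that evaluation strategy for a rule string like "hasreply && errorcount>=3"; it illustrates the pattern only and is not the actual getUrlFilterNum2() code:

    #include <cstdlib>
    #include <cstring>

    // Evaluate a conjunctive rule string left to right. Any failing
    // conjunct rejects the rule; running out of "&&" separators means
    // every conjunct held and the rule matches.
    bool ruleMatches(const char *rule, bool hasReply, long errorCount) {
            const char *p = rule;
            while (p) {
                    while (*p == ' ') p++;                   // skip spaces
                    if (std::strncmp(p, "hasreply", 8) == 0) {
                            if (!hasReply) return false;     // conjunct fails
                    } else if (std::strncmp(p, "errorcount>=", 12) == 0) {
                            if (errorCount < std::atol(p + 12)) return false;
                    } else {
                            return false;                    // unknown token
                    }
                    p = std::strstr(p, "&&");                // next conjunct?
                    if (p) p += 2;                           // skip the "&&"
            }
            return true;
    }

The real parser also handles the negated form !hasreply, which the Parms.cpp help-table hunk documents and the (bool)srep == (bool)val test implements; this sketch omits negation for brevity.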
@@ -8841,7 +8863,6 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
                 p += 2;
                 goto checkNextRule;
         }
-
         // iswww, means url is like www.xyz.com/...
         if ( strncmp(p,"iswww", 5) == 0 ) {
                 // now this is a bit