fix critical spider issue of an IP corking an entire spider priority.
also exit faster on 'save & exit' if in evalIpLoop().
Matt Wells 2016-01-23 10:32:14 -08:00
parent 7e13b147e4
commit 4c7f969988
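
The heart of the change, in one picture: when a doledb record's firstIp already has the maximum spiders out, the old code bailed out of the entire priority (the old "goto hitMax"), while the new code skips just that one record and keeps scanning. A minimal sketch of the before/after control flow; the names (Rec, ipAtLimit, launchSpider) are hypothetical stand-ins for the real Spider.cpp machinery, not its API:

#include <cstdint>
#include <vector>

struct Rec { int32_t firstIp; };                          // stand-in for a doledb rec
static bool ipAtLimit    ( int32_t )     { return false; } // stub: ip at max spiders?
static void launchSpider ( const Rec & ) { }               // stub launcher

void scanPriority ( const std::vector<Rec> &list ) {
        for ( const Rec &r : list ) {
                // BEFORE: the first saturated ip aborted the whole priority:
                //     if ( ipAtLimit(r.firstIp) ) return; // "goto hitMax"
                // AFTER: skip just this rec and keep scanning the list
                if ( ipAtLimit ( r.firstIp ) ) continue;
                launchSpider ( r );
        }
}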

@@ -3198,6 +3198,13 @@ void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
return;
}
// are we trying to exit? some firstip lists can be quite long, so
// terminate here so all threads can return and we can exit properly
if ( g_process.m_mode == EXIT_MODE ) {
m_isPopulatingDoledb = false;
return;
}
// . get next IP that is due to be spidered from the waiting tree
// . also sets m_waitingTreeKey so we can delete it easily!
int32_t ip = getNextIpFromWaitingTree();
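
Both new EXIT_MODE checks are the same idiom: poll a process-wide shutdown flag at the top of a potentially long pass so 'save & exit' does not have to wait for a full firstip scan to finish. A generic sketch of the idiom; the atomic flag and doOneIp are assumptions standing in for g_process.m_mode and the real per-ip work:

#include <atomic>

static std::atomic<bool> g_exiting{false};      // set by the save-&-exit handler
static bool doOneIp ( ) { return false; }       // stub per-ip work unit

bool populateLoop ( ) {
        for ( ;; ) {
                // bail early so all threads can unwind and the process
                // can save its state and exit promptly
                if ( g_exiting.load() ) return false;
                if ( ! doOneIp() ) break;
        }
        return true;
}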
@@ -3564,6 +3571,10 @@ bool SpiderColl::evalIpLoop ( ) {
// sanity
if ( m_scanningIp == 0 || m_scanningIp == -1 ) { char *xx=NULL;*xx=0;}
// are we trying to exit? some firstip lists can be quite long, so
// terminate here so all threads can return and we can exit properly
if ( g_process.m_mode == EXIT_MODE ) return true;
// if this ip is in the winnerlistcache use that. it saves
// us a lot of time.
key_t cacheKey;
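
The winnerlistcache mentioned above is the usual check-the-cache-before-recomputing pattern, keyed on the ip being scanned. A rough sketch with std::unordered_map standing in for the real RdbCache; all names here are illustrative, not Gigablast's:

#include <cstdint>
#include <unordered_map>
#include <vector>

static bool useWinners      ( const std::vector<int64_t> & ) { return true; } // stub
static bool scanFirstIpList ( int32_t )                      { return true; } // stub

// stand-in for the winner-list cache, keyed by firstIp
static std::unordered_map<int32_t, std::vector<int64_t>> s_winnerCache;

bool evalIp ( int32_t ip ) {
        auto it = s_winnerCache.find ( ip );
        // cache hit: reuse the winners and skip the expensive scan
        if ( it != s_winnerCache.end() ) return useWinners ( it->second );
        // cache miss: do the full (possibly very long) firstip list scan
        return scanFirstIpList ( ip );
}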
@@ -7236,7 +7247,14 @@ void SpiderLoop::spiderDoledUrls ( ) {
// "goto listLoop" below if the url we want
// to dole is locked.
// seems like a ton of negative recs
2000 , // minRecSizes
// MDW: let's now read in 50k, not 2k, of doledb
// spiderrequests because often the first one
// has an ip already at its spider limit and then we'd
// just give up on the whole PRIORITY! which
// really freezes the spiders up.
// Also, if a spider request is corrupt in
// doledb it would cork us up too!
50000 , // minRecSizes
true , // includeTree
false , // addToCache
0 , // max cache age
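
Why bumping minRecSizes from 2000 to 50000 matters: a doledb rec is a key plus an embedded SpiderRequest, easily a few hundred bytes once the URL is included, so a 2k read may hold only a handful of recs, quite possibly all for the same saturated firstIp. A back-of-envelope check; the ~300-byte average is an assumption, real sizes vary with URL length:

#include <cstdint>
#include <cstdio>

int main ( ) {
        const int32_t avgRecSize = 300;   // assumed avg doledb rec size
        const int32_t oldRead    = 2000;  // old minRecSizes
        const int32_t newRead    = 50000; // new minRecSizes
        // ~6 recs per read before vs ~166 after, so the skip logic
        // almost always has other ips to fall through to
        printf ( "old: ~%d recs, new: ~%d recs\n",
                 (int)(oldRead / avgRecSize) ,
                 (int)(newRead / avgRecSize) );
        return 0;
}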
@@ -7575,7 +7593,7 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// skip? and re-get another doledb list from next priority...
if ( out >= max ) {
// come here if we hit our ip limit too
hitMax:
// hitMax:
// assume we could have launched a spider
if ( max > 0 ) ci->m_lastSpiderCouldLaunch = nowGlobal;
// this priority is maxed out, try next
@@ -7607,6 +7625,9 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// what is this? a dataless positive key?
if ( m_list.getCurrentRecSize() <= 16 ) { char *xx=NULL;*xx=0; }
int32_t ipOut = 0;
int32_t globalOut = 0;
// get the "spider rec" (SpiderRequest) (embedded in the doledb rec)
SpiderRequest *sreq = (SpiderRequest *)(rec + sizeof(key_t)+4);
// sanity check. check for http(s)://
@@ -7618,20 +7639,19 @@ bool SpiderLoop::gotDoledbList2 ( ) {
log("spider: got corrupt doledb record. ignoring. "
"pls fix!!! cn=%i",(int)m_collnum);
g_corruptCount++;
goto skipDoledbRec;
// skip for now....!! what is causing this???
m_list.skipCurrentRecord();
//m_list.skipCurrentRecord();
// if exhausted -- try another load with m_nextKey set
if ( m_list.isExhausted() ) return true;
//if ( m_list.isExhausted() ) return true;
// otherwise, try the next doledb rec in this list
goto listLoop;
//goto listLoop;
}
// . how many spiders out for this ip now?
// . TODO: count locks in case twin is spidering... but it did not seem
// to work right for some reason
int32_t ipOut = 0;
int32_t globalOut = 0;
for ( int32_t i = 0 ; i <= m_maxUsed ; i++ ) {
// get it
XmlDoc *xd = m_docs[i];
@@ -7648,13 +7668,40 @@ bool SpiderLoop::gotDoledbList2 ( ) {
if ( xd->m_collnum != cr->m_collnum ) continue;
if ( xd->m_sreq.m_firstIp == sreq->m_firstIp ) ipOut++;
}
if ( ipOut >= maxSpidersOutPerIp ) goto hitMax;
// don't give up on this priority, just try the next rec in the list.
// we now read 50k instead of 2k from doledb in order to keep
// one ip from bottle corking the whole priority!!
if ( ipOut >= maxSpidersOutPerIp ) {
// assume we could have launched a spider
if ( maxSpidersOutPerIp > 0 )
ci->m_lastSpiderCouldLaunch = nowGlobal;
//goto hitMax;
skipDoledbRec:
// skip
m_list.skipCurrentRecord();
// if not exhausted try the next doledb rec in this list
if ( ! m_list.isExhausted() ) goto listLoop;
// print a log msg if we corked things up even
// though we read 50k from doledb
static bool s_flag = true;
if ( m_list.m_listSize > 50000 && s_flag ) {
s_flag = false; // clear so we only log this once
log("spider: 50k not big enough");
}
// list is exhausted...
return true;
}
// but if the global count is high, only allow one out per coll so
// at least we don't starve and we don't build up a huge line of
// queued results just sitting there taking up mem and spider
// slots, preventing the crawlbot hourly from passing.
if ( globalOut >= maxSpidersOutPerIp && ipOut >= 1 ) goto hitMax;
if ( globalOut >= maxSpidersOutPerIp && ipOut >= 1 ) {
// assume we could have launched a spider
if ( maxSpidersOutPerIp > 0 )
ci->m_lastSpiderCouldLaunch = nowGlobal;
goto skipDoledbRec;
}
if ( g_conf.m_logDebugSpider )
log("spider: %"INT32" spiders out for %s for %s",