Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing

Matt 2015-09-09 19:26:43 -06:00
commit b2f3c44650
8 changed files with 244 additions and 89 deletions

Collectiondb.cpp
@@ -1847,6 +1847,8 @@ void CollectionRec::reset() {
m_hasucr = false;
m_hasupr = false;
m_sendingAlertInProgress = false;
// make sure we do not leave spiders "hanging" waiting for their
// callback to be called... and it never gets called
//if ( m_callbackQueue.length() > 0 ) { char *xx=NULL;*xx=0; }

Collectiondb.h
@@ -766,7 +766,7 @@ class CollectionRec {
// last time we computed global crawl info
//time_t m_globalCrawlInfoUpdateTime;
EmailInfo m_emailInfo;
//EmailInfo m_emailInfo;
// for counting replies
//int32_t m_replies;
//int32_t m_requests;
@@ -974,6 +974,8 @@ class CollectionRec {
// NARROW SEARCH
char m_doNarrowSearch;
char m_sendingAlertInProgress;
// Allow Links: searches on the collection
//char m_allowLinksSearch;
// . reference pages parameters

PageCrawlBot.cpp
@@ -3347,7 +3347,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"</tr>"
"<tr>"
"<td><b>Crawl Completion Time:</td>"
"<td><b>Last Crawl Completion Time:</td>"
"<td>%"UINT32"</td>"
"</tr>"
@@ -3362,6 +3362,46 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"<td>%"INT32"</td>"
"</tr>"
, cr->m_diffbotCrawlName.getBufStart()
, (int32_t)cr->m_isCustomCrawl
, cr->m_diffbotToken.getBufStart()
, seedStr
, crawlStatus
, tmp.getBufStart()
, cr->m_diffbotCrawlStartTime
// this is 0 if not over yet
, cr->m_diffbotCrawlEndTime
, cr->m_spiderRoundNum
, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
);
// show crawlinfo crap
CrawlInfo *cis = (CrawlInfo *)cr->m_crawlInfoBuf.getBufStart();
sb.safePrintf("<tr><td><b>Ready Hosts</b></td><td>");
for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
CrawlInfo *ci = &cis[i];
if ( ! ci->m_hasUrlsReadyToSpider ) continue;
Host *h = g_hostdb.getHost ( i );
if ( ! h ) continue;
sb.safePrintf("<a href=http://%s:%i/crawlbot?c=%s>"
"%i</a> "
, iptoa(h->m_ip)
, (int)h->m_httpPort
, cr->m_coll
, (int)i
);
}
sb.safePrintf("</tr>\n");
sb.safePrintf(
// this will have to be in crawlinfo too!
//"<tr>"
@@ -3416,24 +3456,6 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"</tr>"
, cr->m_diffbotCrawlName.getBufStart()
, (int32_t)cr->m_isCustomCrawl
, cr->m_diffbotToken.getBufStart()
, seedStr
, crawlStatus
, tmp.getBufStart()
, cr->m_diffbotCrawlStartTime
// this is 0 if not over yet
, cr->m_diffbotCrawlEndTime
, cr->m_spiderRoundNum
, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
, cr->m_globalCrawlInfo.m_objectsAdded -
cr->m_globalCrawlInfo.m_objectsDeleted
, cr->m_globalCrawlInfo.m_urlsHarvested
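
Note on the %"UINT32" and %"INT32" tokens in the format strings above: they are format-specifier macros from the codebase's types header, spliced into the format string by C string-literal concatenation, the same trick the standard <cinttypes> macros use. A minimal standalone sketch of the idiom, using the standard macros rather than gigablast's own:

    #include <cinttypes>   // PRId32 / PRIu32, standard cousins of INT32 / UINT32
    #include <cstdio>

    int main() {
        int32_t  start = 1441848000;  // e.g. cr->m_diffbotCrawlStartTime
        uint32_t round = 3;           // e.g. cr->m_spiderRoundNum
        // adjacent string literals concatenate around the macro, exactly
        // how "<td>%"UINT32"</td>" expands inside safePrintf() above
        std::printf("<td>%" PRId32 "</td><td>%" PRIu32 "</td>\n", start, round);
        return 0;
    }
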

PingServer.cpp
@@ -3228,9 +3228,11 @@ void doneSendingNotifyEmailWrapper ( void *state ) {
// wait for post url to get done
if ( ei->m_notifyBlocked > 0 ) return;
// unmark it
ei->m_inUse = false;
//ei->m_inUse = false;
// all done
ei->m_finalCallback ( ei->m_finalState );
// nuke it
mfree ( ei , sizeof(EmailInfo) ,"eialrt" );
}
void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
@@ -3242,9 +3244,11 @@ void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
// wait for email to get done
if ( ei->m_notifyBlocked > 0 ) return;
// unmark it
ei->m_inUse = false;
//ei->m_inUse = false;
// all done
ei->m_finalCallback ( ei->m_finalState );
// nuke it
mfree ( ei , sizeof(EmailInfo) ,"eialrt" );
}
// for printCrawlDetailsInJson()
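
Both wrappers above implement the same join-style completion: each pending sub-operation (the email send, the notify-url fetch) reports in, and whichever wrapper observes m_notifyBlocked at zero fires the final callback and frees the EmailInfo. A condensed sketch of the pattern, with a hypothetical trimmed struct in place of the real EmailInfo, and the counter decrement (done in unshown lines of each wrapper) folded in:

    #include <cstdlib>   // free, standing in for mfree

    struct NotifyState {                  // trimmed stand-in for EmailInfo
        int   m_notifyBlocked;            // sub-operations still pending
        void (*m_finalCallback)(void *);  // fired exactly once
        void *m_finalState;
    };

    // called from each sub-operation's completion wrapper
    static void subOpDone(NotifyState *st) {
        st->m_notifyBlocked--;
        // another sub-operation is outstanding; its wrapper finishes up
        if (st->m_notifyBlocked > 0) return;
        // we are the last one: fire the final callback, then nuke the state
        st->m_finalCallback(st->m_finalState);
        std::free(st);
    }
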
@@ -3259,7 +3263,7 @@ bool sendNotification ( EmailInfo *ei ) {
//log("ping: NOT SENDING NOTIFICATION -- DEBUG!!");
//return true;
if ( ei->m_inUse ) { char *xx=NULL;*xx=0; }
//if ( ei->m_inUse ) { char *xx=NULL;*xx=0; }
// caller must set this, as well as m_finalCallback/m_finalState
CollectionRec *cr = g_collectiondb.m_recs[ei->m_collnum];
@@ -3275,7 +3279,7 @@ bool sendNotification ( EmailInfo *ei ) {
// sanity check, can only call once
if ( ei->m_notifyBlocked != 0 ) { char *xx=NULL;*xx=0; }
ei->m_inUse = true;
//ei->m_inUse = true;
if ( email && email[0] ) {
@@ -3371,7 +3375,9 @@ bool sendNotification ( EmailInfo *ei ) {
}
if ( ei->m_notifyBlocked == 0 ) {
ei->m_inUse = false;
//ei->m_inUse = false;
// nuke it
mfree ( ei , sizeof(EmailInfo) ,"eialrt" );
return true;
}

PingServer.h
@@ -30,16 +30,17 @@ public:
// ip address of MX record for this domain
int32_t m_mxIp;
int32_t m_notifyBlocked;
bool m_inUse;
class CollectionRec *m_collRec;
//bool m_inUse;
EmailInfo() {
memset ( this,0,sizeof(EmailInfo) );
};
void reset() {
if ( m_inUse ) { char *xx=NULL;*xx=0; }
if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
memset ( this,0,sizeof(EmailInfo) );
};
//EmailInfo() {
// memset ( this,0,sizeof(EmailInfo) );
//};
//void reset() {
// if ( m_inUse ) { char *xx=NULL;*xx=0; }
// if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
// memset ( this,0,sizeof(EmailInfo) );
//};
};
class PingServer {

RdbBase.cpp
@@ -1132,6 +1132,8 @@ bool RdbBase::incorporateMerge ( ) {
if ( ! m_files[i]->unlink ( doneWrapper , this ) ) {
m_numThreads++; g_numThreads++; }
// debug msg
// MDW this cores if file is bad... if collection
// got deleted from under us i guess!!
else log(LOG_INFO,"merge: Unlinked %s (#%"INT32").",
m_files[i]->getFilename(),i);
// debug msg

Spider.cpp
@@ -1219,7 +1219,7 @@ SpiderColl::SpiderColl () {
m_gettingList1 = false;
m_gettingList2 = false;
m_lastScanTime = 0;
m_isPopulating = false;
m_isPopulatingDoledb = false;
m_numAdded = 0;
m_numBytesScanned = 0;
m_lastPrintCount = 0;
@@ -1270,7 +1270,7 @@ bool SpiderColl::load ( ) {
// reset this once
//m_msg1Avail = true;
m_isPopulating = false;
m_isPopulatingDoledb = false;
// keep it kinda low if we got a ton of collections
int32_t maxMem = 15000;
@@ -1820,7 +1820,7 @@ void SpiderColl::reset ( ) {
m_twinDied = false;
m_lastUrlFiltersUpdate = 0;
m_isPopulating = false;
m_isPopulatingDoledb = false;
char *coll = "unknown";
if ( m_coll[0] ) coll = m_coll;
@@ -2832,6 +2832,8 @@ static void gotSpiderdbListWrapper2( void *state , RdbList *list,Msg5 *msg5) {
// m_deleteMyself flag will have been set.
if ( tryToDeleteSpiderColl ( THIS ,"2") ) return;
THIS->m_gettingList2 = false;
THIS->populateWaitingTreeFromSpiderdb ( true );
}
@@ -2965,6 +2967,9 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
SpiderRequest *sreq = (SpiderRequest *)rec;
// get first ip
int32_t firstIp = sreq->m_firstIp;
// corruption?
// if ( firstIp == 0 || firstIp == -1 )
// gotCorruption = true;
// if same as last, skip it
if ( firstIp == lastOne ) continue;
// set this lastOne for speed
@@ -3014,7 +3019,7 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
}
// are we the final list in the scan?
bool int16_tRead = ( list->getListSize() <= 0);// (int32_t)SR_READ_SIZE ) ;
bool shortRead = ( list->getListSize() <= 0);//(int32_t)SR_READ_SIZE) ;
m_numBytesScanned += list->getListSize();
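
The skip-if-same-as-last test above works because spiderdb lists arrive sorted by firstIp, so a single comparison against the previous record collapses each ip to one waiting-tree candidate. A compilable sketch of that dedup pass, with a trimmed record struct and a hypothetical helper standing in for the real ones:

    #include <cstdint>

    struct SpiderRequest { int32_t m_firstIp; };  // trimmed stand-in

    // hypothetical stand-in for SpiderColl::addToWaitingTree()
    void addToWaitingTree(int32_t firstIp);

    // walk records already sorted by firstIp, adding each distinct ip once
    void addDistinctIps(SpiderRequest *recs, int32_t numRecs) {
        int32_t lastOne = 0;
        for (int32_t i = 0; i < numRecs; i++) {
            int32_t firstIp = recs[i].m_firstIp;
            if (firstIp == lastOne) continue;  // same as last, skip it
            lastOne = firstIp;                 // remember for the next record
            addToWaitingTree(firstIp);         // one entry per unique firstIp
        }
    }
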
@@ -3036,21 +3041,40 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
g_errno = 0;
// if not done, keep going
if ( ! int16_tRead ) {
if ( ! shortRead ) {
// . inc it here
// . it can also be reset on a collection rec update
key128_t endKey = *(key128_t *)list->getLastKey();
m_nextKey2 = endKey;
m_nextKey2 += (uint32_t) 1;
key128_t lastKey = *(key128_t *)list->getLastKey();
if ( lastKey < m_nextKey2 ) {
log("spider: got corruption 9. spiderdb "
"keys out of order for "
"collnum=%"INT32, (int32_t)m_collnum);
g_corruptCount++;
// this should result in an empty list read for
// our next scan of spiderdb. unfortunately we could
// miss a lot of spider requests then
m_nextKey2 = m_endKey2;
}
else {
m_nextKey2 = lastKey;
m_nextKey2 += (uint32_t) 1;
}
// watch out for wrap around
if ( m_nextKey2 < endKey ) int16_tRead = true;
if ( m_nextKey2 < lastKey ) shortRead = true;
// nah, advance the firstip, should be a lot faster when
// we have only a few firstips...
if ( lastOne && lastOne != -1 )
m_nextKey2 = g_spiderdb.makeFirstKey(lastOne+1);
if ( lastOne && lastOne != -1 ) { // && ! gotCorruption ) {
key128_t cand = g_spiderdb.makeFirstKey(lastOne+1);
// corruption still seems to happen, so only
// do this part if it increases the key to avoid
// putting us into an infinite loop.
if ( cand > m_nextKey2 ) m_nextKey2 = cand;
}
}
if ( int16_tRead ) {
if ( shortRead ) {
// mark when the scan completed so we can do another one
// like 24 hrs from that...
m_lastScanTime = getTimeLocal();
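
The new corruption check above enforces one rule: the read cursor must only move forward. If the last key returned sorts below the current cursor, the list is corrupt, so the cursor jumps to the end key and the next read comes back empty rather than re-reading the same range forever. The same guard is added to evalIpLoop() further down. A self-contained sketch, with uint64_t standing in for key128_t since only the ordering matters here:

    #include <cstdint>
    #include <cstdio>

    // returns the cursor for the next read, given the last key returned
    uint64_t advanceCursor(uint64_t nextKey, uint64_t lastKey, uint64_t endKey) {
        if (lastKey < nextKey) {
            // keys went backwards: corrupt list. jump to endKey so the
            // next read returns an empty list and the scan terminates
            // instead of looping on the same range forever.
            std::fprintf(stderr, "spider: got corruption. keys out of order\n");
            return endKey;
        }
        return lastKey + 1;   // normal case: resume just past the last key
    }
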
@@ -3071,6 +3095,12 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
m_nextKey2.setMin();
// no longer need rebuild
m_waitingTreeNeedsRebuild = false;
// and re-send the crawlinfo in handleRequestc1 to each host
// so they know if we have urls ready to spider or not. because
// if we told them no before we completed this rebuild we might
// have found some urls.
// MDW: let's not do this unless we find it is a problem
//m_cr->localCrawlInfoUpdate();
}
// free list to save memory
@@ -3113,9 +3143,13 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
// only one loop can run at a time!
//if ( ! reentry && m_isPopulating ) return;
if ( m_isPopulating ) return;
if ( m_isPopulatingDoledb ) return;
// skip if in repair mode
if ( g_repairMode ) return;
// if rebuilding the waiting tree, do that first
// MDW. re-allow us to populate doledb while waiting tree is being
// built so spiders can go right away. i had this in there to debug.
//if ( m_waitingTreeNeedsRebuild ) return;
// let's skip if spiders are off so we can inject/populate the index quick
// since addSpiderRequest() calls addToWaitingTree() which then calls
@@ -3136,28 +3170,35 @@ void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
// m_waitingTree.m_numUsedNodes);
// set this flag so we are not re-entered
m_isPopulating = true;
m_isPopulatingDoledb = true;
loop:
// if waiting tree is being saved, we can't write to it
// so in that case, bail and wait to be called another time
RdbTree *wt = &m_waitingTree;
if( wt->m_isSaving || ! wt->m_isWritable ) {
m_isPopulating = false;
m_isPopulatingDoledb = false;
return;
}
// . get next IP that is due to be spidered from
// . also sets m_waitingTreeKey so we can delete it easily!
int32_t ip = getNextIpFromWaitingTree();
// . return if none. all done. unset populating flag.
// . it returns 0 if the next firstip has a spidertime in the future
if ( ip == 0 ) { m_isPopulating = false; return; }
if ( ip == 0 ) { m_isPopulatingDoledb = false; return; }
// set read range for scanning spiderdb
m_nextKey = g_spiderdb.makeFirstKey(ip);
m_endKey = g_spiderdb.makeLastKey (ip);
if ( g_conf.m_logDebugSpider )
log("spider: for cn=%i nextip=%s nextkey=%s",
(int)m_collnum,
iptoa(ip),
KEYSTR(&m_nextKey,sizeof(key128_t)));
//////
//
// do TWO PASSES, one to count pages, the other to get the best url!!
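
populateDoledbFromWaitingTree() uses m_isPopulatingDoledb as a single-flight guard: the routine can be re-triggered while its own async reads are outstanding, so one bool ensures only one pass runs at a time, and every early-return path must clear it, which is what most of the renamed assignments in this hunk do. An illustrative sketch, not the real class:

    // illustrative class; the real one is SpiderColl
    class DoledbFiller {
        bool m_isPopulatingDoledb = false;
    public:
        void populateDoledb() {
            if (m_isPopulatingDoledb) return;   // a pass is already running
            m_isPopulatingDoledb = true;
            // ... kick off reads / add records; every early return in
            // here must clear the flag first, or no pass can ever start
            // again. the rename makes those paths easier to audit ...
            m_isPopulatingDoledb = false;       // done: allow the next pass
        }
    };
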
@@ -3279,7 +3320,7 @@ void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
false, // useprotection?
false, // allowdups?
-1 ) ) { // rdbid
m_isPopulating = false;
m_isPopulatingDoledb = false;
log("spider: winntree set: %s",mstrerror(g_errno));
return;
}
@@ -3293,7 +3334,7 @@ void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
false , // allow dups?
MAX_NICENESS ,
"wtdedup" ) ) {
m_isPopulating = false;
m_isPopulatingDoledb = false;
log("spider: wintable set: %s",mstrerror(g_errno));
return;
}
@@ -3321,7 +3362,7 @@ void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
// oom error? i've seen this happen and we end up locking up!
if ( g_errno ) {
log("spider: evalIpLoop: %s",mstrerror(g_errno));
m_isPopulating = false;
m_isPopulatingDoledb = false;
return;
}
// try more
@@ -3477,8 +3518,8 @@ static void gotSpiderdbListWrapper ( void *state , RdbList *list , Msg5 *msg5){
// return if that blocked
if ( ! THIS->evalIpLoop() ) return;
// we are done, re-enter populateDoledbFromWaitingTree()
THIS->m_isPopulating = false;
// gotta set m_isPopulating to false lest it won't work
THIS->m_isPopulatingDoledb = false;
// gotta set m_isPopulatingDoledb to false lest it won't work
THIS->populateDoledbFromWaitingTree ( );
}
@@ -3624,16 +3665,32 @@ bool SpiderColl::evalIpLoop ( ) {
if ( ! m_list.isEmpty() ) {
// update m_nextKey for successive reads of spiderdb by
// calling readListFromSpiderdb()
key128_t endKey = *(key128_t *)m_list.getLastKey();
key128_t lastKey = *(key128_t *)m_list.getLastKey();
// sanity
//if ( endKey != finalKey ) { char *xx=NULL;*xx=0; }
m_nextKey = endKey;
m_nextKey += (uint32_t) 1;
// crazy corruption?
if ( lastKey < m_nextKey ) {
log("spider: got corruption. spiderdb "
"keys out of order for "
"collnum=%"INT32" for evaluation of "
"firstip=%s so terminating evaluation of that "
"firstip." ,
(int32_t)m_collnum,
iptoa(m_scanningIp));
g_corruptCount++;
// this should result in an empty list read for
// m_scanningIp in spiderdb
m_nextKey = m_endKey;
}
else {
m_nextKey = lastKey;
m_nextKey += (uint32_t) 1;
}
// . watch out for wrap around
// . normally i would go by this to indicate that we are
// done reading, but there are some bugs... so we go
// by whether our list is empty or not for now
if ( m_nextKey < endKey ) m_nextKey = endKey;
if ( m_nextKey < lastKey ) m_nextKey = lastKey;
// reset list to save mem
m_list.reset();
// read more! return if it blocked
@@ -6026,20 +6083,46 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
// count these calls
s_count++;
top:
int32_t now = getTimeLocal();
// reset SpiderColl::m_didRound and m_nextDoledbKey if it is maxed
// because we might have had a lock collision
//int32_t nc = g_collectiondb.m_numRecs;
// start again at head
class CollectionRec *crp = g_spiderLoop.getActiveList();
redo:
// point to head of active linked list of collection recs
CollectionRec *nextActive = g_spiderLoop.getActiveList();
collnum_t nextActiveCollnum ;
if ( nextActive ) nextActiveCollnum = nextActive->m_collnum;
//for ( int32_t i = 0 ; i < nc ; i++ ) {
for ( ; crp ; crp = crp->m_nextActive ) {
for ( ; nextActive ; ) {
// breathe
QUICKPOLL(MAX_NICENESS);
// before we assign crp to nextActive, ensure that it did
// not get deleted on us.
// if the next collrec got deleted, tr will be NULL
CollectionRec *tr = g_collectiondb.getRec( nextActiveCollnum );
// if it got deleted or restarted then it will not
// match most likely
if ( tr != nextActive ) {
// this shouldn't happen much so log it
log("spider: collnum %"INT32" got deleted. "
"rebuilding active list",
(int32_t)nextActiveCollnum);
// rebuild the active list now
goto redo;
}
// now we become him
CollectionRec *crp = nextActive;
// update these two vars for next iteration
nextActive = crp->m_nextActive;
nextActiveCollnum = -1;
if ( nextActive ) nextActiveCollnum = nextActive->m_collnum;
// if list was modified a collection was deleted/added
if ( g_spiderLoop.m_activeListModified ) goto top;
//if ( g_spiderLoop.m_activeListModified ) goto top;
// // get collectionrec
// CollectionRec *cr = g_collectiondb.getRec(i);
// if ( ! cr ) continue;
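
The rewritten loop above makes the active-list walk safe against deletions: it records the next rec's collnum before doing any work, re-looks the rec up by collnum on each iteration, and rebuilds the list when the lookup no longer returns the saved pointer. A compilable sketch, with stand-ins for g_spiderLoop.getActiveList(), g_collectiondb.getRec() and the per-collection work, and collnum_t approximated as int32_t:

    #include <cstdint>
    typedef int32_t collnum_t;   // stand-in for the real typedef

    struct CollectionRec {
        collnum_t      m_collnum;
        CollectionRec *m_nextActive;
    };

    CollectionRec *getActiveList();               // hypothetical stand-ins
    CollectionRec *getRec(collnum_t c);           // NULL if deleted
    void processCollection(CollectionRec *crp);   // may delete collections!

    void walkActiveList() {
        CollectionRec *next = getActiveList();
        collnum_t nextCollnum = next ? next->m_collnum : -1;
        while (next) {
            // re-verify before dereferencing: a deleted or restarted rec
            // will not come back from the collnum lookup as the same pointer
            if (getRec(nextCollnum) != next) {
                next = getActiveList();           // rebuild the active list
                nextCollnum = next ? next->m_collnum : -1;
                continue;
            }
            CollectionRec *crp = next;
            next = crp->m_nextActive;             // save the successor first;
            nextCollnum = next ? next->m_collnum : -1; // work may delete it
            processCollection(crp);
        }
    }
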
@@ -6052,28 +6135,30 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
if ( ! sc ) continue;
// also scan spiderdb to populate waiting tree now but
// only one read per 100ms!!
if ( (s_count % 10) == 0 ) {
// always do a scan at startup & every 24 hrs
// AND at process startup!!!
if ( ! sc->m_waitingTreeNeedsRebuild &&
getTimeLocal() - sc->m_lastScanTime > 24*3600) {
// if a scan is ongoing, this will re-set it
sc->m_nextKey2.setMin();
sc->m_waitingTreeNeedsRebuild = true;
log(LOG_INFO,
"spider: hit spider queue "
"rebuild timeout for %s (%"INT32")",
crp->m_coll,(int32_t)crp->m_collnum);
// flush the ufn table
//clearUfnTable();
}
// try this then. it just returns if
// sc->m_waitingTreeNeedsRebuild is false so it
// should be fast in those cases
sc->populateWaitingTreeFromSpiderdb ( false );
// MDW: try taking this out
//if ( (s_count % 10) == 0 ) {
// always do a scan at startup & every 24 hrs
// AND at process startup!!!
if ( ! sc->m_waitingTreeNeedsRebuild &&
now - sc->m_lastScanTime > 24*3600) {
// if a scan is ongoing, this will re-set it
sc->m_nextKey2.setMin();
sc->m_waitingTreeNeedsRebuild = true;
log(LOG_INFO,
"spider: hit spider queue "
"rebuild timeout for %s (%"INT32")",
crp->m_coll,(int32_t)crp->m_collnum);
// flush the ufn table
//clearUfnTable();
}
// try this then. it just returns if
// sc->m_waitingTreeNeedsRebuild is false so it
// should be fast in those cases
sc->populateWaitingTreeFromSpiderdb ( false );
//}
// if list was modified a collection was deleted/added
if ( g_spiderLoop.m_activeListModified ) goto top;
//if ( g_spiderLoop.m_activeListModified ) goto top;
// re-entry is false because we are entering for the first time
sc->populateDoledbFromWaitingTree ( );
// . skip if still loading doledb lists from disk this round
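
The commented-out (s_count % 10) gate above used to throttle the spiderdb scan to every 10th timer tick; the "one read per 100ms" comment implies a 10ms timer, though that period is an assumption here. The commit removes the gate and relies on populateWaitingTreeFromSpiderdb() returning quickly when no rebuild is flagged. Sketch of the removed pattern:

    #include <cstdint>

    static int32_t s_count = 0;

    void onSleepTimer() {                  // assumed to fire every 10ms
        s_count++;
        // skip 9 of every 10 ticks, so the expensive spiderdb scan below
        // ran at most once per 100ms
        if ((s_count % 10) != 0) return;
        // ... sc->populateWaitingTreeFromSpiderdb(false) went here ...
    }
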
@@ -6092,7 +6177,7 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
//sc->m_encounteredDoledbRecs = false;
//sc->m_nextDoledbKey.setMin();
// if list was modified a collection was deleted/added
if ( g_spiderLoop.m_activeListModified ) goto top;
//if ( g_spiderLoop.m_activeListModified ) goto top;
}
// set initial priority to the highest to start spidering there
@@ -6106,13 +6191,18 @@ void doneSendingNotification ( void *state ) {
EmailInfo *ei = (EmailInfo *)state;
collnum_t collnum = ei->m_collnum;
CollectionRec *cr = g_collectiondb.m_recs[collnum];
if ( cr != ei->m_collRec ) cr = NULL;
char *coll = "lostcoll";
if ( cr ) coll = cr->m_coll;
log(LOG_INFO,"spider: done sending notifications for coll=%s", coll);
log(LOG_INFO,"spider: done sending notifications for coll=%s (%i)",
coll,(int)ei->m_collnum);
// all done if collection was deleted from under us
if ( ! cr ) return;
// do not re-call this stuff
cr->m_sendingAlertInProgress = false;
// we can re-use the EmailInfo class now
// pingserver.cpp sets this
//ei->m_inUse = false;
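
The cr != ei->m_collRec test above guards against collnum recycling: between scheduling the alert and this callback running, the collection can be deleted and its slot number handed to a new one, so the callback cross-checks the pointer captured at send time against the slot's current occupant. A trimmed, compilable sketch (collnum_t approximated as int32_t):

    #include <cstddef>
    #include <cstdint>
    typedef int32_t collnum_t;   // stand-in for the real typedef

    struct CollectionRec { bool m_sendingAlertInProgress; };

    struct EmailInfo {               // trimmed to the two relevant fields
        collnum_t      m_collnum;    // slot index, may be recycled
        CollectionRec *m_collRec;    // pointer captured when scheduled
    };

    extern CollectionRec *g_recs[];  // stand-in for g_collectiondb.m_recs

    void onNotificationDone(EmailInfo *ei) {
        CollectionRec *cr = g_recs[ei->m_collnum];
        // same slot but a different object: our collection was deleted
        // and its collnum reused, so treat the collection as lost
        if (cr != ei->m_collRec) cr = NULL;
        if (!cr) return;
        cr->m_sendingAlertInProgress = false;  // safe: still our collection
    }
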
@@ -6268,11 +6358,19 @@ bool sendNotificationForCollRec ( CollectionRec *cr ) {
// since we reset global.
//if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ) return true;
if ( cr->m_sendingAlertInProgress ) return true;
// ok, send it
EmailInfo *ei = &cr->m_emailInfo;
//EmailInfo *ei = &cr->m_emailInfo;
EmailInfo *ei = (EmailInfo *)mcalloc ( sizeof(EmailInfo),"eialrt");
if ( ! ei ) {
log("spider: could not send email alert: %s",
mstrerror(g_errno));
return true;
}
// in use already?
if ( ei->m_inUse ) return true;
//if ( ei->m_inUse ) return true;
// pingserver.cpp sets this
//ei->m_inUse = true;
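
This hunk swaps the single embedded cr->m_emailInfo (and its m_inUse flag) for a freshly allocated, zeroed EmailInfo per alert, freed by whichever completion wrapper finishes last. A condensed sketch of the ownership handoff, with standard calloc in place of mcalloc and a trimmed struct:

    #include <cstdlib>   // calloc/free stand in for mcalloc/mfree

    struct EmailInfo {                          // trimmed stand-in
        void (*m_finalCallback)(void *);
        void  *m_finalState;
        int    m_notifyBlocked;
    };

    void doneSendingNotification(void *state);  // the final callback above
    bool sendNotification(EmailInfo *ei);       // frees ei once all sub-ops end

    bool notifyOnce() {
        // one zeroed EmailInfo per alert instead of a shared embedded
        // member guarded by m_inUse: overlapping alerts no longer collide,
        // each owns its own state
        EmailInfo *ei = (EmailInfo *)calloc(1, sizeof(EmailInfo));
        if (!ei) return true;                   // oom: give up on this alert
        ei->m_finalCallback = doneSendingNotification;
        ei->m_finalState    = ei;               // the callback receives ei
        return sendNotification(ei);            // ownership moves to callee
    }
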
@@ -6281,6 +6379,8 @@ bool sendNotificationForCollRec ( CollectionRec *cr ) {
ei->m_finalCallback = doneSendingNotification;
ei->m_finalState = ei;
ei->m_collnum = cr->m_collnum;
// collnums can be recycled, so verify the collection via the ptr
ei->m_collRec = cr;
SafeBuf *buf = &ei->m_spiderStatusMsg;
// stop it from accumulating status msgs
@@ -6293,6 +6393,9 @@ bool sendNotificationForCollRec ( CollectionRec *cr ) {
//log("spider: SENDING EMAIL NOT");
// do not re-call this stuff
cr->m_sendingAlertInProgress = true;
// ok, put it back...
if ( ! sendNotification ( ei ) ) return false;
@@ -7443,6 +7546,11 @@ bool SpiderLoop::gotDoledbList2 ( ) {
if ( cr->m_spiderStatus == SP_INITIALIZING ) {
// this is the GLOBAL crawl info, not the LOCAL, which
// is what "ci" represents...
// MDW: is this causing the bug?
// the others have already reported that there are no urls
// to spider, so they do not re-report. we already
// had 'hasurlsreadytospider' set to true so we didn't get
// the reviving log msg.
cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = true;
// set this right i guess...?
ci->m_lastSpiderAttempt = nowGlobal;
@@ -11339,6 +11447,10 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
if ( errCode != EDNSTIMEDOUT &&
errCode != ETCPTIMEDOUT &&
errCode != EDNSDEAD &&
// add this here too now because we had some
// seeds that failed one time and the crawl
// never repeated after that!
errCode != EBADIP &&
// assume diffbot is temporarily experiencing errs
errCode != EDIFFBOTINTERNALERROR &&
// if diffbot received empty content when d'lding
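
The condition above whitelists transient failures: for these error codes the url stays eligible for retry, and EBADIP joins the list so a seed that fails once is no longer dropped for good. A sketch of the classification as a predicate; the error constants live in the codebase's error header, and the stand-in enum here just keeps the sketch compilable:

    enum {   // stand-ins for the codebase's real error constants
        EDNSTIMEDOUT = 1, ETCPTIMEDOUT, EDNSDEAD, EBADIP,
        EDIFFBOTINTERNALERROR
    };

    // transient errors leave the url eligible for another spider attempt;
    // everything else is treated as permanent
    bool isTransientError(int errCode) {
        switch (errCode) {
        case EDNSTIMEDOUT:           // dns lookup timed out
        case ETCPTIMEDOUT:           // tcp connect/read timed out
        case EDNSDEAD:               // dns server unreachable
        case EBADIP:                 // added by this commit: seeds that
                                     // failed once were never retried
        case EDIFFBOTINTERNALERROR:  // assume diffbot is temporarily broken
            return true;
        default:
            return false;
        }
    }
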
@@ -13459,6 +13571,7 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
cr->m_crawlInfoBuf.reserve(need);
// in case one udp reply timed out or something
cr->m_crawlInfoBuf.zeroOut();
cr->m_crawlInfoBuf.setLabel("cibuf");
}
CrawlInfo *cia = (CrawlInfo *)cr->m_crawlInfoBuf.getBufStart();
@@ -13656,6 +13769,10 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// if spidering disabled in master controls then send no
// notifications
// crap, but then we can not update the round start time
// because that is done in doneSendingNotification().
// but why does it say all 32 report done, but then
// it has urls ready to spider?
if ( ! g_conf.m_spideringEnabled )
continue;
@@ -14466,6 +14583,9 @@ void SpiderLoop::buildActiveList ( ) {
// }
}
// MDW: let's not do this either unless it proves to be a prob
//if ( sc->m_needsRebuild ) active = true;
// we are at the tail of the linked list OR not in the list
cr->m_nextActive = NULL;

Spider.h
@@ -1136,7 +1136,7 @@ class SpiderColl {
bool m_isLoading;
// for scanning the wait tree...
bool m_isPopulating;
bool m_isPopulatingDoledb;
// for reading from spiderdb
//bool m_isReadDone;
bool m_didRead;