Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing
commit b2f3c44650
@@ -1847,6 +1847,8 @@ void CollectionRec::reset() {
 	m_hasucr = false;
 	m_hasupr = false;
 
+	m_sendingAlertInProgress = false;
+
 	// make sure we do not leave spiders "hanging" waiting for their
 	// callback to be called... and it never gets called
 	//if ( m_callbackQueue.length() > 0 ) { char *xx=NULL;*xx=0; }
@@ -766,7 +766,7 @@ class CollectionRec {
 
 	// last time we computed global crawl info
 	//time_t m_globalCrawlInfoUpdateTime;
-	EmailInfo m_emailInfo;
+	//EmailInfo m_emailInfo;
 
 	// for counting replies
 	//int32_t m_replies;
 	//int32_t m_requests;
@@ -974,6 +974,8 @@ class CollectionRec {
 	// NARROW SEARCH
 	char m_doNarrowSearch;
 
+	char m_sendingAlertInProgress;
+
 	// Allow Links: searches on the collection
 	//char m_allowLinksSearch;
 	// . reference pages parameters
@@ -3347,7 +3347,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 		      "</tr>"
 
 		      "<tr>"
-		      "<td><b>Crawl Completion Time:</td>"
+		      "<td><b>Last Crawl Completion Time:</td>"
 		      "<td>%"UINT32"</td>"
 		      "</tr>"
 
@@ -3362,6 +3362,46 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 		      "<td>%"INT32"</td>"
 		      "</tr>"
 
+		      , cr->m_diffbotCrawlName.getBufStart()
+
+		      , (int32_t)cr->m_isCustomCrawl
+
+		      , cr->m_diffbotToken.getBufStart()
+
+		      , seedStr
+
+		      , crawlStatus
+		      , tmp.getBufStart()
+
+		      , cr->m_diffbotCrawlStartTime
+		      // this is 0 if not over yet
+		      , cr->m_diffbotCrawlEndTime
+
+		      , cr->m_spiderRoundNum
+		      , cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
+
+		      );
+
+	// show crawlinfo crap
+	CrawlInfo *cis = (CrawlInfo *)cr->m_crawlInfoBuf.getBufStart();
+	sb.safePrintf("<tr><td><b>Ready Hosts</b></td><td>");
+	for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
+		CrawlInfo *ci = &cis[i];
+		if ( ! ci->m_hasUrlsReadyToSpider ) continue;
+		Host *h = g_hostdb.getHost ( i );
+		if ( ! h ) continue;
+		sb.safePrintf("<a href=http://%s:%i/crawlbot?c=%s>"
+			      "%i</a> "
+			      , iptoa(h->m_ip)
+			      , (int)h->m_httpPort
+			      , cr->m_coll
+			      , (int)i
+			      );
+	}
+	sb.safePrintf("</tr>\n");
+
+
+	sb.safePrintf(
 
 		      // this will have to be in crawlinfo too!
 		      //"<tr>"
@@ -3416,24 +3456,6 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 		      "</tr>"
 
-
-		      , cr->m_diffbotCrawlName.getBufStart()
-
-		      , (int32_t)cr->m_isCustomCrawl
-
-		      , cr->m_diffbotToken.getBufStart()
-
-		      , seedStr
-
-		      , crawlStatus
-		      , tmp.getBufStart()
-
-		      , cr->m_diffbotCrawlStartTime
-		      // this is 0 if not over yet
-		      , cr->m_diffbotCrawlEndTime
-
-		      , cr->m_spiderRoundNum
-		      , cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
 
 		      , cr->m_globalCrawlInfo.m_objectsAdded -
 		        cr->m_globalCrawlInfo.m_objectsDeleted
 		      , cr->m_globalCrawlInfo.m_urlsHarvested
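For orientation, the Ready Hosts row added above walks one fixed-size CrawlInfo record per host out of the flat cr->m_crawlInfoBuf and prints a link for every host reporting urls ready to spider. A minimal standalone sketch of that flat per-host buffer pattern, using simplified stand-in types rather than the real CrawlInfo/Hostdb classes:

#include <cstdint>
#include <cstdio>
#include <vector>

struct MiniCrawlInfo {              // stand-in for the real CrawlInfo
    int32_t m_hasUrlsReadyToSpider;
    int64_t m_urlsHarvested;
};

int main() {
    const int numHosts = 4;         // stand-in for g_hostdb.getNumHosts()
    // cr->m_crawlInfoBuf holds one fixed-size record per host,
    // indexed by host id, so host i's stats live at cis[i]
    std::vector<MiniCrawlInfo> cis(numHosts);
    cis[1].m_hasUrlsReadyToSpider = 1;
    cis[3].m_hasUrlsReadyToSpider = 1;

    printf("Ready Hosts:");
    for (int i = 0; i < numHosts; i++) {
        if (!cis[i].m_hasUrlsReadyToSpider) continue;
        printf(" %d", i);           // the real page prints a link per host
    }
    printf("\n");
    return 0;
}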
@@ -3228,9 +3228,11 @@ void doneSendingNotifyEmailWrapper ( void *state ) {
 	// wait for post url to get done
 	if ( ei->m_notifyBlocked > 0 ) return;
 	// unmark it
-	ei->m_inUse = false;
+	//ei->m_inUse = false;
 	// all done
 	ei->m_finalCallback ( ei->m_finalState );
+	// nuke it
+	mfree ( ei , sizeof(EmailInfo) ,"eialrt" );
 }
 
 void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
@@ -3242,9 +3244,11 @@ void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
 	// wait for email to get done
 	if ( ei->m_notifyBlocked > 0 ) return;
 	// unmark it
-	ei->m_inUse = false;
+	//ei->m_inUse = false;
 	// all done
 	ei->m_finalCallback ( ei->m_finalState );
+	// nuke it
+	mfree ( ei , sizeof(EmailInfo) ,"eialrt" );
 }
 
 // for printCrawlDetailsInJson()
@@ -3259,7 +3263,7 @@ bool sendNotification ( EmailInfo *ei ) {
 	//log("ping: NOT SENDING NOTIFICATION -- DEBUG!!");
 	//return true;
 
-	if ( ei->m_inUse ) { char *xx=NULL;*xx=0; }
+	//if ( ei->m_inUse ) { char *xx=NULL;*xx=0; }
 
 	// caller must set this, as well as m_finalCallback/m_finalState
 	CollectionRec *cr = g_collectiondb.m_recs[ei->m_collnum];
@@ -3275,7 +3279,7 @@ bool sendNotification ( EmailInfo *ei ) {
 	// sanity check, can only call once
 	if ( ei->m_notifyBlocked != 0 ) { char *xx=NULL;*xx=0; }
 
-	ei->m_inUse = true;
+	//ei->m_inUse = true;
 
 
 	if ( email && email[0] ) {
@@ -3371,7 +3375,9 @@ bool sendNotification ( EmailInfo *ei ) {
 	}
 
 	if ( ei->m_notifyBlocked == 0 ) {
-		ei->m_inUse = false;
+		//ei->m_inUse = false;
+		// nuke it
+		mfree ( ei , sizeof(EmailInfo) ,"eialrt" );
 		return true;
 	}
 
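Taken together, these PingServer.cpp hunks stop reusing an embedded EmailInfo guarded by m_inUse and instead heap-allocate one per notification, freeing it in whichever completion callback drives m_notifyBlocked to zero. A hedged sketch of that lifetime scheme, with calloc/free standing in for mcalloc/mfree and two assumed async legs (email and url ping):

#include <cstdio>
#include <cstdlib>

struct NotifyState {                     // simplified stand-in for EmailInfo
    int    m_notifyBlocked;              // pending async operations
    void (*m_finalCallback)(void *state);
    void  *m_finalState;
};

static void doneOneLeg(NotifyState *st) {
    st->m_notifyBlocked--;
    // wait for the other leg to get done
    if (st->m_notifyBlocked > 0) return;
    // all done: fire the final callback...
    st->m_finalCallback(st->m_finalState);
    // ...then nuke it (the real code calls mfree)
    free(st);
}

static void finalDone(void *) { printf("notification done\n"); }

int main() {
    NotifyState *st = (NotifyState *)calloc(1, sizeof(NotifyState));
    if (!st) return 1;                   // allocation can fail, like mcalloc
    st->m_notifyBlocked = 2;             // email leg + url-ping leg
    st->m_finalCallback = finalDone;
    st->m_finalState    = st;
    doneOneLeg(st);                      // email finished
    doneOneLeg(st);                      // url finished -> frees st
    return 0;
}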
PingServer.h (19 changes)
@@ -30,16 +30,17 @@ public:
 	// ip address of MX record for this domain
 	int32_t m_mxIp;
 	int32_t m_notifyBlocked;
-	bool m_inUse;
+	class CollectionRec *m_collRec;
+	//bool m_inUse;
 
-	EmailInfo() {
-		memset ( this,0,sizeof(EmailInfo) );
-	};
-	void reset() {
-		if ( m_inUse ) { char *xx=NULL;*xx=0; }
-		if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
-		memset ( this,0,sizeof(EmailInfo) );
-	};
+	//EmailInfo() {
+	//	memset ( this,0,sizeof(EmailInfo) );
+	//};
+	//void reset() {
+	//	if ( m_inUse ) { char *xx=NULL;*xx=0; }
+	//	if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
+	//	memset ( this,0,sizeof(EmailInfo) );
+	//};
 };
 
 class PingServer {
@@ -1132,6 +1132,8 @@ bool RdbBase::incorporateMerge ( ) {
 		if ( ! m_files[i]->unlink ( doneWrapper , this ) ) {
 			m_numThreads++; g_numThreads++; }
 		// debug msg
+		// MDW this cores if file is bad... if collection
+		// got delete from under us i guess!!
 		else log(LOG_INFO,"merge: Unlinked %s (#%"INT32").",
 			 m_files[i]->getFilename(),i);
 		// debug msg
Spider.cpp (228 changes)
@@ -1219,7 +1219,7 @@ SpiderColl::SpiderColl () {
 	m_gettingList1 = false;
 	m_gettingList2 = false;
 	m_lastScanTime = 0;
-	m_isPopulating = false;
+	m_isPopulatingDoledb = false;
 	m_numAdded = 0;
 	m_numBytesScanned = 0;
 	m_lastPrintCount = 0;
@@ -1270,7 +1270,7 @@ bool SpiderColl::load ( ) {
 
 	// reset this once
 	//m_msg1Avail = true;
-	m_isPopulating = false;
+	m_isPopulatingDoledb = false;
 
 	// keep it kinda low if we got a ton of collections
 	int32_t maxMem = 15000;
@@ -1820,7 +1820,7 @@ void SpiderColl::reset ( ) {
 	m_twinDied = false;
 	m_lastUrlFiltersUpdate = 0;
 
-	m_isPopulating = false;
+	m_isPopulatingDoledb = false;
 
 	char *coll = "unknown";
 	if ( m_coll[0] ) coll = m_coll;
@@ -2832,6 +2832,8 @@ static void gotSpiderdbListWrapper2( void *state , RdbList *list,Msg5 *msg5) {
 	// m_deleteMyself flag will have been set.
 	if ( tryToDeleteSpiderColl ( THIS ,"2") ) return;
 
+	THIS->m_gettingList2 = false;
+
 	THIS->populateWaitingTreeFromSpiderdb ( true );
 }
 
@@ -2965,6 +2967,9 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
 		SpiderRequest *sreq = (SpiderRequest *)rec;
 		// get first ip
 		int32_t firstIp = sreq->m_firstIp;
+		// corruption?
+		// if ( firstIp == 0 || firstIp == -1 )
+		//	gotCorruption = true;
 		// if same as last, skip it
 		if ( firstIp == lastOne ) continue;
 		// set this lastOne for speed
@@ -3014,7 +3019,7 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
 	}
 
 	// are we the final list in the scan?
-	bool int16_tRead = ( list->getListSize() <= 0);// (int32_t)SR_READ_SIZE ) ;
+	bool shortRead = ( list->getListSize() <= 0);//(int32_t)SR_READ_SIZE) ;
 
 	m_numBytesScanned += list->getListSize();
 
@@ -3036,21 +3041,40 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
 	g_errno = 0;
 
 	// if not done, keep going
-	if ( ! int16_tRead ) {
+	if ( ! shortRead ) {
 		// . inc it here
 		// . it can also be reset on a collection rec update
-		key128_t endKey = *(key128_t *)list->getLastKey();
-		m_nextKey2 = endKey;
-		m_nextKey2 += (uint32_t) 1;
+		key128_t lastKey = *(key128_t *)list->getLastKey();
+
+		if ( lastKey < m_nextKey2 ) {
+			log("spider: got corruption 9. spiderdb "
+			    "keys out of order for "
+			    "collnum=%"INT32, (int32_t)m_collnum);
+			g_corruptCount++;
+			// this should result in an empty list read for
+			// our next scan of spiderdb. unfortunately we could
+			// miss a lot of spider requests then
+			m_nextKey2 = m_endKey2;
+		}
+		else {
+			m_nextKey2 = lastKey;
+			m_nextKey2 += (uint32_t) 1;
+		}
 
 		// watch out for wrap around
-		if ( m_nextKey2 < endKey ) int16_tRead = true;
+		if ( m_nextKey2 < lastKey ) shortRead = true;
 		// nah, advance the firstip, should be a lot faster when
 		// we are only a few firstips...
-		if ( lastOne && lastOne != -1 )
-			m_nextKey2 = g_spiderdb.makeFirstKey(lastOne+1);
+		if ( lastOne && lastOne != -1 ) { // && ! gotCorruption ) {
+			key128_t cand = g_spiderdb.makeFirstKey(lastOne+1);
+			// corruption still seems to happen, so only
+			// do this part if it increases the key to avoid
+			// putting us into an infinite loop.
+			if ( cand > m_nextKey2 ) m_nextKey2 = cand;
+		}
 	}
 
-	if ( int16_tRead ) {
+	if ( shortRead ) {
 		// mark when the scan completed so we can do another one
 		// like 24 hrs from that...
 		m_lastScanTime = getTimeLocal();
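The guard added here, mirrored below in evalIpLoop, keeps the scan cursor strictly monotonic: if the last key returned sorts below the cursor, the list is treated as corrupt and the cursor jumps to the end key instead of risking an endless rescan. A simplified sketch with 64-bit keys standing in for the 128-bit spiderdb keys:

#include <cstdint>
#include <cstdio>

struct Scan {
    uint64_t nextKey;   // cursor: where the next list read starts
    uint64_t endKey;    // upper bound of the whole scan
};

// advance the cursor past the last key read; returns false when the
// list's keys were out of order and the scan was terminated instead
static bool advance(Scan &s, uint64_t lastKeyInList) {
    if (lastKeyInList < s.nextKey) {
        // keys out of order: jump to the end so the next read comes
        // back empty rather than re-reading the same range forever
        printf("got corruption. keys out of order\n");
        s.nextKey = s.endKey;
        return false;
    }
    s.nextKey = lastKeyInList + 1;   // normal monotonic advance
    return true;
}

int main() {
    Scan s = { 100, 1000 };
    advance(s, 150);                 // ok: cursor -> 151
    advance(s, 120);                 // 120 < 151: corruption path -> 1000
    printf("cursor=%llu\n", (unsigned long long)s.nextKey);
    return 0;
}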
@@ -3071,6 +3095,12 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
 		m_nextKey2.setMin();
 		// no longer need rebuild
 		m_waitingTreeNeedsRebuild = false;
+		// and re-send the crawlinfo in handerequestc1 to each host
+		// so they no if we have urls ready to spider or not. because
+		// if we told them no before we completed this rebuild we might
+		// have found some urls.
+		// MDW: let's not do this unless we find it is a problem
+		//m_cr->localCrawlInfoUpdate();
 	}
 
 	// free list to save memory
@@ -3113,9 +3143,13 @@ void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
 	// only one loop can run at a time!
 	//if ( ! reentry && m_isPopulating ) return;
-	if ( m_isPopulating ) return;
+	if ( m_isPopulatingDoledb ) return;
 	// skip if in repair mode
 	if ( g_repairMode ) return;
 	// if rebuilding the waiting tree, do that first
+	// MDW. re-allow us to populate doledb while waiting tree is being
+	// build so spiders can go right away. i had this in there to debug.
+	//if ( m_waitingTreeNeedsRebuild ) return;
 
 	// let's skip if spiders off so we can inject/popoulate the index quick
 	// since addSpiderRequest() calls addToWaitingTree() which then calls
@@ -3136,28 +3170,35 @@ void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
 	//	m_waitingTree.m_numUsedNodes);
 
 	// set this flag so we are not re-entered
-	m_isPopulating = true;
+	m_isPopulatingDoledb = true;
 loop:
 
 	// if waiting tree is being saved, we can't write to it
 	// so in that case, bail and wait to be called another time
 	RdbTree *wt = &m_waitingTree;
 	if( wt->m_isSaving || ! wt->m_isWritable ) {
-		m_isPopulating = false;
+		m_isPopulatingDoledb = false;
 		return;
 	}
 
 	// . get next IP that is due to be spidered from
 	// . also sets m_waitingTreeKey so we can delete it easily!
 	int32_t ip = getNextIpFromWaitingTree();
 
 	// . return if none. all done. unset populating flag.
 	// . it returns 0 if the next firstip has a spidertime in the future
-	if ( ip == 0 ) { m_isPopulating = false; return; }
+	if ( ip == 0 ) { m_isPopulatingDoledb = false; return; }
 
 	// set read range for scanning spiderdb
 	m_nextKey = g_spiderdb.makeFirstKey(ip);
 	m_endKey  = g_spiderdb.makeLastKey (ip);
 
+	if ( g_conf.m_logDebugSpider )
+		log("spider: for cn=%i nextip=%s nextkey=%s",
+		    (int)m_collnum,
+		    iptoa(ip),
+		    KEYSTR(&m_nextKey,sizeof(key128_t)));
+
 	//////
 	//
 	// do TWO PASSES, one to count pages, the other to get the best url!!
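The m_isPopulating to m_isPopulatingDoledb rename has to touch every exit path because the flag is cleared manually on each early return. A sketch of an RAII guard that would make the synchronous exits mistake-proof; note it is only a partial fit, since the real function deliberately leaves the flag set while an asynchronous spiderdb read is outstanding and lets the callback clear it:

#include <cstdio>

// tiny RAII helper: sets the flag on construction, clears it on any return
struct ReentryGuard {
    bool &m_flag;
    ReentryGuard(bool &f) : m_flag(f) { m_flag = true; }
    ~ReentryGuard() { m_flag = false; }
};

static bool s_isPopulatingDoledb = false;

static void populateDoledb() {
    if (s_isPopulatingDoledb) { printf("re-entered, bailing\n"); return; }
    ReentryGuard guard(s_isPopulatingDoledb);
    printf("populating doledb...\n");
    populateDoledb();                // a re-entrant call is rejected
}

int main() { populateDoledb(); return 0; }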
@@ -3279,7 +3320,7 @@ void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
 			       false, // useprotection?
 			       false, // allowdups?
 			       -1 ) ) { // rdbid
-		m_isPopulating = false;
+		m_isPopulatingDoledb = false;
 		log("spider: winntree set: %s",mstrerror(g_errno));
 		return;
 	}
@@ -3293,7 +3334,7 @@ void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
 			   false , // allow dups?
 			   MAX_NICENESS ,
 			   "wtdedup" ) ) {
-		m_isPopulating = false;
+		m_isPopulatingDoledb = false;
 		log("spider: wintable set: %s",mstrerror(g_errno));
 		return;
 	}
@@ -3321,7 +3362,7 @@ void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
 	// oom error? i've seen this happen and we end up locking up!
 	if ( g_errno ) {
 		log("spider: evalIpLoop: %s",mstrerror(g_errno));
-		m_isPopulating = false;
+		m_isPopulatingDoledb = false;
 		return;
 	}
 	// try more
@@ -3477,8 +3518,8 @@ static void gotSpiderdbListWrapper ( void *state , RdbList *list , Msg5 *msg5){
 	// return if that blocked
 	if ( ! THIS->evalIpLoop() ) return;
 	// we are done, re-entry popuatedoledb
-	THIS->m_isPopulating = false;
-	// gotta set m_isPopulating to false lest it won't work
+	THIS->m_isPopulatingDoledb = false;
+	// gotta set m_isPopulatingDoledb to false lest it won't work
 	THIS->populateDoledbFromWaitingTree ( );
 }
 
@@ -3624,16 +3665,32 @@ bool SpiderColl::evalIpLoop ( ) {
 	if ( ! m_list.isEmpty() ) {
 		// update m_nextKey for successive reads of spiderdb by
 		// calling readListFromSpiderdb()
-		key128_t endKey = *(key128_t *)m_list.getLastKey();
+		key128_t lastKey = *(key128_t *)m_list.getLastKey();
 		// sanity
 		//if ( endKey != finalKey ) { char *xx=NULL;*xx=0; }
-		m_nextKey = endKey;
-		m_nextKey += (uint32_t) 1;
+		// crazy corruption?
+		if ( lastKey < m_nextKey ) {
+			log("spider: got corruption. spiderdb "
+			    "keys out of order for "
+			    "collnum=%"INT32" for evaluation of "
+			    "firstip=%s so terminating evaluation of that "
+			    "firstip." ,
+			    (int32_t)m_collnum,
+			    iptoa(m_scanningIp));
+			g_corruptCount++;
+			// this should result in an empty list read for
+			// m_scanningIp in spiderdb
+			m_nextKey = m_endKey;
+		}
+		else {
+			m_nextKey = lastKey;
+			m_nextKey += (uint32_t) 1;
+		}
 		// . watch out for wrap around
 		// . normally i would go by this to indicate that we are
 		//   done reading, but there's some bugs... so we go
 		//   by whether our list is empty or not for now
-		if ( m_nextKey < endKey ) m_nextKey = endKey;
+		if ( m_nextKey < lastKey ) m_nextKey = lastKey;
 		// reset list to save mem
 		m_list.reset();
 		// read more! return if it blocked
@@ -6026,20 +6083,46 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
 	// count these calls
 	s_count++;
 
 top:
+	int32_t now = getTimeLocal();
 
 
 	// reset SpiderColl::m_didRound and m_nextDoledbKey if it is maxed
 	// because we might have had a lock collision
 	//int32_t nc = g_collectiondb.m_numRecs;
-	// start again at head
-	class CollectionRec *crp = g_spiderLoop.getActiveList();
+
+redo:
+
+	// point to head of active linked list of collection recs
+	CollectionRec *nextActive = g_spiderLoop.getActiveList();
+	collnum_t nextActiveCollnum ;
+	if ( nextActive ) nextActiveCollnum = nextActive->m_collnum;
+
 	//for ( int32_t i = 0 ; i < nc ; i++ ) {
-	for ( ; crp ; crp = crp->m_nextActive ) {
+	for ( ; nextActive ; ) {
 		// breathe
 		QUICKPOLL(MAX_NICENESS);
+		// before we assign crp to nextActive, ensure that it did
+		// not get deleted on us.
+		// if the next collrec got deleted, tr will be NULL
+		CollectionRec *tr = g_collectiondb.getRec( nextActiveCollnum );
+		// if it got deleted or restarted then it will not
+		// match most likely
+		if ( tr != nextActive ) {
+			// this shouldn't happen much so log it
+			log("spider: collnum %"INT32" got deleted. "
+			    "rebuilding active list",
+			    (int32_t)nextActiveCollnum);
+			// rebuild the active list now
+			goto redo;
+		}
+		// now we become him
+		CollectionRec *crp = nextActive;
+		// update these two vars for next iteration
+		nextActive = crp->m_nextActive;
+		nextActiveCollnum = -1;
+		if ( nextActive ) nextActiveCollnum = nextActive->m_collnum;
 		// if list was modified a collection was deleted/added
-		if ( g_spiderLoop.m_activeListModified ) goto top;
+		//if ( g_spiderLoop.m_activeListModified ) goto top;
 		// // get collectionrec
 		// CollectionRec *cr = g_collectiondb.getRec(i);
 		// if ( ! cr ) continue;
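The rewritten loop above survives deletion of the current or next collection rec: it snapshots the next node's collnum, re-looks the rec up by collnum each iteration, and rebuilds the list on a mismatch. A runnable miniature of that idiom, with a simple registry standing in for g_collectiondb:

#include <cstdio>
#include <vector>

struct Rec { int id; Rec *next; bool deleted; };

static std::vector<Rec> recs;

// stand-in for g_collectiondb.getRec(): NULL once a rec is deleted
static Rec *getRec(int id) {
    if (id < 0 || id >= (int)recs.size() || recs[id].deleted) return nullptr;
    return &recs[id];
}

// stand-in for g_spiderLoop.getActiveList(): rebuilds the linked list
static Rec *getActiveList() {
    Rec *head = nullptr, **tail = &head;
    for (auto &r : recs)
        if (!r.deleted) { *tail = &r; tail = &r.next; }
    *tail = nullptr;
    return head;
}

int main() {
    recs = { {0, nullptr, false}, {1, nullptr, false}, {2, nullptr, false} };
redo:
    Rec *nextActive = getActiveList();
    int nextId = nextActive ? nextActive->id : -1;
    while (nextActive) {
        // revalidate by id before dereferencing: the work done for the
        // previous rec may have deleted this one out from under us
        if (getRec(nextId) != nextActive) {
            printf("rec %d got deleted, rebuilding active list\n", nextId);
            goto redo;
        }
        Rec *cur = nextActive;                    // now we become him
        nextActive = cur->next;                   // snapshot next first
        nextId = nextActive ? nextActive->id : -1;
        printf("visiting %d\n", cur->id);         // per-rec work goes here
        if (cur->id == 0) recs[1].deleted = true; // simulate a deletion
    }
    return 0;
}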
@@ -6052,28 +6135,30 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
 		if ( ! sc ) continue;
 		// also scan spiderdb to populate waiting tree now but
 		// only one read per 100ms!!
-		if ( (s_count % 10) == 0 ) {
-			// always do a scan at startup & every 24 hrs
-			// AND at process startup!!!
-			if ( ! sc->m_waitingTreeNeedsRebuild &&
-			     getTimeLocal() - sc->m_lastScanTime > 24*3600) {
-				// if a scan is ongoing, this will re-set it
-				sc->m_nextKey2.setMin();
-				sc->m_waitingTreeNeedsRebuild = true;
-				log(LOG_INFO,
-				    "spider: hit spider queue "
-				    "rebuild timeout for %s (%"INT32")",
-				    crp->m_coll,(int32_t)crp->m_collnum);
-				// flush the ufn table
-				//clearUfnTable();
-			}
-			// try this then. it just returns if
-			// sc->m_waitingTreeNeedsRebuild is false so it
-			// should be fast in those cases
-			sc->populateWaitingTreeFromSpiderdb ( false );
-		}
+		// MDW: try taking this out
+		//if ( (s_count % 10) == 0 ) {
+		// always do a scan at startup & every 24 hrs
+		// AND at process startup!!!
+		if ( ! sc->m_waitingTreeNeedsRebuild &&
+		     now - sc->m_lastScanTime > 24*3600) {
+			// if a scan is ongoing, this will re-set it
+			sc->m_nextKey2.setMin();
+			sc->m_waitingTreeNeedsRebuild = true;
+			log(LOG_INFO,
+			    "spider: hit spider queue "
+			    "rebuild timeout for %s (%"INT32")",
+			    crp->m_coll,(int32_t)crp->m_collnum);
+			// flush the ufn table
+			//clearUfnTable();
+		}
+		// try this then. it just returns if
+		// sc->m_waitingTreeNeedsRebuild is false so it
+		// should be fast in those cases
+		sc->populateWaitingTreeFromSpiderdb ( false );
+		//}
 		// if list was modified a collection was deleted/added
-		if ( g_spiderLoop.m_activeListModified ) goto top;
+		//if ( g_spiderLoop.m_activeListModified ) goto top;
 
 		// re-entry is false because we are entering for the first time
 		sc->populateDoledbFromWaitingTree ( );
 		// . skip if still loading doledb lists from disk this round
@@ -6092,7 +6177,7 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
 		//sc->m_encounteredDoledbRecs = false;
 		//sc->m_nextDoledbKey.setMin();
 		// if list was modified a collection was deleted/added
-		if ( g_spiderLoop.m_activeListModified ) goto top;
+		//if ( g_spiderLoop.m_activeListModified ) goto top;
 	}
 
 	// set initial priority to the highest to start spidering there
@@ -6106,13 +6191,18 @@ void doneSendingNotification ( void *state ) {
 	EmailInfo *ei = (EmailInfo *)state;
 	collnum_t collnum = ei->m_collnum;
 	CollectionRec *cr = g_collectiondb.m_recs[collnum];
+	if ( cr != ei->m_collRec ) cr = NULL;
 	char *coll = "lostcoll";
 	if ( cr ) coll = cr->m_coll;
-	log(LOG_INFO,"spider: done sending notifications for coll=%s", coll);
+	log(LOG_INFO,"spider: done sending notifications for coll=%s (%i)",
+	    coll,(int)ei->m_collnum);
 
+	// all done if collection was deleted from under us
+	if ( ! cr ) return;
+
 	// do not re-call this stuff
 	cr->m_sendingAlertInProgress = false;
 
 	// we can re-use the EmailInfo class now
 	// pingserver.cpp sets this
 	//ei->m_inUse = false;
@@ -6268,11 +6358,19 @@ bool sendNotificationForCollRec ( CollectionRec *cr ) {
 	// since we reset global.
 	//if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ) return true;
 
+	if ( cr->m_sendingAlertInProgress ) return true;
+
 	// ok, send it
-	EmailInfo *ei = &cr->m_emailInfo;
+	//EmailInfo *ei = &cr->m_emailInfo;
+	EmailInfo *ei = (EmailInfo *)mcalloc ( sizeof(EmailInfo),"eialrt");
+	if ( ! ei ) {
+		log("spider: could not send email alert: %s",
+		    mstrerror(g_errno));
+		return true;
+	}
 
 	// in use already?
-	if ( ei->m_inUse ) return true;
+	//if ( ei->m_inUse ) return true;
 
 	// pingserver.cpp sets this
 	//ei->m_inUse = true;
@@ -6281,6 +6379,8 @@ bool sendNotificationForCollRec ( CollectionRec *cr ) {
 	ei->m_finalCallback = doneSendingNotification;
 	ei->m_finalState = ei;
 	ei->m_collnum = cr->m_collnum;
+	// collnums can be recycled, so ensure collection with the ptr
+	ei->m_collRec = cr;
 
 	SafeBuf *buf = &ei->m_spiderStatusMsg;
 	// stop it from accumulating status msgs
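The m_collRec pointer captured here is what doneSendingNotification compares against, since a collnum can be recycled for a brand new collection between send and completion. A small sketch of that id-plus-pointer validation, with simplified stand-ins for CollectionRec and g_collectiondb.m_recs:

#include <cstdio>

struct CollectionRec { const char *m_coll; };   // simplified stand-in

static CollectionRec *g_recs[4];                // stand-in for g_collectiondb.m_recs

struct EmailInfo {                              // only the two fields that matter here
    int m_collnum;
    CollectionRec *m_collRec;                   // pointer captured at send time
};

static void doneSending(EmailInfo *ei) {
    CollectionRec *cr = g_recs[ei->m_collnum];
    // collnums can be recycled for brand new collections, so only
    // trust the slot if it still holds the exact rec we started with
    if (cr != ei->m_collRec) cr = nullptr;
    const char *coll = cr ? cr->m_coll : "lostcoll";
    printf("done sending notifications for coll=%s\n", coll);
    if (!cr) return;     // collection was deleted from under us
    // ...safe to clear cr->m_sendingAlertInProgress here...
}

int main() {
    CollectionRec a = { "main" };
    g_recs[0] = &a;
    EmailInfo ei = { 0, &a };
    doneSending(&ei);                            // coll=main
    CollectionRec b = { "recycled" };
    g_recs[0] = &b;                              // collnum 0 reused
    doneSending(&ei);                            // coll=lostcoll
    return 0;
}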
@@ -6293,6 +6393,9 @@ bool sendNotificationForCollRec ( CollectionRec *cr ) {
 
 	//log("spider: SENDING EMAIL NOT");
 
+	// do not re-call this stuff
+	cr->m_sendingAlertInProgress = true;
+
 	// ok, put it back...
 	if ( ! sendNotification ( ei ) ) return false;
 
@@ -7443,6 +7546,11 @@ bool SpiderLoop::gotDoledbList2 ( ) {
 	if ( cr->m_spiderStatus == SP_INITIALIZING ) {
 		// this is the GLOBAL crawl info, not the LOCAL, which
 		// is what "ci" represents...
+		// MDW: is this causing the bug?
+		// the other have already reported that there are no urls
+		// to spider, so they do not re-report. we already
+		// had 'hasurlsreadytospider' set to true so we didn't get
+		// the reviving log msg.
 		cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = true;
 		// set this right i guess...?
 		ci->m_lastSpiderAttempt = nowGlobal;
@@ -11339,6 +11447,10 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
 			if ( errCode != EDNSTIMEDOUT &&
 			     errCode != ETCPTIMEDOUT &&
 			     errCode != EDNSDEAD &&
+			     // add this here too now because we had some
+			     // seeds that failed one time and the crawl
+			     // never repeated after that!
+			     errCode != EBADIP &&
 			     // assume diffbot is temporarily experiencing errs
 			     errCode != EDIFFBOTINTERNALERROR &&
 			     // if diffbot received empty content when d'lding
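The EBADIP addition widens the set of error codes treated as transient so affected seeds stay eligible for retry. In the source this is an inline chain of errCode comparisons; a classification helper like the following is only an assumed restructuring, with illustrative stand-ins for the error constants:

#include <cstdio>

// illustrative stand-ins for the error constants in the source
enum ErrCode {
    EOK = 0,
    EDNSTIMEDOUT, ETCPTIMEDOUT, EDNSDEAD,
    EBADIP,                  // newly retried: some seeds failed once on
                             // a bad ip and the crawl never repeated
    EDIFFBOTINTERNALERROR,   // assume diffbot is temporarily failing
    EDOCBANNED               // permanent: retrying will not help
};

static bool isTransient(ErrCode e) {
    switch (e) {
    case EDNSTIMEDOUT:
    case ETCPTIMEDOUT:
    case EDNSDEAD:
    case EBADIP:
    case EDIFFBOTINTERNALERROR:
        return true;
    default:
        return false;
    }
}

int main() {
    printf("EBADIP transient? %d\n", isTransient(EBADIP));         // 1
    printf("EDOCBANNED transient? %d\n", isTransient(EDOCBANNED)); // 0
    return 0;
}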
@@ -13459,6 +13571,7 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
 		cr->m_crawlInfoBuf.reserve(need);
 		// in case one was udp server timed out or something
+		cr->m_crawlInfoBuf.zeroOut();
 		cr->m_crawlInfoBuf.setLabel("cibuf");
 	}
 
 	CrawlInfo *cia = (CrawlInfo *)cr->m_crawlInfoBuf.getBufStart();
@@ -13656,6 +13769,10 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
 
 		// if spidering disabled in master controls then send no
 		// notifications
+		// crap, but then we can not update the round start time
+		// because that is done in doneSendingNotification().
+		// but why does it say all 32 report done, but then
+		// it has urls ready to spider?
 		if ( ! g_conf.m_spideringEnabled )
 			continue;
 
@@ -14466,6 +14583,9 @@ void SpiderLoop::buildActiveList ( ) {
 		// }
 	}
 
+	// MDW: let's not do this either unless it proves to be a prob
+	//if ( sc->m_needsRebuild ) active = true;
+
 	// we are at the tail of the linked list OR not in the list
 	cr->m_nextActive = NULL;
 
Spider.h (2 changes)
@@ -1136,7 +1136,7 @@ class SpiderColl {
 	bool m_isLoading;
 
 	// for scanning the wait tree...
-	bool m_isPopulating;
+	bool m_isPopulatingDoledb;
 	// for reading from spiderdb
 	//bool m_isReadDone;
 	bool m_didRead;