Fix for changing maxToCrawl / maxToProcess / maxCrawlRounds.

Fix for thinking a crawl is done when it is just taking
a while to populate doledb from the waiting tree for
that SpiderColl. We now call populateDoledbFromWaitingTree in
doneSleepingWrapper every 50ms. It loops over every collection,
so it could be made more efficient.
This commit is contained in:
Matt Wells 2015-03-12 13:15:52 -07:00
parent 3efb5bd310
commit 485c600c7c
3 changed files with 100 additions and 0 deletions

View File

@ -22006,6 +22006,41 @@ bool Parms::updateParm ( char *rec , WaitEntry *we ) {
cr->m_localCrawlInfo.m_lastSpiderAttempt = 0;
}
}
//
// if user changed the crawl/process max then reset here so
// spiders will resume
//
if ( base == cr &&
dst == (char *)&cr->m_maxToCrawl &&
cr->m_spiderStatus == SP_MAXTOCRAWL ) {
// reset this for rebuilding of active spider collections
// so this collection can be in the linked list again
cr->m_spiderStatus = SP_INPROGRESS;
// rebuild list of active spider collections then
g_spiderLoop.m_activeListValid = false;
}
if ( base == cr &&
dst == (char *)&cr->m_maxToProcess &&
cr->m_spiderStatus == SP_MAXTOPROCESS ) {
// reset this for rebuilding of active spider collections
// so this collection can be in the linked list again
cr->m_spiderStatus = SP_INPROGRESS;
// rebuild list of active spider collections then
g_spiderLoop.m_activeListValid = false;
}
if ( base == cr &&
dst == (char *)&cr->m_maxCrawlRounds &&
cr->m_spiderStatus == SP_MAXROUNDS ) {
// reset this for rebuilding of active spider collections
// so this collection can be in the linked list again
cr->m_spiderStatus = SP_INPROGRESS;
// rebuild list of active spider collections then
g_spiderLoop.m_activeListValid = false;
}
//
// END HACK
//

View File

@ -1209,6 +1209,7 @@ CollectionRec *SpiderColl::getCollectionRec ( ) {
SpiderColl::SpiderColl () {
m_overflowList = NULL;
m_lastOverflowFirstIp = 0;
m_lastPrinted = 0;
m_deleteMyself = false;
m_isLoading = false;
m_gettingList1 = false;
@ -2787,6 +2788,27 @@ int32_t SpiderColl::getNextIpFromWaitingTree ( ) {
return firstIp;
}
uint64_t SpiderColl::getNextSpiderTimeFromWaitingTree ( ) {
// if nothing to scan, bail
if ( m_waitingTree.isEmpty() ) return 0LL;
// the key
key_t mink; mink.setMin();
// set node from wait tree key. this way we can resume from a prev key
int32_t node = m_waitingTree.getNextNode (0,(char *)&mink );
// if empty, stop
if ( node < 0 ) return 0LL;
// get the key
key_t *wk = (key_t *)m_waitingTree.getKey ( node );
// time from that
uint64_t spiderTimeMS = (wk->n1);
spiderTimeMS <<= 32;
spiderTimeMS |= ((wk->n0) >> 32);
// stop if need to wait for this one
return spiderTimeMS;
}
static void gotSpiderdbListWrapper2( void *state , RdbList *list,Msg5 *msg5) {
SpiderColl *THIS = (SpiderColl *)state;
@ -13317,6 +13339,8 @@ void handleRequestc1 ( UdpSlot *slot , int32_t niceness ) {
uint32_t now = (uint32_t)getTimeGlobalNoCore();
uint64_t nowMS = gettimeofdayInMillisecondsGlobalNoCore();
//SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
@ -13370,6 +13394,44 @@ void handleRequestc1 ( UdpSlot *slot , int32_t niceness ) {
//g_conf.m_spideringEnabled &&
ci->m_lastSpiderAttempt - ci->m_lastSpiderCouldLaunch >
spiderDoneTimer ) {
// break it here for our collnum to see if
// doledb was just lagging or not.
bool printIt = true;
if ( g_now < sc->m_lastPrinted + 5 ) printIt = false;
if ( printIt ) sc->m_lastPrinted = g_now;
// doledb must be empty
if ( ! sc->m_doleIpTable.isEmpty() ) {
if ( printIt )
log("spider: not ending crawl because "
"doledb not empty for coll=%s",cr->m_coll);
goto doNotEnd;
}
uint64_t nextTimeMS ;
nextTimeMS = sc->getNextSpiderTimeFromWaitingTree ( );
// and no ips awaiting scans to get into doledb
// except for ips needing scans 60+ seconds from now
if ( nextTimeMS < nowMS + 60000 ) {
if ( printIt )
log("spider: not ending crawl because "
"waiting tree key is ready for scan "
"%"INT64" ms from now for coll=%s",
nextTimeMS - nowMS,cr->m_coll );
goto doNotEnd;
}
// maybe wait for waiting tree population to finish
if ( sc->m_waitingTreeNeedsRebuild ) {
if ( printIt )
log("spider: not ending crawl because "
"waiting tree is building for coll=%s",
cr->m_coll );
goto doNotEnd;
}
// this is the MOST IMPORTANT variable so note it
log(LOG_INFO,
"spider: coll %s has no more urls to spider",
@ -13382,6 +13444,7 @@ void handleRequestc1 ( UdpSlot *slot , int32_t niceness ) {
cr->m_needsSave = true;
}
doNotEnd:
int32_t hostId = slot->m_host->m_hostId;

View File

@ -1212,6 +1212,7 @@ class SpiderColl {
int32_t m_numAdded;
int64_t m_numBytesScanned;
int64_t m_lastPrintCount;
int64_t m_lastPrinted;
// used by SpiderLoop.cpp
int32_t m_spidersOut;
@ -1253,6 +1254,7 @@ class SpiderColl {
bool addToWaitingTree ( uint64_t spiderTime , int32_t firstIp ,
bool callForScan );
int32_t getNextIpFromWaitingTree ( );
uint64_t getNextSpiderTimeFromWaitingTree ( ) ;
void populateDoledbFromWaitingTree ( );
//bool scanSpiderdb ( bool needList );