mirror of https://github.com/privacore/open-source-search-engine.git (synced 2025-07-14 02:36:06 -04:00)
Minor spider code rearrangements
Spider.cpp (34 changed lines)
@@ -607,6 +607,8 @@ bool Spiderdb::init ( ) {
 			   sizeof(key128_t) );
 }
 
+
+
 // init the rebuild/secondary rdb, used by PageRepair.cpp
 bool Spiderdb::init2 ( int32_t treeMem ) {
 	// . what's max # of tree nodes?
@@ -632,19 +634,7 @@ bool Spiderdb::init2 ( int32_t treeMem ) {
 			   sizeof(key128_t));
 }
 
-/*
-bool Spiderdb::addColl ( char *coll, bool doVerify ) {
-	if ( ! m_rdb.addColl ( coll ) ) return false;
-	if ( ! doVerify ) return true;
-	// verify
-	if ( verify(coll) ) return true;
-	// if not allowing scale, return false
-	if ( ! g_conf.m_allowScale ) return false;
-	// otherwise let it go
-	log ( "db: Verify failed, but scaling is allowed, passing." );
-	return true;
-}
-*/
 
 
 bool Spiderdb::verify ( char *coll ) {
 	//return true;
@@ -721,6 +711,8 @@ bool Spiderdb::verify ( char *coll ) {
 	return true;
 }
 
+
+
 key128_t Spiderdb::makeKey ( int32_t firstIp ,
 			     int64_t urlHash48 ,
 			     bool isRequest ,
@@ -1061,22 +1053,6 @@ SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
 
 
 
-key_t makeWaitingTreeKey ( uint64_t spiderTimeMS , int32_t firstIp ) {
-	// sanity
-	if ( ((int64_t)spiderTimeMS) < 0 ) { char *xx=NULL;*xx=0; }
-	// make the wait tree key
-	key_t wk;
-	wk.n1 = (spiderTimeMS>>32);
-	wk.n0 = (spiderTimeMS&0xffffffff);
-	wk.n0 <<= 32;
-	wk.n0 |= (uint32_t)firstIp;
-	// sanity
-	if ( wk.n1 & 0x8000000000000000LL ) { char *xx=NULL;*xx=0; }
-	return wk;
-}
-
-
 
 ////////
 //
 // winner tree key. holds the top/best spider requests for a firstIp
Spider.h (2 changed lines)
@@ -1099,6 +1099,4 @@ key192_t makeWinnerTreeKey ( int32_t firstIp ,
 			     int64_t spiderTimeMS ,
 			     int64_t uh48 );
 
-key_t makeWaitingTreeKey ( uint64_t spiderTimeMS , int32_t firstIp );
-
 #endif
SpiderColl.cpp

@@ -307,7 +307,6 @@ char *SpiderColl::getCollName() {
 }
 
 
-
 // this one has to scan all of spiderdb
 bool SpiderColl::makeWaitingTree ( ) {
 
@@ -1534,6 +1533,7 @@ static void gotSpiderdbListWrapper2( void *state , RdbList *list,Msg5 *msg5) {
 }
 
 
+
 //////////////////
 //////////////////
 //
@@ -1551,7 +1551,7 @@ static void gotSpiderdbListWrapper2( void *state , RdbList *list,Msg5 *msg5) {
 // . scan spiderdb to make sure each firstip represented in spiderdb is
 //   in the waiting tree. it seems they fall out over time. we need to fix
 //   that but in the meantime this should do a bg repair. and is nice to have
-// . the waiting tree key is reall just a spidertime and a firstip. so we will
+// . the waiting tree key is really just a spidertime and a firstip. so we will
 //   still need populatedoledbfromwaitingtree to periodically scan firstips
 //   that are already in doledb to see if it has a higher-priority request
 //   for that firstip. in which case it can add that to doledb too, but then
@@ -1696,7 +1696,11 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
 			continue;
 		}
 		// if its a SpiderReply skip it
-		if ( ! g_spiderdb.isSpiderRequest ( (key128_t *)rec)) continue;
+		if ( ! g_spiderdb.isSpiderRequest ( (key128_t *)rec))
+		{
+			continue;
+		}
+
 		// cast it
 		SpiderRequest *sreq = (SpiderRequest *)rec;
 		// get first ip
@@ -1872,6 +1876,8 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
 		return;
 	}
 
+
+
 //static bool s_ufnTreeSet = false;
 //static RdbTree s_ufnTree;
 //static time_t s_lastUfnTreeFlushTime = 0;
@@ -2028,6 +2034,9 @@ void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
 	log(LOG_DEBUG,"spider: evalIpLoop: waitingtree nextip=%s "
 	    "numUsedNodes=%"INT32"",iptoa(ip),m_waitingTree.m_numUsedNodes);
 
+
+	//@@@@@@ BR: THIS SHOULD BE DEBUGGED AND ENABLED
+
 	/*
 	// assume using tree
 	m_useTree = true;
@@ -2177,8 +2186,6 @@ void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
 
 
 
-
-
 static void gotSpiderdbListWrapper ( void *state , RdbList *list , Msg5 *msg5){
 	SpiderColl *THIS = (SpiderColl *)state;
 	// prevent a core
@@ -2195,6 +2202,7 @@ static void gotSpiderdbListWrapper ( void *state , RdbList *list , Msg5 *msg5){
 }
 
 
+
 ///////////////////
 //
 // KEYSTONE FUNCTION
@@ -2614,6 +2622,7 @@ bool SpiderColl::readListFromSpiderdb ( ) {
 }
 
 
+
 static int32_t s_lastIn = 0;
 static int32_t s_lastOut = 0;
 
@@ -2635,6 +2644,8 @@ bool SpiderColl::isFirstIpInOverflowList ( int32_t firstIp ) {
 	return false;
 }
 
+
+
 // . ADDS top X winners to m_winnerTree
 // . this is ONLY CALLED from evalIpLoop() above
 // . scan m_list that we read from spiderdb for m_scanningIp IP
@@ -3429,36 +3440,38 @@ bool SpiderColl::scanListForWinners ( ) {
 		// so we can kick out a lower priority version of the same url.
 		int32_t winSlot = m_winnerTable.getSlot ( &uh48 );
 		if ( winSlot >= 0 ) {
-			key192_t *oldwk = (key192_t *)m_winnerTable.
-				getDataFromSlot ( winSlot );
+			key192_t *oldwk = (key192_t *)m_winnerTable.getDataFromSlot ( winSlot );
+
 			// get the min hopcount
 			SpiderRequest *wsreq ;
-			wsreq =(SpiderRequest *)m_winnerTree.
-				getData(0,(char *)oldwk);
+			wsreq =(SpiderRequest *)m_winnerTree.getData(0,(char *)oldwk);
+
 			if ( wsreq ) {
 				if ( sreq->m_hopCount < wsreq->m_hopCount )
 					wsreq->m_hopCount = sreq->m_hopCount;
+
 				if ( wsreq->m_hopCount < sreq->m_hopCount )
 					sreq->m_hopCount = wsreq->m_hopCount;
+
 				// and the min added time as well!
 				// get the oldest timestamp so
 				// gbssDiscoveryTime will be accurate.
 				if ( sreq->m_discoveryTime < wsreq->m_discoveryTime )
-					wsreq->m_discoveryTime =
-						sreq->m_discoveryTime;
+					wsreq->m_discoveryTime = sreq->m_discoveryTime;
+
 				if ( wsreq->m_discoveryTime < sreq->m_discoveryTime )
-					sreq->m_discoveryTime =
-						wsreq->m_discoveryTime;
+					sreq->m_discoveryTime = wsreq->m_discoveryTime;
 			}
 
 
 			// are we lower priority? (or equal)
 			// smaller keys are HIGHER priority.
-			if(KEYCMP((char *)&wk,(char *)oldwk,
-			   sizeof(key192_t))>=0)
+			if(KEYCMP( (char *)&wk, (char *)oldwk, sizeof(key192_t)) >= 0)
 			{
 				continue;
 			}
 			// from table too. no it's a dup uh48!
 			//m_winnerTable.deleteKey ( &uh48 );
 			// otherwise we supplant it. remove old key from tree.
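
Note on the hunk above: when scanListForWinners() sees a second SpiderRequest for a url hash (uh48) already in m_winnerTable, the two copies exchange their minimum hop count and oldest discovery time in both directions, so whichever copy ultimately wins the tree carries accurate values. A minimal standalone sketch of that merge; Req is a stand-in for the two SpiderRequest fields involved, not the real struct:

	#include <algorithm>
	#include <cstdint>

	// Stand-in for the two SpiderRequest fields the merge touches;
	// the real struct lives in Spider.h.
	struct Req {
		int32_t m_hopCount;
		int32_t m_discoveryTime;
	};

	// Mirrors the duplicate-url merge in scanListForWinners(): both
	// copies end up with the minimum hop count and the oldest discovery
	// time, so whichever record survives the winner tree reports
	// accurate values (e.g. for gbssDiscoveryTime).
	void mergeDup(Req &stored, Req &incoming) {
		int32_t hop  = std::min(stored.m_hopCount,      incoming.m_hopCount);
		int32_t disc = std::min(stored.m_discoveryTime, incoming.m_discoveryTime);
		stored.m_hopCount      = incoming.m_hopCount      = hop;
		stored.m_discoveryTime = incoming.m_discoveryTime = disc;
	}
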
@@ -3470,6 +3483,8 @@ bool SpiderColl::scanListForWinners ( ) {
 	int32_t maxWinners = (int32_t)MAX_WINNER_NODES; // 40
 	//if ( ! m_cr->m_isCustomCrawl ) maxWinners = 1;
 
+
+	//@todo BR: Why max winners based on bytes scanned??
 	// if less than 10MB of spiderdb requests limit to 400
 	if ( m_totalBytesScanned < 10000000 ) maxWinners = 400;
 
@@ -3797,6 +3812,7 @@ bool SpiderColl::scanListForWinners ( ) {
 	// have 10M spider requests for it.
 	// lower for testing
 	//if ( m_totalNewSpiderRequests > 1 )
+	// @todo BR: Another hardcoded limit..
 	if ( m_totalNewSpiderRequests > 10000000 )
 		overflow = true;
 
@@ -3868,15 +3884,13 @@ bool SpiderColl::scanListForWinners ( ) {
 	// END maintain firstip overflow list
 	//
 	/////
 
-
-
 
 	// ok we've updated m_bestRequest!!!
 	return true;
 }
 
 
 
 // . this is ONLY CALLED from evalIpLoop() above
 // . add another 0 entry into waiting tree, unless we had no winner
 // . add winner in here into doledb
@@ -4067,6 +4081,8 @@ bool SpiderColl::addWinnersIntoDoledb ( ) {
 	return addDoleBufIntoDoledb ( &doleBuf , false );//, 0 );
 }
 
+
+
 bool SpiderColl::validateDoleBuf ( SafeBuf *doleBuf ) {
 	char *doleBufEnd = doleBuf->getBuf();
 	// get offset
@@ -4100,6 +4116,8 @@ bool SpiderColl::validateDoleBuf ( SafeBuf *doleBuf ) {
 	return true;
 }
 
+
+
 bool SpiderColl::addDoleBufIntoDoledb ( SafeBuf *doleBuf, bool isFromCache ) {
 	//			uint32_t cachedTimestamp ) {
 
@@ -4648,6 +4666,8 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
 	return spiderTimeMS;
 }
 
+
+
 // . returns false with g_errno set on error
 // . Rdb.cpp should call this when it receives a doledb key
 // . when trying to add a SpiderRequest to the waiting tree we first check
@@ -4716,7 +4736,6 @@ bool SpiderColl::addToDoleTable ( SpiderRequest *sreq ) {
 		// reset scan for this priority in doledb
 		m_nextKeys [pri] =g_doledb.makeFirstKey2 ( pri );
 
-
 	return true;
 }
 
@@ -4739,6 +4758,8 @@ void SpiderColl::devancePriority() {
 	m_msg5StartKey = m_nextDoledbKey;
 }
 
+
+
 void SpiderColl::setPriority(int32_t pri) {
 	m_pri2 = pri;
 	m_nextDoledbKey = m_nextKeys [ m_pri2 ];
@@ -4750,3 +4771,22 @@ bool SpiderColl::printStats ( SafeBuf &sb ) {
 	return true;
 }
 
+
+
+
+key_t makeWaitingTreeKey ( uint64_t spiderTimeMS , int32_t firstIp ) {
+	// sanity
+	if ( ((int64_t)spiderTimeMS) < 0 ) { char *xx=NULL;*xx=0; }
+	// make the wait tree key
+	key_t wk;
+	wk.n1 = (spiderTimeMS>>32);
+	wk.n0 = (spiderTimeMS&0xffffffff);
+	wk.n0 <<= 32;
+	wk.n0 |= (uint32_t)firstIp;
+	// sanity
+	if ( wk.n1 & 0x8000000000000000LL ) { char *xx=NULL;*xx=0; }
+	return wk;
+}
+
+
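
The function added above (moved here from Spider.cpp) packs a 64-bit spider time in milliseconds and the 32-bit firstIp into a single 96-bit key, so an in-order walk of the waiting tree visits IPs in the order they become ready to spider. A self-contained sketch of the same packing and ordering; Key96 is a stand-in for the codebase's key_t, assumed here to keep the high 32 bits in n1 and the low 64 bits in n0:

	#include <cstdint>
	#include <cassert>

	// Stand-in for the codebase's 96-bit key_t: n1 holds the high 32
	// bits, n0 the low 64. An assumption for illustration only.
	struct Key96 {
		uint64_t n0;
		uint32_t n1;
	};

	// Same packing as makeWaitingTreeKey(): time-ms in the top 64 bits,
	// firstIp in the bottom 32, so keys sort by readiness time first
	// and by IP second.
	Key96 makeKey(uint64_t spiderTimeMS, int32_t firstIp) {
		Key96 k;
		k.n1  = (uint32_t)(spiderTimeMS >> 32);
		k.n0  = (spiderTimeMS & 0xffffffffULL) << 32;
		k.n0 |= (uint32_t)firstIp;
		return k;
	}

	bool lessThan(const Key96 &a, const Key96 &b) {
		if (a.n1 != b.n1) return a.n1 < b.n1;
		return a.n0 < b.n0;
	}

	int main() {
		// an entry ready sooner sorts first, regardless of its IP
		Key96 early = makeKey(1000, /*firstIp=*/0x7f000001);
		Key96 late  = makeKey(2000, /*firstIp=*/0x0a000001);
		assert(lessThan(early, late));
		return 0;
	}

The sanity checks in the real function trap a negative time or a set top bit by writing through a NULL pointer, the codebase's idiom for a deliberate crash.
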
SpiderColl.h

@@ -286,5 +286,7 @@ class SpiderColl {
 	class CollectionRec *m_cr;
 };
 
+key_t makeWaitingTreeKey ( uint64_t spiderTimeMS , int32_t firstIp );
+
 #endif
SpiderLoop.cpp (194 changed lines)
@@ -117,6 +117,8 @@ void SpiderLoop::reset() {
 
 void updateAllCrawlInfosSleepWrapper ( int fd , void *state ) ;
 
+
+
 void SpiderLoop::startLoop ( ) {
 	if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: BEGIN", __FILE__, __func__, __LINE__);
 
@@ -171,8 +173,9 @@ void SpiderLoop::startLoop ( ) {
 	//g_spiderLoop.spiderDoledUrls( );
 	// sleep for .1 seconds = 100ms
 	if (!g_loop.registerSleepCallback(50,this,doneSleepingWrapperSL))
-		log("build: Failed to register timer callback. Spidering "
-		    "is permanently disabled. Restart to fix.");
+	{
+		log(LOG_ERROR, "build: Failed to register timer callback. Spidering is permanently disabled. Restart to fix.");
+	}
 
 	// crawlinfo updating
 	// save bandwidth for now make this every 4 seconds not 1 second
@@ -187,11 +190,15 @@ void SpiderLoop::startLoop ( ) {
 	if ( !g_loop.registerSleepCallback(20000,
 					   this,
 					   updateAllCrawlInfosSleepWrapper))
-		log("build: failed to register updatecrawlinfowrapper");
+	{
+		log(LOG_ERROR, "build: failed to register updatecrawlinfowrapper");
+	}
 
 	if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END", __FILE__, __func__, __LINE__);
 }
 
 
+
 // call this every 50ms it seems to try to spider urls and populate doledb
 // from the waiting tree
 void doneSleepingWrapperSL ( int fd , void *state ) {
@@ -296,15 +303,19 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
 		// flush the ufn table
 		//clearUfnTable();
 	}
 
 
+	//@@@@@@
+	//@@@ BR: Why not check m_waitingTreeNeedsRebuild before calling??
 	// try this then. it just returns if
 	// sc->m_waitingTreeNeedsRebuild is false so it
 	// should be fast in those cases
-	sc->populateWaitingTreeFromSpiderdb ( false );
 	//}
 	// if list was modified a collection was deleted/added
 	//if ( g_spiderLoop.m_activeListModified ) goto top;
 
 	// re-entry is false because we are entering for the first time
+	if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: Calling populateWaitingTreeFromSpiderdb", __FILE__, __func__, __LINE__);
+	sc->populateWaitingTreeFromSpiderdb ( false );
 
 
+	if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: Calling populateDoledbFromWaitingTree", __FILE__, __func__, __LINE__);
 	sc->populateDoledbFromWaitingTree ( );
 	// . skip if still loading doledb lists from disk this round
 	// . we use m_didRound to share spiders across all collections
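
For orientation, the 50ms tick above drives two pumps per active collection, in order. A hypothetical distillation; tick() and SpiderCollLike are illustrations, not code from the repo:

	// Stand-in interface for SpiderColl; the real class is in SpiderColl.h.
	struct SpiderCollLike {
		void populateWaitingTreeFromSpiderdb(bool reentry);
		void populateDoledbFromWaitingTree();
	};

	// Hypothetical wrapper showing the order of the two calls made for
	// each active collection on every 50ms tick.
	void tick(SpiderCollLike *sc) {
		// 1. background repair: re-add any firstIp present in spiderdb
		//    that fell out of the waiting tree. It returns quickly when
		//    no rebuild is pending, which is why the wrapper calls it
		//    unconditionally.
		sc->populateWaitingTreeFromSpiderdb(false); // false = not a re-entry
		// 2. move ready firstIps from the waiting tree into doledb,
		//    where spiderDoledUrls() will pick them up.
		sc->populateDoledbFromWaitingTree();
	}
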
@@ -356,6 +367,7 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
 
 
 	// spider some urls that were doled to us
+	if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: Calling spiderDoledUrls", __FILE__, __func__, __LINE__);
 	g_spiderLoop.spiderDoledUrls( );
 }
 
@@ -394,6 +406,8 @@ void gotDoledbListWrapper2 ( void *state , RdbList *list , Msg5 *msg5 ) {
 // now check our RDB_DOLEDB for SpiderRequests to spider!
 void SpiderLoop::spiderDoledUrls ( ) {
 
+	if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: BEGIN", __FILE__, __func__, __LINE__);
+
 	//char *reb = g_rebalance.getNeedsRebalance();
 	//if ( ! reb || *reb ) {return;
 
@@ -452,38 +466,109 @@ void SpiderLoop::spiderDoledUrls ( ) {
 	QUICKPOLL(MAX_NICENESS);
 
 	// must be spidering to dole out
-	if ( ! g_conf.m_spideringEnabled ) return;
-	if ( ! g_hostdb.getMyHost( )->m_spiderEnabled ) return;
+	if ( ! g_conf.m_spideringEnabled )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, spidering disabled", __FILE__, __func__, __LINE__);
+		return;
+	}
+	if ( ! g_hostdb.getMyHost( )->m_spiderEnabled )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, spidering disabled (2)", __FILE__, __func__, __LINE__);
+		return;
+	}
 
 	// or if trying to exit
-	if ( g_process.m_mode == EXIT_MODE ) return;
+	if ( g_process.m_mode == EXIT_MODE )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, shutting down", __FILE__, __func__, __LINE__);
+		return;
+	}
 
 	// if we don't have all the url counts from all hosts, then wait.
 	// one host is probably down and was never up to begin with
-	if ( ! s_countsAreValid ) return;
-	//if ( ! g_conf.m_webSpideringEnabled ) return;
+	if ( ! s_countsAreValid )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, counts not valid", __FILE__, __func__, __LINE__);
+		return;
+	}
 
 	// if we do not overlap ourselves
-	if ( m_gettingDoledbList ) return;
+	if ( m_gettingDoledbList )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, already getting DoledbList", __FILE__, __func__, __LINE__);
+		return;
+	}
 
 	// bail instantly if in read-only mode (no RdbTrees!)
-	if ( g_conf.m_readOnlyMode ) return;
+	if ( g_conf.m_readOnlyMode )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, in read-only mode", __FILE__, __func__, __LINE__);
+		return;
+	}
 
 	// or if doing a daily merge
-	if ( g_dailyMerge.m_mergeMode ) return;
+	if ( g_dailyMerge.m_mergeMode )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, doing daily merge", __FILE__, __func__, __LINE__);
+		return;
+	}
 
 	// skip if too many udp slots being used
-	if ( g_udpServer.getNumUsedSlotsIncoming() >= MAXUDPSLOTS ) return;
+	if ( g_udpServer.getNumUsedSlotsIncoming() >= MAXUDPSLOTS )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, using max UDP slots", __FILE__, __func__, __LINE__);
+		return;
+	}
 
 	// stop if too many out. this is now 50 down from 500.
-	if ( m_numSpidersOut >= MAX_SPIDERS ) return;
+	if ( m_numSpidersOut >= MAX_SPIDERS )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, reached max spiders", __FILE__, __func__, __LINE__);
+		return;
+	}
 
 	// a new global conf rule
-	if ( m_numSpidersOut >= g_conf.m_maxTotalSpiders ) return;
+	if ( m_numSpidersOut >= g_conf.m_maxTotalSpiders )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, reached max total spiders", __FILE__, __func__, __LINE__);
+		return;
+	}
 
 	// bail if no collections
-	if ( g_collectiondb.m_numRecs <= 0 ) return;
+	if ( g_collectiondb.m_numRecs <= 0 )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, no collections", __FILE__, __func__, __LINE__);
+		return;
+	}
 
 	// not while repairing
-	if ( g_repairMode ) return;
+	if ( g_repairMode )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, in repair mode", __FILE__, __func__, __LINE__);
+		return;
+	}
 
 	// do not spider until collections/parms in sync with host #0
-	if ( ! g_parms.m_inSyncWithHost0 ) return;
+	if ( ! g_parms.m_inSyncWithHost0 )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, not in sync with host#0", __FILE__, __func__, __LINE__);
+		return;
+	}
 
 	// don't spider if not all hosts are up, or they do not all
 	// have the same hosts.conf.
-	if ( ! g_pingServer.m_hostsConfInAgreement ) return;
+	if ( ! g_pingServer.m_hostsConfInAgreement )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, host config disagreement", __FILE__, __func__, __LINE__);
+		return;
+	}
 
 	// if nothin in the active list then return as well
-	if ( ! m_activeList ) return;
+	if ( ! m_activeList )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, nothing in active list", __FILE__, __func__, __LINE__);
+		return;
+	}
 
 	// if we hit the end of the list, wrap it around
 	if ( ! m_crx ) m_crx = m_activeList;
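
Every early-out in the hunk above now repeats the same five-line shape: condition, brace, trace line, return, brace. A hypothetical helper, not part of this commit, showing how the pattern could be collapsed; it assumes only g_conf.m_logTraceSpider, log() and LOG_TRACE, which the surrounding code already uses:

	// Hypothetical convenience macro: bail out of
	// SpiderLoop::spiderDoledUrls() with a trace line when a
	// precondition fails. msg must be a string literal so it
	// concatenates into the format string.
	#define RETURN_IF(cond, msg) \
		do { \
			if (cond) { \
				if (g_conf.m_logTraceSpider) \
					log(LOG_TRACE, "%s:%s:%d: END, " msg, \
					    __FILE__, __func__, __LINE__); \
				return; \
			} \
		} while (0)

	// Usage equivalent to two of the guards above:
	//	RETURN_IF(!g_conf.m_spideringEnabled, "spidering disabled");
	//	RETURN_IF(g_conf.m_readOnlyMode, "in read-only mode");
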
@@ -501,7 +586,10 @@ void SpiderLoop::spiderDoledUrls ( ) {
 	// spider. i could see a single collection dominating all the spider
 	// slots in some scenarios with this approach unfortunately.
 	if ( m_crx == m_bookmark && ! firstTime && m_launches == 0 )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, end of list?", __FILE__, __func__, __LINE__);
 		return;
+	}
 
 	// reset # launches after doing a round and having launched > 0
 	if ( m_crx == m_bookmark && ! firstTime )
@@ -513,10 +601,17 @@ void SpiderLoop::spiderDoledUrls ( ) {
 	// we don't core trying to access a delete collectionrec.
 	// i'm not sure if this can happen here but i put this in as a
 	// precaution.
-	if ( ! m_activeListValid ) { m_crx = NULL; goto collLoop; }
+	if ( ! m_activeListValid ) {
+		m_crx = NULL;
+		goto collLoop;
+	}
 
 	// return now if list is just empty
-	if ( ! m_activeList ) return;
+	if ( ! m_activeList )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, active list empty", __FILE__, __func__, __LINE__);
+		return;
+	}
 
 	cr = m_crx;
 
@@ -527,7 +622,11 @@ void SpiderLoop::spiderDoledUrls ( ) {
 	// get the spider collection for this collnum
 	m_sc = g_spiderCache.getSpiderColl(cr->m_collnum);//m_cri);
 	// skip if none
-	if ( ! m_sc ) goto subloop;
+	if ( ! m_sc ) {
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: Loop, no spider cache for this collection", __FILE__, __func__, __LINE__);
+		goto subloop;
+	}
 
 	// always reset priority to max at start
 	m_sc->setPriority ( MAX_SPIDER_PRIORITIES - 1 );
 
@@ -540,7 +639,8 @@ void SpiderLoop::spiderDoledUrls ( ) {
 	// get rec
 	//cr = g_collectiondb.m_recs[m_cri];
 	// skip if gone
 	if ( ! cr ) goto subloop;
+
 	// stop if not enabled
 	if ( ! cr->m_spideringEnabled ) goto subloop;
 
@@ -558,6 +658,8 @@ void SpiderLoop::spiderDoledUrls ( ) {
 		cr->m_needsSave = true;
 		cr->m_spiderStatus = SP_MAXROUNDS;
 	}
+
+	if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: Loop, crawl round max reached", __FILE__, __func__, __LINE__);
 	goto subloop;
 }
 
@@ -568,7 +670,10 @@ void SpiderLoop::spiderDoledUrls ( ) {
 	     cr->m_maxToCrawl ) {
 		// should we resend our local crawl info to all hosts?
 		if ( cr->m_localCrawlInfo.m_hasUrlsReadyToSpider )
+		{
 			cr->localCrawlInfoUpdate();
+		}
+
 		// now once all hosts have no urls ready to spider
 		// then the send email code will be called.
 		// do it this way for code simplicity.
@@ -578,6 +683,8 @@ void SpiderLoop::spiderDoledUrls ( ) {
 		cr->m_needsSave = true;
 		cr->m_spiderStatus = SP_MAXTOCRAWL;
 	}
+
+	if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: Loop, max pages to crawl reached", __FILE__, __func__, __LINE__);
 	goto subloop;
 }
 
@@ -596,6 +703,8 @@ void SpiderLoop::spiderDoledUrls ( ) {
 		cr->m_needsSave = true;
 		cr->m_spiderStatus = SP_MAXTOPROCESS;
 	}
+
+	if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: Loop, max pages to process reached", __FILE__, __func__, __LINE__);
 	goto subloop;
 }
 
@@ -662,7 +771,11 @@ void SpiderLoop::spiderDoledUrls ( ) {
 	// . if m_collectiveRespiderFrequency was set to 0.0 then
 	//   PageCrawlBot.cpp also sets m_roundStartTime to 0.
 	//
-	if ( nowGlobal < cr->m_spiderRoundStartTime ) goto subloop;
+	if ( nowGlobal < cr->m_spiderRoundStartTime )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: Loop, Spider start time not reached", __FILE__, __func__, __LINE__);
+		goto subloop;
+	}
 
 	// if populating this collection's waitingtree assume
 	// we would have found something to launch as well. it might
@@ -684,25 +797,35 @@ void SpiderLoop::spiderDoledUrls ( ) {
 		if ( g_conf.m_testSpiderEnabled ) maxSpiders = 6;
 	}
 
+	if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: maxSpiders: %"INT32"", __FILE__, __func__, __LINE__, maxSpiders);
+
 	// if some spiders are currently outstanding
 	if ( m_sc->m_spidersOut )
+	{
 		// do not end the crawl until empty of urls because
 		// that url might end up adding more links to spider
 		// when it finally completes
 		ci->m_lastSpiderCouldLaunch = nowGlobal;
+	}
+
 	// debug log
 	//if ( g_conf.m_logDebugSpider )
 	//	log("spider: has %"INT32" spiders out",m_sc->m_spidersOut);
 	// obey max spiders per collection too
 	if ( m_sc->m_spidersOut >= maxSpiders )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: Loop, Too many spiders active for collection", __FILE__, __func__, __LINE__);
 		goto subloop;
+	}
 
 	// shortcut
 	SpiderColl *sc = cr->m_spiderColl;
 
 	if ( sc && sc->m_doleIpTable.isEmpty() )
+	{
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: Loop, doleIpTable is empty", __FILE__, __func__, __LINE__);
 		goto subloop;
+	}
 
 	/*
 	// . HACK.
@@ -805,6 +928,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
 		// assume we would have launched a spider for this coll
 		ci->m_lastSpiderCouldLaunch = nowGlobal;
 		// wait for sleep callback to re-call us in 10ms
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, still waiting for lock reply", __FILE__, __func__, __LINE__);
 		return;
 	}
 
@@ -834,6 +958,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
 		//	m_sc->m_lastDoledbReadEmpty = true;
 		// and go up top
 		//goto collLoop;
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: Loop, pri2 < 0", __FILE__, __func__, __LINE__);
 		goto subloop;
 	}
 
@@ -893,6 +1018,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
 		//	// g_doledb.makeFirstKey2(m_sc->m_pri);
 		//	m_sc->m_nextDoledbKey = m_sc->m_nextKeys[m_sc->m_pri2];
 		// and try again
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: Loop, trying previous priority", __FILE__, __func__, __LINE__);
 		goto loop;
 	}
 
@@ -911,6 +1037,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
 	// seems like we need this reset here... strange
 	m_list.reset();
 
+	if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: Getting list (msg5)", __FILE__, __func__, __LINE__);
 	// get a spider rec for us to spider from doledb (mdw)
 	if ( ! m_msg5.getList ( RDB_DOLEDB ,
 				cr->m_collnum, // coll ,
@@ -941,8 +1068,12 @@ void SpiderLoop::spiderDoledUrls ( ) {
 				gotDoledbListWrapper2 ,
 				MAX_NICENESS , // niceness
 				true ))// do error correction?
+	{
 		// return if it blocked
-		return ;
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, getList blocked", __FILE__, __func__, __LINE__);
+		return;
+	}
 
 	// debug
 	//log(LOG_DEBUG,"spider: read list of %"INT32" bytes from spiderdb for "
 	//	"pri=%"INT32"+",m_list.m_listSize,(int32_t)m_sc->m_pri);
@@ -955,16 +1086,19 @@ void SpiderLoop::spiderDoledUrls ( ) {
 	// . returns true if we should read another list
 	// . will set startKey to next key to start at
 	bool status = gotDoledbList2 ( );
+	if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: Back from gotDoledList2. Get more? %s", __FILE__, __func__, __LINE__, status?"true":"false");
 
 	// if we did not launch anything, then decrement priority and
 	// try again. but if priority hits -1 then subloop2 will just go to
 	// the next collection.
 	if ( saved == m_launches ) {
 		m_sc->devancePriority();
+		if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: Loop, get next priority", __FILE__, __func__, __LINE__);
 		goto subloopNextPriority;
 	}
 
+	if( g_conf.m_logTraceSpider ) log(LOG_TRACE,"%s:%s:%d: END, loop", __FILE__, __func__, __LINE__);
 	if ( status ) {
 		// . if priority is -1 that means try next priority
 		// . DO NOT reset the whole scan. that was what was happening
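
The control flow in this last stretch amounts to a descending-priority scan: read one doledb list at the current priority; if nothing launched, devancePriority() steps down a level and subloopNextPriority retries; once the priority goes negative the loop moves on to the next collection. A hypothetical distillation under those assumptions; readListAt, launchedAt, and scanCollection are stand-ins, not functions in the codebase:

	#include <cstdint>

	// Stand-ins, not repo functions: "read one doledb list at this
	// priority" and "did that list launch at least one spider".
	bool readListAt(int32_t pri);   // false = read blocked, resume via callback
	bool launchedAt(int32_t pri);

	// Stay on a priority while it launches spiders, step down when it
	// does not, and give up on this collection when pri goes negative.
	void scanCollection(int32_t maxPriorities) {
		for (int32_t pri = maxPriorities - 1; pri >= 0; pri--) {
			if (!readListAt(pri)) return;  // blocked; callback re-enters
			if (launchedAt(pri)) return;   // launched; next tick re-enters
			// else: the equivalent of m_sc->devancePriority(), try lower
		}
		// pri < 0: the subloop moves on to the next collection
	}
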