forked from Mirrors/privacore-open-source-search-engine
Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing
This commit is contained in:
@ -827,12 +827,18 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
|
||||
// actually have a shot at deleting it
|
||||
sc->m_deleteMyself = true;
|
||||
// cr will be invalid shortly after this
|
||||
sc->m_cr = NULL;
|
||||
// MDW: this is causing the core...
|
||||
// use fake ptrs for easier debugging
|
||||
//sc->m_cr = (CollectionRec *)0x99999;//NULL;
|
||||
//sc->m_cr = NULL;
|
||||
sc->setCollectionRec ( NULL );
|
||||
// this will put it on "death row" so it will be deleted
|
||||
// once Msg5::m_waitingForList/Merge is NULL
|
||||
tryToDeleteSpiderColl ( sc ,"10");
|
||||
//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
|
||||
//delete ( sc );
|
||||
// don't let cr reference us anymore, sc is on deathrow
|
||||
//cr->m_spiderColl = (SpiderColl *)0x8888;//NULL;
|
||||
cr->m_spiderColl = NULL;
|
||||
}
|
||||
|
||||
@ -1655,6 +1661,7 @@ void CollectionRec::reset() {
|
||||
}
|
||||
|
||||
SpiderColl *sc = m_spiderColl;
|
||||
//if ( sc == (SpiderColl *)0x8888 ) return;
|
||||
// if never made one, we are done
|
||||
if ( ! sc ) return;
|
||||
|
||||
|
@ -409,7 +409,7 @@ char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
|
||||
HashTableX *dt = &sc->m_siteListDomTable;
|
||||
|
||||
// get this
|
||||
CollectionRec *cr = sc->m_cr;
|
||||
CollectionRec *cr = sc->getCollectionRec();
|
||||
|
||||
// need to build dom table for pattern matching?
|
||||
if ( dt->getNumSlotsUsed() == 0 && cr ) {
|
||||
|
43
Spider.cpp
43
Spider.cpp
@ -1054,8 +1054,9 @@ bool tryToDeleteSpiderColl ( SpiderColl *sc , char *msg ) {
|
||||
// . make sure nobody has it
|
||||
// . cr might be NULL because Collectiondb.cpp::deleteRec2() might
|
||||
// have nuked it
|
||||
CollectionRec *cr = sc->m_cr;
|
||||
if ( cr ) cr->m_spiderColl = NULL;
|
||||
//CollectionRec *cr = sc->m_cr;
|
||||
// use fake ptrs for easier debugging
|
||||
//if ( cr ) cr->m_spiderColl = NULL;
|
||||
mdelete ( sc , sizeof(SpiderColl),"postdel1");
|
||||
delete ( sc );
|
||||
return true;
|
||||
@ -1094,7 +1095,7 @@ SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
|
||||
//m_spiderColls [ collnum ] = sc;
|
||||
cr->m_spiderColl = sc;
|
||||
// note it
|
||||
log(LOG_DEBUG,"spider: made spidercoll=%lx for cr=%lx",
|
||||
logf(LOG_DEBUG,"spider: made spidercoll=%lx for cr=%lx",
|
||||
(long)sc,(long)cr);
|
||||
// update this
|
||||
//if ( m_numSpiderColls < collnum + 1 )
|
||||
@ -1107,8 +1108,16 @@ SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
|
||||
if ( ! strcmp ( cr->m_coll,"qatest123" ) ) sc->m_isTestColl = true;
|
||||
else sc->m_isTestColl = false;
|
||||
|
||||
// set this
|
||||
sc->setCollectionRec ( cr ); // sc->m_cr = cr;
|
||||
|
||||
// set first doledb scan key
|
||||
sc->m_nextDoledbKey.setMin();
|
||||
|
||||
// turn off quickpolling while loading in case a parm update comes in
|
||||
bool saved = g_conf.m_useQuickpoll;
|
||||
g_conf.m_useQuickpoll = false;
|
||||
|
||||
// mark it as loading so it can't be deleted while loading
|
||||
sc->m_isLoading = true;
|
||||
// . load its tables from disk
|
||||
@ -1117,12 +1126,16 @@ SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
|
||||
sc->load();
|
||||
// mark it as no longer loading
|
||||
sc->m_isLoading = false;
|
||||
// set this
|
||||
sc->m_cr = cr;
|
||||
|
||||
// restore
|
||||
g_conf.m_useQuickpoll = saved;
|
||||
|
||||
// did crawlbottesting delete it right away?
|
||||
if ( tryToDeleteSpiderColl( sc ,"1") ) return NULL;
|
||||
// sanity check
|
||||
if ( ! cr ) { char *xx=NULL;*xx=0; }
|
||||
//if ( ! cr ) { char *xx=NULL;*xx=0; }
|
||||
// deleted right away?
|
||||
//if ( sc->getCollectionRec() == NULL ) { char *xx=NULL;*xx=0; }
|
||||
// note it!
|
||||
log(LOG_DEBUG,"spider: adding new spider collection for %s",
|
||||
cr->m_coll);
|
||||
@ -1134,6 +1147,17 @@ SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
|
||||
///////////////////////// SpiderColl
|
||||
/////////////////////////
|
||||
|
||||
// Setter for the private m_cr pointer: records which CollectionRec this
// SpiderColl refers to. Callers pass NULL to detach the SpiderColl from
// its collection record. Nothing else is touched here; the commented-out
// log line below can be re-enabled to trace every assignment.
void SpiderColl::setCollectionRec ( CollectionRec *cr ) {
|
||||
m_cr = cr;
|
||||
// this was useful for debugging a null m_cr bug
|
||||
//log("sc: sc 0x%lx setting cr to 0x%lx",(long)this,(long)cr);
|
||||
}
|
||||
|
||||
// Getter for the private m_cr pointer. Returns whatever was last stored
// via setCollectionRec(), which may be NULL (the constructor initializes
// m_cr to NULL and callers may reset it to NULL), so callers should check
// the result before dereferencing it.
CollectionRec *SpiderColl::getCollectionRec ( ) {
|
||||
//log("sc: sc 0x%lx getting cr of 0x%lx",(long)this,(long)m_cr);
|
||||
return m_cr;
|
||||
}
|
||||
|
||||
SpiderColl::SpiderColl () {
|
||||
m_deleteMyself = false;
|
||||
m_isLoading = false;
|
||||
@ -1145,6 +1169,7 @@ SpiderColl::SpiderColl () {
|
||||
m_numBytesScanned = 0;
|
||||
m_lastPrintCount = 0;
|
||||
m_siteListIsEmptyValid = false;
|
||||
m_cr = NULL;
|
||||
//m_lastSpiderAttempt = 0;
|
||||
//m_lastSpiderCouldLaunch = 0;
|
||||
//m_numRoundsDone = 0;
|
||||
@ -5882,7 +5907,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
|
||||
// shortcut
|
||||
//CollectionRec *cr = m_sc->m_cr;
|
||||
// sanity
|
||||
if ( cr != m_sc->m_cr ) { char *xx=NULL;*xx=0; }
|
||||
if ( cr != m_sc->getCollectionRec() ) { char *xx=NULL;*xx=0; }
|
||||
// skip the priority if we already have enough spiders on it
|
||||
long out = m_sc->m_outstandingSpiders[m_sc->m_pri2];
|
||||
// how many spiders can we have out?
|
||||
@ -6035,7 +6060,7 @@ bool SpiderLoop::gotDoledbList2 ( ) {
|
||||
m_gettingDoledbList = false;
|
||||
|
||||
// shortcuts
|
||||
CollectionRec *cr = m_sc->m_cr;
|
||||
CollectionRec *cr = m_sc->getCollectionRec();
|
||||
CrawlInfo *ci = &cr->m_localCrawlInfo;
|
||||
|
||||
// update m_msg5StartKey for next read
|
||||
@ -6521,7 +6546,7 @@ bool SpiderLoop::gotDoledbList2 ( ) {
|
||||
//char *coll = m_sc->m_cr->m_coll;
|
||||
// sometimes the spider coll is reset/deleted while we are
|
||||
// trying to get the lock in spiderUrl9() so let's use collnum
|
||||
collnum_t collnum = m_sc->m_cr->m_collnum;
|
||||
collnum_t collnum = m_sc->getCollectionRec()->m_collnum;
|
||||
|
||||
// . spider that. we don't care whether it blocks or not
|
||||
// . crap, it will need to block to get the locks!
|
||||
|
7
Spider.h
7
Spider.h
@ -1067,6 +1067,9 @@ class SpiderColl {
|
||||
~SpiderColl ( );
|
||||
SpiderColl ( ) ;
|
||||
|
||||
void setCollectionRec ( class CollectionRec *cr );
|
||||
class CollectionRec *getCollectionRec ( );
|
||||
|
||||
void clearLocks();
|
||||
|
||||
// called by main.cpp on exit to free memory
|
||||
@ -1207,7 +1210,6 @@ class SpiderColl {
|
||||
collnum_t m_collnum;
|
||||
char m_coll [ MAX_COLL_LEN + 1 ] ;
|
||||
class CollectionRec *getCollRec();
|
||||
class CollectionRec *m_cr;
|
||||
char *getCollName();
|
||||
bool m_isTestColl;
|
||||
|
||||
@ -1286,6 +1288,9 @@ class SpiderColl {
|
||||
long m_outstandingSpiders[MAX_SPIDER_PRIORITIES];
|
||||
|
||||
bool printStats ( SafeBuf &sb ) ;
|
||||
|
||||
private:
|
||||
class CollectionRec *m_cr;
|
||||
};
|
||||
|
||||
class SpiderCache {
|
||||
|
Reference in New Issue
Block a user