fix rdbcache corruption bugs for winnerlistcache.

This commit is contained in:
Matt Wells 2015-03-07 11:09:06 -08:00
parent 102f2c1ea0
commit c6a59d0810
2 changed files with 113 additions and 56 deletions

@ -15,6 +15,7 @@
//#include "Msg10.h" // g_deadWaitCache
#include "Dns.h"
#include "BigFile.h"
#include "Spider.h"
bool g_cacheWritesEnabled = true;
@ -467,6 +468,12 @@ bool RdbCache::getRecord ( collnum_t collnum ,
}
// return ptr to rec
char *p = m_ptrs[n];
// a collnum of -1 means RdbCache::clear() marked this record as
// cleared, so treat it as a cache miss. this is kinda hacky.
if ( *(collnum_t *)p == (collnum_t)-1 ) {
if ( incCounts ) m_numMisses++;
return false;
}
// skip over collnum and key
//p += sizeof(collnum_t) + sizeof(key_t);
p += sizeof(collnum_t) + m_cks;
@ -594,9 +601,12 @@ bool RdbCache::getRecord ( collnum_t collnum ,
//int32_t n = hash32 ( cacheKey , m_cks ) % m_numPtrsMax;
//if ( this == &g_robotdb.m_rdbCache )
// logf(LOG_DEBUG, "db: cachebug: promoting record "
// "k.n0=0x%"XINT64" n=%"INT32"",((key_t *)cacheKey)->n0,
// *recSize);
// if ( this == &g_spiderLoop.m_winnerListCache ) {
// logf(LOG_DEBUG, "db: cachebug: promoting record "
// "k.n0=0x%"XINT64" n=%"INT32"",
// ((key_t *)cacheKey)->n0,
// *recSize);
// }
char *retRec = NULL;
addRecord ( collnum , cacheKey , *rec , *recSize , timestamp ,
&retRec );
@ -818,10 +828,11 @@ bool RdbCache::addRecord ( collnum_t collnum ,
"cache. Max size is %i.",need,m_dbname,BUFSIZE);
// if too many slots in hash table used free one up
while ( m_numPtrsUsed >= m_threshold )
while ( m_numPtrsUsed >= m_threshold ) {
if ( ! deleteRec() ) {
return false;
}
}
// . do NOT split across buffers, align on a boundary if we need to
// . "i1" is where we PLAN to store the record
@ -858,6 +869,10 @@ bool RdbCache::addRecord ( collnum_t collnum ,
i2c = i1c + need;
}
// save for debug
//int32_t saved = m_tail;
// . increase m_tail so it is NOT in the range: [i1,i2b)
// . NEVER do this if we are the first rec added though, because
// m_tail will equal i1 at that point...
@ -884,15 +899,22 @@ bool RdbCache::addRecord ( collnum_t collnum ,
//if ( start <= rec2 && start+32>= rec2 ) { char*xx=NULL;*xx=0;}
//if ( this == &g_robotdb.m_rdbCache )
// logf(LOG_DEBUG, "db: cachebug: adding rec k.n0=0x%"XINT64" rs=%"INT32" "
// "off=%"INT32" bufNum=%"INT32" ptr=0x%"XINT32" tail=%"INT32" numPtrs=%"INT32"",
// ((key_t *)cacheKey)->n0,recSize1+recSize2,
// i1c,bufNumStart,(int32_t)p,m_tail,m_numPtrsUsed);
// if ( this == &g_spiderLoop.m_winnerListCache )
// logf(LOG_DEBUG, "db: cachebug: adding rec k.n0=0x%"XINT64" "
// "rs=%"INT32" "
// "off=%"INT32" bufNum=%"INT32" ptr=0x%"PTRFMT" "
// "oldtail=%"INT32" "
// "newtail=%"INT32" "
// "numPtrs=%"INT32"",
// ((key_t *)cacheKey)->n0,recSize1+recSize2,
// i1c,bufNumStart,(PTRTYPE)p,saved,m_tail,m_numPtrsUsed);
// if we wiped out all recs then reset tail to m_offset
if ( m_numPtrsUsed == 0 ) {
//if ( this == &g_robotdb.m_rdbCache )
// log("db: cachebug: full tail reset. tail=0");
// if ( this == &g_spiderLoop.m_winnerListCache )
// logf(LOG_DEBUG,"db: cachebug: full tail reset. "
// "tail=0");
m_tail = 0;
}
@ -961,6 +983,8 @@ bool RdbCache::addRecord ( collnum_t collnum ,
(PTRTYPE)this,
((key_t *)(&cacheKey))->n1 ,
((key_t *)(&cacheKey))->n0 );
//log("%s addRecord %"INT32" bytes @ offset=%"INT32" k.n1=%"UINT32" n0=%"UINT64" "
// "TOOK %"INT64" ms" ,
// m_dbname , need , i ,
@ -1063,6 +1087,8 @@ bool RdbCache::deleteRec ( ) {
// sanity
//if ( m_tail < 0 || m_tail > m_totalBufSize ) {
// char *xx = NULL; *xx = 0;}
// if ( this == &g_spiderLoop.m_winnerListCache )
// logf(LOG_DEBUG, "db: cachebug: wrapping tail to 0");
//return true; // continue;
goto top;
}
@ -1079,10 +1105,8 @@ bool RdbCache::deleteRec ( ) {
if ( dataSize < 0 || dataSize > m_totalBufSize ){
char *xx = NULL; *xx = 0;
}
//if ( this == &g_robotdb.m_rdbCache )
// logf(LOG_DEBUG, "db: cachebug: removing k.n0=0x%"XINT64" "
// "tail=%"INT32" ds=%"INT32"", ((key_t *)k)->n0,m_tail,dataSize);
//int32_t saved = m_tail;
// debug msg (MDW)
//log("cache: deleting rec @ %"INT32" size=%"INT32"",m_tail,
@ -1107,6 +1131,14 @@ bool RdbCache::deleteRec ( ) {
removeKey ( collnum , k , start );
markDeletedRecord(start);
}
//if ( this == &g_robotdb.m_rdbCache )
// if ( this == &g_spiderLoop.m_winnerListCache )
// logf(LOG_DEBUG, "db: cachebug: removing k.n0=0x%"XINT64" "
// "oldtail=%"INT32" newtail=%"INT32" ds=%"INT32"",
// ((key_t *)k)->n0,saved,m_tail,dataSize);
//else
// logf(LOG_DEBUG,"test: oops");
// count as a delete
@ -1274,10 +1306,6 @@ void RdbCache::addKey ( collnum_t collnum , char *key , char *ptr ) {
// log("%s update key.n1=%"UINT32" key.n0=%"UINT64" in slot #%"INT32"",
// m_dbname,key.n1,key.n0,n);
//if ( this == &g_robotdb.m_rdbCache )
// log("db: cachebug: key @ slot #%"INT32" has ptr=0x%"XINT32"",
// n,(int32_t)ptr);
// If this pointer is already set, we may be replacing it from
// Msg5::needRecall. We need to mark the old record as deleted
if (m_ptrs[n]){
@ -1288,6 +1316,12 @@ void RdbCache::addKey ( collnum_t collnum , char *key , char *ptr ) {
m_ptrs[n] = ptr;
// debug testing
//m_crcs[n] = crc;
//if ( this == &g_robotdb.m_rdbCache )
// if ( this == &g_spiderLoop.m_winnerListCache )
// logf(LOG_DEBUG,"db: cachebug: addkey slot #%"INT32" has "
// "ptr=0x%"PTRFMT"",n,(PTRTYPE)ptr);
}
/*
@ -1345,7 +1379,10 @@ void RdbCache::clear ( collnum_t collnum ) {
if ( *(collnum_t *)m_ptrs[i] != collnum ) continue;
// change to the -1 collection, nobody should use that and
// it should get kicked out over time
*(collnum_t *)m_ptrs[i] = -1;
//*(collnum_t *)m_ptrs[i] = -1;
// just change the collnum to something impossible
// this is kinda hacky but hopefully will not cause corruption
*(collnum_t *)m_ptrs[i] = (collnum_t)-1;
}
}
@ -1827,6 +1864,7 @@ bool RdbCache::convertCache ( int32_t numPtrsMax , int32_t maxMem ) {
void RdbCache::verify(){
bool foundTail = false;
int32_t count = 0;
logf(LOG_DEBUG,"db: cachebug: verifying");
for ( int32_t i = 0; i < m_numPtrsMax; i++ ){
char *start = m_ptrs[i];
if ( !start ) continue;
@ -1835,9 +1873,10 @@ void RdbCache::verify(){
char *p = start;
// get collnum
collnum_t collnum = *(collnum_t *)p; p += sizeof(collnum_t);
// a collnum of -1 means cleared; it is set in RdbCache::clear(collnum_t)
// collnum can be 0 in case we have to go to next buffer
if ( collnum != 0 && ( collnum >= m_maxColls || collnum < 0 ||
!g_collectiondb.m_recs[collnum] ) ) {
if ( collnum != 0 && ( collnum >= m_maxColls || collnum <-1)){
// !g_collectiondb.m_recs[collnum] ) ) {
char *xx = NULL; *xx = 0;
}

@ -1495,9 +1495,12 @@ static void nukeDoledbWrapper ( int fd , void *state ) {
void nukeDoledb ( collnum_t collnum ) {
//g_spiderLoop.m_winnerListCache.verify();
// in case we changed url filters for this collection #
g_spiderLoop.m_winnerListCache.clear ( collnum );
//g_spiderLoop.m_winnerListCache.verify();
//WaitEntry *we = (WaitEntry *)state;
//if ( we->m_registered )
@ -3478,7 +3481,8 @@ bool SpiderColl::evalIpLoop ( ) {
if ( m_countingPagesIndexed )
useCache = false;
// assume not from cache
if ( useCache )
if ( useCache ) {
//wc->verify();
inCache = wc->getRecord ( m_collnum ,
(char *)&cacheKey ,
&doleBuf,
@ -3488,6 +3492,10 @@ bool SpiderColl::evalIpLoop ( ) {
true ,// incCounts
&cachedTimestamp , // rec timestamp
true ); // promote rec?
//wc->verify();
}
// doleBuf could be NULL i guess...
if ( inCache ) { // && doleBufSize > 0 ) {
if ( g_conf.m_logDebugSpider )
@ -3495,11 +3503,15 @@ bool SpiderColl::evalIpLoop ( ) {
"from winnerlistcache for ip %s",doleBufSize,
iptoa(m_scanningIp));
// set own to false so it doesn't get freed
m_doleBuf.setBuf ( doleBuf ,
doleBufSize ,
doleBufSize ,
false , // ownData?
0 ); // encoding. doesn't matter.
// m_doleBuf.setBuf ( doleBuf ,
// doleBufSize ,
// doleBufSize ,
// false , // ownData?
// 0 ); // encoding. doesn't matter.
m_doleBuf.reset();
// gotta copy it because we end up re-adding part of it
// to rdbcache below
m_doleBuf.safeMemcpy ( doleBuf , doleBufSize );
// now add the first rec m_doleBuf into doledb's tree
// and re-add the rest back to the cache with the same key.
return addDoleBufIntoDoledb ( true , cachedTimestamp );
@ -5249,11 +5261,13 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
key_t cacheKey;
cacheKey.n0 = firstIp;
cacheKey.n1 = 0;
//wc->verify();
wc->addRecord ( m_collnum,
(char *)&cacheKey,
&byte ,
1 ,
12345 );//cachedTimestamp );
//wc->verify();
}
if ( addToCache ) {
@ -5264,12 +5278,14 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
log("spider: adding %"INT32" bytes of SpiderRequests "
"to winnerlistcache for ip %s",
m_doleBuf.length()-skipSize,iptoa(firstIp));
//wc->verify();
// inherit timestamp. if 0, RdbCache will set to current time
wc->addRecord ( m_collnum,
(char *)&cacheKey,
m_doleBuf.getBufStart() + skipSize ,
m_doleBuf.length() - skipSize ,
cachedTimestamp );
//wc->verify();
}
// and the whole thing is no longer empty
@ -6042,36 +6058,6 @@ void gotDoledbListWrapper2 ( void *state , RdbList *list , Msg5 *msg5 ) ;
// now check our RDB_DOLEDB for SpiderRequests to spider!
void SpiderLoop::spiderDoledUrls ( ) {
// must be spidering to dole out
if ( ! g_conf.m_spideringEnabled ) return;
// or if trying to exit
if ( g_process.m_mode == EXIT_MODE ) return;
// if we don't have all the url counts from all hosts, then wait.
// one host is probably down and was never up to begin with
if ( ! s_countsAreValid ) return;
//if ( ! g_conf.m_webSpideringEnabled ) return;
// if we do not overlap ourselves
if ( m_gettingDoledbList ) return;
// bail instantly if in read-only mode (no RdbTrees!)
if ( g_conf.m_readOnlyMode ) return;
// or if doing a daily merge
if ( g_dailyMerge.m_mergeMode ) return;
// skip if too many udp slots being used
if ( g_udpServer.getNumUsedSlots() >= 1300 ) return;
// stop if too many out. this is now 50 down from 500.
if ( m_numSpidersOut >= MAX_SPIDERS ) return;
// a new global conf rule
if ( m_numSpidersOut >= g_conf.m_maxTotalSpiders ) return;
// bail if no collections
if ( g_collectiondb.m_numRecs <= 0 ) return;
// not while repairing
if ( g_repairMode ) return;
// do not spider until collections/parms in sync with host #0
if ( ! g_parms.m_inSyncWithHost0 ) return;
// don't spider if not all hosts are up, or they do not all
// have the same hosts.conf.
if ( ! g_pingServer.m_hostsConfInAgreement ) return;
//char *reb = g_rebalance.getNeedsRebalance();
//if ( ! reb || *reb ) {return;
@ -6142,6 +6128,38 @@ void SpiderLoop::spiderDoledUrls ( ) {
subloop:
// must be spidering to dole out
if ( ! g_conf.m_spideringEnabled ) return;
// or if trying to exit
if ( g_process.m_mode == EXIT_MODE ) return;
// if we don't have all the url counts from all hosts, then wait.
// one host is probably down and was never up to begin with
if ( ! s_countsAreValid ) return;
//if ( ! g_conf.m_webSpideringEnabled ) return;
// if we do not overlap ourselves
if ( m_gettingDoledbList ) return;
// bail instantly if in read-only mode (no RdbTrees!)
if ( g_conf.m_readOnlyMode ) return;
// or if doing a daily merge
if ( g_dailyMerge.m_mergeMode ) return;
// skip if too many udp slots being used
if ( g_udpServer.getNumUsedSlots() >= 1300 ) return;
// stop if too many out. this is now 50 down from 500.
if ( m_numSpidersOut >= MAX_SPIDERS ) return;
// a new global conf rule
if ( m_numSpidersOut >= g_conf.m_maxTotalSpiders ) return;
// bail if no collections
if ( g_collectiondb.m_numRecs <= 0 ) return;
// not while repairing
if ( g_repairMode ) return;
// do not spider until collections/parms in sync with host #0
if ( ! g_parms.m_inSyncWithHost0 ) return;
// don't spider if not all hosts are up, or they do not all
// have the same hosts.conf.
if ( ! g_pingServer.m_hostsConfInAgreement ) return;
// if we hit the end of the list, wrap it around
if ( ! m_crx ) m_crx = m_activeList;