fix rdbcache corruption bugs for winnerlistcache.

2015-03-07 11:09:06 -08:00 · 2015-03-07 11:09:06 -08:00 · c6a59d0810
commit c6a59d0810
parent 102f2c1ea0
2 changed files with 113 additions and 56 deletions
--- a/RdbCache.cpp
+++ b/RdbCache.cpp
@ -15,6 +15,7 @@
 //#include "Msg10.h"   // g_deadWaitCache
 #include "Dns.h"
 #include "BigFile.h"
+#include "Spider.h"

 bool g_cacheWritesEnabled = true;

@ -467,6 +468,12 @@ bool RdbCache::getRecord ( collnum_t collnum   ,
 	}
 	// return ptr to rec
 	char *p = m_ptrs[n];
+	// a collnum of -1 means RdbCache::clear() invalidated this record,
+	// so treat it as a cache miss. this is kinda hacky.
+	if ( *(collnum_t *)p == (collnum_t)-1 ) {
+		if ( incCounts ) m_numMisses++;
+		return false;
+	}
 	// skip over collnum and key
 	//p += sizeof(collnum_t) + sizeof(key_t);
 	p += sizeof(collnum_t) + m_cks;
@ -594,9 +601,12 @@ bool RdbCache::getRecord ( collnum_t collnum   ,

 		//int32_t n = hash32 ( cacheKey , m_cks ) % m_numPtrsMax;
 		//if ( this == &g_robotdb.m_rdbCache )
-		//	logf(LOG_DEBUG, "db: cachebug: promoting record "
-		//	     "k.n0=0x%"XINT64" n=%"INT32"",((key_t *)cacheKey)->n0,
-		//	     *recSize);
+		// if ( this == &g_spiderLoop.m_winnerListCache ) {
+		// 	logf(LOG_DEBUG, "db: cachebug: promoting record "
+		// 	     "k.n0=0x%"XINT64" n=%"INT32"",
+		// 	     ((key_t *)cacheKey)->n0,
+		// 	     *recSize);
+		// }
 		char *retRec = NULL;
 		addRecord ( collnum , cacheKey , *rec , *recSize , timestamp ,
 			    &retRec );
@ -818,10 +828,11 @@ bool RdbCache::addRecord ( collnum_t collnum ,
 			   "cache. Max size is %i.",need,m_dbname,BUFSIZE);

 	// if too many slots in hash table used free one up
-	while ( m_numPtrsUsed >= m_threshold )
+	while ( m_numPtrsUsed >= m_threshold ) {
 		if ( ! deleteRec() ) {
 			return false;
 		}
+	}

 	// . do NOT split across buffers, align on a boundary if we need to
 	// . "i1" is where we PLAN to store the record
@ -858,6 +869,10 @@ bool RdbCache::addRecord ( collnum_t collnum ,
 		i2c = i1c + need;
 	}

+	// save for debug
+	//int32_t saved = m_tail;
+
+
 	// . increase m_tail so it is NOT in the range: [i1,i2b)
 	// . NEVER do this if we are the first rec added though, because
 	//   m_tail will equal i1 at that point...
@ -884,15 +899,22 @@ bool RdbCache::addRecord ( collnum_t collnum ,
 	//if ( start <= rec2 && start+32>= rec2       ) { char*xx=NULL;*xx=0;}

 	//if ( this == &g_robotdb.m_rdbCache )
-	//	logf(LOG_DEBUG, "db: cachebug: adding rec k.n0=0x%"XINT64" rs=%"INT32" "
-	//	     "off=%"INT32" bufNum=%"INT32" ptr=0x%"XINT32" tail=%"INT32" numPtrs=%"INT32"",
-	//	     ((key_t *)cacheKey)->n0,recSize1+recSize2,
-	//	     i1c,bufNumStart,(int32_t)p,m_tail,m_numPtrsUsed);
+	// if ( this == &g_spiderLoop.m_winnerListCache )
+	// 	logf(LOG_DEBUG, "db: cachebug: adding rec k.n0=0x%"XINT64" "
+	// 	     "rs=%"INT32" "
+	// 	     "off=%"INT32" bufNum=%"INT32" ptr=0x%"PTRFMT" "
+	// 	     "oldtail=%"INT32" "
+	// 	     "newtail=%"INT32" "
+	// 	     "numPtrs=%"INT32"",
+	// 	     ((key_t *)cacheKey)->n0,recSize1+recSize2,
+	// 	     i1c,bufNumStart,(PTRTYPE)p,saved,m_tail,m_numPtrsUsed);

 	// if we wiped out all recs then reset tail to m_offset
 	if ( m_numPtrsUsed == 0 ) {
 		//if ( this == &g_robotdb.m_rdbCache )
-		//	log("db: cachebug: full tail reset. tail=0");
+		// if ( this == &g_spiderLoop.m_winnerListCache )
+		// 	logf(LOG_DEBUG,"db: cachebug: full tail reset. "
+		// 	     "tail=0");
 		m_tail = 0;
 	}

@ -961,6 +983,8 @@ bool RdbCache::addRecord ( collnum_t collnum ,
 	    (PTRTYPE)this,
 	    ((key_t *)(&cacheKey))->n1 ,
 	    ((key_t *)(&cacheKey))->n0 );
+
+
 	//log("%s addRecord %"INT32" bytes @ offset=%"INT32" k.n1=%"UINT32" n0=%"UINT64" "
 	//     "TOOK %"INT64" ms" , 
 	//     m_dbname , need , i , 
@ -1063,6 +1087,8 @@ bool RdbCache::deleteRec ( ) {
 		// sanity
 		//if ( m_tail < 0  || m_tail > m_totalBufSize ) {
 		//	char *xx = NULL; *xx = 0;}
+		// if ( this == &g_spiderLoop.m_winnerListCache )
+		// 	logf(LOG_DEBUG, "db: cachebug: wrapping tail to 0");
 		//return true; // continue;
 		goto top;
 	}
@ -1079,10 +1105,8 @@ bool RdbCache::deleteRec ( ) {
 	if ( dataSize < 0 || dataSize > m_totalBufSize ){
 		char *xx = NULL; *xx = 0;
 	}
-	
-	//if ( this == &g_robotdb.m_rdbCache ) 
-	//	logf(LOG_DEBUG, "db: cachebug: removing k.n0=0x%"XINT64" "
-	//	     "tail=%"INT32" ds=%"INT32"", ((key_t *)k)->n0,m_tail,dataSize);
+
+	//int32_t saved = m_tail;
 	
 	// debug msg (MDW)
 	//log("cache: deleting rec @ %"INT32" size=%"INT32"",m_tail,
@ -1107,6 +1131,14 @@ bool RdbCache::deleteRec ( ) {
 		removeKey ( collnum , k , start );
 		markDeletedRecord(start);
 	}
+
+
+	//if ( this == &g_robotdb.m_rdbCache ) 
+	// if ( this == &g_spiderLoop.m_winnerListCache )
+	// 	logf(LOG_DEBUG, "db: cachebug: removing k.n0=0x%"XINT64" "
+	// 	     "oldtail=%"INT32" newtail=%"INT32" ds=%"INT32"", 
+	// 	     ((key_t *)k)->n0,saved,m_tail,dataSize);
+
 	//else
 	//	logf(LOG_DEBUG,"test: oops");
 	// count as a delete
@ -1274,10 +1306,6 @@ void RdbCache::addKey ( collnum_t collnum , char *key , char *ptr ) {
 	//	log("%s update key.n1=%"UINT32" key.n0=%"UINT64" in slot #%"INT32"",
 	//	    m_dbname,key.n1,key.n0,n);
 		
-	//if ( this == &g_robotdb.m_rdbCache ) 
-	//	log("db: cachebug: key @ slot #%"INT32" has ptr=0x%"XINT32"",
-	//	    n,(int32_t)ptr);
-
 	// If this pointer is already set, we may be replacing it from 
 	// Msg5::needRecall.  We need to mark the old record as deleted
 	if (m_ptrs[n]){
@ -1288,6 +1316,12 @@ void RdbCache::addKey ( collnum_t collnum , char *key , char *ptr ) {
 	m_ptrs[n] = ptr;
 	// debug testing
 	//m_crcs[n] = crc;
+
+	//if ( this == &g_robotdb.m_rdbCache ) 
+	// if ( this == &g_spiderLoop.m_winnerListCache )
+	// 	logf(LOG_DEBUG,"db: cachebug: addkey slot #%"INT32" has "
+	// 	     "ptr=0x%"PTRFMT"",n,(PTRTYPE)ptr);
+
 }

 /*
@ -1345,7 +1379,10 @@ void RdbCache::clear ( collnum_t collnum ) {
 		if ( *(collnum_t *)m_ptrs[i] != collnum ) continue;
 		// change to the -1 collection, nobody should use that and
 		// it should get kicked out over time
-		*(collnum_t *)m_ptrs[i] = -1;
+		//*(collnum_t *)m_ptrs[i] = -1;
+		// just change the collnum to something impossible
+		// this is kinda hacky but hopefully will not cause corruption
+		*(collnum_t *)m_ptrs[i] = (collnum_t)-1;
 	}
 }

@ -1827,6 +1864,7 @@ bool RdbCache::convertCache ( int32_t numPtrsMax , int32_t maxMem ) {
 void RdbCache::verify(){
 	 bool foundTail = false;
 	 int32_t count = 0;
+	 logf(LOG_DEBUG,"db: cachebug: verifying");
 	 for ( int32_t i = 0; i < m_numPtrsMax; i++ ){
 		 char *start = m_ptrs[i];
 		 if ( !start ) continue;
@ -1835,9 +1873,10 @@ void RdbCache::verify(){
 		 char *p      = start;
 		 // get collnum
 		 collnum_t collnum = *(collnum_t *)p; p += sizeof(collnum_t);
+		 // -1 means cleared! it is set in RdbCache::clear(collnum_t)
 		 // collnum can be 0 in case we have to go to next buffer
-		 if ( collnum != 0 && ( collnum >= m_maxColls || collnum < 0 ||
-					!g_collectiondb.m_recs[collnum] ) ) {
+		 if ( collnum != 0 && ( collnum >= m_maxColls || collnum <-1)){
+			 //	!g_collectiondb.m_recs[collnum] ) ) {
 			 char *xx = NULL; *xx = 0;
 		 }
 	
--- a/Spider.cpp
+++ b/Spider.cpp
@ -1495,9 +1495,12 @@ static void nukeDoledbWrapper ( int fd , void *state ) {

 void nukeDoledb ( collnum_t collnum ) {

+	//g_spiderLoop.m_winnerListCache.verify();	
 	// in case we changed url filters for this collection #
 	g_spiderLoop.m_winnerListCache.clear ( collnum );

+	//g_spiderLoop.m_winnerListCache.verify();	
+
 	//WaitEntry *we = (WaitEntry *)state;

 	//if ( we->m_registered )
@ -3478,7 +3481,8 @@ bool SpiderColl::evalIpLoop ( ) {
 	if ( m_countingPagesIndexed )
 		useCache = false;
 	// assume not from cache
-	if ( useCache )
+	if ( useCache ) {
+		//wc->verify();
 		inCache = wc->getRecord ( m_collnum     ,
 					  (char *)&cacheKey ,
 					  &doleBuf,
@ -3488,6 +3492,10 @@ bool SpiderColl::evalIpLoop ( ) {
 					  true ,// incCounts
 					  &cachedTimestamp , // rec timestamp
 					  true );  // promote rec?
+		//wc->verify();
+	}
+
+
 	// doleBuf could be NULL i guess...
 	if ( inCache ) { // && doleBufSize > 0 ) {
 		if ( g_conf.m_logDebugSpider )
@ -3495,11 +3503,15 @@ bool SpiderColl::evalIpLoop ( ) {
 			    "from winnerlistcache for ip %s",doleBufSize,
 			    iptoa(m_scanningIp));
 		// set own to false so it doesn't get freed
-		m_doleBuf.setBuf ( doleBuf , 
-				   doleBufSize ,
-				   doleBufSize , 
-				   false , // ownData?
-				   0 ); // encoding. doesn't matter.
+		// m_doleBuf.setBuf ( doleBuf , 
+		// 		   doleBufSize ,
+		// 		   doleBufSize , 
+		// 		   false , // ownData?
+		// 		   0 ); // encoding. doesn't matter.
+		m_doleBuf.reset();
+		// gotta copy it because we end up re-adding part of it
+		// to rdbcache below
+		m_doleBuf.safeMemcpy ( doleBuf , doleBufSize );
 		// now add the first rec m_doleBuf into doledb's tree
 		// and re-add the rest back to the cache with the same key.
 		return addDoleBufIntoDoledb ( true , cachedTimestamp );
@ -5249,11 +5261,13 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
 		key_t cacheKey;
 		cacheKey.n0 = firstIp;
 		cacheKey.n1 = 0;
+		//wc->verify();
 		wc->addRecord ( m_collnum,
 				(char *)&cacheKey,
 				&byte ,
 				1 ,
 		 		12345 );//cachedTimestamp );
+		//wc->verify();
 	}

 	if ( addToCache ) {
@ -5264,12 +5278,14 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
 			log("spider: adding %"INT32" bytes of SpiderRequests "
 			    "to winnerlistcache for ip %s",
 			    m_doleBuf.length()-skipSize,iptoa(firstIp));
+		//wc->verify();
 		// inherit timestamp. if 0, RdbCache will set to current time
 		wc->addRecord ( m_collnum,
 				(char *)&cacheKey,
 				m_doleBuf.getBufStart() + skipSize ,
 				m_doleBuf.length() - skipSize ,
 				cachedTimestamp );
+		//wc->verify();
 	}

 	// and the whole thing is no longer empty
@ -6042,36 +6058,6 @@ void gotDoledbListWrapper2 ( void *state , RdbList *list , Msg5 *msg5 ) ;
 // now check our RDB_DOLEDB for SpiderRequests to spider!
 void SpiderLoop::spiderDoledUrls ( ) {

-	// must be spidering to dole out
-	if ( ! g_conf.m_spideringEnabled ) return;
-	// or if trying to exit
-	if ( g_process.m_mode == EXIT_MODE ) return;	
-	// if we don't have all the url counts from all hosts, then wait.
-	// one host is probably down and was never up to begin with
-	if ( ! s_countsAreValid ) return;
-	//if ( ! g_conf.m_webSpideringEnabled )  return;
-	// if we do not overlap ourselves
-	if ( m_gettingDoledbList ) return;
-	// bail instantly if in read-only mode (no RdbTrees!)
-	if ( g_conf.m_readOnlyMode ) return;
-	// or if doing a daily merge
-	if ( g_dailyMerge.m_mergeMode ) return;
-	// skip if too many udp slots being used
-	if ( g_udpServer.getNumUsedSlots() >= 1300 ) return;
-	// stop if too many out. this is now 50 down from 500.
-	if ( m_numSpidersOut >= MAX_SPIDERS ) return;
-	// a new global conf rule
-	if ( m_numSpidersOut >= g_conf.m_maxTotalSpiders ) return;
-	// bail if no collections
-	if ( g_collectiondb.m_numRecs <= 0 ) return;
-	// not while repairing
-	if ( g_repairMode ) return;
-	// do not spider until collections/parms in sync with host #0
-	if ( ! g_parms.m_inSyncWithHost0 ) return;
-	// don't spider if not all hosts are up, or they do not all
-	// have the same hosts.conf.
-	if ( ! g_pingServer.m_hostsConfInAgreement ) return;
-
 	//char *reb = g_rebalance.getNeedsRebalance();
 	//if ( ! reb || *reb ) {return;

@ -6142,6 +6128,38 @@ void SpiderLoop::spiderDoledUrls ( ) {

 subloop:

+	// must be spidering to dole out
+	if ( ! g_conf.m_spideringEnabled ) return;
+	// or if trying to exit
+	if ( g_process.m_mode == EXIT_MODE ) return;	
+	// if we don't have all the url counts from all hosts, then wait.
+	// one host is probably down and was never up to begin with
+	if ( ! s_countsAreValid ) return;
+	//if ( ! g_conf.m_webSpideringEnabled )  return;
+	// if we do not overlap ourselves
+	if ( m_gettingDoledbList ) return;
+	// bail instantly if in read-only mode (no RdbTrees!)
+	if ( g_conf.m_readOnlyMode ) return;
+	// or if doing a daily merge
+	if ( g_dailyMerge.m_mergeMode ) return;
+	// skip if too many udp slots being used
+	if ( g_udpServer.getNumUsedSlots() >= 1300 ) return;
+	// stop if too many out. this is now 50 down from 500.
+	if ( m_numSpidersOut >= MAX_SPIDERS ) return;
+	// a new global conf rule
+	if ( m_numSpidersOut >= g_conf.m_maxTotalSpiders ) return;
+	// bail if no collections
+	if ( g_collectiondb.m_numRecs <= 0 ) return;
+	// not while repairing
+	if ( g_repairMode ) return;
+	// do not spider until collections/parms in sync with host #0
+	if ( ! g_parms.m_inSyncWithHost0 ) return;
+	// don't spider if not all hosts are up, or they do not all
+	// have the same hosts.conf.
+	if ( ! g_pingServer.m_hostsConfInAgreement ) return;
+
+
+
 	// if we hit the end of the list, wrap it around
 	if ( ! m_crx ) m_crx = m_activeList;