forked from Mirrors/privacore-open-source-search-engine
fix empty rdbcache bug of not enough buf mem.

Msg13.cpp (50 changed lines)
@@ -601,7 +601,7 @@ void handleRequest13 ( UdpSlot *slot , int32_t niceness ) {
 
 	if ( ! s_flag ) {
 		s_flag = true;
-		s_hammerCache.init ( 5000 , // maxcachemem,
+		s_hammerCache.init ( 15000 , // maxcachemem,
				     8 , // fixed data size
				     false , // support lists?
				     500 , // max nodes
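
Why triple the budget: RdbCache charges its per-node bookkeeping (keys, linked-list pointers) against maxcachemem before carving out the record buffers, so a 5000-byte budget for 500 nodes could leave no buffer memory at all, which is the "empty rdbcache" this commit fixes. A minimal sketch of that arithmetic; the per-node overhead here is an assumed illustrative value, not RdbCache's real figure, which depends on its key size and node layout:

#include <cstdio>
#include <cstdint>

// leftover buffer memory once node bookkeeping is charged to the budget
static int32_t bufMemLeft ( int32_t maxMem , int32_t maxNodes ,
                            int32_t perNodeOverhead ) {
	return maxMem - maxNodes * perNodeOverhead;
}

int main ( ) {
	const int32_t overhead = 24; // assumed per-node cost, for illustration
	// old config: 5000 bytes of cache mem, 500 nodes -> negative
	printf ( "old bufMem = %d\n" , bufMemLeft ( 5000  , 500 , overhead ) );
	// new config: 15000 bytes leaves room for the 8-byte records
	printf ( "new bufMem = %d\n" , bufMemLeft ( 15000 , 500 , overhead ) );
	return 0;
}
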
@@ -887,6 +887,11 @@ void downloadTheDocForReals3a ( Msg13Request *r ) {
 
 void downloadTheDocForReals3b ( Msg13Request *r ) {
 
+	int64_t nowms = gettimeofdayInMilliseconds();
+
+	// assume no download start time
+	r->m_downloadStartTimeMS = 0;
+
 	// . store time now
 	// . no, now we store 0 to indicate in progress, then we
 	//   will overwrite it with a timestamp when the download completes
@@ -901,7 +906,6 @@ void downloadTheDocForReals3b ( Msg13Request *r ) {
 	}
 	else if ( ! r->m_skipHammerCheck ) {
 		// get time now
-		int64_t nowms = gettimeofdayInMilliseconds();
 		s_hammerCache.addLongLong(0,r->m_firstIp, nowms);
 		log(LOG_DEBUG,
		    "spider: adding new time to hammercache for %s %s = %"INT64"",
@@ -920,7 +924,7 @@ void downloadTheDocForReals3b ( Msg13Request *r ) {
		    "firstIp=%s "
		    "url=%s "
		    "to msg13::hammerCache",
-		    -1,
+		    0,//-1,
		    iptoa(r->m_firstIp),
		    r->ptr_url);
 
@@ -1011,6 +1015,9 @@ void downloadTheDocForReals3b ( Msg13Request *r ) {
 	if ( r->m_isSquidProxiedUrl && ! r->m_proxyIp )
 		fixGETorPOST ( exactRequest );
 
+	// indicate start of download so we can overwrite the 0 we stored
+	// into the hammercache
+	r->m_downloadStartTimeMS = nowms;
 
 	// . download it
 	// . if m_proxyIp is non-zero it will make requests like:
@@ -1347,18 +1354,30 @@ void gotHttpReply2 ( void *state ,
 
 	// get time now
 	int64_t nowms = gettimeofdayInMilliseconds();
 
+	// right now there is a 0 in there to indicate in-progress.
+	// so we must overwrite with either the download start time or the
+	// download end time.
+	int64_t timeToAdd = r->m_downloadStartTimeMS;
+	if ( r->m_crawlDelayFromEnd ) timeToAdd = nowms;
+
 	// . now store the current time in the cache
 	// . do NOT do this for robots.txt etc. where we skip hammer check
-	if ( r->m_crawlDelayFromEnd && ! r->m_skipHammerCheck )
-		s_hammerCache.addLongLong(0,r->m_firstIp,nowms);
-	// note it
-	if ( g_conf.m_logDebugSpider )
-		log("spider: adding final download end time of %"INT64" for "
-		    "firstIp=%s "
-		    "url=%s "
-		    "to msg13::hammerCache",
-		    nowms,iptoa(r->m_firstIp),r->ptr_url);
+	if ( ! r->m_skipHammerCheck )
+		s_hammerCache.addLongLong(0,r->m_firstIp,timeToAdd);
+
+	// note it
+	if ( g_conf.m_logDebugSpider && ! r->m_skipHammerCheck )
+		log(LOG_DEBUG,"spider: adding last download time "
+		    "of %"INT64" for firstIp=%s url=%s "
+		    "to msg13::hammerCache",
+		    timeToAdd,iptoa(r->m_firstIp),r->ptr_url);
+
 
 	if ( g_conf.m_logDebugSpider )
 		log(LOG_DEBUG,"spider: got http reply for firstip=%s url=%s",
		    iptoa(r->m_firstIp),r->ptr_url);
 
 
 	// sanity. this was happening from iframe download
 	//if ( g_errno == EDNSTIMEDOUT ) { char *xx=NULL;*xx=0; }
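
Taken together with the downloadTheDocForReals3b hunks above, the hammer-cache entry for an IP now moves through three states: 0 (download in progress), then the 0 is overwritten with either the download start time or the download end time, chosen by m_crawlDelayFromEnd. A self-contained sketch of that final choice, using simplified stand-ins for the Msg13Request fields rather than the real class:

#include <cstdint>

// simplified stand-ins for the Msg13Request fields this hunk reads
struct Req {
	int64_t m_downloadStartTimeMS; // recorded when the fetch was issued
	bool    m_crawlDelayFromEnd;   // measure crawl delay from reply time?
};

// value written over the in-progress sentinel (0) in the hammer cache
static int64_t timeToAdd ( const Req &r , int64_t nowms ) {
	// crawl delay measured from the END of the download: use "now"
	// (the reply time); otherwise measure from the download START
	return r.m_crawlDelayFromEnd ? nowms : r.m_downloadStartTimeMS;
}
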
@@ -2786,7 +2805,7 @@ bool addToHammerQueue ( Msg13Request *r ) {
 	// . make sure we are not hammering an ip
 	// . returns 0 if currently downloading a url from that ip
 	// . returns -1 if not found
-	int64_t last = s_hammerCache.getLongLong(0,r->m_firstIp,30,true);
+	int64_t last = s_hammerCache.getLongLong(0,r->m_firstIp,-1,true);
 	// get time now
 	int64_t nowms = gettimeofdayInMilliseconds();
 	// how long has it been since last download START time?
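
The one-character change above (30 to -1) matters more than it looks. Assuming the third getLongLong() argument is a maximum record age in seconds, with -1 meaning no age limit (an assumption about the RdbCache API, consistent with the "returns -1 if not found" comment), a hammer timestamp used to expire 30 seconds after it was written and the lookup then behaved as "not found", so a host with a crawl delay longer than 30 seconds could be re-hit early. A sketch of that assumed expiry behavior:

#include <cstdint>

// assumed lookup semantics: entries older than maxAgeSecs act missing
static int64_t lookup ( int64_t storedMs , int64_t nowMs ,
                        int32_t maxAgeSecs ) {
	if ( maxAgeSecs >= 0 &&
	     nowMs - storedMs > (int64_t)maxAgeSecs * 1000 )
		return -1; // "not found": the hammer check is skipped
	return storedMs;   // -1 maxAge: the timestamp always survives
}
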
@@ -2794,6 +2813,11 @@ bool addToHammerQueue ( Msg13Request *r ) {
 
 	int32_t crawlDelayMS = r->m_crawlDelayMS;
 
+	if ( g_conf.m_logDebugSpider )
+		log(LOG_DEBUG,"spider: got timestamp of %"INT64" from "
+		    "hammercache (waited=%"INT64") for %s",last,waited,
+		    iptoa(r->m_firstIp));
+
 	// . if we got a proxybackoff base it on # of banned proxies for urlIp
 	// . try to be more sensitive for more sensitive website policies
 	// . we don't know why this proxy was banned, or if we were

Msg13.h (2 changed lines)
@@ -56,6 +56,8 @@ public:
 	char *m_proxiedUrl;
 	int32_t m_proxiedUrlLen;
 
+	int64_t m_downloadStartTimeMS;
+
 	char m_niceness;
 	int32_t m_ifModifiedSince;
 	int32_t m_maxCacheAge;

RdbCache.cpp (14 changed lines)
@@ -152,6 +152,20 @@ bool RdbCache::init ( int32_t maxMem ,
 	// . make the 128MB buffers
 	// . if we do more than 128MB per buf then pthread_create() will fail
 	int32_t bufMem = m_maxMem - m_memAlloced;
+	if( bufMem <= 0 ) {
+		log("rdbcache: cache for %s does not have enough mem. fix "
+		    "by increasing maxmem or number of recs, etc.",m_dbname);
+		char *xx=NULL;*xx=0;
+	}
+	if ( bufMem && m_fixedDataSize > 0 &&
+	     bufMem / m_fixedDataSize < maxRecs / 2 ) {
+		log("cache: warning. "
+		    "cache for %s can have %i ptrs but buf mem "
+		    "can only hold %i objects"
+		    ,m_dbname
+		    ,(int)maxRecs
+		    ,(int)(bufMem/m_fixedDataSize));
+	}
 	m_totalBufSize = 0LL;
 	m_offset = 0LL;
 	while ( bufMem > 0 && m_numBufs < 32 ) {
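
These two checks are the heart of the commit: a fatal one when the node/ptr bookkeeping has already consumed the whole memory budget (the deliberate NULL write, char *xx=NULL;*xx=0;, is this codebase's crash-and-core idiom), and a non-fatal warning when the buffers can hold fewer than half the records the node table can index. A compilable distillation, with simplified names standing in for the RdbCache members:

#include <cstdio>
#include <cstdint>
#include <cstdlib>

static void checkBufMem ( int32_t maxMem , int32_t memAlloced ,
                          int32_t fixedDataSize , int32_t maxRecs ,
                          const char *dbname ) {
	int32_t bufMem = maxMem - memAlloced;
	// fatal: no memory left for record buffers -> an "empty" cache
	if ( bufMem <= 0 ) {
		fprintf ( stderr , "rdbcache: cache for %s does not "
			  "have enough mem\n" , dbname );
		abort(); // stands in for the char *xx=NULL;*xx=0; idiom
	}
	// warning: node table indexes far more records than bufMem holds
	if ( fixedDataSize > 0 && bufMem / fixedDataSize < maxRecs / 2 )
		fprintf ( stderr , "cache: warning. cache for %s can have "
			  "%d ptrs but buf mem can only hold %d objects\n" ,
			  dbname , (int)maxRecs ,
			  (int)( bufMem / fixedDataSize ) );
}
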

@@ -2073,8 +2073,8 @@ bool SpiderColl::isInDupCache ( SpiderRequest *sreq , bool addToCache ) {
 	// init dup cache?
 	if ( ! m_dupCache.isInitialized() )
 		// use 50k i guess of 64bit numbers and linked list info
-		m_dupCache.init ( 50000,
-				  4 , // fixeddatasize (don't really need this)
+		m_dupCache.init ( 90000,
+				  8 , // fixeddatasize (don't really need this)
				  false, // list support?
				  5000, // maxcachenodes
				  false, // usehalfkeys?
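
The size bump here pairs with what looks like a type-width fix: the comment above the call already says the dup cache holds 64-bit numbers, yet fixeddatasize was 4, so each record was budgeted at half its real size; sizing it at 8 also keeps the new RdbCache buf-mem warning honest, since that warning divides bufMem by m_fixedDataSize. A trivial compile-time check of the width assumption:

#include <cstdint>

static_assert ( sizeof ( int64_t ) == 8 ,
                "dup cache records are 64-bit, so fixeddatasize must be 8" );
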
@@ -5257,7 +5257,7 @@ void SpiderLoop::startLoop ( ) {
 	m_lockTable.set ( 8,sizeof(UrlLock),0,NULL,0,false,MAX_NICENESS,
			  "splocks", true ); // useKeyMagic? yes.
 
-	if ( ! m_lockCache.init ( 10000 , // maxcachemem
+	if ( ! m_lockCache.init ( 20000 , // maxcachemem
				  4 , // fixedatasize
				  false , // supportlists?
				  1000 , // maxcachenodes