Merge branch 'diffbot-testing' into testing

2015-10-02 19:26:15 -06:00
parent 83ac18fff4 42cdd5b382
commit 39214a9dc6
19 changed files with 412 additions and 111 deletions
--- a/4
+++ b/4
@ -532,6 +532,10 @@ Timedb.o:
 HashTableX.o:
 	$(CC) $(DEFS) $(CPPFLAGS)  -O2 -c $*.cpp 

+# getUrlFilterNum2()
+Spider.o:
+	$(CC) $(DEFS) $(CPPFLAGS)  -O2 -c $*.cpp 
+
 SpiderCache.o:
 	$(CC) $(DEFS) $(CPPFLAGS)  -O2 -c $*.cpp 

--- a/Mem.cpp
+++ b/Mem.cpp
@ -21,7 +21,7 @@

 // uncomment this for EFENCE to do underflow checks instead of the
 // default overflow checks
-//#define _CHECKUNDERFLOW_
+//#define CHECKUNDERFLOW

 // only Mem.cpp can call ::malloc, everyone else must call mmalloc() so
 // we can keep tabs on memory usage. in Mem.h we #define this to be coreme()
@ -2168,7 +2168,7 @@ void *getElecMem ( int32_t size ) {
 	// a page above OR a page below
 	// let's go below this time since that seems to be the problem

-#ifdef _CHECKUNDERFLOW_
+#ifdef CHECKUNDERFLOW
 	// how much to alloc
 	// . assume sysmalloc returs one byte above a page, so we need
 	//   MEMPAGESIZE-1 bytes to move p up to page boundary, another
@ -2302,7 +2302,7 @@ void freeElecMem ( void *fakeMem ) {
 	char *label    = &s_labels[((uint32_t)h)*16];
 	int32_t  fakeSize =  s_sizes[h];

-#ifdef _CHECKUNDERFLOW_
+#ifdef CHECKUNDERFLOW
 	char *oldProtMem = cp - MEMPAGESIZE;
 #else
 	char *oldProtMem = cp + fakeSize;
--- a/Msg13.cpp
+++ b/Msg13.cpp
@ -1222,13 +1222,16 @@ bool ipWasBanned ( TcpSocket *ts , const char **msg , Msg13Request *r ) {

 	// if it is a seed url and there are no links, then perhaps we
 	// are in a blacklist somewhere already from triggering a spider trap
-	if ( //isInSeedBuf ( cr , r->ptr_url ) &&
-	     // this is set in XmlDoc.cpp based on hopcount really
-	     r->m_isRootSeedUrl &&
-	     ! strstr ( ts->m_readBuf, "<a href" ) ) {
-		*msg = "root/seed url with no outlinks";
-		return true;
-	}
+	// i've seen this flub on a site where they just return a script
+	// and it is not banned, so let's remove this until we think
+	// of something better.
+	// if ( //isInSeedBuf ( cr , r->ptr_url ) &&
+	//      // this is set in XmlDoc.cpp based on hopcount really
+	//      r->m_isRootSeedUrl &&
+	//      ! strstr ( ts->m_readBuf, "<a href" ) ) {
+	// 	*msg = "root/seed url with no outlinks";
+	// 	return true;
+	// }


 	// TODO: compare a simple checksum of the page content to what
--- a/Msg20.cpp
+++ b/Msg20.cpp
@ -157,6 +157,12 @@ bool Msg20::getSummary ( Msg20Request *req ) {
 	// do not re-route to twins if accessing an external network
 	if ( hostdb != &g_hostdb ) req->m_expected = false;

+	if ( req->m_docId < 0 && ! req->ptr_ubuf ) {
+		log("msg20: docid<0 and no url for msg20::getsummary");
+		g_errno = EBADREQUEST;
+		return true;
+	}
+
 	// get groupId from docId, if positive
 	uint32_t shardNum;
 	if ( req->m_docId >= 0 ) 
@ -398,7 +404,8 @@ void handleRequest20 ( UdpSlot *slot , int32_t netnice ) {

 	// sanity check, the size include the \0
 	if ( req->m_collnum < 0 ) {
-		log("query: Got empty collection in msg20 handler. FIX!");
+		log("query: Got empty collection in msg20 handler. FIX! "
+		    "from ip=%s port=%i",iptoa(slot->m_ip),(int)slot->m_port);
 	        g_udpServer.sendErrorReply ( slot , ENOTFOUND );
 		return; 
 		//char *xx =NULL; *xx = 0; 
--- a/Msg3.cpp
+++ b/Msg3.cpp
@ -76,7 +76,7 @@ class RdbCache *getDiskPageCache ( char rdbId ) {
 		rpc = &g_rdbCaches[2];
 		maxSizePtr = &g_conf.m_clusterdbFileCacheSize;
 		maxMem = *maxSizePtr;
-		maxRecs = maxMem / 16;
+		maxRecs = maxMem / 32;
 		dbname = "clustcache";
 	}
 	if ( rdbId == RDB_TITLEDB ) {
--- a/Msg39.cpp
+++ b/Msg39.cpp
@ -154,6 +154,7 @@ void Msg39::getDocIds ( UdpSlot *slot ) {
        int32_t requestSize = m_slot->m_readBufSize;
        // ensure it's size is ok
        if ( requestSize < 8 ) { 
+	BadReq:
 		g_errno = EBADREQUESTSIZE; 
 		log(LOG_LOGIC,"query: msg39: getDocIds: %s." , 
 		    mstrerror(g_errno) );
@ -169,7 +170,11 @@ void Msg39::getDocIds ( UdpSlot *slot ) {
 					  m_r->m_buf );

 	// sanity check
-	if ( finalSize != requestSize ) {char *xx=NULL;*xx=0; }
+	if ( finalSize != requestSize ) {
+		log("msg39: sending bad request.");
+		goto BadReq;
+		//char *xx=NULL;*xx=0; }
+	}

 	getDocIds2 ( m_r );
 }
--- a/Msg3a.cpp
+++ b/Msg3a.cpp
@ -742,14 +742,6 @@ bool Msg3a::gotAllShardReplies ( ) {
 		// cast it and set it
 		m_reply       [i] = mr;
 		m_replyMaxSize[i] = replyMaxSize;
-		// deserialize it (just sets the ptr_ and size_ member vars)
-		//mr->deserialize ( );
-		deserializeMsg ( sizeof(Msg39Reply) ,
-				 &mr->size_docIds,
-				 &mr->size_clusterRecs,
-				 &mr->ptr_docIds,
-				 mr->m_buf );
-
 		// sanity check
 		if ( mr->m_nqt != m_q->getNumTerms() ) {
 			g_errno = EBADREPLY;
@ -767,6 +759,20 @@ bool Msg3a::gotAllShardReplies ( ) {
 			    mstrerror(g_errno));
 			return true;
 		}
+		// deserialize it (just sets the ptr_ and size_ member vars)
+		//mr->deserialize ( );
+		if ( ! deserializeMsg ( sizeof(Msg39Reply) ,
+					&mr->size_docIds,
+					&mr->size_clusterRecs,
+					&mr->ptr_docIds,
+					mr->m_buf ) ) {
+			g_errno = ECORRUPTDATA;
+			m_errno = ECORRUPTDATA;
+			log("query: msg3a: Shard had error: %s",
+			    mstrerror(g_errno));
+			return true;
+
+		}
 		// skip down here if reply was already set
 		//skip:
 		// add of the total hits from each shard, this is how many
--- a/Msg40.cpp
+++ b/Msg40.cpp
@ -1071,7 +1071,7 @@ bool Msg40::reallocMsg20Buf ( ) {
 	// . allocate m_buf2 to hold all our Msg20 pointers and Msg20 classes
 	// . how much mem do we need?
 	// . need space for the msg20 ptrs
-	int32_t need = m_msg3a.m_numDocIds * sizeof(Msg20 *);
+	int64_t need = m_msg3a.m_numDocIds * sizeof(Msg20 *);
 	// need space for the classes themselves, only if "visible" though
 	for ( int32_t i = 0 ; i < m_msg3a.m_numDocIds ; i++ ) 
 		if ( m_msg3a.m_clusterLevels[i] == CR_OK ) 
@ -1243,6 +1243,12 @@ bool Msg40::reallocMsg20Buf ( ) {
 	m_buf2        = NULL;
 	m_bufMaxSize2 = need;

+	if ( need > 2000000000 ) {
+		log("msg40: need too much mem=%"INT64,need);
+		m_errno = g_errno; 
+		return false; 
+	}
+
 	// do the alloc
 	if ( need ) m_buf2 = (char *)mmalloc ( need ,"Msg40msg20");
 	if ( need && ! m_buf2 ) { m_errno = g_errno; return false; }
--- a/PageResults.cpp
+++ b/PageResults.cpp
@ -1256,10 +1256,18 @@ bool gotResults ( void *state ) {
 	// into it, and it must be the SAME ptr too!
 	CollectionRec *cr = si->m_cr;//g_collectiondb.getRec ( collnum );
 	if ( ! cr ) { // || cr != si->m_cr ) {
-	       g_errno = ENOCOLLREC;
-	       return sendReply(st,NULL);
+		g_errno = ENOCOLLREC;
+		return sendReply(st,NULL);
 	}

+	if ( ! msg40->m_msg20 && ! si->m_docIdsOnly ) {
+		log("msg40: failed to get results q=%s",si->m_q.m_orig);
+		g_errno = ENOMEM;
+		return sendReply(st,NULL);
+	}
+
+
+
 	//char *coll = cr->m_coll;

 	/*
--- a/PageStats.cpp
+++ b/PageStats.cpp
@ -734,6 +734,18 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
 		p.safePrintf("<td>%"INT64"</td>",a);
 	}

+	p.safePrintf ("</tr>\n<tr class=poo><td><b><nobr>dropped recs</td>" );
+	for ( int32_t i = 0 ; i < numCaches ; i++ ) {
+		int64_t a = caches[i]->m_deletes;
+		p.safePrintf("<td>%"INT64"</td>",a);
+	}
+
+	p.safePrintf ("</tr>\n<tr class=poo><td><b><nobr>added recs</td>" );
+	for ( int32_t i = 0 ; i < numCaches ; i++ ) {
+		int64_t a = caches[i]->m_adds;
+		p.safePrintf("<td>%"INT64"</td>",a);
+	}
+
 	//p.safePrintf ("</tr>\n<tr class=poo><td><b><nobr>max age</td>" );
 	//for ( int32_t i = 0 ; i < numCaches ; i++ ) {
 	//	int64_t a = caches[i]->getMaxMem();
@ -2150,6 +2162,34 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
 	p.safePrintf("<td>%"UINT64"</td></tr>\n",total);


+	p.safePrintf("<tr class=poo><td><b>file cache adds</b></td>");
+	total = 0;
+	for ( int32_t i = 0 ; i < nr ; i++ ) {
+		Rdb *rdb = rdbs[i];
+		RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
+		if ( ! rpc ) {
+			p.safePrintf("<td>--</td>");
+			continue;
+		}
+		p.safePrintf("<td>%"UINT64"</td>",rpc->m_adds);
+	}
+	p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
+
+
+	p.safePrintf("<tr class=poo><td><b>file cache drops</b></td>");
+	total = 0;
+	for ( int32_t i = 0 ; i < nr ; i++ ) {
+		Rdb *rdb = rdbs[i];
+		RdbCache *rpc = getDiskPageCache ( rdb->m_rdbId );
+		if ( ! rpc ) {
+			p.safePrintf("<td>--</td>");
+			continue;
+		}
+		p.safePrintf("<td>%"UINT64"</td>",rpc->m_deletes);
+	}
+	p.safePrintf("<td>%"UINT64"</td></tr>\n",total);
+
+
 	p.safePrintf("<tr class=poo><td><b>file cache used</b></td>");
 	total = 0;
 	for ( int32_t i = 0 ; i < nr ; i++ ) {
--- a/Process.cpp
+++ b/Process.cpp
@ -1687,6 +1687,9 @@ bool Process::shutdown2 ( ) {
 	if ( g_process.m_threadOut ) 
 		log(LOG_INFO,"gb: still has hdtemp thread");

+
+	log("gb. EXITING.");
+
 	// exit abruptly
 	exit(0);

--- a/RdbCache.cpp
+++ b/RdbCache.cpp
@ -543,7 +543,7 @@ bool RdbCache::getRecord ( collnum_t collnum   ,
 	// of the delete head's space i guess.
 	// i do this for all caches now... what are the downsides? i forget.
 	//
-	bool check = false;
+	bool check = true;//false;
 	//if ( this == &g_genericCache[SITEQUALITY_CACHEID] ) check = true;
 	if ( this ==  g_dns.getCache      ()              ) check = true;
 	if ( this ==  g_dns.getCacheLocal ()              ) check = true;
@ -558,11 +558,11 @@ bool RdbCache::getRecord ( collnum_t collnum   ,
 	//if ( this == &g_tagdb.m_listCache                ) check = true;
 	// the exact count cache...
 	//if ( this == &g_qtable                            ) check = true;
-	if ( m_totalBufSize < 20000                       ) check = false;
+	//if ( m_totalBufSize < 20000                       ) check = false;
 	if ( check ) promoteRecord = false;
 	// sanity check, do not allow the site quality cache or dns cache to 
 	// be > 128MB, that just does not make sense and it complicates things
-	if ( check && m_totalBufSize > BUFSIZE ) { char *xx = NULL; *xx = 0; }
+	//if(check && m_totalBufSize > BUFSIZE ) { char *xx = NULL; *xx = 0; }
 	// sanity check
 	if ( m_tail < 0 || m_tail > m_totalBufSize ) { 
 		char *xx = NULL; *xx = 0; }
@ -957,11 +957,13 @@ bool RdbCache::addRecord ( collnum_t collnum ,
 	m_memOccupied += ( p - start ); 

 	// debug msg (MDW)
-	//log("cache: adding rec @ %"UINT32" size=%"INT32" tail=%"UINT32"",
-	//    i1c,p-start,m_tail);
-	//log("cache: stored k.n1=%"UINT32" k.n0=%"UINT64" %"INT32" bytes @ %"UINT32" tail=%"UINT32"",
-	//    ((key_t *)cacheKey)->n1,
-	//    ((key_t *)cacheKey)->n0,p-start,i1c,m_tail);
+	// if ( this == &g_spiderLoop.m_winnerListCache ) {
+	// log("cache: adding rec @ %"UINT32" size=%i tail=%"INT32"",
+	//     i1c,(int)(p-start),m_tail);
+	// log("cache: stored k.n1=%"UINT32" k.n0=%"UINT64" %"INT32" bytes @ %"UINT32" tail=%"UINT32"",
+	//     ((key_t *)cacheKey)->n1,
+	//     ((key_t *)cacheKey)->n0,(int)(p-start),i1c,m_tail);
+	// }
 	//if ( m_cks == 4 )
 	//	log("stored k=%"XINT32" %"INT32" bytes @ %"UINT32"",
 	//	    *(int32_t *)cacheKey,p-start,i);//(uint32_t)start);
@ -1113,8 +1115,10 @@ bool RdbCache::deleteRec ( ) {
 	//int32_t saved = m_tail;
 	
 	// debug msg (MDW)
-	//log("cache: deleting rec @ %"INT32" size=%"INT32"",m_tail,
-	//    dataSize+2+12+4+4);
+	// if ( this == &g_spiderLoop.m_winnerListCache ) {
+	// log("cache: deleting rec @ %"INT32" size=%"INT32"",m_tail,
+	//     dataSize+2+12+4+4);
+	// }

 	// skip over rest of rec
 	p += dataSize;
@ -1128,6 +1132,10 @@ bool RdbCache::deleteRec ( ) {
 	     m_tail +(int32_t)sizeof(collnum_t)+m_cks+4>m_totalBufSize){
 		char *xx = NULL; *xx = 0;}
 	
+	// if ( this == &g_spiderLoop.m_winnerListCache )
+	// 	log("spider: rdbcache: removing tail rec collnum=%i",
+	// 	    (int)collnum);
+
 	// delete key from hash table, iff is for THIS record
 	// but if it has not already been voided.
 	// we set key to KEYMAX() in markDeletedRecord()
@ -1167,8 +1175,10 @@ bool RdbCache::deleteRec ( ) {
 void RdbCache::markDeletedRecord(char *ptr){
 	int32_t dataSize = sizeof(collnum_t)+m_cks+sizeof(int32_t);
 	// debug it 
-	//logf(LOG_DEBUG,"cache: makeDeleteRecord ptr=0x%"XINT32" off=%"INT32"",
-	//     (int32_t)ptr,ptr-m_bufs[0]);
+	// if ( this == &g_spiderLoop.m_winnerListCache ) {
+	//logf(LOG_DEBUG,"cache: makeDeleteRec ptr=0x%"PTRFMT" off=%"INT32"",
+	//      (PTRTYPE)ptr,(int32_t)(ptr-m_bufs[0]));
+	// }
 	// get dataSize and data
 	if ( m_fixedDataSize == -1 || m_supportLists ) {
 		dataSize += 4 +                      // size
--- a/RdbList.cpp
+++ b/RdbList.cpp
@ -3566,4 +3566,32 @@ void RdbList::setFromSafeBuf ( SafeBuf *sb , char rdbId ) {

 }

+void RdbList::setFromPtr ( char *p , int32_t psize , char rdbId ) {
+	// make this list reference the "psize" bytes at "p" as an "rdbId" list
+	// free and NULLify any old m_list we had to make room for our new list
+	freeList();
+
+	// key size for this rdbId. set this first since others depend on it
+	m_ks = getKeySizeFromRdbId ( rdbId );
+
+	// set our list parms. m_list and m_alloc both point at the given buffer
+	m_list          = p;
+	m_listSize      = psize;
+	m_alloc         = p;
+	m_allocSize     = psize;
+	m_listEnd       = m_list + m_listSize;
+	// NOTE(review): presumably the min/max keys for this key size -- confirm
+	KEYMIN(m_startKey,m_ks);
+	KEYMAX(m_endKey  ,m_ks);
+
+	m_fixedDataSize = getDataSizeFromRdbId ( rdbId );
+	// NOTE(review): presumably means this list never frees "p" -- confirm
+	m_ownData       = false;//ownData;
+	m_useHalfKeys   = false;//useHalfKeys;
+
+	// use this call now to set m_listPtr and m_listPtrHi based on m_list
+	resetListPtr();
+
+}
+

--- a/RdbList.h
+++ b/RdbList.h
@ -107,6 +107,7 @@ class RdbList {
 		  char  keySize       = sizeof(key_t) );

 	void setFromSafeBuf ( class SafeBuf *sb , char rdbId );
+	void setFromPtr ( char *p , int32_t psize , char rdbId ) ;

 	// just set the start and end keys
 	//void set ( key_t startKey , key_t endKey );
--- a/Spider.cpp
+++ b/Spider.cpp
@ -3579,7 +3579,7 @@ bool SpiderColl::evalIpLoop ( ) {
 					  &doleBuf,
 					  &doleBufSize  ,
 					  false, // doCopy?
-					  300, // maxAge, 300 seconds
+					  600, // maxAge, 600 seconds
 					  true ,// incCounts
 					  &cachedTimestamp , // rec timestamp
 					  true );  // promote rec?
@ -3587,25 +3587,47 @@ bool SpiderColl::evalIpLoop ( ) {
 	}


+	// if ( m_collnum == 18752 ) {
+	// 	int32_t coff = 0;
+	// 	if ( inCache && doleBufSize >= 4 ) coff = *(int32_t *)doleBuf;
+	// 	log("spider: usecache=%i incache=%i dbufsize=%i currentoff=%i "
+	// 	    "ctime=%i ip=%s"
+	// 	    ,(int)useCache
+	// 	    ,(int)inCache
+	// 	    ,(int)doleBufSize
+	// 	    ,(int)coff
+	// 	    ,(int)cachedTimestamp
+	// 	    ,iptoa(m_scanningIp));
+	// }
+
 	// doleBuf could be NULL i guess...
 	if ( inCache ) { // && doleBufSize > 0 ) {
-		if ( g_conf.m_logDebugSpider )
+		int32_t crc = hash32 ( doleBuf + 4 , doleBufSize - 4 );
+		if ( g_conf.m_logDebugSpider ) // || m_collnum == 18752 )
 			log("spider: GOT %"INT32" bytes of SpiderRequests "
-			    "from winnerlistcache for ip %s",doleBufSize,
-			    iptoa(m_scanningIp));
+			    "from winnerlistcache for ip %s ptr=0x%"PTRFMT
+			    " crc=%"UINT32
+			    ,doleBufSize,
+			    iptoa(m_scanningIp),
+			    (PTRTYPE)doleBuf,
+			    crc);
 		// set own to false so it doesn't get freed
 		// m_doleBuf.setBuf ( doleBuf , 
 		// 		   doleBufSize ,
 		// 		   doleBufSize , 
 		// 		   false , // ownData?
 		// 		   0 ); // encoding. doesn't matter.
-		m_doleBuf.reset();
+		//m_doleBuf.reset();
 		// gotta copy it because we end up re-adding part of it
 		// to rdbcache below
-		m_doleBuf.safeMemcpy ( doleBuf , doleBufSize );
+		//m_doleBuf.safeMemcpy ( doleBuf , doleBufSize );
+		// we no longer re-add to avoid churn. but do not free it
+		// so do not 'own' it.
+		SafeBuf sb;
+		sb.setBuf ( doleBuf, doleBufSize, doleBufSize, false );
 		// now add the first rec m_doleBuf into doledb's tree
 		// and re-add the rest back to the cache with the same key.
-		return addDoleBufIntoDoledb ( true , cachedTimestamp );
+		return addDoleBufIntoDoledb(&sb,true);//,cachedTimestamp)
 	}

 top:
@ -4721,6 +4743,9 @@ bool SpiderColl::scanListForWinners ( ) {
 		int32_t maxWinners = (int32_t)MAX_WINNER_NODES; // 40
 		//if ( ! m_cr->m_isCustomCrawl ) maxWinners = 1;

+		// if less than 10MB of spiderdb requests limit to 400
+		if ( m_totalBytesScanned < 10000000 ) maxWinners = 400;
+
 		// only put one doledb record into winner tree if
 		// the list is pretty short. otherwise, we end up caching
 		// too much. granted, we only cache for about 2 mins.
@ -5228,16 +5253,23 @@ bool SpiderColl::addWinnersIntoDoledb ( ) {
 	}


+	// i've seen this happen, wtf?
+	if ( m_winnerTree.isEmpty() && m_minFutureTimeMS ) { 
+		// this will update the waiting tree key with minFutureTimeMS
+		addDoleBufIntoDoledb ( NULL , false );
+		return true;
+	}
+
 	// i am seeing dup uh48's in the m_winnerTree
 	int32_t firstIp = m_waitingTreeKey.n0 & 0xffffffff;
-	char dbuf[3*MAX_WINNER_NODES*(8+1)];
+	char dbuf[147456];//3*MAX_WINNER_NODES*(8+1)];
 	HashTableX dedup;
 	int32_t ntn = m_winnerTree.getNumNodes();
 	dedup.set ( 8,
 		    0,
 		    (int32_t)2*ntn, // # slots to initialize to
 		    dbuf,
-		    (int32_t)(3*MAX_WINNER_NODES*(8+1)),
+		    147456,//(int32_t)(3*MAX_WINNER_NODES*(8+1)),
 		    false,
 		    MAX_NICENESS,
 		    "windt");
@ -5247,7 +5279,14 @@ bool SpiderColl::addWinnersIntoDoledb ( ) {
 	// make winner tree into doledb list to add
 	//
 	///////////
-	m_doleBuf.reset();
+	//m_doleBuf.reset();
+	//m_doleBuf.setLabel("dolbuf");
+	// first 4 bytes is offset of next doledb record to add to doledb
+	// so we do not have to re-add the dolebuf to the cache and make it
+	// churn. it is really inefficient.
+	SafeBuf doleBuf;
+	doleBuf.pushLong(4);
+	int32_t added = 0;
 	for ( int32_t node = m_winnerTree.getFirstNode() ; 
 	      node >= 0 ; 
 	      node = m_winnerTree.getNextNode ( node ) ) {
@ -5297,16 +5336,18 @@ bool SpiderColl::addWinnersIntoDoledb ( ) {
 			log("spider: got dup uh48=%"UINT64" dammit", winUh48);
 			continue;
 		}
+		// count it
+		added++;
 		// do not allow dups
 		dedup.addKey ( &winUh48 );
 		// store doledb key first
-		if ( ! m_doleBuf.safeMemcpy ( &doleKey, sizeof(key_t) ) ) 
+		if ( ! doleBuf.safeMemcpy ( &doleKey, sizeof(key_t) ) ) 
 			hadError = true;
 		// then size of spiderrequest
-		if ( ! m_doleBuf.pushLong ( sreq2->getRecSize() ) ) 
+		if ( ! doleBuf.pushLong ( sreq2->getRecSize() ) ) 
 			hadError = true;
 		// then the spiderrequest encapsulated
-		if ( ! m_doleBuf.safeMemcpy ( sreq2 , sreq2->getRecSize() )) 
+		if ( ! doleBuf.safeMemcpy ( sreq2 , sreq2->getRecSize() )) 
 			hadError=true;
 		// note and error
 		if ( hadError ) {
@ -5316,11 +5357,52 @@ bool SpiderColl::addWinnersIntoDoledb ( ) {
 		}
 	}

-	return addDoleBufIntoDoledb ( false , 0 );
+	// log("spider: added %"INT32" doledb recs to cache for cn=%i "
+	//     "dolebufsize=%i",
+	//     added,
+	//     (int)m_collnum,
+	//     (int)doleBuf.length());
+
+	return addDoleBufIntoDoledb ( &doleBuf , false );//, 0 );
 }

-bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
-					uint32_t cachedTimestamp ) {
+bool SpiderColl::validateDoleBuf ( SafeBuf *doleBuf ) { // debug sanity walk
+	char *doleBufEnd = doleBuf->getBuf(); // used below as the end bound
+	// the first 4 bytes hold "jump", an offset from the start of the buffer
+	char *pstart = doleBuf->getBufStart();
+	char *p = pstart;
+	int32_t jump = *(int32_t *)p;
+	p += 4;
+	// sanity: jump may not point into its own 4 bytes or past the length
+	if ( jump < 4 || jump > doleBuf->getLength() ) {
+		char *xx=NULL;*xx=0; } // deliberate write through NULL
+	bool gotIt = false;
+	for ( ; p < doleBuf->getBuf() ; ) { // visit each record in turn
+		if ( p == pstart + jump ) // jump lands on a record start
+			gotIt = true;
+		// first is doledbkey
+		p += sizeof(key_t);
+		// then the 4-byte size of the spider request
+		int32_t recSize = *(int32_t *)p;
+		p += 4;
+		// the spider request encapsulated
+		SpiderRequest *sreq3;
+		sreq3 = (SpiderRequest *)p;
+		// stored size must agree with getRecSize(), then skip the request
+		if ( recSize != sreq3->getRecSize() ) { char *xx=NULL;*xx=0;}
+		p += recSize;//sreq3->getRecSize();
+		// sanity: "p" must stay between pstart and doleBufEnd
+		if ( p > doleBufEnd ) { char *xx=NULL;*xx=0; }
+		if ( p < pstart     ) { char *xx=NULL;*xx=0; }
+	}
+	if ( ! gotIt ) { char *xx=NULL;*xx=0; } // jump matched no record start
+	return true; // only reached when none of the checks above fired
+}
+
+bool SpiderColl::addDoleBufIntoDoledb ( SafeBuf *doleBuf, bool isFromCache ) {
+					// uint32_t cachedTimestamp ) {
+
+	//validateDoleBuf ( doleBuf );

 	////////////////////
 	//
@ -5390,6 +5472,10 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
 	// right now.
 	if ( m_winnerTree.isEmpty() && m_minFutureTimeMS && ! isFromCache ) {

+		// save memory
+		m_winnerTree.reset();
+		m_winnerTable.reset();
+
 		// if in the process of being added to doledb or in doledb...
 		if ( m_doleIpTable.isInTable ( &firstIp ) ) {
 			// sanity i guess. remove this line if it hits this!
@ -5500,6 +5586,8 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
 	// how did this happen?
 	//if ( ! m_msg1Avail ) { char *xx=NULL;*xx=0; }

+	char *doleBufEnd = doleBuf->getBuf();
+
 	// add it to doledb ip table now so that waiting tree does not
 	// immediately get another spider request from this same ip added
 	// to it while the msg4 is out. but if add failes we totally bail
@ -5510,36 +5598,50 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
 	//if ( ! addToDoleTable ( m_bestRequest ) ) return true;
 	// . MDW: now we have a list of doledb records in a SafeBuf:
 	// . scan the requests in safebuf
-	int32_t skipSize = 0;
-	for ( char *p = m_doleBuf.getBufStart() ; p < m_doleBuf.getBuf() ; ) {
-		// first is doledbkey
-		p += sizeof(key_t);
-		// then size of spider request
-		p += 4;
-		// the spider request encapsulated
-		SpiderRequest *sreq3;
-		sreq3 = (SpiderRequest *)p;
-		// point "p" to next spiderrequest
-		p += sreq3->getRecSize();
-		// for caching logic below, set this
-		skipSize = sizeof(key_t) + 4 + sreq3->getRecSize();
-		// process sreq3 my incrementing the firstip count in 
-		// m_doleIpTable
-		if ( ! addToDoleTable ( sreq3 ) ) return true;	

-		// only add the top key for now!
-		break;
+	// get offset
+	char *p = doleBuf->getBufStart();
+	int32_t jump = *(int32_t *)p;
+	// sanity
+	if ( jump < 4 || jump > doleBuf->getLength() ) {
+		char *xx=NULL;*xx=0; }
+	// the jump includes itself
+	p += jump;
+	//for ( ; p < m_doleBuf.getBuf() ; ) {
+	// save it
+	char *doledbRec = p;
+	// first is doledbkey
+	p += sizeof(key_t);
+	// then size of spider request
+	p += 4;
+	// the spider request encapsulated
+	SpiderRequest *sreq3;
+	sreq3 = (SpiderRequest *)p;
+	// point "p" to next spiderrequest
+	p += sreq3->getRecSize();

-		// this logic is now in addToDoleTable()
-		// . if it was empty it is no longer
-		// . we have this flag here to avoid scanning empty doledb 
-		//   priorities because it saves us a msg5 call to doledb in 
-		//   the scanning loop
-		//int32_t bp = sreq3->m_priority;//m_bestRequest->m_priority;
-		//if ( bp <  0                     ) { char *xx=NULL;*xx=0; }
-		//if ( bp >= MAX_SPIDER_PRIORITIES ) { char *xx=NULL;*xx=0; }
-		//m_isDoledbEmpty [ bp ] = 0;
-	}
+	// sanity
+	if ( p > doleBufEnd ) { char *xx=NULL;*xx=0; }
+
+	// for caching logic below, set this
+	int32_t doledbRecSize = sizeof(key_t) + 4 + sreq3->getRecSize();
+	// process sreq3 by incrementing the firstip count in
+	// m_doleIpTable
+	if ( ! addToDoleTable ( sreq3 ) ) return true;	
+
+	// only add the top key for now!
+	//break;
+
+	// 	// this logic is now in addToDoleTable()
+	// 	// . if it was empty it is no longer
+	// 	// . we have this flag here to avoid scanning empty doledb 
+	// 	//   priorities because it saves us a msg5 call to doledb in 
+	// 	//   the scanning loop
+	// 	//int32_t bp = sreq3->m_priority;//m_bestRequest->m_priority;
+	// 	//if ( bp <  0                     ) { char *xx=NULL;*xx=0; }
+	// 	//if ( bp >= MAX_SPIDER_PRIORITIES ) { char *xx=NULL;*xx=0; }
+	// 	//m_isDoledbEmpty [ bp ] = 0;
+	// }

 	// now cache the REST of the spider requests to speed up scanning.
 	// better than adding 400 recs per firstip to doledb because
@ -5548,20 +5650,25 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
 	// top rec.
 	// allow this to add a 0 length record otherwise we keep the same
 	// old url in here and keep spidering it over and over again!
-	bool addToCache = false;
-	if ( skipSize && m_doleBuf.length() - skipSize > 0 ) addToCache =true;
+
+	//bool addToCache = false;
+	//if( skipSize && m_doleBuf.length() - skipSize > 0 ) addToCache =true;
 	// if winnertree was empty, then we might have scanned like 10M
 	// twitter.com urls and not wanted any of them, so we don't want to
 	// have to keep redoing that!
-	if ( m_doleBuf.length() == 0 && ! isFromCache ) addToCache = true;
+	//if ( m_doleBuf.length() == 0 && ! isFromCache ) addToCache = true;

 	RdbCache *wc = &g_spiderLoop.m_winnerListCache;

 	// remove from cache? if we added the last spider request in the
 	// cached dolebuf to doledb then remove it from cache so it's not
 	// a cached empty dolebuf and we recompute it not using the cache.
-	if ( isFromCache && skipSize && m_doleBuf.length() - skipSize == 0 ) {
-		if ( addToCache ) { char *xx=NULL;*xx=0; }
+	if ( isFromCache && p >= doleBufEnd ) {
+		//if ( addToCache ) { char *xx=NULL;*xx=0; }
+		// debug note
+		// if ( m_collnum == 18752 )
+		// 	log("spider: rdbcache: adding single byte. skipsize=%i"
+		// 	    ,doledbRecSize);
 		// let's get this working right...
 		//wc->removeKey ( collnum , k , start );
 		//wc->markDeletedRecord(start);
@ -5582,21 +5689,67 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
 		//wc->verify();
 	}

-	if ( addToCache ) {
+	// if it wasn't in the cache and it was only one record we
+	// obviously do not want to add it to the cache.
+	else if ( p < doleBufEnd ) { // if ( addToCache ) {
 		key_t cacheKey;
 		cacheKey.n0 = firstIp;
 		cacheKey.n1 = 0;
-		if ( g_conf.m_logDebugSpider )
-			log("spider: adding %"INT32" bytes of SpiderRequests "
-			    "to winnerlistcache for ip %s",
-			    m_doleBuf.length()-skipSize,iptoa(firstIp));
+		char *x = doleBuf->getBufStart();
+		// the new offset is the next record after the one we
+		// just added to doledb
+		int32_t newJump = (int32_t)(p - x);
+		int32_t oldJump = *(int32_t *)x;
+		// NO! we do a copy in rdbcache and copy the thing over
+		// since we promote it. so this won't work...
+		*(int32_t *)x = newJump;
+		if ( newJump >= doleBuf->getLength() ) { char *xx=NULL;*xx=0;}
+		if ( newJump < 4 ) { char *xx=NULL;*xx=0;}
+		if ( g_conf.m_logDebugSpider ) // || m_collnum == 18752 )
+			log("spider: rdbcache: updating "
+			    "%"INT32" bytes of SpiderRequests "
+			    "to winnerlistcache for ip %s oldjump=%"INT32
+			    " newJump=%"INT32" ptr=0x%"PTRFMT,
+			    doleBuf->length(),iptoa(firstIp),oldJump,
+			    newJump,
+			    (PTRTYPE)x);
+		//validateDoleBuf ( doleBuf );
 		//wc->verify();
 		// inherit timestamp. if 0, RdbCache will set to current time
-		wc->addRecord ( m_collnum,
-				(char *)&cacheKey,
-				m_doleBuf.getBufStart() + skipSize ,
-				m_doleBuf.length() - skipSize ,
-				cachedTimestamp );
+		// don't re-add just use the same modified buffer so we
+		// don't churn the cache.
+		// but do add it to cache if not already in there yet.
+		if ( ! isFromCache ) {
+			// if ( m_collnum == 18752 )
+			// 	log("spider: rdbcache: adding record a new "
+			// 	    "dbufsize=%i",(int)doleBuf->length());
+			wc->addRecord ( m_collnum,
+					(char *)&cacheKey,
+					doleBuf->getBufStart(),//+ skipSize ,
+					doleBuf->length() ,//- skipSize ,
+					0);//cachedTimestamp );
+		}
+		//validateDoleBuf( doleBuf );
+		/*
+		// test it
+		char *testPtr;
+		int32_t testLen;
+		bool inCache2 = wc->getRecord ( m_collnum     ,
+						(char *)&cacheKey ,
+						&testPtr,
+						&testLen,
+						false, // doCopy?
+						600, // maxAge,600 secs
+						true ,// incCounts
+						NULL , // rec timestamp
+						true );  // promote?
+		if ( ! inCache2 ) { char *xx=NULL;*xx=0; }
+		if ( testLen != m_doleBuf.length() ) {char *xx=NULL;*xx=0; }
+		if ( *(int32_t *)testPtr != newJump ){char *xx=NULL;*xx=0; }
+		SafeBuf tmp;
+		tmp.setBuf ( testPtr , testLen , testLen , false );
+		validateDoleBuf ( &tmp );
+		*/
 		//wc->verify();
 	}

@ -5634,16 +5787,18 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,

 	// only add one doledb record at a time now since we
 	// have the winnerListCache
-	m_doleBuf.setLength ( skipSize );
+	//m_doleBuf.setLength ( skipSize );

-	tmpList.setFromSafeBuf ( &m_doleBuf , RDB_DOLEDB );
+	//tmpList.setFromSafeBuf ( &m_doleBuf , RDB_DOLEDB );
+	tmpList.setFromPtr ( doledbRec , doledbRecSize , RDB_DOLEDB );

 	// now that doledb is tree-only and never dumps to disk, just
 	// add it directly
 	g_doledb.m_rdb.addList ( m_collnum , &tmpList , MAX_NICENESS );

 	if ( g_conf.m_logDebugSpider )
-		log("spider: adding doledb tree node size=%"INT32"",skipSize);
+		log("spider: adding doledb tree node size=%"INT32"",
+		    doledbRecSize);


 	// and it happens right away. just add it locally.
@ -5703,6 +5858,12 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
 		    "removed from waiting table",
 		    iptoa(firstIp));

+	// save memory
+	m_winnerTree.reset();
+	m_winnerTable.reset();
+
+	//validateDoleBuf( doleBuf );
+
 	// add did not block
 	return status;
 }
@ -10011,10 +10172,23 @@ bool sendPage ( State11 *st ) {
 	// print time format: 7/23/1971 10:45:32
 	int64_t timems = gettimeofdayInMillisecondsGlobal();
 	sb.safePrintf("</b> (current time = %"UINT64")(totalcount=%"INT32")"
-		      "(waittablecount=%"INT32")</td></tr>\n",
+		      "(waittablecount=%"INT32")",
 		      timems,
 		      sc->m_waitingTree.getNumUsedNodes(),
 		      sc->m_waitingTable.getNumUsedSlots());
+
+	double a = (double)g_spiderdb.getUrlHash48 ( &sc->m_firstKey );
+	double b = (double)g_spiderdb.getUrlHash48 ( &sc->m_endKey );
+	double c = (double)g_spiderdb.getUrlHash48 ( &sc->m_nextKey );
+	double percent = (100.0 * (c-a)) ;
+	if ( b-a > 0 ) percent /= (b-a);
+	if ( percent > 100.0 ) percent = 100.0;
+	if ( percent < 0.0 ) percent = 0.0;
+	sb.safePrintf("(spiderdb scan for ip %s is %.2f%% complete)",
+		      iptoa(sc->m_scanningIp),
+		      (float)percent );
+
+	sb.safePrintf("</td></tr>\n");
 	sb.safePrintf("<tr bgcolor=#%s>",DARK_BLUE);
 	sb.safePrintf("<td><b>spidertime (MS)</b></td>\n");
 	sb.safePrintf("<td><b>firstip</b></td>\n");
--- a/Spider.h
+++ b/Spider.h
@ -1131,7 +1131,7 @@ class SpiderColl {

 	// doledbkey + dataSize + bestRequestRec
 	//char m_doleBuf[MAX_DOLEREC_SIZE];
-	SafeBuf m_doleBuf;
+	//SafeBuf m_doleBuf;

 	bool m_isLoading;

@ -1192,7 +1192,9 @@ class SpiderColl {

 	bool  addToDoleTable   ( SpiderRequest *sreq ) ;

-	bool addDoleBufIntoDoledb (bool isFromCache,uint32_t cachedTimestamp);
+	bool validateDoleBuf ( SafeBuf *doleBuf ) ;
+	bool addDoleBufIntoDoledb ( SafeBuf *doleBuf , bool isFromCache);
+	//,uint32_t cachedTimestamp);

 	bool updateSiteNumInlinksTable ( int32_t siteHash32,int32_t sni,
 					 time_t tstamp); // time_t
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -16963,9 +16963,8 @@ char **XmlDoc::getHttpReply2 ( ) {
 	bool isInjecting = getIsInjecting();
 	if ( ! isInjecting && m_sreqValid     && m_sreq.m_hopCount == 0 )
 		r->m_isRootSeedUrl = 1;
-	// only if it was a seed for now... so comment out
-	// if ( ! isInjecting && m_hopCountValid && m_hopCount        == 0 )
-	// 	r->m_isRootSeedUrl = 1;
+	if ( ! isInjecting && m_hopCountValid && m_hopCount        == 0 )
+		r->m_isRootSeedUrl = 1;

 	// sanity check
 	if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
@ -19219,6 +19218,9 @@ char **XmlDoc::getExpandedUtf8Content ( ) {
 		// <iframe src=""> which ends up embedding the root url.
 		if ( urlLen == 0 ) 
 			continue;
+		// skip if "about:blank"
+		if ( urlLen==11 && strncmp(url,"about:blank",11) == 0 )
+			continue;
 		// get our current url
 		//cu = getCurrentUrl();
 		// set our frame url
@ -21580,12 +21582,13 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
 	//
 	// print # of link texts from 2nd coll
 	//
-	if ( m_linkInfo2Valid ) {
-		LinkInfo *info = ptr_linkInfo2; 
-		int32_t nt = 0;
-		if ( info ) nt = info->getNumLinkTexts();
-		if ( nt ) sb->safePrintf("goodinlinks2=%"INT32" ",nt );
-	}
+	// this is no longer used for its original purpose.
+	// if ( m_linkInfo2Valid && size_linkInfo2 > 4 ) {
+	// 	LinkInfo *info = ptr_linkInfo2; 
+	// 	int32_t nt = 0;
+	// 	if ( info ) nt = info->getNumLinkTexts();
+	// 	if ( nt ) sb->safePrintf("goodinlinks2=%"INT32" ",nt );
+	// }

 	if (  m_docIdValid ) 
 		sb->safePrintf("docid=%"UINT64" ",m_docId);
--- a/fctypes.cpp
+++ b/fctypes.cpp
@ -2504,7 +2504,7 @@ int32_t deserializeMsg ( int32_t  baseSize ,
 		// make it NULL if size is 0 though
 		if ( *sizePtr == 0 ) *strPtr = NULL;
 		// sanity check
-		if ( *sizePtr < 0 ) { char *xx = NULL; *xx =0; }
+		if ( *sizePtr < 0 ) { g_errno = ECORRUPTDATA; return -1;}
 		// advance our destination ptr
 		p += *sizePtr;
 		// advance both ptrs to next string
--- a/fctypes.h
+++ b/fctypes.h
@ -620,6 +620,7 @@ char *serializeMsg2 ( void *thisPtr ,
 		      int32_t *retSize );

 // convert offsets back into ptrs
+// returns -1 on error
 int32_t deserializeMsg ( int32_t  baseSize ,
 		      int32_t *firstSizeParm ,
 		      int32_t *lastSizeParm ,