Merge branch 'ia' into ia-zak
commit 29212bbe9c

Collectiondb.cpp (106 changed lines)
@@ -463,12 +463,24 @@ bool Collectiondb::addNewColl ( char *coll ,
if ( ! h ) {
log("crawlbot: bad custom collname");
g_errno = EBADENGINEER;
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
return true;
}
*h = '\0';
crawl = h + 1;
if ( ! crawl[0] ) {
log("crawlbot: bad custom crawl name");
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
g_errno = EBADENGINEER;
return true;
}
// or if too big!
if ( gbstrlen(crawl) > 30 ) {
log("crawlbot: crawlbot crawl NAME is over 30 chars");
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
g_errno = EBADENGINEER;
return true;
}
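For context, the checks above validate crawlbot-style collection names. A minimal standalone sketch of the same rules, assuming the name has the form token-crawlname and that h points at the '-' separator (validateCustomCollName is a hypothetical helper, not part of this commit):

#include <cstring>
#include <cstdio>

// Hypothetical helper mirroring the checks above: the name must contain
// a '-' separator and the crawl part must be non-empty and at most 30
// characters long.
static bool validateCustomCollName ( const char *coll ) {
        const char *h = strchr ( coll , '-' );
        if ( ! h ) {
                fprintf ( stderr , "crawlbot: bad custom collname\n" );
                return false;
        }
        const char *crawl = h + 1;
        if ( ! crawl[0] ) {
                fprintf ( stderr , "crawlbot: bad custom crawl name\n" );
                return false;
        }
        if ( strlen ( crawl ) > 30 ) {
                fprintf ( stderr , "crawlbot: crawl NAME is over 30 chars\n" );
                return false;
        }
        return true;
}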
@@ -1939,12 +1951,13 @@ bool CollectionRec::load ( char *coll , int32_t i ) {

// the list of ip addresses that we have detected as being throttled
// and therefore backoff and use proxies for
sb.reset();
sb.safePrintf("%scoll.%s.%"INT32"/",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
m_twitchyTable.m_allocName = "twittbl";
m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );

if ( ! g_conf.m_doingCommandLine ) {
sb.reset();
sb.safePrintf("%scoll.%s.%"INT32"/",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
m_twitchyTable.m_allocName = "twittbl";
m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
}
@@ -3472,6 +3485,70 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
char *ppp = m_diffbotPageProcessPattern.getBufStart();
if ( ppp && ! ppp[0] ) ppp = NULL;

///////
//
// recompile regular expressions
//
///////

if ( m_hasucr ) {
regfree ( &m_ucr );
m_hasucr = false;
}

if ( m_hasupr ) {
regfree ( &m_upr );
m_hasupr = false;
}

// copy into tmpbuf
SafeBuf tmp;

char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasucr = true;
}
int32_t err;
if ( rx && ( err = regcomp ( &m_ucr , tmp.getBufStart() ,
REG_EXTENDED| //REG_ICASE|
REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
// error!
char errbuf[1024];
regerror(err,&m_ucr,errbuf,1000);
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,errbuf);
regfree ( &m_ucr );
m_hasucr = false;
}

rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasupr = true;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasupr = true;
}
if ( rx && ( err = regcomp ( &m_upr , tmp.getBufStart() ,
REG_EXTENDED| // REG_ICASE|
REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
char errbuf[1024];
regerror(err,&m_upr,errbuf,1000);
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,errbuf);
regfree ( &m_upr );
m_hasupr = false;
}

// what diffbot url to use for processing
char *api = m_diffbotApiUrl.getBufStart();
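Once m_ucr and m_upr are compiled as above, urls are tested against them elsewhere with the POSIX regexec() call. A minimal sketch of that usage pattern, assuming the same REG_EXTENDED|REG_NEWLINE flags (urlMatchesPattern is a hypothetical name, not part of this commit):

#include <regex.h>
#include <stdio.h>

// Hypothetical illustration: compile an extended regex the way the code
// above does, test one url against it, then free it.
static bool urlMatchesPattern ( const char *pattern , const char *url ) {
        regex_t re;
        int err = regcomp ( &re , pattern , REG_EXTENDED | REG_NEWLINE );
        if ( err ) {
                char errbuf[1024];
                regerror ( err , &re , errbuf , sizeof(errbuf) );
                fprintf ( stderr , "regcomp %s failed: %s\n" , pattern , errbuf );
                return false;
        }
        // regexec() returns 0 on a match and REG_NOMATCH otherwise
        bool match = ( regexec ( &re , url , 0 , NULL , 0 ) == 0 );
        regfree ( &re );
        return match;
}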
@@ -3913,17 +3990,20 @@ void testRegex ( ) {
rx = ".*?article[0-9]*?.html";

regex_t ucr;
int32_t err;

if ( regcomp ( &ucr , rx ,
REG_ICASE
|REG_EXTENDED
//|REG_NEWLINE
//|REG_NOSUB
) ) {
if ( ( err = regcomp ( &ucr , rx ,
REG_ICASE
|REG_EXTENDED
//|REG_NEWLINE
//|REG_NOSUB
) ) ) {
// error!
char errbuf[1024];
regerror(err,&ucr,errbuf,1000);
log("xmldoc: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
rx,errbuf);
}

logf(LOG_DEBUG,"db: compiled '%s' for crawl pattern",rx);
Errno.cpp

@@ -108,6 +108,7 @@ case EDNSBAD : return "DNS sent an unknown response code";
case EDNSREFUSED : return "DNS refused to talk";
case EDNSDEAD : return "DNS hostname does not exist";
case EDNSTIMEDOUT : return "DNS timed out";
case EDNSERROR : return "DNS lookup error";
case ECOLLTOOBIG : return "Collection is too long";
case ESTRIKEOUT : return "Retried enough times, deleting doc";
case ENOPERM : return "Permission Denied";
Errno.h (1 changed line)

@@ -112,6 +112,7 @@ enum {
EDNSREFUSED , //dns refused to talk to us
EDNSDEAD , //dns is dead
EDNSTIMEDOUT , //was just EUDPTIMEDOUT
EDNSERROR ,
ECOLLTOOBIG , //collection is too long
ESTRIKEOUT , //retried enough times; deleting doc & giving up
ENOPERM , //permission denied
Mem.cpp (4 changed lines)

@@ -700,7 +700,7 @@ void Mem::addMem ( void *mem , int32_t size , const char *note , char isnew ) {
//(int32_t)mem,size,h,s_n,note);
s_n++;
// debug
if ( size > MINMEM && g_conf.m_logDebugMemUsage )
if ( (size > MINMEM && g_conf.m_logDebugMemUsage) || size>=100000000 )
log(LOG_INFO,"mem: addMem(%"INT32"): %s. ptr=0x%"PTRFMT" "
"used=%"INT64"",
size,note,(PTRTYPE)mem,m_used);

@@ -1023,7 +1023,7 @@ bool Mem::rmMem ( void *mem , int32_t size , const char *note ) {

keepgoing:
// debug
if ( size > MINMEM && g_conf.m_logDebugMemUsage )
if ( (size > MINMEM && g_conf.m_logDebugMemUsage) || size>=100000000 )
log(LOG_INFO,"mem: rmMem (%"INT32"): "
"ptr=0x%"PTRFMT" %s.",size,(PTRTYPE)mem,note);
Msg2.cpp (23 changed lines)

@@ -194,11 +194,11 @@ bool Msg2::getLists ( ) {
int32_t minRecSize = m_minRecSizes[m_i];

// sanity check
if ( ( minRecSize > ( 500 * 1024 * 1024 ) ||
minRecSize < 0) ){
log( "minRecSize = %"INT32"", minRecSize );
char *xx=NULL; *xx=0;
}
// if ( ( minRecSize > ( 500 * 1024 * 1024 ) ||
// minRecSize < 0) ){
// log( "minRecSize = %"INT32"", minRecSize );
// char *xx=NULL; *xx=0;
// }

//bool forceLocalIndexdb = true;
// if it is a no-split term, we may gotta get it over the net

@@ -407,7 +407,13 @@ bool Msg2::getLists ( ) {

// like 90MB last time i checked. so it won't read more
// than that...
int32_t minRecSizes = DEFAULT_POSDB_READSIZE;
// MDW: no, it's better to print oom then not give all the
// results leaving users scratching their heads. besides,
// we should do docid range splitting before we go out of
// mem. we should also report the size of each termlist
// in bytes in the query info header.
//int32_t minRecSizes = DEFAULT_POSDB_READSIZE;
int32_t minRecSizes = -1;

// start up the read. thread will wait in thread queue to
// launch if too many threads are out.

@@ -596,12 +602,13 @@ bool Msg2::gotList ( RdbList *list ) {
for ( int32_t i = 0 ; i < m_numLists ; i++ ) {
if ( m_lists[i].m_listSize < m_minRecSizes[i] ) continue;
if ( m_minRecSizes[i] == 0 ) continue;
if ( m_minRecSizes[i] == -1 ) continue;
// do not print this if compiling section xpathsitehash stats
// because we only need like 10k of list to get a decent
// reading
if ( m_req->m_forSectionStats ) break;
log("msg2: read termlist #%"INT32" size=%"INT32" maxSize=%"INT32". losing "
"docIds!",
log("msg2: read termlist #%"INT32" size=%"INT32" "
"maxSize=%"INT32". losing docIds!",
i,m_lists[i].m_listSize,m_minRecSizes[i]);
}
Msg3a.cpp

@@ -377,6 +377,9 @@ bool Msg3a::gotCacheReply ( ) {
// 'time enough for love' query was hitting 30MB termlists.
//rs = 50000000;
rs = DEFAULT_POSDB_READSIZE;//90000000; // 90MB!
// it is better to go oom then leave users scratching their
// heads as to why some results are not being returned.
rs = -1;
// if section stats, limit to 1MB
//if ( m_r->m_getSectionStats ) rs = 1000000;
// get the jth query term
Msg5.cpp (7 changed lines)

@@ -182,9 +182,10 @@ bool Msg5::getList ( char rdbId ,
// log("Msg5::readList: startKey > endKey warning");
// we no longer allow negative minRecSizes
if ( minRecSizes < 0 ) {
log(LOG_LOGIC,"net: msg5: MinRecSizes < 0, using 1.");
minRecSizes = 1;
char *xx = NULL; *xx = 0;
if ( g_conf.m_logDebugDb )
log(LOG_LOGIC,"net: msg5: MinRecSizes < 0, using 2GB.");
minRecSizes = 0x7fffffff;
//char *xx = NULL; *xx = 0;
}
// ensure startKey last bit clear, endKey last bit set
//if ( (startKey.n0 & 0x01) == 0x01 )
@@ -438,6 +438,25 @@ bool Msg1c::gotList ( ) {
// use only 64k values so we don't stress doledb/waittrees/etc.
// for large #'s of docids
int32_t firstIp = (docId & 0x0000ffff);

// bits 6-13 of the docid are the domain hash so use those
// when doing a REINDEX (not delete!) to ensure that requests
// on the same domain go to the same shard, at least when
// we have up to 256 shards. if we have more than 256 shards
// at this point some shards will not participate in the
// query reindex/delete process because of this, so
// we'll want to allow more bits in in that case perhaps.
// check out Hostdb::getShardNum(RDB_SPIDERDB) in Hostdb.cpp
// to see what shard is responsible for storing and indexing
// this SpiderRequest based on the firstIp.
if ( ! m_forceDel ) {
// if we are a REINDEX not a delete because
// deletes don't need to spider/redownload the doc
// so the distribution can be more random
firstIp >>= 6;
firstIp &= 0xff;
}

// 0 is not a legit val. it'll core below.
if ( firstIp == 0 ) firstIp = 1;
// use a fake ip
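The net effect of the block above is to derive a small fake firstIp from the docid: the low 16 bits for deletes, or docid bits 6-13 (the domain hash) for reindexes so same-domain requests land on the same shard. A standalone sketch of that mapping (fakeFirstIpFromDocId is a hypothetical name, not part of this commit):

#include <stdint.h>

// Hypothetical helper mirroring the logic above: keep at most 64k
// distinct values for deletes, and use the domain-hash bits for
// reindexes so requests for one domain go to one shard.
static int32_t fakeFirstIpFromDocId ( int64_t docId , bool forceDel ) {
        int32_t firstIp = (int32_t)( docId & 0x0000ffff );
        if ( ! forceDel ) {
                // bits 6-13 of the docid are the domain hash
                firstIp >>= 6;
                firstIp &= 0xff;
        }
        // 0 is not a legit value downstream
        if ( firstIp == 0 ) firstIp = 1;
        return firstIp;
}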
Parms.cpp (14 changed lines)

@@ -22598,15 +22598,25 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
"to adjust how often you want things respidered."
"</td></tr>"

"<tr class=poo><td>indexage</td>"
"<td>"
"How long has it been since the url was last "
"successfully indexed? In seconds. "
"Can use <, >, <=, >=, ==, != comparison operators."
"</td></tr>"

"<tr class=poo><td>urlage</td>"
"<td>"
"This is the time, in seconds, since a url was first "
"added to spiderdb to be spidered. This is "
"This uses the time, in seconds, since a url was "
"first added to spiderdb to be spidered, aka "
"its discovery date. "
"Can use <, >, <=, >=, ==, != comparison operators."
"</td></tr>"

//"<tr class=poo><td>!newoutlink</td>"
//"<td>Matches if document is NOT a new outlink."
//"</td></tr>"
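As a usage illustration (not part of this commit's text), the two new selectors can be combined with the comparison operators documented above in a url filter expression, for example:

urlage>86400 && indexage>604800

which would match urls discovered more than a day ago whose last successful indexing was more than a week ago.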
Spider.cpp (57 changed lines)

@@ -207,9 +207,21 @@ int32_t SpiderRequest::print ( SafeBuf *sbarg ) {
}

void SpiderReply::setKey (int32_t firstIp,
int64_t parentDocId,
// no need for parentdocid in this any more.
//int64_t parentDocId,
int64_t uh48,
bool isDel) {
// now we use a 1 parentdocid for replies that were successful
int64_t parentDocId = 1;
// or 0 if had error. this way we only keep at most 2 SpiderReplies
// for each url in spiderdb. we need to keep the last successful
// spiderreply in spiderdb so
// SpiderRequest::m_lastSuccessfulSpideredTime will be valid.
// this way the reply that was successful will occur after the
// one that had an error, so we can just check the last spider reply
// when doing our scan in scanListForWinners().
if ( m_errCode ) parentDocId = 0;

m_key = g_spiderdb.makeKey ( firstIp,uh48,false,parentDocId , isDel );
// set dataSize too!
m_dataSize = sizeof(SpiderReply) - sizeof(key128_t) - 4;
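A hypothetical restatement of the convention the comment above describes: the parentDocId slot in a SpiderReply key now only records success (1) or error (0), so spiderdb keeps at most two replies per url and the successful one sorts after the errored one.

// Hypothetical sketch, not part of this commit.
static int64_t replyParentDocId ( int32_t errCode ) {
        return errCode ? 0LL : 1LL;
}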
@@ -4565,6 +4577,13 @@ bool SpiderColl::scanListForWinners ( ) {
// assume our added time is the first time this url was added
sreq->m_discoveryTime = sreq->m_addedTime;

// record the last time we successfully indexed this doc, ifany
if ( srep && ! srep->m_errCode )
sreq->m_lastSuccessfulSpideredTime =
srep->m_spideredTime;
else
sreq->m_lastSuccessfulSpideredTime = 0;

// if ( uh48 == 110582802025376LL )
// log("hey");
@@ -4594,10 +4613,12 @@ bool SpiderColl::scanListForWinners ( ) {
// and the min added time as well!
// get the oldest timestamp so
// gbssDiscoveryTime will be accurate.
if ( sreq->m_discoveryTime < wsreq->m_discoveryTime )
if ( sreq->m_discoveryTime <
wsreq->m_discoveryTime )
wsreq->m_discoveryTime =
sreq->m_discoveryTime;
if ( wsreq->m_discoveryTime < sreq->m_discoveryTime )
if ( wsreq->m_discoveryTime <
sreq->m_discoveryTime )
sreq->m_discoveryTime =
wsreq->m_discoveryTime;
}
@@ -11313,6 +11334,7 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,

if ( *p != 'i' ) goto skipi;

if ( strncmp(p,"isinjected",10) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
@@ -11923,6 +11945,7 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}

// non-boolen junk
skipi:
@@ -12407,6 +12430,32 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}

// constraint for last time url was successfully indexed
if ( *p=='i' && strncmp(p,"indexage",8) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if never successfully indexed, skip this one
if ( sreq->m_lastSuccessfulSpideredTime == 0) continue;
int32_t age;
age = nowGlobal - sreq->m_lastSuccessfulSpideredTime;
// the argument entered by user
int32_t uage = atoi(s) ;
if ( sign == SIGN_EQ && age != uage ) continue;
if ( sign == SIGN_NE && age == uage ) continue;
if ( sign == SIGN_GT && age <= uage ) continue;
if ( sign == SIGN_LT && age >= uage ) continue;
if ( sign == SIGN_GE && age < uage ) continue;
if ( sign == SIGN_LE && age > uage ) continue;
// skip over 'indexage'
p += 8;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}

// selector using the first time it was added to the Spiderdb
// added by Sam, May 5th 2015
if ( *p=='u' && strncmp(p,"urlage",6) == 0 ) {
@@ -12430,6 +12479,8 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
if ( sign == SIGN_LT && sreq_age >= argument_age ) continue;
if ( sign == SIGN_GE && sreq_age < argument_age ) continue;
if ( sign == SIGN_LE && sreq_age > argument_age ) continue;
// skip over 'urlage'
p += 6;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
Spider.h (8 changed lines)

@@ -532,7 +532,11 @@ class SpiderRequest {
// then we increment the last 8 bits or so. see Msg22.cpp.
//int64_t m_probDocId;
//int32_t m_reservedc1;
int32_t m_reservedc2;
//int32_t m_reservedc2;

// if there is a 'successful' SpiderReply for this url then this is
// the SpiderReply::m_spideredTime of the most recent one.
int32_t m_lastSuccessfulSpideredTime;

//int32_t m_parentPubDate;

@@ -955,7 +959,7 @@ class SpiderReply {
void reset() { memset ( this , 0 , sizeof(SpiderReply) ); };

void setKey ( int32_t firstIp,
int64_t parentDocId ,
//int64_t parentDocId ,
int64_t uh48 ,
bool isDel ) ;
XmlDoc.cpp (88 changed lines)

@@ -1859,7 +1859,10 @@ bool XmlDoc::set2 ( char *titleRec ,
//m_hasContactInfoValid = true;

// sanity check. if m_siteValid is true, this must be there
if ( ! ptr_site ) { char *xx=NULL;*xx=0; }
if ( ! ptr_site ) {
log("set4: ptr_site is null for docid %"INT64"",m_docId);
//char *xx=NULL;*xx=0; }
}

// lookup the tagdb rec fresh if setting for a summary. that way we
// can see if it is banned or not
@@ -2534,16 +2537,50 @@ bool XmlDoc::indexDoc ( ) {
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
// sanity log
if ( *fip == 0 || *fip == -1 ) {
//
// now add a spider status doc for this so we know
// why a crawl might have failed to start
//
SafeBuf *ssDocMetaList = NULL;
// save this
int32_t saved = m_indexCode;
// and make it the real reason for the spider status doc
m_indexCode = EDNSERROR;
// get the spiderreply ready to be added

ssDocMetaList = getSpiderStatusDocMetaList(NULL ,false);//del
// revert
m_indexCode = saved;
// error?
if ( ! ssDocMetaList ) return true;
// blocked?
if ( ssDocMetaList == (void *)-1 ) return false;
// need to alloc space for it too
char *list = ssDocMetaList->getBufStart();
int32_t len = ssDocMetaList->length();
//needx += len;
// this too
m_addedStatusDocSize = len;
m_addedStatusDocSizeValid = true;

char *url = "unknown";
if ( m_sreqValid ) url = m_sreq.m_url;
log("build: error2 getting real firstip of %"INT32" for "
"%s. Not adding new spider req", (int32_t)*fip,url);
// also count it as a crawl attempt
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;

if ( ! m_metaList2.safeMemcpy ( list , len ) )
return true;

goto skipNewAdd1;
}
// store the new request (store reply for this below)
char rd = RDB_SPIDERDB;
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
m_metaList2.pushChar(rd);
if ( ! m_metaList2.pushChar(rd) )
return true;
// store it here
SpiderRequest revisedReq;
// this fills it in
@@ -23044,7 +23081,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
srep.m_domHash32 = m_sreq.m_domHash32;
srep.m_spideredTime = getTimeGlobal();
int64_t uh48 = m_sreq.getUrlHash48();
int64_t parentDocId = 0LL;
//int64_t parentDocId = 0LL;
srep.m_contentHash32 = 0;
// were we already in titledb before we started spidering?
// yes otherwise we would have called "goto skip9" above

@@ -23054,7 +23091,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
srep.m_isIndexedINValid = false;
srep.m_errCode = EREINDEXREDIR; // indexCode
srep.m_downloadEndTime = 0;
srep.setKey ( srep.m_firstIp, parentDocId , uh48 , false );
srep.setKey ( srep.m_firstIp, /*parentDocId ,*/uh48 , false );
// lock of request needs to match that of reply so the
// reply, when recevied by Rdb.cpp which calls addSpiderReply()
// can unlock this url so it can be spidered again.
@@ -25945,7 +25982,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
log("xmldoc: uh48=%"UINT64" parentdocid=%"UINT64"",uh48,parentDocId);

// set the key, m_srep.m_key
m_srep.setKey ( firstIp, parentDocId , uh48 , false );
m_srep.setKey ( firstIp, /*parentDocId ,*/ uh48 , false );

// . did we download a page? even if indexcode is set we might have
// . if this is non-zero that means its valid
@@ -28372,7 +28409,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList ( SpiderReply *reply ,
// . TODO:
// usedProxy:1
// proxyIp:1.2.3.4
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {

setStatus ( "making spider reply meta list");
@@ -28393,8 +28430,8 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
int64_t *uqd = getAvailDocIdOnly ( d );
if ( ! uqd || uqd == (void *)-1 ) return (SafeBuf *)uqd;

unsigned char *hc = (unsigned char *)getHopCount();
if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;
// unsigned char *hc = (unsigned char *)getHopCount();
// if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;

int32_t tmpVal = -1;
int32_t *priority = &tmpVal;
@@ -28430,7 +28467,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }

// why isn't gbhopcount: being indexed consistently?
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }

// reset just in case
m_spiderStatusDocMetaList.reset();
@@ -28465,12 +28502,17 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
jd.safePrintf("\"gbssFinalRedirectUrl\":\"%s\",\n",
ptr_redirUrl);

if ( m_indexCodeValid ) {
jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);
jd.safePrintf("\"gbssStatusMsg\":\"");
jd.jsonEncode (mstrerror(m_indexCode));
jd.safePrintf("\",\n");
}
else {
jd.safePrintf("\"gbssStatusCode\":-1,\n");
jd.safePrintf("\"gbssStatusMsg\":\"???\",\n");
}

jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);

jd.safePrintf("\"gbssStatusMsg\":\"");
jd.jsonEncode (mstrerror(m_indexCode));
jd.safePrintf("\",\n");

if ( m_httpStatusValid )
jd.safePrintf("\"gbssHttpStatus\":%"INT32",\n",
@@ -28514,12 +28556,15 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
jd.safePrintf("\",\n");

//if ( m_redirUrlPtr && m_redirUrlValid )
jd.safePrintf("\"gbssNumRedirects\":%"INT32",\n",
m_numRedirects);
//if ( m_numRedirectsValid )
jd.safePrintf("\"gbssNumRedirects\":%"INT32",\n",m_numRedirects);

jd.safePrintf("\"gbssDocId\":%"INT64",\n", m_docId);//*uqd);
if ( m_docIdValid )
jd.safePrintf("\"gbssDocId\":%"INT64",\n", m_docId);//*uqd);

jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)*hc);
if ( m_hopCountValid )
//jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)*hc);
jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)m_hopCount);

// crawlbot round
if ( cr->m_isCustomCrawl )
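For reference, the spider status doc assembled above ends up containing JSON fields like the following; this is an illustrative, abbreviated fragment with made-up values, not output captured from this commit:

"gbssStatusCode":0,
"gbssStatusMsg":"Success",
"gbssHttpStatus":200,
"gbssNumRedirects":1,
"gbssDocId":123456789,
"gbssHopCount":2,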
@@ -28946,6 +28991,13 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
xd->ptr_site = ptr_site;
xd->size_site = size_site;

// if this is null then ip lookup failed i guess so just use
// the subdomain
if ( ! ptr_site && m_firstUrlValid ) {
xd->ptr_site = m_firstUrl.getHost();
xd->size_site = m_firstUrl.getHostLen();
}

// use the same uh48 of our parent
int64_t uh48 = m_firstUrl.getUrlHash48();
// then make into a titlerec but store in metalistbuf, not m_titleRec