Now we add the spider status docs as JSON documents.

So you can facet/sort by the various fields, etc.
Matt
2015-03-19 16:17:36 -06:00
parent d0be9f68a7
commit 90456222b6
11 changed files with 500 additions and 77 deletions
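
Once these spider status docs are indexed as JSON (the gbss* fields added below), they can be queried and faceted like any other JSON document. A few illustrative queries; the URL is made up, type:status and gbfieldmatch:gbssUrl: come straight from this commit, and the facet operators are assumed to be the standard gbfacetstr:/gbfacetint: ones:

    q=type:status gbfieldmatch:gbssUrl:http://www.example.com/
    q=type:status gbfacetstr:gbssStatusMsg
    q=type:status gbfacetint:gbssStatusCode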

@ -476,6 +476,8 @@ bool Collectiondb::addNewColl ( char *coll ,
if ( customCrawl ) {
// always index spider status docs now
cr->m_indexSpiderReplies = true;
// remember the token
cr->m_diffbotToken.set ( token );
cr->m_diffbotCrawlName.set ( crawl );

@ -4258,7 +4258,7 @@ bool getSpiderRequestMetaList ( char *doc ,
sreq.m_hostHash32 = url.getHostHash32();
sreq.m_domHash32 = url.getDomainHash32();
sreq.m_siteHash32 = url.getHostHash32();
sreq.m_probDocId = probDocId;
//sreq.m_probDocId = probDocId;
sreq.m_hopCount = 0; // we're a seed
sreq.m_hopCountValid = true;
sreq.m_addedTime = now;

@ -407,6 +407,10 @@ bool processLoop ( void *state ) {
if ( format == FORMAT_XML ) sb->reset();
if ( format == FORMAT_JSON ) sb->reset();
if ( xd->m_contentType == CT_JSON ) sb->reset();
if ( xd->m_contentType == CT_XML ) sb->reset();
if ( xd->m_contentType == CT_STATUS ) sb->reset();
// for undoing the stuff below
int32_t startLen2 = sb->length();//p;
@ -431,6 +435,9 @@ bool processLoop ( void *state ) {
if ( xd->m_contentType == CT_JSON )
printDisclaimer = false;
if ( xd->m_contentType == CT_STATUS )
printDisclaimer = false;
if ( format == FORMAT_XML ) printDisclaimer = false;
if ( format == FORMAT_JSON ) printDisclaimer = false;
@ -624,6 +631,8 @@ bool processLoop ( void *state ) {
includeHeader = false;
if ( xd->m_contentType == CT_XML )
includeHeader = false;
if ( xd->m_contentType == CT_STATUS )
includeHeader = false;
if ( format == FORMAT_XML ) includeHeader = false;
if ( format == FORMAT_JSON ) includeHeader = false;
@ -679,6 +688,7 @@ bool processLoop ( void *state ) {
// do not calc title or print it if doc is xml or json
if ( ctype == CT_XML ) sbend = sbstart;
if ( ctype == CT_JSON ) sbend = sbstart;
if ( ctype == CT_STATUS ) sbend = sbstart;
for ( char *t = sbstart ; t < sbend ; t++ ) {
// title tag?
@ -813,6 +823,8 @@ bool processLoop ( void *state ) {
// do not do term highlighting if json
if ( xd->m_contentType == CT_JSON )
queryHighlighting = false;
if ( xd->m_contentType == CT_STATUS )
queryHighlighting = false;
SafeBuf tmp;
SafeBuf *xb = sb;
@ -917,6 +929,9 @@ bool processLoop ( void *state ) {
if ( xd->m_contentType == CT_JSON )
contentType = "application/json";
if ( xd->m_contentType == CT_STATUS )
contentType = "application/json";
if ( xd->m_contentType == CT_XML )
contentType = "text/xml";

@ -449,7 +449,7 @@ bool Msg1c::gotList ( ) {
sr.m_urlIsDocId = 1;
sr.m_fakeFirstIp = 1;
// for msg12 locking
sr.m_probDocId = docId;
//sr.m_probDocId = docId;
// use test-parser not test-spider
//sr.m_useTestSpiderDir = 0;
sr.m_parentIsSiteMap = 0;

@ -4040,6 +4040,71 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
sb->safePrintf("\",\n");
}
// print spider status pages special
if ( mr->ptr_content &&
si->m_format == FORMAT_HTML &&
mr->m_contentType == CT_STATUS ) {
if ( *numPrintedSoFar )
sb->safePrintf("<br><hr><br>\n");
// skip to gbssurl
char *s = strstr ( mr->ptr_content,"\"gbssUrl\":");
if ( ! s ) goto badformat;
// then do two columns after the two urls
char *e = strstr ( s , "\"gbssStatusCode\":" );
if ( ! e ) goto badformat;
char *m = strstr ( e , "\"gbssNumOutlinksAdded\":");
if ( ! m ) goto badformat;
// exclude \0
char *end = mr->ptr_content + mr->size_content - 1;
// use a table with 2 columns
// so we can use \n to separate lines and don't have to add brs
// and boldify just the main url, not the redir url!
sb->safePrintf("<pre style=display:inline;>"
"\"gbssUrl\":\""
"<b style=color:blue;><a href=/get?d=%"INT64">"
, mr->m_docId
);
char *s2 = strstr ( s , "\"gbssFinalRedirectUrl\":");
char *bend = e - 3;
if ( s2 ) bend = s2 - 3;
sb->safeMemcpy ( s+11 , bend - (s+11));
sb->safePrintf("</a></b></pre>\",<br>");
// now print redir url if there
if ( s2 ) {
sb->safePrintf("<pre style=display:inline;>");
sb->safeMemcpy ( s2 , e-s2 );
sb->removeLastChar('\n');
sb->safePrintf("</pre>");
}
sb->safePrintf("<table border=0 cellpadding=0 cellspacing=0>"
"<tr><td>");
sb->safePrintf("<pre>");
//int32_t off = sb->length();
sb->safeMemcpy ( e , m - e );
sb->safePrintf("</pre>");
sb->safePrintf("</td><td>");
sb->safePrintf("<pre>");
sb->safeMemcpy ( m , end - m );
// remove last \n
sb->removeLastChar('\n');
sb->removeLastChar('}');
sb->removeLastChar('\n');
sb->safePrintf("</pre>\n");
sb->safePrintf("</td></tr></table>");
// replace \n with <br>
// sb->safeReplace2 ( "\n" , 1 ,
// "<br>" , 4 ,
// 0,//niceness ,
// off );
// inc it
*numPrintedSoFar = *numPrintedSoFar + 1;
// just in case
sb->nullTerm();
return true;
}
badformat:
Highlight hi;
// get the url
@ -4373,7 +4438,6 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
//
///////
// the a href tag
if ( si->m_format == FORMAT_HTML ) {
sb->safePrintf ( "<a href=" );
@ -5196,7 +5260,8 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
sb->safePrintf (" - "
"<a style=color:blue; "
"href=\"/search?sb=1&c=%s&"
"q=url2%%3A"
//"q=url2%%3A"
"q=gbfieldmatch%%3AgbssUrl%%3A"
, coll
);
sb->urlEncode ( url , gbstrlen(url) , false );
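
The spider status block above depends on the field order produced by XmlDoc::getSpiderStatusDocMetaList2() later in this commit: "gbssUrl" first, an optional "gbssFinalRedirectUrl", then "gbssStatusCode" through the fields before "gbssNumOutlinksAdded" in the first table column, and "gbssNumOutlinksAdded" onward in the second. A truncated sample of such a status doc, with made-up values:

    {
    "type":"status",
    "gbssUrl":"http://www.example.com/page.html",
    "gbssFinalRedirectUrl":"http://www.example.com/index.html",
    "gbssStatusCode":0,
    "gbssStatusMsg":"Success",
    "gbssHttpStatus":200,
    "gbssDocId":123456789,
    ...
    "gbssNumOutlinksAdded":14,
    "gbssConsecutiveErrors":0,
    ...
    }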

@ -2324,11 +2324,13 @@ bool Rdb::addRecord ( collnum_t collnum,
SpiderRequest *sreq = (SpiderRequest *)data;
logf(LOG_DEBUG,"spider: added doledb key "
"for pri=%"INT32" time=%"UINT32" "
"uh48=%"UINT64" docid=%"INT64" u=%s",
"uh48=%"UINT64" "
//"docid=%"INT64" "
"u=%s",
(int32_t)g_doledb.getPriority(&doleKey),
(uint32_t)g_doledb.getSpiderTime(&doleKey),
g_doledb.getUrlHash48(&doleKey),
sreq->m_probDocId,
//sreq->m_probDocId,
sreq->m_url);
}
}

@ -1802,6 +1802,9 @@ void SpiderColl::clearLocks ( ) {
void SpiderColl::reset ( ) {
m_numSuccessReplies = 0;
m_numFailedReplies = 0;
// reset these for SpiderLoop;
m_nextDoledbKey.setMin();
//m_didRound = false;
@ -3973,8 +3976,20 @@ bool SpiderColl::scanListForWinners ( ) {
}
// if it's a SpiderReply, set it for upcoming requests
if ( ! g_spiderdb.isSpiderRequest ( (key128_t *)rec ) ) {
// see if this is the most recent one
SpiderReply *tmp = (SpiderReply *)rec;
// reset reply stats if beginning a new url
if ( srepUh48 != tmp->getUrlHash48() ) {
m_numSuccessReplies = 0;
m_numFailedReplies = 0;
}
// inc stats
if ( tmp->m_errCode == 0 ) m_numSuccessReplies++;
else m_numFailedReplies ++;
// if we have a more recent reply already, skip this
if ( srep &&
srep->getUrlHash48() == tmp->getUrlHash48() &&
@ -3994,6 +4009,12 @@ bool SpiderColl::scanListForWinners ( ) {
int64_t uh48 = sreq->getUrlHash48();
// reset reply stats if beginning a new url
if ( ! srep ) {
m_numSuccessReplies = 0;
m_numFailedReplies = 0;
}
// . skip if our twin should add it to doledb
// . waiting tree only has firstIps assigned to us so
// this should not be necessary
@ -4032,21 +4053,27 @@ bool SpiderColl::scanListForWinners ( ) {
! sreq->m_fakeFirstIp )
m_totalNewSpiderRequests++;
// reset page inlink count on url request change
if ( m_lastSreqUh48 != uh48 )
m_pageNumInlinks = 1;
//int32_t ipdom ( int32_t ip ) { return ip & 0x00ffffff; };
int32_t cblock = ipdom ( sreq->m_firstIp );
bool countIt = true;
if ( uh48 != m_lastSreqUh48 )
countIt = false;
// reset page inlink count on url request change
if ( m_lastSreqUh48 != uh48 ) {
m_pageNumInlinks = 0;
m_lastCBlockIp = 0;
}
//if ( uh48 != m_lastSreqUh48 )
// countIt = false;
if ( cblock == m_lastCBlockIp )
countIt = false;
// do not count manually added spider requests
if ( (sreq->m_isAddUrl || sreq->m_isInjecting) )
countIt = false;
// 20 is good enough
if ( m_pageNumInlinks >= 20 )
countIt = false;
@ -4069,6 +4096,12 @@ bool SpiderColl::scanListForWinners ( ) {
// set this now. it does increase with each request. so
// initial requests will not see the full # of inlinks.
sreq->m_pageNumInlinks = (uint8_t)m_pageNumInlinks;
// put these in the SpiderRequest in doledb so we can
// show them in the json spider status docs in
// XmlDoc::getSpiderStatusDocMetaList2()
sreq->m_reservedc1 = m_numSuccessReplies;
sreq->m_reservedc2 = m_numFailedReplies;
m_lastSreqUh48 = uh48;
m_lastCBlockIp = cblock;
@ -11032,21 +11065,6 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
}
if ( strncmp(p,"numinlinks",10) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
if ( (bool)sreq->m_pageNumInlinks == val) continue;
// skip
p += 10;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( *p=='h' && strncmp(p,"hasauthorityinlink",18) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
@ -12308,12 +12326,37 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}
if ( *p == 'n' && strncmp(p,"numinlinks",10) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// these are -1 if they are NOT valid
int32_t a = sreq->m_pageNumInlinks;
// the value to compare against
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
// skip fast
p += 10;
p = strstr(s, "&&");
//if no more constraints then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// siteNumInlinks >= 300 [&&]
if ( *p=='s' && strncmp(p, "sitenuminlinks", 14) == 0){
// these are -1 if they are NOT valid
int32_t a1 = sreq->m_siteNumInlinks;
// only assign if valid
int32_t a2 = -1; if ( srep ) a2 = srep->m_siteNumInlinks;
int32_t a2 = -1;
if ( srep ) a2 = srep->m_siteNumInlinks;
// assume a1 is the best
int32_t a ;
// assign to the first valid one
@ -13991,7 +14034,7 @@ bool SpiderRequest::setFromAddUrl ( char *url ) {
m_isAddUrl = 1;
m_addedTime = (uint32_t)getTimeGlobal();//now;
m_fakeFirstIp = 1;
m_probDocId = probDocId;
//m_probDocId = probDocId;
m_firstIp = firstIp;
m_hopCount = 0;
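
The numinlinks rule added above in getUrlFilterNum2() now parses a sign and a numeric value and compares it against SpiderRequest::m_pageNumInlinks, replacing the old boolean check that was removed. A url filter expression along these lines should therefore match the new parser (illustrative only; hasauthorityinlink is simply another keyword handled in the same function):

    numinlinks>=10 && hasauthorityinlink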

@ -524,7 +524,9 @@ class SpiderRequest {
// the PROBABLE DOCID. if there is a collision with another docid
// then we increment the last 8 bits or so. see Msg22.cpp.
int64_t m_probDocId;
//int64_t m_probDocId;
int32_t m_reservedc1;
int32_t m_reservedc2;
//int32_t m_parentPubDate;
@ -1153,6 +1155,9 @@ class SpiderColl {
int32_t m_tailHopCount;
int64_t m_minFutureTimeMS;
int32_t m_numSuccessReplies;
int32_t m_numFailedReplies;
// . do not re-send CrawlInfoLocal for a coll if not update
// . we store the flags in here as true if we should send our
// CrawlInfoLocal for this coll to this hostId

@ -932,7 +932,7 @@ bool Test::injectLoop ( ) {
m_sreq.m_domHash32 = fakeIp;
m_sreq.m_hostHash32 = fakeIp;
m_sreq.m_siteHash32 = fakeIp;
m_sreq.m_probDocId = g_titledb.getProbableDocId( m_sreq.m_url );
//m_sreq.m_probDocId = g_titledb.getProbableDocId( m_sreq.m_url );
// this crap is fake
m_sreq.m_isInjecting = 1;
// use test-spider subdir for storing pages and spider times?

@ -189,6 +189,10 @@ static int64_t s_lastTimeStart = 0LL;
void XmlDoc::reset ( ) {
m_ipStartTime = 0;
m_ipEndTime = 0;
m_diffbotReplyRetries = 0;
m_isImporting = false;
m_printedMenu = false;
@ -13106,6 +13110,10 @@ int32_t *XmlDoc::getIp ( ) {
// update status msg
setStatus ( "getting ip" );
m_ipStartTime = 0;
// assume the same in case we get it right away
m_ipEndTime = 0;
// if set from docid and recycling
if ( m_recycleContent ) {
// get the old xml doc from the old title rec
@ -13214,6 +13222,8 @@ int32_t *XmlDoc::getIp ( ) {
// update status msg
setStatus ( "getting ip" );
m_ipStartTime = gettimeofdayInMillisecondsGlobal();
// assume valid! if reply handler gets g_errno set then m_masterLoop
// should see that and call the final callback
//m_ipValid = true;
@ -13232,6 +13242,9 @@ int32_t *XmlDoc::getIp ( ) {
void gotIpWrapper ( void *state , int32_t ip ) {
// point to us
XmlDoc *THIS = (XmlDoc *)state;
THIS->m_ipEndTime = gettimeofdayInMillisecondsGlobal();
// wrap it up
THIS->gotIp ( true );
// . call the master callback
@ -14307,11 +14320,13 @@ void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
// m_diffbotReplyValid to true, below.
THIS->m_diffbotReplyError = 0;
log("build: retrying diffbot reply");
THIS->m_diffbotReplyRetries++;
// resume. this checks g_errno for being set.
THIS->m_masterLoop ( THIS->m_masterState );
return;
}
THIS->m_diffbotReplyEndTime = gettimeofdayInMillisecondsGlobal();
//char *buf = s->m_readBuf;
// do not allow TcpServer.cpp to free it since m_diffbotReply
@ -15454,6 +15469,8 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
"diffbot: getting %s headers=%s",m_diffbotUrl.getBufStart(),
additionalHeaders);
m_diffbotReplyStartTime = gettimeofdayInMillisecondsGlobal();
if ( ! g_httpServer.getDoc ( m_diffbotUrl.getBufStart() ,
0 , // ip
0 , // offset
@ -15930,6 +15947,8 @@ char **XmlDoc::getHttpReply2 ( ) {
char *xx=NULL;*xx=0;
}
m_downloadStartTimeValid = true;
m_downloadStartTime = gettimeofdayInMillisecondsGlobal();
if ( ! m_msg13.getDoc ( r , isTestColl,this , gotHttpReplyWrapper ) )
// return -1 if blocked
@ -20092,6 +20111,10 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
sb->safePrintf("siterank=%"INT32" ", sr );
}
if ( m_sreqValid )
sb->safePrintf("pageinlinks=%04"INT32" ",
m_sreq.m_pageNumInlinks);
// shortcut
int64_t uh48 = hash64b ( m_firstUrl.m_url );
// mask it
@ -25494,7 +25517,7 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
// now we need this so we can share Msg12 spider locks with
// query reindex docid-based spider requests. that way
// we do not spider the same document at the same time.
ksr.m_probDocId = g_titledb.getProbableDocId(&url);
//ksr.m_probDocId = g_titledb.getProbableDocId(&url);
//ksr.m_pageNumInlinks = 0;
@ -27046,7 +27069,10 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList ( SpiderReply *reply ) {
return mbuf;
}
// the spider status doc
// . the spider status doc
// . TODO:
// usedProxy:1
// proxyIp:1.2.3.4
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
setStatus ( "making spider reply meta list");
@ -27070,6 +27096,21 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
unsigned char *hc = (unsigned char *)getHopCount();
if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;
int32_t *priority = getSpiderPriority();
if ( ! priority || priority == (void *)-1 ) return (SafeBuf *)priority;
int32_t *ufn = getUrlFilterNum();
if ( ! ufn || ufn == (void *)-1 ) return (SafeBuf *)ufn;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// sanity
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }
// why isn't gbhopcount: being indexed consistently?
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
// reset just in case
m_spiderStatusDocMetaList.reset();
@ -27082,30 +27123,230 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
return &m_spiderStatusDocMetaList;
}
// the old doc
XmlDoc *od = NULL;
if ( m_oldDocValid && m_oldDoc ) od = m_oldDoc;
Url *fu = &m_firstUrl;
// . make a little json doc that we'll hash up
// . only index the fields in this doc, no extra gbdocid: inurl:
// hash terms
SafeBuf jd;
jd.safePrintf("{\n");
// so type:status query works
jd.safePrintf("\"type\":\"status\",\n");
jd.safePrintf("\"gbssUrl\":\"%s\",\n" , fu->getUrl() );
if ( ptr_redirUrl )
jd.safePrintf("\"gbssFinalRedirectUrl\":\"%s\",\n",
ptr_redirUrl);
jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);
jd.safePrintf("\"gbssStatusMsg\":\"");
jd.jsonEncode (mstrerror(m_indexCode));
jd.safePrintf("\",\n");
if ( m_httpStatusValid )
jd.safePrintf("\"gbssHttpStatus\":%"INT32",\n",
(int32_t)m_httpStatus);
if ( od )
jd.safePrintf("\"gbssPreviouslyIndexed\":1,\n");
else
jd.safePrintf("\"gbssPreviouslyIndexed\":0,\n");
jd.safePrintf("\"gbssDomain\":\"");
jd.safeMemcpy(fu->getDomain(), fu->getDomainLen() );
jd.safePrintf("\",\n");
jd.safePrintf("\"gbssSubdomain\":\"");
jd.safeMemcpy(fu->getHost(), fu->getHostLen() );
jd.safePrintf("\",\n");
//if ( m_redirUrlPtr && m_redirUrlValid )
jd.safePrintf("\"gbssNumRedirects\":%"INT32",\n",
m_numRedirects);
jd.safePrintf("\"gbssDocId\":%"INT64",\n", *uqd);
jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)*hc);
// crawlbot round
if ( cr->m_isCustomCrawl )
jd.safePrintf("\"gbssCrawlRound\":%"INT32",\n",
cr->m_spiderRoundNum);
if ( m_isDupValid && m_isDup )
jd.safePrintf("\"gbssDupOfDocId\":%"INT64",\n",
m_docIdWeAreADupOf);
// how many spiderings were successful vs. failed
if ( m_sreqValid ) {
jd.safePrintf("\"gbssPrevTotalNumSpiderAttempts\":%"INT32",\n",
m_sreq.m_reservedc1 + m_sreq.m_reservedc2 );
jd.safePrintf("\"gbssPrevTotalNumSpiderSuccesses\":%"INT32",\n",
m_sreq.m_reservedc1);
jd.safePrintf("\"gbssPrevTotalNumSpiderFailures\":%"INT32",\n",
m_sreq.m_reservedc2);
}
if ( m_firstIndexedDateValid )
jd.safePrintf("\"gbssFirstIndexed\":%"UINT32",\n",
m_firstIndexedDate);
if ( m_contentHash32Valid )
jd.safePrintf("\"gbssContentHash32\":%"UINT32",\n",
m_contentHash32);
if ( m_downloadStartTimeValid ) {
jd.safePrintf("\"gbssDownloadStartTimeMS\":%"INT64",\n",
m_downloadStartTime);
jd.safePrintf("\"gbssDownloadStartTime\":%"UINT32",\n",
(uint32_t)(m_downloadStartTime/1000));
}
if ( m_downloadEndTimeValid ) {
jd.safePrintf("\"gbssDownloadEndTimeMS\":%"INT64",\n",
m_downloadEndTime);
jd.safePrintf("\"gbssDownloadEndTime\":%"UINT32",\n",
(uint32_t)(m_downloadEndTime/1000));
}
if ( m_downloadEndTimeValid ) {
int64_t took = m_downloadEndTime - m_downloadStartTime;
jd.safePrintf("\"gbssDownloadDurationMS\":%"INT64",\n",took);
}
jd.safePrintf("\"gbssUsedRobotsTxt\":%"INT32",\n",
m_useRobotsTxt);
//if ( m_numOutlinksAddedValid )
jd.safePrintf("\"gbssNumOutlinksAdded\":%"INT32",\n",
(int32_t)m_numOutlinksAdded);
// how many download/indexing errors we've had, including this one
// if applicable.
jd.safePrintf("\"gbssConsecutiveErrors\":%"INT32",\n",
m_srep.m_errCount);
if ( od )
jd.safePrintf("\"gbssLastSuccessfulDownloadEndTime\":"
"%"UINT32",\n",od->m_spideredTime);
else
jd.safePrintf("\"gbssLastSuccessfulDownloadEndTime\":"
"%"UINT32",\n",0);
if ( m_ipValid )
jd.safePrintf("\"gbssIp\":\"%s\",\n",iptoa(m_ip));
else
jd.safePrintf("\"gbssIp\":\"0.0.0.0\",\n");
if ( m_ipEndTime ) {
int64_t took = m_ipEndTime - m_ipStartTime;
jd.safePrintf("\"gbssIpLookupTimeMS\":%"INT64",\n",took);
}
if ( m_siteNumInlinksValid ) {
jd.safePrintf("\"gbssSiteNumInlinks\":%"INT32",\n",
(int32_t)m_siteNumInlinks);
char siteRank = getSiteRank();
jd.safePrintf("\"gbssSiteRank\":%"INT32",\n",
(int32_t)siteRank);
}
jd.safePrintf("\"gbssContentInjected\":%"INT32",\n",
(int32_t)m_contentInjected);
if ( m_percentChangedValid && od )
jd.safePrintf("\"gbssPercentContentChanged\""
":\"%.01f%%\",\n",
m_percentChanged);
jd.safePrintf("\"gbssSpiderPriority\":%"INT32",\n",
*priority);
jd.safePrintf("\"gbssMatchingUrlFilter\":\"%s\",\n",
cr->m_regExs[*ufn].getBufStart());
if ( m_langIdValid )
jd.safePrintf("\"gbssLanguage\":\"%s\",\n",
getLangAbbr(m_langId));
if ( m_contentTypeValid )
jd.safePrintf("\"gbssContentType\":\"%s\",\n",
g_contentTypeStrings[m_contentType]);
if ( m_contentValid )
jd.safePrintf("\"gbssContentLen\":%"INT32",\n",
m_contentLen);
if ( m_crawlDelayValid )
// -1 if none?
jd.safePrintf("\"gbssCrawlDelayMS\":%"INT32",\n",
(int32_t)m_crawlDelay);
// sent to diffbot?
jd.safePrintf("\"gbssSentToDiffbot\":%i,\n",
(int)m_sentToDiffbot);
if ( m_diffbotReplyValid ) {
jd.safePrintf("\"gbssDiffbotReplyCode\":%"INT32",\n",
m_diffbotReplyError);
jd.safePrintf("\"gbssDiffbotReplyMsg\":\"");
jd.jsonEncode(mstrerror(m_diffbotReplyError));
jd.safePrintf("\",\n");
jd.safePrintf("\"gbssDiffbotReplyLen\":%"INT32",\n",
m_diffbotReply.length());
int64_t took = m_diffbotReplyEndTime - m_diffbotReplyStartTime;
jd.safePrintf("\"gbssDiffbotReplyResponseTimeMS\":%"INT64",\n",
took );
jd.safePrintf("\"gbssDiffbotReplyRetries\":%"INT32",\n",
m_diffbotReplyRetries );
jd.safePrintf("\"gbssDiffbotReplyNumObjects\":%"INT32",\n",
m_diffbotJSONCount);
}
// remove last ,\n
jd.incrementLength(-2);
// end the json spider status doc
jd.safePrintf("}\n");
// the posdb table
HashTableX tt4;
if ( !tt4.set(18,4,256,NULL,0,false,m_niceness,"posdb-spindx"))
return NULL;
Json jp;
if ( ! jp.parseJsonStringIntoJsonItems ( jd.getBufStart(),m_niceness)){
g_errno = EBADJSONPARSER;
return NULL;
}
// BEFORE ANY HASHING
int32_t savedDist = m_dist;
// re-set to 0
m_dist = 0;
// sanity
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }
// why isn't gbhopcount: being indexed consistently?
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
// hash like gbstatus:"Tcp Timed out" or gbstatus:"Doc unchanged"
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = &tt4;
hi.m_desc = "json spider status object";
hi.m_useCountTable = false;
hi.m_useSections = false;
// fill up tt4. false -> do not hash without field prefixes.
hashJSONFields2 ( &tt4 , &hi , &jp , false );
/*
char buf[64];
int32_t bufLen;
@ -27120,6 +27361,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
hi.m_desc = "spider error number as string";
bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)m_indexCode );
if ( ! hashString( buf , &hi ) ) return NULL;
*/
/*
logf(LOG_DEBUG,"url: %s",m_firstUrl.m_url);
@ -27174,6 +27416,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
// was here....
*/
/*
// gbstatus:"tcp timed out"
hi.m_prefix = "gbstatusmsg";
hi.m_desc = "spider error msg";
@ -27191,6 +27434,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
// false --> do not hash the gbdoc* terms (CT_STATUS)
hashDateNumbers ( &tt4 , true );
*/
// store keys in safebuf then to make our own meta list
addTable144 ( &tt4 , *uqd , &m_spiderStatusDocMetaList );
@ -27230,6 +27474,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
int32_t fullsize = &m_dummyEnd - (char *)this;
if ( fullsize > 2048 ) { char *xx=NULL;*xx=0; }
/*
// the ptr_* were all zero'd out, put the ones we want to keep back in
SafeBuf tmp;
// was "Spider Status: %s" but that is unnecessary
@ -27242,6 +27487,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
if ( m_redirUrlPtr && m_redirUrlValid )
tmp.safePrintf("Redirected to %s<br>",m_redirUrlPtr->getUrl());
*/
// put stats like we log out from logIt
//tmp.safePrintf("<div style=max-width:800px;>\n");
@ -27250,8 +27496,10 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
//tmp.safePrintf("\n</div>");
// the content is now the json spider status doc itself
xd->ptr_utf8Content = tmp.getBufStart();
xd->size_utf8Content = tmp.length()+1;
// xd->ptr_utf8Content = tmp.getBufStart();
// xd->size_utf8Content = tmp.length()+1;
xd->ptr_utf8Content = jd.getBufStart();
xd->size_utf8Content = jd.length()+1;
// keep the same url as the doc we are the spider reply for
xd->ptr_firstUrl = ptr_firstUrl;
@ -27423,7 +27671,7 @@ int32_t XmlDoc::getIndexedTime() {
// . hash dates for sorting by using gbsortby: and gbrevsortby:
// . do 'gbsortby:gbspiderdate' as your query to see this in action
bool XmlDoc::hashDateNumbers ( HashTableX *tt , bool isStatusDoc ) {
bool XmlDoc::hashDateNumbers ( HashTableX *tt ) { // , bool isStatusDoc ) {
// stop if already set
if ( ! m_spideredTimeValid ) return true;
@ -27453,7 +27701,7 @@ bool XmlDoc::hashDateNumbers ( HashTableX *tt , bool isStatusDoc ) {
// do not index the rest if we are a "spider reply" document
// which is like a fake document for seeing spider statuses
//if ( isStatusDoc == CT_STATUS ) return true;
if ( isStatusDoc ) return true;
//if ( isStatusDoc ) return true;
// now for CT_STATUS spider status "documents" we also index
// gbspiderdate so index this so we can just do a
@ -27873,7 +28121,7 @@ bool XmlDoc::hashLinksForLinkdb ( HashTableX *dt ) {
// . returns false and sets g_errno on error
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
bool XmlDoc::hashUrl ( HashTableX *tt ) { // , bool isStatusDoc ) {
setStatus ( "hashing url colon" );
@ -27893,7 +28141,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
// append a "www." for doing url: searches
Url uw; uw.set ( fu->getUrl() , fu->getUrlLen() , true );
hi.m_prefix = "url";
if ( isStatusDoc ) hi.m_prefix = "url2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "url2";
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
return false;
@ -27908,7 +28157,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
char *s = fu->getUrl ();
int32_t slen = fu->getUrlLen();
hi.m_prefix = "inurl";
if ( isStatusDoc ) hi.m_prefix = "inurl2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "inurl2";
if ( ! hashString ( s,slen, &hi ) ) return false;
setStatus ( "hashing ip colon" );
@ -27923,7 +28173,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
//char *tmp = iptoa ( m_ip );
//int32_t tlen = gbstrlen(tmp);
hi.m_prefix = "ip";
if ( isStatusDoc ) hi.m_prefix = "ip2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "ip2";
if ( ! hashSingleTerm(ipbuf,iplen,&hi) ) return false;
//
@ -27993,7 +28244,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
int32_t blen = sprintf(buf,"%"INT32"",pathDepth);
// update parms
hi.m_prefix = "gbpathdepth";
if ( isStatusDoc ) hi.m_prefix = "gbpathdepth2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbpathdepth2";
hi.m_hashGroup = HASHGROUP_INTAG;
// hash gbpathdepth:X
if ( ! hashString ( buf,blen,&hi) ) return false;
@ -28008,7 +28260,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
blen = sprintf(buf,"%"INT32"",(int32_t)m_hopCount);
// update parms
hi.m_prefix = "gbhopcount";
if ( isStatusDoc ) hi.m_prefix = "gbhopcount2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbhopcount2";
hi.m_hashGroup = HASHGROUP_INTAG;
// hash gbpathdepth:X
if ( ! hashString ( buf,blen,&hi) ) return false;
@ -28025,7 +28278,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
else hm = "0";
// update parms
hi.m_prefix = "gbhasfilename";
if ( isStatusDoc ) hi.m_prefix = "gbhasfilename2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbhasfilename2";
// hash gbhasfilename:[0|1]
if ( ! hashString ( hm,1,&hi) ) return false;
@ -28037,7 +28291,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
if ( fu->isCgi() ) hm = "1";
else hm = "0";
hi.m_prefix = "gbiscgi";
if ( isStatusDoc ) hi.m_prefix = "gbiscgi2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbiscgi2";
if ( ! hashString ( hm,1,&hi) ) return false;
@ -28051,7 +28306,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
if ( fu->getExtensionLen() ) hm = "1";
else hm = "0";
hi.m_prefix = "gbhasext";
if ( isStatusDoc ) hi.m_prefix = "gbhasext2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbhasext2";
if ( ! hashString ( hm,1,&hi) ) return false;
//
@ -28096,7 +28352,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
*p = '\0';
// update hash parms
hi.m_prefix = "site";
if ( isStatusDoc ) hi.m_prefix = "site2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "site2";
hi.m_hashGroup = HASHGROUP_INURL;
// this returns false on failure
if ( ! hashSingleTerm (buf,p-buf,&hi ) ) return false;
@ -28120,13 +28377,15 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
int32_t elen = fu->getExtensionLen();
// update hash parms
hi.m_prefix = "ext";
if ( isStatusDoc ) hi.m_prefix = "ext2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "ext2";
if ( ! hashSingleTerm(ext,elen,&hi ) ) return false;
setStatus ( "hashing gbdocid" );
hi.m_prefix = "gbdocid";
if ( isStatusDoc ) hi.m_prefix = "gbdocid2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbdocid2";
char buf2[32];
sprintf(buf2,"%"UINT64"",(m_docId) );
if ( ! hashSingleTerm(buf2,gbstrlen(buf2),&hi) ) return false;
@ -28146,12 +28405,13 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
// append a "www." as part of normalization
uw.set ( fu->getUrl() , p - fu->getUrl() , true );
hi.m_prefix = "gbparenturl";
if ( isStatusDoc ) hi.m_prefix = "gbparenturl2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbparenturl2";
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
return false;
}
if ( isStatusDoc ) return true;
//if ( isStatusDoc ) return true;
setStatus ( "hashing SiteGetter terms");
@ -30054,7 +30314,9 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
// if we had a facet, get the values it has in the doc
if ( qs && *qs ) {
// need this for storeFacetValues() if we are json
if ( m_contentType == CT_JSON ) {
if ( m_contentType == CT_JSON ||
// spider status docs are really json
m_contentType == CT_STATUS ) {
Json *jp = getParsedJson();
if ( ! jp || jp == (void *)-1)
return (Msg20Reply *)jp;
@ -30576,7 +30838,8 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
reply->size_gbAdIds = size_adVector;
// need full cached page of each search result?
if ( m_req->m_includeCachedCopy ) {
// include it always for spider status docs.
if ( m_req->m_includeCachedCopy || m_contentType == CT_STATUS ) {
reply-> ptr_content = ptr_utf8Content;
reply->size_content = size_utf8Content;
}
@ -49681,7 +49944,9 @@ Json *XmlDoc::getParsedJson ( ) {
if ( m_jpValid ) return &m_jp;
// core if not a json object
if ( m_contentTypeValid && m_contentType != CT_JSON ) {
if ( m_contentTypeValid && m_contentType != CT_JSON &&
// spider status docs are now really json
m_contentType != CT_STATUS ) {
char *xx=NULL;*xx=0; }
// \0 terminated
@ -49724,7 +49989,15 @@ char *XmlDoc::hashJSONFields ( HashTableX *table ) {
// use new json parser
Json *jp = getParsedJson();
if ( ! jp || jp == (void *)-1 ) return (char *)jp;
return hashJSONFields2 ( table , &hi , jp , true );
}
char *XmlDoc::hashJSONFields2 ( HashTableX *table ,
HashInfo *hi , Json *jp ,
bool hashWithoutFieldNames ) {
JsonItem *ji = jp->getFirstItem();
char nb[1024];
@ -49788,17 +50061,17 @@ char *XmlDoc::hashJSONFields ( HashTableX *table ) {
// DIFFBOT special field hacks
//
char *name = nameBuf.getBufStart();
hi.m_hashGroup = HASHGROUP_BODY;
hi->m_hashGroup = HASHGROUP_BODY;
if ( strstr(name,"title") )
hi.m_hashGroup = HASHGROUP_TITLE;
hi->m_hashGroup = HASHGROUP_TITLE;
if ( strstr(name,"url") )
hi.m_hashGroup = HASHGROUP_INURL;
hi->m_hashGroup = HASHGROUP_INURL;
if ( strstr(name,"resolved_url") )
hi.m_hashGroup = HASHGROUP_INURL;
hi->m_hashGroup = HASHGROUP_INURL;
if ( strstr(name,"tags") )
hi.m_hashGroup = HASHGROUP_INTAG;
hi->m_hashGroup = HASHGROUP_INTAG;
if ( strstr(name,"meta") )
hi.m_hashGroup = HASHGROUP_INMETATAG;
hi->m_hashGroup = HASHGROUP_INMETATAG;
//
// now Json.cpp decodes and stores the value into
// a buffer, so ji->getValue() should be decoded completely
@ -49845,7 +50118,7 @@ char *XmlDoc::hashJSONFields ( HashTableX *table ) {
// set EDOCUNCHANGED in ::getIndexCode() above.
//
/*
if ( hi.m_hashGroup != HASHGROUP_INURL ) {
if ( hi->m_hashGroup != HASHGROUP_INURL ) {
// make the content hash so we can set m_contentHash32
// for deduping
int32_t nh32 = hash32n ( name );
@ -49858,28 +50131,31 @@ char *XmlDoc::hashJSONFields ( HashTableX *table ) {
*/
// index like "title:whatever"
hi.m_prefix = name;
hashString ( val , vlen , &hi );
hi->m_prefix = name;
hashString ( val , vlen , hi );
// hash gbfieldmatch:some.fieldInJson:"case-sens field Value"
if ( name )
hashFieldMatchTerm ( val , (int32_t)vlen , &hi );
hashFieldMatchTerm ( val , (int32_t)vlen , hi );
if ( ! hashWithoutFieldNames )
continue;
// hash without the field name as well
hi.m_prefix = NULL;
hashString ( val , vlen , &hi );
hi->m_prefix = NULL;
hashString ( val , vlen , hi );
/*
// a number? hash special then as well
if ( ji->m_type != JT_NUMBER ) continue;
// use prefix for this though
hi.m_prefix = name;
hi->m_prefix = name;
// hash as a number so we can sort search results by
// this number and do range constraints
float f = ji->m_valueDouble;
if ( ! hashNumber2 ( f , &hi ) )
if ( ! hashNumber2 ( f , hi ) )
return NULL;
*/
}
@ -49986,7 +50262,8 @@ bool XmlDoc::storeFacetValues ( char *qs , SafeBuf *sb , FacetValHash_t fvh ) {
return storeFacetValuesSections ( qs , sb , fvh );
// if a json doc, get json field
if ( m_contentType == CT_JSON )
// spider status docs are really json now
if ( m_contentType == CT_JSON || m_contentType == CT_STATUS )
return storeFacetValuesJSON ( qs , sb , fvh );
if ( m_contentType == CT_HTML )

@ -735,6 +735,16 @@ class XmlDoc {
char *getDiffbotParentUrl( char *myUrl );
int64_t m_diffbotReplyEndTime;
int64_t m_diffbotReplyStartTime;
int32_t m_diffbotReplyRetries;
uint64_t m_downloadStartTime;
//uint64_t m_downloadEndTime;
uint64_t m_ipStartTime;
uint64_t m_ipEndTime;
void copyFromOldDoc ( class XmlDoc *od ) ;
class SpiderReply *getFakeSpiderReply ( );
@ -786,8 +796,8 @@ class XmlDoc {
bool hashContentType ( class HashTableX *table ) ;
bool hashDMOZCategories ( class HashTableX *table ) ;
bool hashLinks ( class HashTableX *table ) ;
bool hashUrl ( class HashTableX *table , bool isStatusDoc = false ) ;
bool hashDateNumbers ( class HashTableX *tt , bool isStatusDoc=false) ;
bool hashUrl ( class HashTableX *table );
bool hashDateNumbers ( class HashTableX *tt );
bool hashSections ( class HashTableX *table ) ;
bool hashIncomingLinkText ( class HashTableX *table ,
bool hashAnomalies ,
@ -1149,6 +1159,7 @@ class XmlDoc {
char m_addedSpiderRequestSizeValid;
char m_addedSpiderReplySizeValid;
char m_addedStatusDocSizeValid;
char m_downloadStartTimeValid;
//char m_docQualityValid;
char m_siteValid;
char m_startTimeValid;
@ -1716,6 +1727,9 @@ class XmlDoc {
bool doesPageContentMatchDiffbotProcessPattern() ;
int32_t *getDiffbotTitleHashes ( int32_t *numHashes ) ;
char *hashJSONFields ( HashTableX *table );
char *hashJSONFields2 ( HashTableX *table , HashInfo *hi , Json *jp ,
bool hashWithoutFieldNames ) ;
char *hashXMLFields ( HashTableX *table );
int32_t *reindexJSONObjects ( int32_t *newTitleHashes ,
int32_t numNewHashes ) ;