Now we add the spider status docs as JSON documents.

So you can facet/sort by the various fields, etc.
Matt
2015-03-19 16:17:36 -06:00
parent d0be9f68a7
commit 90456222b6
11 changed files with 500 additions and 77 deletions
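
Once these spider status docs are indexed as JSON (the gbss* fields added below), they can be queried and faceted like any other JSON document. A few illustrative queries; the URL is made up, type:status and gbfieldmatch:gbssUrl: come straight from this commit, and the facet operators are assumed to be the standard gbfacetstr:/gbfacetint: ones:

    q=type:status gbfieldmatch:gbssUrl:http://www.example.com/
    q=type:status gbfacetstr:gbssStatusMsg
    q=type:status gbfacetint:gbssStatusCode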

@ -476,6 +476,8 @@ bool Collectiondb::addNewColl ( char *coll ,
if ( customCrawl ) {
// always index spider status docs now
cr->m_indexSpiderReplies = true;
// remember the token
cr->m_diffbotToken.set ( token );
cr->m_diffbotCrawlName.set ( crawl );

@ -4258,7 +4258,7 @@ bool getSpiderRequestMetaList ( char *doc ,
sreq.m_hostHash32 = url.getHostHash32();
sreq.m_domHash32 = url.getDomainHash32();
sreq.m_siteHash32 = url.getHostHash32();
sreq.m_probDocId = probDocId;
//sreq.m_probDocId = probDocId;
sreq.m_hopCount = 0; // we're a seed
sreq.m_hopCountValid = true;
sreq.m_addedTime = now;

@ -407,6 +407,10 @@ bool processLoop ( void *state ) {
if ( format == FORMAT_XML ) sb->reset();
if ( format == FORMAT_JSON ) sb->reset();
if ( xd->m_contentType == CT_JSON ) sb->reset();
if ( xd->m_contentType == CT_XML ) sb->reset();
if ( xd->m_contentType == CT_STATUS ) sb->reset();
// for undoing the stuff below
int32_t startLen2 = sb->length();//p;
@ -431,6 +435,9 @@ bool processLoop ( void *state ) {
if ( xd->m_contentType == CT_JSON )
printDisclaimer = false;
if ( xd->m_contentType == CT_STATUS )
printDisclaimer = false;
if ( format == FORMAT_XML ) printDisclaimer = false;
if ( format == FORMAT_JSON ) printDisclaimer = false;
@ -624,6 +631,8 @@ bool processLoop ( void *state ) {
includeHeader = false;
if ( xd->m_contentType == CT_XML )
includeHeader = false;
if ( xd->m_contentType == CT_STATUS )
includeHeader = false;
if ( format == FORMAT_XML ) includeHeader = false;
if ( format == FORMAT_JSON ) includeHeader = false;
@ -679,6 +688,7 @@ bool processLoop ( void *state ) {
// do not calc title or print it if doc is xml or json
if ( ctype == CT_XML ) sbend = sbstart;
if ( ctype == CT_JSON ) sbend = sbstart;
if ( ctype == CT_STATUS ) sbend = sbstart;
for ( char *t = sbstart ; t < sbend ; t++ ) {
// title tag?
@ -813,6 +823,8 @@ bool processLoop ( void *state ) {
// do not do term highlighting if json
if ( xd->m_contentType == CT_JSON )
queryHighlighting = false;
if ( xd->m_contentType == CT_STATUS )
queryHighlighting = false;
SafeBuf tmp;
SafeBuf *xb = sb;
@ -917,6 +929,9 @@ bool processLoop ( void *state ) {
if ( xd->m_contentType == CT_JSON )
contentType = "application/json";
if ( xd->m_contentType == CT_STATUS )
contentType = "application/json";
if ( xd->m_contentType == CT_XML )
contentType = "text/xml";

@ -449,7 +449,7 @@ bool Msg1c::gotList ( ) {
sr.m_urlIsDocId = 1;
sr.m_fakeFirstIp = 1;
// for msg12 locking
sr.m_probDocId = docId;
//sr.m_probDocId = docId;
// use test-parser not test-spider
//sr.m_useTestSpiderDir = 0;
sr.m_parentIsSiteMap = 0;

@ -4040,6 +4040,71 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
sb->safePrintf("\",\n");
}
// print spider status pages special
if ( mr->ptr_content &&
si->m_format == FORMAT_HTML &&
mr->m_contentType == CT_STATUS ) {
if ( *numPrintedSoFar )
sb->safePrintf("<br><hr><br>\n");
// skip to gbssurl
char *s = strstr ( mr->ptr_content,"\"gbssUrl\":");
if ( ! s ) goto badformat;
// then do two columns after the two urls
char *e = strstr ( s , "\"gbssStatusCode\":" );
if ( ! e ) goto badformat;
char *m = strstr ( e , "\"gbssNumOutlinksAdded\":");
if ( ! m ) goto badformat;
// exclude \0
char *end = mr->ptr_content + mr->size_content - 1;
// use a table with 2 columns
// so we can use \n to separate lines and don't have to add brs
// and boldify just the main url, not the redir url!
sb->safePrintf("<pre style=display:inline;>"
"\"gbssUrl\":\""
"<b style=color:blue;><a href=/get?d=%"INT64">"
, mr->m_docId
);
char *s2 = strstr ( s , "\"gbssFinalRedirectUrl\":");
char *bend = e - 3;
if ( s2 ) bend = s2 - 3;
sb->safeMemcpy ( s+11 , bend - (s+11));
sb->safePrintf("</a></b></pre>\",<br>");
// now print redir url if there
if ( s2 ) {
sb->safePrintf("<pre style=display:inline;>");
sb->safeMemcpy ( s2 , e-s2 );
sb->removeLastChar('\n');
sb->safePrintf("</pre>");
}
sb->safePrintf("<table border=0 cellpadding=0 cellspacing=0>"
"<tr><td>");
sb->safePrintf("<pre>");
//int32_t off = sb->length();
sb->safeMemcpy ( e , m - e );
sb->safePrintf("</pre>");
sb->safePrintf("</td><td>");
sb->safePrintf("<pre>");
sb->safeMemcpy ( m , end - m );
// remove last \n
sb->removeLastChar('\n');
sb->removeLastChar('}');
sb->removeLastChar('\n');
sb->safePrintf("</pre>\n");
sb->safePrintf("</td></tr></table>");
// replace \n with <br>
// sb->safeReplace2 ( "\n" , 1 ,
// "<br>" , 4 ,
// 0,//niceness ,
// off );
// inc it
*numPrintedSoFar = *numPrintedSoFar + 1;
// just in case
sb->nullTerm();
return true;
}
badformat:
Highlight hi;
// get the url
@ -4373,7 +4438,6 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
//
///////
// the a href tag
if ( si->m_format == FORMAT_HTML ) {
sb->safePrintf ( "<a href=" );
@ -5196,7 +5260,8 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
sb->safePrintf (" - "
"<a style=color:blue; "
"href=\"/search?sb=1&c=%s&"
"q=url2%%3A"
//"q=url2%%3A"
"q=gbfieldmatch%%3AgbssUrl%%3A"
, coll
);
sb->urlEncode ( url , gbstrlen(url) , false );
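
The spider status block above depends on the field order produced by XmlDoc::getSpiderStatusDocMetaList2() later in this commit: "gbssUrl" first, an optional "gbssFinalRedirectUrl", then "gbssStatusCode" through the fields before "gbssNumOutlinksAdded" in the first table column, and "gbssNumOutlinksAdded" onward in the second. A truncated sample of such a status doc, with made-up values:

    {
    "type":"status",
    "gbssUrl":"http://www.example.com/page.html",
    "gbssFinalRedirectUrl":"http://www.example.com/index.html",
    "gbssStatusCode":0,
    "gbssStatusMsg":"Success",
    "gbssHttpStatus":200,
    "gbssDocId":123456789,
    ...
    "gbssNumOutlinksAdded":14,
    "gbssConsecutiveErrors":0,
    ...
    }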

@ -2324,11 +2324,13 @@ bool Rdb::addRecord ( collnum_t collnum,
SpiderRequest *sreq = (SpiderRequest *)data;
logf(LOG_DEBUG,"spider: added doledb key "
"for pri=%"INT32" time=%"UINT32" "
"uh48=%"UINT64" docid=%"INT64" u=%s",
"uh48=%"UINT64" "
//"docid=%"INT64" "
"u=%s",
(int32_t)g_doledb.getPriority(&doleKey),
(uint32_t)g_doledb.getSpiderTime(&doleKey),
g_doledb.getUrlHash48(&doleKey),
sreq->m_probDocId,
//sreq->m_probDocId,
sreq->m_url);
}
}

@ -1802,6 +1802,9 @@ void SpiderColl::clearLocks ( ) {
void SpiderColl::reset ( ) {
m_numSuccessReplies = 0;
m_numFailedReplies = 0;
// reset these for SpiderLoop;
m_nextDoledbKey.setMin();
//m_didRound = false;
@ -3973,8 +3976,20 @@ bool SpiderColl::scanListForWinners ( ) {
}
// if it's a SpiderReply, set it for upcoming requests
if ( ! g_spiderdb.isSpiderRequest ( (key128_t *)rec ) ) {
// see if this is the most recent one
SpiderReply *tmp = (SpiderReply *)rec;
// reset reply stats if beginning a new url
if ( srepUh48 != tmp->getUrlHash48() ) {
m_numSuccessReplies = 0;
m_numFailedReplies = 0;
}
// inc stats
if ( tmp->m_errCode == 0 ) m_numSuccessReplies++;
else m_numFailedReplies ++;
// if we have a more recent reply already, skip this
if ( srep &&
srep->getUrlHash48() == tmp->getUrlHash48() &&
@ -3994,6 +4009,12 @@ bool SpiderColl::scanListForWinners ( ) {
int64_t uh48 = sreq->getUrlHash48();
// reset reply stats if beginning a new url
if ( ! srep ) {
m_numSuccessReplies = 0;
m_numFailedReplies = 0;
}
// . skip if our twin should add it to doledb
// . waiting tree only has firstIps assigned to us so
// this should not be necessary
@ -4032,21 +4053,27 @@ bool SpiderColl::scanListForWinners ( ) {
! sreq->m_fakeFirstIp )
m_totalNewSpiderRequests++;
// reset page inlink count on url request change
if ( m_lastSreqUh48 != uh48 )
m_pageNumInlinks = 1;
//int32_t ipdom ( int32_t ip ) { return ip & 0x00ffffff; };
int32_t cblock = ipdom ( sreq->m_firstIp );
bool countIt = true;
if ( uh48 != m_lastSreqUh48 )
countIt = false;
// reset page inlink count on url request change
if ( m_lastSreqUh48 != uh48 ) {
m_pageNumInlinks = 0;
m_lastCBlockIp = 0;
}
//if ( uh48 != m_lastSreqUh48 )
// countIt = false;
if ( cblock == m_lastCBlockIp )
countIt = false;
// do not count manually added spider requests
if ( (sreq->m_isAddUrl || sreq->m_isInjecting) )
countIt = false;
// 20 is good enough
if ( m_pageNumInlinks >= 20 )
countIt = false;
@ -4069,6 +4096,12 @@ bool SpiderColl::scanListForWinners ( ) {
// set this now. it does increase with each request. so
// initial requests will not see the full # of inlinks.
sreq->m_pageNumInlinks = (uint8_t)m_pageNumInlinks;
// put these in the SpiderRequest in doledb so we can
// show them in the json spider status docs in
// XmlDoc::getSpiderStatusDocMetaList2()
sreq->m_reservedc1 = m_numSuccessReplies;
sreq->m_reservedc2 = m_numFailedReplies;
m_lastSreqUh48 = uh48;
m_lastCBlockIp = cblock;
@ -11032,21 +11065,6 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
}
if ( strncmp(p,"numinlinks",10) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
if ( (bool)sreq->m_pageNumInlinks == val) continue;
// skip
p += 10;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( *p=='h' && strncmp(p,"hasauthorityinlink",18) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
@ -12308,12 +12326,37 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}
if ( *p == 'n' && strncmp(p,"numinlinks",10) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// these are -1 if they are NOT valid
int32_t a = sreq->m_pageNumInlinks;
// the value to compare against
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
// skip fast
p += 10;
p = strstr(s, "&&");
//if no more constraints then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// siteNumInlinks >= 300 [&&]
if ( *p=='s' && strncmp(p, "sitenuminlinks", 14) == 0){
// these are -1 if they are NOT valid
int32_t a1 = sreq->m_siteNumInlinks;
// only assign if valid
int32_t a2 = -1; if ( srep ) a2 = srep->m_siteNumInlinks;
int32_t a2 = -1;
if ( srep ) a2 = srep->m_siteNumInlinks;
// assume a1 is the best
int32_t a ;
// assign to the first valid one
@ -13991,7 +14034,7 @@ bool SpiderRequest::setFromAddUrl ( char *url ) {
m_isAddUrl = 1;
m_addedTime = (uint32_t)getTimeGlobal();//now;
m_fakeFirstIp = 1;
m_probDocId = probDocId;
//m_probDocId = probDocId;
m_firstIp = firstIp;
m_hopCount = 0;
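
The numinlinks rule added above in getUrlFilterNum2() now parses a sign and a numeric value and compares it against SpiderRequest::m_pageNumInlinks, replacing the old boolean check that was removed. A url filter expression along these lines should therefore match the new parser (illustrative only; hasauthorityinlink is simply another keyword handled in the same function):

    numinlinks>=10 && hasauthorityinlink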

@ -524,7 +524,9 @@ class SpiderRequest {
// the PROBABLE DOCID. if there is a collision with another docid
// then we increment the last 8 bits or so. see Msg22.cpp.
int64_t m_probDocId;
//int64_t m_probDocId;
int32_t m_reservedc1;
int32_t m_reservedc2;
//int32_t m_parentPubDate;
@ -1153,6 +1155,9 @@ class SpiderColl {
int32_t m_tailHopCount;
int64_t m_minFutureTimeMS;
int32_t m_numSuccessReplies;
int32_t m_numFailedReplies;
// . do not re-send CrawlInfoLocal for a coll if not update
// . we store the flags in here as true if we should send our
// CrawlInfoLocal for this coll to this hostId

@ -932,7 +932,7 @@ bool Test::injectLoop ( ) {
m_sreq.m_domHash32 = fakeIp;
m_sreq.m_hostHash32 = fakeIp;
m_sreq.m_siteHash32 = fakeIp;
m_sreq.m_probDocId = g_titledb.getProbableDocId( m_sreq.m_url );
//m_sreq.m_probDocId = g_titledb.getProbableDocId( m_sreq.m_url );
// this crap is fake
m_sreq.m_isInjecting = 1;
// use test-spider subdir for storing pages and spider times?

@ -189,6 +189,10 @@ static int64_t s_lastTimeStart = 0LL;
void XmlDoc::reset ( ) {
m_ipStartTime = 0;
m_ipEndTime = 0;
m_diffbotReplyRetries = 0;
m_isImporting = false;
m_printedMenu = false;
@ -13106,6 +13110,10 @@ int32_t *XmlDoc::getIp ( ) {
// update status msg
setStatus ( "getting ip" );
m_ipStartTime = 0;
// assume the same in case we get it right away
m_ipEndTime = 0;
// if set from docid and recycling
if ( m_recycleContent ) {
// get the old xml doc from the old title rec
@ -13214,6 +13222,8 @@ int32_t *XmlDoc::getIp ( ) {
// update status msg
setStatus ( "getting ip" );
m_ipStartTime = gettimeofdayInMillisecondsGlobal();
// assume valid! if reply handler gets g_errno set then m_masterLoop
// should see that and call the final callback
//m_ipValid = true;
@ -13232,6 +13242,9 @@ int32_t *XmlDoc::getIp ( ) {
void gotIpWrapper ( void *state , int32_t ip ) {
// point to us
XmlDoc *THIS = (XmlDoc *)state;
THIS->m_ipEndTime = gettimeofdayInMillisecondsGlobal();
// wrap it up
THIS->gotIp ( true );
// . call the master callback
@ -14307,11 +14320,13 @@ void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
// m_diffbotReplyValid to true, below.
THIS->m_diffbotReplyError = 0;
log("build: retrying diffbot reply");
THIS->m_diffbotReplyRetries++;
// resume. this checks g_errno for being set.
THIS->m_masterLoop ( THIS->m_masterState );
return;
}
THIS->m_diffbotReplyEndTime = gettimeofdayInMillisecondsGlobal();
//char *buf = s->m_readBuf;
// do not allow TcpServer.cpp to free it since m_diffbotReply
@ -15454,6 +15469,8 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
"diffbot: getting %s headers=%s",m_diffbotUrl.getBufStart(),
additionalHeaders);
m_diffbotReplyStartTime = gettimeofdayInMillisecondsGlobal();
if ( ! g_httpServer.getDoc ( m_diffbotUrl.getBufStart() ,
0 , // ip
0 , // offset
@ -15930,6 +15947,8 @@ char **XmlDoc::getHttpReply2 ( ) {
char *xx=NULL;*xx=0;
}
m_downloadStartTimeValid = true;
m_downloadStartTime = gettimeofdayInMillisecondsGlobal();
if ( ! m_msg13.getDoc ( r , isTestColl,this , gotHttpReplyWrapper ) )
// return -1 if blocked
@ -20092,6 +20111,10 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
sb->safePrintf("siterank=%"INT32" ", sr );
}
if ( m_sreqValid )
sb->safePrintf("pageinlinks=%04"INT32" ",
m_sreq.m_pageNumInlinks);
// shortcut
int64_t uh48 = hash64b ( m_firstUrl.m_url );
// mask it
@ -25494,7 +25517,7 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
// now we need this so we can share Msg12 spider locks with
// query reindex docid-based spider requests. that way
// we do not spider the same document at the same time.
ksr.m_probDocId = g_titledb.getProbableDocId(&url);
//ksr.m_probDocId = g_titledb.getProbableDocId(&url);
//ksr.m_pageNumInlinks = 0;
@ -27046,7 +27069,10 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList ( SpiderReply *reply ) {
return mbuf;
}
// the spider status doc
// . the spider status doc
// . TODO:
// usedProxy:1
// proxyIp:1.2.3.4
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
setStatus ( "making spider reply meta list");
@ -27070,6 +27096,21 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
unsigned char *hc = (unsigned char *)getHopCount();
if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;
int32_t *priority = getSpiderPriority();
if ( ! priority || priority == (void *)-1 ) return (SafeBuf *)priority;
int32_t *ufn = getUrlFilterNum();
if ( ! ufn || ufn == (void *)-1 ) return (SafeBuf *)ufn;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// sanity
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }
// why isn't gbhopcount: being indexed consistently?
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
// reset just in case
m_spiderStatusDocMetaList.reset();
@ -27082,30 +27123,230 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
return &m_spiderStatusDocMetaList;
}
// the old doc
XmlDoc *od = NULL;
if ( m_oldDocValid && m_oldDoc ) od = m_oldDoc;
Url *fu = &m_firstUrl;
// . make a little json doc that we'll hash up
// . only index the fields in this doc, no extra gbdocid: inurl:
// hash terms
SafeBuf jd;
jd.safePrintf("{\n");
// so type:status query works
jd.safePrintf("\"type\":\"status\",\n");
jd.safePrintf("\"gbssUrl\":\"%s\",\n" , fu->getUrl() );
if ( ptr_redirUrl )
jd.safePrintf("\"gbssFinalRedirectUrl\":\"%s\",\n",
ptr_redirUrl);
jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);
jd.safePrintf("\"gbssStatusMsg\":\"");
jd.jsonEncode (mstrerror(m_indexCode));
jd.safePrintf("\",\n");
if ( m_httpStatusValid )
jd.safePrintf("\"gbssHttpStatus\":%"INT32",\n",
(int32_t)m_httpStatus);
if ( od )
jd.safePrintf("\"gbssPreviouslyIndexed\":1,\n");
else
jd.safePrintf("\"gbssPreviouslyIndexed\":0,\n");
jd.safePrintf("\"gbssDomain\":\"");
jd.safeMemcpy(fu->getDomain(), fu->getDomainLen() );
jd.safePrintf("\",\n");
jd.safePrintf("\"gbssSubdomain\":\"");
jd.safeMemcpy(fu->getHost(), fu->getHostLen() );
jd.safePrintf("\",\n");
//if ( m_redirUrlPtr && m_redirUrlValid )
jd.safePrintf("\"gbssNumRedirects\":%"INT32",\n",
m_numRedirects);
jd.safePrintf("\"gbssDocId\":%"INT64",\n", *uqd);
jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)*hc);
// crawlbot round
if ( cr->m_isCustomCrawl )
jd.safePrintf("\"gbssCrawlRound\":%"INT32",\n",
cr->m_spiderRoundNum);
if ( m_isDupValid && m_isDup )
jd.safePrintf("\"gbssDupOfDocId\":%"INT64",\n",
m_docIdWeAreADupOf);
// how many spiderings were successful vs. failed
if ( m_sreqValid ) {
jd.safePrintf("\"gbssPrevTotalNumSpiderAttempts\":%"INT32",\n",
m_sreq.m_reservedc1 + m_sreq.m_reservedc2 );
jd.safePrintf("\"gbssPrevTotalNumSpiderSuccesses\":%"INT32",\n",
m_sreq.m_reservedc1);
jd.safePrintf("\"gbssPrevTotalNumSpiderFailures\":%"INT32",\n",
m_sreq.m_reservedc2);
}
if ( m_firstIndexedDateValid )
jd.safePrintf("\"gbssFirstIndexed\":%"UINT32",\n",
m_firstIndexedDate);
if ( m_contentHash32Valid )
jd.safePrintf("\"gbssContentHash32\":%"UINT32",\n",
m_contentHash32);
if ( m_downloadStartTimeValid ) {
jd.safePrintf("\"gbssDownloadStartTimeMS\":%"INT64",\n",
m_downloadStartTime);
jd.safePrintf("\"gbssDownloadStartTime\":%"UINT32",\n",
(uint32_t)(m_downloadStartTime/1000));
}
if ( m_downloadEndTimeValid ) {
jd.safePrintf("\"gbssDownloadEndTimeMS\":%"INT64",\n",
m_downloadEndTime);
jd.safePrintf("\"gbssDownloadEndTime\":%"UINT32",\n",
(uint32_t)(m_downloadEndTime/1000));
}
if ( m_downloadEndTimeValid ) {
int64_t took = m_downloadEndTime - m_downloadStartTime;
jd.safePrintf("\"gbssDownloadDurationMS\":%"INT64",\n",took);
}
jd.safePrintf("\"gbssUsedRobotsTxt\":%"INT32",\n",
m_useRobotsTxt);
//if ( m_numOutlinksAddedValid )
jd.safePrintf("\"gbssNumOutlinksAdded\":%"INT32",\n",
(int32_t)m_numOutlinksAdded);
// how many download/indexing errors we've had, including this one
// if applicable.
jd.safePrintf("\"gbssConsecutiveErrors\":%"INT32",\n",
m_srep.m_errCount);
if ( od )
jd.safePrintf("\"gbssLastSuccessfulDownloadEndTime\":"
"%"UINT32",\n",od->m_spideredTime);
else
jd.safePrintf("\"gbssLastSuccessfulDownloadEndTime\":"
"%"UINT32",\n",0);
if ( m_ipValid )
jd.safePrintf("\"gbssIp\":\"%s\",\n",iptoa(m_ip));
else
jd.safePrintf("\"gbssIp\":\"0.0.0.0\",\n");
if ( m_ipEndTime ) {
int64_t took = m_ipEndTime - m_ipStartTime;
jd.safePrintf("\"gbssIpLookupTimeMS\":%"INT64",\n",took);
}
if ( m_siteNumInlinksValid ) {
jd.safePrintf("\"gbssSiteNumInlinks\":%"INT32",\n",
(int32_t)m_siteNumInlinks);
char siteRank = getSiteRank();
jd.safePrintf("\"gbssSiteRank\":%"INT32",\n",
(int32_t)siteRank);
}
jd.safePrintf("\"gbssContentInjected\":%"INT32",\n",
(int32_t)m_contentInjected);
if ( m_percentChangedValid && od )
jd.safePrintf("\"gbssPercentContentChanged\""
":\"%.01f%%\",\n",
m_percentChanged);
jd.safePrintf("\"gbssSpiderPriority\":%"INT32",\n",
*priority);
jd.safePrintf("\"gbssMatchingUrlFilter\":\"%s\",\n",
cr->m_regExs[*ufn].getBufStart());
if ( m_langIdValid )
jd.safePrintf("\"gbssLanguage\":\"%s\",\n",
getLangAbbr(m_langId));
if ( m_contentTypeValid )
jd.safePrintf("\"gbssContentType\":\"%s\",\n",
g_contentTypeStrings[m_contentType]);
if ( m_contentValid )
jd.safePrintf("\"gbssContentLen\":%"INT32",\n",
m_contentLen);
if ( m_crawlDelayValid )
// -1 if none?
jd.safePrintf("\"gbssCrawlDelayMS\":%"INT32",\n",
(int32_t)m_crawlDelay);
// sent to diffbot?
jd.safePrintf("\"gbssSentToDiffbot\":%i,\n",
(int)m_sentToDiffbot);
if ( m_diffbotReplyValid ) {
jd.safePrintf("\"gbssDiffbotReplyCode\":%"INT32",\n",
m_diffbotReplyError);
jd.safePrintf("\"gbssDiffbotReplyMsg\":\"");
jd.jsonEncode(mstrerror(m_diffbotReplyError));
jd.safePrintf("\",\n");
jd.safePrintf("\"gbssDiffbotReplyLen\":%"INT32",\n",
m_diffbotReply.length());
int64_t took = m_diffbotReplyEndTime - m_diffbotReplyStartTime;
jd.safePrintf("\"gbssDiffbotReplyResponseTimeMS\":%"INT64",\n",
took );
jd.safePrintf("\"gbssDiffbotReplyRetries\":%"INT32",\n",
m_diffbotReplyRetries );
jd.safePrintf("\"gbssDiffbotReplyNumObjects\":%"INT32",\n",
m_diffbotJSONCount);
}
// remove last ,\n
jd.incrementLength(-2);
// end the json spider status doc
jd.safePrintf("}\n");
// the posdb table
HashTableX tt4;
if ( !tt4.set(18,4,256,NULL,0,false,m_niceness,"posdb-spindx"))
return NULL;
Json jp;
if ( ! jp.parseJsonStringIntoJsonItems ( jd.getBufStart(),m_niceness)){
g_errno = EBADJSONPARSER;
return NULL;
}
// BEFORE ANY HASHING
int32_t savedDist = m_dist;
// re-set to 0
m_dist = 0;
// sanity
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }
// why isn't gbhopcount: being indexed consistently?
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
// hash like gbstatus:"Tcp Timed out" or gbstatus:"Doc unchanged"
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = &tt4;
hi.m_desc = "json spider status object";
hi.m_useCountTable = false;
hi.m_useSections = false;
// fill up tt4. false -> do not hash without field prefixes.
hashJSONFields2 ( &tt4 , &hi , &jp , false );
/*
char buf[64];
int32_t bufLen;
@ -27120,6 +27361,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
hi.m_desc = "spider error number as string";
bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)m_indexCode );
if ( ! hashString( buf , &hi ) ) return NULL;
*/
/*
logf(LOG_DEBUG,"url: %s",m_firstUrl.m_url);
@ -27174,6 +27416,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
// was here....
*/
/*
// gbstatus:"tcp timed out"
hi.m_prefix = "gbstatusmsg";
hi.m_desc = "spider error msg";
@ -27191,6 +27434,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
// false --> do not hash the gbdoc* terms (CT_STATUS)
hashDateNumbers ( &tt4 , true );
*/
// store keys in safebuf then to make our own meta list
addTable144 ( &tt4 , *uqd , &m_spiderStatusDocMetaList );
@ -27230,6 +27474,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
int32_t fullsize = &m_dummyEnd - (char *)this;
if ( fullsize > 2048 ) { char *xx=NULL;*xx=0; }
/*
// the ptr_* were all zero'd out, put the ones we want to keep back in
SafeBuf tmp;
// was "Spider Status: %s" but that is unnecessary
@ -27242,6 +27487,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
if ( m_redirUrlPtr && m_redirUrlValid )
tmp.safePrintf("Redirected to %s<br>",m_redirUrlPtr->getUrl());
*/
// put stats like we log out from logIt
//tmp.safePrintf("<div style=max-width:800px;>\n");
@ -27250,8 +27496,10 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
//tmp.safePrintf("\n</div>");
// the content is now the json spider status doc itself
xd->ptr_utf8Content = tmp.getBufStart();
xd->size_utf8Content = tmp.length()+1;
// xd->ptr_utf8Content = tmp.getBufStart();
// xd->size_utf8Content = tmp.length()+1;
xd->ptr_utf8Content = jd.getBufStart();
xd->size_utf8Content = jd.length()+1;
// keep the same url as the doc we are the spider reply for
xd->ptr_firstUrl = ptr_firstUrl;
@ -27423,7 +27671,7 @@ int32_t XmlDoc::getIndexedTime() {
// . hash dates for sorting by using gbsortby: and gbrevsortby:
// . do 'gbsortby:gbspiderdate' as your query to see this in action
bool XmlDoc::hashDateNumbers ( HashTableX *tt , bool isStatusDoc ) {
bool XmlDoc::hashDateNumbers ( HashTableX *tt ) { // , bool isStatusDoc ) {
// stop if already set
if ( ! m_spideredTimeValid ) return true;
@ -27453,7 +27701,7 @@ bool XmlDoc::hashDateNumbers ( HashTableX *tt , bool isStatusDoc ) {
// do not index the rest if we are a "spider reply" document
// which is like a fake document for seeing spider statuses
//if ( isStatusDoc == CT_STATUS ) return true;
if ( isStatusDoc ) return true;
//if ( isStatusDoc ) return true;
// now for CT_STATUS spider status "documents" we also index
// gbspiderdate so index this so we can just do a
@ -27873,7 +28121,7 @@ bool XmlDoc::hashLinksForLinkdb ( HashTableX *dt ) {
// . returns false and sets g_errno on error
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
bool XmlDoc::hashUrl ( HashTableX *tt ) { // , bool isStatusDoc ) {
setStatus ( "hashing url colon" );
@ -27893,7 +28141,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
// append a "www." for doing url: searches
Url uw; uw.set ( fu->getUrl() , fu->getUrlLen() , true );
hi.m_prefix = "url";
if ( isStatusDoc ) hi.m_prefix = "url2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "url2";
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
return false;
@ -27908,7 +28157,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
char *s = fu->getUrl ();
int32_t slen = fu->getUrlLen();
hi.m_prefix = "inurl";
if ( isStatusDoc ) hi.m_prefix = "inurl2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "inurl2";
if ( ! hashString ( s,slen, &hi ) ) return false;
setStatus ( "hashing ip colon" );
@ -27923,7 +28173,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
//char *tmp = iptoa ( m_ip );
//int32_t tlen = gbstrlen(tmp);
hi.m_prefix = "ip";
if ( isStatusDoc ) hi.m_prefix = "ip2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "ip2";
if ( ! hashSingleTerm(ipbuf,iplen,&hi) ) return false;
//
@ -27993,7 +28244,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
int32_t blen = sprintf(buf,"%"INT32"",pathDepth);
// update parms
hi.m_prefix = "gbpathdepth";
if ( isStatusDoc ) hi.m_prefix = "gbpathdepth2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbpathdepth2";
hi.m_hashGroup = HASHGROUP_INTAG;
// hash gbpathdepth:X
if ( ! hashString ( buf,blen,&hi) ) return false;
@ -28008,7 +28260,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
blen = sprintf(buf,"%"INT32"",(int32_t)m_hopCount);
// update parms
hi.m_prefix = "gbhopcount";
if ( isStatusDoc ) hi.m_prefix = "gbhopcount2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbhopcount2";
hi.m_hashGroup = HASHGROUP_INTAG;
// hash gbpathdepth:X
if ( ! hashString ( buf,blen,&hi) ) return false;
@ -28025,7 +28278,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
else hm = "0";
// update parms
hi.m_prefix = "gbhasfilename";
if ( isStatusDoc ) hi.m_prefix = "gbhasfilename2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbhasfilename2";
// hash gbhasfilename:[0|1]
if ( ! hashString ( hm,1,&hi) ) return false;
@ -28037,7 +28291,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
if ( fu->isCgi() ) hm = "1";
else hm = "0";
hi.m_prefix = "gbiscgi";
if ( isStatusDoc ) hi.m_prefix = "gbiscgi2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbiscgi2";
if ( ! hashString ( hm,1,&hi) ) return false;
@ -28051,7 +28306,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
if ( fu->getExtensionLen() ) hm = "1";
else hm = "0";
hi.m_prefix = "gbhasext";
if ( isStatusDoc ) hi.m_prefix = "gbhasext2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbhasext2";
if ( ! hashString ( hm,1,&hi) ) return false;
//
@ -28096,7 +28352,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
*p = '\0';
// update hash parms
hi.m_prefix = "site";
if ( isStatusDoc ) hi.m_prefix = "site2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "site2";
hi.m_hashGroup = HASHGROUP_INURL;
// this returns false on failure
if ( ! hashSingleTerm (buf,p-buf,&hi ) ) return false;
@ -28120,13 +28377,15 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
int32_t elen = fu->getExtensionLen();
// update hash parms
hi.m_prefix = "ext";
if ( isStatusDoc ) hi.m_prefix = "ext2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "ext2";
if ( ! hashSingleTerm(ext,elen,&hi ) ) return false;
setStatus ( "hashing gbdocid" );
hi.m_prefix = "gbdocid";
if ( isStatusDoc ) hi.m_prefix = "gbdocid2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbdocid2";
char buf2[32];
sprintf(buf2,"%"UINT64"",(m_docId) );
if ( ! hashSingleTerm(buf2,gbstrlen(buf2),&hi) ) return false;
@ -28146,12 +28405,13 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
// append a "www." as part of normalization
uw.set ( fu->getUrl() , p - fu->getUrl() , true );
hi.m_prefix = "gbparenturl";
if ( isStatusDoc ) hi.m_prefix = "gbparenturl2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbparenturl2";
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
return false;
}
if ( isStatusDoc ) return true;
//if ( isStatusDoc ) return true;
setStatus ( "hashing SiteGetter terms");
@ -30054,7 +30314,9 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
// if we had a facet, get the values it has in the doc
if ( qs && *qs ) {
// need this for storeFacetValues() if we are json
if ( m_contentType == CT_JSON ) {
if ( m_contentType == CT_JSON ||
// spider status docs are really json
m_contentType == CT_STATUS ) {
Json *jp = getParsedJson();
if ( ! jp || jp == (void *)-1)
return (Msg20Reply *)jp;
@ -30576,7 +30838,8 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
reply->size_gbAdIds = size_adVector;
// need full cached page of each search result?
if ( m_req->m_includeCachedCopy ) {
// include it always for spider status docs.
if ( m_req->m_includeCachedCopy || m_contentType == CT_STATUS ) {
reply-> ptr_content = ptr_utf8Content;
reply->size_content = size_utf8Content;
}
@ -49681,7 +49944,9 @@ Json *XmlDoc::getParsedJson ( ) {
if ( m_jpValid ) return &m_jp;
// core if not a json object
if ( m_contentTypeValid && m_contentType != CT_JSON ) {
if ( m_contentTypeValid && m_contentType != CT_JSON &&
// spider status docs are now really json
m_contentType != CT_STATUS ) {
char *xx=NULL;*xx=0; }
// \0 terminated
@ -49724,7 +49989,15 @@ char *XmlDoc::hashJSONFields ( HashTableX *table ) {
// use new json parser
Json *jp = getParsedJson();
if ( ! jp || jp == (void *)-1 ) return (char *)jp;
return hashJSONFields2 ( table , &hi , jp , true );
}
char *XmlDoc::hashJSONFields2 ( HashTableX *table ,
HashInfo *hi , Json *jp ,
bool hashWithoutFieldNames ) {
JsonItem *ji = jp->getFirstItem();
char nb[1024];
@ -49788,17 +50061,17 @@ char *XmlDoc::hashJSONFields ( HashTableX *table ) {
// DIFFBOT special field hacks
//
char *name = nameBuf.getBufStart();
hi.m_hashGroup = HASHGROUP_BODY;
hi->m_hashGroup = HASHGROUP_BODY;
if ( strstr(name,"title") )
hi.m_hashGroup = HASHGROUP_TITLE;
hi->m_hashGroup = HASHGROUP_TITLE;
if ( strstr(name,"url") )
hi.m_hashGroup = HASHGROUP_INURL;
hi->m_hashGroup = HASHGROUP_INURL;
if ( strstr(name,"resolved_url") )
hi.m_hashGroup = HASHGROUP_INURL;
hi->m_hashGroup = HASHGROUP_INURL;
if ( strstr(name,"tags") )
hi.m_hashGroup = HASHGROUP_INTAG;
hi->m_hashGroup = HASHGROUP_INTAG;
if ( strstr(name,"meta") )
hi.m_hashGroup = HASHGROUP_INMETATAG;
hi->m_hashGroup = HASHGROUP_INMETATAG;
//
// now Json.cpp decodes and stores the value into
// a buffer, so ji->getValue() should be decoded completely
@ -49845,7 +50118,7 @@ char *XmlDoc::hashJSONFields ( HashTableX *table ) {
// set EDOCUNCHANGED in ::getIndexCode() above.
//
/*
if ( hi.m_hashGroup != HASHGROUP_INURL ) {
if ( hi->m_hashGroup != HASHGROUP_INURL ) {
// make the content hash so we can set m_contentHash32
// for deduping
int32_t nh32 = hash32n ( name );
@ -49858,28 +50131,31 @@ char *XmlDoc::hashJSONFields ( HashTableX *table ) {
*/
// index like "title:whatever"
hi.m_prefix = name;
hashString ( val , vlen , &hi );
hi->m_prefix = name;
hashString ( val , vlen , hi );
// hash gbfieldmatch:some.fieldInJson:"case-sens field Value"
if ( name )
hashFieldMatchTerm ( val , (int32_t)vlen , &hi );
hashFieldMatchTerm ( val , (int32_t)vlen , hi );
if ( ! hashWithoutFieldNames )
continue;
// hash without the field name as well
hi.m_prefix = NULL;
hashString ( val , vlen , &hi );
hi->m_prefix = NULL;
hashString ( val , vlen , hi );
/*
// a number? hash special then as well
if ( ji->m_type != JT_NUMBER ) continue;
// use prefix for this though
hi.m_prefix = name;
hi->m_prefix = name;
// hash as a number so we can sort search results by
// this number and do range constraints
float f = ji->m_valueDouble;
if ( ! hashNumber2 ( f , &hi ) )
if ( ! hashNumber2 ( f , hi ) )
return NULL;
*/
}
@ -49986,7 +50262,8 @@ bool XmlDoc::storeFacetValues ( char *qs , SafeBuf *sb , FacetValHash_t fvh ) {
return storeFacetValuesSections ( qs , sb , fvh );
// if a json doc, get json field
if ( m_contentType == CT_JSON )
// spider status docs are really json now
if ( m_contentType == CT_JSON || m_contentType == CT_STATUS )
return storeFacetValuesJSON ( qs , sb , fvh );
if ( m_contentType == CT_HTML )

@ -735,6 +735,16 @@ class XmlDoc {
char *getDiffbotParentUrl( char *myUrl );
int64_t m_diffbotReplyEndTime;
int64_t m_diffbotReplyStartTime;
int32_t m_diffbotReplyRetries;
uint64_t m_downloadStartTime;
//uint64_t m_downloadEndTime;
uint64_t m_ipStartTime;
uint64_t m_ipEndTime;
void copyFromOldDoc ( class XmlDoc *od ) ;
class SpiderReply *getFakeSpiderReply ( );
@ -786,8 +796,8 @@ class XmlDoc {
bool hashContentType ( class HashTableX *table ) ;
bool hashDMOZCategories ( class HashTableX *table ) ;
bool hashLinks ( class HashTableX *table ) ;
bool hashUrl ( class HashTableX *table , bool isStatusDoc = false ) ;
bool hashDateNumbers ( class HashTableX *tt , bool isStatusDoc=false) ;
bool hashUrl ( class HashTableX *table );
bool hashDateNumbers ( class HashTableX *tt );
bool hashSections ( class HashTableX *table ) ;
bool hashIncomingLinkText ( class HashTableX *table ,
bool hashAnomalies ,
@ -1149,6 +1159,7 @@ class XmlDoc {
char m_addedSpiderRequestSizeValid;
char m_addedSpiderReplySizeValid;
char m_addedStatusDocSizeValid;
char m_downloadStartTimeValid;
//char m_docQualityValid;
char m_siteValid;
char m_startTimeValid;
@ -1716,6 +1727,9 @@ class XmlDoc {
bool doesPageContentMatchDiffbotProcessPattern() ;
int32_t *getDiffbotTitleHashes ( int32_t *numHashes ) ;
char *hashJSONFields ( HashTableX *table );
char *hashJSONFields2 ( HashTableX *table , HashInfo *hi , Json *jp ,
bool hashWithoutFieldNames ) ;
char *hashXMLFields ( HashTableX *table );
int32_t *reindexJSONObjects ( int32_t *newTitleHashes ,
int32_t numNewHashes ) ;