Now we add the spider status docs as JSON documents, so you can facet/sort-by the various fields, etc.
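For reference, a spider status doc built by the new XmlDoc::getSpiderStatusDocMetaList2() in the diff below looks roughly like this. The field names come straight from the code; the values here are illustrative only:

    {
    "type":"status",
    "gbssUrl":"http://example.com/page.html",
    "gbssStatusCode":0,
    "gbssStatusMsg":"Success",
    "gbssHttpStatus":200,
    "gbssDocId":123456789,
    "gbssHopCount":1,
    "gbssIp":"1.2.3.4",
    "gbssDownloadDurationMS":250,
    "gbssNumOutlinksAdded":12
    }

Note the code below trims the trailing ",\n" with jd.incrementLength(-2) before writing the closing brace.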
@@ -476,6 +476,8 @@ bool Collectiondb::addNewColl ( char *coll ,
if ( customCrawl ) {
// always index spider status docs now
cr->m_indexSpiderReplies = true;
// remember the token
cr->m_diffbotToken.set ( token );
cr->m_diffbotCrawlName.set ( crawl );
@@ -4258,7 +4258,7 @@ bool getSpiderRequestMetaList ( char *doc ,
sreq.m_hostHash32 = url.getHostHash32();
sreq.m_domHash32 = url.getDomainHash32();
sreq.m_siteHash32 = url.getHostHash32();
sreq.m_probDocId = probDocId;
//sreq.m_probDocId = probDocId;
sreq.m_hopCount = 0; // we're a seed
sreq.m_hopCountValid = true;
sreq.m_addedTime = now;
PageGet.cpp (15 changed lines)
@@ -407,6 +407,10 @@ bool processLoop ( void *state ) {
if ( format == FORMAT_XML ) sb->reset();
if ( format == FORMAT_JSON ) sb->reset();

if ( xd->m_contentType == CT_JSON ) sb->reset();
if ( xd->m_contentType == CT_XML ) sb->reset();
if ( xd->m_contentType == CT_STATUS ) sb->reset();

// for undoing the stuff below
int32_t startLen2 = sb->length();//p;

@@ -431,6 +435,9 @@ bool processLoop ( void *state ) {
if ( xd->m_contentType == CT_JSON )
printDisclaimer = false;

if ( xd->m_contentType == CT_STATUS )
printDisclaimer = false;

if ( format == FORMAT_XML ) printDisclaimer = false;
if ( format == FORMAT_JSON ) printDisclaimer = false;

@@ -624,6 +631,8 @@ bool processLoop ( void *state ) {
includeHeader = false;
if ( xd->m_contentType == CT_XML )
includeHeader = false;
if ( xd->m_contentType == CT_STATUS )
includeHeader = false;

if ( format == FORMAT_XML ) includeHeader = false;
if ( format == FORMAT_JSON ) includeHeader = false;
@@ -679,6 +688,7 @@ bool processLoop ( void *state ) {
// do not calc title or print it if doc is xml or json
if ( ctype == CT_XML ) sbend = sbstart;
if ( ctype == CT_JSON ) sbend = sbstart;
if ( ctype == CT_STATUS ) sbend = sbstart;

for ( char *t = sbstart ; t < sbend ; t++ ) {
// title tag?
@@ -813,6 +823,8 @@ bool processLoop ( void *state ) {
// do not do term highlighting if json
if ( xd->m_contentType == CT_JSON )
queryHighlighting = false;
if ( xd->m_contentType == CT_STATUS )
queryHighlighting = false;

SafeBuf tmp;
SafeBuf *xb = sb;
@@ -917,6 +929,9 @@ bool processLoop ( void *state ) {
if ( xd->m_contentType == CT_JSON )
contentType = "application/json";

if ( xd->m_contentType == CT_STATUS )
contentType = "application/json";

if ( xd->m_contentType == CT_XML )
contentType = "test/xml";

@@ -449,7 +449,7 @@ bool Msg1c::gotList ( ) {
sr.m_urlIsDocId = 1;
sr.m_fakeFirstIp = 1;
// for msg12 locking
sr.m_probDocId = docId;
//sr.m_probDocId = docId;
// use test-parser not test-spider
//sr.m_useTestSpiderDir = 0;
sr.m_parentIsSiteMap = 0;
@@ -4040,6 +4040,71 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
sb->safePrintf("\",\n");
}

// print spider status pages special
if ( mr->ptr_content &&
si->m_format == FORMAT_HTML &&
mr->m_contentType == CT_STATUS ) {
if ( *numPrintedSoFar )
sb->safePrintf("<br><hr><br>\n");
// skip to gbssurl
char *s = strstr ( mr->ptr_content,"\"gbssUrl\":");
if ( ! s ) goto badformat;
// then do two columns after the two urls
char *e = strstr ( s , "\"gbssStatusCode\":" );
if ( ! e ) goto badformat;
char *m = strstr ( e , "\"gbssNumOutlinksAdded\":");
if ( ! m ) goto badformat;
// exclude \0
char *end = mr->ptr_content + mr->size_content - 1;
// use a table with 2 columns
// so we can use \n to separate lines and don't have to add brs
// and boldify just the main url, not the redir url!
sb->safePrintf("<pre style=display:inline;>"
"\"gbssUrl\":\""
"<b style=color:blue;><a href=/get?d=%"INT64">"
, mr->m_docId
);
char *s2 = strstr ( s , "\"gbssFinalRedirectUrl\":");
char *bend = e - 3;
if ( s2 ) bend = s2 - 3;
sb->safeMemcpy ( s+11 , bend - (s+11));
sb->safePrintf("</a></b></pre>\",<br>");
// now print redir url if there
if ( s2 ) {
sb->safePrintf("<pre style=display:inline;>");
sb->safeMemcpy ( s2 , e-s2 );
sb->removeLastChar('\n');
sb->safePrintf("</pre>");
}
sb->safePrintf("<table border=0 cellpadding=0 cellspacing=0>"
"<tr><td>");
sb->safePrintf("<pre>");
//int32_t off = sb->length();
sb->safeMemcpy ( e , m - e );
sb->safePrintf("</pre>");
sb->safePrintf("</td><td>");
sb->safePrintf("<pre>");
sb->safeMemcpy ( m , end - m );
// remove last \n
sb->removeLastChar('\n');
sb->removeLastChar('}');
sb->removeLastChar('\n');
sb->safePrintf("</pre>\n");
sb->safePrintf("</td></tr></table>");
// replace \n with <br>
// sb->safeReplace2 ( "\n" , 1 ,
// "<br>" , 4 ,
// 0,//niceness ,
// off );
// inc it
*numPrintedSoFar = *numPrintedSoFar + 1;
// just in case
sb->nullTerm();
return true;
}

badformat:

Highlight hi;

// get the url
@@ -4373,7 +4438,6 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
//
///////


// the a href tag
if ( si->m_format == FORMAT_HTML ) {
sb->safePrintf ( "<a href=" );
@@ -5196,7 +5260,8 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
sb->safePrintf (" - "
"<a style=color:blue; "
"href=\"/search?sb=1&c=%s&"
"q=url2%%3A"
//"q=url2%%3A"
"q=gbfieldmatch%%3AgbssUrl%%3A"
, coll
);
sb->urlEncode ( url , gbstrlen(url) , false );
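The gbfieldmatch:gbssUrl: link built above ties each HTML search result back to its status doc. Since the status docs are indexed as ordinary JSON, they can be queried directly; some illustrative queries, assuming the operators referenced by comments in this commit (type:, gbfieldmatch:, gbsortby:) behave as described:

    type:status
    type:status gbfieldmatch:gbssUrl:http://example.com/page.html
    gbsortby:gbssDownloadStartTime
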
Rdb.cpp (6 changed lines)
@@ -2324,11 +2324,13 @@ bool Rdb::addRecord ( collnum_t collnum,
SpiderRequest *sreq = (SpiderRequest *)data;
logf(LOG_DEBUG,"spider: added doledb key "
"for pri=%"INT32" time=%"UINT32" "
"uh48=%"UINT64" docid=%"INT64" u=%s",
"uh48=%"UINT64" "
//"docid=%"INT64" "
"u=%s",
(int32_t)g_doledb.getPriority(&doleKey),
(uint32_t)g_doledb.getSpiderTime(&doleKey),
g_doledb.getUrlHash48(&doleKey),
sreq->m_probDocId,
//sreq->m_probDocId,
sreq->m_url);
}
}
Spider.cpp (89 changed lines)
@@ -1802,6 +1802,9 @@ void SpiderColl::clearLocks ( ) {

void SpiderColl::reset ( ) {

m_numSuccessReplies = 0;
m_numFailedReplies = 0;

// reset these for SpiderLoop;
m_nextDoledbKey.setMin();
//m_didRound = false;
@@ -3973,8 +3976,20 @@ bool SpiderColl::scanListForWinners ( ) {
}
// if its a SpiderReply set it for an upcoming requests
if ( ! g_spiderdb.isSpiderRequest ( (key128_t *)rec ) ) {

// see if this is the most recent one
SpiderReply *tmp = (SpiderReply *)rec;

// reset reply stats if beginning a new url
if ( srepUh48 != tmp->getUrlHash48() ) {
m_numSuccessReplies = 0;
m_numFailedReplies = 0;
}

// inc stats
if ( tmp->m_errCode == 0 ) m_numSuccessReplies++;
else m_numFailedReplies ++;

// if we have a more recent reply already, skip this
if ( srep &&
srep->getUrlHash48() == tmp->getUrlHash48() &&
@@ -3994,6 +4009,12 @@ bool SpiderColl::scanListForWinners ( ) {

int64_t uh48 = sreq->getUrlHash48();

// reset reply stats if beginning a new url
if ( ! srep ) {
m_numSuccessReplies = 0;
m_numFailedReplies = 0;
}

// . skip if our twin should add it to doledb
// . waiting tree only has firstIps assigned to us so
// this should not be necessary
@@ -4032,21 +4053,27 @@ bool SpiderColl::scanListForWinners ( ) {
! sreq->m_fakeFirstIp )
m_totalNewSpiderRequests++;

// reset page inlink count on url request change
if ( m_lastSreqUh48 != uh48 )
m_pageNumInlinks = 1;

//int32_t ipdom ( int32_t ip ) { return ip & 0x00ffffff; };
int32_t cblock = ipdom ( sreq->m_firstIp );

bool countIt = true;

if ( uh48 != m_lastSreqUh48 )
countIt = false;
// reset page inlink count on url request change
if ( m_lastSreqUh48 != uh48 ) {
m_pageNumInlinks = 0;
m_lastCBlockIp = 0;
}

//if ( uh48 != m_lastSreqUh48 )
// countIt = false;

if ( cblock == m_lastCBlockIp )
countIt = false;

// do not count manually added spider requests
if ( (sreq->m_isAddUrl || sreq->m_isInjecting) )
countIt = false;

// 20 is good enough
if ( m_pageNumInlinks >= 20 )
countIt = false;
@@ -4069,6 +4096,12 @@ bool SpiderColl::scanListForWinners ( ) {
// set this now. it does increase with each request. so
// initial requests will not see the full # of inlinks.
sreq->m_pageNumInlinks = (uint8_t)m_pageNumInlinks;

// put these in the spiderequest in doledb so we can
// show in the json spider status docs in
// XmlDoc::getSpiderStatusDocMetaList2()
sreq->m_reservedc1 = m_numSuccessReplies;
sreq->m_reservedc2 = m_numFailedReplies;

m_lastSreqUh48 = uh48;
m_lastCBlockIp = cblock;
@@ -11032,21 +11065,6 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
}


if ( strncmp(p,"numinlinks",10) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
if ( (bool)sreq->m_pageNumInlinks == val) continue;
// skip
p += 10;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}

if ( *p=='h' && strncmp(p,"hasauthorityinlink",18) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
@@ -12308,12 +12326,37 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}

if ( *p == 'n' && strncmp(p,"numinlinks",10) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// these are -1 if they are NOT valid
int32_t a = sreq->m_pageNumInlinks;
// make it point to the priority
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
// skip fast
p += 10;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}

// siteNumInlinks >= 300 [&&]
if ( *p=='s' && strncmp(p, "sitenuminlinks", 14) == 0){
// these are -1 if they are NOT valid
int32_t a1 = sreq->m_siteNumInlinks;
// only assign if valid
int32_t a2 = -1; if ( srep ) a2 = srep->m_siteNumInlinks;
int32_t a2 = -1;
if ( srep ) a2 = srep->m_siteNumInlinks;
// assume a1 is the best
int32_t a ;
// assign to the first valid one
@@ -13991,7 +14034,7 @@ bool SpiderRequest::setFromAddUrl ( char *url ) {
m_isAddUrl = 1;
m_addedTime = (uint32_t)getTimeGlobal();//now;
m_fakeFirstIp = 1;
m_probDocId = probDocId;
//m_probDocId = probDocId;
m_firstIp = firstIp;
m_hopCount = 0;

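The getUrlFilterNum2 hunks above move the numinlinks url-filter rule and rewrite it to honor comparison signs (SIGN_EQ through SIGN_LE) with && joining constraints. A hypothetical url filter expression in that syntax, inferred from the parser above:

    numinlinks>=20 && hasauthorityinlink
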
Spider.h (7 changed lines)
@@ -524,7 +524,9 @@ class SpiderRequest {

// the PROBABLE DOCID. if there is a collision with another docid
// then we increment the last 8 bits or so. see Msg22.cpp.
int64_t m_probDocId;
//int64_t m_probDocId;
int32_t m_reservedc1;
int32_t m_reservedc2;

//int32_t m_parentPubDate;

@@ -1153,6 +1155,9 @@ class SpiderColl {
int32_t m_tailHopCount;
int64_t m_minFutureTimeMS;

int32_t m_numSuccessReplies;
int32_t m_numFailedReplies;

// . do not re-send CrawlInfoLocal for a coll if not update
// . we store the flags in here as true if we should send our
// CrawlInfoLocal for this coll to this hostId
Test.cpp (2 changed lines)
@@ -932,7 +932,7 @@ bool Test::injectLoop ( ) {
m_sreq.m_domHash32 = fakeIp;
m_sreq.m_hostHash32 = fakeIp;
m_sreq.m_siteHash32 = fakeIp;
m_sreq.m_probDocId = g_titledb.getProbableDocId( m_sreq.m_url );
//m_sreq.m_probDocId = g_titledb.getProbableDocId( m_sreq.m_url );
// this crap is fake
m_sreq.m_isInjecting = 1;
// use test-spider subdir for storing pages and spider times?
XmlDoc.cpp (365 changed lines)
@@ -189,6 +189,10 @@ static int64_t s_lastTimeStart = 0LL;

void XmlDoc::reset ( ) {

m_ipStartTime = 0;
m_ipEndTime = 0;
m_diffbotReplyRetries = 0;

m_isImporting = false;

m_printedMenu = false;
@@ -13106,6 +13110,10 @@ int32_t *XmlDoc::getIp ( ) {
// update status msg
setStatus ( "getting ip" );

m_ipStartTime = 0;
// assume the same in case we get it right away
m_ipEndTime = 0;

// if set from docid and recycling
if ( m_recycleContent ) {
// get the old xml doc from the old title rec
@@ -13214,6 +13222,8 @@ int32_t *XmlDoc::getIp ( ) {
// update status msg
setStatus ( "getting ip" );

m_ipStartTime = gettimeofdayInMillisecondsGlobal();

// assume valid! if reply handler gets g_errno set then m_masterLoop
// should see that and call the final callback
//m_ipValid = true;
@@ -13232,6 +13242,9 @@ int32_t *XmlDoc::getIp ( ) {
void gotIpWrapper ( void *state , int32_t ip ) {
// point to us
XmlDoc *THIS = (XmlDoc *)state;

THIS->m_ipEndTime = gettimeofdayInMillisecondsGlobal();

// wrap it up
THIS->gotIp ( true );
// . call the master callback
@@ -14307,11 +14320,13 @@ void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
// m_diffbotReplyValid to true, below.
THIS->m_diffbotReplyError = 0;
log("buld: retrying diffbot reply");
THIS->m_diffbotReplyRetries++;
// resume. this checks g_errno for being set.
THIS->m_masterLoop ( THIS->m_masterState );
return;
}

THIS->m_diffbotReplyEndTime = gettimeofdayInMillisecondsGlobal();

//char *buf = s->m_readBuf;
// do not allow TcpServer.cpp to free it since m_diffbotReply
@@ -15454,6 +15469,8 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
"diffbot: getting %s headers=%s",m_diffbotUrl.getBufStart(),
additionalHeaders);

m_diffbotReplyStartTime = gettimeofdayInMillisecondsGlobal();

if ( ! g_httpServer.getDoc ( m_diffbotUrl.getBufStart() ,
0 , // ip
0 , // offset
@@ -15930,6 +15947,8 @@ char **XmlDoc::getHttpReply2 ( ) {
char *xx=NULL;*xx=0;
}

m_downloadStartTimeValid = true;
m_downloadStartTime = gettimeofdayInMillisecondsGlobal();

if ( ! m_msg13.getDoc ( r , isTestColl,this , gotHttpReplyWrapper ) )
// return -1 if blocked
@@ -20092,6 +20111,10 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
sb->safePrintf("siterank=%"INT32" ", sr );
}

if ( m_sreqValid )
sb->safePrintf("pageinlinks=%04"INT32" ",
m_sreq.m_pageNumInlinks);

// int16_tcut
int64_t uh48 = hash64b ( m_firstUrl.m_url );
// mask it
@@ -25494,7 +25517,7 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
// now we need this so we can share Msg12 spider locks with
// query reindex docid-based spider requests. that way
// we do not spider the same document at the same time.
ksr.m_probDocId = g_titledb.getProbableDocId(&url);
//ksr.m_probDocId = g_titledb.getProbableDocId(&url);

//ksr.m_pageNumInlinks = 0;

@@ -27046,7 +27069,10 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList ( SpiderReply *reply ) {
return mbuf;
}

// the spider status doc
// . the spider status doc
// . TODO:
// usedProxy:1
// proxyIp:1.2.3.4
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {

setStatus ( "making spider reply meta list");
@@ -27070,6 +27096,21 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
unsigned char *hc = (unsigned char *)getHopCount();
if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;

int32_t *priority = getSpiderPriority();
if ( ! priority || priority == (void *)-1 ) return (SafeBuf *)priority;

int32_t *ufn = getUrlFilterNum();
if ( ! ufn || ufn == (void *)-1 ) return (SafeBuf *)ufn;

CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;

// sanity
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }

// why isn't gbhopcount: being indexed consistently?
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }

// reset just in case
m_spiderStatusDocMetaList.reset();

@@ -27082,30 +27123,230 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
return &m_spiderStatusDocMetaList;
}

// the old doc
XmlDoc *od = NULL;
if ( m_oldDocValid && m_oldDoc ) od = m_oldDoc;

Url *fu = &m_firstUrl;

// . make a little json doc that we'll hash up
// . only index the fields in this doc, no extra gbdocid: inurl:
// hash terms
SafeBuf jd;
jd.safePrintf("{\n");

// so type:status query works
jd.safePrintf("\"type\":\"status\",\n");

jd.safePrintf("\"gbssUrl\":\"%s\",\n" , fu->getUrl() );

if ( ptr_redirUrl )
jd.safePrintf("\"gbssFinalRedirectUrl\":\"%s\",\n",
ptr_redirUrl);


jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);

jd.safePrintf("\"gbssStatusMsg\":\"");
jd.jsonEncode (mstrerror(m_indexCode));
jd.safePrintf("\",\n");

if ( m_httpStatusValid )
jd.safePrintf("\"gbssHttpStatus\":%"INT32",\n",
(int32_t)m_httpStatus);

if ( od )
jd.safePrintf("\"gbssPreviouslyIndexed\":1,\n");
else
jd.safePrintf("\"gbssPreviouslyIndexed\":0,\n");

jd.safePrintf("\"gbssDomain\":\"");
jd.safeMemcpy(fu->getDomain(), fu->getDomainLen() );
jd.safePrintf("\",\n");

jd.safePrintf("\"gbssSubdomain\":\"");
jd.safeMemcpy(fu->getHost(), fu->getHostLen() );
jd.safePrintf("\",\n");

//if ( m_redirUrlPtr && m_redirUrlValid )
jd.safePrintf("\"gbssNumRedirects\":%"INT32",\n",
m_numRedirects);

jd.safePrintf("\"gbssDocId\":%"INT64",\n", *uqd);

jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)*hc);

// crawlbot round
if ( cr->m_isCustomCrawl )
jd.safePrintf("\"gbssCrawlRound\":%"INT32",\n",
cr->m_spiderRoundNum);

if ( m_isDupValid && m_isDup )
jd.safePrintf("\"gbssDupOfDocId:%"INT64",\n",
m_docIdWeAreADupOf);

// how many spiderings were successful vs. failed
if ( m_sreqValid ) {
jd.safePrintf("\"gbssPrevTotalNumSpiderAttempts\":%"INT32",\n",
m_sreq.m_reservedc1 + m_sreq.m_reservedc2 );
jd.safePrintf("\"gbssPrevTotalNumSpiderSuccesses\":%"INT32",\n",
m_sreq.m_reservedc1);
jd.safePrintf("\"gbssPrevTotalNumSpiderFailures\":%"INT32",\n",
m_sreq.m_reservedc2);
}

if ( m_firstIndexedDateValid )
jd.safePrintf("\"gbssFirstIndexed\":%"UINT32",\n",
m_firstIndexedDate);

if ( m_contentHash32Valid )
jd.safePrintf("\"gbssContentHash32\":%"UINT32",\n",
m_contentHash32);

if ( m_downloadStartTimeValid ) {
jd.safePrintf("\"gbssDownloadStartTimeMS\":%"INT64",\n",
m_downloadStartTime);
jd.safePrintf("\"gbssDownloadStartTime\":%"UINT32",\n",
(uint32_t)(m_downloadStartTime/1000));
}

if ( m_downloadEndTimeValid ) {
jd.safePrintf("\"gbssDownloadEndTimeMS\":%"INT64",\n",
m_downloadEndTime);
jd.safePrintf("\"gbssDownloadEndTime\":%"UINT32",\n",
(uint32_t)(m_downloadEndTime/1000));
}

if ( m_downloadEndTimeValid ) {
int64_t took = m_downloadEndTime - m_downloadStartTime;
jd.safePrintf("\"gbssDownloadDurationMS\":%"INT64",\n",took);
}

jd.safePrintf("\"gbssUsedRobotsTxt\":%"INT32",\n",
m_useRobotsTxt);

//if ( m_numOutlinksAddedValid )
jd.safePrintf("\"gbssNumOutlinksAdded\":%"INT32",\n",
(int32_t)m_numOutlinksAdded);

// how many download/indexing errors we've had, including this one
// if applicable.
jd.safePrintf("\"gbssConsecutiveErrors\":%"INT32",\n",
m_srep.m_errCount);


if ( od )
jd.safePrintf("\"gbssLastSuccessfulDownloadEndTime\":"
"%"UINT32",\n",od->m_spideredTime);
else
jd.safePrintf("\"gbssLastSuccessfulDownloadEndTime\":"
"%"UINT32",\n",0);

if ( m_ipValid )
jd.safePrintf("\"gbssIp\":\"%s\",\n",iptoa(m_ip));
else
jd.safePrintf("\"gbssIp\":\"0.0.0.0\",\n");

if ( m_ipEndTime ) {
int64_t took = m_ipEndTime - m_ipStartTime;
jd.safePrintf("\"gbssIpLookupTimeMS\":%"INT64",\n",took);
}

if ( m_siteNumInlinksValid ) {
jd.safePrintf("\"gbssSiteNumInlinks\":%"INT32",\n",
(int32_t)m_siteNumInlinks);
char siteRank = getSiteRank();
jd.safePrintf("\"gbssSiteRank\":%"INT32",\n",
(int32_t)siteRank);
}

jd.safePrintf("\"gbssContentInjected\":%"INT32",\n",
(int32_t)m_contentInjected);

if ( m_percentChangedValid && od )
jd.safePrintf("\"gbssPercentContentChanged\""
":\"%.01f\"%%,\n",
m_percentChanged);

jd.safePrintf("\"gbssSpiderPriority\":%"INT32",\n",
*priority);

jd.safePrintf("\"gbssMatchingUrlFilter\":\"%s\",\n",
cr->m_regExs[*ufn].getBufStart());

if ( m_langIdValid )
jd.safePrintf("\"gbssLanguage\":\"%s\",\n",
getLangAbbr(m_langId));

if ( m_contentTypeValid )
jd.safePrintf("\"gbssContentType\":\"%s\",\n",
g_contentTypeStrings[m_contentType]);

if ( m_contentValid )
jd.safePrintf("\"gbssContentLen\":%"INT32",\n",
m_contentLen);

if ( m_crawlDelayValid )
// -1 if none?
jd.safePrintf("\"gbssCrawlDelayMS\":%"INT32",\n",
(int32_t)m_crawlDelay);

// sent to diffbot?
jd.safePrintf("\"gbssSentToDiffbot\":%i,\n",
(int)m_sentToDiffbot);

if ( m_diffbotReplyValid ) {
jd.safePrintf("\"gbssDiffbotReplyCode\":%"INT32",\n",
m_diffbotReplyError);
jd.safePrintf("\"gbssDiffbotReplyMsg\":\"");
jd.jsonEncode(mstrerror(m_diffbotReplyError));
jd.safePrintf("\",\n");
jd.safePrintf("\"gbssDiffbotReplyLen\":%"INT32",\n",
m_diffbotReply.length());
int64_t took = m_diffbotReplyEndTime - m_diffbotReplyStartTime;
jd.safePrintf("\"gbssDiffbotReplyResponseTimeMS\":%"INT64",\n",
took );
jd.safePrintf("\"gbssDiffbotReplyRetries\":%"INT32",\n",
m_diffbotReplyRetries );
jd.safePrintf("\"gbssDiffbotReplyNumObjects\":%"INT32",\n",
m_diffbotJSONCount);
}

// remove last ,\n
jd.incrementLength(-2);
// end the json spider status doc
jd.safePrintf("}\n");


// the posdb table
HashTableX tt4;
if ( !tt4.set(18,4,256,NULL,0,false,m_niceness,"posdb-spindx"))
return NULL;


Json jp;
if ( ! jp.parseJsonStringIntoJsonItems ( jd.getBufStart(),m_niceness)){
g_errno = EBADJSONPARSER;
return NULL;
}

// BEFORE ANY HASHING
int32_t savedDist = m_dist;
// re-set to 0
m_dist = 0;

// sanity
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }

// why isn't gbhopcount: being indexed consistently?
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }

// hash like gbstatus:"Tcp Timed out" or gbstatus:"Doc unchanged"
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = &tt4;
hi.m_desc = "json spider status object";
hi.m_useCountTable = false;
hi.m_useSections = false;

// fill up tt4. false -> do not hash without field prefixes.
hashJSONFields2 ( &tt4 , &hi , &jp , false );

/*
char buf[64];
int32_t bufLen;

@@ -27120,6 +27361,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
hi.m_desc = "spider error number as string";
bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)m_indexCode );
if ( ! hashString( buf , &hi ) ) return NULL;
*/

/*
logf(LOG_DEBUG,"url: %s",m_firstUrl.m_url);
@@ -27174,6 +27416,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
// was here....
*/

/*
// gbstatus:"tcp timed out"
hi.m_prefix = "gbstatusmsg";
hi.m_desc = "spider error msg";
@@ -27191,6 +27434,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {

// false --> do not hash the gbdoc* terms (CT_STATUS)
hashDateNumbers ( &tt4 , true );
*/

// store keys in safebuf then to make our own meta list
addTable144 ( &tt4 , *uqd , &m_spiderStatusDocMetaList );
@@ -27230,6 +27474,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
int32_t fullsize = &m_dummyEnd - (char *)this;
if ( fullsize > 2048 ) { char *xx=NULL;*xx=0; }

/*
// the ptr_* were all zero'd out, put the ones we want to keep back in
SafeBuf tmp;
// was "Spider Status: %s" but that is unnecessary
@@ -27242,6 +27487,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {

if ( m_redirUrlPtr && m_redirUrlValid )
tmp.safePrintf("Redirected to %s<br>",m_redirUrlPtr->getUrl());
*/

// put stats like we log out from logIt
//tmp.safePrintf("<div style=max-width:800px;>\n");
@@ -27250,8 +27496,10 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
//tmp.safePrintf("\n</div>");

// the content is just the title tag above
xd->ptr_utf8Content = tmp.getBufStart();
xd->size_utf8Content = tmp.length()+1;
// xd->ptr_utf8Content = tmp.getBufStart();
// xd->size_utf8Content = tmp.length()+1;
xd->ptr_utf8Content = jd.getBufStart();
xd->size_utf8Content = jd.length()+1;

// keep the same url as the doc we are the spider reply for
xd->ptr_firstUrl = ptr_firstUrl;
@@ -27423,7 +27671,7 @@ int32_t XmlDoc::getIndexedTime() {

// . hash dates for sorting by using gbsortby: and gbrevsortby:
// . do 'gbsortby:gbspiderdate' as your query to see this in action
bool XmlDoc::hashDateNumbers ( HashTableX *tt , bool isStatusDoc ) {
bool XmlDoc::hashDateNumbers ( HashTableX *tt ) { // , bool isStatusDoc ) {

// stop if already set
if ( ! m_spideredTimeValid ) return true;
@@ -27453,7 +27701,7 @@ bool XmlDoc::hashDateNumbers ( HashTableX *tt , bool isStatusDoc ) {
// do not index the rest if we are a "spider reply" document
// which is like a fake document for seeing spider statuses
//if ( isStatusDoc == CT_STATUS ) return true;
if ( isStatusDoc ) return true;
//if ( isStatusDoc ) return true;

// now for CT_STATUS spider status "documents" we also index
// gbspiderdate so index this so we can just do a
@@ -27873,7 +28121,7 @@ bool XmlDoc::hashLinksForLinkdb ( HashTableX *dt ) {

// . returns false and sets g_errno on error
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
bool XmlDoc::hashUrl ( HashTableX *tt ) { // , bool isStatusDoc ) {

setStatus ( "hashing url colon" );

@@ -27893,7 +28141,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
// append a "www." for doing url: searches
Url uw; uw.set ( fu->getUrl() , fu->getUrlLen() , true );
hi.m_prefix = "url";
if ( isStatusDoc ) hi.m_prefix = "url2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "url2";
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
return false;

@@ -27908,7 +28157,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
char *s = fu->getUrl ();
int32_t slen = fu->getUrlLen();
hi.m_prefix = "inurl";
if ( isStatusDoc ) hi.m_prefix = "inurl2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "inurl2";
if ( ! hashString ( s,slen, &hi ) ) return false;

setStatus ( "hashing ip colon" );
@@ -27923,7 +28173,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
//char *tmp = iptoa ( m_ip );
//int32_t tlen = gbstrlen(tmp);
hi.m_prefix = "ip";
if ( isStatusDoc ) hi.m_prefix = "ip2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "ip2";
if ( ! hashSingleTerm(ipbuf,iplen,&hi) ) return false;

//
@@ -27993,7 +28244,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
int32_t blen = sprintf(buf,"%"INT32"",pathDepth);
// update parms
hi.m_prefix = "gbpathdepth";
if ( isStatusDoc ) hi.m_prefix = "gbpathdepth2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbpathdepth2";
hi.m_hashGroup = HASHGROUP_INTAG;
// hash gbpathdepth:X
if ( ! hashString ( buf,blen,&hi) ) return false;
@@ -28008,7 +28260,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
blen = sprintf(buf,"%"INT32"",(int32_t)m_hopCount);
// update parms
hi.m_prefix = "gbhopcount";
if ( isStatusDoc ) hi.m_prefix = "gbhopcount2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbhopcount2";
hi.m_hashGroup = HASHGROUP_INTAG;
// hash gbpathdepth:X
if ( ! hashString ( buf,blen,&hi) ) return false;
@@ -28025,7 +28278,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
else hm = "0";
// update parms
hi.m_prefix = "gbhasfilename";
if ( isStatusDoc ) hi.m_prefix = "gbhasfilename2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbhasfilename2";
// hash gbhasfilename:[0|1]
if ( ! hashString ( hm,1,&hi) ) return false;

@@ -28037,7 +28291,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
if ( fu->isCgi() ) hm = "1";
else hm = "0";
hi.m_prefix = "gbiscgi";
if ( isStatusDoc ) hi.m_prefix = "gbiscgi2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbiscgi2";
if ( ! hashString ( hm,1,&hi) ) return false;


@@ -28051,7 +28306,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
if ( fu->getExtensionLen() ) hm = "1";
else hm = "0";
hi.m_prefix = "gbhasext";
if ( isStatusDoc ) hi.m_prefix = "gbhasext2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbhasext2";
if ( ! hashString ( hm,1,&hi) ) return false;

//
@@ -28096,7 +28352,8 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
*p = '\0';
// update hash parms
hi.m_prefix = "site";
if ( isStatusDoc ) hi.m_prefix = "site2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "site2";
hi.m_hashGroup = HASHGROUP_INURL;
// this returns false on failure
if ( ! hashSingleTerm (buf,p-buf,&hi ) ) return false;
@@ -28120,13 +28377,15 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
int32_t elen = fu->getExtensionLen();
// update hash parms
hi.m_prefix = "ext";
if ( isStatusDoc ) hi.m_prefix = "ext2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "ext2";
if ( ! hashSingleTerm(ext,elen,&hi ) ) return false;


setStatus ( "hashing gbdocid" );
hi.m_prefix = "gbdocid";
if ( isStatusDoc ) hi.m_prefix = "gbdocid2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbdocid2";
char buf2[32];
sprintf(buf2,"%"UINT64"",(m_docId) );
if ( ! hashSingleTerm(buf2,gbstrlen(buf2),&hi) ) return false;
@@ -28146,12 +28405,13 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
// append a "www." as part of normalization
uw.set ( fu->getUrl() , p - fu->getUrl() , true );
hi.m_prefix = "gbparenturl";
if ( isStatusDoc ) hi.m_prefix = "gbparenturl2";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbparenturl2";
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
return false;
}

if ( isStatusDoc ) return true;
//if ( isStatusDoc ) return true;

setStatus ( "hashing SiteGetter terms");

@@ -30054,7 +30314,9 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
// if we had a facet, get the values it has in the doc
if ( qs && *qs ) {
// need this for storeFacetValues() if we are json
if ( m_contentType == CT_JSON ) {
if ( m_contentType == CT_JSON ||
// spider status docs are really json
m_contentType == CT_STATUS ) {
Json *jp = getParsedJson();
if ( ! jp || jp == (void *)-1)
return (Msg20Reply *)jp;
@@ -30576,7 +30838,8 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
reply->size_gbAdIds = size_adVector;

// need full cached page of each search result?
if ( m_req->m_includeCachedCopy ) {
// include it always for spider status docs.
if ( m_req->m_includeCachedCopy || m_contentType == CT_STATUS ) {
reply-> ptr_content = ptr_utf8Content;
reply->size_content = size_utf8Content;
}
@@ -49681,7 +49944,9 @@ Json *XmlDoc::getParsedJson ( ) {
if ( m_jpValid ) return &m_jp;

// core if not a json object
if ( m_contentTypeValid && m_contentType != CT_JSON ) {
if ( m_contentTypeValid && m_contentType != CT_JSON &&
// spider status docs are now really json
m_contentType != CT_STATUS ) {
char *xx=NULL;*xx=0; }

// \0 terminated
@@ -49724,7 +49989,15 @@ char *XmlDoc::hashJSONFields ( HashTableX *table ) {
// use new json parser
Json *jp = getParsedJson();
if ( ! jp || jp == (void *)-1 ) return (char *)jp;

return hashJSONFields2 ( table , &hi , jp , true );
}


char *XmlDoc::hashJSONFields2 ( HashTableX *table ,
HashInfo *hi , Json *jp ,
bool hashWithoutFieldNames ) {

JsonItem *ji = jp->getFirstItem();

char nb[1024];
@@ -49788,17 +50061,17 @@ char *XmlDoc::hashJSONFields ( HashTableX *table ) {
// DIFFBOT special field hacks
//
char *name = nameBuf.getBufStart();
hi.m_hashGroup = HASHGROUP_BODY;
hi->m_hashGroup = HASHGROUP_BODY;
if ( strstr(name,"title") )
hi.m_hashGroup = HASHGROUP_TITLE;
hi->m_hashGroup = HASHGROUP_TITLE;
if ( strstr(name,"url") )
hi.m_hashGroup = HASHGROUP_INURL;
hi->m_hashGroup = HASHGROUP_INURL;
if ( strstr(name,"resolved_url") )
hi.m_hashGroup = HASHGROUP_INURL;
hi->m_hashGroup = HASHGROUP_INURL;
if ( strstr(name,"tags") )
hi.m_hashGroup = HASHGROUP_INTAG;
hi->m_hashGroup = HASHGROUP_INTAG;
if ( strstr(name,"meta") )
hi.m_hashGroup = HASHGROUP_INMETATAG;
hi->m_hashGroup = HASHGROUP_INMETATAG;
//
// now Json.cpp decodes and stores the value into
// a buffer, so ji->getValue() should be decoded completely
@@ -49845,7 +50118,7 @@ char *XmlDoc::hashJSONFields ( HashTableX *table ) {
// set EDOCUNCHANGED in ::getIndexCode() above.
//
/*
if ( hi.m_hashGroup != HASHGROUP_INURL ) {
if ( hi->m_hashGroup != HASHGROUP_INURL ) {
// make the content hash so we can set m_contentHash32
// for deduping
int32_t nh32 = hash32n ( name );
@@ -49858,28 +50131,31 @@ char *XmlDoc::hashJSONFields ( HashTableX *table ) {
*/

// index like "title:whatever"
hi.m_prefix = name;
hashString ( val , vlen , &hi );
hi->m_prefix = name;
hashString ( val , vlen , hi );

// hash gbfieldmatch:some.fieldInJson:"case-sens field Value"
if ( name )
hashFieldMatchTerm ( val , (int32_t)vlen , &hi );
hashFieldMatchTerm ( val , (int32_t)vlen , hi );

if ( ! hashWithoutFieldNames )
continue;

// hash without the field name as well
hi.m_prefix = NULL;
hashString ( val , vlen , &hi );
hi->m_prefix = NULL;
hashString ( val , vlen , hi );

/*
// a number? hash special then as well
if ( ji->m_type != JT_NUMBER ) continue;

// use prefix for this though
hi.m_prefix = name;
hi->m_prefix = name;

// hash as a number so we can sort search results by
// this number and do range constraints
float f = ji->m_valueDouble;
if ( ! hashNumber2 ( f , &hi ) )
if ( ! hashNumber2 ( f , hi ) )
return NULL;
*/
}
@@ -49986,7 +50262,8 @@ bool XmlDoc::storeFacetValues ( char *qs , SafeBuf *sb , FacetValHash_t fvh ) {
return storeFacetValuesSections ( qs , sb , fvh );

// if a json doc, get json field
if ( m_contentType == CT_JSON )
// spider status docs are really json now
if ( m_contentType == CT_JSON || m_contentType == CT_STATUS )
return storeFacetValuesJSON ( qs , sb , fvh );

if ( m_contentType == CT_HTML )
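Because CT_STATUS docs now flow through getParsedJson() and storeFacetValuesJSON() above, their fields facet like any other JSON doc's. A hypothetical facet query, assuming this build's gbfacetstr: operator:

    type:status gbfacetstr:gbssStatusMsg
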
XmlDoc.h (18 changed lines)
@@ -735,6 +735,16 @@ class XmlDoc {

char *getDiffbotParentUrl( char *myUrl );

int64_t m_diffbotReplyEndTime;
int64_t m_diffbotReplyStartTime;
int32_t m_diffbotReplyRetries;

uint64_t m_downloadStartTime;
//uint64_t m_downloadEndTime;

uint64_t m_ipStartTime;
uint64_t m_ipEndTime;

void copyFromOldDoc ( class XmlDoc *od ) ;

class SpiderReply *getFakeSpiderReply ( );
@@ -786,8 +796,8 @@ class XmlDoc {
bool hashContentType ( class HashTableX *table ) ;
bool hashDMOZCategories ( class HashTableX *table ) ;
bool hashLinks ( class HashTableX *table ) ;
bool hashUrl ( class HashTableX *table , bool isStatusDoc = false ) ;
bool hashDateNumbers ( class HashTableX *tt , bool isStatusDoc=false) ;
bool hashUrl ( class HashTableX *table );
bool hashDateNumbers ( class HashTableX *tt );
bool hashSections ( class HashTableX *table ) ;
bool hashIncomingLinkText ( class HashTableX *table ,
bool hashAnomalies ,
@@ -1149,6 +1159,7 @@ class XmlDoc {
char m_addedSpiderRequestSizeValid;
char m_addedSpiderReplySizeValid;
char m_addedStatusDocSizeValid;
char m_downloadStartTimeValid;
//char m_docQualityValid;
char m_siteValid;
char m_startTimeValid;
@@ -1716,6 +1727,9 @@ class XmlDoc {
bool doesPageContentMatchDiffbotProcessPattern() ;
int32_t *getDiffbotTitleHashes ( int32_t *numHashes ) ;
char *hashJSONFields ( HashTableX *table );
char *hashJSONFields2 ( HashTableX *table , HashInfo *hi , Json *jp ,
bool hashWithoutFieldNames ) ;

char *hashXMLFields ( HashTableX *table );
int32_t *reindexJSONObjects ( int32_t *newTitleHashes ,
int32_t numNewHashes ) ;