Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing
Conflicts: Collectiondb.cpp
@@ -3383,24 +3383,93 @@ bool CollectionRec::hasSearchPermission ( TcpSocket *s , int32_t encapIp ) {

bool expandRegExShortcuts ( SafeBuf *sb ) ;
void nukeDoledb ( collnum_t collnum );

bool CollectionRec::rebuildUrlFiltersDiffbot() {

	//logf(LOG_DEBUG,"db: rebuilding url filters");

// rebuild the regexes related to diffbot, such as the one for the URL pattern
bool CollectionRec::rebuildDiffbotRegexes() {
	//logf(LOG_DEBUG,"db: rebuilding url filters");
	char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
	if ( ucp && ! ucp[0] ) ucp = NULL;

	// get the regexes
	if ( ! ucp ) ucp = m_diffbotUrlCrawlRegEx.getBufStart();
	if ( ucp && ! ucp[0] ) ucp = NULL;
	char *upp = m_diffbotUrlProcessPattern.getBufStart();
	if ( upp && ! upp[0] ) upp = NULL;

	if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
	if ( upp && ! upp[0] ) upp = NULL;
	char *ppp = m_diffbotPageProcessPattern.getBufStart();
	if ( ppp && ! ppp[0] ) ppp = NULL;

	// recompiling regexes starts now
	if ( m_hasucr ) {
		regfree ( &m_ucr );
		m_hasucr = false;
	}
	if ( m_hasupr ) {
		regfree ( &m_upr );
		m_hasupr = false;
	}

	// copy into tmpbuf
	SafeBuf tmp;
	char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
	if ( rx && ! rx[0] ) rx = NULL;
	if ( rx ) {
		tmp.reset();
		tmp.safeStrcpy ( rx );
		expandRegExShortcuts ( &tmp );
		m_hasucr = true;
	}
	if ( rx && regcomp ( &m_ucr , tmp.getBufStart() ,
			     REG_EXTENDED| //REG_ICASE|
			     REG_NEWLINE ) ) { // |REG_NOSUB) ) {
		// error!
		log("coll: regcomp %s failed: %s. "
		    "Ignoring.",
		    rx,mstrerror(errno));
		regfree ( &m_ucr );
		m_hasucr = false;
	}

	rx = m_diffbotUrlProcessRegEx.getBufStart();
	if ( rx && ! rx[0] ) rx = NULL;
	if ( rx ) m_hasupr = true;
	if ( rx ) {
		tmp.reset();
		tmp.safeStrcpy ( rx );
		expandRegExShortcuts ( &tmp );
		m_hasupr = true;
	}
	if ( rx && regcomp ( &m_upr , tmp.getBufStart() ,
			     REG_EXTENDED| // REG_ICASE|
			     REG_NEWLINE ) ) { // |REG_NOSUB) ) {
		// error!
		log("coll: regcomp %s failed: %s. "
		    "Ignoring.",
		    rx,mstrerror(errno));
		regfree ( &m_upr );
		m_hasupr = false;
	}
	return true;
}
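This hunk only compiles the diffbot URL patterns into m_ucr / m_upr; applying them to a candidate URL happens elsewhere in the codebase. A minimal sketch of how a compiled pattern would typically be used with the POSIX regexec() call (the helper name and parameters are illustrative, not part of this commit):

	#include <regex.h>

	// Illustrative helper, not from this diff: returns true when `url`
	// matches the compiled URL-crawl regex, or when no regex is configured.
	static bool urlMatchesCrawlRegex ( const regex_t *ucr ,
	                                   bool hasucr ,
	                                   const char *url ) {
		// no compiled regex means no restriction
		if ( ! hasucr ) return true;
		// regexec() returns 0 on a match, REG_NOMATCH otherwise
		return regexec ( ucr , url , 0 , NULL , 0 ) == 0;
	}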
bool CollectionRec::rebuildUrlFiltersDiffbot() {

	//logf(LOG_DEBUG,"db: rebuilding url filters");
	char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
	if ( ucp && ! ucp[0] ) ucp = NULL;

	// if we had a regex, that works for this purpose as well
	if ( ! ucp ) ucp = m_diffbotUrlCrawlRegEx.getBufStart();
	if ( ucp && ! ucp[0] ) ucp = NULL;

	char *upp = m_diffbotUrlProcessPattern.getBufStart();
	if ( upp && ! upp[0] ) upp = NULL;

	// if we had a regex, that works for this purpose as well
	if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
	if ( upp && ! upp[0] ) upp = NULL;

	char *ppp = m_diffbotPageProcessPattern.getBufStart();
	if ( ppp && ! ppp[0] ) ppp = NULL;

@@ -3469,7 +3538,6 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
		m_hasupr = false;
	}

	// what diffbot url to use for processing
	char *api = m_diffbotApiUrl.getBufStart();
	if ( api && ! api[0] ) api = NULL;

@@ -3869,9 +3937,15 @@ bool CollectionRec::rebuildUrlFilters ( ) {
	}

	// the code below is only for diffbot custom crawls
	if ( ! m_isCustomCrawl ) return true; //!= 1 && // crawl api
	// If the crawl is not generated by crawlbot, then we will just update
	// the regexes concerning the urls to process
	rebuildDiffbotRegexes();
	if ( ! m_isCustomCrawl ){
		return true;
	}

	// on the other hand, if it is a crawlbot job, then by convention the url filters are all set
	// to some default ones.
	return rebuildUrlFiltersDiffbot();
}

@@ -394,6 +394,9 @@ class CollectionRec {
	// for diffbot crawl or bulk jobs
	bool rebuildUrlFiltersDiffbot();

	// rebuild the regexes related to diffbot, such as the one for the URL pattern
	bool rebuildDiffbotRegexes();

	bool rebuildLangRules( char *lang , char *tld );

	bool rebuildShallowRules();
Msg40.cpp (93 changed lines)
@@ -6352,6 +6352,41 @@ bool Msg40::printFacetTables ( SafeBuf *sb ) {

	int32_t saved = sb->length();

	// If json, print beginning of json array
	if ( format == FORMAT_JSON ) {
		if ( m_si->m_streamResults ) {
			// if we are streaming results in json, we may have hacked off
			// the last ,\n so we need a comma to put it back
			bool needComma = true;

			// check if the last non-whitespace char in the
			// buffer is a comma
			for (int32_t i = sb->m_length-1; i >= 0; i--) {
				char c = sb->getBufStart()[i];
				if (c == '\n' || c == ' ') {
					// ignore whitespace chars
					continue;
				}

				// If the loop reaches this point, we have a
				// non-whitespace char, so we break the loop
				// either way
				if (c == ',') {
					// last non-whitespace char is a comma,
					// so we don't need to add an extra one
					needComma = false;
				}
				break;
			}

			if ( needComma ) {
				sb->safeStrcpy(",\n\n");
			}
		}
		sb->safePrintf("\"facets\":[");
	}

	int numTablesPrinted = 0;
	for ( int32_t i = 0 ; i < m_si->m_q.getNumTerms() ; i++ ) {
		// only for html for now i guess
		//if ( m_si->m_format != FORMAT_HTML ) break;
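The comma bookkeeping above exists because, when results are streamed, the serializer may already have trimmed the trailing ",\n" from the buffer. The same check can be expressed as a small standalone helper over a plain character buffer (a sketch for illustration only; the committed code works directly on SafeBuf as shown above):

	#include <stdint.h>

	// Sketch: true if a "," separator must be appended before the next JSON
	// member, i.e. the last non-whitespace character is not already a comma.
	static bool needsJsonComma ( const char *buf , int32_t len ) {
		for ( int32_t i = len - 1 ; i >= 0 ; i-- ) {
			char c = buf[i];
			if ( c == '\n' || c == ' ' ) continue; // skip whitespace
			return c != ',';                       // last real character
		}
		return true; // empty buffer: keep the conservative default
	}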
@@ -6363,10 +6398,25 @@ bool Msg40::printFacetTables ( SafeBuf *sb ) {
			continue;

		// if had facet ranges, print them out
		printFacetsForTable ( sb , qt );;

		if ( printFacetsForTable ( sb , qt ) > 0 )
			numTablesPrinted++;
	}

	// If json, print end of json array
	if ( format == FORMAT_JSON ) {
		if ( numTablesPrinted > 0 ) {
			sb->m_length -= 2; // hack off trailing comma
			sb->safePrintf("],\n"); // close off json array
		}
		// if no facets then do not print "facets":[]\n,
		else {
			// revert string buf to original length
			sb->m_length = saved;
			// and cap the string buf just in case
			sb->nullTerm();
		}
	}

	// if json, remove ending ,\n and make it just \n
	if ( format == FORMAT_JSON && sb->length() != saved ) {
		// remove ,\n

@@ -6387,7 +6437,7 @@ bool Msg40::printFacetTables ( SafeBuf *sb ) {
	return true;
}

bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
int32_t Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {

	//QueryWord *qw = qt->m_qword;
	//if ( qw->m_numFacetRanges > 0 )

@@ -6397,9 +6447,14 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
	int32_t *ptrs = (int32_t *)qt->m_facetIndexBuf.getBufStart();
	int32_t numPtrs = qt->m_facetIndexBuf.length() / sizeof(int32_t);

	if ( numPtrs == 0 )
		return 0;

	int32_t numPrinted = 0;

	// now scan the slots and print out
	HttpRequest *hr = &m_si->m_hr;
	bool firstTime = true;

	bool isString = false;
	bool isFloat = false;
	bool isInt = false;

@@ -6409,6 +6464,7 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
	char format = m_si->m_format;
	// a new table for each facet query term
	bool needTable = true;

	// print out the dumps
	for ( int32_t x = 0 ; x < numPtrs ; x++ ) {
		// skip empty slots

@@ -6516,7 +6572,9 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
			text = m_facetTextBuf.getBufStart() + *offset;
		}

		if ( format == FORMAT_XML ) {
			numPrinted++;
			sb->safePrintf("\t<facet>\n"
				       "\t\t<field>%s</field>\n"
				       , term );

@@ -6573,17 +6631,6 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
			continue;
		}

		if ( format == FORMAT_JSON && firstTime ) {
			firstTime = false;
			// if streaming results we may have hacked off
			// the last ,\n so put it back
			if ( m_si->m_streamResults ) {
				//sb->m_length -= 1;
				sb->safeStrcpy(",\n\n");
			}
			//sb->safePrintf("\"facets\":[\n");
		}

		// print that out
		if ( needTable && format == FORMAT_HTML ) {
			needTable = false;

@@ -6619,13 +6666,8 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
		}

		if ( needTable && format == FORMAT_JSON ) {
			needTable = false;
			sb->safePrintf("\"facets\":[");
		}

		if ( format == FORMAT_JSON ) {
			numPrinted++;
			sb->safePrintf("{\n"
				       "\t\"field\":\"%s\",\n"
				       , term

@@ -6779,6 +6821,8 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
		SafeBuf newUrl;
		replaceParm ( newStuff.getBufStart(), &newUrl , hr );

		numPrinted++;

		// print the facet in its numeric form
		// we will have to lookup based on its docid
		// and get it from the cached page later

@@ -6799,13 +6843,8 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
			       ,count); // count for printing
	}

	if ( ! needTable && format == FORMAT_JSON ) {
		sb->m_length -= 2; // hack off trailing comma
		sb->safePrintf("],\n"); // close off json array
	}

	if ( ! needTable && format == FORMAT_HTML )
		sb->safePrintf("</table></div><br>\n");

	return true;
	return numPrinted;
}
Msg40.h (2 changed lines)
@@ -227,7 +227,7 @@ class Msg40 {
	int32_t m_omitCount;

	bool printFacetTables ( class SafeBuf *sb ) ;
	bool printFacetsForTable ( SafeBuf *sb , QueryTerm *qt );
	int32_t printFacetsForTable ( SafeBuf *sb , QueryTerm *qt );
	bool lookupFacets ( ) ;
	void lookupFacets2 ( ) ;
	void gotFacetText ( class Msg20 *msg20 ) ;
Parms.cpp (22 changed lines)
@@ -22562,6 +22562,14 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
	       "You have to use the respider frequency as well "
	       "to adjust how often you want things respidered."
	       "</td></tr>"

	       "<tr class=poo><td>urlage</td>"
	       "<td>"
	       "This is the time, in seconds, since a url was first "
	       "added to spiderdb to be spidered. This is "
	       "its discovery date. "
	       "Can use <, >, <=, >=, ==, != comparison operators."
	       "</td></tr>"

	       //"<tr class=poo><td>!newoutlink</td>"

@@ -22584,6 +22592,20 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
	       "older permalinks into a slower spider queue."
	       "</td></tr>"

	       "<tr class=poo><td>spiderwaited < 3600</td>"
	       "<td>"
	       "<i>spiderwaited</i> is how many seconds have elapsed "
	       "since the last time "
	       "we tried to spider/download the url. "
	       "The constraint containing <i>spiderwaited</i> will "
	       "fail to be matched if the url has never been "
	       "attempted to be spidered/downloaded before. Therefore, "
	       "it will only ever match urls that have a spider reply "
	       "of some sort, so there is no need to add an additional "
	       "<i>hasreply</i>-based constraint."
	       "</td></tr>"

	       "<tr class=poo><td>"
	       "<a name=insitelist>"
	       "insitelist | !insitelist"
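For illustration (an assumed example, not text from this commit): these keywords combine with the existing expression syntax, and clauses can be chained with &&, which the urlage handler in Spider.cpp below skips over with strstr(s,"&&"). An expression such as "insitelist && urlage<86400" would match only urls on the site list that were discovered less than a day ago, while "spiderwaited>=3600" would match only urls whose last download attempt was at least an hour ago.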
Spider.cpp (61 changed lines)
@@ -4562,6 +4562,9 @@ bool SpiderColl::scanListForWinners ( ) {
				      spiderTimeMS ,
				      uh48 );

		// assume our added time is the first time this url was added
		sreq->m_discoveryTime = sreq->m_addedTime;

		// if ( uh48 == 110582802025376LL )
		// 	log("hey");

@@ -4591,10 +4594,12 @@ bool SpiderColl::scanListForWinners ( ) {
			// and the min added time as well!
			// get the oldest timestamp so
			// gbssDiscoveryTime will be accurate.
			if ( sreq->m_addedTime < wsreq->m_addedTime )
				wsreq->m_addedTime = sreq->m_addedTime;
			if ( wsreq->m_addedTime < sreq->m_addedTime )
				sreq->m_addedTime = wsreq->m_addedTime;
			if ( sreq->m_discoveryTime < wsreq->m_discoveryTime )
				wsreq->m_discoveryTime =
					sreq->m_discoveryTime;
			if ( wsreq->m_discoveryTime < sreq->m_discoveryTime )
				sreq->m_discoveryTime =
					wsreq->m_discoveryTime;
		}
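The four comparisons above just propagate the smaller timestamp in both directions, so after this block the winning request and the current request both carry the earliest m_addedTime and m_discoveryTime seen so far. A compact equivalent, as a sketch only (same variables as above, not the committed code):

	// illustrative condensation of the min-merge above
	int32_t minDisc = ( sreq->m_discoveryTime < wsreq->m_discoveryTime )
		? sreq->m_discoveryTime : wsreq->m_discoveryTime;
	sreq->m_discoveryTime = wsreq->m_discoveryTime = minDisc;
	// ... and the same pattern for m_addedTime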
@@ -12399,6 +12404,37 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
		goto checkNextRule;
	}

	// selector using the first time it was added to the Spiderdb
	// added by Sam, May 5th 2015
	if ( *p=='u' && strncmp(p,"urlage",6) == 0 ) {
		// skip for msg20
		if ( isForMsg20 ) {
			//log("was for message 20");
			continue;
		}
		// get the age of the spider_request.
		// (subtraction of uint with int, hope
		// everything goes well there)
		int32_t sreq_age = 0;
		if ( sreq ) sreq_age = nowGlobal-sreq->m_discoveryTime;
		//log("spiderage=%d",sreq_age);
		// the argument entered by user
		int32_t argument_age=atoi(s) ;
		if ( sign == SIGN_EQ && sreq_age != argument_age ) continue;
		if ( sign == SIGN_NE && sreq_age == argument_age ) continue;
		if ( sign == SIGN_GT && sreq_age <= argument_age ) continue;
		if ( sign == SIGN_LT && sreq_age >= argument_age ) continue;
		if ( sign == SIGN_GE && sreq_age < argument_age ) continue;
		if ( sign == SIGN_LE && sreq_age > argument_age ) continue;
		p = strstr(s, "&&");
		// if nothing else, then it is a match
		if ( ! p ) return i;
		// skip the '&&' and go to next rule
		p += 2;
		goto checkNextRule;
	}
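Worked example (illustrative numbers, not from the diff): if nowGlobal is 1430870400 and the request's m_discoveryTime is 1430784000, then sreq_age is 86400 seconds, i.e. one day. A rule written as "urlage>=86400" therefore matches, while "urlage<3600" hits the SIGN_LT continue and falls through to the next rule.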

	if ( *p=='e' && strncmp(p,"errorcount",10) == 0 ) {
		// if we do not have enough info for outlink, all done

@@ -12521,16 +12557,16 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
		// skip for msg20
		if ( isForMsg20 ) continue;
		// do not match rule if never attempted
		if ( srep->m_spideredTime == 0 ) {
			char*xx=NULL;*xx=0;}
		if ( srep->m_spideredTime == (uint32_t)-1){
			char*xx=NULL;*xx=0;}
		// int16_tcut
		float af = (srep->m_spideredTime - nowGlobal);
		// if ( srep->m_spideredTime == 0 ) {
		// 	char*xx=NULL;*xx=0;}
		// if ( srep->m_spideredTime == (uint32_t)-1){
		// 	char*xx=NULL;*xx=0;}
		// shortcut
		int32_t a = nowGlobal - srep->m_spideredTime;
		// make into days
		af /= (3600.0*24.0);
		//af /= (3600.0*24.0);
		// back to a int32_t, round it
		int32_t a = (int32_t)(af + 0.5);
		//int32_t a = (int32_t)(af + 0.5);
		// make it point to the priority
		int32_t b = atoi(s);
		// compare

@@ -13062,6 +13098,7 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
	// . if the same check who has the most recent added time
	// . if we are not the most recent, just do not add us
	// . no, now i want the oldest so we can do gbssDiscoveryTime
	//   and set sreq->m_discoveryTime accurately, above
	if ( sreq->m_addedTime >= oldReq->m_addedTime ) continue;
	// otherwise, erase over him
	dst = restorePoint;
Spider.h (9 changed lines)
@@ -522,10 +522,16 @@ class SpiderRequest {
	int32_t m_parentDomHash32;
	int32_t m_parentSiteHash32;

	// if there are several spiderrequests for a url, this should be
	// the earliest m_addedTime, basically, the url discovery time. this is
	// NOT valid in spiderdb, but only set upon selecting the url to spider
	// when we scan all of the SpiderRequests it has.
	int32_t m_discoveryTime;

	// the PROBABLE DOCID. if there is a collision with another docid
	// then we increment the last 8 bits or so. see Msg22.cpp.
	//int64_t m_probDocId;
	int32_t m_reservedc1;
	//int32_t m_reservedc1;
	int32_t m_reservedc2;

	//int32_t m_parentPubDate;

@@ -829,6 +835,7 @@ class SpiderReply {
	// a SpiderRec outright
	key128_t m_key;

	// this can be used for something else really. all SpiderReplies are fixed sz
	int32_t m_dataSize;

	// for calling getHostIdToDole()
XmlDoc.cpp (21 changed lines)
@@ -5362,7 +5362,8 @@ Dates *XmlDoc::getDates ( ) {
	     m_sreq.m_parentPrevSpiderTime ) {
		// pub date is somewhere between these two times
		minPubDate = m_sreq.m_parentPrevSpiderTime;
		maxPubDate = m_sreq.m_addedTime;
		//maxPubDate = m_sreq.m_addedTime;
		maxPubDate = m_sreq.m_discoveryTime;
	}

	// now set part2 , returns false and sets g_errno on error

@@ -20150,6 +20151,16 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
			       tmp,(uint32_t)m_sreq.m_addedTime);
	}

	// discovery date, first time spiderrequest was added to spiderdb
	if ( m_sreqValid && m_sreq.m_discoveryTime ) {
		time_t ts = m_sreq.m_discoveryTime;
		struct tm *timeStruct = gmtime ( &ts );
		char tmp[64];
		strftime ( tmp , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
		sb->safePrintf("discoverydate=%s(%"UINT32") ",
			       tmp,(uint32_t)m_sreq.m_discoveryTime);
	}

	// print first indexed time
	if ( m_firstIndexedDateValid ) {
		time_t ts = m_firstIndexedDate;

@@ -27456,13 +27467,15 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
		      cr->m_spiderRoundNum);

	// for -diffbotxyz fake docs addedtime is 0
	if ( m_sreqValid && m_sreq.m_addedTime != 0 ) {
	if ( m_sreqValid && m_sreq.m_discoveryTime != 0 ) {
		// in Spider.cpp we try to set m_sreq's m_addedTime to the
		// min of all the spider requests, and we try to ensure
		// that in the case of deduping we preserve the one with
		// the oldest time.
		// the oldest time. no, now we actually use
		// m_discoveryTime since we were using m_addedTime in
		// the url filters as it was originally intended.
		jd.safePrintf("\"gbssDiscoveredTime\":%"INT32",\n",
			      m_sreq.m_addedTime);
			      m_sreq.m_discoveryTime);
	}

	if ( m_isDupValid && m_isDup )