Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing

Conflicts:
	Collectiondb.cpp
Matt Wells
2015-05-06 09:58:51 -07:00
8 changed files with 248 additions and 53 deletions

@@ -3383,24 +3383,93 @@ bool CollectionRec::hasSearchPermission ( TcpSocket *s , int32_t encapIp ) {
bool expandRegExShortcuts ( SafeBuf *sb ) ;
void nukeDoledb ( collnum_t collnum );
bool CollectionRec::rebuildUrlFiltersDiffbot() {
//logf(LOG_DEBUG,"db: rebuilding url filters");
// rebuild the regexes related to diffbot, such as the one for the URL pattern
bool CollectionRec::rebuildDiffbotRegexes() {
//logf(LOG_DEBUG,"db: rebuilding url filters");
char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
// get the regexes
if ( ! ucp ) ucp = m_diffbotUrlCrawlRegEx.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
char *upp = m_diffbotUrlProcessPattern.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
char *ppp = m_diffbotPageProcessPattern.getBufStart();
if ( ppp && ! ppp[0] ) ppp = NULL;
// recompile the regexes now
if ( m_hasucr ) {
regfree ( &m_ucr );
m_hasucr = false;
}
if ( m_hasupr ) {
regfree ( &m_upr );
m_hasupr = false;
}
// copy into tmpbuf
SafeBuf tmp;
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasucr = true;
}
if ( rx && regcomp ( &m_ucr , tmp.getBufStart() ,
REG_EXTENDED| //REG_ICASE|
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_ucr );
m_hasucr = false;
}
rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasupr = true;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasupr = true;
}
if ( rx && regcomp ( &m_upr , tmp.getBufStart() ,
REG_EXTENDED| // REG_ICASE|
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_upr );
m_hasupr = false;
}
return true;
}
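
A minimal sketch (not part of this commit) of how the regexes compiled above would typically be consulted for a candidate url. The helper names urlMatchesCrawlPattern() and urlMatchesProcessPattern() are hypothetical; m_hasucr/m_ucr and m_hasupr/m_upr are the members rebuildDiffbotRegexes() sets.

#include <regex.h>

// returns true if the url passes the compiled url crawl regex,
// or if no crawl regex was compiled at all
bool urlMatchesCrawlPattern ( CollectionRec *cr , const char *url ) {
	// no compiled regex means no restriction
	if ( ! cr->m_hasucr ) return true;
	// regexec() returns 0 when the pattern matches
	return regexec ( &cr->m_ucr , url , 0 , NULL , 0 ) == 0;
}

// same idea for the url process regex
bool urlMatchesProcessPattern ( CollectionRec *cr , const char *url ) {
	if ( ! cr->m_hasupr ) return true;
	return regexec ( &cr->m_upr , url , 0 , NULL , 0 ) == 0;
}
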
bool CollectionRec::rebuildUrlFiltersDiffbot() {
//logf(LOG_DEBUG,"db: rebuilding url filters");
char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
// if we had a regex, that works for this purpose as well
if ( ! ucp ) ucp = m_diffbotUrlCrawlRegEx.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
char *upp = m_diffbotUrlProcessPattern.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
// if we had a regex, that works for this purpose as well
if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
char *ppp = m_diffbotPageProcessPattern.getBufStart();
if ( ppp && ! ppp[0] ) ppp = NULL;
@@ -3469,7 +3538,6 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_hasupr = false;
}
// what diffbot url to use for processing
char *api = m_diffbotApiUrl.getBufStart();
if ( api && ! api[0] ) api = NULL;
@@ -3869,9 +3937,15 @@ bool CollectionRec::rebuildUrlFilters ( ) {
}
// the code below is only for diffbot custom crawls
if ( ! m_isCustomCrawl ) return true; //!= 1 && // crawl api
// If the crawl is not generated by crawlbot, then we will just update
// the regexes concerning the urls to process
rebuildDiffbotRegexes();
if ( ! m_isCustomCrawl ){
return true;
}
// on the other hand, if it is a crawlbot job, then by convention the url filters are all set
// to some default ones.
return rebuildUrlFiltersDiffbot();
}

@@ -394,6 +394,9 @@ class CollectionRec {
// for diffbot crawl or bulk jobs
bool rebuildUrlFiltersDiffbot();
// rebuild the regexes related to diffbot, such as the one for the URL pattern
bool rebuildDiffbotRegexes();
bool rebuildLangRules( char *lang , char *tld );
bool rebuildShallowRules();

@@ -6352,6 +6352,41 @@ bool Msg40::printFacetTables ( SafeBuf *sb ) {
int32_t saved = sb->length();
// If json, print beginning of json array
if ( format == FORMAT_JSON ) {
if ( m_si->m_streamResults ) {
// if we are streaming results in json, we may have hacked off
// the last ,\n so we need a comma to put it back
bool needComma = true;
// check if the last non-whitespace char in the
// buffer is a comma
for (int32_t i= sb->m_length-1; i >= 0; i--) {
char c = sb->getBufStart()[i];
if (c == '\n' || c == ' ') {
// ignore whitespace chars
continue;
}
// If the loop reaches this point, we have a
// non-whitespace char, so we break the loop
// either way
if (c == ',') {
// last non-whitespace char is a comma,
// so we don't need to add an extra one
needComma = false;
}
break;
}
if ( needComma ) {
sb->safeStrcpy(",\n\n");
}
}
sb->safePrintf("\"facets\":[");
}
int numTablesPrinted = 0;
for ( int32_t i = 0 ; i < m_si->m_q.getNumTerms() ; i++ ) {
// only for html for now i guess
//if ( m_si->m_format != FORMAT_HTML ) break;
@@ -6363,10 +6398,25 @@ bool Msg40::printFacetTables ( SafeBuf *sb ) {
continue;
// if had facet ranges, print them out
printFacetsForTable ( sb , qt );;
if ( printFacetsForTable ( sb , qt ) > 0 )
numTablesPrinted++;
}
// If json, print end of json array
if ( format == FORMAT_JSON ) {
if ( numTablesPrinted > 0 ) {
sb->m_length -= 2; // hack off trailing comma
sb->safePrintf("],\n"); // close off json array
}
// if no facets then do not print "facets":[]\n,
else {
// revert string buf to original length
sb->m_length = saved;
// and cap the string buf just in case
sb->nullTerm();
}
}
// if json, remove ending ,\n and make it just \n
if ( format == FORMAT_JSON && sb->length() != saved ) {
// remove ,\n
@@ -6387,7 +6437,7 @@ bool Msg40::printFacetTables ( SafeBuf *sb ) {
return true;
}
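
A standalone sketch (not from this commit) of the comma-repair and array-wrapping logic used above when streaming JSON results, written with std::string in place of SafeBuf for brevity; appendFacetsArray() is a hypothetical helper.

#include <string>

// append a "facets" array to a json results buffer; when streaming, the
// buffer may or may not already end in a comma, and if there are no facet
// objects at all we print nothing (mirroring the revert-to-saved-length path)
static void appendFacetsArray ( std::string &out , const std::string &body ) {
	// no facet tables printed: leave the buffer untouched
	if ( body.empty() ) return;
	// find the last non-whitespace char already in the buffer
	size_t i = out.find_last_not_of ( " \n" );
	// streamed results may have had their trailing ",\n" hacked off,
	// so put a comma back only if one is not already there
	if ( i != std::string::npos && out[i] != ',' ) out += ",\n\n";
	out += "\"facets\":[";
	out += body;   // the individual facet objects, comma-separated
	out += "],\n";
}
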
bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
int32_t Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
//QueryWord *qw = qt->m_qword;
//if ( qw->m_numFacetRanges > 0 )
@@ -6397,9 +6447,14 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
int32_t *ptrs = (int32_t *)qt->m_facetIndexBuf.getBufStart();
int32_t numPtrs = qt->m_facetIndexBuf.length() / sizeof(int32_t);
if ( numPtrs == 0 )
return 0;
int32_t numPrinted = 0;
// now scan the slots and print out
HttpRequest *hr = &m_si->m_hr;
bool firstTime = true;
bool isString = false;
bool isFloat = false;
bool isInt = false;
@@ -6409,6 +6464,7 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
char format = m_si->m_format;
// a new table for each facet query term
bool needTable = true;
// print out the dumps
for ( int32_t x= 0 ; x < numPtrs ; x++ ) {
// skip empty slots
@@ -6516,7 +6572,9 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
text = m_facetTextBuf.getBufStart() + *offset;
}
if ( format == FORMAT_XML ) {
numPrinted++;
sb->safePrintf("\t<facet>\n"
"\t\t<field>%s</field>\n"
, term );
@@ -6573,17 +6631,6 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
continue;
}
if ( format == FORMAT_JSON && firstTime ) {
firstTime = false;
// if streaming results we may have hacked off
// the last ,\n so put it back
if ( m_si->m_streamResults ) {
//sb->m_length -= 1;
sb->safeStrcpy(",\n\n");
}
//sb->safePrintf("\"facets\":[\n");
}
// print that out
if ( needTable && format == FORMAT_HTML ) {
needTable = false;
@@ -6619,13 +6666,8 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
}
if ( needTable && format == FORMAT_JSON ) {
needTable = false;
sb->safePrintf("\"facets\":[");
}
if ( format == FORMAT_JSON ) {
numPrinted++;
sb->safePrintf("{\n"
"\t\"field\":\"%s\",\n"
, term
@@ -6779,6 +6821,8 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
SafeBuf newUrl;
replaceParm ( newStuff.getBufStart(), &newUrl , hr );
numPrinted++;
// print the facet in its numeric form
// we will have to lookup based on its docid
// and get it from the cached page later
@@ -6799,13 +6843,8 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
,count); // count for printing
}
if ( ! needTable && format == FORMAT_JSON ) {
sb->m_length -= 2; // hack off trailing comma
sb->safePrintf("],\n"); // close off json array
}
if ( ! needTable && format == FORMAT_HTML )
sb->safePrintf("</table></div><br>\n");
return true;
return numPrinted;
}

@@ -227,7 +227,7 @@ class Msg40 {
int32_t m_omitCount;
bool printFacetTables ( class SafeBuf *sb ) ;
bool printFacetsForTable ( SafeBuf *sb , QueryTerm *qt );
int32_t printFacetsForTable ( SafeBuf *sb , QueryTerm *qt );
bool lookupFacets ( ) ;
void lookupFacets2 ( ) ;
void gotFacetText ( class Msg20 *msg20 ) ;

@@ -22562,6 +22562,14 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
"You have to use the respider frequency as well "
"to adjust how often you want things respidered."
"</td></tr>"
"<tr class=poo><td>urlage</td>"
"<td>"
"This is the time, in seconds, since a url was first "
"added to spiderdb to be spidered. This is "
"its discovery date. "
"Can use <, >, <=, >=, ==, != comparison operators."
"</td></tr>"
//"<tr class=poo><td>!newoutlink</td>"
@@ -22584,6 +22592,20 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
"older permalinks into a slower spider queue."
"</td></tr>"
"<tr class=poo><td>spiderwaited &lt; 3600</td>"
"<td>"
"<i>spiderwaited</i> is how many seconds have elapsed "
"since the last time "
"we tried to spider/download the url. "
"The constaint containing <i>spiderwaited</i> will "
"fail to be matched if the url has never been "
"attempted to be spidered/downloaded before. Therefore, "
"it will only ever match urls that have a spider reply "
"of some sort, so there is no need to add an additional "
"<i>hasreply</i>-based constraint."
"</td></tr>"
"<tr class=poo><td>"
"<a name=insitelist>"
"insitelist | !insitelist"
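
Hypothetical example expressions (not part of this commit) using the two selectors documented above, written in the same syntax as the existing url filter examples; expressions can be combined with && as the filter parser below shows:

urlage > 86400
spiderwaited < 3600
urlage > 86400 && spiderwaited < 3600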

@@ -4562,6 +4562,9 @@ bool SpiderColl::scanListForWinners ( ) {
spiderTimeMS ,
uh48 );
// assume our added time is the first time this url was added
sreq->m_discoveryTime = sreq->m_addedTime;
// if ( uh48 == 110582802025376LL )
// log("hey");
@@ -4591,10 +4594,12 @@ bool SpiderColl::scanListForWinners ( ) {
// and the min added time as well!
// get the oldest timestamp so
// gbssDiscoveryTime will be accurate.
if ( sreq->m_addedTime < wsreq->m_addedTime )
wsreq->m_addedTime = sreq->m_addedTime;
if ( wsreq->m_addedTime < sreq->m_addedTime )
sreq->m_addedTime = wsreq->m_addedTime;
if ( sreq->m_discoveryTime < wsreq->m_discoveryTime )
wsreq->m_discoveryTime =
sreq->m_discoveryTime;
if ( wsreq->m_discoveryTime < sreq->m_discoveryTime )
sreq->m_discoveryTime =
wsreq->m_discoveryTime;
}
@@ -12399,6 +12404,37 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}
// selector using the first time it was added to the Spiderdb
// added by Sam, May 5th 2015
if ( *p=='u' && strncmp(p,"urlage",6) == 0 ) {
// skip for msg20
if ( isForMsg20 ) {
//log("was for message 20");
continue;
}
// get the age of the spider request.
// (subtraction of uint with int, hope
// everything goes well there)
int32_t sreq_age = 0;
if ( sreq ) sreq_age = nowGlobal-sreq->m_discoveryTime;
//log("spiderage=%d",sreq_age);
// the argument entered by the user
int32_t argument_age=atoi(s) ;
if ( sign == SIGN_EQ && sreq_age != argument_age ) continue;
if ( sign == SIGN_NE && sreq_age == argument_age ) continue;
if ( sign == SIGN_GT && sreq_age <= argument_age ) continue;
if ( sign == SIGN_LT && sreq_age >= argument_age ) continue;
if ( sign == SIGN_GE && sreq_age < argument_age ) continue;
if ( sign == SIGN_LE && sreq_age > argument_age ) continue;
p = strstr(s, "&&");
// if there is no '&&', then it is a match
if ( ! p ) return i;
// skip the '&&' and go to the next rule
p += 2;
goto checkNextRule;
}
if ( *p=='e' && strncmp(p,"errorcount",10) == 0 ) {
// if we do not have enough info for outlink, all done
@@ -12521,16 +12557,16 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
// skip for msg20
if ( isForMsg20 ) continue;
// do not match rule if never attempted
if ( srep->m_spideredTime == 0 ) {
char*xx=NULL;*xx=0;}
if ( srep->m_spideredTime == (uint32_t)-1){
char*xx=NULL;*xx=0;}
// int16_tcut
float af = (srep->m_spideredTime - nowGlobal);
// if ( srep->m_spideredTime == 0 ) {
// char*xx=NULL;*xx=0;}
// if ( srep->m_spideredTime == (uint32_t)-1){
// char*xx=NULL;*xx=0;}
// shortcut
int32_t a = nowGlobal - srep->m_spideredTime;
// make into days
af /= (3600.0*24.0);
//af /= (3600.0*24.0);
// back to a int32_t, round it
int32_t a = (int32_t)(af + 0.5);
//int32_t a = (int32_t)(af + 0.5);
// make it point to the priority
int32_t b = atoi(s);
// compare
@@ -13062,6 +13098,7 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
// . if the same check who has the most recent added time
// . if we are not the most recent, just do not add us
// . no, now i want the oldest so we can do gbssDiscoveryTime
// and set sreq->m_discoveryTime accurately, above
if ( sreq->m_addedTime >= oldReq->m_addedTime ) continue;
// otherwise, erase over him
dst = restorePoint;

@@ -522,10 +522,16 @@ class SpiderRequest {
int32_t m_parentDomHash32;
int32_t m_parentSiteHash32;
// if there are several spiderrequests for a url, this should be
// the earliest m_addedTime, basically, the url discovery time. this is
// NOT valid in spiderdb, but only set upon selecting the url to spider
// when we scan all of the SpiderRequests it has.
int32_t m_discoveryTime;
// the PROBABLE DOCID. if there is a collision with another docid
// then we increment the last 8 bits or so. see Msg22.cpp.
//int64_t m_probDocId;
int32_t m_reservedc1;
//int32_t m_reservedc1;
int32_t m_reservedc2;
//int32_t m_parentPubDate;
@@ -829,6 +835,7 @@ class SpiderReply {
// a SpiderRec outright
key128_t m_key;
// this can be used for something else really. all SpiderReplies are fixed sz
int32_t m_dataSize;
// for calling getHostIdToDole()

@@ -5362,7 +5362,8 @@ Dates *XmlDoc::getDates ( ) {
m_sreq.m_parentPrevSpiderTime ) {
// pub date is somewhere between these two times
minPubDate = m_sreq.m_parentPrevSpiderTime;
maxPubDate = m_sreq.m_addedTime;
//maxPubDate = m_sreq.m_addedTime;
maxPubDate = m_sreq.m_discoveryTime;
}
// now set part2 , returns false and sets g_errno on error
@@ -20150,6 +20151,16 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
tmp,(uint32_t)m_sreq.m_addedTime);
}
// discovery date, first time spiderrequest was added to spiderdb
if ( m_sreqValid && m_sreq.m_discoveryTime ) {
time_t ts = m_sreq.m_discoveryTime;
struct tm *timeStruct = gmtime ( &ts );
char tmp[64];
strftime ( tmp , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
sb->safePrintf("discoverydate=%s(%"UINT32") ",
tmp,(uint32_t)m_sreq.m_discoveryTime);
}
// print first indexed time
if ( m_firstIndexedDateValid ) {
time_t ts = m_firstIndexedDate;
@@ -27456,13 +27467,15 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
cr->m_spiderRoundNum);
// for -diffbotxyz fake docs addedtime is 0
if ( m_sreqValid && m_sreq.m_addedTime != 0 ) {
if ( m_sreqValid && m_sreq.m_discoveryTime != 0 ) {
// in Spider.cpp we try to set m_sreq's m_addedTime to the
// min of all the spider requests, and we try to ensure
// that in the case of deduping we preserve the one with
// the oldest time.
// the oldest time. no, now we actually use
// m_discoveryTime since we were using m_addedTime in
// the url filters as it was originally intended.
jd.safePrintf("\"gbssDiscoveredTime\":%"INT32",\n",
m_sreq.m_addedTime);
m_sreq.m_discoveryTime);
}
if ( m_isDupValid && m_isDup )