Merge branch 'diffbot'

This commit is contained in:
Matt Wells
2014-05-28 09:15:48 -07:00
4 changed files with 42 additions and 7 deletions

@ -1390,7 +1390,7 @@ bool Msg25::sendRequests ( ) {
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
if ( ! cr ) {
log("linkdb: collnum %li is gone",(long)m_collnum);
log("linkdb: collnum %li is gone 1",(long)m_collnum);
return true;
}
//char *coll = cr->m_coll;
@ -2334,7 +2334,7 @@ bool Msg25::gotLinkText ( Msg20Request *req ) { // LinkTextReply *linkText ) {
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
if ( ! cr ) {
log("linkdb: collnum %li is gone",(long)m_collnum);
log("linkdb: collnum %li is gone 2",(long)m_collnum);
return true;
}
char *coll = cr->m_coll;

@ -2600,6 +2600,12 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
// so fix that shit here...
//float f = mr->m_lastSpidered;
//sb->safePrintf(",\"lastCrawlTimeUTC\":%.0f}",f);
// MDW: this is VERY convenient for debugging; please
// leave it in. we can easily see if a result
// should be there for a query like
// gbmin:gbspiderdate:12345678
sb->safePrintf(",\"lastCrawlTimeUTC\":%li",
mr->m_lastSpidered);
// also include a timestamp field with an RFC 1123 formatted date
char timestamp[50];
struct tm *ptm = gmtime ( &mr->m_lastSpidered );

@ -18462,6 +18462,25 @@ bool Parms::updateParm ( char *rec , WaitEntry *we ) {
if ( cr ) cr->m_needsSave = true;
//
// HACK
//
// special hack: if spidering is re-enabled, reset the last spider
// attempt time to 0 to avoid the "has no more urls to spider"
// msg followed by the reviving url msg.
if ( base == cr && dst == (char *)&cr->m_spideringEnabled )
cr->m_localCrawlInfo.m_lastSpiderAttempt = 0;
if ( base == &g_conf && dst == (char *)&g_conf.m_spideringEnabled ){
for(long i = 0;i<g_collectiondb.m_numRecs;i++){
CollectionRec *cr = g_collectiondb.m_recs[i];
if ( ! cr ) continue;
cr->m_localCrawlInfo.m_lastSpiderAttempt = 0;
}
}
//
// END HACK
//
// all done
return true;
}

@ -1300,7 +1300,9 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
// now we can have url-based page reindex requests because
// if we have a diffbot json object fake url reindex request
// we add a spider request of the PARENT url for it as page reindex
if ( is_digit ( sreq->m_url[0] ) ) {
//if ( is_digit ( sreq->m_url[0] ) ) {
// watch out for 0.r.msn.com!!
if ( sreq->m_urlIsDocId ) {
m_docId = atoll(sreq->m_url);
// assume its good
m_docIdValid = true;
@ -14246,15 +14248,12 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
else
m_diffbotUrl.pushChar('&');
// make sure that diffbot expands all objects
m_diffbotUrl.safePrintf("expand");
//diffbotUrl.safePrintf("http://54.212.86.74/api/%s?token=%s&u="
// only print token if we have one, because if user provides their
// own diffbot url (apiUrl in Parms.cpp) then they might include
// the token in that for their non-custom crawl. m_customCrawl=0.
if ( cr->m_diffbotToken.length())
m_diffbotUrl.safePrintf("&token=%s",
m_diffbotUrl.safePrintf("token=%s",
cr->m_diffbotToken.getBufStart());
m_diffbotUrl.safePrintf("&url=");
@ -14267,6 +14266,17 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
// then user provided parms that are dependent on if it is an
// article, product, etc. like "&dontstripads=1" or whatever
//diffbotUrl.safeStrcpy ( cr->m_diffbotApiQueryString.getBufStart());
// for analyze requests without mode=, make sure that diffbot expands all objects
// "expand" is not used for all crawls as of Defect #2292: User crawls should only index embedded objects if crawling with analyze
// null term it so that we can use strstr (shouldn't be necessary since safePrintf appears to do this already and is called above)
if (m_diffbotUrl.nullTerm()) {
char *u = m_diffbotUrl.getBufStart();
if (strstr(u, "/analyze") && !strstr(u, "mode=")) {
m_diffbotUrl.safePrintf("&expand");
}
}
// null term it
m_diffbotUrl.nullTerm();