Merge branch 'diffbot'
This commit is contained in:
@ -1390,7 +1390,7 @@ bool Msg25::sendRequests ( ) {
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
|
||||
if ( ! cr ) {
|
||||
log("linkdb: collnum %li is gone",(long)m_collnum);
|
||||
log("linkdb: collnum %li is gone 1",(long)m_collnum);
|
||||
return true;
|
||||
}
|
||||
//char *coll = cr->m_coll;
|
||||
@ -2334,7 +2334,7 @@ bool Msg25::gotLinkText ( Msg20Request *req ) { // LinkTextReply *linkText ) {
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
|
||||
if ( ! cr ) {
|
||||
log("linkdb: collnum %li is gone",(long)m_collnum);
|
||||
log("linkdb: collnum %li is gone 2",(long)m_collnum);
|
||||
return true;
|
||||
}
|
||||
char *coll = cr->m_coll;
|
||||
|
@ -2600,6 +2600,12 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
|
||||
// so fix that shit here...
|
||||
//float f = mr->m_lastSpidered;
|
||||
//sb->safePrintf(",\"lastCrawlTimeUTC\":%.0f}",f);
|
||||
// MDW: this is VERY convenient for debugging pls
|
||||
// leave in. we can easily see if a result
|
||||
// should be there for a query like
|
||||
// gbmin:gbspiderdate:12345678
|
||||
sb->safePrintf(",\"lastCrawlTimeUTC\":%li",
|
||||
mr->m_lastSpidered);
|
||||
// also include a timestamp field with an RFC 1123 formatted date
|
||||
char timestamp[50];
|
||||
struct tm *ptm = gmtime ( &mr->m_lastSpidered );
|
||||
|
19
Parms.cpp
19
Parms.cpp
@ -18462,6 +18462,25 @@ bool Parms::updateParm ( char *rec , WaitEntry *we ) {
|
||||
|
||||
if ( cr ) cr->m_needsSave = true;
|
||||
|
||||
//
|
||||
// HACK
|
||||
//
|
||||
// special hack. if spidering re-enabled then reset last spider
|
||||
// attempt time to 0 to avoid the "has no more urls to spider"
|
||||
// msg followed by the reviving url msg.
|
||||
if ( base == cr && dst == (char *)&cr->m_spideringEnabled )
|
||||
cr->m_localCrawlInfo.m_lastSpiderAttempt = 0;
|
||||
if ( base == &g_conf && dst == (char *)&g_conf.m_spideringEnabled ){
|
||||
for(long i = 0;i<g_collectiondb.m_numRecs;i++){
|
||||
CollectionRec *cr = g_collectiondb.m_recs[i];
|
||||
if ( ! cr ) continue;
|
||||
cr->m_localCrawlInfo.m_lastSpiderAttempt = 0;
|
||||
}
|
||||
}
|
||||
//
|
||||
// END HACK
|
||||
//
|
||||
|
||||
// all done
|
||||
return true;
|
||||
}
|
||||
|
20
XmlDoc.cpp
20
XmlDoc.cpp
@ -1300,7 +1300,9 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
|
||||
// now we can have url-based page reindex requests because
|
||||
// if we have a diffbot json object fake url reindex request
|
||||
// we add a spider request of the PARENT url for it as page reindex
|
||||
if ( is_digit ( sreq->m_url[0] ) ) {
|
||||
//if ( is_digit ( sreq->m_url[0] ) ) {
|
||||
// watch out for 0.r.msn.com!!
|
||||
if ( sreq->m_urlIsDocId ) {
|
||||
m_docId = atoll(sreq->m_url);
|
||||
// assume it's good
|
||||
m_docIdValid = true;
|
||||
@ -14246,15 +14248,12 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
|
||||
else
|
||||
m_diffbotUrl.pushChar('&');
|
||||
|
||||
// make sure that diffbot expands all objects
|
||||
m_diffbotUrl.safePrintf("expand");
|
||||
|
||||
//diffbotUrl.safePrintf("http://54.212.86.74/api/%s?token=%s&u="
|
||||
// only print token if we have one, because if user provides their
|
||||
// own diffbot url (apiUrl in Parms.cpp) then they might include
|
||||
// the token in that for their non-custom crawl. m_customCrawl=0.
|
||||
if ( cr->m_diffbotToken.length())
|
||||
m_diffbotUrl.safePrintf("&token=%s",
|
||||
m_diffbotUrl.safePrintf("token=%s",
|
||||
cr->m_diffbotToken.getBufStart());
|
||||
|
||||
m_diffbotUrl.safePrintf("&url=");
|
||||
@ -14267,6 +14266,17 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
|
||||
// then user provided parms that are dependent on if it is an
|
||||
// article, product, etc. like "&dontstripads=1" or whatever
|
||||
//diffbotUrl.safeStrcpy ( cr->m_diffbotApiQueryString.getBufStart());
|
||||
|
||||
// for analyze requests without mode=, make sure that diffbot expands all objects
|
||||
// "expand" is not used for all crawls as of Defect #2292: User crawls should only index embedded objects if crawling with analyze
|
||||
// null-terminate it so that we can use strstr (shouldn't be necessary, since safePrintf appears to do this already and is called above)
|
||||
if (m_diffbotUrl.nullTerm()) {
|
||||
char *u = m_diffbotUrl.getBufStart();
|
||||
if (strstr(u, "/analyze") && !strstr(u, "mode=")) {
|
||||
m_diffbotUrl.safePrintf("&expand");
|
||||
}
|
||||
}
|
||||
|
||||
// null term it
|
||||
m_diffbotUrl.nullTerm();
|
||||
|
||||
|
Reference in New Issue
Block a user