Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing
@@ -449,7 +449,7 @@ bool Collectiondb::addNewColl ( char *coll ,
 	// show the ban links in the search results. the
 	// collection name is cryptographic enough to show that
 	cr->m_isCustomCrawl = customCrawl;
-	cr->m_diffbotOnlyProcessIfNew = true;
+	cr->m_diffbotOnlyProcessIfNewUrl = true;
 	// default respider to off
 	cr->m_collectiveRespiderFrequency = 0.0;
 	cr->m_restrictDomain = true;
@@ -640,7 +640,7 @@ class CollectionRec {
 	long m_hasucr:1;
 	long m_hasupr:1;

-	char m_diffbotOnlyProcessIfNew;
+	char m_diffbotOnlyProcessIfNewUrl;

 	//SafeBuf m_diffbotClassify;
 	//char m_diffbotClassify;
@@ -2280,7 +2280,7 @@ bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
 		      , cx->m_maxToCrawl
 		      , cx->m_maxToProcess
 		      , (long)cx->m_restrictDomain
-		      , (long)cx->m_diffbotOnlyProcessIfNew
+		      , (long)cx->m_diffbotOnlyProcessIfNewUrl
 		      );
 	sb.safePrintf("\"seeds\":\"");
 	sb.safeUtf8ToJSON ( cx->m_diffbotSeeds.getBufStart());
@@ -3264,7 +3264,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,

 	char *isNewYes = "";
 	char *isNewNo = " checked";
-	if ( cr->m_diffbotOnlyProcessIfNew ) {
+	if ( cr->m_diffbotOnlyProcessIfNewUrl ) {
 		isNewYes = " checked";
 		isNewNo = "";
 	}
@@ -9299,7 +9299,7 @@ void Parms::init ( ) {
 	m->m_cgi = "onlyProcessIfNew";
 	m->m_xml = "diffbotOnlyProcessIfNew";
 	m->m_title = "onlyProcessIfNew";
-	m->m_off = (char *)&cr.m_diffbotOnlyProcessIfNew - x;
+	m->m_off = (char *)&cr.m_diffbotOnlyProcessIfNewUrl - x;
 	m->m_type = TYPE_BOOL;
 	m->m_page = PAGE_NONE;
 	m->m_def = "1";
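The m_off line above is how the generic parameter code binds a config setting to a CollectionRec member: it stores the member's byte offset from the record's base address, and the parser later writes the value through that offset on whichever record it is updating; the rename simply repoints the offset at the new member. A minimal sketch of that offset pattern, using hypothetical stand-in structs (Rec, Parm) rather than the real Parms API:

    #include <cstdio>

    // hypothetical stand-ins for CollectionRec and a parm descriptor
    struct Rec  { char m_onlyProcessIfNewUrl; long m_maxToCrawl; };
    struct Parm { const char *m_title; long m_off; };

    int main() {
            Rec cr;                    // sample record, like "cr" in Parms::init()
            char *x = (char *)&cr;     // base address, like "x" in the diff
            Parm p;
            p.m_title = "onlyProcessIfNew";
            // record the member's byte offset within the record
            p.m_off = (char *)&cr.m_onlyProcessIfNewUrl - x;
            // a generic setter can now write the field on ANY record:
            Rec other;
            *((char *)&other + p.m_off) = 1;
            printf("off=%ld val=%d\n", p.m_off, other.m_onlyProcessIfNewUrl);
            return 0;
    }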
XmlDoc.cpp
@@ -3042,8 +3042,8 @@ long *XmlDoc::getIndexCode2 ( ) {
 		return &m_indexCode;
 	}

-	// . i moved this up to perhaps fix problems of two dup pages being downloaded
-	//   at about the same time
+	// . i moved this up to perhaps fix problems of two dup pages being
+	//   downloaded at about the same time
 	// . are we a dup of another doc from any other site already indexed?
 	char *isDup = getIsDup();
 	if ( ! isDup || isDup == (char *)-1 ) return (long *)isDup;
@@ -3066,6 +3066,30 @@ long *XmlDoc::getIndexCode2 ( ) {
 		return &m_indexCode;
 	}

+	// was page unchanged since last time we downloaded it?
+	XmlDoc **pod = getOldXmlDoc ( );
+	if ( ! pod || pod == (XmlDoc **)-1 ) return (long *)pod;
+	XmlDoc *od = NULL;
+	if ( *pod ) od = *pod;
+	bool check = true;
+	if ( ! od ) check = false;
+	// do not do this logic for diffbot because it might want to get
+	// the diffbot reply even if page content is the same, because it
+	// might have an ajax call that updates the product price.
+	// onlyProcessIfNewUrl defaults to true, so typically even diffbot
+	// crawls will do this check.
+	if ( cr->m_isCustomCrawl && ! cr->m_diffbotOnlyProcessIfNewUrl )
+		check = false;
+	if ( check ) {
+		long *ch32 = getContentHash32();
+		if ( ! ch32 || ch32 == (void *)-1 ) return (long *)ch32;
+		if ( *ch32 == od->m_contentHash32 ) {
+			m_indexCode = EDOCUNCHANGED;
+			m_indexCodeValid = true;
+			return &m_indexCode;
+		}
+	}
+
 	// words
 	Words *words = getWords();
 	if ( ! words || words == (Words *)-1 ) return (long *)words;
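The block added here is the respider short-circuit: hash the freshly downloaded content to 32 bits and, if it matches the hash stored with the previously indexed copy, set EDOCUNCHANGED and skip the rest of the pipeline. A compact sketch of the decision, with a stand-in hash (EDOCUNCHANGED's real value and getContentHash32()'s real algorithm are not shown in this diff):

    #include <cstdint>
    #include <string>

    const long EDOCUNCHANGED = 28; // illustrative value only

    // FNV-1a, an illustrative stand-in for getContentHash32()
    static uint32_t hash32(const std::string &content) {
            uint32_t h = 2166136261u;
            for (unsigned char c : content) { h ^= c; h *= 16777619u; }
            return h;
    }

    // 0 = proceed with indexing; oldHash == 0 means "no prior copy"
    // (a sentinel chosen for this sketch, not Gigablast's convention)
    long indexCodeFor(const std::string &newContent, uint32_t oldHash,
                      bool isCustomCrawl, bool onlyProcessIfNewUrl) {
            bool check = (oldHash != 0);
            // diffbot crawls may want a fresh reply even for identical
            // HTML, e.g. an ajax call that updates a product price
            if (isCustomCrawl && !onlyProcessIfNewUrl) check = false;
            if (check && hash32(newContent) == oldHash) return EDOCUNCHANGED;
            return 0;
    }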
@@ -13384,7 +13408,8 @@ SafeBuf *XmlDoc::getDiffbotApiUrl ( ) {
 	return &m_diffbotApiUrl;
 }

-// if only processing NEW is enabled, then do not
+// if only processing NEW URLs is enabled, then do not get diffbot reply
+// if we already got one before
 bool *XmlDoc::getRecycleDiffbotReply ( ) {

 	if ( m_recycleDiffbotReplyValid )
@@ -13408,7 +13433,7 @@ bool *XmlDoc::getRecycleDiffbotReply ( ) {
 	// ***RECYCLE*** the diffbot reply!
 	m_recycleDiffbotReply = false;

-	if ( cr->m_diffbotOnlyProcessIfNew &&
+	if ( cr->m_diffbotOnlyProcessIfNewUrl &&
 	     od && od->m_gotDiffbotSuccessfulReply )
 		m_recycleDiffbotReply = true;

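With the rename, getRecycleDiffbotReply() reuses a stored diffbot reply only when the collection says not to reprocess already-seen URLs and the old copy of this URL actually got a successful reply. The rule in isolation, with hypothetical struct slices standing in for CollectionRec and the old XmlDoc:

    #include <cstdio>

    struct Coll   { bool m_diffbotOnlyProcessIfNewUrl; };
    struct OldDoc { bool m_gotDiffbotSuccessfulReply; };

    // recycle only if reprocessing is off AND a prior good reply exists
    bool recycleDiffbotReply(const Coll &cr, const OldDoc *od) {
            return cr.m_diffbotOnlyProcessIfNewUrl &&
                   od && od->m_gotDiffbotSuccessfulReply;
    }

    int main() {
            Coll cr = { true };
            OldDoc od = { true };
            printf("%d\n", recycleDiffbotReply(cr, &od));     // 1: recycle
            printf("%d\n", recycleDiffbotReply(cr, nullptr)); // 0: new url
            return 0;
    }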
@@ -13690,7 +13715,7 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {



-	// if already processed and onlyprocessifnew is enabled then
+	// if already processed and onlyprocessifnewurl is enabled then
 	// we recycle and do not bother with this, we also do not nuke
 	// the diffbot json objects we have already indexed by calling
 	// nukeJSONObjects()
@@ -13964,6 +13989,13 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
 	if ( headers.length() > 0 )
 		additionalHeaders = headers.getBufStart();

+	// if did not get the web page first and we are crawling, not
+	// doing a bulk, then core. we need the webpage to harvest links
+	// and sometimes to check the pageprocesspattern to see if we should
+	// process.
+	if ( cr->m_isCustomCrawl ==1 && ! m_downloadStatusValid ) {
+		char *xx=NULL;*xx=0; }
+
 	log("diffbot: getting %s headers=%s",m_diffbotUrl.getBufStart(),
 	    additionalHeaders);

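The char *xx=NULL;*xx=0; pair added here is the codebase's deliberate-crash idiom: writing through a null pointer segfaults on the spot ("then core"), so a crawl that reaches the diffbot call without first downloading the page dies loudly instead of proceeding. An equivalent check expressed with a standard assert (illustrative; the project itself uses the null-write idiom and the function name below is hypothetical):

    #include <cassert>

    // crash loudly if a (non-bulk) crawl asks for a diffbot reply
    // before the page download status is known
    void sanityCheckDownloadedFirst(bool isCustomCrawl,
                                    bool downloadStatusValid) {
            if (isCustomCrawl && !downloadStatusValid)
                    assert(!"diffbot reply requested before page download");
    }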
@@ -19704,10 +19736,10 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 	     // . then just add the SpiderReply to avoid respidering
 	     // . NO! still need to add outlinks
 	     //|| diffbotEmptyReply
-	     // . treat this as a temporary error i guess
-	     // . getNewSpiderReply() below will clear the error in it and
-	     //   copy stuff over from m_sreq and m_oldDoc for this case
-	     //|| *indexCode == EDOCUNCHANGED
+	     // . treat this as a temporary error i guess
+	     // . getNewSpiderReply() below will clear the error in it and
+	     //   copy stuff over from m_sreq and m_oldDoc for this case
+	     || *indexCode == EDOCUNCHANGED
 	     ) {
 		// sanity - in repair mode?
 		if ( m_useSecondaryRdbs ) { char *xx=NULL;*xx=0; }
@@ -19738,6 +19770,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 		m_metaList = (char *)0x1;
 		return m_metaList;
 	}
+	// save this
+	long savedCode = *indexCode;
 	// before getting our spider reply, assign crap from the old
 	// doc to us since we are unchanged! this will allow us to
 	// call getNewSpiderReply() without doing any processing, like
@@ -19745,12 +19779,16 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 	copyFromOldDoc ( od );
 	// need this though! i don't want to print out "Success"
 	// in the log in the logIt() function
-	m_indexCode = *indexCode;
+	m_indexCode = savedCode;
 	m_indexCodeValid = true;
 	// but set our m_contentHash32 from the spider request
 	// which got it from the spiderreply in the case of
 	// EDOCUNCHANGED. this way ch32=xxx will log correctly.
-	if ( *indexCode == EDOCUNCHANGED && m_sreqValid ) {
+	// I think this is only when EDOCUNCHANGED is set in the
+	// Msg13.cpp code, when we have a spider compression proxy.
+	if ( *indexCode == EDOCUNCHANGED &&
+	     m_sreqValid &&
+	     ! m_contentHash32Valid ) {
 		m_contentHash32 = m_sreq.m_contentHash32;
 		m_contentHash32Valid = true;
 	}
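The savedCode change appears to guard the index code across copyFromOldDoc(): *indexCode points at state the copy can overwrite, so the value is snapshotted into a local beforehand and written into m_indexCode afterwards. A toy model of that hazard, assuming (as the diff suggests but does not show) that the copy clobbers the pointed-to storage:

    #include <cassert>

    // toy model: m_indexCode is the storage *indexCode points at,
    // and copying state from the old doc overwrites it
    struct Doc {
            long m_indexCode;
            void copyFromOldDoc(const Doc &od) { m_indexCode = od.m_indexCode; }
    };

    int main() {
            Doc cur = { 28 };                // e.g. EDOCUNCHANGED
            Doc old = { 0 };
            long *indexCode = &cur.m_indexCode;
            long savedCode  = *indexCode;    // snapshot BEFORE the copy
            cur.copyFromOldDoc(old);         // clobbers *indexCode
            cur.m_indexCode = savedCode;     // restore, as the diff now does
            assert(cur.m_indexCode == 28);
            return 0;
    }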
@@ -19913,7 +19951,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 	}

 	// . should we recycle the diffbot reply for this url?
-	// . if m_diffbotOnlyProcessIfNew is true then we want to keep
+	// . if m_diffbotOnlyProcessIfNewUrl is true then we want to keep
 	//   our existing diffbot reply, i.e. recycle it, even though we
 	//   respidered this page.
 	bool *recycle = getRecycleDiffbotReply();