Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing

Matt Wells
2014-02-12 13:21:57 -08:00
5 changed files with 55 additions and 17 deletions

Collectiondb.cpp
@@ -449,7 +449,7 @@ bool Collectiondb::addNewColl ( char *coll ,
 	// show the ban links in the search results. the
 	// collection name is cryptographic enough to show that
 	cr->m_isCustomCrawl = customCrawl;
-	cr->m_diffbotOnlyProcessIfNew = true;
+	cr->m_diffbotOnlyProcessIfNewUrl = true;
 	// default respider to off
 	cr->m_collectiveRespiderFrequency = 0.0;
 	cr->m_restrictDomain = true;

Collectiondb.h
@@ -640,7 +640,7 @@ class CollectionRec {
 	long m_hasucr:1;
 	long m_hasupr:1;
-	char m_diffbotOnlyProcessIfNew;
+	char m_diffbotOnlyProcessIfNewUrl;
 	//SafeBuf m_diffbotClassify;
 	//char m_diffbotClassify;

PageCrawlBot.cpp
@@ -2280,7 +2280,7 @@ bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
 		     , cx->m_maxToCrawl
 		     , cx->m_maxToProcess
 		     , (long)cx->m_restrictDomain
-		     , (long)cx->m_diffbotOnlyProcessIfNew
+		     , (long)cx->m_diffbotOnlyProcessIfNewUrl
 		     );
 	sb.safePrintf("\"seeds\":\"");
 	sb.safeUtf8ToJSON ( cx->m_diffbotSeeds.getBufStart());
@@ -3264,7 +3264,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 	char *isNewYes = "";
 	char *isNewNo = " checked";
-	if ( cr->m_diffbotOnlyProcessIfNew ) {
+	if ( cr->m_diffbotOnlyProcessIfNewUrl ) {
 		isNewYes = " checked";
 		isNewNo = "";
 	}

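A note on the hunk above: the isNewYes/isNewNo pair is the idiom PageCrawlBot.cpp uses to render a yes/no radio-button pair with exactly one option pre-checked from the collection flag. A minimal sketch of the idiom, writing to stdout with printf instead of the real code's SafeBuf::safePrintf (the CGI parameter name matches the onlyProcessIfNew parm registered in Parms.cpp below):

    #include <cstdio>

    // Render yes/no radio buttons, checking the one that matches the flag.
    // Sketch only; the real code writes HTML into a SafeBuf, not stdout.
    static void printOnlyProcessIfNewRadio ( bool flagIsOn ) {
            const char *isNewYes = flagIsOn ? " checked" : "";
            const char *isNewNo  = flagIsOn ? ""         : " checked";
            printf("<input type=radio name=onlyProcessIfNew value=1%s> yes\n",
                   isNewYes);
            printf("<input type=radio name=onlyProcessIfNew value=0%s> no\n",
                   isNewNo);
    }

    int main () {
            printOnlyProcessIfNewRadio ( true );
            return 0;
    }
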
Parms.cpp
@@ -9299,7 +9299,7 @@ void Parms::init ( ) {
 	m->m_cgi = "onlyProcessIfNew";
 	m->m_xml = "diffbotOnlyProcessIfNew";
 	m->m_title = "onlyProcessIfNew";
-	m->m_off = (char *)&cr.m_diffbotOnlyProcessIfNew - x;
+	m->m_off = (char *)&cr.m_diffbotOnlyProcessIfNewUrl - x;
 	m->m_type = TYPE_BOOL;
 	m->m_page = PAGE_NONE;
 	m->m_def = "1";

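The m_off line above is the one place the member rename must be mirrored in the config system: Parms::init() stores each setting as a byte offset into CollectionRec and reads or writes the member through that offset at runtime, so a stale offset expression would silently point at the wrong field. A minimal sketch of that pointer arithmetic, with a stripped-down stand-in for CollectionRec (hypothetical fields, not the real struct):

    #include <cstdio>

    // Stand-in for CollectionRec with just two flags.
    struct CollRec {
            char m_someOtherFlag;
            char m_diffbotOnlyProcessIfNewUrl;
    };

    int main () {
            CollRec cr;
            char *x = (char *)&cr;
            // same arithmetic as "m->m_off = (char *)&cr.m_... - x;"
            long off = (char *)&cr.m_diffbotOnlyProcessIfNewUrl - x;
            // the parm code can later set the flag knowing only the offset
            *((char *)&cr + off) = 1;
            printf("off=%ld val=%d\n", off, cr.m_diffbotOnlyProcessIfNewUrl);
            return 0;
    }
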
XmlDoc.cpp
@@ -3042,8 +3042,8 @@ long *XmlDoc::getIndexCode2 ( ) {
 		return &m_indexCode;
 	}
-	// . i moved this up to perhaps fix problems of two dup pages being downloaded
-	//   at about the same time
+	// . i moved this up to perhaps fix problems of two dup pages being
+	//   downloaded at about the same time
 	// . are we a dup of another doc from any other site already indexed?
 	char *isDup = getIsDup();
 	if ( ! isDup || isDup == (char *)-1 ) return (long *)isDup;
@@ -3066,6 +3066,30 @@ long *XmlDoc::getIndexCode2 ( ) {
 		return &m_indexCode;
 	}
 	// was page unchanged since last time we downloaded it?
+	XmlDoc **pod = getOldXmlDoc ( );
+	if ( ! pod || pod == (XmlDoc **)-1 ) return (long *)pod;
+	XmlDoc *od = NULL;
+	if ( *pod ) od = *pod;
+	bool check = true;
+	if ( ! od ) check = false;
+	// do not do this logic for diffbot because it might want to get
+	// the diffbot reply even if page content is the same, because it
+	// might have an ajax call that updates the product price.
+	// onlyProcessIfNewUrl defaults to true, so typically even diffbot
+	// crawls will do this check.
+	if ( cr->m_isCustomCrawl && ! cr->m_diffbotOnlyProcessIfNewUrl )
+		check = false;
+	if ( check ) {
+		long *ch32 = getContentHash32();
+		if ( ! ch32 || ch32 == (void *)-1 ) return (long *)ch32;
+		if ( *ch32 == od->m_contentHash32 ) {
+			m_indexCode = EDOCUNCHANGED;
+			m_indexCodeValid = true;
+			return &m_indexCode;
+		}
+	}
 	// words
 	Words *words = getWords();
 	if ( ! words || words == (Words *)-1 ) return (long *)words;
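
The block added above is the core of the EDOCUNCHANGED optimization: hash the freshly downloaded content, compare it to the hash recorded for the previous download of the same URL, and bail out of indexing when they match, except for Diffbot custom crawls that have onlyProcessIfNewUrl turned off (those must reprocess even byte-identical pages, e.g. for ajax-updated prices). A self-contained sketch of the decision, using FNV-1a as a stand-in for Gigablast's real 32-bit content hash:

    #include <cstdint>
    #include <cstdio>

    // Stand-in 32-bit hash (FNV-1a); not Gigablast's getContentHash32().
    static uint32_t hash32 ( const char *s ) {
            uint32_t h = 2166136261u;
            for ( ; *s ; s++ ) { h ^= (uint8_t)*s; h *= 16777619u; }
            return h;
    }

    int main () {
            const char *oldContent = "<html>price: $9.99</html>";
            const char *newContent = "<html>price: $9.99</html>";
            bool isCustomCrawl       = false; // cr->m_isCustomCrawl
            bool onlyProcessIfNewUrl = true;  // cr->m_diffbotOnlyProcessIfNewUrl
            bool haveOldDoc          = true;  // od != NULL
            bool check = haveOldDoc;
            if ( isCustomCrawl && ! onlyProcessIfNewUrl ) check = false;
            if ( check && hash32(newContent) == hash32(oldContent) )
                    printf("EDOCUNCHANGED: skip reindexing\n");
            else
                    printf("content changed (or check disabled): index it\n");
            return 0;
    }
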
@@ -13384,7 +13408,8 @@ SafeBuf *XmlDoc::getDiffbotApiUrl ( ) {
 	return &m_diffbotApiUrl;
 }
-// if only processing NEW is enabled, then do not
+// if only processing NEW URLs is enabled, then do not get diffbot reply
+// if we already got one before
 bool *XmlDoc::getRecycleDiffbotReply ( ) {
 	if ( m_recycleDiffbotReplyValid )
@@ -13408,7 +13433,7 @@ bool *XmlDoc::getRecycleDiffbotReply ( ) {
 	// ***RECYCLE*** the diffbot reply!
 	m_recycleDiffbotReply = false;
-	if ( cr->m_diffbotOnlyProcessIfNew &&
+	if ( cr->m_diffbotOnlyProcessIfNewUrl &&
 	     od && od->m_gotDiffbotSuccessfulReply )
 		m_recycleDiffbotReply = true;
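
For reference, the full decision getRecycleDiffbotReply() makes after this change: reuse the old Diffbot reply only when the collection says to process new URLs only and the previous download of this URL already got a successful reply. A sketch of the predicate as a standalone function (an assumed simplification of the surrounding method, not its actual signature):

    #include <cstdio>

    // Assumed simplification of XmlDoc::getRecycleDiffbotReply().
    static bool recycleDiffbotReply ( bool onlyProcessIfNewUrl ,
                                      bool haveOldDoc ,
                                      bool oldDocGotDiffbotReply ) {
            return onlyProcessIfNewUrl && haveOldDoc && oldDocGotDiffbotReply;
    }

    int main () {
            // respidered url that diffbot already processed -> recycle (1)
            printf("%d\n", recycleDiffbotReply(true , true , true ));
            // flag off: reprocess even already-seen urls -> no recycle (0)
            printf("%d\n", recycleDiffbotReply(false, true , true ));
            // brand new url: nothing to recycle (0)
            printf("%d\n", recycleDiffbotReply(true , false, false));
            return 0;
    }
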
@@ -13690,7 +13715,7 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
-	// if already processed and onlyprocessifnew is enabled then
+	// if already processed and onlyprocessifnewurl is enabled then
 	// we recycle and do not bother with this, we also do not nuke
 	// the diffbot json objects we have already indexed by calling
 	// nukeJSONObjects()
@@ -13964,6 +13989,13 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
 	if ( headers.length() > 0 )
 		additionalHeaders = headers.getBufStart();
+	// if did not get the web page first and we are crawling, not
+	// doing a bulk, then core. we need the webpage to harvest links
+	// and sometimes to check the pageprocesspattern to see if we should
+	// process.
+	if ( cr->m_isCustomCrawl ==1 && ! m_downloadStatusValid ) {
+		char *xx=NULL;*xx=0; }
 	log("diffbot: getting %s headers=%s",m_diffbotUrl.getBufStart(),
 	    additionalHeaders);
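
The "char *xx=NULL;*xx=0;" added above is Gigablast's house assert: a deliberate null-pointer write that cores the process so the broken invariant (here: a custom crawl reached the Diffbot call without first downloading the page) is caught with a full stack in the core dump. A sketch of the same guard expressed with a standard assert (names illustrative):

    #include <cassert>

    // Invariant from the hunk above: real crawls (m_isCustomCrawl == 1,
    // as opposed to bulk jobs) must have a valid download status before
    // asking diffbot, since the page is needed for link harvesting and
    // pageprocesspattern checks.
    static void checkDownloadedFirst ( int isCustomCrawl ,
                                       bool downloadStatusValid ) {
            assert( ! ( isCustomCrawl == 1 && ! downloadStatusValid ) );
    }

    int main () {
            checkDownloadedFirst( 1 , true );  // crawl with download: ok
            checkDownloadedFirst( 0 , false ); // bulk job: ok too
            return 0;
    }
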
@@ -19704,10 +19736,10 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 	     // . then just add the SpiderReply to avoid respidering
 	     // . NO! still need to add outlinks
 	     //|| diffbotEmptyReply
-	     // . treat this as a temporary error i guess
-	     // . getNewSpiderReply() below will clear the error in it and
-	     //   copy stuff over from m_sreq and m_oldDoc for this case
-	     //|| *indexCode == EDOCUNCHANGED
+	     // . treat this as a temporary error i guess
+	     // . getNewSpiderReply() below will clear the error in it and
+	     //   copy stuff over from m_sreq and m_oldDoc for this case
+	     || *indexCode == EDOCUNCHANGED
 	     ) {
 		// sanity - in repair mode?
 		if ( m_useSecondaryRdbs ) { char *xx=NULL;*xx=0; }
@@ -19738,6 +19770,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 		m_metaList = (char *)0x1;
 		return m_metaList;
 	}
+	// save this
+	long savedCode = *indexCode;
 	// before getting our spider reply, assign crap from the old
 	// doc to us since we are unchanged! this will allow us to
 	// call getNewSpiderReply() without doing any processing, like
@@ -19745,12 +19779,16 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 	copyFromOldDoc ( od );
 	// need this though! i don't want to print out "Success"
 	// in the log in the logIt() function
-	m_indexCode = *indexCode;
+	m_indexCode = savedCode;
 	m_indexCodeValid = true;
 	// but set our m_contentHash32 from the spider request
 	// which got it from the spiderreply in the case of
 	// EDOCUNCHANGED. this way ch32=xxx will log correctly.
-	if ( *indexCode == EDOCUNCHANGED && m_sreqValid ) {
+	// I think this is only when EDOCUNCHANGED is set in the
+	// Msg13.cpp code, when we have a spider compression proxy.
+	if ( *indexCode == EDOCUNCHANGED &&
+	     m_sreqValid &&
+	     ! m_contentHash32Valid ) {
 		m_contentHash32 = m_sreq.m_contentHash32;
 		m_contentHash32Valid = true;
 	}
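
Why the savedCode dance in the two hunks above: indexCode points into this XmlDoc, and copyFromOldDoc(od) presumably copies fields from the previous version of the document over the very slot that pointer refers to, so the caller snapshots *indexCode before the copy and restores it afterwards; otherwise logIt() would print the old doc's status (e.g. "Success") instead of EDOCUNCHANGED. A sketch of the save/restore pattern under that assumption (hypothetical Doc struct, not the real XmlDoc):

    #include <cstdio>

    struct Doc { long m_indexCode; };

    // Assumed shape of the problem: "code" points into "d", and copying
    // from the old doc clobbers the very field the pointer refers to.
    static void copyFromOldDoc ( Doc *d , const Doc *od ) {
            d->m_indexCode = od->m_indexCode; // clobbers *code
    }

    int main () {
            Doc d  = { 123 /* EDOCUNCHANGED, say */ };
            Doc od = { 0   /* old doc said Success */ };
            long *code = &d.m_indexCode;
            long savedCode = *code;        // snapshot first...
            copyFromOldDoc ( &d , &od );
            d.m_indexCode = savedCode;     // ...then restore after the copy
            printf("indexCode=%ld\n", d.m_indexCode); // 123, not 0
            return 0;
    }
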
@@ -19913,7 +19951,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 	}
 	// . should we recycle the diffbot reply for this url?
-	// . if m_diffbotOnlyProcessIfNew is true then we want to keep
+	// . if m_diffbotOnlyProcessIfNewUrl is true then we want to keep
 	//   our existing diffbot reply, i.e. recycle it, even though we
 	//   respidered this page.
 	bool *recycle = getRecycleDiffbotReply();