Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing

Matt Wells
2014-02-12 13:21:57 -08:00
5 changed files with 55 additions and 17 deletions

Collectiondb.cpp
@@ -449,7 +449,7 @@ bool Collectiondb::addNewColl ( char *coll ,
 	// show the ban links in the search results. the
 	// collection name is cryptographic enough to show that
 	cr->m_isCustomCrawl = customCrawl;
-	cr->m_diffbotOnlyProcessIfNew = true;
+	cr->m_diffbotOnlyProcessIfNewUrl = true;
 	// default respider to off
 	cr->m_collectiveRespiderFrequency = 0.0;
 	cr->m_restrictDomain = true;

Collectiondb.h
@@ -640,7 +640,7 @@ class CollectionRec {
 	long m_hasucr:1;
 	long m_hasupr:1;
-	char m_diffbotOnlyProcessIfNew;
+	char m_diffbotOnlyProcessIfNewUrl;
 	//SafeBuf m_diffbotClassify;
 	//char m_diffbotClassify;

PageCrawlBot.cpp
@@ -2280,7 +2280,7 @@ bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
 		     , cx->m_maxToCrawl
 		     , cx->m_maxToProcess
 		     , (long)cx->m_restrictDomain
-		     , (long)cx->m_diffbotOnlyProcessIfNew
+		     , (long)cx->m_diffbotOnlyProcessIfNewUrl
 		     );
 	sb.safePrintf("\"seeds\":\"");
 	sb.safeUtf8ToJSON ( cx->m_diffbotSeeds.getBufStart());
@@ -3264,7 +3264,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 	char *isNewYes = "";
 	char *isNewNo = " checked";
-	if ( cr->m_diffbotOnlyProcessIfNew ) {
+	if ( cr->m_diffbotOnlyProcessIfNewUrl ) {
 		isNewYes = " checked";
 		isNewNo = "";
 	}

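A note on the hunk above: the isNewYes/isNewNo pair is the idiom PageCrawlBot.cpp uses to render a yes/no radio-button pair with exactly one option pre-checked from the collection flag. A minimal sketch of the idiom, writing to stdout with printf instead of the real code's SafeBuf::safePrintf (the CGI parameter name matches the onlyProcessIfNew parm registered in Parms.cpp below):

    #include <cstdio>

    // Render yes/no radio buttons, checking the one that matches the flag.
    // Sketch only; the real code writes HTML into a SafeBuf, not stdout.
    static void printOnlyProcessIfNewRadio ( bool flagIsOn ) {
            const char *isNewYes = flagIsOn ? " checked" : "";
            const char *isNewNo  = flagIsOn ? ""         : " checked";
            printf("<input type=radio name=onlyProcessIfNew value=1%s> yes\n",
                   isNewYes);
            printf("<input type=radio name=onlyProcessIfNew value=0%s> no\n",
                   isNewNo);
    }

    int main () {
            printOnlyProcessIfNewRadio ( true );
            return 0;
    }
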
Parms.cpp
@@ -9299,7 +9299,7 @@ void Parms::init ( ) {
 	m->m_cgi = "onlyProcessIfNew";
 	m->m_xml = "diffbotOnlyProcessIfNew";
 	m->m_title = "onlyProcessIfNew";
-	m->m_off = (char *)&cr.m_diffbotOnlyProcessIfNew - x;
+	m->m_off = (char *)&cr.m_diffbotOnlyProcessIfNewUrl - x;
 	m->m_type = TYPE_BOOL;
 	m->m_page = PAGE_NONE;
 	m->m_def = "1";

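The m_off line above is the one place the member rename must be mirrored in the config system: Parms::init() stores each setting as a byte offset into CollectionRec and reads or writes the member through that offset at runtime, so a stale offset expression would silently point at the wrong field. A minimal sketch of that pointer arithmetic, with a stripped-down stand-in for CollectionRec (hypothetical fields, not the real struct):

    #include <cstdio>

    // Stand-in for CollectionRec with just two flags.
    struct CollRec {
            char m_someOtherFlag;
            char m_diffbotOnlyProcessIfNewUrl;
    };

    int main () {
            CollRec cr;
            char *x = (char *)&cr;
            // same arithmetic as "m->m_off = (char *)&cr.m_... - x;"
            long off = (char *)&cr.m_diffbotOnlyProcessIfNewUrl - x;
            // the parm code can later set the flag knowing only the offset
            *((char *)&cr + off) = 1;
            printf("off=%ld val=%d\n", off, cr.m_diffbotOnlyProcessIfNewUrl);
            return 0;
    }
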
XmlDoc.cpp
@@ -3042,8 +3042,8 @@ long *XmlDoc::getIndexCode2 ( ) {
 		return &m_indexCode;
 	}
-	// . i moved this up to perhaps fix problems of two dup pages being downloaded
-	//   at about the same time
+	// . i moved this up to perhaps fix problems of two dup pages being
+	//   downloaded at about the same time
 	// . are we a dup of another doc from any other site already indexed?
 	char *isDup = getIsDup();
 	if ( ! isDup || isDup == (char *)-1 ) return (long *)isDup;
@@ -3066,6 +3066,30 @@ long *XmlDoc::getIndexCode2 ( ) {
 		return &m_indexCode;
 	}
 	// was page unchanged since last time we downloaded it?
+	XmlDoc **pod = getOldXmlDoc ( );
+	if ( ! pod || pod == (XmlDoc **)-1 ) return (long *)pod;
+	XmlDoc *od = NULL;
+	if ( *pod ) od = *pod;
+	bool check = true;
+	if ( ! od ) check = false;
+	// do not do this logic for diffbot because it might want to get
+	// the diffbot reply even if page content is the same, because it
+	// might have an ajax call that updates the product price.
+	// onlyProcessIfNewUrl defaults to true, so typically even diffbot
+	// crawls will do this check.
+	if ( cr->m_isCustomCrawl && ! cr->m_diffbotOnlyProcessIfNewUrl )
+		check = false;
+	if ( check ) {
+		long *ch32 = getContentHash32();
+		if ( ! ch32 || ch32 == (void *)-1 ) return (long *)ch32;
+		if ( *ch32 == od->m_contentHash32 ) {
+			m_indexCode = EDOCUNCHANGED;
+			m_indexCodeValid = true;
+			return &m_indexCode;
+		}
+	}
 	// words
 	Words *words = getWords();
 	if ( ! words || words == (Words *)-1 ) return (long *)words;
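
The block added above is the core of the EDOCUNCHANGED optimization: hash the freshly downloaded content, compare it to the hash recorded for the previous download of the same URL, and bail out of indexing when they match, except for Diffbot custom crawls that have onlyProcessIfNewUrl turned off (those must reprocess even byte-identical pages, e.g. for ajax-updated prices). A self-contained sketch of the decision, using FNV-1a as a stand-in for Gigablast's real 32-bit content hash:

    #include <cstdint>
    #include <cstdio>

    // Stand-in 32-bit hash (FNV-1a); not Gigablast's getContentHash32().
    static uint32_t hash32 ( const char *s ) {
            uint32_t h = 2166136261u;
            for ( ; *s ; s++ ) { h ^= (uint8_t)*s; h *= 16777619u; }
            return h;
    }

    int main () {
            const char *oldContent = "<html>price: $9.99</html>";
            const char *newContent = "<html>price: $9.99</html>";
            bool isCustomCrawl       = false; // cr->m_isCustomCrawl
            bool onlyProcessIfNewUrl = true;  // cr->m_diffbotOnlyProcessIfNewUrl
            bool haveOldDoc          = true;  // od != NULL
            bool check = haveOldDoc;
            if ( isCustomCrawl && ! onlyProcessIfNewUrl ) check = false;
            if ( check && hash32(newContent) == hash32(oldContent) )
                    printf("EDOCUNCHANGED: skip reindexing\n");
            else
                    printf("content changed (or check disabled): index it\n");
            return 0;
    }
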
@@ -13384,7 +13408,8 @@ SafeBuf *XmlDoc::getDiffbotApiUrl ( ) {
 	return &m_diffbotApiUrl;
 }
-// if only processing NEW is enabled, then do not
+// if only processing NEW URLs is enabled, then do not get diffbot reply
+// if we already got one before
 bool *XmlDoc::getRecycleDiffbotReply ( ) {
 	if ( m_recycleDiffbotReplyValid )
@@ -13408,7 +13433,7 @@ bool *XmlDoc::getRecycleDiffbotReply ( ) {
 	// ***RECYCLE*** the diffbot reply!
 	m_recycleDiffbotReply = false;
-	if ( cr->m_diffbotOnlyProcessIfNew &&
+	if ( cr->m_diffbotOnlyProcessIfNewUrl &&
 	     od && od->m_gotDiffbotSuccessfulReply )
 		m_recycleDiffbotReply = true;
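
For reference, the full decision getRecycleDiffbotReply() makes after this change: reuse the old Diffbot reply only when the collection says to process new URLs only and the previous download of this URL already got a successful reply. A sketch of the predicate as a standalone function (an assumed simplification of the surrounding method, not its actual signature):

    #include <cstdio>

    // Assumed simplification of XmlDoc::getRecycleDiffbotReply().
    static bool recycleDiffbotReply ( bool onlyProcessIfNewUrl ,
                                      bool haveOldDoc ,
                                      bool oldDocGotDiffbotReply ) {
            return onlyProcessIfNewUrl && haveOldDoc && oldDocGotDiffbotReply;
    }

    int main () {
            // respidered url that diffbot already processed -> recycle (1)
            printf("%d\n", recycleDiffbotReply(true , true , true ));
            // flag off: reprocess even already-seen urls -> no recycle (0)
            printf("%d\n", recycleDiffbotReply(false, true , true ));
            // brand new url: nothing to recycle (0)
            printf("%d\n", recycleDiffbotReply(true , false, false));
            return 0;
    }
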
@@ -13690,7 +13715,7 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
-	// if already processed and onlyprocessifnew is enabled then
+	// if already processed and onlyprocessifnewurl is enabled then
 	// we recycle and do not bother with this, we also do not nuke
 	// the diffbot json objects we have already indexed by calling
 	// nukeJSONObjects()
@@ -13964,6 +13989,13 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
 	if ( headers.length() > 0 )
 		additionalHeaders = headers.getBufStart();
+	// if did not get the web page first and we are crawling, not
+	// doing a bulk, then core. we need the webpage to harvest links
+	// and sometimes to check the pageprocesspattern to see if we should
+	// process.
+	if ( cr->m_isCustomCrawl ==1 && ! m_downloadStatusValid ) {
+		char *xx=NULL;*xx=0; }
 	log("diffbot: getting %s headers=%s",m_diffbotUrl.getBufStart(),
 	    additionalHeaders);
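
The "char *xx=NULL;*xx=0;" added above is Gigablast's house assert: a deliberate null-pointer write that cores the process so the broken invariant (here: a custom crawl reached the Diffbot call without first downloading the page) is caught with a full stack in the core dump. A sketch of the same guard expressed with a standard assert (names illustrative):

    #include <cassert>

    // Invariant from the hunk above: real crawls (m_isCustomCrawl == 1,
    // as opposed to bulk jobs) must have a valid download status before
    // asking diffbot, since the page is needed for link harvesting and
    // pageprocesspattern checks.
    static void checkDownloadedFirst ( int isCustomCrawl ,
                                       bool downloadStatusValid ) {
            assert( ! ( isCustomCrawl == 1 && ! downloadStatusValid ) );
    }

    int main () {
            checkDownloadedFirst( 1 , true );  // crawl with download: ok
            checkDownloadedFirst( 0 , false ); // bulk job: ok too
            return 0;
    }
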
@@ -19704,10 +19736,10 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 	     // . then just add the SpiderReply to avoid respidering
 	     // . NO! still need to add outlinks
 	     //|| diffbotEmptyReply
-	     // . treat this as a temporary error i guess
-	     // . getNewSpiderReply() below will clear the error in it and
-	     //   copy stuff over from m_sreq and m_oldDoc for this case
-	     //|| *indexCode == EDOCUNCHANGED
+	     // . treat this as a temporary error i guess
+	     // . getNewSpiderReply() below will clear the error in it and
+	     //   copy stuff over from m_sreq and m_oldDoc for this case
+	     || *indexCode == EDOCUNCHANGED
 	     ) {
 		// sanity - in repair mode?
 		if ( m_useSecondaryRdbs ) { char *xx=NULL;*xx=0; }
@@ -19738,6 +19770,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 		m_metaList = (char *)0x1;
 		return m_metaList;
 	}
+	// save this
+	long savedCode = *indexCode;
 	// before getting our spider reply, assign crap from the old
 	// doc to us since we are unchanged! this will allow us to
 	// call getNewSpiderReply() without doing any processing, like
@@ -19745,12 +19779,16 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 	copyFromOldDoc ( od );
 	// need this though! i don't want to print out "Success"
 	// in the log in the logIt() function
-	m_indexCode = *indexCode;
+	m_indexCode = savedCode;
 	m_indexCodeValid = true;
 	// but set our m_contentHash32 from the spider request
 	// which got it from the spiderreply in the case of
 	// EDOCUNCHANGED. this way ch32=xxx will log correctly.
-	if ( *indexCode == EDOCUNCHANGED && m_sreqValid ) {
+	// I think this is only when EDOCUNCHANGED is set in the
+	// Msg13.cpp code, when we have a spider compression proxy.
+	if ( *indexCode == EDOCUNCHANGED &&
+	     m_sreqValid &&
+	     ! m_contentHash32Valid ) {
 		m_contentHash32 = m_sreq.m_contentHash32;
 		m_contentHash32Valid = true;
 	}
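
Why the savedCode dance in the two hunks above: indexCode points into this XmlDoc, and copyFromOldDoc(od) presumably copies fields from the previous version of the document over the very slot that pointer refers to, so the caller snapshots *indexCode before the copy and restores it afterwards; otherwise logIt() would print the old doc's status (e.g. "Success") instead of EDOCUNCHANGED. A sketch of the save/restore pattern under that assumption (hypothetical Doc struct, not the real XmlDoc):

    #include <cstdio>

    struct Doc { long m_indexCode; };

    // Assumed shape of the problem: "code" points into "d", and copying
    // from the old doc clobbers the very field the pointer refers to.
    static void copyFromOldDoc ( Doc *d , const Doc *od ) {
            d->m_indexCode = od->m_indexCode; // clobbers *code
    }

    int main () {
            Doc d  = { 123 /* EDOCUNCHANGED, say */ };
            Doc od = { 0   /* old doc said Success */ };
            long *code = &d.m_indexCode;
            long savedCode = *code;        // snapshot first...
            copyFromOldDoc ( &d , &od );
            d.m_indexCode = savedCode;     // ...then restore after the copy
            printf("indexCode=%ld\n", d.m_indexCode); // 123, not 0
            return 0;
    }
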
@@ -19913,7 +19951,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 	}
 	// . should we recycle the diffbot reply for this url?
-	// . if m_diffbotOnlyProcessIfNew is true then we want to keep
+	// . if m_diffbotOnlyProcessIfNewUrl is true then we want to keep
 	//   our existing diffbot reply, i.e. recycle it, even though we
 	//   respidered this page.
 	bool *recycle = getRecycleDiffbotReply();