Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot
@@ -2081,6 +2081,8 @@ void testRegex ( ) {
 	rx = "(http://)?(www.)?vault.com/rankings-reviews/company-rankings/law/vault-law-100/\\.aspx\\?pg=[0-9]";

+	rx = ".*?article[0-9]*?.html";
+
 	regex_t ucr;
 	if ( regcomp ( &ucr , rx ,
@@ -2097,7 +2099,8 @@ void testRegex ( ) {

 	logf(LOG_DEBUG,"db: compiled '%s' for crawl pattern",rx);

-	char *url = "http://www.vault.com/rankings-reviews/company-rankings/law/vault-law-100/.aspx?pg=2";
+	//char *url = "http://www.vault.com/rankings-reviews/company-rankings/law/vault-law-100/.aspx?pg=2";
+	char *url = "http://staticpages.diffbot.com/testCrawl/regex/article1.html";

 	if ( regexec(&ucr,url,0,NULL,0) )
 		logf(LOG_DEBUG,"db: failed to match %s on %s",
@@ -69,7 +69,9 @@ public:
 	bool printJsonItemInCsv ( char *json , SafeBuf *sb ) ;

 	long long m_lastUh48;
+	long m_lastFirstIp;
 	long long m_prevReplyUh48;
+	long m_prevReplyFirstIp;
 	long m_prevReplyError;
 	time_t m_prevReplyDownloadTime;

@@ -247,7 +249,9 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
 	st->m_needHeaderRow = true;

 	st->m_lastUh48 = 0LL;
+	st->m_lastFirstIp = 0;
 	st->m_prevReplyUh48 = 0LL;
+	st->m_prevReplyFirstIp = 0;
 	st->m_prevReplyError = 0;
 	st->m_prevReplyDownloadTime = 0LL;

@@ -714,6 +718,7 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
 		else if ( srep->m_spideredTime > lastSpidered )
 			lastSpidered = srep->m_spideredTime;
 		m_prevReplyUh48 = srep->getUrlHash48();
+		m_prevReplyFirstIp = srep->m_firstIp;
 		// 0 means indexed successfully. not sure if
 		// this includes http status codes like 404 etc.
 		// i don't think it includes those types of errors!
@@ -734,11 +739,17 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){

 		// print the url if not yet printed
 		long long uh48 = sreq->getUrlHash48 ();
+		long firstIp = sreq->m_firstIp;
 		bool printIt = false;
 		// there can be multiple spiderrequests for the same url!
 		if ( m_lastUh48 != uh48 ) printIt = true;
+		// sometimes the same url has different firstips now that
+		// we have the EFAKEFIRSTIP spider error to avoid spidering
+		// seeds twice...
+		if ( m_lastFirstIp != firstIp ) printIt = true;
 		if ( ! printIt ) continue;
 		m_lastUh48 = uh48;
+		m_lastFirstIp = firstIp;

 		// make sure spiderreply is for the same url!
 		if ( srep && srep->getUrlHash48() != sreq->getUrlHash48() )
@@ -762,6 +773,7 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
 		// so set "status" to 0 to indicate hasn't been
 		// downloaded yet.
 		if ( m_lastUh48 != m_prevReplyUh48 ) status = 0;
+		if ( m_lastFirstIp != m_prevReplyFirstIp ) status = 0;
 		// if it matches, perhaps an error spidering it?
 		if ( status && m_prevReplyError ) status = -1;

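Note: the hunks above widen the CSV dump's dedup key from uh48 alone to the (uh48, firstIp) pair, since one url can now appear once under a fake firstip and again under the real one. A minimal standalone sketch of the last-seen-key pattern follows; the types and values are illustrative stand-ins, and like spiderdb it assumes records for the same url arrive adjacently.

#include <stdint.h>
#include <stdio.h>

struct Rec { int64_t uh48; int32_t firstIp; const char *url; };

int main ( ) {
	Rec recs[] = {
		{ 0x1111 , 0x7f000001 , "http://a.com/" } ,
		{ 0x1111 , 0x7f000001 , "http://a.com/" } , // same key: skipped
		{ 0x1111 , 0x0a000001 , "http://a.com/" } , // new firstip: printed
	};
	int64_t lastUh48 = 0LL;
	int32_t lastFirstIp = 0;
	for ( int i = 0 ; i < 3 ; i++ ) {
		bool printIt = false;
		if ( lastUh48    != recs[i].uh48    ) printIt = true;
		if ( lastFirstIp != recs[i].firstIp ) printIt = true;
		if ( ! printIt ) continue;
		lastUh48    = recs[i].uh48;
		lastFirstIp = recs[i].firstIp;
		printf ( "%s\n" , recs[i].url );
	}
	return 0;
}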
Spider.cpp (23 changes)
@@ -3169,6 +3169,21 @@ bool SpiderColl::scanSpiderdb ( bool needList ) {
 			continue;
 		}

+		// if the spiderrequest has a fake firstip that means it
+		// was injected without doing a proper ip lookup for speed.
+		// xmldoc.cpp will check for m_fakeFirstIp and if that is
+		// set in the spiderrequest it will simply add a new request
+		// with the correct firstip. it will be a completely different
+		// spiderrequest key then. so no need to keep the "fakes".
+		// it will log the EFAKEFIRSTIP error msg.
+		if ( sreq->m_fakeFirstIp &&
+		     srep &&
+		     srep->m_spideredTime > sreq->m_addedTime ) {
+			if ( g_conf.m_logDebugSpider )
+				log("spider: skipping6 %s", sreq->m_url);
+			continue;
+		}
+
 		// once we have a spiderreply, even i guess if its an error,
 		// for a url, then bail if respidering is disabled
 		if ( m_cr->m_isCustomCrawl &&
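Note: the skip rule added above, isolated as a predicate. The structs here are stand-ins carrying only the fields the hunk reads; the real SpiderRequest/SpiderReply are the repo's spiderdb record types.

#include <stdint.h>

struct SpiderRequest { bool m_fakeFirstIp; int32_t m_addedTime; };
struct SpiderReply   { int32_t m_spideredTime; };

// a fake-firstip request is obsolete once any reply newer than the
// request exists: by then xmldoc.cpp has re-added the url under its
// real firstip, so the scan can drop the fake without losing the url
static bool skipFakeFirstIp ( const SpiderRequest *sreq ,
                              const SpiderReply   *srep ) {
	return sreq->m_fakeFirstIp &&
	       srep &&
	       srep->m_spideredTime > sreq->m_addedTime;
}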
@@ -10315,6 +10330,14 @@ void dedupSpiderdbList ( RdbList *list , long niceness , bool removeNegRecs ) {
 		// and url has since been spidered, nuke it!
 		if ( sreq->m_urlIsDocId ) continue;

+		// same if indexcode was EFAKEFIRSTIP which XmlDoc.cpp
+		// re-adds to spiderdb with the right firstip. once
+		// those guys have a reply we can ignore them.
+		// TODO: what about diffbotxyz spider requests? those
+		// have a fakefirstip... they should not have requests
+		// though, since their parent url has that.
+		if ( sreq->m_fakeFirstIp ) continue;
+
 		SpiderReply *old = oldRep;
 		sreq->m_inGoogle = old->m_inGoogle;
 		sreq->m_hasAuthorityInlink = old->m_hasAuthorityInlink;
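Note: shape of the request-side filtering implied above. The surrounding list iteration is elided in the diff, so this loop is an assumed reconstruction over stand-in types, not the repo's code; here continue means the request is dropped from the deduped list.

#include <vector>

struct SpiderReply   { bool m_inGoogle; bool m_hasAuthorityInlink; };
struct SpiderRequest {
	bool m_urlIsDocId;
	bool m_fakeFirstIp;
	bool m_inGoogle;
	bool m_hasAuthorityInlink;
};

static void dedupRequests ( std::vector<SpiderRequest> &reqs ,
                            const SpiderReply *oldRep ) {
	std::vector<SpiderRequest> kept;
	for ( size_t i = 0 ; i < reqs.size() ; i++ ) {
		SpiderRequest *sreq = &reqs[i];
		// docid-based requests are one-shot: drop once spidered
		if ( sreq->m_urlIsDocId ) continue;
		// fake-firstip requests were re-added with the real ip
		if ( sreq->m_fakeFirstIp ) continue;
		// kept requests inherit what the newest reply learned
		sreq->m_inGoogle           = oldRep->m_inGoogle;
		sreq->m_hasAuthorityInlink = oldRep->m_hasAuthorityInlink;
		kept.push_back ( *sreq );
	}
	reqs.swap ( kept );
}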
XmlDoc.cpp (22 changes)
@@ -2022,6 +2022,9 @@ bool XmlDoc::indexDoc ( ) {

 	m_msg4Launched = true;

+	// display the url that had the error
+	logIt();
+
 	// log this for debug now
 	SafeBuf tmp;
 	nsr->print(&tmp);
@@ -2037,13 +2040,15 @@ bool XmlDoc::indexDoc ( ) {
 			    m_masterLoop ,
 			    m_niceness ) ) {
 		// spider hang bug
-		if ( g_conf.m_testSpiderEnabled )
-			logf(LOG_DEBUG,"build: msg4 meta add3 blocked"
-			     "msg4=0x%lx" ,(long)&m_msg4);
+		//if ( g_conf.m_testSpiderEnabled )
+		//	logf(LOG_DEBUG,"build: msg4 meta add3 blocked"
+		//	     "msg4=0x%lx" ,(long)&m_msg4);
 		m_msg4Waiting = true;
 		return false;
 	}

 	//logf(LOG_DEBUG,"build: msg4 meta add3 did NOT block" );

+	m_msg4Launched = false;
+
 	// all done
@@ -2068,7 +2073,9 @@ bool XmlDoc::indexDoc2 ( ) {

 	// do this before we increment pageDownloadAttempts below so that
 	// john's smoke tests, which use those counts, are not affected
-	if ( m_oldsrValid && m_oldsr.m_fakeFirstIp ) {
+	if ( m_oldsrValid && m_oldsr.m_fakeFirstIp &&
+	     // diffbot requests are ok though!
+	     ! strstr(m_oldsr.m_url,"-diffbotxyz") ) {
 		m_indexCodeValid = true;
 		m_indexCode = EFAKEFIRSTIP;
 		return true;
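Note: the widened guard above, isolated with stand-in fields. Urls carrying the "-diffbotxyz" marker are diffbot-generated subitems whose parent request already has the real firstip, so only non-diffbot fake-firstip requests short-circuit to EFAKEFIRSTIP.

#include <string.h>

struct OldSr { bool m_fakeFirstIp; const char *m_url; };

static bool shortCircuitFakeFirstIp ( bool oldsrValid , const OldSr &oldsr ) {
	return oldsrValid &&
	       oldsr.m_fakeFirstIp &&
	       // diffbot requests are ok though!
	       ! strstr ( oldsr.m_url , "-diffbotxyz" );
}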
@@ -7910,8 +7917,9 @@ char *XmlDoc::getIsDup ( ) {
 		//	continue;
 		//}
 		// for debug
-		log("build: doc %s is dup of doid %lli",
-		    m_firstUrl.m_url,d);
+		if ( d != m_docId )
+			log("build: doc %s is dup of doid %lli",
+			    m_firstUrl.m_url,d);
 		// get the winner
 		//if ( score > maxScore ) maxScore = score;
 		if ( sr > maxSiteRank || maxSiteRank == -1 ) {
@@ -17523,7 +17531,7 @@ bool XmlDoc::logIt ( ) {
 	// make queues in the case of hammering an ip, which i think
 	// it already does...
 	if ( m_oldsrValid && m_oldsr.m_firstIp != m_firstIp )
-		sb.safePrintf("fakesreqfirstip=%s ",iptoa(m_firstIp) );
+		sb.safePrintf("fakesreqfirstip=%s ",iptoa(m_oldsr.m_firstIp) );

 	//
 	// print when this spider request was added