Merge branch 'master' of github.com:privacore/open-source-search-engine

Author: Ivan Skytte Jørgensen
Date:   2016-03-17 21:54:47 +01:00
11 changed files with 236 additions and 39 deletions

@@ -3056,19 +3056,35 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
i++;
}
// try to fix bug of EBADURL when it wasn't really
// EBADURL is 32880
// this is a HACK!
m_regExs[i].set("errorcount==1 && errorcode==32880");
m_spiderPriorities [i] = 15;
m_spiderFreqs [i] = 0.1;
m_maxSpidersPerRule [i] = 1;
i++;
// 3rd rule for respidering
if ( respiderFreq > 0.0 ) {
m_regExs[i].set("lastspidertime>={roundstart}");
// do not "remove" from index
m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
//m_spidersEnabled [i] = 0;
m_maxSpidersPerRule[i] = 0;
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
// which has been obsoleted, but we are running old code now!
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
// if doing a one-shot crawl limit error retries to 3 times or
// if no urls currently available to spider, whichever comes first.
else {
m_regExs[i].set("errorcount>=3");
m_spiderPriorities [i] = 11;
m_spiderFreqs [i] = 0.0416;
m_maxSpidersPerRule [i] = 0; // turn off spiders
i++;
}
m_regExs[i].set("errorcount>=1 && !hastmperror");
m_spiderPriorities [i] = 14;
m_spiderFreqs [i] = 0.0;
m_maxSpidersPerRule [i] = 0; // turn off spiders if not tmp error
m_spiderFreqs [i] = 0.0416; // every hour
//m_maxSpidersPerRule [i] = 0; // turn off spiders if not tmp error
i++;
// and for docs that have errors respider once every 5 hours
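
The block above appends url-filter rules with a simple parallel-array pattern: each slot i is described by the same index across m_regExs, m_spiderPriorities, m_spiderFreqs (in days) and m_maxSpidersPerRule. A minimal sketch of that pattern, using simplified standalone arrays and an illustrative helper rather than the real CollectionRec members:

// Sketch only: standalone stand-ins for the CollectionRec arrays; the
// struct and helper names are illustrative, not part of the codebase.
#include <cstdint>
#include <string>

struct UrlFilterTable {
	std::string regExs[96];      // filter expression, e.g. "errorcount>=3"
	int32_t     priorities[96];  // spider priority for matching urls
	float       freqs[96];       // respider frequency in days
	int32_t     maxSpiders[96];  // 0 turns spidering off for the rule
};

// append one rule and advance the shared index, mirroring the repeated
// "m_regExs[i].set(...); ...; i++;" blocks above
static int32_t addRule ( UrlFilterTable &t , int32_t i , const char *expr ,
                         int32_t prio , float freqDays , int32_t maxSpiders ) {
	t.regExs[i]     = expr;
	t.priorities[i] = prio;
	t.freqs[i]      = freqDays;
	t.maxSpiders[i] = maxSpiders;
	return i + 1;
}

// usage: i = addRule ( t , i , "errorcount>=3" , 11 , 0.0416f , 0 );
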
@@ -3091,23 +3107,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
if ( m_isCustomCrawl == 2 ) m_maxSpidersPerRule [i] = 0;
i++;
// 3rd rule for respidering
if ( respiderFreq > 0.0 ) {
m_regExs[i].set("lastspidertime>={roundstart}");
// do not "remove" from index
m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
//m_spidersEnabled [i] = 0;
m_maxSpidersPerRule[i] = 0;
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
// which has been obsoleted, but we are running old code now!
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
// if collectiverespiderfreq is 0 or less then do not RE-spider
// documents already indexed.
else {
if ( respiderFreq <= 0.0 ) { // else {
// this does NOT work! error docs continuously respider
// because they are never indexed!!! like EDOCSIMPLIFIEDREDIR
//m_regExs[i].set("isindexed");
@@ -3129,13 +3131,20 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
i++;
// crawl everything else, but don't harvest links,
// we have to see if the page content matches the "ppp"
// to determine whether the page should be processed or not.
m_regExs[i].set("default");
m_spiderPriorities [i] = 52;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
m_harvestLinks [i] = false;
i++;
goto done;
@@ -3146,6 +3155,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesucp && matchesupp");
m_spiderPriorities [i] = 55;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
@@ -3153,12 +3165,18 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
i++;
// just process, do not spider links if it does not match ucp
m_regExs[i].set("matchesupp");
m_spiderPriorities [i] = 54;
m_harvestLinks [i] = false;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
// do not crawl anything else
@@ -3180,6 +3198,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
// process everything since upp is empty
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
@@ -3202,6 +3223,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesupp");
m_spiderPriorities [i] = 54;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
//m_harvestLinks [i] = false;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;

Conf.h (5 changes)

@@ -310,6 +310,11 @@ class Conf {
// lookup requests to a host to maximize tfndb page cache hits?
//bool m_useBiasedTfndb;
// just ensure lists being written are valid rdb records (titlerecs)
// trying to isolate titlerec corruption
bool m_verifyDumpedLists;
// calls fsync(fd) if true after each write
bool m_flushWrites ;
bool m_verifyWrites;

@@ -4120,7 +4120,7 @@ bool Links::addLink ( const char *link , int32_t linkLen , int32_t nodeNum ,
// stop http://0x0017.0000000000000000000000000000000000000024521276/
// which somehow makes it through without this!!
if ( url.getTLDLen() <= 0 ) return true;
if ( ! url.isIp() && url.getTLDLen() <= 0 ) return true;
// Allocate more link buffer space?
int32_t bufSpace ;

@@ -7262,6 +7262,21 @@ void Parms::init ( ) {
m->m_group = false;
m++;
m->m_title = "verify written lists";
m->m_desc = "Ensure lists being written to disk are not corrupt "
"and that title recs appear valid, etc. Helps isolate sources "
"of corruption. Used for debugging.";
m->m_cgi = "vwl";
m->m_off = offsetof(Conf,m_verifyDumpedLists);
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
m->m_title = "verify disk writes";
m->m_desc = "Read what was written in a verification step. Decreases "
"performance, but may help fight disk corruption mostly on "

Rdb.cpp (31 changes)

@@ -1676,6 +1676,11 @@ bool Rdb::addList ( collnum_t collnum , RdbList *list,
g_errno = ETRYAGAIN;
return false;
}
// if ( m_inDumpLoop ) {
// g_errno = ETRYAGAIN;
// return false;
// }
// if we are well into repair mode, level 2, do not add anything
// to spiderdb or titledb... that can mess up our titledb scan.
// we always rebuild tfndb, clusterdb and spiderdb
@@ -2068,6 +2073,28 @@ bool Rdb::addRecord ( collnum_t collnum,
return false;
}
// do not add if the key is in the range being dumped, because when
// the dump completes it calls deleteList() and removes the nodes from
// the tree, so if we were overwriting a node currently being dumped
// we would lose it.
if ( m_dump.isDumping() &&
//oppKey >= m_dump.getFirstKeyInQueue() &&
// ensure the dump is dumping the collnum of this key
m_dump.m_collnum == collnum &&
m_dump.m_lastKeyInQueue &&
// the dump should not split positive/negative keys, so our
// positive/negative twin is either in the dump with us
// or not in the dump with us; any positive/negative
// annihilation below should be ok and we should be safe
// to call deleteNode() below
KEYCMP(key,m_dump.getFirstKeyInQueue(),m_ks)>=0 &&
//oppKey <= m_dump.getLastKeyInQueue () ) goto addIt;
KEYCMP(key,m_dump.getLastKeyInQueue (),m_ks)<=0 ) {
// tell caller to wait and try again later
g_errno = ETRYAGAIN;
return false;
}
// save orig
char *orig = NULL;
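
The guard added above makes Rdb::addRecord() return false with g_errno set to ETRYAGAIN whenever the key falls inside the range currently being dumped, so callers are expected to treat that as a transient condition and re-submit after the dump completes. A minimal sketch of that caller-side contract; the signature is abbreviated and retryLater() is a hypothetical stand-in for the real sleep/callback machinery:

// Sketch only: illustrates the ETRYAGAIN contract of the guard above,
// not an actual call site. Signature abbreviated; retryLater() is
// hypothetical.
bool tryAdd ( Rdb *rdb , collnum_t collnum ,
              char *key , char *data , int32_t dataSize ) {
	g_errno = 0;
	if ( rdb->addRecord ( collnum , key , data , dataSize ) ) return true;
	if ( g_errno == ETRYAGAIN ) {
		// the key overlaps the range being dumped; nothing is lost,
		// the caller just re-submits once the dump has finished
		retryLater();
		return false;
	}
	// any other error is permanent for this attempt
	return false;
}
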
@@ -2229,6 +2256,7 @@ bool Rdb::addRecord ( collnum_t collnum,
// CAUTION: we should not annihilate with oppKey if oppKey may
// be in the process of being dumped to disk! This would
// render our annihilation useless and make undeletable data
/*
if ( m_dump.isDumping() &&
//oppKey >= m_dump.getFirstKeyInQueue() &&
m_dump.m_lastKeyInQueue &&
@@ -2236,6 +2264,7 @@ bool Rdb::addRecord ( collnum_t collnum,
//oppKey <= m_dump.getLastKeyInQueue () ) goto addIt;
KEYCMP(oppKey,m_dump.getLastKeyInQueue (),m_ks)<=0 )
goto addIt;
*/
// BEFORE we delete it, save it. this is a special hack
// so we can UNDO this deleteNode() should the titledb rec
// add fail.
@@ -2309,7 +2338,7 @@ bool Rdb::addRecord ( collnum_t collnum,
// if we did not find an oppKey and are tfndb, flag this
//if ( n<0 && m_rdbId == RDB_TFNDB ) s_tfndbHadOppKey = false;
addIt:
//addIt:
// mark as changed
//if ( ! m_needsSave ) {
// m_needsSave = true;

@@ -424,10 +424,12 @@ bool RdbDump::dumpTree ( bool recall ) {
// . check the list we got from the tree for problems
// . ensures keys are ordered from lowest to highest as well
//#ifdef GBSANITYCHECK
if ( g_conf.m_verifyWrites ) {
if ( g_conf.m_verifyWrites || g_conf.m_verifyDumpedLists ) {
char *s = "none";
if ( m_rdb ) s = getDbnameFromId(m_rdb->m_rdbId);
log("dump: verifying list before dumping (rdb=%s)",s);
log("dump: verifying list before dumping (rdb=%s "
"collnum=%i)",s,(int)m_collnum);
m_list->checkList_r ( false , // removeNegRecs?
false , // sleep on problem?
m_rdb->m_rdbId );

@@ -772,7 +772,7 @@ bool RdbList::checkList_r ( bool removeNegRecs , bool sleepOnProblem ,
if ( rdbId == RDB_TITLEDB && ! KEYNEG(k) ) {
char *rec = getCurrentRec();
int32_t usize = *(int32_t *)(rec+12+4);
if ( usize <= 0 ) {
if ( usize <= 0 || usize>100000000 ) {
log("db: bad titlerec uncompress size");
char *xx=NULL;*xx=0;
}
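
The rec+12+4 offset reads the uncompressed size out of the assumed titlerec layout: a 12-byte titledb key, a 4-byte compressed data size, then the data itself, whose first 4 bytes hold the uncompressed length that the new upper bound checks. A hedged sketch of that layout (field widths are assumptions taken from this expression, not verified against Titledb):

// Assumed layout behind "*(int32_t *)(rec+12+4)" -- illustrative only:
// [ 12-byte titledb key ][ 4-byte dataSize ][ 4-byte uncompressed size ][ compressed data ... ]
static int32_t getTitleRecUncompressedSize ( char *rec ) {
	const int32_t keyWidth      = 12; // titledb key width assumed here
	const int32_t dataSizeWidth = 4;  // length field stored before the data
	return *(int32_t *)(rec + keyWidth + dataSizeWidth);
}
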

@@ -89,15 +89,22 @@ void *RdbMem::dupData ( char *key , char *data , int32_t dataSize ,
void *RdbMem::allocData ( char *key , int32_t dataSize , collnum_t collnum ) {
// if we're dumping and key has been dumped, use the secondary mem
//if ( m_dump->isDumping() && key < m_dump->getLastKeyInQueue() ) {
if ( m_rdb->m_inDumpLoop && // m_dump->isDumping() &&
( collnum < m_rdb->m_dumpCollnum ||
(collnum == m_rdb->m_dumpCollnum &&
// if dump fails to alloc mem in RdbDump::dumpTree it does
// a sleep wrapper and keeps retrying, and
// RdbDump::m_lastKeyInQueue can remain NULL because we've
// never dumped out a list from the tree yet
m_rdb->m_dump.m_lastKeyInQueue &&
KEYCMP(key,m_rdb->m_dump.getLastKeyInQueue(),m_ks)<0)) ){
if ( m_rdb->m_inDumpLoop ) {
/////
// MDW: 3/15/2016
// if we're dumping then ALWAYS use secondary mem, wtf...
// primary is being dumped out and when the dump completes
// the ptr gets reset so we'll end up pointing to garbage.
///////
// ( collnum < m_rdb->m_dumpCollnum ||
// (collnum == m_rdb->m_dumpCollnum &&
// // if dump fails to alloc mem in RdbDump::dumpTree it does
// // a sleep wrapper and keeps retrying, and
// // RdbDump::m_lastKeyInQueue can remain NULL because we've
// // never dumped out a list from the tree yet
// m_rdb->m_dump.m_lastKeyInQueue &&
// KEYCMP(key,m_rdb->m_dump.getLastKeyInQueue(),m_ks)<0)) ){
// if secondary mem is growing down...
if ( m_ptr2 > m_ptr1 ) {
// return NULL if it would breach,
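
The change above collapses the old per-collnum/per-key test into a single rule: while the tree is being dumped, every new allocation goes to the secondary arena, because the primary arena is reset once the dump completes. A minimal sketch of that rule with simplified member names (the real logic lives in RdbMem::allocData and uses m_ptr1/m_ptr2):

// Sketch only: simplified stand-in for RdbMem's two-arena scheme.
struct TwoArenaMem {
	char *primary;    // being flushed to disk while a dump runs
	char *secondary;  // survives the dump, promoted afterwards
	bool  inDumpLoop; // mirrors m_rdb->m_inDumpLoop

	char *pickArena ( ) {
		// if we're dumping, ALWAYS use secondary mem: anything placed
		// in the primary arena would be orphaned when its pointer is
		// reset at the end of the dump
		return inDumpLoop ? secondary : primary;
	}
};
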

@@ -8,6 +8,7 @@
#include "Loop.h"
#include "Threads.h"
#include "Linkdb.h"
#include "Spider.h"
RdbTree::RdbTree () {
//m_countsInitialized = false;
@@ -1128,6 +1129,12 @@ bool RdbTree::fixTree ( ) {
//CollectionRec *recs = g_collectiondb.m_recs;
int32_t max = g_collectiondb.m_numRecs;
log("db: Valid collection numbers range from 0 to %"INT32".",max);
bool isTitledb = false;
if ( !strcmp(m_dbname,"titledb" ) ) isTitledb = true;
bool isSpiderdb = false;
if ( !strcmp(m_dbname,"spiderdb" ) ) isSpiderdb = true;
// now re-add the old nodes to the tree; they should not be overwritten
// by addNode()
for ( int32_t i = 0 ; i < n ; i++ ) {
@@ -1136,6 +1143,34 @@ bool RdbTree::fixTree ( ) {
log("db: Fixing node #%"INT32" of %"INT32".",i,n);
// skip if empty
if ( m_parents[i] <= -2 ) continue;
if ( isTitledb && m_data[i] ) {
char *data = m_data[i];
int32_t ucompSize = *(int32_t *)data;
if ( ucompSize < 0 || ucompSize > 100000000 ) {
log("db: removing titlerec with uncompressed "
"size of %i from tree",(int)ucompSize);
continue;
}
}
char *key = &m_keys[i*m_ks];
if ( isSpiderdb && m_data[i] &&
g_spiderdb.isSpiderRequest ( (SPIDERDBKEY *)key ) ) {
char *data = m_data[i];
data -= sizeof(SPIDERDBKEY);
data -= 4;
SpiderRequest *sreq ;
sreq =(SpiderRequest *)data;
if ( strncmp(sreq->m_url,"http",4) ) {
log("db: removing spiderrequest bad url "
"%s from tree",sreq->m_url);
//return false;
continue;
}
}
collnum_t cn = m_collnums[i];
// verify collnum
if ( cn < 0 ) continue;
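
fixTree() above and checkTree2() below now apply the same two record-level sanity checks, differing only in whether a bad node is skipped or treated as a hard failure. A sketch of the shared check factored into one helper; the 100000000 bound, the data layout and the names SPIDERDBKEY, SpiderRequest and g_spiderdb are taken from the diff, while the helper itself is illustrative:

// Sketch only: the two checks factored out; the real code keeps them
// inline in fixTree() and checkTree2(). Assumes Spider.h (included
// above) for SpiderRequest, SPIDERDBKEY and g_spiderdb.
#include <cstring>  // strncmp

static bool nodeLooksSane ( bool isTitledb , bool isSpiderdb ,
                            char *key , char *data ) {
	if ( ! data ) return true;
	if ( isTitledb ) {
		int32_t ucompSize = *(int32_t *)data;
		if ( ucompSize < 0 || ucompSize > 100000000 ) return false;
	}
	if ( isSpiderdb && g_spiderdb.isSpiderRequest ( (SPIDERDBKEY *)key ) ) {
		// the stored data sits sizeof(SPIDERDBKEY)+4 bytes past the
		// start of the SpiderRequest header, as in the diff above
		SpiderRequest *sreq =
			(SpiderRequest *)(data - sizeof(SPIDERDBKEY) - 4);
		if ( strncmp ( sreq->m_url , "http" , 4 ) ) return false;
	}
	return true;
}
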
@@ -1191,6 +1226,12 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
if ( !strcmp(m_dbname,"datedb" ) ) useHalfKeys = true;
if ( !strcmp(m_dbname,"tfndb" ) ) useHalfKeys = true;
if ( !strcmp(m_dbname,"linkdb" ) ) useHalfKeys = true;
bool isTitledb = false;
if ( !strcmp(m_dbname,"titledb" ) ) isTitledb = true;
bool isSpiderdb = false;
if ( !strcmp(m_dbname,"spiderdb" ) ) isSpiderdb = true;
// now check parent kid correlations
for ( int32_t i = 0 ; i < m_minUnusedNode ; i++ ) {
// this thing blocks for 1.5 secs for indexdb
@@ -1208,6 +1249,32 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
if ( m_ks == 18 &&(m_keys[i*m_ks] & 0x06) ) {
char *xx=NULL;*xx=0; }
if ( isTitledb && m_data[i] ) {
char *data = m_data[i];
int32_t ucompSize = *(int32_t *)data;
if ( ucompSize < 0 || ucompSize > 100000000 ) {
log("db: found titlerec with uncompressed "
"size of %i from tree",(int)ucompSize);
return false;
}
}
char *key = &m_keys[i*m_ks];
if ( isSpiderdb && m_data[i] &&
g_spiderdb.isSpiderRequest ( (SPIDERDBKEY *)key ) ) {
char *data = m_data[i];
data -= sizeof(SPIDERDBKEY);
data -= 4;
SpiderRequest *sreq ;
sreq =(SpiderRequest *)data;
if ( strncmp(sreq->m_url,"http",4) ) {
log("db: spiderrequest bad url "
"%s",sreq->m_url);
return false;
}
}
// bad collnum?
if ( doCollRecCheck ) {
collnum_t cn = m_collnums[i];

@@ -2245,6 +2245,14 @@ bool SpiderColl::evalIpLoop ( ) {
bool inCache = false;
bool useCache = true;
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
// did our collection rec get deleted? since we were doing a read
// the SpiderColl will have been preserved in that case but its
// m_deleteMyself flag will have been set.
if ( tryToDeleteSpiderColl ( this ,"6") ) return false;
// if doing site or page quotes for the sitepages or domainpages
// url filter expressions, we can't muck with the cache because
// we end up skipping the counting part.
@@ -4658,6 +4666,7 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
int64_t waitInSecs = (uint64_t)(m_cr->m_spiderFreqs[ufn]*3600*24.0);
// do not spider more than once per 15 seconds ever!
// no! might be a query reindex!!
/*
if ( waitInSecs < 15 && ! sreq->m_isPageReindex ) { //urlIsDocId ) {
static bool s_printed = false;
if ( ! s_printed ) {
@@ -4667,6 +4676,7 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
}
waitInSecs = 15;//900; this was 15 minutes
}
*/
// in fact, force docid based guys to be zero!
//if ( sreq->m_urlIsDocId ) waitInSecs = 0;
if ( sreq->m_isPageReindex ) waitInSecs = 0;
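
For reference on the frequencies used throughout this commit: m_spiderFreqs is expressed in days, and the line above converts it to seconds with *3600*24. The 0.0416 used in the url-filter rules is therefore 0.0416 × 86400 ≈ 3594 seconds, i.e. roughly one respider per hour (hence the "// every hour" comment), and 0.1 days is about 2.4 hours.

// illustrative arithmetic only, mirroring the conversion above
int64_t waitInSecs = (int64_t)(0.0416 * 3600 * 24.0); // 3594 s ~= 1 hour
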

@@ -5897,6 +5897,16 @@ char *XmlDoc::getIsDup ( ) {
return &m_isDup;
}
// do not dedup seeds
bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
if ( cr->m_isCustomCrawl && isSeed ) {
m_isDupValid = true;
m_isDup = false;
return &m_isDup;
}
setStatus ( "checking for dups" );
// BUT if we are already indexed and a crawlbot/bulk diffbot job
@@ -10328,6 +10338,28 @@ Url **XmlDoc::getMetaRedirUrl ( ) {
for ( ; p < pend ; p++ ) {
// breathe
QUICKPOLL ( m_niceness );
// fix <!--[if lte IE 6]>
// <meta http-equiv="refresh" content="0; url=/error-ie6/" />
if ( *p == '!' &&
p[-1]=='<' &&
p[1] == '-' &&
p[2] == '-' ) {
// find end of comment
for ( ; p < pend ; p++ ) {
QUICKPOLL(m_niceness);
if (p[0] == '-' &&
p[1] == '-' &&
p[2] == '>' )
break;
}
// if found no end of comment, then stop
if ( p >= pend )
break;
// resume looking for meta redirect tags
continue;
}
// base everything off the equal sign
if ( *p != '=' ) continue;
// did we match "http-equiv="?
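
The loop added above skips everything between <!-- and --> so that a meta refresh hidden inside an IE conditional comment is not treated as a real redirect. A standalone sketch of the same scan, stripped of QUICKPOLL and the surrounding XmlDoc state (the helper name is illustrative):

// Sketch only: skip an HTML comment while scanning a buffer; mirrors the
// loop added above. Assumes p points at the '!' of "<!--".
static const char *skipHtmlComment ( const char *p , const char *pend ) {
	for ( ; p + 2 < pend ; p++ ) {
		if ( p[0] == '-' && p[1] == '-' && p[2] == '>' )
			return p + 3;   // first char after the comment
	}
	return pend;                    // unterminated comment: stop scanning
}
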
@@ -20249,6 +20281,11 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
"<td>%s</td>"
"</tr>\n"
"<tr>"
"<td>http status</td>"
"<td>%i</td>"
"</tr>\n"
"<tr>"
"<td>url filter num</td>"
"<td>%"INT32"</td>"
@@ -20284,6 +20321,7 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
getFirstUrlHash64(), // uh48
mstrerror(m_indexCode),
m_httpStatus,
ufn,
mstrerror(g_errno),
allowed,