Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing

Matt Wells
2014-02-16 16:02:00 -08:00
3 changed files with 16 additions and 9 deletions

@@ -2096,6 +2096,9 @@ bool CollectionRec::rebuildUrlFilters ( ) {
// default to 250ms i guess. -1 means unset i think.
if ( m_collectiveCrawlDelay < 0.0 ) wait = 250;
bool isEthan = false;
if (m_coll)isEthan=strstr(m_coll,"2b44a0e0bb91bbec920f7efd29ce3d5b");
// make the gigablast regex table just "default" so it does no
// filtering, but accepts all urls. we will add code to pass the urls
// through m_diffbotUrlCrawlPattern alternatively. if that itself
@@ -2106,6 +2109,9 @@ bool CollectionRec::rebuildUrlFilters ( ) {
m_maxSpidersPerRule [i] = 100;
m_spiderIpWaits [i] = wait;
m_spiderIpMaxSpiders[i] = 7; // keep it respectful
// ethan wants some speed
if ( isEthan )
m_spiderIpMaxSpiders[i] = 30;
//m_spidersEnabled [i] = 1;
m_spiderFreqs [i] =m_collectiveRespiderFrequency;
//m_spiderDiffbotApiUrl[i].purge();

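For context, here is a minimal standalone sketch of what these two hunks do together: per-rule spider defaults in rebuildUrlFilters(), with the per-IP spider cap raised from 7 to 30 when the collection token matches the hard-coded hash. The struct and function names below are invented for illustration (this is not the real CollectionRec code), and the seconds-to-milliseconds conversion for the crawl delay is an assumption.

```cpp
#include <cstring>
#include <vector>

// hypothetical per-rule defaults, mirroring m_maxSpidersPerRule,
// m_spiderIpWaits, m_spiderIpMaxSpiders and m_spiderFreqs
struct UrlFilterDefaults {
    long  maxSpidersPerRule;
    long  spiderIpWait;        // ms between spider launches per IP
    long  spiderIpMaxSpiders;
    float spiderFreq;          // respider frequency
};

std::vector<UrlFilterDefaults> buildDefaults(const char *coll,
                                             double collectiveCrawlDelay,
                                             float  collectiveRespiderFrequency,
                                             int    numRules) {
    // default to 250ms when the collective crawl delay is unset (-1);
    // the seconds->ms conversion below is assumed, not taken from the diff
    long wait = 250;
    if (collectiveCrawlDelay >= 0.0)
        wait = (long)(collectiveCrawlDelay * 1000.0);

    // the special-case collection ("isEthan") is detected by substring
    // match against the collection name
    bool isEthan = coll &&
        std::strstr(coll, "2b44a0e0bb91bbec920f7efd29ce3d5b") != nullptr;

    std::vector<UrlFilterDefaults> rules(numRules);
    for (auto &r : rules) {
        r.maxSpidersPerRule  = 100;
        r.spiderIpWait       = wait;
        r.spiderIpMaxSpiders = isEthan ? 30 : 7;  // keep it respectful otherwise
        r.spiderFreq         = collectiveRespiderFrequency;
    }
    return rules;
}
```

The override is a one-off: rather than adding a config knob, the commit keys the higher cap off a specific collection token and leaves every other collection at the conservative per-IP limit of 7.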
@@ -2802,9 +2802,13 @@ long *XmlDoc::getIndexCode2 ( ) {
return &m_indexCode;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// "url is repeating path components" error?
if ( ! m_check1 ) {
m_check1 = true;
if ( m_firstUrl.isLinkLoop() ) {
if ( cr->m_isCustomCrawl == 0 && m_firstUrl.isLinkLoop() ) {
m_indexCode = ELINKLOOP;
m_indexCodeValid = true;
return &m_indexCode;
@@ -2818,9 +2822,6 @@ long *XmlDoc::getIndexCode2 ( ) {
return &m_indexCode;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
if ( cr->m_doUrlSpamCheck && ! m_check2 ) {
m_check2 = true;
if ( m_firstUrl.isSpam() ) {

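The two XmlDoc hunks move the CollectionRec lookup ahead of the repeating-path-components check so that the link-loop test can be gated on the crawl type. A rough sketch of the resulting control flow, using invented placeholder types rather than the real XmlDoc class:

```cpp
// minimal sketch of the reordered getIndexCode2() logic; Coll, Url and Doc
// are placeholders for CollectionRec, Url and XmlDoc respectively
enum IndexCode { EOK = 0, ELINKLOOP = 1 };

struct Coll { int isCustomCrawl; };
struct Url  { bool isLinkLoop() const { return false; } };

struct Doc {
    Coll *coll           = nullptr;  // NULL means the collection is gone
    Url   firstUrl;
    long  indexCode      = EOK;
    bool  indexCodeValid = false;
    bool  check1         = false;

    long *getIndexCode2() {
        // the collection record is now fetched before the first check
        if (!coll) return nullptr;

        // "url is repeating path components" error?
        if (!check1) {
            check1 = true;
            // custom (diffbot) crawls now skip the link-loop check entirely
            if (coll->isCustomCrawl == 0 && firstUrl.isLinkLoop()) {
                indexCode      = ELINKLOOP;
                indexCodeValid = true;
            }
        }
        return &indexCode;
    }
};
```

The net effect of the commit is that diffbot custom crawls can no longer be rejected with ELINKLOOP, while regular crawls keep the existing protection against repeating path components.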
gb.conf

@@ -328,10 +328,10 @@
<collectionsToRepairOrRebuild><![CDATA[main]]></>
# In bytes.
<memoryToUseForRepair>300000000</>
<memoryToUseForRepair>200000000</>
# Maximum number of outstanding inject spiders for repair.
<maxRepairSpiders>32</>
<maxRepairSpiders>2</>
# If enabled, gigablast will reinject the content of all title recs into a
# secondary rdb system. That will replace the primary rdb system when complete.
@@ -342,13 +342,13 @@
<keepNewSpiderdbRecs>1</>
# If enabled, gigablast will recycle the link info when rebuilding titledb.
<recycleLinkInfo>0</>
<recycleLinkInfo>1</>
# If enabled, gigablast will rebuild this rdb
<rebuildTitledb>1</>
# If enabled, gigablast will rebuild this rdb
<rebuildPosdb>0</>
<rebuildPosdb>1</>
# If enabled, gigablast will rebuild this rdb
<rebuildClusterdb>0</>
@@ -368,4 +368,4 @@
# When rebuilding spiderdb and scanning it for new spiderdb records, should a
# tagdb lookup be performed? Runs much much faster without it. Will also keep
# the original doc quality and spider priority intact.
<skipTagdbLookup>0</>
<skipTagdbLookup>1</>
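Taken together, and assuming the second line of each changed pair is the new value (the usual removed-then-added order), the repair-related keys in gb.conf after this commit read roughly as follows; surrounding comments and untouched keys are elided:

```
<memoryToUseForRepair>200000000</>
<maxRepairSpiders>2</>
<recycleLinkInfo>1</>
<rebuildTitledb>1</>
<rebuildPosdb>1</>
<skipTagdbLookup>1</>
```

In short, the repair pass is throttled down (less memory, only 2 outstanding inject spiders), while link info recycling, posdb rebuilding and the tagdb-lookup skip are switched on to speed up the rebuild.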