Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing
This commit is contained in:
@ -2096,6 +2096,9 @@ bool CollectionRec::rebuildUrlFilters ( ) {
|
||||
// default to 250ms when the crawl delay is negative; -1 presumably means unset.
|
||||
if ( m_collectiveCrawlDelay < 0.0 ) wait = 250;
|
||||
|
||||
bool isEthan = false;
|
||||
if (m_coll)isEthan=strstr(m_coll,"2b44a0e0bb91bbec920f7efd29ce3d5b");
|
||||
|
||||
// make the gigablast regex table just "default" so it does not
|
||||
// do any filtering, but accepts all urls. we will add code to pass the urls
|
||||
// through m_diffbotUrlCrawlPattern alternatively. if that itself
|
||||
@ -2106,6 +2109,9 @@ bool CollectionRec::rebuildUrlFilters ( ) {
|
||||
m_maxSpidersPerRule [i] = 100;
|
||||
m_spiderIpWaits [i] = wait;
|
||||
m_spiderIpMaxSpiders[i] = 7; // keep it respectful
|
||||
// ethan wants some speed
|
||||
if ( isEthan )
|
||||
m_spiderIpMaxSpiders[i] = 30;
|
||||
//m_spidersEnabled [i] = 1;
|
||||
m_spiderFreqs [i] =m_collectiveRespiderFrequency;
|
||||
//m_spiderDiffbotApiUrl[i].purge();
|
||||
|
@ -2802,9 +2802,13 @@ long *XmlDoc::getIndexCode2 ( ) {
|
||||
return &m_indexCode;
|
||||
}
|
||||
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr ) return NULL;
|
||||
|
||||
// "url is repeating path components" error?
|
||||
if ( ! m_check1 ) {
|
||||
m_check1 = true;
|
||||
if ( m_firstUrl.isLinkLoop() ) {
|
||||
if ( cr->m_isCustomCrawl == 0 && m_firstUrl.isLinkLoop() ) {
|
||||
m_indexCode = ELINKLOOP;
|
||||
m_indexCodeValid = true;
|
||||
return &m_indexCode;
|
||||
@ -2818,9 +2822,6 @@ long *XmlDoc::getIndexCode2 ( ) {
|
||||
return &m_indexCode;
|
||||
}
|
||||
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr ) return NULL;
|
||||
|
||||
if ( cr->m_doUrlSpamCheck && ! m_check2 ) {
|
||||
m_check2 = true;
|
||||
if ( m_firstUrl.isSpam() ) {
|
||||
|
10
gb.conf
10
gb.conf
@ -328,10 +328,10 @@
|
||||
<collectionsToRepairOrRebuild><![CDATA[main]]></>
|
||||
|
||||
# In bytes.
|
||||
<memoryToUseForRepair>300000000</>
|
||||
<memoryToUseForRepair>200000000</>
|
||||
|
||||
# Maximum number of outstanding inject spiders for repair.
|
||||
<maxRepairSpiders>32</>
|
||||
<maxRepairSpiders>2</>
|
||||
|
||||
# If enabled, gigablast will reinject the content of all title recs into a
|
||||
# secondary rdb system. That will become the primary rdb system when complete.
|
||||
@ -342,13 +342,13 @@
|
||||
<keepNewSpiderdbRecs>1</>
|
||||
|
||||
# If enabled, gigablast will recycle the link info when rebuilding titledb.
|
||||
<recycleLinkInfo>0</>
|
||||
<recycleLinkInfo>1</>
|
||||
|
||||
# If enabled, gigablast will rebuild this rdb
|
||||
<rebuildTitledb>1</>
|
||||
|
||||
# If enabled, gigablast will rebuild this rdb
|
||||
<rebuildPosdb>0</>
|
||||
<rebuildPosdb>1</>
|
||||
|
||||
# If enabled, gigablast will rebuild this rdb
|
||||
<rebuildClusterdb>0</>
|
||||
@ -368,4 +368,4 @@
|
||||
# When rebuilding spiderdb and scanning it for new spiderdb records, should a
|
||||
# tagdb lookup be performed? Runs much much faster without it. Will also keep
|
||||
# the original doc quality and spider priority intact.
|
||||
<skipTagdbLookup>0</>
|
||||
<skipTagdbLookup>1</>
|
||||
|
Reference in New Issue
Block a user