Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing

Matt Wells
2014-02-16 16:02:00 -08:00
3 changed files with 16 additions and 9 deletions

@@ -2096,6 +2096,9 @@ bool CollectionRec::rebuildUrlFilters ( ) {
// default to 250ms i guess. -1 means unset i think.
if ( m_collectiveCrawlDelay < 0.0 ) wait = 250;
bool isEthan = false;
if (m_coll)isEthan=strstr(m_coll,"2b44a0e0bb91bbec920f7efd29ce3d5b");
// make the gigablast regex table just "default" so it does no
// filtering, but accepts all urls. we will add code to pass the urls
// through m_diffbotUrlCrawlPattern alternatively. if that itself
@@ -2106,6 +2109,9 @@ bool CollectionRec::rebuildUrlFilters ( ) {
m_maxSpidersPerRule [i] = 100;
m_spiderIpWaits [i] = wait;
m_spiderIpMaxSpiders[i] = 7; // keep it respectful
// ethan wants some speed
if ( isEthan )
m_spiderIpMaxSpiders[i] = 30;
//m_spidersEnabled [i] = 1;
m_spiderFreqs [i] =m_collectiveRespiderFrequency;
//m_spiderDiffbotApiUrl[i].purge();

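For context, here is a minimal standalone sketch of what these two hunks do together: per-rule spider defaults in rebuildUrlFilters(), with the per-IP spider cap raised from 7 to 30 when the collection token matches the hard-coded hash. The struct and function names below are invented for illustration (this is not the real CollectionRec code), and the seconds-to-milliseconds conversion for the crawl delay is an assumption.

```cpp
#include <cstring>
#include <vector>

// hypothetical per-rule defaults, mirroring m_maxSpidersPerRule,
// m_spiderIpWaits, m_spiderIpMaxSpiders and m_spiderFreqs
struct UrlFilterDefaults {
    long  maxSpidersPerRule;
    long  spiderIpWait;        // ms between spider launches per IP
    long  spiderIpMaxSpiders;
    float spiderFreq;          // respider frequency
};

std::vector<UrlFilterDefaults> buildDefaults(const char *coll,
                                             double collectiveCrawlDelay,
                                             float  collectiveRespiderFrequency,
                                             int    numRules) {
    // default to 250ms when the collective crawl delay is unset (-1);
    // the seconds->ms conversion below is assumed, not taken from the diff
    long wait = 250;
    if (collectiveCrawlDelay >= 0.0)
        wait = (long)(collectiveCrawlDelay * 1000.0);

    // the special-case collection ("isEthan") is detected by substring
    // match against the collection name
    bool isEthan = coll &&
        std::strstr(coll, "2b44a0e0bb91bbec920f7efd29ce3d5b") != nullptr;

    std::vector<UrlFilterDefaults> rules(numRules);
    for (auto &r : rules) {
        r.maxSpidersPerRule  = 100;
        r.spiderIpWait       = wait;
        r.spiderIpMaxSpiders = isEthan ? 30 : 7;  // keep it respectful otherwise
        r.spiderFreq         = collectiveRespiderFrequency;
    }
    return rules;
}
```

The override is a one-off: rather than adding a config knob, the commit keys the higher cap off a specific collection token and leaves every other collection at the conservative per-IP limit of 7.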
@@ -2802,9 +2802,13 @@ long *XmlDoc::getIndexCode2 ( ) {
return &m_indexCode;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// "url is repeating path components" error?
if ( ! m_check1 ) {
m_check1 = true;
if ( m_firstUrl.isLinkLoop() ) {
if ( cr->m_isCustomCrawl == 0 && m_firstUrl.isLinkLoop() ) {
m_indexCode = ELINKLOOP;
m_indexCodeValid = true;
return &m_indexCode;
@@ -2818,9 +2822,6 @@ long *XmlDoc::getIndexCode2 ( ) {
return &m_indexCode;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
if ( cr->m_doUrlSpamCheck && ! m_check2 ) {
m_check2 = true;
if ( m_firstUrl.isSpam() ) {

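The two XmlDoc hunks move the CollectionRec lookup ahead of the repeating-path-components check so that the link-loop test can be gated on the crawl type. A rough sketch of the resulting control flow, using invented placeholder types rather than the real XmlDoc class:

```cpp
// minimal sketch of the reordered getIndexCode2() logic; Coll, Url and Doc
// are placeholders for CollectionRec, Url and XmlDoc respectively
enum IndexCode { EOK = 0, ELINKLOOP = 1 };

struct Coll { int isCustomCrawl; };
struct Url  { bool isLinkLoop() const { return false; } };

struct Doc {
    Coll *coll           = nullptr;  // NULL means the collection is gone
    Url   firstUrl;
    long  indexCode      = EOK;
    bool  indexCodeValid = false;
    bool  check1         = false;

    long *getIndexCode2() {
        // the collection record is now fetched before the first check
        if (!coll) return nullptr;

        // "url is repeating path components" error?
        if (!check1) {
            check1 = true;
            // custom (diffbot) crawls now skip the link-loop check entirely
            if (coll->isCustomCrawl == 0 && firstUrl.isLinkLoop()) {
                indexCode      = ELINKLOOP;
                indexCodeValid = true;
            }
        }
        return &indexCode;
    }
};
```

The net effect of the commit is that diffbot custom crawls can no longer be rejected with ELINKLOOP, while regular crawls keep the existing protection against repeating path components.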
gb.conf

@@ -328,10 +328,10 @@
<collectionsToRepairOrRebuild><![CDATA[main]]></>
# In bytes.
<memoryToUseForRepair>300000000</>
<memoryToUseForRepair>200000000</>
# Maximum number of outstanding inject spiders for repair.
<maxRepairSpiders>32</>
<maxRepairSpiders>2</>
# If enabled, gigablast will reinject the content of all title recs into a
# secondary rdb system. That will replace the primary rdb system when complete.
@@ -342,13 +342,13 @@
<keepNewSpiderdbRecs>1</>
# If enabled, gigablast will recycle the link info when rebuilding titledb.
<recycleLinkInfo>0</>
<recycleLinkInfo>1</>
# If enabled, gigablast will rebuild this rdb
<rebuildTitledb>1</>
# If enabled, gigablast will rebuild this rdb
<rebuildPosdb>0</>
<rebuildPosdb>1</>
# If enabled, gigablast will rebuild this rdb
<rebuildClusterdb>0</>
@@ -368,4 +368,4 @@
# When rebuilding spiderdb and scanning it for new spiderdb records, should a
# tagdb lookup be performed? Runs much much faster without it. Will also keep
# the original doc quality and spider priority intact.
<skipTagdbLookup>0</>
<skipTagdbLookup>1</>
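Taken together, and assuming the second line of each changed pair is the new value (the usual removed-then-added order), the repair-related keys in gb.conf after this commit read roughly as follows; surrounding comments and untouched keys are elided:

```
<memoryToUseForRepair>200000000</>
<maxRepairSpiders>2</>
<recycleLinkInfo>1</>
<rebuildTitledb>1</>
<rebuildPosdb>1</>
<skipTagdbLookup>1</>
```

In short, the repair pass is throttled down (less memory, only 2 outstanding inject spiders), while link info recycling, posdb rebuilding and the tagdb-lookup skip are switched on to speed up the rebuild.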