do not cache winner list if

the # of requests from the IP is
less than about 25k.
Matt 2015-02-18 19:14:06 -07:00
parent dbaff2dfb8
commit 860ff24227

@@ -4368,10 +4368,12 @@ bool SpiderColl::scanListForWinners ( ) {
int32_t maxWinners = (int32_t)MAX_WINNER_NODES; // 40
//if ( ! m_cr->m_isCustomCrawl ) maxWinners = 1;
// only put 40 urls from the same firstIp into doledb if
// we have a lot of urls in our spiderdb already.
// only put one doledb record into winner tree if
// the list is pretty short. otherwise, we end up caching
// too much. granted, we only cache for about 2 mins.
// mdw: for testing take this out!
//if ( m_totalBytesScanned < 200000 ) maxWinners = 1;
if ( m_totalBytesScanned < 25000 ) maxWinners = 1;
// sanity check. make sure the read is somewhat hefty for our
// maxWinners=1 logic to be meaningful
if ( (int32_t)SR_READ_SIZE < 500000 ) { char *xx=NULL;*xx=0; }
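
For context, a minimal C++ sketch of the threshold logic this hunk adds. Only MAX_WINNER_NODES, SR_READ_SIZE, the meaning of m_totalBytesScanned, and the 25k/500k values come from the diff; the computeMaxWinners() helper and the static_assert are illustrative assumptions, since the real logic sits inline in SpiderColl::scanListForWinners().

#include <stdint.h>

// Mirrors the constants referenced in the diff above; the values are
// the ones the diff's comments and sanity check imply.
#define MAX_WINNER_NODES 40
#define SR_READ_SIZE     500000

// Compile-time analogue of the hunk's runtime sanity check (which uses
// a deliberate NULL write to crash): the read size must dwarf the 25k
// caching threshold for the cap below to be a meaningful signal.
static_assert ( SR_READ_SIZE >= 500000 ,
                "read must be hefty relative to the 25k threshold" );

// Hypothetical helper isolating the new cap. A short scan (< 25k bytes)
// means this firstIp has few requests in spiderdb, so we keep just one
// winner instead of caching a whole winner list for the ~2 min TTL.
static int32_t computeMaxWinners ( int32_t totalBytesScanned ) {
	int32_t maxWinners = (int32_t)MAX_WINNER_NODES; // normally 40
	if ( totalBytesScanned < 25000 ) maxWinners = 1;
	return maxWinners;
}

The runtime sanity check serves the same purpose as the static_assert in the sketch: if SR_READ_SIZE could shrink below 500k, a small m_totalBytesScanned might only mean the read itself was small, not that the IP has few pending requests, and the maxWinners=1 shortcut would fire spuriously.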