forked from Mirrors/privacore-open-source-search-engine
fix empty rdbcache bug of not enough buf mem.

Msg13.cpp (50 changed lines)
@@ -601,7 +601,7 @@ void handleRequest13 ( UdpSlot *slot , int32_t niceness ) {
 
 	if ( ! s_flag ) {
 		s_flag = true;
-		s_hammerCache.init ( 5000 , // maxcachemem,
+		s_hammerCache.init ( 15000 , // maxcachemem,
				     8 , // fixed data size
				     false , // support lists?
				     500 , // max nodes
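
Why triple the budget: RdbCache charges its per-node bookkeeping (keys, linked-list pointers) against maxcachemem before carving out the record buffers, so a 5000-byte budget for 500 nodes could leave no buffer memory at all, which is the "empty rdbcache" this commit fixes. A minimal sketch of that arithmetic; the per-node overhead here is an assumed illustrative value, not RdbCache's real figure, which depends on its key size and node layout:

#include <cstdio>
#include <cstdint>

// leftover buffer memory once node bookkeeping is charged to the budget
static int32_t bufMemLeft ( int32_t maxMem , int32_t maxNodes ,
                            int32_t perNodeOverhead ) {
	return maxMem - maxNodes * perNodeOverhead;
}

int main ( ) {
	const int32_t overhead = 24; // assumed per-node cost, for illustration
	// old config: 5000 bytes of cache mem, 500 nodes -> negative
	printf ( "old bufMem = %d\n" , bufMemLeft ( 5000  , 500 , overhead ) );
	// new config: 15000 bytes leaves room for the 8-byte records
	printf ( "new bufMem = %d\n" , bufMemLeft ( 15000 , 500 , overhead ) );
	return 0;
}
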
@@ -887,6 +887,11 @@ void downloadTheDocForReals3a ( Msg13Request *r ) {
 
 void downloadTheDocForReals3b ( Msg13Request *r ) {
 
+	int64_t nowms = gettimeofdayInMilliseconds();
+
+	// assume no download start time
+	r->m_downloadStartTimeMS = 0;
+
 	// . store time now
 	// . no, now we store 0 to indicate in progress, then we
 	//   will overwrite it with a timestamp when the download completes
@@ -901,7 +906,6 @@ void downloadTheDocForReals3b ( Msg13Request *r ) {
 	}
 	else if ( ! r->m_skipHammerCheck ) {
 		// get time now
-		int64_t nowms = gettimeofdayInMilliseconds();
 		s_hammerCache.addLongLong(0,r->m_firstIp, nowms);
 		log(LOG_DEBUG,
		    "spider: adding new time to hammercache for %s %s = %"INT64"",
@@ -920,7 +924,7 @@ void downloadTheDocForReals3b ( Msg13Request *r ) {
		    "firstIp=%s "
		    "url=%s "
		    "to msg13::hammerCache",
-		    -1,
+		    0,//-1,
		    iptoa(r->m_firstIp),
		    r->ptr_url);
 
@@ -1011,6 +1015,9 @@ void downloadTheDocForReals3b ( Msg13Request *r ) {
 	if ( r->m_isSquidProxiedUrl && ! r->m_proxyIp )
 		fixGETorPOST ( exactRequest );
 
+	// indicate start of download so we can overwrite the 0 we stored
+	// into the hammercache
+	r->m_downloadStartTimeMS = nowms;
 
 	// . download it
 	// . if m_proxyIp is non-zero it will make requests like:
@@ -1347,18 +1354,30 @@ void gotHttpReply2 ( void *state ,
 
 	// get time now
 	int64_t nowms = gettimeofdayInMilliseconds();
 
+	// right now there is a 0 in there to indicate in-progress.
+	// so we must overwrite with either the download start time or the
+	// download end time.
+	int64_t timeToAdd = r->m_downloadStartTimeMS;
+	if ( r->m_crawlDelayFromEnd ) timeToAdd = nowms;
+
 	// . now store the current time in the cache
 	// . do NOT do this for robots.txt etc. where we skip hammer check
-	if ( r->m_crawlDelayFromEnd && ! r->m_skipHammerCheck )
-		s_hammerCache.addLongLong(0,r->m_firstIp,nowms);
-	// note it
-	if ( g_conf.m_logDebugSpider )
-		log("spider: adding final download end time of %"INT64" for "
-		    "firstIp=%s "
-		    "url=%s "
-		    "to msg13::hammerCache",
-		    nowms,iptoa(r->m_firstIp),r->ptr_url);
+	if ( ! r->m_skipHammerCheck )
+		s_hammerCache.addLongLong(0,r->m_firstIp,timeToAdd);
+
+	// note it
+	if ( g_conf.m_logDebugSpider && ! r->m_skipHammerCheck )
+		log(LOG_DEBUG,"spider: adding last download time "
+		    "of %"INT64" for firstIp=%s url=%s "
+		    "to msg13::hammerCache",
+		    timeToAdd,iptoa(r->m_firstIp),r->ptr_url);
+
 
 	if ( g_conf.m_logDebugSpider )
 		log(LOG_DEBUG,"spider: got http reply for firstip=%s url=%s",
		    iptoa(r->m_firstIp),r->ptr_url);
 
 
 	// sanity. this was happening from iframe download
 	//if ( g_errno == EDNSTIMEDOUT ) { char *xx=NULL;*xx=0; }
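
Taken together with the downloadTheDocForReals3b hunks above, the hammer-cache entry for an IP now moves through three states: 0 (download in progress), then the 0 is overwritten with either the download start time or the download end time, chosen by m_crawlDelayFromEnd. A self-contained sketch of that final choice, using simplified stand-ins for the Msg13Request fields rather than the real class:

#include <cstdint>

// simplified stand-ins for the Msg13Request fields this hunk reads
struct Req {
	int64_t m_downloadStartTimeMS; // recorded when the fetch was issued
	bool    m_crawlDelayFromEnd;   // measure crawl delay from reply time?
};

// value written over the in-progress sentinel (0) in the hammer cache
static int64_t timeToAdd ( const Req &r , int64_t nowms ) {
	// crawl delay measured from the END of the download: use "now"
	// (the reply time); otherwise measure from the download START
	return r.m_crawlDelayFromEnd ? nowms : r.m_downloadStartTimeMS;
}
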
@@ -2786,7 +2805,7 @@ bool addToHammerQueue ( Msg13Request *r ) {
 	// . make sure we are not hammering an ip
 	// . returns 0 if currently downloading a url from that ip
 	// . returns -1 if not found
-	int64_t last = s_hammerCache.getLongLong(0,r->m_firstIp,30,true);
+	int64_t last = s_hammerCache.getLongLong(0,r->m_firstIp,-1,true);
 	// get time now
 	int64_t nowms = gettimeofdayInMilliseconds();
 	// how long has it been since last download START time?
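
The one-character change above (30 to -1) matters more than it looks. Assuming the third getLongLong() argument is a maximum record age in seconds, with -1 meaning no age limit (an assumption about the RdbCache API, consistent with the "returns -1 if not found" comment), a hammer timestamp used to expire 30 seconds after it was written and the lookup then behaved as "not found", so a host with a crawl delay longer than 30 seconds could be re-hit early. A sketch of that assumed expiry behavior:

#include <cstdint>

// assumed lookup semantics: entries older than maxAgeSecs act missing
static int64_t lookup ( int64_t storedMs , int64_t nowMs ,
                        int32_t maxAgeSecs ) {
	if ( maxAgeSecs >= 0 &&
	     nowMs - storedMs > (int64_t)maxAgeSecs * 1000 )
		return -1; // "not found": the hammer check is skipped
	return storedMs;   // -1 maxAge: the timestamp always survives
}
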
@@ -2794,6 +2813,11 @@ bool addToHammerQueue ( Msg13Request *r ) {
 
 	int32_t crawlDelayMS = r->m_crawlDelayMS;
 
+	if ( g_conf.m_logDebugSpider )
+		log(LOG_DEBUG,"spider: got timestamp of %"INT64" from "
+		    "hammercache (waited=%"INT64") for %s",last,waited,
+		    iptoa(r->m_firstIp));
+
 	// . if we got a proxybackoff base it on # of banned proxies for urlIp
 	// . try to be more sensitive for more sensitive website policies
 	// . we don't know why this proxy was banned, or if we were

Msg13.h (2 changed lines)
@@ -56,6 +56,8 @@ public:
 	char *m_proxiedUrl;
 	int32_t m_proxiedUrlLen;
 
+	int64_t m_downloadStartTimeMS;
+
 	char m_niceness;
 	int32_t m_ifModifiedSince;
 	int32_t m_maxCacheAge;

RdbCache.cpp (14 changed lines)
@@ -152,6 +152,20 @@ bool RdbCache::init ( int32_t maxMem ,
 	// . make the 128MB buffers
 	// . if we do more than 128MB per buf then pthread_create() will fail
 	int32_t bufMem = m_maxMem - m_memAlloced;
+	if( bufMem <= 0 ) {
+		log("rdbcache: cache for %s does not have enough mem. fix "
+		    "by increasing maxmem or number of recs, etc.",m_dbname);
+		char *xx=NULL;*xx=0;
+	}
+	if ( bufMem && m_fixedDataSize > 0 &&
+	     bufMem / m_fixedDataSize < maxRecs / 2 ) {
+		log("cache: warning. "
+		    "cache for %s can have %i ptrs but buf mem "
+		    "can only hold %i objects"
+		    ,m_dbname
+		    ,(int)maxRecs
+		    ,(int)(bufMem/m_fixedDataSize));
+	}
 	m_totalBufSize = 0LL;
 	m_offset = 0LL;
 	while ( bufMem > 0 && m_numBufs < 32 ) {
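
These two checks are the heart of the commit: a fatal one when the node/ptr bookkeeping has already consumed the whole memory budget (the deliberate NULL write, char *xx=NULL;*xx=0;, is this codebase's crash-and-core idiom), and a non-fatal warning when the buffers can hold fewer than half the records the node table can index. A compilable distillation, with simplified names standing in for the RdbCache members:

#include <cstdio>
#include <cstdint>
#include <cstdlib>

static void checkBufMem ( int32_t maxMem , int32_t memAlloced ,
                          int32_t fixedDataSize , int32_t maxRecs ,
                          const char *dbname ) {
	int32_t bufMem = maxMem - memAlloced;
	// fatal: no memory left for record buffers -> an "empty" cache
	if ( bufMem <= 0 ) {
		fprintf ( stderr , "rdbcache: cache for %s does not "
			  "have enough mem\n" , dbname );
		abort(); // stands in for the char *xx=NULL;*xx=0; idiom
	}
	// warning: node table indexes far more records than bufMem holds
	if ( fixedDataSize > 0 && bufMem / fixedDataSize < maxRecs / 2 )
		fprintf ( stderr , "cache: warning. cache for %s can have "
			  "%d ptrs but buf mem can only hold %d objects\n" ,
			  dbname , (int)maxRecs ,
			  (int)( bufMem / fixedDataSize ) );
}
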

@@ -2073,8 +2073,8 @@ bool SpiderColl::isInDupCache ( SpiderRequest *sreq , bool addToCache ) {
 	// init dup cache?
 	if ( ! m_dupCache.isInitialized() )
 		// use 50k i guess of 64bit numbers and linked list info
-		m_dupCache.init ( 50000,
-				  4 , // fixeddatasize (don't really need this)
+		m_dupCache.init ( 90000,
+				  8 , // fixeddatasize (don't really need this)
				  false, // list support?
				  5000, // maxcachenodes
				  false, // usehalfkeys?
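
The size bump here pairs with what looks like a type-width fix: the comment above the call already says the dup cache holds 64-bit numbers, yet fixeddatasize was 4, so each record was budgeted at half its real size; sizing it at 8 also keeps the new RdbCache buf-mem warning honest, since that warning divides bufMem by m_fixedDataSize. A trivial compile-time check of the width assumption:

#include <cstdint>

static_assert ( sizeof ( int64_t ) == 8 ,
                "dup cache records are 64-bit, so fixeddatasize must be 8" );
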
@@ -5257,7 +5257,7 @@ void SpiderLoop::startLoop ( ) {
 	m_lockTable.set ( 8,sizeof(UrlLock),0,NULL,0,false,MAX_NICENESS,
			  "splocks", true ); // useKeyMagic? yes.
 
-	if ( ! m_lockCache.init ( 10000 , // maxcachemem
+	if ( ! m_lockCache.init ( 20000 , // maxcachemem
				  4 , // fixedatasize
				  false , // supportlists?
				  1000 , // maxcachenodes