A few core dump fixes. Get crawl-delay working a little; about halfway done.
This commit is contained in:
Matt Wells
2013-10-22 15:44:10 -07:00
parent 8c3a61f070
commit 8f5bb4a787
5 changed files with 83 additions and 40 deletions

@@ -257,11 +257,13 @@ bool Collectiondb::addRec ( char *coll , char *cpc , long cpclen , bool isNew ,
if ( i >= m_numRecs &&
(i+1)*4 > m_recPtrBuf.getCapacity() ) {
long need = (i+1)*sizeof(CollectionRec *);
long have = m_recPtrBuf.getLength();//Capacity();
long have = m_recPtrBuf.getLength();
need -= have;
// true here means to clear the new space to zeroes
if ( ! m_recPtrBuf.reserve ( need ,NULL, true ) )
return log("admin: error growing rec ptr buf");
// don't forget to do this...
m_recPtrBuf.setLength ( need );
}
// re-ref it in case it is different
m_recs = (CollectionRec **)m_recPtrBuf.getBufStart();
@@ -681,6 +683,21 @@ bool Collectiondb::resetColl ( char *coll , bool resetTurkdb ) {
char *xx=NULL;*xx=0;
}
// inc the rec ptr buf i guess
long need = (m_numRecs+1)*sizeof(CollectionRec *);
long have = m_recPtrBuf.getLength();
need -= have;
// true here means to clear the new space to zeroes
if ( ! m_recPtrBuf.reserve ( need ,NULL, true ) )
return log("admin: error growing rec ptr buf2.");
// re-ref it in case it is different
m_recs = (CollectionRec **)m_recPtrBuf.getBufStart();
// ensure last is NULL
m_recs[m_numRecs] = NULL;
// update length of used bytes
m_recPtrBuf.setLength ( need );
/*
// make sure an update not in progress
if ( cr->m_inProgress ) { char *xx=NULL;*xx=0; }
@@ -745,7 +762,7 @@ bool Collectiondb::resetColl ( char *coll , bool resetTurkdb ) {
collnum_t oldCollnum = cr->m_collnum;
collnum_t newCollnum = m_numRecs;
// reset spider info
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
if ( sc ) {
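
Both addRec() and resetColl() now grow m_recPtrBuf the same way: reserve the extra bytes with clearIt=true so the new slots come back zeroed, call setLength() so those slots count as used, re-read getBufStart() because reserve() may move the buffer, and keep the array NULL-terminated. A minimal self-contained sketch of that pattern using plain realloc (growPtrArray is illustrative only, not code from this repo):

#include <cstdlib>
#include <cstring>

// Grow a NULL-terminated pointer array so slot numRecs is valid.
// Assumes oldCount <= numRecs. Mirrors the reserve(clearIt=true) /
// setLength() / getBufStart() sequence used in this commit.
static void **growPtrArray ( void **recs , long oldCount , long numRecs ) {
	size_t need = (numRecs + 1) * sizeof(void *);
	void **nb = (void **)realloc ( recs , need );
	if ( ! nb ) return NULL;
	// realloc may move the block, so callers must use the returned
	// pointer; zero everything past the slots already in use
	memset ( nb + oldCount , 0 , (numRecs + 1 - oldCount) * sizeof(void *) );
	// the last slot is now NULL so scans can stop there
	return nb;
}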

@@ -15793,6 +15793,8 @@ void Parms::overlapTest ( char step ) {
// because that modifies another parm, "spidering enabled"
if ( m_parms[i].m_type == TYPE_BOOL2 ) continue;
if ( m_parms[i].m_type == TYPE_SAFEBUF ) continue;
p1 = NULL;
if ( m_parms[i].m_obj == OBJ_COLL ) p1 = (char *)&tmpcr;
if ( m_parms[i].m_obj == OBJ_CONF ) p1 = (char *)&tmpconf;
@@ -15833,6 +15835,8 @@ void Parms::overlapTest ( char step ) {
// because that modifies another parm, "spidering enabled"
if ( m_parms[i].m_type == TYPE_BOOL2 ) continue;
if ( m_parms[i].m_type == TYPE_SAFEBUF ) continue;
p1 = NULL;
if ( m_parms[i].m_obj == OBJ_COLL ) p1 = (char *)&tmpcr;
if ( m_parms[i].m_obj == OBJ_CONF ) p1 = (char *)&tmpconf;

@@ -312,8 +312,8 @@ bool SafeBuf::reserve(long i, char *label, bool clearIt ) {
memcpy(m_buf, tmpBuf, m_length);
// reset to 0's?
if ( clearIt ) {
long clearSize = m_capacity - m_length;
memset(m_buf+m_length,0,clearSize);
long clearSize = m_capacity - tmpCap;
memset(m_buf+tmpCap,0,clearSize);
}
m_usingStack = false;
return true;
@@ -326,13 +326,18 @@ bool SafeBuf::reserve(long i, char *label, bool clearIt ) {
m_capacity = tmpCap;
return false;
}
// reset to 0's?
if ( clearIt ) {
long clearSize = m_capacity - tmpCap;
memset(m_buf+tmpCap,0,clearSize);
}
log(LOG_DEBUG, "query: resize safebuf %li to %li",
tmpCap, m_capacity);
}
// reset to 0's?
if ( ! clearIt ) return true;
long clearSize = m_capacity - m_length;
memset(m_buf+m_length,0,clearSize);
//if ( ! clearIt ) return true;
//long clearSize = m_capacity - m_length;
//memset(m_buf+m_length,0,clearSize);
return true;
}
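
The reserve() fix zeroes only the bytes past the old capacity (tmpCap) rather than past m_length, and does so on both the stack-to-heap copy path and the realloc path, so a caller that asks for cleared growth really gets a zeroed tail. A standalone sketch of that zero-fill-on-grow idea (growAndClear is illustrative only, not the SafeBuf API):

#include <cstdlib>
#include <cstring>

// Grow *buf from oldCap to newCap bytes (newCap > oldCap) and zero
// just the newly allocated tail, mirroring the clearIt behaviour
// patched above: bytes below oldCap were already initialized.
static char *growAndClear ( char *buf , long oldCap , long newCap ) {
	char *nb = (char *)realloc ( buf , newCap );
	if ( ! nb ) return NULL;
	memset ( nb + oldCap , 0 , newCap - oldCap );
	return nb;
}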

@@ -1050,7 +1050,7 @@ bool SpiderColl::load ( ) {
if (!m_sniTable.set ( 4,8,5000,NULL,0,false,MAX_NICENESS,"snitbl") )
return false;
if (!m_cdTable.set (4,8,3000,NULL,0,false,MAX_NICENESS,"cdtbl"))
if (!m_cdTable.set (4,4,3000,NULL,0,false,MAX_NICENESS,"cdtbl"))
return false;
// doledb seems to have like 32000 entries in it
if (!m_doleIpTable.set(4,4,128000,NULL,0,false,MAX_NICENESS,"doleip"))
@@ -1588,24 +1588,22 @@ bool SpiderColl::addSpiderReply ( SpiderReply *srep ) {
// . -1 implies an invalid or unknown crawl delay
if ( srep->m_crawlDelayMS >= 0 ) {
// use the domain hash for this guy! since its from robots.txt
uint64_t *cdp ;
cdp = (uint64_t *)m_cdTable.getValue32(srep->m_domHash32);
long *cdp = (long *)m_cdTable.getValue32(srep->m_domHash32);
// update it only if better or empty
bool update = false;
if ( ! cdp )
update = true;
else if (((*cdp)&0xffffffff)<(uint32_t)srep->m_spideredTime)
update = true;
if ( ! cdp ) update = true;
//else if (((*cdp)&0xffffffff)<(uint32_t)srep->m_spideredTime)
// update = true;
// update m_sniTable if we should
if ( update ) {
// . make new data for this key
// . lower 32 bits is the addedTime
// . upper 32 bits is the siteNumInlinks
uint64_t nv = (uint32_t)(srep->m_crawlDelayMS);
// . lower 32 bits is the spideredTime
// . upper 32 bits is the crawldelay
long nv = (long)(srep->m_crawlDelayMS);
// shift up
nv <<= 32;
//nv <<= 32;
// or in time
nv |= (uint32_t)srep->m_spideredTime;
//nv |= (uint32_t)srep->m_spideredTime;
// just direct update if faster
if ( cdp ) *cdp = nv;
// store it anew otherwise
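
With this commit m_cdTable maps the 32-bit domain hash straight to the crawl delay in milliseconds as a 4-byte long (hence the 4,8 to 4,4 data-size change in SpiderColl::load() above) instead of the packed 64-bit crawl-delay/spideredTime value it used to hold, and an entry is only written when the domain has no value yet. A rough equivalent of the new semantics using std::unordered_map (illustrative only, not the HashTableX API):

#include <cstdint>
#include <unordered_map>

// domain hash -> robots.txt crawl delay in milliseconds
static std::unordered_map<int32_t,int32_t> s_cdTable;

static void noteCrawlDelay ( int32_t domHash32 , int32_t crawlDelayMS ) {
	// -1 means invalid/unknown, so ignore it
	if ( crawlDelayMS < 0 ) return;
	// only store if nothing is there yet; the "update if newer"
	// branch is commented out in this commit
	if ( s_cdTable.count ( domHash32 ) ) return;
	s_cdTable[domHash32] = crawlDelayMS;
}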
@@ -1785,7 +1783,8 @@ bool SpiderColl::addSpiderRequest ( SpiderRequest *sreq ,
// spiders disabled for this row in url filters?
if ( ! m_cr->m_spidersEnabled[ufn] ) {
if ( g_conf.m_logDebugSpider )
log("spider: request spidersoff ufn=%li",ufn);
log("spider: request spidersoff ufn=%li url=%s",ufn,
sreq->m_url);
return true;
}
@@ -3513,12 +3512,12 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
uint64_t nowGlobalMS ) {
// . get the scheduled spiderTime for it
// . assume this SpiderRequest never been successfully spidered
uint64_t spiderTimeMS = ((uint64_t)sreq->m_addedTime) * 1000LL;
long long spiderTimeMS = ((uint64_t)sreq->m_addedTime) * 1000LL;
// if injecting for first time, use that!
if ( ! srep && sreq->m_isInjecting ) return spiderTimeMS;
// to avoid hammering an ip, get last time we spidered it...
uint64_t lastMS ;
long long lastMS ;
lastMS = m_lastDownloadCache.getLongLong ( m_collnum ,
sreq->m_firstIp ,
-1 , // maxAge
@@ -3531,16 +3530,18 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
lastMS = 0;
}
// min time we can spider it
uint64_t minSpiderTimeMS = lastMS + m_cr->m_spiderIpWaits[ufn];
long long minSpiderTimeMS1 = lastMS + m_cr->m_spiderIpWaits[ufn];
// if not found in cache
if ( lastMS == (uint64_t)-1 ) minSpiderTimeMS = 0LL;
if ( lastMS == -1 ) minSpiderTimeMS1 = 0LL;
/////////////////////////////////////////////////
/////////////////////////////////////////////////
// TODO: put crawldelay table check in here!!!!
// crawldelay table check!!!!
/////////////////////////////////////////////////
/////////////////////////////////////////////////
long *cdp = (long *)m_cdTable.getValue ( &sreq->m_domHash32 );
long long minSpiderTimeMS2 = 0;
if ( cdp && *cdp >= 0 ) minSpiderTimeMS2 = lastMS + *cdp;
// wait 5 seconds for all outlinks in order for them to have a
// chance to get any link info that might have been added
@@ -3551,7 +3552,8 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
//spiderTimeMS += 5000;
// ensure min
if ( spiderTimeMS < minSpiderTimeMS ) spiderTimeMS = minSpiderTimeMS;
if ( spiderTimeMS < minSpiderTimeMS1 ) spiderTimeMS = minSpiderTimeMS1;
if ( spiderTimeMS < minSpiderTimeMS2 ) spiderTimeMS = minSpiderTimeMS2;
// if no reply, use that
if ( ! srep ) return spiderTimeMS;
// if this is not the first try, then re-compute the spiderTime
@@ -3559,33 +3561,33 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
// sanity check
if ( srep->m_spideredTime <= 0 ) {
// a lot of times these are corrupt! wtf???
spiderTimeMS = minSpiderTimeMS;
//spiderTimeMS = minSpiderTimeMS;
return spiderTimeMS;
//{ char*xx=NULL;*xx=0;}
}
// compute new spiderTime for this guy, in seconds
uint64_t waitInSecs = (uint64_t)(m_cr->m_spiderFreqs[ufn]*3600*24.0);
// do not spider more than once per 15 minutes ever!
long long waitInSecs = (uint64_t)(m_cr->m_spiderFreqs[ufn]*3600*24.0);
// do not spider more than once per 15 seconds ever!
// no! might be a query reindex!!
if ( waitInSecs < 900 && ! sreq->m_urlIsDocId ) {
if ( waitInSecs < 15 && ! sreq->m_urlIsDocId ) {
static bool s_printed = false;
if ( ! s_printed ) {
s_printed = true;
log("spider: min spider wait is 900, "
log("spider: min spider wait is 15 seconds, "
"not %llu (ufn=%li)",waitInSecs,ufn);
}
waitInSecs = 900;
waitInSecs = 15;//900; this was 15 minutes
}
// in fact, force docid based guys to be zero!
if ( sreq->m_urlIsDocId ) waitInSecs = 0;
// when it was spidered
uint64_t lastSpideredMS = ((uint64_t)srep->m_spideredTime) * 1000;
long long lastSpideredMS = ((uint64_t)srep->m_spideredTime) * 1000;
// . when we last attempted to spider it... (base time)
// . use a lastAttempt of 0 to indicate never!
// (first time)
spiderTimeMS = lastSpideredMS + (waitInSecs * 1000LL);
long long minSpiderTimeMS3 = lastSpideredMS + (waitInSecs * 1000LL);
// ensure min
if ( spiderTimeMS < minSpiderTimeMS ) spiderTimeMS = minSpiderTimeMS;
if ( spiderTimeMS < minSpiderTimeMS3 ) spiderTimeMS = minSpiderTimeMS3;
// sanity
if ( (long long)spiderTimeMS < 0 ) { char *xx=NULL;*xx=0; }
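
getSpiderTimeMS() now takes the latest of three lower bounds: the last download time for the IP plus the per-url-filter wait, the last download time plus the robots.txt crawl delay looked up in m_cdTable, and the last successful spider time plus the respider frequency (with the hard floor dropped from 900 seconds to 15). A compact sketch of that max-of-constraints calculation (earliestSpiderTimeMS and its parameters are assumptions, not the function's real signature):

#include <algorithm>

// Illustrative only: combine the three lower bounds the patched
// getSpiderTimeMS() enforces. All times are in milliseconds.
static long long earliestSpiderTimeMS ( long long addedTimeMS ,
                                        long long lastIpDownloadMS , // -1 = unknown
                                        long long ipWaitMS ,         // per url filter
                                        long long crawlDelayMS ,     // -1 = none in m_cdTable
                                        long long lastSpideredMS ,   // 0 = never spidered
                                        long long respiderWaitMS ) {
	long long t = addedTimeMS;
	if ( lastIpDownloadMS >= 0 ) {
		t = std::max ( t , lastIpDownloadMS + ipWaitMS );
		if ( crawlDelayMS >= 0 )
			t = std::max ( t , lastIpDownloadMS + crawlDelayMS );
	}
	if ( lastSpideredMS > 0 )
		t = std::max ( t , lastSpideredMS + respiderWaitMS );
	return t;
}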
@@ -8824,6 +8826,8 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
if ( isForMsg20 ) continue;
// reply based
long a = 0;
// if no spider reply we can't match this rule!
if ( ! srep ) continue;
// shortcut
if ( srep ) a = srep->m_spideredTime;
// make it point to the retry count

@@ -11784,11 +11784,11 @@ bool isAllowed2 ( Url *url ,
if ( flen == 11 && strncasecmp ( f , "crawl-delay", 11 ) == 0 ) {
// set flag
flag = 1;
// skip if invalid
if ( ! is_digit ( *v ) ) goto urlLoop;
// skip if invalid. it could be ".5" seconds
if ( ! is_digit ( *v ) && *v != '.' ) goto urlLoop;
// get this. multiply crawl delay by x1000 to be in
// milliseconds/ms
long long vv = atoll(v) * 1000LL;
long long vv = atof(v) * 1000LL;
// truncate to 0x7fffffff
if ( vv > 0x7fffffff ) *crawlDelay = 0x7fffffff;
else if ( vv < 0 ) *crawlDelay = -1;
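
Crawl-delay values such as ".5" are now accepted (a leading '.' no longer skips the line) and parsed with atof() so fractional seconds survive the conversion to milliseconds, then clamped to at most 0x7fffffff with anything negative treated as unknown. A small standalone sketch of that parse-and-clamp (parseCrawlDelayMS is illustrative only):

#include <cctype>
#include <cstdlib>

// Parse a robots.txt Crawl-delay value in seconds, possibly
// fractional like ".5", into milliseconds; -1 means unusable.
static long parseCrawlDelayMS ( const char *v ) {
	if ( ! std::isdigit ( (unsigned char)*v ) && *v != '.' ) return -1;
	long long ms = (long long)( atof ( v ) * 1000.0 );
	if ( ms > 0x7fffffff ) return 0x7fffffff;
	if ( ms < 0 ) return -1;
	return (long)ms;
}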
@@ -12024,7 +12024,20 @@ bool *XmlDoc::getIsAllowed ( ) {
m_isAllowed = true;
m_isAllowedValid = true;
// put in a crawldelay test for diffbot
/*
SafeBuf tmp;
if ( strstr(m_firstUrl.getUrl(),"diffbot.com") ) {
tmp.safePrintf("User-Agent: *\n"
"Crawl-Delay: 10.1\n"
);
content = tmp.getBufStart();
contentLen = tmp.getLength();
}
// if not success, assume no robots.txt
else*/
if ( mime->getHttpStatus() != 200 ) {
// nuke it to save mem
nukeDoc ( ed );
@@ -12070,7 +12083,7 @@ bool *XmlDoc::getIsAllowed ( ) {
&cacheLen ,
&hadAllowOrDisallow );
// bring back?
if ( savedCrawlDelay ) m_crawlDelay = savedCrawlDelay;
if ( savedCrawlDelay != -1 ) m_crawlDelay = savedCrawlDelay;
// nuke it to save mem
nukeDoc ( ed );
// we are legit