crawldelay works now but it measures

from the end of the download, not the beginning.
Matt Wells
2013-11-26 12:58:14 -08:00
parent 1c7c9a4d80
commit 8bb086ac60
3 changed files with 121 additions and 22 deletions
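
For context on the commit message, a minimal sketch of the timing difference with invented numbers (none of these values come from the commit): counting the crawl delay from the end of the previous download stretches the gap between request starts by the download time itself.

#include <cstdio>

int main ( ) {
	// hypothetical values for illustration only
	long long crawlDelayMS   = 1000; // delay requested for this IP
	long long downloadTimeMS = 400;  // duration of the previous fetch
	long long startedAtMS    = 0;    // previous fetch started at t=0
	// next fetch if the delay is measured from the *beginning* of the
	// previous download
	long long fromBegin = startedAtMS + crawlDelayMS;
	// next fetch if measured from the *end* (the behavior the commit
	// message describes)
	long long fromEnd   = startedAtMS + downloadTimeMS + crawlDelayMS;
	printf ( "from begin: t=%lldms  from end: t=%lldms\n",
	         fromBegin , fromEnd );
	return 0;
}
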

@@ -528,7 +528,9 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
bool queueIt = false;
if ( last > 0 && waited < r->m_crawlDelayMS ) queueIt = true;
// a "last" of 0 means currently downloading
if ( r->m_crawlDelayMS > 0 && last == 0LL ) queueIt = true;
// a last of -1 means not found. so first time i guess.
if ( last == -1 ) queueIt = false;
// . queue it up if we haven't waited long enough
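
Read as a whole, this hunk classifies the per-IP hammer-cache lookup into three states and decides whether the request must wait. A condensed sketch of that decision, with simplified names (not the actual Msg13 code):

// Sketch only. "last" is the value from the per-firstIp hammer cache:
//   -1  = no entry yet, this is the first request for the IP
//    0  = a download from this IP is in progress right now
//   >0  = timestamp (ms) when the last download finished
bool shouldQueue ( long long last ,
                   long long waitedMS ,
                   long      crawlDelayMS ) {
	bool queueIt = false;
	// finished a download, but the crawl delay has not elapsed yet
	if ( last > 0 && waitedMS < crawlDelayMS ) queueIt = true;
	// a download is currently in flight and this IP has a crawl delay
	if ( crawlDelayMS > 0 && last == 0LL ) queueIt = true;
	// never seen this IP before, so no need to wait
	if ( last == -1 ) queueIt = false;
	return queueIt;
}
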
@@ -537,6 +539,8 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
// . it will just lookup the lastdownload time in the cache,
// which will store maybe a -1 if currently downloading...
if ( queueIt ) {
// debug
//log("spider: adding %s to crawldelayqueue",r->m_url);
// save this
r->m_udpSlot = slot;
r->m_nextLink = NULL;
@@ -545,17 +549,20 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
s_hammerQueueHead = r;
s_hammerQueueTail = r;
}
else
else {
s_hammerQueueTail->m_nextLink = r;
s_hammerQueueTail = r;
}
return;
}
// if we had it in cache check the wait time
if ( last > 0 && waited < 400 ) {
if ( last > 0 && waited < r->m_crawlDelayMS ) {
log("spider: hammering firstIp=%s url=%s "
"only waited %lli ms",
iptoa(r->m_firstIp),r->m_url,waited);
"only waited %lli ms of %li ms",
iptoa(r->m_firstIp),r->m_url,waited,
r->m_crawlDelayMS);
// this guy has too many redirects and it fails us...
// BUT do not core if running live, only if for test
// collection
@@ -645,17 +652,6 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
}
// are we the first?
bool firstInLine = s_rt.isEmpty ( &r->m_cacheKey );
// wait in line cuz someone else downloading it now
if ( ! s_rt.addKey ( &r->m_cacheKey , &r ) ) {
g_udpServer.sendErrorReply(slot,g_errno);
return;
}
// this means our callback will be called
if ( ! firstInLine ) return;
// do not get .google.com/ crap
//if ( strstr(r->m_url,".google.com/") ) { char *xx=NULL;*xx=0; }
@@ -664,6 +660,20 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
void downloadTheDocForReals ( Msg13Request *r ) {
// are we the first?
bool firstInLine = s_rt.isEmpty ( &r->m_cacheKey );
// wait in line cuz someone else downloading it now
if ( ! s_rt.addKey ( &r->m_cacheKey , &r ) ) {
g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
return;
}
// this means our callback will be called
if ( ! firstInLine ) {
//log("spider: inlining %s",r->m_url);
return;
}
// . store time now
// . no, now we store 0 to indicate in progress, then we
// will overwrite it with a timestamp when the download completes
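
The hunk above relocates the "wait in line" bookkeeping into downloadTheDocForReals(): the first request for a given cache key performs the download, and later requests only register themselves and get answered by the same reply. A rough sketch of that pattern, using a std::multimap as a stand-in for the s_rt table (an approximation, not the real hash table):

#include <map>
#include <string>
#include <utility>
#include <vector>

// Sketch of the wait-in-line idea: the first request for a cache key does
// the download; later requests for the same key just register themselves
// and are answered from the same reply when it arrives.
static std::multimap<std::string,int> s_waiters; // cacheKey -> requestId

// returns true if the caller should start the download itself
bool registerRequest ( const std::string &cacheKey , int requestId ) {
	bool firstInLine = ( s_waiters.count ( cacheKey ) == 0 );
	s_waiters.insert ( std::make_pair ( cacheKey , requestId ) );
	// not first? someone else is already downloading this url and our
	// callback will be called when that download completes
	return firstInLine;
}

// when the download finishes, hand the reply to every registered waiter
std::vector<int> takeWaiters ( const std::string &cacheKey ) {
	std::vector<int> ids;
	auto range = s_waiters.equal_range ( cacheKey );
	for ( auto it = range.first ; it != range.second ; ++it )
		ids.push_back ( it->second );
	s_waiters.erase ( range.first , range.second );
	return ids;
}
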
@@ -2166,20 +2176,39 @@ void scanHammerQueue ( int fd , void *state ) {
long long nowms = gettimeofdayInMilliseconds();
Msg13Request *prev = NULL;
long long waited = -1LL;
// scan down the linked list of queued of msg13 requests
for ( ; r ; r = r->m_nextLink ) {
for ( ; r ; prev = r , r = r->m_nextLink ) {
long long last;
last = s_hammerCache.getLongLong(0,r->m_firstIp,30,true);
// is one from this ip outstanding?
if ( last == 0LL ) continue;
// download finished?
if ( last > 0 ) {
long long waited = nowms - last;
waited = nowms - last;
// but skip if haven't waited long enough
if ( waited < r->m_crawlDelayMS ) continue;
}
// debug
//log("spider: downloading %s from crawldelay queue "
// "waited=%llims crawldelay=%lims",
// r->m_url,waited,r->m_crawlDelayMS);
// good to go
downloadTheDocForReals ( r );
//
// remove from future scans
//
if ( prev )
prev->m_nextLink = r->m_nextLink;
if ( s_hammerQueueHead == r )
s_hammerQueueHead = r->m_nextLink;
if ( s_hammerQueueTail == r )
s_hammerQueueTail = prev;
// try to download some more i guess...
}
}
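
The crawl-delay queue itself is a singly linked list tracked by s_hammerQueueHead and s_hammerQueueTail, and scanHammerQueue() now unlinks a request once it launches its download. A generic sketch of that enqueue / unlink-while-scanning pattern (simplified node type, not the real Msg13Request):

// Sketch of the queue structure, not the actual Msg13 code.
struct Node {
	long long m_firstIp;
	Node     *m_nextLink;
};

static Node *s_head = 0;
static Node *s_tail = 0;

// append at the tail, as handleRequest13() does when it queues a request
void enqueue ( Node *n ) {
	n->m_nextLink = 0;
	if ( ! s_head ) { s_head = n; s_tail = n; return; }
	s_tail->m_nextLink = n;
	s_tail = n;
}

// scan the list and unlink every node that is ready, as scanHammerQueue()
// does on its timer
void scanQueue ( bool (*ready)(Node *) , void (*launch)(Node *) ) {
	Node *prev = 0;
	Node *next = 0;
	for ( Node *n = s_head ; n ; n = next ) {
		next = n->m_nextLink;
		if ( ! ready ( n ) ) { prev = n; continue; }
		// unlink n, fixing head/tail as needed
		if ( prev ) prev->m_nextLink = next;
		if ( s_head == n ) s_head = next;
		if ( s_tail == n ) s_tail = prev;
		launch ( n );
		// prev stays where it was, since n is no longer in the list
	}
}
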

@@ -11982,6 +11982,67 @@ bool isAllowed2 ( Url *url ,
goto urlLoop;
}
// when doing a custom crawl we have to decide between the provided crawl
// delay, and the one in the robots.txt...
long *XmlDoc::getFinalCrawlDelay() {
if ( m_finalCrawlDelayValid )
return &m_finalCrawlDelay;
bool *isAllowed = getIsAllowed();
if ( ! isAllowed || isAllowed == (void *)-1 ) return (long *)isAllowed;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
m_finalCrawlDelayValid = true;
// getIsAllowed already sets m_crawlDelayValid to true
if ( ! cr->m_isCustomCrawl ) {
m_finalCrawlDelay = m_crawlDelay;
// default to 250ms i guess if none specified in robots
// just to be somewhat nice by default
if ( m_crawlDelay < 0 ) m_finalCrawlDelay = 250;
return &m_finalCrawlDelay;
}
// get manually specified crawl delay in seconds. convert to ms.
long manual = cr->m_collectiveCrawlDelay * 1000.0;
// negative means -1 means unknown or not specified
if ( manual < 0 ) manual = -1;
// if both are unknown...
if ( m_crawlDelay == -1 && manual == -1 ) {
m_finalCrawlDelay = -1;
return &m_finalCrawlDelay;
}
// if not in robots.txt use manual
if ( m_crawlDelay == -1 ) {
m_finalCrawlDelay = manual;
return &m_finalCrawlDelay;
}
// if manually provided crawldelay is -1, use robots.txt then
if ( manual == -1 ) {
m_finalCrawlDelay = m_crawlDelay;
return &m_finalCrawlDelay;
}
// let robots.txt dictate if both are >= 0
if ( m_useRobotsTxt ) {
m_finalCrawlDelay = m_crawlDelay;
return &m_finalCrawlDelay;
}
// if not using robots.txt, pick the smallest
if ( m_crawlDelay < manual ) m_finalCrawlDelay = m_crawlDelay;
else m_finalCrawlDelay = manual;
return &m_finalCrawlDelay;
}
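
To make the precedence in getFinalCrawlDelay() concrete for the custom-crawl branch, here is a hedged sketch with a few invented example values (all in milliseconds, -1 meaning "not specified"); it mirrors only that branch, not the non-custom path above, which simply defaults a missing robots.txt delay to 250 ms.

#include <cassert>

// Sketch of the precedence described above; not the XmlDoc member itself.
// robotsMS = crawl delay from robots.txt in ms, -1 if none
// manualMS = custom-crawl delay in ms, -1 if none
long finalCrawlDelay ( long robotsMS , long manualMS , bool useRobotsTxt ) {
	if ( robotsMS == -1 && manualMS == -1 ) return -1;   // both unknown
	if ( robotsMS == -1 ) return manualMS;               // manual fills in
	if ( manualMS == -1 ) return robotsMS;               // robots fills in
	if ( useRobotsTxt   ) return robotsMS;               // robots.txt wins
	return ( robotsMS < manualMS ) ? robotsMS : manualMS; // smaller one
}

int main ( ) {
	assert ( finalCrawlDelay ( -1   , -1   , true  ) == -1   );
	assert ( finalCrawlDelay ( -1   , 2000 , true  ) == 2000 );
	assert ( finalCrawlDelay ( 5000 , -1   , true  ) == 5000 );
	assert ( finalCrawlDelay ( 5000 , 2000 , true  ) == 5000 );
	assert ( finalCrawlDelay ( 5000 , 2000 , false ) == 2000 );
	return 0;
}
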
// . get the Robots.txt and see if we are allowed
// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
@@ -12025,6 +12086,9 @@ bool *XmlDoc::getIsAllowed ( ) {
if ( isRobotsTxt ) {
m_isAllowed = true;
m_isAllowedValid = true;
m_crawlDelayValid = true;
// make it super fast...
m_crawlDelay = 0;
return &m_isAllowed;
}
@@ -13550,6 +13614,9 @@ char **XmlDoc::getHttpReply2 ( ) {
// this must be valid, since we share m_msg13 with it
if ( ! m_isAllowedValid ) { char *xx=NULL;*xx=0; }
long *cd = getFinalCrawlDelay();
if ( ! cd || cd == (void *)-1 ) return (char **)cd;
// we might bail
if ( ! *isAllowed ) {
m_httpReplyValid = true;
@@ -13678,11 +13745,11 @@ char **XmlDoc::getHttpReply2 ( ) {
r->m_ifModifiedSince = 0;
r->m_skipHammerCheck = 0;
// . this is -1 if none found in robots.txt etc.
// . if not using robots.txt it will always be -1
// . it should also be -1 for the robots.txt file itself
if ( m_crawlDelayValid ) r->m_crawlDelayMS = m_crawlDelay;
else r->m_crawlDelayMS = -1;
// . this is -1 if unknown. none found in robots.txt or provided
// in the custom crawl parms.
// . it should also be 0 for the robots.txt file itself
r->m_crawlDelayMS = *cd;
// need this in order to get all languages, etc. and avoid having
// to set words class at the spider compression proxy level

@@ -269,6 +269,7 @@ class XmlDoc {
uint32_t m_internalFlags1;
long m_ip;
long m_crawlDelay;
long m_finalCrawlDelay;
// . use this to quickly detect if doc is unchanged
// . we can avoid setting Xml and Words classes etc...
long m_contentHash32;
@@ -630,6 +631,7 @@ class XmlDoc {
long *getIp ( ) ;
long *gotIp ( bool save ) ;
bool *getIsAllowed ( ) ;
long *getFinalCrawlDelay();
//long getTryAgainTimeDelta() {
// if ( ! m_tryAgainTimeDeltaValid ) { char *xx=NULL;*xx=0;}
// return m_tryAgainTimeDelta;
@@ -1169,6 +1171,7 @@ class XmlDoc {
*/
bool m_httpStatusValid;
bool m_crawlDelayValid;
bool m_finalCrawlDelayValid;
bool m_titleRecKeyValid;
bool m_adVectorValid;
bool m_wikiDocIdsValid;