forked from Mirrors/privacore-open-source-search-engine
crawldelay works now but it measures from the end of the download, not the beginning.
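In other words, the per-IP timestamp in s_hammerCache is written when a download finishes, so the enforced gap runs from the end of the previous fetch rather than from its start. A minimal sketch of that behavior, using hypothetical helper names (recordDownloadFinished and okToDownloadNow are illustrations, not functions from this codebase):

#include <cstdint>
#include <map>

// Hypothetical stand-in for s_hammerCache: firstIp -> time the last
// download from that IP *completed* (not when it started).
static std::map<uint32_t, int64_t> s_lastFinishedMs;

// Called when a download completes; this is what makes the crawl delay
// measure from the end of the download.
void recordDownloadFinished ( uint32_t firstIp , int64_t nowMs ) {
        s_lastFinishedMs[firstIp] = nowMs;
}

// Returns true if enough time has passed since the last completed download.
bool okToDownloadNow ( uint32_t firstIp , int64_t nowMs , int64_t crawlDelayMs ) {
        auto it = s_lastFinishedMs.find ( firstIp );
        if ( it == s_lastFinishedMs.end() ) return true;   // first time, go ahead
        int64_t waited = nowMs - it->second;               // measured from the END
        return waited >= crawlDelayMs;
}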
Msg13.cpp (63 lines changed)
@@ -528,7 +528,9 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
bool queueIt = false;
if ( last > 0 && waited < r->m_crawlDelayMS ) queueIt = true;
// a "last" of 0 means currently downloading
if ( r->m_crawlDelayMS > 0 && last == 0LL ) queueIt = true;
// a last of -1 means not found. so first time i guess.
if ( last == -1 ) queueIt = false;
// . queue it up if we haven't waited long enough
@@ -537,6 +539,8 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
// . it will just lookup the lastdownload time in the cache,
// which will store maybe a -1 if currently downloading...
if ( queueIt ) {
// debug
//log("spider: adding %s to crawldelayqueue",r->m_url);
// save this
r->m_udpSlot = slot;
r->m_nextLink = NULL;
@@ -545,17 +549,20 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
s_hammerQueueHead = r;
s_hammerQueueTail = r;
}
else
else {
s_hammerQueueTail->m_nextLink = r;
s_hammerQueueTail = r;
}
return;
}
// if we had it in cache check the wait time
if ( last > 0 && waited < 400 ) {
if ( last > 0 && waited < r->m_crawlDelayMS ) {
log("spider: hammering firstIp=%s url=%s "
"only waited %lli ms",
iptoa(r->m_firstIp),r->m_url,waited);
"only waited %lli ms of %li ms",
iptoa(r->m_firstIp),r->m_url,waited,
r->m_crawlDelayMS);
// this guy has too many redirects and it fails us...
// BUT do not core if running live, only if for test
// collection
@@ -645,17 +652,6 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
}
// are we the first?
bool firstInLine = s_rt.isEmpty ( &r->m_cacheKey );
// wait in line cuz someone else downloading it now
if ( ! s_rt.addKey ( &r->m_cacheKey , &r ) ) {
g_udpServer.sendErrorReply(slot,g_errno);
return;
}
// this means our callback will be called
if ( ! firstInLine ) return;
// do not get .google.com/ crap
//if ( strstr(r->m_url,".google.com/") ) { char *xx=NULL;*xx=0; }
@@ -664,6 +660,20 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
void downloadTheDocForReals ( Msg13Request *r ) {
// are we the first?
bool firstInLine = s_rt.isEmpty ( &r->m_cacheKey );
// wait in line cuz someone else downloading it now
if ( ! s_rt.addKey ( &r->m_cacheKey , &r ) ) {
g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
return;
}
// this means our callback will be called
if ( ! firstInLine ) {
//log("spider: inlining %s",r->m_url);
return;
}
// . store time now
// . no, now we store 0 to indicate in progress, then we
// will overwrite it with a timestamp when the download completes
@@ -2166,20 +2176,39 @@ void scanHammerQueue ( int fd , void *state ) {
long long nowms = gettimeofdayInMilliseconds();
Msg13Request *prev = NULL;
long long waited = -1LL;
// scan down the linked list of queued of msg13 requests
for ( ; r ; r = r->m_nextLink ) {
for ( ; r ; prev = r , r = r->m_nextLink ) {
long long last;
last = s_hammerCache.getLongLong(0,r->m_firstIp,30,true);
// is one from this ip outstanding?
if ( last == 0LL ) continue;
// download finished?
if ( last > 0 ) {
long long waited = nowms - last;
waited = nowms - last;
// but skip if haven't waited long enough
if ( waited < r->m_crawlDelayMS ) continue;
}
// debug
//log("spider: downloading %s from crawldelay queue "
// "waited=%llims crawldelay=%lims",
// r->m_url,waited,r->m_crawlDelayMS);
// good to go
downloadTheDocForReals ( r );
//
// remove from future scans
//
if ( prev )
prev->m_nextLink = r->m_nextLink;
if ( s_hammerQueueHead == r )
s_hammerQueueHead = r->m_nextLink;
if ( s_hammerQueueTail == r )
s_hammerQueueTail = prev;
// try to download some more i guess...
}
}
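For reference, the unlink logic added to scanHammerQueue follows the usual pattern for removing a node from a singly linked queue with head and tail pointers while walking it: track the previous node, splice the current one out, and patch s_hammerQueueHead/s_hammerQueueTail when they pointed at it. A condensed, generic sketch of that pattern (Node, ready, and scanAndRemove are illustrative names, not from Msg13.cpp):

struct Node {
        Node *next;
};

static Node *s_head = NULL;
static Node *s_tail = NULL;

// Walk the queue and unlink every node that "ready" says can be dispatched.
void scanAndRemove ( bool (*ready)(Node *) ) {
        Node *prev = NULL;
        for ( Node *cur = s_head ; cur ; ) {
                Node *next = cur->next;                  // save before any unlink
                if ( ready ( cur ) ) {
                        // splice cur out of the list
                        if ( prev ) prev->next = next;
                        if ( s_head == cur ) s_head = next;
                        if ( s_tail == cur ) s_tail = prev;
                        // prev stays put because cur is no longer in the list
                }
                else {
                        prev = cur;                      // only advance prev when keeping cur
                }
                cur = next;
        }
}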
XmlDoc.cpp (77 lines changed)
@@ -11982,6 +11982,67 @@ bool isAllowed2 ( Url *url ,
goto urlLoop;
}
// when doing a custom crawl we have to decide between the provided crawl
// delay, and the one in the robots.txt...
long *XmlDoc::getFinalCrawlDelay() {
if ( m_finalCrawlDelayValid )
return &m_finalCrawlDelay;
bool *isAllowed = getIsAllowed();
if ( ! isAllowed || isAllowed == (void *)-1 ) return (long *)isAllowed;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
m_finalCrawlDelayValid = true;
// getIsAllowed already sets m_crawlDelayValid to true
if ( ! cr->m_isCustomCrawl ) {
m_finalCrawlDelay = m_crawlDelay;
// default to 250ms i guess if none specified in robots
// just to be somewhat nice by default
if ( m_crawlDelay < 0 ) m_finalCrawlDelay = 250;
return &m_finalCrawlDelay;
}
// get manually specified crawl delay in seconds. convert to ms.
long manual = cr->m_collectiveCrawlDelay * 1000.0;
// negative means -1 means unknown or not specified
if ( manual < 0 ) manual = -1;
// if both are unknown...
if ( m_crawlDelay == -1 && manual == -1 ) {
m_finalCrawlDelay = -1;
return &m_finalCrawlDelay;
}
// if not in robots.txt use manual
if ( m_crawlDelay == -1 ) {
m_finalCrawlDelay = manual;
return &m_finalCrawlDelay;
}
// if manually provided crawldelay is -1, use robots.txt then
if ( manual == -1 ) {
m_finalCrawlDelay = m_crawlDelay;
return &m_finalCrawlDelay;
}
// let robots.txt dictate if both are >= 0
if ( m_useRobotsTxt ) {
m_finalCrawlDelay = m_crawlDelay;
return &m_finalCrawlDelay;
}
// if not using robots.txt, pick the smallest
if ( m_crawlDelay < manual ) m_finalCrawlDelay = m_crawlDelay;
else m_finalCrawlDelay = manual;
return &m_finalCrawlDelay;
}
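The priority rules above can be summarized as: a non-custom crawl takes the robots.txt delay (defaulting to 250 ms when robots.txt gives none); for a custom crawl, whichever of the two values is missing (-1) yields to the other, robots.txt wins when m_useRobotsTxt is set, and otherwise the smaller delay is used. A standalone sketch of that same decision for quick reference (resolveCrawlDelayMs is an illustrative free function, not part of XmlDoc):

// Delays are in milliseconds; -1 means "not specified".
long resolveCrawlDelayMs ( bool isCustomCrawl ,
                           bool useRobotsTxt ,
                           long robotsDelayMs ,
                           long manualDelayMs ) {
        if ( ! isCustomCrawl )
                // plain crawl: robots.txt value, or 250ms to be somewhat nice
                return robotsDelayMs < 0 ? 250 : robotsDelayMs;
        if ( robotsDelayMs == -1 && manualDelayMs == -1 ) return -1;
        if ( robotsDelayMs == -1 ) return manualDelayMs;   // only manual given
        if ( manualDelayMs == -1 ) return robotsDelayMs;   // only robots.txt given
        if ( useRobotsTxt )        return robotsDelayMs;   // robots.txt dictates
        // not honoring robots.txt: pick the smaller of the two
        return robotsDelayMs < manualDelayMs ? robotsDelayMs : manualDelayMs;
}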
// . get the Robots.txt and see if we are allowed
// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
@@ -12025,6 +12086,9 @@ bool *XmlDoc::getIsAllowed ( ) {
if ( isRobotsTxt ) {
m_isAllowed = true;
m_isAllowedValid = true;
m_crawlDelayValid = true;
// make it super fast...
m_crawlDelay = 0;
return &m_isAllowed;
}
@@ -13550,6 +13614,9 @@ char **XmlDoc::getHttpReply2 ( ) {
// this must be valid, since we share m_msg13 with it
if ( ! m_isAllowedValid ) { char *xx=NULL;*xx=0; }
long *cd = getFinalCrawlDelay();
if ( ! cd || cd == (void *)-1 ) return (char **)cd;
// we might bail
if ( ! *isAllowed ) {
m_httpReplyValid = true;
@@ -13678,11 +13745,11 @@ char **XmlDoc::getHttpReply2 ( ) {
r->m_ifModifiedSince = 0;
r->m_skipHammerCheck = 0;
// . this is -1 if none found in robots.txt etc.
// . if not using robots.txt it will always be -1
// . it should also be -1 for the robots.txt file itself
if ( m_crawlDelayValid ) r->m_crawlDelayMS = m_crawlDelay;
else r->m_crawlDelayMS = -1;
// . this is -1 if unknown. none found in robots.txt or provided
// in the custom crawl parms.
// . it should also be 0 for the robots.txt file itself
r->m_crawlDelayMS = *cd;
// need this in order to get all languages, etc. and avoid having
// to set words class at the spider compression proxy level
XmlDoc.h (3 lines changed)
@@ -269,6 +269,7 @@ class XmlDoc {
uint32_t m_internalFlags1;
long m_ip;
long m_crawlDelay;
long m_finalCrawlDelay;
// . use this to quickly detect if doc is unchanged
// . we can avoid setting Xml and Words classes etc...
long m_contentHash32;
@@ -630,6 +631,7 @@ class XmlDoc {
long *getIp ( ) ;
long *gotIp ( bool save ) ;
bool *getIsAllowed ( ) ;
long *getFinalCrawlDelay();
//long getTryAgainTimeDelta() {
// if ( ! m_tryAgainTimeDeltaValid ) { char *xx=NULL;*xx=0;}
// return m_tryAgainTimeDelta;
@@ -1169,6 +1171,7 @@ class XmlDoc {
*/
bool m_httpStatusValid;
bool m_crawlDelayValid;
bool m_finalCrawlDelayValid;
bool m_titleRecKeyValid;
bool m_adVectorValid;
bool m_wikiDocIdsValid;