crawldelay works now but it measures

from the end of the download, not the beginning.
Matt Wells
2013-11-26 12:58:14 -08:00
parent 1c7c9a4d80
commit 8bb086ac60
3 changed files with 121 additions and 22 deletions
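
For context on the commit message, a minimal sketch of the timing difference with invented numbers (none of these values come from the commit): counting the crawl delay from the end of the previous download stretches the gap between request starts by the download time itself.

#include <cstdio>

int main ( ) {
	// hypothetical values for illustration only
	long long crawlDelayMS   = 1000; // delay requested for this IP
	long long downloadTimeMS = 400;  // duration of the previous fetch
	long long startedAtMS    = 0;    // previous fetch started at t=0
	// next fetch if the delay is measured from the *beginning* of the
	// previous download
	long long fromBegin = startedAtMS + crawlDelayMS;
	// next fetch if measured from the *end* (the behavior the commit
	// message describes)
	long long fromEnd   = startedAtMS + downloadTimeMS + crawlDelayMS;
	printf ( "from begin: t=%lldms  from end: t=%lldms\n",
	         fromBegin , fromEnd );
	return 0;
}
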

@@ -528,7 +528,9 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
bool queueIt = false;
if ( last > 0 && waited < r->m_crawlDelayMS ) queueIt = true;
// a "last" of 0 means currently downloading
if ( r->m_crawlDelayMS > 0 && last == 0LL ) queueIt = true;
// a last of -1 means not found. so first time i guess.
if ( last == -1 ) queueIt = false;
// . queue it up if we haven't waited long enough
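
Read as a whole, this hunk classifies the per-IP hammer-cache lookup into three states and decides whether the request must wait. A condensed sketch of that decision, with simplified names (not the actual Msg13 code):

// Sketch only. "last" is the value from the per-firstIp hammer cache:
//   -1  = no entry yet, this is the first request for the IP
//    0  = a download from this IP is in progress right now
//   >0  = timestamp (ms) when the last download finished
bool shouldQueue ( long long last ,
                   long long waitedMS ,
                   long      crawlDelayMS ) {
	bool queueIt = false;
	// finished a download, but the crawl delay has not elapsed yet
	if ( last > 0 && waitedMS < crawlDelayMS ) queueIt = true;
	// a download is currently in flight and this IP has a crawl delay
	if ( crawlDelayMS > 0 && last == 0LL ) queueIt = true;
	// never seen this IP before, so no need to wait
	if ( last == -1 ) queueIt = false;
	return queueIt;
}
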
@@ -537,6 +539,8 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
// . it will just lookup the lastdownload time in the cache,
// which will store maybe a -1 if currently downloading...
if ( queueIt ) {
// debug
//log("spider: adding %s to crawldelayqueue",r->m_url);
// save this
r->m_udpSlot = slot;
r->m_nextLink = NULL;
@@ -545,17 +549,20 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
s_hammerQueueHead = r;
s_hammerQueueTail = r;
}
else
else {
s_hammerQueueTail->m_nextLink = r;
s_hammerQueueTail = r;
}
return;
}
// if we had it in cache check the wait time
if ( last > 0 && waited < 400 ) {
if ( last > 0 && waited < r->m_crawlDelayMS ) {
log("spider: hammering firstIp=%s url=%s "
"only waited %lli ms",
iptoa(r->m_firstIp),r->m_url,waited);
"only waited %lli ms of %li ms",
iptoa(r->m_firstIp),r->m_url,waited,
r->m_crawlDelayMS);
// this guy has too many redirects and it fails us...
// BUT do not core if running live, only if for test
// collection
@@ -645,17 +652,6 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
}
// are we the first?
bool firstInLine = s_rt.isEmpty ( &r->m_cacheKey );
// wait in line cuz someone else downloading it now
if ( ! s_rt.addKey ( &r->m_cacheKey , &r ) ) {
g_udpServer.sendErrorReply(slot,g_errno);
return;
}
// this means our callback will be called
if ( ! firstInLine ) return;
// do not get .google.com/ crap
//if ( strstr(r->m_url,".google.com/") ) { char *xx=NULL;*xx=0; }
@@ -664,6 +660,20 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
void downloadTheDocForReals ( Msg13Request *r ) {
// are we the first?
bool firstInLine = s_rt.isEmpty ( &r->m_cacheKey );
// wait in line cuz someone else downloading it now
if ( ! s_rt.addKey ( &r->m_cacheKey , &r ) ) {
g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
return;
}
// this means our callback will be called
if ( ! firstInLine ) {
//log("spider: inlining %s",r->m_url);
return;
}
// . store time now
// . no, now we store 0 to indicate in progress, then we
// will overwrite it with a timestamp when the download completes
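
The hunk above relocates the "wait in line" bookkeeping into downloadTheDocForReals(): the first request for a given cache key performs the download, and later requests only register themselves and get answered by the same reply. A rough sketch of that pattern, using a std::multimap as a stand-in for the s_rt table (an approximation, not the real hash table):

#include <map>
#include <string>
#include <utility>
#include <vector>

// Sketch of the wait-in-line idea: the first request for a cache key does
// the download; later requests for the same key just register themselves
// and are answered from the same reply when it arrives.
static std::multimap<std::string,int> s_waiters; // cacheKey -> requestId

// returns true if the caller should start the download itself
bool registerRequest ( const std::string &cacheKey , int requestId ) {
	bool firstInLine = ( s_waiters.count ( cacheKey ) == 0 );
	s_waiters.insert ( std::make_pair ( cacheKey , requestId ) );
	// not first? someone else is already downloading this url and our
	// callback will be called when that download completes
	return firstInLine;
}

// when the download finishes, hand the reply to every registered waiter
std::vector<int> takeWaiters ( const std::string &cacheKey ) {
	std::vector<int> ids;
	auto range = s_waiters.equal_range ( cacheKey );
	for ( auto it = range.first ; it != range.second ; ++it )
		ids.push_back ( it->second );
	s_waiters.erase ( range.first , range.second );
	return ids;
}
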
@@ -2166,20 +2176,39 @@ void scanHammerQueue ( int fd , void *state ) {
long long nowms = gettimeofdayInMilliseconds();
Msg13Request *prev = NULL;
long long waited = -1LL;
// scan down the linked list of queued of msg13 requests
for ( ; r ; r = r->m_nextLink ) {
for ( ; r ; prev = r , r = r->m_nextLink ) {
long long last;
last = s_hammerCache.getLongLong(0,r->m_firstIp,30,true);
// is one from this ip outstanding?
if ( last == 0LL ) continue;
// download finished?
if ( last > 0 ) {
long long waited = nowms - last;
waited = nowms - last;
// but skip if haven't waited long enough
if ( waited < r->m_crawlDelayMS ) continue;
}
// debug
//log("spider: downloading %s from crawldelay queue "
// "waited=%llims crawldelay=%lims",
// r->m_url,waited,r->m_crawlDelayMS);
// good to go
downloadTheDocForReals ( r );
//
// remove from future scans
//
if ( prev )
prev->m_nextLink = r->m_nextLink;
if ( s_hammerQueueHead == r )
s_hammerQueueHead = r->m_nextLink;
if ( s_hammerQueueTail == r )
s_hammerQueueTail = prev;
// try to download some more i guess...
}
}
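
The crawl-delay queue itself is a singly linked list tracked by s_hammerQueueHead and s_hammerQueueTail, and scanHammerQueue() now unlinks a request once it launches its download. A generic sketch of that enqueue / unlink-while-scanning pattern (simplified node type, not the real Msg13Request):

// Sketch of the queue structure, not the actual Msg13 code.
struct Node {
	long long m_firstIp;
	Node     *m_nextLink;
};

static Node *s_head = 0;
static Node *s_tail = 0;

// append at the tail, as handleRequest13() does when it queues a request
void enqueue ( Node *n ) {
	n->m_nextLink = 0;
	if ( ! s_head ) { s_head = n; s_tail = n; return; }
	s_tail->m_nextLink = n;
	s_tail = n;
}

// scan the list and unlink every node that is ready, as scanHammerQueue()
// does on its timer
void scanQueue ( bool (*ready)(Node *) , void (*launch)(Node *) ) {
	Node *prev = 0;
	Node *next = 0;
	for ( Node *n = s_head ; n ; n = next ) {
		next = n->m_nextLink;
		if ( ! ready ( n ) ) { prev = n; continue; }
		// unlink n, fixing head/tail as needed
		if ( prev ) prev->m_nextLink = next;
		if ( s_head == n ) s_head = next;
		if ( s_tail == n ) s_tail = prev;
		launch ( n );
		// prev stays where it was, since n is no longer in the list
	}
}
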

@@ -11982,6 +11982,67 @@ bool isAllowed2 ( Url *url ,
goto urlLoop;
}
// when doing a custom crawl we have to decide between the provided crawl
// delay, and the one in the robots.txt...
long *XmlDoc::getFinalCrawlDelay() {
if ( m_finalCrawlDelayValid )
return &m_finalCrawlDelay;
bool *isAllowed = getIsAllowed();
if ( ! isAllowed || isAllowed == (void *)-1 ) return (long *)isAllowed;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
m_finalCrawlDelayValid = true;
// getIsAllowed already sets m_crawlDelayValid to true
if ( ! cr->m_isCustomCrawl ) {
m_finalCrawlDelay = m_crawlDelay;
// default to 250ms i guess if none specified in robots
// just to be somewhat nice by default
if ( m_crawlDelay < 0 ) m_finalCrawlDelay = 250;
return &m_finalCrawlDelay;
}
// get manually specified crawl delay in seconds. convert to ms.
long manual = cr->m_collectiveCrawlDelay * 1000.0;
// negative means -1 means unknown or not specified
if ( manual < 0 ) manual = -1;
// if both are unknown...
if ( m_crawlDelay == -1 && manual == -1 ) {
m_finalCrawlDelay = -1;
return &m_finalCrawlDelay;
}
// if not in robots.txt use manual
if ( m_crawlDelay == -1 ) {
m_finalCrawlDelay = manual;
return &m_finalCrawlDelay;
}
// if manually provided crawldelay is -1, use robots.txt then
if ( manual == -1 ) {
m_finalCrawlDelay = m_crawlDelay;
return &m_finalCrawlDelay;
}
// let robots.txt dictate if both are >= 0
if ( m_useRobotsTxt ) {
m_finalCrawlDelay = m_crawlDelay;
return &m_finalCrawlDelay;
}
// if not using robots.txt, pick the smallest
if ( m_crawlDelay < manual ) m_finalCrawlDelay = m_crawlDelay;
else m_finalCrawlDelay = manual;
return &m_finalCrawlDelay;
}
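
To make the precedence in getFinalCrawlDelay() concrete for the custom-crawl branch, here is a hedged sketch with a few invented example values (all in milliseconds, -1 meaning "not specified"); it mirrors only that branch, not the non-custom path above, which simply defaults a missing robots.txt delay to 250 ms.

#include <cassert>

// Sketch of the precedence described above; not the XmlDoc member itself.
// robotsMS = crawl delay from robots.txt in ms, -1 if none
// manualMS = custom-crawl delay in ms, -1 if none
long finalCrawlDelay ( long robotsMS , long manualMS , bool useRobotsTxt ) {
	if ( robotsMS == -1 && manualMS == -1 ) return -1;   // both unknown
	if ( robotsMS == -1 ) return manualMS;               // manual fills in
	if ( manualMS == -1 ) return robotsMS;               // robots fills in
	if ( useRobotsTxt   ) return robotsMS;               // robots.txt wins
	return ( robotsMS < manualMS ) ? robotsMS : manualMS; // smaller one
}

int main ( ) {
	assert ( finalCrawlDelay ( -1   , -1   , true  ) == -1   );
	assert ( finalCrawlDelay ( -1   , 2000 , true  ) == 2000 );
	assert ( finalCrawlDelay ( 5000 , -1   , true  ) == 5000 );
	assert ( finalCrawlDelay ( 5000 , 2000 , true  ) == 5000 );
	assert ( finalCrawlDelay ( 5000 , 2000 , false ) == 2000 );
	return 0;
}
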
// . get the Robots.txt and see if we are allowed
// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
@@ -12025,6 +12086,9 @@ bool *XmlDoc::getIsAllowed ( ) {
if ( isRobotsTxt ) {
m_isAllowed = true;
m_isAllowedValid = true;
m_crawlDelayValid = true;
// make it super fast...
m_crawlDelay = 0;
return &m_isAllowed;
}
@@ -13550,6 +13614,9 @@ char **XmlDoc::getHttpReply2 ( ) {
// this must be valid, since we share m_msg13 with it
if ( ! m_isAllowedValid ) { char *xx=NULL;*xx=0; }
long *cd = getFinalCrawlDelay();
if ( ! cd || cd == (void *)-1 ) return (char **)cd;
// we might bail
if ( ! *isAllowed ) {
m_httpReplyValid = true;
@@ -13678,11 +13745,11 @@ char **XmlDoc::getHttpReply2 ( ) {
r->m_ifModifiedSince = 0;
r->m_skipHammerCheck = 0;
// . this is -1 if none found in robots.txt etc.
// . if not using robots.txt it will always be -1
// . it should also be -1 for the robots.txt file itself
if ( m_crawlDelayValid ) r->m_crawlDelayMS = m_crawlDelay;
else r->m_crawlDelayMS = -1;
// . this is -1 if unknown. none found in robots.txt or provided
// in the custom crawl parms.
// . it should also be 0 for the robots.txt file itself
r->m_crawlDelayMS = *cd;
// need this in order to get all languages, etc. and avoid having
// to set words class at the spider compression proxy level

@@ -269,6 +269,7 @@ class XmlDoc {
uint32_t m_internalFlags1;
long m_ip;
long m_crawlDelay;
long m_finalCrawlDelay;
// . use this to quickly detect if doc is unchanged
// . we can avoid setting Xml and Words classes etc...
long m_contentHash32;
@@ -630,6 +631,7 @@ class XmlDoc {
long *getIp ( ) ;
long *gotIp ( bool save ) ;
bool *getIsAllowed ( ) ;
long *getFinalCrawlDelay();
//long getTryAgainTimeDelta() {
// if ( ! m_tryAgainTimeDeltaValid ) { char *xx=NULL;*xx=0;}
// return m_tryAgainTimeDelta;
@@ -1169,6 +1171,7 @@ class XmlDoc {
*/
bool m_httpStatusValid;
bool m_crawlDelayValid;
bool m_finalCrawlDelayValid;
bool m_titleRecKeyValid;
bool m_adVectorValid;
bool m_wikiDocIdsValid;