do not download bulk job urls in crawlbot.

just return a fake http reply instead.
however, still apply the crawl-delay
throttling logic. deduping is already
turned off for bulk jobs, so this should be ok.
mwells 2014-03-21 12:40:38 -07:00
parent b33121af7d
commit b6e5424e32
3 changed files with 22 additions and 0 deletions
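
Before the diff, a minimal standalone sketch of the control flow the commit message describes (all names here are invented for illustration; this is not the real Msg13.cpp code): the crawl-delay throttling check still runs first, and only afterwards does the bulk-job branch short-circuit the real download.

// bulkjob_flow_sketch.cpp -- hedged illustration only; Request and the
// helpers below are hypothetical stand-ins, not Gigablast types.
#include <cstdio>

struct Request {
        char m_isCustomCrawl;       // 0 or 1 = crawl, 2 = bulk job
        bool m_waitingOnCrawlDelay; // set by the (omitted) throttling check
};

static void downloadTheDoc ( Request *r ) {
        // crawl-delay throttling is still honored before anything else,
        // so bulk jobs remain rate limited per host
        if ( r->m_waitingOnCrawlDelay ) {
                printf ( "requeued until crawl-delay expires\n" );
                return;
        }
        // bulk job: skip the real fetch and hand back a fake empty reply
        if ( r->m_isCustomCrawl == 2 ) {
                printf ( "returning fake HTTP 200, no download\n" );
                return;
        }
        printf ( "doing the real HTTP download\n" );
}

int main ( ) {
        Request bulk = { 2 , false };
        downloadTheDoc ( &bulk );
        return 0;
}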

@@ -721,6 +721,25 @@ void downloadTheDocForReals ( Msg13Request *r ) {
"(compatible; MSIE 6.0; Windows 98; "
"Win 9x 4.90)" ;
// for bulk jobs avoid actual downloads of the page for efficiency
if ( r->m_isCustomCrawl == 2 ) {
char *s =
"HTTP/1.0 200 (OK)\r\n"
"Content-Length: 0\r\n"
"Connection: Close\r\n"
"Content-Type: text/html\r\n\r\n";
long slen = gbstrlen(s);
long fakeBufSize = slen + 1;
char *fakeBuf = mdup ( s , fakeBufSize , "fkblk");
gotHttpReply2 ( r ,
fakeBuf,
fakeBufSize, // include \0
fakeBufSize, // allocsize
NULL ); // tcpsock
return;
}
// download it
if ( ! g_httpServer.getDoc ( r->m_url ,
r->m_urlIp ,
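
The fake reply is nothing but a header block: "Content-Length: 0" followed by the terminating blank line, so downstream parsing sees a successful 200 with an empty body, and mdup() copies the literal onto the heap, presumably so the reply handler can free the buffer just as it would a real socket buffer. A tiny standalone check of that layout (illustration only, not Gigablast code):

// fake_reply_layout.cpp -- verifies the fake reply is all headers with
// an empty body.
#include <cstdio>
#include <cstring>

int main ( ) {
        const char *reply =
                "HTTP/1.0 200 (OK)\r\n"
                "Content-Length: 0\r\n"
                "Connection: Close\r\n"
                "Content-Type: text/html\r\n\r\n";
        // headers end at the blank line; Content-Length: 0 means nothing follows
        const char *end = strstr ( reply , "\r\n\r\n" );
        size_t headerBytes = (size_t)(end - reply) + 4;
        size_t bodyBytes   = strlen ( end + 4 );
        printf ( "header bytes=%zu body bytes=%zu\n" , headerBytes , bodyBytes );
        return 0;
}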

@@ -32,6 +32,8 @@ public:
        // if doing spider compression, compute contentHash32 of document
        // downloaded, and if it matches this then send back EDOCUNCHANGED
        long m_contentHash32;
        // copy of CollectionRec::m_isCustomCrawl: 0 or 1 for crawls, 2 for bulk jobs
        char m_isCustomCrawl;
        // send back error ENOGOODDATE if it does not have one. but if
        // harvestLinks is true, just send back a filtered list of links
        long m_requireGoodDate:1;

@@ -14474,6 +14474,7 @@ char **XmlDoc::getHttpReply2 ( ) {
        // turn off
        r->m_useCompressionProxy = false;
        r->m_compressReply = false;
        r->m_isCustomCrawl = cr->m_isCustomCrawl;
        // set it for this too
        if ( g_conf.m_useCompressionProxy &&
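
This XmlDoc.cpp hunk is what carries the job type down to the download path: the collection's custom-crawl setting is copied onto the outgoing Msg13Request, so the check added in Msg13.cpp above can tell bulk jobs (2) apart from regular crawls (0/1). A minimal sketch of that hand-off with simplified stand-in structs (not the real CollectionRec/Msg13Request definitions):

// flag_flow_sketch.cpp -- hedged sketch of the propagation only; both
// structs are simplified stand-ins for the real classes.
#include <cstdio>

struct CollRec  { char m_isCustomCrawl; }; // 0 or 1 = crawl, 2 = bulk job
struct Msg13Req { char m_isCustomCrawl; };

// mirrors the assignment XmlDoc::getHttpReply2() makes in the hunk above
static void fillRequest ( const CollRec *cr , Msg13Req *r ) {
        r->m_isCustomCrawl = cr->m_isCustomCrawl;
}

// mirrors the bulk-job check added to downloadTheDocForReals()
static bool skipRealDownload ( const Msg13Req *r ) {
        return r->m_isCustomCrawl == 2;
}

int main ( ) {
        CollRec  cr = { 2 };
        Msg13Req r;
        fillRequest ( &cr , &r );
        printf ( "skip real download? %d\n" , skipRealDownload ( &r ) ? 1 : 0 );
        return 0;
}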