do not download bulk job urls in crawlbot.

just return a fake http reply instead.
however, still apply the crawl-delay
throttling logic. deduping is already
turned off for bulk jobs, so this should be ok.
mwells 2014-03-21 12:40:38 -07:00
parent b33121af7d
commit b6e5424e32
3 changed files with 22 additions and 0 deletions
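
Before the diff, a minimal standalone sketch of the control flow the commit message describes (all names here are invented for illustration; this is not the real Msg13.cpp code): the crawl-delay throttling check still runs first, and only afterwards does the bulk-job branch short-circuit the real download.

// bulkjob_flow_sketch.cpp -- hedged illustration only; Request and the
// helpers below are hypothetical stand-ins, not Gigablast types.
#include <cstdio>

struct Request {
        char m_isCustomCrawl;       // 0 or 1 = crawl, 2 = bulk job
        bool m_waitingOnCrawlDelay; // set by the (omitted) throttling check
};

static void downloadTheDoc ( Request *r ) {
        // crawl-delay throttling is still honored before anything else,
        // so bulk jobs remain rate limited per host
        if ( r->m_waitingOnCrawlDelay ) {
                printf ( "requeued until crawl-delay expires\n" );
                return;
        }
        // bulk job: skip the real fetch and hand back a fake empty reply
        if ( r->m_isCustomCrawl == 2 ) {
                printf ( "returning fake HTTP 200, no download\n" );
                return;
        }
        printf ( "doing the real HTTP download\n" );
}

int main ( ) {
        Request bulk = { 2 , false };
        downloadTheDoc ( &bulk );
        return 0;
}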

@@ -721,6 +721,25 @@ void downloadTheDocForReals ( Msg13Request *r ) {
"(compatible; MSIE 6.0; Windows 98; "
"Win 9x 4.90)" ;
// for bulk jobs avoid actual downloads of the page for efficiency
if ( r->m_isCustomCrawl == 2 ) {
char *s =
"HTTP/1.0 200 (OK)\r\n"
"Content-Length: 0\r\n"
"Connection: Close\r\n"
"Content-Type: text/html\r\n\r\n";
long slen = gbstrlen(s);
long fakeBufSize = slen + 1;
char *fakeBuf = mdup ( s , fakeBufSize , "fkblk");
gotHttpReply2 ( r ,
fakeBuf,
fakeBufSize, // include \0
fakeBufSize, // allocsize
NULL ); // tcpsock
return;
}
// download it
if ( ! g_httpServer.getDoc ( r->m_url ,
r->m_urlIp ,
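
The fake reply is nothing but a header block: "Content-Length: 0" followed by the terminating blank line, so downstream parsing sees a successful 200 with an empty body, and mdup() copies the literal onto the heap, presumably so the reply handler can free the buffer just as it would a real socket buffer. A tiny standalone check of that layout (illustration only, not Gigablast code):

// fake_reply_layout.cpp -- verifies the fake reply is all headers with
// an empty body.
#include <cstdio>
#include <cstring>

int main ( ) {
        const char *reply =
                "HTTP/1.0 200 (OK)\r\n"
                "Content-Length: 0\r\n"
                "Connection: Close\r\n"
                "Content-Type: text/html\r\n\r\n";
        // headers end at the blank line; Content-Length: 0 means nothing follows
        const char *end = strstr ( reply , "\r\n\r\n" );
        size_t headerBytes = (size_t)(end - reply) + 4;
        size_t bodyBytes   = strlen ( end + 4 );
        printf ( "header bytes=%zu body bytes=%zu\n" , headerBytes , bodyBytes );
        return 0;
}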

@@ -32,6 +32,8 @@ public:
        // if doing spider compression, compute contentHash32 of document
        // downloaded, and if it matches this then send back EDOCUNCHANGED
        long m_contentHash32;
        // copy of CollectionRec::m_isCustomCrawl: 0 or 1 for crawls, 2 for bulk jobs
        char m_isCustomCrawl;
        // send back error ENOGOODDATE if it does not have one. but if
        // harvestLinks is true, just send back a filtered list of links
        long m_requireGoodDate:1;

@@ -14474,6 +14474,7 @@ char **XmlDoc::getHttpReply2 ( ) {
        // turn off
        r->m_useCompressionProxy = false;
        r->m_compressReply = false;
        r->m_isCustomCrawl = cr->m_isCustomCrawl;
        // set it for this too
        if ( g_conf.m_useCompressionProxy &&
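
This XmlDoc.cpp hunk is what carries the job type down to the download path: the collection's custom-crawl setting is copied onto the outgoing Msg13Request, so the check added in Msg13.cpp above can tell bulk jobs (2) apart from regular crawls (0/1). A minimal sketch of that hand-off with simplified stand-in structs (not the real CollectionRec/Msg13Request definitions):

// flag_flow_sketch.cpp -- hedged sketch of the propagation only; both
// structs are simplified stand-ins for the real classes.
#include <cstdio>

struct CollRec  { char m_isCustomCrawl; }; // 0 or 1 = crawl, 2 = bulk job
struct Msg13Req { char m_isCustomCrawl; };

// mirrors the assignment XmlDoc::getHttpReply2() makes in the hunk above
static void fillRequest ( const CollRec *cr , Msg13Req *r ) {
        r->m_isCustomCrawl = cr->m_isCustomCrawl;
}

// mirrors the bulk-job check added to downloadTheDocForReals()
static bool skipRealDownload ( const Msg13Req *r ) {
        return r->m_isCustomCrawl == 2;
}

int main ( ) {
        CollRec  cr = { 2 };
        Msg13Req r;
        fillRequest ( &cr , &r );
        printf ( "skip real download? %d\n" , skipRealDownload ( &r ) ? 1 : 0 );
        return 0;
}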