Do not download bulk-job URLs in crawlbot; just return a fake HTTP reply.
Do still apply the crawl-delay throttling logic, however. Deduping is already turned off for bulk jobs, so this should be OK.
This commit is contained in:
parent b33121af7d
commit b6e5424e32
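The change short-circuits the page fetch for bulk jobs inside downloadTheDocForReals(): instead of hitting the network, a canned empty HTTP reply is handed back, while (per the commit message) the crawl-delay throttling still runs, presumably because it happens before this point is reached. A minimal standalone sketch of that control flow; the Request struct and the helper names are illustrative stand-ins, not Gigablast code:

#include <cstdio>
#include <string>

// illustrative stand-in for Msg13Request
struct Request { std::string url; char isCustomCrawl; };

static void applyCrawlDelay(const Request &r) {
	// stand-in for the per-host crawl-delay throttling, which still applies
	std::printf("throttling %s\n", r.url.c_str());
}

static void fetchForReal(const Request &r) {
	std::printf("downloading %s\n", r.url.c_str());
}

static void fakeEmptyReply(const Request &r) {
	std::printf("returning canned empty reply for %s\n", r.url.c_str());
}

static void download(const Request &r) {
	applyCrawlDelay(r);           // bulk jobs are still rate limited
	if (r.isCustomCrawl == 2) {   // 2 == bulk job (see Msg13.h below)
		fakeEmptyReply(r);    // skip the actual download entirely
		return;
	}
	fetchForReal(r);
}

int main() {
	download({"http://example.com/a", 2}); // bulk job: no fetch
	download({"http://example.com/b", 1}); // regular crawl: fetched
	return 0;
}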
Msg13.cpp (+19 lines)
@@ -721,6 +721,25 @@ void downloadTheDocForReals ( Msg13Request *r ) {
 		"(compatible; MSIE 6.0; Windows 98; "
 		"Win 9x 4.90)" ;
 
+	// for bulk jobs avoid actual downloads of the page for efficiency
+	if ( r->m_isCustomCrawl == 2 ) {
+		char *s =
+			"HTTP/1.0 200 (OK)\r\n"
+			"Content-Length: 0\r\n"
+			"Connection: Close\r\n"
+			"Content-Type: text/html\r\n\r\n";
+		long slen = gbstrlen(s);
+		long fakeBufSize = slen + 1;
+		char *fakeBuf = mdup ( s , fakeBufSize , "fkblk");
+		gotHttpReply2 ( r ,
+				fakeBuf,
+				fakeBufSize, // include \0
+				fakeBufSize, // allocsize
+				NULL ); // tcpsock
+		return;
+	}
+
+
 	// download it
 	if ( ! g_httpServer.getDoc ( r->m_url ,
 				     r->m_urlIp ,
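In the hunk above, the canned headers are copied into a heap buffer with mdup() (sized slen + 1 so the trailing \0 comes along) before being handed to gotHttpReply2(), presumably so the normal reply path can own and later free the buffer exactly as it would a real download. A standalone sketch of that handoff pattern, with plain malloc/free standing in for Gigablast's allocator and a made-up handleReply() in place of gotHttpReply2():

#include <cstdio>
#include <cstdlib>
#include <cstring>

// made-up stand-in for gotHttpReply2(): takes ownership of buf and frees it
static void handleReply(char *buf, long replySize, long allocSize) {
	std::printf("got %ld reply bytes (alloc %ld):\n%s", replySize, allocSize, buf);
	std::free(buf);
}

int main() {
	const char *s =
		"HTTP/1.0 200 (OK)\r\n"
		"Content-Length: 0\r\n"
		"Connection: Close\r\n"
		"Content-Type: text/html\r\n\r\n";
	long fakeBufSize = (long)std::strlen(s) + 1;   // include the \0, as the patch does
	char *fakeBuf = (char *)std::malloc(fakeBufSize);
	if (!fakeBuf) return 1;
	std::memcpy(fakeBuf, s, fakeBufSize);
	// hand off the heap copy; a pointer to the string literal could not be freed
	handleReply(fakeBuf, fakeBufSize, fakeBufSize);
	return 0;
}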
Msg13.h (+2 lines)
@@ -32,6 +32,8 @@ public:
 	// if doing spider compression, compute contentHash32 of document
 	// downloaded, and if it matches this then send back EDOCUNCHANGED
 	long m_contentHash32;
+	// copy of CollectionRec::m_customCrawl, 0 1 for crawls or 2 for bulks
+	char m_isCustomCrawl;
 	// send back error ENOGOODDATE if it does not have one. but if
 	// harvestLinks is true, just send back a filtered list of links
 	long m_requireGoodDate:1;
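Msg13Request now carries the collection's customCrawl value as a single char. Hypothetical named constants for the 0/1/2 values the comment above describes (the real code just compares against the literal 2):

// not in the source; only meant to spell out the values the comment mentions
enum CustomCrawlType {
	CUSTOM_CRAWL_NONE  = 0, // presumably a regular, non-custom collection
	CUSTOM_CRAWL_CRAWL = 1, // crawl job
	CUSTOM_CRAWL_BULK  = 2  // bulk job: the download is skipped and the reply faked
};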
XmlDoc.cpp (+1 line)
@@ -14474,6 +14474,7 @@ char **XmlDoc::getHttpReply2 ( ) {
 	// turn off
 	r->m_useCompressionProxy = false;
 	r->m_compressReply = false;
+	r->m_isCustomCrawl = cr->m_isCustomCrawl;
 
 	// set it for this too
 	if ( g_conf.m_useCompressionProxy &&
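XmlDoc copies the collection-level value onto the Msg13 request when it prepares the HTTP fetch, which is what lets the spider-side check in Msg13.cpp recognize bulk jobs without consulting the collection record itself. A reduced sketch of that wiring; only the two m_isCustomCrawl fields and the assignment come from the diff, the struct shells are stand-ins:

#include <cstdio>

// stand-in shells; the real CollectionRec and Msg13Request carry many more fields
struct CollectionRec { char m_isCustomCrawl; };
struct Msg13Request  { char m_isCustomCrawl; };

// mirrors the one-line change above: propagate the per-collection flag
// onto the per-request structure before the request is sent out
static void prepareRequest(Msg13Request *r, const CollectionRec *cr) {
	r->m_isCustomCrawl = cr->m_isCustomCrawl;
}

int main() {
	CollectionRec cr{2};   // 2 == bulk job
	Msg13Request  r{};
	prepareRequest(&r, &cr);
	std::printf("request flag = %d\n", r.m_isCustomCrawl);
	return 0;
}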