Mirror of https://github.com/privacore/open-source-search-engine.git (synced 2025-01-22 02:18:42 -05:00)
2781 lines · 84 KiB · C++
#include "Msg13.h"
#include "UdpServer.h"
#include "UdpSlot.h"
#include "Serialize.h"
#include "HttpServer.h"
#include "Conf.h"
#include "Stats.h"
#include "HashTableX.h"
#include "XmlDoc.h"
#include "SpiderProxy.h" // OP_GETPROXY OP_RETPROXY
#include "RdbCache.h"
#include "Collectiondb.h"
#include "WantedChecker.h"
#include "ip.h"
#include "GbUtil.h"
#include "zlib.h"
#include "Mem.h"
#include "PageInject.h"
#include "Pages.h"
#include "Statistics.h"
#include "Sanity.h"
#include "UrlMatchList.h"
#include "ContentMatchList.h"
#include "Errno.h"
#include "Log.h"
#include <string.h>

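//
// Msg13: downloads a document on behalf of the spider. The request is
// forwarded over UDP (msg_type_13) to the host responsible for the url's IP;
// that host checks the local robots.txt/page caches, queues up identical
// in-flight downloads, enforces per-IP crawl delays (the "hammer"
// queue/cache), can route the fetch through spider proxies or a compression
// proxy, and sends the (optionally compressed) HTTP reply back to the
// requesting host.
//
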
static const char g_fakeReply[] =
	"HTTP/1.0 200 (OK)\r\n"
	"Content-Length: 0\r\n"
	"Connection: Close\r\n"
	"Content-Type: text/html\r\n\r\n\0";

static bool getIframeExpandedContent(Msg13Request *r, TcpSocket *ts);
static void gotIframeExpandedContent(void *state);

static bool addToHammerQueue(Msg13Request *r);
static void scanHammerQueue(int fd, void *state);
static void downloadTheDocForReals(Msg13Request *r);

static void gotForwardedReplyWrapper ( void *state , UdpSlot *slot ) ;
static void handleRequest13 ( UdpSlot *slot , int32_t niceness ) ;
static void gotHttpReply ( void *state , TcpSocket *ts ) ;
static void gotHttpReply2 ( void *state ,
			    char *reply ,
			    int32_t replySize ,
			    int32_t replyAllocSize ,
			    TcpSocket *ts ) ;
static void passOnReply ( void *state , UdpSlot *slot ) ;

static bool hasIframe(char *reply, int32_t replySize);

static bool setProxiedUrlFromSquidProxiedRequest ( Msg13Request *r );
static void stripProxyAuthorization ( char *squidProxiedReqBuf ) ;
static bool addNewProxyAuthorization ( SafeBuf *req , Msg13Request *r );
static void fixGETorPOST ( char *squidProxiedReqBuf ) ;
static int64_t computeProxiedCacheKey64 ( Msg13Request *r ) ;

// cache for robots.txt pages
static RdbCache s_httpCacheRobots;
// cache for other pages
static RdbCache s_httpCacheOthers;
// queue up identical requests
static HashTableX s_rt;

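// s_rt is the "wait in line" table: it maps a request's cache key to the
// UdpSlot(s) waiting on that download. Duplicate keys are allowed so that
// identical requests arriving while a download is already in progress can
// wait for that download's reply instead of re-downloading the url
// (see downloadTheDocForReals()).
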
struct HttpCacheData {
	HttpCacheData()
	  : m_errno(0)
	  , ptr_reply(nullptr)
	  , size_reply(0) {
	}

	HttpCacheData(char *reply, int32_t replySize, int32_t err)
	  : m_errno(err)
	  , ptr_reply(reply)
	  , size_reply(replySize) {
	}

	int32_t m_errno;

	char *ptr_reply;
	int32_t size_reply;
} __attribute__((packed));

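// Records in s_httpCacheRobots/s_httpCacheOthers hold a serialized
// HttpCacheData: the fixed fields above followed by the reply bytes, with
// ptr_reply/size_reply rewritten by serializeMsg()/deserializeMsg() (see the
// cache lookup in handleRequest13()). The packed attribute presumably keeps
// the header free of padding so the serialized layout stays stable.
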
void resetMsg13Caches ( ) {
	s_httpCacheRobots.reset();
	s_httpCacheOthers.reset();
	s_rt.reset();
}

RdbCache *Msg13::getHttpCacheRobots() { return &s_httpCacheRobots; }
RdbCache *Msg13::getHttpCacheOthers() { return &s_httpCacheOthers; }

Msg13::Msg13() {
	m_replyBuf = NULL;
	m_state = NULL;
	m_callback = NULL;
	m_request = NULL;
	m_replyBufSize = 0;
	m_replyBufAllocSize = 0;
}

Msg13::~Msg13() {
	reset();
}

void Msg13::reset() {
	if (m_replyBuf) {
		mfree(m_replyBuf,m_replyBufAllocSize,"msg13rb");
	}

	m_replyBuf = NULL;
	m_replyBufSize = 0;
	m_replyBufAllocSize = 0;
}

bool Msg13::registerHandler ( ) {
	// . register ourselves with the udp server
	// . it calls our callback when it receives a msg of type 0x13 (msg_type_13)
	if ( ! g_udpServer.registerHandler ( msg_type_13, handleRequest13 ))
		return false;

	// use 3MB for the robots.txt cache
	int32_t memRobots = 3000000;
	// make 10MB now that we have proxied url (i.e. squid) capabilities
	int32_t memOthers = 10000000;
	// assume 15k avg cache file
	int32_t maxCacheNodesRobots = memRobots / 106;
	int32_t maxCacheNodesOthers = memOthers / (10*1024);

	if ( ! s_httpCacheRobots.init ( memRobots ,
					-1 , // fixedDataSize
					maxCacheNodesRobots ,
					"robots.txt" , // dbname
					true, // save to disk
					12, // cachekeysize
					-1)) // numPtrsMax
		return false;

	if ( ! s_httpCacheOthers.init ( memOthers ,
					-1 , // fixedDataSize
					maxCacheNodesOthers ,
					"htmlPages" , // dbname
					true, // save to disk
					12, // cachekeysize
					-1)) // numPtrsMax
		return false;

	// . set up the request table (aka wait in line table)
	// . allowDups = "true"
	if ( ! s_rt.set ( 8 ,sizeof(UdpSlot *),0,NULL,0,true,"wait13tbl") )
		return false;

	if (!g_loop.registerSleepCallback(10, NULL, scanHammerQueue, "Msg13::scanHammerQueue")) {
		log( "build: Failed to register timer callback for hammer queue." );
		return false;
	}

	// success
	return true;
}

// . returns false if blocked, returns true otherwise
// . returns true and sets g_errno on error
bool Msg13::getDoc ( Msg13Request *r, void *state, void(*callback)(void *) ) {
	// reset in case we are being reused
	reset();

	m_state = state;
	m_callback = callback;

	m_request = r;

	// sanity check
	if ( r->m_urlIp == 0 ) { gbshutdownAbort(true); }
	if ( r->m_urlIp == -1 ) { gbshutdownAbort(true); }

	// set this
	r->m_urlHash64 = hash64 ( r->ptr_url , r->size_url-1);

	// is this a /robots.txt url?
	if ( r->size_url - 1 > 12 &&
	     ! strncmp ( r->ptr_url + r->size_url -1 -11,"/robots.txt",11)) {
		r->m_isRobotsTxt = true;
	}

	// force compression when getting robots.txt so it is stored compressed in the cache
	if ( r->m_isRobotsTxt ) {
		r->m_compressReply = true;
	}

	// make the cache key
	r->m_cacheKey = r->m_urlHash64;

	// a compressed reply is different from a non-compressed reply
	if ( r->m_compressReply ) {
		r->m_cacheKey ^= 0xff;
	}

	if ( r->m_isSquidProxiedUrl ) {
		// sets r->m_proxiedUrl that we use a few times below
		setProxiedUrlFromSquidProxiedRequest( r );
	}

	// . if gigablast is acting like a squid proxy, then r->ptr_url
	//   is a COMPLETE http mime request, so hash the following fields in
	//   the http mime request to make the cache key
	//   * url
	//   * cookie
	// . this is r->m_proxiedUrl which we set above
	if ( r->m_isSquidProxiedUrl ) {
		r->m_cacheKey = computeProxiedCacheKey64( r );
	}

	// assume no http proxy ip/port
	r->m_proxyIp = 0;
	r->m_proxyPort = 0;

	return forwardRequest ( );
}

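// A minimal usage sketch for getDoc() (the names msg13, req, state and
// gotDocWrapper are illustrative, not from this file; the Msg13, the
// Msg13Request and the state must outlive the call):
//
//	static void gotDocWrapper ( void *state ) {
//		// msg13.m_replyBuf/m_replyBufSize now hold the (uncompressed)
//		// HTTP reply, or g_errno is set.
//	}
//	...
//	if ( msg13.getDoc ( &req , state , gotDocWrapper ) )
//		// did not block: reply (or g_errno) is already available
//		gotDocWrapper ( state );
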
bool Msg13::forwardRequest ( ) {

	//
	// forward this request to the host responsible for this url's ip
	//
	int32_t nh = g_hostdb.getNumHosts();
	int32_t hostId = hash32h(((uint32_t)m_request->m_firstIp >> 8), 0) % nh;

	if((uint32_t)m_request->m_firstIp >> 8 == 0) {
		// If the first IP is not set for the request then we don't
		// want to hammer the first host with spidering enabled.
		hostId = hash32n ( m_request->ptr_url ) % nh;
	}

	// get host to send to from hostId
	Host *h = NULL;
	// . pick first alive host, starting with "hostId" as the hostId
	// . if all dead, send to the original and we will timeout > 200 secs
	for ( int32_t count = 0 ; count <= nh ; count++ ) {
		// get that host
		//h = g_hostdb.getProxy ( hostId );
		h = g_hostdb.getHost ( hostId );

		// Get the other one in shard instead of getting the first
		// one we find sequentially because that makes the load
		// imbalanced to the lowest host with spidering enabled.
		if(!h->m_spiderEnabled) {
			h = g_hostdb.getHost(g_hostdb.getHostIdWithSpideringEnabled(h->m_hostId, true));
		}

		// stop if he is alive and able to spider
		if ( h->m_spiderEnabled && ! g_hostdb.isDead ( h ) ) break;
		// get the next otherwise
		if ( ++hostId >= nh ) hostId = 0;
	}

	if(!h) {
		//all spider hosts dead (or misconfiguration)
		if(!g_errno)
			g_errno = ENOHOSTS;
		log("spider: msg13 request: %s",mstrerror(g_errno));
		return true;
	}

	hostId = 0; // HACK!!

	// forward it to self if we are the spider proxy!!!
	if ( g_hostdb.m_myHost->m_isProxy )
		h = g_hostdb.m_myHost;

	// log it
	if ( g_conf.m_logDebugSpider ) {
		char ipbuf[16];
		logf ( LOG_DEBUG,
		       "spider: sending download request of %s firstIp=%s "
		       "uh48=%" PRIu64" to "
		       "host %" PRId32" (child=%" PRId32")", m_request->ptr_url, iptoa(m_request->m_firstIp,ipbuf),
		       m_request->m_urlHash48, hostId,
		       m_request->m_skipHammerCheck);
	}

	// fill up the request
	int32_t requestBufSize = m_request->getSize();

	// we have to serialize it now because it has cookies as well as
	// the url.
	char *requestBuf = serializeMsg ( sizeof(Msg13Request),
					  &m_request->size_url,
					  &m_request->size_cookie,
					  &m_request->ptr_url,
					  m_request,
					  &requestBufSize ,
					  NULL ,
					  0); //RBUF_SIZE
	// g_errno should be set in this case, most likely to ENOMEM
	if ( ! requestBuf ) return true;

	// . otherwise, send the request to the key host
	// . returns false and sets g_errno on error
	// . now wait for 2 minutes before timing out
	// it was not using the proxy! because it thinks the hostid #0 is not the proxy... b/c ninad screwed that
	// up by giving proxies the same ids as regular hosts!
	if (!g_udpServer.sendRequest(requestBuf, requestBufSize, msg_type_13, h->m_ip, h->m_port, -1, NULL, this, gotForwardedReplyWrapper, 200000, 1)) {
		// sanity check
		if ( ! g_errno ) { gbshutdownLogicError(); }
		// report it
		log("spider: msg13 request: %s",mstrerror(g_errno));
		// g_errno must be set!
		return true;
	}
	// otherwise we block
	return false;
}

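// Note on the host selection above: the target hostId is derived from the
// top 24 bits of m_firstIp (firstIp >> 8), so all urls whose first IP falls
// in the same /24 get forwarded to the same host. That presumably lets one
// host enforce the per-IP crawl delay (s_hammerCache) and de-duplicate
// concurrent downloads (s_rt) for that block. When firstIp is unset the url
// hash is used instead so one host does not absorb all such requests.
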
void gotForwardedReplyWrapper ( void *state , UdpSlot *slot ) {
	Msg13 *THIS = (Msg13 *)state;
	// return if this blocked
	if ( ! THIS->gotForwardedReply ( slot ) ) return;
	// callback
	THIS->m_callback ( THIS->m_state );
}

bool Msg13::gotForwardedReply ( UdpSlot *slot ) {
	// what did he give us?
	char *reply = slot->m_readBuf;
	int32_t replySize = slot->m_readBufSize;
	int32_t replyAllocSize = slot->m_readBufMaxSize;
	// UdpServer::makeReadBuf() sets m_readBuf to -1 when calling
	// alloc() with a zero length, so fix that
	if ( replySize == 0 ) reply = NULL;
	// this is messed up. why is it happening?
	if ( reply == (void *)-1 ) { gbshutdownAbort(true); }

	// we are responsible for freeing reply now
	if ( ! g_errno ) {
		slot->m_readBuf = NULL;
		slot->m_readBufSize = 0;
		slot->m_readBufMaxSize = 0;
	}

	return gotFinalReply ( reply , replySize , replyAllocSize );
}

bool Msg13::gotFinalReply ( char *reply, int32_t replySize, int32_t replyAllocSize ) {

	// how is this happening? ah from image downloads...
	if ( m_replyBuf ) { gbshutdownAbort(true); }

	// assume none
	m_replyBuf = NULL;
	m_replyBufSize = 0;
	m_replyBufAllocSize = 0;

	if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads ) {
		char ipbuf[16];
		logf(LOG_DEBUG,"spider: FINALIZED %s firstIp=%s",
		     m_request->ptr_url,iptoa(m_request->m_firstIp,ipbuf));
	}

	// . if timed out probably the host is now dead so try another one!
	// . return if that blocked
	if ( g_errno == EUDPTIMEDOUT ) {
		// try again
		log("spider: retrying1. had error for %s : %s",
		    m_request->ptr_url,mstrerror(g_errno));
		// return if that blocked
		if ( ! forwardRequest ( ) ) return false;
		// a different g_errno should be set now!
	}

	if ( g_errno ) {
		// this error msg is repeated in XmlDoc::logIt() so no need
		// for it here
		if ( g_conf.m_logDebugSpider )
			log("spider: error for %s: %s",
			    m_request->ptr_url,mstrerror(g_errno));
		return true;
	}

	// set it
	m_replyBuf = reply;
	m_replyBufSize = replySize;
	m_replyBufAllocSize = replyAllocSize;

	// sanity check
	if ( replySize > 0 && ! reply ) { gbshutdownAbort(true); }

	// no uncompressing if reply is empty
	if ( replySize == 0 ) return true;

	// if it was not compressed we are done! no need to uncompress it
	if ( ! m_request->m_compressReply ) return true;

	// get uncompressed size
	uint32_t unzippedLen = *(int32_t*)reply;
	// sanity checks
	if ( unzippedLen > 10000000 ) {
		log("spider: downloaded probable corrupt gzipped doc "
		    "with unzipped len of %" PRId32,(int32_t)unzippedLen);
		g_errno = ECORRUPTDATA;
		return true;
	}
	// make buffer to hold uncompressed data
	char *newBuf = (char*)mmalloc(unzippedLen, "Msg13Unzip");
	if( ! newBuf ) {
		g_errno = ENOMEM;
		return true;
	}
	// make another var to get mangled by gbuncompress
	uint32_t uncompressedLen = unzippedLen;
	// uncompress it
	int zipErr = gbuncompress( (unsigned char*)newBuf ,   // dst
				   &uncompressedLen ,         // dstLen
				   (unsigned char*)reply+4 ,  // src
				   replySize - 4 );           // srcLen
	if(zipErr != Z_OK ||
	   uncompressedLen!=(uint32_t)unzippedLen) {
		log("spider: had error unzipping Msg13 reply. unzipped "
		    "len should be %" PRId32" but is %" PRId32". ziperr=%" PRId32,
		    (int32_t)unzippedLen,
		    (int32_t)uncompressedLen,
		    (int32_t)zipErr);
		mfree (newBuf, unzippedLen, "Msg13UnzipError");
		g_errno = ECORRUPTDATA;//EBADREPLYSIZE;
		return true;
	}

	// count it for stats
	g_stats.m_compressedBytesIn += replySize;

	// free compressed
	mfree ( reply , replyAllocSize ,"ufree" );

	// assign uncompressed
	m_replyBuf = newBuf;
	m_replyBufSize = uncompressedLen;
	m_replyBufAllocSize = unzippedLen;

	// log it for now
	if ( g_conf.m_logDebugSpider )
		log("http: got doc %s %" PRId32" to %" PRId32,
		    m_request->ptr_url,(int32_t)replySize,(int32_t)uncompressedLen);

	return true;
}

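// Wire format of a compressed Msg13 reply, as consumed by gotFinalReply()
// above and produced by the gbcompress() call in gotHttpReply2():
//
//	[ int32_t uncompressedLen ][ zlib-compressed HTTP reply ... ]
//
// gotFinalReply() rejects anything claiming to inflate to more than 10MB.
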
static bool isIpInTwitchyTable(CollectionRec *cr, int32_t ip) {
	if ( ! cr ) return false;
	HashTableX *ht = &cr->m_twitchyTable;
	return ( ht->getSlot ( &ip ) >= 0 );
}

static bool addIpToTwitchyTable(CollectionRec *cr, int32_t ip) {
	if ( ! cr ) return true;
	HashTableX *ht = &cr->m_twitchyTable;
	if ( !ht->isInitialized() )
		ht->set ( 4,0,16,NULL,0,false,"twitchtbl",true);
	return ht->addKey ( &ip );
}

RdbCache s_hammerCache;
static bool s_flag = false;
static Msg13Request *s_hammerQueueHead = NULL;
static Msg13Request *s_hammerQueueTail = NULL;

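// "Twitchy" IPs are ones that appear to throttle or ban us (connection
// resets, 403s, empty replies). gotHttpReply2() adds them to the collection's
// m_twitchyTable so later requests to the same IP can automatically back off
// or switch to proxies. s_hammerCache maps firstIp -> last download time in
// milliseconds (0 while a download is in progress) and backs the crawl-delay
// checks; s_hammerQueueHead/Tail form the delay queue drained by
// scanHammerQueue().
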
// . only return false if you want slot to be nuked w/o replying
|
|
// . MUST always call g_udpServer::sendReply() or sendErrorReply()
|
|
void handleRequest13 ( UdpSlot *slot , int32_t niceness ) {
|
|
|
|
// cast it
|
|
Msg13Request *r = (Msg13Request *)slot->m_readBuf;
|
|
// use slot niceness
|
|
r->m_niceness = niceness;
|
|
|
|
// deserialize it now
|
|
deserializeMsg ( sizeof(Msg13),
|
|
&r->size_url,
|
|
&r->size_cookie,
|
|
&r->ptr_url,
|
|
((char*)r) + sizeof(*r) );
|
|
|
|
// make sure we do not download gigablast.com admin pages!
|
|
if ( g_hostdb.isIpInNetwork ( r->m_firstIp ) && r->size_url-1 >= 7 ) {
|
|
Url url;
|
|
url.set ( r->ptr_url );
|
|
// . never download /master urls from ips of hosts in cluster
|
|
// . TODO: FIX! the pages might be in another cluster!
|
|
// . pages are now /admin/* not any /master/* any more.
|
|
if ( ( //strncasecmp ( url.getPath() , "/master/" , 8 ) == 0 ||
|
|
strncasecmp ( url.getPath() , "/admin/" , 7 ) == 0 )) {
|
|
log(LOG_WARN, "spider: Got request to download possible "
|
|
"gigablast control page %s. Sending back "
|
|
"ERESTRICTEDPAGE.",
|
|
url.getUrl());
|
|
g_errno = ERESTRICTEDPAGE;
|
|
|
|
log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
|
|
g_udpServer.sendErrorReply(slot,g_errno);
|
|
return;
|
|
}
|
|
}
|
|
|
|
// . use a max cached age of 24hrs for robots.txt files
|
|
// . this returns true if robots.txt file for hostname found in cache
|
|
// . don't copy since we analyze immediately and don't block
|
|
char *rec;
|
|
int32_t recSize;
|
|
// get the cache
|
|
RdbCache *c = r->m_isRobotsTxt ? &s_httpCacheRobots : &s_httpCacheOthers;
|
|
|
|
// the key is just the 64 bit hash of the url
|
|
key96_t k; k.n1 = 0; k.n0 = r->m_cacheKey;
|
|
// see if in there already
|
|
RdbCacheLock rcl(*c);
|
|
bool inCache = c->getRecord ( (collnum_t)0 , // share btwn colls
|
|
k , // cacheKey
|
|
&rec ,
|
|
&recSize ,
|
|
true , // copy?
|
|
r->m_maxCacheAge , // 24*60*60 ,
|
|
true ); // stats?
|
|
|
|
// . an empty rec is a cached not found (no robot.txt file)
|
|
// . therefore it's allowed, so set *reply to 1 (true)
|
|
if (inCache) {
|
|
HttpCacheData *httpCacheData = reinterpret_cast<HttpCacheData*>(rec);
|
|
if (deserializeMsg(sizeof(*httpCacheData), &httpCacheData->size_reply, &httpCacheData->size_reply, &httpCacheData->ptr_reply, ((char*)httpCacheData + sizeof(*httpCacheData))) != -1) {
|
|
logDebug(g_conf.m_logDebugSpider, "proxy: found %" PRId32" bytes in cache for %s", recSize, r->ptr_url);
|
|
|
|
if (httpCacheData->m_errno == 0) {
|
|
// . send the cached reply back
|
|
// . this will free send/read bufs on completion/g_errno
|
|
char *reply = (char*)mdup(httpCacheData->ptr_reply, httpCacheData->size_reply, "Msg13CacheReply");
|
|
g_udpServer.sendReply(reply, httpCacheData->size_reply, reply, httpCacheData->size_reply, slot);
|
|
} else {
|
|
g_udpServer.sendErrorReply(slot, httpCacheData->m_errno);
|
|
}
|
|
|
|
mfree(rec,recSize,"RdbCache3");
|
|
|
|
return;
|
|
}
|
|
|
|
// we need to free memory even when we're unable to deserialize message
|
|
mfree(rec,recSize,"RdbCache3");
|
|
}
|
|
rcl.unlock();
|
|
|
|
// log it so we can see if we are hammering
|
|
if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads ||
|
|
g_conf.m_logDebugMsg13 ) {
|
|
char ipbuf[16];
|
|
logf(LOG_DEBUG,"spider: DOWNLOADING %s firstIp=%s",
|
|
r->ptr_url,iptoa(r->m_firstIp,ipbuf));
|
|
}
|
|
|
|
// temporary hack
|
|
if ( r->m_parent ) { gbshutdownAbort(true); }
|
|
|
|
if ( ! s_flag ) {
|
|
s_flag = true;
|
|
s_hammerCache.init ( 15000 , // maxcachemem,
|
|
8 , // fixed data size
|
|
500 , // max nodes
|
|
"hamcache" , // dbname
|
|
false , // load from disk?
|
|
12 , // key size
|
|
-1 );// numPtrsMax
|
|
}
|
|
|
|
// save this
|
|
r->m_udpSlot = slot;
|
|
|
|
// send to a proxy if we are doing compression and not a proxy
|
|
if ( r->m_useCompressionProxy && ! g_hostdb.m_myHost->m_isProxy ) {
|
|
// use this key to select which proxy host
|
|
int32_t key = ((uint32_t)r->m_firstIp >> 8);
|
|
// send to host "h"
|
|
Host *h = g_hostdb.getBestSpiderCompressionProxy(&key);
|
|
if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 ) {
|
|
char ipbuf[16];
|
|
log(LOG_DEBUG,"spider: sending to compression proxy %s:%" PRIu32,
|
|
iptoa(h->m_ip,ipbuf), (uint32_t)h->m_port);
|
|
}
|
|
|
|
// . otherwise, send the request to the key host
|
|
// . returns false and sets g_errno on error
|
|
// . now wait for 2 minutes before timing out
|
|
// we are sending to the proxy so make hostId -1
|
|
if (!g_udpServer.sendRequest((char *)r, r->getSize(), msg_type_13, h->m_ip, h->m_port, -1, NULL, r, passOnReply, 200000, niceness)) {
|
|
// g_errno should be set
|
|
|
|
log(LOG_ERROR,"%s:%s:%d: call sendErrorReply. error=%s", __FILE__, __func__, __LINE__, mstrerror(g_errno));
|
|
g_udpServer.sendErrorReply(slot,g_errno);
|
|
return;
|
|
}
|
|
// wait for it
|
|
return;
|
|
}
|
|
|
|
CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );
|
|
|
|
// was it in our table of ips that are throttling us?
|
|
r->m_wasInTableBeforeStarting = isIpInTwitchyTable ( cr , r->m_urlIp );
|
|
|
|
downloadTheDocForReals ( r );
|
|
}
|
|
|
|
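// Rough download call chain for a request handled on this host:
//	handleRequest13()            - cache lookup, compression-proxy forwarding
//	-> downloadTheDocForReals()  - "wait in line" dedup via s_rt
//	-> downloadTheDocForReals2() - decide whether to use a spider proxy
//	                               (msg 0x54 / OP_GETPROXY to the first alive host)
//	-> downloadTheDocForReals3a()- optional hammer-queue (crawl delay) wait
//	-> downloadTheDocForReals3b()- the actual g_httpServer.getDoc() call
//	-> gotHttpReply9()/gotHttpReply()/gotHttpReply2()
//	                             - ban/unwanted-content checks, optional
//	                               compression, reply back over UDP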
static void downloadTheDocForReals2 ( Msg13Request *r ) ;
|
|
static void downloadTheDocForReals3a ( Msg13Request *r ) ;
|
|
static void downloadTheDocForReals3b ( Msg13Request *r ) ;
|
|
|
|
static void gotHttpReply9 ( void *state , TcpSocket *ts ) ;
|
|
|
|
static void gotProxyHostReplyWrapper ( void *state , UdpSlot *slot ) ;
|
|
|
|
void downloadTheDocForReals(Msg13Request *r) {
|
|
|
|
// are we the first?
|
|
bool firstInLine = s_rt.isEmpty ( &r->m_cacheKey );
|
|
// wait in line cuz someone else downloading it now
|
|
if ( ! s_rt.addKey ( &r->m_cacheKey , &r ) ) {
|
|
log(LOG_WARN, "spider: error adding to waiting table %s",r->ptr_url);
|
|
|
|
log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
|
|
g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
|
|
return;
|
|
}
|
|
|
|
// this means our callback will be called
|
|
if ( ! firstInLine ) {
|
|
log("spider: waiting in line %s",r->ptr_url);
|
|
return;
|
|
}
|
|
|
|
downloadTheDocForReals2 ( r );
|
|
}
|
|
|
|
// insertion point when we try to get another proxy to use because the one
|
|
// we tried seemed to be ip-banned
|
|
void downloadTheDocForReals2 ( Msg13Request *r ) {
|
|
|
|
bool useProxies = false;
|
|
|
|
// we gotta have some proxy ips that we can use
|
|
if (g_conf.m_proxyIps.hasDigits()) {
|
|
// for diffbot turn ON if use robots is off
|
|
if (r->m_forceUseFloaters) {
|
|
useProxies = true;
|
|
}
|
|
|
|
CollectionRec *cr = g_collectiondb.getRec(r->m_collnum);
|
|
|
|
// if you turned on automatically use proxies in spider controls...
|
|
if (!useProxies &&
|
|
cr && cr->m_automaticallyUseProxies &&
|
|
r->m_urlIp != 0 && r->m_urlIp != -1 && isIpInTwitchyTable(cr, r->m_urlIp)) {
|
|
useProxies = true;
|
|
}
|
|
|
|
Url url;
|
|
url.set(r->ptr_url, r->size_url);
|
|
if (g_urlProxyList.isUrlMatched(url)) {
|
|
useProxies = true;
|
|
}
|
|
}
|
|
|
|
// we did not need a spider proxy ip so send this request to a host
|
|
// to download the url
|
|
if ( ! useProxies ) {
|
|
downloadTheDocForReals3a ( r );
|
|
return;
|
|
}
|
|
|
|
// before we send out the msg13 request try to get the spider proxy
|
|
// that is the best one to use. only do this if we had spider proxies
|
|
// specified in m_spiderProxyBuf
|
|
|
|
r->m_opCode = OP_GETPROXY;
|
|
|
|
// if we are being called from gotHttpReply9() below trying to
|
|
// get a new proxy because the last was banned, we have to set
|
|
// these so handleRequest54() in SpiderProxy.cpp can call
|
|
// returnProxy()
|
|
|
|
// get first alive host, usually host #0 but if he is dead then
|
|
// host #1 must take over! if all are dead, it returns host #0.
|
|
// so we are guaranteed "h" will be non-null
|
|
Host *h = g_hostdb.getFirstAliveHost();
|
|
|
|
// now ask that host for the best spider proxy to send to
|
|
// just the top part of the Msg13Request is sent to handleRequest54() now
|
|
if (!g_udpServer.sendRequest((char *)r, r->getProxyRequestSize(), msg_type_54, h->m_ip, h->m_port, -1, NULL, r, gotProxyHostReplyWrapper, udpserver_sendrequest_infinite_timeout)) {
|
|
// sanity check
|
|
if ( ! g_errno ) { gbshutdownLogicError(); }
|
|
// report it
|
|
log(LOG_WARN, "spider: msg54 request1: %s %s",
|
|
mstrerror(g_errno),r->ptr_url);
|
|
// crap we gotta send back a reply i guess
|
|
|
|
log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
|
|
g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
|
|
// g_errno must be set!
|
|
return;
|
|
}
|
|
// otherwise we block
|
|
return;
|
|
}
|
|
|
|
void gotProxyHostReplyWrapper ( void *state , UdpSlot *slot ) {
|
|
Msg13Request *r = (Msg13Request *)state;
|
|
//Msg13 *THIS = r->m_parent;
|
|
// don't let udpserver free the request, it's our m_urlIp
|
|
slot->m_sendBufAlloc = NULL;
|
|
// error getting spider proxy to use?
|
|
if ( g_errno ) {
|
|
// note it
|
|
log(LOG_WARN, "sproxy: got proxy request error: %s",mstrerror(g_errno));
|
|
|
|
log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
|
|
g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
|
|
return;
|
|
}
|
|
//
|
|
// the reply is the ip and port of the spider proxy to use
|
|
//
|
|
// what did he give us?
|
|
char *reply = slot->m_readBuf;
|
|
int32_t replySize = slot->m_readBufSize;
|
|
//int32_t replyAllocSize = slot->m_readBufMaxSize;
|
|
// bad reply? ip/port/LBid
|
|
if ( replySize != sizeof(ProxyReply) ) {
|
|
log(LOG_WARN, "sproxy: bad 54 reply size of %" PRId32" != %" PRId32" %s",
|
|
replySize,(int32_t)sizeof(ProxyReply),r->ptr_url);
|
|
|
|
log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
|
|
g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
|
|
return;
|
|
}
|
|
|
|
// set that
|
|
ProxyReply *prep = (ProxyReply *)reply;
|
|
|
|
r->m_proxyIp = prep->m_proxyIp;
|
|
r->m_proxyPort = prep->m_proxyPort;
|
|
// the id of this transaction for the LoadBucket
|
|
// need to save this so we can use it when we send a msg55 request
|
|
// out to tell host #0 how long it took use to use this proxy, etc.
|
|
r->m_lbId = prep->m_lbId;
|
|
|
|
// assume no username:password
|
|
r->m_proxyUsernamePwdAuth[0] = '\0';
|
|
|
|
// if proxy had one copy into the buf
|
|
if ( prep->m_usernamePwd[0] ) {
|
|
int32_t len = strlen(prep->m_usernamePwd);
|
|
memcpy ( r->m_proxyUsernamePwdAuth ,
|
|
prep->m_usernamePwd ,
|
|
len );
|
|
r->m_proxyUsernamePwdAuth[len] = '\0';
|
|
}
|
|
|
|
// if this proxy ip seems banned, are there more proxies to try?
|
|
r->m_hasMoreProxiesToTry = prep->m_hasMoreProxiesToTry;
|
|
|
|
// . how many proxies have been banned by the urlIP?
|
|
// . the more that are banned the higher the self-imposed crawl delay.
|
|
// . handleRequest54() in SpiderProxy.cpp will check s_banTable
|
|
// to count how many are banned for this urlIp. it saves s_banTable
|
|
// (a hashtable) to disk so it should be persistent.
|
|
r->m_numBannedProxies = prep->m_numBannedProxies;
|
|
|
|
downloadTheDocForReals3a ( r );
|
|
}
|
|
|
|
//
|
|
// we need r->m_numBannedProxies to be valid for hammer queueing
|
|
// so we have to do the hammer queue stuff after getting the proxy reply
|
|
//
|
|
void downloadTheDocForReals3a ( Msg13Request *r ) {
|
|
|
|
// if addToHammerQueue() returns true and queues this url for
|
|
// download later, when ready to download call this function
|
|
r->m_hammerCallback = downloadTheDocForReals3b;
|
|
|
|
// . returns true if we queued it for trying later
|
|
// . scanHammerQueue() will call downloadTheDocForReals3(r) for us
|
|
if ( addToHammerQueue ( r ) ) return;
|
|
|
|
downloadTheDocForReals3b( r );
|
|
}
|
|
|
|
void downloadTheDocForReals3b ( Msg13Request *r ) {
|
|
|
|
int64_t nowms = gettimeofdayInMilliseconds();
|
|
|
|
// assume no download start time
|
|
r->m_downloadStartTimeMS = 0;
|
|
|
|
// . store time now
|
|
// . no, now we store 0 to indicate in progress, then we
|
|
// will overwrite it with a timestamp when the download completes
|
|
// . but if measuring crawldelay from beginning of the download then
|
|
// store the current time
|
|
// . do NOT do this when downloading robots.txt etc. type files
|
|
// which should have skipHammerCheck set to true
|
|
if ( r->m_crawlDelayFromEnd && ! r->m_skipHammerCheck ) {
|
|
RdbCacheLock rcl(s_hammerCache);
|
|
s_hammerCache.addLongLong(0,r->m_firstIp, 0LL);//nowms);
|
|
char ipbuf[16];
|
|
log("spider: delay from end for %s %s", iptoa(r->m_firstIp,ipbuf), r->ptr_url);
|
|
}
|
|
else if ( ! r->m_skipHammerCheck ) {
|
|
// get time now
|
|
RdbCacheLock rcl(s_hammerCache);
|
|
s_hammerCache.addLongLong(0,r->m_firstIp, nowms);
|
|
char ipbuf[16];
|
|
log(LOG_DEBUG,
|
|
"spider: adding new time to hammercache for %s %s = %" PRId64,
|
|
iptoa(r->m_firstIp,ipbuf), r->ptr_url,nowms);
|
|
}
|
|
else {
|
|
char ipbuf[16];
|
|
log(LOG_DEBUG,
|
|
"spider: not adding new time to hammer cache for %s %s",
|
|
iptoa(r->m_firstIp,ipbuf), r->ptr_url);
|
|
}
|
|
|
|
// note it
|
|
if ( g_conf.m_logDebugSpider ) {
|
|
char ipbuf[16];
|
|
log("spider: adding special \"in-progress\" time "
|
|
"of %" PRId32" for "
|
|
"firstIp=%s "
|
|
"url=%s "
|
|
"to msg13::hammerCache",
|
|
0,//-1,
|
|
iptoa(r->m_firstIp,ipbuf),
|
|
r->ptr_url);
|
|
}
|
|
|
|
// note it here
|
|
if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 ) {
|
|
char ipbuf[16];
|
|
log("spider: downloading %s (%s) (skiphammercheck=%" PRId32")",
|
|
r->ptr_url,iptoa(r->m_urlIp,ipbuf),
|
|
(int32_t)r->m_skipHammerCheck);
|
|
}
|
|
|
|
char *agent = g_conf.m_spiderUserAgent;
|
|
|
|
// after downloading the doc call this callback
|
|
void (* callback) ( void *,TcpSocket *);
|
|
|
|
// if using a proxy tell host #0 we are done with that proxy so
|
|
// it can do its proxy load balancing
|
|
if ( r->m_proxyIp && r->m_proxyIp != -1 )
|
|
callback = gotHttpReply9;
|
|
// otherwise not using a proxy
|
|
else
|
|
callback = gotHttpReply;
|
|
|
|
// debug note
|
|
if ( r->m_proxyIp ) {
|
|
char ipbuf1[16];
|
|
char ipbuf2[16];
|
|
log(LOG_INFO,
|
|
"sproxy: got proxy %s:%" PRIu32" "
|
|
"and agent=\"%s\" to spider "
|
|
"%s %s (numBannedProxies=%" PRId32")",
|
|
iptoa(r->m_proxyIp,ipbuf1),
|
|
(uint32_t)(uint16_t)r->m_proxyPort,
|
|
agent,
|
|
iptoa(r->m_urlIp,ipbuf2),
|
|
r->ptr_url,
|
|
r->m_numBannedProxies);
|
|
}
|
|
|
|
char *exactRequest = NULL;
|
|
|
|
// if simulating squid just pass the proxied request on
|
|
// to the webserver as it is, but without the secret
|
|
// Proxy-Authorization: Basic abcdefgh base64 encoded
|
|
// username/password info.
|
|
if ( r->m_isSquidProxiedUrl ) {
|
|
exactRequest = r->ptr_url;
|
|
stripProxyAuthorization ( exactRequest );
|
|
}
|
|
|
|
// convert "GET http://xyz.com/abc" to "GET /abc" if not sending
|
|
// to another proxy... and sending to the actual webserver
|
|
if ( r->m_isSquidProxiedUrl && ! r->m_proxyIp )
|
|
fixGETorPOST ( exactRequest );
|
|
|
|
// ALSO ADD authorization to the NEW proxy we are sending to
|
|
// r->m_proxyIp/r->m_proxyPort that has a username:password
|
|
StackBuf<1024> newReq;
|
|
if ( r->m_isSquidProxiedUrl && r->m_proxyIp ) {
|
|
newReq.safeStrcpy ( exactRequest );
|
|
addNewProxyAuthorization ( &newReq , r );
|
|
newReq.nullTerm();
|
|
exactRequest = newReq.getBufStart();
|
|
}
|
|
|
|
// indicate start of download so we can overwrite the 0 we stored
|
|
// into the hammercache
|
|
r->m_downloadStartTimeMS = nowms;
|
|
|
|
// prevent core from violating MAX_DGRAMS #defined in UdpSlot.h
|
|
int32_t maxDocLen1 = r->m_maxTextDocLen;
|
|
int32_t maxDocLen2 = r->m_maxOtherDocLen;
|
|
|
|
// fix core in UdpServer.cpp from sending back a too big reply
|
|
if ( maxDocLen1 < 0 || maxDocLen1 > MAX_ABSDOCLEN )
|
|
maxDocLen1 = MAX_ABSDOCLEN;
|
|
if ( maxDocLen2 < 0 || maxDocLen2 > MAX_ABSDOCLEN )
|
|
maxDocLen2 = MAX_ABSDOCLEN;
|
|
|
|
// . download it
|
|
// . if m_proxyIp is non-zero it will make requests like:
|
|
// GET http://xyz.com/abc
|
|
if ( ! g_httpServer.getDoc ( r->ptr_url ,
|
|
r->m_urlIp ,
|
|
0 , // offset
|
|
-1 ,
|
|
r->m_ifModifiedSince ,
|
|
r , // state
|
|
callback , // callback
|
|
30*1000 , // 30 sec timeout
|
|
r->m_proxyIp ,
|
|
r->m_proxyPort ,
|
|
maxDocLen1,//r->m_maxTextDocLen ,
|
|
maxDocLen2,//r->m_maxOtherDocLen ,
|
|
agent ,
|
|
DEFAULT_HTTP_PROTO , // "HTTP/1.0"
|
|
false , // doPost?
|
|
r->ptr_cookie , // cookie
|
|
NULL , // additionalHeader
|
|
exactRequest , // our own mime!
|
|
NULL , // postContent
|
|
// this is NULL or '\0' if not there
|
|
r->m_proxyUsernamePwdAuth ) ) {
|
|
// return false if blocked
|
|
return;
|
|
}
|
|
|
|
// . log this so i know about it
|
|
// . g_errno MUST be set so that we do not DECREMENT
|
|
// the outstanding dom/ip counts in gotDoc() below
|
|
// because we did not increment them above
|
|
logf(LOG_DEBUG,"spider: http server had error: %s",mstrerror(g_errno));
|
|
|
|
// g_errno should be set
|
|
if ( ! g_errno ) { gbshutdownLogicError(); }
|
|
|
|
// if did not block -- should have been an error. call callback
|
|
gotHttpReply ( r , NULL );
|
|
return ;
|
|
}
|
|
|
|
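// Crawl-delay bookkeeping: before the download starts,
// downloadTheDocForReals3b() stores either 0 ("in progress") or the start
// time into s_hammerCache for r->m_firstIp; when the reply arrives
// gotHttpReply2() overwrites that with the download start time or the
// completion time, depending on m_crawlDelayFromEnd. Requests with
// m_skipHammerCheck set (e.g. robots.txt) bypass this bookkeeping.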
static int32_t s_55Out = 0;
|
|
|
|
static void doneReportingStatsWrapper(void * /*state*/, UdpSlot *slot) {
|
|
// note it
|
|
if ( g_errno )
|
|
log("sproxy: 55 reply: %s",mstrerror(g_errno));
|
|
|
|
// clear g_errno i guess
|
|
g_errno = 0;
|
|
|
|
// don't let udpserver free the request, it's our m_urlIp
|
|
slot->m_sendBufAlloc = NULL;
|
|
|
|
s_55Out--;
|
|
}
|
|
|
|
static bool ipWasBanned(TcpSocket *ts, const char **msg, Msg13Request *r) {
|
|
|
|
// ts will be null if we got a fake reply from a bulk job
|
|
if ( ! ts )
|
|
return false;
|
|
|
|
// do not do this on robots.txt files
|
|
if ( r->m_isRobotsTxt )
|
|
return false;
|
|
|
|
// g_errno is 104 for 'connection reset by peer'
|
|
if ( g_errno == ECONNRESET ) {
|
|
*msg = "connection reset";
|
|
return true;
|
|
}
|
|
|
|
// proxy returns empty reply not ECONNRESET if it experiences
|
|
// a conn reset
|
|
if ( g_errno == EBADMIME && ts->m_readOffset == 0 ) {
|
|
*msg = "empty reply";
|
|
return true;
|
|
}
|
|
|
|
// on other errors do not do the ban check. it might be a
|
|
// tcp time out or something so we have no reply. but connection resets
|
|
// are a popular way of saying, hey, don't hit me so hard.
|
|
if ( g_errno ) return false;
|
|
|
|
// if they closed the socket on us we read 0 bytes, assumed
|
|
// we were banned...
|
|
if ( ts->m_readOffset == 0 ) {
|
|
*msg = "empty reply";
|
|
return true;
|
|
}
|
|
|
|
// check the http mime for 403 Forbidden
|
|
HttpMime mime;
|
|
mime.set ( ts->m_readBuf , ts->m_readOffset , NULL );
|
|
|
|
int32_t httpStatus = mime.getHttpStatus();
|
|
if ( httpStatus == 403 ) {
|
|
*msg = "status 403 forbidden";
|
|
return true;
|
|
}
|
|
if ( httpStatus == 999 ) {
|
|
*msg = "status 999 request denied";
|
|
return true;
|
|
}
|
|
// let's add this new one
|
|
if ( httpStatus == 503 ) {
|
|
*msg = "status 503 service unavailable";
|
|
return true;
|
|
}
|
|
|
|
// TODO: compare a simple checksum of the page content to what
|
|
// we have downloaded previously from this domain or ip. if it
|
|
// seems to be the same no matter what the url, then perhaps we
|
|
// are banned as well.
|
|
|
|
// otherwise assume not.
|
|
*msg = NULL;
|
|
|
|
return false;
|
|
}
|
|
|
|
static void appendRetryProxy(const char *url, int urlLen, const char *location = nullptr, int locationLen = 0) {
|
|
char filename[1024];
|
|
sprintf(filename,"%s/retryproxy.txt", g_hostdb.m_myHost->m_dir);
|
|
FILE *fp = fopen(filename,"a");
|
|
if (fp) {
|
|
fprintf(fp, "%.*s|%.*s\n", urlLen, url, locationLen, location);
|
|
fclose(fp);
|
|
}
|
|
}
|
|
|
|
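// retryProxy(): decides whether a just-fetched page should be re-fetched
// through a spider proxy. It triggers when the reply is a redirect whose
// Location matches g_urlRetryProxyList while the original url does not, or
// when the body matches g_contentRetryProxyList; matches are appended to
// retryproxy.txt. No proxy retry is attempted once m_proxyTries > 0.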
static bool retryProxy(TcpSocket *ts, const char **msg, Msg13Request *r) {
|
|
if (!ts) {
|
|
return false;
|
|
}
|
|
|
|
//we only do proxy checks if there weren't any other error
|
|
if (g_errno != 0) {
|
|
return false;
|
|
}
|
|
|
|
// don't check for retries if it's already done
|
|
if (r->m_proxyTries > 0) {
|
|
return false;
|
|
}
|
|
|
|
Url url;
|
|
url.set(r->ptr_url, r->size_url);
|
|
|
|
HttpMime mime;
|
|
mime.set(ts->m_readBuf, ts->m_readOffset, &url);
|
|
|
|
int32_t httpStatus = mime.getHttpStatus();
|
|
if (httpStatus == 301 || httpStatus == 302 || httpStatus == 307 || httpStatus == 308) {
|
|
// we only retry when list matches redirected url & does not match original url
|
|
if (g_urlRetryProxyList.isUrlMatched(url)) {
|
|
return false;
|
|
}
|
|
|
|
const Url *location = mime.getLocationUrl();
|
|
|
|
if (g_urlRetryProxyList.isUrlMatched(*location)) {
|
|
*msg = "redir url proxy match list";
|
|
appendRetryProxy(url.getUrl(), url.getUrlLen(), location->getUrl(), location->getUrlLen());
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
size_t pre_size = mime.getMimeLen(); //size of http response line, mime headers and empty line separator
|
|
size_t haystack_size = ts->m_readOffset - pre_size;
|
|
const char *haystack = ts->m_readBuf + pre_size;
|
|
|
|
if (g_contentRetryProxyList.isContentMatched(haystack, haystack_size)) {
|
|
*msg = "content proxy match list";
|
|
appendRetryProxy(url.getUrl(), url.getUrlLen());
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
static void appendCrawlBan(const char *group, const char *url, int urlLen) {
|
|
char filename[1024];
|
|
sprintf(filename,"%s/crawlban.%s", g_hostdb.m_myHost->m_dir, group);
|
|
FILE *fp = fopen(filename,"a");
|
|
if(fp) {
|
|
fprintf(fp,"%.*s\n",urlLen,url);
|
|
fclose(fp);
|
|
}
|
|
Statistics::increment_crawl_ban_counter(group);
|
|
}
|
|
|
|
|
|
static bool crawlWasBanned(TcpSocket *ts, const char **msg, Msg13Request *r) {
|
|
//logTrace ..."crawlWasBanned, %.*s", r->size_url, r->ptr_url);
|
|
// no socket -> must be a bulk import job so obviously not banned
|
|
if(!ts)
|
|
return false;
|
|
|
|
//todo: should we check for specific error codes, eg ECONNRESET which some webservers
|
|
//might use for requests they don't like?
|
|
//if(g_errno==ECONNRESET || ... )
|
|
// *msg = "connection reset";
|
|
// return true;
|
|
//}
|
|
|
|
//todo: proxy proxy returns empty reply not ECONNRESET if it experiences a conn reset
|
|
//if(g_errno==EBADMIME && ts->m_readOffset==0) ...
|
|
|
|
//we only do ban checks if there weren't any other error
|
|
if(g_errno!=0)
|
|
return false;
|
|
|
|
//todo: if the server returned an empty response then we might be banned. But let's assume not for now.
|
|
|
|
// check the http mime for 403 Forbidden
|
|
HttpMime mime;
|
|
mime.set ( ts->m_readBuf , ts->m_readOffset , NULL );
|
|
|
|
int32_t httpStatus = mime.getHttpStatus();
|
|
if(httpStatus == 403) { //forbidden
|
|
//Cloudflare's only indication that it didn't like the request is that we get a 403 and the response is "branded" by Cloudflare,
|
|
//which apparently means the response body contains the text "Cloudflare"
|
|
//The string Cloudflare should be within the first 1KB but we search the first 4KB
|
|
size_t pre_size = mime.getMimeLen(); //size of http response line, mime headers and empty line separator
|
|
size_t haystack_size = ts->m_readOffset - pre_size;
|
|
if(haystack_size>4096)
|
|
haystack_size = 4096;
|
|
const void *haystack = ts->m_readBuf + pre_size;
|
|
if(memmem(haystack,haystack_size,"Cloudflare",10)!=0) {
|
|
log(LOG_INFO,"Url %.*s appears to be blocked by cloudflare (http-status-code=%d, response contains 'Cloudflare')", r->size_url, r->ptr_url, httpStatus);
|
|
*msg = "403 forbidden";
|
|
appendCrawlBan("cloudflare", r->ptr_url, r->size_url);
|
|
return true;
|
|
}
|
|
}
|
|
if(httpStatus == 200) { //ok
|
|
//Detect Distil networks capcha
|
|
//No documentation but it appears that their responses always contain a refresh header like:
|
|
// http-equiv="refresh" content="10; url=/distil_r_captcha.html ....
|
|
//Their documentation says that they include an X-DB header which is a bitmask but inconsistencies
|
|
//have been seen where they also include two such headers (?) and the name is very generic. So
|
|
//checking for X-DB header is probably not the best way.
|
|
size_t pre_size = mime.getMimeLen(); //size of http response line, mime headers and empty line separator
|
|
size_t haystack_size = ts->m_readOffset - pre_size;
|
|
if(haystack_size>4096)
|
|
haystack_size = 4096;
|
|
const void *haystack = ts->m_readBuf + pre_size;
|
|
if(memmem(haystack,haystack_size,"distil_r_captcha",16)!=0) {
|
|
log(LOG_INFO,"Url %.*s appears to be captcha-blocked by distilnetworks (http-status-code=%d, response contains 'distil_r_captcha')", r->size_url, r->ptr_url, httpStatus);
|
|
*msg = "captcha-blocked";
|
|
appendCrawlBan("distil-captcha", r->ptr_url, r->size_url);
|
|
return true;
|
|
}
|
|
}
|
|
if(httpStatus==405) { //method not allowed
|
|
//Apparently a simple block by Distil Networks: a status-code of 405 and a Content-Length > 0 but no content
|
|
if(mime.getContentLen()>0) {
|
|
int32_t actual_content_size = ts->m_readOffset - mime.getMimeLen();
|
|
if(actual_content_size<=0) {
|
|
log(LOG_INFO,"Url %.*s appears to be blocked by distilnetworks (http-status-code=%d, response is shorter than Content-Length)", r->size_url, r->ptr_url, httpStatus);
|
|
*msg = "405-blocked";
|
|
appendCrawlBan("distil-block", r->ptr_url, r->size_url);
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
if(httpStatus==503) { //service unavailable
|
|
//Wordfence captcha is not that easy to detect. The only way so far is to look for the status code and text "Generated by Wordfence"
|
|
size_t pre_size = mime.getMimeLen(); //size of http response line, mime headers and empty line separator
|
|
size_t haystack_size = ts->m_readOffset - pre_size;
|
|
const void *haystack = ts->m_readBuf + pre_size;
|
|
if(memmem(haystack,haystack_size,"Generated by Wordfence",22)!=0) {
|
|
log(LOG_INFO,"Url %.*s appears to be captcha-blocked by wordfence (http-status-code=%d, response contains 'Generated by Wordfence')", r->size_url, r->ptr_url, httpStatus);
|
|
*msg = "captcha-blocked";
|
|
appendCrawlBan("wordfence-captcha", r->ptr_url, r->size_url);
|
|
return true;
|
|
}
|
|
}
|
|
|
|
//logTrace ..."Url crawl seems to not be banned");
|
|
// otherwise assume not.
|
|
*msg = NULL;
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
static bool contentIsUnwanted(TcpSocket *ts, const char **msg, Msg13Request *r) {
|
|
//logTrace ..."contentIsUnwanted, %.*s", r->size_url, r->ptr_url);
|
|
// no socket -> must be a bulk import job so obviously wanted
|
|
if(!ts)
|
|
return false;
|
|
|
|
//we only do ban checks if there weren't any other error
|
|
if(g_errno!=0)
|
|
return false;
|
|
|
|
//todo: if the server returned an empty response then we might be banned. But let's assume not for now.
|
|
|
|
// check the http mime for 403 Forbidden
|
|
HttpMime mime;
|
|
mime.set ( ts->m_readBuf , ts->m_readOffset , NULL );
|
|
|
|
int32_t httpStatus = mime.getHttpStatus();
|
|
if(httpStatus == 200) { //ok
|
|
size_t pre_size = mime.getMimeLen(); //size of http response line, mime headers and empty line separator
|
|
size_t haystack_size = ts->m_readOffset - pre_size;
|
|
const void *haystack = ts->m_readBuf + pre_size;
|
|
if(!WantedChecker::check_single_content(r->ptr_url,haystack,haystack_size).wanted) {
|
|
log(LOG_INFO,"Url %s is unwanted by shlib", r->ptr_url);
|
|
*msg = "shlib-unwanted";
|
|
return true;
|
|
}
|
|
}
|
|
|
|
//logTrace ..."Url crawl seems to not be banned");
|
|
// otherwise assume not.
|
|
*msg = NULL;
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
// come here after telling host #0 we are done using this proxy.
|
|
// host #0 will update the loadbucket for it, using m_lbId.
|
|
void gotHttpReply9 ( void *state , TcpSocket *ts ) {
|
|
|
|
// cast it
|
|
Msg13Request *r = (Msg13Request *)state;
|
|
|
|
|
|
// if we got a 403 Forbidden or an empty reply
|
|
// then assume the proxy ip got banned so try another.
|
|
const char *banMsg = NULL;
|
|
//bool banned = false;
|
|
|
|
if ( g_errno )
|
|
log("msg13: got error from proxy: %s",mstrerror(g_errno));
|
|
|
|
if ( g_conf.m_logDebugSpider )
|
|
log("msg13: got proxy reply for %s",r->ptr_url);
|
|
|
|
//if ( ! g_errno )
|
|
bool banned = ipWasBanned ( ts , &banMsg , r );
|
|
|
|
|
|
// inc this every time we try
|
|
r->m_proxyTries++;
|
|
|
|
// log a handy msg if proxy was banned
|
|
if ( banned ) {
|
|
const char *msg = "No more proxies to try. Using reply as is.";
|
|
if ( r->m_hasMoreProxiesToTry ) msg = "Trying another proxy.";
|
|
char ipbuf1[16];
|
|
char ipbuf2[16];
|
|
log("msg13: detected that proxy %s is banned "
|
|
"(banmsg=%s) "
|
|
"(tries=%" PRId32") by "
|
|
"url %s %s. %s"
|
|
, iptoa(r->m_proxyIp,ipbuf1) // r->m_banProxyIp
|
|
, banMsg
|
|
, r->m_proxyTries
|
|
, iptoa(r->m_urlIp,ipbuf2)
|
|
, r->ptr_url
|
|
, msg );
|
|
}
|
|
|
|
if ( banned &&
|
|
// try up to 5 different proxy ips. try to try new c blocks
|
|
// if available.
|
|
//r->m_proxyTries < 5 &&
|
|
// if all proxies are banned for this r->m_urlIp then
|
|
// this will be false
|
|
r->m_hasMoreProxiesToTry ) {
|
|
// tell host #0 to add this urlip/proxyip pair to its ban tbl
|
|
// when it sends a msg 0x54 request to get another proxy.
|
|
// TODO: shit, it also has to return the banned proxy...
|
|
r->m_banProxyIp = r->m_proxyIp;
|
|
r->m_banProxyPort = r->m_proxyPort;
|
|
// . re-download but using a different proxy
|
|
// . handleRequest54 should not increment the outstanding
|
|
// count because we should give it the same m_lbId
|
|
// . skip s_rt table since we are already first in line and
|
|
// others may be waiting for us...
|
|
downloadTheDocForReals2 ( r );
|
|
return;
|
|
}
|
|
|
|
// tell host #0 to reduce proxy load cnt
|
|
r->m_opCode = OP_RETPROXY;
|
|
|
|
Host *h = g_hostdb.getFirstAliveHost();
|
|
// now return the proxy. this will decrement the load count on
|
|
// host "h" for this proxy.
|
|
if (g_udpServer.sendRequest((char *)r, r->getProxyRequestSize(), msg_type_54, h->m_ip, h->m_port, -1, NULL, NULL, doneReportingStatsWrapper, 10000)) {
|
|
// it blocked!
|
|
//r->m_blocked = true;
|
|
s_55Out++;
|
|
// sanity
|
|
if ( s_55Out > 500 )
|
|
log("sproxy: s55out > 500 = %" PRId32,s_55Out);
|
|
}
|
|
// sanity check
|
|
//if ( ! g_errno ) { gbshutdownLogicError(); }
|
|
// report it
|
|
if ( g_errno ) log("spider: msg54 request2: %s %s",
|
|
mstrerror(g_errno),r->ptr_url);
|
|
// it failed i guess proceed
|
|
gotHttpReply( state , ts );
|
|
}
|
|
|
|
void gotHttpReply ( void *state , TcpSocket *ts ) {
|
|
// if we had no error, TcpSocket should be legit
|
|
if ( ts ) {
|
|
gotHttpReply2 ( state ,
|
|
ts->m_readBuf ,
|
|
ts->m_readOffset ,
|
|
ts->m_readBufSize,
|
|
ts );
|
|
// now after we return TcpServer will DESTROY "ts" and
|
|
// free m_readBuf... so we should not have any reference to it
|
|
return;
|
|
}
|
|
// sanity check, if ts is NULL must have g_errno set
|
|
if ( ! g_errno ) { gbshutdownLogicError(); } // g_errno=EBADENG...
|
|
// if g_errno is set i guess ts is NULL!
|
|
gotHttpReply2 ( state , NULL ,0 , 0 , NULL );
|
|
}
|
|
|
|
void gotHttpReply2 ( void *state ,
|
|
char *reply ,
|
|
int32_t replySize ,
|
|
int32_t replyAllocSize ,
|
|
TcpSocket *ts ) {
|
|
|
|
// save error
|
|
int32_t savedErr = g_errno;
|
|
|
|
Msg13Request *r = (Msg13Request *) state;
|
|
UdpSlot *slot = r->m_udpSlot;
|
|
|
|
CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );
|
|
|
|
// error?
|
|
if ( g_errno && ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 ) ) {
|
|
char ipbuf[16];
|
|
log("spider: http reply (msg13) had error = %s for %s at ip %s",
|
|
mstrerror(savedErr),r->ptr_url,iptoa(r->m_urlIp,ipbuf));
|
|
}
|
|
|
|
bool inTable = false;
|
|
bool checkIfBanned = false;
|
|
if ( cr && cr->m_automaticallyBackOff ) checkIfBanned = true;
|
|
if ( cr && cr->m_automaticallyUseProxies ) checkIfBanned = true;
|
|
// must have a collrec to hold the ips
|
|
if ( checkIfBanned && cr && r->m_urlIp != 0 && r->m_urlIp != -1 )
|
|
inTable = isIpInTwitchyTable ( cr , r->m_urlIp );
|
|
|
|
// check if our ip seems banned. if g_errno was ECONNRESET that
|
|
// is an indicator it was throttled/banned.
|
|
const char *banMsg = NULL;
|
|
bool banned = false;
|
|
if ( checkIfBanned )
|
|
banned = ipWasBanned ( ts , &banMsg , r );
|
|
if ( banned ) {
|
|
// should we turn proxies on for this IP address only?
|
|
char ipbuf[16];
|
|
log("msg13: url %s detected as banned (%s), "
|
|
"for ip %s"
|
|
, r->ptr_url
|
|
, banMsg
|
|
, iptoa(r->m_urlIp,ipbuf)
|
|
);
|
|
}
|
|
|
|
bool retry_proxy = false;
|
|
if (retryProxy(ts, &banMsg, r)) {
|
|
retry_proxy = true;
|
|
char ipbuf[16];
|
|
log("msg13: retry using proxy for url %s due to %s, for ip %s", r->ptr_url, banMsg, iptoa(r->m_urlIp, ipbuf));
|
|
}
|
|
|
|
if(crawlWasBanned(ts,&banMsg,r)) {
|
|
char ipbuf[16];
|
|
log("msg13: url %.*s detected as banned2 (%s), for ip %s"
|
|
, (int)r->size_url, r->ptr_url
|
|
, banMsg
|
|
, iptoa(r->m_urlIp,ipbuf)
|
|
);
|
|
savedErr = g_errno = EBANNEDCRAWL;
|
|
}
|
|
|
|
if(contentIsUnwanted(ts,&banMsg,r)) {
|
|
log("msg13: url %.*s is unwanted (%s)"
|
|
, (int)r->size_url, r->ptr_url
|
|
, banMsg
|
|
);
|
|
savedErr = g_errno = EDOCBLOCKEDSHLIBCONTENT;
|
|
}
|
|
|
|
if (ts && ts->m_truncated) {
|
|
if (ts->m_blockedContentType) {
|
|
savedErr = g_errno = EDOCBADCONTENTTYPE;
|
|
} else {
|
|
savedErr = g_errno = EDOCTOOBIG;
|
|
}
|
|
}
|
|
|
|
// . add to the table if not in there yet
|
|
// . store in our table of ips we should use proxies for
|
|
// . also start off with a crawldelay of like 1 sec for this
|
|
// which is not normal for using proxies.
|
|
if ( banned && ! inTable )
|
|
addIpToTwitchyTable ( cr , r->m_urlIp );
|
|
|
|
// did we detect it as banned?
|
|
if ( banned &&
|
|
// retry if we haven't already, but if we did, stop the infinite loop
|
|
! r->m_wasInTableBeforeStarting &&
|
|
cr && ( cr->m_automaticallyBackOff || cr->m_automaticallyUseProxies ) &&
|
|
// but this is not for proxies... only native crawlbot backoff
|
|
! r->m_proxyIp ) {
|
|
// note this as well
|
|
log("msg13: retrying spidered page with new logic for %s",
|
|
r->ptr_url);
|
|
// reset this so we don't endless loop it
|
|
r->m_wasInTableBeforeStarting = true;
|
|
// reset error
|
|
g_errno = 0;
|
|
/// and retry. it should use the proxy... or at least
|
|
// use a crawldelay of 3 seconds since we added it to the
|
|
// twitchy table.
|
|
downloadTheDocForReals2 ( r );
|
|
// that's it. if it had an error it will send back a reply.
|
|
return;
|
|
}
|
|
|
|
if (retry_proxy) {
|
|
// note this as well
|
|
log("msg13: retrying spidered page with proxy for %s", r->ptr_url);
|
|
|
|
// reset error
|
|
g_errno = 0;
|
|
|
|
r->m_forceUseFloaters = 1;
|
|
|
|
downloadTheDocForReals2(r);
|
|
return;
|
|
}
|
|
|
|
// do not print this if we are already using proxies, it is for
|
|
// the auto crawldelay backoff logic only
|
|
if ( banned && r->m_wasInTableBeforeStarting && ! r->m_proxyIp )
|
|
log("msg13: can not retry banned download of %s "
|
|
"because we knew ip was banned at start",r->ptr_url);
|
|
|
|
// get time now
|
|
int64_t nowms = gettimeofdayInMilliseconds();
|
|
|
|
// right now there is a 0 in there to indicate in-progress.
|
|
// so we must overwrite with either the download start time or the
|
|
// download end time.
|
|
int64_t timeToAdd = r->m_downloadStartTimeMS;
|
|
if ( r->m_crawlDelayFromEnd ) timeToAdd = nowms;
|
|
|
|
// . now store the current time in the cache
|
|
// . do NOT do this for robots.txt etc. where we skip hammer check
|
|
if ( ! r->m_skipHammerCheck ) {
|
|
RdbCacheLock rcl(s_hammerCache);
|
|
s_hammerCache.addLongLong(0,r->m_firstIp,timeToAdd);
|
|
}
|
|
|
|
// note it
|
|
if ( g_conf.m_logDebugSpider && ! r->m_skipHammerCheck ) {
|
|
char ipbuf[16];
|
|
log(LOG_DEBUG,"spider: adding last download time "
|
|
"of %" PRId64" for firstIp=%s url=%s "
|
|
"to msg13::hammerCache",
|
|
timeToAdd,iptoa(r->m_firstIp,ipbuf),r->ptr_url);
|
|
}
|
|
|
|
|
|
if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 ) {
|
|
char ipbuf[16];
|
|
log(LOG_DEBUG,"spider: got http reply for firstip=%s url=%s "
|
|
"err=%s",
|
|
iptoa(r->m_firstIp,ipbuf),r->ptr_url,mstrerror(savedErr));
|
|
}
|
|
|
|
// . sanity check - robots.txt requests must always be compressed
|
|
// . saves space in the cache
|
|
if ( ! r->m_compressReply && r->m_isRobotsTxt ) { gbshutdownLogicError();}
|
|
// null terminate it always! -- unless already null terminated...
|
|
if ( replySize > 0 && reply[replySize-1] ) reply[replySize++] = '\0';
|
|
// sanity check
|
|
if ( replySize > replyAllocSize ) { gbshutdownLogicError(); }
|
|
|
|
// save original size
|
|
int32_t originalSize = replySize;
|
|
|
|
// sanity check
|
|
if ( replySize>0 && reply[replySize-1]!= '\0') { gbshutdownLogicError(); }
|
|
|
|
// assume http status is 200
|
|
bool goodStatus = true;
|
|
|
|
int64_t *docsPtr = NULL;
|
|
int64_t *bytesInPtr = NULL;
|
|
int64_t *bytesOutPtr = NULL;
|
|
|
|
// use this mime
|
|
HttpMime mime;
|
|
int32_t httpStatus = 0; // 200;
|
|
|
|
// do not do any of the content analysis routines below if we
|
|
// had a g_errno like ETCPTIMEDOUT or EBADMIME or whatever...
|
|
if ( savedErr ) goodStatus = false;
|
|
|
|
// no, its on the content only, NOT including mime
|
|
int32_t mimeLen = 0;
|
|
|
|
// only bother rewriting the error mime if user wanted compression
|
|
// otherwise, don't bother rewriting it.
|
|
// DO NOT do this if savedErr is set because we end up calling
|
|
// sendErorrReply() below for that!
|
|
if ( replySize>0 && r->m_compressReply && ! savedErr ) {
|
|
// assume fake reply
|
|
if ( reply == g_fakeReply ) {
|
|
httpStatus = 200;
|
|
}
|
|
else {
|
|
// exclude the \0 i guess. use NULL for url.
|
|
mime.set ( reply , replySize - 1, NULL );
|
|
// no, its on the content only, NOT including mime
|
|
mimeLen = mime.getMimeLen();
|
|
// get this
|
|
httpStatus = mime.getHttpStatus();
|
|
}
|
|
// if it's -1, unknown i guess, then force to 505
|
|
// server side error. we get an EBADMIME for our g_errno
|
|
// when we enter this loop sometimes, so in that case...
|
|
if ( httpStatus == -1 ) httpStatus = 505;
|
|
if ( savedErr ) httpStatus = 505;
|
|
// if bad http status, re-write it
|
|
if ( httpStatus != 200 ) {
|
|
char tmpBuf[2048];
|
|
char *p = tmpBuf;
|
|
p += sprintf( tmpBuf,
|
|
"HTTP/1.0 %" PRId32"\r\n"
|
|
"Content-Length: 0\r\n" ,
|
|
httpStatus );
|
|
// convey redirect urls back to requester
|
|
const char *loc = mime.getLocationField();
|
|
int32_t locLen = mime.getLocationFieldLen();
|
|
// if too big, forget it! otherwise we breach tmpBuf
|
|
if ( loc && locLen > 0 && locLen < 1024 ) {
|
|
p += sprintf ( p , "Location: " );
|
|
memcpy ( p , loc , locLen );
|
|
p += locLen;
|
|
memcpy ( p , "\r\n", 2 );
|
|
p += 2;
|
|
}
|
|
// close it up
|
|
p += sprintf ( p , "\r\n" );
|
|
// copy it over as new reply, include \0
|
|
int32_t newSize = p - tmpBuf + 1;
|
|
if ( newSize >= 2048 ) { gbshutdownLogicError(); }
|
|
// record in the stats
|
|
docsPtr = &g_stats.m_compressMimeErrorDocs;
|
|
bytesInPtr = &g_stats.m_compressMimeErrorBytesIn;
|
|
bytesOutPtr = &g_stats.m_compressMimeErrorBytesOut;
|
|
// only replace orig reply if we are smaller
|
|
if ( newSize < replySize ) {
|
|
memcpy ( reply , tmpBuf , newSize );
|
|
replySize = newSize;
|
|
}
|
|
// reset content hash
|
|
goodStatus = false;
|
|
}
|
|
}
|
|
|
|
// point to the content
|
|
char *content = reply + mimeLen;
|
|
// reduce length by that
|
|
int32_t contentLen = replySize - 1 - mimeLen;
|
|
// fix bad crap
|
|
if ( contentLen < 0 ) contentLen = 0;
|
|
|
|
// fake http 200 reply?
|
|
if ( reply == g_fakeReply ) { content = NULL; contentLen = 0; }
|
|
|
|
if ( replySize > 0 &&
|
|
goodStatus &&
|
|
!r->m_isRobotsTxt &&
|
|
r->m_compressReply ) {
|
|
// get the content type from mime
|
|
char ct = mime.getContentType();
|
|
if ( ct != CT_HTML &&
|
|
ct != CT_TEXT &&
|
|
ct != CT_XML &&
|
|
ct != CT_PDF &&
|
|
ct != CT_DOC &&
|
|
ct != CT_XLS &&
|
|
ct != CT_PPT &&
|
|
ct != CT_PS ) {
|
|
// record in the stats
|
|
docsPtr = &g_stats.m_compressBadCTypeDocs;
|
|
bytesInPtr = &g_stats.m_compressBadCTypeBytesIn;
|
|
bytesOutPtr = &g_stats.m_compressBadCTypeBytesOut;
|
|
replySize = 0;
|
|
}
|
|
}
|
|
|
|
// sanity
|
|
if ( reply && replySize>0 && reply[replySize-1]!='\0') {
|
|
gbshutdownLogicError();
|
|
}
|
|
|
|
bool hasIframe2 = false;
|
|
if ( r->m_compressReply &&
|
|
goodStatus &&
|
|
! r->m_isRobotsTxt )
|
|
hasIframe2 = hasIframe(reply, replySize);
|
|
|
|
// sanity
|
|
if ( reply && replySize>0 && reply[replySize-1]!='\0') {
|
|
gbshutdownLogicError();
|
|
}
|
|
|
|
if ( hasIframe2 &&
|
|
! r->m_attemptedIframeExpansion &&
|
|
! r->m_isSquidProxiedUrl ) {
|
|
// must have ts i think
|
|
if ( ! ts ) { gbshutdownAbort(true); }
|
|
// sanity
|
|
if ( ts->m_readBuf != reply ) { gbshutdownLogicError();}
|
|
// . try to expand each iframe tag in there
|
|
// . return without sending a reply back if this blocks
|
|
// . it will return true and set g_errno on error
|
|
// . when it has fully expanded the doc's iframes it we
|
|
// re-call this gotHttpReply() function but with the
|
|
// TcpServer's buf swapped out to be the buf that has the
|
|
// expanded iframes in it
|
|
// . returns false if blocks
|
|
// . returns true if did not block, sets g_errno on error
|
|
// . if it blocked it will recall THIS function
|
|
if ( ! getIframeExpandedContent ( r , ts ) ) {
|
|
if ( g_conf.m_logDebugMsg13 ||
|
|
g_conf.m_logDebugSpider )
|
|
log("msg13: iframe expansion blocked %s",
|
|
r->ptr_url);
|
|
return;
|
|
}
|
|
// ok, did we have an error?
|
|
if ( g_errno )
|
|
log("scproxy: xml set for %s had error: %s",
|
|
r->ptr_url,mstrerror(g_errno));
|
|
// otherwise, i guess we had no iframes worthy of expanding
|
|
// so pretend we do not have any iframes
|
|
hasIframe2 = false;
|
|
}
|
|
|
|
	// sanity
	if ( reply && replySize>0 && reply[replySize-1]!='\0') {
		gbshutdownLogicError();
	}

	// compute content hash
	if ( r->m_contentHash32 &&
	     replySize>0 &&
	     goodStatus &&
	     r->m_compressReply &&
	     // if we got iframes we can't tell if content changed
	     ! hasIframe2 ) {
		// compute it
		int32_t ch32 = getContentHash32Fast( (unsigned char *)content , contentLen);
		// unchanged?
		if ( ch32 == r->m_contentHash32 ) {
			// record in the stats
			docsPtr = &g_stats.m_compressUnchangedDocs;
			bytesInPtr = &g_stats.m_compressUnchangedBytesIn;
			bytesOutPtr = &g_stats.m_compressUnchangedBytesOut;
			// do not send anything back
			replySize = 0;
			// and set error
			savedErr = EDOCUNCHANGED;
		}
	}

	// sanity
	if ( reply && replySize>0 && reply[replySize-1]!='\0') {
		gbshutdownLogicError();
	}

	// these are typically roots!
	if ( // override HasIFrame with "FullPageRequested" if it has
	     // an iframe, because that is the overriding stat. i.e. if
	     // we ignored if it had iframes, we'd still end up here...
	     ( ! docsPtr || docsPtr == &g_stats.m_compressHasIframeDocs ) &&
	     r->m_compressReply ) {
		// record in the stats
		docsPtr = &g_stats.m_compressFullPageDocs;
		bytesInPtr = &g_stats.m_compressFullPageBytesIn;
		bytesOutPtr = &g_stats.m_compressFullPageBytesOut;
	}
	else if ( ! docsPtr &&
		  r->m_compressReply ) {
		// record in the stats
		docsPtr = &g_stats.m_compressHasDateDocs;
		bytesInPtr = &g_stats.m_compressHasDateBytesIn;
		bytesOutPtr = &g_stats.m_compressHasDateBytesOut;
	}


	if ( r->m_isRobotsTxt &&
	     goodStatus &&
	     ! savedErr &&
	     r->m_compressReply &&
	     httpStatus == 200 ) {

		// . just take out the lines we need...
		// . if no user-agent line matches * or gigabot/flurbot we
		//   will get just a \0 for the reply, replySize=1!

		// record in the stats
		docsPtr = &g_stats.m_compressRobotsTxtDocs;
		bytesInPtr = &g_stats.m_compressRobotsTxtBytesIn;
		bytesOutPtr = &g_stats.m_compressRobotsTxtBytesOut;
	}

	// unknown by default
	if ( ! docsPtr ) {
		// record in the stats
		docsPtr = &g_stats.m_compressUnknownTypeDocs;
		bytesInPtr = &g_stats.m_compressUnknownTypeBytesIn;
		bytesOutPtr = &g_stats.m_compressUnknownTypeBytesOut;
	}

	// assume we did not compress it
	bool compressed = false;
	// compress if we should. do not compress if we are original requester
	// because we call gotFinalReply() with the reply right below here.
	// CAUTION: do not compress empty replies.
	// do not bother if savedErr is set because we use sendErrorReply
	// to send that back!
	if ( r->m_compressReply && replySize>0 && ! savedErr ) {
		// how big should the compression buf be?
		int32_t need = sizeof(int32_t) + // unzipped size
			(int32_t)(replySize * 1.01) + // worst case size
			25; // for zlib
		// for 7-zip
		need += 300;
		// back buffer to hold compressed reply
		uint32_t compressedLen;
		char *compressedBuf = (char*)mmalloc(need, "Msg13Zip");
		if ( ! compressedBuf ) {
			g_errno = ENOMEM;
			log(LOG_WARN, "msg13: compression failed1 %s",r->ptr_url);

			log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
			g_udpServer.sendErrorReply(slot,g_errno);
			return;
		}

		// store uncompressed length as first four bytes in the
		// compressedBuf
		*(int32_t *)compressedBuf = replySize;
		// the remaining bytes are for data
		compressedLen = need - 4;
		// leave the first 4 bytes to hold the uncompressed size
		int zipErr = gbcompress( (unsigned char*)compressedBuf+4,
					 &compressedLen,
					 (unsigned char*)reply,
					 replySize);
		if(zipErr != Z_OK) {
			log("spider: had error zipping Msg13 reply. %s "
			    "(%" PRId32") url=%s",
			    zError(zipErr),(int32_t)zipErr,r->ptr_url);
			mfree (compressedBuf, need, "Msg13ZipError");
			g_errno = ECORRUPTDATA;
			log(LOG_WARN, "msg13: compression failed2 %s",r->ptr_url);

			log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
			g_udpServer.sendErrorReply(slot,g_errno);
			return;
		}

		// record the uncompressed size.
		reply = compressedBuf;
		replySize = 4 + compressedLen;
		replyAllocSize = need;
		// sanity check
		if ( replySize<0||replySize>100000000 ) { gbshutdownAbort(true);}
		// we did compress it
		compressed = true;
	}

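	// Reply wire format recap for the compressed case above: the buffer we
	// hand to the UDP layer is [ 4-byte uncompressed size ][ zlib data ].
	// A minimal receiver-side sketch, assuming plain zlib's uncompress()
	// (or an equivalent gbuncompress()-style wrapper, if this codebase
	// provides one) is what the other end uses; the names below are
	// illustrative only, not a reference to code in this file:
	//
	//   int32_t unzippedSize = *(int32_t *)reply;      // first 4 bytes
	//   uLongf  destLen      = (uLongf)unzippedSize;
	//   char   *dest         = (char *)mmalloc(unzippedSize,"m13unzip");
	//   int     rc           = uncompress((Bytef *)dest, &destLen,
	//                                     (const Bytef *)(reply + 4),
	//                                     (uLong)(replySize - 4));
	//   // expect rc == Z_OK and destLen == (uLongf)unzippedSize
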
	// record the stats
	if ( docsPtr ) {
		// we downloaded a doc
		*docsPtr = *docsPtr + 1;
		// we spidered it at this size
		*bytesInPtr += originalSize;
		// and spit it back out at this size
		*bytesOutPtr += replySize;
		// and this always, the total
		g_stats.m_compressAllDocs++;
		g_stats.m_compressAllBytesIn += originalSize;
		g_stats.m_compressAllBytesOut += replySize;
	}

	// store reply in the cache (might be compressed)
	if (r->m_maxCacheAge > 0) {
		// get the cache
		RdbCache *c = r->m_isRobotsTxt ? &s_httpCacheRobots : &s_httpCacheOthers;

		// key is based on url hash
		key96_t k; k.n1 = 0; k.n0 = r->m_cacheKey;

		HttpCacheData cacheData(reply, replySize, savedErr);
		int32_t serializeCacheDataSize = 0;
		char *serializeCacheData = serializeMsg(sizeof(cacheData), &cacheData.size_reply, &cacheData.size_reply, &cacheData.ptr_reply, &cacheData, &serializeCacheDataSize, NULL, 0);

		// add it, use a generic collection
		RdbCacheLock rcl(*c);
		c->addRecord((collnum_t) 0, k, serializeCacheData, serializeCacheDataSize);

		// ignore errors caching it
		g_errno = 0;
	}

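	// Cache record layout recap: the record stored under key
	// { n1 = 0, n0 = r->m_cacheKey } is the serialized HttpCacheData
	// above, i.e. the fixed struct (m_errno plus the ptr_reply/size_reply
	// bookkeeping) followed by the reply bytes, which at this point may
	// already be in the compressed [4-byte size][zlib data] form. Whoever
	// probes this cache before downloading must therefore deserialize
	// first and only then, if needed, uncompress. Key construction only
	// (names match this function):
	//
	//   key96_t k;
	//   k.n1 = 0;
	//   k.n0 = r->m_cacheKey;  // 64-bit url-based key set by the requester
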
	// how many have this key?
	int32_t count = s_rt.getCount ( &r->m_cacheKey );
	// sanity check
	if ( count < 1 ) { gbshutdownAbort(true); }

	// send a reply for all waiting in line
	int32_t tableSlot;
	// loop
	for ( ; ( tableSlot = s_rt.getSlot ( &r->m_cacheKey) ) >= 0 ; ) {
		// use this
		int32_t err = 0;
		// set g_errno appropriately
		if ( savedErr ) err = savedErr;
		// sanity check. must be empty on any error
		if ( reply && replySize > 0 && err ) {
			// ETCPTIMEDOUT can happen with a partial buf
			if ( err != ETCPTIMEDOUT &&
			     // sometimes zipped content from page
			     // is corrupt... we don't even request
			     // gzipped http replies but they send it anyway
			     err != ECORRUPTHTTPGZIP &&
			     // for proxied https urls
			     err != EPROXYSSLCONNECTFAILED &&
			     // now httpserver::gotDoc's call to
			     // unzipReply() can also set g_errno to
			     // EBADMIME
			     err != EBADMIME &&
			     // this happens sometimes in unzipReply()
			     err != ENOMEM &&
			     // broken pipe
			     err != EPIPE &&
			     // connection reset by peer
			     err != ECONNRESET &&
			     err != EBANNEDCRAWL &&
			     err != EDOCBLOCKEDSHLIBCONTENT &&
			     err != EDOCTOOBIG &&
			     err != EDOCBADCONTENTTYPE)
			{
				log("http: bad error from httpserver get doc: %s",
				    mstrerror(err));
				gbshutdownAbort(true);
			}
		}
		// replicate the reply. might return NULL and set g_errno
		char *copy = reply;
		int32_t copyAllocSize = replyAllocSize;
		// only copy it if we are not the last guy in the table;
		// the last waiter is handed the original buffer
		if ( --count > 0 && ! err ) {
			copy = (char *)mdup(reply,replySize,"msg13d");
			copyAllocSize = replySize;
			// oom doing the mdup? i've seen this core us so fix it
			// because calling sendreply with a NULL
			// 'copy' cores it.
			if ( reply && ! copy ) {
				copyAllocSize = 0;
				err = ENOMEM;
			}
		}
		// this is not freeable
		if ( copy == g_fakeReply ) copyAllocSize = 0;
		// get request
		Msg13Request *r2 = *(Msg13Request **)s_rt.getValueFromSlot(tableSlot);
		// get udp slot for this transaction
		UdpSlot *slot = r2->m_udpSlot;
		// remove from list
		s_rt.removeSlot ( tableSlot );
		// send back error? maybe...
		if ( err ) {
			if ( g_conf.m_logDebugSpider ||
			     g_conf.m_logDebugMsg13 ) {
				char ipbuf[16];
				log("proxy: msg13: sending back error: %s "
				    "for url %s with ip %s",
				    mstrerror(err),
				    r2->ptr_url,
				    iptoa(r2->m_urlIp,ipbuf));
			}

			log(LOG_ERROR,"%s:%s:%d: call sendErrorReply. error=%s", __FILE__, __func__, __LINE__, mstrerror(err));
			g_udpServer.sendErrorReply(slot, err);
			continue;
		}
		// for debug for now
		if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 )
			log("msg13: sending reply for %s",r->ptr_url);

		// send reply
		g_udpServer.sendReply(copy, replySize, copy, copyAllocSize, slot);

		// now final udp slot will free the reply, so tcp server
		// no longer has to. set this tcp buf to null then.
		// ("ts" is NULL when we were called back from
		// gotIframeExpandedContent(), so check it first.)
		if ( ts && ts->m_readBuf == reply && count == 0 )
			ts->m_readBuf = NULL;
	}

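	// Ownership recap for the fan-out loop above: every queued requester
	// waiting on the same m_cacheKey gets its own reply buffer. All but
	// the last waiter receive an mdup()'d copy (freed later by their
	// UdpSlot); the last waiter is handed the original buffer, after
	// which the TcpSocket must not free it again, hence ts->m_readBuf is
	// set to NULL. Illustrative trace for three waiters A, B, C on one
	// key:
	//
	//   iteration 1: count 3 -> 2, A gets mdup(reply)
	//   iteration 2: count 2 -> 1, B gets mdup(reply)
	//   iteration 3: count 1 -> 0, C gets reply itself; ts->m_readBuf=NULL
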
	// we free it - if it was never sent over a udp slot
	if ( savedErr && compressed )
		mfree ( reply , replyAllocSize , "msg13ubuf" );

	if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 )
		log("msg13: handled reply ok %s",r->ptr_url);
}


void passOnReply ( void *state , UdpSlot *slot ) {
	// send that back
	Msg13Request *r = (Msg13Request *)state;

	// don't let udpserver free the request, it's our m_request[]
	slot->m_sendBufAlloc = NULL;

	if ( g_errno ) {
		log(LOG_WARN, "spider: error from proxy for %s: %s",
		    r->ptr_url,mstrerror(g_errno));

		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		g_udpServer.sendErrorReply(r->m_udpSlot, g_errno);
		return;
	}

	// what did he give us?
	char *reply = slot->m_readBuf;
	int32_t replySize = slot->m_readBufSize;
	int32_t replyAllocSize = slot->m_readBufMaxSize;
	// do not allow "slot" to free the read buf since it is being used
	// as the send buf for "udpSlot"
	slot->m_readBuf = NULL;
	slot->m_readBufSize = 0;
	slot->m_readBufMaxSize = 0;

	// prevent udpserver from trying to free g_fakeReply
	if ( reply == g_fakeReply ) replyAllocSize = 0;

	// just forward it on
	g_udpServer.sendReply(reply, replySize, reply, replyAllocSize, r->m_udpSlot);
}

// returns true if <iframe> tag in there
bool hasIframe(char *reply, int32_t replySize) {
	if ( ! reply || replySize <= 0 ) return false;
	char *p = reply;
	// exclude \0
	char *pend = reply + replySize - 1;
	for ( ; p < pend ; p++ ) {
		if ( *p != '<' ) continue;
		if ( to_lower_a (p[1]) != 'i' ) continue;
		if ( to_lower_a (p[2]) != 'f' ) continue;
		if ( to_lower_a (p[3]) != 'r' ) continue;
		if ( to_lower_a (p[4]) != 'a' ) continue;
		if ( to_lower_a (p[5]) != 'm' ) continue;
		if ( to_lower_a (p[6]) != 'e' ) continue;
		return true;
	}
	return false;
}

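// Note on the scan above: the reply buffer is always NUL-terminated (the
// sanity checks in gotHttpReply2() enforce that), so reading p[1]..p[6] near
// the end of the buffer stops harmlessly at the '\0'. Example behaviour with
// hypothetical inputs (sizes include the trailing '\0'):
//
//   hasIframe("<html><IFRAME src=x>", 21)   -> true  (case-insensitive)
//   hasIframe("<html><p>no frames</p>", 23) -> false
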
// returns false if blocks, true otherwise
static bool getIframeExpandedContent(Msg13Request *r, TcpSocket *ts) {

	if ( ! ts ) { gbshutdownLogicError(); }

	int32_t niceness = r->m_niceness;

	// ok, we've made an attempt now
	r->m_attemptedIframeExpansion = true;

	// we are doing something to destroy reply, so make a copy of it!
	int32_t copySize = ts->m_readOffset + 1;
	char *copy = (char *)mdup ( ts->m_readBuf , copySize , "ifrmcpy" );
	if ( ! copy ) return true;
	// sanity, must include \0 at the end
	if ( copy[copySize-1] ) { gbshutdownLogicError(); }

	// need a new state for it, use XmlDoc itself
	XmlDoc *xd;
	try { xd = new ( XmlDoc ); }
	catch(std::bad_alloc&) {
		mfree ( copy , copySize , "ifrmcpy" );
		g_errno = ENOMEM;
		return true;
	}
	mnew ( xd , sizeof(XmlDoc),"msg13xd");

	// make a fake spider request so we can do it
	SpiderRequest sreq;
	strcpy(sreq.m_url,r->ptr_url);
	int32_t firstIp = hash32n(r->ptr_url);
	if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
	sreq.setKey( firstIp,0LL, false );
	sreq.m_isInjecting = 1;
	sreq.m_fakeFirstIp = 1;
	sreq.m_firstIp = firstIp;

	// log it now
	if ( g_conf.m_logDebugBuild )
		log("scproxy: expanding iframes for %s",r->ptr_url);

	// . use the enormous power of our new XmlDoc class
	// . this returns false with g_errno set on error
	// . sometimes niceness is 0, like when the UdpSlot
	//   gets its niceness converted (see
	//   UdpSlot::m_convertedNiceness).
	if ( ! xd->set4 ( &sreq ,
			  NULL ,
			  "main", // HACK!! m_coll ,
			  NULL , // pbuf
			  // give it a niceness of 1, we have to be
			  // careful since we are a niceness of 0!!!!
			  1, //niceness, // 1 ,
			  NULL , // content ,
			  false, // deleteFromIndex ,
			  0 )) { // forcedIp
		// log it
		log("scproxy: xmldoc set error: %s",mstrerror(g_errno));
		// now nuke xmldoc
		mdelete ( xd , sizeof(XmlDoc) , "msg13xd" );
		delete ( xd );
		// g_errno should be set if that returned false
		return true;
	}

	// . re-set the niceness because it will core if we set it with
	//   a niceness of 0...
	xd->m_niceness = niceness;

	// we already downloaded the httpReply so this is valid. no need
	// to check robots.txt again for that url, but perhaps for the
	// iframe urls.
	xd->m_isAllowed = true;
	xd->m_isAllowedValid = true;

	// save stuff for calling gotHttpReply() back later with the
	// iframe expanded document
	xd->m_r = r;

	// so XmlDoc::getExtraDoc doesn't have any issues
	xd->m_firstIp = 123456;
	xd->m_firstIpValid = true;

	// try using xmldoc to do it
	xd->m_httpReply = copy;
	xd->m_httpReplySize = copySize;
	xd->m_httpReplyAllocSize = copySize;
	xd->m_httpReplyValid = true;

	// we claimed this buffer, do not let TcpServer destroy it!
	//ts->m_readBuf = NULL;//(char *)0x1234;

	// tell it to skip msg13 and call httpServer.getDoc directly
	xd->m_isSpiderProxy = true;

	// do not let XmlDoc::getRedirUrl() try to get old title rec
	xd->m_oldDocValid = true;
	xd->m_oldDoc = NULL;
	// can't be NULL, xmldoc uses it for g_errno
	xd->ptr_linkInfo1 = (LinkInfo *)0x01;
	xd->size_linkInfo1 = 0 ;
	xd->m_linkInfo1Valid = true;

	// call this as callback
	xd->setCallback ( xd , gotIframeExpandedContent );

	xd->m_redirUrlValid = true;
	xd->ptr_redirUrl = NULL;
	xd->size_redirUrl = 0;

	xd->m_downloadEndTimeValid = true;
	xd->m_downloadEndTime = gettimeofdayInMilliseconds();

	// now get the expanded content
	char **ec = xd->getExpandedUtf8Content();
	// this means it blocked
	if ( ec == (void *)-1 ) {
		//log("scproxy: waiting for %s",r->ptr_url);
		return false;
	}
	// return true with g_errno set
	if ( ! ec ) {
		log("scproxy: iframe expansion error: %s",mstrerror(g_errno));
		// g_errno should be set
		if ( ! g_errno ) { gbshutdownLogicError(); }
		// clean up
	}

	// it did not block so signal gotIframeExpandedContent to not call
	// gotHttpReply()
	//xd->m_r = NULL;

	// hey... it did block and we are still printing this!!
	// it happens when the iframe src is google or bing.. usually maps
	// so i'd think indicative of something special
	if ( g_conf.m_logDebugBuild )
		log("scproxy: got iframe expansion without blocking for url=%s"
		    " err=%s",r->ptr_url,mstrerror(g_errno));

	// save g_errno for returning
	int32_t saved = g_errno;

	// this also means that the iframe tag was probably not expanded
	// because it was from google.com or bing.com or had a bad src
	// attribute or a bad url in the src attribute.
	// so we have set m_attemptedIframeExpansion, just recall using
	// the original TcpSocket ptr... and this time we should not be
	// re-called because m_attemptedIframeExpansion is now true
	//gotHttpReply2 ( r, NULL , 0 , 0 , NULL );

	// we can't be messing with it!! otherwise we'd have to return
	// a new reply size i guess
	if ( xd->m_didExpansion ) { gbshutdownAbort(true); }

	// now nuke xmldoc
	mdelete ( xd , sizeof(XmlDoc) , "msg13xd" );
	delete ( xd );

	// reinstate g_errno in case mdelete() reset it
	g_errno = saved;

	// no blocking then...
	return true;
}

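// Flow recap for the iframe-expansion detour above (just the call graph as it
// is wired up in this file, no new behaviour):
//
//   gotHttpReply2(r, ts)                      sees <iframe> in ts->m_readBuf
//     -> getIframeExpandedContent(r, ts)      copies the reply into an XmlDoc
//        -> XmlDoc::getExpandedUtf8Content()  may block fetching iframe srcs
//           -> gotIframeExpandedContent(xd)   callback when that completes
//              -> gotHttpReply2(r, reply, ...) again, this time with
//                 r->m_attemptedIframeExpansion already true, so the second
//                 pass does not try to expand again.
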
static void gotIframeExpandedContent(void *state) {
	// save error in case mdelete nukes it
	int32_t saved = g_errno;

	XmlDoc *xd = (XmlDoc *)state;
	// this was stored in xd
	Msg13Request *r = xd->m_r;

	//log("scproxy: done waiting for %s",r->ptr_url);

	// note it
	if ( g_conf.m_logDebugBuild )
		log("scproxy: got iframe expansion for url=%s",r->ptr_url);

	// assume we had no expansion or there was an error
	char *reply = NULL;
	int32_t replySize = 0;

	// . if no error, then grab it
	// . if failed to get the iframe content then m_didExpansion should
	//   be false
	if ( ! g_errno && xd->m_didExpansion ) {
		// original mime should have been valid
		if ( ! xd->m_mimeValid ) { gbshutdownAbort(true); }
		// insert the mime into the expansion buffer! m_esbuf
		xd->m_esbuf.insert2 ( xd->m_httpReply ,
				      xd->m_mime.getMimeLen() ,
				      0 );
		// . get our buffer with the expanded iframes in it
		// . make sure that has the mime in it too
		//reply = xd->m_expandedUtf8Content;
		//replySize = xd->m_expandedUtf8ContentSize;
		// just to make sure nothing bad happens, null this out
		xd->m_expandedUtf8Content = NULL;
		// this new reply includes the original mime!
		reply = xd->m_esbuf.getBufStart();
		// include \0? yes.
		replySize = xd->m_esbuf.length() + 1;
		// sanity. must be null terminated
		if ( reply[replySize-1] ) { gbshutdownLogicError(); }
	}
	// if expansion did not pan out, use original reply i guess
	else if ( ! g_errno ) {
		reply = xd->m_httpReply;
		replySize = xd->m_httpReplySize;
	}

	// log it so we know why we are getting EDNSTIMEDOUT msgs back
	// on the main cluster!
	if ( g_errno )
		log("scproxy: error getting iframe content for url=%s : %s",
		    r->ptr_url,mstrerror(g_errno));
	// sanity check
	if ( reply && reply[replySize-1] != '\0') { gbshutdownLogicError(); }
	// pass back the error we had, if any
	g_errno = saved;
	// . then resume the reply processing up above as if this was the
	//   document that was downloaded.
	// . PASS g_errno BACK TO THIS if it was set, like ETCPTIMEDOUT
	gotHttpReply2 ( r, reply, replySize , replySize , NULL );

	// no, let's not dup it and pass what we got in, since ts is NULL
	// it should not free it!!!

	// . now destroy it
	// . the reply should have been sent back as a msg13 reply either
	//   as a normal reply or an error reply
	// . nuke out state then, including the xmldoc
	// . was there an error, maybe an ETCPTIMEDOUT???
	mdelete ( xd , sizeof(XmlDoc) , "msg13xd" );
	delete ( xd );
}

#define DELAYPERBAN 500

// how many milliseconds should spiders use for a crawldelay if
// a ban was detected and no proxies are being used.
#define AUTOCRAWLDELAY 5000

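// Worked example of how these constants interact with the backoff logic in
// addToHammerQueue() below (numbers only, no new behaviour): with 4 proxies
// currently banned by a site, the effective crawl delay becomes
// 4 * DELAYPERBAN = 2000 ms, but only if that exceeds the site's own
// crawl-delay, and it is then capped at MAX_PROXYCRAWLDELAYMS. With no usable
// proxies and an IP that is in the "twitchy" (banning) table, the delay is
// raised to at least AUTOCRAWLDELAY = 5000 ms instead.
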
// returns true if we queue the request to download later
static bool addToHammerQueue(Msg13Request *r) {

	// sanity
	if ( ! r->m_udpSlot ) { gbshutdownLogicError(); }

	// skip if not needed
	if ( r->m_skipHammerCheck ) return false;

	// . make sure we are not hammering an ip
	// . returns 0 if currently downloading a url from that ip
	// . returns -1 if not found
	int64_t last = s_hammerCache.getLongLong(0,r->m_firstIp,-1,true);
	// get time now
	int64_t nowms = gettimeofdayInMilliseconds();
	// how long has it been since last download START time?
	int64_t waited = nowms - last;

	int32_t crawlDelayMS = r->m_crawlDelayMS;

	CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );

	bool canUseProxies = false;
	if ( cr && cr->m_automaticallyUseProxies ) canUseProxies = true;
	if ( r->m_forceUseFloaters ) canUseProxies = true;

	// if no proxies listed, then it is pointless
	if ( ! g_conf.m_proxyIps.hasDigits() ) canUseProxies = false;

	// if not using proxies, but the ip is banning us, then at least
	// back off a bit
	if ( cr &&
	     r->m_urlIp != 0 &&
	     r->m_urlIp != -1 &&
	     cr->m_automaticallyBackOff &&
	     // and it is in the twitchy table
	     isIpInTwitchyTable ( cr , r->m_urlIp ) ) {
		// then just back off with a crawl delay of
		// AUTOCRAWLDELAY (5 seconds)
		if ( ! canUseProxies && crawlDelayMS < AUTOCRAWLDELAY )
			crawlDelayMS = AUTOCRAWLDELAY;
		// mark this so we do not retry pointlessly
		r->m_wasInTableBeforeStarting = true;
		// and obey crawl delay
		r->m_skipHammerCheck = false;
	}

	// . if we got a proxybackoff base it on # of banned proxies for urlIp
	// . try to be more sensitive for more sensitive website policies
	// . we don't know why this proxy was banned, or if we were
	//   responsible, or who banned it, but be more sensitive anyway
	if ( r->m_numBannedProxies &&
	     r->m_numBannedProxies * DELAYPERBAN > crawlDelayMS ) {
		crawlDelayMS = r->m_numBannedProxies * DELAYPERBAN;
		if ( crawlDelayMS > MAX_PROXYCRAWLDELAYMS )
			crawlDelayMS = MAX_PROXYCRAWLDELAYMS;
	}

	// set the crawldelay we actually used when downloading this
	//r->m_usedCrawlDelay = crawlDelayMS;

	if ( g_conf.m_logDebugSpider ) {
		char ipbuf[16];
		log(LOG_DEBUG,"spider: got timestamp of %" PRId64" from "
		    "hammercache (waited=%" PRId64" crawlDelayMS=%" PRId32") "
		    "for %s"
		    ,last
		    ,waited
		    ,crawlDelayMS
		    ,iptoa(r->m_firstIp,ipbuf));
	}

	bool queueIt = false;
	if ( last > 0 && waited < crawlDelayMS ) queueIt = true;
	// a "last" of 0 means currently downloading
	if ( crawlDelayMS > 0 && last == 0LL ) queueIt = true;
	// a last of -1 means not found. so first time i guess.
	if ( last == -1 ) queueIt = false;
	// ignore it if from iframe expansion etc.
	if ( r->m_skipHammerCheck ) queueIt = false;

	// . queue it up if we haven't waited long enough
	// . then the function, scanHammerQueue(), will re-eval all
	//   the download requests in this hammer queue every 10ms.
	// . it will just lookup the last download time in the cache,
	//   which will store a 0 if a download from that ip is currently
	//   in progress...
	if ( queueIt ) {
		// debug
		char ipbuf[16];
		log(LOG_INFO,
		    "spider: adding %s to crawldelayqueue cd=%" PRId32"ms "
		    "ip=%s",
		    r->ptr_url,crawlDelayMS,iptoa(r->m_urlIp,ipbuf));
		// save this
		//r->m_udpSlot = slot; // this is already saved!
		r->m_nextLink = NULL;
		// we gotta update the crawldelay here in case we modified
		// it in the above logic.
		r->m_crawlDelayMS = crawlDelayMS;
		// when we stored it in the hammer queue
		r->m_stored = nowms;
		// add it to queue
		if ( ! s_hammerQueueHead ) {
			s_hammerQueueHead = r;
			s_hammerQueueTail = r;
		}
		else {
			s_hammerQueueTail->m_nextLink = r;
			s_hammerQueueTail = r;
		}
		return true;
	}

	// if we had it in cache check the wait time
	if ( last > 0 && waited < crawlDelayMS ) {
		char ipbuf[16];
		log("spider: hammering firstIp=%s url=%s "
		    "only waited %" PRId64" ms of %" PRId32" ms",
		    iptoa(r->m_firstIp,ipbuf),r->ptr_url,waited,
		    crawlDelayMS);
		// this guy has too many redirects and it fails us...
		// BUT do not core if running live, only if for test
		// collection
		// for now disable it seems like 99.9% good... but
		// still cores on some weird stuff...
	}
	// store time now
	//RdbCacheLock rcl(s_hammercache);
	//s_hammerCache.addLongLong(0,r->m_firstIp,nowms);
	// note it
	//if ( g_conf.m_logDebugSpider )
	//	log("spider: adding download end time of %" PRIu64" for "
	//	    "firstIp=%s "
	//	    "url=%s "
	//	    "to msg13::hammerCache",
	//	    nowms,iptoa(r->m_firstIp),r->ptr_url);
	// clear error from that if any, not important really
	g_errno = 0;
	return false;
}

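// Hammer-cache value semantics relied on above and in scanHammerQueue()
// below, per the comments and checks in this file: for a given firstIp,
//
//   last == -1  ->  never seen, ok to download immediately
//   last ==  0  ->  a download from that IP is in progress right now
//   last  >  0  ->  timestamp (ms) of the last download start; wait until
//                   nowms - last >= crawlDelayMS before launching another
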
// call this once every 10ms to launch queued up download requests so that
// we respect crawl delay for sure
static void scanHammerQueue(int fd, void *state) {

	if ( ! s_hammerQueueHead ) return;

	int64_t nowms = gettimeofdayInMilliseconds();

	while (Msg13Request *r = s_hammerQueueHead) {
		Msg13Request *prev = NULL;
		int64_t waited = -1LL;
		Msg13Request *nextLink = NULL;
		bool rescanHammerQueue = false;

		// scan down the linked list of queued msg13 requests
		for (; r; prev = r, r = nextLink) {

			// downloadTheDocForReals() could free "r" so save this here
			nextLink = r->m_nextLink;

			int64_t last;
			last = s_hammerCache.getLongLong(0, r->m_firstIp, 30, true);
			// is one from this ip outstanding?
			if (last == 0LL && r->m_crawlDelayFromEnd) continue;

			int32_t crawlDelayMS = r->m_crawlDelayMS;

			// . if we got a proxybackoff base it on # of banned proxies
			// . try to be more sensitive for more sensitive website policy
			// . we don't know why this proxy was banned, or if we were
			//   responsible, or who banned it, but be more sensitive
			if ( //useProxies &&
			     r->m_numBannedProxies &&
			     r->m_hammerCallback == downloadTheDocForReals3b)
				crawlDelayMS = r->m_numBannedProxies * DELAYPERBAN;

			// download finished?
			if (last > 0) {
				waited = nowms - last;
				// but skip if haven't waited long enough
				if (waited < crawlDelayMS) continue;
			}
			// debug
			//log("spider: downloading %s from crawldelay queue "
			//    "waited=%" PRId64"ms crawldelay=%" PRId32"ms",
			//    r->ptr_url,waited,r->m_crawlDelayMS);

			// good to go
			//downloadTheDocForReals ( r );

			// sanity check
			if (!r->m_hammerCallback) { gbshutdownLogicError(); }

			// callback can now be either downloadTheDocForReals(r)
			// or downloadTheDocForReals3b(r) if it is waiting after
			// getting a ProxyReply that had a m_proxyBackoff set

			if (g_conf.m_logDebugSpider)
				log(LOG_DEBUG, "spider: calling hammer callback for %s (timestamp=%" PRId64",waited=%" PRId64",crawlDelayMS=%" PRId32")",
				    r->ptr_url, last, waited, crawlDelayMS);

			//
			// it should also add the current time to the hammer cache
			// for r->m_firstIp
			r->m_hammerCallback(r);

			//
			// remove from future scans
			//
			if (prev)
				prev->m_nextLink = nextLink;

			if (s_hammerQueueHead == r)
				s_hammerQueueHead = nextLink;

			if (s_hammerQueueTail == r)
				s_hammerQueueTail = prev;

			// if "r" was freed by downloadTheDocForReals() then
			// in the next iteration of this loop, "prev" will point
			// to a freed memory area, so start from the top again
			// by breaking out of this inner loop and trying the outer loop again
			rescanHammerQueue = true;
			break;
		}

		if (!rescanHammerQueue) {
			break;
		}
	}
}

bool addNewProxyAuthorization ( SafeBuf *req , Msg13Request *r ) {

	if ( ! r->m_proxyIp ) return true;
	if ( ! r->m_proxyPort ) return true;

	// get proxy from list to get username/password
	SpiderProxy *sp = getSpiderProxyByIpPort (r->m_proxyIp,r->m_proxyPort);

	// the proxy may have been removed from the list since we chose it;
	// in that case there is nothing to add (defensive check)
	if ( ! sp ) return true;

	// if none required, all done
	if ( ! sp->m_usernamePwd[0] ) return true;
	// strange?
	if ( req->length() < 8 ) return false;
	// back up over final \r\n
	req->m_length -= 2 ;
	// insert it
	req->safePrintf("Proxy-Authorization: Basic ");
	req->base64Encode ( sp->m_usernamePwd );
	req->safePrintf("\r\n");
	req->safePrintf("\r\n");
	req->nullTerm();
	return true;
}

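// Example of the rewrite performed above, with hypothetical credentials
// "user:pass": a request ending in "...Accept: */*\r\n\r\n" becomes
//
//   ...Accept: */*\r\n
//   Proxy-Authorization: Basic dXNlcjpwYXNz\r\n
//   \r\n
//
// i.e. the final blank line is peeled off, the header is appended, and the
// blank line is put back so the request mime stays well-formed.
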
// When the Msg13Request::m_isSquidProxiedUrl bit is set, the request we got is
// using us like a proxy, so Msg13Request::m_url is in reality a complete
// HTTP request mime. In that case we have to call this code to
// fix the HTTP request before sending it to its final destination.
//
// Remove "Proxy-authorization: Basic abcdefghij\r\n"
void stripProxyAuthorization ( char *squidProxiedReqBuf ) {
	//
	// remove the proxy authorization that has the username/pwd
	// so the websites we download the url from do not see it in the
	// http request mime
	//

	for(;;) {
		// include space so it won't match anything in url
		char *s = gb_strcasestr ( squidProxiedReqBuf , "Proxy-Authorization: " );
		if ( ! s ) return;
		// find next \r\n
		const char *end = strstr ( s , "\r\n");
		if ( ! end ) return;
		// bury the \r\n as well
		end += 2;
		// bury that string
		int32_t reqLen = strlen(squidProxiedReqBuf);
		const char *reqEnd = squidProxiedReqBuf + reqLen;
		// include \0, so add +1
		memmove ( s ,end , reqEnd-end + 1);
	}
}


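// Example of the stripping above (hypothetical request): every occurrence of
// the header is removed, case-insensitively, so
//
//   GET http://example.com/ HTTP/1.1\r\n
//   Proxy-Authorization: Basic dXNlcjpwYXNz\r\n
//   Host: example.com\r\n\r\n
//
// becomes
//
//   GET http://example.com/ HTTP/1.1\r\n
//   Host: example.com\r\n\r\n
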
// . convert "GET http://xyz.com/abc" to "GET /abc"
// . TODO: add "Host: xyz.com\r\n" ?
void fixGETorPOST ( char *squidProxiedReqBuf ) {
	char *s = strstr ( squidProxiedReqBuf , "GET http" );
	int32_t slen = 8;
	if ( ! s ) {
		s = strstr ( squidProxiedReqBuf , "POST http");
		slen = 9;
	}
	if ( ! s ) {
		s = strstr ( squidProxiedReqBuf , "HEAD http");
		slen = 9;
	}
	if ( ! s ) return;
	// point to start of http://...
	char *httpStart = s + slen - 4;
	// https?
	s += slen;
	if ( *s == 's' ) s++;
	// skip ://
	if ( *s++ != ':' ) return;
	if ( *s++ != '/' ) return;
	if ( *s++ != '/' ) return;
	// skip until / or space or \r or \n or \0
	for ( ; *s && ! is_wspace_a(*s) && *s != '/' ; s++ );
	// bury the http://xyz.com part now
	char *reqEnd = squidProxiedReqBuf + strlen(squidProxiedReqBuf);
	// include the terminating \0, so add +1
	memmove ( httpStart , s , reqEnd - s + 1 );
	// now make HTTP/1.1 into HTTP/1.0
	char *hs = strstr ( httpStart , "HTTP/1.1" );
	if ( ! hs ) return;
	hs[7] = '0';
}

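// Example of the rewrite above (hypothetical request lines):
//
//   "GET http://xyz.com/abc?q=1 HTTP/1.1"  ->  "GET /abc?q=1 HTTP/1.0"
//   "GET https://xyz.com/ HTTP/1.1"        ->  "GET / HTTP/1.0"
//
// The scheme and host are cut out in place with memmove(), and the version is
// downgraded to HTTP/1.0, presumably so the origin server answers with a
// simpler response (no chunked transfer-encoding, no keep-alive).
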
// . sets Msg13Request m_proxiedUrl and m_proxiedUrlLen
bool setProxiedUrlFromSquidProxiedRequest ( Msg13Request *r ) {

	// this is actually the entire http request mime, not a url
	//char *squidProxiedReqBuf = r->ptr_url;

	// shortcut. this is the actual squid request like
	// "CONNECT www.youtube.com:443 HTTP/1.1\r\nProxy-Connection: ... "
	// or
	// "GET http://www.youtube.com/..."
	char *s = r->ptr_url;
	char *pu = NULL;

	if ( strncmp ( s , "GET http" ,8 ) == 0 )
		pu = s + 4;
	else if ( strncmp ( s , "POST http" ,9 ) == 0 )
		pu = s + 5;
	else if ( strncmp ( s , "HEAD http" ,9 ) == 0 )
		pu = s + 5;
	// this doesn't always have http:// usually just a hostname
	else if ( strncmp ( s , "CONNECT " ,8 ) == 0 )
		pu = s + 8;

	if ( ! pu ) return false;

	r->m_proxiedUrl = pu;

	// find end of it
	char *p = r->m_proxiedUrl;
	for ( ; *p && !is_wspace_a(*p) ; p++ );

	r->m_proxiedUrlLen = p - r->m_proxiedUrl;

	return true;
}

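// Example of the extraction above (hypothetical requests). Note that
// m_proxiedUrl points into ptr_url itself; it is not copied or separately
// NUL-terminated, which is why m_proxiedUrlLen matters.
//
//   "GET http://www.youtube.com/watch HTTP/1.1\r\n..."
//       -> m_proxiedUrl = "http://www.youtube.com/watch", len = 28
//   "CONNECT www.youtube.com:443 HTTP/1.1\r\n..."
//       -> m_proxiedUrl = "www.youtube.com:443", len = 19
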
// . for the page cache we hash the url and the cookie to make the cache key
// . also the GET/POST method i guess
// . returns 0 on issues
int64_t computeProxiedCacheKey64 ( Msg13Request *r ) {

	// hash the url
	char *start = r->m_proxiedUrl;

	// how can this happen?
	if ( ! start ) {
		log("proxy: no proxied url");
		return 0LL;
	}

	// skip http:// or https://
	// skip forward
	char *s = start;
	if ( strncmp(s,"http://",7) == 0 ) s += 7;
	if ( strncmp(s,"https://",8) == 0 ) s += 8;
	// skip till we hit end of url
	// skip until / or space or \r or \n or \0
	char *cgi = NULL;
	for ( ; *s && ! is_wspace_a(*s) ; s++ ) {
		if ( *s == '?' && ! cgi ) cgi = s;
	}
	// hash the url
	int64_t h64 = hash64 ( start , s - start );

	//
	// if file extension implies it is an image, do not hash cookie
	//
	char *extEnd = NULL;
	if ( cgi ) extEnd = cgi;
	else       extEnd = s;
	char *ext = extEnd;
	for ( ; ext>extEnd-6 && ext>start && *ext!='.' && *ext!='/' ; ext-- );
	if ( *ext == '.' && ext+1 < extEnd ) {
		HttpMime mime;
		const char *cts;
		ext++; // skip over .
		cts = mime.getContentTypeFromExtension ( ext , extEnd-ext );
		// be defensive in case no content type is known for this
		// extension
		if ( cts && strncmp(cts,"image/",6) == 0 ) return h64;
	}

	// this is actually the entire http request mime, not a url
	char *squidProxiedReqBuf = r->ptr_url;
	// now for cookie
	s = strstr ( squidProxiedReqBuf , "Cookie: ");
	// if not there, just return url hash
	if ( ! s ) return h64;
	// save start
	start = s + 8;
	// skip till we hit end of cookie line
	for ( ; *s && *s != '\r' && *s != '\n' ; s++ );
	// incorporate cookie hash
	h64 = hash64 ( start , s - start , h64 );

	//log("debug: cookiehash=%" PRId64,hash64(start,s-start));

	return h64;
}


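// Example of the key composition above (hypothetical proxied request):
//
//   GET http://xyz.com/page?id=7 HTTP/1.1\r\n
//   Cookie: sid=abc123\r\n
//   ...
//
// yields h64 = hash64("http://xyz.com/page?id=7") and then
// h64 = hash64("sid=abc123", 10, h64), so the same page fetched with a
// different cookie caches under a different key. For image extensions
// (.png, .gif, ...) only the url hash is used, presumably because images
// rarely vary per cookie.
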
bool printHammerQueueTable ( SafeBuf *sb ) {

	const char *title = "Queued Download Requests";
	sb->safePrintf (
		"<table %s>"
		"<tr class=hdrow><td colspan=19>"
		"<center>"
		"<b>%s</b>"
		"</td></tr>"

		"<tr bgcolor=#%s>"
		"<td><b>#</td>"
		"<td><b>age</td>"
		"<td><b>first ip found</td>"
		"<td><b>actual ip</td>"
		"<td><b>crawlDelayMS</td>"
		"<td><b># proxies banning</td>"

		"<td><b>coll</td>"
		"<td><b>url</td>"

		"</tr>\n"
		, TABLE_STYLE
		, title
		, DARK_BLUE
		);

	int32_t count = 0;
	int64_t nowms = gettimeofdayInMilliseconds();

	for(Msg13Request *r = s_hammerQueueHead; r; r = r->m_nextLink) {
		// print row
		char ipbuf[16];
		sb->safePrintf( "<tr bgcolor=#%s>"
				"<td>%i</td>" // #
				"<td>%ims</td>" // age in hammer queue
				"<td>%s</td>"
				,LIGHT_BLUE
				,(int)count
				,(int)(nowms - r->m_stored)
				,iptoa(r->m_firstIp,ipbuf)
				);

		sb->safePrintf("<td>%s</td>" // actual ip
			       , iptoa(r->m_urlIp,ipbuf));

		// print crawl delay as link to robots.txt
		sb->safePrintf( "<td><a href=\"");
		Url cu;
		cu.set ( r->ptr_url );
		bool isHttps = cu.isHttps();
		if ( isHttps ) {
			sb->safeStrcpy( "https://" );
		} else {
			sb->safeStrcpy( "http://" );
		}

		sb->safeMemcpy ( cu.getHost() , cu.getHostLen() );
		int32_t port = cu.getPort();
		int32_t defPort = isHttps ? 443 : 80;

		if ( port != defPort ) {
			sb->safePrintf( ":%" PRId32, port );
		}

		sb->safePrintf ( "/robots.txt\">"
				 "%i"
				 "</a>"
				 "</td>" // crawl delay MS
				 "<td>%i</td>" // proxies banning
				 , r->m_crawlDelayMS
				 , r->m_numBannedProxies
				 );

		// show collection name as a link, also truncate to 32 chars
		CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );
		const char *coll = "none";
		if ( cr ) coll = cr->m_coll;
		sb->safePrintf("<td>");
		if ( cr ) {
			sb->safePrintf("<a href=\"/admin/sockets?c=");
			urlEncode(sb,coll);
			sb->safePrintf("\">");
		}
		sb->safeTruncateEllipsis ( coll , 32 );
		if ( cr ) sb->safePrintf("</a>");
		sb->safePrintf("</td>");
		// then the url itself
		sb->safePrintf("<td><a href=%s>",r->ptr_url);
		sb->safeTruncateEllipsis ( r->ptr_url , 128 );
		sb->safePrintf("</a></td>");
		sb->safePrintf("</tr>\n");
		// advance the row number shown in the "#" column
		count++;
	}
	return true;
}