// Matt Wells, copyright Oct 2001
// . ask another host to download a url for you
// . the remote host will also use a cache if m_maxCacheAge > 0
// . used for downloading and caching robots.txt
// . if m_compressReply then the host compressed the http reply before
//   sending it back to you via udp

#ifndef GB_MSG13_H
#define GB_MSG13_H

#include "Url.h"         // MAX_URL_LEN
#include "SpiderProxy.h" // MAXUSERNAMEPWD

// max crawl delay from proxy backoff of 1 minute (60 seconds)
#define MAX_PROXYCRAWLDELAYMS 60000

class RdbCache;

// reset the msg13 http-reply caches (robots.txt cache and others)
void resetMsg13Caches ( ) ;
// dump the crawl-delay hammer queue into sb for the admin status page
bool printHammerQueueTable ( SafeBuf *sb ) ;
class Msg13Request {
public:
2014-06-02 14:59:15 -07:00
2014-06-06 15:11:51 -07:00
// the top portion of Msg13Request is sent to handleRequest54()
// in SpiderProxy.cpp to get and return proxies, as well as to
// ban proxies.
2016-05-19 18:37:26 +02:00
int32_t getProxyRequestSize() { return (char *)&m_lastHack-(char *)this;}
2014-11-10 14:45:11 -08:00
int32_t m_urlIp;
int32_t m_lbId; // loadbucket id
2014-06-06 15:11:51 -07:00
// the http proxy to use to download
2014-11-10 14:45:11 -08:00
int32_t m_proxyIp;
2015-02-02 14:06:38 -08:00
uint16_t m_proxyPort;
2014-11-10 14:45:11 -08:00
int32_t m_banProxyIp;
2015-02-02 14:06:38 -08:00
uint16_t m_banProxyPort;
2014-06-06 15:11:51 -07:00
char m_opCode;
char m_lastHack;
2015-04-30 13:28:57 -07:00
collnum_t m_collnum;
2014-06-06 15:11:51 -07:00
// not part of the proxy request, but set from ProxyReply:
2014-11-10 14:45:11 -08:00
int32_t m_numBannedProxies;
2014-06-06 15:11:51 -07:00
// . if using proxies, how many proxies have we tried to download
// this url through
// . used internally in Msg13.cpp
2014-11-10 14:45:11 -08:00
int32_t m_proxyTries;
2014-06-06 15:11:51 -07:00
// if using proxies, did host #0 tell us there were more to try if
// this one did not work out?
bool m_hasMoreProxiesToTry;
// we call this function after the imposed crawl-delay is over
void (*m_hammerCallback)(class Msg13Request *r);
2014-10-30 13:36:39 -06:00
int64_t m_urlHash48;
2014-11-10 14:45:11 -08:00
int32_t m_firstIp;
2014-06-02 14:59:15 -07:00
// when it was stored in the hammer queue
int64_t m_stored;
2014-06-12 13:05:45 -07:00
// a tmp hack var referencing into m_url[] below
char *m_proxiedUrl;
2014-11-10 14:45:11 -08:00
int32_t m_proxiedUrlLen;
2014-06-12 13:05:45 -07:00
int64_t m_downloadStartTimeMS;
2013-08-02 13:12:24 -07:00
char m_niceness;
2014-11-10 14:45:11 -08:00
int32_t m_ifModifiedSince;
int32_t m_maxCacheAge;
int32_t m_maxTextDocLen;
int32_t m_maxOtherDocLen;
2013-11-22 18:26:34 -08:00
// in milliseconds. use -1 if none or unknown.
2014-11-10 14:45:11 -08:00
int32_t m_crawlDelayMS;
2013-11-22 18:26:34 -08:00
// for linked list, this is the hammer queue
class Msg13Request *m_nextLink;
2015-02-02 14:06:38 -08:00
char m_proxyUsernamePwdAuth[MAXUSERNAMEPWD];
2013-08-02 13:12:24 -07:00
// if doing spider compression, compute contentHash32 of document
// downloaded, and if it matches this then send back EDOCUNCHANGED
2014-11-10 14:45:11 -08:00
int32_t m_contentHash32;
unsigned m_compressReply:1;
unsigned m_useCompressionProxy:1;
2013-08-02 13:12:24 -07:00
// does url end in /robots.txt ?
2016-07-26 12:20:22 +02:00
unsigned m_isRobotsTxt:1;
unsigned m_skipHammerCheck:1;
unsigned m_attemptedIframeExpansion:1;
unsigned m_crawlDelayFromEnd:1;
2014-06-09 12:42:05 -07:00
// does m_url represent a FULL http request mime and NOT just a url?
// this happens when gigablast is being used like a squid proxy.
unsigned m_isSquidProxiedUrl:1;
2014-06-09 12:42:05 -07:00
unsigned m_forceUseFloaters:1;
unsigned m_wasInTableBeforeStarting:1;
2013-08-02 13:12:24 -07:00
// if we just end up calling HttpServer::getDoc() via calling
// downloadDoc() then we set this for callback purposes
class Msg13 *m_parent;
// on the other hand, if we are called indirectly by handleRequest13()
// then we set m_udpSlot.
class UdpSlot *m_udpSlot;
// used for addTestDoc() and caching. msg13 sets this
2014-10-30 13:36:39 -06:00
int64_t m_urlHash64;
2014-11-10 14:45:11 -08:00
int32_t m_spideredTime;
2013-08-02 13:12:24 -07:00
// used for caching (and for request table, wait in line table)
2014-10-30 13:36:39 -06:00
int64_t m_cacheKey;
char *ptr_url;
char *ptr_cookie;
2014-11-10 14:45:11 -08:00
int32_t size_url;
int32_t size_cookie;
2013-08-02 13:12:24 -07:00
// variable data starts here
2014-08-06 16:00:25 -07:00
2014-11-10 14:45:11 -08:00
int32_t getSize() {
return ((char *)ptr_url-(char *)this) +size_url+size_cookie;
}
2013-08-02 13:12:24 -07:00
// zero it all out
void reset() {
//memset (this,0,(char *)m_url - (char *)this + 1);
memset (this,0,sizeof(Msg13Request));
2013-08-02 13:12:24 -07:00
m_maxTextDocLen = -1; // no limit
m_maxOtherDocLen = -1; // no limit
2013-11-22 18:26:34 -08:00
m_crawlDelayMS = -1; // unknown or none
2015-04-30 13:28:57 -07:00
m_collnum = (collnum_t)-1;
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
};
class Msg13 {
public:
Msg13() ;
~Msg13();
void reset() ;
// register our request handler with g_udpServer (called by main.cpp)
static bool registerHandler();
static RdbCache *getHttpCacheRobots();
static RdbCache *getHttpCacheOthers();
2013-08-02 13:12:24 -07:00
bool getDoc ( Msg13Request *r ,
void *state ,
void (*callback)(void *state) );
bool forwardRequest();
bool gotForwardedReply ( class UdpSlot *slot );
2014-11-10 14:45:11 -08:00
bool gotFinalReply ( char *reply, int32_t replySize, int32_t replyAllocSize);
2013-08-02 13:12:24 -07:00
// keep public so wrappers can access
void *m_state;
void (* m_callback) (void *state );
// we now store the uncompressed http reply in here
char *m_replyBuf;
2014-11-10 14:45:11 -08:00
int32_t m_replyBufSize;
int32_t m_replyBufAllocSize;
2013-08-02 13:12:24 -07:00
// point to it
Msg13Request *m_request;
};
// global cache used to enforce per-ip crawl-delay ("hammer") timing
extern RdbCache s_hammerCache;

#endif // GB_MSG13_H