// Matt Wells, copyright Oct 2001 // . ask another host to download a url for you // . the remote host will also use a cache if m_maxCacheAge > 0 // . used for downloading and caching robots.txt // . if m_compressReply then the host compressed the http reply before // sending it back to you via udp #ifndef GB_MSG13_H #define GB_MSG13_H #include "Url.h" // MAX_URL_LEN #include "SpiderProxy.h" // MAXUSERNAMEPWD // max crawl delay form proxy backoff of 1 minute (60 seconds) #define MAX_PROXYCRAWLDELAYMS 60000 class RdbCache; void resetMsg13Caches ( ) ; bool printHammerQueueTable ( SafeBuf *sb ) ; class Msg13Request { public: // the top portion of Msg13Request is sent to handleRequest54() // in SpiderProxy.cpp to get and return proxies, as well as to // ban proxies. int32_t getProxyRequestSize() { return (char *)&m_lastHack-(char *)this;} int32_t m_urlIp; int32_t m_lbId; // loadbucket id // the http proxy to use to download int32_t m_proxyIp; uint16_t m_proxyPort; int32_t m_banProxyIp; uint16_t m_banProxyPort; char m_opCode; char m_lastHack; collnum_t m_collnum; // not part of the proxy request, but set from ProxyReply: int32_t m_numBannedProxies; // . if using proxies, how many proxies have we tried to download // this url through // . used internally in Msg13.cpp int32_t m_proxyTries; // if using proxies, did host #0 tell us there were more to try if // this one did not work out? bool m_hasMoreProxiesToTry; // we call this function after the imposed crawl-delay is over void (*m_hammerCallback)(class Msg13Request *r); int64_t m_urlHash48; int32_t m_firstIp; // when it was stored in the hammer queue int64_t m_stored; // a tmp hack var referencing into m_url[] below char *m_proxiedUrl; int32_t m_proxiedUrlLen; int64_t m_downloadStartTimeMS; char m_niceness; int32_t m_ifModifiedSince; int32_t m_maxCacheAge; int32_t m_maxTextDocLen; int32_t m_maxOtherDocLen; // in milliseconds. use -1 if none or unknown. int32_t m_crawlDelayMS; // for linked list, this is the hammer queue class Msg13Request *m_nextLink; char m_proxyUsernamePwdAuth[MAXUSERNAMEPWD]; // if doing spider compression, compute contentHash32 of document // downloaded, and if it matches this then send back EDOCUNCHANGED int32_t m_contentHash32; unsigned m_compressReply:1; unsigned m_useCompressionProxy:1; // does url end in /robots.txt ? unsigned m_isRobotsTxt:1; unsigned m_skipHammerCheck:1; unsigned m_attemptedIframeExpansion:1; unsigned m_crawlDelayFromEnd:1; // does m_url represent a FULL http request mime and NOT just a url? // this happens when gigablast is being used like a squid proxy. unsigned m_isSquidProxiedUrl:1; unsigned m_forceUseFloaters:1; unsigned m_wasInTableBeforeStarting:1; // if we just end up calling HttpServer::getDoc() via calling // downloadDoc() then we set this for callback purposes class Msg13 *m_parent; // on the other hand, if we are called indirectly by handleRequest13() // then we set m_udpSlot. class UdpSlot *m_udpSlot; // used for addTestDoc() and caching. msg13 sets this int64_t m_urlHash64; int32_t m_spideredTime; // used for caching (and for request table, wait in line table) int64_t m_cacheKey; char *ptr_url; char *ptr_cookie; int32_t size_url; int32_t size_cookie; // variable data starts here int32_t getSize() { return ((char *)ptr_url-(char *)this) +size_url+size_cookie; } // zero it all out void reset() { //memset (this,0,(char *)m_url - (char *)this + 1); memset (this,0,sizeof(Msg13Request)); m_maxTextDocLen = -1; // no limit m_maxOtherDocLen = -1; // no limit m_crawlDelayMS = -1; // unknown or none m_collnum = (collnum_t)-1; } }; class Msg13 { public: Msg13() ; ~Msg13(); void reset() ; // register our request handler with g_udpServer (called by main.cpp) static bool registerHandler(); static RdbCache *getHttpCacheRobots(); static RdbCache *getHttpCacheOthers(); bool getDoc ( Msg13Request *r , void *state , void (*callback)(void *state) ); bool forwardRequest(); bool gotForwardedReply ( class UdpSlot *slot ); bool gotFinalReply ( char *reply, int32_t replySize, int32_t replyAllocSize); // keep public so wrappers can access void *m_state; void (* m_callback) (void *state ); // we now store the uncompressed http reply in here char *m_replyBuf; int32_t m_replyBufSize; int32_t m_replyBufAllocSize; // point to it Msg13Request *m_request; }; extern RdbCache s_hammerCache; #endif // GB_MSG13_H