// Matt Wells, copyright Nov 2002 #ifndef GB_SPIDERCOLL_H #define GB_SPIDERCOLL_H #include "RdbList.h" #include "RdbTree.h" #include "HashTableX.h" #include "Msg5.h" #include "Msg4Out.h" #include "hash.h" #include "RdbCache.h" #include "Spider.h" //MAX_SP_REPLY_SIZE #include "types.h" #include "max_coll_len.h" #include <time.h> class CollectionRec; class SpiderRequest; class SpiderReply; #define OVERFLOWLISTSIZE 200 // we have one SpiderColl for each collection record class SpiderColl { public: SpiderColl(CollectionRec *cr); ~SpiderColl(); CollectionRec *getCollectionRec(); void setCollectionRec(CollectionRec *cr); void clearLocks(); // called by main.cpp on exit to free memory void reset(); static bool tryToDeleteSpiderColl(SpiderColl *sc, const char *msg); // corresponding to CollectionRec::m_siteListBuf bool m_siteListIsEmpty; bool m_siteListIsEmptyValid; // data buckets in this table are of type HashTableX m_siteListDomTable; // substring matches like "contains:goodstuff" or // later "regex:.*" SafeBuf m_negSubstringBuf; SafeBuf m_posSubstringBuf; // . do not re-send CrawlInfoLocal for a coll if not update // . we store the flags in here as true if we should send our // CrawlInfoLocal for this coll to this hostId char m_sendLocalCrawlInfoToHost[MAX_HOSTS]; Msg4 m_msg4x; bool isInDupCache(const SpiderRequest *sreq, bool addToCache); // Rdb.cpp calls this bool addSpiderReply(const SpiderReply *srep); bool addSpiderRequest(const SpiderRequest *sreq, int64_t nowGlobalMS); // doledb cursor keys for each priority to speed up performance key96_t m_nextKeys[MAX_SPIDER_PRIORITIES]; int64_t m_lastPrintCount; int64_t m_lastPrinted; // used by SpiderLoop.cpp int32_t m_spidersOut; // . hash of collection name this arena represents // . 0 for main collection collnum_t m_collnum; char m_coll[MAX_COLL_LEN + 1]; class CollectionRec *getCollRec(); const char *getCollName(); void removeFromDoledbIpTable(int32_t firstIp); int32_t getDoledbIpTableCount() const; bool isDoledbIpTableEmpty() const; void clearDoledbIpTable(); HashTableX m_localTable; bool printWaitingTree ( ) ; bool addToWaitingTree(int32_t firstIp); uint64_t getNextSpiderTimeFromWaitingTree ( ) ; void populateDoledbFromWaitingTree ( ); void populateWaitingTreeFromSpiderdb ( bool reentry ) ; int32_t getWaitingTableCount() const; void clearWaitingTable(); bool m_waitingTreeNeedsRebuild; RdbTree m_waitingTree; RdbMem m_waitingMem; // used by m_waitingTree key96_t m_waitingTreeKey; bool m_waitingTreeKeyValid; void resetWaitingTreeNextKey() { m_waitingTreeNextKey.setMin(); } time_t getLastScanTime() const { return m_lastScanTime; } int32_t getScanningIp() const {return m_scanningIp; } bool m_deleteMyself; // start key for reading doledb key96_t m_msg5StartKey; void devancePriority(); void setPriority(int32_t pri); key96_t m_nextDoledbKey; int32_t m_pri2; bool gettingSpiderdbList() const { return m_gettingList1; } // how many outstanding spiders a priority has int32_t m_outstandingSpiders[MAX_SPIDER_PRIORITIES]; bool printStats ( SafeBuf &sb ) ; bool isFirstIpInOverflowList ( int32_t firstIp ) ; private: bool load(); bool makeDoledbIPTable(); bool addToDoledbIpTable(SpiderRequest *sreq); bool isInDoledbIpTable(int32_t firstIp) const; bool validateDoleBuf(SafeBuf *doleBuf); bool addDoleBufIntoDoledb(SafeBuf *doleBuf, bool isFromCache); bool updateSiteNumInlinksTable(int32_t siteHash32, int32_t sni, time_t tstamp); uint64_t getSpiderTimeMS(SpiderRequest *sreq, int32_t ufn, SpiderReply *srep); bool makeWaitingTable(); bool addToWaitingTable(int32_t firstIp, int64_t timeMs); bool getFromWaitingTable(int32_t firstIp, int64_t *timeMs); void removeFromWaitingTable(int32_t firstIp); bool isInWaitingTable(int32_t firstIp) const; bool setWaitingTableSize(int32_t numSlots); int32_t getNextIpFromWaitingTree ( ); // broke up scanSpiderdb into simpler functions: bool evalIpLoop ( ) ; bool readListFromSpiderdb ( ) ; bool scanListForWinners ( ) ; bool addWinnersIntoDoledb ( ) ; key128_t m_firstKey; key128_t m_nextKey; key128_t m_endKey; bool m_lastReplyValid; char m_lastReplyBuf[MAX_SP_REPLY_SIZE]; bool m_isLoading; // for scanning the wait tree... bool m_isPopulatingDoledb; bool m_didRead; RdbCache m_dupCache; HashTableX m_waitingTable; mutable GbMutex m_waitingTableMtx; // m_doledbIpTable (HashTableX, 96 bit keys, no data) // Purpose: let's us know how many SpiderRequests have been doled out for a given firstIP // Key is simply a 4-byte IP. // Data is the number of doled out SpiderRequests from that IP. // we use m_doledbIpTable for keeping counts based on ip of what is doled out. HashTableX m_doledbIpTable; mutable GbMutex m_doledbIpTableMtx; RdbTree m_winnerTree; HashTableX m_winnerTable; int32_t m_tailIp; int32_t m_tailPriority; int64_t m_tailTimeMS; int64_t m_tailUh48; int32_t m_tailHopCount; int64_t m_minFutureTimeMS; // list for loading spiderdb recs during the spiderdb scan RdbList m_list; int32_t m_numAdded; int64_t m_numBytesScanned; // freshest m_siteNumInlinks per site stored in here HashTableX m_sniTable; GbMutex m_sniTableMtx; // maps a domainHash32 to a crawl delay in milliseconds HashTableX m_cdTable; GbMutex m_cdTableMtx; RdbCache m_lastDownloadCache; bool m_countingPagesIndexed; int64_t m_lastReqUh48a; int64_t m_lastReqUh48b; int64_t m_lastRepUh48; // spiderdb scan for populating waiting tree RdbList m_waitingTreeList; Msg5 m_msg5b; bool m_gettingWaitingTreeList; key128_t m_waitingTreeNextKey; key128_t m_waitingTreeEndKey; time_t m_lastScanTime; std::atomic<int32_t> m_scanningIp; std::atomic<int32_t> m_gotNewDataForScanningIp; int32_t m_lastListSize; int32_t m_lastScanningIp; int64_t m_totalBytesScanned; // for reading lists from spiderdb Msg5 m_msg5; bool m_gettingList1; int32_t *m_overflowList; int64_t m_totalNewSpiderRequests; int64_t m_lastSreqUh48; int32_t m_cblocks[20]; int32_t m_pageNumInlinks; int32_t m_lastCBlockIp; int32_t m_lastOverflowFirstIp; CollectionRec *m_cr; static void gotSpiderdbListWrapper(void *state, RdbList *list, Msg5 *msg5); static void gotSpiderdbWaitingTreeListWrapper(void *state, RdbList *list, Msg5 *msg5); }; #endif // GB_SPIDERCOLL_H