privacore-open-source-searc.../SpiderColl.h
2018-03-19 10:15:38 +01:00

243 lines
6.2 KiB
C++

// Matt Wells, copyright Nov 2002
#ifndef GB_SPIDERCOLL_H
#define GB_SPIDERCOLL_H
#include "RdbList.h"
#include "RdbTree.h"
#include "HashTableX.h"
#include "Msg5.h"
#include "Msg4Out.h"
#include "hash.h"
#include "RdbCache.h"
#include "Spider.h" //MAX_SP_REPLY_SIZE
#include "types.h"
#include "max_coll_len.h"
#include <time.h>
#include <atomic>
#include <vector>
class CollectionRec;
class SpiderRequest;
class SpiderReply;
// We have one SpiderColl for each collection record (CollectionRec).
//
// The class groups the per-collection spider state declared below: the site
// list tables, the waiting tree/table, the doledb IP table, the spiderdb scan
// cursors and a couple of RdbCache instances.
//
// Copying is disabled explicitly. The object holds GbMutex and std::atomic
// members plus a raw m_overflowList pointer, so a member-wise copy is never
// meaningful (and was already implicitly unavailable because of the atomics).
class SpiderColl {
public:
    // explicit: a CollectionRec* must never convert to a SpiderColl implicitly
    explicit SpiderColl(CollectionRec *cr);
    ~SpiderColl();

    // not copyable, see the class comment
    SpiderColl(const SpiderColl &) = delete;
    SpiderColl &operator=(const SpiderColl &) = delete;

    // Access to the CollectionRec this object was given.
    // NOTE(review): getCollRec() further down has the same pair of signatures;
    // how the two families differ is not visible from this header.
    CollectionRec *getCollectionRec();
    const CollectionRec *getCollectionRec() const;
    void setCollectionRec(CollectionRec *cr);

    void clearLocks();

    // called by main.cpp on exit to free memory
    void reset();

    static bool tryToDeleteSpiderColl(SpiderColl *sc, const char *msg);

    // corresponding to CollectionRec::m_siteListBuf
    bool m_siteListIsEmpty;
    bool m_siteListIsEmptyValid;

    // NOTE(review): the original comment here was cut off ("data buckets in
    // this table are of type ..."); the bucket type is not stated in this
    // header.
    HashTableX m_siteListDomTable;

    // substring matches like "contains:goodstuff" or
    // later "regex:.*"
    SafeBuf m_negSubstringBuf;
    SafeBuf m_posSubstringBuf;

    Msg4 m_msg4x;

    bool isInDupCache(const SpiderRequest *sreq, bool addToCache);

    // Rdb.cpp calls this
    bool addSpiderReply(const SpiderReply *srep);
    bool addSpiderRequest(const SpiderRequest *sreq);

    // doledb cursor keys for each priority to speed up performance
    key96_t m_nextKeys[MAX_SPIDER_PRIORITIES];

    // used by SpiderLoop.cpp
    int32_t m_spidersOut;

    // . hash of collection name this arena represents
    // . 0 for main collection
    collnum_t m_collnum;
    // collection name buffer; sized MAX_COLL_LEN plus one extra byte
    char m_coll[MAX_COLL_LEN + 1];

    CollectionRec *getCollRec();
    const CollectionRec *getCollRec() const;
    const char *getCollName() const;

    // --- doledb IP table, public part (table itself is private below) ---
    void removeFromDoledbIpTable(int32_t firstIp);
    int32_t getDoledbIpTableCount() const;
    bool isDoledbIpTableEmpty() const;
    void clearDoledbIpTable();
    // returns a std::vector by value
    std::vector<uint32_t> getDoledbIpTable() const;

    HashTableX m_siteIndexedDocumentCount;

    // --- waiting tree / waiting table, public part ---
    bool printWaitingTree();
    bool addToWaitingTree(int32_t firstIp);
    void populateDoledbFromWaitingTree();
    void populateWaitingTreeFromSpiderdb(bool reentry);
    int32_t getWaitingTableCount() const;
    void clearWaitingTable();

    bool m_waitingTreeNeedsRebuild;
    RdbTree m_waitingTree;
    RdbMem m_waitingMem; // used by m_waitingTree
    key96_t m_waitingTreeKey;
    bool m_waitingTreeKeyValid;

    // resets the spiderdb read key used for populating the waiting tree to
    // its minimum value
    void resetWaitingTreeNextKey() { m_waitingTreeNextKey.setMin(); }

    time_t getLastScanTime() const { return m_lastScanTime; }
    // reads the atomic m_scanningIp and returns its current value
    int32_t getScanningIp() const {return m_scanningIp; }

    // NOTE(review): presumably a deferred-deletion flag related to
    // tryToDeleteSpiderColl() -- confirm in SpiderColl.cpp
    bool m_deleteMyself;

    // start key for reading doledb
    key96_t m_msg5StartKey;

    void devancePriority();
    void setPriority(int32_t pri);

    key96_t m_nextDoledbKey;
    int32_t m_pri2;

    // how many outstanding spiders a priority has
    int32_t m_outstandingSpiders[MAX_SPIDER_PRIORITIES];

    bool isFirstIpInOverflowList(int32_t firstIp) const;

private:
    bool load();

    // --- doledb IP table, private part ---
    bool makeDoledbIPTable();
    bool addToDoledbIpTable(const SpiderRequest *sreq);
    bool isInDoledbIpTable(int32_t firstIp) const;

    bool validateDoleBuf(const SafeBuf *doleBuf);
    bool addDoleBufIntoDoledb(SafeBuf *doleBuf, bool isFromCache);

    bool updateSiteNumInlinksTable(int32_t siteHash32, int32_t sni, time_t tstamp);

    uint64_t getSpiderTimeMS(const SpiderRequest *sreq, int32_t ufn, const SpiderReply *srep, int64_t nowMS);

    // --- waiting table, private part ---
    bool makeWaitingTable();
    bool addToWaitingTable(int32_t firstIp, int64_t timeMs);
    bool getFromWaitingTable(int32_t firstIp, int64_t *timeMs);
    void removeFromWaitingTable(int32_t firstIp);
    bool isInWaitingTable(int32_t firstIp) const;
    bool setWaitingTableSize(int32_t numSlots);

    int32_t getNextIpFromWaitingTree ( );

    // broke up scanSpiderdb into simpler functions:
    bool evalIpLoop ( ) ;
    bool readListFromSpiderdb ( ) ;
    bool scanListForWinners ( ) ;
    bool addWinnersIntoDoledb ( ) ;

    // NOTE(review): presumably the spiderdb key range/cursor used by the scan
    // functions above -- confirm in SpiderColl.cpp
    key128_t m_firstKey;
    key128_t m_nextKey;
    key128_t m_endKey;

    bool m_lastReplyValid;
    // raw byte buffer with room for exactly one SpiderReply
    char m_lastReplyBuf[sizeof(SpiderReply)];

    bool m_isLoading;

    // for scanning the wait tree...
    bool m_isPopulatingDoledb;

    bool m_didRead;

    RdbCache m_dupCache;

    HashTableX m_waitingTable;
    // mutable so that const member functions can use it
    mutable GbMutex m_waitingTableMtx;

    // m_doledbIpTable (HashTableX)
    // Purpose: lets us know how many SpiderRequests have been doled out for a given firstIP
    // Key is simply a 4-byte IP.
    // Data is the number of doled out SpiderRequests from that IP.
    // we use m_doledbIpTable for keeping counts based on ip of what is doled out.
    // NOTE(review): this comment previously also said "96 bit keys, no data",
    // which contradicts the 4-byte-key/count description above -- verify the
    // actual key and data sizes against makeDoledbIPTable().
    HashTableX m_doledbIpTable;
    // mutable so that const member functions can use it
    mutable GbMutex m_doledbIpTableMtx;

    RdbTree m_winnerTree;
    HashTableX m_winnerTable;
    int32_t m_tailIp;
    int32_t m_tailPriority;
    int64_t m_tailTimeMS;
    int64_t m_tailUh48;

    int64_t m_minFutureTimeMS;

    // list for loading spiderdb recs during the spiderdb scan
    RdbList m_list;

    int32_t m_numAdded;
    int64_t m_numBytesScanned;
    int64_t m_lastPrintCount;

    // freshest m_siteNumInlinks per site stored in here
    HashTableX m_sniTable;
    GbMutex m_sniTableMtx;

    // maps a domainHash32 to a crawl delay in milliseconds
    HashTableX m_cdTable;
    GbMutex m_cdTableMtx;

    RdbCache m_lastDownloadCache;

    int64_t m_lastReindexTimeMS;

    bool m_countingPagesIndexed;

    int64_t m_lastReqUh48a;
    int64_t m_lastReqUh48b;
    int64_t m_lastRepUh48;

    // spiderdb scan for populating waiting tree
    RdbList m_waitingTreeList; // temporary list for reading spiderdb records
    bool m_gettingWaitingTreeList; // true when waiting for spiderdb read to complete
    key128_t m_waitingTreeNextKey; // temporary key used for reading spiderdb records
    time_t m_lastScanTime;

    // atomics; m_scanningIp is exposed through getScanningIp()
    std::atomic<int32_t> m_scanningIp;
    std::atomic<int32_t> m_gotNewDataForScanningIp;

    int32_t m_lastListSize;
    int32_t m_lastScanningIp;
    int64_t m_totalBytesScanned;

    // raw pointer; ownership and length are not visible from this header
    int32_t *m_overflowList;

    int64_t m_totalNewSpiderRequests;
    int64_t m_lastSreqUh48;

    int32_t m_cblocks[20];
    int32_t m_pageNumInlinks;
    int32_t m_lastCBlockIp;

    int32_t m_lastOverflowFirstIp;

    CollectionRec *m_cr;

    // Static callbacks taking an opaque state pointer.
    // NOTE(review): the job_exit_t parameter suggests these are job/thread
    // entry and completion hooks, with state presumably being the SpiderColl
    // -- confirm in SpiderColl.cpp
    static void getSpiderdbWaitingTreeListWrapper(void *state);
    static void gotSpiderdbWaitingTreeListWrapper(void *state, job_exit_t exit_type);

    static void getSpiderdbListWrapper(void *state);
    static void gotSpiderdbListWrapper(void *state, job_exit_t exit_type);
};
#endif // GB_SPIDERCOLL_H