// Matt Wells, copyright Feb 2001

// maintains a simple array of CollectionRecs

#ifndef GB_COLLECTIONDB_H
#define GB_COLLECTIONDB_H

#include "SafeBuf.h"
#include "rdbid_t.h"

bool addCollToTable ( const char *coll , collnum_t collnum ) ;

class WaitEntry {
public:
	void (* m_callback) (void *state);
	// ptr to list of parm recs for Parms.cpp
	char *m_parmPtr;
	char *m_parmEnd;
	class UdpSlot *m_slot;
	bool m_doRebuilds;
	bool m_rebuildActiveList;
	bool m_doProxyRebuild;
	bool m_updatedRound;
	collnum_t m_collnum;
	int32_t m_errno;
	bool m_sentReply;
};

class Collectiondb {

public:
	Collectiondb();

	// does nothing
	void reset() ;

	// called by main.cpp to fill in our m_recs[] array with
	// all the coll.*.*/coll.conf info
	bool loadAllCollRecs ( );

	// after main.cpp loads all rdb trees it calls this to remove
	// bogus collnums from the trees i guess
	bool cleanTrees ( ) ;

	// . this will save all conf files back to disk that need it
	// . returns false and sets g_errno on error, true on success
	bool save ( );
	bool m_needsSave;

	// returns i so that m_recs[i].m_coll = coll
	collnum_t getCollnum ( const char *coll , int32_t collLen );
	collnum_t getCollnum ( const char *coll ); // coll is NULL terminated here

	char *getCollName ( collnum_t collnum );
	char *getColl ( collnum_t collnum ) { return getCollName(collnum); }

	// get the coll rec specified in the HTTP request
	class CollectionRec *getRec ( class HttpRequest *r ,
				      bool useDefaultRec = true );

	// do not support the diffbot-style token/name for this one:
	const char *getDefaultColl ( HttpRequest *r ) ;

	// . get the CollectionRec from the collection name
	// . returns NULL if not available
	class CollectionRec *getRec ( const char *coll );
	class CollectionRec *getRec ( const char *coll , int32_t collLen );
	class CollectionRec *getRec ( collnum_t collnum );

	//class CollectionRec *getDefaultRec ( ) ;

	class CollectionRec *getFirstRec ( ) ;
	char *getFirstCollName ( ) ;
	collnum_t getFirstCollnum ( ) ;

	// . how many collections we have in here
	// . only counts valid existing collections
	int32_t getNumRecsUsed() { return m_numRecsUsed; }

	// what collnum will be used the next time a coll is added?
	collnum_t reserveCollNum ( ) ;

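	// Usage sketch (illustrative only, not part of this class's
	// contract; "main" is a hypothetical collection name and the
	// ENOCOLLREC error code is assumed from Errno.h):
	//
	//   CollectionRec *cr = g_collectiondb.getRec ( "main" );
	//   if ( ! cr ) { g_errno = ENOCOLLREC; return false; }
	//   collnum_t cn = cr->m_collnum;
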
	bool addExistingColl ( const char *coll, collnum_t collnum );

	bool addNewColl ( const char *coll, bool saveIt, collnum_t newCollnum ) ;

	bool registerCollRec ( CollectionRec *cr , bool isNew ) ;

	bool addRdbBaseToAllRdbsForEachCollRec ( ) ;
	bool addRdbBasesForCollRec ( CollectionRec *cr ) ;

	bool growRecPtrBuf ( collnum_t collnum ) ;
	bool setRecPtr ( collnum_t collnum , CollectionRec *cr ) ;

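	// Usage sketch (illustrative): creating and registering a brand
	// new collection. "test" is a hypothetical name, and the
	// negative-collnum failure check is an assumption:
	//
	//   collnum_t cn = g_collectiondb.reserveCollNum ( );
	//   if ( cn < 0 ) return false;
	//   if ( ! g_collectiondb.addNewColl ( "test" , true , cn ) )
	//        return false; // g_errno should be set
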
	// returns false if blocked, true otherwise.
	bool deleteRec2 ( collnum_t collnum );//, WaitEntry *we ) ;

	//void deleteSpiderColl ( class SpiderColl *sc );

	// returns false if blocked, true otherwise.
	bool resetColl2 ( collnum_t oldCollnum ,
			  collnum_t newCollnum ,
			  //WaitEntry *we ,
			  bool purgeSeeds );

	// . keep up to 128 of them, these reference into m_list
	// . CollectionRec now includes m_needsSave and m_lastUpdateTime
	class CollectionRec **m_recs; // [ MAX_COLLS ];

	// now m_recs[] points into a safebuf that is just an array
	// of CollectionRec ptrs. so we have to grow that safebuf possibly
	// in order to add a new collection rec ptr to m_recs
	SafeBuf m_recPtrBuf;

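	// Minimal sketch of the grow-then-repoint pattern the comment
	// above implies (illustrative; assumes SafeBuf::reserve() and
	// SafeBuf::getBufStart(), and elides error handling):
	//
	//   SafeBuf &sb = m_recPtrBuf;
	//   int32_t need = ((int32_t)collnum + 1) * sizeof(CollectionRec *);
	//   if ( sb.length() < need && ! sb.reserve ( need - sb.length() ) )
	//        return false; // g_errno set by SafeBuf
	//   m_recs = (CollectionRec **)sb.getBufStart();
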
	int32_t m_numRecs;
	int32_t m_numRecsUsed;

	int32_t m_wrapped;

	int32_t m_numCollsSwappedOut;

	bool m_initializing;
	//int64_t m_lastUpdateTime;
};

extern class Collectiondb g_collectiondb;

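// Usage sketch (illustrative): iterating every live collection. since
// m_numRecsUsed only counts valid collections, m_recs[] can contain
// NULL holes for deleted collnums, so each slot must be checked:
//
//   for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
//        CollectionRec *cr = g_collectiondb.m_recs[i];
//        if ( ! cr ) continue; // deleted/empty slot
//        // ... use cr ...
//   }
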
// Matt Wells, copyright Feb 2002

// . a collection record specifies the spider/index/search parms of a
//   collection of web pages
// . there's a Msg class to send an update signal to all the hosts once
//   we've used Msg1 to add a new rec or delete an old one. The update
//   signal will make the receiving hosts flush their CollectionRec buf
//   so they have to send out a Msg0 to get it again
// . we have a default collection record, a main collection record and
//   then other collection records
// . the default collection record values override all
// . but the collection record values can override SiteRec values
// . so if spider is disabled in the default collection record, then
//   nobody can spider!
// . override the g_conf.* vars where * is in this class to use
//   Collectiondb's default values
// . then add in the values of the specialized collection record
// . so change "if ( g_conf.m_spideringEnabled )" to something like
//   Msg33 msg33;
//   if ( ! msg33.getCollectionRec ( m_coll, m_collLen ) ) return false;
//   CollectionRec *r = msg33.getRec();
//   CollectionRec *d = msg33.getDefaultRec();
//   if ( ! r->m_spideringEnabled || ! d->m_spideringEnabled ) continue;
//   ... otherwise, spider for the m_coll collection
//   ... pass msg33 to Msg14::spiderDoc(), etc...

// how many url filtering patterns?
#define MAX_FILTERS 96 // up to 96 url regular expression patterns
//#define MAX_PRIORITY_QUEUES MAX_SPIDER_PRIORITIES * 2//each can be old or new

// max chars the executable path+name can be
#define MAX_FILTER_LEN 64

#define SUMMARYHIGHLIGHTTAGMAXSIZE 128

#include "Url.h" // MAX_COLL_LEN
#include "HashTableX.h"

// fake this for now
#define RDB_END2 80

// how many stat counts are in CrawlInfo below?
#define NUMCRAWLSTATS 8

// used by diffbot to control spidering per collection
class CrawlInfo {
public:

	//
	// WARNING!! Add NEW stats below the LAST member variable in
	// this class so that it can still load the OLD file on disk,
	// which is in the OLD format!
	//

	int64_t m_objectsDeleted;          // 1
	int64_t m_objectsAdded;            // 2
	int64_t m_urlsConsideredNOTUSED;   // 3
	int64_t m_pageDownloadAttempts;    // 4
	int64_t m_pageDownloadSuccesses;   // 5
	int64_t m_pageProcessAttempts;     // 6
	int64_t m_pageProcessSuccesses;    // 7
	int64_t m_urlsHarvested;           // 8

	int32_t m_lastUpdateTime;

	// this is non-zero if urls are available to be spidered right now.
	int32_t m_hasUrlsReadyToSpider;

	// last time we launched a spider. 0 on startup.
	uint32_t m_lastSpiderAttempt; // time_t

	// last time we had, or might have had, a url available for spidering
	uint32_t m_lastSpiderCouldLaunch; // time_t

	int32_t m_collnum;

	char m_reserved1;
	int32_t m_reserved2;

	// keep these separate because when we receive a CrawlInfo struct
	// from a host we only add these in if it matches our round #
	int64_t m_pageDownloadSuccessesThisRound;
	int64_t m_pageProcessSuccessesThisRound;

	void reset() { memset ( this , 0 , sizeof(CrawlInfo) ); }
} __attribute__((packed, aligned(4)));

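// Usage sketch (illustrative, not the actual loader code): because
// CrawlInfo is a packed POD written to disk verbatim, appending new
// members keeps old images loadable; reset() zero-fills whatever an
// older, shorter image does not cover. diskBuf/diskLen are
// hypothetical names:
//
//   CrawlInfo ci;
//   ci.reset();
//   int32_t n = diskLen < (int32_t)sizeof(CrawlInfo)
//             ? diskLen : (int32_t)sizeof(CrawlInfo);
//   memcpy ( &ci , diskBuf , n );
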
class CollectionRec {
public:
	// active linked list of CollectionRecs used by Spider.cpp
	class CollectionRec *m_nextActive;

	// these just set m_xml to NULL
	CollectionRec();
	virtual ~CollectionRec();

	int64_t getNumDocsIndexed();

	// messes with m_spiderColl->m_sendLocalCrawlInfoToHost[MAX_HOSTS]
	// so we do not have to keep sending this huge msg!
	bool shouldSendLocalCrawlInfoToHost ( int32_t hostId );
	void sentLocalCrawlInfoToHost ( int32_t hostId );
	void localCrawlInfoUpdate();

	// . stuff used by Collectiondb
	// . do we need a save or not?
	bool save ();
	bool m_needsSave;

	bool load ( const char *coll , int32_t collNum ) ;
	void reset();

	// clear memory structures used by URL filters
	void clearUrlFilters();

	// for custom crawls
	bool rebuildUrlFilters();

	// for regular crawls
	bool rebuildUrlFilters2();

	bool rebuildLangRules ( const char *lang , const char *tld );

	bool rebuildPrivacoreRules();
	bool rebuildShallowRules();

	bool m_urlFiltersHavePageCounts;

	// the all-important collection name, NULL terminated
	char m_coll [ MAX_COLL_LEN + 1 ] ;
	int32_t m_collLen;

	// used by SpiderCache.cpp. g_collectiondb.m_recs[m_collnum] = this
	collnum_t m_collnum;

	// for doing DailyMerge.cpp stuff
	int32_t m_dailyMergeStarted; // time_t
	int32_t m_dailyMergeTrigger;

	class CollectionRec *m_nextLink;
	class CollectionRec *m_prevLink;

	char m_dailyMergeDOWList[48];

	int32_t m_treeCount;

	int64_t m_spiderCorruptCount;

	// holds IPs that have been detected as throttled, so we need to
	// back off and use proxies on them
	HashTableX m_twitchyTable;

	// spider controls for this collection
	char m_spideringEnabled ;
	int32_t m_spiderDelayInMilliseconds;

	// is in the active list in Spider.cpp?
	bool m_isActive;

	// . at what time did the spiders start?
	// . this is incremented when all urls have been spidered and
	//   the next round begins
	uint32_t m_spiderRoundStartTime; // time_t

	// this begins at 0 and increments when all the urls have been
	// spidered and the next round begins
	int32_t m_spiderRoundNum;

	char m_makeImageThumbnails;

	int32_t m_thumbnailMaxWidthHeight ;

	char m_indexSpiderReplies;
	char m_indexBody;

	// do not re-add outlinks to spiderdb if less than this many days
	// have elapsed since the last time we added them to spiderdb
	float m_outlinksRecycleFrequencyDays ;

	char m_dedupingEnabled         ; // dedup content on same hostname
	char m_dupCheckWWW             ;
	char m_detectCustomErrorPages  ;
	char m_useSimplifiedRedirects  ;
	char m_useIfModifiedSince      ;
	char m_useTimeAxis             ;
	char m_buildVecFromCont        ;
	int32_t m_maxPercentSimilarPublishDate;
	char m_useSimilarityPublishDate;
	char m_oneVotePerIpDom         ;
	char m_doUrlSpamCheck          ; // filter urls w/ naughty hostnames
	char m_doLinkSpamCheck         ; // filters dynamically generated pages
	char m_tagdbColl [MAX_COLL_LEN+1]; // coll to use for tagdb lookups
	char m_delete404s              ;
	char m_siteClusterByDefault    ;
	char m_doIpLookups             ; // considered iff using proxy
	char m_useRobotsTxt            ;
	char m_obeyRelNoFollowLinks    ;
	char m_forceUseFloaters        ;
	char m_automaticallyUseProxies ;
	char m_automaticallyBackOff    ;
	char m_recycleContent          ;
	char m_getLinkInfo             ; // turn off to save seeks
	char m_computeSiteNumInlinks   ;
	char m_indexInlinkNeighborhoods;
	char m_removeBannedPages       ;

	int32_t m_percentSimilarSummary ; // dedup by summary similarity
	int32_t m_summDedupNumLines ;

	int32_t m_maxQueryTerms;

	char m_spiderStatus;

	// ranking settings
	float m_sameLangWeight;

	// language stuff
	char m_defaultSortLanguage2[6];

	// for Spider.cpp
	int32_t m_updateRoundNum;

	// IMPORT PARMS
	char m_importEnabled;
	SafeBuf m_importDir;
	int32_t m_numImportInjects;
	class ImportState *m_importState;

	SafeBuf m_collectionPasswords;
	SafeBuf m_collectionIps;

	// from Conf.h
	int32_t m_posdbMinFilesToMerge ;
	int32_t m_titledbMinFilesToMerge ;
	int32_t m_linkdbMinFilesToMerge ;
	int32_t m_tagdbMinFilesToMerge ;
	int32_t m_spiderdbMinFilesToMerge;

	char m_dedupResultsByDefault ;
	char m_doTagdbLookups ;
	char m_deleteTimeouts ; // can delete docs that time out?
	char m_allowAdultDocs ;
	char m_useCanonicalRedirects ;

	int32_t m_maxNumSpiders ; // per local spider host

	int32_t m_lastResetCount;

	// start another set of flags using the old m_spiderTimeShift
	char m_useCurrentTime ; // ... for m_spiderTime2

	// controls for query-dependent summary/title generation
	int32_t m_titleMaxLen;
	int32_t m_summaryMaxLen;
	int32_t m_summaryMaxNumLines;
	int32_t m_summaryMaxNumCharsPerLine;

	char m_getDocIdScoringInfo;

	/*****
	 * !! Start Diffbot parameters !! *
	 *****/

	SafeBuf m_diffbotSeeds;

	// in seconds now
	uint32_t m_diffbotCrawlStartTime;
	uint32_t m_diffbotCrawlEndTime;

	// our local crawling stats
	CrawlInfo m_localCrawlInfo;
	// total crawling stats summed up from all hosts in the network
	CrawlInfo m_globalCrawlInfo;

	// holds the latest CrawlInfo for each host for this collrec
	SafeBuf m_crawlInfoBuf;

	/*****
	 * !! End of Diffbot parameters !! *
	 *****/

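	// Illustrative sketch (an assumption about the layout, not a
	// documented contract): m_crawlInfoBuf treated as a flat array
	// of CrawlInfo, one slot per host id:
	//
	//   CrawlInfo *cia = (CrawlInfo *)cr->m_crawlInfoBuf.getBufStart();
	//   CrawlInfo *ci  = &cia[hostId];
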
	// list of url patterns to be indexed.
	SafeBuf m_siteListBuf;

	// can be "web", "english", "romantic", "german", etc.
	SafeBuf m_urlFiltersProfile;

	// . now the url regular expressions
	// . we chain down the regular expressions
	// . if a url matches we use that tagdb rec #
	// . if it doesn't match any of the patterns, we use the default site #
	// . just one regexp per Pattern
	// . all of these arrays should be the same size, but we need to
	//   include a count before each array because Parms.cpp expects
	//   one, since it handles each array individually

	int32_t m_numRegExs;
	// make this now use g_collectiondb.m_stringBuf safebuf and
	// make Parms.cpp use that stringbuf rather than store into here...
	SafeBuf m_regExs[ MAX_FILTERS ];

	int32_t m_numSpiderFreqs; // useless, just for Parms::setParm()
	float m_spiderFreqs[ MAX_FILTERS ];

	int32_t m_numSpiderPriorities; // useless, just for Parms::setParm()
	char m_spiderPriorities[ MAX_FILTERS ];

	int32_t m_numMaxSpidersPerRule; // useless, just for Parms::setParm()
	int32_t m_maxSpidersPerRule[ MAX_FILTERS ];

	// same-ip waits now live here instead of "page priority"
	int32_t m_numSpiderIpWaits; // useless, just for Parms::setParm()
	int32_t m_spiderIpWaits[ MAX_FILTERS ];

	// same goes for max spiders per ip
	int32_t m_numSpiderIpMaxSpiders;
	int32_t m_spiderIpMaxSpiders [ MAX_FILTERS ];

	int32_t m_numHarvestLinks;
	char m_harvestLinks[ MAX_FILTERS ];

	int32_t m_numForceDelete;
	char m_forceDelete[ MAX_FILTERS ];

	// dummy?
	int32_t m_numRegExs9;

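	// Usage sketch (illustrative): the arrays above are parallel,
	// indexed by whichever url-filter rule matched. For a matched
	// rule index i:
	//
	//   float   freq     = cr->m_spiderFreqs       [ i ];
	//   char    priority = cr->m_spiderPriorities  [ i ];
	//   int32_t maxOut   = cr->m_maxSpidersPerRule [ i ];
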
	char m_doQueryHighlighting;

	char m_summaryFrontHighlightTag[SUMMARYHIGHLIGHTTAGMAXSIZE] ;
	char m_summaryBackHighlightTag [SUMMARYHIGHLIGHTTAGMAXSIZE] ;

	SafeBuf m_htmlRoot;
	SafeBuf m_htmlHead;
	SafeBuf m_htmlTail;

	// SPELL CHECK
	char m_spellCheck;

	char m_sendingAlertInProgress;

	class SpiderColl *m_spiderColl;

	int32_t m_overflow;
	int32_t m_overflow2;

	HashTableX m_seedHashTable;

	// only spider urls due to be spidered in this time range
	int32_t m_spiderTimeMin;
	int32_t m_spiderTimeMax;

	int32_t m_maxAddUrlsPerIpDomPerDay;

	// . max content length of a text/html or text/plain document
	// . we will not download, index or store more than this many bytes
	int32_t m_maxTextDocLen;

	// . max content length of other docs (pdf, word, xls, ppt, ps)
	// . we will not download, index or store more than this many bytes
	// . if the content would be truncated, we will not even download it,
	//   because the html converter needs 100% of the doc or it will
	//   error out
	int32_t m_maxOtherDocLen;

	// . puts <br>s in the summary to keep its width below this
	// . but we will exceed this width rather than split a word
	int32_t m_summaryMaxWidth;

	// how long a robots.txt can stay cached (Msg13.cpp/Robotdb.cpp)
	int32_t m_maxRobotsCacheAge;

	// use query expansion for this collection?
	char m_queryExpansion;

	// read from cache
	char m_rcache;

	char m_hideAllClustered;

	// special var to prevent Collectiondb.cpp from copying the members
	// below here
	char m_END_COPY;

	// use this, not m_bases, to get the RdbBase
	class RdbBase *getBase(rdbid_t rdbId);

	// Rdb.cpp uses this after deleting an RdbBase and adding a new one
	void setBasePtr(rdbid_t rdbId, class RdbBase *base);
	class RdbBase *getBasePtr(rdbid_t rdbId);

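	// Usage sketch (illustrative; assumes RDB_POSDB is a valid
	// rdbid_t enumerator):
	//
	//   RdbBase *base = cr->getBase ( RDB_POSDB );
	//   if ( ! base ) { /* base not yet instantiated for this coll */ }
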
private:
	// . now chuck this into CollectionRec instead of having a fixed
	//   array of them in Rdb.h called m_bases[]
	// . leave this out of any copy of course
	class RdbBase *m_bases[RDB_END2];

public:
	// for populating the sortbydate table
	class Msg5 *m_msg5;

	// used by Parms.cpp
	char m_hackFlag;

	// each Rdb has a tree, so keep the pos/neg key count here so
	// that RdbTree does not have to have its own array limited by
	// MAX_COLLS, which we did away with because we made this dynamic.
	int32_t m_numPosKeysInTree[RDB_END2];
	int32_t m_numNegKeysInTree[RDB_END2];
};

#endif // GB_COLLECTIONDB_H