forked from Mirrors/privacore-open-source-search-engine
506 lines
14 KiB
C++
506 lines
14 KiB
C++
// Copyright Matt Wells, Apr 2001
|
|
|
|
// . every host has a config record
|
|
// . like tagdb, record in 100% xml
|
|
// . allows remote configuration of hosts through Msg4 class
|
|
// . remote user sends some xml, we set our member vars using that xml
|
|
// . when we save to disk we convert our mem vars to xml
|
|
// . is global so everybody can see it
|
|
// . conf record can be changed by director OR with the host's priv key
|
|
// . use Conf remotely to get setup info about a specific host
|
|
// . get your local ip/port/groupMask/etc. from this class not HostMap
|
|
|
|
#ifndef GB_CONF_H
|
|
#define GB_CONF_H
|
|
|
|
#include "Collectiondb.h" // MAX_COLL_LEN
|
|
#include "SafeBuf.h"
|
|
#include "Xml.h"
|
|
|
|
// size of the Conf::m_spiderUserAgent and Conf::m_spiderBotName char buffers
#define USERAGENTMAXSIZE 128
|
|
|
|
// number of slots in the Conf::m_dnsIps / Conf::m_dnsPorts arrays
#define MAX_DNSIPS 16
|
|
// number of slots in the Conf::m_rnsIps array
#define MAX_RNSIPS 13
|
|
// size of the Conf::m_email1MX .. m_email4MX char buffers
#define MAX_MX_LEN 128
|
|
// size of the Conf::m_emailNAddr and Conf::m_emailNFrom char buffers (N = 1..4)
#define MAX_EMAIL_LEN 64
|
|
|
|
//Publicly accessible and generally HA / reachable DNS servers. Use Google's servers - works reasonably well
|
|
#define PUBLICLY_AVAILABLE_DNS1 "8.8.8.8"
|
|
#define PUBLICLY_AVAILABLE_DNS2 "8.8.4.4"
|
|
|
|
class TcpSocket;
|
|
class HttpRequest;
|
|
|
|
|
|
// Free functions (not Conf members) that return a mode_t.
// NOTE(review): judging by the names these are presumably the permission
// bits to pass when creating files and directories respectively; the
// definitions are not in this header -- confirm against the .cpp.
mode_t getFileCreationFlags();
|
|
mode_t getDirCreationFlags ();
|
|
|
|
// Conf holds the configuration record of a host (see the notes at the top of
// this file). init() loads the parms from "{dir}/gb.conf", save() writes
// changes back to the conf file and reset() restores the defaults. All parms
// are plain public member variables. One global instance, g_conf, is declared
// at the bottom of this header.
class Conf {
|
|
|
|
public:
|
|
|
|
Conf();
|
|
|
|
// Access checks for an incoming request (socket + parsed http request).
// NOTE(review): the rules themselves are implemented outside this header;
// presumably they consult m_masterPwds / m_connectIps declared further
// down -- confirm in the .cpp before relying on that.
bool isCollAdmin ( TcpSocket *socket , HttpRequest *hr ) ;
|
|
bool isCollAdminForColl (TcpSocket *sock, HttpRequest *hr, const char *coll );
|
|
bool isCollAdmin2 (TcpSocket *socket , HttpRequest *hr,
|
|
class CollectionRec *cr) ;
|
|
|
|
|
|
bool isMasterAdmin ( TcpSocket *socket , HttpRequest *hr ) ;
|
|
bool hasMasterPwd ( HttpRequest *hr ) ;
|
|
bool isMasterIp ( uint32_t ip );
|
|
bool isConnectIp ( uint32_t ip );
|
|
|
|
// loads conf parms from this file "{dir}/gb.conf"
|
|
bool init ( char *dir );
|
|
|
|
// NOTE(review): presumably fills m_rnsIps / m_numRns below with the root
// name server ips -- confirm in the .cpp
void setRootIps();
|
|
|
|
// saves any changes to the conf file
|
|
bool save ( );
|
|
|
|
// reset all values to their defaults
|
|
void reset();
|
|
|
|
// . get the default collection based on hostname
|
|
// will look for the hostname in each collection for a match
|
|
// no match defaults to default collection
|
|
const char *getDefaultColl ( char *hostname, int32_t hostnameLen );
|
|
|
|
// max amount of memory we can use
|
|
size_t m_maxMem;
|
|
|
|
// if this is false, we do not save, used by dump routines
|
|
// in main.cpp so they can change parms here and not worry about
|
|
// a core dump saving them
|
|
char m_save;
|
|
|
|
bool m_runAsDaemon;
|
|
|
|
bool m_logToFile;
|
|
|
|
bool m_isLocal;
|
|
|
|
// an additional stripe directory on a different drive
|
|
char m_stripeDir[256];
|
|
|
|
char m_defaultColl [ MAX_COLL_LEN + 1 ];
|
|
|
|
char m_clusterName[32];
|
|
|
|
// . dns parameters
|
|
// . dnsDir should hold our saved cache (TODO: save the dns cache)
|
|
// presumably the number of entries in use in m_dnsIps / m_dnsPorts -- confirm
int32_t m_numDns ;
|
|
int32_t m_dnsIps[MAX_DNSIPS];
|
|
// NOTE(review): int16_t is signed, so a port number above 32767 does not
// fit as a positive value here -- confirm how readers of this array cast it
int16_t m_dnsPorts[MAX_DNSIPS];
|
|
|
|
int32_t m_dnsMaxCacheMem;
|
|
bool m_dnsSaveCache;
|
|
|
|
int32_t m_wikiProxyIp;
|
|
int32_t m_wikiProxyPort;
|
|
|
|
SafeBuf m_proxyIps;
|
|
bool m_useProxyIps;
|
|
bool m_automaticallyUseProxyIps;
|
|
SafeBuf m_proxyAuth;
|
|
|
|
// built-in dns parameters using name servers
|
|
char m_askRootNameservers;
|
|
// presumably the number of entries in use in m_rnsIps -- confirm
int32_t m_numRns;
|
|
int32_t m_rnsIps[MAX_RNSIPS];
|
|
|
|
// used to limit all rdb's to one merge per machine at a time
|
|
int32_t m_mergeBufSize;
|
|
|
|
// rdb settings
|
|
// posdb
|
|
int64_t m_posdbFileCacheSize;
|
|
int32_t m_posdbMaxTreeMem;
|
|
|
|
// tagdb
|
|
int64_t m_tagdbFileCacheSize;
|
|
int32_t m_tagdbMaxTreeMem;
|
|
|
|
// clusterdb for site clustering, each rec is 16 bytes
|
|
int64_t m_clusterdbFileCacheSize;
|
|
int32_t m_clusterdbMaxTreeMem;
|
|
int32_t m_clusterdbMinFilesToMerge;
|
|
|
|
// titledb
|
|
int64_t m_titledbFileCacheSize;
|
|
int32_t m_titledbMaxTreeMem;
|
|
|
|
// spiderdb
|
|
int64_t m_spiderdbFileCacheSize;
|
|
int32_t m_spiderdbMaxTreeMem;
|
|
|
|
// linkdb for storing linking relations
|
|
int32_t m_linkdbMaxTreeMem;
|
|
int32_t m_linkdbMinFilesToMerge;
|
|
|
|
// statsdb
|
|
int32_t m_statsdbMaxTreeMem;
|
|
bool m_useStatsdb;
|
|
|
|
// are we doing a command line thing like 'gb 0 dump s ....' in
|
|
// which case we do not want to log certain things
|
|
bool m_doingCommandLine;
|
|
|
|
int32_t m_maxCpuThreads;
|
|
int32_t m_maxIOThreads;
|
|
int32_t m_maxExternalThreads;
|
|
|
|
int32_t m_deadHostTimeout;
|
|
int32_t m_sendEmailTimeout;
|
|
int32_t m_pingSpacer;
|
|
|
|
int32_t m_maxDocsWanted; //maximum number of results in one go. Puts a limit on SearchInput::m_docsWanted
|
|
int32_t m_maxFirstResultNum; //maximum document offset / result-page. Puts a limit on SearchInput::m_firstResultNum
|
|
|
|
// NOTE: the next two members do not carry the m_ prefix that the other
// members of this class use
int32_t min_docid_splits; //minimum number of DocId splits using Msg40
|
|
int32_t max_docid_splits; //maximum number of DocId splits using Msg40
|
|
int64_t m_msg40_msg39_timeout; //timeout for entire get-docid-list phase, in milliseconds.
|
|
int64_t m_msg3a_msg39_network_overhead; //additional latency/overhead of sending request+response over network.
|
|
|
|
char m_useHighFrequencyTermCache;
|
|
|
|
bool m_spideringEnabled ;
|
|
bool m_injectionsEnabled ;
|
|
bool m_queryingEnabled ;
|
|
bool m_returnResultsAnyway;
|
|
|
|
bool m_addUrlEnabled ; // TODO: use at http interface level
|
|
bool m_doStripeBalancing ;
|
|
|
|
// . true if the server is on the production cluster
|
|
// . we enforce the 'elvtune -w 32 /dev/sd?' cmd on all drives because
|
|
// that yields higher performance when dumping/merging on disk
|
|
bool m_isLive;
|
|
|
|
int32_t m_maxTotalSpiders;
|
|
|
|
// indexdb has a max cached age for getting IndexLists (10 mins deflt)
|
|
int32_t m_indexdbMaxIndexListAge;
|
|
|
|
// TODO: parse these out!!!!
|
|
int32_t m_httpMaxSockets ;
|
|
int32_t m_httpsMaxSockets ;
|
|
int32_t m_httpMaxSendBufSize ;
|
|
|
|
// a search results cache (for Msg40)
|
|
int32_t m_searchResultsMaxCacheMem ;
|
|
int32_t m_searchResultsMaxCacheAge ; // in seconds
|
|
int64_t m_docSummaryWithDescriptionMaxCacheAge; //cache timeout for document summaries for documents with a meta-tag with description, in milliseconds
|
|
|
|
// for Weights.cpp
|
|
int32_t m_sliderParm;
|
|
|
|
//ranking settings
|
|
float m_termFreqWeightFreqMin;
|
|
float m_termFreqWeightFreqMax;
|
|
float m_termFreqWeightMin;
|
|
float m_termFreqWeightMax;
|
|
|
|
float m_densityWeightMin;
|
|
float m_densityWeightMax;
|
|
|
|
float m_diversityWeightMin;
|
|
float m_diversityWeightMax;
|
|
|
|
float m_hashGroupWeightBody;
|
|
float m_hashGroupWeightTitle;
|
|
float m_hashGroupWeightHeading;
|
|
float m_hashGroupWeightInlist;
|
|
float m_hashGroupWeightInMetaTag;
|
|
float m_hashGroupWeightInLinkText;
|
|
float m_hashGroupWeightInTag;
|
|
float m_hashGroupWeightNeighborhood;
|
|
float m_hashGroupWeightInternalLinkText;
|
|
float m_hashGroupWeightInUrl;
|
|
float m_hashGroupWeightInMenu;
|
|
|
|
float m_synonymWeight;
|
|
|
|
// send emails when a host goes down?
|
|
bool m_sendEmailAlerts;
|
|
//should we delay when only 1 host goes down out of twins till 9:30 am?
|
|
bool m_delayNonCriticalEmailAlerts;
|
|
bool m_sendEmailAlertsToSysadmin;
|
|
|
|
// four alert recipients follow, each with an enable flag plus MX / Addr /
// From buffers (presumably mail exchanger, to-address and from-address --
// confirm against the code that sends the alerts)
bool m_sendEmailAlertsToEmail1;
|
|
char m_email1MX[MAX_MX_LEN];
|
|
char m_email1Addr[MAX_EMAIL_LEN];
|
|
char m_email1From[MAX_EMAIL_LEN];
|
|
|
|
bool m_sendEmailAlertsToEmail2;
|
|
char m_email2MX[MAX_MX_LEN];
|
|
char m_email2Addr[MAX_EMAIL_LEN];
|
|
char m_email2From[MAX_EMAIL_LEN];
|
|
|
|
bool m_sendEmailAlertsToEmail3;
|
|
char m_email3MX[MAX_MX_LEN];
|
|
char m_email3Addr[MAX_EMAIL_LEN];
|
|
char m_email3From[MAX_EMAIL_LEN];
|
|
|
|
bool m_sendEmailAlertsToEmail4;
|
|
char m_email4MX[MAX_MX_LEN];
|
|
char m_email4Addr[MAX_EMAIL_LEN];
|
|
char m_email4From[MAX_EMAIL_LEN];
|
|
|
|
//bool m_sendEmailAlertsToSabino;
|
|
|
|
char m_errstr1[MAX_URL_LEN];
|
|
char m_errstr2[MAX_URL_LEN];
|
|
char m_errstr3[MAX_URL_LEN];
|
|
|
|
char m_sendParmChangeAlertsToEmail1;
|
|
char m_sendParmChangeAlertsToEmail2;
|
|
char m_sendParmChangeAlertsToEmail3;
|
|
char m_sendParmChangeAlertsToEmail4;
|
|
|
|
float m_avgQueryTimeThreshold;
|
|
//float m_maxQueryTime;
|
|
float m_querySuccessThreshold;
|
|
int32_t m_numQueryTimes;
|
|
int32_t m_maxCorruptLists;
|
|
|
|
int32_t m_defaultQueryResultsValidityTime; //in seconds
|
|
|
|
bool m_useCollectionPasswords;
|
|
|
|
bool m_allowCloudUsers;
|
|
|
|
// if in read-only mode we do no spidering and load no saved trees
|
|
// so we can use all mem for caching index lists
|
|
bool m_readOnlyMode;
|
|
|
|
// if this is true we use /etc/hosts for hostname lookup before dns
|
|
bool m_useEtcHosts;
|
|
|
|
// . should we always read data from local machine if available?
|
|
// . if your network is not gigabit, this may be a good idea
|
|
bool m_preferLocalReads;
|
|
|
|
// should we bypass load balancing and always send titledb record
|
|
// lookup requests to a host to maximize tfndb page cache hits?
|
|
//bool m_useBiasedTfndb;
|
|
|
|
|
|
// just ensure lists being written are valid rdb records (titlerecs)
|
|
// trying to isolate titlerec corruption
|
|
bool m_verifyDumpedLists;
|
|
|
|
// calls fsync(fd) if true after each write
|
|
bool m_flushWrites ;
|
|
bool m_verifyWrites;
|
|
int32_t m_corruptRetries;
|
|
|
|
// log unfreed memory on exit
|
|
bool m_detectMemLeaks;
|
|
|
|
// . if false we will not keep spelling information in memory
|
|
bool m_doSpellChecking;
|
|
|
|
bool m_forceIt;
|
|
|
|
// if this is true we do not add indexdb keys that *should* already
|
|
// be in indexdb. but if you recently upped the m_truncationLimit
|
|
// then you can set this to false to add all indexdb keys.
|
|
//bool m_onlyAddUnchangedTermIds;
|
|
bool m_doIncrementalUpdating;
|
|
|
|
// Temporary (?) setting for new Posdb no-in-memory-merge feature
|
|
bool m_noInMemoryPosdbMerge;
|
|
|
|
bool m_useQuickpoll;
|
|
|
|
int64_t m_stableSummaryCacheSize;
|
|
int64_t m_stableSummaryCacheMaxAge;
|
|
int64_t m_unstableSummaryCacheSize;
|
|
int64_t m_unstableSummaryCacheMaxAge;
|
|
|
|
// tagrec cache (for Msg8a)
|
|
int64_t m_tagRecCacheSize;
|
|
int64_t m_tagRecCacheMaxAge;
|
|
|
|
//bool m_quickpollCoreOnError;
|
|
bool m_useShotgun;
|
|
bool m_testMem;
|
|
bool m_doConsistencyTesting;
|
|
|
|
// defaults to "Gigabot/1.0"
|
|
char m_spiderUserAgent [ USERAGENTMAXSIZE ];
|
|
|
|
char m_spiderBotName [ USERAGENTMAXSIZE ];
|
|
|
|
int32_t m_autoSaveFrequency;
|
|
|
|
int32_t m_docCountAdjustment;
|
|
|
|
bool m_profilingEnabled;
|
|
bool m_dynamicPerfGraph;
|
|
int32_t m_minProfThreshold;
|
|
bool m_sequentialProfiling;
|
|
int32_t m_realTimeProfilerMinQuickPollDelta;
|
|
|
|
//
|
|
// See Log.h for an explanation of the switches below
|
|
//
|
|
|
|
// GET and POST requests.
|
|
bool m_logHttpRequests;
|
|
bool m_logAutobannedQueries;
|
|
//bool m_logQueryTimes;
|
|
// if query took this or more milliseconds, log its time
|
|
int32_t m_logQueryTimeThreshold;
|
|
// if disk read took this or more milliseconds, log its time
|
|
int32_t m_logDiskReadTimeThreshold;
|
|
|
|
bool m_logQueryReply;
|
|
// log what gets into the index
|
|
bool m_logSpideredUrls;
|
|
// log informational messages, they are not indicative of any error.
|
|
bool m_logInfo;
|
|
// when out of udp slots
|
|
bool m_logNetCongestion;
|
|
// doc quota limits, url truncation limits
|
|
bool m_logLimits;
|
|
// log debug switches
|
|
bool m_logDebugAddurl ;
|
|
bool m_logDebugAdmin ;
|
|
bool m_logDebugBuild ;
|
|
bool m_logDebugBuildTime ;
|
|
bool m_logDebugDb ;
|
|
bool m_logDebugDirty ;
|
|
bool m_logDebugDisk ;
|
|
bool m_logDebugDiskPageCache;
|
|
bool m_logDebugDns ;
|
|
bool m_logDebugDownloads;
|
|
bool m_logDebugHttp ;
|
|
bool m_logDebugImage ;
|
|
bool m_logDebugLoop ;
|
|
bool m_logDebugLang ;
|
|
bool m_logDebugLinkInfo ;
|
|
bool m_logDebugMem ;
|
|
bool m_logDebugMemUsage;
|
|
bool m_logDebugMerge ;
|
|
bool m_logDebugNet ;
|
|
bool m_logDebugProxies ;
|
|
bool m_logDebugQuery ;
|
|
bool m_logDebugQuota ;
|
|
bool m_logDebugRobots ;
|
|
bool m_logDebugSpcache ; // SpiderCache.cpp debug
|
|
bool m_logDebugSpeller ;
|
|
bool m_logDebugTagdb ;
|
|
bool m_logDebugSections;
|
|
bool m_logDebugSEO;
|
|
bool m_logDebugStats ;
|
|
bool m_logDebugSummary ;
|
|
bool m_logDebugSpider ;
|
|
bool m_logDebugMsg13 ;
|
|
bool m_logDebugUrlAttempts ;
|
|
bool m_logDebugTcp ;
|
|
bool m_logDebugTcpBuf ;
|
|
bool m_logDebugThread ;
|
|
bool m_logDebugTitle ;
|
|
bool m_logDebugTopDocs ;
|
|
bool m_logDebugUdp ;
|
|
bool m_logDebugUnicode ;
|
|
bool m_logDebugRepair ;
|
|
bool m_logDebugDate ;
|
|
bool m_logDebugDetailed;
|
|
|
|
// log trace switches
bool m_logTraceBigFile;
|
|
bool m_logTraceDns;
|
|
bool m_logTraceFile;
|
|
bool m_logTraceMem;
|
|
bool m_logTraceMsg0;
|
|
bool m_logTraceMsg4;
|
|
bool m_logTracePosdb;
|
|
bool m_logTraceRdb;
|
|
bool m_logTraceRdbBase;
|
|
bool m_logTraceRdbBuckets;
|
|
bool m_logTraceRdbDump;
|
|
bool m_logTraceRdbMap;
|
|
bool m_logTraceRdbIndex;
|
|
bool m_logTraceRepairs;
|
|
bool m_logTraceRobots;
|
|
bool m_logTraceSpider;
|
|
bool m_logTraceXmlDoc;
|
|
|
|
// expensive timing messages
|
|
bool m_logTimingAddurl ;
|
|
bool m_logTimingAdmin ;
|
|
bool m_logTimingBuild;
|
|
bool m_logTimingDb;
|
|
bool m_logTimingNet;
|
|
bool m_logTimingQuery;
|
|
bool m_logTimingSpcache;
|
|
bool m_logTimingRobots;
|
|
|
|
// programmer reminders.
|
|
bool m_logReminders;
|
|
|
|
SafeBuf m_masterPwds;
|
|
|
|
// these are the new master ips
|
|
SafeBuf m_connectIps;
|
|
|
|
// should we generate similarity/content vector for titleRecs lacking?
|
|
// this takes ~100+ ms, very expensive, so it is just meant for
|
|
// testing.
|
|
bool m_generateVectorAtQueryTime;
|
|
|
|
char m_redirect[MAX_URL_LEN];
|
|
char m_useCompressionProxy;
|
|
char m_gzipDownloads;
|
|
|
|
// used by proxy to make proxy point to the temp cluster while
|
|
// the original cluster is updated
|
|
char m_useTmpCluster;
|
|
|
|
char m_timeSyncProxy;
|
|
|
|
// NOTE(review): presumably the parsed xml of the conf file (the file header
// says the record is 100% xml) -- confirm in the .cpp
Xml m_xml;
|
|
|
|
// . for specifying if this is an interface machine
|
|
// messages are rerouted from this machine to the main
|
|
// cluster set in the hosts.conf.
|
|
bool m_interfaceMachine;
|
|
|
|
// allow scaling up of hosts by removing recs not in the correct
|
|
// group. otherwise a sanity check will happen.
|
|
char m_allowScale;
|
|
// . timeout on dead hosts, only set when we know a host is dead and
|
|
// will not come back online. Messages will timeout on the dead
|
|
// host, but not error, allowing outstanding spidering to finish
|
|
// to the twin
|
|
char m_giveupOnDeadHosts;
|
|
char m_bypassValidation;
|
|
|
|
int32_t m_maxHeartbeatDelay;
|
|
int32_t m_maxCallbackDelay;
|
|
|
|
// used by Repair.cpp
|
|
char m_repairingEnabled ;
|
|
int32_t m_maxRepairSpiders ;
|
|
int64_t m_repairMem;
|
|
SafeBuf m_collsToRepair;
|
|
char m_fullRebuild ;
|
|
char m_rebuildAddOutlinks;
|
|
char m_rebuildRecycleLinkInfo ;
|
|
char m_rebuildTitledb ;
|
|
char m_rebuildPosdb ;
|
|
char m_rebuildClusterdb ;
|
|
char m_rebuildSpiderdb ;
|
|
char m_rebuildLinkdb ;
|
|
char m_rebuildRoots ;
|
|
char m_rebuildNonRoots ;
|
|
|
};
|
|
|
|
// Declaration of the single global Conf instance ("is global so everybody can
// see it", per the notes at the top of this file). This is only an extern
// declaration; the object itself is defined in another translation unit.
extern class Conf g_conf;
|
|
|
|
#endif // GB_CONF_H
|