Files
privacore-open-source-searc…/Conf.h
2016-08-31 11:13:31 +02:00

506 lines
14 KiB
C++

// Copyright Matt Wells, Apr 2001
// . every host has a config record
// . like tagdb, record in 100% xml
// . allows remote configuration of hosts through Msg4 class
// . remote user sends some xml, we set our member vars using that xml
// . when we save to disk we convert our mem vars to xml
// . is global so everybody can see it
// . conf record can be changed by director OR with the host's priv key
// . use Conf remotely to get setup info about a specific host
// . get your local ip/port/groupMask/etc. from this class not HostMap
#ifndef GB_CONF_H
#define GB_CONF_H
#include "Collectiondb.h" // MAX_COLL_LEN
#include "SafeBuf.h"
#include "Xml.h"
// Size of the char buffers holding the spider user-agent and bot name
// (Conf::m_spiderUserAgent, Conf::m_spiderBotName).
#define USERAGENTMAXSIZE 128
// Capacity of the configured DNS server arrays (Conf::m_dnsIps, Conf::m_dnsPorts).
#define MAX_DNSIPS 16
// Capacity of the root name server array (Conf::m_rnsIps).
#define MAX_RNSIPS 13
// Size of each alert-email MX buffer (Conf::m_email1MX .. m_email4MX).
#define MAX_MX_LEN 128
// Size of each alert-email address buffer (Conf::m_emailNAddr, Conf::m_emailNFrom).
#define MAX_EMAIL_LEN 64
// Publicly accessible and generally HA / reachable DNS servers. Use Google's servers - works reasonably well
#define PUBLICLY_AVAILABLE_DNS1 "8.8.8.8"
#define PUBLICLY_AVAILABLE_DNS2 "8.8.4.4"
// Forward declarations: this header only uses pointers to these types.
class TcpSocket;
class HttpRequest;
// NOTE(review): presumably the permission bits applied when the process
// creates files / directories; defined outside this header - confirm.
mode_t getFileCreationFlags();
mode_t getDirCreationFlags ();
// Host-wide configuration record.
// . every setting is a plain public member, read by other modules through
//   the single global instance g_conf
// . init() loads the settings from "{dir}/gb.conf", save() writes changes
//   back and reset() restores the defaults (default values are not visible
//   in this header)
// . several on/off switches are declared as char rather than bool; they are
//   kept that way here because other code may depend on the exact types
class Conf {
public:
Conf();
// access checks for an incoming request.
// NOTE(review): the exact rules (passwords, ip lists, per-collection
// settings) are implemented outside this header - confirm there.
bool isCollAdmin ( TcpSocket *socket , HttpRequest *hr ) ;
bool isCollAdminForColl (TcpSocket *sock, HttpRequest *hr, const char *coll );
bool isCollAdmin2 (TcpSocket *socket , HttpRequest *hr,
class CollectionRec *cr) ;
bool isMasterAdmin ( TcpSocket *socket , HttpRequest *hr ) ;
bool hasMasterPwd ( HttpRequest *hr ) ;
// NOTE(review): presumably checked against m_connectIps below - confirm
bool isMasterIp ( uint32_t ip );
bool isConnectIp ( uint32_t ip );
// loads conf parms from this file "{dir}/gb.conf"
bool init ( char *dir );
// NOTE(review): presumably fills m_rnsIps / m_numRns - confirm
void setRootIps();
// saves any changes to the conf file
bool save ( );
// reset all values to their defaults
void reset();
// . get the default collection based on hostname
// will look for the hostname in each collection for a match
// no match defaults to default collection
const char *getDefaultColl ( char *hostname, int32_t hostnameLen );
// max amount of memory we can use
size_t m_maxMem;
// if this is false, we do not save, used by dump routines
// in main.cpp so they can change parms here and not worry about
// a core dump saving them
char m_save;
bool m_runAsDaemon;
bool m_logToFile;
bool m_isLocal;
// an additional stripe directory on a different drive
char m_stripeDir[256];
// name of the default collection, see getDefaultColl()
char m_defaultColl [ MAX_COLL_LEN + 1 ];
char m_clusterName[32];
// . dns parameters
// . dnsDir should hold our saved cache (TODO: save the dns cache)
// . m_numDns counts the used entries of m_dnsIps / m_dnsPorts
//   (presumably parallel arrays - confirm)
int32_t m_numDns ;
int32_t m_dnsIps[MAX_DNSIPS];
// NOTE(review): int16_t is signed, so ports above 32767 do not fit as
// positive values; confirm whether readers cast to uint16_t
int16_t m_dnsPorts[MAX_DNSIPS];
int32_t m_dnsMaxCacheMem;
bool m_dnsSaveCache;
// proxy settings
int32_t m_wikiProxyIp;
int32_t m_wikiProxyPort;
SafeBuf m_proxyIps;
bool m_useProxyIps;
bool m_automaticallyUseProxyIps;
SafeBuf m_proxyAuth;
// built-in dns parameters using name servers
char m_askRootNameservers;
int32_t m_numRns;
int32_t m_rnsIps[MAX_RNSIPS];
// used to limit all rdb's to one merge per machine at a time
// NOTE(review): the line above does not obviously describe
// m_mergeBufSize (presumably a merge buffer size in bytes); it may be
// left over from a removed member - confirm
int32_t m_mergeBufSize;
// rdb settings
// posdb
int64_t m_posdbFileCacheSize;
int32_t m_posdbMaxTreeMem;
// tagdb
int64_t m_tagdbFileCacheSize;
int32_t m_tagdbMaxTreeMem;
// clusterdb for site clustering, each rec is 16 bytes
int64_t m_clusterdbFileCacheSize;
int32_t m_clusterdbMaxTreeMem;
int32_t m_clusterdbMinFilesToMerge;
// titledb
int64_t m_titledbFileCacheSize;
int32_t m_titledbMaxTreeMem;
// spiderdb
int64_t m_spiderdbFileCacheSize;
int32_t m_spiderdbMaxTreeMem;
// linkdb for storing linking relations
int32_t m_linkdbMaxTreeMem;
int32_t m_linkdbMinFilesToMerge;
// statsdb
int32_t m_statsdbMaxTreeMem;
bool m_useStatsdb;
// are we doing a command line thing like 'gb 0 dump s ....' in
// which case we do not want to log certain things
bool m_doingCommandLine;
// thread limits
int32_t m_maxCpuThreads;
int32_t m_maxIOThreads;
int32_t m_maxExternalThreads;
// host liveness / alert timing (units are not stated in this header)
int32_t m_deadHostTimeout;
int32_t m_sendEmailTimeout;
int32_t m_pingSpacer;
int32_t m_maxDocsWanted; //maximum number of results in one go. Puts a limit on SearchInput::m_docsWanted
int32_t m_maxFirstResultNum; //maximum document offset / result-page. Puts a limit on SearchInput::m_firstResultNum
// NOTE(review): the next two members lack the m_ prefix used by every
// other member of this class; renaming would break existing users
int32_t min_docid_splits; //minimum number of DocId splits using Msg40
int32_t max_docid_splits; //maximum number of DocId splits using Msg40
int64_t m_msg40_msg39_timeout; //timeout for entire get-docid-list phase, in milliseconds.
int64_t m_msg3a_msg39_network_overhead; //additional latency/overhead of sending request+response over network.
char m_useHighFrequencyTermCache;
// master on/off switches
bool m_spideringEnabled ;
bool m_injectionsEnabled ;
bool m_queryingEnabled ;
bool m_returnResultsAnyway;
bool m_addUrlEnabled ; // TODO: use at http interface level
bool m_doStripeBalancing ;
// . true if the server is on the production cluster
// . we enforce the 'elvtune -w 32 /dev/sd?' cmd on all drives because
// that yields higher performance when dumping/merging on disk
bool m_isLive;
int32_t m_maxTotalSpiders;
// indexdb has a max cached age for getting IndexLists (10 mins deflt)
int32_t m_indexdbMaxIndexListAge;
// TODO: parse these out!!!!
int32_t m_httpMaxSockets ;
int32_t m_httpsMaxSockets ;
int32_t m_httpMaxSendBufSize ;
// a search results cache (for Msg40)
int32_t m_searchResultsMaxCacheMem ;
int32_t m_searchResultsMaxCacheAge ; // in seconds
int64_t m_docSummaryWithDescriptionMaxCacheAge; //cache timeout for document summaries for documents with a meta-tag with description, in milliseconds
// for Weights.cpp
int32_t m_sliderParm;
//ranking settings
// term frequency, density and diversity weights each have a min/max pair
float m_termFreqWeightFreqMin;
float m_termFreqWeightFreqMax;
float m_termFreqWeightMin;
float m_termFreqWeightMax;
float m_densityWeightMin;
float m_densityWeightMax;
float m_diversityWeightMin;
float m_diversityWeightMax;
// one weight per hash group
float m_hashGroupWeightBody;
float m_hashGroupWeightTitle;
float m_hashGroupWeightHeading;
float m_hashGroupWeightInlist;
float m_hashGroupWeightInMetaTag;
float m_hashGroupWeightInLinkText;
float m_hashGroupWeightInTag;
float m_hashGroupWeightNeighborhood;
float m_hashGroupWeightInternalLinkText;
float m_hashGroupWeightInUrl;
float m_hashGroupWeightInMenu;
float m_synonymWeight;
// send emails when a host goes down?
bool m_sendEmailAlerts;
//should we delay when only 1 host goes down out of twins till 9 30 am?
bool m_delayNonCriticalEmailAlerts;
bool m_sendEmailAlertsToSysadmin;
// four alert recipients, each with an enable switch, MX host,
// destination address and from address
bool m_sendEmailAlertsToEmail1;
char m_email1MX[MAX_MX_LEN];
char m_email1Addr[MAX_EMAIL_LEN];
char m_email1From[MAX_EMAIL_LEN];
bool m_sendEmailAlertsToEmail2;
char m_email2MX[MAX_MX_LEN];
char m_email2Addr[MAX_EMAIL_LEN];
char m_email2From[MAX_EMAIL_LEN];
bool m_sendEmailAlertsToEmail3;
char m_email3MX[MAX_MX_LEN];
char m_email3Addr[MAX_EMAIL_LEN];
char m_email3From[MAX_EMAIL_LEN];
bool m_sendEmailAlertsToEmail4;
char m_email4MX[MAX_MX_LEN];
char m_email4Addr[MAX_EMAIL_LEN];
char m_email4From[MAX_EMAIL_LEN];
//bool m_sendEmailAlertsToSabino;
// NOTE(review): MAX_URL_LEN is not defined in this header; presumably it
// arrives through one of the includes above - confirm
char m_errstr1[MAX_URL_LEN];
char m_errstr2[MAX_URL_LEN];
char m_errstr3[MAX_URL_LEN];
char m_sendParmChangeAlertsToEmail1;
char m_sendParmChangeAlertsToEmail2;
char m_sendParmChangeAlertsToEmail3;
char m_sendParmChangeAlertsToEmail4;
float m_avgQueryTimeThreshold;
//float m_maxQueryTime;
float m_querySuccessThreshold;
int32_t m_numQueryTimes;
int32_t m_maxCorruptLists;
int32_t m_defaultQueryResultsValidityTime; //in seconds
bool m_useCollectionPasswords;
bool m_allowCloudUsers;
// if in read-only mode we do no spidering and load no saved trees
// so we can use all mem for caching index lists
bool m_readOnlyMode;
// if this is true we use /etc/hosts for hostname lookup before dns
bool m_useEtcHosts;
// . should we always read data from local machine if available?
// . if your network is not gigabit, this may be a good idea
bool m_preferLocalReads;
// should we bypass load balancing and always send titledb record
// lookup requests to a host to maximize tfndb page cache hits?
//bool m_useBiasedTfndb;
// just ensure lists being written are valid rdb records (titlerecs)
// trying to isolate titlerec corruption
bool m_verifyDumpedLists;
// calls fsync(fd) if true after each write
bool m_flushWrites ;
bool m_verifyWrites;
int32_t m_corruptRetries;
// log unfreed memory on exit
bool m_detectMemLeaks;
// . if false we will not keep spelling information in memory
bool m_doSpellChecking;
bool m_forceIt;
// if this is true we do not add indexdb keys that *should* already
// be in indexdb. but if you recently upped the m_truncationLimit
// then you can set this to false to add all indexdb keys.
//bool m_onlyAddUnchangedTermIds;
bool m_doIncrementalUpdating;
// Temporary (?) setting for new Posdb no-in-memory-merge feature
bool m_noInMemoryPosdbMerge;
bool m_useQuickpoll;
// summary caches (units of the size / max-age values are not stated here)
int64_t m_stableSummaryCacheSize;
int64_t m_stableSummaryCacheMaxAge;
int64_t m_unstableSummaryCacheSize;
int64_t m_unstableSummaryCacheMaxAge;
// tagrec cache (for Msg8a)
int64_t m_tagRecCacheSize;
int64_t m_tagRecCacheMaxAge;
//bool m_quickpollCoreOnError;
bool m_useShotgun;
bool m_testMem;
bool m_doConsistencyTesting;
// defaults to "Gigabot/1.0"
char m_spiderUserAgent [ USERAGENTMAXSIZE ];
char m_spiderBotName [ USERAGENTMAXSIZE ];
int32_t m_autoSaveFrequency;
int32_t m_docCountAdjustment;
// profiler settings
bool m_profilingEnabled;
bool m_dynamicPerfGraph;
int32_t m_minProfThreshold;
bool m_sequentialProfiling;
int32_t m_realTimeProfilerMinQuickPollDelta;
//
// See Log.h for an explanation of the switches below
//
// GET and POST requests.
bool m_logHttpRequests;
bool m_logAutobannedQueries;
//bool m_logQueryTimes;
// if query took this or more milliseconds, log its time
int32_t m_logQueryTimeThreshold;
// if disk read took this or more milliseconds, log its time
int32_t m_logDiskReadTimeThreshold;
bool m_logQueryReply;
// log what gets into the index
bool m_logSpideredUrls;
// log informational messages, they are not indicative of any error.
bool m_logInfo;
// when out of udp slots
bool m_logNetCongestion;
// doc quota limits, url truncation limits
bool m_logLimits;
// log debug switches
bool m_logDebugAddurl ;
bool m_logDebugAdmin ;
bool m_logDebugBuild ;
bool m_logDebugBuildTime ;
bool m_logDebugDb ;
bool m_logDebugDirty ;
bool m_logDebugDisk ;
bool m_logDebugDiskPageCache;
bool m_logDebugDns ;
bool m_logDebugDownloads;
bool m_logDebugHttp ;
bool m_logDebugImage ;
bool m_logDebugLoop ;
bool m_logDebugLang ;
bool m_logDebugLinkInfo ;
bool m_logDebugMem ;
bool m_logDebugMemUsage;
bool m_logDebugMerge ;
bool m_logDebugNet ;
bool m_logDebugProxies ;
bool m_logDebugQuery ;
bool m_logDebugQuota ;
bool m_logDebugRobots ;
bool m_logDebugSpcache ; // SpiderCache.cpp debug
bool m_logDebugSpeller ;
bool m_logDebugTagdb ;
bool m_logDebugSections;
bool m_logDebugSEO;
bool m_logDebugStats ;
bool m_logDebugSummary ;
bool m_logDebugSpider ;
bool m_logDebugMsg13 ;
bool m_logDebugUrlAttempts ;
bool m_logDebugTcp ;
bool m_logDebugTcpBuf ;
bool m_logDebugThread ;
bool m_logDebugTitle ;
bool m_logDebugTopDocs ;
bool m_logDebugUdp ;
bool m_logDebugUnicode ;
bool m_logDebugRepair ;
bool m_logDebugDate ;
bool m_logDebugDetailed;
// log trace switches
bool m_logTraceBigFile;
bool m_logTraceDns;
bool m_logTraceFile;
bool m_logTraceMem;
bool m_logTraceMsg0;
bool m_logTraceMsg4;
bool m_logTracePosdb;
bool m_logTraceRdb;
bool m_logTraceRdbBase;
bool m_logTraceRdbBuckets;
bool m_logTraceRdbDump;
bool m_logTraceRdbMap;
bool m_logTraceRdbIndex;
bool m_logTraceRepairs;
bool m_logTraceRobots;
bool m_logTraceSpider;
bool m_logTraceXmlDoc;
// expensive timing messages
bool m_logTimingAddurl ;
bool m_logTimingAdmin ;
bool m_logTimingBuild;
bool m_logTimingDb;
bool m_logTimingNet;
bool m_logTimingQuery;
bool m_logTimingSpcache;
bool m_logTimingRobots;
// programmer reminders.
bool m_logReminders;
SafeBuf m_masterPwds;
// these are the new master ips
SafeBuf m_connectIps;
// should we generate similarity/content vector for titleRecs lacking?
// this takes ~100+ ms, very expensive, so it is just meant for
// testing.
bool m_generateVectorAtQueryTime;
char m_redirect[MAX_URL_LEN];
char m_useCompressionProxy;
char m_gzipDownloads;
// used by proxy to make proxy point to the temp cluster while
// the original cluster is updated
char m_useTmpCluster;
char m_timeSyncProxy;
// NOTE(review): presumably the parsed xml of the conf file - confirm
Xml m_xml;
// . for specifying if this is an interface machine
// messages are rerouted from this machine to the main
// cluster set in the hosts.conf.
bool m_interfaceMachine;
// allow scaling up of hosts by removing recs not in the correct
// group. otherwise a sanity check will happen.
char m_allowScale;
// . timeout on dead hosts, only set when we know a host is dead and
// will not come back online. Messages will timeout on the dead
// host, but not error, allowing outstanding spidering to finish
// to the twin
char m_giveupOnDeadHosts;
char m_bypassValidation;
int32_t m_maxHeartbeatDelay;
int32_t m_maxCallbackDelay;
// used by Repair.cpp
char m_repairingEnabled ;
int32_t m_maxRepairSpiders ;
int64_t m_repairMem;
SafeBuf m_collsToRepair;
// which parts to rebuild during a repair
char m_fullRebuild ;
char m_rebuildAddOutlinks;
char m_rebuildRecycleLinkInfo ;
char m_rebuildTitledb ;
char m_rebuildPosdb ;
char m_rebuildClusterdb ;
char m_rebuildSpiderdb ;
char m_rebuildLinkdb ;
char m_rebuildRoots ;
char m_rebuildNonRoots ;
};
// The one global configuration instance; declared here, defined in another
// translation unit.
extern class Conf g_conf;
#endif // GB_CONF_H