// Copyright Matt Wells, Apr 2001 // . every host has a config record // . like tagdb, record in 100% xml // . allows remote configuration of hosts through Msg4 class // . remote user sends some xml, we set our member vars using that xml // . when we save to disk we convert our mem vars to xml // . is global so everybody can see it // . conf record can be changed by director OR with the host's priv key // . use Conf remotely to get setup info about a specific host // . get your local ip/port/groupMask/etc. from this class not HostMap #ifndef GB_CONF_H #define GB_CONF_H #include "max_coll_len.h" #include "max_url_len.h" #include "SafeBuf.h" #include "BaseScoringParameters.h" #define USERAGENTMAXSIZE 128 #define MAX_DNSIPS 16 #define MAX_RNSIPS 13 //Publicly accessible and generallyy HA / reachable DNS servers. Use Google's servers - works reasonably well #define PUBLICLY_AVAILABLE_DNS1 "8.8.8.8" #define PUBLICLY_AVAILABLE_DNS2 "8.8.4.4" class TcpSocket; class HttpRequest; mode_t getFileCreationFlags(); mode_t getDirCreationFlags (); class Conf { public: Conf(); bool isCollAdmin ( TcpSocket *socket , HttpRequest *hr ); bool isCollAdminForColl (TcpSocket *sock, HttpRequest *hr, const char *coll ); bool isCollAdmin2 (TcpSocket *socket , HttpRequest *hr, class CollectionRec *cr); bool isMasterAdmin ( TcpSocket *socket , HttpRequest *hr ); bool hasMasterPwd ( HttpRequest *hr ); bool isMasterIp ( uint32_t ip ); bool isConnectIp ( uint32_t ip ); // loads conf parms from this file "{dir}/gb.conf" bool init ( char *dir ); void setRootIps(); // saves any changes to the conf file bool save ( ); // reset all values to their defaults void reset(); // defaults to default collection const char *getDefaultColl ( ); // max amount of memory we can use size_t m_maxMem; bool m_mlockAllCurrent; bool m_mlockAllFuture; // if this is false, we do not save, used by dump routines // in main.cpp so they can change parms here and not worry about // a core dump saving them bool m_save; bool m_runAsDaemon; bool m_logToFile; char m_defaultColl[MAX_COLL_LEN + 1]; // . dns parameters // . dnsDir should hold our saved cached (TODO: save the dns cache) int32_t m_numDns; int32_t m_dnsIps[MAX_DNSIPS]; int16_t m_dnsPorts[MAX_DNSIPS]; int64_t m_dnsCacheSize; int64_t m_dnsCacheMaxAge; int64_t m_dnsCacheMaxMem; SafeBuf m_proxyIps; SafeBuf m_proxyAuth; // built-in dns parameters using name servers bool m_askRootNameservers; int32_t m_numRns; int32_t m_rnsIps[MAX_RNSIPS]; char m_urlClassificationServerName[64]; int32_t m_urlClassificationServerPort; unsigned m_maxOutstandingUrlClassifications; unsigned m_urlClassificationTimeout; // used to limit all rdb's to one merge per machine at a time int32_t m_mergeBufSize; int32_t m_doledbNukeInterval; // rdb settings // posdb int32_t m_posdbMaxLostPositivesPercentage; int64_t m_posdbFileCacheSize; int32_t m_posdbMaxTreeMem; // tagdb int32_t m_tagdbMaxLostPositivesPercentage; int64_t m_tagdbFileCacheSize; int32_t m_tagdbMaxTreeMem; char m_mergespaceLockDirectory[1024]; int32_t m_mergespaceMinLockFiles; char m_mergespaceDirectory[1024]; // clusterdb for site clustering, each rec is 16 bytes int32_t m_clusterdbMaxLostPositivesPercentage; int64_t m_clusterdbFileCacheSize; int32_t m_clusterdbMaxTreeMem; int32_t m_clusterdbMinFilesToMerge; // titledb int32_t m_titledbMaxLostPositivesPercentage; int64_t m_titledbFileCacheSize; int32_t m_titledbMaxTreeMem; // spiderdb int32_t m_spiderdbMaxLostPositivesPercentage; int64_t m_spiderdbFileCacheSize; int32_t m_spiderdbMaxTreeMem; // linkdb for storing linking relations int32_t m_linkdbMaxLostPositivesPercentage; int32_t m_linkdbMaxTreeMem; int32_t m_linkdbMinFilesToMerge; // are we doing a command line thing like 'gb 0 dump s ....' in // which case we do not want to log certain things bool m_doingCommandLine; int32_t m_maxCoordinatorThreads; int32_t m_maxCpuThreads; int32_t m_maxSummaryThreads; int32_t m_maxIOThreads; int32_t m_maxExternalThreads; int32_t m_maxFileMetaThreads; int32_t m_maxMergeThreads; int32_t m_maxJobCleanupTime; char m_vagusClusterId[128]; int32_t m_vagusPort; int32_t m_vagusKeepaliveSendInterval; //milliseconds int32_t m_vagusKeepaliveLifetime; //milliseconds int32_t m_vagusMaxDeadTime; //minutes int32_t m_maxDocsWanted; //maximum number of results in one go. Puts a limit on SearchInput::m_docsWanted int32_t m_maxFirstResultNum; //maximum document offset / result-page. Puts a limit on SearchInput::m_firstResultNum int32_t min_docid_splits; //minimum number of DocId splits using Msg40 int32_t max_docid_splits; //maximum number of DocId splits using Msg40 int64_t m_msg40_msg39_timeout; //timeout for entire get-docid-list phase, in milliseconds. int64_t m_msg3a_msg39_network_overhead; //additional latency/overhead of sending reqeust+response over network. bool m_useHighFrequencyTermCache; bool m_spideringEnabled; bool m_injectionsEnabled; bool m_queryingEnabled; bool m_returnResultsAnyway; bool m_spiderIPUrl; bool m_spiderAdultContent; bool m_addUrlEnabled; // TODO: use at http interface level bool m_doStripeBalancing; // . true if the server is on the production cluster // . we enforce the 'elvtune -w 32 /dev/sd?' cmd on all drives because // that yields higher performance when dumping/merging on disk bool m_isLive; int32_t m_maxTotalSpiders; int32_t m_spiderFilterableMaxWordCount; int32_t m_spiderDeadHostCheckInterval; int64_t m_spiderUrlCacheMaxAge; int64_t m_spiderUrlCacheSize; int64_t m_spiderUrlCacheMaxMem; // indexdb has a max cached age for getting IndexLists (10 mins deflt) int32_t m_indexdbMaxIndexListAge; int32_t m_udpMaxSockets; // TODO: parse these out!!!! int32_t m_httpMaxSockets; int32_t m_httpsMaxSockets; int32_t m_httpMaxSendBufSize; // a search results cache (for Msg40) int64_t m_docSummaryWithDescriptionMaxCacheAge; //cache timeout for document summaries for documents with a meta-tag with description, in milliseconds // for Weights.cpp int32_t m_sliderParm; BaseScoringParameters m_baseScoringParameters; int32_t m_numFlagScoreMultipliers; //constant = 26 int32_t m_numFlagRankAdjustments; //constant = 26 int32_t m_maxCorruptLists; int32_t m_defaultQueryResultsValidityTime; //in seconds bool m_useCollectionPasswords; // if in read-only mode we do no spidering and load no saved trees // so we can use all mem for caching index lists bool m_readOnlyMode; // if this is true we use /etc/hosts for hostname lookup before dns bool m_useEtcHosts; //verify integrity of tree/buckets after modification operations bool m_verifyTreeIntegrity; // just ensure lists being written are valid rdb records (titlerecs) // trying to isolate titlerec corruption bool m_verifyDumpedLists; // verify validity of index while merging bool m_verifyIndex; // calls fsync(fd) if true after each write bool m_flushWrites; bool m_verifyWrites; int32_t m_corruptRetries; bool m_msg20FallbackToAllHosts; // log unfreed memory on exit bool m_detectMemLeaks; bool m_forceIt; // if this is true we do not add indexdb keys that *should* already // be in indexdb. but if you recently upped the m_truncationLimit // then you can set this to false to add all indexdb keys. //bool m_onlyAddUnchangedTermIds; bool m_doIncrementalUpdating; int64_t m_stableSummaryCacheSize; int64_t m_stableSummaryCacheMaxAge; int64_t m_unstableSummaryCacheSize; int64_t m_unstableSummaryCacheMaxAge; bool m_useShotgun; bool m_testMem; bool m_doConsistencyTesting; int32_t m_titleRecVersion; // defaults to "Gigabot/1.0" char m_spiderUserAgent[USERAGENTMAXSIZE]; char m_spiderBotName[USERAGENTMAXSIZE]; int32_t m_autoSaveFrequency; int32_t m_docCountAdjustment; bool m_profilingEnabled; // // See Log.h for an explanation of the switches below // // GET and POST requests. bool m_logHttpRequests; bool m_logAutobannedQueries; int32_t m_logLoopTimeThreshold; int32_t m_logRdbIndexAddListTimeThreshold; int32_t m_logRdbMapAddListTimeThreshold; // if query took this or more milliseconds, log its time int32_t m_logQueryTimeThreshold; // if disk read took this or more milliseconds, log its time int32_t m_logDiskReadTimeThreshold; bool m_logQueryReply; // log what gets into the index bool m_logSpideredUrls; // log informational messages, they are not indicative of any error. bool m_logInfo; // when out of udp slots bool m_logNetCongestion; // doc quota limits, url truncation limits bool m_logLimits; // log debug switches bool m_logDebugAddurl; bool m_logDebugAdmin; bool m_logDebugBuild; bool m_logDebugBuildTime; bool m_logDebugDate; bool m_logDebugDb; bool m_logDebugDetailed; bool m_logDebugDirty; bool m_logDebugDisk; bool m_logDebugDns; bool m_logDebugDownloads; bool m_logDebugHttp; bool m_logDebugImage; bool m_logDebugLang; bool m_logDebugLinkInfo; bool m_logDebugLoop; bool m_logDebugMem; bool m_logDebugMemUsage; bool m_logDebugMerge; bool m_logDebugMsg13; bool m_logDebugMsg20; bool m_logDebugMulticast; bool m_logDebugNet; bool m_logDebugProxies; bool m_logDebugQuery; bool m_logDebugRepair; bool m_logDebugRobots; bool m_logDebugSections; bool m_logDebugSpcache; // SpiderCache.cpp debug bool m_logDebugSpeller; bool m_logDebugSpider; bool m_logDebugReindex; bool m_logDebugSEO; bool m_logDebugStats; bool m_logDebugSummary; bool m_logDebugTagdb; bool m_logDebugTcp; bool m_logDebugTcpBuf; bool m_logDebugTitle; bool m_logDebugTopDocs; bool m_logDebugUdp; bool m_logDebugUnicode; bool m_logDebugUrlAttempts; bool m_logDebugVagus; bool m_logTraceBigFile; bool m_logTraceBlockList; bool m_logTraceContentTypeBlockList; bool m_logTraceDocProcess; bool m_logTraceDns; bool m_logTraceDnsBlockList; bool m_logTraceDnsCache; bool m_logTraceFile; bool m_logTraceHttpMime; bool m_logTraceLanguageResultOverride; bool m_logTraceMem; bool m_logTraceMsg0; bool m_logTraceMsg4; bool m_logTraceMsg25; bool m_logTracePageLinkdbLookup; bool m_logTracePageSpiderdbLookup; bool m_logTracePos; bool m_logTracePosdb; bool m_logTraceQuery; bool m_logTraceRdb; bool m_logTraceRdbBase; bool m_logTraceRdbBuckets; bool m_logTraceRdbDump; bool m_logTraceRdbIndex; bool m_logTraceRdbList; bool m_logTraceRdbMap; bool m_logTraceRdbMerge; bool m_logTraceRdbTree; bool m_logTraceRepairs; bool m_logTraceRobots; bool m_logTraceRobotsCheckList; bool m_logTraceSpider; bool m_logTraceSpiderUrlCache; bool m_logTraceSpiderdbHostDelete; bool m_logTraceReindex; bool m_logTraceSummary; bool m_logTraceTitledb; bool m_logTraceXmlDoc; bool m_logTracePhrases; bool m_logTraceUrlMatchList; bool m_logTraceUrlMatchHostList; bool m_logTraceUrlResultOverride; bool m_logTraceWordSpam; bool m_logTraceUrlClassification; bool m_logTraceTopTree; bool m_logTraceTermCheckList; // expensive timing messages bool m_logTimingAddurl; bool m_logTimingAdmin; bool m_logTimingBuild; bool m_logTimingDb; bool m_logTimingNet; bool m_logTimingQuery; bool m_logTimingSpcache; bool m_logTimingRobots; // programmer reminders. bool m_logReminders; SafeBuf m_masterPwds; // these are the new master ips SafeBuf m_connectIps; char m_redirect[MAX_URL_LEN]; bool m_useCompressionProxy; bool m_gzipDownloads; // used by proxy to make proxy point to the temp cluster while // the original cluster is updated bool m_useTmpCluster; // allow scaling up of hosts by removing recs not in the correct // group. otherwise a sanity check will happen. bool m_allowScale; bool m_bypassValidation; int32_t m_maxCallbackDelay; // used by Repair.cpp bool m_repairingEnabled; int32_t m_maxRepairinjections; int64_t m_repairMem; SafeBuf m_collsToRepair; bool m_fullRebuild; bool m_rebuildAddOutlinks; bool m_rebuildRecycleLinkInfo; bool m_rebuildTitledb; bool m_rebuildPosdb; bool m_rebuildClusterdb; bool m_rebuildSpiderdb; bool m_rebuildSpiderdbSmall; bool m_rebuildLinkdb; bool m_rebuildRoots; bool m_rebuildNonRoots; }; extern class Conf g_conf; #endif // GB_CONF_H