privacore-open-source-searc.../Conf.cpp
Ivan Skytte Jørgensen beeddcf35d Got rid of gb-include.h
2018-07-26 17:29:51 +02:00

581 lines
15 KiB
C++

#include "Conf.h"
#include "Parms.h"
#include "File.h"
#include "Proxy.h"
#include "Msg3a.h" // MAX_SHARDS
#include "Collectiondb.h"
#include "TcpSocket.h"
#include "HttpRequest.h"
#include "Process.h"
#include "Mem.h"
#include "TitleRecVersion.h"
#include "ip.h"
#include "Errno.h"
#include <sys/stat.h> //umask()
// The global configuration object. Several member functions in this file
// access it directly by name (g_conf.m_...) rather than through "this".
Conf g_conf;
// true once the process umask has been cleared by one of the functions below
static bool s_setUmask = false;

// . clear the process umask the first time this is called
// . every later call is a no-op
static void clearUmaskOnce ( ) {
	if ( s_setUmask ) return;
	s_setUmask = true;
	umask ( 0 );
}

// . permission bits to pass when creating a file
// . read+write for user and group, read-only for others
// . clears the process umask on first use so the bits are applied as given
mode_t getFileCreationFlags() {
	clearUmaskOnce();
	const mode_t userBits  = S_IRUSR | S_IWUSR;
	const mode_t groupBits = S_IRGRP | S_IWGRP;
	const mode_t otherBits = S_IROTH;
	return userBits | groupBits | otherBits;
}

// . permission bits to pass when creating a directory
// . same as the file bits plus execute (search) for user and group
// . clears the process umask on first use so the bits are applied as given
mode_t getDirCreationFlags() {
	clearUmaskOnce();
	const mode_t userBits  = S_IRUSR | S_IWUSR | S_IXUSR;
	const mode_t groupBits = S_IRGRP | S_IWGRP | S_IXGRP;
	const mode_t otherBits = S_IROTH;
	return userBits | groupBits | otherBits;
}
// . constructor: gives every member listed below a fixed starting value
// . most members are zeroed / switched off, a handful get non-zero defaults
// . NOTE(review): Conf::init() and Conf::reset() call g_parms.setToDefault()
//   on this object, which presumably overrides many of these values later
Conf::Conf ( ) {
m_save = true;
m_doingCommandLine = false;
// set max mem to 16GB at least until we load on disk
m_maxMem = 16000000000;
m_mlockAllCurrent = true;
m_mlockAllFuture = true;
// everything from here on is explicit initialization that was added for
// Coverity.. a bit overkill, but wth...
m_runAsDaemon = false;
m_logToFile = false;
memset(m_defaultColl, 0, sizeof(m_defaultColl));
// dns settings
m_numDns = 0;
memset(m_dnsIps, 0, sizeof(m_dnsIps));
memset(m_dnsPorts, 0, sizeof(m_dnsPorts));
m_dnsCacheMaxAge = 0;
m_dnsCacheSize = 0;
m_dnsMaxCacheMem = 0;
m_clusterdbQuickCacheMem = 0;
m_askRootNameservers = false;
m_numRns = 0;
memset(m_rnsIps, 0, sizeof(m_rnsIps));
// external servers: name, port, max outstanding requests and timeout
m_queryLanguageServerName[0] = '\0';
m_queryLanguageServerPort = 0;
m_maxOutstandingQueryLanguage = 0;
m_queryLanguageTimeout = 0;
m_siteMedianPageTemperatureServerName[0] = '\0';
m_siteMedianPageTemperatureServerPort = 0;
m_maxOutstandingSiteMedianPageTemperature = 0;
m_siteMedianPageTemperatureTimeout = 0;
m_siteNumInlinksServerName[0] = '\0';
m_siteNumInlinksServerPort = 0;
m_maxOutstandingSiteNumInlinks = 0;
m_siteNumInlinksTimeout = 0;
m_urlClassificationServerName[0] = '\0';
m_urlClassificationServerPort = 0;
m_maxOutstandingUrlClassifications = 0;
m_urlClassificationTimeout = 0;
// per-database (posdb, tagdb, clusterdb, titledb, spiderdb, linkdb) and
// merge settings
m_mergeBufSize = 0;
m_doledbNukeInterval = 86400;
m_posdbMaxLostPositivesPercentage = 0;
m_posdbFileCacheSize = 0;
m_posdbMaxTreeMem = 0;
m_tagdbMaxLostPositivesPercentage = 0;
m_tagdbFileCacheSize = 0;
m_tagdbMaxTreeMem = 0;
m_mergespaceLockDirectory[0] = '\0';
m_mergespaceMinLockFiles = 0;
m_mergespaceDirectory[0] = '\0';
m_clusterdbMaxLostPositivesPercentage = 0;
m_clusterdbFileCacheSize = 0;
m_clusterdbMaxTreeMem = 0;
m_clusterdbMinFilesToMerge = 0;
m_titledbMaxLostPositivesPercentage = 0;
m_titledbFileCacheSize = 0;
m_titledbMaxTreeMem = 0;
m_spiderdbMaxLostPositivesPercentage = 0;
m_spiderdbFileCacheSize = 0;
m_spiderdbMaxTreeMem = 0;
m_linkdbMaxLostPositivesPercentage = 0;
m_linkdbMaxTreeMem = 0;
m_linkdbMinFilesToMerge = 0;
// thread limits
m_maxCpuThreads = 0;
m_maxIOThreads = 0;
m_maxExternalThreads = 0;
m_maxJobCleanupTime = 0;
// vagus settings (these get non-zero defaults)
m_vagusClusterId[0] = '\0';
m_vagusPort = 8720;
m_vagusKeepaliveSendInterval = 500;
m_vagusKeepaliveLifetime = 5000;
m_vagusMaxDeadTime = 5;
// query / result limits
m_maxDocsWanted = 0;
m_maxFirstResultNum = 0;
min_docid_splits = 0;
max_docid_splits = 0;
m_msg40_msg39_timeout = 0;
m_msg3a_msg39_network_overhead = 0;
m_useHighFrequencyTermCache = false;
// master switches and spider settings
m_spideringEnabled = false;
m_injectionsEnabled = false;
m_queryingEnabled = false;
m_returnResultsAnyway = false;
m_spiderIPUrl = true;
m_spiderAdultContent = true;
m_addUrlEnabled = false;
m_doStripeBalancing = false;
m_isLive = false;
m_maxTotalSpiders = 0;
m_spiderFilterableMaxWordCount = 0;
m_spiderDeadHostCheckInterval = 0;
m_spiderUrlCacheMaxAge = 0;
m_spiderUrlCacheSize = 0;
m_indexdbMaxIndexListAge = 0;
// socket limits
m_udpMaxSockets = 0;
m_httpMaxSockets = 0;
m_httpsMaxSockets = 0;
m_httpMaxSendBufSize = 0;
m_docSummaryWithDescriptionMaxCacheAge = 0;
// ranking parameters
m_sliderParm = 0;
m_sameLangWeight = 0.0;
m_unknownLangWeight = 0.0;
m_baseScoringParameters.clear();
m_numFlagScoreMultipliers = 26;
m_numFlagRankAdjustments = 26;
// neutral values for the flag slots: multiplier 1.0 and adjustment 0.
// the bound 26 is the same value as the two counts set just above.
for(int i=0; i<26; i++) {
m_baseScoringParameters.m_flagScoreMultiplier[i] = 1.0;
m_baseScoringParameters.m_flagRankAdjustment[i] = 0;
}
m_maxCorruptLists = 0;
m_defaultQueryResultsValidityTime = 0;
m_useCollectionPasswords = false;
// verification / integrity switches
m_readOnlyMode = false;
m_useEtcHosts = false;
m_verifyTreeIntegrity = false;
m_verifyDumpedLists = false;
m_verifyIndex = false;
m_flushWrites = false;
m_verifyWrites = false;
m_verifyTagRec = false;
m_corruptRetries = 0;
m_sqliteSynchronous = 1;
// delete / rebuild / reindex pacing
m_docDeleteDelayMs = 0;
m_docRebuildDelayMs = 0;
m_docReindexDelayMs = 1000;
m_docDeleteMaxPending = 1;
m_docRebuildMaxPending = 1;
m_docReindexMaxPending = 10;
m_detectMemLeaks = false;
m_forceIt = false;
m_doIncrementalUpdating = false;
// summary caches
m_stableSummaryCacheSize = 0;
m_stableSummaryCacheMaxAge = 0;
m_unstableSummaryCacheSize = 0;
m_unstableSummaryCacheMaxAge = 0;
m_useShotgun = false;
m_testMem = false;
m_doConsistencyTesting = false;
m_titleRecVersion = TITLEREC_CURRENT_VERSION;
memset(m_spiderUserAgent, 0, sizeof(m_spiderUserAgent));
memset(m_spiderBotName, 0, sizeof(m_spiderBotName));
m_autoSaveFrequency = 0;
m_docCountAdjustment = 0;
m_profilingEnabled = false;
// general logging switches and time thresholds
m_logHttpRequests = false;
m_logAutobannedQueries = false;
m_logLoopTimeThreshold = 500;
m_logRdbIndexAddListTimeThreshold = 50;
m_logRdbMapAddListTimeThreshold = 50;
m_logQueryTimeThreshold = 0;
m_logDiskReadTimeThreshold = 0;
m_logSqliteTransactionTimeThreshold = 0;
m_logQueryReply = false;
m_logSpideredUrls = false;
m_logInfo = false;
m_logNetCongestion = false;
m_logLimits = false;
// all debug logging off
m_logDebugAddurl = false;
m_logDebugAdmin = false;
m_logDebugBuild = false;
m_logDebugBuildTime = false;
m_logDebugDate = false;
m_logDebugDb = false;
m_logDebugDetailed = false;
m_logDebugDirty = false;
m_logDebugDisk = false;
m_logDebugDns = false;
m_logDebugDownloads = false;
m_logDebugHttp = false;
m_logDebugImage = false;
m_logDebugLang = false;
m_logDebugLinkInfo = false;
m_logDebugLoop = false;
m_logDebugMem = false;
m_logDebugMemUsage = false;
m_logDebugMerge = false;
m_logDebugMsg13 = false;
m_logDebugMsg20 = false;
m_logDebugMulticast = false;
m_logDebugNet = false;
m_logDebugProxies = false;
m_logDebugQuery = false;
m_logDebugRepair = false;
m_logDebugRobots = false;
m_logDebugSections = false;
m_logDebugSpcache = false;
m_logDebugSpeller = false;
m_logDebugSpider = false;
m_logDebugReindex = false;
m_logDebugSEO = false;
m_logDebugStats = false;
m_logDebugSummary = false;
m_logDebugTagdb = false;
m_logDebugTcp = false;
m_logDebugTcpBuf = false;
m_logDebugTitle = false;
m_logDebugTopDocs = false;
m_logDebugUdp = false;
m_logDebugUnicode = false;
m_logDebugUrlAttempts = false;
m_logDebugVagus = false;
// all trace logging off
m_logTraceBigFile = false;
m_logTraceMatchList = false;
m_logTraceContentTypeBlockList = false;
m_logTraceDocid2FlagsAndSiteMap = false;
m_logTraceDocProcess = false;
m_logTraceDns = false;
m_logTraceDnsBlockList = false;
m_logTraceDnsCache = false;
m_logTraceFile = false;
m_logTraceHttpMime = false;
m_logTraceIpBlockList = false;
m_logTraceLanguageResultOverride = false;
m_logTraceMem = false;
m_logTraceMsg0 = false;
m_logTraceMsg4In = false;
m_logTraceMsg4Out = false;
m_logTraceMsg4OutData = false;
m_logTraceMsg25 = false;
m_logTracePageLinkdbLookup = false;
m_logTracePageSpiderdbLookup = false;
m_logTracePos = false;
m_logTracePosdb = false;
m_logTraceQuery = false;
m_logTraceQueryLanguage = false;
m_logTraceRdb = false;
m_logTraceRdbBase = false;
m_logTraceRdbBuckets = false;
m_logTraceRdbDump = false;
m_logTraceRdbIndex = false;
m_logTraceRdbList = false;
m_logTraceRdbMap = false;
m_logTraceRdbMerge = false;
m_logTraceRdbTree = false;
m_logTraceRepairs = false;
m_logTraceRobots = false;
m_logTraceRobotsCheckList = false;
m_logTraceSpider = false;
m_logTraceSpiderUrlCache = false;
m_logTraceReindex = false;
m_logTraceSiteMedianPageTemperature = false;
m_logTraceSiteNumInlinks = false;
m_logTraceSpiderdbRdbSqliteBridge = false;
m_logTraceSummary = false;
m_logTraceTitledb = false;
m_logTraceXmlDoc = false;
m_logTracePhrases= false;
m_logTraceTokenIndexing = false;
m_logTraceUrlMatchList = false;
m_logTraceUrlResultOverride = false;
m_logTraceWordSpam=false;
m_logTraceUrlClassification = false;
m_logTraceTermCheckList = false;
// all timing logging off
m_logTimingAddurl = false;
m_logTimingAdmin = false;
m_logTimingBuild = false;
m_logTimingDb = false;
m_logTimingNet = false;
m_logTimingQuery = false;
m_logTimingLinkInfo = false;
m_logTimingRobots = false;
m_logReminders = false;
memset(m_redirect, 0, sizeof(m_redirect));
m_useCompressionProxy = false;
m_gzipDownloads = false;
m_useTmpCluster = false;
m_allowScale = true;
m_bypassValidation = false;
m_maxCallbackDelay = -1;
// repair / rebuild settings
m_repairingEnabled = false;
m_maxRepairinjections = 0;
m_repairMem = 0;
m_fullRebuild = true;
m_rebuildAddOutlinks = false;
m_rebuildRecycleLinkInfo = true;
m_rebuildUseTitleRecTagRec = true;
m_rebuildTitledb = false;
m_rebuildPosdb = false;
m_rebuildClusterdb = false;
m_rebuildSpiderdb = false;
m_rebuildLinkdb = false;
m_rebuildRoots = true;
m_rebuildNonRoots = true;
}
// . is "p" one of the whitespace-separated tokens in "buf"?
// . "p"   = the token to look for (an ip string or a password)
// . "buf" = NUL-terminated list of tokens separated by whitespace
// . returns true only if some occurrence of "p" in "buf" is book-ended by
//   whitespace or by the start/end of the list
// . a NULL or empty "p" and a NULL "buf" never match
static bool isInWhiteSpaceList ( const char *p , const char *buf ) {
	if ( ! p || ! buf ) return false;
	size_t len = strlen(p);
	// an empty token would otherwise "match" at the edge of the list
	if ( len == 0 ) return false;
	// look at every occurrence, not just the first one: the first hit may
	// merely be part of a longer token (e.g. "abc" inside "xabc abc")
	for ( const char *match = strstr ( buf , p ) ;
	      match ;
	      match = strstr ( match + 1 , p ) ) {
		// ensure book-ended by whitespace
		bool startOk = ( match == buf || is_wspace_a(match[-1]) );
		bool endOk   = ( ! match[len] || is_wspace_a(match[len]) );
		if ( startOk && endOk ) return true;
	}
	// no match
	return false;
}
// . is the requester an admin of the collection named in the request?
// . a master admin is always accepted
// . otherwise the collection is looked up from the request and the decision
//   is delegated to isCollAdmin2()
bool Conf::isCollAdmin ( TcpSocket *sock , HttpRequest *hr ) {
	const bool master = isMasterAdmin ( sock , hr );
	if ( master ) return true;
	CollectionRec *rec = g_collectiondb.getRec ( hr , true );
	// unknown collection means no access
	return rec ? isCollAdmin2 ( sock , hr , rec ) : false;
}
// . is the requester an admin of the collection called "coll"?
// . unlike isCollAdmin() this does not consult isMasterAdmin() first
// . returns false if no such collection exists
bool Conf::isCollAdminForColl ( TcpSocket *sock, HttpRequest *hr, const char *coll ) {
	CollectionRec *rec = g_collectiondb.getRec ( coll );
	if ( rec ) return isCollAdmin2 ( sock , hr , rec );
	return false;
}
// . is the requester an admin of collection "cr"?
// . "sock" = connection the request came in on, may be NULL
// . "hr"   = the http request, may be NULL
// . "cr"   = the collection, NULL yields false
// . never true for the "main" collection and never true while collection
//   passwords are disabled
// . if the collection has neither an ip list nor a password list configured
//   then everybody is let through
// . otherwise the requester needs a listed ip *OR* a listed password
bool Conf::isCollAdmin2 ( TcpSocket *sock, HttpRequest *hr, CollectionRec *cr ) {
	if ( ! cr ) return false;
	// never for main! must be root!
	if ( strcmp(cr->m_coll,"main")==0 ) return false;
	if ( ! g_conf.m_useCollectionPasswords) return false;
	const bool haveIps  = cr->m_collectionIps      .length() > 0;
	const bool havePwds = cr->m_collectionPasswords.length() > 0;
	// nothing configured? then allow them through
	if ( ! haveIps && ! havePwds ) return true;
	// a good ip? only consult the list if there is one (the buffer of an
	// empty list must not be searched) and if we know the peer
	if ( haveIps && sock ) {
		char ipbuf[16];
		const char *ip = iptoa(sock->m_ip,ipbuf);
		if ( isInWhiteSpaceList ( ip , cr->m_collectionIps.getBufStart() ) )
			return true;
	}
	// without a password list or a request there is nothing left to check
	if ( ! havePwds || ! hr ) return false;
	// if they got the password, let them in
	const char *pwd = hr->getString("pwd");
	if ( ! pwd ) pwd = hr->getString("password");
	if ( ! pwd ) pwd = hr->getStringFromCookie("pwd");
	// a missing or empty password never grants access
	if ( ! pwd || ! pwd[0] ) return false;
	return isInWhiteSpaceList ( pwd , cr->m_collectionPasswords.getBufStart() );
}
// . is user a root administrator?
// . only need to be from root IP *OR* have password, not both
// . if neither master ips nor master passwords are configured then access
//   is totally open
// . "socket" and "hr" may each be NULL, the respective check is then skipped
// . a request carrying admin=0 is never treated as admin, so an admin can
//   look at the non-admin view
bool Conf::isMasterAdmin ( TcpSocket *socket , HttpRequest *hr ) {
	// totally open access?
	bool isAdmin = ( m_connectIps.length() <= 0 &&
			 m_masterPwds.length() <= 0 );
	// coming from root gets you in
	if ( ! isAdmin && socket && isMasterIp ( socket->m_ip ) )
		isAdmin = true;
	// so does a master password, but we need a request to read it from
	if ( ! isAdmin && hr && hasMasterPwd ( hr ) )
		isAdmin = true;
	if ( ! isAdmin )
		return false;
	// default this to true so if user specifies &admin=0 then it
	// cancels our admin view
	if ( hr && ! hr->getLong("admin",1) )
		return false;
	return true;
}
// . does the request carry one of the configured master passwords?
// . the password is taken from the "pwd" parameter, else the "password"
//   parameter, else the "pwd" cookie
// . returns false if no master passwords are configured, if "hr" is NULL or
//   if the supplied password is missing or empty
bool Conf::hasMasterPwd ( HttpRequest *hr ) {
	if ( m_masterPwds.length() <= 0 )
		return false;
	// no request, no password
	if ( ! hr )
		return false;
	const char *p = hr->getString("pwd");
	if ( ! p ) p = hr->getString("password");
	if ( ! p ) p = hr->getStringFromCookie("pwd");
	// an empty password must never be accepted
	if ( ! p || ! p[0] ) return false;
	const char *buf = m_masterPwds.getBufStart();
	return isInWhiteSpaceList ( p , buf );
}
// . is "ip" in the whitespace-separated list of master/connect ips?
// . always false when that list is empty
bool Conf::isMasterIp ( uint32_t ip ) {
	const bool haveList = ( m_connectIps.length() > 0 );
	if ( ! haveList ) return false;
	// render the ip as text and look for it in the list
	char dotted[16];
	const char *ipStr = iptoa(ip,dotted);
	return isInWhiteSpaceList ( ipStr , m_connectIps.getBufStart() );
}
// . alias for isMasterIp(): the same ip list is used for both checks
bool Conf::isConnectIp ( uint32_t ip ) {
return isMasterIp(ip);
}
// . let g_parms put the OBJ_CONF parameters of this object back to their
//   default values
// . saving is (re-)enabled afterwards
void Conf::reset ( ) {
	char *self = reinterpret_cast<char *>(this);
	g_parms.setToDefault ( self , OBJ_CONF , NULL );
	m_save = true;
}
// . load the configuration from "<dir>gb.conf" ("./gb.conf" if dir is NULL)
// . "dir" is prepended as-is, so it has to end in a '/'
// . if the file can not be loaded, gb.conf is (re-)created from the defaults
// . also initializes g_mem and forces a few settings to fixed values
// . returns false if the path does not fit the filename buffer or if
//   g_mem.init() fails, true otherwise
// . NOTE(review): this mixes "this" and g_conf, so it presumably is only
//   ever called on g_conf itself
bool Conf::init ( char *dir ) { // , int32_t hostId ) {
	g_parms.setToDefault ( (char *)this , OBJ_CONF ,NULL);
	m_save = true;
	char fname[1024];
	File f;
	// build the filename, never writing past the end of fname
	int n;
	if ( dir ) n = snprintf ( fname , sizeof(fname) , "%sgb.conf", dir );
	else       n = snprintf ( fname , sizeof(fname) , "./gb.conf" );
	if ( n < 0 || n >= (int)sizeof(fname) ) {
		log("gb: Path to gb.conf is too long.");
		return false;
	}
	// try regular gb.conf then
	// NOTE(review): "f" is not used after this call
	f.set ( fname );
	// make sure g_mem.maxMem is big enough temporarily
	g_conf.m_maxMem = 8000000000; // 8gb temp
	bool status = g_parms.setFromFile ( this , fname , NULL , OBJ_CONF );
	// enforce a lower bound on the loaded value
	if ( g_conf.m_maxMem < 10000000 ) g_conf.m_maxMem = 10000000;
	// if not there, create it!
	// NOTE(review): any load failure ends up here, not just a missing
	// file, and the existing gb.conf is then overwritten with defaults
	if ( ! status ) {
		log("gb: Creating %s from defaults.",fname);
		g_errno = 0;
		// set to defaults
		g_conf.reset();
		// and save it
		m_save = true;
		save();
		// clear errors
		g_errno = 0;
		status = true;
	}
	if ( ! g_mem.init ( ) ) return false;
	// always turn this off
	g_conf.m_testMem = false;
	// and this, in case you forgot to turn it off
	if ( g_conf.m_isLive ) g_conf.m_doConsistencyTesting = false;
	// this off
	g_conf.m_repairingEnabled = false;
	// NOTE(review): the original comment here said "hard-code disable
	// this -- could be dangerous" but the flag is set to true; confirm
	// which one is intended
	g_conf.m_bypassValidation = true;
	// this could too! (need this)
	g_conf.m_allowScale = true;//false;
	g_conf.m_forceIt = false;
	// sanity check
	if ( g_hostdb.m_indexSplits > MAX_SHARDS ) {
		log("db: Increase MAX_SHARDS");
		g_process.shutdownAbort(true);
	}
	// HACK: set this now
	setRootIps();
	return status;
}
// . derive m_numDns from the m_dnsIps[] array and set the dns ports to 53
// . the list ends at the first zero ip (or after MAX_DNSIPS entries)
// . if no dns ip is configured, fall back to PUBLICLY_AVAILABLE_DNS1
// . also switches m_askRootNameservers off
// . works on this object's own members throughout
void Conf::setRootIps ( ) {
	// set m_numDns based on Conf::m_dnsIps[] array
	int32_t i;
	for ( i = 0; i < MAX_DNSIPS ; i++ ) {
		// note: the slot holding the terminating zero ip gets its
		// port set as well
		m_dnsPorts[i] = 53;
		if ( ! m_dnsIps[i] ) break;
	}
	m_numDns = i;
	// fail back to public dns
	if ( m_numDns == 0 ) {
		const char *ipStr = PUBLICLY_AVAILABLE_DNS1;
		m_dnsIps[0] = atoip( ipStr , strlen(ipStr) );
		m_dnsPorts[0] = 53;
		m_numDns = 1;
	}
	// default this to off on startup for now until it works better
	m_askRootNameservers = false;
}
// . parameters can be changed on the fly so we must save Conf
bool Conf::save ( ) {
if ( ! m_save ) {
return true;
}
// fix so if we core in malloc/free we can still save conf
StackBuf<1024> fn;
fn.safePrintf("%sgb.conf",g_hostdb.m_dir);
log(LOG_INFO, "db: Saving %s", fn.getBufStart());
bool status = g_parms.saveToXml ( (char *)this , fn.getBufStart(), OBJ_CONF );
return status;
}
// . get the name of the default collection
// . this is simply the configured m_defaultColl, no hostname matching is
//   done, to keep things simple
// . falls back to "main" if m_defaultColl is empty
const char *Conf::getDefaultColl ( ) {
	const bool configured = ( m_defaultColl[0] != '\0' );
	return configured ? m_defaultColl : "main";
}