5a2cb35e6c
display from admin.html.
696 lines
24 KiB
Plaintext
696 lines
24 KiB
Plaintext
# All <, >, " and # characters that are values for a field contained herein
|
|
# must be represented as &lt;, &gt;, &#34; and &#035; respectively.
|
|
|
|
# Mem available to this process. May be exceeded due to fragmentation.
|
|
<maxMem>4000000000</>
|
|
|
|
# Below the various Gigablast databases are configured.
|
|
# <*dbMaxTreeMem> - mem used for holding new recs
|
|
# <*dbMaxDiskPageCacheMem> - disk page cache mem for this db
|
|
# <*dbMaxCacheMem> - cache mem for holding single recs
|
|
# <*dbSaveCache> - save the rec cache on exit?
|
|
# <*dbMaxCacheAge> - max age (seconds) for recs in rec cache
|
|
# See the Stats page for record counts and stats.
|
|
|
|
# How many bytes should be used for caching DNS replies?
|
|
<dnsMaxCacheMem>128000</>
|
|
|
|
# A tagdb record assigns a url or site to a ruleset. Each tagdb record is
|
|
# about 100 bytes or so.
|
|
<tagdbMaxTreeMem>1028000</>
|
|
<tagdbMaxPageCacheMem>200000</>
|
|
|
|
# A catdb record assigns a url or site to DMOZ categories. Each catdb record
|
|
# is about 100 bytes.
|
|
<catdbMaxTreeMem>1000000</>
|
|
<catdbMaxPageCacheMem>25000000</>
|
|
<catdbMaxCacheMem>0</>
|
|
|
|
# Clusterdb caches small records for site clustering and deduping.
|
|
<clusterdbMaxTreeMem>1000000</>
|
|
<clusterdbSaveCache>0</>
|
|
|
|
# Max memory for dup vector cache.
|
|
<maxVectorCacheMem>10000000</>
|
|
|
|
# Robotdb caches robots.txt files.
|
|
<robotdbMaxCacheMem>128000</>
|
|
<robotdbSaveCache>0</>
|
|
<linkdbMaxPageCacheMem>0</>
|
|
<statsdbMaxTreeMem>5000000</>
|
|
<statsdbMaxCacheMem>0</>
|
|
<statsdbMaxDiskPageCacheMem>1000000</>
|
|
|
|
# Maximum bytes of a doc that can be sent before having to read more from disk
|
|
<httpMaxSendBufSize>128000</>
|
|
|
|
# Bytes to use for caching search result pages.
|
|
<searchResultsMaxCacheMem>100000</>
|
|
|
|
# Read only mode does not allow spidering.
|
|
<readOnlyMode>0</>
|
|
|
|
# Spell check using the dictionary.
|
|
<doSpellChecking>0</>
|
|
|
|
# give narrow search suggestions.
|
|
<doNarrowSearch>0</>
|
|
|
|
# Overrides all spidering for all collections on just this host.
|
|
<localSpideringEnabled>1</>
|
|
|
|
# Overrides all add urls for all collections on just this host.
|
|
<localAddUrlEnabled>1</>
|
|
|
|
# Used by proxy to point to a temporary cluster while the original cluster is
|
|
# updated with a new binary. The temporary cluster is the same as the original
|
|
# cluster but the ports are all incremented by one from what is in the
|
|
# hosts.conf. This should ONLY be used for the proxy.
|
|
<useTemporaryCluster>0</>
|
|
|
|
# If enabled gb does the search queries in ./test-search/queries.txt and
|
|
# compares to the last run and outputs the diffs for inspection and validation.
|
|
<qaSearchTestEnabled>1</>
|
|
|
|
# Enable spidering on all hosts
|
|
<allSpidersOn>0</>
|
|
|
|
# Serves ads unless pure=1 is in cgi parms.
|
|
<adFeedEnabled>0</>
|
|
|
|
# Stripe #n contains twin #n from each group. Doing stripe balancing helps
|
|
# prevent too many query requests coming into one host. This parm is only for
|
|
# the proxy. Stripe balancing is done by default unless the parm is disabled
|
|
# on the proxy in which case it appends a &dsb=0 to the query url it sends to
|
|
# the host. The proxy alternates to which host it forwards the incoming query
|
|
# based on the stripe. It takes the number of query terms in the query into
|
|
# account to make a more even balance.
|
|
<doStripeBalancing>1</>
|
|
|
|
# Is this cluster part of a live production cluster? If this is true we make
|
|
# sure that elvtune is being set properly for best performance, otherwise, gb
|
|
# will not start up.
|
|
<isLiveCluster>0</>
|
|
|
|
# Is this cluster just used for indexing wikipedia pages?
|
|
<isWikipediaCluster>0</>
|
|
|
|
# At what temperature in Celsius should we send an email alert if a hard drive
|
|
# reaches it?
|
|
<maxHardDriveTemperature>45</>
|
|
|
|
# If a heartbeat is delayed this many milliseconds dump a core so we can see
|
|
# where the CPU was. Logs 'db: missed heartbeat by %lli ms'. Use 0 or less to
|
|
# disable.
|
|
<maxHeartbeatDelayInMilliseconds>0</>
|
|
|
|
# If a call to a message callback or message handler in the udp server takes
|
|
# more than this many milliseconds, then log it. Logs 'udp: Took %lli ms to
|
|
# call callback for msgType=0x%hhx niceness=%li'. Use -1 or less to disable
|
|
# the logging.
|
|
<maxDelayBeforeLoggingACallbackOrHandler>-1</>
|
|
|
|
# Sends emails to admin if a host goes down.
|
|
<sendEmailAlerts>1</>
|
|
|
|
# Sends to sysadmin@gigablast.com.
|
|
<sendEmailAlertsToSysadmin>1</>
|
|
|
|
# Sends to email address 1 through email server 1.
|
|
<sendEmailAlertsToEmail1>1</>
|
|
|
|
# Sends to email address 1 through email server 1 if any parm is changed.
|
|
<sendParmChangeEmailAlertsToEmail1>1</>
|
|
|
|
# Connects to this server directly when sending email 1
|
|
<emailServer1><![CDATA[10.5.54.47]]></>
|
|
|
|
# Sends to this address when sending email 1
|
|
<emailAddress1><![CDATA[5051234567@vtext.com]]></>
|
|
|
|
# The from field when sending email 1
|
|
<fromEmailAddress1><![CDATA[sysadmin@mydomain.com]]></>
|
|
|
|
# Sends to email address 2 through email server 2.
|
|
<sendEmailAlertsToEmail2>0</>
|
|
|
|
# Sends to email address 2 through email server 2 if any parm is changed.
|
|
<sendParmChangeEmailAlertsToEmail2>1</>
|
|
|
|
# Connects to this server directly when sending email 2
|
|
<emailServer2><![CDATA[mail.mydomain.com]]></>
|
|
|
|
# Sends to this address when sending email 2
|
|
<emailAddress2><![CDATA[]]></>
|
|
|
|
# The from field when sending email 2
|
|
<fromEmailAddress2><![CDATA[sysadmin@mydomain.com]]></>
|
|
|
|
# Sends to email address 3 through email server 3.
|
|
<sendEmailAlertsToEmail3>0</>
|
|
|
|
# Sends to email address 3 through email server 3 if any parm is changed.
|
|
<sendParmChangeEmailAlertsToEmail3>1</>
|
|
|
|
# Connects to this server directly when sending email 3
|
|
<emailServer3><![CDATA[mail.mydomain.com]]></>
|
|
|
|
# Sends to this address when sending email 3
|
|
<emailAddress3><![CDATA[]]></>
|
|
|
|
# The from field when sending email 3
|
|
<fromEmailAddress3><![CDATA[sysadmin@mydomain.com]]></>
|
|
|
|
# Sends to email address 4 through email server 4.
|
|
<sendEmailAlertsToEmail4>0</>
|
|
|
|
# Sends to email address 4 through email server 4 if any parm is changed.
|
|
<sendParmChangeEmailAlertsToEmail4>1</>
|
|
|
|
# Connects to this server directly when sending email 4
|
|
<emailServer4><![CDATA[mail.mydomain.com]]></>
|
|
|
|
# Sends to this address when sending email 4
|
|
<emailAddress4><![CDATA[]]></>
|
|
|
|
# The from field when sending email 4
|
|
<fromEmailAddress4><![CDATA[sysadmin@mydomain.com]]></>
|
|
|
|
# Do not send email alerts about dead hosts to anyone except
|
|
# sysadmin@gigablast.com between the times given below unless all the twins of
|
|
# the dead host are also dead. Instead, wait till after if the host is still
|
|
# dead.
|
|
<delayNonCriticalEmailAlerts>0</>
|
|
|
|
# Look for this string in the kernel buffer for sending email
|
|
<errorString1><![CDATA[]]></>
|
|
|
|
# Look for this string in the kernel buffer for sending email
|
|
<errorString2><![CDATA[]]></>
|
|
|
|
# Look for this string in the kernel buffer for sending email
|
|
<errorString3><![CDATA[]]></>
|
|
|
|
# If you have scsi drives or a slow network, say yes here to minimize data
|
|
# fetches across the network.
|
|
<preferLocalReads>0</>
|
|
|
|
# If enabled then all writes will be flushed to disk. This is generally a good
|
|
# thing.
|
|
<doSynchronousWrites>1</>
|
|
|
|
# Read what was written in a verification step. Decreases performance, but may
|
|
# help fight disk corruption mostly on Maxtors and Western Digitals.
|
|
<verifyDiskWrites>0</>
|
|
|
|
# When reindexing a document, do not re-add data that should already be in
|
|
# index or clusterdb since the last time the document was indexed. Otherwise,
|
|
# re-add the data regardless.
|
|
<doIncrementalUpdating>0</>
|
|
|
|
# Use /etc/hosts file to resolve hostnames? The /etc/hosts file is reloaded
|
|
# every minute, so if you make a change to it you might have to wait one
|
|
# minute for the change to take effect.
|
|
<useEtcHosts>0</>
|
|
|
|
# If enabled, Gigablast assumes the first half of machines in hosts.conf are
|
|
# on a different network switch than the second half, and minimizes transmits
|
|
# between the switches.
|
|
<twinsAreSplit>0</>
|
|
|
|
# When enabled Gigablast will randomly fail at allocating memory. Used for
|
|
# testing stability.
|
|
<doOutOfMemoryTesting>0</>
|
|
|
|
# When enabled Gigablast will make sure it reparses the document exactly the
|
|
# same way. It does this every 1000th document anyway, but enabling this makes
|
|
# it do it for every document.
|
|
<doConsistencyTesting>0</>
|
|
|
|
# If enabled, all servers must have two gigabit ethernet ports hooked up and
|
|
# Gigablast will round robin packets between both ethernet ports when sending
|
|
# to another host. Can speed up network transmissions as much as 2x.
|
|
<useShotgun>0</>
|
|
|
|
# If enabled, Gigablast will use quickpoll. Significantly improves
|
|
# performance. Only turn this off for testing.
|
|
<useQuickpoll>1</>
|
|
|
|
# If enabled, Gigablast will use threads.
|
|
<useThreads>1</>
|
|
|
|
# If enabled, Gigablast will use shared memory. Should really only be used on
|
|
# the live cluster; keep this off on the testing cluster since it can leak easily.
|
|
<useSharedMem>0</>
|
|
|
|
# Use disk page cache?
|
|
<useDiskPageCacheForPosdb>1</>
|
|
|
|
# Use disk page cache?
|
|
<useDiskPageCacheForDatedb>1</>
|
|
|
|
# Use disk page cache?
|
|
<useDiskPageCacheForTitledb>1</>
|
|
|
|
# Use disk page cache?
|
|
<useDiskPageCacheForSpiderdb>1</>
|
|
|
|
# Use disk page cache?
|
|
<useDiskPageCacheForTagdb>1</>
|
|
|
|
# Use disk page cache?
|
|
<useDiskPageCacheForChecksumdb>1</>
|
|
|
|
# Use disk page cache?
|
|
<useDiskPageCacheForClusterdb>1</>
|
|
|
|
# Use disk page cache?
|
|
<useDiskPageCacheForCatdb>1</>
|
|
|
|
# Use disk page cache?
|
|
<useDiskPageCacheForLinkdb>1</>
|
|
|
|
# Scan all titledb files if rec not found. You should keep this on to avoid
|
|
# corruption. Do not turn it off unless you are Matt Wells.
|
|
<scanAllIfNotFound>1</>
|
|
|
|
# For specifying if this is an interface machine. Messages are rerouted from
|
|
# this machine to the main cluster set in the hosts.conf.
|
|
<interfaceMachine>0</>
|
|
|
|
# At query time, should Gigablast generate content vectors for title records
|
|
# lacking them? This is an expensive operation, so is really just for testing
|
|
# purposes.
|
|
<generateVectorAtQueryTime>0</>
|
|
|
|
# Keep track of ips which do queries, disallow non-customers from hitting us
|
|
# too hard.
|
|
<autobanIPsWhichViolateTheQueriesPerDayQuotas>0</>
|
|
|
|
# Non-customers get this many queries per day before being autobanned
|
|
<freeQueriesPerDay>1024</>
|
|
|
|
# Non-customers get this many queries per minute before being autobanned
|
|
<freeQueriesPerMinute>30</>
|
|
|
|
# If this is non empty, http traffic will be redirected to the specified
|
|
# address.
|
|
<redirectNonrawTraffic><![CDATA[]]></>
|
|
|
|
# If this is true, gb will route download requests for web pages to proxies in
|
|
# hosts.conf. Proxies will download and compress docs before sending back.
|
|
<sendRequestsToCompressionProxy>0</>
|
|
|
|
# Enable/disable the ability to synchronize time between the cluster and the
|
|
# proxy
|
|
<synchronizeProxyToClusterTime>0</>
|
|
|
|
# Allows scaling up of hosts by deleting recs not in the correct group. This
|
|
# should only happen when copying a set of servers to the new hosts. Otherwise
|
|
# corrupted data will cause a halt.
|
|
<allowScalingOfHosts>0</>
|
|
|
|
# Allows bypass of db validation so gigablast will not halt if a corrupt db is
|
|
# discovered during load. Use this when attempting to load with a collection
|
|
# that has known corruption.
|
|
<allowBypassOfDbValidation>0</>
|
|
|
|
# IP address of the primary DNS server. Assumes UDP port 53.
|
|
<dns0>8.8.8.8</>
|
|
|
|
# IP address of the secondary DNS server. Assumes UDP port 53. Will be
|
|
# accessed in conjunction with the primary dns, so make sure this is always
|
|
# up. An ip of 0 means disabled.
|
|
<dns1>8.8.8.4</>
|
|
<dns2>0.0.0.0</>
|
|
<dns3>0.0.0.0</>
|
|
<dns4>0.0.0.0</>
|
|
<dns5>0.0.0.0</>
|
|
<dns6>0.0.0.0</>
|
|
<dns7>0.0.0.0</>
|
|
<dns8>0.0.0.0</>
|
|
<dns9>0.0.0.0</>
|
|
<dns10>0.0.0.0</>
|
|
<dns11>0.0.0.0</>
|
|
<dns12>0.0.0.0</>
|
|
<dns13>0.0.0.0</>
|
|
<dns14>0.0.0.0</>
|
|
<dns15>0.0.0.0</>
|
|
<geocoderIP1>10.5.66.11</>
|
|
<geocoderIP2>0.0.0.0</>
|
|
<geocoderIP3>0.0.0.0</>
|
|
<geocoderIP4>0.0.0.0</>
|
|
|
|
# Access the wiki coll through this proxy ip
|
|
<wikiProxyIp>0.0.0.0</>
|
|
|
|
# Access the wiki coll through this proxy port
|
|
<wikiProxyPort>0</>
|
|
|
|
# Email alerts will include the cluster name
|
|
<clusterName><![CDATA[unspecified]]></>
|
|
|
|
# Identification seen by web servers when the Gigablast spider downloads their
|
|
# web pages. It is polite to insert a contact email address here so webmasters
|
|
# that experience problems from the Gigablast spider have somewhere to vent.
|
|
<spiderUserAgent><![CDATA[GigablastOpenSource/1]]></>
|
|
|
|
# If this is true, gb will send accept-encoding: gzip when doing http downloads.
|
|
<askForGzippedDocsWhenDownloading>0</>
|
|
|
|
# When no collection is explicitly specified, assume this collection name.
|
|
<defaultCollection><![CDATA[main]]></>
|
|
|
|
# Collection to be used for directory searching and display of directory topic
|
|
# pages.
|
|
<directoryCollection><![CDATA[]]></>
|
|
|
|
# Hostname of the server providing the directory. Leave empty to use this host.
|
|
<directoryHostname><![CDATA[]]></>
|
|
|
|
# Total incoming bandwidth used by all spiders should not exceed this many
|
|
# kilobits per second.
|
|
<maxIncomingBandwidthForSpider>999999.000</>
|
|
|
|
# Spiders will shed load when their host exceeds this value for the 1-minute
|
|
# load average in /proc/loadavg. The value 0.0 disables this feature.
|
|
<max1minuteSlidingwindowLoadavg>0.000</>
|
|
|
|
# Maximum number of threads to use per Gigablast process for intersecting
|
|
# docid lists. Generally, set this to the number of CPUs on the machine.
|
|
<maxCpuThreads>1</>
|
|
|
|
# Maximum number of pages to index or delete from index per second for all
|
|
# hosts combined.
|
|
<maxPagesPerSecond>999999.000</>
|
|
|
|
# Consider a host in the Gigablast network to be dead if it does not respond
|
|
# to successive pings for this number of seconds. Gigablast does not send
|
|
# requests to dead hosts. Outstanding requests may be re-routed to a twin.
|
|
<deadHostTimeout>4000</>
|
|
|
|
# Send an email after a host has not responded to successive pings for this
|
|
# many milliseconds.
|
|
<sendEmailTimeout>62000</>
|
|
|
|
# Wait this many milliseconds before pinging the next host. Each host pings
|
|
# all other hosts in the network.
|
|
<pingSpacer>100</>
|
|
|
|
# Send email alerts when average query latency goes above this threshold.
|
|
<averageQueryLatencyThreshold>2.000</>
|
|
|
|
# Send email alerts when query success rate goes below this threshold.
|
|
<querySuccessRateThreshold>0.850</>
|
|
|
|
# Record this number of query times before calculating average query latency.
|
|
<numberOfQueryTimesInAverage>3000</>
|
|
|
|
# If we reach this many corrupt index lists, send an admin email. Set to -1
|
|
# to disable.
|
|
<maxCorruptIndexLists>5</>
|
|
|
|
# Maximum number of threads to use per Gigablast process for writing data to
|
|
# the disk. Keep low to reduce file interlace effects and impact on query
|
|
# response time.
|
|
<maxWriteThreads>1</>
|
|
|
|
# Maximum number of threads to use per Gigablast process for accessing the
|
|
# disk for index-building purposes. Keep low to reduce impact on query
|
|
# response time. Increase for RAID systems or when initially building an index.
|
|
<maxSpiderReadThreads>7</>
|
|
|
|
# This particular number applies to all reads above 1MB.
|
|
<maxSpiderBigReadThreads>3</>
|
|
|
|
# This particular number applies to all reads above 100K.
|
|
<maxSpiderMediumReadThreads>4</>
|
|
|
|
# This particular number applies to all reads above 1MB.
|
|
<maxSpiderSmallReadThreads>5</>
|
|
|
|
# Maximum number of threads to use per Gigablast process for accessing the
|
|
# disk for querying purposes. IDE systems tend to be more responsive when this
|
|
# is low. Increase for SCSI or RAID systems.
|
|
<maxQueryReadThreads>20</>
|
|
|
|
# This particular number applies to all reads above 1MB.
|
|
<maxQueryBigReadThreads>20</>
|
|
|
|
# This particular number applies to all reads above 100K.
|
|
<maxQueryMediumReadThreads>20</>
|
|
|
|
# This particular number applies to all reads above 1MB.
|
|
<maxQuerySmallReadThreads>20</>
|
|
|
|
# Word or phrase must be present in this percent of documents in order to
|
|
# qualify as a spelling recommendation.
|
|
<minPopularityForSpeller>0.010</>
|
|
|
|
# Percent to weight phrases in queries.
|
|
<phraseWeight>100.000</>
|
|
|
|
# Percent of how much to use words to phrase ratio weights.
|
|
<weightscppSliderParmtmp>90</>
|
|
|
|
# When passing queries around the network, send the raw string instead of the
|
|
# serialized query if the required buffer is bigger than this. Smaller values
|
|
# decrease network traffic for large queries at the expense of processing time.
|
|
<maximumSerializedQuerySize>8192</>
|
|
|
|
# Read and write this many bytes at a time when merging files. Smaller values
|
|
# are kinder to query performance, but the merge takes longer. Use at least
|
|
# 1000000 for fast merging.
|
|
<mergeBufSize>800000</>
|
|
|
|
# minRecSizes for Catdb lookups
|
|
<catdbMinRecSizes>100000000</>
|
|
|
|
# Maximum sockets available to serve incoming HTTP requests. Too many
|
|
# outstanding requests will increase query latency. Excess requests will
|
|
# simply have their sockets closed.
|
|
<maxHttpSockets>100</>
|
|
|
|
# Maximum sockets available to serve incoming HTTPS requests. Like max http
|
|
# sockets, but for secure sockets.
|
|
<maxHttpsSockets>100</>
|
|
|
|
# Copy data in memory to disk after this many minutes have passed without the
|
|
# data having been dumped or saved to disk. Use 0 to disable.
|
|
<autoSaveFrequency>5</>
|
|
|
|
# Add this number to the total document count in the index. Just used for
|
|
# displaying on the homepage.
|
|
<docCountAdjustment>0</>
|
|
|
|
# Generates profiling data for callbacks on page performance
|
|
<dynamicPerformanceGraph>0</>
|
|
|
|
# Enable profiler to do accounting of time taken by functions.
|
|
<enableProfiling>1</>
|
|
|
|
# Profiler will not show functions which take less than this many milliseconds
|
|
# in the log or on the performance graph.
|
|
<minimumProfilingThreshold>10</>
|
|
|
|
# Produce a LOG_TIMING log message for each callback called, along with the
|
|
# time it took. Profiler must be enabled.
|
|
<sequentialProfiling>0</>
|
|
|
|
# Archive system statistics information in Statsdb.
|
|
<useStatsdb>1</>
|
|
|
|
# How many seconds should we cache a search results page for?
|
|
<searchResultsCacheMaxAge>10800</>
|
|
|
|
# Add IPs here to bar them from accessing this gigablast server.
|
|
<banIps><![CDATA[]]></>
|
|
|
|
# Add IPs here to give them an infinite query quota.
|
|
<allowIps><![CDATA[]]></>
|
|
|
|
# Don't try to autoban queries that have one of these codes. Also, the code
|
|
# must be valid for us to use &uip=IPADDRESS as the IP address of the
|
|
# submitter for purposes of autoban AND purposes of addurl daily quotas.
|
|
<validCodes><![CDATA[]]></>
|
|
|
|
# Append extra default parms to queries that match certain substrings.
|
|
# Format: text to match in url, followed by a space, then the list of extra
|
|
# parms as they would appear appended to the url. One match per line.
|
|
<extraParms><![CDATA[]]></>
|
|
|
|
# ban any query that matches this list of substrings. Must match all
|
|
# comma-separated strings on the same line. ('\n' = OR, ',' = AND)
|
|
<banRegex><![CDATA[]]></>
|
|
|
|
# Add facebook user IDs here so those people can turk the results. Later we
|
|
# may limit each person to turking a geographic region.
|
|
<supterturks><![CDATA[]]></>
|
|
|
|
# Add users here. The format is
|
|
# collection:ip:username:password:relogin:pages:tagnames Username and password
|
|
# cannot be blank. You can specify * for collection to indicate all
|
|
# collections. * can be used in IP as wildcard. * in pages means user has
|
|
# access to all pages. Also you can specify individual pages. A '-' sign at
|
|
# the start of page means user is not allowed to access that page. Please
|
|
# refer to the page reference table at the bottom of this page for available
|
|
# pages. If you want to just login once and avoid relogin for gb shutdowns
|
|
# then set relogin=1, else set it to 0. If relogin is 1 your login will never
|
|
# expire either.<br> Ex: 1. master user -> *:*:master:master:1:*:english<br>
|
|
# 2. public user ->
|
|
# *:*:public:1234:0:index.html,get,search,login,dir:english<br>3. turk user ->
|
|
#
|
|
#
|
|
#
|
|
#
|
|
#
|
|
# 8.58.122:main:turk:1234:0:pageturkhome,pageturk,pageturkget,get,login:english
|
|
<users><![CDATA[*:*:mwells:mwells62:1:*:
|
|
*:*:public:1234:0:index.html,get,search,login,dir:]]></>
|
|
|
|
# Allow UDP requests from this list of IPs. Any datagram received not coming
|
|
# from one of these IPs, or an IP in hosts.conf, is dropped. If another
|
|
# cluster is accessing this cluster for getting link text or whatever, you
|
|
# will need to list the IPs of the accessing machines here. These IPs are also
|
|
# used to allow access to the HTTP server even if it was disabled in the
|
|
# Master Controls. IPs that have 0 as their least significant byte are
|
|
# treated as wildcards for IP blocks. That is, 1.2.3.0 means 1.2.3.*.
|
|
<connectIp>10.5.0.3</>
|
|
|
|
# Log GET and POST requests received from the http server?
|
|
<logHttpRequests>1</>
|
|
|
|
# Should we log queries that are autobanned? They can really fill up the log.
|
|
<logAutobannedQueries>1</>
|
|
|
|
# If query took this many milliseconds or longer, then log the query and the
|
|
# time it took to process.
|
|
<logQueryTimeThreshold>5000</>
|
|
|
|
# Log query reply in proxy, but only for those queries above the time
|
|
# threshold above.
|
|
<logQueryReply>0</>
|
|
|
|
# Log status of spidered or injected urls?
|
|
<logSpideredUrls>1</>
|
|
|
|
# Log messages if Gigablast runs out of udp sockets?
|
|
<logNetworkCongestion>0</>
|
|
|
|
# Log messages not related to an error condition, but meant more to give an
|
|
# idea of the state of the gigablast process. These can be useful when
|
|
# diagnosing problems.
|
|
<logInformationalMessages>1</>
|
|
|
|
# Log it when document not added due to quota breach. Log it when url is too
|
|
# long and it gets truncated.
|
|
<logLimitBreeches>0</>
|
|
|
|
# Log various debug messages.
|
|
<logDebugAdminMessages>0</>
|
|
<logDebugBuildMessages>0</>
|
|
<logDebugBuildTimeMessages>0</>
|
|
<logDebugDatabaseMessages>0</>
|
|
<logDebugDiskMessages>0</>
|
|
<logDebugDnsMessages>0</>
|
|
<logDebugHttpMessages>0</>
|
|
<logDebugLoopMessages>0</>
|
|
<logDebugLanguageDetectionMessages>0</>
|
|
<logDebugLinkInfo>0</>
|
|
<logDebugMemMessages>0</>
|
|
<logDebugMemUsageMessages>0</>
|
|
<logDebugNetMessages>0</>
|
|
<logDebugPostQueryRerankMessages>0</>
|
|
<logDebugQueryMessages>0</>
|
|
<logDebugQuotaMessages>0</>
|
|
<logDebugRobotsMessages>0</>
|
|
<logDebugSpiderCacheMessages>0</>
|
|
<logDebugSpellerMessages>0</>
|
|
<logDebugSectionsMessages>0</>
|
|
<logDebugSeoInsertMessages>1</>
|
|
<logDebugSeoMessages>0</>
|
|
<logDebugStatsMessages>0</>
|
|
<logDebugSummaryMessages>0</>
|
|
<logDebugSpiderMessages>0</>
|
|
<logDebugUrlAttempts>0</>
|
|
<logDebugSpiderDownloads>0</>
|
|
<logDebugFacebook>0</>
|
|
<logDebugTagdbMessages>0</>
|
|
<logDebugTcpMessages>0</>
|
|
<logDebugThreadMessages>0</>
|
|
<logDebugTitleMessages>0</>
|
|
<logDebugTimedbMessages>0</>
|
|
<logDebugTopicMessages>0</>
|
|
<logDebugTopDocMessages>0</>
|
|
<logDebugUdpMessages>0</>
|
|
<logDebugUnicodeMessages>0</>
|
|
<logDebugRepairMessages>0</>
|
|
<logDebugPubDateExtractionMessages>0</>
|
|
|
|
# Log various timing related messages.
|
|
<logTimingMessagesForBuild>0</>
|
|
|
|
# Log various timing related messages.
|
|
<logTimingMessagesForAdmin>0</>
|
|
<logTimingMessagesForDatabase>0</>
|
|
<logTimingMessagesForNetworkLayer>0</>
|
|
<logTimingMessagesForQuery>0</>
|
|
|
|
# Log various timing related messages.
|
|
<logTimingMessagesForSpcache>0</>
|
|
<logTimingMessagesForRelatedTopics>0</>
|
|
|
|
# Log reminders to the programmer. You do not need this.
|
|
<logReminderMessages>0</>
|
|
|
|
# If enabled, gigablast will repair the rdbs as specified by the parameters
|
|
# below. When a particular collection is in repair mode, it can not spider or
|
|
# merge titledb files.
|
|
<repairModeEnabled>0</>
|
|
|
|
# Comma or space separated list of the collections to repair or rebuild.
|
|
<collectionsToRepairOrRebuild><![CDATA[main]]></>
|
|
|
|
# In bytes.
|
|
<memoryToUseForRepair>300000000</>
|
|
|
|
# Maximum number of outstanding inject spiders for repair.
|
|
<maxRepairSpiders>32</>
|
|
|
|
# If enabled, gigablast will reinject the content of all title recs into a
|
|
# secondary rdb system. That will become the primary rdb system when complete.
|
|
<fullRebuild>0</>
|
|
|
|
# If enabled, gigablast will keep the new spiderdb records when doing the full
|
|
# rebuild or the spiderdb rebuild.
|
|
<keepNewSpiderdbRecs>1</>
|
|
|
|
# If enabled, gigablast will recycle the link info when rebuilding titledb.
|
|
<recycleLinkInfo>0</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildTitledb>1</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildPosdb>0</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildClusterdb>0</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildSpiderdb>0</>
|
|
|
|
# If enabled, gigablast will rebuild this rdb
|
|
<rebuildLinkdb>0</>
|
|
|
|
# If disabled, gigablast will skip root urls.
|
|
<rebuildRootUrls>1</>
|
|
|
|
# If disabled, gigablast will skip non-root urls.
|
|
<rebuildNonrootUrls>1</>
|
|
|
|
# When rebuilding spiderdb and scanning it for new spiderdb records, should a
|
|
# tagdb lookup be performed? Runs much much faster without it. Will also keep
|
|
# the original doc quality and spider priority intact.
|
|
<skipTagdbLookup>0</>
|