added gbdocspiderdate and gbdocindexdate terms

these are indexed just for docs and not for spider reply
"documents". do not index plain terms for CT_STATUS spider
reply docs. create gb.conf from defaults if it does not
exist and take it out of the repo.
This commit is contained in:
Matt Wells
2014-06-19 15:27:46 -07:00
parent 27193d444a
commit aaec46f612
5 changed files with 156 additions and 461 deletions
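
For reference, the sort queries these terms are meant to enable, going by the comments added to hashDateNumbers() below (illustrative only):

gbsortby:gbspiderdate      last spidered date, matches spider status "documents" too
gbsortby:gbindexdate       last indexed date, matches spider status "documents" too
gbsortby:gbdocspiderdate   last spidered date, real documents only
gbsortby:gbdocindexdate    last indexed date, real documents only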

@@ -182,12 +182,30 @@ bool Conf::init ( char *dir ) { // , long hostId ) {
m_isLocal = false;
if ( dir ) sprintf ( fname , "%sgb.conf", dir );
else sprintf ( fname , "./gb.conf" );
// try regular gb.conf then
f.set ( fname );
}
// make sure g_mem.maxMem is big enough temporarily
if ( g_mem.m_maxMem < 10000000 ) g_mem.m_maxMem = 10000000;
bool status = g_parms.setFromFile ( this , fname , NULL , OBJ_CONF );
// if not there, create it!
if ( ! status ) {
log("gb: Creating %s from defaults.",fname);
g_errno = 0;
// set to defaults
g_conf.reset();
// and save it
//log("gb: Saving %s",fname);
m_save = true;
save();
// clear errors
g_errno = 0;
status = true;
}
// ignore if yippy
if ( g_isYippy ) {
//g_conf.m_doAutoBan = true;
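
Net effect of the Conf::init() change above: a missing gb.conf is no longer fatal. When setFromFile() fails, gb logs "Creating ... from defaults.", resets g_conf and writes a default gb.conf via save(), then keeps starting up. That is what allows the checked-in gb.conf to be deleted further down; the generated file should look roughly like the one being removed.
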
@@ -415,7 +433,7 @@ bool Conf::save ( ) {
if(access(fname2, F_OK) == 0) unlink(fname2);
if(link(fname, fname2) == 0) {
unlink(fname);
log(LOG_INFO,"admin: Saved %s.",fname);
log(LOG_INFO,"admin: Saved %s.",fname2);
} else {
log(LOG_INFO,"admin: Unable to save %s:%s",
fname, strerror(errno));

@@ -186,6 +186,8 @@ static long long s_lastTimeStart = 0LL;
void XmlDoc::reset ( ) {
m_indexedTime = 0;
m_didDelete = false;
m_metaList2.purge();
@@ -950,8 +952,8 @@ long XmlDoc::getSpideredTime ( ) {
m_spideredTime = date;
// hack for test coll which has fake vals for these because
// the SpiderRequest::m_addedTime and m_parentPrevSpiderTime
m_minPubDate = m_spideredTime - 48*3600;
m_maxPubDate = m_spideredTime - 24*3600;
//m_minPubDate = m_spideredTime - 48*3600;
//m_maxPubDate = m_spideredTime - 24*3600;
return m_spideredTime;
}
@@ -1231,7 +1233,8 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
m_niceness = niceness;
m_version = TITLEREC_CURRENT_VERSION;
m_versionValid = true;
/*
// set min/max pub dates right away
m_minPubDate = -1;
m_maxPubDate = -1;
@@ -1245,6 +1248,7 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
m_minPubDate = sreq->m_parentPrevSpiderTime;
m_maxPubDate = sreq->m_addedTime;
}
*/
// this is used for removing the rec from doledb after we spider it
m_doledbKey.setMin();
@@ -5000,8 +5004,21 @@ Dates *XmlDoc::getDates ( ) {
if ( *isRSS ) isXml = true;
if ( *ctype == CT_XML ) isXml = true;
long minPubDate = -1;
long maxPubDate = -1;
// parentPrevSpiderTime is 0 if that was the first time that the
// parent was spidered, in which case isNewOutlink will always be set
// for every outlink it had!
if ( m_sreqValid &&
m_sreq.m_isNewOutlink &&
m_sreq.m_parentPrevSpiderTime ) {
// pub date is somewhere between these two times
minPubDate = m_sreq.m_parentPrevSpiderTime;
maxPubDate = m_sreq.m_addedTime;
}
// now set part2 , returns false and sets g_errno on error
if ( ! m_dates.setPart2 ( aa , m_minPubDate, m_maxPubDate,//osvt,
if ( ! m_dates.setPart2 ( aa , minPubDate, maxPubDate,//osvt,
isXml , *isRoot )) {
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// note it
@@ -25130,7 +25147,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
if ( ! hashCountry ( table ) ) return NULL;
if ( ! hashTagRec ( table ) ) return NULL;
// hash for gbsortby:gbspiderdate
if ( ! hashDateNumbers ( table ) ) return NULL;
if ( ! hashDateNumbers ( table , true ) ) return NULL;
// has gbhasthumbnail:1 or 0
if ( ! hashImageStuff ( table ) ) return NULL;
// and the json itself
@@ -25203,7 +25220,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
if ( ! hashLinks ( table ) ) return NULL;
if ( ! hashDateNumbers ( table ) ) return NULL;
if ( ! hashDateNumbers ( table , true ) ) return NULL;
if ( ! hashMetaTags ( table ) ) return NULL;
if ( ! hashMetaZip ( table ) ) return NULL;
if ( ! hashDMOZCategories( table ) ) return NULL;
@@ -25427,26 +25444,18 @@ SafeBuf *XmlDoc::getSpiderReplyMetaList2 ( SpiderReply *reply ) {
hi.m_desc = "spider error msg";
if ( ! hashString( mstrerror(m_indexCode) , &hi ) ) return NULL;
hi.m_prefix = "gbdocid";
hi.m_desc = "docid";
bufLen = sprintf ( buf , "%llu", *uqd ) ;
if ( ! hashString( buf , &hi ) ) return NULL;
//hi.m_prefix = "gbdocid";
//hi.m_desc = "docid";
//bufLen = sprintf ( buf , "%llu", *uqd ) ;
//if ( ! hashString( buf , &hi ) ) return NULL;
// . then the url. url: site: ip: etc. terms
// . do NOT hash non-fielded terms so we do not get "status"
// results polluting the serps => false
if ( ! hashUrl ( &tt4 , false ) ) return NULL;
// hash the last spidered date, very useful!
hi.m_hashGroup = 0;// this doesn't matter, it's a numeric field
hi.m_desc = "last spidered date";
// make this different so it doesn't coexist with regular results
// when someone does a gbsortby:gbspiderdate query
//hi.m_prefix = "gbreplyspiderdate";
hi.m_prefix = "gbspiderdate";
if ( reply->m_spideredTime <= 0 ) { char *xx=NULL;*xx=0; }
bufLen = sprintf ( buf , "%lu", reply->m_spideredTime );
if ( ! hashNumber ( buf , buf , bufLen , &hi ) ) return NULL;
// false --> do not hash the gbdoc* terms
hashDateNumbers ( &tt4 , false );
// store keys in safebuf then to make our own meta list
addTable144 ( &tt4 , *uqd , &m_spiderReplyMetaList );
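
So a CT_STATUS spider status "document" now gets only fielded terms: the spider error msg term, the url/site/ip style fields (hashUrl called with false so plain terms stay out of normal serps), and gbspiderdate plus gbindexdate via hashDateNumbers(&tt4, false). The explicit gbdocid hashing above could be commented out because hashUrl() now hashes gbdocid before its new early return (see the hashUrl hunks below), and the gbdoc* date variants are deliberately skipped for these docs.
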
@@ -25654,13 +25663,24 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
return true;
}
// slightly greater than m_spideredTime, which is the download time.
// we use this for sorting as well, like for the widget, so things
// don't appear to be added out of order and miss the top spot
// of the widget list.
long XmlDoc::getIndexedTime() {
if ( m_indexedTimeValid ) return m_indexedTime;
m_indexedTime = getTimeGlobal();
return m_indexedTime;
}
// . hash dates for sorting by using gbsortby: and gbrevsortby:
// . do 'gbsortby:gbspiderdate' as your query to see this in action
bool XmlDoc::hashDateNumbers ( HashTableX *tt ) {
bool XmlDoc::hashDateNumbers ( HashTableX *tt , bool hashgbdocTerms) {
// bail if the spidered time is not set yet
if ( ! m_spideredTimeValid ) return true;
long indexedTime = getIndexedTime();
// first the last spidered date
HashInfo hi;
@@ -25671,10 +25691,38 @@ bool XmlDoc::hashDateNumbers ( HashTableX *tt ) {
char buf[64];
long bufLen = sprintf ( buf , "%lu", m_spideredTime );
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
return false;
// and index time is >= spider time, so you want to sort by that for
// the widget for instance
hi.m_desc = "last indexed date";
hi.m_prefix = "gbindexdate";
bufLen = sprintf ( buf , "%lu", indexedTime );
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
return false;
// do not index the rest if we are a "spider reply" document
// which is like a fake document for seeing spider statuses
if ( ! hashgbdocTerms ) return true;
// CT_STATUS spider status "documents" also index gbspiderdate,
// so index gbdocspiderdate here for real docs only. that way a
// gbsortby:gbdocspiderdate query gets only real DOCUMENTS, not the
// spider status "documents"
hi.m_desc = "doc last spidered date";
hi.m_prefix = "gbdocspiderdate";
bufLen = sprintf ( buf , "%lu", m_spideredTime );
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
return false;
hi.m_desc = "doc last indexed date";
hi.m_prefix = "gbdocindexdate";
bufLen = sprintf ( buf , "%lu", indexedTime );
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
return false;
// all done
return true;
}
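
The two callers above pick the variant: hashAll() passes true, so real documents get gbspiderdate, gbindexdate, gbdocspiderdate and gbdocindexdate, while getSpiderReplyMetaList2() passes false, so spider status "documents" only get the first two. That is what makes a gbsortby:gbdocspiderdate query return real documents only.
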
@@ -26285,35 +26333,33 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool hashNonFieldTerms ) {
if ( ! hashSingleTerm(ext,elen,&hi ) ) return false;
setStatus ( "hashing gbdocid" );
hi.m_prefix = "gbdocid";
char buf2[32];
sprintf(buf2,"%llu",(m_docId) );
if ( ! hashSingleTerm(buf2,gbstrlen(buf2),&hi) ) return false;
// if indexing a json diffbot object, index
// gbparenturl:xxxx of the original url from which the json was
// datamined. we use this so we can act as a diffbot json cache.
if ( m_isDiffbotJSONObject ) {
setStatus ( "hashing gbparenturl term");
char *p = fu->getUrl() + fu->getUrlLen() - 1;
// back up to - as in "http://xyz.com/foo-diffbotxyz123456"
for ( ; *p && *p != '-' ; p-- );
// set up the hashing parms
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
hi.m_desc = "diffbot parent url";
// append a "www." as part of normalization
uw.set ( fu->getUrl() , p - fu->getUrl() , true );
hi.m_prefix = "gbparenturl";
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
return false;
}
if ( ! hashNonFieldTerms ) return true;
setStatus ( "hashing url mid domain");
// the final score
//long plainScore = (long)(256.0 * boost1 * boost2 * fw);
// update parms
hi.m_prefix = NULL;
hi.m_desc = "middle domain";//tmp3;
hi.m_hashGroup = HASHGROUP_INURL;
// if parm "index article content only" is true, do not index this!
//if ( m_eliminateMenus ) plainScore = 0;
//char *mid = fu->getMidDomain ();
//long mlen = fu->getMidDomainLen();
//hi.m_desc = "url mid dom";
//if ( ! hashString ( mid,mlen ,&hi ) ) return false;
//hi.m_desc = "url host";
char *host = fu->getHost ();
long hlen = fu->getHostLen ();
if ( ! hashString ( host,hlen,&hi)) return false;
setStatus ( "hashing url path");
// hash the path plain
if ( ! hashString (path,plen,&hi) ) return false;
setStatus ( "hashing SiteGetter terms");
//
@@ -26337,6 +26383,10 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool hashNonFieldTerms ) {
// . that way we do not confuse all the pages in dictionary.com or
// wikipedia.org as subsites!!
if ( ! m_links.hasSubdirOutlink() ) add = false;
char *host = fu->getHost ();
long hlen = fu->getHostLen ();
// tags from here out
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_shardByTermId = true;
@@ -26371,30 +26421,28 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool hashNonFieldTerms ) {
hi.m_prefix = "urlhashdiv100";
if ( ! hashString(buf,blen,&hi) ) return false;
setStatus ( "hashing gbdocid" );
hi.m_prefix = "gbdocid";
char buf2[32];
sprintf(buf2,"%llu",(m_docId) );
if ( ! hashSingleTerm(buf2,gbstrlen(buf2),&hi) ) return false;
// if indexing a json diffbot object, index
// gbparenturl:xxxx of the original url from which the json was
// datamined. we use this so we can act as a diffbot json cache.
if ( m_isDiffbotJSONObject ) {
setStatus ( "hashing gbparenturl term");
char *p = fu->getUrl() + fu->getUrlLen() - 1;
// back up to - as in "http://xyz.com/foo-diffbotxyz123456"
for ( ; *p && *p != '-' ; p-- );
// set up the hashing parms
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
hi.m_desc = "diffbot parent url";
// append a "www." as part of normalization
uw.set ( fu->getUrl() , p - fu->getUrl() , true );
hi.m_prefix = "gbparenturl";
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
return false;
}
setStatus ( "hashing url mid domain");
// the final score
//long plainScore = (long)(256.0 * boost1 * boost2 * fw);
// update parms
hi.m_prefix = NULL;
hi.m_desc = "middle domain";//tmp3;
hi.m_hashGroup = HASHGROUP_INURL;
// if parm "index article content only" is true, do not index this!
//if ( m_eliminateMenus ) plainScore = 0;
//char *mid = fu->getMidDomain ();
//long mlen = fu->getMidDomainLen();
//hi.m_desc = "url mid dom";
//if ( ! hashString ( mid,mlen ,&hi ) ) return false;
//hi.m_desc = "url host";
if ( ! hashString ( host,hlen,&hi)) return false;
setStatus ( "hashing url path");
// hash the path plain
if ( ! hashString (path,plen,&hi) ) return false;
return true;
}
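
Reading the hashUrl() hunks together, the gbdocid and gbparenturl hashing moves ahead of the new "if ( ! hashNonFieldTerms ) return true;" check, while the plain middle-domain, host and path hashing now sits after it. So when getSpiderReplyMetaList2() calls hashUrl(&tt4, false), the fielded gbdocid/gbparenturl terms are still indexed but the plain url words are not.
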
@@ -32081,7 +32129,7 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
);
/*
char *ms = "-1";
if ( m_minPubDate != -1 ) ms = asctime(gmtime ( &m_minPubDate ));
sb->safePrintf (
@@ -32097,7 +32145,7 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
"<td>max pub date</td>"
"<td>%s UTC</td>"
"</tr>\n" , ms );
*/
// our html template fingerprint
sb->safePrintf ("<tr><td>tag pair hash 32</td><td>");

@@ -280,8 +280,12 @@ class XmlDoc {
long m_siteNumInlinksUniqueIp; // m_siteNumInlinksFresh
long m_siteNumInlinksUniqueCBlock; // m_sitePop;
time_t m_spideredTime;
time_t m_minPubDate;
time_t m_maxPubDate;
// just don't throw away any relevant SpiderRequests and we have
// the data that m_minPubDate and m_maxPubDate provided
//time_t m_minPubDate;
//time_t m_maxPubDate;
time_t m_indexedTime; // slightly > m_spideredTime
uint32_t m_reserved32;
time_t m_pubDate; // aka m_datedbDate
//time_t m_nextSpiderTime;
time_t m_firstIndexedDate;
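
In the class layout, m_indexedTime plus the new uint32_t m_reserved32 sit where the m_minPubDate/m_maxPubDate pair used to be, presumably to keep the serialized XmlDoc header the same size (an assumption based on the field name; the commit does not say so).
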
@@ -473,6 +477,10 @@ class XmlDoc {
// we now call this right away rather than at download time!
long getSpideredTime();
// time right before adding the termlists to the index, etc.
// whereas spider time is the download time
long getIndexedTime();
// another entry point, like set3() kinda
bool loadFromOldTitleRec ();
@@ -773,7 +781,7 @@ class XmlDoc {
bool hashDMOZCategories ( class HashTableX *table ) ;
bool hashLinks ( class HashTableX *table ) ;
bool hashUrl ( class HashTableX *table , bool hashNonFieldTerms=true) ;
bool hashDateNumbers ( class HashTableX *tt ) ;
bool hashDateNumbers ( class HashTableX *tt , bool hashAll ) ;
bool hashSections ( class HashTableX *table ) ;
bool hashIncomingLinkText ( class HashTableX *table ,
bool hashAnomalies ,
@@ -1193,6 +1201,7 @@ class XmlDoc {
bool m_firstIpValid;
bool m_spideredTimeValid;
//bool m_nextSpiderTimeValid;
bool m_indexedTimeValid;
bool m_firstIndexedValid;
bool m_isInIndexValid;
bool m_wasInIndexValid;

gb.conf (382 lines deleted)

@@ -1,382 +0,0 @@
# All <, >, " and # characters that are values for a field contained herein
# must be represented as &lt;, &gt;, &#34; and &#035; respectively.
# Mem available to this process. May be exceeded due to fragmentation.
<maxMem>4000000000</>
# Below the various Gigablast databases are configured.
# <*dbMaxTreeMem> - mem used for holding new recs
# <*dbMaxDiskPageCacheMem> - disk page cache mem for this db
# <*dbMaxCacheMem> - cache mem for holding single recs
# <*dbSaveCache> - save the rec cache on exit?
# <*dbMaxCacheAge> - max age (seconds) for recs in rec cache
# See that Stats page for record counts and stats.
# How many bytes should be used for caching DNS replies?
<dnsMaxCacheMem>128000</>
# A tagdb record assigns a url or site to a ruleset. Each tagdb record is
# about 100 bytes or so.
<tagdbMaxTreeMem>1028000</>
<tagdbMaxPageCacheMem>200000</>
# A catdb record assigns a url or site to DMOZ categories. Each catdb record
# is about 100 bytes.
<catdbMaxTreeMem>1000000</>
<catdbMaxPageCacheMem>25000000</>
<catdbMaxCacheMem>0</>
# Clusterdb caches small records for site clustering and deduping.
<clusterdbMaxTreeMem>1000000</>
<clusterdbSaveCache>0</>
# Max memory for dup vector cache.
<maxVectorCacheMem>10000000</>
# Robotdb caches robot.txt files.
<robotdbMaxCacheMem>128000</>
<robotdbSaveCache>0</>
<linkdbMaxPageCacheMem>0</>
<statsdbMaxTreeMem>5000000</>
<statsdbMaxCacheMem>0</>
<statsdbMaxDiskPageCacheMem>1000000</>
# Maximum bytes of a doc that can be sent before having to read more from disk
<httpMaxSendBufSize>128000</>
# Bytes to use for caching search result pages.
<searchResultsMaxCacheMem>100000</>
# Read only mode does not allow spidering.
<readOnlyMode>0</>
# Controls all spidering for all collections
<spideringEnabled>1</>
# What is the maximum number of web pages the spider is allowed to download
# simultaneously for ALL collections PER HOST?
<maxTotalSpiders>100</>
# Can people use the add url interface to add urls to the index?
<addUrlEnabled>1</>
# Save data in memory to disk after this many minutes have passed without the
# data having been dumped or saved to disk. Use 0 to disable.
<autoSaveFrequency>5</>
# Maximum sockets available to serve incoming HTTP requests. Too many
# outstanding requests will increase query latency. Excess requests will
# simply have their sockets closed.
<maxHttpSockets>100</>
# Maximum sockets available to serve incoming HTTPS requests. Like max http
# sockets, but for secure sockets.
<maxHttpsSockets>100</>
# Identification seen by web servers when the Gigablast spider downloads their
# web pages. It is polite to insert a contact email address here so webmasters
# that experience problems from the Gigablast spider have somewhere to vent.
<spiderUserAgent><![CDATA[GigablastOpenSource/1.0]]></>
# If this is true, gb will send Accept-Encoding: gzip to web servers when
# doing http downloads.
<askForGzippedDocsWhenDownloading>0</>
# How many seconds should we cache a search results page for?
<searchResultsCacheMaxAge>10800</>
# Keep track of ips which do queries, disallow non-customers from hitting us
# too hard.
<autobanIPsWhichViolateTheQueriesPerDayQuotas>0</>
# If a call to a message callback or message handler in the udp server takes
# more than this many milliseconds, then log it. Logs 'udp: Took %lli ms to
# call callback for msgType=0x%hhx niceness=%li'. Use -1 or less to disable
# the logging.
<maxDelayBeforeLoggingACallbackOrHandler>-1</>
# Sends emails to admin if a host goes down.
<sendEmailAlerts>0</>
# Do not send email alerts about dead hosts to anyone except
# sysadmin@gigablast.com between the times given below unless all the twins of
# the dead host are also dead. Instead, wait till after if the host is still
# dead.
<delayNonCriticalEmailAlerts>0</>
# Email alerts will include the cluster name
<clusterName><![CDATA[unspecified]]></>
# Send an email after a host has not responded to successive pings for this
# many milliseconds.
<sendEmailTimeout>62000</>
# Send email alerts when query success rate goes below this threshold.
# (percent rate between 0.0 and 1.0)
<querySuccessRateThreshold>0.850000</>
# Send email alerts when average query latency goes above this threshold. (in
# seconds)
<averageQueryLatencyThreshold>2.000000</>
# Record this number of query times before calculating average query latency.
<numberOfQueryTimesInAverage>300</>
# At what temperature in Celsius should we send an email alert if a hard drive
# reaches it?
<maxHardDriveTemperature>45</>
# Look for this string in the kernel buffer for sending email alert. Useful
# for detecting some strange hard drive failures that really slow performance.
<errorString1><![CDATA[]]></>
# Look for this string in the kernel buffer for sending email alert. Useful
# for detecting some strange hard drive failures that really slow performance.
<errorString2><![CDATA[]]></>
# Look for this string in the kernel buffer for sending email alert. Useful
# for detecting some strange hard drive failures that really slow performance.
<errorString3><![CDATA[]]></>
# Sends to email address 1 through email server 1.
<sendEmailAlertsToEmail1>0</>
# Sends to email address 1 through email server 1 if any parm is changed.
<sendParmChangeEmailAlertsToEmail1>0</>
# Connects to this IP or hostname directly when sending email 1. Use
# <i>apt-get install sendmail</i> to install sendmail on that IP or hostname.
# Add <i>From:10.5 RELAY</i> to /etc/mail/access to allow sendmail to forward
# email it receives from gigablast if gigablast hosts are on the 10.5.*.* IPs.
# Then run <i>/etc/init.d/sendmail restart</i> as root to pick up those
# changes so sendmail will forward Gigablast's mail to the address you give
# below.
<emailServer1><![CDATA[127.0.0.1]]></>
# Sends to this address when sending email 1
<emailAddress1><![CDATA[4081234567@vtext.com]]></>
# The from field when sending email 1
<fromEmailAddress1><![CDATA[sysadmin@mydomain.com]]></>
# Sends to email address 2 through email server 2.
<sendEmailAlertsToEmail2>0</>
# Sends to email address 2 through email server 2 if any parm is changed.
<sendParmChangeEmailAlertsToEmail2>0</>
# Connects to this server directly when sending email 2
<emailServer2><![CDATA[mail.mydomain.com]]></>
# Sends to this address when sending email 2
<emailAddress2><![CDATA[]]></>
# The from field when sending email 2
<fromEmailAddress2><![CDATA[sysadmin@mydomain.com]]></>
# Sends to email address 3 through email server 3.
<sendEmailAlertsToEmail3>0</>
# Sends to email address 3 through email server 3 if any parm is changed.
<sendParmChangeEmailAlertsToEmail3>0</>
# Connects to this server directly when sending email 3
<emailServer3><![CDATA[mail.mydomain.com]]></>
# Sends to this address when sending email 3
<emailAddress3><![CDATA[]]></>
# The from field when sending email 3
<fromEmailAddress3><![CDATA[sysadmin@mydomain.com]]></>
# IP address of the primary DNS server. Assumes UDP port 53. REQUIRED FOR
# SPIDERING! Use Google's public DNS 8.8.8.8 as default.
<dns0>8.8.8.8</>
# IP address of the secondary DNS server. Assumes UDP port 53. Will be
# accessed in conjunction with the primary dns, so make sure this is always
# up. An ip of 0 means disabled. Google's secondary public DNS is 8.8.4.4.
<dns1>8.8.4.4</>
# All hosts send to these DNSes based on hash of the subdomain to try to split
# DNS load evenly.
<dns2>0.0.0.0</>
<dns3>0.0.0.0</>
<dns4>0.0.0.0</>
<dns5>0.0.0.0</>
<dns6>0.0.0.0</>
<dns7>0.0.0.0</>
<dns8>0.0.0.0</>
<dns9>0.0.0.0</>
<dns10>0.0.0.0</>
<dns11>0.0.0.0</>
<dns12>0.0.0.0</>
<dns13>0.0.0.0</>
<dns14>0.0.0.0</>
<dns15>0.0.0.0</>
# If enabled, gigablast will repair the rdbs as specified by the parameters
# below. When a particular collection is in repair mode, it can not spider or
# merge titledb files.
<repairModeEnabled>0</>
# Comma or space separated list of the collections to repair or rebuild.
<collectionsToRepairOrRebuild><![CDATA[main]]></>
# In bytes.
<memoryToUseForRepair>300000000</>
# Maximum number of outstanding inject spiders for repair.
<maxRepairSpiders>32</>
# If enabled, gigablast will reinject the content of all title recs into a
# secondary rdb system. That will become the primary rdb system when complete.
<fullRebuild>0</>
# If enabled, gigablast will keep the new spiderdb records when doing the full
# rebuild or the spiderdb rebuild.
<keepNewSpiderdbRecs>1</>
# If enabled, gigablast will recycle the link info when rebuilding titledb.
<recycleLinkInfo>0</>
# If enabled, gigablast will rebuild this rdb
<rebuildTitledb>1</>
# If enabled, gigablast will rebuild this rdb
<rebuildPosdb>0</>
# If enabled, gigablast will rebuild this rdb
<rebuildClusterdb>0</>
# If enabled, gigablast will rebuild this rdb
<rebuildSpiderdb>0</>
# If enabled, gigablast will rebuild this rdb
<rebuildLinkdb>0</>
# If disabled, gigablast will skip root urls.
<rebuildRootUrls>1</>
# If disabled, gigablast will skip non-root urls.
<rebuildNonrootUrls>1</>
# When rebuilding spiderdb and scanning it for new spiderdb records, should a
# tagdb lookup be performed? Runs much much faster without it. Will also keep
# the original doc quality and spider priority intact.
<skipTagdbLookup>0</>
# add Ips here to bar them from accessing this gigablast server.
<banIps><![CDATA[]]></>
# add Ips here to give them an infinite query quota.
<allowIps><![CDATA[]]></>
# Don't try to autoban queries that have one of these codes. Also, the code
# must be valid for us to use &uip=IPADDRESS as the IP address of the
# submitter for purposes of autoban AND purposes of addurl daily quotas.
<validCodes><![CDATA[]]></>
# Append extra default parms to queries that match certain substrings.
# Format: text to match in url, followed by a space, then the list of extra
# parms as they would appear appended to the url. One match per line.
<extraParms><![CDATA[]]></>
# ban any query that matches this list of substrings. Must match all
# comma-separated strings on the same line. ('\n' = OR, ',' = AND)
<banRegex><![CDATA[]]></>
# Any matching password will have administrative access to Gigablast and all
# collections.
# Use <masterPassword> tag.
# Any IPs in this list will have administrative access to Gigablast and all
# collections.
# Use <masterIp> tag.
# Log GET and POST requests received from the http server?
<logHttpRequests>1</>
# Should we log queries that are autobanned? They can really fill up the log.
<logAutobannedQueries>1</>
# If query took this many milliseconds or longer, then log the query and the
# time it took to process.
<logQueryTimeThreshold>5000</>
# Log query reply in proxy, but only for those queries above the time
# threshold above.
<logQueryReply>0</>
# Log status of spidered or injected urls?
<logSpideredUrls>1</>
# Log messages if Gigablast runs out of udp sockets?
<logNetworkCongestion>0</>
# Log messages not related to an error condition, but meant more to give an
# idea of the state of the gigablast process. These can be useful when
# diagnosing problems.
<logInformationalMessages>1</>
# Log it when document not added due to quota breach. Log it when url is too
# long and it gets truncated.
<logLimitBreeches>0</>
# Log various debug messages.
<logDebugAdminMessages>0</>
<logDebugBuildMessages>0</>
<logDebugBuildTimeMessages>0</>
<logDebugDatabaseMessages>0</>
<logDebugDirtyMessages>0</>
<logDebugDiskMessages>0</>
<logDebugDnsMessages>0</>
<logDebugHttpMessages>0</>
<logDebugImageMessages>0</>
<logDebugLoopMessages>0</>
<logDebugLanguageDetectionMessages>0</>
<logDebugLinkInfo>0</>
<logDebugMemMessages>0</>
<logDebugMemUsageMessages>0</>
<logDebugNetMessages>0</>
<logDebugQueryMessages>0</>
<logDebugQuotaMessages>0</>
<logDebugRobotsMessages>0</>
<logDebugSpiderCacheMessages>0</>
<logDebugSpellerMessages>0</>
<logDebugSectionsMessages>0</>
<logDebugSeoInsertMessages>0</>
<logDebugSeoMessages>0</>
<logDebugStatsMessages>0</>
<logDebugSummaryMessages>0</>
<logDebugSpiderMessages>0</>
<logDebugUrlAttempts>0</>
<logDebugSpiderDownloads>0</>
<logDebugFacebook>0</>
<logDebugTagdbMessages>0</>
<logDebugTcpMessages>0</>
<logDebugThreadMessages>0</>
<logDebugTitleMessages>0</>
<logDebugTimedbMessages>0</>
<logDebugTopicMessages>0</>
<logDebugTopDocMessages>0</>
<logDebugUdpMessages>0</>
<logDebugUnicodeMessages>0</>
<logDebugRepairMessages>0</>
<logDebugPubDateExtractionMessages>0</>
# Log various timing related messages.
<logTimingMessagesForBuild>0</>
# Log various timing related messages.
<logTimingMessagesForAdmin>0</>
<logTimingMessagesForDatabase>0</>
<logTimingMessagesForNetworkLayer>0</>
<logTimingMessagesForQuery>0</>
# Log various timing related messages.
<logTimingMessagesForSpcache>0</>
<logTimingMessagesForRelatedTopics>0</>
# Log reminders to the programmer. You do not need this.
<logReminderMessages>0</>

@@ -1572,7 +1572,9 @@ int main2 ( int argc , char *argv[] ) {
log("db: HttpServer init failed. Another gb "
"already running? If not, try editing "
"./hosts.conf to "
"change the port from %li to something bigger"
"change the port from %li to something bigger. "
"Or stop gb by running 'gb stop' or by "
"clicking 'save & exit' in the master controls."
, (long)httpPort );
// this is dangerous!!! do not do the shutdown thing
return 1;