added gbdocspiderdate and gbdocindexdate terms

Index the new gbdocspiderdate and gbdocindexdate terms just for regular documents, not for the spider reply "documents". Do not index plain (non-fielded) terms for CT_STATUS spider reply docs. Create gb.conf from defaults if it does not exist, and take gb.conf out of the repo.
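For orientation before the diff, here is a condensed sketch of the term scheme this commit introduces, distilled from the XmlDoc.cpp hunks below. It is a sketch only, not the literal patch: hashDateTerm() is a hypothetical shorthand for the HashInfo setup and hashNumber() calls in the real code.

// Sketch: which sortable date terms each kind of document gets after this
// change. Real documents get all four; CT_STATUS spider reply "documents"
// pass hashgbdocTerms=false and skip the gbdoc* pair, so a query like
// gbsortby:gbdocspiderdate returns only real documents.
bool XmlDoc::hashDateNumbers ( HashTableX *tt , bool hashgbdocTerms ) {
	if ( ! m_spideredTimeValid ) return true;
	long indexedTime = getIndexedTime();
	// hashDateTerm() is a hypothetical helper standing in for the
	// HashInfo/hashNumber boilerplate in the actual diff
	hashDateTerm ( tt , "gbspiderdate"    , m_spideredTime );
	hashDateTerm ( tt , "gbindexdate"     , indexedTime    );
	// spider status docs stop here
	if ( ! hashgbdocTerms ) return true;
	hashDateTerm ( tt , "gbdocspiderdate" , m_spideredTime );
	hashDateTerm ( tt , "gbdocindexdate"  , indexedTime    );
	return true;
}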
Conf.cpp (20 changed lines)

@@ -182,12 +182,30 @@ bool Conf::init ( char *dir ) { // , long hostId ) {
		m_isLocal = false;
		if ( dir ) sprintf ( fname , "%sgb.conf", dir );
		else sprintf ( fname , "./gb.conf" );
		// try regular gb.conf then
		f.set ( fname );
	}

	// make sure g_mem.maxMem is big enough temporarily
	if ( g_mem.m_maxMem < 10000000 ) g_mem.m_maxMem = 10000000;
	bool status = g_parms.setFromFile ( this , fname , NULL , OBJ_CONF );

	// if not there, create it!
	if ( ! status ) {
		log("gb: Creating %s from defaults.",fname);
		g_errno = 0;
		// set to defaults
		g_conf.reset();
		// and save it
		//log("gb: Saving %s",fname);
		m_save = true;
		save();
		// clear errors
		g_errno = 0;
		status = true;
	}

	// ignore if yippy
	if ( g_isYippy ) {
		//g_conf.m_doAutoBan = true;
@@ -415,7 +433,7 @@ bool Conf::save ( ) {
	if(access(fname2, F_OK) == 0) unlink(fname2);
	if(link(fname, fname2) == 0) {
		unlink(fname);
		log(LOG_INFO,"admin: Saved %s.",fname);
		log(LOG_INFO,"admin: Saved %s.",fname2);
	} else {
		log(LOG_INFO,"admin: Unable to save %s:%s",
			fname, strerror(errno));
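Read in isolation, the Conf.cpp hunk above amounts to a load-or-create-defaults fallback. A minimal annotated view of that flow, using the same names as the diff with the surrounding parm machinery elided:

	// try to load gb.conf from disk
	bool status = g_parms.setFromFile ( this , fname , NULL , OBJ_CONF );
	// if it is not there, fall back to compiled-in defaults and write
	// a fresh gb.conf so the next startup finds one
	if ( ! status ) {
		log("gb: Creating %s from defaults.",fname);
		g_errno = 0;
		g_conf.reset();  // set every parm to its default value
		m_save = true;
		save();          // write the new gb.conf out to disk
		g_errno = 0;     // clear any error from the failed load
		status = true;   // treat the freshly created conf as a good load
	}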
XmlDoc.cpp (196 changed lines)

@@ -186,6 +186,8 @@ static long long s_lastTimeStart = 0LL;

void XmlDoc::reset ( ) {

	m_indexedTime = 0;

	m_didDelete = false;

	m_metaList2.purge();
@@ -950,8 +952,8 @@ long XmlDoc::getSpideredTime ( ) {
	m_spideredTime = date;
	// hack for test coll which has fake vals for these because
	// the SpiderRequest::m_addedTime and m_parentPrevSpiderTime
	m_minPubDate = m_spideredTime - 48*3600;
	m_maxPubDate = m_spideredTime - 24*3600;
	//m_minPubDate = m_spideredTime - 48*3600;
	//m_maxPubDate = m_spideredTime - 24*3600;

	return m_spideredTime;
}
@@ -1231,7 +1233,8 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
	m_niceness = niceness;
	m_version = TITLEREC_CURRENT_VERSION;
	m_versionValid = true;

	/*
	// set min/max pub dates right away
	m_minPubDate = -1;
	m_maxPubDate = -1;
@@ -1245,6 +1248,7 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
	m_minPubDate = sreq->m_parentPrevSpiderTime;
	m_maxPubDate = sreq->m_addedTime;
	}
	*/

	// this is used to removing the rec from doledb after we spider it
	m_doledbKey.setMin();
@@ -5000,8 +5004,21 @@ Dates *XmlDoc::getDates ( ) {
	if ( *isRSS ) isXml = true;
	if ( *ctype == CT_XML ) isXml = true;

	long minPubDate = -1;
	long maxPubDate = -1;
	// parentPrevSpiderTime is 0 if that was the first time that the
	// parent was spidered, in which case isNewOutlink will always be set
	// for every outlink it had!
	if ( m_sreqValid &&
	     m_sreq.m_isNewOutlink &&
	     m_sreq.m_parentPrevSpiderTime ) {
		// pub date is somewhere between these two times
		minPubDate = m_sreq.m_parentPrevSpiderTime;
		maxPubDate = m_sreq.m_addedTime;
	}

	// now set part2 , returns false and sets g_errno on error
	if ( ! m_dates.setPart2 ( aa , m_minPubDate, m_maxPubDate,//osvt,
	if ( ! m_dates.setPart2 ( aa , minPubDate, maxPubDate,//osvt,
				  isXml , *isRoot )) {
		if ( ! g_errno ) { char *xx=NULL;*xx=0; }
		// note it
@@ -25130,7 +25147,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
	if ( ! hashCountry ( table ) ) return NULL;
	if ( ! hashTagRec ( table ) ) return NULL;
	// hash for gbsortby:gbspiderdate
	if ( ! hashDateNumbers ( table ) ) return NULL;
	if ( ! hashDateNumbers ( table , true ) ) return NULL;
	// has gbhasthumbnail:1 or 0
	if ( ! hashImageStuff ( table ) ) return NULL;
	// and the json itself
@@ -25203,7 +25220,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {

	if ( ! hashLinks ( table ) ) return NULL;
	if ( ! hashDateNumbers ( table ) ) return NULL;
	if ( ! hashDateNumbers ( table , true ) ) return NULL;
	if ( ! hashMetaTags ( table ) ) return NULL;
	if ( ! hashMetaZip ( table ) ) return NULL;
	if ( ! hashDMOZCategories( table ) ) return NULL;
@@ -25427,26 +25444,18 @@ SafeBuf *XmlDoc::getSpiderReplyMetaList2 ( SpiderReply *reply ) {
	hi.m_desc = "spider error msg";
	if ( ! hashString( mstrerror(m_indexCode) , &hi ) ) return NULL;

	hi.m_prefix = "gbdocid";
	hi.m_desc = "docid";
	bufLen = sprintf ( buf , "%llu", *uqd ) ;
	if ( ! hashString( buf , &hi ) ) return NULL;
	//hi.m_prefix = "gbdocid";
	//hi.m_desc = "docid";
	//bufLen = sprintf ( buf , "%llu", *uqd ) ;
	//if ( ! hashString( buf , &hi ) ) return NULL;

	// . then the url. url: site: ip: etc. terms
	// . do NOT hash non-fielded terms so we do not get "status"
	// results poluting the serps => false
	if ( ! hashUrl ( &tt4 , false ) ) return NULL;

	// hash the last spidered date, very useful!
	hi.m_hashGroup = 0;// this doesn't matter, it's a numeric field
	hi.m_desc = "last spidered date";
	// make this different so it doesn't coexist with regular results
	// when someone does a gbsortby:gbspiderdate query
	//hi.m_prefix = "gbreplyspiderdate";
	hi.m_prefix = "gbspiderdate";
	if ( reply->m_spideredTime <= 0 ) { char *xx=NULL;*xx=0; }
	bufLen = sprintf ( buf , "%lu", reply->m_spideredTime );
	if ( ! hashNumber ( buf , buf , bufLen , &hi ) ) return NULL;
	// false --> do not hash the gbdoc* terms
	hashDateNumbers ( &tt4 , false );

	// store keys in safebuf then to make our own meta list
	addTable144 ( &tt4 , *uqd , &m_spiderReplyMetaList );
@@ -25654,13 +25663,24 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
	return true;
}

// slightly greater than m_spideredTime, which is the download time.
// we use this for sorting as well, like for the widget so things
// don't really get added out of order and not show up in the top spot
// of the widget list.
long XmlDoc::getIndexedTime() {
	if ( m_indexedTimeValid ) return m_indexedTime;
	m_indexedTime = getTimeGlobal();
	return m_indexedTime;
}

// . hash dates for sorting by using gbsortby: and gbrevsortby:
// . do 'gbsortby:gbspiderdate' as your query to see this in action
bool XmlDoc::hashDateNumbers ( HashTableX *tt ) {
bool XmlDoc::hashDateNumbers ( HashTableX *tt , bool hashgbdocTerms) {

	// stop if already set
	if ( ! m_spideredTimeValid ) return true;

	long indexedTime = getIndexedTime();

	// first the last spidered date
	HashInfo hi;
@@ -25671,10 +25691,38 @@ bool XmlDoc::hashDateNumbers ( HashTableX *tt ) {

	char buf[64];
	long bufLen = sprintf ( buf , "%lu", m_spideredTime );

	if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
		return false;

	// and index time is >= spider time, so you want to sort by that for
	// the widget for instance
	hi.m_desc = "last indexed date";
	hi.m_prefix = "gbindexdate";
	bufLen = sprintf ( buf , "%lu", indexedTime );
	if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
		return false;

	// do not index the rest if we are a "spider reply" document
	// which is like a fake document for seeing spider statuses
	if ( ! hashgbdocTerms ) return true;

	// now for CT_STATUS spider status "documents" we also index
	// gbspiderdate so index this so we can just do a
	// gbsortby:gbdocspiderdate and only get real DOCUMENTS not the
	// spider status "documents"
	hi.m_desc = "doc last spidered date";
	hi.m_prefix = "gbdocspiderdate";
	bufLen = sprintf ( buf , "%lu", m_spideredTime );
	if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
		return false;

	hi.m_desc = "doc last indexed date";
	hi.m_prefix = "gbdocindexdate";
	bufLen = sprintf ( buf , "%lu", indexedTime );
	if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
		return false;

	// all done
	return true;
}
@@ -26285,35 +26333,33 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool hashNonFieldTerms ) {
	if ( ! hashSingleTerm(ext,elen,&hi ) ) return false;

	setStatus ( "hashing gbdocid" );
	hi.m_prefix = "gbdocid";
	char buf2[32];
	sprintf(buf2,"%llu",(m_docId) );
	if ( ! hashSingleTerm(buf2,gbstrlen(buf2),&hi) ) return false;

	// if indexing a json diffbot object, index
	// gbparenturl:xxxx of the original url from which the json was
	// datamined. we use this so we can act as a diffbot json cache.
	if ( m_isDiffbotJSONObject ) {
		setStatus ( "hashing gbparenturl term");
		char *p = fu->getUrl() + fu->getUrlLen() - 1;
		// back up to - as in "http://xyz.com/foo-diffbotxyz123456"
		for ( ; *p && *p != '-' ; p-- );
		// set up the hashing parms
		hi.m_hashGroup = HASHGROUP_INTAG;
		hi.m_tt = tt;
		hi.m_desc = "diffbot parent url";
		// append a "www." as part of normalization
		uw.set ( fu->getUrl() , p - fu->getUrl() , true );
		hi.m_prefix = "gbparenturl";
		if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
			return false;
	}

	if ( ! hashNonFieldTerms ) return true;

	setStatus ( "hashing url mid domain");
	// the final score
	//long plainScore = (long)(256.0 * boost1 * boost2 * fw);
	// update parms
	hi.m_prefix = NULL;
	hi.m_desc = "middle domain";//tmp3;
	hi.m_hashGroup = HASHGROUP_INURL;
	// if parm "index article content only" is true, do not index this!
	//if ( m_eliminateMenus ) plainScore = 0;
	//char *mid = fu->getMidDomain ();
	//long mlen = fu->getMidDomainLen();
	//hi.m_desc = "url mid dom";
	//if ( ! hashString ( mid,mlen ,&hi ) ) return false;
	//hi.m_desc = "url host";
	char *host = fu->getHost ();
	long hlen = fu->getHostLen ();
	if ( ! hashString ( host,hlen,&hi)) return false;

	setStatus ( "hashing url path");

	// hash the path plain
	if ( ! hashString (path,plen,&hi) ) return false;

	setStatus ( "hashing SiteGetter terms");

	//
@@ -26337,6 +26383,10 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool hashNonFieldTerms ) {
	// . that way we do not confuse all the pages in dictionary.com or
	// wikipedia.org as subsites!!
	if ( ! m_links.hasSubdirOutlink() ) add = false;

	char *host = fu->getHost ();
	long hlen = fu->getHostLen ();

	// tags from here out
	hi.m_hashGroup = HASHGROUP_INTAG;
	hi.m_shardByTermId = true;
@@ -26371,30 +26421,28 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool hashNonFieldTerms ) {
	hi.m_prefix = "urlhashdiv100";
	if ( ! hashString(buf,blen,&hi) ) return false;

	setStatus ( "hashing gbdocid" );
	hi.m_prefix = "gbdocid";
	char buf2[32];
	sprintf(buf2,"%llu",(m_docId) );
	if ( ! hashSingleTerm(buf2,gbstrlen(buf2),&hi) ) return false;

	// if indexing a json diffbot object, index
	// gbparenturl:xxxx of the original url from which the json was
	// datamined. we use this so we can act as a diffbot json cache.
	if ( m_isDiffbotJSONObject ) {
		setStatus ( "hashing gbparenturl term");
		char *p = fu->getUrl() + fu->getUrlLen() - 1;
		// back up to - as in "http://xyz.com/foo-diffbotxyz123456"
		for ( ; *p && *p != '-' ; p-- );
		// set up the hashing parms
		hi.m_hashGroup = HASHGROUP_INTAG;
		hi.m_tt = tt;
		hi.m_desc = "diffbot parent url";
		// append a "www." as part of normalization
		uw.set ( fu->getUrl() , p - fu->getUrl() , true );
		hi.m_prefix = "gbparenturl";
		if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
			return false;
	}
	setStatus ( "hashing url mid domain");
	// the final score
	//long plainScore = (long)(256.0 * boost1 * boost2 * fw);
	// update parms
	hi.m_prefix = NULL;
	hi.m_desc = "middle domain";//tmp3;
	hi.m_hashGroup = HASHGROUP_INURL;
	// if parm "index article content only" is true, do not index this!
	//if ( m_eliminateMenus ) plainScore = 0;
	//char *mid = fu->getMidDomain ();
	//long mlen = fu->getMidDomainLen();
	//hi.m_desc = "url mid dom";
	//if ( ! hashString ( mid,mlen ,&hi ) ) return false;
	//hi.m_desc = "url host";
	if ( ! hashString ( host,hlen,&hi)) return false;

	setStatus ( "hashing url path");

	// hash the path plain
	if ( ! hashString (path,plen,&hi) ) return false;

	return true;
}
@@ -32081,7 +32129,7 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
	);

	/*
	char *ms = "-1";
	if ( m_minPubDate != -1 ) ms = asctime(gmtime ( &m_minPubDate ));
	sb->safePrintf (
@@ -32097,7 +32145,7 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
	"<td>max pub date</td>"
	"<td>%s UTC</td>"
	"</tr>\n" , ms );

	*/

	// our html template fingerprint
	sb->safePrintf ("<tr><td>tag pair hash 32</td><td>");
XmlDoc.h (15 changed lines)

@@ -280,8 +280,12 @@ class XmlDoc {
	long m_siteNumInlinksUniqueIp; // m_siteNumInlinksFresh
	long m_siteNumInlinksUniqueCBlock; // m_sitePop;
	time_t m_spideredTime;
	time_t m_minPubDate;
	time_t m_maxPubDate;
	// just don't throw away any relevant SpiderRequests and we have
	// the data that m_minPubDate and m_maxPubDate provided
	//time_t m_minPubDate;
	//time_t m_maxPubDate;
	time_t m_indexedTime; // slightly > m_spideredTime
	uint32_t m_reserved32;
	time_t m_pubDate; // aka m_datedbDate
	//time_t m_nextSpiderTime;
	time_t m_firstIndexedDate;
@@ -473,6 +477,10 @@ class XmlDoc {
	// we now call this right away rather than at download time!
	long getSpideredTime();

	// time right before adding the termlists to the index, etc.
	// whereas spider time is the download time
	long getIndexedTime();

	// another entry point, like set3() kinda
	bool loadFromOldTitleRec ();

@@ -773,7 +781,7 @@ class XmlDoc {
	bool hashDMOZCategories ( class HashTableX *table ) ;
	bool hashLinks ( class HashTableX *table ) ;
	bool hashUrl ( class HashTableX *table , bool hashNonFieldTerms=true) ;
	bool hashDateNumbers ( class HashTableX *tt ) ;
	bool hashDateNumbers ( class HashTableX *tt , bool hashAll ) ;
	bool hashSections ( class HashTableX *table ) ;
	bool hashIncomingLinkText ( class HashTableX *table ,
				    bool hashAnomalies ,
@@ -1193,6 +1201,7 @@ class XmlDoc {
	bool m_firstIpValid;
	bool m_spideredTimeValid;
	//bool m_nextSpiderTimeValid;
	bool m_indexedTimeValid;
	bool m_firstIndexedValid;
	bool m_isInIndexValid;
	bool m_wasInIndexValid;
gb.conf (382 lines, file deleted)

@@ -1,382 +0,0 @@
# All <, >, " and # characters that are values for a field contained herein
# must be represented as <, >, " and # respectively.

# Mem available to this process. May be exceeded due to fragmentation.
<maxMem>4000000000</>

# Below the various Gigablast databases are configured.
# <*dbMaxTreeMem> - mem used for holding new recs
# <*dbMaxDiskPageCacheMem> - disk page cache mem for this db
# <*dbMaxCacheMem> - cache mem for holding single recs
# <*dbSaveCache> - save the rec cache on exit?
# <*dbMaxCacheAge> - max age (seconds) for recs in rec cache
# See that Stats page for record counts and stats.

# How many bytes should be used for caching DNS replies?
<dnsMaxCacheMem>128000</>

# A tagdb record assigns a url or site to a ruleset. Each tagdb record is
# about 100 bytes or so.
<tagdbMaxTreeMem>1028000</>
<tagdbMaxPageCacheMem>200000</>

# A catdb record assigns a url or site to DMOZ categories. Each catdb record
# is about 100 bytes.
<catdbMaxTreeMem>1000000</>
<catdbMaxPageCacheMem>25000000</>
<catdbMaxCacheMem>0</>

# Clusterdb caches small records for site clustering and deduping.
<clusterdbMaxTreeMem>1000000</>
<clusterdbSaveCache>0</>

# Max memory for dup vector cache.
<maxVectorCacheMem>10000000</>

# Robotdb caches robot.txt files.
<robotdbMaxCacheMem>128000</>
<robotdbSaveCache>0</>
<linkdbMaxPageCacheMem>0</>
<statsdbMaxTreeMem>5000000</>
<statsdbMaxCacheMem>0</>
<statsdbMaxDiskPageCacheMem>1000000</>

# Maximum bytes of a doc that can be sent before having to read more from disk
<httpMaxSendBufSize>128000</>

# Bytes to use for caching search result pages.
<searchResultsMaxCacheMem>100000</>

# Read only mode does not allow spidering.
<readOnlyMode>0</>

# Controls all spidering for all collections
<spideringEnabled>1</>

# What is the maximum number of web pages the spider is allowed to download
# simultaneously for ALL collections PER HOST?
<maxTotalSpiders>100</>

# Can people use the add url interface to add urls to the index?
<addUrlEnabled>1</>

# Save data in memory to disk after this many minutes have passed without the
# data having been dumped or saved to disk. Use 0 to disable.
<autoSaveFrequency>5</>

# Maximum sockets available to serve incoming HTTP requests. Too many
# outstanding requests will increase query latency. Excess requests will
# simply have their sockets closed.
<maxHttpSockets>100</>

# Maximum sockets available to serve incoming HTTPS requests. Like max http
# sockets, but for secure sockets.
<maxHttpsSockets>100</>

# Identification seen by web servers when the Gigablast spider downloads their
# web pages. It is polite to insert a contact email address here so webmasters
# that experience problems from the Gigablast spider have somewhere to vent.
<spiderUserAgent><![CDATA[GigablastOpenSource/1.0]]></>

# If this is true, gb will send Accept-Encoding: gzip to web servers when
# doing http downloads.
<askForGzippedDocsWhenDownloading>0</>

# How many seconds should we cache a search results page for?
<searchResultsCacheMaxAge>10800</>

# Keep track of ips which do queries, disallow non-customers from hitting us
# too hard.
<autobanIPsWhichViolateTheQueriesPerDayQuotas>0</>

# If a call to a message callback or message handler in the udp server takes
# more than this many milliseconds, then log it. Logs 'udp: Took %lli ms to
# call callback for msgType=0x%hhx niceness=%li'. Use -1 or less to disable
# the logging.
<maxDelayBeforeLoggingACallbackOrHandler>-1</>

# Sends emails to admin if a host goes down.
<sendEmailAlerts>0</>

# Do not send email alerts about dead hosts to anyone except
# sysadmin@gigablast.com between the times given below unless all the twins of
# the dead host are also dead. Instead, wait till after if the host is still
# dead.
<delayNonCriticalEmailAlerts>0</>

# Email alerts will include the cluster name
<clusterName><![CDATA[unspecified]]></>

# Send an email after a host has not responded to successive pings for this
# many milliseconds.
<sendEmailTimeout>62000</>

# Send email alerts when query success rate goes below this threshold.
# (percent rate between 0.0 and 1.0)
<querySuccessRateThreshold>0.850000</>

# Send email alerts when average query latency goes above this threshold. (in
# seconds)
<averageQueryLatencyThreshold>2.000000</>

# Record this number of query times before calculating average query latency.
<numberOfQueryTimesInAverage>300</>

# At what temperature in Celsius should we send an email alert if a hard drive
# reaches it?
<maxHardDriveTemperature>45</>

# Look for this string in the kernel buffer for sending email alert. Useful
# for detecting some strange hard drive failures that really slow performance.
<errorString1><![CDATA[]]></>

# Look for this string in the kernel buffer for sending email alert. Useful
# for detecting some strange hard drive failures that really slow performance.
<errorString2><![CDATA[]]></>

# Look for this string in the kernel buffer for sending email alert. Useful
# for detecting some strange hard drive failures that really slow performance.
<errorString3><![CDATA[]]></>

# Sends to email address 1 through email server 1.
<sendEmailAlertsToEmail1>0</>

# Sends to email address 1 through email server 1 if any parm is changed.
<sendParmChangeEmailAlertsToEmail1>0</>

# Connects to this IP or hostname directly when sending email 1. Use
# <i>apt-get install sendmail</i> to install sendmail on that IP or hostname.
# Add <i>From:10.5 RELAY</i> to /etc/mail/access to allow sendmail to forward
# email it receives from gigablast if gigablast hosts are on the 10.5.*.* IPs.
# Then run <i>/etc/init.d/sendmail restart</i> as root to pick up those
# changes so sendmail will forward Gigablast's mail to the address you give
# below.
<emailServer1><![CDATA[127.0.0.1]]></>

# Sends to this address when sending email 1
<emailAddress1><![CDATA[4081234567@vtext.com]]></>

# The from field when sending email 1
<fromEmailAddress1><![CDATA[sysadmin@mydomain.com]]></>

# Sends to email address 2 through email server 2.
<sendEmailAlertsToEmail2>0</>

# Sends to email address 2 through email server 2 if any parm is changed.
<sendParmChangeEmailAlertsToEmail2>0</>

# Connects to this server directly when sending email 2
<emailServer2><![CDATA[mail.mydomain.com]]></>

# Sends to this address when sending email 2
<emailAddress2><![CDATA[]]></>

# The from field when sending email 2
<fromEmailAddress2><![CDATA[sysadmin@mydomain.com]]></>

# Sends to email address 3 through email server 3.
<sendEmailAlertsToEmail3>0</>

# Sends to email address 3 through email server 3 if any parm is changed.
<sendParmChangeEmailAlertsToEmail3>0</>

# Connects to this server directly when sending email 3
<emailServer3><![CDATA[mail.mydomain.com]]></>

# Sends to this address when sending email 3
<emailAddress3><![CDATA[]]></>

# The from field when sending email 3
<fromEmailAddress3><![CDATA[sysadmin@mydomain.com]]></>

# IP address of the primary DNS server. Assumes UDP port 53. REQUIRED FOR
# SPIDERING! Use Google's public DNS 8.8.8.8 as default.
<dns0>8.8.8.8</>

# IP address of the secondary DNS server. Assumes UDP port 53. Will be
# accessed in conjunction with the primary dns, so make sure this is always
# up. An ip of 0 means disabled. Google's secondary public DNS is 8.8.4.4.
<dns1>8.8.4.4</>

# All hosts send to these DNSes based on hash of the subdomain to try to split
# DNS load evenly.
<dns2>0.0.0.0</>
<dns3>0.0.0.0</>
<dns4>0.0.0.0</>
<dns5>0.0.0.0</>
<dns6>0.0.0.0</>
<dns7>0.0.0.0</>
<dns8>0.0.0.0</>
<dns9>0.0.0.0</>
<dns10>0.0.0.0</>
<dns11>0.0.0.0</>
<dns12>0.0.0.0</>
<dns13>0.0.0.0</>
<dns14>0.0.0.0</>
<dns15>0.0.0.0</>

# If enabled, gigablast will repair the rdbs as specified by the parameters
# below. When a particular collection is in repair mode, it can not spider or
# merge titledb files.
<repairModeEnabled>0</>

# Comma or space separated list of the collections to repair or rebuild.
<collectionsToRepairOrRebuild><![CDATA[main]]></>

# In bytes.
<memoryToUseForRepair>300000000</>

# Maximum number of outstanding inject spiders for repair.
<maxRepairSpiders>32</>

# If enabled, gigablast will reinject the content of all title recs into a
# secondary rdb system. That will the primary rdb system when complete.
<fullRebuild>0</>

# If enabled, gigablast will keep the new spiderdb records when doing the full
# rebuild or the spiderdb rebuild.
<keepNewSpiderdbRecs>1</>

# If enabled, gigablast will recycle the link info when rebuilding titledb.
<recycleLinkInfo>0</>

# If enabled, gigablast will rebuild this rdb
<rebuildTitledb>1</>

# If enabled, gigablast will rebuild this rdb
<rebuildPosdb>0</>

# If enabled, gigablast will rebuild this rdb
<rebuildClusterdb>0</>

# If enabled, gigablast will rebuild this rdb
<rebuildSpiderdb>0</>

# If enabled, gigablast will rebuild this rdb
<rebuildLinkdb>0</>

# If disabled, gigablast will skip root urls.
<rebuildRootUrls>1</>

# If disabled, gigablast will skip non-root urls.
<rebuildNonrootUrls>1</>

# When rebuilding spiderdb and scanning it for new spiderdb records, should a
# tagdb lookup be performed? Runs much much faster without it. Will also keep
# the original doc quality and spider priority in tact.
<skipTagdbLookup>0</>

# add Ips here to bar them from accessing this gigablast server.
<banIps><![CDATA[]]></>

# add Ips here to give them an infinite query quota.
<allowIps><![CDATA[]]></>

# Don't try to autoban queries that have one of these codes. Also, the code
# must be valid for us to use &uip=IPADDRESS as the IP address of the
# submitter for purposes of autoban AND purposes of addurl daily quotas.
<validCodes><![CDATA[]]></>

# Append extra default parms to queries that match certain substrings.
# Format: text to match in url, followed by a space, then the list of extra
# parms as they would appear appended to the url. One match per line.
<extraParms><![CDATA[]]></>

# ban any query that matches this list of substrings. Must match all
# comma-separated strings on the same line. ('\n' = OR, ',' = AND)
<banRegex><![CDATA[]]></>

# Any matching password will have administrative access to Gigablast and all
# collections.
# Use <masterPassword> tag.

# Any IPs in this list will have administrative access to Gigablast and all
# collections.
# Use <masterIp> tag.

# Log GET and POST requests received from the http server?
<logHttpRequests>1</>

# Should we log queries that are autobanned? They can really fill up the log.
<logAutobannedQueries>1</>

# If query took this many millliseconds or longer, then log the query and the
# time it took to process.
<logQueryTimeThreshold>5000</>

# Log query reply in proxy, but only for those queries above the time
# threshold above.
<logQueryReply>0</>

# Log status of spidered or injected urls?
<logSpideredUrls>1</>

# Log messages if Gigablast runs out of udp sockets?
<logNetworkCongestion>0</>

# Log messages not related to an error condition, but meant more to give an
# idea of the state of the gigablast process. These can be useful when
# diagnosing problems.
<logInformationalMessages>1</>

# Log it when document not added due to quota breech. Log it when url is too
# long and it gets truncated.
<logLimitBreeches>0</>

# Log various debug messages.
<logDebugAdminMessages>0</>
<logDebugBuildMessages>0</>
<logDebugBuildTimeMessages>0</>
<logDebugDatabaseMessages>0</>
<logDebugDirtyMessages>0</>
<logDebugDiskMessages>0</>
<logDebugDnsMessages>0</>
<logDebugHttpMessages>0</>
<logDebugImageMessages>0</>
<logDebugLoopMessages>0</>
<logDebugLanguageDetectionMessages>0</>
<logDebugLinkInfo>0</>
<logDebugMemMessages>0</>
<logDebugMemUsageMessages>0</>
<logDebugNetMessages>0</>
<logDebugQueryMessages>0</>
<logDebugQuotaMessages>0</>
<logDebugRobotsMessages>0</>
<logDebugSpiderCacheMessages>0</>
<logDebugSpellerMessages>0</>
<logDebugSectionsMessages>0</>
<logDebugSeoInsertMessages>0</>
<logDebugSeoMessages>0</>
<logDebugStatsMessages>0</>
<logDebugSummaryMessages>0</>
<logDebugSpiderMessages>0</>
<logDebugUrlAttempts>0</>
<logDebugSpiderDownloads>0</>
<logDebugFacebook>0</>
<logDebugTagdbMessages>0</>
<logDebugTcpMessages>0</>
<logDebugThreadMessages>0</>
<logDebugTitleMessages>0</>
<logDebugTimedbMessages>0</>
<logDebugTopicMessages>0</>
<logDebugTopDocMessages>0</>
<logDebugUdpMessages>0</>
<logDebugUnicodeMessages>0</>
<logDebugRepairMessages>0</>
<logDebugPubDateExtractionMessages>0</>

# Log various timing related messages.
<logTimingMessagesForBuild>0</>

# Log various timing related messages.
<logTimingMessagesForAdmin>0</>
<logTimingMessagesForDatabase>0</>
<logTimingMessagesForNetworkLayer>0</>
<logTimingMessagesForQuery>0</>

# Log various timing related messages.
<logTimingMessagesForSpcache>0</>
<logTimingMessagesForRelatedTopics>0</>

# Log reminders to the programmer. You do not need this.
<logReminderMessages>0</>
main.cpp (4 changed lines)

@@ -1572,7 +1572,9 @@ int main2 ( int argc , char *argv[] ) {
		log("db: HttpServer init failed. Another gb "
		    "already running? If not, try editing "
		    "./hosts.conf to "
		    "change the port from %li to something bigger"
		    "change the port from %li to something bigger. "
		    "Or stop gb by running 'gb stop' or by "
		    "clicking 'save & exit' in the master controls."
		    , (long)httpPort );
		// this is dangerous!!! do not do the shutdown thing
		return 1;