added gbdocspiderdate and gbdocindexdate terms

Index the new gbdocspiderdate and gbdocindexdate terms just for regular documents, not for the spider reply "documents". Do not index plain (non-fielded) terms for CT_STATUS spider reply docs. Create gb.conf from defaults if it does not exist, and take gb.conf out of the repo.
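For orientation before the diff, here is a condensed sketch of the term scheme this commit introduces, distilled from the XmlDoc.cpp hunks below. It is a sketch only, not the literal patch: hashDateTerm() is a hypothetical shorthand for the HashInfo setup and hashNumber() calls in the real code.

// Sketch: which sortable date terms each kind of document gets after this
// change. Real documents get all four; CT_STATUS spider reply "documents"
// pass hashgbdocTerms=false and skip the gbdoc* pair, so a query like
// gbsortby:gbdocspiderdate returns only real documents.
bool XmlDoc::hashDateNumbers ( HashTableX *tt , bool hashgbdocTerms ) {
	if ( ! m_spideredTimeValid ) return true;
	long indexedTime = getIndexedTime();
	// hashDateTerm() is a hypothetical helper standing in for the
	// HashInfo/hashNumber boilerplate in the actual diff
	hashDateTerm ( tt , "gbspiderdate"    , m_spideredTime );
	hashDateTerm ( tt , "gbindexdate"     , indexedTime    );
	// spider status docs stop here
	if ( ! hashgbdocTerms ) return true;
	hashDateTerm ( tt , "gbdocspiderdate" , m_spideredTime );
	hashDateTerm ( tt , "gbdocindexdate"  , indexedTime    );
	return true;
}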
Conf.cpp (20 changed lines)

@@ -182,12 +182,30 @@ bool Conf::init ( char *dir ) { // , long hostId ) {
		m_isLocal = false;
		if ( dir ) sprintf ( fname , "%sgb.conf", dir );
		else sprintf ( fname , "./gb.conf" );
		// try regular gb.conf then
		f.set ( fname );
	}

	// make sure g_mem.maxMem is big enough temporarily
	if ( g_mem.m_maxMem < 10000000 ) g_mem.m_maxMem = 10000000;
	bool status = g_parms.setFromFile ( this , fname , NULL , OBJ_CONF );

	// if not there, create it!
	if ( ! status ) {
		log("gb: Creating %s from defaults.",fname);
		g_errno = 0;
		// set to defaults
		g_conf.reset();
		// and save it
		//log("gb: Saving %s",fname);
		m_save = true;
		save();
		// clear errors
		g_errno = 0;
		status = true;
	}

	// ignore if yippy
	if ( g_isYippy ) {
		//g_conf.m_doAutoBan = true;
@@ -415,7 +433,7 @@ bool Conf::save ( ) {
	if(access(fname2, F_OK) == 0) unlink(fname2);
	if(link(fname, fname2) == 0) {
		unlink(fname);
		log(LOG_INFO,"admin: Saved %s.",fname);
		log(LOG_INFO,"admin: Saved %s.",fname2);
	} else {
		log(LOG_INFO,"admin: Unable to save %s:%s",
			fname, strerror(errno));
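Read in isolation, the Conf.cpp hunk above amounts to a load-or-create-defaults fallback. A minimal annotated view of that flow, using the same names as the diff with the surrounding parm machinery elided:

	// try to load gb.conf from disk
	bool status = g_parms.setFromFile ( this , fname , NULL , OBJ_CONF );
	// if it is not there, fall back to compiled-in defaults and write
	// a fresh gb.conf so the next startup finds one
	if ( ! status ) {
		log("gb: Creating %s from defaults.",fname);
		g_errno = 0;
		g_conf.reset();  // set every parm to its default value
		m_save = true;
		save();          // write the new gb.conf out to disk
		g_errno = 0;     // clear any error from the failed load
		status = true;   // treat the freshly created conf as a good load
	}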
XmlDoc.cpp (196 changed lines)

@@ -186,6 +186,8 @@ static long long s_lastTimeStart = 0LL;

void XmlDoc::reset ( ) {

	m_indexedTime = 0;

	m_didDelete = false;

	m_metaList2.purge();
@@ -950,8 +952,8 @@ long XmlDoc::getSpideredTime ( ) {
	m_spideredTime = date;
	// hack for test coll which has fake vals for these because
	// the SpiderRequest::m_addedTime and m_parentPrevSpiderTime
	m_minPubDate = m_spideredTime - 48*3600;
	m_maxPubDate = m_spideredTime - 24*3600;
	//m_minPubDate = m_spideredTime - 48*3600;
	//m_maxPubDate = m_spideredTime - 24*3600;

	return m_spideredTime;
}
@@ -1231,7 +1233,8 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
	m_niceness = niceness;
	m_version = TITLEREC_CURRENT_VERSION;
	m_versionValid = true;

	/*
	// set min/max pub dates right away
	m_minPubDate = -1;
	m_maxPubDate = -1;
@@ -1245,6 +1248,7 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
	m_minPubDate = sreq->m_parentPrevSpiderTime;
	m_maxPubDate = sreq->m_addedTime;
	}
	*/

	// this is used to removing the rec from doledb after we spider it
	m_doledbKey.setMin();
@@ -5000,8 +5004,21 @@ Dates *XmlDoc::getDates ( ) {
	if ( *isRSS ) isXml = true;
	if ( *ctype == CT_XML ) isXml = true;

	long minPubDate = -1;
	long maxPubDate = -1;
	// parentPrevSpiderTime is 0 if that was the first time that the
	// parent was spidered, in which case isNewOutlink will always be set
	// for every outlink it had!
	if ( m_sreqValid &&
	     m_sreq.m_isNewOutlink &&
	     m_sreq.m_parentPrevSpiderTime ) {
		// pub date is somewhere between these two times
		minPubDate = m_sreq.m_parentPrevSpiderTime;
		maxPubDate = m_sreq.m_addedTime;
	}

	// now set part2 , returns false and sets g_errno on error
	if ( ! m_dates.setPart2 ( aa , m_minPubDate, m_maxPubDate,//osvt,
	if ( ! m_dates.setPart2 ( aa , minPubDate, maxPubDate,//osvt,
				  isXml , *isRoot )) {
		if ( ! g_errno ) { char *xx=NULL;*xx=0; }
		// note it
@@ -25130,7 +25147,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
	if ( ! hashCountry ( table ) ) return NULL;
	if ( ! hashTagRec ( table ) ) return NULL;
	// hash for gbsortby:gbspiderdate
	if ( ! hashDateNumbers ( table ) ) return NULL;
	if ( ! hashDateNumbers ( table , true ) ) return NULL;
	// has gbhasthumbnail:1 or 0
	if ( ! hashImageStuff ( table ) ) return NULL;
	// and the json itself
@@ -25203,7 +25220,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {

	if ( ! hashLinks ( table ) ) return NULL;
	if ( ! hashDateNumbers ( table ) ) return NULL;
	if ( ! hashDateNumbers ( table , true ) ) return NULL;
	if ( ! hashMetaTags ( table ) ) return NULL;
	if ( ! hashMetaZip ( table ) ) return NULL;
	if ( ! hashDMOZCategories( table ) ) return NULL;
@@ -25427,26 +25444,18 @@ SafeBuf *XmlDoc::getSpiderReplyMetaList2 ( SpiderReply *reply ) {
	hi.m_desc = "spider error msg";
	if ( ! hashString( mstrerror(m_indexCode) , &hi ) ) return NULL;

	hi.m_prefix = "gbdocid";
	hi.m_desc = "docid";
	bufLen = sprintf ( buf , "%llu", *uqd ) ;
	if ( ! hashString( buf , &hi ) ) return NULL;
	//hi.m_prefix = "gbdocid";
	//hi.m_desc = "docid";
	//bufLen = sprintf ( buf , "%llu", *uqd ) ;
	//if ( ! hashString( buf , &hi ) ) return NULL;

	// . then the url. url: site: ip: etc. terms
	// . do NOT hash non-fielded terms so we do not get "status"
	// results poluting the serps => false
	if ( ! hashUrl ( &tt4 , false ) ) return NULL;

	// hash the last spidered date, very useful!
	hi.m_hashGroup = 0;// this doesn't matter, it's a numeric field
	hi.m_desc = "last spidered date";
	// make this different so it doesn't coexist with regular results
	// when someone does a gbsortby:gbspiderdate query
	//hi.m_prefix = "gbreplyspiderdate";
	hi.m_prefix = "gbspiderdate";
	if ( reply->m_spideredTime <= 0 ) { char *xx=NULL;*xx=0; }
	bufLen = sprintf ( buf , "%lu", reply->m_spideredTime );
	if ( ! hashNumber ( buf , buf , bufLen , &hi ) ) return NULL;
	// false --> do not hash the gbdoc* terms
	hashDateNumbers ( &tt4 , false );

	// store keys in safebuf then to make our own meta list
	addTable144 ( &tt4 , *uqd , &m_spiderReplyMetaList );
@@ -25654,13 +25663,24 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
	return true;
}

// slightly greater than m_spideredTime, which is the download time.
// we use this for sorting as well, like for the widget so things
// don't really get added out of order and not show up in the top spot
// of the widget list.
long XmlDoc::getIndexedTime() {
	if ( m_indexedTimeValid ) return m_indexedTime;
	m_indexedTime = getTimeGlobal();
	return m_indexedTime;
}

// . hash dates for sorting by using gbsortby: and gbrevsortby:
// . do 'gbsortby:gbspiderdate' as your query to see this in action
bool XmlDoc::hashDateNumbers ( HashTableX *tt ) {
bool XmlDoc::hashDateNumbers ( HashTableX *tt , bool hashgbdocTerms) {

	// stop if already set
	if ( ! m_spideredTimeValid ) return true;

	long indexedTime = getIndexedTime();

	// first the last spidered date
	HashInfo hi;
@@ -25671,10 +25691,38 @@ bool XmlDoc::hashDateNumbers ( HashTableX *tt ) {

	char buf[64];
	long bufLen = sprintf ( buf , "%lu", m_spideredTime );

	if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
		return false;

	// and index time is >= spider time, so you want to sort by that for
	// the widget for instance
	hi.m_desc = "last indexed date";
	hi.m_prefix = "gbindexdate";
	bufLen = sprintf ( buf , "%lu", indexedTime );
	if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
		return false;

	// do not index the rest if we are a "spider reply" document
	// which is like a fake document for seeing spider statuses
	if ( ! hashgbdocTerms ) return true;

	// now for CT_STATUS spider status "documents" we also index
	// gbspiderdate so index this so we can just do a
	// gbsortby:gbdocspiderdate and only get real DOCUMENTS not the
	// spider status "documents"
	hi.m_desc = "doc last spidered date";
	hi.m_prefix = "gbdocspiderdate";
	bufLen = sprintf ( buf , "%lu", m_spideredTime );
	if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
		return false;

	hi.m_desc = "doc last indexed date";
	hi.m_prefix = "gbdocindexdate";
	bufLen = sprintf ( buf , "%lu", indexedTime );
	if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
		return false;

	// all done
	return true;
}
@@ -26285,35 +26333,33 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool hashNonFieldTerms ) {
	if ( ! hashSingleTerm(ext,elen,&hi ) ) return false;

	setStatus ( "hashing gbdocid" );
	hi.m_prefix = "gbdocid";
	char buf2[32];
	sprintf(buf2,"%llu",(m_docId) );
	if ( ! hashSingleTerm(buf2,gbstrlen(buf2),&hi) ) return false;

	// if indexing a json diffbot object, index
	// gbparenturl:xxxx of the original url from which the json was
	// datamined. we use this so we can act as a diffbot json cache.
	if ( m_isDiffbotJSONObject ) {
		setStatus ( "hashing gbparenturl term");
		char *p = fu->getUrl() + fu->getUrlLen() - 1;
		// back up to - as in "http://xyz.com/foo-diffbotxyz123456"
		for ( ; *p && *p != '-' ; p-- );
		// set up the hashing parms
		hi.m_hashGroup = HASHGROUP_INTAG;
		hi.m_tt = tt;
		hi.m_desc = "diffbot parent url";
		// append a "www." as part of normalization
		uw.set ( fu->getUrl() , p - fu->getUrl() , true );
		hi.m_prefix = "gbparenturl";
		if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
			return false;
	}

	if ( ! hashNonFieldTerms ) return true;

	setStatus ( "hashing url mid domain");
	// the final score
	//long plainScore = (long)(256.0 * boost1 * boost2 * fw);
	// update parms
	hi.m_prefix = NULL;
	hi.m_desc = "middle domain";//tmp3;
	hi.m_hashGroup = HASHGROUP_INURL;
	// if parm "index article content only" is true, do not index this!
	//if ( m_eliminateMenus ) plainScore = 0;
	//char *mid = fu->getMidDomain ();
	//long mlen = fu->getMidDomainLen();
	//hi.m_desc = "url mid dom";
	//if ( ! hashString ( mid,mlen ,&hi ) ) return false;
	//hi.m_desc = "url host";
	char *host = fu->getHost ();
	long hlen = fu->getHostLen ();
	if ( ! hashString ( host,hlen,&hi)) return false;

	setStatus ( "hashing url path");

	// hash the path plain
	if ( ! hashString (path,plen,&hi) ) return false;

	setStatus ( "hashing SiteGetter terms");

	//
@@ -26337,6 +26383,10 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool hashNonFieldTerms ) {
	// . that way we do not confuse all the pages in dictionary.com or
	// wikipedia.org as subsites!!
	if ( ! m_links.hasSubdirOutlink() ) add = false;

	char *host = fu->getHost ();
	long hlen = fu->getHostLen ();

	// tags from here out
	hi.m_hashGroup = HASHGROUP_INTAG;
	hi.m_shardByTermId = true;
@@ -26371,30 +26421,28 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool hashNonFieldTerms ) {
	hi.m_prefix = "urlhashdiv100";
	if ( ! hashString(buf,blen,&hi) ) return false;

	setStatus ( "hashing gbdocid" );
	hi.m_prefix = "gbdocid";
	char buf2[32];
	sprintf(buf2,"%llu",(m_docId) );
	if ( ! hashSingleTerm(buf2,gbstrlen(buf2),&hi) ) return false;

	// if indexing a json diffbot object, index
	// gbparenturl:xxxx of the original url from which the json was
	// datamined. we use this so we can act as a diffbot json cache.
	if ( m_isDiffbotJSONObject ) {
		setStatus ( "hashing gbparenturl term");
		char *p = fu->getUrl() + fu->getUrlLen() - 1;
		// back up to - as in "http://xyz.com/foo-diffbotxyz123456"
		for ( ; *p && *p != '-' ; p-- );
		// set up the hashing parms
		hi.m_hashGroup = HASHGROUP_INTAG;
		hi.m_tt = tt;
		hi.m_desc = "diffbot parent url";
		// append a "www." as part of normalization
		uw.set ( fu->getUrl() , p - fu->getUrl() , true );
		hi.m_prefix = "gbparenturl";
		if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
			return false;
	}
	setStatus ( "hashing url mid domain");
	// the final score
	//long plainScore = (long)(256.0 * boost1 * boost2 * fw);
	// update parms
	hi.m_prefix = NULL;
	hi.m_desc = "middle domain";//tmp3;
	hi.m_hashGroup = HASHGROUP_INURL;
	// if parm "index article content only" is true, do not index this!
	//if ( m_eliminateMenus ) plainScore = 0;
	//char *mid = fu->getMidDomain ();
	//long mlen = fu->getMidDomainLen();
	//hi.m_desc = "url mid dom";
	//if ( ! hashString ( mid,mlen ,&hi ) ) return false;
	//hi.m_desc = "url host";
	if ( ! hashString ( host,hlen,&hi)) return false;

	setStatus ( "hashing url path");

	// hash the path plain
	if ( ! hashString (path,plen,&hi) ) return false;

	return true;
}
@@ -32081,7 +32129,7 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
	);

	/*
	char *ms = "-1";
	if ( m_minPubDate != -1 ) ms = asctime(gmtime ( &m_minPubDate ));
	sb->safePrintf (
@@ -32097,7 +32145,7 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
	"<td>max pub date</td>"
	"<td>%s UTC</td>"
	"</tr>\n" , ms );

	*/

	// our html template fingerprint
	sb->safePrintf ("<tr><td>tag pair hash 32</td><td>");
XmlDoc.h (15 changed lines)

@@ -280,8 +280,12 @@ class XmlDoc {
	long m_siteNumInlinksUniqueIp; // m_siteNumInlinksFresh
	long m_siteNumInlinksUniqueCBlock; // m_sitePop;
	time_t m_spideredTime;
	time_t m_minPubDate;
	time_t m_maxPubDate;
	// just don't throw away any relevant SpiderRequests and we have
	// the data that m_minPubDate and m_maxPubDate provided
	//time_t m_minPubDate;
	//time_t m_maxPubDate;
	time_t m_indexedTime; // slightly > m_spideredTime
	uint32_t m_reserved32;
	time_t m_pubDate; // aka m_datedbDate
	//time_t m_nextSpiderTime;
	time_t m_firstIndexedDate;
@@ -473,6 +477,10 @@ class XmlDoc {
	// we now call this right away rather than at download time!
	long getSpideredTime();

	// time right before adding the termlists to the index, etc.
	// whereas spider time is the download time
	long getIndexedTime();

	// another entry point, like set3() kinda
	bool loadFromOldTitleRec ();

@@ -773,7 +781,7 @@ class XmlDoc {
	bool hashDMOZCategories ( class HashTableX *table ) ;
	bool hashLinks ( class HashTableX *table ) ;
	bool hashUrl ( class HashTableX *table , bool hashNonFieldTerms=true) ;
	bool hashDateNumbers ( class HashTableX *tt ) ;
	bool hashDateNumbers ( class HashTableX *tt , bool hashAll ) ;
	bool hashSections ( class HashTableX *table ) ;
	bool hashIncomingLinkText ( class HashTableX *table ,
				    bool hashAnomalies ,
@@ -1193,6 +1201,7 @@ class XmlDoc {
	bool m_firstIpValid;
	bool m_spideredTimeValid;
	//bool m_nextSpiderTimeValid;
	bool m_indexedTimeValid;
	bool m_firstIndexedValid;
	bool m_isInIndexValid;
	bool m_wasInIndexValid;
gb.conf (382 lines, file deleted)

@@ -1,382 +0,0 @@
# All <, >, " and # characters that are values for a field contained herein
# must be represented as <, >, " and # respectively.

# Mem available to this process. May be exceeded due to fragmentation.
<maxMem>4000000000</>

# Below the various Gigablast databases are configured.
# <*dbMaxTreeMem> - mem used for holding new recs
# <*dbMaxDiskPageCacheMem> - disk page cache mem for this db
# <*dbMaxCacheMem> - cache mem for holding single recs
# <*dbSaveCache> - save the rec cache on exit?
# <*dbMaxCacheAge> - max age (seconds) for recs in rec cache
# See that Stats page for record counts and stats.

# How many bytes should be used for caching DNS replies?
<dnsMaxCacheMem>128000</>

# A tagdb record assigns a url or site to a ruleset. Each tagdb record is
# about 100 bytes or so.
<tagdbMaxTreeMem>1028000</>
<tagdbMaxPageCacheMem>200000</>

# A catdb record assigns a url or site to DMOZ categories. Each catdb record
# is about 100 bytes.
<catdbMaxTreeMem>1000000</>
<catdbMaxPageCacheMem>25000000</>
<catdbMaxCacheMem>0</>

# Clusterdb caches small records for site clustering and deduping.
<clusterdbMaxTreeMem>1000000</>
<clusterdbSaveCache>0</>

# Max memory for dup vector cache.
<maxVectorCacheMem>10000000</>

# Robotdb caches robot.txt files.
<robotdbMaxCacheMem>128000</>
<robotdbSaveCache>0</>
<linkdbMaxPageCacheMem>0</>
<statsdbMaxTreeMem>5000000</>
<statsdbMaxCacheMem>0</>
<statsdbMaxDiskPageCacheMem>1000000</>

# Maximum bytes of a doc that can be sent before having to read more from disk
<httpMaxSendBufSize>128000</>

# Bytes to use for caching search result pages.
<searchResultsMaxCacheMem>100000</>

# Read only mode does not allow spidering.
<readOnlyMode>0</>

# Controls all spidering for all collections
<spideringEnabled>1</>

# What is the maximum number of web pages the spider is allowed to download
# simultaneously for ALL collections PER HOST?
<maxTotalSpiders>100</>

# Can people use the add url interface to add urls to the index?
<addUrlEnabled>1</>

# Save data in memory to disk after this many minutes have passed without the
# data having been dumped or saved to disk. Use 0 to disable.
<autoSaveFrequency>5</>

# Maximum sockets available to serve incoming HTTP requests. Too many
# outstanding requests will increase query latency. Excess requests will
# simply have their sockets closed.
<maxHttpSockets>100</>

# Maximum sockets available to serve incoming HTTPS requests. Like max http
# sockets, but for secure sockets.
<maxHttpsSockets>100</>

# Identification seen by web servers when the Gigablast spider downloads their
# web pages. It is polite to insert a contact email address here so webmasters
# that experience problems from the Gigablast spider have somewhere to vent.
<spiderUserAgent><![CDATA[GigablastOpenSource/1.0]]></>

# If this is true, gb will send Accept-Encoding: gzip to web servers when
# doing http downloads.
<askForGzippedDocsWhenDownloading>0</>

# How many seconds should we cache a search results page for?
<searchResultsCacheMaxAge>10800</>

# Keep track of ips which do queries, disallow non-customers from hitting us
# too hard.
<autobanIPsWhichViolateTheQueriesPerDayQuotas>0</>

# If a call to a message callback or message handler in the udp server takes
# more than this many milliseconds, then log it. Logs 'udp: Took %lli ms to
# call callback for msgType=0x%hhx niceness=%li'. Use -1 or less to disable
# the logging.
<maxDelayBeforeLoggingACallbackOrHandler>-1</>

# Sends emails to admin if a host goes down.
<sendEmailAlerts>0</>

# Do not send email alerts about dead hosts to anyone except
# sysadmin@gigablast.com between the times given below unless all the twins of
# the dead host are also dead. Instead, wait till after if the host is still
# dead.
<delayNonCriticalEmailAlerts>0</>

# Email alerts will include the cluster name
<clusterName><![CDATA[unspecified]]></>

# Send an email after a host has not responded to successive pings for this
# many milliseconds.
<sendEmailTimeout>62000</>

# Send email alerts when query success rate goes below this threshold.
# (percent rate between 0.0 and 1.0)
<querySuccessRateThreshold>0.850000</>

# Send email alerts when average query latency goes above this threshold. (in
# seconds)
<averageQueryLatencyThreshold>2.000000</>

# Record this number of query times before calculating average query latency.
<numberOfQueryTimesInAverage>300</>

# At what temperature in Celsius should we send an email alert if a hard drive
# reaches it?
<maxHardDriveTemperature>45</>

# Look for this string in the kernel buffer for sending email alert. Useful
# for detecting some strange hard drive failures that really slow performance.
<errorString1><![CDATA[]]></>

# Look for this string in the kernel buffer for sending email alert. Useful
# for detecting some strange hard drive failures that really slow performance.
<errorString2><![CDATA[]]></>

# Look for this string in the kernel buffer for sending email alert. Useful
# for detecting some strange hard drive failures that really slow performance.
<errorString3><![CDATA[]]></>

# Sends to email address 1 through email server 1.
<sendEmailAlertsToEmail1>0</>

# Sends to email address 1 through email server 1 if any parm is changed.
<sendParmChangeEmailAlertsToEmail1>0</>

# Connects to this IP or hostname directly when sending email 1. Use
# <i>apt-get install sendmail</i> to install sendmail on that IP or hostname.
# Add <i>From:10.5 RELAY</i> to /etc/mail/access to allow sendmail to forward
# email it receives from gigablast if gigablast hosts are on the 10.5.*.* IPs.
# Then run <i>/etc/init.d/sendmail restart</i> as root to pick up those
# changes so sendmail will forward Gigablast's mail to the address you give
# below.
<emailServer1><![CDATA[127.0.0.1]]></>

# Sends to this address when sending email 1
<emailAddress1><![CDATA[4081234567@vtext.com]]></>

# The from field when sending email 1
<fromEmailAddress1><![CDATA[sysadmin@mydomain.com]]></>

# Sends to email address 2 through email server 2.
<sendEmailAlertsToEmail2>0</>

# Sends to email address 2 through email server 2 if any parm is changed.
<sendParmChangeEmailAlertsToEmail2>0</>

# Connects to this server directly when sending email 2
<emailServer2><![CDATA[mail.mydomain.com]]></>

# Sends to this address when sending email 2
<emailAddress2><![CDATA[]]></>

# The from field when sending email 2
<fromEmailAddress2><![CDATA[sysadmin@mydomain.com]]></>

# Sends to email address 3 through email server 3.
<sendEmailAlertsToEmail3>0</>

# Sends to email address 3 through email server 3 if any parm is changed.
<sendParmChangeEmailAlertsToEmail3>0</>

# Connects to this server directly when sending email 3
<emailServer3><![CDATA[mail.mydomain.com]]></>

# Sends to this address when sending email 3
<emailAddress3><![CDATA[]]></>

# The from field when sending email 3
<fromEmailAddress3><![CDATA[sysadmin@mydomain.com]]></>

# IP address of the primary DNS server. Assumes UDP port 53. REQUIRED FOR
# SPIDERING! Use Google's public DNS 8.8.8.8 as default.
<dns0>8.8.8.8</>

# IP address of the secondary DNS server. Assumes UDP port 53. Will be
# accessed in conjunction with the primary dns, so make sure this is always
# up. An ip of 0 means disabled. Google's secondary public DNS is 8.8.4.4.
<dns1>8.8.4.4</>

# All hosts send to these DNSes based on hash of the subdomain to try to split
# DNS load evenly.
<dns2>0.0.0.0</>
<dns3>0.0.0.0</>
<dns4>0.0.0.0</>
<dns5>0.0.0.0</>
<dns6>0.0.0.0</>
<dns7>0.0.0.0</>
<dns8>0.0.0.0</>
<dns9>0.0.0.0</>
<dns10>0.0.0.0</>
<dns11>0.0.0.0</>
<dns12>0.0.0.0</>
<dns13>0.0.0.0</>
<dns14>0.0.0.0</>
<dns15>0.0.0.0</>

# If enabled, gigablast will repair the rdbs as specified by the parameters
# below. When a particular collection is in repair mode, it can not spider or
# merge titledb files.
<repairModeEnabled>0</>

# Comma or space separated list of the collections to repair or rebuild.
<collectionsToRepairOrRebuild><![CDATA[main]]></>

# In bytes.
<memoryToUseForRepair>300000000</>

# Maximum number of outstanding inject spiders for repair.
<maxRepairSpiders>32</>

# If enabled, gigablast will reinject the content of all title recs into a
# secondary rdb system. That will the primary rdb system when complete.
<fullRebuild>0</>

# If enabled, gigablast will keep the new spiderdb records when doing the full
# rebuild or the spiderdb rebuild.
<keepNewSpiderdbRecs>1</>

# If enabled, gigablast will recycle the link info when rebuilding titledb.
<recycleLinkInfo>0</>

# If enabled, gigablast will rebuild this rdb
<rebuildTitledb>1</>

# If enabled, gigablast will rebuild this rdb
<rebuildPosdb>0</>

# If enabled, gigablast will rebuild this rdb
<rebuildClusterdb>0</>

# If enabled, gigablast will rebuild this rdb
<rebuildSpiderdb>0</>

# If enabled, gigablast will rebuild this rdb
<rebuildLinkdb>0</>

# If disabled, gigablast will skip root urls.
<rebuildRootUrls>1</>

# If disabled, gigablast will skip non-root urls.
<rebuildNonrootUrls>1</>

# When rebuilding spiderdb and scanning it for new spiderdb records, should a
# tagdb lookup be performed? Runs much much faster without it. Will also keep
# the original doc quality and spider priority in tact.
<skipTagdbLookup>0</>

# add Ips here to bar them from accessing this gigablast server.
<banIps><![CDATA[]]></>

# add Ips here to give them an infinite query quota.
<allowIps><![CDATA[]]></>

# Don't try to autoban queries that have one of these codes. Also, the code
# must be valid for us to use &uip=IPADDRESS as the IP address of the
# submitter for purposes of autoban AND purposes of addurl daily quotas.
<validCodes><![CDATA[]]></>

# Append extra default parms to queries that match certain substrings.
# Format: text to match in url, followed by a space, then the list of extra
# parms as they would appear appended to the url. One match per line.
<extraParms><![CDATA[]]></>

# ban any query that matches this list of substrings. Must match all
# comma-separated strings on the same line. ('\n' = OR, ',' = AND)
<banRegex><![CDATA[]]></>

# Any matching password will have administrative access to Gigablast and all
# collections.
# Use <masterPassword> tag.

# Any IPs in this list will have administrative access to Gigablast and all
# collections.
# Use <masterIp> tag.

# Log GET and POST requests received from the http server?
<logHttpRequests>1</>

# Should we log queries that are autobanned? They can really fill up the log.
<logAutobannedQueries>1</>

# If query took this many millliseconds or longer, then log the query and the
# time it took to process.
<logQueryTimeThreshold>5000</>

# Log query reply in proxy, but only for those queries above the time
# threshold above.
<logQueryReply>0</>

# Log status of spidered or injected urls?
<logSpideredUrls>1</>

# Log messages if Gigablast runs out of udp sockets?
<logNetworkCongestion>0</>

# Log messages not related to an error condition, but meant more to give an
# idea of the state of the gigablast process. These can be useful when
# diagnosing problems.
<logInformationalMessages>1</>

# Log it when document not added due to quota breech. Log it when url is too
# long and it gets truncated.
<logLimitBreeches>0</>

# Log various debug messages.
<logDebugAdminMessages>0</>
<logDebugBuildMessages>0</>
<logDebugBuildTimeMessages>0</>
<logDebugDatabaseMessages>0</>
<logDebugDirtyMessages>0</>
<logDebugDiskMessages>0</>
<logDebugDnsMessages>0</>
<logDebugHttpMessages>0</>
<logDebugImageMessages>0</>
<logDebugLoopMessages>0</>
<logDebugLanguageDetectionMessages>0</>
<logDebugLinkInfo>0</>
<logDebugMemMessages>0</>
<logDebugMemUsageMessages>0</>
<logDebugNetMessages>0</>
<logDebugQueryMessages>0</>
<logDebugQuotaMessages>0</>
<logDebugRobotsMessages>0</>
<logDebugSpiderCacheMessages>0</>
<logDebugSpellerMessages>0</>
<logDebugSectionsMessages>0</>
<logDebugSeoInsertMessages>0</>
<logDebugSeoMessages>0</>
<logDebugStatsMessages>0</>
<logDebugSummaryMessages>0</>
<logDebugSpiderMessages>0</>
<logDebugUrlAttempts>0</>
<logDebugSpiderDownloads>0</>
<logDebugFacebook>0</>
<logDebugTagdbMessages>0</>
<logDebugTcpMessages>0</>
<logDebugThreadMessages>0</>
<logDebugTitleMessages>0</>
<logDebugTimedbMessages>0</>
<logDebugTopicMessages>0</>
<logDebugTopDocMessages>0</>
<logDebugUdpMessages>0</>
<logDebugUnicodeMessages>0</>
<logDebugRepairMessages>0</>
<logDebugPubDateExtractionMessages>0</>

# Log various timing related messages.
<logTimingMessagesForBuild>0</>

# Log various timing related messages.
<logTimingMessagesForAdmin>0</>
<logTimingMessagesForDatabase>0</>
<logTimingMessagesForNetworkLayer>0</>
<logTimingMessagesForQuery>0</>

# Log various timing related messages.
<logTimingMessagesForSpcache>0</>
<logTimingMessagesForRelatedTopics>0</>

# Log reminders to the programmer. You do not need this.
<logReminderMessages>0</>
main.cpp (4 changed lines)

@@ -1572,7 +1572,9 @@ int main2 ( int argc , char *argv[] ) {
		log("db: HttpServer init failed. Another gb "
		    "already running? If not, try editing "
		    "./hosts.conf to "
		    "change the port from %li to something bigger"
		    "change the port from %li to something bigger. "
		    "Or stop gb by running 'gb stop' or by "
		    "clicking 'save & exit' in the master controls."
		    , (long)httpPort );
		// this is dangerous!!! do not do the shutdown thing
		return 1;