Merge branch 'master' into diffbot-matt

2014-04-11 00:29:40 -07:00 · 2014-04-11 00:29:40 -07:00 · 5d6975b2c2
commit 5d6975b2c2
parent 957c399a75 fe90148075
14 changed files with 143 additions and 48 deletions
--- a/Log.cpp
+++ b/Log.cpp
@ -31,6 +31,10 @@ long       g_dbufSize       = 0;
 // main process id
 static pid_t s_pid = -1;

+void Log::setPid ( ) {
+	s_pid = getpidtid();
+}
+
 Log::Log () { 
 	m_fd = -1; 
 	m_filename = NULL; 
@ -55,7 +59,8 @@ void Log::reset ( ) {

 bool Log::init ( char *filename ) {
 	// set the main process id
-	s_pid = getpidtid();
+	//s_pid = getpidtid();
+	setPid();
 	// init these
 	m_numErrors =  0;
 	m_bufPtr    =  0;
--- a/Log.h
+++ b/Log.h
@ -124,6 +124,8 @@ class Log {

 	void reset ( );

+	void setPid();
+
 	// save before exiting
 	void close () { dumpLog();  };

--- a/8
+++ b/8
@ -124,11 +124,11 @@ gb: $(OBJS) main.o $(LIBFILES)
 	$(CC) $(DEFS) $(CPPFLAGS) -o $@ main.o $(OBJS) $(LIBS)


-iana_charset.cpp: parse_iana_charsets.pl character-sets supported_charsets.txt
-	./parse_iana_charsets.pl < character-sets
+#iana_charset.cpp: parse_iana_charsets.pl character-sets supported_charsets.txt
+#	./parse_iana_charsets.pl < character-sets

-iana_charset.h: parse_iana_charsets.pl character-sets supported_charsets.txt
-	./parse_iana_charsets.pl < character-sets
+#iana_charset.h: parse_iana_charsets.pl character-sets supported_charsets.txt
+#	./parse_iana_charsets.pl < character-sets

 run_parser: test_parser
 	./test_parser ~/turkish.html
--- a/Msg40.cpp
+++ b/Msg40.cpp
@ -575,7 +575,7 @@ bool Msg40::getDocIds ( bool recall ) {

 	// make enough for ptrs
 	long need = sizeof(Msg3a *) * m_numCollsToSearch;
-	if ( ! m_msg3aPtrBuf.reserve ( need ) ) return NULL;
+	if ( ! m_msg3aPtrBuf.reserve ( need ) ) return true;
 	// cast the mem buffer
 	m_msg3aPtrs = (Msg3a **)m_msg3aPtrBuf.getBufStart();

--- a/Msg8b.cpp
+++ b/Msg8b.cpp
@ -508,6 +508,9 @@ void gotCatRecWrapper ( void *state ) { // , CatRec *catrec ) {
 // . each normal tagdb record has the following format:
 //      templateKey (12 bytes) then non-NULL-terminated site string
 bool Msg8b::gotList ( ) {
+	// ignore this...
+	if ( g_errno == ENOCOLLREC )
+		g_errno = 0;
 	// return on error
 	if (g_errno){
 		log("build: Had error getting ruleset record: %s.",
--- a/PageBasic.cpp
+++ b/PageBasic.cpp
@ -102,6 +102,7 @@ bool updateSiteListTables ( collnum_t collnum ,
 	// because we set it here from a call to CommandUpdateSiteList()
 	// because it requires all this computational crap.
 	char *op = cr->m_siteListBuf.getBufStart();
+
 	// scan and hash each line in it
 	for ( ; *op ; op++ ) {
 		// get end
--- a/PageResults.cpp
+++ b/PageResults.cpp
@ -116,6 +116,8 @@ bool sendReply ( State0 *st , char *reply ) {
 		// . one hour cache time... no 1000 hours, basically infinite
 		// . no because if we redo the query the results are cached
 		long cacheTime = 3600;//*1000;
+		// no... do not use cache
+		cacheTime = -1;
 		// the "Check it" link on add url uses &usecache=0 to tell
 		// the browser not to use its cache...
 		//if ( hr->getLong("usecache",-1) == 0 ) cacheTime = 0;
--- a/Parms.cpp
+++ b/Parms.cpp
@ -431,6 +431,24 @@ bool CommandRestartColl ( char *rec , WaitEntry *we ) {
 	// to avoid user confusion
 	if ( cr ) cr->m_spideringEnabled = 1;

+	if ( ! cr ) return true;
+
+	//
+	// repopulate spiderdb with the same sites
+	//
+
+	char *oldSiteList = cr->m_siteListBuf.getBufStart();
+	// do not let it have the buf any more
+	cr->m_siteListBuf.detachBuf();
+	// can't leave it NULL, safebuf parms do not like to be null
+	cr->m_siteListBuf.nullTerm();
+	// re-add the buf so it re-seeds spiderdb. it will not dedup these
+	// urls in "oldSiteList" with "m_siteListBuf" which is now empty.
+	// "true" = addSeeds.
+	updateSiteListTables ( newCollnum , true , oldSiteList );
+	// now put it back
+	if ( oldSiteList ) cr->m_siteListBuf.safeStrcpy ( oldSiteList );
+
 	// all done
 	return true;
 }
@ -468,9 +486,27 @@ bool CommandResetColl ( char *rec , WaitEntry *we ) {
 	// get the new collrec. collname is same but collnum
 	// will be different.
 	CollectionRec *cr = g_collectiondb.getRec ( newCollnum );
-	// if reset from crawlbot api page then enable spiders
-	// to avoid user confusion
-	if ( cr ) cr->m_spideringEnabled = 1;
+
+	if ( ! cr ) return true;
+
+	//
+	// repopulate spiderdb with the same sites
+	//
+
+	char *oldSiteList = cr->m_siteListBuf.getBufStart();
+	// do not let it have the buf any more
+	cr->m_siteListBuf.detachBuf();
+	// can't leave it NULL, safebuf parms do not like to be null
+	cr->m_siteListBuf.nullTerm();
+	// re-add the buf so it re-seeds spiderdb. it will not dedup these
+	// urls in "oldSiteList" with "m_siteListBuf" which is now empty.
+	// "true" = addSeeds.
+	updateSiteListTables ( newCollnum , true , oldSiteList );
+	// now put it back
+	if ( oldSiteList ) cr->m_siteListBuf.safeStrcpy ( oldSiteList );
+
+	// turn spiders off
+	if ( cr ) cr->m_spideringEnabled = 0;

 	return true;
 }
@ -7709,8 +7745,9 @@ void Parms::init ( ) {
 	*/

 	m->m_title = "restart collection";
-	m->m_desc  = "Remove all documents from this collection and restart "
-		"spidering.";// If you do this accidentally there "
+	m->m_desc  = "Remove all documents from the collection and re-add "
+		"seed urls from site list.";
+	// If you do this accidentally there "
 	//"is a <a href=/admin.html#recover>recovery procedure</a> to "
 	//	"get back the trashed data.";
 	m->m_cgi   = "restart";
@ -8756,11 +8793,12 @@ void Parms::init ( ) {
 	m->m_page  = PAGE_SPIDER;
 	m->m_func2 = CommandResetColl;
 	m->m_cast  = 1;
+	m->m_flags = PF_HIDDEN;
 	m++;

 	m->m_title = "restart collection";
-	m->m_desc  = "Remove all documents from the collection and start "
-		"spidering over again.";
+	m->m_desc  = "Remove all documents from the collection and re-add "
+		"seed urls from site list.";
 	m->m_cgi   = "restart";
 	m->m_type  = TYPE_CMD;
 	m->m_page  = PAGE_SPIDER;
--- a/Speller.cpp
+++ b/Speller.cpp
@ -998,8 +998,13 @@ char *Speller::getRandomWord() {
 // dict into memory using Language.loadWordList(), loadTitleRecDict(), etc
 bool Speller::loadUnifiedDict() {

+	bool building = false;
+
+ reload:
+
 	bool needRebuild = false;

+	m_unifiedBuf.purge();
 	m_unifiedBuf.setLabel("unibuf");

 	// this MUST be there
@ -1046,6 +1051,13 @@ bool Speller::loadUnifiedDict() {
 		return true;
 	}

+	if ( building ) {
+		log("gb: rebuild failed. exiting.");
+		exit(0);
+	}
+
+	building = true;
+
 	log("gb: REBUILDING unifiedDict-buf.txt and unifiedDict-map.dat");

 	// just in case that was there and the buf wasn't
@ -1385,6 +1397,9 @@ bool Speller::loadUnifiedDict() {
 	if ( m_unifiedDict.save(g_hostdb.m_dir,"unifiedDict-map.dat")<=0 )
 		return false;

+	// start over and load what we created
+	goto reload;
+
 	// hmmm... seems like we need to re-run for some reason
 	log("spell: PLEASE RERUN gb");
 	log("spell: PLEASE RERUN gb");
--- a/TcpServer.cpp
+++ b/TcpServer.cpp
@ -326,8 +326,10 @@ retry19:
 		close ( m_sock );
 		fprintf(stderr,"Failed to bind socket on port %li: %s."
 			"\n"
-			"Are you already running gb?\n",
-		   	(long)port,mstrerror(g_errno));
+			"Are you already running gb?\n"
+			"If not, try editing ./hosts.conf to\n"
+			"change the port from %li to something bigger.\n",
+		   	(long)port,mstrerror(g_errno),(long)port);
 		return false;
 	}
 	close ( m_sock );
--- a/Threads.cpp
+++ b/Threads.cpp
@ -178,17 +178,7 @@ Threads::Threads ( ) {
 	m_initialized = false;
 }

-bool Threads::init ( ) {
-
-	if ( m_initialized ) return true;
-	m_initialized = true;
-
-	m_needsCleanup = false;
-	//m_needBottom = false;
-
-	// sanity check
-	if ( sizeof(pthread_t) > sizeof(pid_t) ) { char *xx=NULL;*xx=0; }
-
+void Threads::setPid ( ) {
 	// set s_pid to the main process id
 #ifdef PTHREADS
 	s_pid = pthread_self();
@ -207,7 +197,20 @@ bool Threads::init ( ) {
 #else
 	s_pid = getpid();
 #endif
+}

+bool Threads::init ( ) {
+
+	if ( m_initialized ) return true;
+	m_initialized = true;
+
+	m_needsCleanup = false;
+	//m_needBottom = false;
+
+	// sanity check
+	if ( sizeof(pthread_t) > sizeof(pid_t) ) { char *xx=NULL;*xx=0; }
+
+	setPid();

 #ifdef _STACK_GROWS_UP
 	return log("thread: Stack growing up not supported.");
--- a/Threads.h
+++ b/Threads.h
@ -172,6 +172,7 @@ class Threads {

 	long getStack ( ) ;
 	void returnStack ( long si );
+	void setPid();
 	void reset ( ) ;

 	// . we restrict the # of threads based on their type
--- a/html/admin.html
+++ b/html/admin.html
@ -23,13 +23,17 @@ A work-in-progress <a href=/compare.html>comparison to SOLR</a>.
 <h1>Table of Contents</h1>
 <br>
 <a href=#quickstart>Quick Start</a><br><br>
+<a href=#src>Build from Source</a><br><br>
 <a href=#features>Features</a><br><br>
 <a href=/searchfeed.html>XML/REST Search Feed API</a><br><br>


 <!--<a href=#weighting>Weighting Query Terms</a> - how to pass in your own query term weights<br><br>-->

-<a href=#requirements>Hardware Requirements</a> - what is required to run gigablast<br><br><a href=#perf>Performance Specifications</a> - various statistics.<br><br><a href=#files>List of Files</a> - the necessary files to run Gigablast<br><br><a href=#cmdline>Command Line Options</a> - various command line options (coming soon)<br><br><a href=#clustermaint>Cluster Maintenance</a> - running Gigablast on a cluster of computers.<br><br><a href=#trouble>Troubleshooting</a> - how to fix problems<br><br><a href=#disaster>Disaster Recovery</a> - dealing with a crashed host<br><br><a href=#security>The Security System</a> - how to control access<br><br>
+<a href=#requirements>Hardware Requirements</a> - what is required to run gigablast<br><br><a href=#perf>Performance Specifications</a> - various statistics.<br><br><a href=#files>List of Files</a> - the necessary files to run Gigablast<br><br><a href=#cmdline>Command Line Options</a> - various command line options (coming soon)<br><br><a href=#clustermaint>Cluster Maintenance</a> - running Gigablast on a cluster of computers.<br><br><a href=#trouble>Troubleshooting</a> - how to fix problems<br><br><a href=#disaster>Disaster Recovery</a> - dealing with a crashed host
+<!--<br><br>
+<a href=#security>The Security System</a> - how to control access-->
+<br><br>

 <a href=#build>Building an Index</a> - how to start building your index<br><br>
 <a href=#spider>The Spider</a> - all about Gigabot, Gigablast's crawling agent<br><br>
@ -44,8 +48,14 @@ A work-in-progress <a href=/compare.html>comparison to SOLR</a>.
 <br><br><a href=#dmoz>Building a DMOZ Based Directory</a> - build a web directory based on open DMOZ data<br><br>

 <a href=#optimizing>Optimizing</a> - optimizing Gigablast's spider and query performance<br><br>
-<a href=#logs>The Log System</a> - how Gigablast logs information<br><br><a href=#config>gb.conf</a> - describes the gb configuration file<br><br><a href=#hosts>hosts.conf</a> - the file that describes all participating hosts in the network<br><br>
+<a href=#logs>The Log System</a> - how Gigablast logs information
 <!--
+<br><br>
+<a href=#config>gb.conf</a> - describes the gb configuration file
+<br><br>
+<a href=#hosts>hosts.conf</a> - the file that describes all participating hosts in the network
+<br><br>
+
 <a href=#stopwords>Stopwords</a> - list of common words generally ignored at query time<br><br>
 <a href=#phrasebreaks>Phrase Breaks</a> - list of punctuation that breaks a phrase<br><br>
 -->
@ -53,47 +63,52 @@ A work-in-progress <a href=/compare.html>comparison to SOLR</a>.
 <br><br><a name=quickstart></a>
 <h1>Quick Start</h1>

+Until I get the binary packages ready, <a href=#src>build from the source code</a>. It should only take about 30 seconds to type the three commands.
+<!--
 Requirements: You will need an Intel or AMD system running Linux and at least 4GB of RAM.<br><br>

-Install the <a href=http://www.gigablast.com/gigablast-1.0-1.deb>Gigablast package for Ubuntu or Debian</a> or install the <a href=http://www.gigablast.com/gigaablast-1.0-1.rpm>Gigablast package for RedHat</a>.
+Install the <a href=http://www.gigablast.com/gigablast-1.0-1.deb>Gigablast package for Ubuntu or Debian</a> or install the <a href=http://www.gigablast.com/gigablast-1.0-1.rpm>Gigablast package for RedHat</a>.

 <br><br>
 If you run into any bugs, let me know so I can fix them right away: mattdwells@hotmail.com.
+-->

 <br>
 <br>
-
+<a name=src></a>
 <h1>Build From Source</h1>

 Requirements: You will need an Intel or AMD system running Linux and at least 4GB of RAM.<br><br>

 If you run into any bugs, let me know so I can fix them right away: mattdwells@hotmail.com.
 <br><br>
-
+<!--
 You will need the following packages installed<br>
 <ul>
-<li>apt-get install make
-<li>apt-get install g++
+<li>do a <b>apt-get install make g++ gcc-multilib</b>
+-->
+<!--<li>apt-get install g++
 <li>apt-get install gcc-multilib <i>(for 32-bit compilation support)</i>
+-->
 <!--<li>apt-get install libssl-dev <i>(for the includes, 32-bit libs are here)</i>-->
 <!--<li>apt-get install libplot-dev <i>(for the includes, 32-bit libs are here)</i>-->
 <!--<li>apt-get install lib32stdc++6-->
 <!--<li>apt-get install ia32-libs-->
-<li>I supply libstdc++.a but you might need the include headers and have to do <i>apt-get install lib32stdc++6</i> or something.
+<!--<li>I supply libstdc++.a but you might need the include headers and have to do <b>apt-get install lib32stdc++6</b> or something.
 </ul>
+-->

-
-
-
-1. Run 'make' to compile. (e.g. use 'make -j 4' to compile on four cores)
-<br><br>
-2. Run './gb 0'  to start a single gigablast node which listens on port 8000.
-<br><br>
-3. The first time you run it you will have to wait for it to build some binary data files from the txt files it uses that are based on wiktionary and wikipedia that it uses to do synonyms and phrasing.  Check the log file to see when it completes.
-<br><br>
-4. Re-run it after it builds those binaries.
-<br><br>
-5. Go to the <a href=http://127.0.0.1:8000/>root page</a> to begin.
+<b>1.</b> Do <b>apt-get install make g++ gcc-multilib lib32stdc++6</b>
+<br>
+<b>2.</b> Download the <a href=https://github.com/gigablast/open-source-search-engine>Gigablast source code</a> using <b>wget --no-check-certificate "https://github.com/gigablast/open-source-search-engine/archive/master.zip"</b>, unzip it and cd into it.
+<br>
+<b>3.</b> Run <b>make</b> to compile. (e.g. use 'make -j 4' to compile on four cores)
+<br>
+<b>4.</b> Run <b>./gb 0</b>  to start a single gigablast node which listens on port 8000.
+<br>
+<b>5.</b> The first time you run gb, wait about 30 seconds for it to build some files. Check the log file to see when it completes.
+<br>
+<b>6.</b> Go to the <a href=http://127.0.0.1:8000/>root page</a> to begin.

 <br>

@ -329,13 +344,14 @@ For the purposes of this section, we assume the name of the cluster is gf and al
 <tr><td>It often does a dns lookup on each link if it has not encountered that subdomain before. Otherwise, the subdomain IP when first encountered is stored in tagdb in the <i>firstIp</i> field. You might try using more DNSes or disabling link spidering.
 </td></tr>
 </table>
+<!--
 <br><br><a name=security></a>
 <table cellpadding=1 border=0 width=100% bgcolor=#0079ba>
 <tr><td><center><b><font color=#ffffff size=+1>The Security System
 </td></tr></table>
 <br>
 Right now any local IP can adminster Gigablast, so any IP on the same network with a netmask of 255.255.255.0 can get in. There was an accounting system but it was disabled for simplicity. So we need to at least partially re-enable it, but still keep things simple for single administrators on small networks.
-<!--
+
 Every request sent to the Gigablast server is assumed to come from one of four types of users. A public user, a spam assassin, a collection admin, or a master admin. A collection admin has control over the controls corresponding to a particular collection. A spam assassin has control over even fewer controls over a particular collection in order to remove pages from it. A master admin has control over all aspects and all collections. <br><br>To verify a request is from an admin or spam assassin Gigablast requires that the request contain a password or come from a listed IP. To maintain these lists of passwords and IPs for the master admin, click on the "security" tab. To maintain them for a collection admin or for a spam assassin, click on the "access" tab for that collection. Alternatively, the master passwords and IPs can be edited in the gb.conf file in the working dir and collection admin passwords and IPs can be edited in the coll.conf file in the collections subdirectory in the working dir. <br><br>To add a further layer of security, Gigablast can server all of its pages through the https interface. By changing http:// to https:// and using the SSL port you specified in hosts.conf, all requests and responses will be made secure.-->
 <br><br>
 <a name=build></a>
--- a/main.cpp
+++ b/main.cpp
@ -1480,7 +1480,10 @@ int main2 ( int argc , char *argv[] ) {
 	tryagain:
 		if ( ! g_proxy.initHttpServer( httpPort, httpsPort ) ) {
 			log("db: HttpServer init failed. Another gb "
-			    "already running?" ); 
+			    "already running? If not, try editing "
+			    "./hosts.conf to "
+			    "change the port from %li to something bigger"
+			    , (long)httpPort ); 
 			// this is dangerous!!! do not do the shutdown thing
 			return 1;
 			// just open a socket to port X and send
@ -2763,6 +2766,8 @@ int main2 ( int argc , char *argv[] ) {
 		//fprintf(stderr,"done\n");
 		// set our new pid
 		g_mem.setPid();
+		g_threads.setPid();
+		g_log.setPid();
 	}

 	// initialize threads down here now so it logs to the logfile and
@ -3514,6 +3519,8 @@ int main2 ( int argc , char *argv[] ) {
 	// ok, now activate statsdb
 	g_statsdb.m_disabled = false;

+	log("db: gb is now ready");
+
 	// sync loop
 	//if ( ! g_sync.init() ) {
 	//	log("db: Sync init failed." ); return 1; }