clean up logging so i can see what's going on

mwells 2013-12-10 16:41:30 -08:00
parent db74af766b
commit 76bb3d05e1
15 changed files with 1612 additions and 69 deletions
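The pattern repeated below is the same throughout: bare log("prefix: ...") calls gain an explicit severity (LOG_INIT, LOG_INFO, or LOG_DEBUG), logf() calls are downgraded to plain log(), and ad-hoc subsystem prefixes like "coll:", "admin:", "tagdb:", "rdb:", and "base:" are folded into "db:". Below is a minimal sketch of the leveled convention being adopted, assuming a simple threshold filter; the level names and both call shapes come from the hunks themselves, but the filtering logic is an illustration, not gb's actual Log.cpp:

    #include <cstdarg>
    #include <cstdio>

    // ASSUMPTION: the level values and the g_logLevel threshold are
    // invented for illustration; only the names appear in the diff.
    enum { LOG_INIT = 1, LOG_INFO = 2, LOG_DEBUG = 3 };

    static int g_logLevel = LOG_INFO; // hypothetical verbosity knob

    // leveled form: log(LOG_INFO,"db: loading %s",file);
    static bool log ( long type , const char *fmt , ... ) {
            if ( type <= g_logLevel ) {
                    va_list ap;
                    va_start ( ap , fmt );
                    vfprintf ( stderr , fmt , ap );
                    va_end ( ap );
                    fputc ( '\n' , stderr );
            }
            // returning false lets call sites write "return log(...)"
            // on error paths, as Collectiondb::load() does below
            return false;
    }

The handful of calls left unleveled, like the save-failure messages in CollectionRec::save(), presumably go through an equivalent log(const char *fmt, ...) overload at some default severity.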

@@ -37,7 +37,7 @@ Collectiondb::Collectiondb ( ) {
 // reset rdb
 void Collectiondb::reset() {
-log("db: resetting collectiondb.");
+log(LOG_INFO,"db: resetting collectiondb.");
 for ( long i = 0 ; i < m_numRecs ; i++ ) {
 if ( ! m_recs[i] ) continue;
 mdelete ( m_recs[i], sizeof(CollectionRec), "CollectionRec" );
@@ -96,7 +96,7 @@ bool Collectiondb::load ( bool isDump ) {
 if ( ! d.open ()) return log("admin: Could not load collection config "
 "files.");
 // note it
-log(LOG_INIT,"admin: Loading collection config files.");
+log(LOG_INFO,"db: Loading collection config files.");
 // . scan through all subdirs in the collections dir
 // . they should be like, "coll.main/" and "coll.mycollection/"
 char *f;
@@ -118,7 +118,7 @@ bool Collectiondb::load ( bool isDump ) {
 return false;
 }
 // note it
-log(LOG_INIT,"admin: Loaded data for %li collections. Ranging from "
+log(LOG_INFO,"db: Loaded data for %li collections. Ranging from "
 "collection #0 to #%li.",m_numRecsUsed,m_numRecs-1);
 // update the time
 updateTime();
@@ -570,7 +570,7 @@ bool Collectiondb::registerCollRec ( CollectionRec *cr ,
 if ( ! g_doledb.addColl ( coll, verify ) ) goto hadError;
 // debug message
-log ( LOG_INFO, "admin: verified collection \"%s\" (%li).",
+log ( LOG_INFO, "db: verified collection \"%s\" (%li).",
 coll,(long)i);
 // tell SpiderCache about this collection, it will create a
@@ -703,7 +703,7 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum , WaitEntry *we ) {
 char *coll = cr->m_coll;
 // note it
-log("coll: deleting coll \"%s\"",coll);
+log(LOG_INFO,"db: deleting coll \"%s\"",coll);
 // we need a save
 m_needsSave = true;
@@ -1437,7 +1437,7 @@ bool CollectionRec::load ( char *coll , long i ) {
 // LOAD LOCAL
 sprintf ( tmp1 , "%scoll.%s.%li/localcrawlinfo.dat",
 g_hostdb.m_dir , m_coll , (long)m_collnum );
-log("coll: loading %s",tmp1);
+log(LOG_INFO,"db: loading %s",tmp1);
 m_localCrawlInfo.reset();
 SafeBuf sb;
 // fillfromfile returns 0 if does not exist, -1 on read error
@@ -1448,7 +1448,7 @@ bool CollectionRec::load ( char *coll , long i ) {
 // LOAD GLOBAL
 sprintf ( tmp1 , "%scoll.%s.%li/globalcrawlinfo.dat",
 g_hostdb.m_dir , m_coll , (long)m_collnum );
-log("coll: loading %s",tmp1);
+log(LOG_INFO,"db: loading %s",tmp1);
 m_globalCrawlInfo.reset();
 sb.reset();
 if ( sb.fillFromFile ( tmp1 ) > 0 )
@@ -1694,7 +1694,7 @@ bool CollectionRec::save ( ) {
 // binary now
 sb.safeMemcpy ( &m_localCrawlInfo , sizeof(CrawlInfo) );
 if ( sb.dumpToFile ( tmp ) == -1 ) {
-log("coll: failed to save file %s : %s",
+log("db: failed to save file %s : %s",
 tmp,mstrerror(g_errno));
 g_errno = 0;
 }
@@ -1707,7 +1707,7 @@ bool CollectionRec::save ( ) {
 // binary now
 sb.safeMemcpy ( &m_globalCrawlInfo , sizeof(CrawlInfo) );
 if ( sb.dumpToFile ( tmp ) == -1 ) {
-log("coll: failed to save file %s : %s",
+log("db: failed to save file %s : %s",
 tmp,mstrerror(g_errno));
 g_errno = 0;
 }

@@ -328,7 +328,7 @@ void Conf::setRootIps ( ) {
 for ( long i = 0 ; i < n ; i++ ) {
 m_rnsIps [i] = atoip(rootIps[i],gbstrlen(rootIps[i]));
 m_rnsPorts[i] = 53;
-log("dns: Using root nameserver #%li %s.",
+log(LOG_INIT,"dns: Using root nameserver #%li %s.",
 i,iptoa(m_rnsIps[i]));
 }
 }

@@ -699,7 +699,7 @@ bool File::unlink ( ) {
 // return false and set g_errno on error
 if ( status < 0 ) return false;
 // log it so we can see what happened to timedb!
-log("disk: unlinking %s", m_filename );
+log(LOG_INFO,"disk: unlinking %s", m_filename );
 // remove ourselves from the disk
 if ( ::unlink ( m_filename ) == 0 ) return true;
 // sync it to disk in case power goes out

@@ -346,10 +346,13 @@ void Json::test ( ) {
 long niceness = 0;
 JsonItem *ji = parseJsonStringIntoJsonItems ( json , niceness );
 // print them out?
-log("json: type0=%li",(long)ji->m_type);
+//log("json: type0=%li",(long)ji->m_type);
 // sanity test
 if ( ji->m_type != 6 ) { char *xx=NULL;*xx=0; }
 return;
 }
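The char *xx=NULL;*xx=0; line in this hunk (and in the Rdb and Spider hunks below) is gb's in-place assert: writing through a NULL pointer raises SIGSEGV on the spot, so a failed sanity check leaves a core dump pointing at the exact line, even in optimized builds. A conventional equivalent, purely as an illustration with a hypothetical helper name:

    #include <csignal>

    // hypothetical wrapper; gb inlines the null write directly instead
    static inline void gbassert ( bool cond ) {
            if ( ! cond ) raise ( SIGSEGV ); // die loudly, keep the core
    }

    // usage matching the sanity test above:
    //   gbassert ( ji->m_type == 6 );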

File diff suppressed because it is too large

@@ -17156,7 +17156,7 @@ bool sendPageSiteMap ( TcpSocket *s , HttpRequest *r ) {
 #include "HashTable.h"
 #include "Msg4.h"
 #include "AutoBan.h"
-#include "CollectionRec.h"
+//#include "CollectionRec.h"
 //#include "Links.h"
 #include "Users.h"
 #include "HashTableT.h"

@@ -3700,13 +3700,14 @@ bool Parms::setFromFile ( void *THIS ,
 // . all the collectionRecs have the same default file in
 // the workingDir/collections/default.conf
 // . so use our built in buffer for that
+/*
 if ( THIS != &g_conf && ! m_isDefaultLoaded ) {
 m_isDefaultLoaded = true;
 File f;
 f.set ( filenameDef );
 if ( ! f.doesExist() ) {
 log(LOG_INIT,
-"admin: Default collection configuration file "
+"db: Default collection configuration file "
 "%s was not found. Newly created collections "
 "will use hard coded defaults.",f.getFilename());
 goto skip;
@@ -3718,6 +3719,7 @@ bool Parms::setFromFile ( void *THIS ,
 }
 skip:
+*/
 long vlen;
 char *v ;
 //char c ;
@@ -3928,9 +3930,9 @@ bool Parms::setFromFile ( void *THIS ,
 // always make sure we got some admin security
 if ( g_conf.m_numMasterIps <= 0 && g_conf.m_numMasterPwds <= 0 ) {
-log(LOG_INFO,
-"conf: No master IP or password provided. Using default "
-"password 'footbar23'." );
+//log(LOG_INFO,
+// "conf: No master IP or password provided. Using default "
+// "password 'footbar23'." );
 //g_conf.m_masterIps[0] = atoip ( "64.139.94.202", 13 );
 //g_conf.m_numMasterIps = 1;
 strcpy ( g_conf.m_masterPwds[0] , "footbar23" );

@@ -100,7 +100,7 @@ void Rdb::addBase ( collnum_t collnum , RdbBase *base ) {
 if ( ! cr ) return;
 if ( cr->m_bases[(unsigned char)m_rdbId] ) { char *xx=NULL;*xx=0; }
 cr->m_bases[(unsigned char)m_rdbId] = base;
-log("rdb: added base to collrec "
+log ( LOG_INFO,"db: added base to collrec "
 "for rdb=%s rdbid=%li coll=%s collnum=%li base=0x%lx",
 m_dbname,(long)m_rdbId,cr->m_coll,(long)collnum,(long)base);
 }

@@ -132,8 +132,9 @@ bool RdbBase::init ( char *dir ,
 char tmp[1024];
 sprintf ( tmp , "%scoll.%s.%li" , dir , coll , (long)collnum );
-// debug
-log("base: adding new base for dir=%s coll=%s collnum=%li db=%s",
+// logDebugAdmin
+log(LOG_INIT,"db: "
+"adding new base for dir=%s coll=%s collnum=%li db=%s",
 dir,coll,(long)collnum,dbname);
 // catdb is collection independent
@@ -502,7 +503,7 @@ bool RdbBase::setFiles ( ) {
 // we are getting this from a bogus m_dir
 return log("db: Had error opening directory %s", getDir());
 // note it
-logf(LOG_INFO,"db: Loading files for %s coll=%s (%li).",
+log(LOG_DEBUG,"db: Loading files for %s coll=%s (%li).",
 m_dbname,m_coll,(long)m_collnum );
 // . set our m_files array
 // . addFile() will return -1 and set g_errno on error

@@ -1021,7 +1021,7 @@ bool Speller::loadUnifiedDict() {
 char *end = start + m_unifiedBuf.length();
 for ( char *p = start ; p < end ; p++ )
 if ( *p == '\n' ) *p = '\0';
-log("speller: done loading successfully");
+log(LOG_DEBUG,"speller: done loading successfully");
 // a quick little checksum
 if ( ! g_conf.m_isLive ) return true;

@@ -915,7 +915,7 @@ bool SpiderCache::needsSave ( ) {
 }
 void SpiderCache::reset ( ) {
-log("spider: resetting spidercache");
+log(LOG_DEBUG,"spider: resetting spidercache");
 // loop over all SpiderColls and get the best
 for ( long i = 0 ; i < g_collectiondb.getNumRecs() ; i++ ) {
 SpiderColl *sc = getSpiderCollIffNonNull(i);
@@ -970,7 +970,7 @@ SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
 //m_spiderColls [ collnum ] = sc;
 cr->m_spiderColl = sc;
 // note it
-log("spider: made spidercoll=%lx for cr=%lx",
+log(LOG_DEBUG,"spider: made spidercoll=%lx for cr=%lx",
 (long)sc,(long)cr);
 // update this
 //if ( m_numSpiderColls < collnum + 1 )
@@ -992,7 +992,8 @@ SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
 // sanity check
 if ( ! cr ) { char *xx=NULL;*xx=0; }
 // note it!
-log("spider: adding new spider collection for %s",cr->m_coll);
+log(LOG_DEBUG,"spider: adding new spider collection for %s",
+cr->m_coll);
 // that was it
 return sc;
 }
@@ -1130,7 +1131,7 @@ bool SpiderColl::load ( ) {
 // this should block since we are at startup...
 bool SpiderColl::makeDoleIPTable ( ) {
-log("spider: making dole ip table for %s",m_coll);
+log(LOG_DEBUG,"spider: making dole ip table for %s",m_coll);
 key_t startKey ; startKey.setMin();
 key_t endKey ; endKey.setMax();
@@ -1203,7 +1204,7 @@ bool SpiderColl::makeDoleIPTable ( ) {
 // watch out for wrap around
 if ( startKey >= *(key_t *)list.getLastKey() ) goto loop;
 done:
-log("spider: making dole ip table done.");
+log(LOG_DEBUG,"spider: making dole ip table done.");
 // re-enable threads
 if ( enabled ) g_threads.enableThreads();
 // we wrapped, all done
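makeDoleIPTable() and the makeWaitingTree() hunks that follow share the same scan shape: read one list of records in [startKey, endKey], process it, bump startKey past the last key returned, and goto loop until the keyspace is exhausted. The "watch out for wrap around" check matters because if advancing startKey wraps past the maximum key, the comparison against the list's last key fails and the scan falls through to done: instead of rescanning forever. A schematic version, with a hypothetical readRange() standing in for the Msg5/RdbList machinery:

    #include <cstdint>
    #include <vector>

    // ASSUMPTION: readRange() is a stand-in that returns at most `max`
    // keys in [start,end], in ascending order.
    std::vector<uint32_t> readRange ( uint32_t start , uint32_t end , int max );

    void scanAll ( ) {
            uint32_t startKey = 0;              // startKey.setMin()
            const uint32_t endKey = UINT32_MAX; // endKey.setMax()
            for ( ; ; ) {
                    std::vector<uint32_t> list =
                            readRange ( startKey , endKey , 256 );
                    if ( list.empty() ) break;  // keyspace exhausted
                    // ... process each record in list here ...
                    uint32_t last = list.back();
                    if ( last >= endKey ) break; // would wrap; all done
                    startKey = last + 1;         // resume after last key
            }
    }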
@@ -1317,7 +1318,8 @@ void SpiderColl::urlFiltersChanged ( ) {
 // this one has to scan all of spiderdb
 bool SpiderColl::makeWaitingTree ( ) {
-log("spider: making waiting tree for %s",m_coll);
+log(LOG_DEBUG,"spider: making waiting tree for %s",m_coll);
 key128_t startKey ; startKey.setMin();
 key128_t endKey ; endKey.setMax();
@@ -1408,7 +1410,7 @@ bool SpiderColl::makeWaitingTree ( ) {
 // watch out for wrap around
 if ( startKey >= *(key128_t *)list.getLastKey() ) goto loop;
 done:
-log("spider: making waiting tree done.");
+log(LOG_DEBUG,"spider: making waiting tree done.");
 // re-enable threads
 if ( enabled ) g_threads.enableThreads();
 // we wrapped, all done
@@ -1444,7 +1446,7 @@ long long SpiderColl::getEarliestSpiderTimeFromWaitingTree ( long firstIp ) {
 bool SpiderColl::makeWaitingTable ( ) {
-logf(LOG_INFO,"spider: making waiting table for %s.",m_coll);
+log(LOG_DEBUG,"spider: making waiting table for %s.",m_coll);
 long node = m_waitingTree.getFirstNode();
 for ( ; node >= 0 ; node = m_waitingTree.getNextNode(node) ) {
 // breathe
@@ -1460,7 +1462,7 @@ bool SpiderColl::makeWaitingTable ( ) {
 // store in waiting table
 if ( ! m_waitingTable.addKey(&ip,&spiderTimeMS) ) return false;
 }
-logf(LOG_INFO,"spider: making waiting table done.");
+log(LOG_DEBUG,"spider: making waiting table done.");
 return true;
 }
@@ -1536,7 +1538,7 @@ void SpiderColl::reset ( ) {
 char *coll = "unknown";
 if ( m_coll[0] ) coll = m_coll;
-logf(LOG_DEBUG,"spider: resetting spider cache coll=%s",coll);
+log(LOG_DEBUG,"spider: resetting spider cache coll=%s",coll);
 m_ufnMapValid = false;
@@ -4221,7 +4223,8 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
 // if a scan is ongoing, this will re-set it
 sc->m_nextKey2.setMin();
 sc->m_waitingTreeNeedsRebuild = true;
-log("spider: hit rebuild timeout for %s",
+log(LOG_INFO,
+"spider: hit rebuild timeout for %s",
 cr->m_coll);
 // flush the ufn table
 clearUfnTable();

@@ -1873,7 +1873,7 @@ bool Tagdb::verify ( char *coll ) {
 char *rdbName = NULL;
 rdbName = "Tagdb";
-log ( LOG_INFO, "tagdb: Verifying %s for coll %s...", rdbName, coll );
+log ( LOG_INFO, "db: Verifying %s for coll %s...", rdbName, coll );
 g_threads.disableThreads();
@@ -1945,7 +1945,7 @@ bool Tagdb::verify ( char *coll ) {
 g_threads.enableThreads();
 return g_conf.m_bypassValidation;
 }
-log ( LOG_INFO, "tagdb: %s passed verification successfully for %li "
+log ( LOG_INFO, "db: %s passed verification successfully for %li "
 "recs.",rdbName, count );
 // turn threads back on

@@ -183,13 +183,15 @@ bool Threads::init ( ) {
 // set s_pid to the main process id
 #ifdef PTHREADS
 s_pid = pthread_self();
-log("threads: main process THREAD id = %lu",(long unsigned)s_pid);
+log(LOG_INFO,
+"threads: main process THREAD id = %lu",(long unsigned)s_pid);
 pthread_t tid = pthread_self();
 sched_param param;
 int policy;
 // scheduling parameters of target thread
 pthread_getschedparam ( tid, &policy, &param);
-log("threads: min/max thread priority settings = %li/%li (policy=%li)",
+log(LOG_INFO,
+"threads: min/max thread priority settings = %li/%li (policy=%li)",
 (long)sched_get_priority_min(policy),
 (long)sched_get_priority_max(policy),
 (long)policy);
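The two Threads::init() messages above only change severity; the probe itself is plain POSIX scheduling API. A standalone version of the same query (compile with -pthread; casting pthread_t to unsigned long is the same Linux-specific shortcut the diff takes):

    #include <pthread.h>
    #include <sched.h>
    #include <cstdio>

    int main ( ) {
            pthread_t tid = pthread_self();
            sched_param param;
            int policy;
            // fetch scheduling parameters of the calling (main) thread
            pthread_getschedparam ( tid , &policy , &param );
            printf ( "main THREAD id = %lu\n" , (unsigned long)tid );
            printf ( "min/max thread priority = %d/%d (policy=%d)\n" ,
                     sched_get_priority_min ( policy ) ,
                     sched_get_priority_max ( policy ) ,
                     policy );
            return 0;
    }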

@@ -84,9 +84,6 @@
 # The spider round number.
 <spiderRoundNum>0</>
-# The spider status number.
-<spiderStatus>0</>
 # Do searches for queries in this hosts part of the query log.
 <scrapingEnabledProcog>0</>
@@ -354,34 +351,6 @@
 # <i>undefined</i> to indicate no change in the priority of the url.
 <priorityOfUrlsBeingRetried>-1</>
-# Weight title this much more or less. This units are percentage. A 100 means
-# to not give the title any special weight. Generally, though, you want to
-# give it significantly more weight than that, so 2400 is the default.
-<titleWeight>4600</>
-# Weight terms in header tags by this much more or less. This units are
-# percentage. A 100 means to not give the header any special weight.
-# Generally, though, you want to give it significantly more weight than that,
-# so 600 is the default.
-<headerWeight>600</>
-# Weight text in url path this much more. The units are percentage. A 100
-# means to not give any special weight. Generally, though, you want to give it
-# significantly more weight than that, so 600 is the default.
-<urlPathWordWeight>1600</>
-# Weight text in the incoming external link text this much more. The units are
-# percentage. It already receives a decent amount of weight naturally.
-<externalLinkTextWeight>600</>
-# Weight text in the incoming internal link text this much more. The units are
-# percentage. It already receives a decent amount of weight naturally.
-<internalLinkTextWeight>200</>
-# Weight concepts this much more. The units are percentage. It already
-# receives a decent amount of weight naturally. AKA: surrounding text boost.
-<conceptWeight>50</>
-# If this is true Gigablast will only search the root index file for docIds.
-# Saves on disk seeks, but may use older versions of indexed web pages.
-<restrictIndexdbForQueries>0</>

@@ -55,7 +55,7 @@ num-mirrors: 0
 # The working directory is the last string on each line. That is where the
 # 'gb' binary resides.
 #
-0 5998 7000 8000 9000 127.0.0.1 127.0.0.1 /home/mwells/github/
+0 5998 7000 8000 9000 127.0.0.1 127.0.0.1 /home/mwells/parmdb/
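On the final hosts.conf change: the comment block only documents that the working directory is the last string on each host line, which this commit repoints from /home/mwells/github/ to /home/mwells/parmdb/. The roles of the other columns are not stated in the visible excerpt; the labels below are a guess from the values, not documentation:

    # hostId  port?  port?  port?  port?  ip1        ip2        working-dir
      0       5998   7000   8000   9000   127.0.0.1  127.0.0.1  /home/mwells/parmdb/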