clean up logging so i can see what's going on

mwells 2013-12-10 16:41:30 -08:00
parent db74af766b
commit 76bb3d05e1
15 changed files with 1612 additions and 69 deletions
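The pattern repeated below is the same throughout: bare log("prefix: ...") calls gain an explicit severity (LOG_INIT, LOG_INFO, or LOG_DEBUG), logf() calls are downgraded to plain log(), and ad-hoc subsystem prefixes like "coll:", "admin:", "tagdb:", "rdb:", and "base:" are folded into "db:". Below is a minimal sketch of the leveled convention being adopted, assuming a simple threshold filter; the level names and both call shapes come from the hunks themselves, but the filtering logic is an illustration, not gb's actual Log.cpp:

    #include <cstdarg>
    #include <cstdio>

    // ASSUMPTION: the level values and the g_logLevel threshold are
    // invented for illustration; only the names appear in the diff.
    enum { LOG_INIT = 1, LOG_INFO = 2, LOG_DEBUG = 3 };

    static int g_logLevel = LOG_INFO; // hypothetical verbosity knob

    // leveled form: log(LOG_INFO,"db: loading %s",file);
    static bool log ( long type , const char *fmt , ... ) {
            if ( type <= g_logLevel ) {
                    va_list ap;
                    va_start ( ap , fmt );
                    vfprintf ( stderr , fmt , ap );
                    va_end ( ap );
                    fputc ( '\n' , stderr );
            }
            // returning false lets call sites write "return log(...)"
            // on error paths, as Collectiondb::load() does below
            return false;
    }

The handful of calls left unleveled, like the save-failure messages in CollectionRec::save(), presumably go through an equivalent log(const char *fmt, ...) overload at some default severity.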

@@ -37,7 +37,7 @@ Collectiondb::Collectiondb ( ) {
 // reset rdb
 void Collectiondb::reset() {
-log("db: resetting collectiondb.");
+log(LOG_INFO,"db: resetting collectiondb.");
 for ( long i = 0 ; i < m_numRecs ; i++ ) {
 if ( ! m_recs[i] ) continue;
 mdelete ( m_recs[i], sizeof(CollectionRec), "CollectionRec" );
@@ -96,7 +96,7 @@ bool Collectiondb::load ( bool isDump ) {
 if ( ! d.open ()) return log("admin: Could not load collection config "
 "files.");
 // note it
-log(LOG_INIT,"admin: Loading collection config files.");
+log(LOG_INFO,"db: Loading collection config files.");
 // . scan through all subdirs in the collections dir
 // . they should be like, "coll.main/" and "coll.mycollection/"
 char *f;
@@ -118,7 +118,7 @@ bool Collectiondb::load ( bool isDump ) {
 return false;
 }
 // note it
-log(LOG_INIT,"admin: Loaded data for %li collections. Ranging from "
+log(LOG_INFO,"db: Loaded data for %li collections. Ranging from "
 "collection #0 to #%li.",m_numRecsUsed,m_numRecs-1);
 // update the time
 updateTime();
@@ -570,7 +570,7 @@ bool Collectiondb::registerCollRec ( CollectionRec *cr ,
 if ( ! g_doledb.addColl ( coll, verify ) ) goto hadError;
 // debug message
-log ( LOG_INFO, "admin: verified collection \"%s\" (%li).",
+log ( LOG_INFO, "db: verified collection \"%s\" (%li).",
 coll,(long)i);
 // tell SpiderCache about this collection, it will create a
@@ -703,7 +703,7 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum , WaitEntry *we ) {
 char *coll = cr->m_coll;
 // note it
-log("coll: deleting coll \"%s\"",coll);
+log(LOG_INFO,"db: deleting coll \"%s\"",coll);
 // we need a save
 m_needsSave = true;
@@ -1437,7 +1437,7 @@ bool CollectionRec::load ( char *coll , long i ) {
 // LOAD LOCAL
 sprintf ( tmp1 , "%scoll.%s.%li/localcrawlinfo.dat",
 g_hostdb.m_dir , m_coll , (long)m_collnum );
-log("coll: loading %s",tmp1);
+log(LOG_INFO,"db: loading %s",tmp1);
 m_localCrawlInfo.reset();
 SafeBuf sb;
 // fillfromfile returns 0 if does not exist, -1 on read error
@@ -1448,7 +1448,7 @@ bool CollectionRec::load ( char *coll , long i ) {
 // LOAD GLOBAL
 sprintf ( tmp1 , "%scoll.%s.%li/globalcrawlinfo.dat",
 g_hostdb.m_dir , m_coll , (long)m_collnum );
-log("coll: loading %s",tmp1);
+log(LOG_INFO,"db: loading %s",tmp1);
 m_globalCrawlInfo.reset();
 sb.reset();
 if ( sb.fillFromFile ( tmp1 ) > 0 )
@@ -1694,7 +1694,7 @@ bool CollectionRec::save ( ) {
 // binary now
 sb.safeMemcpy ( &m_localCrawlInfo , sizeof(CrawlInfo) );
 if ( sb.dumpToFile ( tmp ) == -1 ) {
-log("coll: failed to save file %s : %s",
+log("db: failed to save file %s : %s",
 tmp,mstrerror(g_errno));
 g_errno = 0;
 }
@@ -1707,7 +1707,7 @@ bool CollectionRec::save ( ) {
 // binary now
 sb.safeMemcpy ( &m_globalCrawlInfo , sizeof(CrawlInfo) );
 if ( sb.dumpToFile ( tmp ) == -1 ) {
-log("coll: failed to save file %s : %s",
+log("db: failed to save file %s : %s",
 tmp,mstrerror(g_errno));
 g_errno = 0;
 }

@@ -328,7 +328,7 @@ void Conf::setRootIps ( ) {
 for ( long i = 0 ; i < n ; i++ ) {
 m_rnsIps [i] = atoip(rootIps[i],gbstrlen(rootIps[i]));
 m_rnsPorts[i] = 53;
-log("dns: Using root nameserver #%li %s.",
+log(LOG_INIT,"dns: Using root nameserver #%li %s.",
 i,iptoa(m_rnsIps[i]));
 }
 }

@@ -699,7 +699,7 @@ bool File::unlink ( ) {
 // return false and set g_errno on error
 if ( status < 0 ) return false;
 // log it so we can see what happened to timedb!
-log("disk: unlinking %s", m_filename );
+log(LOG_INFO,"disk: unlinking %s", m_filename );
 // remove ourselves from the disk
 if ( ::unlink ( m_filename ) == 0 ) return true;
 // sync it to disk in case power goes out

@@ -346,10 +346,13 @@ void Json::test ( ) {
 long niceness = 0;
 JsonItem *ji = parseJsonStringIntoJsonItems ( json , niceness );
 // print them out?
-log("json: type0=%li",(long)ji->m_type);
+//log("json: type0=%li",(long)ji->m_type);
 // sanity test
 if ( ji->m_type != 6 ) { char *xx=NULL;*xx=0; }
 return;
 }
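The char *xx=NULL;*xx=0; line in this hunk (and in the Rdb and Spider hunks below) is gb's in-place assert: writing through a NULL pointer raises SIGSEGV on the spot, so a failed sanity check leaves a core dump pointing at the exact line, even in optimized builds. A conventional equivalent, purely as an illustration with a hypothetical helper name:

    #include <csignal>

    // hypothetical wrapper; gb inlines the null write directly instead
    static inline void gbassert ( bool cond ) {
            if ( ! cond ) raise ( SIGSEGV ); // die loudly, keep the core
    }

    // usage matching the sanity test above:
    //   gbassert ( ji->m_type == 6 );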

File diff suppressed because it is too large

@@ -17156,7 +17156,7 @@ bool sendPageSiteMap ( TcpSocket *s , HttpRequest *r ) {
 #include "HashTable.h"
 #include "Msg4.h"
 #include "AutoBan.h"
-#include "CollectionRec.h"
+//#include "CollectionRec.h"
 //#include "Links.h"
 #include "Users.h"
 #include "HashTableT.h"

@@ -3700,13 +3700,14 @@ bool Parms::setFromFile ( void *THIS ,
 // . all the collectionRecs have the same default file in
 // the workingDir/collections/default.conf
 // . so use our built in buffer for that
+/*
 if ( THIS != &g_conf && ! m_isDefaultLoaded ) {
 m_isDefaultLoaded = true;
 File f;
 f.set ( filenameDef );
 if ( ! f.doesExist() ) {
 log(LOG_INIT,
-"admin: Default collection configuration file "
+"db: Default collection configuration file "
 "%s was not found. Newly created collections "
 "will use hard coded defaults.",f.getFilename());
 goto skip;
@@ -3718,6 +3719,7 @@ bool Parms::setFromFile ( void *THIS ,
 }
 skip:
+*/
 long vlen;
 char *v ;
 //char c ;
@@ -3928,9 +3930,9 @@ bool Parms::setFromFile ( void *THIS ,
 // always make sure we got some admin security
 if ( g_conf.m_numMasterIps <= 0 && g_conf.m_numMasterPwds <= 0 ) {
-log(LOG_INFO,
-"conf: No master IP or password provided. Using default "
-"password 'footbar23'." );
+//log(LOG_INFO,
+// "conf: No master IP or password provided. Using default "
+// "password 'footbar23'." );
 //g_conf.m_masterIps[0] = atoip ( "64.139.94.202", 13 );
 //g_conf.m_numMasterIps = 1;
 strcpy ( g_conf.m_masterPwds[0] , "footbar23" );

@@ -100,7 +100,7 @@ void Rdb::addBase ( collnum_t collnum , RdbBase *base ) {
 if ( ! cr ) return;
 if ( cr->m_bases[(unsigned char)m_rdbId] ) { char *xx=NULL;*xx=0; }
 cr->m_bases[(unsigned char)m_rdbId] = base;
-log("rdb: added base to collrec "
+log ( LOG_INFO,"db: added base to collrec "
 "for rdb=%s rdbid=%li coll=%s collnum=%li base=0x%lx",
 m_dbname,(long)m_rdbId,cr->m_coll,(long)collnum,(long)base);
 }

@@ -132,8 +132,9 @@ bool RdbBase::init ( char *dir ,
 char tmp[1024];
 sprintf ( tmp , "%scoll.%s.%li" , dir , coll , (long)collnum );
-// debug
-log("base: adding new base for dir=%s coll=%s collnum=%li db=%s",
+// logDebugAdmin
+log(LOG_INIT,"db: "
+"adding new base for dir=%s coll=%s collnum=%li db=%s",
 dir,coll,(long)collnum,dbname);
 // catdb is collection independent
@@ -502,7 +503,7 @@ bool RdbBase::setFiles ( ) {
 // we are getting this from a bogus m_dir
 return log("db: Had error opening directory %s", getDir());
 // note it
-logf(LOG_INFO,"db: Loading files for %s coll=%s (%li).",
+log(LOG_DEBUG,"db: Loading files for %s coll=%s (%li).",
 m_dbname,m_coll,(long)m_collnum );
 // . set our m_files array
 // . addFile() will return -1 and set g_errno on error

@@ -1021,7 +1021,7 @@ bool Speller::loadUnifiedDict() {
 char *end = start + m_unifiedBuf.length();
 for ( char *p = start ; p < end ; p++ )
 if ( *p == '\n' ) *p = '\0';
-log("speller: done loading successfully");
+log(LOG_DEBUG,"speller: done loading successfully");
 // a quick little checksum
 if ( ! g_conf.m_isLive ) return true;

@@ -915,7 +915,7 @@ bool SpiderCache::needsSave ( ) {
 }
 void SpiderCache::reset ( ) {
-log("spider: resetting spidercache");
+log(LOG_DEBUG,"spider: resetting spidercache");
 // loop over all SpiderColls and get the best
 for ( long i = 0 ; i < g_collectiondb.getNumRecs() ; i++ ) {
 SpiderColl *sc = getSpiderCollIffNonNull(i);
@@ -970,7 +970,7 @@ SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
 //m_spiderColls [ collnum ] = sc;
 cr->m_spiderColl = sc;
 // note it
-log("spider: made spidercoll=%lx for cr=%lx",
+log(LOG_DEBUG,"spider: made spidercoll=%lx for cr=%lx",
 (long)sc,(long)cr);
 // update this
 //if ( m_numSpiderColls < collnum + 1 )
@@ -992,7 +992,8 @@ SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
 // sanity check
 if ( ! cr ) { char *xx=NULL;*xx=0; }
 // note it!
-log("spider: adding new spider collection for %s",cr->m_coll);
+log(LOG_DEBUG,"spider: adding new spider collection for %s",
+cr->m_coll);
 // that was it
 return sc;
 }
@@ -1130,7 +1131,7 @@ bool SpiderColl::load ( ) {
 // this should block since we are at startup...
 bool SpiderColl::makeDoleIPTable ( ) {
-log("spider: making dole ip table for %s",m_coll);
+log(LOG_DEBUG,"spider: making dole ip table for %s",m_coll);
 key_t startKey ; startKey.setMin();
 key_t endKey ; endKey.setMax();
@@ -1203,7 +1204,7 @@ bool SpiderColl::makeDoleIPTable ( ) {
 // watch out for wrap around
 if ( startKey >= *(key_t *)list.getLastKey() ) goto loop;
 done:
-log("spider: making dole ip table done.");
+log(LOG_DEBUG,"spider: making dole ip table done.");
 // re-enable threads
 if ( enabled ) g_threads.enableThreads();
 // we wrapped, all done
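makeDoleIPTable() and the makeWaitingTree() hunks that follow share the same scan shape: read one list of records in [startKey, endKey], process it, bump startKey past the last key returned, and goto loop until the keyspace is exhausted. The "watch out for wrap around" check matters because if advancing startKey wraps past the maximum key, the comparison against the list's last key fails and the scan falls through to done: instead of rescanning forever. A schematic version, with a hypothetical readRange() standing in for the Msg5/RdbList machinery:

    #include <cstdint>
    #include <vector>

    // ASSUMPTION: readRange() is a stand-in that returns at most `max`
    // keys in [start,end], in ascending order.
    std::vector<uint32_t> readRange ( uint32_t start , uint32_t end , int max );

    void scanAll ( ) {
            uint32_t startKey = 0;              // startKey.setMin()
            const uint32_t endKey = UINT32_MAX; // endKey.setMax()
            for ( ; ; ) {
                    std::vector<uint32_t> list =
                            readRange ( startKey , endKey , 256 );
                    if ( list.empty() ) break;  // keyspace exhausted
                    // ... process each record in list here ...
                    uint32_t last = list.back();
                    if ( last >= endKey ) break; // would wrap; all done
                    startKey = last + 1;         // resume after last key
            }
    }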
@@ -1317,7 +1318,8 @@ void SpiderColl::urlFiltersChanged ( ) {
 // this one has to scan all of spiderdb
 bool SpiderColl::makeWaitingTree ( ) {
-log("spider: making waiting tree for %s",m_coll);
+log(LOG_DEBUG,"spider: making waiting tree for %s",m_coll);
 key128_t startKey ; startKey.setMin();
 key128_t endKey ; endKey.setMax();
@@ -1408,7 +1410,7 @@ bool SpiderColl::makeWaitingTree ( ) {
 // watch out for wrap around
 if ( startKey >= *(key128_t *)list.getLastKey() ) goto loop;
 done:
-log("spider: making waiting tree done.");
+log(LOG_DEBUG,"spider: making waiting tree done.");
 // re-enable threads
 if ( enabled ) g_threads.enableThreads();
 // we wrapped, all done
@@ -1444,7 +1446,7 @@ long long SpiderColl::getEarliestSpiderTimeFromWaitingTree ( long firstIp ) {
 bool SpiderColl::makeWaitingTable ( ) {
-logf(LOG_INFO,"spider: making waiting table for %s.",m_coll);
+log(LOG_DEBUG,"spider: making waiting table for %s.",m_coll);
 long node = m_waitingTree.getFirstNode();
 for ( ; node >= 0 ; node = m_waitingTree.getNextNode(node) ) {
 // breathe
@@ -1460,7 +1462,7 @@ bool SpiderColl::makeWaitingTable ( ) {
 // store in waiting table
 if ( ! m_waitingTable.addKey(&ip,&spiderTimeMS) ) return false;
 }
-logf(LOG_INFO,"spider: making waiting table done.");
+log(LOG_DEBUG,"spider: making waiting table done.");
 return true;
 }
@@ -1536,7 +1538,7 @@ void SpiderColl::reset ( ) {
 char *coll = "unknown";
 if ( m_coll[0] ) coll = m_coll;
-logf(LOG_DEBUG,"spider: resetting spider cache coll=%s",coll);
+log(LOG_DEBUG,"spider: resetting spider cache coll=%s",coll);
 m_ufnMapValid = false;
@@ -4221,7 +4223,8 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
 // if a scan is ongoing, this will re-set it
 sc->m_nextKey2.setMin();
 sc->m_waitingTreeNeedsRebuild = true;
-log("spider: hit rebuild timeout for %s",
+log(LOG_INFO,
+"spider: hit rebuild timeout for %s",
 cr->m_coll);
 // flush the ufn table
 clearUfnTable();

@@ -1873,7 +1873,7 @@ bool Tagdb::verify ( char *coll ) {
 char *rdbName = NULL;
 rdbName = "Tagdb";
-log ( LOG_INFO, "tagdb: Verifying %s for coll %s...", rdbName, coll );
+log ( LOG_INFO, "db: Verifying %s for coll %s...", rdbName, coll );
 g_threads.disableThreads();
@@ -1945,7 +1945,7 @@ bool Tagdb::verify ( char *coll ) {
 g_threads.enableThreads();
 return g_conf.m_bypassValidation;
 }
-log ( LOG_INFO, "tagdb: %s passed verification successfully for %li "
+log ( LOG_INFO, "db: %s passed verification successfully for %li "
 "recs.",rdbName, count );
 // turn threads back on

@@ -183,13 +183,15 @@ bool Threads::init ( ) {
 // set s_pid to the main process id
 #ifdef PTHREADS
 s_pid = pthread_self();
-log("threads: main process THREAD id = %lu",(long unsigned)s_pid);
+log(LOG_INFO,
+"threads: main process THREAD id = %lu",(long unsigned)s_pid);
 pthread_t tid = pthread_self();
 sched_param param;
 int policy;
 // scheduling parameters of target thread
 pthread_getschedparam ( tid, &policy, &param);
-log("threads: min/max thread priority settings = %li/%li (policy=%li)",
+log(LOG_INFO,
+"threads: min/max thread priority settings = %li/%li (policy=%li)",
 (long)sched_get_priority_min(policy),
 (long)sched_get_priority_max(policy),
 (long)policy);
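The two Threads::init() messages above only change severity; the probe itself is plain POSIX scheduling API. A standalone version of the same query (compile with -pthread; casting pthread_t to unsigned long is the same Linux-specific shortcut the diff takes):

    #include <pthread.h>
    #include <sched.h>
    #include <cstdio>

    int main ( ) {
            pthread_t tid = pthread_self();
            sched_param param;
            int policy;
            // fetch scheduling parameters of the calling (main) thread
            pthread_getschedparam ( tid , &policy , &param );
            printf ( "main THREAD id = %lu\n" , (unsigned long)tid );
            printf ( "min/max thread priority = %d/%d (policy=%d)\n" ,
                     sched_get_priority_min ( policy ) ,
                     sched_get_priority_max ( policy ) ,
                     policy );
            return 0;
    }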

@@ -84,9 +84,6 @@
 # The spider round number.
 <spiderRoundNum>0</>
-# The spider status number.
-<spiderStatus>0</>
 # Do searches for queries in this hosts part of the query log.
 <scrapingEnabledProcog>0</>
@@ -354,34 +351,6 @@
 # <i>undefined</i> to indicate no change in the priority of the url.
 <priorityOfUrlsBeingRetried>-1</>
-# Weight title this much more or less. This units are percentage. A 100 means
-# to not give the title any special weight. Generally, though, you want to
-# give it significantly more weight than that, so 2400 is the default.
-<titleWeight>4600</>
-# Weight terms in header tags by this much more or less. This units are
-# percentage. A 100 means to not give the header any special weight.
-# Generally, though, you want to give it significantly more weight than that,
-# so 600 is the default.
-<headerWeight>600</>
-# Weight text in url path this much more. The units are percentage. A 100
-# means to not give any special weight. Generally, though, you want to give it
-# significantly more weight than that, so 600 is the default.
-<urlPathWordWeight>1600</>
-# Weight text in the incoming external link text this much more. The units are
-# percentage. It already receives a decent amount of weight naturally.
-<externalLinkTextWeight>600</>
-# Weight text in the incoming internal link text this much more. The units are
-# percentage. It already receives a decent amount of weight naturally.
-<internalLinkTextWeight>200</>
-# Weight concepts this much more. The units are percentage. It already
-# receives a decent amount of weight naturally. AKA: surrounding text boost.
-<conceptWeight>50</>
-# If this is true Gigablast will only search the root index file for docIds.
-# Saves on disk seeks, but may use older versions of indexed web pages.
-<restrictIndexdbForQueries>0</>

@@ -55,7 +55,7 @@ num-mirrors: 0
 # The working directory is the last string on each line. That is where the
 # 'gb' binary resides.
 #
-0 5998 7000 8000 9000 127.0.0.1 127.0.0.1 /home/mwells/github/
+0 5998 7000 8000 9000 127.0.0.1 127.0.0.1 /home/mwells/parmdb/
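On the final hosts.conf change: the comment block only documents that the working directory is the last string on each host line, which this commit repoints from /home/mwells/github/ to /home/mwells/parmdb/. The roles of the other columns are not stated in the visible excerpt; the labels below are a guess from the values, not documentation:

    # hostId  port?  port?  port?  port?  ip1        ip2        working-dir
      0       5998   7000   8000   9000   127.0.0.1  127.0.0.1  /home/mwells/parmdb/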