More differentiated trace log

This commit is contained in:
Brian Rasmusson
2016-02-10 14:49:51 +01:00
parent badf208efe
commit f662aed2af
6 changed files with 703 additions and 367 deletions

3
Conf.h

@ -576,6 +576,9 @@ class Conf {
bool m_logTraceBigFile;
bool m_logTraceRepairs;
bool m_logTraceSpider;
bool m_logTraceXmlDoc;
bool m_logTraceMsg0;
bool m_logTraceMsg;
// expensive timing messages
bool m_logTimingAddurl ;

@ -132,7 +132,7 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
// bool allowPageCache ) {
//#endif
// if( g_conf.m_logDebugDetailed ) log("%s:%s: BEGIN", __FILE__,__FUNCTION__);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: BEGIN. hostId: %"INT64", rdbId: %d", __FILE__,__func__, __LINE__, hostId, (int)rdbId);
// this is obsolete! mostly, but we need it for PageIndexdb.cpp to
// show a "termlist" for a given query term in its entirety so you
@ -153,11 +153,11 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
// get keySize of rdb
m_ks = getKeySizeFromRdbId ( rdbId );
// if( g_conf.m_logDebugDetailed )
// if( g_conf.m_logTraceMsg0 )
// {
// log("%s:%s: rdbId. [%d]", __FILE__,__FUNCTION__, (int)rdbId);
// log("%s:%s: m_ks.. [%d]", __FILE__,__FUNCTION__, (int)m_ks);
// log("%s:%s: hostId [%"INT64"]", __FILE__,__FUNCTION__, hostId);
// log("%s:%s:%d: rdbId. [%d]", __FILE__,__func__,__LINE__, (int)rdbId);
// log("%s:%s:%d: m_ks.. [%d]", __FILE__,__func__,__LINE__, (int)m_ks);
// log("%s:%s:%d: hostId [%"INT64"]", __FILE__,__func__,__LINE__, hostId);
// }
@ -170,7 +170,7 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
// no longer accept negative minrecsize
if ( minRecSizes < 0 ) {
g_errno = EBADENGINEER;
// if( g_conf.m_logDebugDetailed ) log("%s:%s: END", __FILE__,__FUNCTION__);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END", __FILE__, __func__, __LINE__);
log(LOG_LOGIC,
"net: msg0: Negative minRecSizes no longer supported.");
@ -235,7 +235,7 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
if ( forceLocalIndexdb ) m_shardNum = getMyShardNum();
// if( g_conf.m_logDebugDetailed ) log("%s:%s: shardNum [%"INT32"]", __FILE__,__FUNCTION__, m_shardNum);
// if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: shardNum [%"INT32"]", __FILE__,__func__, __LINE__, m_shardNum);
// . store these parameters
@ -321,7 +321,7 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
// . don't do this if m_hostId was specified
if ( isLocal )
{
// if( g_conf.m_logDebugDetailed ) log("%s:%s: isLocal", __FILE__,__FUNCTION__);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: isLocal", __FILE__, __func__, __LINE__);
if ( msg5 ) {
@ -389,6 +389,7 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
m_allowPageCache ) ) return false;
// nuke it
reset();
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END", __FILE__, __func__, __LINE__);
return true;
}
skip:
@ -454,6 +455,8 @@ skip:
"data read remotely from %s: %s.",
replyBufMaxSize,getDbnameFromId(m_rdbId),
mstrerror(g_errno));
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END, return true. Could not allocate memory.", __FILE__, __func__, __LINE__);
return true;
}
}
@ -491,8 +494,10 @@ skip:
g_errno = EBADHOSTID;
log(LOG_LOGIC,"net: msg0: Bad hostId of %"INT64".",
m_hostId);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END, retruen true. Bad hostId", __FILE__, __func__, __LINE__);
return true;
}
// if niceness is 0, use the higher priority udpServer
UdpServer *us ;
uint16_t port;
@ -520,8 +525,13 @@ skip:
replyBuf ,
replyBufMaxSize ,
m_niceness ) ) // cback niceness
{
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END, return true. Request sent.", __FILE__, __func__, __LINE__);
return true;
}
// return false cuz it blocked
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END, return false. sendRequest blocked", __FILE__, __func__, __LINE__);
return false;
}
// timing debug
@ -635,8 +645,9 @@ skip:
// this is a 96-bit key. TODO: fix...
0 , // *(key_t *)cacheKey ,
rdbId ,
minRecSizes ) ) {
log("net: Failed to send request for data from %s in shard "
minRecSizes ) )
{
log(LOG_ERROR, "net: Failed to send request for data from %s in shard "
"#%"UINT32" over network: %s.",
getDbnameFromId(m_rdbId),m_shardNum, mstrerror(g_errno));
// no, multicast will free this when it is destroyed
@ -647,37 +658,38 @@ skip:
m->reset();
if ( m_numRequests > 0 )
{
// if( g_conf.m_logDebugDetailed ) log("%s:%s: END - returning false", __FILE__,__FUNCTION__);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END - returning false", __FILE__, __func__, __LINE__);
return false;
}
//#else
// m_mcast.reset();
//#endif
// if( g_conf.m_logDebugDetailed ) log("%s:%s: END - returning true", __FILE__,__FUNCTION__);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END - returning true", __FILE__, __func__, __LINE__);
return true;
}
//#ifdef SPLIT_INDEXDB
m_numRequests++;
//#endif
// we blocked
// if( g_conf.m_logDebugDetailed ) log("%s:%s: END - returning false", __FILE__,__FUNCTION__);
// we blocked
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END - returning false, blocked", __FILE__, __func__, __LINE__);
return false;
}
// . this is called when we got a local RdbList
// . we need to call it to call the original caller callback
void gotListWrapper2 ( void *state , RdbList *list , Msg5 *msg5 )
{
if( g_conf.m_logDebugDetailed ) log("%s:%s: BEGIN", __FILE__,__FUNCTION__);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: BEGIN", __FILE__, __func__, __LINE__);
Msg0 *THIS = (Msg0 *) state;
THIS->reset(); // delete m_msg5
THIS->m_callback ( THIS->m_state );//, THIS->m_list );
if( g_conf.m_logDebugDetailed ) log("%s:%s: END", __FILE__,__FUNCTION__);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END. rdbId=%d", __FILE__, __func__, __LINE__, (int)THIS->m_rdbId);
}
@ -700,7 +712,7 @@ void gotSingleReplyWrapper ( void *state , UdpSlot *slot ) {
void gotMulticastReplyWrapper0 ( void *state , void *state2 )
{
if( g_conf.m_logDebugDetailed ) log("%s:%s: BEGIN", __FILE__,__FUNCTION__);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: BEGIN", __FILE__, __func__, __LINE__);
Msg0 *THIS = (Msg0 *)state;
//#ifdef SPLIT_INDEXDB
@ -746,7 +758,7 @@ void gotMulticastReplyWrapper0 ( void *state , void *state2 )
THIS->m_callback ( THIS->m_state );//, THIS->m_list );
//}
//#endif
if( g_conf.m_logDebugDetailed ) log("%s:%s: END", __FILE__,__FUNCTION__);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END", __FILE__, __func__, __LINE__);
}
@ -857,7 +869,7 @@ void Msg0::gotSplitReply ( ) {
// . we are responsible for freeing reply/replySize
void Msg0::gotReply ( char *reply , int32_t replySize , int32_t replyMaxSize )
{
if( g_conf.m_logDebugDetailed ) log("%s:%s: BEGIN", __FILE__,__FUNCTION__);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: BEGIN", __FILE__, __func__, __LINE__);
// timing debug
@ -909,7 +921,7 @@ void Msg0::gotReply ( char *reply , int32_t replySize , int32_t replyMaxSize )
//cache->addList ( m_startKey , m_list ) ;
// reset g_errno -- we don't care if cache couldn't add it
//g_errno = 0;
if( g_conf.m_logDebugDetailed ) log("%s:%s: END", __FILE__,__FUNCTION__);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END", __FILE__, __func__, __LINE__);
}
@ -932,7 +944,7 @@ public:
void handleRequest0 ( UdpSlot *slot , int32_t netnice ) {
if( g_conf.m_logDebugDetailed ) log("%s:%s: BEGIN. Got request for an RdbList", __FILE__,__FUNCTION__);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: BEGIN. Got request for an RdbList", __FILE__, __func__, __LINE__);
// if niceness is 0, use the higher priority udpServer
UdpServer *us = &g_udpServer;
@ -975,18 +987,18 @@ void handleRequest0 ( UdpSlot *slot , int32_t netnice ) {
CollectionRec *xcr = g_collectiondb.getRec ( collnum );
if ( ! xcr ) g_errno = ENOCOLLREC;
if( g_conf.m_logDebugDetailed )
if( g_conf.m_logTraceMsg0 )
{
log("%s:%s: rdbId....... %d", __FILE__,__FUNCTION__, (int)rdbId);
log("%s:%s: key size.... %d", __FILE__,__FUNCTION__, (int)ks);
log("%s:%s: startFileNum %"INT32"", __FILE__,__FUNCTION__, startFileNum);
log("%s:%s: numFiles.... %"INT32"", __FILE__,__FUNCTION__, numFiles);
log("%s:%s:%d: rdbId....... %d", __FILE__,__func__, __LINE__, (int)rdbId);
log("%s:%s:%d: key size.... %d", __FILE__,__func__, __LINE__, (int)ks);
log("%s:%s:%d: startFileNum %"INT32"", __FILE__,__func__, __LINE__,startFileNum);
log("%s:%s:%d: numFiles.... %"INT32"", __FILE__,__func__, __LINE__, numFiles);
}
// error set from XmlDoc::cacheTermLists()?
if ( g_errno )
{
if( g_conf.m_logDebugDetailed ) log("%s:%s: END. Invalid collection", __FILE__,__FUNCTION__);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END. Invalid collection", __FILE__, __func__, __LINE__);
us->sendErrorReply ( slot , EBADRDBID );
return;
}
@ -998,7 +1010,7 @@ void handleRequest0 ( UdpSlot *slot , int32_t netnice ) {
Rdb *rdb = getRdbFromId ( rdbId );
if ( ! rdb )
{
if( g_conf.m_logDebugDetailed ) log("%s:%s: END. Invalid rdbId", __FILE__,__FUNCTION__);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END. Invalid rdbId", __FILE__, __func__, __LINE__);
us->sendErrorReply ( slot , EBADRDBID );
return;
@ -1064,16 +1076,16 @@ void handleRequest0 ( UdpSlot *slot , int32_t netnice ) {
false,
allowPageCache ) )
{
if( g_conf.m_logDebugDetailed ) log("%s:%s: END. m_msg5.getList returned false", __FILE__,__FUNCTION__);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END. m_msg5.getList returned false", __FILE__, __func__, __LINE__);
return;
}
// call wrapper ourselves
if( g_conf.m_logDebugDetailed ) log("%s:%s: Calling gotListWrapper", __FILE__,__FUNCTION__);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: Calling gotListWrapper", __FILE__, __func__, __LINE__);
gotListWrapper ( st0 , NULL , NULL );
if( g_conf.m_logDebugDetailed ) log("%s:%s: END", __FILE__,__FUNCTION__);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END", __FILE__, __func__, __LINE__);
}
#include "Sections.h" // SectionVote
@ -1082,7 +1094,7 @@ void handleRequest0 ( UdpSlot *slot , int32_t netnice ) {
// . TODO: ensure if this sendReply() fails does it really nuke the slot?
void gotListWrapper ( void *state , RdbList *listb , Msg5 *msg5xx )
{
if( g_conf.m_logDebugDetailed ) log("%s:%s: BEGIN", __FILE__,__FUNCTION__);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: BEGIN", __FILE__, __func__, __LINE__);
// get the state
State00 *st0 = (State00 *)state;
@ -1313,7 +1325,7 @@ void gotListWrapper ( void *state , RdbList *listb , Msg5 *msg5xx )
// if need be
st0->m_us->sendReply_ass( data, dataSize, alloc, allocSize, slot, st0, doneSending_ass, -1, -1, true );
if( g_conf.m_logDebugDetailed ) log("%s:%s: END", __FILE__,__FUNCTION__);
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END", __FILE__, __func__, __LINE__);
}

@ -12285,6 +12285,17 @@ void Parms::init ( ) {
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log trace info for BigFile";
m->m_cgi = "ltrc_bf";
m->m_off = (char *)&g_conf.m_logTraceBigFile - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log trace info for RdbBase";
m->m_cgi = "ltrc_rb";
m->m_off = (char *)&g_conf.m_logTraceRdbBase - g;
@ -12305,16 +12316,6 @@ void Parms::init ( ) {
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log trace info for BigFile";
m->m_cgi = "ltrc_bf";
m->m_off = (char *)&g_conf.m_logTraceBigFile - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log trace info for Repairs";
m->m_cgi = "ltrc_rp";
m->m_off = (char *)&g_conf.m_logTraceRepairs - g;
@ -12335,6 +12336,26 @@ void Parms::init ( ) {
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log trace info for Msg0";
m->m_cgi = "ltrc_msgzero";
m->m_off = (char *)&g_conf.m_logTraceMsg0 - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log trace info for XmlDoc";
m->m_cgi = "ltrc_xmldoc";
m->m_off = (char *)&g_conf.m_logTraceXmlDoc - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log trace info for network messages (excessive!)";
m->m_cgi = "trcmsg";

@ -1139,15 +1139,24 @@ bool Repair::loop ( void *state ) {
// BEGIN NEW STUFF
if( g_conf.m_logTraceRepairs ) log(LOG_TRACE,"%s:%s:%d: injectTitleRec", __FILE__, __func__, __LINE__);
bool status = injectTitleRec();
if( g_conf.m_logTraceRepairs ) log(LOG_TRACE,"%s:%s:%d: injectTitleRec returned %s", __FILE__, __func__, __LINE__, status?"true":"false");
//return false; // (state)
// try to launch another
if ( m_numOutstandingInjects<g_conf.m_maxRepairSpiders ) {
m_stage = STAGE_TITLEDB_0;
if( g_conf.m_logTraceRepairs ) log(LOG_TRACE,"%s:%s:%d: Still have more free repair spiders, loop.", __FILE__, __func__, __LINE__);
goto loop1;
}
// if we are full and it blocked... wait now
if ( ! status ) return false;
if ( ! status )
{
if( g_conf.m_logTraceRepairs ) log(LOG_TRACE,"%s:%s:%d: END, return false. Full queue and blocked.", __FILE__, __func__, __LINE__);
return false;
}
}
if ( m_stage == STAGE_TITLEDB_4 ) {
if( g_conf.m_logTraceRepairs ) log(LOG_TRACE,"%s:%s:%d: STAGE_TITLEDB_4", __FILE__, __func__, __LINE__);
m_stage++;
@ -1166,6 +1175,7 @@ bool Repair::loop ( void *state ) {
// tell injection complete wrapper to call us back, otherwise
// we never end up moving on to the spider phase
g_repair.m_allowInjectToLoop = true;
if( g_conf.m_logTraceRepairs ) log(LOG_TRACE,"%s:%s:%d: END, return false. Have %"INT32" outstanding injects", __FILE__, __func__, __LINE__, m_numOutstandingInjects);
return false;
}
@ -2182,6 +2192,7 @@ bool Repair::injectTitleRec ( ) {
// . get the meta list to add
// . sets m_usePosdb, m_useTitledb, etc.
if( g_conf.m_logTraceRepairs ) log(LOG_TRACE,"%s:%s:%d: Calling indexDoc", __FILE__, __func__, __LINE__);
bool status = xd->indexDoc ( );
// blocked?
if ( ! status )

File diff suppressed because it is too large Load Diff

@ -363,6 +363,8 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
// . "ws" store the terms for PageParser.cpp display
char *XmlDoc::hashAll ( HashTableX *table ) {
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: BEGIN", __FILE__,__func__, __LINE__);
setStatus ( "hashing document" );
if ( m_allHashed ) return (char *)1;
@ -378,7 +380,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
uint8_t *ct = getContentType();
if ( ! ct )
{
if( g_conf.m_logDebugDetailed ) log(LOG_TRACE,"%s:%s: getContentType failed", __FILE__,__func__);
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getContentType failed", __FILE__,__func__, __LINE__);
return NULL;
}
@ -390,6 +392,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
// eventually ban it.
if ( !hashUrl( table, true ) ) // urlOnly (skip IP and term generation)
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashUrl failed", __FILE__,__func__, __LINE__);
return NULL;
}
m_allHashed = true;
@ -399,44 +402,78 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
unsigned char *hc = (unsigned char *)getHopCount();
if ( ! hc || hc == (void *)-1 ) return (char *)hc;
if ( ! hc || hc == (void *)-1 )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getHopCount returned -1", __FILE__,__func__, __LINE__);
return (char *)hc;
}
// need this for hashing
HashTableX *cnt = getCountTable();
if ( ! cnt ) return (char *)cnt;
if ( ! cnt )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getCountTable failed", __FILE__,__func__, __LINE__);
return (char *)cnt;
}
if ( cnt == (void *)-1 ) { char *xx=NULL;*xx=0; }
// and this
//Weights *we = getWeights();
//if ( ! we || we == (void *)-1 ) return (char *)we;
// and this
Links *links = getLinks();
if ( ! links ) return (char *)links;
if ( ! links )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getLinks failed", __FILE__,__func__, __LINE__);
return (char *)links;
}
if ( links == (Links *)-1 ) { char *xx=NULL;*xx=0; }
// and now this
//Synonyms *syn = getSynonyms();
//if ( ! syn || syn == (void *)-1 ) return (char *)syn;
char *wordSpamVec = getWordSpamVec();
if (!wordSpamVec) return (char *)wordSpamVec;
if (!wordSpamVec)
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getWordSpamVec failed", __FILE__,__func__, __LINE__);
return (char *)wordSpamVec;
}
if (wordSpamVec==(void *)-1) {char *xx=NULL;*xx=0;}
char *fragVec = getFragVec();//m_fragBuf.getBufStart();
if ( ! fragVec ) return (char *)fragVec;
if ( ! fragVec )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getFragVec failed", __FILE__,__func__, __LINE__);
return (char *)fragVec;
}
if ( fragVec == (void *)-1 ) { char *xx=NULL;*xx=0; }
// why do we need this?
if ( m_wts ) {
uint8_t *lv = getLangVector();
if ( ! lv ) return (char *)lv;
if ( ! lv )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getLangVector failed", __FILE__,__func__, __LINE__);
return (char *)lv;
}
if ( lv == (void *)-1 ) { char *xx=NULL;*xx=0; }
}
TagRec *gr = getTagRec();
if ( ! gr ) return (char *)gr;
if ( ! gr )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getTagRec failed", __FILE__,__func__, __LINE__);
return (char *)gr;
}
if ( gr == (void *)-1 ) {char *xx=NULL;*xx=0; }
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
if ( ! cr )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getCollRec failed", __FILE__,__func__, __LINE__);
return NULL;
}
// do not repeat this if the cachedb storage call blocks
@ -446,15 +483,39 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
m_dist = 0;
if ( ! hashContentType ( table ) ) return NULL;
if ( ! hashUrl ( table, false ) ) return NULL;
if ( ! hashLanguage ( table ) ) return NULL;
if ( ! hashCountry ( table ) ) return NULL;
if ( ! hashContentType ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashContentType failed", __FILE__,__func__, __LINE__);
return NULL;
}
if ( ! hashUrl ( table, false ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashUrl failed", __FILE__,__func__, __LINE__);
return NULL;
}
if ( ! hashLanguage ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashLanguage failed", __FILE__,__func__, __LINE__);
return NULL;
}
if ( ! hashCountry ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashCountry failed", __FILE__,__func__, __LINE__);
return NULL;
}
// BR 20160117 removed: if ( ! hashSiteNumInlinks( table ) ) return NULL;
// BR 20160117 removed: if ( ! hashTagRec ( table ) ) return NULL;
// BR 20160106 removed: if ( ! hashAds ( table ) ) return NULL;
// BR 20160106 removed: if ( ! hashSubmitUrls ( table ) ) return NULL;
if ( ! hashIsAdult ( table ) ) return NULL;
if ( ! hashIsAdult ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashIsAdult failed", __FILE__,__func__, __LINE__);
return NULL;
}
// has gbhasthumbnail:1 or 0
// BR 20160106 removed: if ( ! hashImageStuff ( table ) ) return NULL;
@ -467,8 +528,11 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
// just set a special bit in posdb key so Rebalance.cpp can work.
// this will hash the content checksum which we need for deduping
// which we use for diffbot custom crawls as well.
if ( ! hashNoSplit ( table ) ) return NULL;
if ( ! hashNoSplit ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashNoSplit failed", __FILE__,__func__, __LINE__);
return NULL;
}
// MDW: i think we just inject empty html with a diffbotreply into
// global index now, so don't need this... 9/28/2014
@ -482,7 +546,11 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
// global index unless this is a json object in which case it is
// hashed above in the call to hashJSON(). this will decrease disk
// usage by about half, posdb* files are pretty big.
if ( ! indexDoc ) return (char *)1;
if ( ! indexDoc )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, !indexDoc", __FILE__,__func__, __LINE__);
return (char *)1;
}
// hash json fields
if ( *ct == CT_JSON ) {
@ -500,7 +568,11 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
// hash the body of the doc first so m_dist is 0 to match
// the rainbow display of sections
if ( ! hashBody2 (table ) ) return NULL;
if ( ! hashBody2 (table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashBody2 failed", __FILE__,__func__, __LINE__);
return NULL;
}
// hash the title now too so neighborhood singles have more
// to match. plus, we only hash these title terms iff they
@ -508,42 +580,89 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
// repeated title terms because we do not do spam detection
// on them. thus, we need to hash these first before anything
// else. give them triple the body score
if ( ! hashTitle ( table )) return NULL;
if ( ! hashTitle ( table ))
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashTitle failed", __FILE__,__func__, __LINE__);
return NULL;
}
// . hash the keywords tag, limited to first 2k of them so far
// . hash above the neighborhoods so the neighborhoods only index
// what is already in the hash table
if ( ! hashMetaKeywords(table ) ) return NULL;
if ( ! hashMetaKeywords(table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaKeywords failed", __FILE__,__func__, __LINE__);
return NULL;
}
// then hash the incoming link text, NO ANOMALIES, because
// we index the single words in the neighborhoods next, and
// we had songfacts.com coming up for the 'street light facts'
// query because it had a bunch of anomalous inlink text.
if ( ! hashIncomingLinkText(table,false,true)) return NULL;
if ( ! hashIncomingLinkText(table,false,true))
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashIncomingLinkText failed", __FILE__,__func__, __LINE__);
return NULL;
}
// then the meta summary and description tags with half the score of
// the body, and only hash a term if was not already hashed above
// somewhere.
if ( ! hashMetaSummary(table) ) return NULL;
if ( ! hashMetaSummary(table) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaSummary failed", __FILE__,__func__, __LINE__);
return NULL;
}
skip:
// this will only increment the scores of terms already in the table
// because the neighborhoods are not technically in the document
// necessarily and we do not want to ruin our precision
if ( ! hashNeighborhoods ( table ) ) return NULL;
if ( ! hashNeighborhoods ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashNeighborhoods failed", __FILE__,__func__, __LINE__);
return NULL;
}
if ( ! hashLinks ( table ) ) return NULL;
if ( ! hashDateNumbers ( table ) ) return NULL;
if ( ! hashMetaTags ( table ) ) return NULL;
if ( ! hashMetaZip ( table ) ) return NULL;
if ( ! hashLinks ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashLinks failed", __FILE__,__func__, __LINE__);
return NULL;
}
if ( ! hashDateNumbers ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashDateNumbers failed", __FILE__,__func__, __LINE__);
return NULL;
}
if ( ! hashMetaTags ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaTags failed", __FILE__,__func__, __LINE__);
return NULL;
}
if ( ! hashMetaZip ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaZip failed", __FILE__,__func__, __LINE__);
return NULL;
}
// BR 20160107 removed: if ( ! hashCharset ( table ) ) return NULL;
// BR 20160107 removed: if ( ! hashRSSInfo ( table ) ) return NULL;
if ( ! hashPermalink ( table ) ) return NULL;
if ( ! hashPermalink ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashPermaLink failed", __FILE__,__func__, __LINE__);
return NULL;
}
// hash gblang:de last for parsing consistency
if ( ! hashLanguageString ( table ) ) return NULL;
if ( ! hashLanguageString ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashLanguageString failed", __FILE__,__func__, __LINE__);
return NULL;
}
// . hash gbkeyword:gbmininlinks where the score is the inlink count
// . the inlink count can go from 1 to 255
@ -556,6 +675,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
//if ( ! m_pbuf ) return true;
// print out the table into g_bufPtr now if we need to
//table->print ( );
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, OK", __FILE__,__func__, __LINE__);
return (char *)1;
}
@ -985,7 +1105,7 @@ bool XmlDoc::hashLinks ( HashTableX *tt ) {
link.isDomainUnwantedForIndexing() ||
link.isPathUnwantedForIndexing() )
{
if( g_conf.m_logDebugDetailed ) log(LOG_TRACE,"%s:%s:%d: Unwanted for indexing [%s]", __FILE__, __func__, __LINE__, link.getUrl());
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: Unwanted for indexing [%s]", __FILE__, __func__, __LINE__, link.getUrl());
continue;
}