forked from Mirrors/privacore-open-source-search-engine
More differentiated trace log
This commit is contained in:
3
Conf.h
3
Conf.h
@ -576,6 +576,9 @@ class Conf {
|
||||
bool m_logTraceBigFile;
|
||||
bool m_logTraceRepairs;
|
||||
bool m_logTraceSpider;
|
||||
bool m_logTraceXmlDoc;
|
||||
bool m_logTraceMsg0;
|
||||
|
||||
bool m_logTraceMsg;
|
||||
// expensive timing messages
|
||||
bool m_logTimingAddurl ;
|
||||
|
78
Msg0.cpp
78
Msg0.cpp
@ -132,7 +132,7 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
|
||||
// bool allowPageCache ) {
|
||||
//#endif
|
||||
|
||||
// if( g_conf.m_logDebugDetailed ) log("%s:%s: BEGIN", __FILE__,__FUNCTION__);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: BEGIN. hostId: %"INT64", rdbId: %d", __FILE__,__func__, __LINE__, hostId, (int)rdbId);
|
||||
|
||||
// this is obsolete! mostly, but we need it for PageIndexdb.cpp to
|
||||
// show a "termlist" for a given query term in its entirety so you
|
||||
@ -153,11 +153,11 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
|
||||
// get keySize of rdb
|
||||
m_ks = getKeySizeFromRdbId ( rdbId );
|
||||
|
||||
// if( g_conf.m_logDebugDetailed )
|
||||
// if( g_conf.m_logTraceMsg0 )
|
||||
// {
|
||||
// log("%s:%s: rdbId. [%d]", __FILE__,__FUNCTION__, (int)rdbId);
|
||||
// log("%s:%s: m_ks.. [%d]", __FILE__,__FUNCTION__, (int)m_ks);
|
||||
// log("%s:%s: hostId [%"INT64"]", __FILE__,__FUNCTION__, hostId);
|
||||
// log("%s:%s:%d: rdbId. [%d]", __FILE__,__func__,__LINE__, (int)rdbId);
|
||||
// log("%s:%s:%d: m_ks.. [%d]", __FILE__,__func__,__LINE__, (int)m_ks);
|
||||
// log("%s:%s:%d: hostId [%"INT64"]", __FILE__,__func__,__LINE__, hostId);
|
||||
// }
|
||||
|
||||
|
||||
@ -170,7 +170,7 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
|
||||
// no longer accept negative minrecsize
|
||||
if ( minRecSizes < 0 ) {
|
||||
g_errno = EBADENGINEER;
|
||||
// if( g_conf.m_logDebugDetailed ) log("%s:%s: END", __FILE__,__FUNCTION__);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END", __FILE__, __func__, __LINE__);
|
||||
|
||||
log(LOG_LOGIC,
|
||||
"net: msg0: Negative minRecSizes no longer supported.");
|
||||
@ -235,7 +235,7 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
|
||||
if ( forceLocalIndexdb ) m_shardNum = getMyShardNum();
|
||||
|
||||
|
||||
// if( g_conf.m_logDebugDetailed ) log("%s:%s: shardNum [%"INT32"]", __FILE__,__FUNCTION__, m_shardNum);
|
||||
// if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: shardNum [%"INT32"]", __FILE__,__func__, __LINE__, m_shardNum);
|
||||
|
||||
|
||||
// . store these parameters
|
||||
@ -321,7 +321,7 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
|
||||
// . don't do this if m_hostId was specified
|
||||
if ( isLocal )
|
||||
{
|
||||
// if( g_conf.m_logDebugDetailed ) log("%s:%s: isLocal", __FILE__,__FUNCTION__);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: isLocal", __FILE__, __func__, __LINE__);
|
||||
|
||||
|
||||
if ( msg5 ) {
|
||||
@ -389,6 +389,7 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
|
||||
m_allowPageCache ) ) return false;
|
||||
// nuke it
|
||||
reset();
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END", __FILE__, __func__, __LINE__);
|
||||
return true;
|
||||
}
|
||||
skip:
|
||||
@ -454,6 +455,8 @@ skip:
|
||||
"data read remotely from %s: %s.",
|
||||
replyBufMaxSize,getDbnameFromId(m_rdbId),
|
||||
mstrerror(g_errno));
|
||||
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END, return true. Could not allocate memory.", __FILE__, __func__, __LINE__);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@ -491,8 +494,10 @@ skip:
|
||||
g_errno = EBADHOSTID;
|
||||
log(LOG_LOGIC,"net: msg0: Bad hostId of %"INT64".",
|
||||
m_hostId);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END, retruen true. Bad hostId", __FILE__, __func__, __LINE__);
|
||||
return true;
|
||||
}
|
||||
|
||||
// if niceness is 0, use the higher priority udpServer
|
||||
UdpServer *us ;
|
||||
uint16_t port;
|
||||
@ -520,8 +525,13 @@ skip:
|
||||
replyBuf ,
|
||||
replyBufMaxSize ,
|
||||
m_niceness ) ) // cback niceness
|
||||
{
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END, return true. Request sent.", __FILE__, __func__, __LINE__);
|
||||
return true;
|
||||
}
|
||||
|
||||
// return false cuz it blocked
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END, return false. sendRequest blocked", __FILE__, __func__, __LINE__);
|
||||
return false;
|
||||
}
|
||||
// timing debug
|
||||
@ -635,8 +645,9 @@ skip:
|
||||
// this is a 96-bit key. TODO: fix...
|
||||
0 , // *(key_t *)cacheKey ,
|
||||
rdbId ,
|
||||
minRecSizes ) ) {
|
||||
log("net: Failed to send request for data from %s in shard "
|
||||
minRecSizes ) )
|
||||
{
|
||||
log(LOG_ERROR, "net: Failed to send request for data from %s in shard "
|
||||
"#%"UINT32" over network: %s.",
|
||||
getDbnameFromId(m_rdbId),m_shardNum, mstrerror(g_errno));
|
||||
// no, multicast will free this when it is destroyed
|
||||
@ -647,37 +658,38 @@ skip:
|
||||
m->reset();
|
||||
if ( m_numRequests > 0 )
|
||||
{
|
||||
// if( g_conf.m_logDebugDetailed ) log("%s:%s: END - returning false", __FILE__,__FUNCTION__);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END - returning false", __FILE__, __func__, __LINE__);
|
||||
|
||||
return false;
|
||||
}
|
||||
//#else
|
||||
// m_mcast.reset();
|
||||
//#endif
|
||||
// if( g_conf.m_logDebugDetailed ) log("%s:%s: END - returning true", __FILE__,__FUNCTION__);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END - returning true", __FILE__, __func__, __LINE__);
|
||||
return true;
|
||||
}
|
||||
//#ifdef SPLIT_INDEXDB
|
||||
m_numRequests++;
|
||||
|
||||
//#endif
|
||||
// we blocked
|
||||
// if( g_conf.m_logDebugDetailed ) log("%s:%s: END - returning false", __FILE__,__FUNCTION__);
|
||||
|
||||
// we blocked
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END - returning false, blocked", __FILE__, __func__, __LINE__);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// . this is called when we got a local RdbList
|
||||
// . we need to call it to call the original caller callback
|
||||
void gotListWrapper2 ( void *state , RdbList *list , Msg5 *msg5 )
|
||||
{
|
||||
if( g_conf.m_logDebugDetailed ) log("%s:%s: BEGIN", __FILE__,__FUNCTION__);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: BEGIN", __FILE__, __func__, __LINE__);
|
||||
|
||||
Msg0 *THIS = (Msg0 *) state;
|
||||
THIS->reset(); // delete m_msg5
|
||||
THIS->m_callback ( THIS->m_state );//, THIS->m_list );
|
||||
|
||||
if( g_conf.m_logDebugDetailed ) log("%s:%s: END", __FILE__,__FUNCTION__);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END. rdbId=%d", __FILE__, __func__, __LINE__, (int)THIS->m_rdbId);
|
||||
}
|
||||
|
||||
|
||||
@ -700,7 +712,7 @@ void gotSingleReplyWrapper ( void *state , UdpSlot *slot ) {
|
||||
|
||||
void gotMulticastReplyWrapper0 ( void *state , void *state2 )
|
||||
{
|
||||
if( g_conf.m_logDebugDetailed ) log("%s:%s: BEGIN", __FILE__,__FUNCTION__);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: BEGIN", __FILE__, __func__, __LINE__);
|
||||
|
||||
Msg0 *THIS = (Msg0 *)state;
|
||||
//#ifdef SPLIT_INDEXDB
|
||||
@ -746,7 +758,7 @@ void gotMulticastReplyWrapper0 ( void *state , void *state2 )
|
||||
THIS->m_callback ( THIS->m_state );//, THIS->m_list );
|
||||
//}
|
||||
//#endif
|
||||
if( g_conf.m_logDebugDetailed ) log("%s:%s: END", __FILE__,__FUNCTION__);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END", __FILE__, __func__, __LINE__);
|
||||
}
|
||||
|
||||
|
||||
@ -857,7 +869,7 @@ void Msg0::gotSplitReply ( ) {
|
||||
// . we are responsible for freeing reply/replySize
|
||||
void Msg0::gotReply ( char *reply , int32_t replySize , int32_t replyMaxSize )
|
||||
{
|
||||
if( g_conf.m_logDebugDetailed ) log("%s:%s: BEGIN", __FILE__,__FUNCTION__);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: BEGIN", __FILE__, __func__, __LINE__);
|
||||
|
||||
|
||||
// timing debug
|
||||
@ -909,7 +921,7 @@ void Msg0::gotReply ( char *reply , int32_t replySize , int32_t replyMaxSize )
|
||||
//cache->addList ( m_startKey , m_list ) ;
|
||||
// reset g_errno -- we don't care if cache couldn't add it
|
||||
//g_errno = 0;
|
||||
if( g_conf.m_logDebugDetailed ) log("%s:%s: END", __FILE__,__FUNCTION__);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END", __FILE__, __func__, __LINE__);
|
||||
}
|
||||
|
||||
|
||||
@ -932,7 +944,7 @@ public:
|
||||
void handleRequest0 ( UdpSlot *slot , int32_t netnice ) {
|
||||
|
||||
|
||||
if( g_conf.m_logDebugDetailed ) log("%s:%s: BEGIN. Got request for an RdbList", __FILE__,__FUNCTION__);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: BEGIN. Got request for an RdbList", __FILE__, __func__, __LINE__);
|
||||
|
||||
// if niceness is 0, use the higher priority udpServer
|
||||
UdpServer *us = &g_udpServer;
|
||||
@ -975,18 +987,18 @@ void handleRequest0 ( UdpSlot *slot , int32_t netnice ) {
|
||||
CollectionRec *xcr = g_collectiondb.getRec ( collnum );
|
||||
if ( ! xcr ) g_errno = ENOCOLLREC;
|
||||
|
||||
if( g_conf.m_logDebugDetailed )
|
||||
if( g_conf.m_logTraceMsg0 )
|
||||
{
|
||||
log("%s:%s: rdbId....... %d", __FILE__,__FUNCTION__, (int)rdbId);
|
||||
log("%s:%s: key size.... %d", __FILE__,__FUNCTION__, (int)ks);
|
||||
log("%s:%s: startFileNum %"INT32"", __FILE__,__FUNCTION__, startFileNum);
|
||||
log("%s:%s: numFiles.... %"INT32"", __FILE__,__FUNCTION__, numFiles);
|
||||
log("%s:%s:%d: rdbId....... %d", __FILE__,__func__, __LINE__, (int)rdbId);
|
||||
log("%s:%s:%d: key size.... %d", __FILE__,__func__, __LINE__, (int)ks);
|
||||
log("%s:%s:%d: startFileNum %"INT32"", __FILE__,__func__, __LINE__,startFileNum);
|
||||
log("%s:%s:%d: numFiles.... %"INT32"", __FILE__,__func__, __LINE__, numFiles);
|
||||
}
|
||||
|
||||
// error set from XmlDoc::cacheTermLists()?
|
||||
if ( g_errno )
|
||||
{
|
||||
if( g_conf.m_logDebugDetailed ) log("%s:%s: END. Invalid collection", __FILE__,__FUNCTION__);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END. Invalid collection", __FILE__, __func__, __LINE__);
|
||||
us->sendErrorReply ( slot , EBADRDBID );
|
||||
return;
|
||||
}
|
||||
@ -998,7 +1010,7 @@ void handleRequest0 ( UdpSlot *slot , int32_t netnice ) {
|
||||
Rdb *rdb = getRdbFromId ( rdbId );
|
||||
if ( ! rdb )
|
||||
{
|
||||
if( g_conf.m_logDebugDetailed ) log("%s:%s: END. Invalid rdbId", __FILE__,__FUNCTION__);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END. Invalid rdbId", __FILE__, __func__, __LINE__);
|
||||
|
||||
us->sendErrorReply ( slot , EBADRDBID );
|
||||
return;
|
||||
@ -1064,16 +1076,16 @@ void handleRequest0 ( UdpSlot *slot , int32_t netnice ) {
|
||||
false,
|
||||
allowPageCache ) )
|
||||
{
|
||||
if( g_conf.m_logDebugDetailed ) log("%s:%s: END. m_msg5.getList returned false", __FILE__,__FUNCTION__);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END. m_msg5.getList returned false", __FILE__, __func__, __LINE__);
|
||||
return;
|
||||
}
|
||||
|
||||
// call wrapper ourselves
|
||||
if( g_conf.m_logDebugDetailed ) log("%s:%s: Calling gotListWrapper", __FILE__,__FUNCTION__);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: Calling gotListWrapper", __FILE__, __func__, __LINE__);
|
||||
|
||||
gotListWrapper ( st0 , NULL , NULL );
|
||||
|
||||
if( g_conf.m_logDebugDetailed ) log("%s:%s: END", __FILE__,__FUNCTION__);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END", __FILE__, __func__, __LINE__);
|
||||
}
|
||||
|
||||
#include "Sections.h" // SectionVote
|
||||
@ -1082,7 +1094,7 @@ void handleRequest0 ( UdpSlot *slot , int32_t netnice ) {
|
||||
// . TODO: ensure if this sendReply() fails does it really nuke the slot?
|
||||
void gotListWrapper ( void *state , RdbList *listb , Msg5 *msg5xx )
|
||||
{
|
||||
if( g_conf.m_logDebugDetailed ) log("%s:%s: BEGIN", __FILE__,__FUNCTION__);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: BEGIN", __FILE__, __func__, __LINE__);
|
||||
|
||||
// get the state
|
||||
State00 *st0 = (State00 *)state;
|
||||
@ -1313,7 +1325,7 @@ void gotListWrapper ( void *state , RdbList *listb , Msg5 *msg5xx )
|
||||
// if need be
|
||||
st0->m_us->sendReply_ass( data, dataSize, alloc, allocSize, slot, st0, doneSending_ass, -1, -1, true );
|
||||
|
||||
if( g_conf.m_logDebugDetailed ) log("%s:%s: END", __FILE__,__FUNCTION__);
|
||||
if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: END", __FILE__, __func__, __LINE__);
|
||||
}
|
||||
|
||||
|
||||
|
41
Parms.cpp
41
Parms.cpp
@ -12285,6 +12285,17 @@ void Parms::init ( ) {
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
|
||||
m->m_title = "log trace info for BigFile";
|
||||
m->m_cgi = "ltrc_bf";
|
||||
m->m_off = (char *)&g_conf.m_logTraceBigFile - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "0";
|
||||
m->m_priv = 1;
|
||||
m->m_page = PAGE_LOG;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "log trace info for RdbBase";
|
||||
m->m_cgi = "ltrc_rb";
|
||||
m->m_off = (char *)&g_conf.m_logTraceRdbBase - g;
|
||||
@ -12305,16 +12316,6 @@ void Parms::init ( ) {
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "log trace info for BigFile";
|
||||
m->m_cgi = "ltrc_bf";
|
||||
m->m_off = (char *)&g_conf.m_logTraceBigFile - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "0";
|
||||
m->m_priv = 1;
|
||||
m->m_page = PAGE_LOG;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "log trace info for Repairs";
|
||||
m->m_cgi = "ltrc_rp";
|
||||
m->m_off = (char *)&g_conf.m_logTraceRepairs - g;
|
||||
@ -12335,6 +12336,26 @@ void Parms::init ( ) {
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "log trace info for Msg0";
|
||||
m->m_cgi = "ltrc_msgzero";
|
||||
m->m_off = (char *)&g_conf.m_logTraceMsg0 - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "0";
|
||||
m->m_priv = 1;
|
||||
m->m_page = PAGE_LOG;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "log trace info for XmlDoc";
|
||||
m->m_cgi = "ltrc_xmldoc";
|
||||
m->m_off = (char *)&g_conf.m_logTraceXmlDoc - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "0";
|
||||
m->m_priv = 1;
|
||||
m->m_page = PAGE_LOG;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
|
||||
m->m_title = "log trace info for network messages (excessive!)";
|
||||
m->m_cgi = "trcmsg";
|
||||
|
13
Repair.cpp
13
Repair.cpp
@ -1139,15 +1139,24 @@ bool Repair::loop ( void *state ) {
|
||||
// BEGIN NEW STUFF
|
||||
if( g_conf.m_logTraceRepairs ) log(LOG_TRACE,"%s:%s:%d: injectTitleRec", __FILE__, __func__, __LINE__);
|
||||
bool status = injectTitleRec();
|
||||
if( g_conf.m_logTraceRepairs ) log(LOG_TRACE,"%s:%s:%d: injectTitleRec returned %s", __FILE__, __func__, __LINE__, status?"true":"false");
|
||||
|
||||
//return false; // (state)
|
||||
// try to launch another
|
||||
if ( m_numOutstandingInjects<g_conf.m_maxRepairSpiders ) {
|
||||
m_stage = STAGE_TITLEDB_0;
|
||||
if( g_conf.m_logTraceRepairs ) log(LOG_TRACE,"%s:%s:%d: Still have more free repair spiders, loop.", __FILE__, __func__, __LINE__);
|
||||
goto loop1;
|
||||
}
|
||||
|
||||
// if we are full and it blocked... wait now
|
||||
if ( ! status ) return false;
|
||||
if ( ! status )
|
||||
{
|
||||
if( g_conf.m_logTraceRepairs ) log(LOG_TRACE,"%s:%s:%d: END, return false. Full queue and blocked.", __FILE__, __func__, __LINE__);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if ( m_stage == STAGE_TITLEDB_4 ) {
|
||||
if( g_conf.m_logTraceRepairs ) log(LOG_TRACE,"%s:%s:%d: STAGE_TITLEDB_4", __FILE__, __func__, __LINE__);
|
||||
m_stage++;
|
||||
@ -1166,6 +1175,7 @@ bool Repair::loop ( void *state ) {
|
||||
// tell injection complete wrapper to call us back, otherwise
|
||||
// we never end up moving on to the spider phase
|
||||
g_repair.m_allowInjectToLoop = true;
|
||||
if( g_conf.m_logTraceRepairs ) log(LOG_TRACE,"%s:%s:%d: END, return false. Have %"INT32" outstanding injects", __FILE__, __func__, __LINE__, m_numOutstandingInjects);
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -2182,6 +2192,7 @@ bool Repair::injectTitleRec ( ) {
|
||||
|
||||
// . get the meta list to add
|
||||
// . sets m_usePosdb, m_useTitledb, etc.
|
||||
if( g_conf.m_logTraceRepairs ) log(LOG_TRACE,"%s:%s:%d: Calling indexDoc", __FILE__, __func__, __LINE__);
|
||||
bool status = xd->indexDoc ( );
|
||||
// blocked?
|
||||
if ( ! status )
|
||||
|
753
XmlDoc.cpp
753
XmlDoc.cpp
File diff suppressed because it is too large
Load Diff
@ -363,6 +363,8 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
|
||||
// . "ws" store the terms for PageParser.cpp display
|
||||
char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: BEGIN", __FILE__,__func__, __LINE__);
|
||||
|
||||
setStatus ( "hashing document" );
|
||||
|
||||
if ( m_allHashed ) return (char *)1;
|
||||
@ -378,7 +380,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
uint8_t *ct = getContentType();
|
||||
if ( ! ct )
|
||||
{
|
||||
if( g_conf.m_logDebugDetailed ) log(LOG_TRACE,"%s:%s: getContentType failed", __FILE__,__func__);
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getContentType failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -390,6 +392,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
// eventually ban it.
|
||||
if ( !hashUrl( table, true ) ) // urlOnly (skip IP and term generation)
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashUrl failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
m_allHashed = true;
|
||||
@ -399,44 +402,78 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
|
||||
|
||||
unsigned char *hc = (unsigned char *)getHopCount();
|
||||
if ( ! hc || hc == (void *)-1 ) return (char *)hc;
|
||||
if ( ! hc || hc == (void *)-1 )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getHopCount returned -1", __FILE__,__func__, __LINE__);
|
||||
return (char *)hc;
|
||||
}
|
||||
|
||||
// need this for hashing
|
||||
HashTableX *cnt = getCountTable();
|
||||
if ( ! cnt ) return (char *)cnt;
|
||||
if ( ! cnt )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getCountTable failed", __FILE__,__func__, __LINE__);
|
||||
return (char *)cnt;
|
||||
}
|
||||
if ( cnt == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// and this
|
||||
//Weights *we = getWeights();
|
||||
//if ( ! we || we == (void *)-1 ) return (char *)we;
|
||||
// and this
|
||||
Links *links = getLinks();
|
||||
if ( ! links ) return (char *)links;
|
||||
if ( ! links )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getLinks failed", __FILE__,__func__, __LINE__);
|
||||
return (char *)links;
|
||||
}
|
||||
if ( links == (Links *)-1 ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// and now this
|
||||
//Synonyms *syn = getSynonyms();
|
||||
//if ( ! syn || syn == (void *)-1 ) return (char *)syn;
|
||||
|
||||
char *wordSpamVec = getWordSpamVec();
|
||||
if (!wordSpamVec) return (char *)wordSpamVec;
|
||||
if (!wordSpamVec)
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getWordSpamVec failed", __FILE__,__func__, __LINE__);
|
||||
return (char *)wordSpamVec;
|
||||
}
|
||||
if (wordSpamVec==(void *)-1) {char *xx=NULL;*xx=0;}
|
||||
|
||||
char *fragVec = getFragVec();//m_fragBuf.getBufStart();
|
||||
if ( ! fragVec ) return (char *)fragVec;
|
||||
if ( ! fragVec )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getFragVec failed", __FILE__,__func__, __LINE__);
|
||||
return (char *)fragVec;
|
||||
}
|
||||
if ( fragVec == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// why do we need this?
|
||||
if ( m_wts ) {
|
||||
uint8_t *lv = getLangVector();
|
||||
if ( ! lv ) return (char *)lv;
|
||||
if ( ! lv )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getLangVector failed", __FILE__,__func__, __LINE__);
|
||||
return (char *)lv;
|
||||
}
|
||||
if ( lv == (void *)-1 ) { char *xx=NULL;*xx=0; }
|
||||
}
|
||||
|
||||
TagRec *gr = getTagRec();
|
||||
if ( ! gr ) return (char *)gr;
|
||||
if ( ! gr )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getTagRec failed", __FILE__,__func__, __LINE__);
|
||||
return (char *)gr;
|
||||
}
|
||||
if ( gr == (void *)-1 ) {char *xx=NULL;*xx=0; }
|
||||
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr ) return NULL;
|
||||
if ( ! cr )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getCollRec failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
// do not repeat this if the cachedb storage call blocks
|
||||
@ -446,15 +483,39 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
m_dist = 0;
|
||||
|
||||
|
||||
if ( ! hashContentType ( table ) ) return NULL;
|
||||
if ( ! hashUrl ( table, false ) ) return NULL;
|
||||
if ( ! hashLanguage ( table ) ) return NULL;
|
||||
if ( ! hashCountry ( table ) ) return NULL;
|
||||
if ( ! hashContentType ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashContentType failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( ! hashUrl ( table, false ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashUrl failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( ! hashLanguage ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashLanguage failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( ! hashCountry ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashCountry failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// BR 20160117 removed: if ( ! hashSiteNumInlinks( table ) ) return NULL;
|
||||
// BR 20160117 removed: if ( ! hashTagRec ( table ) ) return NULL;
|
||||
// BR 20160106 removed: if ( ! hashAds ( table ) ) return NULL;
|
||||
// BR 20160106 removed: if ( ! hashSubmitUrls ( table ) ) return NULL;
|
||||
if ( ! hashIsAdult ( table ) ) return NULL;
|
||||
if ( ! hashIsAdult ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashIsAdult failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// has gbhasthumbnail:1 or 0
|
||||
// BR 20160106 removed: if ( ! hashImageStuff ( table ) ) return NULL;
|
||||
@ -467,8 +528,11 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
// just set a special bit in posdb key so Rebalance.cpp can work.
|
||||
// this will hash the content checksum which we need for deduping
|
||||
// which we use for diffbot custom crawls as well.
|
||||
if ( ! hashNoSplit ( table ) ) return NULL;
|
||||
|
||||
if ( ! hashNoSplit ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashNoSplit failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// MDW: i think we just inject empty html with a diffbotreply into
|
||||
// global index now, so don't need this... 9/28/2014
|
||||
@ -482,7 +546,11 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
// global index unless this is a json object in which case it is
|
||||
// hashed above in the call to hashJSON(). this will decrease disk
|
||||
// usage by about half, posdb* files are pretty big.
|
||||
if ( ! indexDoc ) return (char *)1;
|
||||
if ( ! indexDoc )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, !indexDoc", __FILE__,__func__, __LINE__);
|
||||
return (char *)1;
|
||||
}
|
||||
|
||||
// hash json fields
|
||||
if ( *ct == CT_JSON ) {
|
||||
@ -500,7 +568,11 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
|
||||
// hash the body of the doc first so m_dist is 0 to match
|
||||
// the rainbow display of sections
|
||||
if ( ! hashBody2 (table ) ) return NULL;
|
||||
if ( ! hashBody2 (table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashBody2 failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// hash the title now too so neighborhood singles have more
|
||||
// to match. plus, we only hash these title terms iff they
|
||||
@ -508,42 +580,89 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
// repeated title terms because we do not do spam detection
|
||||
// on them. thus, we need to hash these first before anything
|
||||
// else. give them triple the body score
|
||||
if ( ! hashTitle ( table )) return NULL;
|
||||
if ( ! hashTitle ( table ))
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashTitle failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// . hash the keywords tag, limited to first 2k of them so far
|
||||
// . hash above the neighborhoods so the neighborhoods only index
|
||||
// what is already in the hash table
|
||||
if ( ! hashMetaKeywords(table ) ) return NULL;
|
||||
if ( ! hashMetaKeywords(table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaKeywords failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// then hash the incoming link text, NO ANOMALIES, because
|
||||
// we index the single words in the neighborhoods next, and
|
||||
// we had songfacts.com coming up for the 'street light facts'
|
||||
// query because it had a bunch of anomalous inlink text.
|
||||
if ( ! hashIncomingLinkText(table,false,true)) return NULL;
|
||||
if ( ! hashIncomingLinkText(table,false,true))
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashIncomingLinkText failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// then the meta summary and description tags with half the score of
|
||||
// the body, and only hash a term if was not already hashed above
|
||||
// somewhere.
|
||||
if ( ! hashMetaSummary(table) ) return NULL;
|
||||
if ( ! hashMetaSummary(table) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaSummary failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
skip:
|
||||
|
||||
// this will only increment the scores of terms already in the table
|
||||
// because neighborhoods are not technically in the document
|
||||
// necessarily and we do not want to ruin our precision
|
||||
if ( ! hashNeighborhoods ( table ) ) return NULL;
|
||||
if ( ! hashNeighborhoods ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashNeighborhoods failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
if ( ! hashLinks ( table ) ) return NULL;
|
||||
if ( ! hashDateNumbers ( table ) ) return NULL;
|
||||
if ( ! hashMetaTags ( table ) ) return NULL;
|
||||
if ( ! hashMetaZip ( table ) ) return NULL;
|
||||
if ( ! hashLinks ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashLinks failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( ! hashDateNumbers ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashDateNumbers failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( ! hashMetaTags ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaTags failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( ! hashMetaZip ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaZip failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// BR 20160107 removed: if ( ! hashCharset ( table ) ) return NULL;
|
||||
// BR 20160107 removed: if ( ! hashRSSInfo ( table ) ) return NULL;
|
||||
if ( ! hashPermalink ( table ) ) return NULL;
|
||||
if ( ! hashPermalink ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashPermaLink failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// hash gblang:de last for parsing consistency
|
||||
if ( ! hashLanguageString ( table ) ) return NULL;
|
||||
if ( ! hashLanguageString ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashLanguageString failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// . hash gbkeyword:gbmininlinks where the score is the inlink count
|
||||
// . the inlink count can go from 1 to 255
|
||||
@ -556,6 +675,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
//if ( ! m_pbuf ) return true;
|
||||
// print out the table into g_bufPtr now if we need to
|
||||
//table->print ( );
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, OK", __FILE__,__func__, __LINE__);
|
||||
return (char *)1;
|
||||
}
|
||||
|
||||
@ -985,7 +1105,7 @@ bool XmlDoc::hashLinks ( HashTableX *tt ) {
|
||||
link.isDomainUnwantedForIndexing() ||
|
||||
link.isPathUnwantedForIndexing() )
|
||||
{
|
||||
if( g_conf.m_logDebugDetailed ) log(LOG_TRACE,"%s:%s:%d: Unwanted for indexing [%s]", __FILE__, __func__, __LINE__, link.getUrl());
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: Unwanted for indexing [%s]", __FILE__, __func__, __LINE__, link.getUrl());
|
||||
continue;
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user