Merge branch 'master' into dev-dumpthread

Ai Lin Chia
2017-05-01 14:45:20 +02:00
35 changed files with 574 additions and 672 deletions

@ -1021,6 +1021,7 @@ CollectionRec::CollectionRec() {
m_summDedupNumLines = 0;
m_maxQueryTerms = 0;
m_sameLangWeight = 0.0;
m_unknownLangWeight = 0.0;
memset(m_defaultSortLanguage2, 0, sizeof(m_defaultSortLanguage2));
m_importEnabled = false;
m_numImportInjects = 0;

@ -240,6 +240,7 @@ public:
//ranking settings
float m_sameLangWeight;
float m_unknownLangWeight;
// Language stuff
char m_defaultSortLanguage2[6];

@ -131,6 +131,8 @@ Conf::Conf ( ) {
m_hashGroupWeightInUrl = 0.0;
m_hashGroupWeightInMenu = 0.0;
m_synonymWeight = 0.0;
m_pageTemperatureWeightMin = 0.0;
m_pageTemperatureWeightMax = 0.0;
m_usePageTemperatureForRanking = true;
m_numFlagScoreMultipliers = 26;
m_numFlagRankAdjustments = 26;

Conf.h (3 changed lines)

@ -224,8 +224,9 @@ class Conf {
float m_hashGroupWeightInternalLinkText;
float m_hashGroupWeightInUrl;
float m_hashGroupWeightInMenu;
float m_synonymWeight;
float m_pageTemperatureWeightMin;
float m_pageTemperatureWeightMax;
bool m_usePageTemperatureForRanking;

@ -683,7 +683,7 @@ createFile:
m_hosts[i].m_emailCode = -2;
// reset these
m_hosts[i].m_pingInfo.m_flags = 0;
m_hosts[i].m_pingInfo.m_cpuUsage = 0.0;
m_hosts[i].m_pingInfo.m_unused4 = 0.0;
m_hosts[i].m_loadAvg = 0.0;
m_hosts[i].m_lastResponseReceiveTimestamp = 0;
@ -1237,8 +1237,9 @@ Host *Hostdb::getHostWithSpideringEnabled ( uint32_t shardNum ) {
// if niceness 0 can't pick noquery host/ must pick spider host.
// if niceness 1 can't pick nospider host/ must pick query host.
// Used to select based on PingInfo::m_udpSlotsInUseIncoming but that information is not exchanged often enough to
// be even remotely accurate with any realistic number of shards.
Host *Hostdb::getLeastLoadedInShard ( uint32_t shardNum , char niceness ) {
int32_t minOutstandingRequests = 0x7fffffff;
int32_t minOutstandingRequestsIndex = -1;
Host *shard = getShard ( shardNum );
Host *bestDead = NULL;
@ -1251,13 +1252,7 @@ Host *Hostdb::getLeastLoadedInShard ( uint32_t shardNum , char niceness ) {
if ( niceness == 0 && ! hh->m_queryEnabled ) continue;
if ( ! bestDead ) bestDead = hh;
if(isDead(hh)) continue;
// log("host %" PRId32 " numOutstanding is %" PRId32, hh->m_hostId,
// hh->m_pingInfo.m_udpSlotsInUseIncoming);
if ( hh->m_pingInfo.m_udpSlotsInUseIncoming >
minOutstandingRequests )
continue;
minOutstandingRequests =hh->m_pingInfo.m_udpSlotsInUseIncoming;
minOutstandingRequestsIndex = i;
}
// we should never return a nospider/noquery host depending on
@ -1374,7 +1369,7 @@ bool Hostdb::replaceHost ( int32_t origHostId, int32_t spareHostId ) {
oldHost->m_ping = g_conf.m_deadHostTimeout;
oldHost->m_pingShotgun = g_conf.m_deadHostTimeout;
oldHost->m_emailCode = 0;
oldHost->m_pingInfo.m_udpSlotsInUseIncoming = 0;
oldHost->m_pingInfo.m_unused12 = 0;
oldHost->m_errorReplies = 0;
oldHost->m_dgramsTo = 0;
oldHost->m_dgramsFrom = 0;
@ -1431,27 +1426,27 @@ void Hostdb::updatePingInfo(Host *h, const PingInfo &pi) {
h->m_pingInfo.m_unused0 = 0;
h->m_pingInfo.m_hostId = pi.m_hostId;
h->m_pingInfo.m_loadAvg = pi.m_loadAvg;
h->m_pingInfo.m_percentMemUsed = pi.m_percentMemUsed;
h->m_pingInfo.m_cpuUsage = pi.m_cpuUsage;
h->m_pingInfo.m_unused2 = 0;
h->m_pingInfo.m_unused3 = 0;
h->m_pingInfo.m_unused4 = 0.0;
h->m_pingInfo.m_totalDocsIndexed = pi.m_totalDocsIndexed;
h->m_pingInfo.m_hostsConfCRC = pi.m_hostsConfCRC;
h->m_pingInfo.m_diskUsage = pi.m_diskUsage;
h->m_pingInfo.m_unused7 = 0.0;
h->m_pingInfo.m_flags = pi.m_flags;
h->m_pingInfo.m_numCorruptDiskReads = pi.m_numCorruptDiskReads;
h->m_pingInfo.m_numOutOfMems = pi.m_numOutOfMems;
h->m_pingInfo.m_socketsClosedFromHittingLimit = pi.m_socketsClosedFromHittingLimit;
h->m_pingInfo.m_unused9 = 0;
h->m_pingInfo.m_unused10 = 0;
h->m_pingInfo.m_unused11 = 0;
//m_totalResends is updated directly by UdpSlot
//h->m_pingInfo.m_totalResends = pi.m_totalResends;
//m_etryagains is updated directly by UdpServer
//h->m_pingInfo.m_etryagains = pi.m_etryagains;
h->m_pingInfo.m_udpSlotsInUseIncoming = pi.m_udpSlotsInUseIncoming;
h->m_pingInfo.m_tcpSocketsInUse = pi.m_tcpSocketsInUse;
h->m_pingInfo.m_currentSpiders = pi.m_currentSpiders;
h->m_pingInfo.m_unused12 = 0;
h->m_pingInfo.m_unused13 = 0;
h->m_pingInfo.m_unused14 = 0;
h->m_pingInfo.m_dailyMergeCollnum = pi.m_dailyMergeCollnum;
memcpy(h->m_pingInfo.m_gbVersionStr,pi.m_gbVersionStr,sizeof(pi.m_gbVersionStr));
h->m_pingInfo.m_repairMode = pi.m_repairMode;
h->m_pingInfo.m_recoveryLevel = pi.m_recoveryLevel;
h->m_pingInfo.m_unused18 = 0;
}
@ -1750,15 +1745,14 @@ int32_t *getLocalIps ( ) {
log("hostdb: getifaddrs: %s.",mstrerror(errno));
return NULL;
}
ifaddrs *p = ifap;
int32_t ni = 0;
// store loopback just in case
int32_t loopback = atoip("127.0.0.1");
s_localIps[ni++] = loopback;
for ( ; p && ni < 18 ; p = p->ifa_next ) {
// avoid possible core dump
for(ifaddrs *p = ifap; p && ni < 18 ; p = p->ifa_next) {
if ( ! p->ifa_addr ) continue;
//break; // mdw hack...
if(p->ifa_addr->sa_family != AF_INET)
continue;
struct sockaddr_in *xx = (sockaddr_in *)(void*)p->ifa_addr;
int32_t ip = xx->sin_addr.s_addr;
// skip if loopback we stored above

@ -45,27 +45,27 @@ class PingInfo {
public:
int64_t m_unused0; //used to be a timestamp for clock synchronization
int32_t m_hostId;
int32_t m_loadAvg;
float m_percentMemUsed;
float m_cpuUsage;
int32_t m_unused2; //used to be m_loadAvg
float m_unused3; //used to be m_percentMemUsed
float m_unused4; //used to be m_cpuUsage
int32_t m_totalDocsIndexed;
int32_t m_hostsConfCRC;
float m_diskUsage;
float m_unused7; //used to be m_diskUsage
int32_t m_flags;
// some new stuff
int32_t m_numCorruptDiskReads;
int32_t m_numOutOfMems;
int32_t m_socketsClosedFromHittingLimit;
int32_t m_unused9;
int32_t m_unused10;
int32_t m_unused11;
int32_t m_udpSlotsInUseIncoming;
int32_t m_tcpSocketsInUse;
int32_t m_unused12;
int32_t m_unused13;
int16_t m_currentSpiders;
int16_t m_unused14;
collnum_t m_dailyMergeCollnum;
char m_gbVersionStr[21];
char m_repairMode;
uint8_t m_recoveryLevel;
uint8_t m_unused18;
};
class Host {

@ -7,6 +7,7 @@
#include "Collectiondb.h"
#include "HashTable.h"
#include "Stats.h"
#include "Statistics.h"
#include "HttpMime.h"
#include "Hostdb.h"
#include "Loop.h"
@ -521,7 +522,7 @@ void HttpServer::requestHandler ( TcpSocket *s ) {
sendErrorReply ( s , 500 , "Too many sockets open.");
// count as a failed query so we send an email alert if too
// many of these happen
g_stats.m_closedSockets++;
Statistics::register_socket_limit_hit();
return;
}

@ -49,8 +49,15 @@ void Msg39Request::reset() {
m_collnum = -1;
m_useQueryStopWords = true;
m_doMaxScoreAlgo = true;
m_termFreqWeightFreqMin = 0.0;
m_termFreqWeightFreqMax = 0.5;
m_termFreqWeightMin = 0.5;
m_termFreqWeightMax = 1.0;
m_synonymWeight = 0.9;
m_pageTemperatureWeightMin = 1.0;
m_pageTemperatureWeightMax = 20.0;
m_usePageTemperatureForRanking = true;
for(int i=0; i<26; i++)
m_flagScoreMultiplier[i] = 1.0;
for(int i=0; i<26; i++)
@ -61,6 +68,7 @@ void Msg39Request::reset() {
size_query = 0;
size_whiteList = 0;
m_sameLangWeight = 20.0;
m_unknownLangWeight = 10.0;
// -1 means no docid range restriction
m_minDocId = -1LL;

@ -37,6 +37,7 @@ class Msg39Request {
int32_t m_maxQueryTerms;
int32_t m_numDocIdSplits;
float m_sameLangWeight;
float m_unknownLangWeight;
//int32_t m_compoundListMaxSize;
uint8_t m_language;
@ -58,7 +59,13 @@ class Msg39Request {
bool m_doMaxScoreAlgo;
ScoringWeights m_scoringWeights;
float m_termFreqWeightFreqMin;
float m_termFreqWeightFreqMax;
float m_termFreqWeightMin;
float m_termFreqWeightMax;
float m_synonymWeight;
float m_pageTemperatureWeightMin;
float m_pageTemperatureWeightMax;
bool m_usePageTemperatureForRanking;
float m_flagScoreMultiplier[26];

@ -200,7 +200,8 @@ bool Msg3a::getDocIds(const SearchInput *si, Query *q, void *state,
(PTRTYPE)this);
}
setTermFreqWeights(m_msg39req.m_collnum, m_q);
setTermFreqWeights(m_msg39req.m_collnum, m_q, m_msg39req.m_termFreqWeightFreqMin, m_msg39req.m_termFreqWeightFreqMax,
m_msg39req.m_termFreqWeightMin, m_msg39req.m_termFreqWeightMax);
if ( m_debug ) {
for ( int32_t i = 0 ; i < m_q->m_numTerms ; i++ ) {
@ -1005,15 +1006,15 @@ void Msg3a::printTerms ( ) {
}
static float getTermFreqWeight(int64_t termFreq, int64_t numDocsInColl) {
static float getTermFreqWeight(int64_t termFreq, int64_t numDocsInColl, float termFreqWeightFreqMin, float termFreqWeightFreqMax, float termFreqWeightMin, float termFreqWeightMax) {
if(numDocsInColl>0)
return scale_linear(((float)termFreq)/numDocsInColl, g_conf.m_termFreqWeightFreqMin, g_conf.m_termFreqWeightFreqMax, g_conf.m_termFreqWeightMax, g_conf.m_termFreqWeightMin);
return scale_linear(((float)termFreq)/numDocsInColl, termFreqWeightFreqMin, termFreqWeightFreqMax, termFreqWeightMax, termFreqWeightMin);
else
return 1.0; //whatever...
}
void setTermFreqWeights ( collnum_t collnum , Query *q ) {
void setTermFreqWeights ( collnum_t collnum , Query *q, float termFreqWeightFreqMin, float termFreqWeightFreqMax, float termFreqWeightMin, float termFreqWeightMax) {
int64_t numDocsInColl = 0;
RdbBase *base = getRdbBase ( RDB_CLUSTERDB, collnum );
if ( base ) numDocsInColl = base->estimateNumGlobalRecs();
@ -1032,7 +1033,7 @@ void setTermFreqWeights ( collnum_t collnum , Query *q ) {
// GET THE TERMFREQ for setting weights
int64_t tf = g_posdb.getTermFreq ( collnum ,qt->m_termId);
qt->m_termFreq = tf;
float tfw = getTermFreqWeight(tf,numDocsInColl);
float tfw = getTermFreqWeight(tf,numDocsInColl, termFreqWeightFreqMin, termFreqWeightFreqMax, termFreqWeightMin, termFreqWeightMax);
qt->m_termFreqWeight = tfw;
}
}
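A minimal, self-contained sketch of what the four new term-frequency fields control, assuming scale_linear(x, x0, x1, y0, y1) maps x from [x0..x1] linearly onto [y0..y1]; scale_linear_sketch below is a hypothetical stand-in for the ScalingFunctions.h helper, and the defaults are those set in Msg39Request::reset() above:

#include <cstdio>

// Hypothetical stand-in for scale_linear(): map x from [x0..x1] onto [y0..y1],
// clamping at the endpoints.
static float scale_linear_sketch(float x, float x0, float x1, float y0, float y1) {
	if(x <= x0) return y0;
	if(x >= x1) return y1;
	return y0 + (x - x0) * (y1 - y0) / (x1 - x0);
}

int main() {
	// Defaults from Msg39Request::reset()
	float termFreqWeightFreqMin = 0.0f, termFreqWeightFreqMax = 0.5f;
	float termFreqWeightMin = 0.5f, termFreqWeightMax = 1.0f;
	// A term matching 25% of the documents in the collection gets a weight
	// halfway between the maximum (at 0% frequency) and the minimum (at 50%).
	float ratio = 0.25f;
	float weight = scale_linear_sketch(ratio,
	                                   termFreqWeightFreqMin, termFreqWeightFreqMax,
	                                   termFreqWeightMax, termFreqWeightMin);
	printf("term frequency weight = %.2f\n", weight); // 0.75 with the defaults
	return 0;
}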

@ -7,7 +7,7 @@
class SearchInput;
class Query;
void setTermFreqWeights ( collnum_t collnum, class Query *q );
void setTermFreqWeights ( collnum_t collnum , Query *q, float termFreqWeightFreqMin, float termFreqWeightFreqMax, float termFreqWeightMin, float termFreqWeightMax);
#define MAX_SHARDS 1024

@ -344,7 +344,16 @@ bool Msg40::federatedLoop ( ) {
m_si->m_hashGroupWeightInternalLinkText,
m_si->m_hashGroupWeightInUrl,
m_si->m_hashGroupWeightInMenu);
mr.m_termFreqWeightFreqMin = m_si->m_termFreqWeightFreqMin;
mr.m_termFreqWeightFreqMax = m_si->m_termFreqWeightFreqMax;
mr.m_termFreqWeightMin = m_si->m_termFreqWeightMin;
mr.m_termFreqWeightMax = m_si->m_termFreqWeightMax;
mr.m_synonymWeight = m_si->m_synonymWeight;
mr.m_pageTemperatureWeightMin = m_si->m_pageTemperatureWeightMin;
mr.m_pageTemperatureWeightMax = m_si->m_pageTemperatureWeightMax;
mr.m_usePageTemperatureForRanking = m_si->m_usePageTemperatureForRanking;
memcpy(mr.m_flagScoreMultiplier, m_si->m_flagScoreMultiplier, sizeof(mr.m_flagScoreMultiplier));
memcpy(mr.m_flagRankAdjustment, m_si->m_flagRankAdjustment, sizeof(mr.m_flagRankAdjustment));
@ -364,6 +373,7 @@ bool Msg40::federatedLoop ( ) {
mr.m_minSerpDocId = m_si->m_minSerpDocId;
mr.m_maxSerpScore = m_si->m_maxSerpScore;
mr.m_sameLangWeight = m_si->m_sameLangWeight;
mr.m_unknownLangWeight = m_si->m_unknownLangWeight;
memcpy(mr.m_queryId, m_si->m_queryId, sizeof(m_si->m_queryId));
if ( mr.m_timeout < m_si->m_minMsg3aTimeout )
@ -1494,6 +1504,14 @@ bool Msg40::gotSummary ( ) {
continue;
}
// filter simplified redirection/non-canonical document
if (mr && mr->size_rubuf > 1 && mr->m_contentLen == 0) {
if (!m_si->m_showErrors) {
*level = CR_EMPTY_REDIRECTION_PAGE;
continue;
}
}
// filter empty title & summaries
if ( mr && mr->size_tbuf <= 1 && mr->size_displaySum <= 1 ) {
if ( ! m_si->m_showErrors ) {

@ -34,8 +34,8 @@ const char * const g_crStrings[] = {
"summary error" ,
"duplicate" ,
"clusterdb error (subcount of visible)" ,
"duplicate url",
"wasted summary lookup" ,
"duplicate url",
"empty redirection page" ,
"visible" ,
"blacklisted" ,
"ruleset filtered" ,

@ -48,9 +48,8 @@ enum {
CR_ERROR_CLUSTERDB ,
// the url is a dup of a previous url (wiki pages capitalization)
CR_DUP_URL ,
// . subset of the CR_OK (visible) results are "wasted" titlerec lookup
// . only used for stats by Msg40.cpp/Stats.cpp
CR_WASTED ,
// the url doesn't have any content due to simplified redirection page/non-canonical page
CR_EMPTY_REDIRECTION_PAGE,
// the docid is ok to display!
CR_OK ,
// from a blacklisted site hash

@ -21,9 +21,6 @@ static int errorsSort ( const void *i1, const void *i2 );
static int tryagainSort ( const void *i1, const void *i2 );
static int dgramsToSort ( const void *i1, const void *i2 );
static int dgramsFromSort ( const void *i1, const void *i2 );
static int memUsedSort ( const void *i1, const void *i2 );
static int cpuUsageSort ( const void *i1, const void *i2 );
static int diskUsageSort ( const void *i1, const void *i2 );
static int32_t generatePingMsg( Host *h, int64_t nowms, char *buffer );
@ -156,15 +153,6 @@ skipReplaceHost:
"<td><b>docs indexed</a></td>"
"<td><a href=\"/admin/hosts?c=%s&sort=9\">"
"<b>mem used</a></td>"
"<td><a href=\"/admin/hosts?c=%s&sort=10\">"
"<b>cpu used</b></a></td>"
"<td><a href=\"/admin/hosts?c=%s&sort=17\">"
"<b>disk used</b></a></td>"
"<td><a href=\"/admin/hosts?c=%s&sort=14\">"
"<b>max ping1</b></a></td>"
@ -190,9 +178,6 @@ skipReplaceHost:
cs,
cs,
cs,
cs,
cs,
cs,
shotcol );
// loop through each host we know and print it's stats
@ -225,15 +210,12 @@ skipReplaceHost:
case 6: gbsort ( hostSort, nh, sizeof(int32_t), dgramsToSort ); break;
case 7: gbsort ( hostSort, nh, sizeof(int32_t), dgramsFromSort ); break;
//case 8:
case 9: gbsort ( hostSort, nh, sizeof(int32_t), memUsedSort ); break;
case 10:gbsort ( hostSort, nh, sizeof(int32_t), cpuUsageSort ); break;
case 11:gbsort ( hostSort, nh, sizeof(int32_t), pingAgeSort ); break;
case 12:gbsort ( hostSort, nh, sizeof(int32_t), flagSort ); break;
case 13:gbsort ( hostSort, nh, sizeof(int32_t), splitTimeSort ); break;
case 14:gbsort ( hostSort, nh, sizeof(int32_t), pingMaxSort ); break;
//case 15:
case 16:gbsort ( hostSort, nh, sizeof(int32_t), defaultSort ); break;
case 17:gbsort ( hostSort, nh, sizeof(int32_t), diskUsageSort ); break;
}
@ -309,27 +291,6 @@ skipReplaceHost:
char ipbuf3[64];
strcpy(ipbuf3,iptoa(eip));
const char *fontTagFront = "";
const char *fontTagBack = "";
if ( h->m_pingInfo.m_percentMemUsed >= 98.0 &&
format == FORMAT_HTML ) {
fontTagFront = "<font color=red>";
fontTagBack = "</font>";
}
float cpu = h->m_pingInfo.m_cpuUsage;
if ( cpu > 100.0 ) cpu = 100.0;
if ( cpu < 0.0 ) cpu = -1.0;
char diskUsageMsg[64];
sprintf(diskUsageMsg,"%.1f%%",h->m_pingInfo.m_diskUsage);
if ( h->m_pingInfo.m_diskUsage < 0.0 )
sprintf(diskUsageMsg,"???");
if ( h->m_pingInfo.m_diskUsage>=98.0 && format == FORMAT_HTML )
sprintf(diskUsageMsg,"<font color=red><b>%.1f%%"
"</b></font>",h->m_pingInfo.m_diskUsage);
// split time, don't divide by zero!
int32_t splitTime = 0;
if ( h->m_splitsDone )
@ -355,42 +316,10 @@ skipReplaceHost:
int32_t flags = h->m_pingInfo.m_flags;
if ( format == FORMAT_HTML ) {
// use these new ones for now
int n = h->m_pingInfo.m_numCorruptDiskReads;
if ( n )
fb.safePrintf("<font color=red><b>"
"C"
"<sup>%" PRId32"</sup>"
"</b></font>"
, n );
n = h->m_pingInfo.m_numOutOfMems;
if ( n )
fb.safePrintf("<font color=red><b>"
"O"
"<sup>%" PRId32"</sup>"
"</b></font>"
, n );
n = h->m_pingInfo.m_socketsClosedFromHittingLimit;
if ( n )
fb.safePrintf("<font color=red><b>"
"K"
"<sup>%" PRId32"</sup>"
"</b></font>"
, n );
}
// recovery mode? recovered from coring?
if ((flags & PFLAG_RECOVERYMODE)&& format == FORMAT_HTML ) {
fb.safePrintf("<b title=\"Recovered from core"
"\">x</b>");
// this is only 8-bits at the moment so it's capped
// at 255. this level is 1 the first time we core
// and are restarted.
if ( h->m_pingInfo.m_recoveryLevel > 1 )
fb.safePrintf("<sup>%" PRId32"</sup>",
(int32_t)
h->m_pingInfo.m_recoveryLevel);
}
if ((flags & PFLAG_RECOVERYMODE)&& format != FORMAT_HTML )
@ -416,59 +345,7 @@ skipReplaceHost:
// if it has spiders going on say "S" with # as the superscript
if ((flags & PFLAG_HASSPIDERS) && format == FORMAT_HTML )
fb.safePrintf ( "<span title=\"Spidering\">S"
"<sup>%" PRId32"</sup>"
"</span>"
,h->m_pingInfo.m_currentSpiders
);
if ( format == FORMAT_HTML &&
h->m_pingInfo.m_udpSlotsInUseIncoming ) {
const char *f1 = "";
const char *f2 = "";
// MAXUDPSLOTS in Spider.cpp is 300 right now
if ( h->m_pingInfo.m_udpSlotsInUseIncoming >= 300 ) {
f1 = "<b>";
f2 = "</b>";
}
if ( h->m_pingInfo.m_udpSlotsInUseIncoming >= 400 ) {
f1 = "<b><font color=red>";
f2 = "</font></b>";
}
fb.safePrintf("<span title=\"udpSlotsInUse\">"
"%s"
"U"
"<sup>%" PRId32"</sup>"
"%s"
"</span>"
,f1
,h->m_pingInfo.m_udpSlotsInUseIncoming
,f2
);
}
if ( format == FORMAT_HTML && h->m_pingInfo.m_tcpSocketsInUse){
const char *f1 = "";
const char *f2 = "";
if ( h->m_pingInfo.m_tcpSocketsInUse >= 100 ) {
f1 = "<b>";
f2 = "</b>";
}
if ( h->m_pingInfo.m_tcpSocketsInUse >= 200 ) {
f1 = "<b><font color=red>";
f2 = "</font></b>";
}
fb.safePrintf("<span title=\"tcpSocketsInUse\">"
"%s"
"T"
"<sup>%" PRId32"</sup>"
"%s"
"</span>"
,f1
,h->m_pingInfo.m_tcpSocketsInUse
,f2
);
}
fb.safePrintf ( "<span title=\"Spidering\">S</span>");
if ((flags & PFLAG_HASSPIDERS) && format != FORMAT_HTML )
fb.safePrintf ( "Spidering");
@ -556,14 +433,6 @@ skipReplaceHost:
"</errorTryAgains>\n",
h->m_etryagains.load());
sb.safePrintf("\t\t<udpSlotsInUse>%" PRId32
"</udpSlotsInUse>\n",
h->m_pingInfo.m_udpSlotsInUseIncoming);
sb.safePrintf("\t\t<tcpSocketsInUse>%" PRId32
"</tcpSocketsInUse>\n",
h->m_pingInfo.m_tcpSocketsInUse);
/*
sb.safePrintf("\t\t<dgramsTo>%" PRId64"</dgramsTo>\n",
h->m_dgramsTo);
@ -571,21 +440,6 @@ skipReplaceHost:
h->m_dgramsFrom);
*/
sb.safePrintf("\t\t<numCorruptDiskReads>%" PRId32
"</numCorruptDiskReads>\n"
,h->m_pingInfo.m_numCorruptDiskReads);
sb.safePrintf("\t\t<numOutOfMems>%" PRId32
"</numOutOfMems>\n"
,h->m_pingInfo.m_numOutOfMems);
sb.safePrintf("\t\t<numClosedSockets>%" PRId32
"</numClosedSockets>\n"
,h->m_pingInfo.
m_socketsClosedFromHittingLimit);
sb.safePrintf("\t\t<numOutstandingSpiders>%" PRId32
"</numOutstandingSpiders>\n"
,h->m_pingInfo.m_currentSpiders );
sb.safePrintf("\t\t<splitTime>%" PRId32"</splitTime>\n",
splitTime);
sb.safePrintf("\t\t<splitsDone>%" PRId32"</splitsDone>\n",
@ -598,18 +452,6 @@ skipReplaceHost:
"</docsIndexed>\n",
h->m_pingInfo.m_totalDocsIndexed);
sb.safePrintf("\t\t<percentMemUsed>%.1f%%"
"</percentMemUsed>",
h->m_pingInfo.m_percentMemUsed); // float
sb.safePrintf("\t\t<cpuUsage>%.1f%%"
"</cpuUsage>",
cpu );
sb.safePrintf("\t\t<percentDiskUsed><![CDATA[%s]]>"
"</percentDiskUsed>",
diskUsageMsg);
sb.safePrintf("\t\t<maxPing1>%s</maxPing1>\n",
pms );
@ -671,10 +513,6 @@ skipReplaceHost:
*/
sb.safePrintf("\t\t\t\t\"errorTryAgains\":%" PRId32",\n",
h->m_etryagains.load());
sb.safePrintf("\t\t\t\t\"udpSlotsInUse\":%" PRId32",\n",
h->m_pingInfo.m_udpSlotsInUseIncoming);
sb.safePrintf("\t\t\t\t\"tcpSocketsInUse\":%" PRId32",\n",
h->m_pingInfo.m_tcpSocketsInUse);
/*
sb.safePrintf("\t\t\t\t\"dgramsTo\":%" PRId64",\n",
@ -684,18 +522,6 @@ skipReplaceHost:
*/
sb.safePrintf("\t\t\t\t\"numCorruptDiskReads\":%" PRId32",\n"
,h->m_pingInfo.m_numCorruptDiskReads);
sb.safePrintf("\t\t\t\t\"numOutOfMems\":%" PRId32",\n"
,h->m_pingInfo.m_numOutOfMems);
sb.safePrintf("\t\t\t\t\"numClosedSockets\":%" PRId32",\n"
,h->m_pingInfo.
m_socketsClosedFromHittingLimit);
sb.safePrintf("\t\t\t\t\"numOutstandingSpiders\":%" PRId32
",\n"
,h->m_pingInfo.m_currentSpiders );
sb.safePrintf("\t\t\t\t\"splitTime\":%" PRId32",\n",
splitTime);
sb.safePrintf("\t\t\t\t\"splitsDone\":%" PRId32",\n",
@ -707,14 +533,6 @@ skipReplaceHost:
sb.safePrintf("\t\t\t\t\"docsIndexed\":%" PRId32",\n",
h->m_pingInfo.m_totalDocsIndexed);
sb.safePrintf("\t\t\t\t\"percentMemUsed\":\"%.1f%%\",\n",
h->m_pingInfo.m_percentMemUsed); // float
sb.safePrintf("\t\t\t\t\"cpuUsage\":\"%.1f%%\",\n",cpu);
sb.safePrintf("\t\t\t\t\"percentDiskUsed\":\"%s\",\n",
diskUsageMsg);
sb.safePrintf("\t\t\t\t\"maxPing1\":\"%s\",\n",pms);
sb.safePrintf("\t\t\t\t\"maxPingAge1\":\"%" PRId32"ms\",\n",
@ -799,13 +617,6 @@ skipReplaceHost:
// docs indexed
"<td>%" PRId32"</td>"
// percent mem used
"<td>%s%.1f%%%s</td>"
// cpu usage
"<td>%.1f%%</td>"
// disk usage
"<td>%s</td>"
// ping max
"<td>%s</td>"
@ -844,12 +655,6 @@ skipReplaceHost:
h->m_pingInfo.m_totalDocsIndexed,
fontTagFront,
h->m_pingInfo.m_percentMemUsed, // float
fontTagBack,
cpu, // float
diskUsageMsg,
// ping max
pms,
// ping age
@ -889,13 +694,6 @@ skipReplaceHost:
// end the table now
sb.safePrintf ( "</table><br>\n" );
sb.safePrintf("<table>");
for(int i=0; i<nh; i++) {
Host *h = g_hostdb.getHost(hostSort[i]);
sb.safePrintf("<tr><td>%lu</t><td>%lu</td></tr>", h->getLastRequestSendTimestamp(), h->getLastResponseReceiveTimestamp());
}
sb.safePrintf("</table>");
if( g_hostdb.m_numSpareHosts ) {
// print spare hosts table
@ -1341,33 +1139,3 @@ int dgramsFromSort ( const void *i1, const void *i2 ) {
if ( h1->m_dgramsFrom < h2->m_dgramsFrom ) return 1;
return 0;
}
int memUsedSort ( const void *i1, const void *i2 ) {
Host *h1 = g_hostdb.getHost ( *(int32_t*)i1 );
Host *h2 = g_hostdb.getHost ( *(int32_t*)i2 );
PingInfo *p1 = &h1->m_pingInfo;
PingInfo *p2 = &h2->m_pingInfo;
if ( p1->m_percentMemUsed > p2->m_percentMemUsed ) return -1;
if ( p1->m_percentMemUsed < p2->m_percentMemUsed ) return 1;
return 0;
}
int cpuUsageSort ( const void *i1, const void *i2 ) {
Host *h1 = g_hostdb.getHost ( *(int32_t*)i1 );
Host *h2 = g_hostdb.getHost ( *(int32_t*)i2 );
PingInfo *p1 = &h1->m_pingInfo;
PingInfo *p2 = &h2->m_pingInfo;
if ( p1->m_cpuUsage > p2->m_cpuUsage ) return -1;
if ( p1->m_cpuUsage < p2->m_cpuUsage ) return 1;
return 0;
}
int diskUsageSort ( const void *i1, const void *i2 ) {
Host *h1 = g_hostdb.getHost ( *(int32_t*)i1 );
Host *h2 = g_hostdb.getHost ( *(int32_t*)i2 );
PingInfo *p1 = &h1->m_pingInfo;
PingInfo *p2 = &h2->m_pingInfo;
if ( p1->m_diskUsage > p2->m_diskUsage ) return -1;
if ( p1->m_diskUsage < p2->m_diskUsage ) return 1;
return 0;
}

@ -2737,7 +2737,7 @@ badformat:
if ( scr ) coll = scr->m_coll;
if ( si->m_format == FORMAT_HTML && printCached ) {
sb->safePrintf ( "<a href=\"/get?q=%s&qlang=%s&c=%s&d=%" PRId64 "&cnsp=0\">cached</a>\n",
sb->safePrintf ( "<a href=\"/get?q=%s&qlang=%s&c=%s&d=%" PRId64 "&cnsp=0\">cached</a> - \n",
st->m_qesb.getBufStart() ,
si->m_defaultSortLang, // "qlang" parm
coll ,
@ -2750,7 +2750,7 @@ badformat:
if ( si->m_format == FORMAT_HTML && si->m_getDocIdScoringInfo ) {
// place holder for backlink table link
placeHolder = sb->length();
sb->safePrintf (" - <a onclick="
sb->safePrintf ("<a onclick="
"\""
"var e = document.getElementById('bl%" PRId32"');"
"if ( e.style.display == 'none' ){"
@ -2772,7 +2772,7 @@ badformat:
placeHolderLen = sb->length() - placeHolder;
// unhide the scoring table on click
sb->safePrintf (" - <a onclick="
sb->safePrintf ("<a onclick="
"\""
"var e = document.getElementById('sc%" PRId32"');"
"if ( e.style.display == 'none' ){"

@ -1,10 +1,11 @@
#include "PageTemperatureRegistry.h"
#include "ScalingFunctions.h"
#include "Log.h"
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/stat.h>
#include <math.h>
PageTemperatureRegistry g_pageTemperatureRegistry;
@ -97,13 +98,18 @@ bool PageTemperatureRegistry::load() {
temperature_range_for_scaling = max_temperature-min_temperature;
min_temperature_log = log(min_temperature);
max_temperature_log = log(max_temperature);
temperature_range_for_scaling_log = log(temperature_range_for_scaling);
default_temperature_log = log(default_temperature);
if(!using_meta)
log(LOG_WARN, "meta-file %s could not be loaded. Using default temperature of %u which can scew results for new pages", meta_filename, default_temperature);
log(LOG_DEBUG, "pagetemp: min_temperature=%u",min_temperature);
log(LOG_DEBUG, "pagetemp: max_temperature=%u",max_temperature);
log(LOG_DEBUG, "pagetemp: default_temperature=%u",default_temperature);
log(LOG_DEBUG, "%s loaded (%lu items)", filename, (unsigned long)new_entries);
return true;
}
@ -129,11 +135,15 @@ unsigned PageTemperatureRegistry::query_page_temperature_internal(uint64_t docid
}
double PageTemperatureRegistry::query_page_temperature(uint64_t docid) const {
double PageTemperatureRegistry::query_page_temperature(uint64_t docid, double range_min, double range_max) const {
if(hash_table_size==0)
return 1.0;
unsigned temperature_26bit = query_page_temperature_internal(docid);
return scale_linear(default_temperature_log, min_temperature_log, max_temperature_log, range_min, range_max);
double temperature_26bit_log = log((double)query_page_temperature_internal(docid));
//Then scale to a number in the range [range_min..range_max]
//It is a bit annoying to do this computation for each lookup but it saves memory
return ((double)(temperature_26bit - min_temperature)) / temperature_range_for_scaling;
// return ((double)(temperature_26bit - min_temperature)) / temperature_range_for_scaling;
return scale_linear(temperature_26bit_log, min_temperature_log, max_temperature_log, range_min, range_max);
}
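In effect, assuming scale_linear is a plain linear map between the two ranges, the multiplier returned for a page with raw 26-bit temperature t is

    range_min + (log(t) - min_temperature_log) / (max_temperature_log - min_temperature_log) * (range_max - range_min)

so with the default range of 1.0 .. 20.0 set in Msg39Request::reset() the coldest registered page contributes a x1 factor to the score and the hottest a x20 factor.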

@ -15,6 +15,12 @@ class PageTemperatureRegistry {
unsigned max_temperature;
unsigned temperature_range_for_scaling;
unsigned default_temperature;
double min_temperature_log;
double max_temperature_log;
double temperature_range_for_scaling_log;
double default_temperature_log;
unsigned query_page_temperature_internal(uint64_t docid) const;
public:
PageTemperatureRegistry()
@ -26,7 +32,7 @@ public:
bool load();
void unload();
double query_page_temperature(uint64_t docid) const;
double query_page_temperature(uint64_t docid, double range_min, double range_max) const;
bool empty() const { return entries==0; }
};

@ -2411,25 +2411,6 @@ bool printRedBox ( SafeBuf *mb , TcpSocket *sock , HttpRequest *hr ) {
mb->safePrintf("%s",boxEnd);
}
// out of disk space?
int32_t out = 0;
for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
Host *h = &g_hostdb.m_hosts[i];
if ( h->m_pingInfo.m_diskUsage < 98.0 ) continue;
out++;
}
if ( out > 0 ) {
if ( adds ) mb->safePrintf("<br>");
adds++;
const char *s = "s are";
if ( out == 1 ) s = " is";
mb->safePrintf("%s",box);
mb->safePrintf("%" PRId32" host%s over 98%% disk usage. "
"See the <a href=/admin/hosts?c=%s>"
"hosts</a> table.",out,s,coll);
mb->safePrintf("%s",boxEnd);
}
// injections disabled?
if ( ! g_conf.m_injectionsEnabled ) {
if ( adds ) mb->safePrintf("<br>");
@ -2481,13 +2462,12 @@ bool printRedBox ( SafeBuf *mb , TcpSocket *sock , HttpRequest *hr ) {
for ( int32_t i = 1 ; i < g_hostdb.getNumHosts() ; i++ ) {
Host *h = &g_hostdb.m_hosts[i];
if ( g_hostdb.isDead( h ) ) continue;
if ( h->m_pingInfo.m_udpSlotsInUseIncoming>= 400)jammedHosts++;
}
if ( jammedHosts > 0 ) {
if ( adds ) mb->safePrintf("<br>");
adds++;
const char *s = "s are";
if ( out == 1 ) s = " is";
if ( jammedHosts == 1 ) s = " is";
mb->safePrintf("%s",box);
mb->safePrintf("%" PRId32" host%s jammed with "
"over %" PRId32" unhandled "

Parms.cpp (407 changed lines)

@ -53,8 +53,40 @@ public:
};
//
// User configured values for these parms need to be adjusted to internal ranges
//
const struct {
char *name;
float div_by;
} static g_fxui_parms[] = {
{"diversityweightmin", 100.0},
{"diversityweightmax", 100.0},
{"densityweightmin", 100.0},
{"densityweightmax", 100.0},
{"hgw_body", 10.0},
{"hgw_title", 10.0},
{"hgw_heading", 10.0},
{"hgw_list", 10.0},
{"hgw_metatag", 10.0},
{"hgw_inlinktext", 10.0},
{"hgw_intag", 10.0},
{"hgw_neighborhood", 10.0},
{"hgw_inmenu", 10.0},
{"hgw_inintlinktext", 10.0},
{"hgw_inurl", 10.0},
{"synonym_weight", 10.0},
{"termfreqweightfreqmin", 100.0},
{"termfreqweightfreqmax", 100.0},
{"termfreqweightmin", 100.0},
{"termfreqweightmax", 100.0}
};
static const int g_num_fxui_parms = sizeof(g_fxui_parms) / sizeof(g_fxui_parms[0]);
Parms g_parms;
Parm::Parm() {
// Coverity
m_title = NULL;
@ -827,8 +859,7 @@ bool Parms::setGigablastRequest ( TcpSocket *socket ,
//if ( (m->m_perms & user) == 0 ) continue;
// set it. now our TYPE_CHARPTR will just be set to it directly
// to save memory...
setParm ( (char *)THIS , m, 0, v, false,//not html enc
false ); // true );
setParm ( (char *)THIS , m, 0, v);
}
return true;
@ -1962,12 +1993,63 @@ bool Parms::printParm( SafeBuf* sb,
return status;
}
//
// Convert external weights presented in the frontend UI to internal values
//
bool Parms::convertUIToInternal(const char *field_base_name, parameter_type_t type, const char *s, char *adjusted_value) {
for(int fx=0; fx < g_num_fxui_parms; fx++) {
if( strcmp(g_fxui_parms[fx].name, field_base_name) == 0 ) {
switch(type) {
case TYPE_FLOAT: {
float f = s ? (float)atof(s) : 0;
if( f >= 1.0 && g_fxui_parms[fx].div_by > 1.0 ) {
f = f / g_fxui_parms[fx].div_by;
}
snprintf(adjusted_value, 128, "%f", f);
}
return true;
case TYPE_DOUBLE: {
double d = s ? (double)atof ( s ) : 0;
if( d >= 1.0 && g_fxui_parms[fx].div_by > 1.0 ) {
d = d / g_fxui_parms[fx].div_by;
}
snprintf(adjusted_value, 128, "%f", d);
}
return true;
case TYPE_INT32:
case TYPE_INT32_CONST: {
int32_t v = s ? atol(s) : 0;
if( v >= 1 && (int32_t)g_fxui_parms[fx].div_by > 1 ) {
v = v / (int32_t)g_fxui_parms[fx].div_by;
}
snprintf(adjusted_value, 128, "%" PRId32 "", v);
}
return true;
case TYPE_INT64: {
int64_t i64 = s ? strtoull(s,NULL,10) : 0;
if( i64 >= 1 && (int64_t)g_fxui_parms[fx].div_by > 1 ) {
i64 = i64 / (int64_t)g_fxui_parms[fx].div_by;
}
snprintf(adjusted_value, 128, "%" PRId64 "", i64);
}
return true;
default:
break;
}
}
}
return false;
}
// now we use this to set SearchInput and GigablastRequest
bool Parms::setFromRequest ( HttpRequest *r ,
TcpSocket* s,
CollectionRec *newcr ,
char *THIS ,
parameter_object_type_t objType) {
bool Parms::setFromRequest(HttpRequest *r, TcpSocket *s, CollectionRec *newcr, char *THIS, parameter_object_type_t objType) {
// use convertHttpRequestToParmList() for these because they
// are persistent records that are updated on every shard.
@ -1985,31 +2067,47 @@ bool Parms::setFromRequest ( HttpRequest *r ,
for(int32_t i = 0; i < r->getNumFields(); i++) {
// get the value of cgi parm (null terminated)
const char *v = r->getValue(i);
if(!v)
if(!v) {
continue; //no value
}
// get cgi parm name
const char *full_field_name = r->getField(i);
size_t full_field_name_len = strlen(full_field_name);
if(full_field_name_len>=128)
if(full_field_name_len>=128) {
continue;
char field_base_name[128];
int field_index;
size_t nondigit_prefix_len = strcspn(full_field_name,"0123456789");
if(nondigit_prefix_len!=full_field_name_len) {
//field name contains digits. Split into base field name and index
memcpy(field_base_name,full_field_name,nondigit_prefix_len);
field_base_name[nondigit_prefix_len] = '\0';
char *endptr = NULL;
field_index = strtol(full_field_name+nondigit_prefix_len, &endptr, 10);
if(field_index<0)
continue; //hmm?
if(endptr && *endptr)
continue; //digits weren't the last part
} else {
strcpy(field_base_name,full_field_name);
field_index = 0;
}
char field_base_name[128];
bool uiconvert = false;
int field_index=0;
//
// To make user configuration of ranking parameters simpler, we sometimes
// use other valid ranges in parameters than those used internally. Prefix
// the param name with 'fxui_' and add the name and divisor to the global
// table to automatically adjust external values to internal ones.
//
if( strncmp(full_field_name, "fxui_", 5) == 0 ) {
strcpy(field_base_name, full_field_name+5);
uiconvert=true;
}
else {
size_t nondigit_prefix_len = strcspn(full_field_name,"0123456789");
if(nondigit_prefix_len!=full_field_name_len) {
//field name contains digits. Split into base field name and index
memcpy(field_base_name,full_field_name,nondigit_prefix_len);
field_base_name[nondigit_prefix_len] = '\0';
char *endptr = NULL;
field_index = strtol(full_field_name+nondigit_prefix_len, &endptr, 10);
if(field_index<0)
continue; //hmm?
if(endptr && *endptr)
continue; //digits weren't the last part
} else {
strcpy(field_base_name,full_field_name);
}
}
// find in parms list
int32_t j;
Parm *m;
@ -2021,17 +2119,33 @@ bool Parms::setFromRequest ( HttpRequest *r ,
strcmp(field_base_name,m->m_cgi) == 0)
break; //found it
}
if(j >= m_numParms)
if(j >= m_numParms) {
continue; //cgi parm name not found
if(field_index>0 && field_index>m->m_max)
}
if(field_index>0 && field_index>m->m_max) {
continue; //out-of-bounds
}
// . skip if no value was provided
// . unless it was a string! so we can make them empty.
if(v[0] == '\0' &&
m->m_type != TYPE_STRING &&
m->m_type != TYPE_STRINGBOX) continue;
m->m_type != TYPE_STRINGBOX) {
continue;
}
char adjusted_value[128];
if( uiconvert ) {
if( !convertUIToInternal(field_base_name, m->m_type, v, adjusted_value) ) {
log(LOG_ERROR, "Could not convert value of '%s' for '%s'", field_base_name, v);
continue;
}
v = adjusted_value;
}
// set it
setParm(THIS, m, field_index, v, false, false);
setParm(THIS, m, field_index, v);
}
return true;
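A minimal usage sketch of the new fxui_ mechanism (the request fields below are illustrative, not taken from this commit): a ranking form can submit user-friendly values such as

    fxui_synonym_weight=9&fxui_hgw_title=80

setFromRequest() strips the "fxui_" prefix, looks the base name up in g_fxui_parms, and convertUIToInternal() divides the value by the matching div_by, so "synonym_weight" (div_by 10.0) is stored internally as 0.9 and "hgw_title" (div_by 10.0) as 8.0. Values below 1.0 are passed through unchanged because of the f >= 1.0 guard above.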
@ -2078,7 +2192,7 @@ bool Parms::insertParm ( int32_t i , int32_t an , char *THIS ) {
*(int32_t *)(THIS + m->m_arrayCountOffset) = *(int32_t *)(THIS + m->m_arrayCountOffset)+1;
// put the defaults in the inserted line
setParm ( (char *)THIS , m, an , m->m_def , false ,false );
setParm ( (char *)THIS , m, an , m->m_def);
return true;
}
@ -2128,9 +2242,7 @@ bool Parms::removeParm ( int32_t i , int32_t an , char *THIS ) {
void Parms::setParm(char *THIS, Parm *m, int32_t array_index, const char *s, bool isHtmlEncoded, bool fromRequest) {
if ( fromRequest ) { g_process.shutdownAbort(true); }
void Parms::setParm(char *THIS, Parm *m, int32_t array_index, const char *s) {
// . this is just for setting CollectionRecs, so skip if offset < 0
// . some parms are just for SearchInput (search parms)
@ -2170,8 +2282,6 @@ void Parms::setParm(char *THIS, Parm *m, int32_t array_index, const char *s, boo
case TYPE_BOOL:
case TYPE_PRIORITY: {
char *ptr = (char*)THIS + m->m_off + sizeof(char)*array_index;
if ( fromRequest && *(char*)ptr == atol(s))
return;
*(char*)ptr = s ? atol(s) : 0;
break;
}
@ -2191,25 +2301,16 @@ void Parms::setParm(char *THIS, Parm *m, int32_t array_index, const char *s, boo
}
case TYPE_FLOAT: {
char *ptr = (char*)THIS + m->m_off + sizeof(float)*array_index;
if( fromRequest && almostEqualFloat(*(float *)ptr, (s ? (float)atof(s) : 0)) ) {
return;
}
*(float*)ptr = s ? (float)atof ( s ) : 0;
break;
}
case TYPE_DOUBLE: {
char *ptr = (char*)THIS + m->m_off + sizeof(double)*array_index;
if( fromRequest && almostEqualFloat(*(double*)ptr, ( s ? (double)atof(s) : 0)) ) {
return;
}
*(double*)ptr = s ? (double)atof ( s ) : 0;
break;
}
case TYPE_IP: {
char *ptr = (char*)THIS + m->m_off + sizeof(int32_t)*array_index;
if ( fromRequest && *(int32_t*)ptr == (s ? (int32_t)atoip(s,strlen(s)) : 0) )
return;
*(int32_t*)ptr = s ? (int32_t)atoip(s,strlen(s)) : 0;
break;
}
@ -2219,16 +2320,11 @@ void Parms::setParm(char *THIS, Parm *m, int32_t array_index, const char *s, boo
int32_t v = s ? atol(s) : 0;
// min is considered valid if >= 0
if ( m->m_min >= 0 && v < m->m_min ) v = m->m_min;
if ( fromRequest && *(int32_t *)ptr == v )
return;
*(int32_t *)ptr = v;
break;
}
case TYPE_INT64: {
char *ptr = (char*)THIS + m->m_off + sizeof(int64_t)*array_index;
if ( fromRequest && *(uint64_t*)ptr == ( s ? strtoull(s,NULL,10) : 0) ) {
return;
}
*(int64_t*)ptr = s ? strtoull(s,NULL,10) : 0;
break;
}
@ -2240,18 +2336,9 @@ void Parms::setParm(char *THIS, Parm *m, int32_t array_index, const char *s, boo
// SafeBufs "array_index" is the # in the array, starting at 0
char *ptr = (char*)THIS + m->m_off + sizeof(SafeBuf)*array_index;
SafeBuf *sb = (SafeBuf *)ptr;
int32_t oldLen = sb->length();
// why was this commented out??? we need it now that we
// send email alerts when parms change!
if ( fromRequest &&
! isHtmlEncoded && oldLen == len &&
memcmp ( sb->getBufStart() , s , len ) == 0 )
return;
// nuke it
sb->purge();
// this means that we can not use string POINTERS as parms!!
if ( ! isHtmlEncoded ) sb->safeMemcpy ( s , len );
else len = sb->htmlDecode (s,len);
sb->safeMemcpy ( s , len );
// tag it
sb->setLabel ( "parm1" );
// ensure null terminated
@ -2267,22 +2354,11 @@ void Parms::setParm(char *THIS, Parm *m, int32_t array_index, const char *s, boo
int32_t len = strlen(s);
if ( len >= m->m_size ) len = m->m_size - 1; // truncate!!
char *dst = THIS + m->m_off + m->m_size*array_index;
// why was this commented out??? we need it now that we
// send email alerts when parms change!
if ( fromRequest &&
! isHtmlEncoded && (int32_t)strlen(dst) == len &&
memcmp ( dst , s , len ) == 0 ) {
return;
}
// this means that we can not use string POINTERS as parms!!
if ( !isHtmlEncoded ) {
gbmemcpy( dst, s, len );
} else {
len = htmlDecode( dst, s, len, false );
}
gbmemcpy( dst, s, len );
dst[len] = '\0';
// . might have to set length
// . used for CollectionRec::m_htmlHeadLen and m_htmlTailLen
if ( m->m_plen >= 0 )
@ -2295,14 +2371,9 @@ void Parms::setParm(char *THIS, Parm *m, int32_t array_index, const char *s, boo
log(LOG_LOGIC,"admin: attempt to set parameter %s from cgi-request", m->m_title);
return;
}
// do not send if setting from startup
if ( ! fromRequest ) return;
// note it in the log
log("admin: parm \"%s\" changed value",m->m_title);
}
void Parms::setToDefault(char *THIS, parameter_object_type_t objType, CollectionRec *argcr) {
// init if we should
init();
@ -2344,7 +2415,7 @@ void Parms::setToDefault(char *THIS, parameter_object_type_t objType, Collection
char *dst = THIS + m->m_off;
memcpy(dst, raw_default, m->m_size);
} else
setParm(THIS , m, 0, m->m_def, false/*not enc.*/, false );
setParm(THIS , m, 0, m->m_def);
} else if(m->m_fixed<=0) {
//variable-sized array
//empty it
@ -2357,7 +2428,7 @@ void Parms::setToDefault(char *THIS, parameter_object_type_t objType, Collection
memcpy(dst, raw_default, m->m_size);
raw_default = ((char*)raw_default) + m->m_size;
} else
setParm(THIS, m, k, m->m_def, false/*not enc.*/, false);
setParm(THIS, m, k, m->m_def);
}
}
}
@ -2485,7 +2556,7 @@ bool Parms::setFromFile ( void *THIS ,
v[nb] = '\0';
// set our parm
setParm( (char *)THIS, m, j, v, false, false );
setParm( (char *)THIS, m, j, v);
// we were set from the explicit file
//((CollectionRec *)THIS)->m_orig[i] = 2;
@ -2569,7 +2640,7 @@ bool Parms::setFromFile ( void *THIS ,
v[nb] = '\0';
// set our parm
setParm( (char *)THIS, m, j, v, false /*is html encoded?*/, false );
setParm( (char *)THIS, m, j, v);
// do not repeat same node
nn++;
@ -3519,27 +3590,48 @@ void Parms::init ( ) {
m++;
m->m_title = "diversityWeightMin";
m->m_desc = "diversityWeightMin";
m->m_cgi = "diversity_weight_min";
simple_m_set(SearchInput,m_diversityWeightMin);
m->m_defOff2 = offsetof(Conf,m_diversityWeightMin);
m->m_title = "termfreq min";
m->m_desc = "Term frequency estimate minimum";
m->m_cgi = "termfreqweightfreqmin";
simple_m_set(Conf,m_termFreqWeightFreqMin);
simple_m_set(SearchInput,m_termFreqWeightFreqMin);
m->m_defOff2 = offsetof(Conf,m_termFreqWeightFreqMin);
m->m_def = "0.000000";
m->m_page = PAGE_RESULTS;
m++;
m->m_title = "termfreq max";
m->m_desc = "Term frequency estimate maximum";
m->m_cgi = "termfreqweightfreqmax";
simple_m_set(SearchInput,m_termFreqWeightFreqMax);
m->m_defOff2 = offsetof(Conf,m_termFreqWeightFreqMax);
m->m_def = "0.500000";
m->m_page = PAGE_RESULTS;
m++;
m->m_title = "termfreq weight min";
m->m_desc = "Term frequency weight minimum";
m->m_cgi = "termfreqweightmin";
simple_m_set(SearchInput,m_termFreqWeightMin);
m->m_defOff2 = offsetof(Conf,m_termFreqWeightMin);
m->m_def = "0.500000";
m->m_page = PAGE_RESULTS;
m++;
m->m_title = "termfreq weight max";
m->m_desc = "Term frequency weight maximum";
m->m_cgi = "termfreqweightmax";
simple_m_set(SearchInput,m_termFreqWeightMax);
m->m_defOff2 = offsetof(Conf,m_termFreqWeightMax);
m->m_def = "1.000000";
m->m_page = PAGE_RESULTS;
m++;
m->m_title = "diversityWeightMax";
m->m_desc = "diversityWeightMax";
m->m_cgi = "diversity_weight_max";
simple_m_set(SearchInput,m_diversityWeightMax);
m->m_defOff2 = offsetof(Conf,m_diversityWeightMax);
m->m_def = "1.000000";
m->m_page = PAGE_RESULTS;
m++;
m->m_title = "densityWeightMin";
m->m_desc = "densityWeightMin";
m->m_cgi = "density_weight_min";
m->m_cgi = "densityweightmin";
simple_m_set(SearchInput,m_densityWeightMin);
m->m_defOff2 = offsetof(Conf,m_densityWeightMin);
m->m_def = "0.350000";
@ -3548,16 +3640,34 @@ void Parms::init ( ) {
m->m_title = "densityWeightMax";
m->m_desc = "densityWeightMax";
m->m_cgi = "density_weight_max";
m->m_cgi = "densityweightmax";
simple_m_set(SearchInput,m_densityWeightMax);
m->m_defOff2 = offsetof(Conf,m_densityWeightMax);
m->m_def = "1.000000";
m->m_page = PAGE_RESULTS;
m++;
m->m_title = "diversityWeightMin";
m->m_desc = "diversityWeightMin";
m->m_cgi = "diversityweightmin";
simple_m_set(SearchInput,m_diversityWeightMin);
m->m_defOff2 = offsetof(Conf,m_diversityWeightMin);
m->m_def = "1.000000";
m->m_page = PAGE_RESULTS;
m++;
m->m_title = "diversityWeightMax";
m->m_desc = "diversityWeightMax";
m->m_cgi = "diversityweightmax";
simple_m_set(SearchInput,m_diversityWeightMax);
m->m_defOff2 = offsetof(Conf,m_diversityWeightMax);
m->m_def = "1.000000";
m->m_page = PAGE_RESULTS;
m++;
m->m_title = "hashGroupWeightBody";
m->m_desc = "hashGroupWeightBody";
m->m_cgi = "hash_group_weight_body";
m->m_cgi = "hgw_body";
simple_m_set(SearchInput,m_hashGroupWeightBody);
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightBody);
m->m_def = "1.000000";
@ -3566,7 +3676,7 @@ void Parms::init ( ) {
m->m_title = "hashGroupWeightTitle";
m->m_desc = "hashGroupWeightTitle";
m->m_cgi = "hashGroupWeightTitle";
m->m_cgi = "hgw_title";
simple_m_set(SearchInput,m_hashGroupWeightTitle);
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightTitle);
m->m_def = "8.000000";
@ -3575,7 +3685,7 @@ void Parms::init ( ) {
m->m_title = "hashGroupWeightHeading";
m->m_desc = "hashGroupWeightHeading";
m->m_cgi = "hash_group_weight_heading";
m->m_cgi = "hgw_heading";
simple_m_set(SearchInput,m_hashGroupWeightHeading);
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightHeading);
m->m_def = "1.500000";
@ -3584,7 +3694,7 @@ void Parms::init ( ) {
m->m_title = "hashGroupWeightInlist";
m->m_desc = "hashGroupWeightInlist";
m->m_cgi = "hash_group_weight_inlist";
m->m_cgi = "hgw_list";
simple_m_set(SearchInput,m_hashGroupWeightInlist);
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightInlist);
m->m_def = "0.300000";
@ -3593,7 +3703,7 @@ void Parms::init ( ) {
m->m_title = "hashGroupWeightInMetaTag";
m->m_desc = "hashGroupWeightInMetaTag";
m->m_cgi = "hash_group_weight_in_meta_tag";
m->m_cgi = "hgw_metatag";
simple_m_set(SearchInput,m_hashGroupWeightInMetaTag);
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightInMetaTag);
m->m_def = "0.100000";
@ -3602,7 +3712,7 @@ void Parms::init ( ) {
m->m_title = "hashGroupWeightInLinkText";
m->m_desc = "hashGroupWeightInLinkText";
m->m_cgi = "hash_group_weight_in_link_text";
m->m_cgi = "hgw_inlinktext";
simple_m_set(SearchInput,m_hashGroupWeightInLinkText);
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightInLinkText);
m->m_def = "16.000000";
@ -3611,7 +3721,7 @@ void Parms::init ( ) {
m->m_title = "hashGroupWeightInTag";
m->m_desc = "hashGroupWeightInTag";
m->m_cgi = "hash_group_weight_in_tag";
m->m_cgi = "hgw_intag";
simple_m_set(SearchInput,m_hashGroupWeightInTag);
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightInTag);
m->m_def = "1.000000";
@ -3620,7 +3730,7 @@ void Parms::init ( ) {
m->m_title = "hashGroupWeightNeighborhood";
m->m_desc = "hashGroupWeightNeighborhood";
m->m_cgi = "hash_group_weight_neighborhood";
m->m_cgi = "hgw_neighborhood";
simple_m_set(SearchInput,m_hashGroupWeightNeighborhood);
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightNeighborhood);
m->m_def = "0.000000";
@ -3629,7 +3739,7 @@ void Parms::init ( ) {
m->m_title = "hashGroupWeightInternalLinkText";
m->m_desc = "hashGroupWeightInternalLinkText";
m->m_cgi = "hash_group_weight_internal_link_text";
m->m_cgi = "hgw_inintlinktext";
simple_m_set(SearchInput,m_hashGroupWeightInternalLinkText);
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightInternalLinkText);
m->m_def = "4.000000";
@ -3638,7 +3748,7 @@ void Parms::init ( ) {
m->m_title = "hashGroupWeightInUrl";
m->m_desc = "hashGroupWeightInUrl";
m->m_cgi = "hash_group_weight_in_url";
m->m_cgi = "hgw_inurl";
simple_m_set(SearchInput,m_hashGroupWeightInUrl);
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightInUrl);
m->m_def = "1.000000";
@ -3647,7 +3757,7 @@ void Parms::init ( ) {
m->m_title = "hashGroupWeightInMenu";
m->m_desc = "hashGroupWeightInMenu";
m->m_cgi = "hash_group_weight_in_menu";
m->m_cgi = "hgw_inmenu";
simple_m_set(SearchInput,m_hashGroupWeightInMenu);
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightInMenu);
m->m_def = "0.200000";
@ -3665,6 +3775,26 @@ void Parms::init ( ) {
m->m_page = PAGE_RESULTS;
m++;
m->m_title = "Page temp weight min";
m->m_desc = "Page temp is scaled to be between the min and max";
m->m_cgi = "pagetempweightmin";
simple_m_set(SearchInput,m_pageTemperatureWeightMin);
m->m_defOff2 = offsetof(Conf,m_pageTemperatureWeightMin);
m->m_def = "1.000000";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_RESULTS;
m++;
m->m_title = "Page temp weight max";
m->m_desc = "Page temp is scaled to be between the min and max";
m->m_cgi = "pagetempweightmax";
simple_m_set(SearchInput,m_pageTemperatureWeightMax);
m->m_defOff2 = offsetof(Conf,m_pageTemperatureWeightMax);
m->m_def = "20.000000";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_RESULTS;
m++;
m->m_title = "Use page temperature";
m->m_desc = "Use page temperature (if available) for ranking";
m->m_cgi = "use_page_temperature";
@ -3741,6 +3871,18 @@ void Parms::init ( ) {
m->m_page = PAGE_RESULTS;
m++;
m->m_title = "unknown language weight";
m->m_desc = "Use this to override the default uknown language weight "
"for this collection. We multiply a result's score by this value "
"if the user requested a specific language, but the language of the "
"indexed page could not be determined.";
simple_m_set(SearchInput,m_unknownLangWeight);
m->m_defOff= offsetof(CollectionRec,m_unknownLangWeight);
m->m_cgi = "ulangw";
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m++;
m->m_title = "max query terms";
m->m_desc = "Do not allow more than this many query terms. Helps "
"prevent big queries from resource hogging.";
@ -3809,11 +3951,10 @@ void Parms::init ( ) {
m->m_title = "language weight";
m->m_desc = "Default language weight if document matches query "
"language. Use this to give results that match the specified "
"the specified &qlang higher ranking, or docs whose language "
"is unknown. Can be overridden with "
"language. Use this to give results that match "
"the specified &qlang higher ranking. Can be overridden with "
"&langw in the query url.";
m->m_cgi = "langweight";
m->m_cgi = "langw";
simple_m_set(CollectionRec,m_sameLangWeight);
m->m_def = "20.000000";
m->m_group = true;
@ -3821,6 +3962,21 @@ void Parms::init ( ) {
m->m_page = PAGE_RANKING;
m++;
m->m_title = "unknown language weight";
m->m_desc = "Default language weight if query language is specified but document "
"language could not be determined. Use this to give docs with unknown language a "
"higher ranking when qlang is specified. Can be overridden with "
"&ulangw in the query url.";
m->m_cgi = "ulangw";
simple_m_set(CollectionRec,m_unknownLangWeight);
m->m_def = "10.000000";
m->m_group = true;
m->m_flags = PF_REBUILDRANKINGSETTINGS;
m->m_page = PAGE_RANKING;
m++;
m->m_title = "termfreq min";
m->m_desc = "Term frequency estimate minimum";
m->m_cgi = "termfreqweightfreqmin";
@ -3901,6 +4057,8 @@ void Parms::init ( ) {
m->m_page = PAGE_RANKING;
m++;
m->m_title = "Hashgroup weight - body";
m->m_desc = "";
m->m_cgi = "hgw_body";
@ -3953,7 +4111,7 @@ void Parms::init ( ) {
m->m_title = "Hashgroup weight - in link text";
m->m_desc = "";
m->m_cgi = "hgw_innlinktext";
m->m_cgi = "hgw_inlinktext";
simple_m_set(Conf,m_hashGroupWeightInLinkText);
m->m_def = "16.000000";
m->m_group = false;
@ -4021,6 +4179,26 @@ void Parms::init ( ) {
m->m_page = PAGE_RANKING;
m++;
m->m_title = "Page temp weight min";
m->m_desc = "Page temp is scaled to be between the min and max";
m->m_cgi = "pagetempweightmin";
simple_m_set(Conf,m_pageTemperatureWeightMin);
m->m_def = "1.000000";
m->m_group = false;
m->m_flags = PF_REBUILDRANKINGSETTINGS;
m->m_page = PAGE_RANKING;
m++;
m->m_title = "Page temp weight max";
m->m_desc = "Page temp is scaled to be between the min and max";
m->m_cgi = "pagetempweightmax";
simple_m_set(Conf,m_pageTemperatureWeightMax);
m->m_def = "20.000000";
m->m_group = false;
m->m_flags = PF_REBUILDRANKINGSETTINGS;
m->m_page = PAGE_RANKING;
m++;
m->m_title = "Use page temperature";
m->m_desc = "Use page temperature (if available) for ranking";
m->m_cgi = "use_page_temperature";
@ -4047,6 +4225,7 @@ void Parms::init ( ) {
m->m_page = PAGE_RANKING;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "Rank adjustment";
m->m_cgi = "flag_rerank";
m->m_xml = "RankAdjustment";

@ -198,7 +198,7 @@ class Parms {
bool insertParm ( int32_t i , int32_t an , char *THIS ) ;
bool removeParm ( int32_t i , int32_t an , char *THIS ) ;
void setParm(char *THIS, Parm *m, int32_t array_index, const char *s, bool isHtmlEncoded, bool fromRequest);
void setParm(char *THIS, Parm *m, int32_t array_index, const char *s);
void setToDefault(char *THIS, parameter_object_type_t objType,
CollectionRec *argcr );
@ -224,6 +224,7 @@ class Parms {
Parm *getParm(int32_t i) { return m_parms+i; }
int32_t getNumParms() const { return m_numParms; }
bool convertUIToInternal(const char *field_base_name, parameter_type_t type, const char *s, char *adjusted_value);
private:
//

@ -202,22 +202,17 @@ void PingServer::pingHost ( Host *h , uint32_t ip , uint16_t port ) {
//first we update our pinginfo
PingInfo newPingInfo;
newPingInfo.m_numCorruptDiskReads = g_numCorrupt;
newPingInfo.m_numOutOfMems = g_mem.getOOMCount();
newPingInfo.m_socketsClosedFromHittingLimit = g_stats.m_closedSockets;
newPingInfo.m_currentSpiders = g_spiderLoop.getNumSpidersOut();
newPingInfo.m_unused9 = 0;
newPingInfo.m_unused3 = 0;
newPingInfo.m_unused11 = 0;
newPingInfo.m_unused14 = 0;
// let the receiver know our repair mode
newPingInfo.m_repairMode = g_repairMode;
int32_t l_loadavg = (int32_t) (g_process.getLoadAvg() * 100.0);
//gbmemcpy(p, &l_loadavg, sizeof(int32_t)); p += sizeof(int32_t);
newPingInfo.m_loadAvg = l_loadavg ;
newPingInfo.m_unused2 = 0;
// then our percent mem used
float mem = g_mem.getUsedMemPercentage();
//*(float *)p = mem ; p += sizeof(float); // 4 bytes
newPingInfo.m_percentMemUsed = mem;
newPingInfo.m_unused3 = 0;
// our num recs, docsIndexed
newPingInfo.m_totalDocsIndexed = (int32_t)g_process.getTotalDocsIndexed();
@ -229,7 +224,7 @@ void PingServer::pingHost ( Host *h , uint32_t ip , uint16_t port ) {
if ( g_hostdb.getCRC() == 0 ) { g_process.shutdownAbort(true); }
// disk usage (df -ka)
newPingInfo.m_diskUsage = g_process.m_diskUsage;
newPingInfo.m_unused7 = 0.0;
// flags indicating our state
int32_t flags = 0;
@ -247,9 +242,7 @@ void PingServer::pingHost ( Host *h , uint32_t ip , uint16_t port ) {
if ( g_dailyMerge.m_mergeMode ==0 || g_dailyMerge.m_mergeMode == 6 )
flags |= PFLAG_MERGEMODE0OR6;
uint8_t rv8 = (uint8_t)g_recoveryLevel;
if ( g_recoveryLevel > 255 ) rv8 = 255;
newPingInfo.m_recoveryLevel = rv8;
newPingInfo.m_unused18 = 0;
//*(int32_t *)p = flags; p += 4; // 4 bytes
newPingInfo.m_flags = flags;
@ -263,12 +256,11 @@ void PingServer::pingHost ( Host *h , uint32_t ip , uint16_t port ) {
newPingInfo.m_unused0 = 0;
newPingInfo.m_udpSlotsInUseIncoming = g_udpServer.getNumUsedSlotsIncoming();
newPingInfo.m_unused12 = 0;
newPingInfo.m_tcpSocketsInUse = g_httpServer.m_tcp.m_numUsed;
newPingInfo.m_unused13 = 0;
// from Loop.cpp
newPingInfo.m_cpuUsage = 0.0;
newPingInfo.m_unused4 = 0.0;
// store the gbVersionStrBuf now, just a date with a \0 included
char *v = getVersion();
@ -369,18 +361,6 @@ void PingServer::gotReplyWrapperP(void *state, UdpSlot *slot) {
// he is back up then we are free to send another alert about
// any other host that goes down
if ( h->m_hostId == s_lastSentHostId ) s_lastSentHostId = -1;
if ( h->m_pingInfo.m_percentMemUsed >= 99.0 &&
h->m_firstOOMTime == 0 )
h->m_firstOOMTime = nowms;
if ( h->m_pingInfo.m_percentMemUsed < 99.0 )
h->m_firstOOMTime = 0LL;
// if this host is alive and has been at 99% or more mem usage
// for the last X minutes, and we have got at least 10 ping replies
// from him, then send an email alert.
if ( h->m_pingInfo.m_percentMemUsed >= 99.0 &&
nowms - h->m_firstOOMTime >= g_conf.m_sendEmailTimeout )
g_pingServer.sendEmail ( h , NULL , true );
} else {
// . if his ping was dead, try to send an email alert to the admin
// . returns false if blocked, true otherwise

@ -246,19 +246,16 @@ float PosdbTable::getBestScoreSumForSingleTerm(int32_t i, const char *wpi, const
unsigned char div = Posdb::getDiversityRank ( wpi );
score *= m_msg39req->m_scoringWeights.m_diversityWeights[div];
score *= m_msg39req->m_scoringWeights.m_diversityWeights[div];
// hash group? title? body? heading? etc.
unsigned char hg = Posdb::getHashGroup ( wpi );
unsigned char mhg = hg;
if ( s_inBody[mhg] ) mhg = HASHGROUP_BODY;
score *= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg];
score *= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg];
// good density?
unsigned char dens = Posdb::getDensityRank ( wpi );
score *= m_msg39req->m_scoringWeights.m_densityWeights[dens];
score *= m_msg39req->m_scoringWeights.m_densityWeights[dens];
// to make more compatible with pair scores divide by distance of 2
//score /= 2.0;
@ -452,6 +449,7 @@ float PosdbTable::getBestScoreSumForSingleTerm(int32_t i, const char *wpi, const
sx->m_densityRank = Posdb::getDensityRank(maxp);
float score = bestScores[k];
//score *= ts;
score *= m_freqWeights[i];
score *= m_freqWeights[i];
@ -3937,6 +3935,7 @@ void PosdbTable::intersectLists10_r ( ) {
}
}
if( currPassNum == INTERSECT_SCORING ) {
//
// Pre-advance each termlist's cursor to skip to next docid.
@ -4094,7 +4093,6 @@ void PosdbTable::intersectLists10_r ( ) {
minSingleScore *= completeScoreMultiplier;
//#
//# DOCID / SITERANK DETECTION
//#
@ -4128,7 +4126,6 @@ void PosdbTable::intersectLists10_r ( ) {
minPairScore *= completeScoreMultiplier;
//#
//# Find minimum score - either single term or term pair
//#
@ -4155,7 +4152,6 @@ void PosdbTable::intersectLists10_r ( ) {
}
} // !m_q->m_isBoolean
//#
//# Calculate score and give boost based on siterank and highest inlinking siterank
//#
@ -4169,14 +4165,23 @@ void PosdbTable::intersectLists10_r ( ) {
score = minScore * (adjustedSiteRank*m_siteRankMultiplier+1.0);
logTrace(g_conf.m_logTracePosdb, "Score %f for docId %" PRIu64 "", score, m_docId);
//#
//# Give score boost if query and doc language is the same.
//# Give score boost if query and doc language is the same,
//# and optionally a different boost if the language of the
//# page is unknown.
//#
//# Use "qlang" parm to set the language. i.e. "&qlang=fr"
//#
if ( m_msg39req->m_language == 0 || docLang == 0 || m_msg39req->m_language == docLang) {
score *= (m_msg39req->m_sameLangWeight); //SAMELANGMULT;
logTrace(g_conf.m_logTracePosdb, "Giving score a matching language boost of x%f: %f for docId %" PRIu64 "", m_msg39req->m_sameLangWeight, score, m_docId);
if ( m_msg39req->m_language != 0 ) {
if( m_msg39req->m_language == docLang) {
score *= (m_msg39req->m_sameLangWeight);
logTrace(g_conf.m_logTracePosdb, "Giving score a matching language boost of x%f: %f for docId %" PRIu64 "", m_msg39req->m_sameLangWeight, score, m_docId);
}
else
if( docLang == 0 ) {
score *= (m_msg39req->m_unknownLangWeight);
logTrace(g_conf.m_logTracePosdb, "Giving score an unknown language boost of x%f: %f for docId %" PRIu64 "", m_msg39req->m_unknownLangWeight, score, m_docId);
}
}
double page_temperature = 0;
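For example, using the defaults set in Msg39Request::reset() above: with &qlang=en a result detected as English has its score multiplied by 20.0, a result whose language could not be determined is multiplied by 10.0, and a result detected as any other language gets no language boost; when no qlang is given, no language boost is applied at all.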
@ -4185,13 +4190,12 @@ void PosdbTable::intersectLists10_r ( ) {
if(m_msg39req->m_usePageTemperatureForRanking) {
use_page_temperature = true;
page_temperature = g_pageTemperatureRegistry.query_page_temperature(m_docId);
page_temperature = g_pageTemperatureRegistry.query_page_temperature(m_docId, m_msg39req->m_pageTemperatureWeightMin, m_msg39req->m_pageTemperatureWeightMax);
score *= page_temperature;
logTrace(g_conf.m_logTracePosdb, "Page temperature for docId %" PRIu64 " is %.4f, score %f->%f", m_docId, page_temperature, score_before_page_temp, score);
logTrace(g_conf.m_logTracePosdb, "Page temperature for docId %" PRIu64 " is %.14f, score %f -> %f", m_docId, page_temperature, score_before_page_temp, score);
}
//#
//# Handle sortby int/float and minimum docid/score pairs
//#
@ -4524,11 +4528,15 @@ float PosdbTable::getMaxPossibleScore ( const QueryTermInfo *qti,
//score *= perfectWordSpamWeight * perfectWordSpamWeight;
score *= (((float)siteRank)*m_siteRankMultiplier+1.0);
// language boost if same language (or no lang specified)
if ( m_msg39req->m_language == docLang ||
m_msg39req->m_language == 0 ||
docLang == 0 ) {
score *= m_msg39req->m_sameLangWeight;//SAMELANGMULT;
// language boost if language specified and if page is same language, or unknown language
if ( m_msg39req->m_language != 0 ) {
if( m_msg39req->m_language == docLang) {
score *= (m_msg39req->m_sameLangWeight);
}
else
if( docLang == 0 ) {
score *= (m_msg39req->m_unknownLangWeight);
}
}
// assume the other term we pair with will be 1.0

@ -280,7 +280,7 @@ bool Proxy::handleRequest (TcpSocket *s){
s_count = 0;
s_last = now;
}
g_stats.m_closedSockets++;;
Statistics::register_socket_limit_hit();
return g_httpServer.sendErrorReply ( s , 500 ,
"Too many sockets open.");
}

@ -17,17 +17,24 @@ void ScoringWeights::init(float diversityWeightMin, float diversityWeightMax,
float hashGroupWeightInMenu)
{
for(int i = 0; i <= MAXDIVERSITYRANK; i++)
m_diversityWeights[i] = scale_quadratic(i,0,MAXDIVERSITYRANK,diversityWeightMin,diversityWeightMax);
m_diversityWeights[i] = scale_quadratic(i, 0, MAXDIVERSITYRANK, diversityWeightMin, diversityWeightMax);
for(int i = 0; i <= MAXDENSITYRANK; i++)
m_densityWeights[i] = scale_quadratic(i,0,MAXDENSITYRANK,densityWeightMin,densityWeightMax);
m_densityWeights[i] = scale_quadratic(i, 0, MAXDENSITYRANK, densityWeightMin, densityWeightMax);
// make sure if word spam is 0 that the weight is not 0
for(int i = 0; i <= MAXWORDSPAMRANK; i++)
m_wordSpamWeights[i] = scale_linear(i, 0,MAXWORDSPAMRANK, 1.0/MAXWORDSPAMRANK, 1.0);
for(int i = 0; i <= MAXWORDSPAMRANK; i++)
m_wordSpamWeights[i] = scale_linear(i, 0, MAXWORDSPAMRANK, 1.0/MAXWORDSPAMRANK, 1.0);
// site rank of inlinker
// to be on the same level as multiplying the final score by
// siterank+1, this should be a sqrt() type weight, since it gets
// squared so that single-term scores are on the same level as
// term-pair scores
// @@@ BR: Right way to do it? Gives a weight between 1 and 4
for(int i = 0; i <= MAXWORDSPAMRANK; i++) {
m_linkerWeights[i] = sqrt(1.0 + i);
}
for(int i=0; i<HASHGROUP_END; i++)
m_hashGroupWeights[i] = 1.0;
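The init() above assumes scale_linear() and scale_quadratic() map a rank i in [min_in, max_in] onto [min_out, max_out]; a sketch of that assumed behavior (not the project's actual helpers):

// Assumed semantics of the scaling helpers used in ScoringWeights::init().
static float scale_linear(int i, int min_in, int max_in, float min_out, float max_out) {
	float t = (float)(i - min_in) / (float)(max_in - min_in);
	return min_out + t * (max_out - min_out);
}
static float scale_quadratic(int i, int min_in, int max_in, float min_out, float max_out) {
	float t = (float)(i - min_in) / (float)(max_in - min_in);
	return min_out + t * t * (max_out - min_out);
}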

@ -48,6 +48,7 @@ SearchInput::SearchInput() {
m_maxSerpScore = 0.0;
m_minSerpDocId = 0;
m_sameLangWeight = 0.0;
m_unknownLangWeight = 0.0;
m_defaultSortLang = NULL;
m_dedupURL = 0;
m_percentSimilarSummary = 0;
@ -64,7 +65,15 @@ SearchInput::SearchInput() {
m_askOtherShards = false;
memset(m_queryId, 0, sizeof(m_queryId));
m_doMaxScoreAlgo = false;
m_termFreqWeightFreqMin = 0.0;
m_termFreqWeightFreqMax = 0.5;
m_termFreqWeightMin = 0.5;
m_termFreqWeightMax = 1.0;
m_synonymWeight = 0.9;
m_pageTemperatureWeightMin = 1.0;
m_pageTemperatureWeightMax = 20.0;
m_usePageTemperatureForRanking = true;
m_numFlagScoreMultipliers=26;
for(int i=0; i<26; i++)

@ -130,6 +130,7 @@ public:
int64_t m_minSerpDocId;
float m_sameLangWeight;
float m_unknownLangWeight;
// prefer what lang in the results. it gets a 20x boost. "en" "xx" "fr"
char *m_defaultSortLang;
@ -147,6 +148,11 @@ public:
bool m_doDupContentRemoval; // msg40
bool m_getDocIdScoringInfo;
float m_termFreqWeightFreqMin;
float m_termFreqWeightFreqMax;
float m_termFreqWeightMin;
float m_termFreqWeightMax;
float m_diversityWeightMin;
float m_diversityWeightMax;
float m_densityWeightMin;
@ -162,8 +168,9 @@ public:
float m_hashGroupWeightInternalLinkText;
float m_hashGroupWeightInUrl;
float m_hashGroupWeightInMenu;
float m_synonymWeight;
float m_pageTemperatureWeightMin;
float m_pageTemperatureWeightMax;
bool m_usePageTemperatureForRanking;
int32_t m_numFlagScoreMultipliers;

@ -4,6 +4,11 @@
#include "gb-include.h"
#include "types.h"
#include "Msg3.h" //getDiskPageCache()
#include "Mem.h" //memory statistics
#include "UdpServer.h" //g_udpServer.getNumUsedSlotsIncoming()
#include "HttpServer.h" //g_httpServer.m_tcp.m_numUsed
#include "Msg5.h" //g_numCorrupt
#include "SpiderLoop.h"
#include "RdbCache.h"
#include "Rdb.h"
#include "GbMutex.h"
@ -364,6 +369,28 @@ static void dump_rdb_cache_statistics( FILE *fp ) {
}
}
//////////////////////////////////////////////////////////////////////////////
// Assorted statistics
static std::atomic<unsigned long> socket_limit_hit_count(0);
void Statistics::register_socket_limit_hit() {
socket_limit_hit_count++;
}
//Fetch various counters and levels. Some of them were previously exchanged in PingInfo
static void dump_assorted_statistics(FILE *fp) {
fprintf(fp,"mem:pctused:%f\n",g_mem.getUsedMemPercentage());
fprintf(fp,"mem:oom_count:%d\n",g_mem.getOOMCount());
fprintf(fp,"socket:limit_hit:%lu\n",socket_limit_hit_count.load());
fprintf(fp,"socket:slots_incoming:%d\n",g_udpServer.getNumUsedSlotsIncoming());
fprintf(fp,"socket:tcp_in_use:%d\n",g_httpServer.m_tcp.m_numUsed);
fprintf(fp,"misc::corrupt_list_reads:%d\n",g_numCorrupt);
fprintf(fp,"spider:current_spiders:%d\n",g_spiderLoop.getNumSpidersOut());
}
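Given the format strings above, each dump appends one key:value line per counter to the statistics file, e.g. (values illustrative):

mem:pctused:37.210000
mem:oom_count:0
socket:limit_hit:2
socket:slots_incoming:14
socket:tcp_in_use:5
misc::corrupt_list_reads:0
spider:current_spiders:3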
//////////////////////////////////////////////////////////////////////////////
// statistics
@ -381,6 +408,7 @@ static void dump_statistics(time_t now) {
dump_spider_statistics( fp );
dump_io_statistics( fp );
dump_rdb_cache_statistics( fp );
dump_assorted_statistics(fp);
if ( fflush(fp) != 0 ) {
log( LOG_ERROR, "fflush(%s) failed with errno=%d (%s)", tmp_filename, errno, strerror( errno ) );

@ -12,6 +12,8 @@ void register_spider_time( bool is_new, int error_code, int http_status, unsigne
void register_io_time( bool is_write, int error_code, unsigned long bytes, unsigned ms );
void register_socket_limit_hit();
} //namespace
#endif

@ -14,8 +14,6 @@ Stats::Stats ( ) {
m_next = 0;
memset ( m_pts , 0 , sizeof(StatPoint)*MAX_POINTS );
m_closedSockets = 0;
memset(m_msg3aRecalls, 0, sizeof(m_msg3aRecalls));
clearMsgStats();

@ -60,9 +60,6 @@ class Stats {
int64_t m_startTime;
// when we have to close a socket because too many are open.. count it
int32_t m_closedSockets;
time_t m_uptimeStart;
// one count for each CR_* defined in Msg51.h

@ -2,6 +2,7 @@
#include "TcpServer.h"
#include "Stats.h"
#include "Statistics.h"
#include "Profiler.h"
#include "PingServer.h"
#include "HttpServer.h" //g_httpServer.m_ssltcp.m_ctx
@ -771,7 +772,7 @@ TcpSocket *TcpServer::getNewSocket ( ) {
s_last = now;
}
// another stat
g_stats.m_closedSockets++;
Statistics::register_socket_limit_hit();
g_errno = EOUTOFSOCKETS;
// send email alert
g_pingServer.sendEmailMsg ( &s_lastTime ,
@ -888,7 +889,7 @@ TcpSocket *TcpServer::wrapSocket ( int sd , int32_t niceness , bool isIncoming )
s_last = now;
}
// another stat
g_stats.m_closedSockets++;
Statistics::register_socket_limit_hit();
g_errno = EOUTOFSOCKETS;
// send email alert
@ -904,7 +905,7 @@ TcpSocket *TcpServer::wrapSocket ( int sd , int32_t niceness , bool isIncoming )
if ( sd < 0 || sd >= MAX_TCP_SOCKS ) {
log(LOG_LOGIC,"tcp: Got bad sd of %" PRId32".",(int32_t)sd);
// another stat
g_stats.m_closedSockets++;
Statistics::register_socket_limit_hit();
g_errno = EOUTOFSOCKETS;
// send email alert
g_pingServer.sendEmailMsg ( &s_lastTime , "out of sockets on https2");
@ -919,7 +920,7 @@ TcpSocket *TcpServer::wrapSocket ( int sd , int32_t niceness , bool isIncoming )
// . this has happened a few times lately...
if ( s->m_startTime != 0 ) {
log(LOG_LOGIC,"tcp: sd of %" PRId32" is already in use.",(int32_t)sd);
g_stats.m_closedSockets++;
Statistics::register_socket_limit_hit();
g_errno = EOUTOFSOCKETS;
if ( sd == 0 ) log("tcp: closing2 sd of 0");
if ( ::close(sd) == -1 )

@ -12922,6 +12922,9 @@ char *XmlDoc::getMetaList(bool forDelete) {
// we're adding titlerec to keep links between redirection intact
addTitleRec = true;
// since we're adding titlerec, add posrec as well
addPosRec = true;
// if we are adding a simplified redirect as a link to spiderdb
// likewise if the error was ENONCANONICAL treat it like that
spideringLinks = true;
@ -16062,7 +16065,7 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
m_reply.m_ip = m_ip;
m_reply.m_firstIp = *fip;
m_reply.m_docId = m_docId;
m_reply.m_contentLen = size_utf8Content;
m_reply.m_contentLen = size_utf8Content - 1;
m_reply.m_lastSpidered = getSpideredTime();//m_spideredTime;
m_reply.m_datedbDate = 0;
m_reply.m_firstIndexedDate = m_firstIndexedDate;

@ -483,7 +483,7 @@ public:
SafeBuf *getTimeAxisUrl ( );
bool hashUrl ( class HashTableX *table, bool urlOnly );
bool hashDateNumbers ( class HashTableX *tt );
bool hashIncomingLinkText( class HashTableX *table, bool hashAnomalies, bool hashNonAnomalies );
bool hashIncomingLinkText(HashTableX *table);
bool hashLinksForLinkdb ( class HashTableX *table ) ;
bool hashNeighborhoods ( class HashTableX *table ) ;
bool hashTitle ( class HashTableX *table );

@ -154,13 +154,6 @@ static bool storeTerm ( const char *s ,
// we know the termlist is small, or the termlist is being used for spidering
// or parsing purposes and is usually not sent across the network.
bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
// this should be ready to go and not block!
int64_t *pch64 = getExactContentHash64();
if ( ! pch64 || pch64 == (void *)-1 ) { g_process.shutdownAbort(true); }
// shortcut
Url *fu = getFirstUrl();
// constructor should set to defaults automatically
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
@ -168,19 +161,26 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
// usually we shard by docid, but these are terms we shard by termid!
hi.m_shardByTermId = true;
if ((size_utf8Content - 1) > 0) {
// for exact content deduping
setStatus("hashing gbcontenthash (deduping) no-split keys");
// for exact content deduping
setStatus ( "hashing gbcontenthash (deduping) no-split keys" );
char cbuf[64];
int32_t clen = sprintf(cbuf,"%" PRIu64,(uint64_t)*pch64);
hi.m_prefix = "gbcontenthash";
if ( ! hashString ( cbuf,clen,&hi ) ) return false;
// this should be ready to go and not block!
int64_t *pch64 = getExactContentHash64();
if (!pch64 || pch64 == (void *)-1) { g_process.shutdownAbort(true); }
char *host = fu->getHost ();
char cbuf[64];
int32_t clen = sprintf(cbuf, "%" PRIu64, (uint64_t)*pch64);
hi.m_prefix = "gbcontenthash";
if (!hashString(cbuf, clen, &hi)) return false;
}
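The block above indexes the document's 64-bit exact-content hash as a term-id-sharded posdb key under the "gbcontenthash" prefix, so exact duplicates can be found with a single termlist lookup; illustratively, a page whose content hash is 123456789 should be reachable with a prefix query roughly of the form:

gbcontenthash:123456789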
// now hash the site
setStatus ( "hashing no-split SiteGetter terms");
Url *fu = getFirstUrl();
char *host = fu->getHost ();
//
// HASH terms for SiteGetter.cpp
//
@ -217,44 +217,6 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false;
}
//Dates *dp = getDates ();
// hash the clocks into indexdb
//if ( ! dp->hash ( m_docId , tt , this ) ) return false;
// . hash special site/hopcount thing for permalinks
// . used by Images.cpp for doing thumbnails
// . this returns false and sets g_errno on error
// . let's try thumbnails for all...
//if ( ! *getIsPermalink() ) return true;
/*
BR 20160117: No longer has image URLs
setStatus ( "hashing no-split gbimage keys" );
hi.m_prefix = "gbimage";
// hash gbimage: for permalinks only for Images.cpp
for ( int32_t i = 0 ; i < m_images.m_numImages ; i++ ) {
// get the node number
//int32_t nn = m_images.m_imageNodes[i];
// get the url of the image
//XmlNode *xn = m_xml.getNodePtr(nn);
int32_t srcLen;
char *src = m_images.getImageUrl(i,&srcLen);
// set it to the full url
Url iu;
// use "pageUrl" as the baseUrl
Url *cu = getCurrentUrl();
// we can addwww to normalize since this is for deduping kinda
iu.set ( cu , src , srcLen , true ); // addWWW? yes...
char *u = iu.getUrl ();
int32_t ulen = iu.getUrlLen();
// hash each one
//if ( ! hashString ( u,ulen,&hi ) ) return false;
// hash a single entity
if ( ! hashSingleTerm ( u,ulen,&hi) ) return false;
//log("test: %s",u);
}
*/
return true;
}
@ -285,9 +247,14 @@ char *XmlDoc::hashAll(HashTableX *table) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getContentType failed");
return NULL;
}
// BR 20160127: Never index JSON and XML content
if (*ct == CT_JSON || *ct == CT_XML) {
if (!hashContentType(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashContentType failed");
return NULL;
}
// For XML (JSON should not get here as it should be filtered out during spidering)
// store the URL as the only thing in posdb so we are able to find it, and
// eventually ban it.
@ -405,18 +372,17 @@ char *XmlDoc::hashAll(HashTableX *table) {
// global index now, so don't need this... 9/28/2014
// stop indexing xml docs
bool indexDoc = cr->m_indexBody;
// global index unless this is a json object in which case it is
// hashed above in the call to hashJSON(). this will decrease disk
// usage by about half, posdb* files are pretty big.
if (!indexDoc) {
if (!cr->m_indexBody) {
logTrace(g_conf.m_logTraceXmlDoc, "END, !indexDoc");
return (char *)1;
}
if ( *ct == CT_JSON || *ct == CT_XML ) {
goto skip;
if ((size_utf8Content - 1) <= 0) {
logTrace(g_conf.m_logTraceXmlDoc, "END, contentLen == 0");
return (char *)1;
}
// hash the body of the doc first so m_dist is 0 to match
@ -449,7 +415,7 @@ char *XmlDoc::hashAll(HashTableX *table) {
// we index the single words in the neighborhoods next, and
// we had songfacts.com coming up for the 'street light facts'
// query because it had a bunch of anomalous inlink text.
if (!hashIncomingLinkText(table, false, true)) {
if (!hashIncomingLinkText(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashIncomingLinkText failed");
return NULL;
}
@ -462,7 +428,6 @@ char *XmlDoc::hashAll(HashTableX *table) {
return NULL;
}
// BR 20160220
// Store value of meta tag "geo.placename" to help aid searches for
// location specific sites, e.g. 'Restaurant in London'
@ -471,8 +436,6 @@ char *XmlDoc::hashAll(HashTableX *table) {
return NULL;
}
skip:
// this will only increment the scores of terms already in the table
// because the neighborhoods are not technically in the document
// necessarily and we do not want to ruin our precision
@ -714,30 +677,6 @@ bool XmlDoc::hashDateNumbers ( HashTableX *tt ) { // , bool isStatusDoc ) {
if ( ! hashNumberForSorting ( buf , buf , bufLen , &hi ) )
return false;
// do not index the rest if we are a "spider reply" document
// which is like a fake document for seeing spider statuses
//if ( isStatusDoc == CT_STATUS ) return true;
//if ( isStatusDoc ) return true;
// now for CT_STATUS spider status "documents" we also index
// gbspiderdate so index this so we can just do a
// gbsortby:gbdocspiderdate and only get real DOCUMENTS not the
// spider status "documents"
/*
BR 20160108: Don't store these as we don't plan to use them
hi.m_desc = "doc last spidered date";
hi.m_prefix = "gbdocspiderdate";
bufLen = sprintf ( buf , "%" PRIu32, (uint32_t)m_spideredTime );
if ( ! hashNumberForSorting ( buf , buf , bufLen , &hi ) )
return false;
hi.m_desc = "doc last indexed date";
hi.m_prefix = "gbdocindexdate";
bufLen = sprintf ( buf , "%" PRIu32, (uint32_t)indexedTime );
if ( ! hashNumberForSorting ( buf , buf , bufLen , &hi ) )
return false;
*/
// all done
return true;
}
@ -1024,8 +963,7 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
Url uw;
uw.set( fu->getUrl(), fu->getUrlLen(), true, false );
hi.m_prefix = "url";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "url2";
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
return false;
@ -1228,21 +1166,15 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
int32_t elen = fu->getExtensionLen();
// update hash parms
hi.m_prefix = "ext";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "ext2";
if ( ! hashSingleTerm(ext,elen,&hi ) ) return false;
setStatus ( "hashing gbdocid" );
hi.m_prefix = "gbdocid";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbdocid2";
char buf2[32];
sprintf(buf2,"%" PRIu64, (uint64_t)m_docId );
if ( ! hashSingleTerm(buf2,strlen(buf2),&hi) ) return false;
//if ( isStatusDoc ) return true;
setStatus ( "hashing SiteGetter terms");
//
@ -1299,76 +1231,50 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
hi.m_prefix = "urlhash";
if ( ! hashString(buf,blen,&hi) ) return false;
/*
BR 20160106 removed.
blen = sprintf(buf,"%" PRIu32,h/10);
// update hashing parms
hi.m_prefix = "urlhashdiv10";
if ( ! hashString(buf,blen,&hi) ) return false;
blen = sprintf(buf,"%" PRIu32,h/100);
// update hashing parms
hi.m_prefix = "urlhashdiv100";
if ( ! hashString(buf,blen,&hi) ) return false;
*/
if (m_contentLen > 0) {
setStatus("hashing url mid domain");
// update parms
hi.m_prefix = NULL;
hi.m_desc = "middle domain";
hi.m_hashGroup = HASHGROUP_INURL;
hi.m_hashCommonWebWords = false; // Skip www, com, http etc.
if (!hashString(host, hlen, &hi)) {
return false;
}
setStatus ( "hashing url mid domain");
hi.m_hashCommonWebWords = true;
if (!hashSingleTerm(fu->getDomain(), fu->getDomainLen(), &hi)) {
return false;
}
// update parms
hi.m_prefix = NULL;
hi.m_desc = "middle domain";
hi.m_hashGroup = HASHGROUP_INURL;
hi.m_hashCommonWebWords = false; // Skip www, com, http etc.
if ( ! hashString ( host,hlen,&hi)) return false;
setStatus("hashing url path");
char *path = fu->getPath();
int32_t plen = fu->getPathLen();
hi.m_hashCommonWebWords = true;
if ( ! hashSingleTerm ( fu->getDomain(),fu->getDomainLen(),&hi)) return false;
// BR 20160113: Do not hash and combine the page filename extension with the page name (skip e.g. .com)
if (elen > 0) {
elen++; // also skip the dot
}
plen -= elen;
setStatus ( "hashing url path");
char *path = fu->getPath();
int32_t plen = fu->getPathLen();
// BR 20160113: Do not hash and combine the page filename extension with the page name (skip e.g. .com)
if( elen > 0 )
{
elen++; // also skip the dot
}
plen -= elen;
// BR 20160113: Do not hash the most common page names
if( strncmp(path, "/index", plen) != 0 )
{
// hash the path
// BR 20160114: Exclude numbers in paths (usually dates)
hi.m_hashNumbers = false;
if ( ! hashString (path,plen,&hi) ) return false;
// BR 20160113: Do not hash the most common page names
if (strncmp(path, "/index", plen) != 0) {
// hash the path
// BR 20160114: Exclude numbers in paths (usually dates)
hi.m_hashNumbers = false;
if (!hashString(path, plen, &hi)) return false;
}
}
return true;
}
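With the restructuring above, URL-derived terms are only hashed for non-empty documents. A hypothetical trace for http://www.example.com/news/2016/story.html under those rules:

// Illustrative call sequence only (assumes m_contentLen > 0):
//   hashString("www.example.com", ...)    // mid domain; www/com/http skipped
//                                         // (m_hashCommonWebWords = false)
//   hashSingleTerm("example.com", ...)    // whole domain as a single term
//   hashString("/news/2016/story", ...)   // ".html" stripped, numbers (2016)
//                                         // excluded (m_hashNumbers = false)
// A path of exactly "/index" (e.g. /index.html) is not hashed at all.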
// . returns false and sets g_errno on error
bool XmlDoc::hashIncomingLinkText ( HashTableX *tt ,
bool hashAnomalies ,
bool hashNonAnomalies ) {
// do not index ANY of the body if it is NOT a permalink and
// "menu elimination" technology is enabled.
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
bool XmlDoc::hashIncomingLinkText(HashTableX *tt) {
setStatus ( "hashing link text" );
// . now it must have an rss item to be indexed in all its glory
// . but if it tells us it has an rss feed, toss it and wait for
// the feed.... BUT sometimes the rss feed outlink is 404!
// . NO, now we discard with ENORSS at Msg16.cpp
//if ( ! *getHasRSSItem() && m_eliminateMenus ) return true;
// sanity check
if ( hashAnomalies == hashNonAnomalies ) { g_process.shutdownAbort(true); }
// sanity
if ( ! m_linkInfo1Valid ) { g_process.shutdownAbort(true); }
@ -1404,14 +1310,7 @@ bool XmlDoc::hashIncomingLinkText ( HashTableX *tt ,
bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff));
// count external inlinks we have for indexing gbmininlinks:
if ( ! internal ) ecount++;
// get score
//int64_t baseScore = k->m_baseScore;
// get the weight
//int64_t ww ;
//if ( internal ) ww = m_internalLinkTextWeight;
//else ww = m_externalLinkTextWeight;
// modify the baseScore
//int64_t final = (baseScore * ww) / 100LL;
// get length of link text
int32_t tlen = k->size_linkText;
if ( tlen > 0 ) tlen--;
@ -1423,15 +1322,16 @@ bool XmlDoc::hashIncomingLinkText ( HashTableX *tt ,
k->getUrl(),m_firstUrl.getUrl());
continue;
}
// if it is anomalous, set this, we don't
//if ( k->m_isAnomaly )
// hi.m_hashIffNotUnique = true;
//hi.m_baseScore = final;
if ( internal ) hi.m_hashGroup = HASHGROUP_INTERNALINLINKTEXT;
else hi.m_hashGroup = HASHGROUP_INLINKTEXT;
// store the siterank of the linker in this and use that
// to set the multiplier M bits i guess
hi.m_linkerSiteRank = k->m_siteRank;
if(hi.m_linkerSiteRank>MAXSITERANK) {
log(LOG_INFO,"Inlink had siteRank>max (%d), probably from docid %ld", k->m_siteRank, k->m_docId);
hi.m_linkerSiteRank = MAXSITERANK;
}
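The clamp matters because the linker's site rank is later used as an index into the m_linkerWeights table built in ScoringWeights::init() above (sqrt(1.0 + i)); a hedged sketch of that consumption, with an illustrative member name:

// Illustrative: weight the inlink text by the linking site's (clamped) rank.
float linkerWeight = weights.m_linkerWeights[hi.m_linkerSiteRank]; // ~1.0 .. 4.0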
// now record this so we can match the link text to
// a matched offsite inlink text term in the scoring info
k->m_wordPosStart = m_dist; // hi.m_startDist;
@ -1453,14 +1353,8 @@ bool XmlDoc::hashIncomingLinkText ( HashTableX *tt ,
// . returns false and sets g_errno on error
bool XmlDoc::hashNeighborhoods ( HashTableX *tt ) {
// seems like iffUnique is off, so do this
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
setStatus ( "hashing neighborhoods" );
//g_tt = table;
// . now we also hash the neighborhood text of each inlink, that is,
// the text surrounding the inlink text.
// . this is also destructive in that it will remove termids that
@ -1702,15 +1596,6 @@ bool XmlDoc::hashLanguage ( HashTableX *tt ) {
if ( ! hashString ( s, slen, &hi ) ) return false;
/*
BR 20160117: Duplicate
// try lang abbreviation
sprintf(s , "%s ", getLanguageAbbr(langId) );
// go back to broken way to try to fix parsing consistency bug
// by adding hashLanguageString() function below
//sprintf(s , "%s ", getLanguageAbbr(langId) );
if ( ! hashString ( s, slen, &hi ) ) return false;
*/
return true;
}