forked from Mirrors/privacore-open-source-search-engine
Merge branch 'master' into dev-dumpthread
This commit is contained in:
Collectiondb.cppCollectiondb.hConf.cppConf.hHostdb.cppHostdb.hHttpServer.cppMsg39.cppMsg39.hMsg3a.cppMsg3a.hMsg40.cppMsg51.cppMsg51.hPageHosts.cppPageResults.cppPageTemperatureRegistry.cppPageTemperatureRegistry.hPages.cppParms.cppParms.hPingServer.cppPosdbTable.cppProxy.cppScoringWeights.cppSearchInput.cppSearchInput.hStatistics.cppStatistics.hStats.cppStats.hTcpServer.cppXmlDoc.cppXmlDoc.hXmlDoc_Indexing.cpp
@ -1021,6 +1021,7 @@ CollectionRec::CollectionRec() {
|
||||
m_summDedupNumLines = 0;
|
||||
m_maxQueryTerms = 0;
|
||||
m_sameLangWeight = 0.0;
|
||||
m_unknownLangWeight = 0.0;
|
||||
memset(m_defaultSortLanguage2, 0, sizeof(m_defaultSortLanguage2));
|
||||
m_importEnabled = false;
|
||||
m_numImportInjects = 0;
|
||||
|
@ -240,6 +240,7 @@ public:
|
||||
|
||||
//ranking settings
|
||||
float m_sameLangWeight;
|
||||
float m_unknownLangWeight;
|
||||
|
||||
// Language stuff
|
||||
char m_defaultSortLanguage2[6];
|
||||
|
2
Conf.cpp
2
Conf.cpp
@ -131,6 +131,8 @@ Conf::Conf ( ) {
|
||||
m_hashGroupWeightInUrl = 0.0;
|
||||
m_hashGroupWeightInMenu = 0.0;
|
||||
m_synonymWeight = 0.0;
|
||||
m_pageTemperatureWeightMin = 0.0;
|
||||
m_pageTemperatureWeightMax = 0.0;
|
||||
m_usePageTemperatureForRanking = true;
|
||||
m_numFlagScoreMultipliers = 26;
|
||||
m_numFlagRankAdjustments = 26;
|
||||
|
3
Conf.h
3
Conf.h
@ -224,8 +224,9 @@ class Conf {
|
||||
float m_hashGroupWeightInternalLinkText;
|
||||
float m_hashGroupWeightInUrl;
|
||||
float m_hashGroupWeightInMenu;
|
||||
|
||||
float m_synonymWeight;
|
||||
float m_pageTemperatureWeightMin;
|
||||
float m_pageTemperatureWeightMax;
|
||||
|
||||
bool m_usePageTemperatureForRanking;
|
||||
|
||||
|
42
Hostdb.cpp
42
Hostdb.cpp
@ -683,7 +683,7 @@ createFile:
|
||||
m_hosts[i].m_emailCode = -2;
|
||||
// reset these
|
||||
m_hosts[i].m_pingInfo.m_flags = 0;
|
||||
m_hosts[i].m_pingInfo.m_cpuUsage = 0.0;
|
||||
m_hosts[i].m_pingInfo.m_unused4 = 0.0;
|
||||
m_hosts[i].m_loadAvg = 0.0;
|
||||
|
||||
m_hosts[i].m_lastResponseReceiveTimestamp = 0;
|
||||
@ -1237,8 +1237,9 @@ Host *Hostdb::getHostWithSpideringEnabled ( uint32_t shardNum ) {
|
||||
|
||||
// if niceness 0 can't pick noquery host/ must pick spider host.
|
||||
// if niceness 1 can't pick nospider host/ must pick query host.
|
||||
// Used to select based on PingInfo::m_udpSlotsInUseIncoming but that information is not exchanged often enough to
|
||||
// be even remotely accurate with any realistic number of shards.
|
||||
Host *Hostdb::getLeastLoadedInShard ( uint32_t shardNum , char niceness ) {
|
||||
int32_t minOutstandingRequests = 0x7fffffff;
|
||||
int32_t minOutstandingRequestsIndex = -1;
|
||||
Host *shard = getShard ( shardNum );
|
||||
Host *bestDead = NULL;
|
||||
@ -1251,13 +1252,7 @@ Host *Hostdb::getLeastLoadedInShard ( uint32_t shardNum , char niceness ) {
|
||||
if ( niceness == 0 && ! hh->m_queryEnabled ) continue;
|
||||
if ( ! bestDead ) bestDead = hh;
|
||||
if(isDead(hh)) continue;
|
||||
// log("host %" PRId32 " numOutstanding is %" PRId32, hh->m_hostId,
|
||||
// hh->m_pingInfo.m_udpSlotsInUseIncoming);
|
||||
if ( hh->m_pingInfo.m_udpSlotsInUseIncoming >
|
||||
minOutstandingRequests )
|
||||
continue;
|
||||
|
||||
minOutstandingRequests =hh->m_pingInfo.m_udpSlotsInUseIncoming;
|
||||
minOutstandingRequestsIndex = i;
|
||||
}
|
||||
// we should never return a nospider/noquery host depending on
|
||||
@ -1374,7 +1369,7 @@ bool Hostdb::replaceHost ( int32_t origHostId, int32_t spareHostId ) {
|
||||
oldHost->m_ping = g_conf.m_deadHostTimeout;
|
||||
oldHost->m_pingShotgun = g_conf.m_deadHostTimeout;
|
||||
oldHost->m_emailCode = 0;
|
||||
oldHost->m_pingInfo.m_udpSlotsInUseIncoming = 0;
|
||||
oldHost->m_pingInfo.m_unused12 = 0;
|
||||
oldHost->m_errorReplies = 0;
|
||||
oldHost->m_dgramsTo = 0;
|
||||
oldHost->m_dgramsFrom = 0;
|
||||
@ -1431,27 +1426,27 @@ void Hostdb::updatePingInfo(Host *h, const PingInfo &pi) {
|
||||
|
||||
h->m_pingInfo.m_unused0 = 0;
|
||||
h->m_pingInfo.m_hostId = pi.m_hostId;
|
||||
h->m_pingInfo.m_loadAvg = pi.m_loadAvg;
|
||||
h->m_pingInfo.m_percentMemUsed = pi.m_percentMemUsed;
|
||||
h->m_pingInfo.m_cpuUsage = pi.m_cpuUsage;
|
||||
h->m_pingInfo.m_unused2 = 0;
|
||||
h->m_pingInfo.m_unused3 = 0;
|
||||
h->m_pingInfo.m_unused4 = 0.0;
|
||||
h->m_pingInfo.m_totalDocsIndexed = pi.m_totalDocsIndexed;
|
||||
h->m_pingInfo.m_hostsConfCRC = pi.m_hostsConfCRC;
|
||||
h->m_pingInfo.m_diskUsage = pi.m_diskUsage;
|
||||
h->m_pingInfo.m_unused7 = 0.0;
|
||||
h->m_pingInfo.m_flags = pi.m_flags;
|
||||
h->m_pingInfo.m_numCorruptDiskReads = pi.m_numCorruptDiskReads;
|
||||
h->m_pingInfo.m_numOutOfMems = pi.m_numOutOfMems;
|
||||
h->m_pingInfo.m_socketsClosedFromHittingLimit = pi.m_socketsClosedFromHittingLimit;
|
||||
h->m_pingInfo.m_unused9 = 0;
|
||||
h->m_pingInfo.m_unused10 = 0;
|
||||
h->m_pingInfo.m_unused11 = 0;
|
||||
//m_totalResends is updated directly by UdpSlot
|
||||
//h->m_pingInfo.m_totalResends = pi.m_totalResends;
|
||||
//m_etryagains is updated directly by UdpServer
|
||||
//h->m_pingInfo.m_etryagains = pi.m_etryagains;
|
||||
h->m_pingInfo.m_udpSlotsInUseIncoming = pi.m_udpSlotsInUseIncoming;
|
||||
h->m_pingInfo.m_tcpSocketsInUse = pi.m_tcpSocketsInUse;
|
||||
h->m_pingInfo.m_currentSpiders = pi.m_currentSpiders;
|
||||
h->m_pingInfo.m_unused12 = 0;
|
||||
h->m_pingInfo.m_unused13 = 0;
|
||||
h->m_pingInfo.m_unused14 = 0;
|
||||
h->m_pingInfo.m_dailyMergeCollnum = pi.m_dailyMergeCollnum;
|
||||
memcpy(h->m_pingInfo.m_gbVersionStr,pi.m_gbVersionStr,sizeof(pi.m_gbVersionStr));
|
||||
h->m_pingInfo.m_repairMode = pi.m_repairMode;
|
||||
h->m_pingInfo.m_recoveryLevel = pi.m_recoveryLevel;
|
||||
h->m_pingInfo.m_unused18 = 0;
|
||||
}
|
||||
|
||||
|
||||
@ -1750,15 +1745,14 @@ int32_t *getLocalIps ( ) {
|
||||
log("hostdb: getifaddrs: %s.",mstrerror(errno));
|
||||
return NULL;
|
||||
}
|
||||
ifaddrs *p = ifap;
|
||||
int32_t ni = 0;
|
||||
// store loopback just in case
|
||||
int32_t loopback = atoip("127.0.0.1");
|
||||
s_localIps[ni++] = loopback;
|
||||
for ( ; p && ni < 18 ; p = p->ifa_next ) {
|
||||
// avoid possible core dump
|
||||
for(ifaddrs *p = ifap; p && ni < 18 ; p = p->ifa_next) {
|
||||
if ( ! p->ifa_addr ) continue;
|
||||
//break; // mdw hack...
|
||||
if(p->ifa_addr->sa_family != AF_INET)
|
||||
continue;
|
||||
struct sockaddr_in *xx = (sockaddr_in *)(void*)p->ifa_addr;
|
||||
int32_t ip = xx->sin_addr.s_addr;
|
||||
// skip if loopback we stored above
|
||||
|
22
Hostdb.h
22
Hostdb.h
@ -45,27 +45,27 @@ class PingInfo {
|
||||
public:
|
||||
int64_t m_unused0; //used to be a timestamp for clock synchronization
|
||||
int32_t m_hostId;
|
||||
int32_t m_loadAvg;
|
||||
float m_percentMemUsed;
|
||||
float m_cpuUsage;
|
||||
int32_t m_unused2; //used for the m_loadAvg
|
||||
float m_unused3; //used to be m_percentMemUsed;
|
||||
float m_unused4; //used to be m_cpuUsage
|
||||
int32_t m_totalDocsIndexed;
|
||||
int32_t m_hostsConfCRC;
|
||||
float m_diskUsage;
|
||||
float m_unused7; //used to be m_diskUsage
|
||||
int32_t m_flags;
|
||||
// some new stuff
|
||||
int32_t m_numCorruptDiskReads;
|
||||
int32_t m_numOutOfMems;
|
||||
int32_t m_socketsClosedFromHittingLimit;
|
||||
int32_t m_unused9;
|
||||
int32_t m_unused10;
|
||||
int32_t m_unused11;
|
||||
|
||||
int32_t m_udpSlotsInUseIncoming;
|
||||
int32_t m_tcpSocketsInUse;
|
||||
int32_t m_unused12;
|
||||
int32_t m_unused13;
|
||||
|
||||
int16_t m_currentSpiders;
|
||||
int16_t m_unused14;
|
||||
collnum_t m_dailyMergeCollnum;
|
||||
|
||||
char m_gbVersionStr[21];
|
||||
char m_repairMode;
|
||||
uint8_t m_recoveryLevel;
|
||||
uint8_t m_unused18;
|
||||
};
|
||||
|
||||
class Host {
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include "Collectiondb.h"
|
||||
#include "HashTable.h"
|
||||
#include "Stats.h"
|
||||
#include "Statistics.h"
|
||||
#include "HttpMime.h"
|
||||
#include "Hostdb.h"
|
||||
#include "Loop.h"
|
||||
@ -521,7 +522,7 @@ void HttpServer::requestHandler ( TcpSocket *s ) {
|
||||
sendErrorReply ( s , 500 , "Too many sockets open.");
|
||||
// count as a failed query so we send an email alert if too
|
||||
// many of these happen
|
||||
g_stats.m_closedSockets++;
|
||||
Statistics::register_socket_limit_hit();
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -49,8 +49,15 @@ void Msg39Request::reset() {
|
||||
m_collnum = -1;
|
||||
m_useQueryStopWords = true;
|
||||
m_doMaxScoreAlgo = true;
|
||||
m_termFreqWeightFreqMin = 0.0;
|
||||
m_termFreqWeightFreqMax = 0.5;
|
||||
m_termFreqWeightMin = 0.5;
|
||||
m_termFreqWeightMax = 1.0;
|
||||
m_synonymWeight = 0.9;
|
||||
m_pageTemperatureWeightMin = 1.0;
|
||||
m_pageTemperatureWeightMax = 20.0;
|
||||
m_usePageTemperatureForRanking = true;
|
||||
|
||||
for(int i=0; i<26; i++)
|
||||
m_flagScoreMultiplier[i] = 1.0;
|
||||
for(int i=0; i<26; i++)
|
||||
@ -61,6 +68,7 @@ void Msg39Request::reset() {
|
||||
size_query = 0;
|
||||
size_whiteList = 0;
|
||||
m_sameLangWeight = 20.0;
|
||||
m_unknownLangWeight = 10.0;
|
||||
|
||||
// -1 means do not apply a docid range restriction
|
||||
m_minDocId = -1LL;
|
||||
|
7
Msg39.h
7
Msg39.h
@ -37,6 +37,7 @@ class Msg39Request {
|
||||
int32_t m_maxQueryTerms;
|
||||
int32_t m_numDocIdSplits;
|
||||
float m_sameLangWeight;
|
||||
float m_unknownLangWeight;
|
||||
|
||||
//int32_t m_compoundListMaxSize;
|
||||
uint8_t m_language;
|
||||
@ -58,7 +59,13 @@ class Msg39Request {
|
||||
bool m_doMaxScoreAlgo;
|
||||
|
||||
ScoringWeights m_scoringWeights;
|
||||
float m_termFreqWeightFreqMin;
|
||||
float m_termFreqWeightFreqMax;
|
||||
float m_termFreqWeightMin;
|
||||
float m_termFreqWeightMax;
|
||||
float m_synonymWeight;
|
||||
float m_pageTemperatureWeightMin;
|
||||
float m_pageTemperatureWeightMax;
|
||||
bool m_usePageTemperatureForRanking;
|
||||
|
||||
float m_flagScoreMultiplier[26];
|
||||
|
11
Msg3a.cpp
11
Msg3a.cpp
@ -200,7 +200,8 @@ bool Msg3a::getDocIds(const SearchInput *si, Query *q, void *state,
|
||||
(PTRTYPE)this);
|
||||
}
|
||||
|
||||
setTermFreqWeights(m_msg39req.m_collnum, m_q);
|
||||
setTermFreqWeights(m_msg39req.m_collnum, m_q, m_msg39req.m_termFreqWeightFreqMin, m_msg39req.m_termFreqWeightFreqMax,
|
||||
m_msg39req.m_termFreqWeightMin, m_msg39req.m_termFreqWeightMax);
|
||||
|
||||
if ( m_debug ) {
|
||||
for ( int32_t i = 0 ; i < m_q->m_numTerms ; i++ ) {
|
||||
@ -1005,15 +1006,15 @@ void Msg3a::printTerms ( ) {
|
||||
}
|
||||
|
||||
|
||||
static float getTermFreqWeight(int64_t termFreq, int64_t numDocsInColl) {
|
||||
static float getTermFreqWeight(int64_t termFreq, int64_t numDocsInColl, float termFreqWeightFreqMin, float termFreqWeightFreqMax, float termFreqWeightMin, float termFreqWeightMax) {
|
||||
if(numDocsInColl>0)
|
||||
return scale_linear(((float)termFreq)/numDocsInColl, g_conf.m_termFreqWeightFreqMin, g_conf.m_termFreqWeightFreqMax, g_conf.m_termFreqWeightMax, g_conf.m_termFreqWeightMin);
|
||||
return scale_linear(((float)termFreq)/numDocsInColl, termFreqWeightFreqMin, termFreqWeightFreqMax, termFreqWeightMax, termFreqWeightMin);
|
||||
else
|
||||
return 1.0; //whatever...
|
||||
}
|
||||
|
||||
|
||||
void setTermFreqWeights ( collnum_t collnum , Query *q ) {
|
||||
void setTermFreqWeights ( collnum_t collnum , Query *q, float termFreqWeightFreqMin, float termFreqWeightFreqMax, float termFreqWeightMin, float termFreqWeightMax) {
|
||||
int64_t numDocsInColl = 0;
|
||||
RdbBase *base = getRdbBase ( RDB_CLUSTERDB, collnum );
|
||||
if ( base ) numDocsInColl = base->estimateNumGlobalRecs();
|
||||
@ -1032,7 +1033,7 @@ void setTermFreqWeights ( collnum_t collnum , Query *q ) {
|
||||
// GET THE TERMFREQ for setting weights
|
||||
int64_t tf = g_posdb.getTermFreq ( collnum ,qt->m_termId);
|
||||
qt->m_termFreq = tf;
|
||||
float tfw = getTermFreqWeight(tf,numDocsInColl);
|
||||
float tfw = getTermFreqWeight(tf,numDocsInColl, termFreqWeightFreqMin, termFreqWeightFreqMax, termFreqWeightMin, termFreqWeightMax);
|
||||
qt->m_termFreqWeight = tfw;
|
||||
}
|
||||
}
|
||||
|
2
Msg3a.h
2
Msg3a.h
@ -7,7 +7,7 @@
|
||||
class SearchInput;
|
||||
class Query;
|
||||
|
||||
void setTermFreqWeights ( collnum_t collnum, class Query *q );
|
||||
void setTermFreqWeights ( collnum_t collnum , Query *q, float termFreqWeightFreqMin, float termFreqWeightFreqMax, float termFreqWeightMin, float termFreqWeightMax);
|
||||
|
||||
#define MAX_SHARDS 1024
|
||||
|
||||
|
18
Msg40.cpp
18
Msg40.cpp
@ -344,7 +344,16 @@ bool Msg40::federatedLoop ( ) {
|
||||
m_si->m_hashGroupWeightInternalLinkText,
|
||||
m_si->m_hashGroupWeightInUrl,
|
||||
m_si->m_hashGroupWeightInMenu);
|
||||
|
||||
mr.m_termFreqWeightFreqMin = m_si->m_termFreqWeightFreqMin;
|
||||
mr.m_termFreqWeightFreqMax = m_si->m_termFreqWeightFreqMax;
|
||||
mr.m_termFreqWeightMin = m_si->m_termFreqWeightMin;
|
||||
mr.m_termFreqWeightMax = m_si->m_termFreqWeightMax;
|
||||
|
||||
mr.m_synonymWeight = m_si->m_synonymWeight;
|
||||
mr.m_pageTemperatureWeightMin = m_si->m_pageTemperatureWeightMin;
|
||||
mr.m_pageTemperatureWeightMax = m_si->m_pageTemperatureWeightMax;
|
||||
|
||||
mr.m_usePageTemperatureForRanking = m_si->m_usePageTemperatureForRanking;
|
||||
memcpy(mr.m_flagScoreMultiplier, m_si->m_flagScoreMultiplier, sizeof(mr.m_flagScoreMultiplier));
|
||||
memcpy(mr.m_flagRankAdjustment, m_si->m_flagRankAdjustment, sizeof(mr.m_flagRankAdjustment));
|
||||
@ -364,6 +373,7 @@ bool Msg40::federatedLoop ( ) {
|
||||
mr.m_minSerpDocId = m_si->m_minSerpDocId;
|
||||
mr.m_maxSerpScore = m_si->m_maxSerpScore;
|
||||
mr.m_sameLangWeight = m_si->m_sameLangWeight;
|
||||
mr.m_unknownLangWeight = m_si->m_unknownLangWeight;
|
||||
memcpy(mr.m_queryId, m_si->m_queryId, sizeof(m_si->m_queryId));
|
||||
|
||||
if ( mr.m_timeout < m_si->m_minMsg3aTimeout )
|
||||
@ -1494,6 +1504,14 @@ bool Msg40::gotSummary ( ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// filter simplified redirection/non-canonical document
|
||||
if (mr && mr->size_rubuf > 1 && mr->m_contentLen == 0) {
|
||||
if (!m_si->m_showErrors) {
|
||||
*level = CR_EMPTY_REDIRECTION_PAGE;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// filter empty title & summaries
|
||||
if ( mr && mr->size_tbuf <= 1 && mr->size_displaySum <= 1 ) {
|
||||
if ( ! m_si->m_showErrors ) {
|
||||
|
@ -34,8 +34,8 @@ const char * const g_crStrings[] = {
|
||||
"summary error" ,
|
||||
"duplicate" ,
|
||||
"clusterdb error (subcount of visible)" ,
|
||||
"duplicate url",
|
||||
"wasted summary lookup" ,
|
||||
"duplicate url",
|
||||
"empty redirection page" ,
|
||||
"visible" ,
|
||||
"blacklisted" ,
|
||||
"ruleset filtered" ,
|
||||
|
5
Msg51.h
5
Msg51.h
@ -48,9 +48,8 @@ enum {
|
||||
CR_ERROR_CLUSTERDB ,
|
||||
// the url is a dup of a previous url (wiki pages capitalization)
|
||||
CR_DUP_URL ,
|
||||
// . subset of the CR_OK (visible) results are "wasted" titlerec lookup
|
||||
// . only used for stats by Msg40.cpp/Stats.cpp
|
||||
CR_WASTED ,
|
||||
// the url doesn't have any content due to simplified redirection page/non-canonical page
|
||||
CR_EMPTY_REDIRECTION_PAGE,
|
||||
// the docid is ok to display!
|
||||
CR_OK ,
|
||||
// from a blacklisted site hash
|
||||
|
234
PageHosts.cpp
234
PageHosts.cpp
@ -21,9 +21,6 @@ static int errorsSort ( const void *i1, const void *i2 );
|
||||
static int tryagainSort ( const void *i1, const void *i2 );
|
||||
static int dgramsToSort ( const void *i1, const void *i2 );
|
||||
static int dgramsFromSort ( const void *i1, const void *i2 );
|
||||
static int memUsedSort ( const void *i1, const void *i2 );
|
||||
static int cpuUsageSort ( const void *i1, const void *i2 );
|
||||
static int diskUsageSort ( const void *i1, const void *i2 );
|
||||
|
||||
static int32_t generatePingMsg( Host *h, int64_t nowms, char *buffer );
|
||||
|
||||
@ -156,15 +153,6 @@ skipReplaceHost:
|
||||
|
||||
"<td><b>docs indexed</a></td>"
|
||||
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=9\">"
|
||||
"<b>mem used</a></td>"
|
||||
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=10\">"
|
||||
"<b>cpu used</b></a></td>"
|
||||
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=17\">"
|
||||
"<b>disk used</b></a></td>"
|
||||
|
||||
"<td><a href=\"/admin/hosts?c=%s&sort=14\">"
|
||||
"<b>max ping1</b></a></td>"
|
||||
|
||||
@ -190,9 +178,6 @@ skipReplaceHost:
|
||||
cs,
|
||||
cs,
|
||||
cs,
|
||||
cs,
|
||||
cs,
|
||||
cs,
|
||||
shotcol );
|
||||
|
||||
// loop through each host we know and print its stats
|
||||
@ -225,15 +210,12 @@ skipReplaceHost:
|
||||
case 6: gbsort ( hostSort, nh, sizeof(int32_t), dgramsToSort ); break;
|
||||
case 7: gbsort ( hostSort, nh, sizeof(int32_t), dgramsFromSort ); break;
|
||||
//case 8:
|
||||
case 9: gbsort ( hostSort, nh, sizeof(int32_t), memUsedSort ); break;
|
||||
case 10:gbsort ( hostSort, nh, sizeof(int32_t), cpuUsageSort ); break;
|
||||
case 11:gbsort ( hostSort, nh, sizeof(int32_t), pingAgeSort ); break;
|
||||
case 12:gbsort ( hostSort, nh, sizeof(int32_t), flagSort ); break;
|
||||
case 13:gbsort ( hostSort, nh, sizeof(int32_t), splitTimeSort ); break;
|
||||
case 14:gbsort ( hostSort, nh, sizeof(int32_t), pingMaxSort ); break;
|
||||
//case 15:
|
||||
case 16:gbsort ( hostSort, nh, sizeof(int32_t), defaultSort ); break;
|
||||
case 17:gbsort ( hostSort, nh, sizeof(int32_t), diskUsageSort ); break;
|
||||
|
||||
}
|
||||
|
||||
@ -309,27 +291,6 @@ skipReplaceHost:
|
||||
char ipbuf3[64];
|
||||
strcpy(ipbuf3,iptoa(eip));
|
||||
|
||||
const char *fontTagFront = "";
|
||||
const char *fontTagBack = "";
|
||||
if ( h->m_pingInfo.m_percentMemUsed >= 98.0 &&
|
||||
format == FORMAT_HTML ) {
|
||||
fontTagFront = "<font color=red>";
|
||||
fontTagBack = "</font>";
|
||||
}
|
||||
|
||||
float cpu = h->m_pingInfo.m_cpuUsage;
|
||||
if ( cpu > 100.0 ) cpu = 100.0;
|
||||
if ( cpu < 0.0 ) cpu = -1.0;
|
||||
|
||||
char diskUsageMsg[64];
|
||||
sprintf(diskUsageMsg,"%.1f%%",h->m_pingInfo.m_diskUsage);
|
||||
if ( h->m_pingInfo.m_diskUsage < 0.0 )
|
||||
sprintf(diskUsageMsg,"???");
|
||||
if ( h->m_pingInfo.m_diskUsage>=98.0 && format == FORMAT_HTML )
|
||||
sprintf(diskUsageMsg,"<font color=red><b>%.1f%%"
|
||||
"</b></font>",h->m_pingInfo.m_diskUsage);
|
||||
|
||||
|
||||
// split time, don't divide by zero!
|
||||
int32_t splitTime = 0;
|
||||
if ( h->m_splitsDone )
|
||||
@ -355,42 +316,10 @@ skipReplaceHost:
|
||||
int32_t flags = h->m_pingInfo.m_flags;
|
||||
|
||||
|
||||
if ( format == FORMAT_HTML ) {
|
||||
// use these new ones for now
|
||||
int n = h->m_pingInfo.m_numCorruptDiskReads;
|
||||
if ( n )
|
||||
fb.safePrintf("<font color=red><b>"
|
||||
"C"
|
||||
"<sup>%" PRId32"</sup>"
|
||||
"</b></font>"
|
||||
, n );
|
||||
n = h->m_pingInfo.m_numOutOfMems;
|
||||
if ( n )
|
||||
fb.safePrintf("<font color=red><b>"
|
||||
"O"
|
||||
"<sup>%" PRId32"</sup>"
|
||||
"</b></font>"
|
||||
, n );
|
||||
n = h->m_pingInfo.m_socketsClosedFromHittingLimit;
|
||||
if ( n )
|
||||
fb.safePrintf("<font color=red><b>"
|
||||
"K"
|
||||
"<sup>%" PRId32"</sup>"
|
||||
"</b></font>"
|
||||
, n );
|
||||
}
|
||||
|
||||
// recovery mode? recovered from coring?
|
||||
if ((flags & PFLAG_RECOVERYMODE)&& format == FORMAT_HTML ) {
|
||||
fb.safePrintf("<b title=\"Recovered from core"
|
||||
"\">x</b>");
|
||||
// this is only 8-bits at the moment so it's capped
|
||||
// at 255. this level is 1 the first time we core
|
||||
// and are restarted.
|
||||
if ( h->m_pingInfo.m_recoveryLevel > 1 )
|
||||
fb.safePrintf("<sup>%" PRId32"</sup>",
|
||||
(int32_t)
|
||||
h->m_pingInfo.m_recoveryLevel);
|
||||
}
|
||||
|
||||
if ((flags & PFLAG_RECOVERYMODE)&& format != FORMAT_HTML )
|
||||
@ -416,59 +345,7 @@ skipReplaceHost:
|
||||
|
||||
// if it has spiders going on say "S" with # as the superscript
|
||||
if ((flags & PFLAG_HASSPIDERS) && format == FORMAT_HTML )
|
||||
fb.safePrintf ( "<span title=\"Spidering\">S"
|
||||
"<sup>%" PRId32"</sup>"
|
||||
"</span>"
|
||||
,h->m_pingInfo.m_currentSpiders
|
||||
);
|
||||
|
||||
if ( format == FORMAT_HTML &&
|
||||
h->m_pingInfo.m_udpSlotsInUseIncoming ) {
|
||||
const char *f1 = "";
|
||||
const char *f2 = "";
|
||||
// MAXUDPSLOTS in Spider.cpp is 300 right now
|
||||
if ( h->m_pingInfo.m_udpSlotsInUseIncoming >= 300 ) {
|
||||
f1 = "<b>";
|
||||
f2 = "</b>";
|
||||
}
|
||||
if ( h->m_pingInfo.m_udpSlotsInUseIncoming >= 400 ) {
|
||||
f1 = "<b><font color=red>";
|
||||
f2 = "</font></b>";
|
||||
}
|
||||
fb.safePrintf("<span title=\"udpSlotsInUse\">"
|
||||
"%s"
|
||||
"U"
|
||||
"<sup>%" PRId32"</sup>"
|
||||
"%s"
|
||||
"</span>"
|
||||
,f1
|
||||
,h->m_pingInfo.m_udpSlotsInUseIncoming
|
||||
,f2
|
||||
);
|
||||
}
|
||||
|
||||
if ( format == FORMAT_HTML && h->m_pingInfo.m_tcpSocketsInUse){
|
||||
const char *f1 = "";
|
||||
const char *f2 = "";
|
||||
if ( h->m_pingInfo.m_tcpSocketsInUse >= 100 ) {
|
||||
f1 = "<b>";
|
||||
f2 = "</b>";
|
||||
}
|
||||
if ( h->m_pingInfo.m_tcpSocketsInUse >= 200 ) {
|
||||
f1 = "<b><font color=red>";
|
||||
f2 = "</font></b>";
|
||||
}
|
||||
fb.safePrintf("<span title=\"tcpSocketsInUse\">"
|
||||
"%s"
|
||||
"T"
|
||||
"<sup>%" PRId32"</sup>"
|
||||
"%s"
|
||||
"</span>"
|
||||
,f1
|
||||
,h->m_pingInfo.m_tcpSocketsInUse
|
||||
,f2
|
||||
);
|
||||
}
|
||||
fb.safePrintf ( "<span title=\"Spidering\">S</span>");
|
||||
|
||||
if ((flags & PFLAG_HASSPIDERS) && format != FORMAT_HTML )
|
||||
fb.safePrintf ( "Spidering");
|
||||
@ -556,14 +433,6 @@ skipReplaceHost:
|
||||
"</errorTryAgains>\n",
|
||||
h->m_etryagains.load());
|
||||
|
||||
sb.safePrintf("\t\t<udpSlotsInUse>%" PRId32
|
||||
"</udpSlotsInUse>\n",
|
||||
h->m_pingInfo.m_udpSlotsInUseIncoming);
|
||||
|
||||
sb.safePrintf("\t\t<tcpSocketsInUse>%" PRId32
|
||||
"</tcpSocketsInUse>\n",
|
||||
h->m_pingInfo.m_tcpSocketsInUse);
|
||||
|
||||
/*
|
||||
sb.safePrintf("\t\t<dgramsTo>%" PRId64"</dgramsTo>\n",
|
||||
h->m_dgramsTo);
|
||||
@ -571,21 +440,6 @@ skipReplaceHost:
|
||||
h->m_dgramsFrom);
|
||||
*/
|
||||
|
||||
sb.safePrintf("\t\t<numCorruptDiskReads>%" PRId32
|
||||
"</numCorruptDiskReads>\n"
|
||||
,h->m_pingInfo.m_numCorruptDiskReads);
|
||||
sb.safePrintf("\t\t<numOutOfMems>%" PRId32
|
||||
"</numOutOfMems>\n"
|
||||
,h->m_pingInfo.m_numOutOfMems);
|
||||
sb.safePrintf("\t\t<numClosedSockets>%" PRId32
|
||||
"</numClosedSockets>\n"
|
||||
,h->m_pingInfo.
|
||||
m_socketsClosedFromHittingLimit);
|
||||
sb.safePrintf("\t\t<numOutstandingSpiders>%" PRId32
|
||||
"</numOutstandingSpiders>\n"
|
||||
,h->m_pingInfo.m_currentSpiders );
|
||||
|
||||
|
||||
sb.safePrintf("\t\t<splitTime>%" PRId32"</splitTime>\n",
|
||||
splitTime);
|
||||
sb.safePrintf("\t\t<splitsDone>%" PRId32"</splitsDone>\n",
|
||||
@ -598,18 +452,6 @@ skipReplaceHost:
|
||||
"</docsIndexed>\n",
|
||||
h->m_pingInfo.m_totalDocsIndexed);
|
||||
|
||||
sb.safePrintf("\t\t<percentMemUsed>%.1f%%"
|
||||
"</percentMemUsed>",
|
||||
h->m_pingInfo.m_percentMemUsed); // float
|
||||
|
||||
sb.safePrintf("\t\t<cpuUsage>%.1f%%"
|
||||
"</cpuUsage>",
|
||||
cpu );
|
||||
|
||||
sb.safePrintf("\t\t<percentDiskUsed><![CDATA[%s]]>"
|
||||
"</percentDiskUsed>",
|
||||
diskUsageMsg);
|
||||
|
||||
sb.safePrintf("\t\t<maxPing1>%s</maxPing1>\n",
|
||||
pms );
|
||||
|
||||
@ -671,10 +513,6 @@ skipReplaceHost:
|
||||
*/
|
||||
sb.safePrintf("\t\t\t\t\"errorTryAgains\":%" PRId32",\n",
|
||||
h->m_etryagains.load());
|
||||
sb.safePrintf("\t\t\t\t\"udpSlotsInUse\":%" PRId32",\n",
|
||||
h->m_pingInfo.m_udpSlotsInUseIncoming);
|
||||
sb.safePrintf("\t\t\t\t\"tcpSocketsInUse\":%" PRId32",\n",
|
||||
h->m_pingInfo.m_tcpSocketsInUse);
|
||||
|
||||
/*
|
||||
sb.safePrintf("\t\t\t\t\"dgramsTo\":%" PRId64",\n",
|
||||
@ -684,18 +522,6 @@ skipReplaceHost:
|
||||
*/
|
||||
|
||||
|
||||
sb.safePrintf("\t\t\t\t\"numCorruptDiskReads\":%" PRId32",\n"
|
||||
,h->m_pingInfo.m_numCorruptDiskReads);
|
||||
sb.safePrintf("\t\t\t\t\"numOutOfMems\":%" PRId32",\n"
|
||||
,h->m_pingInfo.m_numOutOfMems);
|
||||
sb.safePrintf("\t\t\t\t\"numClosedSockets\":%" PRId32",\n"
|
||||
,h->m_pingInfo.
|
||||
m_socketsClosedFromHittingLimit);
|
||||
sb.safePrintf("\t\t\t\t\"numOutstandingSpiders\":%" PRId32
|
||||
",\n"
|
||||
,h->m_pingInfo.m_currentSpiders );
|
||||
|
||||
|
||||
sb.safePrintf("\t\t\t\t\"splitTime\":%" PRId32",\n",
|
||||
splitTime);
|
||||
sb.safePrintf("\t\t\t\t\"splitsDone\":%" PRId32",\n",
|
||||
@ -707,14 +533,6 @@ skipReplaceHost:
|
||||
sb.safePrintf("\t\t\t\t\"docsIndexed\":%" PRId32",\n",
|
||||
h->m_pingInfo.m_totalDocsIndexed);
|
||||
|
||||
sb.safePrintf("\t\t\t\t\"percentMemUsed\":\"%.1f%%\",\n",
|
||||
h->m_pingInfo.m_percentMemUsed); // float
|
||||
|
||||
sb.safePrintf("\t\t\t\t\"cpuUsage\":\"%.1f%%\",\n",cpu);
|
||||
|
||||
sb.safePrintf("\t\t\t\t\"percentDiskUsed\":\"%s\",\n",
|
||||
diskUsageMsg);
|
||||
|
||||
sb.safePrintf("\t\t\t\t\"maxPing1\":\"%s\",\n",pms);
|
||||
|
||||
sb.safePrintf("\t\t\t\t\"maxPingAge1\":\"%" PRId32"ms\",\n",
|
||||
@ -799,13 +617,6 @@ skipReplaceHost:
|
||||
// docs indexed
|
||||
"<td>%" PRId32"</td>"
|
||||
|
||||
// percent mem used
|
||||
"<td>%s%.1f%%%s</td>"
|
||||
// cpu usage
|
||||
"<td>%.1f%%</td>"
|
||||
// disk usage
|
||||
"<td>%s</td>"
|
||||
|
||||
// ping max
|
||||
"<td>%s</td>"
|
||||
|
||||
@ -844,12 +655,6 @@ skipReplaceHost:
|
||||
|
||||
h->m_pingInfo.m_totalDocsIndexed,
|
||||
|
||||
fontTagFront,
|
||||
h->m_pingInfo.m_percentMemUsed, // float
|
||||
fontTagBack,
|
||||
cpu, // float
|
||||
diskUsageMsg,
|
||||
|
||||
// ping max
|
||||
pms,
|
||||
// ping age
|
||||
@ -889,13 +694,6 @@ skipReplaceHost:
|
||||
// end the table now
|
||||
sb.safePrintf ( "</table><br>\n" );
|
||||
|
||||
sb.safePrintf("<table>");
|
||||
for(int i=0; i<nh; i++) {
|
||||
Host *h = g_hostdb.getHost(hostSort[i]);
|
||||
sb.safePrintf("<tr><td>%lu</t><td>%lu</td></tr>", h->getLastRequestSendTimestamp(), h->getLastResponseReceiveTimestamp());
|
||||
}
|
||||
sb.safePrintf("</table>");
|
||||
|
||||
|
||||
if( g_hostdb.m_numSpareHosts ) {
|
||||
// print spare hosts table
|
||||
@ -1341,33 +1139,3 @@ int dgramsFromSort ( const void *i1, const void *i2 ) {
|
||||
if ( h1->m_dgramsFrom < h2->m_dgramsFrom ) return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int memUsedSort ( const void *i1, const void *i2 ) {
|
||||
Host *h1 = g_hostdb.getHost ( *(int32_t*)i1 );
|
||||
Host *h2 = g_hostdb.getHost ( *(int32_t*)i2 );
|
||||
PingInfo *p1 = &h1->m_pingInfo;
|
||||
PingInfo *p2 = &h2->m_pingInfo;
|
||||
if ( p1->m_percentMemUsed > p2->m_percentMemUsed ) return -1;
|
||||
if ( p1->m_percentMemUsed < p2->m_percentMemUsed ) return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cpuUsageSort ( const void *i1, const void *i2 ) {
|
||||
Host *h1 = g_hostdb.getHost ( *(int32_t*)i1 );
|
||||
Host *h2 = g_hostdb.getHost ( *(int32_t*)i2 );
|
||||
PingInfo *p1 = &h1->m_pingInfo;
|
||||
PingInfo *p2 = &h2->m_pingInfo;
|
||||
if ( p1->m_cpuUsage > p2->m_cpuUsage ) return -1;
|
||||
if ( p1->m_cpuUsage < p2->m_cpuUsage ) return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int diskUsageSort ( const void *i1, const void *i2 ) {
|
||||
Host *h1 = g_hostdb.getHost ( *(int32_t*)i1 );
|
||||
Host *h2 = g_hostdb.getHost ( *(int32_t*)i2 );
|
||||
PingInfo *p1 = &h1->m_pingInfo;
|
||||
PingInfo *p2 = &h2->m_pingInfo;
|
||||
if ( p1->m_diskUsage > p2->m_diskUsage ) return -1;
|
||||
if ( p1->m_diskUsage < p2->m_diskUsage ) return 1;
|
||||
return 0;
|
||||
}
|
||||
|
@ -2737,7 +2737,7 @@ badformat:
|
||||
if ( scr ) coll = scr->m_coll;
|
||||
|
||||
if ( si->m_format == FORMAT_HTML && printCached ) {
|
||||
sb->safePrintf ( "<a href=\"/get?q=%s&qlang=%s&c=%s&d=%" PRId64 "&cnsp=0\">cached</a>\n",
|
||||
sb->safePrintf ( "<a href=\"/get?q=%s&qlang=%s&c=%s&d=%" PRId64 "&cnsp=0\">cached</a> - \n",
|
||||
st->m_qesb.getBufStart() ,
|
||||
si->m_defaultSortLang, // "qlang" parm
|
||||
coll ,
|
||||
@ -2750,7 +2750,7 @@ badformat:
|
||||
if ( si->m_format == FORMAT_HTML && si->m_getDocIdScoringInfo ) {
|
||||
// place holder for backlink table link
|
||||
placeHolder = sb->length();
|
||||
sb->safePrintf (" - <a onclick="
|
||||
sb->safePrintf ("<a onclick="
|
||||
"\""
|
||||
"var e = document.getElementById('bl%" PRId32"');"
|
||||
"if ( e.style.display == 'none' ){"
|
||||
@ -2772,7 +2772,7 @@ badformat:
|
||||
placeHolderLen = sb->length() - placeHolder;
|
||||
|
||||
// unhide the scoring table on click
|
||||
sb->safePrintf (" - <a onclick="
|
||||
sb->safePrintf ("<a onclick="
|
||||
"\""
|
||||
"var e = document.getElementById('sc%" PRId32"');"
|
||||
"if ( e.style.display == 'none' ){"
|
||||
|
@ -1,10 +1,11 @@
|
||||
#include "PageTemperatureRegistry.h"
|
||||
#include "ScalingFunctions.h"
|
||||
#include "Log.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include <math.h>
|
||||
|
||||
PageTemperatureRegistry g_pageTemperatureRegistry;
|
||||
|
||||
@ -97,13 +98,18 @@ bool PageTemperatureRegistry::load() {
|
||||
|
||||
temperature_range_for_scaling = max_temperature-min_temperature;
|
||||
|
||||
min_temperature_log = log(min_temperature);
|
||||
max_temperature_log = log(max_temperature);
|
||||
temperature_range_for_scaling_log = log(temperature_range_for_scaling);
|
||||
default_temperature_log = log(default_temperature);
|
||||
|
||||
if(!using_meta)
|
||||
log(LOG_WARN, "meta-file %s could not be loaded. Using default temperature of %u which can scew results for new pages", meta_filename, default_temperature);
|
||||
|
||||
log(LOG_DEBUG, "pagetemp: min_temperature=%u",min_temperature);
|
||||
log(LOG_DEBUG, "pagetemp: max_temperature=%u",max_temperature);
|
||||
log(LOG_DEBUG, "pagetemp: default_temperature=%u",default_temperature);
|
||||
|
||||
|
||||
log(LOG_DEBUG, "%s loaded (%lu items)", filename, (unsigned long)new_entries);
|
||||
return true;
|
||||
}
|
||||
@ -129,11 +135,15 @@ unsigned PageTemperatureRegistry::query_page_temperature_internal(uint64_t docid
|
||||
}
|
||||
|
||||
|
||||
double PageTemperatureRegistry::query_page_temperature(uint64_t docid) const {
|
||||
double PageTemperatureRegistry::query_page_temperature(uint64_t docid, double range_min, double range_max) const {
|
||||
if(hash_table_size==0)
|
||||
return 1.0;
|
||||
unsigned temperature_26bit = query_page_temperature_internal(docid);
|
||||
return scale_linear(default_temperature_log, min_temperature_log, max_temperature_log, range_min, range_max);
|
||||
|
||||
double temperature_26bit_log = log((double)query_page_temperature_internal(docid));
|
||||
//Then scale to a number in the range [0..1]
|
||||
//It is a bit annoying to do this computation for each lookup but it saves memory
|
||||
return ((double)(temperature_26bit - min_temperature)) / temperature_range_for_scaling;
|
||||
// return ((double)(temperature_26bit - min_temperature)) / temperature_range_for_scaling;
|
||||
return scale_linear(temperature_26bit_log, min_temperature_log, max_temperature_log, range_min, range_max);
|
||||
}
|
||||
|
||||
|
||||
|
@ -15,6 +15,12 @@ class PageTemperatureRegistry {
|
||||
unsigned max_temperature;
|
||||
unsigned temperature_range_for_scaling;
|
||||
unsigned default_temperature;
|
||||
|
||||
double min_temperature_log;
|
||||
double max_temperature_log;
|
||||
double temperature_range_for_scaling_log;
|
||||
double default_temperature_log;
|
||||
|
||||
unsigned query_page_temperature_internal(uint64_t docid) const;
|
||||
public:
|
||||
PageTemperatureRegistry()
|
||||
@ -26,7 +32,7 @@ public:
|
||||
bool load();
|
||||
void unload();
|
||||
|
||||
double query_page_temperature(uint64_t docid) const;
|
||||
double query_page_temperature(uint64_t docid, double range_min, double range_max) const;
|
||||
|
||||
bool empty() const { return entries==0; }
|
||||
};
|
||||
|
22
Pages.cpp
22
Pages.cpp
@ -2411,25 +2411,6 @@ bool printRedBox ( SafeBuf *mb , TcpSocket *sock , HttpRequest *hr ) {
|
||||
mb->safePrintf("%s",boxEnd);
|
||||
}
|
||||
|
||||
// out of disk space?
|
||||
int32_t out = 0;
|
||||
for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
|
||||
Host *h = &g_hostdb.m_hosts[i];
|
||||
if ( h->m_pingInfo.m_diskUsage < 98.0 ) continue;
|
||||
out++;
|
||||
}
|
||||
if ( out > 0 ) {
|
||||
if ( adds ) mb->safePrintf("<br>");
|
||||
adds++;
|
||||
const char *s = "s are";
|
||||
if ( out == 1 ) s = " is";
|
||||
mb->safePrintf("%s",box);
|
||||
mb->safePrintf("%" PRId32" host%s over 98%% disk usage. "
|
||||
"See the <a href=/admin/hosts?c=%s>"
|
||||
"hosts</a> table.",out,s,coll);
|
||||
mb->safePrintf("%s",boxEnd);
|
||||
}
|
||||
|
||||
// injections disabled?
|
||||
if ( ! g_conf.m_injectionsEnabled ) {
|
||||
if ( adds ) mb->safePrintf("<br>");
|
||||
@ -2481,13 +2462,12 @@ bool printRedBox ( SafeBuf *mb , TcpSocket *sock , HttpRequest *hr ) {
|
||||
for ( int32_t i = 1 ; i < g_hostdb.getNumHosts() ; i++ ) {
|
||||
Host *h = &g_hostdb.m_hosts[i];
|
||||
if ( g_hostdb.isDead( h ) ) continue;
|
||||
if ( h->m_pingInfo.m_udpSlotsInUseIncoming>= 400)jammedHosts++;
|
||||
}
|
||||
if ( jammedHosts > 0 ) {
|
||||
if ( adds ) mb->safePrintf("<br>");
|
||||
adds++;
|
||||
const char *s = "s are";
|
||||
if ( out == 1 ) s = " is";
|
||||
if ( jammedHosts == 1 ) s = " is";
|
||||
mb->safePrintf("%s",box);
|
||||
mb->safePrintf("%" PRId32" host%s jammed with "
|
||||
"over %" PRId32" unhandled "
|
||||
|
407
Parms.cpp
407
Parms.cpp
@ -53,8 +53,40 @@ public:
|
||||
};
|
||||
|
||||
|
||||
//
// User configured values for these parms need to be adjusted to internal ranges.
//
// Each entry pairs a cgi parameter name (as submitted with the "fxui_" prefix
// stripped) with the divisor that convertUIToInternal() applies to values
// that are >= 1 in order to bring them into the range used internally.
//
static const struct {
	const char *name;	// cgi name of the parameter; points at a string literal, hence const
	float div_by;		// divisor applied to the user supplied value
} g_fxui_parms[] = {
	{"diversityweightmin",		100.0f},
	{"diversityweightmax",		100.0f},
	{"densityweightmin",		100.0f},
	{"densityweightmax",		100.0f},
	{"hgw_body",			10.0f},
	{"hgw_title",			10.0f},
	{"hgw_heading",			10.0f},
	{"hgw_list",			10.0f},
	{"hgw_metatag",			10.0f},
	{"hgw_inlinktext",		10.0f},
	{"hgw_intag",			10.0f},
	{"hgw_neighborhood",		10.0f},
	{"hgw_inmenu",			10.0f},
	{"hgw_inintlinktext",		10.0f},
	{"hgw_inurl",			10.0f},
	{"synonym_weight",		10.0f},
	{"termfreqweightfreqmin",	100.0f},
	{"termfreqweightfreqmax",	100.0f},
	{"termfreqweightmin",		100.0f},
	{"termfreqweightmax",		100.0f}
};

// Number of entries in g_fxui_parms, derived from the array so it cannot drift.
static const int g_num_fxui_parms = sizeof(g_fxui_parms) / sizeof(g_fxui_parms[0]);
|
||||
|
||||
Parms g_parms;
|
||||
|
||||
|
||||
Parm::Parm() {
|
||||
// Coverity
|
||||
m_title = NULL;
|
||||
@ -827,8 +859,7 @@ bool Parms::setGigablastRequest ( TcpSocket *socket ,
|
||||
//if ( (m->m_perms & user) == 0 ) continue;
|
||||
// set it. now our TYPE_CHARPTR will just be set to it directly
|
||||
// to save memory...
|
||||
setParm ( (char *)THIS , m, 0, v, false,//not html enc
|
||||
false ); // true );
|
||||
setParm ( (char *)THIS , m, 0, v);
|
||||
}
|
||||
|
||||
return true;
|
||||
@ -1962,12 +1993,63 @@ bool Parms::printParm( SafeBuf* sb,
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Convert external weights presented in the frontend UI to internal values
|
||||
//
|
||||
bool Parms::convertUIToInternal(const char *field_base_name, parameter_type_t type, const char *s, char *adjusted_value) {
|
||||
for(int fx=0; fx < g_num_fxui_parms; fx++) {
|
||||
if( strcmp(g_fxui_parms[fx].name, field_base_name) == 0 ) {
|
||||
|
||||
switch(type) {
|
||||
case TYPE_FLOAT: {
|
||||
float f = s ? (float)atof(s) : 0;
|
||||
if( f >= 1.0 && g_fxui_parms[fx].div_by > 1.0 ) {
|
||||
f = f / g_fxui_parms[fx].div_by;
|
||||
}
|
||||
snprintf(adjusted_value, 128, "%f", f);
|
||||
}
|
||||
return true;
|
||||
|
||||
case TYPE_DOUBLE: {
|
||||
double d = s ? (double)atof ( s ) : 0;
|
||||
if( d >= 1.0 && g_fxui_parms[fx].div_by > 1.0 ) {
|
||||
d = d / g_fxui_parms[fx].div_by;
|
||||
}
|
||||
snprintf(adjusted_value, 128, "%f", d);
|
||||
}
|
||||
return true;
|
||||
|
||||
case TYPE_INT32:
|
||||
case TYPE_INT32_CONST: {
|
||||
int32_t v = s ? atol(s) : 0;
|
||||
if( v >= 1 && (int32_t)g_fxui_parms[fx].div_by > 1 ) {
|
||||
v = v / (int32_t)g_fxui_parms[fx].div_by;
|
||||
}
|
||||
snprintf(adjusted_value, 128, "%" PRId32 "", v);
|
||||
}
|
||||
return true;
|
||||
|
||||
case TYPE_INT64: {
|
||||
int64_t i64 = s ? strtoull(s,NULL,10) : 0;
|
||||
if( i64 >= 1 && (int64_t)g_fxui_parms[fx].div_by > 1 ) {
|
||||
i64 = i64 / (int64_t)g_fxui_parms[fx].div_by;
|
||||
}
|
||||
snprintf(adjusted_value, 128, "%" PRId64 "", i64);
|
||||
}
|
||||
return true;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// now we use this to set SearchInput and GigablastRequest
|
||||
bool Parms::setFromRequest ( HttpRequest *r ,
|
||||
TcpSocket* s,
|
||||
CollectionRec *newcr ,
|
||||
char *THIS ,
|
||||
parameter_object_type_t objType) {
|
||||
bool Parms::setFromRequest(HttpRequest *r, TcpSocket *s, CollectionRec *newcr, char *THIS, parameter_object_type_t objType) {
|
||||
|
||||
// use convertHttpRequestToParmList() for these because they
|
||||
// are persistent records that are updated on every shard.
|
||||
@ -1985,31 +2067,47 @@ bool Parms::setFromRequest ( HttpRequest *r ,
|
||||
for(int32_t i = 0; i < r->getNumFields(); i++) {
|
||||
// get the value of cgi parm (null terminated)
|
||||
const char *v = r->getValue(i);
|
||||
if(!v)
|
||||
if(!v) {
|
||||
continue; //no value
|
||||
}
|
||||
// get cgi parm name
|
||||
const char *full_field_name = r->getField(i);
|
||||
size_t full_field_name_len = strlen(full_field_name);
|
||||
if(full_field_name_len>=128)
|
||||
if(full_field_name_len>=128) {
|
||||
continue;
|
||||
char field_base_name[128];
|
||||
int field_index;
|
||||
size_t nondigit_prefix_len = strcspn(full_field_name,"0123456789");
|
||||
if(nondigit_prefix_len!=full_field_name_len) {
|
||||
//field name contains digits. Split into base field name and index
|
||||
memcpy(field_base_name,full_field_name,nondigit_prefix_len);
|
||||
field_base_name[nondigit_prefix_len] = '\0';
|
||||
char *endptr = NULL;
|
||||
field_index = strtol(full_field_name+nondigit_prefix_len, &endptr, 10);
|
||||
if(field_index<0)
|
||||
continue; //hmm?
|
||||
if(endptr && *endptr)
|
||||
continue; //digits weren't the last part
|
||||
|
||||
} else {
|
||||
strcpy(field_base_name,full_field_name);
|
||||
field_index = 0;
|
||||
}
|
||||
|
||||
char field_base_name[128];
|
||||
bool uiconvert = false;
|
||||
int field_index=0;
|
||||
|
||||
//
|
||||
// To make user configuration of ranking parameters simpler, we sometimes
|
||||
// use other valid ranges in parameters than those used internally. Prefix
|
||||
// the param name with 'fxui_' and add the name and divisor to the global
|
||||
// table to automatically adjust external values to internal ones.
|
||||
//
|
||||
if( strncmp(full_field_name, "fxui_", 5) == 0 ) {
|
||||
strcpy(field_base_name, full_field_name+5);
|
||||
uiconvert=true;
|
||||
}
|
||||
else {
|
||||
size_t nondigit_prefix_len = strcspn(full_field_name,"0123456789");
|
||||
if(nondigit_prefix_len!=full_field_name_len) {
|
||||
//field name contains digits. Split into base field name and index
|
||||
memcpy(field_base_name,full_field_name,nondigit_prefix_len);
|
||||
field_base_name[nondigit_prefix_len] = '\0';
|
||||
char *endptr = NULL;
|
||||
field_index = strtol(full_field_name+nondigit_prefix_len, &endptr, 10);
|
||||
if(field_index<0)
|
||||
continue; //hmm?
|
||||
if(endptr && *endptr)
|
||||
continue; //digits weren't the last part
|
||||
} else {
|
||||
strcpy(field_base_name,full_field_name);
|
||||
}
|
||||
}
|
||||
|
||||
// find in parms list
|
||||
int32_t j;
|
||||
Parm *m;
|
||||
@ -2021,17 +2119,33 @@ bool Parms::setFromRequest ( HttpRequest *r ,
|
||||
strcmp(field_base_name,m->m_cgi) == 0)
|
||||
break; //found it
|
||||
}
|
||||
if(j >= m_numParms)
|
||||
if(j >= m_numParms) {
|
||||
continue; //cgi parm name not found
|
||||
if(field_index>0 && field_index>m->m_max)
|
||||
}
|
||||
|
||||
if(field_index>0 && field_index>m->m_max) {
|
||||
continue; //out-of-bounds
|
||||
}
|
||||
|
||||
// . skip if no value was provided
|
||||
// . unless it was a string! so we can make them empty.
|
||||
if(v[0] == '\0' &&
|
||||
m->m_type != TYPE_STRING &&
|
||||
m->m_type != TYPE_STRINGBOX) continue;
|
||||
m->m_type != TYPE_STRINGBOX) {
|
||||
continue;
|
||||
}
|
||||
|
||||
char adjusted_value[128];
|
||||
if( uiconvert ) {
|
||||
if( !convertUIToInternal(field_base_name, m->m_type, v, adjusted_value) ) {
|
||||
log(LOG_ERROR, "Could not convert value of '%s' for '%s'", field_base_name, v);
|
||||
continue;
|
||||
}
|
||||
v = adjusted_value;
|
||||
}
|
||||
|
||||
// set it
|
||||
setParm(THIS, m, field_index, v, false, false);
|
||||
setParm(THIS, m, field_index, v);
|
||||
}
|
||||
|
||||
return true;
|
||||
@ -2078,7 +2192,7 @@ bool Parms::insertParm ( int32_t i , int32_t an , char *THIS ) {
|
||||
*(int32_t *)(THIS + m->m_arrayCountOffset) = *(int32_t *)(THIS + m->m_arrayCountOffset)+1;
|
||||
|
||||
// put the defaults in the inserted line
|
||||
setParm ( (char *)THIS , m, an , m->m_def , false ,false );
|
||||
setParm ( (char *)THIS , m, an , m->m_def);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -2128,9 +2242,7 @@ bool Parms::removeParm ( int32_t i , int32_t an , char *THIS ) {
|
||||
|
||||
|
||||
|
||||
void Parms::setParm(char *THIS, Parm *m, int32_t array_index, const char *s, bool isHtmlEncoded, bool fromRequest) {
|
||||
|
||||
if ( fromRequest ) { g_process.shutdownAbort(true); }
|
||||
void Parms::setParm(char *THIS, Parm *m, int32_t array_index, const char *s) {
|
||||
|
||||
// . this is just for setting CollectionRecs, so skip if offset < 0
|
||||
// . some parms are just for SearchInput (search parms)
|
||||
@ -2170,8 +2282,6 @@ void Parms::setParm(char *THIS, Parm *m, int32_t array_index, const char *s, boo
|
||||
case TYPE_BOOL:
|
||||
case TYPE_PRIORITY: {
|
||||
char *ptr = (char*)THIS + m->m_off + sizeof(char)*array_index;
|
||||
if ( fromRequest && *(char*)ptr == atol(s))
|
||||
return;
|
||||
*(char*)ptr = s ? atol(s) : 0;
|
||||
break;
|
||||
}
|
||||
@ -2191,25 +2301,16 @@ void Parms::setParm(char *THIS, Parm *m, int32_t array_index, const char *s, boo
|
||||
}
|
||||
case TYPE_FLOAT: {
|
||||
char *ptr = (char*)THIS + m->m_off + sizeof(float)*array_index;
|
||||
if( fromRequest && almostEqualFloat(*(float *)ptr, (s ? (float)atof(s) : 0)) ) {
|
||||
return;
|
||||
}
|
||||
|
||||
*(float*)ptr = s ? (float)atof ( s ) : 0;
|
||||
break;
|
||||
}
|
||||
case TYPE_DOUBLE: {
|
||||
char *ptr = (char*)THIS + m->m_off + sizeof(double)*array_index;
|
||||
if( fromRequest && almostEqualFloat(*(double*)ptr, ( s ? (double)atof(s) : 0)) ) {
|
||||
return;
|
||||
}
|
||||
*(double*)ptr = s ? (double)atof ( s ) : 0;
|
||||
break;
|
||||
}
|
||||
case TYPE_IP: {
|
||||
char *ptr = (char*)THIS + m->m_off + sizeof(int32_t)*array_index;
|
||||
if ( fromRequest && *(int32_t*)ptr == (s ? (int32_t)atoip(s,strlen(s)) : 0) )
|
||||
return;
|
||||
*(int32_t*)ptr = s ? (int32_t)atoip(s,strlen(s)) : 0;
|
||||
break;
|
||||
}
|
||||
@ -2219,16 +2320,11 @@ void Parms::setParm(char *THIS, Parm *m, int32_t array_index, const char *s, boo
|
||||
int32_t v = s ? atol(s) : 0;
|
||||
// min is considered valid if >= 0
|
||||
if ( m->m_min >= 0 && v < m->m_min ) v = m->m_min;
|
||||
if ( fromRequest && *(int32_t *)ptr == v )
|
||||
return;
|
||||
*(int32_t *)ptr = v;
|
||||
break;
|
||||
}
|
||||
case TYPE_INT64: {
|
||||
char *ptr = (char*)THIS + m->m_off + sizeof(int64_t)*array_index;
|
||||
if ( fromRequest && *(uint64_t*)ptr == ( s ? strtoull(s,NULL,10) : 0) ) {
|
||||
return;
|
||||
}
|
||||
*(int64_t*)ptr = s ? strtoull(s,NULL,10) : 0;
|
||||
break;
|
||||
}
|
||||
@ -2240,18 +2336,9 @@ void Parms::setParm(char *THIS, Parm *m, int32_t array_index, const char *s, boo
|
||||
// SafeBufs "array_index" is the # in the array, starting at 0
|
||||
char *ptr = (char*)THIS + m->m_off + sizeof(SafeBuf)*array_index;
|
||||
SafeBuf *sb = (SafeBuf *)ptr;
|
||||
int32_t oldLen = sb->length();
|
||||
// why was this commented out??? we need it now that we
|
||||
// send email alerts when parms change!
|
||||
if ( fromRequest &&
|
||||
! isHtmlEncoded && oldLen == len &&
|
||||
memcmp ( sb->getBufStart() , s , len ) == 0 )
|
||||
return;
|
||||
// nuke it
|
||||
sb->purge();
|
||||
// this means that we can not use string POINTERS as parms!!
|
||||
if ( ! isHtmlEncoded ) sb->safeMemcpy ( s , len );
|
||||
else len = sb->htmlDecode (s,len);
|
||||
sb->safeMemcpy ( s , len );
|
||||
// tag it
|
||||
sb->setLabel ( "parm1" );
|
||||
// ensure null terminated
|
||||
@ -2267,22 +2354,11 @@ void Parms::setParm(char *THIS, Parm *m, int32_t array_index, const char *s, boo
|
||||
int32_t len = strlen(s);
|
||||
if ( len >= m->m_size ) len = m->m_size - 1; // truncate!!
|
||||
char *dst = THIS + m->m_off + m->m_size*array_index;
|
||||
// why was this commented out??? we need it now that we
|
||||
// send email alerts when parms change!
|
||||
if ( fromRequest &&
|
||||
! isHtmlEncoded && (int32_t)strlen(dst) == len &&
|
||||
memcmp ( dst , s , len ) == 0 ) {
|
||||
return;
|
||||
}
|
||||
|
||||
// this means that we can not use string POINTERS as parms!!
|
||||
if ( !isHtmlEncoded ) {
|
||||
gbmemcpy( dst, s, len );
|
||||
} else {
|
||||
len = htmlDecode( dst, s, len, false );
|
||||
}
|
||||
|
||||
gbmemcpy( dst, s, len );
|
||||
dst[len] = '\0';
|
||||
|
||||
// . might have to set length
|
||||
// . used for CollectionRec::m_htmlHeadLen and m_htmlTailLen
|
||||
if ( m->m_plen >= 0 )
|
||||
@ -2295,14 +2371,9 @@ void Parms::setParm(char *THIS, Parm *m, int32_t array_index, const char *s, boo
|
||||
log(LOG_LOGIC,"admin: attempt to set parameter %s from cgi-request", m->m_title);
|
||||
return;
|
||||
}
|
||||
|
||||
// do not send if setting from startup
|
||||
if ( ! fromRequest ) return;
|
||||
|
||||
// note it in the log
|
||||
log("admin: parm \"%s\" changed value",m->m_title);
|
||||
}
|
||||
|
||||
|
||||
void Parms::setToDefault(char *THIS, parameter_object_type_t objType, CollectionRec *argcr) {
|
||||
// init if we should
|
||||
init();
|
||||
@ -2344,7 +2415,7 @@ void Parms::setToDefault(char *THIS, parameter_object_type_t objType, Collection
|
||||
char *dst = THIS + m->m_off;
|
||||
memcpy(dst, raw_default, m->m_size);
|
||||
} else
|
||||
setParm(THIS , m, 0, m->m_def, false/*not enc.*/, false );
|
||||
setParm(THIS , m, 0, m->m_def);
|
||||
} else if(m->m_fixed<=0) {
|
||||
//variable-sized array
|
||||
//empty it
|
||||
@ -2357,7 +2428,7 @@ void Parms::setToDefault(char *THIS, parameter_object_type_t objType, Collection
|
||||
memcpy(dst, raw_default, m->m_size);
|
||||
raw_default = ((char*)raw_default) + m->m_size;
|
||||
} else
|
||||
setParm(THIS, m, k, m->m_def, false/*not enc.*/, false);
|
||||
setParm(THIS, m, k, m->m_def);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -2485,7 +2556,7 @@ bool Parms::setFromFile ( void *THIS ,
|
||||
v[nb] = '\0';
|
||||
|
||||
// set our parm
|
||||
setParm( (char *)THIS, m, j, v, false, false );
|
||||
setParm( (char *)THIS, m, j, v);
|
||||
|
||||
// we were set from the explicit file
|
||||
//((CollectionRec *)THIS)->m_orig[i] = 2;
|
||||
@ -2569,7 +2640,7 @@ bool Parms::setFromFile ( void *THIS ,
|
||||
v[nb] = '\0';
|
||||
|
||||
// set our parm
|
||||
setParm( (char *)THIS, m, j, v, false /*is html encoded?*/, false );
|
||||
setParm( (char *)THIS, m, j, v);
|
||||
|
||||
// do not repeat same node
|
||||
nn++;
|
||||
@ -3519,27 +3590,48 @@ void Parms::init ( ) {
|
||||
m++;
|
||||
|
||||
|
||||
m->m_title = "diversityWeightMin";
|
||||
m->m_desc = "diversityWeightMin";
|
||||
m->m_cgi = "diversity_weight_min";
|
||||
simple_m_set(SearchInput,m_diversityWeightMin);
|
||||
m->m_defOff2 = offsetof(Conf,m_diversityWeightMin);
|
||||
m->m_title = "termfreq min";
|
||||
m->m_desc = "Term frequency estimate minimum";
|
||||
m->m_cgi = "termfreqweightfreqmin";
|
||||
simple_m_set(Conf,m_termFreqWeightFreqMin);
|
||||
simple_m_set(SearchInput,m_termFreqWeightFreqMin);
|
||||
m->m_defOff2 = offsetof(Conf,m_termFreqWeightFreqMin);
|
||||
m->m_def = "0.000000";
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m++;
|
||||
|
||||
m->m_title = "termfreq max";
|
||||
m->m_desc = "Term frequency estimate maximum";
|
||||
m->m_cgi = "termfreqweightfreqmax";
|
||||
simple_m_set(SearchInput,m_termFreqWeightFreqMax);
|
||||
m->m_defOff2 = offsetof(Conf,m_termFreqWeightFreqMax);
|
||||
m->m_def = "0.500000";
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m++;
|
||||
|
||||
m->m_title = "termfreq weight min";
|
||||
m->m_desc = "Term frequency weight minimum";
|
||||
m->m_cgi = "termfreqweightmin";
|
||||
simple_m_set(SearchInput,m_termFreqWeightMin);
|
||||
m->m_defOff2 = offsetof(Conf,m_termFreqWeightMin);
|
||||
m->m_def = "0.500000";
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m++;
|
||||
|
||||
m->m_title = "termfreq weight max";
|
||||
m->m_desc = "Term frequency weight maximum";
|
||||
m->m_cgi = "termfreqweightmax";
|
||||
simple_m_set(SearchInput,m_termFreqWeightMax);
|
||||
m->m_defOff2 = offsetof(Conf,m_termFreqWeightMax);
|
||||
m->m_def = "1.000000";
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m++;
|
||||
|
||||
m->m_title = "diversityWeightMax";
|
||||
m->m_desc = "diversityWeightMax";
|
||||
m->m_cgi = "diversity_weight_max";
|
||||
simple_m_set(SearchInput,m_diversityWeightMax);
|
||||
m->m_defOff2 = offsetof(Conf,m_diversityWeightMax);
|
||||
m->m_def = "1.000000";
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m++;
|
||||
|
||||
|
||||
m->m_title = "densityWeightMin";
|
||||
m->m_desc = "densityWeightMin";
|
||||
m->m_cgi = "density_weight_min";
|
||||
m->m_cgi = "densityweightmin";
|
||||
simple_m_set(SearchInput,m_densityWeightMin);
|
||||
m->m_defOff2 = offsetof(Conf,m_densityWeightMin);
|
||||
m->m_def = "0.350000";
|
||||
@ -3548,16 +3640,34 @@ void Parms::init ( ) {
|
||||
|
||||
m->m_title = "densityWeightMax";
|
||||
m->m_desc = "densityWeightMax";
|
||||
m->m_cgi = "density_weight_max";
|
||||
m->m_cgi = "densityweightmax";
|
||||
simple_m_set(SearchInput,m_densityWeightMax);
|
||||
m->m_defOff2 = offsetof(Conf,m_densityWeightMax);
|
||||
m->m_def = "1.000000";
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m++;
|
||||
|
||||
m->m_title = "diversityWeightMin";
|
||||
m->m_desc = "diversityWeightMin";
|
||||
m->m_cgi = "diversityweightmin";
|
||||
simple_m_set(SearchInput,m_diversityWeightMin);
|
||||
m->m_defOff2 = offsetof(Conf,m_diversityWeightMin);
|
||||
m->m_def = "1.000000";
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m++;
|
||||
|
||||
m->m_title = "diversityWeightMax";
|
||||
m->m_desc = "diversityWeightMax";
|
||||
m->m_cgi = "diversityweightmax";
|
||||
simple_m_set(SearchInput,m_diversityWeightMax);
|
||||
m->m_defOff2 = offsetof(Conf,m_diversityWeightMax);
|
||||
m->m_def = "1.000000";
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m++;
|
||||
|
||||
m->m_title = "hashGroupWeightBody";
|
||||
m->m_desc = "hashGroupWeightBody";
|
||||
m->m_cgi = "hash_group_weight_body";
|
||||
m->m_cgi = "hgw_body";
|
||||
simple_m_set(SearchInput,m_hashGroupWeightBody);
|
||||
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightBody);
|
||||
m->m_def = "1.000000";
|
||||
@ -3566,7 +3676,7 @@ void Parms::init ( ) {
|
||||
|
||||
m->m_title = "hashGroupWeightTitle";
|
||||
m->m_desc = "hashGroupWeightTitle";
|
||||
m->m_cgi = "hashGroupWeightTitle";
|
||||
m->m_cgi = "hgw_title";
|
||||
simple_m_set(SearchInput,m_hashGroupWeightTitle);
|
||||
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightTitle);
|
||||
m->m_def = "8.000000";
|
||||
@ -3575,7 +3685,7 @@ void Parms::init ( ) {
|
||||
|
||||
m->m_title = "hashGroupWeightHeading";
|
||||
m->m_desc = "hashGroupWeightHeading";
|
||||
m->m_cgi = "hash_group_weight_heading";
|
||||
m->m_cgi = "hgw_heading";
|
||||
simple_m_set(SearchInput,m_hashGroupWeightHeading);
|
||||
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightHeading);
|
||||
m->m_def = "1.500000";
|
||||
@ -3584,7 +3694,7 @@ void Parms::init ( ) {
|
||||
|
||||
m->m_title = "hashGroupWeightInlist";
|
||||
m->m_desc = "hashGroupWeightInlist";
|
||||
m->m_cgi = "hash_group_weight_inlist";
|
||||
m->m_cgi = "hgw_list";
|
||||
simple_m_set(SearchInput,m_hashGroupWeightInlist);
|
||||
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightInlist);
|
||||
m->m_def = "0.300000";
|
||||
@ -3593,7 +3703,7 @@ void Parms::init ( ) {
|
||||
|
||||
m->m_title = "hashGroupWeightInMetaTag";
|
||||
m->m_desc = "hashGroupWeightInMetaTag";
|
||||
m->m_cgi = "hash_group_weight_in_meta_tag";
|
||||
m->m_cgi = "hgw_metatag";
|
||||
simple_m_set(SearchInput,m_hashGroupWeightInMetaTag);
|
||||
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightInMetaTag);
|
||||
m->m_def = "0.100000";
|
||||
@ -3602,7 +3712,7 @@ void Parms::init ( ) {
|
||||
|
||||
m->m_title = "hashGroupWeightInLinkText";
|
||||
m->m_desc = "hashGroupWeightInLinkText";
|
||||
m->m_cgi = "hash_group_weight_in_link_text";
|
||||
m->m_cgi = "hgw_inlinktext";
|
||||
simple_m_set(SearchInput,m_hashGroupWeightInLinkText);
|
||||
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightInLinkText);
|
||||
m->m_def = "16.000000";
|
||||
@ -3611,7 +3721,7 @@ void Parms::init ( ) {
|
||||
|
||||
m->m_title = "hashGroupWeightInTag";
|
||||
m->m_desc = "hashGroupWeightInTag";
|
||||
m->m_cgi = "hash_group_weight_in_tag";
|
||||
m->m_cgi = "hgw_intag";
|
||||
simple_m_set(SearchInput,m_hashGroupWeightInTag);
|
||||
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightInTag);
|
||||
m->m_def = "1.000000";
|
||||
@ -3620,7 +3730,7 @@ void Parms::init ( ) {
|
||||
|
||||
m->m_title = "hashGroupWeightNeighborhood";
|
||||
m->m_desc = "hashGroupWeightNeighborhood";
|
||||
m->m_cgi = "hash_group_weight_neighborhood";
|
||||
m->m_cgi = "hgw_neighborhood";
|
||||
simple_m_set(SearchInput,m_hashGroupWeightNeighborhood);
|
||||
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightNeighborhood);
|
||||
m->m_def = "0.000000";
|
||||
@ -3629,7 +3739,7 @@ void Parms::init ( ) {
|
||||
|
||||
m->m_title = "hashGroupWeightInternalLinkText";
|
||||
m->m_desc = "hashGroupWeightInternalLinkText";
|
||||
m->m_cgi = "hash_group_weight_internal_link_text";
|
||||
m->m_cgi = "hgw_inintlinktext";
|
||||
simple_m_set(SearchInput,m_hashGroupWeightInternalLinkText);
|
||||
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightInternalLinkText);
|
||||
m->m_def = "4.000000";
|
||||
@ -3638,7 +3748,7 @@ void Parms::init ( ) {
|
||||
|
||||
m->m_title = "hashGroupWeightInUrl";
|
||||
m->m_desc = "hashGroupWeightInUrl";
|
||||
m->m_cgi = "hash_group_weight_in_url";
|
||||
m->m_cgi = "hgw_inurl";
|
||||
simple_m_set(SearchInput,m_hashGroupWeightInUrl);
|
||||
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightInUrl);
|
||||
m->m_def = "1.000000";
|
||||
@ -3647,7 +3757,7 @@ void Parms::init ( ) {
|
||||
|
||||
m->m_title = "hashGroupWeightInMenu";
|
||||
m->m_desc = "hashGroupWeightInMenu";
|
||||
m->m_cgi = "hash_group_weight_in_menu";
|
||||
m->m_cgi = "hgw_inmenu";
|
||||
simple_m_set(SearchInput,m_hashGroupWeightInMenu);
|
||||
m->m_defOff2 = offsetof(Conf,m_hashGroupWeightInMenu);
|
||||
m->m_def = "0.200000";
|
||||
@ -3665,6 +3775,26 @@ void Parms::init ( ) {
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m++;
|
||||
|
||||
m->m_title = "Page temp weight min";
|
||||
m->m_desc = "Page temp is scaled to be between the min and max";
|
||||
m->m_cgi = "pagetempweightmin";
|
||||
simple_m_set(SearchInput,m_pageTemperatureWeightMin);
|
||||
m->m_defOff2 = offsetof(Conf,m_pageTemperatureWeightMin);
|
||||
m->m_def = "1.000000";
|
||||
m->m_flags = PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m++;
|
||||
|
||||
m->m_title = "Page temp weight max";
|
||||
m->m_desc = "Page temp is scaled to be between the min and max";
|
||||
m->m_cgi = "pagetempweightmax";
|
||||
simple_m_set(SearchInput,m_pageTemperatureWeightMax);
|
||||
m->m_defOff2 = offsetof(Conf,m_pageTemperatureWeightMax);
|
||||
m->m_def = "20.000000";
|
||||
m->m_flags = PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m++;
|
||||
|
||||
m->m_title = "Use page temperature";
|
||||
m->m_desc = "Use page temperature (if available) for ranking";
|
||||
m->m_cgi = "use_page_temperature";
|
||||
@ -3741,6 +3871,18 @@ void Parms::init ( ) {
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m++;
|
||||
|
||||
m->m_title = "unknown language weight";
|
||||
m->m_desc = "Use this to override the default uknown language weight "
|
||||
"for this collection. We multiply a result's score by this value "
|
||||
"if the user requested a specific language, but the language of the "
|
||||
"indexed page could not be determined.";
|
||||
simple_m_set(SearchInput,m_unknownLangWeight);
|
||||
m->m_defOff= offsetof(CollectionRec,m_unknownLangWeight);
|
||||
m->m_cgi = "ulangw";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m++;
|
||||
|
||||
m->m_title = "max query terms";
|
||||
m->m_desc = "Do not allow more than this many query terms. Helps "
|
||||
"prevent big queries from resource hogging.";
|
||||
@ -3809,11 +3951,10 @@ void Parms::init ( ) {
|
||||
|
||||
m->m_title = "language weight";
|
||||
m->m_desc = "Default language weight if document matches query "
|
||||
"language. Use this to give results that match the specified "
|
||||
"the specified &qlang higher ranking, or docs whose language "
|
||||
"is unknown. Can be overridden with "
|
||||
"language. Use this to give results that match "
|
||||
"the specified &qlang higher ranking. Can be overridden with "
|
||||
"&langw in the query url.";
|
||||
m->m_cgi = "langweight";
|
||||
m->m_cgi = "langw";
|
||||
simple_m_set(CollectionRec,m_sameLangWeight);
|
||||
m->m_def = "20.000000";
|
||||
m->m_group = true;
|
||||
@ -3821,6 +3962,21 @@ void Parms::init ( ) {
|
||||
m->m_page = PAGE_RANKING;
|
||||
m++;
|
||||
|
||||
m->m_title = "unknown language weight";
|
||||
m->m_desc = "Default language weight if query language is specified but document "
|
||||
"language could not be determined. Use this to give docs with unknown language a "
|
||||
"higher ranking when qlang is specified. Can be overridden with "
|
||||
"&ulangw in the query url.";
|
||||
m->m_cgi = "ulangw";
|
||||
simple_m_set(CollectionRec,m_unknownLangWeight);
|
||||
m->m_def = "10.000000";
|
||||
m->m_group = true;
|
||||
m->m_flags = PF_REBUILDRANKINGSETTINGS;
|
||||
m->m_page = PAGE_RANKING;
|
||||
m++;
|
||||
|
||||
|
||||
|
||||
m->m_title = "termfreq min";
|
||||
m->m_desc = "Term frequency estimate minimum";
|
||||
m->m_cgi = "termfreqweightfreqmin";
|
||||
@ -3901,6 +4057,8 @@ void Parms::init ( ) {
|
||||
m->m_page = PAGE_RANKING;
|
||||
m++;
|
||||
|
||||
|
||||
|
||||
m->m_title = "Hashgroup weight - body";
|
||||
m->m_desc = "";
|
||||
m->m_cgi = "hgw_body";
|
||||
@ -3953,7 +4111,7 @@ void Parms::init ( ) {
|
||||
|
||||
m->m_title = "Hashgroup weight - in link text";
|
||||
m->m_desc = "";
|
||||
m->m_cgi = "hgw_innlinktext";
|
||||
m->m_cgi = "hgw_inlinktext";
|
||||
simple_m_set(Conf,m_hashGroupWeightInLinkText);
|
||||
m->m_def = "16.000000";
|
||||
m->m_group = false;
|
||||
@ -4021,6 +4179,26 @@ void Parms::init ( ) {
|
||||
m->m_page = PAGE_RANKING;
|
||||
m++;
|
||||
|
||||
m->m_title = "Page temp weight min";
|
||||
m->m_desc = "Page temp is scaled to be between the min and max";
|
||||
m->m_cgi = "pagetempweightmin";
|
||||
simple_m_set(Conf,m_pageTemperatureWeightMin);
|
||||
m->m_def = "1.000000";
|
||||
m->m_group = false;
|
||||
m->m_flags = PF_REBUILDRANKINGSETTINGS;
|
||||
m->m_page = PAGE_RANKING;
|
||||
m++;
|
||||
|
||||
m->m_title = "Page temp weight max";
|
||||
m->m_desc = "Page temp is scaled to be between the min and max";
|
||||
m->m_cgi = "pagetempweightmax";
|
||||
simple_m_set(Conf,m_pageTemperatureWeightMax);
|
||||
m->m_def = "20.000000";
|
||||
m->m_group = false;
|
||||
m->m_flags = PF_REBUILDRANKINGSETTINGS;
|
||||
m->m_page = PAGE_RANKING;
|
||||
m++;
|
||||
|
||||
m->m_title = "Use page temperature";
|
||||
m->m_desc = "Use page temperature (if available) for ranking";
|
||||
m->m_cgi = "use_page_temperature";
|
||||
@ -4047,6 +4225,7 @@ void Parms::init ( ) {
|
||||
m->m_page = PAGE_RANKING;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m++;
|
||||
|
||||
m->m_title = "Rank adjustment";
|
||||
m->m_cgi = "flag_rerank";
|
||||
m->m_xml = "RankAdjustment";
|
||||
|
3
Parms.h
3
Parms.h
@ -198,7 +198,7 @@ class Parms {
|
||||
bool insertParm ( int32_t i , int32_t an , char *THIS ) ;
|
||||
bool removeParm ( int32_t i , int32_t an , char *THIS ) ;
|
||||
|
||||
void setParm(char *THIS, Parm *m, int32_t array_index, const char *s, bool isHtmlEncoded, bool fromRequest);
|
||||
void setParm(char *THIS, Parm *m, int32_t array_index, const char *s);
|
||||
|
||||
void setToDefault(char *THIS, parameter_object_type_t objType,
|
||||
CollectionRec *argcr );
|
||||
@ -224,6 +224,7 @@ class Parms {
|
||||
|
||||
Parm *getParm(int32_t i) { return m_parms+i; }
|
||||
int32_t getNumParms() const { return m_numParms; }
|
||||
bool convertUIToInternal(const char *field_base_name, parameter_type_t type, const char *s, char *adjusted_value);
|
||||
|
||||
private:
|
||||
//
|
||||
|
@ -202,22 +202,17 @@ void PingServer::pingHost ( Host *h , uint32_t ip , uint16_t port ) {
|
||||
//first we update our pinginfo
|
||||
PingInfo newPingInfo;
|
||||
|
||||
newPingInfo.m_numCorruptDiskReads = g_numCorrupt;
|
||||
newPingInfo.m_numOutOfMems = g_mem.getOOMCount();
|
||||
newPingInfo.m_socketsClosedFromHittingLimit = g_stats.m_closedSockets;
|
||||
newPingInfo.m_currentSpiders = g_spiderLoop.getNumSpidersOut();
|
||||
newPingInfo.m_unused9 = 0;
|
||||
newPingInfo.m_unused3 = 0;
|
||||
newPingInfo.m_unused11 = 0;
|
||||
newPingInfo.m_unused14 = 0;
|
||||
|
||||
// let the receiver know our repair mode
|
||||
newPingInfo.m_repairMode = g_repairMode;
|
||||
|
||||
int32_t l_loadavg = (int32_t) (g_process.getLoadAvg() * 100.0);
|
||||
//gbmemcpy(p, &l_loadavg, sizeof(int32_t)); p += sizeof(int32_t);
|
||||
newPingInfo.m_loadAvg = l_loadavg ;
|
||||
newPingInfo.m_unused2 = 0;
|
||||
|
||||
// then our percent mem used
|
||||
float mem = g_mem.getUsedMemPercentage();
|
||||
//*(float *)p = mem ; p += sizeof(float); // 4 bytes
|
||||
newPingInfo.m_percentMemUsed = mem;
|
||||
newPingInfo.m_unused3 = 0;
|
||||
|
||||
// our num recs, docsIndexed
|
||||
newPingInfo.m_totalDocsIndexed = (int32_t)g_process.getTotalDocsIndexed();
|
||||
@ -229,7 +224,7 @@ void PingServer::pingHost ( Host *h , uint32_t ip , uint16_t port ) {
|
||||
if ( g_hostdb.getCRC() == 0 ) { g_process.shutdownAbort(true); }
|
||||
|
||||
// disk usage (df -ka)
|
||||
newPingInfo.m_diskUsage = g_process.m_diskUsage;
|
||||
newPingInfo.m_unused7 = 0.0;
|
||||
|
||||
// flags indicating our state
|
||||
int32_t flags = 0;
|
||||
@ -247,9 +242,7 @@ void PingServer::pingHost ( Host *h , uint32_t ip , uint16_t port ) {
|
||||
if ( g_dailyMerge.m_mergeMode ==0 || g_dailyMerge.m_mergeMode == 6 )
|
||||
flags |= PFLAG_MERGEMODE0OR6;
|
||||
|
||||
uint8_t rv8 = (uint8_t)g_recoveryLevel;
|
||||
if ( g_recoveryLevel > 255 ) rv8 = 255;
|
||||
newPingInfo.m_recoveryLevel = rv8;
|
||||
newPingInfo.m_unused18 = 0;
|
||||
|
||||
//*(int32_t *)p = flags; p += 4; // 4 bytes
|
||||
newPingInfo.m_flags = flags;
|
||||
@ -263,12 +256,11 @@ void PingServer::pingHost ( Host *h , uint32_t ip , uint16_t port ) {
|
||||
|
||||
newPingInfo.m_unused0 = 0;
|
||||
|
||||
newPingInfo.m_udpSlotsInUseIncoming = g_udpServer.getNumUsedSlotsIncoming();
|
||||
newPingInfo.m_unused12 = 0;
|
||||
|
||||
newPingInfo.m_tcpSocketsInUse = g_httpServer.m_tcp.m_numUsed;
|
||||
newPingInfo.m_unused13 = 0;
|
||||
|
||||
// from Loop.cpp
|
||||
newPingInfo.m_cpuUsage = 0.0;
|
||||
newPingInfo.m_unused4 = 0.0;
|
||||
|
||||
// store the gbVersionStrBuf now, just a date with a \0 included
|
||||
char *v = getVersion();
|
||||
@ -369,18 +361,6 @@ void PingServer::gotReplyWrapperP(void *state, UdpSlot *slot) {
|
||||
// he is back up then we are free to send another alert about
|
||||
// any other host that goes down
|
||||
if ( h->m_hostId == s_lastSentHostId ) s_lastSentHostId = -1;
|
||||
|
||||
if ( h->m_pingInfo.m_percentMemUsed >= 99.0 &&
|
||||
h->m_firstOOMTime == 0 )
|
||||
h->m_firstOOMTime = nowms;
|
||||
if ( h->m_pingInfo.m_percentMemUsed < 99.0 )
|
||||
h->m_firstOOMTime = 0LL;
|
||||
// if this host is alive and has been at 99% or more mem usage
|
||||
// for the last X minutes, and we have got at least 10 ping replies
|
||||
// from him, then send an email alert.
|
||||
if ( h->m_pingInfo.m_percentMemUsed >= 99.0 &&
|
||||
nowms - h->m_firstOOMTime >= g_conf.m_sendEmailTimeout )
|
||||
g_pingServer.sendEmail ( h , NULL , true );
|
||||
} else {
|
||||
// . if his ping was dead, try to send an email alert to the admin
|
||||
// . returns false if blocked, true otherwise
|
||||
|
@ -246,19 +246,16 @@ float PosdbTable::getBestScoreSumForSingleTerm(int32_t i, const char *wpi, const
|
||||
unsigned char div = Posdb::getDiversityRank ( wpi );
|
||||
score *= m_msg39req->m_scoringWeights.m_diversityWeights[div];
|
||||
score *= m_msg39req->m_scoringWeights.m_diversityWeights[div];
|
||||
|
||||
// hash group? title? body? heading? etc.
|
||||
unsigned char hg = Posdb::getHashGroup ( wpi );
|
||||
unsigned char mhg = hg;
|
||||
if ( s_inBody[mhg] ) mhg = HASHGROUP_BODY;
|
||||
score *= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg];
|
||||
score *= m_msg39req->m_scoringWeights.m_hashGroupWeights[hg];
|
||||
|
||||
// good density?
|
||||
unsigned char dens = Posdb::getDensityRank ( wpi );
|
||||
score *= m_msg39req->m_scoringWeights.m_densityWeights[dens];
|
||||
score *= m_msg39req->m_scoringWeights.m_densityWeights[dens];
|
||||
|
||||
// to make more compatible with pair scores divide by distance of 2
|
||||
//score /= 2.0;
|
||||
|
||||
@ -452,6 +449,7 @@ float PosdbTable::getBestScoreSumForSingleTerm(int32_t i, const char *wpi, const
|
||||
sx->m_densityRank = Posdb::getDensityRank(maxp);
|
||||
|
||||
float score = bestScores[k];
|
||||
|
||||
//score *= ts;
|
||||
score *= m_freqWeights[i];
|
||||
score *= m_freqWeights[i];
|
||||
@ -3937,6 +3935,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if( currPassNum == INTERSECT_SCORING ) {
|
||||
//
|
||||
// Pre-advance each termlist's cursor to skip to next docid.
|
||||
@ -4094,7 +4093,6 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
|
||||
minSingleScore *= completeScoreMultiplier;
|
||||
|
||||
|
||||
//#
|
||||
//# DOCID / SITERANK DETECTION
|
||||
//#
|
||||
@ -4128,7 +4126,6 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
|
||||
minPairScore *= completeScoreMultiplier;
|
||||
|
||||
|
||||
//#
|
||||
//# Find minimum score - either single term or term pair
|
||||
//#
|
||||
@ -4155,7 +4152,6 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
}
|
||||
} // !m_q->m_isBoolean
|
||||
|
||||
|
||||
//#
|
||||
//# Calculate score and give boost based on siterank and highest inlinking siterank
|
||||
//#
|
||||
@ -4169,14 +4165,23 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
score = minScore * (adjustedSiteRank*m_siteRankMultiplier+1.0);
|
||||
logTrace(g_conf.m_logTracePosdb, "Score %f for docId %" PRIu64 "", score, m_docId);
|
||||
|
||||
|
||||
//#
|
||||
//# Give score boost if query and doc language is the same.
|
||||
//# Give score boost if query and doc language is the same,
|
||||
//# and optionally a different boost if the language of the
|
||||
//# page is unknown.
|
||||
//#
|
||||
//# Use "qlang" parm to set the language. i.e. "&qlang=fr"
|
||||
//#
|
||||
if ( m_msg39req->m_language == 0 || docLang == 0 || m_msg39req->m_language == docLang) {
|
||||
score *= (m_msg39req->m_sameLangWeight); //SAMELANGMULT;
|
||||
logTrace(g_conf.m_logTracePosdb, "Giving score a matching language boost of x%f: %f for docId %" PRIu64 "", m_msg39req->m_sameLangWeight, score, m_docId);
|
||||
if ( m_msg39req->m_language != 0 ) {
|
||||
if( m_msg39req->m_language == docLang) {
|
||||
score *= (m_msg39req->m_sameLangWeight);
|
||||
logTrace(g_conf.m_logTracePosdb, "Giving score a matching language boost of x%f: %f for docId %" PRIu64 "", m_msg39req->m_sameLangWeight, score, m_docId);
|
||||
}
|
||||
else
|
||||
if( docLang == 0 ) {
|
||||
score *= (m_msg39req->m_unknownLangWeight);
|
||||
logTrace(g_conf.m_logTracePosdb, "Giving score an unknown language boost of x%f: %f for docId %" PRIu64 "", m_msg39req->m_unknownLangWeight, score, m_docId);
|
||||
}
|
||||
}
|
||||
|
||||
double page_temperature = 0;
|
||||
@ -4185,13 +4190,12 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
|
||||
if(m_msg39req->m_usePageTemperatureForRanking) {
|
||||
use_page_temperature = true;
|
||||
page_temperature = g_pageTemperatureRegistry.query_page_temperature(m_docId);
|
||||
page_temperature = g_pageTemperatureRegistry.query_page_temperature(m_docId, m_msg39req->m_pageTemperatureWeightMin, m_msg39req->m_pageTemperatureWeightMax);
|
||||
score *= page_temperature;
|
||||
logTrace(g_conf.m_logTracePosdb, "Page temperature for docId %" PRIu64 " is %.4f, score %f->%f", m_docId, page_temperature, score_before_page_temp, score);
|
||||
logTrace(g_conf.m_logTracePosdb, "Page temperature for docId %" PRIu64 " is %.14f, score %f -> %f", m_docId, page_temperature, score_before_page_temp, score);
|
||||
}
|
||||
|
||||
|
||||
|
||||
//#
|
||||
//# Handle sortby int/float and minimum docid/score pairs
|
||||
//#
|
||||
@ -4524,11 +4528,15 @@ float PosdbTable::getMaxPossibleScore ( const QueryTermInfo *qti,
|
||||
//score *= perfectWordSpamWeight * perfectWordSpamWeight;
|
||||
score *= (((float)siteRank)*m_siteRankMultiplier+1.0);
|
||||
|
||||
// language boost if same language (or no lang specified)
|
||||
if ( m_msg39req->m_language == docLang ||
|
||||
m_msg39req->m_language == 0 ||
|
||||
docLang == 0 ) {
|
||||
score *= m_msg39req->m_sameLangWeight;//SAMELANGMULT;
|
||||
// language boost if language specified and if page is same language, or unknown language
|
||||
if ( m_msg39req->m_language != 0 ) {
|
||||
if( m_msg39req->m_language == docLang) {
|
||||
score *= (m_msg39req->m_sameLangWeight);
|
||||
}
|
||||
else
|
||||
if( docLang == 0 ) {
|
||||
score *= (m_msg39req->m_unknownLangWeight);
|
||||
}
|
||||
}
|
||||
|
||||
// assume the other term we pair with will be 1.0
|
||||
|
@ -280,7 +280,7 @@ bool Proxy::handleRequest (TcpSocket *s){
|
||||
s_count = 0;
|
||||
s_last = now;
|
||||
}
|
||||
g_stats.m_closedSockets++;;
|
||||
Statistics::register_socket_limit_hit();
|
||||
return g_httpServer.sendErrorReply ( s , 500 ,
|
||||
"Too many sockets open.");
|
||||
}
|
||||
|
@ -17,17 +17,24 @@ void ScoringWeights::init(float diversityWeightMin, float diversityWeightMax,
|
||||
float hashGroupWeightInMenu)
|
||||
{
|
||||
for(int i = 0; i <= MAXDIVERSITYRANK; i++)
|
||||
m_diversityWeights[i] = scale_quadratic(i,0,MAXDIVERSITYRANK,diversityWeightMin,diversityWeightMax);
|
||||
m_diversityWeights[i] = scale_quadratic(i, 0, MAXDIVERSITYRANK, diversityWeightMin, diversityWeightMax);
|
||||
|
||||
for(int i = 0; i <= MAXDENSITYRANK; i++)
|
||||
m_densityWeights[i] = scale_quadratic(i,0,MAXDENSITYRANK,densityWeightMin,densityWeightMax);
|
||||
m_densityWeights[i] = scale_quadratic(i, 0, MAXDENSITYRANK, densityWeightMin, densityWeightMax);
|
||||
|
||||
// make sure if word spam is 0 that the weight is not 0
|
||||
for(int i = 0; i <= MAXWORDSPAMRANK; i++)
|
||||
m_wordSpamWeights[i] = scale_linear(i, 0,MAXWORDSPAMRANK, 1.0/MAXWORDSPAMRANK, 1.0);
|
||||
|
||||
for(int i = 0; i <= MAXWORDSPAMRANK; i++)
|
||||
m_wordSpamWeights[i] = scale_linear(i, 0, MAXWORDSPAMRANK, 1.0/MAXWORDSPAMRANK, 1.0);
|
||||
|
||||
// site rank of inlinker
|
||||
// to be on the same level as multiplying the final score
|
||||
// by the siterank+1 we should make this a sqrt() type thing
|
||||
// since we square it so that single term scores are on the same
|
||||
// level as term pair scores
|
||||
// @@@ BR: Right way to do it? Gives a weight between 1 and 4
|
||||
for(int i = 0; i <= MAXWORDSPAMRANK; i++) {
|
||||
m_linkerWeights[i] = sqrt(1.0 + i);
|
||||
}
|
||||
|
||||
for(int i=0; i<HASHGROUP_END; i++)
|
||||
m_hashGroupWeights[i] = 1.0;
|
||||
|
@ -48,6 +48,7 @@ SearchInput::SearchInput() {
|
||||
m_maxSerpScore = 0.0;
|
||||
m_minSerpDocId = 0;
|
||||
m_sameLangWeight = 0.0;
|
||||
m_unknownLangWeight = 0.0;
|
||||
m_defaultSortLang = NULL;
|
||||
m_dedupURL = 0;
|
||||
m_percentSimilarSummary = 0;
|
||||
@ -64,7 +65,15 @@ SearchInput::SearchInput() {
|
||||
m_askOtherShards = false;
|
||||
memset(m_queryId, 0, sizeof(m_queryId));
|
||||
m_doMaxScoreAlgo = false;
|
||||
|
||||
m_termFreqWeightFreqMin = 0.0;
|
||||
m_termFreqWeightFreqMax = 0.5;
|
||||
m_termFreqWeightMin = 0.5;
|
||||
m_termFreqWeightMax = 1.0;
|
||||
|
||||
m_synonymWeight = 0.9;
|
||||
m_pageTemperatureWeightMin = 1.0;
|
||||
m_pageTemperatureWeightMax = 20.0;
|
||||
m_usePageTemperatureForRanking = true;
|
||||
m_numFlagScoreMultipliers=26;
|
||||
for(int i=0; i<26; i++)
|
||||
|
@ -130,6 +130,7 @@ public:
|
||||
int64_t m_minSerpDocId;
|
||||
|
||||
float m_sameLangWeight;
|
||||
float m_unknownLangWeight;
|
||||
|
||||
// prefer what lang in the results. it gets a 20x boost. "en" "xx" "fr"
|
||||
char *m_defaultSortLang;
|
||||
@ -147,6 +148,11 @@ public:
|
||||
bool m_doDupContentRemoval; // msg40
|
||||
bool m_getDocIdScoringInfo;
|
||||
|
||||
float m_termFreqWeightFreqMin;
|
||||
float m_termFreqWeightFreqMax;
|
||||
float m_termFreqWeightMin;
|
||||
float m_termFreqWeightMax;
|
||||
|
||||
float m_diversityWeightMin;
|
||||
float m_diversityWeightMax;
|
||||
float m_densityWeightMin;
|
||||
@ -162,8 +168,9 @@ public:
|
||||
float m_hashGroupWeightInternalLinkText;
|
||||
float m_hashGroupWeightInUrl;
|
||||
float m_hashGroupWeightInMenu;
|
||||
|
||||
float m_synonymWeight;
|
||||
float m_pageTemperatureWeightMin;
|
||||
float m_pageTemperatureWeightMax;
|
||||
bool m_usePageTemperatureForRanking;
|
||||
|
||||
int32_t m_numFlagScoreMultipliers;
|
||||
|
@ -4,6 +4,11 @@
|
||||
#include "gb-include.h"
|
||||
#include "types.h"
|
||||
#include "Msg3.h" //getDiskPageCache()
|
||||
#include "Mem.h" //memory statistics
|
||||
#include "UdpServer.h" //g_udpServer.getNumUsedSlotsIncoming()
|
||||
#include "HttpServer.h" //g_httpServer.m_tcp.m_numUsed
|
||||
#include "Msg5.h" //g_numCorrupt
|
||||
#include "SpiderLoop.h"
|
||||
#include "RdbCache.h"
|
||||
#include "Rdb.h"
|
||||
#include "GbMutex.h"
|
||||
@ -364,6 +369,28 @@ static void dump_rdb_cache_statistics( FILE *fp ) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Assorted statistics
|
||||
|
||||
static std::atomic<unsigned long> socket_limit_hit_count(0);
|
||||
|
||||
void Statistics::register_socket_limit_hit() {
|
||||
socket_limit_hit_count++;
|
||||
}
|
||||
|
||||
// Fetch various counters and levels and write them to the statistics file.
// Some of them were previously exchanged in PingInfo.
//
// Each value is written to 'fp' as its own text line terminated by '\n',
// in the form "<group>:<name>:<value>". The stream is neither flushed nor
// closed here.
//
// @param fp  output stream the lines are written to with fprintf()
|
||||
static void dump_assorted_statistics(FILE *fp) {
|
||||
// memory: percentage in use as reported by g_mem, printed with %f
fprintf(fp,"mem:pctused:%f\n",g_mem.getUsedMemPercentage());
|
||||
// NOTE(review): %d assumes getOOMCount() returns an int-sized value -- confirm
fprintf(fp,"mem:oom_count:%d\n",g_mem.getOOMCount());
|
||||
// counter maintained by Statistics::register_socket_limit_hit()
fprintf(fp,"socket:limit_hit:%lu\n",socket_limit_hit_count.load());
|
||||
// current socket usage levels taken from the UDP and HTTP/TCP servers
fprintf(fp,"socket:slots_incoming:%d\n",g_udpServer.getNumUsedSlotsIncoming());
|
||||
fprintf(fp,"socket:tcp_in_use:%d\n",g_httpServer.m_tcp.m_numUsed);
|
||||
// NOTE(review): this key has a double colon ("misc::") unlike the other
// keys -- confirm whether that is intended before consumers rely on it
fprintf(fp,"misc::corrupt_list_reads:%d\n",g_numCorrupt);
|
||||
// number of spiders currently out, as reported by g_spiderLoop
fprintf(fp,"spider:current_spiders:%d\n",g_spiderLoop.getNumSpidersOut());
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// statistics
|
||||
|
||||
@ -381,6 +408,7 @@ static void dump_statistics(time_t now) {
|
||||
dump_spider_statistics( fp );
|
||||
dump_io_statistics( fp );
|
||||
dump_rdb_cache_statistics( fp );
|
||||
dump_assorted_statistics(fp);
|
||||
|
||||
if ( fflush(fp) != 0 ) {
|
||||
log( LOG_ERROR, "fflush(%s) failed with errno=%d (%s)", tmp_filename, errno, strerror( errno ) );
|
||||
|
@ -12,6 +12,8 @@ void register_spider_time( bool is_new, int error_code, int http_status, unsigne
|
||||
|
||||
void register_io_time( bool is_write, int error_code, unsigned long bytes, unsigned ms );
|
||||
|
||||
void register_socket_limit_hit();
|
||||
|
||||
} //namespace
|
||||
|
||||
#endif
|
||||
|
@ -14,8 +14,6 @@ Stats::Stats ( ) {
|
||||
m_next = 0;
|
||||
memset ( m_pts , 0 , sizeof(StatPoint)*MAX_POINTS );
|
||||
|
||||
m_closedSockets = 0;
|
||||
|
||||
memset(m_msg3aRecalls, 0, sizeof(m_msg3aRecalls));
|
||||
|
||||
clearMsgStats();
|
||||
|
3
Stats.h
3
Stats.h
@ -60,9 +60,6 @@ class Stats {
|
||||
|
||||
int64_t m_startTime;
|
||||
|
||||
// when we have to close a socket because too many are open.. count it
|
||||
int32_t m_closedSockets;
|
||||
|
||||
time_t m_uptimeStart;
|
||||
|
||||
// one count for each CR_* defined in Msg51.h
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#include "TcpServer.h"
|
||||
#include "Stats.h"
|
||||
#include "Statistics.h"
|
||||
#include "Profiler.h"
|
||||
#include "PingServer.h"
|
||||
#include "HttpServer.h" //g_httpServer.m_ssltcp.m_ctx
|
||||
@ -771,7 +772,7 @@ TcpSocket *TcpServer::getNewSocket ( ) {
|
||||
s_last = now;
|
||||
}
|
||||
// another stat
|
||||
g_stats.m_closedSockets++;
|
||||
Statistics::register_socket_limit_hit();
|
||||
g_errno = EOUTOFSOCKETS;
|
||||
// send email alert
|
||||
g_pingServer.sendEmailMsg ( &s_lastTime ,
|
||||
@ -888,7 +889,7 @@ TcpSocket *TcpServer::wrapSocket ( int sd , int32_t niceness , bool isIncoming )
|
||||
s_last = now;
|
||||
}
|
||||
// another stat
|
||||
g_stats.m_closedSockets++;
|
||||
Statistics::register_socket_limit_hit();
|
||||
g_errno = EOUTOFSOCKETS;
|
||||
|
||||
// send email alert
|
||||
@ -904,7 +905,7 @@ TcpSocket *TcpServer::wrapSocket ( int sd , int32_t niceness , bool isIncoming )
|
||||
if ( sd < 0 || sd >= MAX_TCP_SOCKS ) {
|
||||
log(LOG_LOGIC,"tcp: Got bad sd of %" PRId32".",(int32_t)sd);
|
||||
// another stat
|
||||
g_stats.m_closedSockets++;
|
||||
Statistics::register_socket_limit_hit();
|
||||
g_errno = EOUTOFSOCKETS;
|
||||
// send email alert
|
||||
g_pingServer.sendEmailMsg ( &s_lastTime , "out of sockets on https2");
|
||||
@ -919,7 +920,7 @@ TcpSocket *TcpServer::wrapSocket ( int sd , int32_t niceness , bool isIncoming )
|
||||
// . this has happened a few times lately...
|
||||
if ( s->m_startTime != 0 ) {
|
||||
log(LOG_LOGIC,"tcp: sd of %" PRId32" is already in use.",(int32_t)sd);
|
||||
g_stats.m_closedSockets++;
|
||||
Statistics::register_socket_limit_hit();
|
||||
g_errno = EOUTOFSOCKETS;
|
||||
if ( sd == 0 ) log("tcp: closing2 sd of 0");
|
||||
if ( ::close(sd) == -1 )
|
||||
|
@ -12922,6 +12922,9 @@ char *XmlDoc::getMetaList(bool forDelete) {
|
||||
// we're adding titlerec to keep links between redirection intact
|
||||
addTitleRec = true;
|
||||
|
||||
// since we're adding titlerec, add posrec as well
|
||||
addPosRec = true;
|
||||
|
||||
// if we are adding a simplified redirect as a link to spiderdb
|
||||
// likewise if the error was ENONCANONICAL treat it like that
|
||||
spideringLinks = true;
|
||||
@ -16062,7 +16065,7 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
|
||||
m_reply.m_ip = m_ip;
|
||||
m_reply.m_firstIp = *fip;
|
||||
m_reply.m_docId = m_docId;
|
||||
m_reply.m_contentLen = size_utf8Content;
|
||||
m_reply.m_contentLen = size_utf8Content - 1;
|
||||
m_reply.m_lastSpidered = getSpideredTime();//m_spideredTime;
|
||||
m_reply.m_datedbDate = 0;
|
||||
m_reply.m_firstIndexedDate = m_firstIndexedDate;
|
||||
|
2
XmlDoc.h
2
XmlDoc.h
@ -483,7 +483,7 @@ public:
|
||||
SafeBuf *getTimeAxisUrl ( );
|
||||
bool hashUrl ( class HashTableX *table, bool urlOnly );
|
||||
bool hashDateNumbers ( class HashTableX *tt );
|
||||
bool hashIncomingLinkText( class HashTableX *table, bool hashAnomalies, bool hashNonAnomalies );
|
||||
bool hashIncomingLinkText(HashTableX *table);
|
||||
bool hashLinksForLinkdb ( class HashTableX *table ) ;
|
||||
bool hashNeighborhoods ( class HashTableX *table ) ;
|
||||
bool hashTitle ( class HashTableX *table );
|
||||
|
@ -154,13 +154,6 @@ static bool storeTerm ( const char *s ,
|
||||
// we know the termlist is small, or the termlist is being used for spidering
|
||||
// or parsing purposes and is usually not sent across the network.
|
||||
bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
|
||||
// this should be ready to go and not block!
|
||||
int64_t *pch64 = getExactContentHash64();
|
||||
if ( ! pch64 || pch64 == (void *)-1 ) { g_process.shutdownAbort(true); }
|
||||
|
||||
// shortcut
|
||||
Url *fu = getFirstUrl();
|
||||
|
||||
// constructor should set to defaults automatically
|
||||
HashInfo hi;
|
||||
hi.m_hashGroup = HASHGROUP_INTAG;
|
||||
@ -168,19 +161,26 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
|
||||
// usually we shard by docid, but these are terms we shard by termid!
|
||||
hi.m_shardByTermId = true;
|
||||
|
||||
if ((size_utf8Content - 1) > 0) {
|
||||
// for exact content deduping
|
||||
setStatus("hashing gbcontenthash (deduping) no-split keys");
|
||||
|
||||
// for exact content deduping
|
||||
setStatus ( "hashing gbcontenthash (deduping) no-split keys" );
|
||||
char cbuf[64];
|
||||
int32_t clen = sprintf(cbuf,"%" PRIu64,(uint64_t)*pch64);
|
||||
hi.m_prefix = "gbcontenthash";
|
||||
if ( ! hashString ( cbuf,clen,&hi ) ) return false;
|
||||
// this should be ready to go and not block!
|
||||
int64_t *pch64 = getExactContentHash64();
|
||||
if (!pch64 || pch64 == (void *)-1) { g_process.shutdownAbort(true); }
|
||||
|
||||
char *host = fu->getHost ();
|
||||
char cbuf[64];
|
||||
int32_t clen = sprintf(cbuf, "%" PRIu64, (uint64_t)*pch64);
|
||||
hi.m_prefix = "gbcontenthash";
|
||||
if (!hashString(cbuf, clen, &hi)) return false;
|
||||
}
|
||||
|
||||
// now hash the site
|
||||
setStatus ( "hashing no-split SiteGetter terms");
|
||||
|
||||
Url *fu = getFirstUrl();
|
||||
char *host = fu->getHost ();
|
||||
|
||||
//
|
||||
// HASH terms for SiteGetter.cpp
|
||||
//
|
||||
@ -217,44 +217,6 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
|
||||
if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false;
|
||||
}
|
||||
|
||||
//Dates *dp = getDates ();
|
||||
// hash the clocks into indexdb
|
||||
//if ( ! dp->hash ( m_docId , tt , this ) ) return false;
|
||||
|
||||
// . hash special site/hopcount thing for permalinks
|
||||
// . used by Images.cpp for doing thumbnails
|
||||
// . this returns false and sets g_errno on error
|
||||
// . let's try thumbnails for all...
|
||||
//if ( ! *getIsPermalink() ) return true;
|
||||
|
||||
/*
|
||||
BR 20160117: No longer has image URLs
|
||||
setStatus ( "hashing no-split gbimage keys" );
|
||||
|
||||
hi.m_prefix = "gbimage";
|
||||
// hash gbimage: for permalinks only for Images.cpp
|
||||
for ( int32_t i = 0 ; i < m_images.m_numImages ; i++ ) {
|
||||
// get the node number
|
||||
//int32_t nn = m_images.m_imageNodes[i];
|
||||
// get the url of the image
|
||||
//XmlNode *xn = m_xml.getNodePtr(nn);
|
||||
int32_t srcLen;
|
||||
char *src = m_images.getImageUrl(i,&srcLen);
|
||||
// set it to the full url
|
||||
Url iu;
|
||||
// use "pageUrl" as the baseUrl
|
||||
Url *cu = getCurrentUrl();
|
||||
// we can addwww to normalize since this is for deduping kinda
|
||||
iu.set ( cu , src , srcLen , true ); // addWWW? yes...
|
||||
char *u = iu.getUrl ();
|
||||
int32_t ulen = iu.getUrlLen();
|
||||
// hash each one
|
||||
//if ( ! hashString ( u,ulen,&hi ) ) return false;
|
||||
// hash a single entity
|
||||
if ( ! hashSingleTerm ( u,ulen,&hi) ) return false;
|
||||
//log("test: %s",u);
|
||||
}
|
||||
*/
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -285,9 +247,14 @@ char *XmlDoc::hashAll(HashTableX *table) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, getContentType failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
// BR 20160127: Never index JSON and XML content
|
||||
if (*ct == CT_JSON || *ct == CT_XML) {
|
||||
if (!hashContentType(table)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashContentType failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// For XML (JSON should not get here as it should be filtered out during spidering)
|
||||
// store the URL as the only thing in posdb so we are able to find it, and
|
||||
// eventually ban it.
|
||||
@ -405,18 +372,17 @@ char *XmlDoc::hashAll(HashTableX *table) {
|
||||
// global index now, so don't need this... 9/28/2014
|
||||
|
||||
// stop indexing xml docs
|
||||
bool indexDoc = cr->m_indexBody;
|
||||
|
||||
// global index unless this is a json object in which case it is
|
||||
// hashed above in the call to hashJSON(). this will decrease disk
|
||||
// usage by about half, posdb* files are pretty big.
|
||||
if (!indexDoc) {
|
||||
if (!cr->m_indexBody) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, !indexDoc");
|
||||
return (char *)1;
|
||||
}
|
||||
|
||||
if ( *ct == CT_JSON || *ct == CT_XML ) {
|
||||
goto skip;
|
||||
if ((size_utf8Content - 1) <= 0) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, contentLen == 0");
|
||||
return (char *)1;
|
||||
}
|
||||
|
||||
// hash the body of the doc first so m_dist is 0 to match
|
||||
@ -449,7 +415,7 @@ char *XmlDoc::hashAll(HashTableX *table) {
|
||||
// we index the single words in the neighborhoods next, and
|
||||
// we had songfacts.com coming up for the 'street light facts'
|
||||
// query because it had a bunch of anomalous inlink text.
|
||||
if (!hashIncomingLinkText(table, false, true)) {
|
||||
if (!hashIncomingLinkText(table)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashIncomingLinkText failed");
|
||||
return NULL;
|
||||
}
|
||||
@ -462,7 +428,6 @@ char *XmlDoc::hashAll(HashTableX *table) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
// BR 20160220
|
||||
// Store value of meta tag "geo.placename" to help aid searches for
|
||||
// location specific sites, e.g. 'Restaurant in London'
|
||||
@ -471,8 +436,6 @@ char *XmlDoc::hashAll(HashTableX *table) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
skip:
|
||||
|
||||
// this will only increment the scores of terms already in the table
|
||||
// because the neighborhoods are not technically in the document
|
||||
// necessarily and we do not want to ruin our precision
|
||||
@ -714,30 +677,6 @@ bool XmlDoc::hashDateNumbers ( HashTableX *tt ) { // , bool isStatusDoc ) {
|
||||
if ( ! hashNumberForSorting ( buf , buf , bufLen , &hi ) )
|
||||
return false;
|
||||
|
||||
// do not index the rest if we are a "spider reply" document
|
||||
// which is like a fake document for seeing spider statuses
|
||||
//if ( isStatusDoc == CT_STATUS ) return true;
|
||||
//if ( isStatusDoc ) return true;
|
||||
|
||||
// now for CT_STATUS spider status "documents" we also index
|
||||
// gbspiderdate so index this so we can just do a
|
||||
// gbsortby:gbdocspiderdate and only get real DOCUMENTS not the
|
||||
// spider status "documents"
|
||||
/*
|
||||
BR 20160108: Don't store these as we don't plan to use them
|
||||
hi.m_desc = "doc last spidered date";
|
||||
hi.m_prefix = "gbdocspiderdate";
|
||||
bufLen = sprintf ( buf , "%" PRIu32, (uint32_t)m_spideredTime );
|
||||
if ( ! hashNumberForSorting ( buf , buf , bufLen , &hi ) )
|
||||
return false;
|
||||
|
||||
hi.m_desc = "doc last indexed date";
|
||||
hi.m_prefix = "gbdocindexdate";
|
||||
bufLen = sprintf ( buf , "%" PRIu32, (uint32_t)indexedTime );
|
||||
if ( ! hashNumberForSorting ( buf , buf , bufLen , &hi ) )
|
||||
return false;
|
||||
*/
|
||||
|
||||
// all done
|
||||
return true;
|
||||
}
|
||||
@ -1024,8 +963,7 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
|
||||
Url uw;
|
||||
uw.set( fu->getUrl(), fu->getUrlLen(), true, false );
|
||||
hi.m_prefix = "url";
|
||||
// no longer, we just index json now
|
||||
//if ( isStatusDoc ) hi.m_prefix = "url2";
|
||||
|
||||
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
|
||||
return false;
|
||||
|
||||
@ -1228,21 +1166,15 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
|
||||
int32_t elen = fu->getExtensionLen();
|
||||
// update hash parms
|
||||
hi.m_prefix = "ext";
|
||||
// no longer, we just index json now
|
||||
//if ( isStatusDoc ) hi.m_prefix = "ext2";
|
||||
if ( ! hashSingleTerm(ext,elen,&hi ) ) return false;
|
||||
|
||||
|
||||
setStatus ( "hashing gbdocid" );
|
||||
hi.m_prefix = "gbdocid";
|
||||
// no longer, we just index json now
|
||||
//if ( isStatusDoc ) hi.m_prefix = "gbdocid2";
|
||||
char buf2[32];
|
||||
sprintf(buf2,"%" PRIu64, (uint64_t)m_docId );
|
||||
if ( ! hashSingleTerm(buf2,strlen(buf2),&hi) ) return false;
|
||||
|
||||
//if ( isStatusDoc ) return true;
|
||||
|
||||
setStatus ( "hashing SiteGetter terms");
|
||||
|
||||
//
|
||||
@ -1299,76 +1231,50 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
|
||||
hi.m_prefix = "urlhash";
|
||||
if ( ! hashString(buf,blen,&hi) ) return false;
|
||||
|
||||
/*
|
||||
BR 20160106 removed.
|
||||
blen = sprintf(buf,"%" PRIu32,h/10);
|
||||
// update hashing parms
|
||||
hi.m_prefix = "urlhashdiv10";
|
||||
if ( ! hashString(buf,blen,&hi) ) return false;
|
||||
blen = sprintf(buf,"%" PRIu32,h/100);
|
||||
// update hashing parms
|
||||
hi.m_prefix = "urlhashdiv100";
|
||||
if ( ! hashString(buf,blen,&hi) ) return false;
|
||||
*/
|
||||
if (m_contentLen > 0) {
|
||||
setStatus("hashing url mid domain");
|
||||
|
||||
// update parms
|
||||
hi.m_prefix = NULL;
|
||||
hi.m_desc = "middle domain";
|
||||
hi.m_hashGroup = HASHGROUP_INURL;
|
||||
hi.m_hashCommonWebWords = false; // Skip www, com, http etc.
|
||||
if (!hashString(host, hlen, &hi)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
setStatus ( "hashing url mid domain");
|
||||
hi.m_hashCommonWebWords = true;
|
||||
if (!hashSingleTerm(fu->getDomain(), fu->getDomainLen(), &hi)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// update parms
|
||||
hi.m_prefix = NULL;
|
||||
hi.m_desc = "middle domain";
|
||||
hi.m_hashGroup = HASHGROUP_INURL;
|
||||
hi.m_hashCommonWebWords = false; // Skip www, com, http etc.
|
||||
if ( ! hashString ( host,hlen,&hi)) return false;
|
||||
setStatus("hashing url path");
|
||||
char *path = fu->getPath();
|
||||
int32_t plen = fu->getPathLen();
|
||||
|
||||
hi.m_hashCommonWebWords = true;
|
||||
if ( ! hashSingleTerm ( fu->getDomain(),fu->getDomainLen(),&hi)) return false;
|
||||
// BR 20160113: Do not hash and combine the page filename extension with the page name (skip e.g. .com)
|
||||
if (elen > 0) {
|
||||
elen++; // also skip the dot
|
||||
}
|
||||
plen -= elen;
|
||||
|
||||
|
||||
setStatus ( "hashing url path");
|
||||
char *path = fu->getPath();
|
||||
int32_t plen = fu->getPathLen();
|
||||
|
||||
// BR 20160113: Do not hash and combine the page filename extension with the page name (skip e.g. .com)
|
||||
if( elen > 0 )
|
||||
{
|
||||
elen++; // also skip the dot
|
||||
}
|
||||
plen -= elen;
|
||||
|
||||
|
||||
// BR 20160113: Do not hash the most common page names
|
||||
if( strncmp(path, "/index", plen) != 0 )
|
||||
{
|
||||
// hash the path
|
||||
// BR 20160114: Exclude numbers in paths (usually dates)
|
||||
hi.m_hashNumbers = false;
|
||||
if ( ! hashString (path,plen,&hi) ) return false;
|
||||
// BR 20160113: Do not hash the most common page names
|
||||
if (strncmp(path, "/index", plen) != 0) {
|
||||
// hash the path
|
||||
// BR 20160114: Exclude numbers in paths (usually dates)
|
||||
hi.m_hashNumbers = false;
|
||||
if (!hashString(path, plen, &hi)) return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// . returns false and sets g_errno on error
|
||||
bool XmlDoc::hashIncomingLinkText ( HashTableX *tt ,
|
||||
bool hashAnomalies ,
|
||||
bool hashNonAnomalies ) {
|
||||
|
||||
// do not index ANY of the body if it is NOT a permalink and
|
||||
// "menu elimination" technology is enabled.
|
||||
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
|
||||
bool XmlDoc::hashIncomingLinkText(HashTableX *tt) {
|
||||
|
||||
setStatus ( "hashing link text" );
|
||||
|
||||
// . now it must have an rss item to be indexed in all its glory
|
||||
// . but if it tells us it has an rss feed, toss it and wait for
|
||||
// the feed.... BUT sometimes the rss feed outlink is 404!
|
||||
// . NO, now we discard with ENORSS at Msg16.cpp
|
||||
//if ( ! *getHasRSSItem() && m_eliminateMenus ) return true;
|
||||
|
||||
// sanity check
|
||||
if ( hashAnomalies == hashNonAnomalies ) { g_process.shutdownAbort(true); }
|
||||
|
||||
// sanity
|
||||
if ( ! m_linkInfo1Valid ) { g_process.shutdownAbort(true); }
|
||||
|
||||
@ -1404,14 +1310,7 @@ bool XmlDoc::hashIncomingLinkText ( HashTableX *tt ,
|
||||
bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff));
|
||||
// count external inlinks we have for indexing gbmininlinks:
|
||||
if ( ! internal ) ecount++;
|
||||
// get score
|
||||
//int64_t baseScore = k->m_baseScore;
|
||||
// get the weight
|
||||
//int64_t ww ;
|
||||
//if ( internal ) ww = m_internalLinkTextWeight;
|
||||
//else ww = m_externalLinkTextWeight;
|
||||
// modify the baseScore
|
||||
//int64_t final = (baseScore * ww) / 100LL;
|
||||
|
||||
// get length of link text
|
||||
int32_t tlen = k->size_linkText;
|
||||
if ( tlen > 0 ) tlen--;
|
||||
@ -1423,15 +1322,16 @@ bool XmlDoc::hashIncomingLinkText ( HashTableX *tt ,
|
||||
k->getUrl(),m_firstUrl.getUrl());
|
||||
continue;
|
||||
}
|
||||
// if it is anomalous, set this, we don't
|
||||
//if ( k->m_isAnomaly )
|
||||
// hi.m_hashIffNotUnique = true;
|
||||
//hi.m_baseScore = final;
|
||||
|
||||
if ( internal ) hi.m_hashGroup = HASHGROUP_INTERNALINLINKTEXT;
|
||||
else hi.m_hashGroup = HASHGROUP_INLINKTEXT;
|
||||
// store the siterank of the linker in this and use that
|
||||
// to set the multiplier M bits i guess
|
||||
hi.m_linkerSiteRank = k->m_siteRank;
|
||||
if(hi.m_linkerSiteRank>MAXSITERANK) {
|
||||
log(LOG_INFO,"Inlink had siteRank>max (%d), probably from docid %ld", k->m_siteRank, k->m_docId);
|
||||
hi.m_linkerSiteRank = MAXSITERANK;
|
||||
}
|
||||
// now record this so we can match the link text to
|
||||
// a matched offsite inlink text term in the scoring info
|
||||
k->m_wordPosStart = m_dist; // hi.m_startDist;
|
||||
@ -1453,14 +1353,8 @@ bool XmlDoc::hashIncomingLinkText ( HashTableX *tt ,
|
||||
|
||||
// . returns false and sets g_errno on error
|
||||
bool XmlDoc::hashNeighborhoods ( HashTableX *tt ) {
|
||||
|
||||
// seems like iffUnique is off, so do this
|
||||
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
|
||||
|
||||
setStatus ( "hashing neighborhoods" );
|
||||
|
||||
//g_tt = table;
|
||||
|
||||
// . now we also hash the neighborhood text of each inlink, that is,
|
||||
// the text surrounding the inlink text.
|
||||
// . this is also destructive in that it will remove termids that
|
||||
@ -1702,15 +1596,6 @@ bool XmlDoc::hashLanguage ( HashTableX *tt ) {
|
||||
|
||||
if ( ! hashString ( s, slen, &hi ) ) return false;
|
||||
|
||||
/*
|
||||
BR 20160117: Duplicate
|
||||
// try lang abbreviation
|
||||
sprintf(s , "%s ", getLanguageAbbr(langId) );
|
||||
// go back to broken way to try to fix parsing consistency bug
|
||||
// by adding hashLanguageString() function below
|
||||
//sprintf(s , "%s ", getLanguageAbbr(langId) );
|
||||
if ( ! hashString ( s, slen, &hi ) ) return false;
|
||||
*/
|
||||
return true;
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user