#include "PosdbTable.h"
#include "Posdb.h"
#include "PageTemperatureRegistry.h"
#include "Docid2Siteflags.h"
#include "SiteMedianPageTemperatureRegistry.h"
#include "ScalingFunctions.h"
#include "ScoringWeights.h"
#include "BitOperations.h"
#include "Msg2.h"
#include "Msg39.h"
#include "Sanity.h"
#include "Stats.h"
#include "Conf.h"
#include "TopTree.h"
#include "DocumentIndexChecker.h"
#include "Lang.h"
#include "GbMutex.h"
#include "ScopedLock.h"
#include "Errno.h"
#include <math.h>
#include <valarray>
#ifdef _VALGRIND_
#include <valgrind/memcheck.h>
#include <valgrind/helgrind.h>
#endif
#define BF_HALFSTOPWIKIBIGRAM 0x01 // "to be" in "to be or not to be"
#define BF_PIPED 0x02 // before a query pipe operator
#define BF_SYNONYM 0x04
#define BF_NEGATIVE 0x08 // query word has a negative sign before it
#define BF_BIGRAM 0x10
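// These BF_* bits are OR'ed into QueryTermInfo sublist m_bigramFlag in
// setQueryTermInfo() below; the per-term m_bflags[] values carry the same kind
// of bits and are copied into SingleScore::m_bflags and PairScore::m_bflags1/2
// when scoring info is collected for display.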
static const int INTERSECT_SCORING = 0;
static const int INTERSECT_DEBUG_INFO = 1;
static bool s_init = false;
static GbMutex s_mtx_weights;
static DerivedScoringWeights s_scoringWeights;
static bool s_isCompatible [HASHGROUP_END][HASHGROUP_END];
static bool s_inBody [HASHGROUP_END];
#define gbmin(a,b) ((a)<(b) ? (a) : (b))
#define gbmax(a,b) ((a)>(b) ? (a) : (b))
static inline const char *getWordPosList(uint64_t docId, const char *list, int32_t listSize);
static int docIdVoteBufKeyCompare_desc ( const void *h1, const void *h2 );
static void initWeights();
//struct used for the mini-merges (see mergeTermSubListsForDocId() etc)
struct MiniMergeBuffer {
//the simple merge buffer
char buffer[300000];
//the bufptr-to-which-term-did-it-come-from mapping. posdb keys in 'buffer' above always start at offsets that are a multiple of 6, so we only need a sixth of the entries
int16_t termIndex[300000/6];
std::vector<const char *> mergedListStart;
std::vector<const char *> mergedListEnd;
MiniMergeBuffer(int numQueryTerms)
: mergedListStart(numQueryTerms),
mergedListEnd(numQueryTerms)
{
#ifdef _VALGRIND_
VALGRIND_MAKE_MEM_UNDEFINED(termIndex,sizeof(termIndex));
#endif
}
int16_t *getTermIndexPtrForBufferPos(const char *ptr) {
size_t bufferOffset = (size_t)(ptr-buffer)/6;
return termIndex+bufferOffset;
}
const int16_t *getTermIndexPtrForBufferPos(const char *ptr) const {
size_t bufferOffset = (size_t)(ptr-buffer)/6;
return termIndex+bufferOffset;
}
int16_t getTermIndexForBufferPos(const char *ptr) const {
return *getTermIndexPtrForBufferPos(ptr);
}
};
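// Lookup example: getTermIndexForBufferPos(buffer+0) reads termIndex[0] and
// getTermIndexForBufferPos(buffer+12) reads termIndex[2], since every posdb key
// in 'buffer' starts at an offset that is a multiple of 6 (12-byte first key,
// 6-byte compressed keys thereafter).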
// A 2D matrix used by createNonBodyTermPairScoreMatrix/findMinTermPairScoreInWindow/getMinTermPairScoreSlidingWindow
class PairScoreMatrix {
std::vector<float> m;
const int columns;
public:
PairScoreMatrix(int size)
: m(size*size),
columns(size)
{}
float get(int x, int y) const {
return m[y*columns+x];
}
void set(int x, int y, float value) {
m[y*columns+x] = value;
}
};
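// Storage note: the matrix is kept row-major in a flat vector, so get(x,y) and
// set(x,y) address m[y*columns + x]. There is no bounds checking; callers must
// keep both indices below the 'size' given to the constructor.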
//////////////////
//
// THE NEW INTERSECTION LOGIC
//
//////////////////
PosdbTable::PosdbTable() {
// top docid info
m_q = NULL;
m_msg39req = NULL;
reset();
}
PosdbTable::~PosdbTable() {
reset();
}
void PosdbTable::reset() {
// has init() been called?
m_initialized = false;
//freeMem(); // not implemented
// does not free the mem of this safebuf, only resets length
m_docIdVoteBuf.reset();
m_filtered = 0;
m_queryTermInfos.clear();
// assume no-op
m_t1 = 0LL;
m_whiteListTable.reset();
m_addedSites = false;
// Coverity
m_docId = 0;
m_hasMaxSerpScore = false;
m_addListsTime = 0;
m_t2 = 0;
m_qpos.clear();
m_wikiPhraseIds.clear();
m_quotedStartIds.clear();
m_bflags.clear();
m_qtermNums.clear();
m_msg2 = NULL;
m_topTree = NULL;
m_nqt = 0;
m_debug = false;
m_useWhiteTable = false;
m_numQueryTermInfos = 0;
m_minTermListSize = 0;
m_minTermListIdx = 0;
m_vecSize = 0;
m_allInSameWikiPhrase = 0;
m_realMaxTop = 0;
}
// . returns false on error and sets g_errno
// . NOTE: termFreqs is just referenced by us, not copied
// . sets m_startKeys, m_endKeys and m_minNumRecs for each termId
// . TODO: ensure that m_termFreqs[] are all UPPER BOUNDS on the actual #!!
// we should be able to get an upper bound estimate from the b-tree
// quickly using Msg36!
// . we now support multiple plus signs before the query term
// . lists[] and termFreqs[] must be 1-1 with q->m_qterms[]
void PosdbTable::init(Query *q, bool debug, TopTree *topTree, const DocumentIndexChecker &documentIndexChecker, Msg2 *msg2, Msg39Request *r) {
// sanity check -- watch out for double calls
if ( m_initialized )
gbshutdownAbort(true);
// clear everything
reset();
// we are now
m_initialized = true;
// set debug flag
m_debug = (debug || g_conf.m_logDebugQuery);
// we should save the lists!
//m_lists = msg2->m_lists;//lists;
//m_numLists = q->m_numTerms;
// seo.cpp supplies a NULL msg2 because it already sets
// QueryTerm::m_posdbListPtrs
if ( ! msg2 ) return;
m_msg2 = msg2;
// save the request
m_msg39req = r;
m_baseScoringParameters = r->m_baseScoringParameters;
m_derivedScoringWeights.init(m_baseScoringParameters);
if(g_conf.m_logTracePosdb || m_msg39req->m_debug)
m_baseScoringParameters.traceToLog("posdbtable");
// save this
//m_coll = coll;
// get the rec for it
// CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
// if ( ! cr )
// gbshutdownAbort(true);
// set this now
//m_collnum = cr->m_collnum;
m_documentIndexChecker = &documentIndexChecker;
m_topTree = topTree;
// remember the query class, it has all the info about the termIds
m_q = q;
m_nqt = q->getNumTerms();
m_realMaxTop = r->m_realMaxTop;
if ( m_realMaxTop > MAX_TOP ) m_realMaxTop = MAX_TOP;
if ( m_q->m_isBoolean ) m_baseScoringParameters.m_siteRankMultiplier = 0.0;
// sanity
if ( msg2->getNumLists() != m_q->getNumTerms() )
gbshutdownAbort(true);
// copy the list ptrs to the QueryTerm::m_posdbListPtr
for ( int32_t i = 0 ; i < m_q->m_numTerms ; i++ )
m_q->m_qterms[i].m_posdbListPtr = msg2->getList(i);
// we always use it now
if ( ! topTree )
gbshutdownAbort(true);
}
namespace {
//getMaxScoreForNonBodyTermPair(), getScoreForTermPair() and getTermPairScoreForAny() need to keep several items for each of the
//two lists and set them based on the posdb pointer(s). This helper struct handles the common decoding.
struct PosdbDecodeHelper {
int32_t p;
unsigned char hg;
unsigned char mhg;
unsigned char wsr;
float spamw;
float denw;
//float diversityWeight; //todo?
//unsigned char syn; //not supported anymore
void set(const char *wp, const DerivedScoringWeights &derivedScoringWeights) {
p = Posdb::getWordPos(wp);
hg = Posdb::getHashGroup(wp);
//temporary fix: posdb can have junk in it so clamp the hashgroup to the limit
if(hg>=HASHGROUP_END)
hg = HASHGROUP_END-1;
mhg = hg;
// reduce to either HASHGROUP_BODY/TITLE/INLINK/META
if(s_inBody[mhg])
mhg = HASHGROUP_BODY;
wsr = Posdb::getWordSpamRank(wp);
if(hg == HASHGROUP_INLINKTEXT)
spamw = derivedScoringWeights.m_linkerWeights[wsr];
else
spamw = derivedScoringWeights.m_wordSpamWeights[wsr];
denw = derivedScoringWeights.m_densityWeights[Posdb::getDensityRank(wp)];
//syn = Posdb::getIsSynonym(wp);
}
};
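// Typical use in the scoring loops below: helper.set(wp, m_derivedScoringWeights)
// decodes one posdb key into its word position (p), hash group (hg), reduced
// hash group (mhg), word spam rank (wsr) and the derived spam and density
// weights (spamw, denw).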
} //anonymous namespace
//
// Find the top score for the term for each hash group.
// INLINK_TEXT may have more than one entry in the top-scorer list, other hash groups only 1.
// INLINK_TEXT may override other lower scoring hashgroups' scores
// Sum up the best scores, and return that result.
// Sets highestScoringNonBodyPos to the highest scoring position.
//
float PosdbTable::getBestScoreSumForSingleTerm(const MiniMergeBuffer *miniMergeBuffer, int32_t i, DocIdScore *pdcs, const char **highestScoringNonBodyPos) {
const char *wpi = miniMergeBuffer->mergedListStart[i];
const char *endi = miniMergeBuffer->mergedListEnd[i];
float nonBodyMax = -1.0;
int32_t lowestScoreTermIdx = 0;
float bestScores[MAX_TOP] = {0}; // make Coverity happy
const char *bestwpi[MAX_TOP];
int32_t numTop = 0;
logTrace(g_conf.m_logTracePosdb, "BEGIN.");
// assume no terms!
*highestScoringNonBodyPos = NULL;
if ( wpi ) {
// Sanity check
if( wpi >= endi ) {
logTrace(g_conf.m_logTracePosdb, "END, wpi %p >= %p", wpi, endi);
return -1.0;
}
#ifdef _VALGRIND_
VALGRIND_CHECK_MEM_IS_DEFINED(wpi,endi-wpi);
#endif
//stuff for detecting when a position has bigram matches on both sides
const bool leftTermIsIgnored = m_queryTermInfos[i].m_leftTermIsIgnored;
int prevBigramMatchPos = -1;
int prevBigramMatchQtermIdx = -1;
float prevBigramMatchTermWeight = 1.0;
bool first = true;
char bestmhg[MAX_TOP];
while(wpi < endi) {
float score = 100.0;
PosdbDecodeHelper helper;
helper.set(wpi, m_derivedScoringWeights);
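// Note: the hash-group, density, word-spam, term-frequency and term weights
// below are each applied twice (i.e. squared); presumably this keeps single-term
// scores on the same scale as the pair scores computed further down, where each
// of the two terms contributes its weight once.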
// good diversity?
//unsigned char div = Posdb::getDiversityRank ( wpi );
//score *= m_derivedScoringWeights.m_diversityWeights[div];
//score *= m_derivedScoringWeights.m_diversityWeights[div];
// hash group? title? body? heading? etc.
score *= m_derivedScoringWeights.m_hashGroupWeights[helper.mhg];
score *= m_derivedScoringWeights.m_hashGroupWeights[helper.mhg];
// good density?
score *= helper.denw;
score *= helper.denw;
// to make more compatible with pair scores divide by distance of 2
//score /= 2.0;
// word spam?
score *= helper.spamw;
score *= helper.spamw;
int queryTermIndex = miniMergeBuffer->getTermIndexForBufferPos(wpi);
const QueryTerm &qterm = m_q->m_qterms[queryTermIndex];
score *= qterm.m_userWeight;
score *= m_q->m_qterms[queryTermIndex].m_termFreqWeight;
score *= m_q->m_qterms[queryTermIndex].m_termFreqWeight;
score *= qterm.m_termWeight; //regular/bigram/synonym
score *= qterm.m_termWeight; //regular/bigram/synonym
if(leftTermIsIgnored && m_q->m_qterms[queryTermIndex].m_isPhrase) {
//if the term to the left of us is ignored (stopword/qstopword/highfreqterm/...) then that bigram boost may be lost. Worst case:
// <ignored> term <ignored>
// ==>
// <bigram-hit-left> <term-hit> <bigram-hit-2>
//and then the boost from the left bigram hit would be lost. So detect this case and give it the extra boost:
if(prevBigramMatchPos+2>=helper.p &&
prevBigramMatchQtermIdx!=queryTermIndex) {
score *= prevBigramMatchTermWeight;
score *= prevBigramMatchTermWeight;
}
//record where we got a bigram hit
prevBigramMatchPos = helper.p;
prevBigramMatchQtermIdx = queryTermIndex;
prevBigramMatchTermWeight = qterm.m_termWeight;
}
// do not allow duplicate hashgroups!
int32_t bro = -1;
for( int32_t k=0; k < numTop; k++) {
if( bestmhg[k] == helper.mhg && helper.hg != HASHGROUP_INLINKTEXT) {
bro = k;
break;
}
}
if ( bro >= 0 ) {
//
// We already have a score for this hash group, update if
// current score is higher
//
if ( score > bestScores[bro] ) {
bestScores[bro] = score;
bestwpi [bro] = wpi;
bestmhg [bro] = helper.mhg;
}
}
else
if ( numTop < m_realMaxTop ) { // MAX_TOP ) {
//
// New hash group (or INLINKTEXT).
// We have free slots in the top-list.
// Store found score.
//
bestScores[numTop] = score;
bestwpi [numTop] = wpi;
bestmhg [numTop] = helper.mhg;
numTop++;
}
else
if ( score > bestScores[lowestScoreTermIdx] ) {
//
// New hash group (or INLINKTEXT).
// We have NO free slots in the top-list.
// Replace lowest score in top-list with current higher score.
//
bestScores[lowestScoreTermIdx] = score;
bestwpi [lowestScoreTermIdx] = wpi;
bestmhg [lowestScoreTermIdx] = helper.mhg;
}
// If top-list is full, make lowestScoreTermIdx point to the lowest score
// in the top-list.
if ( numTop >= m_realMaxTop ) { // MAX_TOP ) {
lowestScoreTermIdx = 0;
for ( int32_t k = 1 ; k < m_realMaxTop; k++ ) {//MAX_TOP ; k++ ) {
if ( bestScores[k] > bestScores[lowestScoreTermIdx] ) {
continue;
}
lowestScoreTermIdx = k;
}
}
// For findMinTermPairScoreInWindow() sub-out algo.
// If the term is not in the body, and the score is the
// highest non-body term score, return the position index.
if ( score > nonBodyMax && ! s_inBody[helper.hg] ) {
nonBodyMax = score;
*highestScoringNonBodyPos = wpi;
}
// advance posdb pointer
if(!first)
wpi += 6;
else {
wpi += 12;
first = false;
}
}
}
// add up the top scores
float sum = 0.0;
for ( int32_t k = 0 ; k < numTop ; k++ ) {
// if it is something like "enough for" in a wikipedia
// phrase like "time enough for love", give it a boost!
// we set a special bit in the keys when we do the mini
// merge; we do the same thing for the syn bits
if ( Posdb::getIsHalfStopWikiBigram(bestwpi[k]) ) {
sum += (bestScores[k] * WIKI_BIGRAM_WEIGHT * WIKI_BIGRAM_WEIGHT);
}
else {
// otherwise just add it up
sum += bestScores[k];
}
}
// wiki weight
//sum *= ts;
// shortcut
//char *maxp = bestwpi[k];
// if terms is a special wiki half stop bigram
//if ( m_bflags[i] & BF_HALFSTOPWIKIBIGRAM ) {
// sum *= WIKI_BIGRAM_WEIGHT;
// sum *= WIKI_BIGRAM_WEIGHT;
//}
// empty list?
//if ( ! wpi ) sum = -2.0;
//
// end the loop. return now if not collecting scoring info.
//
if ( ! pdcs ) {
logTrace(g_conf.m_logTracePosdb, "END.");
return sum;
}
//#
//# The below is for visual presentation of the scoring ONLY
//#
// none? wtf?
if ( numTop <= 0 ) {
logTrace(g_conf.m_logTracePosdb, "END.");
return sum;
}
// point into buf
SingleScore *sx = (SingleScore *)m_singleScoreBuf.getBufPtr();
int32_t need = sizeof(SingleScore) * numTop;
// point to that
if ( pdcs->m_singlesOffset < 0 ) {
pdcs->m_singlesOffset = m_singleScoreBuf.length();
}
// reset this i guess
pdcs->m_singleScores = NULL;
// sanity
if ( m_singleScoreBuf.getAvail() < need ) {
static bool s_first = true;
if ( s_first ) {
log("posdb: CRITICAL single buf overflow");
}
s_first = false;
logTrace(g_conf.m_logTracePosdb, "END.");
return sum;
//gbshutdownAbort(true); }
}
// increase buf ptr over this then
m_singleScoreBuf.incrementLength(need);
// set each of the top scoring terms individually
for ( int32_t k = 0 ; k < numTop ; k++, sx++ ) {
// update count
pdcs->m_numSingles++;
const char *maxp = bestwpi[k];
memset(sx,0,sizeof(*sx));
sx->m_isSynonym = Posdb::getIsSynonym(maxp);
sx->m_isHalfStopWikiBigram = Posdb::getIsHalfStopWikiBigram(maxp);
//sx->m_isSynonym = (m_bflags[i] & BF_SYNONYM) ;
sx->m_diversityRank = Posdb::getDiversityRank(maxp);
sx->m_wordSpamRank = Posdb::getWordSpamRank(maxp);
sx->m_hashGroup = Posdb::getHashGroup(maxp);
sx->m_wordPos = Posdb::getWordPos(maxp);
sx->m_densityRank = Posdb::getDensityRank(maxp);
float score = bestScores[k];
int queryTermIndex = miniMergeBuffer->getTermIndexForBufferPos(maxp);
//TODO: shouldn't we multiply with userweight here too?
//float userWeight = m_q->m_qterms[queryTermIndex].m_userWeight;
//score *= userWeight;
score *= m_q->m_qterms[queryTermIndex].m_termFreqWeight;
score *= m_q->m_qterms[queryTermIndex].m_termFreqWeight;
// if the term is a special wiki half stop bigram
if ( sx->m_isHalfStopWikiBigram ) {
score *= WIKI_BIGRAM_WEIGHT;
score *= WIKI_BIGRAM_WEIGHT;
}
sx->m_finalScore = score;
sx->m_tfWeight = m_q->m_qterms[queryTermIndex].m_termFreqWeight;
sx->m_qtermNum = m_qtermNums[i];
//int64_t *termFreqs = (int64_t *)m_msg39req->ptr_termFreqs;
//sx->m_termFreq = termFreqs[sx->m_qtermNum];
sx->m_bflags = m_bflags[i];
}
logTrace(g_conf.m_logTracePosdb, "END. sum=%f", sum);
return sum;
}
// . advance two ptrs at the same time so it's just a linear scan
// . TODO: add all up, then basically taking a weight of the top 6 or so...
float PosdbTable::getMaxScoreForNonBodyTermPair(const MiniMergeBuffer *miniMergeBuffer, int i, int j, int32_t qdist) {
const char *wpi = miniMergeBuffer->mergedListStart[i];
const char *wpj = miniMergeBuffer->mergedListStart[j];
const char *endi = miniMergeBuffer->mergedListEnd[i];
const char *endj = miniMergeBuffer->mergedListEnd[j];
// Sanity check
if( wpi >= endi || wpj >= endj ) {
return -1.0;
}
#ifdef _VALGRIND_
VALGRIND_CHECK_MEM_IS_DEFINED(wpi,endi-wpi);
VALGRIND_CHECK_MEM_IS_DEFINED(wpj,endj-wpj);
#endif
logTrace(g_conf.m_logTracePosdb, "BEGIN.");
PosdbDecodeHelper helper1, helper2;
helper1.set(wpi, m_derivedScoringWeights);
helper2.set(wpj, m_derivedScoringWeights);
bool firsti = true;
bool firstj = true;
float max = -1.0;
for(;;) {
if ( helper1.p <= helper2.p ) {
// . skip the pair if they are in different hashgroups
// . we no longer allow either to be in the body in this
// algo because we handle those cases in the sliding window
// algo!
if( s_isCompatible[helper1.hg][helper2.hg] ) {
// get distance
int32_t dist = helper2.p - helper1.p;
// if zero, make sure it's 2. this happens when the same bigram
// is used by both terms. i.e. street uses the bigram
// 'street light' and so does 'light'. so the wordpositions
// are exactly the same!
if ( dist < 2 ) {
dist = 2;
}
// fix distance if in different non-body hashgroups
if ( dist > 50 ) {
dist = FIXED_DISTANCE;
}
// subtract from the dist the terms are apart in the query
if ( dist >= qdist ) {
dist = dist - qdist;
}
// good density?
float score = 100 * helper1.denw * helper2.denw;
// hashgroup modifier
score *= m_derivedScoringWeights.m_hashGroupWeights[helper1.hg];
score *= m_derivedScoringWeights.m_hashGroupWeights[helper2.hg];
const int queryTermIndex1 = miniMergeBuffer->getTermIndexForBufferPos(wpi);
const QueryTerm &qterm1 = m_q->m_qterms[queryTermIndex1];
const int queryTermIndex2 = miniMergeBuffer->getTermIndexForBufferPos(wpj);
const QueryTerm &qterm2 = m_q->m_qterms[queryTermIndex2];
score *= qterm1.m_userWeight;
score *= qterm2.m_userWeight;
score *= qterm1.m_termWeight; //regular/bigram/synonym
score *= qterm2.m_termWeight; //regular/bigram/synonym
// word spam weights
score *= helper1.spamw * helper2.spamw;
// huge title? do not allow 11th+ word to be weighted high
//if ( helper1.hg == HASHGROUP_TITLE && dist > 20 )
// score /= m_baseScoringParameters.m_derivedScoringWeights.m_hashGroupWeights[helper1.hg];
// mod by distance
score /= (dist + 1.0);
// best?
max = gbmax(max,score);
}
// advance posdb pointer
if(!firsti)
wpi += 6;
else {
wpi += 12;
firsti = false;
}
// end of list?
if ( wpi >= endi ) {
break; // exit for(;;) loop
}
helper1.set(wpi, m_derivedScoringWeights);
}
else {
// . skip the pair if they are in different hashgroups
// . we no longer allow either to be in the body in this
// algo because we handle those cases in the sliding window
// algo!
if ( s_isCompatible[helper1.hg][helper2.hg] ) {
// get distance
int32_t dist = helper1.p - helper2.p;
// if zero, make sure it's 2. this happens when the same bigram
// is used by both terms. i.e. street uses the bigram
// 'street light' and so does 'light'. so the wordpositions
// are exactly the same!
if ( dist < 2 ) dist = 2;
// fix distance if in different non-body hashgroups
if ( dist > 50 ) {
dist = FIXED_DISTANCE;
}
// subtract from the dist the terms are apart in the query
if ( dist >= qdist ) {
dist = dist - qdist;
// add 1 for being out of order
dist += qdist - 1;
}
else {
//dist = dist - qdist;
// add 1 for being out of order
dist += 1; // qdist - 1;
}
// compute score based on that junk
//score = (MAXWORDPOS+1) - dist;
// good diversity? unneeded for pair algo
//score *= m_derivedScoringWeights.m_diversityWeights[div1];
//score *= m_derivedScoringWeights.m_diversityWeights[div2];
// good density?
float score = 100 * helper1.denw * helper2.denw;
// hashgroup modifier
score *= m_derivedScoringWeights.m_hashGroupWeights[helper1.hg];
score *= m_derivedScoringWeights.m_hashGroupWeights[helper2.hg];
const int queryTermIndex1 = miniMergeBuffer->getTermIndexForBufferPos(wpi);
const QueryTerm &qterm1 = m_q->m_qterms[queryTermIndex1];
const int queryTermIndex2 = miniMergeBuffer->getTermIndexForBufferPos(wpj);
const QueryTerm &qterm2 = m_q->m_qterms[queryTermIndex2];
score *= qterm1.m_userWeight;
score *= qterm2.m_userWeight;
score *= qterm1.m_termWeight; //regular/bigram/synonym
score *= qterm2.m_termWeight; //regular/bigram/synonym
// word spam weights
score *= helper1.spamw * helper2.spamw;
// huge title? do not allow 11th+ word to be weighted high
//if ( helper1.hg == HASHGROUP_TITLE && dist > 20 )
// score /= m_derivedScoringWeights.m_hashGroupWeights[helper1.hg];
// mod by distance
score /= (dist + 1.0);
// best?
max = gbmax(max,score);
}
// advance posdb pointer
if(!firstj)
wpj += 6;
else {
wpj += 12;
firstj = false;
}
// end of list?
if ( wpj >= endj ) {
break; // exit for(;;) loop
}
helper2.set(wpj, m_derivedScoringWeights);
}
}
logTrace(g_conf.m_logTracePosdb, "END.");
return max;
}
float PosdbTable::getScoreForTermPair(const MiniMergeBuffer *miniMergeBuffer, const char *wpi, const char *wpj, int32_t fixedDistance, int32_t qdist) {
logTrace(g_conf.m_logTracePosdb, "BEGIN.");
if ( ! wpi ) {
logTrace(g_conf.m_logTracePosdb, "END.");
return -1.00;
}
if ( ! wpj ) {
logTrace(g_conf.m_logTracePosdb, "END.");
return -1.00;
}
#ifdef _VALGRIND_
VALGRIND_CHECK_MEM_IS_DEFINED(wpi,6);
VALGRIND_CHECK_MEM_IS_DEFINED(wpj,6);
#endif
PosdbDecodeHelper helper1, helper2;
helper1.set(wpi, m_derivedScoringWeights);
helper2.set(wpj, m_derivedScoringWeights);
float dist;
// set this
if ( fixedDistance != 0 ) {
dist = fixedDistance;
}
else {
dist = abs(helper1.p - helper2.p);
// if zero, make sure it's 2. this happens when the same bigram
// is used by both terms. i.e. street uses the bigram
// 'street light' and so does 'light'. so the wordpositions
// are exactly the same!
if ( dist < 2 ) dist = 2;
// subtract from the dist the terms are apart in the query
if ( dist >= qdist ) dist = dist - qdist;
// out of order? penalize by 1 unit
if ( helper2.p < helper1.p ) dist += 1;
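// worked example: positions 14 and 10 with qdist=2 give dist=abs(14-10)=4,
// then 4-2=2, plus 1 for being out of order -> 3, so the score below is
// divided by (3+1)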
}
// TODO: use left and right diversity if no matching query term
// is on the left or right
//score *= m_derivedScoringWeights.m_diversityWeights[div1];
//score *= m_derivedScoringWeights.m_diversityWeights[div2];
// good density?
float score = 100 * helper1.denw * helper2.denw;
// wikipedia phrase weight
//score *= ts;
// hashgroup modifier
score *= m_derivedScoringWeights.m_hashGroupWeights[helper1.hg];
score *= m_derivedScoringWeights.m_hashGroupWeights[helper2.hg];
const int queryTermIndex1 = miniMergeBuffer->getTermIndexForBufferPos(wpi);
const QueryTerm &qterm1 = m_q->m_qterms[queryTermIndex1];
const int queryTermIndex2 = miniMergeBuffer->getTermIndexForBufferPos(wpj);
const QueryTerm &qterm2 = m_q->m_qterms[queryTermIndex2];
score *= qterm1.m_userWeight;
score *= qterm2.m_userWeight;
score *= qterm1.m_termWeight; //regular/bigram/synonym
score *= qterm2.m_termWeight; //regular/bigram/synonym
// word spam weights
score *= helper1.spamw * helper2.spamw;
// mod by distance
score /= (dist + 1.0);
logTrace(g_conf.m_logTracePosdb, "END. score=%f", score);
return score;
}
// . advance two ptrs at the same time so it's just a linear scan
// . TODO: add all up, then basically taking a weight of the top 6 or so...
// . skip body terms not in the sliding window as defined by bestMinTermPairWindowPtrs[]
float PosdbTable::getTermPairScoreForAny(const MiniMergeBuffer *miniMergeBuffer, int i, int j, const std::vector<const char *> &bestMinTermPairWindowPtrs, DocIdScore *pdcs) {
const char *wpi = miniMergeBuffer->mergedListStart[i];
const char *wpj = miniMergeBuffer->mergedListStart[j];
const char *endi = miniMergeBuffer->mergedListEnd[i];
const char *endj = miniMergeBuffer->mergedListEnd[j];
// wiki phrase weight?
float wikiPhraseWeight;
logTrace(g_conf.m_logTracePosdb, "BEGIN. i=%d, j=%d", i, j);
int32_t qdist;
// but if they are in the same wikipedia phrase
// then try to keep their positions as in the query.
// so for 'time enough for love' ideally we want
// 'time' to be 6 units apart from 'love'
if ( m_wikiPhraseIds[j] == m_wikiPhraseIds[i] && m_wikiPhraseIds[j] ) { // zero means not in a phrase
qdist = m_qpos[j] - m_qpos[i];
// wiki weight
wikiPhraseWeight = (float)WIKI_WEIGHT;
}
else {
// basically try to get query words as close
// together as possible
qdist = 2;
// this should help fix
// 'what is an unsecured loan' so we are more likely
// to get the page that has that exact phrase in it.
// yes, but hurts how to make a lock pick set.
//qdist = qpos[j] - qpos[i];
// wiki weight
wikiPhraseWeight = 1.0;
}
bool inSameQuotedPhrase = false;
if ( m_quotedStartIds[i] == m_quotedStartIds[j] && m_quotedStartIds[i] >= 0 ) {
inSameQuotedPhrase = true;
}
// oops.. this was not counting non-space punct for 2 units
// instead of 1
if ( inSameQuotedPhrase ) {
qdist = m_qpos[j] - m_qpos[i];
}
PosdbDecodeHelper helper1, helper2;
helper1.set(wpi, m_derivedScoringWeights);
helper2.set(wpj, m_derivedScoringWeights);
bool firsti = true;
bool firstj = true;
int32_t lowestScoreTermIdx = -1;
float bestScores[MAX_TOP] = {0}; // make Coverity happy
const char *bestwpi [MAX_TOP];
const char *bestwpj [MAX_TOP];
char bestmhg1 [MAX_TOP];
char bestmhg2 [MAX_TOP];
char bestFixed [MAX_TOP];
int32_t numTop = 0;
bool fixedDistance;
int32_t bro;
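// The advance() macro below steps a sub-list pointer (12 bytes past the first,
// uncompressed key, 6 bytes past each compressed key after that), breaks out of
// the enclosing for(;;) when the end of the list is reached, and otherwise
// re-decodes the new key into the given helper - so it is only usable inside
// that loop.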
#define advance(first,wp,end,helper) \
if(!first) \
wp += 6; \
else { \
wp += 12; \
first = false; \
} \
if(wp >= end) \
break; \
helper.set(wp, m_derivedScoringWeights);
for(;;) {
// . if p1/p2 is in body and not in window, skip
// . this is how we restrict all body terms to the winning
// sliding window
if ( s_inBody[helper1.hg] && wpi != bestMinTermPairWindowPtrs[i] ) {
advance(firsti,wpi,endi,helper1);
}
if ( s_inBody[helper2.hg] && wpj != bestMinTermPairWindowPtrs[j] ) {
advance(firstj,wpj,endj,helper2);
}
// make this strictly < now and not <= because in the event
// of bigram terms, where p1==p2 we'd like to advance p2/wj to
// point to the non-syn single term in order to get a better score
// to fix the 'search engine' query on gigablast.com
if ( helper1.p <= helper2.p ) {
// get distance
int32_t dist = helper2.p - helper1.p;
// if in the same quoted phrase, order is bad!
if ( inSameQuotedPhrase ) {
// TODO: allow for off by 1
// if it has punct in it then dist will be 3,
// just a space or similar then dist should be 2.
if ( dist > qdist && dist - qdist >= 2 ) {
advance(firsti,wpi,endi,helper1);
}
if ( dist < qdist && qdist - dist >= 2 ) {
advance(firsti,wpi,endi,helper1);
}
}
// if zero, make sure it's 2. this happens when the same bigram
// is used by both terms. i.e. street uses the bigram
// 'street light' and so does 'light'. so the wordpositions
// are exactly the same!
dist = gbmax(dist,2);
// fix distance if in different non-body hashgroups
if ( dist < 50 ) {
fixedDistance = false;
}
// body vs title, linktext vs title, linktext vs body
else if ( helper1.mhg != helper2.mhg ) {
dist = FIXED_DISTANCE;
fixedDistance = true;
}
// link text to other link text
else if ( helper1.mhg == HASHGROUP_INLINKTEXT ) {
dist = FIXED_DISTANCE;
fixedDistance = true;
}
else {
fixedDistance = false;
}
// subtract from the dist the terms are apart in the query
if ( dist >= qdist ) {
dist = dist - qdist;
}
// good density?
float score;
score = 100 * helper1.denw * helper2.denw;
// hashgroup modifier
score *= m_derivedScoringWeights.m_hashGroupWeights[helper1.hg];
score *= m_derivedScoringWeights.m_hashGroupWeights[helper2.hg];
{
const int queryTermIndex1 = miniMergeBuffer->getTermIndexForBufferPos(wpi);
const QueryTerm &qterm1 = m_q->m_qterms[queryTermIndex1];
const int queryTermIndex2 = miniMergeBuffer->getTermIndexForBufferPos(wpj);
const QueryTerm &qterm2 = m_q->m_qterms[queryTermIndex2];
score *= qterm1.m_termFreqWeight;
score *= qterm2.m_termFreqWeight;
score *= qterm1.m_userWeight;
score *= qterm2.m_userWeight;
score *= qterm1.m_termWeight; //regular/bigram/synonym
score *= qterm2.m_termWeight; //regular/bigram/synonym
}
// the new logic
if ( Posdb::getIsHalfStopWikiBigram(wpi) ) {
score *= WIKI_BIGRAM_WEIGHT;
}
if ( Posdb::getIsHalfStopWikiBigram(wpj) ) {
score *= WIKI_BIGRAM_WEIGHT;
}
// word spam weights
score *= helper1.spamw * helper2.spamw;
// huge title? do not allow 11th+ word to be weighted high
//if ( helper1.hg == HASHGROUP_TITLE && dist > 20 )
// score /= m_derivedScoringWeights.m_hashGroupWeights[helper1.hg];
// mod by distance
score /= (dist + 1.0);
// if our hg1/hg2 hashgroup pairing already exists
// in the bestScores array we have to beat it and then
// we have to replace that. we can only have one per,
// except for linktext!
bro = -1;
for ( int32_t k = 0 ; k < numTop ; k++ ) {
if ( bestmhg1[k]==helper1.mhg && helper1.hg != HASHGROUP_INLINKTEXT ){
bro = k;
break;
}
if ( bestmhg2[k]==helper2.mhg && helper2.hg != HASHGROUP_INLINKTEXT ){
bro = k;
break;
}
}
if ( bro >= 0 ) {
if ( score > bestScores[bro] ) {
bestScores[bro] = score;
bestwpi [bro] = wpi;
bestwpj [bro] = wpj;
bestmhg1 [bro] = helper1.mhg;
bestmhg2 [bro] = helper2.mhg;
bestFixed [bro] = fixedDistance;
}
}
else
if ( numTop < m_realMaxTop ) { // MAX_TOP ) {
bestScores[numTop] = score;
bestwpi [numTop] = wpi;
bestwpj [numTop] = wpj;
bestmhg1 [numTop] = helper1.mhg;
bestmhg2 [numTop] = helper2.mhg;
bestFixed [numTop] = fixedDistance;
numTop++;
}
else
if ( lowestScoreTermIdx >= 0 && score > bestScores[lowestScoreTermIdx] ) {
bestScores[lowestScoreTermIdx] = score;
bestwpi [lowestScoreTermIdx] = wpi;
bestwpj [lowestScoreTermIdx] = wpj;
bestmhg1 [lowestScoreTermIdx] = helper1.mhg;
bestmhg2 [lowestScoreTermIdx] = helper2.mhg;
bestFixed [lowestScoreTermIdx] = fixedDistance;
}
// set "lowestScoreTermIdx" to the lowest score out of the top scores
if ( numTop >= m_realMaxTop ) { // MAX_TOP ) {
lowestScoreTermIdx = 0;
for ( int32_t k = 1 ; k < m_realMaxTop;k++){//MAX_TOP;k++
if (bestScores[k] > bestScores[lowestScoreTermIdx] ) {
continue;
}
lowestScoreTermIdx = k;
}
}
advance(firsti,wpi,endi,helper1);
}
else {
// get distance
int32_t dist = helper1.p - helper2.p;
// if in the same quoted phrase, order is bad!
if ( inSameQuotedPhrase ) {
advance(firstj,wpj,endj,helper2);
}
// if zero, make sure it's 2. this happens when the same bigram
// is used by both terms. i.e. street uses the bigram
// 'street light' and so does 'light'. so the wordpositions
// are exactly the same!
dist = gbmax(dist,2);
// fix distance if in different non-body hashgroups
if ( dist < 50 ) {
fixedDistance = false;
}
// body vs title, linktext vs title, linktext vs body
else if ( helper1.mhg != helper2.mhg ) {
dist = FIXED_DISTANCE;
fixedDistance = true;
}
// link text to other link text
else if ( helper1.mhg == HASHGROUP_INLINKTEXT ) {
dist = FIXED_DISTANCE;
fixedDistance = true;
}
else
fixedDistance = false;
// subtract from the dist the terms are apart in the query
if ( dist >= qdist ) {
dist = dist - qdist;
// add 1 for being out of order
dist += qdist - 1;
}
else {
//dist = dist - qdist;
// add 1 for being out of order
dist += 1; // qdist - 1;
}
// good diversity? unneeded for pair algo
// good density?
float score;
score = 100 * helper1.denw * helper2.denw;
// hashgroup modifier
score *= m_derivedScoringWeights.m_hashGroupWeights[helper1.hg];
score *= m_derivedScoringWeights.m_hashGroupWeights[helper2.hg];
{
const int queryTermIndex1 = miniMergeBuffer->getTermIndexForBufferPos(wpi);
const QueryTerm &qterm1 = m_q->m_qterms[queryTermIndex1];
const int queryTermIndex2 = miniMergeBuffer->getTermIndexForBufferPos(wpj);
const QueryTerm &qterm2 = m_q->m_qterms[queryTermIndex2];
score *= qterm1.m_termFreqWeight;
score *= qterm2.m_termFreqWeight;
score *= qterm1.m_userWeight;
score *= qterm2.m_userWeight;
score *= qterm1.m_termWeight; //regular/bigram/synonym
score *= qterm2.m_termWeight; //regular/bigram/synonym
}
// word spam weights
score *= helper1.spamw * helper2.spamw;
// huge title? do not allow 11th+ word to be weighted high
//if ( hg1 == HASHGROUP_TITLE && dist > 20 )
// score /= m_derivedScoringWeights.m_hashGroupWeights[hg1];
// mod by distance
score /= (dist + 1.0);
// if our hg1/hg2 hashgroup pairing already exists
// in the bestScores array we have to beat it and then
// we have to replace that. we can only have one per,
// except for linktext!
bro = -1;
for ( int32_t k = 0 ; k < numTop ; k++ ) {
if ( bestmhg1[k]==helper1.mhg && helper1.hg !=HASHGROUP_INLINKTEXT ){
bro = k;
break;
}
if ( bestmhg2[k]==helper2.mhg && helper2.hg !=HASHGROUP_INLINKTEXT ){
bro = k;
break;
}
}
if ( bro >= 0 ) {
if ( score > bestScores[bro] ) {
bestScores[bro] = score;
bestwpi [bro] = wpi;
bestwpj [bro] = wpj;
bestmhg1 [bro] = helper1.mhg;
bestmhg2 [bro] = helper2.mhg;
bestFixed [bro] = fixedDistance;
}
}
// best?
else if ( numTop < m_realMaxTop ) { // MAX_TOP ) {
bestScores[numTop] = score;
bestwpi [numTop] = wpi;
bestwpj [numTop] = wpj;
bestmhg1 [numTop] = helper1.mhg;
bestmhg2 [numTop] = helper2.mhg;
bestFixed [numTop] = fixedDistance;
numTop++;
}
else if ( lowestScoreTermIdx >= 0 && score > bestScores[lowestScoreTermIdx] ) {
bestScores[lowestScoreTermIdx] = score;
bestwpi [lowestScoreTermIdx] = wpi;
bestwpj [lowestScoreTermIdx] = wpj;
bestmhg1 [lowestScoreTermIdx] = helper1.mhg;
bestmhg2 [lowestScoreTermIdx] = helper2.mhg;
bestFixed [lowestScoreTermIdx] = fixedDistance;
}
// set "lowestScoreTermIdx" to the lowest score out of the top scores
if ( numTop >= m_realMaxTop ) { // MAX_TOP ) {
lowestScoreTermIdx = 0;
for ( int32_t k = 1 ; k < m_realMaxTop;k++){//MAX_TOP;k++
if( bestScores[k] > bestScores[lowestScoreTermIdx]) {
continue;
}
lowestScoreTermIdx = k;
}
}
advance(firstj,wpj,endj,helper2);
}
} // for(;;)
if(g_conf.m_logTracePosdb) {
logTrace(g_conf.m_logTracePosdb, "numTop=%d", numTop);
for(int i=0; i<numTop; i++)
logTrace(g_conf.m_logTracePosdb, " bestScores[%d]=%f", i, bestScores[i]);
}
// add up the top scores
float sum = 0.0;
for ( int32_t k = 0 ; k < numTop ; k++ ) {
sum += bestScores[k];
}
if (m_debug) {
for ( int32_t k = 0 ; k < numTop ; k++ )
log(LOG_INFO, "posdb: best score #%" PRId32" = %f",k,bestScores[k]);
log(LOG_INFO, "posdb: best score sum = %f",sum);
}
// wiki phrase weight
sum *= wikiPhraseWeight;
if (m_debug) {
log(LOG_INFO, "posdb: best score final = %f",sum);
}
//
// end the loop. return now if not collecting scoring info.
//
if ( ! pdcs ) {
logTrace(g_conf.m_logTracePosdb, "END. sum=%.3f", sum);
return sum;
}
// none? wtf?
if ( numTop <= 0 ) {
logTrace(g_conf.m_logTracePosdb, "END. sum=%.3f", sum);
return sum;
}
//
// now store the PairScores into the m_pairScoreBuf for this
// top docid.
//
// point into buf
PairScore *px = (PairScore *)m_pairScoreBuf.getBufPtr();
int32_t need = sizeof(PairScore) * numTop;
// point to that
if ( pdcs->m_pairsOffset < 0 ) {
pdcs->m_pairsOffset = m_pairScoreBuf.length();
}
// reset this i guess
pdcs->m_pairScores = NULL;
// sanity
if ( m_pairScoreBuf.getAvail() < need ) {
// m_pairScores will be NULL
static bool s_first = true;
if ( s_first ) {
log("posdb: CRITICAL pair buf overflow");
}
s_first = false;
logTrace(g_conf.m_logTracePosdb, "END.");
return sum;
}
// increase buf ptr over this then
m_pairScoreBuf.incrementLength(need);
// set each of the top scoring terms individually
for ( int32_t k = 0 ; k < numTop ; k++, px++ ) {
pdcs->m_numPairs++;
memset(px,0,sizeof(*px));
const char *maxp1 = bestwpi[k];
const char *maxp2 = bestwpj[k];
float score = bestScores[k];
bool fixedDist = bestFixed[k];
score *= wikiPhraseWeight;
const int queryTermIndex1 = miniMergeBuffer->getTermIndexForBufferPos(maxp1);
const float termFreqWeight1 = m_q->m_qterms[queryTermIndex1].m_termFreqWeight;
const int queryTermIndex2 = miniMergeBuffer->getTermIndexForBufferPos(maxp2);
const float termFreqWeight2 = m_q->m_qterms[queryTermIndex2].m_termFreqWeight;
// we have to encode these bits into the mini merge now
if ( Posdb::getIsHalfStopWikiBigram(maxp1) ) {
score *= WIKI_BIGRAM_WEIGHT;
}
if ( Posdb::getIsHalfStopWikiBigram(maxp2) ) {
score *= WIKI_BIGRAM_WEIGHT;
}
//if ( m_bflags[i] & BF_HALFSTOPWIKIBIGRAM )
//if ( m_bflags[j] & BF_HALFSTOPWIKIBIGRAM )
// wiki phrase weight
px->m_finalScore = score;
px->m_wordPos1 = Posdb::getWordPos(maxp1);
px->m_wordPos2 = Posdb::getWordPos(maxp2);
px->m_isSynonym1 = Posdb::getIsSynonym(maxp1);
px->m_isSynonym2 = Posdb::getIsSynonym(maxp2);
px->m_isHalfStopWikiBigram1 = Posdb::getIsHalfStopWikiBigram(maxp1);
px->m_isHalfStopWikiBigram2 = Posdb::getIsHalfStopWikiBigram(maxp2);
px->m_diversityRank1 = Posdb::getDiversityRank(maxp1);
px->m_diversityRank2 = Posdb::getDiversityRank(maxp2);
px->m_wordSpamRank1 = Posdb::getWordSpamRank(maxp1);
px->m_wordSpamRank2 = Posdb::getWordSpamRank(maxp2);
px->m_hashGroup1 = Posdb::getHashGroup(maxp1);
px->m_hashGroup2 = Posdb::getHashGroup(maxp2);
px->m_qdist = qdist;
// bigram algorithm fix
//if ( px->m_wordPos1 == px->m_wordPos2 )
// px->m_wordPos2 += 2;
px->m_densityRank1 = Posdb::getDensityRank(maxp1);
px->m_densityRank2 = Posdb::getDensityRank(maxp2);
px->m_fixedDistance = fixedDist;
px->m_qtermNum1 = m_qtermNums[i];
px->m_qtermNum2 = m_qtermNums[j];
px->m_tfWeight1 = termFreqWeight1;
px->m_tfWeight2 = termFreqWeight2;
px->m_bflags1 = m_bflags[i];
px->m_bflags2 = m_bflags[j];
// flag it as in same wiki phrase
if ( almostEqualFloat(wikiPhraseWeight, (float)WIKI_WEIGHT) ) {
px->m_inSameWikiPhrase = 1;
}
else {
px->m_inSameWikiPhrase = 0;
}
#ifdef _VALGRIND_
VALGRIND_CHECK_MEM_IS_DEFINED(px,sizeof(*px));
#endif
// only log for debug if it is one result
if ( m_debug ) {
// log each one for debug
log(LOG_INFO, "posdb: result #%" PRId32" "
"i=%" PRId32" "
"j=%" PRId32" "
"termNum0=%" PRId32" "
"termNum1=%" PRId32" "
"finalscore=%f "
"tfw0=%f "
"tfw1=%f "
"fixeddist=%" PRId32" " // bool
"wikiPhraseWeight=%f "
"bflags0=%" PRId32" "
"bflags1=%" PRId32" "
"syn0=%" PRId32" "
"syn1=%" PRId32" "
"div0=%" PRId32" "
"div1=%" PRId32" "
"wspam0=%" PRId32" "
"wspam1=%" PRId32" "
"hgrp0=%s "
"hgrp1=%s "
"qdist=%" PRId32" "
"wpos0=%" PRId32" "
"wpos1=%" PRId32" "
"dens0=%" PRId32" "
"dens1=%" PRId32" ", k, i, j, px->m_qtermNum1, px->m_qtermNum2, score, termFreqWeight1,
termFreqWeight2, (int32_t) bestFixed[k], wikiPhraseWeight, (int32_t) m_bflags[i], (int32_t) m_bflags[j],
(int32_t) px->m_isSynonym1, (int32_t) px->m_isSynonym2, (int32_t) px->m_diversityRank1,
(int32_t) px->m_diversityRank2, (int32_t) px->m_wordSpamRank1, (int32_t) px->m_wordSpamRank2,
getHashGroupString(px->m_hashGroup1), getHashGroupString(px->m_hashGroup2), (int32_t) px->m_qdist,
(int32_t) px->m_wordPos1, (int32_t) px->m_wordPos2, (int32_t) px->m_densityRank1,
(int32_t) px->m_densityRank2
);
}
}
// do the same but for second bests! so seo.cpp's top term pairs
// algo can do a term insertion, and if that hurts the best pair
// the 2nd best might cover for it. ideally, we'd have all the term
// pairs for this algo, but i think we'll have to get by on just this.
logTrace(g_conf.m_logTracePosdb, "END. sum=%f", sum);
return sum;
}
//
//
// TRY TO SPEED IT UP!!!
//
//
// returns false and sets g_errno on error
bool PosdbTable::setQueryTermInfo ( ) {
logTrace(g_conf.m_logTracePosdb, "BEGIN.");
// alloc space. assume max
m_queryTermInfos.resize(m_q->m_numTerms);
#ifdef _VALGRIND_
VALGRIND_MAKE_MEM_UNDEFINED(&(m_queryTermInfos[0]), sizeof(m_queryTermInfos[0])*m_queryTermInfos.size());
#endif
int32_t nrg = 0;
m_hasMaxSerpScore = false;
if ( m_msg39req->m_minSerpDocId ) {
m_hasMaxSerpScore = true;
}
for ( int32_t i = 0 ; i < m_q->m_numTerms ; i++ ) {
QueryTerm *qt = &m_q->m_qterms[i];
logTrace(g_conf.m_logTracePosdb, "i=%" PRId32 ", term=[%.*s], required=%s", i, qt->m_termLen, qt->m_term, qt->m_isRequired?"true":"false");
if ( ! qt->m_isRequired ) {
continue;
}
// set this stuff
const QueryWord *qw = qt->m_qword;
// get one
QueryTermInfo *qti = &(m_queryTermInfos[nrg]);
// and set it
qti->m_qtermNum = i;
qti->m_qterm = qt;
// this is not good enough, we need to count
// non-whitespace punct as 2 units not 1 unit
// otherwise qdist gets thrown off and our phrasing fails.
// so use QueryTerm::m_qpos just for this.
qti->m_qpos = qw->m_posNum;
qti->m_wikiPhraseId = qw->m_wikiPhraseId;
qti->m_quotedStartId = qw->m_quoteStart;
// count
int32_t nn = 0;
// also add in bigram lists
int32_t left = qt->m_leftPhraseTermNum;
int32_t right = qt->m_rightPhraseTermNum;
// terms
const QueryTerm *leftTerm = qt->m_leftPhraseTerm;
const QueryTerm *rightTerm = qt->m_rightPhraseTerm;
bool leftAlreadyAdded = false;
bool rightAlreadyAdded = false;
//
// add the non-bigram list AFTER the
// bigrams, which we like to do when we PREFER the bigram
// terms because they are scored higher, specifically, as
// in the case of being half stop wikipedia phrases like
// "the tigers" for the query 'the tigers' we want to give
// a slight bonus, 1.20x, for having that bigram since its
// in wikipedia
//
//
// add left bigram lists. BACKWARDS.
//
if ( left>=0 && leftTerm && leftTerm->m_isWikiHalfStopBigram ){
// assume added
leftAlreadyAdded = true;
// get list
RdbList *list = m_q->m_qterms[left].m_posdbListPtr;
// add list ptr into our required group
qti->m_subList[nn].m_qt = &m_q->m_qterms[left];
qti->m_subList[nn].m_list = list;
// special flags
qti->m_subList[nn].m_bigramFlag = BF_HALFSTOPWIKIBIGRAM;
// before a pipe operator?
if ( qt->m_piped ) qti->m_subList[nn].m_bigramFlag |= BF_PIPED;
// add list of member terms as well
m_q->m_qterms[left].m_bitNum = nrg;
// only really add if useful
if ( list && !list->isEmpty() ) {
nn++;
}
// add bigram synonyms! like "new jersey" bigram
// has the synonym "nj"
for ( int32_t k = 0 ; k < m_q->m_numTerms ; k++ ) {
QueryTerm *bt = &m_q->m_qterms[k];
if ( bt->m_synonymOf != leftTerm ) {
continue;
}
list = m_q->m_qterms[k].m_posdbListPtr;
qti->m_subList[nn].m_qt = bt;
qti->m_subList[nn].m_list = list;
qti->m_subList[nn].m_bigramFlag = 0;
qti->m_subList[nn].m_bigramFlag |= BF_HALFSTOPWIKIBIGRAM;
qti->m_subList[nn].m_bigramFlag |= BF_SYNONYM;
if (qt->m_piped) {
qti->m_subList[nn].m_bigramFlag |= BF_PIPED;
}
// add list of member terms as well
bt->m_bitNum = nrg;
if ( list && !list->isEmpty() ) {
nn++;
}
}
}
//
// then the right bigram if also in a wiki half stop bigram
//
if ( right >=0 && rightTerm && rightTerm->m_isWikiHalfStopBigram ){
// assume added
rightAlreadyAdded = true;
// get list
RdbList *list = m_q->m_qterms[right].m_posdbListPtr;
// add list ptr into our required group
qti->m_subList[nn].m_qt = &m_q->m_qterms[right];
qti->m_subList[nn].m_list = list;
// special flags
qti->m_subList[nn].m_bigramFlag = BF_HALFSTOPWIKIBIGRAM;
// before a pipe operator?
if ( qt->m_piped ) qti->m_subList[nn].m_bigramFlag |= BF_PIPED;
// add list of member terms as well
m_q->m_qterms[right].m_bitNum = nrg;
// only really add if useful
if ( list && !list->isEmpty() ) {
nn++;
}
// add bigram synonyms! like "new jersey" bigram
// has the synonym "nj"
for ( int32_t k = 0 ; k < m_q->m_numTerms ; k++ ) {
QueryTerm *bt = &m_q->m_qterms[k];
if ( bt->m_synonymOf != rightTerm ) {
continue;
}
list = m_q->m_qterms[k].m_posdbListPtr;
qti->m_subList[nn].m_qt = bt;
qti->m_subList[nn].m_list = list;
qti->m_subList[nn].m_bigramFlag = 0;
qti->m_subList[nn].m_bigramFlag |= BF_HALFSTOPWIKIBIGRAM;
qti->m_subList[nn].m_bigramFlag |= BF_SYNONYM;
if (qt->m_piped) {
qti->m_subList[nn].m_bigramFlag |= BF_PIPED;
}
// add list of member terms as well
bt->m_bitNum = nrg;
if ( list && !list->isEmpty() ) {
nn++;
}
}
}
//
// then the non-bigram termlist
//
// add to it. add backwards since we give precedence to
// the first list and we want that to be the NEWEST list!
RdbList *list = m_q->m_qterms[i].m_posdbListPtr;
// add list ptr into our required group
qti->m_subList[nn].m_qt = qt;
qti->m_subList[nn].m_list = list;
// special flags
qti->m_subList[nn].m_bigramFlag = 0;
// before a pipe operator?
if ( qt->m_piped )
qti->m_subList[nn].m_bigramFlag |= BF_PIPED;
// is it a negative term?
if ( qt->m_termSign=='-')
qti->m_subList[nn].m_bigramFlag |= BF_NEGATIVE;
// add list of member terms
qt->m_bitNum = nrg;
// only really add if useful
// no, because when inserting NEW (related) terms that are
// not currently in the document, this list may initially
// be empty.
if ( list && !list->isEmpty() ) {
nn++;
}
//
// add left bigram now if not added above
//
if ( left >= 0 && ! leftAlreadyAdded ) {
// get list
list = m_q->m_qterms[left].m_posdbListPtr;
// add list ptr into our required group
qti->m_subList[nn].m_qt = &m_q->m_qterms[left];
qti->m_subList[nn].m_list = list;
// special flags
qti->m_subList[nn].m_bigramFlag = BF_BIGRAM;
// before a pipe operator?
if ( qt->m_piped ) qti->m_subList[nn].m_bigramFlag |= BF_PIPED;
// add list of member terms as well
m_q->m_qterms[left].m_bitNum = nrg;
// only really add if useful
if ( list && !list->isEmpty() ) {
nn++;
}
// add bigram synonyms! like "new jersey" bigram
// has the synonym "nj"
for ( int32_t k = 0 ; k < m_q->m_numTerms ; k++ ) {
QueryTerm *bt = &m_q->m_qterms[k];
if ( bt->m_synonymOf != leftTerm ) {
continue;
}
list = m_q->m_qterms[k].m_posdbListPtr;
qti->m_subList[nn].m_qt = bt;
qti->m_subList[nn].m_list = list;
qti->m_subList[nn].m_bigramFlag = 0;
qti->m_subList[nn].m_bigramFlag |= BF_SYNONYM;
if (qt->m_piped) {
qti->m_subList[nn].m_bigramFlag |= BF_PIPED;
}
// add list of member terms as well
bt->m_bitNum = nrg;
if ( list && !list->isEmpty() ) {
nn++;
}
}
}
//
// add right bigram now if not added above
//
if ( right >= 0 && ! rightAlreadyAdded ) {
// get list
list = m_q->m_qterms[right].m_posdbListPtr;
// add list ptr into our required group
qti->m_subList[nn].m_qt = &m_q->m_qterms[right];
qti->m_subList[nn].m_list = list;
// special flags
qti->m_subList[nn].m_bigramFlag = BF_BIGRAM;
// before a pipe operator?
if ( qt->m_piped ) qti->m_subList[nn].m_bigramFlag |= BF_PIPED;
// add list of member terms as well
m_q->m_qterms[right].m_bitNum = nrg;
// only really add if useful
if ( list && !list->isEmpty() ) {
nn++;
}
// add bigram synonyms! like "new jersey" bigram
// has the synonym "nj"
for ( int32_t k = 0 ; k < m_q->m_numTerms ; k++ ) {
QueryTerm *bt = &m_q->m_qterms[k];
if ( bt->m_synonymOf != rightTerm ) {
continue;
}
list = m_q->m_qterms[k].m_posdbListPtr;
qti->m_subList[nn].m_qt = bt;
qti->m_subList[nn].m_list = list;
qti->m_subList[nn].m_bigramFlag = 0;
qti->m_subList[nn].m_bigramFlag |= BF_SYNONYM;
if (qt->m_piped) {
qti->m_subList[nn].m_bigramFlag |= BF_PIPED;
}
// add list of member terms as well
//qti->m_qtermList[nn] = bt;
bt->m_bitNum = nrg;
if ( list && !list->isEmpty() ) {
nn++;
}
}
}
//
// ADD SYNONYM TERMS
//
for ( int32_t k = 0 ; k < m_q->m_numTerms ; k++ ) {
QueryTerm *qt2 = &m_q->m_qterms[k];
const QueryTerm *st = qt2->m_synonymOf;
// skip if not a synonym of this term
if ( st != qt ) {
continue;
}
// its a synonym, add it!
list = m_q->m_qterms[k].m_posdbListPtr;
// add list ptr into our required group
qti->m_subList[nn].m_qt = qt2;
qti->m_subList[nn].m_list = list;
// special flags
qti->m_subList[nn].m_bigramFlag = BF_SYNONYM;
// before a pipe operator?
if ( qt->m_piped ) qti->m_subList[nn].m_bigramFlag |= BF_PIPED;
// set bitnum here i guess
qt2->m_bitNum = nrg;
// only really add if useful
if ( list && !list->isEmpty() ) {
nn++;
}
}
// store # lists in required group. nn might be zero!
qti->m_numSubLists = nn;
// crazy?
if ( nn >= MAX_SUBLISTS ) {
log("query: too many sublists. %" PRId32" >= %" PRId32,
nn,(int32_t)MAX_SUBLISTS);
logTrace(g_conf.m_logTracePosdb, "END.");
g_errno = EQUERYTOOBIG;
return false;
}
// count # required groups
nrg++;
}
//Detect the cases where the base term to the left of the qti-baseterm is ignored. This is used by getBestScoreSumForSingleTerm()
//to ensure that bigram match boosts are not lost.
for(int i=0; i<nrg; i++) {
QueryTermInfo &qti = m_queryTermInfos[i];
qti.m_leftTermIsIgnored = false;
if(m_queryTermInfos[i].m_qterm->m_leftPhraseTerm) {
for(int j=0; j<m_q->m_numTerms; j++) {
QueryTerm *qt = &m_q->m_qterms[j];
if(qt==m_queryTermInfos[i].m_qterm->m_leftPhraseTerm) {
if(qt->m_qword->m_ignoreWord!=IGNORE_NO_IGNORE)
qti.m_leftTermIsIgnored = true;
break; //assumption: only one bigram per term. May not be true when bigram-synonyms are added by word variations
}
}
}
}
//
// get the query term with the least data in posdb including syns
//
m_minTermListSize = 0;
m_minTermListIdx = -1;
int64_t grand = 0LL;
for ( int32_t i = 0 ; i < nrg ; i++ ) {
// compute total sizes
QueryTermInfo *qti = &(m_queryTermInfos[i]);
// do not consider for first termlist if negative
if ( qti->m_subList[0].m_bigramFlag & BF_NEGATIVE ) {
continue;
}
int64_t totalSubListsSize = 0;
for(int32_t q = 0; q < qti->m_numSubLists; q++) {
const RdbList *l = qti->m_subList[q].m_list;
totalSubListsSize += l->getListSize();
}
// add up this now
grand += totalSubListsSize;
// get min
if ( totalSubListsSize < m_minTermListSize || m_minTermListIdx == -1 ) {
m_minTermListSize = totalSubListsSize;
m_minTermListIdx = i;
}
}
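// m_minTermListIdx now names the required term whose combined sublists are
// smallest; its docids seed m_docIdVoteBuf in findCandidateDocIds() below.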
// bad! ringbuf[] was not designed for this nonsense!
if ( m_minTermListIdx >= 255 ) {
gbshutdownAbort(true);
}
// set this for caller to use to loop over the queryterminfos
m_numQueryTermInfos = nrg;
if(g_conf.m_logTracePosdb) {
logTrace(g_conf.m_logTracePosdb, "m_numQueryTermInfos=%d", m_numQueryTermInfos);
for(int i=0; i<m_numQueryTermInfos; i++) {
const QueryTermInfo *qti = &(m_queryTermInfos[i]);
logTrace(g_conf.m_logTracePosdb, " qti[%d]: m_numSubLists=%d m_qtermNum=%d m_qpos=%d m_leftTermIsIgnored=%s", i, qti->m_numSubLists, qti->m_qtermNum, qti->m_qpos, qti->m_leftTermIsIgnored?"true":"false");
for(int j=0; j<qti->m_numSubLists; j++)
logTrace(g_conf.m_logTracePosdb, " sublist %d = %p", j, qti->m_subList[j].m_list);
}
}
//check if any of the terms are not being used. This can happen if query was truncated and some bigrams or synonyms were included while the base word was not.
std::vector<bool> termUsed(m_q->m_numTerms);
for(int i=0; i<m_numQueryTermInfos; i++) {
const QueryTermInfo *qti = &(m_queryTermInfos[i]);
for(int j=0; j<qti->m_numSubLists; j++) {
const QueryTerm *qt = qti->m_subList[j].m_qt;
int qtermNum = qt - m_q->m_qterms;
termUsed[qtermNum] = true;
}
}
for(int i=0; i<m_q->m_numTerms; i++) {
const QueryTerm *qt = &m_q->m_qterms[i];
logTrace(g_conf.m_logTracePosdb,"termUsed[%d]=%s (%.*s)", i, termUsed[i]?"true":"false", qt->m_termLen,qt->m_term);
if(!termUsed[i] && !qt->m_ignored && !qt->m_isQueryStopWord && qt->m_posdbListPtr && !qt->m_posdbListPtr->isEmpty())
log(LOG_DEBUG, "posdb: unused term found #%d '%.*s' in query '%.*s'. Was query truncated?", i, qt->m_termLen, qt->m_term, m_q->getQueryLen(), m_q->getQuery());
}
// . m_minTermListSize is set in setQueryTermInfo()
// . how many docids do we have at most in the intersection?
// . all keys are of same termid, so they are 12 or 6 bytes compressed
// . assume 12 if each is a different docid
int32_t maxDocIds = m_minTermListSize / 12;
// store all intersected docids in here for new algo plus 1 byte vote
int32_t need = maxDocIds * 6;
// they could all be OR'd together!
if ( m_q->m_isBoolean ) need = grand;
// so we can always cast a int64_t from a ptr in there
// for setting m_docId when m_booleanQuery is true below
need += 8;
// get max # of docids we got in an intersection from all the lists
if ( ! m_docIdVoteBuf.reserve ( need,"divbuf" ) ) {
logTrace(g_conf.m_logTracePosdb, "END.");
return false;
}
// for a boolean query, also allocate the hashtable that maps each docid
// to its boolean bit vector, where each bit stands for an operand, so we
// can quickly evaluate the bit vector against a truth table.
// CRAP, can't use min list size because it might be behind a
// NOT operator!!! then we core trying to realloc m_bt in a thread
// below when trying to grow it. they could all be OR'd together
// so alloc the most!
int32_t maxSlots = (grand/12) * 2;
// try to speed up. this doesn't *seem* to matter, so i took out:
//maxSlots *= 2;
// get total operands we used
//int32_t numOperands = m_q->m_numWords;//Operands;
// a quoted phrase counts as a single operand
// . QueryTerm::m_bitNum <== m_numQueryTermInfos
// . each queryTermInfo class corresponds to one bit in our bit vec
// . essentially each queryTermInfo is a query term, but it has
// all the synonym and word forms for that query term, etc.
m_vecSize = m_numQueryTermInfos / 8;//numOperands / 8 ;
// allow an extra byte for remainders
if ( m_numQueryTermInfos % 8 ) m_vecSize++;
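// e.g. 9 query term infos -> m_vecSize = 2 bytes (one bit per term info,
// rounded up to a whole byte)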
// now preallocate the hashtable. 0 niceness.
if ( m_q->m_isBoolean && // true = useKeyMagic
! m_bt.set (8,m_vecSize,maxSlots,NULL,0,false,"booltbl",true)) {
logTrace(g_conf.m_logTracePosdb, "END.");
return false;
}
// . m_ct maps a boolean "bit vector" to a true/false value
// . each "bit" in the "bit vector" indicates if docid has that
// particular query term
if ( m_q->m_isBoolean && // true = useKeyMagic
! m_ct.set (8,1,maxSlots,NULL,0,false,"booltbl",true)) {
logTrace(g_conf.m_logTracePosdb, "END.");
return false;
}
#ifdef _VALGRIND_
//various arrays are over-allocated. tell valgrind what is not defined content
for(int i=0; i<m_numQueryTermInfos; i++) {
QueryTermInfo &qti = m_queryTermInfos[i];
VALGRIND_MAKE_MEM_UNDEFINED(qti.m_subList+qti.m_numSubLists, sizeof(qti.m_subList[0])*(MAX_SUBLISTS-qti.m_numSubLists));
}
for(int i=m_numQueryTermInfos; i<m_q->m_numTerms; i++)
VALGRIND_MAKE_MEM_UNDEFINED(&(m_queryTermInfos[i]), sizeof(QueryTermInfo));
#endif
logTrace(g_conf.m_logTracePosdb, "END.");
return true;
}
bool PosdbTable::findCandidateDocIds() {
int64_t lastTime = gettimeofdayInMilliseconds();
//
// Swap the top 6 bytes of each list.
//
// This gives a contiguous list of 12-byte docid/score entries
// because the first record is always 18 bytes (termid, docid, score)
// and the rest are 6 bytes due to our termid compression.
//
// This makes the lists much much easier to work with, but we have
// to remember to swap back when done! (but we dont...)
//
for ( int32_t k = 0 ; k < m_q->m_numTerms ; k++ ) {
// count
int64_t total = 0LL;
// get the list
RdbList *list = m_q->m_qterms[k].m_posdbListPtr;
// skip if null or empty
if(list && !list->isEmpty()) {
// tally
total += list->getListSize();
// point to start
char *p = list->getList();
// remember to swap back when done!!
char ttt[12];
memcpy ( ttt , p , 12 );
memcpy ( p , p + 12 , 6 );
memcpy ( p + 6 , ttt , 12 );
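// the 6 termid bytes now sit at p[0..5] and the original low 12 bytes at
// p[6..17]; advancing p by 6 below leaves a clean 12-byte first key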
// point to the low "hks" bytes now, skipping the termid
p += 6;
// turn half bit on. first key is now 12 bytes!!
*p |= 0x02;
// MANGLE the list
list->setListSize(list->getListSize() - 6);
list->setList(p);
logTrace(g_conf.m_logTracePosdb, "termList #%" PRId32" totalSize=%" PRId64, k, total);
// print total list sizes
if ( m_debug ) {
log(LOG_INFO, "query: termlist #%" PRId32" totalSize=%" PRId64, k, total);
}
}
}
// setQueryTermInfos() should have set how many we have
if ( m_numQueryTermInfos == 0 ) {
log(LOG_DEBUG, "query: NO REQUIRED TERMS IN QUERY!");
logTrace(g_conf.m_logTracePosdb, "END, m_numQueryTermInfos = 0");
return false;
}
// . if smallest required list is empty, 0 results
// . also set in setQueryTermInfo
if ( m_minTermListSize == 0 && ! m_q->m_isBoolean ) {
logTrace(g_conf.m_logTracePosdb, "END, m_minTermListSize = 0 and not boolean");
return false;
}
int32_t listGroupNum = 0;
// if all non-negative query terms are in the same wikiphrase then
// we can apply the WIKI_WEIGHT in getMaxPossibleScore() which
// should help us speed things up!
// @@@ BR: Why should this give a boost at all.. ?
m_allInSameWikiPhrase = true;
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// get it
const QueryTermInfo *qti = &(m_queryTermInfos[i]);
if(qti->m_numSubLists==0)
continue; //ignored word or stopword.
// skip if negative query term
if ( qti->m_subList[0].m_bigramFlag & BF_NEGATIVE ) {
continue;
}
// set it
if ( qti->m_wikiPhraseId == 1 ) {
continue; //@@@ BR: Why only check for id = 1 ??
}
// stop
m_allInSameWikiPhrase = false;
break;
}
logTrace(g_conf.m_logTracePosdb, "m_allInSameWikiPhrase: %s", (m_allInSameWikiPhrase?"true":"false") );
// for boolean queries we scan every docid in all termlists,
// then we see what query terms it has, and make a bit vector for it.
// then use a hashtable to map that bit vector to a true or false
// as to whether we should include it in the results or not.
// we use Query::getBitScore(qvec_t ebits) to evaluate a docid's
// query term explicit term bit vector.
if ( m_q->m_isBoolean ) {
logTrace(g_conf.m_logTracePosdb, "makeDocIdVoteBufForBoolQuery");
// keeping the docids sorted is the challenge here...
makeDocIdVoteBufForBoolQuery();
}
else {
// create "m_docIdVoteBuf" filled with just the docids from the
// smallest group of sublists (one term, all variations).
//
// m_minTermListIdx is the queryterminfo that had the smallest total
// sublist sizes of m_minTermListSize. this was set in
// setQueryTermInfos()
//
// if all these sublist termlists were 50MB i'd say 10-25ms to
// add their docid votes.
logTrace(g_conf.m_logTracePosdb, "addDocIdVotes");
addDocIdVotes ( &m_queryTermInfos[m_minTermListIdx], listGroupNum );
// now repeat the docid scan for successive lists but only
// inc the docid count for docids we match. docids that are
// in m_docIdVoteBuf but not in sublist group #i will be removed
// from m_docIdVoteBuf.
//
// worst case scenario with termlists limited
// to 30MB will be about 10MB of docid vote buf, but it should
// shrink quite a bit every time we call addDocIdVotes() with a
// new group of sublists for a query term. but scanning 10MB should
// be pretty fast since gk0 does like 4GB/s of main memory reads.
// i would think scanning and docid voting for 200MB of termlists
// should take like 50-100ms
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// skip if we did it above when allocating the vote buffer
if ( i == m_minTermListIdx ) {
continue;
}
// get it
const QueryTermInfo *qti = &(m_queryTermInfos[i]);
// do not consider for adding if negative ('my house -home')
if ( qti->m_subList[0].m_bigramFlag & BF_NEGATIVE ) {
continue;
}
// inc the group number ("score"), which will be set on the docids
// in addDocIdVotes if they match.
listGroupNum++;
// if it hits 256 then wrap back down to 1
if ( listGroupNum >= 256 ) {
listGroupNum = 1;
}
// add it
addDocIdVotes ( qti, listGroupNum );
}
logTrace(g_conf.m_logTracePosdb, "Added DocIdVotes");
//
// remove the negative query term docids from our docid vote buf
//
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// skip if we did it above
if ( i == m_minTermListIdx ) {
continue;
}
// get it
const QueryTermInfo *qti = &(m_queryTermInfos[i]);
// do not consider for adding if negative ('my house -home')
if ( ! (qti->m_subList[0].m_bigramFlag & BF_NEGATIVE) ) {
continue;
}
// delete the docid from the vote buffer
delDocIdVotes ( qti );
}
logTrace(g_conf.m_logTracePosdb, "Removed DocIdVotes for negative query terms");
}
if ( m_debug ) {
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - lastTime;
log(LOG_INFO, "posdb: new algo phase (find matching docids) took %" PRId64" ms", took);
lastTime = now;
}
//
// NOW FILTER EVERY SUBLIST to just the docids in m_docIdVoteBuf.
// Filter in place so it is destructive. i would think
// that doing a filter on 200MB of termlists wouldn't be more than
// 50-100ms since we can read 4GB/s from main memory.
//
delNonMatchingDocIdsFromSubLists();
logTrace(g_conf.m_logTracePosdb, "Shrunk SubLists");
if(g_conf.m_logTracePosdb) {
log(LOG_TRACE,"Shrunk sublists, m_numQueryTermInfos=%d", m_numQueryTermInfos);
for(int i=0; i<m_numQueryTermInfos; i++) {
log(LOG_TRACE," qti #%d: m_numSubLists=%d", i, m_queryTermInfos[i].m_numSubLists);
for(int j=0; j<m_queryTermInfos[i].m_numSubLists; j++)
log(LOG_TRACE," list #%d: m_qt=%p (%*.*s) m_list=%p m_bigramFlag=%02x", j, m_queryTermInfos[i].m_subList[j].m_qt, m_queryTermInfos[i].m_subList[j].m_qt->m_termLen, m_queryTermInfos[i].m_subList[j].m_qt->m_termLen, m_queryTermInfos[i].m_subList[j].m_qt->m_term, m_queryTermInfos[i].m_subList[j].m_list, m_queryTermInfos[i].m_subList[j].m_bigramFlag);
log(LOG_TRACE," m_numMatchingSubLists=%d", m_queryTermInfos[i].m_numMatchingSubLists);
for(int j=0; j<m_queryTermInfos[i].m_numMatchingSubLists; j++)
log(LOG_TRACE," matchlist #%d: %d bytes %p - %p baseidx=%d", j, m_queryTermInfos[i].m_matchingSublist[j].m_size, m_queryTermInfos[i].m_matchingSublist[j].m_start, m_queryTermInfos[i].m_matchingSublist[j].m_end, m_queryTermInfos[i].m_matchingSublist[j].m_baseSubListIndex);
}
}
if ( m_debug ) {
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - lastTime;
log(LOG_INFO, "posdb: new algo phase (shrink sublists) took %" PRId64" ms", took);
}
return true;
}
bool PosdbTable::genDebugScoreInfo1(int32_t *numProcessed, int32_t *topCursor, bool *docInThisFile) {
*docInThisFile = false;
// did we get enough score info?
if ( *numProcessed >= m_msg39req->m_docsToGet ) {
logTrace(g_conf.m_logTracePosdb, "Too many docs processed. m_docId=%" PRId64 ". Reached msg39 m_docsToGet: %" PRId32 ", SKIPPED", m_docId, *numProcessed);
return true;
}
// loop back up here if the docid is from a previous range
nextNode:
// this basically means the top tree is empty
if ( *topCursor == -1 ) {
logTrace(g_conf.m_logTracePosdb, "topCursor is -1, SKIPPED");
return true;
}
// get the #1 docid/score node #
if ( *topCursor == -9 ) {
// if our query had a quoted phrase, might have had no
// docids in the top tree! getHighNode() can't handle
// that so handle it here
if ( m_topTree->getNumUsedNodes() == 0 ) {
logTrace(g_conf.m_logTracePosdb, "Num used nodes is 0, SKIPPED");
return true;
}
// otherwise, initialize topCursor
*topCursor = m_topTree->getHighNode();
}
// get current node
TopNode *tn = m_topTree->getNode ( *topCursor );
// advance
*topCursor = m_topTree->getPrev ( *topCursor );
// shortcut
m_docId = tn->m_docId;
// skip if not in our range! the top tree now holds
// all the winners from previous docid ranges. msg39
// now does the search result in docid range chunks to avoid
// OOM conditions.
if ( m_msg39req->m_minDocId != -1 &&
m_msg39req->m_maxDocId != -1 &&
( m_docId < (uint64_t)m_msg39req->m_minDocId ||
m_docId >= (uint64_t)m_msg39req->m_maxDocId ) ) {
logTrace(g_conf.m_logTracePosdb, "DocId %" PRIu64 " does not match docId range %" PRIu64 " - %" PRIu64 ", SKIPPED", m_docId, (uint64_t)m_msg39req->m_minDocId, (uint64_t)m_msg39req->m_maxDocId);
goto nextNode;
}
*docInThisFile = m_documentIndexChecker->exists(m_docId);
if( !(*docInThisFile) ) {
logTrace(g_conf.m_logTracePosdb, "DocId %" PRId64 " is not in this file, SKIPPED", m_docId);
return false;
}
logTrace(g_conf.m_logTracePosdb, "DocId %" PRId64 " - setting up score buffer", m_docId);
(*numProcessed)++;
// set query termlists in all sublists
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// get it
QueryTermInfo *qti = &(m_queryTermInfos[i]);
// do not advance negative termlist cursor
if ( qti->m_numSubLists>0 && qti->m_subList[0].m_bigramFlag & BF_NEGATIVE ) {
continue;
}
// do each sublist
for ( int32_t j = 0 ; j < qti->m_numMatchingSubLists ; j++ ) {
// get termlist for that docid
const char *xlist = qti->m_matchingSublist[j].m_start;
const char *xlistEnd = qti->m_matchingSublist[j].m_end;
const char *xp = getWordPosList ( m_docId, xlist, xlistEnd - xlist);
// not there? xlist will be NULL
qti->m_matchingSublist[j].m_savedCursor = xp;
// if not there make cursor NULL as well
if ( ! xp ) {
qti->m_matchingSublist[j].m_cursor = NULL;
continue;
}
// skip over the 12-byte docid key
xp += 12;
for ( ; ; ) {
// do not breach sublist
if ( xp >= xlistEnd ) {
break;
}
// break if 12 byte key: another docid!
if ( !(xp[0] & 0x04) ) {
break;
}
// skip over 6 byte key
xp += 6;
}
// point to docid sublist end
qti->m_matchingSublist[j].m_cursor = xp;
}
}
return false;
}
bool PosdbTable::genDebugScoreInfo2(DocIdScore *dcs, int32_t *lastLen, uint64_t *lastDocId, char siteRank, float score, int32_t intScore, lang_t docLang) {
dcs->m_siteRank = siteRank;
dcs->m_finalScore = score;
dcs->m_docId = m_docId;
dcs->m_numRequiredTerms = m_numQueryTermInfos;
dcs->m_docLang = docLang;
logTrace(g_conf.m_logTracePosdb, "m_docId=%" PRId64 ", *lastDocId=%" PRId64 "", m_docId, *lastDocId);
// ensure there is enough room; we can't allocate in a thread!
if ( m_scoreInfoBuf.getAvail()<(int32_t)sizeof(DocIdScore)+1) {
logTrace(g_conf.m_logTracePosdb, "END, NO ROOM");
return true;
}
// if same as last docid, overwrite it since we have a higher
// siterank or langid i guess
if ( m_docId == *lastDocId ) {
m_scoreInfoBuf.m_length = *lastLen;
}
// save that
int32_t len = m_scoreInfoBuf.m_length;
// copy into the safebuf for holding the scoring info
#ifdef _VALGRIND_
VALGRIND_CHECK_MEM_IS_DEFINED(&dcs,sizeof(dcs));
#endif
m_scoreInfoBuf.safeMemcpy ( (char *)dcs, sizeof(DocIdScore) );
// save that
*lastLen = len;
// save it
*lastDocId = m_docId;
// try to fix dup docid problem! it was hitting the
// overflow check right above here... strange!!!
//m_docIdTable.removeKey ( &docId );
/////////////////////////////
//
// . docid range split HACK...
// . if we are doing docid range splitting, we process
// the search results separately in disjoint docid ranges.
// . so because we still use the same m_scoreInfoBuf etc.
// for each split we process, we must remove info from
// a top docid of a previous split that got supplanted by
// a docid of this docid-range split, so we do not breach
// the allocated buffer size.
// . so find out which docid we replaced
// in the top tree, and replace his entry in scoreinfobuf
// as well!
// . his info was already added to m_pairScoreBuf in the
// getTermPairScoreForAny() function
//
//////////////////////////////
// the top tree remains persistent between docid ranges.
// and so does the score buf. so we must replace scores
// if the docid gets replaced by a better scoring docid
// from a following docid range of different docids.
// However, scanning the docid score buffer each time is
// really slow, so we need to get the kicked out docid
// from the top tree and use that to store its offset
// into m_scoreInfoBuf so we can remove it.
DocIdScore *si;
// only kick out docids from the score buffer when there
// is no room left...
if ( m_scoreInfoBuf.getAvail() >= (int)sizeof(DocIdScore ) ) {
logTrace(g_conf.m_logTracePosdb, "END, OK. m_docId=%" PRId64 ", *lastDocId=%" PRId64 "", m_docId, *lastDocId);
return true;
}
char *sx = m_scoreInfoBuf.getBufStart();
char *sxEnd = sx + m_scoreInfoBuf.length();
// if we have not supplanted anyone yet, be on our way
for ( ; sx < sxEnd ; sx += sizeof(DocIdScore) ) {
si = (DocIdScore *)sx;
// if top tree no longer has this docid, we must
// remove its associated scoring info so we do not
// breach our scoring info bufs
if ( ! m_topTree->hasDocId( si->m_docId ) ) {
break;
}
}
// might not be full yet
if ( sx >= sxEnd ) {
logTrace(g_conf.m_logTracePosdb, "END, OK 2");
return true;
}
// must be there!
if ( ! si ) {
gbshutdownAbort(true);
}
// note it because it is slow
// this is only used if getting score info, which is
// not default when getting an xml or json feed
logTrace(g_conf.m_logTracePosdb, "Kicking out docid %" PRId64" from score buf", si->m_docId);
// get his single and pair offsets
int32_t pairOffset = si->m_pairsOffset;
int32_t pairSize = si->m_numPairs * sizeof(PairScore);
int32_t singleOffset = si->m_singlesOffset;
int32_t singleSize = si->m_numSingles * sizeof(SingleScore);
// nuke him
m_scoreInfoBuf .removeChunk1 ( sx, sizeof(DocIdScore) );
// and his related info
m_pairScoreBuf .removeChunk2 ( pairOffset , pairSize );
m_singleScoreBuf.removeChunk2 ( singleOffset , singleSize );
// the score info buffer shrank by one entry
sxEnd -= sizeof(DocIdScore);
// adjust offsets of remaining single scores
sx = m_scoreInfoBuf.getBufStart();
for ( ; sx < sxEnd ; sx += sizeof(DocIdScore) ) {
si = (DocIdScore *)sx;
if ( si->m_pairsOffset > pairOffset ) {
si->m_pairsOffset -= pairSize;
}
if ( si->m_singlesOffset > singleOffset ) {
si->m_singlesOffset -= singleSize;
}
}
// adjust this too!
*lastLen -= sizeof(DocIdScore);
logTrace(g_conf.m_logTracePosdb, "Returning false");
return false;
}
void PosdbTable::logDebugScoreInfo(int32_t loglevel) {
logTrace(g_conf.m_logTracePosdb, "BEGIN");
const char *sx = m_scoreInfoBuf.getBufStart();
const char *sxEnd = sx + m_scoreInfoBuf.length();
log(loglevel, "DocId scores in m_scoreInfoBuf:");
for ( ; sx < sxEnd ; sx += sizeof(DocIdScore) ) {
const DocIdScore *si = (const DocIdScore *)sx;
log(loglevel, " docId: %14" PRIu64 ", score: %f", si->m_docId, si->m_finalScore);
// if top tree no longer has this docid, we must
// remove its associated scoring info so we do not
// breach our scoring info bufs
if ( ! m_topTree->hasDocId( si->m_docId ) ) {
log(loglevel, " ^ NOT in topTree anymore!");
}
}
logTrace(g_conf.m_logTracePosdb, "END");
}
void PosdbTable::removeScoreInfoForDeletedDocIds() {
logTrace(g_conf.m_logTracePosdb, "BEGIN");
char *sx = m_scoreInfoBuf.getBufStart();
char *sxEnd = sx + m_scoreInfoBuf.length();
for ( ; sx < sxEnd ; sx += sizeof(DocIdScore) ) {
DocIdScore *si = (DocIdScore *)sx;
// if top tree no longer has this docid, we must
// remove its associated scoring info so we do not
// breach our scoring info bufs
if ( m_topTree->hasDocId( si->m_docId ) ) {
continue;
}
logTrace(g_conf.m_logTracePosdb, "Removing old score info for docId %" PRId64 "", si->m_docId);
// get his single and pair offsets
int32_t pairOffset = si->m_pairsOffset;
int32_t pairSize = si->m_numPairs * sizeof(PairScore);
int32_t singleOffset = si->m_singlesOffset;
int32_t singleSize = si->m_numSingles * sizeof(SingleScore);
// nuke him
m_scoreInfoBuf .removeChunk1 ( sx, sizeof(DocIdScore) );
// and his related info
m_pairScoreBuf .removeChunk2 ( pairOffset , pairSize );
m_singleScoreBuf.removeChunk2 ( singleOffset , singleSize );
// the score info buffer shrank by one entry
sxEnd -= sizeof(DocIdScore);
// adjust offsets of the remaining entries (use a separate cursor so the
// outer loop's position is not clobbered)
for ( char *sy = m_scoreInfoBuf.getBufStart() ; sy < sxEnd ; sy += sizeof(DocIdScore) ) {
DocIdScore *s2 = (DocIdScore *)sy;
if ( s2->m_pairsOffset > pairOffset ) {
s2->m_pairsOffset -= pairSize;
}
if ( s2->m_singlesOffset > singleOffset ) {
s2->m_singlesOffset -= singleSize;
}
}
// removeChunk1() shifted the following entry into the slot at sx, so back
// up one entry and let the for-increment re-examine it
sx -= sizeof(DocIdScore);
}
logTrace(g_conf.m_logTracePosdb, "END");
}
// Pre-advance each termlist's cursor to skip to the next docid.
//
// Set each matching sublist's m_cursor to the NEXT docid and its
// m_savedCursor to the CURRENT docid so we are ready for a quick skip
// over this docid.
//
// TODO: use just a single array of termlist ptrs perhaps,
// then we can remove them when they go NULL. and we'd save a little
// time not having a nested loop.
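//
// Posdb key sizes, as decoded throughout this file from the low bits of the
// first key byte:
//   (key[0] & 0x06) == 0x02  ->  12-byte key (first key for a docid)
//   (key[0] & 0x06) == 0x06  ->   6-byte key (another position, same docid)
//   (key[0] & 0x06) == 0x00  ->  18-byte key (first key for a termid; not expected here)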
bool PosdbTable::advanceTermListCursors(const char *docIdPtr) {
logTrace(g_conf.m_logTracePosdb, "BEGIN");
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// get it
QueryTermInfo *qti = &(m_queryTermInfos[i]);
// do not advance negative termlist cursor
if ( qti->m_numSubLists>0 && qti->m_subList[0].m_bigramFlag & BF_NEGATIVE ) {
continue;
}
//
// In first pass, sublists data is initialized by delNonMatchingDocIdsFromSubLists.
// In second pass (to get detailed scoring info for UI output), they are initialized above
//
for ( int32_t j = 0 ; j < qti->m_numMatchingSubLists ; j++ ) {
// shortcuts
const char *xc = qti->m_matchingSublist[j].m_cursor;
const char *xcEnd = qti->m_matchingSublist[j].m_end;
// exhausted? (we can't make cursor NULL because
// getMaxPossibleScore() needs the last ptr)
// must match docid
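// The test below compares the docid bytes directly: bytes 8-11 of the
// 12-byte posdb key against bytes 1-4 of the vote-buffer entry, and the
// top 6 bits of key byte 7 against the top 6 bits of entry byte 0.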
if ( xc >= xcEnd ||
*(int32_t *)(xc+8) != *(int32_t *)(docIdPtr+1) ||
(*(char *)(xc+7)&0xfc) != (*(char *)(docIdPtr)&0xfc) ) {
// flag it as not having the docid
qti->m_matchingSublist[j].m_savedCursor = NULL;
// skip this sublist if does not have our docid
continue;
}
// save it
qti->m_matchingSublist[j].m_savedCursor = xc;
// get new docid
//log("new docid %" PRId64,Posdb::getDocId(xc) );
// advance the cursors. skip our 12
xc += 12;
// then skip any following 6 byte keys because they
// share the same docid
for ( ; ; xc += 6 ) {
// end of whole termlist?
if ( xc >= xcEnd ) {
break;
}
// sanity. no 18 byte keys allowed
if ( (*xc & 0x06) == 0x00 ) {
// i've seen this triggered on gk28.
// a dump of posdb for the termlist
// for 'post' had corruption in it,
// yet its twin, gk92 did not. the
// corruption could have occurred
// anywhere from nov 2012 to may 2013,
// and the posdb file was never
// re-merged! must have been blatant
// disk malfunction?
log("posdb: encountered corrupt posdb list. bailing.");
logTrace(g_conf.m_logTracePosdb, "END.");
return false;
//gbshutdownAbort(true);
}
// the next docid? it will be a 12 byte key.
if ( ! (*xc & 0x04) ) {
break;
}
}
// assign to next docid word position list
qti->m_matchingSublist[j].m_cursor = xc;
}
}
logTrace(g_conf.m_logTracePosdb, "END");
return true;
}
#define RINGBUFSIZE 4096
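// Note: RINGBUFSIZE must be a power of two; word positions are folded into
// the ring below with 'wx &= (RINGBUFSIZE-1)'.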
//Go through the word positions of the queryterminfo posdb lists and set the corresponding ringBuf[] elements to the qti-index.
//Also find the first position and return that.
static int32_t setRingbufFromQTI(const QueryTermInfo *qti, int32_t listIdx, unsigned char ringBuf[]) {
int32_t firstPos = -1;
for(int32_t i = 0 ; i < qti->m_numMatchingSubLists; i++) {
// scan that sublist and add word positions
const char *sub = qti->m_matchingSublist[i].m_savedCursor;
// skip sublist if its cursor is exhausted
if(!sub)
continue;
const char *end = qti->m_matchingSublist[i].m_cursor;
// add first key
int32_t wx = Posdb::getWordPos(sub);
wx &= (RINGBUFSIZE-1);
// store it. 0 is legit.
ringBuf[wx] = listIdx;
if(firstPos<0 || wx<firstPos)
firstPos = wx;
// skip first key
sub += 12;
// then 6 byte keys
for( ; sub < end; sub += 6) {
// get word position
wx = Posdb::getWordPos(sub);
wx &= (RINGBUFSIZE-1);
// store it. 0 is legit.
ringBuf[wx] = listIdx;
}
}
return firstPos;
}
//
// TODO: consider skipping this pre-filter if it sucks, as it does
// for 'search engine'. it might save time!
//
// Returns:
// false - docid does not meet minimum score requirement
// true - docid can potentially be a top scoring docid
//
bool PosdbTable::prefilterMaxPossibleScoreByDistance(float minWinningScore) {
unsigned char ringBuf[RINGBUFSIZE+10];
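// the extra 10 bytes of slack let the 4-bytes-at-a-time scan below read
// slightly past slot RINGBUFSIZE-1 without going out of bounds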
// reset ring buf. make all slots 0xff. should be 1000 cycles or so.
memset ( ringBuf, 0xff, sizeof(ringBuf) );
logTrace(g_conf.m_logTracePosdb, "BEGIN");
// now to speed up 'time enough for love' query which does not
// have many super high scoring guys on top we need a more restrictive
// filter than getMaxPossibleScore() so let's pick one query term,
// the one with the shortest termlist, and see how close it gets to
// each of the other query terms. then score each of those pairs.
// so quickly record the word positions of each query term into
// a ring buffer of 4096 slots where each slot contains the
// query term info #.
logTrace(g_conf.m_logTracePosdb, "Ring buffer generation");
int32_t ourFirstPos = setRingbufFromQTI(&(m_queryTermInfos[m_minTermListIdx]), m_minTermListIdx, ringBuf);
// now get query term closest to query term # m_minTermListIdx which
// is the query term # with the shortest termlist
// get closest term to m_minTermListIdx and the distance
logTrace(g_conf.m_logTracePosdb, "Ring buffer generation 2");
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
if ( i == m_minTermListIdx ) {
continue;
}
// get the query term info
const QueryTermInfo *qti = &(m_queryTermInfos[i]);
// if we have a negative term, skip it
if ( qti->m_subList[0].m_bigramFlag & (BF_NEGATIVE) ) {
continue;
}
// store all his word positions into ring buffer AS WELL
setRingbufFromQTI(qti, i, ringBuf);
// reset
int32_t ourLastPos = -1;
int32_t hisLastPos = -1;
int32_t bestDist = 0x7fffffff;
// how far is this guy from the man?
for ( int32_t x = 0 ; x < (int32_t)RINGBUFSIZE ; ) {
// skip next 4 slots if all empty. fast?
if (*(uint32_t *)(ringBuf+x) == 0xffffffff) {
x+=4;
continue;
}
// skip if nobody
if ( ringBuf[x] == 0xff ) {
x++;
continue;
}
// get query term #
unsigned char qt = ringBuf[x];
// if it's the man
if ( qt == m_minTermListIdx ) {
// record
hisLastPos = x;
// skip if we are not there yet
if ( ourLastPos == -1 ) {
x++;
continue;
}
// try distance fix
if ( x - ourLastPos < bestDist ) {
bestDist = x - ourLastPos;
}
}
// if us
else
if ( qt == i ) {
// record
ourLastPos = x;
// skip if he's not recorded yet
if ( hisLastPos == -1 ) {
x++;
continue;
}
// check dist
if ( x - hisLastPos < bestDist ) {
bestDist = x - hisLastPos;
}
}
x++;
}
// compare last occurrence of query term #x with our first occurrence
// since this is a RING buffer
int32_t wrapDist = ourFirstPos + ((int32_t)RINGBUFSIZE-hisLastPos);
if ( wrapDist < bestDist ) {
bestDist = wrapDist;
}
// query distance
int32_t qdist = m_qpos[m_minTermListIdx] - m_qpos[i];
// compute it
float maxScore2 = getMaxPossibleScore(&(m_queryTermInfos[i]));
maxScore2 = modifyMaxScoreByDistance(maxScore2,
bestDist,
qdist,
&(m_queryTermInfos[m_minTermListIdx]));
// -1 means it has inlink text so do not apply this constraint
// to this docid because it is too difficult because we
// sum up the inlink text
if ( maxScore2 < 0.0 ) {
continue;
}
// if any one of these terms have a max score below the
// worst score of the 10th result, then it can not win.
// @todo: BR. Really? ANY of them?
if ( maxScore2 <= minWinningScore ) {
logTrace(g_conf.m_logTracePosdb, "END - docid score too low");
return false;
}
}
logTrace(g_conf.m_logTracePosdb, "END - docid score high enough");
return true;
}
//
// Data for the current DocID found in sublists of each query term
// is merged into a single list, so we end up with one list per query
// term.
//
void PosdbTable::mergeTermSubListsForDocId(MiniMergeBuffer *miniMergeBuffer, int *highestInlinkSiteRank) {
logTrace(g_conf.m_logTracePosdb, "BEGIN.");
char *miniMergeBuf = miniMergeBuffer->buffer;
char *miniMergeBufEnd = miniMergeBuffer->buffer+sizeof(miniMergeBuffer->buffer);
// we got a docid that has all the query terms, so merge
// each term's sublists into a single list just for this docid.
//
// all posdb keys for this docid should fit in here, the
// mini merge buf:
char *mptr = miniMergeBuf;
char *miniMergeBufSafeEnd = miniMergeBufEnd - 1000; //fragile hack but no worse than the original code
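// (each merge iteration below writes at most 12 bytes, so stopping 1000
// bytes short of the real end guarantees the writes never pass miniMergeBufEnd)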
char *lastMptr = NULL;
// Merge each set of sublists, like we merge a term's list with
// its two associated bigram lists, if there, the left bigram and
// right bigram list. Merge all the synonym lists for that term
// together as well, so if the term is 'run' we merge it with the
// lists for 'running' 'ran' etc.
logTrace(g_conf.m_logTracePosdb, "Merge sublists into a single list per query term");
for ( int32_t j = 0 ; j < m_numQueryTermInfos ; j++ ) {
// get the query term info
QueryTermInfo *qti = &(m_queryTermInfos[j]);
// just use the flags from first term i guess
// NO! this loses the wikihalfstopbigram bit! so we gotta
// add that in for the key i guess the same way we add in
// the syn bits below!!!!!
m_bflags [j] = qti->m_numSubLists>0 ? qti->m_subList[0].m_bigramFlag : 0;
// if we have a negative term, skip it
if ( qti->m_numSubLists>0 && qti->m_subList[0].m_bigramFlag & BF_NEGATIVE ) {
// need to make this NULL for getSiteRank() call below
miniMergeBuffer->mergedListStart[j] = NULL;
// if its empty, that's good!
continue;
}
// the merged list for term #j is here:
miniMergeBuffer->mergedListStart[j] = mptr;
bool isFirstKey = true;
const char *nwp[MAX_SUBLISTS];
const char *nwpEnd[MAX_SUBLISTS];
char nwpFlags[MAX_SUBLISTS];
int baseSubListIndex[MAX_SUBLISTS];
// populate the nwp[] arrays for merging
int32_t nsub = 0;
for ( int32_t k = 0 ; k < qti->m_numMatchingSubLists ; k++ ) {
baseSubListIndex[nsub] = k;
// NULL means does not have that docid
if ( ! qti->m_matchingSublist[k].m_savedCursor ) {
continue;
}
// The cursors were already advanced past this docid (by
// advanceTermListCursors() or genDebugScoreInfo1()), so use m_savedCursor here.
nwp[nsub] = qti->m_matchingSublist[k].m_savedCursor;
nwpEnd[nsub] = qti->m_matchingSublist[k].m_cursor;
nwpFlags[nsub] = qti->m_subList[k].m_bigramFlag;
nsub++;
}
// Merge the lists into a list in miniMergeBuf.
// Get the min of each list
while( mptr < miniMergeBufSafeEnd ) {
int32_t mink = -1;
for ( int32_t k = 0 ; k < nsub ; k++ ) {
// skip if list is exhausted
if ( ! nwp[k] ) {
continue;
}
// auto winner?
if ( mink == -1 ) {
mink = k;
continue;
}
if ( KEYCMP(nwp[k], nwp[mink], 6) < 0 ) {
mink = k; // a new min...
}
}
// all exhausted? merge next set of sublists then for term #j
if ( mink == -1 ) {
// continue outer "j < m_numQueryTermInfos" loop.
break;
}
// get keysize
char ks = Posdb::getKeySize(nwp[mink]);
// if the first key in our merged list store the docid crap
if ( isFirstKey ) {
// store a 12 byte key in the merged list buffer
memcpy ( mptr, nwp[mink], 12 );
int termIndex = qti->m_subList[baseSubListIndex[mink]].m_qt - m_q->m_qterms;
*(miniMergeBuffer->getTermIndexPtrForBufferPos(mptr)) = termIndex;
// Detect highest siterank of inlinkers
if ( Posdb::getHashGroup(mptr) == HASHGROUP_INLINKTEXT) {
char inlinkerSiteRank = Posdb::getWordSpamRank(mptr);
if(inlinkerSiteRank > *highestInlinkSiteRank) {
*highestInlinkSiteRank = inlinkerSiteRank;
}
}
// wipe out its syn bits and re-use our way
mptr[2] &= 0xfc;
// set the synbit so we know if its a synonym of term
if ( nwpFlags[mink] & (BF_BIGRAM|BF_SYNONYM)) {
mptr[2] |= 0x02;
}
// wiki half stop bigram? so for the query
// 'time enough for love' the phrase term "enough for"
// is a half stopword wiki bigram, because it is in
// a phrase in wikipedia ("time enough for love") and
// one of the two words in the phrase term is a
// stop word. therefore we give it more weight than
// just 'enough' by itself.
if ( nwpFlags[mink] & BF_HALFSTOPWIKIBIGRAM ) {
mptr[2] |= 0x01;
}
// make sure its 12 bytes! it might have been
// the first key for the termid, and 18 bytes.
mptr[0] &= 0xf9;
mptr[0] |= 0x02;
// save it
lastMptr = mptr;
mptr += 12;
isFirstKey = false;
}
else {
// Old, dubious comment: "don't add if same keypos as previous item ..."
// It relied on the matching sublists being in the order: left-bigram, right-bigram, term, synonyms...
// But setQueryTermInfo() has had a bug there for as long as we know, and
// multiple entries with the same position seem to have no harmful effect.
memcpy ( mptr, nwp[mink], 6 );
int termIndex = qti->m_subList[qti->m_matchingSublist[mink].m_baseSubListIndex].m_qt - m_q->m_qterms;
*(miniMergeBuffer->getTermIndexPtrForBufferPos(mptr)) = termIndex;
// Detect highest siterank of inlinkers
if ( Posdb::getHashGroup(mptr) == HASHGROUP_INLINKTEXT) {
char inlinkerSiteRank = Posdb::getWordSpamRank(mptr);
if(inlinkerSiteRank > *highestInlinkSiteRank) {
*highestInlinkSiteRank = inlinkerSiteRank;
}
}
// wipe out its syn bits and re-use our way
mptr[2] &= 0xfc;
// set the synbit so we know if its a synonym of term
if ( nwpFlags[mink] & (BF_BIGRAM|BF_SYNONYM)) {
mptr[2] |= 0x02;
}
if ( nwpFlags[mink] & BF_HALFSTOPWIKIBIGRAM ) {
mptr[2] |= 0x01;
}
// if it was the first key of its list it may not
// have its bit set for being 6 bytes now! so turn
// on the 2 compression bits
mptr[0] &= 0xf9;
mptr[0] |= 0x06;
// save it
lastMptr = mptr;
mptr += 6;
}
// advance the cursor over the key we used.
nwp[mink] += ks; // Posdb::getKeySize(nwp[mink]);
// exhausted?
if ( nwp[mink] >= nwpEnd[mink] ) {
nwp[mink] = NULL;
}
}
// wrap it up here since done merging
miniMergeBuffer->mergedListEnd[j] = mptr;
//log(LOG_ERROR,"%s:%d: j=%" PRId32 ": miniMergedListStart[%" PRId32 "]=%p, miniMergedListEnd[%" PRId32 "]=%p, mptr=%p, miniMergeBufEnd=%p, term=[%.*s] - TERM DONE", __func__, __LINE__, j, j, miniMergeBuffer->mergedListStart[j], j, miniMergeBuffer->mergedListEnd[j], mptr, miniMergeBufEnd, qti->m_qt->m_termLen, qti->m_qt->m_term);
}
// breach?
if ( mptr > miniMergeBufEnd ) {
log(LOG_ERROR,"%s:%s:%d: miniMergeBuf=%p miniMergeBufEnd=%p mptr=%p", __FILE__, __func__, __LINE__, miniMergeBuf, miniMergeBufEnd, mptr);
gbshutdownAbort(true);
}
#ifdef _VALGRIND_
VALGRIND_MAKE_MEM_UNDEFINED(mptr, miniMergeBufEnd-mptr);
#endif
// Make sure miniMergeList[x] is NULL if it contains no values
for(int32_t i=0; i < m_numQueryTermInfos; i++) {
// skip if not part of score
if ( m_bflags[i] & (BF_PIPED|BF_NEGATIVE) ) {
continue;
}
// get list
const char *plist = miniMergeBuffer->mergedListStart[i];
const char *plistEnd = miniMergeBuffer->mergedListEnd[i];
int32_t psize = plistEnd - plist;
if( !psize ) {
// BR fix 20160918: Avoid working on empty lists where start and
// end pointers are the same. This can happen if no positions are
// copied to the merged list because they are all synonyms or
// phrase terms. See "HACK OF CONFUSION" above..
miniMergeBuffer->mergedListStart[i] = NULL;
}
//
// sanity check for all
//
// test it. first key is 12 bytes.
if ( psize && Posdb::getKeySize(plist) != 12 ) {
log(LOG_ERROR,"%s:%s:%d: psize=%" PRId32 "", __FILE__, __func__, __LINE__, psize);
gbshutdownAbort(true);
}
// next key is 6
if ( psize > 12 && Posdb::getKeySize(plist+12) != 6) {
log(LOG_ERROR,"%s:%s:%d: next key size=%" PRId32 "", __FILE__, __func__, __LINE__, Posdb::getKeySize(plist+12));
gbshutdownAbort(true);
}
}
logTrace(g_conf.m_logTracePosdb, "END.");
}
//
// Nested for loops to score the term pairs.
//
// Store best scores into the scoreMatrix so the sliding window
// algorithm can use them from there to do sub-outs
//
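// Note: only the j > i half of the matrix is filled and later read
// (set(j,i) below, get(j,i) in findMinTermPairScoreInWindow()).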
void PosdbTable::createNonBodyTermPairScoreMatrix(const MiniMergeBuffer *miniMergeBuffer, PairScoreMatrix *scoreMatrix) {
logTrace(g_conf.m_logTracePosdb, "BEGIN");
// scan over each query term (its synonyms are part of the
// QueryTermInfo)
for(int32_t i=0; i < m_numQueryTermInfos; i++) {
// skip if not part of score
if ( m_bflags[i] & (BF_PIPED|BF_NEGATIVE) ) {
continue;
}
// and pair it with each other possible query term
for ( int32_t j = i+1 ; j < m_numQueryTermInfos ; j++ ) {
// skip if not part of score
if ( m_bflags[j] & (BF_PIPED|BF_NEGATIVE) ) {
continue;
}
// but if they are in the same wikipedia phrase
// then try to keep their positions as in the query.
// so for 'time enough for love' ideally we want
// 'time' to be 6 units apart from 'love'
int32_t qdist;
float score;
// zero means not in a phrase
if ( m_wikiPhraseIds[j] == m_wikiPhraseIds[i] && m_wikiPhraseIds[j] ) {
// . the distance between the terms in the query
// . ideally we'd like this distance to be reflected
// in the matched terms in the body
qdist = m_qpos[j] - m_qpos[i];
// wiki weight
score = (float)WIKI_WEIGHT; // .50;
}
else {
// basically try to get query words as close
// together as possible
qdist = 2;
// this should help fix
// 'what is an unsecured loan' so we are more likely
// to get the page that has that exact phrase in it.
// yes, but hurts how to make a lock pick set.
//qdist = qpos[j] - qpos[i];
// wiki weight
score = 1.0;
}
float maxnbtp;
//
// get score for term pair from non-body occurring terms
//
if ( miniMergeBuffer->mergedListStart[i] && miniMergeBuffer->mergedListStart[j] ) {
maxnbtp = getMaxScoreForNonBodyTermPair(miniMergeBuffer, i, j, qdist);
}
else {
maxnbtp = -1;
}
// it's -1 if one term is in the body/header/menu/etc.
if ( maxnbtp < 0 ) {
score = -1.00;
}
else {
score *= maxnbtp;
}
// store in matrix for "sub out" algo below
// when doing sliding window
scoreMatrix->set(j,i, score);
}
}
logTrace(g_conf.m_logTracePosdb, "END");
}
//
// Finds the highest single term score sum.
// Creates array of highest scoring non-body positions
//
float PosdbTable::getMinSingleTermScoreSum(const MiniMergeBuffer *miniMergeBuffer, std::vector<const char *> &highestScoringNonBodyPos, DocIdScore *pdcs) {
float minSingleScore = 999999999.0;
bool mergedListFound = false;
bool allSpecialTerms = true;
bool scoredTerm = false;
logTrace(g_conf.m_logTracePosdb, "BEGIN");
// Now add single word scores.
//
// Having inlink text from siterank 15 of max
// diversity/density will result in the largest score,
// but we add them all up...
//
// This should be highly negative if singles[i] has a '-'
// termsign...
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
if ( ! miniMergeBuffer->mergedListStart[i] ) {
continue;
}
mergedListFound = true;
// skip if to the left of a pipe operator
if( m_bflags[i] & (BF_PIPED|BF_NEGATIVE) ) {
continue;
}
allSpecialTerms = false;
// sometimes there is no wordpos subtermlist for this docid
// because it just has the bigram, like "streetlight" and not
// the word "light" by itself for the query 'street light'
//if ( miniMergeBuffer->mergedListStart[i] ) {
// assume all word positions are in body
//highestScoringNonBodyPos[i] = NULL;
// This scans all word positions for this term.
//
// This should ignore occurrences in the body and only
// focus on inlink text, etc.
//
// Sets "highestScoringNonBodyPos" to point to the winning word
// position which does NOT occur in the body.
//
// Adds up MAX_TOP top scores and returns that sum.
//
// pdcs is NULL if not currPassNum == INTERSECT_DEBUG_INFO
float sts = getBestScoreSumForSingleTerm(miniMergeBuffer, i, pdcs, &(highestScoringNonBodyPos[i]));
scoredTerm = true;
//logTrace(g_conf.m_logTracePosdb, "i=%d sts=%f", i,sts);
// sanity check
if ( highestScoringNonBodyPos[i] && s_inBody[Posdb::getHashGroup(highestScoringNonBodyPos[i])] ) {
gbshutdownAbort(true);
}
//sts /= 3.0;
if ( sts < minSingleScore ) {
minSingleScore = sts;
}
}
if( !mergedListFound || (!scoredTerm && !allSpecialTerms) ) {
// Fix default value if no single terms were scored, and all terms are not special (e.g. numbers).
// This returns -1 for documents matching bigrams only, and not single terms. Can happen when searching
// for "bridget jones" and a document has the text "bridgetjon es" as the only match (bigram).
//
// If terms are numbers, do NOT return -1, otherwise gbsortbyint queries do not work.
minSingleScore = -1;
}
logTrace(g_conf.m_logTracePosdb, "END. minSingleScore=%f", minSingleScore);
return minSingleScore;
}
// Like getTermPairScore, but uses the word positions currently pointed to by ptrs[i].
// Does NOT scan the word position lists.
// Also tries to sub-out each term with the title or linktext wordpos term
// pointed to by "highestScoringNonBodyPos[i]".
//
// OUTPUT:
// bestMinTermPairWindowScore: The best minimum window score
// bestMinTermPairWindowPtrs : Pointers to query term positions giving the best minimum score
//
void PosdbTable::findMinTermPairScoreInWindow(const MiniMergeBuffer *miniMergeBuffer, const std::vector<const char *> &ptrs, std::vector<const char *> *bestMinTermPairWindowPtrs, float *bestMinTermPairWindowScore, const std::vector<const char *> &highestScoringNonBodyPos, const PairScoreMatrix &scoreMatrix) {
float minTermPairScoreInWindow = 999999999.0;
bool mergedListFound = false;
bool allSpecialTerms = true;
bool scoredTerms = false;
logTrace(g_conf.m_logTracePosdb, "BEGIN.");
// TODO: only do this loop on the (i,j) pairs where i or j
// is the term whose position got advanced in the sliding window.
for ( int32_t i = 0 ; i < m_numQueryTermInfos; i++ ) {
// skip if to the left of a pipe operator
if ( m_bflags[i] & (BF_PIPED|BF_NEGATIVE) ) {
continue;
}
allSpecialTerms = false;
// skip empty list
if( !ptrs[i] ) {
continue;
}
mergedListFound = true;
//if ( ptrs[i] ) wpi = ptrs[i];
// if term does not occur in body, sub-in the best term
// from the title/linktext/etc.
//else wpi = highestScoringNonBodyPos[i];
const char *wpi = ptrs[i];
// loop over other terms
for(int32_t j = i + 1; j < m_numQueryTermInfos; j++) {
// skip if to the left of a pipe operator
if ( m_bflags[j] & (BF_PIPED|BF_NEGATIVE) ) {
continue;
}
// skip empty list
if( !ptrs[j] ) {
continue;
}
// TODO: use a cache using wpi/wpj as the key.
//if ( ptrs[j] ) wpj = ptrs[j];
// if term does not occur in body, sub-in the best term
// from the title/linktext/etc.
//else wpj = highestScoringNonBodyPos[j];
const char *wpj = ptrs[j];
int32_t qdist;
float wikiWeight;
// in same wikipedia phrase?
if ( m_wikiPhraseIds[j] == m_wikiPhraseIds[i] &&
// zero means not in a phrase
m_wikiPhraseIds[j] ) {
// try to get dist that matches qdist exactly
qdist = m_qpos[j] - m_qpos[i];
// wiki weight
wikiWeight = WIKI_WEIGHT; // .50;
}
else {
// basically try to get query words as close
// together as possible
qdist = 2;
// fix 'what is an unsecured loan' to get the
// exact phrase with higher score
//m_qdist = m_qpos[j] - m_qpos[i];
// wiki weight
wikiWeight = 1.0;
}
// this will be -1 if wpi or wpj is NULL
float max = getScoreForTermPair(miniMergeBuffer, wpi, wpj, 0, qdist);
scoredTerms = true;
// try sub-ing in the best title occurrence or best
// inlink text occurrence. cuz if the term is in the title
// but these two terms are really far apart, we should
// get a better score
float score = getScoreForTermPair(miniMergeBuffer, highestScoringNonBodyPos[i], wpj, FIXED_DISTANCE, qdist);
max = gbmax(max,score);
// a double pair sub should be covered in the
// getMaxScoreForNonBodyTermPair() function
score = getScoreForTermPair(miniMergeBuffer, highestScoringNonBodyPos[i], highestScoringNonBodyPos[j], FIXED_DISTANCE, qdist);
max = gbmax(max,score);
score = getScoreForTermPair(miniMergeBuffer, wpi, highestScoringNonBodyPos[j], FIXED_DISTANCE, qdist);
max = gbmax(max,score);
// wikipedia phrase weight
if ( !almostEqualFloat(wikiWeight, 1.0) ) {
max *= wikiWeight;
}
// term freqweight here
const int queryTermIndex1 = miniMergeBuffer->getTermIndexForBufferPos(wpi);
const float termFreqWeight1 = m_q->m_qterms[queryTermIndex1].m_termFreqWeight;
max *= termFreqWeight1;
const int queryTermIndex2 = miniMergeBuffer->getTermIndexForBufferPos(wpj);
const float termFreqWeight2 = m_q->m_qterms[queryTermIndex2].m_termFreqWeight;
max *= termFreqWeight2;
//TODO: shouldn't we multiply with userweight here too?
// use score from scoreMatrix if bigger
max = gbmax(max,scoreMatrix.get(j,i));
// in same quoted phrase?
if ( m_quotedStartIds[j] >= 0 && m_quotedStartIds[j] == m_quotedStartIds[i] ) {
// no subouts allowed i guess
if( !wpi ) {
max = -1.0;
}
else
if( !wpj ) {
max = -1.0;
}
else {
int32_t qdist = m_qpos[j] - m_qpos[i];
int32_t p1 = Posdb::getWordPos ( wpi );
int32_t p2 = Posdb::getWordPos ( wpj );
int32_t dist = p2 - p1;
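// the checks below accept the pair only when dist >= 0 (terms in query
// order) and |dist - qdist| <= 1 (one extra unit of slack, e.g. for a comma)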
// must be in right order!
if(dist < 0) {
max = -1.0;
}
// allow for a discrepancy of 1 unit in case
// there is a comma? i think we add an extra
// unit
else
if(dist > qdist && dist - qdist > 1) {
max = -1.0;
//log("ddd1: i=%" PRId32" j=%" PRId32" "
// "dist=%" PRId32" qdist=%" PRId32,
// i,j,dist,qdist);
}
else
if(dist < qdist && qdist - dist > 1) {
max = -1.0;
}
}
}
// now we want the sliding window with the largest min
// term pair score!
minTermPairScoreInWindow = gbmin(max,minTermPairScoreInWindow);
}
}
if( !mergedListFound || (!scoredTerms && !allSpecialTerms) ) {
// Similar fix as in getMinSingleTermScoreSum, but should not happen in this function ...
minTermPairScoreInWindow = -1;
}
logTrace(g_conf.m_logTracePosdb, "minTermPairScoreInWindow=%f, bestMinTermPairWindowScore=%f", minTermPairScoreInWindow, *bestMinTermPairWindowScore);
// Our best minimum score better than current best minimum score?
if ( minTermPairScoreInWindow <= *bestMinTermPairWindowScore ) {
logTrace(g_conf.m_logTracePosdb, "END.");
return;
}
// Yep, our best minimum score is the highest so far
*bestMinTermPairWindowScore = minTermPairScoreInWindow;
// Record term positions in winning window
for(int32_t i=0; i < m_numQueryTermInfos; i++) {
if(ptrs[i]!=NULL)
(*bestMinTermPairWindowPtrs)[i] = ptrs[i];
}
logTrace(g_conf.m_logTracePosdb, "END.");
}
// Find first entry that is a body entry (determined by 's_inBody' array).
// Returns NULL if none found
static const char *findFirstBodyPosdbEntry(const char *listStart, const char *listEnd) {
for(const char *p = listStart; p<listEnd; ) {
unsigned char hg = Posdb::getHashGroup(p);
if(s_inBody[hg])
return p;
if(*p & 0x04)
p += 6;
else
p += 12;
}
return NULL;
}
// Find next entry that is a body entry (determined by 's_inBody' array).
// Returns NULL if none found
static const char *findNextBodyPosdbEntry(const char *p, const char *listEnd) {
if(*p & 0x04)
p += 6;
else
p += 12;
return findFirstBodyPosdbEntry(p,listEnd);
}
float PosdbTable::getMinTermPairScoreSlidingWindow(const MiniMergeBuffer *miniMergeBuffer, const std::vector<const char *> &highestScoringNonBodyPos, std::vector<const char *> &bestMinTermPairWindowPtrs, std::vector<const char *> &xpos, const PairScoreMatrix &scoreMatrix, DocIdScore *pdcs) {
logTrace(g_conf.m_logTracePosdb, "Sliding Window algorithm begins");
//bestMinTermPairWindowPtrs is just a buffer allocated once by the caller
for(int i=0; i<m_numQueryTermInfos; i++)
bestMinTermPairWindowPtrs[i] = NULL;
// Scan the terms that are in the body in a sliding window
//
// Compute the term pair score on just the terms in that
// sliding window. that way, the term for a word like 'dog'
// keeps the same word position when it is paired up with the
// other terms.
//
// Compute the score the same way getTermPairScore() works so
// we are on the same playing field
//
// Sub-out each term with its best scoring occurrence in the title
// or link text or meta tag, etc. but it gets a distance penalty
// of like 100 units or so.
//
// If term does not occur in the body, the sub-out approach should
// fix that.
//
// Keep a matrix of the best scores between two terms from the
// above double nested loop algo, and replace that pair if we
// got a better score in the sliding window.
// use special ptrs for the windows so we do not mangle
// miniMergeBuffer->mergedListStart[] array because we use that below!
//
// init each list ptr to the first wordpos rec in the body
// for THIS DOCID and if no such rec, make it NULL
//
int numNonNullLists = 0;
for(int i = 0; i < m_numQueryTermInfos; i++) {
if(m_bflags[i] & (BF_PIPED|BF_NEGATIVE)) {
// not a ranking term
xpos[i] = NULL;
} else if(miniMergeBuffer->mergedListStart[i]==NULL) {
//no list
xpos[i] = NULL;
} else {
xpos[i] = findFirstBodyPosdbEntry(miniMergeBuffer->mergedListStart[i], miniMergeBuffer->mergedListEnd[i]);
if(xpos[i]) {
numNonNullLists++;
} else if(!highestScoringNonBodyPos[i]) {
// not in body, not in title?
gbshutdownLogicError();
}
}
}
logTrace(g_conf.m_logTracePosdb, "Run sliding window algo? %s", numNonNullLists>0 ? "yes" : "no, no matches found in body");
float bestMinTermPairWindowScore = -2.0;
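// start below -1.0 so that even a window where nothing could be scored
// (findMinTermPairScoreInWindow() yields -1) still replaces the initial value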
//todo: does it make any sense to iterate after one list has been exhausted? findMinTermPairScoreInWindow() is not going to find
//any lower pair-score because it skips the now-null list.
while(numNonNullLists>0) {
//
// Now all xpos point to positions in the document body. Calc the "window" score (score
// for current term positions).
//
// If window score beats bestMinTermPairWindowScore we store the term xpos pointers
// that define this window in the bestMinTermPairWindowPtrs[] array.
//
// Will try to substitute either of the two term positions with highestScoringNonBodyPos[i]
// if better, but will fix the distance to FIXED_DISTANCE to give a distance penalty.
//
// "scoreMatrix" contains the highest scoring non-body term pair score, which will
// be used if higher than the calculated score for the terms.
//
// Sets m_bestMinTermPairWindowScore and bestMinTermPairWindowPtrs if this window score beats it.
//
findMinTermPairScoreInWindow(miniMergeBuffer, xpos, &bestMinTermPairWindowPtrs, &bestMinTermPairWindowScore, highestScoringNonBodyPos, scoreMatrix);
bool advanceMin = true;
while(advanceMin && numNonNullLists>0) {
advanceMin = false;
//
// Find the minimum word position in the document for ANY of the query terms.
// minPosTermIdx will contain the term index, minPos the position.
//
int32_t minPosTermIdx = -1;
int32_t minPos = 0;
for ( int32_t x = 0 ; x < m_numQueryTermInfos ; x++ ) {
if(xpos[x]!=NULL && (minPosTermIdx == -1 || Posdb::getWordPos(xpos[x]) < minPos)) {
minPosTermIdx = x;
minPos = Posdb::getWordPos(xpos[x]);
}
}
// sanity
if ( minPosTermIdx < 0 ) {
gbshutdownLogicError();
}
xpos[minPosTermIdx] = findNextBodyPosdbEntry(xpos[minPosTermIdx],miniMergeBuffer->mergedListEnd[minPosTermIdx]);
if(xpos[minPosTermIdx]==NULL) { //we exhausted the list
numNonNullLists--;
if(numNonNullLists>0)
advanceMin = true; //just advance the next list, if any
}
}
}
float minPairScore = -1.0;
logTrace(g_conf.m_logTracePosdb, "Zak algorithm begins");
for(int32_t i=0; i < m_numQueryTermInfos; i++) {
// skip if to the left of a pipe operator
if ( m_bflags[i] & (BF_PIPED|BF_NEGATIVE) ) {
continue;
}
if ( ! miniMergeBuffer->mergedListStart[i] ) {
continue;
}
for ( int32_t j = i+1 ; j < m_numQueryTermInfos ; j++ ) {
// skip if to the left of a pipe operator
if ( m_bflags[j] & (BF_PIPED|BF_NEGATIVE) ) {
continue;
}
if ( ! miniMergeBuffer->mergedListStart[j] ) {
continue;
}
// . this limits its scoring to the winning sliding window
// as far as the in-body terms are concerned
// . it will do sub-outs using the score matrix
// . this will skip over body terms that are not
// in the winning window defined by bestMinTermPairWindowPtrs[]
// that we set in findMinTermPairScoreInWindow()
// . returns the best score for this term
float tpscore = getTermPairScoreForAny(miniMergeBuffer, i, j, bestMinTermPairWindowPtrs, pdcs);
// get min of all term pair scores
if ( tpscore >= minPairScore && minPairScore >= 0.0 ) {
continue;
}
// got a new min
logTrace(g_conf.m_logTracePosdb, "i=%d, tpscore=%f", i, tpscore);
minPairScore = tpscore;
}
}
logTrace(g_conf.m_logTracePosdb, "Zak algorithm ends. minPairScore=%f", minPairScore);
return minPairScore;
}
//simple wrapper around intersectLists_real() just for transforming std::bad_alloc exceptions into ENOMEM
void PosdbTable::intersectLists() {
try {
intersectLists_real();
} catch(std::bad_alloc&) {
log(LOG_ERROR,"posdb: caught std::bad_alloc - out of memory");
if(g_errno==0)
g_errno = ENOMEM;
}
}
// . compare the output of this to intersectLists9_r()
// . hopefully this will be easier to understand and faster
// . IDEAS:
// we could also note that if a term was not in the title or
// inlink text it could never beat the 10th score.
void PosdbTable::intersectLists_real() {
logTrace(g_conf.m_logTracePosdb, "BEGIN. numTerms: %" PRId32, m_q->m_numTerms);
if(m_topTree->getNumNodes()==0 && !allocateTopTree()) {
logTrace(g_conf.m_logTracePosdb, "END. could not allocate toptree");
g_errno = ENOMEM;
return;
}
if(m_topTree->getNumNodes()==0) {
logTrace(g_conf.m_logTracePosdb, "END. toptree has zero size");
return;
}
if(!allocateScoringInfo()) {
logTrace(g_conf.m_logTracePosdb, "END. could not allocate scoring info");
g_errno = ENOMEM;
return;
}
if(!allocWhiteListTable()) {
logTrace(g_conf.m_logTracePosdb, "END. could not allocate whitelist table");
return;
}
prepareWhiteListTable();
if(!setQueryTermInfo()) {
logTrace(g_conf.m_logTracePosdb, "END. could not allocate query term info");
return;
}
initWeights();
// assume no-op
m_t1 = 0LL;
// set start time
int64_t t1 = gettimeofdayInMilliseconds();
// assume we return early
m_addListsTime = 0;
if( !findCandidateDocIds() ) {
logTrace(g_conf.m_logTracePosdb, "END. Found no candidate docids");
return;
}
//
// The vote buffer now contains the matching docids and each term sublist
// has been adjusted to only contain these docids as well. Let the fun begin.
//
//
// TRANSFORM QueryTermInfo::m_* vars into old style arrays
m_wikiPhraseIds.resize(m_numQueryTermInfos);
m_quotedStartIds.resize(m_numQueryTermInfos);
m_qpos.resize(m_numQueryTermInfos);
m_qtermNums.resize(m_numQueryTermInfos);
m_bflags.resize(m_numQueryTermInfos);
std::vector<const char *> highestScoringNonBodyPos(m_numQueryTermInfos);
std::vector<const char *> bestMinTermPairWindowPtrs(m_numQueryTermInfos);
std::vector<const char *> xpos(m_numQueryTermInfos);
PairScoreMatrix scoreMatrix(m_numQueryTermInfos);
int64_t lastTime = gettimeofdayInMilliseconds();
int64_t now;
int64_t took;
int32_t phase = 3; // 2 first in findCandidateDocIds
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// get it
QueryTermInfo &qti = m_queryTermInfos[i];
// set it
m_wikiPhraseIds [i] = qti.m_wikiPhraseId;
m_quotedStartIds[i] = qti.m_quotedStartId;
// query term position
m_qpos [i] = qti.m_qpos;
m_qtermNums [i] = qti.m_qtermNum;
}
//////////
//
// OLD MAIN INTERSECTION LOGIC
//
/////////
DocIdScore dcs;
DocIdScore *pdcs = NULL;
uint64_t lastDocId = 0LL;
int32_t lastLen = 0;
char siteRank;
int highestInlinkSiteRank;
lang_t docLang;
float score;
int32_t intScore = 0;
float minScore = 0.0;
float minSingleScore;
// scan the posdb keys in the smallest list
// raised from 200 to 300,000 for 'da da da' query
MiniMergeBuffer miniMergeBuf(m_numQueryTermInfos);
const char *docIdPtr;
char *docIdEnd = m_docIdVoteBuf.getBufStart()+m_docIdVoteBuf.length();
float minWinningScore = -1.0;
int32_t topCursor = -9;
int32_t numProcessed = 0;
int32_t prefiltMaxPossScoreFail = 0;
int32_t prefiltMaxPossScorePass = 0;
int32_t prefiltBestDistMaxPossScoreFail = 0;
int32_t prefiltBestDistMaxPossScorePass = 0;
// populate the cursors for each sublist
int32_t numQueryTermsToHandle = m_numQueryTermInfos;
if ( ! m_msg39req->m_doMaxScoreAlgo ) {
numQueryTermsToHandle = 0;
}
//
// Run through the scoring logic once or twice. Two passes needed ONLY if we
// need to create additional (debug) scoring info for visual display.
//
int numPassesNeeded = m_msg39req->m_getDocIdScoringInfo ? 2 : 1;
for(int currPassNum=0; currPassNum < numPassesNeeded; currPassNum++) {
//
// Pass 0: Find candidate docids and calculate scores
// Pass 1: Only for creating debug scoring info for visual display
//
switch( currPassNum ) {
case INTERSECT_SCORING:
logTrace(g_conf.m_logTracePosdb, "Loop 0: Real scoring");
break;
case INTERSECT_DEBUG_INFO:
logTrace(g_conf.m_logTracePosdb, "Loop 1: Data for visual scoring info");
removeScoreInfoForDeletedDocIds();
break;
default:
log(LOG_LOGIC,"%s:%d: Illegal pass number %d", __FILE__, __LINE__, currPassNum);
gbshutdownLogicError();
}
// reset docid to start!
docIdPtr = m_docIdVoteBuf.getBufStart();
if( currPassNum == INTERSECT_DEBUG_INFO ) {
//
// reset each matching sublist's m_cursor and m_savedCursor to the start of
// the term lists for the second pass that gets printable scoring info
//
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// get it
QueryTermInfo &qti = m_queryTermInfos[i];
// skip negative termlists
if ( qti.m_numSubLists>0 && qti.m_subList[0].m_bigramFlag & BF_NEGATIVE ) {
continue;
}
// do each sublist
for ( int32_t j = 0 ; j < qti.m_numMatchingSubLists ; j++ ) {
qti.m_matchingSublist[j].m_cursor = qti.m_matchingSublist[j].m_start;
qti.m_matchingSublist[j].m_savedCursor = qti.m_matchingSublist[j].m_start;
}
}
}
//#
//# MAIN LOOP for looping over each docid
//#
bool allDone = false;
while( !allDone && docIdPtr < docIdEnd ) {
// logTrace(g_conf.m_logTracePosdb, "Handling next docId");
bool skipToNextDocId = false;
siteRank = 0;
docLang = langUnknown;
highestInlinkSiteRank = -1;
bool docInThisFile;
if ( currPassNum == INTERSECT_SCORING ) {
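// Each m_docIdVoteBuf entry is 6 bytes: bytes 0-4 hold the docid (byte 0
// carries the low bits) and byte 5 holds the vote/list-group number set by
// addDocIdVotes(). Decode: ((bytes 1-4 as uint32) << 8 | byte 0) >> 2.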
m_docId = *(uint32_t *)(docIdPtr+1);
m_docId <<= 8;
m_docId |= (unsigned char)docIdPtr[0];
m_docId >>= 2;
docInThisFile = m_documentIndexChecker->exists(m_docId);
}
else {
//
// second pass? for printing out transparency info.
// genDebugScoreInfo1 sets m_docId from the top scorer tree
//
if( genDebugScoreInfo1(&numProcessed, &topCursor, &docInThisFile) ) {
logTrace(g_conf.m_logTracePosdb, "Pass #%d for file %" PRId32 " done", currPassNum, m_documentIndexChecker->getFileNum());
// returns true if no more docids to handle
allDone = true;
break; // break out of docIdPtr < docIdEnd loop
}
}
//bool docInThisFile = m_documentIndexChecker->exists(m_docId);
logTrace(g_conf.m_logTracePosdb, "Handling next docId: %" PRId64 " - pass #%d - %sfound in this file (%" PRId32 ")", m_docId, currPassNum, docInThisFile?"":"SKIPPING, not ", m_documentIndexChecker->getFileNum());
if(!docInThisFile) {
// Only advance cursors in first pass
if( currPassNum == INTERSECT_SCORING ) {
if( !advanceTermListCursors(docIdPtr) ) {
logTrace(g_conf.m_logTracePosdb, "END. advanceTermListCursors failed");
return;
}
docIdPtr += 6;
}
continue;
}
//calculate complete score multiplier
float completeScoreMultiplier = 1.0;
unsigned flags = 0;
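// each of the up-to-26 per-document flag bits scales the score by its
// configured multiplier in m_flagScoreMultiplier[]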
if(g_d2fasm.lookupFlags(m_docId,&flags) && flags) {
for(int i=0; i<26; i++) {
if(flags&(1<<i))
completeScoreMultiplier *= m_baseScoringParameters.m_flagScoreMultiplier[i];
}
}
if( currPassNum == INTERSECT_SCORING ) {
//
// Pre-advance each termlist's cursor to skip to next docid.
//
if( !advanceTermListCursors(docIdPtr) ) {
logTrace(g_conf.m_logTracePosdb, "END. advanceTermListCursors failed");
return;
}
if( !m_q->m_isBoolean ) {
//##
//## PRE-FILTERS. Discard DocIDs that cannot meet the minimum required
//## score, before entering the main scoring loop below
//##
//
// TODO: consider skipping this pre-filter if it sucks, as it does
// for 'time enough for love'. it might save time!
//
// Calculate maximum possible score for a document. If the max score
// is lower than the current minimum winning score, give up already
// now and skip to the next docid.
//
// minWinningScore is the score of the lowest scoring doc in m_topTree IF
// the topTree contains the number of docs requested. Otherwise it is -1
// and the doc is inserted no matter the score.
//
// Only go through this if we actually have a minimum score to compare with ...
// No need if minWinningScore is still -1.
//
if ( minWinningScore >= 0.0 ) {
logTrace(g_conf.m_logTracePosdb, "Compute 'upper bound' for each query term");
// If there's no way we can break into the winner's circle, give up!
// This computes an upper bound for each query term
for ( int32_t i = 0 ; i < numQueryTermsToHandle ; i++ ) {
// skip negative termlists.
if ( m_queryTermInfos[i].m_subList[0].m_bigramFlag & (BF_NEGATIVE) ) {
continue;
}
// an upper bound on the score we could get
float maxScore = getMaxPossibleScore(&(m_queryTermInfos[i]));
// -1 means it has inlink text so do not apply this constraint
// to this docid because it is too difficult because we
// sum up the inlink text
if ( maxScore < 0.0 ) {
continue;
}
maxScore *= completeScoreMultiplier;
// logTrace(g_conf.m_logTracePosdb, "maxScore=%f minWinningScore=%f", maxScore, minWinningScore);
// if any one of these terms have a max score below the
// worst score of the 10th result, then it can not win.
if ( maxScore <= minWinningScore ) {
docIdPtr += 6;
prefiltMaxPossScoreFail++;
skipToNextDocId = true;
break; // break out of numQueryTermsToHandle loop
}
}
}
if( skipToNextDocId ) {
// continue docIdPtr < docIdEnd loop
logTrace(g_conf.m_logTracePosdb, "Max possible score for docId %" PRId64 " too low, skipping to next", m_docId);
continue;
}
prefiltMaxPossScorePass++;
if ( minWinningScore >= 0.0 ) {
if( !prefilterMaxPossibleScoreByDistance(minWinningScore*completeScoreMultiplier) ) {
docIdPtr += 6;
prefiltBestDistMaxPossScoreFail++;
skipToNextDocId = true;
}
}
if( skipToNextDocId ) {
// Continue docIdPtr < docIdEnd loop
logTrace(g_conf.m_logTracePosdb, "Max possible score by distance for docId %" PRId64 " too low, skipping to next", m_docId);
continue;
}
prefiltBestDistMaxPossScorePass++;
} // !m_q->m_isBoolean
} // currPassNum == INTERSECT_SCORING
if ( m_q->m_isBoolean ) {
// add one point for each term matched in the bool query
// this is really just for when the terms are from different
// fields. if we have unfielded boolean terms we should
// do proximity matching.
int32_t slot = m_bt.getSlot ( &m_docId );
if ( slot >= 0 ) {
uint8_t *bv = (uint8_t *)m_bt.getValueFromSlot(slot);
// then a score based on the # of terms that matched
int16_t bitsOn = getNumBitsOnX ( bv, m_vecSize );
// but store in hashtable now
minScore = (float)bitsOn;
}
else {
minScore = 1.0;
}
}
//##
//## PERFORMANCE HACK: ON-DEMAND MINI MERGES.
//##
//## Data for the current DocID found in sublists of each query term
//## is merged into a single list, so we end up with one list per query
//## term. They are all stored in a single buffer (miniMergeBuf), which
//## the MiniMergeBuffer::mergedListStart/End pointers point into.
//##
mergeTermSubListsForDocId(&miniMergeBuf, &highestInlinkSiteRank);
// clear the counts on this DocIdScore class for this new docid
pdcs = NULL;
if ( currPassNum == INTERSECT_DEBUG_INFO ) {
dcs.reset();
pdcs = &dcs;
}
//##
//## ACTUAL SCORING BEGINS
//##
if ( !m_q->m_isBoolean ) {
//#
//# NON-BODY TERM PAIR SCORING LOOP
//#
createNonBodyTermPairScoreMatrix(&miniMergeBuf, &scoreMatrix);
//#
//# SINGLE TERM SCORE LOOP
//#
minSingleScore = getMinSingleTermScoreSum(&miniMergeBuf, highestScoringNonBodyPos, pdcs);
logTrace(g_conf.m_logTracePosdb, "minSingleScore=%f before multiplication for docId %" PRIu64 "", minSingleScore, m_docId);
minSingleScore *= completeScoreMultiplier;
//#
//# DOCID / SITERANK DETECTION
//#
for(int32_t k=0; k < m_numQueryTermInfos; k++) {
if ( ! miniMergeBuf.mergedListStart[k] ) {
continue;
}
siteRank = Posdb::getSiteRank ( miniMergeBuf.mergedListStart[k] );
docLang = Posdb::getLangId(miniMergeBuf.mergedListStart[k]);
break;
}
logTrace(g_conf.m_logTracePosdb, "Got siteRank %d and docLang %d", (int)siteRank, (int)docLang);
//#
//# SLIDING WINDOW SCORING ALGORITHM
//#
// After calling this, bestMinTermPairWindowPtrs will point to the
// term positions set ("window") that has the highest minimum score. These
// pointers are used when determining the minimum term pair score returned
// by the function.
float minPairScore = getMinTermPairScoreSlidingWindow(&miniMergeBuf, highestScoringNonBodyPos, bestMinTermPairWindowPtrs, xpos, scoreMatrix, pdcs);
logTrace(g_conf.m_logTracePosdb, "minPairScore=%f before multiplication for docId %" PRIu64 "", minPairScore, m_docId);
minPairScore *= completeScoreMultiplier;
//#
//# Find minimum score - either single term or term pair
//#
minScore = 999999999.0;
// get a min score from all the term pairs
if ( minPairScore < minScore && minPairScore >= 0.0 ) {
minScore = minPairScore;
}
// if we only had one query term
if ( minSingleScore < minScore ) {
minScore = minSingleScore;
}
logTrace(g_conf.m_logTracePosdb, "minPairScore=%f, minScore=%f for docId %" PRIu64 "", minPairScore, minScore, m_docId);
// No positive score? Then skip the doc
if ( minScore <= 0.0 ) {
if( currPassNum == INTERSECT_SCORING ) {
// advance to next docid
docIdPtr += 6;
}
logTrace(g_conf.m_logTracePosdb, "Skipping docid %" PRIu64 " - no positive score", m_docId);
// Continue docid loop
continue;
}
} // !m_q->m_isBoolean
//#
//# Calculate score and give boost based on siterank and highest inlinking siterank
//#
float adjustedSiteRank = siteRank;
if( highestInlinkSiteRank > siteRank ) {
//adjust effective siterank because a high-rank site linked to it. Don't adjust it too much though.
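// e.g. siteRank 4 with highestInlinkSiteRank 10 gives 4 + (10-4)/3 = 6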
adjustedSiteRank = siteRank + (highestInlinkSiteRank-siteRank) / 3.0;
logTrace(g_conf.m_logTracePosdb, "Highest inlink siterank %d > siterank %d. Adjusting to %f for docId %" PRIu64 "", highestInlinkSiteRank, (int)siteRank, adjustedSiteRank, m_docId);
}
score = minScore * (adjustedSiteRank*m_baseScoringParameters.m_siteRankMultiplier+1.0);
logTrace(g_conf.m_logTracePosdb, "Score %f for docId %" PRIu64 "", score, m_docId);
//#
//# Give score boost if query and doc language is the same,
//# and optionally a different boost if the language of the
//# page is unknown.
//#
//# Use "qlang" parm to set the language. i.e. "&qlang=fr"
//#
score *= m_msg39req->m_baseScoringParameters.m_languageWeights[docLang];
double page_temperature = 0;
bool use_page_temperature = false;
float score_before_page_temp = score;
if(m_baseScoringParameters.m_usePageTemperatureForRanking) {
use_page_temperature = true;
const auto range_min = m_baseScoringParameters.m_pageTemperatureWeightMin;
const auto range_max = m_baseScoringParameters.m_pageTemperatureWeightMax;
uint32_t sitehash32;
unsigned raw_default_site_page_temperature;
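// Temperature lookup falls back in three steps: exact per-page temperature,
// then the site median (looked up via the site hash), then the global default.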
if(g_pageTemperatureRegistry.query_page_temperature(m_docId, range_min, range_max, &page_temperature)) {
//excellent, we know the page's temperature
} else if(g_d2fasm.lookupSiteHash(m_docId,&sitehash32) && g_smptr.lookup(sitehash32,&raw_default_site_page_temperature)) {
// we'll only use the site median page temperature when we have an updated docid2siteflags file
//hmm, use the site-default page temperature
page_temperature = g_pageTemperatureRegistry.scale_temperature(range_min, range_max, raw_default_site_page_temperature);
} else {
//ok, last resort, use the global default page temperature
page_temperature = g_pageTemperatureRegistry.query_default_page_temperature(range_min, range_max);
}
score *= page_temperature;
logTrace(g_conf.m_logTracePosdb, "Page temperature for docId %" PRIu64 " is %.14f, score %f -> %f", m_docId, page_temperature, score_before_page_temp, score);
}
//#
//# Handle sortby int/float and minimum docid/score pairs
//#
if( m_hasMaxSerpScore ) {
// assume filtered out
if ( currPassNum == INTERSECT_SCORING ) {
m_filtered++;
}
// now we have a maxscore/maxdocid upper range so the widget
// can append only new results to an older result set.
if ( m_hasMaxSerpScore ) {
bool skipToNext = false;
// if dealing with an "int" score use the extra precision
// of the double that m_maxSerpScore is!
if ( score > (float)m_msg39req->m_maxSerpScore ) {
skipToNext = true;
}
else
if ( almostEqualFloat(score, (float)m_msg39req->m_maxSerpScore) && (int64_t)m_docId <= m_msg39req->m_minSerpDocId ) {
//@todo: Why is m_msg39req->m_maxSerpScore double and score float?
skipToNext = true;
}
if( skipToNext ) {
// advance to next docid
if( currPassNum == INTERSECT_SCORING ) {
docIdPtr += 6;
}
// Continue docIdPtr < docIdEnd loop
continue;
}
}
// we did not get filtered out
if ( currPassNum == INTERSECT_SCORING ) {
m_filtered--;
}
}
// We only come here if we actually made it into m_topTree - second round only loops through
// TopTree entries.
if ( currPassNum == INTERSECT_DEBUG_INFO ) {
// NEW 20170423
dcs.m_usePageTemperature = use_page_temperature;
dcs.m_pageTemperature = page_temperature;
dcs.m_adjustedSiteRank = adjustedSiteRank;
if( genDebugScoreInfo2(&dcs, &lastLen, &lastDocId, siteRank, score, intScore, docLang) ) {
// advance to next docid
if( currPassNum == INTERSECT_SCORING ) {
docIdPtr += 6;
}
// Continue docIdPtr < docIdEnd loop
continue;
}
}
if( currPassNum == INTERSECT_SCORING ) {
//#
//# SCORING DONE! Add to top-scorer tree.
//#
int32_t tn = m_topTree->getEmptyNode();
// Sanity check
if( tn < 0 ) {
log(LOG_LOGIC,"%s:%s:%d: No space left in m_topTree", __FILE__, __func__, __LINE__);
gbshutdownLogicError();
}
TopNode *t = m_topTree->getNode(tn);
// set the score and docid ptr
t->m_score = score;
t->m_docId = m_docId;
t->m_flags = flags;
logTrace(g_conf.m_logTracePosdb, "docid=%15ld score=%f", m_docId, score);
//
// This will not add if tree is full and it is less than the m_lowNode in score.
//
// If it does get added to a full tree, lowNode will be removed.
//
m_topTree->addNode(t, tn);
// top tree only holds enough docids to satisfy the
// Msg39Request::m_docsToGet (m_msg39req->m_docsToGet) request
// from the searcher. It basically stores m_docsToGet
// into TopTree::m_docsWanted. TopTree::m_docsWanted is often
// double m_docsToGet to compensate for site clustering, and
// it can be even more than that in order to ensure we get
// enough domains represented in the search results.
// See TopTree::addNode(). It will not add the "t" node if
// its score is not high enough when the top tree is full.
if ( m_topTree->getNumUsedNodes() > m_topTree->getNumDocsWanted() ) {
// get the lowest scoring node
int32_t lowNode = m_topTree->getLowNode();
// and record its score in "minWinningScore"
minWinningScore = m_topTree->getNode(lowNode)->m_score;
}
}
// advance to next docid
if( currPassNum == INTERSECT_SCORING ) {
docIdPtr += 6;
}
} // docIdPtr < docIdEnd loop
if ( m_debug ) {
now = gettimeofdayInMilliseconds();
took = now - lastTime;
log(LOG_INFO, "posdb: new algo phase %" PRId32" took %" PRId64" ms", phase,took);
lastTime = now;
phase++;
}
} // for ... currPassNum < numPassesNeeded
if ( m_debug ) {
log(LOG_INFO, "posdb: # prefiltMaxPossScoreFail........: %" PRId32" ", prefiltMaxPossScoreFail );
log(LOG_INFO, "posdb: # prefiltMaxPossScorePass........: %" PRId32" ", prefiltMaxPossScorePass );
log(LOG_INFO, "posdb: # prefiltBestDistMaxPossScoreFail: %" PRId32" ", prefiltBestDistMaxPossScoreFail );
log(LOG_INFO, "posdb: # prefiltBestDistMaxPossScorePass: %" PRId32" ", prefiltBestDistMaxPossScorePass );
}
if( g_conf.m_logTracePosdb ) {
m_topTree->logTreeData(LOG_TRACE);
logDebugScoreInfo(LOG_TRACE);
}
// get time now
now = gettimeofdayInMilliseconds();
// store the addLists time
m_addListsTime = now - t1;
m_t1 = t1;
m_t2 = now;
//opportunistic cleanup (memory release)
m_wikiPhraseIds.clear();
m_quotedStartIds.clear();
m_qpos.clear();
m_qtermNums.clear();
m_bflags.clear();
logTrace(g_conf.m_logTracePosdb, "END. Took %" PRId64" msec", m_addListsTime);
}
// . "bestDist" is closest distance to query term # m_minTermListIdx
// . set "bestDist" to 1 to ignore it
float PosdbTable::getMaxPossibleScore(const QueryTermInfo *qti) {
logTrace(g_conf.m_logTracePosdb, "BEGIN.");
// get max score of all sublists
float bestHashGroupWeight = -1.0;
float bestTermFreqWeight = -1.0;
unsigned char bestDensityRank = 0;
char siteRank = -1;
bool docLangSet = false;
lang_t docLang = langUnknown;
unsigned char hgrp;
bool hadHalfStopWikiBigram = false;
// scan those sublists to set m_ptrs[] and to get the
// max possible score of each one
for ( int32_t j = 0 ; j < qti->m_numMatchingSubLists ; j++ ) {
// scan backwards up to this
const char *start = qti->m_matchingSublist[j].m_savedCursor;
// skip if does not have our docid
if ( ! start ) {
continue;
}
// note it if any is a wiki bigram
if ( qti->m_subList[0].m_bigramFlag & BF_HALFSTOPWIKIBIGRAM ) {
hadHalfStopWikiBigram = true;
}
// skip if entire sublist/termlist is exhausted
if ( start >= qti->m_matchingSublist[j].m_end ) {
continue;
}
if ( siteRank == -1 ) {
siteRank = Posdb::getSiteRank(start);
}
if ( !docLangSet ) {
docLang = Posdb::getLangId(start);
docLangSet = true;
}
// skip first key because it is 12 bytes, go right to the
// 6 byte keys. we deal with it below.
start += 12;
// get cursor. this is actually pointing to the next docid
const char *dc = qti->m_matchingSublist[j].m_cursor;
// back up into our list
dc -= 6;
// reset this
bool retried = false;
// do not include the beginning 12 byte key in this loop!
for ( ; dc >= start ; dc -= 6 ) {
// loop back here for the 12 byte key
retry:
// get the best hash group
hgrp = Posdb::getHashGroup(dc);
// if it is inlink text, do not apply this algo because
// we end up adding term pairs from each hash group
if ( hgrp == HASHGROUP_INLINKTEXT ) {
return -1.0;
}
float termFreqWeight = qti->m_subList[qti->m_matchingSublist[j].m_baseSubListIndex].m_qt->m_termFreqWeight;
if(termFreqWeight>bestTermFreqWeight)
bestTermFreqWeight = termFreqWeight;
//if ( hgrp == HASHGROUP_TITLE ) return -1.0;
// loser?
if ( m_derivedScoringWeights.m_hashGroupWeights[hgrp] < bestHashGroupWeight ) {
// if in body, it's over for this termlist
// because this is the last hash group
// we will encounter.
if ( hgrp == HASHGROUP_BODY ) {
// @@@ BR: Dangerous assumption if we change indexing order in XmlDoc_Indexing ! @@@
goto nextTermList;
}
// otherwise, keep chugging
continue;
}
unsigned char dr = Posdb::getDensityRank(dc);
// a clean win?
if ( m_derivedScoringWeights.m_hashGroupWeights[hgrp] > bestHashGroupWeight ) {
bestHashGroupWeight = m_derivedScoringWeights.m_hashGroupWeights[hgrp];
bestDensityRank = dr;
continue;
}
// better density rank?
if ( dr > bestDensityRank ) {
bestDensityRank = dr;
}
}
// handle the beginning 12 byte key
if ( ! retried ) {
retried = true;
dc = qti->m_matchingSublist[j].m_savedCursor;
goto retry;
}
nextTermList:
;
}
// if nothing, then maybe all sublists were empty?
if ( bestHashGroupWeight < 0 ) {
return 0.0;
}
// assume perfect adjacency and that the other term is perfect
float score = 100.0;
score *= bestHashGroupWeight;
score *= bestHashGroupWeight;
// since adjacent, 2nd term in pair will be in same sentence
// TODO: fix this for 'qtm'; it might have a better density rank and
// better hashgroup weight, like being in title!
score *= m_derivedScoringWeights.m_densityWeights[bestDensityRank];
score *= m_derivedScoringWeights.m_densityWeights[bestDensityRank];
// wiki bigram?
if ( hadHalfStopWikiBigram ) {
score *= WIKI_BIGRAM_WEIGHT;
score *= WIKI_BIGRAM_WEIGHT;
}
//score *= perfectWordSpamWeight * perfectWordSpamWeight;
score *= (((float)siteRank)*m_baseScoringParameters.m_siteRankMultiplier+1.0);
// language boost if language specified and if page is same language, or unknown language
score *= m_msg39req->m_baseScoringParameters.m_languageWeights[docLang];
// assume the other term we pair with will be 1.0
score *= bestTermFreqWeight;
score *= bestTermFreqWeight;
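// Upper-bound shape at this point (all factors applied above):
//   100 * hashGroupWeight^2 * densityWeight^2 [* WIKI_BIGRAM_WEIGHT^2]
//     * (siteRank*siteRankMultiplier + 1) * languageWeight * termFreqWeight^2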
//TODO: shouldn't we multiply by userweight here too?
// terms in same wikipedia phrase?
//if ( wikiWeight != 1.0 )
// score *= WIKI_WEIGHT;
// if query is 'time enough for love' it is kinda slow because
// we were never multiplying by WIKI_WEIGHT, even though all
// query terms were in the same wikipedia phrase. so see if this
// speeds it up.
if ( m_allInSameWikiPhrase ) {
score *= WIKI_WEIGHT;
}
logTrace(g_conf.m_logTracePosdb, "END. score=%f", score);
return score;
}
float PosdbTable::modifyMaxScoreByDistance(float score,
int32_t bestDist,
int32_t qdist,
const QueryTermInfo *qtm)
{
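// Illustrative example (hypothetical numbers): score=100, m_maxMatchingTermFreqWeight=1.5,
// bestDist=14, qdist=2  ->  (100*1.5) / |14-2| = 150/12 = 12.5.
// The division is skipped when the adjusted distance is 0 or 1.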
score *= qtm->m_maxMatchingTermFreqWeight;
// subtract qdist
bestDist -= qdist;
// make it positive
bestDist = abs(bestDist);
// avoid 0 division
if(bestDist > 1)
score /= (float)bestDist;
return score;
}
////////////////////
//
// "White list" functions used to find docids from only specific sites
//
////////////////////
// this is separate from allocTopTree() function below because we must
// call it for each iteration in Msg39::doDocIdSplitLoop() which is used
// to avoid reading huge termlists into memory. it breaks the huge lists
// up by smaller docid ranges and gets the search results for each docid
// range separately.
bool PosdbTable::allocWhiteListTable ( ) {
//
// the whitetable is for the docids in the whitelist. we have
// to only show results whose docid is in the whitetable, which
// is from the "&sites=abc.com+xyz.com..." custom search site list
// provided by the user.
//
if ( m_msg39req->size_whiteList <= 1 ) m_useWhiteTable = false; // includes \0
else m_useWhiteTable = true;
int32_t sum = 0;
for ( int32_t i = 0 ; i < m_msg2->getNumWhiteLists() ; i++ ) {
const RdbList *list = m_msg2->getWhiteList(i);
if(!list->isEmpty()) {
// assume 12 bytes for all keys but first which is 18
int32_t size = list->getListSize();
sum += size / 12 + 1;
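// e.g. a 1212-byte whitelist termlist contributes 1212/12 + 1 = 102 estimated
// docids to "sum"; the table is then sized with numSlots = sum*2 below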
}
}
if ( sum ) {
// making this sum * 3 does not show a speedup... hmmm...
int32_t numSlots = sum * 2;
// keep it restricted to 5 byte keys so we do not have to
// extract the docid, we can just hash the ptr to those
// 5 bytes (which includes 1 siterank bit as the lowbit,
// but should be ok since it should be set the same in
// all termlists that have that docid)
if (!m_whiteListTable.set(5, 0, numSlots, NULL, 0, false, "wtall", true)) {
return false;
}
}
return true;
}
void PosdbTable::prepareWhiteListTable()
{
// hash the docids in the whitelist termlists into a hashtable.
// every docid in the search results must be in there. the
// whitelist termlists are from a provided "&sites=abc.com+xyz.com+.."
// cgi parm. the user only wants search results returned from the
// specified subdomains. there can be up to MAX_WHITELISTS (500)
// sites right now. this hash table must have been pre-allocated
// in PosdbTable::allocWhiteListTable() above since we might be in a thread.
if ( m_addedSites )
return;
for ( int32_t i = 0 ; i < m_msg2->getNumWhiteLists() ; i++ ) {
RdbList *list = m_msg2->getWhiteList(i);
if(!list->isEmpty()) {
// sanity test
int64_t d1 = Posdb::getDocId(list->getList());
if ( d1 > m_msg2->docIdEnd() ) {
log("posdb: d1=%" PRId64" > %" PRId64,
d1,m_msg2->docIdEnd());
//gbshutdownAbort(true);
}
if ( d1 < m_msg2->docIdStart() ) {
log("posdb: d1=%" PRId64" < %" PRId64,
d1,m_msg2->docIdStart());
//gbshutdownAbort(true);
}
// first key is always 18 bytes cuz it has the termid
// scan recs in the list
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
char *rec = list->getCurrentRec();
// point to the 5 bytes of docid
m_whiteListTable.addKey ( rec + 7 );
}
}
}
m_addedSites = true;
}
bool PosdbTable::allocateTopTree() {
//If all Msg2 rdblists are empty then don't do anything (there is nothing to rank or score)
bool allEmpty = true;
for(int i = 0; i < m_msg2->getNumLists(); i++) {
const RdbList *list = m_msg2->getList(i);
if(list && !list->isEmpty() && list->getListSize()>=18) {
allEmpty = false;
break;
}
}
if(allEmpty) {
if(m_debug)
log(LOG_DEBUG,"toptree: all Msg2 lists are empty");
return true;
}
// Normally m_msg39req->m_docsToGet is something sensible such as 10 or 50. Some specialized queries or attempts
// at DOSing can set it to something unreasonable such as 1,000,000. Internal functions such as QueryReindex set
// m_msg39req->m_docsToGet to 99999999 meaning "all documents".
// We cannot protect against DOSs here, but the 99999999 value must be handled. The problem is that 99999999
// would likely cause OOM, so we have to size the toptree to what is actually in the database. And in the case
// the database-derived size causes OOM then we'd have to use the documentSplit functionality (which is
// currently defunct after nomerge2 branch was merged to master. See Msg39.cpp for details).
//
// Strategy:
// - if m_msg39req->m_docsToGet is smallish then accept it. Only adjust as needed by enabled clustering.
// - otherwise get an estimated list size from Posdb so we can put an upper (and better) limit on
// m_msg39req->m_docsToGet instead of 99999999
int32_t docsWanted;
if(m_msg39req->m_docsToGet <= 1000) {
// smallish, accept as-is
docsWanted = m_msg39req->m_docsToGet;
if(m_debug)
log(LOG_DEBUG, "toptree: docsToGet is small (%d), docsWanted = %d", m_msg39req->m_docsToGet, docsWanted);
} else {
//not small. Get list size estimates and estimate an upper limit on the number of documents that could possibly match.
//
//we cannot calculate the estimate based on m_msg2->getList(...)->getListSize() because m_msg2 only holds the lists
//from the current fileNumber (eg. posdb0007.dat) and we need the estimate over all the files + bucket
int64_t totalEstimatedEntries = 0;
for(int i=0; i<m_q->getNumTerms(); i++) {
int64_t termId = m_q->getTermId(i);
int64_t estimatedTermListSize = g_posdb.estimateLocalTermListSize(m_msg39req->m_collnum, termId);
logTrace(g_conf.m_logTracePosdb, "allocateTopTree: termId=%ld, estimatedTermListSize=%ld bytes", termId, estimatedTermListSize);
//If we have listsize=54 then due to the double compression of posdb entries it could be one
//document with 7 entries, or four documents with one entry each
//The latter is the worst case so we'll use that.
int32_t estimatedEntries = estimatedTermListSize / (sizeof(posdbkey_t)-6);
totalEstimatedEntries += estimatedEntries;
}
logTrace(g_conf.m_logTracePosdb, "totalEstimatedEntries=%ld", totalEstimatedEntries);
if(m_msg39req->m_docsToGet < totalEstimatedEntries) {
//wants less than what is in the DB. excellent
docsWanted = m_msg39req->m_docsToGet;
} else {
//wants as much or more than there is in the DB. Hmmm.
if(totalEstimatedEntries > INT32_MAX) { //32bit overflow
log(LOG_ERROR,"toptree: estimated number of documents = %ld. Cannot squeeze that into a TopTree", totalEstimatedEntries);
return false;
}
docsWanted = totalEstimatedEntries;
}
if(m_debug)
log(LOG_DEBUG, "toptree: docsToGet is large (%d), docsWanted = %d", m_msg39req->m_docsToGet, docsWanted);
}
//Magic multiplication. Original source had no comment on why this was done. TopTree appears to already
//take care of enlarging the allocation if clustering is enabled, so uhm... ?
if(m_msg39req->m_doSiteClustering)
docsWanted *= 2;
// limit to 2B docids i guess
docsWanted = gbmin(docsWanted,2000000000);
if(m_debug)
log(LOG_INFO, "toptree: toptree: initializing %d nodes",docsWanted);
if(docsWanted < m_msg39req->m_docsToGet) {
log("query: warning only getting up to %d docids even though %d requested because termlist sizes are smaller",
docsWanted, m_msg39req->m_docsToGet);
}
// keep it sane
if(docsWanted > m_msg39req->m_docsToGet * 2 && docsWanted > 60) {
docsWanted = m_msg39req->m_docsToGet * 2;
}
// this actually sets the # of nodes to MORE than docsWanted!!!
if(!m_topTree->setNumNodes(docsWanted, m_msg39req->m_doSiteClustering)) {
log("toptree: toptree: error allocating nodes: %s",
mstrerror(g_errno));
return false;
}
return true;
}
bool PosdbTable::allocateScoringInfo() {
// let's use the toptree's docsWanted count to try to get as many scores as possible,
// although it may still not work!
int32_t xx = m_topTree->getNumDocsWanted();
// try to fix a core of growing this table in a thread when xx == 1
xx = gbmax(xx,32);
//if ( m_msg39req->m_doSiteClustering ) xx *= 4;
// for seeing if a docid is in toptree. niceness=0.
//if ( ! m_docIdTable.set(8,0,xx*4,NULL,0,false,0,"dotb") )
// return false;
if ( m_msg39req->m_getDocIdScoringInfo ) {
m_scoreInfoBuf.setLabel ("scinfobuf" );
// . for holding the scoring info
// . add 1 for the \0 safeMemcpy() likes to put at the end so
// it will not realloc on us
if ( ! m_scoreInfoBuf.reserve ( xx * sizeof(DocIdScore) +100) ) {
return false;
}
// likewise how many query term pair scores should we get?
int32_t numTerms = m_q->m_numTerms;
// limit
numTerms = gbmin(numTerms,10);
// the pairs. divide by 2 since (x,y) is same as (y,x)
int32_t numPairs = (numTerms * numTerms) / 2;
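// e.g. numTerms=4 gives (4*4)/2 = 8 pairs per docid, before the m_realMaxTop
// and per-docid (xx) multipliers applied below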
// then for each pair assume no more than MAX_TOP reps, usually
// it's just 1, but be on the safe side
numPairs *= m_realMaxTop;//MAX_TOP;
// now that is how many pairs per docid and can be 500! but we
// still have to multiply by how many docids we want to
// compute. so this could easily get into the megabytes, most
// of the time we will not need nearly that much however.
numPairs *= xx;
m_pairScoreBuf.setLabel ( "pairbuf" );
m_singleScoreBuf.setLabel ("snglbuf" );
// but alloc it just in case
if ( ! m_pairScoreBuf.reserve (numPairs * sizeof(PairScore) ) ) {
return false;
}
// and for singles
int32_t numSingles = numTerms * m_realMaxTop * xx; // MAX_TOP *xx;
if ( !m_singleScoreBuf.reserve(numSingles*sizeof(SingleScore))) {
return false;
}
}
return true;
}
////////////////////
//
// Functions used to find candidate docids, including "Voting" functions
// used to find docids with all required term ids and functions to remove
// docids that do not match all required terms
//
////////////////////
//
// Run through each term sublist and remove all docids not
// found in the docid vote buffer
//
void PosdbTable::delNonMatchingDocIdsFromSubLists() {
logTrace(g_conf.m_logTracePosdb, "BEGIN.");
//phase 1: shrink the rdblists for all queryterms (except those with a minus sign)
std::valarray<char *> newEndPtr(m_q->m_numTerms);
for(int i=0; i<m_q->m_numTerms; i++) {
newEndPtr[i] = NULL;
if(m_q->m_qterms[i].m_termSign=='-')
continue;
RdbList *list = m_q->m_qterms[i].m_posdbListPtr;
if(!list || list->isEmpty())
continue;
// get that sublist
char *subListPtr = list->getList();
char *subListEnd = list->getListEnd();
char *dst = subListPtr;
// reset docid list ptrs
const char *dp = m_docIdVoteBuf.getBufStart();
const char *dpEnd = dp + m_docIdVoteBuf.length();
//log(LOG_INFO,"@@@@ i#%d subListPtr=%p subListEnd=%p", i, subListPtr, subListEnd);
for(;;) {
// scan the docid list for the current docid in this termlist
for ( ; dp < dpEnd; dp += 6 ) {
// if current docid in docid list is >= the docid
// in the sublist, stop. docid in list is 6 bytes and
// subListPtr must be pointing to a 12 byte posdb rec.
if ( *(uint32_t *)(dp+1) > *(uint32_t *)(subListPtr+8) ) {
break;
}
// try to catch up docid if it is behind
if ( *(uint32_t *)(dp+1) < *(uint32_t *)(subListPtr+8) ) {
continue;
}
// check lower byte if equal
if ( *(unsigned char *)(dp) > (*(unsigned char *)(subListPtr+7) & 0xfc ) ) {
break;
}
if ( *(unsigned char *)(dp) < (*(unsigned char *)(subListPtr+7) & 0xfc ) ) {
continue;
}
// copy over the 12 byte key
*(int64_t *)dst = *(int64_t *)subListPtr;
*(int32_t *)(dst+8) = *(int32_t *)(subListPtr+8);
// skip that
dst += 12;
subListPtr += 12;
// copy over any 6 bytes keys following
for ( ; ; ) {
if ( subListPtr >= subListEnd ) {
// give up on this exhausted term list!
goto doneWithSubList;
}
// next docid will be the next 12 byte key
if ( ! ( subListPtr[0] & 0x04 ) ) {
break;
}
// otherwise it's 6 bytes
*(int32_t *)dst = *(int32_t *)subListPtr;
*(int16_t *)(dst+4) = *(int16_t *)(subListPtr+4);
dst += 6;
subListPtr += 6;
}
}
// skip that docid record in our termlist. it MUST have been
// 12 bytes, a docid heading record.
subListPtr += 12;
// skip any following keys that are 6 bytes, that means they
// share the same docid
for ( ; ; ) {
// list exhausted?
if ( subListPtr >= subListEnd ) {
goto doneWithSubList;
}
// stop if next key is 12 bytes, that is a new docid
if ( ! (subListPtr[0] & 0x04) ) {
break;
}
// skip it
subListPtr += 6;
}
}
doneWithSubList:
//log(LOG_INFO,"@@@ shrunk #%d to %ld (%p-%p)", i, dst - list->getList(), list->getList(), dst);
newEndPtr[i] = dst;
}
//phase 2: set the matchingsublist pointers in qti
for(int i=0; i<m_numQueryTermInfos; i++) {
QueryTermInfo *qti = &(m_queryTermInfos[i]);;
qti->m_numMatchingSubLists = 0;
if(qti->m_numSubLists>0 && qti->m_subList[0].m_bigramFlag & BF_NEGATIVE)
continue; //don't modify sublist for negative terms
#ifdef _VALGRIND_
VALGRIND_MAKE_MEM_UNDEFINED(qti->m_matchingSublist, sizeof(qti->m_matchingSublist));
#endif
for(int j=0; j<qti->m_numSubLists; j++) {
for(int k=0; k<m_q->m_numTerms; k++) {
if(qti->m_subList[j].m_list == m_q->m_qterms[k].m_posdbListPtr) {
char *newStartPtr = m_q->m_qterms[k].m_posdbListPtr->getList(); //same as always
int32_t x = qti->m_numMatchingSubLists;
qti->m_matchingSublist[x].m_size = newEndPtr[k] - newStartPtr;
qti->m_matchingSublist[x].m_start = newStartPtr;
qti->m_matchingSublist[x].m_end = newEndPtr[k];
qti->m_matchingSublist[x].m_cursor = newStartPtr;
qti->m_matchingSublist[x].m_savedCursor = newStartPtr;
qti->m_matchingSublist[x].m_baseSubListIndex = j;
qti->m_numMatchingSubLists++;
break;
}
}
}
}
//calculate qti[].m_maxMatchingTermFreqWeight
for(int i=0; i<m_numQueryTermInfos; i++) {
QueryTermInfo *qti = &(m_queryTermInfos[i]);
if(qti->m_numMatchingSubLists>0) {
float maxMatchingTermFreqWeight = qti->m_subList[qti->m_matchingSublist[0].m_baseSubListIndex].m_qt->m_termFreqWeight;
for(int j=1; j<qti->m_numMatchingSubLists; j++) {
float w = qti->m_subList[qti->m_matchingSublist[j].m_baseSubListIndex].m_qt->m_termFreqWeight;
if(w>maxMatchingTermFreqWeight)
maxMatchingTermFreqWeight = w;
}
qti->m_maxMatchingTermFreqWeight = maxMatchingTermFreqWeight;
} else
qti->m_maxMatchingTermFreqWeight = -1.0;
}
logTrace(g_conf.m_logTracePosdb, "END.");
}
//
// Removes docids with vote -1, which is set for docids matching
// negative query terms (e.g. -rock)
//
void PosdbTable::delDocIdVotes ( const QueryTermInfo *qti ) {
char *bufStart = m_docIdVoteBuf.getBufStart();
char *voteBufPtr = NULL;
char *voteBufEnd;
logTrace(g_conf.m_logTracePosdb, "BEGIN.");
// just scan each sublist vs. the docid list
for ( int32_t i = 0 ; i < qti->m_numSubLists ; i++ ) {
// get that sublist
char *subListPtr = qti->m_subList[i].m_list->getList();
char *subListEnd = qti->m_subList[i].m_list->getListEnd();
// reset docid list ptrs
voteBufPtr = m_docIdVoteBuf.getBufStart();
voteBufEnd = voteBufPtr + m_docIdVoteBuf.length();
// loop it
while ( subListPtr < subListEnd ) {
// scan for his docids and inc the vote
for ( ;voteBufPtr <voteBufEnd ;voteBufPtr += 6 ) {
// if current docid in docid list is >= the docid
// in the sublist, stop. docid in list is 6 bytes and
// subListPtr must be pointing to a 12 byte posdb rec.
if ( *(uint32_t *)(voteBufPtr+1) > *(uint32_t *)(subListPtr+8) ) {
break;
}
// less than? keep going
if ( *(uint32_t *)(voteBufPtr+1) < *(uint32_t *)(subListPtr+8) ) {
continue;
}
// top 4 bytes are equal. check lower single byte then.
if ( *(unsigned char *)(voteBufPtr) > (*(unsigned char *)(subListPtr+7) & 0xfc ) ) {
break;
}
if ( *(unsigned char *)(voteBufPtr) < (*(unsigned char *)(subListPtr+7) & 0xfc ) ) {
continue;
}
// . equal! mark it as nuked!
voteBufPtr[5] = -1;
// skip it
voteBufPtr += 6;
// advance subListPtr now
break;
}
// if we've exhausted this docid list go to next sublist
if ( voteBufPtr >= voteBufEnd ) {
goto endloop2;
}
// skip that docid record in our termlist. it MUST have been
// 12 bytes, a docid heading record.
subListPtr += 12;
// skip any following keys that are 6 bytes, that means they
// share the same docid
for ( ; subListPtr < subListEnd && ((*subListPtr)&0x04); ) {
subListPtr += 6;
}
// if we have more posdb recs in this sublist, then keep
// adding our docid votes into the docid list
}
endloop2: ;
// otherwise, advance to next sublist
}
// now remove docids with a -1 vote, they are nuked
voteBufPtr = m_docIdVoteBuf.getBufStart();
voteBufEnd = voteBufPtr + m_docIdVoteBuf.length();
char *dst = voteBufPtr;
for ( ; voteBufPtr < voteBufEnd ; voteBufPtr += 6 ) {
// do not re-copy it if it was in this negative termlist
if(voteBufPtr[5] != -1) {
// copy it over. might be the same address!
*(int32_t *) dst = *(int32_t *) voteBufPtr;
*(int16_t *)(dst+4) = *(int16_t *)(voteBufPtr+4);
dst += 6;
}
}
// shrink the buffer size now
m_docIdVoteBuf.setLength ( dst - bufStart );
logTrace(g_conf.m_logTracePosdb, "END.");
}
//
// First call will allocate the m_docIdVoteBuf and add docids from the shortest
// term list to the buffer.
//
// Next calls will run through all term sublists (synonyms, term variations) and
// increase the score in the m_docIdVoteBuf for matching docids. Docids that do
// not match a term is removed, so we end up with list of docids matching all
// query terms.
//
void PosdbTable::addDocIdVotes( const QueryTermInfo *qti, int32_t listGroupNum) {
logTrace(g_conf.m_logTracePosdb, "BEGIN.");
// sanity check, we store this in a single byte below for voting
if ( listGroupNum >= 256 ) {
gbshutdownAbort(true);
}
//
// add the first sublist's docids into the docid buf
//
if( listGroupNum == 0 ) {
// the first listGroup is not intersecting, just adding to
// the docid vote buf. that is, if the query is "jump car" we
// just add all the docids for "jump". Subsequent calls to
// this function will intersect those docids with the docids
// for "car", resulting in a buffer with docids that contain
// both terms.
makeDocIdVoteBufForRarestTerm(qti);
logTrace(g_conf.m_logTracePosdb, "END.");
return;
}
char *bufStart = m_docIdVoteBuf.getBufStart();
char *voteBufPtr = NULL;
char *voteBufEnd;
//
// For each sublist (term variation) loop through all sublist records
// and compare with docids in the vote buffer. If a match is found, the
// score in the vote buffer is set to the current ListGroupNum.
//
// A sublist is a termlist for a particular query term, for instance
// the query term "jump" will have sublists for "jump" "jumps"
// "jumping" "jumped" and maybe even "jumpy", so that could be
// 5 sublists, and their QueryTermInfo::m_qtermNum should be the
// same for all 5.
//
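// Vote progression sketch for the query "jump car" (two list groups): after
// group 0 the buf holds every "jump" docid with vote byte 0; group 1 sets the
// vote byte to 1 for docids that also contain "car", and the compaction below
// drops every entry whose vote byte != listGroupNum.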
for ( int32_t i = 0 ; i < qti->m_numSubLists; i++) {
// get that sublist
const char *subListPtr = qti->m_subList[i].m_list->getList();
const char *subListEnd = qti->m_subList[i].m_list->getListEnd();
// reset docid list ptrs
voteBufPtr = m_docIdVoteBuf.getBufStart();
voteBufEnd = voteBufPtr + m_docIdVoteBuf.length();
// loop it
handleNextSubListRecord:
// scan for his docids and inc the vote
for ( ;voteBufPtr < voteBufEnd ; voteBufPtr += 6 ) {
// if current docid in docid list is >= the docid
// in the sublist, stop. docid in list is 6 bytes and
// subListPtr must be pointing to a 12 byte posdb rec.
if ( *(uint32_t *)(voteBufPtr+1) > *(uint32_t *)(subListPtr+8) ) {
break;
}
// less than? keep going
if ( *(uint32_t *)(voteBufPtr+1) < *(uint32_t *)(subListPtr+8) ) {
continue;
}
// top 4 bytes are equal. check lower single byte then.
if ( *(unsigned char *)(voteBufPtr) > (*(unsigned char *)(subListPtr+7) & 0xfc ) ) {
break;
}
if ( *(unsigned char *)(voteBufPtr) < (*(unsigned char *)(subListPtr+7) & 0xfc ) ) {
continue;
}
// . equal! record our vote!
// . we start at zero for the
// first termlist, and go to 1, etc.
voteBufPtr[5] = listGroupNum;
// skip it
voteBufPtr += 6;
// break out to advance subListPtr
break;
}
// if we've exhausted this docid list go to next sublist
// since this docid is NOT in the current/ongoing intersection
// of the docids for each queryterm
if ( voteBufPtr >= voteBufEnd ) {
continue;
}
// skip that docid record in our termlist. it MUST have been
// 12 bytes, a docid heading record.
subListPtr += 12;
// skip any following keys that are 6 bytes, that means they
// share the same docid
for ( ; subListPtr < subListEnd && ((*subListPtr)&0x04); ) {
subListPtr += 6;
}
// if we have more posdb recs in this sublist, then keep
// adding our docid votes into the docid list
if ( subListPtr < subListEnd ) {
goto handleNextSubListRecord;
}
// otherwise, advance to next sublist
}
//
// Shrink the docidbuf by removing docids with not enough
// votes which means they are missing a query term
//
voteBufPtr = m_docIdVoteBuf.getBufStart();
voteBufEnd = voteBufPtr + m_docIdVoteBuf.length();
char *dst = voteBufPtr;
for ( ; voteBufPtr < voteBufEnd ; voteBufPtr += 6 ) {
// skip if it has enough votes to be in search
// results so far
if(voteBufPtr[5] == listGroupNum) {
// copy it over. might be the same address!
*(int32_t *) dst = *(int32_t *) voteBufPtr;
*(int16_t *)(dst+4) = *(int16_t *)(voteBufPtr+4);
dst += 6;
}
}
// shrink the buffer size
m_docIdVoteBuf.setLength ( dst - bufStart );
logTrace(g_conf.m_logTracePosdb, "END.");
}
//
// Initialize the vote buffer with docids from the shortest
// term (rarest required term) list and initialize the scores to 0.
// Called by addDocIdVotes
//
// The buffer consists of 5-byte docids and 1-byte scores. The score
// is incremented for each term that matches the docid, and after
// each run, the list is "compacted" and shortened so only the
// matching docids are left.
//
void PosdbTable::makeDocIdVoteBufForRarestTerm(const QueryTermInfo *qti) {
char *cursor[MAX_SUBLISTS];
char *cursorEnd[MAX_SUBLISTS];
logTrace(g_conf.m_logTracePosdb, "term id [%" PRId64 "] [%.*s]", qti->m_subList[0].m_qt->m_termId, qti->m_subList[0].m_qt->m_termLen, qti->m_subList[0].m_qt->m_term);
for ( int32_t i = 0 ; i < qti->m_numSubLists ; i++ ) {
// get that sublist
cursor [i] = qti->m_subList[i].m_list->getList();
cursorEnd [i] = qti->m_subList[i].m_list->getListEnd();
}
char * const bufStart = m_docIdVoteBuf.getBufStart();
char *voteBufPtr = m_docIdVoteBuf.getBufStart();
char *lastMinRecPtr = NULL;
int32_t mini = -1;
// get the next min from all the termlists
for(;;) {
char *minRecPtr = NULL;
// just scan each sublist vs. the docid list
for ( int32_t i = 0 ; i < qti->m_numSubLists ; i++ ) {
// skip if exhausted
if ( ! cursor[i] ) {
continue;
}
// shortcut
char *recPtr = cursor[i];
// get the min docid
if ( ! minRecPtr ) {
minRecPtr = recPtr;
mini = i;
continue;
}
// compare!
if ( *(uint32_t *)(recPtr +8) >
*(uint32_t *)(minRecPtr+8) ) {
continue;
}
// a new min
if ( *(uint32_t *)(recPtr +8) <
*(uint32_t *)(minRecPtr+8) ) {
minRecPtr = recPtr;
mini = i;
continue;
}
// check lowest byte
if ( (*(unsigned char *)(recPtr +7) & 0xfc ) >
(*(unsigned char *)(minRecPtr+7) & 0xfc ) ) {
continue;
}
// a new min
if ( (*(unsigned char *)(recPtr +7) & 0xfc ) <
(*(unsigned char *)(minRecPtr+7) & 0xfc ) ) {
minRecPtr = recPtr;
mini = i;
continue;
}
}
// if no min then all lists exhausted!
if ( ! minRecPtr ) {
// update length
m_docIdVoteBuf.setLength ( voteBufPtr - bufStart );
// all done!
logTrace(g_conf.m_logTracePosdb, "END.");
return;
}
// advance that guy over that docid
cursor[mini] += 12;
// 6 byte keys follow?
for ( ; ; ) {
// end of list?
if ( cursor[mini] >= cursorEnd[mini] ) {
// use NULL to indicate list is exhausted
cursor[mini] = NULL;
break;
}
// if we hit a new 12 byte key for a new docid, stop
if ( ! ( cursor[mini][0] & 0x04 ) ) {
break;
}
// otherwise, skip this 6 byte key
cursor[mini] += 6;
}
// is it a docid dup?
if(lastMinRecPtr &&
*(uint32_t *)(lastMinRecPtr+8) ==
*(uint32_t *)(minRecPtr+8) &&
(*(unsigned char *)(lastMinRecPtr+7)&0xfc) ==
(*(unsigned char *)(minRecPtr+7)&0xfc)) {
continue;
}
// . do not store the docid if not in the whitelist
// . FIX: two lower bits, what are they? at minRecPtr[7].
// . well the lowest bit is the siterank upper bit and the
// other bit is always 0. we should be ok with just using
// the 6 bytes of the docid ptr as is though since the siterank
// should be the same for the site: terms we indexed for the same
// docid!!
if ( m_useWhiteTable && ! m_whiteListTable.isInTable(minRecPtr+7) ) {
continue;
}
// only update this if we add the docid... that way there can be
// a winning "inRange" term in another sublist and the docid will
// get added.
lastMinRecPtr = minRecPtr;
// store our docid. actually it contains two lower bits not
// part of the docid, so we'll have to shift and mask to get
// the actual docid!
// docid is only 5 bytes for now
*(int32_t *)(voteBufPtr+1) = *(int32_t *)(minRecPtr+8);
// the single lower byte
voteBufPtr[0] = minRecPtr[7] & 0xfc;
// 0 vote count
voteBufPtr[5] = 0;
// debug
// int64_t dd = Posdb::getDocId(minRecPtr);
// log(LOG_ERROR, "%s:%s: adding docid %" PRId64 "", __FILE__, __func__, dd);
// advance
voteBufPtr += 6;
}
}
// TODO: do this in docid range phases to save memory and be much faster
// since we could contain to the L1 cache for hashing
bool PosdbTable::makeDocIdVoteBufForBoolQuery( ) {
logTrace(g_conf.m_logTracePosdb, "BEGIN.");
// . make a hashtable of all the docids from all the termlists
// . the value slot will be the operand bit vector i guess
// . the size of the vector needs one bit per query operand
// . if the vector is only 1-2 bytes we can just evaluate each
// combination we encounter and store it into an array, otherwise,
// we can use another hashtable in order to avoid re-evaluation
// on if it passes the boolean query.
char bitVec[MAX_OVEC_SIZE];
if ( m_vecSize > MAX_OVEC_SIZE ) {
m_vecSize = MAX_OVEC_SIZE;
}
// . scan each list of docids to a get a new docid, keep a dedup
// table to avoid re-processing the same docid.
// . each posdb list we read corresponds to a query term,
// or a synonym of a query term, or bigram of a query term, etc.
// but we really want to know what operand, so we associate an
// operand bit with each query term, and each list can map to
// the base query term so we can get the operand # from that.
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// get it
QueryTermInfo *qti = &(m_queryTermInfos[i]);
QueryTerm *qt = &m_q->m_qterms[qti->m_qtermNum];
// get the query word
//QueryWord *qw = qt->m_qword;
// just use the word # now
//int32_t opNum = qw->m_wordNum;//opNum;
// . make it consistent with Query::isTruth()
// . m_bitNum is set above to the QueryTermInfo #
int32_t bitNum = qt->m_bitNum;
// do not consider for adding if negative ('my house -home')
//if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) continue;
// set all to zeroes
memset ( bitVec, 0, m_vecSize );
// set bitvec for this query term #
int32_t byte = bitNum / 8;
unsigned char mask = 1<<(bitNum % 8);
bitVec[byte] |= mask;
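// e.g. bitNum=10 -> byte = 10/8 = 1, mask = 1<<(10%8) = 0x04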
// each query term can have synonym lists etc. scan those.
// this includes the original query termlist as well.
for ( int32_t j = 0 ; j < qti->m_numSubLists ; j++ ) {
// scan all docids in this list
const char *p = qti->m_subList[j].m_list->getList();
const char *pend = qti->m_subList[j].m_list->getListEnd();
//int64_t lastDocId = 0LL;
// scan the sub termlist #j
for ( ; p < pend ; ) {
// place holder
int64_t docId = Posdb::getDocId(p);
// sanity
//if ( d < lastDocId )
// gbshutdownAbort(true);
//lastDocId = d;
// point to it
//char *voteBufPtr = p + 8;
// this was the first key for this docid for this termid, and possibly
// the first key for the whole termid, so skip it: either 12 or 18
// bytes
if ( (((char *)p)[0])&0x02 ) {
p += 12;
}
// the first key for this termid?
else {
p += 18;
}
// then only 6 byte keys would follow from the
// same docid, so skip those as well
subloop:
if( p < pend && (((char *)p)[0]) & 0x04 ) {
p += 6;
goto subloop;
}
// convert docid into hash key
//int64_t docId = *(int64_t *)voteBufPtr;
// shift down 2 bits
//docId >>= 2;
// and mask
//docId &= DOCID_MASK;
// test it
//int64_t docId = Posdb::getDocId(voteBufPtr-8);
//if ( d2 != docId )
// gbshutdownAbort(true);
// store this docid though. treat as int64_t
// but we mask with keymask
int32_t slot = m_bt.getSlot ( &docId );
if ( slot < 0 ) {
// we can't alloc in a thread, careful
if ( ! m_bt.addKey(&docId,bitVec) ) {
gbshutdownAbort(true);
}
continue;
}
// or the bit in otherwise
char *bv = (char *)m_bt.getValueFromSlot(slot);
bv[byte] |= mask;
}
}
}
// debug info
// int32_t nc = m_bt.getLongestString();
// log("posdb: string of %" PRId32" filled slots!",nc);
char *dst = m_docIdVoteBuf.getBufStart();
// . now our hash table is filled with all the docids
// . evaluate each bit vector
for ( int32_t i = 0 ; i < m_bt.getNumSlots() ; i++ ) {
// skip if empty
if ( ! m_bt.m_flags[i] ) {
continue;
}
// get the bit vector
const unsigned char *vec = (unsigned char *)m_bt.getValueFromSlot(i);
// hash the vector
int64_t h64 = 0LL;
for ( int32_t k = 0 ; k < m_vecSize ; k++ )
h64 ^= g_hashtab[(unsigned char)vec[k]][(unsigned char)k];
// check in hash table
char *val = (char *)m_ct.getValue ( &h64 );
// debug: log the bit-vector evaluation for this docid
if ( m_debug ) {
int64_t docId =*(int64_t *)m_bt.getKeyFromSlot(i);
log(LOG_INFO, "query: eval d=%" PRIu64" vec[0]=%" PRIx32" h64=%" PRId64, docId,(int32_t)vec[0],h64);
}
// add him to the good table
if ( val && *val ) {
// it passes, add the docid
int64_t docId =*(int64_t *)m_bt.getKeyFromSlot(i);
// fix it up
if ( m_debug ) {
log(LOG_INFO, "query: adding d=%" PRIu64" bitVecSize=%" PRId32" bitvec[0]=0x%" PRIx32" (TRUE)",
docId,m_vecSize,(int32_t)vec[0]);
}
// shift up
docId <<= 2;
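// shifting left 2 bits puts the docid into the same alignment the 6-byte
// vote-buf entries use (the low 2 bits of the stored key are not docid bits)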
// a 6 byte key means you pass
memcpy ( dst, &docId, 6 );
dst += 6;
continue;
}
// evaluate the vector
char include = m_q->matchesBoolQuery ( (unsigned char *)vec, m_vecSize );
if ( include ) {
// it passes, add the docid
int64_t docId =*(int64_t *)m_bt.getKeyFromSlot(i);
// fix it up
if ( m_debug ) {
log(LOG_INFO, "query: adding d=%" PRIu64" vec[0]=0x%" PRIx32, docId,(int32_t)vec[0]);
}
// shift up
docId <<= 2;
// a 6 byte key means you pass
memcpy ( dst, &docId, 6 );
// test it
if ( m_debug ) {
int64_t d2;
d2 = *(uint32_t *)(dst+1);
d2 <<= 8;
d2 |= (unsigned char)dst[0];
d2 >>= 2;
docId >>= 2;
if ( d2 != docId )
gbshutdownAbort(true);
}
// end test
dst += 6;
}
// store in hash table
m_ct.addKey ( &h64, &include );
}
// update SafeBuf::m_length
m_docIdVoteBuf.setLength ( dst - m_docIdVoteBuf.getBufStart() );
// now sort the docids. TODO: break makeDocIdVoteBufForBoolQuery()
// up into docid ranges so we have like 1/100th the # of docids to
// sort. that should make this part a lot faster.
// i.e. 1000*log(1000) > 100*(10*log(10)) --> 3000 > 1000
// i.e. it's faster to break it down into 100 pieces
// i.e. for log base 2 maybe it's like 10x faster...
qsort ( m_docIdVoteBuf.getBufStart(),
m_docIdVoteBuf.length() / 6,
6,
docIdVoteBufKeyCompare_desc );
logTrace(g_conf.m_logTracePosdb, "END.");
return true;
}
////////////////////
//
// Global functions
//
////////////////////
// sort vote buf entries in descending order
static int docIdVoteBufKeyCompare_desc ( const void *h1, const void *h2 ) {
return KEYCMP((const char*)h1, (const char*)h2, 6);
}
// . b-step into list looking for docid "docId"
// . assume p is start of list, excluding 6 byte of termid
static inline const char *getWordPosList(uint64_t docId, const char *list, int32_t listSize) {
// make step divisible by 6 initially
int32_t step = (listSize / 12) * 6;
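// keys are 6 or 12 bytes, so a step that is a multiple of 6 keeps "p" on the
// key grid; e.g. listSize=600 gives an initial step of (600/12)*6 = 300 bytes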
// shortcut
const char *listEnd = list + listSize;
// divide in half
const char *p = list + step;
// for detecting not founds
char count = 0;
for(;;) {
// save it
const char *origp = p;
// scan up to docid. we use this special bit to distinguish between
// 6-byte and 12-byte posdb keys
for ( ; p > list && (p[1] & 0x02); ) {
p -= 6;
}
// ok, we hit a 12 byte key i guess, so backup 6 more
p -= 6;
// ok, we got a 12-byte key then i guess
uint64_t d = Posdb::getDocId ( p );
// we got a match, but it might be a NEGATIVE key so
// we have to try to find the positive keys in that case
if ( d == docId ) {
// if its positive, no need to do anything else
if ( (p[0] & 0x01) == 0x01 ) return p;
// ok, it's negative, try to see if the positive is
// in here, if not then return NULL.
// save current pos
const char *current = p;
// back up to 6 byte key before this 12 byte key
p -= 6;
// now go backwards to previous 12 byte key
for ( ; p > list && (p[1] & 0x02); ) {
p -= 6;
}
// ok, we hit a 12 byte key i guess, so backup 6 more
p -= 6;
// is it there?
if ( p >= list && Posdb::getDocId(p) == docId ) {
// sanity. return NULL if it's negative! wtf????
if ( (p[0] & 0x01) == 0x00 ) return NULL;
// got it
return p;
}
// ok, no positive before us, try after us
p = current;
// advance over current 12 byte key
p += 12;
// now go forwards to next 12 byte key
for ( ; p < listEnd && (p[1] & 0x02); ) {
p += 6;
}
// is it there?
if ( p + 12 < listEnd && Posdb::getDocId(p) == docId ) {
// sanity. return NULL if it's negative! wtf????
if ( (p[0] & 0x01) == 0x00 ) {
return NULL;
}
// got it
return p;
}
// . crap, i guess just had a single negative docid then
// . return that and the caller will see its negative
return current;
}
// reduce step
//step /= 2;
step >>= 1;
// . make divisible by 6!
// . TODO: speed this up!!!
step = step - (step % 6);
// sanity
if ( step % 6 ) {
gbshutdownAbort(true);
}
// ensure never 0
if ( step <= 0 ) {
step = 6;
// return NULL if not found
if ( count++ >= 2 ) {
return NULL;
}
}
// go up or down then
if ( d < docId ) {
p = origp + step;
if ( p >= listEnd ) {
p = listEnd - 6;
}
}
else {
p = origp - step;
if ( p < list ) {
p = list;
}
}
}
}
// initialize the weights table
static void initWeights ( ) {
ScopedLock sl(s_mtx_weights);
if ( s_init ) {
return;
}
logTrace(g_conf.m_logTracePosdb, "BEGIN.");
s_scoringWeights.init(g_conf.m_baseScoringParameters);
// if two hashgroups are compatible they can be paired
for ( int32_t i = 0 ; i < HASHGROUP_END ; i++ ) {
// set this
s_inBody[i] = false;
// is it body?
if ( i == HASHGROUP_BODY ||
i == HASHGROUP_HEADING ||
i == HASHGROUP_INLIST ||
i == HASHGROUP_INMENU )
s_inBody[i] = true;
for ( int32_t j = 0 ; j < HASHGROUP_END ; j++ ) {
// assume not
s_isCompatible[i][j] = false;
// or both in body (and not title)
bool inBody1 = true;
if ( i != HASHGROUP_BODY &&
i != HASHGROUP_HEADING &&
i != HASHGROUP_INLIST &&
//i != HASHGROUP_INURL &&
i != HASHGROUP_INMENU )
inBody1 = false;
bool inBody2 = true;
if ( j != HASHGROUP_BODY &&
j != HASHGROUP_HEADING &&
j != HASHGROUP_INLIST &&
//j != HASHGROUP_INURL &&
j != HASHGROUP_INMENU )
inBody2 = false;
// no body allowed now!
if ( inBody1 || inBody2 ) {
continue;
}
//if ( ! inBody ) continue;
// now neither can be in the body, because we handle
// those cases in the new sliding window algo.
// if one term is only in the link text and the other
// term is only in the title, ... what then? i guess
// allow those here, but they will be penalized
// some with the fixed distance of like 64 units or
// something...
s_isCompatible[i][j] = true;
// if either is in the body then do not allow now
// and handle in the sliding window algo
//s_isCompatible[i][j] = 1;
}
}
s_init = true;
#ifdef _VALGRIND_
//we read from the weight tables without locking. tell helgrind to ignore that
VALGRIND_HG_DISABLE_CHECKING(&s_scoringWeights,sizeof(s_scoringWeights));
VALGRIND_HG_DISABLE_CHECKING(s_isCompatible,sizeof(s_isCompatible));
VALGRIND_HG_DISABLE_CHECKING(s_inBody,sizeof(s_inBody));
#endif
logTrace(g_conf.m_logTracePosdb, "END.");
}
// Called when ranking settings are changed. Normally called from update-parameter
// broadcast handling (see handleRequest3fLoop() )
void reinitializeRankingSettings()
{
s_init = false;
initWeights();
}
float getHashGroupWeight ( unsigned char hg ) {
initWeights();
return s_scoringWeights.m_hashGroupWeights[hg];
}
float getDiversityWeight ( unsigned char diversityRank ) {
initWeights();
return s_scoringWeights.m_diversityWeights[diversityRank];
}
float getDensityWeight ( unsigned char densityRank ) {
initWeights();
return s_scoringWeights.m_densityWeights[densityRank];
}
float getWordSpamWeight ( unsigned char wordSpamRank ) {
initWeights();
return s_scoringWeights.m_wordSpamWeights[wordSpamRank];
}
float getLinkerWeight ( unsigned char wordSpamRank ) {
initWeights();
return s_scoringWeights.m_linkerWeights[wordSpamRank];
}