324 lines
10 KiB
C++
324 lines
10 KiB
C++
#ifndef GB_POSDB_TABLE_H
|
|
#define GB_POSDB_TABLE_H
|
|
|
|
#include "RdbList.h"
|
|
#include "HashTableX.h"
|
|
#include "ScoringWeights.h"
|
|
#include "BaseScoringParameters.h"
|
|
#include "Lang.h"
|
|
#include <vector>
|
|
|
|
// Returns the scoring weight for the given diversity rank.
// Declaration only: the rank-to-weight mapping lives in the definition, which is not in this header.
float getDiversityWeight ( unsigned char diversityRank );
|
|
// Returns the scoring weight for the given density rank.
// Declaration only: the rank-to-weight mapping lives in the definition, which is not in this header.
float getDensityWeight ( unsigned char densityRank );
|
|
// Returns the scoring weight for the given word-spam rank.
// Declaration only: the rank-to-weight mapping lives in the definition, which is not in this header.
float getWordSpamWeight ( unsigned char wordSpamRank );
|
|
// Returns the linker weight for the given rank value. Declaration only.
// NOTE(review): the parameter carries the same name as in getWordSpamWeight();
// presumably the word-spam rank slot is reused for linker weighting - confirm
// against the definition.
float getLinkerWeight ( unsigned char wordSpamRank );
|
|
// Returns the scoring weight for hash group 'hg'. Declaration only: the
// mapping lives in the definition, which is not in this header.
float getHashGroupWeight ( unsigned char hg );
|
|
|
|
#define WIKI_WEIGHT 0.10 // was 0.20
|
|
|
|
// if query is 'the tigers' we weight bigram "the tigers" x 1.20 because
|
|
// it's in Wikipedia.
|
|
// up this to 1.40 for 'the time machine' query
|
|
#define WIKI_BIGRAM_WEIGHT 1.40
|
|
|
|
|
|
|
|
//forward declarations
|
|
class DocumentIndexChecker;
|
|
class TopTree;
|
|
class Msg2;
|
|
class Msg39Request;
|
|
class DocIdScore;
|
|
class Query;
|
|
class QueryTerm;
|
|
struct MiniMergeBuffer;
|
|
class PairScoreMatrix;
|
|
|
|
|
|
// Capacity of the fixed-size QueryTermInfo::m_subList[] and
// QueryTermInfo::m_matchingSublist[] arrays declared below.
#define MAX_SUBLISTS 50
|
|
|
|
// . each QueryTerm has this attached additional info now:
|
|
// . these should be 1-1 with query terms, Query::m_qterms[]
|
|
class QueryTermInfo {
|
|
public:
|
|
//The lists associated with this qti, including the term itself, 9-2 bigrams and any synonyms
|
|
struct {
|
|
const QueryTerm *m_qt;
|
|
RdbList *m_list;
|
|
// flags to indicate if bigram list should be scored higher
|
|
char m_bigramFlag;
|
|
} m_subList[MAX_SUBLISTS];
|
|
int32_t m_numSubLists;
|
|
|
|
// delNonMatchingDocIdsFromSubLists() set these. They
|
|
// point to m_subLists that have been reduced in size
|
|
// to only contain the docids matching all required term ids
|
|
struct {
|
|
int32_t m_size;
|
|
const char *m_start;
|
|
const char *m_end;
|
|
const char *m_cursor;
|
|
const char *m_savedCursor;
|
|
int m_baseSubListIndex; //which of m_subList[] entries it is based on
|
|
} m_matchingSublist[MAX_SUBLISTS];
|
|
int32_t m_numMatchingSubLists;
|
|
|
|
float m_maxMatchingTermFreqWeight; //= max(matchingsublist[]->sublist->qt->m_freqTermWeight)
|
|
|
|
// what query term # do we correspond to in Query.h
|
|
int32_t m_qtermNum;
|
|
QueryTerm *m_qterm;
|
|
// the word position of this query term in the Words.h class
|
|
int32_t m_qpos;
|
|
// the wikipedia phrase id if we start one
|
|
int32_t m_wikiPhraseId;
|
|
// phrase id term or bigram is in
|
|
int32_t m_quotedStartId;
|
|
//The base term to the left of this qti/baseterm is ignored
|
|
bool m_leftTermIsIgnored;
|
|
};
|
|
|
|
|
|
class PosdbTable {
|
|
public:
|
|
PosdbTable();
|
|
~PosdbTable();
|
|
void reset();
|
|
|
|
// . sets m_q to point to q
|
|
void init(Query *q, bool debug, TopTree *topTree, const DocumentIndexChecker &documentIndexChecker, Msg2 *msg2, Msg39Request *r);
|
|
|
|
// has init already been called?
|
|
bool isInitialized() const { return m_initialized; }
|
|
|
|
// the new intersection/scoring algo
|
|
void intersectLists();
|
|
|
|
int64_t getTotalHits() const { return m_docIdVoteBuf.length() / 6; }
|
|
int32_t getFilteredCount() const { return m_filtered; }
|
|
|
|
// how long to add the last batch of lists
|
|
int64_t m_addListsTime;
|
|
int64_t m_t1 ;
|
|
int64_t m_t2 ;
|
|
|
|
SafeBuf m_scoreInfoBuf;
|
|
SafeBuf m_pairScoreBuf;
|
|
SafeBuf m_singleScoreBuf;
|
|
|
|
private:
|
|
TopTree *m_topTree;
|
|
|
|
//used during intersection, part of working area
|
|
std::vector<int32_t> m_wikiPhraseIds;
|
|
std::vector<int32_t> m_quotedStartIds;
|
|
std::vector<int32_t> m_qpos;
|
|
std::vector<int32_t> m_qtermNums;
|
|
std::vector<char> m_bflags;
|
|
|
|
bool m_hasMaxSerpScore;
|
|
|
|
uint64_t m_docId; //the current docid intersection is working on
|
|
|
|
Msg2 *m_msg2;
|
|
|
|
const DocumentIndexChecker *m_documentIndexChecker;
|
|
|
|
// a reference to the query
|
|
Query *m_q;
|
|
int32_t m_nqt;
|
|
|
|
// has init() been called?
|
|
bool m_initialized;
|
|
|
|
// are we in debug mode?
|
|
bool m_debug;
|
|
|
|
Msg39Request *m_msg39req;
|
|
BaseScoringParameters m_baseScoringParameters;
|
|
DerivedScoringWeights m_derivedScoringWeights;
|
|
|
|
|
|
HashTableX m_whiteListTable;
|
|
bool m_useWhiteTable;
|
|
bool m_addedSites;
|
|
|
|
bool allocateTopTree();
|
|
bool allocateScoringInfo();
|
|
bool setQueryTermInfo();
|
|
|
|
// allocation&preparation of m_whiteListTable
|
|
bool allocWhiteListTable();
|
|
void prepareWhiteListTable();
|
|
|
|
void intersectLists_real();
|
|
|
|
bool genDebugScoreInfo1(int32_t *numProcessed, int32_t *topCursor, bool *docInThisFile);
|
|
bool genDebugScoreInfo2(DocIdScore *dcs, int32_t *lastLen, uint64_t *lastDocId, char siteRank, float score, int32_t intScore, lang_t docLang);
|
|
void logDebugScoreInfo(int32_t loglevel);
|
|
void removeScoreInfoForDeletedDocIds();
|
|
bool advanceTermListCursors(const char *docIdPtr);
|
|
bool prefilterMaxPossibleScoreByDistance(float minWinningScore);
|
|
void mergeTermSubListsForDocId(MiniMergeBuffer *miniMergeBuffer, int *highestInlinkSiteRank);
|
|
|
|
void createNonBodyTermPairScoreMatrix(const MiniMergeBuffer *miniMergeBuffer, PairScoreMatrix *scoreMatrix);
|
|
float getMinSingleTermScoreSum(const MiniMergeBuffer *miniMergeBuffer, std::vector<const char *> &highestScoringNonBodyPos, DocIdScore *pdcs);
|
|
float getMinTermPairScoreSlidingWindow(const MiniMergeBuffer *miniMergeBuffer, const std::vector<const char *> &highestScoringNonBodyPos, std::vector<const char *> &bestMinTermPairWindowPtrs, std::vector<const char *> &xpos, const PairScoreMatrix &scoreMatrix, DocIdScore *pdcs);
|
|
|
|
float getMaxScoreForNonBodyTermPair(const MiniMergeBuffer *miniMergeBuffer, int i, int j, int32_t qdist);
|
|
float getBestScoreSumForSingleTerm(const MiniMergeBuffer *miniMergeBuf, int32_t i, DocIdScore *pdcs, const char **highestScoringNonBodyPos);
|
|
float getScoreForTermPair(const MiniMergeBuffer *miniMergeBuffer, const char *wpi, const char *wpj, int32_t fixedDistance, int32_t qdist);
|
|
void findMinTermPairScoreInWindow(const MiniMergeBuffer *miniMergeBuffer, const std::vector<const char *> &ptrs, std::vector<const char *> *bestMinTermPairWindowPtrs, float *bestMinTermPairWindowScore, const std::vector<const char *> &highestScoringNonBodyPos, const PairScoreMatrix &scoreMatrix);
|
|
|
|
float getTermPairScoreForAny(const MiniMergeBuffer *miniMergeBuffer, int i, int j, const std::vector<const char *> &bestMinTermPairWindowPtrs, DocIdScore *pdcs);
|
|
|
|
void delNonMatchingDocIdsFromSubLists();
|
|
|
|
// for intersecting docids
|
|
void addDocIdVotes( const QueryTermInfo *qti , int32_t listGroupNum );
|
|
void makeDocIdVoteBufForRarestTerm(const QueryTermInfo *qti);
|
|
bool makeDocIdVoteBufForBoolQuery() ;
|
|
void delDocIdVotes ( const QueryTermInfo *qti ); // for negative query terms...
|
|
bool findCandidateDocIds();
|
|
|
|
// upper score bound
|
|
float getMaxPossibleScore(const QueryTermInfo *qti) ;
|
|
float modifyMaxScoreByDistance(float score,
|
|
int32_t bestDist,
|
|
int32_t qdist,
|
|
const QueryTermInfo *qtm);
|
|
|
|
// stuff set in setQueryTermInf() function:
|
|
std::vector<QueryTermInfo> m_queryTermInfos;
|
|
int32_t m_numQueryTermInfos;
|
|
// the size of the smallest set of sublists. each sublists is
|
|
// the main term or a synonym, etc. of the main term.
|
|
int32_t m_minTermListSize;
|
|
// which query term info has the smallest set of sublists
|
|
int32_t m_minTermListIdx;
|
|
// intersect docids from each QueryTermInfo into here
|
|
SafeBuf m_docIdVoteBuf;
|
|
|
|
int32_t m_filtered;
|
|
|
|
// boolean truth table for boolean queries
|
|
HashTableX m_bt;
|
|
HashTableX m_ct;
|
|
// size of the data slot in m_bt
|
|
int32_t m_vecSize;
|
|
|
|
// are all positive query terms in same wikipedia phrase like
|
|
// 'time enough for love'?
|
|
bool m_allInSameWikiPhrase;
|
|
|
|
int32_t m_realMaxTop;
|
|
};
|
|
|
|
|
|
// distance used when measuring word from title/linktext/etc to word in body
|
|
#define FIXED_DISTANCE 400
|
|
|
|
// Scoring record for one pair of query terms. Members that end in 1 / 2
// belong to the first / second term of the pair respectively.
// The member order defines the in-memory layout of the record and is kept
// exactly as it was.
class PairScore {
public:
	// 32-bit word positions of the two terms
	int32_t m_wordPos1;
	int32_t m_wordPos2;

	// 64-bit term frequencies
	int64_t m_termFreq1;
	int64_t m_termFreq2;

	// term frequency weights
	float   m_tfWeight1;
	float   m_tfWeight2;

	// query term numbers of the two terms
	int32_t m_qtermNum1;
	int32_t m_qtermNum2;

	int32_t m_qdist;
	float   m_finalScore;

	// one-byte attributes, two of each kind (one per term)
	char    m_isSynonym1;
	char    m_isSynonym2;
	char    m_isHalfStopWikiBigram1;
	char    m_isHalfStopWikiBigram2;
	char    m_diversityRank1;
	char    m_diversityRank2;
	char    m_densityRank1;
	char    m_densityRank2;
	char    m_wordSpamRank1;
	char    m_wordSpamRank2;
	char    m_hashGroup1;
	char    m_hashGroup2;

	// one-byte attributes of the pair as a whole
	char    m_inSameWikiPhrase;
	char    m_fixedDistance;

	char    m_bflags1;
	char    m_bflags2;
};
|
|
|
|
// Scoring record for a single query term.
// The member order defines the in-memory layout of the record and is kept
// exactly as it was (largest members first, one-byte members last).
class SingleScore {
public:
	int64_t m_termFreq;
	float   m_finalScore;
	int32_t m_wordPos;
	float   m_tfWeight;
	int32_t m_qtermNum;

	// one-byte attributes
	char    m_isSynonym;
	char    m_isHalfStopWikiBigram;
	char    m_diversityRank;
	char    m_densityRank;
	char    m_wordSpamRank;
	char    m_hashGroup;
	char    m_bflags;
	char    m_reserved0;
};
|
|
// The members of SingleScore above are sorted by size so as to minimize internal padding and overall size.
|
|
|
|
// we add up the pair scores of this many of the top-scoring pairs
|
|
// for inlink text only, so it is cumulative. But now we also
|
|
// have a parm "m_realMaxTop" which is <= MAX_TOP and can be used to
|
|
// tune this down.
|
|
#define MAX_TOP 10
|
|
|
|
// transparent query scoring info per docid
|
|
class DocIdScore {
|
|
public:
|
|
DocIdScore ( ) { reset(); }
|
|
|
|
void reset ( ) {
|
|
memset(this,0,sizeof(*this));
|
|
}
|
|
|
|
// we use QueryChange::getDebugDocIdScore() to "deserialize" per se
|
|
bool serialize ( class SafeBuf *sb );
|
|
|
|
int64_t m_docId;
|
|
double m_finalScore;
|
|
char m_siteRank;
|
|
char m_usePageTemperature;
|
|
char m_reserved1;
|
|
char m_reserved2;
|
|
lang_t m_docLang;
|
|
int32_t m_numRequiredTerms;
|
|
// NEW 20170423
|
|
float m_adjustedSiteRank;
|
|
double m_pageTemperature;
|
|
|
|
|
|
int32_t m_numPairs;
|
|
int32_t m_numSingles;
|
|
|
|
// . m_pairScores is just all the term pairs serialized
|
|
// . they contain their query term #1 of each term in the pair and
|
|
// they have the match number for each pair, since now each
|
|
// pair of query terms can have up to MAX_TOP associated pairs
|
|
// whose scores we add together to get the final score for that pair
|
|
// . record offset into PosdbTable::m_pairScoreBuf
|
|
// . Msg39Reply::ptr_pairScoreBuf will be this
|
|
int32_t m_pairsOffset;
|
|
// . record offset into PosdbTable.m_singleScoreBuf
|
|
// . Msg39Reply::ptr_singleScoreBuf will be this
|
|
int32_t m_singlesOffset;
|
|
|
|
// Msg3a.cpp::mergeLists() should set these ptrs after it
|
|
// copies over a top DocIdScore for storing the final results array
|
|
class PairScore *m_pairScores;
|
|
class SingleScore *m_singleScores;
|
|
};
|
|
|
|
// Declaration only; the definition is not in this header.
// NOTE(review): presumably recomputes the ranking/scoring weights after the
// ranking settings have changed - confirm against the definition.
void reinitializeRankingSettings();
|
|
|
|
#endif // GB_POSDB_TABLE_H
|