// privacore-open-source-searc.../PosdbTable.h
// 2018-05-03 12:00:25 +02:00
//
// 324 lines
// 10 KiB
// C++

#ifndef GB_POSDB_TABLE_H
#define GB_POSDB_TABLE_H
#include "RdbList.h"
#include "HashTableX.h"
#include "ScoringWeights.h"
#include "BaseScoringParameters.h"
#include "Lang.h"
#include <vector>
// Weight lookups: each takes a single unsigned char rank/group value and
// returns a float weight. Only the declarations are visible in this header.
float getDiversityWeight ( unsigned char diversityRank );
float getDensityWeight ( unsigned char densityRank );
float getWordSpamWeight ( unsigned char wordSpamRank );
// NOTE(review): the parameter carries the same name as getWordSpamWeight()'s.
// This may be an intentional reuse of that rank field or a copy/paste
// leftover -- confirm against the definition.
float getLinkerWeight ( unsigned char wordSpamRank );
float getHashGroupWeight ( unsigned char hg );
// NOTE(review): presumably a wikipedia-related scoring weight; its use is not
// visible in this header -- confirm at the point of use.
#define WIKI_WEIGHT 0.10 // was 0.20
// if the query is 'the tigers' we weight the bigram "the tigers" by
// WIKI_BIGRAM_WEIGHT because it's in wikipedia.
// the factor was 1.20; it was raised to 1.40 for the 'the time machine' query
#define WIKI_BIGRAM_WEIGHT 1.40
// forward declarations
// . this header only uses these types through pointers and references,
//   so their full definitions are not needed here
// . DocIdScore is defined further down in this same header
class DocumentIndexChecker;
class TopTree;
class Msg2;
class Msg39Request;
class DocIdScore;
class Query;
class QueryTerm;
struct MiniMergeBuffer;
class PairScoreMatrix;
// capacity of the QueryTermInfo::m_subList[] and
// QueryTermInfo::m_matchingSublist[] arrays
#define MAX_SUBLISTS 50
// . extra per-term information attached to each QueryTerm
// . instances are meant to be 1-1 with the query terms in Query::m_qterms[]
class QueryTermInfo {
public:
	// . every list associated with this qti: the term itself, its bigrams
	//   and any synonyms
	// . only the first m_numSubLists entries are in use
	struct {
		const QueryTerm *m_qt;
		RdbList *m_list;
		// flags telling whether the bigram list should be scored higher
		char m_bigramFlag;
	} m_subList[MAX_SUBLISTS];
	int32_t m_numSubLists;

	// . filled in by delNonMatchingDocIdsFromSubLists()
	// . each entry describes an m_subList[] list after it has been shrunk to
	//   hold only the docids that match all required term ids
	struct {
		int32_t m_size;
		const char *m_start;
		const char *m_end;
		const char *m_cursor;
		const char *m_savedCursor;
		// index of the m_subList[] entry this one was derived from
		int m_baseSubListIndex;
	} m_matchingSublist[MAX_SUBLISTS];
	int32_t m_numMatchingSubLists;

	// = max(matchingsublist[]->sublist->qt->m_freqTermWeight)
	float m_maxMatchingTermFreqWeight;

	// which query term # in Query.h this corresponds to
	int32_t m_qtermNum;
	QueryTerm *m_qterm;

	// word position of this query term in the Words.h class
	int32_t m_qpos;

	// wikipedia phrase id, if this term starts one
	int32_t m_wikiPhraseId;

	// id of the phrase the term or bigram is in
	int32_t m_quotedStartId;

	// true when the base term to the left of this qti/baseterm is ignored
	bool m_leftTermIsIgnored;
};
class PosdbTable {
public:
PosdbTable();
~PosdbTable();
void reset();
// . sets m_q to point to q
void init(Query *q, bool debug, TopTree *topTree, const DocumentIndexChecker &documentIndexChecker, Msg2 *msg2, Msg39Request *r);
// has init already been called?
bool isInitialized() const { return m_initialized; }
// the new intersection/scoring algo
void intersectLists();
int64_t getTotalHits() const { return m_docIdVoteBuf.length() / 6; }
int32_t getFilteredCount() const { return m_filtered; }
// how long to add the last batch of lists
int64_t m_addListsTime;
int64_t m_t1 ;
int64_t m_t2 ;
SafeBuf m_scoreInfoBuf;
SafeBuf m_pairScoreBuf;
SafeBuf m_singleScoreBuf;
private:
TopTree *m_topTree;
//used during intersection, part of working area
std::vector<int32_t> m_wikiPhraseIds;
std::vector<int32_t> m_quotedStartIds;
std::vector<int32_t> m_qpos;
std::vector<int32_t> m_qtermNums;
std::vector<char> m_bflags;
bool m_hasMaxSerpScore;
uint64_t m_docId; //the current docid intersection is working on
Msg2 *m_msg2;
const DocumentIndexChecker *m_documentIndexChecker;
// a reference to the query
Query *m_q;
int32_t m_nqt;
// has init() been called?
bool m_initialized;
// are we in debug mode?
bool m_debug;
Msg39Request *m_msg39req;
BaseScoringParameters m_baseScoringParameters;
DerivedScoringWeights m_derivedScoringWeights;
HashTableX m_whiteListTable;
bool m_useWhiteTable;
bool m_addedSites;
bool allocateTopTree();
bool allocateScoringInfo();
bool setQueryTermInfo();
// allocation&preparation of m_whiteListTable
bool allocWhiteListTable();
void prepareWhiteListTable();
void intersectLists_real();
bool genDebugScoreInfo1(int32_t *numProcessed, int32_t *topCursor, bool *docInThisFile);
bool genDebugScoreInfo2(DocIdScore *dcs, int32_t *lastLen, uint64_t *lastDocId, char siteRank, float score, int32_t intScore, lang_t docLang);
void logDebugScoreInfo(int32_t loglevel);
void removeScoreInfoForDeletedDocIds();
bool advanceTermListCursors(const char *docIdPtr);
bool prefilterMaxPossibleScoreByDistance(float minWinningScore);
void mergeTermSubListsForDocId(MiniMergeBuffer *miniMergeBuffer, int *highestInlinkSiteRank);
void createNonBodyTermPairScoreMatrix(const MiniMergeBuffer *miniMergeBuffer, PairScoreMatrix *scoreMatrix);
float getMinSingleTermScoreSum(const MiniMergeBuffer *miniMergeBuffer, std::vector<const char *> &highestScoringNonBodyPos, DocIdScore *pdcs);
float getMinTermPairScoreSlidingWindow(const MiniMergeBuffer *miniMergeBuffer, const std::vector<const char *> &highestScoringNonBodyPos, std::vector<const char *> &bestMinTermPairWindowPtrs, std::vector<const char *> &xpos, const PairScoreMatrix &scoreMatrix, DocIdScore *pdcs);
float getMaxScoreForNonBodyTermPair(const MiniMergeBuffer *miniMergeBuffer, int i, int j, int32_t qdist);
float getBestScoreSumForSingleTerm(const MiniMergeBuffer *miniMergeBuf, int32_t i, DocIdScore *pdcs, const char **highestScoringNonBodyPos);
float getScoreForTermPair(const MiniMergeBuffer *miniMergeBuffer, const char *wpi, const char *wpj, int32_t fixedDistance, int32_t qdist);
void findMinTermPairScoreInWindow(const MiniMergeBuffer *miniMergeBuffer, const std::vector<const char *> &ptrs, std::vector<const char *> *bestMinTermPairWindowPtrs, float *bestMinTermPairWindowScore, const std::vector<const char *> &highestScoringNonBodyPos, const PairScoreMatrix &scoreMatrix);
float getTermPairScoreForAny(const MiniMergeBuffer *miniMergeBuffer, int i, int j, const std::vector<const char *> &bestMinTermPairWindowPtrs, DocIdScore *pdcs);
void delNonMatchingDocIdsFromSubLists();
// for intersecting docids
void addDocIdVotes( const QueryTermInfo *qti , int32_t listGroupNum );
void makeDocIdVoteBufForRarestTerm(const QueryTermInfo *qti);
bool makeDocIdVoteBufForBoolQuery() ;
void delDocIdVotes ( const QueryTermInfo *qti ); // for negative query terms...
bool findCandidateDocIds();
// upper score bound
float getMaxPossibleScore(const QueryTermInfo *qti) ;
float modifyMaxScoreByDistance(float score,
int32_t bestDist,
int32_t qdist,
const QueryTermInfo *qtm);
// stuff set in setQueryTermInf() function:
std::vector<QueryTermInfo> m_queryTermInfos;
int32_t m_numQueryTermInfos;
// the size of the smallest set of sublists. each sublists is
// the main term or a synonym, etc. of the main term.
int32_t m_minTermListSize;
// which query term info has the smallest set of sublists
int32_t m_minTermListIdx;
// intersect docids from each QueryTermInfo into here
SafeBuf m_docIdVoteBuf;
int32_t m_filtered;
// boolean truth table for boolean queries
HashTableX m_bt;
HashTableX m_ct;
// size of the data slot in m_bt
int32_t m_vecSize;
// are all positive query terms in same wikipedia phrase like
// 'time enough for love'?
bool m_allInSameWikiPhrase;
int32_t m_realMaxTop;
};
// distance used when measuring from a word in the title/linktext/etc.
// to a word in the body
#define FIXED_DISTANCE 400
// . plain record holding the details of one scored pair of query terms
// . fields ending in 1 belong to the first term of the pair, fields ending
//   in 2 to the second
// . member order determines the in-memory layout, so do not reorder
class PairScore {
public:
	// word positions
	int32_t m_wordPos1;
	int32_t m_wordPos2;

	// term frequencies and their weights
	int64_t m_termFreq1;
	int64_t m_termFreq2;
	float m_tfWeight1;
	float m_tfWeight2;

	// query term numbers
	int32_t m_qtermNum1;
	int32_t m_qtermNum2;

	int32_t m_qdist;
	float m_finalScore;

	// single-byte per-term attributes
	char m_isSynonym1;
	char m_isSynonym2;
	char m_isHalfStopWikiBigram1;
	char m_isHalfStopWikiBigram2;
	char m_diversityRank1;
	char m_diversityRank2;
	char m_densityRank1;
	char m_densityRank2;
	char m_wordSpamRank1;
	char m_wordSpamRank2;
	char m_hashGroup1;
	char m_hashGroup2;

	// single-byte attributes of the pair as a whole
	char m_inSameWikiPhrase;
	char m_fixedDistance;

	char m_bflags1;
	char m_bflags2;
};
// . plain record holding the details of one scored single query term
// . member order determines the in-memory layout, so do not reorder
class SingleScore {
public:
	int64_t m_termFreq;
	float m_finalScore;
	int32_t m_wordPos;
	float m_tfWeight;
	int32_t m_qtermNum;

	// single-byte attributes
	char m_isSynonym;
	char m_isHalfStopWikiBigram;
	char m_diversityRank;
	char m_densityRank;
	char m_wordSpamRank;
	char m_hashGroup;
	char m_bflags;

	// spare byte
	char m_reserved0;
};
// the members of the classes above are ordered by size so as to minimize internal padding and overall size
// we add up the pair scores of this many of the top-scoring pairs,
// for inlink text only, so it is cumulative. there is now also a
// parm "m_realMaxTop" which is <= MAX_TOP and can be used to
// tune this down.
#define MAX_TOP 10
// transparent query scoring info for one docid
class DocIdScore {
public:
	// a freshly constructed object starts out zeroed
	DocIdScore() {
		reset();
	}

	// zero every byte of this object, pointer members included
	void reset() {
		memset(this, 0, sizeof(DocIdScore));
	}

	// we use QueryChange::getDebugDocIdScore() to "deserialize" per se
	bool serialize ( class SafeBuf *sb );

	int64_t m_docId;
	double m_finalScore;
	char m_siteRank;
	char m_usePageTemperature;
	char m_reserved1;
	char m_reserved2;
	lang_t m_docLang;
	int32_t m_numRequiredTerms;

	// NEW 20170423
	float m_adjustedSiteRank;
	double m_pageTemperature;

	int32_t m_numPairs;
	int32_t m_numSingles;

	// . m_pairScores is just all the term pairs serialized
	// . each one holds the query term # of both terms in the pair plus the
	//   match number for that pair, because every pair of query terms can
	//   now have up to MAX_TOP associated pairs whose scores are added
	//   together to give the final score for that pair
	// . this is the record offset into PosdbTable::m_pairScoreBuf
	// . Msg39Reply::ptr_pairScoreBuf will be this
	int32_t m_pairsOffset;

	// . record offset into PosdbTable.m_singleScoreBuf
	// . Msg39Reply::ptr_singleScoreBuf will be this
	int32_t m_singlesOffset;

	// Msg3a.cpp::mergeLists() should set these ptrs after it copies over
	// a top DocIdScore for storing the final results array
	class PairScore *m_pairScores;
	class SingleScore *m_singleScores;
};
// Declared here, defined outside this header.
// NOTE(review): by its name this presumably re-derives the ranking/scoring
// settings after a configuration change -- confirm against the definition.
void reinitializeRankingSettings();
#endif // GB_POSDB_TABLE_H