310 lines
11 KiB
C++
310 lines
11 KiB
C++
// Matt Wells, copyright Aug 2001
|
|
|
|
// . calls for intersecting a bunch of IndexLists to generate docIds
|
|
// . IndexLists are data-less lists (keys only)
|
|
// . each key in an IndexList is a termId/score/adultBit/docId tuple
|
|
// . we try to use as small a sublist of each IndexList as possible to avoid
|
|
// wasting network bandwidth
|
|
// . TODO: split into 2+ classes
|
|
|
|
// TODO: implement site clustering??????? in getNumResults()
|
|
|
|
// TODO: if we have in cache we can hash right into the table, but
|
|
// we must do that before blocking on something in case it disappears
|
|
// from the cache
|
|
|
|
// TODO: it is possible to get a better scoring result, even if we found
|
|
// 10 docIds in the heads of ALL the IndexLists. Because it may have
|
|
// a really high score in 3 of the IndexLists, but a low score in the
|
|
// fourth, but it's sum may be the highest of all docIds.
|
|
|
|
// TODO: the search "cell phone cable hp jornada 680", w/o quotes, should
|
|
// be quote forced anyway. Some pages will match all but "cable hp"
|
|
// so we should break that up into it's 2 terms, cable and hp
|
|
|
|
// 6912 (30%) of queries are 1 word queries 138
|
|
// 7825 (34%) of queries are 2 word queries 156
|
|
// 4512 (20%) of queries are 3 word queries 90
|
|
// 1771 ( 8%) of queries are 4 word queries 32
|
|
// 869 ( 4%) of queries are 5 word queries 17
|
|
// 391 ( 2%) of queries are 6 word queries 8
|
|
// 290 ( 1%) of queries are 7 word queries 6
|
|
// 183 ( 1%) of queries are 8 word queries (4 per second)
|
|
|
|
#ifndef _INDEXTABLE_H_
|
|
#define _INDEXTABLE_H_
|
|
|
|
#include "Query.h" // MAX_QUERY_TERMS, qvec_t
|
|
#include "Indexdb.h" // makeStartKey(), getTruncationLimit()
|
|
#include "IndexList.h" // for m_lists[]
|
|
#include "Titledb.h" // g_titledb.getTotalNumDocs()
|
|
#include "IndexReadInfo.h" // MAX_TIERS
|
|
|
|
// max # search results that can be viewed
|
|
#define MAX_RESULTS 1000
|
|
|
|
class IndexTable {
|
|
|
|
public:
|
|
|
|
// . returns false on error and sets errno
|
|
// . we now support multiple plus signs before the query term
|
|
// . start/endTermNums apply to phrase termIds only
|
|
// . allows us to set multiple bits when a phrase termId is matched
|
|
// in case the singleton was truncated, but doc has the phrase
|
|
// . if you want Default AND behaviour set requireAllTerms to true
|
|
// it is much faster, too
|
|
void init (Query *q,bool isDebug,void *logstate,bool requireAllTerms,
|
|
class TopTree *topTree );
|
|
|
|
// has init already been called?
|
|
bool isInitialized ( ) { return m_initialized; };
|
|
|
|
// sets m_positiveBits, etc.
|
|
//void prepareToAddLists ( );
|
|
|
|
// . returns false on error and sets errno
|
|
// . we assume there are "m_numTerms" lists passed in (see set() above)
|
|
void addLists_r ( IndexList lists[MAX_TIERS][MAX_QUERY_TERMS] ,
|
|
int32_t numTiers ,
|
|
int32_t numListsPerTier ,
|
|
Query *q ,
|
|
int32_t docsWanted ,
|
|
int32_t *totalListSizes ,
|
|
bool useDateLists ,
|
|
bool sortByDate ,
|
|
float sortByDateWeight );
|
|
|
|
// . these are set from calling addLists() above
|
|
// . we log all matching topDocIds if isDebug is true
|
|
int64_t *getTopDocIds ( int32_t tier ) { return m_topDocIds[tier]; };
|
|
|
|
unsigned char *getTopBitScores ( int32_t tier )
|
|
{ return m_topBitScores[tier]; };
|
|
|
|
char *getTopExplicits ( int32_t tier ) { return m_topExplicits[tier]; };
|
|
|
|
int32_t *getTopScores ( int32_t tier ) { return m_topScores[tier]; };
|
|
//uint32_t *getTopBitScores () { return m_finalTopBitScores; };
|
|
|
|
// make sure to call getTopDocIds() before calling this
|
|
int32_t getNumTopDocIds ( int32_t tier ) { return m_numTopDocIds[tier]; };
|
|
|
|
// . get how many results we have in the topDocIds list
|
|
// . if "thatIncludeAllTerms" is true, results must have all terms
|
|
// from all indexLists that we haven't read ALL of yet
|
|
int32_t getNumExactExplicitMatches ( int32_t tier ) {
|
|
return m_numExactExplicitMatches[tier];};
|
|
|
|
int32_t getNumExactImplicitMatches ( int32_t tier ) {
|
|
return m_numExactImplicitMatches[tier];};
|
|
|
|
// some generic stuff
|
|
IndexTable();
|
|
~IndexTable();
|
|
void reset();
|
|
|
|
// . call to set the m_final* member vars from the m_top* member vars
|
|
// . ALWAYS call this BEFORE calling
|
|
// getTopDocIds(), getNumTopDocIds() or getNumExactMatches()
|
|
void filterTopDocIds ( ) ;
|
|
|
|
// how long to add the last batch of lists
|
|
int64_t m_addListsTime;
|
|
uint32_t m_totalDocIds;
|
|
int32_t m_numPanics;
|
|
int32_t m_numCollisions;
|
|
int32_t m_numPtrs; // in the beginning at least
|
|
int32_t m_numLoops;
|
|
|
|
// how long to get top docIds
|
|
int64_t m_setTopDocIdsTime;
|
|
|
|
int64_t m_estimatedTotalHits;
|
|
|
|
int32_t m_numSlots;
|
|
|
|
// Msg39 needs to call these
|
|
void freeMem ( ) ;
|
|
bool alloc (IndexList lists[MAX_TIERS][MAX_QUERY_TERMS],
|
|
int32_t numTiers ,
|
|
int32_t numListsPerTier ,
|
|
int32_t docsWanted ,
|
|
bool sortByDate );
|
|
|
|
bool doRecall() { return m_doRecall; };
|
|
|
|
int32_t getNumDocsInTier ( int32_t i ) { return m_numDocsInTier[i]; };
|
|
|
|
// . sets m_scoreWeights[] based on termFreqs (IDF)
|
|
void setScoreWeights ( Query *q );
|
|
void setScoreWeights ( Query *q , bool phrase );
|
|
int32_t *getScoreWeights ( ) { return m_scoreWeights; };
|
|
|
|
private:
|
|
|
|
void addLists2_r ( IndexList lists[MAX_TIERS][MAX_QUERY_TERMS] ,
|
|
int32_t numTiers ,
|
|
int32_t numListsPerTier ,
|
|
Query *q ,
|
|
int32_t docsWanted ,
|
|
int32_t *imap ,
|
|
bool lastRound ,
|
|
int32_t numBaseLists ,
|
|
bool useDateLists ,
|
|
bool sortByDate ,
|
|
float sortByDateWeight,
|
|
int32_t *minHardCountPtr );
|
|
|
|
void hashTopDocIds2 ( uint32_t *maxDocId ,
|
|
char **docIdPtrs ,
|
|
int32_t *scores ,
|
|
qvec_t *explicitBits ,
|
|
int16_t *hardCounts ,
|
|
uint32_t mask ,
|
|
int32_t numSlots ) ;
|
|
|
|
|
|
// . used for getting which topDocId to kick out of the top list
|
|
int32_t getWeakestTopDocId ( char **topp ,
|
|
int32_t *tops ,
|
|
unsigned char *topb ,
|
|
int32_t numTop ,
|
|
unsigned char *minBitScore2 ,
|
|
int32_t *score ,
|
|
char **docIdPtr ) ;
|
|
|
|
// . get the termBits for the termId represented by this list
|
|
// . only phrases may set multiple bits
|
|
qvec_t getTermImplicitBitMask_r ( int32_t i );
|
|
|
|
// . set the m_bitScores[] array
|
|
// . "count" is the # of query term (single or phrase) bit combinations
|
|
void setBitScores ( int32_t count );
|
|
|
|
// are lists swapped?
|
|
bool m_swapped [ MAX_TIERS ] [ MAX_QUERY_TERMS ] ;
|
|
|
|
// these describe the lists associated with each m_termId
|
|
int32_t m_scoreWeights [ MAX_QUERY_TERMS ];
|
|
|
|
// for each tier we have a list of the top docids
|
|
int64_t m_topDocIds [ MAX_TIERS ] [ MAX_RESULTS ];
|
|
char *m_topDocIdPtrs [ MAX_TIERS ] [ MAX_RESULTS ];
|
|
int32_t m_topScores [ MAX_TIERS ] [ MAX_RESULTS ];
|
|
//int16_t m_topHardCounts [ MAX_TIERS ] [ MAX_RESULTS ];
|
|
unsigned char m_topBitScores [ MAX_TIERS ] [ MAX_RESULTS ];
|
|
char m_topExplicits [ MAX_TIERS ] [ MAX_RESULTS ];
|
|
int32_t m_numTopDocIds [ MAX_TIERS ] ;
|
|
int32_t m_numExactExplicitMatches [ MAX_TIERS ];
|
|
int32_t m_numExactImplicitMatches [ MAX_TIERS ];
|
|
int32_t m_numTiers;
|
|
|
|
// when filterTopDocIds() is called it uniquifies and combines
|
|
// m_topDocIds[*][] into m_finalDocIds
|
|
/* int64_t m_finalTopDocIds [ MAX_RESULTS ];
|
|
int32_t m_finalTopScores [ MAX_RESULTS ];
|
|
//uint32_t m_finalTopBitScores [ MAX_RESULTS ];
|
|
int32_t m_finalNumExactExplicitMatches ;
|
|
int32_t m_finalNumExactImplicitMatches ;
|
|
int32_t m_finalNumTopDocIds ;*/
|
|
|
|
// a reference to the query
|
|
Query *m_q;
|
|
|
|
// has init() been called?
|
|
bool m_initialized;
|
|
|
|
// are we in debug mode?
|
|
bool m_isDebug;
|
|
|
|
// for debug msgs
|
|
int32_t m_logstate;
|
|
|
|
// . did we already call m_q->setBitScores() for this query?
|
|
// . don't call it more than once
|
|
bool m_alreadySet;
|
|
|
|
bool m_doRecalc;
|
|
|
|
bool m_requireAllTerms;
|
|
char **m_topDocIdPtrs2;
|
|
int32_t *m_topScores2;
|
|
qvec_t *m_topExplicits2;
|
|
int16_t *m_topHardCounts2;
|
|
int32_t m_maxTopDocIds2;
|
|
int32_t m_numTopDocIds2;
|
|
int32_t m_nexti;
|
|
int32_t m_oldnexti;
|
|
bool m_doRecall;
|
|
|
|
// allocated memory
|
|
char *m_buf;
|
|
int32_t m_bufSize;
|
|
char *m_bufMiddle;
|
|
|
|
// for large hashtable for sortByDate
|
|
char *m_bigBuf;
|
|
int32_t m_bigBufSize;
|
|
|
|
// the imap stuff
|
|
int32_t m_imap [ MAX_QUERY_TERMS ];
|
|
int32_t m_sizes [ MAX_QUERY_TERMS ];
|
|
int32_t m_blocksize [ MAX_QUERY_TERMS ];
|
|
int32_t m_nb;
|
|
|
|
class TopTree *m_topTree;
|
|
|
|
// these are for removing component lists replaced by their compounds
|
|
int32_t *m_componentCodes;
|
|
//char *m_ignore;
|
|
//bool m_scoresSet;
|
|
int32_t m_numDocsInTier [ MAX_TIERS ] ;
|
|
};
|
|
|
|
// . get the LOWEST scoring docId from our list of top docIds
|
|
// . set "minBitScore22" and "score" for that lowest docId
|
|
// . inline this for speed
|
|
// . BUT lower docIds are considered higher scoring than higher docIds
|
|
inline int32_t IndexTable::getWeakestTopDocId ( char **topp ,
|
|
int32_t *tops ,
|
|
unsigned char *topb ,
|
|
int32_t numTop ,
|
|
unsigned char *minBitScore2 ,
|
|
int32_t *score ,
|
|
char **docIdPtr ) {
|
|
int64_t tmp = 0LL;
|
|
int32_t minScore = 0x7fffffff;
|
|
unsigned char minBitScore = 0xff;
|
|
char *minDocIdPtr = (char *)&tmp;
|
|
int32_t mini = 0;
|
|
for ( int32_t i = 0 ; i < numTop ; i++ ) {
|
|
if ( topb [i] > minBitScore ) continue;
|
|
if ( topb [i] < minBitScore ) goto gotIt;
|
|
if ( tops [i] > minScore ) continue;
|
|
if ( tops [i] < minScore ) goto gotIt;
|
|
if ( *(uint32_t *)(topp[i]+1 ) <
|
|
*(uint32_t *)(minDocIdPtr+1) ) continue;
|
|
if ( *(uint32_t *)(topp[i]+1 ) >
|
|
*(uint32_t *)(minDocIdPtr+1) ) goto gotIt;
|
|
if ( (*(unsigned char *)(topp[i] ) & 0xfc) <
|
|
(*(unsigned char *)(minDocIdPtr) & 0xfc) ) continue;
|
|
// ties should not be happening for docid, unless
|
|
// it tied with initial setting of minDocIdPtr, in that
|
|
// case we should add it!
|
|
gotIt:
|
|
minScore = tops [i];
|
|
minBitScore = topb [i];
|
|
minDocIdPtr = topp [i];
|
|
mini = i;
|
|
}
|
|
// set the callers ptrs
|
|
*minBitScore2 = minBitScore;
|
|
*score = minScore;
|
|
*docIdPtr = minDocIdPtr;
|
|
// return the lowest scoring docId's position
|
|
return mini;
|
|
}
|
|
|
|
#endif
|