open-source-search-engine/junkdrawer/fastIndexTable.cpp

629 lines
25 KiB
C++

#include "IndexTable.h"
IndexTable::IndexTable() {
// top docid info
m_buf = NULL;
m_topDocIds = NULL;
m_topScores = NULL;
m_numSlots = 0;
reset();
}
IndexTable::~IndexTable() { reset(); }
void IndexTable::reset() {
if ( m_buf ) mfree ( m_buf , m_bufSize ,"IndexTable");
// free the table stuff
if (m_docIdBits )mfree ( m_docIdBits , m_numSlots * 8 ,"IndexTable");
if (m_scores )mfree ( m_scores , m_numSlots * 4 ,"IndexTable");
if ( m_termBits )mfree ( m_termBits , m_numSlots * 2 ,"IndexTable");
m_docIdBits = NULL;
m_scores = NULL;
m_termBits = NULL;
m_numTopDocIds = 0;
m_maxNumTopDocIds = 0;
m_numSlots = 0;
m_mask = 0;
}
// . max score weight
// . we base score weight on the termFreq(# of docs that have that term)
// . it is now also based on the # of plus signs prepended before a term
#define MAX_SCORE_WEIGHT 1000
// . HACK: date hack
// . this is a special score weight i added for sorting by date
// . it scales the score of date indexlists
// . score of termIds 0xdadadada and 0xdadadad2 represent date of the document
// . 0xdadadada's score is the most signifincat 8-bits of the 16-bit date and
// . 0xdadadad2's score is the least signifincat
// . the date is stored in days since the epoch (16 bits needed = 2 scores)
// . unfortunately, it may not always work too well
// . the max score any doc can have (w/o date) is about 16,000
// . so when we weight the date by DATE_WEIGHT and add to the doc's score
// just a 1 second difference will supersede any difference in term scores
#define DATE_WEIGHT (2 * MAX_QUERY_TERMS * MAX_SCORE_WEIGHT)
// . returns false on error and sets errno
// . NOTE: termIds,termFreqs,termSigns are just referenced by us, not copied
// . sets m_startKeys, m_endKeys and m_minNumRecs for each termId
// . TODO: ensure that m_termFreqs[] are all UPPER BOUNDS on the actual #!!
// we should be able to get an upper bound estimate from the b-tree
// quickly!
// . we now support multiple plus signs before the query term
void IndexTable::set ( int32_t docsWanted ,
int64_t *termIds ,
int64_t *termFreqs ,
char *termSigns ,
char *numPlusses ,
char *startTermNums , // phrase ranges
char *endTermNums , // phrase ranges
int32_t numTerms ) {
// make sure our max score isn't too big
//int32_t a = MAX_QUERY_TERMS * MAX_QUERY_TERMS * 300 * 255 + 255;
//int64_t aa=MAX_QUERY_TERMS * MAX_QUERY_TERMS * 300LL * 255LL + 255;
//if ( a != aa ) {
// log("IndexTable::set: MAX_QUERY_TERMS too big"); exit(-1); }
// clear everything
reset();
// . HACK: we must always get the top 500 results for now
// . so next results page doesn't have same results as a previous page
// . TODO: refine this
// . this is repeated in IndexReadInfo.cpp
// . HACK: this prevents dups when clicking "Next X Results" link
//if ( docsWanted < 500 ) docsWanted = 500;
// remember these
m_docsWanted = docsWanted;
m_termIds = termIds;
m_termSigns = termSigns;
m_startTermNums = startTermNums;
m_endTermNums = endTermNums;
m_numTerms = numTerms;
// change termSign of '+' to '*' if it was truncated
int32_t truncLimit = g_conf.m_indexdbTruncationLimit;
// . termFreqs are already minimal, that is, approximate lower bound,
// since they subtract 2 PAGE_SIZEs worth of keys to make up for err
// . see Rdb::getListSize() to see how we estimate termFreqs
// . and then see Indexdb::getTermFreq() to see linear interpolation
// for truncated lists (should really be bell-curve interpolation)
// . turn the '+' into a '*' to mark truncated, hard-required lists
for ( int32_t i = 0 ; i < numTerms ; i++ ) {
if ( termSigns[i] != '+' ) continue;
if ( termFreqs[i] < truncLimit ) continue;
termSigns[i] = '*' ;
}
// . set score weight for each term based on termFreqs
// . termFreqs is just the max size of an IndexList
setScoreWeights ( termFreqs , numPlusses , numTerms );
// . HACK: date hack
// . set termSigns of the special date termIds to 'd' so addList()
// knows to throw these scores into the highest tier (just for them)
// . also re-set the weights so the most significant 8-bits of our
// 16-bit date is weighted by 256, and the low by 1
for ( int32_t i = 0 ; i < numTerms ; i++ ) {
if ( termIds[i] == (0xdadadada & TERMID_MASK) ) {
termSigns[i] = 'd';
m_scoreWeights[i] = 256;
}
else if ( termIds[i] == (0xdadadad2 & TERMID_MASK) ) {
termSigns[i] = 'd';
m_scoreWeights[i] = 1;
}
}
}
// . the score weight is in [1,MAX_SCORE_WEIGHT=1000]
// . so max score would be MAX_QUERY_TERMS*1000*255 = 16*1000*255 = 4,080,000
// . now we use MAX_QUERY_TERMS * MAX_SCORE to be the actual max score so
// that we can use 4,080,000 for truncated hard-required terms
// . TODO: make adjustable from an xml interpolated point map
void IndexTable::setScoreWeights ( int64_t *termFreqs ,
char *numPlusses ,
int32_t numTerms ) {
// get an estimate on # of docs in the database
//int64_t numDocs = g_titledb.getGlobalNumDocs();
// this should only be zero if we have 0 docs, so make it 1 if so
//if ( numDocs <= 0 ) numDocs = 1;
// . compute the total termfreqs
// . round small termFreqs up to half a PAGE_SIZE
int64_t absMin = PAGE_SIZE/(2*sizeof(key_t));
int64_t minFreq = termFreqs[0];
if ( minFreq < absMin ) minFreq = absMin;
for ( int32_t i = 1 ; i < numTerms ; i++ ) {
if ( termFreqs[i] > minFreq ) continue;
if ( termFreqs[i] < absMin ) continue;
minFreq = termFreqs[i];
}
// loop through each term computing the score weight for it
for ( int32_t i = 0 ; i < numTerms ; i++ ) {
// reserve half the weight for up to 4 plus signs
int64_t max = MAX_SCORE_WEIGHT / 3;
// . 3 extra plusses can triple the score weight
// . each extra plus adds "extra" to the score weight
int32_t extra = (2 * MAX_SCORE_WEIGHT) / 9;
// add 1/6 for each plus over 1
if ( numPlusses[i] > 0 ) max += (numPlusses[i] - 1) * extra;
// but don't exceed the absolute max
if ( max > MAX_SCORE_WEIGHT ) max = MAX_SCORE_WEIGHT;
// increase small term freqs to the minimum
int64_t freq = termFreqs[i];
if ( freq < absMin ) freq = absMin;
// . now if one term is 100 times more popular than another
// it should count 1/100th as much
// . since high scores are logs of the originals (to keep them
// in 8 bits) we should perhaps say:
// if one term is 100 times more popular than another
// it should count 1/log(100)th as much (TODO!)
// . weight score by 300
// . the rarer the term, the lower the termFreq, the higher
// the score
// . we're assured the end result must fit in a int32_t
int32_t weight = ( max * minFreq ) / freq;
// don't let this go to zero
if ( weight <= 0 ) weight = 1; // if ( score < 0 ) score = 0
m_scoreWeights[i] = weight; // (300 - score) + 100 ;
}
}
// . this maps a term's bit mask to a score
// . this used to just count the # of bits on, but one singleton termId is more
// important than one phrase termId
static uint16_t s_bitScores [ 65535 ] ;
static uint16_t s_maxBitMask;
//static int32_t s_collisions ;
// . returns false on error and sets errno
// . we hash the lists into the hash table
// . these "lists" MUST correspond 1-1 w/ m_termIds[],m_termFreqs[],m_termSigns
// . if "forcePhrases" is true we hard require all terms
// . lists are typically appendings of a bunch of lists so each list probably
// does not have it's keys in order and may have unannihilated +/- recs
bool IndexTable::addLists (IndexList *lists, int32_t numLists, bool forcePhrases){
// ensure they aren't messing with us after calling set() above
if ( m_numTerms != numLists ) {
errno = EBADENGINEER;
return log("IndexTable::addLists: numLists=%"INT32", should be %"INT32"",
numLists, m_numTerms );
}
// which bits are for singletons?
uint16_t singleMask = 0;
uint16_t phraseMask = 0;
int32_t count = 1;
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
if ( m_startTermNums[i] == -1 ) singleMask |= (1 << i);
else phraseMask |= (1 << i);
count *= 2;
}
// count cannot exceed 65536
if ( count > 65536 ) {
log("IndexTable::addLists: bad engineer. grow the s_table.");
count = 65536;
}
// remember for later
s_maxBitMask = count - 1;
// . init the table for converting a bit mask to a bit score
// . we don't just rank by the # of bits on anymore since singleton
// bits are more important than phrase bits
// . TODO: base s_bitScores[i] on termFreqs secondly!
for ( int32_t i = 0 ; i < count ; i++ )
s_bitScores [ i ] =
100 * getNumBitsOn ( i & singleMask ) +
1 * getNumBitsOn ( i & phraseMask );
// how many records/keys in all lists?
int32_t numKeys = 0;
for ( int32_t i = 0; i < numLists ; i++ )
numKeys += lists[i].getNumKeys();
// . grow hash table by that plus 20%
// . grow table to powers of 2 only!
// . increases m_numSlots by this much
// . seems like we gotta lot of collisions so grow table by twice
// instead of 20%!
if ( ! growTable ( (numKeys * 200) / 100 ) ) return false;
// another timestamp
log("IndexTable::addLists: adding %"INT32" lists now t=%"INT64", slots=%"INT32"",
numLists , gettimeofdayInMilliseconds() , m_numSlots );
// keep track of collisions in hash table
//s_collisions = 0;
// now hash the docIds/score pairs in each key of each list
for ( int32_t i = 0; i < numLists ; i++ ) {
// . convert the list's termId to a termBitMask
// . uses m_startTermNums and m_endTermNums for setting
// multiple bits for phrases to dilute the effects of
// truncation limit
log("IndexTable::addLists: getting bit mask now t=%"INT64"",
gettimeofdayInMilliseconds() );
uint16_t termBitMask = getTermBitMask ( i );
log("IndexTable::addLists: got bit mask now t=%"INT64", sign=%c",
gettimeofdayInMilliseconds() , m_termSigns[i] );
log("adding list #%"INT32", size=%"INT32"",i,lists[i].getListSize());
addList ( &lists[i] , m_scoreWeights[i] , m_termSigns[i] ,
termBitMask ) ;
}
//log("IndexTable::addLists: collisions = %"INT32", #keys=%"INT32", #slots=%"INT32"",
// s_collisions, numKeys , m_numSlots );
// another timestamp
log("IndexTable::addLists: setting the top docIds now t=%"INT64"",
gettimeofdayInMilliseconds() );
// now set the top docIds
if ( ! setTopDocIds ( m_docsWanted , forcePhrases ) ) return false;
// success
return true;
}
// . TODO: this must be very fast!!!!
// . this is still taking 378ms to add about 500,000 keys!!! (from 7 lists)
// . we should be able to hand code assembly to do about 30 cycler per key
// which would mean 500,000 keys would take 15 million cycles =
// which is 15/1400 of a second = ~10 ms. that's 30 times faster!!
void IndexTable::addList ( IndexList *list, int32_t scoreWeight, char termSign ,
uint16_t termBitMask ) {
// . route if necessary
// . these are specialized for speed
// . exlcuded query term
if ( termSign == '-' ) {addList2(list,termBitMask);return;}
// . date search term
if ( termSign == 'd' ) {addList3(list,scoreWeight,termBitMask);return;}
// records are data-less, just keys
int64_t docIdBits;
int16_t score;
int32_t n;
register unsigned char *k = (unsigned char *) list->getList();
unsigned char *end = (unsigned char *) list->getListEnd();
// compute huge score
for ( ; k < end ; k += sizeof(key_t) ) {
// . get the score
// . NOTE: scores are complemented for ordering purposes in db
// . so the min score is actually the max
// . so we uncomplement it here
// . this is 6 cycles of assembly
// . all of this is about 100 lines of assembly
score = 255 - *(k+5);
// . if delbit is set make the score negative
// . i.e. make negative if this key is a delete
// . this is dependent on endianness of machine
// . we're small endian, and the lower int64_t comes first
// and IT stores the least significant int32_t first
if ( *k & (unsigned char)0x01 == 0 ) score = -score;
// just mask out everything else, but the docId from ith key
docIdBits =(*((uint64_t *)k)) & 0x0000007ffffffffeLL;
// . hash docId
// . it's faster to use a power-of-2 table size and a mask
n = ( (*(uint32_t *)k) >>1 ) & m_mask;
// debug
//log("docIdBits=%"UINT64" slot=%"INT32"",docIdBits,n);
// . chain until we find empty bucket
// . empty bucket has a score of 0
while ( m_scores[n] != 0 && m_docIdBits[n] != docIdBits ) {
//s_collisions++;
//log("COLLISION with docIdBits=%"UINT64"",m_docIdBits[n]);
if ( ++n >= m_numSlots ) n = 0;
}
// if it's new come here since m_termBits was not calloc'd
// it was malloc'd so it may be garbage
if ( m_scores[n] == 0 ) {
m_docIdBits [ n ] = docIdBits;
m_termBits [ n ] = termBitMask;
}
else
m_termBits [ n ] |= termBitMask;
// . add the score
// . weight the score by the idf (inverse termFreq)
m_scores [ n ] += (int32_t)score * scoreWeight;
}
}
// this is almost an exact dup of addList() above, but it's for terms with
// a minus term in front of them (exclude from search results)
void IndexTable::addList2 ( IndexList *list, uint16_t termBitMask ) {
// records are data-less, just keys
int64_t docIdBits;
int16_t score;
int32_t n;
key_t *k = (key_t *) list->getList();
key_t *end = (key_t *) list->getListEnd();
// compute huge score
for ( ; k < end ; k++ ) {
// . get the score
// . NOTE: scores are complemented for ordering purposes in db
// . so the min score is actually the max
// . so we uncomplement it here
score = 255 - *(((unsigned char *)k) + 5);
// . if delbit is set make the score negative
// . i.e. make negative if this key is a delete
if ( *((uint32_t *)k) & 0x01 == 0 ) score = -score;
// just mask out everything else, but the docId from ith key
docIdBits = k->n0 & 0x0000007ffffffffeLL;
// . hash docId
// . it's faster to use a power-of-2 table size and a mask
n = ((uint32_t)docIdBits) & m_mask;
// . chain until we find empty bucket
// . empty bucket has a score of 0
while ( m_scores[n] != 0 && m_docIdBits[n] != docIdBits )
if ( ++n >= m_numSlots ) n = 0;
// adjust score based on if delBit is set or not
if ( score < 0 ) m_scores [ n ] += 500000000;
else m_scores [ n ] -= 5000000000;
m_docIdBits [ n ] = docIdBits;
m_termBits [ n ] = 0;
}
}
// . just like addList()s above but for date search terms
void IndexTable::addList3 ( IndexList *list, int32_t scoreWeight,
uint16_t termBitMask ) {
// records are data-less, just keys
int64_t docIdBits;
int16_t score;
int32_t n;
key_t *k = (key_t *) list->getList();
key_t *end = (key_t *) list->getListEnd();
// compute huge score
for ( ; k < end ; k++ ) {
// . get the score
// . NOTE: scores are complemented for ordering purposes in db
// . so the min score is actually the max
// . so we uncomplement it here
score = 255 - *(((unsigned char *)k) + 5);
// . if delbit is set make the score negative
// . i.e. make negative if this key is a delete
if ( *((uint32_t *)k) & 0x01 == 0 ) score = -score;
// just mask out everything else, but the docId from ith key
docIdBits = k->n0 & 0x0000007ffffffffeLL;
// . hash docId
// . it's faster to use a power-of-2 table size and a mask
n = ((uint32_t)docIdBits) & m_mask;
// . chain until we find empty bucket
// . empty bucket has a score of 0
while ( m_scores[n] != 0 && m_docIdBits[n] != docIdBits )
if ( ++n >= m_numSlots ) n = 0;
// if it's new come here since m_termBits was not calloc'd
// it was malloc'd so it may be garbage
if ( m_scores[n] == 0 ) {
m_docIdBits [ n ] = docIdBits;
m_termBits [ n ] = termBitMask;
}
else
m_termBits [ n ] |= termBitMask;
// . HACK: date hack
// . now this isn't perfect, but score is usually around 45
// when scoreWeight is 256
// . it should be good enough most of the time
m_scores [ n ] += score * scoreWeight * DATE_WEIGHT;
}
}
// . returns false and sets errno on error
// . get the top X scoring docIds (docIdBits actually)
// . order by m_numTerms first then by score (complemented)
// . sets "numTopDocIds" to actual number we got
bool IndexTable::setTopDocIds ( int32_t topn , bool forcePhrases ) {
// reset any existing buffer
if ( m_buf ) { mfree ( m_buf , m_bufSize,"IndexTable"); m_buf = NULL;}
// timing debug
int64_t startTime = gettimeofdayInMilliseconds();
// make new buffer to hold top docids/scores/termBits
m_bufSize = (8 + 4 + 2) * topn;
m_buf = (char *) mmalloc ( m_bufSize ,"IndexTable" );
if ( ! m_buf ) return log("IndexTable::setTopDocIds: mmalloc failed");
// set our arrays as offsets into the allocated buffer
m_topDocIds = (int64_t *) m_buf;
m_topScores = (int32_t *) (m_buf + 8 * topn);
m_topBitScores = (uint16_t *) (m_buf + (8+4) * topn);
m_maxNumTopDocIds = topn ;
m_numTopDocIds = 0 ;
// . set a minimal mask from hard required ('+') terms
// . NOTE: truncated termlists had their sign changed to a '*'
uint16_t minMask = 0;
// don't let i be >= 32 cuz << shift won't work right
for ( int32_t i = 0 ; i < m_numTerms && i < 32 ; i++ )
if ( m_termSigns[i]=='+' || forcePhrases ) minMask |= (1 << i);
// add the first m_maxNumTopDocIds terms to list, no questions asked
int32_t nt = 0;
int32_t i;
for ( i = 0 ; nt < m_maxNumTopDocIds && i < m_numSlots ; i++ ) {
// skip empty buckets and negative scores
if ( m_scores[i] <= 0 ) continue;
// don't allow docId into initial top list if minMask not met
if ( (m_termBits[i] & minMask) != minMask ) continue;
m_topScores [nt] = m_scores [i];
m_topBitScores [nt] = s_bitScores [ m_termBits[i] ];
m_topDocIds [nt] =(m_docIdBits[i]>>1) &0x0000003fffffffffLL;
nt++;
}
m_numTopDocIds = nt;
// these are the min info from the topDocIds
char minBitScore ;
int32_t minScore ;
int64_t minDocId ;
// get new lowest parms
getLowestTopDocId ( &minBitScore , &minScore , &minDocId );
// timing debug
// this chunk of code takes the longest (1-4 seconds), the bubble sort
// is only 15ms
log("setTopDocIds: phase 1 took %"INT64" ms",
gettimeofdayInMilliseconds() - startTime );
startTime = gettimeofdayInMilliseconds();
// your bitsOn must be >= minBitsOn and score > minScore to get into
// our list of the top docIds
for ( ; i < m_numSlots ; i++ ) {
// skip empty buckets (or negative scores from '-' operator)
if ( m_scores[i] <= 0 ) continue;
// these must also meet the minMask
if ( (m_termBits[i] & minMask) != minMask ) continue;
// docId must have at least this many matching terms
if ( s_bitScores [ m_termBits[i] ] < minBitScore ) continue;
if ( s_bitScores [ m_termBits[i] ] > minBitScore ) goto addIt;
// docId must have a better score if it tied matching terms
if ( m_scores[i] < minScore ) continue;
if ( m_scores[i] > minScore ) goto addIt;
// continue if docId is too small
if ( ((m_docIdBits[i]>>1) &0x0000003fffffffffLL) > minDocId )
continue;
addIt:
// get the docId
int64_t docId = (m_docIdBits[i]>>1) &0x0000003fffffffffLL;
// kick out the lowest
int32_t j = getLowestTopDocId ( );
// debug
//log("replacing #%"INT32" docId=%"INT64", bitsOn=%"INT32", score=%"INT32" "
// "with #%"INT32" docId=%"INT64", bitsOn=%"INT32", score=%"INT32"",
// j,m_topDocIds[j],m_topBitScores[j],m_topScores[j],
// i,docId,s_bitScores [ m_termBits[i] ],m_scores[i]);
// now we got in the top
m_topScores [j] = m_scores [i];
m_topBitScores [j] = s_bitScores [ m_termBits[i] ];
m_topDocIds [j] = docId;
// get new lowest parms
getLowestTopDocId ( &minBitScore , &minScore, &minDocId );
}
// timing debug
log("setTopDocIds: phase 2 took %"INT64" ms",
gettimeofdayInMilliseconds() - startTime );
// sort the m_topDocIds/m_topScores/m_topBitScores arrays now
sortTopDocIds();
// timing debug
log("setTopDocIds: @ phase 3 time is %"INT64" ms",
gettimeofdayInMilliseconds() - startTime );
return true;
}
void IndexTable::sortTopDocIds ( ) {
// . now sort by m_topBitScores/m_topScores/m_topDocIds
// . use a local static ptr since gbsort doesn't have THIS ptr
// . do a quick bubble sort
bool didSwap = false;
keepSorting:
for ( int32_t i = 1 ; i < m_numTopDocIds ; i++ ) {
// continue if no switch needed
if ( m_topBitScores [i-1] > m_topBitScores [i] ) continue;
if ( m_topBitScores [i-1] < m_topBitScores [i] ) goto doSwap;
if ( m_topScores [i-1] > m_topScores [i] ) continue;
if ( m_topScores [i-1] < m_topScores [i] ) goto doSwap;
if ( m_topDocIds [i-1] <= m_topDocIds [i] ) continue;
doSwap:
int32_t tmpScore = m_topScores [i-1];
uint16_t tmpBitScore = m_topBitScores [i-1];
int64_t tmpDocId = m_topDocIds [i-1];
m_topScores [i-1] = m_topScores [i ];
m_topBitScores [i-1] = m_topBitScores [i ];
m_topDocIds [i-1] = m_topDocIds [i ];
m_topScores [i ] = tmpScore;
m_topBitScores [i ] = tmpBitScore;
m_topDocIds [i ] = tmpDocId;
didSwap = true;
}
// if it's sorted then return
if ( ! didSwap ) return;
// otherwise keep sorting
didSwap = false;
goto keepSorting;
}
// . grow the hash table by "n" slots
// . returns false on alloc error, true on success
bool IndexTable::growTable ( int32_t growBy ) {
// how big should the new arrays be?
int32_t n = m_numSlots + growBy ;
// round up to nearest power of 2 if we need to
uint32_t highest;
if ( getNumBitsOn ( n ) == 1 ) highest = n;
// get the value of the highest lit bit (0,1,2,4,8, ...)
else highest = getHighestLitBitValue ( n ) << 1 ;
// bail if none list
//if ( highest == 0 ) {
// errno = EBADENGINEER;
// return log("IndexTable::growTable: bad engineer");
//}
// overgrow in order for size to be a power of 2
n = highest;
// . alloc new arrays
// . make sure to set all scores to 0
int64_t *newDocIdBits = (int64_t *) mmalloc ( n * 8,"IndexTable");
int32_t *newScores = (int32_t *) mcalloc ( n * 4,"IndexTable");
uint16_t *newBits = (uint16_t *)mmalloc(n*2,"IndexTable");
// return false on alloc error
if ( ! newDocIdBits || ! newScores || ! newBits ) {
if ( newDocIdBits ) mfree ( newDocIdBits ,n * 8,"IndexTable");
if ( newScores ) mfree ( newScores ,n * 4,"IndexTable");
if ( newBits ) mfree ( newBits ,n * 2,"IndexTable");
return log("IndexTable::growTable: mmalloc failed");
}
// we use m_mask instead of the % operator for hashing
m_mask = highest - 1;
// rehash into the new arrays
for ( int32_t i = 0 ; i < m_numSlots ; i++ ) {
// skip the empty buckets
if ( ! m_scores[i] ) continue;
// get the new slot number for this slot (might be the same!)
//int32_t newn = (uint64_t)m_docIdBits[i] % n;
int32_t newn = (uint64_t)m_docIdBits[i] & m_mask;
while ( newScores [ newn ] != 0 ) if ( ++newn >= n ) newn = 0;
// move the docIdBits/score/bitCount to this new slot
newDocIdBits [newn] = m_docIdBits [i];
newScores [newn] = m_scores [i];
newBits [newn] = m_termBits [i];
}
// free the old guys
mfree ( m_docIdBits , m_numSlots * 8, "IndexTable");
mfree ( m_scores , m_numSlots * 4, "IndexTable");
mfree ( m_termBits , m_numSlots * 2, "IndexTable");
// assign the new guys
m_docIdBits = newDocIdBits;
m_scores = newScores;
m_termBits = newBits;
// update m_numSlots to n
m_numSlots = n;
return true;
}
// . get number of docids in m_topDocIds
// . only return # of docIds that have all the termIds if "thatIncludeAllTerms"
int32_t IndexTable::getNumResults ( bool thatIncludeAllTerms ) {
// return 0 if we have no topDocIds
if ( ! m_topDocIds || m_numTopDocIds == 0 ) return 0;
// if they don't need to include all terms just return this
if ( ! thatIncludeAllTerms ) return m_numTopDocIds;
// . if they want all terms, make a min bit mask
// . turn all bits on and see what the bit score is
uint16_t minBitScore = s_bitScores [ s_maxBitMask ];
// otherwise, see if the top docIds have ALL the terms
int32_t count = 0;
for ( int32_t i = 0 ; i < m_numTopDocIds ; i++ ) {
// skip empty buckets (or negative scores)
if ( m_topScores [i] <= 0 ) continue;
if ( m_topBitScores[i] >= minBitScore ) count++;
}
// return that
return count;
}
int64_t *IndexTable::getTopDocIds ( bool isAdmin ) {
if ( isAdmin ) {
log("----- # top docIds = %"INT32"",m_numTopDocIds);
for ( int32_t i = 0 ; i < m_numTopDocIds; i++ )
//log("%"INT32") docId=%"INT64", # terms Matched=%"INT32", sum=%"INT32"",
log("%"INT32") docId=%"INT64", bit score=%hu, sum=%"INT32"",
i,m_topDocIds[i],m_topBitScores[i],
m_topScores[i]);
}
return m_topDocIds;
}
// . get the termBits for the termId represented by this list
// . only phrases may set multiple bits
uint16_t IndexTable::getTermBitMask ( int32_t i ) {
// set the ith bit
uint16_t mask = (1 << i);
// get range of termIds this termId encloses, iff it's a phrase termId
char a = m_startTermNums[i] ;
char b = m_endTermNums [i] ;
// if not a phrase return now
if ( a == -1 ) return mask;
// otherwise set one bit for each query termId used by phrase
for ( char j = a ; j <= b ; j++ ) mask |= (1 << j);
// return that
return mask;
}