#include "IndexTable.h"

IndexTable::IndexTable() {
        // top docid info
        m_buf       = NULL;
        m_topDocIds = NULL;
        m_topScores = NULL;
        // hash table arrays must start out NULL, otherwise reset() would
        // try to free uninitialized pointers
        m_docIdBits = NULL;
        m_scores    = NULL;
        m_termBits  = NULL;
        m_numSlots  = 0;
        reset();
}

IndexTable::~IndexTable() { reset(); }

void IndexTable::reset() {
        if ( m_buf ) mfree ( m_buf , m_bufSize , "IndexTable" );
        m_buf = NULL;
        // free the table stuff
        if ( m_docIdBits ) mfree ( m_docIdBits , m_numSlots * 8 ,"IndexTable");
        if ( m_scores    ) mfree ( m_scores    , m_numSlots * 4 ,"IndexTable");
        if ( m_termBits  ) mfree ( m_termBits  , m_numSlots * 2 ,"IndexTable");
        m_docIdBits       = NULL;
        m_scores          = NULL;
        m_termBits        = NULL;
        m_numTopDocIds    = 0;
        m_maxNumTopDocIds = 0;
        m_numSlots        = 0;
        m_mask            = 0;
}

// . max score weight
// . we base score weight on the termFreq (# of docs that have that term)
// . it is now also based on the # of plus signs prepended before a term
#define MAX_SCORE_WEIGHT 1000

// . HACK: date hack
// . this is a special score weight i added for sorting by date
// . it scales the score of date indexlists
// . the scores of termIds 0xdadadada and 0xdadadad2 represent the date of
//   the document
// . 0xdadadada's score is the most significant 8 bits of the 16-bit date and
//   0xdadadad2's score is the least significant 8 bits
// . the date is stored in days since the epoch (16 bits needed = 2 scores)
// . unfortunately, it may not always work too well
// . the max score any doc can have (w/o date) is about 16,000
// . so when we weight the date by DATE_WEIGHT and add to the doc's score
//   just a 1 day difference will supersede any difference in term scores
#define DATE_WEIGHT (2 * MAX_QUERY_TERMS * MAX_SCORE_WEIGHT)
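
// Illustrative sketch (not part of the original source): how the two 8-bit
// date scores described above recombine into the 16-bit days-since-epoch
// value. The helper name and its standalone form are assumptions; in the
// engine itself the same effect comes from the score weights of 256 and 1
// that IndexTable::set() assigns below and addList3() applies.
static inline uint16_t recombineDateScores ( unsigned char hiByteScore ,
                                             unsigned char loByteScore ) {
        // hi byte is weighted by 256, lo byte by 1, matching the weights
        // given to the 0xdadadada and 0xdadadad2 termIds
        return ((uint16_t)hiByteScore << 8) | (uint16_t)loByteScore;
}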

// . returns false on error and sets errno
// . NOTE: termIds,termFreqs,termSigns are just referenced by us, not copied
// . sets m_startKeys, m_endKeys and m_minNumRecs for each termId
// . TODO: ensure that m_termFreqs[] are all UPPER BOUNDS on the actual #!!
//   we should be able to get an upper bound estimate from the b-tree
//   quickly!
// . we now support multiple plus signs before the query term
void IndexTable::set ( int32_t docsWanted ,
                       int64_t *termIds ,
                       int64_t *termFreqs ,
                       char *termSigns ,
                       char *numPlusses ,
                       char *startTermNums , // phrase ranges
                       char *endTermNums ,   // phrase ranges
                       int32_t numTerms ) {
        // make sure our max score isn't too big
        //int32_t a = MAX_QUERY_TERMS * MAX_QUERY_TERMS * 300 * 255 + 255;
        //int64_t aa=MAX_QUERY_TERMS * MAX_QUERY_TERMS * 300LL * 255LL + 255;
        //if ( a != aa ) {
        //        log("IndexTable::set: MAX_QUERY_TERMS too big"); exit(-1); }
        // clear everything
        reset();
        // . HACK: we must always get the top 500 results for now
        // . so next results page doesn't have same results as a previous page
        // . TODO: refine this
        // . this is repeated in IndexReadInfo.cpp
        // . HACK: this prevents dups when clicking "Next X Results" link
        //if ( docsWanted < 500 ) docsWanted = 500;
        // remember these
        m_docsWanted    = docsWanted;
        m_termIds       = termIds;
        m_termSigns     = termSigns;
        m_startTermNums = startTermNums;
        m_endTermNums   = endTermNums;
        m_numTerms      = numTerms;
        // change termSign of '+' to '*' if it was truncated
        int32_t truncLimit = g_conf.m_indexdbTruncationLimit;
        // . termFreqs are already minimal, that is, approximate lower bounds,
        //   since they subtract 2 PAGE_SIZEs worth of keys to make up for err
        // . see Rdb::getListSize() to see how we estimate termFreqs
        // . and then see Indexdb::getTermFreq() to see linear interpolation
        //   for truncated lists (should really be bell-curve interpolation)
        // . turn the '+' into a '*' to mark truncated, hard-required lists
        for ( int32_t i = 0 ; i < numTerms ; i++ ) {
                if ( termSigns[i] != '+'        ) continue;
                if ( termFreqs[i] <  truncLimit ) continue;
                termSigns[i] = '*' ;
        }
        // . set score weight for each term based on termFreqs
        // . termFreqs is just the max size of an IndexList
        setScoreWeights ( termFreqs , numPlusses , numTerms );
        // . HACK: date hack
        // . set termSigns of the special date termIds to 'd' so addList()
        //   knows to throw these scores into the highest tier (just for them)
        // . also re-set the weights so the most significant 8 bits of our
        //   16-bit date are weighted by 256, and the low 8 bits by 1
        for ( int32_t i = 0 ; i < numTerms ; i++ ) {
                if ( termIds[i] == (0xdadadada & TERMID_MASK) ) {
                        termSigns[i]      = 'd';
                        m_scoreWeights[i] = 256;
                }
                else if ( termIds[i] == (0xdadadad2 & TERMID_MASK) ) {
                        termSigns[i]      = 'd';
                        m_scoreWeights[i] = 1;
                }
        }
}

// . the score weight is in [1,MAX_SCORE_WEIGHT=1000]
// . so max score would be MAX_QUERY_TERMS*1000*255 = 16*1000*255 = 4,080,000
// . now we use MAX_QUERY_TERMS * MAX_SCORE to be the actual max score so
//   that we can use 4,080,000 for truncated hard-required terms
// . TODO: make adjustable from an xml interpolated point map
void IndexTable::setScoreWeights ( int64_t *termFreqs ,
                                   char *numPlusses ,
                                   int32_t numTerms ) {
        // get an estimate on # of docs in the database
        //int64_t numDocs = g_titledb.getGlobalNumDocs();
        // this should only be zero if we have 0 docs, so make it 1 if so
        //if ( numDocs <= 0 ) numDocs = 1;
        // . compute the minimum termFreq
        // . round small termFreqs up to half a PAGE_SIZE worth of keys
        int64_t absMin  = PAGE_SIZE/(2*sizeof(key_t));
        int64_t minFreq = termFreqs[0];
        if ( minFreq < absMin ) minFreq = absMin;
        for ( int32_t i = 1 ; i < numTerms ; i++ ) {
                if ( termFreqs[i] > minFreq ) continue;
                if ( termFreqs[i] < absMin  ) continue;
                minFreq = termFreqs[i];
        }
        // loop through each term computing the score weight for it
        for ( int32_t i = 0 ; i < numTerms ; i++ ) {
                // start each term off with a third of the max score weight
                int64_t max = MAX_SCORE_WEIGHT / 3;
                // . 3 extra plusses can triple the score weight
                // . each extra plus adds "extra" (2/9 of the max) to the
                //   score weight
                int32_t extra = (2 * MAX_SCORE_WEIGHT) / 9;
                // add "extra" for each plus over 1
                if ( numPlusses[i] > 0 ) max += (numPlusses[i] - 1) * extra;
                // but don't exceed the absolute max
                if ( max > MAX_SCORE_WEIGHT ) max = MAX_SCORE_WEIGHT;
                // increase small term freqs to the minimum
                int64_t freq = termFreqs[i];
                if ( freq < absMin ) freq = absMin;
                // . now if one term is 100 times more popular than another
                //   it should count 1/100th as much
                // . since high scores are logs of the originals (to keep them
                //   in 8 bits) we should perhaps say:
                //   if one term is 100 times more popular than another
                //   it should count 1/log(100)th as much (TODO!)
                // . the rarer the term, the lower the termFreq, the higher
                //   the score weight
                // . we're assured the end result must fit in an int32_t
                int32_t weight = ( max * minFreq ) / freq;
                // don't let this go to zero
                if ( weight <= 0 ) weight = 1;
                m_scoreWeights[i] = weight;
        }
}
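
// Worked example (illustrative only, never called by the engine): with
// MAX_SCORE_WEIGHT = 1000 and a single plus sign, termFreqs of 1,000 and
// 100,000 give weights of 333*1000/1000 = 333 and 333*1000/100000 = 3, so a
// term 100x more popular counts roughly 1/100th as much, as the comments
// above describe.
static void printScoreWeightExample ( ) {
        int64_t termFreqs [2] = { 1000LL , 100000LL };
        int64_t minFreq       = termFreqs[0];         // the rarest term
        int64_t max           = MAX_SCORE_WEIGHT / 3; // no extra plus signs
        for ( int32_t i = 0 ; i < 2 ; i++ ) {
                int32_t weight = ( max * minFreq ) / termFreqs[i];
                if ( weight <= 0 ) weight = 1;
                log("example: termFreq=%"INT64" -> scoreWeight=%"INT32"",
                    termFreqs[i] , weight );
        }
}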

// . this maps a term's bit mask to a score
// . this used to just count the # of bits on, but one singleton termId is
//   more important than one phrase termId
// . 65536 entries so every possible 16-bit mask (including s_maxBitMask
//   itself) has a slot
static uint16_t s_bitScores [ 65536 ] ;
static uint16_t s_maxBitMask;
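
// Illustrative sketch (assumed helper, not in the original source): how
// addLists() below fills s_bitScores. A mask with two singleton-term bits
// and one phrase-term bit set scores 2*100 + 1*1 = 201, so a docId matching
// two distinct query words always outranks one matching any number of
// phrase termIds alone.
static uint16_t computeBitScore ( uint16_t mask ,
                                  uint16_t singleMask ,
                                  uint16_t phraseMask ) {
        // singleton bits count 100 apiece, phrase bits count 1 apiece
        return 100 * getNumBitsOn ( mask & singleMask ) +
                 1 * getNumBitsOn ( mask & phraseMask );
}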

//static int32_t s_collisions ;

// . returns false on error and sets errno
// . we hash the lists into the hash table
// . these "lists" MUST correspond 1-1 w/ m_termIds[],m_termFreqs[],m_termSigns
// . if "forcePhrases" is true we hard require all terms
// . lists are typically appendings of a bunch of lists so each list probably
//   does not have its keys in order and may have unannihilated +/- recs
bool IndexTable::addLists (IndexList *lists, int32_t numLists, bool forcePhrases){
        // ensure they aren't messing with us after calling set() above
        if ( m_numTerms != numLists ) {
                errno = EBADENGINEER;
                return log("IndexTable::addLists: numLists=%"INT32", should be %"INT32"",
                           numLists, m_numTerms );
        }
        // which bits are for singletons?
        uint16_t singleMask = 0;
        uint16_t phraseMask = 0;
        int32_t  count      = 1;
        for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
                if ( m_startTermNums[i] == -1 ) singleMask |= (1 << i);
                else                            phraseMask |= (1 << i);
                count *= 2;
        }
        // count cannot exceed 65536
        if ( count > 65536 ) {
                log("IndexTable::addLists: bad engineer. grow the s_table.");
                count = 65536;
        }
        // remember for later
        s_maxBitMask = count - 1;
        // . init the table for converting a bit mask to a bit score
        // . we don't just rank by the # of bits on anymore since singleton
        //   bits are more important than phrase bits
        // . TODO: base s_bitScores[i] on termFreqs secondly!
        for ( int32_t i = 0 ; i < count ; i++ )
                s_bitScores [ i ] =
                        100 * getNumBitsOn ( i & singleMask ) +
                          1 * getNumBitsOn ( i & phraseMask );
        // how many records/keys in all lists?
        int32_t numKeys = 0;
        for ( int32_t i = 0; i < numLists ; i++ )
                numKeys += lists[i].getNumKeys();
        // . grow the hash table by that amount
        // . grow table to powers of 2 only!
        // . increases m_numSlots by this much
        // . seems like we get a lot of collisions, so grow the table by 2x
        //   the key count instead of by 20%!
        if ( ! growTable ( (numKeys * 200) / 100 ) ) return false;
        // another timestamp
        log("IndexTable::addLists: adding %"INT32" lists now t=%"INT64", slots=%"INT32"",
            numLists , gettimeofdayInMilliseconds() , m_numSlots );
        // keep track of collisions in hash table
        //s_collisions = 0;
        // now hash the docIds/score pairs in each key of each list
        for ( int32_t i = 0; i < numLists ; i++ ) {
                // . convert the list's termId to a termBitMask
                // . uses m_startTermNums and m_endTermNums for setting
                //   multiple bits for phrases to dilute the effects of the
                //   truncation limit
                log("IndexTable::addLists: getting bit mask now t=%"INT64"",
                    gettimeofdayInMilliseconds() );
                uint16_t termBitMask = getTermBitMask ( i );
                log("IndexTable::addLists: got bit mask now t=%"INT64", sign=%c",
                    gettimeofdayInMilliseconds() , m_termSigns[i] );
                log("adding list #%"INT32", size=%"INT32"",i,lists[i].getListSize());
                addList ( &lists[i] , m_scoreWeights[i] , m_termSigns[i] ,
                          termBitMask ) ;
        }
        //log("IndexTable::addLists: collisions = %"INT32", #keys=%"INT32", #slots=%"INT32"",
        //    s_collisions, numKeys , m_numSlots );
        // another timestamp
        log("IndexTable::addLists: setting the top docIds now t=%"INT64"",
            gettimeofdayInMilliseconds() );
        // now set the top docIds
        if ( ! setTopDocIds ( m_docsWanted , forcePhrases ) ) return false;
        // success
        return true;
}

// . TODO: this must be very fast!!!!
// . this is still taking 378ms to add about 500,000 keys!!! (from 7 lists)
// . we should be able to hand code assembly to do about 30 cycles per key,
//   which would mean 500,000 keys would take 15 million cycles,
//   which is 15/1400 of a second = ~10 ms. that's 30 times faster!!
void IndexTable::addList ( IndexList *list, int32_t scoreWeight, char termSign ,
                           uint16_t termBitMask ) {
        // . route if necessary
        // . these are specialized for speed
        // . excluded query term
        if ( termSign == '-' ) { addList2(list,termBitMask); return; }
        // . date search term
        if ( termSign == 'd' ) { addList3(list,scoreWeight,termBitMask); return; }
        // records are data-less, just keys
        int64_t docIdBits;
        int16_t score;
        int32_t n;
        register unsigned char *k   = (unsigned char *) list->getList();
        unsigned char          *end = (unsigned char *) list->getListEnd();
        // compute huge score
        for ( ; k < end ; k += sizeof(key_t) ) {
                // . get the score
                // . NOTE: scores are complemented for ordering purposes in db
                // . so the min score is actually the max
                // . so we uncomplement it here
                // . this is 6 cycles of assembly
                // . all of this is about 100 lines of assembly
                score = 255 - *(k+5);
                // . make the score negative if this key is a delete
                //   (low bit clear = delete)
                // . this is dependent on endianness of machine
                // . we're little endian, so the lower int64_t comes first
                //   and it stores the least significant int32_t first
                if ( ( *k & 0x01 ) == 0 ) score = -score;
                // just mask out everything else, but the docId from ith key
                docIdBits = (*((uint64_t *)k)) & 0x0000007ffffffffeLL;
                // . hash docId
                // . it's faster to use a power-of-2 table size and a mask
                n = ( (*(uint32_t *)k) >> 1 ) & m_mask;
                // debug
                //log("docIdBits=%"UINT64" slot=%"INT32"",docIdBits,n);
                // . chain until we find empty bucket
                // . empty bucket has a score of 0
                while ( m_scores[n] != 0 && m_docIdBits[n] != docIdBits ) {
                        //s_collisions++;
                        //log("COLLISION with docIdBits=%"UINT64"",m_docIdBits[n]);
                        if ( ++n >= m_numSlots ) n = 0;
                }
                // if it's new come here since m_termBits was not calloc'd,
                // it was malloc'd so it may be garbage
                if ( m_scores[n] == 0 ) {
                        m_docIdBits [ n ] = docIdBits;
                        m_termBits  [ n ] = termBitMask;
                }
                else
                        m_termBits  [ n ] |= termBitMask;
                // . add the score
                // . weight the score by the idf (inverse termFreq)
                m_scores [ n ] += (int32_t)score * scoreWeight;
        }
}
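
// Illustrative sketch (assumed layout, simply mirroring the bit operations
// above; the authoritative key format lives in Indexdb.h, which is not shown
// here): given the low 8 bytes of an indexdb key, pull out the complemented
// score byte, the positive/delete bit and the docId the same way addList()
// and setTopDocIds() do.
static void decomposeKeyLow64 ( uint64_t low , int16_t *score ,
                                bool *isDelete , int64_t *docId ) {
        // byte 5 holds the complemented 8-bit score
        *score    = 255 - (int16_t)((low >> 40) & 0xff);
        // low bit clear means this key is a delete
        *isDelete = ( (low & 0x01) == 0 );
        // bits 1..38 hold the docId (same masks used in setTopDocIds())
        *docId    = (int64_t)((low >> 1) & 0x0000003fffffffffLL);
}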

// this is almost an exact dup of addList() above, but it's for terms with
// a minus sign in front of them (exclude from search results)
void IndexTable::addList2 ( IndexList *list, uint16_t termBitMask ) {
        // records are data-less, just keys
        int64_t docIdBits;
        int16_t score;
        int32_t n;
        key_t *k   = (key_t *) list->getList();
        key_t *end = (key_t *) list->getListEnd();
        // compute huge score
        for ( ; k < end ; k++ ) {
                // . get the score
                // . NOTE: scores are complemented for ordering purposes in db
                // . so the min score is actually the max
                // . so we uncomplement it here
                score = 255 - *(((unsigned char *)k) + 5);
                // make the score negative if this key is a delete
                // (low bit clear = delete)
                if ( ( *((uint32_t *)k) & 0x01 ) == 0 ) score = -score;
                // just mask out everything else, but the docId from ith key
                docIdBits = k->n0 & 0x0000007ffffffffeLL;
                // . hash docId
                // . it's faster to use a power-of-2 table size and a mask
                n = ((uint32_t)docIdBits) & m_mask;
                // . chain until we find empty bucket
                // . empty bucket has a score of 0
                while ( m_scores[n] != 0 && m_docIdBits[n] != docIdBits )
                        if ( ++n >= m_numSlots ) n = 0;
                // . adjust the score based on whether the delBit is set
                // . a delete key cancels a prior exclusion, a positive key
                //   pushes the score deep negative so the docId is filtered
                if ( score < 0 ) m_scores [ n ] += 500000000;
                else             m_scores [ n ] -= 500000000;
                m_docIdBits [ n ] = docIdBits;
                m_termBits  [ n ] = 0;
        }
}

// . just like the addList()s above but for date search terms
void IndexTable::addList3 ( IndexList *list, int32_t scoreWeight,
                            uint16_t termBitMask ) {
        // records are data-less, just keys
        int64_t docIdBits;
        int16_t score;
        int32_t n;
        key_t *k   = (key_t *) list->getList();
        key_t *end = (key_t *) list->getListEnd();
        // compute huge score
        for ( ; k < end ; k++ ) {
                // . get the score
                // . NOTE: scores are complemented for ordering purposes in db
                // . so the min score is actually the max
                // . so we uncomplement it here
                score = 255 - *(((unsigned char *)k) + 5);
                // make the score negative if this key is a delete
                // (low bit clear = delete)
                if ( ( *((uint32_t *)k) & 0x01 ) == 0 ) score = -score;
                // just mask out everything else, but the docId from ith key
                docIdBits = k->n0 & 0x0000007ffffffffeLL;
                // . hash docId
                // . it's faster to use a power-of-2 table size and a mask
                n = ((uint32_t)docIdBits) & m_mask;
                // . chain until we find empty bucket
                // . empty bucket has a score of 0
                while ( m_scores[n] != 0 && m_docIdBits[n] != docIdBits )
                        if ( ++n >= m_numSlots ) n = 0;
                // if it's new come here since m_termBits was not calloc'd,
                // it was malloc'd so it may be garbage
                if ( m_scores[n] == 0 ) {
                        m_docIdBits [ n ] = docIdBits;
                        m_termBits  [ n ] = termBitMask;
                }
                else
                        m_termBits  [ n ] |= termBitMask;
                // . HACK: date hack
                // . now this isn't perfect, but score is usually around 45
                //   when scoreWeight is 256
                // . it should be good enough most of the time
                m_scores [ n ] += score * scoreWeight * DATE_WEIGHT;
        }
}

// . returns false and sets errno on error
// . get the top X scoring docIds (docIdBits actually)
// . order by bit score first, then by score, then by docId
// . sets m_numTopDocIds to the actual number we got
bool IndexTable::setTopDocIds ( int32_t topn , bool forcePhrases ) {
        // reset any existing buffer
        if ( m_buf ) { mfree ( m_buf , m_bufSize,"IndexTable"); m_buf = NULL; }
        // timing debug
        int64_t startTime = gettimeofdayInMilliseconds();
        // make new buffer to hold top docids/scores/termBits
        m_bufSize = (8 + 4 + 2) * topn;
        m_buf     = (char *) mmalloc ( m_bufSize , "IndexTable" );
        if ( ! m_buf ) return log("IndexTable::setTopDocIds: mmalloc failed");
        // set our arrays as offsets into the allocated buffer
        m_topDocIds       = (int64_t  *)  m_buf;
        m_topScores       = (int32_t  *) (m_buf +  8    * topn);
        m_topBitScores    = (uint16_t *) (m_buf + (8+4) * topn);
        m_maxNumTopDocIds = topn ;
        m_numTopDocIds    = 0 ;
        // . set a minimal mask from hard required ('+') terms
        // . NOTE: truncated termlists had their sign changed to a '*'
        uint16_t minMask = 0;
        // don't let i be >= 32 cuz << shift won't work right
        for ( int32_t i = 0 ; i < m_numTerms && i < 32 ; i++ )
                if ( m_termSigns[i]=='+' || forcePhrases ) minMask |= (1 << i);
        // add the first m_maxNumTopDocIds terms to list, no questions asked
        int32_t nt = 0;
        int32_t i;
        for ( i = 0 ; nt < m_maxNumTopDocIds && i < m_numSlots ; i++ ) {
                // skip empty buckets and negative scores
                if ( m_scores[i] <= 0 ) continue;
                // don't allow docId into initial top list if minMask not met
                if ( (m_termBits[i] & minMask) != minMask ) continue;
                m_topScores    [nt] = m_scores [i];
                m_topBitScores [nt] = s_bitScores [ m_termBits[i] ];
                m_topDocIds    [nt] = (m_docIdBits[i]>>1) & 0x0000003fffffffffLL;
                nt++;
        }
        m_numTopDocIds = nt;
        // these are the min info from the topDocIds
        char    minBitScore ;
        int32_t minScore ;
        int64_t minDocId ;
        // get new lowest parms
        getLowestTopDocId ( &minBitScore , &minScore , &minDocId );
        // timing debug
        // this chunk of code takes the longest (1-4 seconds), the bubble sort
        // is only 15ms
        log("setTopDocIds: phase 1 took %"INT64" ms",
            gettimeofdayInMilliseconds() - startTime );
        startTime = gettimeofdayInMilliseconds();
        // your bit score must be >= minBitScore and score > minScore to get
        // into our list of the top docIds
        for ( ; i < m_numSlots ; i++ ) {
                // skip empty buckets (or negative scores from '-' operator)
                if ( m_scores[i] <= 0 ) continue;
                // these must also meet the minMask
                if ( (m_termBits[i] & minMask) != minMask ) continue;
                // docId must have at least this many matching terms
                if ( s_bitScores [ m_termBits[i] ] < minBitScore ) continue;
                if ( s_bitScores [ m_termBits[i] ] > minBitScore ) goto addIt;
                // docId must have a better score if it tied matching terms
                if ( m_scores[i] < minScore ) continue;
                if ( m_scores[i] > minScore ) goto addIt;
                // if everything tied, the smaller docId wins, so skip this
                // docId if it is bigger than the current lowest
                if ( ((m_docIdBits[i]>>1) & 0x0000003fffffffffLL) > minDocId )
                        continue;
        addIt:
                // get the docId
                int64_t docId = (m_docIdBits[i]>>1) & 0x0000003fffffffffLL;
                // kick out the lowest
                int32_t j = getLowestTopDocId ( );
                // debug
                //log("replacing #%"INT32" docId=%"INT64", bitsOn=%"INT32", score=%"INT32" "
                //    "with #%"INT32" docId=%"INT64", bitsOn=%"INT32", score=%"INT32"",
                //    j,m_topDocIds[j],m_topBitScores[j],m_topScores[j],
                //    i,docId,s_bitScores [ m_termBits[i] ],m_scores[i]);
                // now we got in the top
                m_topScores    [j] = m_scores [i];
                m_topBitScores [j] = s_bitScores [ m_termBits[i] ];
                m_topDocIds    [j] = docId;
                // get new lowest parms
                getLowestTopDocId ( &minBitScore , &minScore , &minDocId );
        }
        // timing debug
        log("setTopDocIds: phase 2 took %"INT64" ms",
            gettimeofdayInMilliseconds() - startTime );
        // sort the m_topDocIds/m_topScores/m_topBitScores arrays now
        sortTopDocIds();
        // timing debug
        log("setTopDocIds: @ phase 3 time is %"INT64" ms",
            gettimeofdayInMilliseconds() - startTime );
        return true;
}

void IndexTable::sortTopDocIds ( ) {
        // . now sort by m_topBitScores/m_topScores/m_topDocIds
        // . use a local static ptr since gbsort doesn't have THIS ptr
        // . do a quick bubble sort
        bool didSwap = false;
 keepSorting:
        for ( int32_t i = 1 ; i < m_numTopDocIds ; i++ ) {
                // continue if no switch needed
                if ( m_topBitScores [i-1] >  m_topBitScores [i] ) continue;
                if ( m_topBitScores [i-1] <  m_topBitScores [i] ) goto doSwap;
                if ( m_topScores    [i-1] >  m_topScores    [i] ) continue;
                if ( m_topScores    [i-1] <  m_topScores    [i] ) goto doSwap;
                if ( m_topDocIds    [i-1] <= m_topDocIds    [i] ) continue;
        doSwap:
                int32_t  tmpScore    = m_topScores    [i-1];
                uint16_t tmpBitScore = m_topBitScores [i-1];
                int64_t  tmpDocId    = m_topDocIds    [i-1];
                m_topScores    [i-1] = m_topScores    [i  ];
                m_topBitScores [i-1] = m_topBitScores [i  ];
                m_topDocIds    [i-1] = m_topDocIds    [i  ];
                m_topScores    [i  ] = tmpScore;
                m_topBitScores [i  ] = tmpBitScore;
                m_topDocIds    [i  ] = tmpDocId;
                didSwap = true;
        }
        // if it's sorted then return
        if ( ! didSwap ) return;
        // otherwise keep sorting
        didSwap = false;
        goto keepSorting;
}
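
// Illustrative sketch (assumed helper, not in the original source): the
// ordering used both when kicking docIds out of the top list in
// setTopDocIds() and in the bubble sort above, expressed as one comparison.
// Returns true if entry a should rank ahead of entry b.
static bool rankedHigher ( uint16_t bitScoreA , int32_t scoreA , int64_t docIdA ,
                           uint16_t bitScoreB , int32_t scoreB , int64_t docIdB ) {
        // more matching singleton/phrase bits wins first
        if ( bitScoreA != bitScoreB ) return bitScoreA > bitScoreB;
        // then the higher accumulated score wins
        if ( scoreA    != scoreB    ) return scoreA    > scoreB;
        // ties are broken by the smaller docId
        return docIdA < docIdB;
}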

// . grow the hash table by "growBy" slots
// . returns false on alloc error, true on success
bool IndexTable::growTable ( int32_t growBy ) {
        // how big should the new arrays be?
        int32_t n = m_numSlots + growBy ;
        // round up to the nearest power of 2 if we need to
        uint32_t highest;
        if ( getNumBitsOn ( n ) == 1 ) highest = n;
        // get the value of the highest lit bit (0,1,2,4,8, ...)
        else highest = getHighestLitBitValue ( n ) << 1 ;
        // bail if no bits are lit
        //if ( highest == 0 ) {
        //        errno = EBADENGINEER;
        //        return log("IndexTable::growTable: bad engineer");
        //}
        // overgrow in order for size to be a power of 2
        n = highest;
        // . alloc new arrays
        // . make sure to set all scores to 0
        int64_t  *newDocIdBits = (int64_t *) mmalloc ( n * 8,"IndexTable");
        int32_t  *newScores    = (int32_t *) mcalloc ( n * 4,"IndexTable");
        uint16_t *newBits      = (uint16_t *)mmalloc ( n * 2,"IndexTable");
        // return false on alloc error
        if ( ! newDocIdBits || ! newScores || ! newBits ) {
                if ( newDocIdBits ) mfree ( newDocIdBits , n * 8,"IndexTable");
                if ( newScores    ) mfree ( newScores    , n * 4,"IndexTable");
                if ( newBits      ) mfree ( newBits      , n * 2,"IndexTable");
                return log("IndexTable::growTable: mmalloc failed");
        }
        // we use m_mask instead of the % operator for hashing
        m_mask = highest - 1;
        // rehash into the new arrays
        for ( int32_t i = 0 ; i < m_numSlots ; i++ ) {
                // skip the empty buckets
                if ( ! m_scores[i] ) continue;
                // get the new slot number for this slot (might be the same!)
                //int32_t newn = (uint64_t)m_docIdBits[i] % n;
                int32_t newn = (uint64_t)m_docIdBits[i] & m_mask;
                while ( newScores [ newn ] != 0 ) if ( ++newn >= n ) newn = 0;
                // move the docIdBits/score/bitCount to this new slot
                newDocIdBits [newn] = m_docIdBits [i];
                newScores    [newn] = m_scores    [i];
                newBits      [newn] = m_termBits  [i];
        }
        // free the old guys, if any
        if ( m_docIdBits ) mfree ( m_docIdBits , m_numSlots * 8, "IndexTable");
        if ( m_scores    ) mfree ( m_scores    , m_numSlots * 4, "IndexTable");
        if ( m_termBits  ) mfree ( m_termBits  , m_numSlots * 2, "IndexTable");
        // assign the new guys
        m_docIdBits = newDocIdBits;
        m_scores    = newScores;
        m_termBits  = newBits;
        // update m_numSlots to n
        m_numSlots  = n;
        return true;
}
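
// Worked example (illustrative only): addLists() asks growTable() for
// numKeys * 2 new slots, so ~500,000 keys request 1,000,000 slots, which the
// getHighestLitBitValue() step above rounds up to the next power of 2,
// 2^20 = 1,048,576. The power-of-2 size is what lets addList() hash with
// "& m_mask" (here 0xfffff) instead of a much slower modulo.
static uint32_t roundUpToPowerOfTwo ( uint32_t n ) {
        // assumed stand-in for the getNumBitsOn()/getHighestLitBitValue()
        // combination used in growTable()
        uint32_t p = 1;
        while ( p < n ) p <<= 1;
        return p;
}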

// . get number of docids in m_topDocIds
// . only return # of docIds that have all the termIds if "thatIncludeAllTerms"
int32_t IndexTable::getNumResults ( bool thatIncludeAllTerms ) {
        // return 0 if we have no topDocIds
        if ( ! m_topDocIds || m_numTopDocIds == 0 ) return 0;
        // if they don't need to include all terms just return this
        if ( ! thatIncludeAllTerms ) return m_numTopDocIds;
        // . if they want all terms, make a min bit mask
        // . turn all bits on and see what the bit score is
        uint16_t minBitScore = s_bitScores [ s_maxBitMask ];
        // otherwise, see if the top docIds have ALL the terms
        int32_t count = 0;
        for ( int32_t i = 0 ; i < m_numTopDocIds ; i++ ) {
                // skip empty buckets (or negative scores)
                if ( m_topScores   [i] <= 0           ) continue;
                if ( m_topBitScores[i] >= minBitScore ) count++;
        }
        // return that
        return count;
}

int64_t *IndexTable::getTopDocIds ( bool isAdmin ) {
        if ( isAdmin ) {
                log("----- # top docIds = %"INT32"",m_numTopDocIds);
                for ( int32_t i = 0 ; i < m_numTopDocIds; i++ )
                        //log("%"INT32") docId=%"INT64", # terms matched=%"INT32", sum=%"INT32"",
                        log("%"INT32") docId=%"INT64", bit score=%hu, sum=%"INT32"",
                            i,m_topDocIds[i],m_topBitScores[i],
                            m_topScores[i]);
        }
        return m_topDocIds;
}

// . get the termBits for the termId represented by this list
// . only phrases may set multiple bits
uint16_t IndexTable::getTermBitMask ( int32_t i ) {
        // set the ith bit
        uint16_t mask = (1 << i);
        // get range of termIds this termId encloses, iff it's a phrase termId
        char a = m_startTermNums[i] ;
        char b = m_endTermNums  [i] ;
        // if not a phrase return now
        if ( a == -1 ) return mask;
        // otherwise set one bit for each query termId used by the phrase
        for ( char j = a ; j <= b ; j++ ) mask |= (1 << j);
        // return that
        return mask;
}
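
// Worked example (illustrative only; the exact term numbering comes from the
// Query class and is assumed here): suppose the phrase termId for
// "san francisco" is term #3 with m_startTermNums[3] = 0 and
// m_endTermNums[3] = 1. getTermBitMask(3) then returns bit 3 plus bits 0
// and 1:
//   mask = (1<<3) | (1<<0) | (1<<1) = 0x000b
// so a docId hit by the phrase also gets credit for the two singleton terms
// it encloses, which softens the effect of the truncation limit on those
// singleton termlists.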