511 lines
		
	
	
		
			17 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			511 lines
		
	
	
		
			17 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| #include "gb-include.h"
 | |
| 
 | |
| #include "Indexdb.h"
 | |
| #include "Url.h"
 | |
| #include "Clusterdb.h"
 | |
| //#include "Checksumdb.h"
 | |
| #include "Threads.h"
 | |
| 
 | |
| // a global class extern'd in .h file
 | |
| Indexdb g_indexdb;
 | |
| 
 | |
| // for rebuilding indexdb
 | |
| Indexdb g_indexdb2;
 | |
| 
 | |
| // resets rdb
 | |
| void Indexdb::reset() { 
 | |
| 	m_rdb.reset();
 | |
| //#ifdef SPLIT_INDEXDB
 | |
| 	//if ( m_groupIdTable ) {
 | |
| 	//if ( g_hostdb.m_indexSplits > 1 && m_groupIdTable ) {
 | |
| 	//	mfree(m_groupIdTable, m_groupIdTableSize, "Indexdb");
 | |
| 	//	m_groupIdTable = NULL;
 | |
| 	//	m_groupIdTableSize = 0;
 | |
| 	//}
 | |
| //#endif
 | |
| }
 | |
| 
 | |
| //#include "DiskPageCache.h"
 | |
| 
 | |
| /*
 | |
| bool Indexdb::setGroupIdTable ( ) {
 | |
| 	// skip if not split
 | |
| 	if ( g_hostdb.m_indexSplits <= 1 ) return true;
 | |
| 	// . create the groupId table
 | |
| 	m_numGroups = g_hostdb.getNumGroups();
 | |
| 	//m_groupIdTableSize = m_numGroups*INDEXDB_SPLIT*sizeof(int32_t);
 | |
| 	m_groupIdTableSize = m_numGroups*g_hostdb.m_indexSplits*sizeof(int32_t);
 | |
| 	m_groupIdTable =(uint32_t*)mmalloc(m_groupIdTableSize, "Indexdb");
 | |
| 	if ( ! m_groupIdTable ) {
 | |
| 		g_errno = ENOMEM;
 | |
| 		log ( "Could not allocate %" INT32 " bytes for groupIdTable",
 | |
| 		      m_groupIdTableSize );
 | |
| 		return false;
 | |
| 	}
 | |
| 	// . the groupId table with the lookup values
 | |
| 	m_groupIdShift = 32;
 | |
| 	int32_t x = m_numGroups;
 | |
| 	while ( x != 1 ) {
 | |
| 		x >>= 1;
 | |
| 		m_groupIdShift--;
 | |
| 	}
 | |
| 	for ( int32_t i = 0; i < m_numGroups; i++ ) {
 | |
| 		uint32_t groupId = g_hostdb.getGroupId(i);
 | |
| 		groupId >>= m_groupIdShift;
 | |
| 		if ( !g_conf.m_legacyIndexdbSplit ) {
 | |
| 			//for ( int32_t s = 0; s < INDEXDB_SPLIT; s++ ) {
 | |
| 			for ( int32_t s = 0; s < g_hostdb.m_indexSplits; s++ ) {
 | |
| 				int32_t g = i + s;
 | |
| 				while ( g >= m_numGroups ) g -= m_numGroups;
 | |
| 				//int32_t x = groupId + ((g % INDEXDB_SPLIT) *
 | |
| 				int32_t x = groupId + ((g%g_hostdb.m_indexSplits)*
 | |
| 						m_numGroups);
 | |
| 				m_groupIdTable[x] = g_hostdb.getGroupId(g);
 | |
| 			}
 | |
| 		}
 | |
| 		else {
 | |
| 			//for ( int32_t s = 0; s < INDEXDB_SPLIT; s++ ) {
 | |
| 			for ( int32_t s = 0; s < g_hostdb.m_indexSplits; s++ ) {
 | |
| 				int32_t g = i + s;
 | |
| 				while ( g >= m_numGroups ) g -= m_numGroups;
 | |
| 				m_groupIdTable[groupId+(m_numGroups*s)] =
 | |
| 					g_hostdb.getGroupId(g);
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	return true;
 | |
| }
 | |
| */
 | |
| 
 | |
| bool Indexdb::init ( ) {
 | |
| 	// fake it for now
 | |
| 	return true;
 | |
| 	//if ( ! setGroupIdTable () ) return false;
 | |
| 	// . what's max # of tree nodes?
 | |
| 	// . each rec in tree is only 1 key (12 bytes)
 | |
| 	// . but has 12 bytes of tree overhead (m_left/m_right/m_parents)
 | |
| 	// . this is UNUSED for bin trees!!
 | |
| 	int32_t nodeSize      = (sizeof(key_t)+12+4) + sizeof(collnum_t);
 | |
| 	int32_t maxTreeNodes = g_conf.m_indexdbMaxTreeMem  / nodeSize ;
 | |
| 	// . assume the average cached list is about 600 bytes
 | |
| 	// . TODO: if we cache a lot of not founds (small lists), we won't have
 | |
| 	//   enough nodes!!
 | |
| 	int32_t maxCacheNodes = g_conf.m_indexdbMaxCacheMem / 600;
 | |
| 
 | |
| 	//int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
 | |
| 	// we now use a disk page cache as opposed to the
 | |
| 	// old rec cache. i am trying to do away with the Rdb::m_cache rec
 | |
| 	// cache in favor of cleverly used disk page caches, because
 | |
| 	// the rec caches are not real-time and get stale. 
 | |
| 	//int32_t pcmem    = g_conf.m_indexdbMaxDiskPageCacheMem;
 | |
| 
 | |
| 	//pcmem = 0;
 | |
| 	// make sure at least 30MB
 | |
| 	//if ( pcmem < 30000000 ) pcmem = 30000000;
 | |
| 	// keep this low if we are the tmp cluster, 30MB
 | |
| 	//if ( g_hostdb.m_useTmpCluster && pcmem > 30000000 ) pcmem = 30000000;
 | |
| 	// do not use any page cache if doing tmp cluster in order to
 | |
| 	// prevent swapping
 | |
| 	//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
 | |
| 	// . init the page cache
 | |
| 	// . MDW: "minimize disk seeks" not working otherwise i'd enable it!
 | |
| 	// if ( ! m_pc.init ( "indexdb",
 | |
| 	// 		   RDB_INDEXDB,
 | |
| 	// 		   pcmem    ,
 | |
| 	// 		   pageSize ))
 | |
| 	// 	return log("db: Indexdb init failed.");
 | |
| 
 | |
| 	// . set our own internal rdb
 | |
| 	// . max disk space for bin tree is same as maxTreeMem so that we
 | |
| 	//   must be able to fit all bins in memory
 | |
| 	// . we do not want indexdb's bin tree to ever hit disk since we
 | |
| 	//   dump it to rdb files when it is 90% full (90% of bins in use)
 | |
| 	if ( !m_rdb.init ( g_hostdb.m_dir              ,
 | |
| 			   "indexdb"                   ,
 | |
| 			   true                        , // dedup same keys?
 | |
| 			   0                           , // fixed data size
 | |
| 			   g_conf.m_indexdbMinFilesToMerge , 
 | |
| 			   g_conf.m_indexdbMaxTreeMem  ,
 | |
| 			   maxTreeNodes                ,
 | |
| 			   // now we balance so Sync.cpp can ordered huge lists
 | |
| 			   true                        , // balance tree?
 | |
| 			   g_conf.m_indexdbMaxCacheMem ,
 | |
| 			   maxCacheNodes 	       ,
 | |
| 			   true                        , // use half keys?
 | |
| 			   false                       , // g_conf.m_indexdbSav
 | |
| 			   NULL))//&m_pc                       ) )
 | |
| 		return false;
 | |
| 	return true;
 | |
| 	// validate indexdb
 | |
| 	//return verify();
 | |
| }
 | |
| 
 | |
| // init the rebuild/secondary rdb, used by PageRepair.cpp
 | |
| bool Indexdb::init2 ( int32_t treeMem ) {
 | |
| 	//if ( ! setGroupIdTable () ) return false;
 | |
| 	// . what's max # of tree nodes?
 | |
| 	// . each rec in tree is only 1 key (12 bytes)
 | |
| 	// . but has 12 bytes of tree overhead (m_left/m_right/m_parents)
 | |
| 	// . this is UNUSED for bin trees!!
 | |
| 	int32_t nodeSize     = (sizeof(key_t)+12+4) + sizeof(collnum_t);
 | |
| 	int32_t maxTreeNodes = treeMem  / nodeSize ;
 | |
| 	// . set our own internal rdb
 | |
| 	// . max disk space for bin tree is same as maxTreeMem so that we
 | |
| 	//   must be able to fit all bins in memory
 | |
| 	// . we do not want indexdb's bin tree to ever hit disk since we
 | |
| 	//   dump it to rdb files when it is 90% full (90% of bins in use)
 | |
| 	if ( ! m_rdb.init ( g_hostdb.m_dir              ,
 | |
| 			    "indexdbRebuild"            ,
 | |
| 			    true                        , // dedup same keys?
 | |
| 			    0                           , // fixed data size
 | |
| 			    200                         , // min files to merge
 | |
| 			    treeMem                     ,
 | |
| 			    maxTreeNodes                ,
 | |
| 			    true                        , // balance tree?
 | |
| 			    0                           , // MaxCacheMem ,
 | |
| 			    0                           , // maxCacheNodes
 | |
| 			    true                        , // use half keys?
 | |
| 			    false                       , // indexdbSaveCache
 | |
| 			    NULL                      ) ) // s_pc
 | |
| 		return false;
 | |
| 	return true;
 | |
| }
 | |
| 
 | |
| /*
 | |
| bool Indexdb::addColl ( char *coll, bool doVerify ) {
 | |
| 	if ( ! m_rdb.addColl ( coll ) ) return false;
 | |
| 	if ( ! doVerify ) return true;
 | |
| 	// verify
 | |
| 	if ( verify(coll) ) return true;
 | |
| 	// do a deep verify to figure out which files are corrupt
 | |
| 	deepVerify ( coll );
 | |
| 	// if not allowing scale, return false
 | |
| 	if ( ! g_conf.m_allowScale ) return false;
 | |
| 	// otherwise let it go
 | |
| 	log ( "db: Verify failed, but scaling is allowed, passing." );
 | |
| 	return true;
 | |
| }
 | |
| */
 | |
| 
 | |
| bool Indexdb::verify ( char *coll ) {
 | |
| 	return true;
 | |
| 	log ( LOG_INFO, "db: Verifying Indexdb for coll %s...", coll );
 | |
| 	g_threads.disableThreads();
 | |
| 
 | |
| 	Msg5 msg5;
 | |
| 	Msg5 msg5b;
 | |
| 	RdbList list;
 | |
| 	key_t startKey;
 | |
| 	key_t endKey;
 | |
| 	startKey.setMin();
 | |
| 	endKey.setMax();
 | |
| 	//int32_t minRecSizes = 64000;
 | |
| 	CollectionRec *cr = g_collectiondb.getRec(coll);
 | |
| 	
 | |
| 	if ( ! msg5.getList ( RDB_INDEXDB   ,
 | |
| 			      cr->m_collnum ,
 | |
| 			      &list         ,
 | |
| 			      startKey      ,
 | |
| 			      endKey        ,
 | |
| 			      64000         , // minRecSizes   ,
 | |
| 			      true          , // includeTree   ,
 | |
| 			      false         , // add to cache?
 | |
| 			      0             , // max cache age
 | |
| 			      0             , // startFileNum  ,
 | |
| 			      -1            , // numFiles      ,
 | |
| 			      NULL          , // state
 | |
| 			      NULL          , // callback
 | |
| 			      0             , // niceness
 | |
| 			      false         , // err correction?
 | |
| 			      NULL          ,
 | |
| 			      0             ,
 | |
| 			      -1            ,
 | |
| 			      true          ,
 | |
| 			      -1LL          ,
 | |
| 			      &msg5b        ,
 | |
| 			      true          )) {
 | |
| 		g_threads.enableThreads();
 | |
| 		return log("db: HEY! it did not block");
 | |
| 	}
 | |
| 
 | |
| 	int32_t count = 0;
 | |
| 	int32_t got   = 0;
 | |
| 	bool printedKey = false;
 | |
| 	bool printedZeroKey = false;
 | |
| 	for ( list.resetListPtr() ; ! list.isExhausted() ;
 | |
| 	      list.skipCurrentRecord() ) {
 | |
| 		key_t k = list.getCurrentKey();
 | |
| 		count++;
 | |
| 		//uint32_t groupId = k.n1 & g_hostdb.m_groupMask;
 | |
| 		//uint32_t groupId = getGroupId ( RDB_INDEXDB , &k );
 | |
| 		//if ( groupId == g_hostdb.m_groupId ) got++;
 | |
| 		uint32_t shardNum = getShardNum( RDB_INDEXDB , &k );
 | |
| 		if ( shardNum == getMyShardNum() ) got++;
 | |
| 		else if ( !printedKey ) {
 | |
| 			log ( "db: Found bad key in list (only printing once): "
 | |
| 			      "%" XINT32 " %" XINT64 "", k.n1, k.n0 );
 | |
| 			printedKey = true;
 | |
| 		}
 | |
| 		if ( k.n1 == 0 && k.n0 == 0 ) {
 | |
| 			if ( !printedZeroKey ) {
 | |
| 				log ( "db: Found Zero key in list, passing. "
 | |
| 				      "(only printing once)." );
 | |
| 				printedZeroKey = true;
 | |
| 			}
 | |
| 			if ( shardNum != getMyShardNum() )
 | |
| 				got++;
 | |
| 		}
 | |
| 	}
 | |
| 	if ( got != count ) {
 | |
| 		log ("db: Out of first %" INT32 " records in indexdb, only %" INT32 " belong "
 | |
| 		     "to our group.",count,got);
 | |
| 		// exit if NONE, we probably got the wrong data
 | |
| 		if ( got == 0 ) log("db: Are you sure you have the "
 | |
| 				    "right "
 | |
| 				    "data in the right directory? "
 | |
| 				    "Exiting.");
 | |
| 		log ( "db: Exiting due to Indexdb inconsistency." );
 | |
| 		g_threads.enableThreads();
 | |
| 		return g_conf.m_bypassValidation;
 | |
| 	}
 | |
| 	log ( LOG_INFO, "db: Indexdb passed verification successfully for %" INT32 " "
 | |
| 			"recs.", count );
 | |
| 	// DONE
 | |
| 	g_threads.enableThreads();
 | |
| 	return true;
 | |
| }
 | |
| 
 | |
| void Indexdb::deepVerify ( char *coll ) {
 | |
| 	log ( LOG_INFO, "db: Deep Verifying Indexdb for coll %s...", coll );
 | |
| 	g_threads.disableThreads();
 | |
| 
 | |
| 	Msg5 msg5;
 | |
| 	Msg5 msg5b;
 | |
| 	RdbList list;
 | |
| 	key_t startKey;
 | |
| 	key_t endKey;
 | |
| 	startKey.setMin();
 | |
| 	endKey.setMax();
 | |
| 	//int32_t minRecSizes = 64000;
 | |
| 	
 | |
| 	collnum_t collnum = g_collectiondb.getCollnum(coll);
 | |
| 	RdbBase *rdbBase = g_indexdb.m_rdb.getBase(collnum);
 | |
| 	int32_t numFiles = rdbBase->getNumFiles();
 | |
| 	int32_t currentFile = 0;
 | |
| 	CollectionRec *cr = g_collectiondb.getRec(coll);
 | |
| 	
 | |
| deepLoop:
 | |
| 	// done after scanning all files
 | |
| 	if ( currentFile >= numFiles ) {
 | |
| 		g_threads.enableThreads();
 | |
| 		log ( LOG_INFO, "db: Finished deep verify for %" INT32 " files.",
 | |
| 				numFiles );
 | |
| 		return;
 | |
| 	}
 | |
| 	// scan this file
 | |
| 	if ( ! msg5.getList ( RDB_INDEXDB   ,
 | |
| 			      cr->m_collnum ,
 | |
| 			      &list         ,
 | |
| 			      startKey      ,
 | |
| 			      endKey        ,
 | |
| 			      64000         , // minRecSizes   ,
 | |
| 			      true          , // includeTree   ,
 | |
| 			      false         , // add to cache?
 | |
| 			      0             , // max cache age
 | |
| 			      currentFile   , // startFileNum  ,
 | |
| 			      1             , // numFiles      ,
 | |
| 			      NULL          , // state
 | |
| 			      NULL          , // callback
 | |
| 			      0             , // niceness
 | |
| 			      false         , // err correction?
 | |
| 			      NULL          ,
 | |
| 			      0             ,
 | |
| 			      -1            ,
 | |
| 			      true          ,
 | |
| 			      -1LL          ,
 | |
| 			      &msg5b        ,
 | |
| 			      false         )) {
 | |
| 		g_threads.enableThreads();
 | |
| 		log("db: HEY! it did not block");
 | |
| 		return;
 | |
| 	}
 | |
| 
 | |
| 	int32_t count = 0;
 | |
| 	int32_t got   = 0;
 | |
| 	for ( list.resetListPtr() ; ! list.isExhausted() ;
 | |
| 	      list.skipCurrentRecord() ) {
 | |
| 		key_t k = list.getCurrentKey();
 | |
| 		count++;
 | |
| 		//uint32_t groupId = k.n1 & g_hostdb.m_groupMask;
 | |
| 		//uint32_t groupId = getGroupId ( RDB_INDEXDB , &k );
 | |
| 		//if ( groupId == g_hostdb.m_groupId ) got++;
 | |
| 		uint32_t shardNum = getShardNum( RDB_INDEXDB , &k );
 | |
| 		if ( shardNum == getMyShardNum() ) got++;
 | |
| 	}
 | |
| 	if ( got != count ) {
 | |
| 		BigFile *f = rdbBase->getFile(currentFile);
 | |
| 		log ("db: File %s: Out of first %" INT32 " records in indexdb, "
 | |
| 		     "only %" INT32 " belong to our group.",
 | |
| 		     f->getFilename(),count,got );
 | |
| 	}
 | |
| 	//else
 | |
| 	//	log ( LOG_INFO, "db: File %" INT32 ": Indexdb passed verification "
 | |
| 	//	      "successfully for %" INT32 " recs.",currentFile,count );
 | |
| 	// next file
 | |
| 	currentFile++;
 | |
| 	goto deepLoop;
 | |
| }
 | |
| 
 | |
| // . see Indexdb.h for format of the 12 byte key
 | |
| // . TODO: substitute var ptrs if you want extra speed
 | |
| key_t Indexdb::makeKey ( int64_t          termId   , 
 | |
| 			 unsigned char      score    , 
 | |
| 			 uint64_t docId    , 
 | |
| 			 bool               isDelKey ) {
 | |
| 	// make sure we mask out the hi bits we do not use first
 | |
| 	termId = termId & TERMID_MASK;
 | |
| 	key_t key ;
 | |
| 	char *kp = (char *)&key;
 | |
| 	char *tp = (char *)&termId;
 | |
| 	char *dp = (char *)&docId;
 | |
| 	// store termid
 | |
| 	*(int16_t *)(kp+10) = *(int16_t *)(tp+4);
 | |
| 	*(int32_t  *)(kp+ 6) = *(int32_t  *)(tp  );
 | |
| 	// store the complement of the score
 | |
| 	kp[5] = ~score;
 | |
| 	// . store docid
 | |
| 	// . make room for del bit and half bit
 | |
| 	docId <<= 2;
 | |
| 	*(int32_t *)(kp+1) = *(int32_t *)(dp+1);
 | |
| 	kp[0] = dp[0];
 | |
| 	// turn off half bit
 | |
| 	kp[0] &= 0xfd;
 | |
| 	// turn on/off delbit
 | |
| 	if ( isDelKey ) kp[0] &= 0xfe;
 | |
| 	else            kp[0] |= 0x01;
 | |
| 	// key is complete
 | |
| 	return key;
 | |
| }
 | |
| 
 | |
| // . accesses RdbMap to estimate size of the indexList for this termId
 | |
| // . returns an UPPER BOUND
 | |
| int64_t Indexdb::getTermFreq ( collnum_t collnum , int64_t termId ) {
 | |
| 	// establish the list boundary keys
 | |
| 	key_t startKey = makeStartKey ( termId );
 | |
| 	key_t endKey   = makeEndKey   ( termId );
 | |
| 	// . ask rdb for an upper bound on this list size
 | |
| 	// . but actually, it will be somewhat of an estimate 'cuz of RdbTree
 | |
| 	key_t maxKey;
 | |
| 	// divide by 6 since indexdb's recs are 6 bytes each, except for first
 | |
| 	int64_t maxRecs;
 | |
| 	// . don't count more than these many in the map
 | |
| 	// . that's our old truncation limit, the new stuff isn't as dense
 | |
| 	int32_t oldTrunc = 100000;
 | |
| 	// get maxKey for only the top "oldTruncLimit" docids because when
 | |
| 	// we increase the trunc limit we screw up our extrapolation! BIG TIME!
 | |
| 	maxRecs=m_rdb.getListSize(collnum,startKey,endKey,&maxKey,oldTrunc )/6;
 | |
| 	// . TRUNCATION NOW OBSOLETE
 | |
| 	return maxRecs;
 | |
| 	
 | |
| 	// . is this termId truncated in this indexdb?
 | |
| 	// . truncationLimit of Indexdb is max # of records for one termId
 | |
| 	//if ( (int64_t)maxRecs < getTruncationLimit() ) return maxRecs;
 | |
| 	// . no, i like to raise truncation limit on the fly, so if we
 | |
| 	//   still have that line above then nothing would seem to be
 | |
| 	//   truncated, would it?
 | |
| 	// . so just, use a minimal truncation limit then
 | |
| 	if ( maxRecs < MIN_TRUNC ) return maxRecs;
 | |
| 
 | |
| 	// this var is so we can adjust the # of recs lost due to truncation
 | |
| 	int64_t numRecs = maxRecs ;
 | |
| 
 | |
| 	// . get last score we got
 | |
| 	// . if it is > 1 then we probably got the 1's truncated off
 | |
| 	unsigned char shy       = g_indexdb.getScore ( maxKey );
 | |
| 	int64_t     lastDocId = g_indexdb.getDocId ( maxKey );
 | |
| 	// . which page has first key with this score (shy)?
 | |
| 	// . modify maxKey
 | |
| 	key_t midKey = g_indexdb.makeKey   ( termId , shy , 0LL , true );
 | |
| 	// get # of recs that have this termId and score
 | |
| 	int32_t  lastChunk = m_rdb.getListSize(collnum,
 | |
| 					    midKey,endKey,&maxKey,oldTrunc)/ 6;
 | |
| 	// now interpolate number of uncounted docids for the score "shy"
 | |
| 	int32_t remaining = (((int64_t)lastChunk) * lastDocId) / 
 | |
| 		(int64_t)DOCID_MASK ;
 | |
| 
 | |
| 	// add in remaining # of docids from the score "shy"
 | |
| 	numRecs += remaining;
 | |
| 
 | |
| 	// log it
 | |
| 	log(LOG_DEBUG,"query: Adding %" INT32 " (%" INT32 ") to score --> %" INT64 ".", 
 | |
| 	    remaining,lastChunk,numRecs);
 | |
| 
 | |
| 	// . if we got a meta tag here, scores are MOSTLY the same
 | |
| 	//   and we should not interpolate based on score
 | |
| 	// . if we got a meta tag scores are usually 33 or more
 | |
| 	// . TODO: figure out a way to do this correctly
 | |
| 	if ( shy > 20 ) shy = 0;
 | |
| 
 | |
| 	// debug msg
 | |
| 	//log("endKey.n0=%" XINT64 " startKey.n0=%" XINT64 "", endKey.n0 , startKey.n0 );
 | |
| 	//log("maxRecs=%" UINT64 " maxKey.n0=%" XINT64 " shy=%" INT32 "",maxRecs,maxKey.n0,shy);
 | |
| 
 | |
| 	// don't loop forever
 | |
| 	if ( shy == 0 ) shy = 1;
 | |
| 	// . if last score is > 1 then interpolate just based on the score
 | |
| 	// . a score of i has about 1.5 times the docids of a score of i+1
 | |
| 	// . so if max score (255) has N docs, then we got
 | |
| 	//   TOTAL = N + Nx + Nxx + Nxxx + ... ( where x = 1.5)
 | |
| 	// . therefore, if we lost the score of 1, we just multiply total
 | |
| 	//   docs for scores of 2 though 255 by 1.5 and add N, if N is small,
 | |
| 	//   which it is, don't bother adding it
 | |
| 	// . unfortunately, if we increase the trunc limit we'll often
 | |
| 	//   quickly get lower scoring docids in as porous filler so "shy" will
 | |
| 	//   equal 1 and we won't extrapolate, because we won't know that
 | |
| 	//   a bunch of other docids are really missing
 | |
| 	// . TODO: extrapolate based on last docid, too, not just score,
 | |
| 	//   that way we are way more continuous
 | |
| 	// . FIX: now we use g_conf.m_oldTruncationLimit
 | |
| 	while ( shy-- > 1 ) {
 | |
| 		// this is exponential
 | |
| 		numRecs = (numRecs * 1436LL /*1106*//*1500*/ ) / 1000LL ;
 | |
| 		// only account for truncation by docid for the first round
 | |
| 		//if ( numRecs == maxRecs ) {
 | |
| 		//	// make up for missed docids
 | |
| 		//	uint64_t d = g_indexdb.getDocId ( maxKey );
 | |
| 		//	toAdd = (toAdd * DOCID_MASK) / d;
 | |
| 		//}
 | |
| 		//numRecs += toAdd;
 | |
| 	}
 | |
| 
 | |
| 	// log it
 | |
| 	log(LOG_DEBUG,"query: Interpolated tf to %" INT64 ".", numRecs );
 | |
| 
 | |
| 	// debug msg
 | |
| 	//log("numRecs=%" UINT64 "",numRecs);
 | |
| 
 | |
| 	// . see PageRoot.cpp for explanation of this:
 | |
| 	// . so starting with Lars we'll use checksumdb
 | |
| 	//#ifdef _LARS_
 | |
| 	//int64_t trecs = g_checksumdb.getRdb()->getNumGlobalRecs();
 | |
| 	int64_t trecs = g_clusterdb.getRdb()->getNumGlobalRecs();
 | |
| 	//#else
 | |
| 	//int64_t trecs = g_clusterdb.getRdb()->getNumGlobalRecs() ;
 | |
| 	//#endif
 | |
| 	if ( numRecs > trecs ) numRecs = trecs;
 | |
| 
 | |
| 	// TODO: watch out for explosions! (if all scores are the same...)
 | |
| 	if ( maxRecs > numRecs ) return maxRecs;
 | |
| 	return numRecs;
 | |
| }
 | |
| 
 | |
| // keys are stored from lowest to highest
 | |
| key_t Indexdb::makeStartKey ( int64_t termId ) {
 | |
| 	return makeKey ( termId , 255/*score*/ , 
 | |
| 			 0x0000000000000000LL/*docId*/ , true/*delKey?*/ );
 | |
| }
 | |
| key_t Indexdb::makeEndKey   ( int64_t termId ) {
 | |
| 	return makeKey ( termId , 0/*score*/ , 
 | |
| 			 0xffffffffffffffffLL/*docId*/ , false/*delKey?*/ );
 | |
| }
 |