#include "gb-include.h"
|
|
|
|
#include "IndexReadInfo.h"
|
|
#include "Datedb.h"
|
|
|
|
IndexReadInfo::IndexReadInfo() {
	m_numLists = 0;
	m_isDone   = false;
}

// . initialize initial read info
// . sets m_readSizes[i] for each list
// . sets startKey/endKey for each list, too
// . startKey set past endKey to indicate no reading
void IndexReadInfo::init ( Query *q ,
			   int64_t *termFreqs ,
			   int32_t docsWanted , char callNum ,
			   int32_t stage0 ,
			   int32_t *tierStage,
			   bool useDateLists , bool sortByDate ,
			   uint32_t date1 , uint32_t date2 ,
			   bool isDebug ) {
	// save ptr but don't copy
	m_q            = q;
	m_useDateLists = useDateLists;
	m_sortByDate   = sortByDate;
	m_date1        = date1;
	m_date2        = date2;
	m_isDebug      = isDebug;
	if ( m_useDateLists ) m_ks = 16;
	else                  m_ks = 12;
	m_hks = m_ks - 6;
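	// (For reference: indexdb keys are 12 bytes and datedb keys are 16
	//  bytes; after the first full key, subsequent keys in a list are
	//  stored as "half keys" of m_hks = m_ks - 6 bytes, i.e. 6 bytes
	//  for indexdb and 10 for datedb. All the read-size math below is
	//  in bytes of these compressed lists.)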
	// . now set m_includeList array
	// . set to false if we determine termId to be ousted due to dpf
	// . loop over each termId in the query
	for ( int32_t i = 0 ; i < m_q->getNumTerms() ; i++ ) {
		// ignore some
		//m_ignore [i] = m_q->m_ignore[i];
		// no need to gen keys if ignored
		//if ( m_ignore[i] ) continue;
		// nothing ignored initially
		m_ignore[i] = false;
		// make our arrays 1-1 with those in Query class, q
		if ( m_useDateLists ) {
			// remember, date is complemented in the key, so use
			// the larger date for the startKey
			*(key128_t *)&m_startKeys [i*m_ks] =
			      g_datedb.makeStartKey(m_q->getTermId(i),m_date2);
			*(key128_t *)&m_endKeys   [i*m_ks] =
			      g_datedb.makeEndKey  (m_q->getTermId(i),m_date1);
			continue;
		}

		*(key_t *)&m_startKeys [i*m_ks] =
			g_indexdb.makeStartKey ( m_q->getTermId(i) );
		*(key_t *)&m_endKeys   [i*m_ks] =
			g_indexdb.makeEndKey   ( m_q->getTermId(i) );
	}
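	// (With no date constraint, makeStartKey/makeEndKey bracket every
	//  possible key for the termId, so each term's read begins at the
	//  very start of its list; the date1/date2 window only narrows the
	//  datedb range, and since dates are complemented in the key the
	//  larger date, date2, belongs to the startKey.)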
	// no negatives
	for ( int32_t i = 0; i < MAX_TIERS; i++ ){
		if ( tierStage[i] < 0 )
			tierStage[i] = 0;
	}

	// stage0 <= 0 means use the default tier size
	if ( stage0 <= 0 ) {
		// adjust for dateLists, proportionally
		if ( m_useDateLists )
			m_stage[0] = (tierStage[0] * (16-6)) / (12-6);
		else
			m_stage[0] = tierStage[0]; // STAGE0;
	}
	else
		m_stage[0] = stage0 * m_hks + 6;

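	// (The (16-6)/(12-6) factor above rescales a byte budget sized for
	//  6-byte indexdb half keys to 10-byte datedb half keys. E.g. a
	//  hypothetical tierStage[0] of 60000 bytes covers ~10000 indexdb
	//  half keys, so the equivalent datedb budget is 60000*10/6 =
	//  100000 bytes. An explicit stage0, by contrast, is a key count,
	//  so it becomes stage0*m_hks bytes plus 6 for the full-size key
	//  at the head of the list.)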
	// for all the other stages just get the same tier size
	for ( int32_t i = 1; i < MAX_TIERS; i++ ){
		// adjust for dateLists, proportionally
		if ( m_useDateLists )
			m_stage[i] = (tierStage[i] * (16-6)) / (12-6);
		else
			m_stage[i] = tierStage[i];
	}


	// set # of lists
	m_numLists = m_q->getNumTerms();
	// we're not done yet, we haven't even begun
	m_isDone = false;

	// . how many docs do we need to read to get docsWanted hits?
	// . HITS = (X1 * X2 * ... * XN) / T^(N-1)
	// . where Xi is docs read from each list
	// . T is the total # of docs in the index
	// . this assumes no dependence between the words
	// . So let's just start off reading 10,000, then 30k more then 60k
	// . So we break up our 100k truncation limit that way
	int32_t toRead = m_stage[(int)callNum];

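	// (Derivation of the estimate above, assuming the terms are
	//  independent: a random docid appears in list i with probability
	//  Xi/T, so the expected number of docids common to all N lists is
	//  T * (X1/T) * ... * (XN/T) = (X1*...*XN)/T^(N-1). For example,
	//  with T = 1,000,000 docs and two lists of 10,000 docids each we
	//  expect about 10,000*10,000/1,000,000 = 100 hits.)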
	int64_t  def = getStage0Default() ;
	int64_t *tf  = termFreqs ;

	// . ...but if we're only reading 1 list...
	// . keys are 6 bytes each, first key is 12 bytes
	// . this made our result count inaccurate
	// . since we had to round up to half a PAGE_SIZE
	//   (#defined to be 16k in RdbMap.h) we would never estimate at
	//   lower than about 4,000 docids for one-word queries
	// . so, since we're going to read at least a PAGE_SIZE anyway,
	//   removing this should not slow us down!!
	// . actually, it should speed us up if all the docids site cluster,
	//   which is especially probable for rare terms --- all from the
	//   same site
	// . SECONDLY, now i use Msg39::getStageNum() to do prettier
	//   clustering and that requires us to be consistent with our
	//   stages from Next 10 to Next 10
	//if ( m_q->getNumTerms() <= 1 ) toRead = docsWanted * 6 + 6;
	// now loop through all non-ignored lists
	for ( int32_t i = 0 ; i < m_numLists ; i++ ) {
		// ignore lists that should be
		if ( m_ignore[i] ) { m_readSizes[i]=0; continue; }
		// don't include excluded lists in this calculation
		if ( m_q->m_qterms[i].m_termSign == '-' )
			m_readSizes[i] = m_stage[MAX_TIERS - 1] ; // STAGESUM;
		else if ( m_q->m_qterms[i].m_underNOT )
			m_readSizes[i] = m_stage[MAX_TIERS - 1] ; // STAGESUM;
		else if ( m_q->m_qterms[i].m_piped )
			m_readSizes[i] = m_stage[MAX_TIERS - 1] ; // STAGESUM;
		//m_readSizes[i] = g_indexdb.getTruncationLimit() *6+6;
		//m_readSizes[i] = g_indexdb.getTruncationLimit()*6 ;
		// . this is set to max if we got more than 1 ignored list
		// . later we will use dynamic truncation
		/*else if (useNewTierSizing && m_q->m_termFreqs[i] > tierStage2)
			m_readSizes[i] = tierStage2;
		else if (useNewTierSizing && m_q->m_termFreqs[i] > tierStage1)
			m_readSizes[i] = tierStage1;*/
		else m_readSizes[i] = toRead;

		// . when user specifies the s0=X cgi parm and X is like 4M
		//   try to avoid allocating so much space when we do not
		//   need it
		// . mark is using s0 to get exact hit counts
		int64_t max = tf[i] * m_hks + m_hks + GB_INDEXDB_PAGE_SIZE*10 ;
		if ( max < def ) max = def;
		if ( m_readSizes[i] > max ) m_readSizes[i] = max;
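		// (The cap above is roughly the entire list plus slack:
		//  tf[i] docids at m_hks bytes each, a little extra for the
		//  full-size first key, and ten pages of padding. Reading
		//  more bytes than the list can hold is pointless, so an
		//  oversized s0 request is clamped here, but never below
		//  the stage-0 default.)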
		// debug msg
		if ( m_isDebug || g_conf.m_logDebugQuery )
			logf ( LOG_DEBUG,"query: ReadInfo: "
			       "newreadSizes[%"INT32"]=%"INT32"",i,
			       m_readSizes[i] );

		// sanity check
		if ( m_readSizes[i] > ( 500 * 1024 * 1024 ) ||
		     m_readSizes[i] < 0 ){
			log( "minRecSize = %"INT32"", m_readSizes[i] );
			// intentional null deref to force a core dump here
			char *xx=NULL; *xx=0;
		}
	}
	// return for now
	return;
}

int32_t IndexReadInfo::getStage0Default ( ) { return STAGE0; }

// . updates m_readSizes
// . sets m_isDone to true if all lists are exhausted
void IndexReadInfo::update ( IndexList *lists, int32_t numLists,
			     char callNum ) {
	// loop over all lists and update m_startKeys[i]
	for ( int32_t i = 0 ; i < numLists ; i++ ) {
		// ignore lists that should be
		if ( m_ignore[i] ) continue;
		// . how many docIds did we read into this list?
		// . double the size since the lists are compressed to half
		//   now
		//int32_t docsRead = lists[i].getListSize() / 6 ;
		// . remove the endKey put at the end by RdbList::appendLists()
		// . iff we did NOT do a merge
		//if ( ! didMerge && docsRead > 0 ) docsRead--;
		// debug
		//log("startKey for list #%"INT32" is n1=%"XINT32",n0=%"XINT64" "
		//    "(docsRead=%"INT32")",
		//    i,m_startKeys[i].n1,m_startKeys[i].n0,docsRead);
		// . if we read less than we were supposed to, this list is
		//   exhausted, so we set m_ignore[i] to true so we don't
		//   read it again
		// . we also now update termFreq to its exact value
		// . ok, this condition doesn't apply now because when we
		//   append lists so that they are all less than a common
		//   endKey, some lose keys and the minRecSizes goes down
		// . we should just check if the # read is 0!
		//if ( docsRead < m_docsToRead[i] ) {
		if ( lists[i].getListSize() < m_readSizes[i] ) {
			m_ignore [i] = true;
			//m_readSizes[i] = 0;
			continue;
		}
		// if we didn't meet our quota...
		//else if ( docsRead < m_docsToRead[i] )
		//	m_startKeys [i] = m_endKeys [i] ;
		// point to last compressed 6 byte key in list
		char *list       = (char *)lists[i].getList();
		int32_t listSize = lists[i].getListSize();
		// don't seg fault
		if ( listSize < m_hks ) {
			m_ignore [i] = true;
			// keep the old readsize
			//m_readSizes[i] = 0;
			continue;
		}
		// we now do NOT call appendLists() again since
		// we're using fast superMerges
		//char *lastPart = list + listSize - 6;
		char *lastPart = list + listSize - m_hks;
		// . we update m_startKey to the endKey of each list
		// . get the startKey now
		//key_t startKey = m_startKeys[i];
		char *startKey = &m_startKeys[i*m_ks];
		// . load lastPart into the lower m_hks bytes of "startKey"
		// . little endian
		//gbmemcpy ( &startKey , lastPart , 6 );
		gbmemcpy ( startKey , lastPart , m_hks );
		// debug msg
		//log("pre-startKey for list #%"INT32" is n1=%"XINT32",n0=%"XINT64"",
		//    i,startKey.n1,startKey.n0);
		// sanity checks
		//if ( startKey < m_startKeys[i] ) {
		// NOTE: startKey points into m_startKeys[i*m_ks] itself, so
		// this compares the key against itself and can never be
		// negative
		if ( KEYCMP(startKey,&m_startKeys[i*m_ks],m_ks)<0 ) {
			log("query: bad startKey. "
			    "a.n1=%016"XINT64" a.n0=%016"XINT64" < "
			    "b.n1=%016"XINT64" b.n0=%016"XINT64"" ,
			    KEY1(startKey,m_ks),
			    KEY0(startKey ),
			    KEY1(&m_startKeys[i*m_ks],m_ks),
			    KEY0(&m_startKeys[i*m_ks] ));
			//startKey.n1 = 0xffffffff;
			//startKey.n0 = 0xffffffffffffffffLL;
		}
		// update startKey to read the next piece now
		//m_startKeys[i] = startKey;
		KEYSET(&m_startKeys[i*m_ks],startKey,m_ks);
		// add 1 to startKey
		//m_startKeys[i] += (uint32_t) 1;
		KEYADD(&m_startKeys[i*m_ks],1,m_ks);
		// debug msg
		//log("NOW startKey for list #%"INT32" is n1=%"XINT32",n0=%"XINT64"",
		//    i,m_startKeys[i].n1,m_startKeys[i].n0);
		// . increase termFreqs if we read more than was estimated
		// . no! just changes # of total results when clicking Next 10
		//if ( docsRead > m_q->m_termFreqs[i] )
		//	m_q->m_termFreqs[i] = docsRead;
	}
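	// (Net effect of the loop above: every list that filled its read
	//  quota has its startKey bumped to one past the last key it
	//  returned, so the next stage resumes reading exactly where this
	//  one left off; lists that came back short are marked ignored and
	//  are not read again.)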
	// find the first list that can still read more, if any
	int32_t i;
	for ( i = 0 ; i < numLists ; i++ ) if ( ! m_ignore[i] ) break;
	// if all lists are exhausted, set m_isDone
	if ( i >= numLists ) { m_isDone = true; return; }
	// . based on the # of results we got, how much more do we have to
	//   read to get what we want, "docsWanted"?
	// . just base it on linear proportion
	// . keep in mind, if we double the amount to read we will quadruple
	//   the results if reading 2 indexLists, x8 if reading from 3.
	// . that doesn't take into account phrases though...
	// . let's just do it this way
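	// (This follows from the hit estimate in init(): doubling every Xi
	//  scales (X1*...*XN)/T^(N-1) by a factor of 2^N, i.e. 4x for two
	//  lists and 8x for three.)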
	// loop over all lists and update m_readSizes[i]
	for ( int32_t i = 0 ; i < numLists ; i++ ) {
		// ignore lists that should be
		if ( m_ignore[i] ) continue;
		// update each list's docs to read
		m_readSizes[i] = m_stage[(int)callNum];
		/* if ( m_readSizes[i] < m_stage[0])
			m_readSizes[i] = m_stage0;
		else if ( m_readSizes[i] < m_stage[1])
			m_readSizes[i] = m_stage1;
		else
			m_readSizes[i] = m_stage2;*/
		// debug msg
		log("newreadSizes[%"INT32"]=%"INT32"",i,m_readSizes[i]);
	}
}


// . updates m_readSizes
// . sets m_isDone to true if all lists are exhausted
// . used by virtual split in msg3b to check if we're done or not.
void IndexReadInfo::update ( int64_t *termFreqs,
			     int32_t numLists,
			     char callNum ) {
	// loop over all lists and update m_ignore[i]
	for ( int32_t i = 0 ; i < numLists ; i++ ) {
		// ignore lists that should be
		if ( m_ignore[i] ) continue;
		// . how many bytes did we read ? Since these are
		// . half keys, multiply termFreqs by 6 and add 6 for the
		// . first key which is a full 12 bytes
		int64_t listSize = termFreqs[i] * 6 + 6;

		if ( listSize < m_readSizes[i] ) {
			m_ignore [i] = true;
			//m_readSizes[i] = 0;
			continue;
		}
		// if we didn't meet our quota...
		//else if ( docsRead < m_docsToRead[i] )
		//	m_startKeys [i] = m_endKeys [i] ;
		// point to last compressed 6 byte key in list
		//char *list = (char *)lists[i].getList();

		// don't seg fault
		if ( listSize < m_hks ) {
			m_ignore [i] = true;
			//m_readSizes[i] = 0;
			continue;
		}
	}
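	// (Unlike the IndexList-based update() above, this variant has no
	//  list data to inspect; it treats the exact termFreq-derived byte
	//  size as the list size, so any list smaller than its read quota
	//  is considered fully read and ignored from here on.)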
	// find the first list that can still read more, if any
	int32_t i;
	for ( i = 0 ; i < numLists ; i++ ) if ( ! m_ignore[i] ) break;
	// if all lists are exhausted, set m_isDone
	if ( i >= numLists ) { m_isDone = true; return; }
	// . based on the # of results we got, how much more do we have to
	//   read to get what we want, "docsWanted"?
	// . just base it on linear proportion
	// . keep in mind, if we double the amount to read we will quadruple
	//   the results if reading 2 indexLists, x8 if reading from 3.
	// . that doesn't take into account phrases though...
	// . let's just do it this way
	// loop over all lists and update m_readSizes[i]
	for ( int32_t i = 0 ; i < numLists ; i++ ) {
		// debug msg
		//log("oldreadSizes[%"INT32"]=%"INT32"",i,m_readSizes[i]);
		// update each list's docs to read if we're not on the last
		// tier
		if ( !m_ignore[i] && callNum < MAX_TIERS &&
		     m_readSizes[i] < m_stage[(int)callNum] )
			m_readSizes[i] = m_stage[(int)callNum];
		/*if ( m_readSizes[i] < m_stage0)
			m_readSizes[i] = m_stage0;
		else if ( m_readSizes[i] < m_stage1)
			m_readSizes[i] = m_stage1;
		else
			m_readSizes[i] = m_stage2;*/
		// debug msg
		if ( m_isDebug || g_conf.m_logDebugQuery )
			logf ( LOG_DEBUG,"query: ReadInfo: "
			       "newreadSizes[%"INT32"]=%"INT32"",i,
			       m_readSizes[i] );
	}
}