forked from Mirrors/privacore-open-source-search-engine
Licence to kill (gotos)
This commit is contained in:
236
PosdbTable.cpp
236
PosdbTable.cpp
@ -3218,7 +3218,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
if ( ! qti->m_matchingSubListSavedCursor[k] ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// getMaxPossibleScore() incremented m_matchingSubListCursor to
|
||||
// the next docid so use m_matchingSubListSavedCursor.
|
||||
nwp [nsub] = qti->m_matchingSubListSavedCursor [k];
|
||||
@ -3251,130 +3251,134 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
}
|
||||
// . ok, merge the lists into a list in mbuf
|
||||
// . get the min of each list
|
||||
mergeMore:
|
||||
|
||||
int32_t mink = -1;
|
||||
for ( int32_t k = 0 ; k < nsub ; k++ ) {
|
||||
// skip if list is exhausted
|
||||
if ( ! nwp[k] ) {
|
||||
continue;
|
||||
bool currTermDone = false;
|
||||
char ks;
|
||||
|
||||
do {
|
||||
int32_t mink = -1;
|
||||
ks = 0;
|
||||
|
||||
for ( int32_t k = 0 ; k < nsub ; k++ ) {
|
||||
// skip if list is exhausted
|
||||
if ( ! nwp[k] ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// auto winner?
|
||||
if ( mink == -1 ) {
|
||||
mink = k;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( KEYCMP(nwp[k],nwp[mink],6) < 0 ) {
|
||||
mink = k; // a new min...
|
||||
}
|
||||
}
|
||||
|
||||
// auto winner?
|
||||
// all exhausted? merge next set of sublists then for term #j
|
||||
if ( mink == -1 ) {
|
||||
mink = k;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( KEYCMP(nwp[k],nwp[mink],6) < 0 ) {
|
||||
mink = k; // a new min...
|
||||
}
|
||||
}
|
||||
|
||||
// all exhausted? merge next set of sublists then for term #j
|
||||
if ( mink == -1 ) {
|
||||
miniMergedEnd[j] = mptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
// get keysize
|
||||
char ks = Posdb::getKeySize(nwp[mink]);
|
||||
|
||||
// HACK OF CONFUSION:
|
||||
//
|
||||
// skip it if its a query phrase term, like
|
||||
// "searchengine" is for the 'search engine' query
|
||||
// AND it has the synbit which means it was a bigram
|
||||
// in the doc (i.e. occurred as two separate terms)
|
||||
//
|
||||
// second check means it occurred as two separate terms
|
||||
// or could be like bob and occurred as "bob's".
|
||||
// see XmlDoc::hashWords3().
|
||||
if ( ! ((nwpFlags[mink] & BF_BIGRAM) && (nwp[mink][2] & 0x03)) ) {
|
||||
|
||||
// if the first key in our merged list store the docid crap
|
||||
if ( isFirstKey ) {
|
||||
// store a 12 byte key in the merged list buffer
|
||||
memcpy ( mptr, nwp[mink], 12 );
|
||||
// wipe out its syn bits and re-use our way
|
||||
mptr[2] &= 0xfc;
|
||||
// set the synbit so we know if its a synonym of term
|
||||
if ( nwpFlags[mink] & (BF_BIGRAM|BF_SYNONYM)) {
|
||||
mptr[2] |= 0x02;
|
||||
}
|
||||
|
||||
// wiki half stop bigram? so for the query
|
||||
// 'time enough for love' the phrase term "enough for"
|
||||
// is a half stopword wiki bigram, because it is in
|
||||
// a phrase in wikipedia ("time enough for love") and
|
||||
// one of the two words in the phrase term is a
|
||||
// stop word. therefore we give it more weight than
|
||||
// just 'enough' by itself.
|
||||
if ( nwpFlags[mink] & BF_HALFSTOPWIKIBIGRAM ) {
|
||||
mptr[2] |= 0x01;
|
||||
}
|
||||
|
||||
// make sure its 12 bytes! it might have been
|
||||
// the first key for the termid, and 18 bytes.
|
||||
mptr[0] &= 0xf9;
|
||||
mptr[0] |= 0x02;
|
||||
// save it
|
||||
lastMptr = mptr;
|
||||
mptr += 12;
|
||||
isFirstKey = false;
|
||||
// continue outer "j < m_numQueryTermInfos" loop.
|
||||
currTermDone = true;
|
||||
}
|
||||
else {
|
||||
// if matches last key word position, do not add!
|
||||
// we should add the bigram first if more important
|
||||
// since it should be added before the actual term
|
||||
// above in the sublist array. so if they are
|
||||
// wikihalfstop bigrams they will be added first,
|
||||
// otherwise, they are added after the regular term.
|
||||
// should fix double scoring bug for 'cheat codes'
|
||||
// query!
|
||||
if ( Posdb::getWordPos(lastMptr) != Posdb::getWordPos(nwp[mink]) ) {
|
||||
memcpy ( mptr, nwp[mink], 6 );
|
||||
// wipe out its syn bits and re-use our way
|
||||
mptr[2] &= 0xfc;
|
||||
// set the synbit so we know if its a synonym of term
|
||||
if ( nwpFlags[mink] & (BF_BIGRAM|BF_SYNONYM)) {
|
||||
mptr[2] |= 0x02;
|
||||
}
|
||||
// get keysize
|
||||
ks = Posdb::getKeySize(nwp[mink]);
|
||||
|
||||
if ( nwpFlags[mink] & BF_HALFSTOPWIKIBIGRAM ) {
|
||||
mptr[2] |= 0x01;
|
||||
}
|
||||
// HACK OF CONFUSION:
|
||||
//
|
||||
// skip it if its a query phrase term, like
|
||||
// "searchengine" is for the 'search engine' query
|
||||
// AND it has the synbit which means it was a bigram
|
||||
// in the doc (i.e. occurred as two separate terms)
|
||||
//
|
||||
// second check means it occurred as two separate terms
|
||||
// or could be like bob and occurred as "bob's".
|
||||
// see XmlDoc::hashWords3().
|
||||
if ( ! ((nwpFlags[mink] & BF_BIGRAM) && (nwp[mink][2] & 0x03)) ) {
|
||||
|
||||
// if it was the first key of its list it may not
|
||||
// have its bit set for being 6 bytes now! so turn
|
||||
// on the 2 compression bits
|
||||
mptr[0] &= 0xf9;
|
||||
mptr[0] |= 0x06;
|
||||
// save it
|
||||
lastMptr = mptr;
|
||||
mptr += 6;
|
||||
// if the first key in our merged list store the docid crap
|
||||
if ( isFirstKey ) {
|
||||
// store a 12 byte key in the merged list buffer
|
||||
memcpy ( mptr, nwp[mink], 12 );
|
||||
// wipe out its syn bits and re-use our way
|
||||
mptr[2] &= 0xfc;
|
||||
// set the synbit so we know if its a synonym of term
|
||||
if ( nwpFlags[mink] & (BF_BIGRAM|BF_SYNONYM)) {
|
||||
mptr[2] |= 0x02;
|
||||
}
|
||||
|
||||
// wiki half stop bigram? so for the query
|
||||
// 'time enough for love' the phrase term "enough for"
|
||||
// is a half stopword wiki bigram, because it is in
|
||||
// a phrase in wikipedia ("time enough for love") and
|
||||
// one of the two words in the phrase term is a
|
||||
// stop word. therefore we give it more weight than
|
||||
// just 'enough' by itself.
|
||||
if ( nwpFlags[mink] & BF_HALFSTOPWIKIBIGRAM ) {
|
||||
mptr[2] |= 0x01;
|
||||
}
|
||||
|
||||
// make sure its 12 bytes! it might have been
|
||||
// the first key for the termid, and 18 bytes.
|
||||
mptr[0] &= 0xf9;
|
||||
mptr[0] |= 0x02;
|
||||
// save it
|
||||
lastMptr = mptr;
|
||||
mptr += 12;
|
||||
isFirstKey = false;
|
||||
}
|
||||
else {
|
||||
// if matches last key word position, do not add!
|
||||
// we should add the bigram first if more important
|
||||
// since it should be added before the actual term
|
||||
// above in the sublist array. so if they are
|
||||
// wikihalfstop bigrams they will be added first,
|
||||
// otherwise, they are added after the regular term.
|
||||
// should fix double scoring bug for 'cheat codes'
|
||||
// query!
|
||||
if ( Posdb::getWordPos(lastMptr) != Posdb::getWordPos(nwp[mink]) ) {
|
||||
memcpy ( mptr, nwp[mink], 6 );
|
||||
// wipe out its syn bits and re-use our way
|
||||
mptr[2] &= 0xfc;
|
||||
// set the synbit so we know if its a synonym of term
|
||||
if ( nwpFlags[mink] & (BF_BIGRAM|BF_SYNONYM)) {
|
||||
mptr[2] |= 0x02;
|
||||
}
|
||||
|
||||
if ( nwpFlags[mink] & BF_HALFSTOPWIKIBIGRAM ) {
|
||||
mptr[2] |= 0x01;
|
||||
}
|
||||
|
||||
// if it was the first key of its list it may not
|
||||
// have its bit set for being 6 bytes now! so turn
|
||||
// on the 2 compression bits
|
||||
mptr[0] &= 0xf9;
|
||||
mptr[0] |= 0x06;
|
||||
// save it
|
||||
lastMptr = mptr;
|
||||
mptr += 6;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//log("skipping ks=%" PRId32,(int32_t)ks);
|
||||
// advance the cursor over the key we used.
|
||||
nwp[mink] += ks; // Posdb::getKeySize(nwp[mink]);
|
||||
// exhausted?
|
||||
if ( nwp[mink] >= nwpEnd[mink] ) {
|
||||
nwp[mink] = NULL;
|
||||
}
|
||||
|
||||
// or hit a different docid
|
||||
else if ( Posdb::getKeySize(nwp[mink]) != 6 ) {
|
||||
nwp[mink] = NULL;
|
||||
}
|
||||
|
||||
// avoid breach of core below now
|
||||
if ( mptr < mptrEnd ) {
|
||||
goto mergeMore;
|
||||
}
|
||||
|
||||
|
||||
// advance the cursor over the key we used.
|
||||
nwp[mink] += ks; // Posdb::getKeySize(nwp[mink]);
|
||||
|
||||
// exhausted?
|
||||
if ( nwp[mink] >= nwpEnd[mink] ) {
|
||||
nwp[mink] = NULL;
|
||||
}
|
||||
else
|
||||
if ( Posdb::getKeySize(nwp[mink]) != 6 ) {
|
||||
// or hit a different docid
|
||||
nwp[mink] = NULL;
|
||||
}
|
||||
} // mink != -1
|
||||
|
||||
//log("skipping ks=%" PRId32,(int32_t)ks);
|
||||
} while( !currTermDone && mptr < mptrEnd ); // merge more ...
|
||||
|
||||
// wrap it up here since done merging
|
||||
miniMergedEnd[j] = mptr;
|
||||
}
|
||||
|
Reference in New Issue
Block a user