Merge branch 'master' into sto

This commit is contained in:
Ivan Skytte Jørgensen
2017-12-04 13:03:12 +01:00
4 changed files with 13 additions and 295 deletions

@ -47,8 +47,6 @@ static bool s_inBody [HASHGROUP_END];
#define gbmin(a,b) ((a)<(b) ? (a) : (b))
#define gbmax(a,b) ((a)>(b) ? (a) : (b))
static inline bool isTermValueInRange( const char *p, const QueryTerm *qt);
static inline bool isTermValueInRange2 ( const char *recPtr, const char *subListEnd, const QueryTerm *qt);
static inline const char *getWordPosList(uint64_t docId, const char *list, int32_t listSize);
static int docIdVoteBufKeyCompare_desc ( const void *h1, const void *h2 );
static void initWeights();
@ -158,14 +156,6 @@ void PosdbTable::reset() {
m_sortByTermNumInt = -1;
m_sortByTermInfoNum = 0;
m_sortByTermInfoNumInt = 0;
m_minScoreTermNum = 0;
m_maxScoreTermNum = 0;
m_minScoreVal = 0.0;
m_maxScoreVal = 0.0;
m_minScoreTermNumInt = 0;
m_maxScoreTermNumInt = 0;
m_minScoreValInt = 0;
m_maxScoreValInt = 0;
m_useWhiteTable = false;
m_numQueryTermInfos = 0;
m_minTermListSize = 0;
@ -1427,14 +1417,6 @@ bool PosdbTable::setQueryTermInfo ( ) {
m_sortByTermNum = -1;
m_sortByTermNumInt = -1;
// now we have score ranges for gbmin:price:1.99 etc.
m_minScoreTermNum = -1;
m_maxScoreTermNum = -1;
// for gbminint:count:99 etc.
m_minScoreTermNumInt = -1;
m_maxScoreTermNumInt = -1;
m_hasMaxSerpScore = false;
if ( m_msg39req->m_minSerpDocId ) {
m_hasMaxSerpScore = true;
@ -1465,11 +1447,6 @@ bool PosdbTable::setQueryTermInfo ( ) {
qti->m_quotedStartId = qw->m_quoteStart;
switch(qt->m_fieldCode) {
// is it gbsortby:?
case FIELD_GBSORTBYFLOAT:
case FIELD_GBREVSORTBYFLOAT:
m_sortByTermNum = i;
m_sortByTermInfoNum = nrg;
break;
case FIELD_GBSORTBYINT:
case FIELD_GBREVSORTBYINT:
m_sortByTermNumInt = i;
@ -1477,23 +1454,6 @@ bool PosdbTable::setQueryTermInfo ( ) {
// tell topTree to use int scores
m_topTree->m_useIntScores = true;
break;
// is it gbmin:price:1.99?
case FIELD_GBNUMBERMIN:
m_minScoreTermNum = i;
m_minScoreVal = qt->m_qword->m_float;
break;
case FIELD_GBNUMBERMAX:
m_maxScoreTermNum = i;
m_maxScoreVal = qt->m_qword->m_float;
break;
case FIELD_GBNUMBERMININT:
m_minScoreTermNumInt = i;
m_minScoreValInt = qt->m_qword->m_int;
break;
case FIELD_GBNUMBERMAXINT:
m_maxScoreTermNumInt = i;
m_maxScoreValInt = qt->m_qword->m_int;
break;
default:
; //not numeric condition
}
@ -1631,17 +1591,8 @@ bool PosdbTable::setQueryTermInfo ( ) {
// numeric posdb termlist flags. instead of word position
// they have a float stored there for sorting etc.
switch(qt->m_fieldCode) {
case FIELD_GBSORTBYFLOAT:
case FIELD_GBREVSORTBYFLOAT:
case FIELD_GBNUMBERMIN:
case FIELD_GBNUMBERMAX:
case FIELD_GBNUMBEREQUALFLOAT:
case FIELD_GBSORTBYINT:
case FIELD_GBREVSORTBYINT:
case FIELD_GBNUMBERMININT:
case FIELD_GBNUMBERMAXINT:
case FIELD_GBNUMBEREQUALINT:
qti->m_subList[nn].m_bigramFlag |= BF_NUMBER;
break;
default:
@ -5042,25 +4993,6 @@ void PosdbTable::addDocIdVotes( const QueryTermInfo *qti, int32_t listGroupNum)
}
// range terms tend to disappear if the docid's value falls outside
// of the specified range... gbmin:offerprice:190
bool isRangeTerm;
const QueryTerm *qt = qti->m_subList[0].m_qt;
switch(qt->m_fieldCode) {
case FIELD_GBNUMBERMIN:
case FIELD_GBNUMBERMAX:
case FIELD_GBNUMBEREQUALFLOAT:
case FIELD_GBNUMBERMININT:
case FIELD_GBNUMBERMAXINT:
case FIELD_GBNUMBEREQUALINT:
//case FIELD_GBFIELDMATCH:
isRangeTerm = true;
break;
default:
isRangeTerm = false;
}
//
// add the first sublist's docids into the docid buf
//
@ -5072,7 +5004,7 @@ void PosdbTable::addDocIdVotes( const QueryTermInfo *qti, int32_t listGroupNum)
// for "car", resulting in a buffer with docids that contain
// both terms.
makeDocIdVoteBufForRarestTerm( qti, isRangeTerm);
makeDocIdVoteBufForRarestTerm(qti);
logTrace(g_conf.m_logTracePosdb, "END.");
return;
}
@ -5127,15 +5059,6 @@ void PosdbTable::addDocIdVotes( const QueryTermInfo *qti, int32_t listGroupNum)
continue;
}
// if we are a range term, does this subtermlist
// for this docid meet the min/max requirements
// of the range term, i.e. gbmin:offprice:190.
// if it doesn't then do not add this docid to the
// docidVoteBuf, "voteBufPtr"
if ( isRangeTerm && ! isTermValueInRange2(subListPtr, subListEnd, qt)) {
break;
}
// . equal! record our vote!
// . we start at zero for the
// first termlist, and go to 1, etc.
@ -5210,7 +5133,7 @@ void PosdbTable::addDocIdVotes( const QueryTermInfo *qti, int32_t listGroupNum)
// each run, the list is "compacted" and shortened so only the
// matching docids are left.
//
void PosdbTable::makeDocIdVoteBufForRarestTerm(const QueryTermInfo *qti, bool isRangeTerm) {
void PosdbTable::makeDocIdVoteBufForRarestTerm(const QueryTermInfo *qti) {
char *cursor[MAX_SUBLISTS];
char *cursorEnd[MAX_SUBLISTS];
@ -5226,7 +5149,6 @@ void PosdbTable::makeDocIdVoteBufForRarestTerm(const QueryTermInfo *qti, bool is
char *voteBufPtr = m_docIdVoteBuf.getBufStart();
char *lastMinRecPtr = NULL;
int32_t mini = -1;
const QueryTerm * const qt = qti->m_subList[0].m_qt;
// get the next min from all the termlists
for(;;) {
@ -5289,22 +5211,6 @@ void PosdbTable::makeDocIdVoteBufForRarestTerm(const QueryTermInfo *qti, bool is
return;
}
bool inRange=false;
// if we are a range term, does this subtermlist
// for this docid meet the min/max requirements
// of the range term, i.e. gbmin:offprice:190.
// if it doesn't then do not add this docid to the
// docidVoteBuf, "voteBufPtr"
if ( isRangeTerm ) {
// no longer in range
if ( isTermValueInRange2(cursor[mini],cursorEnd[mini],qt)) {
inRange = true;
}
}
// advance that guy over that docid
cursor[mini] += 12;
// 6 byte keys follow?
@ -5321,11 +5227,6 @@ void PosdbTable::makeDocIdVoteBufForRarestTerm(const QueryTermInfo *qti, bool is
break;
}
// check range again
if (isRangeTerm && isTermValueInRange2(cursor[mini],cursorEnd[mini],qt)) {
inRange = true;
}
// otherwise, skip this 6 byte key
cursor[mini] += 6;
}
@ -5350,10 +5251,6 @@ void PosdbTable::makeDocIdVoteBufForRarestTerm(const QueryTermInfo *qti, bool is
continue;
}
if ( isRangeTerm && ! inRange ) {
continue;
}
// only update this if we add the docid... that way there can be
// a winning "inRange" term in another sublist and the docid will
// get added.
@ -5419,24 +5316,6 @@ bool PosdbTable::makeDocIdVoteBufForBoolQuery( ) {
// just use the word # now
//int32_t opNum = qw->m_wordNum;//opNum;
// if this query term # is a gbmin:offprice:190 type
// of thing, then we may end up ignoring it based on the
// score contained within!
bool isRangeTerm;
switch(qt->m_fieldCode) {
case FIELD_GBNUMBERMIN:
case FIELD_GBNUMBERMAX:
case FIELD_GBNUMBEREQUALFLOAT:
case FIELD_GBNUMBERMININT:
case FIELD_GBNUMBERMAXINT:
case FIELD_GBNUMBEREQUALINT:
//case FIELD_GBFIELDMATCH:
isRangeTerm = true;
break;
default:
isRangeTerm = false;
}
// . make it consistent with Query::isTruth()
// . m_bitNum is set above to the QueryTermInfo #
int32_t bitNum = qt->m_bitNum;
@ -5467,10 +5346,6 @@ bool PosdbTable::makeDocIdVoteBufForBoolQuery( ) {
// place holder
int64_t docId = Posdb::getDocId(p);
// assume this docid is not in range if we
// had a range term like gbmin:offerprice:190
bool inRange = false;
// sanity
//if ( d < lastDocId )
// gbshutdownAbort(true);
@ -5479,12 +5354,6 @@ bool PosdbTable::makeDocIdVoteBufForBoolQuery( ) {
// point to it
//char *voteBufPtr = p + 8;
// check each posdb key for compliance
// for gbmin:offprice:190 bool terms
if ( isRangeTerm && isTermValueInRange(p,qt) ) {
inRange = true;
}
// this was the first key for this docid for
// this termid and possibly the first key for
// this termid, so skip it, either 12 or 18
@ -5501,23 +5370,10 @@ bool PosdbTable::makeDocIdVoteBufForBoolQuery( ) {
// same docid, so skip those as well
subloop:
if( p < pend && (((char *)p)[0]) & 0x04 ) {
// check each posdb key for compliance
// for gbmin:offprice:190 bool terms
if ( isRangeTerm && isTermValueInRange(p,qt) ) {
inRange = true;
}
p += 6;
goto subloop;
}
// if we had gbmin:offprice:190 and it
// was not satisfied, then do not OR in this
// bit in the bitvector for the docid
if ( isRangeTerm && ! inRange ) {
continue;
}
// convert docid into hash key
//int64_t docId = *(int64_t *)voteBufPtr;
// shift down 2 bits
@ -5665,63 +5521,6 @@ static int docIdVoteBufKeyCompare_desc ( const void *h1, const void *h2 ) {
// Used for boolean queries containing terms like gbmin:offerprice:190.
//
// Decides whether the number stored in the posdb key at "p" satisfies the
// numeric constraint carried by query term "qt":
//   - float constraints read the key with Posdb::getFloat() and compare
//     against qt->m_qword->m_float
//   - integer constraints read the key with Posdb::getInt() and compare
//     against qt->m_qword->m_int
// Returns true when the stored value is inside the requested range (or equal,
// for the "equal" variants) and false when it is outside.
// Any field code that is not a numeric constraint ends up in
// gbshutdownAbort(true).
static inline bool isTermValueInRange( const char *p, const QueryTerm *qt ) {
    // lower bound, float
    if ( qt->m_fieldCode == FIELD_GBNUMBERMIN ) {
        float stored = Posdb::getFloat(p);
        return stored >= qt->m_qword->m_float;
    }

    // upper bound, float
    if ( qt->m_fieldCode == FIELD_GBNUMBERMAX ) {
        float stored = Posdb::getFloat(p);
        return stored <= qt->m_qword->m_float;
    }

    // equality, float (tolerance is whatever almostEqualFloat() implements)
    if ( qt->m_fieldCode == FIELD_GBNUMBEREQUALFLOAT ) {
        float stored = Posdb::getFloat(p);
        return almostEqualFloat(stored, qt->m_qword->m_float);
    }

    // lower bound, integer
    if ( qt->m_fieldCode == FIELD_GBNUMBERMININT ) {
        int32_t stored = Posdb::getInt(p);
        return stored >= qt->m_qword->m_int;
    }

    // upper bound, integer
    if ( qt->m_fieldCode == FIELD_GBNUMBERMAXINT ) {
        int32_t stored = Posdb::getInt(p);
        return stored <= qt->m_qword->m_int;
    }

    // equality, integer
    if ( qt->m_fieldCode == FIELD_GBNUMBEREQUALINT ) {
        int32_t stored = Posdb::getInt(p);
        return stored == qt->m_qword->m_int;
    }

    // FIELD_GBFIELDMATCH used to be handled like FIELD_GBNUMBEREQUALINT
    // but is disabled:
    // if ( qt->m_fieldCode == FIELD_GBFIELDMATCH ) {
    //     int32_t stored = Posdb::getInt(p);
    //     return stored == qt->m_qword->m_int;
    // }

    // how did this happen?
    gbshutdownAbort(true);
}
// Checks every key belonging to one docid's record for a range term.
//
// "recPtr" points at the first key of the record, "subListEnd" is the end of
// the sublist and "qt" is the range query term. The first key is tested, then
// 12 bytes are skipped and each following 6-byte key is tested for as long as
// we stay below "subListEnd" and the key's first byte has the 0x04 bit set.
// Returns true as soon as one key passes isTermValueInRange(), false if none
// of them do.
static inline bool isTermValueInRange2 ( const char *recPtr, const char *subListEnd, const QueryTerm *qt ) {
    // the leading key of the record
    if ( isTermValueInRange(recPtr, qt) ) {
        return true;
    }

    // walk the 6-byte keys that follow the leading 12 bytes
    const char *key = recPtr + 12;
    while ( key < subListEnd && ( (*key) & 0x04 ) ) {
        if ( isTermValueInRange(key, qt) ) {
            return true;
        }
        key += 6;
    }

    // nothing was in range
    return false;
}
// . b-step into list looking for docid "docId"
// . assume p is start of list, excluding 6 byte of termid
static inline const char *getWordPosList(uint64_t docId, const char *list, int32_t listSize) {

@ -176,22 +176,6 @@ private:
int32_t m_sortByTermInfoNum;
int32_t m_sortByTermInfoNumInt;
// for gbmin:price:1.99
int32_t m_minScoreTermNum;
int32_t m_maxScoreTermNum;
// for gbmin:price:1.99
float m_minScoreVal;
float m_maxScoreVal;
// for gbmin:count:99
int32_t m_minScoreTermNumInt;
int32_t m_maxScoreTermNumInt;
// for gbmin:count:99
int32_t m_minScoreValInt;
int32_t m_maxScoreValInt;
HashTableX m_whiteListTable;
bool m_useWhiteTable;
@ -210,7 +194,7 @@ public:
// for intersecting docids
void addDocIdVotes( const QueryTermInfo *qti , int32_t listGroupNum );
void makeDocIdVoteBufForRarestTerm( const QueryTermInfo *qti , bool isRangeTerm );
void makeDocIdVoteBufForRarestTerm(const QueryTermInfo *qti);
bool makeDocIdVoteBufForBoolQuery() ;
void delDocIdVotes ( const QueryTermInfo *qti ); // for negative query terms...
bool findCandidateDocIds();

@ -1526,8 +1526,7 @@ bool Query::setQWords ( char boolFlag ,
// likewise for gbsortby operators watch out for boolean
// operators at the end of the field. we also check for
// parens below when computing the hash of the value.
if ( (fieldCode == FIELD_GBSORTBYINT ||
fieldCode == FIELD_GBSORTBYFLOAT ) &&
if ( (fieldCode == FIELD_GBSORTBYINT) &&
( w[0] == '(' || w[0] == ')' ) )
cancelField = true;
@ -1665,26 +1664,6 @@ bool Query::setQWords ( char boolFlag ,
if ( fieldCode == FIELD_TYPE )
ph = hash64 ("type",4);
// these are range constraints on the gbsortby: termlist
// which sorts numbers in a field from low to high
if ( fieldCode == FIELD_GBNUMBERMIN )
ph = hash64 ("gbsortby", 8);
if ( fieldCode == FIELD_GBNUMBERMAX )
ph = hash64 ("gbsortby", 8);
if ( fieldCode == FIELD_GBNUMBEREQUALFLOAT )
ph = hash64 ("gbsortby", 8);
// fix for gbsortbyfloat:product.price
if ( fieldCode == FIELD_GBSORTBYFLOAT )
ph = hash64 ("gbsortby", 8);
if ( fieldCode == FIELD_GBNUMBERMININT )
ph = hash64 ("gbsortbyint", 11);
if ( fieldCode == FIELD_GBNUMBERMAXINT )
ph = hash64 ("gbsortbyint", 11);
if ( fieldCode == FIELD_GBNUMBEREQUALINT )
ph = hash64 ("gbsortbyint", 11);
// ptr to field, if any
qw->m_fieldCode = fieldCode;
@ -1700,18 +1679,9 @@ bool Query::setQWords ( char boolFlag ,
fieldCode == FIELD_LINKS||
fieldCode == FIELD_SITE ||
fieldCode == FIELD_IP ||
fieldCode == FIELD_GBSORTBYFLOAT ||
fieldCode == FIELD_GBREVSORTBYFLOAT ||
// gbmin:price:1.23
fieldCode == FIELD_GBNUMBERMIN ||
fieldCode == FIELD_GBNUMBERMAX ||
fieldCode == FIELD_GBNUMBEREQUALFLOAT ||
fieldCode == FIELD_GBSORTBYINT ||
fieldCode == FIELD_GBREVSORTBYINT ||
fieldCode == FIELD_GBNUMBERMININT ||
fieldCode == FIELD_GBNUMBERMAXINT ||
fieldCode == FIELD_GBNUMBEREQUALINT ||
fieldCode == FIELD_GBFIELDMATCH ) {
// . find 1st space -- that terminates the field value
@ -1760,9 +1730,7 @@ bool Query::setQWords ( char boolFlag ,
// gbsortby:products.offerPrice
// gbmin:price:1.23 case insensitive
// too late... we have to support what we have
if (fieldCode == FIELD_GBSORTBYFLOAT ||
fieldCode == FIELD_GBREVSORTBYFLOAT ||
fieldCode == FIELD_GBSORTBYINT ||
if (fieldCode == FIELD_GBSORTBYINT ||
fieldCode == FIELD_GBREVSORTBYINT) {
wid = hash64Lower_utf8(w, wlen, 0LL);
// do not include this word as part of
@ -1817,39 +1785,6 @@ bool Query::setQWords ( char boolFlag ,
wid = hash64(val64, wid);
}
// gbmin:price:1.23
if (lastColonLen > 0 &&
(fieldCode == FIELD_GBNUMBERMIN ||
fieldCode == FIELD_GBNUMBERMAX ||
fieldCode == FIELD_GBNUMBEREQUALFLOAT ||
fieldCode == FIELD_GBNUMBEREQUALINT ||
fieldCode == FIELD_GBNUMBERMININT ||
fieldCode == FIELD_GBNUMBERMAXINT)) {
// record the field
wid = hash64Lower_utf8(w, lastColonLen, 0LL);
// fix gbminint:gbfacetstr:gbxpath...:165004297
if (colonCount == 2) {
int64_t wid1;
int64_t wid2;
const char *a = w;
const char *b = w + firstColonLen;
wid1 = hash64Lower_utf8(a, b - a);
a = w + firstColonLen + 1;
b = w + lastColonLen;
wid2 = hash64Lower_utf8(a, b - a);
// keep prefix as 2nd arg to this
wid = hash64(wid2, wid1);
// we need this for it to work
ph = 0LL;
}
// and also the floating point after that
qw->m_float = atof(w + lastColonLen + 1);
qw->m_int = (int32_t) atoll(w + lastColonLen + 1);
}
// should we have normalized before hashing?
if (fieldCode == FIELD_URL ||
fieldCode == FIELD_LINK ||

16
Query.h

@ -85,20 +85,20 @@ enum field_code_t {
//FIELD_UNUSED = 51,
FIELD_GBDOCID = 52,
FIELD_GBCONTENTHASH = 53, // for deduping at spider time
FIELD_GBSORTBYFLOAT = 54, // i.e. sortby:price -> numeric termlist
FIELD_GBREVSORTBYFLOAT = 55, // i.e. sortby:price -> low to high
FIELD_GBNUMBERMIN = 56,
FIELD_GBNUMBERMAX = 57,
//FIELD_GBSORTBYFLOAT = 54, // i.e. sortby:price -> numeric termlist
//FIELD_GBREVSORTBYFLOAT = 55, // i.e. sortby:price -> low to high
//FIELD_GBNUMBERMIN = 56,
//FIELD_GBNUMBERMAX = 57,
//FIELD_UNUSED = 58,
FIELD_GBSORTBYINT = 59,
FIELD_GBREVSORTBYINT = 60,
FIELD_GBNUMBERMININT = 61,
FIELD_GBNUMBERMAXINT = 62,
//FIELD_GBNUMBERMININT = 61,
//FIELD_GBNUMBERMAXINT = 62,
//FIELD_UNUSED = 63,
//FIELD_UNUSED = 64,
//FIELD_UNUSED = 65,
FIELD_GBNUMBEREQUALINT = 66,
FIELD_GBNUMBEREQUALFLOAT= 67,
//FIELD_GBNUMBEREQUALINT = 66,
//FIELD_GBNUMBEREQUALFLOAT= 67,
//FIELD_UNUSED = 68,
FIELD_GBFIELDMATCH = 69,
};