1135 lines
39 KiB
C++
1135 lines
39 KiB
C++
// Matt Wells, copyright Aug 2003
|
|
|
|
// Query is a class for parsing queries
|
|
|
|
#ifndef _QUERY_H_
|
|
#define _QUERY_H_
|
|
|
|
#include "SafeBuf.h"
|
|
#include "Mem.h"
|
|
|
|
// keep these down to save memory
|
|
//#define MAX_QUERY_LEN 8000 // url:XXX can be quite long! (MAX_URL_LEN)
|
|
//#define MAX_QUERY_LEN 3200
|
|
// support big OR queries for image shingles
|
|
#define ABS_MAX_QUERY_LEN 62000
|
|
// . words need to deal with int32_t list of sites!
|
|
// . remember, words can be string of punctuation, too
|
|
//#define MAX_QUERY_WORDS 5000
|
|
//#define MAX_QUERY_WORDS 32000
|
|
// not any more!
|
|
//#define MAX_QUERY_WORDS 320
|
|
// raise for crazy bool query on diffbot
|
|
// seems like we alloc just enough to hold our words now so that this
|
|
// is really a performance capper but it is used in Summary.cpp
|
|
// and Matches.h so don't go too big just yet
|
|
//#define MAX_QUERY_WORDS 800
|
|
#define ABS_MAX_QUERY_WORDS 99000
|
|
|
|
// . how many IndexLists might we get/intersect
|
|
// . we now use a int64_t to hold the query term bits for non-boolean queries
|
|
//#define MAX_QUERY_TERMS 216
|
|
//#define MAX_QUERY_TERMS 512
|
|
// seems like CTS is causing huge delay spiders in query processing so
|
|
// truncate for now...
|
|
//#define MAX_QUERY_TERMS 40
|
|
// we need more for zak's categories!
|
|
//#define MAX_QUERY_TERMS 1500
|
|
// nah, do 40 again
|
|
//#define MAX_QUERY_TERMS 40
|
|
// how to make a lock pick set loses synonyms from 40!
|
|
//#define MAX_QUERY_TERMS 80
|
|
//#define MAX_QUERY_TERMS 160
|
|
#define ABS_MAX_QUERY_TERMS 9000
|
|
|
|
// only allow up to 200 interests from facebook plus manually entered
|
|
// because we are limited by the query terms above so we can only
|
|
// UOR so many in SearchInput.cpp
|
|
#define MAX_INTERESTS 200
|
|
|
|
#define GBUF_SIZE (16*1024)
|
|
#define SYNBUF_SIZE (16*1024)
|
|
|
|
// score of highest-scoring query term in the QueryScore
|
|
//#define BASE_QUERY_SCORE 10000000
|
|
|
|
// let's support up to 64 query terms for now
|
|
typedef uint64_t qvec_t;
|
|
|
|
#define MAX_EXPLICIT_BITS (sizeof(qvec_t)*8)
|
|
|
|
#define MAX_OVEC_SIZE 256
|
|
|
|
// only can use 16-bit since have to make a 64k truth table!
|
|
#define MAX_EXPLICIT_BITS_BOOLEAN (16*8)
|
|
|
|
// field codes
|
|
#define FIELD_URL 1
|
|
#define FIELD_LINK 2
|
|
#define FIELD_SITE 3
|
|
#define FIELD_IP 4
|
|
#define FIELD_SUBURL 5
|
|
#define FIELD_TITLE 6
|
|
#define FIELD_TYPE 7
|
|
#define FIELD_EXT 21
|
|
#define FIELD_COLL 22
|
|
#define FIELD_ILINK 23
|
|
#define FIELD_LINKS 24
|
|
#define FIELD_SITELINK 25
|
|
// non-standard field codes
|
|
#define FIELD_ZIP 8
|
|
#define FIELD_CITY 9
|
|
#define FIELD_STREET 10
|
|
#define FIELD_AUTHOR 11
|
|
#define FIELD_LANG 12
|
|
#define FIELD_CLASS 13
|
|
#define FIELD_COUNTRY 14
|
|
#define FIELD_TAG 15
|
|
#define FIELD_STATE 16
|
|
#define FIELD_DATE 17
|
|
#define FIELD_GENERIC 18
|
|
#define FIELD_ISCLEAN 19 // we hash field="isclean:" val="1" if doc clean
|
|
//#define FIELD_RANGE 20 // date range OBSOLETE, was only for newspaperarchive
|
|
#define FIELD_CHARSET 30
|
|
#define FIELD_GBRSS 31
|
|
#define FIELD_URLHASH 32
|
|
#define FIELD_URLHASHDIV10 33
|
|
#define FIELD_URLHASHDIV100 34
|
|
#define FIELD_GBRULESET 35
|
|
#define FIELD_GBLANG 36
|
|
#define FIELD_GBQUALITY 37
|
|
#define FIELD_LINKTEXTIN 38
|
|
#define FIELD_LINKTEXTOUT 39
|
|
#define FIELD_KEYWORD 40
|
|
#define FIELD_QUOTA 41
|
|
#define FIELD_GBTAGVECTOR 42
|
|
#define FIELD_GBGIGABITVECTOR 43
|
|
#define FIELD_GBSAMPLEVECTOR 44
|
|
#define FIELD_SYNONYM 45
|
|
#define FIELD_GBCOUNTRY 46
|
|
#define FIELD_GBAD 47
|
|
#define FIELD_GBSUBMITURL 48
|
|
|
|
#define FIELD_GBPERMALINK 49
|
|
#define FIELD_GBCSENUM 50
|
|
#define FIELD_GBSECTIONHASH 51
|
|
#define FIELD_GBDOCID 52
|
|
#define FIELD_GBCONTENTHASH 53 // for deduping at spider time
|
|
#define FIELD_GBSORTBYFLOAT 54 // i.e. sortby:price -> numeric termlist
|
|
#define FIELD_GBREVSORTBYFLOAT 55 // i.e. sortby:price -> low to high
|
|
#define FIELD_GBNUMBERMIN 56
|
|
#define FIELD_GBNUMBERMAX 57
|
|
#define FIELD_GBPARENTURL 58
|
|
|
|
#define FIELD_GBSORTBYINT 59
|
|
#define FIELD_GBREVSORTBYINT 60
|
|
#define FIELD_GBNUMBERMININT 61
|
|
#define FIELD_GBNUMBERMAXINT 62
|
|
#define FIELD_GBFACETSTR 63
|
|
#define FIELD_GBFACETINT 64
|
|
#define FIELD_GBFACETFLOAT 65
|
|
#define FIELD_GBNUMBEREQUALINT 66
|
|
#define FIELD_GBNUMBEREQUALFLOAT 67
|
|
#define FIELD_SUBURL2 68
|
|
#define FIELD_GBFIELDMATCH 69
|
|
|
|
#define FIELD_GBOTHER 92
|
|
|
|
// returns a FIELD_* code above, or FIELD_GENERIC if not in the list
|
|
char getFieldCode ( char *s , int32_t len , bool *hasColon = NULL ) ;
|
|
char getFieldCode2 ( char *s , int32_t len , bool *hasColon = NULL ) ;
|
|
char getFieldCode3 ( int64_t h64 ) ;
|
|
|
|
int32_t getNumFieldCodes ( );
|
|
|
|
// . values for QueryField::m_flag
|
|
// . QTF_DUP means it is just for the help page in PageRoot.cpp to
|
|
// illustrate a second or third example
|
|
#define QTF_DUP 0x01
|
|
#define QTF_HIDE 0x02
|
|
#define QTF_BEGINNEWTABLE 0x04
|
|
|
|
struct QueryField {
|
|
char *text;
|
|
char field;
|
|
bool hasColon;
|
|
char *example;
|
|
char *desc;
|
|
char *m_title;
|
|
char m_flag;
|
|
};
|
|
|
|
extern struct QueryField g_fields[];
|
|
|
|
// reasons why we ignore a particular QueryWord's word or phrase
|
|
#define IGNORE_DEFAULT 1 // punct
|
|
#define IGNORE_CONNECTED 2 // connected sequence (cd-rom)
|
|
#define IGNORE_QSTOP 3 // query stop word (come 'to' me)
|
|
#define IGNORE_REPEAT 4 // repeated term (time after time)
|
|
#define IGNORE_FIELDNAME 5 // word is a field name, like title:
|
|
#define IGNORE_BREECH 6 // query exceeded MAX_QUERY_TERMS so we ignored part
|
|
#define IGNORE_BOOLOP 7 // boolean operator (OR,AND,NOT)
|
|
#define IGNORE_QUOTED 8 // word in quotes is ignored. "the day"
|
|
//#define IGNORE_SYNONYM 9 // part of a gbsynonym: field
|
|
|
|
// . reasons why we ignore a QueryTerm
|
|
// . we replace sequences of UOR'd terms with a compound term, which is
|
|
// created by merging the termlists of the UOR'd terms together. We store
|
|
// this compound termlist into a cache to avoid having to do the merge again.
|
|
#define IGNORE_COMPONENT 9 // if term was replaced by a compound term
|
|
|
|
// boolean query operators (m_opcode field in QueryWord)
|
|
#define OP_OR 1
|
|
#define OP_AND 2
|
|
#define OP_NOT 3
|
|
#define OP_LEFTPAREN 4
|
|
#define OP_RIGHTPAREN 5
|
|
#define OP_UOR 6
|
|
#define OP_PIPE 7
|
|
|
|
////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////
|
|
////////// BEGIN BOOLEAN STUFF /////////////
|
|
////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////
|
|
|
|
/*
|
|
// . creating a QueryBoolean class was unnecessary since it was only functional
|
|
// and had nothing new it would store that the Query class doesn't store
|
|
// . the entry point is the Query::setBitScoresBoolean() function below
|
|
|
|
// . we can have as many operands (plain opds, not expressions) as query terms
|
|
// . no, not anymore, we boosted MAX_QUERY_TERMS so we can have UORs which
|
|
// essentially make a bunch of terms use the same explicit bit
|
|
//#define MAX_OPERANDS 16
|
|
#define MAX_OPERANDS (MAX_QUERY_TERMS)
|
|
|
|
class Operand {
|
|
public:
|
|
int32_t set ( int32_t a , int32_t b , class QueryWord *qwords , int32_t level ,
|
|
bool underNOT ) ;
|
|
// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
|
|
// . Operand::m_opBits is the required bits for operand to be true
|
|
// . does not include signless phrases
|
|
//bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) {
|
|
bool isTruth ( unsigned char *bitVec , int32_t vecSize ) {
|
|
// must always satisfy hard required terms (+ sign)
|
|
//if ( (bits & m_forcedBits) != m_forcedBits )
|
|
// return false;
|
|
//if (m_hasNOT) return (bits & m_opBits & mask) == 0;
|
|
//return ( (bits & m_opBits & mask) == (m_opBits & mask));
|
|
if ( m_hasNOT ) {
|
|
for ( int32_t i = 0 ; i < vecSize ; i++ )
|
|
if ( m_opBits[i] & bitVec[i] ) return false;
|
|
return true;
|
|
}
|
|
for ( int32_t i = 0 ; i < vecSize ; i++ )
|
|
if ( m_opBits[i] & bitVec[i] ) return true;
|
|
return false;
|
|
// . we are now back to good ol' default OR
|
|
// . m_opBits should have been masked with
|
|
// m_requiredBits so as not to include signless phrases
|
|
//return ( (bits & m_opBits) != 0 );
|
|
};
|
|
void print (SafeBuf *sbuf);
|
|
// we are a sequence of QueryWords
|
|
//int32_t m_startWordNum;
|
|
//int32_t m_lastWordNum;
|
|
// . doc just needs one of these bits for this op to be considered true
|
|
// . terms under the same QueryTermInfo class should have the same
|
|
// termbit here
|
|
unsigned char m_opBits[MAX_OVEC_SIZE];
|
|
//int32_t m_vecSize;
|
|
// does the word NOT preceed the operand?
|
|
bool m_hasNOT;
|
|
//class Expression *m_parent;
|
|
|
|
// we MUST have these for this OPERAND to be true
|
|
//uint16_t m_forcedBits;
|
|
};
|
|
*/
|
|
|
|
|
|
////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////
|
|
////////// END BOOLEAN STUFF /////////////
|
|
////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////
|
|
|
|
|
|
#define MAX_FACET_RANGES 256
|
|
|
|
// . these first two classes are functionless
|
|
// . QueryWord, like the Phrases class, is an extension on the Words class
|
|
// . the array of QueryWords, m_qwords[], is contained in the Query class
|
|
// . we compute the QueryTerms (m_qterms[]) from the QueryWords
|
|
class QueryWord {
|
|
|
|
public:
|
|
bool isAlphaWord() { return is_alnum_utf8(m_word); };
|
|
bool hasWhiteSpace() {
|
|
char *p = m_word;
|
|
char *pend = m_word + m_wordLen;
|
|
for ( ; p < pend ; p += getUtf8CharSize ( p ) )
|
|
if ( is_wspace_utf8 ( p ) ) return true;
|
|
return false;
|
|
};
|
|
void constructor ();
|
|
void destructor ();
|
|
|
|
//UCScript wordScript() {
|
|
// UChar*foo;
|
|
// return ucGetScript(utf16Decode((UChar*)(m_word),&foo));
|
|
//}
|
|
|
|
// this ptr references into the actual query
|
|
char *m_word ;
|
|
int32_t m_wordLen ;
|
|
// the length of the phrase, if any. it starts at m_word
|
|
int32_t m_phraseLen;
|
|
// this is the term hash with collection and field name and
|
|
// can be looked up directly in indexdb
|
|
int64_t m_wordId ;
|
|
int64_t m_phraseId;
|
|
// hash of field name then collection, used to hash termId
|
|
int64_t m_prefixHash;
|
|
int32_t m_wordNum;
|
|
int32_t m_posNum;
|
|
// are we in a phrase in a wikipedia title?
|
|
int32_t m_wikiPhraseId;
|
|
int32_t m_wikiPhraseStart;
|
|
int32_t m_numWordsInWikiPhrase;
|
|
|
|
// . this is just the hash of m_term and is used for highlighting, etc.
|
|
// . it is 0 for terms in a field?
|
|
int64_t m_rawWordId ;
|
|
int64_t m_rawPhraseId ;
|
|
// if we are phrase, the end word's raw id
|
|
int64_t m_rightRawWordId;
|
|
// the field as a convenient numeric code
|
|
char m_fieldCode ;
|
|
// . '-' means to exclude from search results
|
|
// . '+' means to include in all search results
|
|
// . if we're a phrase term, signs distribute across quotes
|
|
char m_wordSign;
|
|
char m_phraseSign;
|
|
// this is 1 if the associated word is a valid query term but its
|
|
// m_explicitBit is 0. we use this to save explicit bits for those
|
|
// terms that need them (like those terms in complicated nested boolean
|
|
// expressions) and just use a hardCount to see how many hard required
|
|
// terms are contained by a document. see IndexTable.cpp "hardCount"
|
|
char m_hardCount;
|
|
// the parenthetical level of this word in the boolean expression.
|
|
// level 0 is the first level.
|
|
char m_level;
|
|
// . how many plusses preceed this query term?
|
|
// . the more plusses the more weight it is given
|
|
//char m_numPlusses ;
|
|
// is this word a query stop word?
|
|
bool m_isQueryStopWord ;
|
|
// is it a plain stop word?
|
|
bool m_isStopWord ;
|
|
bool m_isPunct;
|
|
// are we an op code?
|
|
char m_opcode;
|
|
// . the ignore code
|
|
// . explains why this query term should be ignored
|
|
// . see #define'd IGNORE_* codes above
|
|
char m_ignoreWord ;
|
|
char m_ignorePhrase ;
|
|
|
|
// so we ignore gbsortby:offerprice in bool expressions
|
|
char m_ignoreWordInBoolQuery;
|
|
|
|
// is this query single word in quotes?
|
|
bool m_inQuotes ;
|
|
// is this word in a phrase that is quoted?
|
|
bool m_inQuotedPhrase;
|
|
// what word # does the quote we are in start at?
|
|
int32_t m_quoteStart;
|
|
int32_t m_quoteEnd; // inclusive!
|
|
// are we connected to the alnum word on our left/right?
|
|
bool m_leftConnected;
|
|
bool m_rightConnected;
|
|
// if we're in middle or right end of a phrase, where does it start?
|
|
int32_t m_leftPhraseStart;
|
|
// . what QueryTerm does our "phrase" map to? NULL if none.
|
|
// . this allows us to OR in extra bits into that QueryTerm's m_bits
|
|
// member that correspond to the single word constituents
|
|
// . remember, m_bits is a bit vector that represents the QueryTerms
|
|
// a document contains
|
|
class QueryTerm *m_queryPhraseTerm;
|
|
// . what QueryTerm does our "word" map to? NULL if none.
|
|
// . used by QueryBoolean since it uses QueryWords heavily
|
|
class QueryTerm *m_queryWordTerm;
|
|
// user defined weights
|
|
|
|
int32_t m_userWeight;
|
|
char m_userType;
|
|
float m_userWeightPhrase;
|
|
|
|
char m_userTypePhrase;
|
|
bool m_queryOp;
|
|
// is it after a NOT operator? i.e. NOT ( x UOR y UOR ... )
|
|
bool m_underNOT;
|
|
// is this query word before a | (pipe) operator?
|
|
bool m_piped;
|
|
// used by Matches.cpp for highlighting under different colors
|
|
int32_t m_colorNum;
|
|
|
|
// for min/max score ranges like gbmin:price:1.99
|
|
float m_float;
|
|
// for gbminint:99 etc. uses integers instead of floats for better res
|
|
int32_t m_int;
|
|
|
|
// for holding some synonyms
|
|
SafeBuf m_synWordBuf;
|
|
|
|
|
|
int32_t m_facetRangeIntA [MAX_FACET_RANGES];
|
|
int32_t m_facetRangeIntB [MAX_FACET_RANGES];
|
|
float m_facetRangeFloatA [MAX_FACET_RANGES];
|
|
float m_facetRangeFloatB [MAX_FACET_RANGES];
|
|
int32_t m_numFacetRanges;
|
|
|
|
|
|
// what operand bit # is it for doing boolean queries?
|
|
//int32_t m_opBitNum;
|
|
// when an operand is an expression...
|
|
class Expression *m_expressionPtr;
|
|
|
|
void clear() {
|
|
|
|
m_word = {};
|
|
m_wordLen = {};
|
|
m_phraseLen = {};
|
|
m_wordId = {};
|
|
m_phraseId = {};
|
|
m_prefixHash = {};
|
|
m_wordNum = {};
|
|
m_posNum = {};
|
|
m_wikiPhraseId = {};
|
|
m_wikiPhraseStart = {};
|
|
m_numWordsInWikiPhrase = {};
|
|
m_rawWordId = {};
|
|
m_rawPhraseId = {};
|
|
m_rightRawWordId = {};
|
|
m_fieldCode = {};
|
|
m_wordSign = {};
|
|
m_phraseSign = {};
|
|
m_hardCount = {};
|
|
m_level = {};
|
|
m_isQueryStopWord = {};
|
|
m_isStopWord = {};
|
|
m_isPunct = {};
|
|
m_opcode = {};
|
|
m_ignoreWord = {};
|
|
m_ignorePhrase = {};
|
|
m_ignoreWordInBoolQuery = {};
|
|
m_inQuotes = {};
|
|
m_inQuotedPhrase = {};
|
|
m_quoteStart = {};
|
|
m_quoteEnd = {};
|
|
m_leftConnected = {};
|
|
m_rightConnected = {};
|
|
m_leftPhraseStart = {};
|
|
m_queryPhraseTerm = {};
|
|
m_queryWordTerm = {};
|
|
m_userWeight = {};
|
|
m_userType = {};
|
|
m_userWeightPhrase = {};
|
|
m_userTypePhrase = {};
|
|
m_queryOp = {};
|
|
m_underNOT = {};
|
|
m_piped = {};
|
|
m_colorNum = {};
|
|
m_float = {};
|
|
m_int = {};
|
|
m_synWordBuf = {};
|
|
m_facetRangeIntA [MAX_FACET_RANGES] = {};
|
|
m_facetRangeIntB [MAX_FACET_RANGES] = {};
|
|
m_facetRangeFloatA [MAX_FACET_RANGES] = {};
|
|
m_facetRangeFloatB [MAX_FACET_RANGES] = {};
|
|
m_numFacetRanges = {};
|
|
m_expressionPtr = {};
|
|
}
|
|
};
|
|
|
|
// . we filter the QueryWords and turn them into QueryTerms
|
|
// . QueryTerms are the important parts of the QueryWords
|
|
class QueryTerm {
|
|
|
|
public:
|
|
|
|
//QueryTerm ( ) { constructor(); };
|
|
|
|
void constructor ( ) ;
|
|
|
|
// the query word we were derived from
|
|
QueryWord *m_qword;
|
|
// . are we a phrase termid or single word termid from that QueryWord?
|
|
// . the QueryWord instance represents both, so we must choose
|
|
bool m_isPhrase;
|
|
// for compound phrases like, "cat dog fish" we do not want docs
|
|
// with "cat dog" and "dog fish" to match, so we extended our hackfix
|
|
// in Summary.cpp to use m_phrasePart to do this post-query filtering
|
|
int32_t m_phrasePart;
|
|
// this is phraseId for phrases, and wordId for words
|
|
int64_t m_termId;
|
|
// used by Matches.cpp
|
|
int64_t m_rawTermId;
|
|
|
|
// . if we are a phrase these are the termids of the word that
|
|
// starts the phrase and the word that ends the phrase respectively
|
|
int64_t m_rightRawWordId;
|
|
int64_t m_leftRawWordId;
|
|
|
|
// sign of the phrase or word we used
|
|
char m_termSign;
|
|
// our representative bit (up to 16 MAX_QUERY_TERMS)
|
|
//uint16_t m_explicitBit;
|
|
qvec_t m_explicitBit;
|
|
// usually this equal m_explicitBit, BUT if a word is repeated
|
|
// in different areas of the doc, we union all the individual
|
|
// explicit bits of that repeated word into this bit vec. it is
|
|
// used by Matches.cpp only so far.
|
|
qvec_t m_matchesExplicitBits;
|
|
// this is 1 if the associated word is a valid query term but its
|
|
// m_explicitBit is 0. we use this to save explicit bits for those
|
|
// terms that need them (like those terms in complicated nested boolean
|
|
// expressions) and just use a hardCount to see how many hard required
|
|
// terms are contained by a document. see IndexTable.cpp "hardCount"
|
|
char m_hardCount;
|
|
|
|
// the "number" of the query term used for evaluation boolean
|
|
// expressions in Expression::isTruth(). Basically just the
|
|
// QueryTermInfo for which this query term belongs. each QueryTermInfo
|
|
// is like a single query term and all its synonyms, etc.
|
|
int32_t m_bitNum;
|
|
|
|
// point to term, either m_word or m_phrase
|
|
char *m_term;
|
|
int32_t m_termLen;
|
|
// point to the posdblist that represents us
|
|
class RdbList *m_posdbListPtr;
|
|
|
|
// languages query term is in. currently this is only valid for
|
|
// synonyms of other query terms. so we can show what language the
|
|
// synonym is for in the xml/json feed.
|
|
uint64_t m_langIdBits;
|
|
bool m_langIdBitsValid;
|
|
|
|
// the ()'s following an int/float facet term dictate the
|
|
// ranges for clustering the numeric values. like
|
|
// gbfacetfloat:price:(0-10,10-20,...)
|
|
// values outside the ranges will be ignored
|
|
char *m_parenList;
|
|
int32_t m_parenListLen;
|
|
|
|
int32_t m_componentCode;
|
|
int64_t m_termFreq;
|
|
float m_termFreqWeight;
|
|
|
|
// . our representative bits
|
|
// . the bits in this bit vector is 1-1 with the QueryTerms
|
|
// . if a doc has query term #i then bit #i will be set
|
|
// . if a doc EXplicitly has phrase "A B" then it may have
|
|
// term A and term B implicitly
|
|
// . therefore we also OR the bits for term A and B into m_implicitBits
|
|
// . THIS SHIT SHOULD be just used in setBitScores() !!!
|
|
//uint16_t m_implicitBits;
|
|
qvec_t m_implicitBits;
|
|
// Summary.cpp and Matches.cpp use this one
|
|
bool m_isQueryStopWord ;
|
|
// IndexTable.cpp uses this one
|
|
bool m_inQuotes;
|
|
// . is this term under the influence of a boolean NOT operator?
|
|
// . used in IndexReadInfo.cpp, if so we must read the WHOLE termlist
|
|
bool m_underNOT;
|
|
// is it a repeat?
|
|
char m_repeat;
|
|
// user defined weight for this term, be it phrase or word
|
|
float m_userWeight;
|
|
char m_userType;
|
|
// . is this query term before a | (pipe) operator?
|
|
// . if so we must read the whole termlist, like m_underNOT above
|
|
bool m_piped;
|
|
// . we ignore component terms unless their compound term is not cached
|
|
// . now this is used to ignore low tf synonym terms only
|
|
char m_ignored ;
|
|
// is it part of a UOR chain?
|
|
bool m_isUORed;
|
|
QueryTerm *m_UORedTerm;
|
|
// . if synonymOf is not NULL, then m_term points into m_synBuf, not
|
|
// m_buf
|
|
//int32_t m_affinity; // affinity to the synonym
|
|
QueryTerm *m_synonymOf;
|
|
int64_t m_synWids0;
|
|
int64_t m_synWids1;
|
|
int32_t m_numAlnumWordsInSynonym;
|
|
// like if we are the "nj" syn of "new jersey", this will be 2 words
|
|
// since "new jersey", our base, is 2 alnum words.
|
|
int32_t m_numAlnumWordsInBase;
|
|
// the phrase affinity from the wikititles.txt file used in Wiki.cpp
|
|
//float m_wikiAff ;
|
|
// if later, after getting a more accurate term freq because we
|
|
// actually download the termlist, its term freq drops a lot, we may
|
|
// end up filtering it in Query::filterSynonyms() called by Msg39. in
|
|
// which case the termlist is reset to 0 so it does not play a role
|
|
// in the search results computations in IndexTable2.cpp.
|
|
//char m_isFilteredSynonym;
|
|
// copied from derived QueryWord
|
|
char m_fieldCode ;
|
|
bool isSplit();
|
|
// . weights and affinities calculated in IndexTable2
|
|
// . do not store in here, just pass along as a separate vector
|
|
// . analogous to how Phrases is to Words is to Bits, etc.
|
|
//float m_termWeight;
|
|
//float m_phraseAffinity;
|
|
bool m_isRequired;
|
|
// . true if we are a word IN a phrase
|
|
// . used by IndexTable2's getWeightedScore()
|
|
char m_inPhrase;
|
|
char m_isWikiHalfStopBigram:1;
|
|
// if a single word term, what are the term #'s of the 2 phrases
|
|
// we can be in? uses -1 to indicate none.
|
|
int32_t m_leftPhraseTermNum;
|
|
int32_t m_rightPhraseTermNum;
|
|
// . what operand # are we a part of in a boolean query?
|
|
// . like for (x AND y) x would have an opNum of 0 and y an
|
|
// opNum of 1 for instance.
|
|
// . for things like (x1 OR x2 OR x3 ... ) we try to give all
|
|
// those query terms the same m_opNum for efficiency since
|
|
// they all have the same effecct
|
|
//int32_t m_opNum;
|
|
|
|
// same as above basically
|
|
class QueryTerm *m_leftPhraseTerm;
|
|
class QueryTerm *m_rightPhraseTerm;
|
|
// for scoring summary sentences from XmlDoc::getEventSummary()
|
|
float m_score;
|
|
|
|
// a queryTermInfo class is multiple "related"/synonym terms.
|
|
// we have an array of these we set in Posdb.cpp:setQueryTermInfo().
|
|
int m_queryTermInfoNum;
|
|
|
|
// facet support in Posdb.cpp for compiling the data and we'll
|
|
// send this back via Msg39Reply::ptr_facetHashList which will be
|
|
// 1-1 with the query terms.
|
|
HashTableX m_facetHashTable;
|
|
|
|
// for sorting the facetEntries in m_facetHashTable
|
|
SafeBuf m_facetIndexBuf;
|
|
|
|
char m_startKey[MAX_KEY_BYTES];
|
|
char m_endKey [MAX_KEY_BYTES];
|
|
char m_ks;
|
|
|
|
// used by Msg40.cpp for gigabits generation
|
|
int64_t m_hash64d;
|
|
int32_t m_popWeight;
|
|
|
|
uint64_t m_numDocsThatHaveFacet;
|
|
};
|
|
|
|
//#define MAX_OPSLOTS 256
|
|
|
|
#define MAX_EXPRESSIONS 100
|
|
|
|
// operand1 AND operand2 OR ...
|
|
// operand1 OR operand2 AND ...
|
|
class Expression {
|
|
public:
|
|
bool addExpression (int32_t start,
|
|
int32_t end,
|
|
class Query *q,
|
|
int32_t level );
|
|
bool isTruth ( unsigned char *bitVec , int32_t vecSize );
|
|
// . what QueryTerms are UNDER the influence of the NOT opcode?
|
|
// . we read in the WHOLE termlist of those that are (like '-' sign)
|
|
// . returned bit vector is 1-1 with m_qterms in Query class
|
|
void print (SafeBuf *sbuf);
|
|
// . a list of operands separated by op codes (a AND b OR c ...)
|
|
// . sometimes and operand is another expression: a AND (b OR c)
|
|
// . use NULL in m_operands slot if we got an expression and vice versa
|
|
// . m_opcodes[i] is the opcode after operand #i
|
|
//class Expression *m_parent;
|
|
//bool m_hasNOT;
|
|
//int32_t m_start;
|
|
//int32_t m_end;
|
|
bool m_hadOpCode;
|
|
int32_t m_expressionStartWord;
|
|
int32_t m_numWordsInExpression;
|
|
Query *m_q;
|
|
// . opSlots can be operands operators or expressions
|
|
// . m_opTypes tells which of the 3 they are
|
|
//int32_t m_opSlots[MAX_OPSLOTS];
|
|
//char m_opTypes[MAX_OPSLOTS];
|
|
//int32_t m_cc;
|
|
};
|
|
|
|
// . this is the main class for representing a query
|
|
// . it contains array of QueryWords (m_qwords[]) and QueryTerms (m_qterms[])
|
|
class Query {
|
|
|
|
public:
|
|
void reset();
|
|
|
|
Query();
|
|
~Query();
|
|
void constructor();
|
|
void destructor();
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . after calling this you can call functions below
|
|
// . if boolFlag is 0 we ignore all boolean operators
|
|
// . if boolFlag is 1 we assume query is boolean
|
|
// . if boolFlag is 2 we attempt to detect if query is boolean or not
|
|
bool set2 ( char *query ,
|
|
//int32_t queryLen ,
|
|
//char *coll ,
|
|
//int32_t collLen ,
|
|
uint8_t langId ,
|
|
char queryExpansion ,
|
|
bool useQueryStopWords = true ,
|
|
//char boolFlag = 2 , // auto-detect if boolean query
|
|
//bool keepAllSingles = false ,
|
|
int32_t maxQueryTerms = 0x7fffffff );
|
|
|
|
// serialize/deserialize ourselves so we don't have to pass the
|
|
// unmodified string around and reparse it every time
|
|
int32_t getStoredSize();
|
|
int32_t serialize(char *buf, int32_t bufLen);
|
|
int32_t deserialize(char *buf, int32_t bufLen);
|
|
|
|
// . if a term is truncated in indexdb, change its '+' sign to a '*'
|
|
// . will recopmute m_bitScores to fix bit #7
|
|
//void softenTruncatedTerms ( );
|
|
|
|
bool setQueryTermScores ( int64_t *termFreqsArg ) ;
|
|
|
|
// about how hits for this query?
|
|
//int64_t getEstimatedTotalHits ( );
|
|
|
|
char *getQuery ( ) { return m_orig ; };
|
|
int32_t getQueryLen ( ) { return m_origLen; };
|
|
|
|
//int32_t getNumIgnored ( ) { return m_numIgnored; };
|
|
//int32_t getNumNotIgnored ( ) { return m_numTerms ; };
|
|
|
|
int32_t getNumTerms ( ) { return m_numTerms; };
|
|
char getTermSign ( int32_t i ) { return m_qterms[i].m_termSign; };
|
|
bool isPhrase ( int32_t i ) { return m_qterms[i].m_isPhrase; };
|
|
bool isInPhrase ( int32_t i ) { return m_qterms[i].m_inPhrase; };
|
|
bool isInQuotes ( int32_t i ) { return m_qterms[i].m_inQuotes; };
|
|
int64_t getTermId ( int32_t i ) { return m_qterms[i].m_termId; };
|
|
char getFieldCode2( int32_t i ) { return m_qterms[i].m_fieldCode; };
|
|
int64_t getRawTermId ( int32_t i ) { return m_qterms[i].m_rawTermId; };
|
|
char *getTerm ( int32_t i ) { return m_qterms[i].m_term; };
|
|
int32_t getTermLen ( int32_t i ) { return m_qterms[i].m_termLen; };
|
|
bool isQueryStopWord (int32_t i ) {
|
|
return m_qterms[i].m_isQueryStopWord; };
|
|
// . not HARD required, but is term #i used for an EXACT match?
|
|
// . this includes negatives and phrases with signs in addition to
|
|
// the standard signless single word query term
|
|
bool isRequired ( int32_t i ) {
|
|
if ( ! m_qterms[i].m_isPhrase ) return true;
|
|
if ( m_qterms[i].m_termSign ) return true;
|
|
return false;
|
|
};
|
|
|
|
//int32_t getNumRequired ( ) ;
|
|
bool isSplit();
|
|
|
|
bool isSplit(int32_t i) { return m_qterms[i].isSplit(); };
|
|
|
|
// . Msg39 calls this to get our vector so it can pass it to Msg37
|
|
// . the signs and ids are dupped in the QueryTerm classes, too
|
|
//int64_t *getTermFreqs ( ) { return m_termFreqs ; };
|
|
//int64_t getTermFreq ( int32_t i ) { return m_termFreqs[i]; };
|
|
//int64_t *getTermIds ( ) { return m_termIds ; };
|
|
//char *getTermSigns ( ) { return m_termSigns ; };
|
|
//int32_t *getComponentCodes ( ) { return m_componentCodes; };
|
|
int64_t getRawWordId ( int32_t i ) { return m_qwords[i].m_rawWordId;};
|
|
|
|
int32_t getNumComponentTerms ( ) { return m_numComponents; };
|
|
|
|
// sets m_bmap[][] so getImplicits() works
|
|
void setBitMap ( );
|
|
bool testBoolean(unsigned char *bits,int32_t vecSize);
|
|
// print to log
|
|
void printBooleanTree();
|
|
void printQueryTerms();
|
|
|
|
// the new way as of 3/12/2014. just determine if matches the bool
|
|
// query or not. let's try to offload the scoring logic to other places
|
|
// if possible.
|
|
// bitVec is all the QueryWord::m_opBits some docid contains, so
|
|
// does it match our boolean query or not?
|
|
bool matchesBoolQuery ( unsigned char *bitVec , int32_t vecSize ) ;
|
|
|
|
|
|
// . call this before calling getBitScore() to set m_bitScores[] table
|
|
// . returns false and sets g_errno on error (ENOMEM usually)
|
|
//bool setBitScores (qvec_t bitMask = (qvec_t)-1);
|
|
|
|
// . m_bitScores[ BITS ] maps BITS to a bitScore
|
|
// . the BITS of a doc is 1-1 with m_qterms[] present in that doc
|
|
// . bitScore returns # of required terms implicitly in the doc
|
|
// . required terms do not include query terms from signless phrases
|
|
// . if bitScore 0x80 is set it matches all forced terms (plus signs)
|
|
// . if bitScore 0x40 is set it has all required terms EXplicitly
|
|
// . if bitScore 0x20 is set it has all required terms IMplicitly
|
|
// . example query: 'cat dog' --> "cat dog"=bit#0, cat=bit#1, dog=bit#2
|
|
// . if a doc does not explicitly have 'cat', but it has the phrase
|
|
// "cat dog" then it is said to have 'cat' implicitly... implied
|
|
// through the phrase
|
|
// . the greater the number of IMplicit SINGLE words a doc has the
|
|
// bigger its bit score
|
|
/*
|
|
uint8_t getBitScore ( qvec_t ebits ) {
|
|
// get implicit bits from explicit bits
|
|
qvec_t ibits = getImplicits ( ebits );
|
|
// . boolean queries are limited in the # of terms so that
|
|
// ebits should NOT be too big, under 10MB i think now
|
|
// . sets the usual 0x80,0x40,0x20 + require termcount.
|
|
// . for boolean queries, if we have too many
|
|
// explicits then when we alloc for "need4" above, it
|
|
// should return ENOMEM and we should never make it
|
|
// here! so "ibits" should not be too big and breach
|
|
// the array!
|
|
if ( m_isBoolean ) {
|
|
//return m_bitScores[(uint32_t)ibits];
|
|
uint8_t bscore = 0;
|
|
if( testBoolean(ibits)) bscore = 0x80|0x40|0x20;
|
|
return bscore;
|
|
}
|
|
// just get those required
|
|
ibits &= m_requiredBits;
|
|
// get the vector of required bits we implicitly have
|
|
uint8_t *iv = (uint8_t *)&ibits;
|
|
// set this
|
|
uint8_t bscore ;
|
|
// . how many terms we do have implicitly?
|
|
// . the g_a table is in Mem.cpp and maps a byte to the number
|
|
// of bits it has that are in the ON position
|
|
bscore = g_a[iv[0]] + g_a[iv[1]] + g_a[iv[2]] + g_a[iv[3]]
|
|
+ g_a[iv[4]] + g_a[iv[5]] + g_a[iv[6]] + g_a[iv[7]];
|
|
|
|
// if we have synonyms, then any implied bits a synonym has
|
|
// should be treated as explicit bits for "bit score" purposes
|
|
// so that if someone searches 'cincinnati, oh' a doc with
|
|
// 'cincinnati ohio' is treated no lesser than a doc with
|
|
// 'cincinnati oh'. BUT only do this for "stem" or
|
|
// "morphological form" synonyms, because it is allows 'bib'
|
|
// for the query 'michael bibby facts' to outscore docs that
|
|
// have all the original query terms explicitly... so limit
|
|
// it to just the "stem" synonyms. BUT, if the syn affinity
|
|
// is 0, do not include these at all...
|
|
//qvec_t ebits2=ebits | getImplicits(ebits&m_synonymBitsHiAff);
|
|
qvec_t ebits2 = ebits | getImplicits(ebits&m_synonymBits);
|
|
|
|
// then OR in some high bits
|
|
if ((ebits2 & m_forcedBits) == m_forcedBits ) bscore|=0x80;
|
|
if ((ebits2 & m_requiredBits) == m_requiredBits ) bscore|=0x40;
|
|
if (ibits == m_requiredBits ) bscore|=0x20;
|
|
return bscore;
|
|
};
|
|
*/
|
|
|
|
// return an implicit vector from an explicit which contains the explic
|
|
qvec_t getImplicits ( qvec_t ebits ) {
|
|
if ( ! m_bmapIsSet ) { char *xx=NULL;*xx=0; }
|
|
uint8_t *ev = (uint8_t *)&ebits;
|
|
return m_bmap[0][ev[0]] |
|
|
m_bmap[1][ev[1]] |
|
|
m_bmap[2][ev[2]] |
|
|
m_bmap[3][ev[3]] |
|
|
m_bmap[4][ev[4]] |
|
|
m_bmap[5][ev[5]] |
|
|
m_bmap[6][ev[6]] |
|
|
m_bmap[7][ev[7]] ;
|
|
};
|
|
|
|
//qvec_t getImplicitBitsFromTermNum ( int32_t qtnum ) {
|
|
//};
|
|
|
|
// returns false if no truths possible
|
|
bool setBitScoresBoolean ( char *buf , int32_t bufSize );
|
|
|
|
// ALWAYS call this after calling setBitScores(), it uses m_bitScores[]
|
|
//int64_t getEstimatedTotalHitsBoolean ( );
|
|
|
|
// sets m_qwords[] array, this function is the heart of the class
|
|
bool setQWords ( char boolFlag , bool keepAllSingles ,
|
|
class Words &words , class Phrases &phrases ) ;
|
|
|
|
// sets m_qterms[] array from the m_qwords[] array
|
|
bool setQTerms ( class Words &words , class Phrases &phrases ) ;
|
|
|
|
// . query expansion functions, the first gets all possible candidates
|
|
// (eliminated after Msg37 returns), the second actually modifies the
|
|
// query to include new terms
|
|
//int32_t getCandidates(int32_t *synMap, int64_t *synIds, int32_t num);
|
|
//void fixTermFreqs ( int32_t *synMap ,
|
|
// int32_t numTermsAndSyns ,
|
|
// int64_t *termFreqs ) ;
|
|
//int32_t filterCandidates ( int32_t *synMap ,
|
|
// int64_t *synIds ,
|
|
// int32_t numTermsAndSyns ,
|
|
// int64_t *termFreqs ,
|
|
// char *coll ) ;
|
|
//bool expandQuery (int32_t *synMap, int64_t *synIds, int32_t num,
|
|
// int64_t *termFreqs);
|
|
|
|
// set m_expressions[] and m_operands[] arrays and m_numOperands
|
|
// for boolean queries
|
|
bool setBooleanOperands ( );
|
|
|
|
// helper funcs for parsing query into m_qwords[]
|
|
//char getFieldCode ( char *s , int32_t len , bool *hasColon ) ;
|
|
bool isConnection ( char *s , int32_t len ) ;
|
|
|
|
// set the QueryTerm::m_hasNOT members
|
|
//void setHasNOTs();
|
|
|
|
// . used by IndexTable.cpp to make a ptr map of the query terms
|
|
// to make intersecting the termlists one at a time efficient
|
|
// . "imap" is a list of the termlist numbers, but especially sorted
|
|
// . 0 <= imap[i] < m_numTerms
|
|
// . sizes[i] is the total docids for query term #i (up to the current
|
|
// tier being examined in IndexTable.cpp)
|
|
// . we set blocksize[i] only when imap[i] is a termlist which is not
|
|
// a signless phrase. it is a number, N, such that
|
|
// imap[i], imap[i+1], ... imap[i+N-1] are a "block" that has all
|
|
// the signless phrase terms that contain query term # imap[i].
|
|
// we cluster them together like this because IndexTable needs to
|
|
// hash them all together since the phrase terms can imply the single
|
|
// terms.
|
|
// . it now returns the number of terms put into imap[]
|
|
// . it sets *retNumBlocks to the number of blocks put into
|
|
// blocksizes[]
|
|
// . "sizes" is the size of each list (all tiers combined). this is
|
|
// in query term num space, not IMAP space, and must be provided by
|
|
// the caller. it is the only arg that is input, the rest are output.
|
|
int32_t getImap ( int32_t *sizes , int32_t *imap , int32_t *blocksizes ,
|
|
int32_t *retNumBlocks );
|
|
|
|
|
|
// . replace sequences of UOR'd terms with a single termid
|
|
// . the sequence of UOR'd terms are the component terms
|
|
// . the term that replaces that sequence is the compound term
|
|
// . the compound termlist will be a merge of the components' termlists
|
|
// . this sets the component terms QueryTerm::m_ignore char to true
|
|
// when they are replaced by a compound term
|
|
// . ensures compound term inherits the common QueryTerm::m_explicitBit
|
|
// from the component terms it replaced
|
|
// . ensures the compound term's m_termFreqs[i] is the sum of the
|
|
// components' termFreqs
|
|
// . sets QueryTerm::m_component and QueryTerm::m_compound respectively
|
|
//void addCompoundTerms ();
|
|
|
|
public:
|
|
|
|
// hash of all the query terms
|
|
int64_t getQueryHash();
|
|
|
|
bool isCompoundTerm ( int32_t i ) ;
|
|
|
|
class QueryTerm *getQueryTermByTermId64 ( int64_t termId ) {
|
|
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
|
|
if ( m_qterms[i].m_termId == termId )
|
|
return &m_qterms[i];
|
|
}
|
|
return NULL;
|
|
};
|
|
|
|
// for debugging fhtqt mem leak
|
|
char *m_st0Ptr;
|
|
|
|
// silly little functions that support the BIG HACK
|
|
//int32_t getNumNonFieldedSingletonTerms() { return m_numTermsSpecial; };
|
|
//int32_t getTermsFound ( Query *q , char *foundTermVector ) ;
|
|
|
|
// return -1 if does not exist in query, otherwise return the
|
|
// query word num
|
|
int32_t getWordNum ( int64_t wordId );
|
|
|
|
// this is now just used for boolean queries to determine if a docid
|
|
// is a match or not
|
|
unsigned char *m_bitScores ;
|
|
int32_t m_bitScoresSize;
|
|
// . map explicit bits vector to implied bits vector
|
|
// . like m_bitScores but simpler
|
|
//qvec_t *m_bmap ;
|
|
//int32_t m_bmapSize;
|
|
|
|
// one bmap per byte of qvec_t
|
|
qvec_t m_bmap[sizeof(qvec_t)][256];
|
|
|
|
// . bit vector that is 1-1 with m_qterms[]
|
|
// . only has bits that we must have if we were default AND
|
|
//uint16_t m_requiredBits;
|
|
qvec_t m_requiredBits;
|
|
qvec_t m_matchRequiredBits;
|
|
qvec_t m_negativeBits;
|
|
qvec_t m_forcedBits;
|
|
// bit vector for terms that are synonyms
|
|
qvec_t m_synonymBits;
|
|
int32_t m_numRequired;
|
|
|
|
// language of the query
|
|
uint8_t m_langId;
|
|
|
|
bool m_useQueryStopWords;
|
|
|
|
// use a generic buffer for m_qwords and m_expressions to point into
|
|
// so we don't have to malloc for them
|
|
char m_gbuf [ GBUF_SIZE ];
|
|
char *m_gnext;
|
|
|
|
QueryWord *m_qwords ; // [ MAX_QUERY_WORDS ];
|
|
int32_t m_numWords;
|
|
int32_t m_qwordsAllocSize;
|
|
|
|
// QueryWords are converted to QueryTerms
|
|
//QueryTerm m_qterms [ MAX_QUERY_TERMS ];
|
|
int32_t m_numTerms;
|
|
int32_t m_numTermsSpecial;
|
|
|
|
int32_t m_numTermsUntruncated;
|
|
|
|
// separate vectors for easier interfacing, 1-1 with m_qterms
|
|
//int64_t m_termFreqs [ MAX_QUERY_TERMS ];
|
|
//int64_t m_termIds [ MAX_QUERY_TERMS ];
|
|
//char m_termSigns [ MAX_QUERY_TERMS ];
|
|
//int32_t m_componentCodes [ MAX_QUERY_TERMS ];
|
|
//char m_ignore [ MAX_QUERY_TERMS ]; // is term ignored?
|
|
SafeBuf m_stackBuf;
|
|
QueryTerm *m_qterms ;
|
|
//int64_t *m_termIds ;
|
|
//char *m_termSigns ;
|
|
//int32_t *m_componentCodes ;
|
|
//char *m_ignore ; // is term ignored?
|
|
|
|
int32_t m_numComponents;
|
|
|
|
// how many bits in the full vector?
|
|
//int32_t m_numExplicitBits;
|
|
|
|
// how many terms are we ignoring?
|
|
//int32_t m_numIgnored;
|
|
|
|
// site: field will disable site clustering
|
|
// ip: field will disable ip clustering
|
|
// site:, ip: and url: queries will disable caching
|
|
bool m_hasPositiveSiteField;
|
|
bool m_hasIpField;
|
|
bool m_hasUrlField;
|
|
bool m_hasSubUrlField;
|
|
bool m_hasIlinkField;
|
|
bool m_hasGBLangField;
|
|
bool m_hasGBCountryField;
|
|
char m_hasQuotaField;
|
|
|
|
// query id set by Msg39.cpp
|
|
int32_t m_qid;
|
|
|
|
// . we set this to true if it is a boolean query
|
|
// . when calling Query::set() above you can tell it explicitly
|
|
// if query is boolean or not, OR you can tell it to auto-detect
|
|
// by giving different values to the "boolFlag" parameter.
|
|
bool m_isBoolean;
|
|
|
|
int32_t m_synTerm; // first term that's a synonym
|
|
class SynonymInfo *m_synInfo;
|
|
int32_t m_synInfoAllocSize;
|
|
|
|
// if they got a gbdocid: in the query and it's not boolean, set these
|
|
int64_t m_docIdRestriction;
|
|
class Host *m_groupThatHasDocId;
|
|
|
|
// for holding the filtered query, in utf8
|
|
//char m_buf [ MAX_QUERY_LEN ];
|
|
//int32_t m_bufLen;
|
|
|
|
// for holding the filtered query, in utf8
|
|
SafeBuf m_sb;
|
|
char m_tmpBuf3[128];
|
|
|
|
// for holding the filtered/NULL-terminated query for doing
|
|
// matching. basically store phrases in here without punct
|
|
// so we can point a needle to them for matching in XmlDoc.cpp.
|
|
//char m_needleBuf [ MAX_QUERY_LEN + 1 ];
|
|
//int32_t m_needleBufLen;
|
|
|
|
// the original query
|
|
//char m_orig [ MAX_QUERY_LEN ];
|
|
//int32_t m_origLen;
|
|
|
|
char *m_orig;
|
|
int32_t m_origLen;
|
|
SafeBuf m_osb;
|
|
char m_otmpBuf[128];
|
|
|
|
// we just have a ptr to this so don't pull the rug out
|
|
//char *m_coll;
|
|
//int32_t m_collLen;
|
|
|
|
// . we now contain the parsing components for boolean queries
|
|
// . m_expressions points into m_gbuf or is allocated
|
|
//class Expression *m_expressions; // [ MAX_OPERANDS ];
|
|
//int32_t m_expressionsAllocSize;
|
|
Expression m_expressions[MAX_EXPRESSIONS];
|
|
int32_t m_numExpressions;
|
|
//class Operand m_operands [ MAX_OPERANDS ];
|
|
//int32_t m_numOperands ;
|
|
|
|
// does query contain the pipe operator
|
|
bool m_piped;
|
|
|
|
int32_t m_maxQueryTerms ;
|
|
|
|
bool m_queryExpansion;
|
|
|
|
bool m_truncated;
|
|
bool m_hasDupWords;
|
|
|
|
bool m_hasUOR;
|
|
bool m_hasLinksOperator;
|
|
|
|
bool m_bmapIsSet ;
|
|
|
|
bool m_hasSynonyms;
|
|
|
|
SafeBuf m_debugBuf;
|
|
|
|
void *m_containingParent;
|
|
};
|
|
/*
|
|
class QueryScores {
|
|
public:
|
|
QueryScores(){};
|
|
~QueryScores(){};
|
|
bool set(Query *q);
|
|
void reset();
|
|
|
|
int32_t getNumTerms ( ) { return m_numTerms; } ;
|
|
int32_t getScore ( int32_t i ) { return m_scores[i]; } ;
|
|
void setScore (int32_t i, int32_t score) {m_scores[i] = score; };
|
|
int64_t getTermId ( int32_t i ) { return m_q->getTermId(i); } ;
|
|
//int64_t getWordId ( int32_t i ) { return m_wordIds[i]; } ;
|
|
private:
|
|
Query *m_q;
|
|
int32_t m_numTerms;
|
|
int64_t *m_freqs;
|
|
int32_t m_termPtrs [ MAX_QUERY_TERMS ];
|
|
int32_t m_scores [ MAX_QUERY_TERMS ];
|
|
//int32_t m_wordIds [ MAX_QUERY_TERMS ];
|
|
};
|
|
*/
|
|
|
|
bool queryTest();
|
|
|
|
#endif
|