more bool query fixes

This commit is contained in:
Matt Wells
2014-03-18 10:44:56 -07:00
parent 6e23d37e47
commit 3b97682cc3
5 changed files with 261 additions and 303 deletions

@ -661,6 +661,7 @@ void gotListsWrapper ( void *state ) {
Msg39 *THIS = (Msg39 *) state;
// . hash the lists into our index table
// . this will send back a reply or recycle and read more list data
if ( ! THIS->gotLists ( true ) ) return;
// . if he did not block and there was an errno we send reply
@ -671,6 +672,12 @@ void gotListsWrapper ( void *state ) {
log("msg39: sending back error reply = %s",mstrerror(g_errno));
sendReply ( THIS->m_slot , THIS , NULL , 0 , 0 ,true);
}
// no, block? call the docid split loop
//if ( numDocIdSplits <= 1 ) return;
// if we get the lists and processed them without blocking, repeat!
THIS->doDocIdSplitLoop();
}
// . now come here when we got the necessary index lists
@ -815,6 +822,7 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
// time it
diff = gettimeofdayInMilliseconds() - start;
if ( diff > 10 ) log("query: Took %lli ms for intersection",diff);
// returns false if blocked, true otherwise
return addedLists ();
}

@ -4453,6 +4453,7 @@ bool PosdbTable::setQueryTermInfo ( ) {
//
m_minListSize = 0;
m_minListi = -1;
long long grand = 0LL;
// hopefully no more than 100 sublists per term
//char *listEnds [ MAX_QUERY_TERMS ][ MAX_SUBLISTS ];
// set ptrs now i guess
@ -4465,6 +4466,8 @@ bool PosdbTable::setQueryTermInfo ( ) {
if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) continue;
// add to it
total = qti->m_totalSubListsSize;
// add up this now
grand += total;
// get min
if ( total < m_minListSize || m_minListi == -1 ) {
m_minListSize = total;
@ -4485,6 +4488,10 @@ bool PosdbTable::setQueryTermInfo ( ) {
long maxDocIds = m_minListSize / 12;
// store all intersected docids in here for new algo plus 1 byte vote
long need = maxDocIds * 6;
// they could all be OR'd together!
if ( m_q->m_isBoolean ) need = grand;
// get max # of docids we got in an intersection from all the lists
if ( ! m_docIdVoteBuf.reserve ( need,"divbuf" ) ) return false;
@ -4494,7 +4501,7 @@ bool PosdbTable::setQueryTermInfo ( ) {
// the bit vector in a truth table
long maxSlots = maxDocIds * 2;
// get total operands we used
long numOperands = m_q->m_numOperands;
long numOperands = m_q->m_numWords;//Operands;
// a quoted phrase counts as a single operand
m_vecSize = numOperands / 8 ;
// allow an extra byte for remainders
@ -4504,7 +4511,7 @@ bool PosdbTable::setQueryTermInfo ( ) {
! m_bt.set (8,m_vecSize,maxSlots,NULL,0,false,0,"booltbl"))
return false;
if ( m_q->m_isBoolean &&
! m_ct.set (8,2 * (1<<numOperands),maxSlots,NULL,0,false,0,
! m_ct.set (8,1,maxSlots,NULL,0,false,0,
"booltbl"))
return false;
@ -5527,6 +5534,11 @@ void PosdbTable::intersectLists10_r ( ) {
// is this right?
if ( docIdPtr >= docIdEnd ) goto done;
if ( m_q->m_isBoolean ) {
minScore = 1.0;
goto boolJump;
}
// assume all sublists exhausted for this query term
//docId = *(long long *)docIdPtr;
@ -6547,6 +6559,8 @@ void PosdbTable::intersectLists10_r ( ) {
goto advance;
boolJump:
// try dividing it by 3! (or multiply by .33333 faster)
score = minScore * (((float)siteRank)*SITERANKMULTIPLIER+1.0);
@ -7043,8 +7057,8 @@ bool PosdbTable::makeDocIdVoteBufForBoolQuery_r ( ) {
// get the query word
QueryWord *qw = qt->m_qword;
// and the operand # from that
long opNum = qw->m_opNum;
// just use the word # now
long opNum = qw->m_wordNum;//opNum;
// do not consider for adding if negative ('my house -home')
//if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) continue;
@ -7102,6 +7116,8 @@ bool PosdbTable::makeDocIdVoteBufForBoolQuery_r ( ) {
}
char *dst = m_docIdVoteBuf.getBufStart();
// . now our hash table is filled with all the docids
// . evaluate each bit vector
for ( long i = 0 ; i < m_bt.m_numSlots ; i++ ) {
@ -7121,8 +7137,10 @@ bool PosdbTable::makeDocIdVoteBufForBoolQuery_r ( ) {
long long *d = (long long *)m_bt.getKeyFromSlot(i);
if ( m_debug ) log("query: addind d=%llu vec[0]=%lx",
*d,(long)vec[0]);
// an 8 byte key means you pass
m_docIdVoteBuf.safeMemcpy ( &d , 6 );
// a 6 byte key means you pass
*(long *) dst = *(long *) d;
*(short *)(dst+4) = *(short *)((char *)d+4);
dst += 6;
}
// evaluate the vector
char include = m_q->matchesBoolQuery ( (unsigned char *)vec ,
@ -7132,13 +7150,18 @@ bool PosdbTable::makeDocIdVoteBufForBoolQuery_r ( ) {
long long *d = (long long *)m_bt.getKeyFromSlot(i);
if ( m_debug ) log("query: addind d=%llu vec[0]=%lx",
*d,(long)vec[0]);
// an 8 byte key means you pass
m_docIdVoteBuf.safeMemcpy ( &d , 6 );
// a 6 byte key means you pass
*(long *) dst = *(long *) d;
*(short *)(dst+4) = *(short *)((char *)d+4);
dst += 6;
}
// store in hash table
m_ct.addKey ( &h64 , &include );
}
// update SafeBuf::m_length
m_docIdVoteBuf.setLength ( dst - m_docIdVoteBuf.getBufStart() );
// now sort the docids. TODO: break makeDocIdVoteBufForBoolQuery_r()
// up into docid ranges so we have like 1/100th the # of docids to
// sort. that should make this part a lot faster.

414
Query.cpp

@ -24,11 +24,11 @@ void Query::constructor ( ) {
//m_bmap = NULL;
m_bitScores = NULL;
m_qwords = NULL;
m_expressions = NULL;
//m_expressions = NULL;
m_qwordsAllocSize = 0;
m_expressionsAllocSize = 0;
//m_expressionsAllocSize = 0;
m_qwords = NULL;
m_expressions = NULL;
//m_expressions = NULL;
reset ( );
}
@ -46,7 +46,7 @@ void Query::reset ( ) {
m_bufLen = 0;
m_origLen = 0;
m_numWords = 0;
m_numOperands = 0;
//m_numOperands = 0;
m_numTerms = 0;
m_synTerm = 0;
//m_numIgnored = 0;
@ -60,14 +60,14 @@ void Query::reset ( ) {
m_bitScores = NULL;
//m_bmapSize = 0;
m_bitScoresSize = 0;
if ( m_expressionsAllocSize )
mfree ( m_expressions , m_expressionsAllocSize , "Query3" );
//if ( m_expressionsAllocSize )
// mfree ( m_expressions , m_expressionsAllocSize , "Query3" );
if ( m_qwordsAllocSize )
mfree ( m_qwords , m_qwordsAllocSize , "Query4" );
m_expressionsAllocSize = 0;
//m_expressionsAllocSize = 0;
m_qwordsAllocSize = 0;
m_qwords = NULL;
m_expressions = NULL;
//m_expressions = NULL;
m_numExpressions = 0;
m_gnext = m_gbuf;
m_hasUOR = false;
@ -343,8 +343,8 @@ bool Query::set2 ( char *query ,
// set m_expressions[] and m_operands[] arrays and m_numOperands
// for boolean queries
if ( m_isBoolean )
if ( ! setBooleanOperands() ) return false;
//if ( m_isBoolean )
// if ( ! setBooleanOperands() ) return false;
// disable stuff for site:, ip: and url: queries
for ( long i = 0 ; i < m_numWords ; i++ ) {
@ -386,6 +386,16 @@ bool Query::set2 ( char *query ,
break;
}
// . keep it simple for now
// . we limit to MAX_EXPRESSIONS, which is like 10 now i guess
if ( m_isBoolean )
m_expressions[0].add ( 0 ,
m_numWords ,
this , // Query
0 , // level
false ); // hasNOT
// . if it is not truncated, no need to use hard counts
// . comment this line and the next one out for testing hard counts
if ( ! m_truncated ) return true;
@ -450,16 +460,16 @@ bool Query::set2 ( char *query ,
// "(nt=%li)",
// m_numExplicitBits,m_numTerms-m_numExplicitBits,m_numTerms);
if ( ! m_isBoolean ) return true;
//if ( ! m_isBoolean ) return true;
// free cuz it was already set
if ( m_expressionsAllocSize )
mfree(m_expressions,m_expressionsAllocSize , "Query" );
m_expressionsAllocSize = 0;
m_expressions = NULL;
//if ( m_expressionsAllocSize )
// mfree(m_expressions,m_expressionsAllocSize , "Query" );
//m_expressionsAllocSize = 0;
//m_expressions = NULL;
// also set the boolean stuff again too!
if ( ! setBooleanOperands() ) return false;
//if ( ! setBooleanOperands() ) return false;
return true;
}
@ -617,7 +627,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
// doh! gotta reset to 0
qt->m_implicitBits = 0;
// assume not under a NOT bool op
qt->m_underNOT = false;
//qt->m_underNOT = false;
// assign score weight, we're a phrase here
qt->m_userWeight = qw->m_userWeightPhrase ;
qt->m_userType = qw->m_userTypePhrase ;
@ -819,7 +829,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
// break;
// }
// assume not under a NOT bool op
qt->m_underNOT = false;
//qt->m_underNOT = false;
// assign score weight, we're a phrase here
qt->m_userWeight = qw->m_userWeight ;
qt->m_userType = qw->m_userType ;
@ -1265,7 +1275,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
// reset our implicit bits to 0
qt->m_implicitBits = 0;
// assume not under a NOT bool op
qt->m_underNOT = false;
//qt->m_underNOT = false;
// assign score weight, we're a phrase here
qt->m_userWeight = qw->m_userWeight ;
qt->m_userType = qw->m_userType ;
@ -1902,7 +1912,7 @@ bool Query::setQWords ( char boolFlag ,
// assume QueryWord is ignored by default
qw->m_ignoreWord = IGNORE_DEFAULT;
qw->m_ignorePhrase = IGNORE_DEFAULT;
qw->m_wordNum = i;
// get word as a string
//char *w = words.getWord(i);
//long wlen = words.getWordLen(i);
@ -3312,20 +3322,20 @@ bool Query::testBoolean( unsigned char *bits ,long vecSize){//qvec_t bitmask){
if (!m_isBoolean) return false;
Expression *e = &m_expressions [ 0 ];
// find top-level expression
while (e->m_parent && e != e->m_parent) e = e->m_parent;
//while (e->m_parent && e != e->m_parent) e = e->m_parent;
return e->isTruth(bits,vecSize);//, bitmask);
}
void Query::printBooleanTree(){
if (!m_isBoolean) return;
Expression *e = &m_expressions [ 0 ];
//Expression *e = &m_expressions [ 0 ];
// find top-level expression
while (e->m_parent && e != e->m_parent) e = e->m_parent;
SafeBuf sbuf(1024,"botree");
e->print(&sbuf);
logf(LOG_DEBUG, "query: Boolean Query: %s", sbuf.getBufStart());
//while (e->m_parent && e != e->m_parent) e = e->m_parent;
//SafeBuf sbuf(1024,"botree");
//e->print(&sbuf);
//logf(LOG_DEBUG, "query: Boolean Query: %s", sbuf.getBufStart());
}
/*
// . also sets the m_underNOT member of each QueryTerm, too!!
// . returns false and sets g_errno on error, true otherwise
bool Query::setBooleanOperands ( ) {
@ -3381,14 +3391,11 @@ bool Query::setBooleanOperands ( ) {
// . set the expression recursively
// . just setting this will not set the m_hasNOT members of each
// QueryTerm
long status = e->set ( 0 , // first word #
m_numWords , // last word #
0 , // parser position
this , // array of QueryWords
0 ,// level
NULL, NULL, // parent, leftchild
false , // has NOT?
false ); // under NOT?
long status = e->add ( 0 , // first word #
m_numWords , // last word #
this , // array of QueryWords
0 ,// level
false ); // has NOT?
if ( status < 0 ) {
g_errno = ETOOMANYOPERANDS;
return log("query: Maximum number of bool operands "
@ -3413,6 +3420,7 @@ bool Query::setBooleanOperands ( ) {
// . get all the terms that are UNDER a NOT operator in some fashion
// . these bits are 1-1 with m_qterms[]
*/
/*
qvec_t notBits = e->getNOTBits( false );
for ( long i = 0 ; i < m_numTerms ; i++ ) {
@ -3422,10 +3430,11 @@ bool Query::setBooleanOperands ( ) {
m_qterms[i].m_underNOT = false;
}
*/
/*
return true;
}
*/
/*
// . returns -1 on bad query error
// . returns word AFTER the last word in our operand
long Operand::set ( long a , long b , QueryWord *qwords , long level ,
@ -3448,7 +3457,7 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
// set the parenthetical level of the word
qw->m_level = level;
// set this
qw->m_underNOT = underNOT;
//qw->m_underNOT = underNOT;
// skip punct
if ( ! qw->isAlphaWord() ) {
// if it is a parens, bail!
@ -3501,6 +3510,7 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
}
return b;
}
*/
// . returns -1 on bad query error
// . returns next word to parse (after expression) on success
@ -3510,6 +3520,7 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
// . new: organize query into sum of products normal form, ie:
// . (a) OR (b AND c AND d) OR (e AND f)
/*
unsigned char precedence[] = {
0, // term
4, // OR
@ -3520,209 +3531,89 @@ unsigned char precedence[] = {
3, // UOR
5, // PIPE
};
*/
long Expression::set (long start,
long end,
long pos, // current parsing position
class Query *q,
long level,
class Expression *parent,
class Expression *leftChild,
bool hasNOT ,
bool underNOT ) {
//#define TYPE_OPERAND 1
//#define TYPE_OPCODE 2
//#define TYPE_EXPRESSION 3
// return -1 and set g_errno on error
// returns how many words expression was
long Expression::add (long start,
long end,
class Query *q,
long level,
bool hasNOT
) {
if ( level >= MAX_EXPRESSIONS ) { g_errno = EBADENGINEER; return -1;}
// the # of the first alnumpunct word in the expression
m_start = start;
// and the last one
m_end = end;
m_opcode = 0;
m_operand = NULL;
m_numChildren = 0;
m_hasNOT = hasNOT;
m_parent = parent;
uint8_t curOp = 0;
m_q = q;
QueryWord *qwords = q->m_qwords;
Expression *o_expressions = q->m_expressions;
Operand *o_operands = q->m_operands;
long *o_numOperands = &q->m_numOperands;
long *o_numExpressions = &q->m_numExpressions;
long maxExpressions = q->m_numWords;
//m_cc = 0;
// Lets really try to catch this
if (m_parent == this) {
//log(LOG_WARN, "query: Warning, setting expression "
// "parent to self");
char *xx = NULL; *xx = 0;
}
// "start" is the current alnumpunct word we are parsing out
for ( long i=m_start ; i<end ; i++ ){
//set initial args. if we are the right side of an expression
// and the left side was an expression, leftChild will point to it,
// but it counts as the leftChild of this expression.
if (leftChild) {
leftChild->m_parent = this;
m_children[0] = leftChild;
m_numChildren = 1;
}
hasNOT = false;
// "pos" is the current alnumpunct word we are parsing out
for ( long i=pos ; i<end ; i++ ){
QueryWord *qwords = q->m_qwords;
QueryWord * qw = &qwords[i];
// set this
qw->m_underNOT = underNOT;
//qw->m_underNOT = underNOT;
// set leaf node if not an opcode like "AND" and not punct.
if (!qw->m_opcode && qw->isAlphaWord()){
// if this is NOT the very first word of the expression
if (i > m_start) goto setChildExpr;
// if we maxxed out, error out
if ( *o_numOperands >= MAX_OPERANDS ) return -1;
Operand *op = &o_operands [ *o_numOperands ];
*o_numOperands = *o_numOperands + 1;
// . return ptr to next word for us to parse
// . subtract once since for loop will inc it
i = op->set ( i , end , qwords , level , underNOT );
if ( i < 0 ) return -1;
m_operand = op;
goto endExpr;
if ( ! qw->m_opcode && qw->isAlphaWord()){
//m_opSlots[m_cc] = i;
//m_opTypes[m_cc] = TYPE_OPERAND;
//qw->m_opBitNum = m_cc;
continue;//goto endExpr; mdw
}
if (qw->m_opcode == OP_NOT){
hasNOT = !hasNOT;
underNOT = hasNOT;
//underNOT = hasNOT;
continue;
}
else if (qw->m_opcode == OP_LEFTPAREN){
if (i == m_start) i++;
goto setChildExpr;
// this is expression
// . it should advance "i" to end of expression
// make a new one:
Expression *e = &q->m_expressions[q->m_numExpressions];
// point to next...
q->m_numExpressions++;
// now set it
e->add ( i+1, // skip over (
end ,
q ,
level + 1,
hasNOT );
qw->m_opcode = OP_EXPRESSION;
qw->m_expressionPtr = e;
//m_opSlots[m_cc] = (long)e;
//m_opTypes[m_cc] = TYPE_EXPRESSION;
//qw->m_opBitNum = m_cc;
}
else if (qw->m_opcode == OP_RIGHTPAREN){
goto endExpr;
}
else if (qw->m_opcode) {
int delta = 0;
curOp = qw->m_opcode;
if (m_numChildren == 1)
m_opcode = curOp;
if (m_numChildren > 1 && curOp != m_opcode) {
delta = (int)precedence[curOp] -
(int)precedence[m_opcode];
}
if (delta > 0){
goto endExpr;
}
if (delta < 0){
// set a subexpression conataining the
// last operand we found as the first
goto setChildExpr2;
}
}
continue;
endExpr:
//log(LOG_DEBUG, "query: set Expr [%ld, %ld), opcode: %d",
// a, i, curOp);
// if we've matched parens, go to next word
// but if we have an extra right paren, don't crash
if (qw->m_opcode == OP_RIGHTPAREN &&
(qwords[m_start].m_opcode == OP_LEFTPAREN ||
m_start == 0))
i++;
m_end = i;
// We have an extra open paren
if (qwords[m_start].m_opcode == OP_LEFTPAREN &&
qw->m_opcode != OP_RIGHTPAREN)
goto setParentExpr;
// we are top-level expr, but there is more to parse
if (!m_parent && i < end-1)
goto setParentExpr;
// just return
return i;
// add a parent expression with this one as the left child
setParentExpr:
{
if ( *o_numExpressions >= maxExpressions ) return -1;
//if (qw->m_opcode == OP_RIGHTPAREN) i++;
Expression *e = &o_expressions[*o_numExpressions];
*o_numExpressions = *o_numExpressions + 1;
i = e->set ( m_start , end ,i, q ,
level+1,
m_parent,
this,
false ,
underNOT ) ;
// return size i guess, edn point
return i;
}
// add a child expression
setChildExpr:
{
if ( *o_numExpressions >= maxExpressions ) return -1;
Expression *e = &o_expressions[*o_numExpressions];
*o_numExpressions = *o_numExpressions + 1;
i = e->set ( i , end , i, q ,
level+1,
this, NULL, hasNOT ,
underNOT ) -1;
if ( i < 0 ) return -1;
// trim needless parens
while (e->m_numChildren == 1) {
hasNOT = e->m_hasNOT;
e = e->m_children[0];
if (hasNOT) e->m_hasNOT = ! e->m_hasNOT;
}
hasNOT = false;
//cull empty expressions
if (e->m_numChildren < 1 &&
e->m_operand == NULL) continue;
if (m_numChildren >= MAX_OPERANDS) return -1;
// add good expressions
m_children [ m_numChildren] = e;
m_numChildren++;
if (m_numChildren > 1 && m_opcode == 0)
m_opcode = OP_AND; // default AND
continue;
}
// we need to make the last operand we passed
// be the first operand of a subexpression
setChildExpr2:
{
// remove the last expression from our list
Expression *ce = m_children[m_numChildren-1];
m_numChildren--;
if ( *o_numExpressions >= maxExpressions ) return -1;
Expression *e = &o_expressions[*o_numExpressions];
*o_numExpressions = *o_numExpressions + 1;
i = e->set ( ce->m_start , end , i, q ,
level+1,
this, ce,
false ,
underNOT ) -1;
ce->m_parent = e;
if ( i < 0 ) return -1;
if (m_numChildren >= MAX_OPERANDS) return -1;
m_children [ m_numChildren ] = e;
hasNOT = false;
m_numChildren++;
else if (qw->m_opcode) {
// add that mdw
//m_opSlots[m_cc] = qw->m_opcode;
//m_opTypes[m_cc] = TYPE_OPCODE;
//qw->m_opBitNum = m_cc;
//m_cc++;
continue;
}
// white space?
continue;
}
return end;
}
@ -3731,40 +3622,76 @@ bool Query::matchesBoolQuery ( unsigned char *bitVec , long vecSize ) {
return m_expressions[0].isTruth ( bitVec , vecSize );
}
// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
//bool Expression::isTruth ( qvec_t bits, qvec_t mask ) {
// . returns true if bit # "opBitNum" is on in the bit vector "bitVec"
// . bit #n is kept in byte n/8 of the vector, at bit position n%8
// . "vecSize" is the size of "bitVec" in bytes; if the bit would land
//   at or past that many bytes we force a fault by writing through NULL
bool isBitNumSet ( long opBitNum, unsigned char *bitVec, long vecSize ) {
	// split the bit # into a byte offset and a bit position in that byte
	long slot  = opBitNum / 8;
	long shift = opBitNum % 8;
	// sanity check: never read past the end of the vector
	if ( slot >= vecSize ) { char *xx=NULL;*xx=0; }
	// isolate the one bit we care about
	long bit = 1 << shift;
	if ( bitVec[slot] & bit ) return true;
	return false;
}
// . "bits" are 1-1 with the query words in Query::m_qwords[] array
// including ignored words and spaces i guess since Expression::add()
// seems to do that.
bool Expression::isTruth ( unsigned char *bitVec ,long vecSize ) {
//bool op1 = false ; // set to false so compiler shuts up
//bool op2 ;
//bool accumulator = false;
//bool hadOR = false;
bool result = false;
// leaf node
if (m_operand){
result = m_operand->isTruth(bitVec,vecSize);//, mask);
// handle masked terms better.. don't apply NOT operator
// mdw - not sure what this is doing
//if (!(m_operand->m_termBits & mask)) return true;
}
else if (m_numChildren == 1){
result = m_children[0]->isTruth(bitVec,vecSize);//, mask);
}
else if (m_opcode == OP_OR || m_opcode == OP_UOR) {
for ( long i=0 ; i<m_numChildren ; i++ ) {
result =result||m_children[i]->isTruth(bitVec,vecSize);
if (result) goto done;
//
// operand1 operand2 operator1 operand3 operator2 ....
//
// NOTE(review): result actually starts out true here, not off -- confirm intent
bool result = true;
char prevOpCode = 0;
long prevResult ;
// result of current operand
long opResult = -1;
for ( long i = 0 ; i < m_q->m_numWords ; i++ ) {
QueryWord *qw = &m_q->m_qwords[i];
if ( qw->m_opcode ) {
prevOpCode = qw->m_opcode;//m_opSlots[i];
continue;
}
}
else if (m_opcode == OP_AND || m_opcode == OP_PIPE){
result = true;
for (long i = 0 ; i < m_numChildren ; i++ ) {
result =result&&m_children[i]->isTruth(bitVec,vecSize);
if (!result) goto done;
// this docids must have all these words
// then two remaining opTypes are TYPE_OPERAND and
// TYPE_EXPRESSION
// save prev one. -1 means no prev.
prevResult = opResult;
// for regular word operands
if ( ! qw->m_opcode ) {
// ignore it like a space?
if ( qw->m_ignoreWord ) continue;
// this is the op bit # for a word in the bool query
//long opBitNum = m_opSlots[i];
// see iff that bit is set in this docid's vector
opResult = isBitNumSet ( i,bitVec,vecSize );
}
// expression operands
else {
Expression *e = (Expression *)qw->m_expressionPtr;
opResult = e->isTruth ( bitVec , vecSize );
}
// need two to tango. i.e. (true OR false)
if ( prevResult == -1 ) continue;
// if this is not the first time... we got two
if ( prevOpCode == OP_AND ) {
if ( ! prevResult ) result = false;
if ( ! result ) result = false;
}
else if ( prevOpCode == OP_OR ) {
if ( prevResult ) result = true;
if ( result ) result = true;
}
}
done :
if (m_hasNOT) return !result;
else return result;
}
@ -3791,6 +3718,7 @@ qvec_t Expression::getNOTBits ( bool hasNOT ) {
// print boolean expression for debug purposes
void Expression::print(SafeBuf *sbuf) {
/*
if (m_hasNOT) sbuf->safePrintf("NOT ");
if (m_operand){
m_operand->print(sbuf);
@ -3809,9 +3737,10 @@ void Expression::print(SafeBuf *sbuf) {
}
}
sbuf->safePrintf(")");
*/
}
/*
void Operand::print(SafeBuf *sbuf) {
// long shift = 0;
// while (m_termBits >> shift) shift++;
@ -3819,6 +3748,7 @@ void Operand::print(SafeBuf *sbuf) {
if (m_hasNOT) sbuf->safePrintf("NOT 0x%llx",*(long long *)m_opBits);
else sbuf->safePrintf("0x%llx", *(long long *)m_opBits);
}
*/
// if any one query term is split, msg3a has to split the query
bool Query::isSplit() {

101
Query.h

@ -161,6 +161,7 @@ extern struct QueryField g_fields[];
#define OP_RIGHTPAREN 5
#define OP_UOR 6
#define OP_PIPE 7
#define OP_EXPRESSION 8
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
@ -168,6 +169,7 @@ extern struct QueryField g_fields[];
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
/*
// . creating a QueryBoolean class was unnecessary since it was only functional
// and had nothing new it would store that the Query class doesn't store
// . the entry point is the Query::setBitScoresBoolean() function below
@ -216,55 +218,13 @@ public:
//long m_vecSize;
// does the word NOT preceed the operand?
bool m_hasNOT;
class Expression *m_parent;
//class Expression *m_parent;
// we MUST have these for this OPERAND to be true
//unsigned short m_forcedBits;
};
*/
// operand1 AND operand2 OR ...
// operand1 OR operand2 AND ...
class Expression {
public:
long set (long start,
long end,
long pos, // current parsing position
class Query *q,
long level,
class Expression *parent,
class Expression *leftChild,
bool hasNOT ,
bool underNOT );
//bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) ;
bool isTruth ( unsigned char *bitVec , long vecSize );
// . what QueryTerms are UNDER the influence of the NOT opcode?
// . we read in the WHOLE termlist of those that are (like '-' sign)
// . returned bit vector is 1-1 with m_qterms in Query class
//qvec_t getNOTBits ( bool hasNOT );
void print (SafeBuf *sbuf);
// . a list of operands separated by op codes (a AND b OR c ...)
// . sometimes and operand is another expression: a AND (b OR c)
// . use NULL in m_operands slot if we got an expression and vice versa
// . m_opcodes[i] is the opcode after operand #i
class Expression *m_parent;
//class Operand *m_operands [ MAX_OPERANDS ];
class Expression *m_children [ MAX_OPERANDS ];
//char m_opcodes [ MAX_OPERANDS ];
//long m_numOperands;
// now expressions can have either child expressions or 1 operand
long m_numChildren;
// do we have a NOT operator before operand #i?
//bool m_hasNOT [ MAX_OPERANDS ];
// only one opcode, operand, hasNOT per expression now
uint8_t m_opcode;
class Operand *m_operand;
bool m_hasNOT;
// needed for nesting
long m_start;
long m_end;
};
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
@ -306,7 +266,7 @@ class QueryWord {
long long m_phraseId;
// hash of field name then collection, used to hash termId
long long m_prefixHash;
long m_wordNum;
// are we in a phrase in a wikipedia title?
long m_wikiPhraseId;
long m_wikiPhraseStart;
@ -387,8 +347,10 @@ class QueryWord {
float m_float;
// for gbminint:99 etc. uses integers instead of floats for better res
long m_int;
// what operand # is it for doing boolen queries?
long m_opNum;
// what operand bit # is it for doing boolean queries?
//long m_opBitNum;
// when an operand is an expression...
class Expression *m_expressionPtr;
};
// . we filter the QueryWords and turn them into QueryTerms
@ -507,7 +469,7 @@ class QueryTerm {
// . for things like (x1 OR x2 OR x3 ... ) we try to give all
// those query terms the same m_opNum for efficiency since
// they all have the same effect
long m_opNum;
//long m_opNum;
// same as above basically
class QueryTerm *m_leftPhraseTerm;
@ -525,6 +487,40 @@ class QueryTerm {
};
//#define MAX_OPSLOTS 256
#define MAX_EXPRESSIONS 10
// . an Expression is one piece of a boolean query: a run of operands
//   joined by opcodes, where an operand may itself be a parenthesized
//   sub-expression
// . the Query class keeps a fixed array of MAX_EXPRESSIONS of these
// operand1 AND operand2 OR ...
// operand1 OR operand2 AND ...
class Expression {
public:
// . sets this expression up from query words "start" up to "end" of
//   Query "q"; "level" is the parenthetical nesting level and "hasNOT"
//   says whether a NOT applies to the expression
// . returns -1 with g_errno set on error (e.g. when "level" reaches
//   MAX_EXPRESSIONS)
// . NOTE(review): the comment on the definition says it returns how many
//   words the expression was -- confirm against the implementation
long add (long start,
long end,
class Query *q,
long level,
bool hasNOT );
// . evaluates this expression against a docid's bit vector "bitVec",
//   which is "vecSize" bytes long
// . per the comment on the definition the bits are 1-1 with the query
//   words in Query::m_qwords[]
// . the result is negated when m_hasNOT is set
bool isTruth ( unsigned char *bitVec , long vecSize );
// NOTE(review): the next three lines look like a leftover description of
// the old getNOTBits() method, not of print() -- confirm and remove
// . what QueryTerms are UNDER the influence of the NOT opcode?
// . we read in the WHOLE termlist of those that are (like '-' sign)
// . returned bit vector is 1-1 with m_qterms in Query class
// print boolean expression into "sbuf" for debug purposes
void print (SafeBuf *sbuf);
// NOTE(review): the next four lines describe m_operands/m_opcodes members
// that this class does not declare -- presumably historical
// . a list of operands separated by op codes (a AND b OR c ...)
// . sometimes and operand is another expression: a AND (b OR c)
// . use NULL in m_operands slot if we got an expression and vice versa
// . m_opcodes[i] is the opcode after operand #i
//class Expression *m_parent;
// the "hasNOT" passed to add(); isTruth() inverts its result when true
bool m_hasNOT;
// word # range of this expression as passed to add()
long m_start;
long m_end;
// the Query passed to add(); isTruth() walks its m_qwords[]
Query *m_q;
// . opSlots can be operands operators or expressions
// . m_opTypes tells which of the 3 they are
//long m_opSlots[MAX_OPSLOTS];
//char m_opTypes[MAX_OPSLOTS];
//long m_cc;
};
// . this is the main class for representing a query
// . it contains array of QueryWords (m_qwords[]) and QueryTerms (m_qterms[])
class Query {
@ -906,11 +902,12 @@ class Query {
// . we now contain the parsing components for boolean queries
// . m_expressions points into m_gbuf or is allocated
class Expression *m_expressions; // [ MAX_OPERANDS ];
long m_expressionsAllocSize;
//class Expression *m_expressions; // [ MAX_OPERANDS ];
//long m_expressionsAllocSize;
Expression m_expressions[MAX_EXPRESSIONS];
long m_numExpressions;
class Operand m_operands [ MAX_OPERANDS ];
long m_numOperands ;
//class Operand m_operands [ MAX_OPERANDS ];
//long m_numOperands ;
// does query contain the pipe operator
bool m_piped;

@ -51,7 +51,7 @@
<readOnlyMode>0</>
# Controls all spidering for all collections
<spideringEnabled>0</>
<spideringEnabled>1</>
# What is the maximum number of web pages the spider is allowed to download
# simultaneously for ALL collections PER HOST?