forked from Mirrors/privacore-open-source-search-engine
more bool query fixes
This commit is contained in:
@ -661,6 +661,7 @@ void gotListsWrapper ( void *state ) {
|
||||
Msg39 *THIS = (Msg39 *) state;
|
||||
// . hash the lists into our index table
|
||||
// . this will send back a reply or recycle and read more list data
|
||||
|
||||
if ( ! THIS->gotLists ( true ) ) return;
|
||||
|
||||
// . if he did not block and there was an errno we send reply
|
||||
@ -671,6 +672,12 @@ void gotListsWrapper ( void *state ) {
|
||||
log("msg39: sending back error reply = %s",mstrerror(g_errno));
|
||||
sendReply ( THIS->m_slot , THIS , NULL , 0 , 0 ,true);
|
||||
}
|
||||
|
||||
// no, block? call the docid split loop
|
||||
//if ( numDocIdSplits <= 1 ) return;
|
||||
|
||||
// if we get the lists and processed them without blocking, repeat!
|
||||
THIS->doDocIdSplitLoop();
|
||||
}
|
||||
|
||||
// . now come here when we got the necessary index lists
|
||||
@ -815,6 +822,7 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
|
||||
// time it
|
||||
diff = gettimeofdayInMilliseconds() - start;
|
||||
if ( diff > 10 ) log("query: Took %lli ms for intersection",diff);
|
||||
|
||||
// returns false if blocked, true otherwise
|
||||
return addedLists ();
|
||||
}
|
||||
|
39
Posdb.cpp
39
Posdb.cpp
@ -4453,6 +4453,7 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
//
|
||||
m_minListSize = 0;
|
||||
m_minListi = -1;
|
||||
long long grand = 0LL;
|
||||
// hopefully no more than 100 sublists per term
|
||||
//char *listEnds [ MAX_QUERY_TERMS ][ MAX_SUBLISTS ];
|
||||
// set ptrs now i guess
|
||||
@ -4465,6 +4466,8 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) continue;
|
||||
// add to it
|
||||
total = qti->m_totalSubListsSize;
|
||||
// add up this now
|
||||
grand += total;
|
||||
// get min
|
||||
if ( total < m_minListSize || m_minListi == -1 ) {
|
||||
m_minListSize = total;
|
||||
@ -4485,6 +4488,10 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
long maxDocIds = m_minListSize / 12;
|
||||
// store all intersected docids in here for new algo plus 1 byte vote
|
||||
long need = maxDocIds * 6;
|
||||
|
||||
// they could all be OR'd together!
|
||||
if ( m_q->m_isBoolean ) need = grand;
|
||||
|
||||
// get max # of docids we got in an intersection from all the lists
|
||||
if ( ! m_docIdVoteBuf.reserve ( need,"divbuf" ) ) return false;
|
||||
|
||||
@ -4494,7 +4501,7 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
// the bit vector in a truth table
|
||||
long maxSlots = maxDocIds * 2;
|
||||
// get total operands we used
|
||||
long numOperands = m_q->m_numOperands;
|
||||
long numOperands = m_q->m_numWords;//Operands;
|
||||
// a quoted phrase counts as a single operand
|
||||
m_vecSize = numOperands / 8 ;
|
||||
// allow an extra byte for remainders
|
||||
@ -4504,7 +4511,7 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
! m_bt.set (8,m_vecSize,maxSlots,NULL,0,false,0,"booltbl"))
|
||||
return false;
|
||||
if ( m_q->m_isBoolean &&
|
||||
! m_ct.set (8,2 * (1<<numOperands),maxSlots,NULL,0,false,0,
|
||||
! m_ct.set (8,1,maxSlots,NULL,0,false,0,
|
||||
"booltbl"))
|
||||
return false;
|
||||
|
||||
@ -5527,6 +5534,11 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
// is this right?
|
||||
if ( docIdPtr >= docIdEnd ) goto done;
|
||||
|
||||
if ( m_q->m_isBoolean ) {
|
||||
minScore = 1.0;
|
||||
goto boolJump;
|
||||
}
|
||||
|
||||
// assume all sublists exhausted for this query term
|
||||
//docId = *(long long *)docIdPtr;
|
||||
|
||||
@ -6547,6 +6559,8 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
goto advance;
|
||||
|
||||
|
||||
boolJump:
|
||||
|
||||
// try dividing it by 3! (or multiply by .33333 faster)
|
||||
score = minScore * (((float)siteRank)*SITERANKMULTIPLIER+1.0);
|
||||
|
||||
@ -7043,8 +7057,8 @@ bool PosdbTable::makeDocIdVoteBufForBoolQuery_r ( ) {
|
||||
// get the query word
|
||||
QueryWord *qw = qt->m_qword;
|
||||
|
||||
// and the operand # from that
|
||||
long opNum = qw->m_opNum;
|
||||
// just use the word # now
|
||||
long opNum = qw->m_wordNum;//opNum;
|
||||
|
||||
// do not consider for adding if negative ('my house -home')
|
||||
//if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) continue;
|
||||
@ -7102,6 +7116,8 @@ bool PosdbTable::makeDocIdVoteBufForBoolQuery_r ( ) {
|
||||
}
|
||||
|
||||
|
||||
char *dst = m_docIdVoteBuf.getBufStart();
|
||||
|
||||
// . now our hash table is filled with all the docids
|
||||
// . evaluate each bit vector
|
||||
for ( long i = 0 ; i < m_bt.m_numSlots ; i++ ) {
|
||||
@ -7121,8 +7137,10 @@ bool PosdbTable::makeDocIdVoteBufForBoolQuery_r ( ) {
|
||||
long long *d = (long long *)m_bt.getKeyFromSlot(i);
|
||||
if ( m_debug ) log("query: addind d=%llu vec[0]=%lx",
|
||||
*d,(long)vec[0]);
|
||||
// an 8 byte key means you pass
|
||||
m_docIdVoteBuf.safeMemcpy ( &d , 6 );
|
||||
// a 6 byte key means you pass
|
||||
*(long *) dst = *(long *) d;
|
||||
*(short *)(dst+4) = *(short *)((char *)d+4);
|
||||
dst += 6;
|
||||
}
|
||||
// evaluate the vector
|
||||
char include = m_q->matchesBoolQuery ( (unsigned char *)vec ,
|
||||
@ -7132,13 +7150,18 @@ bool PosdbTable::makeDocIdVoteBufForBoolQuery_r ( ) {
|
||||
long long *d = (long long *)m_bt.getKeyFromSlot(i);
|
||||
if ( m_debug ) log("query: addind d=%llu vec[0]=%lx",
|
||||
*d,(long)vec[0]);
|
||||
// an 8 byte key means you pass
|
||||
m_docIdVoteBuf.safeMemcpy ( &d , 6 );
|
||||
// a 6 byte key means you pass
|
||||
*(long *) dst = *(long *) d;
|
||||
*(short *)(dst+4) = *(short *)((char *)d+4);
|
||||
dst += 6;
|
||||
}
|
||||
// store in hash table
|
||||
m_ct.addKey ( &h64 , &include );
|
||||
}
|
||||
|
||||
// update SafeBuf::m_length
|
||||
m_docIdVoteBuf.setLength ( dst - m_docIdVoteBuf.getBufStart() );
|
||||
|
||||
// now sort the docids. TODO: break makeDocIdVoteBufForBoolQuery_r()
|
||||
// up into docid ranges so we have like 1/100th the # of docids to
|
||||
// sort. that should make this part a lot faster.
|
||||
|
414
Query.cpp
414
Query.cpp
@ -24,11 +24,11 @@ void Query::constructor ( ) {
|
||||
//m_bmap = NULL;
|
||||
m_bitScores = NULL;
|
||||
m_qwords = NULL;
|
||||
m_expressions = NULL;
|
||||
//m_expressions = NULL;
|
||||
m_qwordsAllocSize = 0;
|
||||
m_expressionsAllocSize = 0;
|
||||
//m_expressionsAllocSize = 0;
|
||||
m_qwords = NULL;
|
||||
m_expressions = NULL;
|
||||
//m_expressions = NULL;
|
||||
reset ( );
|
||||
}
|
||||
|
||||
@ -46,7 +46,7 @@ void Query::reset ( ) {
|
||||
m_bufLen = 0;
|
||||
m_origLen = 0;
|
||||
m_numWords = 0;
|
||||
m_numOperands = 0;
|
||||
//m_numOperands = 0;
|
||||
m_numTerms = 0;
|
||||
m_synTerm = 0;
|
||||
//m_numIgnored = 0;
|
||||
@ -60,14 +60,14 @@ void Query::reset ( ) {
|
||||
m_bitScores = NULL;
|
||||
//m_bmapSize = 0;
|
||||
m_bitScoresSize = 0;
|
||||
if ( m_expressionsAllocSize )
|
||||
mfree ( m_expressions , m_expressionsAllocSize , "Query3" );
|
||||
//if ( m_expressionsAllocSize )
|
||||
// mfree ( m_expressions , m_expressionsAllocSize , "Query3" );
|
||||
if ( m_qwordsAllocSize )
|
||||
mfree ( m_qwords , m_qwordsAllocSize , "Query4" );
|
||||
m_expressionsAllocSize = 0;
|
||||
//m_expressionsAllocSize = 0;
|
||||
m_qwordsAllocSize = 0;
|
||||
m_qwords = NULL;
|
||||
m_expressions = NULL;
|
||||
//m_expressions = NULL;
|
||||
m_numExpressions = 0;
|
||||
m_gnext = m_gbuf;
|
||||
m_hasUOR = false;
|
||||
@ -343,8 +343,8 @@ bool Query::set2 ( char *query ,
|
||||
|
||||
// set m_expressions[] and m_operands[] arrays and m_numOperands
|
||||
// for boolean queries
|
||||
if ( m_isBoolean )
|
||||
if ( ! setBooleanOperands() ) return false;
|
||||
//if ( m_isBoolean )
|
||||
// if ( ! setBooleanOperands() ) return false;
|
||||
|
||||
// disable stuff for site:, ip: and url: queries
|
||||
for ( long i = 0 ; i < m_numWords ; i++ ) {
|
||||
@ -386,6 +386,16 @@ bool Query::set2 ( char *query ,
|
||||
break;
|
||||
}
|
||||
|
||||
// . keep it simple for now
|
||||
// . we limit this to MAX_EXPRESSIONS (currently 10) for now
|
||||
if ( m_isBoolean )
|
||||
m_expressions[0].add ( 0 ,
|
||||
m_numWords ,
|
||||
this , // Query
|
||||
0 , // level
|
||||
false ); // hasNOT
|
||||
|
||||
|
||||
// . if it is not truncated, no need to use hard counts
|
||||
// . comment this line and the next one out for testing hard counts
|
||||
if ( ! m_truncated ) return true;
|
||||
@ -450,16 +460,16 @@ bool Query::set2 ( char *query ,
|
||||
// "(nt=%li)",
|
||||
// m_numExplicitBits,m_numTerms-m_numExplicitBits,m_numTerms);
|
||||
|
||||
if ( ! m_isBoolean ) return true;
|
||||
//if ( ! m_isBoolean ) return true;
|
||||
|
||||
// free cuz it was already set
|
||||
if ( m_expressionsAllocSize )
|
||||
mfree(m_expressions,m_expressionsAllocSize , "Query" );
|
||||
m_expressionsAllocSize = 0;
|
||||
m_expressions = NULL;
|
||||
//if ( m_expressionsAllocSize )
|
||||
// mfree(m_expressions,m_expressionsAllocSize , "Query" );
|
||||
//m_expressionsAllocSize = 0;
|
||||
//m_expressions = NULL;
|
||||
|
||||
// also set the boolean stuff again too!
|
||||
if ( ! setBooleanOperands() ) return false;
|
||||
//if ( ! setBooleanOperands() ) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -617,7 +627,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
// doh! gotta reset to 0
|
||||
qt->m_implicitBits = 0;
|
||||
// assume not under a NOT bool op
|
||||
qt->m_underNOT = false;
|
||||
//qt->m_underNOT = false;
|
||||
// assign score weight, we're a phrase here
|
||||
qt->m_userWeight = qw->m_userWeightPhrase ;
|
||||
qt->m_userType = qw->m_userTypePhrase ;
|
||||
@ -819,7 +829,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
// break;
|
||||
// }
|
||||
// assume not under a NOT bool op
|
||||
qt->m_underNOT = false;
|
||||
//qt->m_underNOT = false;
|
||||
// assign score weight, we're a phrase here
|
||||
qt->m_userWeight = qw->m_userWeight ;
|
||||
qt->m_userType = qw->m_userType ;
|
||||
@ -1265,7 +1275,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
// reset our implicit bits to 0
|
||||
qt->m_implicitBits = 0;
|
||||
// assume not under a NOT bool op
|
||||
qt->m_underNOT = false;
|
||||
//qt->m_underNOT = false;
|
||||
// assign score weight, we're a phrase here
|
||||
qt->m_userWeight = qw->m_userWeight ;
|
||||
qt->m_userType = qw->m_userType ;
|
||||
@ -1902,7 +1912,7 @@ bool Query::setQWords ( char boolFlag ,
|
||||
// assume QueryWord is ignored by default
|
||||
qw->m_ignoreWord = IGNORE_DEFAULT;
|
||||
qw->m_ignorePhrase = IGNORE_DEFAULT;
|
||||
|
||||
qw->m_wordNum = i;
|
||||
// get word as a string
|
||||
//char *w = words.getWord(i);
|
||||
//long wlen = words.getWordLen(i);
|
||||
@ -3312,20 +3322,20 @@ bool Query::testBoolean( unsigned char *bits ,long vecSize){//qvec_t bitmask){
|
||||
if (!m_isBoolean) return false;
|
||||
Expression *e = &m_expressions [ 0 ];
|
||||
// find top-level expression
|
||||
while (e->m_parent && e != e->m_parent) e = e->m_parent;
|
||||
//while (e->m_parent && e != e->m_parent) e = e->m_parent;
|
||||
return e->isTruth(bits,vecSize);//, bitmask);
|
||||
|
||||
}
|
||||
void Query::printBooleanTree(){
|
||||
if (!m_isBoolean) return;
|
||||
Expression *e = &m_expressions [ 0 ];
|
||||
//Expression *e = &m_expressions [ 0 ];
|
||||
// find top-level expression
|
||||
while (e->m_parent && e != e->m_parent) e = e->m_parent;
|
||||
SafeBuf sbuf(1024,"botree");
|
||||
e->print(&sbuf);
|
||||
logf(LOG_DEBUG, "query: Boolean Query: %s", sbuf.getBufStart());
|
||||
//while (e->m_parent && e != e->m_parent) e = e->m_parent;
|
||||
//SafeBuf sbuf(1024,"botree");
|
||||
//e->print(&sbuf);
|
||||
//logf(LOG_DEBUG, "query: Boolean Query: %s", sbuf.getBufStart());
|
||||
}
|
||||
|
||||
/*
|
||||
// . also sets the m_underNOT member of each QueryTerm, too!!
|
||||
// . returns false and sets g_errno on error, true otherwise
|
||||
bool Query::setBooleanOperands ( ) {
|
||||
@ -3381,14 +3391,11 @@ bool Query::setBooleanOperands ( ) {
|
||||
// . set the expression recursively
|
||||
// . just setting this will not set the m_hasNOT members of each
|
||||
// QueryTerm
|
||||
long status = e->set ( 0 , // first word #
|
||||
m_numWords , // last word #
|
||||
0 , // parser position
|
||||
this , // array of QueryWords
|
||||
0 ,// level
|
||||
NULL, NULL, // parent, leftchild
|
||||
false , // has NOT?
|
||||
false ); // under NOT?
|
||||
long status = e->add ( 0 , // first word #
|
||||
m_numWords , // last word #
|
||||
this , // array of QueryWords
|
||||
0 ,// level
|
||||
false ); // has NOT?
|
||||
if ( status < 0 ) {
|
||||
g_errno = ETOOMANYOPERANDS;
|
||||
return log("query: Maximum number of bool operands "
|
||||
@ -3413,6 +3420,7 @@ bool Query::setBooleanOperands ( ) {
|
||||
|
||||
// . get all the terms that are UNDER a NOT operator in some fashion
|
||||
// . these bits are 1-1 with m_qterms[]
|
||||
*/
|
||||
/*
|
||||
qvec_t notBits = e->getNOTBits( false );
|
||||
for ( long i = 0 ; i < m_numTerms ; i++ ) {
|
||||
@ -3422,10 +3430,11 @@ bool Query::setBooleanOperands ( ) {
|
||||
m_qterms[i].m_underNOT = false;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
return true;
|
||||
}
|
||||
|
||||
*/
|
||||
/*
|
||||
// . returns -1 on bad query error
|
||||
// . returns word AFTER the last word in our operand
|
||||
long Operand::set ( long a , long b , QueryWord *qwords , long level ,
|
||||
@ -3448,7 +3457,7 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
|
||||
// set the parenthetical level of the word
|
||||
qw->m_level = level;
|
||||
// set this
|
||||
qw->m_underNOT = underNOT;
|
||||
//qw->m_underNOT = underNOT;
|
||||
// skip punct
|
||||
if ( ! qw->isAlphaWord() ) {
|
||||
// if it is a parens, bail!
|
||||
@ -3501,6 +3510,7 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
|
||||
}
|
||||
return b;
|
||||
}
|
||||
*/
|
||||
|
||||
// . returns -1 on bad query error
|
||||
// . returns next word to parse (after expression) on success
|
||||
@ -3510,6 +3520,7 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
|
||||
// . new: organize query into sum of products normal form, ie:
|
||||
// . (a) OR (b AND c AND d) OR (e AND f)
|
||||
|
||||
/*
|
||||
unsigned char precedence[] = {
|
||||
0, // term
|
||||
4, // OR
|
||||
@ -3520,209 +3531,89 @@ unsigned char precedence[] = {
|
||||
3, // UOR
|
||||
5, // PIPE
|
||||
};
|
||||
*/
|
||||
|
||||
long Expression::set (long start,
|
||||
long end,
|
||||
long pos, // current parsing position
|
||||
class Query *q,
|
||||
long level,
|
||||
class Expression *parent,
|
||||
class Expression *leftChild,
|
||||
bool hasNOT ,
|
||||
bool underNOT ) {
|
||||
//#define TYPE_OPERAND 1
|
||||
//#define TYPE_OPCODE 2
|
||||
//#define TYPE_EXPRESSION 3
|
||||
|
||||
|
||||
// return -1 and set g_errno on error
|
||||
// returns "end" (the word # it was told to stop at) on success
|
||||
long Expression::add (long start,
|
||||
long end,
|
||||
class Query *q,
|
||||
long level,
|
||||
bool hasNOT
|
||||
) {
|
||||
|
||||
if ( level >= MAX_EXPRESSIONS ) { g_errno = EBADENGINEER; return -1;}
|
||||
|
||||
// the # of the first alnumpunct word in the expression
|
||||
m_start = start;
|
||||
// and the last one
|
||||
m_end = end;
|
||||
m_opcode = 0;
|
||||
m_operand = NULL;
|
||||
m_numChildren = 0;
|
||||
m_hasNOT = hasNOT;
|
||||
m_parent = parent;
|
||||
uint8_t curOp = 0;
|
||||
m_q = q;
|
||||
|
||||
QueryWord *qwords = q->m_qwords;
|
||||
Expression *o_expressions = q->m_expressions;
|
||||
Operand *o_operands = q->m_operands;
|
||||
long *o_numOperands = &q->m_numOperands;
|
||||
long *o_numExpressions = &q->m_numExpressions;
|
||||
long maxExpressions = q->m_numWords;
|
||||
|
||||
//m_cc = 0;
|
||||
|
||||
// Lets really try to catch this
|
||||
if (m_parent == this) {
|
||||
//log(LOG_WARN, "query: Warning, setting expression "
|
||||
// "parent to self");
|
||||
char *xx = NULL; *xx = 0;
|
||||
}
|
||||
// "start" is the current alnumpunct word we are parsing out
|
||||
for ( long i=m_start ; i<end ; i++ ){
|
||||
|
||||
//set initial args. if we are the right side of an expression
|
||||
// and the left side was an expression, leftChild will point to it,
|
||||
// but it counts as the leftChild of this expression.
|
||||
if (leftChild) {
|
||||
leftChild->m_parent = this;
|
||||
m_children[0] = leftChild;
|
||||
m_numChildren = 1;
|
||||
}
|
||||
hasNOT = false;
|
||||
|
||||
// "pos" is the current alnumpunct word we are parsing out
|
||||
for ( long i=pos ; i<end ; i++ ){
|
||||
QueryWord *qwords = q->m_qwords;
|
||||
|
||||
QueryWord * qw = &qwords[i];
|
||||
// set this
|
||||
qw->m_underNOT = underNOT;
|
||||
//qw->m_underNOT = underNOT;
|
||||
|
||||
// set leaf node if not an opcode like "AND" and not punct.
|
||||
if (!qw->m_opcode && qw->isAlphaWord()){
|
||||
// if this is NOT the very first word of the expression
|
||||
if (i > m_start) goto setChildExpr;
|
||||
// if we maxxed out, error out
|
||||
if ( *o_numOperands >= MAX_OPERANDS ) return -1;
|
||||
Operand *op = &o_operands [ *o_numOperands ];
|
||||
*o_numOperands = *o_numOperands + 1;
|
||||
|
||||
// . return ptr to next word for us to parse
|
||||
// . subtract once since for loop will inc it
|
||||
i = op->set ( i , end , qwords , level , underNOT );
|
||||
if ( i < 0 ) return -1;
|
||||
m_operand = op;
|
||||
goto endExpr;
|
||||
if ( ! qw->m_opcode && qw->isAlphaWord()){
|
||||
//m_opSlots[m_cc] = i;
|
||||
//m_opTypes[m_cc] = TYPE_OPERAND;
|
||||
//qw->m_opBitNum = m_cc;
|
||||
continue;//goto endExpr; mdw
|
||||
}
|
||||
if (qw->m_opcode == OP_NOT){
|
||||
hasNOT = !hasNOT;
|
||||
underNOT = hasNOT;
|
||||
//underNOT = hasNOT;
|
||||
continue;
|
||||
}
|
||||
else if (qw->m_opcode == OP_LEFTPAREN){
|
||||
if (i == m_start) i++;
|
||||
goto setChildExpr;
|
||||
// this is expression
|
||||
// . it should advance "i" to end of expression
|
||||
// make a new one:
|
||||
Expression *e = &q->m_expressions[q->m_numExpressions];
|
||||
// point to next...
|
||||
q->m_numExpressions++;
|
||||
// now set it
|
||||
e->add ( i+1, // skip over (
|
||||
end ,
|
||||
q ,
|
||||
level + 1,
|
||||
hasNOT );
|
||||
qw->m_opcode = OP_EXPRESSION;
|
||||
qw->m_expressionPtr = e;
|
||||
//m_opSlots[m_cc] = (long)e;
|
||||
//m_opTypes[m_cc] = TYPE_EXPRESSION;
|
||||
//qw->m_opBitNum = m_cc;
|
||||
}
|
||||
else if (qw->m_opcode == OP_RIGHTPAREN){
|
||||
goto endExpr;
|
||||
}
|
||||
else if (qw->m_opcode) {
|
||||
int delta = 0;
|
||||
curOp = qw->m_opcode;
|
||||
if (m_numChildren == 1)
|
||||
m_opcode = curOp;
|
||||
|
||||
if (m_numChildren > 1 && curOp != m_opcode) {
|
||||
|
||||
delta = (int)precedence[curOp] -
|
||||
(int)precedence[m_opcode];
|
||||
}
|
||||
|
||||
if (delta > 0){
|
||||
goto endExpr;
|
||||
}
|
||||
if (delta < 0){
|
||||
// set a subexpression conataining the
|
||||
// last operand we found as the first
|
||||
goto setChildExpr2;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
endExpr:
|
||||
//log(LOG_DEBUG, "query: set Expr [%ld, %ld), opcode: %d",
|
||||
// a, i, curOp);
|
||||
// if we've matched parens, go to next word
|
||||
// but if we have an extra right paren, don't crash
|
||||
if (qw->m_opcode == OP_RIGHTPAREN &&
|
||||
(qwords[m_start].m_opcode == OP_LEFTPAREN ||
|
||||
m_start == 0))
|
||||
i++;
|
||||
|
||||
m_end = i;
|
||||
// We have an extra open paren
|
||||
if (qwords[m_start].m_opcode == OP_LEFTPAREN &&
|
||||
qw->m_opcode != OP_RIGHTPAREN)
|
||||
goto setParentExpr;
|
||||
// we are top-level expr, but there is more to parse
|
||||
if (!m_parent && i < end-1)
|
||||
goto setParentExpr;
|
||||
// just return
|
||||
return i;
|
||||
// add a parent expression with this one as the left child
|
||||
setParentExpr:
|
||||
{
|
||||
if ( *o_numExpressions >= maxExpressions ) return -1;
|
||||
//if (qw->m_opcode == OP_RIGHTPAREN) i++;
|
||||
Expression *e = &o_expressions[*o_numExpressions];
|
||||
*o_numExpressions = *o_numExpressions + 1;
|
||||
i = e->set ( m_start , end ,i, q ,
|
||||
level+1,
|
||||
m_parent,
|
||||
this,
|
||||
false ,
|
||||
underNOT ) ;
|
||||
// return size i guess, edn point
|
||||
return i;
|
||||
}
|
||||
|
||||
// add a child expression
|
||||
setChildExpr:
|
||||
{
|
||||
if ( *o_numExpressions >= maxExpressions ) return -1;
|
||||
|
||||
Expression *e = &o_expressions[*o_numExpressions];
|
||||
*o_numExpressions = *o_numExpressions + 1;
|
||||
i = e->set ( i , end , i, q ,
|
||||
level+1,
|
||||
this, NULL, hasNOT ,
|
||||
underNOT ) -1;
|
||||
if ( i < 0 ) return -1;
|
||||
|
||||
// trim needless parens
|
||||
while (e->m_numChildren == 1) {
|
||||
hasNOT = e->m_hasNOT;
|
||||
e = e->m_children[0];
|
||||
if (hasNOT) e->m_hasNOT = ! e->m_hasNOT;
|
||||
}
|
||||
hasNOT = false;
|
||||
//cull empty expressions
|
||||
if (e->m_numChildren < 1 &&
|
||||
e->m_operand == NULL) continue;
|
||||
|
||||
if (m_numChildren >= MAX_OPERANDS) return -1;
|
||||
// add good expressions
|
||||
m_children [ m_numChildren] = e;
|
||||
m_numChildren++;
|
||||
if (m_numChildren > 1 && m_opcode == 0)
|
||||
m_opcode = OP_AND; // default AND
|
||||
continue;
|
||||
}
|
||||
|
||||
// we need to make the last operand we passed
|
||||
// be the first operand of a subexpression
|
||||
setChildExpr2:
|
||||
{
|
||||
// remove the last expression from our list
|
||||
Expression *ce = m_children[m_numChildren-1];
|
||||
|
||||
m_numChildren--;
|
||||
|
||||
|
||||
if ( *o_numExpressions >= maxExpressions ) return -1;
|
||||
|
||||
Expression *e = &o_expressions[*o_numExpressions];
|
||||
*o_numExpressions = *o_numExpressions + 1;
|
||||
i = e->set ( ce->m_start , end , i, q ,
|
||||
level+1,
|
||||
this, ce,
|
||||
false ,
|
||||
underNOT ) -1;
|
||||
ce->m_parent = e;
|
||||
if ( i < 0 ) return -1;
|
||||
|
||||
if (m_numChildren >= MAX_OPERANDS) return -1;
|
||||
m_children [ m_numChildren ] = e;
|
||||
|
||||
hasNOT = false;
|
||||
m_numChildren++;
|
||||
else if (qw->m_opcode) {
|
||||
// add that mdw
|
||||
//m_opSlots[m_cc] = qw->m_opcode;
|
||||
//m_opTypes[m_cc] = TYPE_OPCODE;
|
||||
//qw->m_opBitNum = m_cc;
|
||||
//m_cc++;
|
||||
continue;
|
||||
}
|
||||
// white space?
|
||||
continue;
|
||||
}
|
||||
|
||||
return end;
|
||||
}
|
||||
|
||||
@ -3731,40 +3622,76 @@ bool Query::matchesBoolQuery ( unsigned char *bitVec , long vecSize ) {
|
||||
return m_expressions[0].isTruth ( bitVec , vecSize );
|
||||
}
|
||||
|
||||
// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
|
||||
//bool Expression::isTruth ( qvec_t bits, qvec_t mask ) {
|
||||
|
||||
// . returns true if bit # "opBitNum" is set in the bit vector "bitVec"
// . "vecSize" is the size of "bitVec" in bytes
// . bit #0 is the low bit of bitVec[0], bit #8 is the low bit of
//   bitVec[1], and so on
// . cores on purpose (NULL write) if the bit # is negative or lies
//   beyond the end of the vector, since that is a caller bug
bool isBitNumSet ( long opBitNum, unsigned char *bitVec, long vecSize ) {
	// a negative bit # would produce a negative shift count and an
	// index before the start of bitVec, so reject it like any other
	// out-of-range bit #
	if ( opBitNum < 0 ) { char *xx=NULL;*xx=0; }
	// which byte of the vector holds this bit?
	long byte = opBitNum / 8;
	// validate before bitVec is touched
	if ( byte >= vecSize ) { char *xx=NULL;*xx=0; }
	// which bit within that byte?
	long mask = 1<<(opBitNum % 8);
	return bitVec[byte] & mask;
}
|
||||
|
||||
// . "bits" are 1-1 with the query words in Query::m_qwords[] array
|
||||
// including ignored words and spaces i guess since Expression::add()
|
||||
// seems to do that.
|
||||
bool Expression::isTruth ( unsigned char *bitVec ,long vecSize ) {
|
||||
//bool op1 = false ; // set to false so compiler shuts up
|
||||
//bool op2 ;
|
||||
//bool accumulator = false;
|
||||
//bool hadOR = false;
|
||||
bool result = false;
|
||||
|
||||
// leaf node
|
||||
if (m_operand){
|
||||
result = m_operand->isTruth(bitVec,vecSize);//, mask);
|
||||
// handle masked terms better.. don't apply NOT operator
|
||||
// mdw - not sure what this is doing
|
||||
//if (!(m_operand->m_termBits & mask)) return true;
|
||||
}
|
||||
else if (m_numChildren == 1){
|
||||
result = m_children[0]->isTruth(bitVec,vecSize);//, mask);
|
||||
}
|
||||
else if (m_opcode == OP_OR || m_opcode == OP_UOR) {
|
||||
for ( long i=0 ; i<m_numChildren ; i++ ) {
|
||||
result =result||m_children[i]->isTruth(bitVec,vecSize);
|
||||
if (result) goto done;
|
||||
//
|
||||
// operand1 operand2 operator1 operand3 operator2 ....
|
||||
//
|
||||
// assume result is off
|
||||
bool result = true;
|
||||
|
||||
char prevOpCode = 0;
|
||||
long prevResult ;
|
||||
// result of current operand
|
||||
long opResult = -1;
|
||||
|
||||
|
||||
for ( long i = 0 ; i < m_q->m_numWords ; i++ ) {
|
||||
|
||||
QueryWord *qw = &m_q->m_qwords[i];
|
||||
|
||||
if ( qw->m_opcode ) {
|
||||
prevOpCode = qw->m_opcode;//m_opSlots[i];
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else if (m_opcode == OP_AND || m_opcode == OP_PIPE){
|
||||
result = true;
|
||||
for (long i = 0 ; i < m_numChildren ; i++ ) {
|
||||
result =result&&m_children[i]->isTruth(bitVec,vecSize);
|
||||
if (!result) goto done;
|
||||
|
||||
// this docids must have all these words
|
||||
// the two remaining opTypes are TYPE_OPERAND and
|
||||
// TYPE_EXPRESSION
|
||||
|
||||
// save prev one. -1 means no prev.
|
||||
prevResult = opResult;
|
||||
|
||||
// for regular word operands
|
||||
if ( ! qw->m_opcode ) {
|
||||
// ignore it like a space?
|
||||
if ( qw->m_ignoreWord ) continue;
|
||||
// this is the op bit # for a word in the bool query
|
||||
//long opBitNum = m_opSlots[i];
|
||||
// see iff that bit is set in this docid's vector
|
||||
opResult = isBitNumSet ( i,bitVec,vecSize );
|
||||
}
|
||||
// expression operands
|
||||
else {
|
||||
Expression *e = (Expression *)qw->m_expressionPtr;
|
||||
opResult = e->isTruth ( bitVec , vecSize );
|
||||
}
|
||||
|
||||
// need two to tango. i.e. (true OR false)
|
||||
if ( prevResult == -1 ) continue;
|
||||
|
||||
// if this is not the first time... we got two
|
||||
if ( prevOpCode == OP_AND ) {
|
||||
if ( ! prevResult ) result = false;
|
||||
if ( ! result ) result = false;
|
||||
}
|
||||
else if ( prevOpCode == OP_OR ) {
|
||||
if ( prevResult ) result = true;
|
||||
if ( result ) result = true;
|
||||
}
|
||||
}
|
||||
|
||||
done :
|
||||
if (m_hasNOT) return !result;
|
||||
else return result;
|
||||
}
|
||||
@ -3791,6 +3718,7 @@ qvec_t Expression::getNOTBits ( bool hasNOT ) {
|
||||
|
||||
// print boolean expression for debug purposes
|
||||
void Expression::print(SafeBuf *sbuf) {
|
||||
/*
|
||||
if (m_hasNOT) sbuf->safePrintf("NOT ");
|
||||
if (m_operand){
|
||||
m_operand->print(sbuf);
|
||||
@ -3809,9 +3737,10 @@ void Expression::print(SafeBuf *sbuf) {
|
||||
}
|
||||
}
|
||||
sbuf->safePrintf(")");
|
||||
|
||||
*/
|
||||
}
|
||||
|
||||
/*
|
||||
void Operand::print(SafeBuf *sbuf) {
|
||||
// long shift = 0;
|
||||
// while (m_termBits >> shift) shift++;
|
||||
@ -3819,6 +3748,7 @@ void Operand::print(SafeBuf *sbuf) {
|
||||
if (m_hasNOT) sbuf->safePrintf("NOT 0x%llx",*(long long *)m_opBits);
|
||||
else sbuf->safePrintf("0x%llx", *(long long *)m_opBits);
|
||||
}
|
||||
*/
|
||||
|
||||
// if any one query term is split, msg3a has to split the query
|
||||
bool Query::isSplit() {
|
||||
|
101
Query.h
101
Query.h
@ -161,6 +161,7 @@ extern struct QueryField g_fields[];
|
||||
#define OP_RIGHTPAREN 5
|
||||
#define OP_UOR 6
|
||||
#define OP_PIPE 7
|
||||
#define OP_EXPRESSION 8
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////
|
||||
@ -168,6 +169,7 @@ extern struct QueryField g_fields[];
|
||||
////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////
|
||||
|
||||
/*
|
||||
// . creating a QueryBoolean class was unnecessary since it was only functional
|
||||
// and had nothing new it would store that the Query class doesn't store
|
||||
// . the entry point is the Query::setBitScoresBoolean() function below
|
||||
@ -216,55 +218,13 @@ public:
|
||||
//long m_vecSize;
|
||||
// does the word NOT preceed the operand?
|
||||
bool m_hasNOT;
|
||||
class Expression *m_parent;
|
||||
//class Expression *m_parent;
|
||||
|
||||
// we MUST have these for this OPERAND to be true
|
||||
//unsigned short m_forcedBits;
|
||||
};
|
||||
*/
|
||||
|
||||
// operand1 AND operand2 OR ...
|
||||
// operand1 OR operand2 AND ...
|
||||
class Expression {
|
||||
public:
|
||||
long set (long start,
|
||||
long end,
|
||||
long pos, // current parsing position
|
||||
class Query *q,
|
||||
long level,
|
||||
class Expression *parent,
|
||||
class Expression *leftChild,
|
||||
bool hasNOT ,
|
||||
bool underNOT );
|
||||
|
||||
//bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) ;
|
||||
bool isTruth ( unsigned char *bitVec , long vecSize );
|
||||
// . what QueryTerms are UNDER the influence of the NOT opcode?
|
||||
// . we read in the WHOLE termlist of those that are (like '-' sign)
|
||||
// . returned bit vector is 1-1 with m_qterms in Query class
|
||||
//qvec_t getNOTBits ( bool hasNOT );
|
||||
void print (SafeBuf *sbuf);
|
||||
// . a list of operands separated by op codes (a AND b OR c ...)
|
||||
// . sometimes and operand is another expression: a AND (b OR c)
|
||||
// . use NULL in m_operands slot if we got an expression and vice versa
|
||||
// . m_opcodes[i] is the opcode after operand #i
|
||||
class Expression *m_parent;
|
||||
//class Operand *m_operands [ MAX_OPERANDS ];
|
||||
class Expression *m_children [ MAX_OPERANDS ];
|
||||
//char m_opcodes [ MAX_OPERANDS ];
|
||||
//long m_numOperands;
|
||||
// now expressions can have either child expressions or 1 operand
|
||||
long m_numChildren;
|
||||
// do we have a NOT operator before operand #i?
|
||||
//bool m_hasNOT [ MAX_OPERANDS ];
|
||||
// only one opcode, operand, hasNOT per expression now
|
||||
uint8_t m_opcode;
|
||||
class Operand *m_operand;
|
||||
bool m_hasNOT;
|
||||
// needed for nesting
|
||||
long m_start;
|
||||
long m_end;
|
||||
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////
|
||||
@ -306,7 +266,7 @@ class QueryWord {
|
||||
long long m_phraseId;
|
||||
// hash of field name then collection, used to hash termId
|
||||
long long m_prefixHash;
|
||||
|
||||
long m_wordNum;
|
||||
// are we in a phrase in a wikipedia title?
|
||||
long m_wikiPhraseId;
|
||||
long m_wikiPhraseStart;
|
||||
@ -387,8 +347,10 @@ class QueryWord {
|
||||
float m_float;
|
||||
// for gbminint:99 etc. uses integers instead of floats for better res
|
||||
long m_int;
|
||||
// what operand # is it for doing boolen queries?
|
||||
long m_opNum;
|
||||
// what operand bit # is it for doing boolean queries?
|
||||
//long m_opBitNum;
|
||||
// when an operand is an expression...
|
||||
class Expression *m_expressionPtr;
|
||||
};
|
||||
|
||||
// . we filter the QueryWords and turn them into QueryTerms
|
||||
@ -507,7 +469,7 @@ class QueryTerm {
|
||||
// . for things like (x1 OR x2 OR x3 ... ) we try to give all
|
||||
// those query terms the same m_opNum for efficiency since
|
||||
// they all have the same effect
|
||||
long m_opNum;
|
||||
//long m_opNum;
|
||||
|
||||
// same as above basically
|
||||
class QueryTerm *m_leftPhraseTerm;
|
||||
@ -525,6 +487,40 @@ class QueryTerm {
|
||||
|
||||
};
|
||||
|
||||
//#define MAX_OPSLOTS 256
|
||||
|
||||
#define MAX_EXPRESSIONS 10
|
||||
|
||||
// operand1 AND operand2 OR ...
|
||||
// operand1 OR operand2 AND ...
|
||||
// . one piece of a boolean query, covering a range of the words in
//   Query::m_qwords[]
// . Query keeps a fixed array of these: m_expressions[MAX_EXPRESSIONS]
class Expression {
|
||||
public:
|
||||
// . set this expression up over the query words from "start" up to
//   (not including) "end" in query "q"
// . "level" is the parenthetical nesting depth of this expression
// . "hasNOT" is stored in m_hasNOT
// . NOTE(review): the Query.cpp body visible with this change returns -1
//   with g_errno set when level >= MAX_EXPRESSIONS and "end" otherwise;
//   confirm against the current implementation
long add (long start,
|
||||
long end,
|
||||
class Query *q,
|
||||
long level,
|
||||
bool hasNOT );
|
||||
// . evaluate this expression against a bit vector that is "vecSize"
//   bytes long
// . Query.cpp documents the bits as being 1-1 with Query::m_qwords[]
// . the result is inverted when m_hasNOT is set
bool isTruth ( unsigned char *bitVec , long vecSize );
|
||||
// . LEGACY NOTE: this comment and the next two described the old
//   getNOTBits() member, which is not declared in this class:
//   what QueryTerms are UNDER the influence of the NOT opcode?
|
||||
// . we read in the WHOLE termlist of those that are (like '-' sign)
|
||||
// . returned bit vector is 1-1 with m_qterms in Query class
|
||||
// . debug printer
// . NOTE(review): the body in Query.cpp looks to be commented out right
//   now, so this may append nothing to "sbuf" -- confirm
void print (SafeBuf *sbuf);
|
||||
// . LEGACY NOTE: this comment and the next three refer to m_operands
//   and m_opcodes, which are not members of this class:
//   a list of operands separated by op codes (a AND b OR c ...)
|
||||
// . sometimes an operand is another expression: a AND (b OR c)
|
||||
// . use NULL in m_operands slot if we got an expression and vice versa
|
||||
// . m_opcodes[i] is the opcode after operand #i
|
||||
//class Expression *m_parent;
|
||||
// was a NOT applied to this whole expression? isTruth() negates its
// result when this is true
bool m_hasNOT;
|
||||
// first query word # of this expression, as passed to add()
long m_start;
|
||||
// end query word # of this expression, as passed to add()
long m_end;
|
||||
// the Query passed to add(); isTruth() reads its m_qwords[] array
Query *m_q;
|
||||
// . (disabled) opSlots can be operands operators or expressions
|
||||
// . (disabled) m_opTypes tells which of the 3 they are
|
||||
//long m_opSlots[MAX_OPSLOTS];
|
||||
//char m_opTypes[MAX_OPSLOTS];
|
||||
//long m_cc;
|
||||
};
|
||||
|
||||
// . this is the main class for representing a query
|
||||
// . it contains array of QueryWords (m_qwords[]) and QueryTerms (m_qterms[])
|
||||
class Query {
|
||||
@ -906,11 +902,12 @@ class Query {
|
||||
|
||||
// . we now contain the parsing components for boolean queries
|
||||
// . m_expressions points into m_gbuf or is allocated
|
||||
class Expression *m_expressions; // [ MAX_OPERANDS ];
|
||||
long m_expressionsAllocSize;
|
||||
//class Expression *m_expressions; // [ MAX_OPERANDS ];
|
||||
//long m_expressionsAllocSize;
|
||||
Expression m_expressions[MAX_EXPRESSIONS];
|
||||
long m_numExpressions;
|
||||
class Operand m_operands [ MAX_OPERANDS ];
|
||||
long m_numOperands ;
|
||||
//class Operand m_operands [ MAX_OPERANDS ];
|
||||
//long m_numOperands ;
|
||||
|
||||
// does query contain the pipe operator
|
||||
bool m_piped;
|
||||
|
2
gb.conf
2
gb.conf
@ -51,7 +51,7 @@
|
||||
<readOnlyMode>0</>
|
||||
|
||||
# Controls all spidering for all collections
|
||||
<spideringEnabled>0</>
|
||||
<spideringEnabled>1</>
|
||||
|
||||
# What is the maximum number of web pages the spider is allowed to download
|
||||
# simultaneously for ALL collections PER HOST?
|
||||
|
Reference in New Issue
Block a user