more fixes for new boolean logic.

This commit is contained in:
mwells
2014-03-13 13:09:33 -07:00
parent fb0123ad53
commit 3b2d981dff
6 changed files with 86 additions and 36 deletions

@ -1,8 +1,8 @@
#include "gb-include.h"
#include "PageParser.h"
#include "IndexTable.h"
#include "IndexTable2.h"
//#include "IndexTable.h"
//#include "IndexTable2.h"
//#include "XmlDoc.h" // addCheckboxSpan()
bool g_inPageParser = false;
@ -101,7 +101,7 @@ bool sendPageParser2 ( TcpSocket *s ,
st->m_termFreqs = termFreqs;
st->m_termFreqWeights = termFreqWeights;
st->m_affWeights = affWeights;
st->m_total = (score_t)-1;
//st->m_total = (score_t)-1;
st->m_indexCode = 0;
st->m_blocked = false;
st->m_didRootDom = false;
@ -654,7 +654,7 @@ bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) {
//st->m_termFreqs = termFreqs;
//st->m_termFreqWeights = termFreqWeights;
//st->m_affWeights = affWeights;
st->m_total = (score_t)-1;
//st->m_total = (score_t)-1;
st->m_indexCode = 0;
st->m_blocked = false;
st->m_didRootDom = false;

@ -80,7 +80,7 @@ public:
long long *m_termFreqs;
float *m_termFreqWeights;
float *m_affWeights;
score_t m_total;
//score_t m_total;
bool m_freeIt;
bool m_blocked;

@ -7003,7 +7003,8 @@ bool PosdbTable::makeDocIdVoteBufFromBooleanQuery_r ( ) {
// combination we encounter and store it into an array, otherwise,
// we can use another hashtable in order to avoid re-evaluation
// on if it passes the boolean query.
char bitVec[m_vecSize];
char bitVec[MAX_OVEC_SIZE];
if ( m_vecSize > MAX_OVEC_SIZE ) m_vecSize = MAX_OVEC_SIZE;
// set all to zeroes
memset ( bitVec , 0 , m_vecSize );

@ -3309,12 +3309,12 @@ void Query::printQueryTerms(){
////////// ONLY BOOLEAN STUFF BELOW HERE /////////////
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
// . evaluate the boolean query against a bit vector
// . returns false right away if this query is not boolean (m_isBoolean)
// . otherwise starts at m_expressions[0], follows the m_parent links up
//   until it hits an expression with no parent (or one that is its own
//   parent) and returns that expression's isTruth() on "bits"
// . this hunk switches "bits" from a qvec_t (plus a bitmask) to a byte
//   array; the bitmask argument is dropped
bool Query::testBoolean(qvec_t bits, qvec_t bitmask){
bool Query::testBoolean( unsigned char *bits ) { //qvec_t bitmask){
if (!m_isBoolean) return false;
Expression *e = &m_expressions [ 0 ];
// find top-level expression
while (e->m_parent && e != e->m_parent) e = e->m_parent;
return e->isTruth(bits, bitmask);
return e->isTruth(bits);//, bitmask);
}
void Query::printBooleanTree(){
@ -3339,6 +3339,20 @@ bool Query::setBooleanOperands ( ) {
"exceeded (%ld).",m_numTerms);
}
// set the QueryWord::m_opBit member of each query word.
// so if you have a query like 'A B OR C' then you need
// to have both A and B if you don't have C. so every word
// unless its an operator needs its own bit. quoted phrases
// may present a problem down the road we'll have to deal with.
long opNum = 0;
for ( long i = 0 ; i < m_numWords ; i++ ) {
// skip if field, opcode, punct. etc.
if ( m_qwords[i].m_ignoreWord ) continue;
// assign it a # i guess
m_qwords[i].m_opNum = opNum++;
}
// alloc the mem if we need to (mdw left off here)
//long need = (m_numWords/3) * sizeof(Expression);
// illegitimate bool expressions breach the buffer
@ -3400,6 +3414,7 @@ bool Query::setBooleanOperands ( ) {
// . get all the terms that are UNDER a NOT operator in some fashion
// . these bits are 1-1 with m_qterms[]
/*
qvec_t notBits = e->getNOTBits( false );
for ( long i = 0 ; i < m_numTerms ; i++ ) {
if ( m_qterms[i].m_explicitBit & notBits )
@ -3407,6 +3422,7 @@ bool Query::setBooleanOperands ( ) {
else
m_qterms[i].m_underNOT = false;
}
*/
return true;
}
@ -3416,7 +3432,9 @@ bool Query::setBooleanOperands ( ) {
long Operand::set ( long a , long b , QueryWord *qwords , long level ,
bool underNOT ) {
// clear these
m_termBits = 0;
//m_termBits = 0;
memset(m_opBits,0,MAX_OVEC_SIZE);
m_hasNOT = false;
//m_hardRequiredBits = 0;
@ -3461,9 +3479,12 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
// query is too long
if ( qw->m_phraseId && qw->m_queryPhraseTerm &&
qw->m_phraseSign ) {
qvec_t e =qw->m_queryPhraseTerm->m_explicitBit;
//qvec_t e =qw->m_queryPhraseTerm->m_explicitBit;
//if (qw->m_phraseSign == '+') m_hardRequiredBits |= e;
m_termBits |= e;
//m_termBits |= e;
// . give this operand its own single bit in the operand bit vector:
//   bit (m_opNum % 8) of byte (m_opNum / 8)
// . the mask has to be 1<<(m_opNum % 8). OR-ing in the raw remainder
//   sets no bit at all when the remainder is 0 and sets several bits
//   when it is 3, 5, 6 or 7, so operands would collide
// . ignore operand numbers that fall outside m_opBits (negative or too
//   big) rather than writing outside the array. this branch runs before
//   the m_ignoreWord check below, so m_opNum may not have been assigned
//   for this word -- TODO confirm
long opNum = qw->m_opNum;
if ( opNum >= 0 && opNum < (long)MAX_OVEC_SIZE * 8 )
	m_opBits[opNum / 8] |= (unsigned char)( 1 << ( opNum % 8 ) );
}
// why would it be ignored? oh... if like cd-rom or in quotes
if ( qw->m_ignoreWord ) continue;
@ -3471,9 +3492,12 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
// . might be a word that's not a QueryTerm because
// query is too long
if ( qw->m_queryWordTerm ) {
qvec_t e = qw->m_queryWordTerm->m_explicitBit;
//qvec_t e = qw->m_queryWordTerm->m_explicitBit;
//if (qw->m_phraseSign == '+') m_hardRequiredBits |= e;
m_termBits |= e;
//m_termBits |= e;
// . give this operand its own single bit in the operand bit vector:
//   bit (m_opNum % 8) of byte (m_opNum / 8)
// . the mask has to be 1<<(m_opNum % 8). OR-ing in the raw remainder
//   sets no bit at all when the remainder is 0 and sets several bits
//   when it is 3, 5, 6 or 7, so operands would collide
// . ignore operand numbers that fall outside m_opBits (negative or too
//   big) rather than writing outside the array
long opNum = qw->m_opNum;
if ( opNum >= 0 && opNum < (long)MAX_OVEC_SIZE * 8 )
	m_opBits[opNum / 8] |= (unsigned char)( 1 << ( opNum % 8 ) );
}
}
return b;
@ -3722,12 +3746,13 @@ long Expression::set (long start,
}
// each bit is 1-1 with the explicit terms in the boolean query
// . returns the truth value of m_expressions[0] for the given bits
// . unlike testBoolean() this does not check m_isBoolean and does not
//   walk the m_parent links; it evaluates m_expressions[0] directly
// . this hunk changes the argument from a qvec_t to a byte array
bool Query::matchesBoolQuery ( qvec_t bits ) {
return m_expressions[0].isTruth ( bits );
bool Query::matchesBoolQuery ( unsigned char *bitVec ) {
return m_expressions[0].isTruth ( bitVec );
}
// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
bool Expression::isTruth ( qvec_t bits, qvec_t mask ) {
//bool Expression::isTruth ( qvec_t bits, qvec_t mask ) {
bool Expression::isTruth ( unsigned char *bitVec ) { // , qvec_t mask ) {
//bool op1 = false ; // set to false so compiler shuts up
//bool op2 ;
//bool accumulator = false;
@ -3736,23 +3761,24 @@ bool Expression::isTruth ( qvec_t bits, qvec_t mask ) {
// leaf node
if (m_operand){
result = m_operand->isTruth(bits, mask);
result = m_operand->isTruth(bitVec);//, mask);
// handle masked terms better.. don't apply NOT operator
if (!(m_operand->m_termBits & mask)) return true;
// mdw - not sure what this is doing
//if (!(m_operand->m_termBits & mask)) return true;
}
else if (m_numChildren == 1){
result = m_children[0]->isTruth(bits, mask);
result = m_children[0]->isTruth(bitVec);//, mask);
}
else if (m_opcode == OP_OR || m_opcode == OP_UOR) {
for ( long i=0 ; i<m_numChildren ; i++ ) {
result = result || m_children[i]->isTruth(bits, mask);
result = result||m_children[i]->isTruth(bitVec);//mask
if (result) goto done;
}
}
else if (m_opcode == OP_AND || m_opcode == OP_PIPE){
result = true;
for (long i = 0 ; i < m_numChildren ; i++ ) {
result = result && m_children[i]->isTruth(bits, mask);
result = result &&m_children[i]->isTruth(bitVec);//mask
if (!result) goto done;
}
}
@ -3762,6 +3788,7 @@ done :
else return result;
}
/*
// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
// . hasNOT is true if there's a NOT just to the left of this WHOLE expression
// outside the parens
@ -3779,6 +3806,7 @@ qvec_t Expression::getNOTBits ( bool hasNOT ) {
// success, all operand pairs were true
return notBits;
}
*/
// print boolean expression for debug purposes
void Expression::print(SafeBuf *sbuf) {
@ -3807,8 +3835,8 @@ void Operand::print(SafeBuf *sbuf) {
// long shift = 0;
// while (m_termBits >> shift) shift++;
// sbuf->safePrintf("%i", 1<<(shift-1));
if (m_hasNOT) sbuf->safePrintf("NOT 0x%lx", (long)m_termBits);
else sbuf->safePrintf("0x%lx", (long)m_termBits);
if (m_hasNOT) sbuf->safePrintf("NOT 0x%llx",*(long long *)m_opBits);
else sbuf->safePrintf("0x%llx", *(long long *)m_opBits);
}
// if any one query term is split, msg3a has to split the query

45
Query.h

@ -49,6 +49,8 @@ typedef unsigned long long qvec_t;
#define MAX_EXPLICIT_BITS (sizeof(qvec_t)*8)
#define MAX_OVEC_SIZE 256
// only can use 16-bit since have to make a 64k truth table!
#define MAX_EXPLICIT_BITS_BOOLEAN (16*8)
@ -181,27 +183,38 @@ public:
long set ( long a , long b , class QueryWord *qwords , long level ,
bool underNOT ) ;
// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
// . Operand::m_termBits is the required bits for operand to be true
// . Operand::m_opBits is the required bits for operand to be true
// . does not include signless phrases
bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) {
//bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) {
// . new version: compares the first m_vecSize bytes of m_opBits
//   against the same bytes of bitVec
// . with m_hasNOT set: true only if the two vectors share NO bit
// . otherwise: true as soon as the two vectors share ANY bit
// . NOTE(review): m_vecSize has to be set before this is called; no
//   assignment to Operand::m_vecSize is visible in this diff -- confirm
bool isTruth ( unsigned char *bitVec ) {
// must always satisfy hard required terms (+ sign)
//if ( (bits & m_forcedBits) != m_forcedBits )
// return false;
if (m_hasNOT) return (bits & m_termBits & mask) == 0;
return ( (bits & m_termBits & mask) == (m_termBits & mask));
//if (m_hasNOT) return (bits & m_opBits & mask) == 0;
//return ( (bits & m_opBits & mask) == (m_opBits & mask));
// NOT operand: any overlapping bit makes it false
if ( m_hasNOT ) {
for ( long i = 0 ; i < m_vecSize ; i++ )
if ( m_opBits[i] & bitVec[i] ) return false;
return true;
}
// plain operand: a single overlapping bit is enough
for ( long i = 0 ; i < m_vecSize ; i++ )
if ( m_opBits[i] & bitVec[i] ) return true;
return false;
// . we are now back to good ol' default OR
// . m_termBits should have been masked with
// . m_opBits should have been masked with
// m_requiredBits so as not to include signless phrases
//return ( (bits & m_termBits) != 0 );
//return ( (bits & m_opBits) != 0 );
};
void print (SafeBuf *sbuf);
// we are a sequence of QueryWords
//long m_startWordNum;
//long m_lastWordNum;
// . we treat the required term bits of those words as one unit (ANDed)
// . unsigned phrases are not included in these term bits
// . doc just needs one of these bits for this op to be considered true
qvec_t m_termBits;
// . terms under the same QueryTermInfo class should have the same
// termbit here
unsigned char m_opBits[MAX_OVEC_SIZE];
long m_vecSize;
// does the word NOT precede the operand?
bool m_hasNOT;
class Expression *m_parent;
@ -223,11 +236,12 @@ public:
bool hasNOT ,
bool underNOT );
bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) ;
//bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) ;
bool isTruth ( unsigned char *bitVec );
// . what QueryTerms are UNDER the influence of the NOT opcode?
// . we read in the WHOLE termlist of those that are (like '-' sign)
// . returned bit vector is 1-1 with m_qterms in Query class
qvec_t getNOTBits ( bool hasNOT );
//qvec_t getNOTBits ( bool hasNOT );
void print (SafeBuf *sbuf);
// . a list of operands separated by op codes (a AND b OR c ...)
// . sometimes an operand is another expression: a AND (b OR c)
@ -373,6 +387,8 @@ class QueryWord {
float m_float;
// for gbminint:99 etc. uses integers instead of floats for better res
long m_int;
// what operand # is it for doing boolean queries?
long m_opNum;
};
// . we filter the QueryWords and turn them into QueryTerms
@ -597,7 +613,7 @@ class Query {
// sets m_bmap[][] so getImplicits() works
void setBitMap ( );
bool testBoolean(qvec_t bits, qvec_t bitmask=(qvec_t)-1);
bool testBoolean(unsigned char *bits);//, qvec_t bitmask=(qvec_t)-1);
// print to log
void printBooleanTree();
void printQueryTerms();
@ -605,8 +621,11 @@ class Query {
// the new way as of 3/12/2014. just determine if matches the bool
// query or not. let's try to offload the scoring logic to other places
// if possible.
// bitVec is all the QueryWord::m_opBits some docid contains, so
// does it match our boolean query or not?
bool matchesBoolQuery ( unsigned char *bitVec ) ;
// . call this before calling getBitScore() to set m_bitScores[] table
// . returns false and sets g_errno on error (ENOMEM usually)
//bool setBitScores (qvec_t bitMask = (qvec_t)-1);
@ -624,6 +643,7 @@ class Query {
// through the phrase
// . the greater the number of IMplicit SINGLE words a doc has the
// bigger its bit score
/*
uint8_t getBitScore ( qvec_t ebits ) {
// get implicit bits from explicit bits
qvec_t ibits = getImplicits ( ebits );
@ -672,6 +692,7 @@ class Query {
if (ibits == m_requiredBits ) bscore|=0x20;
return bscore;
};
*/
// return an implicit vector from an explicit which contains the explic
qvec_t getImplicits ( qvec_t ebits ) {

@ -9,7 +9,7 @@
#define _TOPTREE_H_
#include "Clusterdb.h" // SAMPLE_VECTOR_SIZE, 48 bytes for now
#include "IndexTable2.h" // score_t definition
//#include "IndexTable2.h" // score_t definition
#include "RdbTree.h"
class TopNode {