forked from Mirrors/privacore-open-source-search-engine
more fixes for new boolean logic.
This commit is contained in:
@ -1,8 +1,8 @@
|
||||
#include "gb-include.h"
|
||||
|
||||
#include "PageParser.h"
|
||||
#include "IndexTable.h"
|
||||
#include "IndexTable2.h"
|
||||
//#include "IndexTable.h"
|
||||
//#include "IndexTable2.h"
|
||||
//#include "XmlDoc.h" // addCheckboxSpan()
|
||||
|
||||
bool g_inPageParser = false;
|
||||
@ -101,7 +101,7 @@ bool sendPageParser2 ( TcpSocket *s ,
|
||||
st->m_termFreqs = termFreqs;
|
||||
st->m_termFreqWeights = termFreqWeights;
|
||||
st->m_affWeights = affWeights;
|
||||
st->m_total = (score_t)-1;
|
||||
//st->m_total = (score_t)-1;
|
||||
st->m_indexCode = 0;
|
||||
st->m_blocked = false;
|
||||
st->m_didRootDom = false;
|
||||
@ -654,7 +654,7 @@ bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) {
|
||||
//st->m_termFreqs = termFreqs;
|
||||
//st->m_termFreqWeights = termFreqWeights;
|
||||
//st->m_affWeights = affWeights;
|
||||
st->m_total = (score_t)-1;
|
||||
//st->m_total = (score_t)-1;
|
||||
st->m_indexCode = 0;
|
||||
st->m_blocked = false;
|
||||
st->m_didRootDom = false;
|
||||
|
@ -80,7 +80,7 @@ public:
|
||||
long long *m_termFreqs;
|
||||
float *m_termFreqWeights;
|
||||
float *m_affWeights;
|
||||
score_t m_total;
|
||||
//score_t m_total;
|
||||
bool m_freeIt;
|
||||
bool m_blocked;
|
||||
|
||||
|
@ -7003,7 +7003,8 @@ bool PosdbTable::makeDocIdVoteBufFromBooleanQuery_r ( ) {
|
||||
// combination we encounter and store it into an array, otherwise,
|
||||
// we can use another hashtable in order to avoid re-evaluation
|
||||
// of whether it passes the boolean query.
|
||||
char bitVec[m_vecSize];
|
||||
char bitVec[MAX_OVEC_SIZE];
|
||||
if ( m_vecSize > MAX_OVEC_SIZE ) m_vecSize = MAX_OVEC_SIZE;
|
||||
// set all to zeroes
|
||||
memset ( bitVec , 0 , m_vecSize );
|
||||
|
||||
|
62
Query.cpp
62
Query.cpp
@ -3309,12 +3309,12 @@ void Query::printQueryTerms(){
|
||||
////////// ONLY BOOLEAN STUFF BELOW HERE /////////////
|
||||
////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////
|
||||
bool Query::testBoolean(qvec_t bits, qvec_t bitmask){
|
||||
bool Query::testBoolean( unsigned char *bits ) { //qvec_t bitmask){
|
||||
if (!m_isBoolean) return false;
|
||||
Expression *e = &m_expressions [ 0 ];
|
||||
// find top-level expression
|
||||
while (e->m_parent && e != e->m_parent) e = e->m_parent;
|
||||
return e->isTruth(bits, bitmask);
|
||||
return e->isTruth(bits);//, bitmask);
|
||||
|
||||
}
|
||||
void Query::printBooleanTree(){
|
||||
@ -3339,6 +3339,20 @@ bool Query::setBooleanOperands ( ) {
|
||||
"exceeded (%ld).",m_numTerms);
|
||||
}
|
||||
|
||||
// set the QueryWord::m_opBit member of each query word.
|
||||
// so if you have a query like 'A B OR C' then you need
|
||||
// to have both A and B if you don't have C. so every word
|
||||
// unless it's an operator needs its own bit. quoted phrases
|
||||
// may present a problem down the road we'll have to deal with.
|
||||
long opNum = 0;
|
||||
for ( long i = 0 ; i < m_numWords ; i++ ) {
|
||||
// skip if field, opcode, punct. etc.
|
||||
if ( m_qwords[i].m_ignoreWord ) continue;
|
||||
// assign it a # i guess
|
||||
m_qwords[i].m_opNum = opNum++;
|
||||
}
|
||||
|
||||
|
||||
// alloc the mem if we need to (mdw left off here)
|
||||
//long need = (m_numWords/3) * sizeof(Expression);
|
||||
// illegitimate bool expressions breach the buffer
|
||||
@ -3400,6 +3414,7 @@ bool Query::setBooleanOperands ( ) {
|
||||
|
||||
// . get all the terms that are UNDER a NOT operator in some fashion
|
||||
// . these bits are 1-1 with m_qterms[]
|
||||
/*
|
||||
qvec_t notBits = e->getNOTBits( false );
|
||||
for ( long i = 0 ; i < m_numTerms ; i++ ) {
|
||||
if ( m_qterms[i].m_explicitBit & notBits )
|
||||
@ -3407,6 +3422,7 @@ bool Query::setBooleanOperands ( ) {
|
||||
else
|
||||
m_qterms[i].m_underNOT = false;
|
||||
}
|
||||
*/
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -3416,7 +3432,9 @@ bool Query::setBooleanOperands ( ) {
|
||||
long Operand::set ( long a , long b , QueryWord *qwords , long level ,
|
||||
bool underNOT ) {
|
||||
// clear these
|
||||
m_termBits = 0;
|
||||
//m_termBits = 0;
|
||||
memset(m_opBits,0,MAX_OVEC_SIZE);
|
||||
|
||||
m_hasNOT = false;
|
||||
|
||||
//m_hardRequiredBits = 0;
|
||||
@ -3461,9 +3479,12 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
|
||||
// query is too long
|
||||
if ( qw->m_phraseId && qw->m_queryPhraseTerm &&
|
||||
qw->m_phraseSign ) {
|
||||
qvec_t e =qw->m_queryPhraseTerm->m_explicitBit;
|
||||
//qvec_t e =qw->m_queryPhraseTerm->m_explicitBit;
|
||||
//if (qw->m_phraseSign == '+') m_hardRequiredBits |= e;
|
||||
m_termBits |= e;
|
||||
//m_termBits |= e;
|
||||
long byte = qw->m_opNum / 8;
|
||||
long mask = qw->m_opNum % 8;
|
||||
if ( byte < MAX_OVEC_SIZE ) m_opBits[byte] |= mask;
|
||||
}
|
||||
// why would it be ignored? oh... if like cd-rom or in quotes
|
||||
if ( qw->m_ignoreWord ) continue;
|
||||
@ -3471,9 +3492,12 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
|
||||
// . might be a word that's not a QueryTerm because
|
||||
// query is too long
|
||||
if ( qw->m_queryWordTerm ) {
|
||||
qvec_t e = qw->m_queryWordTerm->m_explicitBit;
|
||||
//qvec_t e = qw->m_queryWordTerm->m_explicitBit;
|
||||
//if (qw->m_phraseSign == '+') m_hardRequiredBits |= e;
|
||||
m_termBits |= e;
|
||||
//m_termBits |= e;
|
||||
long byte = qw->m_opNum / 8;
|
||||
long mask = qw->m_opNum % 8;
|
||||
if ( byte < MAX_OVEC_SIZE ) m_opBits[byte] |= mask;
|
||||
}
|
||||
}
|
||||
return b;
|
||||
@ -3722,12 +3746,13 @@ long Expression::set (long start,
|
||||
}
|
||||
|
||||
// each bit is 1-1 with the explicit terms in the boolean query
|
||||
bool Query::matchesBoolQuery ( qvec_t bits ) {
|
||||
return m_expressions[0].isTruth ( bits );
|
||||
bool Query::matchesBoolQuery ( unsigned char *bitVec ) {
|
||||
return m_expressions[0].isTruth ( bitVec );
|
||||
}
|
||||
|
||||
// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
|
||||
bool Expression::isTruth ( qvec_t bits, qvec_t mask ) {
|
||||
//bool Expression::isTruth ( qvec_t bits, qvec_t mask ) {
|
||||
bool Expression::isTruth ( unsigned char *bitVec ) { // , qvec_t mask ) {
|
||||
//bool op1 = false ; // set to false so compiler shuts up
|
||||
//bool op2 ;
|
||||
//bool accumulator = false;
|
||||
@ -3736,23 +3761,24 @@ bool Expression::isTruth ( qvec_t bits, qvec_t mask ) {
|
||||
|
||||
// leaf node
|
||||
if (m_operand){
|
||||
result = m_operand->isTruth(bits, mask);
|
||||
result = m_operand->isTruth(bitVec);//, mask);
|
||||
// handle masked terms better.. don't apply NOT operator
|
||||
if (!(m_operand->m_termBits & mask)) return true;
|
||||
// mdw - not sure what this is doing
|
||||
//if (!(m_operand->m_termBits & mask)) return true;
|
||||
}
|
||||
else if (m_numChildren == 1){
|
||||
result = m_children[0]->isTruth(bits, mask);
|
||||
result = m_children[0]->isTruth(bitVec);//, mask);
|
||||
}
|
||||
else if (m_opcode == OP_OR || m_opcode == OP_UOR) {
|
||||
for ( long i=0 ; i<m_numChildren ; i++ ) {
|
||||
result = result || m_children[i]->isTruth(bits, mask);
|
||||
result = result||m_children[i]->isTruth(bitVec);//mask
|
||||
if (result) goto done;
|
||||
}
|
||||
}
|
||||
else if (m_opcode == OP_AND || m_opcode == OP_PIPE){
|
||||
result = true;
|
||||
for (long i = 0 ; i < m_numChildren ; i++ ) {
|
||||
result = result && m_children[i]->isTruth(bits, mask);
|
||||
result = result &&m_children[i]->isTruth(bitVec);//mask
|
||||
if (!result) goto done;
|
||||
}
|
||||
}
|
||||
@ -3762,6 +3788,7 @@ done :
|
||||
else return result;
|
||||
}
|
||||
|
||||
/*
|
||||
// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
|
||||
// . hasNOT is true if there's a NOT just to the left of this WHOLE expression
|
||||
// outside the parens
|
||||
@ -3779,6 +3806,7 @@ qvec_t Expression::getNOTBits ( bool hasNOT ) {
|
||||
// success, all operand pairs were true
|
||||
return notBits;
|
||||
}
|
||||
*/
|
||||
|
||||
// print boolean expression for debug purposes
|
||||
void Expression::print(SafeBuf *sbuf) {
|
||||
@ -3807,8 +3835,8 @@ void Operand::print(SafeBuf *sbuf) {
|
||||
// long shift = 0;
|
||||
// while (m_termBits >> shift) shift++;
|
||||
// sbuf->safePrintf("%i", 1<<(shift-1));
|
||||
if (m_hasNOT) sbuf->safePrintf("NOT 0x%lx", (long)m_termBits);
|
||||
else sbuf->safePrintf("0x%lx", (long)m_termBits);
|
||||
if (m_hasNOT) sbuf->safePrintf("NOT 0x%llx",*(long long *)m_opBits);
|
||||
else sbuf->safePrintf("0x%llx", *(long long *)m_opBits);
|
||||
}
|
||||
|
||||
// if any one query term is split, msg3a has to split the query
|
||||
|
45
Query.h
45
Query.h
@ -49,6 +49,8 @@ typedef unsigned long long qvec_t;
|
||||
|
||||
#define MAX_EXPLICIT_BITS (sizeof(qvec_t)*8)
|
||||
|
||||
#define MAX_OVEC_SIZE 256
|
||||
|
||||
// only can use 16-bit since have to make a 64k truth table!
|
||||
#define MAX_EXPLICIT_BITS_BOOLEAN (16*8)
|
||||
|
||||
@ -181,27 +183,38 @@ public:
|
||||
long set ( long a , long b , class QueryWord *qwords , long level ,
|
||||
bool underNOT ) ;
|
||||
// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
|
||||
// . Operand::m_termBits is the required bits for operand to be true
|
||||
// . Operand::m_opBits is the required bits for operand to be true
|
||||
// . does not include signless phrases
|
||||
bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) {
|
||||
//bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) {
|
||||
bool isTruth ( unsigned char *bitVec ) {
|
||||
// must always satisfy hard required terms (+ sign)
|
||||
//if ( (bits & m_forcedBits) != m_forcedBits )
|
||||
// return false;
|
||||
if (m_hasNOT) return (bits & m_termBits & mask) == 0;
|
||||
return ( (bits & m_termBits & mask) == (m_termBits & mask));
|
||||
//if (m_hasNOT) return (bits & m_opBits & mask) == 0;
|
||||
//return ( (bits & m_opBits & mask) == (m_opBits & mask));
|
||||
if ( m_hasNOT ) {
|
||||
for ( long i = 0 ; i < m_vecSize ; i++ )
|
||||
if ( m_opBits[i] & bitVec[i] ) return false;
|
||||
return true;
|
||||
}
|
||||
for ( long i = 0 ; i < m_vecSize ; i++ )
|
||||
if ( m_opBits[i] & bitVec[i] ) return true;
|
||||
return false;
|
||||
// . we are now back to good ol' default OR
|
||||
// . m_termBits should have been masked with
|
||||
// . m_opBits should have been masked with
|
||||
// m_requiredBits so as not to include signless phrases
|
||||
//return ( (bits & m_termBits) != 0 );
|
||||
//return ( (bits & m_opBits) != 0 );
|
||||
};
|
||||
void print (SafeBuf *sbuf);
|
||||
// we are a sequence of QueryWords
|
||||
//long m_startWordNum;
|
||||
//long m_lastWordNum;
|
||||
// . we treat the required term bits of those words as one unit (ANDed)
|
||||
// . unsigned phrases are not included in these term bits
|
||||
// . doc just needs one of these bits for this op to be considered true
|
||||
qvec_t m_termBits;
|
||||
// . terms under the same QueryTermInfo class should have the same
|
||||
// termbit here
|
||||
unsigned char m_opBits[MAX_OVEC_SIZE];
|
||||
long m_vecSize;
|
||||
// does the word NOT precede the operand?
|
||||
bool m_hasNOT;
|
||||
class Expression *m_parent;
|
||||
|
||||
@ -223,11 +236,12 @@ public:
|
||||
bool hasNOT ,
|
||||
bool underNOT );
|
||||
|
||||
bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) ;
|
||||
//bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) ;
|
||||
bool isTruth ( unsigned char *bitVec );
|
||||
// . what QueryTerms are UNDER the influence of the NOT opcode?
|
||||
// . we read in the WHOLE termlist of those that are (like '-' sign)
|
||||
// . returned bit vector is 1-1 with m_qterms in Query class
|
||||
qvec_t getNOTBits ( bool hasNOT );
|
||||
//qvec_t getNOTBits ( bool hasNOT );
|
||||
void print (SafeBuf *sbuf);
|
||||
// . a list of operands separated by op codes (a AND b OR c ...)
|
||||
// . sometimes an operand is another expression: a AND (b OR c)
|
||||
@ -373,6 +387,8 @@ class QueryWord {
|
||||
float m_float;
|
||||
// for gbminint:99 etc. uses integers instead of floats for better res
|
||||
long m_int;
|
||||
// what operand # is it for doing boolean queries?
|
||||
long m_opNum;
|
||||
};
|
||||
|
||||
// . we filter the QueryWords and turn them into QueryTerms
|
||||
@ -597,7 +613,7 @@ class Query {
|
||||
|
||||
// sets m_bmap[][] so getImplicits() works
|
||||
void setBitMap ( );
|
||||
bool testBoolean(qvec_t bits, qvec_t bitmask=(qvec_t)-1);
|
||||
bool testBoolean(unsigned char *bits);//, qvec_t bitmask=(qvec_t)-1);
|
||||
// print to log
|
||||
void printBooleanTree();
|
||||
void printQueryTerms();
|
||||
@ -605,8 +621,11 @@ class Query {
|
||||
// the new way as of 3/12/2014. just determine if matches the bool
|
||||
// query or not. let's try to offload the scoring logic to other places
|
||||
// if possible.
|
||||
// bitVec is all the QueryWord::m_opBits some docid contains, so
|
||||
// does it match our boolean query or not?
|
||||
bool matchesBoolQuery ( unsigned char *bitVec ) ;
|
||||
|
||||
|
||||
// . call this before calling getBitScore() to set m_bitScores[] table
|
||||
// . returns false and sets g_errno on error (ENOMEM usually)
|
||||
//bool setBitScores (qvec_t bitMask = (qvec_t)-1);
|
||||
@ -624,6 +643,7 @@ class Query {
|
||||
// through the phrase
|
||||
// . the greater the number of IMplicit SINGLE words a doc has the
|
||||
// bigger its bit score
|
||||
/*
|
||||
uint8_t getBitScore ( qvec_t ebits ) {
|
||||
// get implicit bits from explicit bits
|
||||
qvec_t ibits = getImplicits ( ebits );
|
||||
@ -672,6 +692,7 @@ class Query {
|
||||
if (ibits == m_requiredBits ) bscore|=0x20;
|
||||
return bscore;
|
||||
};
|
||||
*/
|
||||
|
||||
// return an implicit vector from an explicit which contains the explic
|
||||
qvec_t getImplicits ( qvec_t ebits ) {
|
||||
|
@ -9,7 +9,7 @@
|
||||
#define _TOPTREE_H_
|
||||
|
||||
#include "Clusterdb.h" // SAMPLE_VECTOR_SIZE, 48 bytes for now
|
||||
#include "IndexTable2.h" // score_t definition
|
||||
//#include "IndexTable2.h" // score_t definition
|
||||
#include "RdbTree.h"
|
||||
|
||||
class TopNode {
|
||||
|
Reference in New Issue
Block a user