#include "gb-include.h"
#include <limits>
#include "Query.h"
//#include "Indexdb.h" // g_indexdb.getTruncationLimit() g_indexdb.getTermId()
#include "Words.h"
#include "Bits.h"
#include "Phrases.h"
#include "Url.h"
#include "Clusterdb.h" // g_clusterdb.getNumGlobalRecs()
#include "StopWords.h" // isQueryStopWord()
#include "Sections.h"
#include "Msg1.h"
#include "Speller.h"
//#include "Thesaurus.h"
#include "Mem.h"
#include "Msg3a.h"
#include "HashTableX.h"
#include "Synonyms.h"
#include "Wiki.h"
Query::Query ( ) {
void Query::constructor ( ) {
//m_bmap = NULL;
m_bitScores = NULL;
m_qwords = NULL;
m_numWords = 0;
//m_expressions = NULL;
m_qwordsAllocSize = 0;
//m_expressionsAllocSize = 0;
m_qwords = NULL;
m_numTerms = 0;
m_containingParent = NULL;
m_st0Ptr = NULL;
// we have to manually call this because Query::constructor()
// might have been called explicitly
//for ( int32_t i = 0 ; i < MAX_QUERY_TERMS ; i++ )
// m_qterms[i].constructor();
//m_expressions = NULL;
reset ( );
void Query::destructor ( ) {
Query::~Query ( ) {
reset ( );
// . returns this Query to its empty default state so it can be set again
// . frees the m_qwords array when m_qwordsAllocSize is non-zero
// . clears the term/word counts, the docid restriction and every m_has*
//   flag that a parse may have turned on
// . called by constructor() and ~Query() in this file
// . takes no arguments and returns nothing
void Query::reset ( ) {
// if Query::constructor() was called explicitly then we have to
// call destructors explicitly as well...
// essentially call QueryTerm::reset() on each query term
// NOTE(review): in this view the loop below only takes pointers to the
// term and its facet hash table; no cleanup call and no closing brace
// are visible -- confirm against the full file.
for ( long i = 0 ; i < m_numTerms ; i++ ) {
// get it
QueryTerm *qt = &m_qterms[i];
HashTableX *ht = &qt->m_facetHashTable;
// debug note
// log("results: free fhtqt of %"PTRFMT" for q=%"PTRFMT
// " st0=%"PTRFMT,
// (PTRTYPE)ht->m_buf,(PTRTYPE)this,(PTRTYPE)m_st0Ptr);
// NOTE(review): same here, the per-word loop body stops after fetching
// the QueryWord pointer -- confirm what is meant to be called on it.
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
// forget the term array; this function does not free it directly
m_qterms = NULL;
m_docIdRestriction = 0LL;
m_groupThatHasDocId = NULL;
//m_bufLen = 0;
m_origLen = 0;
m_numWords = 0;
//m_numOperands = 0;
m_numTerms = 0;
m_synTerm = 0;
//m_numIgnored = 0;
// NOTE(review): getNumRequired() treats a negative m_numRequired as
// "not computed yet", but the assignment that would re-arm that cache is
// commented out just below -- confirm it is reset somewhere else.
//m_numRequired = -1;
m_numComponents = 0;
//if ( m_bmap && m_bmapSize ) // != m_bmbuf )
// mfree ( m_bmap , m_bmapSize , "Query1" );
//if ( m_bitScores && m_bitScoresSize ) // != m_bsbuf )
// mfree ( m_bitScores , m_bitScoresSize , "Query2" );
//m_bmap = NULL;
// m_bitScores is only nulled here; the mfree() for it is commented out above
m_bitScores = NULL;
//m_bmapSize = 0;
m_bitScoresSize = 0;
//if ( m_expressionsAllocSize )
// mfree ( m_expressions , m_expressionsAllocSize , "Query3" );
// release the query word array if one was allocated
if ( m_qwordsAllocSize )
mfree ( m_qwords , m_qwordsAllocSize , "Query4" );
//m_expressionsAllocSize = 0;
m_qwordsAllocSize = 0;
m_qwords = NULL;
//m_expressions = NULL;
m_numExpressions = 0;
// rewind the m_gbuf write cursor to the start of the buffer
m_gnext = m_gbuf;
m_hasUOR = false;
m_bmapIsSet = false;
// the site: and ip: query terms will disable site clustering & caching
m_hasPositiveSiteField = false;
m_hasIpField = false;
m_hasUrlField = false;
m_hasSubUrlField = false;
m_hasIlinkField = false;
m_hasGBLangField = false;
m_hasGBCountryField = false;
m_hasQuotaField = false;
m_hasLinksOperator = false;
m_truncated = false;
m_hasSynonyms = false;
// . parses "query" and fills in this Query: the saved original text
//   (m_orig/m_origLen), the translated working copy (m_sb), the query words
//   (via setQWords), the query terms (via setQTerms), the m_has* field
//   flags, m_docIdRestriction and, for boolean queries, m_expressions[0]
// . returns false and sets g_errno on error. the failure paths visible
//   here are m_sb.reserve(), setQWords(), addExpression() and the second
//   setQTerms() call
// . returns true without parsing anything if "query" is NULL (langId,
//   useQueryStopWords and maxQueryTerms are still stored first)
// . "query" must be NULL terminated
// . "langId" is stored in m_langId; it is needed for doing synonyms
// . "queryExpansion" is stored in m_queryExpansion
// . "useQueryStopWords" is stored in m_useQueryStopWords
// . "maxQueryTerms" is stored in m_maxQueryTerms
// . boolFlag and keepAllSingles are no longer parameters (see the
//   commented-out entries in the signature); they are locals now, with
//   boolFlag starting at 2 and keepAllSingles fixed to true
// . if boolFlag is 0 we ignore all boolean operators
// . if boolFlag is 1 we assume query is boolean
// . if boolFlag is 2 we attempt to detect if query is boolean or not
// . if "keepAllSingles" is true we do not ignore any single word UNLESS
// it is a boolean operator (IGNORE_BOOLOP), fieldname (IGNORE_FIELDNAME)
// a punct word (IGNORE_DEFAULT) or part of one field value (IGNORE_DEFAULT)
// This is used for term highlighting (Highlight.cpp and Summary.cpp)
bool Query::set2 ( char *query ,
//int32_t queryLen ,
//char *coll ,
//int32_t collLen ,
//char boolFlag ,
//bool keepAllSingles ,
// need language for doing synonyms
uint8_t langId ,
char queryExpansion ,
bool useQueryStopWords ,
int32_t maxQueryTerms ) {
// remember the caller's settings; this happens even for a NULL query
m_langId = langId;
m_useQueryStopWords = useQueryStopWords;
// fix summary rerank and highlighting.
bool keepAllSingles = true;
m_maxQueryTerms = maxQueryTerms;
// assume boolean auto-detect.
char boolFlag = 2;
// come back up here if we changed our boolean minds
// top:
// a NULL query is not treated as an error, there is just nothing to do
if ( ! query ) return true;
// set to 256 for synonyms?
//m_maxQueryTerms = 256;
m_queryExpansion = queryExpansion;
int32_t queryLen = gbstrlen(query);
// override this to 32 at least for now
//if ( m_maxQueryTerms < 32 ) m_maxQueryTerms = 32;
// save collection info
//m_coll = coll;
//m_collLen = collLen;
// truncate query if too big
// NOTE(review): the log() call below looks cut off in this view (no
// argument list, no closing paren) and the if has no closing brace --
// confirm against the full file.
if ( queryLen >= ABS_MAX_QUERY_LEN ) {
log("query: Query length of %"INT32" must be "
"less than %"INT32". "
queryLen = ABS_MAX_QUERY_LEN - 1;
m_truncated = true;
// save original query
// NOTE(review): the result of m_osb.reserve() is not checked here, unlike
// the m_sb.reserve() call further down -- confirm that is intended.
m_osb.setBuf ( m_otmpBuf , 128 , 0 , false );
m_osb.setLabel ("oqbuf" );
m_osb.reserve ( queryLen + 1 );
m_osb.safeMemcpy ( query , queryLen );
m_osb.nullTerm ();
//m_origLen = queryLen;
//gbmemcpy ( m_orig , query , queryLen );
//m_orig [ m_origLen ] = '\0';
// m_orig now points into m_osb rather than at the caller's string
m_orig = m_osb.getBufStart();
m_origLen = m_osb.getLength();
log(LOG_DEBUG, "query: set called = %s", m_orig);
char *q = query;
// see if it should be boolean...
// looks for an upper-case AND, OR or NOT that is directly followed by a
// space or a '('. reads up to q[i+3]; the && chains stop at the first
// mismatch, which includes the terminating NUL.
for ( int32_t i = 0 ; i < queryLen ; i++ ) {
// but if bool flag is 0 that means it is NOT boolean!
// it must be one for autodetection. so do not autodetect
// unless this is 2.
if ( boolFlag != 2 ) break;
if ( q[i]=='A' && q[i+1]=='N' && q[i+2]=='D' &&
(q[i+3]==' ' || q[i+3]=='(') )
boolFlag = 1;
if ( q[i]=='O' && q[i+1]=='R' &&
(q[i+2]==' ' || q[i+2]=='(') )
boolFlag = 1;
if ( q[i]=='N' && q[i+1]=='O' && q[i+2]=='T' &&
(q[i+3]==' ' || q[i+3]=='(') )
boolFlag = 1;
// if we did not set the flag to 1 set it to 0. force to non-bool
if ( boolFlag == 2 ) boolFlag = 0;
// come back up here if we find no bool operators but had ()'s
// top:
// reset anything that was allocated... in case we're being
// called from below... m_qwords may have been allocated in call
// to setQWords() below
// NO! this resets m_origLen to 0!!! not to mention other member vars
// that were set somewhere above!!! i moved top: label above!
// reserve some space, guessing how much we'd need
int32_t need = queryLen * 2 + 32;
if ( ! m_sb.reserve ( need ) )
return false;
// convenience ptr
//char *p = m_buf;
//char *pend = m_buf + MAX_QUERY_LEN;
// toggled on every double quote seen in the copy loop below
bool inQuotesFlag = false;
// . copy query into m_buf
// . translate ( and ) to special query operators so Words class
// can parse them as their own word to make parsing bool queries ez
// for parsing out the boolean operators in setBitScoresBoolean()
// . the copy goes into m_sb now; the m_buf code is commented out
for ( int32_t i = 0 ; i < queryLen ; i++ ) {
// gotta count quotes! we ignore operators in quotes
// so you can search for diffbotUri:"article|0|123456"
if ( query[i] == '\"' ) inQuotesFlag = !inQuotesFlag;
// NOTE(review): the body of this if and the closing braces of the
// translation branches below are not visible in this view -- confirm
// against the full file.
if ( inQuotesFlag ) {
//*p = query [i];
// dst buf must be big enough
// if ( p + 8 >= pend ) {
// g_errno = EBUFTOOSMALL;
// return log(LOG_LOGIC,"query: query: query too big.");
// }
// translate ( and )
if ( boolFlag == 1 && query[i] == '(' ) {
//gbmemcpy ( p , " LeFtP " , 7 ); p += 7;
m_sb.safeMemcpy ( " LeFtP " , 7 );
if ( boolFlag == 1 && query[i] == ')' ) {
//gbmemcpy ( p , " RiGhP " , 7 ); p += 7;
m_sb.safeMemcpy ( " RiGhP " , 7 );
if ( query[i] == '|' ) {
//gbmemcpy ( p , " PiiPE " , 7 ); p += 7;
m_sb.safeMemcpy ( " PiiPE " , 7 );
// translate [#a] [#r] [#ap] [#rp] [] [p] to operators
if ( query[i] == '[' && is_digit(query[i+1])) {
// j ends up on the first non-digit after the number that follows '['
int32_t j = i+2;
int32_t val = atol ( &query[i+1] );
while ( is_digit(query[j]) ) j++;
char c = query[j];
if ( (c == 'a' || c == 'r') && query[j+1]==']' ) {
//sprintf ( p , " LeFtB %"INT32" %c RiGhB ",
m_sb.safePrintf(" LeFtB %"INT32" %c RiGhB ",
//p += gbstrlen(p);
// leave i on the closing ']'
i = j + 1;
else if ( (c == 'a' || c == 'r') &&
query[j+1]=='p' && query[j+2]==']') {
//sprintf ( p , " LeFtB %"INT32" %cp RiGhB ",
m_sb.safePrintf(" LeFtB %"INT32" %cp RiGhB ",
//p += gbstrlen(p);
// leave i on the closing ']'
i = j + 2;
if ( query[i] == '[' && query[i+1] == ']' ) {
//sprintf ( p , " LeFtB RiGhB ");
//p += gbstrlen(p);
m_sb.safePrintf ( " LeFtB RiGhB ");
i = i + 1;
if ( query[i] == '[' && query[i+1] == 'p' && query[i+2]==']') {
//sprintf ( p , " LeFtB RiGhB ");
//p += gbstrlen(p);
m_sb.safePrintf ( " LeFtB RiGhB ");
i = i + 2;
// NOTE(review): this q shadows the q declared before the boolean
// detection loop above.
char *q = &(query[i]);
// Skip old buzz permalink keywords
if (*q == 'g' && *(q+1) == 'b'){
// do not skip anymore, Msg5e.cpp needs this
// matches the literal text gbpermalink:1 and only logs about it, once
if (*(q+2) == 'p' && *(q+3) == 'e' && *(q+4) == 'r'
&& *(q+5) == 'm' && *(q+6) == 'a' && *(q+7) == 'l'
&& *(q+8) == 'i' && *(q+9) == 'n' && *(q+10) == 'k'
&& *(q+11) == ':' && *(q+12) =='1'){
//i += 12;
static bool s_printed = false;
if ( ! s_printed )
logf(LOG_DEBUG,"query: skipping "
"gbpermalink term for buzz.");
if ( ! s_printed ) s_printed = true;
// matches the literal text gbkeyword:r36p1 and advances i by 14
if (*(q+2)=='k' && *(q+3)=='e' && *(q+4) == 'y'
&& *(q+5)=='w' && *(q+6)=='o' && *(q+7) == 'r'
&& *(q+8) == 'd' && *(q+9) == ':'
&& *(q+10)=='r' && *(q+11)=='3' && *(q+12)=='6'
&& *(q+13) == 'p' && *(q+14) == '1'){
//logf(LOG_DEBUG,"query: skipping funky "
// "keyword term for buzz.");
i += 14;
// TODO: copy altavista's operators here? & | !
// otherwise, just a plain copy
// *p = query [i];
// p++;
m_sb.pushChar ( query[i] );
// NULL terminate
//*p = '\0';
// debug statement
//log(LOG_DEBUG,"Query: Got new query=%s",tempBuf);
//printf("query: query: Got new query=%s\n",tempBuf);
// set length
//m_bufLen = p - m_buf;
//m_buf = m_sb.getBufStart();
//m_bufLen = m_sb.length();
// these two are handed to setQWords() and to both setQTerms() calls below
Words words;
Phrases phrases;
// set m_qwords[] array from m_buf
if ( ! setQWords ( boolFlag , keepAllSingles , words , phrases ) )
return false;
//log(LOG_DEBUG, "Query: QWords set");
// did we have any boolean operators
// NOTE(review): the if / else-if arms in the scan below have no visible
// bodies, so nothing visible ever sets "found" or "parens" -- confirm
// against the full file.
char found = 0;
char parens = 0;
if ( boolFlag == 1 ) {
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
char *w = m_qwords[i].m_word;
int32_t wlen = m_qwords[i].m_wordLen;
if (wlen==2 &&w[0]=='O'&&w[1]=='R' )
else if (wlen==3 &&w[0]=='A'&&w[1]=='N'&&w[2]=='D')
else if (wlen==3 &&w[0]=='N'&&w[1]=='O'&&w[2]=='T')
if (wlen==5 &&w[0]=='L' && w[1]=='e' &&
w[2]=='F' && w[3]=='t' && w[4]=='P' )
else if (wlen==5 &&w[0]=='R' && w[1]=='i' &&
w[2]=='G' && w[3]=='h' && w[4]=='P' )
// if we were told it was a bool query or to auto-detect
// and it has no operators, but had parens, re-do so parens
// do not get translated to LeFtP or RiGhP
// NOTE(review): both "top:" labels above exist only inside comments, so
// this goto has no visible target -- confirm against the full file.
if ( boolFlag >= 1 && found == 0 && parens == 1 ) {
boolFlag = 0; goto top; }
// if no bool operators, it's definitely not a boolean query
if ( found == 0 ) boolFlag = 0;
// set m_qterms from m_qwords, always succeeds
// NOTE(review): setQTerms() does have "return false" paths (for example
// when m_stackBuf.reserve() fails), and the second call near the end of
// this function checks its result; the result is ignored here.
setQTerms ( words , phrases );
// . now add in compound termlists
// . compound query terms replace lists of UOR'd query terms that
// share the same QueryTerm::m_exclusiveBit (ebit)
// . if it cannot get the compound termlist from a remote cache, then
// Msg2 should get its components
// . component termlists have their compound termlist number
// as their m_componentCode, compound termlists have a componentCode
// of -1, other termlists have a componentCode of -2.
// . Query::addCompoundTerms() will add one extra query term for every
// sequence of UOR'd query terms that share the same ebit.
// Furthermore, it sets the m_componentCodes[] array.
// . The compound term must have the same ebit as its component terms.
// . we use the termid of compound termlists (and NOT their components)
// when routing this query to the host that can use the least
// amount of bandwidth to download/get the termlists. if the compound
// termlist is not in the cache then it will not be on disk or
// in the tree since it is a virtual termlist, BUT we will still
// create it and store it in the cache, so assume it is in a cache,
// because the act of storing it in the cache may require sending
// it to another machine.
// . if m_compoundListMaxSize is 0, do not do compound lists
// . Query::addCompoundTerms() will set the termfreq of compound terms
// to the sum of the termfreqs of its component termlists
//if ( m_compoundListMaxSize > 0 ) addCompoundTerms( );
// . always add them for now
//addCompoundTerms( );
// if m_isBoolean was set and we only have OP_UOR then
// we should probably unset it here (mdw)
// set m_expressions[] and m_operands[] arrays and m_numOperands
// for boolean queries
//if ( m_isBoolean )
// if ( ! setBooleanOperands() ) return false;
// disable stuff for site:, ip: and url: queries
// . only words that are not ignored are looked at
// . site: counts only when the word sign is not '-'
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord ) continue;
if ( qw->m_fieldCode == FIELD_SITE &&
qw->m_wordSign != '-' )
m_hasPositiveSiteField = true;
else if ( qw->m_fieldCode == FIELD_IP )
m_hasIpField = true;
else if ( qw->m_fieldCode == FIELD_URL )
m_hasUrlField = true;
else if ( qw->m_fieldCode == FIELD_ILINK )
m_hasIlinkField = true;
else if ( qw->m_fieldCode == FIELD_GBLANG )
m_hasGBLangField = true;
else if ( qw->m_fieldCode == FIELD_GBCOUNTRY )
m_hasGBCountryField = true;
else if ( qw->m_fieldCode == FIELD_QUOTA )
m_hasQuotaField = true;
else if ( qw->m_fieldCode == FIELD_SUBURL )
m_hasSubUrlField = true;
else if ( qw->m_fieldCode == FIELD_SUBURL2 )
m_hasSubUrlField = true;
// set m_docIdRestriction if a term is gbdocid:
// the loop condition skips this entirely for boolean queries
for ( int32_t i = 0 ; i < m_numTerms && ! m_isBoolean ; i++ ) {
// get it
QueryTerm *qt = &m_qterms[i];
// gbdocid:?
if ( qt->m_fieldCode != FIELD_GBDOCID ) continue;
// get docid
// presumably the + 8 steps over the 8 characters of "gbdocid:" --
// TODO confirm that m_term starts at the field name
char *ds = m_qterms[i].m_term + 8;
m_docIdRestriction = atoll(ds);
//uint32_t gid;
uint32_t shard = getShardNumFromDocId(m_docIdRestriction);
//gid = g_hostdb.getGroupIdFromDocId(m_docIdRestriction);
//m_groupThatHasDocId = g_hostdb.getGroup(gid);
m_groupThatHasDocId = g_hostdb.getShard ( shard );
// . keep it simple for now
// . we limit to MAX_EXPRESSIONS to like 10 now i guess
// . a boolean query gets a single expression spanning all the words
if ( m_isBoolean ) {
m_numExpressions = 1;
if ( ! m_expressions[0].addExpression ( 0 ,
m_numWords ,
this , // Query
0 ) ) // level
// return false with g_errno set on error
return false;
// . if it is not truncated, no need to use hard counts
// . comment this line and the next one out for testing hard counts
if ( ! m_truncated ) return true;
// if got truncated AND under the HARD max, nothing we can do, it
// got cut off due to m_maxQueryTerms limit in Parms.cpp
if ( m_numTerms < (int32_t)MAX_EXPLICIT_BITS ) return true;
// if they just hit the admin's ceiling, there's nothing we can do
if ( m_numTerms >= m_maxQueryTerms ) return true;
// a temp log message
log(LOG_DEBUG,"query: Encountered %"INT32" query terms.",m_numTerms);
// otherwise, we're below m_maxQueryTerms BUT above MAX_QUERY_TERMS
// so we can use hard counts to get more power...
// . use the hard count for excessive query terms to save explicit bits
// . just look for operands on the first level that are not OR'ed
// set to 1 as soon as any word gets marked with a hard count
char redo = 0;
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
// get the ith word
QueryWord *qw = &m_qwords[i];
// mark him as NOT hard required
qw->m_hardCount = 0;
// skip if not on first level
if ( qw->m_level != 0 ) continue;
// stop at first OR on this level
if ( qw->m_opcode == OP_OR ) break;
// skip all punct
if ( qw->m_isPunct ) continue;
// if we are a boolean query,the next operator can NOT be OP_OR
// because we can not use terms that are involved in an OR
// as a hard count term, because they are not required terms
for ( int32_t j=i+1 ; m_isBoolean && j<m_numWords; j++ ) {
// skip words that are not operators, stop at the next operator
char opcode = m_qwords[j].m_opcode;
if ( ! opcode ) continue;
if ( opcode != OP_OR ) break;
// otherwise, the next operator is an OR, so do not
// use a hard count for this term
// NOTE(review): no "stop:" label is visible in this view --
// confirm against the full file.
goto stop;
// mark him as required, so he won't use an explicit bit now
qw->m_hardCount = 1;
// mark it so we can reduce our number of explicit bits used
redo = 1;
// if nothing changed, return now
if ( ! redo ) return true;
// . set the query terms again if we have a long query
// . if QueryWords has m_hardCount set, ensure the explicit bit is 0
// . non-quoted phrases that contain a "required" single word should
// themselves have 0 for their implicit bits, BUT 0x8000 for their
// explicit bit
if ( ! setQTerms ( words , phrases ) )
return false;
// a temp log message
//log(LOG_DEBUG,"query: Compressed to %"INT32" query terms, %"INT32" hard. "
// "(nt=%"INT32")",
// m_numExplicitBits,m_numTerms-m_numExplicitBits,m_numTerms);
//if ( ! m_isBoolean ) return true;
// free cuz it was already set
//if ( m_expressionsAllocSize )
// mfree(m_expressions,m_expressionsAllocSize , "Query" );
//m_expressionsAllocSize = 0;
//m_expressions = NULL;
// also set the boolean stuff again too!
//if ( ! setBooleanOperands() ) return false;
return true;
// . count how many query terms are required so PageResults will know if
//   he should offer a default OR alternative search if no more results
//   for the default AND (rat=1)
// . a term is NOT counted when it has a '-' sign, when it is a phrase
//   with no sign, or when it is a synonym of another term
// . the count is cached in m_numRequired; a negative m_numRequired means
//   it has not been computed yet
// . NOTE(review): reset() in this file has "m_numRequired = -1" commented
//   out, so confirm something re-arms this cache when the query is set
//   again, otherwise a stale count is returned
// . returns the number of required terms (0 if there are none)
int32_t Query::getNumRequired ( ) {
	// use the cached answer if we have one
	if ( m_numRequired >= 0 ) return m_numRequired;
	m_numRequired = 0;
	for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
		// QueryTerms are derived from QueryWords
		QueryTerm *qt = &m_qterms[i];
		// don't require if negative
		if ( qt->m_termSign == '-' ) continue;
		// skip signless phrases
		if ( qt->m_isPhrase && qt->m_termSign == '\0' ) continue;
		// synonyms are alternatives to a term, not requirements
		if ( qt->m_synonymOf ) continue;
		// count it up
		m_numRequired++;
	}
	return m_numRequired;
}
// returns false and sets g_errno on error
bool Query::setQTerms ( Words &words , Phrases &phrases ) {
//int32_t shift = 0;
// . set m_qptrs/m_qtermIds/m_qbits
// . use one bit position for each phraseId and wordId
// . first set phrases
int32_t n = 0;
// what is the max value for "shift"?
int32_t max = (int32_t)MAX_EXPLICIT_BITS;
if ( max > m_maxQueryTerms ) max = m_maxQueryTerms;
// count phrases first for allocating
int32_t nqt = 0;
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
// skip if ignored... mdw...
if ( ! qw->m_phraseId ) continue;
if ( qw->m_ignorePhrase ) continue; // could be a repeat
// none if weight is absolute zero
if ( qw->m_userWeightPhrase == 0 &&
qw->m_userTypePhrase == 'a' ) continue;
// count single terms
for ( int32_t i = 0 ; i < m_numWords; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord &&
qw->m_ignoreWord != IGNORE_QSTOP) continue;
// ignore if in quotes and part of phrase, watch out
// for things like "word", a single word in quotes.
if ( qw->m_quoteStart >= 0 && qw->m_phraseId ) continue;
// if we are not start of quote and NOT in a phrase we
// must be the tailing word i guess.
// fixes '"john smith" -"bob dole"' from having
// smith and dole as query terms.
if ( qw->m_quoteStart >= 0 && qw->m_quoteStart != i )
// ignore if weight is absolute zero
if ( qw->m_userWeight == 0 &&
qw->m_userType == 'a' ) continue;
// thirdly, count synonyms
Synonyms syn;
int32_t sn = 0;
if ( m_queryExpansion ) sn = m_numWords;
int64_t to = hash64n("to",0LL);
for ( int32_t i = 0 ; i < sn ; i++ ) {
// get query word
QueryWord *qw = &m_qwords[i];
// skip if in quotes, we will not get synonyms for it
if ( qw->m_inQuotes ) continue;
// skip if has plus sign in front
if ( qw->m_wordSign == '+' ) continue;
// not '-' either i guess
if ( qw->m_wordSign == '-' ) continue;
// no url: stuff, maybe only title
if ( qw->m_fieldCode &&
qw->m_fieldCode != FIELD_TITLE &&
qw->m_fieldCode != FIELD_GENERIC )
// skip if ignored like a stopword (stop to->too)
//if ( qw->m_ignoreWord ) continue;
// ignore title: etc. words, they are field names
if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) continue;
// ignore boolean operators
if ( qw->m_ignoreWord ) continue;// IGNORE_BOOLOP
// no, hurts 'Greencastle IN economic development'
if ( qw->m_wordId == to ) continue;
// single letters...
if ( qw->m_wordLen == 1 ) continue;
// set the synonyms for this word
char tmpBuf [ TMPSYNBUFSIZE ];
int32_t naids = syn.getSynonyms ( &words ,
i ,
// language of the query.
// 0 means unknown. if this
// is 0 we sample synonyms
// from all languages.
m_langId ,
tmpBuf ,
0 ); // m_niceness );
// if no synonyms, all done
if ( naids <= 0 ) continue;
nqt += naids;
m_numTermsUntruncated = nqt;
if ( nqt > m_maxQueryTerms ) nqt = m_maxQueryTerms;
// allocate the stack buf
if ( nqt ) {
int32_t need = nqt * sizeof(QueryTerm) ;
if ( ! m_stackBuf.reserve ( need ) )
return false;
char *pp = m_stackBuf.getBufStart();
m_qterms = (QueryTerm *)pp;
pp += sizeof(QueryTerm);
if ( pp > m_stackBuf.getBufEnd() ) { char *xx=NULL;*xx=0; }
// call constructor on each one here
for ( int32_t i = 0 ; i < nqt ; i++ ) {
QueryTerm *qt = &m_qterms[i];
// count phrase terms
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
// break out if no more explicit bits!
if ( shift >= max ) {
log("query: Query1 has more than %"INT32" unique terms. "
m_truncated = true;
QueryWord *qw = &m_qwords[i];
// skip if ignored... mdw...
if ( ! qw->m_phraseId ) continue;
if ( qw->m_ignorePhrase ) continue; // could be a repeat
// none if weight is absolute zero
if ( qw->m_userWeightPhrase == 0 &&
qw->m_userTypePhrase == 'a' ) continue;
// stop breach
if ( n >= ABS_MAX_QUERY_TERMS ) {
log("query: lost query phrase terms to max term "
"limit of %"INT32"",(int32_t)ABS_MAX_QUERY_TERMS );
if ( n >= m_maxQueryTerms ) {
log("query: lost query phrase terms to max term cr "
"limit of %"INT32"",(int32_t)m_maxQueryTerms);
QueryTerm *qt = &m_qterms[n];
qt->m_qword = qw ;
qt->m_piped = qw->m_piped;
qt->m_isPhrase = true ;
qt->m_isUORed = false;
qt->m_UORedTerm = NULL;
qt->m_synonymOf = NULL;
qt->m_ignored = 0;
qt->m_term = NULL;
qt->m_termLen = 0;
qt->m_langIdBitsValid = false;
qt->m_langIdBits = 0;
// assume not a repeat of another query term (set below)
qt->m_repeat = false;
// stop word? no, we're a phrase term
qt->m_isQueryStopWord = false;
// change in both places
qt->m_termId = qw->m_phraseId & TERMID_MASK;
//m_termIds[n] = qw->m_phraseId & TERMID_MASK;
//log(LOG_DEBUG, "Setting query phrase term id %d: %lld", n, m_termIds[n]);
qt->m_rawTermId = qw->m_rawPhraseId;
// assume explicit bit is 0
qt->m_explicitBit = 0;
qt->m_matchesExplicitBits = 0;
// boolean queries are not allowed term signs for phrases
// UNLESS it is a '*' soft require sign which we need for
// phrases like: "cat dog" AND pig
if ( m_isBoolean && qw->m_phraseSign != '*' ) {
qt->m_termSign = '\0';
//m_termSigns[n] = '\0';
// if not boolean, ensure to change signs in both places
else {
qt->m_termSign = qw->m_phraseSign;
//m_termSigns[n] = qw->m_phraseSign;
// int32_t pw = i-1;
// // . back up until word that contains quote if in a quoted
// // phrase
// // . UOR can only support two word phrases really...
// if (m_qwords[i].m_quoteStart >= 0)
// pw = m_qwords[i].m_quoteStart - 1;
// if ( pw >= 0 && m_qwords[pw].m_quoteStart >= 0 )
// pw = m_qwords[pw].m_quoteStart - 1;
// // back two more if field
// //if ( pw >= 0 && m_qwords[pw].m_ignoreWord==IGNORE_FIELDNAME )
// // pw -= 2;
// while (pw>0 &&
// ((m_qwords[pw].m_ignoreWord == IGNORE_DEFAULT) ||
// (m_qwords[pw].m_ignoreWord == IGNORE_FIELDNAME))) pw--;
// // is UOR operator? if so, backup over it
// if ( pw >= 0 && m_qwords[pw].m_opcode == OP_UOR ) pw -= 2;
// else goto notUORPhrase;
// if ( pw < 0 ) goto notUORPhrase;
// // . if previous term is UOR'd with us then share the same ebit
// // . this allows us to use lots of UOR'd query terms
// // . the UOR'd lists may also be merged together into a single
// // list if "mergeListMaxSize" is positive
// // if ( n >= 1 &&
// // i >= 4 &&
// // //m_qterms[n-1].m_qword == &m_qwords[pw] &&
// // shift > 0 &&
// // qw->m_hardCount == 0 )
// // shift--;
// // set the UOR term sign
// qt->m_isUORed = true;
// notUORPhrase:
// do not use an explicit bit up if we have a hard count
qt->m_hardCount = qw->m_hardCount;
// if ( qw->m_hardCount == 0 ) {
// qt->m_explicitBit = 1 << shift ;
// shift++;
// }
qw->m_queryWordTerm = NULL;
// IndexTable.cpp uses this one
qt->m_inQuotes = qw->m_inQuotes;
// point to the string itself that is the phrase
qt->m_term = qw->m_word;
qt->m_termLen = qw->m_phraseLen;
// the QueryWord should have a direct link to the QueryTerm,
// at least for phrase, so we can OR in the bits of its
// constituents in the for loop below
qw->m_queryPhraseTerm = qt ;
// include ourselves in the implicit bits
// qt->m_implicitBits = qt->m_explicitBit;
// doh! gotta reset to 0
qt->m_implicitBits = 0;
// assume not under a NOT bool op
//qt->m_underNOT = false;
// assign score weight, we're a phrase here
qt->m_userWeight = qw->m_userWeightPhrase ;
qt->m_userType = qw->m_userTypePhrase ;
qt->m_fieldCode = qw->m_fieldCode ;
// stuff before a pipe always has a weight of 1
if ( qt->m_piped ) {
qt->m_userWeight = 1;
qt->m_userType = 'a';
// debug
//char tmp[1024];
//gbmemcpy ( tmp , qt->m_term , qt->m_termLen );
//tmp [ qt->m_termLen ] = 0;
//logf(LOG_DEBUG,"got term %s (%"INT32")",tmp,qt->m_termLen);
// otherwise, add it
// now if we have enough room, do the singles
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
// break out if no more explicit bits!
if ( shift >= max ) {
"query: Query2 has more than %"INT32" unique terms. "
m_truncated = true;
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord &&
qw->m_ignoreWord != IGNORE_QSTOP) continue;
// if ( qw->m_ignoreWord ) continue;
// ignore if in quotes
//if ( qw->m_quoteStart >= 0 ) continue;
// ignore if in quotes and part of phrase, watch out
// for things like "word", a single word in quotes.
if ( qw->m_quoteStart >= 0 && qw->m_phraseId ) continue;
// if we are not start of quote and NOT in a phrase we
// must be the tailing word i guess.
// fixes '"john smith" -"bob dole"' from having
// smith and dole as query terms.
if ( qw->m_quoteStart >= 0 && qw->m_quoteStart != i )
// ignore if weight is absolute zero
if ( qw->m_userWeight == 0 &&
qw->m_userType == 'a' ) continue;
// stop breach
if ( n >= ABS_MAX_QUERY_TERMS ) {
log("query: lost query terms to max term "
"limit of %"INT32"",(int32_t)ABS_MAX_QUERY_TERMS );
if ( n >= m_maxQueryTerms ) {
log("query: lost query terms to max term cr "
"limit of %"INT32"",(int32_t)m_maxQueryTerms);
QueryTerm *qt = &m_qterms[n];
qt->m_qword = qw ;
qt->m_piped = qw->m_piped;
qt->m_isPhrase = false ;
qt->m_isUORed = false;
qt->m_UORedTerm = NULL;
qt->m_synonymOf = NULL;
// ignore some synonym terms if tf is too low
qt->m_ignored = qw->m_ignoreWord;
// qt->m_ignored = 0;
// assume not a repeat of another query term (set below)
qt->m_repeat = false;
// stop word? no, we're a phrase term
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
// change in both places
qt->m_termId = qw->m_wordId & TERMID_MASK;
//m_termIds[n] = qw->m_wordId & TERMID_MASK;
qt->m_rawTermId = qw->m_rawWordId;
// assume explicit bit is 0
qt->m_explicitBit = 0;
qt->m_matchesExplicitBits = 0;
//log(LOG_DEBUG, "Setting query phrase term id %d: %lld raw: %lld", n, m_termIds[n], qt->m_rawTermId);
// boolean queries are not allowed term signs
if ( m_isBoolean ) {
qt->m_termSign = '\0';
//m_termSigns[n] = '\0';
// boolean fix for "health OR +sports" because
// the + there means exact word match, no synonyms.
if ( qw->m_wordSign == '+' ) {
qt->m_termSign = qw->m_wordSign;
//m_termSigns[n] = qw->m_wordSign;
// if not boolean, ensure to change signs in both places
else {
qt->m_termSign = qw->m_wordSign;
//m_termSigns[n] = qw->m_wordSign;
// get previous text word
//int32_t pw = i - 2;
int32_t pw = i-1;
// // . back up until word that contains quote if in a quoted
// // phrase
// // . UOR can only support two word phrases really...
if (m_qwords[i].m_quoteStart >= 0)
pw = m_qwords[i].m_quoteStart ;
if ( pw > 0 ) pw--;
// back two more if field
int32_t fieldStart=-1;
int32_t fieldLen=0;
if ( pw == 0 && m_qwords[pw].m_ignoreWord==IGNORE_FIELDNAME)
fieldStart = pw;
if ( pw > 0&& m_qwords[pw-1].m_ignoreWord==IGNORE_FIELDNAME ){
pw -= 1;
fieldStart = pw;
while (pw>0 &&
((m_qwords[pw].m_ignoreWord == IGNORE_FIELDNAME))) {
fieldStart = pw;
// skip if it is punct. fixes queries like
// "(this OR that)" from including '(' or from including
// a space.
if ( fieldStart >-1 &&
m_qwords[fieldStart].m_isPunct &&
fieldStart+1<m_numWords )
if (fieldStart > -1) {
pw = i;
while (pw < m_numWords && m_qwords[pw].m_fieldCode)
fieldLen = m_qwords[pw-1].m_word +
m_qwords[pw-1].m_wordLen -
// // is UOR operator? if so, backup over it
// if ( pw >= 0 && m_qwords[pw].m_opcode == OP_UOR ){
// pw -= 2;
// }
// else goto notUOR;
// if ( pw < 0 ) goto notUOR;
// // . if previous term is UOR'd with us then share the same ebit
// // . this allows us to use lots of UOR'd query terms
// // . the UOR'd lists may also be merged together into a single
// // list if "mergeListMaxSize" is positive
// // if ( n >= 1 &&
// // i >= 4 &&
// // //m_qterms[n-1].m_qword == &m_qwords[pw] &&
// // shift > 0 &&
// // qw->m_hardCount == 0 )
// // shift--;
// // set the UOR term sign
// qt->m_isUORed = true;
// if (m_qwords[pw].m_queryWordTerm)
// m_qwords[pw].m_queryWordTerm->m_isUORed = true;
// if (m_qwords[pw].m_queryPhraseTerm)
// m_qwords[pw].m_queryPhraseTerm->m_isUORed = true;
// notUOR:
// do not use an explicit bit up if we have a hard count
qt->m_hardCount = qw->m_hardCount;
// if ( qw->m_hardCount == 0 ) {
// qt->m_explicitBit = 1 << shift ;
// shift++;
// }
qw->m_queryWordTerm = qt;
// IndexTable.cpp uses this one
qt->m_inQuotes = qw->m_inQuotes;
// point to the string itself that is the word
if (fieldLen > 0) {
qt->m_term = m_qwords[fieldStart].m_word;
qt->m_termLen = fieldLen;
// fix for query
// text:"" foo bar ""
if ( pw-1 < i ) {
log("query: bad query %s",m_orig);
return false;
// skip past the end of the field value
i = pw-1;
else {
qt->m_termLen = qw->m_wordLen;
qt->m_term = qw->m_word;
//log(LOG_DEBUG, "query: *** term \"%s\"", u8Buf);
// reset our implicit bits to 0
qt->m_implicitBits = 0;
// // . OR ourselves into our parent phrase's m_implicitBits
// // . this makes setting m_bitScores[] easy because if a
// // doc contains this prhase then it IMPLICITLY contains us
// // which will make it easier to satisfy requiredBits
// if ( qw->m_queryPhraseTerm )
// qw->m_queryPhraseTerm->m_implicitBits |=
// qt->m_explicitBit;
// // if we're in the middle of the phrase
// int32_t pn = qw->m_leftPhraseStart;
// // convert word to its phrase QueryTerm ptr, if any
// QueryTerm *tt = NULL;
// if ( pn >= 0 ) tt = m_qwords[pn].m_queryPhraseTerm;
// if ( tt ) tt->m_implicitBits |= qt->m_explicitBit;
// // . there might be some phrase term that actually contains
// // the same word as we are, but a different occurrence
// // . like '"knowledge management" AND NOT management' query
// for ( int32_t j = 0 ; j < i ; j++ ) {
// // must be our same wordId (same word, different occ.)
// QueryWord *qw2 = &m_qwords[j];
// if ( qw2->m_wordId != qw->m_wordId ) continue;
// // get first word in the phrase that jth word is in
// int32_t pn2 = qw2->m_leftPhraseStart;
// if ( pn2 < 0 ) continue;
// // he implies us!
// QueryTerm *tt2 = m_qwords[pn2].m_queryPhraseTerm;
// if ( tt2 ) tt2->m_implicitBits |= qt->m_explicitBit;
// break;
// }
// assume not under a NOT bool op
//qt->m_underNOT = false;
// assign score weight, we're a phrase here
qt->m_userWeight = qw->m_userWeight ;
qt->m_userType = qw->m_userType ;
qt->m_fieldCode = qw->m_fieldCode ;
// stuff before a pipe always has a weight of 1
if ( qt->m_piped ) {
qt->m_userWeight = 1;
qt->m_userType = 'a';
// debug
//char tmp[1024];
//gbmemcpy ( tmp , qt->m_term , qt->m_termLen );
//tmp [ qt->m_termLen ] = 0;
//logf(LOG_DEBUG,"got term %s (%"INT32")",tmp,qt->m_termLen);
// now handle the explicit bits
// moved out of separate phrase and singleton loops
// for phrase UOR support
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
// break out if no more explicit bits!
// if ( shift >= max ) {
// log("query: Query has more than %"INT32" unique terms. "
// "Truncating.",max);
// m_truncated = true;
// break;
// }
int32_t pw;
QueryWord *qw = &m_qwords[i];
if (!qw->m_queryWordTerm && !qw->m_queryPhraseTerm)
QueryTerm *qt = qw->m_queryPhraseTerm?
qw->m_queryPhraseTerm :
if (!qt) continue;
pw = i-1;
// . back up until word that contains quote if in a quoted
// phrase
// . UOR can only support two word phrases really...
//if (m_qwords[i].m_quoteStart >= 0)
// pw = m_qwords[i].m_quoteStart - 1;
while (pw>0 &&
((m_qwords[pw].m_ignoreWord == IGNORE_DEFAULT) ||
(m_qwords[pw].m_ignoreWord == IGNORE_FIELDNAME))) pw--;
// is UOR operator? if so, backup over it
if ( pw < 0 || m_qwords[pw].m_opcode != OP_UOR )
goto notUOR;
while (pw>0 &&
((m_qwords[pw].m_ignoreWord == IGNORE_DEFAULT) ||
(m_qwords[pw].m_ignoreWord == IGNORE_FIELDNAME))) pw--;
if ( pw >= 0 && m_qwords[pw].m_quoteStart >= 0 )
//pw = m_qwords[pw].m_quoteStart + 1;
pw = m_qwords[pw].m_quoteStart;
if (pw < 0) goto notUOR;
// . if previous term is UOR'd with us then share the same ebit
// . this allows us to use lots of UOR'd query terms
// . the UOR'd lists may also be merged together into a single
// list if "mergeListMaxSize" is positive
qt->m_isUORed = true;
// set uor flag on all words in phrase
if (qw->m_queryPhraseTerm && m_qwords[i].m_quoteStart >= 0){
int32_t quoteStart = m_qwords[i].m_quoteStart;
for (int32_t j=quoteStart;j<m_numWords;j++){
if (m_qwords[j].m_ignoreWord) continue;
if (m_qwords[j].m_quoteStart != quoteStart)
QueryTerm *qtp = m_qwords[j].m_queryWordTerm;
if (qtp) {
qtp->m_isUORed = true;
qtp->m_UORedTerm =
//QueryTerm *pqt = NULL;
if (m_qwords[pw].m_queryWordTerm){
m_qwords[pw].m_queryWordTerm->m_isUORed = true;
qt->m_UORedTerm = m_qwords[pw].m_queryWordTerm;
//pqt = m_qwords[pw].m_queryWordTerm;
// set uor flag on all words in previous phrase
if (m_qwords[pw].m_queryPhraseTerm &&
m_qwords[pw].m_quoteStart >= 0) {
m_qwords[pw].m_queryPhraseTerm->m_isUORed = true;
qt->m_UORedTerm = m_qwords[pw].m_queryPhraseTerm;
int32_t quoteStart = m_qwords[pw].m_quoteStart;
for (int32_t j=quoteStart;j<m_numWords;j++){
if (m_qwords[j].m_ignoreWord) continue;
if (m_qwords[j].m_quoteStart != quoteStart)
QueryTerm *qtp = m_qwords[j].m_queryWordTerm;
if (qtp) {
qtp->m_isUORed = true;
qtp->m_UORedTerm =
// if ( n >= 1 &&
// i >= 4 &&
// //m_qterms[n-1].m_qword == &m_qwords[pw] &&
// shift > 0 &&
// qw->m_hardCount == 0 ) {
// shift--;
// }
// if ( qt->m_hardCount == 0 ) {
// // qt->m_explicitBit = 1 << shift ;
// qt->m_explicitBit = shift ;
// shift++;
// }
// // . OR ourselves into our parent phrase's m_implicitBits
// // . this makes setting m_bitScores[] easy because if a
// // doc contains this prhase then it IMPLICITLY contains us
// // which will make it easier to satisfy requiredBits
// if ( qw->m_queryPhraseTerm )
// qw->m_queryPhraseTerm->m_implicitBits |=
// qt->m_explicitBit;
// // if we're in the middle of the phrase
// int32_t pn = qw->m_leftPhraseStart;
// // convert word to its phrase QueryTerm ptr, if any
// QueryTerm *tt = NULL;
// if ( pn >= 0 ) tt = m_qwords[pn].m_queryPhraseTerm;
// if ( tt ) tt->m_implicitBits |= qt->m_explicitBit;
// // . there might be some phrase term that actually contains
// // the same word as we are, but a different occurrence
// // . like '"knowledge management" AND NOT management' query
// for ( int32_t j = 0 ; j < i ; j++ ) {
// // must be our same wordId (same word, different occ.)
// //QueryWord *qw2 = m_qterms[j].m_qword;
// QueryWord *qw2 = &m_qwords[j];
// if ( qw2->m_wordId != qw->m_wordId ) continue;
// // get first word in the phrase that jth word is in
// int32_t pn2 = qw2->m_leftPhraseStart;
// if ( pn2 < 0 ) continue;
// // he implies us!
// QueryTerm *tt2 = m_qwords[pn2].m_queryPhraseTerm;
// if ( tt2 ) tt2->m_implicitBits |= qt->m_explicitBit;
// break;
// }
if (qt == qw->m_queryPhraseTerm){
if ( qw->m_queryWordTerm){
qt = qw->m_queryWordTerm;
goto doAgain;
// Handle exclusive explicit bits only
shift = 0;
int n2 = 0;
for ( int32_t i = 0; i < n ; i++ ){
// break out if no more explicit bits!
if ( shift >= max ) {
"query: Query4 has more than %"INT32" unique terms. "
m_truncated = true;
QueryTerm *qt = &m_qterms[i];
if (qt->m_UORedTerm) continue;
// sometimes UORedTerm is NULL i guess because of IGNORE_BREECH
if ( qt->m_isUORed && qt->m_qword && qt->m_qword->m_ignoreWord )
// Skip duplicate terms before we waste an explicit bit
bool skip=false;
for (int32_t j=0;j<i;j++){
if ( qt->m_termId != m_qterms[j].m_termId ||
qt->m_termSign != m_qterms[j].m_termSign){
skip = true;
qt->m_explicitBit = m_qterms[j].m_explicitBit;
if (skip) continue;
if ( qt->m_hardCount == 0 ) {
qt->m_explicitBit = 1 << shift++;
// count them for doing number of combos
m_numExplicitBits = shift;
// Handle shared explicit bits
for ( int32_t i = 0; i < n ; i++ ){
QueryTerm *qt = &m_qterms[i];
// assume not in a phrase
qt->m_inPhrase = 0;
qt->m_rightPhraseTermNum = -1;
qt->m_leftPhraseTermNum = -1;
qt->m_rightPhraseTerm = NULL;
qt->m_leftPhraseTerm = NULL;
QueryTerm *qt2 = qt->m_UORedTerm;
if (!qt2) continue;
// chase down first term in UOR chain
while (qt2->m_UORedTerm) qt2 = qt2->m_UORedTerm;
//if (!qt2->m_explicitBit) continue;
//qt->m_explicitBit = qt2->m_explicitBit;
//m_numTerms = n2;
// . set implicit bits, m_implicitBits
// . set m_inPhrase
for (int32_t i = 0; i < m_numWords ; i++ ){
QueryWord *qw = &m_qwords[i];
QueryTerm *qt = qw->m_queryWordTerm;
if (!qt) continue;
if ( qw->m_queryPhraseTerm )
qw->m_queryPhraseTerm->m_implicitBits |=
// set flag if in a phrase, and set phrase term num
if ( qw->m_queryPhraseTerm ) {
qt->m_inPhrase = 1;
QueryTerm *pt = qw->m_queryPhraseTerm;
qt->m_rightPhraseTermNum = pt - m_qterms;
qt->m_rightPhraseTerm = pt;
// if we're in the middle of the phrase
int32_t pn = qw->m_leftPhraseStart;
// convert word to its phrase QueryTerm ptr, if any
QueryTerm *tt = NULL;
if ( pn >= 0 ) tt = m_qwords[pn].m_queryPhraseTerm;
if ( tt ) tt->m_implicitBits |= qt->m_explicitBit;
if ( tt ) {
qt->m_inPhrase = 1;
qt->m_leftPhraseTermNum = tt - m_qterms;
qt->m_leftPhraseTerm = tt;
// . there might be some phrase term that actually contains
// the same word as we are, but a different occurrence
// . like '"knowledge management" AND NOT management' query
// . made it from "j < i" into "j < m_numWords" because
// 'test "test bed"' was not working but '"test bed" test'
// was working.
for ( int32_t j = 0 ; j < m_numWords ; j++ ) {
// must be our same wordId (same word, different occ.)
QueryWord *qw2 = &m_qwords[j];
if ( qw2->m_wordId != qw->m_wordId ) continue;
// get first word in the phrase that jth word is in
int32_t pn2 = qw2->m_leftPhraseStart;
// we might be the guy that starts it!
if ( pn2 < 0 && qw2->m_quoteStart != -1 ) pn2 = j;
// if neither is the case, skip this query word
if ( pn2 < 0 ) continue;
// he implies us!
QueryTerm *tt2 = m_qwords[pn2].m_queryPhraseTerm;
if ( tt2 ) tt2->m_implicitBits |= qt->m_explicitBit;
if ( tt2 ) {
qt->m_inPhrase = 1;
qt->m_leftPhraseTermNum = tt2 - m_qterms;
qt->m_leftPhraseTerm = tt2;
// synonym terms should copy all the implicit/explicit bits
// into their implicit bits field
for (int32_t i = 0; i < m_numTerms; i++) {
QueryTerm *qt = &m_qterms[i];
QueryTerm *st = qt->m_synonymOf;
if (!st) continue;
// also, if we are "auto insurance", a synonymOf
// "car insurance", we should also imply "car insurance"'s
// terms, 'car' and 'insurance' for purposes of
// IndexTable2.cpp::getWeightScore()'s calculation of "min".
// Because when finding the "max" score of a word, we also
// allow its phrase and synonyms' scores to compete.
qt->m_implicitBits = st->m_implicitBits | st->m_explicitBit;
// now skip if not a phrase synonym
if ( ! qt->m_isPhrase ) continue;
// . we also imply the two words bookending this phrase, if any
// . so see if the leftSynHash is in the syn list
for ( int32_t k = m_synTerm ; k < m_numTerms ; k++ ) {
// get term
QueryTerm *tt = &m_qterms[k];
// skip if phrase
if ( tt->m_isPhrase ) continue;
// must be synonym
if ( ! tt->m_synonymOf ) continue;
// must match one of our ids
if ( tt->m_qword->m_rawWordId != qt->m_leftRawWordId &&
tt->m_qword->m_rawWordId != qt->m_rightRawWordId )
// we imply it now!
qt->m_implicitBits |= tt->m_explicitBit;
// . add synonym query terms now
// . skip this part if language is unknown i guess
// loop over all words in query and process its synonyms list
//if ( m_langId != langUnknown && m_queryExpansion )
// if lang is "xx" unknown we still do synonyms it just does
// a loop over all languages starting with english
// if ( m_queryExpansion )
// sn = m_numWords;
//int64_t to = hash64n("to",0LL);
for ( int32_t i = 0 ; i < sn ; i++ ) {
// get query word
QueryWord *qw = &m_qwords[i];
// skip if in quotes, we will not get synonyms for it
if ( qw->m_inQuotes ) continue;
// skip if has plus sign in front
if ( qw->m_wordSign == '+' ) continue;
// not '-' either i guess
if ( qw->m_wordSign == '-' ) continue;
// no url: stuff, maybe only title
if ( qw->m_fieldCode &&
qw->m_fieldCode != FIELD_TITLE &&
qw->m_fieldCode != FIELD_GENERIC )
// skip if ignored like a stopword (stop to->too)
//if ( qw->m_ignoreWord ) continue;
// ignore title: etc. words, they are field names
if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) continue;
// ignore boolean operators
if ( qw->m_ignoreWord ) continue;// IGNORE_BOOLOP
// no, hurts 'Greencastle IN economic development'
if ( qw->m_wordId == to ) continue;
// single letters...
if ( qw->m_wordLen == 1 ) continue;
// set the synonyms for this word
char tmpBuf [ TMPSYNBUFSIZE ];
int32_t naids = syn.getSynonyms ( &words ,
i ,
// language of the query.
// 0 means unknown. if this
// is 0 we sample synonyms
// from all languages.
m_langId ,
tmpBuf ,
0 ); // m_niceness );
// if no synonyms, all done
if ( naids <= 0 ) continue;
// sanity
if ( naids > MAX_SYNS ) { char *xx=NULL;*xx=0; }
// now make the buffer to hold them for us
qw->m_synWordBuf.safeMemcpy ( &syn.m_synWordBuf );
// get the term for this word
QueryTerm *origTerm = qw->m_queryWordTerm;
// loop over synonyms for word #i now
for ( int32_t j = 0 ; j < naids ; j++ ) {
// stop breach
if ( n >= ABS_MAX_QUERY_TERMS ) {
log("query: lost synonyms due to max term "
"limit of %"INT32"",
// this happens for 'da da da'
if ( ! origTerm ) continue;
if ( n >= m_maxQueryTerms ) {
log("query: lost synonyms due to max cr term "
"limit of %"INT32"",
// add that query term
QueryTerm *qt = &m_qterms[n];
qt->m_qword = qw; // NULL;
qt->m_piped = qw->m_piped;
qt->m_isPhrase = false ;
qt->m_isUORed = false;
qt->m_UORedTerm = NULL;
qt->m_langIdBits = 0;
// synonym of this term...
qt->m_synonymOf = origTerm;
// nuke this crap since it was done above and we
// missed out!
qt->m_inPhrase = 0;
qt->m_rightPhraseTermNum = -1;
qt->m_leftPhraseTermNum = -1;
qt->m_rightPhraseTerm = NULL;
qt->m_leftPhraseTerm = NULL;
// need this for displaying language of syn in
// the json/xml feed in PageResults.cpp
qt->m_langIdBitsValid = true;
int langId = syn.m_langIds[j];
uint64_t langBit = (uint64_t)1 << langId;
if ( langId >= 64 ) langBit = 0;
qt->m_langIdBits |= langBit;
// need this for Matches.cpp
qt->m_synWids0 = syn.m_wids0[j];
qt->m_synWids1 = syn.m_wids1[j];
int32_t na = syn.m_numAlnumWords[j];
// how many words were in the base we used to
// get the synonym. i.e. if the base is "new jersey"
// then it's 2! and the synonym "nj" has one alnum
// word.
int32_t ba = syn.m_numAlnumWordsInBase[j];
qt->m_numAlnumWordsInSynonym = na;
qt->m_numAlnumWordsInBase = ba;
// crap, "nj" is a synonym of the PHRASE TERM
// bigram "new jersey" not of the single word term
// "new" so fix that.
if ( ba == 2 && origTerm->m_rightPhraseTerm )
qt->m_synonymOf = origTerm->m_rightPhraseTerm;
// ignore some synonym terms if tf is too low
qt->m_ignored = qw->m_ignoreWord;
// assume not a repeat of another query term(set below)
qt->m_repeat = false;
// stop word? no, we're a phrase term
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
// change in both places
int64_t wid = syn.m_aids[j];
// might be in a title: field or something
if ( qw->m_prefixHash ) {
int64_t ph = qw->m_prefixHash;
wid= hash64h(wid,ph);
qt->m_termId = wid & TERMID_MASK;
//m_termIds[n] = wid & TERMID_MASK;
qt->m_rawTermId = syn.m_aids[j];
// assume explicit bit is 0
qt->m_explicitBit = 0;
qt->m_matchesExplicitBits = 0;
// boolean queries are not allowed term signs
if ( m_isBoolean ) {
qt->m_termSign = '\0';
//m_termSigns[n] = '\0';
// boolean fix for "health OR +sports" because
// the + there means exact word match, no syns
if ( qw->m_wordSign == '+' ) {
qt->m_termSign = qw->m_wordSign;
//m_termSigns[n] = qw->m_wordSign;
// if not bool, ensure to change signs in both places
else {
qt->m_termSign = qw->m_wordSign;
//m_termSigns[n] = qw->m_wordSign;
// do not use an explicit bit up if we got a hard count
qt->m_hardCount = qw->m_hardCount;
//qw->m_queryWordTerm = qt;
// IndexTable.cpp uses this one
qt->m_inQuotes = qw->m_inQuotes;
// usually this is right
char *ptr = syn.m_termPtrs[j];
// but if it is NULL that means we transformed the
// word by like removing accent marks and stored
// it in m_synWordBuf, as opposed to just pointing
// to a line in memory of wiktionary-buf.txt.
if ( ! ptr ) {
int32_t off = syn.m_termOffs[j];
if ( off < 0 ) {
char *xx=NULL;*xx=0; }
if ( off > qw->m_synWordBuf.length() ) {
char *xx=NULL;*xx=0; }
// use QueryWord::m_synWordBuf which should
// be persistent and not disappear like
// syn.m_synWordBuf.
ptr = qw->m_synWordBuf.getBufStart() + off;
// point to the string itself that is the word
qt->m_term = ptr;
qt->m_termLen = syn.m_termLens[j];
// qt->m_term = syn.m_termPtrs[j];
// reset our implicit bits to 0
qt->m_implicitBits = 0;
// assume not under a NOT bool op
//qt->m_underNOT = false;
// assign score weight, we're a phrase here
qt->m_userWeight = qw->m_userWeight ;
qt->m_userType = qw->m_userType ;
qt->m_fieldCode = qw->m_fieldCode ;
// stuff before a pipe always has a weight of 1
if ( qt->m_piped ) {
qt->m_userWeight = 1;
qt->m_userType = 'a';
// otherwise, add it
m_numTerms = n;
if ( n > ABS_MAX_QUERY_TERMS ) { char *xx=NULL;*xx=0; }
// count them for doing number of combos
//m_numExplicitBits = shift;
// . repeated terms have the same termbits!!
// . this is only for bool queries since regular queries ignore
// repeated terms in setWords()
// . we need to support: "trains AND (perl OR python) NOT python"
for ( int32_t i = 0 ; i < n ; i++ ) {
// BUT NOT IF in a UOR'd list!!! Metalincs bug...
if ( m_qterms[i].m_isUORed ) continue;
// that didn't seem to fix it right, for dup terms that
// are the FIRST term in a UOR sequence... they don't seem
// to have m_isUORed set
if ( m_hasUOR ) continue;
for ( int32_t j = 0 ; j < i ; j++ ) {
// skip if not a termid match
m_qterms[i].m_explicitBit = m_qterms[j].m_explicitBit;
// if doing phrases, ignore the unrequired phrase
if ( m_qterms[i].m_isPhrase ) {
if ( m_qterms[j].m_implicitBits )
m_qterms[j].m_repeat = true;
m_qterms[i].m_repeat = true;
// if not doing phrases, just ignore term #i
m_qterms[i].m_repeat = true;
// if we're a special range: term and a doc has us, then
// assume it has our associates too because we are all
// essentially the same term. we don't want this to be a
// factor in the ranking. since gigablast usually puts docs
// with all the terms (between OR operators) above terms that do
// not have all the terms. that is not a good thing for these terms.
int32_t nw = m_numWords;
for ( int32_t i = 0 ; i < nw ; i++ ) {
// skip if not a range: query term
if ( m_qwords[i].m_fieldCode != FIELD_RANGE ) continue;
// loop over all our associates (in same parens level) to
// get the OR of all the explicit bits
qvec_t allBits = 0;
for ( int32_t j=i;j<nw &&m_qwords[j].m_opcode!=OP_RIGHTPAREN;j++){
if ( m_qwords[j].m_ignoreWord ) continue;
// get the jth word's term
QueryTerm *qt = m_qwords[j].m_queryWordTerm;
// this can be NULL if we already got 16 query terms!
if ( ! qt ) continue;
// skip if no value
if ( ! qt->m_explicitBit ) continue;
// grab it
allBits |= qt->m_explicitBit ;
// now make everyone use just one of those bits
for ( int32_t j=i;j<nw &&m_qwords[j].m_opcode!=OP_RIGHTPAREN;j++){
if ( m_qwords[j].m_ignoreWord ) continue;
// get the jth word's term
QueryTerm *qt = m_qwords[j].m_queryWordTerm;
// this can be NULL if we already got 16 query terms!
if ( ! qt ) continue;
// skip if no value
if ( ! qt->m_explicitBit ) continue;
// force it to use the common bit
qt->m_explicitBit = allBits;
qt->m_implicitBits = allBits;
// . if only have one term and it is a signless phrase, make it signed
// . don't forget to set m_termSigns too!
if ( n == 1 && m_qterms[0].m_isPhrase && ! m_qterms[0].m_termSign ) {
m_qterms[0].m_termSign = '*';
//m_termSigns[0] = '*';
// . or bits into the m_implicitBits member of phrase QueryTerms that
// represent the constituent words
// . loop over each
//m_numTerms = n2;
// . how many of the terms are non fielded singletons?
// . this is just for support of the BIG HACK in Summary.cpp
m_numTermsSpecial = 0;
for ( int32_t i = 0 ; i < n ; i++ ) {
if ( m_qterms[i].m_isPhrase ) continue;
if ( m_qterms[i].m_fieldCode ) continue;
if ( m_qterms[i].m_isUORed ) continue;
// only skip query stop words if in quotes, if it is in
// quotes then we gotta have it...
if ( m_qterms[i].m_isQueryStopWord && !
m_qterms[i].m_inQuotes ) continue;
if ( m_qterms[i].m_underNOT ) continue;
if ( m_qterms[i].m_termSign == '-' ) continue;
// . set m_componentCodes all to -2
// . addCompoundTerms() will set these appropriately
// . see Msg2.cpp for more info on componentCodes
// . -2 means unset, neither a compound term nor a component term at
// this time
//for( int32_t i = 0 ; i < m_numTerms ; i++ ) m_componentCodes[i] = -2;
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
QueryTerm *qt = &m_qterms[i];
qt->m_componentCode = -2;
m_numComponents = 0;
// . now set m_phrasePart for Summary.cpp's hackfix filter
// . only set this for the non-phrase terms, since keepAllSingles is
// set to true when setting the Query for Summary.cpp::set in order
// to match the singles
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
// assume not in a phrase
m_qterms[i].m_phrasePart = -1;
//if ( ! m_qterms[i].m_isPhrase ) continue;
// skip cd-rom too, if not in quotes
if ( ! m_qterms[i].m_inQuotes ) continue;
// is next term also in a quoted phrase?
if ( i - 1 < 0 ) continue;
//if ( ! m_qterms[i+1].m_isPhrase ) continue;
if ( ! m_qterms[i-1].m_inQuotes ) continue;
// are we in the same quoted phrase?
if ( m_qterms[i+0].m_qword->m_quoteStart !=
m_qterms[i-1].m_qword->m_quoteStart ) continue;
// ok, we're in the same quoted phrase
// . set m_requiredBits
// . these are 1-1 with m_qterms (QueryTerms)
// . required terms have no - sign and have no signless phrases
// . these are what terms doc would NEED to have if we were default AND
// BUT for boolean queries that doesn't apply
m_requiredBits = 0; // no - signs, no signless phrases
m_negativeBits = 0; // terms with - signs
m_forcedBits = 0; // terms with + signs
m_synonymBits = 0;
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
// QueryTerms are derived from QueryWords
QueryTerm *qt = &m_qterms[i];
// don't require if negative
if ( qt->m_termSign == '-' ) {
m_negativeBits |= qt->m_explicitBit; // (1 << i );
// forced bits
if ( qt->m_termSign == '+' && ! m_isBoolean )
m_forcedBits |= qt->m_explicitBit; //(1 << i);
// skip signless phrases
if ( qt->m_isPhrase && qt->m_termSign == '\0' ) continue;
if ( qt->m_synonymOf ) {
m_synonymBits |= qt->m_explicitBit;
// fix gbhastitleindicator:1 where "1" is a stop word
if ( qt->m_isQueryStopWord && ! m_qterms[i].m_fieldCode )
// OR it all up
m_requiredBits |= qt->m_explicitBit; // (1 << i);
// set m_matchRequiredBits which we use for Matches.cpp
m_matchRequiredBits = 0;
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
// QueryTerms are derived from QueryWords
QueryTerm *qt = &m_qterms[i];
// don't require if negative
if ( qt->m_termSign == '-' ) continue;
// skip all phrase terms
if ( qt->m_isPhrase ) continue;
// OR it all up
m_matchRequiredBits |= qt->m_explicitBit;
// if we have '+test -test':
if ( m_negativeBits & m_requiredBits )
m_numTerms = 0;
// we need to remember this now for tier integration in IndexTable.cpp
//m_requiredBits = requiredBits;
// now set m_matches,ExplicitBits, used only by Matches.cpp so far
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
// set it up
m_qterms[i].m_matchesExplicitBits = m_qterms[i].m_explicitBit;
// or in the repeats
for ( int32_t j = 0 ; j < m_numTerms ; j++ ) {
// skip if termid mismatch
if ( m_qterms[i].m_termId != m_qterms[j].m_termId )
// i guess signs do not have to match
//m_qterms[i].m_termSign == m_qterms[j].m_termSign){
m_qterms[i].m_matchesExplicitBits |=
m_numRequired = 0;
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
// QueryTerms are derived from QueryWords
QueryTerm *qt = &m_qterms[i];
// assume not required
qt->m_isRequired = false;
// don't require if negative
// no, consider required, but NEGATIVE required...
//if ( qt->m_termSign == '-' ) continue;
// skip signless phrases
if ( qt->m_isPhrase && qt->m_termSign == '\0' ) continue;
if ( qt->m_isPhrase && qt->m_termSign == '*' ) continue;
if ( qt->m_synonymOf ) continue;
if ( qt->m_ignored ) continue;
// mark it
qt->m_isRequired = true;
// count them
// required quoted phrase terms
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
// QueryTerms are derived from QueryWords
QueryTerm *qt = &m_qterms[i];
// quoted phrase?
if ( ! qt->m_isPhrase ) continue;
if ( ! qt->m_inQuotes ) continue;
// mark it
qt->m_isRequired = true;
// count them
// . for query 'to be or not to be shakespeare'
// require 'tobe' 'beor' 'tobe' because
// they are bigrams in the wikipedia phrase 'to be or not to be'
// and they all consist solely of query stop words. as of
// 8/20/2012 i took 'not' off the query stop word list.
// . require bigrams that consist of 2 query stop words and
// are in a wikipedia phrase. set termSign to '+' i guess?
// . for 'in the nick' , a wiki phrase, make "in the" required
// and give a big bonus for "the nick" below.
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
// QueryTerms are derived from QueryWords
QueryTerm *qt = &m_qterms[i];
// don't require if negative
if ( qt->m_termSign == '-' ) continue;
// only check bigrams here
if ( ! qt->m_isPhrase ) continue;
// get the query word that starts this phrase
QueryWord *qw1 = qt->m_qword;
// must be in a wikiphrase
if ( qw1->m_wikiPhraseId <= 0 ) continue;
// what query word # is that?
int32_t qwn = qw1 - m_qwords;
// get the next alnum word after that
// assume its the last word in our bigram phrase
QueryWord *qw2 = &m_qwords[qwn+2];
// must be in same wikiphrase
if ( qw2->m_wikiPhraseId != qw1->m_wikiPhraseId ) continue;
// must be two stop words
if ( ! qw1->m_isQueryStopWord ) continue;
if ( ! qw2->m_isQueryStopWord ) continue;
// mark it
qt->m_isRequired = true;
// count them
// new logic for XmlDoc::setRelatedDocIdWeight() to use
int32_t shift = 0;
m_requiredBits = 0;
for ( int32_t i = 0; i < n ; i++ ){
QueryTerm *qt = &m_qterms[i];
qt->m_explicitBit = 0;
if ( ! qt->m_isRequired ) continue;
// negative terms are "negative required", but we ignore here
if ( qt->m_termSign == '-' ) continue;
qt->m_explicitBit = 1<<shift;
m_requiredBits |= qt->m_explicitBit;
if ( shift >= (int32_t)(sizeof(qvec_t)*8) ) break;
// now implicit bits
for ( int32_t i = 0; i < n ; i++ ){
QueryTerm *qt = &m_qterms[i];
// make it explicit bit at least
qt->m_implicitBits = qt->m_explicitBit;
if ( qt->m_isRequired ) continue;
// synonym?
if ( qt->m_synonymOf )
qt->m_implicitBits |= qt->m_synonymOf->m_explicitBit;
// skip if not bigram
if ( ! qt->m_isPhrase ) continue;
// get sides
QueryTerm *t1 = qt->m_leftPhraseTerm;
QueryTerm *t2 = qt->m_rightPhraseTerm;
if ( ! t1 || ! t2 ) continue;
qt->m_implicitBits |= t1->m_explicitBit;
qt->m_implicitBits |= t2->m_explicitBit;
// . for query 'to be or not to be shakespeare'
// give big bonus for 'ornot' and 'notto' bigram terms because
// the single terms 'or' and 'to' are ignored and because
// 'to be or not to be' is a wikipedia phrase
// . on 8/20/2012 i took 'not' off the query stop word list.
// . now give a big bonus for bigrams whose two terms are in the
// same wikipedia phrase and one and only one of the terms in
// the bigram is a query stop word
// . in general 'ornot' is considered a "synonym" of 'not' and
// gets hit with a .90 score factor, but that should never
// happen, it should be 1.00 and in this special case it should
// be 1.20
// . so for 'time enough for love' the phrase term "enough for"
// gets its m_isWikiHalfStopBigram set AND that phrase term
// is a synonym term of the single word term "enough" and is treated
// as such in the Posdb.cpp logic.
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
// QueryTerms are derived from QueryWords
QueryTerm *qt = &m_qterms[i];
// assume not!
qt->m_isWikiHalfStopBigram = 0;
// don't require if negative
if ( qt->m_termSign == '-' ) continue;
// only check bigrams here
if ( ! qt->m_isPhrase ) continue;
// get the query word that starts this phrase
QueryWord *qw1 = qt->m_qword;
// must be in a wikiphrase
if ( qw1->m_wikiPhraseId <= 0 ) continue;
// what query word # is that?
int32_t qwn = qw1 - m_qwords;
// get the next alnum word after that
// assume its the last word in our bigram phrase
QueryWord *qw2 = &m_qwords[qwn+2];
// must be in same wikiphrase
if ( qw2->m_wikiPhraseId != qw1->m_wikiPhraseId ) continue;
// if both query stop words, should have been handled above
// we need one to be a query stop word and the other not
// for this algo
if ( qw1->m_isQueryStopWord && qw2->m_isQueryStopWord )
// skip if neither is a query stop word
if ( ! qw1->m_isQueryStopWord&& ! qw2->m_isQueryStopWord )
// one must be a stop word i guess
// so for 'the time machine' we do not count 'time machine'
// as a halfstopwikibigram
if ( ! qw1->m_isQueryStopWord && ! qw2->m_isQueryStopWord )
// don't require it, if query is 'the tigers' accept
// just 'tigers' but give a bonus for 'the tigers' in
// the document.
//qt->m_isRequired = true;
// count them
// special flag
qt->m_isWikiHalfStopBigram = true;
return true;
// . add in compound terms
// . set m_componentCodes appropriately
// . for each term that is flagged m_isUORed, has a non-zero explicit bit and
//   whose m_componentCodes entry is still -2, every term sharing that
//   explicit bit gets its m_componentCodes entry pointed at slot n
//   (n == m_numTerms), the termIds are folded into one hash, and a copy of
//   the starting term is stored in m_qterms[n] with its term text, termId and
//   codes overwritten
// . takes no arguments and returns nothing; it only touches member state
// . NOTE(review): no increment of numUORComponents or m_numTerms, and no
//   closing braces, are visible in this block even though the comments
//   below describe them -- confirm against the full file
void Query::addCompoundTerms ( ) {
// loop through possible starting points of sequences of the same ebit
for (int32_t i = 0 ; i < m_numTerms - 1 ; i++ ) {
// break if too many already
if ( m_numTerms >= MAX_QUERY_TERMS ) break;
// if already processed, skip it (-2 means the code is still unset)
if ( m_componentCodes[i] != -2 ) continue;
// get ebit of the ith query term
qvec_t ebit = m_qterms[i].m_explicitBit;
// skip if 0, it is ignored because it breached limit of 15
if ( ebit == 0 ) continue;
// skip if next term's ebit is different
//if ( ebit != m_qterms[i+1].m_explicitBit ) continue;
// skip if not UOR'd because it could just be a repeat term
//if ( ! m_qterms[i+1].m_isUORed ) continue;
// all UORed terms have m_isUORed set now
// because UORed terms are not necessarily in order
// (first phrases, then words)
if ( ! m_qterms[i].m_isUORed ) continue;
// the termid of the compound list
int64_t id = 0LL;
// store compound terms last, i.e. in the slot just past the current terms
int32_t n = m_numTerms;
// sum of termfreqs
//int64_t sum = 0;
// we got a UOR'd list, see who's involved
int32_t j ;
int32_t numUORComponents = 0;
// lowest start pointer and highest end pointer over the matching terms' text
char *beg = NULL;
char *end = NULL;
for ( j = 0; j < m_numTerms ; j++ ) {
// if term does not have our ebit, skip it
if ( ebit != m_qterms[j].m_explicitBit ) continue;
// otherwise, make this term point to the compound term
m_componentCodes[j] = n;
// and integrate its termid into the compound termid
id = hash64 ( m_qterms[j].m_termId , id ) &TERMID_MASK;
// add in the term frequency (aka popularity)
//sum += m_termFreqs[j];
// keep track so IndexTable::alloc() can get it
// get phrase UOR term right
// (a and b both stay equal to j since the adjustment below is disabled)
int32_t a = j;
int32_t b = j;
// if (m_qterms[j].m_qword->m_leftPhraseStart >= 0){
// a = m_qterms[j].m_qword->m_leftPhraseStart;
// b++;
// }
char *newBeg = m_qterms[a].m_term;
// had to add check for newBeg being null
// (because of -O2 ???)
if (!beg || (newBeg && newBeg < beg))
beg = newBeg;
char *newEnd = m_qterms[b].m_term
+ m_qterms[b].m_termLen;
if (!end || newEnd > end)
end = newEnd;
// nothing shared our ebit, so there is no compound term to build
if (!numUORComponents) continue;
// copy it (term #i is the template for the compound term in slot n)
gbmemcpy ( &m_qterms[n] , &m_qterms[i] , sizeof(QueryTerm) );
// get term's length
//char *beg = m_qterms[i].m_term;
//char *end = m_qterms[j-1].m_term + m_qterms[j-1].m_termLen;
// the compound term's text runs from the lowest start to the highest end
m_qterms[n].m_term = beg;
m_qterms[n].m_termLen = end - beg;
// set its id
m_qterms[n].m_termId = id;
// this array too!
m_termIds[n] = id;
m_qterms[n].m_rawTermId = 0LL;
m_qterms[n].m_isQueryStopWord = false;
m_componentCodes[n] = -1; // code for a compound termid is -1
//m_termFreqs [n] = sum;
m_termSigns [n] = '\0';
// inc the total term count
// . returns true if query term #i is a compound term
// . m_componentCode: -1 means compound, -2 means unset, >= 0 means component
// . any term number outside [0,m_numTerms) is reported as not compound
bool Query::isCompoundTerm ( int32_t i ) {
	// guard both ends of the range so we never index outside m_qterms;
	// a negative i would otherwise read memory before the array
	if ( i < 0 || i >= m_numTerms ) return false;
	// a component code of -1 marks a compound term
	QueryTerm *qt = &m_qterms[i];
	return ( qt->m_componentCode == -1 );
}
bool Query::setQWords ( char boolFlag ,
bool keepAllSingles ,
Words &words ,
Phrases &phrases ) {
// . break query up into Words and phrases
// . because we now deal with boolean queries, we make parentheses
// their own separate Word, so tell "words" we're setting a query
//Words words;
if ( ! words.set ( m_sb.getBufStart() , m_sb.length() ,
//buf , m_bufLen,
return log("query: Had error parsing query: %s.",
int32_t numWords = words.getNumWords();
// truncate it
if ( numWords > ABS_MAX_QUERY_WORDS ) {
log("query: Had %"INT32" words. Max is %"INT32". Truncating.",
m_truncated = true;
m_numWords = numWords;
// alloc the mem if we need to (mdw left off here)
int32_t need = m_numWords * sizeof(QueryWord);
// sanity check
if ( m_qwords || m_qwordsAllocSize ) { char *xx = NULL; *xx = 0; }
// point m_qwords to our generic buffer if it will fit
// if ( need < GBUF_SIZE ) {
if ( m_gnext + need < m_gbuf + GBUF_SIZE &&
// it can wrap so watch out with this:
need < GBUF_SIZE ) {
m_qwords = (QueryWord *)m_gnext;
m_gnext += need;
// otherwise, we must allocate memory for it
else {
m_qwords = (QueryWord *)mmalloc ( need , "Query4" );
if ( ! m_qwords )
return log("query: Could not allocate mem for query.");
m_qwordsAllocSize = need;
// reset safebuf in there
for ( int32_t i = 0 ; i < m_numWords ; i++ )
// is all alpha chars in query in upper case? caps lock on?
bool allUpper = true;
char *p = m_sb.getBufStart();//m_buf;
char *pend = m_sb.getBuf(); // m_buf + m_bufLen;
for ( ; p < pend ; p += getUtf8CharSize(p) )
if ( is_alpha_utf8 ( p ) && ! is_upper_utf8 ( p ) ) {
allUpper = false; break; }
// . come back here from below when we detect that query is not boolean
// . we need to redo the bits cuz they may have been messed with below
// redo:
// field code we are in
char fieldCode = 0;
char fieldSign = 0;
char *field = NULL;
int32_t fieldLen = 0;
// keep track of the start of different chunks of quotes
int32_t quoteStart = -1;
bool inQuotes = false;
//bool inVQuotes = false;
char quoteSign = 0;
// the current little sign
char wordSign = 0;
// when reading first word in link: ... field we skip the following
// words until we hit a space because we hash them all together
bool ignoreTilSpace = false;
// assume we're NOT a boolean query
m_isBoolean = false;
// used to not respect the bool operator if it is the first word
bool firstWord = true;
// the query processing is broken into 3 stages.
// . STAGE #1
// . reset all query words to default
// set all m_ignoreWord and m_ignorePhrase to IGNORE_DEFAULT
// . set m_isFieldName, m_fieldCode and m_quoteStart for query words.
// no field names in quotes. +title:"hey there".
// set m_quoteStart to -1 if not in quotes.
// . if quotes immediately follow field code's ':' then distribute
// the field code to all words in the quotes
// . distribute +/- signs across quotes and fields to m_wordSigns.
// support -title:"hey there".
// . set m_quoteStart to -1 if only one alnum word is
// in quotes, what's the point of that?
// . set boolean op codes (m_opcode). cannot be in quotes.
// cannot have a field code. cannot have a word sign (+/-).
// . set m_wordId of FIELD_LINK, _URL, _SITE, _IP fields.
// m_wordId of first should be hash of the whole field value.
// only set its m_ignoreWord to 0, keep it's m_ignorePhrase to DEF.
// . set m_ignore of non-op codes, non-fieldname, alnum words to 0.
// . set m_wordId of each non-ignored alnum word.
// . STAGE #2
// . customize Bits class:
// first alnum word can start phrase.
// first alnum word in quotes (m_quoteStart >= 0 ) can start phrase.
// connected on the right but not on the left.. can start phrase.
// no pair across any double quote
// no pair across ".." --- UNLESS in quotes!
// no pair across any change of field code.
// field names may not be part of any phrase or paired across.
// boolean ops may not be part of any phrase or paired across.
// ignored words may not be part of any phrase or paired across.
// . STAGE #3
// . set phrases class w/ custom Bits class mods.
// . set m_phraseId and m_rawPhraseId of all QueryWords. if phraseId
// is not 0 (phrase exists) then set m_ignorePhrase to 0.
// . set m_leftConnected, m_rightConnected. word you are connecting
// to must not be ignored. (no field names or op codes).
// ensure you are in a phrase with the connected word, too, to
// really be connected.
// . set m_leftPhraseStart and m_rightPhraseEnd for all
// m_inQuotePhrase is not needed since if m_quoteStart is >= 0
// we MUST be in a quoted phrase!
// . if word is Connected then set m_ignoreWord to IGNORE_CONNECTED.
// set his m_phraseSign to m_wordSign (if not 0) or '*' (if it is 0).
// m_wordSign may have inherited quote or field sign.
// . if word's m_quoteStart is >= 0 set m_ignoreWord to IGNORE_QUOTED
// set his m_phraseSign to m_wordSign (if not 0) or '*' (if it is 0)
// m_wordSign may have inherited quote or field sign.
// . if one word in a phrase is negative, then set m_phraseSign to '-'
// set the Bits used for making phrases from the Words class
Bits bits;
if ( ! bits.set ( &words, TITLEREC_CURRENT_VERSION , 0 ))
return log("query: Had error processing query: %s.",
int32_t userWeight = 1;
char userType = 'r';
int32_t userWeightPhrase = 1;
char userTypePhrase = 'r';
int32_t ignorei = -1;
// assume we contain no pipe operator
int32_t pi = -1;
int32_t posNum = 0;
char *ignoreTill = NULL;
// loop over all words, these QueryWords are 1-1 with "words"
for ( int32_t i = 0 ; i < numWords && i < ABS_MAX_QUERY_WORDS ; i++ ) {
// convenience var, these are 1-1 with "words"
QueryWord *qw = &m_qwords[i];
// set to defaults?
memset ( qw , 0 , sizeof(QueryWord) );
// but quotestart should be -1
qw->m_quoteStart = -1;
qw->m_leftPhraseStart = -1;
// assume QueryWord is ignored by default
qw->m_ignoreWord = IGNORE_DEFAULT;
qw->m_ignorePhrase = IGNORE_DEFAULT;
qw->m_ignoreWordInBoolQuery = false;
qw->m_wordNum = i;
// get word as a string
//char *w = words.getWord(i);
//int32_t wlen = words.getWordLen(i);
qw->m_word = words.getWord(i);
qw->m_wordLen = words.getWordLen(i);
qw->m_isPunct = words.isPunct(i);
qw->m_posNum = posNum;
// count 1 unit for it
// we ignore the facet value range list...
if ( ignoreTill && qw->m_word < ignoreTill )
// . we duplicated this code from XmlDoc.cpp's
// getWordPosVec() function
if ( qw->m_isPunct ) { // ! wids[i] ) {
char *wp = qw->m_word;
int32_t wplen = qw->m_wordLen;
// simple space or sequence of just white space
if ( words.isSpaces(i) )
posNum += 0;
// 'cd-rom'
else if ( wp[0]=='-' && wplen==1 )
posNum += 0;
// 'mr. x'
else if ( wp[0]=='.' && words.isSpaces2(i,1))
posNum += 0;
// animal (dog)
char *w = words.getWord(i);
int32_t wlen = words.getWordLen(i);
// assume it is a query weight operator
qw->m_queryOp = true;
// ignore it? (this is for query weight operators)
if ( i <= ignorei ) continue;
// deal with pipe operators
if ( wlen == 5 &&
w[0]=='P'&&w[1]=='i'&&w[2]=='i'&&w[3]=='P'&&w[4]=='E') {
pi = i;
qw->m_opcode = OP_PIPE;
// [133.0r]
// is it the bracket operator?
// " LeFtB 113 rp RiGhB "
if ( wlen == 5 &&
i+4 < numWords ) {
// s MUST point to a number
char *s = words.getWord(i+2);
int32_t slen = words.getWordLen(i+2);
// if no number, it must be
// " leFtB RiGhB " or " leFtB p RiGhB "
if ( ! is_digit(s[0]) ) {
// phrase weight reset
if ( s[0] == 'p' ) {
userWeightPhrase = 1;
userTypePhrase = 'r';
ignorei = i + 4;
// word reset
else {
userWeight = 1;
userType = 'r';
ignorei = i + 2;
// get the number
float fval = atof2 (s, slen);
// s2 MUST point to the a,r,ap,rp string
char *s2 = words.getWord(i+4);
// is it a phrase?
if ( s2[1] == 'p' ) {
userWeightPhrase = fval;
userTypePhrase = s2[0]; // a or r
else {
userWeight = fval;
userType = s2[0]; // a or r
// ignore all following words up to and including i+6
ignorei = i + 6;
// assign score weight, if any for this guy
qw->m_userWeight = userWeight ;
qw->m_userType = userType ;
qw->m_userWeightPhrase = userWeightPhrase ;
qw->m_userTypePhrase = userTypePhrase ;
qw->m_queryOp = false;
// does word #i have a space in it? that will cancel fieldCode
// if we were in a field
bool endField = false;
if ( words.hasSpace(i) && ! inQuotes ) endField = true;
// TODO: fix title:" hey there" (space in quotes is ok)
// if there's a quote before the first space then
// it's ok!!!
if ( endField ) {
char *s = words.m_words[i];
char *send = s + words.m_wordLens[i];
for ( ; s < send ; s++ ) {
// if the space is inside the quotes then it
// doesn't count!
if ( *s == '\"' ) { endField = false; break;}
if ( is_wspace_a(*s) ) break;
// cancel the field if we hit a space (not in quotes)
if ( endField ) {
// cancel the field
fieldCode = 0;
fieldLen = 0;
field = NULL;
// we no longer have to ignore for link: et al
ignoreTilSpace = false;
// . maintain inQuotes and quoteStart
// . quoteStart is the word # that starts the current quote
int32_t nq = words.getNumQuotes(i) ;
if ( nq > 0 ) { // && ! ignoreQuotes ) {
// toggle quotes if we need to
if ( nq & 0x01 ) inQuotes = ! inQuotes;
// set quote sign to sign before the quote
if ( inQuotes ) {
quoteSign = '\0';
for ( char *p = w + wlen - 1 ; p > w ; p--){
if ( *p != '\"' ) continue;
if ( *(p-1) == '-' ) quoteSign = '-';
if ( *(p-1) == '+' ) quoteSign = '+';
// . quoteStart is the word # the quotes started at
// . it is -1 if not in quotes
// . now we set it to the alnum word AFTER us!!
if ( inQuotes && i+1< numWords ) quoteStart = i+1;
else quoteStart = -1;
//log(LOG_DEBUG, "Query: nq: %"INT32" inQuotes: %d,quoteStart: %"INT32"",
// nq, inQuotes, quoteStart);
// does word #i have a space in it? that will cancel fieldCode
// if we were in a field
// TODO: fix title:" hey there" (space in quotes is ok)
bool cancelField = false;
if ( words.hasSpace(i) && ! inQuotes ) cancelField = true;
// fix title:"foo bar" "another quote" so "another quote"
// is not in the title: field
if ( words.hasSpace(i) && inQuotes && nq>= 2 )
cancelField = true;
// likewise for gbsortby operators watch out for boolean
// operators at the end of the field. we also check for
// parens below when computing the hash of the value.
if ( (fieldCode == FIELD_GBSORTBYINT ||
( w[0] == '(' || w[0] == ')' ) )
cancelField = true;
// BUT if we have a quote, and they just got turned off,
// and the space is not after the quote, do not cancel field!
if ( nq == 1 && cancelField ) {
// if we hit the space BEFORE the quote, do NOT cancel
// the field
for ( char *p = w + wlen - 1 ; p > w ; p--) {
// hey, we got the quote first, keep field
if ( *p == '\"' ) {cancelField = false; break;}
// otherwise, we got space first? cancel it!
if ( is_wspace_a(*p) ) break;
if ( cancelField ) {
// cancel the field
fieldCode = 0;
fieldLen = 0;
field = NULL;
// we no longer have to ignore for link: et al
ignoreTilSpace = false;
// skip if we should
if ( ignoreTilSpace ){
if (m_qwords[i-1].m_fieldCode){
qw->m_fieldCode = m_qwords[i-1].m_fieldCode;
// . is this word potentially a field?
// . it cannot be another field name in a field
if ( i < (m_numWords-2) &&
w[wlen] == ':' && ! is_wspace_utf8(w+wlen+1) &&
//w[wlen+1] != '/' && // as in http://
(! is_punct_utf8(w+wlen+1) || w[wlen+1]=='\"' ||
// for gblatrange2:-106.940994to-106.361282
w[wlen+1]=='-') &&
! fieldCode && ! inQuotes ) {
// field name may have started before though if it
// was a compound field name containing hyphens,
// underscores or periods
int32_t j = i-1 ;
while ( j > 0 &&
((m_qwords[j].m_rawWordId != 0) ||
( m_qwords[j].m_wordLen ==1 &&
((m_qwords[j].m_word)[0]=='-' ||
(m_qwords[j].m_word)[0]=='_' ||
if ( j < 0 ) {
//log(LOG_LOGIC,"query: query: bad "
j = 0; }
// advance j to a non-punct word
while (words.isPunct(j)) j++;
// ignore all of these words then,
// they're part of field name
int32_t tlen = 0;
for ( int32_t k = j ; k <= i ; k++ )
tlen += words.getWordLen(k);
// set field name to the compound name if it is
field = words.getWord (j);
fieldLen = tlen;
if ( j == i ) fieldSign = wordSign;
else fieldSign = m_qwords[j].m_wordSign;
// debug msg
//char ttt[128];
//gbmemcpy ( ttt , field , fieldLen );
//ttt[fieldLen] = '\0';
//log("field name = %s", ttt);
// . is it recognized field name,like "title" or "url"?
// . does it officially end in a colon? incl. in hash?
bool hasColon;
fieldCode = getFieldCode (field, fieldLen, &hasColon) ;
// only url, link, site, ip and suburl field names will
// end in a colon, for historical reasons
//if ( hasColon ){
// fieldLen++;
// reassign alias fields
//Why??? -p
//if ( fieldCode == FIELD_TYPE ) {
// field = "type" ; fieldLen = 4; }
// if so, it does NOT get its own QueryWord,
// but its sign can be inherited by its members
if ( fieldCode ) {
for ( int32_t k = j ; k <= i ; k++ )
m_qwords[k].m_ignoreWord =
// what quote chunk are we in? this is -1 if we're not in quotes
if ( inQuotes ) qw->m_quoteStart = quoteStart ;
else qw->m_quoteStart = -1;
qw->m_inQuotes = inQuotes;
// ptr to field, if any
qw->m_fieldCode = fieldCode;
// if we are a punct word, see if we end in a sign that can
// be applied to the next word, a non-punct word
if ( words.isPunct(i) ) {
wordSign = w[wlen-1];
if ( wordSign != '-' && wordSign != '+') wordSign = 0;
if ( wlen>1 &&!is_wspace_a (w[wlen-2]) ) wordSign = 0;
if ( i > 0 && wlen == 1 ) wordSign = 0;
// assign quoteSign to wordSign if we just got into quotes
//if ( nq > 0 && inQuotes ) quoteSign = wordSign;
// don't add any QueryWord for a punctuation word
if ( words.isPunct(i) ) continue;
// what is the sign of our term? +, -, *, ...
char mysign;
if ( fieldCode ) mysign = fieldSign;
else if ( inQuotes ) mysign = quoteSign;
else mysign = wordSign;
// are we doing default AND?
//if ( forcePlus && ! *mysign ) mysign = '+';
// store the sign
qw->m_wordSign = mysign;
// what quote chunk are we in? this is -1 if we're not in quotes
if ( inQuotes ) qw->m_quoteStart = quoteStart ;
else qw->m_quoteStart = -1;
// if we're the first alnum in this quote and
// the next word has a quote, then we're just a single word
// in quotes which is silly, so undo it. But we should
// still inherit any quoteSign, however. Be sure to also
// set m_inQuotes to false so Matches.cpp::matchWord() works.
// MDW: don't undo it because we do not want to get synonyms
// of terms in quotes. 7/15/2015
// if ( i == quoteStart ) { // + 1 ) {
// if ( i + 1 >= numWords || words.getNumQuotes(i+1)>0 ) {
// qw->m_quoteStart = -1;
// qw->m_inQuotes = false;
// }
// }
// . get prefix hash of collection name and field
// . but first convert field to lower case
uint64_t ph;
int32_t fflen = fieldLen;
if ( fflen > 62 ) fflen = 62;
char ff[64];
to_lower3_a ( field , fflen , ff );
ph = hash64 ( ff , fflen );
// make "intitle" map to "title"
if ( fieldCode == FIELD_TITLE )
ph = hash64 ( "title", 5 );
// make "suburl" map to "inurl"
if ( fieldCode == FIELD_SUBURL )
ph = hash64 ( "inurl", 5 );
// fix for filetype:pdf queries
if ( fieldCode == FIELD_TYPE )
ph = hash64 ("type",4);
// these are range constraints on the gbsortby: termlist
// which sorts numbers in a field from low to high
if ( fieldCode == FIELD_GBNUMBERMIN )
ph = hash64 ("gbsortby", 8);
if ( fieldCode == FIELD_GBNUMBERMAX )
ph = hash64 ("gbsortby", 8);
ph = hash64 ("gbsortby", 8);
// fix for gbsortbyfloat:product.price
if ( fieldCode == FIELD_GBSORTBYFLOAT )
ph = hash64 ("gbsortby", 8);
if ( fieldCode == FIELD_GBNUMBERMININT )
ph = hash64 ("gbsortbyint", 11);
if ( fieldCode == FIELD_GBNUMBERMAXINT )
ph = hash64 ("gbsortbyint", 11);
ph = hash64 ("gbsortbyint", 11);
// really just like the gbfacetstr operator but we do not
// display the facets, instead we try to match the provided
// facet value exactly, case-sensitively.
// NOT any more because termlist is too big and we need it
// to be fast for diffbot.
//if ( fieldCode == FIELD_GBFIELDMATCH )
// ph = hash64 ("gbfacetstr", 10);
if ( fieldCode == FIELD_GBFACETFLOAT )
ph = hash64 ("gbsortby",8);
if ( fieldCode == FIELD_GBFACETINT )
ph = hash64 ("gbsortbyint",11);
// ptr to field, if any
qw->m_fieldCode = fieldCode;
// prefix hash
qw->m_prefixHash = ph;
// set this flag
if ( fieldCode == FIELD_LINKS ) m_hasLinksOperator = true;
if ( fieldCode == FIELD_SITELINK ) m_hasLinksOperator = true;
// if we're hashing a url:, link:, site: or ip: term,
// then we need to hash ALL up to the first space
if ( fieldCode == FIELD_URL ||
fieldCode == FIELD_EXT ||
fieldCode == FIELD_LINK ||
fieldCode == FIELD_ILINK||
fieldCode == FIELD_SITELINK||
fieldCode == FIELD_LINKS||
fieldCode == FIELD_SITE ||
fieldCode == FIELD_IP ||
fieldCode == FIELD_ISCLEAN ||
fieldCode == FIELD_QUOTA ||
// gbmin:price:1.23
fieldCode == FIELD_GBFACETSTR ||
fieldCode == FIELD_GBFACETINT ||
fieldCode == FIELD_GBAD ) {
// . find 1st space -- that terminates the field value
// . make "end" point to the end of the entire query
char *end =
(words.m_words[words.m_numWords-1] +
// use this for gbmin:price:1.99 etc.
int32_t firstColonLen = -1;
int32_t lastColonLen = -1;
int32_t colonCount = 0;
int32_t firstComma = -1;
// are we a facet term?
bool isFacetNumTerm = false;
if ( fieldCode == FIELD_GBFACETINT )
isFacetNumTerm = true;
if ( fieldCode == FIELD_GBFACETFLOAT )
isFacetNumTerm = true;
// "w" points to the first alnumword after the field,
// so for site:xyz.com "w" points to the 'x' and wlen
// would be 3 in that case since xyz is a word of 3
// chars. so advance
// wlen until we hit a space.
while ( w + wlen < end ) {
// stop at first white space
if ( is_wspace_utf8(w+wlen) ) break;
// in case of gbmin:price:1.99 record first ':'
if ( w[wlen]==':' ) {
lastColonLen = wlen;
if ( firstColonLen == -1 )
firstColonLen = wlen;
// fix "gbsortbyint:date)"
// these are used as boolean operators
// so do not include them in the value.
// we also did this above to set cancelField
// to true.
if ( w[wlen] == '(' || w[wlen] == ')' )
// hit a comma in something like
// gbfacetfloat:price,0-1,1-2.5,2.5-10
if ( w[wlen]==',' &&
isFacetNumTerm &&
firstComma == -1 )
firstComma = wlen;
// ignore following words until we hit a space
ignoreTilSpace = true;
// the hash. keep it case insensitive. only
// the fieldmatch stuff should be case-sensitive.
// this may change later.
uint64_t wid = hash64Lower_utf8 ( w , wlen, 0LL );
qw->m_numFacetRanges = 0;
// for gbfacetfloat:price,0-1,1-2.5,... just hash price
if ( firstComma > 0 &&
( fieldCode == FIELD_GBFACETINT ||
// hash the "price" not the following range lst
// crap, since this uses the gbsortby:
// termlists it is NOT case-sensitive
wid = hash64Lower_utf8 ( w , firstComma );
// now store the range list so we can
// fill up the buckets below
char *s = w + firstComma + 1;
char *send = w + wlen;
int32_t nr = 0;
for ( ; s <send && fieldCode == FIELD_GBFACETINT;){
// must be a digit or . or - or *
if ( ! is_digit(s[0]) &&
s[0] != '.' &&
s[0] != '-' &&
s[0] != '*')
char *sav = s;
// skip to hyphen
for ( ; s < send && *s != '-' ; s++ );
// stop if not hyphen
if ( *s != '-' ) break;
// If the first character is a hyphen, check
// if its part of a negative number. If it is,
// don't consider it a hyphen
if ( sav == s && is_digit(s[1]) ) {
// Read the entire negative number
char *s2 = s + 1;
for ( ; s2 < send && is_digit(s2[0]); s2++);
// If there's a hyphen after the negative
// number, use that as the hyphen separator
if ( *s2 == '-' ) s = s2;
// skip hyphen
// must be a digit or . or - or *
if ( ! is_digit(s[0]) &&
s[0] != '.' &&
s[0] != '-' &&
s[0] != '*')
// if under max, add it
if ( nr < MAX_FACET_RANGES ) {
if (sav[0] == '*')
qw->m_facetRangeIntA [nr] =
qw->m_facetRangeIntA [nr] = atoll(sav);
if (s[0] == '*')
qw->m_facetRangeIntB [nr] =
qw->m_facetRangeIntB [nr] = atoll(s);
qw->m_numFacetRanges = ++nr;
// skip to comma or end
for ( ; s < send && *s != ',' ; s++ );
// skip that
if ( *s != ',' ) break;
// ignore till. does not include s
ignoreTill = s;
for ( ; s <send && fieldCode==FIELD_GBFACETFLOAT;){
// must be a digit or . or - or *
if ( ! is_digit(s[0]) &&
s[0] != '.' &&
s[0] != '-' &&
s[0] != '*')
char *sav = s;
// skip to hyphen
for ( ; s < send && *s != '-' ; s++ );
// stop if not hyphen
if ( *s != '-' ) break;
// If the first character is a hyphen, check
// if its part of a negative number. If it is,
// don't consider it a hyphen
if ( sav == s && (is_digit(s[1]) ||
(s[1] == '.' &&
s + 2 < send &&
is_digit(s[2]))) ) {
// Read the entire negative number
char *s2 = s + 1;
for ( ; s2 < send &&
(is_digit(s2[0]) || s2[0] == '.'); s2++);
// If there's a hyphen after the negative
// number, use that as the hyphen separator
if ( *s2 == '-' ) s = s2;
// save that
char *cma = s;
// skip hyphen
// must be a digit or . or - or *
if ( ! is_digit(s[0]) &&
s[0] != '.' &&
s[0] != '-' &&
s[0] != '*')
// save that
char *sav2 = s;
// advance to comma etc.
for ( ; s < send && *s != ',' ; s++ );
char *cma2 = s;
// if under max, add it
if ( nr < MAX_FACET_RANGES ) {
if (sav[0] == '*')
// min() is min positive value for float, so
// we want -max() instead
qw->m_facetRangeFloatA [nr] =
qw->m_facetRangeFloatA [nr] =atof2(sav,cma-sav);
if (sav2[0] == '*')
qw->m_facetRangeFloatB [nr] =
qw->m_facetRangeFloatB [nr] =atof2(sav2,cma2-sav2);
qw->m_numFacetRanges = ++nr;
// skip that
if ( *s != ',' ) break;
// ignore till. does not include s
ignoreTill = s;
// i've decided not to make
// gbsortby:products.offerPrice
// gbmin:price:1.23 case insensitive
// too late... we have to support what we have
if ( fieldCode == FIELD_GBSORTBYFLOAT ||
wid = hash64Lower_utf8 ( w , wlen , 0LL );
// do not include this word as part of
// any boolean expression, so
// Expression::isTruth() will ignore it and we
// fix '(A OR B) gbsortby:offperice' query
qw->m_ignoreWordInBoolQuery = true;
// this seems case sensitive now, gbfacetstr:humanLang
if ( fieldCode == FIELD_GBFACETSTR ) {
wid = hash64 ( w , wlen , 0LL );
if ( fieldCode == FIELD_GBFIELDMATCH ) {
// hash the json field name. (i.e. tag.uri)
// make it case sensitive as
// seen in XmlDoc.cpp::hashFacet2().
// the other fields are hashed in
// XmlDoc.cpp::hashNumber3().
wid = hash64 ( w , firstColonLen , 0LL);
// if it is like
// gbfieldmatch:tag.uri:"http://xyz.com/poo"
// then we should hash the string into
// an int just like how the field value would
// be hashed when adding gbfacetstr: terms
// in XmlDoc.cpp:hashFacet2(). the hash of
// the tag.uri field, for example, is set
// in hashFacet1() and set to "val32". so
// hash it just like that does here.
char *a = w + firstColonLen + 1;
// . skip over colon at start
if ( a[0] == ':' ) a++;
// . skip over quotes at start/end
bool inQuotes = false;
if ( a[0] == '\"' ) {
inQuotes = true;
// end of field
char *b = a;
// if not in quotes advance until
// we hit whitespace
char cs;
for ( ; ! inQuotes && *b ; b += cs ) {
cs = getUtf8CharSize(b);
if ( is_wspace_utf8(b) ) break;
// if in quotes, go until we hit quote
for ( ; inQuotes && *b != '\"';b++);
// now hash that up. this must be 64 bit
// to match in XmlDoc.cpp::hashFieldMatch()
uint64_t val64 = hash64 ( a , b-a );
// make a composite of tag.uri and http://...
// just like XmlDoc.cpp::hashFacet2() does
wid = hash64 ( val64 , wid );
// gbmin:price:1.23
if ( lastColonLen>0 &&
( fieldCode == FIELD_GBNUMBERMIN ||
// record the field
wid = hash64Lower_utf8(w,lastColonLen , 0LL );
// fix gbminint:gbfacetstr:gbxpath...:165004297
if ( colonCount == 2 ) {
int64_t wid1;
int64_t wid2;
char *a = w;
char *b = w + firstColonLen;
wid1 = hash64Lower_utf8(a,b-a);
a = w + firstColonLen+1;
b = w + lastColonLen;
wid2 = hash64Lower_utf8(a,b-a);
// keep prefix as 2nd arg to this
wid = hash64 ( wid2 , wid1 );
// we need this for it to work
ph = 0LL;
// and also the floating point after that
qw->m_float = atof ( w + lastColonLen + 1 );
qw->m_int = (int32_t)atoll( w +lastColonLen+1);
// should we have normalized before hashing?
if ( fieldCode == FIELD_URL ||
fieldCode == FIELD_LINK ||
fieldCode == FIELD_ILINK ||
fieldCode == FIELD_SITELINK ||
fieldCode == FIELD_LINKS ||
fieldCode == FIELD_SITE ) {
Url url;
// do we add www?
bool addwww = false;
if ( fieldCode == FIELD_LINK ) addwww = true;
if ( fieldCode == FIELD_ILINK) addwww = true;
if ( fieldCode == FIELD_LINKS) addwww = true;
if ( fieldCode == FIELD_URL ) addwww = true;
if ( fieldCode == FIELD_GBPARENTURL )
addwww = true;
if ( fieldCode == FIELD_SITELINK)
addwww = true;
url.set ( w , wlen , addwww );
char *site = url.getHost();
int32_t siteLen = url.getHostLen();
if (fieldCode == FIELD_SITELINK)
wid = hash64 ( site , siteLen );
wid = hash64 ( url.getUrl(),
url.getUrlLen() );
//qw->m_wordId = g_indexdb.getTermId ( ph , wid );
// like we do it in XmlDoc.cpp's hashString()
if ( ph ) qw->m_wordId = hash64h ( wid , ph );
else qw->m_wordId = wid;
qw->m_rawWordId = 0LL; // only for highlighting?
qw->m_phraseId = 0LL;
qw->m_rawPhraseId = 0LL;
qw->m_opcode = 0;
// definitely not a query stop word
qw->m_isQueryStopWord = false;
// do not ignore the wordId
qw->m_ignoreWord = 0;
// override the word length
//qw->m_wordLen = ulen * 2;
// we are the first word?
firstWord = false;
// we're done with this one
char opcode = 0;
// if query is all in upper case and we're doing boolean
// DETECT, then assume not boolean
if ( allUpper && boolFlag == 2 ) boolFlag = 0;
// . having the UOR opcode does not mean we are boolean because
// we want to keep it fast.
// . we need to set this opcode so the UOR logic in setQTerms()
// works, because it checks the m_opcode value. otherwise
// Msg20 won't think we are a boolean query and set boolFlag
// to 0 when setting the query for summary generation and
// will not recognize the UOR word as being an operator
if ( wlen==3 && w[0]=='U' && w[1]=='O' && w[2]=='R' &&
! firstWord ) {
opcode = OP_UOR; m_hasUOR = true; goto skipin; }
// . is this word a boolean operator?
// . cannot be in quotes or field
if ( boolFlag >= 1 && ! inQuotes && ! fieldCode ) {
// are we an operator?
if ( ! firstWord && wlen==2 &&
w[0]=='O' && w[1]=='R')
opcode = OP_OR;
else if ( ! firstWord && wlen==3 &&
w[0]=='A' && w[1]=='N' && w[2]=='D')
opcode = OP_AND;
else if ( ! firstWord && wlen==3 &&
w[0]=='N' && w[1]=='O' && w[2]=='T')
opcode = OP_NOT;
else if ( wlen==5 && w[0]=='L' && w[1]=='e' &&
w[2]=='F' && w[3]=='t' && w[4]=='P' )
opcode = OP_LEFTPAREN;
else if ( wlen==5 && w[0]=='R' && w[1]=='i' &&
w[2]=='G' && w[3]=='h' && w[4]=='P' )
// if we are detecting if query is boolean or not AND
// if we are not an operator and have more than 1 cap
// char then turn off boolean
//if ( boolFlag==2 &&!opcode &&wlen>1&&is_upper(w[1])){
// // turn boolean stuff off
// boolFlag = 0;
// // start again from the top with NO boolean
// goto redo;
// no pair across or even include any boolean op phrs
if ( opcode ) {
bits.m_bits[i] &= ~D_CAN_START_PHRASE;
bits.m_bits[i] &= ~D_CAN_PAIR_ACROSS;
bits.m_bits[i] &= ~D_CAN_BE_IN_PHRASE;
qw->m_ignoreWord = IGNORE_BOOLOP;
qw->m_opcode = opcode;
if ( opcode == OP_LEFTPAREN ) continue;
if ( opcode == OP_RIGHTPAREN ) continue;
// if this is uncommented all of our operators
// become actual query terms (mdw)
if ( opcode == OP_UOR ) continue;
// if you just have ANDs and ()'s that does
// not make you a boolean query! we are bool
// by default!!
if ( opcode == OP_AND ) continue;
m_isBoolean = true;
// . add single-word term id
// . this is computed by hash64AsciiLower()
// . but only hash64Lower_a if _HASHWITHACCENTS_ is true
uint64_t wid = 0LL;
if (fieldCode == FIELD_CHARSET){
// find first space -- that terminates the field value
char* end =
(words.m_words[words.m_numWords-1] +
while ( w+wlen<end &&
! is_wspace_utf8(w+wlen) ) wlen++;
// ignore following words until we hit a space
ignoreTilSpace = true;
// the hash
//wid = hash64 ( uw , ulen, 0LL );
// convert to enum value
int16_t csenum = get_iana_charset(w,wlen);
// convert back to string
char astr[128];
int32_t alen = sprintf(astr, "%d", csenum);
wid = hash64(astr, alen, 0LL);
wid = words.getWordId(i);
qw->m_rawWordId = wid;
// we now have a first word already set
firstWord = false;
// . are we a QUERY stop word?
// . NEVER count as stop word if it's in all CAPS and
// the query as a whole is NOT in all CAPS
// . It's probably an acronym
if ( words.isUpper(i) && words.getWordLen(i)>1 && ! allUpper ){
qw->m_isQueryStopWord = false;
qw->m_isStopWord = false;
else {
qw->m_isQueryStopWord =::isQueryStopWord (w,wlen,wid,
// . BUT, if it is a single letter contraction thing
// . ninad: make this == 1 if in utf8! TODO!! it is!
if ( wlen == 1 && w[-1] == '\'' )
qw->m_isQueryStopWord = true;
qw->m_isStopWord =::isStopWord (w,wlen,wid);
// . do not count as query stop word if it is the last in query
// . like the query: 'baby names that start with j'
if ( i + 2 > numWords ) qw->m_isQueryStopWord = false;
// hash the termid
//qw->m_wordId = g_indexdb.getTermId ( ph , wid );
// like we do it in XmlDoc.cpp's hashString()
if ( ph ) qw->m_wordId = hash64 ( wid , ph );
else qw->m_wordId = wid;
// do not ignore the word
qw->m_ignoreWord = 0;
// pipe those that should be piped
for ( int32_t i = 0 ; i < pi ; i++ ) m_qwords[i].m_piped = true;
if ( pi >= 0 ) m_piped = true;
// . set m_leftConnected and m_rightConnected
// . we are connected to the first non-punct word on our left
// if we are separated by a small # of defined punctuation chars
// . see isConnection() for that definition
// . this allows us to just lookup the phrase for things like
// "cd-rom" rather than lookup "cd" , "rom" and "cd-rom"
// . skip if prev word is IGNORE_BOOLOP, IGNORE_FIELDNAME or
// . we have to set outside the main loop above since we check
// the m_ignoreWord member of the i+2nd word
for ( int32_t i = 0 ; i < numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord ) continue;
if ( i + 2 < numWords && ! m_qwords[i+2].m_ignoreWord&&
isConnection(words.getWord(i+1),words.getWordLen(i+1)) )
qw->m_rightConnected = true;
if ( i - 2 >= 0 && ! m_qwords[i-2].m_ignoreWord &&
isConnection(words.getWord(i-1),words.getWordLen(i-1) ) )
qw->m_leftConnected = true;
// now modify the Bits class before generating phrases
for ( int32_t i = 0 ; i < numWords ; i++ ) {
// get default bits
unsigned char b = bits.m_bits[i];
// allow pairing across anything by default
// get Query Word
QueryWord *qw = &m_qwords[i];
// . skip if part of a query weight operator
// . cannot be in a phrase, or anything
if ( qw->m_queryOp && !qw->m_opcode) {
// is this word a sequence of punctuation and spaces?
else if ( words.isPunct(i) ) {
// pair across ANY punct, even double spaces by default
// but do not pair across anything with a quote in it
if ( words.getNumQuotes(i) >0) b &= ~D_CAN_PAIR_ACROSS;
// continue if we're in quotes
else if ( qw->m_quoteStart >= 0 ) goto next;
// continue if we're in a field
else if ( qw->m_fieldCode > 0 ) goto next;
// if guy on left is in field, do not pair across
if ( i > 0 && m_qwords[i-1].m_fieldCode > 0 )
// or if guy on right in field
if ( i +1 < numWords && m_qwords[i+1].m_fieldCode > 0 )
// do not pair across ".." when not in quotes/field
char *w = words.getWord (i);
int32_t wlen = words.getWordLen(i);
for ( int32_t j = 0 ; j < wlen-1 ; j++ ) {
if ( w[j ]!='.' ) continue;
if ( w[j+1]!='.' ) continue;
else {
// . not even capped query stop words can start phrase
// . 'Mice And Men' is just one phrase then
// . TODO: "12345678 it was rainy"
// ("it" should start a phrase)
//if ( qw->m_isQueryStopWord) b &= ~D_CAN_START_PHRASE;
if ( qw->m_isStopWord ) b &= ~D_CAN_START_PHRASE;
// . first alnum word can start phrase.
// . example: 'the tigers'
if ( i <= 1 ) b |= D_CAN_START_PHRASE;
// first alnum word in quotes can start phrase.
if ( qw->m_quoteStart == i ) // + 1 )
// . right connected but not left can start phrase
// . example: 'buy a-rom' , 'buy i-phone'
if ( qw->m_rightConnected && ! qw->m_leftConnected )
// . no field names, bool operators, cruft in fields
// can be any part of a phrase
// . no pair across any change of field code
// . 'girl title:boy' --> no "girl title" phrase!
if ( qw->m_ignoreWord ) { //== IGNORE_FIELDNAME ) {
// . no boolean ops
// . 'this OR that' --> no "this OR that" phrase
if ( qw->m_opcode ) {
if ( qw->m_wordSign == '-' && qw->m_quoteStart < 0) {
// set it back all tweaked
bits.m_bits[i] = b;
// . now since we may have prevented pairing across certain things
// we need to set D_CAN_START_PHRASE for stop words whose left
// punct word can no longer be paired across
// . "dancing in the rain" is fun --> will include phrase "is fun".
// . title:"is it right"? --> will include phrase "is it"
for ( int32_t i = 1 ; i < numWords ; i++ ) {
// no punct, alnum only
if ( words.isPunct(i) ) continue;
// skip if not a stop word
if ( ! (bits.m_bits[i] & D_IS_STOPWORD) ) continue;
// continue if you can still pair across prev punct word
if ( bits.m_bits[i-1] & D_CAN_PAIR_ACROSS ) continue;
// otherwise, we can now start a phrase
bits.m_bits[i] |= D_CAN_START_PHRASE;
// a bogus spam class, all words have 0 for their spam probability
//Spam spam;
//spam.reset ( words.getNumWords() );
// treat strongly connected phrases like cd-rom as being
// in quotes for the most part; therefore, set m_quoteStart for them
int32_t j;
int32_t qs = -1;
for ( j = 0 ; j < numWords ; j++ ) {
// skip all but strongly connected words
if ( m_qwords[j].m_ignoreWord != IGNORE_CONNECTED &&
// must also be non punct word OR a space
( !words.isPunct(j) || words.m_words[j][0] == ' ' ) ) {
// break the "quote", if any
qs = -1; continue; }
// if he is punctuation and qs is -1, skip him,
// punctuation words can no longer start a quote
if ( words.isPunct(j) && qs == -1 ) continue;
// unignore him if we should
if ( keepAllSingles ) m_qwords[j].m_ignoreWord = 0;
// if already in quotes, don't bother!
if ( m_qwords[j].m_quoteStart >= 0 ) continue;
// remember him
if ( qs == -1 ) qs = j;
// he starts the phrase
m_qwords[j].m_quoteStart = qs;
// force him into a quoted phrase
m_qwords[j].m_inQuotes = true;
//m_qwords[j].m_inQuotedPhrase = true;
// fix for tags.uri:http://foo.com/bar so it works like
// tags.uri:"http://foo.com/bar" like it should
int32_t first = -1;
for ( j = 0 ; j < numWords ; j++ ) {
// stop when we hit spaces
if ( words.hasSpace(j) ) {
first = -1;
// skip if not in field
if ( ! m_qwords[j].m_fieldCode ) continue;
// must be in a generic field, the other fields like site:
// will be messed up by this logic
if ( m_qwords[j].m_fieldCode != FIELD_GENERIC ) continue;
// first alnumword in field?
if ( first == -1 ) {
// must be alnum
if ( m_qwords[j].m_isPunct ) continue;
// must have punct then another alnum word
if ( j+2 >= numWords ) break;
// spaces screw it up
if ( words.hasSpace(j+1) ) continue;
// then an alnum word after
first = j;
// we are in fake quoted phrase
m_qwords[j].m_inQuotes = true;
m_qwords[j].m_quoteStart = first;
// make the phrases from the words and the tweaked Bits class
//Phrases phrases;
if ( ! phrases.set ( &words ,
&bits ,
//NULL ,
true , // use stop words?
false , // use stems?
0 /*niceness*/))//disallows HUGE phrases
return false;
int64_t *wids = words.getWordIds();
// do phrases stuff
for ( int32_t i = 0 ; i < numWords ; i++ ) {
// get the ith QueryWord
QueryWord *qw = &m_qwords[i];
// if word is ignored because it is opcode, or whatever,
// it cannot start a phrase
//if ( qw->m_queryOp && qw->m_opcode == OP_PIPE){
// for (int32_t j = i-1;j>=0;j--){
// if (!m_qwords[j].m_phraseId) continue;
// m_qwords[j].m_ignorePhrase = IGNORE_BOOLOP;
// break;
// }
if ( qw->m_ignoreWord ) continue;
if ( qw->m_fieldCode && qw->m_quoteStart < 0) continue;
// get the first word # to our left that starts a phrase
// of which we are a member
qw->m_leftPhraseStart = -1;
//int64_t tmp;
for ( int32_t j = i - 1 ; j >= 0 ; j-- ) {
//if ( ! bits.isIndexable(j) ) continue;
if ( ! bits.canPairAcross(j+1) ) break;
//if ( ! bits.canStartPhrase(j) ) continue;
if ( ! wids[j] ) continue;
// phrases.getNumWordsInPhrase()
//if( j + phrases.getMaxWordsInPhrase(j,&tmp)<i) break;
qw->m_leftPhraseStart = j;
// we can't pair across alnum words now, we just want bigrams
if ( wids[j] ) break;
// now we do bigrams so only allow two words even
// if they are stop words
// . is this word in a quoted phrase?
// . the whole phrase must be in the same set of quotes
// . if we're in a left phrase, he must be in our quotes
if ( qw->m_leftPhraseStart >= 0 &&
qw->m_quoteStart >= 0 &&
qw->m_leftPhraseStart >= qw->m_quoteStart )
qw->m_inQuotedPhrase = true;
// if we start a phrase, ensure next guy is in our quote
if ( ! qw->m_ignorePhrase && i+1 < numWords &&
m_qwords[i+1].m_quoteStart >= 0 &&
m_qwords[i+1].m_quoteStart <= i )
qw->m_inQuotedPhrase = true;
// are we the first word in the quote?
if ( i-1>=0 && qw->m_quoteStart == i )
qw->m_inQuotedPhrase = true;
// ignore single words that are in a quoted phrase
if ( ! keepAllSingles && qw->m_inQuotedPhrase )
qw->m_ignoreWord = IGNORE_QUOTED;
// . get phrase info for this term
// . a pid (phraseId)of 0 indicates it does not start a phrase
// . raw phrase termId
//uint64_t pid = phrases.getPhraseId(i);
uint64_t pid = 0LL;
// nwp is a REGULAR WORD COUNT!!
int32_t nwp = 0;
if ( qw->m_inQuotedPhrase )
// keep at a bigram for now... i'm not sure if we
// will be indexing trigrams
nwp = phrases.getMinWordsInPhrase(i,(int64_t *)&pid);
// just get a two-word phrase term if not in quotes
nwp = phrases.getMinWordsInPhrase(i,(int64_t *)&pid);
// store it
qw->m_rawPhraseId = pid;
// does word #i start a phrase?
if ( pid != 0 ) {
uint64_t ph = qw->m_prefixHash ;
// store the phrase id with coll/prefix
//qw->m_phraseId = g_indexdb.getTermId ( ph , pid );
// like we do it in XmlDoc.cpp's hashString()
if ( ph ) qw->m_phraseId = hash64 ( pid , ph );
else qw->m_phraseId = pid;
// how many regular words int32_t is the bigram?
int32_t plen2; phrases.getPhrase ( i , &plen2 ,2);
// the trigram?
int32_t plen3; phrases.getPhrase ( i , &plen3 ,3);
// get just the bigram for now
qw->m_phraseLen = plen2;
// do not ignore the phrase, it's valid
qw->m_ignorePhrase = 0;
// set our rightPhraseEnd point
//qw->m_rightPhraseEnd = i + phrases.getNumWords(i);
// leave it as 0 if it got truncated i guess by the
qw->m_rightRawWordId = 0LL;
// store left and right raw word ids
int32_t ni = i + nwp - 1;
if ( ni < m_numWords )
// . phrase sign is inherited from word's sign if it's a minus
// . word sign is inherited from field, quote or right before
// the word
// . that is, all words in -"to be or not" will have a '-' sign
// . phraseId may or may not be 0 at this point
if ( qw->m_wordSign == '-' ) qw->m_phraseSign = '-';
// . dist word signs to others in the same connected string
// . use "-cd-rom x-box" w/ no connector in between
// . test queries:
// . +cd-rom +x-box
// . -cd-rom +x-box
// . -m-o-n
// . who was the first (was is a query stop word)
// . www.xxx.com
// . welcome to har.com
// . hezekiah walker the love family affair ii live at radio
// city music hall
// . fotostudio +m-o-n-a-r-t
// . fotostudio -m-o-n-a-r-t
// . i'm home
if ( qw->m_leftConnected && qw->m_leftPhraseStart >= 0 )
qw->m_wordSign = m_qwords[i-2].m_wordSign;
// . if we connected to the alnum word on our right then
// soft require the phrase (i.e. treat like a single term)
// . example: cd-rom or www.xxx.com
// . 'welcome to har.com' should get a '*' for "har.com" sign
if ( qw->m_rightConnected ) {
if ( qw->m_wordSign) qw->m_phraseSign = qw->m_wordSign;
else qw->m_phraseSign = '*';
// . if we're in quotes then any phrase we have should be
// soft required (i.e. treated like a single term)
// . we do not allow phrases in queries to pair across
// quotes. See where we tweak the Bits class above.
if ( qw->m_quoteStart >= 0 ) {
//if (qw->m_wordSign)qw->m_phraseSign = qw->m_wordSign;
//else qw->m_phraseSign = '*';
qw->m_phraseSign = '*';
// . if we are the last word in a phrase that consists of all
// PLAIN stop words then make the phrase have a '*'
// . 'to be or not to be .. test' (cannot pair across "..")
// . don't use QUERY stop words cuz of "who was the first?" qry
if ( pid ) {
int32_t nw = phrases.getNumWordsInPhrase2(i);
int32_t j;
// search up to this far
int32_t maxj = i + nw;
// but not past our truncated limit
if ( maxj > ABS_MAX_QUERY_WORDS )
for ( j = i ; j < maxj ; j++ ) {
// skip punct
if ( words.isPunct(j) ) continue;
// break out if not a stop word
if ( ! bits.isStopWord(j) ) break;
// break out if has a term sign
if ( m_qwords[j].m_wordSign ) break;
// if everybody in phrase #i was a signless stopword
// and the phrase was signless, make it have a '*' sign
if ( j >= maxj && m_qwords[i].m_phraseSign == '\0' )
m_qwords[i].m_phraseSign = '*';
// . if a constituent has a - sign, then the whole
// phrase becomes negative, too
// . fixes 'apple -computer' truncation problem
for ( int32_t j = i ; j < maxj ; j++ )
if ( m_qwords[j].m_wordSign == '-' )
qw->m_phraseSign = '-';
// . ignore unsigned QUERY stop words that are not yet ignored
// and are in unignored phrases
// . 'who was the first taiwanese president' should not get
// "who was" term sign changed to '*' because "was" is a
// QUERY stop word. So ignore singles query stop words
// in phrases now
if ( //! keepAllSingles &&
(qw->m_isQueryStopWord && !m_isBoolean) &&
m_useQueryStopWords &&
! qw->m_fieldCode &&
// fix 'the tigers'
//(qw->m_leftPhraseStart >= 0 || qw->m_phraseId > 0 ) &&
! qw->m_wordSign &&
! qw->m_ignoreWord )
qw->m_ignoreWord = IGNORE_QSTOP;
// . ignore word if connected to right or left alnum word
// . we will be replaced by a phrase(s)
// . do not worry about keepAllSingles because we turn
// this into a phrase below!
// . if ( ! keepAllSingles &&
// . MDW: no longer do this. but we should consider them
// wikibigrams for proximity weighting
// if ( ( qw->m_leftConnected || qw->m_rightConnected ) )
// qw->m_ignoreWord = IGNORE_CONNECTED;
// . ignore and/or between quoted phrases, save user from
// themselves (they meant AND/OR)
if ( ! keepAllSingles && qw->m_isQueryStopWord &&
! qw->m_fieldCode &&
m_useQueryStopWords &&
! qw->m_phraseId && ! qw->m_inQuotes &&
((qw->m_wordId == 255176654160863LL) ||
(qw->m_wordId == 46196171999655LL)) )
qw->m_ignoreWord = IGNORE_QSTOP;
// . ignore repeated single words and phrases
// . look at the old termIds for this, too
// . should ignore 2nd 'time' in 'time after time' then
// . but boolean queries often need to repeat terms
// . NEW - words much be same sign and not in different
// . quoted phrases to be ignored -partap
m_hasDupWords = false;
if ( ! m_isBoolean && !qw->m_ignoreWord ) {
for ( int32_t j = 0 ; j < i ; j++ ) {
if ( m_qwords[j].m_ignoreWord ) continue;
if ( m_qwords[j].m_wordId == qw->m_wordId &&
m_qwords[j].m_wordSign ==qw->m_wordSign &&
(!keepAllSingles ||
== qw->m_quoteStart))){
qw->m_ignoreWord = IGNORE_REPEAT;
m_hasDupWords = true;
if ( ! m_isBoolean && !qw->m_ignorePhrase ) {
// ignore repeated phrases too!
for ( int32_t j = 0 ; j < i ; j++ ) {
if ( m_qwords[j].m_ignorePhrase ) continue;
if ( m_qwords[j].m_phraseId == qw->m_phraseId &&
== qw->m_phraseSign)
qw->m_ignorePhrase = IGNORE_REPEAT;
// . if we only have one quoted query then force its sign to be '+'
// . '"get the phrase" the' --> +"get the phrase" (last the is ignored)
// . "time enough for love" --> +"time enough" +"enough for love"
// . if all unignored words are in the same set of quotes then change
// all '*' (soft-required) phrase signs to '+'
for ( j= 0 ; j < numWords ; j++ ) {
if ( words.isPunct(j)) continue;
if ( m_qwords[j].m_quoteStart < 0 ) break;
if ( m_qwords[j].m_ignoreWord ) continue;
if ( j < 2 ) continue;
if ( m_qwords[j-2].m_quoteStart != m_qwords[j].m_quoteStart )
if ( j >= numWords ) {
for ( j= 0 ; j < numWords ; j++ ) {
if ( m_qwords[j].m_phraseSign == '*' )
m_qwords[j].m_phraseSign = '+';
// . force a plus on any site: or ip: query terms
// . also disable site clustering if we have either of these terms
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord ) continue;
if ( qw->m_wordSign ) continue;
if ( qw->m_fieldCode != FIELD_SITE &&
qw->m_fieldCode != FIELD_IP ) continue;
qw->m_wordSign = '+';
// now check phrase terms. if you do a search in quotes like
// "directions and nearby" it will now generate two phrases:
// "directions and nearby" and "and nearby", so stop "and nearby"
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignorePhrase ) continue;
if ( ! qw->m_phraseId ) continue;
// skip if we start this phrase
if ( qw->m_quoteStart == i ) continue;
// . skip if are not a phrase stop word that is paired across
// . not now, we support 3,4 and 5 word phrases...
//if ( ! qw->m_isStopWord ) continue;
// however, we some quoted phrases are more than 5 words
// TODO: fix this!!!
// ok, nuke this term otherwise
qw->m_ignorePhrase = IGNORE_DEFAULT;
// . if one or more of a phrase's constituent terms exceeded
// term #MAX_QUERY_TERMS then we should also soft require that phrase
// . fixes 'hezekiah walker the love family affair ii live at
// radio city music hall'
// . how many non-ignored phrases?
int32_t count = 0;
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignorePhrase ) continue;
if ( ! qw->m_phraseId ) continue;
for ( int32_t i = 0 ; i < numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
// count non-ignored words
if ( qw->m_ignoreWord ) continue;
// if under limit, continue
if ( count++ < ABS_MAX_QUERY_TERMS ) continue;
// . otherwise, ignore
// . if we set this for our UOR'ed terms from SearchInput.cpp's
// UOR'ed facebook interests then it causes us to get no results!
// so make sure that MAX_QUERY_TERMS is big enough with respect to
// the opCount in SearchInput.cpp
qw->m_ignoreWord = IGNORE_BREECH;
// left phrase should get a '*'
int32_t left = qw->m_leftPhraseStart;
if ( left >= 0 && ! m_qwords[left].m_phraseSign )
m_qwords[left].m_phraseSign = '*';
// our phrase should get a '*'
if ( qw->m_phraseId && ! qw->m_phraseSign )
qw->m_phraseSign = '*';
// . fix the 'x -50a' query so it returns results
// . how many non-negative, non-ignored words/phrases do we have?
count = 0;
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord ) continue;
if ( qw->m_wordSign == '-' ) continue;
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignorePhrase ) continue;
if ( qw->m_phraseSign == '-' ) continue;
if ( qw->m_phraseId == 0LL ) continue;
// if everybody is ignored or negative UNignore first query stop word
if ( count == 0 ) {
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord != IGNORE_QSTOP ) continue;
qw->m_ignoreWord = 0;
// . count ignored WORDS for logging stats
// . do not IGNORE_DEFAULT though, that doesn't really count
//m_numIgnored = 0;
//for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
// if ( ! m_qwords[i].m_ignoreWord ) continue;
// if ( m_qwords[i].m_ignoreWord == IGNORE_DEFAULT ) continue;
// m_numIgnored++;
quoteStart = -1;
int32_t quoteEnd = -1;
// set m_quoteENd
for ( int32_t i = m_numWords - 1 ; i >= 0 ; i-- ) {
// get ith word
QueryWord *qw = &m_qwords[i];
// skip if ignored
if ( qw->m_ignoreWord ) continue;
// skip if not in quotes
if ( qw->m_quoteStart < 0 ) continue;
// if match previous guy...
if ( qw->m_quoteStart == quoteStart ) {
// inherit the end
qw->m_quoteEnd = quoteEnd;
// all done
// ok, we are the end then
quoteEnd = i;
quoteStart = qw->m_quoteStart;
int32_t wkid = 0;
int32_t upTo = -1;
int32_t wk_start;
int32_t wk_nwk;
//int64_t *wids = words.getWordIds();
// set the wiki phrase ids
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
// get ith word
QueryWord *qw = &m_qwords[i];
// in a phrase from before?
if ( i < upTo ) {
qw->m_wikiPhraseId = wkid;
qw->m_wikiPhraseStart = wk_start;
qw->m_numWordsInWikiPhrase = wk_nwk;
// assume none
qw->m_wikiPhraseId = 0;
// skip if punct
if ( ! wids[i] ) continue;
// get word
int32_t nwk ;
nwk = g_wiki.getNumWordsInWikiPhrase ( i , &words );
// bail if none
if ( nwk <= 1 ) continue;
// save these too
wk_start = i;
wk_nwk = nwk;
// inc it
// store it
qw->m_wikiPhraseId = wkid;
qw->m_wikiPhraseStart = wk_start;
qw->m_numWordsInWikiPhrase = wk_nwk;
// set loop parm
upTo = i + nwk;
// consider terms strongly connected like wikipedia title phrases
for ( int32_t i = 0 ; i + 2 < m_numWords ; i++ ) {
// get ith word
QueryWord *qw1 = &m_qwords[i];
// must not already be in a wikiphrase
//if ( qw1->m_wikiPhraseId > 0 ) continue;
// what query word # is that?
int32_t qwn = qw1 - m_qwords;
// get the next alnum word after that
// assume its the last word in our bigram phrase
QueryWord *qw2 = &m_qwords[qwn+2];
// must be in same wikiphrase
if ( qw2->m_wikiPhraseId > 0 ) continue;
// if there is a strong connector like the . in 'dmoz.org'
// then consider it a wiki bigram too
if ( ! qw1->m_rightConnected ) continue;
if ( ! qw2->m_leftConnected ) continue;
// fix 'rdf.org.dumps' so org.dumps gets same
// wikiphraseid as rdf.org
int id;
if ( qw1->m_wikiPhraseId ) id = qw1->m_wikiPhraseId;
else id = ++wkid;
// store it
qw1->m_wikiPhraseId = id;
qw1->m_wikiPhraseStart = i;
qw1->m_numWordsInWikiPhrase = 2;
qw2->m_wikiPhraseId = id;
qw2->m_wikiPhraseStart = i;
qw2->m_numWordsInWikiPhrase = 2;
// all done
return true;
// Finds the position of a word in this query by its raw word id.
// Returns the index of the first QueryWord whose m_rawWordId equals wordId,
// or -1 if wordId is 0 or -1 or if no query word carries that id.
int32_t Query::getWordNum ( int64_t wordId ) {
    // ids of 0 and -1 (punct or whatever) never match a word
    if ( wordId == 0LL  ) return -1;
    if ( wordId == -1LL ) return -1;
    // compare against the raw id: the non-raw word id includes a hash
    // with "0", which signifies an empty field term
    int32_t found = -1;
    for ( int32_t n = 0 ; found < 0 && n < m_numWords ; n++ )
        if ( m_qwords[n].m_rawWordId == wordId ) found = n;
    // still -1 if nothing matched
    return found;
}
//static TermTable s_table;
// Lookup table keyed by the 64-bit hash of a query field name. The stored
// score is (index into g_fields)+1, so a score of 0 means "not a known
// field". Filled by initFieldTable() and read by the getFieldCode*() helpers.
static HashTableX s_table;
// Set to true by initFieldTable() once s_table has been fully populated.
static bool s_isInitialized = false;
// Table of the query field operators (the "name:" prefixes) this file knows.
// Going by the fully visible rows (e.g. the gbtagvector row near the end),
// each row is: field name text, FIELD_* code, a bool, an example query, a
// description, an optional section title (or NULL) and flags (0 or QTF_HIDE).
// initFieldTable() hashes each row's .text into s_table and the
// getFieldCode*() helpers return the matching row's .field.
// NOTE(review): many rows below show only their example/description strings;
// their leading {"name", FIELD_*, bool, line is not visible in this excerpt --
// confirm against the complete file before editing any row.
// 3rd field = m_hasColon
struct QueryField g_fields[] = {
"gbfieldmatch:strings.vendor:\"My Vendor Inc.\"",
"Matches all the meta tag or JSON or XML fields that have "
"the name \"strings.vendor\" and contain the exactly provided "
"value, in this case, <i>My Vendor Inc.</i>. This is CASE "
"SENSITIVE and includes punctuation, so it's exact match. In "
"general, it should be a very short termlist, so it should be fast.",
"Advanced Query Operators",
"Matches the page with that exact url. Uses the first url, not "
"the url it redirects to, if any." ,
0 },
"Match documents whose url ends in the <i>.doc</i> file extension.",
0 },
"Matches all the documents that have a link to "
0 },
//{"links", FIELD_LINKS, true,"Same as link:."},
//{"ilink", FIELD_ILINK, true,"Similar to above."},
"Matches all documents that link to any page on the "
"<i>abc.foobar.com</i> site.",
0 },
"Matches all documents on the mysite.com domain.",
0 },
"Matches all documents whose url starts with "
//{"coll", FIELD_COLL, true,"Not sure if this works."},
"Matches all documents whose IP is",
0 },
"Matches all documents whose IP STARTS with 1.2.3.",
"Matches all documents that have the word dog in their url, like "
"http://www.mysite.com/dog/food.html. However will not match "
"http://www.mysite.com/dogfood.html because it is not an "
"individual word. It must be delineated by punctuation.",
0 },
"Same as inurl.",
"Matches all the documents that have the word cat in their "
0 },
"title:\"cat food\"",
"Matches all the documents that have the phrase \"cat food\" "
"in their title.",
"Same as intitle:",
//{"isclean", FIELD_ISCLEAN, true,"Matches all pages that are deemed non-offensive and safe for children."},
"Matches all documents that are in RSS feeds. Likewise, use "
"<i>gbinrss:0</i> to match all documents that are NOT in RSS feeds.",
"Matches all documents that are in JSON format. "
"Other possible types include "
"<i>html, text, xml, pdf, doc, xls, ppt, ps, css, json, status.</i> "
"<i>status</i> matches special documents that are stored every time "
"a url is spidered so you can see all the spider attempts and when "
"they occurred as well as the outcome.",
"Same as type: above.",
"Matches all documents that have been detected as adult documents "
"and may be unsuitable for children. Likewise, use "
"<i>gbisadult:0</i> to match all documents that were NOT detected "
"as adult documents.",
"Matches all documents that contain the specified image.",
"Matches all documents for which Gigablast detected a thumbnail. "
"Likewise use <i>gbhasthumbnail:0</i> to match all documents that "
"do not have thumbnails.",
"Matches all documents whose tag named * have the specified value "
"in the tagdb entry for the url. Example: gbtagsitenuminlinks:2 "
"matches all documents that have 2 qualified "
"inlinks pointing to their site "
"based on the tagdb record. You can also provide your own "
"tags in addition to the tags already present. See the <i>tagdb</i> "
"menu for more information.",
"Matches all documents that have the specified zip code "
"in their meta zip code tag.",
"Matches all documents originally in the Windows-1252 charset. "
"Available character sets are listed in the <i>iana_charset.cpp</i> "
"file in the open source distribution. There are a lot. Some "
"more popular ones are: <i>us, latin1, iso-8859-1, csascii, ascii, "
"latin2, latin3, latin4, greek, utf-8, shift_jis.",
// this just complicates things for now, so comment out
//{"urlhash",FIELD_URLHASH, false,""},
//{"urlhashdiv10",FIELD_URLHASHDIV10, false,""},
//{"urlhashdiv100",FIELD_URLHASHDIV100, false,""},
"Matches all documents in german. "
"The supported language abbreviations "
"are at the bottom of the <a href=/admin/filters>url filters</a> "
"page. Some more "
"common ones are <i>gblang:en, gblang:es, gblang:fr, "
// need quotes for this one!!
"gblang:\"zh_cn\"</i> (note the quotes for zh_cn!).",
//{"gbcharset", FIELD_CHARSET, false,""},
"Matches all documents whose url has 3 path components to it like "
"Matches all documents that are a minimum of two link hops away "
"from a root url.",
"Matches all documents whose url ends in a filename like "
"<i>http://somedomain.com/dir1/myfile</i> and not "
"<i>http://somedomain.com/dir1/dir2/</i>. Likewise, use "
"<i>gbhasfilename:0</i> to match all the documents that do not "
"have a filename in their url.",
"Matches all documents that have a question mark in their url. "
"Likewise gbiscgi:0 matches all documents that do not.",
"Matches all documents that have a file extension in their url. "
"Likewise, <i>gbhasext:0</i> matches all documents that do not have "
"a file extension in their url.",
"Matches all documents that have a form that submits to the "
"specified url.",
// diffbot only
"Diffbot only. Match the json urls that "
"were extract from this parent url. Example: "
"Matches documents determined by Gigablast to be from the United "
"States. See the country abbreviations in the CountryCode.cpp "
"open source distribution. Some more popular examples include: "
"de, fr, uk, ca, cn.",
0} ,
// mdw
"Matches documents that are permalinks. Use <i>gbpermalink:0</i> "
"to match documents that are NOT permalinks.",
"Matches the document with the docid 123456",
// for content type CT_STATUS documents (Spider status docs)
//{"qdom", FIELD_QUOTA, false,""},
//{"qhost", FIELD_QUOTA, false,""},
"cameras gbsortbyfloat:price",
"Sort all documents that "
"contain 'camera' by price. <i>price</i> can be a root JSON field or "
"in a meta tag, or in an xml <price> tag.",
"Numeric Field Query Operators",
"cameras gbsortbyfloat:product.price",
"Sort all documents that "
"contain 'camera' by price. <i>price</i> can be in a JSON document "
"like "
"<i>{ \"product\":{\"price\":1500.00}} "
"</i> or, alternatively, an XML document like <i>"
0 },
"cameras gbrevsortbyfloat:product.price",
"Like above example but sorted with highest prices on top.",
0 },
"dog gbsortbyint:gbdocspiderdate",
"Sort the documents that contain 'dog' by "
"the date they were last spidered, with the newest "
"on top.",
"dog gbrevsortbyint:gbdocspiderdate",
"Sort the documents that contain 'dog' by "
"the date they were last spidered, but with the "
"oldest on top.",
"pilots gbsortbyint:employees",
"Sort all documents that "
"contain 'pilots' by employees. "
"<i>employees</i> can be a root JSON field or "
"in a meta tag, or in an xml <price> tag. The value it "
"contains is interpreted as a 32-bit integer.",
0 },
"Sort all documents by the date they were spidered/downloaded.",
"Sort all documents by employees. Documents can contain "
"<i>employees</i> in a JSON document "
"like "
"<i>{ \"product\":{\"price\":1500.00}} "
"</i> or, alternatively, an XML document like <i>"
0 },
"Sort all documents by the number of distinct inlinks the "
"document's site has.",
0 },
"Sort all documents by the date they were spidered/downloaded "
"but with the oldest on top.",
// gbmin:price:1.23
"cameras gbminfloat:price:109.99",
"Matches all documents that "
"contain 'camera' or 'cameras' and have a price of at least 109.99. "
"<i>price</i> can be a root JSON field or "
"in a meta tag name <i>price</i>, or in an xml <price> tag.",
0 },
"cameras gbminfloat:product.price:109.99",
"Matches all documents that "
"contain 'camera' or 'cameras' and have a price of at least 109.99 "
"in a JSON document like "
"<i>{ \"product\":{\"price\":1500.00}} "
"</i> or, alternatively, an XML document like <i>"
0 },
// alias we need to bury
"cameras gbmaxfloat:price:109.99",
"Like the gbminfloat examples above, but is an upper bound.",
0 },
"Similar to gbminfloat and gbmaxfloat but is an equality constraint.",
0 },
"Matches all documents with a spider timestamp of at least "
"1391749680. Use this as opposed th gbminfloat when you need "
"32 bits of integer precision.",
"Matches all companies with 20 or less employees "
"in a JSON document like "
"<i>{ \"company\":{\"employees\":13}} "
"</i> or, alternatively, an XML document like <i>"
"Similar to gbminint and gbmaxint but is an equality constraint.",
"Matches documents that have "
"that spider date timestamp (UTC). "
//"Does not include the "
//"special spider status documents. "
"This is the time the document "
"completed downloading.",
"Date Related Query Operators",
"Like above.",
//, but DOES include the special spider status documents.",
"Like above, but is the time the document was last indexed. "
"This time is "
"slightly greater than or equal to the spider date.",//Does not "
//"include the special spider status documents.",
"Like above.",//, but it does include the special spider status "
// {"gbreplyspiderdate",FIELD_GENERIC,false,
// "Example: gbspiderdate:1400081479 will return spider log "
// "results that have "
// "that spider date timestamp (UTC)"},
"Returns facets in "
"the search results "
"by their color field. <i>color</i> is case INsensitive.",
"Facet Related Query Operators",
"Returns facets in "
"the color field in a JSON document like "
"<i>{ \"product\":{\"color\":\"red\"}} "
"</i> or, alternatively, an XML document like <i>"
"</i>. <i>product.color</i> is case INsensitive.",
"gbfacetstr:gbtagsite cat",
"Returns facets from the site names of all pages "
"that contain the word 'cat' or 'cats', etc. <i>gbtagsite</i> is case insensitive."
{"gbfacetint", FIELD_GBFACETINT, false,
"Returns facets in "
"of the <i>cores</i> field in a JSON document like "
"<i>{ \"product\":{\"cores\":10}} "
"</i> or, alternatively, an XML document like <i>"
"</i>. <i>product.cores</i> is case INsensitive.",
{"gbfacetint", FIELD_GBFACETINT, false,
"Returns facets in "
"of the <i>gbhopcount</i> field over the documents so you can "
"search the distribution of hopcounts over the index. <i>gbhopcount</i> is "
"case INsensitive.",
{"gbfacetint", FIELD_GBFACETINT, false,
"Returns facets in "
"of the <i>sitenuminlinks</i> field for the tag <i>sitenuminlinks</i>"
"in the tag for each site. Any numeric tag in tagdb can be "
"facetizeed "
"in this manner so you can add your own facets this way on a per "
"site or per url basis by making tagdb entries. Case Insensitive.",
{"gbfacetint", FIELD_GBFACETINT, false,
"Returns facets in "
"of the <i>size</i> field (either in json, field or a meta tag) "
"and cluster the results into the specified ranges. <i>size</i> is "
"case INsensitive.",
{"gbfacetint", FIELD_GBFACETINT, false,
"Returns facets based on # of site inlinks the site of each "
"result has. <i>gbsitenuminlinks</i> is case INsensitive.",
{"gbfacetfloat", FIELD_GBFACETFLOAT, false,
"Returns facets "
"of the <i>weight</i> field in a JSON document like "
"<i>{ \"product\":{\"weight\":1.45}} "
"</i> or, alternatively, an XML document like <i>"
"</i>. <i>product.weight</i> is case INsensitive.",
{"gbfacetfloat", FIELD_GBFACETFLOAT, false,
"Similar to above but cluster the pricess into the specified ranges. "
"<i>product.price</i> is case insensitive.",
// spider status docs queries
"Query the url of a spider status document.",
"Spider Status Documents", // title
"Query on the last url redirect to, if any.",
NULL, // title
"Query on the status code of the index attempt. 0 means no error.",
"gbssStatusMsg:\"Tcp timed\"",
"Like gbssStatusCode but a textual representation.",
"Query on the HTTP status returned from the web server.",
"Was the document in the index before attempting to index? Use 0 "
" or 1 to find all documents that were not or were, respectively.",
"This field is only present if the document was an object from "
"a diffbot reply. Use gbssIsDiffbotObject:0 to find the non-diffbot "
"If the document was in the index at the time we attempted to "
"reindex it, how long has it been since it was last indexed?",
"Query on the domain of the url.",
"Query on the subdomain of the url.",
"Query on the number of times the url redirect when attempting to "
"index it.",
"Show all the spider status docs for the document with this docId.",
"Query on the hop count of the document.",
"Query on the crawl round number.",
"Show all the documents that were considered dups of this docId.",
"Before this index attempt, how many attempts were there?",
"Before this index attempt, how many successful attempts were there?",
"Before this index attempt, how many failed attempts were there?",
"The date in utc that the document was first indexed.",
"The hash of the document content, excluding dates and times. Used "
"internally for deduping.",
"How long it took in millisecons to download the document.",
"When the download started, in seconds since the epoch, UTC.",
"When the download ended, in seconds since the epoch, UTC.",
"This is 0 or 1 depending on if robots.txt was not obeyed or obeyed, "
"For the last set of indexing attempts how many were errors?",
"The IP address of the document being indexed. Is "
"if unknown.",
"How long it took to lookup the IP of the document. Might have been "
"in the cache.",
"How many good inlinks the document's site had.",
"The site rank of the document. Based directly "
"on the number of inlinks the site had.",
"This is 0 or 1 if the content was not injected or injected, "
"A float between 0 and 100, inclusive. Represents how much "
"the document has changed since the last time we indexed it. This is "
"only valid if the document was successfully indexed this time."
"The spider priority, from 0 to 127, inclusive, of the document "
"according to the url filters table.",
"The url filter expression the document matched.",
"The language of the document. If document was empty or not "
"downloaded then this will not be present. Uses xx to mean "
"unknown language. Uses the language abbreviations found at the "
"bottom of the url filters page.",
"The content type of the document. Like html, xml, json, pdf, etc. "
"This field is not present if unknown.",
"The content length of the document. 0 if empty or not downloaded.",
"The crawl delay according to the robots.txt of the document. "
"This is -1 if not specified in the robots.txt or not found.",
"Was the document's url sent to diffbot for processing this time "
"of spidering the url?",
"Was the document's url sent to diffbot for processing, either this "
"time or some time before?",
"The reply received from diffbot. 0 means success, otherwise, it "
"indicates an error code.",
"The reply received from diffbot represented in text.",
"The length of the reply received from diffbot.",
"The time in milliseconds it took to get a reply from diffbot.",
"The number of times we had to resend the request to diffbot "
"because diffbot returned a 504 gateway timed out error.",
"The number of JSON objects diffbot excavated from the provided url.",
"Matches all special spider status documents that spidered "
"their url successfully. Replace <i>0</i> with other numeric error "
"codes to get the other outcomes.",
"Spider Status Documents", // title
"Matches all special spider status documents that had a status "
"message containing the word <i>tcp</i> like in "
"<i>TCP Timed Out</i>. Similarly, gbstatus:success, "
"gbstatus:\"robots.txt\" are other possibilities.",
"Matches the <i>Spider Status</i> documents for the specified url. "
"These special documents "
"let you know exactly when the url was attempted to be "
"spidered and the outcome.",
0 },
"Matches all the special spider status documents on the "
"mysite.com domain.",
0 },
"Matches all the special spider status "
"documents whose IP is",
0 },
"Matches all the special spider status "
"documents that have the word dog in their url, like "
"http://www.mysite.com/dog/food.html. However will not match "
"http://www.mysite.com/dogfood.html because it is not an "
"individual word. It must be delineated by punctuation.",
0 },
"Similar to gbpathdepth: described above but for special "
"spider status documents.",
"Similar to gbhopcount: described above but for special "
"spider status documents.",
"Similar to gbhasfilename: described above but for special "
"spider status documents.",
"Similar to gbiscgi: described above but for special "
"spider status documents.",
"Similar to gbhasext: described above but for special "
"spider status documents.",
// they don't need to know about this
{"gbtagvector", FIELD_GBTAGVECTOR, false,"","",NULL,QTF_HIDE},
{"gbgigabitvector", FIELD_GBGIGABITVECTOR, false,"","",NULL,QTF_HIDE},
{"gbsamplevector", FIELD_GBSAMPLEVECTOR, false,"","",NULL,QTF_HIDE},
{"gbcontenthash", FIELD_GBCONTENTHASH, false,"","",NULL,QTF_HIDE},
{"gbduphash" ,FIELD_GBOTHER,false,"","",NULL,QTF_HIDE},
// call it field url to hash all up to the first space
{"gbsitetemplate" ,FIELD_URL,false,"","",NULL,QTF_HIDE}
//{"gboutlinkedtitle" ,FIELD_GBOTHER,false,"gboutlinkedtitle:0 and gboutlinkedtitle:1 matches events whose title is not in and in a hyperlink, respectively."},
//{"gbisaggregator" ,FIELD_GBOTHER,false,"gbisaggregator:0|1 depending on if the event came from an event aggregator website, like eviesays.com."},
//{"gbdeduped" ,FIELD_GBOTHER,false,""},
//{"gbinjected", FIELD_GBOTHER,false,"Was the document injected?."},
// Presumably releases/resets this file's static query-field state
// (s_table / s_isInitialized).
// NOTE(review): only the opening line of this function is visible in this
// excerpt; its body is not shown here -- confirm against the complete file.
void resetQuery ( ) {
// Returns the number of rows in the g_fields table, computed from the
// size of the whole array divided by the size of one QueryField.
int32_t getNumFieldCodes ( ) {
    const int32_t tableBytes = (int32_t)sizeof(g_fields);
    const int32_t rowBytes   = (int32_t)sizeof(QueryField);
    return tableBytes / rowBytes;
}
// Builds s_table on first use: for every row of g_fields, the hash of the
// row's text maps to (row index + 1).
// Returns true if the table is ready (built earlier or built by this call).
// Returns the result of log() if s_table.set() fails, and false if an
// insert fails; in both cases s_isInitialized is left false.
static bool initFieldTable(){
    // already built by an earlier call
    if ( s_isInitialized ) return true;
    // set up the hash table
    if ( ! s_table.set ( 8 , 4 , 255,NULL,0,false,0,"qryfldtbl" ) )
        return log("build: Could not init table of "
                   "query fields.");
    // now add in all the field names
    const int32_t numFields = getNumFieldCodes();
    for ( int32_t idx = 0 ; idx < numFields ; idx++ ) {
        //if ( g_fields[idx].m_flag & QTF_DUP ) continue;
        int64_t key = hash64b ( g_fields[idx].text );
        // a name that is already in the table is a dup: the first
        // row wins and this one is skipped. otherwise store the row
        // index + 1 as the score, so a score of 0 can mean "absent"
        if ( ! s_table.isInTable ( &key ) &&
             ! s_table.addTerm ( &key, idx+1 ) )
            return false;
    }
    s_isInitialized = true;
    return true;
}
// Maps a field name to its FIELD_* code.
// s/len    - the name; it is hashed with hash64Lower_a()
// hasColon - optional out-param; always set to false when non-NULL
// Returns the .field of the matching g_fields row, or FIELD_GENERIC if the
// lookup table could not be initialized or the name is not in it.
char getFieldCode ( char *s , int32_t len , bool *hasColon ) {
    // default
    if ( hasColon ) *hasColon = false;
    if ( initFieldTable() ) {
        int64_t key = hash64Lower_a(s, len );//>> 1) ;
        // the score is the g_fields row index + 1; 0 means not found
        const int32_t slot = (int32_t) s_table.getScore ( &key ) ;
        //if (hasColon) *hasColon = g_fields[slot-1].hasColon ;
        if ( slot != 0 ) return g_fields[slot-1].field;
    }
    return FIELD_GENERIC;
}
// Variant of getFieldCode() that first drops one trailing ':' from the
// name and then hashes it with hash64(s,len,0) rather than hash64Lower_a().
// s/len    - the field name, optionally ending in ':'
// hasColon - optional out-param; always set to false when non-NULL
// Returns the .field of the matching g_fields row, or FIELD_GENERIC if the
// lookup table could not be initialized or the name is not in it.
char getFieldCode2 ( char *s , int32_t len , bool *hasColon ) {
    // default
    if (hasColon) *hasColon = false;
    if (!initFieldTable()) return FIELD_GENERIC;
    // subtract the colon for matching. check len first so that an
    // empty name never reads the byte before the buffer (s[-1])
    if ( len > 0 && s[len-1]==':') len--;
    int64_t h = hash64 (s , len , 0LL );
    // the score is the g_fields row index + 1; 0 means not found
    int32_t i = (int32_t) s_table.getScore ( &h ) ;
    if (i==0) return FIELD_GENERIC;
    //if (hasColon) *hasColon = g_fields[i-1].hasColon ;
    return g_fields[i-1].field;
}
// . same lookup as getFieldCode() but the caller supplies the 64-bit hash
//   of the field name directly
// . returns FIELD_GENERIC if the table cannot be initialized or the hash is
//   not in it
char getFieldCode3 ( int64_t h64 ) {
	if ( ! initFieldTable() ) return FIELD_GENERIC;
	const int32_t slot = (int32_t) s_table.getScore ( &h64 );
	// stored scores are index+1, so a non-zero score identifies an entry
	if ( slot != 0 ) return g_fields[slot-1].field;
	return FIELD_GENERIC;
}
// . returns true if the punctuation sequence s/len is one of the sequences
//   treated as a connection: a single char from the set below, or "://"
// . s is guaranteed to be punctuation
bool Query::isConnection ( char *s , int32_t len ) {
	if ( len == 1 ) {
		switch (*s) {
		// . the apostrophe always connects, even for a 's
		// . google seems to require it, and msn and yahoo do.
		//   'john's room -"john's" gives no results on yahoo and msn.
		// . the old "only if NOT a 's" test sat after the return and
		//   could never run (it also peeked at s[1], which is past a
		//   length-1 string), so it has been dropped
		case '\'':
		case ':' :
		case '-' :
		case '.' :
		case '@' :
		case '#' :
		case '/' :
		case '_' :
		case '&' :
		case '=' :
		case '\\':
			return true;
		default:
			return false;
		}
	}
	//if ( len == 3 && s[0]==' ' && s[1]=='&' && s[2]==' ' ) return true;
	// the scheme separator in urls
	if ( len == 3 && s[0]==':' && s[1]=='/' && s[2]=='/' ) return true;
	return false;
}
// . debug helper: logs one LOG_DEBUG line for each of the m_numTerms terms
// . each line carries the term's phrase flag, term ids, sign, explicit and
//   implicit bits, hard count and the term text
void Query::printQueryTerms(){
for (int32_t i=0;i<m_numTerms;i++){
// sign char of this term; a NUL sign is shown as a space (see below)
char c = getTermSign(i);
// scratch copy of the term text for printing
char tt[512];
int32_t ttlen = getTermLen(i);
// clamp the number of bytes copied to the range [0,254]
if ( ttlen > 254 ) ttlen = 254;
if ( ttlen < 0 ) ttlen = 0;
// this is utf8
gbmemcpy ( tt , getTerm(i) , ttlen );
// NOTE(review): nothing visible here NUL-terminates tt before it is
// handed to the %s conversion below -- confirm against the full file
if ( c == '\0' ) c = ' ';
// NOTE(review): the format string below has 11 conversions but only 8
// arguments are visible (the term #, component and otermLen values
// appear to have no matching argument) -- confirm against the full file
logf(LOG_DEBUG, "query: Query Term #%"INT32" "
"phr=%"INT32" termId=%"UINT64" rawTermId=%"UINT64""
" sign=%c "
"ebit=0x%0"XINT64" "
"impBits=0x%0"XINT64" "
"hc=%"INT32" "
"component=%"INT32" "
"otermLen=%"INT32" "
"term=%s ",
(int32_t)isPhrase (i) ,
getTermId (i) ,
getRawTermId (i) ,
c ,
(int64_t)m_qterms[i].m_explicitBit ,
(int64_t)m_qterms[i].m_implicitBits ,
(int32_t) m_qterms[i].m_hardCount ,
tt );
////////// ONLY BOOLEAN STUFF BELOW HERE /////////////
// . evaluates the boolean query against the bit vector bits/vecSize
// . returns false right away if this is not a boolean query
// . otherwise returns the truth value of the first expression, m_expressions[0]
bool Query::testBoolean( unsigned char *bits ,int32_t vecSize){
	if ( ! m_isBoolean ) return false;
	// evaluation always starts at the first expression
	return m_expressions[0].isTruth ( bits , vecSize );
}
// . debug helper for boolean queries
// . the code that printed the expression tree is commented out, so this
//   currently does nothing for any query
void Query::printBooleanTree(){
	if ( m_isBoolean ) {
		// printing of the top-level expression is disabled:
		//Expression *e = &m_expressions [ 0 ];
		//SafeBuf sbuf(1024,"botree");
		//logf(LOG_DEBUG, "query: Boolean Query: %s", sbuf.getBufStart());
	}
}
// . sets up the Expression array for a boolean query: numbers the operand
//   words, gets memory for the expressions and parses the top-level one
// . also sets the m_underNOT member of each QueryTerm, too!!
// . returns false and sets g_errno on error, true otherwise
bool Query::setBooleanOperands ( ) {
// we're done if we're not boolean
if ( ! m_isBoolean ) return true;
// a truncated query is rejected; the message reports m_numTerms
if ( m_truncated ) {
return log("query: Maximum number of bool operands "
"exceeded (%"INT32").",m_numTerms);
// set the QueryWord::m_opNum member of each query word.
// so if you have a query like 'A B OR C' then you need
// to have both A and B if you don't have C. so every word
// unless its an operator needs its own bit. quoted phrases
// may present a problem down the road we'll have to deal with.
int32_t opNum = 0;
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
// skip if field, opcode, punct. etc.
if ( m_qwords[i].m_ignoreWord ) continue;
// assign it a # i guess
m_qwords[i].m_opNum = opNum++;
// alloc the mem if we need to (mdw left off here)
//int32_t need = (m_numWords/3) * sizeof(Expression);
// illegitimate bool expressions breach the buffer, so reserve one
// Expression per query word
int32_t need = (m_numWords) * sizeof(Expression);
// sanity check: force a crash (write through a NULL pointer) if the
// expressions were already set up
if ( m_expressions || m_expressionsAllocSize ) {
char *xx = NULL; *xx = 0; }
// point m_expressions to our generic buffer if it will fit
if ( m_gnext + need < m_gbuf + GBUF_SIZE ) {
m_expressions = (Expression *)m_gnext;
m_gnext += need;
// otherwise, we must allocate memory for it
else {
m_expressions = (Expression *)mmalloc ( need , "Query3" );
if ( ! m_expressions ) return log("query: Could not allocate "
"expressions for query.");
// remember the size of the separately allocated array
m_expressionsAllocSize = need;
// otherwise, we need to set the boolean Expression classes now
// so we can determine which terms are UNDER the influence of
// NOT operators so IndexReadInfo.cpp can read in the WHOLE termlist
// for those terms. (like it would if they had a '-' m_termSign)
Expression *e = &m_expressions [ 0 ];
m_numExpressions = 1;
// . set the expression recursively
// . just setting this will not set the m_hasNOT members of each
// QueryTerm
int32_t status = e->add ( 0 , // first word #
m_numWords , // last word #
this , // array of QueryWords
0 ,// level
false ); // has NOT?
// a negative status is reported as too many operands
if ( status < 0 ) {
return log("query: Maximum number of bool operands "
"(%"INT32") exceeded.",(int32_t)MAX_OPERANDS);
// walk up the parent links until we reach an expression with no parent,
// bailing out with EBADREQUEST if an expression is its own parent
while (e->m_parent) {
if (e == e->m_parent) {
g_errno = EBADREQUEST;
return log(LOG_WARN, "query: expression is own parent: "
"%s", m_orig);
e = e->m_parent;
//log(LOG_DEBUG, "query: set %"INT32" operands",
// m_numOperands);
// NOTE(review): nothing visible here prints into sbuf before its
// buffer is logged -- confirm against the full file
if (g_conf.m_logDebugQuery) {
SafeBuf sbuf(1024);
log(LOG_DEBUG, "query: Boolean Query: %s", sbuf.getBufStart());
// . get all the terms that are UNDER a NOT operator in some fashion
// . these bits are 1-1 with m_qterms[]
qvec_t notBits = e->getNOTBits( false );
// flag each term whose explicit bit is among the NOT bits
// NOTE(review): as visible here the second assignment always overwrites
// the first; it looks like an "else" belongs between them -- confirm
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
if ( m_qterms[i].m_explicitBit & notBits )
m_qterms[i].m_underNOT = true;
m_qterms[i].m_underNOT = false;
return true;
// . scans the query words in [a,b), sets each word's m_level, and turns on
//   the bit numbered m_opNum in m_opBits for each phrase/word term found
// . sets m_hasNOT if the first word scanned carries a '-' sign
// . returns the word # it stopped at (a parenthesis, an opcode or a sign
//   change), or b if it consumed the whole range
// . NOTE(review): this was documented as returning -1 on a bad query, but
//   no visible path returns -1 -- confirm
int32_t Operand::set ( int32_t a , int32_t b , QueryWord *qwords , int32_t level ,
bool underNOT ) {
// clear these
//m_termBits = 0;
m_hasNOT = false;
//m_hardRequiredBits = 0;
// . parse out the operands and OR in their term bits
// . the boy AND girl --> (the AND boy) AND girl
// . "the boy toy" AND girl --> "the boy" AND "boy toy" AND girl
// . cd-rom AND buy --> "cd-rom" AND buy
// . phraseSign will not be 0 if its important (in quotes, cd-rom,...)
for ( int32_t i = a ; i < b ; i++ ) {
// get the QUERY word
QueryWord *qw = &qwords[i];
// set the parenthetical level of the word
qw->m_level = level;
// set this
//qw->m_underNOT = underNOT;
// skip punct
if ( ! qw->isAlphaWord() ) {
// if it is a parens, bail!
if ( qw->m_opcode == OP_LEFTPAREN ) return i ;
if ( qw->m_opcode == OP_RIGHTPAREN ) return i ;
// otherwise, skip this punct and get next word
else continue;
// bail if op code, return PUNCT word # before it
if ( qw->m_opcode ) return i ;
// a '-' sign on the first word marks the operand as negated; a sign
// that differs from the first word's ends the operand at this word
if ( qw->m_wordSign == '-' || qw->m_phraseSign == '-'){
if (i == a) {
m_hasNOT = true;
else {
if (!m_hasNOT) return i;
else if (i>a && m_hasNOT) return i;
// . does it have an unsigned phrase? or in phrase term bits
// . might have a phrase that's not a QueryTerm because
// query is too long
if ( qw->m_phraseId && qw->m_queryPhraseTerm &&
qw->m_phraseSign ) {
//qvec_t e =qw->m_queryPhraseTerm->m_explicitBit;
//if (qw->m_phraseSign == '+') m_hardRequiredBits |= e;
//m_termBits |= e;
// turn on bit # m_opNum in the byte vector, if it fits
int32_t byte = qw->m_opNum / 8;
int32_t mask = 1<<(qw->m_opNum % 8);
if ( byte < MAX_OVEC_SIZE ) m_opBits[byte] |= mask;
// why would it be ignored? oh... if like cd-rom or in quotes
if ( qw->m_ignoreWord ) continue;
// . OR in the word term bits
// . might be a word that's not a QueryTerm because
// query is too long
if ( qw->m_queryWordTerm ) {
//qvec_t e = qw->m_queryWordTerm->m_explicitBit;
//if (qw->m_phraseSign == '+') m_hardRequiredBits |= e;
//m_termBits |= e;
// turn on bit # m_opNum in the byte vector, if it fits
int32_t byte = qw->m_opNum / 8;
int32_t mask = 1<<(qw->m_opNum % 8);
if ( byte < MAX_OVEC_SIZE ) m_opBits[byte] |= mask;
return b;
// . returns -1 on bad query error
// . returns next word to parse (after expression) on success
// . "*globalNumOperands" is how many expressions/operands are being used
// in the global "expressions" and "operands" array
// . new: organize query into sum of products normal form, ie:
// . (a) OR (b AND c AND d) OR (e AND f)
// . table of operator precedence values, one entry per line, each labelled
//   with the operator it stands for
// . NOTE(review): presumably indexed by opcode value; no use of this table
//   is visible in this part of the file -- confirm
unsigned char precedence[] = {
0, // term
4, // OR
3, // AND
2, // NOT
1, // LEFTP
1, // RIGHTP
3, // UOR
5, // PIPE
//#define TYPE_OPERAND 1
//#define TYPE_OPCODE 2
// . parses the query words in [start,end) of query "q" into this expression,
//   recursing into a nested Expression for each left parenthesis
// . "level" is the nesting depth; returns false once it reaches
//   MAX_EXPRESSIONS or if a nested expression fails
// . NOTE(review): this was documented as setting g_errno on error, but no
//   visible line sets it -- confirm
// . on success returns true, with m_numWordsInExpression set to how many
//   words the expression spans
bool Expression::addExpression (int32_t start,
int32_t end,
class Query *q,
int32_t level
) {
// refuse to nest too deep
if ( level >= MAX_EXPRESSIONS ) {
return false;
// the # of the first alnumpunct word in the expression
m_expressionStartWord = start;
// and the last one
//m_end = end;
//m_hasNOT = hasNOT;
m_q = q;
//m_cc = 0;
int32_t i = m_expressionStartWord;
// try to fix
// type:html AND ((site:xyz.com OR site:abc.com))
// query where there are double parens
m_hadOpCode = false;
// "i" is the current alnumpunct word we are parsing out
for ( ; i<end ; i++ ) {
QueryWord *qwords = q->m_qwords;
QueryWord * qw = &qwords[i];
// set this
//qw->m_underNOT = underNOT;
// set leaf node if not an opcode like "AND" and not punct.
if ( ! qw->m_opcode && qw->isAlphaWord()){
//m_opSlots[m_cc] = i;
//m_opTypes[m_cc] = TYPE_OPERAND;
//qw->m_opBitNum = m_cc;
continue;//goto endExpr; mdw
if (qw->m_opcode == OP_NOT){
//hasNOT = !hasNOT;
//underNOT = hasNOT;
else if (qw->m_opcode == OP_LEFTPAREN){
// this is expression
// . it should advance "i" to end of expression
// point to next...
// make a new one:
// NOTE(review): this takes slot m_numExpressions-1 and no increment of
// q->m_numExpressions is visible here -- confirm against the full file
Expression *e=&q->m_expressions[q->m_numExpressions-1];
// now set it
if ( ! e->addExpression ( i+1, // skip over (
end ,
q ,
level + 1) )
return false;
// skip over it. pt to ')'
i += e->m_numWordsInExpression;
// remember the nested expression on the '(' word
qw->m_expressionPtr = e;
//m_opSlots[m_cc] = (int32_t)e;
//m_opTypes[m_cc] = TYPE_EXPRESSION;
//qw->m_opBitNum = m_cc;
else if (qw->m_opcode == OP_RIGHTPAREN){
// return size i guess, include )
m_numWordsInExpression = i - m_expressionStartWord+1;
return true;
else if (qw->m_opcode) {
// add that mdw
//m_opSlots[m_cc] = qw->m_opcode;
//m_opTypes[m_cc] = TYPE_OPCODE;
//qw->m_opBitNum = m_cc;
m_hadOpCode = true;
// white space?
// ran out of words without hitting a ')': the expression spans
// everything from its start word up to where the scan stopped
m_numWordsInExpression = i - m_expressionStartWord;
return true;
// each bit is 1-1 with the explicit terms in the boolean query
bool Query::matchesBoolQuery ( unsigned char *bitVec , int32_t vecSize ) {
return m_expressions[0].isTruth ( bitVec , vecSize );
// . returns true if bit # opBitNum is on in the byte vector bitVec
// . bit 0 is the low bit of bitVec[0], bit 8 is the low bit of bitVec[1], ...
// . deliberately crashes (writes through a NULL pointer) if the bit lies
//   beyond the vecSize bytes of the vector
bool isBitNumSet ( int32_t opBitNum, unsigned char *bitVec, int32_t vecSize ) {
	const int32_t byteIdx = opBitNum / 8;
	const int32_t bitMask = 1 << ( opBitNum % 8 );
	// sanity check: the byte must lie inside the vector
	if ( byteIdx >= vecSize ) { char *xx=NULL;*xx=0; }
	return ( bitVec[byteIdx] & bitMask ) != 0;
}
// . evaluates this expression, left to right, against the bit vector
//   bitVec/vecSize and returns its truth value
// . "bits" are 1-1 with the query words in Query::m_qwords[] array
// including ignored words and spaces i guess since Expression::add()
// seems to do that.
// . nested expressions (parenthesized parts) are evaluated recursively
// . an expression whose result was never determined evaluates to true
bool Expression::isTruth ( unsigned char *bitVec ,int32_t vecSize ) {
// operand1 operand2 operator1 operand3 operator2 ....
// result: -1 means unknown at this point
int32_t result = -1;
// last AND/OR/NOT opcode seen, 0 if none yet
char prevOpCode = 0;
// value of the operand before the current one
int32_t prevResult ;
// result of current operand
int32_t opResult = -1;
int32_t i = m_expressionStartWord;
int32_t iend = i + m_numWordsInExpression;
// set when a NOT was seen; flips the next operand's value
bool hasNot = false;
for ( ; i < iend ; i++ ) {
QueryWord *qw = &m_q->m_qwords[i];
// ignore parentheses, aren't real opcodes.
// we just want OP_AND/OP_OR/OP_NOT
int32_t opcode = qw->m_opcode;
if ( opcode != OP_AND &&
opcode != OP_OR &&
opcode != OP_NOT )
opcode = 0;
if ( opcode == OP_NOT ) {
hasNot = true;
// so operands are expressions as well
Expression *e = (Expression *)qw->m_expressionPtr;
if ( e ) {
// save prev one. -1 means no prev.
prevResult = opResult;
// set new one
opResult = e->isTruth ( bitVec , vecSize );
// skip over that expression. point to ')'
i += e->m_numWordsInExpression;
// flip?
if ( hasNot ) {
if ( opResult == 1 ) opResult = 0;
else opResult = 1;
hasNot = false;
// a plain operator: remember it for when the next operand arrives
if ( opcode && ! e ) {
prevOpCode = opcode;//m_opSlots[i];
// simple operand
if ( ! opcode && ! e ) {
// for regular word operands
// ignore it like a space?
if ( qw->m_ignoreWord ) continue;
// ignore gbsortby:offerprice in bool queries
// at least for evaluating them
if ( qw->m_ignoreWordInBoolQuery ) continue;
// save old one
prevResult = opResult;
// convert word to term #
QueryTerm *qt = qw->m_queryWordTerm;
// fix title:"notre dame" AND NOT irish
if ( ! qt ) qt = qw->m_queryPhraseTerm;
if ( ! qt ) continue;
// phrase terms are not required and therefore
// do not have a valid qt->m_bitNum set, so dont core
if ( ! qt->m_isRequired ) continue;
// . m_bitNum is set in Posdb.cpp when it sets its
// QueryTermInfo array
// . it is basically the query term #
// . see iff that bit is set in this docid's vec
opResult = isBitNumSet ( qt->m_bitNum,bitVec,vecSize );
// flip?
if ( hasNot ) {
if ( opResult == 1 ) opResult = 0;
else opResult = 1;
hasNot = false;
// need two to tango. i.e. (true OR false)
if ( prevResult == -1 ) continue;
// if this is not the first time... we got two
if ( prevOpCode == OP_AND ) {
// if first operation we encounter is A AND B then
// default result to on. only allow an AND operation
// to turn it off.
if ( result == -1 ) result = true;
if ( ! prevResult ) result = false;
if ( ! opResult ) result = false;
else if ( prevOpCode == OP_OR ) {
// if first operation we encounter is A OR B then
// default result to off
if ( result == -1 ) result = false;
if ( prevResult ) result = true;
if ( opResult ) result = true;
// if we never set result, then it was probably a single
// argument expression like something in double parens like
// ((site:xyz.com OR site:abc.com)). so set it to value of
// first operand, opResult.
if ( prevOpCode == 0 && result == -1 ) result = opResult;
// still unknown counts as true
if ( result == -1 ) return true;
if ( result == 0 ) return false;
return true;
// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
// . hasNOT is true if there's a NOT just to the left of this WHOLE expression,
//   outside the parens
// . the code that accumulated the bits of each operand is commented out
//   below, so this currently always returns 0
qvec_t Expression::getNOTBits ( bool hasNOT ) {
	// for ( int32_t i = 0 ; i < m_numOperands ; i++ ) {
	// 	// get value of the ith operand, be it plain or an expression
	// 	if ( m_operands[i] ) {
	// 		if ( m_hasNOT[i] || hasNOT )
	// 			notBits |= m_operands[i]->m_termBits;
	// 	}
	// 	else
	// 		notBits |= m_expressions[i]->getNOTBits (m_hasNOT[i]);
	// }
	qvec_t accumulated = 0;
	return accumulated;
}
// . print boolean expression for debug purposes
// . appends to sbuf: "NOT " first if m_hasNOT is set, then the operator
//   name between consecutive children
// . NOTE(review): the statements that print the operand and each child are
//   not visible in this view -- confirm against the full file
void Expression::print(SafeBuf *sbuf) {
if (m_hasNOT) sbuf->safePrintf("NOT ");
if (m_operand){
for (int32_t i=0; i < m_numChildren ; i++) {
// no operator is printed after the last child
if (i >= m_numChildren-1) break;
// only these four opcodes print anything
switch (m_opcode) {
case OP_OR: sbuf->safePrintf(" OR " ); break;
case OP_AND: sbuf->safePrintf(" AND " ); break;
case OP_UOR: sbuf->safePrintf(" UOR " ); break;
case OP_PIPE: sbuf->safePrintf(" PIPE "); break;
// . appends this operand to sbuf as a hex number, with a leading "NOT " if
//   the operand is negated
// . the number is m_opBits read through an int64_t pointer
void Operand::print(SafeBuf *sbuf) {
	// int32_t shift = 0;
	// while (m_termBits >> shift) shift++;
	// sbuf->safePrintf("%i", 1<<(shift-1));
	const int64_t bits = *(int64_t *)m_opBits;
	if ( ! m_hasNOT ) {
		sbuf->safePrintf("0x%"XINT64"", bits);
		return;
	}
	sbuf->safePrintf("NOT 0x%"XINT64"",bits);
}
// . returns true if at least one of the m_numTerms query terms is split
// . if any one query term is split, msg3a has to split the query
bool Query::isSplit() {
	int32_t k = 0;
	// advance to the first split term, if there is one
	while ( k < m_numTerms && ! m_qterms[k].isSplit() ) k++;
	return k < m_numTerms;
}
// . explicit initializer for a QueryTerm
// . zeroes the language id and facet counters and runs the explicit
//   constructors of the facet hash table and facet index buffer
void QueryTerm::constructor ( ) {
	// plain scalar state
	m_numDocsThatHaveFacet = 0;
	m_langIdBitsValid      = false;
	m_langIdBits           = 0;
	// member objects that have their own explicit constructor()
	m_facetIndexBuf.constructor();  // safebuf
	m_facetHashTable.constructor(); // hashtablex
}
// . returns false only for terms whose field code is one of the six
//   listed below
// . a term with no field code (0), or with any other field code, is split
bool QueryTerm::isSplit() {
	if ( ! m_fieldCode ) return true;
	const bool notSplit =
		m_fieldCode == FIELD_QUOTA           ||
		m_fieldCode == FIELD_GBTAGVECTOR     ||
		m_fieldCode == FIELD_GBGIGABITVECTOR ||
		m_fieldCode == FIELD_GBSAMPLEVECTOR  ||
		m_fieldCode == FIELD_GBSECTIONHASH   ||
		m_fieldCode == FIELD_GBCONTENTHASH;
	return ! notSplit;
}
// . hash of all the query terms
// . folds each term's m_termId, in term order, into a running 64-bit hash
//   that starts at 0; a query with no terms hashes to 0
int64_t Query::getQueryHash() {
	int64_t accum = 0LL;
	for ( int32_t t = 0 ; t < m_numTerms ; ++t )
		accum = hash64 ( m_qterms[t].m_termId , accum );
	return accum;
}
// explicit initializer for a QueryWord
// NOTE(review): no body is visible in this view -- confirm what it sets up
void QueryWord::constructor () {
void QueryWord::destructor () {