#include "Query.h"
#include "Bits.h"
#include "Phrases.h"
#include "Url.h"
#include "Domains.h"
#include "Clusterdb.h" // g_clusterdb.getNumGlobalRecs()
#include "StopWords.h" // isQueryStopWord()
#include "Sections.h"
#include "Speller.h"
#include "Mem.h"
#include "Msg3a.h"
#include "HashTableX.h"
#include "Synonyms.h"
#include "HighFrequencyTermShortcuts.h"
#include "Wiki.h"
#include "ScoringWeights.h"
#include "RdbList.h"
#include "Process.h"
#include "Conf.h"
#include "termid_mask.h"
#include "Collectiondb.h"
#include "GbUtil.h"
#include <set>
#include "Lemma.h"
#include "Errno.h"
#include "GbMutex.h"
#include "ScopedLock.h"
static int count_quotes(const char *s, size_t len);
Query::Query()
: m_queryWordBuf("Query4"),
m_tr(),
m_filteredQuery("qrystk"),
m_originalQuery("oqbuf"),
m_bigramWeight(1.0),
m_synonymWeight(1.0),
m_word_variations_config()
{
m_qwords = NULL;
m_numWords = 0;
m_qterms = NULL;
m_numTerms = 0;
// Coverity
m_langId = langUnknown;
m_useQueryStopWords = false;
m_allowHighFreqTermCache = false;
m_numTermsUntruncated = 0;
m_isBoolean = false;
m_maxQueryTerms = 0;
memset(m_expressions, 0, sizeof(m_expressions));
reset ( );
}
Query::~Query ( ) {
reset ( );
}
void Query::reset ( ) {
// if Query::constructor() was called explicitly then we have to
// call destructors explicitly as well...
// essentially call QueryTerm::reset() on each query term
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
qw->destructor();
}
m_queryTermBuf.purge();
m_qterms = NULL;
m_tr.clear();
m_filteredQuery.purge();
m_originalQuery.purge();
m_docIdRestriction = 0LL;
m_numWords = 0;
m_numTerms = 0;
m_queryWordBuf.purge();
m_qwords = NULL;
m_numExpressions = 0;
// the site: and ip: query terms will disable site clustering & caching
m_hasPositiveSiteField = false;
m_hasIpField = false;
m_hasUrlField = false;
m_hasSubUrlField = false;
m_truncated = false;
}
// . returns false and sets g_errno on error
// . "query" must be NULL terminated
// . if boolFlag is 0 we ignore all boolean operators
// . if boolFlag is 1 we assume query is boolean
// . if boolFlag is 2 we attempt to detect if query is boolean or not
// . if "keepAllSingles" is true we do not ignore any single word UNLESS
// it is a boolean operator (IGNORE_BOOLOP), a fieldname (IGNORE_FIELDNAME),
// a punct word (IGNORE_DEFAULT) or part of one field value (IGNORE_DEFAULT)
// This is used for term highlighting (Highlight.cpp and Summary.cpp)
bool Query::set(const char *query,
// need language for doing synonyms
lang_t langId,
float bigramWeight,
float synonymWeight,
const WordVariationsConfig *wordVariationsConfig,
bool useQueryStopWords,
bool allowHighFreqTermCache,
int32_t maxQueryTerms)
{
static const WordVariationsConfig defaultWordVariationsConfig;
if(!wordVariationsConfig)
wordVariationsConfig = &defaultWordVariationsConfig;
log(LOG_DEBUG,"query: set2(query='%s', langId=%d, wiktionaryWordVariations=%s, languageSpecificWordVariations=%s useQueryStopWords=%s maxQueryTerms=%d)",
query, (int)langId, wordVariationsConfig->m_wiktionaryWordVariations?"true":"false", wordVariationsConfig->m_languageSpecificWordVariations?"true":"false", useQueryStopWords?"true":"false", maxQueryTerms);
reset();
m_langId = langId;
m_useQueryStopWords = useQueryStopWords;
m_allowHighFreqTermCache = allowHighFreqTermCache;
// fix summary rerank and highlighting.
bool keepAllSingles = true;
m_maxQueryTerms = maxQueryTerms;
// assume boolean auto-detect.
char boolFlag = 2;
if ( ! query ) return true;
m_bigramWeight = bigramWeight;
m_synonymWeight = synonymWeight;
m_word_variations_config = *wordVariationsConfig;
int32_t queryLen = strlen(query);
// truncate query if too big
if ( queryLen >= ABS_MAX_QUERY_LEN ) {
log(LOG_WARN, "query: Query length of %" PRId32" must be less than %" PRId32". Truncating.",
queryLen,(int32_t)ABS_MAX_QUERY_LEN);
queryLen = ABS_MAX_QUERY_LEN - 1;
m_truncated = true;
}
// save original query
if( !m_originalQuery.reserve ( queryLen + 1 ) ) {
logError("Failed to reserve %" PRId32 " bytes, bailing", queryLen+1);
return false;
}
m_originalQuery.safeMemcpy(query, queryLen);
m_originalQuery.nullTerm();
const char *q = query;
// see if it should be boolean...
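// e.g. 'cats AND dogs', 'cats OR (dogs)' or 'cats AND NOT fleas' switch on boolean mode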
for ( int32_t i = 0 ; i < queryLen ; i++ ) {
// but if bool flag is 0 that means it is NOT boolean!
// it must be one for autodetection. so do not autodetect
// unless this is 2.
if ( boolFlag != 2 ) break;
if ( q[i]=='A' && q[i+1]=='N' && q[i+2]=='D' &&
(q[i+3]==' ' || q[i+3]=='(') )
boolFlag = 1;
if ( q[i]=='O' && q[i+1]=='R' &&
(q[i+2]==' ' || q[i+2]=='(') )
boolFlag = 1;
if ( q[i]=='N' && q[i+1]=='O' && q[i+2]=='T' &&
(q[i+3]==' ' || q[i+3]=='(') )
boolFlag = 1;
}
// if we did not set the flag to 1 set it to 0. force to non-bool
if ( boolFlag == 2 ) boolFlag = 0;
// reserve some space, guessing how much we'd need
int32_t need = queryLen * 2 + 32;
if ( ! m_filteredQuery.reserve ( need ) )
return false;
bool inQuotesFlag = false;
// . copy query into m_filteredQuery
// . translate ( and ) to special query operators so Words class
// can parse them as their own word to make parsing bool queries ez
// for parsing out the boolean operators in setBitScoresBoolean()
for ( int32_t i = 0 ; i < queryLen ; i++ ) {
// gotta count quotes! we ignore operators in quotes
// so you can search for diffbotUri:"article|0|123456"
if ( query[i] == '\"' ) inQuotesFlag = !inQuotesFlag;
if ( inQuotesFlag ) {
//*p = query [i];
//p++;
m_filteredQuery.pushChar(query[i]);
continue;
}
// translate ( and )
if ( boolFlag == 1 && query[i] == '(' ) {
m_filteredQuery.safeMemcpy ( " LeFtP " , 7 );
continue;
}
if ( boolFlag == 1 && query[i] == ')' ) {
m_filteredQuery.safeMemcpy ( " RiGhP " , 7 );
continue;
}
if ( query[i] == '|' ) {
m_filteredQuery.safeMemcpy ( " PiiPE " , 7 );
continue;
}
if(query[i] == '[') {
// translate [#w] [#p] [#s] [w] [p] [s] [nrw] to operators
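// e.g. '[2.5w] fast car' weights the following words by 2.5, a bare '[w]' resets the
// word weight to 1, and '[nrw] maybe' marks the following word(s) as not required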
char *endptr=NULL;
double val = 0.0;
if(is_digit(query[i+1]))
val=strtod(query+i+1,&endptr);
if(endptr!=NULL && endptr!=query+i+1) {
size_t j = (size_t)(endptr-query);
if(query[j]=='w' && query[j+1]==']') {
m_filteredQuery.safePrintf(" LeFtB %f w RiGhB ", val);
i = j + 1;
continue;
} else if(query[j]=='p' && query[j+1]==']') {
m_filteredQuery.safePrintf(" LeFtB %f p RiGhB ", val);
i = j + 1;
continue;
} else if(query[j]=='s' && query[j+1]==']') {
m_filteredQuery.safePrintf(" LeFtB %f s RiGhB ", val);
i = j + 1;
continue;
}
} else if(query[i+1] == 'w' && query[i+2]==']') {
m_filteredQuery.safePrintf(" LeFtB w RiGhB ");
i = i + 2;
continue;
} else if(query[i+1] == 'p' && query[i+2]==']') {
m_filteredQuery.safePrintf(" LeFtB p RiGhB ");
i = i + 2;
continue;
} else if(query[i+1] == 's' && query[i+2]==']') {
m_filteredQuery.safePrintf(" LeFtB s RiGhB ");
i = i + 2;
continue;
} else if( i+4 < queryLen && query[i+1] == 'n' && query[i+2] == 'r' && query[i+3] == 'w' && query[i+4]==']') {
// user specified [nrw] before word, meaning treat it as not required
m_filteredQuery.safePrintf(" LeFtB nrw RiGhB ");
i = i + 4;
continue;
}
}
// TODO: copy altavista's operators here? & | !
// otherwise, just a plain copy
m_filteredQuery.pushChar ( query[i] );
}
// NULL terminate
m_filteredQuery.nullTerm();
if(m_filteredQuery.length() != queryLen || memcmp(m_filteredQuery.getBufStart(),query,queryLen)!=0)
log(LOG_INFO,"query: m_filteredQuery=%*.*s", m_filteredQuery.length(),m_filteredQuery.length(),m_filteredQuery.getBufStart());
Phrases phrases;
// set m_qwords[] array from m_filteredQuery
if ( ! setQWords(boolFlag, keepAllSingles, phrases) )
return false;
// set m_qterms from m_qwords, always succeeds
setQTerms();
// disable stuff for site:, ip: and url: queries
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
const QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord ) continue;
if ( qw->m_fieldCode == FIELD_SITE &&
qw->m_wordSign != '-' )
m_hasPositiveSiteField = true;
else if ( qw->m_fieldCode == FIELD_IP )
m_hasIpField = true;
else if ( qw->m_fieldCode == FIELD_URL )
m_hasUrlField = true;
else if ( qw->m_fieldCode == FIELD_SUBURL )
m_hasSubUrlField = true;
}
// set m_docIdRestriction if a term is gbdocid:
for ( int32_t i = 0 ; i < m_numTerms && ! m_isBoolean ; i++ ) {
// get it
QueryTerm *qt = &m_qterms[i];
if( qt->m_fieldCode == FIELD_GBTERMID ) {
const char *ds = m_qterms[i].m_term + 9; // strlen("gbtermid:")
qt->m_termId = atoll(ds);
}
// gbdocid:?
if ( qt->m_fieldCode != FIELD_GBDOCID ) continue;
// get docid
const char *ds = m_qterms[i].m_term + 8;
m_docIdRestriction = atoll(ds);
break;
}
// . keep it simple for now
// . we limit to MAX_EXPRESSIONS to like 10 now i guess
if ( m_isBoolean ) {
m_numExpressions = 1;
if ( ! m_expressions[0].addExpression ( 0 ,
m_numWords ,
this , // Query
0 ) ) // level
// return false with g_errno set on error
return false;
}
log(LOG_DEBUG,"query: m_numWords=%d, m_numTerms=%d", m_numWords, m_numTerms);
// . if it is not truncated, no need to use hard counts
// . comment this line and the next one out for testing hard counts
if ( ! m_truncated ) return true;
// if they just hit the admin's ceiling, there's nothing we can do
if ( m_numTerms >= m_maxQueryTerms ) return true;
// a temp log message
log(LOG_DEBUG,"query: Encountered %" PRId32" query terms.",m_numTerms);
// otherwise, we're below m_maxQueryTerms BUT above MAX_QUERY_TERMS
// so we can use hard counts to get more power...
// . use the hard count for excessive query terms to save explicit bits
// . just look for operands on the first level that are not OR'ed
char redo = 0;
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
// get the ith word
QueryWord *qw = &m_qwords[i];
// stop at first OR on this level
if ( qw->m_opcode == opcode_t::OP_OR ) break;
// skip all punct
if ( qw->m_isPunct ) continue;
// if we are a boolean query, the next operator can NOT be OP_OR
// because we cannot use terms that are involved in an OR
// as a hard count term, because they are not required terms
for ( int32_t j=i+1 ; m_isBoolean && j<m_numWords; j++ ) {
// stop at previous operator
opcode_t opcode = m_qwords[j].m_opcode;
if ( opcode == opcode_t::OP_NONE ) continue;
if ( opcode != opcode_t::OP_OR ) break;
// otherwise, the next operator is an OR, so do not
// use a hard count for this term
goto stop;
}
// mark it so we can reduce our number of explicit bits used
redo = 1;
}
stop:
// if nothing changed, return now
if ( ! redo ) return true;
// . set the query terms again if we have a long query
if ( ! setQTerms() )
return false;
return true;
}
// returns false and sets g_errno on error
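// Builds m_qterms[] from m_qwords[]: bigram/phrase terms first, then single-word terms,
// then synonym/word-variation/lemma terms; finally duplicate synonyms are merged and the
// required terms are marked.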
bool Query::setQTerms() {
if(g_conf.m_logTraceQuery) {
logTrace(g_conf.m_logTraceQuery, "Query::setQTerms(words:%zu)", m_tr.size());
for(unsigned i=0; i<m_tr.size(); i++) {
logTrace(g_conf.m_logTraceQuery, " word #%u: '%*.*s'", i, (int)m_tr[i].token_len, (int)m_tr[i].token_len, m_tr[i].token_start);
int64_t phraseTermId = m_qwords[i].m_bigramId&TERMID_MASK;
int64_t wordTermId = m_qwords[i].m_wordId&TERMID_MASK;
logTrace(g_conf.m_logTraceQuery, " m_bigramId=%20" PRId64" (%15" PRId64"), m_ignorePhrase=%d m_bigramLen=%d", m_qwords[i].m_bigramId, phraseTermId, m_qwords[i].m_ignorePhrase, m_qwords[i].m_bigramLen);
logTrace(g_conf.m_logTraceQuery, " m_wordId =%20" PRId64" (%15" PRId64"), m_ignoreWord=%d, m_quoteStart=%d, m_quoteEnd=%d, fieldCode=%s, m_prefixHash=0x%lx", m_qwords[i].m_wordId, wordTermId, m_qwords[i].m_ignoreWord, m_qwords[i].m_quoteStart, m_qwords[i].m_quoteEnd, m_qwords[i].m_fieldCode?getFieldCodeName(m_qwords[i].m_fieldCode):"",m_qwords[i].m_prefixHash);
}
}
// . set m_qptrs/m_qtermIds/m_qbits
// . use one bit position for each phraseId and wordId
// count phrases first for allocating
//The elaborate counting of possible bigrams was removed. Use a simple upper bound instead:
int numCandidatePhrases = m_numWords>0 ? m_numWords-1 : 0;
// count single terms
int numCandidateSingles = 0;
for ( int32_t i = 0 ; i < m_numWords; i++ ) {
const QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord &&
qw->m_ignoreWord != IGNORE_QSTOP) continue;
// ignore if in quotes and part of phrase, watch out
// for things like "word", a single word in quotes.
if ( qw->m_quoteStart >= 0 && qw->m_bigramId ) continue;
// if we are not start of quote and NOT in a phrase we
// must be the trailing word i guess.
// fixes '"john smith" -"bob dole"' from having
// smith and dole as query terms.
if ( qw->m_quoteStart >= 0 && qw->m_quoteStart != i )
continue;
// ignore if weight is absolute zero
if ( almostEqualFloat(qw->m_userWeightForWord,0) )
continue;
numCandidateSingles++;
}
// thirdly, count synonyms
int numCandidateSynonyms = 0;
Synonyms syn;
if(m_word_variations_config.m_wiktionaryWordVariations) {
int64_t to = hash64n("to");
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
// get query word
const QueryWord *qw = &m_qwords[i];
// skip if in quotes, we will not get synonyms for it
if ( qw->m_inQuotes ) continue;
// skip if has plus sign in front
if ( qw->m_wordSign == '+' ) continue;
// not '-' either i guess
if ( qw->m_wordSign == '-' ) continue;
// no url: stuff, maybe only title
if ( qw->m_fieldCode && qw->m_fieldCode != FIELD_TITLE )
continue;
// ignore title: etc. words, they are field names
if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) continue;
// ignore boolean operators
if ( qw->m_ignoreWord ) continue;// IGNORE_BOOLOP
// ignore if word weight is zero or synonym weight is zero
if(almostEqualFloat(qw->m_userWeightForWord,0))
continue;
if(almostEqualFloat(qw->m_userWeightForSynonym,0))
continue;
// no, hurts 'Greencastle IN economic development'
if ( qw->m_wordId == to ) continue;
// single letters...
if ( qw->m_wordLen == 1 ) continue;
// set the synonyms for this word
char tmpBuf [ TMPSYNBUFSIZE ];
int32_t naids = syn.getSynonyms ( &m_tr,
i ,
// language of the query.
// 0 means unknown. if this
// is 0 we sample synonyms
// from all languages.
m_langId ,
tmpBuf );
// if no synonyms, all done
if ( naids <= 0 ) continue;
numCandidateSynonyms += naids;
}
}
std::vector<std::string> wvg_source_words;
std::vector<int> wvg_source_word_index; //idx in wvg_source_words -> idx of queryword
if(m_word_variations_config.m_languageSpecificWordVariations) {
for(int i=0; i<m_numWords; i++) {
const QueryWord *qw = &m_qwords[i];
if(qw->m_inQuotes) continue;
if(qw->m_wordSign == '+') continue;
if(qw->m_wordSign == '-') continue;
if(qw->m_fieldCode && qw->m_fieldCode != FIELD_TITLE )
continue;
if(qw->m_ignoreWord == IGNORE_FIELDNAME) continue;
// ignore if word weight is zero or synonym weight is zero
if(almostEqualFloat(qw->m_userWeightForWord,0))
continue;
if(almostEqualFloat(qw->m_userWeightForSynonym,0))
continue;
wvg_source_words.emplace_back(qw->m_word,qw->m_wordLen);
wvg_source_word_index.emplace_back(i);
}
auto wvg(WordVariationGenerator::get_generator(m_langId));
m_wordVariations = wvg->query_variations(wvg_source_words, m_word_variations_config.m_word_variations_weights, m_word_variations_config.m_word_variations_threshold);
numCandidateSynonyms += m_wordVariations.size();
if(!m_wordVariations.empty())
logTrace(g_conf.m_logTraceQuery, "word variations produced %d variants", (int)m_wordVariations.size());
else
logTrace(g_conf.m_logTraceQuery, "word variations didn't produce any");
} else
m_wordVariations.clear();
if(g_conf.m_logTraceQuery) {
logTrace(g_conf.m_logTraceQuery, "m_wordVariations.size()=%zu", m_wordVariations.size());
for(unsigned i=0; i<m_wordVariations.size(); i++)
logTrace(g_conf.m_logTraceQuery, " variation #%u: %s weight=%f src=[%d..%d)", i, m_wordVariations[i].word.c_str(), m_wordVariations[i].weight, m_wordVariations[i].source_word_start, m_wordVariations[i].source_word_end);
}
if(m_word_variations_config.m_lemmaWordVariations)
numCandidateSynonyms += 10;
m_numTermsUntruncated = numCandidatePhrases+numCandidateSingles+numCandidateSynonyms;
logTrace(g_conf.m_logTraceQuery, "m_numTermsUntruncated=%d (%d phrases, %d singles, %d synonyms)", m_numTermsUntruncated, numCandidatePhrases, numCandidateSingles, numCandidateSynonyms);
const int numQueryTerms = std::min(std::min(m_numTermsUntruncated,m_maxQueryTerms),ABS_MAX_QUERY_TERMS);
if(numQueryTerms!=m_numTermsUntruncated)
log(LOG_DEBUG, "m_numTermsUntruncated=%d (%d phrases, %d singles, %d synonyms), will be truncated to %d terms for query '%s'",
m_numTermsUntruncated, numCandidatePhrases, numCandidateSingles, numCandidateSynonyms, numQueryTerms,
m_filteredQuery.getBufStart());
// allocate the term buffer
if(numQueryTerms) {
int32_t need = numQueryTerms * sizeof(QueryTerm);
if ( ! m_queryTermBuf.reserve ( need ) )
return false;
m_queryTermBuf.setLabel("stkbuf3");
const char *pp = m_queryTermBuf.getBufStart();
m_qterms = (QueryTerm *)pp;
}
// call constructor on each one here
for(int32_t i = 0; i < numQueryTerms; i++) {
QueryTerm *qt = &m_qterms[i];
qt->constructor();
}
int32_t n = 0;
// do phrase terms
for(int32_t i = 0; i<m_numWords && n<numQueryTerms; i++) {
if(numCandidateSingles+numCandidatePhrases > m_maxQueryTerms) {
//we won't have room for both phrases and singles. Put in as many singles as possible. But phrases
//must come first in the list due to bad assumptions elsewhere in the code.
if(numQueryTerms - n - 1 < numCandidateSingles)
break;
}
QueryWord *qw = &m_qwords[i];
// skip if ignored... mdw...
if ( ! qw->m_bigramId ) continue;
if ( qw->m_ignorePhrase ) continue; // could be a repeat
// none if weight is absolute zero
if ( almostEqualFloat(qw->m_userWeightForPhrase, 0) )
continue;
QueryTerm *qt = &m_qterms[n];
qt->m_qword = qw ;
qt->m_piped = qw->m_piped;
qt->m_isPhrase = true ;
qt->m_synonymOf = NULL;
qt->m_ignored = false;
qt->m_term = NULL;
qt->m_termLen = 0;
qt->m_langIdBitsValid = false;
qt->m_langIdBits = 0;
// stop word? no, we're a phrase term
qt->m_isQueryStopWord = false;
// change in both places
qt->m_termId = qw->m_bigramId & TERMID_MASK;
qt->m_rawTermId = qw->m_rawPhraseId;
// boolean queries are not allowed term signs for phrases
// UNLESS it is a '*' soft require sign which we need for
// phrases like: "cat dog" AND pig
if ( m_isBoolean && qw->m_phraseSign != '*' ) {
qt->m_termSign = '\0';
}
// if not boolean, ensure to change signs in both places
else {
qt->m_termSign = qw->m_phraseSign;
}
qw->m_queryWordTerm = NULL;
// IndexTable.cpp uses this one
qt->m_inQuotes = qw->m_inQuotes;
// point to the string itself that is the phrase
qt->m_term = qw->m_word;
qt->m_termLen = qw->m_bigramLen;
// the QueryWord should have a direct link to the QueryTerm,
// at least for phrase, so we can OR in the bits of its
// constituents in the for loop below
qw->m_queryPhraseTerm = qt ;
// assign score weight, we're a phrase here
qt->m_termWeight = m_bigramWeight;
qt->m_userWeight = qw->m_userWeightForPhrase ;
qt->m_fieldCode = qw->m_fieldCode;
// stuff before a pipe always has a weight of 1
if ( qt->m_piped ) {
qt->m_userWeight = 1;
}
n++;
}
// now if we have enough room, do the singles
for(int32_t i = 0; i < m_numWords && n<numQueryTerms; i++) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord &&
qw->m_ignoreWord != IGNORE_QSTOP) continue;
// ignore if in quotes and part of phrase, watch out
// for things like "word", a single word in quotes.
if ( qw->m_quoteStart >= 0 && qw->m_bigramId ) continue;
// if we are not start of quote and NOT in a phrase we
// must be the trailing word i guess.
// fixes '"john smith" -"bob dole"' from having
// smith and dole as query terms.
if ( qw->m_quoteStart >= 0 && qw->m_quoteStart != i )
continue;
// ignore if weight is absolute zero
if ( almostEqualFloat(qw->m_userWeightForWord,0) )
continue;
QueryTerm *qt = &m_qterms[n];
qt->m_qword = qw ;
qt->m_piped = qw->m_piped;
qt->m_isPhrase = false ;
qt->m_synonymOf = NULL;
// ignore some synonym terms if tf is too low
qt->m_ignored = qw->m_ignoreWord;
// copy the stop-word flag from the query word
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
// change in both places
qt->m_termId = qw->m_wordId & TERMID_MASK;
qt->m_rawTermId = qw->m_rawWordId;
// boolean queries are not allowed term signs
if ( m_isBoolean ) {
qt->m_termSign = '\0';
// boolean fix for "health OR +sports" because
// the + there means exact word match, no synonyms.
if ( qw->m_wordSign == '+' ) {
qt->m_termSign = qw->m_wordSign;
}
}
// if not boolean, ensure to change signs in both places
else {
qt->m_termSign = qw->m_wordSign;
}
int32_t pw = i-1;
// . back up until word that contains quote if in a quoted
// phrase
// . UOR can only support two word phrases really...
if (m_qwords[i].m_quoteStart >= 0)
pw = m_qwords[i].m_quoteStart ;
if ( pw > 0 ) pw--;
// back two more if field
int32_t fieldStart=-1;
int32_t fieldLen=0;
if(pw == 0 && m_qwords[pw].m_ignoreWord==IGNORE_FIELDNAME)
fieldStart = pw;
if(pw > 0 && m_qwords[pw-1].m_ignoreWord==IGNORE_FIELDNAME) {
pw -= 1;
fieldStart = pw;
}
while(pw > 0 && m_qwords[pw].m_ignoreWord == IGNORE_FIELDNAME) {
pw--;
fieldStart = pw;
}
// skip if it is punct. fixes queries like
// "(this OR that)" from including '(' or from including
// a space.
if ( fieldStart >-1 &&
m_qwords[fieldStart].m_isPunct &&
fieldStart+1<m_numWords )
fieldStart++;
if (fieldStart > -1) {
pw = i;
while (pw < m_numWords && m_qwords[pw].m_fieldCode)
pw++;
fieldLen = m_qwords[pw-1].m_word +
m_qwords[pw-1].m_wordLen -
m_qwords[fieldStart].m_word;
}
qw->m_queryWordTerm = qt;
// IndexTable.cpp uses this one
qt->m_inQuotes = qw->m_inQuotes;
// point to the string itself that is the word
if (fieldLen > 0) {
qt->m_term = m_qwords[fieldStart].m_word;
qt->m_termLen = fieldLen;
// fix for query
// text:"" foo bar ""
if ( pw-1 < i ) {
log("query: bad query %s",m_originalQuery.getBufStart());
g_errno = EMALFORMEDQUERY;
return false;
}
// skip past the end of the field value
i = pw-1;
}
else {
qt->m_termLen = qw->m_wordLen;
qt->m_term = qw->m_word;
}
// assign score weight, we're a single-term here
qt->m_termWeight = 1.0;
qt->m_userWeight = qw->m_userWeightForWord;
qt->m_fieldCode = qw->m_fieldCode;
qt->m_userNotRequired = qw->m_userNotRequiredForWord;
// stuff before a pipe always has a weight of 1
if ( qt->m_piped ) {
qt->m_userWeight = 1;
}
n++;
}
// Reset the phrase-term links on each query term (they are set below)
for ( int32_t i = 0; i < n ; i++ ){
QueryTerm *qt = &m_qterms[i];
// assume not in a phrase
qt->m_rightPhraseTermNum = -1;
qt->m_leftPhraseTermNum = -1;
qt->m_rightPhraseTerm = NULL;
qt->m_leftPhraseTerm = NULL;
}
// . set m_inPhrase
for (int32_t i = 0; i < m_numWords ; i++ ) {
const QueryWord *qw = &m_qwords[i];
QueryTerm *qt = qw->m_queryWordTerm;
if (!qt) continue;
// set flag if in a phrase, and set phrase term num
if ( qw->m_queryPhraseTerm ) {
QueryTerm *pt = qw->m_queryPhraseTerm;
qt->m_rightPhraseTermNum = pt - m_qterms;
qt->m_rightPhraseTerm = pt;
}
// if we're in the middle of the phrase
int32_t pn = qw->m_leftPhraseStart;
// convert word to its phrase QueryTerm ptr, if any
QueryTerm *tt = NULL;
if ( pn >= 0 ) tt = m_qwords[pn].m_queryPhraseTerm;
if ( tt ) {
qt->m_leftPhraseTermNum = tt - m_qterms;
qt->m_leftPhraseTerm = tt;
}
// . there might be some phrase term that actually contains
// the same word as we are, but a different occurrence
// . like '"knowledge management" AND NOT management' query
// . made it from "j < i" into "j < m_numWords" because
// 'test "test bed"' was not working but '"test bed" test'
// was working.
for ( int32_t j = 0 ; j < m_numWords ; j++ ) {
// must be our same wordId (same word, different occ.)
const QueryWord *qw2 = &m_qwords[j];
if ( qw2->m_wordId != qw->m_wordId ) continue;
// get first word in the phrase that jth word is in
int32_t pn2 = qw2->m_leftPhraseStart;
// we might be the guy that starts it!
if ( pn2 < 0 && qw2->m_quoteStart != -1 ) pn2 = j;
// if neither is the case, skip this query word
if ( pn2 < 0 ) continue;
// he implies us!
QueryTerm *tt2 = m_qwords[pn2].m_queryPhraseTerm;
if ( tt2 ) {
qt->m_leftPhraseTermNum = tt2 - m_qterms;
qt->m_leftPhraseTerm = tt2;
}
break;
}
}
if(g_conf.m_logTraceQuery) {
logTrace(g_conf.m_logTraceQuery, "query-terms before word variations:");
for(int i=0; i<n; i++)
logTrace(g_conf.m_logTraceQuery, " query-term #%d: termid=%15" PRId64" '%*.*s'", i, m_qterms[i].m_termId, m_qterms[i].m_termLen,m_qterms[i].m_termLen,m_qterms[i].m_term);
}
////////////
//
// . add synonym query terms now
// . skip this part if language is unknown i guess
//
////////////
if(m_word_variations_config.m_wiktionaryWordVariations) {
int64_t to = hash64n("to");
for(int32_t i = 0; i<m_numWords && n<numQueryTerms; i++) {
// get query word
QueryWord *qw = &m_qwords[i];
// skip if in quotes, we will not get synonyms for it
if ( qw->m_inQuotes ) continue;
// skip if has plus sign in front
if ( qw->m_wordSign == '+' ) continue;
// not '-' either i guess
if ( qw->m_wordSign == '-' ) continue;
// no url: stuff, maybe only title
if ( qw->m_fieldCode && qw->m_fieldCode != FIELD_TITLE )
continue;
// skip if ignored like a stopword (stop to->too)
//if ( qw->m_ignoreWord ) continue;
// ignore title: etc. words, they are field names
if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) continue;
// ignore boolean operators
if ( qw->m_ignoreWord ) continue;// IGNORE_BOOLOP
// ignore if word weight is zero or synonym weight is zero
if(almostEqualFloat(qw->m_userWeightForWord,0))
continue;
if(almostEqualFloat(qw->m_userWeightForSynonym,0))
continue;
// no, hurts 'Greencastle IN economic development'
if ( qw->m_wordId == to ) continue;
// single letters...
if ( qw->m_wordLen == 1 ) continue;
// set the synonyms for this word
char tmpBuf [ TMPSYNBUFSIZE ];
int32_t naids = syn.getSynonyms ( &m_tr,
i ,
// language of the query.
// 0 means unknown. if this
// is 0 we sample synonyms
// from all languages.
m_langId ,
tmpBuf );
// if no synonyms, all done
if ( naids <= 0 ) continue;
// sanity
if ( naids > MAX_SYNS ) { g_process.shutdownAbort(true); }
// now make the buffer to hold them for us
qw->m_synWordBuf.setLabel("qswbuf");
qw->m_synWordBuf.safeMemcpy ( &syn.m_synWordBuf );
// get the term for this word
QueryTerm *origTerm = qw->m_queryWordTerm;
// loop over synonyms for word #i now
for(int32_t j = 0; j < naids && n<numQueryTerms; j++) {
// this happens for 'da da da'
if ( ! origTerm ) continue;
// add that query term
QueryTerm *qt = &m_qterms[n];
qt->m_qword = qw; // NULL;
qt->m_piped = qw->m_piped;
qt->m_isPhrase = false ;
qt->m_langIdBits = 0;
// synonym of this term...
qt->m_synonymOf = origTerm;
// nuke this crap since it was done above and we
// missed out!
qt->m_rightPhraseTermNum = -1;
qt->m_leftPhraseTermNum = -1;
qt->m_rightPhraseTerm = NULL;
qt->m_leftPhraseTerm = NULL;
// need this for displaying language of syn in
// the json/xml feed in PageResults.cpp
qt->m_langIdBitsValid = true;
int langId = syn.m_langIds[j];
uint64_t langBit = (uint64_t)1 << langId;
if ( langId >= 64 ) langBit = 0;
qt->m_langIdBits |= langBit;
// need this for Matches.cpp
qt->m_synWids0 = syn.m_wids0[j];
qt->m_synWids1 = syn.m_wids1[j];
int32_t na = syn.m_numAlnumWords[j];
// how many words were in the base we used to
// get the synonym. i.e. if the base is "new jersey"
// then it's 2! and the synonym "nj" has one alnum
// word.
int32_t ba = syn.m_numAlnumWordsInBase[j];
qt->m_numAlnumWordsInSynonym = na;
// crap, "nj" is a synonym of the PHRASE TERM
// bigram "new jersey" not of the single word term
// "new" so fix that.
if ( ba == 2 && origTerm->m_rightPhraseTerm )
qt->m_synonymOf = origTerm->m_rightPhraseTerm;
// ignore some synonym terms if tf is too low
qt->m_ignored = qw->m_ignoreWord;
// copy the stop-word flag from the query word
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
// change in both places
int64_t wid = syn.m_aids[j];
// might be in a title: field or something
if ( qw->m_prefixHash ) {
int64_t ph = qw->m_prefixHash;
wid= hash64h(wid,ph);
}
qt->m_termId = wid & TERMID_MASK;
qt->m_rawTermId = syn.m_aids[j];
// boolean queries are not allowed term signs
if ( m_isBoolean ) {
qt->m_termSign = '\0';
// boolean fix for "health OR +sports" because
// the + there means exact word match, no syns
if ( qw->m_wordSign == '+' ) {
qt->m_termSign = qw->m_wordSign;
}
}
// if not bool, ensure to change signs in both places
else {
qt->m_termSign = qw->m_wordSign;
}
// IndexTable.cpp uses this one
qt->m_inQuotes = qw->m_inQuotes;
// usually this is right
const char *ptr = syn.m_termPtrs[j];
// but if it is NULL that means we transformed the
// word by like removing accent marks and stored
// it in m_synWordBuf, as opposed to just pointing
// to a line in memory of wiktionary-buf.txt.
if ( ! ptr ) {
int32_t off = syn.m_termOffs[j];
if ( off < 0 ) {
g_process.shutdownAbort(true); }
if ( off > qw->m_synWordBuf.length() ) {
g_process.shutdownAbort(true); }
// use QueryWord::m_synWordBuf which should
// be persistent and not disappear like
// syn.m_synWordBuf.
ptr = qw->m_synWordBuf.getBufStart() + off;
}
// point to the string itself that is the word
qt->m_term = ptr;
qt->m_termLen = syn.m_termLens[j];
// assign score weight, we're a synonym here
qt->m_termWeight = m_synonymWeight;
qt->m_userWeight = qw->m_userWeightForSynonym;
qt->m_fieldCode = qw->m_fieldCode;
// stuff before a pipe always has a weight of 1
if ( qt->m_piped ) {
qt->m_userWeight = 1;
}
// otherwise, add it
n++;
}
}
}
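// Add query terms for the language-specific word variations generated above. A variation
// spanning two source words is attached to the corresponding bigram term rather than to the
// single-word term; variations spanning more than two words are not supported.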
if(m_word_variations_config.m_languageSpecificWordVariations) {
logTrace(g_conf.m_logTraceQuery, "Word variations: %zu", m_wordVariations.size());
for(unsigned i=0; i<m_wordVariations.size() && n<numQueryTerms; i++) {
auto const &word_variation(m_wordVariations[i]);
int wordStartIdx = wvg_source_word_index[word_variation.source_word_start];
int wordEndIdx = wvg_source_word_index[word_variation.source_word_end-1];
logTrace(g_conf.m_logTraceQuery, " Word variation #%u: '%s' weight=%f src=[%u..%u]", i, word_variation.word.c_str(), word_variation.weight, wordStartIdx, wordEndIdx);
QueryWord *qw = &m_qwords[wordStartIdx];
if((unsigned)qw->m_wordLen==word_variation.word.length() &&
memcmp(qw->m_word, word_variation.word.data(), word_variation.word.length())==0)
{
//Variation is the same as the base word. The word-variation-plugin is allowed to produce that.
continue; //skip
}
QueryTerm *origTerm = qw->m_queryWordTerm;
//handle if the word variant is a bigram/phrase
bool isPhrase = false;
if(wordEndIdx-wordStartIdx>1) {
logTrace(g_conf.m_logTraceQuery, "Word variation '%s' spans more than 1 word", word_variation.word.c_str());
if(wordEndIdx-wordStartIdx==2) {
//find bigram pointing to first word
QueryTerm *bigramQueryTerm = NULL;
for(int j=0; j<n && !bigramQueryTerm; j++) {
if(m_qterms[j].m_qword==qw && m_qterms[j].m_isPhrase)
bigramQueryTerm = &m_qterms[j];
}
if(bigramQueryTerm) {
logTrace(g_conf.m_logTraceQuery, "Word variation covers '%.*s'", bigramQueryTerm->m_termLen, bigramQueryTerm->m_term);
origTerm = bigramQueryTerm;
isPhrase = true;
} else
log(LOG_LOGIC,"Word variation '%s' bigram/phrase didn't find base bigram", word_variation.word.c_str());
} else {
log(LOG_LOGIC,"Word variation '%s' spans more than 2 words. This is not supported (yet)", word_variation.word.c_str());
}
}
// add that query term
QueryTerm *qt = &m_qterms[n];
qt->m_qword = qw; // NULL;
qt->m_piped = qw->m_piped;
qt->m_isPhrase = isPhrase;
qt->m_langIdBits = 0;
// synonym of this term...
qt->m_synonymOf = origTerm;
// nuke this crap since it was done above and we
// missed out!
qt->m_rightPhraseTermNum = -1;
qt->m_leftPhraseTermNum = -1;
qt->m_rightPhraseTerm = NULL;
qt->m_leftPhraseTerm = NULL;
// need this for displaying language of syn in
// the json/xml feed in PageResults.cpp
qt->m_langIdBitsValid = true;
//int langId = syn.m_langIds[j]; //syn-todo?
//uint64_t langBit = (uint64_t)1 << langId; //syn-todo?
//if(langId >= 64) langBit = 0; //syn-todo?
//qt->m_langIdBits |= langBit; //syn-todo?
// need this for Matches.cpp
qt->m_synWids0 = 0;
qt->m_synWids1 = 0;
qt->m_numAlnumWordsInSynonym = 0;
// ignore some synonym terms if tf is too low
qt->m_ignored = qw->m_ignoreWord;
// copy the stop-word flag from the query word
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
// change in both places
//int64_t wid = syn.m_aids[j];
int64_t wid = hash64Lower_utf8_nospaces(word_variation.word.data(), word_variation.word.length());
// might be in a title: field or something
if(qw->m_prefixHash) {
int64_t ph = qw->m_prefixHash;
wid= hash64h(wid,ph);
}
qt->m_termId = wid & TERMID_MASK;
//qt->m_rawTermId = syn.m_aids[j]; //syn-todo?
// boolean queries are not allowed term signs
if(m_isBoolean) {
qt->m_termSign = '\0';
// boolean fix for "health OR +sports" because
// the + there means exact word match, no syns
if(qw->m_wordSign == '+') {
qt->m_termSign = qw->m_wordSign;
}
}
// if not bool, ensure to change signs in both places
else {
qt->m_termSign = qw->m_wordSign;
}
// IndexTable.cpp uses this one
qt->m_inQuotes = qw->m_inQuotes;
// point to the string itself that is the word
qt->m_term = word_variation.word.data();
qt->m_termLen = word_variation.word.length();
// assign score weight
qt->m_termWeight = word_variation.weight;
qt->m_userWeight = qw->m_userWeightForSynonym;
qt->m_fieldCode = qw->m_fieldCode ;
// stuff before a pipe always has a weight of 1
if(qt->m_piped) {
qt->m_userWeight = 1;
}
// otherwise, add it
n++;
}
}
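// For Danish queries, look up each alfanum word in the lemma lexicon (as typed, then
// lowercased, capitalized and uppercased) and, if a different base word form exists, add
// that base form as a synonym-like query term.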
if(m_word_variations_config.m_lemmaWordVariations && m_langId==langDanish) {
logTrace(g_conf.m_logTraceQuery, "Lexicon-based lemma synonyms");
for(int32_t i = 0; i<m_numWords && n<numQueryTerms; i++) {
if(!m_tr[i].is_alfanum)
continue;
std::string w(m_tr[i].token_start,m_tr[i].token_len);
logTrace(g_conf.m_logTraceQuery, "Checking lemma for '%s'", w.c_str());
auto le = lemma_lexicon->lookup(w);
if(!le) {
//Not found as-is in lexicon. Try lowercase in case it is a capitalized word
char lowercase_word[128];
if(w.size()<sizeof(lowercase_word)) {
size_t sz = to_lower_utf8(lowercase_word,lowercase_word+sizeof(lowercase_word), w.data(), w.data()+w.size());
lowercase_word[sz] = '\0';
if(sz!=w.size() || memcmp(w.data(),lowercase_word,w.size())!=0) {
le = lemma_lexicon->lookup(lowercase_word);
}
}
}
if(!le) {
//Not found as-is in lexicon. Try capitalized in case it is a lowercase or uppercase word
char capitalized_word[128];
if(w.size()<sizeof(capitalized_word)) {
size_t sz = to_capitalized_utf8(capitalized_word,capitalized_word+sizeof(capitalized_word), w.data(), w.data()+w.size());
capitalized_word[sz] = '\0';
if(sz!=w.size() || memcmp(w.data(),capitalized_word,w.size())!=0) {
w = capitalized_word;
le = lemma_lexicon->lookup(w);
}
}
}
if(!le) {
//Not found as-is in lexicon. Try uppercasing it
char uppercase_word[128];
if(w.size()<sizeof(uppercase_word)) {
size_t sz = to_upper_utf8(uppercase_word,uppercase_word+sizeof(uppercase_word), w.data(), w.data()+w.size());
uppercase_word[sz] = '\0';
if(sz!=w.size() || memcmp(w.data(),uppercase_word,w.size())!=0) {
w = uppercase_word;
le = lemma_lexicon->lookup(w);
}
}
}
if(!le)
continue; //unknown word
auto wf = le->find_base_wordform();
if(!wf)
continue;
if(wf->written_form_length==w.size() && memcmp(wf->written_form,w.data(),w.size())==0)
continue; //already base form)
logTrace(g_conf.m_logTraceQuery, "Generating synonym from lemma: %s -> %.*s", w.c_str(), wf->written_form_length,wf->written_form);
QueryWord *qw = &m_qwords[i];
QueryTerm *origTerm = qw->m_queryWordTerm;
// add that query term
QueryTerm *qt = &m_qterms[n];
qt->m_qword = qw; // NULL;
qt->m_piped = qw->m_piped;
qt->m_isPhrase = false;
qt->m_langIdBits = 0;
// synonym of this term...
qt->m_synonymOf = origTerm;
// nuke this crap since it was done above and we
// missed out!
qt->m_rightPhraseTermNum = -1;
qt->m_leftPhraseTermNum = -1;
qt->m_rightPhraseTerm = NULL;
qt->m_leftPhraseTerm = NULL;
// need this for displaying language of syn in
// the json/xml feed in PageResults.cpp
qt->m_langIdBitsValid = true;
//int langId = syn.m_langIds[j]; //syn-todo?
//uint64_t langBit = (uint64_t)1 << langId; //syn-todo?
//if(langId >= 64) langBit = 0; //syn-todo?
//qt->m_langIdBits |= langBit; //syn-todo?
// need this for Matches.cpp
qt->m_synWids0 = 0;
qt->m_synWids1 = 0;
qt->m_numAlnumWordsInSynonym = 0;
// ignore some synonym terms if tf is too low
qt->m_ignored = qw->m_ignoreWord;
// copy the stop-word flag from the query word
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
// change in both places
//int64_t wid = syn.m_aids[j];
int64_t wid = hash64Lower_utf8_nospaces(wf->written_form,wf->written_form_length);
// might be in a title: field or something
if(qw->m_prefixHash) {
int64_t ph = qw->m_prefixHash;
wid= hash64h(wid,ph);
}
qt->m_termId = wid & TERMID_MASK;
//qt->m_rawTermId = syn.m_aids[j]; //syn-todo?
// boolean queries are not allowed term signs
if(m_isBoolean) {
qt->m_termSign = '\0';
// boolean fix for "health OR +sports" because
// the + there means exact word match, no syns
if(qw->m_wordSign == '+') {
qt->m_termSign = qw->m_wordSign;
}
}
// if not bool, ensure to change signs in both places
else {
qt->m_termSign = qw->m_wordSign;
}
// IndexTable.cpp uses this one
qt->m_inQuotes = qw->m_inQuotes;
// point to the string itself that is the word
qt->m_term = wf->written_form;
qt->m_termLen = wf->written_form_length;
// assign score weight
qt->m_termWeight = m_synonymWeight;
qt->m_userWeight = qw->m_userWeightForSynonym;
qt->m_fieldCode = qw->m_fieldCode ;
// stuff before a pipe always has a weight of 1
if(qt->m_piped) {
qt->m_userWeight = 1;
}
// otherwise, add it
n++;
}
}
//Merge duplicated synonyms.
//If one of the above synonym-generations produced the same word (eg. from wiktionary, word-variations and as lemma) then we want to use
//the one with highest weight
for(int i=0; i<n; i++) {
if(m_qterms[i].m_synonymOf) {
//it's a synonym. Are there other synonyms on the same base word with the same form? If so then merge/delete
for(int j=i+1; j<n; ) {
if(m_qterms[j].m_synonymOf == m_qterms[i].m_synonymOf &&
m_qterms[j].m_termLen == m_qterms[i].m_termLen &&
memcmp(m_qterms[j].m_term,m_qterms[i].m_term,m_qterms[j].m_termLen)==0)
{
//Identical synonyms of same base word
//note: direct memcmp() test. Downside is that we don't eliminate uppercase/lowercase duplicates, but neither
//do we get into trouble with German eszet, Lithuanian i, ligatures, titlecase, etc.
logTrace(g_conf.m_logTraceQuery, "merging identical synonyms '%.*s' for word '%.*s'", m_qterms[i].m_termLen,m_qterms[i].m_term, m_qterms[i].m_synonymOf->m_termLen,m_qterms[i].m_synonymOf->m_term);
m_qterms[i].m_termWeight = std::max(m_qterms[i].m_termWeight,m_qterms[j].m_termWeight);
m_qterms[i].m_userWeight = std::max(m_qterms[i].m_userWeight,m_qterms[j].m_userWeight);
memmove(m_qterms+j, m_qterms+j+1, sizeof(m_qterms[0])*(n-j-1));
n--;
} else
j++;
}
}
}
m_numTerms = n;
if ( n > ABS_MAX_QUERY_TERMS ) { g_process.shutdownAbort(true); }
// . if only have one term and it is a signless phrase, make it signed
// . don't forget to set m_termSigns too!
if ( n == 1 && m_qterms[0].m_isPhrase && ! m_qterms[0].m_termSign ) {
m_qterms[0].m_termSign = '*';
}
// . now set m_phrasePart for Summary.cpp's hackfix filter
// . only set this for the non-phrase terms, since keepAllSingles is
// set to true when setting the Query for Summary.cpp::set in order
// to match the singles
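// note: the loop below currently only skips terms; m_phrasePart is not actually assigned here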
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
// skip cd-rom too, if not in quotes
if ( ! m_qterms[i].m_inQuotes ) continue;
// is next term also in a quoted phrase?
if ( i - 1 < 0 ) continue;
//if ( ! m_qterms[i+1].m_isPhrase ) continue;
if ( ! m_qterms[i-1].m_inQuotes ) continue;
// are we in the same quoted phrase?
if ( m_qterms[i+0].m_qword->m_quoteStart !=
m_qterms[i-1].m_qword->m_quoteStart ) continue;
}
// if we have '+test -test':
//if ( negativeBits & requiredBits )
// m_numTerms = 0;
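// mark which terms are required: a term is required unless it is a signless or '*'-signed
// phrase, a synonym, ignored, or explicitly marked not required via [nrw]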
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
QueryTerm *qt = &m_qterms[i];
// assume not required
qt->m_isRequired = false;
// skip signless phrases
if ( qt->m_isPhrase && qt->m_termSign == '\0' ) continue;
if ( qt->m_isPhrase && qt->m_termSign == '*' ) continue;
if ( qt->m_synonymOf ) continue;
// IGNORE_QSTOP?
if ( qt->m_ignored ) continue;
// user specified "[nrw]" before word
if( qt->m_userNotRequired) continue;
// mark it
qt->m_isRequired = true;
}
//If there are two highfreqterms in a row then PosdbTable will ignore the bigram of them because it can't tie the bigram to any required term.
//Example: "key west mystery writers fest" where "key" and "west" are highfreqterms, and therefore postdbtable will ignore the bigram "key+west".
//Options:
// 1: do nothing
// 2: mark the bigram as ignored
// 3: mark the bigram as required
// 4: rewrite Query and the queryterminfo handling in posdbtable so the bigram can be optional.
//We do (3) because it will likely filter out more bad results than good results. This is a hack because marking a bigram as required normally
//requires us to be sure those two words are connected, but for "key west" we are guessing.
//TODO: rewrite query+posdbtable so qword/qterm can be optional, etc.
//The structure of qwords+qterms makes this code unnecessarily clumsy
for(int i=0; i+2<m_numWords; i++) {
if(m_qwords[i ].m_ignoreWord==IGNORE_HIGHFREMTERM &&
m_qwords[i+2].m_ignoreWord==IGNORE_HIGHFREMTERM)
{
if(m_qwords[i].m_queryPhraseTerm && m_qwords[i].m_queryPhraseTerm->m_isPhrase) {
logTrace(g_conf.m_logTraceQuery, "query-words #%d (%.*s) and #%d (%.*s) are both high-freq-terms. Marking bigram as required",
i, m_qwords[i].m_wordLen, m_qwords[i].m_word,
i+2, m_qwords[i+2].m_wordLen, m_qwords[i+2].m_word);
m_qwords[i].m_queryPhraseTerm->m_isRequired = true;
}
}
}
//workaround/hack for double-highfreqterm searches, such as "of a" or "the the" or "the who"
if(m_numWords==3 &&
m_qwords[0].m_ignoreWord==IGNORE_HIGHFREMTERM &&
m_qwords[2].m_ignoreWord==IGNORE_HIGHFREMTERM &&
m_numTerms==1 &&
!m_qterms[0].m_isRequired)
{
log(LOG_DEBUG, "query: Looks like a highfreqterm-highfreqterm query type. Requiring one-and-only QueryTerm/bigram");
m_qterms[0].m_isRequired = true;
//todo: we should investigate if QueryTerm::m_isRequired actually has any effect. It is used
//in a single place in PosdbTable for not generating a QueryTermInfo, but it appears it works
//fine even with the QTI.
}
//if all words are high-freq-terms then we have to mark the generated bigrams as required, otherwise PosdbTable.cpp gets unhappy and
//logs "no required terms in query!"
bool allAlnumWordsAreIgnored = true;
for(int i=0; i<m_numWords; i++) {
if(is_alnum_utf8_string(m_qwords[i].m_word,m_qwords[i].m_word+m_qwords[i].m_wordLen) &&
(m_qwords[i].m_ignoreWord!=IGNORE_HIGHFREMTERM && m_qwords[i].m_ignoreWord!=IGNORE_QSTOP))
allAlnumWordsAreIgnored = false;
}
if(allAlnumWordsAreIgnored) {
log(LOG_DEBUG, "query: all alfanum-terms are ignored (highfreq/qstop). Marking bigrams as required");
for(int i=0; i<m_numTerms; i++) {
if(m_qterms[i].m_isPhrase)
m_qterms[i].m_isRequired = true;
}
}
// required quoted phrase terms
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
QueryTerm *qt = &m_qterms[i];
// quoted phrase?
if ( ! qt->m_isPhrase ) continue;
if ( ! qt->m_inQuotes ) continue;
// mark it
qt->m_isRequired = true;
}
// . for query 'to be or not to be shakespeare'
// require 'tobe' 'beor' 'tobe' because
// they are bigrams in the wikipedia phrase 'to be or not to be'
// and they all consist solely of query stop words. as of
// 8/20/2012 i took 'not' off the query stop word list.
// . require bigrams that consist of 2 query stop words and
// are in a wikipedia phrase. set termSign to '+' i guess?
// . for 'in the nick' , a wiki phrase, make "in the" required
// and give a big bonus for "the nick" below.
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
QueryTerm *qt = &m_qterms[i];
// don't require if negative
if ( qt->m_termSign == '-' ) continue;
// only check bigrams here
if ( ! qt->m_isPhrase ) continue;
// get the query word that starts this phrase
const QueryWord *qw1 = qt->m_qword;
// must be in a wikiphrase
if ( qw1->m_wikiPhraseId <= 0 ) continue;
// what query word # is that?
int32_t qwn = qw1 - m_qwords;
// get the next alnum word after that
// assume its the last word in our bigram phrase
const QueryWord *qw2 = &m_qwords[qwn+2];
// must be in same wikiphrase
if ( qw2->m_wikiPhraseId != qw1->m_wikiPhraseId ) continue;
// must be two stop words
if ( ! qw1->m_isQueryStopWord ) continue;
if ( ! qw2->m_isQueryStopWord ) continue;
// mark it
qt->m_isRequired = true;
}
// . for query 'to be or not to be shakespeare'
// give big bonus for 'ornot' and 'notto' bigram terms because
// the single terms 'or' and 'to' are ignored and because
// 'to be or not to be' is a wikipedia phrase
// . on 8/20/2012 i took 'not' off the query stop word list.
// . now give a big bonus for bigrams whose two terms are in the
// same wikipedia phrase and one and only one of the terms in
// the bigram is a query stop word
// . in general 'ornot' is considered a "synonym" of 'not' and
// gets hit with a .90 score factor, but that should never
// happen, it should be 1.00 and in this special case it should
// be 1.20
// . so for 'time enough for love' the phrase term "enough for"
// gets its m_isWikiHalfStopBigram set AND that phrase term
// is a synonym term of the single word term "enough" and is treated
// as such in the Posdb.cpp logic.
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
QueryTerm *qt = &m_qterms[i];
// assume not!
qt->m_isWikiHalfStopBigram = false;
// don't require if negative
if ( qt->m_termSign == '-' ) continue;
// only check bigrams here
if ( ! qt->m_isPhrase ) continue;
// get the query word that starts this phrase
const QueryWord *qw1 = qt->m_qword;
// must be in a wikiphrase
if ( qw1->m_wikiPhraseId <= 0 ) continue;
// what query word # is that?
int32_t qwn = qw1 - m_qwords;
// get the next alnum word after that
// assume its the last word in our bigram phrase
const QueryWord *qw2 = &m_qwords[qwn+2];
// must be in same wikiphrase
if ( qw2->m_wikiPhraseId != qw1->m_wikiPhraseId ) continue;
// if both query stop words, should have been handled above
// we need one to be a query stop word and the other not
// for this algo
if ( qw1->m_isQueryStopWord && qw2->m_isQueryStopWord )
continue;
// skip if neither is a query stop word; one must be a stop word
// so for 'the time machine' we do not count 'time machine'
// as a halfstopwikibigram
if ( ! qw1->m_isQueryStopWord && ! qw2->m_isQueryStopWord )
continue;
// special flag
qt->m_isWikiHalfStopBigram = true;
}
if(g_conf.m_logTraceQuery)
traceTermsToLog("final query-terms");
return true;
}
bool Query::setQWords ( char boolFlag ,
bool keepAllSingles ,
Phrases &phrases ) {
// . break query up into Words and phrases
// . because we now deal with boolean queries, we make parentheses
// their own separate Word, so tell "words" we're setting a query
plain_tokenizer_phase_1(m_filteredQuery.getBufStart(), m_filteredQuery.length(), &m_tr);
calculate_tokens_hashes(&m_tr);
//hackety-hack...
//The tokenizer phase 2 also recognizes "C++" and "john's", but we cannot use phase 2 because Phrases and Query are
//incompatible with phase-2 tokens (too many assumptions about strictly increasing positions and contiguous memory layout)
//So instead we implement special cases here, until we have time to fix the whole Query class.
for(size_t i=0; i+1<m_tr.size(); i++) {
//Hack for C++
if(m_tr[i].is_alfanum && !m_tr[i+1].is_alfanum &&
m_tr[i].token_len==1 && (m_tr[i].token_start[0]=='c' || m_tr[i].token_start[0]=='C') &&
m_tr[i+1].token_len>=2 && memcmp(m_tr[i+1].token_start,"++",2)==0)
{
m_tr[i].token_len += 2;
m_tr[i].end_pos += 2;
m_tr[i+1].start_pos += 2;
m_tr[i+1].token_start += 2;
m_tr[i+1].token_len -= 2;
if(m_tr[i+1].token_len==0)
m_tr.tokens.erase(m_tr.tokens.begin()+i+1);
continue;
}
//Hack for F#
if(m_tr[i].is_alfanum && !m_tr[i+1].is_alfanum &&
m_tr[i].token_len==1 && (m_tr[i].token_start[0]=='f' || m_tr[i].token_start[0]=='F') &&
m_tr[i+1].token_len>=1 && memcmp(m_tr[i+1].token_start,"#",1)==0)
{
m_tr[i].token_len += 1;
m_tr[i].end_pos += 1;
m_tr[i+1].start_pos += 1;
m_tr[i+1].token_start += 1;
m_tr[i+1].token_len -= 1;
if(m_tr[i+1].token_len==0)
m_tr.tokens.erase(m_tr.tokens.begin()+i+1);
continue;
}
//Hack for C#
if(m_tr[i].is_alfanum && !m_tr[i+1].is_alfanum &&
m_tr[i].token_len==1 && (m_tr[i].token_start[0]=='c' || m_tr[i].token_start[0]=='C') &&
m_tr[i+1].token_len>=1 && memcmp(m_tr[i+1].token_start,"#",1)==0)
{
m_tr[i].token_len += 1;
m_tr[i].end_pos += 1;
m_tr[i+1].start_pos += 1;
m_tr[i+1].token_start += 1;
m_tr[i+1].token_len -= 1;
if(m_tr[i+1].token_len==0)
m_tr.tokens.erase(m_tr.tokens.begin()+i+1);
continue;
}
//Hack for A*
if(m_tr[i].is_alfanum && !m_tr[i+1].is_alfanum &&
m_tr[i].token_len==1 && m_tr[i].token_start[0]=='A' &&
m_tr[i+1].token_len>=1 && memcmp(m_tr[i+1].token_start,"*",1)==0)
{
m_tr[i].token_len += 1;
m_tr[i].end_pos += 1;
m_tr[i+1].start_pos += 1;
m_tr[i+1].token_start += 1;
m_tr[i+1].token_len -= 1;
if(m_tr[i+1].token_len==0)
m_tr.tokens.erase(m_tr.tokens.begin()+i+1);
continue;
}
//Hack for possessive-apostrophe (no need for extra codepoint checks - people usually don't type them in a search field)
if(i+2<m_tr.size() &&
m_tr[i].is_alfanum && !m_tr[i+1].is_alfanum && m_tr[i+2].is_alfanum &&
m_tr[i+1].token_len==1 && (m_tr[i+1].token_start[0]=='\'' || m_tr[i+1].token_start[0]=='`') &&
m_tr[i+2].token_len==1 && (m_tr[i+2].token_start[0]=='s' || m_tr[i+2].token_start[0]=='S'))
{
m_tr[i].end_pos = m_tr[i+2].end_pos;
m_tr[i].token_len += m_tr[i+1].token_len + m_tr[i+2].token_len;
m_tr.tokens.erase(m_tr.tokens.begin()+i+1,m_tr.tokens.begin()+i+3);
continue;
}
}
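// merge three adjacent tokens that form a known slash abbreviation (e.g. a sequence like
// 'tcp' '/' 'ip', assuming it is in the abbreviation list) into one synthetic alfanum token
// made of the first and last parts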
for(size_t i=0; i+2<m_tr.size(); ) {
const auto &t0 = m_tr[i+0];
const auto &t1 = m_tr[i+1];
const auto &t2 = m_tr[i+2];
if(t0.token_end()==t1.token_start && t1.token_end()==t2.token_start &&
is_slash_abbreviation(t0.token_start, t0.token_len+t1.token_len+t2.token_len))
{
size_t sl = t0.token_len+t2.token_len;
char *s = (char*)m_tr.egstack.alloc(sl);
memcpy(s, t0.token_start, t0.token_len);
memcpy(s+t0.token_len, t2.token_start, t2.token_len);
m_tr.tokens.emplace_back(t0.start_pos, t2.end_pos, s,sl, false, true);
m_tr.tokens.erase(m_tr.tokens.begin()+i, m_tr.tokens.begin()+i+3);
} else
i++;
}
int32_t numWords = m_tr.size();
// truncate it
if ( numWords > ABS_MAX_QUERY_WORDS ) {
log("query: Had %" PRId32" words. Max is %" PRId32". Truncating.",
numWords,(int32_t)ABS_MAX_QUERY_WORDS);
numWords = ABS_MAX_QUERY_WORDS;
m_truncated = true;
}
m_numWords = numWords;
// alloc the mem if we need to (mdw left off here)
int32_t need = m_numWords * sizeof(QueryWord);
// sanity check
if ( m_qwords ) { g_process.shutdownAbort(true); }
// point m_qwords to our generic buffer if it will fit
if(!m_queryWordBuf.reserve(need)) {
log(LOG_WARN, "query: Could not allocate mem for query.");
return false;
}
m_qwords = (QueryWord *)m_queryWordBuf.getBufStart();
// reset safebuf in there
for ( int32_t i = 0 ; i < m_numWords ; i++ )
m_qwords[i].constructor();
// are all alpha chars in the query upper case? caps lock on?
bool allUpper = true;
const char *p = m_filteredQuery.getBufStart();
const char *pend = m_filteredQuery.getBufPtr();
for ( ; p < pend ; p += getUtf8CharSize(p) )
if ( is_alpha_utf8 ( p ) && ! is_upper_utf8 ( p ) ) {
allUpper = false; break; }
// . come back here from below when we detect dat query is not boolean
// . we need to redo the bits cuz they may have been messed with below
// redo:
// field code we are in
field_code_t fieldCode = FIELD_UNSET;
char fieldSign = 0;
const char *field = NULL;
int32_t fieldLen = 0;
// keep track of the start of different chunks of quotes
int32_t quoteStart = -1;
bool inQuotes = false;
//bool inVQuotes = false;
char quoteSign = 0;
// the current little sign
char wordSign = 0;
// when reading first word in link: ... field we skip the following
// words until we hit a space because we hash them all together
bool ignoreTilSpace = false;
// assume we're NOT a boolean query
m_isBoolean = false;
// used to not respect the bool operator if it is the first word
bool firstWord = true;
// the query processing is broken into 3 stages.
// . STAGE #1
// . reset all query words to default
// set all m_ignoreWord and m_ignorePhrase to IGNORE_DEFAULT
// . set m_isFieldName, m_fieldCode and m_quoteStart for query words.
// no field names in quotes. +title:"hey there".
// set m_quoteStart to -1 if not in quotes.
// . if quotes immediately follow field code's ':' then distribute
// the field code to all words in the quotes
// . distribute +/- signs across quotes and fields to m_wordSigns.
// support -title:"hey there".
// . set m_quoteStart to -1 if only one alnum word is
// in quotes, what's the point of that?
// . set boolean op codes (m_opcode). cannot be in quotes.
// cannot have a field code. cannot have a word sign (+/-).
// . set m_wordId of FIELD_LINK, _URL, _SITE, _IP fields.
// m_wordId of first should be hash of the whole field value.
// only set its m_ignoreWord to 0, keep its m_ignorePhrase to DEF.
// . set m_ignore of non-op codes, non-fieldname, alnum words to 0.
// . set m_wordId of each non-ignored alnum word.
// . STAGE #2
// . customize Bits class:
// first alnum word can start phrase.
// first alnum word in quotes (m_quoteStart >= 0 ) can start phrase.
// connected on the right but not on the left.. can start phrase.
// no pair across any double quote
// no pair across ".." --- UNLESS in quotes!
// no pair across any change of field code.
// field names may not be part of any phrase or paired across.
// boolean ops may not be part of any phrase or paired across.
// ignored words may not be part of any phrase or paired across.
// . STAGE #3
// . set phrases class w/ custom Bits class mods.
// . set m_bigramId and m_rawPhraseId of all QueryWords. if phraseId
// is not 0 (phrase exists) then set m_ignorePhrase to 0.
// . set m_leftConnected, m_rightConnected. word you are connecting
// to must not be ignored. (no field names or op codes).
// ensure you are in a phrase with the connected word, too, to
// really be connected.
// . set m_leftPhraseStart and m_rightPhraseEnd for all
// m_inQuotePhrase is not needed since if m_quoteStart is >= 0
// we MUST be in a quoted phrase!
// . if word is Connected then set m_ignoreWord to IGNORE_CONNECTED.
// set his m_phraseSign to m_wordSign (if not 0) or '*' (if it is 0).
// m_wordSign may have inherited quote or field sign.
// . if word's m_quoteStart is >= 0 set m_ignoreWord to IGNORE_QUOTED
// set his m_phraseSign to m_wordSign (if not 0) or '*' (if it is 0)
// m_wordSign may have inherited quote or field sign.
// . if one word in a phrase is negative, then set m_phraseSign to '-'
// set the Bits used for making phrases from the Words class
Bits bits;
if ( !bits.set(&m_tr)) {
log(LOG_WARN, "query: Had error processing query: %s.", mstrerror(g_errno));
return false;
}
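// weights and the not-required flag controlled by the bracket operators translated above
// ([#w] [#p] [#s] [nrw]); they are applied to each following query word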
float userWeightForWord = 1;
float userWeightForPhrase = 1;
float userWeightForSynonym = 1;
bool userNotRequiredForWord = false;
int32_t ignorei = -1;
// assume we contain no pipe operator
int32_t pi = -1;
int32_t posNum = 0;
const char *ignoreTill = NULL;
// loop over all words, these QueryWords are 1-1 with "words"
for ( int32_t i = 0 ; i < numWords && i < ABS_MAX_QUERY_WORDS ; i++ ) {
// convenience var, these are 1-1 with "words"
QueryWord *qw = &m_qwords[i];
// set to defaults?
memset ( qw , 0 , sizeof(QueryWord) );
// but quotestart should be -1
qw->m_quoteStart = -1;
qw->m_leftPhraseStart = -1;
// assume QueryWord is ignored by default
qw->m_ignoreWord = IGNORE_DEFAULT;
qw->m_ignorePhrase = IGNORE_DEFAULT;
qw->m_ignoreWordInBoolQuery = false;
qw->m_word = m_tr[i].token_start;
qw->m_wordLen = m_tr[i].token_len;
qw->m_isPunct = !m_tr[i].is_alfanum;
qw->m_posNum = posNum;
// count 1 unit for it
posNum++;
// we ignore the facet value range list...
if ( ignoreTill && qw->m_word < ignoreTill )
continue;
// . we duplicated this code from XmlDoc.cpp's
// getWordPosVec() function
if ( qw->m_isPunct ) { // ! wids[i] ) {
const char *wp = qw->m_word;
int32_t wplen = qw->m_wordLen;
// simple space or sequence of just white space
if ( is_wspace_utf8_string(m_tr[i].token_start, m_tr[i].token_end()))
posNum += 0;
// 'cd-rom'
else if ( wp[0]=='-' && wplen==1 )
posNum += 0;
// 'mr. x'
else if ( wp[0]=='.' && is_wspace_utf8_string(m_tr[i].token_start+1, m_tr[i].token_end()))
posNum += 0;
// animal (dog)
else
posNum++;
}
const char *w = m_tr[i].token_start;
int32_t wlen = m_tr[i].token_len;
// assume it is a query weight operator
qw->m_queryOp = true;
// ignore it? (this is for query weight operators)
if ( i <= ignorei ) continue;
// deal with pipe operators
if ( wlen == 5 &&
w[0]=='P'&&w[1]=='i'&&w[2]=='i'&&w[3]=='P'&&w[4]=='E') {
pi = i;
qw->m_opcode = opcode_t::OP_PIPE;
continue;
}
// [133.0r]
// is it the bracket operator?
// " LeFtB 113 rp RiGhB "
if ( wlen == 5 &&
w[0]=='L'&&w[1]=='e'&&w[2]=='F'&&w[3]=='t'&&w[4]=='B'&&
i+4 < numWords ) {
// s MUST point to a number
const char *s = m_tr[i+2].token_start;
int32_t slen = m_tr[i+2].token_len;
// if no number, it must be
// " leFtB w RiGhB " or " leFtB p RiGhB "
if ( ! is_digit(s[0]) ) {
if(s[0] == 'w') {
// word weight reset
userWeightForWord = 1;
ignorei = i + 4;
} else if(s[0] == 'p') {
// phrase weight reset
userWeightForPhrase = 1;
} else if(s[0] == 's') {
// synonym weight reset
userWeightForSynonym = 1;
} else if(s[0] == 'n' && s[1] == 'r' && s[2] == 'w') {
// set word as not required
userNotRequiredForWord = true;
}
ignorei = i + 4;
} else {
// get the number
float fval = atof2 (s, slen);
				// s2 MUST point to the w, p or s specifier
				const char *s2 = m_tr[i+4].token_start;
				// which weight does the number apply to?
if(s2[0] == 'w') {
userWeightForWord = fval;
} else if(s2[0] == 'p') {
userWeightForPhrase = fval;
} else if(s2[0] == 's') {
userWeightForSynonym = fval;
}
				// ignore all following words up to and including i+6
ignorei = i + 6;
}
continue;
}
// assign score weight, if any for this guy
qw->m_userWeightForWord = userWeightForWord;
qw->m_userWeightForPhrase = userWeightForPhrase;
qw->m_userWeightForSynonym = userWeightForSynonym;
// Set required state based on user input
qw->m_userNotRequiredForWord = userNotRequiredForWord;
qw->m_queryOp = false;
// does word #i have a space in it? that will cancel fieldCode
// if we were in a field
bool endField = false;
if(has_space(m_tr[i].token_start, m_tr[i].token_end()) && ! inQuotes)
endField = true;
// TODO: fix title:" hey there" (space in quotes is ok)
// if there's a quote before the first space then
// it's ok!!!
if ( endField ) {
const char *s = m_tr[i].token_start;
const char *send = s + m_tr[i].token_len;
for ( ; s < send ; s++ ) {
// if the space is inside the quotes then it
// doesn't count!
if(*s == '\"') {
endField = false;
break;
}
if(is_wspace_a(*s))
break;
}
}
// cancel the field if we hit a space (not in quotes)
if ( endField ) {
// cancel the field
fieldCode = FIELD_UNSET;
fieldLen = 0;
field = NULL;
// we no longer have to ignore for link: et al
ignoreTilSpace = false;
}
// . maintain inQuotes and quoteStart
// . quoteStart is the word # that starts the current quote
int32_t nq = count_quotes(m_tr[i].token_start, m_tr[i].token_len);
if ( nq > 0 ) { // && ! ignoreQuotes ) {
// toggle quotes if we need to
if ( nq & 0x01 ) inQuotes = ! inQuotes;
// set quote sign to sign before the quote
if ( inQuotes ) {
quoteSign = '\0';
for ( const char *p = w + wlen - 1 ; p > w ; p--){
if ( *p != '\"' ) continue;
if ( *(p-1) == '-' ) quoteSign = '-';
if ( *(p-1) == '+' ) quoteSign = '+';
break;
}
}
// . quoteStart is the word # the quotes started at
// . it is -1 if not in quotes
// . now we set it to the alnum word AFTER us!!
if ( inQuotes && i+1< numWords ) quoteStart = i+1;
else quoteStart = -1;
}
//log(LOG_DEBUG, "Query: nq: %" PRId32" inQuotes: %d,quoteStart: %" PRId32,
// nq, inQuotes, quoteStart);
// does word #i have a space in it? that will cancel fieldCode
// if we were in a field
// TODO: fix title:" hey there" (space in quotes is ok)
bool cancelField = false;
if ( has_space(m_tr[i].token_start, m_tr[i].token_end()) && ! inQuotes )
cancelField = true;
// fix title:"foo bar" "another quote" so "another quote"
// is not in the title: field
if ( has_space(m_tr[i].token_start, m_tr[i].token_end()) && inQuotes && nq>= 2 )
cancelField = true;
// BUT if we have a quote, and they just got turned off,
// and the space is not after the quote, do not cancel field!
if ( nq == 1 && cancelField ) {
// if we hit the space BEFORE the quote, do NOT cancel
// the field
for ( const char *p = w + wlen - 1 ; p > w ; p--) {
// hey, we got the quote first, keep field
if ( *p == '\"' ) {cancelField = false; break;}
// otherwise, we got space first? cancel it!
if ( is_wspace_a(*p) ) break;
}
}
if ( cancelField ) {
// cancel the field
fieldCode = FIELD_UNSET;
fieldLen = 0;
field = NULL;
// we no longer have to ignore for link: et al
ignoreTilSpace = false;
}
// skip if we should
if ( ignoreTilSpace ){
if (m_qwords[i-1].m_fieldCode){
qw->m_fieldCode = m_qwords[i-1].m_fieldCode;
}
continue;
}
// . is this word potentially a field?
// . it cannot be another field name in a field
if(i < m_numWords-2 &&
m_tr[i+1].token_len==1 && m_tr[i+1].token_start[0]==':' &&
!is_wspace_utf8_string(m_tr[i+2].token_start,m_tr[i+2].token_end()) &&
(!is_punct_utf8(m_tr[i+2].token_start) || m_tr[i+2].token_start[0]=='\"' || m_tr[i+2].token_start[0]=='-') &&
! fieldCode && ! inQuotes)
{
// field name may have started before though if it
// was a compound field name containing hyphens,
// underscores or periods
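			// e.g. for 'tag.uri:foo' the field-name candidate spans
			// the tokens "tag", "." and "uri", so j walks back from
			// "uri" to "tag" before getFieldCode() is consulted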
int32_t j = i-1 ;
while ( j > 0 &&
((m_qwords[j].m_rawWordId != 0) ||
( m_qwords[j].m_wordLen ==1 &&
((m_qwords[j].m_word)[0]=='-' ||
(m_qwords[j].m_word)[0]=='_' ||
(m_qwords[j].m_word)[0]=='.')))) {
j--;
}
if ( j < 0 ) {
j = 0;
}
// advance j to a non-punct word
while (!m_tr[j].is_alfanum)
j++;
// ignore all of these words then,
// they're part of field name
int32_t tlen = 0;
for ( int32_t k = j ; k <= i ; k++ )
tlen += m_tr[k].token_len;
//is it recognized field name,like "title" or "url"?
fieldCode = getFieldCode (m_tr[j].token_start, tlen);
if(fieldCode) {
//Previously this was done in all cases to support searching for sub-sub-sub...fields in json/xml
//The downside was that copy-paste of colon-separated words or artist names like "L:Ron:Harald" didn't work.
// set field name to the compound name if it is
field = m_tr[j].token_start;
fieldLen = tlen;
if(j == i)
fieldSign = wordSign;
else
fieldSign = m_qwords[j].m_wordSign;
//FIXME: TokenizerResult does not promise that tokens that are adjacent in the source string also are adjacent in memory
// (but since Query only does phase-1 tokenization and the tokenizer currently only does tricky things in phase 2 it currently holds)
// if so, it does NOT get its own QueryWord,
// but its sign can be inherited by its members
for ( int32_t k = j ; k <= i ; k++ )
m_qwords[k].m_ignoreWord = IGNORE_FIELDNAME;
continue;
}
}
// what quote chunk are we in? this is 0 if we're not in quotes
if ( inQuotes ) qw->m_quoteStart = quoteStart ;
else qw->m_quoteStart = -1;
qw->m_inQuotes = inQuotes;
// ptr to field, if any
qw->m_fieldCode = fieldCode;
// if we are a punct word, see if we end in a sign that can
// be applied to the next word, a non-punct word
if ( !m_tr[i].is_alfanum ) {
wordSign = w[wlen-1];
if ( wordSign != '-' && wordSign != '+') wordSign = 0;
if ( wlen>1 &&!is_wspace_a (w[wlen-2]) ) wordSign = 0;
if ( i > 0 && wlen == 1 ) wordSign = 0;
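			// e.g. in 'x -y' the ' -' token ends in '-' preceded by
			// whitespace, so 'y' inherits a minus sign; in 'x-y' the
			// lone '-' between words is a connector, not a sign,
			// and is cleared by the i>0 && wlen==1 check above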
// don't add any QueryWord for a punctuation word
continue;
}
// what is the sign of our term? +, -, *, ...
char mysign;
if ( fieldCode ) mysign = fieldSign;
else if ( inQuotes ) mysign = quoteSign;
else mysign = wordSign;
// are we doing default AND?
//if ( forcePlus && ! *mysign ) mysign = '+';
// store the sign
qw->m_wordSign = mysign;
// what quote chunk are we in? this is 0 if we're not in quotes
if ( inQuotes ) qw->m_quoteStart = quoteStart ;
else qw->m_quoteStart = -1;
// . get prefix hash of collection name and field
// . but first convert field to lower case
uint64_t ph;
int32_t fflen = fieldLen;
if ( fflen > 62 ) fflen = 62;
char ff[64];
to_lower3_a ( field , fflen , ff );
ph = hash64 ( ff , fflen );
// map "intitle" map to "title"
if ( fieldCode == FIELD_TITLE )
ph = hash64 ( "title", 5 );
// make "suburl" map to "inurl"
if ( fieldCode == FIELD_SUBURL )
ph = hash64 ( "inurl", 5 );
// fix for filetype:pdf queries
if ( fieldCode == FIELD_TYPE )
ph = hash64 ("type",4);
// ptr to field, if any
qw->m_fieldCode = fieldCode;
// prefix hash
qw->m_prefixHash = ph;
// if we're hashing a url:, link:, site: or ip: term,
// then we need to hash ALL up to the first space
if ( fieldCode == FIELD_URL ||
fieldCode == FIELD_EXT ||
fieldCode == FIELD_LINK ||
fieldCode == FIELD_SITELINK||
fieldCode == FIELD_LINKS||
fieldCode == FIELD_SITE ||
fieldCode == FIELD_IP ||
fieldCode == FIELD_GBFIELDMATCH ) {
// . find 1st space -- that terminates the field value
// . make "end" point to the end of the entire query
const char *end = m_tr[m_tr.size()-1].token_end();
// use this for gbmin:price:1.99 etc.
int32_t firstColonLen = -1;
int32_t lastColonLen = -1;
int32_t colonCount = 0;
// "w" points to the first alnumword after the field,
// so for site:xyz.com "w" points to the 'x' and wlen
			// would be 3 in that case since xyz is a word of 3
// chars. so advance
// wlen until we hit a space.
while (w + wlen < end) {
// stop at first white space
if (is_wspace_utf8(w + wlen)) break;
// in case of gbmin:price:1.99 record first ':'
if (w[wlen] == ':') {
lastColonLen = wlen;
if (firstColonLen == -1)
firstColonLen = wlen;
colonCount++;
}
// fix "gbsortbyint:date)"
// these are used as boolean operators
// so do not include them in the value.
// we also did this above to set cancelField
// to true.
if (w[wlen] == '(' || w[wlen] == ')')
break;
wlen++;
}
// ignore following words until we hit a space
ignoreTilSpace = true;
// the hash. keep it case insensitive. only
// the fieldmatch stuff should be case-sensitive.
// this may change later.
uint64_t wid = hash64Lower_utf8(w, wlen, 0LL);
if (fieldCode == FIELD_GBFIELDMATCH) {
// hash the json field name. (i.e. tag.uri)
// make it case sensitive as
// seen in XmlDoc.cpp::hashFacet2().
// the other fields are hashed in
// XmlDoc.cpp::hashNumber3().
// CASE SENSITIVE!!!!
wid = hash64(w, firstColonLen, 0LL);
// if it is like
// gbfieldmatch:tag.uri:"http://xyz.com/poo"
// then we should hash the string into
// an int just like how the field value would
// be hashed when adding gbfacetstr: terms
// in XmlDoc.cpp:hashFacet2(). the hash of
// the tag.uri field, for example, is set
// in hashFacet1() and set to "val32". so
// hash it just like that does here.
const char *a = w + firstColonLen + 1;
// . skip over colon at start
if (a[0] == ':') a++;
// . skip over quotes at start/end
bool inQuotes = false;
if (a[0] == '\"') {
inQuotes = true;
a++;
}
// end of field
const char *b = a;
// if not in quotes advance until
// we hit whitespace
char cs;
for (; !inQuotes && *b; b += cs) {
cs = getUtf8CharSize(b);
if (is_wspace_utf8(b)) break;
}
// if in quotes, go until we hit quote
for (; inQuotes && *b != '\"'; b++)
;
// now hash that up. this must be 64 bit
// to match in XmlDoc.cpp::hashFieldMatch()
uint64_t val64 = hash64(a, b - a);
// make a composite of tag.uri and http://...
// just like XmlDoc.cpp::hashFacet2() does
wid = hash64(val64, wid);
}
// should we have normalized before hashing?
if (fieldCode == FIELD_URL ||
fieldCode == FIELD_LINK ||
fieldCode == FIELD_SITELINK ||
fieldCode == FIELD_LINKS ||
fieldCode == FIELD_SITE) {
Url url;
url.set( w, wlen, ( fieldCode != FIELD_SITE ), false );
if (fieldCode == FIELD_SITELINK) {
wid = hash64(url.getHost(), url.getHostLen());
} else {
wid = hash64(url.getUrl(), url.getUrlLen());
}
}
// like we do it in XmlDoc.cpp's hashString()
if (ph) {
qw->m_wordId = hash64h(wid, ph);
} else {
qw->m_wordId = wid;
}
qw->m_rawWordId = 0LL; // only for highlighting?
qw->m_bigramId = 0LL;
qw->m_rawPhraseId = 0LL;
qw->m_opcode = opcode_t::OP_NONE;
// definitely not a query stop word
qw->m_isQueryStopWord = false;
// do not ignore the wordId
qw->m_ignoreWord = IGNORE_NO_IGNORE;
// we are the first word?
firstWord = false;
// we're done with this one
continue;
}
opcode_t opcode = opcode_t::OP_NONE;
// if query is all in upper case and we're doing boolean
// DETECT, then assume not boolean
if ( allUpper && boolFlag == 2 ) boolFlag = 0;
// . is this word a boolean operator?
// . cannot be in quotes or field
if ( boolFlag >= 1 && ! inQuotes && ! fieldCode ) {
// are we an operator?
if ( ! firstWord && wlen==2 &&
w[0]=='O' && w[1]=='R')
opcode = opcode_t::OP_OR;
else if ( ! firstWord && wlen==3 &&
w[0]=='A' && w[1]=='N' && w[2]=='D')
opcode = opcode_t::OP_AND;
else if ( ! firstWord && wlen==3 &&
w[0]=='N' && w[1]=='O' && w[2]=='T')
opcode = opcode_t::OP_NOT;
else if ( wlen==5 && w[0]=='L' && w[1]=='e' &&
w[2]=='F' && w[3]=='t' && w[4]=='P' )
opcode = opcode_t::OP_LEFTPAREN;
else if ( wlen==5 && w[0]=='R' && w[1]=='i' &&
w[2]=='G' && w[3]=='h' && w[4]=='P' )
opcode = opcode_t::OP_RIGHTPAREN;
// no pair across or even include any boolean op phrs
if ( opcode != opcode_t::OP_NONE ) {
bits.clearBits(i,D_CAN_PAIR_ACROSS);
bits.clearBits(i,D_CAN_BE_IN_PHRASE);
qw->m_ignoreWord = IGNORE_BOOLOP;
qw->m_opcode = opcode;
if ( opcode == opcode_t::OP_LEFTPAREN ) continue;
if ( opcode == opcode_t::OP_RIGHTPAREN ) continue;
// if this is uncommented all of our operators
// become actual query terms (mdw)
if ( opcode == opcode_t::OP_UOR ) continue;
// if you just have ANDs and ()'s that does
// not make you a boolean query! we are bool
// by default!!
if ( opcode == opcode_t::OP_AND ) continue;
m_isBoolean = true;
continue;
}
}
// . add single-word term id
// . this is computed by hash64AsciiLower()
// . but only hash64Lower_a if _HASHWITHACCENTS_ is true
uint64_t wid = m_tr[i].token_hash;
qw->m_rawWordId = wid;
// we now have a first word already set
firstWord = false;
// . are we a QUERY stop word?
		// . NEVER count it as a stop word if the word is in all CAPS
		//   but the whole query is not all in CAPS
		// . it's probably an acronym in that case
if ( m_tr[i].token_len>1 &&
is_upper_utf8_string(m_tr[i].token_start, m_tr[i].token_end()) &&
! allUpper )
{
qw->m_isQueryStopWord = false;
qw->m_isStopWord = false;
} else {
qw->m_isQueryStopWord =::isQueryStopWord (w,wlen,wid,
m_langId);
// . BUT, if it is a single letter contraction thing
// . ninad: make this == 1 if in utf8! TODO!! it is!
if ( i>0 && wlen == 1 && w[-1] == '\'' )
qw->m_isQueryStopWord = true;
qw->m_isStopWord =::isStopWord (w,wlen,wid);
}
// . do not count as query stop word if it is the last in query
// . like the query: 'baby names that start with j'
if ( i + 2 > numWords ) {
qw->m_isQueryStopWord = false;
}
// like we do it in XmlDoc.cpp's hashString()
if ( ph ) {
qw->m_wordId = hash64(wid, ph);
} else {
qw->m_wordId = wid;
}
// do not ignore the word
qw->m_ignoreWord = IGNORE_NO_IGNORE;
//except if it is a high-frequency-term and expensive to look up. In that case ignore the word but keep the phrases/bigrams thereof
uint64_t termId = (qw->m_wordId & TERMID_MASK);
if(g_conf.m_useHighFrequencyTermCache &&
m_allowHighFreqTermCache && g_hfts.is_registered_term(termId)) {
log(LOG_DEBUG, "query: term='%.*s' with termId %lu is a highfreq term. Marking it for ignoring", wlen, w, termId);
qw->m_ignoreWord = IGNORE_HIGHFREMTERM;
}
// reset for next word
userNotRequiredForWord = false;
}
	//If there's only one alphanumerical word and it was ignored due to high-freq-term then the query is treated as 0 terms and will return an empty
	//result. Therefore un-ignore the single word and let it fetch (best-effort) results from the high-freq-term-cache.
int numAlfanumWords = 0;
int numAlfanumWordsHighFreqTerms = 0;
int alfanumWordIndex = -1;
for(int i=0; i<numWords; i++) {
if(m_tr[i].is_alfanum) {
alfanumWordIndex = i;
numAlfanumWords++;
if(m_qwords[i].m_ignoreWord==IGNORE_HIGHFREMTERM)
numAlfanumWordsHighFreqTerms++;
}
}
if(numAlfanumWords == 1 && numAlfanumWordsHighFreqTerms==1)
m_qwords[alfanumWordIndex].m_ignoreWord = IGNORE_NO_IGNORE;
// pipe those that should be piped
for ( int32_t i = 0 ; i < pi ; i++ ) m_qwords[i].m_piped = true;
// . set m_leftConnected and m_rightConnected
// . we are connected to the first non-punct word on our left
	// if we are separated by a small set of defined punctuation
// . see getIsConnection() for that definition
// . this allows us to just lookup the phrase for things like
// "cd-rom" rather than lookup "cd" , "rom" and "cd-rom"
// . skip if prev word is IGNORE_BOOLOP, IGNORE_FIELDNAME or
// IGNORE_DEFAULT
// . we have to set outside the main loop above since we check
// the m_ignoreWord member of the i+2nd word
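	// e.g. in 'cd-rom' the word 'cd' becomes right-connected and 'rom'
	// left-connected, because the single '-' between them satisfies
	// isConnection() and neither neighbor is ignored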
for ( int32_t i = 0 ; i < numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord ) continue;
if ( i + 2 < numWords && ! m_qwords[i+2].m_ignoreWord&&
isConnection(i+1) )
qw->m_rightConnected = true;
if ( i - 2 >= 0 && ! m_qwords[i-2].m_ignoreWord &&
isConnection(i-1) )
qw->m_leftConnected = true;
}
// now modify the Bits class before generating phrases
for ( int32_t i = 0 ; i < numWords ; i++ ) {
// get default bits
unsigned char b = bits.queryBits(i);
// allow pairing across anything by default
b |= D_CAN_PAIR_ACROSS;
// get Query Word
QueryWord *qw = &m_qwords[i];
// . skip if part of a query weight operator
// . cannot be in a phrase, or anything
if ( qw->m_queryOp && qw->m_opcode==opcode_t::OP_NONE) {
b = D_CAN_PAIR_ACROSS;
}
// is this word a sequence of punctuation and spaces?
else if ( !m_tr[i].is_alfanum ) {
// pair across ANY punct, even double spaces by default
b |= D_CAN_PAIR_ACROSS;
// but do not pair across anything with a quote in it
if ( count_quotes(m_tr[i].token_start, m_tr[i].token_len) > 0 )
b &= ~D_CAN_PAIR_ACROSS;
// continue if we're in quotes
else if ( qw->m_quoteStart >= 0 ) goto next;
// continue if we're in a field
else if ( qw->m_fieldCode > 0 ) goto next;
// if guy on left is in field, do not pair across
if ( i > 0 && m_qwords[i-1].m_fieldCode > 0 )
b &= ~D_CAN_PAIR_ACROSS;
// or if guy on right in field
if ( i +1 < numWords && m_qwords[i+1].m_fieldCode > 0 )
b &= ~D_CAN_PAIR_ACROSS;
// do not pair across ".." when not in quotes/field
const char *w = m_tr[i].token_start;
int32_t wlen = m_tr[i].token_len;
for ( int32_t j = 0 ; j < wlen-1 ; j++ ) {
if ( w[j ]!='.' ) continue;
if ( w[j+1]!='.' ) continue;
b &= ~D_CAN_PAIR_ACROSS;
break;
}
}
else {
// . no field names, bool operators, cruft in fields
// can be any part of a phrase
// . no pair across any change of field code
// . 'girl title:boy' --> no "girl title" phrase!
if ( qw->m_ignoreWord && qw->m_ignoreWord!=IGNORE_HIGHFREMTERM ) {
b &= ~D_CAN_PAIR_ACROSS;
b &= ~D_CAN_BE_IN_PHRASE;
}
// . no boolean ops
// . 'this OR that' --> no "this OR that" phrase
if ( qw->m_opcode != opcode_t::OP_NONE ) {
b &= ~D_CAN_PAIR_ACROSS;
b &= ~D_CAN_BE_IN_PHRASE;
}
if ( qw->m_wordSign == '-' && qw->m_quoteStart < 0) {
b &= ~D_CAN_PAIR_ACROSS;
b &= ~D_CAN_BE_IN_PHRASE;
}
}
next:
// set it back all tweaked
bits.assignBits(i,b);
}
// treat strongly connected phrases like cd-rom and 3.2.0.3 as being
// in quotes for the most part, therefore, set m_quoteStart for them
int32_t j;
int32_t qs = -1;
for ( j = 0 ; j < numWords ; j++ ) {
// skip all but strongly connected words
if ( m_qwords[j].m_ignoreWord != IGNORE_CONNECTED &&
// must also be non punct word OR a space
( m_tr[j].is_alfanum || *m_tr[j].token_start==' ' ) ) {
// break the "quote", if any
qs = -1; continue; }
// if he is punctuation and qs is -1, skip him,
// punctuation words can no longer start a quote
if ( !m_tr[j].is_alfanum && qs == -1 ) continue;
		// unignore him if we should
if ( keepAllSingles ) m_qwords[j].m_ignoreWord = IGNORE_NO_IGNORE;
// if already in quotes, don't bother!
if ( m_qwords[j].m_quoteStart >= 0 ) continue;
// remember him
if ( qs == -1 ) qs = j;
// he starts the phrase
m_qwords[j].m_quoteStart = qs;
// force him into a quoted phrase
m_qwords[j].m_inQuotes = true;
//m_qwords[j].m_inQuotedPhrase = true;
}
// fix for tags.uri:http://foo.com/bar so it works like
// tags.uri:"http://foo.com/bar" like it should
int32_t first = -1;
for ( j = 0 ; j < numWords ; j++ ) {
// stop when we hit spaces
if ( has_wspace_utf8_string(m_tr[j].token_start, m_tr[j].token_end()) ) {
first = -1;
continue;
}
// skip if not in field
if ( ! m_qwords[j].m_fieldCode ) continue;
// must be in a generic field, the other fields like site:
// will be messed up by this logic
if ( m_qwords[j].m_fieldCode != FIELD_GENERIC ) continue;
// first alnumword in field?
if ( first == -1 ) {
// must be alnum
if ( m_qwords[j].m_isPunct ) continue;
// must have punct then another alnum word
if ( j+2 >= numWords ) break;
// spaces screw it up
if ( has_wspace_utf8_string(m_tr[j+1].token_start, m_tr[j+1].token_end()) ) continue;
// then an alnum word after
first = j;
}
// we are in fake quoted phrase
m_qwords[j].m_inQuotes = true;
m_qwords[j].m_quoteStart = first;
}
// make the phrases from the words and the tweaked Bits class
if ( !phrases.set(m_tr,bits) )
return false;
// do phrases stuff
for ( int32_t i = 0 ; i < numWords ; i++ ) {
// get the ith QueryWord
QueryWord *qw = &m_qwords[i];
//if word is ignored (and it is not due to high-freq-term) then don't generate a phrase/bigram query term
if(qw->m_ignoreWord && qw->m_ignoreWord!=IGNORE_HIGHFREMTERM)
continue;
if ( qw->m_fieldCode && qw->m_quoteStart < 0) continue;
// get the first word # to our left that starts a phrase
// of which we are a member
qw->m_leftPhraseStart = -1;
for ( int32_t j = i - 1 ; j >= 0 ; j-- ) {
if ( ! bits.canPairAcross(j+1) ) break;
if ( !m_tr[j].is_alfanum ) continue;
qw->m_leftPhraseStart = j;
// we can't pair across alnum words now, we just want bigrams
if ( m_tr[j].is_alfanum ) break;
// now we do bigrams so only allow two words even
// if they are stop words
break;
}
// . is this word in a quoted phrase?
// . the whole phrase must be in the same set of quotes
// . if we're in a left phrase, he must be in our quotes
if ( qw->m_leftPhraseStart >= 0 &&
qw->m_quoteStart >= 0 &&
qw->m_leftPhraseStart >= qw->m_quoteStart )
qw->m_inQuotedPhrase = true;
// if we start a phrase, ensure next guy is in our quote
if ( ! qw->m_ignorePhrase && i+1 < numWords &&
m_qwords[i+1].m_quoteStart >= 0 &&
m_qwords[i+1].m_quoteStart <= i )
qw->m_inQuotedPhrase = true;
// are we the first word in the quote?
if ( i-1>=0 && qw->m_quoteStart == i )
qw->m_inQuotedPhrase = true;
// ignore single words that are in a quoted phrase
if ( ! keepAllSingles && qw->m_inQuotedPhrase )
qw->m_ignoreWord = IGNORE_QUOTED;
// . get phrase info for this term
// . a pid (phraseId)of 0 indicates it does not start a phrase
// . raw phrase termId
uint64_t pid = 0LL;
		phrases.getMinWordsInPhrase(i,(int64_t *)&pid);
// store it
qw->m_rawPhraseId = pid;
// does word #i start a phrase?
if ( pid != 0 ) {
uint64_t ph = qw->m_prefixHash ;
// like we do it in XmlDoc.cpp's hashString()
if ( ph ) qw->m_bigramId = hash64 ( pid , ph );
else qw->m_bigramId = pid;
//calculate length of phrase(bigram) in bytes
int32_t numWordsInPhrase = phrases.getNumWordsInPhrase2(i);
int phraseLen = 0;
for(int j=i; j<i+numWordsInPhrase; j++)
phraseLen += m_qwords[j].m_wordLen;
qw->m_bigramLen = phraseLen;
// do not ignore the phrase, it's valid
qw->m_ignorePhrase = IGNORE_NO_IGNORE;
}
// . phrase sign is inherited from word's sign if it's a minus
// . word sign is inherited from field, quote or right before
// the word
// . that is, all words in -"to be or not" will have a '-' sign
// . phraseId may or may not be 0 at this point
if ( qw->m_wordSign == '-' ) qw->m_phraseSign = '-';
// . dist word signs to others in the same connected string
// . use "-cd-rom x-box" w/ no connector in between
// . test queries:
// . +cd-rom +x-box
// . -cd-rom +x-box
// . -m-o-n
// . who was the first (was is a query stop word)
// . www.xxx.com
// . welcome to har.com
// . hezekiah walker the love family affair ii live at radio
// city music hall
// . fotostudio +m-o-n-a-r-t
// . fotostudio -m-o-n-a-r-t
// . i'm home
if ( qw->m_leftConnected && qw->m_leftPhraseStart >= 0 )
qw->m_wordSign = m_qwords[i-2].m_wordSign;
// . if we connected to the alnum word on our right then
// soft require the phrase (i.e. treat like a single term)
// . example: cd-rom or www.xxx.com
// . 'welcome to har.com' should get a '*' for "har.com" sign
if ( qw->m_rightConnected ) {
if ( qw->m_wordSign) qw->m_phraseSign = qw->m_wordSign;
else qw->m_phraseSign = '*';
}
// . if we're in quotes then any phrase we have should be
// soft required (i.e. treated like a single term)
// . we do not allow phrases in queries to pair across
// quotes. See where we tweak the Bits class above.
if ( qw->m_quoteStart >= 0 ) {
qw->m_phraseSign = '*';
}
// . if we are the last word in a phrase that consists of all
// PLAIN stop words then make the phrase have a '*'
// . 'to be or not to be .. test' (cannot pair across "..")
// . don't use QUERY stop words cuz of "who was the first?" qry
if ( pid ) {
int32_t nw = phrases.getNumWordsInPhrase2(i);
int32_t j;
// search up to this far
int32_t maxj = i + nw;
// but not past our truncated limit
if ( maxj > ABS_MAX_QUERY_WORDS )
maxj = ABS_MAX_QUERY_WORDS;
for ( j = i ; j < maxj ; j++ ) {
// skip punct
if ( !m_tr[j].is_alfanum ) continue;
// break out if not a stop word
if ( ! bits.isStopWord(j) ) break;
// break out if has a term sign
if ( m_qwords[j].m_wordSign ) break;
}
// if everybody in phrase #i was a signless stopword
// and the phrase was signless, make it have a '*' sign
if ( j >= maxj && m_qwords[i].m_phraseSign == '\0' )
m_qwords[i].m_phraseSign = '*';
// . if a constituent has a - sign, then the whole
// phrase becomes negative, too
// . fixes 'apple -computer' truncation problem
for ( int32_t j = i ; j < maxj ; j++ )
if ( m_qwords[j].m_wordSign == '-' )
qw->m_phraseSign = '-';
}
// . ignore unsigned QUERY stop words that are not yet ignored
// and are in unignored phrases
// . 'who was the first taiwanese president' should not get
// "who was" term sign changed to '*' because "was" is a
// QUERY stop word. So ignore singles query stop words
// in phrases now
if ( //! keepAllSingles &&
(qw->m_isQueryStopWord && !m_isBoolean) &&
m_useQueryStopWords &&
! qw->m_fieldCode &&
// fix 'the tigers'
//(qw->m_leftPhraseStart >= 0 || qw->m_bigramId > 0 ) &&
! qw->m_wordSign &&
! qw->m_ignoreWord )
qw->m_ignoreWord = IGNORE_QSTOP;
// . ignore and/or between quoted phrases, save user from
// themselves (they meant AND/OR)
if ( ! keepAllSingles && qw->m_isQueryStopWord &&
! qw->m_fieldCode &&
m_useQueryStopWords &&
! qw->m_bigramId && ! qw->m_inQuotes &&
((qw->m_wordId == 255176654160863LL) ||
(qw->m_wordId == 46196171999655LL)) )
qw->m_ignoreWord = IGNORE_QSTOP;
// . ignore repeated single words and phrases
// . look at the old termIds for this, too
// . should ignore 2nd 'time' in 'time after time' then
// . but boolean queries often need to repeat terms
		// . NEW - words must be same sign and not in different
		//   quoted phrases to be ignored -partap
if ( ! m_isBoolean && !qw->m_ignoreWord ) {
for ( int32_t j = 0 ; j < i ; j++ ) {
if ( m_qwords[j].m_ignoreWord ) continue;
if ( m_qwords[j].m_wordId == qw->m_wordId &&
m_qwords[j].m_wordSign ==qw->m_wordSign &&
(!keepAllSingles ||
(m_qwords[j].m_quoteStart
== qw->m_quoteStart))){
qw->m_ignoreWord = IGNORE_REPEAT;
}
}
}
if ( ! m_isBoolean && !qw->m_ignorePhrase ) {
// ignore repeated phrases too!
for ( int32_t j = 0 ; j < i ; j++ ) {
if ( m_qwords[j].m_ignorePhrase ) continue;
if ( m_qwords[j].m_bigramId == qw->m_bigramId &&
m_qwords[j].m_phraseSign
== qw->m_phraseSign)
qw->m_ignorePhrase = IGNORE_REPEAT;
}
}
}
// . if we only have one quoted query then force its sign to be '+'
// . '"get the phrase" the' --> +"get the phrase" (last the is ignored)
// . "time enough for love" --> +"time enough" +"enough for love"
// . if all unignored words are in the same set of quotes then change
// all '*' (soft-required) phrase signs to '+'
for ( j= 0 ; j < numWords ; j++ ) {
if ( !m_tr[j].is_alfanum) continue;
if ( m_qwords[j].m_quoteStart < 0 ) break;
if ( m_qwords[j].m_ignoreWord ) continue;
if ( j < 2 ) continue;
if ( m_qwords[j-2].m_quoteStart != m_qwords[j].m_quoteStart )
break;
}
if ( j >= numWords ) {
for ( j= 0 ; j < numWords ; j++ ) {
if ( m_qwords[j].m_phraseSign == '*' )
m_qwords[j].m_phraseSign = '+';
}
}
// . force a plus on any site: or ip: query terms
// . also disable site clustering if we have either of these terms
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord ) continue;
if ( qw->m_wordSign ) continue;
if ( qw->m_fieldCode != FIELD_SITE &&
qw->m_fieldCode != FIELD_IP ) continue;
qw->m_wordSign = '+';
}
// . if one or more of a phrase's constituent terms exceeded
// term #MAX_QUERY_TERMS then we should also soft require that phrase
// . fixes 'hezekiah walker the love family affair ii live at
// radio city music hall'
// . how many non-ignored phrases?
int32_t count = 0;
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignorePhrase ) continue;
if ( ! qw->m_bigramId ) continue;
count++;
}
for ( int32_t i = 0 ; i < numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
// count non-ignored words
if ( qw->m_ignoreWord ) continue;
// if under limit, continue
if ( count++ < ABS_MAX_QUERY_TERMS ) continue;
// . otherwise, ignore
// . if we set this for our UOR'ed terms from SearchInput.cpp's
// UOR'ed facebook interests then it causes us to get no results!
// so make sure that MAX_QUERY_TERMS is big enough with respect to
// the opCount in SearchInput.cpp
qw->m_ignoreWord = IGNORE_BREECH;
// left phrase should get a '*'
int32_t left = qw->m_leftPhraseStart;
if ( left >= 0 && ! m_qwords[left].m_phraseSign )
m_qwords[left].m_phraseSign = '*';
// our phrase should get a '*'
if ( qw->m_bigramId && ! qw->m_phraseSign )
qw->m_phraseSign = '*';
}
// . fix the 'x -50a' query so it returns results
// . how many non-negative, non-ignored words/phrases do we have?
count = 0;
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
const QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord ) continue;
if ( qw->m_wordSign == '-' ) continue;
count++;
}
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
const QueryWord *qw = &m_qwords[i];
if ( qw->m_ignorePhrase ) continue;
if ( qw->m_phraseSign == '-' ) continue;
if ( qw->m_bigramId == 0LL ) continue;
count++;
}
// if everybody is ignored or negative UNignore first query stop word
if ( count == 0 ) {
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord != IGNORE_QSTOP ) continue;
qw->m_ignoreWord = IGNORE_NO_IGNORE;
count++;
break;
}
}
quoteStart = -1;
int32_t quoteEnd = -1;
	// set m_quoteEnd
for ( int32_t i = m_numWords - 1 ; i >= 0 ; i-- ) {
// get ith word
QueryWord *qw = &m_qwords[i];
// skip if ignored
if ( qw->m_ignoreWord ) continue;
// skip if not in quotes
if ( qw->m_quoteStart < 0 ) continue;
// if match previous guy...
if ( qw->m_quoteStart == quoteStart ) {
// inherit the end
qw->m_quoteEnd = quoteEnd;
// all done
continue;
}
// ok, we are the end then
quoteEnd = i;
quoteStart = qw->m_quoteStart;
}
int32_t wkid = 0;
int32_t upTo = -1;
//
// set the wiki phrase ids
//
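	// e.g. for the query 'new york hotels', if "new york" is a known wiki
	// title then those words get m_wikiPhraseId 1 while 'hotels' keeps 0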
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
// get ith word
QueryWord *qw = &m_qwords[i];
// in a phrase from before?
if ( i < upTo ) {
qw->m_wikiPhraseId = wkid;
continue;
}
// assume none
qw->m_wikiPhraseId = 0;
// skip if punct
if ( !m_tr[i].is_alfanum ) continue;
// get word
int32_t nwk ;
nwk = g_wiki.getNumWordsInWikiPhrase ( i , &m_tr );
// bail if none
if ( nwk <= 1 ) continue;
// inc it
wkid++;
// store it
qw->m_wikiPhraseId = wkid;
// set loop parm
upTo = i + nwk;
}
// consider terms strongly connected like wikipedia title phrases
for ( int32_t i = 0 ; i + 2 < m_numWords ; i++ ) {
// get ith word
QueryWord *qw1 = &m_qwords[i];
// must not already be in a wikiphrase
//if ( qw1->m_wikiPhraseId > 0 ) continue;
// what query word # is that?
int32_t qwn = qw1 - m_qwords;
// get the next alnum word after that
// assume its the last word in our bigram phrase
QueryWord *qw2 = &m_qwords[qwn+2];
// must be in same wikiphrase
if ( qw2->m_wikiPhraseId > 0 ) continue;
// if there is a strong connector like the . in 'dmoz.org'
// then consider it a wiki bigram too
if ( ! qw1->m_rightConnected ) continue;
if ( ! qw2->m_leftConnected ) continue;
// fix 'rdf.org.dumps' so org.dumps gets same
// wikiphraseid as rdf.org
int id;
if ( qw1->m_wikiPhraseId ) id = qw1->m_wikiPhraseId;
else id = ++wkid;
// store it
qw1->m_wikiPhraseId = id;
qw2->m_wikiPhraseId = id;
}
// all done
return true;
}
void Query::modifyQuery(DerivedScoringWeights *scoringWeights, const CollectionRec& cr, bool *doSiteClustering) {
logTrace(g_conf.m_logTraceQuery, "Query::modifyQuery: q='%s', modifyDomainLikeSearches=%s, modifyAPILikeSearches=%s", originalQuery(),cr.m_modifyDomainLikeSearches?"true":"false", cr.m_modifyAPILikeSearches?"true":"false");
logTrace(g_conf.m_logTraceQuery, " m_numWords = %d", m_numWords);
logTrace(g_conf.m_logTraceQuery, " m_numTerms = %d", m_numTerms);
if(cr.m_modifyDomainLikeSearches) {
bool looksLikeADomain = false;
// is it a domain in the form of domain.tld ?
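		// e.g. 'example.com' tokenizes into the 3 words "example", "."
		// and "com" (matching the m_numWords==3 pattern below);
		// 'www.example.com' gives 5 words and 'www.example.co.uk' 7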
if(m_numWords==3 &&
is_alnum_utf8_string(m_qwords[0].m_word,m_qwords[0].m_word+m_qwords[0].m_wordLen) &&
m_qwords[1].m_wordLen==1 && m_qwords[1].m_word[0]=='.' &&
is_alnum_utf8_string(m_qwords[2].m_word,m_qwords[2].m_word+m_qwords[2].m_wordLen))
looksLikeADomain = true;
// is it a domain in the form of host.domain.tld ?
if(m_numWords==5 &&
is_alnum_utf8_string(m_qwords[0].m_word,m_qwords[0].m_word+m_qwords[0].m_wordLen) &&
m_qwords[1].m_wordLen==1 && m_qwords[1].m_word[0]=='.' &&
is_alnum_utf8_string(m_qwords[2].m_word,m_qwords[2].m_word+m_qwords[2].m_wordLen) &&
m_qwords[3].m_wordLen==1 && m_qwords[3].m_word[0]=='.' &&
is_alnum_utf8_string(m_qwords[4].m_word,m_qwords[4].m_word+m_qwords[4].m_wordLen))
looksLikeADomain = true;
// is it a domain in the form of host.domain.tld1.tld2 ? (eg www.example.co.uk)
if(m_numWords==7 &&
is_alnum_utf8_string(m_qwords[0].m_word,m_qwords[0].m_word+m_qwords[0].m_wordLen) &&
m_qwords[1].m_wordLen==1 && m_qwords[1].m_word[0]=='.' &&
is_alnum_utf8_string(m_qwords[2].m_word,m_qwords[2].m_word+m_qwords[2].m_wordLen) &&
m_qwords[3].m_wordLen==1 && m_qwords[3].m_word[0]=='.' &&
is_alnum_utf8_string(m_qwords[4].m_word,m_qwords[4].m_word+m_qwords[4].m_wordLen) &&
m_qwords[5].m_wordLen==1 && m_qwords[5].m_word[0]=='.' &&
is_alnum_utf8_string(m_qwords[6].m_word,m_qwords[6].m_word+m_qwords[6].m_wordLen))
looksLikeADomain = true;
if(looksLikeADomain) {
if(!isTLD(m_qwords[m_numWords-1].m_word,m_qwords[m_numWords-1].m_wordLen) &&
!isTLD(m_qwords[m_numWords-3].m_word,m_qwords[m_numWords-3].m_wordLen+m_qwords[m_numWords-2].m_wordLen+m_qwords[m_numWords-1].m_wordLen))
looksLikeADomain = false; //nope - last component(s) isn't a known tld
}
if(looksLikeADomain) {
log(LOG_DEBUG, "query:Query '%s' looks like a domain", originalQuery());
//set all non-synonym terms as required and boost inUrl weight.
for(int i=0; i<m_numTerms; i++) {
if(!m_qterms[i].m_synonymOf && !m_qterms[i].m_ignored) {
m_qterms[i].m_isRequired = true;
m_qterms[i].m_rightPhraseTermNum = -1;
m_qterms[i].m_leftPhraseTermNum = -1;
m_qterms[i].m_rightPhraseTerm = NULL;
m_qterms[i].m_leftPhraseTerm = NULL;
}
}
if(isTLD(m_qwords[m_numWords-1].m_word,m_qwords[m_numWords-1].m_wordLen)) {
//The last term is marked non-required because the tld terms are normally not indexed (see XmlDoc::hashUrl() -> hashString() -> hashString3())
				//high-freq-terms and stopwords mean that the term may not have been generated, so look for it
for(int i=0; i<m_numTerms; i++) {
if(m_qterms[i].m_qword == &(m_qwords[m_numWords-1]) && !m_qterms[i].m_isPhrase)
m_qterms[i].m_isRequired = false;
}
}
scoringWeights->m_hashGroupWeights[HASHGROUP_INURL] *= 10; //factor 10 seems to work fine
if(cr.m_domainLikeSearchDisablesSiteCluster)
*doSiteClustering = false;
log(LOG_DEBUG, "query:Query modified");
traceTermsToLog("domain-like search terms");
return;
}
}
if(cr.m_modifyAPILikeSearches) {
bool looksLikeAnAPI = false;
//is it something like "file.open" or "file.open()" ?
		//todo: detect java packages like java.util.HashSet (but most java programmers probably have built-in help in their IDE so they would rarely use this)
if(m_numWords==3 &&
is_alnum_api_utf8_string(m_qwords[0].m_word,m_qwords[0].m_word+m_qwords[0].m_wordLen) &&
m_qwords[1].m_wordLen==1 && m_qwords[1].m_word[0]=='.' &&
is_alnum_api_utf8_string(m_qwords[2].m_word,m_qwords[2].m_word+m_qwords[2].m_wordLen))
looksLikeAnAPI = true;
if(m_numWords==4 &&
is_alnum_api_utf8_string(m_qwords[0].m_word,m_qwords[0].m_word+m_qwords[0].m_wordLen) &&
m_qwords[1].m_wordLen==1 && m_qwords[1].m_word[0]=='.' &&
is_alnum_api_utf8_string(m_qwords[2].m_word,m_qwords[2].m_word+m_qwords[2].m_wordLen) &&
m_qwords[3].m_wordLen==2 && m_qwords[3].m_word[0]=='(' && m_qwords[3].m_word[1]==')')
looksLikeAnAPI = true;
//or "file::open()"
if(m_numWords==3 &&
is_alnum_api_utf8_string(m_qwords[0].m_word,m_qwords[0].m_word+m_qwords[0].m_wordLen) &&
m_qwords[1].m_wordLen==2 && m_qwords[1].m_word[0]==':' && m_qwords[1].m_word[1]==':' &&
is_alnum_api_utf8_string(m_qwords[2].m_word,m_qwords[2].m_word+m_qwords[2].m_wordLen))
looksLikeAnAPI = true;
if(m_numWords==4 &&
is_alnum_api_utf8_string(m_qwords[0].m_word,m_qwords[0].m_word+m_qwords[0].m_wordLen) &&
m_qwords[1].m_wordLen==2 && m_qwords[1].m_word[0]==':' && m_qwords[1].m_word[1]==':' &&
is_alnum_api_utf8_string(m_qwords[2].m_word,m_qwords[2].m_word+m_qwords[2].m_wordLen) &&
m_qwords[3].m_wordLen==2 && m_qwords[3].m_word[0]=='(' && m_qwords[3].m_word[1]==')')
looksLikeAnAPI = true;
if(looksLikeAnAPI) {
log(LOG_DEBUG, "query:Query '%s' looks like an API or function call", originalQuery());
//set all non-synonym terms as required
for(int i=0; i<m_numTerms; i++) {
if(!m_qterms[i].m_synonymOf && !m_qterms[i].m_ignored) {
m_qterms[i].m_isRequired = true;
m_qterms[i].m_rightPhraseTermNum = -1;
m_qterms[i].m_leftPhraseTermNum = -1;
m_qterms[i].m_rightPhraseTerm = NULL;
m_qterms[i].m_leftPhraseTerm = NULL;
}
}
log(LOG_DEBUG, "query:Query modified");
traceTermsToLog("api-like search terms");
return;
}
}
log(LOG_DEBUG, "query: Query not modified");
}
// return -1 if it does not exist in the query, otherwise return the query word num
int32_t Query::getWordNum(int64_t wordId) const {
// skip if punct or whatever
if ( wordId == 0LL || wordId == -1LL ) return -1;
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
const QueryWord *qw = &m_qwords[i];
// the non-raw word id includes a hash with "0", which
// signifies an empty field term
if ( qw->m_rawWordId == wordId ) return i;
}
// otherwise, not found
return -1;
}
static HashTableX s_table;
static bool s_isInitialized = false;
static GbMutex s_tableMutex;
// 3rd field = m_hasColon
const struct QueryField g_fields[] = {
{"url",
FIELD_URL,
true,
"url:www.example.com/page.html",
"Matches the page with that exact url. Uses the first url, not "
"the url it redirects to, if any." ,
NULL,
0 },
{"ext",
FIELD_EXT,
true,
"ext:doc",
"Match documents whose url ends in the <i>.doc</i> file extension.",
NULL,
0 },
{"link",
FIELD_LINK,
true,
"link:www.example.com/foo.html",
"Matches all the documents that have a link to "
"http://www.example.com/foobar.html",
NULL,
0 },
{"sitelink",
FIELD_SITELINK,
true,
"sitelink:abc.foobar.com",
"Matches all documents that link to any page on the "
"<i>abc.foobar.com</i> site.",
NULL,
0 },
{"site",
FIELD_SITE,
true,
"site:example.com",
"Matches all documents on the example.com domain.",
NULL,
0 },
{"site",
FIELD_SITE,
true,
"site:www.example.com/dir1/dir2/",
"Matches all documents whose url starts with "
"www.example.com/dir1/dir2/",
NULL,
QTF_DUP },
{"sitenoindex",
FIELD_SITE,
true,
"sitenoindex:example.com",
"Matches all documents on the example.com domain that in not indexed.",
NULL,
0 },
{"ip",
FIELD_IP,
true,
"ip:192.0.2.1",
"Matches all documents whose IP is 192.0.2.1.",
NULL,
0 },
{"ip",
FIELD_IP,
true,
"ip:192.0.2",
"Matches all documents whose IP STARTS with 192.0.2.",
NULL,
QTF_DUP },
{"inurl",
FIELD_SUBURL,
true,
"inurl:dog",
"Matches all documents that have the word dog in their url, like "
"http://www.example.com/dog/food.html. However will not match "
"http://www.example.com/dogfood.html because it is not an "
"individual word. It must be delineated by punctuation.",
NULL,
0 },
{"suburl",
FIELD_SUBURL,
true,
"suburl:dog",
"Same as inurl.",
NULL,
0},
{"intitle",
FIELD_TITLE,
false,
"title:cat",
"Matches all the documents that have the word cat in their "
"title.",
NULL,
0 },
{"intitle",
FIELD_TITLE,
false,
"title:\"cat food\"",
"Matches all the documents that have the phrase \"cat food\" "
"in their title.",
NULL,
QTF_DUP },
{"title",
FIELD_TITLE,
false,
"title:cat",
"Same as intitle:",
NULL,
0},
{"type",
FIELD_TYPE,
false,
"type:json",
"Matches all documents that are in JSON format. "
"Other possible types include "
"<i>html, text, xml, pdf, doc, xls, ppt, ps, css, json, status.</i> "
"<i>status</i> matches special documents that are stored every time "
"a url is spidered so you can see all the spider attempts and when "
"they occurred as well as the outcome.",
NULL,
0},
{"filetype",
FIELD_TYPE,
false,
"filetype:json",
"Same as type: above.",
NULL,
0},
{"gblang",
FIELD_GBLANG,
false,
"gblang:de",
"Matches all documents in german. "
"The supported language abbreviations "
"are at the bottom of the <a href=\"/admin/filters\">url filters</a> "
"page. Some more "
"common ones are <i>gblang:en, gblang:es, gblang:fr, "
// need quotes for this one!!
"gblang:\"zh_cn\"</i> (note the quotes for zh_cn!).",
NULL,
0},
{"gbcountry",
FIELD_GBCOUNTRY,
false,
"gbcountry:us",
"Matches documents determined by Gigablast to be from the United "
"States. See the country abbreviations in the CountryCode.cpp "
"open source distribution. Some more popular examples include: "
"de, fr, uk, ca, cn.",
NULL,
0} ,
// mdw
{"gbdocid",
FIELD_GBDOCID,
false,
"gbdocid:123456",
"Matches the document with the docid 123456",
NULL,
0},
{"gbtermid",
FIELD_GBTERMID,
false,
"gbtermid:123456",
"Matches the documents for the term with termid 123456",
NULL,
0},
//
// for content type CT_STATUS documents (Spider status docs)
//
{"gbdocspiderdate",
FIELD_GENERIC,
false,
"gbdocspiderdate:1400081479",
"Matches documents that have "
"that spider date timestamp (UTC). "
//"Does not include the "
//"special spider status documents. "
"This is the time the document "
"completed downloading.",
"Date Related Query Operators",
QTF_BEGINNEWTABLE},
{"gbspiderdate",
FIELD_GENERIC,
false,
"gbspiderdate:1400081479",
"Like above.",
//, but DOES include the special spider status documents.",
NULL,
0},
{"gbdocindexdate",
FIELD_GENERIC,
false,
"gbdocindexdate:1400081479",
"Like above, but is the time the document was last indexed. "
"This time is "
"slightly greater than or equal to the spider date.",//Does not "
//"include the special spider status documents.",
NULL,
0},
{"gbindexdate",
FIELD_GENERIC,
false,
"gbindexdate:1400081479",
"Like above.",//, but it does include the special spider status "
//"documents.",
NULL,
0},
// they don't need to know about this
{"gbcontenthash", FIELD_GBCONTENTHASH, false,"","",NULL,QTF_HIDE}
};
void resetQuery ( ) {
s_table.reset();
}
int32_t getNumFieldCodes ( ) {
return (int32_t)sizeof(g_fields) / (int32_t)sizeof(QueryField);
}
static bool initFieldTable(){
ScopedLock sl(s_tableMutex);
if ( ! s_isInitialized ) {
// set up the hash table
if ( ! s_table.set ( 8 , 4 , 255,NULL,0,false,"qryfldtbl" ) ) {
log(LOG_WARN, "build: Could not init table of query fields.");
return false;
}
		// now add in all the field names
int32_t n = getNumFieldCodes();
for ( int32_t i = 0 ; i < n ; i++ ) {
// skip if dup
int64_t h = hash64b ( g_fields[i].text );
// if already in there it is a dup
if ( s_table.isInTable ( &h ) ) continue;
			// store the field index + 1 in the hash table as the score
if ( ! s_table.addTerm(h, i+1) ) return false;
}
s_isInitialized = true;
}
return true;
}
field_code_t getFieldCode(const char *s, int32_t len) {
if ( !initFieldTable() ) {
return FIELD_UNSET;
}
int64_t h = hash64Lower_a( s, len );
int32_t i = (int32_t) s_table.getScore(h);
if ( i == 0 ) {
return FIELD_UNSET;
}
return g_fields[i-1].field;
}
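// e.g. getFieldCode("title", 5) returns FIELD_TITLE, while an unrecognized
// name gets a zero score from the table and so returns FIELD_UNSET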
const char *getFieldCodeName(field_code_t fc) {
switch(fc) {
case FIELD_UNSET: return "unset";
case FIELD_URL: return "url";
case FIELD_LINK: return "link";
case FIELD_SITE: return "site";
case FIELD_IP: return "ip";
case FIELD_SUBURL: return "suburl";
case FIELD_TITLE: return "title";
case FIELD_TYPE: return "type";
case FIELD_EXT: return "ext";
case FIELD_LINKS: return "links";
case FIELD_SITELINK: return "sitelink";
case FIELD_GENERIC: return "generic";
case FIELD_GBLANG: return "gblang";
case FIELD_GBCOUNTRY: return "gbcountry";
case FIELD_GBTERMID: return "gbtermid";
case FIELD_GBDOCID: return "gbdocid";
case FIELD_GBCONTENTHASH: return "gbcontenthash";
case FIELD_GBFIELDMATCH: return "gbfieldmatch";
default: return NULL;
}
}
// guaranteed to be punctuation
bool Query::isConnection(unsigned i) const {
auto const &token = m_tr[i];
if(token.token_len==1) {
switch(*token.token_start) {
// . only allow apostrophe if it's NOT a 's
// . so contractions are ok, and names too
case '\'':
// no, i think we should require it. google seems to,
// and msn and yahoo do. 'john's room -"john's" gives
			// no results on yahoo and msn.
return true;
case ':': return true;
case '-': return true;
case '.': return true;
case '@': return true;
case '#': return true;
case '/': return true;
case '_': return true;
case '&': return true;
case '=': return true;
case '\\': return true;
default: return false;
}
}
//if ( len == 3 && s[0]==' ' && s[1]=='&' && s[2]==' ' ) return true;
if(token.token_len==3 &&
token.token_start[0]==':' && token.token_start[1]=='/' && token.token_start[2]=='/' )
return true;
return false;
}
void Query::dumpToLog() const
{
log(LOG_DEBUG, "Query:setQTerms: dumping %d query-words:", m_numWords);
for(int i=0; i<m_numWords; i++) {
const QueryWord &qw = m_qwords[i];
log(" qword #%d:",i);
log(" word='%*.*s'", (int)qw.m_wordLen, (int)qw.m_wordLen, qw.m_word);
log(" phrase='%*.*s'", (int)qw.m_bigramLen, (int)qw.m_bigramLen, qw.m_word);
log(" m_wordId=%" PRId64, qw.m_wordId);
log(" m_bigramId=%" PRId64, qw.m_bigramId);
if(qw.m_queryWordTerm)
log(" m_queryWordTerm= #%d", (int)(qw.m_queryWordTerm-m_qterms));
}
log("Query:setQTerms: dumping %d query-terms:", m_numTerms);
for(int i=0; i<m_numTerms; i++) {
const QueryTerm &qt = m_qterms[i];
log(" term #%d:",i);
log(" m_term='%*.*s'", (int)qt.m_termLen, (int)qt.m_termLen, qt.m_term);
log(" m_isPhrase=%s synonym=%s", qt.m_isPhrase?"true":"false", qt.m_synonymOf?"true":"false");
log(" m_termId=%" PRId64, qt.m_termId);
log(" m_rawTermId=%" PRId64, qt.m_rawTermId);
log(" m_isWikiHalfStopBigram=%s", qt.m_isWikiHalfStopBigram?"true":"false");
log(" m_leftPhraseTermNum=%d, m_leftPhraseTerm=%p", qt.m_leftPhraseTermNum, (void*)qt.m_leftPhraseTerm);
log(" m_rightPhraseTermNum=%d, m_rightPhraseTerm=%p", qt.m_rightPhraseTermNum, (void*)qt.m_rightPhraseTerm);
log(" m_rightPhraseTermNum=%d, m_rightPhraseTerm=%p", qt.m_rightPhraseTermNum, (void*)qt.m_rightPhraseTerm);
log(" m_rightPhraseTermNum=%d, m_rightPhraseTerm=%p", qt.m_rightPhraseTermNum, (void*)qt.m_rightPhraseTerm);
log(" m_termFreqWeight=%f m_termWeight=%f m_userWeight=%f", qt.m_termFreqWeight, qt.m_termWeight, qt.m_userWeight);
if(qt.m_synonymOf)
log(" m_synonymOf=#%d '%.*s'", (int)(qt.m_synonymOf-m_qterms), qt.m_synonymOf->m_termLen, qt.m_synonymOf->m_term);
}
}
void Query::traceTermsToLog(const char *header) {
logTrace(g_conf.m_logTraceQuery, "%s: %d queryterms:", header, m_numTerms);
for(int i=0; i<m_numTerms; i++) {
logTrace(g_conf.m_logTraceQuery, " query-term #%d: termid=%15" PRId64" '%*.*s', t-weight=%f u-weight=%f %s", i, m_qterms[i].m_termId, m_qterms[i].m_termLen,m_qterms[i].m_termLen,m_qterms[i].m_term, m_qterms[i].m_termWeight,m_qterms[i].m_userWeight, m_qterms[i].m_ignored?"ignored":"");
logTrace(g_conf.m_logTraceQuery, " qstopw=%s req=%s", m_qterms[i].m_isQueryStopWord?"true":"false", m_qterms[i].m_isRequired?"yes":"no");
}
}
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
////////// ONLY BOOLEAN STUFF BELOW HERE /////////////
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
// return false and set g_errno on error
// sets m_numWordsInExpression to how many words the expression spanned
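// e.g. in '(a OR (b AND c))' each '(' spawns a nested Expression via the
// recursive call below, and the '(' QueryWord's m_expressionPtr is pointed
// at it so that isTruth() can later recurse into the group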
bool Expression::addExpression (int32_t start,
int32_t end,
Query *q,
int32_t level
) {
if ( level >= MAX_EXPRESSIONS ) {
g_errno = ETOOMANYPARENS;
return false;
}
// the # of the first alnumpunct word in the expression
m_expressionStartWord = start;
m_q = q;
int32_t i = m_expressionStartWord;
// "start" is the current alnumpunct word we are parsing out
for ( ; i<end ; i++ ) {
QueryWord *qwords = q->m_qwords;
QueryWord * qw = &qwords[i];
// set leaf node if not an opcode like "AND" and not punct.
if ( qw->m_opcode==opcode_t::OP_NONE && qw->isAlphaWord()){
continue;
}
if (qw->m_opcode == opcode_t::OP_NOT) {
continue;
}
else if (qw->m_opcode == opcode_t::OP_LEFTPAREN ) {
// this is expression
// . it should advance "i" to end of expression
// point to next...
q->m_numExpressions++;
// make a new one:
Expression *e=&q->m_expressions[q->m_numExpressions-1];
// now set it
if ( ! e->addExpression ( i+1, // skip over (
end ,
q ,
level + 1) )
return false;
// skip over it. pt to ')'
i += e->m_numWordsInExpression;
qw->m_expressionPtr = e;
}
else if (qw->m_opcode == opcode_t::OP_RIGHTPAREN ) {
// return size i guess, include )
m_numWordsInExpression = i - m_expressionStartWord+1;
return true;
}
else if (qw->m_opcode!=opcode_t::OP_NONE) {
continue;
}
// white space?
}
m_numWordsInExpression = i - m_expressionStartWord;
return true;
}
// each bit is 1-1 with the explicit terms in the boolean query
bool Query::matchesBoolQuery(const unsigned char *bitVec, int32_t vecSize) const {
return m_expressions[0].isTruth ( bitVec , vecSize );
}
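// e.g. opBitNum 10 maps to byte 10/8 == 1 and mask 1<<(10%8) == 0x04,
// so the check below tests bit 2 of bitVec[1]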
static bool isBitNumSet(int32_t opBitNum, const unsigned char *bitVec, int32_t vecSize) {
int32_t byte = opBitNum / 8;
int32_t mask = 1<<(opBitNum % 8);
if ( byte >= vecSize ) { g_process.shutdownAbort(true); }
return bitVec[byte] & mask;
}
// . "bits" are 1-1 with the query words in Query::m_qwords[] array
//   including ignored words and spaces i guess since Expression::addExpression()
// seems to do that.
bool Expression::isTruth(const unsigned char *bitVec, int32_t vecSize) const {
//
// operand1 operand2 operator1 operand3 operator2 ....
//
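	// note: there is no AND/OR precedence handling here; operators are
	// folded into "result" in the order they appear, so parentheses
	// should be used when explicit grouping is intended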
// result: -1 means unknown at this point
int32_t result = -1;
opcode_t prevOpCode = opcode_t::OP_NONE;
	int32_t prevResult = -1;
// result of current operand
int32_t opResult = -1;
int32_t i = m_expressionStartWord;
int32_t iend = i + m_numWordsInExpression;
bool hasNot = false;
for ( ; i < iend ; i++ ) {
const QueryWord *qw = &m_q->m_qwords[i];
		// ignore parentheses, they aren't real opcodes.
// we just want OP_AND/OP_OR/OP_NOT
opcode_t opcode = qw->m_opcode;
if ( opcode != opcode_t::OP_AND &&
opcode != opcode_t::OP_OR &&
opcode != opcode_t::OP_NOT )
opcode = opcode_t::OP_NONE;
if ( opcode == opcode_t::OP_NOT ) {
hasNot = true;
continue;
}
// so operands are expressions as well
const Expression *e = (const Expression *)qw->m_expressionPtr;
if ( e ) {
// save prev one. -1 means no prev.
prevResult = opResult;
			// set new one
opResult = e->isTruth ( bitVec , vecSize );
// skip over that expression. point to ')'
i += e->m_numWordsInExpression;
// flip?
if ( hasNot ) {
if ( opResult == 1 ) opResult = 0;
else opResult = 1;
hasNot = false;
}
}
if ( opcode!=opcode_t::OP_NONE && ! e ) {
prevOpCode = opcode;//m_opSlots[i];
continue;
}
// simple operand
if ( opcode==opcode_t::OP_NONE && ! e ) {
// for regular word operands
// ignore it like a space?
if ( qw->m_ignoreWord ) continue;
// ignore gbsortby:offerprice in bool queries
// at least for evaluating them
if ( qw->m_ignoreWordInBoolQuery ) continue;
// save old one
prevResult = opResult;
// convert word to term #
const QueryTerm *qt = qw->m_queryWordTerm;
// fix title:"notre dame" AND NOT irish
if ( ! qt ) qt = qw->m_queryPhraseTerm;
if ( ! qt ) continue;
			// phrase terms are not required and therefore
			// do not have a valid qt->m_bitNum set, so don't core
if ( ! qt->m_isRequired ) continue;
// . m_bitNum is set in Posdb.cpp when it sets its
// QueryTermInfo array
// . it is basically the query term #
			// . see if that bit is set in this docid's vec
opResult = isBitNumSet ( qt->m_bitNum,bitVec,vecSize );
// flip?
if ( hasNot ) {
if ( opResult == 1 ) opResult = 0;
else opResult = 1;
hasNot = false;
}
}
// need two to tango. i.e. (true OR false)
if ( prevResult == -1 ) continue;
// if this is not the first time... we got two
if ( prevOpCode == opcode_t::OP_AND ) {
			// if the first operation we encounter is A AND B then
			// default result to on. only allow an AND operation
			// to turn it off.
if ( result == -1 ) result = 1;
if ( ! prevResult ) result = 0;
if ( ! opResult ) result = 0;
}
else if ( prevOpCode == opcode_t::OP_OR ) {
			// if the first operation we encounter is A OR B then
// default result to off
if ( result == -1 ) result = 0;
if ( prevResult ) result = 1;
if ( opResult ) result = 1;
}
}
// if we never set result, then it was probably a single
// argument expression like something in double parens like
// ((site:xyz.com OR site:abc.com)). so set it to value of
// first operand, opResult.
if ( prevOpCode == opcode_t::OP_NONE && result == -1 ) result = opResult;
if ( result == -1 ) return true;
if ( result == 0 ) return false;
return true;
}
// if any one query term is split, msg3a has to split the query
bool Query::isSplit() const {
for(int32_t i = 0; i < m_numTerms; i++)
if(m_qterms[i].isSplit()) return true;
return false;
}
void QueryTerm::constructor ( ) {
m_qword = NULL;
m_isPhrase = false;
m_termId = 0;
m_rawTermId = 0;
m_termSign = 0;
m_bitNum = 0;
m_term = NULL;
m_termLen = 0;
m_posdbListPtr = NULL;
m_langIdBits = 0;
m_langIdBitsValid = false;
m_termFreq = 0;
m_termFreqWeight = 0.0;
m_isQueryStopWord = false;
m_inQuotes = false;
m_termWeight = 0;
m_userWeight = 0;
m_userNotRequired = false;
m_piped = false;
m_ignored = false;
m_synonymOf = NULL;
m_synWids0 = 0;
m_synWids1 = 0;
m_numAlnumWordsInSynonym = 1;
m_fieldCode = FIELD_UNSET;
m_isRequired = false;
m_isWikiHalfStopBigram = false;
m_leftPhraseTermNum = 0;
m_rightPhraseTermNum = 0;
m_leftPhraseTerm = NULL;
m_rightPhraseTerm = NULL;
memset(m_startKey,0,sizeof(m_startKey));
memset(m_endKey,0,sizeof(m_endKey));
}
bool QueryTerm::isSplit() const {
if(!m_fieldCode) return true;
if(m_fieldCode == FIELD_GBCONTENTHASH) return false;
return true;
}
// hash of all the query terms
int64_t Query::getQueryHash() const {
int64_t qh = 0LL;
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
const QueryTerm *qt = &m_qterms[i];
qh = hash64 ( qt->m_termId , qh );
}
return qh;
}
void QueryWord::constructor () {
m_synWordBuf.constructor();
}
void QueryWord::destructor () {
m_synWordBuf.purge();
}
static int count_quotes(const char *s, size_t len) {
int count = 0;
while(len--)
if(*s++ == '\"')
count++;
return count;
}