mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-01-22 02:18:42 -05:00
3554 lines
118 KiB
C++
3554 lines
118 KiB
C++
#include "Query.h"
|
|
#include "Bits.h"
|
|
#include "Phrases.h"
|
|
#include "Url.h"
|
|
#include "Domains.h"
|
|
#include "Clusterdb.h" // g_clusterdb.getNumGlobalRecs()
|
|
#include "StopWords.h" // isQueryStopWord()
|
|
#include "Sections.h"
|
|
#include "Speller.h"
|
|
#include "Mem.h"
|
|
#include "Msg3a.h"
|
|
#include "HashTableX.h"
|
|
#include "Synonyms.h"
|
|
#include "HighFrequencyTermShortcuts.h"
|
|
#include "Wiki.h"
|
|
#include "ScoringWeights.h"
|
|
#include "RdbList.h"
|
|
#include "Process.h"
|
|
#include "Conf.h"
|
|
#include "termid_mask.h"
|
|
#include "Collectiondb.h"
|
|
#include "GbUtil.h"
|
|
#include <set>
|
|
#include "Lemma.h"
|
|
#include "Errno.h"
|
|
|
|
|
|
#include "GbMutex.h"
|
|
#include "ScopedLock.h"
|
|
|
|
static int count_quotes(const char *s, size_t len);
|
|
|
|
|
|
// Constructs an empty query object.
// The SafeBuf members are given allocation labels, the weights default to
// 1.0, and every scalar member is given a defined value before reset() is
// invoked to finish putting the object into its empty state.
Query::Query()
  : m_queryWordBuf("Query4"),
    m_tr(),
    m_filteredQuery("qrystk"),
    m_originalQuery("oqbuf"),
    m_bigramWeight(1.0),
    m_synonymWeight(1.0),
    m_word_variations_config()
{
	// reset() iterates over m_qwords[0..m_numWords), so both must be
	// defined before it is called below.
	m_qwords   = NULL;
	m_numWords = 0;
	// previously m_qwords was assigned a second time here and m_qterms
	// was only initialized as a side effect of reset()
	m_qterms   = NULL;
	m_numTerms = 0;

	// explicit initial values (keeps static analysis, e.g. Coverity, quiet)
	m_langId                 = langUnknown;
	m_useQueryStopWords      = false;
	m_allowHighFreqTermCache = false;
	m_numTermsUntruncated    = 0;
	m_isBoolean              = false;
	m_maxQueryTerms          = 0;

	memset(m_expressions, 0, sizeof(m_expressions));

	reset();
}
|
|
|
|
// Tears the query down by delegating to reset(), which runs the query word
// destructors and purges the buffers owned by this object.
Query::~Query()
{
	reset();
}
|
|
|
|
// Puts the object back into its empty state so it can be set() again.
// Runs the destructor of every query word, purges all buffers and clears
// the counters and the per-query flags.
void Query::reset()
{
	// The query words are torn down by hand: destructor() is called
	// explicitly on each one before the buffers are purged.
	for ( int32_t w = 0 ; w < m_numWords ; ++w )
		m_qwords[w].destructor();

	// query terms
	m_queryTermBuf.purge();
	m_qterms   = NULL;
	m_numTerms = 0;

	// tokenizer result
	m_tr.clear();

	// query text in its filtered and original form
	m_filteredQuery.purge();
	m_originalQuery.purge();

	// query words
	m_queryWordBuf.purge();
	m_qwords   = NULL;
	m_numWords = 0;

	m_docIdRestriction = 0LL;
	m_numExpressions   = 0;

	// the site: and ip: query terms will disable site clustering & caching
	m_hasPositiveSiteField = false;
	m_hasIpField           = false;
	m_hasUrlField          = false;
	m_hasSubUrlField       = false;

	m_truncated = false;
}
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . "query" must be NULL terminated
|
|
// . if boolFlag is 0 we ignore all boolean operators
|
|
// . if boolFlag is 1 we assume query is boolen
|
|
// . if boolFlag is 2 we attempt to detect if query is boolean or not
|
|
// . if "keepAllSingles" is true we do not ignore any single word UNLESS
|
|
// it is a boolean operator (IGNORE_BOOLOP), fieldname (IGNORE_FIELDNAME)
|
|
// a punct word (IGNORE_DEFAULT) or part of one field value (IGNORE_DEFAULT)
|
|
// This is used for term highlighting (Highlight.cpp and Summary.cpp)
|
|
bool Query::set(const char *query,
|
|
// need language for doing synonyms
|
|
lang_t langId,
|
|
float bigramWeight,
|
|
float synonymWeight,
|
|
const WordVariationsConfig *wordVariationsConfig,
|
|
bool useQueryStopWords,
|
|
bool allowHighFreqTermCache,
|
|
int32_t maxQueryTerms)
|
|
{
|
|
static const WordVariationsConfig defaultWordVariationsConfig;
|
|
if(!wordVariationsConfig)
|
|
wordVariationsConfig = &defaultWordVariationsConfig;
|
|
log(LOG_DEBUG,"query: set2(query='%s', langId=%d, wiktionaryWordVariations=%s, languageSpecificWordVariations=%s useQueryStopWords=%s maxQueryTerms=%d)",
|
|
query, (int)langId, wordVariationsConfig->m_wiktionaryWordVariations?"true":"false", wordVariationsConfig->m_languageSpecificWordVariations?"true":"false", useQueryStopWords?"true":"false", maxQueryTerms);
|
|
|
|
reset();
|
|
|
|
m_langId = langId;
|
|
m_useQueryStopWords = useQueryStopWords;
|
|
m_allowHighFreqTermCache = allowHighFreqTermCache;
|
|
|
|
// fix summary rerank and highlighting.
|
|
bool keepAllSingles = true;
|
|
|
|
m_maxQueryTerms = maxQueryTerms;
|
|
|
|
// assume boolean auto-detect.
|
|
char boolFlag = 2;
|
|
|
|
|
|
if ( ! query ) return true;
|
|
|
|
m_bigramWeight = bigramWeight;
|
|
m_synonymWeight = synonymWeight;
|
|
m_word_variations_config = *wordVariationsConfig;
|
|
|
|
int32_t queryLen = strlen(query);
|
|
|
|
// truncate query if too big
|
|
if ( queryLen >= ABS_MAX_QUERY_LEN ) {
|
|
log(LOG_WARN, "query: Query length of %" PRId32" must be less than %" PRId32". Truncating.",
|
|
queryLen,(int32_t)ABS_MAX_QUERY_LEN);
|
|
queryLen = ABS_MAX_QUERY_LEN - 1;
|
|
m_truncated = true;
|
|
}
|
|
// save original query
|
|
if( !m_originalQuery.reserve ( queryLen + 1 ) ) {
|
|
logError("Failed to reserve %" PRId32 " bytes, bailing", queryLen+1);
|
|
return true;
|
|
}
|
|
m_originalQuery.safeMemcpy(query, queryLen);
|
|
m_originalQuery.nullTerm();
|
|
|
|
const char *q = query;
|
|
// see if it should be boolean...
|
|
for ( int32_t i = 0 ; i < queryLen ; i++ ) {
|
|
// but if bool flag is 0 that means it is NOT boolean!
|
|
// it must be one for autodetection. so do not autodetect
|
|
// unless this is 2.
|
|
if ( boolFlag != 2 ) break;
|
|
if ( q[i]=='A' && q[i+1]=='N' && q[i+2]=='D' &&
|
|
(q[i+3]==' ' || q[i+3]=='(') )
|
|
boolFlag = 1;
|
|
if ( q[i]=='O' && q[i+1]=='R' &&
|
|
(q[i+2]==' ' || q[i+2]=='(') )
|
|
boolFlag = 1;
|
|
if ( q[i]=='N' && q[i+1]=='O' && q[i+2]=='T' &&
|
|
(q[i+3]==' ' || q[i+3]=='(') )
|
|
boolFlag = 1;
|
|
}
|
|
|
|
// if we did not set the flag to 1 set it to 0. force to non-bool
|
|
if ( boolFlag == 2 ) boolFlag = 0;
|
|
|
|
// reserve some space, guessing how much we'd need
|
|
int32_t need = queryLen * 2 + 32;
|
|
if ( ! m_filteredQuery.reserve ( need ) )
|
|
return false;
|
|
|
|
bool inQuotesFlag = false;
|
|
// . copy query into m_buf
|
|
// . translate ( and ) to special query operators so Words class
|
|
// can parse them as their own word to make parsing bool queries ez
|
|
// for parsing out the boolean operators in setBitScoresBoolean()
|
|
for ( int32_t i = 0 ; i < queryLen ; i++ ) {
|
|
|
|
// gotta count quotes! we ignore operators in quotes
|
|
// so you can search for diffbotUri:"article|0|123456"
|
|
if ( query[i] == '\"' ) inQuotesFlag = !inQuotesFlag;
|
|
|
|
if ( inQuotesFlag ) {
|
|
//*p = query [i];
|
|
//p++;
|
|
m_filteredQuery.pushChar(query[i]);
|
|
continue;
|
|
}
|
|
|
|
// translate ( and )
|
|
if ( boolFlag == 1 && query[i] == '(' ) {
|
|
m_filteredQuery.safeMemcpy ( " LeFtP " , 7 );
|
|
continue;
|
|
}
|
|
if ( boolFlag == 1 && query[i] == ')' ) {
|
|
m_filteredQuery.safeMemcpy ( " RiGhP " , 7 );
|
|
continue;
|
|
}
|
|
if ( query[i] == '|' ) {
|
|
m_filteredQuery.safeMemcpy ( " PiiPE " , 7 );
|
|
continue;
|
|
}
|
|
|
|
if(query[i] == '[') {
|
|
// translate [#w] [#p] [#s] [w] [p] [s] [nrw] to operators
|
|
char *endptr=NULL;
|
|
double val;
|
|
if(is_digit(query[i+1]))
|
|
val=strtod(query+i+1,&endptr);
|
|
if(endptr!=NULL && endptr!=query+1) {
|
|
size_t j = (size_t)(endptr-query);
|
|
if(query[j]=='w' && query[j+1]==']') {
|
|
m_filteredQuery.safePrintf(" LeFtB %f w RiGhB ", val);
|
|
i = j + 1;
|
|
continue;
|
|
} else if(query[j]=='p' && query[j+1]==']') {
|
|
m_filteredQuery.safePrintf(" LeFtB %f p RiGhB ", val);
|
|
i = j + 1;
|
|
continue;
|
|
} else if(query[j]=='s' && query[j+1]==']') {
|
|
m_filteredQuery.safePrintf(" LeFtB %f s RiGhB ", val);
|
|
i = j + 1;
|
|
continue;
|
|
}
|
|
} else if(query[i+1] == 'w' && query[i+2]==']') {
|
|
m_filteredQuery.safePrintf(" LeFtB w RiGhB ");
|
|
i = i + 2;
|
|
continue;
|
|
} else if(query[i+1] == 'p' && query[i+2]==']') {
|
|
m_filteredQuery.safePrintf(" LeFtB p RiGhB ");
|
|
i = i + 2;
|
|
continue;
|
|
} else if(query[i+1] == 's' && query[i+2]==']') {
|
|
m_filteredQuery.safePrintf(" LeFtB s RiGhB ");
|
|
i = i + 2;
|
|
continue;
|
|
} else if( i+4 < queryLen && query[i+1] == 'n' && query[i+2] == 'r' && query[i+3] == 'w' && query[i+4]==']') {
|
|
// user specified [nrw] before word, meaning treat it as not required
|
|
m_filteredQuery.safePrintf(" LeFtB nrw RiGhB ");
|
|
i = i + 4;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// TODO: copy altavista's operators here? & | !
|
|
// otherwise, just a plain copy
|
|
m_filteredQuery.pushChar ( query[i] );
|
|
}
|
|
// NULL terminate
|
|
m_filteredQuery.nullTerm();
|
|
if(m_filteredQuery.length() != queryLen || memcmp(m_filteredQuery.getBufStart(),query,queryLen)!=0)
|
|
log(LOG_INFO,"query: m_filteredQuery=%*.*s", m_filteredQuery.length(),m_filteredQuery.length(),m_filteredQuery.getBufStart());
|
|
|
|
Phrases phrases;
|
|
|
|
// set m_qwords[] array from m_buf
|
|
if ( ! setQWords(boolFlag, keepAllSingles, phrases) )
|
|
return false;
|
|
|
|
// set m_qterms from m_qwords, always succeeds
|
|
setQTerms();
|
|
|
|
// disable stuff for site:, ip: and url: queries
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
|
const QueryWord *qw = &m_qwords[i];
|
|
if ( qw->m_ignoreWord ) continue;
|
|
if ( qw->m_fieldCode == FIELD_SITE &&
|
|
qw->m_wordSign != '-' )
|
|
m_hasPositiveSiteField = true;
|
|
else if ( qw->m_fieldCode == FIELD_IP )
|
|
m_hasIpField = true;
|
|
else if ( qw->m_fieldCode == FIELD_URL )
|
|
m_hasUrlField = true;
|
|
else if ( qw->m_fieldCode == FIELD_SUBURL )
|
|
m_hasSubUrlField = true;
|
|
}
|
|
|
|
// set m_docIdRestriction if a term is gbdocid:
|
|
for ( int32_t i = 0 ; i < m_numTerms && ! m_isBoolean ; i++ ) {
|
|
// get it
|
|
QueryTerm *qt = &m_qterms[i];
|
|
|
|
if( qt->m_fieldCode == FIELD_GBTERMID ) {
|
|
const char *ds = m_qterms[i].m_term + 9; // strlen("gbtermid:")
|
|
qt->m_termId = atoll(ds);
|
|
}
|
|
|
|
// gbdocid:?
|
|
if ( qt->m_fieldCode != FIELD_GBDOCID ) continue;
|
|
// get docid
|
|
const char *ds = m_qterms[i].m_term + 8;
|
|
m_docIdRestriction = atoll(ds);
|
|
break;
|
|
}
|
|
|
|
// . keep it simple for now
|
|
// . we limit to MAX_EXRESSIONS to like 10 now i guess
|
|
if ( m_isBoolean ) {
|
|
m_numExpressions = 1;
|
|
if ( ! m_expressions[0].addExpression ( 0 ,
|
|
m_numWords ,
|
|
this , // Query
|
|
0 ) ) // level
|
|
// return false with g_errno set on error
|
|
return false;
|
|
}
|
|
|
|
|
|
log(LOG_DEBUG,"query: m_numWords=%d, m_numTerms=%d", m_numWords, m_numTerms);
|
|
|
|
// . if it is not truncated, no need to use hard counts
|
|
// . comment this line and the next one out for testing hard counts
|
|
if ( ! m_truncated ) return true;
|
|
// if they just hit the admin's ceiling, there's nothing we can do
|
|
if ( m_numTerms >= m_maxQueryTerms ) return true;
|
|
// a temp log message
|
|
log(LOG_DEBUG,"query: Encountered %" PRId32" query terms.",m_numTerms);
|
|
|
|
// otherwise, we're below m_maxQueryTerms BUT above MAX_QUERY_TERMS
|
|
// so we can use hard counts to get more power...
|
|
|
|
// . use the hard count for excessive query terms to save explicit bits
|
|
// . just look for operands on the first level that are not OR'ed
|
|
char redo = 0;
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
|
// get the ith word
|
|
QueryWord *qw = &m_qwords[i];
|
|
// stop at first OR on this level
|
|
if ( qw->m_opcode == opcode_t::OP_OR ) break;
|
|
// skip all punct
|
|
if ( qw->m_isPunct ) continue;
|
|
// if we are a boolean query,the next operator can NOT be OP_OR
|
|
// because we can not used terms that are involved in an OR
|
|
// as a hard count term, because they are not required terms
|
|
for ( int32_t j=i+1 ; m_isBoolean && j<m_numWords; j++ ) {
|
|
// stop at previous operator
|
|
opcode_t opcode = m_qwords[j].m_opcode;
|
|
if ( opcode == opcode_t::OP_NONE ) continue;
|
|
if ( opcode != opcode_t::OP_OR ) break;
|
|
// otherwise, the next operator is an OR, so do not
|
|
// use a hard count for this term
|
|
goto stop;
|
|
}
|
|
// mark it so we can reduce our number of explicit bits used
|
|
redo = 1;
|
|
}
|
|
|
|
stop:
|
|
// if nothing changed, return now
|
|
if ( ! redo ) return true;
|
|
|
|
// . set the query terms again if we have a long query
|
|
if ( ! setQTerms() )
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
// returns false and sets g_errno on error
|
|
bool Query::setQTerms() {
|
|
if(g_conf.m_logTraceQuery) {
|
|
logTrace(g_conf.m_logTraceQuery, "Query::setQTerms(words:%zu)", m_tr.size());
|
|
for(unsigned i=0; i<m_tr.size(); i++) {
|
|
logTrace(g_conf.m_logTraceQuery, " word #%u: '%*.*s'", i, (int)m_tr[i].token_len, (int)m_tr[i].token_len, m_tr[i].token_start);
|
|
int64_t phraseTermId = m_qwords[i].m_bigramId&TERMID_MASK;
|
|
int64_t wordTermId = m_qwords[i].m_wordId&TERMID_MASK;
|
|
logTrace(g_conf.m_logTraceQuery, " m_bigramId=%20" PRId64" (%15" PRId64"), m_ignorePhrase=%d m_bigramLen=%d", m_qwords[i].m_bigramId, phraseTermId, m_qwords[i].m_ignorePhrase, m_qwords[i].m_bigramLen);
|
|
logTrace(g_conf.m_logTraceQuery, " m_wordId =%20" PRId64" (%15" PRId64"), m_ignoreWord=%d, m_quoteStart=%d, m_quoteEnd=%d, fieldCode=%s, m_prefixHash=0x%lx", m_qwords[i].m_wordId, wordTermId, m_qwords[i].m_ignoreWord, m_qwords[i].m_quoteStart, m_qwords[i].m_quoteEnd, m_qwords[i].m_fieldCode?getFieldCodeName(m_qwords[i].m_fieldCode):"",m_qwords[i].m_prefixHash);
|
|
|
|
}
|
|
}
|
|
// . set m_qptrs/m_qtermIds/m_qbits
|
|
// . use one bit position for each phraseId and wordId
|
|
|
|
// count phrases first for allocating
|
|
//Removed: elaborate counting of possible bigrams. Done instead: this:
|
|
int numCandidatePhrases = m_numWords-1;
|
|
|
|
// count single terms
|
|
int numCandidateSingles = 0;
|
|
for ( int32_t i = 0 ; i < m_numWords; i++ ) {
|
|
const QueryWord *qw = &m_qwords[i];
|
|
if ( qw->m_ignoreWord &&
|
|
qw->m_ignoreWord != IGNORE_QSTOP) continue;
|
|
// ignore if in quotes and part of phrase, watch out
|
|
// for things like "word", a single word in quotes.
|
|
if ( qw->m_quoteStart >= 0 && qw->m_bigramId ) continue;
|
|
// if we are not start of quote and NOT in a phrase we
|
|
// must be the tailing word i guess.
|
|
// fixes '"john smith" -"bob dole"' from having
|
|
// smith and dole as query terms.
|
|
if ( qw->m_quoteStart >= 0 && qw->m_quoteStart != i )
|
|
continue;
|
|
// ignore if weight is absolute zero
|
|
if ( almostEqualFloat(qw->m_userWeightForWord,0) )
|
|
continue;
|
|
numCandidateSingles++;
|
|
}
|
|
// thirdly, count synonyms
|
|
int numCandidateSynonyms = 0;
|
|
Synonyms syn;
|
|
if(m_word_variations_config.m_wiktionaryWordVariations) {
|
|
int64_t to = hash64n("to");
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
|
// get query word
|
|
const QueryWord *qw = &m_qwords[i];
|
|
// skip if in quotes, we will not get synonyms for it
|
|
if ( qw->m_inQuotes ) continue;
|
|
// skip if has plus sign in front
|
|
if ( qw->m_wordSign == '+' ) continue;
|
|
// not '-' either i guess
|
|
if ( qw->m_wordSign == '-' ) continue;
|
|
// no url: stuff, maybe only title
|
|
if ( qw->m_fieldCode && qw->m_fieldCode != FIELD_TITLE )
|
|
continue;
|
|
// ignore title: etc. words, they are field names
|
|
if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) continue;
|
|
// ignore boolean operators
|
|
if ( qw->m_ignoreWord ) continue;// IGNORE_BOOLOP
|
|
// ignore if word weight is zero or synonym weight is zero
|
|
if(almostEqualFloat(qw->m_userWeightForWord,0))
|
|
continue;
|
|
if(almostEqualFloat(qw->m_userWeightForSynonym,0))
|
|
continue;
|
|
// no, hurts 'Greencastle IN economic development'
|
|
if ( qw->m_wordId == to ) continue;
|
|
// single letters...
|
|
if ( qw->m_wordLen == 1 ) continue;
|
|
// set the synonyms for this word
|
|
char tmpBuf [ TMPSYNBUFSIZE ];
|
|
int32_t naids = syn.getSynonyms ( &m_tr,
|
|
i ,
|
|
// language of the query.
|
|
// 0 means unknown. if this
|
|
// is 0 we sample synonyms
|
|
// from all languages.
|
|
m_langId ,
|
|
tmpBuf );
|
|
// if no synonyms, all done
|
|
if ( naids <= 0 ) continue;
|
|
numCandidateSynonyms += naids;
|
|
}
|
|
}
|
|
|
|
std::vector<std::string> wvg_source_words;
|
|
std::vector<int> wvg_source_word_index; //idx in wvg_source_words -> idx of queryword
|
|
if(m_word_variations_config.m_languageSpecificWordVariations) {
|
|
for(int i=0; i<m_numWords; i++) {
|
|
const QueryWord *qw = &m_qwords[i];
|
|
if(qw->m_inQuotes) continue;
|
|
if(qw->m_wordSign == '+') continue;
|
|
if(qw->m_wordSign == '-') continue;
|
|
if(qw->m_fieldCode && qw->m_fieldCode != FIELD_TITLE )
|
|
continue;
|
|
if(qw->m_ignoreWord == IGNORE_FIELDNAME) continue;
|
|
// ignore if word weight is zero or synonym weight is zero
|
|
if(almostEqualFloat(qw->m_userWeightForWord,0))
|
|
continue;
|
|
if(almostEqualFloat(qw->m_userWeightForSynonym,0))
|
|
continue;
|
|
wvg_source_words.emplace_back(qw->m_word,qw->m_wordLen);
|
|
wvg_source_word_index.emplace_back(i);
|
|
}
|
|
auto wvg(WordVariationGenerator::get_generator(m_langId));
|
|
m_wordVariations = wvg->query_variations(wvg_source_words, m_word_variations_config.m_word_variations_weights, m_word_variations_config.m_word_variations_threshold);
|
|
numCandidateSynonyms += m_wordVariations.size();
|
|
if(!m_wordVariations.empty())
|
|
logTrace(g_conf.m_logTraceQuery, "word variations produced %d variants", (int)m_wordVariations.size());
|
|
else
|
|
logTrace(g_conf.m_logTraceQuery, "word variations didn't produce any");
|
|
} else
|
|
m_wordVariations.clear();
|
|
if(g_conf.m_logTraceQuery) {
|
|
logTrace(g_conf.m_logTraceQuery, "m_wordVariations.size()=%zu", m_wordVariations.size());
|
|
for(unsigned i=0; i<m_wordVariations.size(); i++)
|
|
logTrace(g_conf.m_logTraceQuery, " variation #%u: %s weight=%f src=[%d..%d)", i, m_wordVariations[i].word.c_str(), m_wordVariations[i].weight, m_wordVariations[i].source_word_start, m_wordVariations[i].source_word_end);
|
|
}
|
|
|
|
if(m_word_variations_config.m_lemmaWordVariations)
|
|
numCandidateSynonyms += 10;
|
|
|
|
m_numTermsUntruncated = numCandidatePhrases+numCandidateSingles+numCandidateSynonyms;
|
|
logTrace(g_conf.m_logTraceQuery, "m_numTermsUntruncated=%d (%d phrases, %d singles, %d synonyms)", m_numTermsUntruncated, numCandidatePhrases, numCandidateSingles, numCandidateSynonyms);
|
|
const int numQueryTerms = std::min(std::min(m_numTermsUntruncated,m_maxQueryTerms),ABS_MAX_QUERY_TERMS);
|
|
if(numQueryTerms!=m_numTermsUntruncated)
|
|
log(LOG_DEBUG, "m_numTermsUntruncated=%d (%d phrases, %d singles, %d synonyms), will be truncated to %d terms for query '%s'",
|
|
m_numTermsUntruncated, numCandidatePhrases, numCandidateSingles, numCandidateSynonyms, numQueryTerms,
|
|
m_filteredQuery.getBufStart());
|
|
|
|
// allocate the term buffer
|
|
if(numQueryTerms) {
|
|
int32_t need = numQueryTerms * sizeof(QueryTerm);
|
|
if ( ! m_queryTermBuf.reserve ( need ) )
|
|
return false;
|
|
m_queryTermBuf.setLabel("stkbuf3");
|
|
const char *pp = m_queryTermBuf.getBufStart();
|
|
m_qterms = (QueryTerm *)pp;
|
|
}
|
|
|
|
// call constructor on each one here
|
|
for(int32_t i = 0; i < numQueryTerms; i++) {
|
|
QueryTerm *qt = &m_qterms[i];
|
|
qt->constructor();
|
|
}
|
|
|
|
|
|
int32_t n = 0;
|
|
|
|
// do phrase terms
|
|
for(int32_t i = 0; i<m_numWords && n<numQueryTerms; i++) {
|
|
if(numCandidateSingles+numCandidatePhrases > m_maxQueryTerms) {
|
|
//we won't have room for both phrases and singles. Put in as many singles as possible. But phrases
|
|
//must come first in the list due to bad assumptions elsewhere in the code.
|
|
if(numQueryTerms - n - 1 < numCandidateSingles)
|
|
break;
|
|
}
|
|
|
|
QueryWord *qw = &m_qwords[i];
|
|
// skip if ignored... mdw...
|
|
if ( ! qw->m_bigramId ) continue;
|
|
if ( qw->m_ignorePhrase ) continue; // could be a repeat
|
|
// none if weight is absolute zero
|
|
if ( almostEqualFloat(qw->m_userWeightForPhrase, 0) )
|
|
continue;
|
|
|
|
QueryTerm *qt = &m_qterms[n];
|
|
qt->m_qword = qw ;
|
|
qt->m_piped = qw->m_piped;
|
|
qt->m_isPhrase = true ;
|
|
qt->m_synonymOf = NULL;
|
|
qt->m_ignored = false;
|
|
qt->m_term = NULL;
|
|
qt->m_termLen = 0;
|
|
qt->m_langIdBitsValid = false;
|
|
qt->m_langIdBits = 0;
|
|
// stop word? no, we're a phrase term
|
|
qt->m_isQueryStopWord = false;
|
|
// change in both places
|
|
qt->m_termId = qw->m_bigramId & TERMID_MASK;
|
|
qt->m_rawTermId = qw->m_rawPhraseId;
|
|
// boolean queries are not allowed term signs for phrases
|
|
// UNLESS it is a '*' soft require sign which we need for
|
|
// phrases like: "cat dog" AND pig
|
|
if ( m_isBoolean && qw->m_phraseSign != '*' ) {
|
|
qt->m_termSign = '\0';
|
|
}
|
|
// if not boolean, ensure to change signs in both places
|
|
else {
|
|
qt->m_termSign = qw->m_phraseSign;
|
|
}
|
|
|
|
qw->m_queryWordTerm = NULL;
|
|
// IndexTable.cpp uses this one
|
|
qt->m_inQuotes = qw->m_inQuotes;
|
|
// point to the string itself that is the phrase
|
|
qt->m_term = qw->m_word;
|
|
qt->m_termLen = qw->m_bigramLen;
|
|
|
|
// the QueryWord should have a direct link to the QueryTerm,
|
|
// at least for phrase, so we can OR in the bits of its
|
|
// constituents in the for loop below
|
|
qw->m_queryPhraseTerm = qt ;
|
|
// assign score weight, we're a phrase here
|
|
qt->m_termWeight = m_bigramWeight;
|
|
qt->m_userWeight = qw->m_userWeightForPhrase ;
|
|
qt->m_fieldCode = qw->m_fieldCode;
|
|
|
|
// stuff before a pipe always has a weight of 1
|
|
if ( qt->m_piped ) {
|
|
qt->m_userWeight = 1;
|
|
}
|
|
n++;
|
|
}
|
|
|
|
// now if we have enough room, do the singles
|
|
for(int32_t i = 0; i < m_numWords && n<numQueryTerms; i++) {
|
|
QueryWord *qw = &m_qwords[i];
|
|
|
|
if ( qw->m_ignoreWord &&
|
|
qw->m_ignoreWord != IGNORE_QSTOP) continue;
|
|
|
|
// ignore if in quotes and part of phrase, watch out
|
|
// for things like "word", a single word in quotes.
|
|
if ( qw->m_quoteStart >= 0 && qw->m_bigramId ) continue;
|
|
|
|
// if we are not start of quote and NOT in a phrase we
|
|
// must be the tailing word i guess.
|
|
// fixes '"john smith" -"bob dole"' from having
|
|
// smith and dole as query terms.
|
|
if ( qw->m_quoteStart >= 0 && qw->m_quoteStart != i )
|
|
continue;
|
|
|
|
// ignore if weight is absolute zero
|
|
if ( almostEqualFloat(qw->m_userWeightForWord,0) )
|
|
continue;
|
|
|
|
QueryTerm *qt = &m_qterms[n];
|
|
qt->m_qword = qw ;
|
|
qt->m_piped = qw->m_piped;
|
|
qt->m_isPhrase = false ;
|
|
qt->m_synonymOf = NULL;
|
|
// ignore some synonym terms if tf is too low
|
|
qt->m_ignored = qw->m_ignoreWord;
|
|
// stop word? no, we're a phrase term
|
|
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
|
|
// change in both places
|
|
qt->m_termId = qw->m_wordId & TERMID_MASK;
|
|
qt->m_rawTermId = qw->m_rawWordId;
|
|
// boolean queries are not allowed term signs
|
|
if ( m_isBoolean ) {
|
|
qt->m_termSign = '\0';
|
|
// boolean fix for "health OR +sports" because
|
|
// the + there means exact word match, no synonyms.
|
|
if ( qw->m_wordSign == '+' ) {
|
|
qt->m_termSign = qw->m_wordSign;
|
|
}
|
|
}
|
|
// if not boolean, ensure to change signs in both places
|
|
else {
|
|
qt->m_termSign = qw->m_wordSign;
|
|
}
|
|
int32_t pw = i-1;
|
|
// . back up until word that contains quote if in a quoted
|
|
// phrase
|
|
// . UOR can only support two word phrases really...
|
|
if (m_qwords[i].m_quoteStart >= 0)
|
|
pw = m_qwords[i].m_quoteStart ;
|
|
if ( pw > 0 ) pw--;
|
|
|
|
// back two more if field
|
|
int32_t fieldStart=-1;
|
|
int32_t fieldLen=0;
|
|
|
|
if(pw == 0 && m_qwords[pw].m_ignoreWord==IGNORE_FIELDNAME)
|
|
fieldStart = pw;
|
|
|
|
if(pw > 0 && m_qwords[pw-1].m_ignoreWord==IGNORE_FIELDNAME) {
|
|
pw -= 1;
|
|
fieldStart = pw;
|
|
}
|
|
while(pw > 0 && m_qwords[pw].m_ignoreWord == IGNORE_FIELDNAME) {
|
|
pw--;
|
|
fieldStart = pw;
|
|
}
|
|
|
|
|
|
// skip if it is punct. fixes queries like
|
|
// "(this OR that)" from including '(' or from including
|
|
// a space.
|
|
if ( fieldStart >-1 &&
|
|
m_qwords[fieldStart].m_isPunct &&
|
|
fieldStart+1<m_numWords )
|
|
fieldStart++;
|
|
|
|
if (fieldStart > -1) {
|
|
pw = i;
|
|
while (pw < m_numWords && m_qwords[pw].m_fieldCode)
|
|
pw++;
|
|
|
|
fieldLen = m_qwords[pw-1].m_word +
|
|
m_qwords[pw-1].m_wordLen -
|
|
m_qwords[fieldStart].m_word;
|
|
}
|
|
qw->m_queryWordTerm = qt;
|
|
// IndexTable.cpp uses this one
|
|
qt->m_inQuotes = qw->m_inQuotes;
|
|
// point to the string itself that is the word
|
|
|
|
if (fieldLen > 0) {
|
|
qt->m_term = m_qwords[fieldStart].m_word;
|
|
qt->m_termLen = fieldLen;
|
|
// fix for query
|
|
// text:"" foo bar ""
|
|
if ( pw-1 < i ) {
|
|
log("query: bad query %s",m_originalQuery.getBufStart());
|
|
g_errno = EMALFORMEDQUERY;
|
|
return false;
|
|
}
|
|
// skip past the end of the field value
|
|
i = pw-1;
|
|
}
|
|
else {
|
|
qt->m_termLen = qw->m_wordLen;
|
|
qt->m_term = qw->m_word;
|
|
}
|
|
|
|
// assign score weight, we're a single-term here
|
|
qt->m_termWeight = 1.0;
|
|
qt->m_userWeight = qw->m_userWeightForWord;
|
|
qt->m_fieldCode = qw->m_fieldCode;
|
|
qt->m_userNotRequired = qw->m_userNotRequiredForWord;
|
|
|
|
// stuff before a pipe always has a weight of 1
|
|
if ( qt->m_piped ) {
|
|
qt->m_userWeight = 1;
|
|
}
|
|
n++;
|
|
}
|
|
|
|
// Handle shared explicit bits
|
|
for ( int32_t i = 0; i < n ; i++ ){
|
|
QueryTerm *qt = &m_qterms[i];
|
|
// assume not in a phrase
|
|
qt->m_rightPhraseTermNum = -1;
|
|
qt->m_leftPhraseTermNum = -1;
|
|
qt->m_rightPhraseTerm = NULL;
|
|
qt->m_leftPhraseTerm = NULL;
|
|
}
|
|
|
|
// . set m_inPhrase
|
|
for (int32_t i = 0; i < m_numWords ; i++ ) {
|
|
const QueryWord *qw = &m_qwords[i];
|
|
QueryTerm *qt = qw->m_queryWordTerm;
|
|
if (!qt) continue;
|
|
// set flag if in a a phrase, and set phrase term num
|
|
if ( qw->m_queryPhraseTerm ) {
|
|
QueryTerm *pt = qw->m_queryPhraseTerm;
|
|
qt->m_rightPhraseTermNum = pt - m_qterms;
|
|
qt->m_rightPhraseTerm = pt;
|
|
}
|
|
// if we're in the middle of the phrase
|
|
int32_t pn = qw->m_leftPhraseStart;
|
|
// convert word to its phrase QueryTerm ptr, if any
|
|
QueryTerm *tt = NULL;
|
|
if ( pn >= 0 ) tt = m_qwords[pn].m_queryPhraseTerm;
|
|
if ( tt ) {
|
|
qt->m_leftPhraseTermNum = tt - m_qterms;
|
|
qt->m_leftPhraseTerm = tt;
|
|
}
|
|
// . there might be some phrase term that actually contains
|
|
// the same word as we are, but a different occurence
|
|
// . like '"knowledge management" AND NOT management' query
|
|
// . made it from "j < i" into "j < m_numWords" because
|
|
// 'test "test bed"' was not working but '"test bed" test'
|
|
// was working.
|
|
for ( int32_t j = 0 ; j < m_numWords ; j++ ) {
|
|
// must be our same wordId (same word, different occ.)
|
|
const QueryWord *qw2 = &m_qwords[j];
|
|
if ( qw2->m_wordId != qw->m_wordId ) continue;
|
|
// get first word in the phrase that jth word is in
|
|
int32_t pn2 = qw2->m_leftPhraseStart;
|
|
// we might be the guy that starts it!
|
|
if ( pn2 < 0 && qw2->m_quoteStart != -1 ) pn2 = j;
|
|
// if neither is the case, skip this query word
|
|
if ( pn2 < 0 ) continue;
|
|
// he implies us!
|
|
QueryTerm *tt2 = m_qwords[pn2].m_queryPhraseTerm;
|
|
if ( tt2 ) {
|
|
qt->m_leftPhraseTermNum = tt2 - m_qterms;
|
|
qt->m_leftPhraseTerm = tt2;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
if(g_conf.m_logTraceQuery) {
|
|
logTrace(g_conf.m_logTraceQuery, "query-terms before word variations:");
|
|
for(int i=0; i<n; i++)
|
|
logTrace(g_conf.m_logTraceQuery, " query-term #%d: termid=%15" PRId64" '%*.*s'", i, m_qterms[i].m_termId, m_qterms[i].m_termLen,m_qterms[i].m_termLen,m_qterms[i].m_term);
|
|
}
|
|
|
|
////////////
|
|
//
|
|
// . add synonym query terms now
|
|
// . skip this part if language is unknown i guess
|
|
//
|
|
////////////
|
|
|
|
if(m_word_variations_config.m_wiktionaryWordVariations) {
|
|
int64_t to = hash64n("to");
|
|
for(int32_t i = 0; i<m_numWords && n<numQueryTerms; i++) {
|
|
// get query word
|
|
QueryWord *qw = &m_qwords[i];
|
|
// skip if in quotes, we will not get synonyms for it
|
|
if ( qw->m_inQuotes ) continue;
|
|
// skip if has plus sign in front
|
|
if ( qw->m_wordSign == '+' ) continue;
|
|
// not '-' either i guess
|
|
if ( qw->m_wordSign == '-' ) continue;
|
|
// no url: stuff, maybe only title
|
|
if ( qw->m_fieldCode && qw->m_fieldCode != FIELD_TITLE )
|
|
continue;
|
|
// skip if ignored like a stopword (stop to->too)
|
|
//if ( qw->m_ignoreWord ) continue;
|
|
// ignore title: etc. words, they are field names
|
|
if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) continue;
|
|
// ignore boolean operators
|
|
if ( qw->m_ignoreWord ) continue;// IGNORE_BOOLOP
|
|
// ignore if word weight is zero or synonym weight is zero
|
|
if(almostEqualFloat(qw->m_userWeightForWord,0))
|
|
continue;
|
|
if(almostEqualFloat(qw->m_userWeightForSynonym,0))
|
|
continue;
|
|
// no, hurts 'Greencastle IN economic development'
|
|
if ( qw->m_wordId == to ) continue;
|
|
// single letters...
|
|
if ( qw->m_wordLen == 1 ) continue;
|
|
// set the synonyms for this word
|
|
char tmpBuf [ TMPSYNBUFSIZE ];
|
|
int32_t naids = syn.getSynonyms ( &m_tr,
|
|
i ,
|
|
// language of the query.
|
|
// 0 means unknown. if this
|
|
// is 0 we sample synonyms
|
|
// from all languages.
|
|
m_langId ,
|
|
tmpBuf );
|
|
// if no synonyms, all done
|
|
if ( naids <= 0 ) continue;
|
|
// sanity
|
|
if ( naids > MAX_SYNS ) { g_process.shutdownAbort(true); }
|
|
// now make the buffer to hold them for us
|
|
qw->m_synWordBuf.setLabel("qswbuf");
|
|
qw->m_synWordBuf.safeMemcpy ( &syn.m_synWordBuf );
|
|
// get the term for this word
|
|
QueryTerm *origTerm = qw->m_queryWordTerm;
|
|
// loop over synonyms for word #i now
|
|
for(int32_t j = 0; j < naids && n<numQueryTerms; j++) {
|
|
// this happens for 'da da da'
|
|
if ( ! origTerm ) continue;
|
|
|
|
// add that query term
|
|
QueryTerm *qt = &m_qterms[n];
|
|
qt->m_qword = qw; // NULL;
|
|
qt->m_piped = qw->m_piped;
|
|
qt->m_isPhrase = false ;
|
|
qt->m_langIdBits = 0;
|
|
// synonym of this term...
|
|
qt->m_synonymOf = origTerm;
|
|
// nuke this crap since it was done above and we
|
|
// missed out!
|
|
qt->m_rightPhraseTermNum = -1;
|
|
qt->m_leftPhraseTermNum = -1;
|
|
qt->m_rightPhraseTerm = NULL;
|
|
qt->m_leftPhraseTerm = NULL;
|
|
// need this for displaying language of syn in
|
|
// the json/xml feed in PageResults.cpp
|
|
qt->m_langIdBitsValid = true;
|
|
int langId = syn.m_langIds[j];
|
|
uint64_t langBit = (uint64_t)1 << langId;
|
|
if ( langId >= 64 ) langBit = 0;
|
|
qt->m_langIdBits |= langBit;
|
|
// need this for Matches.cpp
|
|
qt->m_synWids0 = syn.m_wids0[j];
|
|
qt->m_synWids1 = syn.m_wids1[j];
|
|
int32_t na = syn.m_numAlnumWords[j];
|
|
// how many words were in the base we used to
|
|
// get the synonym. i.e. if the base is "new jersey"
|
|
// then it's 2! and the synonym "nj" has one alnum
|
|
// word.
|
|
int32_t ba = syn.m_numAlnumWordsInBase[j];
|
|
qt->m_numAlnumWordsInSynonym = na;
|
|
|
|
// crap, "nj" is a synonym of the PHRASE TERM
|
|
// bigram "new jersey" not of the single word term
|
|
// "new" so fix that.
|
|
if ( ba == 2 && origTerm->m_rightPhraseTerm )
|
|
qt->m_synonymOf = origTerm->m_rightPhraseTerm;
|
|
|
|
// ignore some synonym terms if tf is too low
|
|
qt->m_ignored = qw->m_ignoreWord;
|
|
// stop word? no, we're a phrase term
|
|
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
|
|
// change in both places
|
|
int64_t wid = syn.m_aids[j];
|
|
// might be in a title: field or something
|
|
if ( qw->m_prefixHash ) {
|
|
int64_t ph = qw->m_prefixHash;
|
|
wid= hash64h(wid,ph);
|
|
}
|
|
qt->m_termId = wid & TERMID_MASK;
|
|
qt->m_rawTermId = syn.m_aids[j];
|
|
// boolean queries are not allowed term signs
|
|
if ( m_isBoolean ) {
|
|
qt->m_termSign = '\0';
|
|
// boolean fix for "health OR +sports" because
|
|
// the + there means exact word match, no syns
|
|
if ( qw->m_wordSign == '+' ) {
|
|
qt->m_termSign = qw->m_wordSign;
|
|
}
|
|
}
|
|
// if not bool, ensure to change signs in both places
|
|
else {
|
|
qt->m_termSign = qw->m_wordSign;
|
|
}
|
|
// IndexTable.cpp uses this one
|
|
qt->m_inQuotes = qw->m_inQuotes;
|
|
// usually this is right
|
|
const char *ptr = syn.m_termPtrs[j];
|
|
// buf if it is NULL that means we transformed the
|
|
// word by like removing accent marks and stored
|
|
// it in m_synWordBuf, as opposed to just pointing
|
|
// to a line in memory of wiktionary-buf.txt.
|
|
if ( ! ptr ) {
|
|
int32_t off = syn.m_termOffs[j];
|
|
if ( off < 0 ) {
|
|
g_process.shutdownAbort(true); }
|
|
if ( off > qw->m_synWordBuf.length() ) {
|
|
g_process.shutdownAbort(true); }
|
|
// use QueryWord::m_synWordBuf which should
|
|
// be persistent and not disappear like
|
|
// syn.m_synWordBuf.
|
|
ptr = qw->m_synWordBuf.getBufStart() + off;
|
|
}
|
|
// point to the string itself that is the word
|
|
qt->m_term = ptr;
|
|
qt->m_termLen = syn.m_termLens[j];
|
|
// assign score weight, we're a synonym here
|
|
qt->m_termWeight = m_synonymWeight;
|
|
qt->m_userWeight = qw->m_userWeightForSynonym;
|
|
qt->m_fieldCode = qw->m_fieldCode;
|
|
|
|
// stuff before a pipe always has a weight of 1
|
|
if ( qt->m_piped ) {
|
|
qt->m_userWeight = 1;
|
|
}
|
|
// otherwise, add it
|
|
n++;
|
|
}
|
|
}
|
|
}
|
|
|
|
if(m_word_variations_config.m_languageSpecificWordVariations) {
|
|
logTrace(g_conf.m_logTraceQuery, "Word variations: %zu", m_wordVariations.size());
|
|
for(unsigned i=0; i<m_wordVariations.size() && n<numQueryTerms; i++) {
|
|
auto const &word_variation(m_wordVariations[i]);
|
|
int wordStartIdx = wvg_source_word_index[word_variation.source_word_start];
|
|
int wordEndIdx = wvg_source_word_index[word_variation.source_word_end-1];
|
|
logTrace(g_conf.m_logTraceQuery, " Word variation #%u: '%s' weight=%f src=[%u..%u]", i, word_variation.word.c_str(), word_variation.weight, wordStartIdx, wordEndIdx);
|
|
QueryWord *qw = &m_qwords[wordStartIdx];
|
|
if((unsigned)qw->m_wordLen==word_variation.word.length() &&
|
|
memcmp(qw->m_word, word_variation.word.data(), word_variation.word.length())==0)
|
|
{
|
|
//Variation is the same as the base word. The word-variation-plugin is allowed to produce that.
|
|
continue; //skip
|
|
}
|
|
QueryTerm *origTerm = qw->m_queryWordTerm;
|
|
|
|
//handle if the word variant is a bigram/phrase
|
|
bool isPhrase = false;
|
|
if(wordEndIdx-wordStartIdx>1) {
|
|
logTrace(g_conf.m_logTraceQuery, "Word variation '%s' spans more than 1 word", word_variation.word.c_str());
|
|
if(wordEndIdx-wordStartIdx==2) {
|
|
//find bigram pointing to first word
|
|
QueryTerm *bigramQueryTerm = NULL;
|
|
for(int j=0; j<n && !bigramQueryTerm; j++) {
|
|
if(m_qterms[j].m_qword==qw && m_qterms[j].m_isPhrase)
|
|
bigramQueryTerm = &m_qterms[j];
|
|
}
|
|
if(bigramQueryTerm) {
|
|
logTrace(g_conf.m_logTraceQuery, "Word variation covers '%.*s'", bigramQueryTerm->m_termLen, bigramQueryTerm->m_term);
|
|
origTerm = bigramQueryTerm;
|
|
isPhrase = true;
|
|
} else
|
|
log(LOG_LOGIC,"Word variation '%s' bigram/phrase didn't find base bigram", word_variation.word.c_str());
|
|
} else {
|
|
log(LOG_LOGIC,"Word variation '%s' spans more than 2 words. This is not supported (yet)", word_variation.word.c_str());
|
|
}
|
|
}
|
|
|
|
// add that query term
|
|
QueryTerm *qt = &m_qterms[n];
|
|
qt->m_qword = qw; // NULL;
|
|
qt->m_piped = qw->m_piped;
|
|
qt->m_isPhrase = isPhrase;
|
|
qt->m_langIdBits = 0;
|
|
// synonym of this term...
|
|
qt->m_synonymOf = origTerm;
|
|
// nuke this crap since it was done above and we
|
|
// missed out!
|
|
qt->m_rightPhraseTermNum = -1;
|
|
qt->m_leftPhraseTermNum = -1;
|
|
qt->m_rightPhraseTerm = NULL;
|
|
qt->m_leftPhraseTerm = NULL;
|
|
// need this for displaying language of syn in
|
|
// the json/xml feed in PageResults.cpp
|
|
qt->m_langIdBitsValid = true;
|
|
//int langId = syn.m_langIds[j]; //syn-todo?
|
|
//uint64_t langBit = (uint64_t)1 << langId; //syn-todo?
|
|
//if(langId >= 64) langBit = 0; //syn-todo?
|
|
//qt->m_langIdBits |= langBit; //syn-todo?
|
|
// need this for Matches.cpp
|
|
qt->m_synWids0 = 0;
|
|
qt->m_synWids1 = 0;
|
|
qt->m_numAlnumWordsInSynonym = 0;
|
|
|
|
// ignore some synonym terms if tf is too low
|
|
qt->m_ignored = qw->m_ignoreWord;
|
|
// stop word? no, we're a phrase term
|
|
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
|
|
// change in both places
|
|
//int64_t wid = syn.m_aids[j];
|
|
int64_t wid = hash64Lower_utf8_nospaces(word_variation.word.data(), word_variation.word.length());
|
|
// might be in a title: field or something
|
|
if(qw->m_prefixHash) {
|
|
int64_t ph = qw->m_prefixHash;
|
|
wid= hash64h(wid,ph);
|
|
}
|
|
qt->m_termId = wid & TERMID_MASK;
|
|
//qt->m_rawTermId = syn.m_aids[j]; //syn-todo?
|
|
// boolean queries are not allowed term signs
|
|
if(m_isBoolean) {
|
|
qt->m_termSign = '\0';
|
|
// boolean fix for "health OR +sports" because
|
|
// the + there means exact word match, no syns
|
|
if(qw->m_wordSign == '+') {
|
|
qt->m_termSign = qw->m_wordSign;
|
|
}
|
|
}
|
|
// if not bool, ensure to change signs in both places
|
|
else {
|
|
qt->m_termSign = qw->m_wordSign;
|
|
}
|
|
// IndexTable.cpp uses this one
|
|
qt->m_inQuotes = qw->m_inQuotes;
|
|
// point to the string itself that is the word
|
|
qt->m_term = word_variation.word.data();
|
|
qt->m_termLen = word_variation.word.length();
|
|
// assign score weight
|
|
qt->m_termWeight = word_variation.weight;
|
|
qt->m_userWeight = qw->m_userWeightForSynonym;
|
|
qt->m_fieldCode = qw->m_fieldCode ;
|
|
// stuff before a pipe always has a weight of 1
|
|
if(qt->m_piped) {
|
|
qt->m_userWeight = 1;
|
|
}
|
|
// otherwise, add it
|
|
n++;
|
|
}
|
|
}
|
|
|
|
if(m_word_variations_config.m_lemmaWordVariations && m_langId==langDanish) {
|
|
logTrace(g_conf.m_logTraceQuery, "Lexicon-based lemma synonyms");
|
|
for(int32_t i = 0; i<m_numWords && n<numQueryTerms; i++) {
|
|
if(!m_tr[i].is_alfanum)
|
|
continue;
|
|
std::string w(m_tr[i].token_start,m_tr[i].token_len);
|
|
logTrace(g_conf.m_logTraceQuery, "Checking lemma for '%s'", w.c_str());
|
|
auto le = lemma_lexicon->lookup(w);
|
|
if(!le) {
|
|
//Not found as-is in lexicon. Try lowercase in case it is a capitalized word
|
|
char lowercase_word[128];
|
|
if(w.size()<sizeof(lowercase_word)) {
|
|
size_t sz = to_lower_utf8(lowercase_word,lowercase_word+sizeof(lowercase_word), w.data(), w.data()+w.size());
|
|
lowercase_word[sz] = '\0';
|
|
if(sz!=w.size() || memcmp(w.data(),lowercase_word,w.size())!=0) {
|
|
le = lemma_lexicon->lookup(lowercase_word);
|
|
}
|
|
}
|
|
}
|
|
if(!le) {
|
|
//Not found as-is in lexicon. Try capitalized in case it is a lowercase or uppercase word
|
|
char capitalized_word[128];
|
|
if(w.size()<sizeof(capitalized_word)) {
|
|
size_t sz = to_capitalized_utf8(capitalized_word,capitalized_word+sizeof(capitalized_word), w.data(), w.data()+w.size());
|
|
capitalized_word[sz] = '\0';
|
|
if(sz!=w.size() || memcmp(w.data(),capitalized_word,w.size())!=0) {
|
|
w = capitalized_word;
|
|
le = lemma_lexicon->lookup(w);
|
|
}
|
|
}
|
|
}
|
|
if(!le) {
|
|
//Not found as-is in lexicon. Try uppercasing it
|
|
char uppercase_word[128];
|
|
if(w.size()<sizeof(uppercase_word)) {
|
|
size_t sz = to_upper_utf8(uppercase_word,uppercase_word+sizeof(uppercase_word), w.data(), w.data()+w.size());
|
|
uppercase_word[sz] = '\0';
|
|
if(sz!=w.size() || memcmp(w.data(),uppercase_word,w.size())!=0) {
|
|
w = uppercase_word;
|
|
le = lemma_lexicon->lookup(w);
|
|
}
|
|
}
|
|
}
|
|
if(!le)
|
|
continue; //unknown word
|
|
auto wf = le->find_base_wordform();
|
|
if(!wf)
|
|
continue;
|
|
if(wf->written_form_length==w.size() && memcmp(wf->written_form,w.data(),w.size())==0)
|
|
continue; //already base form)
|
|
logTrace(g_conf.m_logTraceQuery, "Generating synonym from lemma: %s -> %.*s", w.c_str(), wf->written_form_length,wf->written_form);
|
|
|
|
QueryWord *qw = &m_qwords[i];
|
|
QueryTerm *origTerm = qw->m_queryWordTerm;
|
|
|
|
// add that query term
|
|
QueryTerm *qt = &m_qterms[n];
|
|
qt->m_qword = qw; // NULL;
|
|
qt->m_piped = qw->m_piped;
|
|
qt->m_isPhrase = false;
|
|
qt->m_langIdBits = 0;
|
|
// synonym of this term...
|
|
qt->m_synonymOf = origTerm;
|
|
// nuke this crap since it was done above and we
|
|
// missed out!
|
|
qt->m_rightPhraseTermNum = -1;
|
|
qt->m_leftPhraseTermNum = -1;
|
|
qt->m_rightPhraseTerm = NULL;
|
|
qt->m_leftPhraseTerm = NULL;
|
|
// need this for displaying language of syn in
|
|
// the json/xml feed in PageResults.cpp
|
|
qt->m_langIdBitsValid = true;
|
|
//int langId = syn.m_langIds[j]; //syn-todo?
|
|
//uint64_t langBit = (uint64_t)1 << langId; //syn-todo?
|
|
//if(langId >= 64) langBit = 0; //syn-todo?
|
|
//qt->m_langIdBits |= langBit; //syn-todo?
|
|
// need this for Matches.cpp
|
|
qt->m_synWids0 = 0;
|
|
qt->m_synWids1 = 0;
|
|
qt->m_numAlnumWordsInSynonym = 0;
|
|
|
|
// ignore some synonym terms if tf is too low
|
|
qt->m_ignored = qw->m_ignoreWord;
|
|
// stop word? no, we're a phrase term
|
|
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
|
|
// change in both places
|
|
//int64_t wid = syn.m_aids[j];
|
|
int64_t wid = hash64Lower_utf8_nospaces(wf->written_form,wf->written_form_length);
|
|
// might be in a title: field or something
|
|
if(qw->m_prefixHash) {
|
|
int64_t ph = qw->m_prefixHash;
|
|
wid= hash64h(wid,ph);
|
|
}
|
|
qt->m_termId = wid & TERMID_MASK;
|
|
//qt->m_rawTermId = syn.m_aids[j]; //syn-todo?
|
|
// boolean queries are not allowed term signs
|
|
if(m_isBoolean) {
|
|
qt->m_termSign = '\0';
|
|
// boolean fix for "health OR +sports" because
|
|
// the + there means exact word match, no syns
|
|
if(qw->m_wordSign == '+') {
|
|
qt->m_termSign = qw->m_wordSign;
|
|
}
|
|
}
|
|
// if not bool, ensure to change signs in both places
|
|
else {
|
|
qt->m_termSign = qw->m_wordSign;
|
|
}
|
|
// IndexTable.cpp uses this one
|
|
qt->m_inQuotes = qw->m_inQuotes;
|
|
// point to the string itself that is the word
|
|
qt->m_term = wf->written_form;
|
|
qt->m_termLen = wf->written_form_length;
|
|
// assign score weight
|
|
qt->m_termWeight = m_synonymWeight;
|
|
qt->m_userWeight = qw->m_userWeightForSynonym;
|
|
qt->m_fieldCode = qw->m_fieldCode ;
|
|
// stuff before a pipe always has a weight of 1
|
|
if(qt->m_piped) {
|
|
qt->m_userWeight = 1;
|
|
}
|
|
// otherwise, add it
|
|
n++;
|
|
}
|
|
}
|
|
|
|
//Merge duplicated synonyms.
|
|
//If one of the above synonym-generations produced the same word (eg. from wiktionary, word-variations and as lemma) then we want to use
|
|
//the one with highest weight
|
|
for(int i=0; i<n; i++) {
|
|
if(m_qterms[i].m_synonymOf) {
|
|
//it's a synonym. Are there other synonyms on the same base word with the same form? If so then merge/delete
|
|
for(int j=i+1; j<n; ) {
|
|
if(m_qterms[j].m_synonymOf == m_qterms[i].m_synonymOf &&
|
|
m_qterms[j].m_termLen == m_qterms[i].m_termLen &&
|
|
memcmp(m_qterms[j].m_term,m_qterms[i].m_term,m_qterms[j].m_termLen)==0)
|
|
{
|
|
//Identical synonyms of same base word
|
|
//note: direct memcmp() test. Downside is that we don't eliminate uppercase/lowercase duplicates, but neither
|
|
//do we get into trouble with German eszet, Lithuanian i, ligatures, titlecase, etc.
|
|
logTrace(g_conf.m_logTraceQuery, "merging identical synonyms '%.*s' for word '%.*s'", m_qterms[i].m_termLen,m_qterms[i].m_term, m_qterms[i].m_synonymOf->m_termLen,m_qterms[i].m_synonymOf->m_term);
|
|
m_qterms[i].m_termWeight = std::max(m_qterms[i].m_termWeight,m_qterms[j].m_termWeight);
|
|
m_qterms[i].m_userWeight = std::max(m_qterms[i].m_userWeight,m_qterms[j].m_userWeight);
|
|
memmove(m_qterms+j, m_qterms+j+1, sizeof(m_qterms[0])*(n-j-1));
|
|
n--;
|
|
} else
|
|
j++;
|
|
}
|
|
}
|
|
}
|
|
|
|
m_numTerms = n;
|
|
|
|
if ( n > ABS_MAX_QUERY_TERMS ) { g_process.shutdownAbort(true); }
|
|
|
|
// . if only have one term and it is a signless phrase, make it signed
|
|
// . don't forget to set m_termSigns too!
|
|
if ( n == 1 && m_qterms[0].m_isPhrase && ! m_qterms[0].m_termSign ) {
|
|
m_qterms[0].m_termSign = '*';
|
|
}
|
|
|
|
// . now set m_phrasePart for Summary.cpp's hackfix filter
|
|
// . only set this for the non-phrase terms, since keepAllSingles is
|
|
// set to true when setting the Query for Summary.cpp::set in order
|
|
// to match the singles
|
|
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
|
|
// skip cd-rom too, if not in quotes
|
|
if ( ! m_qterms[i].m_inQuotes ) continue;
|
|
// is next term also in a quoted phrase?
|
|
if ( i - 1 < 0 ) continue;
|
|
//if ( ! m_qterms[i+1].m_isPhrase ) continue;
|
|
if ( ! m_qterms[i-1].m_inQuotes ) continue;
|
|
// are we in the same quoted phrase?
|
|
if ( m_qterms[i+0].m_qword->m_quoteStart !=
|
|
m_qterms[i-1].m_qword->m_quoteStart ) continue;
|
|
}
|
|
|
|
// if we have '+test -test':
|
|
//if ( negativeBits & requiredBits )
|
|
// m_numTerms = 0;
|
|
|
|
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
|
|
QueryTerm *qt = &m_qterms[i];
|
|
// assume not required
|
|
qt->m_isRequired = false;
|
|
// skip signless phrases
|
|
if ( qt->m_isPhrase && qt->m_termSign == '\0' ) continue;
|
|
if ( qt->m_isPhrase && qt->m_termSign == '*' ) continue;
|
|
if ( qt->m_synonymOf ) continue;
|
|
// IGNORE_QSTOP?
|
|
if ( qt->m_ignored ) continue;
|
|
|
|
// user specified "[nrw]" before word
|
|
if( qt->m_userNotRequired) continue;
|
|
|
|
// mark it
|
|
qt->m_isRequired = true;
|
|
}
|
|
|
|
|
|
//If there are two highfreqterms in a row then PosdbTable will ignore the bigram of them because it can't tie the bigram to any required term.
|
|
//Example: "key west mystery writers fest" where "key" and "west" are highfreqterms, and therefore postdbtable will ignore the bigram "key+west".
|
|
//Options:
|
|
// 1: do nothing
|
|
// 2: mark the bigram as ignored
|
|
// 3: mark the bigram as required
|
|
// 4: rewrite Query and the queryterminfo handling in posdbtable so the bigram can be optional.
|
|
//We do (3) because it will likely filter out more bad results than good results. This is a hack because marking a bigram as required normally
|
|
//requires us to be sure those two words are connected, but for "key west" we are guessing.
|
|
//TODO: rewrite Query+PosdbTable so qword/qterm can be optional, etc.
|
|
//The structure of qwords+qterms makes this code unnecessarily clumsy
|
|
for(int i=0; i+2<m_numWords; i++) {
|
|
if(m_qwords[i ].m_ignoreWord==IGNORE_HIGHFREMTERM &&
|
|
m_qwords[i+2].m_ignoreWord==IGNORE_HIGHFREMTERM)
|
|
{
|
|
if(m_qwords[i].m_queryPhraseTerm && m_qwords[i].m_queryPhraseTerm->m_isPhrase) {
|
|
logTrace(g_conf.m_logTraceQuery, "query-words #%d (%.*s) and #%d (%.*s) are both high-freq-terms. Marking bigram as required",
|
|
i, m_qwords[i].m_wordLen, m_qwords[i].m_word,
|
|
i+2, m_qwords[i+2].m_wordLen, m_qwords[i+2].m_word);
|
|
m_qwords[i].m_queryPhraseTerm->m_isRequired = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
//workaround/hack for double-highfreqterm searches, such as "of a" or "the the" or "the who"
|
|
if(m_numWords==3 &&
|
|
m_qwords[0].m_ignoreWord==IGNORE_HIGHFREMTERM &&
|
|
m_qwords[2].m_ignoreWord==IGNORE_HIGHFREMTERM &&
|
|
m_numTerms==1 &&
|
|
!m_qterms[0].m_isRequired)
|
|
{
|
|
log(LOG_DEBUG, "query: Looks like a highfreqterm-highfreqterm query type. Requiring one-and-only QueryTerm/bigram");
|
|
m_qterms[0].m_isRequired = true;
|
|
//todo: we should investigate if QueryTerm::m_isRequired actually has any effect. It is used
|
|
//in a single place in PosdbTable for not generating a QueryTermInfo, but it appears it works
|
|
//fine even with the QTI.
|
|
}
|
|
|
|
//if all words are high-freq-terms then we have to mark the generated bigrams as required, otherwise PosdbTable.cpp gets unhappy and
|
|
//logs "no required terms in query!"
|
|
bool allAlnumWordsAreIgnored = true;
|
|
for(int i=0; i<m_numWords; i++) {
|
|
if(is_alnum_utf8_string(m_qwords[i].m_word,m_qwords[i].m_word+m_qwords[i].m_wordLen) &&
|
|
(m_qwords[i].m_ignoreWord!=IGNORE_HIGHFREMTERM && m_qwords[i].m_ignoreWord!=IGNORE_QSTOP))
|
|
allAlnumWordsAreIgnored = false;
|
|
}
|
|
if(allAlnumWordsAreIgnored) {
|
|
log(LOG_DEBUG, "query: all alfanum-terms are ignored (highfreq/qstop). Marking bigrams as required");
|
|
for(int i=0; i<m_numTerms; i++) {
|
|
if(m_qterms[i].m_isPhrase)
|
|
m_qterms[i].m_isRequired = true;
|
|
}
|
|
}
|
|
|
|
// required quoted phrase terms
|
|
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
|
|
QueryTerm *qt = &m_qterms[i];
|
|
// quoted phrase?
|
|
if ( ! qt->m_isPhrase ) continue;
|
|
if ( ! qt->m_inQuotes ) continue;
|
|
// mark it
|
|
qt->m_isRequired = true;
|
|
}
|
|
|
|
|
|
// . for query 'to be or not to be shakespeare'
|
|
// require 'tobe' 'beor' 'tobe' because
|
|
// they are bigrams in the wikipedia phrase 'to be or not to be'
|
|
// and they all consist solely of query stop words. as of
|
|
// 8/20/2012 i took 'not' off the query stop word list.
|
|
// . require bigrams that consist of 2 query stop words and
|
|
// are in a wikipedia phrase. set termSign to '+' i guess?
|
|
// . for 'in the nick' , a wiki phrase, make "in the" required
|
|
// and give a big bonus for "the nick" below.
|
|
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
|
|
QueryTerm *qt = &m_qterms[i];
|
|
// don't require if negative
|
|
if ( qt->m_termSign == '-' ) continue;
|
|
// only check bigrams here
|
|
if ( ! qt->m_isPhrase ) continue;
|
|
// get the query word that starts this phrase
|
|
const QueryWord *qw1 = qt->m_qword;
|
|
// must be in a wikiphrase
|
|
if ( qw1->m_wikiPhraseId <= 0 ) continue;
|
|
// what query word # is that?
|
|
int32_t qwn = qw1 - m_qwords;
|
|
// get the next alnum word after that
|
|
// assume its the last word in our bigram phrase
|
|
const QueryWord *qw2 = &m_qwords[qwn+2];
|
|
// must be in same wikiphrase
|
|
if ( qw2->m_wikiPhraseId != qw1->m_wikiPhraseId ) continue;
|
|
// must be two stop words
|
|
if ( ! qw1->m_isQueryStopWord ) continue;
|
|
if ( ! qw2->m_isQueryStopWord ) continue;
|
|
// mark it
|
|
qt->m_isRequired = true;
|
|
}
|
|
|
|
// . for query 'to be or not to be shakespeare'
|
|
// give big bonus for 'ornot' and 'notto' bigram terms because
|
|
// the single terms 'or' and 'to' are ignored and because
|
|
// 'to be or not to be' is a wikipedia phrase
|
|
// . on 8/20/2012 i took 'not' off the query stop word list.
|
|
// . now give a big bonus for bigrams whose two terms are in the
|
|
// same wikipedia phrase and one and only one of the terms in
|
|
// the bigram is a query stop word
|
|
// . in general 'ornot' is considered a "synonym" of 'not' and
|
|
// gets hit with a .90 score factor, but that should never
|
|
// happen, it should be 1.00 and in this special case it should
|
|
// be 1.20
|
|
// . so for 'time enough for love' the phrase term "enough for"
|
|
// gets its m_isWikiHalfStopBigram set AND that phrase term
|
|
// is a synonym term of the single word term "enough" and is treated
|
|
// as such in the Posdb.cpp logic.
|
|
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
|
|
QueryTerm *qt = &m_qterms[i];
|
|
// assume not!
|
|
qt->m_isWikiHalfStopBigram = false;
|
|
// don't require if negative
|
|
if ( qt->m_termSign == '-' ) continue;
|
|
// only check bigrams here
|
|
if ( ! qt->m_isPhrase ) continue;
|
|
// get the query word that starts this phrase
|
|
const QueryWord *qw1 = qt->m_qword;
|
|
// must be in a wikiphrase
|
|
if ( qw1->m_wikiPhraseId <= 0 ) continue;
|
|
// what query word # is that?
|
|
int32_t qwn = qw1 - m_qwords;
|
|
// get the next alnum word after that
|
|
// assume its the last word in our bigram phrase
|
|
const QueryWord *qw2 = &m_qwords[qwn+2];
|
|
// must be in same wikiphrase
|
|
if ( qw2->m_wikiPhraseId != qw1->m_wikiPhraseId ) continue;
|
|
// if both query stop words, should have been handled above
|
|
// we need one to be a query stop word and the other not
|
|
// for this algo
|
|
if ( qw1->m_isQueryStopWord && qw2->m_isQueryStopWord )
|
|
continue;
|
|
// skip if neither is a query stop word
|
|
if ( ! qw1->m_isQueryStopWord&& ! qw2->m_isQueryStopWord )
|
|
continue;
|
|
// one must be a stop word i guess
|
|
// so for 'the time machine' we do not count 'time machine'
|
|
// as a halfstopwikibigram
|
|
if ( ! qw1->m_isQueryStopWord && ! qw2->m_isQueryStopWord )
|
|
continue;
|
|
|
|
// special flag
|
|
qt->m_isWikiHalfStopBigram = true;
|
|
}
|
|
|
|
if(g_conf.m_logTraceQuery)
|
|
traceTermsToLog("final query-terms");
|
|
|
|
return true;
|
|
}
|
|
|
|
bool Query::setQWords ( char boolFlag ,
|
|
bool keepAllSingles ,
|
|
Phrases &phrases ) {
|
|
|
|
// . break query up into Words and phrases
|
|
// . because we now deal with boolean queries, we make parentheses
|
|
// their own separate Word, so tell "words" we're setting a query
|
|
plain_tokenizer_phase_1(m_filteredQuery.getBufStart(), m_filteredQuery.length(), &m_tr);
|
|
calculate_tokens_hashes(&m_tr);
|
|
|
|
//hackety-hack...
|
|
//The tokenizer phase 2 also recognizes "C++" and "john's", but we cannot use phase 2 because Phrases and Query are
|
|
//incompatible with phase-2 tokens (too many assumptions about strictly increasing positions and contiguous memory layout)
|
|
//So instead we implement special cases here, until we have time to fix the whole Query class.
|
|
for(size_t i=0; i+1<m_tr.size(); i++) {
|
|
//Hack for C++
|
|
if(m_tr[i].is_alfanum && !m_tr[i+1].is_alfanum &&
|
|
m_tr[i].token_len==1 && (m_tr[i].token_start[0]=='c' || m_tr[i].token_start[0]=='C') &&
|
|
m_tr[i+1].token_len>=2 && memcmp(m_tr[i+1].token_start,"++",2)==0)
|
|
{
|
|
m_tr[i].token_len += 2;
|
|
m_tr[i].end_pos += 2;
|
|
m_tr[i+1].start_pos += 2;
|
|
m_tr[i+1].token_start += 2;
|
|
m_tr[i+1].token_len -= 2;
|
|
if(m_tr[i+1].token_len==0)
|
|
m_tr.tokens.erase(m_tr.tokens.begin()+i+1);
|
|
continue;
|
|
}
|
|
//Hack for F#
|
|
if(m_tr[i].is_alfanum && !m_tr[i+1].is_alfanum &&
|
|
m_tr[i].token_len==1 && (m_tr[i].token_start[0]=='f' || m_tr[i].token_start[0]=='F') &&
|
|
m_tr[i+1].token_len>=1 && memcmp(m_tr[i+1].token_start,"#",1)==0)
|
|
{
|
|
m_tr[i].token_len += 1;
|
|
m_tr[i].end_pos += 1;
|
|
m_tr[i+1].start_pos += 1;
|
|
m_tr[i+1].token_start += 1;
|
|
m_tr[i+1].token_len -= 1;
|
|
if(m_tr[i+1].token_len==0)
|
|
m_tr.tokens.erase(m_tr.tokens.begin()+i+1);
|
|
continue;
|
|
}
|
|
//Hack for C#
|
|
if(m_tr[i].is_alfanum && !m_tr[i+1].is_alfanum &&
|
|
m_tr[i].token_len==1 && (m_tr[i].token_start[0]=='c' || m_tr[i].token_start[0]=='C') &&
|
|
m_tr[i+1].token_len>=1 && memcmp(m_tr[i+1].token_start,"#",1)==0)
|
|
{
|
|
m_tr[i].token_len += 1;
|
|
m_tr[i].end_pos += 1;
|
|
m_tr[i+1].start_pos += 1;
|
|
m_tr[i+1].token_start += 1;
|
|
m_tr[i+1].token_len -= 1;
|
|
if(m_tr[i+1].token_len==0)
|
|
m_tr.tokens.erase(m_tr.tokens.begin()+i+1);
|
|
continue;
|
|
}
|
|
//Hack for A*
|
|
if(m_tr[i].is_alfanum && !m_tr[i+1].is_alfanum &&
|
|
m_tr[i].token_len==1 && m_tr[i].token_start[0]=='A' &&
|
|
m_tr[i+1].token_len>=1 && memcmp(m_tr[i+1].token_start,"*",1)==0)
|
|
{
|
|
m_tr[i].token_len += 1;
|
|
m_tr[i].end_pos += 1;
|
|
m_tr[i+1].start_pos += 1;
|
|
m_tr[i+1].token_start += 1;
|
|
m_tr[i+1].token_len -= 1;
|
|
if(m_tr[i+1].token_len==0)
|
|
m_tr.tokens.erase(m_tr.tokens.begin()+i+1);
|
|
continue;
|
|
}
|
|
//Hack for possessive-apostrophe (no need for extra codepoint checks - people usually don't type them in a search field)
|
|
if(i+2<m_tr.size() &&
|
|
m_tr[i].is_alfanum && !m_tr[i+1].is_alfanum && m_tr[i+2].is_alfanum &&
|
|
m_tr[i+1].token_len==1 && (m_tr[i+1].token_start[0]=='\'' || m_tr[i+1].token_start[0]=='`') &&
|
|
m_tr[i+2].token_len==1 && (m_tr[i+2].token_start[0]=='s' || m_tr[i+2].token_start[0]=='S'))
|
|
{
|
|
m_tr[i].end_pos = m_tr[i+2].end_pos;
|
|
m_tr[i].token_len += m_tr[i+1].token_len + m_tr[i+2].token_len;
|
|
m_tr.tokens.erase(m_tr.tokens.begin()+i+1,m_tr.tokens.begin()+i+3);
|
|
continue;
|
|
}
|
|
}
|
|
for(size_t i=0; i+2<m_tr.size(); ) {
|
|
const auto &t0 = m_tr[i+0];
|
|
const auto &t1 = m_tr[i+1];
|
|
const auto &t2 = m_tr[i+2];
|
|
if(t0.token_end()==t1.token_start && t1.token_end()==t2.token_start &&
|
|
is_slash_abbreviation(t0.token_start, t0.token_len+t1.token_len+t2.token_len))
|
|
{
|
|
size_t sl = t0.token_len+t2.token_len;
|
|
char *s = (char*)m_tr.egstack.alloc(sl);
|
|
memcpy(s, t0.token_start, t0.token_len);
|
|
memcpy(s+t0.token_len, t2.token_start, t2.token_len);
|
|
m_tr.tokens.emplace_back(t0.start_pos, t2.end_pos, s,sl, false, true);
|
|
m_tr.tokens.erase(m_tr.tokens.begin()+i, m_tr.tokens.begin()+i+3);
|
|
} else
|
|
i++;
|
|
}
|
|
|
|
int32_t numWords = m_tr.size();
|
|
// truncate it
|
|
if ( numWords > ABS_MAX_QUERY_WORDS ) {
|
|
log("query: Had %" PRId32" words. Max is %" PRId32". Truncating.",
|
|
numWords,(int32_t)ABS_MAX_QUERY_WORDS);
|
|
numWords = ABS_MAX_QUERY_WORDS;
|
|
m_truncated = true;
|
|
}
|
|
m_numWords = numWords;
|
|
// alloc the mem if we need to (mdw left off here)
|
|
int32_t need = m_numWords * sizeof(QueryWord);
|
|
// sanity check
|
|
if ( m_qwords ) { g_process.shutdownAbort(true); }
|
|
// point m_qwords to our generic buffer if it will fit
|
|
if(!m_queryWordBuf.reserve(need)) {
|
|
log(LOG_WARN, "query: Could not allocate mem for query.");
|
|
return false;
|
|
}
|
|
m_qwords = (QueryWord *)m_queryWordBuf.getBufStart();
|
|
// reset safebuf in there
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ )
|
|
m_qwords[i].constructor();
|
|
|
|
// is all alpha chars in query in upper case? caps lock on?
|
|
bool allUpper = true;
|
|
const char *p = m_filteredQuery.getBufStart();
|
|
const char *pend = m_filteredQuery.getBufPtr();
|
|
for ( ; p < pend ; p += getUtf8CharSize(p) )
|
|
if ( is_alpha_utf8 ( p ) && ! is_upper_utf8 ( p ) ) {
|
|
allUpper = false; break; }
|
|
|
|
// . come back here from below when we detect dat query is not boolean
|
|
// . we need to redo the bits cuz they may have been messed with below
|
|
// redo:
|
|
// field code we are in
|
|
field_code_t fieldCode = FIELD_UNSET;
|
|
char fieldSign = 0;
|
|
const char *field = NULL;
|
|
int32_t fieldLen = 0;
|
|
// keep track of the start of different chunks of quotes
|
|
int32_t quoteStart = -1;
|
|
bool inQuotes = false;
|
|
//bool inVQuotes = false;
|
|
char quoteSign = 0;
|
|
// the current little sign
|
|
char wordSign = 0;
|
|
// when reading first word in link: ... field we skip the following
|
|
// words until we hit a space because we hash them all together
|
|
bool ignoreTilSpace = false;
|
|
// assume we're NOT a boolean query
|
|
m_isBoolean = false;
|
|
// used to not respect the bool operator if it is the first word
|
|
bool firstWord = true;
|
|
|
|
// the query processing is broken into 3 stages.
|
|
|
|
// . STAGE #1
|
|
// . reset all query words to default
|
|
// set all m_ignoreWord and m_ignorePhrase to IGNORE_DEFAULT
|
|
// . set m_isFieldName, m_fieldCode and m_quoteStart for query words.
|
|
// no field names in quotes. +title:"hey there".
|
|
// set m_quoteStart to -1 if not in quotes.
|
|
// . if quotes immediately follow field code's ':' then distribute
|
|
// the field code to all words in the quotes
|
|
// . distribute +/- signs across quotes and fields to m_wordSigns.
|
|
// support -title:"hey there".
|
|
// . set m_quoteStart to -1 if only one alnum word is
|
|
// in quotes, what's the point of that?
|
|
// . set boolean op codes (m_opcode). cannot be in quotes.
|
|
// cannot have a field code. cannot have a word sign (+/-).
|
|
// . set m_wordId of FIELD_LINK, _URL, _SITE, _IP fields.
|
|
// m_wordId of first should be hash of the whole field value.
|
|
// only set its m_ignoreWord to 0, keep it's m_ignorePhrase to DEF.
|
|
// . set m_ignore of non-op codes, non-fieldname, alnum words to 0.
|
|
// . set m_wordId of each non-ignored alnum word.
|
|
|
|
// . STAGE #2
|
|
// . customize Bits class:
|
|
// first alnum word can start phrase.
|
|
// first alnum word in quotes (m_quoteStart >= 0 ) can start phrase.
|
|
// connected on the right but not on the left.. can start phrase.
|
|
// no pair across any double quote
|
|
// no pair across ".." --- UNLESS in quotes!
|
|
// no pair across any change of field code.
|
|
// field names may not be part of any phrase or paired across.
|
|
// boolean ops may not be part of any phrase or paired across.
|
|
// ignored words may not be part of any phrase or paired across.
|
|
|
|
// . STAGE #3
|
|
// . set phrases class w/ custom Bits class mods.
|
|
// . set m_bigramId and m_rawPhraseId of all QueryWords. if phraseId
|
|
// is not 0 (phrase exists) then set m_ignorePhrase to 0.
|
|
// . set m_leftConnected, m_rightConnected. word you are connecting
|
|
// to must not be ignored. (no field names or op codes).
|
|
// ensure you are in a phrase with the connected word, too, to
|
|
// really be connected.
|
|
// . set m_leftPhraseStart and m_rightPhraseEnd for all
|
|
// m_inQuotePhrase is not needed since if m_quoteStart is >= 0
|
|
// we MUST be in a quoted phrase!
|
|
// . if word is Connected then set m_ignoreWord to IGNORE_CONNECTED.
|
|
// set his m_phraseSign to m_wordSign (if not 0) or '*' (if it is 0).
|
|
// m_wordSign may have inherited quote or field sign.
|
|
// . if word's m_quoteStart is >= 0 set m_ignoreWord to IGNORE_QUOTED
|
|
// set his m_phraseSign to m_wordSign (if not 0) or '*' (if it is 0)
|
|
// m_wordSign may have inherited quote or field sign.
|
|
// . if one word in a phrase is negative, then set m_phraseSign to '-'
|
|
|
|
// set the Bits used for making phrases from the Words class
|
|
Bits bits;
|
|
if ( !bits.set(&m_tr)) {
|
|
log(LOG_WARN, "query: Had error processing query: %s.", mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
|
|
float userWeightForWord = 1;
|
|
float userWeightForPhrase = 1;
|
|
float userWeightForSynonym = 1;
|
|
bool userNotRequiredForWord = false;
|
|
int32_t ignorei = -1;
|
|
|
|
// assume we contain no pipe operator
|
|
int32_t pi = -1;
|
|
|
|
int32_t posNum = 0;
|
|
const char *ignoreTill = NULL;
|
|
|
|
// loop over all words, these QueryWords are 1-1 with "words"
|
|
for ( int32_t i = 0 ; i < numWords && i < ABS_MAX_QUERY_WORDS ; i++ ) {
|
|
// convenience var, these are 1-1 with "words"
|
|
QueryWord *qw = &m_qwords[i];
|
|
// set to defaults?
|
|
memset ( qw , 0 , sizeof(QueryWord) );
|
|
// but quotestart should be -1
|
|
qw->m_quoteStart = -1;
|
|
qw->m_leftPhraseStart = -1;
|
|
// assume QueryWord is ignored by default
|
|
qw->m_ignoreWord = IGNORE_DEFAULT;
|
|
qw->m_ignorePhrase = IGNORE_DEFAULT;
|
|
qw->m_ignoreWordInBoolQuery = false;
|
|
qw->m_word = m_tr[i].token_start;
|
|
qw->m_wordLen = m_tr[i].token_len;
|
|
qw->m_isPunct = !m_tr[i].is_alfanum;
|
|
|
|
qw->m_posNum = posNum;
|
|
|
|
// count 1 unit for it
|
|
posNum++;
|
|
|
|
// we ignore the facet value range list...
|
|
if ( ignoreTill && qw->m_word < ignoreTill )
|
|
continue;
|
|
|
|
// . we duplicated this code from XmlDoc.cpp's
|
|
// getWordPosVec() function
|
|
if ( qw->m_isPunct ) { // ! wids[i] ) {
|
|
const char *wp = qw->m_word;
|
|
int32_t wplen = qw->m_wordLen;
|
|
// simple space or sequence of just white space
|
|
if ( is_wspace_utf8_string(m_tr[i].token_start, m_tr[i].token_end()))
|
|
posNum += 0;
|
|
// 'cd-rom'
|
|
else if ( wp[0]=='-' && wplen==1 )
|
|
posNum += 0;
|
|
// 'mr. x'
|
|
else if ( wp[0]=='.' && is_wspace_utf8_string(m_tr[i].token_start+1, m_tr[i].token_end()))
|
|
posNum += 0;
|
|
// animal (dog)
|
|
else
|
|
posNum++;
|
|
}
|
|
|
|
const char *w = m_tr[i].token_start;
|
|
int32_t wlen = m_tr[i].token_len;
|
|
// assume it is a query weight operator
|
|
qw->m_queryOp = true;
|
|
// ignore it? (this is for query weight operators)
|
|
if ( i <= ignorei ) continue;
|
|
// deal with pipe operators
|
|
if ( wlen == 5 &&
|
|
w[0]=='P'&&w[1]=='i'&&w[2]=='i'&&w[3]=='P'&&w[4]=='E') {
|
|
pi = i;
|
|
qw->m_opcode = opcode_t::OP_PIPE;
|
|
continue;
|
|
}
|
|
// [133.0r]
|
|
// is it the bracket operator?
|
|
// " LeFtB 113 rp RiGhB "
|
|
if ( wlen == 5 &&
|
|
w[0]=='L'&&w[1]=='e'&&w[2]=='F'&&w[3]=='t'&&w[4]=='B'&&
|
|
i+4 < numWords ) {
|
|
// s MUST point to a number
|
|
const char *s = m_tr[i+2].token_start;
|
|
int32_t slen = m_tr[i+2].token_len;
|
|
|
|
// if no number, it must be
|
|
// " leFtB w RiGhB " or " leFtB p RiGhB "
|
|
if ( ! is_digit(s[0]) ) {
|
|
if(s[0] == 'w') {
|
|
// word weight reset
|
|
userWeightForWord = 1;
|
|
ignorei = i + 4;
|
|
} else if(s[0] == 'p') {
|
|
// phrase weight reset
|
|
userWeightForPhrase = 1;
|
|
} else if(s[0] == 's') {
|
|
// synonym weight reset
|
|
userWeightForSynonym = 1;
|
|
} else if(s[0] == 'n' && s[1] == 'r' && s[2] == 'w') {
|
|
// set word as not required
|
|
userNotRequiredForWord = true;
|
|
}
|
|
ignorei = i + 4;
|
|
} else {
|
|
// get the number
|
|
float fval = atof2 (s, slen);
|
|
// s2 MUST point to the weight type string: w (word), p (phrase) or s (synonym)
|
|
const char *s2 = m_tr[i+4].token_start;
|
|
// is it a phrase?
|
|
if(s2[0] == 'w') {
|
|
userWeightForWord = fval;
|
|
} else if(s2[0] == 'p') {
|
|
userWeightForPhrase = fval;
|
|
} else if(s2[0] == 's') {
|
|
userWeightForSynonym = fval;
|
|
}
|
|
// ignore all following words up and inc. i+6
|
|
ignorei = i + 6;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// assign score weight, if any for this guy
|
|
qw->m_userWeightForWord = userWeightForWord;
|
|
qw->m_userWeightForPhrase = userWeightForPhrase;
|
|
qw->m_userWeightForSynonym = userWeightForSynonym;
|
|
// Set required state based on user input
|
|
qw->m_userNotRequiredForWord = userNotRequiredForWord;
|
|
qw->m_queryOp = false;
|
|
|
|
|
|
// does word #i have a space in it? that will cancel fieldCode
|
|
// if we were in a field
|
|
bool endField = false;
|
|
if(has_space(m_tr[i].token_start, m_tr[i].token_end()) && ! inQuotes)
|
|
endField = true;
|
|
// TODO: fix title:" hey there" (space in quotes is ok)
|
|
// if there's a quote before the first space then
|
|
// it's ok!!!
|
|
if ( endField ) {
|
|
const char *s = m_tr[i].token_start;
|
|
const char *send = s + m_tr[i].token_len;
|
|
for ( ; s < send ; s++ ) {
|
|
// if the space is inside the quotes then it
|
|
// doesn't count!
|
|
if(*s == '\"') {
|
|
endField = false;
|
|
break;
|
|
}
|
|
if(is_wspace_a(*s))
|
|
break;
|
|
}
|
|
}
|
|
// cancel the field if we hit a space (not in quotes)
|
|
if ( endField ) {
|
|
// cancel the field
|
|
fieldCode = FIELD_UNSET;
|
|
fieldLen = 0;
|
|
field = NULL;
|
|
// we no longer have to ignore for link: et al
|
|
ignoreTilSpace = false;
|
|
}
|
|
// . maintain inQuotes and quoteStart
|
|
// . quoteStart is the word # that starts the current quote
|
|
int32_t nq = count_quotes(m_tr[i].token_start, m_tr[i].token_len);
|
|
|
|
if ( nq > 0 ) { // && ! ignoreQuotes ) {
|
|
// toggle quotes if we need to
|
|
if ( nq & 0x01 ) inQuotes = ! inQuotes;
|
|
// set quote sign to sign before the quote
|
|
if ( inQuotes ) {
|
|
quoteSign = '\0';
|
|
for ( const char *p = w + wlen - 1 ; p > w ; p--){
|
|
if ( *p != '\"' ) continue;
|
|
if ( *(p-1) == '-' ) quoteSign = '-';
|
|
if ( *(p-1) == '+' ) quoteSign = '+';
|
|
break;
|
|
}
|
|
}
|
|
// . quoteStart is the word # the quotes started at
|
|
// . it is -1 if not in quotes
|
|
// . now we set it to the alnum word AFTER us!!
|
|
if ( inQuotes && i+1< numWords ) quoteStart = i+1;
|
|
else quoteStart = -1;
|
|
}
|
|
//log(LOG_DEBUG, "Query: nq: %" PRId32" inQuotes: %d,quoteStart: %" PRId32,
|
|
// nq, inQuotes, quoteStart);
|
|
// does word #i have a space in it? that will cancel fieldCode
|
|
// if we were in a field
|
|
// TODO: fix title:" hey there" (space in quotes is ok)
|
|
bool cancelField = false;
|
|
if ( has_space(m_tr[i].token_start, m_tr[i].token_end()) && ! inQuotes )
|
|
cancelField = true;
|
|
// fix title:"foo bar" "another quote" so "another quote"
|
|
// is not in the title: field
|
|
if ( has_space(m_tr[i].token_start, m_tr[i].token_end()) && inQuotes && nq>= 2 )
|
|
cancelField = true;
|
|
|
|
// BUT if we have a quote, and they just got turned off,
|
|
// and the space is not after the quote, do not cancel field!
|
|
if ( nq == 1 && cancelField ) {
|
|
// if we hit the space BEFORE the quote, do NOT cancel
|
|
// the field
|
|
for ( const char *p = w + wlen - 1 ; p > w ; p--) {
|
|
// hey, we got the quote first, keep field
|
|
if ( *p == '\"' ) {cancelField = false; break;}
|
|
// otherwise, we got space first? cancel it!
|
|
if ( is_wspace_a(*p) ) break;
|
|
}
|
|
}
|
|
if ( cancelField ) {
|
|
// cancel the field
|
|
fieldCode = FIELD_UNSET;
|
|
fieldLen = 0;
|
|
field = NULL;
|
|
// we no longer have to ignore for link: et al
|
|
ignoreTilSpace = false;
|
|
}
|
|
// skip if we should
|
|
if ( ignoreTilSpace ){
|
|
if (m_qwords[i-1].m_fieldCode){
|
|
qw->m_fieldCode = m_qwords[i-1].m_fieldCode;
|
|
}
|
|
continue;
|
|
}
|
|
// . is this word potentially a field?
|
|
// . it cannot be another field name in a field
|
|
if(i < m_numWords-2 &&
|
|
m_tr[i+1].token_len==1 && m_tr[i+1].token_start[0]==':' &&
|
|
!is_wspace_utf8_string(m_tr[i+2].token_start,m_tr[i+2].token_end()) &&
|
|
(!is_punct_utf8(m_tr[i+2].token_start) || m_tr[i+2].token_start[0]=='\"' || m_tr[i+2].token_start[0]=='-') &&
|
|
! fieldCode && ! inQuotes)
|
|
{
|
|
// field name may have started before though if it
|
|
// was a compound field name containing hyphens,
|
|
// underscores or periods
|
|
int32_t j = i-1 ;
|
|
while ( j > 0 &&
|
|
((m_qwords[j].m_rawWordId != 0) ||
|
|
( m_qwords[j].m_wordLen ==1 &&
|
|
((m_qwords[j].m_word)[0]=='-' ||
|
|
(m_qwords[j].m_word)[0]=='_' ||
|
|
(m_qwords[j].m_word)[0]=='.')))) {
|
|
j--;
|
|
}
|
|
|
|
if ( j < 0 ) {
|
|
j = 0;
|
|
}
|
|
|
|
// advance j to a non-punct word
|
|
while (!m_tr[j].is_alfanum)
|
|
j++;
|
|
|
|
// ignore all of these words then,
|
|
// they're part of field name
|
|
int32_t tlen = 0;
|
|
for ( int32_t k = j ; k <= i ; k++ )
|
|
tlen += m_tr[k].token_len;
|
|
|
|
//is it recognized field name,like "title" or "url"?
|
|
fieldCode = getFieldCode (m_tr[j].token_start, tlen);
|
|
if(fieldCode) {
|
|
//Previously this was done in all cases to support searching for sub-sub-sub...fields in json/xml
|
|
//The downside was that copy-paste of colon-separated words or artist names like "L:Ron:Harald" didn't work.
|
|
|
|
// set field name to the compound name if it is
|
|
field = m_tr[j].token_start;
|
|
fieldLen = tlen;
|
|
if(j == i)
|
|
fieldSign = wordSign;
|
|
else
|
|
fieldSign = m_qwords[j].m_wordSign;
|
|
//FIXME: TokenizerResult does not promise that tokens that are adjacent in the source string also are adjacent in memory
|
|
// (but since Query only does phase-1 tokenization and the tokenizer currently only does tricky things in phase 2 it currently holds)
|
|
|
|
// if so, it does NOT get its own QueryWord,
|
|
// but its sign can be inherited by its members
|
|
for ( int32_t k = j ; k <= i ; k++ )
|
|
m_qwords[k].m_ignoreWord = IGNORE_FIELDNAME;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// what quote chunk are we in? this is -1 if we're not in quotes
|
|
if ( inQuotes ) qw->m_quoteStart = quoteStart ;
|
|
else qw->m_quoteStart = -1;
|
|
qw->m_inQuotes = inQuotes;
|
|
|
|
// ptr to field, if any
|
|
qw->m_fieldCode = fieldCode;
|
|
// if we are a punct word, see if we end in a sign that can
|
|
// be applied to the next word, a non-punct word
|
|
if ( !m_tr[i].is_alfanum ) {
|
|
wordSign = w[wlen-1];
|
|
if ( wordSign != '-' && wordSign != '+') wordSign = 0;
|
|
if ( wlen>1 &&!is_wspace_a (w[wlen-2]) ) wordSign = 0;
|
|
if ( i > 0 && wlen == 1 ) wordSign = 0;
|
|
|
|
// don't add any QueryWord for a punctuation word
|
|
continue;
|
|
}
|
|
|
|
// what is the sign of our term? +, -, *, ...
|
|
char mysign;
|
|
if ( fieldCode ) mysign = fieldSign;
|
|
else if ( inQuotes ) mysign = quoteSign;
|
|
else mysign = wordSign;
|
|
// are we doing default AND?
|
|
//if ( forcePlus && ! *mysign ) mysign = '+';
|
|
// store the sign
|
|
qw->m_wordSign = mysign;
|
|
// what quote chunk are we in? this is -1 if we're not in quotes
|
|
if ( inQuotes ) qw->m_quoteStart = quoteStart ;
|
|
else qw->m_quoteStart = -1;
|
|
|
|
// . get prefix hash of collection name and field
|
|
// . but first convert field to lower case
|
|
uint64_t ph;
|
|
int32_t fflen = fieldLen;
|
|
if ( fflen > 62 ) fflen = 62;
|
|
char ff[64];
|
|
to_lower3_a ( field , fflen , ff );
|
|
|
|
ph = hash64 ( ff , fflen );
|
|
// make "intitle" map to "title"
|
|
if ( fieldCode == FIELD_TITLE )
|
|
ph = hash64 ( "title", 5 );
|
|
// make "suburl" map to "inurl"
|
|
if ( fieldCode == FIELD_SUBURL )
|
|
ph = hash64 ( "inurl", 5 );
|
|
|
|
// fix for filetype:pdf queries
|
|
if ( fieldCode == FIELD_TYPE )
|
|
ph = hash64 ("type",4);
|
|
|
|
// ptr to field, if any
|
|
qw->m_fieldCode = fieldCode;
|
|
|
|
// prefix hash
|
|
qw->m_prefixHash = ph;
|
|
|
|
// if we're hashing a url:, link:, site: or ip: term,
|
|
// then we need to hash ALL up to the first space
|
|
if ( fieldCode == FIELD_URL ||
|
|
fieldCode == FIELD_EXT ||
|
|
fieldCode == FIELD_LINK ||
|
|
fieldCode == FIELD_SITELINK||
|
|
fieldCode == FIELD_LINKS||
|
|
fieldCode == FIELD_SITE ||
|
|
fieldCode == FIELD_IP ||
|
|
fieldCode == FIELD_GBFIELDMATCH ) {
|
|
// . find 1st space -- that terminates the field value
|
|
// . make "end" point to the end of the entire query
|
|
const char *end = m_tr[m_tr.size()-1].token_end();
|
|
// use this for gbmin:price:1.99 etc.
|
|
int32_t firstColonLen = -1;
|
|
int32_t lastColonLen = -1;
|
|
int32_t colonCount = 0;
|
|
|
|
// "w" points to the first alnumword after the field,
|
|
// so for site:xyz.com "w" points to the 'x' and wlen
|
|
// would be 3 in that case since xyz is a word of 3
|
|
// chars. so advance
|
|
// wlen until we hit a space.
|
|
while (w + wlen < end) {
|
|
// stop at first white space
|
|
if (is_wspace_utf8(w + wlen)) break;
|
|
// in case of gbmin:price:1.99 record first ':'
|
|
if (w[wlen] == ':') {
|
|
lastColonLen = wlen;
|
|
if (firstColonLen == -1)
|
|
firstColonLen = wlen;
|
|
colonCount++;
|
|
}
|
|
// fix "gbsortbyint:date)"
|
|
// these are used as boolean operators
|
|
// so do not include them in the value.
|
|
// we also did this above to set cancelField
|
|
// to true.
|
|
if (w[wlen] == '(' || w[wlen] == ')')
|
|
break;
|
|
|
|
wlen++;
|
|
}
|
|
// ignore following words until we hit a space
|
|
ignoreTilSpace = true;
|
|
// the hash. keep it case insensitive. only
|
|
// the fieldmatch stuff should be case-sensitive.
|
|
// this may change later.
|
|
uint64_t wid = hash64Lower_utf8(w, wlen, 0LL);
|
|
|
|
if (fieldCode == FIELD_GBFIELDMATCH) {
|
|
// hash the json field name. (i.e. tag.uri)
|
|
// make it case sensitive as
|
|
// seen in XmlDoc.cpp::hashFacet2().
|
|
// the other fields are hashed in
|
|
// XmlDoc.cpp::hashNumber3().
|
|
// CASE SENSITIVE!!!!
|
|
wid = hash64(w, firstColonLen, 0LL);
|
|
// if it is like
|
|
// gbfieldmatch:tag.uri:"http://xyz.com/poo"
|
|
// then we should hash the string into
|
|
// an int just like how the field value would
|
|
// be hashed when adding gbfacetstr: terms
|
|
// in XmlDoc.cpp:hashFacet2(). the hash of
|
|
// the tag.uri field, for example, is set
|
|
// in hashFacet1() and set to "val32". so
|
|
// hash it just like that does here.
|
|
const char *a = w + firstColonLen + 1;
|
|
// . skip over colon at start
|
|
if (a[0] == ':') a++;
|
|
// . skip over quotes at start/end
|
|
bool inQuotes = false;
|
|
if (a[0] == '\"') {
|
|
inQuotes = true;
|
|
a++;
|
|
}
|
|
// end of field
|
|
const char *b = a;
|
|
// if not in quotes advance until
|
|
// we hit whitespace
|
|
char cs;
|
|
for (; !inQuotes && *b; b += cs) {
|
|
cs = getUtf8CharSize(b);
|
|
if (is_wspace_utf8(b)) break;
|
|
}
|
|
// if in quotes, go until we hit quote
|
|
for (; inQuotes && *b != '\"'; b++)
|
|
;
|
|
// now hash that up. this must be 64 bit
|
|
// to match in XmlDoc.cpp::hashFieldMatch()
|
|
uint64_t val64 = hash64(a, b - a);
|
|
// make a composite of tag.uri and http://...
|
|
// just like XmlDoc.cpp::hashFacet2() does
|
|
wid = hash64(val64, wid);
|
|
}
|
|
|
|
// should we have normalized before hashing?
|
|
if (fieldCode == FIELD_URL ||
|
|
fieldCode == FIELD_LINK ||
|
|
fieldCode == FIELD_SITELINK ||
|
|
fieldCode == FIELD_LINKS ||
|
|
fieldCode == FIELD_SITE) {
|
|
Url url;
|
|
url.set( w, wlen, ( fieldCode != FIELD_SITE ), false );
|
|
|
|
if (fieldCode == FIELD_SITELINK) {
|
|
wid = hash64(url.getHost(), url.getHostLen());
|
|
} else {
|
|
wid = hash64(url.getUrl(), url.getUrlLen());
|
|
}
|
|
}
|
|
|
|
// like we do it in XmlDoc.cpp's hashString()
|
|
if (ph) {
|
|
qw->m_wordId = hash64h(wid, ph);
|
|
} else {
|
|
qw->m_wordId = wid;
|
|
}
|
|
|
|
qw->m_rawWordId = 0LL; // only for highlighting?
|
|
qw->m_bigramId = 0LL;
|
|
qw->m_rawPhraseId = 0LL;
|
|
qw->m_opcode = opcode_t::OP_NONE;
|
|
|
|
// definitely not a query stop word
|
|
qw->m_isQueryStopWord = false;
|
|
|
|
// do not ignore the wordId
|
|
qw->m_ignoreWord = IGNORE_NO_IGNORE;
|
|
|
|
// we are the first word?
|
|
firstWord = false;
|
|
|
|
// we're done with this one
|
|
continue;
|
|
}
|
|
|
|
|
|
opcode_t opcode = opcode_t::OP_NONE;
|
|
// if query is all in upper case and we're doing boolean
|
|
// DETECT, then assume not boolean
|
|
if ( allUpper && boolFlag == 2 ) boolFlag = 0;
|
|
// . is this word a boolean operator?
|
|
// . cannot be in quotes or field
|
|
if ( boolFlag >= 1 && ! inQuotes && ! fieldCode ) {
|
|
// are we an operator?
|
|
if ( ! firstWord && wlen==2 &&
|
|
w[0]=='O' && w[1]=='R')
|
|
opcode = opcode_t::OP_OR;
|
|
else if ( ! firstWord && wlen==3 &&
|
|
w[0]=='A' && w[1]=='N' && w[2]=='D')
|
|
opcode = opcode_t::OP_AND;
|
|
else if ( ! firstWord && wlen==3 &&
|
|
w[0]=='N' && w[1]=='O' && w[2]=='T')
|
|
opcode = opcode_t::OP_NOT;
|
|
else if ( wlen==5 && w[0]=='L' && w[1]=='e' &&
|
|
w[2]=='F' && w[3]=='t' && w[4]=='P' )
|
|
opcode = opcode_t::OP_LEFTPAREN;
|
|
else if ( wlen==5 && w[0]=='R' && w[1]=='i' &&
|
|
w[2]=='G' && w[3]=='h' && w[4]=='P' )
|
|
opcode = opcode_t::OP_RIGHTPAREN;
|
|
// no pair across or even include any boolean op phrs
|
|
if ( opcode != opcode_t::OP_NONE ) {
|
|
bits.clearBits(i,D_CAN_PAIR_ACROSS);
|
|
bits.clearBits(i,D_CAN_BE_IN_PHRASE);
|
|
qw->m_ignoreWord = IGNORE_BOOLOP;
|
|
qw->m_opcode = opcode;
|
|
if ( opcode == opcode_t::OP_LEFTPAREN ) continue;
|
|
if ( opcode == opcode_t::OP_RIGHTPAREN ) continue;
|
|
// if this is uncommented all of our operators
|
|
// become actual query terms (mdw)
|
|
if ( opcode == opcode_t::OP_UOR ) continue;
|
|
// if you just have ANDs and ()'s that does
|
|
// not make you a boolean query! we are bool
|
|
// by default!!
|
|
if ( opcode == opcode_t::OP_AND ) continue;
|
|
m_isBoolean = true;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// . add single-word term id
|
|
// . this is computed by hash64AsciiLower()
|
|
// . but only hash64Lower_a if _HASHWITHACCENTS_ is true
|
|
uint64_t wid = m_tr[i].token_hash;
|
|
qw->m_rawWordId = wid;
|
|
// we now have a first word already set
|
|
firstWord = false;
|
|
// . are we a QUERY stop word?
|
|
// . NEVER count as stop word if it's in all CAPS and
|
|
// the query as a whole is NOT in all CAPS
|
|
// . It's probably an acronym
|
|
if ( m_tr[i].token_len>1 &&
|
|
is_upper_utf8_string(m_tr[i].token_start, m_tr[i].token_end()) &&
|
|
! allUpper )
|
|
{
|
|
qw->m_isQueryStopWord = false;
|
|
qw->m_isStopWord = false;
|
|
} else {
|
|
qw->m_isQueryStopWord =::isQueryStopWord (w,wlen,wid,
|
|
m_langId);
|
|
// . BUT, if it is a single letter contraction thing
|
|
// . ninad: make this == 1 if in utf8! TODO!! it is!
|
|
if ( i>0 && wlen == 1 && w[-1] == '\'' )
|
|
qw->m_isQueryStopWord = true;
|
|
qw->m_isStopWord =::isStopWord (w,wlen,wid);
|
|
}
|
|
// . do not count as query stop word if it is the last in query
|
|
// . like the query: 'baby names that start with j'
|
|
if ( i + 2 > numWords ) {
|
|
qw->m_isQueryStopWord = false;
|
|
}
|
|
|
|
// like we do it in XmlDoc.cpp's hashString()
|
|
if ( ph ) {
|
|
qw->m_wordId = hash64(wid, ph);
|
|
} else {
|
|
qw->m_wordId = wid;
|
|
}
|
|
|
|
// do not ignore the word
|
|
qw->m_ignoreWord = IGNORE_NO_IGNORE;
|
|
|
|
//except if it is a high-frequency-term and expensive to look up. In that case ignore the word but keep the phrases/bigrams thereof
|
|
uint64_t termId = (qw->m_wordId & TERMID_MASK);
|
|
if(g_conf.m_useHighFrequencyTermCache &&
|
|
m_allowHighFreqTermCache && g_hfts.is_registered_term(termId)) {
|
|
log(LOG_DEBUG, "query: term='%.*s' with termId %lu is a highfreq term. Marking it for ignoring", wlen, w, termId);
|
|
qw->m_ignoreWord = IGNORE_HIGHFREMTERM;
|
|
}
|
|
|
|
// reset for next word
|
|
userNotRequiredForWord = false;
|
|
}
|
|
|
|
//If there's only one alphanumerical word and it was ignored due to high-freq-term then the query is treated as 0 terms and will return an empty
|
|
//result. Therefore un-ignore the single word and let it fetch (best-effort) results from the high-freq-term-cache
|
|
int numAlfanumWords = 0;
|
|
int numAlfanumWordsHighFreqTerms = 0;
|
|
int alfanumWordIndex = -1;
|
|
for(int i=0; i<numWords; i++) {
|
|
if(m_tr[i].is_alfanum) {
|
|
alfanumWordIndex = i;
|
|
numAlfanumWords++;
|
|
if(m_qwords[i].m_ignoreWord==IGNORE_HIGHFREMTERM)
|
|
numAlfanumWordsHighFreqTerms++;
|
|
|
|
}
|
|
}
|
|
if(numAlfanumWords == 1 && numAlfanumWordsHighFreqTerms==1)
|
|
m_qwords[alfanumWordIndex].m_ignoreWord = IGNORE_NO_IGNORE;
|
|
|
|
// pipe those that should be piped
|
|
for ( int32_t i = 0 ; i < pi ; i++ ) m_qwords[i].m_piped = true;
|
|
|
|
// . set m_leftConnected and m_rightConnected
|
|
// . we are connected to the first non-punct word on our left
|
|
// if we are separated by a small amount of defined punctuation
|
|
// . see getIsConnection() for that definition
|
|
// . this allows us to just lookup the phrase for things like
|
|
// "cd-rom" rather than lookup "cd" , "rom" and "cd-rom"
|
|
// . skip if prev word is IGNORE_BOOLOP, IGNORE_FIELDNAME or
|
|
// IGNORE_DEFAULT
|
|
// . we have to set outside the main loop above since we check
|
|
// the m_ignoreWord member of the i+2nd word
|
|
for ( int32_t i = 0 ; i < numWords ; i++ ) {
|
|
QueryWord *qw = &m_qwords[i];
|
|
if ( qw->m_ignoreWord ) continue;
|
|
if ( i + 2 < numWords && ! m_qwords[i+2].m_ignoreWord&&
|
|
isConnection(i+1) )
|
|
qw->m_rightConnected = true;
|
|
if ( i - 2 >= 0 && ! m_qwords[i-2].m_ignoreWord &&
|
|
isConnection(i-1) )
|
|
qw->m_leftConnected = true;
|
|
}
|
|
|
|
// now modify the Bits class before generating phrases
|
|
for ( int32_t i = 0 ; i < numWords ; i++ ) {
|
|
// get default bits
|
|
unsigned char b = bits.queryBits(i);
|
|
// allow pairing across anything by default
|
|
b |= D_CAN_PAIR_ACROSS;
|
|
// get Query Word
|
|
QueryWord *qw = &m_qwords[i];
|
|
// . skip if part of a query weight operator
|
|
// . cannot be in a phrase, or anything
|
|
if ( qw->m_queryOp && qw->m_opcode==opcode_t::OP_NONE) {
|
|
b = D_CAN_PAIR_ACROSS;
|
|
}
|
|
// is this word a sequence of punctuation and spaces?
|
|
else if ( !m_tr[i].is_alfanum ) {
|
|
// pair across ANY punct, even double spaces by default
|
|
b |= D_CAN_PAIR_ACROSS;
|
|
// but do not pair across anything with a quote in it
|
|
if ( count_quotes(m_tr[i].token_start, m_tr[i].token_len) > 0 )
|
|
b &= ~D_CAN_PAIR_ACROSS;
|
|
// continue if we're in quotes
|
|
else if ( qw->m_quoteStart >= 0 ) goto next;
|
|
// continue if we're in a field
|
|
else if ( qw->m_fieldCode > 0 ) goto next;
|
|
// if guy on left is in field, do not pair across
|
|
if ( i > 0 && m_qwords[i-1].m_fieldCode > 0 )
|
|
b &= ~D_CAN_PAIR_ACROSS;
|
|
// or if guy on right in field
|
|
if ( i +1 < numWords && m_qwords[i+1].m_fieldCode > 0 )
|
|
b &= ~D_CAN_PAIR_ACROSS;
|
|
// do not pair across ".." when not in quotes/field
|
|
const char *w = m_tr[i].token_start;
|
|
int32_t wlen = m_tr[i].token_len;
|
|
for ( int32_t j = 0 ; j < wlen-1 ; j++ ) {
|
|
if ( w[j ]!='.' ) continue;
|
|
if ( w[j+1]!='.' ) continue;
|
|
b &= ~D_CAN_PAIR_ACROSS;
|
|
break;
|
|
}
|
|
}
|
|
else {
|
|
// . no field names, bool operators, cruft in fields
|
|
// can be any part of a phrase
|
|
// . no pair across any change of field code
|
|
// . 'girl title:boy' --> no "girl title" phrase!
|
|
if ( qw->m_ignoreWord && qw->m_ignoreWord!=IGNORE_HIGHFREMTERM ) {
|
|
b &= ~D_CAN_PAIR_ACROSS;
|
|
b &= ~D_CAN_BE_IN_PHRASE;
|
|
}
|
|
// . no boolean ops
|
|
// . 'this OR that' --> no "this OR that" phrase
|
|
if ( qw->m_opcode != opcode_t::OP_NONE ) {
|
|
b &= ~D_CAN_PAIR_ACROSS;
|
|
b &= ~D_CAN_BE_IN_PHRASE;
|
|
}
|
|
if ( qw->m_wordSign == '-' && qw->m_quoteStart < 0) {
|
|
b &= ~D_CAN_PAIR_ACROSS;
|
|
b &= ~D_CAN_BE_IN_PHRASE;
|
|
}
|
|
|
|
}
|
|
next:
|
|
// set it back all tweaked
|
|
bits.assignBits(i,b);
|
|
}
|
|
|
|
// treat strongly connected phrases like cd-rom and 3.2.0.3 as being
|
|
// in quotes for the most part, therefore, set m_quoteStart for them
|
|
int32_t j;
|
|
int32_t qs = -1;
|
|
for ( j = 0 ; j < numWords ; j++ ) {
|
|
// skip all but strongly connected words
|
|
if ( m_qwords[j].m_ignoreWord != IGNORE_CONNECTED &&
|
|
// must also be non punct word OR a space
|
|
( m_tr[j].is_alfanum || *m_tr[j].token_start==' ' ) ) {
|
|
// break the "quote", if any
|
|
qs = -1; continue; }
|
|
// if he is punctuation and qs is -1, skip him,
|
|
// punctuation words can no longer start a quote
|
|
if ( !m_tr[j].is_alfanum && qs == -1 ) continue;
|
|
// unignore him if we should
|
|
if ( keepAllSingles ) m_qwords[j].m_ignoreWord = IGNORE_NO_IGNORE;
|
|
// if already in quotes, don't bother!
|
|
if ( m_qwords[j].m_quoteStart >= 0 ) continue;
|
|
// remember him
|
|
if ( qs == -1 ) qs = j;
|
|
// he starts the phrase
|
|
m_qwords[j].m_quoteStart = qs;
|
|
// force him into a quoted phrase
|
|
m_qwords[j].m_inQuotes = true;
|
|
//m_qwords[j].m_inQuotedPhrase = true;
|
|
}
|
|
|
|
// fix for tags.uri:http://foo.com/bar so it works like
|
|
// tags.uri:"http://foo.com/bar" like it should
|
|
int32_t first = -1;
|
|
for ( j = 0 ; j < numWords ; j++ ) {
|
|
// stop when we hit spaces
|
|
if ( has_wspace_utf8_string(m_tr[j].token_start, m_tr[j].token_end()) ) {
|
|
first = -1;
|
|
continue;
|
|
}
|
|
// skip if not in field
|
|
if ( ! m_qwords[j].m_fieldCode ) continue;
|
|
// must be in a generic field, the other fields like site:
|
|
// will be messed up by this logic
|
|
if ( m_qwords[j].m_fieldCode != FIELD_GENERIC ) continue;
|
|
// first alnumword in field?
|
|
if ( first == -1 ) {
|
|
// must be alnum
|
|
if ( m_qwords[j].m_isPunct ) continue;
|
|
// must have punct then another alnum word
|
|
if ( j+2 >= numWords ) break;
|
|
// spaces screw it up
|
|
if ( has_wspace_utf8_string(m_tr[j+1].token_start, m_tr[j+1].token_end()) ) continue;
|
|
// then an alnum word after
|
|
first = j;
|
|
}
|
|
// we are in fake quoted phrase
|
|
m_qwords[j].m_inQuotes = true;
|
|
m_qwords[j].m_quoteStart = first;
|
|
}
|
|
|
|
// make the phrases from the words and the tweaked Bits class
|
|
if ( !phrases.set(m_tr,bits) )
|
|
return false;
|
|
|
|
// do phrases stuff
|
|
for ( int32_t i = 0 ; i < numWords ; i++ ) {
|
|
// get the ith QueryWord
|
|
QueryWord *qw = &m_qwords[i];
|
|
|
|
//if word is ignored (and it is not due to high-freq-term) then don't generate a phrase/bigram query term
|
|
if(qw->m_ignoreWord && qw->m_ignoreWord!=IGNORE_HIGHFREMTERM)
|
|
continue;
|
|
if ( qw->m_fieldCode && qw->m_quoteStart < 0) continue;
|
|
// get the first word # to our left that starts a phrase
|
|
// of which we are a member
|
|
qw->m_leftPhraseStart = -1;
|
|
for ( int32_t j = i - 1 ; j >= 0 ; j-- ) {
|
|
if ( ! bits.canPairAcross(j+1) ) break;
|
|
if ( !m_tr[j].is_alfanum ) continue;
|
|
|
|
qw->m_leftPhraseStart = j;
|
|
// we can't pair across alnum words now, we just want bigrams
|
|
if ( m_tr[j].is_alfanum ) break;
|
|
// now we do bigrams so only allow two words even
|
|
// if they are stop words
|
|
break;
|
|
}
|
|
// . is this word in a quoted phrase?
|
|
// . the whole phrase must be in the same set of quotes
|
|
// . if we're in a left phrase, he must be in our quotes
|
|
if ( qw->m_leftPhraseStart >= 0 &&
|
|
qw->m_quoteStart >= 0 &&
|
|
qw->m_leftPhraseStart >= qw->m_quoteStart )
|
|
qw->m_inQuotedPhrase = true;
|
|
// if we start a phrase, ensure next guy is in our quote
|
|
if ( ! qw->m_ignorePhrase && i+1 < numWords &&
|
|
m_qwords[i+1].m_quoteStart >= 0 &&
|
|
m_qwords[i+1].m_quoteStart <= i )
|
|
qw->m_inQuotedPhrase = true;
|
|
// are we the first word in the quote?
|
|
if ( i-1>=0 && qw->m_quoteStart == i )
|
|
qw->m_inQuotedPhrase = true;
|
|
// ignore single words that are in a quoted phrase
|
|
if ( ! keepAllSingles && qw->m_inQuotedPhrase )
|
|
qw->m_ignoreWord = IGNORE_QUOTED;
|
|
|
|
// . get phrase info for this term
|
|
// . a pid (phraseId)of 0 indicates it does not start a phrase
|
|
// . raw phrase termId
|
|
uint64_t pid = 0LL;
|
|
|
|
phrases.getMinWordsInPhrase(i,(int64_t *)&pid);;
|
|
|
|
// store it
|
|
qw->m_rawPhraseId = pid;
|
|
|
|
// does word #i start a phrase?
|
|
if ( pid != 0 ) {
|
|
uint64_t ph = qw->m_prefixHash ;
|
|
|
|
// like we do it in XmlDoc.cpp's hashString()
|
|
if ( ph ) qw->m_bigramId = hash64 ( pid , ph );
|
|
else qw->m_bigramId = pid;
|
|
|
|
//calculate length of phrase(bigram) in bytes
|
|
int32_t numWordsInPhrase = phrases.getNumWordsInPhrase2(i);
|
|
int phraseLen = 0;
|
|
for(int j=i; j<i+numWordsInPhrase; j++)
|
|
phraseLen += m_qwords[j].m_wordLen;
|
|
qw->m_bigramLen = phraseLen;
|
|
|
|
// do not ignore the phrase, it's valid
|
|
qw->m_ignorePhrase = IGNORE_NO_IGNORE;
|
|
}
|
|
|
|
|
|
// . phrase sign is inherited from word's sign if it's a minus
|
|
// . word sign is inherited from field, quote or right before
|
|
// the word
|
|
// . that is, all words in -"to be or not" will have a '-' sign
|
|
// . phraseId may or may not be 0 at this point
|
|
if ( qw->m_wordSign == '-' ) qw->m_phraseSign = '-';
|
|
|
|
// . dist word signs to others in the same connected string
|
|
// . use "-cd-rom x-box" w/ no connector in between
|
|
// . test queries:
|
|
// . +cd-rom +x-box
|
|
// . -cd-rom +x-box
|
|
// . -m-o-n
|
|
// . who was the first (was is a query stop word)
|
|
// . www.xxx.com
|
|
// . welcome to har.com
|
|
// . hezekiah walker the love family affair ii live at radio
|
|
// city music hall
|
|
// . fotostudio +m-o-n-a-r-t
|
|
// . fotostudio -m-o-n-a-r-t
|
|
// . i'm home
|
|
if ( qw->m_leftConnected && qw->m_leftPhraseStart >= 0 )
|
|
qw->m_wordSign = m_qwords[i-2].m_wordSign;
|
|
|
|
// . if we connected to the alnum word on our right then
|
|
// soft require the phrase (i.e. treat like a single term)
|
|
// . example: cd-rom or www.xxx.com
|
|
// . 'welcome to har.com' should get a '*' for "har.com" sign
|
|
if ( qw->m_rightConnected ) {
|
|
if ( qw->m_wordSign) qw->m_phraseSign = qw->m_wordSign;
|
|
else qw->m_phraseSign = '*';
|
|
}
|
|
|
|
// . if we're in quotes then any phrase we have should be
|
|
// soft required (i.e. treated like a single term)
|
|
// . we do not allow phrases in queries to pair across
|
|
// quotes. See where we tweak the Bits class above.
|
|
if ( qw->m_quoteStart >= 0 ) {
|
|
qw->m_phraseSign = '*';
|
|
}
|
|
|
|
// . if we are the last word in a phrase that consists of all
|
|
// PLAIN stop words then make the phrase have a '*'
|
|
// . 'to be or not to be .. test' (cannot pair across "..")
|
|
// . don't use QUERY stop words cuz of "who was the first?" qry
|
|
if ( pid ) {
|
|
int32_t nw = phrases.getNumWordsInPhrase2(i);
|
|
int32_t j;
|
|
// search up to this far
|
|
int32_t maxj = i + nw;
|
|
// but not past our truncated limit
|
|
if ( maxj > ABS_MAX_QUERY_WORDS )
|
|
maxj = ABS_MAX_QUERY_WORDS;
|
|
|
|
for ( j = i ; j < maxj ; j++ ) {
|
|
// skip punct
|
|
if ( !m_tr[j].is_alfanum ) continue;
|
|
// break out if not a stop word
|
|
if ( ! bits.isStopWord(j) ) break;
|
|
// break out if has a term sign
|
|
if ( m_qwords[j].m_wordSign ) break;
|
|
}
|
|
// if everybody in phrase #i was a signless stopword
|
|
// and the phrase was signless, make it have a '*' sign
|
|
if ( j >= maxj && m_qwords[i].m_phraseSign == '\0' )
|
|
m_qwords[i].m_phraseSign = '*';
|
|
// . if a constituent has a - sign, then the whole
|
|
// phrase becomes negative, too
|
|
// . fixes 'apple -computer' truncation problem
|
|
for ( int32_t j = i ; j < maxj ; j++ )
|
|
if ( m_qwords[j].m_wordSign == '-' )
|
|
qw->m_phraseSign = '-';
|
|
}
|
|
|
|
// . ignore unsigned QUERY stop words that are not yet ignored
|
|
// and are in unignored phrases
|
|
// . 'who was the first taiwanese president' should not get
|
|
// "who was" term sign changed to '*' because "was" is a
|
|
// QUERY stop word. So ignore singles query stop words
|
|
// in phrases now
|
|
if ( //! keepAllSingles &&
|
|
(qw->m_isQueryStopWord && !m_isBoolean) &&
|
|
m_useQueryStopWords &&
|
|
! qw->m_fieldCode &&
|
|
// fix 'the tigers'
|
|
//(qw->m_leftPhraseStart >= 0 || qw->m_bigramId > 0 ) &&
|
|
! qw->m_wordSign &&
|
|
! qw->m_ignoreWord )
|
|
qw->m_ignoreWord = IGNORE_QSTOP;
|
|
|
|
// . ignore and/or between quoted phrases, save user from
|
|
// themselves (they meant AND/OR)
|
|
if ( ! keepAllSingles && qw->m_isQueryStopWord &&
|
|
! qw->m_fieldCode &&
|
|
m_useQueryStopWords &&
|
|
! qw->m_bigramId && ! qw->m_inQuotes &&
|
|
((qw->m_wordId == 255176654160863LL) ||
|
|
(qw->m_wordId == 46196171999655LL)) )
|
|
qw->m_ignoreWord = IGNORE_QSTOP;
|
|
// . ignore repeated single words and phrases
|
|
// . look at the old termIds for this, too
|
|
// . should ignore 2nd 'time' in 'time after time' then
|
|
// . but boolean queries often need to repeat terms
|
|
|
|
// . NEW - words must be same sign and not in different
|
|
// . quoted phrases to be ignored -partap
|
|
if ( ! m_isBoolean && !qw->m_ignoreWord ) {
|
|
for ( int32_t j = 0 ; j < i ; j++ ) {
|
|
if ( m_qwords[j].m_ignoreWord ) continue;
|
|
if ( m_qwords[j].m_wordId == qw->m_wordId &&
|
|
m_qwords[j].m_wordSign ==qw->m_wordSign &&
|
|
(!keepAllSingles ||
|
|
(m_qwords[j].m_quoteStart
|
|
== qw->m_quoteStart))){
|
|
qw->m_ignoreWord = IGNORE_REPEAT;
|
|
}
|
|
}
|
|
}
|
|
if ( ! m_isBoolean && !qw->m_ignorePhrase ) {
|
|
// ignore repeated phrases too!
|
|
for ( int32_t j = 0 ; j < i ; j++ ) {
|
|
if ( m_qwords[j].m_ignorePhrase ) continue;
|
|
if ( m_qwords[j].m_bigramId == qw->m_bigramId &&
|
|
m_qwords[j].m_phraseSign
|
|
== qw->m_phraseSign)
|
|
qw->m_ignorePhrase = IGNORE_REPEAT;
|
|
}
|
|
}
|
|
}
|
|
|
|
// . if we only have one quoted query then force its sign to be '+'
|
|
// . '"get the phrase" the' --> +"get the phrase" (last the is ignored)
|
|
// . "time enough for love" --> +"time enough" +"enough for love"
|
|
// . if all unignored words are in the same set of quotes then change
|
|
// all '*' (soft-required) phrase signs to '+'
|
|
for ( j= 0 ; j < numWords ; j++ ) {
|
|
if ( !m_tr[j].is_alfanum) continue;
|
|
if ( m_qwords[j].m_quoteStart < 0 ) break;
|
|
if ( m_qwords[j].m_ignoreWord ) continue;
|
|
if ( j < 2 ) continue;
|
|
if ( m_qwords[j-2].m_quoteStart != m_qwords[j].m_quoteStart )
|
|
break;
|
|
}
|
|
if ( j >= numWords ) {
|
|
for ( j= 0 ; j < numWords ; j++ ) {
|
|
if ( m_qwords[j].m_phraseSign == '*' )
|
|
m_qwords[j].m_phraseSign = '+';
|
|
}
|
|
}
|
|
|
|
// . force a plus on any site: or ip: query terms
|
|
// . also disable site clustering if we have either of these terms
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
|
QueryWord *qw = &m_qwords[i];
|
|
if ( qw->m_ignoreWord ) continue;
|
|
if ( qw->m_wordSign ) continue;
|
|
if ( qw->m_fieldCode != FIELD_SITE &&
|
|
qw->m_fieldCode != FIELD_IP ) continue;
|
|
qw->m_wordSign = '+';
|
|
}
|
|
|
|
// . if one or more of a phrase's constituent terms exceeded
|
|
// term #MAX_QUERY_TERMS then we should also soft require that phrase
|
|
// . fixes 'hezekiah walker the love family affair ii live at
|
|
// radio city music hall'
|
|
// . how many non-ignored phrases?
|
|
int32_t count = 0;
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
|
QueryWord *qw = &m_qwords[i];
|
|
if ( qw->m_ignorePhrase ) continue;
|
|
if ( ! qw->m_bigramId ) continue;
|
|
count++;
|
|
}
|
|
for ( int32_t i = 0 ; i < numWords ; i++ ) {
|
|
QueryWord *qw = &m_qwords[i];
|
|
// count non-ignored words
|
|
if ( qw->m_ignoreWord ) continue;
|
|
// if under limit, continue
|
|
if ( count++ < ABS_MAX_QUERY_TERMS ) continue;
|
|
// . otherwise, ignore
|
|
// . if we set this for our UOR'ed terms from SearchInput.cpp's
|
|
// UOR'ed facebook interests then it causes us to get no results!
|
|
// so make sure that MAX_QUERY_TERMS is big enough with respect to
|
|
// the opCount in SearchInput.cpp
|
|
qw->m_ignoreWord = IGNORE_BREECH;
|
|
// left phrase should get a '*'
|
|
int32_t left = qw->m_leftPhraseStart;
|
|
if ( left >= 0 && ! m_qwords[left].m_phraseSign )
|
|
m_qwords[left].m_phraseSign = '*';
|
|
// our phrase should get a '*'
|
|
if ( qw->m_bigramId && ! qw->m_phraseSign )
|
|
qw->m_phraseSign = '*';
|
|
}
|
|
|
|
// . fix the 'x -50a' query so it returns results
|
|
// . how many non-negative, non-ignored words/phrases do we have?
|
|
count = 0;
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
|
const QueryWord *qw = &m_qwords[i];
|
|
if ( qw->m_ignoreWord ) continue;
|
|
if ( qw->m_wordSign == '-' ) continue;
|
|
count++;
|
|
}
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
|
const QueryWord *qw = &m_qwords[i];
|
|
if ( qw->m_ignorePhrase ) continue;
|
|
if ( qw->m_phraseSign == '-' ) continue;
|
|
if ( qw->m_bigramId == 0LL ) continue;
|
|
count++;
|
|
}
|
|
// if everybody is ignored or negative UNignore first query stop word
|
|
if ( count == 0 ) {
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
|
QueryWord *qw = &m_qwords[i];
|
|
if ( qw->m_ignoreWord != IGNORE_QSTOP ) continue;
|
|
qw->m_ignoreWord = IGNORE_NO_IGNORE;
|
|
count++;
|
|
break;
|
|
}
|
|
}
|
|
|
|
quoteStart = -1;
|
|
int32_t quoteEnd = -1;
|
|
// set m_quoteENd
|
|
for ( int32_t i = m_numWords - 1 ; i >= 0 ; i-- ) {
|
|
// get ith word
|
|
QueryWord *qw = &m_qwords[i];
|
|
// skip if ignored
|
|
if ( qw->m_ignoreWord ) continue;
|
|
// skip if not in quotes
|
|
if ( qw->m_quoteStart < 0 ) continue;
|
|
// if match previous guy...
|
|
if ( qw->m_quoteStart == quoteStart ) {
|
|
// inherit the end
|
|
qw->m_quoteEnd = quoteEnd;
|
|
// all done
|
|
continue;
|
|
}
|
|
// ok, we are the end then
|
|
quoteEnd = i;
|
|
quoteStart = qw->m_quoteStart;
|
|
}
|
|
|
|
|
|
int32_t wkid = 0;
|
|
int32_t upTo = -1;
|
|
|
|
//
|
|
// set the wiki phrase ids
|
|
//
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
|
// get ith word
|
|
QueryWord *qw = &m_qwords[i];
|
|
// in a phrase from before?
|
|
if ( i < upTo ) {
|
|
qw->m_wikiPhraseId = wkid;
|
|
continue;
|
|
}
|
|
// assume none
|
|
qw->m_wikiPhraseId = 0;
|
|
// skip if punct
|
|
if ( !m_tr[i].is_alfanum ) continue;
|
|
// get word
|
|
int32_t nwk ;
|
|
nwk = g_wiki.getNumWordsInWikiPhrase ( i , &m_tr );
|
|
// bail if none
|
|
if ( nwk <= 1 ) continue;
|
|
|
|
// inc it
|
|
wkid++;
|
|
// store it
|
|
qw->m_wikiPhraseId = wkid;
|
|
// set loop parm
|
|
upTo = i + nwk;
|
|
}
|
|
|
|
// consider terms strongly connected like wikipedia title phrases
|
|
for ( int32_t i = 0 ; i + 2 < m_numWords ; i++ ) {
|
|
// get ith word
|
|
QueryWord *qw1 = &m_qwords[i];
|
|
// must not already be in a wikiphrase
|
|
//if ( qw1->m_wikiPhraseId > 0 ) continue;
|
|
// what query word # is that?
|
|
int32_t qwn = qw1 - m_qwords;
|
|
// get the next alnum word after that
|
|
// assume its the last word in our bigram phrase
|
|
QueryWord *qw2 = &m_qwords[qwn+2];
|
|
// must be in same wikiphrase
|
|
if ( qw2->m_wikiPhraseId > 0 ) continue;
|
|
|
|
// if there is a strong connector like the . in 'dmoz.org'
|
|
// then consider it a wiki bigram too
|
|
if ( ! qw1->m_rightConnected ) continue;
|
|
if ( ! qw2->m_leftConnected ) continue;
|
|
|
|
// fix 'rdf.org.dumps' so org.dumps gets same
|
|
// wikiphraseid as rdf.org
|
|
int id;
|
|
if ( qw1->m_wikiPhraseId ) id = qw1->m_wikiPhraseId;
|
|
else id = ++wkid;
|
|
|
|
// store it
|
|
qw1->m_wikiPhraseId = id;
|
|
|
|
qw2->m_wikiPhraseId = id;
|
|
}
|
|
|
|
// all done
|
|
return true;
|
|
}
|
|
|
|
|
|
// Tightens queries that look like a domain name ("example.com",
// "www.example.com", "www.example.co.uk") or like an API/function reference
// ("file.open", "file.open()", "file::open", "file::open()"), depending on
// which of the two rewrites the collection record enables.
// . domain-like: every non-synonym, non-ignored term becomes required and is
//   detached from its left/right phrase terms; the single-word term of a
//   trailing TLD is made non-required again; the HASHGROUP_INURL weight is
//   multiplied by 10; *doSiteClustering is cleared if the collection asks so
// . API-like: every non-synonym, non-ignored term becomes required and is
//   detached from its left/right phrase terms
// The domain check runs first and at most one rewrite is applied.
void Query::modifyQuery(DerivedScoringWeights *scoringWeights, const CollectionRec& cr, bool *doSiteClustering) {
	logTrace(g_conf.m_logTraceQuery, "Query::modifyQuery: q='%s', modifyDomainLikeSearches=%s, modifyAPILikeSearches=%s", originalQuery(),cr.m_modifyDomainLikeSearches?"true":"false", cr.m_modifyAPILikeSearches?"true":"false");
	logTrace(g_conf.m_logTraceQuery, " m_numWords = %d", m_numWords);
	logTrace(g_conf.m_logTraceQuery, " m_numTerms = %d", m_numTerms);

	// predicates on the n'th query word
	const auto isAlnumWord = [this](int32_t n) -> bool {
		const QueryWord &w = m_qwords[n];
		return is_alnum_utf8_string(w.m_word, w.m_word + w.m_wordLen);
	};
	const auto isApiWord = [this](int32_t n) -> bool {
		const QueryWord &w = m_qwords[n];
		return is_alnum_api_utf8_string(w.m_word, w.m_word + w.m_wordLen);
	};
	const auto isSingleChar = [this](int32_t n, char c) -> bool {
		const QueryWord &w = m_qwords[n];
		return w.m_wordLen==1 && w.m_word[0]==c;
	};
	const auto isCharPair = [this](int32_t n, char c0, char c1) -> bool {
		const QueryWord &w = m_qwords[n];
		return w.m_wordLen==2 && w.m_word[0]==c0 && w.m_word[1]==c1;
	};

	// marks all non-synonym, non-ignored terms as required and cuts their
	// links to the neighbouring phrase terms
	const auto requirePrimaryTerms = [this]() {
		for(int t=0; t<m_numTerms; t++) {
			QueryTerm &qt = m_qterms[t];
			if(qt.m_synonymOf || qt.m_ignored)
				continue;
			qt.m_isRequired = true;
			qt.m_rightPhraseTermNum = -1;
			qt.m_leftPhraseTermNum = -1;
			qt.m_rightPhraseTerm = NULL;
			qt.m_leftPhraseTerm = NULL;
		}
	};

	if(cr.m_modifyDomainLikeSearches) {
		// domain.tld, host.domain.tld or host.domain.tld1.tld2 (eg www.example.co.uk):
		// alnum words at the even positions, a single '.' at every odd position
		bool domainLike = (m_numWords==3 || m_numWords==5 || m_numWords==7);
		for(int32_t n=0; domainLike && n<m_numWords; n++) {
			if(n%2==0)
				domainLike = isAlnumWord(n);
			else
				domainLike = isSingleChar(n,'.');
		}

		if(domainLike) {
			// the last word, or the last three words taken as one string
			// (eg "co.uk"), must be a known tld
			const QueryWord &w1 = m_qwords[m_numWords-1];
			const QueryWord &w2 = m_qwords[m_numWords-2];
			const QueryWord &w3 = m_qwords[m_numWords-3];
			if(!isTLD(w1.m_word,w1.m_wordLen) &&
			   !isTLD(w3.m_word,w3.m_wordLen+w2.m_wordLen+w1.m_wordLen))
				domainLike = false; //nope - last component(s) isn't a known tld
		}

		if(domainLike) {
			log(LOG_DEBUG, "query:Query '%s' looks like a domain", originalQuery());
			//set all non-synonym terms as required and boost inUrl weight.
			requirePrimaryTerms();

			if(isTLD(m_qwords[m_numWords-1].m_word,m_qwords[m_numWords-1].m_wordLen)) {
				//The last term is marked non-required because the tld terms are normally not indexed (see XmlDoc::hashUrl() -> hashString() -> hashString3())
				//high-freq-terms and stopwords means that the term may not have been generated, so look for it
				const QueryWord *tldWord = &(m_qwords[m_numWords-1]);
				for(int t=0; t<m_numTerms; t++) {
					QueryTerm &qt = m_qterms[t];
					if(qt.m_qword == tldWord && !qt.m_isPhrase)
						qt.m_isRequired = false;
				}
			}

			scoringWeights->m_hashGroupWeights[HASHGROUP_INURL] *= 10; //factor 10 seems to work fine
			if(cr.m_domainLikeSearchDisablesSiteCluster)
				*doSiteClustering = false;
			log(LOG_DEBUG, "query:Query modified");
			traceTermsToLog("domain-like search terms");
			return;
		}
	}

	if(cr.m_modifyAPILikeSearches) {
		//is it something like "file.open" or "file.open()" ? or "file::open" / "file::open()" ?
		//todo: detect java packages like java.util.HashSet (but most java programmers probably has built-in help in their IDE so they would rarely use this)
		bool apiLike = false;
		if(m_numWords==3 || m_numWords==4) {
			apiLike = isApiWord(0) &&
			          (isSingleChar(1,'.') || isCharPair(1,':',':')) &&
			          isApiWord(2) &&
			          (m_numWords==3 || isCharPair(3,'(',')'));
		}

		if(apiLike) {
			log(LOG_DEBUG, "query:Query '%s' looks like an API or function call", originalQuery());
			//set all non-synonym terms as required
			requirePrimaryTerms();
			log(LOG_DEBUG, "query:Query modified");
			traceTermsToLog("api-like search terms");
			return;
		}
	}

	log(LOG_DEBUG, "query: Query not modified");
}
|
|
|
|
|
|
|
|
// return -1 if does not exist in query, otherwise return the query word num
|
|
int32_t Query::getWordNum(int64_t wordId) const {
|
|
// skip if punct or whatever
|
|
if ( wordId == 0LL || wordId == -1LL ) return -1;
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
|
const QueryWord *qw = &m_qwords[i];
|
|
// the non-raw word id includes a hash with "0", which
|
|
// signifies an empty field term
|
|
if ( qw->m_rawWordId == wordId ) return i;
|
|
}
|
|
// otherwise, not found
|
|
return -1;
|
|
}
|
|
|
|
static HashTableX s_table;
|
|
static bool s_isInitialized = false;
|
|
static GbMutex s_tableMutex;
|
|
|
|
// 3rd field = m_hasColon
|
|
const struct QueryField g_fields[] = {
|
|
{"url",
|
|
FIELD_URL,
|
|
true,
|
|
"url:www.example.com/page.html",
|
|
"Matches the page with that exact url. Uses the first url, not "
|
|
"the url it redirects to, if any." ,
|
|
NULL,
|
|
0 },
|
|
|
|
{"ext",
|
|
FIELD_EXT,
|
|
true,
|
|
"ext:doc",
|
|
"Match documents whose url ends in the <i>.doc</i> file extension.",
|
|
NULL,
|
|
0 },
|
|
|
|
|
|
{"link",
|
|
FIELD_LINK,
|
|
true,
|
|
"link:www.example.com/foo.html",
|
|
"Matches all the documents that have a link to "
|
|
"http://www.example.com/foobar.html",
|
|
NULL,
|
|
0 },
|
|
|
|
{"sitelink",
|
|
FIELD_SITELINK,
|
|
true,
|
|
"sitelink:abc.foobar.com",
|
|
"Matches all documents that link to any page on the "
|
|
"<i>abc.foobar.com</i> site.",
|
|
NULL,
|
|
0 },
|
|
|
|
{"site",
|
|
FIELD_SITE,
|
|
true,
|
|
"site:example.com",
|
|
"Matches all documents on the example.com domain.",
|
|
NULL,
|
|
0 },
|
|
|
|
{"site",
|
|
FIELD_SITE,
|
|
true,
|
|
"site:www.example.com/dir1/dir2/",
|
|
"Matches all documents whose url starts with "
|
|
"www.example.com/dir1/dir2/",
|
|
NULL,
|
|
QTF_DUP },
|
|
|
|
{"sitenoindex",
|
|
FIELD_SITE,
|
|
true,
|
|
"sitenoindex:example.com",
|
|
"Matches all documents on the example.com domain that in not indexed.",
|
|
NULL,
|
|
0 },
|
|
|
|
{"ip",
|
|
FIELD_IP,
|
|
true,
|
|
"ip:192.0.2.1",
|
|
"Matches all documents whose IP is 192.0.2.1.",
|
|
NULL,
|
|
0 },
|
|
|
|
|
|
{"ip",
|
|
FIELD_IP,
|
|
true,
|
|
"ip:192.0.2",
|
|
"Matches all documents whose IP STARTS with 192.0.2.",
|
|
NULL,
|
|
QTF_DUP },
|
|
|
|
|
|
{"inurl",
|
|
FIELD_SUBURL,
|
|
true,
|
|
"inurl:dog",
|
|
"Matches all documents that have the word dog in their url, like "
|
|
"http://www.example.com/dog/food.html. However will not match "
|
|
"http://www.example.com/dogfood.html because it is not an "
|
|
"individual word. It must be delineated by punctuation.",
|
|
NULL,
|
|
0 },
|
|
|
|
|
|
{"suburl",
|
|
FIELD_SUBURL,
|
|
true,
|
|
"suburl:dog",
|
|
"Same as inurl.",
|
|
NULL,
|
|
0},
|
|
|
|
{"intitle",
|
|
FIELD_TITLE,
|
|
false,
|
|
"title:cat",
|
|
"Matches all the documents that have the word cat in their "
|
|
"title.",
|
|
NULL,
|
|
0 },
|
|
|
|
|
|
{"intitle",
|
|
FIELD_TITLE,
|
|
false,
|
|
"title:\"cat food\"",
|
|
"Matches all the documents that have the phrase \"cat food\" "
|
|
"in their title.",
|
|
NULL,
|
|
QTF_DUP },
|
|
|
|
|
|
{"title",
|
|
FIELD_TITLE,
|
|
false,
|
|
"title:cat",
|
|
"Same as intitle:",
|
|
NULL,
|
|
0},
|
|
|
|
{"type",
|
|
FIELD_TYPE,
|
|
false,
|
|
"type:json",
|
|
"Matches all documents that are in JSON format. "
|
|
"Other possible types include "
|
|
"<i>html, text, xml, pdf, doc, xls, ppt, ps, css, json, status.</i> "
|
|
"<i>status</i> matches special documents that are stored every time "
|
|
"a url is spidered so you can see all the spider attempts and when "
|
|
"they occurred as well as the outcome.",
|
|
NULL,
|
|
0},
|
|
|
|
{"filetype",
|
|
FIELD_TYPE,
|
|
false,
|
|
"filetype:json",
|
|
"Same as type: above.",
|
|
NULL,
|
|
0},
|
|
|
|
{"gblang",
|
|
FIELD_GBLANG,
|
|
false,
|
|
"gblang:de",
|
|
"Matches all documents in german. "
|
|
"The supported language abbreviations "
|
|
"are at the bottom of the <a href=\"/admin/filters\">url filters</a> "
|
|
"page. Some more "
|
|
"common ones are <i>gblang:en, gblang:es, gblang:fr, "
|
|
// need quotes for this one!!
|
|
"gblang:\"zh_cn\"</i> (note the quotes for zh_cn!).",
|
|
NULL,
|
|
0},
|
|
|
|
{"gbcountry",
|
|
FIELD_GBCOUNTRY,
|
|
false,
|
|
"gbcountry:us",
|
|
"Matches documents determined by Gigablast to be from the United "
|
|
"States. See the country abbreviations in the CountryCode.cpp "
|
|
"open source distribution. Some more popular examples include: "
|
|
"de, fr, uk, ca, cn.",
|
|
NULL,
|
|
0} ,
|
|
|
|
// mdw
|
|
|
|
{"gbdocid",
|
|
FIELD_GBDOCID,
|
|
false,
|
|
"gbdocid:123456",
|
|
"Matches the document with the docid 123456",
|
|
NULL,
|
|
0},
|
|
|
|
{"gbtermid",
|
|
FIELD_GBTERMID,
|
|
false,
|
|
"gbtermid:123456",
|
|
"Matches the documents for the term with termid 123456",
|
|
NULL,
|
|
0},
|
|
|
|
//
|
|
// for content type CT_STATUS documents (Spider status docs)
|
|
//
|
|
|
|
{"gbdocspiderdate",
|
|
FIELD_GENERIC,
|
|
false,
|
|
"gbdocspiderdate:1400081479",
|
|
"Matches documents that have "
|
|
"that spider date timestamp (UTC). "
|
|
//"Does not include the "
|
|
//"special spider status documents. "
|
|
"This is the time the document "
|
|
"completed downloading.",
|
|
"Date Related Query Operators",
|
|
QTF_BEGINNEWTABLE},
|
|
|
|
|
|
{"gbspiderdate",
|
|
FIELD_GENERIC,
|
|
false,
|
|
"gbspiderdate:1400081479",
|
|
"Like above.",
|
|
//, but DOES include the special spider status documents.",
|
|
NULL,
|
|
0},
|
|
|
|
{"gbdocindexdate",
|
|
FIELD_GENERIC,
|
|
false,
|
|
"gbdocindexdate:1400081479",
|
|
"Like above, but is the time the document was last indexed. "
|
|
"This time is "
|
|
"slightly greater than or equal to the spider date.",//Does not "
|
|
//"include the special spider status documents.",
|
|
NULL,
|
|
0},
|
|
|
|
|
|
{"gbindexdate",
|
|
FIELD_GENERIC,
|
|
false,
|
|
"gbindexdate:1400081479",
|
|
"Like above.",//, but it does include the special spider status "
|
|
//"documents.",
|
|
NULL,
|
|
0},
|
|
|
|
// they don't need to know about this
|
|
{"gbcontenthash", FIELD_GBCONTENTHASH, false,"","",NULL,QTF_HIDE}
|
|
};
|
|
|
|
void resetQuery ( ) {
|
|
s_table.reset();
|
|
}
|
|
|
|
|
|
|
|
// Number of entries in g_fields (duplicate-name and hidden rows included).
int32_t getNumFieldCodes ( ) {
	return static_cast<int32_t>( sizeof(g_fields) / sizeof(g_fields[0]) );
}
|
|
|
|
static bool initFieldTable(){
|
|
|
|
ScopedLock sl(s_tableMutex);
|
|
if ( ! s_isInitialized ) {
|
|
// set up the hash table
|
|
if ( ! s_table.set ( 8 , 4 , 255,NULL,0,false,"qryfldtbl" ) ) {
|
|
log(LOG_WARN, "build: Could not init table of query fields.");
|
|
return false;
|
|
}
|
|
// now add in all the stop words
|
|
int32_t n = getNumFieldCodes();
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// skip if dup
|
|
int64_t h = hash64b ( g_fields[i].text );
|
|
|
|
// if already in there it is a dup
|
|
if ( s_table.isInTable ( &h ) ) continue;
|
|
|
|
// store the entity index in the hash table as score
|
|
if ( ! s_table.addTerm(h, i+1) ) return false;
|
|
}
|
|
s_isInitialized = true;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
|
|
// Maps a field name (s, len) to its field code via the lazily built lookup
// table. Returns FIELD_UNSET if the table cannot be initialized or the name
// is not a known field.
field_code_t getFieldCode(const char *s, int32_t len) {
	if ( !initFieldTable() )
		return FIELD_UNSET;

	// the table stores index+1, so a score of 0 means the name is unknown
	const int64_t nameHash = hash64Lower_a( s, len );
	const int32_t slot = (int32_t) s_table.getScore(nameHash);
	if ( slot == 0 )
		return FIELD_UNSET;

	return g_fields[slot-1].field;
}
|
|
|
|
// Returns a fixed, human-readable name for a field code, or NULL for a code
// that has no name here.
const char *getFieldCodeName(field_code_t fc) {
	const char *name = NULL;
	switch(fc) {
		case FIELD_UNSET:         name = "unset";         break;
		case FIELD_URL:           name = "url";           break;
		case FIELD_LINK:          name = "link";          break;
		case FIELD_SITE:          name = "site";          break;
		case FIELD_IP:            name = "ip";            break;
		case FIELD_SUBURL:        name = "suburl";        break;
		case FIELD_TITLE:         name = "title";         break;
		case FIELD_TYPE:          name = "type";          break;
		case FIELD_EXT:           name = "ext";           break;
		case FIELD_LINKS:         name = "links";         break;
		case FIELD_SITELINK:      name = "sitelink";      break;
		case FIELD_GENERIC:       name = "generic";       break;
		case FIELD_GBLANG:        name = "gblang";        break;
		case FIELD_GBCOUNTRY:     name = "gbcountry";     break;
		case FIELD_GBTERMID:      name = "gbtermid";      break;
		case FIELD_GBDOCID:       name = "gbdocid";       break;
		case FIELD_GBCONTENTHASH: name = "gbcontenthash"; break;
		case FIELD_GBFIELDMATCH:  name = "gbfieldmatch";  break;
		default:                                          break;
	}
	return name;
}
|
|
|
|
|
|
// guaranteed to be punctuation
|
|
bool Query::isConnection(unsigned i) const {
|
|
auto const &token = m_tr[i];
|
|
if(token.token_len==1) {
|
|
switch(*token.token_start) {
|
|
// . only allow apostrophe if it's NOT a 's
|
|
// . so contractions are ok, and names too
|
|
case '\'':
|
|
// no, i think we should require it. google seems to,
|
|
// and msn and yahoo do. 'john's room -"john's" gives
|
|
// no result son yahoo and msn.
|
|
return true;
|
|
case ':': return true;
|
|
case '-': return true;
|
|
case '.': return true;
|
|
case '@': return true;
|
|
case '#': return true;
|
|
case '/': return true;
|
|
case '_': return true;
|
|
case '&': return true;
|
|
case '=': return true;
|
|
case '\\': return true;
|
|
default: return false;
|
|
}
|
|
}
|
|
//if ( len == 3 && s[0]==' ' && s[1]=='&' && s[2]==' ' ) return true;
|
|
if(token.token_len==3 &&
|
|
token.token_start[0]==':' && token.token_start[1]=='/' && token.token_start[2]=='/' )
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
|
|
void Query::dumpToLog() const
|
|
{
|
|
log(LOG_DEBUG, "Query:setQTerms: dumping %d query-words:", m_numWords);
|
|
for(int i=0; i<m_numWords; i++) {
|
|
const QueryWord &qw = m_qwords[i];
|
|
log(" qword #%d:",i);
|
|
log(" word='%*.*s'", (int)qw.m_wordLen, (int)qw.m_wordLen, qw.m_word);
|
|
log(" phrase='%*.*s'", (int)qw.m_bigramLen, (int)qw.m_bigramLen, qw.m_word);
|
|
log(" m_wordId=%" PRId64, qw.m_wordId);
|
|
log(" m_bigramId=%" PRId64, qw.m_bigramId);
|
|
if(qw.m_queryWordTerm)
|
|
log(" m_queryWordTerm= #%d", (int)(qw.m_queryWordTerm-m_qterms));
|
|
}
|
|
log("Query:setQTerms: dumping %d query-terms:", m_numTerms);
|
|
for(int i=0; i<m_numTerms; i++) {
|
|
const QueryTerm &qt = m_qterms[i];
|
|
log(" term #%d:",i);
|
|
log(" m_term='%*.*s'", (int)qt.m_termLen, (int)qt.m_termLen, qt.m_term);
|
|
log(" m_isPhrase=%s synonym=%s", qt.m_isPhrase?"true":"false", qt.m_synonymOf?"true":"false");
|
|
log(" m_termId=%" PRId64, qt.m_termId);
|
|
log(" m_rawTermId=%" PRId64, qt.m_rawTermId);
|
|
log(" m_isWikiHalfStopBigram=%s", qt.m_isWikiHalfStopBigram?"true":"false");
|
|
log(" m_leftPhraseTermNum=%d, m_leftPhraseTerm=%p", qt.m_leftPhraseTermNum, (void*)qt.m_leftPhraseTerm);
|
|
log(" m_rightPhraseTermNum=%d, m_rightPhraseTerm=%p", qt.m_rightPhraseTermNum, (void*)qt.m_rightPhraseTerm);
|
|
log(" m_rightPhraseTermNum=%d, m_rightPhraseTerm=%p", qt.m_rightPhraseTermNum, (void*)qt.m_rightPhraseTerm);
|
|
log(" m_rightPhraseTermNum=%d, m_rightPhraseTerm=%p", qt.m_rightPhraseTermNum, (void*)qt.m_rightPhraseTerm);
|
|
log(" m_termFreqWeight=%f m_termWeight=%f m_userWeight=%f", qt.m_termFreqWeight, qt.m_termWeight, qt.m_userWeight);
|
|
if(qt.m_synonymOf)
|
|
log(" m_synonymOf=#%d '%.*s'", (int)(qt.m_synonymOf-m_qterms), qt.m_synonymOf->m_termLen, qt.m_synonymOf->m_term);
|
|
}
|
|
}
|
|
|
|
void Query::traceTermsToLog(const char *header) {
|
|
logTrace(g_conf.m_logTraceQuery, "%s: %d queryterms:", header, m_numTerms);
|
|
for(int i=0; i<m_numTerms; i++) {
|
|
logTrace(g_conf.m_logTraceQuery, " query-term #%d: termid=%15" PRId64" '%*.*s', t-weight=%f u-weight=%f %s", i, m_qterms[i].m_termId, m_qterms[i].m_termLen,m_qterms[i].m_termLen,m_qterms[i].m_term, m_qterms[i].m_termWeight,m_qterms[i].m_userWeight, m_qterms[i].m_ignored?"ignored":"");
|
|
logTrace(g_conf.m_logTraceQuery, " qstopw=%s req=%s", m_qterms[i].m_isQueryStopWord?"true":"false", m_qterms[i].m_isRequired?"yes":"no");
|
|
}
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////
|
|
////////// ONLY BOOLEAN STUFF BELOW HERE /////////////
|
|
////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////
|
|
|
|
// return false and set g_errno on error
|
|
// returns how many words expression was
|
|
bool Expression::addExpression (int32_t start,
|
|
int32_t end,
|
|
Query *q,
|
|
int32_t level
|
|
) {
|
|
|
|
if ( level >= MAX_EXPRESSIONS ) {
|
|
g_errno = ETOOMANYPARENS;
|
|
return false;
|
|
}
|
|
|
|
// the # of the first alnumpunct word in the expression
|
|
m_expressionStartWord = start;
|
|
m_q = q;
|
|
|
|
int32_t i = m_expressionStartWord;
|
|
|
|
// "start" is the current alnumpunct word we are parsing out
|
|
for ( ; i<end ; i++ ) {
|
|
|
|
QueryWord *qwords = q->m_qwords;
|
|
|
|
QueryWord * qw = &qwords[i];
|
|
|
|
// set leaf node if not an opcode like "AND" and not punct.
|
|
if ( qw->m_opcode==opcode_t::OP_NONE && qw->isAlphaWord()){
|
|
continue;
|
|
}
|
|
if (qw->m_opcode == opcode_t::OP_NOT) {
|
|
continue;
|
|
}
|
|
else if (qw->m_opcode == opcode_t::OP_LEFTPAREN ) {
|
|
// this is expression
|
|
// . it should advance "i" to end of expression
|
|
// point to next...
|
|
q->m_numExpressions++;
|
|
// make a new one:
|
|
Expression *e=&q->m_expressions[q->m_numExpressions-1];
|
|
// now set it
|
|
if ( ! e->addExpression ( i+1, // skip over (
|
|
end ,
|
|
q ,
|
|
level + 1) )
|
|
return false;
|
|
// skip over it. pt to ')'
|
|
i += e->m_numWordsInExpression;
|
|
qw->m_expressionPtr = e;
|
|
}
|
|
else if (qw->m_opcode == opcode_t::OP_RIGHTPAREN ) {
|
|
// return size i guess, include )
|
|
m_numWordsInExpression = i - m_expressionStartWord+1;
|
|
return true;
|
|
}
|
|
else if (qw->m_opcode!=opcode_t::OP_NONE) {
|
|
continue;
|
|
}
|
|
// white space?
|
|
}
|
|
|
|
m_numWordsInExpression = i - m_expressionStartWord;
|
|
|
|
return true;
|
|
}
|
|
|
|
// each bit is 1-1 with the explicit terms in the boolean query
|
|
bool Query::matchesBoolQuery(const unsigned char *bitVec, int32_t vecSize) const {
|
|
return m_expressions[0].isTruth ( bitVec , vecSize );
|
|
}
|
|
|
|
|
|
// Tests bit #opBitNum of the vecSize-byte vector bitVec (bit 0 is the least
// significant bit of byte 0). Aborts the process through
// g_process.shutdownAbort() if the bit's byte lies at or beyond vecSize.
static bool isBitNumSet(int32_t opBitNum, const unsigned char *bitVec, int32_t vecSize) {
	const int32_t byteIdx = opBitNum / 8;
	const int32_t bitMask = 1<<(opBitNum % 8);
	if ( byteIdx >= vecSize ) { g_process.shutdownAbort(true); }
	return ( bitVec[byteIdx] & bitMask ) != 0;
}
|
|
|
|
// . "bits" are 1-1 with the query words in Query::m_qwords[] array
|
|
// including ignored words and spaces i guess since Expression::add()
|
|
// seems to do that.
|
|
bool Expression::isTruth(const unsigned char *bitVec, int32_t vecSize) const {
|
|
|
|
//
|
|
// operand1 operand2 operator1 operand3 operator2 ....
|
|
//
|
|
|
|
// result: -1 means unknown at this point
|
|
int32_t result = -1;
|
|
|
|
opcode_t prevOpCode = opcode_t::OP_NONE;
|
|
int32_t prevResult ;
|
|
// result of current operand
|
|
int32_t opResult = -1;
|
|
|
|
int32_t i = m_expressionStartWord;
|
|
int32_t iend = i + m_numWordsInExpression;
|
|
|
|
bool hasNot = false;
|
|
|
|
for ( ; i < iend ; i++ ) {
|
|
|
|
const QueryWord *qw = &m_q->m_qwords[i];
|
|
|
|
// ignore parentheses, aren't real opcodes.
|
|
// we just want OP_AND/OP_OR/OP_NOT
|
|
opcode_t opcode = qw->m_opcode;
|
|
if ( opcode != opcode_t::OP_AND &&
|
|
opcode != opcode_t::OP_OR &&
|
|
opcode != opcode_t::OP_NOT )
|
|
opcode = opcode_t::OP_NONE;
|
|
|
|
if ( opcode == opcode_t::OP_NOT ) {
|
|
hasNot = true;
|
|
continue;
|
|
}
|
|
|
|
|
|
// so operands are expressions as well
|
|
const Expression *e = (const Expression *)qw->m_expressionPtr;
|
|
if ( e ) {
|
|
// save prev one. -1 means no prev.
|
|
prevResult = opResult;
|
|
// set new onw
|
|
opResult = e->isTruth ( bitVec , vecSize );
|
|
// skip over that expression. point to ')'
|
|
i += e->m_numWordsInExpression;
|
|
// flip?
|
|
if ( hasNot ) {
|
|
if ( opResult == 1 ) opResult = 0;
|
|
else opResult = 1;
|
|
hasNot = false;
|
|
}
|
|
}
|
|
|
|
if ( opcode!=opcode_t::OP_NONE && ! e ) {
|
|
prevOpCode = opcode;//m_opSlots[i];
|
|
continue;
|
|
}
|
|
|
|
// simple operand
|
|
if ( opcode==opcode_t::OP_NONE && ! e ) {
|
|
// for regular word operands
|
|
// ignore it like a space?
|
|
if ( qw->m_ignoreWord ) continue;
|
|
// ignore gbsortby:offerprice in bool queries
|
|
// at least for evaluating them
|
|
if ( qw->m_ignoreWordInBoolQuery ) continue;
|
|
// save old one
|
|
prevResult = opResult;
|
|
// convert word to term #
|
|
const QueryTerm *qt = qw->m_queryWordTerm;
|
|
// fix title:"notre dame" AND NOT irish
|
|
if ( ! qt ) qt = qw->m_queryPhraseTerm;
|
|
if ( ! qt ) continue;
|
|
// phrase terms are not required and therefore
|
|
// do not have a v alid qt->m_bitNum set, so dont core
|
|
if ( ! qt->m_isRequired ) continue;
|
|
// . m_bitNum is set in Posdb.cpp when it sets its
|
|
// QueryTermInfo array
|
|
// . it is basically the query term #
|
|
// . see iff that bit is set in this docid's vec
|
|
opResult = isBitNumSet ( qt->m_bitNum,bitVec,vecSize );
|
|
// flip?
|
|
if ( hasNot ) {
|
|
if ( opResult == 1 ) opResult = 0;
|
|
else opResult = 1;
|
|
hasNot = false;
|
|
}
|
|
}
|
|
|
|
// need two to tango. i.e. (true OR false)
|
|
if ( prevResult == -1 ) continue;
|
|
|
|
// if this is not the first time... we got two
|
|
if ( prevOpCode == opcode_t::OP_AND ) {
|
|
// if first operation we encount is A AND B then
|
|
// default result to on. only allow an AND operation
|
|
// to turn if off.
|
|
if ( result == -1 ) result = 1;
|
|
if ( ! prevResult ) result = 0;
|
|
if ( ! opResult ) result = 0;
|
|
}
|
|
else if ( prevOpCode == opcode_t::OP_OR ) {
|
|
// if first operation we encount is A OR B then
|
|
// default result to off
|
|
if ( result == -1 ) result = 0;
|
|
if ( prevResult ) result = 1;
|
|
if ( opResult ) result = 1;
|
|
}
|
|
}
|
|
|
|
// if we never set result, then it was probably a single
|
|
// argument expression like something in double parens like
|
|
// ((site:xyz.com OR site:abc.com)). so set it to value of
|
|
// first operand, opResult.
|
|
if ( prevOpCode == opcode_t::OP_NONE && result == -1 ) result = opResult;
|
|
|
|
if ( result == -1 ) return true;
|
|
if ( result == 0 ) return false;
|
|
return true;
|
|
}
|
|
|
|
// if any one query term is split, msg3a has to split the query
|
|
bool Query::isSplit() const {
|
|
for(int32_t i = 0; i < m_numTerms; i++)
|
|
if(m_qterms[i].isSplit()) return true;
|
|
return false;
|
|
}
|
|
|
|
void QueryTerm::constructor ( ) {
|
|
m_qword = NULL;
|
|
m_isPhrase = false;
|
|
m_termId = 0;
|
|
m_rawTermId = 0;
|
|
m_termSign = 0;
|
|
m_bitNum = 0;
|
|
m_term = NULL;
|
|
m_termLen = 0;
|
|
m_posdbListPtr = NULL;
|
|
m_langIdBits = 0;
|
|
m_langIdBitsValid = false;
|
|
m_termFreq = 0;
|
|
m_termFreqWeight = 0.0;
|
|
m_isQueryStopWord = false;
|
|
m_inQuotes = false;
|
|
m_termWeight = 0;
|
|
m_userWeight = 0;
|
|
m_userNotRequired = false;
|
|
m_piped = false;
|
|
m_ignored = false;
|
|
m_synonymOf = NULL;
|
|
m_synWids0 = 0;
|
|
m_synWids1 = 0;
|
|
m_numAlnumWordsInSynonym = 1;
|
|
m_fieldCode = FIELD_UNSET;
|
|
m_isRequired = false;
|
|
m_isWikiHalfStopBigram = false;
|
|
m_leftPhraseTermNum = 0;
|
|
m_rightPhraseTermNum = 0;
|
|
m_leftPhraseTerm = NULL;
|
|
m_rightPhraseTerm = NULL;
|
|
memset(m_startKey,0,sizeof(m_startKey));
|
|
memset(m_endKey,0,sizeof(m_endKey));
|
|
}
|
|
|
|
bool QueryTerm::isSplit() const {
|
|
if(!m_fieldCode) return true;
|
|
if(m_fieldCode == FIELD_GBCONTENTHASH) return false;
|
|
return true;
|
|
}
|
|
|
|
// hash of all the query terms
|
|
int64_t Query::getQueryHash() const {
|
|
int64_t qh = 0LL;
|
|
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
|
|
const QueryTerm *qt = &m_qterms[i];
|
|
qh = hash64 ( qt->m_termId , qh );
|
|
}
|
|
return qh;
|
|
}
|
|
|
|
// Explicit, constructor-like initializer: sets up the word's synonym buffer
// by calling its constructor() member.
void QueryWord::constructor () {
	m_synWordBuf.constructor();
}
|
|
|
|
// Explicit, destructor-like cleanup: purges the word's synonym buffer.
// Query::reset() calls this on every query word.
void QueryWord::destructor () {
	m_synWordBuf.purge();
}
|
|
|
|
|
|
// Counts the double-quote characters among the first len bytes of s.
static int count_quotes(const char *s, size_t len) {
	int numQuotes = 0;
	for ( size_t pos = 0 ; pos < len ; ++pos ) {
		if ( s[pos] == '\"' )
			++numQuotes;
	}
	return numQuotes;
}
|