3554 lines
118 KiB
C++
3554 lines
118 KiB
C++
#include "Query.h"
|
|
#include "Bits.h"
|
|
#include "Phrases.h"
|
|
#include "Url.h"
|
|
#include "Domains.h"
|
|
#include "Clusterdb.h" // g_clusterdb.getNumGlobalRecs()
|
|
#include "StopWords.h" // isQueryStopWord()
|
|
#include "Sections.h"
|
|
#include "Speller.h"
|
|
#include "Mem.h"
|
|
#include "Msg3a.h"
|
|
#include "HashTableX.h"
|
|
#include "Synonyms.h"
|
|
#include "HighFrequencyTermShortcuts.h"
|
|
#include "Wiki.h"
|
|
#include "ScoringWeights.h"
|
|
#include "RdbList.h"
|
|
#include "Process.h"
|
|
#include "Conf.h"
|
|
#include "termid_mask.h"
|
|
#include "Collectiondb.h"
|
|
#include "GbUtil.h"
|
|
#include <set>
|
|
#include "Lemma.h"
|
|
#include "Errno.h"
|
|
|
|
|
|
#include "GbMutex.h"
|
|
#include "ScopedLock.h"
|
|
|
|
static int count_quotes(const char *s, size_t len);
|
|
|
|
|
|
// Constructor: initializes label'ed buffers via the init-list, zeroes all
// scalar members (kept explicit for Coverity), then calls reset() to put the
// object into the canonical empty state.
Query::Query()
  : m_queryWordBuf("Query4"),
    m_tr(),
    m_filteredQuery("qrystk"),
    m_originalQuery("oqbuf"),
    m_bigramWeight(1.0),
    m_synonymWeight(1.0),
    m_word_variations_config()
{
	m_qwords = NULL;
	m_numWords = 0;
	// BUGFIX: this line used to be a duplicated "m_qwords = NULL;".
	// The intent (mirroring reset()) was clearly to null the term array
	// pointer. Behavior is unchanged because reset() below also nulls
	// m_qterms, but the typo is corrected.
	m_qterms = NULL;
	m_numTerms = 0;

	// Coverity
	m_langId = langUnknown;
	m_useQueryStopWords = false;
	m_allowHighFreqTermCache = false;
	m_numTermsUntruncated = 0;
	m_isBoolean = false;
	m_maxQueryTerms = 0;

	memset(m_expressions, 0, sizeof(m_expressions));

	reset ( );
}
|
|
|
|
// Destructor: delegates to reset(), which runs the per-word destructors and
// purges all owned buffers, so no member is leaked.
Query::~Query ( ) {
	reset ( );
}
|
|
|
|
// Returns the object to the empty state: destructs per-word state, purges all
// owned buffers, and nulls the raw pointers that aliased into those buffers.
// Safe to call repeatedly (constructor and destructor both call it).
void Query::reset ( ) {

	// if Query::constructor() was called explicitly then we have to
	// call destructors explicitly as well...
	// NOTE: despite the old comment mentioning QueryTerm::reset(), this
	// loop actually runs QueryWord::destructor() on each query WORD.
	for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
		QueryWord *qw = &m_qwords[i];
		qw->destructor();
	}

	// release the term buffer, then detach the pointer that aliased it
	m_queryTermBuf.purge();
	m_qterms = NULL;
	m_tr.clear();

	m_filteredQuery.purge();
	m_originalQuery.purge();
	m_docIdRestriction = 0LL;
	m_numWords = 0;
	m_numTerms = 0;

	// release the word buffer, then detach the pointer that aliased it
	m_queryWordBuf.purge();
	m_qwords = NULL;
	m_numExpressions = 0;
	// the site: and ip: query terms will disable site clustering & caching
	m_hasPositiveSiteField = false;
	m_hasIpField = false;
	m_hasUrlField = false;
	m_hasSubUrlField = false;
	m_truncated = false;
}
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . "query" must be NULL terminated
|
|
// . if boolFlag is 0 we ignore all boolean operators
|
|
// . if boolFlag is 1 we assume query is boolen
|
|
// . if boolFlag is 2 we attempt to detect if query is boolean or not
|
|
// . if "keepAllSingles" is true we do not ignore any single word UNLESS
|
|
// it is a boolean operator (IGNORE_BOOLOP), fieldname (IGNORE_FIELDNAME)
|
|
// a punct word (IGNORE_DEFAULT) or part of one field value (IGNORE_DEFAULT)
|
|
// This is used for term highlighting (Highlight.cpp and Summary.cpp)
|
|
bool Query::set(const char *query,
|
|
// need language for doing synonyms
|
|
lang_t langId,
|
|
float bigramWeight,
|
|
float synonymWeight,
|
|
const WordVariationsConfig *wordVariationsConfig,
|
|
bool useQueryStopWords,
|
|
bool allowHighFreqTermCache,
|
|
int32_t maxQueryTerms)
|
|
{
|
|
static const WordVariationsConfig defaultWordVariationsConfig;
|
|
if(!wordVariationsConfig)
|
|
wordVariationsConfig = &defaultWordVariationsConfig;
|
|
log(LOG_DEBUG,"query: set2(query='%s', langId=%d, wiktionaryWordVariations=%s, languageSpecificWordVariations=%s useQueryStopWords=%s maxQueryTerms=%d)",
|
|
query, (int)langId, wordVariationsConfig->m_wiktionaryWordVariations?"true":"false", wordVariationsConfig->m_languageSpecificWordVariations?"true":"false", useQueryStopWords?"true":"false", maxQueryTerms);
|
|
|
|
reset();
|
|
|
|
m_langId = langId;
|
|
m_useQueryStopWords = useQueryStopWords;
|
|
m_allowHighFreqTermCache = allowHighFreqTermCache;
|
|
|
|
// fix summary rerank and highlighting.
|
|
bool keepAllSingles = true;
|
|
|
|
m_maxQueryTerms = maxQueryTerms;
|
|
|
|
// assume boolean auto-detect.
|
|
char boolFlag = 2;
|
|
|
|
|
|
if ( ! query ) return true;
|
|
|
|
m_bigramWeight = bigramWeight;
|
|
m_synonymWeight = synonymWeight;
|
|
m_word_variations_config = *wordVariationsConfig;
|
|
|
|
int32_t queryLen = strlen(query);
|
|
|
|
// truncate query if too big
|
|
if ( queryLen >= ABS_MAX_QUERY_LEN ) {
|
|
log(LOG_WARN, "query: Query length of %" PRId32" must be less than %" PRId32". Truncating.",
|
|
queryLen,(int32_t)ABS_MAX_QUERY_LEN);
|
|
queryLen = ABS_MAX_QUERY_LEN - 1;
|
|
m_truncated = true;
|
|
}
|
|
// save original query
|
|
if( !m_originalQuery.reserve ( queryLen + 1 ) ) {
|
|
logError("Failed to reserve %" PRId32 " bytes, bailing", queryLen+1);
|
|
return true;
|
|
}
|
|
m_originalQuery.safeMemcpy(query, queryLen);
|
|
m_originalQuery.nullTerm();
|
|
|
|
const char *q = query;
|
|
// see if it should be boolean...
|
|
for ( int32_t i = 0 ; i < queryLen ; i++ ) {
|
|
// but if bool flag is 0 that means it is NOT boolean!
|
|
// it must be one for autodetection. so do not autodetect
|
|
// unless this is 2.
|
|
if ( boolFlag != 2 ) break;
|
|
if ( q[i]=='A' && q[i+1]=='N' && q[i+2]=='D' &&
|
|
(q[i+3]==' ' || q[i+3]=='(') )
|
|
boolFlag = 1;
|
|
if ( q[i]=='O' && q[i+1]=='R' &&
|
|
(q[i+2]==' ' || q[i+2]=='(') )
|
|
boolFlag = 1;
|
|
if ( q[i]=='N' && q[i+1]=='O' && q[i+2]=='T' &&
|
|
(q[i+3]==' ' || q[i+3]=='(') )
|
|
boolFlag = 1;
|
|
}
|
|
|
|
// if we did not set the flag to 1 set it to 0. force to non-bool
|
|
if ( boolFlag == 2 ) boolFlag = 0;
|
|
|
|
// reserve some space, guessing how much we'd need
|
|
int32_t need = queryLen * 2 + 32;
|
|
if ( ! m_filteredQuery.reserve ( need ) )
|
|
return false;
|
|
|
|
bool inQuotesFlag = false;
|
|
// . copy query into m_buf
|
|
// . translate ( and ) to special query operators so Words class
|
|
// can parse them as their own word to make parsing bool queries ez
|
|
// for parsing out the boolean operators in setBitScoresBoolean()
|
|
for ( int32_t i = 0 ; i < queryLen ; i++ ) {
|
|
|
|
// gotta count quotes! we ignore operators in quotes
|
|
// so you can search for diffbotUri:"article|0|123456"
|
|
if ( query[i] == '\"' ) inQuotesFlag = !inQuotesFlag;
|
|
|
|
if ( inQuotesFlag ) {
|
|
//*p = query [i];
|
|
//p++;
|
|
m_filteredQuery.pushChar(query[i]);
|
|
continue;
|
|
}
|
|
|
|
// translate ( and )
|
|
if ( boolFlag == 1 && query[i] == '(' ) {
|
|
m_filteredQuery.safeMemcpy ( " LeFtP " , 7 );
|
|
continue;
|
|
}
|
|
if ( boolFlag == 1 && query[i] == ')' ) {
|
|
m_filteredQuery.safeMemcpy ( " RiGhP " , 7 );
|
|
continue;
|
|
}
|
|
if ( query[i] == '|' ) {
|
|
m_filteredQuery.safeMemcpy ( " PiiPE " , 7 );
|
|
continue;
|
|
}
|
|
|
|
if(query[i] == '[') {
|
|
// translate [#w] [#p] [#s] [w] [p] [s] [nrw] to operators
|
|
char *endptr=NULL;
|
|
double val;
|
|
if(is_digit(query[i+1]))
|
|
val=strtod(query+i+1,&endptr);
|
|
if(endptr!=NULL && endptr!=query+1) {
|
|
size_t j = (size_t)(endptr-query);
|
|
if(query[j]=='w' && query[j+1]==']') {
|
|
m_filteredQuery.safePrintf(" LeFtB %f w RiGhB ", val);
|
|
i = j + 1;
|
|
continue;
|
|
} else if(query[j]=='p' && query[j+1]==']') {
|
|
m_filteredQuery.safePrintf(" LeFtB %f p RiGhB ", val);
|
|
i = j + 1;
|
|
continue;
|
|
} else if(query[j]=='s' && query[j+1]==']') {
|
|
m_filteredQuery.safePrintf(" LeFtB %f s RiGhB ", val);
|
|
i = j + 1;
|
|
continue;
|
|
}
|
|
} else if(query[i+1] == 'w' && query[i+2]==']') {
|
|
m_filteredQuery.safePrintf(" LeFtB w RiGhB ");
|
|
i = i + 2;
|
|
continue;
|
|
} else if(query[i+1] == 'p' && query[i+2]==']') {
|
|
m_filteredQuery.safePrintf(" LeFtB p RiGhB ");
|
|
i = i + 2;
|
|
continue;
|
|
} else if(query[i+1] == 's' && query[i+2]==']') {
|
|
m_filteredQuery.safePrintf(" LeFtB s RiGhB ");
|
|
i = i + 2;
|
|
continue;
|
|
} else if( i+4 < queryLen && query[i+1] == 'n' && query[i+2] == 'r' && query[i+3] == 'w' && query[i+4]==']') {
|
|
// user specified [nrw] before word, meaning treat it as not required
|
|
m_filteredQuery.safePrintf(" LeFtB nrw RiGhB ");
|
|
i = i + 4;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// TODO: copy altavista's operators here? & | !
|
|
// otherwise, just a plain copy
|
|
m_filteredQuery.pushChar ( query[i] );
|
|
}
|
|
// NULL terminate
|
|
m_filteredQuery.nullTerm();
|
|
if(m_filteredQuery.length() != queryLen || memcmp(m_filteredQuery.getBufStart(),query,queryLen)!=0)
|
|
log(LOG_INFO,"query: m_filteredQuery=%*.*s", m_filteredQuery.length(),m_filteredQuery.length(),m_filteredQuery.getBufStart());
|
|
|
|
Phrases phrases;
|
|
|
|
// set m_qwords[] array from m_buf
|
|
if ( ! setQWords(boolFlag, keepAllSingles, phrases) )
|
|
return false;
|
|
|
|
// set m_qterms from m_qwords, always succeeds
|
|
setQTerms();
|
|
|
|
// disable stuff for site:, ip: and url: queries
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
|
const QueryWord *qw = &m_qwords[i];
|
|
if ( qw->m_ignoreWord ) continue;
|
|
if ( qw->m_fieldCode == FIELD_SITE &&
|
|
qw->m_wordSign != '-' )
|
|
m_hasPositiveSiteField = true;
|
|
else if ( qw->m_fieldCode == FIELD_IP )
|
|
m_hasIpField = true;
|
|
else if ( qw->m_fieldCode == FIELD_URL )
|
|
m_hasUrlField = true;
|
|
else if ( qw->m_fieldCode == FIELD_SUBURL )
|
|
m_hasSubUrlField = true;
|
|
}
|
|
|
|
// set m_docIdRestriction if a term is gbdocid:
|
|
for ( int32_t i = 0 ; i < m_numTerms && ! m_isBoolean ; i++ ) {
|
|
// get it
|
|
QueryTerm *qt = &m_qterms[i];
|
|
|
|
if( qt->m_fieldCode == FIELD_GBTERMID ) {
|
|
const char *ds = m_qterms[i].m_term + 9; // strlen("gbtermid:")
|
|
qt->m_termId = atoll(ds);
|
|
}
|
|
|
|
// gbdocid:?
|
|
if ( qt->m_fieldCode != FIELD_GBDOCID ) continue;
|
|
// get docid
|
|
const char *ds = m_qterms[i].m_term + 8;
|
|
m_docIdRestriction = atoll(ds);
|
|
break;
|
|
}
|
|
|
|
// . keep it simple for now
|
|
// . we limit to MAX_EXRESSIONS to like 10 now i guess
|
|
if ( m_isBoolean ) {
|
|
m_numExpressions = 1;
|
|
if ( ! m_expressions[0].addExpression ( 0 ,
|
|
m_numWords ,
|
|
this , // Query
|
|
0 ) ) // level
|
|
// return false with g_errno set on error
|
|
return false;
|
|
}
|
|
|
|
|
|
log(LOG_DEBUG,"query: m_numWords=%d, m_numTerms=%d", m_numWords, m_numTerms);
|
|
|
|
// . if it is not truncated, no need to use hard counts
|
|
// . comment this line and the next one out for testing hard counts
|
|
if ( ! m_truncated ) return true;
|
|
// if they just hit the admin's ceiling, there's nothing we can do
|
|
if ( m_numTerms >= m_maxQueryTerms ) return true;
|
|
// a temp log message
|
|
log(LOG_DEBUG,"query: Encountered %" PRId32" query terms.",m_numTerms);
|
|
|
|
// otherwise, we're below m_maxQueryTerms BUT above MAX_QUERY_TERMS
|
|
// so we can use hard counts to get more power...
|
|
|
|
// . use the hard count for excessive query terms to save explicit bits
|
|
// . just look for operands on the first level that are not OR'ed
|
|
char redo = 0;
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
|
// get the ith word
|
|
QueryWord *qw = &m_qwords[i];
|
|
// stop at first OR on this level
|
|
if ( qw->m_opcode == opcode_t::OP_OR ) break;
|
|
// skip all punct
|
|
if ( qw->m_isPunct ) continue;
|
|
// if we are a boolean query,the next operator can NOT be OP_OR
|
|
// because we can not used terms that are involved in an OR
|
|
// as a hard count term, because they are not required terms
|
|
for ( int32_t j=i+1 ; m_isBoolean && j<m_numWords; j++ ) {
|
|
// stop at previous operator
|
|
opcode_t opcode = m_qwords[j].m_opcode;
|
|
if ( opcode == opcode_t::OP_NONE ) continue;
|
|
if ( opcode != opcode_t::OP_OR ) break;
|
|
// otherwise, the next operator is an OR, so do not
|
|
// use a hard count for this term
|
|
goto stop;
|
|
}
|
|
// mark it so we can reduce our number of explicit bits used
|
|
redo = 1;
|
|
}
|
|
|
|
stop:
|
|
// if nothing changed, return now
|
|
if ( ! redo ) return true;
|
|
|
|
// . set the query terms again if we have a long query
|
|
if ( ! setQTerms() )
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
// returns false and sets g_errno on error
|
|
bool Query::setQTerms() {
|
|
if(g_conf.m_logTraceQuery) {
|
|
logTrace(g_conf.m_logTraceQuery, "Query::setQTerms(words:%zu)", m_tr.size());
|
|
for(unsigned i=0; i<m_tr.size(); i++) {
|
|
logTrace(g_conf.m_logTraceQuery, " word #%u: '%*.*s'", i, (int)m_tr[i].token_len, (int)m_tr[i].token_len, m_tr[i].token_start);
|
|
int64_t phraseTermId = m_qwords[i].m_bigramId&TERMID_MASK;
|
|
int64_t wordTermId = m_qwords[i].m_wordId&TERMID_MASK;
|
|
logTrace(g_conf.m_logTraceQuery, " m_bigramId=%20" PRId64" (%15" PRId64"), m_ignorePhrase=%d m_bigramLen=%d", m_qwords[i].m_bigramId, phraseTermId, m_qwords[i].m_ignorePhrase, m_qwords[i].m_bigramLen);
|
|
logTrace(g_conf.m_logTraceQuery, " m_wordId =%20" PRId64" (%15" PRId64"), m_ignoreWord=%d, m_quoteStart=%d, m_quoteEnd=%d, fieldCode=%s, m_prefixHash=0x%lx", m_qwords[i].m_wordId, wordTermId, m_qwords[i].m_ignoreWord, m_qwords[i].m_quoteStart, m_qwords[i].m_quoteEnd, m_qwords[i].m_fieldCode?getFieldCodeName(m_qwords[i].m_fieldCode):"",m_qwords[i].m_prefixHash);
|
|
|
|
}
|
|
}
|
|
// . set m_qptrs/m_qtermIds/m_qbits
|
|
// . use one bit position for each phraseId and wordId
|
|
|
|
// count phrases first for allocating
|
|
//Removed: elaborate counting of possible bigrams. Done instead: this:
|
|
int numCandidatePhrases = m_numWords-1;
|
|
|
|
// count single terms
|
|
int numCandidateSingles = 0;
|
|
for ( int32_t i = 0 ; i < m_numWords; i++ ) {
|
|
const QueryWord *qw = &m_qwords[i];
|
|
if ( qw->m_ignoreWord &&
|
|
qw->m_ignoreWord != IGNORE_QSTOP) continue;
|
|
// ignore if in quotes and part of phrase, watch out
|
|
// for things like "word", a single word in quotes.
|
|
if ( qw->m_quoteStart >= 0 && qw->m_bigramId ) continue;
|
|
// if we are not start of quote and NOT in a phrase we
|
|
// must be the tailing word i guess.
|
|
// fixes '"john smith" -"bob dole"' from having
|
|
// smith and dole as query terms.
|
|
if ( qw->m_quoteStart >= 0 && qw->m_quoteStart != i )
|
|
continue;
|
|
// ignore if weight is absolute zero
|
|
if ( almostEqualFloat(qw->m_userWeightForWord,0) )
|
|
continue;
|
|
numCandidateSingles++;
|
|
}
|
|
// thirdly, count synonyms
|
|
int numCandidateSynonyms = 0;
|
|
Synonyms syn;
|
|
if(m_word_variations_config.m_wiktionaryWordVariations) {
|
|
int64_t to = hash64n("to");
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
|
// get query word
|
|
const QueryWord *qw = &m_qwords[i];
|
|
// skip if in quotes, we will not get synonyms for it
|
|
if ( qw->m_inQuotes ) continue;
|
|
// skip if has plus sign in front
|
|
if ( qw->m_wordSign == '+' ) continue;
|
|
// not '-' either i guess
|
|
if ( qw->m_wordSign == '-' ) continue;
|
|
// no url: stuff, maybe only title
|
|
if ( qw->m_fieldCode && qw->m_fieldCode != FIELD_TITLE )
|
|
continue;
|
|
// ignore title: etc. words, they are field names
|
|
if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) continue;
|
|
// ignore boolean operators
|
|
if ( qw->m_ignoreWord ) continue;// IGNORE_BOOLOP
|
|
// ignore if word weight is zero or synonym weight is zero
|
|
if(almostEqualFloat(qw->m_userWeightForWord,0))
|
|
continue;
|
|
if(almostEqualFloat(qw->m_userWeightForSynonym,0))
|
|
continue;
|
|
// no, hurts 'Greencastle IN economic development'
|
|
if ( qw->m_wordId == to ) continue;
|
|
// single letters...
|
|
if ( qw->m_wordLen == 1 ) continue;
|
|
// set the synonyms for this word
|
|
char tmpBuf [ TMPSYNBUFSIZE ];
|
|
int32_t naids = syn.getSynonyms ( &m_tr,
|
|
i ,
|
|
// language of the query.
|
|
// 0 means unknown. if this
|
|
// is 0 we sample synonyms
|
|
// from all languages.
|
|
m_langId ,
|
|
tmpBuf );
|
|
// if no synonyms, all done
|
|
if ( naids <= 0 ) continue;
|
|
numCandidateSynonyms += naids;
|
|
}
|
|
}
|
|
|
|
std::vector<std::string> wvg_source_words;
|
|
std::vector<int> wvg_source_word_index; //idx in wvg_source_words -> idx of queryword
|
|
if(m_word_variations_config.m_languageSpecificWordVariations) {
|
|
for(int i=0; i<m_numWords; i++) {
|
|
const QueryWord *qw = &m_qwords[i];
|
|
if(qw->m_inQuotes) continue;
|
|
if(qw->m_wordSign == '+') continue;
|
|
if(qw->m_wordSign == '-') continue;
|
|
if(qw->m_fieldCode && qw->m_fieldCode != FIELD_TITLE )
|
|
continue;
|
|
if(qw->m_ignoreWord == IGNORE_FIELDNAME) continue;
|
|
// ignore if word weight is zero or synonym weight is zero
|
|
if(almostEqualFloat(qw->m_userWeightForWord,0))
|
|
continue;
|
|
if(almostEqualFloat(qw->m_userWeightForSynonym,0))
|
|
continue;
|
|
wvg_source_words.emplace_back(qw->m_word,qw->m_wordLen);
|
|
wvg_source_word_index.emplace_back(i);
|
|
}
|
|
auto wvg(WordVariationGenerator::get_generator(m_langId));
|
|
m_wordVariations = wvg->query_variations(wvg_source_words, m_word_variations_config.m_word_variations_weights, m_word_variations_config.m_word_variations_threshold);
|
|
numCandidateSynonyms += m_wordVariations.size();
|
|
if(!m_wordVariations.empty())
|
|
logTrace(g_conf.m_logTraceQuery, "word variations produced %d variants", (int)m_wordVariations.size());
|
|
else
|
|
logTrace(g_conf.m_logTraceQuery, "word variations didn't produce any");
|
|
} else
|
|
m_wordVariations.clear();
|
|
if(g_conf.m_logTraceQuery) {
|
|
logTrace(g_conf.m_logTraceQuery, "m_wordVariations.size()=%zu", m_wordVariations.size());
|
|
for(unsigned i=0; i<m_wordVariations.size(); i++)
|
|
logTrace(g_conf.m_logTraceQuery, " variation #%u: %s weight=%f src=[%d..%d)", i, m_wordVariations[i].word.c_str(), m_wordVariations[i].weight, m_wordVariations[i].source_word_start, m_wordVariations[i].source_word_end);
|
|
}
|
|
|
|
if(m_word_variations_config.m_lemmaWordVariations)
|
|
numCandidateSynonyms += 10;
|
|
|
|
m_numTermsUntruncated = numCandidatePhrases+numCandidateSingles+numCandidateSynonyms;
|
|
logTrace(g_conf.m_logTraceQuery, "m_numTermsUntruncated=%d (%d phrases, %d singles, %d synonyms)", m_numTermsUntruncated, numCandidatePhrases, numCandidateSingles, numCandidateSynonyms);
|
|
const int numQueryTerms = std::min(std::min(m_numTermsUntruncated,m_maxQueryTerms),ABS_MAX_QUERY_TERMS);
|
|
if(numQueryTerms!=m_numTermsUntruncated)
|
|
log(LOG_DEBUG, "m_numTermsUntruncated=%d (%d phrases, %d singles, %d synonyms), will be truncated to %d terms for query '%s'",
|
|
m_numTermsUntruncated, numCandidatePhrases, numCandidateSingles, numCandidateSynonyms, numQueryTerms,
|
|
m_filteredQuery.getBufStart());
|
|
|
|
// allocate the term buffer
|
|
if(numQueryTerms) {
|
|
int32_t need = numQueryTerms * sizeof(QueryTerm);
|
|
if ( ! m_queryTermBuf.reserve ( need ) )
|
|
return false;
|
|
m_queryTermBuf.setLabel("stkbuf3");
|
|
const char *pp = m_queryTermBuf.getBufStart();
|
|
m_qterms = (QueryTerm *)pp;
|
|
}
|
|
|
|
// call constructor on each one here
|
|
for(int32_t i = 0; i < numQueryTerms; i++) {
|
|
QueryTerm *qt = &m_qterms[i];
|
|
qt->constructor();
|
|
}
|
|
|
|
|
|
int32_t n = 0;
|
|
|
|
// do phrase terms
|
|
for(int32_t i = 0; i<m_numWords && n<numQueryTerms; i++) {
|
|
if(numCandidateSingles+numCandidatePhrases > m_maxQueryTerms) {
|
|
//we won't have room for both phrases and singles. Put in as many singles as possible. But phrases
|
|
//must come first in the list due to bad assumptions elsewhere in the code.
|
|
if(numQueryTerms - n - 1 < numCandidateSingles)
|
|
break;
|
|
}
|
|
|
|
QueryWord *qw = &m_qwords[i];
|
|
// skip if ignored... mdw...
|
|
if ( ! qw->m_bigramId ) continue;
|
|
if ( qw->m_ignorePhrase ) continue; // could be a repeat
|
|
// none if weight is absolute zero
|
|
if ( almostEqualFloat(qw->m_userWeightForPhrase, 0) )
|
|
continue;
|
|
|
|
QueryTerm *qt = &m_qterms[n];
|
|
qt->m_qword = qw ;
|
|
qt->m_piped = qw->m_piped;
|
|
qt->m_isPhrase = true ;
|
|
qt->m_synonymOf = NULL;
|
|
qt->m_ignored = false;
|
|
qt->m_term = NULL;
|
|
qt->m_termLen = 0;
|
|
qt->m_langIdBitsValid = false;
|
|
qt->m_langIdBits = 0;
|
|
// stop word? no, we're a phrase term
|
|
qt->m_isQueryStopWord = false;
|
|
// change in both places
|
|
qt->m_termId = qw->m_bigramId & TERMID_MASK;
|
|
qt->m_rawTermId = qw->m_rawPhraseId;
|
|
// boolean queries are not allowed term signs for phrases
|
|
// UNLESS it is a '*' soft require sign which we need for
|
|
// phrases like: "cat dog" AND pig
|
|
if ( m_isBoolean && qw->m_phraseSign != '*' ) {
|
|
qt->m_termSign = '\0';
|
|
}
|
|
// if not boolean, ensure to change signs in both places
|
|
else {
|
|
qt->m_termSign = qw->m_phraseSign;
|
|
}
|
|
|
|
qw->m_queryWordTerm = NULL;
|
|
// IndexTable.cpp uses this one
|
|
qt->m_inQuotes = qw->m_inQuotes;
|
|
// point to the string itself that is the phrase
|
|
qt->m_term = qw->m_word;
|
|
qt->m_termLen = qw->m_bigramLen;
|
|
|
|
// the QueryWord should have a direct link to the QueryTerm,
|
|
// at least for phrase, so we can OR in the bits of its
|
|
// constituents in the for loop below
|
|
qw->m_queryPhraseTerm = qt ;
|
|
// assign score weight, we're a phrase here
|
|
qt->m_termWeight = m_bigramWeight;
|
|
qt->m_userWeight = qw->m_userWeightForPhrase ;
|
|
qt->m_fieldCode = qw->m_fieldCode;
|
|
|
|
// stuff before a pipe always has a weight of 1
|
|
if ( qt->m_piped ) {
|
|
qt->m_userWeight = 1;
|
|
}
|
|
n++;
|
|
}
|
|
|
|
// now if we have enough room, do the singles
|
|
for(int32_t i = 0; i < m_numWords && n<numQueryTerms; i++) {
|
|
QueryWord *qw = &m_qwords[i];
|
|
|
|
if ( qw->m_ignoreWord &&
|
|
qw->m_ignoreWord != IGNORE_QSTOP) continue;
|
|
|
|
// ignore if in quotes and part of phrase, watch out
|
|
// for things like "word", a single word in quotes.
|
|
if ( qw->m_quoteStart >= 0 && qw->m_bigramId ) continue;
|
|
|
|
// if we are not start of quote and NOT in a phrase we
|
|
// must be the tailing word i guess.
|
|
// fixes '"john smith" -"bob dole"' from having
|
|
// smith and dole as query terms.
|
|
if ( qw->m_quoteStart >= 0 && qw->m_quoteStart != i )
|
|
continue;
|
|
|
|
// ignore if weight is absolute zero
|
|
if ( almostEqualFloat(qw->m_userWeightForWord,0) )
|
|
continue;
|
|
|
|
QueryTerm *qt = &m_qterms[n];
|
|
qt->m_qword = qw ;
|
|
qt->m_piped = qw->m_piped;
|
|
qt->m_isPhrase = false ;
|
|
qt->m_synonymOf = NULL;
|
|
// ignore some synonym terms if tf is too low
|
|
qt->m_ignored = qw->m_ignoreWord;
|
|
// stop word? no, we're a phrase term
|
|
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
|
|
// change in both places
|
|
qt->m_termId = qw->m_wordId & TERMID_MASK;
|
|
qt->m_rawTermId = qw->m_rawWordId;
|
|
// boolean queries are not allowed term signs
|
|
if ( m_isBoolean ) {
|
|
qt->m_termSign = '\0';
|
|
// boolean fix for "health OR +sports" because
|
|
// the + there means exact word match, no synonyms.
|
|
if ( qw->m_wordSign == '+' ) {
|
|
qt->m_termSign = qw->m_wordSign;
|
|
}
|
|
}
|
|
// if not boolean, ensure to change signs in both places
|
|
else {
|
|
qt->m_termSign = qw->m_wordSign;
|
|
}
|
|
int32_t pw = i-1;
|
|
// . back up until word that contains quote if in a quoted
|
|
// phrase
|
|
// . UOR can only support two word phrases really...
|
|
if (m_qwords[i].m_quoteStart >= 0)
|
|
pw = m_qwords[i].m_quoteStart ;
|
|
if ( pw > 0 ) pw--;
|
|
|
|
// back two more if field
|
|
int32_t fieldStart=-1;
|
|
int32_t fieldLen=0;
|
|
|
|
if(pw == 0 && m_qwords[pw].m_ignoreWord==IGNORE_FIELDNAME)
|
|
fieldStart = pw;
|
|
|
|
if(pw > 0 && m_qwords[pw-1].m_ignoreWord==IGNORE_FIELDNAME) {
|
|
pw -= 1;
|
|
fieldStart = pw;
|
|
}
|
|
while(pw > 0 && m_qwords[pw].m_ignoreWord == IGNORE_FIELDNAME) {
|
|
pw--;
|
|
fieldStart = pw;
|
|
}
|
|
|
|
|
|
// skip if it is punct. fixes queries like
|
|
// "(this OR that)" from including '(' or from including
|
|
// a space.
|
|
if ( fieldStart >-1 &&
|
|
m_qwords[fieldStart].m_isPunct &&
|
|
fieldStart+1<m_numWords )
|
|
fieldStart++;
|
|
|
|
if (fieldStart > -1) {
|
|
pw = i;
|
|
while (pw < m_numWords && m_qwords[pw].m_fieldCode)
|
|
pw++;
|
|
|
|
fieldLen = m_qwords[pw-1].m_word +
|
|
m_qwords[pw-1].m_wordLen -
|
|
m_qwords[fieldStart].m_word;
|
|
}
|
|
qw->m_queryWordTerm = qt;
|
|
// IndexTable.cpp uses this one
|
|
qt->m_inQuotes = qw->m_inQuotes;
|
|
// point to the string itself that is the word
|
|
|
|
if (fieldLen > 0) {
|
|
qt->m_term = m_qwords[fieldStart].m_word;
|
|
qt->m_termLen = fieldLen;
|
|
// fix for query
|
|
// text:"" foo bar ""
|
|
if ( pw-1 < i ) {
|
|
log("query: bad query %s",m_originalQuery.getBufStart());
|
|
g_errno = EMALFORMEDQUERY;
|
|
return false;
|
|
}
|
|
// skip past the end of the field value
|
|
i = pw-1;
|
|
}
|
|
else {
|
|
qt->m_termLen = qw->m_wordLen;
|
|
qt->m_term = qw->m_word;
|
|
}
|
|
|
|
// assign score weight, we're a single-term here
|
|
qt->m_termWeight = 1.0;
|
|
qt->m_userWeight = qw->m_userWeightForWord;
|
|
qt->m_fieldCode = qw->m_fieldCode;
|
|
qt->m_userNotRequired = qw->m_userNotRequiredForWord;
|
|
|
|
// stuff before a pipe always has a weight of 1
|
|
if ( qt->m_piped ) {
|
|
qt->m_userWeight = 1;
|
|
}
|
|
n++;
|
|
}
|
|
|
|
// Handle shared explicit bits
|
|
for ( int32_t i = 0; i < n ; i++ ){
|
|
QueryTerm *qt = &m_qterms[i];
|
|
// assume not in a phrase
|
|
qt->m_rightPhraseTermNum = -1;
|
|
qt->m_leftPhraseTermNum = -1;
|
|
qt->m_rightPhraseTerm = NULL;
|
|
qt->m_leftPhraseTerm = NULL;
|
|
}
|
|
|
|
// . set m_inPhrase
|
|
for (int32_t i = 0; i < m_numWords ; i++ ) {
|
|
const QueryWord *qw = &m_qwords[i];
|
|
QueryTerm *qt = qw->m_queryWordTerm;
|
|
if (!qt) continue;
|
|
// set flag if in a a phrase, and set phrase term num
|
|
if ( qw->m_queryPhraseTerm ) {
|
|
QueryTerm *pt = qw->m_queryPhraseTerm;
|
|
qt->m_rightPhraseTermNum = pt - m_qterms;
|
|
qt->m_rightPhraseTerm = pt;
|
|
}
|
|
// if we're in the middle of the phrase
|
|
int32_t pn = qw->m_leftPhraseStart;
|
|
// convert word to its phrase QueryTerm ptr, if any
|
|
QueryTerm *tt = NULL;
|
|
if ( pn >= 0 ) tt = m_qwords[pn].m_queryPhraseTerm;
|
|
if ( tt ) {
|
|
qt->m_leftPhraseTermNum = tt - m_qterms;
|
|
qt->m_leftPhraseTerm = tt;
|
|
}
|
|
// . there might be some phrase term that actually contains
|
|
// the same word as we are, but a different occurence
|
|
// . like '"knowledge management" AND NOT management' query
|
|
// . made it from "j < i" into "j < m_numWords" because
|
|
// 'test "test bed"' was not working but '"test bed" test'
|
|
// was working.
|
|
for ( int32_t j = 0 ; j < m_numWords ; j++ ) {
|
|
// must be our same wordId (same word, different occ.)
|
|
const QueryWord *qw2 = &m_qwords[j];
|
|
if ( qw2->m_wordId != qw->m_wordId ) continue;
|
|
// get first word in the phrase that jth word is in
|
|
int32_t pn2 = qw2->m_leftPhraseStart;
|
|
// we might be the guy that starts it!
|
|
if ( pn2 < 0 && qw2->m_quoteStart != -1 ) pn2 = j;
|
|
// if neither is the case, skip this query word
|
|
if ( pn2 < 0 ) continue;
|
|
// he implies us!
|
|
QueryTerm *tt2 = m_qwords[pn2].m_queryPhraseTerm;
|
|
if ( tt2 ) {
|
|
qt->m_leftPhraseTermNum = tt2 - m_qterms;
|
|
qt->m_leftPhraseTerm = tt2;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
if(g_conf.m_logTraceQuery) {
|
|
logTrace(g_conf.m_logTraceQuery, "query-terms before word variations:");
|
|
for(int i=0; i<n; i++)
|
|
logTrace(g_conf.m_logTraceQuery, " query-term #%d: termid=%15" PRId64" '%*.*s'", i, m_qterms[i].m_termId, m_qterms[i].m_termLen,m_qterms[i].m_termLen,m_qterms[i].m_term);
|
|
}
|
|
|
|
////////////
|
|
//
|
|
// . add synonym query terms now
|
|
// . skip this part if language is unknown i guess
|
|
//
|
|
////////////
|
|
|
|
if(m_word_variations_config.m_wiktionaryWordVariations) {
|
|
int64_t to = hash64n("to");
|
|
for(int32_t i = 0; i<m_numWords && n<numQueryTerms; i++) {
|
|
// get query word
|
|
QueryWord *qw = &m_qwords[i];
|
|
// skip if in quotes, we will not get synonyms for it
|
|
if ( qw->m_inQuotes ) continue;
|
|
// skip if has plus sign in front
|
|
if ( qw->m_wordSign == '+' ) continue;
|
|
// not '-' either i guess
|
|
if ( qw->m_wordSign == '-' ) continue;
|
|
// no url: stuff, maybe only title
|
|
if ( qw->m_fieldCode && qw->m_fieldCode != FIELD_TITLE )
|
|
continue;
|
|
// skip if ignored like a stopword (stop to->too)
|
|
//if ( qw->m_ignoreWord ) continue;
|
|
// ignore title: etc. words, they are field names
|
|
if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) continue;
|
|
// ignore boolean operators
|
|
if ( qw->m_ignoreWord ) continue;// IGNORE_BOOLOP
|
|
// ignore if word weight is zero or synonym weight is zero
|
|
if(almostEqualFloat(qw->m_userWeightForWord,0))
|
|
continue;
|
|
if(almostEqualFloat(qw->m_userWeightForSynonym,0))
|
|
continue;
|
|
// no, hurts 'Greencastle IN economic development'
|
|
if ( qw->m_wordId == to ) continue;
|
|
// single letters...
|
|
if ( qw->m_wordLen == 1 ) continue;
|
|
// set the synonyms for this word
|
|
char tmpBuf [ TMPSYNBUFSIZE ];
|
|
int32_t naids = syn.getSynonyms ( &m_tr,
|
|
i ,
|
|
// language of the query.
|
|
// 0 means unknown. if this
|
|
// is 0 we sample synonyms
|
|
// from all languages.
|
|
m_langId ,
|
|
tmpBuf );
|
|
// if no synonyms, all done
|
|
if ( naids <= 0 ) continue;
|
|
// sanity
|
|
if ( naids > MAX_SYNS ) { g_process.shutdownAbort(true); }
|
|
// now make the buffer to hold them for us
|
|
qw->m_synWordBuf.setLabel("qswbuf");
|
|
qw->m_synWordBuf.safeMemcpy ( &syn.m_synWordBuf );
|
|
// get the term for this word
|
|
QueryTerm *origTerm = qw->m_queryWordTerm;
|
|
// loop over synonyms for word #i now
|
|
for(int32_t j = 0; j < naids && n<numQueryTerms; j++) {
|
|
// this happens for 'da da da'
|
|
if ( ! origTerm ) continue;
|
|
|
|
// add that query term
|
|
QueryTerm *qt = &m_qterms[n];
|
|
qt->m_qword = qw; // NULL;
|
|
qt->m_piped = qw->m_piped;
|
|
qt->m_isPhrase = false ;
|
|
qt->m_langIdBits = 0;
|
|
// synonym of this term...
|
|
qt->m_synonymOf = origTerm;
|
|
// nuke this crap since it was done above and we
|
|
// missed out!
|
|
qt->m_rightPhraseTermNum = -1;
|
|
qt->m_leftPhraseTermNum = -1;
|
|
qt->m_rightPhraseTerm = NULL;
|
|
qt->m_leftPhraseTerm = NULL;
|
|
// need this for displaying language of syn in
|
|
// the json/xml feed in PageResults.cpp
|
|
qt->m_langIdBitsValid = true;
|
|
int langId = syn.m_langIds[j];
|
|
uint64_t langBit = (uint64_t)1 << langId;
|
|
if ( langId >= 64 ) langBit = 0;
|
|
qt->m_langIdBits |= langBit;
|
|
// need this for Matches.cpp
|
|
qt->m_synWids0 = syn.m_wids0[j];
|
|
qt->m_synWids1 = syn.m_wids1[j];
|
|
int32_t na = syn.m_numAlnumWords[j];
|
|
// how many words were in the base we used to
|
|
// get the synonym. i.e. if the base is "new jersey"
|
|
// then it's 2! and the synonym "nj" has one alnum
|
|
// word.
|
|
int32_t ba = syn.m_numAlnumWordsInBase[j];
|
|
qt->m_numAlnumWordsInSynonym = na;
|
|
|
|
// crap, "nj" is a synonym of the PHRASE TERM
|
|
// bigram "new jersey" not of the single word term
|
|
// "new" so fix that.
|
|
if ( ba == 2 && origTerm->m_rightPhraseTerm )
|
|
qt->m_synonymOf = origTerm->m_rightPhraseTerm;
|
|
|
|
// ignore some synonym terms if tf is too low
|
|
qt->m_ignored = qw->m_ignoreWord;
|
|
// stop word? no, we're a phrase term
|
|
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
|
|
// change in both places
|
|
int64_t wid = syn.m_aids[j];
|
|
// might be in a title: field or something
|
|
if ( qw->m_prefixHash ) {
|
|
int64_t ph = qw->m_prefixHash;
|
|
wid= hash64h(wid,ph);
|
|
}
|
|
qt->m_termId = wid & TERMID_MASK;
|
|
qt->m_rawTermId = syn.m_aids[j];
|
|
// boolean queries are not allowed term signs
|
|
if ( m_isBoolean ) {
|
|
qt->m_termSign = '\0';
|
|
// boolean fix for "health OR +sports" because
|
|
// the + there means exact word match, no syns
|
|
if ( qw->m_wordSign == '+' ) {
|
|
qt->m_termSign = qw->m_wordSign;
|
|
}
|
|
}
|
|
// if not bool, ensure to change signs in both places
|
|
else {
|
|
qt->m_termSign = qw->m_wordSign;
|
|
}
|
|
// IndexTable.cpp uses this one
|
|
qt->m_inQuotes = qw->m_inQuotes;
|
|
// usually this is right
|
|
const char *ptr = syn.m_termPtrs[j];
|
|
// buf if it is NULL that means we transformed the
|
|
// word by like removing accent marks and stored
|
|
// it in m_synWordBuf, as opposed to just pointing
|
|
// to a line in memory of wiktionary-buf.txt.
|
|
if ( ! ptr ) {
|
|
int32_t off = syn.m_termOffs[j];
|
|
if ( off < 0 ) {
|
|
g_process.shutdownAbort(true); }
|
|
if ( off > qw->m_synWordBuf.length() ) {
|
|
g_process.shutdownAbort(true); }
|
|
// use QueryWord::m_synWordBuf which should
|
|
// be persistent and not disappear like
|
|
// syn.m_synWordBuf.
|
|
ptr = qw->m_synWordBuf.getBufStart() + off;
|
|
}
|
|
// point to the string itself that is the word
|
|
qt->m_term = ptr;
|
|
qt->m_termLen = syn.m_termLens[j];
|
|
// assign score weight, we're a synonym here
|
|
qt->m_termWeight = m_synonymWeight;
|
|
qt->m_userWeight = qw->m_userWeightForSynonym;
|
|
qt->m_fieldCode = qw->m_fieldCode;
|
|
|
|
// stuff before a pipe always has a weight of 1
|
|
if ( qt->m_piped ) {
|
|
qt->m_userWeight = 1;
|
|
}
|
|
// otherwise, add it
|
|
n++;
|
|
}
|
|
}
|
|
}
|
|
|
|
if(m_word_variations_config.m_languageSpecificWordVariations) {
|
|
logTrace(g_conf.m_logTraceQuery, "Word variations: %zu", m_wordVariations.size());
|
|
for(unsigned i=0; i<m_wordVariations.size() && n<numQueryTerms; i++) {
|
|
auto const &word_variation(m_wordVariations[i]);
|
|
int wordStartIdx = wvg_source_word_index[word_variation.source_word_start];
|
|
int wordEndIdx = wvg_source_word_index[word_variation.source_word_end-1];
|
|
logTrace(g_conf.m_logTraceQuery, " Word variation #%u: '%s' weight=%f src=[%u..%u]", i, word_variation.word.c_str(), word_variation.weight, wordStartIdx, wordEndIdx);
|
|
QueryWord *qw = &m_qwords[wordStartIdx];
|
|
if((unsigned)qw->m_wordLen==word_variation.word.length() &&
|
|
memcmp(qw->m_word, word_variation.word.data(), word_variation.word.length())==0)
|
|
{
|
|
//Variation is the same as the base word. The word-variation-plugin is allowed to produce that.
|
|
continue; //skip
|
|
}
|
|
QueryTerm *origTerm = qw->m_queryWordTerm;
|
|
|
|
//handle if the word variant is a bigram/phrase
|
|
bool isPhrase = false;
|
|
if(wordEndIdx-wordStartIdx>1) {
|
|
logTrace(g_conf.m_logTraceQuery, "Word variation '%s' spans more than 1 word", word_variation.word.c_str());
|
|
if(wordEndIdx-wordStartIdx==2) {
|
|
//find bigram pointing to first word
|
|
QueryTerm *bigramQueryTerm = NULL;
|
|
for(int j=0; j<n && !bigramQueryTerm; j++) {
|
|
if(m_qterms[j].m_qword==qw && m_qterms[j].m_isPhrase)
|
|
bigramQueryTerm = &m_qterms[j];
|
|
}
|
|
if(bigramQueryTerm) {
|
|
logTrace(g_conf.m_logTraceQuery, "Word variation covers '%.*s'", bigramQueryTerm->m_termLen, bigramQueryTerm->m_term);
|
|
origTerm = bigramQueryTerm;
|
|
isPhrase = true;
|
|
} else
|
|
log(LOG_LOGIC,"Word variation '%s' bigram/phrase didn't find base bigram", word_variation.word.c_str());
|
|
} else {
|
|
log(LOG_LOGIC,"Word variation '%s' spans more than 2 words. This is not supported (yet)", word_variation.word.c_str());
|
|
}
|
|
}
|
|
|
|
// add that query term
|
|
QueryTerm *qt = &m_qterms[n];
|
|
qt->m_qword = qw; // NULL;
|
|
qt->m_piped = qw->m_piped;
|
|
qt->m_isPhrase = isPhrase;
|
|
qt->m_langIdBits = 0;
|
|
// synonym of this term...
|
|
qt->m_synonymOf = origTerm;
|
|
// nuke this crap since it was done above and we
|
|
// missed out!
|
|
qt->m_rightPhraseTermNum = -1;
|
|
qt->m_leftPhraseTermNum = -1;
|
|
qt->m_rightPhraseTerm = NULL;
|
|
qt->m_leftPhraseTerm = NULL;
|
|
// need this for displaying language of syn in
|
|
// the json/xml feed in PageResults.cpp
|
|
qt->m_langIdBitsValid = true;
|
|
//int langId = syn.m_langIds[j]; //syn-todo?
|
|
//uint64_t langBit = (uint64_t)1 << langId; //syn-todo?
|
|
//if(langId >= 64) langBit = 0; //syn-todo?
|
|
//qt->m_langIdBits |= langBit; //syn-todo?
|
|
// need this for Matches.cpp
|
|
qt->m_synWids0 = 0;
|
|
qt->m_synWids1 = 0;
|
|
qt->m_numAlnumWordsInSynonym = 0;
|
|
|
|
// ignore some synonym terms if tf is too low
|
|
qt->m_ignored = qw->m_ignoreWord;
|
|
// stop word? no, we're a phrase term
|
|
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
|
|
// change in both places
|
|
//int64_t wid = syn.m_aids[j];
|
|
int64_t wid = hash64Lower_utf8_nospaces(word_variation.word.data(), word_variation.word.length());
|
|
// might be in a title: field or something
|
|
if(qw->m_prefixHash) {
|
|
int64_t ph = qw->m_prefixHash;
|
|
wid= hash64h(wid,ph);
|
|
}
|
|
qt->m_termId = wid & TERMID_MASK;
|
|
//qt->m_rawTermId = syn.m_aids[j]; //syn-todo?
|
|
// boolean queries are not allowed term signs
|
|
if(m_isBoolean) {
|
|
qt->m_termSign = '\0';
|
|
// boolean fix for "health OR +sports" because
|
|
// the + there means exact word match, no syns
|
|
if(qw->m_wordSign == '+') {
|
|
qt->m_termSign = qw->m_wordSign;
|
|
}
|
|
}
|
|
// if not bool, ensure to change signs in both places
|
|
else {
|
|
qt->m_termSign = qw->m_wordSign;
|
|
}
|
|
// IndexTable.cpp uses this one
|
|
qt->m_inQuotes = qw->m_inQuotes;
|
|
// point to the string itself that is the word
|
|
qt->m_term = word_variation.word.data();
|
|
qt->m_termLen = word_variation.word.length();
|
|
// assign score weight
|
|
qt->m_termWeight = word_variation.weight;
|
|
qt->m_userWeight = qw->m_userWeightForSynonym;
|
|
qt->m_fieldCode = qw->m_fieldCode ;
|
|
// stuff before a pipe always has a weight of 1
|
|
if(qt->m_piped) {
|
|
qt->m_userWeight = 1;
|
|
}
|
|
// otherwise, add it
|
|
n++;
|
|
}
|
|
}
|
|
|
|
if(m_word_variations_config.m_lemmaWordVariations && m_langId==langDanish) {
|
|
logTrace(g_conf.m_logTraceQuery, "Lexicon-based lemma synonyms");
|
|
for(int32_t i = 0; i<m_numWords && n<numQueryTerms; i++) {
|
|
if(!m_tr[i].is_alfanum)
|
|
continue;
|
|
std::string w(m_tr[i].token_start,m_tr[i].token_len);
|
|
logTrace(g_conf.m_logTraceQuery, "Checking lemma for '%s'", w.c_str());
|
|
auto le = lemma_lexicon->lookup(w);
|
|
if(!le) {
|
|
//Not found as-is in lexicon. Try lowercase in case it is a capitalized word
|
|
char lowercase_word[128];
|
|
if(w.size()<sizeof(lowercase_word)) {
|
|
size_t sz = to_lower_utf8(lowercase_word,lowercase_word+sizeof(lowercase_word), w.data(), w.data()+w.size());
|
|
lowercase_word[sz] = '\0';
|
|
if(sz!=w.size() || memcmp(w.data(),lowercase_word,w.size())!=0) {
|
|
le = lemma_lexicon->lookup(lowercase_word);
|
|
}
|
|
}
|
|
}
|
|
if(!le) {
|
|
//Not found as-is in lexicon. Try capitalized in case it is a lowercase or uppercase word
|
|
char capitalized_word[128];
|
|
if(w.size()<sizeof(capitalized_word)) {
|
|
size_t sz = to_capitalized_utf8(capitalized_word,capitalized_word+sizeof(capitalized_word), w.data(), w.data()+w.size());
|
|
capitalized_word[sz] = '\0';
|
|
if(sz!=w.size() || memcmp(w.data(),capitalized_word,w.size())!=0) {
|
|
w = capitalized_word;
|
|
le = lemma_lexicon->lookup(w);
|
|
}
|
|
}
|
|
}
|
|
if(!le) {
|
|
//Not found as-is in lexicon. Try uppercasing it
|
|
char uppercase_word[128];
|
|
if(w.size()<sizeof(uppercase_word)) {
|
|
size_t sz = to_upper_utf8(uppercase_word,uppercase_word+sizeof(uppercase_word), w.data(), w.data()+w.size());
|
|
uppercase_word[sz] = '\0';
|
|
if(sz!=w.size() || memcmp(w.data(),uppercase_word,w.size())!=0) {
|
|
w = uppercase_word;
|
|
le = lemma_lexicon->lookup(w);
|
|
}
|
|
}
|
|
}
|
|
if(!le)
|
|
continue; //unknown word
|
|
auto wf = le->find_base_wordform();
|
|
if(!wf)
|
|
continue;
|
|
if(wf->written_form_length==w.size() && memcmp(wf->written_form,w.data(),w.size())==0)
|
|
continue; //already base form)
|
|
logTrace(g_conf.m_logTraceQuery, "Generating synonym from lemma: %s -> %.*s", w.c_str(), wf->written_form_length,wf->written_form);
|
|
|
|
QueryWord *qw = &m_qwords[i];
|
|
QueryTerm *origTerm = qw->m_queryWordTerm;
|
|
|
|
// add that query term
|
|
QueryTerm *qt = &m_qterms[n];
|
|
qt->m_qword = qw; // NULL;
|
|
qt->m_piped = qw->m_piped;
|
|
qt->m_isPhrase = false;
|
|
qt->m_langIdBits = 0;
|
|
// synonym of this term...
|
|
qt->m_synonymOf = origTerm;
|
|
// nuke this crap since it was done above and we
|
|
// missed out!
|
|
qt->m_rightPhraseTermNum = -1;
|
|
qt->m_leftPhraseTermNum = -1;
|
|
qt->m_rightPhraseTerm = NULL;
|
|
qt->m_leftPhraseTerm = NULL;
|
|
// need this for displaying language of syn in
|
|
// the json/xml feed in PageResults.cpp
|
|
qt->m_langIdBitsValid = true;
|
|
//int langId = syn.m_langIds[j]; //syn-todo?
|
|
//uint64_t langBit = (uint64_t)1 << langId; //syn-todo?
|
|
//if(langId >= 64) langBit = 0; //syn-todo?
|
|
//qt->m_langIdBits |= langBit; //syn-todo?
|
|
// need this for Matches.cpp
|
|
qt->m_synWids0 = 0;
|
|
qt->m_synWids1 = 0;
|
|
qt->m_numAlnumWordsInSynonym = 0;
|
|
|
|
// ignore some synonym terms if tf is too low
|
|
qt->m_ignored = qw->m_ignoreWord;
|
|
// stop word? no, we're a phrase term
|
|
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
|
|
// change in both places
|
|
//int64_t wid = syn.m_aids[j];
|
|
int64_t wid = hash64Lower_utf8_nospaces(wf->written_form,wf->written_form_length);
|
|
// might be in a title: field or something
|
|
if(qw->m_prefixHash) {
|
|
int64_t ph = qw->m_prefixHash;
|
|
wid= hash64h(wid,ph);
|
|
}
|
|
qt->m_termId = wid & TERMID_MASK;
|
|
//qt->m_rawTermId = syn.m_aids[j]; //syn-todo?
|
|
// boolean queries are not allowed term signs
|
|
if(m_isBoolean) {
|
|
qt->m_termSign = '\0';
|
|
// boolean fix for "health OR +sports" because
|
|
// the + there means exact word match, no syns
|
|
if(qw->m_wordSign == '+') {
|
|
qt->m_termSign = qw->m_wordSign;
|
|
}
|
|
}
|
|
// if not bool, ensure to change signs in both places
|
|
else {
|
|
qt->m_termSign = qw->m_wordSign;
|
|
}
|
|
// IndexTable.cpp uses this one
|
|
qt->m_inQuotes = qw->m_inQuotes;
|
|
// point to the string itself that is the word
|
|
qt->m_term = wf->written_form;
|
|
qt->m_termLen = wf->written_form_length;
|
|
// assign score weight
|
|
qt->m_termWeight = m_synonymWeight;
|
|
qt->m_userWeight = qw->m_userWeightForSynonym;
|
|
qt->m_fieldCode = qw->m_fieldCode ;
|
|
// stuff before a pipe always has a weight of 1
|
|
if(qt->m_piped) {
|
|
qt->m_userWeight = 1;
|
|
}
|
|
// otherwise, add it
|
|
n++;
|
|
}
|
|
}
|
|
|
|
//Merge duplicated synonyms.
|
|
//If one of the above synonym-generations produced the same word (eg. from wiktionary, word-variations and as lemma) then we want to use
|
|
//the one with highest weight
|
|
for(int i=0; i<n; i++) {
|
|
if(m_qterms[i].m_synonymOf) {
|
|
//it's a synonym. Are there other synonyms on the same base word with the same form? If so then merge/delete
|
|
for(int j=i+1; j<n; ) {
|
|
if(m_qterms[j].m_synonymOf == m_qterms[i].m_synonymOf &&
|
|
m_qterms[j].m_termLen == m_qterms[i].m_termLen &&
|
|
memcmp(m_qterms[j].m_term,m_qterms[i].m_term,m_qterms[j].m_termLen)==0)
|
|
{
|
|
//Identical synonyms of same base word
|
|
//note: direct memcmp() test. Downside is that we don't eliminate uppercase/lowercase duplicates, but neither
|
|
//do we get into trouble with German eszet, Lithuanian i, ligatures, titlecase, etc.
|
|
logTrace(g_conf.m_logTraceQuery, "merging identical synonyms '%.*s' for word '%.*s'", m_qterms[i].m_termLen,m_qterms[i].m_term, m_qterms[i].m_synonymOf->m_termLen,m_qterms[i].m_synonymOf->m_term);
|
|
m_qterms[i].m_termWeight = std::max(m_qterms[i].m_termWeight,m_qterms[j].m_termWeight);
|
|
m_qterms[i].m_userWeight = std::max(m_qterms[i].m_userWeight,m_qterms[j].m_userWeight);
|
|
memmove(m_qterms+j, m_qterms+j+1, sizeof(m_qterms[0])*(n-j-1));
|
|
n--;
|
|
} else
|
|
j++;
|
|
}
|
|
}
|
|
}
|
|
|
|
m_numTerms = n;
|
|
|
|
if ( n > ABS_MAX_QUERY_TERMS ) { g_process.shutdownAbort(true); }
|
|
|
|
// . if only have one term and it is a signless phrase, make it signed
|
|
// . don't forget to set m_termSigns too!
|
|
if ( n == 1 && m_qterms[0].m_isPhrase && ! m_qterms[0].m_termSign ) {
|
|
m_qterms[0].m_termSign = '*';
|
|
}
|
|
|
|
// . now set m_phrasePart for Summary.cpp's hackfix filter
|
|
// . only set this for the non-phrase terms, since keepAllSingles is
|
|
// set to true when setting the Query for Summary.cpp::set in order
|
|
// to match the singles
|
|
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
|
|
// skip cd-rom too, if not in quotes
|
|
if ( ! m_qterms[i].m_inQuotes ) continue;
|
|
// is next term also in a quoted phrase?
|
|
if ( i - 1 < 0 ) continue;
|
|
//if ( ! m_qterms[i+1].m_isPhrase ) continue;
|
|
if ( ! m_qterms[i-1].m_inQuotes ) continue;
|
|
// are we in the same quoted phrase?
|
|
if ( m_qterms[i+0].m_qword->m_quoteStart !=
|
|
m_qterms[i-1].m_qword->m_quoteStart ) continue;
|
|
}
|
|
|
|
// if we have '+test -test':
|
|
//if ( negativeBits & requiredBits )
|
|
// m_numTerms = 0;
|
|
|
|
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
|
|
QueryTerm *qt = &m_qterms[i];
|
|
// assume not required
|
|
qt->m_isRequired = false;
|
|
// skip signless phrases
|
|
if ( qt->m_isPhrase && qt->m_termSign == '\0' ) continue;
|
|
if ( qt->m_isPhrase && qt->m_termSign == '*' ) continue;
|
|
if ( qt->m_synonymOf ) continue;
|
|
// IGNORE_QSTOP?
|
|
if ( qt->m_ignored ) continue;
|
|
|
|
// user specified "[nrw]" before word
|
|
if( qt->m_userNotRequired) continue;
|
|
|
|
// mark it
|
|
qt->m_isRequired = true;
|
|
}
|
|
|
|
|
|
//If there are two highfreqterms in a row then PosdbTable will ignore the bigram of them because it can't tie the bigram to any required term.
|
|
//Example: "key west mystery writers fest" where "key" and "west" are highfreqterms, and therefore posdbtable will ignore the bigram "key+west".
|
|
//Options:
|
|
// 1: do nothing
|
|
// 2: mark the bigram as ignored
|
|
// 3: mark the bigram as required
|
|
// 4: rewrite Query and the queryterminfo handling in posdbtable so the bigram can be optional.
|
|
//We do (3) because it will likely filter out more bad results than good results. This is a hack because marking a bigram as required normally
|
|
//requires us to be sure those two words are connected, but for "key west" we are guessing.
|
|
//TODO: rewrite query+posdbtable so qword/qterm can be optional, etc.
|
|
//The structure of qwords+qterms makes this code unnecessarily clumsy
|
|
for(int i=0; i+2<m_numWords; i++) {
|
|
if(m_qwords[i ].m_ignoreWord==IGNORE_HIGHFREMTERM &&
|
|
m_qwords[i+2].m_ignoreWord==IGNORE_HIGHFREMTERM)
|
|
{
|
|
if(m_qwords[i].m_queryPhraseTerm && m_qwords[i].m_queryPhraseTerm->m_isPhrase) {
|
|
logTrace(g_conf.m_logTraceQuery, "query-words #%d (%.*s) and #%d (%.*s) are both high-freq-terms. Marking bigram as required",
|
|
i, m_qwords[i].m_wordLen, m_qwords[i].m_word,
|
|
i+2, m_qwords[i+2].m_wordLen, m_qwords[i+2].m_word);
|
|
m_qwords[i].m_queryPhraseTerm->m_isRequired = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
//workaround/hack for double-highfreqterm searches, such as "of a" or "the the" or "the who"
|
|
if(m_numWords==3 &&
|
|
m_qwords[0].m_ignoreWord==IGNORE_HIGHFREMTERM &&
|
|
m_qwords[2].m_ignoreWord==IGNORE_HIGHFREMTERM &&
|
|
m_numTerms==1 &&
|
|
!m_qterms[0].m_isRequired)
|
|
{
|
|
log(LOG_DEBUG, "query: Looks like a highfreqterm-highfreqterm query type. Requiring one-and-only QueryTerm/bigram");
|
|
m_qterms[0].m_isRequired = true;
|
|
//todo: we should investigate if QueryTerm::m_isRequired actually has any effect. It is used
|
|
//in a single place in PosdbTable for not generating a QueryTermInfo, but it appears it works
|
|
//fine even with the QTI.
|
|
}
|
|
|
|
//if all words are high-freq-terms then we have to mark the generated bigrams as required, otherwise PosdbTable.cpp gets unhappy and
|
|
//logs "no required terms in query!"
|
|
bool allAlnumWordsAreIgnored = true;
|
|
for(int i=0; i<m_numWords; i++) {
|
|
if(is_alnum_utf8_string(m_qwords[i].m_word,m_qwords[i].m_word+m_qwords[i].m_wordLen) &&
|
|
(m_qwords[i].m_ignoreWord!=IGNORE_HIGHFREMTERM && m_qwords[i].m_ignoreWord!=IGNORE_QSTOP))
|
|
allAlnumWordsAreIgnored = false;
|
|
}
|
|
if(allAlnumWordsAreIgnored) {
|
|
log(LOG_DEBUG, "query: all alfanum-terms are ignored (highfreq/qstop). Marking bigrams as required");
|
|
for(int i=0; i<m_numTerms; i++) {
|
|
if(m_qterms[i].m_isPhrase)
|
|
m_qterms[i].m_isRequired = true;
|
|
}
|
|
}
|
|
|
|
// required quoted phrase terms
|
|
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
|
|
QueryTerm *qt = &m_qterms[i];
|
|
// quoted phrase?
|
|
if ( ! qt->m_isPhrase ) continue;
|
|
if ( ! qt->m_inQuotes ) continue;
|
|
// mark it
|
|
qt->m_isRequired = true;
|
|
}
|
|
|
|
|
|
// . for query 'to be or not to be shakespeare'
|
|
// require 'tobe' 'beor' 'tobe' because
|
|
// they are bigrams in the wikipedia phrase 'to be or not to be'
|
|
// and they all consist solely of query stop words. as of
|
|
// 8/20/2012 i took 'not' off the query stop word list.
|
|
// . require bigrams that consist of 2 query stop words and
|
|
// are in a wikipedia phrase. set termSign to '+' i guess?
|
|
// . for 'in the nick' , a wiki phrase, make "in the" required
|
|
// and give a big bonus for "the nick" below.
|
|
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
|
|
QueryTerm *qt = &m_qterms[i];
|
|
// don't require if negative
|
|
if ( qt->m_termSign == '-' ) continue;
|
|
// only check bigrams here
|
|
if ( ! qt->m_isPhrase ) continue;
|
|
// get the query word that starts this phrase
|
|
const QueryWord *qw1 = qt->m_qword;
|
|
// must be in a wikiphrase
|
|
if ( qw1->m_wikiPhraseId <= 0 ) continue;
|
|
// what query word # is that?
|
|
int32_t qwn = qw1 - m_qwords;
|
|
// get the next alnum word after that
|
|
// assume it's the last word in our bigram phrase
|
|
const QueryWord *qw2 = &m_qwords[qwn+2];
|
|
// must be in same wikiphrase
|
|
if ( qw2->m_wikiPhraseId != qw1->m_wikiPhraseId ) continue;
|
|
// must be two stop words
|
|
if ( ! qw1->m_isQueryStopWord ) continue;
|
|
if ( ! qw2->m_isQueryStopWord ) continue;
|
|
// mark it
|
|
qt->m_isRequired = true;
|
|
}
|
|
|
|
// . for query 'to be or not to be shakespeare'
|
|
// give big bonus for 'ornot' and 'notto' bigram terms because
|
|
// the single terms 'or' and 'to' are ignored and because
|
|
// 'to be or not to be' is a wikipedia phrase
|
|
// . on 8/20/2012 i took 'not' off the query stop word list.
|
|
// . now give a big bonus for bigrams whose two terms are in the
|
|
// same wikipedia phrase and one and only one of the terms in
|
|
// the bigram is a query stop word
|
|
// . in general 'ornot' is considered a "synonym" of 'not' and
|
|
// gets hit with a .90 score factor, but that should never
|
|
// happen, it should be 1.00 and in this special case it should
|
|
// be 1.20
|
|
// . so for 'time enough for love' the phrase term "enough for"
|
|
// gets its m_isWikiHalfStopBigram set AND that phrase term
|
|
// is a synonym term of the single word term "enough" and is treated
|
|
// as such in the Posdb.cpp logic.
|
|
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
|
|
QueryTerm *qt = &m_qterms[i];
|
|
// assume not!
|
|
qt->m_isWikiHalfStopBigram = false;
|
|
// don't require if negative
|
|
if ( qt->m_termSign == '-' ) continue;
|
|
// only check bigrams here
|
|
if ( ! qt->m_isPhrase ) continue;
|
|
// get the query word that starts this phrase
|
|
const QueryWord *qw1 = qt->m_qword;
|
|
// must be in a wikiphrase
|
|
if ( qw1->m_wikiPhraseId <= 0 ) continue;
|
|
// what query word # is that?
|
|
int32_t qwn = qw1 - m_qwords;
|
|
// get the next alnum word after that
|
|
// assume it's the last word in our bigram phrase
|
|
const QueryWord *qw2 = &m_qwords[qwn+2];
|
|
// must be in same wikiphrase
|
|
if ( qw2->m_wikiPhraseId != qw1->m_wikiPhraseId ) continue;
|
|
// if both query stop words, should have been handled above
|
|
// we need one to be a query stop word and the other not
|
|
// for this algo
|
|
if ( qw1->m_isQueryStopWord && qw2->m_isQueryStopWord )
|
|
continue;
|
|
// skip if neither is a query stop word
|
|
if ( ! qw1->m_isQueryStopWord&& ! qw2->m_isQueryStopWord )
|
|
continue;
|
|
// one must be a stop word i guess
|
|
// so for 'the time machine' we do not count 'time machine'
|
|
// as a halfstopwikibigram
|
|
if ( ! qw1->m_isQueryStopWord && ! qw2->m_isQueryStopWord )
|
|
continue;
|
|
|
|
// special flag
|
|
qt->m_isWikiHalfStopBigram = true;
|
|
}
|
|
|
|
if(g_conf.m_logTraceQuery)
|
|
traceTermsToLog("final query-terms");
|
|
|
|
return true;
|
|
}
|
|
|
|
bool Query::setQWords ( char boolFlag ,
|
|
bool keepAllSingles ,
|
|
Phrases &phrases ) {
|
|
|
|
// . break query up into Words and phrases
|
|
// . because we now deal with boolean queries, we make parentheses
|
|
// their own separate Word, so tell "words" we're setting a query
|
|
plain_tokenizer_phase_1(m_filteredQuery.getBufStart(), m_filteredQuery.length(), &m_tr);
|
|
calculate_tokens_hashes(&m_tr);
|
|
|
|
//hackety-hack...
|
|
//The tokenizer phase 2 also recognizes "C++" and "john's", but we cannot use phase 2 because Phrases and Query are
|
|
//incompatible with phase-2 tokens (too many assumptions about strictly increasing positions and contiguous memory layout)
|
|
//So instead we implement special cases here, until we have time to fix the whole Query class.
|
|
for(size_t i=0; i+1<m_tr.size(); i++) {
|
|
//Hack for C++
|
|
if(m_tr[i].is_alfanum && !m_tr[i+1].is_alfanum &&
|
|
m_tr[i].token_len==1 && (m_tr[i].token_start[0]=='c' || m_tr[i].token_start[0]=='C') &&
|
|
m_tr[i+1].token_len>=2 && memcmp(m_tr[i+1].token_start,"++",2)==0)
|
|
{
|
|
m_tr[i].token_len += 2;
|
|
m_tr[i].end_pos += 2;
|
|
m_tr[i+1].start_pos += 2;
|
|
m_tr[i+1].token_start += 2;
|
|
m_tr[i+1].token_len -= 2;
|
|
if(m_tr[i+1].token_len==0)
|
|
m_tr.tokens.erase(m_tr.tokens.begin()+i+1);
|
|
continue;
|
|
}
|
|
//Hack for F#
|
|
if(m_tr[i].is_alfanum && !m_tr[i+1].is_alfanum &&
|
|
m_tr[i].token_len==1 && (m_tr[i].token_start[0]=='f' || m_tr[i].token_start[0]=='F') &&
|
|
m_tr[i+1].token_len>=1 && memcmp(m_tr[i+1].token_start,"#",1)==0)
|
|
{
|
|
m_tr[i].token_len += 1;
|
|
m_tr[i].end_pos += 1;
|
|
m_tr[i+1].start_pos += 1;
|
|
m_tr[i+1].token_start += 1;
|
|
m_tr[i+1].token_len -= 1;
|
|
if(m_tr[i+1].token_len==0)
|
|
m_tr.tokens.erase(m_tr.tokens.begin()+i+1);
|
|
continue;
|
|
}
|
|
//Hack for C#
|
|
if(m_tr[i].is_alfanum && !m_tr[i+1].is_alfanum &&
|
|
m_tr[i].token_len==1 && (m_tr[i].token_start[0]=='c' || m_tr[i].token_start[0]=='C') &&
|
|
m_tr[i+1].token_len>=1 && memcmp(m_tr[i+1].token_start,"#",1)==0)
|
|
{
|
|
m_tr[i].token_len += 1;
|
|
m_tr[i].end_pos += 1;
|
|
m_tr[i+1].start_pos += 1;
|
|
m_tr[i+1].token_start += 1;
|
|
m_tr[i+1].token_len -= 1;
|
|
if(m_tr[i+1].token_len==0)
|
|
m_tr.tokens.erase(m_tr.tokens.begin()+i+1);
|
|
continue;
|
|
}
|
|
//Hack for A* (the code below joins a literal 'A' with a following '*')
|
|
if(m_tr[i].is_alfanum && !m_tr[i+1].is_alfanum &&
|
|
m_tr[i].token_len==1 && m_tr[i].token_start[0]=='A' &&
|
|
m_tr[i+1].token_len>=1 && memcmp(m_tr[i+1].token_start,"*",1)==0)
|
|
{
|
|
m_tr[i].token_len += 1;
|
|
m_tr[i].end_pos += 1;
|
|
m_tr[i+1].start_pos += 1;
|
|
m_tr[i+1].token_start += 1;
|
|
m_tr[i+1].token_len -= 1;
|
|
if(m_tr[i+1].token_len==0)
|
|
m_tr.tokens.erase(m_tr.tokens.begin()+i+1);
|
|
continue;
|
|
}
|
|
//Hack for possessive-apostrophe (no need for extra codepoint checks - people usually don't type them in a search field)
|
|
if(i+2<m_tr.size() &&
|
|
m_tr[i].is_alfanum && !m_tr[i+1].is_alfanum && m_tr[i+2].is_alfanum &&
|
|
m_tr[i+1].token_len==1 && (m_tr[i+1].token_start[0]=='\'' || m_tr[i+1].token_start[0]=='`') &&
|
|
m_tr[i+2].token_len==1 && (m_tr[i+2].token_start[0]=='s' || m_tr[i+2].token_start[0]=='S'))
|
|
{
|
|
m_tr[i].end_pos = m_tr[i+2].end_pos;
|
|
m_tr[i].token_len += m_tr[i+1].token_len + m_tr[i+2].token_len;
|
|
m_tr.tokens.erase(m_tr.tokens.begin()+i+1,m_tr.tokens.begin()+i+3);
|
|
continue;
|
|
}
|
|
}
|
|
for(size_t i=0; i+2<m_tr.size(); ) {
|
|
const auto &t0 = m_tr[i+0];
|
|
const auto &t1 = m_tr[i+1];
|
|
const auto &t2 = m_tr[i+2];
|
|
if(t0.token_end()==t1.token_start && t1.token_end()==t2.token_start &&
|
|
is_slash_abbreviation(t0.token_start, t0.token_len+t1.token_len+t2.token_len))
|
|
{
|
|
size_t sl = t0.token_len+t2.token_len;
|
|
char *s = (char*)m_tr.egstack.alloc(sl);
|
|
memcpy(s, t0.token_start, t0.token_len);
|
|
memcpy(s+t0.token_len, t2.token_start, t2.token_len);
|
|
m_tr.tokens.emplace_back(t0.start_pos, t2.end_pos, s,sl, false, true);
|
|
m_tr.tokens.erase(m_tr.tokens.begin()+i, m_tr.tokens.begin()+i+3);
|
|
} else
|
|
i++;
|
|
}
|
|
|
|
int32_t numWords = m_tr.size();
|
|
// truncate it
|
|
if ( numWords > ABS_MAX_QUERY_WORDS ) {
|
|
log("query: Had %" PRId32" words. Max is %" PRId32". Truncating.",
|
|
numWords,(int32_t)ABS_MAX_QUERY_WORDS);
|
|
numWords = ABS_MAX_QUERY_WORDS;
|
|
m_truncated = true;
|
|
}
|
|
m_numWords = numWords;
|
|
// alloc the mem if we need to (mdw left off here)
|
|
int32_t need = m_numWords * sizeof(QueryWord);
|
|
// sanity check
|
|
if ( m_qwords ) { g_process.shutdownAbort(true); }
|
|
// point m_qwords to our generic buffer if it will fit
|
|
if(!m_queryWordBuf.reserve(need)) {
|
|
log(LOG_WARN, "query: Could not allocate mem for query.");
|
|
return false;
|
|
}
|
|
m_qwords = (QueryWord *)m_queryWordBuf.getBufStart();
|
|
// reset safebuf in there
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ )
|
|
m_qwords[i].constructor();
|
|
|
|
// is all alpha chars in query in upper case? caps lock on?
|
|
bool allUpper = true;
|
|
const char *p = m_filteredQuery.getBufStart();
|
|
const char *pend = m_filteredQuery.getBufPtr();
|
|
for ( ; p < pend ; p += getUtf8CharSize(p) )
|
|
if ( is_alpha_utf8 ( p ) && ! is_upper_utf8 ( p ) ) {
|
|
allUpper = false; break; }
|
|
|
|
// . come back here from below when we detect that query is not boolean
|
|
// . we need to redo the bits cuz they may have been messed with below
|
|
// redo:
|
|
// field code we are in
|
|
field_code_t fieldCode = FIELD_UNSET;
|
|
char fieldSign = 0;
|
|
const char *field = NULL;
|
|
int32_t fieldLen = 0;
|
|
// keep track of the start of different chunks of quotes
|
|
int32_t quoteStart = -1;
|
|
bool inQuotes = false;
|
|
//bool inVQuotes = false;
|
|
char quoteSign = 0;
|
|
// the current little sign
|
|
char wordSign = 0;
|
|
// when reading first word in link: ... field we skip the following
|
|
// words until we hit a space because we hash them all together
|
|
bool ignoreTilSpace = false;
|
|
// assume we're NOT a boolean query
|
|
m_isBoolean = false;
|
|
// used to not respect the bool operator if it is the first word
|
|
bool firstWord = true;
|
|
|
|
// the query processing is broken into 3 stages.
|
|
|
|
// . STAGE #1
|
|
// . reset all query words to default
|
|
// set all m_ignoreWord and m_ignorePhrase to IGNORE_DEFAULT
|
|
// . set m_isFieldName, m_fieldCode and m_quoteStart for query words.
|
|
// no field names in quotes. +title:"hey there".
|
|
// set m_quoteStart to -1 if not in quotes.
|
|
// . if quotes immediately follow field code's ':' then distribute
|
|
// the field code to all words in the quotes
|
|
// . distribute +/- signs across quotes and fields to m_wordSigns.
|
|
// support -title:"hey there".
|
|
// . set m_quoteStart to -1 if only one alnum word is
|
|
// in quotes, what's the point of that?
|
|
// . set boolean op codes (m_opcode). cannot be in quotes.
|
|
// cannot have a field code. cannot have a word sign (+/-).
|
|
// . set m_wordId of FIELD_LINK, _URL, _SITE, _IP fields.
|
|
// m_wordId of first should be hash of the whole field value.
|
|
// only set its m_ignoreWord to 0, keep it's m_ignorePhrase to DEF.
|
|
// . set m_ignore of non-op codes, non-fieldname, alnum words to 0.
|
|
// . set m_wordId of each non-ignored alnum word.
|
|
|
|
// . STAGE #2
|
|
// . customize Bits class:
|
|
// first alnum word can start phrase.
|
|
// first alnum word in quotes (m_quoteStart >= 0 ) can start phrase.
|
|
// connected on the right but not on the left.. can start phrase.
|
|
// no pair across any double quote
|
|
// no pair across ".." --- UNLESS in quotes!
|
|
// no pair across any change of field code.
|
|
// field names may not be part of any phrase or paired across.
|
|
// boolean ops may not be part of any phrase or paired across.
|
|
// ignored words may not be part of any phrase or paired across.
|
|
|
|
// . STAGE #3
|
|
// . set phrases class w/ custom Bits class mods.
|
|
// . set m_bigramId and m_rawPhraseId of all QueryWords. if phraseId
|
|
// is not 0 (phrase exists) then set m_ignorePhrase to 0.
|
|
// . set m_leftConnected, m_rightConnected. word you are connecting
|
|
// to must not be ignored. (no field names or op codes).
|
|
// ensure you are in a phrase with the connected word, too, to
|
|
// really be connected.
|
|
// . set m_leftPhraseStart and m_rightPhraseEnd for all
|
|
// m_inQuotePhrase is not needed since if m_quoteStart is >= 0
|
|
// we MUST be in a quoted phrase!
|
|
// . if word is Connected then set m_ignoreWord to IGNORE_CONNECTED.
|
|
// set his m_phraseSign to m_wordSign (if not 0) or '*' (if it is 0).
|
|
// m_wordSign may have inherited quote or field sign.
|
|
// . if word's m_quoteStart is >= 0 set m_ignoreWord to IGNORE_QUOTED
|
|
// set his m_phraseSign to m_wordSign (if not 0) or '*' (if it is 0)
|
|
// m_wordSign may have inherited quote or field sign.
|
|
// . if one word in a phrase is negative, then set m_phraseSign to '-'
|
|
|
|
// set the Bits used for making phrases from the Words class
|
|
Bits bits;
|
|
if ( !bits.set(&m_tr)) {
|
|
log(LOG_WARN, "query: Had error processing query: %s.", mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
|
|
float userWeightForWord = 1;
|
|
float userWeightForPhrase = 1;
|
|
float userWeightForSynonym = 1;
|
|
bool userNotRequiredForWord = false;
|
|
int32_t ignorei = -1;
|
|
|
|
// assume we contain no pipe operator
|
|
int32_t pi = -1;
|
|
|
|
int32_t posNum = 0;
|
|
const char *ignoreTill = NULL;
|
|
|
|
// loop over all words, these QueryWords are 1-1 with "words"
|
|
for ( int32_t i = 0 ; i < numWords && i < ABS_MAX_QUERY_WORDS ; i++ ) {
|
|
// convenience var, these are 1-1 with "words"
|
|
QueryWord *qw = &m_qwords[i];
|
|
// set to defaults?
|
|
memset ( qw , 0 , sizeof(QueryWord) );
|
|
// but quotestart should be -1
|
|
qw->m_quoteStart = -1;
|
|
qw->m_leftPhraseStart = -1;
|
|
// assume QueryWord is ignored by default
|
|
qw->m_ignoreWord = IGNORE_DEFAULT;
|
|
qw->m_ignorePhrase = IGNORE_DEFAULT;
|
|
qw->m_ignoreWordInBoolQuery = false;
|
|
qw->m_word = m_tr[i].token_start;
|
|
qw->m_wordLen = m_tr[i].token_len;
|
|
qw->m_isPunct = !m_tr[i].is_alfanum;
|
|
|
|
qw->m_posNum = posNum;
|
|
|
|
// count 1 unit for it
|
|
posNum++;
|
|
|
|
// we ignore the facet value range list...
|
|
if ( ignoreTill && qw->m_word < ignoreTill )
|
|
continue;
|
|
|
|
// . we duplicated this code from XmlDoc.cpp's
|
|
// getWordPosVec() function
|
|
if ( qw->m_isPunct ) { // ! wids[i] ) {
|
|
const char *wp = qw->m_word;
|
|
int32_t wplen = qw->m_wordLen;
|
|
// simple space or sequence of just white space
|
|
if ( is_wspace_utf8_string(m_tr[i].token_start, m_tr[i].token_end()))
|
|
posNum += 0;
|
|
// 'cd-rom'
|
|
else if ( wp[0]=='-' && wplen==1 )
|
|
posNum += 0;
|
|
// 'mr. x'
|
|
else if ( wp[0]=='.' && is_wspace_utf8_string(m_tr[i].token_start+1, m_tr[i].token_end()))
|
|
posNum += 0;
|
|
// animal (dog)
|
|
else
|
|
posNum++;
|
|
}
|
|
|
|
const char *w = m_tr[i].token_start;
|
|
int32_t wlen = m_tr[i].token_len;
|
|
// assume it is a query weight operator
|
|
qw->m_queryOp = true;
|
|
// ignore it? (this is for query weight operators)
|
|
if ( i <= ignorei ) continue;
|
|
// deal with pipe operators
|
|
if ( wlen == 5 &&
|
|
w[0]=='P'&&w[1]=='i'&&w[2]=='i'&&w[3]=='P'&&w[4]=='E') {
|
|
pi = i;
|
|
qw->m_opcode = opcode_t::OP_PIPE;
|
|
continue;
|
|
}
|
|
// [133.0r]
|
|
// is it the bracket operator?
|
|
// " LeFtB 113 rp RiGhB "
|
|
if ( wlen == 5 &&
|
|
w[0]=='L'&&w[1]=='e'&&w[2]=='F'&&w[3]=='t'&&w[4]=='B'&&
|
|
i+4 < numWords ) {
|
|
// s MUST point to a number
|
|
const char *s = m_tr[i+2].token_start;
|
|
int32_t slen = m_tr[i+2].token_len;
|
|
|
|
// if no number, it must be
|
|
// " leFtB w RiGhB " or " leFtB p RiGhB "
|
|
if ( ! is_digit(s[0]) ) {
|
|
if(s[0] == 'w') {
|
|
// word weight reset
|
|
userWeightForWord = 1;
|
|
ignorei = i + 4;
|
|
} else if(s[0] == 'p') {
|
|
// phrase weight reset
|
|
userWeightForPhrase = 1;
|
|
} else if(s[0] == 's') {
|
|
// phrase weight reset
|
|
userWeightForSynonym = 1;
|
|
} else if(s[0] == 'n' && s[1] == 'r' && s[2] == 'w') {
|
|
// set word as not required
|
|
userNotRequiredForWord = true;
|
|
}
|
|
ignorei = i + 4;
|
|
} else {
|
|
// get the number
|
|
float fval = atof2 (s, slen);
|
|
// s2 MUST point to the a,r,ap,rp string
|
|
const char *s2 = m_tr[i+4].token_start;
|
|
// is it a phrase?
|
|
if(s2[0] == 'w') {
|
|
userWeightForWord = fval;
|
|
} else if(s2[0] == 'p') {
|
|
userWeightForPhrase = fval;
|
|
} else if(s2[0] == 's') {
|
|
userWeightForSynonym = fval;
|
|
}
|
|
// ignore all following words up and inc. i+6
|
|
ignorei = i + 6;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// assign score weight, if any for this guy
|
|
qw->m_userWeightForWord = userWeightForWord;
|
|
qw->m_userWeightForPhrase = userWeightForPhrase;
|
|
qw->m_userWeightForSynonym = userWeightForSynonym;
|
|
// Set required state based on user input
|
|
qw->m_userNotRequiredForWord = userNotRequiredForWord;
|
|
qw->m_queryOp = false;
|
|
|
|
|
|
// does word #i have a space in it? that will cancel fieldCode
|
|
// if we were in a field
|
|
bool endField = false;
|
|
if(has_space(m_tr[i].token_start, m_tr[i].token_end()) && ! inQuotes)
|
|
endField = true;
|
|
// TODO: fix title:" hey there" (space in quotes is ok)
|
|
// if there's a quote before the first space then
|
|
// it's ok!!!
|
|
if ( endField ) {
|
|
const char *s = m_tr[i].token_start;
|
|
const char *send = s + m_tr[i].token_len;
|
|
for ( ; s < send ; s++ ) {
|
|
// if the space is inside the quotes then it
|
|
// doesn't count!
|
|
if(*s == '\"') {
|
|
endField = false;
|
|
break;
|
|
}
|
|
if(is_wspace_a(*s))
|
|
break;
|
|
}
|
|
}
|
|
// cancel the field if we hit a space (not in quotes)
|
|
if ( endField ) {
|
|
// cancel the field
|
|
fieldCode = FIELD_UNSET;
|
|
fieldLen = 0;
|
|
field = NULL;
|
|
// we no longer have to ignore for link: et al
|
|
ignoreTilSpace = false;
|
|
}
|
|
// . maintain inQuotes and quoteStart
|
|
// . quoteStart is the word # that starts the current quote
|
|
int32_t nq = count_quotes(m_tr[i].token_start, m_tr[i].token_len);
|
|
|
|
if ( nq > 0 ) { // && ! ignoreQuotes ) {
|
|
// toggle quotes if we need to
|
|
if ( nq & 0x01 ) inQuotes = ! inQuotes;
|
|
// set quote sign to sign before the quote
|
|
if ( inQuotes ) {
|
|
quoteSign = '\0';
|
|
for ( const char *p = w + wlen - 1 ; p > w ; p--){
|
|
if ( *p != '\"' ) continue;
|
|
if ( *(p-1) == '-' ) quoteSign = '-';
|
|
if ( *(p-1) == '+' ) quoteSign = '+';
|
|
break;
|
|
}
|
|
}
|
|
// . quoteStart is the word # the quotes started at
|
|
// . it is -1 if not in quotes
|
|
// . now we set it to the alnum word AFTER us!!
|
|
if ( inQuotes && i+1< numWords ) quoteStart = i+1;
|
|
else quoteStart = -1;
|
|
}
|
|
//log(LOG_DEBUG, "Query: nq: %" PRId32" inQuotes: %d,quoteStart: %" PRId32,
|
|
// nq, inQuotes, quoteStart);
|
|
// does word #i have a space in it? that will cancel fieldCode
|
|
// if we were in a field
|
|
// TODO: fix title:" hey there" (space in quotes is ok)
|
|
bool cancelField = false;
|
|
if ( has_space(m_tr[i].token_start, m_tr[i].token_end()) && ! inQuotes )
|
|
cancelField = true;
|
|
// fix title:"foo bar" "another quote" so "another quote"
|
|
// is not in the title: field
|
|
if ( has_space(m_tr[i].token_start, m_tr[i].token_end()) && inQuotes && nq>= 2 )
|
|
cancelField = true;
|
|
|
|
// BUT if we have a quote, and they just got turned off,
|
|
// and the space is not after the quote, do not cancel field!
|
|
if ( nq == 1 && cancelField ) {
|
|
// if we hit the space BEFORE the quote, do NOT cancel
|
|
// the field
|
|
for ( const char *p = w + wlen - 1 ; p > w ; p--) {
|
|
// hey, we got the quote first, keep field
|
|
if ( *p == '\"' ) {cancelField = false; break;}
|
|
// otherwise, we got space first? cancel it!
|
|
if ( is_wspace_a(*p) ) break;
|
|
}
|
|
}
|
|
if ( cancelField ) {
|
|
// cancel the field
|
|
fieldCode = FIELD_UNSET;
|
|
fieldLen = 0;
|
|
field = NULL;
|
|
// we no longer have to ignore for link: et al
|
|
ignoreTilSpace = false;
|
|
}
|
|
// skip if we should
|
|
if ( ignoreTilSpace ){
|
|
if (m_qwords[i-1].m_fieldCode){
|
|
qw->m_fieldCode = m_qwords[i-1].m_fieldCode;
|
|
}
|
|
continue;
|
|
}
|
|
// . is this word potentially a field?
|
|
// . it cannot be another field name in a field
|
|
if(i < m_numWords-2 &&
|
|
m_tr[i+1].token_len==1 && m_tr[i+1].token_start[0]==':' &&
|
|
!is_wspace_utf8_string(m_tr[i+2].token_start,m_tr[i+2].token_end()) &&
|
|
(!is_punct_utf8(m_tr[i+2].token_start) || m_tr[i+2].token_start[0]=='\"' || m_tr[i+2].token_start[0]=='-') &&
|
|
! fieldCode && ! inQuotes)
|
|
{
|
|
// field name may have started before though if it
|
|
// was a compound field name containing hyphens,
|
|
// underscores or periods
|
|
int32_t j = i-1 ;
|
|
while ( j > 0 &&
|
|
((m_qwords[j].m_rawWordId != 0) ||
|
|
( m_qwords[j].m_wordLen ==1 &&
|
|
((m_qwords[j].m_word)[0]=='-' ||
|
|
(m_qwords[j].m_word)[0]=='_' ||
|
|
(m_qwords[j].m_word)[0]=='.')))) {
|
|
j--;
|
|
}
|
|
|
|
if ( j < 0 ) {
|
|
j = 0;
|
|
}
|
|
|
|
// advance j to a non-punct word
|
|
while (!m_tr[j].is_alfanum)
|
|
j++;
|
|
|
|
// ignore all of these words then,
|
|
// they're part of field name
|
|
int32_t tlen = 0;
|
|
for ( int32_t k = j ; k <= i ; k++ )
|
|
tlen += m_tr[k].token_len;
|
|
|
|
//is it recognized field name,like "title" or "url"?
|
|
fieldCode = getFieldCode (m_tr[j].token_start, tlen);
|
|
if(fieldCode) {
|
|
//Previously this was done in all cases to support searching for sub-sub-sub...fields in json/xml
|
|
//The downside was that copy-paste of colon-separated words or artist names like "L:Ron:Harald" didn't work.
|
|
|
|
// set field name to the compound name if it is
|
|
field = m_tr[j].token_start;
|
|
fieldLen = tlen;
|
|
if(j == i)
|
|
fieldSign = wordSign;
|
|
else
|
|
fieldSign = m_qwords[j].m_wordSign;
|
|
//FIXME: TokenizerResult does not promise that tokens that are adjacent in the source string also are adjacent in memory
|
|
// (but since Query only does phase-1 tokenization and the tokenizer currently only does tricky things in phase 2 it currently holds)
|
|
|
|
// if so, it does NOT get its own QueryWord,
|
|
// but its sign can be inherited by its members
|
|
for ( int32_t k = j ; k <= i ; k++ )
|
|
m_qwords[k].m_ignoreWord = IGNORE_FIELDNAME;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// what quote chunk are we in? this is 0 if we're not in quotes
|
|
if ( inQuotes ) qw->m_quoteStart = quoteStart ;
|
|
else qw->m_quoteStart = -1;
|
|
qw->m_inQuotes = inQuotes;
|
|
|
|
// ptr to field, if any
|
|
qw->m_fieldCode = fieldCode;
|
|
// if we are a punct word, see if we end in a sign that can
|
|
// be applied to the next word, a non-punct word
|
|
if ( !m_tr[i].is_alfanum ) {
|
|
wordSign = w[wlen-1];
|
|
if ( wordSign != '-' && wordSign != '+') wordSign = 0;
|
|
if ( wlen>1 &&!is_wspace_a (w[wlen-2]) ) wordSign = 0;
|
|
if ( i > 0 && wlen == 1 ) wordSign = 0;
|
|
|
|
// don't add any QueryWord for a punctuation word
|
|
continue;
|
|
}
|
|
|
|
// what is the sign of our term? +, -, *, ...
|
|
char mysign;
|
|
if ( fieldCode ) mysign = fieldSign;
|
|
else if ( inQuotes ) mysign = quoteSign;
|
|
else mysign = wordSign;
|
|
// are we doing default AND?
|
|
//if ( forcePlus && ! *mysign ) mysign = '+';
|
|
// store the sign
|
|
qw->m_wordSign = mysign;
|
|
// what quote chunk are we in? this is 0 if we're not in quotes
|
|
if ( inQuotes ) qw->m_quoteStart = quoteStart ;
|
|
else qw->m_quoteStart = -1;
|
|
|
|
// . get prefix hash of collection name and field
|
|
// . but first convert field to lower case
|
|
uint64_t ph;
|
|
int32_t fflen = fieldLen;
|
|
if ( fflen > 62 ) fflen = 62;
|
|
char ff[64];
|
|
to_lower3_a ( field , fflen , ff );
|
|
|
|
ph = hash64 ( ff , fflen );
|
|
// map "intitle" map to "title"
|
|
if ( fieldCode == FIELD_TITLE )
|
|
ph = hash64 ( "title", 5 );
|
|
// make "suburl" map to "inurl"
|
|
if ( fieldCode == FIELD_SUBURL )
|
|
ph = hash64 ( "inurl", 5 );
|
|
|
|
// fix for filetype:pdf queries
|
|
if ( fieldCode == FIELD_TYPE )
|
|
ph = hash64 ("type",4);
|
|
|
|
// ptr to field, if any
|
|
qw->m_fieldCode = fieldCode;
|
|
|
|
// prefix hash
|
|
qw->m_prefixHash = ph;
|
|
|
|
// if we're hashing a url:, link:, site: or ip: term,
|
|
// then we need to hash ALL up to the first space
|
|
if ( fieldCode == FIELD_URL ||
|
|
fieldCode == FIELD_EXT ||
|
|
fieldCode == FIELD_LINK ||
|
|
fieldCode == FIELD_SITELINK||
|
|
fieldCode == FIELD_LINKS||
|
|
fieldCode == FIELD_SITE ||
|
|
fieldCode == FIELD_IP ||
|
|
fieldCode == FIELD_GBFIELDMATCH ) {
|
|
// . find 1st space -- that terminates the field value
|
|
// . make "end" point to the end of the entire query
|
|
const char *end = m_tr[m_tr.size()-1].token_end();
|
|
// use this for gbmin:price:1.99 etc.
|
|
int32_t firstColonLen = -1;
|
|
int32_t lastColonLen = -1;
|
|
int32_t colonCount = 0;
|
|
|
|
// "w" points to the first alnumword after the field,
|
|
// so for site:xyz.com "w" points to the 'x' and wlen
|
|
// would be 3 in that case sinze xyz is a word of 3
|
|
// chars. so advance
|
|
// wlen until we hit a space.
|
|
while (w + wlen < end) {
|
|
// stop at first white space
|
|
if (is_wspace_utf8(w + wlen)) break;
|
|
// in case of gbmin:price:1.99 record first ':'
|
|
if (w[wlen] == ':') {
|
|
lastColonLen = wlen;
|
|
if (firstColonLen == -1)
|
|
firstColonLen = wlen;
|
|
colonCount++;
|
|
}
|
|
// fix "gbsortbyint:date)"
|
|
// these are used as boolean operators
|
|
// so do not include them in the value.
|
|
// we also did this above to set cancelField
|
|
// to true.
|
|
if (w[wlen] == '(' || w[wlen] == ')')
|
|
break;
|
|
|
|
wlen++;
|
|
}
|
|
// ignore following words until we hit a space
|
|
ignoreTilSpace = true;
|
|
// the hash. keep it case insensitive. only
|
|
// the fieldmatch stuff should be case-sensitive.
|
|
// this may change later.
|
|
uint64_t wid = hash64Lower_utf8(w, wlen, 0LL);
|
|
|
|
if (fieldCode == FIELD_GBFIELDMATCH) {
|
|
// hash the json field name. (i.e. tag.uri)
|
|
// make it case sensitive as
|
|
// seen in XmlDoc.cpp::hashFacet2().
|
|
// the other fields are hashed in
|
|
// XmlDoc.cpp::hashNumber3().
|
|
// CASE SENSITIVE!!!!
|
|
wid = hash64(w, firstColonLen, 0LL);
|
|
// if it is like
|
|
// gbfieldmatch:tag.uri:"http://xyz.com/poo"
|
|
// then we should hash the string into
|
|
// an int just like how the field value would
|
|
// be hashed when adding gbfacetstr: terms
|
|
// in XmlDoc.cpp:hashFacet2(). the hash of
|
|
// the tag.uri field, for example, is set
|
|
// in hashFacet1() and set to "val32". so
|
|
// hash it just like that does here.
|
|
const char *a = w + firstColonLen + 1;
|
|
// . skip over colon at start
|
|
if (a[0] == ':') a++;
|
|
// . skip over quotes at start/end
|
|
bool inQuotes = false;
|
|
if (a[0] == '\"') {
|
|
inQuotes = true;
|
|
a++;
|
|
}
|
|
// end of field
|
|
const char *b = a;
|
|
// if not in quotes advance until
|
|
// we hit whitespace
|
|
char cs;
|
|
for (; !inQuotes && *b; b += cs) {
|
|
cs = getUtf8CharSize(b);
|
|
if (is_wspace_utf8(b)) break;
|
|
}
|
|
// if in quotes, go until we hit quote
|
|
for (; inQuotes && *b != '\"'; b++)
|
|
;
|
|
// now hash that up. this must be 64 bit
|
|
// to match in XmlDoc.cpp::hashFieldMatch()
|
|
uint64_t val64 = hash64(a, b - a);
|
|
// make a composite of tag.uri and http://...
|
|
// just like XmlDoc.cpp::hashFacet2() does
|
|
wid = hash64(val64, wid);
|
|
}
|
|
|
|
// should we have normalized before hashing?
|
|
if (fieldCode == FIELD_URL ||
|
|
fieldCode == FIELD_LINK ||
|
|
fieldCode == FIELD_SITELINK ||
|
|
fieldCode == FIELD_LINKS ||
|
|
fieldCode == FIELD_SITE) {
|
|
Url url;
|
|
url.set( w, wlen, ( fieldCode != FIELD_SITE ), false );
|
|
|
|
if (fieldCode == FIELD_SITELINK) {
|
|
wid = hash64(url.getHost(), url.getHostLen());
|
|
} else {
|
|
wid = hash64(url.getUrl(), url.getUrlLen());
|
|
}
|
|
}
|
|
|
|
// like we do it in XmlDoc.cpp's hashString()
|
|
if (ph) {
|
|
qw->m_wordId = hash64h(wid, ph);
|
|
} else {
|
|
qw->m_wordId = wid;
|
|
}
|
|
|
|
qw->m_rawWordId = 0LL; // only for highlighting?
|
|
qw->m_bigramId = 0LL;
|
|
qw->m_rawPhraseId = 0LL;
|
|
qw->m_opcode = opcode_t::OP_NONE;
|
|
|
|
// definitely not a query stop word
|
|
qw->m_isQueryStopWord = false;
|
|
|
|
// do not ignore the wordId
|
|
qw->m_ignoreWord = IGNORE_NO_IGNORE;
|
|
|
|
// we are the first word?
|
|
firstWord = false;
|
|
|
|
// we're done with this one
|
|
continue;
|
|
}
|
|
|
|
|
|
opcode_t opcode = opcode_t::OP_NONE;
|
|
// if query is all in upper case and we're doing boolean
|
|
// DETECT, then assume not boolean
|
|
if ( allUpper && boolFlag == 2 ) boolFlag = 0;
|
|
// . is this word a boolean operator?
|
|
// . cannot be in quotes or field
|
|
if ( boolFlag >= 1 && ! inQuotes && ! fieldCode ) {
|
|
// are we an operator?
|
|
if ( ! firstWord && wlen==2 &&
|
|
w[0]=='O' && w[1]=='R')
|
|
opcode = opcode_t::OP_OR;
|
|
else if ( ! firstWord && wlen==3 &&
|
|
w[0]=='A' && w[1]=='N' && w[2]=='D')
|
|
opcode = opcode_t::OP_AND;
|
|
else if ( ! firstWord && wlen==3 &&
|
|
w[0]=='N' && w[1]=='O' && w[2]=='T')
|
|
opcode = opcode_t::OP_NOT;
|
|
else if ( wlen==5 && w[0]=='L' && w[1]=='e' &&
|
|
w[2]=='F' && w[3]=='t' && w[4]=='P' )
|
|
opcode = opcode_t::OP_LEFTPAREN;
|
|
else if ( wlen==5 && w[0]=='R' && w[1]=='i' &&
|
|
w[2]=='G' && w[3]=='h' && w[4]=='P' )
|
|
opcode = opcode_t::OP_RIGHTPAREN;
|
|
// no pair across or even include any boolean op phrs
|
|
if ( opcode != opcode_t::OP_NONE ) {
|
|
bits.clearBits(i,D_CAN_PAIR_ACROSS);
|
|
bits.clearBits(i,D_CAN_BE_IN_PHRASE);
|
|
qw->m_ignoreWord = IGNORE_BOOLOP;
|
|
qw->m_opcode = opcode;
|
|
if ( opcode == opcode_t::OP_LEFTPAREN ) continue;
|
|
if ( opcode == opcode_t::OP_RIGHTPAREN ) continue;
|
|
// if this is uncommented all of our operators
|
|
// become actual query terms (mdw)
|
|
if ( opcode == opcode_t::OP_UOR ) continue;
|
|
// if you just have ANDs and ()'s that does
|
|
// not make you a boolean query! we are bool
|
|
// by default!!
|
|
if ( opcode == opcode_t::OP_AND ) continue;
|
|
m_isBoolean = true;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// . add single-word term id
|
|
// . this is computed by hash64AsciiLower()
|
|
// . but only hash64Lower_a if _HASHWITHACCENTS_ is true
|
|
uint64_t wid = m_tr[i].token_hash;
|
|
qw->m_rawWordId = wid;
|
|
// we now have a first word already set
|
|
firstWord = false;
|
|
// . are we a QUERY stop word?
|
|
// . NEVER count as stop word if it's in all CAPS and
|
|
// not all letters in the whole query is NOT in all CAPS
|
|
// . It's probably an acronym
|
|
if ( m_tr[i].token_len>1 &&
|
|
is_upper_utf8_string(m_tr[i].token_start, m_tr[i].token_end()) &&
|
|
! allUpper )
|
|
{
|
|
qw->m_isQueryStopWord = false;
|
|
qw->m_isStopWord = false;
|
|
} else {
|
|
qw->m_isQueryStopWord =::isQueryStopWord (w,wlen,wid,
|
|
m_langId);
|
|
// . BUT, if it is a single letter contraction thing
|
|
// . ninad: make this == 1 if in utf8! TODO!! it is!
|
|
if ( i>0 && wlen == 1 && w[-1] == '\'' )
|
|
qw->m_isQueryStopWord = true;
|
|
qw->m_isStopWord =::isStopWord (w,wlen,wid);
|
|
}
|
|
// . do not count as query stop word if it is the last in query
|
|
// . like the query: 'baby names that start with j'
|
|
if ( i + 2 > numWords ) {
|
|
qw->m_isQueryStopWord = false;
|
|
}
|
|
|
|
// like we do it in XmlDoc.cpp's hashString()
|
|
if ( ph ) {
|
|
qw->m_wordId = hash64(wid, ph);
|
|
} else {
|
|
qw->m_wordId = wid;
|
|
}
|
|
|
|
// do not ignore the word
|
|
qw->m_ignoreWord = IGNORE_NO_IGNORE;
|
|
|
|
//except if it is a high-frequency-term and expensive to look up. In that case ignore the word but keep the phrases/bigrams thereof
|
|
uint64_t termId = (qw->m_wordId & TERMID_MASK);
|
|
if(g_conf.m_useHighFrequencyTermCache &&
|
|
m_allowHighFreqTermCache && g_hfts.is_registered_term(termId)) {
|
|
log(LOG_DEBUG, "query: term='%.*s' with termId %lu is a highfreq term. Marking it for ignoring", wlen, w, termId);
|
|
qw->m_ignoreWord = IGNORE_HIGHFREMTERM;
|
|
}
|
|
|
|
// reset for next word
|
|
userNotRequiredForWord = false;
|
|
}
|
|
|
|
//If there's only one alphanumerical word and it was ignored due to high-freq-term then the query is treated as 0 terms and will return an empty
|
|
//result. Therefore un-ignore the single word and let it fetch (best-efort) results from the high-freq-term-cache
|
|
int numAlfanumWords = 0;
|
|
int numAlfanumWordsHighFreqTerms = 0;
|
|
int alfanumWordIndex = -1;
|
|
for(int i=0; i<numWords; i++) {
|
|
if(m_tr[i].is_alfanum) {
|
|
alfanumWordIndex = i;
|
|
numAlfanumWords++;
|
|
if(m_qwords[i].m_ignoreWord==IGNORE_HIGHFREMTERM)
|
|
numAlfanumWordsHighFreqTerms++;
|
|
|
|
}
|
|
}
|
|
if(numAlfanumWords == 1 && numAlfanumWordsHighFreqTerms==1)
|
|
m_qwords[alfanumWordIndex].m_ignoreWord = IGNORE_NO_IGNORE;
|
|
|
|
// pipe those that should be piped
|
|
for ( int32_t i = 0 ; i < pi ; i++ ) m_qwords[i].m_piped = true;
|
|
|
|
// . set m_leftConnected and m_rightConnected
|
|
// . we are connected to the first non-punct word on our left
|
|
// if we are separated by a small $ of defined punctuation
|
|
// . see getIsConnection() for that definition
|
|
// . this allows us to just lookup the phrase for things like
|
|
// "cd-rom" rather than lookup "cd" , "rom" and "cd-rom"
|
|
// . skip if prev word is IGNORE_BOOLOP, IGNORE_FIELDNAME or
|
|
// IGNORE_DEFAULT
|
|
// . we have to set outside the main loop above since we check
|
|
// the m_ignoreWord member of the i+2nd word
|
|
for ( int32_t i = 0 ; i < numWords ; i++ ) {
|
|
QueryWord *qw = &m_qwords[i];
|
|
if ( qw->m_ignoreWord ) continue;
|
|
if ( i + 2 < numWords && ! m_qwords[i+2].m_ignoreWord&&
|
|
isConnection(i+1) )
|
|
qw->m_rightConnected = true;
|
|
if ( i - 2 >= 0 && ! m_qwords[i-2].m_ignoreWord &&
|
|
isConnection(i-1) )
|
|
qw->m_leftConnected = true;
|
|
}
|
|
|
|
// now modify the Bits class before generating phrases
|
|
for ( int32_t i = 0 ; i < numWords ; i++ ) {
|
|
// get default bits
|
|
unsigned char b = bits.queryBits(i);
|
|
// allow pairing across anything by default
|
|
b |= D_CAN_PAIR_ACROSS;
|
|
// get Query Word
|
|
QueryWord *qw = &m_qwords[i];
|
|
// . skip if part of a query weight operator
|
|
// . cannot be in a phrase, or anything
|
|
if ( qw->m_queryOp && qw->m_opcode==opcode_t::OP_NONE) {
|
|
b = D_CAN_PAIR_ACROSS;
|
|
}
|
|
// is this word a sequence of punctuation and spaces?
|
|
else if ( !m_tr[i].is_alfanum ) {
|
|
// pair across ANY punct, even double spaces by default
|
|
b |= D_CAN_PAIR_ACROSS;
|
|
// but do not pair across anything with a quote in it
|
|
if ( count_quotes(m_tr[i].token_start, m_tr[i].token_len) > 0 )
|
|
b &= ~D_CAN_PAIR_ACROSS;
|
|
// continue if we're in quotes
|
|
else if ( qw->m_quoteStart >= 0 ) goto next;
|
|
// continue if we're in a field
|
|
else if ( qw->m_fieldCode > 0 ) goto next;
|
|
// if guy on left is in field, do not pair across
|
|
if ( i > 0 && m_qwords[i-1].m_fieldCode > 0 )
|
|
b &= ~D_CAN_PAIR_ACROSS;
|
|
// or if guy on right in field
|
|
if ( i +1 < numWords && m_qwords[i+1].m_fieldCode > 0 )
|
|
b &= ~D_CAN_PAIR_ACROSS;
|
|
// do not pair across ".." when not in quotes/field
|
|
const char *w = m_tr[i].token_start;
|
|
int32_t wlen = m_tr[i].token_len;
|
|
for ( int32_t j = 0 ; j < wlen-1 ; j++ ) {
|
|
if ( w[j ]!='.' ) continue;
|
|
if ( w[j+1]!='.' ) continue;
|
|
b &= ~D_CAN_PAIR_ACROSS;
|
|
break;
|
|
}
|
|
}
|
|
else {
|
|
// . no field names, bool operators, cruft in fields
|
|
// can be any part of a phrase
|
|
// . no pair across any change of field code
|
|
// . 'girl title:boy' --> no "girl title" phrase!
|
|
if ( qw->m_ignoreWord && qw->m_ignoreWord!=IGNORE_HIGHFREMTERM ) {
|
|
b &= ~D_CAN_PAIR_ACROSS;
|
|
b &= ~D_CAN_BE_IN_PHRASE;
|
|
}
|
|
// . no boolean ops
|
|
// . 'this OR that' --> no "this OR that" phrase
|
|
if ( qw->m_opcode != opcode_t::OP_NONE ) {
|
|
b &= ~D_CAN_PAIR_ACROSS;
|
|
b &= ~D_CAN_BE_IN_PHRASE;
|
|
}
|
|
if ( qw->m_wordSign == '-' && qw->m_quoteStart < 0) {
|
|
b &= ~D_CAN_PAIR_ACROSS;
|
|
b &= ~D_CAN_BE_IN_PHRASE;
|
|
}
|
|
|
|
}
|
|
next:
|
|
// set it back all tweaked
|
|
bits.assignBits(i,b);
|
|
}
|
|
|
|
// treat strongly connected phrases like cd-rom and 3.2.0.3 as being
|
|
// in quotes for the most part, therefore, set m_quoteStart for them
|
|
int32_t j;
|
|
int32_t qs = -1;
|
|
for ( j = 0 ; j < numWords ; j++ ) {
|
|
// skip all but strongly connected words
|
|
if ( m_qwords[j].m_ignoreWord != IGNORE_CONNECTED &&
|
|
// must also be non punct word OR a space
|
|
( m_tr[j].is_alfanum || *m_tr[j].token_start==' ' ) ) {
|
|
// break the "quote", if any
|
|
qs = -1; continue; }
|
|
// if he is punctuation and qs is -1, skip him,
|
|
// punctuation words can no longer start a quote
|
|
if ( !m_tr[j].is_alfanum && qs == -1 ) continue;
|
|
// uningore him if we should
|
|
if ( keepAllSingles ) m_qwords[j].m_ignoreWord = IGNORE_NO_IGNORE;
|
|
// if already in quotes, don't bother!
|
|
if ( m_qwords[j].m_quoteStart >= 0 ) continue;
|
|
// remember him
|
|
if ( qs == -1 ) qs = j;
|
|
// he starts the phrase
|
|
m_qwords[j].m_quoteStart = qs;
|
|
// force him into a quoted phrase
|
|
m_qwords[j].m_inQuotes = true;
|
|
//m_qwords[j].m_inQuotedPhrase = true;
|
|
}
|
|
|
|
// fix for tags.uri:http://foo.com/bar so it works like
|
|
// tags.uri:"http://foo.com/bar" like it should
|
|
int32_t first = -1;
|
|
for ( j = 0 ; j < numWords ; j++ ) {
|
|
// stop when we hit spaces
|
|
if ( has_wspace_utf8_string(m_tr[j].token_start, m_tr[j].token_end()) ) {
|
|
first = -1;
|
|
continue;
|
|
}
|
|
// skip if not in field
|
|
if ( ! m_qwords[j].m_fieldCode ) continue;
|
|
// must be in a generic field, the other fields like site:
|
|
// will be messed up by this logic
|
|
if ( m_qwords[j].m_fieldCode != FIELD_GENERIC ) continue;
|
|
// first alnumword in field?
|
|
if ( first == -1 ) {
|
|
// must be alnum
|
|
if ( m_qwords[j].m_isPunct ) continue;
|
|
// must have punct then another alnum word
|
|
if ( j+2 >= numWords ) break;
|
|
// spaces screw it up
|
|
if ( has_wspace_utf8_string(m_tr[j+1].token_start, m_tr[j+1].token_end()) ) continue;
|
|
// then an alnum word after
|
|
first = j;
|
|
}
|
|
// we are in fake quoted phrase
|
|
m_qwords[j].m_inQuotes = true;
|
|
m_qwords[j].m_quoteStart = first;
|
|
}
|
|
|
|
// make the phrases from the words and the tweaked Bits class
|
|
if ( !phrases.set(m_tr,bits) )
|
|
return false;
|
|
|
|
// do phrases stuff
|
|
for ( int32_t i = 0 ; i < numWords ; i++ ) {
|
|
// get the ith QueryWord
|
|
QueryWord *qw = &m_qwords[i];
|
|
|
|
//if word is ignored (and it is not due to high-freq-term) then don't generate a phrase/bigram query term
|
|
if(qw->m_ignoreWord && qw->m_ignoreWord!=IGNORE_HIGHFREMTERM)
|
|
continue;
|
|
if ( qw->m_fieldCode && qw->m_quoteStart < 0) continue;
|
|
// get the first word # to our left that starts a phrase
|
|
// of which we are a member
|
|
qw->m_leftPhraseStart = -1;
|
|
for ( int32_t j = i - 1 ; j >= 0 ; j-- ) {
|
|
if ( ! bits.canPairAcross(j+1) ) break;
|
|
if ( !m_tr[j].is_alfanum ) continue;
|
|
|
|
qw->m_leftPhraseStart = j;
|
|
// we can't pair across alnum words now, we just want bigrams
|
|
if ( m_tr[j].is_alfanum ) break;
|
|
// now we do bigrams so only allow two words even
|
|
// if they are stop words
|
|
break;
|
|
}
|
|
// . is this word in a quoted phrase?
|
|
// . the whole phrase must be in the same set of quotes
|
|
// . if we're in a left phrase, he must be in our quotes
|
|
if ( qw->m_leftPhraseStart >= 0 &&
|
|
qw->m_quoteStart >= 0 &&
|
|
qw->m_leftPhraseStart >= qw->m_quoteStart )
|
|
qw->m_inQuotedPhrase = true;
|
|
// if we start a phrase, ensure next guy is in our quote
|
|
if ( ! qw->m_ignorePhrase && i+1 < numWords &&
|
|
m_qwords[i+1].m_quoteStart >= 0 &&
|
|
m_qwords[i+1].m_quoteStart <= i )
|
|
qw->m_inQuotedPhrase = true;
|
|
// are we the first word in the quote?
|
|
if ( i-1>=0 && qw->m_quoteStart == i )
|
|
qw->m_inQuotedPhrase = true;
|
|
// ignore single words that are in a quoted phrase
|
|
if ( ! keepAllSingles && qw->m_inQuotedPhrase )
|
|
qw->m_ignoreWord = IGNORE_QUOTED;
|
|
|
|
// . get phrase info for this term
|
|
// . a pid (phraseId)of 0 indicates it does not start a phrase
|
|
// . raw phrase termId
|
|
uint64_t pid = 0LL;
|
|
|
|
phrases.getMinWordsInPhrase(i,(int64_t *)&pid);;
|
|
|
|
// store it
|
|
qw->m_rawPhraseId = pid;
|
|
|
|
// does word #i start a phrase?
|
|
if ( pid != 0 ) {
|
|
uint64_t ph = qw->m_prefixHash ;
|
|
|
|
// like we do it in XmlDoc.cpp's hashString()
|
|
if ( ph ) qw->m_bigramId = hash64 ( pid , ph );
|
|
else qw->m_bigramId = pid;
|
|
|
|
//calculate length of phrase(bigram) in bytes
|
|
int32_t numWordsInPhrase = phrases.getNumWordsInPhrase2(i);
|
|
int phraseLen = 0;
|
|
for(int j=i; j<i+numWordsInPhrase; j++)
|
|
phraseLen += m_qwords[j].m_wordLen;
|
|
qw->m_bigramLen = phraseLen;
|
|
|
|
// do not ignore the phrase, it's valid
|
|
qw->m_ignorePhrase = IGNORE_NO_IGNORE;
|
|
}
|
|
|
|
|
|
// . phrase sign is inherited from word's sign if it's a minus
|
|
// . word sign is inherited from field, quote or right before
|
|
// the word
|
|
// . that is, all words in -"to be or not" will have a '-' sign
|
|
// . phraseId may or may not be 0 at this point
|
|
if ( qw->m_wordSign == '-' ) qw->m_phraseSign = '-';
|
|
|
|
// . dist word signs to others in the same connected string
|
|
// . use "-cd-rom x-box" w/ no connector in between
|
|
// . test queries:
|
|
// . +cd-rom +x-box
|
|
// . -cd-rom +x-box
|
|
// . -m-o-n
|
|
// . who was the first (was is a query stop word)
|
|
// . www.xxx.com
|
|
// . welcome to har.com
|
|
// . hezekiah walker the love family affair ii live at radio
|
|
// city music hall
|
|
// . fotostudio +m-o-n-a-r-t
|
|
// . fotostudio -m-o-n-a-r-t
|
|
// . i'm home
|
|
if ( qw->m_leftConnected && qw->m_leftPhraseStart >= 0 )
|
|
qw->m_wordSign = m_qwords[i-2].m_wordSign;
|
|
|
|
// . if we connected to the alnum word on our right then
|
|
// soft require the phrase (i.e. treat like a single term)
|
|
// . example: cd-rom or www.xxx.com
|
|
// . 'welcome to har.com' should get a '*' for "har.com" sign
|
|
if ( qw->m_rightConnected ) {
|
|
if ( qw->m_wordSign) qw->m_phraseSign = qw->m_wordSign;
|
|
else qw->m_phraseSign = '*';
|
|
}
|
|
|
|
// . if we're in quotes then any phrase we have should be
|
|
// soft required (i.e. treated like a single term)
|
|
// . we do not allow phrases in queries to pair across
|
|
// quotes. See where we tweak the Bits class above.
|
|
if ( qw->m_quoteStart >= 0 ) {
|
|
qw->m_phraseSign = '*';
|
|
}
|
|
|
|
// . if we are the last word in a phrase that consists of all
|
|
// PLAIN stop words then make the phrase have a '*'
|
|
// . 'to be or not to be .. test' (cannot pair across "..")
|
|
// . don't use QUERY stop words cuz of "who was the first?" qry
|
|
if ( pid ) {
|
|
int32_t nw = phrases.getNumWordsInPhrase2(i);
|
|
int32_t j;
|
|
// search up to this far
|
|
int32_t maxj = i + nw;
|
|
// but not past our truncated limit
|
|
if ( maxj > ABS_MAX_QUERY_WORDS )
|
|
maxj = ABS_MAX_QUERY_WORDS;
|
|
|
|
for ( j = i ; j < maxj ; j++ ) {
|
|
// skip punct
|
|
if ( !m_tr[j].is_alfanum ) continue;
|
|
// break out if not a stop word
|
|
if ( ! bits.isStopWord(j) ) break;
|
|
// break out if has a term sign
|
|
if ( m_qwords[j].m_wordSign ) break;
|
|
}
|
|
// if everybody in phrase #i was a signless stopword
|
|
// and the phrase was signless, make it have a '*' sign
|
|
if ( j >= maxj && m_qwords[i].m_phraseSign == '\0' )
|
|
m_qwords[i].m_phraseSign = '*';
|
|
// . if a constituent has a - sign, then the whole
|
|
// phrase becomes negative, too
|
|
// . fixes 'apple -computer' truncation problem
|
|
for ( int32_t j = i ; j < maxj ; j++ )
|
|
if ( m_qwords[j].m_wordSign == '-' )
|
|
qw->m_phraseSign = '-';
|
|
}
|
|
|
|
// . ignore unsigned QUERY stop words that are not yet ignored
|
|
// and are in unignored phrases
|
|
// . 'who was the first taiwanese president' should not get
|
|
// "who was" term sign changed to '*' because "was" is a
|
|
// QUERY stop word. So ignore singles query stop words
|
|
// in phrases now
|
|
if ( //! keepAllSingles &&
|
|
(qw->m_isQueryStopWord && !m_isBoolean) &&
|
|
m_useQueryStopWords &&
|
|
! qw->m_fieldCode &&
|
|
// fix 'the tigers'
|
|
//(qw->m_leftPhraseStart >= 0 || qw->m_bigramId > 0 ) &&
|
|
! qw->m_wordSign &&
|
|
! qw->m_ignoreWord )
|
|
qw->m_ignoreWord = IGNORE_QSTOP;
|
|
|
|
// . ignore and/or between quoted phrases, save user from
|
|
// themselves (they meant AND/OR)
|
|
if ( ! keepAllSingles && qw->m_isQueryStopWord &&
|
|
! qw->m_fieldCode &&
|
|
m_useQueryStopWords &&
|
|
! qw->m_bigramId && ! qw->m_inQuotes &&
|
|
((qw->m_wordId == 255176654160863LL) ||
|
|
(qw->m_wordId == 46196171999655LL)) )
|
|
qw->m_ignoreWord = IGNORE_QSTOP;
|
|
// . ignore repeated single words and phrases
|
|
// . look at the old termIds for this, too
|
|
// . should ignore 2nd 'time' in 'time after time' then
|
|
// . but boolean queries often need to repeat terms
|
|
|
|
// . NEW - words much be same sign and not in different
|
|
// . quoted phrases to be ignored -partap
|
|
if ( ! m_isBoolean && !qw->m_ignoreWord ) {
|
|
for ( int32_t j = 0 ; j < i ; j++ ) {
|
|
if ( m_qwords[j].m_ignoreWord ) continue;
|
|
if ( m_qwords[j].m_wordId == qw->m_wordId &&
|
|
m_qwords[j].m_wordSign ==qw->m_wordSign &&
|
|
(!keepAllSingles ||
|
|
(m_qwords[j].m_quoteStart
|
|
== qw->m_quoteStart))){
|
|
qw->m_ignoreWord = IGNORE_REPEAT;
|
|
}
|
|
}
|
|
}
|
|
if ( ! m_isBoolean && !qw->m_ignorePhrase ) {
|
|
// ignore repeated phrases too!
|
|
for ( int32_t j = 0 ; j < i ; j++ ) {
|
|
if ( m_qwords[j].m_ignorePhrase ) continue;
|
|
if ( m_qwords[j].m_bigramId == qw->m_bigramId &&
|
|
m_qwords[j].m_phraseSign
|
|
== qw->m_phraseSign)
|
|
qw->m_ignorePhrase = IGNORE_REPEAT;
|
|
}
|
|
}
|
|
}
|
|
|
|
// . if we only have one quoted query then force its sign to be '+'
|
|
// . '"get the phrase" the' --> +"get the phrase" (last the is ignored)
|
|
// . "time enough for love" --> +"time enough" +"enough for love"
|
|
// . if all unignored words are in the same set of quotes then change
|
|
// all '*' (soft-required) phrase signs to '+'
|
|
for ( j= 0 ; j < numWords ; j++ ) {
|
|
if ( !m_tr[j].is_alfanum) continue;
|
|
if ( m_qwords[j].m_quoteStart < 0 ) break;
|
|
if ( m_qwords[j].m_ignoreWord ) continue;
|
|
if ( j < 2 ) continue;
|
|
if ( m_qwords[j-2].m_quoteStart != m_qwords[j].m_quoteStart )
|
|
break;
|
|
}
|
|
if ( j >= numWords ) {
|
|
for ( j= 0 ; j < numWords ; j++ ) {
|
|
if ( m_qwords[j].m_phraseSign == '*' )
|
|
m_qwords[j].m_phraseSign = '+';
|
|
}
|
|
}
|
|
|
|
// . force a plus on any site: or ip: query terms
|
|
// . also disable site clustering if we have either of these terms
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
|
QueryWord *qw = &m_qwords[i];
|
|
if ( qw->m_ignoreWord ) continue;
|
|
if ( qw->m_wordSign ) continue;
|
|
if ( qw->m_fieldCode != FIELD_SITE &&
|
|
qw->m_fieldCode != FIELD_IP ) continue;
|
|
qw->m_wordSign = '+';
|
|
}
|
|
|
|
// . if one or more of a phrase's constituent terms exceeded
|
|
// term #MAX_QUERY_TERMS then we should also soft require that phrase
|
|
// . fixes 'hezekiah walker the love family affair ii live at
|
|
// radio city music hall'
|
|
// . how many non-ignored phrases?
|
|
int32_t count = 0;
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
|
QueryWord *qw = &m_qwords[i];
|
|
if ( qw->m_ignorePhrase ) continue;
|
|
if ( ! qw->m_bigramId ) continue;
|
|
count++;
|
|
}
|
|
for ( int32_t i = 0 ; i < numWords ; i++ ) {
|
|
QueryWord *qw = &m_qwords[i];
|
|
// count non-ignored words
|
|
if ( qw->m_ignoreWord ) continue;
|
|
// if under limit, continue
|
|
if ( count++ < ABS_MAX_QUERY_TERMS ) continue;
|
|
// . otherwise, ignore
|
|
// . if we set this for our UOR'ed terms from SearchInput.cpp's
|
|
// UOR'ed facebook interests then it causes us to get no results!
|
|
// so make sure that MAX_QUERY_TERMS is big enough with respect to
|
|
// the opCount in SearchInput.cpp
|
|
qw->m_ignoreWord = IGNORE_BREECH;
|
|
// left phrase should get a '*'
|
|
int32_t left = qw->m_leftPhraseStart;
|
|
if ( left >= 0 && ! m_qwords[left].m_phraseSign )
|
|
m_qwords[left].m_phraseSign = '*';
|
|
// our phrase should get a '*'
|
|
if ( qw->m_bigramId && ! qw->m_phraseSign )
|
|
qw->m_phraseSign = '*';
|
|
}
|
|
|
|
// . fix the 'x -50a' query so it returns results
|
|
// . how many non-negative, non-ignored words/phrases do we have?
|
|
count = 0;
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
|
const QueryWord *qw = &m_qwords[i];
|
|
if ( qw->m_ignoreWord ) continue;
|
|
if ( qw->m_wordSign == '-' ) continue;
|
|
count++;
|
|
}
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
|
const QueryWord *qw = &m_qwords[i];
|
|
if ( qw->m_ignorePhrase ) continue;
|
|
if ( qw->m_phraseSign == '-' ) continue;
|
|
if ( qw->m_bigramId == 0LL ) continue;
|
|
count++;
|
|
}
|
|
// if everybody is ignored or negative UNignore first query stop word
|
|
if ( count == 0 ) {
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
|
QueryWord *qw = &m_qwords[i];
|
|
if ( qw->m_ignoreWord != IGNORE_QSTOP ) continue;
|
|
qw->m_ignoreWord = IGNORE_NO_IGNORE;
|
|
count++;
|
|
break;
|
|
}
|
|
}
|
|
|
|
quoteStart = -1;
|
|
int32_t quoteEnd = -1;
|
|
// set m_quoteENd
|
|
for ( int32_t i = m_numWords - 1 ; i >= 0 ; i-- ) {
|
|
// get ith word
|
|
QueryWord *qw = &m_qwords[i];
|
|
// skip if ignored
|
|
if ( qw->m_ignoreWord ) continue;
|
|
// skip if not in quotes
|
|
if ( qw->m_quoteStart < 0 ) continue;
|
|
// if match previous guy...
|
|
if ( qw->m_quoteStart == quoteStart ) {
|
|
// inherit the end
|
|
qw->m_quoteEnd = quoteEnd;
|
|
// all done
|
|
continue;
|
|
}
|
|
// ok, we are the end then
|
|
quoteEnd = i;
|
|
quoteStart = qw->m_quoteStart;
|
|
}
|
|
|
|
|
|
int32_t wkid = 0;
|
|
int32_t upTo = -1;
|
|
|
|
//
|
|
// set the wiki phrase ids
|
|
//
|
|
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
|
|
// get ith word
|
|
QueryWord *qw = &m_qwords[i];
|
|
// in a phrase from before?
|
|
if ( i < upTo ) {
|
|
qw->m_wikiPhraseId = wkid;
|
|
continue;
|
|
}
|
|
// assume none
|
|
qw->m_wikiPhraseId = 0;
|
|
// skip if punct
|
|
if ( !m_tr[i].is_alfanum ) continue;
|
|
// get word
|
|
int32_t nwk ;
|
|
nwk = g_wiki.getNumWordsInWikiPhrase ( i , &m_tr );
|
|
// bail if none
|
|
if ( nwk <= 1 ) continue;
|
|
|
|
// inc it
|
|
wkid++;
|
|
// store it
|
|
qw->m_wikiPhraseId = wkid;
|
|
// set loop parm
|
|
upTo = i + nwk;
|
|
}
|
|
|
|
// consider terms strongly connected like wikipedia title phrases
|
|
for ( int32_t i = 0 ; i + 2 < m_numWords ; i++ ) {
|
|
// get ith word
|
|
QueryWord *qw1 = &m_qwords[i];
|
|
// must not already be in a wikiphrase
|
|
//if ( qw1->m_wikiPhraseId > 0 ) continue;
|
|
// what query word # is that?
|
|
int32_t qwn = qw1 - m_qwords;
|
|
// get the next alnum word after that
|
|
// assume its the last word in our bigram phrase
|
|
QueryWord *qw2 = &m_qwords[qwn+2];
|
|
// must be in same wikiphrase
|
|
if ( qw2->m_wikiPhraseId > 0 ) continue;
|
|
|
|
// if there is a strong connector like the . in 'dmoz.org'
|
|
// then consider it a wiki bigram too
|
|
if ( ! qw1->m_rightConnected ) continue;
|
|
if ( ! qw2->m_leftConnected ) continue;
|
|
|
|
// fix 'rdf.org.dumps' so org.dumps gets same
|
|
// wikiphraseid as rdf.org
|
|
int id;
|
|
if ( qw1->m_wikiPhraseId ) id = qw1->m_wikiPhraseId;
|
|
else id = ++wkid;
|
|
|
|
// store it
|
|
qw1->m_wikiPhraseId = id;
|
|
|
|
qw2->m_wikiPhraseId = id;
|
|
}
|
|
|
|
// all done
|
|
return true;
|
|
}
|
|
|
|
|
|
// . post-parse query rewriting pass.
// . detects queries that look like a bare domain/host name (example.com,
//   www.example.com, www.example.co.uk) or like an API/function reference
//   (file.open, file.open(), ns::func()) and, if the collection enables the
//   corresponding rewrite, forces the non-synonym terms to be required and
//   (for domains) boosts the in-url scoring weight.
// . scoringWeights: in/out; HASHGROUP_INURL weight is multiplied by 10 for
//   domain-like queries.
// . cr: per-collection flags selecting which rewrites are active.
// . doSiteClustering: in/out; may be switched off for domain-like queries.
void Query::modifyQuery(DerivedScoringWeights *scoringWeights, const CollectionRec& cr, bool *doSiteClustering) {
	logTrace(g_conf.m_logTraceQuery, "Query::modifyQuery: q='%s', modifyDomainLikeSearches=%s, modifyAPILikeSearches=%s", originalQuery(),cr.m_modifyDomainLikeSearches?"true":"false", cr.m_modifyAPILikeSearches?"true":"false");
	logTrace(g_conf.m_logTraceQuery, "  m_numWords = %d", m_numWords);
	logTrace(g_conf.m_logTraceQuery, "  m_numTerms = %d", m_numTerms);
	if(cr.m_modifyDomainLikeSearches) {
		bool looksLikeADomain = false;
		// is it a domain in the form of domain.tld ?
		// (words alternate alnum/punct, so 3 words = alnum '.' alnum)
		if(m_numWords==3 &&
		   is_alnum_utf8_string(m_qwords[0].m_word,m_qwords[0].m_word+m_qwords[0].m_wordLen) &&
		   m_qwords[1].m_wordLen==1 && m_qwords[1].m_word[0]=='.' &&
		   is_alnum_utf8_string(m_qwords[2].m_word,m_qwords[2].m_word+m_qwords[2].m_wordLen))
			looksLikeADomain = true;
		// is it a domain in the form of host.domain.tld ?
		if(m_numWords==5 &&
		   is_alnum_utf8_string(m_qwords[0].m_word,m_qwords[0].m_word+m_qwords[0].m_wordLen) &&
		   m_qwords[1].m_wordLen==1 && m_qwords[1].m_word[0]=='.' &&
		   is_alnum_utf8_string(m_qwords[2].m_word,m_qwords[2].m_word+m_qwords[2].m_wordLen) &&
		   m_qwords[3].m_wordLen==1 && m_qwords[3].m_word[0]=='.' &&
		   is_alnum_utf8_string(m_qwords[4].m_word,m_qwords[4].m_word+m_qwords[4].m_wordLen))
			looksLikeADomain = true;
		// is it a domain in the form of host.domain.tld1.tld2 ? (eg www.example.co.uk)
		if(m_numWords==7 &&
		   is_alnum_utf8_string(m_qwords[0].m_word,m_qwords[0].m_word+m_qwords[0].m_wordLen) &&
		   m_qwords[1].m_wordLen==1 && m_qwords[1].m_word[0]=='.' &&
		   is_alnum_utf8_string(m_qwords[2].m_word,m_qwords[2].m_word+m_qwords[2].m_wordLen) &&
		   m_qwords[3].m_wordLen==1 && m_qwords[3].m_word[0]=='.' &&
		   is_alnum_utf8_string(m_qwords[4].m_word,m_qwords[4].m_word+m_qwords[4].m_wordLen) &&
		   m_qwords[5].m_wordLen==1 && m_qwords[5].m_word[0]=='.' &&
		   is_alnum_utf8_string(m_qwords[6].m_word,m_qwords[6].m_word+m_qwords[6].m_wordLen))
			looksLikeADomain = true;
		if(looksLikeADomain) {
			// verify the suffix is a real TLD. The second isTLD()
			// call checks the two-component suffix (eg "co.uk") by
			// passing word[-3] with the combined length of the last
			// three words ("co" + "." + "uk").
			if(!isTLD(m_qwords[m_numWords-1].m_word,m_qwords[m_numWords-1].m_wordLen) &&
			   !isTLD(m_qwords[m_numWords-3].m_word,m_qwords[m_numWords-3].m_wordLen+m_qwords[m_numWords-2].m_wordLen+m_qwords[m_numWords-1].m_wordLen))
				looksLikeADomain = false; //nope - last component(s) isn't a known tld
		}
		if(looksLikeADomain) {
			log(LOG_DEBUG, "query:Query '%s' looks like a domain", originalQuery());
			//set all non-synonym terms as required and boost inUrl weight.
			// also detach the terms from their phrase terms so phrase
			// scoring does not apply.
			for(int i=0; i<m_numTerms; i++) {
				if(!m_qterms[i].m_synonymOf && !m_qterms[i].m_ignored) {
					m_qterms[i].m_isRequired = true;
					m_qterms[i].m_rightPhraseTermNum = -1;
					m_qterms[i].m_leftPhraseTermNum = -1;
					m_qterms[i].m_rightPhraseTerm = NULL;
					m_qterms[i].m_leftPhraseTerm = NULL;
				}
			}
			if(isTLD(m_qwords[m_numWords-1].m_word,m_qwords[m_numWords-1].m_wordLen)) {
				//The last term is marked non-required because the tld terms are normally not indexed (see XmlDoc::hashUrl() -> hashString() -> hashString3())
				//high-freq-terms and stopwords means that the term may not have been generated, so look for it
				for(int i=0; i<m_numTerms; i++) {
					if(m_qterms[i].m_qword == &(m_qwords[m_numWords-1]) && !m_qterms[i].m_isPhrase)
						m_qterms[i].m_isRequired = false;
				}
			}
			scoringWeights->m_hashGroupWeights[HASHGROUP_INURL] *= 10; //factor 10 seems to work fine
			if(cr.m_domainLikeSearchDisablesSiteCluster)
				*doSiteClustering = false;
			log(LOG_DEBUG, "query:Query modified");
			traceTermsToLog("domain-like search terms");
			return;
		}
	}

	if(cr.m_modifyAPILikeSearches) {
		bool looksLikeAnAPI = false;
		//is it something like "file.open" or "file.open()" ?
		//todo: detect java packages like java.util.HashSet (but most java programmers probably has built-in help in their IDE so they would rarely use this)
		if(m_numWords==3 &&
		   is_alnum_api_utf8_string(m_qwords[0].m_word,m_qwords[0].m_word+m_qwords[0].m_wordLen) &&
		   m_qwords[1].m_wordLen==1 && m_qwords[1].m_word[0]=='.' &&
		   is_alnum_api_utf8_string(m_qwords[2].m_word,m_qwords[2].m_word+m_qwords[2].m_wordLen))
			looksLikeAnAPI = true;
		if(m_numWords==4 &&
		   is_alnum_api_utf8_string(m_qwords[0].m_word,m_qwords[0].m_word+m_qwords[0].m_wordLen) &&
		   m_qwords[1].m_wordLen==1 && m_qwords[1].m_word[0]=='.' &&
		   is_alnum_api_utf8_string(m_qwords[2].m_word,m_qwords[2].m_word+m_qwords[2].m_wordLen) &&
		   m_qwords[3].m_wordLen==2 && m_qwords[3].m_word[0]=='(' && m_qwords[3].m_word[1]==')')
			looksLikeAnAPI = true;
		//or "file::open()"
		if(m_numWords==3 &&
		   is_alnum_api_utf8_string(m_qwords[0].m_word,m_qwords[0].m_word+m_qwords[0].m_wordLen) &&
		   m_qwords[1].m_wordLen==2 && m_qwords[1].m_word[0]==':' && m_qwords[1].m_word[1]==':' &&
		   is_alnum_api_utf8_string(m_qwords[2].m_word,m_qwords[2].m_word+m_qwords[2].m_wordLen))
			looksLikeAnAPI = true;
		if(m_numWords==4 &&
		   is_alnum_api_utf8_string(m_qwords[0].m_word,m_qwords[0].m_word+m_qwords[0].m_wordLen) &&
		   m_qwords[1].m_wordLen==2 && m_qwords[1].m_word[0]==':' && m_qwords[1].m_word[1]==':' &&
		   is_alnum_api_utf8_string(m_qwords[2].m_word,m_qwords[2].m_word+m_qwords[2].m_wordLen) &&
		   m_qwords[3].m_wordLen==2 && m_qwords[3].m_word[0]=='(' && m_qwords[3].m_word[1]==')')
			looksLikeAnAPI = true;
		if(looksLikeAnAPI) {
			log(LOG_DEBUG, "query:Query '%s' looks like an API or function call", originalQuery());
			//set all non-synonym terms as required
			for(int i=0; i<m_numTerms; i++) {
				if(!m_qterms[i].m_synonymOf && !m_qterms[i].m_ignored) {
					m_qterms[i].m_isRequired = true;
					m_qterms[i].m_rightPhraseTermNum = -1;
					m_qterms[i].m_leftPhraseTermNum = -1;
					m_qterms[i].m_rightPhraseTerm = NULL;
					m_qterms[i].m_leftPhraseTerm = NULL;
				}
			}
			log(LOG_DEBUG, "query:Query modified");
			traceTermsToLog("api-like search terms");
			return;
		}
	}
	log(LOG_DEBUG, "query: Query not modified");
}
|
|
|
|
|
|
|
|
// return -1 if does not exist in query, otherwise return the query word num
|
|
// Locate the query word whose raw (unhashed-field) word id equals wordId.
// Returns the word's index into m_qwords[], or -1 if absent.
int32_t Query::getWordNum(int64_t wordId) const {
	// 0 and -1 are sentinel ids (punctuation or unset) and never match
	if ( wordId == 0LL || wordId == -1LL )
		return -1;

	// linear scan; the non-raw word id includes a hash with "0", which
	// signifies an empty field term, so we compare against the raw id
	for ( int32_t n = 0 ; n < m_numWords ; n++ ) {
		if ( m_qwords[n].m_rawWordId == wordId )
			return n;
	}

	// not present in this query
	return -1;
}
|
|
|
|
// lazily-built map: hash64b(field name) -> (index into g_fields[] + 1);
// the +1 lets a score of 0 mean "not found" (see getFieldCode()).
static HashTableX s_table;
// set to true once s_table has been populated by initFieldTable()
static bool s_isInitialized = false;
// guards the one-time initialization of s_table
static GbMutex s_tableMutex;
|
|
|
|
// 3rd field = m_hasColon
|
|
const struct QueryField g_fields[] = {
|
|
{"url",
|
|
FIELD_URL,
|
|
true,
|
|
"url:www.example.com/page.html",
|
|
"Matches the page with that exact url. Uses the first url, not "
|
|
"the url it redirects to, if any." ,
|
|
NULL,
|
|
0 },
|
|
|
|
{"ext",
|
|
FIELD_EXT,
|
|
true,
|
|
"ext:doc",
|
|
"Match documents whose url ends in the <i>.doc</i> file extension.",
|
|
NULL,
|
|
0 },
|
|
|
|
|
|
{"link",
|
|
FIELD_LINK,
|
|
true,
|
|
"link:www.example.com/foo.html",
|
|
"Matches all the documents that have a link to "
|
|
"http://www.example.com/foobar.html",
|
|
NULL,
|
|
0 },
|
|
|
|
{"sitelink",
|
|
FIELD_SITELINK,
|
|
true,
|
|
"sitelink:abc.foobar.com",
|
|
"Matches all documents that link to any page on the "
|
|
"<i>abc.foobar.com</i> site.",
|
|
NULL,
|
|
0 },
|
|
|
|
{"site",
|
|
FIELD_SITE,
|
|
true,
|
|
"site:example.com",
|
|
"Matches all documents on the example.com domain.",
|
|
NULL,
|
|
0 },
|
|
|
|
{"site",
|
|
FIELD_SITE,
|
|
true,
|
|
"site:www.example.com/dir1/dir2/",
|
|
"Matches all documents whose url starts with "
|
|
"www.example.com/dir1/dir2/",
|
|
NULL,
|
|
QTF_DUP },
|
|
|
|
{"sitenoindex",
|
|
FIELD_SITE,
|
|
true,
|
|
"sitenoindex:example.com",
|
|
"Matches all documents on the example.com domain that in not indexed.",
|
|
NULL,
|
|
0 },
|
|
|
|
{"ip",
|
|
FIELD_IP,
|
|
true,
|
|
"ip:192.0.2.1",
|
|
"Matches all documents whose IP is 192.0.2.1.",
|
|
NULL,
|
|
0 },
|
|
|
|
|
|
{"ip",
|
|
FIELD_IP,
|
|
true,
|
|
"ip:192.0.2",
|
|
"Matches all documents whose IP STARTS with 192.0.2.",
|
|
NULL,
|
|
QTF_DUP },
|
|
|
|
|
|
{"inurl",
|
|
FIELD_SUBURL,
|
|
true,
|
|
"inurl:dog",
|
|
"Matches all documents that have the word dog in their url, like "
|
|
"http://www.example.com/dog/food.html. However will not match "
|
|
"http://www.example.com/dogfood.html because it is not an "
|
|
"individual word. It must be delineated by punctuation.",
|
|
NULL,
|
|
0 },
|
|
|
|
|
|
{"suburl",
|
|
FIELD_SUBURL,
|
|
true,
|
|
"suburl:dog",
|
|
"Same as inurl.",
|
|
NULL,
|
|
0},
|
|
|
|
{"intitle",
|
|
FIELD_TITLE,
|
|
false,
|
|
"title:cat",
|
|
"Matches all the documents that have the word cat in their "
|
|
"title.",
|
|
NULL,
|
|
0 },
|
|
|
|
|
|
{"intitle",
|
|
FIELD_TITLE,
|
|
false,
|
|
"title:\"cat food\"",
|
|
"Matches all the documents that have the phrase \"cat food\" "
|
|
"in their title.",
|
|
NULL,
|
|
QTF_DUP },
|
|
|
|
|
|
{"title",
|
|
FIELD_TITLE,
|
|
false,
|
|
"title:cat",
|
|
"Same as intitle:",
|
|
NULL,
|
|
0},
|
|
|
|
{"type",
|
|
FIELD_TYPE,
|
|
false,
|
|
"type:json",
|
|
"Matches all documents that are in JSON format. "
|
|
"Other possible types include "
|
|
"<i>html, text, xml, pdf, doc, xls, ppt, ps, css, json, status.</i> "
|
|
"<i>status</i> matches special documents that are stored every time "
|
|
"a url is spidered so you can see all the spider attempts and when "
|
|
"they occurred as well as the outcome.",
|
|
NULL,
|
|
0},
|
|
|
|
{"filetype",
|
|
FIELD_TYPE,
|
|
false,
|
|
"filetype:json",
|
|
"Same as type: above.",
|
|
NULL,
|
|
0},
|
|
|
|
{"gblang",
|
|
FIELD_GBLANG,
|
|
false,
|
|
"gblang:de",
|
|
"Matches all documents in german. "
|
|
"The supported language abbreviations "
|
|
"are at the bottom of the <a href=\"/admin/filters\">url filters</a> "
|
|
"page. Some more "
|
|
"common ones are <i>gblang:en, gblang:es, gblang:fr, "
|
|
// need quotes for this one!!
|
|
"gblang:\"zh_cn\"</i> (note the quotes for zh_cn!).",
|
|
NULL,
|
|
0},
|
|
|
|
{"gbcountry",
|
|
FIELD_GBCOUNTRY,
|
|
false,
|
|
"gbcountry:us",
|
|
"Matches documents determined by Gigablast to be from the United "
|
|
"States. See the country abbreviations in the CountryCode.cpp "
|
|
"open source distribution. Some more popular examples include: "
|
|
"de, fr, uk, ca, cn.",
|
|
NULL,
|
|
0} ,
|
|
|
|
// mdw
|
|
|
|
{"gbdocid",
|
|
FIELD_GBDOCID,
|
|
false,
|
|
"gbdocid:123456",
|
|
"Matches the document with the docid 123456",
|
|
NULL,
|
|
0},
|
|
|
|
{"gbtermid",
|
|
FIELD_GBTERMID,
|
|
false,
|
|
"gbtermid:123456",
|
|
"Matches the documents for the term with termid 123456",
|
|
NULL,
|
|
0},
|
|
|
|
//
|
|
// for content type CT_STATUS documents (Spider status docs)
|
|
//
|
|
|
|
{"gbdocspiderdate",
|
|
FIELD_GENERIC,
|
|
false,
|
|
"gbdocspiderdate:1400081479",
|
|
"Matches documents that have "
|
|
"that spider date timestamp (UTC). "
|
|
//"Does not include the "
|
|
//"special spider status documents. "
|
|
"This is the time the document "
|
|
"completed downloading.",
|
|
"Date Related Query Operators",
|
|
QTF_BEGINNEWTABLE},
|
|
|
|
|
|
{"gbspiderdate",
|
|
FIELD_GENERIC,
|
|
false,
|
|
"gbspiderdate:1400081479",
|
|
"Like above.",
|
|
//, but DOES include the special spider status documents.",
|
|
NULL,
|
|
0},
|
|
|
|
{"gbdocindexdate",
|
|
FIELD_GENERIC,
|
|
false,
|
|
"gbdocindexdate:1400081479",
|
|
"Like above, but is the time the document was last indexed. "
|
|
"This time is "
|
|
"slightly greater than or equal to the spider date.",//Does not "
|
|
//"include the special spider status documents.",
|
|
NULL,
|
|
0},
|
|
|
|
|
|
{"gbindexdate",
|
|
FIELD_GENERIC,
|
|
false,
|
|
"gbindexdate:1400081479",
|
|
"Like above.",//, but it does include the special spider status "
|
|
//"documents.",
|
|
NULL,
|
|
0},
|
|
|
|
// they don't need to know about this
|
|
{"gbcontenthash", FIELD_GBCONTENTHASH, false,"","",NULL,QTF_HIDE}
|
|
};
|
|
|
|
// Free the memory held by the static field-name lookup table.
// NOTE(review): s_isInitialized is NOT cleared here, so initFieldTable()
// will not repopulate the table afterwards — verify this is only called
// at process shutdown.
void resetQuery ( ) {
	s_table.reset();
}
|
|
|
|
|
|
|
|
// Number of entries in the g_fields[] operator table.
int32_t getNumFieldCodes ( ) {
	return (int32_t)( sizeof(g_fields) / sizeof(g_fields[0]) );
}
|
|
|
|
// One-time, thread-safe construction of s_table, mapping
// hash64b(field name) -> (index into g_fields[] + 1).
// Returns false (and logs) on allocation failure.
static bool initFieldTable(){

	ScopedLock sl(s_tableMutex);
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8 , 4 , 255,NULL,0,false,"qryfldtbl" ) ) {
			log(LOG_WARN, "build: Could not init table of query fields.");
			return false;
		}
		// now add in all the field names
		int32_t n = getNumFieldCodes();
		for ( int32_t i = 0 ; i < n ; i++ ) {
			// skip if dup (g_fields[] deliberately contains
			// duplicate names, e.g. the QTF_DUP rows)
			int64_t h = hash64b ( g_fields[i].text );

			// if already in there it is a dup
			if ( s_table.isInTable ( &h ) ) continue;

			// store the entry index+1 in the hash table as the
			// score, so a score of 0 can mean "not found"
			if ( ! s_table.addTerm(h, i+1) ) return false;
		}
		s_isInitialized = true;
	}
	return true;
}
|
|
|
|
|
|
// Resolve a field-operator name (s, len) — e.g. "site", "intitle" — to its
// field code. Lookup is case-insensitive. Returns FIELD_UNSET when the name
// is unknown or the lookup table could not be built.
field_code_t getFieldCode(const char *s, int32_t len) {
	// lazily build the name -> index table
	if ( !initFieldTable() )
		return FIELD_UNSET;

	const int64_t h = hash64Lower_a( s, len );
	// table stores index+1, so 0 means "no such field"
	const int32_t idx = (int32_t) s_table.getScore(h);

	return idx ? g_fields[idx-1].field : FIELD_UNSET;
}
|
|
|
|
// Map a field code back to its canonical operator name, for logging and
// display. Returns NULL for codes without a name mapping.
const char *getFieldCodeName(field_code_t fc) {
	if ( fc == FIELD_UNSET )         return "unset";
	if ( fc == FIELD_URL )           return "url";
	if ( fc == FIELD_LINK )          return "link";
	if ( fc == FIELD_SITE )          return "site";
	if ( fc == FIELD_IP )            return "ip";
	if ( fc == FIELD_SUBURL )        return "suburl";
	if ( fc == FIELD_TITLE )         return "title";
	if ( fc == FIELD_TYPE )          return "type";
	if ( fc == FIELD_EXT )           return "ext";
	if ( fc == FIELD_LINKS )         return "links";
	if ( fc == FIELD_SITELINK )      return "sitelink";
	if ( fc == FIELD_GENERIC )       return "generic";
	if ( fc == FIELD_GBLANG )        return "gblang";
	if ( fc == FIELD_GBCOUNTRY )     return "gbcountry";
	if ( fc == FIELD_GBTERMID )      return "gbtermid";
	if ( fc == FIELD_GBDOCID )       return "gbdocid";
	if ( fc == FIELD_GBCONTENTHASH ) return "gbcontenthash";
	if ( fc == FIELD_GBFIELDMATCH )  return "gbfieldmatch";
	return NULL;
}
|
|
|
|
|
|
// guaranteed to be punctuation
|
|
bool Query::isConnection(unsigned i) const {
|
|
auto const &token = m_tr[i];
|
|
if(token.token_len==1) {
|
|
switch(*token.token_start) {
|
|
// . only allow apostrophe if it's NOT a 's
|
|
// . so contractions are ok, and names too
|
|
case '\'':
|
|
// no, i think we should require it. google seems to,
|
|
// and msn and yahoo do. 'john's room -"john's" gives
|
|
// no result son yahoo and msn.
|
|
return true;
|
|
case ':': return true;
|
|
case '-': return true;
|
|
case '.': return true;
|
|
case '@': return true;
|
|
case '#': return true;
|
|
case '/': return true;
|
|
case '_': return true;
|
|
case '&': return true;
|
|
case '=': return true;
|
|
case '\\': return true;
|
|
default: return false;
|
|
}
|
|
}
|
|
//if ( len == 3 && s[0]==' ' && s[1]=='&' && s[2]==' ' ) return true;
|
|
if(token.token_len==3 &&
|
|
token.token_start[0]==':' && token.token_start[1]=='/' && token.token_start[2]=='/' )
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
|
|
void Query::dumpToLog() const
|
|
{
|
|
log(LOG_DEBUG, "Query:setQTerms: dumping %d query-words:", m_numWords);
|
|
for(int i=0; i<m_numWords; i++) {
|
|
const QueryWord &qw = m_qwords[i];
|
|
log(" qword #%d:",i);
|
|
log(" word='%*.*s'", (int)qw.m_wordLen, (int)qw.m_wordLen, qw.m_word);
|
|
log(" phrase='%*.*s'", (int)qw.m_bigramLen, (int)qw.m_bigramLen, qw.m_word);
|
|
log(" m_wordId=%" PRId64, qw.m_wordId);
|
|
log(" m_bigramId=%" PRId64, qw.m_bigramId);
|
|
if(qw.m_queryWordTerm)
|
|
log(" m_queryWordTerm= #%d", (int)(qw.m_queryWordTerm-m_qterms));
|
|
}
|
|
log("Query:setQTerms: dumping %d query-terms:", m_numTerms);
|
|
for(int i=0; i<m_numTerms; i++) {
|
|
const QueryTerm &qt = m_qterms[i];
|
|
log(" term #%d:",i);
|
|
log(" m_term='%*.*s'", (int)qt.m_termLen, (int)qt.m_termLen, qt.m_term);
|
|
log(" m_isPhrase=%s synonym=%s", qt.m_isPhrase?"true":"false", qt.m_synonymOf?"true":"false");
|
|
log(" m_termId=%" PRId64, qt.m_termId);
|
|
log(" m_rawTermId=%" PRId64, qt.m_rawTermId);
|
|
log(" m_isWikiHalfStopBigram=%s", qt.m_isWikiHalfStopBigram?"true":"false");
|
|
log(" m_leftPhraseTermNum=%d, m_leftPhraseTerm=%p", qt.m_leftPhraseTermNum, (void*)qt.m_leftPhraseTerm);
|
|
log(" m_rightPhraseTermNum=%d, m_rightPhraseTerm=%p", qt.m_rightPhraseTermNum, (void*)qt.m_rightPhraseTerm);
|
|
log(" m_rightPhraseTermNum=%d, m_rightPhraseTerm=%p", qt.m_rightPhraseTermNum, (void*)qt.m_rightPhraseTerm);
|
|
log(" m_rightPhraseTermNum=%d, m_rightPhraseTerm=%p", qt.m_rightPhraseTermNum, (void*)qt.m_rightPhraseTerm);
|
|
log(" m_termFreqWeight=%f m_termWeight=%f m_userWeight=%f", qt.m_termFreqWeight, qt.m_termWeight, qt.m_userWeight);
|
|
if(qt.m_synonymOf)
|
|
log(" m_synonymOf=#%d '%.*s'", (int)(qt.m_synonymOf-m_qterms), qt.m_synonymOf->m_termLen, qt.m_synonymOf->m_term);
|
|
}
|
|
}
|
|
|
|
void Query::traceTermsToLog(const char *header) {
|
|
logTrace(g_conf.m_logTraceQuery, "%s: %d queryterms:", header, m_numTerms);
|
|
for(int i=0; i<m_numTerms; i++) {
|
|
logTrace(g_conf.m_logTraceQuery, " query-term #%d: termid=%15" PRId64" '%*.*s', t-weight=%f u-weight=%f %s", i, m_qterms[i].m_termId, m_qterms[i].m_termLen,m_qterms[i].m_termLen,m_qterms[i].m_term, m_qterms[i].m_termWeight,m_qterms[i].m_userWeight, m_qterms[i].m_ignored?"ignored":"");
|
|
logTrace(g_conf.m_logTraceQuery, " qstopw=%s req=%s", m_qterms[i].m_isQueryStopWord?"true":"false", m_qterms[i].m_isRequired?"yes":"no");
|
|
}
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////
|
|
////////// ONLY BOOLEAN STUFF BELOW HERE /////////////
|
|
////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////
|
|
|
|
// return false and set g_errno on error
// returns how many words expression was
// . recursively parses the boolean expression structure out of the query
//   words in [start, end), recording this expression's start word and its
//   word count; nested '(' spawn child Expression objects in
//   q->m_expressions[] whose pointers are stored on the '(' word.
// . "level" is the parenthesis nesting depth; bounded by MAX_EXPRESSIONS.
// NOTE(review): q->m_numExpressions is incremented for every '(' without an
// explicit bound check of its own — only the nesting depth is checked, not
// the total count of sibling expressions; verify m_expressions[] cannot
// overflow for queries with many sequential parenthesized groups.
bool Expression::addExpression (int32_t start,
				int32_t end,
				Query *q,
				int32_t level
				) {

	if ( level >= MAX_EXPRESSIONS ) {
		g_errno = ETOOMANYPARENS;
		return false;
	}

	// the # of the first alnumpunct word in the expression
	m_expressionStartWord = start;
	m_q = q;

	int32_t i = m_expressionStartWord;

	// "start" is the current alnumpunct word we are parsing out
	for ( ; i<end ; i++ ) {

		QueryWord *qwords = q->m_qwords;

		QueryWord * qw = &qwords[i];

		// set leaf node if not an opcode like "AND" and not punct.
		if ( qw->m_opcode==opcode_t::OP_NONE && qw->isAlphaWord()){
			continue;
		}
		// NOT is handled at evaluation time (isTruth), skip it here
		if (qw->m_opcode == opcode_t::OP_NOT) {
			continue;
		}
		else if (qw->m_opcode == opcode_t::OP_LEFTPAREN ) {
			// this is expression
			// . it should advance "i" to end of expression
			// point to next...
			q->m_numExpressions++;
			// make a new one:
			Expression *e=&q->m_expressions[q->m_numExpressions-1];
			// now set it
			if ( ! e->addExpression ( i+1, // skip over (
						  end ,
						  q ,
						  level + 1) )
				return false;
			// skip over it. pt to ')'
			i += e->m_numWordsInExpression;
			qw->m_expressionPtr = e;
		}
		else if (qw->m_opcode == opcode_t::OP_RIGHTPAREN ) {
			// return size i guess, include )
			m_numWordsInExpression = i - m_expressionStartWord+1;
			return true;
		}
		else if (qw->m_opcode!=opcode_t::OP_NONE) {
			continue;
		}
		// white space?
	}

	// no closing ')' seen: expression runs to "end"
	m_numWordsInExpression = i - m_expressionStartWord;

	return true;
}
|
|
|
|
// each bit is 1-1 with the explicit terms in the boolean query
// . evaluates the root boolean expression (m_expressions[0]) against a
//   document's term-presence bit vector; true = the document satisfies
//   the boolean query
bool Query::matchesBoolQuery(const unsigned char *bitVec, int32_t vecSize) const {
	return m_expressions[0].isTruth ( bitVec , vecSize );
}
|
|
|
|
|
|
// Test bit #opBitNum in the little-endian-by-byte bit vector bitVec of
// vecSize bytes; aborts the process if the bit index is out of range.
static bool isBitNumSet(int32_t opBitNum, const unsigned char *bitVec, int32_t vecSize) {
	const int32_t byteIdx = opBitNum / 8;
	const int32_t bitMask = 1 << (opBitNum % 8);
	// out-of-range bit index means corrupt state; hard abort
	if ( byteIdx >= vecSize ) { g_process.shutdownAbort(true); }
	return ( bitVec[byteIdx] & bitMask ) != 0;
}
|
|
|
|
// . "bits" are 1-1 with the query words in Query::m_qwords[] array
|
|
// including ignored words and spaces i guess since Expression::add()
|
|
// seems to do that.
|
|
// . evaluates this (sub)expression left to right over the pattern
//   "operand1 operator1 operand2 operator2 ..." where an operand is either
//   a plain query term or a nested parenthesized Expression, and operators
//   are AND/OR with prefix NOT on operands
// . bitVec holds one presence bit per required query term (bit numbers are
//   QueryTerm::m_bitNum, assigned in Posdb.cpp)
// . tri-state logic: -1 = unknown/no value yet, 0 = false, 1 = true
bool Expression::isTruth(const unsigned char *bitVec, int32_t vecSize) const {

	//
	// operand1 operand2 operator1 operand3 operator2 ....
	//

	// result: -1 means unknown at this point
	int32_t result = -1;

	opcode_t prevOpCode = opcode_t::OP_NONE;
	// NOTE(review): prevResult is only assigned on operand branches; the
	// control flow appears to guarantee it is assigned before the read
	// below, but an initializer would make that robust — verify.
	int32_t prevResult ;
	// result of current operand
	int32_t opResult = -1;

	int32_t i = m_expressionStartWord;
	int32_t iend = i + m_numWordsInExpression;

	// pending prefix NOT, applied to the next operand's result
	bool hasNot = false;

	for ( ; i < iend ; i++ ) {

		const QueryWord *qw = &m_q->m_qwords[i];

		// ignore parentheses, aren't real opcodes.
		// we just want OP_AND/OP_OR/OP_NOT
		opcode_t opcode = qw->m_opcode;
		if ( opcode != opcode_t::OP_AND &&
		     opcode != opcode_t::OP_OR &&
		     opcode != opcode_t::OP_NOT )
			opcode = opcode_t::OP_NONE;

		if ( opcode == opcode_t::OP_NOT ) {
			hasNot = true;
			continue;
		}

		// so operands are expressions as well
		const Expression *e = (const Expression *)qw->m_expressionPtr;
		if ( e ) {
			// save prev one. -1 means no prev.
			prevResult = opResult;
			// set new onw
			opResult = e->isTruth ( bitVec , vecSize );
			// skip over that expression. point to ')'
			i += e->m_numWordsInExpression;
			// flip?
			if ( hasNot ) {
				if ( opResult == 1 ) opResult = 0;
				else                 opResult = 1;
				hasNot = false;
			}
		}

		// a bare AND/OR: remember it and move on to the next operand
		if ( opcode!=opcode_t::OP_NONE && ! e ) {
			prevOpCode = opcode;//m_opSlots[i];
			continue;
		}

		// simple operand
		if ( opcode==opcode_t::OP_NONE && ! e ) {
			// for regular word operands
			// ignore it like a space?
			if ( qw->m_ignoreWord ) continue;
			// ignore gbsortby:offerprice in bool queries
			// at least for evaluating them
			if ( qw->m_ignoreWordInBoolQuery ) continue;
			// save old one
			prevResult = opResult;
			// convert word to term #
			const QueryTerm *qt = qw->m_queryWordTerm;
			// fix title:"notre dame" AND NOT irish
			if ( ! qt ) qt = qw->m_queryPhraseTerm;
			if ( ! qt ) continue;
			// phrase terms are not required and therefore
			// do not have a valid qt->m_bitNum set, so dont core
			if ( ! qt->m_isRequired ) continue;
			// . m_bitNum is set in Posdb.cpp when it sets its
			//   QueryTermInfo array
			// . it is basically the query term #
			// . see iff that bit is set in this docid's vec
			opResult = isBitNumSet ( qt->m_bitNum,bitVec,vecSize );
			// flip?
			if ( hasNot ) {
				if ( opResult == 1 ) opResult = 0;
				else                 opResult = 1;
				hasNot = false;
			}
		}

		// need two to tango. i.e. (true OR false)
		if ( prevResult == -1 ) continue;

		// if this is not the first time... we got two
		if ( prevOpCode == opcode_t::OP_AND ) {
			// if first operation we encount is A AND B then
			// default result to on. only allow an AND operation
			// to turn if off.
			if ( result == -1 ) result = 1;
			if ( ! prevResult ) result = 0;
			if ( ! opResult   ) result = 0;
		}
		else if ( prevOpCode == opcode_t::OP_OR ) {
			// if first operation we encount is A OR B then
			// default result to off
			if ( result == -1 ) result = 0;
			if ( prevResult ) result = 1;
			if ( opResult   ) result = 1;
		}
	}

	// if we never set result, then it was probably a single
	// argument expression like something in double parens like
	// ((site:xyz.com OR site:abc.com)). so set it to value of
	// first operand, opResult.
	if ( prevOpCode == opcode_t::OP_NONE && result == -1 ) result = opResult;

	// still undetermined: treat as a match
	if ( result == -1 ) return true;
	if ( result == 0  ) return false;
	return true;
}
|
|
|
|
// if any one query term is split, msg3a has to split the query
|
|
bool Query::isSplit() const {
|
|
for(int32_t i = 0; i < m_numTerms; i++)
|
|
if(m_qterms[i].isSplit()) return true;
|
|
return false;
|
|
}
|
|
|
|
// Explicit constructor-style initializer: puts every member into a known
// empty/default state. Used because QueryTerm objects are placement-
// constructed in buffers rather than via a real C++ constructor.
void QueryTerm::constructor ( ) {
	// source word / term text
	m_qword = NULL;
	m_isPhrase = false;
	m_termId = 0;
	m_rawTermId = 0;
	m_termSign = 0;
	m_bitNum = 0;
	m_term = NULL;
	m_termLen = 0;
	// posdb list / language info
	m_posdbListPtr = NULL;
	m_langIdBits = 0;
	m_langIdBitsValid = false;
	// frequency and scoring weights
	m_termFreq = 0;
	m_termFreqWeight = 0.0;
	m_isQueryStopWord = false;
	m_inQuotes = false;
	m_termWeight = 0;
	m_userWeight = 0;
	m_userNotRequired = false;
	m_piped = false;
	m_ignored = false;
	// synonym linkage; a synonym covers at least one alnum word
	m_synonymOf = NULL;
	m_synWids0 = 0;
	m_synWids1 = 0;
	m_numAlnumWordsInSynonym = 1;
	// field operator and requirement flags
	m_fieldCode = FIELD_UNSET;
	m_isRequired = false;
	m_isWikiHalfStopBigram = false;
	// neighboring phrase-term linkage
	m_leftPhraseTermNum = 0;
	m_rightPhraseTermNum = 0;
	m_leftPhraseTerm = NULL;
	m_rightPhraseTerm = NULL;
	// posdb key range for this term
	memset(m_startKey,0,sizeof(m_startKey));
	memset(m_endKey,0,sizeof(m_endKey));
}
|
|
|
|
bool QueryTerm::isSplit() const {
|
|
if(!m_fieldCode) return true;
|
|
if(m_fieldCode == FIELD_GBCONTENTHASH) return false;
|
|
return true;
|
|
}
|
|
|
|
// hash of all the query terms
|
|
int64_t Query::getQueryHash() const {
|
|
int64_t qh = 0LL;
|
|
for ( int32_t i = 0 ; i < m_numTerms ; i++ ) {
|
|
const QueryTerm *qt = &m_qterms[i];
|
|
qh = hash64 ( qt->m_termId , qh );
|
|
}
|
|
return qh;
|
|
}
|
|
|
|
// Explicit constructor-style initializer for placement-constructed
// QueryWord objects: sets up the synonym word buffer.
void QueryWord::constructor () {
	m_synWordBuf.constructor();
}
|
|
|
|
// Explicit destructor counterpart: releases the synonym word buffer.
void QueryWord::destructor () {
	m_synWordBuf.purge();
}
|
|
|
|
|
|
// Count the double-quote characters in the first len bytes of s.
static int count_quotes(const char *s, size_t len) {
	int n = 0;
	for ( size_t i = 0 ; i < len ; i++ ) {
		if ( s[i] == '"' )
			n++;
	}
	return n;
}
|