#include "Matches.h"
|
|
#include "Query.h"
|
|
#include "Titledb.h" // for getting total # of docs in db
|
|
#include "StopWords.h"
|
|
#include "Phrases.h"
|
|
#include "Title.h"
|
|
#include "Domains.h"
|
|
#include "Sections.h"
|
|
#include "Linkdb.h"
|
|
#include "Xml.h"
|
|
#include "BitOperations.h"
|
|
#include "Process.h"
|
|
#include "Mem.h"
|
|
#include "Url.h"
|
|
#include "hash.h"
|
|
|
|
|
|
// TODO: have Matches set itself from all the meta tags, titles, link text,
// neighborhoods and body. then proximity algo can utilize that info
// as well as the summary generator, Summary.cpp. right now prox algo
// was setting all those different classes itself.
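//
// Rough usage sketch (based on the calls in this file; the real call sites
// live in the summary/highlighting code such as Summary.cpp):
//
//   Matches m;
//   m.setQuery ( &query );            // hash the matchable query terms once
//   m.set ( &tokens, &phrases, ... ); // scan body, title, url, link text, meta tags
//   // m_matches[0..m_numMatches-1] now record where each query term occurs
//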

Matches::Matches()
  : m_qwordFlags(NULL),
    m_numMatches(0),
    m_numSlots(0),
    m_q(NULL),
    m_numAlnums(0),
    m_qwordAllocSize(0),
    m_numMatchGroups(0)
{
	memset(m_matches, 0, sizeof(m_matches)); //@todo: added to silence Coverity. Remove if impacting performance (quite big memset)
	memset(m_qtableIds, 0, sizeof(m_qtableIds)); // PVS-Studio
	memset(m_qtableWordNums, 0, sizeof(m_qtableWordNums)); // PVS-Studio
	memset(m_qtableFlags, 0, sizeof(m_qtableFlags)); // PVS-Studio
	memset(m_tmpBuf, 0, sizeof(m_tmpBuf)); // PVS-Studio
}


Matches::~Matches() {
	reset();
}

void Matches::reset() {
	reset2();
	if ( m_qwordFlags && m_qwordFlags != (mf_t *)m_tmpBuf ) {
		mfree ( m_qwordFlags , m_qwordAllocSize , "mmqw" );
		m_qwordFlags = NULL;
	}
}

void Matches::reset2() {
	m_numMatches = 0;
	m_numAlnums = 0;
	// free all the classes' buffers
	for ( int32_t i = 0 ; i < m_numMatchGroups ; i++ ) {
		m_tokenizerResultArray[i].clear();
		m_posArray [i].reset();
		m_bitsArray [i].reset();
	}
	m_numMatchGroups = 0;
}

bool Matches::isMatchableTerm(const QueryTerm *qt) const {
	const QueryWord *qw = qt->m_qword;
	// not derived from a query word? how?
	if ( ! qw ) return false;
	if ( qw->m_ignoreWord == IGNORE_DEFAULT ) return false;
	if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) return false;
	if ( qw->m_ignoreWord == IGNORE_BOOLOP ) return false;
	// take this out for now so we highlight for title: terms
	if ( qw->m_fieldCode && qw->m_fieldCode != FIELD_TITLE ) return false;
	// what word # are we?
	int32_t qwn = qw - m_q->m_qwords;
	// do not include if in a quote and does not start it!!
	if ( qw->m_quoteStart >= 0 && qw->m_quoteStart != qwn ) return false;
	// if query is too long, a query word can be truncated!
	// this happens for some words if they are ignored, too!
	if ( ! qw->m_queryWordTerm && ! qw->m_queryPhraseTerm ) return false;
	return true;
}

void Matches::setQuery(const Query *q) {
	reset();

	// save it
	m_q = q;

	if ( m_qwordFlags ) {
		g_process.shutdownAbort(true);
	}

	int32_t need = m_q->m_numWords * sizeof(mf_t) ;
	m_qwordAllocSize = need;

	if ( need < 128 )
		m_qwordFlags = (mf_t *)m_tmpBuf;
	else
		m_qwordFlags = (mf_t *)mmalloc ( need , "mmqf" );

	if ( ! m_qwordFlags ) {
		log("matches: alloc failed for query %s",q->originalQuery());
		return;
	}

	// this is word based. these are each 1 byte
	memset ( m_qwordFlags , 0 , m_q->m_numWords * sizeof(mf_t));

	// # of terms in the query
	int32_t nqt = m_q->m_numTerms;

	// how many query terms do we have that can be matched?
	int32_t numToMatch = 0;

	for ( int32_t i = 0 ; i < nqt ; i++ ) {
		// get query term #i
		QueryTerm *qt = &m_q->m_qterms[i];
		// skip if ignored *in certain ways only*
		if ( ! isMatchableTerm ( qt ) ) {
			continue;
		}
		// count it
		numToMatch++;
		// don't breach. MDW: i made this >= from > (2/11/09)
		if ( numToMatch < MAX_QUERY_WORDS_TO_MATCH ) continue;
		// note it
		log("matches: hit %" PRId32" max query words to match limit",
		    (int32_t)MAX_QUERY_WORDS_TO_MATCH);
		break;
	}

	// fix a core dump the hacky way for now!
	if ( numToMatch < 256 ) numToMatch = 256;

	// keep number of slots in hash table a power of two for fast hashing
	m_numSlots = getHighestLitBitValue ( (uint32_t)(numToMatch * 3));

	// make the hash mask
	uint32_t mask = m_numSlots - 1;
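	// e.g. with numToMatch clamped to at least 256 above, numToMatch * 3 = 768
	// and m_numSlots is presumably 512 (the highest set bit), so the mask is
	// 0x1FF; lookups below are then just (hash & mask) plus linear chaining.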

	int32_t n;

	// sanity check
	if ( m_numSlots > MAX_QUERY_WORDS_TO_MATCH * 3 ) {
		g_process.shutdownAbort(true); }

	// clear hash table
	memset ( m_qtableIds , 0 , m_numSlots * 8 );
	memset ( m_qtableFlags , 0 , m_numSlots );

	for ( int32_t i = 0 ; i < nqt ; i++ ) {
		// get query term #i
		QueryTerm *qt = &m_q->m_qterms[i];

		// skip if ignored *in certain ways only*
		if ( ! isMatchableTerm ( qt ) ) {
			continue;
		}

		// get the word it is from
		const QueryWord *qw = qt->m_qword;

		// get word #
		int32_t qwn = qw - q->m_qwords;

		// do not overfill table
		if ( i >= MAX_QUERY_WORDS_TO_MATCH ) {
			break;
		}

		// this should be equivalent to the word id
		int64_t qid = qt->m_rawTermId;//qw->m_rawWordId;

		// but NOT for 'cheatcodes.com'
		if ( qt->m_isPhrase ) {
			qid = qw->m_rawWordId;
		}

		// if it's a multi-word synonym, like "new jersey", we must
		// index the individual words... or compute the phrase ids
		// for all the words in the doc. right now the qid is
		// the phrase hash for this guy i think...
		if ( qt->m_synonymOf && qt->m_numAlnumWordsInSynonym == 2 ) {
			qid = qt->m_synWids0;
		}

		// put in hash table
		n = ((uint32_t)qid) & mask;

		// chain to an empty slot
		while ( m_qtableIds[n] && m_qtableIds[n] != qid ) {
			if ( ++n >= m_numSlots ) {
				n = 0;
			}
		}

		// . if already occupied, do not overwrite this, keep this
		//   first word, the other is often ignored as IGNORE_REPEAT
		// . what word # in the query are we. save this.
		if ( ! m_qtableIds[n] ) m_qtableWordNums[n] = qwn;

		// store it
		m_qtableIds[n] = qid;

		// in quotes? this term may appear multiple times in the
		// query, in some cases in quotes, and in some cases not.
		// we need to know either way for logic below.
		if ( qw->m_inQuotes ) {
			m_qtableFlags[n] |= 0x02;
		}
		else {
			m_qtableFlags[n] |= 0x01;
		}

		// this is basically a quoted synonym
		if ( qt->m_numAlnumWordsInSynonym == 2 ) {
			m_qtableFlags[n] |= 0x08;
		}

		//QueryTerm *qt = qw->m_queryWordTerm;
		if ( qt->m_termSign == '+' ) {
			m_qtableFlags[n] |= 0x04;
		}
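
		// summary of the m_qtableFlags[n] bits set above:
		//   0x01 - term occurs unquoted somewhere in the query
		//   0x02 - term occurs inside quotes somewhere in the query
		//   0x04 - term carries a '+' sign (required term)
		//   0x08 - term is a two-word synonym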

		//
		// if query has e-mail, then index phrase id "email" so
		// it matches "email" in the doc.
		// we need this for the 'cheat codes' query as well so it
		// highlights 'cheatcodes'
		//
		int64_t pid = qw->m_rawPhraseId;
		if ( pid == 0 ) {
			continue;
		}

		// put in hash table
		n = ((uint32_t)pid) & mask;
		// chain to an empty slot
		while ( m_qtableIds[n] && m_qtableIds[n] != pid )
			if ( ++n >= m_numSlots ) n = 0;
		// this too?
		if ( ! m_qtableIds[n] ) m_qtableWordNums[n] = qwn;
		// store it
		m_qtableIds[n] = pid;
	}
}

// . this was in Summary.cpp, but is more useful here
// . we can also use this to replace the proximity algo setup where it
//   fills in the matrix for title, link text, etc.
// . returns false and sets g_errno on error
bool Matches::set(const TokenizerResult *bodyTr, Phrases *bodyPhrases, const Sections *bodySections, const Bits *bodyBits,
                  const Pos *bodyPos, Xml *bodyXml, const Title *tt, const Url *firstUrl, LinkInfo *linkInfo ) {
	// don't reset query info!
	reset2();

	// . first add all the matches in the body of the doc
	// . add it first since it will kick out early if too many matches
	//   and we get all the explicit bits matched
	if ( !addMatches( bodyTr, bodyPhrases, bodySections, bodyBits, bodyPos, MF_BODY ) ) {
		return false;
	}

	// add the title in
	if ( !addMatches( tt->getTitle(), tt->getTitleLen(), MF_TITLEGEN ) ) {
		return false;
	}

	// add in the url terms
	if ( !addMatches( firstUrl->getUrl(), firstUrl->getUrlLen(), MF_URL ) ) {
		return false;
	}

	// also use the title from the title tag, because sometimes it does not equal "tt->getTitle()"
	int32_t a = tt->getTitleTagStart();
	int32_t b = tt->getTitleTagEnd();

	if ( a >= 0 && b >= 0 && b>a ) {
		const auto &t_a = (*bodyTr)[a];
		const auto &t_bm1 = (*bodyTr)[b-1];
		const char *start = t_a.token_start;
		const char *end = t_bm1.token_end();
		if ( !addMatches( start, end - start, MF_TITLETAG ) ) {
			return false;
		}
	}

	// now add in the meta tags
	int32_t n = bodyXml->getNumNodes();
	XmlNode *nodes = bodyXml->getNodes();

	// find the first meta summary node
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// continue if not a meta tag
		if ( nodes[i].m_nodeId != TAG_META ) continue;

		// only get content for <meta name=..> not <meta http-equiv=..>
		int32_t tagLen;
		char *tag = bodyXml->getString ( i , "name" , &tagLen );

		// is it an accepted meta tag?
		int32_t flag = 0;
		if (tagLen== 7&&strncasecmp(tag,"keyword" , 7)== 0)
			flag = MF_METAKEYW;
		if (tagLen== 7&&strncasecmp(tag,"summary" , 7)== 0)
			flag = MF_METASUMM;
		if (tagLen== 8&&strncasecmp(tag,"keywords" , 8)== 0)
			flag = MF_METAKEYW;
		if (tagLen==11&&strncasecmp(tag,"description",11)== 0)
			flag = MF_METADESC;
		if ( ! flag ) continue;

		// get the content
		int32_t len;
		char *s = bodyXml->getString ( i , "content" , &len );
		if ( ! s || len <= 0 ) continue;

		// wordify
		if ( !addMatches( s, len, flag ) ) {
			return false;
		}
	}

	// . now the link text
	// . loop through each link text and add its matches

	// loop through the Inlinks
	for (Inlink *k = NULL; linkInfo && (k = linkInfo->getNextInlink(k)); ) {
		// does it have link text? skip if not.
		if ( k->size_linkText <= 1 ) {
			continue;
		}

		// set the flag, the type of match
		mf_t flags = MF_LINK;

		// add it in
		if ( !addMatches( k->getLinkText(), k->size_linkText - 1, flags ) ) {
			return false;
		}

		// set flag for that
		flags = MF_HOOD;

		// add it in
		if ( !addMatches( k->getSurroundingText(), k->size_surroundingText - 1, flags ) ) {
			return false;
		}

		// parse the rss up into xml
		Xml rxml;
		if ( ! k->setXmlFromRSS ( &rxml ) ) {
			return false;
		}

		// add rss description
		bool isHtmlEncoded;
		int32_t rdlen;
		char *rd = rxml.getRSSDescription( &rdlen, &isHtmlEncoded );
		if ( !addMatches( rd, rdlen, MF_RSSDESC ) ) {
			return false;
		}

		// add rss title
		int32_t rtlen;
		char *rt = rxml.getRSSTitle( &rtlen, &isHtmlEncoded );
		if ( !addMatches( rt, rtlen, MF_RSSTITLE ) ) {
			return false;
		}
	}

	// that should be it
	return true;
}

bool Matches::addMatches( const char *s, int32_t slen, mf_t flags ) {
	// . do not breach
	// . happens a lot with a lot of link info text
	if ( m_numMatchGroups >= MAX_MATCHGROUPS ) {
		return true;
	}

	// get some new ptrs for this match group
	TokenizerResult *tr = &m_tokenizerResultArray[ m_numMatchGroups ];
	Bits            *bp = &m_bitsArray           [ m_numMatchGroups ];
	Pos             *pb = &m_posArray            [ m_numMatchGroups ];

	// set the words class for this match group
	plain_tokenizer_phase_1(s,slen,tr);
	//TODO: should this just be a plain phase-1, or should we also do phase-2 tokenization?
	calculate_tokens_hashes(tr);

	// bits vector
	if ( ! bp->setForSummary ( tr ) ) {
		return false;
	}

	// position vector
	if ( ! pb->set ( tr ) ) {
		return false;
	}

	// record the start
	int32_t startNumMatches = m_numMatches;
	// sometimes it returns true w/o incrementing this
	int32_t n = m_numMatchGroups;
	// . add all the Match classes from this match group
	// . this increments m_numMatchGroups on success
	bool status = addMatches( tr, NULL, NULL, bp, pb, flags );

	// if this match group had some matches, then keep it
	if ( m_numMatches > startNumMatches ) {
		return status;
	}

	// otherwise, reset it, useless
	tr->clear();
	bp->reset();
	pb->reset();

	// do not decrement the counter if we never incremented it
	if ( n == m_numMatchGroups ) {
		return status;
	}

	// ok, remove it
	m_numMatchGroups--;

	return status;
}

// . TODO: support stemming later. each word should then have multiple ids.
// . add to our m_matches[] array iff addToMatches is true, otherwise we just
//   set the m_foundTermVector for doing the BIG HACK described in Summary.cpp
bool Matches::addMatches(const TokenizerResult *tr, Phrases *phrases, const Sections *sections, const Bits *bits, const Pos *pos, mf_t flags ) {
	// if no query term, bail.
	if ( m_numSlots <= 0 ) {
		return true;
	}

	// . do not breach
	// . happens a lot with a lot of link info text
	if ( m_numMatchGroups >= MAX_MATCHGROUPS ) {
		return true;
	}

	Section *sp = NULL;
	if ( sections ) {
		sp = sections->m_sections;
	}

	mf_t eflag = 0;

	m_numMatchGroups++;

	// set convenience vars
	uint32_t mask = m_numSlots - 1;
	int32_t nw = tr->size();
	int32_t n;
	int32_t matchStack = 0;
	int64_t nextMatchWordIdMustBeThis = 0;
	int32_t nextMatchWordPos = 0;
	int32_t lasti = -3;
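
	// matchStack counts provisional matches of unquoted query stop words:
	// they only survive if the following non-stop query word
	// (nextMatchWordIdMustBeThis) also matches at nextMatchWordPos in the
	// document; otherwise they are peeled back off m_numMatches below.
	// lasti remembers the doc word # of the previous match for that
	// adjacency test.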

	if ( getNumXmlNodes() > 512 ) { g_process.shutdownAbort(true); }

	int32_t badFlags =SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;

	int32_t qwn;
	int32_t numQWords;
	int32_t numWords;

	//
	// . set m_matches[] array
	// . loop over all words in the document
	//
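	// Each alnum token below is looked up in m_qtableIds[] in up to three
	// ways: by its exact token hash (chain1), then, if it ends in "'s", by
	// the hash with the possessive stripped off (chain2), and finally by
	// its phrase id from Phrases (chain3). A hit falls through to
	// gotMatch/gotMatch2, where getNumWordsInMatch() decides how many
	// document tokens the match spans.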
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		const auto &token = (*tr)[i];
		//if (tids && (tids[i] ) == TAG_A)
		//	inAnchTag = true;
		//else if (tids && (tids[i]&BACKBITCOMP) == TAG_A)
		//	inAnchTag = false;

		if ( token.nodeid ){
			// tagIds don't have wids and are skipped
			continue;
		}

		// skip if wid is 0, it is not an alnum word then
		if ( ! token.is_alfanum ) {
			continue;
		}

		// count the number of alnum words
		m_numAlnums++;

		// clear this
		eflag = 0;

		// NO NO, a score of -1 means in a select tag, and
		// we do index that!! so only skip if wscores is 0 now.
		// -1 means in script, style, select or marquee. it is
		// indexed but with very little weight... this is really
		// a hack in Scores.cpp and should be fixed.
		// in Scores.cpp we set even the select tag stuff to -1...
		//if ( wscores && wscores[i] == -1 ) continue;
		if ( sp && (sp->m_flags & badFlags) ) continue;

		// . does it match a query term?
		// . hash to the slot in the hash table
		n = ((uint32_t)token.token_hash) & mask;
		//n2 = swids[i]?((uint32_t)swids[i]) & mask:n;
	chain1:
		// skip if slot is empty (doesn't match query term)
		//if ( ! m_qtableIds[n] && ! m_qtableIds[n2]) continue;
		if ( ! m_qtableIds[n] ) goto tryPhrase;
		// otherwise chain
		if ( (m_qtableIds[n] != token.token_hash) ) {
			if ( m_qtableIds[n] && ++n >= m_numSlots ) n = 0;
			goto chain1;
		}
		// we got one!
		goto gotMatch;

		//
		// fix so we highlight "woman's" when query term is "woman"
		// for 'spiritual books for women' query
		//
	tryPhrase:
		// try without 's if it had it
		if ( token.token_len >= 3 &&
		     token.token_start[token.token_len-2] == '\'' &&
		     to_lower_a(token.token_start[token.token_len-1]) == 's' ) {
			// remove the 's from the word hash... very tricky
			int64_t nwid = token.token_hash;
			// undo hash64Lower_utf8 in hash.h
			nwid ^= g_hashtab[token.token_len-1][(uint8_t)'s'];
			nwid ^= g_hashtab[token.token_len-2][(uint8_t)'\''];
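			// (the two XORs above assume hash64Lower_utf8() mixes in
			// g_hashtab[position][char] per character with XOR, so
			// XORing those two entries back out should leave the
			// hash of the token without its trailing "'s")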
			n = ((uint32_t)nwid) & mask;
		chain2:
			if ( ! m_qtableIds[n] ) goto tryPhrase2;
			if ( (m_qtableIds[n] != nwid) ) {
				if ( m_qtableIds[n] && ++n >= m_numSlots ) n=0;
				goto chain2;
			}
			qwn = m_qtableWordNums[n];
			numWords = 1;
			numQWords = 1;
			// we got one!
			goto gotMatch2;
		}

	tryPhrase2:
		// try phrase first
		int64_t pid;
		if ( phrases && (pid=phrases->getPhraseId(i))!=0 ) {
			n = ((uint32_t)pid) & mask;
		chain3:
			if ( ! m_qtableIds[n] ) continue;
			if ( (m_qtableIds[n] != pid) ) {
				if ( m_qtableIds[n] && ++n >= m_numSlots)n = 0;
				goto chain3;
			}
			// what query word # do we match?
			qwn = m_qtableWordNums[n];
			// get that query word #
			QueryWord *qw = &m_q->m_qwords[qwn];
			// . do we match it as a single word?
			// . did they search for "bluetribe" ...?
			if ( qw->m_rawWordId == pid ) {
				// set our # of words basically to 3
				numWords = 3;
				// matching a single query word
				numQWords = 1;
				// got a match
				goto gotMatch2;
			}
			if ( qw->m_bigramId == pid ) {
				// might match more if we had more query
				// terms in the quote
				numWords = getNumWordsInMatch( tr, i, n, &numQWords, &qwn, true );

				// this is 0 if we were an unmatched quote
				if ( numWords <= 0 ) continue;

				// got a match
				goto gotMatch2;
			}
			// otherwise we are matching a query phrase id
			log("matches: wtf? query word not matched for "
			    "highlighting... strange.");
			// assume one word for now
			numWords = 1;
			numQWords = 1;
			goto gotMatch2;
		}

		//
		// shucks, no match
		//
		continue;

	gotMatch:
		// what query word # do we match?
		qwn = m_qtableWordNums[n];

		// . how many words are in this match?
		// . it may match a single word or a phrase or both
		// . this will be 1 for just matching a single word, and
		//   multiple words for quotes/phrases. The number of words
		//   in both cases will include unmatched punctuation words
		//   and tags in between matching words.
		numQWords = 0;
		numWords = getNumWordsInMatch( tr, i, n, &numQWords, &qwn, true );
		// this is 0 if we were an unmatched quote
		if ( numWords <= 0 ) continue;

	gotMatch2:
		// get query word
		QueryWord *qw = &m_q->m_qwords[qwn];
		// point to next word in the query
		QueryWord *nq = NULL;
		if ( qwn+2 < m_q->m_numWords ) nq = &m_q->m_qwords[qwn+2];

		// . if only one word matches and it's a stop word, make sure
		//   it's next to the correct words in the query
		// . if phraseId is 0, that means we do not start a phrase,
		//   because stop words can start phrases if they are the
		//   first word, are capitalized, or have breaking punct before
		//   them.
		if ( numWords == 1 &&
		     ! qw->m_inQuotes &&
		     m_q->m_numWords > 2 &&
		     qw->m_wordSign == '\0' &&
		     (nq && nq->m_wordId) && // no field names can follow
		     //(qw->m_isQueryStopWord || qw->m_isStopWord ) ) {
		     // we no longer consider single alnum chars to be
		     // query stop words as stated in StopWords.cpp to fix
		     // the query 'j. w. eagan'
		     qw->m_isQueryStopWord ) {
			// if stop word does not start a phrase in the query
			// then he must have a matched word before him in the
			// document. if he doesn't then do not count as a match
			if ( qw->m_bigramId == 0LL && i-2 != lasti ) {
				// peel off anybody before us
				m_numMatches -= matchStack;
				if ( m_numMatches < 0 ) m_numMatches = 0;
				// don't forget to reset the match stack
				matchStack = 0;

				continue;
			}
			// if we already have a match stack, we must
			// be in nextMatchWordPos
			if ( matchStack && nextMatchWordPos != i ) {
				// peel off anybody before us
				m_numMatches -= matchStack;
				if ( m_numMatches < 0 ) m_numMatches = 0;
				// don't forget to reset the match stack
				matchStack = 0;
				//continue;
			}
			// if the phraseId is 0 and the previous word
			// is a match, then we're ok, but put us on a stack
			// so if we lose a match, we'll be erased
			QueryWord *nq = &m_q->m_qwords[qwn+2];
			// next match is only required if next word in query
			// is indeed valid.
			if ( nq->m_wordId && nq->m_fieldCode == 0 ) {
				nextMatchWordIdMustBeThis = nq->m_rawWordId;
				nextMatchWordPos = i + 2;
				matchStack++;
			}
		}
		else if ( matchStack ) {
			// if the last word matched was a stop word, we have to
			// match otherwise we have to remove the whole stack.
			if ( qw->m_rawWordId != nextMatchWordIdMustBeThis ||
			     i > nextMatchWordPos ) {
				m_numMatches -= matchStack;
				// ensure we never go negative like for
				// www.experian.com query
				if ( m_numMatches < 0 ) m_numMatches = 0;
			}
			// always reset this here if we're not a stop word
			matchStack = 0;
		}

		// record word # of last match
		lasti = i;

		// otherwise, store it in our m_matches[] array
		Match *m = &m_matches[m_numMatches];

		// the word # in the doc, and how many of 'em are in the match
		m->m_wordNum = i;
		m->m_numWords = numWords;

		// the word # in the query, and how many of 'em we match
		m->m_qwordNum = qwn;
		m->m_numQWords = numQWords;

		// get the first query word # of this match
		qw = &m_q->m_qwords[qwn];

		// convenience, used by Summary.cpp
		m->m_tr = tr;
		m->m_sections = sections;
		m->m_bits = bits;
		m->m_pos = pos;
		m->m_flags = flags | eflag ;

		// add to our vector. we want to know where each QueryWord
		// is. i.e. in the title, link text, meta tag, etc. so
		// the proximity algo in Summary.cpp can use that info.
		m_qwordFlags[qwn] |= flags;

		// advance
		m_numMatches++;

		// do not exceed MAX_MATCHES
		if ( m_numMatches < MAX_MATCHES ) {
			continue;
		}

		break;
	}

	// peel off anybody before us
	m_numMatches -= matchStack;
	if ( m_numMatches < 0 ) m_numMatches = 0;

	return true;
}

// . word #i in the doc matches slot #n in the hash table
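// . returns the number of document tokens the match spans, or 0 if the word
//   only occurs in quotes in the query and no quote can be completed here;
//   also sets *numQWords and may move *qwn to the first query word of the
//   matched quote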
int32_t Matches::getNumWordsInMatch(const TokenizerResult *tr, unsigned wn, int32_t n, int32_t *numQWords, int32_t *qwn,
                                    bool allowPunctInPhrase) {
	// is it a two-word synonym?
	if ( m_qtableFlags[n] & 0x08 ) {
		// get the word following this
		int64_t wid2 = 0LL;
		if ( wn+2 < tr->size() )
			wid2 = (*tr)[wn+2].token_hash;
		// scan the synonyms...
		for ( int32_t k = 0 ; k < m_q->m_numTerms ; k++ ) {
			QueryTerm *qt = &m_q->m_qterms[k];
			if ( ! qt->m_synonymOf ) continue;
			if ( qt->m_synWids0 != (*tr)[wn].token_hash ) continue;
			if ( qt->m_synWids1 != wid2 ) continue;
			*numQWords = 3;
			return 3;
		}
	}

	// save the first word in the doc that we match first
	unsigned wn0 = wn;

	// CAUTION: the query "business development center" (in quotes)
	// would match a doc with "business development" and
	// "development center" as two separate phrases.

	// if query word never appears in quotes, it's a single word match
	if ( ! (m_qtableFlags[n] & 0x02) ) { *numQWords = 1; return 1; }

	// get word ids array for the doc
	//int64_t *swids = words->getStripWordIds();
	// the word we match in the query appears in quotes in the query
	int32_t k = -1;
	int32_t count = 0;
	unsigned nw = tr->size();

	// loop through all the quotes in the query and find
	// which one we match, if any. we will have to advance the
	// query word and doc word simultaneously and make sure they
	// match as we advance.
	int32_t nqw = m_q->m_numWords;
	int32_t j;
	for ( j = 0 ; j < nqw ; j++ ) {
		// get jth query word
		QueryWord *qw = &m_q->m_qwords[j];
		if ( !qw->m_rawWordId ) continue;
		// query word must match wid of first word in quote
		if ( (qw->m_rawWordId != (*tr)[wn].token_hash) ) continue;
		// (qw->m_rawWordId != swids[wn])) continue;
		// skip if in field
		// . we were doing an intitle:"fight club" query and
		//   needed to match that in the title...
		//if ( qw->m_fieldCode ) continue;
		// query word must be in quotes
		if ( ! qw->m_inQuotes ) continue;
		// skip it if it does NOT start the quote. quoteStart
		// is actually the query word # that contains the quote
		//if ( qw->m_quoteStart != j-1 ) continue;
		// not any more it isn't...
		if ( qw->m_quoteStart != j ) continue;
		// save the first word # in the query of the quote
		k = j; // -1;
		// count number of words we match in the quote, we've
		// already matched the first one
		count = 0;
	subloop:
		// query word must match wid of first word in phrase
		if ( (qw->m_rawWordId != (*tr)[wn].token_hash) ) {
			// (qw->m_rawWordId != swids[wn])) {
			// reset and try another quote in the query
			count = 0;
			wn = wn0;
			continue;
		}
		// up the count of query words matched in the quote
		count++;
		// ADVANCE QUERY WORD
		j++;
		// if no more, we got a match
		if ( j >= nqw ) break;
		// skip punct words
		if ( m_q->m_qwords[j].m_isPunct ) j++;
		// if no more, we got a match
		if ( j >= nqw ) break;
		// now we should point to the next query word in quote
		qw = &m_q->m_qwords[j];
		// if not in quotes, we're done, we got a match
		if ( ! qw->m_inQuotes ) break;
		// or if in a different set of quotes, we got a match
		if ( qw->m_quoteStart != k ) break;
		// . ADVANCE DOCUMENT WORD
		// . tags and punctuation words have 0 for their wid
		for ( wn++ ; wn < nw ; wn++ ) {
			// . if NO PUNCT, IN QUOTES, AND word id is zero
			//   then check for punctuation
			if(!allowPunctInPhrase && qw->m_inQuotes && !(*tr)[wn].is_alfanum) {
				// . check if it's a space [0x20, 0x00]
				if( ((*tr)[wn].token_len == 2) && ((*tr)[wn].token_start[0] == ' ') )
					continue;
				// . if the length is greater than a space
				else if( (*tr)[wn].token_len > 2 ) {
					// . increment until we find no space
					// . increment by 2 since it's utf16
					for( unsigned i = 0; i < (*tr)[wn].token_len; i+=2 )
						// . if it's not a space, it's punct
						if( (*tr)[wn].token_start[i] != ' ' ) {
							count=0; break;
						}
					// . if count is 0, punct found, break
					if( count == 0 ) break;
				}
				// . otherwise it's solo punct, set count and break
				else { count=0; break; }
			}
			// . we incremented to a new word, break and check
			if ( (*tr)[wn].is_alfanum ) break;
		}
		// there was a following query word in the quote
		// so there must be a following word, if not, continue
		// to try to find another quote in the query we match
		if ( wn >= nw ) {
			// reset and try another quote in the query
			count = 0;
			wn = wn0;
			continue;
		}
		// see if the next word and query term match
		goto subloop;
	}

	// if we did not match any quote in the query
	// check if we did match a single word. e.g.
	// Hello World "HelloWorld" "Hello World Example"
	if ( count <= 0 ) {
		if ( m_qtableFlags[n] & 0x01 ) {
			*numQWords = 1;
			// we did match a single word. m_qtableWordNums[n] may
			// not be pointing to the right qword. Set it to a
			// qword that is the single word
			for ( j = 0 ; j < nqw ; j++ ) {
				// get jth query word
				QueryWord *qw = &m_q->m_qwords[j];
				if ( !qw->m_rawWordId ) continue;
				// query word must match wid of word
				if ( (qw->m_rawWordId != (*tr)[wn].token_hash) ) continue;
				// (qw->m_rawWordId != swids[wn])) continue;
				// skip if in field
				// . fix intitle:"fight club"
				//if ( qw->m_fieldCode ) continue;
				// query word must NOT be in quotes
				if ( qw->m_inQuotes ) continue;
				*qwn = j;
			}
			return 1;
		}
		else
			return 0;
	}
	// sanity check
	if ( k < 0 ) { g_process.shutdownAbort(true); }
	// skip punct words
	if ( j-1>=0 && m_q->m_qwords[j-1].m_isPunct ) j--;
	// . ok, we got a quote match
	// . it had this many query words in it
	//*numQWords = j - (k+1);
	*numQWords = j - k;
	// fix the start word
	*qwn = k ;
	if (m_q->m_qwords[k].m_isPunct) *qwn = k+1;

	return wn - wn0 + 1;
}