#include "gb-include.h"

#include "Matches.h"
#include "Query.h"
#include "Titledb.h"  // for getting total # of docs in db
#include "StopWords.h"
#include "Phrases.h"
#include "Title.h"
#include "Domains.h"
#include "Sections.h"
#include "Linkdb.h"
#include "Xml.h"
#include "BitOperations.h"
#include "Process.h"
#include "Mem.h"


// TODO: have Matches set itself from all the meta tags, titles, link text,
//       neighborhoods and body. then proximity algo can utilize that info
//       as well as the summary generator, Summary.cpp. right now prox algo
//       was setting all those different classes itself.

Matches::Matches()
  : m_qwordFlags(NULL),
    m_numMatches(0),
    m_numSlots(0),
    m_q(NULL),
    m_numAlnums(0),
    m_qwordAllocSize(0),
    m_numMatchGroups(0)
{
	memset(m_matches, 0, sizeof(m_matches));	//@todo: added to silence Coverity. Remove if impacting performance (quite big memset)
	memset(m_qtableIds, 0, sizeof(m_qtableIds));	// PVS-Studio
	memset(m_qtableWordNums, 0, sizeof(m_qtableWordNums));	// PVS-Studio
	memset(m_qtableFlags, 0, sizeof(m_qtableFlags));	// PVS-Studio
	memset(m_tmpBuf, 0, sizeof(m_tmpBuf));	// PVS-Studio
}


Matches::~Matches() {
	reset();
}

void Matches::reset() { 
	reset2();
	if ( m_qwordFlags && m_qwordFlags != (mf_t *)m_tmpBuf ) {
		mfree ( m_qwordFlags , m_qwordAllocSize , "mmqw" );
		m_qwordFlags = NULL;
	}
}

void Matches::reset2() {
	m_numMatches = 0;
	m_numAlnums  = 0;
	// free all the classes' buffers
	for ( int32_t i = 0 ; i < m_numMatchGroups ; i++ ) {
		m_wordsArray   [i].reset();
		m_posArray     [i].reset();
		m_bitsArray    [i].reset();
	}
	m_numMatchGroups = 0;
}

bool Matches::isMatchableTerm(const QueryTerm *qt) const {
	const QueryWord *qw = qt->m_qword;
	// not derived from  a query word? how?
	if ( ! qw ) return false;
	if ( qw->m_ignoreWord == IGNORE_DEFAULT        ) return false;
	if ( qw->m_ignoreWord == IGNORE_FIELDNAME      ) return false;
	if ( qw->m_ignoreWord == IGNORE_BOOLOP         ) return false;
	// take this out for now so we highlight for title: terms
	if ( qw->m_fieldCode && qw->m_fieldCode != FIELD_TITLE ) return false;
	// what word # are we?
	int32_t qwn = qw - m_q->m_qwords;
	// do not include if in a quote and does not start it!!
	if ( qw->m_quoteStart >= 0 && qw->m_quoteStart != qwn ) return false;
	// if query is too long, a query word can be truncated!
	// this happens for some words if they are ignored, too!
	if ( ! qw->m_queryWordTerm && ! qw->m_queryPhraseTerm ) return false;
	return true;
}

void Matches::setQuery ( Query *q ) { 
	reset();
	// save it
	m_q       = q;

	if ( m_qwordFlags ) {
		g_process.shutdownAbort(true);
	}

	int32_t need = m_q->m_numWords * sizeof(mf_t) ;
	m_qwordAllocSize = need;

	if ( need < 128 ) 
		m_qwordFlags = (mf_t *)m_tmpBuf;
	else
		m_qwordFlags = (mf_t *)mmalloc ( need , "mmqf" );

	if ( ! m_qwordFlags ) {
		log("matches: alloc failed for query %s",q->originalQuery());
		return;
	}

	// this is word based. these are each 1 byte
	memset ( m_qwordFlags  , 0 , m_q->m_numWords * sizeof(mf_t));

	// # of WORDS in the query
	int32_t nqt = m_q->m_numTerms;

	// how many query words do we have that can be matched?
	int32_t numToMatch = 0;
	for ( int32_t i = 0 ; i < nqt ; i++ ) {
		// get query word #i
		QueryTerm *qt = &m_q->m_qterms[i];
		// skip if ignored *in certain ways only*
		if ( ! isMatchableTerm ( qt ) ) {
			continue;
		}
		// count it
		numToMatch++;
		// don't breach. MDW: i made this >= from > (2/11/09)
		if ( numToMatch < MAX_QUERY_WORDS_TO_MATCH ) continue;
		// note it
		log("matches: hit %" PRId32" max query words to match limit",
		    (int32_t)MAX_QUERY_WORDS_TO_MATCH);
		break;
	}

	// fix a core the hack way for now!
	if ( numToMatch < 256 ) numToMatch = 256;

	// keep number of slots in hash table a power of two for fast hashing
	m_numSlots = getHighestLitBitValue ( (uint32_t)(numToMatch * 3));
	// make the hash mask
	uint32_t mask = m_numSlots - 1;
	int32_t          n;
	// sanity check
	if ( m_numSlots > MAX_QUERY_WORDS_TO_MATCH * 3 ) {
		g_process.shutdownAbort(true); }

	// clear hash table
	memset ( m_qtableIds   , 0 , m_numSlots * 8 );
	memset ( m_qtableFlags , 0 , m_numSlots     );

	for ( int32_t i = 0 ; i < nqt ; i++ ) {
		// get query word #i
		QueryTerm *qt = &m_q->m_qterms[i];

		if( !qt ) {
			continue;
		}

		// skip if ignored *in certain ways only*
		if ( ! isMatchableTerm ( qt ) ) {
			continue;
		}

		// get the word it is from
		const QueryWord *qw = qt->m_qword;

		// get word #
		int32_t qwn = qw - q->m_qwords;

		// do not overfill table
		if ( i >= MAX_QUERY_WORDS_TO_MATCH ) {
			break;
		}

		// this should be equivalent to the word id
		int64_t qid = qt->m_rawTermId;//qw->m_rawWordId;

		// but NOT for 'cheatcodes.com'
		if ( qt->m_isPhrase ) {
			qid = qw->m_rawWordId;
		}

		// if its a multi-word synonym, like "new jersey" we must
		// index the individual words... or compute the phrase ids
		// for all the words in the doc. right now the qid is
		// the phrase hash for this guy i think...
		if ( qt->m_synonymOf && qt->m_numAlnumWordsInSynonym == 2 ) {
			qid = qt->m_synWids0;
		}

		// put in hash table
		n = ((uint32_t)qid) & mask;

		// chain to an empty slot
		while ( m_qtableIds[n] && m_qtableIds[n] != qid ) {
			if ( ++n >= m_numSlots ) {
				n = 0;
			}
		}

		// . if already occupied, do not overwrite this, keep this
		//   first word, the other is often ignored as IGNORE_REPEAT
		// . what word # in the query are we. save this.
		if ( ! m_qtableIds[n] ) m_qtableWordNums[n] = qwn;

		// store it
		m_qtableIds[n] = qid;

		// in quotes? this term may appear multiple times in the
		// query, in some cases in quotes, and in some cases not.
		// we need to know either way for logic below.
		if ( qw->m_inQuotes ) {
			m_qtableFlags[n] |= 0x02;
		}
		else {
			m_qtableFlags[n] |= 0x01;
		}

		// this is basically a quoted synonym
		if ( qt->m_numAlnumWordsInSynonym == 2 ) {
			m_qtableFlags[n] |=  0x08;
		}

		//QueryTerm *qt = qw->m_queryWordTerm;
		if ( qt->m_termSign == '+' ) {
			m_qtableFlags[n] |= 0x04;
		}

		//
		// if query has e-mail, then index phrase id "email" so
		// it matches "email" in the doc.
		// we need this for the 'cheat codes' query as well so it
		// highlights 'cheatcodes'
		//
		int64_t pid = qw->m_rawPhraseId;
		if ( pid == 0 ) {
			continue;
		}

		// put in hash table
		n = ((uint32_t)pid) & mask;
		// chain to an empty slot
		while ( m_qtableIds[n] && m_qtableIds[n] != pid ) 
			if ( ++n >= m_numSlots ) n = 0;
		// this too?
		if ( ! m_qtableIds[n] ) m_qtableWordNums[n] = qwn;
		// store it
		m_qtableIds[n] = pid;

	}
}

// . this was in Summary.cpp, but is more useful here
// . we can also use this to replace the proximity algo setup where it
//   fills in the matrix for title, link text, etc.
// . returns false and sets g_errno on error
bool Matches::set( Words *bodyWords, Phrases *bodyPhrases, Sections *bodySections, Bits *bodyBits,
				   Pos *bodyPos, Xml *bodyXml, Title *tt, Url *firstUrl, LinkInfo *linkInfo ) {
	// don't reset query info!
	reset2();

	// . first add all the matches in the body of the doc
	// . add it first since it will kick out early if too many matches
	//   and we get all the explicit bits matched
	if ( !addMatches( bodyWords, bodyPhrases, bodySections, bodyBits, bodyPos, MF_BODY ) ) {
		return false;
	}

	// add the title in
	if ( !addMatches( tt->getTitle(), tt->getTitleLen(), MF_TITLEGEN ) ) {
		return false;
	}

	// add in the url terms
	if ( !addMatches( firstUrl->getUrl(), firstUrl->getUrlLen(), MF_URL ) ) {
		return false;
	}

	// also use the title from the title tag, because sometimes it does not equal "tt->getTitle()"
	int32_t  a     = tt->getTitleTagStart();
	int32_t  b     = tt->getTitleTagEnd();

	char *start = NULL;
	char *end   = NULL;
	if ( a >= 0 && b >= 0 && b>a ) {
		start = bodyWords->getWord(a);
		end   = bodyWords->getWord(b-1) + bodyWords->getWordLen(b-1);
		if ( !addMatches( start, end - start, MF_TITLETAG ) ) {
			return false;
		}
	}

	// now add in the meta tags
	int32_t n = bodyXml->getNumNodes();
	XmlNode *nodes = bodyXml->getNodes();

	// find the first meta summary node
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// continue if not a meta tag
		if ( nodes[i].m_nodeId != TAG_META ) continue;
		// only get content for <meta name=..> not <meta http-equiv=..>
		int32_t tagLen;
		char *tag = bodyXml->getString ( i , "name" , &tagLen );
		// is it an accepted meta tag?
		int32_t flag = 0;
		if (tagLen== 7&&strncasecmp(tag,"keyword"    , 7)== 0)
			flag = MF_METAKEYW;
		if (tagLen== 7&&strncasecmp(tag,"summary"    , 7)== 0)
			flag = MF_METASUMM;
		if (tagLen== 8&&strncasecmp(tag,"keywords"   , 8)== 0)
			flag = MF_METAKEYW;
		if (tagLen==11&&strncasecmp(tag,"description",11)== 0)
			flag = MF_METADESC;
		if ( ! flag ) continue;
		// get the content
		int32_t len;
		char *s = bodyXml->getString ( i , "content" , &len );
		if ( ! s || len <= 0 ) continue;
		// wordify
		if ( !addMatches( s, len, flag ) ) {
			return false;
		}
	}

	// . now the link text
	// . loop through each link text and it its matches

	// loop through the Inlinks
	for (Inlink *k = NULL; linkInfo && (k = linkInfo->getNextInlink(k)); ) {
		// does it have link text? skip if not.
		if ( k->size_linkText <= 1 ) {
			continue;
		}

		// set the flag, the type of match
		mf_t flags = MF_LINK;

		// add it in
		if ( !addMatches( k->getLinkText(), k->size_linkText - 1, flags ) ) {
			return false;
		}

		// set flag for that
		flags = MF_HOOD;

		// add it in
		if ( !addMatches( k->getSurroundingText(), k->size_surroundingText - 1, flags ) ) {
			return false;
		}

		// parse the rss up into xml
		Xml rxml;
		if ( ! k->setXmlFromRSS ( &rxml ) ) {
			return false;
		}

		// add rss description
		bool isHtmlEncoded;
		int32_t rdlen;
		char *rd = rxml.getRSSDescription( &rdlen, &isHtmlEncoded );
		if ( !addMatches( rd, rdlen, MF_RSSDESC ) ) {
			return false;
		}

		// add rss title
		int32_t rtlen;
		char *rt = rxml.getRSSTitle( &rtlen, &isHtmlEncoded );
		if ( !addMatches( rt, rtlen, MF_RSSTITLE ) ) {
			return false;
		}
	}

	// that should be it
	return true;
}

bool Matches::addMatches( char *s, int32_t slen, mf_t flags ) {
	// . do not breach
	// . happens a lot with a lot of link info text
	if ( m_numMatchGroups >= MAX_MATCHGROUPS ) {
		return true;
	}

	// get some new ptrs for this match group
	Words    *wp = &m_wordsArray    [ m_numMatchGroups ];
	Bits     *bp = &m_bitsArray     [ m_numMatchGroups ];
	Pos      *pb = &m_posArray      [ m_numMatchGroups ];

	// set the words class for this match group
	if ( !wp->set( s, slen, true ) ) {
		return false;
	}

	// bits vector
	if ( ! bp->setForSummary ( wp ) ) {
		return false;
	}

	// position vector
	if ( ! pb->set ( wp ) ) {
		return false;
	}

	// record the start
	int32_t startNumMatches = m_numMatches;
	// sometimes it returns true w/o incrementing this
	int32_t n = m_numMatchGroups;
	// . add all the Match classes from this match group
	// . this increments m_numMatchGroups on success
	bool status = addMatches( wp, NULL, NULL, bp, pb, flags );

	// if this matchgroup had some, matches, then keep it
	if ( m_numMatches > startNumMatches ) {
		return status;
	}

	// otherwise, reset it, useless
	wp->reset();
	bp->reset();
	pb->reset();

	// do not decrement the counter if we never incremented it
	if ( n == m_numMatchGroups ) {
		return status;
	}

	// ok, remove it
	m_numMatchGroups--;

	return status;
}

// . TODO: support stemming later. each word should then have multiple ids.
// . add to our m_matches[] array iff addToMatches is true, otherwise we just
//   set the m_foundTermVector for doing the BIG HACK described in Summary.cpp
bool Matches::addMatches(Words *words, Phrases *phrases, Sections *sections, Bits *bits, Pos *pos, mf_t flags ) {
	// if no query term, bail.
	if ( m_numSlots <= 0 ) {
		return true;
	}

	// . do not breach
	// . happens a lot with a lot of link info text
	if ( m_numMatchGroups >= MAX_MATCHGROUPS ) {
		return true;
	}

	Section *sp = NULL;
	if ( sections ) {
		sp = sections->m_sections;
	}

	mf_t eflag = 0;

	m_numMatchGroups++;

	const int64_t *pids = NULL;
	if ( phrases ) {
		pids = phrases->getPhraseIds2();
	}

	// set convenience vars
	uint32_t mask = m_numSlots - 1;
	const int64_t *wids = words->getWordIds();
	const int32_t *wlens = words->getWordLens();
	const char * const *wptrs = words->getWordPtrs();
	nodeid_t *tids = words->getTagIds();
	int32_t nw = words->getNumWords();
	int32_t n;
	int32_t matchStack = 0;
	int64_t nextMatchWordIdMustBeThis = 0;
	int32_t nextMatchWordPos = 0;
	int32_t lasti = -3;

	if ( getNumXmlNodes() > 512 ) { g_process.shutdownAbort(true); }

	int32_t badFlags =SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;

	int32_t qwn;
	int32_t numQWords;
	int32_t numWords;

	//
	// . set m_matches[] array
	// . loop over all words in the document
	//
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		//if      (tids && (tids[i]            ) == TAG_A) 
		//	inAnchTag = true;
		//else if (tids && (tids[i]&BACKBITCOMP) == TAG_A) 
		//	inAnchTag = false;

		if ( tids && tids[i] ){
			// tagIds don't have wids and are skipped
			continue;
		}

		// skip if wid is 0, it is not an alnum word then
		if ( ! wids[i] ) {
			continue;
		}

		// count the number of alnum words
		m_numAlnums++;

		// clear this
		eflag = 0;

		// NO NO, a score of -1 means in a select tag, and
		// we do index that!! so only skip if wscores is 0 now.
		// -1 means in script, style, select or marquee. it is
		// indexed but with very little weight... this is really
		// a hack in Scores.cpp and should be fixed.
		// in Scores.cpp we set even the select tag stuff to -1...
		//if ( wscores && wscores[i] == -1 ) continue;
		if ( sp && (sp->m_flags & badFlags) ) continue;


		// . does it match a query term?
		// . hash to the slot in the hash table
		n = ((uint32_t)wids[i]) & mask;
		//n2 = swids[i]?((uint32_t)swids[i]) & mask:n;
	chain1:
		// skip if slot is empty (doesn't match query term)
		//if ( ! m_qtableIds[n] && ! m_qtableIds[n2]) continue;
		if ( ! m_qtableIds[n] ) goto tryPhrase;
		// otherwise chain
		if ( (m_qtableIds[n] != wids[i]) ) {
			if ( m_qtableIds[n] && ++n >= m_numSlots ) n = 0;
			goto chain1;
		}
		// we got one!
		goto gotMatch;


		//
		// fix so we hihglight "woman's" when query term is "woman"
		// for 'spiritual books for women' query
		// 
	tryPhrase:
		// try without 's if it had it
		if ( wlens[i] >= 3 && 
		     wptrs[i][wlens[i]-2] == '\'' &&
		     to_lower_a(wptrs[i][wlens[i]-1]) == 's' ) {
			// move 's from word hash... very tricky
			int64_t nwid = wids[i];
			// undo hash64Lower_utf8 in hash.h
			nwid ^= g_hashtab[wlens[i]-1][(uint8_t)'s'];
			nwid ^= g_hashtab[wlens[i]-2][(uint8_t)'\''];
			n = ((uint32_t)nwid) & mask;
		chain2:
			if ( ! m_qtableIds[n] ) goto tryPhrase2;
			if ( (m_qtableIds[n] != nwid) ) {
				if ( m_qtableIds[n] && ++n >= m_numSlots ) n=0;
				goto chain2;
			}
			qwn = m_qtableWordNums[n];
			numWords = 1;
			numQWords = 1;
			// we got one!
			goto gotMatch2;
		}

	tryPhrase2:
		// try phrase first
		if ( pids && pids[i] ) {
			n = ((uint32_t)pids[i]) & mask;
		chain3:
			if ( ! m_qtableIds[n] ) continue;
			if ( (m_qtableIds[n] != pids[i]) ) {
				if ( m_qtableIds[n] && ++n >= m_numSlots)n = 0;
				goto chain3;
			}
			// what query word # do we match?
			qwn = m_qtableWordNums[n];
			// get that query word #
			QueryWord *qw = &m_q->m_qwords[qwn];
			// . do we match it as a single word?
			// . did they search for "bluetribe" ...?
			if ( qw->m_rawWordId == pids[i] ) {
				// set our # of words basically to 3
				numWords = 3;
				// matching a single query word
				numQWords = 1;
				// got a match
				goto gotMatch2;
			}
			if ( qw->m_phraseId == pids[i] ) {
				// might match more if we had more query
				// terms in the quote
				numWords = getNumWordsInMatch( words, i, n, &numQWords, &qwn, true );

				// this is 0 if we were an unmatched quote
				if ( numWords <= 0 ) continue;

				// got a match
				goto gotMatch2;
			}
			// otherwise we are matching a query phrase id
			log("matches: wtf? query word not matched for "
			    "highlighting... strange.");
			// assume one word for now
			numWords = 1;
			numQWords = 1;
			goto gotMatch2;
		}

		//
		// shucks, no match
		//
		continue;

	gotMatch:
		// what query word # do we match?
		qwn = m_qtableWordNums[n];
		

		// . how many words are in this match?
		// . it may match a single word or a phrase or both
		// . this will be 1 for just matching a single word, and 
		//   multiple words for quotes/phrases. The number of words
		//   in both cases will included unmatched punctuation words
		//   and tags in between matching words.
		numQWords = 0;
		numWords = getNumWordsInMatch( words, i, n, &numQWords, &qwn, true );
		// this is 0 if we were an unmatched quote
		if ( numWords <= 0 ) continue;

	gotMatch2:
		// get query word
		QueryWord *qw = &m_q->m_qwords[qwn];
		// point to next word in the query
		QueryWord *nq = NULL;
		if ( qwn+2 < m_q->m_numWords ) nq = &m_q->m_qwords[qwn+2];

		// . if only one word matches and its a stop word, make sure
		//   it's next to the correct words in the query
		// . if phraseId is 0, that means we do not start a phrase,
		//   because stop words can start phrases if they are the
		//   first word, are capitalized, or have breaking punct before
		//   them.
		if ( numWords == 1 && 
		     ! qw->m_inQuotes &&
		     m_q->m_numWords > 2 &&
		     qw->m_wordSign == '\0' &&
		     (nq && nq->m_wordId) && // no field names can follow
		     //(qw->m_isQueryStopWord || qw->m_isStopWord ) ) {
		     // we no longer consider single alnum chars to be
		     // query stop words as stated in StopWords.cpp to fix
		     // the query 'j. w. eagan'
		     qw->m_isQueryStopWord ) {
			// if stop word does not start a phrase in the query 
			// then he must have a matched word before him in the
			// document. if he doesn't then do not count as a match
			if ( qw->m_phraseId == 0LL && i-2 != lasti ) {
				// peel off anybody before us
				m_numMatches -= matchStack;
				if ( m_numMatches < 0 ) m_numMatches = 0;
				// don't forget to reset the match stack
				matchStack = 0;	

				continue; 
			}
			// if we already have a match stack, we must
			// be in nextMatchWordPos
			if ( matchStack && nextMatchWordPos != i ) {
				// peel off anybody before us 
				m_numMatches -= matchStack;
				if ( m_numMatches < 0 ) m_numMatches = 0;
				// don't forget to reset the match stack
				matchStack = 0;	
				//continue; 
			}
			// if the phraseId is 0 and the previous word
			// is a match, then we're ok, but put us on a stack
			// so if we lose a match, we'll be erased
			QueryWord *nq = &m_q->m_qwords[qwn+2];
			// next match is only required if next word in query
			// is indeed valid.
			if ( nq->m_wordId && nq->m_fieldCode == 0 ) {
				nextMatchWordIdMustBeThis = nq->m_rawWordId;
				nextMatchWordPos          = i + 2;
				matchStack++;
			}
		}
		else if ( matchStack ) {
			// if the last word matched was a stop word, we have to
			// match otherwise we have to remove the whole stack.
			if ( qw->m_rawWordId != nextMatchWordIdMustBeThis ||
			     i                > nextMatchWordPos ) {
				m_numMatches -= matchStack;
				// ensure we never go negative like for 
				// www.experian.com query
				if ( m_numMatches < 0 ) m_numMatches = 0;
			}
			// always reset this here if we're not a stop word
			matchStack = 0;
		}

		// record word # of last match
		lasti = i;

		// otherwise, store it in our m_matches[] array
		Match *m = &m_matches[m_numMatches];

		// the word # in the doc, and how many of 'em are in the match
		m->m_wordNum  = i;
		m->m_numWords = numWords;

		// the word # in the query, and how many of 'em we match
		m->m_qwordNum  = qwn;
		m->m_numQWords = numQWords;

		// get the first query word # of this match
		qw = &m_q->m_qwords[qwn];

		// convenience, used by Summary.cpp
		m->m_words    = words;
		m->m_sections = sections;
		m->m_bits     = bits;
		m->m_pos      = pos;
		m->m_flags    = flags | eflag ;

		// add to our vector. we want to know where each QueryWord
		// is. i.e. in the title, link text, meta tag, etc. so
		// the proximity algo in Summary.cpp can use that info.
		m_qwordFlags[qwn] |= flags;

		// advance
		m_numMatches++;

		// we get atleast MAX_MATCHES
		if ( m_numMatches < MAX_MATCHES ) {
			continue;
		}

		break;
	}

	// peel off anybody before us
	m_numMatches -= matchStack;
	if ( m_numMatches < 0 ) m_numMatches = 0;

	return true;
}

// . word #i in the doc matches slot #n in the hash table
int32_t Matches::getNumWordsInMatch(Words *words, int32_t wn, int32_t n, int32_t *numQWords, int32_t *qwn,
				    bool allowPunctInPhrase) {
	// is it a two-word synonym?
	if ( m_qtableFlags[n] & 0x08 ) {
		// get the word following this
		int64_t wid2 = 0LL;
		if ( wn+2 < words->getNumWords() )
			wid2 = words->getWordId(wn+2);
		// scan the synonyms...
		const int64_t *wids = words->getWordIds();
		for ( int32_t k = 0 ; k < m_q->m_numTerms ; k++ ) {
			QueryTerm *qt = &m_q->m_qterms[k];
			if ( ! qt->m_synonymOf ) continue;
			if ( qt->m_synWids0 != wids[wn] ) continue;
			if ( qt->m_synWids1 != wid2 ) continue;
			*numQWords = 3; 
			return 3;
		}
	}

	// save the first word in the doc that we match first
	int32_t wn0 = wn;

	// CAUTION: the query "business development center" (in quotes)
	// would match a doc with "business development" and 
	// "development center" as two separate phrases.

	// if query word never appears in quotes, it's a single word match
	if ( ! (m_qtableFlags[n] & 0x02) ) { *numQWords = 1; return 1; }

	// get word ids array for the doc
	int64_t  *wids   = words->getWordIds();
	//int64_t  *swids  = words->getStripWordIds();
	char      **ws     = words->getWordPtrs();
	int32_t       *wl     = words->getWordLens();
	//the word we match in the query appears in quotes in the query
	int32_t k     = -1;
	int32_t count = 0;
	int32_t nw    = words->getNumWords();

	// loop through all the quotes in the query and find
	// which one we match, if any. we will have to advance the
	// query word and doc word simultaneously and make sure they
	// match as we advance. 
	int32_t nqw = m_q->m_numWords;
	int32_t j;
	for ( j = 0 ; j < nqw ; j++ ) {
		// get ith query word
		QueryWord *qw = &m_q->m_qwords[j];
		if ( !qw->m_rawWordId ) continue;
		// query word must match wid of first word in quote
		if ( (qw->m_rawWordId != wids[wn]) ) continue;
		//     (qw->m_rawWordId != swids[wn])) continue;
		// skip if in field
		// . we were doing an intitle:"fight club" query and
		//   needed to match that in the title...
		//if ( qw->m_fieldCode         ) continue;
		// query word must be in quotes
		if ( ! qw->m_inQuotes        ) continue;
		// skip it if it does NOT start the quote. quoteStart
		// is actually the query word # that contains the quote
		//if ( qw->m_quoteStart != j-1 ) continue;
		// not any more it isn't...
		if ( qw->m_quoteStart != j ) continue;
		// save the first word # in the query of the quote
		k = j; // -1;
		// count number of words we match in the quote, we've
		// already matched the first one
		count = 0;
	subloop:
		// query word must match wid of first word in phrase
		if ( (qw->m_rawWordId != wids[wn]) ) {
		//     (qw->m_rawWordId != swids[wn])) {
			// reset and try another quote in the query
			count = 0;
			wn    = wn0;
			continue;
		}
		// up the count of query words matched in the quote
		count++;
		// ADVANCE QUERY WORD
		j++;
		// if no more, we got a match
		if ( j >= nqw ) break;
		// skip punct words
		if ( m_q->m_qwords[j].m_isPunct ) j++;
		// if no more, we got a match
		if ( j >= nqw ) break;
		// now we should point to the next query word in quote
		qw = &m_q->m_qwords[j];
		// if not in quotes, we're done, we got a match
		if ( ! qw->m_inQuotes      ) break;
		// or if in a different set of quotes, we got a match
		if ( qw->m_quoteStart != k ) break;
		// . ADVANCE DOCUMENT WORD
		// . tags and punctuation words have 0 for their wid
		for ( wn++ ; wn < nw ; wn++ ) {
			// . if NO PUNCT, IN QUOTES, AND word id is zero
			//   then check for punctuation
			if(!allowPunctInPhrase && qw->m_inQuotes && !wids[wn]) {
				// . check if its a space [0x20, 0x00]
				if( (wl[wn] == 2) && (ws[wn][0] == ' ') ) 
					continue;
				// . if the length is greater than a space
				else if( wl[wn] > 2 ) {
					// . increment until we find no space
					// . increment by 2 since its utf16
					for( int32_t i = 0; i < wl[wn]; i+=2 )
						// . if its not a space, its punc
						if( ws[wn][i] != ' ' ) {
							count=0; break;
						}
					// . if count is 0, punc found break
					if( count == 0 ) break;
				}
				// . otherwise its solo punc, set count and break
				else { count=0; break; }
			}
			// . we incremented to a new word break and check
			if ( wids[wn] ) break;
		}
		// there was a following query word in the quote
		// so there must be a following word, if not, continue
		// to try to find another quote in the query we match
		if ( wn >= nw ) { 
			// reset and try another quote in the query
			count = 0; 
			wn    = wn0;
			continue;
		}
		// see if the next word and query term match
		goto subloop;
	}

	// if we did not match any quote in the query
	// check if we did match a single word. e.g.
	// Hello World "HelloWorld" "Hello World Example"
	if ( count <= 0 ) {
		if ( m_qtableFlags[n] & 0x01 ) {
			*numQWords = 1;
			// we did match a single word. m_qtableWordNums[n] may
			// not be pointing to the right qword. Set it to a 
			// qword that is the single word
			for ( j = 0 ; j < nqw ; j++ ) {
				// get ith query word
				QueryWord *qw = &m_q->m_qwords[j];
				if ( !qw->m_rawWordId ) continue;
				// query word must match wid of word
				if ( (qw->m_rawWordId != wids[wn]) ) continue;
				//   (qw->m_rawWordId != swids[wn])) continue;
				// skip if in field
				// . fix intitle:"fight club"
				//if ( qw->m_fieldCode         ) continue;
				// query word must NOT be in quotes
				if ( qw->m_inQuotes        ) continue;
				*qwn = j;
			}
			return 1;
		}
		else
			return 0;
	}
	// sanity check
	if ( k < 0 ) { g_process.shutdownAbort(true); }
	// skip punct words
	if ( j-1>=0 && m_q->m_qwords[j-1].m_isPunct ) j--;
	// . ok, we got a quote match
	// . it had this man query words in it
	//*numQWords = j - (k+1);
	*numQWords = j - k;
	// fix the start word
	*qwn = k ;
	if (m_q->m_qwords[k].m_isPunct) *qwn = k+1;

	return wn - wn0 + 1;
}