privacore-open-source-searc…/Phrases.cpp

//#include "gb-include.h"

#include "Phrases.h"
#include "Words.h"
#include "Bits.h"
#include "Mem.h"
#include "Conf.h"
#include "Sanity.h"


Phrases::Phrases() : m_buf(NULL) {
	// nothing is attached yet: no source words, no per-word flags
	m_words = NULL;
	m_bits = NULL;
	m_wptrs = NULL;
	m_wlens = NULL;
	m_wids = NULL;

	// no phrase arrays have been carved out of a buffer yet
	m_phraseIds2 = NULL;
	m_numWordsTotal2 = NULL;
	m_numPhrases = 0;
	m_bufSize = 0;

	// start with a zeroed local buffer
	memset(m_localBuf, 0, sizeof(m_localBuf));

	// m_buf is NULL here, so this only leaves it NULL
	reset();
}

Phrases::~Phrases() {
	// releases the phrase buffer if it did not come from m_localBuf
	reset();
}

void Phrases::reset() {
	// m_buf either points at m_localBuf or at memory obtained via mmalloc()
	// in set(); only the latter is handed back to mfree()
	const bool ownsSeparateBuf = ( m_buf != NULL ) && ( m_buf != m_localBuf );

	if ( ownsSeparateBuf ) {
		mfree ( m_buf , m_bufSize , "Phrases" );
	}

	// either way, no buffer is in use anymore
	m_buf = NULL;
}


// . compute the phrase arrays for "words"
// . "words" supplies the word ids, pointers and lengths; "bits" supplies the
//   per-word flags consulted by setPhrase()
// . both pointers are stored (not copied) and are used again by later calls
//   such as getPhrase()
// . returns true on success, including when "words" is NULL (nothing to do)
// . returns false, after logging g_errno, if the buffer cannot be allocated
bool Phrases::set( const Words *words, const Bits *bits ) {
	// reset in case being re-used
	reset();

	// ensure we have words
	if ( ! words ) return true;

	// . we have one phrase slot per word
	// . slot #n is "empty" if m_phraseIds2[n] == 0
	m_numPhrases = words->getNumWords();

	// per slot: 8 bytes of phrase id plus 1 byte of word count
	int32_t need = m_numPhrases * (8+1);

	// use the local buffer when it is big enough, otherwise allocate
	if ( (unsigned)need > sizeof(m_localBuf) )
		m_buf = (char *)mmalloc ( need , "Phrases" );
	else
		m_buf = m_localBuf;

	if ( ! m_buf ) {
		log(LOG_WARN, "query: Phrases::set: %s",mstrerror(g_errno));
		return false;
	}

	m_bufSize = need;

	// . start out with "no phrase" in every slot
	// . setPhrase() is only invoked for words that have a word id, so
	//   without this the slots of all other words would keep whatever the
	//   buffer held before: uninitialized heap memory, or leftovers from
	//   a previous set() when the local buffer is re-used
	memset ( m_buf , 0 , need );

	// set up arrays
	char *p = m_buf;

	// the two-word phrase ids, one per word
	m_phraseIds2 = (int64_t *)p;
	p += m_numPhrases * 8;

	// number of words spanned by each two-word phrase, one per word
	m_numWordsTotal2 = (unsigned char *)p;
	p += m_numPhrases * 1;

	// sanity: the arrays must use up the buffer exactly
	if ( p != m_buf + need ) gbshutdownLogicError();

	// point to this info while we parse
	m_words        = words;
	m_wptrs        = words->getWordPtrs();
	m_wlens        = words->getWordLens();
	m_wids         = words->getWordIds();
	m_bits         = bits;

	// . set the phrases
	// . sets m_phraseIds2[i] and m_numWordsTotal2[i]
	// . words without a word id cannot start a phrase; their slots keep
	//   the zeros written above
	for ( int32_t i = 0 ; i < words->getNumWords() ; ++i ) {
		if ( ! m_wids[i] ) {
			continue;
		}

		setPhrase ( i );
	}

	// success
	return true;
}

// . compute the two-word phrase that starts with the ith word
// . scans forward from word #i+1 until a second word that can be in a
//   phrase is found, giving up after 32 following words or at the first
//   word that cannot be paired across
// . on success m_phraseIds2[i] receives the phrase hash and
//   m_numWordsTotal2[i] the number of words (punct words included) from
//   word #i through the second phrase word
// . otherwise both are set to 0, meaning "no phrase starts here"
// . set() only calls this for words that have a non-zero word id
void Phrases::setPhrase ( int32_t i ) {
	logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 " BEGIN", i);

	// running hash of the phrase; seeded with the id of word #i below
	int64_t h   = 0LL; 

	// the hash of the two-word phrase
	int64_t h2  = 0LL; 

	// continuation state passed to hash64Lower_utf8_cont(); set to the
	// length of word #i (truncated to one byte) below
	unsigned char pos = 0;

	// now look for other tokens that should follow the ith token
	int32_t nw = m_words->getNumWords();
	// word #i itself counts as the first word of the phrase
	int32_t numWordsInPhrase = 1;

	// we need to hash "1 / 8" differently from "1.8" from "1,000" etc.
	char isNum = is_digit(m_wptrs[i][0]);

	// index of the last word with a word id seen after word #i, or -1 if
	// there was none. only consulted by the sanity checks further down.
	int32_t lastWordj = -1;

	// declared without initializers so that the "goto nophrase" below does
	// not jump across an initialization; the nophrase path never reads them
	bool hasHyphen;
	bool hasStopWord2 ;

	// . NOTE: a token can start a phrase but NOT be in it. 
	// . like a large number for example.
	// . wordId is the lower ascii hash of the ith word
	// . NO... this is allowing the query operator PiiPe to start
	//   a phrase but not be in it, then the phrase id ends up just
	//   being the following word's id. causing the synonyms code to
	//   give a synonym which it should not in Synonyms::set()
	if ( ! m_bits->canBeInPhrase(i) ) {
		// so indeed, skip it then
		goto nophrase;
	}

	// the phrase hash starts out as the id of the first word
	h = m_wids[i];

	// set position (note: the word length is truncated to one byte here)
	pos = (unsigned char)m_wlens[i];

	hasHyphen = false;
	hasStopWord2 = m_bits->isStopWord(i);

	for( int32_t j = i + 1 ; j < nw ; j++ ) {
		logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". LOOP START", i, j, m_wids[i], m_wids[j] );

		// Do not allow more than 32 alnum/punct "words" in a phrase.
		// This prevents phrases with 100,000 words from slowing
		// us down. would put us in a huge double-nested for loop
		// BR: But it will never happen? It breaks out of the loop
		//     when the phrase contains 2 (real) words?
		if ( j > i + 32 ) {
			logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". j > i+32. no phrase", i, j, m_wids[i], m_wids[j] );
			goto nophrase;
		}

		// deal with punct words (words without a word id)
		if ( ! m_wids[j] ) {
			// if we cannot pair across word j then break
			if ( !m_bits->canPairAcross( j ) ) {
				logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Pair cannot cross. Breaking.", i, j, m_wids[i], m_wids[j] );
				break;
			}

			// does it have a hyphen? only the punct word directly
			// after word #i is considered.
			if ( j == i + 1 && m_words->hasChar( j, '-' ) ) {
				hasHyphen = true;
				logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64 ". j is hyphen, NOT adding to phrase", i, j, m_wids[i], m_wids[j] );
			}
			else {
				logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64 ". j is space, NOT adding to phrase", i, j, m_wids[i], m_wids[j] );
			}
			// punct words are never hashed into the phrase
			continue;
		}

		// record lastWordj to indicate that word #j was a true word
		lastWordj = j;

		// if word #j can be in phrase then incorporate its hash
		if ( m_bits->canBeInPhrase (j) ) {
			// the hash function takes the continuation state as an
			// int32_t, so widen "pos" for the call and store it back
			int32_t conti = pos;

			// hash the jth word into the hash
			h = hash64Lower_utf8_cont( m_wptrs[j], m_wlens[j], h, &conti );

			// narrows back to one byte
			pos = conti;

			++numWordsInPhrase;

			logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". CAN be in phrase. Adding j's hash. numWordsInPhrase=%" PRId32 "", i, j, m_wids[i], m_wids[j], numWordsInPhrase);


			// we have the second word: record the phrase and stop
			if ( numWordsInPhrase == 2 ) {
				h2 = h;
				// words spanned, from word #i through word #j
				m_numWordsTotal2[i] = j - i + 1;
				// NOTE(review): this replaces the value taken
				// from word #i above, so only word #j's
				// stop-word status reaches the final test.
				// confirm that is intended given the
				// "the rapist" example below.
				hasStopWord2 = m_bits->isStopWord(j);

				logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Words in phrase is 2. Breaking.", i, j, m_wids[i], m_wids[j] );
				break;
			}
		}
		else {
			logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". j cannot be in a phrase.", i, j, m_wids[i], m_wids[j] );
		}
			

		// if we cannot pair across word j then break
		if ( ! m_bits->canPairAcross (j) ) {
			logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Cannot pair across. Breaking.", i, j, m_wids[i], m_wids[j] );
			break;
		}

		// otherwise, get the next word
		logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Get next word", i, j, m_wids[i], m_wids[j] );
	}

	// if we had no phrase then use 0 as id (need 2+ words to be a phrase)
	if ( numWordsInPhrase <= 1 ) { 
	// also the target of the "goto nophrase" statements above
	nophrase:
		m_phraseIds2[i]      = 0LL; 
		m_numWordsTotal2[i]   = 0;
		logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", wids[i]=%20" PRIu64". END. Not a phrase. m_phraseIds2[i]=%" PRIu64 "", i, m_wids[i], m_phraseIds2[i]);
		return;
	}

	// sanity check: a second phrase word implies a true word was seen
	if ( lastWordj == -1 ) gbshutdownLogicError();

	// sanity check: the span must fit the one-byte m_numWordsTotal2 entry
	if ( lastWordj - i + 1 > 255 ) gbshutdownLogicError();

	// hyphen between numbers does not count (so 1-2 != 12)
	if ( isNum ) hasHyphen = false;

	// . the two word phrase id
	// . "cd rom"    -> cdrom
	// . "fly paper" -> flypaper
	// . "i-phone"   -> iphone
	// . "e-mail"    -> email
	if ( hasHyphen || ! hasStopWord2 ) {
		m_phraseIds2[i] = h2;
		logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", wids[i]=%20" PRIu64". END. Has hyphen or no stopword. m_phraseIds2[i]=%" PRIu64 "", i, m_wids[i], m_phraseIds2[i] );
	}
	// . "st. and"    !-> stand
	// . "the rapist" !-> therapist
	// . reached when there is no hyphen and hasStopWord2 is set; the hash
	//   is xor'ed with a fixed constant so the id differs from h2
	else {
		m_phraseIds2[i] = h2 ^ 0x768867;
		logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", wids[i]=%20" PRIu64". END. either no hyphen or a stopword. m_phraseIds2[i]=%" PRIu64 "", i, m_wids[i], m_phraseIds2[i]);
	}
}


// . store the phrase that starts with word #i into "buf" in lower case
// . "bufsize" is the capacity of "buf" in bytes, terminating NUL included
// . "*phrLen" receives the number of bytes stored, not counting the NUL
// . if no phrase starts at word #i, or the phrase does not fit, "buf" is
//   set to the empty string and "*phrLen" to 0
// . every non-alnum word of the phrase is stored as a single space
void Phrases::getPhrase(int32_t i, char *buf, size_t bufsize, int32_t *phrLen) const {
	// without any room we cannot even store the terminating NUL
	if ( bufsize == 0 ) {
		*phrLen = 0;
		return;
	}

	// empty result if no phrase; make sure the caller gets a length too
	if ( m_phraseIds2[i] == 0LL ) {
		*phrLen = 0;
		*buf='\0';
		return;
	}

	// . how many words, including punct words, are in phrase?
	// . this should never be 1 or less
	int32_t  n = m_numWordsTotal2[i] ;

	char *s     = buf;
	// the last byte of "buf" is reserved for the terminating NUL
	char *send  = buf + bufsize - 1;
	for (int32_t w = i;w<i+n;w++){
		if (!m_words->isAlnum(w)){
			// make sure not to overflow destination buffer; handled
			// the same way as a word that does not fit
			if ( s >= send ) {
				*phrLen=0;
				*buf='\0';
				return;
			}
			// punct words are collapsed into a single space
			*s++ = ' ';
			continue;
		}
		const char *w1   = m_words->getWord(w);
		const char *wend = w1 + m_words->getWordLen(w);
		// NOTE(review): j is advanced by the number of bytes written and
		// then once more by the loop header. presumably to_lower_utf8()
		// converts the whole range [w1+j,wend) in one call so that the
		// body runs once per word - confirm against its definition.
		for ( int32_t j = 0 ; j < m_words->getWordLen(w) && s<send ; j++){
			// make sure not to overflow destination buffer
			if( s + m_words->getWordLen(w) >= send ) {
				*phrLen=0;
				*buf='\0';
				return;
			}

			// write the lower case form of w1+j.. into "s"
			int32_t size = to_lower_utf8 ( s , send , w1 + j , wend );
			// advance
			j += size;
			s += size;
		}
	}
	// null terminate
	*s = '\0';

	// set length we wrote into "buf"
	*phrLen = s - buf;
}

// . look up the phrase that starts with word #i
// . returns the number of words it spans and stores its id in "*pid"
// . returns 0 and stores 0 in "*pid" if the word count for #i is 0
int32_t Phrases::getMinWordsInPhrase ( int32_t i , int64_t *pid ) const {
	// assume there is no phrase until shown otherwise
	*pid = 0LL;

	// a zero word count marks an empty slot
	if ( ! m_numWordsTotal2[i] ) return 0;

	*pid = m_phraseIds2[i];
	return m_numWordsTotal2[i];
}
#include cleanup in Phrases.* 2016-08-05 14:31:18 +02:00			`//#include "gb-include.h"`
Initial file population. 2013-08-02 13:12:24 -07:00
			`#include "Phrases.h"`
#include cleanup in Phrases.* 2016-08-05 14:31:18 +02:00			`#include "Words.h"`
			`#include "Bits.h"`
Initial file population. 2013-08-02 13:12:24 -07:00			`#include "Mem.h"`
added trace log option to Phrases 2016-11-03 12:19:59 +01:00			`#include "Conf.h"`
#include cleanup in Phrases.* 2016-08-05 14:31:18 +02:00			`#include "Sanity.h"`
First batch of changes streamlining emergency shutdown code 2016-06-20 12:30:26 +02:00
Initial file population. 2013-08-02 13:12:24 -07:00
init class members in Phrases, also when resetting obj 2016-09-26 17:33:45 +02:00			`Phrases::Phrases() : m_buf(NULL) {`
Initial file population. 2013-08-02 13:12:24 -07:00
member init in Phrases 2016-10-21 11:49:06 +02:00			`memset(m_localBuf, 0, sizeof(m_localBuf));`
init class members in Phrases, also when resetting obj 2016-09-26 17:33:45 +02:00
			`// Coverity`
			`m_bufSize = 0;`
			`m_phraseIds2 = NULL;`
			`m_numWordsTotal2 = NULL;`
			`m_numPhrases = 0;`
			`m_words = NULL;`
			`m_wids = NULL;`
			`m_wptrs = NULL;`
			`m_wlens = NULL;`
			`m_bits = NULL;`
member init in Phrases 2016-10-21 11:49:06 +02:00
			`reset();`
			`}`

			`Phrases::~Phrases ( ) {`
			`reset();`
			`}`

			`void Phrases::reset() {`
			`if ( m_buf && m_buf != m_localBuf ) {`
			`mfree ( m_buf , m_bufSize , "Phrases" );`
			`}`
			`m_buf = NULL;`
Initial file population. 2013-08-02 13:12:24 -07:00			`}`

init class members in Phrases, also when resetting obj 2016-09-26 17:33:45 +02:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// initialize this token array with the string, "s" of length, "len".`
Remove niceness from Phrases 2016-09-23 02:40:50 +02:00			`bool Phrases::set( const Words words, const Bits bits ) {`
Initial file population. 2013-08-02 13:12:24 -07:00			`// reset in case being re-used`
			`reset();`
Remove commented out codes 2016-02-18 17:07:23 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// ensure we have words`
			`if ( ! words ) return true;`
Remove commented out codes 2016-02-18 17:07:23 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// . we have one phrase per word`
			`// . a phrase #n is "empty" if spam[n] == PSKIP`
			`m_numPhrases = words->getNumWords();`

			`// how much mem do we need?`
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00			`int32_t need = m_numPhrases * (8+1);`
Initial file population. 2013-08-02 13:12:24 -07:00
			`// alloc if we need to`
#include cleanup in Phrases.* 2016-08-05 14:31:18 +02:00			`if ( (unsigned)need > sizeof(m_localBuf) )`
Initial file population. 2013-08-02 13:12:24 -07:00			`m_buf = (char *)mmalloc ( need , "Phrases" );`
			`else`
			`m_buf = m_localBuf;`

Remove unused version variable from Phrases::set 2016-02-25 16:59:06 +01:00			`if ( ! m_buf ) {`
Log function will now return void instead of a boolean 2016-08-01 15:29:03 +02:00			`log(LOG_WARN, "query: Phrases::set: %s",mstrerror(g_errno));`
			`return false;`
Remove unused version variable from Phrases::set 2016-02-25 16:59:06 +01:00			`}`

Initial file population. 2013-08-02 13:12:24 -07:00			`m_bufSize = need;`
Remove unused version variable from Phrases::set 2016-02-25 16:59:06 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// set up arrays`
			`char *p = m_buf;`
Remove commented out codes 2016-02-18 17:07:23 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// phrase not using stop words`
Remove unused version variable from Phrases::set 2016-02-25 16:59:06 +01:00			`m_phraseIds2 = (int64_t *)p;`
			`p += m_numPhrases * 8;`

			`m_numWordsTotal2 = (unsigned char *)p;`
			`p += m_numPhrases * 1;`
Initial file population. 2013-08-02 13:12:24 -07:00
			`// sanity`
#include cleanup in Phrases.* 2016-08-05 14:31:18 +02:00			`if ( p != m_buf + need ) gbshutdownLogicError();`
Initial file population. 2013-08-02 13:12:24 -07:00
			`// point to this info while we parse`
			`m_words = words;`
removed duplicate function getWords from Words 2016-10-19 19:53:00 +02:00			`m_wptrs = words->getWordPtrs();`
Initial file population. 2013-08-02 13:12:24 -07:00			`m_wlens = words->getWordLens();`
			`m_wids = words->getWordIds();`
			`m_bits = bits;`
Remove commented out codes 2016-02-18 17:07:23 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// . set the phrases`
			`// . sets m_phraseIds [i]`
			`// . sets m_phraseSpam[i] to PSKIP if NO phrase exists`
Remove unused version variable from Phrases::set 2016-02-25 16:59:06 +01:00			`for ( int32_t i = 0 ; i < words->getNumWords() ; ++i ) {`
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00			`if ( ! m_wids[i] ) {`
			`continue;`
			`}`

Move QUICKPOLL to outer loop 2016-02-25 17:12:23 +01:00			`setPhrase ( i );`
Initial file population. 2013-08-02 13:12:24 -07:00			`}`
Remove unused version variable from Phrases::set 2016-02-25 16:59:06 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// success`
			`return true;`
			`}`

			`// . add the phrase that starts with the ith word`
			`// . "read Of Mice and Men" should make 3 phrases:`
			`// . read.ofmice`
			`// . ofmice`
			`// . mice.andmen`
Move QUICKPOLL to outer loop 2016-02-25 17:12:23 +01:00			`void Phrases::setPhrase ( int32_t i ) {`
added trace log option to Phrases 2016-11-03 12:19:59 +01:00			`logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 " BEGIN", i);`

Initial file population. 2013-08-02 13:12:24 -07:00			`// hash of the phrase`
replace long long with int64_t 2014-10-30 13:36:39 -06:00			`int64_t h = 0LL;`
Remove commented out codes 2016-02-18 17:07:23 +01:00
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00			`// the hash of the two-word phrase`
replace long long with int64_t 2014-10-30 13:36:39 -06:00			`int64_t h2 = 0LL;`
Remove commented out codes 2016-02-18 17:07:23 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// reset`
			`unsigned char pos = 0;`
Remove commented out codes 2016-02-18 17:07:23 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// now look for other tokens that should follow the ith token`
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00			`int32_t nw = m_words->getNumWords();`
			`int32_t numWordsInPhrase = 1;`
Remove commented out codes 2016-02-18 17:07:23 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// we need to hash "1 / 8" differently from "1.8" from "1,000" etc.`
			`char isNum = is_digit(m_wptrs[i][0]);`
Remove commented out codes 2016-02-18 17:07:23 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// do not include punct/tag words in the m_numWordsTotal[j] count`
			`// of the total words in the phrase. these are just usesless tails.`
now it compiles with -m32 2014-11-10 14:45:11 -08:00			`int32_t lastWordj = -1;`
Remove commented out codes 2016-02-18 17:07:23 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// loop over following words`
potentially used uninit var j in Phrases::setPhrase when trace log was enabled 2016-11-04 12:39:47 +01:00			`bool hasHyphen;`
Initial file population. 2013-08-02 13:12:24 -07:00			`bool hasStopWord2 ;`

			`// . NOTE: a token can start a phrase but NOT be in it.`
			`// . like a large number for example.`
			`// . wordId is the lower ascii hash of the ith word`
			`// . NO... this is allowing the query operator PiiPe to start`
			`// a phrase but not be in it, then the phrase id ends up just`
			`// being the following word's id. causing the synonyms code to`
			`// give a synonym which it should not un Synonyms::set()`
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00			`if ( ! m_bits->canBeInPhrase(i) ) {`
Initial file population. 2013-08-02 13:12:24 -07:00			`// so indeed, skip it then`
			`goto nophrase;`
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00			`}`
Initial file population. 2013-08-02 13:12:24 -07:00
			`h = m_wids[i];`
Remove commented out codes 2016-02-18 17:07:23 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// set position`
			`pos = (unsigned char)m_wlens[i];`

			`hasHyphen = false;`
			`hasStopWord2 = m_bits->isStopWord(i);`

potentially used uninit var j in Phrases::setPhrase when trace log was enabled 2016-11-04 12:39:47 +01:00			`for( int32_t j = i + 1 ; j < nw ; j++ ) {`
added trace log option to Phrases 2016-11-03 12:19:59 +01:00			`logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". LOOP START", i, j, m_wids[i], m_wids[j] );`

			`// Do not allow more than 32 alnum/punct "words" in a phrase.`
			`// Tthis prevents phrases with 100,000 words from slowing`
			`// us down. would put us in a huge double-nested for loop`
			`// BR: But it will never happen? It breaks out of the loop`
			`// when the phrase contains 2 (real) words?`
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00			`if ( j > i + 32 ) {`
added trace log option to Phrases 2016-11-03 12:19:59 +01:00			`logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". j > i+32. no phrase", i, j, m_wids[i], m_wids[j] );`
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00			`goto nophrase;`
			`}`

Initial file population. 2013-08-02 13:12:24 -07:00			`// deal with punct words`
			`if ( ! m_wids[j] ) {`
			`// if we cannot pair across word j then break`
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00			`if ( !m_bits->canPairAcross( j ) ) {`
added trace log option to Phrases 2016-11-03 12:19:59 +01:00			`logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Pair cannot cross. Breaking.", i, j, m_wids[i], m_wids[j] );`
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00			`break;`
			`}`
Remove commented out codes 2016-02-18 17:07:23 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// does it have a hyphen?`
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00			`if ( j == i + 1 && m_words->hasChar( j, '-' ) ) {`
			`hasHyphen = true;`
added trace log option to Phrases 2016-11-03 12:19:59 +01:00			`logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64 ". j is hyphen, NOT adding to phrase", i, j, m_wids[i], m_wids[j] );`
			`}`
			`else {`
			`logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64 ". j is space, NOT adding to phrase", i, j, m_wids[i], m_wids[j] );`
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00			`}`
Initial file population. 2013-08-02 13:12:24 -07:00			`continue;`
			`}`

			`// record lastWordj to indicate that word #j was a true word`
			`lastWordj = j;`
Remove commented out codes 2016-02-18 17:07:23 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// if word #j can be in phrase then incorporate it's hash`
			`if ( m_bits->canBeInPhrase (j) ) {`
now it compiles with -m32 2014-11-10 14:45:11 -08:00			`int32_t conti = pos;`
Initial file population. 2013-08-02 13:12:24 -07:00
			`// hash the jth word into the hash`
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00			`h = hash64Lower_utf8_cont( m_wptrs[j], m_wlens[j], h, &conti );`

Initial file population. 2013-08-02 13:12:24 -07:00			`pos = conti;`
Remove commented out codes 2016-02-18 17:07:23 +01:00
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00			`++numWordsInPhrase;`
Initial file population. 2013-08-02 13:12:24 -07:00
added trace log option to Phrases 2016-11-03 12:19:59 +01:00			`logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". CAN be in phrase. Adding j's hash. numWordsInPhrase=%" PRId32 "", i, j, m_wids[i], m_wids[j], numWordsInPhrase);`


Initial file population. 2013-08-02 13:12:24 -07:00			`// N-word phrases?`
Remove commented out codes 2016-02-18 17:07:23 +01:00			`if ( numWordsInPhrase == 2 ) {`
Initial file population. 2013-08-02 13:12:24 -07:00			`h2 = h;`
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00			`m_numWordsTotal2[i] = j - i + 1;`
			`hasStopWord2 = m_bits->isStopWord(j);`

added trace log option to Phrases 2016-11-03 12:19:59 +01:00			`logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Words in phrase is 2. Breaking.", i, j, m_wids[i], m_wids[j] );`
Initial file population. 2013-08-02 13:12:24 -07:00			`break;`
			`}`
			`}`
added trace log option to Phrases 2016-11-03 12:26:58 +01:00			`else {`
			`logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". j cannot be in a phrase.", i, j, m_wids[i], m_wids[j] );`
			`}`

Remove commented out codes 2016-02-18 17:07:23 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// if we cannot pair across word j then break`
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00			`if ( ! m_bits->canPairAcross (j) ) {`
added trace log option to Phrases 2016-11-03 12:19:59 +01:00			`logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Cannot pair across. Breaking.", i, j, m_wids[i], m_wids[j] );`
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00			`break;`
Initial file population. 2013-08-02 13:12:24 -07:00			`}`
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// otherwise, get the next word`
added trace log option to Phrases 2016-11-03 12:19:59 +01:00			`logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Get next word", i, j, m_wids[i], m_wids[j] );`
Initial file population. 2013-08-02 13:12:24 -07:00			`}`
Remove commented out codes 2016-02-18 17:07:23 +01:00
added trace log option to Phrases 2016-11-03 12:19:59 +01:00			`// if we had no phrase then use 0 as id (need 2+ words to be a phrase)`
Initial file population. 2013-08-02 13:12:24 -07:00			`if ( numWordsInPhrase <= 1 ) {`
			`nophrase:`
			`m_phraseIds2[i] = 0LL;`
			`m_numWordsTotal2[i] = 0;`
potentially used uninit var j in Phrases::setPhrase when trace log was enabled 2016-11-04 12:39:47 +01:00			`logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", wids[i]=%20" PRIu64". END. Not a phrase. m_phraseIds2[i]=%" PRIu64 "", i, m_wids[i], m_phraseIds2[i]);`
Initial file population. 2013-08-02 13:12:24 -07:00			`return;`
			`}`
Remove commented out codes 2016-02-18 17:07:23 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// sanity check`
#include cleanup in Phrases.* 2016-08-05 14:31:18 +02:00			`if ( lastWordj == -1 ) gbshutdownLogicError();`
Remove commented out codes 2016-02-18 17:07:23 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// sanity check`
#include cleanup in Phrases.* 2016-08-05 14:31:18 +02:00			`if ( lastWordj - i + 1 > 255 ) gbshutdownLogicError();`
Remove commented out codes 2016-02-18 17:07:23 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// hyphen between numbers does not count (so 1-2 != 12)`
			`if ( isNum ) hasHyphen = false;`
Remove commented out codes 2016-02-18 17:07:23 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// . the two word phrase id`
			`// . "cd rom" -> cdrom`
			`// . "fly paper" -> flypaper`
			`// . "i-phone" -> iphone`
			`// . "e-mail" -> email`
			`if ( hasHyphen \|\| ! hasStopWord2 ) {`
			`m_phraseIds2[i] = h2;`
potentially used uninit var j in Phrases::setPhrase when trace log was enabled 2016-11-04 12:39:47 +01:00			`logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", wids[i]=%20" PRIu64". END. Has hyphen or no stopword. m_phraseIds2[i]=%" PRIu64 "", i, m_wids[i], m_phraseIds2[i] );`
Initial file population. 2013-08-02 13:12:24 -07:00			`}`
			`// . "st. and" !-> stand`
			`// . "the rapist" !-> therapist`
			`else {`
			`m_phraseIds2[i] = h2 ^ 0x768867;`
potentially used uninit var j in Phrases::setPhrase when trace log was enabled 2016-11-04 12:39:47 +01:00			`logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", wids[i]=%20" PRIu64". END. either no hyphen or a stopword. m_phraseIds2[i]=%" PRIu64 "", i, m_wids[i], m_phraseIds2[i]);`
Initial file population. 2013-08-02 13:12:24 -07:00			`}`
			`}`

added trace log option to Phrases 2016-11-03 12:19:59 +01:00

Initial file population. 2013-08-02 13:12:24 -07:00			`// . store phrase that starts with word #i into "printBuf"`
			`// . return bytes stored in "printBuf"`
Don't use static buffer in Phrases::getPhrase() (not thread-safe) 2016-05-27 16:02:21 +02:00			`void Phrases::getPhrase(int32_t i, char buf, size_t bufsize, int32_t phrLen) const {`
Initial file population. 2013-08-02 13:12:24 -07:00			`// return 0 if no phrase`
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00			`if ( m_phraseIds2[i] == 0LL ) {`
Don't use static buffer in Phrases::getPhrase() (not thread-safe) 2016-05-27 16:02:21 +02:00			`*buf='\0';`
			`return;`
Add timing log in XmlDoc for query. Optimize Phrases.cpp. Remove unused codes 2016-02-24 16:12:05 +01:00			`}`

Initial file population. 2013-08-02 13:12:24 -07:00			`// . how many words, including punct words, are in phrase?`
			`// . this should never be 1 or less`
Remove 'static' input variable 2016-03-01 11:42:30 +01:00			`int32_t n = m_numWordsTotal2[i] ;`
Initial file population. 2013-08-02 13:12:24 -07:00
			`char *s = buf;`
Don't use static buffer in Phrases::getPhrase() (not thread-safe) 2016-05-27 16:02:21 +02:00			`char *send = buf + bufsize - 1;`
now it compiles with -m32 2014-11-10 14:45:11 -08:00			`for (int32_t w = i;w<i+n;w++){`
Initial file population. 2013-08-02 13:12:24 -07:00			`if (!m_words->isAlnum(w)){`
			`// skip spaces for now since we has altogether now`
			`*s++ = ' ';`
			`continue;`
			`}`
More constness in Phrases.* 2016-05-27 16:10:31 +02:00			`const char *w1 = m_words->getWord(w);`
			`const char *wend = w1 + m_words->getWordLen(w);`
now it compiles with -m32 2014-11-10 14:45:11 -08:00			`for ( int32_t j = 0 ; j < m_words->getWordLen(w) && s<send ; j++){`
avoid buffer overrun in getPhrase 2017-10-01 18:04:56 +02:00			`// make sure not to overflow destination buffer`
			`if( s + m_words->getWordLen(w) >= send ) {`
			`*phrLen=0;`
			`*buf='\0';`
			`return;`
			`}`

Initial file population. 2013-08-02 13:12:24 -07:00			`// write the lower case char from w1+j into "s"`
now it compiles with -m32 2014-11-10 14:45:11 -08:00			`int32_t size = to_lower_utf8 ( s , send , w1 + j , wend );`
Initial file population. 2013-08-02 13:12:24 -07:00			`// advance`
			`j += size;`
			`s += size;`
			`}`
			`}`
			`// null terminate`
			`*s = '\0';`
Remove 'static' input variable 2016-03-01 11:42:30 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// set length we wrote into "buf"`
			`*phrLen = s - buf;`
			`}`
Fix bug where bigrams was not searched for 2016-03-07 17:06:21 +01:00
More constness in Phrases.* 2016-05-27 16:10:31 +02:00			`int32_t Phrases::getMinWordsInPhrase ( int32_t i , int64_t *pid ) const {`
Fix bug where bigrams was not searched for 2016-03-07 17:06:21 +01:00			`*pid = 0LL;`

			`if ( m_numWordsTotal2[i] ) {`
			`*pid = m_phraseIds2[i];`
			`return m_numWordsTotal2[i];`
			`}`

			`return 0;`
			`}`