privacore-open-source-searc.../Synonyms.cpp
Ivan Skytte Jørgensen 8853a156ac bugfix/workaround for bigram hashes
If a bigram contained one of the 109 English stopwords then the hash was XORed with 0x768867 in an attempt to distinguish bigrams that were split compound words from bigrams that couldn't form a compound word, e.g.:
  Not a compound word: "the rapist" versus "therapist"
  Possibly a compound word: "light footed" vs. "lightfooted"
However, the code was English-specific and could hurt results for other languages. And you need a POS-tagger to do it correctly.
2018-08-02 13:14:44 +02:00


#include "Synonyms.h"
#include "tokenizer.h"
#include "Bits.h"
#include "Phrases.h"
#include "Wiktionary.h"
#include "Lang.h"
#include "GbUtil.h"
#include "Sanity.h"
#include "gbmemcpy.h"
#ifdef _VALGRIND_
#include <valgrind/memcheck.h>
#endif
Synonyms::Synonyms() {
	m_synWordBuf.setLabel("syswbuf");
	// Coverity
	m_aids = NULL;
	m_wids0 = NULL;
	m_wids1 = NULL;
	m_termPtrs = NULL;
	m_termOffs = NULL;
	m_termLens = NULL;
	m_numAlnumWords = NULL;
	m_numAlnumWordsInBase = NULL;
	m_src = NULL;
	m_langIds = NULL;
	m_aidsPtr = NULL;
	m_wids0Ptr = NULL;
	m_wids1Ptr = NULL;
	m_termPtrsPtr = NULL;
	m_termOffsPtr = NULL;
	m_termLensPtr = NULL;
	m_numAlnumWordsPtr = NULL;
	m_numAlnumWordsInBasePtr = NULL;
	m_srcPtr = NULL;
	m_langIdsPtr = NULL;
}
Synonyms::~Synonyms() {
	reset();
}
void Synonyms::reset() {
	m_synWordBuf.purge();
}
// . so now this adds a list of Synonyms to the m_pools[] and returns a ptr
// to the first one.
// . then the parent caller can store that ptr in the m_wordToSyn[] array
// which we pre-alloc upon calling the set() function based on the # of
// words we got
// . returns # of synonyms stored into "tmpBuf"
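// a minimal caller sketch (hypothetical; the stack buffer and loop are
// assumptions, not code from this repo):
//   char tmpBuf[TMPSYNBUFSIZE];
//   Synonyms syns;
//   int32_t n = syns.getSynonyms(&tr, wordNum, langId, tmpBuf);
//   for ( int32_t i = 0 ; i < n ; i++ ) { /* read m_aids[i], m_termLens[i], ... */ }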
int32_t Synonyms::getSynonyms(const TokenizerResult *tr,
                              unsigned wordNum,
                              uint8_t langId,
                              char *tmpBuf) {
	if ( wordNum >= tr->size() ) gbshutdownLogicError();
	const auto &token = (*tr)[wordNum];
	// punct words have no synonyms
	if ( !token.is_alfanum )
		return 0;
	// init the dedup table to dedup wordIds
	HashTableX dt;
	char dbuf[512];
	dt.set(8,0,12,dbuf,512,false,"altwrds");
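	// 8-byte keys with no data payload, backed by the 512-byte stack
	// buffer (parameter meaning assumed from other HashTableX::set()
	// call sites, not verified against the header)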
	int32_t maxSyns = (int32_t)MAX_SYNS;
	char *bufPtr = tmpBuf;
	// point into buffer
	m_aids = (int64_t *)bufPtr;
	bufPtr += maxSyns * 8;
	// then the word ids
	m_wids0 = (int64_t *)bufPtr;
	bufPtr += maxSyns * 8;
	// second word ids, for multi alnum word synonyms, i.e. "New Jersey"
	m_wids1 = (int64_t *)bufPtr;
	bufPtr += maxSyns * 8;
	m_termPtrs = (char **)bufPtr;
	bufPtr += maxSyns * sizeof(char *);
	// we can't use m_termPtrs when we store a transformed word as the
	// synonym into m_synWordBuf, because it can grow dynamically,
	// so we have to use offsets into that. so when m_termPtrs is
	// NULL for a syn, use m_termOffs to get it
	m_termOffs = (int32_t *)bufPtr;
	bufPtr += maxSyns * 4;
	m_termLens = (int32_t *)bufPtr;
	bufPtr += maxSyns * 4;
	m_numAlnumWords = (int32_t *)bufPtr;
	bufPtr += maxSyns * 4;
	m_numAlnumWordsInBase = (int32_t *)bufPtr;
	bufPtr += maxSyns * 4;
	// source
	m_src = bufPtr;
	bufPtr += maxSyns;
	// langid, one byte per synonym slot
	m_langIds = (uint8_t *)bufPtr;
	bufPtr += maxSyns;
	if ( bufPtr > tmpBuf + TMPSYNBUFSIZE ) gbshutdownLogicError();
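	// per-synonym budget implied by the carving above: 3*8 (ids) +
	// sizeof(char*) + 4*4 (offs/lens/counts) + 2*1 (src/langid)
	// = 50 bytes on a 64-bit build, so TMPSYNBUFSIZE must be at least
	// MAX_SYNS * 50 (derived here; enforced only by the check above)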
	// cursors
	m_aidsPtr = m_aids;
	m_wids0Ptr = m_wids0;
	m_wids1Ptr = m_wids1;
	m_srcPtr = m_src;
	m_termPtrsPtr = (const char**)m_termPtrs;
	m_termOffsPtr = m_termOffs;
	m_termLensPtr = m_termLens;
	m_numAlnumWordsPtr = m_numAlnumWords;
	m_numAlnumWordsInBasePtr = m_numAlnumWordsInBase;
	m_langIdsPtr = m_langIds;
	const char *w = token.token_start;
	int32_t wlen = token.token_len;
	//
	// NOW hit wiktionary
	// Trust this less than our s_exceptions above, but more than
	// our morph computations below
	//
	char sourceId = SOURCE_WIKTIONARY;
	const char *ss = NULL;
	const char *savedss = NULL;
	int64_t bwid;
	char wikiLangId = langId;
	bool hadSpace;
	int32_t klen;
	int32_t baseNumAlnumWords;
	char origLangId = wikiLangId;
	int32_t synSetCount = 0;
	bool doLangLoop = false;
tryOtherLang:
	/*
	// if word only exists in one language, assume that language for word
	// even if m_queryLangId is langUnknown (0)
	if ( ! ss &&
	     ! m_queryLangId &&
	     ! wikiLangId ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		// each lang has its own bit
		int64_t bits = g_speller.getLangBits64 ( &bwid );
		// skip if not unique
		char count = getNumBitsOn64 ( bits );
		// if we only got one lang we could be, assume that
		if ( count == 1 )
			// get it. bit #0 is english, so add 1
			wikiLangId = getBitPosLL((uint8_t *)&bits) + 1;
		// try setting based on script. greek. russian. etc.
		// if the word was not in the wiktionary.
		// this will be langUnknown if not definitive.
		else
			wikiLangId = getCharacterLanguage(w);
	}
	*/
	// try looking up bigram so "new jersey" gets "nj" as synonym
	if ( wikiLangId &&
	     wordNum+2 < tr->size() &&
	     (*tr)[wordNum+2].is_alfanum ) {
		// get phrase id bigram then
		int32_t conti = 0;
		bwid = hash64Lower_utf8_cont(w,wlen,0,&conti);
		// then the next word
		const char *wp2 = (*tr)[wordNum+2].token_start;
		int32_t wlen2 = (*tr)[wordNum+2].token_len;
		bwid = hash64Lower_utf8_cont(wp2,wlen2,bwid,&conti);
		baseNumAlnumWords = 2;
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
	}
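	// note: the continuation hash chains both tokens into one id, so
	// "new jersey" hashes the same as the single token "newjersey";
	// that appears to be the key the wiktionary synsets are stored
	// under (inferred from the hash64Lower_utf8_cont usage above)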
	// need a language for wiktionary to work with
	if ( wikiLangId && ! ss ) {
		// get raw word id
		bwid = token.token_hash;
		baseNumAlnumWords = 1;
		//if ( bwid == 1424622907102375150LL)
		//	log("a");
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
		// if that failed, try removing the 's from the word if present
		if ( ! ss &&
		     wlen >= 3 &&
		     w[wlen-2] == '\'' &&
		     w[wlen-1] == 's' ) {
			int64_t cwid = hash64Lower_utf8(w,wlen-2);
			ss = g_wiktionary.getSynSet( cwid, wikiLangId );
		}
	}
	// loop over all the other langids if no synset found in this langid
	if ( ! ss && ! doLangLoop ) {
		wikiLangId = langUnknown; // start at 0
		doLangLoop = true;
	}
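	// how the loop below plays out (as written): langids are probed in
	// increasing order starting after langUnknown, skipping the one
	// already tried. the first synset found is remembered in savedss and
	// used if the scan finishes empty-handed; a second match aborts the
	// scan, and if the original langid was known the result is dropped
	// to avoid cross-language ambiguity.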
	// loop through all languages if no luck
	if ( doLangLoop ) {
		// save it. english is #1 so prefer that in case of
		// multiple matches i guess...
		if ( ss && ! savedss ) savedss = ss;
		// can only have one match to avoid ambiguity when doing
		// a loop over all the langids
		if ( ss && ++synSetCount >= 2 ) {
			// no, don't do this, just keep the first one.
			// like 'sport' is in english and french, so keep
			// the english one i guess. so do not NULL out "ss".
			// only NULL it out when the orig langid was known
			if ( origLangId != langUnknown ) ss = NULL;
			goto skip;
		}
		// advance langid of synset attempt
		wikiLangId++;
		// advance over original we tried first
		if ( wikiLangId == origLangId )
			wikiLangId++;
		// all done?
		if ( wikiLangId < langLast ) { // the last langid
			ss = NULL;
			goto tryOtherLang;
		}
	}
	// use the one single synset we found for some language
	if ( ! ss ) ss = savedss;
skip:
	// even though a document may be in german it often has some
	// english words "pdf download" "copyright" etc. so if the word
	// has no synset in german, try it in english
	/*
	if ( //numPresets == 0 &&
	     ! ss &&
	     m_queryLangId != langEnglish &&
	     wikiLangId != langEnglish &&
	     m_queryLangId &&
	     g_speller.getSynsInEnglish(w,wlen,m_queryLangId,langEnglish) ) {
		// try english
		wikiLangId = langEnglish;
		sourceId = SOURCE_WIKTIONARY_EN;
		goto tryOtherLang;
	}
	*/
	// if it was in wiktionary, just use that synset
	if ( ss ) {
		// prepare the dedup table (only set up if more synsets follow)
		HashTableX dedup;
		HashTableX *dd = NULL;
		char dbuf[512];
		int32_t count = 0;
addSynSet:
		// do we have another set following this
		const char *next = g_wiktionary.getNextSynSet(bwid,langId,ss);
		// if so, init the dedup table then
		if ( next && ! dd ) {
			dd = &dedup;
			dd->set ( 8,0,8,dbuf,512,false,"sddbuf");
		}
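		// synset record format, as implied by the parsing below:
		//   "<langAbbr>|syn1,syn2,...\n"  e.g. "en|new jersey,nj\n"
		// (the example record is illustrative, not from the data file)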
		// get lang, 2 chars, unless zh_ch
		const char *synLangAbbr = ss;
		// skip over the pipe i guess
		const char *pipe = ss + 2;
		// zh_ch?
		if ( *pipe == '_' ) pipe += 3;
		// sanity
		if ( *pipe != '|' ) gbshutdownAbort(true);
		// is it "en" or "zh_ch" etc.
		int synLangAbbrLen = pipe - ss;
		// point to word list
		const char *p = pipe + 1;
		// hash up the list of words, they are in utf8
		const char *e = p + 1;
		char tmp[32];
		int langId;
		// save count in case we need to undo
		//int32_t saved = m_numAlts[wordNum];
hashLoop:
		// skip synonyms that are anagrams because it's too ambiguous.
		// there are mappings like
		//   "PC" -> "PC,Personal Computer"
		//   "PC" -> "PC,Probable Cause" ... (lots more!)
		//bool isAnagram = true;
		for ( ; *e != '\n' && *e != ',' ; e++ ) ;
		//	if ( ! is_upper_a(*e) ) isAnagram = false;
		// get it
		int64_t h = hash64Lower_utf8_nospaces ( p , e - p );
		// skip if same as base word
		if ( h == bwid ) goto getNextSyn;
		// should we check for dups?
		if ( dd ) {
			// skip dups
			if ( dd->isInTable(&h) ) goto getNextSyn;
			// dedup. return false with g_errno set on error
			if ( ! dd->addKey(&h) ) return m_aidsPtr - m_aids;
		}
		// store it
		*m_aidsPtr++ = h;
		// store source
		*m_srcPtr++ = sourceId;
		// store the langid of the synonym for the query term
		// (one langid byte per synonym, not a bit vector)
		if ( synLangAbbrLen > 30 ) gbshutdownAbort(true);
		gbmemcpy ( tmp , synLangAbbr , synLangAbbrLen );
		tmp[synLangAbbrLen] = '\0';
		langId = getLangIdFromAbbr ( tmp ); // order is linear
		if ( langId < 0 ) langId = 0;
		*m_langIdsPtr = langId;
		hadSpace = false;
		klen = e - p;
		for ( int32_t k = 0 ; k < klen ; k++ )
			if ( is_wspace_a(p[k]) ) hadSpace = true;
		*m_termPtrsPtr++ = p;
		*m_termLensPtr++ = e - p;
		// increment the dummies to keep in sync with synonym index.
		// this is only for when m_termPtrs[x] is NULL because
		// we store the term into m_synWordBuf() because it is not
		// in our wiktionary file in memory.
		*m_termOffsPtr++ = -1;
		// only for multi-word synonyms like "New Jersey"...
		*m_wids0Ptr = 0LL;
		*m_wids1Ptr = 0LL;
		*m_numAlnumWordsPtr = 1;
		// and for multi alnum word synonyms
		if ( hadSpace ) {
			TokenizerResult tmptr;
			plain_tokenizer_phase_1(p,e-p,&tmptr);
			calculate_tokens_hashes(&tmptr);
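			// tokens come back as word / separator / word for a
			// two-word synonym, so indices 0 and 2 are the two
			// alnum words (assumes phase-1 tokenization keeps the
			// separator as its own token, as the indexing implies)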
			*(int64_t *)m_wids0Ptr = tmptr[0].token_hash;
			*(int64_t *)m_wids1Ptr = tmptr[2].token_hash;
			int alfanumCount = 0;
			for ( const auto &t : tmptr.tokens )
				if ( t.is_alfanum )
					alfanumCount++;
			*(int32_t *)m_numAlnumWordsPtr = alfanumCount;
		}
		m_wids0Ptr++;
		m_wids1Ptr++;
		m_langIdsPtr++;
		m_numAlnumWordsPtr++;
		// how many words did we have to hash to find a synset?
		// i.e. "new jersey" would be 2, to get "nj"
		*m_numAlnumWordsInBasePtr++ = baseNumAlnumWords;
		// do not breach
		if ( ++count >= maxSyns ) return m_aidsPtr - m_aids;
getNextSyn:
		// loop for more
		if ( *e == ',' ) { e++; p = e; goto hashLoop; }
		// add in the next syn set, deduped
		if ( next ) { ss = next; goto addSynSet; }
		// wrap it up
		//done:
		// all done
		//return m_aidsPtr - m_aids;
	}
	// do not breach
	if ( m_aidsPtr - m_aids > maxSyns ) return m_aidsPtr - m_aids;
	// returns false with g_errno set
	if ( ! addAmpPhrase(tr, wordNum, &dt) ) return m_aidsPtr - m_aids;
	// do not breach
	if ( m_aidsPtr - m_aids > maxSyns ) return m_aidsPtr - m_aids;
	// if we end in 's, strip it and add
	if ( wlen >= 3 &&
	     w[wlen-1] == 's' &&
	     w[wlen-2] == '\'' &&
	     ! addWithoutApostrophe(token.token_start,token.token_len, &dt) )
		return m_aidsPtr - m_aids;
	return m_aidsPtr - m_aids;
}
bool Synonyms::addWithoutApostrophe(const char *w, int32_t wlen, HashTableX *dt) {
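	// strips the trailing "'s" (the caller has already verified it is
	// there), e.g. "john's" -> "john"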
	wlen -= 2;
	uint64_t h = hash64Lower_utf8 ( w, wlen );
	// do not add dups
	if ( dt->isInTable ( &h ) ) return true;
	// add to dedup table. return false with g_errno set on error
	if ( ! dt->addKey ( &h ) ) return false;
	// store that
	*m_aidsPtr++ = h;
	*m_wids0Ptr++ = 0LL;
	*m_wids1Ptr++ = 0LL;
	*m_termPtrsPtr++ = NULL;
	*m_termLensPtr++ = wlen;
	*m_termOffsPtr++ = m_synWordBuf.length();
	m_synWordBuf.safeMemcpy(w,wlen);
	m_synWordBuf.pushChar('\0');
	*m_numAlnumWordsPtr++ = 1;
	*m_numAlnumWordsInBasePtr++ = 1;
	*m_srcPtr++ = SOURCE_GENERATED;
	// no langs
	*m_langIdsPtr++ = 0;
	return true;
}
// just index the first bigram for now to give a little bonus
bool Synonyms::addAmpPhrase(const TokenizerResult *tr, unsigned wordNum, class HashTableX* dt)
{
	// . "D & B" --> dandb
	// . make the "andb" a suffix
	if ( wordNum+2 >= tr->size() ) return true;
	const auto &t0 = (*tr)[wordNum];
	const auto &t1 = (*tr)[wordNum+1];
	const auto &t2 = (*tr)[wordNum+2];
	if ( !has_char(t1.token_start,t1.token_end(),'&') )
		return true;
	if ( !t2.is_alfanum )
		return true;
	if ( t2.token_len > 50 )
		return true;
	// need this for hash continuation procedure
	int32_t conti = 0;
	// hack for "d & b" -> "dandb"
	uint64_t h = hash64Lower_utf8_cont(t0.token_start,t0.token_len,0LL,&conti);
	// just make it a bigram with the word "and" after it
	// . we usually ignore stop words like and when someone does the query
	//   but we give out bonus points if the query term's left or right
	//   bigram has that stop word where it should be.
	// . so Dave & Barry will index "daveand" as a bigram and the
	//   search for 'Dave and Barry' will give bonus points for that
	//   bigram.
	h = hash64Lower_utf8_cont ( "and", 3, h, &conti );
	// logic in Phrases.cpp will xor it with 0x768867
	// because it contains a stop word. this prevents "st.
	// and" from matching "stand".
	h ^= 0x768867; //keep in sync with Phrases
	// do not add dups
	if ( dt->isInTable ( &h ) ) return true;
	// add to dedup table. return false with g_errno set on error
	if ( ! dt->addKey ( &h ) ) return false;
	// store that
	*m_aidsPtr++ = h;
	*m_wids0Ptr++ = 0LL;
	*m_wids1Ptr++ = 0LL;
	*m_termPtrsPtr++ = NULL;
	*m_termOffsPtr++ = m_synWordBuf.length();
	*m_termLensPtr++ = t0.token_len;
	m_synWordBuf.safeMemcpy(t0.token_start,t0.token_len);
	m_synWordBuf.safeStrcpy(" and");
	m_synWordBuf.pushChar('\0');
	*m_numAlnumWordsPtr++ = 1;
	*m_numAlnumWordsInBasePtr++ = 1;
	*m_srcPtr++ = SOURCE_GENERATED;
	// no langs
	*m_langIdsPtr++ = 0;
	return true;
}
const char *getSourceString ( char source ) {
	if ( source == SOURCE_NONE          ) return "none";
	if ( source == SOURCE_PRESET        ) return "preset";
	if ( source == SOURCE_WIKTIONARY    ) return "wiktionary";
	if ( source == SOURCE_GENERATED     ) return "generated";
	if ( source == SOURCE_BIGRAM        ) return "bigram";
	if ( source == SOURCE_TRIGRAM       ) return "trigram";
	if ( source == SOURCE_WIKTIONARY_EN ) return "wiktionary-en";
	// the thing we are hashing is a "number"
	if ( source == SOURCE_NUMBER        ) return "number";
	return "unknown";
}