privacore-open-source-searc.../Synonyms.cpp
Ivan Skytte Jørgensen 8853a156ac bugfix/workaround for bigram hashes
If a bigram contained one of the 109 English stopwords then the hash was XORed with 0x768867 in an attempt to distinguish bigrams that were split compound words from bigrams that couldn't form a compound word, e.g.:
  Not a compound word: "the rapist" versus "therapist"
  Possibly a compound word: "light footed" vs. "lightfooted"
However, the code was English-specific and could hurt results for other languages. And you need a POS-tagger to do it correctly.
2018-08-02 13:14:44 +02:00


#include "Synonyms.h"
#include "tokenizer.h"
#include "Bits.h"
#include "Phrases.h"
#include "Wiktionary.h"
#include "Lang.h"
#include "GbUtil.h"
#include "Sanity.h"
#include "gbmemcpy.h"
#ifdef _VALGRIND_
#include <valgrind/memcheck.h>
#endif
Synonyms::Synonyms() {
	m_synWordBuf.setLabel("syswbuf");
	// Coverity
	m_aids = NULL;
	m_wids0 = NULL;
	m_wids1 = NULL;
	m_termPtrs = NULL;
	m_termOffs = NULL;
	m_termLens = NULL;
	m_numAlnumWords = NULL;
	m_numAlnumWordsInBase = NULL;
	m_src = NULL;
	m_langIds = NULL;
	m_aidsPtr = NULL;
	m_wids0Ptr = NULL;
	m_wids1Ptr = NULL;
	m_termPtrsPtr = NULL;
	m_termOffsPtr = NULL;
	m_termLensPtr = NULL;
	m_numAlnumWordsPtr = NULL;
	m_numAlnumWordsInBasePtr = NULL;
	m_srcPtr = NULL;
	m_langIdsPtr = NULL;
}
Synonyms::~Synonyms() {
	reset();
}
void Synonyms::reset() {
	m_synWordBuf.purge();
}
// . so now this adds a list of Synonyms to the m_pools[] and returns a ptr
// to the first one.
// . then the parent caller can store that ptr in the m_wordToSyn[] array
// which we pre-alloc upon calling the set() function based on the # of
// words we got
// . returns # of synonyms stored into "tmpBuf"
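// a minimal caller sketch (hypothetical; the stack buffer and loop are
// assumptions, not code from this repo):
//   char tmpBuf[TMPSYNBUFSIZE];
//   Synonyms syns;
//   int32_t n = syns.getSynonyms(&tr, wordNum, langId, tmpBuf);
//   for ( int32_t i = 0 ; i < n ; i++ ) { /* read m_aids[i], m_termLens[i], ... */ }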
int32_t Synonyms::getSynonyms(const TokenizerResult *tr,
                              unsigned wordNum,
                              uint8_t langId,
                              char *tmpBuf) {
	if ( wordNum >= tr->size() ) gbshutdownLogicError();
	const auto &token = (*tr)[wordNum];
	// punct words have no synonyms
	if ( !token.is_alfanum )
		return 0;
	// init the dedup table to dedup wordIds
	HashTableX dt;
	char dbuf[512];
	dt.set(8,0,12,dbuf,512,false,"altwrds");
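	// 8-byte keys with no data payload, backed by the 512-byte stack
	// buffer (parameter meaning assumed from other HashTableX::set()
	// call sites, not verified against the header)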
	int32_t maxSyns = (int32_t)MAX_SYNS;
	char *bufPtr = tmpBuf;
	// point into buffer
	m_aids = (int64_t *)bufPtr;
	bufPtr += maxSyns * 8;
	// then the word ids
	m_wids0 = (int64_t *)bufPtr;
	bufPtr += maxSyns * 8;
	// second word ids, for multi alnum word synonyms, i.e. "New Jersey"
	m_wids1 = (int64_t *)bufPtr;
	bufPtr += maxSyns * 8;
	m_termPtrs = (char **)bufPtr;
	bufPtr += maxSyns * sizeof(char *);
	// we can't use m_termPtrs when we store a transformed word as the
	// synonym into m_synWordBuf, because it can grow dynamically,
	// so we have to use offsets into that. so when m_termPtrs is
	// NULL for a syn, use m_termOffs to get it
	m_termOffs = (int32_t *)bufPtr;
	bufPtr += maxSyns * 4;
	m_termLens = (int32_t *)bufPtr;
	bufPtr += maxSyns * 4;
	m_numAlnumWords = (int32_t *)bufPtr;
	bufPtr += maxSyns * 4;
	m_numAlnumWordsInBase = (int32_t *)bufPtr;
	bufPtr += maxSyns * 4;
	// source
	m_src = bufPtr;
	bufPtr += maxSyns;
	// langid, one byte per synonym slot
	m_langIds = (uint8_t *)bufPtr;
	bufPtr += maxSyns;
	if ( bufPtr > tmpBuf + TMPSYNBUFSIZE ) gbshutdownLogicError();
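	// per-synonym budget implied by the carving above: 3*8 (ids) +
	// sizeof(char*) + 4*4 (offs/lens/counts) + 2*1 (src/langid)
	// = 50 bytes on a 64-bit build, so TMPSYNBUFSIZE must be at least
	// MAX_SYNS * 50 (derived here; enforced only by the check above)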
	// cursors
	m_aidsPtr = m_aids;
	m_wids0Ptr = m_wids0;
	m_wids1Ptr = m_wids1;
	m_srcPtr = m_src;
	m_termPtrsPtr = (const char**)m_termPtrs;
	m_termOffsPtr = m_termOffs;
	m_termLensPtr = m_termLens;
	m_numAlnumWordsPtr = m_numAlnumWords;
	m_numAlnumWordsInBasePtr = m_numAlnumWordsInBase;
	m_langIdsPtr = m_langIds;
	const char *w = token.token_start;
	int32_t wlen = token.token_len;
	//
	// NOW hit wiktionary
	// Trust this less than our s_exceptions above, but more than
	// our morph computations below
	//
	char sourceId = SOURCE_WIKTIONARY;
	const char *ss = NULL;
	const char *savedss = NULL;
	int64_t bwid;
	char wikiLangId = langId;
	bool hadSpace;
	int32_t klen;
	int32_t baseNumAlnumWords;
	char origLangId = wikiLangId;
	int32_t synSetCount = 0;
	bool doLangLoop = false;
tryOtherLang:
	/*
	// if word only exists in one language, assume that language for word
	// even if m_queryLangId is langUnknown (0)
	if ( ! ss &&
	     ! m_queryLangId &&
	     ! wikiLangId ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		// each lang has its own bit
		int64_t bits = g_speller.getLangBits64 ( &bwid );
		// skip if not unique
		char count = getNumBitsOn64 ( bits );
		// if we only got one lang we could be, assume that
		if ( count == 1 )
			// get it. bit #0 is english, so add 1
			wikiLangId = getBitPosLL((uint8_t *)&bits) + 1;
		// try setting based on script. greek. russian. etc.
		// if the word was not in the wiktionary.
		// this will be langUnknown if not definitive.
		else
			wikiLangId = getCharacterLanguage(w);
	}
	*/
	// try looking up bigram so "new jersey" gets "nj" as synonym
	if ( wikiLangId &&
	     wordNum+2 < tr->size() &&
	     (*tr)[wordNum+2].is_alfanum ) {
		// get phrase id bigram then
		int32_t conti = 0;
		bwid = hash64Lower_utf8_cont(w,wlen,0,&conti);
		// then the next word
		const char *wp2 = (*tr)[wordNum+2].token_start;
		int32_t wlen2 = (*tr)[wordNum+2].token_len;
		bwid = hash64Lower_utf8_cont(wp2,wlen2,bwid,&conti);
		baseNumAlnumWords = 2;
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
	}
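	// note: the continuation hash chains both tokens into one id, so
	// "new jersey" hashes the same as the single token "newjersey";
	// that appears to be the key the wiktionary synsets are stored
	// under (inferred from the hash64Lower_utf8_cont usage above)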
	// need a language for wiktionary to work with
	if ( wikiLangId && ! ss ) {
		// get raw word id
		bwid = token.token_hash;
		baseNumAlnumWords = 1;
		//if ( bwid == 1424622907102375150LL)
		//	log("a");
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
		// if that failed, try removing the 's from the word if present
		if ( ! ss &&
		     wlen >= 3 &&
		     w[wlen-2] == '\'' &&
		     w[wlen-1] == 's' ) {
			int64_t cwid = hash64Lower_utf8(w,wlen-2);
			ss = g_wiktionary.getSynSet( cwid, wikiLangId );
		}
	}
	// loop over all the other langids if no synset found in this langid
	if ( ! ss && ! doLangLoop ) {
		wikiLangId = langUnknown; // start at 0
		doLangLoop = true;
	}
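	// how the loop below plays out (as written): langids are probed in
	// increasing order starting after langUnknown, skipping the one
	// already tried. the first synset found is remembered in savedss and
	// used if the scan finishes empty-handed; a second match aborts the
	// scan, and if the original langid was known the result is dropped
	// to avoid cross-language ambiguity.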
	// loop through all languages if no luck
	if ( doLangLoop ) {
		// save it. english is #1 so prefer that in case of
		// multiple matches i guess...
		if ( ss && ! savedss ) savedss = ss;
		// can only have one match to avoid ambiguity when doing
		// a loop over all the langids
		if ( ss && ++synSetCount >= 2 ) {
			// no, don't do this, just keep the first one.
			// like 'sport' is in english and french, so keep
			// the english one i guess. so do not NULL out "ss".
			// only NULL it out when the orig langid was known
			if ( origLangId != langUnknown ) ss = NULL;
			goto skip;
		}
		// advance langid of synset attempt
		wikiLangId++;
		// advance over original we tried first
		if ( wikiLangId == origLangId )
			wikiLangId++;
		// all done?
		if ( wikiLangId < langLast ) { // the last langid
			ss = NULL;
			goto tryOtherLang;
		}
	}
	// use the one single synset we found for some language
	if ( ! ss ) ss = savedss;
skip:
	// even though a document may be in german it often has some
	// english words "pdf download" "copyright" etc. so if the word
	// has no synset in german, try it in english
	/*
	if ( //numPresets == 0 &&
	     ! ss &&
	     m_queryLangId != langEnglish &&
	     wikiLangId != langEnglish &&
	     m_queryLangId &&
	     g_speller.getSynsInEnglish(w,wlen,m_queryLangId,langEnglish) ) {
		// try english
		wikiLangId = langEnglish;
		sourceId = SOURCE_WIKTIONARY_EN;
		goto tryOtherLang;
	}
	*/
	// if it was in wiktionary, just use that synset
	if ( ss ) {
		// prepare the dedup table (only set up if more synsets follow)
		HashTableX dedup;
		HashTableX *dd = NULL;
		char dbuf[512];
		int32_t count = 0;
addSynSet:
		// do we have another set following this
		const char *next = g_wiktionary.getNextSynSet(bwid,langId,ss);
		// if so, init the dedup table then
		if ( next && ! dd ) {
			dd = &dedup;
			dd->set ( 8,0,8,dbuf,512,false,"sddbuf");
		}
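		// synset record format, as implied by the parsing below:
		//   "<langAbbr>|syn1,syn2,...\n"  e.g. "en|new jersey,nj\n"
		// (the example record is illustrative, not from the data file)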
		// get lang, 2 chars, unless zh_ch
		const char *synLangAbbr = ss;
		// skip over the pipe i guess
		const char *pipe = ss + 2;
		// zh_ch?
		if ( *pipe == '_' ) pipe += 3;
		// sanity
		if ( *pipe != '|' ) gbshutdownAbort(true);
		// is it "en" or "zh_ch" etc.
		int synLangAbbrLen = pipe - ss;
		// point to word list
		const char *p = pipe + 1;
		// hash up the list of words, they are in utf8
		const char *e = p + 1;
		char tmp[32];
		int langId;
		// save count in case we need to undo
		//int32_t saved = m_numAlts[wordNum];
hashLoop:
		// skip synonyms that are anagrams because it's too ambiguous.
		// there are mappings like
		//   "PC" -> "PC,Personal Computer"
		//   "PC" -> "PC,Probable Cause" ... (lots more!)
		//bool isAnagram = true;
		for ( ; *e != '\n' && *e != ',' ; e++ ) ;
		//	if ( ! is_upper_a(*e) ) isAnagram = false;
		// get it
		int64_t h = hash64Lower_utf8_nospaces ( p , e - p );
		// skip if same as base word
		if ( h == bwid ) goto getNextSyn;
		// should we check for dups?
		if ( dd ) {
			// skip dups
			if ( dd->isInTable(&h) ) goto getNextSyn;
			// dedup. return false with g_errno set on error
			if ( ! dd->addKey(&h) ) return m_aidsPtr - m_aids;
		}
		// store it
		*m_aidsPtr++ = h;
		// store source
		*m_srcPtr++ = sourceId;
		// store the langid of the synonym for the query term
		// (one langid byte per synonym, not a bit vector)
		if ( synLangAbbrLen > 30 ) gbshutdownAbort(true);
		gbmemcpy ( tmp , synLangAbbr , synLangAbbrLen );
		tmp[synLangAbbrLen] = '\0';
		langId = getLangIdFromAbbr ( tmp ); // order is linear
		if ( langId < 0 ) langId = 0;
		*m_langIdsPtr = langId;
		hadSpace = false;
		klen = e - p;
		for ( int32_t k = 0 ; k < klen ; k++ )
			if ( is_wspace_a(p[k]) ) hadSpace = true;
		*m_termPtrsPtr++ = p;
		*m_termLensPtr++ = e - p;
		// increment the dummies to keep in sync with synonym index.
		// this is only for when m_termPtrs[x] is NULL because
		// we store the term into m_synWordBuf() because it is not
		// in our wiktionary file in memory.
		*m_termOffsPtr++ = -1;
		// only for multi-word synonyms like "New Jersey"...
		*m_wids0Ptr = 0LL;
		*m_wids1Ptr = 0LL;
		*m_numAlnumWordsPtr = 1;
		// and for multi alnum word synonyms
		if ( hadSpace ) {
			TokenizerResult tmptr;
			plain_tokenizer_phase_1(p,e-p,&tmptr);
			calculate_tokens_hashes(&tmptr);
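			// tokens come back as word / separator / word for a
			// two-word synonym, so indices 0 and 2 are the two
			// alnum words (assumes phase-1 tokenization keeps the
			// separator as its own token, as the indexing implies)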
			*(int64_t *)m_wids0Ptr = tmptr[0].token_hash;
			*(int64_t *)m_wids1Ptr = tmptr[2].token_hash;
			int alfanumCount = 0;
			for ( const auto &t : tmptr.tokens )
				if ( t.is_alfanum )
					alfanumCount++;
			*(int32_t *)m_numAlnumWordsPtr = alfanumCount;
		}
		m_wids0Ptr++;
		m_wids1Ptr++;
		m_langIdsPtr++;
		m_numAlnumWordsPtr++;
		// how many words did we have to hash to find a synset?
		// i.e. "new jersey" would be 2, to get "nj"
		*m_numAlnumWordsInBasePtr++ = baseNumAlnumWords;
		// do not breach
		if ( ++count >= maxSyns ) return m_aidsPtr - m_aids;
getNextSyn:
		// loop for more
		if ( *e == ',' ) { e++; p = e; goto hashLoop; }
		// add in the next syn set, deduped
		if ( next ) { ss = next; goto addSynSet; }
		// wrap it up
		//done:
		// all done
		//return m_aidsPtr - m_aids;
	}
	// do not breach
	if ( m_aidsPtr - m_aids > maxSyns ) return m_aidsPtr - m_aids;
	// returns false with g_errno set
	if ( ! addAmpPhrase(tr, wordNum, &dt) ) return m_aidsPtr - m_aids;
	// do not breach
	if ( m_aidsPtr - m_aids > maxSyns ) return m_aidsPtr - m_aids;
	// if we end in 's, strip it and add
	if ( wlen >= 3 &&
	     w[wlen-1] == 's' &&
	     w[wlen-2] == '\'' &&
	     ! addWithoutApostrophe(token.token_start,token.token_len, &dt) )
		return m_aidsPtr - m_aids;
	return m_aidsPtr - m_aids;
}
bool Synonyms::addWithoutApostrophe(const char *w, int32_t wlen, HashTableX *dt) {
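	// strips the trailing "'s" (the caller has already verified it is
	// there), e.g. "john's" -> "john"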
	wlen -= 2;
	uint64_t h = hash64Lower_utf8 ( w, wlen );
	// do not add dups
	if ( dt->isInTable ( &h ) ) return true;
	// add to dedup table. return false with g_errno set on error
	if ( ! dt->addKey ( &h ) ) return false;
	// store that
	*m_aidsPtr++ = h;
	*m_wids0Ptr++ = 0LL;
	*m_wids1Ptr++ = 0LL;
	*m_termPtrsPtr++ = NULL;
	*m_termLensPtr++ = wlen;
	*m_termOffsPtr++ = m_synWordBuf.length();
	m_synWordBuf.safeMemcpy(w,wlen);
	m_synWordBuf.pushChar('\0');
	*m_numAlnumWordsPtr++ = 1;
	*m_numAlnumWordsInBasePtr++ = 1;
	*m_srcPtr++ = SOURCE_GENERATED;
	// no langs
	*m_langIdsPtr++ = 0;
	return true;
}
// just index the first bigram for now to give a little bonus
bool Synonyms::addAmpPhrase(const TokenizerResult *tr, unsigned wordNum, class HashTableX* dt)
{
	// . "D & B" --> dandb
	// . make the "andb" a suffix
	if ( wordNum+2 >= tr->size() ) return true;
	const auto &t0 = (*tr)[wordNum];
	const auto &t1 = (*tr)[wordNum+1];
	const auto &t2 = (*tr)[wordNum+2];
	if ( !has_char(t1.token_start,t1.token_end(),'&') )
		return true;
	if ( !t2.is_alfanum )
		return true;
	if ( t2.token_len > 50 )
		return true;
	// need this for hash continuation procedure
	int32_t conti = 0;
	// hack for "d & b" -> "dandb"
	uint64_t h = hash64Lower_utf8_cont(t0.token_start,t0.token_len,0LL,&conti);
	// just make it a bigram with the word "and" after it
	// . we usually ignore stop words like and when someone does the query
	//   but we give out bonus points if the query term's left or right
	//   bigram has that stop word where it should be.
	// . so Dave & Barry will index "daveand" as a bigram and the
	//   search for 'Dave and Barry' will give bonus points for that
	//   bigram.
	h = hash64Lower_utf8_cont ( "and", 3, h, &conti );
	// logic in Phrases.cpp will xor it with 0x768867
	// because it contains a stop word. this prevents "st.
	// and" from matching "stand".
	h ^= 0x768867; //keep in sync with Phrases
	// do not add dups
	if ( dt->isInTable ( &h ) ) return true;
	// add to dedup table. return false with g_errno set on error
	if ( ! dt->addKey ( &h ) ) return false;
	// store that
	*m_aidsPtr++ = h;
	*m_wids0Ptr++ = 0LL;
	*m_wids1Ptr++ = 0LL;
	*m_termPtrsPtr++ = NULL;
	*m_termOffsPtr++ = m_synWordBuf.length();
	*m_termLensPtr++ = t0.token_len;
	m_synWordBuf.safeMemcpy(t0.token_start,t0.token_len);
	m_synWordBuf.safeStrcpy(" and");
	m_synWordBuf.pushChar('\0');
	*m_numAlnumWordsPtr++ = 1;
	*m_numAlnumWordsInBasePtr++ = 1;
	*m_srcPtr++ = SOURCE_GENERATED;
	// no langs
	*m_langIdsPtr++ = 0;
	return true;
}
const char *getSourceString ( char source ) {
	if ( source == SOURCE_NONE          ) return "none";
	if ( source == SOURCE_PRESET        ) return "preset";
	if ( source == SOURCE_WIKTIONARY    ) return "wiktionary";
	if ( source == SOURCE_GENERATED     ) return "generated";
	if ( source == SOURCE_BIGRAM        ) return "bigram";
	if ( source == SOURCE_TRIGRAM       ) return "trigram";
	if ( source == SOURCE_WIKTIONARY_EN ) return "wiktionary-en";
	// the thing we are hashing is a "number"
	if ( source == SOURCE_NUMBER        ) return "number";
	return "unknown";
}