When a user searches for a word without its

accent marks, we now also search for the same
word written with the proper accent marks.
This commit is contained in:
Matt Wells 2014-06-01 09:37:00 -07:00
parent 6f704d3d6a
commit d15f5d3ce7
7 changed files with 492869 additions and 21974 deletions

3
Lang.h

@ -72,7 +72,8 @@ enum {
langMalgasy = 52,
langKurdish = 53,
langLuxembourgish = 54,
langEstonian = 55
langEstonian = 55,
langLast = 56
};
uint8_t getLanguageFromName(uint8_t *name);

@ -113,11 +113,15 @@ long Synonyms::getSynonyms ( Words *words ,
char sourceId = SOURCE_WIKTIONARY;
char *ss = NULL;
char *savedss = NULL;
long long bwid;
char wikiLangId = m_docLangId;
bool hadSpace ;
long klen ;
long baseNumAlnumWords;
char origLangId = wikiLangId;
long synSetCount = 0;
bool doLangLoop = false;
tryOtherLang:
@ -178,9 +182,47 @@ long Synonyms::getSynonyms ( Words *words ,
}
}
// loop over all the other langids if no synset found in this langid
if ( ! ss && ! doLangLoop ) {
wikiLangId = langUnknown; // start at 0
doLangLoop = true;
}
// loop through all languages if no luck
if ( doLangLoop ) {
// save it
if ( ss ) savedss = ss;
// can only have one match to avoid ambiguity when doing
// a loop over all the langids
if ( ss && ++synSetCount >= 2 ) {
ss = NULL;
goto skip;
}
// advance langid of synset attempt
wikiLangId++;
// advance over original we tried first
if ( wikiLangId == origLangId )
wikiLangId++;
// all done?
if ( wikiLangId < langLast ) { // the last langid
ss = NULL;
goto tryOtherLang;
}
}
// use the one single synset we found for some language
if ( ! ss ) ss = savedss;
skip:
// even though a document may be in german it often has some
// english words "pdf download" "copyright" etc. so if the word
// has no synset in german, try it in english
/*
if ( //numPresets == 0 &&
! ss &&
m_docLangId != langEnglish &&
@ -192,6 +234,8 @@ long Synonyms::getSynonyms ( Words *words ,
sourceId = SOURCE_WIKTIONARY_EN;
goto tryOtherLang;
}
*/
// if it was in wiktionary, just use that synset
if ( ss ) {

@ -1144,8 +1144,11 @@ long stripAccentMarks (char *outbuf, long outbufsize,
// convert the utf8 character to UChar32
UChar32 uc = utf8Decode ( s );
// break "uc" into decomposition of UChar32s
UChar32 ttt[8];
long klen = recursiveKDExpand(uc,ttt,8);
UChar32 ttt[32];
long klen = recursiveKDExpand(uc,ttt,32);
if(klen>32){char *xx=NULL;*xx=0;}
// sanity
if ( dst + 5 > outbuf+outbufsize ) return -1;
// if the same, leave it! it had no accent marks or other
// modifiers...
if ( klen <= 1 ) {

@ -1900,10 +1900,11 @@ bool Wiktionary::compile ( ) {
//long long lastWid = 0LL;
// remove dups
HashTableX dd2;
char dbuf2[256];
dd2.set(8,0,8,dbuf2,256,false,0,"ddttt2");
char dbuf2[512];
dd2.set(8,0,8,dbuf2,512,false,0,"ddttt2");
// how many forms? must be 2+ to get added to syntable
long formCount = 0;
long stripCount = 0;
for ( long j = i ; ; j++ ) {
// wrap around
if ( j >= m_tmp.m_numSlots ) j = 0;
@ -1942,19 +1943,24 @@ bool Wiktionary::compile ( ) {
1023,
(unsigned char *)word,
gbstrlen(word));
if ( stripLen > 0 )
formCount++;
if ( stripLen <= 0 ) continue;
// if same as original word, skip
long wlen = gbstrlen(word);
if ( wlen==stripLen && strncmp(a,word,wlen)==0)
continue;
// count as additional form
stripCount++;
}
// need 2+ forms!
if ( formCount <= 1 ) continue;
if ( formCount +stripCount <= 1 ) continue;
// base form
//long long wid = *(long long *)m_tmp.getDataFromSlot(i);
// remember buf start
long bufLen = m_synBuf.length();
// remove dups
HashTableX dd;
char dbuf[256];
dd.set(8,0,8,dbuf,256,false,0,"ddttt");
char dbuf[512];
dd.set(8,0,8,dbuf,512,false,0,"ddttt");
// a byte for storing the # of synonym forms
//m_synBuf.pushChar(0);
// push the langid!
@ -2037,13 +2043,30 @@ bool Wiktionary::compile ( ) {
1023,
(unsigned char *)word,
gbstrlen(word));
// debug time
if ( stripLen > 0 ) a[stripLen] = 0;
//if ( stripLen > 0 )
// log("wikt: %li) %s->%s",i,word,a);
//if ( i==5133265 )
// log("hey");
// if same as original word, ignore it
if ( stripLen > 0 ) {
long long swid = hash64Lower_utf8(word);
long wlen = gbstrlen(word);
if ( wlen==stripLen &&
strncmp(a,word,wlen) == 0 )
stripLen = 0;
}
// if different, add it
if ( stripLen > 0 ) {
long long swid = hash64Lower_utf8(a);
// xor in the langid
swid ^= g_hashtab[0][langId];
// only add this word form once per langId
if ( dd.isInTable ( &swid ) ) continue;
dd.addKey ( &swid );
// . a ptr to that sequence of alt forms in buf
// . this uses 6 byte keys
m_synTable.addKey(&swid,&bufLen);
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.