When a user searches for a word without its
accent marks, we now also search for the same word with the proper accent marks.
This commit is contained in:
parent
6f704d3d6a
commit
d15f5d3ce7
3
Lang.h
3
Lang.h
@ -72,7 +72,8 @@ enum {
|
||||
langMalgasy = 52,
|
||||
langKurdish = 53,
|
||||
langLuxembourgish = 54,
|
||||
langEstonian = 55
|
||||
langEstonian = 55,
|
||||
langLast = 56
|
||||
};
|
||||
|
||||
uint8_t getLanguageFromName(uint8_t *name);
|
||||
|
44
Synonyms.cpp
44
Synonyms.cpp
@ -113,11 +113,15 @@ long Synonyms::getSynonyms ( Words *words ,
|
||||
|
||||
char sourceId = SOURCE_WIKTIONARY;
|
||||
char *ss = NULL;
|
||||
char *savedss = NULL;
|
||||
long long bwid;
|
||||
char wikiLangId = m_docLangId;
|
||||
bool hadSpace ;
|
||||
long klen ;
|
||||
long baseNumAlnumWords;
|
||||
char origLangId = wikiLangId;
|
||||
long synSetCount = 0;
|
||||
bool doLangLoop = false;
|
||||
|
||||
tryOtherLang:
|
||||
|
||||
@ -178,9 +182,47 @@ long Synonyms::getSynonyms ( Words *words ,
|
||||
}
|
||||
}
|
||||
|
||||
// loop over all the other langids if no synset found in this langid
|
||||
if ( ! ss && ! doLangLoop ) {
|
||||
wikiLangId = langUnknown; // start at 0
|
||||
doLangLoop = true;
|
||||
}
|
||||
|
||||
// loop through all languages if no luck
|
||||
if ( doLangLoop ) {
|
||||
|
||||
// save it
|
||||
if ( ss ) savedss = ss;
|
||||
|
||||
// can only have one match to avoid ambiguity when doing
|
||||
// a loop over all the langids
|
||||
if ( ss && ++synSetCount >= 2 ) {
|
||||
ss = NULL;
|
||||
goto skip;
|
||||
}
|
||||
|
||||
// advance langid of synset attempt
|
||||
wikiLangId++;
|
||||
|
||||
// advance over original we tried first
|
||||
if ( wikiLangId == origLangId )
|
||||
wikiLangId++;
|
||||
// all done?
|
||||
if ( wikiLangId < langLast ) { // the last langid
|
||||
ss = NULL;
|
||||
goto tryOtherLang;
|
||||
}
|
||||
}
|
||||
|
||||
// use the one single synset we found for some language
|
||||
if ( ! ss ) ss = savedss;
|
||||
|
||||
skip:
|
||||
|
||||
// even though a document may be in german it often has some
|
||||
// english words "pdf download" "copyright" etc. so if the word
|
||||
// has no synset in german, try it in english
|
||||
/*
|
||||
if ( //numPresets == 0 &&
|
||||
! ss &&
|
||||
m_docLangId != langEnglish &&
|
||||
@ -192,6 +234,8 @@ long Synonyms::getSynonyms ( Words *words ,
|
||||
sourceId = SOURCE_WIKTIONARY_EN;
|
||||
goto tryOtherLang;
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
// if it was in wiktionary, just use that synset
|
||||
if ( ss ) {
|
||||
|
@ -1144,8 +1144,11 @@ long stripAccentMarks (char *outbuf, long outbufsize,
|
||||
// convert the utf8 character to UChar32
|
||||
UChar32 uc = utf8Decode ( s );
|
||||
// break "uc" into decomposition of UChar32s
|
||||
UChar32 ttt[8];
|
||||
long klen = recursiveKDExpand(uc,ttt,8);
|
||||
UChar32 ttt[32];
|
||||
long klen = recursiveKDExpand(uc,ttt,32);
|
||||
if(klen>32){char *xx=NULL;*xx=0;}
|
||||
// sanity
|
||||
if ( dst + 5 > outbuf+outbufsize ) return -1;
|
||||
// if the same, leave it! it had no accent marks or other
|
||||
// modifiers...
|
||||
if ( klen <= 1 ) {
|
||||
|
@ -1900,10 +1900,11 @@ bool Wiktionary::compile ( ) {
|
||||
//long long lastWid = 0LL;
|
||||
// remove dups
|
||||
HashTableX dd2;
|
||||
char dbuf2[256];
|
||||
dd2.set(8,0,8,dbuf2,256,false,0,"ddttt2");
|
||||
char dbuf2[512];
|
||||
dd2.set(8,0,8,dbuf2,512,false,0,"ddttt2");
|
||||
// how many forms? must be 2+ to get added to syntable
|
||||
long formCount = 0;
|
||||
long stripCount = 0;
|
||||
for ( long j = i ; ; j++ ) {
|
||||
// wrap around
|
||||
if ( j >= m_tmp.m_numSlots ) j = 0;
|
||||
@ -1942,19 +1943,24 @@ bool Wiktionary::compile ( ) {
|
||||
1023,
|
||||
(unsigned char *)word,
|
||||
gbstrlen(word));
|
||||
if ( stripLen > 0 )
|
||||
formCount++;
|
||||
if ( stripLen <= 0 ) continue;
|
||||
// if same as original word, skip
|
||||
long wlen = gbstrlen(word);
|
||||
if ( wlen==stripLen && strncmp(a,word,wlen)==0)
|
||||
continue;
|
||||
// count as additional form
|
||||
stripCount++;
|
||||
}
|
||||
// need 2+ forms!
|
||||
if ( formCount <= 1 ) continue;
|
||||
if ( formCount +stripCount <= 1 ) continue;
|
||||
// base form
|
||||
//long long wid = *(long long *)m_tmp.getDataFromSlot(i);
|
||||
// remember buf start
|
||||
long bufLen = m_synBuf.length();
|
||||
// remove dups
|
||||
HashTableX dd;
|
||||
char dbuf[256];
|
||||
dd.set(8,0,8,dbuf,256,false,0,"ddttt");
|
||||
char dbuf[512];
|
||||
dd.set(8,0,8,dbuf,512,false,0,"ddttt");
|
||||
// a byte for storing the # of synonym forms
|
||||
//m_synBuf.pushChar(0);
|
||||
// push the langid!
|
||||
@ -2037,13 +2043,30 @@ bool Wiktionary::compile ( ) {
|
||||
1023,
|
||||
(unsigned char *)word,
|
||||
gbstrlen(word));
|
||||
// debug time
|
||||
if ( stripLen > 0 ) a[stripLen] = 0;
|
||||
//if ( stripLen > 0 )
|
||||
// log("wikt: %li) %s->%s",i,word,a);
|
||||
//if ( i==5133265 )
|
||||
// log("hey");
|
||||
// if same as original word, ignore it
|
||||
if ( stripLen > 0 ) {
|
||||
long long swid = hash64Lower_utf8(word);
|
||||
long wlen = gbstrlen(word);
|
||||
if ( wlen==stripLen &&
|
||||
strncmp(a,word,wlen) == 0 )
|
||||
stripLen = 0;
|
||||
}
|
||||
// if different, add it
|
||||
if ( stripLen > 0 ) {
|
||||
long long swid = hash64Lower_utf8(a);
|
||||
// xor in the langid
|
||||
swid ^= g_hashtab[0][langId];
|
||||
// only add this word form once per langId
|
||||
if ( dd.isInTable ( &swid ) ) continue;
|
||||
dd.addKey ( &swid );
|
||||
// . a ptr to that sequence of alt forms in buf
|
||||
// . this uses 6 byte keys
|
||||
m_synTable.addKey(&swid,&bufLen);
|
||||
}
|
||||
|
||||
|
||||
|
480381
wiktionary-buf.txt
480381
wiktionary-buf.txt
File diff suppressed because it is too large
Load Diff
34369
wiktionary-lang.txt
34369
wiktionary-lang.txt
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user