forked from Mirrors/privacore-open-source-search-engine
fix stripAccentMarks() to use libiconv stuff
so all languages are now supported.
This commit is contained in:
@ -419,6 +419,10 @@ bool Synonyms::addStripped ( char *w , long wlen , HashTableX *dt ) {
|
||||
long alen = stripAccentMarks(abuf,256,(unsigned char *)w,wlen);
|
||||
// skip if the accent marks could not be stripped (negative length = failure)
|
||||
if ( alen < 0 ) return true;
|
||||
|
||||
// if same as original word, skip
|
||||
if ( wlen==alen && strncmp(abuf,w,wlen) == 0 ) return true;
|
||||
|
||||
// hash it
|
||||
uint64_t h2 = hash64Lower_utf8(abuf,alen);
|
||||
// do not add dups
|
||||
|
41
Unicode.cpp
41
Unicode.cpp
@ -660,7 +660,7 @@ UChar32 utf8Decode2(const char *p, const char **next){
|
||||
};
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
// starting at 0xc3 0x80 ending at 0xc3 0xbf
|
||||
static char ascii_c3[] = {
|
||||
'A', // 80
|
||||
@ -1075,9 +1075,9 @@ static bichar utf_cf[] = {
|
||||
{0xcf,0xb9}, // be Ͼ
|
||||
{0xcf,0xbd} // bf Ͽ
|
||||
};
|
||||
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
//long utf8ToAscii(char *outbuf, long outbufsize,
|
||||
long stripAccentMarks (char *outbuf, long outbufsize,
|
||||
unsigned char *p, long inbuflen) { // inbuf
|
||||
@ -1129,6 +1129,41 @@ long stripAccentMarks (char *outbuf, long outbufsize,
|
||||
*dst = '\0';
|
||||
return dst - outbuf;
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
long stripAccentMarks (char *outbuf, long outbufsize,
|
||||
unsigned char *p, long inbuflen) {
|
||||
char *s = (char *)p;
|
||||
char *send = (char *)p + inbuflen;
|
||||
long cs;
|
||||
char *dst = outbuf;
|
||||
for ( ; s < send ; s += cs ) {
|
||||
// how big is this character?
|
||||
cs = getUtf8CharSize(s);
|
||||
// convert the utf8 character to UChar32
|
||||
UChar32 uc = utf8Decode ( s );
|
||||
// break "uc" into decomposition of UChar32s
|
||||
UChar32 ttt[8];
|
||||
long klen = recursiveKDExpand(uc,ttt,8);
|
||||
// if the same, leave it! it had no accent marks or other
|
||||
// modifiers...
|
||||
if ( klen <= 1 ) {
|
||||
memcpy ( dst , s , cs );
|
||||
dst += cs;
|
||||
continue;
|
||||
}
|
||||
// take the first one as the stripped
|
||||
// convert back to utf8
|
||||
long stored = utf8Encode ( ttt[0] , dst );
|
||||
// skip over the stored utf8 char
|
||||
dst += stored;
|
||||
}
|
||||
// sanity. breach check
|
||||
if ( dst > outbuf+outbufsize ) { char *xx=NULL;*xx=0; }
|
||||
// return # of bytes stored into outbuf
|
||||
return dst - outbuf;
|
||||
}
|
||||
|
||||
|
||||
// helper function for printing unicode text range
|
||||
|
67
main.cpp
67
main.cpp
@ -2900,6 +2900,73 @@ int main2 ( int argc , char *argv[] ) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// some tests. the greek letter alpha with an accent mark (decompose)
|
||||
/*
|
||||
{
|
||||
char us[] = {0xe1,0xbe,0x80};
|
||||
UChar32 uc = utf8Decode(us);//,&next);
|
||||
UChar32 ttt[32];
|
||||
long klen = recursiveKDExpand(uc,ttt,256);
|
||||
char obuf[64];
|
||||
for ( long i = 0 ; i < klen ; i++ ) {
|
||||
UChar32 ui = ttt[i];
|
||||
long blen = utf8Encode(ui,obuf);
|
||||
obuf[blen]=0;
|
||||
long an = ucIsAlpha(ui);
|
||||
|
||||
fprintf(stderr,"#%li=%s (alnum=%li)\n",i,obuf,an);
|
||||
}
|
||||
fprintf(stderr,"hey\n");
|
||||
exit(0);
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
|
||||
PRINT OUT all Unicode characters and their decompositions
|
||||
|
||||
{
|
||||
for ( long uc = 0 ; uc < 0xe01ef ; uc++ ) {
|
||||
//if ( ! ucIsAlnum(uc) ) continue;
|
||||
UChar32 ttt[32];
|
||||
long klen = recursiveKDExpand(uc,ttt,256);
|
||||
char obuf[64];
|
||||
long clen = utf8Encode(uc,obuf);
|
||||
obuf[clen]=0;
|
||||
// print utf8 char we are decomposing
|
||||
fprintf(stderr,"%lx) %s --> ",uc,obuf);
|
||||
// sanity
|
||||
if ( klen > 1 && ttt[0] == (UChar32)uc ) {
|
||||
fprintf(stderr,"SAME\n");
|
||||
continue;
|
||||
}
|
||||
// print decomposition
|
||||
for ( long i = 0 ; i < klen ; i++ ) {
|
||||
UChar32 ui = ttt[i];
|
||||
char qbuf[64];
|
||||
long blen = utf8Encode(ui,qbuf);
|
||||
qbuf[blen]=0;
|
||||
fprintf(stderr,"%s",qbuf);
|
||||
// show the #
|
||||
fprintf(stderr,"{%lx}",(long)ui);
|
||||
if ( i+1<klen ) fprintf(stderr,", ");
|
||||
}
|
||||
// show utf8 rep
|
||||
fprintf(stderr," [");
|
||||
for ( long i = 0 ; i < clen ; i++ ) {
|
||||
fprintf(stderr,"0x%hhx",(int)obuf[i]);
|
||||
if ( i+1<clen) fprintf(stderr," ");
|
||||
}
|
||||
fprintf(stderr,"]");
|
||||
fprintf(stderr,"\n");
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
// the wiktionary for lang identification and alternate word forms/
|
||||
// synonyms
|
||||
if ( ! g_wiktionary.load() ) return 1;
|
||||
|
Reference in New Issue
Block a user