fix stripAccentMarks() to use libiconv stuff

so all languages are now supported.
This commit is contained in:
mwells
2014-05-31 08:14:39 -07:00
parent 5f16013a9e
commit f16414b774
3 changed files with 109 additions and 3 deletions

@ -419,6 +419,10 @@ bool Synonyms::addStripped ( char *w , long wlen , HashTableX *dt ) {
long alen = stripAccentMarks(abuf,256,(unsigned char *)w,wlen);
// skip if can't convert to ascii... (unsupported letter)
if ( alen < 0 ) return true;
// if same as original word, skip
if ( wlen==alen && strncmp(abuf,w,wlen) == 0 ) return true;
// hash it
uint64_t h2 = hash64Lower_utf8(abuf,alen);
// do not add dups

@ -660,7 +660,7 @@ UChar32 utf8Decode2(const char *p, const char **next){
};
}
*/
/*
// starting at 0xc3 0x80 ending at 0xc3 0xbf
static char ascii_c3[] = {
'A', // 80
@ -1075,9 +1075,9 @@ static bichar utf_cf[] = {
{0xcf,0xb9}, // be Ͼ
{0xcf,0xbd} // bf Ͽ
};
*/
/*
//long utf8ToAscii(char *outbuf, long outbufsize,
long stripAccentMarks (char *outbuf, long outbufsize,
unsigned char *p, long inbuflen) { // inbuf
@ -1129,6 +1129,41 @@ long stripAccentMarks (char *outbuf, long outbufsize,
*dst = '\0';
return dst - outbuf;
}
*/
// Strip accent marks and other modifiers from a UTF-8 string.
//
// Each input character is decoded and expanded with recursiveKDExpand().
// If the expansion yields more than one code point, only the FIRST one is
// kept (taken to be the base letter) and re-encoded as UTF-8. Characters
// whose expansion has zero or one code point are copied through unchanged.
//
// outbuf      - destination buffer; the output is NOT NUL-terminated
// outbufsize  - capacity of outbuf in bytes
// p           - UTF-8 input
// inbuflen    - length of the input in bytes
//
// Returns the number of bytes stored in outbuf, or -1 if the output does
// not fit in outbufsize bytes or the input ends in the middle of a
// character. Nothing is ever written past outbuf+outbufsize.
long stripAccentMarks (char *outbuf, long outbufsize,
		       unsigned char *p, long inbuflen) {
	char *s      = (char *)p;
	char *send   = (char *)p + inbuflen;
	char *dst    = outbuf;
	char *dstEnd = outbuf + outbufsize;
	long  cs;
	for ( ; s < send ; s += cs ) {
		// how big is this character?
		cs = getUtf8CharSize(s);
		// a non-positive size would never advance, and a size that
		// runs past the end of the input means a truncated sequence
		// that we must not read
		if ( cs <= 0 || cs > send - s ) return -1;
		// convert the utf8 character to UChar32
		UChar32 uc = utf8Decode ( s );
		// break "uc" into decomposition of UChar32s
		// NOTE(review): assumes recursiveKDExpand() honors the
		// capacity argument (8) -- confirm against its definition
		UChar32 ttt[8];
		long klen = recursiveKDExpand(uc,ttt,8);
		// if the same, leave it! it had no accent marks or other
		// modifiers...
		if ( klen <= 1 ) {
			// check BEFORE writing, not after
			if ( cs > dstEnd - dst ) return -1;
			memcpy ( dst , s , cs );
			dst += cs;
			continue;
		}
		// take the first one as the stripped character and convert
		// it back to utf8. encode into a scratch buffer first because
		// we do not know its encoded size until after encoding. a
		// UTF-8 sequence is at most 4 bytes (6 in the legacy form),
		// so 8 bytes is always enough.
		char tmp[8];
		long stored = utf8Encode ( ttt[0] , tmp );
		if ( stored < 0 || stored > dstEnd - dst ) return -1;
		memcpy ( dst , tmp , stored );
		// skip over the stored utf8 char
		dst += stored;
	}
	// return # of bytes stored into outbuf
	return dst - outbuf;
}
// helper function for printing unicode text range

@ -2900,6 +2900,73 @@ int main2 ( int argc , char *argv[] ) {
return 1;
}
// some tests. the greek letter alpha with an accent mark (decompose)
/*
{
char us[] = {0xe1,0xbe,0x80};
UChar32 uc = utf8Decode(us);//,&next);
UChar32 ttt[32];
long klen = recursiveKDExpand(uc,ttt,256);
char obuf[64];
for ( long i = 0 ; i < klen ; i++ ) {
UChar32 ui = ttt[i];
long blen = utf8Encode(ui,obuf);
obuf[blen]=0;
long an = ucIsAlpha(ui);
fprintf(stderr,"#%li=%s (alnum=%li)\n",i,obuf,an);
}
fprintf(stderr,"hey\n");
exit(0);
}
*/
/*
PRINT OUT all Unicode characters and their decompositions
{
for ( long uc = 0 ; uc < 0xe01ef ; uc++ ) {
//if ( ! ucIsAlnum(uc) ) continue;
UChar32 ttt[32];
long klen = recursiveKDExpand(uc,ttt,256);
char obuf[64];
long clen = utf8Encode(uc,obuf);
obuf[clen]=0;
// print utf8 char we are decomposing
fprintf(stderr,"%lx) %s --> ",uc,obuf);
// sanity
if ( klen > 1 && ttt[0] == (UChar32)uc ) {
fprintf(stderr,"SAME\n");
continue;
}
// print decomposition
for ( long i = 0 ; i < klen ; i++ ) {
UChar32 ui = ttt[i];
char qbuf[64];
long blen = utf8Encode(ui,qbuf);
qbuf[blen]=0;
fprintf(stderr,"%s",qbuf);
// show the #
fprintf(stderr,"{%lx}",(long)ui);
if ( i+1<klen ) fprintf(stderr,", ");
}
// show utf8 rep
fprintf(stderr," [");
for ( long i = 0 ; i < clen ; i++ ) {
fprintf(stderr,"0x%hhx",(int)obuf[i]);
if ( i+1<clen) fprintf(stderr," ");
}
fprintf(stderr,"]");
fprintf(stderr,"\n");
}
exit(0);
}
*/
// the wiktionary for lang identification and alternate word forms/
// synonyms
if ( ! g_wiktionary.load() ) return 1;