fix stripAccentMarks() to use libiconv stuff

so all languages are now supported.
2014-05-31 08:14:39 -07:00
parent 5f16013a9e
commit f16414b774
3 changed files with 109 additions and 3 deletions
--- a/Synonyms.cpp
+++ b/Synonyms.cpp
@ -419,6 +419,10 @@ bool Synonyms::addStripped ( char *w , long wlen , HashTableX *dt ) {
 	long alen = stripAccentMarks(abuf,256,(unsigned char *)w,wlen);
 	// skip if can't convert to ascii... (unsupported letter)
 	if ( alen < 0 ) return true;
+
+	// if same as original word, skip
+	if ( wlen==alen && strncmp(abuf,w,wlen) == 0 ) return true;
+
 	// hash it
 	uint64_t h2 = hash64Lower_utf8(abuf,alen);
 	// do not add dups
--- a/Unicode.cpp
+++ b/Unicode.cpp
@ -660,7 +660,7 @@ UChar32 utf8Decode2(const char *p, const char **next){
 	};
 }
 */
-
+/*
 // starting at 0xc3 0x80  ending at 0xc3 0xbf
 static char ascii_c3[] = {
 	'A', // 80
@ -1075,9 +1075,9 @@ static bichar utf_cf[] = {
 	{0xcf,0xb9}, // be Ͼ
 	{0xcf,0xbd}  // bf Ͽ
 };
-	
-
+*/	

+/*
 //long utf8ToAscii(char *outbuf, long outbufsize,
 long stripAccentMarks (char *outbuf, long outbufsize,
 		       unsigned char *p, long inbuflen) { // inbuf
@ -1129,6 +1129,41 @@ long stripAccentMarks (char *outbuf, long outbufsize,
 	*dst = '\0';
 	return dst - outbuf;
 }
+*/
+
+
+// Strip accent marks (and other decomposition modifiers) from a UTF-8 string.
+//
+// Each input character is decoded and passed to recursiveKDExpand(). If the
+// expansion holds at most one code point the original bytes are copied
+// through untouched; otherwise only the FIRST code point of the expansion is
+// kept and re-encoded as UTF-8.
+//
+// . outbuf     : destination buffer
+// . outbufsize : capacity of outbuf in bytes, INCLUDING room for the
+//                terminating NUL this function writes
+// . p          : input UTF-8 bytes (length given by inbuflen)
+// . inbuflen   : number of input bytes
+//
+// Returns the number of bytes stored in outbuf, not counting the terminating
+// NUL. Returns -1 if the output does not fit in outbuf or if the input ends
+// in the middle of a multi-byte character, so callers must check for < 0.
+//
+// NOTE(review): keeping only ttt[0] presumably drops letters for characters
+// whose expansion is several base letters rather than base+mark (ligatures
+// and the like) -- confirm against recursiveKDExpand()'s output.
+long stripAccentMarks (char *outbuf, long outbufsize,
+		       unsigned char *p, long inbuflen) {
+	// we always NUL-terminate, so we need at least one byte of output
+	if ( ! outbuf || outbufsize <= 0 ) return -1;
+	char *s    = (char *)p;
+	char *send = (char *)p + inbuflen;
+	char *dst  = outbuf;
+	// one past the last byte we may fill with text; the byte at dstEnd
+	// itself is reserved for the terminating NUL
+	char *dstEnd = outbuf + outbufsize - 1;
+	long cs;
+	for ( ; s < send ; s += cs ) {
+		// how big is this character?
+		cs = getUtf8CharSize(s);
+		// a truncated trailing sequence would make us read past the
+		// input, and a non-positive size would loop forever
+		if ( cs <= 0 || cs > send - s ) { *dst = '\0'; return -1; }
+		// convert the utf8 character to UChar32
+		UChar32 uc = utf8Decode ( s );
+		// break "uc" into decomposition of UChar32s
+		UChar32 ttt[8];
+		long klen = recursiveKDExpand(uc,ttt,8);
+		// if the same, leave it! it had no accent marks or other
+		// modifiers...
+		if ( klen <= 1 ) {
+			// breach check BEFORE writing
+			if ( cs > dstEnd - dst ) { *dst = '\0'; return -1; }
+			memcpy ( dst , s , cs );
+			dst += cs;
+			continue;
+		}
+		// take the first one as the stripped char. encode it into a
+		// scratch buffer first because we do not know how many bytes
+		// it needs until it has been encoded.
+		char tmp[8];
+		long stored = utf8Encode ( ttt[0] , tmp );
+		// breach check BEFORE writing
+		if ( stored < 0 || stored > dstEnd - dst ) {
+			*dst = '\0';
+			return -1;
+		}
+		memcpy ( dst , tmp , stored );
+		// skip over the stored utf8 char
+		dst += stored;
+	}
+	// NUL-terminate like the implementation this one replaces
+	*dst = '\0';
+	// return # of bytes stored into outbuf (excluding the NUL)
+	return dst - outbuf;
+}


 // helper function for printing unicode text range
--- a/main.cpp
+++ b/main.cpp
@ -2900,6 +2900,73 @@ int main2 ( int argc , char *argv[] ) {
 		return 1;
 	}

+	// some tests. the greek letter alpha with an accent mark (decompose)
+	/*
+	{
+		char us[] = {0xe1,0xbe,0x80};
+		UChar32 uc = utf8Decode(us);//,&next);
+		UChar32 ttt[32];
+		long klen = recursiveKDExpand(uc,ttt,256);
+		char obuf[64];
+		for ( long i = 0 ; i < klen ; i++ ) {
+			UChar32 ui = ttt[i];
+			long blen = utf8Encode(ui,obuf);
+			obuf[blen]=0;
+			long an = ucIsAlpha(ui);
+			
+			fprintf(stderr,"#%li=%s (alnum=%li)\n",i,obuf,an);
+		}
+		fprintf(stderr,"hey\n");
+		exit(0);
+	}
+	*/
+
+	/*
+
+	  PRINT OUT all Unicode characters and their decompositions
+
+	{
+		for ( long uc = 0 ; uc < 0xe01ef ; uc++ ) {
+			//if ( ! ucIsAlnum(uc) ) continue;
+			UChar32 ttt[32];
+			long klen = recursiveKDExpand(uc,ttt,256);
+			char obuf[64];
+			long clen = utf8Encode(uc,obuf);
+			obuf[clen]=0;
+			// print utf8 char we are decomposing
+			fprintf(stderr,"%lx) %s --> ",uc,obuf);
+			// sanity
+			if ( klen > 1 && ttt[0] == (UChar32)uc ) {
+				fprintf(stderr,"SAME\n");
+				continue;
+			}
+			// print decomposition
+			for ( long i = 0 ; i < klen ; i++ ) {
+				UChar32 ui = ttt[i];
+				char qbuf[64];
+				long blen = utf8Encode(ui,qbuf);
+				qbuf[blen]=0;
+				fprintf(stderr,"%s",qbuf);
+				// show the #
+				fprintf(stderr,"{%lx}",(long)ui);
+				if ( i+1<klen ) fprintf(stderr,", ");
+			}
+			// show utf8 rep
+			fprintf(stderr," [");
+			for ( long i = 0 ; i < clen ; i++ ) {
+				fprintf(stderr,"0x%hhx",(int)obuf[i]);
+				if ( i+1<clen) fprintf(stderr," ");
+			}
+			fprintf(stderr,"]");
+			fprintf(stderr,"\n");
+		}
+		exit(0);
+	}
+	*/			
+
+
+	
+
 	// the wiktionary for lang identification and alternate word forms/
 	// synonyms
 	if ( ! g_wiktionary.load() ) return 1;