When a user searches for a word without its accent marks, we now also search for the same word with the proper accent marks.
2014-06-01 09:37:00 -07:00 · 2014-06-01 09:37:00 -07:00 · d15f5d3ce7
commit d15f5d3ce7
parent 6f704d3d6a
7 changed files with 492869 additions and 21974 deletions
--- a/Lang.h
+++ b/Lang.h
@@ -72,7 +72,8 @@ enum {
 	langMalgasy        = 52,
 	langKurdish        = 53,
 	langLuxembourgish  = 54,
-	langEstonian       = 55
+	langEstonian       = 55,
+	langLast           = 56
 };

 uint8_t getLanguageFromName(uint8_t *name);
--- a/Synonyms.cpp
+++ b/Synonyms.cpp
@@ -113,11 +113,15 @@ long Synonyms::getSynonyms ( Words *words ,

 	char sourceId = SOURCE_WIKTIONARY;
 	char *ss = NULL;
+	char *savedss = NULL;
 	long long bwid;
 	char wikiLangId = m_docLangId;
 	bool hadSpace ;
 	long klen ;
 	long baseNumAlnumWords;
+	char origLangId = wikiLangId;
+	long synSetCount = 0;
+	bool doLangLoop = false;

 tryOtherLang:

@@ -178,9 +182,47 @@ long Synonyms::getSynonyms ( Words *words ,
 		}
 	}

+	// loop over all the other langids if no synset found in this langid
+	if ( ! ss && ! doLangLoop ) {
+		wikiLangId = langUnknown; // start at 0
+		doLangLoop = true;
+	}
+
+	// loop through all languages if no luck
+	if ( doLangLoop ) {
+
+		// save it
+		if ( ss ) savedss = ss;
+
+		// can only have one match to avoid ambiguity when doing
+		// a loop over all the langids
+		if ( ss && ++synSetCount >= 2 ) {
+			ss = NULL;
+			goto skip;
+		}
+
+		// advance langid of synset attempt
+		wikiLangId++;
+
+		// advance over original we tried first
+		if ( wikiLangId == origLangId )
+			wikiLangId++;
+		// all done?
+		if ( wikiLangId < langLast ) { // the last langid
+			ss = NULL;
+			goto tryOtherLang;
+		}
+	}
+
+	// use the one single synset we found for some language
+	if ( ! ss ) ss = savedss;
+
+ skip:
+
 	// even though a document may be in german it often has some
 	// english words "pdf download" "copyright" etc. so if the word
 	// has no synset in german, try it in english
+	/*
 	if ( //numPresets == 0 &&
 	     ! ss &&
 	     m_docLangId != langEnglish &&
@@ -192,6 +234,8 @@ long Synonyms::getSynonyms ( Words *words ,
 		sourceId   = SOURCE_WIKTIONARY_EN;
 		goto tryOtherLang;
 	}
+	*/
+

 	// if it was in wiktionary, just use that synset
 	if ( ss ) {
--- a/Unicode.cpp
+++ b/Unicode.cpp
@@ -1144,8 +1144,11 @@ long stripAccentMarks (char *outbuf, long outbufsize,
 		// convert the utf8 character to UChar32
 		UChar32 uc = utf8Decode ( s );
 		// break "uc" into decomposition of UChar32s
-		UChar32 ttt[8];
-		long klen = recursiveKDExpand(uc,ttt,8);
+		UChar32 ttt[32];
+		long klen = recursiveKDExpand(uc,ttt,32);
+		if(klen>32){char *xx=NULL;*xx=0;}
+		// sanity
+		if ( dst + 5 > outbuf+outbufsize ) return -1;
 		// if the same, leave it! it had no accent marks or other
 		// modifiers...
 		if ( klen <= 1 ) {
--- a/Wiktionary.cpp
+++ b/Wiktionary.cpp
@@ -1900,10 +1900,11 @@ bool Wiktionary::compile ( ) {
 		//long long lastWid = 0LL;
 		// remove dups
 		HashTableX dd2;
-		char dbuf2[256];
-		dd2.set(8,0,8,dbuf2,256,false,0,"ddttt2");
+		char dbuf2[512];
+		dd2.set(8,0,8,dbuf2,512,false,0,"ddttt2");
 		// how many forms? must be 2+ to get added to syntable
 		long formCount = 0;
+		long stripCount = 0;
 		for ( long j = i ; ; j++ ) {
 			// wrap around
 			if ( j >= m_tmp.m_numSlots ) j = 0;
@@ -1942,19 +1943,24 @@ bool Wiktionary::compile ( ) {
 							 1023,
 							 (unsigned char *)word,
 							 gbstrlen(word));
-			if ( stripLen > 0 ) 
-				formCount++;
+			if ( stripLen <= 0 ) continue;
+			// if same as original word, skip
+			long wlen = gbstrlen(word);
+			if ( wlen==stripLen && strncmp(a,word,wlen)==0) 
+				continue;
+			// count as additional form
+			stripCount++;
 		}
 		// need 2+ forms!
-		if ( formCount <= 1 ) continue;
+		if ( formCount +stripCount <= 1 ) continue;
 		// base form
 		//long long wid = *(long long *)m_tmp.getDataFromSlot(i);
 		// remember buf start
 		long bufLen = m_synBuf.length();
 		// remove dups
 		HashTableX dd;
-		char dbuf[256];
-		dd.set(8,0,8,dbuf,256,false,0,"ddttt");
+		char dbuf[512];
+		dd.set(8,0,8,dbuf,512,false,0,"ddttt");
 		// a byte for storing the # of synonym forms
 		//m_synBuf.pushChar(0);
 		// push the langid!
@@ -2037,13 +2043,30 @@ bool Wiktionary::compile ( ) {
 							 1023,
 							 (unsigned char *)word,
 							 gbstrlen(word));
+			// debug time
+			if ( stripLen > 0 ) a[stripLen] = 0;
+			//if ( stripLen > 0 ) 
+			//	log("wikt: %li) %s->%s",i,word,a);
+			//if ( i==5133265 )
+			//	log("hey");
+			// if same as original word, ignore it
 			if ( stripLen > 0 ) {
-				long long swid = hash64Lower_utf8(word);
+				long wlen = gbstrlen(word);
+				if ( wlen==stripLen && 
+				     strncmp(a,word,wlen) == 0 ) 
+					stripLen = 0;
+			}
+			// if different, add it
+			if ( stripLen > 0 ) {
+				long long swid = hash64Lower_utf8(a);
 				// xor in the langid
 				swid ^= g_hashtab[0][langId];
 				// only add this word form once per langId
 				if ( dd.isInTable ( &swid ) ) continue;
 				dd.addKey ( &swid );
+				// . a ptr to that sequence of alt forms in buf
+				// . this uses 6 byte keys
+				m_synTable.addKey(&swid,&bufLen);
 			}


--- a/wiktionary-buf.txt
+++ b/wiktionary-buf.txt
--- a/wiktionary-lang.txt
+++ b/wiktionary-lang.txt
--- a/wiktionary-syns.dat
+++ b/wiktionary-syns.dat