Moved ucToUtf8() to separate module

2025-05-17 18:29:33 -04:00 · 2018-02-05 18:16:05 +01:00 · 2018-02-05 18:16:05 +01:00 · 3be99ae9cd
commit 3be99ae9cd
parent 2040903102
8 changed files with 270 additions and 205 deletions
--- a/2
+++ b/2
@ -51,7 +51,7 @@ OBJS_O2 = \
 	Rdb.o RdbBase.o \
 	Sections.o Spider.o SpiderCache.o SpiderColl.o SpiderLoop.o StopWords.o Summary.o \
 	Title.o \
-	UCPropTable.o UdpServer.o Unicode.o UnicodeProperties.o utf8.o utf8_fast.o \
+	UCPropTable.o UdpServer.o Unicode.o UnicodeProperties.o utf8.o utf8_fast.o utf8_convert.o \
 	Words.o \
 	Xml.o XmlDoc.o XmlDoc_Indexing.o XmlNode.o \

--- a/Process.cpp
+++ b/Process.cpp
@ -8,6 +8,7 @@
 #include "Tagdb.h"
 #include "Posdb.h"
 #include "Titledb.h"
+#include "utf8_convert.h"
 #include "Sections.h"
 #include "Spider.h"
 #include "SpiderColl.h"
@ -994,6 +995,7 @@ void Process::resetAll ( ) {
 	g_spiderCache     .reset();
 	g_jobScheduler    .finalize();
 	ucResetMaps();
+	utf8_convert_finalize();
 	g_profiler        .reset();

 	// reset disk page caches
--- a/Unicode.cpp
+++ b/Unicode.cpp
@ -1,79 +1,8 @@
 #include "Unicode.h"
-
-#include "HashTableX.h"
 #include "Sanity.h"
-
-
-static HashTableX s_convTable;
-
-static iconv_t gbiconv_open( const char *tocode, const char *fromcode) {
-	// get hash for to/from
-	uint32_t hash1 = hash32Lower_a(tocode, strlen(tocode), 0);
-	uint32_t hash2 = hash32Lower_a(fromcode, strlen(fromcode),0);
-	uint32_t hash = hash32h(hash1, hash2);
-
-	g_errno = 0;
-	iconv_t *convp = (iconv_t *)s_convTable.getValue(&hash);
-	iconv_t conv = NULL;
-	if ( convp ) conv = *convp;
-	//log(LOG_DEBUG, "uni: convertor %s -> %s from hash 0x%" PRIx32": 0x%" PRIx32,
-	//    fromcode, tocode,
-	//    hash, conv);
-	if (!conv){
-		//log(LOG_DEBUG, "uni: Allocating new convertor for "
-		//    "%s to %s (hash: 0x%" PRIx32")",
-		//    fromcode, tocode,hash);
-		conv = iconv_open(tocode, fromcode);
-		if (conv == (iconv_t) -1) {
-			log(LOG_WARN, "uni: failed to open converter for "
-			    "%s to %s: %s (%d)", fromcode, tocode, 
-			    strerror(errno), errno);
-			g_errno = errno;
-			if (errno == EINVAL)
-				g_errno = EBADCHARSET;
-			
-			return conv;
-		}
-		// cache convertor
-		s_convTable.addKey(&hash, &conv);
-		//log(LOG_DEBUG, "uni: Saved convertor 0x%" PRId32" under hash 0x%" PRIx32,
-		//    conv, hash);
-	}
-	else{
-		// reset convertor
-		char *dummy = NULL;
-		size_t dummy2 = 0;
-		// JAB: warning abatement
-		//size_t res = iconv(conv,NULL,NULL,&dummy,&dummy2);
-		iconv(conv,NULL,NULL,&dummy,&dummy2);
-	}
-
-	return conv;
-}
-
-static int gbiconv_close(iconv_t cd) {
-	/// @todo ALC gbiconv_close currently does nothing
-	//int val = iconv_close(cd);
-	//return val;	
-	return 0;
-}
-
-static void gbiconv_reset() {
-	for (int32_t i=0;i<s_convTable.getNumSlots();i++){
-		//int32_t key = *(int32_t *)s_convTable.getKey(i);
-		//if (!key) continue;
-		if ( ! s_convTable.m_flags[i] ) continue;
-		iconv_t *pconv = (iconv_t *)s_convTable.getValueFromSlot(i);
-		if (! pconv) continue;
-		iconv_t iconv = *pconv;
-		//logf(LOG_DEBUG, "iconv: freeing iconv: 0x%x", (int)iconv);
-		iconv_close(iconv);
-	}
-	s_convTable.reset();
-}
-
-
-
+#include "Log.h"
+#include "utf8.h"
+#include <string.h>


 #define VERIFY_UNICODE_CHECKSUMS 1
@ -136,10 +65,6 @@ bool ucInit(const char *path) {
 		goto failed;
 	}

-	//s_convTable.set(1024);
-	if ( ! s_convTable.set(4,sizeof(iconv_t),1024,NULL,0,false,"cnvtbl"))
-		goto failed;	
-
 	return true;
 	
 failed:
@ -175,126 +100,6 @@ const char *ucDetectBOM(const char *buf, int32_t bufsize){
 	return NULL;
 }

-static int32_t ucToAny(char *outbuf, int32_t outbufsize, const char *charset_out,
-		const char *inbuf, int32_t inbuflen, const char *charset_in,
-		 int32_t ignoreBadChars ){
-	if (inbuflen == 0) return 0;
-	// alias for iconv
-	const char *csAlias = charset_in;
-	if (!strncmp(charset_in, "x-windows-949", 13))
-		csAlias = "CP949";
-
-	// Treat all latin1 as windows-1252 extended charset
-	if (!strncmp(charset_in, "ISO-8859-1", 10) )
-		csAlias = "WINDOWS-1252";
-	
-	iconv_t cd = gbiconv_open(charset_out, csAlias);
-	int32_t numBadChars = 0;
-	if (cd == (iconv_t)-1) {	
-		log("uni: Error opening input conversion"
-		    " descriptor for %s: %s (%d)\n", 
-		    charset_in,
-		    strerror(errno),errno);
-		return 0;		
-	}
-
-	//if (normalized) *normalized = false;
-	char *pin = const_cast<char*>(inbuf); //const cast due to iconv() speciality
-	size_t inRemaining = inbuflen;
-	char *pout = outbuf;
-	size_t outRemaining = outbufsize;
-	int res = 0;
-	if (outbuf == NULL || outbufsize == 0) {
-		// just find the size needed for conversion
-#define TMP_SIZE 32
-		char buf[TMP_SIZE];
-		int32_t len = 0;
-		while (inRemaining) {
-			pout = buf;
-			outRemaining = TMP_SIZE;
-			res = iconv(cd, &pin, &inRemaining, 
-				    &pout, &outRemaining);
-			if (res < 0 && errno){
-				// convert the next TMP_SIZE block
-				if (errno == E2BIG) { 
-					len += TMP_SIZE; 
-					continue;
-				}
-				gbiconv_close(cd);
-				return 0; // other error
-			}
-			len += TMP_SIZE-outRemaining;
-			len += 1; // NULL terminated
-			gbiconv_close(cd);
-			return len;			
-		}
-	}
-
-	while (inRemaining && outRemaining) {
-		//printf("Before - in: %d, out: %d\n", 
-		//inRemaining, outRemaining);
-		res = iconv(cd,&pin, &inRemaining,
-				&pout, &outRemaining);
-
-		if (res < 0 && errno){
-			//printf("errno: %s (%d)\n", strerror(errno), errno);
-			g_errno = errno;
-			switch(errno) {
-			case EILSEQ:
-				numBadChars++;
-
- 				if (ignoreBadChars >= 0 &&
-				    numBadChars > ignoreBadChars) goto done;
-				utf8Encode('?', pout);
-				pout++;outRemaining --;
- 				pin++; inRemaining--;
-				g_errno = 0;
- 				continue;
-			case EINVAL:
-				numBadChars++;
-
-				utf8Encode('?', pout); 
-				pout++;outRemaining --;
-				pin++; inRemaining--;
-				g_errno=0;
-				continue;
-				// go ahead and flag an error now
-				// if there is a bad character, we've 
-				// probably misguessed the charset
-
-			case E2BIG:
-				//log("uni: error converting to UTF-8: %s",
-				//    strerror(errno));
-				goto done;
-			default:
-				log("uni: unknown error occurred "
-				    "converting to UTF-8: %s (%d)",
-				    strerror(errno), errno);
-				goto done;
-			}
-		}
-	}
-done:
-	gbiconv_close(cd);
-	int32_t len =  (outbufsize - outRemaining) ;
-	len = len>=outbufsize-1?outbufsize-2:len;
-	//len >>= 1;
-	//len = outbuf[len]=='\0'?len-1:len;
-	outbuf[len] = '\0';
-	if (numBadChars) {
-		log(LOG_DEBUG, "uni: ucToAny: got %" PRId32" bad chars in conversion 2.",
-		    numBadChars);
-	}
-	if (res < 0 && g_errno) return 0; 
-	return len ;
-}
-
-
-int32_t ucToUtf8(char *outbuf, int32_t outbuflen,
-		const char *inbuf, int32_t inbuflen,
-		const char *charset, int32_t ignoreBadChars) {
-  return ucToAny(outbuf, outbuflen, "UTF-8", inbuf, inbuflen, charset, ignoreBadChars);
-}

 int32_t stripAccentMarks (char *outbuf, int32_t outbufsize,
 			  const unsigned char *p, int32_t inbuflen) {
@ -316,7 +121,7 @@ int32_t stripAccentMarks (char *outbuf, int32_t outbufsize,
 		// if the same, leave it! it had no accent marks or other
 		// modifiers...
 		if ( klen <= 1 ) {
-			gbmemcpy ( dst , s , cs );
+			memcpy ( dst , s , cs );
 			dst += cs;
 			continue;
 		}
@ -334,5 +139,4 @@ int32_t stripAccentMarks (char *outbuf, int32_t outbufsize,


 void resetUnicode ( ) {
-	gbiconv_reset();
 }
--- a/Unicode.h
+++ b/Unicode.h
@ -18,10 +18,6 @@ void ucResetMaps();



-int32_t ucToUtf8(char *outbuf, int32_t outbuflen,
-		const char *inbuf, int32_t inbuflen,
-		const char *charset, int32_t ignoreBadChars);
-
 // Try to detect the Byte Order Mark of a Unicode Document
 const char *	ucDetectBOM(const char *buf, int32_t bufsize);

--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -42,6 +42,7 @@
 #include "ScopedLock.h"
 #include "Mem.h"
 #include "UrlBlockCheck.h"
+#include "utf8_convert.h"
 #include <fcntl.h>
 #include <algorithm>
 #include "GbEncoding.h"
--- a/main.cpp
+++ b/main.cpp
@ -69,6 +69,7 @@
 #include "Pages.h"
 #include "PageInject.h"
 #include "Unicode.h"
+#include "utf8_convert.h"

 #include "Profiler.h"
 #include "Proxy.h"
@ -527,6 +528,10 @@ int main2 ( int argc , char *argv[] ) {
 			log( LOG_ERROR, "db: Unicode initialization failed!" );
 			return 1;
 		}
+		if(!utf8_convert_initialize()) {
+			log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
+			return 1;
+		}

 		// load speller unifiedDict for spider compression proxy
 		//if ( g_hostdb.m_myHost->m_type & HT_SCPROXY )
@ -1227,6 +1232,10 @@ int main2 ( int argc , char *argv[] ) {
 			log("Unicode initialization failed!");
 			return 1;
 		}
+		if(!utf8_convert_initialize()) {
+			log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
+			return 1;
+		}

 		if ( ! g_collectiondb.loadAllCollRecs()   ) {
 			log("db: Collectiondb init failed." ); return 1; }
@ -1339,6 +1348,10 @@ int main2 ( int argc , char *argv[] ) {
 		log( LOG_ERROR, "Unicode initialization failed!" );
 		return 1;
 	}
+	if(!utf8_convert_initialize()) {
+		log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
+		return 1;
+	}

 	// the wiktionary for lang identification and alternate word forms/
 	// synonyms
@ -2446,6 +2459,10 @@ void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool
 		log("Unicode initialization failed!");
 		return;
 	}
+	if(!utf8_convert_initialize()) {
+		log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
+		return;
+	}
 	// init our table for doing zobrist hashing
 	if ( ! hashinit() ) {
 		log("db: Failed to init hashtable." ); return ; }
@ -3883,6 +3900,10 @@ static void dumpUnwantedTitledbRecs(const char *coll, int32_t startFileNum, int3
 		log("Unicode initialization failed!");
 		return;
 	}
+	if(!utf8_convert_initialize()) {
+		log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
+		return;
+	}
 	// init our table for doing zobrist hashing
 	if ( ! hashinit() ) {
 		log("db: Failed to init hashtable." );
@ -4052,6 +4073,10 @@ static void dumpWantedTitledbRecs(const char *coll, int32_t startFileNum, int32_
 		log("Unicode initialization failed!");
 		return;
 	}
+	if(!utf8_convert_initialize()) {
+		log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
+		return;
+	}
 	// init our table for doing zobrist hashing
 	if ( ! hashinit() ) {
 		log("db: Failed to init hashtable." );
@ -4189,6 +4214,10 @@ static void dumpAdultTitledbRecs(const char *coll, int32_t startFileNum, int32_t
 		log("Unicode initialization failed!");
 		return;
 	}
+	if(!utf8_convert_initialize()) {
+		log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
+		return;
+	}
 	// init our table for doing zobrist hashing
 	if ( ! hashinit() ) {
 		log("db: Failed to init hashtable." );
@ -4364,6 +4393,10 @@ static void dumpSpamTitledbRecs(const char *coll, int32_t startFileNum, int32_t
 		log("Unicode initialization failed!");
 		return;
 	}
+	if(!utf8_convert_initialize()) {
+		log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
+		return;
+	}
 	// init our table for doing zobrist hashing
 	if ( ! hashinit() ) {
 		log("db: Failed to init hashtable." );
@ -4765,6 +4798,10 @@ static bool parseTest(const char *coll, int64_t docId, const char *query) {
 		log(LOG_WARN, "Unicode initialization failed!");
 		return false;
 	}
+	if(!utf8_convert_initialize()) {
+		log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
+		return false;
+	}

 	// get raw rec from list
 	char *rec      = tlist.getCurrentRec();
@ -4852,6 +4889,10 @@ static bool parseTest(const char *coll, int64_t docId, const char *query) {
 		log("Unicode initialization failed!");
 		return 1;
 	}
+	if(!utf8_convert_initialize()) {
+		log( LOG_ERROR, "db: utf-8 conversion initialization failed!" );
+		return 1;
+	}
 	Words words;

 	t = gettimeofdayInMilliseconds();
--- a/utf8_convert.cpp
+++ b/utf8_convert.cpp
@ -0,0 +1,206 @@
+#include "utf8_convert.h"
+#include "HashTableX.h"
+
+
+static HashTableX s_convTable;
+
+
+static iconv_t gbiconv_open( const char *tocode, const char *fromcode) {
+	// get hash for to/from
+	uint32_t hash1 = hash32Lower_a(tocode, strlen(tocode), 0);
+	uint32_t hash2 = hash32Lower_a(fromcode, strlen(fromcode),0);
+	uint32_t hash = hash32h(hash1, hash2);
+
+	g_errno = 0;
+	iconv_t *convp = (iconv_t *)s_convTable.getValue(&hash);
+	iconv_t conv = NULL;
+	if ( convp ) conv = *convp;
+	//log(LOG_DEBUG, "uni: convertor %s -> %s from hash 0x%" PRIx32": 0x%" PRIx32,
+	//    fromcode, tocode,
+	//    hash, conv);
+	if (!conv){
+		//log(LOG_DEBUG, "uni: Allocating new convertor for "
+		//    "%s to %s (hash: 0x%" PRIx32")",
+		//    fromcode, tocode,hash);
+		conv = iconv_open(tocode, fromcode);
+		if (conv == (iconv_t) -1) {
+			log(LOG_WARN, "uni: failed to open converter for "
+			    "%s to %s: %s (%d)", fromcode, tocode, 
+			    strerror(errno), errno);
+			g_errno = errno;
+			if (errno == EINVAL)
+				g_errno = EBADCHARSET;
+			
+			return conv;
+		}
+		// cache convertor
+		s_convTable.addKey(&hash, &conv);
+		//log(LOG_DEBUG, "uni: Saved convertor 0x%" PRId32" under hash 0x%" PRIx32,
+		//    conv, hash);
+	}
+	else{
+		// reset convertor
+		char *dummy = NULL;
+		size_t dummy2 = 0;
+		// JAB: warning abatement
+		//size_t res = iconv(conv,NULL,NULL,&dummy,&dummy2);
+		iconv(conv,NULL,NULL,&dummy,&dummy2);
+	}
+
+	return conv;
+}
+
+static int gbiconv_close(iconv_t cd) {
+	/// @todo ALC gbiconv_close currently does nothing
+	//int val = iconv_close(cd);
+	//return val;	
+	return 0;
+}
+
+static void gbiconv_reset() {
+	for (int32_t i=0;i<s_convTable.getNumSlots();i++){
+		//int32_t key = *(int32_t *)s_convTable.getKey(i);
+		//if (!key) continue;
+		if ( ! s_convTable.m_flags[i] ) continue;
+		iconv_t *pconv = (iconv_t *)s_convTable.getValueFromSlot(i);
+		if (! pconv) continue;
+		iconv_t iconv = *pconv;
+		//logf(LOG_DEBUG, "iconv: freeing iconv: 0x%x", (int)iconv);
+		iconv_close(iconv);
+	}
+	s_convTable.reset();
+}
+
+
+static int32_t ucToAny(char *outbuf, int32_t outbufsize, const char *charset_out,
+		const char *inbuf, int32_t inbuflen, const char *charset_in,
+		 int32_t ignoreBadChars ){
+	if (inbuflen == 0) return 0;
+	// alias for iconv
+	const char *csAlias = charset_in;
+	if (!strncmp(charset_in, "x-windows-949", 13))
+		csAlias = "CP949";
+
+	// Treat all latin1 as windows-1252 extended charset
+	if (!strncmp(charset_in, "ISO-8859-1", 10) )
+		csAlias = "WINDOWS-1252";
+	
+	iconv_t cd = gbiconv_open(charset_out, csAlias);
+	int32_t numBadChars = 0;
+	if (cd == (iconv_t)-1) {	
+		log("uni: Error opening input conversion"
+		    " descriptor for %s: %s (%d)\n", 
+		    charset_in,
+		    strerror(errno),errno);
+		return 0;		
+	}
+
+	//if (normalized) *normalized = false;
+	char *pin = const_cast<char*>(inbuf); //const cast due to iconv() speciality
+	size_t inRemaining = inbuflen;
+	char *pout = outbuf;
+	size_t outRemaining = outbufsize;
+	int res = 0;
+	if (outbuf == NULL || outbufsize == 0) {
+		// just find the size needed for conversion
+#define TMP_SIZE 32
+		char buf[TMP_SIZE];
+		int32_t len = 0;
+		while (inRemaining) {
+			pout = buf;
+			outRemaining = TMP_SIZE;
+			res = iconv(cd, &pin, &inRemaining, 
+				    &pout, &outRemaining);
+			if (res < 0 && errno){
+				// convert the next TMP_SIZE block
+				if (errno == E2BIG) { 
+					len += TMP_SIZE; 
+					continue;
+				}
+				gbiconv_close(cd);
+				return 0; // other error
+			}
+			len += TMP_SIZE-outRemaining;
+			len += 1; // NULL terminated
+			gbiconv_close(cd);
+			return len;			
+		}
+	}
+
+	while (inRemaining && outRemaining) {
+		//printf("Before - in: %d, out: %d\n", 
+		//inRemaining, outRemaining);
+		res = iconv(cd,&pin, &inRemaining,
+				&pout, &outRemaining);
+
+		if (res < 0 && errno){
+			//printf("errno: %s (%d)\n", strerror(errno), errno);
+			g_errno = errno;
+			switch(errno) {
+			case EILSEQ:
+				numBadChars++;
+
+ 				if (ignoreBadChars >= 0 &&
+				    numBadChars > ignoreBadChars) goto done;
+				utf8Encode('?', pout);
+				pout++;outRemaining --;
+ 				pin++; inRemaining--;
+				g_errno = 0;
+ 				continue;
+			case EINVAL:
+				numBadChars++;
+
+				utf8Encode('?', pout); 
+				pout++;outRemaining --;
+				pin++; inRemaining--;
+				g_errno=0;
+				continue;
+				// go ahead and flag an error now
+				// if there is a bad character, we've 
+				// probably misguessed the charset
+
+			case E2BIG:
+				//log("uni: error converting to UTF-8: %s",
+				//    strerror(errno));
+				goto done;
+			default:
+				log("uni: unknown error occurred "
+				    "converting to UTF-8: %s (%d)",
+				    strerror(errno), errno);
+				goto done;
+			}
+		}
+	}
+done:
+	gbiconv_close(cd);
+	int32_t len =  (outbufsize - outRemaining) ;
+	len = len>=outbufsize-1?outbufsize-2:len;
+	//len >>= 1;
+	//len = outbuf[len]=='\0'?len-1:len;
+	outbuf[len] = '\0';
+	if (numBadChars) {
+		log(LOG_DEBUG, "uni: ucToAny: got %" PRId32" bad chars in conversion 2.",
+		    numBadChars);
+	}
+	if (res < 0 && g_errno) return 0; 
+	return len ;
+}
+
+
+int32_t ucToUtf8(char *outbuf, int32_t outbuflen,
+		const char *inbuf, int32_t inbuflen,
+		const char *charset, int32_t ignoreBadChars) {
+  return ucToAny(outbuf, outbuflen, "UTF-8", inbuf, inbuflen, charset, ignoreBadChars);
+}
+
+
+bool utf8_convert_initialize() {
+	if(! s_convTable.set(4,sizeof(iconv_t),1024,NULL,0,false,"cnvtbl"))
+		return false;
+	return true;
+}
+
+
+void utf8_convert_finalize() {
+	gbiconv_reset();
+}
--- a/utf8_convert.h
+++ b/utf8_convert.h
@ -0,0 +1,15 @@
+#ifndef UTF8_CONVERT_H_
+#define UTF8_CONVERT_H_
+#include <inttypes.h>
+
+//functions for converting various encodings into UTF-8
+//ok, one function.
+
+int32_t ucToUtf8(char *outbuf, int32_t outbuflen,
+		 const char *inbuf, int32_t inbuflen,
+		 const char *charset, int32_t ignoreBadChars);
+
+bool utf8_convert_initialize();
+void utf8_convert_finalize();
+
+#endif