privacore-open-source-searc…/utf8_convert.cpp

#include "utf8_convert.h"
#include "HashTableX.h"
#include "Errno.h"
#include <iconv.h>


// Cache of open iconv conversion descriptors. Each value is an iconv_t and
// each key is a 32-bit hash combined from the "to" and "from" charset names
// (see gbiconv_open). Sized in utf8_convert_initialize().
static HashTableX s_convTable;


/**
 * Get an iconv conversion descriptor for converting fromcode -> tocode.
 *
 * Descriptors are cached in s_convTable under a hash of both charset names.
 * A cached descriptor is reset to its initial shift state before it is
 * handed out; otherwise a new one is opened with iconv_open() and stored.
 *
 * g_errno is cleared on entry. On failure it is set to the errno reported by
 * iconv_open(), except that EINVAL is mapped to EBADCHARSET.
 *
 * @param tocode    name of the target charset
 * @param fromcode  name of the source charset
 * @return the conversion descriptor, or (iconv_t)-1 if iconv_open() failed
 */
static iconv_t gbiconv_open( const char *tocode, const char *fromcode) {
	// build the cache key from both charset names
	uint32_t hash1 = hash32Lower_a(tocode, strlen(tocode), 0);
	uint32_t hash2 = hash32Lower_a(fromcode, strlen(fromcode),0);
	uint32_t hash = hash32h(hash1, hash2);

	g_errno = 0;

	iconv_t conv = NULL;
	iconv_t *convp = (iconv_t *)s_convTable.getValue(&hash);
	if ( convp ) conv = *convp;

	if (conv) {
		// cache hit: put the descriptor back into its initial state
		// so leftovers from a previous conversion cannot leak in
		char *dummy = NULL;
		size_t dummy2 = 0;
		iconv(conv,NULL,NULL,&dummy,&dummy2);
		return conv;
	}

	// cache miss: open a new descriptor
	conv = iconv_open(tocode, fromcode);
	if (conv == (iconv_t) -1) {
		// snapshot errno before calling anything else; log() and
		// strerror() are free to overwrite it
		const int err = errno;
		log(LOG_WARN, "uni: failed to open converter for "
		    "%s to %s: %s (%d)", fromcode, tocode,
		    strerror(err), err);
		g_errno = (err == EINVAL) ? EBADCHARSET : err;
		return conv;
	}

	// remember the descriptor for subsequent calls
	s_convTable.addKey(&hash, &conv);

	return conv;
}

/**
 * Release a descriptor obtained from gbiconv_open().
 *
 * Intentionally a no-op: the descriptor is left open and is only closed by
 * gbiconv_reset().
 *
 * @todo ALC gbiconv_close currently does nothing
 * @param cd  conversion descriptor (ignored)
 * @return always 0
 */
static int gbiconv_close(iconv_t cd) {
	(void)cd;
	const int status = 0;
	return status;
}

/**
 * Close every cached conversion descriptor and empty the cache table.
 */
static void gbiconv_reset() {
	int32_t slot = 0;
	while (slot < s_convTable.getNumSlots()) {
		// only slots flagged as in use hold a descriptor
		if ( s_convTable.m_flags[slot] ) {
			iconv_t *entry = (iconv_t *)s_convTable.getValueFromSlot(slot);
			if (entry) {
				iconv_t descriptor = *entry;
				iconv_close(descriptor);
			}
		}
		slot++;
	}
	s_convTable.reset();
}


/**
 * Convert a buffer from one charset to another using iconv.
 *
 * Two modes:
 *  - outbuf is NULL or outbufsize <= 0: nothing is stored; the input is
 *    converted into a scratch buffer and an upper estimate of the required
 *    output size (including the terminating NUL) is returned.
 *  - otherwise: the converted text is written to outbuf and NUL-terminated.
 *    Bytes that iconv rejects (EILSEQ / EINVAL) are replaced by '?' and
 *    skipped one input byte at a time.
 *
 * @param outbuf          destination buffer, or NULL to measure only
 * @param outbufsize      capacity of outbuf in bytes
 * @param charset_out     target charset name passed to iconv
 * @param inbuf           source bytes
 * @param inbuflen        number of source bytes
 * @param charset_in      source charset name; "x-windows-949" is aliased to
 *                        CP949 and ISO-8859-1 to WINDOWS-1252
 * @param ignoreBadChars  maximum number of illegal sequences (EILSEQ) that
 *                        are tolerated; a negative value means no limit
 * @return number of bytes stored in outbuf (excluding the NUL), or the
 *         estimated size in measuring mode. 0 is returned when the input is
 *         empty, the converter cannot be opened, the output buffer is too
 *         small (E2BIG), too many bad characters were seen, or iconv failed
 *         with another error; g_errno holds the reason in the latter cases.
 */
int32_t ucToAny(char *outbuf, int32_t outbufsize, const char *charset_out,
		const char *inbuf, int32_t inbuflen, const char *charset_in,
		 int32_t ignoreBadChars ){
	// a negative length would turn into a huge size_t below
	if (inbuflen <= 0) return 0;

	// alias for iconv
	const char *csAlias = charset_in;
	if (!strncmp(charset_in, "x-windows-949", 13))
		csAlias = "CP949";

	// Treat latin1 as the windows-1252 extended charset. The character
	// right after the prefix must not be a digit, otherwise ISO-8859-10
	// through ISO-8859-16 would be decoded as windows-1252 as well.
	if (!strncmp(charset_in, "ISO-8859-1", 10) &&
	    (charset_in[10] < '0' || charset_in[10] > '9'))
		csAlias = "WINDOWS-1252";

	iconv_t cd = gbiconv_open(charset_out, csAlias);
	if (cd == (iconv_t)-1) {
		log("uni: Error opening input conversion"
		    " descriptor for %s: %s (%d)\n",
		    charset_in,
		    strerror(errno),errno);
		return 0;
	}

	int32_t numBadChars = 0;
	char *pin = const_cast<char*>(inbuf); //const cast due to iconv() speciality
	size_t inRemaining = inbuflen;

	if (outbuf == NULL || outbufsize <= 0) {
		// just find the size needed for conversion, converting into
		// a small scratch buffer over and over
		static const int32_t kTmpSize = 32;
		char buf[kTmpSize];
		int32_t needed = 0;
		while (inRemaining) {
			char *scratch = buf;
			size_t scratchLeft = kTmpSize;
			size_t rc = iconv(cd, &pin, &inRemaining,
					  &scratch, &scratchLeft);
			if (rc == (size_t)-1 && errno){
				// scratch buffer full: count a whole block
				// and convert the next one
				if (errno == E2BIG) {
					needed += kTmpSize;
					continue;
				}
				gbiconv_close(cd);
				return 0; // other error
			}
			needed += kTmpSize - (int32_t)scratchLeft;
			needed += 1; // NULL terminated
			gbiconv_close(cd);
			return needed;
		}
		// never fall through to the writing path without a buffer
		gbiconv_close(cd);
		return needed + 1;
	}

	char *pout = outbuf;
	size_t outRemaining = outbufsize;
	bool lastCallFailed = false;

	while (inRemaining && outRemaining) {
		size_t rc = iconv(cd,&pin, &inRemaining,
				  &pout, &outRemaining);
		lastCallFailed = (rc == (size_t)-1);
		if (!lastCallFailed) continue;

		// snapshot errno before anything else can overwrite it
		const int err = errno;
		if (!err) continue;

		g_errno = err;
		switch(err) {
		case EILSEQ:
		case EINVAL:
			numBadChars++;

			// only illegal sequences count against the limit;
			// an incomplete trailing sequence (EINVAL) does not
			if (err == EILSEQ && ignoreBadChars >= 0 &&
			    numBadChars > ignoreBadChars) goto done;

			// iconv may have used up the whole output buffer
			// before it hit the bad byte; without this check
			// the '?' lands past the end and outRemaining wraps
			if (outRemaining == 0) {
				g_errno = E2BIG;
				goto done;
			}

			// substitute the bad byte and move on
			utf8Encode('?', pout);
			pout++;outRemaining --;
			pin++; inRemaining--;
			g_errno = 0;
			continue;

		case E2BIG:
			// output buffer exhausted
			goto done;

		default:
			log("uni: unknown error occurred "
			    "converting to UTF-8: %s (%d)",
			    strerror(err), err);
			goto done;
		}
	}
done:
	gbiconv_close(cd);
	int32_t len = (int32_t)(outbufsize - outRemaining);
	// NOTE(review): historical clamp, keeps the result at most
	// outbufsize-2 bytes; left unchanged because callers may rely on it
	if (len >= outbufsize-1) len = outbufsize-2;
	// with a 1-byte buffer the clamp above would yield -1
	if (len < 0) len = 0;
	outbuf[len] = '\0';
	if (numBadChars) {
		log(LOG_DEBUG, "uni: ucToAny: got %" PRId32" bad chars in conversion 2.",
		    numBadChars);
	}
	if (lastCallFailed && g_errno) return 0;
	return len ;
}


/**
 * Convert a buffer in the given charset to UTF-8.
 *
 * Convenience wrapper around ucToAny() with the target charset fixed to
 * "UTF-8"; all other arguments and the return value are passed through.
 */
int32_t ucToUtf8(char *outbuf, int32_t outbuflen,
		const char *inbuf, int32_t inbuflen,
		const char *charset, int32_t ignoreBadChars) {
	const char *targetCharset = "UTF-8";
	int32_t written = ucToAny(outbuf, outbuflen, targetCharset,
				  inbuf, inbuflen, charset,
				  ignoreBadChars);
	return written;
}


/**
 * Set up the converter cache: 4-byte keys, iconv_t values, 1024 initial
 * slots.
 *
 * @return true if the table was set up, false otherwise
 */
bool utf8_convert_initialize() {
	const bool tableReady =
		s_convTable.set(4,sizeof(iconv_t),1024,NULL,0,false,"cnvtbl");
	return tableReady;
}


/**
 * Tear down the module by closing all cached converters and clearing the
 * cache via gbiconv_reset().
 */
void utf8_convert_finalize()
{
	gbiconv_reset();
	return;
}