#include "Unicode.h"
#include "HashTableX.h"
#include "Sanity.h"

// Cache of open iconv descriptors. Keys are a 32-bit hash of the
// (tocode, fromcode) pair, values are the iconv_t itself. Entries are added
// by gbiconv_open() and closed/dropped by gbiconv_reset().
static HashTableX s_convTable;

// Return an iconv descriptor that converts from "fromcode" to "tocode".
//
// Descriptors are cached in s_convTable, so a given charset pair is only
// opened once; a cached descriptor is reset to its initial conversion state
// before it is handed out again.
//
// Returns (iconv_t)-1 on failure. In that case g_errno is set (EBADCHARSET
// when iconv_open() reported EINVAL, otherwise the errno value) and errno is
// left holding the value iconv_open() produced.
iconv_t gbiconv_open( const char *tocode, const char *fromcode) {
	// get hash for to/from
	uint32_t hash1 = hash32Lower_a(tocode, strlen(tocode), 0);
	uint32_t hash2 = hash32Lower_a(fromcode, strlen(fromcode), 0);
	uint32_t hash = hash32h(hash1, hash2);

	g_errno = 0;

	iconv_t *convp = (iconv_t *)s_convTable.getValue(&hash);
	iconv_t conv = NULL;
	if ( convp ) conv = *convp;

	if ( conv ) {
		// cached: reset convertor to its initial state
		char *dummy = NULL;
		size_t dummy2 = 0;
		iconv(conv, NULL, NULL, &dummy, &dummy2);
		return conv;
	}

	conv = iconv_open(tocode, fromcode);
	if (conv == (iconv_t) -1) {
		// capture errno before calling anything else; the logging
		// call is free to modify it
		const int openErrno = errno;
		log(LOG_WARN, "uni: failed to open converter for "
		    "%s to %s: %s (%d)", fromcode, tocode,
		    strerror(openErrno), openErrno);
		g_errno = ( openErrno == EINVAL ) ? EBADCHARSET : openErrno;
		// ucToAny() reports errno itself after a failed open, so hand
		// back the original value
		errno = openErrno;
		return conv;
	}

	// cache convertor
	s_convTable.addKey(&hash, &conv);
	return conv;
}

// Release a descriptor obtained from gbiconv_open().
//
// Intentionally a no-op that always returns 0: the descriptor stays in
// s_convTable for reuse and is closed by gbiconv_reset().
/// @todo ALC gbiconv_close currently does nothing
int gbiconv_close(iconv_t cd) {
	(void)cd;
	return 0;
}

// Close every cached iconv descriptor and empty the cache.
void gbiconv_reset(){
	for ( int32_t slot = 0 ; slot < s_convTable.getNumSlots() ; slot++ ) {
		// skip unused slots
		if ( ! s_convTable.m_flags[slot] ) continue;
		iconv_t *pconv = (iconv_t *)s_convTable.getValueFromSlot(slot);
		if ( ! pconv ) continue;
		iconv_close(*pconv);
	}
	s_convTable.reset();
}

#define VERIFY_UNICODE_CHECKSUMS 1

#define CHKSUM_UPPERMAP       1241336150
#define CHKSUM_LOWERMAP       1023166806
#define CHKSUM_PROPERTIES     33375957
#define CHKSUM_COMBININGCLASS 526097805
#define CHKSUM_SCRIPTS        1826246000
#define CHKSUM_KDMAP          1920116453

// Write "<dir><leaf>" into dst (capacity dstSize bytes, including the NUL).
// Returns false, after logging, if the result does not fit.
static bool buildUcDataPath(char *dst, size_t dstSize,
			    const char *dir, const char *leaf) {
	int n = snprintf(dst, dstSize, "%s%s", dir, leaf);
	if ( n < 0 || (size_t)n >= dstSize ) {
		log(LOG_WARN, "uni: path too long for unicode data file "
		    "%s under %s", leaf, dir);
		return false;
	}
	return true;
}

// Load the unicode property tables found under "<path>/ucdata/" and set up
// the iconv descriptor cache.
//
// path: base directory; NULL means "./".
// Returns true on success. On any failure logs a warning and returns false;
// steps after the failing one are not attempted.
bool ucInit(const char *path) {
	if (path == NULL) path = "./";

	// Might want to move this out of ucInit someday
	// but right now it's the only thing that uses .so files (?)
	// i don't think this is used any more because we don't have it!
	char gbLibDir[512];
	snprintf(gbLibDir, sizeof(gbLibDir), "%s/lib", path);
	if (setenv("LD_RUN_PATH", gbLibDir, 1)){
		log(LOG_INIT, "Failed to set LD_RUN_PATH");
	}

	// scratch buffer for each table's file name; every path is built
	// with a bounds check so an over-long "path" fails instead of
	// overrunning the buffer
	char file[384];

	// && short-circuits, so the steps run in order and stop at the first
	// failure
	bool ok =
		buildUcDataPath(file, sizeof(file), path, "/ucdata/uppermap.dat") &&
		loadUnicodeTable(&g_ucUpperMap, file,
				 VERIFY_UNICODE_CHECKSUMS, CHKSUM_UPPERMAP) &&

		buildUcDataPath(file, sizeof(file), path, "/ucdata/lowermap.dat") &&
		loadUnicodeTable(&g_ucLowerMap, file,
				 VERIFY_UNICODE_CHECKSUMS, CHKSUM_LOWERMAP) &&

		buildUcDataPath(file, sizeof(file), path, "/ucdata/properties.dat") &&
		loadUnicodeTable(&g_ucProps, file,
				 VERIFY_UNICODE_CHECKSUMS, CHKSUM_PROPERTIES) &&

		buildUcDataPath(file, sizeof(file), path, "/ucdata/scripts.dat") &&
		loadUnicodeTable(&g_ucScripts, file,
				 VERIFY_UNICODE_CHECKSUMS, CHKSUM_SCRIPTS) &&

		// MDW: do we need this for converting from X to utf8? or for
		// the is_alnum(), etc. functions?
		loadDecompTables(path) &&

		s_convTable.set(4, sizeof(iconv_t), 1024, NULL, 0, false, "cnvtbl");

	if ( ! ok ) {
		log(LOG_WARN, "uni: unable to load all property tables");
		return false;
	}
	return true;
}

// Inspect the first bytes of buf for a unicode byte order mark.
//
// Returns the charset name ("UTF-16BE", "UTF-16LE", "UTF-32LE", "UTF-8" or
// "UTF-32BE"), or NULL when no BOM is recognised or bufsize is below 4.
const char *ucDetectBOM(const char *buf, int32_t bufsize){
	if (bufsize < 4) return NULL;

	// copied from ICU
	if ( buf[0] == '\xFE' && buf[1] == '\xFF' ) {
		return "UTF-16BE";
	}
	if ( buf[0] == '\xFF' && buf[1] == '\xFE' ) {
		// FF FE is shared by UTF-16LE and UTF-32LE; the next two
		// bytes tell them apart
		if ( buf[2] == '\x00' && buf[3] == '\x00' ) {
			return "UTF-32LE";
		}
		return "UTF-16LE";
	}
	if ( buf[0] == '\xEF' && buf[1] == '\xBB' && buf[2] == '\xBF' ) {
		return "UTF-8";
	}
	if ( buf[0] == '\x00' && buf[1] == '\x00' &&
	     buf[2] == '\xFE' && buf[3] == '\xFF' ) {
		return "UTF-32BE";
	}
	return NULL;
}

// Convert inbuf (inbuflen bytes, encoded as charset_in) to charset_out.
//
// Two modes:
//  - outbuf == NULL or outbufsize == 0: nothing is stored; the return value
//    is a size estimate for the converted text including one terminating NUL
//    byte, or 0 on a conversion error.
//  - otherwise: the converted text is written to outbuf and NUL terminated.
//    The return value is the number of bytes kept, which is capped at
//    outbufsize-2 (never below 0). Returns 0 if the conversion stopped on an
//    error, including running out of output space.
//
// ignoreBadChars: number of invalid input sequences tolerated before giving
// up; a negative value means no limit. Each tolerated bad byte is replaced
// by '?'.
//
// Returns 0 when inbuflen is 0 or the converter cannot be opened.
int32_t ucToAny(char *outbuf, int32_t outbufsize, const char *charset_out,
		const char *inbuf, int32_t inbuflen, const char *charset_in,
		int32_t ignoreBadChars ){
	if (inbuflen == 0) return 0;

	// alias for iconv
	const char *csAlias = charset_in;
	if (!strncmp(charset_in, "x-windows-949", 13)) csAlias = "CP949";

	// Treat all latin1 as windows-1252 extended charset. A digit after
	// the prefix means a different charset (ISO-8859-10 .. ISO-8859-16),
	// which must not be decoded as windows-1252.
	if ( !strncmp(charset_in, "ISO-8859-1", 10) &&
	     ( charset_in[10] < '0' || charset_in[10] > '9' ) )
		csAlias = "WINDOWS-1252";

	iconv_t cd = gbiconv_open(charset_out, csAlias);
	int32_t numBadChars = 0;
	if (cd == (iconv_t)-1) {
		log("uni: Error opening input conversion"
		    " descriptor for %s: %s (%d)\n",
		    charset_in, strerror(errno),errno);
		return 0;
	}

	char *pin = const_cast<char*>(inbuf); //const cast due to iconv() speciality
	size_t inRemaining = inbuflen;
	char *pout = outbuf;
	size_t outRemaining = outbufsize;
	int res = 0;

	if (outbuf == NULL || outbufsize == 0) {
		// just find the size needed for conversion
#define TMP_SIZE 32
		char buf[TMP_SIZE];
		int32_t len = 0;
		while (inRemaining) {
			pout = buf;
			outRemaining = TMP_SIZE;
			res = iconv(cd, &pin, &inRemaining,
				    &pout, &outRemaining);
			if (res < 0 && errno){
				// convert the next TMP_SIZE block
				if (errno == E2BIG) {
					len += TMP_SIZE;
					continue;
				}
				gbiconv_close(cd);
				return 0; // other error
			}
			len += TMP_SIZE-outRemaining;
			len += 1; // NULL terminated
			gbiconv_close(cd);
			return len;
		}
		// Not expected to be reached: inbuflen != 0, and every pass
		// above either returns or continues with input left. Never
		// fall through to the store loop with no output buffer.
		gbiconv_close(cd);
		return 0;
	}

	while (inRemaining && outRemaining) {
		res = iconv(cd,&pin, &inRemaining, &pout, &outRemaining);
		if (res < 0 && errno){
			g_errno = errno;
			switch(errno) {
			case EILSEQ:
				numBadChars++;
				// too many bad characters: we've probably
				// misguessed the charset, flag an error now
				if (ignoreBadChars >= 0 &&
				    numBadChars > ignoreBadChars) goto done;
				// no room left for the '?' replacement:
				// same outcome as iconv reporting E2BIG
				if (outRemaining == 0) {
					g_errno = E2BIG;
					goto done;
				}
				utf8Encode('?', pout);
				pout++;outRemaining --;
				pin++; inRemaining--;
				g_errno = 0;
				continue;
			case EINVAL:
				numBadChars++;
				// no room left for the '?' replacement:
				// same outcome as iconv reporting E2BIG
				if (outRemaining == 0) {
					g_errno = E2BIG;
					goto done;
				}
				utf8Encode('?', pout);
				pout++;outRemaining --;
				pin++; inRemaining--;
				g_errno=0;
				continue;
			case E2BIG:
				goto done;
			default:
				log("uni: unknown error occurred "
				    "converting to UTF-8: %s (%d)",
				    strerror(errno), errno);
				goto done;
			}
		}
	}

 done:
	gbiconv_close(cd);
	int32_t len = (outbufsize - outRemaining) ;
	// keep at most outbufsize-2 bytes ...
	if ( len >= outbufsize-1 ) len = outbufsize-2;
	// ... but never index before the buffer (outbufsize == 1)
	if ( len < 0 ) len = 0;
	outbuf[len] = '\0';

	if (numBadChars) {
		log(LOG_DEBUG, "uni: ucToAny: got %" PRId32" bad chars in conversion 2.",
		    numBadChars);
	}
	if (res < 0 && g_errno) return 0;
	return len ;
}

// Copy the utf8 text in p (inbuflen bytes) to outbuf, replacing every
// character whose expansion via recursiveKDExpand() has more than one code
// point by the first code point of that expansion. Characters that expand to
// at most one code point are copied through unchanged.
//
// Returns the number of bytes stored in outbuf (no NUL is appended), or -1
// when fewer than 5 bytes of outbuf remain before a character is processed.
int32_t stripAccentMarks (char *outbuf, int32_t outbufsize,
			  const unsigned char *p, int32_t inbuflen) {
	char *s = (char *)p;
	char *send = (char *)p + inbuflen;
	int32_t cs;
	char *dst = outbuf;
	for ( ; s < send ; s += cs ) {
		// how big is this character?
		cs = getUtf8CharSize(s);
		// convert the utf8 character to UChar32
		UChar32 uc = utf8Decode ( s );
		// break "uc" into decomposition of UChar32s
		UChar32 ttt[32];
		int32_t klen = recursiveKDExpand(uc,ttt,32);
		if(klen>32) gbshutdownLogicError();
		// sanity: require 5 free bytes before storing a character
		if ( dst + 5 > outbuf+outbufsize ) return -1;
		// if the same, leave it! it had no accent marks or other
		// modifiers...
		if ( klen <= 1 ) {
			gbmemcpy ( dst , s , cs );
			dst += cs;
			continue;
		}
		// take the first one as the stripped
		// convert back to utf8
		int32_t stored = utf8Encode ( ttt[0] , dst );
		// skip over the stored utf8 char
		dst += stored;
	}
	// sanity. breach check
	if ( dst > outbuf+outbufsize ) gbshutdownLogicError();
	// return # of bytes stored into outbuf
	return dst - outbuf;
}

// Release the resources held by this module: closes all cached iconv
// descriptors.
void resetUnicode ( ) {
	gbiconv_reset();
}