forked from Mirrors/privacore-open-source-search-engine
327 lines
8.5 KiB
C++
327 lines
8.5 KiB
C++
#include "Unicode.h"
|
|
|
|
#include "HashTableX.h"
|
|
#include "Sanity.h"
|
|
|
|
|
|
static HashTableX s_convTable;
|
|
|
|
iconv_t gbiconv_open( const char *tocode, const char *fromcode) {
|
|
// get hash for to/from
|
|
uint32_t hash1 = hash32Lower_a(tocode, strlen(tocode), 0);
|
|
uint32_t hash2 = hash32Lower_a(fromcode, strlen(fromcode),0);
|
|
uint32_t hash = hash32h(hash1, hash2);
|
|
|
|
g_errno = 0;
|
|
iconv_t *convp = (iconv_t *)s_convTable.getValue(&hash);
|
|
iconv_t conv = NULL;
|
|
if ( convp ) conv = *convp;
|
|
//log(LOG_DEBUG, "uni: convertor %s -> %s from hash 0x%" PRIx32": 0x%" PRIx32,
|
|
// fromcode, tocode,
|
|
// hash, conv);
|
|
if (!conv){
|
|
//log(LOG_DEBUG, "uni: Allocating new convertor for "
|
|
// "%s to %s (hash: 0x%" PRIx32")",
|
|
// fromcode, tocode,hash);
|
|
conv = iconv_open(tocode, fromcode);
|
|
if (conv == (iconv_t) -1) {
|
|
log(LOG_WARN, "uni: failed to open converter for "
|
|
"%s to %s: %s (%d)", fromcode, tocode,
|
|
strerror(errno), errno);
|
|
g_errno = errno;
|
|
if (errno == EINVAL)
|
|
g_errno = EBADCHARSET;
|
|
|
|
return conv;
|
|
}
|
|
// cache convertor
|
|
s_convTable.addKey(&hash, &conv);
|
|
//log(LOG_DEBUG, "uni: Saved convertor 0x%" PRId32" under hash 0x%" PRIx32,
|
|
// conv, hash);
|
|
}
|
|
else{
|
|
// reset convertor
|
|
char *dummy = NULL;
|
|
size_t dummy2 = 0;
|
|
// JAB: warning abatement
|
|
//size_t res = iconv(conv,NULL,NULL,&dummy,&dummy2);
|
|
iconv(conv,NULL,NULL,&dummy,&dummy2);
|
|
}
|
|
|
|
return conv;
|
|
}
|
|
|
|
int gbiconv_close(iconv_t cd) {
|
|
/// @todo ALC gbiconv_close currently does nothing
|
|
//int val = iconv_close(cd);
|
|
//return val;
|
|
return 0;
|
|
}
|
|
|
|
void gbiconv_reset(){
|
|
for (int32_t i=0;i<s_convTable.getNumSlots();i++){
|
|
//int32_t key = *(int32_t *)s_convTable.getKey(i);
|
|
//if (!key) continue;
|
|
if ( ! s_convTable.m_flags[i] ) continue;
|
|
iconv_t *pconv = (iconv_t *)s_convTable.getValueFromSlot(i);
|
|
if (! pconv) continue;
|
|
iconv_t iconv = *pconv;
|
|
//logf(LOG_DEBUG, "iconv: freeing iconv: 0x%x", (int)iconv);
|
|
iconv_close(iconv);
|
|
}
|
|
s_convTable.reset();
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#define VERIFY_UNICODE_CHECKSUMS 1
|
|
|
|
#define CHKSUM_UPPERMAP 1241336150
|
|
#define CHKSUM_LOWERMAP 1023166806
|
|
#define CHKSUM_PROPERTIES 33375957
|
|
#define CHKSUM_COMBININGCLASS 526097805
|
|
#define CHKSUM_SCRIPTS 1826246000
|
|
#define CHKSUM_KDMAP 1920116453
|
|
|
|
bool ucInit(const char *path) {
|
|
|
|
char file[384];
|
|
if (path == NULL) path = "./";
|
|
|
|
// Might want to move this out of ucInit someday
|
|
// but right now it's the only thing that uses .so files (?)
|
|
char gbLibDir[512];
|
|
snprintf(gbLibDir, 512, "%s/lib",path);
|
|
// i don't think this is used any more because we don't have it!
|
|
//log(LOG_INIT, "ucinit: Setting LD_RUN_PATH to \"%s\"",gbLibDir);
|
|
if (setenv("LD_RUN_PATH", gbLibDir, 1)){
|
|
log(LOG_INIT, "Failed to set LD_RUN_PATH");
|
|
}
|
|
//char *ldpath = getenv("LD_RUN_PATH");
|
|
// i don't think this is used any more because we don't have it!
|
|
//log(LOG_DEBUG, "ucinit: LD_RUN_PATH: %s\n", ldpath);
|
|
|
|
|
|
strcpy(file, path);
|
|
strcat(file, "/ucdata/uppermap.dat");
|
|
if (!loadUnicodeTable(&g_ucUpperMap,file,
|
|
VERIFY_UNICODE_CHECKSUMS,
|
|
CHKSUM_UPPERMAP))
|
|
goto failed;
|
|
strcpy(file, path);
|
|
strcat(file, "/ucdata/lowermap.dat");
|
|
if (!loadUnicodeTable(&g_ucLowerMap,file,
|
|
VERIFY_UNICODE_CHECKSUMS,
|
|
CHKSUM_LOWERMAP))
|
|
goto failed;
|
|
strcpy(file, path);
|
|
strcat(file, "/ucdata/properties.dat");
|
|
if (!loadUnicodeTable(&g_ucProps, file,
|
|
VERIFY_UNICODE_CHECKSUMS,
|
|
CHKSUM_PROPERTIES))
|
|
goto failed;
|
|
|
|
strcpy(file, path);
|
|
strcat(file, "/ucdata/scripts.dat");
|
|
if (!loadUnicodeTable(&g_ucScripts, file,
|
|
VERIFY_UNICODE_CHECKSUMS,
|
|
CHKSUM_SCRIPTS))
|
|
goto failed;
|
|
|
|
// MDW: do we need this for converting from X to utf8? or for
|
|
// the is_alnum(), etc. functions?
|
|
if (!loadDecompTables(path)) {
|
|
goto failed;
|
|
}
|
|
|
|
//s_convTable.set(1024);
|
|
if ( ! s_convTable.set(4,sizeof(iconv_t),1024,NULL,0,false,"cnvtbl"))
|
|
goto failed;
|
|
|
|
return true;
|
|
|
|
failed:
|
|
log(LOG_WARN, "uni: unable to load all property tables");
|
|
return false;
|
|
}
|
|
|
|
const char *ucDetectBOM(const char *buf, int32_t bufsize){
|
|
if (bufsize < 4) return NULL;
|
|
// copied from ICU
|
|
if(buf[0] == '\xFE' && buf[1] == '\xFF') {
|
|
return "UTF-16BE";
|
|
} else if(buf[0] == '\xFF' && buf[1] == '\xFE') {
|
|
if(buf[2] == '\x00' && buf[3] =='\x00') {
|
|
return "UTF-32LE";
|
|
} else {
|
|
return "UTF-16LE";
|
|
}
|
|
} else if(buf[0] == '\xEF' && buf[1] == '\xBB' && buf[2] == '\xBF') {
|
|
return "UTF-8";
|
|
} else if(buf[0] == '\x00' && buf[1] == '\x00' &&
|
|
buf[2] == '\xFE' && buf[3]=='\xFF') {
|
|
return "UTF-32BE";
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
int32_t ucToAny(char *outbuf, int32_t outbufsize, const char *charset_out,
|
|
const char *inbuf, int32_t inbuflen, const char *charset_in,
|
|
int32_t ignoreBadChars ){
|
|
if (inbuflen == 0) return 0;
|
|
// alias for iconv
|
|
const char *csAlias = charset_in;
|
|
if (!strncmp(charset_in, "x-windows-949", 13))
|
|
csAlias = "CP949";
|
|
|
|
// Treat all latin1 as windows-1252 extended charset
|
|
if (!strncmp(charset_in, "ISO-8859-1", 10) )
|
|
csAlias = "WINDOWS-1252";
|
|
|
|
iconv_t cd = gbiconv_open(charset_out, csAlias);
|
|
int32_t numBadChars = 0;
|
|
if (cd == (iconv_t)-1) {
|
|
log("uni: Error opening input conversion"
|
|
" descriptor for %s: %s (%d)\n",
|
|
charset_in,
|
|
strerror(errno),errno);
|
|
return 0;
|
|
}
|
|
|
|
//if (normalized) *normalized = false;
|
|
char *pin = const_cast<char*>(inbuf); //const cast due to iconv() speciality
|
|
size_t inRemaining = inbuflen;
|
|
char *pout = outbuf;
|
|
size_t outRemaining = outbufsize;
|
|
int res = 0;
|
|
if (outbuf == NULL || outbufsize == 0) {
|
|
// just find the size needed for conversion
|
|
#define TMP_SIZE 32
|
|
char buf[TMP_SIZE];
|
|
int32_t len = 0;
|
|
while (inRemaining) {
|
|
pout = buf;
|
|
outRemaining = TMP_SIZE;
|
|
res = iconv(cd, &pin, &inRemaining,
|
|
&pout, &outRemaining);
|
|
if (res < 0 && errno){
|
|
// convert the next TMP_SIZE block
|
|
if (errno == E2BIG) {
|
|
len += TMP_SIZE;
|
|
continue;
|
|
}
|
|
gbiconv_close(cd);
|
|
return 0; // other error
|
|
}
|
|
len += TMP_SIZE-outRemaining;
|
|
//len >>= 1; // sizeof UChar
|
|
len += 1; // NULL terminated
|
|
gbiconv_close(cd);
|
|
return len;
|
|
}
|
|
}
|
|
|
|
while (inRemaining && outRemaining) {
|
|
//printf("Before - in: %d, out: %d\n",
|
|
//inRemaining, outRemaining);
|
|
res = iconv(cd,&pin, &inRemaining,
|
|
&pout, &outRemaining);
|
|
|
|
if (res < 0 && errno){
|
|
//printf("errno: %s (%d)\n", strerror(errno), errno);
|
|
g_errno = errno;
|
|
switch(errno) {
|
|
case EILSEQ:
|
|
numBadChars++;
|
|
|
|
if (ignoreBadChars >= 0 &&
|
|
numBadChars > ignoreBadChars) goto done;
|
|
utf8Encode('?', pout);
|
|
pout++;outRemaining --;
|
|
pin++; inRemaining--;
|
|
g_errno = 0;
|
|
continue;
|
|
case EINVAL:
|
|
numBadChars++;
|
|
|
|
utf8Encode('?', pout);
|
|
pout++;outRemaining --;
|
|
pin++; inRemaining--;
|
|
g_errno=0;
|
|
continue;
|
|
// go ahead and flag an error now
|
|
// if there is a bad character, we've
|
|
// probably misguessed the charset
|
|
|
|
case E2BIG:
|
|
//log("uni: error converting to UTF-8: %s",
|
|
// strerror(errno));
|
|
goto done;
|
|
default:
|
|
log("uni: unknown error occurred "
|
|
"converting to UTF-8: %s (%d)",
|
|
strerror(errno), errno);
|
|
goto done;
|
|
}
|
|
}
|
|
}
|
|
done:
|
|
gbiconv_close(cd);
|
|
int32_t len = (outbufsize - outRemaining) ;
|
|
len = len>=outbufsize-1?outbufsize-2:len;
|
|
//len >>= 1;
|
|
//len = outbuf[len]=='\0'?len-1:len;
|
|
outbuf[len] = '\0';
|
|
if (numBadChars) {
|
|
log(LOG_DEBUG, "uni: ucToAny: got %" PRId32" bad chars in conversion 2.",
|
|
numBadChars);
|
|
}
|
|
if (res < 0 && g_errno) return 0;
|
|
return len ;
|
|
}
|
|
|
|
int32_t stripAccentMarks (char *outbuf, int32_t outbufsize,
|
|
const unsigned char *p, int32_t inbuflen) {
|
|
char *s = (char *)p;
|
|
char *send = (char *)p + inbuflen;
|
|
int32_t cs;
|
|
char *dst = outbuf;
|
|
for ( ; s < send ; s += cs ) {
|
|
// how big is this character?
|
|
cs = getUtf8CharSize(s);
|
|
// convert the utf8 character to UChar32
|
|
UChar32 uc = utf8Decode ( s );
|
|
// break "uc" into decomposition of UChar32s
|
|
UChar32 ttt[32];
|
|
int32_t klen = recursiveKDExpand(uc,ttt,32);
|
|
if(klen>32) gbshutdownLogicError();
|
|
// sanity
|
|
if ( dst + 5 > outbuf+outbufsize ) return -1;
|
|
// if the same, leave it! it had no accent marks or other
|
|
// modifiers...
|
|
if ( klen <= 1 ) {
|
|
gbmemcpy ( dst , s , cs );
|
|
dst += cs;
|
|
continue;
|
|
}
|
|
// take the first one as the stripped
|
|
// convert back to utf8
|
|
int32_t stored = utf8Encode ( ttt[0] , dst );
|
|
// skip over the stored utf8 char
|
|
dst += stored;
|
|
}
|
|
// sanity. breach check
|
|
if ( dst > outbuf+outbufsize ) gbshutdownLogicError();
|
|
// return # of bytes stored into outbuf
|
|
return dst - outbuf;
|
|
}
|
|
|
|
|
|
void resetUnicode ( ) {
|
|
//s_convTable.reset();
|
|
gbiconv_reset();
|
|
}
|