privacore-open-source-searc.../utf8.cpp
Ivan Skytte Jørgensen ce26026076 tokenizer: use phase-2 tokens
Ended up disabling use of Phrases and countTable in XmlDoc::hashWords3() because Phrases+Pos have hard assumptions about token order that are hard to bypass.
Also disable diversityrank calculation.
2018-03-13 17:12:42 +01:00

100 lines
2.6 KiB
C++

#include "utf8.h"
#include <string.h>
// Check that the byte range [txt, txt+tlen) is structurally well-formed UTF-8.
//
// A byte with the high bit clear is accepted as ASCII. Any other byte must be
// a lead byte (both high bits set) for which getUtf8CharSize() reports a size
// of at least 2, and it must be followed -- inside the buffer -- by size-1
// continuation bytes of the form 10xxxxxx.
//
// Only these bit patterns are examined; the decoded code point value is never
// looked at by this function.
//
// txt  - start of the buffer; NULL is treated as valid
// tlen - number of bytes to check; a value <= 0 is treated as valid
//
// Returns true if the buffer passes the checks (or is NULL/empty), false on a
// malformed or truncated sequence.
bool verifyUtf8(const char *txt, int32_t tlen) {
	if( ! txt || tlen <= 0)
		return true;

	const char *p = txt;
	const char *pend = txt + tlen;
	char size;
	for( ; p < pend; p += size) {
		size = getUtf8CharSize(p);
		// plain ascii byte, nothing more to check
		if( ! (p[0] & 0x80))
			continue;
		// a lead byte must have both high bits set; 10xxxxxx here
		// means a continuation byte without a lead byte
		if( (p[0] & 0xc0) != 0xc0)
			return false;
		// high bit set but not reported as a multi-byte character.
		// rejecting every size below 2 also keeps the loop from
		// stalling or walking backwards on a bogus size.
		if( size < 2)
			return false;
		// the whole sequence must lie inside the buffer. without this
		// check the continuation byte tests below would read past pend
		// when the buffer ends in the middle of a character.
		if( pend - p < size)
			return false;
		// every byte after the lead byte must be 10xxxxxx
		for(int i = 1; i < size; i++) {
			if( (p[i] & 0xc0) != 0x80)
				return false;
		}
	}
	// defensive: we must have consumed exactly tlen bytes
	if(p != pend)
		return false;
	return true;
}
// Convenience overload for NUL-terminated strings: measures txt with strlen()
// and forwards to verifyUtf8(txt, tlen).
//
// txt - NUL-terminated string; NULL is treated as valid
//
// Returns whatever the length-taking overload returns for the string's bytes
// (terminator excluded), or true when txt is NULL.
bool verifyUtf8 ( const char *txt ) {
	// strlen(NULL) is undefined behaviour. the length-taking overload
	// accepts a NULL pointer and reports it as valid, so do the same here.
	if( ! txt)
		return true;
	int32_t tlen = strlen(txt);
	return verifyUtf8(txt,tlen);
}
// Lookup table for decoding utf8: indexed by the value of the first byte of a
// character, it gives the number of bytes in that character.
//
// The table never contains 0. First-byte values that are illegal are given a
// size of 1 instead, because many loops advance by the value found here and a
// 0 would keep them from making progress.
const int bytes_in_utf8_code[] = {
	// 0x00-0x7f: single byte characters
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	// 0x00
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	// 0x10
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	// 0x20
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	// 0x30
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	// 0x40
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	// 0x50
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	// 0x60
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	// 0x70
	// 0x80-0xbf: illegal as a first byte, so report 1 byte
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	// 0x80
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	// 0x90
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	// 0xa0
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	// 0xb0
	// 0xc0-0xdf: two byte characters
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,	// 0xc0
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,	// 0xd0
	// 0xe0-0xef: three byte characters
	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,	// 0xe0
	// 0xf0-0xf7: four byte characters
	// 0xf8-0xff: invalid, so 1 rather than 0 (see above)
	4,4,4,4,4,4,4,4,1,1,1,1,1,1,1,1		// 0xf0
};
// Decode the utf8len bytes starting at utf8 into code points, storing them
// consecutively in uc[].
//
// utf8    - input bytes
// utf8len - number of input bytes
// uc      - output array. No capacity is passed in or checked here, so the
//           caller must supply enough room (one slot per input byte is
//           presumably always sufficient -- verify against getUtf8CharSize).
//
// Returns the number of code points stored, or -1 if the size reported by
// getUtf8CharSize() for some character reaches past the end of the input.
// Code points stored before such an error are left in uc[].
int decode_utf8_string(const char *utf8, size_t utf8len, UChar32 uc[]) {
	const char * const limit = utf8 + utf8len;
	int stored = 0;
	for(const char *cur = utf8; cur < limit; ) {
		const int charSize = getUtf8CharSize(cur);
		// last character is cut off by the end of the input
		if(cur + charSize > limit)
			return -1;
		uc[stored] = utf8Decode(cur);
		stored++;
		cur += charSize;
	}
	return stored;
}
// Encode the first `codepoints` entries of uc[] back-to-back into utf8.
//
// uc         - code points to encode
// codepoints - number of entries of uc[] to encode
// utf8       - output buffer. No capacity is passed in or checked here and
//              this function itself appends no terminator.
//
// Returns the sum of the values returned by utf8Encode() for each code point
// (presumably the number of bytes written -- confirm against utf8Encode).
size_t encode_utf8_string(UChar32 uc[], unsigned codepoints, char *utf8) {
	size_t total = 0;
	unsigned idx = 0;
	while(idx < codepoints) {
		// each code point is written right after the previous one
		const size_t written = utf8Encode(uc[idx], utf8 + total);
		total += written;
		idx++;
	}
	return total;
}
// Count the characters in the first maxlen bytes at p, stepping from one
// character to the next by the size getUtf8CharSize() reports.
//
// A trailing character whose reported size would reach past p+maxlen is not
// counted. This function itself does not test for a NUL terminator; only
// maxlen bounds the scan.
//
// p      - start of the bytes to scan
// maxlen - number of bytes to scan
//
// Returns the number of complete characters found.
size_t strnlen_utf8(const char *p, size_t maxlen) {
	const char * const limit = p + maxlen;
	size_t chars = 0;
	for(;;) {
		if(p >= limit)
			break;
		char step = getUtf8CharSize(p);
		// incomplete character at the end: stop without counting it
		if(p + step > limit)
			break;
		p += step;
		chars++;
	}
	return chars;
}