mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-02-02 03:38:43 -05:00
ce26026076
Ended up disabling use of Phrases and countTable in XmlDoc::hashWords3() because Phrases+Pos have hard assumptions about token order that are hard to bypass. Also disable diversityrank calculation.
100 lines
2.6 KiB
C++
100 lines
2.6 KiB
C++
#include "utf8.h"
|
|
#include <string.h>
|
|
|
|
bool verifyUtf8(const char *txt, int32_t tlen) {
|
|
if( ! txt || tlen <= 0)
|
|
return true;
|
|
char size;
|
|
const char *p = txt;
|
|
const char *pend = txt + tlen;
|
|
for( ; p < pend; p += size) {
|
|
size = getUtf8CharSize(p);
|
|
// skip if ascii
|
|
if( ! (p[0] & 0x80))
|
|
continue;
|
|
// ok, it's a utf8 char, it must have both hi bits set
|
|
if( (p[0] & 0xc0) != 0xc0)
|
|
return false;
|
|
// if only one byte, we are done.. how can that be?
|
|
if( size == 1)
|
|
return false;
|
|
//if ( ! utf8IsSane ( p[0] ) ) return false;
|
|
// successive utf8 chars must have & 0xc0 be equal to 0x80
|
|
// but the first char it must equal 0xc0, both set
|
|
if( (p[1] & 0xc0) != 0x80)
|
|
return false;
|
|
if( size == 2)
|
|
continue;
|
|
if( (p[2] & 0xc0) != 0x80)
|
|
return false;
|
|
if( size == 3)
|
|
continue;
|
|
if( (p[3] & 0xc0) != 0x80)
|
|
return false;
|
|
}
|
|
if(p != pend)
|
|
return false;
|
|
return true;
|
|
}
|
|
|
|
|
|
bool verifyUtf8 ( const char *txt ) {
|
|
int32_t tlen = strlen(txt);
|
|
return verifyUtf8(txt,tlen);
|
|
}
|
|
|
|
|
|
// Lookup table for decoding utf8: indexed by the value of the first byte of
// a character, it gives the number of bytes in that character.
// Bytes that cannot start a character are listed as 1 rather than 0, because
// many loops advance by the table value and must always make progress.
const int bytes_in_utf8_code[] = {
	// 0x00 - 0x7f: single byte (ascii)
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,

	// 0x80 - 0xbf: illegal as a first byte, so step over 1 byte
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,

	// 0xc0 - 0xdf: two bytes
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,

	// 0xe0 - 0xef: three bytes
	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,

	// 0xf0 - 0xf7: four bytes; 0xf8 - 0xff: invalid, so 1 and not 0
	4,4,4,4,4,4,4,4,1,1,1,1,1,1,1,1
};
|
|
|
|
|
|
int decode_utf8_string(const char *utf8, size_t utf8len, UChar32 uc[]) {
|
|
const char *p = utf8;
|
|
const char *end = utf8+utf8len;
|
|
int codepoints = 0;
|
|
while(p<end) {
|
|
int cs = getUtf8CharSize(p);
|
|
if(p+cs>end)
|
|
return -1; //decode error
|
|
uc[codepoints++] = utf8Decode(p);
|
|
p += cs;
|
|
}
|
|
return codepoints;
|
|
}
|
|
|
|
/**
 * Encode an array of code points as utf8.
 *
 * Each code point is handed to utf8Encode(), which writes at the current end
 * of the output; its return value is added to the running byte count.
 *
 * @param uc          code points to encode
 * @param codepoints  number of entries in uc
 * @param utf8        output buffer. No capacity is passed in, so the caller
 *                    has to supply enough room. Nothing in this function
 *                    appends a terminating NUL.
 * @return total of the values returned by utf8Encode(), i.e. the number of
 *         bytes produced
 */
size_t encode_utf8_string(UChar32 uc[], unsigned codepoints, char *utf8) {
	size_t written = 0;
	const UChar32 *cp = uc;
	const UChar32 * const cpEnd = uc + codepoints;

	while(cp != cpEnd) {
		written += utf8Encode(*cp, utf8 + written);
		++cp;
	}

	return written;
}
|
|
|
|
|
|
size_t strnlen_utf8(const char *p, size_t maxlen) {
|
|
const char *pend = p+maxlen;
|
|
size_t count = 0;
|
|
while(p<pend) {
|
|
char cs = getUtf8CharSize(p);
|
|
if(p+cs>pend)
|
|
break;
|
|
count++;
|
|
p += cs;
|
|
}
|
|
return count;
|
|
}
|