privacore-open-source-searc.../utf8_fast.h

153 lines
5.5 KiB
C

#ifndef UTF8_FAST_H_
#define UTF8_FAST_H_
#include "utf8.h"
#include "unicode/UCMaps.h"
//Various functions for examining and manipulating UTF8 chars/strings and
//unicode codepoints.
//Optimized for the expectation that most of the text we normally encounter
//consists of codepoints in the ASCII range 0..127
//Declarations only; the definitions are not part of this header.
//NOTE(review): judging by the name, reports whether the bytes s..send contain
//an alphabetic codepoint -- confirm against the definition.
bool has_alpha_utf8(const char *s, const char *send);
//Case-conversion overloads. Each takes a writable dst and a read-only src; the
//longer forms add end pointers (dstEnd, srcEnd) for the respective buffers.
//NOTE(review): the int32_t result is presumably the number of bytes stored in
//dst, and the end pointers presumably point one past the last byte -- confirm.
int32_t to_lower_utf8(char *dst, const char *src);
int32_t to_lower_utf8(char *dst, char *dstEnd, const char *src);
int32_t to_lower_utf8(char *dst, char *dstEnd, const char *src, const char *srcEnd);
int32_t to_upper_utf8(char *dst, const char *src); //one codepoint only
int32_t to_upper_utf8(char *dst, char *dstEnd, const char *src, const char *srcEnd);
int32_t to_capitalized_utf8(char *dst, char *dstEnd, const char *src, const char *srcEnd);
//Lookup tables with 256 entries each, declared here and defined elsewhere.
//The inline helpers in this header index them with (unsigned char)c, i.e. one
//entry per possible byte value.
//g_map_to_lower / g_map_to_upper hold a replacement byte per input byte; the
//g_map_is_* tables hold a flag that the helpers convert to bool.
extern const unsigned char g_map_to_lower[256];
extern const unsigned char g_map_to_upper[256];
extern const char g_map_is_upper[256];
extern const char g_map_is_binary[256];
extern const char g_map_is_lower[256];
extern const char g_map_is_ascii[256];
extern const char g_map_is_punct[256];
extern const char g_map_is_alnum[256];
extern const char g_map_is_alpha[256];
extern const char g_map_is_digit[256];
extern const char g_map_is_hex[256];
extern const char g_map_is_tagname_char[256];
//Table lookups
//Returns the g_map_is_lower entry for byte c, converted to bool.
static inline bool is_lower_a(char c) {
	const unsigned char idx = (unsigned char)c;
	return g_map_is_lower[idx] != 0;
}
//Returns the g_map_to_lower entry for byte c.
static inline char to_lower_a(char c) {
	const unsigned char idx = (unsigned char)c;
	return (char)g_map_to_lower[idx];
}
//Returns the g_map_is_upper entry for byte c, converted to bool.
static inline bool is_upper_a(char c) {
	const unsigned char idx = (unsigned char)c;
	return g_map_is_upper[idx] != 0;
}
//Returns the g_map_to_upper entry for byte c.
static inline char to_upper_a(char c) {
	const unsigned char idx = (unsigned char)c;
	return (char)g_map_to_upper[idx];
}
// c is latin1 in this case:
//Returns the g_map_is_binary entry for byte c, converted to bool.
static inline bool is_binary_a(char c) {
	const unsigned char idx = (unsigned char)c;
	return g_map_is_binary[idx] != 0;
}
//True for exactly four byte values: space (32), tab (9), line feed (10) and
//carriage return (13). Everything else is not whitespace.
static inline bool is_wspace_a(char c) {
	switch (c) {
	case 32:
	case 9:
	case 10:
	case 13:
		return true;
	default:
		return false;
	}
}
//True only when c lies in the range 32..126 inclusive; control bytes,
//127 and anything outside that range yield false.
static inline bool is_ascii(char c) {
	if (c < 32)
		return false;
	return c < 127;
}
//True when the byte value of c is below 128, i.e. its top bit is clear.
static inline bool is_ascii3(char c) {
	const unsigned char byte = (unsigned char)c;
	return (byte & 0x80) == 0;
}
//Returns the g_map_is_punct entry for byte c, converted to bool.
static inline bool is_punct_a(char c) {
	const unsigned char idx = (unsigned char)c;
	return g_map_is_punct[idx] != 0;
}
//Returns the g_map_is_alnum entry for byte c, converted to bool.
static inline bool is_alnum_a(char c) {
	const unsigned char idx = (unsigned char)c;
	return g_map_is_alnum[idx] != 0;
}
//Returns the g_map_is_alpha entry for byte c, converted to bool.
static inline bool is_alpha_a(char c) {
	const unsigned char idx = (unsigned char)c;
	return g_map_is_alpha[idx] != 0;
}
//Returns the g_map_is_digit entry for byte c, converted to bool.
static inline bool is_digit(char c) {
	const unsigned char idx = (unsigned char)c;
	return g_map_is_digit[idx] != 0;
}
//Returns the g_map_is_hex entry for byte c, converted to bool.
static inline bool is_hex(char c) {
	const unsigned char idx = (unsigned char)c;
	return g_map_is_hex[idx] != 0;
}
//Returns the g_map_is_tagname_char entry for byte c, converted to bool.
static inline bool is_tagname_char(char c) {
	const unsigned char idx = (unsigned char)c;
	return g_map_is_tagname_char[idx] != 0;
}
//True when every one of the first len bytes of s passes is_ascii().
//A len of zero (or less) yields true because no byte is examined.
static inline bool is_ascii2_a(const char *s, int32_t len) {
	int32_t pos = 0;
	while (pos < len) {
		const char ch = s[pos];
		if (!is_ascii(ch)) {
			return false;
		}
		++pos;
	}
	return true;
}
//Stores to_lower_a() of each of the first len bytes of s into the same
//position of buf. Nothing is written beyond buf[len-1]; no terminator is added.
static inline void to_lower3_a(const char *s, int32_t len, char *buf) {
	int32_t pos = 0;
	while (pos < len) {
		const unsigned char byte = (unsigned char)s[pos];
		buf[pos] = to_lower_a(byte);
		++pos;
	}
}
//Only a sequence that getUtf8CharSize() reports as one byte long is looked up
//with is_binary_a(); any other size yields false.
static inline bool is_binary_utf8(const char *p) {
	const bool single_byte = (getUtf8CharSize((uint8_t *)p) == 1);
	return single_byte && is_binary_a(*p);
}
//Lowercase test for the codepoint starting at src: bytes below 128 use the
//byte table, everything else is decoded and passed to UnicodeMaps.
static inline bool is_lower_utf8(const char *src) {
	if (!is_ascii3(*src)) {
		return UnicodeMaps::is_lowercase(utf8Decode(src));
	}
	return is_lower_a(*src);
}
//Uppercase test for the codepoint starting at src: bytes below 128 use the
//byte table, everything else is decoded and passed to UnicodeMaps.
static inline bool is_upper_utf8(const char *src) {
	if (!is_ascii3(*src)) {
		return UnicodeMaps::is_uppercase(utf8Decode(src));
	}
	return is_upper_a(*src);
}
//Alphanumeric test for the codepoint starting at src: bytes below 128 use the
//byte table, everything else is decoded and passed to UnicodeMaps::is_wordchar().
//todo/bug: the call to is_wordchar() in is_alnum_utf8() looks suspicious. Shouldn't it be is_alfanumeric() ?
static inline bool is_alnum_utf8(const char *src) {
	if (!is_ascii3(*src)) {
		return UnicodeMaps::is_wordchar(utf8Decode(src));
	}
	return is_alnum_a(*src);
}
//Convenience overload for uint8_t buffers; forwards to the const char* version.
static inline bool is_alnum_utf8(const uint8_t *src) {
	const char *bytes = (const char*)src;
	return is_alnum_utf8(bytes);
}
//String-level predicates over the bytes s..send. Declarations only; the
//definitions are not part of this header.
//NOTE(review): send presumably points one past the last byte of the string -- confirm.
bool is_alnum_utf8_string(const char *s, const char *send);
bool is_upper_utf8_string(const char *s, const char *send); //string does not contain any lowercase letters
bool is_all_upper_utf8_string(const char *s, const char *send); //string contains only uppercase letters
bool is_wspace_utf8_string(const char *s, const char *send);
bool has_wspace_utf8_string(const char *s, const char *send);
bool is_capitalized_utf8_word(const char *s, const char *send); //first char is uppercase/titlecase letter, rest is lowercase letters
bool is_alnum_api_utf8_string(const char *s, const char *send); //starts with letter or underscore, contains only ascii letters/digits and underscore
bool is_ascii_digit_string(const char *s, const char *send); //consists of pure ascii digits 0-9
//Alphabetic test for the codepoint starting at src: bytes below 128 use the
//byte table, everything else is decoded and passed to UnicodeMaps.
static inline bool is_alpha_utf8(const char *src) {
	if (!is_ascii3(*src)) {
		return UnicodeMaps::is_alphabetic(utf8Decode(src));
	}
	return is_alpha_a(*src);
}
static inline bool is_punct_utf8(const char *src) {
if(is_ascii3(*src)) return is_punct_a(*src);
UChar32 x = utf8Decode(src);
if(UnicodeMaps::is_wordchar(x)) //todo/bug: should it call is_alfanumeric()?
return false;
else
return true;
}
//Whitespace test for the codepoint starting at src: bytes below 128 use
//is_wspace_a(), everything else is decoded and passed to UnicodeMaps.
static inline bool is_wspace_utf8(const char *src) {
	if (!is_ascii3(*src)) {
		return UnicodeMaps::is_whitespace(utf8Decode(src));
	}
	return is_wspace_a(*src);
}
//Convenience overload for uint8_t buffers; forwards to the const char* version.
static inline bool is_wspace_utf8(const uint8_t *src) {
	const char *bytes = (const char*)src;
	return is_wspace_utf8(bytes);
}
//True when the codepoint at src passes both is_alpha_utf8() and
//is_upper_utf8() (original note: alpha and uppercase/titlecase).
//is_upper_utf8() is consulted only after the alpha test succeeds.
static inline bool is_capitalized_utf8(const char *src) {
	if (!is_alpha_utf8(src)) {
		return false;
	}
	return is_upper_utf8(src);
}
//Word-character test on an already decoded codepoint. When no bit above the
//low seven is set the byte table answers; otherwise UnicodeMaps is consulted.
static inline bool ucIsWordChar_fast(UChar32 c) {
	const bool fits_in_seven_bits = ((c & 0xffffff80) == 0);
	if (fits_in_seven_bits) {
		return is_alnum_a(c);
	}
	return UnicodeMaps::is_wordchar(c);
}
//Ignorable-codepoint test that skips the UnicodeMaps lookup for codepoints
//below 0x034F.
static inline bool is_ignorable_fast(UChar32 c) {
	if (c >= 0x034F) {
		return UnicodeMaps::is_ignorable(c);
	}
	//soft-hyphen is the only ignorable codepoint in the low range
	return c == 0x00AD;
}
#endif //UTF8_FAST_H_