Files
privacore-open-source-searc…/UnicodeProperties.h

262 lines
6.7 KiB
C

#ifndef GB_UNICODEPROPERTIES_H
#define GB_UNICODEPROPERTIES_H
#include <sys/types.h>
#include "UCPropTable.h"
// Character typedefs. They are only declared here when USE_ICU is not
// defined. NOTE(review): presumably ICU's own headers supply the same names
// when USE_ICU is set -- confirm.
#ifndef USE_ICU
// 32-bit value holding one code point; the argument type of every lookup
// function in this header.
typedef uint32_t UChar32;
// 16-bit unit. NOTE(review): presumably a UTF-16 code unit -- confirm.
typedef uint16_t UChar;
typedef unsigned char UChar8; // utf-8
#endif
// Bitmask of UC_* property flags, as stored per code point in g_ucProps.
typedef uint16_t UCProps;
// Script identifier as stored per code point in g_ucScripts and returned by
// ucGetScript() (which falls back to ucScriptCommon).
typedef unsigned char UCScript;
// Table persistence. The implementations are not visible in this header, so
// the notes below describe the signatures only.
//
// Loads `table` from the file `filename`. `useChecksum` defaults to false and
// `expectedChecksum` to 0. NOTE(review): presumably the loaded data is checked
// against `expectedChecksum` only when `useChecksum` is true, and the return
// value is true on success -- confirm against the implementation.
bool loadUnicodeTable(UCPropTable *table, const char *filename, bool useChecksum = false, uint32_t expectedChecksum = 0);
// Counterpart of loadUnicodeTable() taking the same table/filename pair.
// NOTE(review): presumably returns true on success -- confirm.
bool saveUnicodeTable(UCPropTable *table, const char *filename);
// JAB: we now have Compatibility (K) and Canonical decomposition tables
// `baseDir` defaults to NULL in both functions. NOTE(review): how a NULL
// base directory is resolved is decided by the implementation -- confirm.
bool saveKDecompTable(const char *baseDir = NULL) ;
bool loadDecompTables(const char *baseDir = NULL) ;
// NOTE(review): presumably discards the decomposition tables loaded by
// loadDecompTables() -- confirm.
void resetDecompTables() ;
// Records a decomposition of `c` given as `decompCount` code points in
// `decomp`; `fullComp` defaults to false. NOTE(review): the meaning of
// `fullComp` and of the bool result is not visible here -- confirm.
bool setKDValue(UChar32 c, UChar32* decomp, int32_t decompCount,
bool fullComp = false);
// Looks up the decomposition of `c`. The count is reported through
// `decompCount`; `fullComp` is an optional out-parameter (defaults to NULL).
// NOTE(review): ownership/lifetime of the returned array and the result for
// code points without a decomposition are not visible here -- confirm.
const UChar32 *getKDValue(UChar32 c, int32_t *decompCount, bool *fullComp = NULL);
// Expands `c` into `buf`, which has room for `bufSize` entries.
// NOTE(review): presumably returns the number of code points written after
// applying decompositions recursively -- confirm, including the behaviour
// when `buf` is too small.
int32_t recursiveKDExpand(UChar32 c, UChar32 *buf, int32_t bufSize);
// NOTE(review): presumably the Unicode canonical combining class of `c`
// (0..255) -- confirm.
unsigned char ucCombiningClass(UChar32 c);
// Forward declarations of inline predicates so they can be referenced before
// their definitions further down in this header.
// NOTE(review): ucIsWordChar() is declared here and called by ucIsPunct(),
// but no definition of it is visible in this part of the header -- confirm
// it is defined wherever ucIsPunct() is used.
static inline bool ucIsWhiteSpace(UChar32 c);
static inline bool ucIsIdeograph(UChar32 c);
static inline bool ucIsPunct(UChar32 c);
static inline bool ucIsWordChar(UChar32 c);
static inline bool ucIsIgnorable(UChar32 c);
static inline bool ucIsExtend(UChar32 c);
// Bit flags combined into a UCProps value (16 bits wide; bits 14-15 unused).
// Each mask is written out in hex; the trailing comment gives the bit index.

// Parse properties
#define UC_WORDCHAR   0x0001 // bit 0
#define UC_IGNORABLE  0x0002 // bit 1
#define UC_IDEOGRAPH  0x0004 // bit 2
#define UC_HIRAGANA   0x0008 // bit 3
#define UC_KATAKANA   0x0010 // bit 4
#define UC_THAI       0x0020 // bit 5
#define UC_EXTEND     0x0040 // bit 6

// General properties
#define UC_ALPHA      0x0080 // bit 7
#define UC_DIGIT      0x0100 // bit 8
#define UC_UPPER      0x0200 // bit 9
#define UC_LOWER      0x0400 // bit 10
#define UC_WHITESPACE 0x0800 // bit 11
#define UC_NFKC_QC_NO 0x1000 // bit 12
#define UC_COMP_EX    0x2000 // bit 13
// Global lookup tables; declared here, defined elsewhere.
// Per-code-point UCProps bitmask, read by the ucIs*() predicates and
// ucProperties().
extern UCPropTable g_ucProps;
// Per-code-point UChar32 mapping read by ucToLower(); a missing or zero
// entry means the code point maps to itself.
extern UCPropTable g_ucLowerMap;
// Per-code-point UChar32 mapping read by ucToUpper(); a missing or zero
// entry means the code point maps to itself.
extern UCPropTable g_ucUpperMap;
// Per-code-point UCScript value read by ucGetScript().
extern UCPropTable g_ucScripts;
// NOTE(review): not referenced in this header; presumably the index used by
// the K-decomposition functions (setKDValue/getKDValue) -- confirm.
extern UCPropTable g_ucKDIndex;
// NOTE(review): presumably printable script names indexed by enum ucScript
// -- confirm against the definition.
extern const char * const g_ucScriptNames[];
// Script identifiers, as returned by ucGetScript().
// Every enumerator carries its explicit value so that inserting or
// reordering entries cannot silently renumber the others.
// NOTE(review): presumably these values are also what g_ucScripts stores --
// confirm before changing any of them.
// ucScriptNumScripts is left implicit so it always equals the entry count.
enum ucScript {
	ucScriptCommon               = 0,
	ucScriptArabic               = 1,
	ucScriptArmenian             = 2,
	ucScriptBengali              = 3,
	ucScriptBopomofo             = 4,
	ucScriptBraille              = 5,
	ucScriptBuhid                = 6,
	ucScriptCanadian_Aboriginal  = 7,
	ucScriptCherokee             = 8,
	ucScriptCypriot              = 9,
	ucScriptCyrillic             = 10, // e.g. Russian
	ucScriptDeseret              = 11,
	ucScriptDevanagari           = 12,
	ucScriptEthiopic             = 13,
	ucScriptGeorgian             = 14,
	ucScriptGothic               = 15,
	ucScriptGreek                = 16,
	ucScriptGujarati             = 17,
	ucScriptGurmukhi             = 18,
	ucScriptHan                  = 19,
	ucScriptHangul               = 20,
	ucScriptHanunoo              = 21,
	ucScriptHebrew               = 22,
	ucScriptHiragana             = 23,
	ucScriptInherited            = 24,
	ucScriptKannada              = 25,
	ucScriptKatakana             = 26,
	ucScriptKatakana_Or_Hiragana = 27,
	ucScriptKhmer                = 28,
	ucScriptLao                  = 29,
	ucScriptLatin                = 30,
	ucScriptLimbu                = 31,
	ucScriptLinear_B             = 32,
	ucScriptMalayalam            = 33,
	ucScriptMongolian            = 34,
	ucScriptMyanmar              = 35,
	ucScriptOgham                = 36,
	ucScriptOld_Italic           = 37,
	ucScriptOriya                = 38,
	ucScriptOsmanya              = 39,
	ucScriptRunic                = 40,
	ucScriptShavian              = 41,
	ucScriptSinhala              = 42,
	ucScriptSyriac               = 43,
	ucScriptTagalog              = 44,
	ucScriptTagbanwa             = 45,
	ucScriptTai_Le               = 46,
	ucScriptTamil                = 47,
	ucScriptTelugu               = 48,
	ucScriptThaana               = 49,
	ucScriptThai                 = 50,
	ucScriptTibetan              = 51,
	ucScriptUgaritic             = 52,
	ucScriptYi                   = 53,
	ucScriptNumScripts           // number of scripts above (54)
};
// One bit per binary property; bits 0 through 29 are used.
// NOTE(review): the names match entries of the Unicode PropList.txt data
// file -- presumably that is the source of this list; confirm.
enum UCProperty {
	ucPropASCII_Hex_Digit                    = 0x00000001, // bit 0
	ucPropBidi_Control                       = 0x00000002, // bit 1
	ucPropDash                               = 0x00000004, // bit 2
	ucPropDeprecated                         = 0x00000008, // bit 3
	ucPropDiacritic                          = 0x00000010, // bit 4
	ucPropExtender                           = 0x00000020, // bit 5
	ucPropGrapheme_Link                      = 0x00000040, // bit 6
	ucPropHex_Digit                          = 0x00000080, // bit 7
	ucPropHyphen                             = 0x00000100, // bit 8
	ucPropIDS_Binary_Operator                = 0x00000200, // bit 9
	ucPropIDS_Trinary_Operator               = 0x00000400, // bit 10
	ucPropIdeographic                        = 0x00000800, // bit 11
	ucPropJoin_Control                       = 0x00001000, // bit 12
	ucPropLogical_Order_Exception            = 0x00002000, // bit 13
	ucPropNoncharacter_Code_Point            = 0x00004000, // bit 14
	ucPropOther_Alphabetic                   = 0x00008000, // bit 15
	ucPropOther_Default_Ignorable_Code_Point = 0x00010000, // bit 16
	ucPropOther_Grapheme_Extend              = 0x00020000, // bit 17
	ucPropOther_ID_Start                     = 0x00040000, // bit 18
	ucPropOther_Lowercase                    = 0x00080000, // bit 19
	ucPropOther_Math                         = 0x00100000, // bit 20
	ucPropOther_Uppercase                    = 0x00200000, // bit 21
	ucPropQuotation_Mark                     = 0x00400000, // bit 22
	ucPropRadical                            = 0x00800000, // bit 23
	ucPropSTerm                              = 0x01000000, // bit 24
	ucPropSoft_Dotted                        = 0x02000000, // bit 25
	ucPropTerminal_Punctuation               = 0x04000000, // bit 26
	ucPropUnified_Ideograph                  = 0x08000000, // bit 27
	ucPropVariation_Selector                 = 0x10000000, // bit 28
	ucPropWhite_Space                        = 0x20000000  // bit 29
};
// One bit per derived property; bits 0 through 10 are used.
// NOTE(review): the names match entries of the Unicode
// DerivedCoreProperties.txt data file -- presumably that is the source of
// this list; confirm.
enum ucDerivedCoreProperties {
	ucDCPropAlphabetic                   = 0x001, // bit 0
	ucDCPropDefault_Ignorable_Code_Point = 0x002, // bit 1
	ucDCPropGrapheme_Base                = 0x004, // bit 2
	ucDCPropGrapheme_Extend              = 0x008, // bit 3
	ucDCPropID_Continue                  = 0x010, // bit 4
	ucDCPropID_Start                     = 0x020, // bit 5
	ucDCPropLowercase                    = 0x040, // bit 6
	ucDCPropMath                         = 0x080, // bit 7
	ucDCPropUppercase                    = 0x100, // bit 8
	ucDCPropXID_Continue                 = 0x200, // bit 9
	ucDCPropXID_Start                    = 0x400  // bit 10
};
// Inline Functions here
// True when the UC_ALPHA bit is set in the g_ucProps entry for `c`.
// A code point with no entry in the table yields false.
static inline bool ucIsAlpha(UChar32 c) {
	const void *entry = g_ucProps.getValue(c);
	if (entry) {
		// Read the flags through a const pointer; the table is only read.
		const UCProps flags = *(const UCProps *)entry;
		return (flags & UC_ALPHA) != 0;
	}
	return false;
}
// True when the UC_DIGIT bit is set in the g_ucProps entry for `c`.
// A code point with no entry in the table yields false.
static inline bool ucIsDigit(UChar32 c) {
	const void *entry = g_ucProps.getValue(c);
	if (entry) {
		const UCProps flags = *(const UCProps *)entry;
		return (flags & UC_DIGIT) != 0;
	}
	return false;
}
// True when the UC_WORDCHAR bit is set in the g_ucProps entry for `c`.
// Note that, despite the name, this tests the UC_WORDCHAR parse property
// and not UC_ALPHA / UC_DIGIT. A code point with no entry yields false.
static inline bool ucIsAlnum(UChar32 c) {
	const void *entry = g_ucProps.getValue(c);
	if (entry) {
		const UCProps flags = *(const UCProps *)entry;
		return (flags & UC_WORDCHAR) != 0;
	}
	return false;
}
// True when the UC_UPPER bit is set in the g_ucProps entry for `c`.
// A code point with no entry in the table yields false.
static inline bool ucIsUpper(UChar32 c) {
	const void *entry = g_ucProps.getValue(c);
	if (entry) {
		const UCProps flags = *(const UCProps *)entry;
		return (flags & UC_UPPER) != 0;
	}
	return false;
}
// True when the UC_LOWER bit is set in the g_ucProps entry for `c`.
// A code point with no entry in the table yields false.
static inline bool ucIsLower(UChar32 c) {
	const void *entry = g_ucProps.getValue(c);
	if (entry) {
		const UCProps flags = *(const UCProps *)entry;
		return (flags & UC_LOWER) != 0;
	}
	return false;
}
// True when the UC_WHITESPACE bit is set in the g_ucProps entry for `c`.
// A code point with no entry in the table yields false.
static inline bool ucIsWhiteSpace(UChar32 c) {
	const void *entry = g_ucProps.getValue(c);
	if (entry) {
		const UCProps flags = *(const UCProps *)entry;
		return (flags & UC_WHITESPACE) != 0;
	}
	return false;
}
// True when the UC_IDEOGRAPH bit is set in the g_ucProps entry for `c`.
// A code point with no entry in the table yields false.
static inline bool ucIsIdeograph(UChar32 c) {
	const void *entry = g_ucProps.getValue(c);
	if (entry) {
		const UCProps flags = *(const UCProps *)entry;
		return (flags & UC_IDEOGRAPH) != 0;
	}
	return false;
}
// "Punctuation" here is simply the complement of ucIsWordChar(): every code
// point that is not a word character counts, so this is broader than
// punctuation marks proper.
static inline bool ucIsPunct(UChar32 c) {
	const bool isWordChar = ucIsWordChar(c);
	return !isWordChar;
}
// True when the UC_IGNORABLE bit is set in the g_ucProps entry for `c`.
// A code point with no entry in the table yields false.
static inline bool ucIsIgnorable(UChar32 c) {
	const void *entry = g_ucProps.getValue(c);
	if (entry) {
		const UCProps flags = *(const UCProps *)entry;
		return (flags & UC_IGNORABLE) != 0;
	}
	return false;
}
// True when the UC_EXTEND bit is set in the g_ucProps entry for `c`.
// A code point with no entry in the table yields false.
static inline bool ucIsExtend(UChar32 c) {
	const void *entry = g_ucProps.getValue(c);
	if (entry) {
		const UCProps flags = *(const UCProps *)entry;
		return (flags & UC_EXTEND) != 0;
	}
	return false;
}
// Maps `c` through g_ucLowerMap. When the table has no entry for `c`, or
// the entry is zero, `c` is returned unchanged.
static inline UChar32 ucToLower(UChar32 c) {
	const void *entry = g_ucLowerMap.getValue(c);
	if (entry) {
		const UChar32 mapped = *(const UChar32 *)entry;
		// A stored zero means "no mapping", not a mapping to U+0000.
		if (mapped) return mapped;
	}
	return c;
}
// Maps `c` through g_ucUpperMap. When the table has no entry for `c`, or
// the entry is zero, `c` is returned unchanged.
static inline UChar32 ucToUpper(UChar32 c) {
	const void *entry = g_ucUpperMap.getValue(c);
	if (entry) {
		const UChar32 mapped = *(const UChar32 *)entry;
		// A stored zero means "no mapping", not a mapping to U+0000.
		if (mapped) return mapped;
	}
	return c;
}
// Returns the full UC_* bitmask stored in g_ucProps for `c`, or 0 (no
// properties) when the table has no entry for it.
static inline UCProps ucProperties(UChar32 c) {
	const void *entry = g_ucProps.getValue(c);
	UCProps flags = (UCProps)0;
	if (entry) {
		flags = *(const UCProps *)entry;
	}
	return flags;
}
// Numeric value of an ASCII digit '0'..'9'.
// Every other code point -- including non-ASCII digits -- yields 0, so the
// result alone cannot distinguish '0' from "not a digit".
static inline int32_t ucDigitValue(UChar32 c) {
	const bool isAsciiDigit = (c >= '0' && c <= '9');
	return isAsciiDigit ? (int32_t)(c-'0') : 0;
}
// Returns the script id stored in g_ucScripts for `c`; code points with no
// entry in the table are reported as ucScriptCommon.
static inline UCScript ucGetScript(UChar32 c) {
	const void *entry = g_ucScripts.getValue(c);
	UCScript script = ucScriptCommon;
	if (entry) {
		script = *(const UCScript *)entry;
	}
	return script;
}
#endif // GB_UNICODEPROPERTIES_H