Make unicode_is_ignorable.dat

This commit is contained in:
Ivan Skytte Jørgensen 2018-02-26 17:44:30 +01:00
parent 2e5ecde0c3
commit 4fb49d608c
4 changed files with 45 additions and 2 deletions

@ -97,6 +97,19 @@ int main() {
assert(m.lookup2(0x2082)); //2082;SUBSCRIPT TWO
}
{
UnicodeMaps::FullMap<bool> m;
assert(m.load("unicode_is_ignorable.dat"));
assert(!m.lookup2('A'));
assert(!m.lookup2(' '));
assert(!m.lookup2('9'));
assert(!m.lookup2(0x00E6)); //æ
assert(m.lookup2(0x00AD)); //soft hyphen
assert(m.lookup2(0x034F)); //combining grapheme joiner
assert(m.lookup2(0x2064)); //invisible plus
assert(!m.lookup2(0x0306)); //combining breve
}
{
UnicodeMaps::SparseMap<UChar32> m;
assert(m.load("unicode_to_lowercase.dat"));

@ -11,6 +11,7 @@ UnicodeMaps::FullMap<bool> UnicodeMaps::g_unicode_is_alph
// Definitions of the global Unicode lookup tables.
// The tables that appear in load_maps() are filled from their .dat files there.
UnicodeMaps::FullMap<bool> UnicodeMaps::g_unicode_is_uppercase_map;
UnicodeMaps::FullMap<bool> UnicodeMaps::g_unicode_is_lowercase_map;
UnicodeMaps::FullMap<bool> UnicodeMaps::g_unicode_wordchars_map;
// Loaded from unicode_is_ignorable.dat; queried through is_ignorable().
UnicodeMaps::FullMap<bool> UnicodeMaps::g_unicode_is_ignorable_map;
UnicodeMaps::SparseMap<UChar32> UnicodeMaps::g_unicode_uppercase_map;
UnicodeMaps::SparseMap<UChar32> UnicodeMaps::g_unicode_lowercase_map;
UnicodeMaps::SparseMap<UChar32> UnicodeMaps::g_unicode_canonical_decomposition_map;
@ -45,7 +46,8 @@ bool UnicodeMaps::load_maps(const char *dir, const char **errstr) {
load_map(&g_unicode_general_category_map,dir,"unicode_general_categories.dat",errstr) &&
load_map(&g_unicode_properties_map,dir,"unicode_properties.dat",errstr) &&
load_map(&g_unicode_wordchars_map,dir,"unicode_wordchars.dat",errstr) &&
load_map(&g_unicode_is_alphabetic_map,dir,"unicode_is_alphabetic.dat",errstr) &&
load_map(&g_unicode_wordchars_map,dir,"unicode_wordchars.dat",errstr) &&
load_map(&g_unicode_is_ignorable_map,dir,"unicode_is_ignorable.dat",errstr) &&
load_map(&g_unicode_is_uppercase_map,dir,"unicode_is_uppercase.dat",errstr) &&
load_map(&g_unicode_is_lowercase_map,dir,"unicode_is_lowercase.dat",errstr) &&
load_map(&g_unicode_uppercase_map,dir,"unicode_to_uppercase.dat",errstr) &&
@ -60,6 +62,7 @@ void UnicodeMaps::unload_maps() {
g_unicode_general_category_map.clear();
g_unicode_properties_map.clear();
g_unicode_wordchars_map.clear();
g_unicode_is_ignorable_map.clear();
g_unicode_is_alphabetic_map.clear();
g_unicode_is_uppercase_map.clear();
g_unicode_is_lowercase_map.clear();

@ -12,6 +12,7 @@ extern FullMap<bool> g_unicode_is_alphabetic_map;
// Global Unicode lookup tables; the inline helpers in this header read them
// via lookup2().
extern FullMap<bool> g_unicode_is_uppercase_map;
extern FullMap<bool> g_unicode_is_lowercase_map;
extern FullMap<bool> g_unicode_wordchars_map;
// Consulted by is_ignorable().
extern FullMap<bool> g_unicode_is_ignorable_map;
extern SparseMap<UChar32> g_unicode_uppercase_map;
extern SparseMap<UChar32> g_unicode_lowercase_map;
extern SparseMap<UChar32> g_unicode_canonical_decomposition_map;
@ -68,6 +69,10 @@ static inline bool is_wordchar(UChar32 c) {
return g_unicode_wordchars_map.lookup2(c);
}
// Reports the g_unicode_is_ignorable_map entry for codepoint c.
static inline bool is_ignorable(UChar32 c) {
	const bool ignorable = g_unicode_is_ignorable_map.lookup2(c);
	return ignorable;
}
static inline bool is_alfanumeric(UChar32 c) {
auto gc = g_unicode_general_category_map.lookup2(c);
return gc==Unicode::general_category_t::Lu ||

@ -17,7 +17,7 @@ last_codepoint = max(UnicodeData.data.keys())
print "Last codepoint: %d"%last_codepoint
def is_interesting(codepoint):
    """Return True if codepoint is one of the hand-picked debug codepoints.

    The generator loops call this to decide whether to print diagnostic
    details for a codepoint while the .dat files are written.
    """
    # A stale, shorter list used to be returned first, which made the extended
    # list (with U+00A0 and U+00AD) unreachable. Only the extended list remains.
    return codepoint in [0x00A0, 0x00AD, 0x00b2,0x00b3,0x2074, 0x2080,0x2081,0x2082,0x2083]
## Generate codepoint->script mapping
script_name_to_code_mapping = {
@ -223,6 +223,7 @@ with open("unicode_properties.dat","w") as f:
for codepoint in range(0,last_codepoint+1):
if codepoint in UnicodeData.data:
cpi = UnicodeData.data[codepoint]
if is_interesting(codepoint): print "U+%04X: props: %s"%(codepoint,cpi.props)
prop_bits = 0
for p in cpi.props:
if p in property_to_bit_mapping:
@ -315,6 +316,27 @@ with open("unicode_wordchars.dat","w") as f:
print "Done."
#ignorable codepoints. used in conjunction with is_alfanum and script checks. If a codepoint is ignoreable then it can be skipped or included or whatever. It doesn't matter.
print "Generating unicode_is_ignorable.dat"
with open("unicode_is_ignorable.dat","w") as f:
for codepoint in range(0,last_codepoint+1):
is_ignorable = False
if codepoint in UnicodeData.data:
cpi = UnicodeData.data[codepoint]
if "Default_Ignorable_Code_Point" in cpi.derived_core_props:
is_ignorable = True
else:
is_ignorable = False
else:
is_ignorable = False #missing codepoint
if is_interesting(codepoint): print "U+%04X: '%s' : is_ignorable=%s"%(codepoint,unichr(codepoint),is_ignorable)
if is_ignorable:
f.write('\1')
else:
f.write('\0')
print "Done"
print "Generating unicode_is_alphabetic.dat"
with open("unicode_is_alphabetic.dat","w") as f:
for codepoint in range(0,last_codepoint+1):