mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-05-07 17:09:32 -04:00
Make unicode_is_ignorable.dat
This commit is contained in:
parent
2e5ecde0c3
commit
4fb49d608c
@ -97,6 +97,19 @@ int main() {
|
||||
assert(m.lookup2(0x2082)); //2082;SUBSCRIPT TWO
|
||||
}
|
||||
|
||||
{
|
||||
UnicodeMaps::FullMap<bool> m;
|
||||
assert(m.load("unicode_is_ignorable.dat"));
|
||||
assert(!m.lookup2('A'));
|
||||
assert(!m.lookup2(' '));
|
||||
assert(!m.lookup2('9'));
|
||||
assert(!m.lookup2(0x00E6)); //æ
|
||||
assert(m.lookup2(0x00AD)); //soft hyphen
|
||||
assert(m.lookup2(0x034F)); //combining grapheme joiner
|
||||
assert(m.lookup2(0x2064)); //invisible plus
|
||||
assert(!m.lookup2(0x0306)); //combining breve
|
||||
}
|
||||
|
||||
{
|
||||
UnicodeMaps::SparseMap<UChar32> m;
|
||||
assert(m.load("unicode_to_lowercase.dat"));
|
||||
|
@ -11,6 +11,7 @@ UnicodeMaps::FullMap<bool> UnicodeMaps::g_unicode_is_alph
|
||||
UnicodeMaps::FullMap<bool> UnicodeMaps::g_unicode_is_uppercase_map;
|
||||
UnicodeMaps::FullMap<bool> UnicodeMaps::g_unicode_is_lowercase_map;
|
||||
UnicodeMaps::FullMap<bool> UnicodeMaps::g_unicode_wordchars_map;
|
||||
UnicodeMaps::FullMap<bool> UnicodeMaps::g_unicode_is_ignorable_map;
|
||||
UnicodeMaps::SparseMap<UChar32> UnicodeMaps::g_unicode_uppercase_map;
|
||||
UnicodeMaps::SparseMap<UChar32> UnicodeMaps::g_unicode_lowercase_map;
|
||||
UnicodeMaps::SparseMap<UChar32> UnicodeMaps::g_unicode_canonical_decomposition_map;
|
||||
@ -45,7 +46,8 @@ bool UnicodeMaps::load_maps(const char *dir, const char **errstr) {
|
||||
load_map(&g_unicode_general_category_map,dir,"unicode_general_categories.dat",errstr) &&
|
||||
load_map(&g_unicode_properties_map,dir,"unicode_properties.dat",errstr) &&
|
||||
load_map(&g_unicode_wordchars_map,dir,"unicode_wordchars.dat",errstr) &&
|
||||
load_map(&g_unicode_is_alphabetic_map,dir,"unicode_is_alphabetic.dat",errstr) &&
|
||||
load_map(&g_unicode_wordchars_map,dir,"unicode_wordchars.dat",errstr) &&
|
||||
load_map(&g_unicode_is_ignorable_map,dir,"unicode_is_ignorable.dat",errstr) &&
|
||||
load_map(&g_unicode_is_uppercase_map,dir,"unicode_is_uppercase.dat",errstr) &&
|
||||
load_map(&g_unicode_is_lowercase_map,dir,"unicode_is_lowercase.dat",errstr) &&
|
||||
load_map(&g_unicode_uppercase_map,dir,"unicode_to_uppercase.dat",errstr) &&
|
||||
@ -60,6 +62,7 @@ void UnicodeMaps::unload_maps() {
|
||||
g_unicode_general_category_map.clear();
|
||||
g_unicode_properties_map.clear();
|
||||
g_unicode_wordchars_map.clear();
|
||||
g_unicode_is_ignorable_map.clear();
|
||||
g_unicode_is_alphabetic_map.clear();
|
||||
g_unicode_is_uppercase_map.clear();
|
||||
g_unicode_is_lowercase_map.clear();
|
||||
|
@ -12,6 +12,7 @@ extern FullMap<bool> g_unicode_is_alphabetic_map;
|
||||
extern FullMap<bool> g_unicode_is_uppercase_map;
|
||||
extern FullMap<bool> g_unicode_is_lowercase_map;
|
||||
extern FullMap<bool> g_unicode_wordchars_map;
|
||||
extern FullMap<bool> g_unicode_is_ignorable_map;
|
||||
extern SparseMap<UChar32> g_unicode_uppercase_map;
|
||||
extern SparseMap<UChar32> g_unicode_lowercase_map;
|
||||
extern SparseMap<UChar32> g_unicode_canonical_decomposition_map;
|
||||
@ -68,6 +69,10 @@ static inline bool is_wordchar(UChar32 c) {
|
||||
return g_unicode_wordchars_map.lookup2(c);
|
||||
}
|
||||
|
||||
/// Looks up codepoint @p c in g_unicode_is_ignorable_map and returns the
/// stored flag.
static inline bool is_ignorable(UChar32 c) {
	const bool ignorable = g_unicode_is_ignorable_map.lookup2(c);
	return ignorable;
}
|
||||
|
||||
static inline bool is_alfanumeric(UChar32 c) {
|
||||
auto gc = g_unicode_general_category_map.lookup2(c);
|
||||
return gc==Unicode::general_category_t::Lu ||
|
||||
|
@ -17,7 +17,7 @@ last_codepoint = max(UnicodeData.data.keys())
|
||||
print "Last codepoint: %d"%last_codepoint
|
||||
|
||||
def is_interesting(codepoint):
    """Return True if codepoint is one of the hand-picked debug codepoints.

    The generator loops call this to decide whether to print the value they
    computed for a codepoint, so the output can be checked by eye.
    """
    # Only the extended list is kept. The shorter list used to be returned
    # first, which made this line unreachable and hid 0x00A0 and 0x00AD
    # (soft hyphen) from the debug output.
    return codepoint in [0x00A0, 0x00AD, 0x00b2,0x00b3,0x2074, 0x2080,0x2081,0x2082,0x2083]
|
||||
|
||||
## Generate codepoint->script mapping
|
||||
script_name_to_code_mapping = {
|
||||
@ -223,6 +223,7 @@ with open("unicode_properties.dat","w") as f:
|
||||
for codepoint in range(0,last_codepoint+1):
|
||||
if codepoint in UnicodeData.data:
|
||||
cpi = UnicodeData.data[codepoint]
|
||||
if is_interesting(codepoint): print "U+%04X: props: %s"%(codepoint,cpi.props)
|
||||
prop_bits = 0
|
||||
for p in cpi.props:
|
||||
if p in property_to_bit_mapping:
|
||||
@ -315,6 +316,27 @@ with open("unicode_wordchars.dat","w") as f:
|
||||
print "Done."
|
||||
|
||||
|
||||
# Ignorable codepoints. Used in conjunction with is_alfanum and script checks. If a codepoint is ignorable then it can be either skipped or included; it makes no difference.
|
||||
print "Generating unicode_is_ignorable.dat"
|
||||
with open("unicode_is_ignorable.dat","w") as f:
|
||||
for codepoint in range(0,last_codepoint+1):
|
||||
is_ignorable = False
|
||||
if codepoint in UnicodeData.data:
|
||||
cpi = UnicodeData.data[codepoint]
|
||||
if "Default_Ignorable_Code_Point" in cpi.derived_core_props:
|
||||
is_ignorable = True
|
||||
else:
|
||||
is_ignorable = False
|
||||
else:
|
||||
is_ignorable = False #missing codepoint
|
||||
if is_interesting(codepoint): print "U+%04X: '%s' : is_ignorable=%s"%(codepoint,unichr(codepoint),is_ignorable)
|
||||
if is_ignorable:
|
||||
f.write('\1')
|
||||
else:
|
||||
f.write('\0')
|
||||
|
||||
print "Done"
|
||||
|
||||
print "Generating unicode_is_alphabetic.dat"
|
||||
with open("unicode_is_alphabetic.dat","w") as f:
|
||||
for codepoint in range(0,last_codepoint+1):
|
||||
|
Loading…
x
Reference in New Issue
Block a user