privacore-open-source-searc.../generate_entities.py
2023-12-03 19:51:02 -05:00

69 lines
1.9 KiB
Python
Executable File

#!/usr/bin/env python
import os
import json
import urllib.request, urllib.error, urllib.parse
import sys
def die(msg):
print(msg, file=sys.stderr)
sys.exit(1)
# 1: Fetch (if not already done) https://www.w3.org/TR/html5/entities.json
# 2: Transform into nice 'Entity' data entries for inclusion in Entities.cpp
#if 'entities.json' doesn't exist then fetch it
filename = "entities.json"
if not os.path.exists(filename):
url = "https://www.w3.org/TR/html5/entities.json"
f = urllib.request.urlopen(url)
if f.getcode()!=200:
die("Could not fetch %s"%url)
r = f.read()
json_entities = json.loads(r)
with open(filename,"w") as out_file:
out_file.write(r)
else:
#load existing file
with open(filename,"r") as in_file:
json_entities = json.loads(in_file.read())
max_entity_name_len = 0
#keep track of which entities we have seen. The w3c list contains duplicates.
seen_entitites = set()
print("static struct Entity s_entities[] = {")
for entity_name,data in json_entities.items():
if entity_name[0]!='&':
die("entity %s does not start with an ampersand"%entity_name)
entity_name = entity_name[1:]
if entity_name[-1]==';':
#strip off if present (w3c file is inconsistent)
entity_name=entity_name[0:-1]
if entity_name in seen_entitites:
continue
seen_entitites.add(entity_name)
codepoints = data['codepoints']
if len(codepoints)<1 or len(codepoints)>2:
die("Unexpected codepoint count for entity %s",entity_name)
codepoint_count = len(codepoints)
if len(codepoints)<2:
codepoints.append(0) #make codepoints a full array so compilers/flexelint dont complain about too few initializers
max_entity_name_len = max(max_entity_name_len,len(entity_name))
print(' {"&%s", %d, {%s}, 0, ""},'%(entity_name, codepoint_count, ",".join([str(c) for c in codepoints])))
print("};")
print("static const int max_entity_name_len = %d;"%(max_entity_name_len+1))