privacore-open-source-searc.../sto/sto_convert.py
Ivan Skytte Jørgensen 27b3b6c199 Collect warnings at end
2018-06-18 00:49:49 +02:00

257 lines
7.3 KiB
Python
Executable File

#!/usr/bin/python2
from __future__ import print_function
import xml.etree.ElementTree
import struct
import argparse
import sys
import os
#hack to make utf-8 values work
import sys
reload(sys)
sys.setdefaultencoding("utf_8")
part_of_speech_map={
"adjective":1,
"commonNoun":2,
"conjunction":3,
"demonstrativePronoun":4,
"deponentVerb":5,
"existentialPronoun":6,
"generalAdverb":7,
"indefinitePronoun":8,
"infinitiveParticle":9,
"interjection":10,
"interrogativeRelativePronoun":11,
"mainVerb":12,
"numeral":13,
"ordinalAdjective":14,
"personalPronoun":15,
"possessivePronoun":16,
"preposition":17,
"properNoun":18,
"reciprocalPronoun":19,
"unclassifiedParticle":20,
"unspecified":21,
"coordinatingConjunction":22,
"subordinatingConjunction":23
}
word_form_attribute_map={
"adjectivalFunction_attributiveFunction": 1,
"adjectivalFunction_predicativeFunction": 2,
"adjectivalFunction_unspecified": 3,
"case_genitiveCase": 4,
"case_nominativeCase": 5,
"case_unspecified": 6,
"definiteness_definite": 7,
"definiteness_indefinite": 8,
"definiteness_unspecified": 9,
"degree_comparative": 10,
"degree_positive": 11,
"degree_superlative": 12,
"grammaticalGender_commonGender": 13,
"grammaticalGender_neuter": 14,
"grammaticalGender_unspecified": 15,
"grammaticalNumber_plural": 16,
"grammaticalNumber_singular": 17,
"grammaticalNumber_unspecified": 18,
"independentWord_no": 19,
"independentWord_yes": 20,
"officiallyApproved_no": 21,
"officiallyApproved_yes": 22,
"ownerNumber_plural": 23,
"ownerNumber_singular": 24,
"ownerNumber_unspecified": 25,
"person_firstPerson": 26,
"person_secondPerson": 27,
"person_thirdPerson": 28,
"reflexivity_no": 29,
"reflexivity_yes": 30,
"reflexivity_unspecified": 31,
"register_formalRegister": 32,
"register_OBSOLETE": 33,
"tense_past": 34,
"tense_present": 35,
"transcategorization_transadjectival": 36,
"transcategorization_transadverbial": 37,
"transcategorization_transnominal": 38,
"verbFormMood_gerundive": 39,
"verbFormMood_imperative": 40,
"verbFormMood_indicative": 41,
"verbFormMood_infinitive": 42,
"verbFormMood_participle": 43,
"voice_activeVoice": 44,
"voice_passiveVoice": 45
}
total_entry_count = None
total_wordform_count = None
warnings = {}
skips = {}
def emit_warning(id,what):
global warnings
warnings[id] = what
def emit_skip(id,why):
global skips
skips[id] = why
def process_lexcial_entry(lexicalentry,output_file):
global total_entry_count, total_wordform_count
part_of_speech=None
id=None
morphological_unit_id=None
for feat in lexicalentry.findall("feat"):
att=feat.attrib["att"]
val=feat.attrib["val"]
#print("lexicalentry.feat: att=%s val=%s"%(att,val))
if att=="partOfSpeech":
if val in part_of_speech_map:
part_of_speech = part_of_speech_map[val]
else:
print("Unknown part_of_speech: ",val, file=sys.stderr)
sys.exit(2)
elif att=="id":
id=val
elif att=="morphologicalUnitId":
morphological_unit_id=val
#todo:decomposition
if part_of_speech==None:
emit_skip(id,"No partOfSpeech")
return
if morphological_unit_id==None:
emit_skip(id,"No morphologicalUnitId")
return
raw_wordforms = b""
wordform_count = 0
for wordform in lexicalentry.findall("WordForm"):
attributes=[]
for feat in wordform.findall("feat"):
att=feat.attrib["att"]
val=feat.attrib["val"]
#print("wordform.feat: att=%s val=%s"%(att,val))
s=att+"_"+val
if s in word_form_attribute_map:
attributes.append(word_form_attribute_map[s])
else:
print("Entry %s: Unknown wordform feat: %s"%(id,s),file=sys.stderr)
sys.exit(2)
if len(attributes)==0:
emit_warning(id,"No <feat> attributes")
#happens for a few entries such as "Chippendale". We convert it anyway because at least we know the part-of-speech
if len(attributes)>6:
emit_skip(id,"Too many <feat>")
return
while len(attributes)<6:
attributes.append(0)
for formrepresentation in wordform.findall("FormRepresentation"):
writtenform=None
for feat in formrepresentation.findall("feat"):
att=feat.attrib["att"]
val=feat.attrib["val"]
if att=="writtenForm":
writtenform=val
raw_writtenform = writtenform.encode()
raw_wordform = struct.pack(">BBBBBB",attributes[0],attributes[1],attributes[2],attributes[3],attributes[4],attributes[5]) \
+ struct.pack(">B",len(raw_writtenform)) \
+ raw_writtenform
wordform_count += 1
raw_wordforms += raw_wordform
raw_morphological_unit_id = morphological_unit_id.encode()
raw_entry = struct.pack(">BBBB",part_of_speech,1,len(raw_morphological_unit_id),wordform_count) + raw_morphological_unit_id + raw_wordforms
output_file.write(raw_entry)
total_entry_count += 1
total_wordform_count += wordform_count
def do_convert_lexicon_file(input_file_name, output_file):
print("Opening and parsing %s"%(input_file_name))
tree = xml.etree.ElementTree.parse(input_file_name)
root = tree.getroot()
lexicon=root.find("Lexicon")
global total_entry_count, total_wordform_count
total_entry_count=0
total_wordform_count=0
for lexicalentry in lexicon.findall("LexicalEntry"):
process_lexcial_entry(lexicalentry,output_file)
print("Done")
print("\tlexical entries: %d"%total_entry_count)
print("\twordforms: %d"%total_wordform_count)
def do_convert_lexcialentry_file(input_file_name,output_file):
print("%s:"%input_file_name);
tree = xml.etree.ElementTree.parse(input_file_name)
root = tree.getroot()
process_lexcial_entry(root,output_file)
def do_convert_tree(input_tree_name, output_file):
global total_entry_count, total_wordform_count
total_entry_count=0
total_wordform_count=0
for (dirpath,dirnames,filenames) in os.walk(input_tree_name):
for filename in filenames:
if filename[-4:]==".xml":
full_file_name = dirpath+"/"+filename
do_convert_lexcialentry_file(full_file_name,output_file)
print("Done")
print("\tlexical entries: %d"%total_entry_count)
print("\twordforms: %d"%total_wordform_count)
parser = argparse.ArgumentParser(description="STO converter")
parser.add_argument("-i","--input_file",type=str,default=None)
parser.add_argument("-I","--input_tree",type=str,default=None)
parser.add_argument("-o","--output_file",type=str,required=True)
parser.add_argument("command",type=str,default="convert",nargs='?',choices=["convert","signature"])
args=parser.parse_args()
if args.command=="signature" and (args.input_file!=None or args.input_tree!=None):
print("input_file/input_tree cannot be specified when generating signature", file=sys.stderr)
sys.exit(1)
if args.command=="convert" and args.input_file==None and args.input_tree==None:
print("input_file/input_tree and output_file must be specified when generating converting", file=sys.stderr)
sys.exit(1)
output_file = open(args.output_file,"ab")
if args.command=="signature":
#simple
version_1_signature = ("parsed-sto-v2\n"+'\0'*80)[0:80]
output_file.write(version_1_signature.encode())
elif args.command=="convert":
if args.input_file:
do_convert_lexicon_file(args.input_file,output_file)
else:
do_convert_tree(args.input_tree,output_file)
else:
print("argh...", file=sys.stderr)
sys.exit(99)
output_file.close()
if len(warnings)>0:
print("===Warnings:", file=sys.stderr)
for (k,v) in warnings.iteritems():
print("%s: %s"%(k,v), file=sys.stderr)
if len(skips)>0:
print("===Skips:", file=sys.stderr)
for (k,v) in skips.iteritems():
print("%s: %s"%(k,v), file=sys.stderr)
sys.exit(0)