#!/usr/bin/python2

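# Converts an STO lexicon from its XML form (LexicalEntry / WordForm /
# FormRepresentation elements) into a compact binary format, appending the
# records to the given output file.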
from __future__ import print_function
import xml.etree.ElementTree
import struct
import argparse
import sys
import os

#hack to make utf-8 values work
reload(sys)
sys.setdefaultencoding("utf_8")

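# Maps STO partOfSpeech values to the one-byte codes stored in the binary output.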
part_of_speech_map={
    "adjective":1,
    "commonNoun":2,
    "conjunction":3,
    "demonstrativePronoun":4,
    "deponentVerb":5,
    "existentialPronoun":6,
    "generalAdverb":7,
    "indefinitePronoun":8,
    "infinitiveParticle":9,
    "interjection":10,
    "interrogativeRelativePronoun":11,
    "mainVerb":12,
    "numeral":13,
    "ordinalAdjective":14,
    "personalPronoun":15,
    "possessivePronoun":16,
    "preposition":17,
    "properNoun":18,
    "reciprocalPronoun":19,
    "unclassifiedParticle":20,
    "unspecified":21,
    "coordinatingConjunction":22,
    "subordinatingConjunction":23
}

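# Maps "<att>_<val>" wordform feature pairs to one-byte codes; up to six of these
# are stored with every wordform record.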
word_form_attribute_map={
    "adjectivalFunction_attributiveFunction": 1,
    "adjectivalFunction_predicativeFunction": 2,
    "adjectivalFunction_unspecified": 3,
    "case_genitiveCase": 4,
    "case_nominativeCase": 5,
    "case_unspecified": 6,
    "definiteness_definite": 7,
    "definiteness_indefinite": 8,
    "definiteness_unspecified": 9,
    "degree_comparative": 10,
    "degree_positive": 11,
    "degree_superlative": 12,
    "grammaticalGender_commonGender": 13,
    "grammaticalGender_neuter": 14,
    "grammaticalGender_unspecified": 15,
    "grammaticalNumber_plural": 16,
    "grammaticalNumber_singular": 17,
    "grammaticalNumber_unspecified": 18,
    "independentWord_no": 19,
    "independentWord_yes": 20,
    "officiallyApproved_no": 21,
    "officiallyApproved_yes": 22,
    "ownerNumber_plural": 23,
    "ownerNumber_singular": 24,
    "ownerNumber_unspecified": 25,
    "person_firstPerson": 26,
    "person_secondPerson": 27,
    "person_thirdPerson": 28,
    "reflexivity_no": 29,
    "reflexivity_yes": 30,
    "reflexivity_unspecified": 31,
    "register_formalRegister": 32,
    "register_OBSOLETE": 33,
    "tense_past": 34,
    "tense_present": 35,
    "transcategorization_transadjectival": 36,
    "transcategorization_transadverbial": 37,
    "transcategorization_transnominal": 38,
    "verbFormMood_gerundive": 39,
    "verbFormMood_imperative": 40,
    "verbFormMood_indicative": 41,
    "verbFormMood_infinitive": 42,
    "verbFormMood_participle": 43,
    "voice_activeVoice": 44,
    "voice_passiveVoice": 45
}

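# Running totals, plus per-entry warnings and skips that are reported on stderr
# at the end of a run.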
total_entry_count = None
total_wordform_count = None

warnings = {}
skips = {}

def emit_warning(id,what):
    global warnings
    warnings[id] = what

def emit_skip(id,why):
    global skips
    skips[id] = why

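# Convert a single <LexicalEntry> element to one binary record and append it to
# output_file. Entry layout (big-endian, as packed below):
#   1 byte part-of-speech code, 1 byte constant 1 (format version, presumably),
#   1 byte morphological-unit-id length, 1 byte wordform count,
#   followed by the UTF-8 morphological unit id and the wordform records.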
def process_lexcial_entry(lexicalentry,output_file):
    global total_entry_count, total_wordform_count

    part_of_speech=None
    id=None
    morphological_unit_id=None
    for feat in lexicalentry.findall("feat"):
        att=feat.attrib["att"]
        val=feat.attrib["val"]
        #print("lexicalentry.feat: att=%s val=%s"%(att,val))
        if att=="partOfSpeech":
            if val in part_of_speech_map:
                part_of_speech = part_of_speech_map[val]
            else:
                print("Unknown part_of_speech: ",val, file=sys.stderr)
                sys.exit(2)
        elif att=="id":
            id=val
        elif att=="morphologicalUnitId":
            morphological_unit_id=val
    #todo:decomposition
    if part_of_speech==None:
        emit_skip(id,"No partOfSpeech")
        return
    if morphological_unit_id==None:
        emit_skip(id,"No morphologicalUnitId")
        return

    raw_wordforms = b""
    wordform_count = 0

    for wordform in lexicalentry.findall("WordForm"):
        attributes=[]
        for feat in wordform.findall("feat"):
            att=feat.attrib["att"]
            val=feat.attrib["val"]
            #print("wordform.feat: att=%s val=%s"%(att,val))
            s=att+"_"+val
            if s in word_form_attribute_map:
                attributes.append(word_form_attribute_map[s])
            else:
                print("Entry %s: Unknown wordform feat: %s"%(id,s),file=sys.stderr)
                sys.exit(2)
        if len(attributes)==0:
            emit_warning(id,"No <feat> attributes")
            #happens for a few entries such as "Chippendale". We convert it anyway because at least we know the part-of-speech
        if len(attributes)>6:
            emit_skip(id,"Too many <feat>")
            return
        while len(attributes)<6:
            attributes.append(0)
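        # Emit one wordform record per FormRepresentation, each tagged with this
        # WordForm's attribute codes.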
        for formrepresentation in wordform.findall("FormRepresentation"):
            writtenform=None
            for feat in formrepresentation.findall("feat"):
                att=feat.attrib["att"]
                val=feat.attrib["val"]
                if att=="writtenForm":
                    writtenform=val

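            # Wordform record: 6 attribute-code bytes, a 1-byte length, then the
            # UTF-8 written form.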
            raw_writtenform = writtenform.encode()
            raw_wordform = struct.pack(">BBBBBB",attributes[0],attributes[1],attributes[2],attributes[3],attributes[4],attributes[5]) \
                + struct.pack(">B",len(raw_writtenform)) \
                + raw_writtenform
            wordform_count += 1
            raw_wordforms += raw_wordform

    raw_morphological_unit_id = morphological_unit_id.encode()
    raw_entry = struct.pack(">BBBB",part_of_speech,1,len(raw_morphological_unit_id),wordform_count) + raw_morphological_unit_id + raw_wordforms
    output_file.write(raw_entry)

    total_entry_count += 1
    total_wordform_count += wordform_count


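# Convert a full STO lexicon file: a <Lexicon> element containing many <LexicalEntry> elements.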
def do_convert_lexicon_file(input_file_name, output_file):
    print("Opening and parsing %s"%(input_file_name))
    tree = xml.etree.ElementTree.parse(input_file_name)
    root = tree.getroot()
    lexicon=root.find("Lexicon")
    global total_entry_count, total_wordform_count
    total_entry_count=0
    total_wordform_count=0
    for lexicalentry in lexicon.findall("LexicalEntry"):
        process_lexcial_entry(lexicalentry,output_file)

    print("Done")
    print("\tlexical entries: %d"%total_entry_count)
    print("\twordforms: %d"%total_wordform_count)


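# Convert an XML file whose root element is a single <LexicalEntry>.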
def do_convert_lexcialentry_file(input_file_name,output_file):
    print("%s:"%input_file_name)
    tree = xml.etree.ElementTree.parse(input_file_name)
    root = tree.getroot()
    process_lexcial_entry(root,output_file)

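# Walk a directory tree and convert every .xml file as a single-entry file.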
def do_convert_tree(input_tree_name, output_file):
    global total_entry_count, total_wordform_count
    total_entry_count=0
    total_wordform_count=0
    for (dirpath,dirnames,filenames) in os.walk(input_tree_name):
        for filename in filenames:
            if filename[-4:]==".xml":
                full_file_name = dirpath+"/"+filename
                do_convert_lexcialentry_file(full_file_name,output_file)
    print("Done")
    print("\tlexical entries: %d"%total_entry_count)
    print("\twordforms: %d"%total_wordform_count)


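# Command-line interface: "signature" writes the 80-byte file header
# ("parsed-sto-v2" padded with NUL bytes); "convert" appends the binary entries.
# Typical sequence (file and directory names are only examples):
#   <script> -o sto.dat signature
#   <script> -o sto.dat -I sto_xml_tree/ convert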
parser = argparse.ArgumentParser(description="STO converter")
parser.add_argument("-i","--input_file",type=str,default=None)
parser.add_argument("-I","--input_tree",type=str,default=None)
parser.add_argument("-o","--output_file",type=str,required=True)
parser.add_argument("command",type=str,default="convert",nargs='?',choices=["convert","signature"])

args=parser.parse_args()

if args.command=="signature" and (args.input_file!=None or args.input_tree!=None):
|
|
print("input_file/input_tree cannot be specified when generating signature", file=sys.stderr)
|
|
sys.exit(1)
|
|
if args.command=="convert" and args.input_file==None and args.input_tree==None:
|
|
print("input_file/input_tree and output_file must be specified when generating converting", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
|
|
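# The output file is opened for appending, presumably so that a "signature" run and
# subsequent "convert" runs accumulate into the same file.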
output_file = open(args.output_file,"ab")
if args.command=="signature":
    #simple
    version_1_signature = ("parsed-sto-v2\n"+'\0'*80)[0:80]
    output_file.write(version_1_signature.encode())
elif args.command=="convert":
    if args.input_file:
        do_convert_lexicon_file(args.input_file,output_file)
    else:
        do_convert_tree(args.input_tree,output_file)
else:
    print("argh...", file=sys.stderr)
    sys.exit(99)

output_file.close()

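# Report the per-entry diagnostics collected during conversion.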
if len(warnings)>0:
    print("===Warnings:", file=sys.stderr)
    for (k,v) in warnings.iteritems():
        print("%s: %s"%(k,v), file=sys.stderr)
if len(skips)>0:
    print("===Skips:", file=sys.stderr)
    for (k,v) in skips.iteritems():
        print("%s: %s"%(k,v), file=sys.stderr)

sys.exit(0)