privacore-open-source-searc.../sto/sto.cpp

530 lines
16 KiB
C++

#include "sto.h"
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <string.h>
#include <algorithm>
//static const char version_1_signature[80] = "parsed-sto-v2\n";
static const char version_2_signature[80] = "parsed-sto-v2\n";
std::vector<const sto::WordForm *> sto::LexicalEntry::query_all_explicit_word_forms() const {
std::vector<const WordForm*> entries;
const char *p = reinterpret_cast<const char*>(query_first_explicit_word_form());
for(unsigned i=0; i<explicit_word_form_count; i++) {
const WordForm *e = reinterpret_cast<const WordForm*>(p);
entries.push_back(e);
p += e->size();
}
return entries;
}
const sto::WordForm *sto::LexicalEntry::find_first_wordform(const std::string &word) const {
const char *p = reinterpret_cast<const char*>(query_first_explicit_word_form());
for(unsigned i=0; i<explicit_word_form_count; i++) {
const WordForm *e = reinterpret_cast<const WordForm*>(p);
if(e->written_form_length==word.length() &&
memcmp(e->written_form,word.data(),e->written_form_length)==0)
return e;
p += e->size();
}
return NULL;
}
//Find the base form of the lexical entry. That means:
// verbs: infinitive mood, active voice
// nouns: indefinite singular nominative
// adjectives: positive, common gender
// other: <null>
const sto::WordForm *sto::LexicalEntry::find_base_wordform() const {
const char *p = reinterpret_cast<const char*>(query_first_explicit_word_form());
for(unsigned i=0; i<explicit_word_form_count; i++) {
const WordForm *e = reinterpret_cast<const WordForm*>(p);
switch(part_of_speech) {
case part_of_speech_t::deponentVerb:
case part_of_speech_t::mainVerb: {
if(e->has_attribute(word_form_attribute_t::verbFormMood_infinitive) &&
e->has_attribute(word_form_attribute_t::voice_activeVoice))
return e;
break;
}
case part_of_speech_t::commonNoun:
case part_of_speech_t::properNoun: {
if((e->has_attribute(word_form_attribute_t::definiteness_indefinite) || e->has_attribute(word_form_attribute_t::definiteness_unspecified)) &&
(e->has_attribute(word_form_attribute_t::grammaticalNumber_singular) || e->has_attribute(word_form_attribute_t::grammaticalNumber_unspecified)) &&
(e->has_attribute(word_form_attribute_t::case_unspecified) || e->has_attribute(word_form_attribute_t::case_nominativeCase)))
return e;
break;
}
default:
return NULL;
}
p += e->size();
}
return NULL;
}
bool sto::Lexicon::load(const std::string &filename) {
unload();
int fd = open(filename.c_str(), O_RDONLY);
if(fd<0)
return false;
struct stat st;
if(fstat(fd,&st)!=0) {
::close(fd);
return false;
}
if((size_t)st.st_size<sizeof(version_2_signature)) {
::close(fd);
return false;
}
mapped_memory_start = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
if(mapped_memory_start==MAP_FAILED) {
::close(fd);
return false;
}
::close(fd);
mapped_memory_size = st.st_size;
(void)madvise(mapped_memory_start, mapped_memory_size, MADV_WILLNEED);
if(memcmp(mapped_memory_start,version_2_signature,sizeof(version_2_signature))!=0) {
unload();
return false;
}
//parse and index the entries
//see sto_structure.txt for details
size_t estimated_entries = mapped_memory_size/171;
size_t entries_to_reserve = (size_t)(estimated_entries*1.25);
entries.reserve(entries_to_reserve);
morphological_unit_id_entries.reserve(entries_to_reserve);
const char *start = reinterpret_cast<const char*>(mapped_memory_start);
const char *end = start + mapped_memory_size;
const char *p = start + sizeof(version_2_signature);
while(p<end) {
const LexicalEntry *le = reinterpret_cast<const LexicalEntry*>(p);
p = reinterpret_cast<const char*>(le->query_first_explicit_word_form());
for(unsigned i=0; i<le->explicit_word_form_count; i++) {
const WordForm *wf = reinterpret_cast<const WordForm*>(p);
const char *p2 = p+wf->size();
if(p2>end)
return false;
entries.emplace_back(wf->written_form,wf->written_form_length,le);
p = p2;
}
morphological_unit_id_entries.emplace_back(le->query_morphological_unit_id(),le->morphological_unit_id_len,le);
}
sort(entries);
sort(morphological_unit_id_entries);
return true;
}
bool sto::Lexicon::MapEntry::compare(const MapEntry &me1, const MapEntry &me2) {
if(me1.length<me2.length) {
int r = memcmp(me1.str,me2.str,me1.length);
return r<=0;
} else if(me1.length>me2.length) {
int r = memcmp(me1.str,me2.str,me2.length);
return r<0;
} else {
return memcmp(me1.str,me2.str,me1.length)<0;
}
}
void sto::Lexicon::sort(std::vector<MapEntry> &v) {
std::sort(v.begin(),v.end(),MapEntry::compare);
}
void sto::Lexicon::unload() {
if(mapped_memory_size!=0) {
(void)munmap(mapped_memory_start,mapped_memory_size);
mapped_memory_start = NULL;
mapped_memory_size = 0;
}
entries.clear();
morphological_unit_id_entries.clear();
}
const sto::LexicalEntry *sto::Lexicon::lookup(const std::string &word) const {
MapEntry me_word(word.data(),word.length(),0);
auto iter = std::lower_bound(entries.begin(),entries.end(),me_word,MapEntry::compare);
if(iter!=entries.end() && iter->length==word.length() && memcmp(iter->str,word.data(),iter->length)==0)
return iter->entry;
else
return 0;
}
std::vector<const sto::LexicalEntry *> sto::Lexicon::query_matches(const std::string &word) const {
MapEntry me_word(word.data(),word.length(),0);
auto range = std::equal_range(entries.begin(),entries.end(), me_word, MapEntry::compare);
std::vector<const LexicalEntry *> entries;
for(auto iter=range.first; iter!=range.second; ++iter)
entries.push_back(iter->entry);
return entries;
}
const sto::LexicalEntry *sto::Lexicon::first_entry() const {
const char *start = reinterpret_cast<const char*>(mapped_memory_start);
const char *p = start + sizeof(version_2_signature);
return reinterpret_cast<const LexicalEntry*>(p);
}
const sto::LexicalEntry *sto::Lexicon::next_entry(const LexicalEntry *le) const {
const char *p = reinterpret_cast<const char*>(le);
const char *start = reinterpret_cast<const char*>(mapped_memory_start);
const char *end = start + mapped_memory_size;
if(p<start || p>=end)
return NULL;
p = reinterpret_cast<const char*>(le->query_first_explicit_word_form());
for(unsigned i=0; i<le->explicit_word_form_count; i++) {
const WordForm *wf = reinterpret_cast<const WordForm*>(p);
const char *p2 = p+wf->size();
if(p2>end)
return NULL;
p = p2;
}
if(p<end)
return reinterpret_cast<const LexicalEntry*>(p);
else
return NULL;
}
std::vector<const sto::LexicalEntry *> sto::Lexicon::query_lexical_entries_with_same_morphological_unit_id(const sto::LexicalEntry *le) const {
MapEntry me_word(le->query_morphological_unit_id(),le->morphological_unit_id_len,0);
std::vector<const sto::LexicalEntry *> v;
auto range = std::equal_range(morphological_unit_id_entries.begin(), morphological_unit_id_entries.end(), me_word, MapEntry::compare);
for(auto iter=range.first; iter!=range.second; ++iter) {
v.push_back(iter->entry);
}
return v;
}
#ifdef UNITTEST
#include <assert.h>
#include <stdio.h>
using namespace sto;
int main(void) {
//plain ctor
{
Lexicon l;
assert(l.lookup("foo")==NULL);
auto v(l.query_matches("foo"));
assert(v.empty());
}
//nonexisting file
{
::unlink("sto.unittest");
Lexicon l;
assert(!l.load("sto.unittest"));
}
//empty file
{
int fd = open("sto.unittest",O_WRONLY|O_CREAT|O_TRUNC,0666);
close(fd);
Lexicon l;
assert(!l.load("sto.unittest"));
}
//file with wrong signature
{
int fd = open("sto.unittest",O_WRONLY|O_CREAT|O_TRUNC,0666);
write(fd,"hello world",11);
for(int i=0; i<10; i++)
write(fd,"0123456789abcdef",16);
close(fd);
Lexicon l;
assert(!l.load("sto.unittest"));
}
//file with just the signature
{
int fd = open("sto.unittest",O_WRONLY|O_CREAT|O_TRUNC,0666);
write(fd,version_2_signature,sizeof(version_2_signature));
close(fd);
Lexicon l;
assert(l.load("sto.unittest"));
assert(l.lookup("foo")==NULL);
}
//file with one lexical entry
//0: foo foos
{
int fd = open("sto.unittest",O_WRONLY|O_CREAT|O_TRUNC,0666);
char tmp[16];
write(fd,version_2_signature,sizeof(version_2_signature));
//le#0
tmp[0] = (char)part_of_speech_t::commonNoun;
write(fd, tmp, 1);
tmp[0] = (char)word_form_type_t::wordFormsExplicit;
write(fd, tmp, 1);
write(fd, "\006",1); //morph-unit-id len
write(fd,"\002",1); //wordforms
write(fd, "morph1",6); //morph-unit-id
//le#0:wf#0
tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
tmp[0]=(char)word_form_attribute_t::degree_positive;
write(fd,tmp,6);
write(fd,"\003foo",4);
//le#0:wf#1
tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
tmp[0]=(char)word_form_attribute_t::case_nominativeCase;
write(fd,tmp,6);
write(fd,"\004foos",5);
close(fd);
Lexicon l;
assert(l.load("sto.unittest"));
assert(l.lookup("foo")!=NULL);
assert(l.lookup("foos")!=NULL);
assert(l.lookup("fooz")==NULL);
auto e0(l.lookup("foo"));
auto e1(l.lookup("foos"));
assert(e0==e1);
assert(e0->part_of_speech==part_of_speech_t::commonNoun);
auto wf0(e0->find_first_wordform("foo"));
assert(wf0);
assert(wf0->has_attribute(word_form_attribute_t::none));
assert(wf0->has_attribute(word_form_attribute_t::degree_positive));
assert(!wf0->has_attribute(word_form_attribute_t::person_thirdPerson));
auto wf1(e1->find_first_wordform("foos"));
assert(wf1);
assert(wf1->has_attribute(word_form_attribute_t::none));
assert(wf1->has_attribute(word_form_attribute_t::case_nominativeCase));
assert(!wf1->has_attribute(word_form_attribute_t::person_thirdPerson));
auto wf2(e0->find_first_wordform("xxxx"));
assert(!wf2);
}
//file with three lexical entries
//0: foo foos
//1: boo boos
//2: goo foo boo
{
int fd = open("sto.unittest",O_WRONLY|O_CREAT|O_TRUNC,0666);
char tmp[16];
write(fd,version_2_signature,sizeof(version_2_signature));
//le#0
tmp[0] = (char)part_of_speech_t::commonNoun;
write(fd, tmp, 1);
tmp[0] = (char)word_form_type_t::wordFormsExplicit;
write(fd, tmp, 1);
write(fd,"\006",1); //morph-unit-id len
write(fd,"\002",1); //#wordforms
write(fd, "morph1",6); //morph-unit-id
//le#0:wf#0
tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
write(fd,tmp,6);
write(fd,"\003foo",4);
//le#0:wf#1
tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
tmp[0]=(char)word_form_attribute_t::case_nominativeCase;
write(fd,tmp,6);
write(fd,"\004foos",5);
//le#1
tmp[0] = (char)part_of_speech_t::commonNoun;
write(fd, tmp, 1);
tmp[0] = (char)word_form_type_t::wordFormsExplicit;
write(fd, tmp, 1);
write(fd,"\006",1); //morph-unit-id len
write(fd,"\002",1); //#wordforms
write(fd, "morph2",6); //morph-unit-id
//le#1:wf#0
tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
write(fd,tmp,6);
write(fd,"\003boo",4);
//le#1:wf#1
tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
tmp[0]=(char)word_form_attribute_t::case_nominativeCase;
write(fd,tmp,6);
write(fd,"\004boos",5);
//le#2
tmp[0] = (char)part_of_speech_t::commonNoun;
write(fd, tmp, 1);
tmp[0] = (char)word_form_type_t::wordFormsExplicit;
write(fd, tmp, 1);
write(fd,"\006",1); //morph-unit-id len
write(fd,"\003",1); //#wordforms
write(fd, "morph1",6); //morph-unit-id
//le#2:wf#0
tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
write(fd,tmp,6);
write(fd,"\003goo",4);
//le#2:wf#1
tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
tmp[0]=(char)word_form_attribute_t::case_nominativeCase;
write(fd,tmp,6);
write(fd,"\003foo",4);
//le#2:wf#2
tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
tmp[0]=(char)word_form_attribute_t::case_nominativeCase;
write(fd,tmp,6);
write(fd,"\003boo",4);
close(fd);
Lexicon l;
assert(l.load("sto.unittest"));
assert(l.lookup("foo")!=NULL);
assert(l.lookup("foos")!=NULL);
assert(l.lookup("boo")!=NULL);
assert(l.lookup("foos")!=NULL);
assert(l.lookup("goo")!=NULL);
auto v0(l.query_matches("foo"));
assert(v0.size()==2);
auto v1(l.query_matches("foos"));
assert(v1.size()==1);
auto v2(l.query_matches("boo"));
assert(v2.size()==2);
auto v3(l.query_matches("boos"));
assert(v3.size()==1);
auto v4(l.query_matches("goo"));
assert(v4.size()==1);
assert(v0[0]==v1[0] || v0[1]==v1[0]);
auto m0 = l.query_lexical_entries_with_same_morphological_unit_id(l.lookup("foos"));
assert(m0.size()==2);
assert(m0[0]!=m0[1]);
auto m1 = l.query_lexical_entries_with_same_morphological_unit_id(l.lookup("boos"));
assert(m1.size()==1);
assert(m1[0]==l.lookup("boos"));
}
//file with three entries, for testing LexicalEntry::find_base_wordform()
// verb: aaa1(imperative mood, active voice), aaa2(indicative mood, passive voice), aaa2(indicative mood, active voice)
// verb: bbb1(imperative mood, active voice), bbb2(indicative mood, passive voice)
// noun: ccc1(definite, singular, unspecified case), ccc1(indefinite, singular, unspecified case)
{
int fd = open("sto.unittest",O_WRONLY|O_CREAT|O_TRUNC,0666);
char tmp[16];
write(fd,version_2_signature,sizeof(version_2_signature));
//le#0
tmp[0] = (char)part_of_speech_t::mainVerb;
write(fd, tmp, 1);
tmp[0] = (char)word_form_type_t::wordFormsExplicit;
write(fd, tmp, 1);
write(fd,"\003",1); //morph-unit-id len
write(fd,"\003",1); //#wordforms
write(fd, "aaa",3); //morph-unit-id
//le#0:wf#0
tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
tmp[0]=(char)word_form_attribute_t::verbFormMood_imperative;
tmp[1]=(char)word_form_attribute_t::voice_activeVoice;
write(fd,tmp,6);
write(fd,"\004aaa1",5);
//le#0:wf#1
tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
tmp[0]=(char)word_form_attribute_t::verbFormMood_infinitive;
tmp[1]=(char)word_form_attribute_t::voice_passiveVoice;
write(fd,tmp,6);
write(fd,"\004aaa2",5);
//le#0:wf#1
tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
tmp[0]=(char)word_form_attribute_t::verbFormMood_infinitive;
tmp[1]=(char)word_form_attribute_t::voice_activeVoice;
write(fd,tmp,6);
write(fd,"\004aaa3",5);
//le#1
tmp[0] = (char)part_of_speech_t::mainVerb;
write(fd, tmp, 1);
tmp[0] = (char)word_form_type_t::wordFormsExplicit;
write(fd, tmp, 1);
write(fd,"\003",1); //morph-unit-id len
write(fd,"\002",1); //#wordforms
write(fd, "bbb",3); //morph-unit-id
//le#0:wf#0
tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
tmp[0]=(char)word_form_attribute_t::verbFormMood_imperative;
tmp[1]=(char)word_form_attribute_t::voice_activeVoice;
write(fd,tmp,6);
write(fd,"\004bbb1",5);
//le#0:wf#1
tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
tmp[0]=(char)word_form_attribute_t::verbFormMood_infinitive;
tmp[1]=(char)word_form_attribute_t::voice_passiveVoice;
write(fd,tmp,6);
write(fd,"\004bbb2",5);
//le#2
tmp[0] = (char)part_of_speech_t::commonNoun;
write(fd, tmp, 1);
tmp[0] = (char)word_form_type_t::wordFormsExplicit;
write(fd, tmp, 1);
write(fd,"\003",1); //morph-unit-id len
write(fd,"\002",1); //#wordforms
write(fd, "ccc",3); //morph-unit-id
//le#0:wf#0
tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
tmp[0]=(char)word_form_attribute_t::definiteness_definite;
tmp[1]=(char)word_form_attribute_t::grammaticalNumber_singular;
tmp[2]=(char)word_form_attribute_t::case_unspecified;
write(fd,tmp,6);
write(fd,"\004ccc1",5);
//le#0:wf#1
tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
tmp[0]=(char)word_form_attribute_t::definiteness_indefinite;
tmp[1]=(char)word_form_attribute_t::grammaticalNumber_singular;
tmp[2]=(char)word_form_attribute_t::case_unspecified;
write(fd,tmp,6);
write(fd,"\004ccc2",5);
close(fd);
Lexicon l;
assert(l.load("sto.unittest"));
assert(l.lookup("aaa1")!=NULL);
assert(l.lookup("aaa2")!=NULL);
assert(l.lookup("aaa3")!=NULL);
const sto::LexicalEntry *le1 = l.lookup("aaa1");
const WordForm *wf1 = le1->find_base_wordform();
assert(wf1);
assert(std::string(wf1->written_form,wf1->written_form_length)=="aaa3");
const sto::LexicalEntry *le2 = l.lookup("bbb1");
const WordForm *wf2 = le2->find_base_wordform();
assert(!wf2);
const sto::LexicalEntry *le3 = l.lookup("ccc1");
const WordForm *wf3 = le3->find_base_wordform();
assert(wf3);
assert(std::string(wf3->written_form,wf3->written_form_length)=="ccc2");
}
}
#endif