privacore-open-source-searc…/sto/sto.cpp

#include "sto.h"
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <string.h>
#include <algorithm>


//static const char version_1_signature[80] = "parsed-sto-v2\n";
static const char version_2_signature[80] = "parsed-sto-v2\n";

std::vector<const sto::WordForm *> sto::LexicalEntry::query_all_explicit_word_forms() const {
	std::vector<const WordForm*> entries;
	const char *p = reinterpret_cast<const char*>(query_first_explicit_word_form());
	for(unsigned i=0; i<explicit_word_form_count; i++) {
		const WordForm *e = reinterpret_cast<const WordForm*>(p);
		entries.push_back(e);
		p += e->size();
	}
	return entries;
}


const sto::WordForm *sto::LexicalEntry::find_first_wordform(const std::string &word) const {
	const char *p = reinterpret_cast<const char*>(query_first_explicit_word_form());
	for(unsigned i=0; i<explicit_word_form_count; i++) {
		const WordForm *e = reinterpret_cast<const WordForm*>(p);
		if(e->written_form_length==word.length() &&
		   memcmp(e->written_form,word.data(),e->written_form_length)==0)
			return e;
		p += e->size();
	}
	return NULL;
}


//Find the base form of the lexical entry. That means:
//  verbs: infinitive mood, active voice
//  nouns: indefinite singular nominative
//  adjectives: positive, common gender
//  other: <null>
const sto::WordForm *sto::LexicalEntry::find_base_wordform() const {
	const char *p = reinterpret_cast<const char*>(query_first_explicit_word_form());
	for(unsigned i=0; i<explicit_word_form_count; i++) {
		const WordForm *e = reinterpret_cast<const WordForm*>(p);
		switch(part_of_speech) {
			case part_of_speech_t::deponentVerb:
			case part_of_speech_t::mainVerb: {
				if(e->has_attribute(word_form_attribute_t::verbFormMood_infinitive) &&
				   e->has_attribute(word_form_attribute_t::voice_activeVoice))
					return e;
				break;
			}
			case part_of_speech_t::commonNoun:
			case part_of_speech_t::properNoun: {
				if((e->has_attribute(word_form_attribute_t::definiteness_indefinite) || e->has_attribute(word_form_attribute_t::definiteness_unspecified)) &&
				   (e->has_attribute(word_form_attribute_t::grammaticalNumber_singular) || e->has_attribute(word_form_attribute_t::grammaticalNumber_unspecified)) &&
				   (e->has_attribute(word_form_attribute_t::case_unspecified) || e->has_attribute(word_form_attribute_t::case_nominativeCase)))
					return e;
				break;
			}
			default:
				return NULL;
		}
		p += e->size();
	}
	return NULL;
}


bool sto::Lexicon::load(const std::string &filename) {
	unload();

	int fd = open(filename.c_str(), O_RDONLY);
	if(fd<0)
		return false;

	struct stat st;
	if(fstat(fd,&st)!=0) {
		::close(fd);
		return false;
	}
	if((size_t)st.st_size<sizeof(version_2_signature)) {
		::close(fd);
		return false;
	}

	mapped_memory_start = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	if(mapped_memory_start==MAP_FAILED) {
		::close(fd);
		return false;
	}
	::close(fd);

	mapped_memory_size = st.st_size;

	(void)madvise(mapped_memory_start, mapped_memory_size, MADV_WILLNEED);

	if(memcmp(mapped_memory_start,version_2_signature,sizeof(version_2_signature))!=0) {
		unload();
		return false;
	}

	//parse and index the entries
	//see sto_structure.txt for details
	size_t estimated_entries = mapped_memory_size/171;
	size_t entries_to_reserve = (size_t)(estimated_entries*1.25);
	entries.reserve(entries_to_reserve);
	morphological_unit_id_entries.reserve(entries_to_reserve);
	const char *start = reinterpret_cast<const char*>(mapped_memory_start);
	const char *end = start + mapped_memory_size;
	const char *p = start + sizeof(version_2_signature);
	while(p<end) {
		const LexicalEntry *le = reinterpret_cast<const LexicalEntry*>(p);
		p = reinterpret_cast<const char*>(le->query_first_explicit_word_form());
		for(unsigned i=0; i<le->explicit_word_form_count; i++) {
			const WordForm *wf = reinterpret_cast<const WordForm*>(p);
			const char *p2 = p+wf->size();
			if(p2>end)
				return false;

			entries.emplace_back(wf->written_form,wf->written_form_length,le);
			p = p2;
		}
		morphological_unit_id_entries.emplace_back(le->query_morphological_unit_id(),le->morphological_unit_id_len,le);
	}

	sort(entries);
	sort(morphological_unit_id_entries);

	return true;
}


bool sto::Lexicon::MapEntry::compare(const MapEntry &me1, const MapEntry &me2) {
	if(me1.length<me2.length) {
		int r = memcmp(me1.str,me2.str,me1.length);
		return r<=0;
	} else if(me1.length>me2.length) {
		int r = memcmp(me1.str,me2.str,me2.length);
		return r<0;
	} else {
		return memcmp(me1.str,me2.str,me1.length)<0;
	}
}

void sto::Lexicon::sort(std::vector<MapEntry> &v) {
	std::sort(v.begin(),v.end(),MapEntry::compare);
}


void sto::Lexicon::unload() {
	if(mapped_memory_size!=0) {
		(void)munmap(mapped_memory_start,mapped_memory_size);
		mapped_memory_start = NULL;
		mapped_memory_size = 0;
	}
	entries.clear();
	morphological_unit_id_entries.clear();
}


const sto::LexicalEntry *sto::Lexicon::lookup(const std::string &word) const {
	MapEntry me_word(word.data(),word.length(),0);
	auto iter = std::lower_bound(entries.begin(),entries.end(),me_word,MapEntry::compare);
	if(iter!=entries.end() && iter->length==word.length() && memcmp(iter->str,word.data(),iter->length)==0)
		return iter->entry;
	else
		return 0;
}


std::vector<const sto::LexicalEntry *> sto::Lexicon::query_matches(const std::string &word) const {
	MapEntry me_word(word.data(),word.length(),0);
	auto range = std::equal_range(entries.begin(),entries.end(), me_word, MapEntry::compare);
	std::vector<const LexicalEntry *> entries;
	for(auto iter=range.first; iter!=range.second; ++iter)
		entries.push_back(iter->entry);
	return entries;
}


const sto::LexicalEntry *sto::Lexicon::first_entry() const {
	const char *start = reinterpret_cast<const char*>(mapped_memory_start);
	const char *p = start + sizeof(version_2_signature);
	return reinterpret_cast<const LexicalEntry*>(p);
}


const sto::LexicalEntry *sto::Lexicon::next_entry(const LexicalEntry *le) const {
	const char *p = reinterpret_cast<const char*>(le);
	const char *start = reinterpret_cast<const char*>(mapped_memory_start);
	const char *end = start + mapped_memory_size;
	if(p<start || p>=end)
		return NULL;
	p = reinterpret_cast<const char*>(le->query_first_explicit_word_form());
	for(unsigned i=0; i<le->explicit_word_form_count; i++) {
		const WordForm *wf = reinterpret_cast<const WordForm*>(p);
		const char *p2 = p+wf->size();
		if(p2>end)
			return NULL;
		p = p2;
	}
	if(p<end)
		return reinterpret_cast<const LexicalEntry*>(p);
	else
		return NULL;
}


std::vector<const sto::LexicalEntry *> sto::Lexicon::query_lexical_entries_with_same_morphological_unit_id(const sto::LexicalEntry *le) const {
	MapEntry me_word(le->query_morphological_unit_id(),le->morphological_unit_id_len,0);
	std::vector<const sto::LexicalEntry *> v;
	auto range = std::equal_range(morphological_unit_id_entries.begin(), morphological_unit_id_entries.end(), me_word, MapEntry::compare);
	for(auto iter=range.first; iter!=range.second; ++iter) {
		v.push_back(iter->entry);
	}
	return v;
}


#ifdef UNITTEST
#include <assert.h>
#include <stdio.h>

using namespace sto;

int main(void) {
	//plain ctor
	{
		Lexicon l;
		assert(l.lookup("foo")==NULL);
		auto v(l.query_matches("foo"));
		assert(v.empty());
	}

	//nonexisting file
	{
		::unlink("sto.unittest");
		Lexicon l;
		assert(!l.load("sto.unittest"));
	}

	//empty file
	{
		int fd = open("sto.unittest",O_WRONLY|O_CREAT|O_TRUNC,0666);
		close(fd);
		Lexicon l;
		assert(!l.load("sto.unittest"));
	}

	//file with wrong signature
	{
		int fd = open("sto.unittest",O_WRONLY|O_CREAT|O_TRUNC,0666);
		write(fd,"hello world",11);
		for(int i=0; i<10; i++)
			write(fd,"0123456789abcdef",16);
		close(fd);
		Lexicon l;
		assert(!l.load("sto.unittest"));
	}

	//file with just the signature
	{
		int fd = open("sto.unittest",O_WRONLY|O_CREAT|O_TRUNC,0666);
		write(fd,version_2_signature,sizeof(version_2_signature));
		close(fd);
		Lexicon l;
		assert(l.load("sto.unittest"));
		assert(l.lookup("foo")==NULL);
	}

	//file with one lexical entry
	//0: foo foos
	{
		int fd = open("sto.unittest",O_WRONLY|O_CREAT|O_TRUNC,0666);
		char tmp[16];
		write(fd,version_2_signature,sizeof(version_2_signature));
		//le#0
		tmp[0] = (char)part_of_speech_t::commonNoun;
		write(fd, tmp, 1);
		tmp[0] = (char)word_form_type_t::wordFormsExplicit;
		write(fd, tmp, 1);
		write(fd, "\006",1); //morph-unit-id len
		write(fd,"\002",1); //wordforms
		write(fd, "morph1",6); //morph-unit-id
		//le#0:wf#0
		tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
		tmp[0]=(char)word_form_attribute_t::degree_positive;
		write(fd,tmp,6);
		write(fd,"\003foo",4);
		//le#0:wf#1
		tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
		tmp[0]=(char)word_form_attribute_t::case_nominativeCase;
		write(fd,tmp,6);
		write(fd,"\004foos",5);
		close(fd);
		Lexicon l;
		assert(l.load("sto.unittest"));
		assert(l.lookup("foo")!=NULL);
		assert(l.lookup("foos")!=NULL);
		assert(l.lookup("fooz")==NULL);
		auto e0(l.lookup("foo"));
		auto e1(l.lookup("foos"));
		assert(e0==e1);
		assert(e0->part_of_speech==part_of_speech_t::commonNoun);
		auto wf0(e0->find_first_wordform("foo"));
		assert(wf0);
		assert(wf0->has_attribute(word_form_attribute_t::none));
		assert(wf0->has_attribute(word_form_attribute_t::degree_positive));
		assert(!wf0->has_attribute(word_form_attribute_t::person_thirdPerson));
		auto wf1(e1->find_first_wordform("foos"));
		assert(wf1);
		assert(wf1->has_attribute(word_form_attribute_t::none));
		assert(wf1->has_attribute(word_form_attribute_t::case_nominativeCase));
		assert(!wf1->has_attribute(word_form_attribute_t::person_thirdPerson));
		auto wf2(e0->find_first_wordform("xxxx"));
		assert(!wf2);
	}


	//file with three lexical entries
	//0: foo foos
	//1: boo boos
	//2: goo foo boo
	{
		int fd = open("sto.unittest",O_WRONLY|O_CREAT|O_TRUNC,0666);
		char tmp[16];
		write(fd,version_2_signature,sizeof(version_2_signature));

		//le#0
		tmp[0] = (char)part_of_speech_t::commonNoun;
		write(fd, tmp, 1);
		tmp[0] = (char)word_form_type_t::wordFormsExplicit;
		write(fd, tmp, 1);
		write(fd,"\006",1); //morph-unit-id len
		write(fd,"\002",1); //#wordforms
		write(fd, "morph1",6); //morph-unit-id
		//le#0:wf#0
		tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
		write(fd,tmp,6);
		write(fd,"\003foo",4);
		//le#0:wf#1
		tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
		tmp[0]=(char)word_form_attribute_t::case_nominativeCase;
		write(fd,tmp,6);
		write(fd,"\004foos",5);

		//le#1
		tmp[0] = (char)part_of_speech_t::commonNoun;
		write(fd, tmp, 1);
		tmp[0] = (char)word_form_type_t::wordFormsExplicit;
		write(fd, tmp, 1);
		write(fd,"\006",1); //morph-unit-id len
		write(fd,"\002",1); //#wordforms
		write(fd, "morph2",6); //morph-unit-id
		//le#1:wf#0
		tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
		write(fd,tmp,6);
		write(fd,"\003boo",4);
		//le#1:wf#1
		tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
		tmp[0]=(char)word_form_attribute_t::case_nominativeCase;
		write(fd,tmp,6);
		write(fd,"\004boos",5);

		//le#2
		tmp[0] = (char)part_of_speech_t::commonNoun;
		write(fd, tmp, 1);
		tmp[0] = (char)word_form_type_t::wordFormsExplicit;
		write(fd, tmp, 1);
		write(fd,"\006",1); //morph-unit-id len
		write(fd,"\003",1); //#wordforms
		write(fd, "morph1",6); //morph-unit-id
		//le#2:wf#0
		tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
		write(fd,tmp,6);
		write(fd,"\003goo",4);
		//le#2:wf#1
		tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
		tmp[0]=(char)word_form_attribute_t::case_nominativeCase;
		write(fd,tmp,6);
		write(fd,"\003foo",4);
		//le#2:wf#2
		tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
		tmp[0]=(char)word_form_attribute_t::case_nominativeCase;
		write(fd,tmp,6);
		write(fd,"\003boo",4);

		close(fd);

		Lexicon l;
		assert(l.load("sto.unittest"));
		assert(l.lookup("foo")!=NULL);
		assert(l.lookup("foos")!=NULL);
		assert(l.lookup("boo")!=NULL);
		assert(l.lookup("foos")!=NULL);
		assert(l.lookup("goo")!=NULL);

		auto v0(l.query_matches("foo"));
		assert(v0.size()==2);
		auto v1(l.query_matches("foos"));
		assert(v1.size()==1);
		auto v2(l.query_matches("boo"));
		assert(v2.size()==2);
		auto v3(l.query_matches("boos"));
		assert(v3.size()==1);
		auto v4(l.query_matches("goo"));
		assert(v4.size()==1);

		assert(v0[0]==v1[0] || v0[1]==v1[0]);

		auto m0 = l.query_lexical_entries_with_same_morphological_unit_id(l.lookup("foos"));
		assert(m0.size()==2);
		assert(m0[0]!=m0[1]);

		auto m1 = l.query_lexical_entries_with_same_morphological_unit_id(l.lookup("boos"));
		assert(m1.size()==1);
		assert(m1[0]==l.lookup("boos"));
	}

	//file with three entries, for testing LexicalEntry::find_base_wordform()
	//  verb: aaa1(imperative mood, active voice), aaa2(indicative mood, passive voice), aaa2(indicative mood, active voice)
	//  verb: bbb1(imperative mood, active voice), bbb2(indicative mood, passive voice)
	//  noun: ccc1(definite, singular, unspecified case), ccc1(indefinite, singular, unspecified case)
	{
		int fd = open("sto.unittest",O_WRONLY|O_CREAT|O_TRUNC,0666);
		char tmp[16];
		write(fd,version_2_signature,sizeof(version_2_signature));

		//le#0
		tmp[0] = (char)part_of_speech_t::mainVerb;
		write(fd, tmp, 1);
		tmp[0] = (char)word_form_type_t::wordFormsExplicit;
		write(fd, tmp, 1);
		write(fd,"\003",1); //morph-unit-id len
		write(fd,"\003",1); //#wordforms
		write(fd, "aaa",3); //morph-unit-id
		//le#0:wf#0
		tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
		tmp[0]=(char)word_form_attribute_t::verbFormMood_imperative;
		tmp[1]=(char)word_form_attribute_t::voice_activeVoice;
		write(fd,tmp,6);
		write(fd,"\004aaa1",5);
		//le#0:wf#1
		tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
		tmp[0]=(char)word_form_attribute_t::verbFormMood_infinitive;
		tmp[1]=(char)word_form_attribute_t::voice_passiveVoice;
		write(fd,tmp,6);
		write(fd,"\004aaa2",5);
		//le#0:wf#1
		tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
		tmp[0]=(char)word_form_attribute_t::verbFormMood_infinitive;
		tmp[1]=(char)word_form_attribute_t::voice_activeVoice;
		write(fd,tmp,6);
		write(fd,"\004aaa3",5);

		//le#1
		tmp[0] = (char)part_of_speech_t::mainVerb;
		write(fd, tmp, 1);
		tmp[0] = (char)word_form_type_t::wordFormsExplicit;
		write(fd, tmp, 1);
		write(fd,"\003",1); //morph-unit-id len
		write(fd,"\002",1); //#wordforms
		write(fd, "bbb",3); //morph-unit-id
		//le#0:wf#0
		tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
		tmp[0]=(char)word_form_attribute_t::verbFormMood_imperative;
		tmp[1]=(char)word_form_attribute_t::voice_activeVoice;
		write(fd,tmp,6);
		write(fd,"\004bbb1",5);
		//le#0:wf#1
		tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
		tmp[0]=(char)word_form_attribute_t::verbFormMood_infinitive;
		tmp[1]=(char)word_form_attribute_t::voice_passiveVoice;
		write(fd,tmp,6);
		write(fd,"\004bbb2",5);

		//le#2
		tmp[0] = (char)part_of_speech_t::commonNoun;
		write(fd, tmp, 1);
		tmp[0] = (char)word_form_type_t::wordFormsExplicit;
		write(fd, tmp, 1);
		write(fd,"\003",1); //morph-unit-id len
		write(fd,"\002",1); //#wordforms
		write(fd, "ccc",3); //morph-unit-id
		//le#0:wf#0
		tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
		tmp[0]=(char)word_form_attribute_t::definiteness_definite;
		tmp[1]=(char)word_form_attribute_t::grammaticalNumber_singular;
		tmp[2]=(char)word_form_attribute_t::case_unspecified;
		write(fd,tmp,6);
		write(fd,"\004ccc1",5);
		//le#0:wf#1
		tmp[0]=tmp[1]=tmp[2]=tmp[3]=tmp[4]=tmp[5] = (char)word_form_attribute_t::none;
		tmp[0]=(char)word_form_attribute_t::definiteness_indefinite;
		tmp[1]=(char)word_form_attribute_t::grammaticalNumber_singular;
		tmp[2]=(char)word_form_attribute_t::case_unspecified;
		write(fd,tmp,6);
		write(fd,"\004ccc2",5);

		close(fd);

		Lexicon l;
		assert(l.load("sto.unittest"));
		assert(l.lookup("aaa1")!=NULL);
		assert(l.lookup("aaa2")!=NULL);
		assert(l.lookup("aaa3")!=NULL);

		const sto::LexicalEntry *le1 = l.lookup("aaa1");
		const WordForm *wf1 = le1->find_base_wordform();
		assert(wf1);
		assert(std::string(wf1->written_form,wf1->written_form_length)=="aaa3");

		const sto::LexicalEntry *le2 = l.lookup("bbb1");
		const WordForm *wf2 = le2->find_base_wordform();
		assert(!wf2);

		const sto::LexicalEntry *le3 = l.lookup("ccc1");
		const WordForm *wf3 = le3->find_base_wordform();
		assert(wf3);
		assert(std::string(wf3->written_form,wf3->written_form_length)=="ccc2");
	}

}
#endif