// 167 lines, 4.7 KiB, C++
#ifndef STO_H_
|
|
#define STO_H_
|
|
|
|
#include <inttypes.h>
|
|
#include <stddef.h>
|
|
#include <vector>
|
|
#include <string>
|
|
|
|
|
|
//Interface for reading/using a processed STO file.
|
|
|
|
namespace sto {

// Part-of-speech tag of a lexical entry, stored in a single byte
// (see LexicalEntry::part_of_speech).
enum class part_of_speech_t : uint8_t {
	//0 is deliberately left unassigned: it is used for detecting corrupted files
	adjective = 1,
	commonNoun = 2,
	conjunction = 3,
	demonstrativePronoun = 4,
	deponentVerb = 5,
	existentialPronoun = 6,
	generalAdverb = 7,
	indefinitePronoun = 8,
	infinitiveParticle = 9,
	interjection = 10,
	interrogativeRelativePronoun = 11,
	mainVerb = 12,
	numeral = 13,
	ordinalAdjective = 14,
	personalPronoun = 15,
	possessivePronoun = 16,
	preposition = 17,
	properNoun = 18,
	reciprocalPronoun = 19,
	unclassifiedParticle = 20,
	unspecified = 21,
	coordinatingConjunction = 22,
	subordinatingConjunction = 23,
};


// How the word forms of a lexical entry are represented
// (see LexicalEntry::word_form_type). Only one representation exists so far.
enum class word_form_type_t : uint8_t {
	wordFormsExplicit = 1, //entry has all word forms explicitly listed
};


// Grammatical attribute attached to a single word form, stored in one byte.
// Enumerators are named <category>_<value>.
// NOTE(review): 'none' presumably fills unused slots of WordForm::attribute - confirm against the file writer.
enum class word_form_attribute_t : uint8_t {
	none = 0,
	adjectivalFunction_attributiveFunction = 1,
	adjectivalFunction_predicativeFunction = 2,
	adjectivalFunction_unspecified = 3,
	case_genitiveCase = 4,
	case_nominativeCase = 5,
	case_unspecified = 6,
	definiteness_definite = 7,
	definiteness_indefinite = 8,
	definiteness_unspecified = 9,
	degree_comparative = 10,
	degree_positive = 11,
	degree_superlative = 12,
	grammaticalGender_commonGender = 13,
	grammaticalGender_neuter = 14,
	grammaticalGender_unspecified = 15,
	grammaticalNumber_plural = 16,
	grammaticalNumber_singular = 17,
	grammaticalNumber_unspecified = 18,
	independentWord_no = 19,
	independentWord_yes = 20,
	officiallyApproved_no = 21,
	officiallyApproved_yes = 22,
	ownerNumber_plural = 23,
	ownerNumber_singular = 24,
	ownerNumber_unspecified = 25,
	person_firstPerson = 26,
	person_secondPerson = 27,
	person_thirdPerson = 28,
	reflexivity_no = 29,
	reflexivity_yes = 30,
	reflexivity_unspecified = 31,
	register_formalRegister = 32,
	register_OBSOLETE = 33,
	tense_past = 34,
	tense_present = 35,
	transcategorization_transadjectival = 36,
	transcategorization_transadverbial = 37,
	transcategorization_transnominal = 38,
	verbFormMood_gerundive = 39,
	verbFormMood_imperative = 40,
	verbFormMood_indicative = 41,
	verbFormMood_infinitive = 42,
	verbFormMood_participle = 43,
	voice_activeVoice = 44,
	voice_passiveVoice = 45,
};


// One word form of a lexical entry. This is a variable-length record:
// a fixed number of attribute slots, a one-byte length, and then
// written_form_length bytes of text.
// Instances are not meant to be constructed directly; they are viewed in place
// through pointers obtained from LexicalEntry.
struct WordForm {
	// Number of attribute slots in every record. constexpr so that the constant
	// can be used anywhere without needing an out-of-class definition.
	static constexpr unsigned max_attributes = 6;
	word_form_attribute_t attribute[max_attributes];
	uint8_t written_form_length;
	// Flexible array member (compiler extension): the text starts right after the length byte.
	// NOTE(review): nothing here shows the text to be NUL-terminated - always use written_form_length.
	char written_form[];

	// Total number of bytes occupied by this record, including the text.
	size_t size() const { return sizeof(attribute)+sizeof(written_form_length)+written_form_length; }

	// Returns true if any of the attribute slots holds 'a'.
	bool has_attribute(word_form_attribute_t a) const {
		for(const auto attr : attribute) {
			if(attr==a)
				return true;
		}
		return false;
	}
};


// Fixed-size header of a lexical entry. The header is immediately followed by
// morphological_unit_id_len bytes of morphological unit id and then by the
// word form records, so an entry must only be accessed in place (never copied by value).
struct LexicalEntry {
	part_of_speech_t part_of_speech;
	word_form_type_t word_form_type;
	uint8_t morphological_unit_id_len;
	uint8_t explicit_word_form_count;
	//char morphological_unit_id[];
	//char explicit_word_forms[];

	// Pointer to the morphological unit id, which starts right after this header.
	// It is morphological_unit_id_len bytes long.
	const char *query_morphological_unit_id() const { return reinterpret_cast<const char*>(this) + sizeof(*this); }

	// Pointer to the first word form record, which starts right after the morphological unit id.
	const WordForm *query_first_explicit_word_form() const {
		return reinterpret_cast<const WordForm*>(query_morphological_unit_id() + morphological_unit_id_len);
	}

	// The following are implemented out of line.
	// NOTE(review): judging by the names they collect all word forms, find the first word
	// form matching 'word', and find the base word form - confirm against the implementation.
	std::vector<const WordForm *> query_all_explicit_word_forms() const;
	const WordForm *find_first_wordform(const std::string &word) const;
	const WordForm *find_base_wordform() const;
};


// A loaded STO lexicon with lookup structures over its entries.
// Not copyable. The destructor calls unload().
class Lexicon {
	// NOTE(review): presumably the start/size of the memory-mapped file set up by load() - confirm.
	void *mapped_memory_start;
	size_t mapped_memory_size;

	struct MapEntry { //efficient (str,leng)->entry mapping structure (std::map is too slow and memory-inefficient)
		uint32_t length;
		const char *str;
		const LexicalEntry *entry;
		MapEntry(const char *str_, uint32_t length_, const LexicalEntry *entry_)
		  : length(length_), str(str_), entry(entry_)
		  {}
		static bool compare(const MapEntry &me1, const MapEntry &me2);
	};
	std::vector<MapEntry> entries; //wordform -> entry[]
	std::vector<MapEntry> morphological_unit_id_entries; //morphological_unit_id -> entry[]
	void sort(std::vector<MapEntry> &v);

public:
	Lexicon() : mapped_memory_start(nullptr), mapped_memory_size(0), entries(), morphological_unit_id_entries() {}
	~Lexicon() { unload(); }

	// Non-copyable. Declared in the public section so that misuse is reported
	// as use of a deleted function rather than as an access violation.
	Lexicon(const Lexicon&) = delete;
	Lexicon& operator=(const Lexicon&) = delete;

	// NOTE(review): the bool result of load() presumably signals success - confirm against the implementation.
	bool load(const std::string &filename);
	void unload();

	// NOTE(review): presumably lookup() returns a single entry having 'word' as a word form
	// and query_matches() returns all of them - confirm against the implementation.
	const LexicalEntry *lookup(const std::string &word) const;
	std::vector<const LexicalEntry *> query_matches(const std::string &word) const;

	// Iteration over the entries of the lexicon.
	// NOTE(review): the end-of-iteration value is not visible in this header - confirm before relying on it.
	const LexicalEntry *first_entry() const;
	const LexicalEntry *next_entry(const LexicalEntry *le) const;
	std::vector<const LexicalEntry *> query_lexical_entries_with_same_morphological_unit_id(const LexicalEntry *le) const;
};

} //namespace sto
|
|
|
|
#endif
|