Only load sto lexicon once

This commit is contained in:
Ivan Skytte Jørgensen 2018-07-17 13:49:53 +02:00
parent 01466009d5
commit 68f6e60069
10 changed files with 67 additions and 23 deletions

@ -1,8 +1,10 @@
#include "Lemma.h"
#include "Lexicons.h"
sto::Lexicon lemma_lexicon;
sto::Lexicon *lemma_lexicon = nullptr;
bool load_lemma_lexicon() {
return lemma_lexicon.load("lexicon_da.sto");
lemma_lexicon = getLexicon("lexicon_da.sto");
return lemma_lexicon!=nullptr;
}

@ -3,7 +3,7 @@
#include "sto/sto.h"
extern sto::Lexicon lemma_lexicon;
extern sto::Lexicon *lemma_lexicon;
bool load_lemma_lexicon();
#endif

31
Lexicons.cpp Normal file

@ -0,0 +1,31 @@
#include "Lexicons.h"
#include "GbMutex.h"
#include "ScopedLock.h"
#include <map>
#include <memory>
//Yes, I do know that with Greek morphology the plural of lexicon is lexica. But this isn't Greek
static std::map<std::string,std::unique_ptr<sto::Lexicon>> map;
static GbMutex mtx_map;
//Return the shared lexicon for 'filename', loading it on first request.
//  filename: path handed to sto::Lexicon::load(); it is also the cache key, so two different
//            spellings of the same path yield two separately loaded lexicons.
//  returns:  pointer to the cached lexicon, or nullptr if loading failed. Failed loads are not
//            cached, so a later call retries the load.
//The cache keeps ownership of the lexicon; callers must not delete the returned pointer.
//The whole lookup-or-load sequence runs under mtx_map.
sto::Lexicon *getLexicon(const std::string &filename) {
	ScopedLock sl(mtx_map);
	auto iter = map.find(filename);
	if(iter!=map.end())
		return iter->second.get();
	//Hold the new lexicon in a unique_ptr so it is released on every exit path,
	//including a failed load() and an exception thrown from load() or emplace().
	std::unique_ptr<sto::Lexicon> lexicon(new sto::Lexicon());
	if(!lexicon->load(filename))
		return nullptr;
	sto::Lexicon *result = lexicon.get();
	map.emplace(filename,std::move(lexicon));
	return result;
}
void forgetAllLexicons() {
map.clear();
}

8
Lexicons.h Normal file

@ -0,0 +1,8 @@
#ifndef LEXICONREF_H_
#define LEXICONREF_H_
#include "sto/sto.h"
//Return the lexicon loaded from 'filename'. The first request for a given filename loads it;
//later requests for the same filename return the same instance.
//Returns nullptr if the lexicon could not be loaded.
//The returned pointer is owned by the lexicon cache - callers must not delete it.
sto::Lexicon *getLexicon(const std::string &filename);
//Discard all cached lexicons. Pointers previously returned by getLexicon() become invalid.
void forgetAllLexicons();
#endif

@ -97,6 +97,7 @@ OBJS_O3 = \
SiteNumInlinks.o \
SiteMedianPageTemperature.o \
MemoryMappedFile.o \
Lexicons.o \
Lemma.o \

@ -1058,7 +1058,7 @@ bool Query::setQTerms() {
continue;
std::string w(m_tr[i].token_start,m_tr[i].token_len);
logTrace(g_conf.m_logTraceQuery, "@@ Checking lemma for '%s'", w.c_str());
auto le = lemma_lexicon.lookup(w);
auto le = lemma_lexicon->lookup(w);
if(!le) {
//Not found as-is in lexicon. Try lowercase in case it is a capitalized word
char lowercase_word[128];
@ -1066,7 +1066,7 @@ bool Query::setQTerms() {
size_t sz = to_lower_utf8(lowercase_word,lowercase_word+sizeof(lowercase_word), w.data(), w.data()+w.size());
lowercase_word[sz] = '\0';
if(sz!=w.size() || memcmp(w.data(),lowercase_word,w.size())!=0) {
le = lemma_lexicon.lookup(lowercase_word);
le = lemma_lexicon->lookup(lowercase_word);
}
}
}
@ -1078,7 +1078,7 @@ bool Query::setQTerms() {
capitalized_word[sz] = '\0';
if(sz!=w.size() || memcmp(w.data(),capitalized_word,w.size())!=0) {
w = capitalized_word;
le = lemma_lexicon.lookup(w);
le = lemma_lexicon->lookup(w);
}
}
}
@ -1090,7 +1090,7 @@ bool Query::setQTerms() {
uppercase_word[sz] = '\0';
if(sz!=w.size() || memcmp(w.data(),uppercase_word,w.size())!=0) {
w = uppercase_word;
le = lemma_lexicon.lookup(w);
le = lemma_lexicon->lookup(w);
}
}
}

@ -2247,7 +2247,7 @@ bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_to
for(auto e : candidate_lemma_words) {
//find the word in the lexicon. find the lemma. If the word is unknown or already in its base form then don't generate a lemma entry
logTrace(g_conf.m_logTraceTokenIndexing,"candidate word for lemma: %s", e.c_str());
auto le = lemma_lexicon.lookup(e);
auto le = lemma_lexicon->lookup(e);
if(!le) {
//Not found as-is in lexicon. Try lowercase in case it is a capitalized word
char lowercase_word[128];
@ -2256,7 +2256,7 @@ bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_to
lowercase_word[sz] = '\0';
if(sz!=e.size() || memcmp(e.data(),lowercase_word,e.size())!=0) {
e = lowercase_word;
le = lemma_lexicon.lookup(e);
le = lemma_lexicon->lookup(e);
}
}
}
@ -2268,7 +2268,7 @@ bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_to
capitalized_word[sz] = '\0';
if(sz!=e.size() || memcmp(e.data(),capitalized_word,e.size())!=0) {
e = capitalized_word;
le = lemma_lexicon.lookup(e);
le = lemma_lexicon->lookup(e);
}
}
}
@ -2280,7 +2280,7 @@ bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_to
uppercase_word[sz] = '\0';
if(sz!=e.size() || memcmp(e.data(),uppercase_word,e.size())!=0) {
e = uppercase_word;
le = lemma_lexicon.lookup(e);
le = lemma_lexicon->lookup(e);
}
}
}

@ -1,10 +1,12 @@
#include "STOWordVariationGenerator.h"
#include "Lexicons.h"
#include "fctypes.h" //to_lower_utf8
#include "utf8.h" //getUtf8CharSize etc
bool STOWordVariationGenerator::load_lexicon(const char *filename) {
return lexicon.load(filename);
lexicon = getLexicon(filename);
return lexicon!=nullptr;
}

@ -6,11 +6,11 @@
//A word variation generator that can use a STO database
class STOWordVariationGenerator : public WordVariationGenerator {
protected:
sto::Lexicon lexicon;
sto::Lexicon *lexicon;
public:
using WordVariationGenerator::WordVariationGenerator;
bool load_lexicon(const char *filename);
void unload_lexicon() { lexicon.unload(); }
void unload_lexicon() { lexicon = nullptr; }
std::vector<std::string> lower_words(const std::vector<std::string> &source_words);
std::string capitalize_word(const std::string &lower_word);

@ -227,7 +227,7 @@ void WordVariationGenerator_danish::find_simple_attribute_difference_wordforms(s
{
for(unsigned i=0; i<source_words.size(); i++) {
auto source_word(source_words[i]);
LogicalMatches matches(lexicon,source_word,noun);
LogicalMatches matches(*lexicon,source_word,noun);
for(auto match : matches) {
auto wordforms(match->query_all_explicit_word_forms());
for(auto wordform : wordforms) {
@ -269,14 +269,14 @@ void WordVariationGenerator_danish::find_simple_attribute_match_wordforms(std::v
{
for(unsigned i=0; i<source_words.size(); i++) {
auto source_word(source_words[i]);
LogicalMatches matches(lexicon,source_word,whatever);
LogicalMatches matches(*lexicon,source_word,whatever);
for(auto match : matches) {
auto wordforms(match->query_all_explicit_word_forms());
for(auto wordform : wordforms) {
if(same_wordform_as_source(*wordform,matches.query_matched_word())) {
//found the word form match. Now look for other wordforms with exactly the same attributes. Those are alternate spellings.
//so first find all lexical entries with the same morphological unit id, and check all wordforms of those, looking for an attribute match
auto same_morph_entries = lexicon.query_lexical_entries_with_same_morphological_unit_id(match);
auto same_morph_entries = lexicon->query_lexical_entries_with_same_morphological_unit_id(match);
for(auto same_morph_entry : same_morph_entries) {
auto wordforms2(same_morph_entry->query_all_explicit_word_forms());
for(auto wordform2 : wordforms2) {
@ -427,7 +427,7 @@ void WordVariationGenerator_danish::transliterate_verb_acute_accent(std::vector<
if(source_word.length()>4 && source_word.substr(source_word.length()-2)=="er") {
//possibly a verb in imperative
bool is_imperative = false;
LogicalMatches matches(lexicon,source_word,verb);
LogicalMatches matches(*lexicon,source_word,verb);
for(auto match : matches) {
auto wordforms(match->query_all_explicit_word_forms());
for(auto wordform : wordforms) {
@ -474,7 +474,7 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
auto source_word(lower_source_words[i]);
if(source_word==" ")
continue;
LogicalMatches matches(lexicon,source_word,verb);
LogicalMatches matches(*lexicon,source_word,verb);
if(prev_was_er || prev_was_var || prev_was_har || prev_was_havde) {
//check if this word is the past participle
const sto::WordForm *wordform_past_participle = NULL;
@ -672,7 +672,7 @@ void WordVariationGenerator_danish::make_proper_noun_part_genetive(std::vector<W
continue;
//find noun
LogicalMatches matches(lexicon,source_word0,noun);
LogicalMatches matches(*lexicon,source_word0,noun);
const sto::WordForm *wordform_noun = NULL;
for(auto match : matches) {
if(match->part_of_speech==sto::part_of_speech_t::commonNoun) {
@ -709,7 +709,7 @@ void WordVariationGenerator_danish::make_proper_noun_part_genetive(std::vector<W
auto source_word4_capitalized(capitalize_word(source_word4));
//find proper-noun
auto matches2 = lexicon.query_matches(source_word4_capitalized);
auto matches2 = lexicon->query_matches(source_word4_capitalized);
const sto::WordForm *wordform_proper_noun = NULL;
const sto::WordForm *wordform_proper_noun_genitive = NULL;
for(auto match : matches2) {
@ -763,7 +763,7 @@ void WordVariationGenerator_danish::handle_adjective_grammatical_gender_simplifi
//find adjective
bool is_common_singular_indefinite = false;
const sto::WordForm *wordform_neuter_singular_indefinite = NULL;
LogicalMatches matches(lexicon,source_word0,whatever);
LogicalMatches matches(*lexicon,source_word0,whatever);
for(auto match : matches) {
if(match->part_of_speech==sto::part_of_speech_t::adjective) {
auto wordforms(match->query_all_explicit_word_forms());