Only load sto lexicon once
This commit is contained in:
parent
01466009d5
commit
68f6e60069
@ -1,8 +1,10 @@
|
||||
#include "Lemma.h"
|
||||
#include "Lexicons.h"
|
||||
|
||||
sto::Lexicon lemma_lexicon;
|
||||
|
||||
sto::Lexicon *lemma_lexicon = nullptr;
|
||||
|
||||
//Points the global lemma_lexicon at the shared "lexicon_da.sto" instance obtained from getLexicon().
//Returns true when a lexicon was obtained, false when getLexicon() returned nullptr.
//(Diff view: the lemma_lexicon.load() line appears to be the pre-change version that the two lines after it replace.)
bool load_lemma_lexicon() {
|
||||
return lemma_lexicon.load("lexicon_da.sto");
|
||||
lemma_lexicon = getLexicon("lexicon_da.sto");
|
||||
return lemma_lexicon!=nullptr;
|
||||
}
|
||||
|
||||
|
2
Lemma.h
2
Lemma.h
@ -3,7 +3,7 @@
|
||||
|
||||
#include "sto/sto.h"
|
||||
|
||||
extern sto::Lexicon lemma_lexicon;
|
||||
extern sto::Lexicon *lemma_lexicon;
|
||||
bool load_lemma_lexicon();
|
||||
|
||||
#endif
|
||||
|
31
Lexicons.cpp
Normal file
31
Lexicons.cpp
Normal file
@ -0,0 +1,31 @@
|
||||
#include "Lexicons.h"
|
||||
#include "GbMutex.h"
|
||||
#include "ScopedLock.h"
|
||||
#include <map>
|
||||
#include <memory>
|
||||
|
||||
//Yes, I do know that with Greek morphology the plural of lexicon is lexica. But this isn't Greek
|
||||
|
||||
|
||||
static std::map<std::string,std::unique_ptr<sto::Lexicon>> map;
|
||||
static GbMutex mtx_map;
|
||||
|
||||
//Returns the shared lexicon for 'filename', loading it the first time that filename is requested.
//  filename: path handed to sto::Lexicon::load(); also the cache key.
//  returns:  pointer to the cached lexicon (owned by the cache - callers must not delete it),
//            or nullptr if the file could not be loaded. Failed loads are not cached, so a later call retries.
//All access to the cache happens under mtx_map.
sto::Lexicon *getLexicon(const std::string &filename) {
	ScopedLock sl(mtx_map);

	auto iter = map.find(filename);
	if(iter!=map.end())
		return iter->second.get();

	//Keep the new lexicon in a unique_ptr until the map owns it, so it is released
	//both when load() fails and when load()/emplace() throws.
	std::unique_ptr<sto::Lexicon> l(new sto::Lexicon());
	if(!l->load(filename))
		return nullptr;

	sto::Lexicon *loaded = l.get();
	map.emplace(filename,std::move(l));
	return loaded;
}
|
||||
|
||||
|
||||
void forgetAllLexicons() {
|
||||
map.clear();
|
||||
}
|
8
Lexicons.h
Normal file
8
Lexicons.h
Normal file
@ -0,0 +1,8 @@
|
||||
#ifndef LEXICONREF_H_
|
||||
#define LEXICONREF_H_
|
||||
#include "sto/sto.h"
|
||||
|
||||
//Returns the lexicon loaded from 'filename', or nullptr if it could not be loaded.
//The same instance is returned for repeated requests of the same filename; it is owned by the cache, so callers must not delete it.
sto::Lexicon *getLexicon(const std::string &filename);
|
||||
//Discards all cached lexicons, invalidating every pointer previously returned by getLexicon().
void forgetAllLexicons();
|
||||
|
||||
#endif
|
1
Makefile
1
Makefile
@ -97,6 +97,7 @@ OBJS_O3 = \
|
||||
SiteNumInlinks.o \
|
||||
SiteMedianPageTemperature.o \
|
||||
MemoryMappedFile.o \
|
||||
Lexicons.o \
|
||||
Lemma.o \
|
||||
|
||||
|
||||
|
@ -1058,7 +1058,7 @@ bool Query::setQTerms() {
|
||||
continue;
|
||||
std::string w(m_tr[i].token_start,m_tr[i].token_len);
|
||||
logTrace(g_conf.m_logTraceQuery, "@@ Checking lemma for '%s'", w.c_str());
|
||||
auto le = lemma_lexicon.lookup(w);
|
||||
auto le = lemma_lexicon->lookup(w);
|
||||
if(!le) {
|
||||
//Not found as-is in lexicon. Try lowercase in case it is a capitalized word
|
||||
char lowercase_word[128];
|
||||
@ -1066,7 +1066,7 @@ bool Query::setQTerms() {
|
||||
size_t sz = to_lower_utf8(lowercase_word,lowercase_word+sizeof(lowercase_word), w.data(), w.data()+w.size());
|
||||
lowercase_word[sz] = '\0';
|
||||
if(sz!=w.size() || memcmp(w.data(),lowercase_word,w.size())!=0) {
|
||||
le = lemma_lexicon.lookup(lowercase_word);
|
||||
le = lemma_lexicon->lookup(lowercase_word);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1078,7 +1078,7 @@ bool Query::setQTerms() {
|
||||
capitalized_word[sz] = '\0';
|
||||
if(sz!=w.size() || memcmp(w.data(),capitalized_word,w.size())!=0) {
|
||||
w = capitalized_word;
|
||||
le = lemma_lexicon.lookup(w);
|
||||
le = lemma_lexicon->lookup(w);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1090,7 +1090,7 @@ bool Query::setQTerms() {
|
||||
uppercase_word[sz] = '\0';
|
||||
if(sz!=w.size() || memcmp(w.data(),uppercase_word,w.size())!=0) {
|
||||
w = uppercase_word;
|
||||
le = lemma_lexicon.lookup(w);
|
||||
le = lemma_lexicon->lookup(w);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2247,7 +2247,7 @@ bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_to
|
||||
for(auto e : candidate_lemma_words) {
|
||||
//find the word in the lexicon. find the lemma. If the word is unknown or already in its base form then don't generate a lemma entry
|
||||
logTrace(g_conf.m_logTraceTokenIndexing,"candidate word for lemma: %s", e.c_str());
|
||||
auto le = lemma_lexicon.lookup(e);
|
||||
auto le = lemma_lexicon->lookup(e);
|
||||
if(!le) {
|
||||
//Not found as-is in lexicon. Try lowercase in case it is a capitalized word
|
||||
char lowercase_word[128];
|
||||
@ -2256,7 +2256,7 @@ bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_to
|
||||
lowercase_word[sz] = '\0';
|
||||
if(sz!=e.size() || memcmp(e.data(),lowercase_word,e.size())!=0) {
|
||||
e = lowercase_word;
|
||||
le = lemma_lexicon.lookup(e);
|
||||
le = lemma_lexicon->lookup(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -2268,7 +2268,7 @@ bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_to
|
||||
capitalized_word[sz] = '\0';
|
||||
if(sz!=e.size() || memcmp(e.data(),capitalized_word,e.size())!=0) {
|
||||
e = capitalized_word;
|
||||
le = lemma_lexicon.lookup(e);
|
||||
le = lemma_lexicon->lookup(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -2280,7 +2280,7 @@ bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_to
|
||||
uppercase_word[sz] = '\0';
|
||||
if(sz!=e.size() || memcmp(e.data(),uppercase_word,e.size())!=0) {
|
||||
e = uppercase_word;
|
||||
le = lemma_lexicon.lookup(e);
|
||||
le = lemma_lexicon->lookup(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,10 +1,12 @@
|
||||
#include "STOWordVariationGenerator.h"
|
||||
#include "Lexicons.h"
|
||||
#include "fctypes.h" //to_lower_utf8
|
||||
#include "utf8.h" //getUtf8CharSize etc
|
||||
|
||||
|
||||
//Binds this generator to the shared lexicon for 'filename' obtained from getLexicon().
//Returns true when a lexicon was obtained, false when getLexicon() returned nullptr.
//(Diff view: the lexicon.load() line appears to be the pre-change version that the two lines after it replace.)
bool STOWordVariationGenerator::load_lexicon(const char *filename) {
|
||||
return lexicon.load(filename);
|
||||
lexicon = getLexicon(filename);
|
||||
return lexicon!=nullptr;
|
||||
}
|
||||
|
||||
|
||||
|
@ -6,11 +6,11 @@
|
||||
//A word variation generator that can use a STO database
|
||||
class STOWordVariationGenerator : public WordVariationGenerator {
|
||||
protected:
|
||||
sto::Lexicon lexicon;
|
||||
sto::Lexicon *lexicon;
|
||||
public:
|
||||
using WordVariationGenerator::WordVariationGenerator;
|
||||
bool load_lexicon(const char *filename);
|
||||
void unload_lexicon() { lexicon.unload(); }
|
||||
void unload_lexicon() { lexicon = nullptr; }
|
||||
|
||||
std::vector<std::string> lower_words(const std::vector<std::string> &source_words);
|
||||
std::string capitalize_word(const std::string &lower_word);
|
||||
|
@ -227,7 +227,7 @@ void WordVariationGenerator_danish::find_simple_attribute_difference_wordforms(s
|
||||
{
|
||||
for(unsigned i=0; i<source_words.size(); i++) {
|
||||
auto source_word(source_words[i]);
|
||||
LogicalMatches matches(lexicon,source_word,noun);
|
||||
LogicalMatches matches(*lexicon,source_word,noun);
|
||||
for(auto match : matches) {
|
||||
auto wordforms(match->query_all_explicit_word_forms());
|
||||
for(auto wordform : wordforms) {
|
||||
@ -269,14 +269,14 @@ void WordVariationGenerator_danish::find_simple_attribute_match_wordforms(std::v
|
||||
{
|
||||
for(unsigned i=0; i<source_words.size(); i++) {
|
||||
auto source_word(source_words[i]);
|
||||
LogicalMatches matches(lexicon,source_word,whatever);
|
||||
LogicalMatches matches(*lexicon,source_word,whatever);
|
||||
for(auto match : matches) {
|
||||
auto wordforms(match->query_all_explicit_word_forms());
|
||||
for(auto wordform : wordforms) {
|
||||
if(same_wordform_as_source(*wordform,matches.query_matched_word())) {
|
||||
//found the word form match. Now look for other wordforms with exactly the same attributes. Those are alternate spellings.
|
||||
//so first find all lexical entries with the same morphological unit id, and check all wordforms of those, looking for an attribute match
|
||||
auto same_morph_entries = lexicon.query_lexical_entries_with_same_morphological_unit_id(match);
|
||||
auto same_morph_entries = lexicon->query_lexical_entries_with_same_morphological_unit_id(match);
|
||||
for(auto same_morph_entry : same_morph_entries) {
|
||||
auto wordforms2(same_morph_entry->query_all_explicit_word_forms());
|
||||
for(auto wordform2 : wordforms2) {
|
||||
@ -427,7 +427,7 @@ void WordVariationGenerator_danish::transliterate_verb_acute_accent(std::vector<
|
||||
if(source_word.length()>4 && source_word.substr(source_word.length()-2)=="er") {
|
||||
//possibly a verb in imperative
|
||||
bool is_imperative = false;
|
||||
LogicalMatches matches(lexicon,source_word,verb);
|
||||
LogicalMatches matches(*lexicon,source_word,verb);
|
||||
for(auto match : matches) {
|
||||
auto wordforms(match->query_all_explicit_word_forms());
|
||||
for(auto wordform : wordforms) {
|
||||
@ -474,7 +474,7 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
|
||||
auto source_word(lower_source_words[i]);
|
||||
if(source_word==" ")
|
||||
continue;
|
||||
LogicalMatches matches(lexicon,source_word,verb);
|
||||
LogicalMatches matches(*lexicon,source_word,verb);
|
||||
if(prev_was_er || prev_was_var || prev_was_har || prev_was_havde) {
|
||||
//check if this word is the past participle
|
||||
const sto::WordForm *wordform_past_participle = NULL;
|
||||
@ -672,7 +672,7 @@ void WordVariationGenerator_danish::make_proper_noun_part_genetive(std::vector<W
|
||||
continue;
|
||||
|
||||
//find noun
|
||||
LogicalMatches matches(lexicon,source_word0,noun);
|
||||
LogicalMatches matches(*lexicon,source_word0,noun);
|
||||
const sto::WordForm *wordform_noun = NULL;
|
||||
for(auto match : matches) {
|
||||
if(match->part_of_speech==sto::part_of_speech_t::commonNoun) {
|
||||
@ -709,7 +709,7 @@ void WordVariationGenerator_danish::make_proper_noun_part_genetive(std::vector<W
|
||||
auto source_word4_capitalized(capitalize_word(source_word4));
|
||||
|
||||
//find proper-noun
|
||||
auto matches2 = lexicon.query_matches(source_word4_capitalized);
|
||||
auto matches2 = lexicon->query_matches(source_word4_capitalized);
|
||||
const sto::WordForm *wordform_proper_noun = NULL;
|
||||
const sto::WordForm *wordform_proper_noun_genitive = NULL;
|
||||
for(auto match : matches2) {
|
||||
@ -763,7 +763,7 @@ void WordVariationGenerator_danish::handle_adjective_grammatical_gender_simplifi
|
||||
//find adjective
|
||||
bool is_common_singular_indefinite = false;
|
||||
const sto::WordForm *wordform_neuter_singular_indefinite = NULL;
|
||||
LogicalMatches matches(lexicon,source_word0,whatever);
|
||||
LogicalMatches matches(*lexicon,source_word0,whatever);
|
||||
for(auto match : matches) {
|
||||
if(match->part_of_speech==sto::part_of_speech_t::adjective) {
|
||||
auto wordforms(match->query_all_explicit_word_forms());
|
||||
|
Loading…
x
Reference in New Issue
Block a user