Only load sto lexicon once

2018-07-17 13:49:53 +02:00 · 2018-07-17 13:49:53 +02:00 · 68f6e60069
commit 68f6e60069
parent 01466009d5
10 changed files with 67 additions and 23 deletions
--- a/Lemma.cpp
+++ b/Lemma.cpp
@ -1,8 +1,10 @@
 #include "Lemma.h"
+#include "Lexicons.h"

-sto::Lexicon lemma_lexicon;
+
+//Global lemma lexicon. Starts out as nullptr; load_lemma_lexicon() assigns it the pointer returned by getLexicon().
+sto::Lexicon *lemma_lexicon = nullptr;

+//Point lemma_lexicon at the lexicon that getLexicon() provides for "lexicon_da.sto".
+//Returns true if a lexicon was obtained, false if getLexicon() returned nullptr.
 bool load_lemma_lexicon() {
-	return lemma_lexicon.load("lexicon_da.sto");
+	lemma_lexicon = getLexicon("lexicon_da.sto");
+	return lemma_lexicon!=nullptr;
 }
-
--- a/Lemma.h
+++ b/Lemma.h
@ -3,7 +3,7 @@

 #include "sto/sto.h"

-extern sto::Lexicon lemma_lexicon;
+extern sto::Lexicon *lemma_lexicon;
 bool load_lemma_lexicon();

 #endif
--- a/Lexicons.cpp
+++ b/Lexicons.cpp
@ -0,0 +1,31 @@
+#include "Lexicons.h"
+#include "GbMutex.h"
+#include "ScopedLock.h"
+#include <map>
+#include <memory>
+
+//Yes, I do know that with Greek morphology the plural of lexicon is lexica. But this isn't Greek
+
+
+//Lexicons loaded so far, keyed by the filename that was passed to getLexicon(). The map owns the lexicon objects.
+static std::map<std::string,std::unique_ptr<sto::Lexicon>> map;
+//Taken (via ScopedLock) by getLexicon() before it searches or inserts into the map above.
+static GbMutex mtx_map;
+
+//Return the lexicon loaded from 'filename', loading it on the first request.
+//Later requests for the same filename get the same cached instance.
+//Returns nullptr if the file could not be loaded; nothing is cached in that case, so a later call tries again.
+//The cache keeps ownership of the returned object; callers must not delete it.
+sto::Lexicon *getLexicon(const std::string &filename) {
+	ScopedLock sl(mtx_map); //ScopedLock is expected to hold mtx_map until the end of this call
+	auto iter = map.find(filename);
+	if(iter!=map.end())
+		return iter->second.get();
+	//Keep the new lexicon in a unique_ptr so it is released on every exit path,
+	//including an exception thrown by load() or by the map insertion below.
+	std::unique_ptr<sto::Lexicon> lexicon(new sto::Lexicon());
+	if(!lexicon->load(filename))
+		return nullptr;
+	auto &slot = map[filename]; //inserts an empty entry; 'lexicon' still owns the object if this throws
+	slot.swap(lexicon); //hand ownership to the cache (cannot throw)
+	return slot.get();
+}
+
+
+//Destroy every cached lexicon.
+//The cache owns the lexicon objects, so every pointer previously returned by getLexicon()
+//becomes dangling after this call; holders must drop or re-fetch their pointers.
+void forgetAllLexicons() {
+	ScopedLock sl(mtx_map); //same lock getLexicon() takes, so clearing cannot overlap a lookup or insertion
+	map.clear();
+}
--- a/Lexicons.h
+++ b/Lexicons.h
@ -0,0 +1,8 @@
+#ifndef LEXICONREF_H_
+#define LEXICONREF_H_
+#include "sto/sto.h"
+
+//Return the shared lexicon for 'filename', loading it the first time it is requested.
+//Returns nullptr if loading fails. The returned object is owned by the cache, not by the caller.
+sto::Lexicon *getLexicon(const std::string &filename);
+//Destroy all cached lexicons. Pointers previously returned by getLexicon() are invalid afterwards.
+void forgetAllLexicons();
+
+#endif
--- a/1
+++ b/1
@ -97,6 +97,7 @@ OBJS_O3 = \
 	SiteNumInlinks.o \
 	SiteMedianPageTemperature.o \
 	MemoryMappedFile.o \
+	Lexicons.o \
 	Lemma.o \


--- a/Query.cpp
+++ b/Query.cpp
@ -1058,7 +1058,7 @@ bool Query::setQTerms() {
 				continue;
 			std::string w(m_tr[i].token_start,m_tr[i].token_len);
 			logTrace(g_conf.m_logTraceQuery, "@@ Checking lemma for '%s'", w.c_str());
-			auto le = lemma_lexicon.lookup(w);
+			auto le = lemma_lexicon->lookup(w);
 			if(!le) {
 				//Not found as-is in lexicon. Try lowercase in case it is a capitalized word
 				char lowercase_word[128];
@ -1066,7 +1066,7 @@ bool Query::setQTerms() {
 					size_t sz = to_lower_utf8(lowercase_word,lowercase_word+sizeof(lowercase_word), w.data(), w.data()+w.size());
 					lowercase_word[sz] = '\0';
 					if(sz!=w.size() || memcmp(w.data(),lowercase_word,w.size())!=0) {
-						le = lemma_lexicon.lookup(lowercase_word);
+						le = lemma_lexicon->lookup(lowercase_word);
 					}
 				}
 			}
@ -1078,7 +1078,7 @@ bool Query::setQTerms() {
 					capitalized_word[sz] = '\0';
 					if(sz!=w.size() || memcmp(w.data(),capitalized_word,w.size())!=0) {
 						w = capitalized_word;
-						le = lemma_lexicon.lookup(w);
+						le = lemma_lexicon->lookup(w);
 					}
 				}
 			}
@ -1090,7 +1090,7 @@ bool Query::setQTerms() {
 					uppercase_word[sz] = '\0';
 					if(sz!=w.size() || memcmp(w.data(),uppercase_word,w.size())!=0) {
 						w = uppercase_word;
-						le = lemma_lexicon.lookup(w);
+						le = lemma_lexicon->lookup(w);
 					}
 				}
 			}
--- a/XmlDoc_Indexing.cpp
+++ b/XmlDoc_Indexing.cpp
@ -2247,7 +2247,7 @@ bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_to
 		for(auto e : candidate_lemma_words) {
 			//find the word in the lexicon. find the lemma. If the word is unknown or already in its base form then don't generate a lemma entry
 			logTrace(g_conf.m_logTraceTokenIndexing,"candidate  word for lemma: %s", e.c_str());
-			auto le = lemma_lexicon.lookup(e);
+			auto le = lemma_lexicon->lookup(e);
 			if(!le) {
 				//Not found as-is in lexicon. Try lowercase in case it is a capitalized word
 				char lowercase_word[128];
@ -2256,7 +2256,7 @@ bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_to
 					lowercase_word[sz] = '\0';
 					if(sz!=e.size() || memcmp(e.data(),lowercase_word,e.size())!=0) {
 						e = lowercase_word;
-						le = lemma_lexicon.lookup(e);
+						le = lemma_lexicon->lookup(e);
 					}
 				}
 			}
@ -2268,7 +2268,7 @@ bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_to
 					capitalized_word[sz] = '\0';
 					if(sz!=e.size() || memcmp(e.data(),capitalized_word,e.size())!=0) {
 						e = capitalized_word;
-						le = lemma_lexicon.lookup(e);
+						le = lemma_lexicon->lookup(e);
 					}
 				}
 			}
@ -2280,7 +2280,7 @@ bool XmlDoc::hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_to
 					uppercase_word[sz] = '\0';
 					if(sz!=e.size() || memcmp(e.data(),uppercase_word,e.size())!=0) {
 						e = uppercase_word;
-						le = lemma_lexicon.lookup(e);
+						le = lemma_lexicon->lookup(e);
 					}
 				}
 			}
--- a/word_variations/STOWordVariationGenerator.cpp
+++ b/word_variations/STOWordVariationGenerator.cpp
@ -1,10 +1,12 @@
 #include "STOWordVariationGenerator.h"
+#include "Lexicons.h"
 #include "fctypes.h"  //to_lower_utf8
 #include "utf8.h" //getUtf8CharSize etc


+//Obtain the lexicon for 'filename' through getLexicon() and keep the pointer in 'lexicon'.
+//Returns true if a lexicon was obtained, false if getLexicon() returned nullptr.
 bool STOWordVariationGenerator::load_lexicon(const char *filename) {
-	return lexicon.load(filename);
+	lexicon = getLexicon(filename);
+	return lexicon!=nullptr;
 }


--- a/word_variations/STOWordVariationGenerator.h
+++ b/word_variations/STOWordVariationGenerator.h
@ -6,11 +6,11 @@
 //A word variation generator that can use a STO database
 class STOWordVariationGenerator : public WordVariationGenerator {
 protected:
-	sto::Lexicon lexicon;
+	sto::Lexicon *lexicon;
 public:
 	using WordVariationGenerator::WordVariationGenerator;
 	bool load_lexicon(const char *filename);
-	void unload_lexicon() { lexicon.unload(); }
+	void unload_lexicon() { lexicon = nullptr; }

 	std::vector<std::string> lower_words(const std::vector<std::string> &source_words);
 	std::string capitalize_word(const std::string &lower_word);
--- a/word_variations/WordVariationsGenerator_danish.cpp
+++ b/word_variations/WordVariationsGenerator_danish.cpp
@ -227,7 +227,7 @@ void WordVariationGenerator_danish::find_simple_attribute_difference_wordforms(s
 {
 	for(unsigned i=0; i<source_words.size(); i++) {
 		auto source_word(source_words[i]);
-		LogicalMatches matches(lexicon,source_word,noun);
+		LogicalMatches matches(*lexicon,source_word,noun);
 		for(auto match : matches) {
 			auto wordforms(match->query_all_explicit_word_forms());
 			for(auto wordform : wordforms) {
@ -269,14 +269,14 @@ void WordVariationGenerator_danish::find_simple_attribute_match_wordforms(std::v
 {
 	for(unsigned i=0; i<source_words.size(); i++) {
 		auto source_word(source_words[i]);
-		LogicalMatches matches(lexicon,source_word,whatever);
+		LogicalMatches matches(*lexicon,source_word,whatever);
 		for(auto match : matches) {
 			auto wordforms(match->query_all_explicit_word_forms());
 			for(auto wordform : wordforms) {
 				if(same_wordform_as_source(*wordform,matches.query_matched_word())) {
 					//found the word form match. Now look for other wordforms with exactly the same attributes. Those are alternate spellings.
 					//so first find all lexical entries with the same morphological unit id, and check all wordforms of those, looking for an attribute match
-					auto same_morph_entries = lexicon.query_lexical_entries_with_same_morphological_unit_id(match);
+					auto same_morph_entries = lexicon->query_lexical_entries_with_same_morphological_unit_id(match);
 					for(auto same_morph_entry : same_morph_entries) {
 						auto wordforms2(same_morph_entry->query_all_explicit_word_forms());
 						for(auto wordform2 : wordforms2) {
@ -427,7 +427,7 @@ void WordVariationGenerator_danish::transliterate_verb_acute_accent(std::vector<
 		if(source_word.length()>4 && source_word.substr(source_word.length()-2)=="er") {
 			//possibly a verb in imperative
 			bool is_imperative = false;
-			LogicalMatches matches(lexicon,source_word,verb);
+			LogicalMatches matches(*lexicon,source_word,verb);
 			for(auto match : matches) {
 				auto wordforms(match->query_all_explicit_word_forms());
 				for(auto wordform : wordforms) {
@ -474,7 +474,7 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
 		auto source_word(lower_source_words[i]);
 		if(source_word==" ")
 			continue;
-		LogicalMatches matches(lexicon,source_word,verb);
+		LogicalMatches matches(*lexicon,source_word,verb);
 		if(prev_was_er || prev_was_var || prev_was_har || prev_was_havde) {
 			//check if this word is the past participle
 			const sto::WordForm *wordform_past_participle = NULL;
@ -672,7 +672,7 @@ void WordVariationGenerator_danish::make_proper_noun_part_genetive(std::vector<W
 			continue;
 		
 		//find noun
-		LogicalMatches matches(lexicon,source_word0,noun);
+		LogicalMatches matches(*lexicon,source_word0,noun);
 		const sto::WordForm *wordform_noun = NULL;
 		for(auto match : matches) {
 			if(match->part_of_speech==sto::part_of_speech_t::commonNoun) {
@ -709,7 +709,7 @@ void WordVariationGenerator_danish::make_proper_noun_part_genetive(std::vector<W
 		auto source_word4_capitalized(capitalize_word(source_word4));
 		
 		//find proper-noun
-		auto matches2 = lexicon.query_matches(source_word4_capitalized);
+		auto matches2 = lexicon->query_matches(source_word4_capitalized);
 		const sto::WordForm *wordform_proper_noun = NULL;
 		const sto::WordForm *wordform_proper_noun_genitive = NULL;
 		for(auto match : matches2) {
@ -763,7 +763,7 @@ void WordVariationGenerator_danish::handle_adjective_grammatical_gender_simplifi
 		//find adjective
 		bool is_common_singular_indefinite = false;
 		const sto::WordForm *wordform_neuter_singular_indefinite = NULL;
-		LogicalMatches matches(lexicon,source_word0,whatever);
+		LogicalMatches matches(*lexicon,source_word0,whatever);
 		for(auto match : matches) {
 			if(match->part_of_speech==sto::part_of_speech_t::adjective) {
 				auto wordforms(match->query_all_explicit_word_forms());