mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-06-19 23:16:08 -04:00
Wordvariations: initial work on noun-prep-propernoun to propernoun-genitive noun rewrites
This commit is contained in:
@ -984,6 +984,15 @@ int32_t to_lower_utf8(char *dst, char * /*dstEnd*/, const char *src ) {
|
||||
return dst - dstart;
|
||||
}
|
||||
|
||||
int32_t to_upper_utf8(char *dst, const char *src) {
|
||||
if(is_ascii3(*src)) {
|
||||
*dst = to_upper_a(*src);
|
||||
return 1;
|
||||
}
|
||||
UChar32 x = utf8Decode(src);
|
||||
UChar32 y = ucToUpper(x);
|
||||
return utf8Encode(y, dst);
|
||||
}
|
||||
// currently unused
|
||||
// int32_t to_upper_utf8(char *dst, char *src) {
|
||||
// // if in ascii do it quickly
|
||||
|
@ -64,6 +64,7 @@ bool has_alpha_utf8(const char *s, const char *send);
|
||||
int32_t to_lower_utf8 (char *dst , const char *src ) ;
|
||||
int32_t to_lower_utf8 (char *dst , char *dstEnd, const char *src ) ;
|
||||
int32_t to_lower_utf8 (char *dst , char *dstEnd, const char *src, const char *srcEnd) ;
|
||||
int32_t to_upper_utf8(char *dst, const char *src);
|
||||
|
||||
// . get the # of words in this string
|
||||
int32_t getNumWords ( char *s , int32_t len ) ;
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include "STOWordVariationGenerator.h"
|
||||
#include "fctypes.h" //to_lower_utf8
|
||||
#include "Unicode.h" //getUtf8CharSize etc
|
||||
|
||||
|
||||
bool STOWordVariationGenerator::load_lexicon(const char *filename) {
|
||||
@ -20,3 +21,30 @@ std::vector<std::string> STOWordVariationGenerator::lower_words(const std::vecto
|
||||
}
|
||||
return dst_words;
|
||||
}
|
||||
|
||||
//...except for proper nouns, which are present in capitalized form. Users rarely bother typing them correctly, so these functions are useful too.
|
||||
std::string STOWordVariationGenerator::capitalize_word(const std::string &lower_src) {
|
||||
//todo: we don't handle o'Brien and other Irish names properly
|
||||
//todo: we don't handle correct capitalization of 'i' in Turkish locale (which should be 'İ')
|
||||
if(lower_src.length()==0)
|
||||
return lower_src;
|
||||
size_t sz = getUtf8CharSize(lower_src.data());
|
||||
if(sz>lower_src.length())
|
||||
return lower_src; //invalid/truncated utf8
|
||||
char tmp_src[6], tmp_dst[6];
|
||||
if(sz>=sizeof(tmp_src))
|
||||
return lower_src; //invalid/truncated utf8
|
||||
memcpy(tmp_src,lower_src.data(),sz);
|
||||
tmp_src[sz]='\0';
|
||||
size_t dstsz = to_upper_utf8(tmp_dst,tmp_src);
|
||||
tmp_dst[dstsz]='\0';
|
||||
return tmp_dst + lower_src.substr(sz);
|
||||
}
|
||||
|
||||
std::vector<std::string> STOWordVariationGenerator::capitalize_words(const std::vector<std::string> &lower_words) {
|
||||
std::vector<std::string> dst_words;
|
||||
for(auto src : lower_words) {
|
||||
dst_words.push_back(capitalize_word(src));
|
||||
}
|
||||
return dst_words;
|
||||
}
|
||||
|
@ -13,6 +13,8 @@ public:
|
||||
void unload_lexicon() { lexicon.unload(); }
|
||||
|
||||
std::vector<std::string> lower_words(const std::vector<std::string> &source_words);
|
||||
std::string capitalize_word(const std::string &lower_word);
|
||||
std::vector<std::string> capitalize_words(const std::vector<std::string> &lower_words);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -37,6 +37,10 @@ public:
|
||||
const std::vector<std::string> &source_words,
|
||||
const std::vector<std::string> &lower_source_words,
|
||||
float weight);
|
||||
void make_proper_noun_part_genetive(std::vector<WordVariationGenerator::Variation> &variations,
|
||||
const std::vector<std::string> &source_words,
|
||||
const std::vector<std::string> &lower_source_words,
|
||||
float weight);
|
||||
};
|
||||
|
||||
static WordVariationGenerator_danish s_WordVariationGenerator_danish;
|
||||
@ -87,6 +91,9 @@ std::vector<WordVariationGenerator::Variation> WordVariationGenerator_danish::qu
|
||||
find_simple_attribute_match_wordforms(variations,lower_source_words,weights.simple_spelling_variants);
|
||||
}
|
||||
|
||||
//currently inactive because Query.cpp/PosdbTable.cpp cannot handle wordvariations spanning more than one word
|
||||
//make_proper_noun_part_genetive(variations,source_words,lower_source_words,1.2);
|
||||
|
||||
//filter out duplicates and variations below threshold
|
||||
//syn-todo: when filtering out duplicates choose the one with the highest weight
|
||||
std::set<std::string> seen_variations;
|
||||
@ -545,3 +552,98 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
|
||||
prev_word_idx = i;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Appends "<proper-noun-genitive> <noun>" variations for every
// "<noun> <preposition> <proper-noun>" sequence found in the query.
//
// variations:         output; one Variation is appended per rewritable sequence
// source_words:       original tokens (not used by this function)
// lower_source_words: lower-cased tokens; the matched pattern is five
//                     consecutive tokens: word, " ", preposition, " ", word
// weight:             weight stored in each generated variation
void WordVariationGenerator_danish::make_proper_noun_part_genetive(std::vector<WordVariationGenerator::Variation> &variations,
								   const std::vector<std::string> &source_words,
								   const std::vector<std::string> &lower_source_words,
								   float weight)
{
	//In Danish when referring to the mayor/king/president/foreman/institution of some place/organization you can use either:
	//  <noun> <preposition> <proper-noun>
	//or
	//  <proper-noun><genitive s-suffix> <noun>
	//Examples:
	//  Kongen af Albanien.   Hospitalet i Lille Ubehage.   Direktøren for Nordisk Fjer
	//vs.
	//  Albaniens konge.      Lille Ubehages hospital.      Nordisk Fjers direktør.
	//are almost equally used. Some Danes feel that the genitive variant is a bit artificial for inanimate objects, eg a bucket,
	//but for places and organizations it feels more natural. So there are no clear-cut rules.

	//Iterate through the words and locate <noun> <preposition> <proper-noun>, and generate <proper-noun><genitive s-suffix> <noun>
	for(unsigned i=0; i+4<lower_source_words.size(); i++) {
		//bind by reference; the tokens are only read
		const std::string &noun_token = lower_source_words[i];
		const std::string &preposition_token = lower_source_words[i+2];
		const std::string &proper_noun_token = lower_source_words[i+4];

		//Cheap textual checks first so that no lexicon lookup is done for
		//token windows that cannot match: word, space, word, space, word
		if(noun_token==" " || proper_noun_token==" ")
			continue;
		if(lower_source_words[i+1]!=" " || lower_source_words[i+3]!=" ")
			continue;

		//find preposition
		//hack: just check for i/af/for
		if(preposition_token!="i" && preposition_token!="af" && preposition_token!="for")
			continue;

		//find noun: a common-noun word form equal to the source token and with unspecified case
		//(if several qualify, the last one found is used)
		const sto::WordForm *wordform_noun = nullptr;
		auto noun_matches(lexicon.query_matches(noun_token));
		for(auto match : noun_matches) {
			if(match->part_of_speech!=sto::part_of_speech_t::commonNoun)
				continue;
			auto wordforms(match->query_all_explicit_word_forms());
			for(auto wordform : wordforms) {
				if(same_wordform_as_source(*wordform,noun_token) &&
				   wordform->has_attribute(sto::word_form_attribute_t::case_unspecified))
				{
					wordform_noun = wordform;
				}
			}
		}
		if(!wordform_noun)
			continue;

		//find proper-noun; the lookup is done with the capitalized form of the token
		const auto proper_noun_capitalized(capitalize_word(proper_noun_token));
		const sto::WordForm *wordform_proper_noun = nullptr;
		const sto::WordForm *wordform_proper_noun_genitive = nullptr;
		auto proper_noun_matches(lexicon.query_matches(proper_noun_capitalized));
		//NOTE(review): the unspecified-case and genitive forms are picked independently
		//across all proper-noun matches (last one wins), so they may come from different
		//lexicon entries, and the unspecified-case form is not compared to the source token.
		for(auto match : proper_noun_matches) {
			if(match->part_of_speech!=sto::part_of_speech_t::properNoun)
				continue;
			auto wordforms(match->query_all_explicit_word_forms());
			for(auto wordform : wordforms) {
				if(wordform->has_attribute(sto::word_form_attribute_t::case_unspecified))
					wordform_proper_noun = wordform;
				if(wordform->has_attribute(sto::word_form_attribute_t::case_genitiveCase))
					wordform_proper_noun_genitive = wordform;
			}
		}
		if(!wordform_proper_noun)
			continue;

		//ok, we have noun-preposition-propernoun
		if(!wordform_proper_noun_genitive) {
			//but no genitive case. Hmmm. why?
			continue;
		}

		//transform that into propernoun-genitive noun
		WordVariationGenerator::Variation v0_0;
		v0_0.word = std::string(wordform_proper_noun_genitive->written_form,wordform_proper_noun_genitive->written_form_length)
			  + " "
			  + std::string(wordform_noun->written_form,wordform_noun->written_form_length);
		v0_0.weight = weight;
		//the variation replaces all five tokens: [i, i+5)
		v0_0.source_word_start = i;
		v0_0.source_word_end = i+5;
		variations.push_back(v0_0);
	}
}
|
||||
|
Reference in New Issue
Block a user