Wordvariations: initial work on noun-prep-propernoun to propernoun-genitive noun rewrites

This commit is contained in:
Ivan Skytte Jørgensen
2018-01-11 15:21:12 +01:00
parent 3361bb25a3
commit e00ef2469c
5 changed files with 142 additions and 0 deletions

@ -984,6 +984,15 @@ int32_t to_lower_utf8(char *dst, char * /*dstEnd*/, const char *src ) {
return dst - dstart;
}
int32_t to_upper_utf8(char *dst, const char *src) {
if(is_ascii3(*src)) {
*dst = to_upper_a(*src);
return 1;
}
UChar32 x = utf8Decode(src);
UChar32 y = ucToUpper(x);
return utf8Encode(y, dst);
}
// currently unused (older commented-out variant; the active to_upper_utf8 is defined above)
// int32_t to_upper_utf8(char *dst, char *src) {
// // if in ascii do it quickly

@ -64,6 +64,7 @@ bool has_alpha_utf8(const char *s, const char *send);
int32_t to_lower_utf8 (char *dst , const char *src ) ;
int32_t to_lower_utf8 (char *dst , char *dstEnd, const char *src ) ;
int32_t to_lower_utf8 (char *dst , char *dstEnd, const char *src, const char *srcEnd) ;
// Uppercases the single UTF-8 character at src into dst (no NUL terminator is written,
// dst is not bounds-checked). Returns 1 for ASCII input, otherwise the value returned by
// utf8Encode() (presumably the encoded byte count -- TODO confirm).
int32_t to_upper_utf8(char *dst, const char *src);
// . get the # of words in this string
int32_t getNumWords ( char *s , int32_t len ) ;

@ -1,5 +1,6 @@
#include "STOWordVariationGenerator.h"
#include "fctypes.h" //to_lower_utf8
#include "Unicode.h" //getUtf8CharSize etc
bool STOWordVariationGenerator::load_lexicon(const char *filename) {
@ -20,3 +21,30 @@ std::vector<std::string> STOWordVariationGenerator::lower_words(const std::vecto
}
return dst_words;
}
//....except for proper nouns which are present in capitalized form, but users rarely bother typing it correctly so these functions are useful too
// Returns a copy of lower_src whose first UTF-8 character has been uppercased with
// to_upper_utf8(); the remaining bytes are copied verbatim.
// The input is returned unchanged when it is empty, when its leading character is
// truncated or longer than the scratch buffers allow, or when the uppercase conversion
// reports a length that cannot be used safely.
std::string STOWordVariationGenerator::capitalize_word(const std::string &lower_src) {
    //todo: we don't handle o'Brien and other Irish names properly
    //todo: we don't handle correct capitalization of 'i' in Turkish locale (which should be 'İ')
    if(lower_src.empty())
        return lower_src;
    const size_t sz = getUtf8CharSize(lower_src.data());
    if(sz>lower_src.length())
        return lower_src; //invalid/truncated utf8
    char tmp_src[6], tmp_dst[6];
    if(sz>=sizeof(tmp_src))
        return lower_src; //invalid/truncated utf8
    //isolate the first character as a NUL-terminated string for to_upper_utf8()
    memcpy(tmp_src,lower_src.data(),sz);
    tmp_src[sz]='\0';
    //to_upper_utf8() returns a signed length; validate it before using it as an index
    //so a zero/negative/oversized value can neither drop the character nor write
    //outside tmp_dst
    const int32_t dstsz = to_upper_utf8(tmp_dst,tmp_src);
    if(dstsz<=0 || static_cast<size_t>(dstsz)>=sizeof(tmp_dst))
        return lower_src; //conversion failed - keep the word as typed
    tmp_dst[dstsz]='\0';
    return tmp_dst + lower_src.substr(sz);
}
// Applies capitalize_word() to every element of lower_words and returns the results
// in the same order as the input.
std::vector<std::string> STOWordVariationGenerator::capitalize_words(const std::vector<std::string> &lower_words) {
    std::vector<std::string> dst_words;
    //output has exactly one entry per input word, so allocate once
    dst_words.reserve(lower_words.size());
    //iterate by const reference so each word is not copied just to be read
    for(const auto &src : lower_words)
        dst_words.push_back(capitalize_word(src));
    return dst_words;
}

@ -13,6 +13,8 @@ public:
//Forwards to lexicon.unload().
void unload_lexicon() {
    lexicon.unload();
}
std::vector<std::string> lower_words(const std::vector<std::string> &source_words);
//Returns lower_word with its first UTF-8 character uppercased; input that is empty or starts with an invalid/truncated character is returned unchanged.
std::string capitalize_word(const std::string &lower_word);
//Returns a vector holding capitalize_word() of each element of lower_words, in the same order.
std::vector<std::string> capitalize_words(const std::vector<std::string> &lower_words);
};
#endif

@ -37,6 +37,10 @@ public:
const std::vector<std::string> &source_words,
const std::vector<std::string> &lower_source_words,
float weight);
//Scans lower_source_words for "<noun> <i|af|for> <proper-noun>" sequences and appends a
//"<proper-noun in genitive> <noun>" variation with the given weight to 'variations'.
//source_words is not read by the current implementation.
void make_proper_noun_part_genetive(std::vector<WordVariationGenerator::Variation> &variations,
const std::vector<std::string> &source_words,
const std::vector<std::string> &lower_source_words,
float weight);
};
static WordVariationGenerator_danish s_WordVariationGenerator_danish;
@ -87,6 +91,9 @@ std::vector<WordVariationGenerator::Variation> WordVariationGenerator_danish::qu
find_simple_attribute_match_wordforms(variations,lower_source_words,weights.simple_spelling_variants);
}
//currently inactive because Query.cpp/PosdbTable.cpp cannot handle wordvariations spanning more than one word
//make_proper_noun_part_genetive(variations,source_words,lower_source_words,1.2);
//filter out duplicates and variations below threshold
//syn-todo: when filtering out duplicates choose the one with the highest weight
std::set<std::string> seen_variations;
@ -545,3 +552,98 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
prev_word_idx = i;
}
}
//Rewrites "<noun> <preposition> <proper-noun>" into "<proper-noun-genitive> <noun>".
//  variations          receives one appended Variation per rewrite found
//  source_words        currently unused (only the lowercased words are inspected)
//  lower_source_words  lowercased words; the code expects word/" "/word/" "/word, i.e. the
//                      separators at i+1 and i+3 must be exactly a single space
//  weight              weight stored in every generated Variation
void WordVariationGenerator_danish::make_proper_noun_part_genetive(std::vector<WordVariationGenerator::Variation> &variations,
                                                                   const std::vector<std::string> & /*source_words*/,
                                                                   const std::vector<std::string> &lower_source_words,
                                                                   float weight)
{
    //In Danish when referring to the mayor/king/president/foreman/institution of some place/organization you can use either:
    //  <noun> <preposition> <proper-noun>
    //or
    //  <proper-noun><genitive s-suffix> <noun>
    //Examples:
    //  Kongen af Albanien.  Hospitalet i Lille Ubehage.  Direktøren for Nordisk Fjer
    //vs.
    //  Albaniens konge.     Lille Ubehages hospital.     Nordisk Fjers direktør.
    //are almost equally used. Some Danes feel that the genitive variant is a bit artificial for inanimate objects, eg a bucket,
    //but for places and organizations it feels more natural. So there are no clear-cut rules.

    //NOTE(review): only a single word at i+4 is looked up, so multi-word proper nouns such as
    //"Lille Ubehage" from the examples above are not recognized as a unit.

    //Iterate through the words and locate <noun> <preposition> <proper-noun>, and generate <proper-noun><genitive s-suffix> <noun>
    for(unsigned i=0; i+4<lower_source_words.size(); i++) {
        //references, not copies: the words are only read
        const std::string &source_word0 = lower_source_words[i];
        if(source_word0==" ")
            continue;

        //find noun
        auto noun_matches(lexicon.query_matches(source_word0));
        const sto::WordForm *wordform_noun = nullptr;
        for(auto match : noun_matches) {
            if(match->part_of_speech==sto::part_of_speech_t::commonNoun) {
                auto wordforms(match->query_all_explicit_word_forms());
                for(auto wordform : wordforms) {
                    //no early exit: the last qualifying wordform is the one kept
                    if(same_wordform_as_source(*wordform,source_word0) &&
                       wordform->has_attribute(sto::word_form_attribute_t::case_unspecified))
                    {
                        wordform_noun = wordform;
                    }
                }
            }
        }
        if(!wordform_noun)
            continue;
        if(lower_source_words[i+1]!=" ")
            continue;

        //find preposition
        //hack: just check for i/af/for
        const std::string &source_word2 = lower_source_words[i+2];
        if(source_word2==" ")
            continue;
        if(source_word2!="i" && source_word2!="af" && source_word2!="for")
            continue;
        if(lower_source_words[i+3]!=" ")
            continue;

        const std::string &source_word4 = lower_source_words[i+4];
        if(source_word4==" ")
            continue;
        //the lookup below is done with the capitalized form of the (lowercased) word
        auto source_word4_capitalized(capitalize_word(source_word4));

        //find proper-noun
        auto proper_noun_matches(lexicon.query_matches(source_word4_capitalized));
        const sto::WordForm *wordform_proper_noun = nullptr;
        const sto::WordForm *wordform_proper_noun_genitive = nullptr;
        for(auto match : proper_noun_matches) {
            if(match->part_of_speech==sto::part_of_speech_t::properNoun) {
                auto wordforms(match->query_all_explicit_word_forms());
                for(auto wordform : wordforms) {
                    if(wordform->has_attribute(sto::word_form_attribute_t::case_unspecified))
                        wordform_proper_noun = wordform;
                    if(wordform->has_attribute(sto::word_form_attribute_t::case_genitiveCase))
                        wordform_proper_noun_genitive = wordform;
                }
            }
        }
        if(!wordform_proper_noun)
            continue;
        //ok, we have noun-preposition-propernoun
        if(!wordform_proper_noun_genitive) {
            //but no genitive case. Hmmm. why?
            continue;
        }

        //transform that into propernoun-genitive noun
        //NOTE(review): the noun is emitted in the wordform that matched the source word
        //(presumably e.g. definite "kongen"), whereas the examples above use the indefinite
        //form ("Albaniens konge") - confirm whether a different noun form should be selected.
        WordVariationGenerator::Variation v0_0;
        v0_0.word = std::string(wordform_proper_noun_genitive->written_form,wordform_proper_noun_genitive->written_form_length) + " " + std::string(wordform_noun->written_form,wordform_noun->written_form_length);
        v0_0.weight = weight;
        //the variation replaces the five source entries i..i+4 (end presumably exclusive - TODO confirm)
        v0_0.source_word_start = i;
        v0_0.source_word_end = i+5;
        variations.push_back(v0_0);
    }
}