tokenizer: combining mark removal for Italian

This commit is contained in:
Ivan Skytte Jørgensen 2018-06-13 16:56:59 +02:00
parent c4be13a0bf
commit d8e03ccfb2
2 changed files with 78 additions and 0 deletions

View File

@ -230,6 +230,7 @@ static void remove_combining_marks_norwegian(TokenizerResult *tr);
static void remove_combining_marks_swedish(TokenizerResult *tr);
static void remove_combining_marks_german(TokenizerResult *tr);
static void remove_combining_marks_swiss_german(TokenizerResult *tr);
static void remove_combining_marks_italian(TokenizerResult *tr);
static void remove_some_combining_marks(TokenizerResult *tr, const UChar32 native_marked_letters[], size_t native_marked_letters_count);
@ -250,6 +251,9 @@ static void remove_combining_marks(TokenizerResult *tr, lang_t lang, const char
else
remove_combining_marks_swiss_german(tr);
return;
case langItalian:
remove_combining_marks_italian(tr);
break;
default:
break;
}
@ -333,6 +337,37 @@ static void remove_combining_marks_swiss_german(TokenizerResult *tr) {
}
//Combining marks in Italian:
// - grave àèìòù Mandatory for lowercase. Dedicated keys on keyboard
// - acute é Mandatory for lowercase. Dedicated keys on keyboard
// - cedilla ç Non-native. Dedicated key on keyboard - lowercase only
//Swiss-Italian keyboard has access to umlaut.
//Major problem is that none of the three Italian keyboard layouts has easy access to uppercase accented letters, so the accents are frequently
//omitted or typed as apostrophe. More discussion here: https://italian.stackexchange.com/questions/3878/how-do-italians-customarily-insert-uppercase-italian-vowels-with-diacritics-with
//So one way to deal with this is to just remove all diacritics in both document and query, but that would lose precision. But given that most documents have been run through word
//processing software the documents are mostly written correctly, and that when users type queries they rarely use uppercase so the accents are probably also typed correctly there.
//So we keep the native and easily accessible marks. Then on a later date we should detect the incorrect forms and fix them (requires a dictionary though).
//Strip combining marks for Italian, preserving the letters that are native to
//Italian orthography (grave-accented vowels, acute e, and the borrowed c-cedilla),
//in both cases. Everything else is delegated to remove_some_combining_marks().
static void remove_combining_marks_italian(TokenizerResult *tr) {
	static const UChar32 italian_native_letters[] = {
		0x00C0, 0x00C8, 0x00CC, 0x00D2, 0x00D9, //À È Ì Ò Ù
		0x00E0, 0x00E8, 0x00EC, 0x00F2, 0x00F9, //à è ì ò ù
		0x00C9, 0x00E9,                         //É é
		0x00C7, 0x00E7,                         //Ç ç
	};
	const size_t letter_count = sizeof(italian_native_letters)/sizeof(italian_native_letters[0]);
	remove_some_combining_marks(tr, italian_native_letters, letter_count);
}
//Remove combining marks from the codepoints except for the native marked letters
static void remove_some_combining_marks(TokenizerResult *tr, const UChar32 native_marked_letters[], size_t native_marked_letters_count) {
const size_t org_token_count = tr->size();

View File

@ -609,6 +609,49 @@ int main(void) {
assert(t.str(6)=="Noel");
}
//italian diacritics
printf("Test line %d\n",__LINE__);
{
T2 t("aaa bbb",langItalian);
assert(t.token_count()==3);
}
printf("Test line %d\n",__LINE__);
{
T2 t("Ragù",langItalian);
assert(t.token_count()==1);
assert(t.str(0)=="Ragù");
}
printf("Test line %d\n",__LINE__);
{
T2 t("àèìòùéç",langItalian);
assert(t.token_count()==1);
assert(t.str(0)=="àèìòùéç");
}
printf("Test line %d\n",__LINE__);
{
T2 t("ÀÈÌÒÙÉÇ",langItalian);
assert(t.token_count()==1);
assert(t.str(0)=="ÀÈÌÒÙÉÇ");
}
printf("Test line %d\n",__LINE__);
{
T2 t("monaco münchen",langItalian);
assert(t.token_count()==4);
assert(t.str(3)=="munchen");
}
printf("Test line %d\n",__LINE__);
{
T2 t("Eskişehir",langItalian);
assert(t.token_count()==2);
assert(t.str(1)=="Eskisehir");
}
//diacritics hands-off
printf("Test line %d\n",__LINE__);
{