tokenizer: combining mark removal for Italian
This commit is contained in:
parent
c4be13a0bf
commit
d8e03ccfb2
tokenizer
@ -230,6 +230,7 @@ static void remove_combining_marks_norwegian(TokenizerResult *tr);
|
||||
static void remove_combining_marks_swedish(TokenizerResult *tr);
|
||||
static void remove_combining_marks_german(TokenizerResult *tr);
|
||||
static void remove_combining_marks_swiss_german(TokenizerResult *tr);
|
||||
static void remove_combining_marks_italian(TokenizerResult *tr);
|
||||
static void remove_some_combining_marks(TokenizerResult *tr, const UChar32 native_marked_letters[], size_t native_marked_letters_count);
|
||||
|
||||
|
||||
@ -250,6 +251,9 @@ static void remove_combining_marks(TokenizerResult *tr, lang_t lang, const char
|
||||
else
|
||||
remove_combining_marks_swiss_german(tr);
|
||||
return;
|
||||
case langItalian:
|
||||
remove_combining_marks_italian(tr);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@ -333,6 +337,37 @@ static void remove_combining_marks_swiss_german(TokenizerResult *tr) {
|
||||
}
|
||||
|
||||
|
||||
//Combining marks in Italian:
|
||||
// - grave àèìòù Mandatory for lowercase. Dedicated keys on keyboard
|
||||
// - acute é Mandatory for lowercase. Dedicated keys on keyboard
|
||||
// - cedilla ç Non-native. Dedicated key on keyboard - lowercase only
|
||||
//Swiss-Italian keyboard has access to umlaut.
|
||||
//Major problem is that none of the three Italian keyboard layouts have easy access to uppercase accented letters, so the accents are frequently
|
||||
//omitted or typed as apostrophe. More discussion here: https://italian.stackexchange.com/questions/3878/how-do-italians-customarily-insert-uppercase-italian-vowels-with-diacritics-with
|
||||
//So one way to deal with this is to just remove all diacritics in both document and query, but that would lose precision. But given that most documents have been run through word
|
||||
//processing software the documents are mostly written correctly, and that when users type queries they rarely use uppercase so the accents are probably also typed correctly there.
|
||||
//So we keep the native and easily accessible marks. Then at a later date we should detect the incorrect forms and fix them (requires a dictionary though).
|
||||
//Italian variant: hands the list of precomposed letters whose marks are to be kept
//(grave and acute vowels plus c-cedilla, both cases) to remove_some_combining_marks().
static void remove_combining_marks_italian(TokenizerResult *tr) {
	//Letters that keep their mark. Order is unchanged from the original table.
	static const UChar32 kept_letters[] = {
		//grave, uppercase
		0x00C0, //À
		0x00C8, //È
		0x00CC, //Ì
		0x00D2, //Ò
		0x00D9, //Ù
		//grave, lowercase
		0x00E0, //à
		0x00E8, //è
		0x00EC, //ì
		0x00F2, //ò
		0x00F9, //ù
		//acute
		0x00C9, //É
		0x00E9, //é
		//cedilla
		0x00C7, //Ç
		0x00E7, //ç
	};
	const size_t kept_letters_count = sizeof kept_letters / sizeof *kept_letters;

	remove_some_combining_marks(tr, kept_letters, kept_letters_count);
}
|
||||
|
||||
|
||||
//Remove combining marks from the codepoints except for the native marked letters
|
||||
static void remove_some_combining_marks(TokenizerResult *tr, const UChar32 native_marked_letters[], size_t native_marked_letters_count) {
|
||||
const size_t org_token_count = tr->size();
|
||||
|
@ -609,6 +609,49 @@ int main(void) {
|
||||
assert(t.str(6)=="Noel");
|
||||
}
|
||||
|
||||
//italian diacritics
|
||||
printf("Test line %d\n",__LINE__);
|
||||
{
|
||||
T2 t("aaa bbb",langItalian);
|
||||
assert(t.token_count()==3);
|
||||
}
|
||||
|
||||
printf("Test line %d\n",__LINE__);
|
||||
{
|
||||
T2 t("Ragù",langItalian);
|
||||
assert(t.token_count()==1);
|
||||
assert(t.str(0)=="Ragù");
|
||||
}
|
||||
|
||||
printf("Test line %d\n",__LINE__);
|
||||
{
|
||||
T2 t("àèìòùéç",langItalian);
|
||||
assert(t.token_count()==1);
|
||||
assert(t.str(0)=="àèìòùéç");
|
||||
}
|
||||
|
||||
printf("Test line %d\n",__LINE__);
|
||||
{
|
||||
T2 t("ÀÈÌÒÙÉÇ",langItalian);
|
||||
assert(t.token_count()==1);
|
||||
assert(t.str(0)=="ÀÈÌÒÙÉÇ");
|
||||
}
|
||||
|
||||
printf("Test line %d\n",__LINE__);
|
||||
{
|
||||
T2 t("monaco münchen",langItalian);
|
||||
assert(t.token_count()==4);
|
||||
assert(t.str(3)=="munchen");
|
||||
}
|
||||
|
||||
printf("Test line %d\n",__LINE__);
|
||||
{
|
||||
T2 t("Eskişehir",langItalian);
|
||||
assert(t.token_count()==2);
|
||||
assert(t.str(1)=="Eskisehir");
|
||||
}
|
||||
|
||||
|
||||
//diacritics hands-off
|
||||
printf("Test line %d\n",__LINE__);
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user