tokenizer: handle superscript/subscript

This commit is contained in:
Ivan Skytte Jørgensen
2018-03-02 16:22:16 +01:00
parent cb7c8e4bc5
commit 41bb518844
2 changed files with 265 additions and 3 deletions

@ -14,6 +14,8 @@ static void remove_combining_marks(TokenizerResult *tr, lang_t lang);
static void combine_possessive_s_tokens(TokenizerResult *tr, lang_t lang);
static void combine_hyphenated_words(TokenizerResult *tr);
static void recognize_telephone_numbers(TokenizerResult *tr, lang_t lang, const char *country_code);
static void tokenize_superscript(TokenizerResult *tr);
static void tokenize_subscript(TokenizerResult *tr);
//pass 2 tokenizer / language-dependent tokenization
@ -28,8 +30,8 @@ void plain_tokenizer_phase_2(const char * /*str*/, size_t /*len*/, lang_t lang,
remove_combining_marks(tr,lang);
combine_possessive_s_tokens(tr,lang);
//TODO: chemical formulae
//TODO: subscript
//TODO: superscript
tokenize_subscript(tr);
tokenize_superscript(tr);
combine_hyphenated_words(tr);
recognize_telephone_numbers(tr,lang,country_code);
@ -259,7 +261,7 @@ static bool is_hyphen(const TokenRange &tr) {
static void combine_hyphenated_words(TokenizerResult *tr) {
const size_t org_token_count = tr->size();
for(size_t i=0; i<org_token_count; ) {
auto const &first_token = (*tr)[0];
auto const &first_token = (*tr)[i];
if(!first_token.is_alfanum) {
i++;
continue;
@ -400,3 +402,206 @@ static void recognize_telephone_numbers_denmark(TokenizerResult *tr) {
tr->tokens.emplace_back(t0.start_pos, t6.end_pos, s, sl, true);
}
}
//////////////////////////////////////////////////////////////////////////////
// Superscript and subscript
static void tokenize_superscript(TokenizerResult *tr) {
	//Generate plain-character variants of alfanum tokens containing superscript characters.
	//
	//"E=mc²" comes out of the phase-1 tokenizer with "mc²" as one of its tokens. Because people
	//normally don't type the superscript-2 we append a variant with the plain digit ("mc2").
	//If the only superscript in the token is its last character then we additionally append the
	//two parts as separate tokens ("mc" and "2"). This is a workaround for footnote numbers
	//directly attached to the preceding word, and also covers spanish/portuguese ordinals.
	//
	//Only the tokens present on entry are examined; the appended variants are not re-processed.
	const size_t org_token_count = tr->size();
	for(size_t i=0; i<org_token_count; i++) {
		auto const &t = (*tr)[i];
		if(!t.is_alfanum)
			continue;
		//token_len is in bytes, so this also guarantees that the codepoint arrays below are large enough
		if(t.token_len>max_word_codepoints)
			continue;
		UChar32 org_uc[max_word_codepoints];
		int ucs = decode_utf8_string(t.token_start,t.token_len,org_uc);
		if(ucs<=0)
			continue;
		//Copy the positions now. 't' refers to an element of the token container, and appending to
		//that container below may relocate its elements, after which 't' must not be touched.
		const auto org_start_pos = t.start_pos;
		const auto org_end_pos = t.end_pos;

		UChar32 new_uc[max_word_codepoints];
		int num_changed=0;
		int change_pos=-1;
		for(int j=0; j<ucs; j++) {
			//UnicodeData.txt has many entries with <super> but we only look for a subset of those
			//(we don't care about IPA extensions, ideographic annotations, ...)
			//Superscript plus/minus/equals/parentheses (U+207A..U+207E) are deliberately not mapped.
			UChar32 n = org_uc[j];
			switch(org_uc[j]) {
				case 0x00AA: //FEMININE ORDINAL INDICATOR
					n = 0x0061; break;
				case 0x00B2: //SUPERSCRIPT TWO
					n = 0x0032; break;
				case 0x00B3: //SUPERSCRIPT THREE
					n = 0x0033; break;
				case 0x00B9: //SUPERSCRIPT ONE
					n = 0x0031; break;
				case 0x00BA: //MASCULINE ORDINAL INDICATOR
					n = 0x006F; break;
				case 0x2070: //SUPERSCRIPT ZERO
					n = 0x0030; break;
				case 0x2071: //SUPERSCRIPT LATIN SMALL LETTER I
					n = 0x0069; break;
				case 0x2074: //SUPERSCRIPT FOUR
					n = 0x0034; break;
				case 0x2075: //SUPERSCRIPT FIVE
					n = 0x0035; break;
				case 0x2076: //SUPERSCRIPT SIX
					n = 0x0036; break;
				case 0x2077: //SUPERSCRIPT SEVEN
					n = 0x0037; break;
				case 0x2078: //SUPERSCRIPT EIGHT
					n = 0x0038; break;
				case 0x2079: //SUPERSCRIPT NINE
					n = 0x0039; break;
				case 0x207F: //SUPERSCRIPT LATIN SMALL LETTER N
					n = 0x006E; break;
				default:
					break;
			}
			new_uc[j] = n;
			if(n!=org_uc[j]) {
				num_changed++;
				change_pos = j;
			}
		}
		if(num_changed==0)
			continue;

		//variant of the whole token with the superscripts replaced
		char *s = (char*)tr->egstack.alloc(ucs*4);
		size_t sl = encode_utf8_string(new_uc,ucs,s);
		tr->tokens.emplace_back(org_start_pos,org_end_pos, s,sl, true);

		//footnote special (and spanish/portuguese ordinal)
		//Requires at least one character in front of the superscript, otherwise we would emit an
		//empty word token plus a duplicate of the variant above.
		if(num_changed==1 && change_pos==ucs-1 && ucs>1) {
			//The word part is unchanged, so its encoded length is also its length in the source text
			s = (char*)tr->egstack.alloc((ucs-1)*4);
			const size_t prefix_len = encode_utf8_string(new_uc,ucs-1,s);
			tr->tokens.emplace_back(org_start_pos,org_start_pos+prefix_len, s,prefix_len, true);
			//The superscript part covers the rest of the original token. Its source range cannot be
			//derived from the length of the replacement because the superscript character occupies
			//more bytes in the source text than the plain character does.
			s = (char*)tr->egstack.alloc(4);
			sl = encode_utf8_string(new_uc+ucs-1,1,s);
			tr->tokens.emplace_back(org_start_pos+prefix_len,org_end_pos, s,sl, true);
		}
	}
}
static void tokenize_subscript(TokenizerResult *tr) {
//The phase-1 tokenizer considers "H₂O" a single token
//We generate the variant without the subcsript, "H2O"
const size_t org_token_count = tr->size();
for(size_t i=0; i<org_token_count; i++) {
auto const &t = (*tr)[i];
if(!t.is_alfanum)
continue;
if(t.token_len>max_word_codepoints)
continue;
UChar32 org_uc[max_word_codepoints];
int ucs = decode_utf8_string(t.token_start,t.token_len,org_uc);
if(ucs<=0)
continue;
UChar32 new_uc[max_word_codepoints];
bool any_changed = false;
for(int j=0; j<ucs; j++) {
//we should really be using UnicodeData.txt's <sub> decompositions, but it's currently hardly worth it.
UChar32 n = org_uc[j];
switch(org_uc[j]) {
case 0x1D62: //LATIN SUBSCRIPT SMALL LETTER I
n = 0x0069; break;
case 0x1D63: //LATIN SUBSCRIPT SMALL LETTER R
n = 0x0072; break;
case 0x1D64: //LATIN SUBSCRIPT SMALL LETTER U
n = 0x0075; break;
case 0x1D65: //LATIN SUBSCRIPT SMALL LETTER V
n = 0x0076; break;
case 0x1D66: //GREEK SUBSCRIPT SMALL LETTER BETA
n = 0x03B2; break;
case 0x1D67: //GREEK SUBSCRIPT SMALL LETTER GAMMA
n = 0x03B3; break;
case 0x1D68: //GREEK SUBSCRIPT SMALL LETTER RHO
n = 0x03C1; break;
case 0x1D69: //GREEK SUBSCRIPT SMALL LETTER PHI
n = 0x03C6; break;
case 0x1D6A: //GREEK SUBSCRIPT SMALL LETTER CHI
n = 0x03C7; break;
case 0x2080: //SUBSCRIPT ZERO
n = 0x0030; break;
case 0x2081: //SUBSCRIPT ONE
n = 0x0031; break;
case 0x2082: //SUBSCRIPT TWO
n = 0x0032; break;
case 0x2083: //SUBSCRIPT THREE
n = 0x0033; break;
case 0x2084: //SUBSCRIPT FOUR
n = 0x0034; break;
case 0x2085: //SUBSCRIPT FIVE
n = 0x0035; break;
case 0x2086: //SUBSCRIPT SIX
n = 0x0036; break;
case 0x2087: //SUBSCRIPT SEVEN
n = 0x0037; break;
case 0x2088: //SUBSCRIPT EIGHT
n = 0x0038; break;
case 0x2089: //SUBSCRIPT NINE
n = 0x0039; break;
case 0x208A: //SUBSCRIPT PLUS SIGN
n = 0x002B; break;
case 0x208B: //SUBSCRIPT MINUS
n = 0x2212; break;
case 0x208C: //SUBSCRIPT EQUALS SIGN
n = 0x003D; break;
case 0x208D: //SUBSCRIPT LEFT PARENTHESIS
n = 0x0028; break;
case 0x208E: //SUBSCRIPT RIGHT PARENTHESIS
n = 0x0029; break;
case 0x2090: //LATIN SUBSCRIPT SMALL LETTER A
n = 0x0061; break;
case 0x2091: //LATIN SUBSCRIPT SMALL LETTER E
n = 0x0065; break;
case 0x2092: //LATIN SUBSCRIPT SMALL LETTER O
n = 0x006F; break;
case 0x2093: //LATIN SUBSCRIPT SMALL LETTER X
n = 0x0078; break;
case 0x2094: //LATIN SUBSCRIPT SMALL LETTER SCHWA
n = 0x0259; break;
case 0x2095: //LATIN SUBSCRIPT SMALL LETTER H
n = 0x0068; break;
case 0x2096: //LATIN SUBSCRIPT SMALL LETTER K
n = 0x006B; break;
case 0x2097: //LATIN SUBSCRIPT SMALL LETTER L
n = 0x006C; break;
case 0x2098: //LATIN SUBSCRIPT SMALL LETTER M
n = 0x006D; break;
case 0x2099: //LATIN SUBSCRIPT SMALL LETTER N
n = 0x006E; break;
case 0x209A: //LATIN SUBSCRIPT SMALL LETTER P
n = 0x0070; break;
case 0x209B: //LATIN SUBSCRIPT SMALL LETTER S
n = 0x0073; break;
case 0x209C: //LATIN SUBSCRIPT SMALL LETTER T
n = 0x0074; break;
case 0x2C7C: //LATIN SUBSCRIPT SMALL LETTER J
n = 0x006A; break;
default:
break;
}
new_uc[j] = n;
if(n!=org_uc[j])
any_changed = true;
}
if(any_changed) {
char *s = (char*)tr->egstack.alloc(ucs*4);
size_t sl = encode_utf8_string(new_uc,ucs,s);
tr->tokens.emplace_back(t.start_pos,t.end_pos, s,sl, true);
}
}
}

@ -607,5 +607,62 @@ int main(void) {
assert(t.token_count()==8);
assert(t.str(7)=="70270431");
}
// subscript, phase 2
printf("Test line %d\n",__LINE__);
{
T2 t("H₂O",langUnknown); //water
assert(t.token_count()==2);
assert(t.str(1)=="H2O");
}
printf("Test line %d\n",__LINE__);
{
T2 t("H₂O₂",langUnknown); //hydrogen peroxide
assert(t.token_count()==2);
assert(t.str(1)=="H2O2");
}
printf("Test line %d\n",__LINE__);
{
T2 t("H₂SO₄",langUnknown); //sulphuric acid
assert(t.token_count()==2);
assert(t.str(1)=="H2SO4");
}
// superscript, phase 2
printf("Test line %d\n",__LINE__);
{
T2 t("foo²boo",langUnknown);
assert(!t.empty());
assert(t.token_count()==2);
assert(t.str(0)=="foo²boo");
assert(t.str(1)=="foo2boo");
}
printf("Test line %d\n",__LINE__);
{
T2 t("E=mc²",langUnknown);
assert(!t.empty());
assert(t.token_count()==6);
assert(t.str(0)=="E");
assert(t.str(1)=="=");
assert(t.str(2)=="mc²");
assert(t.str(3)=="mc2");
assert(t.str(4)=="mc");
assert(t.str(5)=="2");
}
printf("Test line %d\n",__LINE__);
{
T2 t("j*=σT⁴",langUnknown);
assert(!t.empty());
assert(t.token_count()==7);
assert(t.str(0)=="j");
assert(t.str(1)=="*=");
assert(t.str(2)=="σ");
assert(t.str(3)=="T⁴");
assert(t.str(4)=="T4");
assert(t.str(5)=="T");
assert(t.str(6)=="4");
}
return 0;
}