tokenizer: phase-1 XML tokenizer added + tests

Ivan Skytte Jørgensen
2018-03-02 18:46:03 +01:00
parent f494ee4edf
commit 9591eb7978
5 changed files with 145 additions and 7 deletions

@@ -15,6 +15,13 @@ default: run
tokenizer_unittest: tokenizer.o tokenizer2.o tokenizer_unittest.o
	g++ -g tokenizer.o tokenizer2.o tokenizer_unittest.o ../unicode/libunicode.a ../utf8_fast.o ../utf8.o ../EGStack.o -o $@

xml_tokenizer_unittest: tokenizer.o tokenizer2.o tokenizer3.o xml_tokenizer_unittest.o
	g++ -g tokenizer.o tokenizer2.o tokenizer3.o xml_tokenizer_unittest.o ../unicode/libunicode.a ../libgb.a -lm -lpthread -lssl -lcrypto -lz -lpcre -lsqlite3 -ldl -L../ -lcld2_full -lcld3 -lprotobuf -lced -lcares -o $@

.PHONY: run
run: tokenizer_unittest
	LD_LIBRARY_PATH=.. ./tokenizer_unittest

.PHONY: xml_run
xml_run: xml_tokenizer_unittest
	LD_LIBRARY_PATH=.. ./xml_tokenizer_unittest

@@ -14,6 +14,10 @@ static bool is_word_script(Unicode::script_t s);
// common: used in multiple scripts, e.g. digits 0-9, but also e.g. the Thai currency symbol
// inherit: has the script of the preceding character. This is normally a decomposed diacritic or combining mark
void plain_tokenizer_phase_1(const char *str, size_t len, TokenizerResult *tr) {
	plain_tokenizer_phase_1_downcall(str,len,0,tr);
}

void plain_tokenizer_phase_1_downcall(const char *str, size_t len, size_t pos_base, TokenizerResult *tr) {
	for(size_t i = 0; i<len; ) {
		UChar32 c = utf8Decode(str+i);
		bool in_alnum_token = ucIsWordChar_fast(c);
@@ -45,7 +49,7 @@ void plain_tokenizer_phase_1(const char *str, size_t len, TokenizerResult *tr) {
			j += getUtf8CharSize(str+j);
		}
		//found token [i..j)
		tr->tokens.emplace_back(pos_base+i,pos_base+j, str+i,j-i, in_alnum_token);
		i = j;
	}
}
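
The new pos_base parameter lets a caller tokenize a fragment of a larger document while keeping token positions relative to the whole document. A minimal sketch of the intent (hypothetical offset value; assumes only the declarations above):

	#include "tokenizer.h"
	#include <assert.h>
	#include <string.h>

	int main(void) {
		//tokenize a fragment as if it started at byte 100 of a larger document
		const char *fragment = "bbb ccc";
		TokenizerResult tr;
		plain_tokenizer_phase_1_downcall(fragment, strlen(fragment), 100, &tr);
		assert(tr.tokens[0].start_pos==100); //"bbb" is reported at 100, not 0
		assert(tr.tokens[0].end_pos==103);
		return 0;
	}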

@@ -9,12 +9,20 @@
struct TokenRange {
	TokenRange(size_t start_pos_, size_t end_pos_, const char *token_start_, size_t token_len_, bool is_alfanum_)
		: start_pos(start_pos_), end_pos(end_pos_),
		  token_start(token_start_), token_len(token_len_),
		  is_alfanum(is_alfanum_),
		  token_hash(0),
		  nodeid(0),
		  xml_node_index(0)
	{ }
	TokenRange(size_t start_pos_, size_t end_pos_, const char *token_start_, size_t token_len_, nodeid_t node_id, int32_t xml_node_index_)
		: start_pos(start_pos_), end_pos(end_pos_),
		  token_start(token_start_), token_len(token_len_),
		  is_alfanum(false),
		  token_hash(0),
		  nodeid(node_id),
		  xml_node_index(xml_node_index_)
	{ }
	size_t start_pos, end_pos; //[start..end[ in source text
@@ -41,7 +49,11 @@ public:
void plain_tokenizer_phase_1(const char *str, size_t len, TokenizerResult *tr);
void plain_tokenizer_phase_1_downcall(const char *str, size_t len, size_t pos_base, TokenizerResult *tr);
void plain_tokenizer_phase_2(const char *str, size_t len, lang_t lang, const char *country_code, TokenizerResult *tr);
class Xml;
void xml_tokenizer_phase_1(const Xml *xml, TokenizerResult *tr);
void xml_tokenizer_phase_2(const Xml *xml, const char *country_code, TokenizerResult *tr);
#endif
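
The two TokenRange constructors mirror the two token kinds phase 1 now produces: plain-text tokens (is_alfanum meaningful, nodeid 0) and tag tokens (nodeid and xml_node_index set, is_alfanum forced to false). A consumer can branch on nodeid; a hedged sketch, assuming only the fields declared above and the BACKBIT flag used elsewhere in this commit:

	#include "tokenizer.h"
	#include <stdio.h>

	//illustrative only: print each token's kind and position
	static void dump_tokens(const TokenizerResult &tr) {
		for(const auto &t : tr.tokens) {
			if(t.nodeid!=0) {
				//tag token; BACKBIT marks a closing tag such as </title>
				printf("tag  [%zu..%zu) node=%d%s\n",
				       t.start_pos, t.end_pos, (int)t.xml_node_index,
				       (t.nodeid&BACKBIT) ? " (back)" : "");
			} else {
				//text token produced by the plain tokenizer
				printf("%s [%zu..%zu) '%.*s'\n",
				       t.is_alfanum ? "word" : "sep ",
				       t.start_pos, t.end_pos, (int)t.token_len, t.token_start);
			}
		}
	}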

tokenizer/tokenizer3.cpp (new file, 24 lines)

@@ -0,0 +1,24 @@
#include "tokenizer.h"
#include "Xml.h"
void xml_tokenizer_phase_1(const Xml *xml, TokenizerResult *tr) {
if(xml->getNumNodes()==0)
return;
const char *first_pos = xml->getNode(0);
for(int i=0; i<xml->getNumNodes(); i++) {
const char *node = xml->getNode(i);
int node_len = xml->getNodeLen(i);
if(!xml->isTag(i)) {
//Not a tag. Just run a plain tokenizer on the contained text
plain_tokenizer_phase_1_downcall(node,node_len, node-first_pos, tr);
} else {
//tag
nodeid_t node_id = xml->getNodeId(i);
if(xml->isBackTag(i))
node_id |= BACKBIT;
tr->tokens.emplace_back(node-first_pos, node-first_pos+node_len, node,node_len, node_id,i);
}
}
}
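
Because each text node is handed to the plain tokenizer with pos_base = node-first_pos, token positions index into the whole document, not into the individual node. A small worked sketch (hypothetical input; the Xml::set() call mirrors the unit test below):

	#include "tokenizer.h"
	#include "Xml.h"
	#include "TitleRecVersion.h"
	#include "HttpMime.h"
	#include <assert.h>

	int main(void) {
		//the text node "bbb ccc" starts at byte 3 of the document
		static const char html[] = "<p>bbb ccc</p>";
		Xml xml;
		assert(xml.set((char*)html, sizeof(html)-1, TITLEREC_CURRENT_VERSION, CT_HTML));
		TokenizerResult tr;
		xml_tokenizer_phase_1(&xml,&tr);
		//tokens[0] is the <p> tag at [0..3); tokens[1] is "bbb" at [3..6)
		assert(tr.tokens[1].start_pos==3 && tr.tokens[1].end_pos==6);
		return 0;
	}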

tokenizer/xml_tokenizer_unittest.cpp (new file, 91 lines)

@@ -0,0 +1,91 @@
#include "tokenizer.h"
#include "UCMaps.h"
#include "TitleRecVersion.h"
#include "Xml.h"
#include "HttpMime.h"
#include <assert.h>
static bool has_token(const TokenizerResult &tr, const char *s) {
size_t sl = strlen(s);
for(size_t i=0; i<tr.tokens.size(); i++) {
const auto &t = tr.tokens[i];
if(t.token_len==sl && memcmp(t.token_start,s,sl)==0)
return true;
}
return false;
}
int main(void) {
	{
		//empty document must produce no tokens
		static const char html[] = "";
		Xml xml;
		assert(xml.set((char*)html,sizeof(html)-1, TITLEREC_CURRENT_VERSION, CT_HTML));
		TokenizerResult tr;
		xml_tokenizer_phase_1(&xml,&tr);
		assert(tr.tokens.empty());
	}
	{
		static const char html[] = "<html><title>zzz</title><body><h1>aaa</h2><p>bbb ccc</p></body></html>";
		Xml xml;
		assert(xml.set((char*)html,sizeof(html)-1, TITLEREC_CURRENT_VERSION, CT_HTML));
		TokenizerResult tr;
		xml_tokenizer_phase_1(&xml,&tr);
		assert(!tr.tokens.empty());
		assert(has_token(tr,"zzz"));
		assert(has_token(tr,"aaa"));
		assert(has_token(tr,"bbb"));
		assert(has_token(tr," "));
		assert(has_token(tr,"ccc"));
		assert(!has_token(tr,"body"));
		assert(!has_token(tr,"html"));
		assert(!has_token(tr,"title"));
		assert(!has_token(tr,"h1"));
		assert(tr.tokens.size()==15);
		//html
		assert(tr.tokens[0].start_pos==0);
		assert(tr.tokens[0].end_pos==6);
		assert(tr.tokens[0].token_start==html);
		assert(!tr.tokens[0].is_alfanum);
		assert(tr.tokens[0].nodeid==TAG_HTML);
		assert(tr.tokens[0].xml_node_index==0);
		//title
		assert(tr.tokens[1].start_pos==6);
		assert(tr.tokens[1].end_pos==13);
		assert(!tr.tokens[1].is_alfanum);
		assert(tr.tokens[1].nodeid==TAG_TITLE);
		assert(tr.tokens[1].xml_node_index==1);
		//zzz
		assert(tr.tokens[2].start_pos==13);
		assert(tr.tokens[2].end_pos==16);
		assert(tr.tokens[2].is_alfanum);
		assert(tr.tokens[2].nodeid==0);
		//title end
		assert(tr.tokens[3].start_pos==16);
		assert(tr.tokens[3].end_pos==24);
		assert(!tr.tokens[3].is_alfanum);
		assert(tr.tokens[3].nodeid&BACKBIT);
		assert(tr.tokens[3].xml_node_index==3); //</title> is xml node 3
		//body
		assert(tr.tokens[4].start_pos==24);
		assert(tr.tokens[4].end_pos==30);
		assert(!tr.tokens[4].is_alfanum);
		assert(tr.tokens[4].nodeid==TAG_BODY);
		//h1
		assert(tr.tokens[5].start_pos==30);
		assert(tr.tokens[5].end_pos==34);
		assert(!tr.tokens[5].is_alfanum);
		assert(tr.tokens[5].nodeid==TAG_H1);
		//aaa
		assert(tr.tokens[6].start_pos==34);
		assert(tr.tokens[6].end_pos==37);
		assert(tr.tokens[6].token_len==3);
		assert(memcmp(tr.tokens[6].token_start,"aaa",3)==0);
		assert(tr.tokens[6].is_alfanum);
		assert(tr.tokens[6].nodeid==0);
		//good enough
	}
	return 0;
}