forked from Mirrors/privacore-open-source-search-engine
tokenizer: phase-1 XML tokenizer added + tests
tokenizer/Makefile
@@ -15,6 +15,13 @@ default: run
 tokenizer_unittest: tokenizer.o tokenizer2.o tokenizer_unittest.o
 	g++ -g tokenizer.o tokenizer2.o tokenizer_unittest.o ../unicode/libunicode.a ../utf8_fast.o ../utf8.o ../EGStack.o -o $@
 
+xml_tokenizer_unittest: tokenizer.o tokenizer2.o tokenizer3.o xml_tokenizer_unittest.o
+	g++ -g tokenizer.o tokenizer2.o tokenizer3.o xml_tokenizer_unittest.o ../unicode/libunicode.a ../libgb.a -lm -lpthread -lssl -lcrypto -lz -lpcre -lsqlite3 -ldl -L../ -lcld2_full -lcld3 -lprotobuf -lced -lcares -o $@
+
 .PHONY: run
 run: tokenizer_unittest
 	LD_LIBRARY_PATH=.. ./tokenizer_unittest
+
+.PHONY: xml_run
+xml_run: xml_tokenizer_unittest
+	LD_LIBRARY_PATH=.. ./xml_tokenizer_unittest
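For reference, the new targets mirror the existing plain-tokenizer ones, so the XML test is presumably built and run with:

	make xml_run

The much longer link line is expected: unlike the plain-tokenizer test, this one links ../libgb.a for the Xml parser, which drags in the library's usual dependencies (ssl, pcre, the cld language detectors, etc.).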
tokenizer/tokenizer.cpp
@@ -14,6 +14,10 @@ static bool is_word_script(Unicode::script_t s);
 // common: used in multiple scripts, e.g. digits 0-9, but also e.g. the Thai currency symbol
 // inherit: has the script of the preceding character. This is normally a decomposed diacritic or combining mark
 void plain_tokenizer_phase_1(const char *str, size_t len, TokenizerResult *tr) {
+	plain_tokenizer_phase_1_downcall(str,len,0,tr);
+}
+
+void plain_tokenizer_phase_1_downcall(const char *str, size_t len, size_t pos_base, TokenizerResult *tr) {
 	for(size_t i = 0; i<len; ) {
 		UChar32 c = utf8Decode(str+i);
 		bool in_alnum_token = ucIsWordChar_fast(c);
@@ -45,7 +49,7 @@ void plain_tokenizer_phase_1(const char *str, size_t len, TokenizerResult *tr) {
 			j += getUtf8CharSize(str+j);
 		}
 		//found token [i..j)
-		tr->tokens.emplace_back(i,j,str+i,j-i,in_alnum_token);
+		tr->tokens.emplace_back(pos_base+i,pos_base+j, str+i,j-i, in_alnum_token);
 		i = j;
 	}
 }
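The new pos_base parameter is what the XML tokenizer below relies on: the scanner itself still works fragment-relative, but every emitted token has pos_base added to its positions, so start_pos/end_pos come out document-absolute. A minimal sketch of the effect, with values borrowed from the unittest further down:

	// "zzz" is the text node starting at byte 13 of the test document,
	// so the XML tokenizer passes pos_base=13 when tokenizing it.
	TokenizerResult tr;
	plain_tokenizer_phase_1_downcall("zzz", 3, 13, &tr);
	// tr.tokens[0].start_pos==13 and end_pos==16 (document-absolute),
	// while token_start still points into the fragment itself.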
tokenizer/tokenizer.h
@@ -9,12 +9,20 @@
 
 struct TokenRange {
 	TokenRange(size_t start_pos_, size_t end_pos_, const char *token_start_, size_t token_len_, bool is_alfanum_)
 		: start_pos(start_pos_), end_pos(end_pos_),
 		  token_start(token_start_), token_len(token_len_),
 		  is_alfanum(is_alfanum_),
 		  token_hash(0),
 		  nodeid(0),
 		  xml_node_index(0)
 		{ }
+	TokenRange(size_t start_pos_, size_t end_pos_, const char *token_start_, size_t token_len_, nodeid_t node_id, int32_t xml_node_index_)
+		: start_pos(start_pos_), end_pos(end_pos_),
+		  token_start(token_start_), token_len(token_len_),
+		  is_alfanum(false),
+		  token_hash(0),
+		  nodeid(node_id),
+		  xml_node_index(xml_node_index_)
+		{ }
 
 	size_t start_pos, end_pos; //[start..end[ in source text
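The two constructors correspond to the two kinds of token phase 1 can now emit: the original one for text (alphanumeric or not), and the new one for markup, which hardwires is_alfanum to false and records the tag's nodeid together with the index of the XML node it came from. Roughly, using values from the unittest (text_ptr/tag_ptr stand in for pointers into the document):

	// text token "zzz" at document bytes [13..16)
	TokenRange word(13, 16, text_ptr, 3, true);
	// closing tag </title> at [16..24), XML node #3, BACKBIT marking it as a back tag
	TokenRange tag(16, 24, tag_ptr, 8, TAG_TITLE | BACKBIT, 3);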
@@ -41,7 +49,11 @@ public:
 
 
 void plain_tokenizer_phase_1(const char *str, size_t len, TokenizerResult *tr);
+void plain_tokenizer_phase_1_downcall(const char *str, size_t len, size_t pos_base, TokenizerResult *tr);
 void plain_tokenizer_phase_2(const char *str, size_t len, lang_t lang, const char *country_code, TokenizerResult *tr);
+class Xml;
+void xml_tokenizer_phase_1(const Xml *xml, TokenizerResult *tr);
+void xml_tokenizer_phase_2(const Xml *xml, const char *country_code, TokenizerResult *tr);
 
 
 #endif
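Taken together, the declarations sketch the intended pipeline: a phase-1 pass segments raw text or XML into token ranges, and a phase-2 pass refines them with language/country hints. Only the XML phase 1 is implemented in this commit (tokenizer3.cpp below), so a caller would presumably look like:

	Xml xml;
	// ... xml.set(...) as in the unittest ...
	TokenizerResult tr;
	xml_tokenizer_phase_1(&xml, &tr);
	// and later, once phase 2 exists for XML:
	// xml_tokenizer_phase_2(&xml, country_code, &tr);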
tokenizer/tokenizer3.cpp (new file, 24 lines)
@@ -0,0 +1,24 @@
+#include "tokenizer.h"
+#include "Xml.h"
+
+void xml_tokenizer_phase_1(const Xml *xml, TokenizerResult *tr) {
+	if(xml->getNumNodes()==0)
+		return;
+	const char *first_pos = xml->getNode(0);
+
+	for(int i=0; i<xml->getNumNodes(); i++) {
+		const char *node = xml->getNode(i);
+		int node_len = xml->getNodeLen(i);
+
+		if(!xml->isTag(i)) {
+			//Not a tag. Just run a plain tokenizer on the contained text
+			plain_tokenizer_phase_1_downcall(node,node_len, node-first_pos, tr);
+		} else {
+			//tag
+			nodeid_t node_id = xml->getNodeId(i);
+			if(xml->isBackTag(i))
+				node_id |= BACKBIT;
+			tr->tokens.emplace_back(node-first_pos, node-first_pos+node_len, node,node_len, node_id,i);
+		}
+	}
+}
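To make the offset arithmetic concrete: node-first_pos anchors all positions to the first node, text nodes are delegated to the plain tokenizer with that offset as pos_base, and each tag becomes a single token. For the unittest document below, the stream therefore starts (exactly as the asserts check):

	// "<html><title>zzz</title>..." tokenizes as:
	//   [ 0, 6)  "<html>"    nodeid TAG_HTML,  xml_node_index 0
	//   [ 6,13)  "<title>"   nodeid TAG_TITLE, xml_node_index 1
	//   [13,16)  "zzz"       is_alfanum, via plain_tokenizer_phase_1_downcall
	//   [16,24)  "</title>"  nodeid TAG_TITLE|BACKBIT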
tokenizer/xml_tokenizer_unittest.cpp (new file, 91 lines)
@@ -0,0 +1,91 @@
+#include "tokenizer.h"
+#include "UCMaps.h"
+#include "TitleRecVersion.h"
+#include "Xml.h"
+#include "HttpMime.h"
+#include <assert.h>
+
+
+static bool has_token(const TokenizerResult &tr, const char *s) {
+	size_t sl = strlen(s);
+	for(size_t i=0; i<tr.tokens.size(); i++) {
+		const auto &t = tr.tokens[i];
+		if(t.token_len==sl && memcmp(t.token_start,s,sl)==0)
+			return true;
+	}
+	return false;
+}
+
+
+int main(void) {
+	{
+		static const char html[] = "";
+		Xml xml;
+		assert(xml.set((char*)html,sizeof(html)-1, TITLEREC_CURRENT_VERSION, CT_HTML));
+		TokenizerResult tr;
+		xml_tokenizer_phase_1(&xml,&tr);
+		assert(tr.tokens.empty());
+	}
+	{
+		static const char html[] = "<html><title>zzz</title><body><h1>aaa</h2><p>bbb ccc</p></body></html>";
+		Xml xml;
+		assert(xml.set((char*)html,sizeof(html)-1, TITLEREC_CURRENT_VERSION, CT_HTML));
+		TokenizerResult tr;
+		xml_tokenizer_phase_1(&xml,&tr);
+		assert(!tr.tokens.empty());
+		assert(has_token(tr,"zzz"));
+		assert(has_token(tr,"aaa"));
+		assert(has_token(tr,"bbb"));
+		assert(has_token(tr," "));
+		assert(has_token(tr,"ccc"));
+		assert(!has_token(tr,"body"));
+		assert(!has_token(tr,"html"));
+		assert(!has_token(tr,"title"));
+		assert(!has_token(tr,"h1"));
+
+		assert(tr.tokens.size()==15);
+		//html
+		assert(tr.tokens[0].start_pos==0);
+		assert(tr.tokens[0].end_pos==6);
+		assert(tr.tokens[0].token_start==html);
+		assert(!tr.tokens[0].is_alfanum);
+		assert(tr.tokens[0].nodeid==TAG_HTML);
+		assert(tr.tokens[0].xml_node_index==0);
+		//title
+		assert(tr.tokens[1].start_pos==6);
+		assert(tr.tokens[1].end_pos==13);
+		assert(!tr.tokens[1].is_alfanum);
+		assert(tr.tokens[1].nodeid==TAG_TITLE);
+		assert(tr.tokens[1].xml_node_index==1);
+		//zzz
+		assert(tr.tokens[2].start_pos==13);
+		assert(tr.tokens[2].end_pos==16);
+		assert(tr.tokens[2].is_alfanum);
+		assert(tr.tokens[2].nodeid==0);
+		//title end
+		assert(tr.tokens[3].start_pos==16);
+		assert(tr.tokens[3].end_pos==24);
+		assert(!tr.tokens[3].is_alfanum);
+		assert(tr.tokens[3].nodeid&BACKBIT);
+		assert(tr.tokens[3].xml_node_index==3);
+		//body
+		assert(tr.tokens[4].start_pos==24);
+		assert(tr.tokens[4].end_pos==30);
+		assert(!tr.tokens[4].is_alfanum);
+		assert(tr.tokens[4].nodeid==TAG_BODY);
+		//h1
+		assert(tr.tokens[5].start_pos==30);
+		assert(tr.tokens[5].end_pos==34);
+		assert(!tr.tokens[5].is_alfanum);
+		assert(tr.tokens[5].nodeid==TAG_H1);
+		//aaa
+		assert(tr.tokens[6].start_pos==34);
+		assert(tr.tokens[6].end_pos==37);
+		assert(tr.tokens[6].token_len==3);
+		assert(memcmp(tr.tokens[6].token_start,"aaa",3)==0);
+		assert(tr.tokens[6].is_alfanum);
+		assert(tr.tokens[6].nodeid==0);
+
+		//good enough
+	}
+}
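Worth noting about the test document: it closes <h1> with </h2>, deliberately it seems, and phase 1 does not object. That matches the implementation in tokenizer3.cpp, which records each tag exactly as it appears (setting BACKBIT on back tags) without validating nesting; tag pairing is left to the Xml parser and to later phases.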