privacore-open-source-searc.../tokenizer/tokenizer3.cpp
2018-03-09 16:24:39 +01:00

50 lines
1.4 KiB
C++

#include "tokenizer.h"
#include "Xml.h"
void xml_tokenizer_phase_1(const Xml *xml, TokenizerResult *tr) {
if(xml->getNumNodes()==0)
return;
const char *first_pos = xml->getNode(0);
for(int i=0; i<xml->getNumNodes(); i++) {
const char *node = xml->getNode(i);
int node_len = xml->getNodeLen(i);
if(!xml->isTag(i)) {
//Not a tag. Just run a plain tokenizer on the contained text
plain_tokenizer_phase_1_downcall(node,node_len, node-first_pos, tr);
} else {
//tag
nodeid_t node_id = xml->getNodeId(i);
if(xml->isBackTag(i))
node_id |= BACKBIT;
tr->tokens.emplace_back(node-first_pos, node-first_pos+node_len, node,node_len, node_id,i);
}
}
}
void xml_tokenizer_phase_1_subset(const Xml *xml, unsigned start_node, unsigned end_node, TokenizerResult *tr) {
if(end_node<=start_node)
return;
if(xml->getNumNodes()==0)
return;
const char *first_pos = xml->getNode(0);
for(unsigned i=start_node; i<(unsigned)xml->getNumNodes() && i<end_node; i++) {
const char *node = xml->getNode(i);
int node_len = xml->getNodeLen(i);
if(!xml->isTag(i)) {
//Not a tag. Just run a plain tokenizer on the contained text
plain_tokenizer_phase_1_downcall(node,node_len, node-first_pos, tr);
} else {
//tag
nodeid_t node_id = xml->getNodeId(i);
if(xml->isBackTag(i))
node_id |= BACKBIT;
tr->tokens.emplace_back(node-first_pos, node-first_pos+node_len, node,node_len, node_id,i);
}
}
}