mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-02-02 03:38:43 -05:00
50 lines
1.4 KiB
C++
50 lines
1.4 KiB
C++
#include "tokenizer.h"
|
|
#include "Xml.h"
|
|
|
|
void xml_tokenizer_phase_1(const Xml *xml, TokenizerResult *tr) {
|
|
if(xml->getNumNodes()==0)
|
|
return;
|
|
const char *first_pos = xml->getNode(0);
|
|
|
|
for(int i=0; i<xml->getNumNodes(); i++) {
|
|
const char *node = xml->getNode(i);
|
|
int node_len = xml->getNodeLen(i);
|
|
|
|
if(!xml->isTag(i)) {
|
|
//Not a tag. Just run a plain tokenizer on the contained text
|
|
plain_tokenizer_phase_1_downcall(node,node_len, node-first_pos, tr);
|
|
} else {
|
|
//tag
|
|
nodeid_t node_id = xml->getNodeId(i);
|
|
if(xml->isBackTag(i))
|
|
node_id |= BACKBIT;
|
|
tr->tokens.emplace_back(node-first_pos, node-first_pos+node_len, node,node_len, node_id,i);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void xml_tokenizer_phase_1_subset(const Xml *xml, unsigned start_node, unsigned end_node, TokenizerResult *tr) {
|
|
if(end_node<=start_node)
|
|
return;
|
|
if(xml->getNumNodes()==0)
|
|
return;
|
|
const char *first_pos = xml->getNode(0);
|
|
|
|
for(unsigned i=start_node; i<(unsigned)xml->getNumNodes() && i<end_node; i++) {
|
|
const char *node = xml->getNode(i);
|
|
int node_len = xml->getNodeLen(i);
|
|
|
|
if(!xml->isTag(i)) {
|
|
//Not a tag. Just run a plain tokenizer on the contained text
|
|
plain_tokenizer_phase_1_downcall(node,node_len, node-first_pos, tr);
|
|
} else {
|
|
//tag
|
|
nodeid_t node_id = xml->getNodeId(i);
|
|
if(xml->isBackTag(i))
|
|
node_id |= BACKBIT;
|
|
tr->tokens.emplace_back(node-first_pos, node-first_pos+node_len, node,node_len, node_id,i);
|
|
}
|
|
}
|
|
}
|