#include "tokenizer.h"
#include "Xml.h"


// Shared worker for both public entry points: tokenizes the XML nodes with
// indexes in [start_node, end_node).
//
// For every node in the range:
//   - a non-tag node is handed to plain_tokenizer_phase_1_downcall() together
//     with its byte offset from the first node of the document;
//   - a tag node is appended to tr->tokens as a single token. The arguments
//     passed are (start offset, end offset, node pointer, node length,
//     node id, node index); the node id gets BACKBIT OR'ed in when
//     Xml::isBackTag() reports true for the node.
//
// Offsets are always measured from xml->getNode(0), even when the range does
// not start at node 0, so positions are relative to the start of the document
// and not to the start of the subset.
//
// Preconditions (ensured by the callers below): the document has at least one
// node and end_node does not exceed the node count.
static void xml_tokenize_node_range(const Xml *xml, unsigned start_node, unsigned end_node, TokenizerResult *tr) {
	const char *first_pos = xml->getNode(0);

	for(unsigned i=start_node; i<end_node; i++) {
		const char *node = xml->getNode(i);
		int node_len = xml->getNodeLen(i);

		if(!xml->isTag(i)) {
			//Not a tag. Just run a plain tokenizer on the contained text
			plain_tokenizer_phase_1_downcall(node, node_len, node-first_pos, tr);
			continue;
		}

		//Tag. Emit it as one token, flagging back tags in the node id.
		nodeid_t node_id = xml->getNodeId(i);
		if(xml->isBackTag(i))
			node_id |= BACKBIT;
		tr->tokens.emplace_back(node-first_pos, node-first_pos+node_len, node, node_len, node_id, i);
	}
}


// Runs phase 1 of tokenization over every node of the XML document and
// appends the resulting tokens to tr.
//
// Does nothing when the document has no nodes.
void xml_tokenizer_phase_1(const Xml *xml, TokenizerResult *tr) {
	const int num_nodes = xml->getNumNodes();
	if(num_nodes<=0)
		return; //nothing to tokenize

	xml_tokenize_node_range(xml, 0, static_cast<unsigned>(num_nodes), tr);
}


// Runs phase 1 of tokenization over the nodes with indexes in
// [start_node, end_node) and appends the resulting tokens to tr.
//
// end_node is clamped to the number of nodes in the document. Does nothing
// when the range is empty or the document has no nodes. Token positions are
// still relative to the first node of the whole document.
void xml_tokenizer_phase_1_subset(const Xml *xml, unsigned start_node, unsigned end_node, TokenizerResult *tr) {
	if(end_node<=start_node)
		return; //empty range

	const int num_nodes = xml->getNumNodes();
	if(num_nodes<=0)
		return; //nothing to tokenize

	//Never walk past the last node, whatever the caller asked for.
	const unsigned node_count = static_cast<unsigned>(num_nodes);
	const unsigned last = end_node<node_count ? end_node : node_count;

	xml_tokenize_node_range(xml, start_node, last, tr);
}