privacore-open-source-searc.../XmlNode.h
Ivan Skytte Jørgensen beeddcf35d Got rid of gb-include.h
2018-07-26 17:29:51 +02:00

304 lines
6.4 KiB
C++

#ifndef GB_XMLNODE_H
#define GB_XMLNODE_H
// . an xml node can be text or tag (html or xml tag)
#include "nodeid_t.h"
// . number of xml/html tags we have classified in our g_nodes[] array
int32_t getNumXmlNodes();

// . per-tag-id queries
// . NOTE(review): presumably these report the same "breaking" / "has back
//   tag" properties that XmlNode caches in m_isBreaking / m_hasBackTag --
//   confirm against the definitions
bool isBreakingTagId(nodeid_t tagId);
bool hasBackTag(nodeid_t tagId);

// . look up the tag id for a tag name
// . "s" points to the first char of the tag name
// . "retp" is optional and defaults to nullptr
// . NOTE(review): presumably *retp receives the matching NodeType when
//   retp is non-null -- confirm against the definition
nodeid_t getTagId(const char *s, class NodeType **retp = nullptr);
// . ids for every node kind we know about
// . 0 is a text node, 1 is an xml tag, everything above 1 is a named tag
// . values are spelled out explicitly and are contiguous from 0 to 143
// . LAST_TAG is left implicit so it always ends up one past the final id
enum {
	// non-tag / generic nodes
	TAG_TEXTNODE = 0,
	TAG_XMLTAG = 1,

	// named tags
	TAG_A = 2,
	TAG_ABBREV = 3,
	TAG_ACRONYM = 4,
	TAG_ADDRESS = 5,
	TAG_APPLET = 6,
	TAG_AREA = 7,
	TAG_AU = 8,
	TAG_AUTHOR = 9,
	TAG_B = 10,
	TAG_BANNER = 11,
	TAG_BASE = 12,
	TAG_BASEFONT = 13,
	TAG_BGSOUND = 14,
	TAG_BIG = 15,
	TAG_BLINK = 16,
	TAG_BLOCKQUOTE = 17,
	TAG_BQ = 18,
	TAG_BODY = 19,
	TAG_BR = 20,
	TAG_CAPTION = 21,
	TAG_CENTER = 22,
	TAG_CITE = 23,
	TAG_CODE = 24,
	TAG_COL = 25,
	TAG_COLGROUP = 26,
	TAG_CREDIT = 27,
	TAG_DEL = 28,
	TAG_DFN = 29,
	TAG_DIR = 30,
	TAG_DIV = 31,
	TAG_DL = 32,
	TAG_DT = 33,
	TAG_DD = 34,
	TAG_EM = 35,
	TAG_EMBED = 36,
	TAG_FIG = 37,
	TAG_FN = 38,
	TAG_FONT = 39,
	TAG_FORM = 40,
	TAG_FRAME = 41,
	TAG_FRAMESET = 42,
	TAG_H1 = 43,
	TAG_H2 = 44,
	TAG_H3 = 45,
	TAG_H4 = 46,
	TAG_H5 = 47,
	TAG_H6 = 48,
	TAG_HEAD = 49,
	TAG_HR = 50,
	TAG_HTML = 51,
	TAG_I = 52,
	TAG_IFRAME = 53,
	TAG_IMG = 54,
	TAG_INPUT = 55,
	TAG_INS = 56,
	TAG_ISINDEX = 57,
	TAG_KBD = 58,
	TAG_LANG = 59,
	TAG_LH = 60,
	TAG_LI = 61,
	TAG_LINK = 62,
	TAG_LISTING = 63,
	TAG_MAP = 64,
	TAG_MARQUEE = 65,
	TAG_MATH = 66,
	TAG_MENU = 67,
	TAG_META = 68,
	TAG_MULTICOL = 69,
	TAG_NOBR = 70,
	TAG_NOFRAMES = 71,
	TAG_NOTE = 72,
	TAG_OL = 73,
	TAG_OVERLAY = 74,
	TAG_P = 75,
	TAG_PARAM = 76,
	TAG_PERSON = 77,
	TAG_PLAINTEXT = 78,
	TAG_PRE = 79,
	TAG_Q = 80,
	TAG_RANGE = 81,
	TAG_SAMP = 82,
	TAG_SCRIPT = 83,
	TAG_SELECT = 84,
	TAG_SMALL = 85,
	TAG_SPACER = 86,
	TAG_SPOT = 87,
	TAG_STRIKE = 88,
	TAG_STRONG = 89,
	TAG_SUB = 90,
	TAG_SUP = 91,
	TAG_TAB = 92,
	TAG_TABLE = 93,
	TAG_TBODY = 94,
	TAG_TD = 95,
	TAG_TEXTAREA = 96,
	TAG_TEXTFLOW = 97,
	TAG_TFOOT = 98,
	TAG_TH = 99,
	TAG_THEAD = 100,
	TAG_TITLE = 101,
	TAG_TR = 102,
	TAG_TT = 103,
	TAG_U = 104,
	TAG_UL = 105,
	TAG_VAR = 106,
	TAG_WBR = 107,
	TAG_XMP = 108,
	TAG_COMMENT = 109,
	TAG_OPTION = 110,
	TAG_STYLE = 111,
	TAG_DOCTYPE = 112,
	TAG_XML = 113,
	TAG_START = 114,
	TAG_STOP = 115,
	TAG_SPAN = 116,
	TAG_LEGEND = 117,
	TAG_S = 118,
	TAG_ABBR = 119,
	TAG_CDATA = 120,
	TAG_NOSCRIPT = 121,
	TAG_FIELDSET = 122,

	// feed-related tags
	TAG_FBORIGLINK = 123, // "feedburner:origlink" special feedburner link
	TAG_RDF = 124,        // rdf:RDF
	TAG_RSS = 125,        // rss
	TAG_FEED = 126,       // atom feed tag
	TAG_ITEM = 127,
	TAG_ENTRY = 128,
	TAG_CHANNEL = 129,
	TAG_ENCLOSURE = 130,
	TAG_WEBLOG = 131,

	// a tag we insert in XmlDoc.cpp to indicate expanded frame/iframe src
	TAG_GBFRAME = 132,
	TAG_TC = 133,
	TAG_GBXMLTITLE = 134,

	// facebook xml tags
	TAG_FBSTARTTIME = 135,
	TAG_FBENDTIME = 136,
	TAG_FBNAME = 137,
	TAG_FBPICSQUARE = 138,
	TAG_FBHIDEGUESTLIST = 139,

	// . do not parse this up into words!! it is text in <script> tags
	// . consider it a whole tag i guess
	TAG_SCRIPTTEXT = 140,
	TAG_BUTTON = 141,
	TAG_URLFROM = 142, // for ahrefs.com

	// support sitemap.xml
	TAG_LOC = 143,

	// one past the highest tag id; keep this entry last
	LAST_TAG
};
// . one node of a parsed document: either a run of text or a single tag
//   (html or xml)
// . used by the Xml class, which contains an array of XmlNodes and fills
//   each one in via set()
// . all inline accessors only read member state and are const-qualified so
//   they can be called through a const XmlNode reference or pointer
class XmlNode {
public:
	// true iff this node is text rather than a tag
	bool isText() const {
		return m_nodeId == TAG_TEXTNODE;
	}

	// true iff this node is any kind of tag (id above the text-node id)
	bool isTag() const {
		return m_nodeId > TAG_TEXTNODE;
	}

	// true iff this node's id is above the generic xml-tag id
	bool isHtmlTag() const {
		return m_nodeId > TAG_XMLTAG;
	}

	// true iff this node carries the generic xml-tag id
	bool isXmlTag() const {
		return m_nodeId == TAG_XMLTAG;
	}

	nodeid_t getNodeId() const {
		return m_nodeId;
	}

	// only meaningful iff this node is a tag (see m_hash)
	int64_t getNodeHash() const {
		return m_hash;
	}

	// . start of the node's data: tag data, or text data if not a tag
	// . returns the stored (mutable) pointer; constness of this accessor
	//   applies to the XmlNode itself, not to the bytes pointed at
	char *getNode() const {
		return m_node;
	}

	// m_nodeLen is in bytes
	int32_t getNodeLen() const {
		return m_nodeLen;
	}

	bool isBreaking() const {
		return m_isBreaking;
	}

	bool isVisible() const {
		return m_isVisible;
	}

	bool hasBackTag() const {
		return m_hasBackTag;
	}

	// . a tag whose second byte is not '/'
	// . exclude meta tags and comment tags (they are not front or back)
	bool isFrontTag() const {
		if ( m_nodeId <= TAG_TEXTNODE ) {
			return false;
		}
		if ( m_nodeId == TAG_META || m_nodeId == TAG_COMMENT ) {
			return false;
		}
		return m_node[1] != '/';
	}

	char *getAttrValue(const char *field, int32_t fieldLen, int32_t *valueLen);

	// . get the value of a field like "href" in the <a href="blah"> tag
	char *getFieldValue(const char *fieldName, int32_t *valueLen);

	// . used exclusively by Xml class which contains an array of XmlNodes
	// . "node" points to the beginning of the node, the '<' if it's a tag
	// . sets m_node,m_nodeLen,m_hash,m_isBreaking,m_nodeId
	// . returns the length of the node
	// . pureXml is true if node cannot be an html tag, except comment
	int32_t set(char *node, int maxNodeLen, bool pureXml);

	// . called by set() to get the length of a COMMENT node (and set it)
	int32_t setCommentNode(char *node);
	int32_t setCommentNode2(char *node);

	// . called by set() to get the length of a CDATA node (and set it)
	int32_t setCDATANode(char *node);

	// . called by set() to get nodeId and isBreaking of a tag node
	// . returns the nodeId
	nodeid_t setNodeInfo(int64_t nodeHash);

	// NOTE: member order is part of the object layout; do not reorder

	char *m_node;          // tag data, or text data if not a tag
	int32_t m_nodeLen;     // m_nodeLen is in bytes
	char *m_tagName;       // iff this node is a tag
	int32_t m_tagNameLen;
	int64_t m_hash;        // iff this node is a tag
	int16_t m_depth;       // set by Xml class (xml depth only)
	nodeid_t m_nodeId;     // 0 for text, 1 for xml tag, >1 for html (see isHtmlTag())
	unsigned char m_hasBackTag : 1;
	unsigned char m_isBreaking : 1; // does tag (if it is) line break?
	unsigned char m_isVisible : 1;
	unsigned char m_isSelfLink : 1; // an a href tag link to self?
	int32_t m_pairTagNum;  // paired opening or closing tag
	class XmlNode *m_parent;
};
// . coarse classification of a tag
// . NOTE(review): the HTML_* names appear to mirror the element kinds of
//   the HTML syntax (void, raw text, escapable raw text, foreign, normal)
//   -- confirm against how g_nodes[] populates m_tagType
// . LAST_TAG_TYPE is left implicit so it stays one past the final type
enum NodeTagType {
	TAG_TYPE_UNKNOWN            = 0,
	TAG_TYPE_XML                = 1,
	TAG_TYPE_HTML_VOID          = 2,
	TAG_TYPE_HTML_RAW           = 3,
	TAG_TYPE_HTML_ESCAPABLE_RAW = 4,
	TAG_TYPE_HTML_FOREIGN       = 5,
	TAG_TYPE_HTML_NORMAL        = 6,
	LAST_TAG_TYPE
};
// Structure for describing ALL the available xml/html nodes (tags).
// . Each entry has a name, a has-back-tag flag, a breaking flag (does it
//   break a word?), a visibility flag, two filter-keep flags, the tag's
//   node id and a tag type.
// . NOTE(review): the earlier version of this comment also mentioned a
//   name length and a "format bit"; this class has no such members.
// . NOTE(review): m_isVisible is presumably the "is indexable" flag, false
//   for tags like <script> <option> whose contents are not
//   visible/indexable -- confirm against the g_nodes[] table.
// . Member order matters to anything that initializes these positionally;
//   do not reorder.
class NodeType {
public:
// tag name
const char *m_nodeName;
// whether the tag has a back tag (XmlNode caches this as m_hasBackTag)
bool m_hasBackTag;
// breaking flag (XmlNode caches this as m_isBreaking)
char m_isBreaking;
// visibility flag (XmlNode caches this as m_isVisible)
char m_isVisible;
char m_filterKeep1; // for &strip=1 option
char m_filterKeep2; // for &strip=2 option
// id of this tag
nodeid_t m_nodeId;
// NOTE(review): presumably holds a NodeTagType value stored as a char --
// confirm against the g_nodes[] table
char m_tagType;
};
// . array of NodeType entries for the xml/html tags we have classified
// . declared here, defined elsewhere; getNumXmlNodes() reports how many
//   tags it holds
extern const class NodeType g_nodes[];
#endif // GB_XMLNODE_H