// mirror of https://github.com/privacore/open-source-search-engine.git
// synced 2025-02-02 03:38:43 -05:00
// 304 lines, 6.4 KiB, C++
#ifndef GB_XMLNODE_H
#define GB_XMLNODE_H

// . an xml node can be text or tag (html or xml tag)

#include "nodeid_t.h"

// . get how many xml/html tags we have classified in our g_nodes[] array
|
|
int32_t getNumXmlNodes ( ) ;
|
|
bool isBreakingTagId ( nodeid_t tagId ) ;
|
|
bool hasBackTag ( nodeid_t tagId ) ;
|
|
|
|
// s points to tag name - first char
|
|
nodeid_t getTagId ( const char *s , class NodeType **retp = nullptr );
|
|
|
|
// . ids for every node kind we know about
// . 0 is a text node, 1 is a generic xml tag, everything above 1 is a
//   specific named tag
// . the values are spelled out explicitly and run contiguously from 0;
//   LAST_TAG is left implicit so it is always one past the highest id
enum {
	TAG_TEXTNODE        = 0,
	TAG_XMLTAG          = 1,
	TAG_A               = 2,
	TAG_ABBREV          = 3,
	TAG_ACRONYM         = 4,
	TAG_ADDRESS         = 5,
	TAG_APPLET          = 6,
	TAG_AREA            = 7,
	TAG_AU              = 8,
	TAG_AUTHOR          = 9,
	TAG_B               = 10,
	TAG_BANNER          = 11,
	TAG_BASE            = 12,
	TAG_BASEFONT        = 13,
	TAG_BGSOUND         = 14,
	TAG_BIG             = 15,
	TAG_BLINK           = 16,
	TAG_BLOCKQUOTE      = 17,
	TAG_BQ              = 18,
	TAG_BODY            = 19,
	TAG_BR              = 20,
	TAG_CAPTION         = 21,
	TAG_CENTER          = 22,
	TAG_CITE            = 23,
	TAG_CODE            = 24,
	TAG_COL             = 25,
	TAG_COLGROUP        = 26,
	TAG_CREDIT          = 27,
	TAG_DEL             = 28,
	TAG_DFN             = 29,
	TAG_DIR             = 30,
	TAG_DIV             = 31,
	TAG_DL              = 32,
	TAG_DT              = 33,
	TAG_DD              = 34,
	TAG_EM              = 35,
	TAG_EMBED           = 36,
	TAG_FIG             = 37,
	TAG_FN              = 38,
	TAG_FONT            = 39,
	TAG_FORM            = 40,
	TAG_FRAME           = 41,
	TAG_FRAMESET        = 42,
	TAG_H1              = 43,
	TAG_H2              = 44,
	TAG_H3              = 45,
	TAG_H4              = 46,
	TAG_H5              = 47,
	TAG_H6              = 48,
	TAG_HEAD            = 49,
	TAG_HR              = 50,
	TAG_HTML            = 51,
	TAG_I               = 52,
	TAG_IFRAME          = 53,
	TAG_IMG             = 54,
	TAG_INPUT           = 55,
	TAG_INS             = 56,
	TAG_ISINDEX         = 57,
	TAG_KBD             = 58,
	TAG_LANG            = 59,
	TAG_LH              = 60,
	TAG_LI              = 61,
	TAG_LINK            = 62,
	TAG_LISTING         = 63,
	TAG_MAP             = 64,
	TAG_MARQUEE         = 65,
	TAG_MATH            = 66,
	TAG_MENU            = 67,
	TAG_META            = 68,
	TAG_MULTICOL        = 69,
	TAG_NOBR            = 70,
	TAG_NOFRAMES        = 71,
	TAG_NOTE            = 72,
	TAG_OL              = 73,
	TAG_OVERLAY         = 74,
	TAG_P               = 75,
	TAG_PARAM           = 76,
	TAG_PERSON          = 77,
	TAG_PLAINTEXT       = 78,
	TAG_PRE             = 79,
	TAG_Q               = 80,
	TAG_RANGE           = 81,
	TAG_SAMP            = 82,
	TAG_SCRIPT          = 83,
	TAG_SELECT          = 84,
	TAG_SMALL           = 85,
	TAG_SPACER          = 86,
	TAG_SPOT            = 87,
	TAG_STRIKE          = 88,
	TAG_STRONG          = 89,
	TAG_SUB             = 90,
	TAG_SUP             = 91,
	TAG_TAB             = 92,
	TAG_TABLE           = 93,
	TAG_TBODY           = 94,

	TAG_TD              = 95,
	TAG_TEXTAREA        = 96,
	TAG_TEXTFLOW        = 97,
	TAG_TFOOT           = 98,
	TAG_TH              = 99,
	TAG_THEAD           = 100,
	TAG_TITLE           = 101,
	TAG_TR              = 102,
	TAG_TT              = 103,

	TAG_U               = 104,
	TAG_UL              = 105,
	TAG_VAR             = 106,
	TAG_WBR             = 107,
	TAG_XMP             = 108,
	TAG_COMMENT         = 109,

	TAG_OPTION          = 110,
	TAG_STYLE           = 111,
	TAG_DOCTYPE         = 112,
	TAG_XML             = 113,
	TAG_START           = 114,
	TAG_STOP            = 115,
	TAG_SPAN            = 116,
	TAG_LEGEND          = 117,
	TAG_S               = 118,

	TAG_ABBR            = 119,
	TAG_CDATA           = 120,
	TAG_NOSCRIPT        = 121,
	TAG_FIELDSET        = 122,
	TAG_FBORIGLINK      = 123, // "feedburner:origlink" special feedburner link
	TAG_RDF             = 124, // rdf:RDF
	TAG_RSS             = 125, // rss
	TAG_FEED            = 126, // atom feed tag

	TAG_ITEM            = 127,
	TAG_ENTRY           = 128,
	TAG_CHANNEL         = 129,
	TAG_ENCLOSURE       = 130,
	TAG_WEBLOG          = 131,
	// a tag we insert in XmlDoc.cpp to indicate expanded frame/iframe src
	TAG_GBFRAME         = 132,
	TAG_TC              = 133,
	TAG_GBXMLTITLE      = 134,

	// facebook xml tags
	TAG_FBSTARTTIME     = 135,
	TAG_FBENDTIME       = 136,
	TAG_FBNAME          = 137,
	TAG_FBPICSQUARE     = 138,
	TAG_FBHIDEGUESTLIST = 139,

	// . do not parse this up into words!! it is text in <script> tags
	// . consider it a whole tag i guess
	TAG_SCRIPTTEXT      = 140,
	TAG_BUTTON          = 141,
	TAG_URLFROM         = 142, // for ahrefs.com

	// support sitemap.xml
	TAG_LOC             = 143,

	// one past the highest tag id
	LAST_TAG
};

class XmlNode {
|
|
public:
|
|
bool isText() {
|
|
return m_nodeId == TAG_TEXTNODE;
|
|
}
|
|
|
|
bool isTag() {
|
|
return m_nodeId > 0;
|
|
}
|
|
|
|
bool isHtmlTag() {
|
|
return m_nodeId > 1;
|
|
}
|
|
|
|
bool isXmlTag() {
|
|
return m_nodeId == TAG_XMLTAG;
|
|
}
|
|
|
|
nodeid_t getNodeId() {
|
|
return m_nodeId;
|
|
}
|
|
|
|
int64_t getNodeHash() {
|
|
return m_hash;
|
|
}
|
|
|
|
char *getNode() {
|
|
return m_node;
|
|
}
|
|
|
|
// m_nodeLen is in bytes
|
|
int32_t getNodeLen() {
|
|
return m_nodeLen;
|
|
}
|
|
|
|
bool isBreaking() {
|
|
return m_isBreaking;
|
|
}
|
|
|
|
bool isVisible() {
|
|
return m_isVisible;
|
|
}
|
|
|
|
bool hasBackTag() {
|
|
return m_hasBackTag;
|
|
}
|
|
|
|
// exclude meta tags and comment tags (they are not front or back)
|
|
bool isFrontTag() {
|
|
return m_nodeId > 0 && m_node[1] != '/' && m_nodeId != TAG_META && m_nodeId != TAG_COMMENT;
|
|
}
|
|
|
|
char* getAttrValue(const char *field, int32_t fieldLen, int32_t *valueLen );
|
|
|
|
// . get the value of a field like "href" in the <a href="blah"> tag
|
|
char *getFieldValue ( const char *fieldName , int32_t *valueLen );
|
|
|
|
// . used exclusively by Xml class which contains an array of XmlNodes
|
|
// . "node" points to the beginning of the node, the '<' if it's a tag
|
|
// . sets m_node,m_nodeLen,m_hash,m_isBreaking,m_nodeId
|
|
// . returns the length of the node
|
|
// . pureXml is true if node cannot be an html tag, except comment
|
|
//int32_t set ( char *node , bool pureXml );
|
|
int32_t set(char *node, int maxNodeLen, bool pureXml);
|
|
|
|
// . called by set() to get the length of a COMMENT node (and set it)
|
|
int32_t setCommentNode ( char *node );
|
|
|
|
int32_t setCommentNode2 ( char *node );
|
|
|
|
// . called by set() to get the length of a CDATA node (and set it)
|
|
int32_t setCDATANode ( char *node );
|
|
|
|
// . called by set() to get nodeId and isBreaking of a tag node
|
|
// . returns the nodeId
|
|
nodeid_t setNodeInfo ( int64_t nodeHash );
|
|
|
|
char *m_node; // tag data, or text data if not a tag
|
|
int32_t m_nodeLen; // m_nodeLen is in bytes
|
|
char *m_tagName; // iff this node is a tag
|
|
int32_t m_tagNameLen;
|
|
int64_t m_hash; // iff this node is a tag
|
|
int16_t m_depth; // set by Xml class (xml depth only)
|
|
nodeid_t m_nodeId; // 0 for text,1 for xml tag, 1+ for html
|
|
unsigned char m_hasBackTag : 1;
|
|
unsigned char m_isBreaking : 1; // does tag (if it is) line break?
|
|
unsigned char m_isVisible : 1;
|
|
unsigned char m_isSelfLink : 1; // an a href tag link to self?
|
|
int32_t m_pairTagNum; // paired opening or closing tag
|
|
class XmlNode *m_parent;
|
|
};
|
|
|
|
// . classification of a tag
// . values are assigned sequentially from 0; LAST_TAG_TYPE is one past the
//   highest real type
// . NOTE(review): the HTML_* names look like the HTML standard's element
//   kinds (void, raw text, escapable raw text, foreign, normal) -- confirm
enum NodeTagType {
	TAG_TYPE_UNKNOWN = 0,
	TAG_TYPE_XML,
	TAG_TYPE_HTML_VOID,
	TAG_TYPE_HTML_RAW,
	TAG_TYPE_HTML_ESCAPABLE_RAW,
	TAG_TYPE_HTML_FOREIGN,
	TAG_TYPE_HTML_NORMAL,
	LAST_TAG_TYPE
};

// Now set up a structure for describing ALL the available HTML nodes.
|
|
// . Each HTML node has a name, name length, does it break a word?
|
|
// a format bit. (most HTML tags have 0 for their format bit
|
|
// because we really don't care about what they do -- we use format
|
|
// bits for extracting title, summaries, et al.
|
|
// . the is indexable is false for tags like <script> <option> whose contents
|
|
// are not visible/indexable
|
|
class NodeType {
|
|
public:
|
|
const char *m_nodeName;
|
|
bool m_hasBackTag;
|
|
char m_isBreaking;
|
|
char m_isVisible;
|
|
char m_filterKeep1; // for &strip=1 option
|
|
char m_filterKeep2; // for &strip=2 option
|
|
nodeid_t m_nodeId;
|
|
char m_tagType;
|
|
};
|
|
|
|
// . table of the tags we have classified; defined outside this header
// . getNumXmlNodes() reports how many entries it holds
extern const class NodeType g_nodes[];

#endif // GB_XMLNODE_H