#ifndef GB_XMLNODE_H #define GB_XMLNODE_H // . an xml node can be text or tag (html or xml tag) #include "nodeid_t.h" // . get how many xml/html tags we have classified in our g_nodes[] array int32_t getNumXmlNodes ( ) ; bool isBreakingTagId ( nodeid_t tagId ) ; bool hasBackTag ( nodeid_t tagId ) ; // s points to tag name - first char nodeid_t getTagId ( const char *s , class NodeType **retp = nullptr ); enum { TAG_TEXTNODE = 0, TAG_XMLTAG = 1, TAG_A = 2, TAG_ABBREV = 3, TAG_ACRONYM = 4, TAG_ADDRESS = 5, TAG_APPLET = 6, TAG_AREA = 7, TAG_AU = 8, TAG_AUTHOR = 9, TAG_B = 10, TAG_BANNER = 11, TAG_BASE = 12, TAG_BASEFONT = 13, TAG_BGSOUND = 14, TAG_BIG = 15, TAG_BLINK = 16, TAG_BLOCKQUOTE = 17, TAG_BQ = 18, TAG_BODY = 19, TAG_BR = 20, TAG_CAPTION = 21, TAG_CENTER = 22, TAG_CITE = 23, TAG_CODE = 24, TAG_COL = 25, TAG_COLGROUP = 26, TAG_CREDIT = 27, TAG_DEL = 28, TAG_DFN = 29, TAG_DIR = 30, TAG_DIV = 31, TAG_DL = 32, TAG_DT = 33, TAG_DD = 34, TAG_EM = 35, TAG_EMBED = 36, TAG_FIG = 37, TAG_FN = 38, TAG_FONT = 39, TAG_FORM = 40, TAG_FRAME = 41, TAG_FRAMESET = 42, TAG_H1 = 43, TAG_H2 = 44, TAG_H3 = 45, TAG_H4 = 46, TAG_H5 = 47, TAG_H6 = 48, TAG_HEAD = 49, TAG_HR = 50, TAG_HTML = 51, TAG_I = 52, TAG_IFRAME = 53, TAG_IMG = 54, TAG_INPUT = 55, TAG_INS = 56, TAG_ISINDEX = 57, TAG_KBD = 58, TAG_LANG = 59, TAG_LH = 60, TAG_LI = 61, TAG_LINK = 62, TAG_LISTING = 63, TAG_MAP = 64, TAG_MARQUEE = 65, TAG_MATH = 66, TAG_MENU = 67, TAG_META = 68, TAG_MULTICOL = 69, TAG_NOBR = 70, TAG_NOFRAMES = 71, TAG_NOTE = 72, TAG_OL = 73, TAG_OVERLAY = 74, TAG_P = 75, TAG_PARAM = 76, TAG_PERSON = 77, TAG_PLAINTEXT = 78, TAG_PRE = 79, TAG_Q = 80, TAG_RANGE = 81, TAG_SAMP = 82, TAG_SCRIPT = 83, TAG_SELECT = 84, TAG_SMALL = 85, TAG_SPACER = 86, TAG_SPOT = 87, TAG_STRIKE = 88, TAG_STRONG = 89, TAG_SUB = 90, TAG_SUP = 91, TAG_TAB = 92, TAG_TABLE = 93, TAG_TBODY = 94, TAG_TD = 95, TAG_TEXTAREA = 96, TAG_TEXTFLOW = 97, TAG_TFOOT = 98, TAG_TH = 99, TAG_THEAD = 100, TAG_TITLE = 101, TAG_TR = 102, TAG_TT = 103, TAG_U = 104, TAG_UL = 105, TAG_VAR = 106, TAG_WBR = 107, TAG_XMP = 108, TAG_COMMENT = 109, TAG_OPTION = 110, TAG_STYLE = 111, TAG_DOCTYPE = 112, TAG_XML = 113, TAG_START = 114, TAG_STOP = 115, TAG_SPAN = 116, TAG_LEGEND = 117, TAG_S = 118, TAG_ABBR = 119, TAG_CDATA = 120, TAG_NOSCRIPT = 121, TAG_FIELDSET = 122, TAG_FBORIGLINK = 123, // "feedburner:origlink" special feedburner link TAG_RDF = 124, // rdf:RDF TAG_RSS = 125, // rss TAG_FEED = 126, // atom feed tag TAG_ITEM = 127, TAG_ENTRY = 128, TAG_CHANNEL = 129, TAG_ENCLOSURE = 130, TAG_WEBLOG = 131, // a tag we insert in XmlDoc.cpp to indicate expanded frame/iframe src TAG_GBFRAME = 132, TAG_TC = 133, TAG_GBXMLTITLE = 134, // facebook xml tags TAG_FBSTARTTIME = 135, TAG_FBENDTIME = 136, TAG_FBNAME = 137, TAG_FBPICSQUARE = 138, TAG_FBHIDEGUESTLIST = 139, // . do not parse this up into words!! it is text in <script> tags // . consider it a whole tag i guess TAG_SCRIPTTEXT = 140, TAG_BUTTON = 141, TAG_URLFROM = 142, // for ahrefs.com // support sitemap.xml TAG_LOC = 143, LAST_TAG }; class XmlNode { public: bool isText() { return m_nodeId == TAG_TEXTNODE; } bool isTag() { return m_nodeId > 0; } bool isHtmlTag() { return m_nodeId > 1; } bool isXmlTag() { return m_nodeId == TAG_XMLTAG; } nodeid_t getNodeId() { return m_nodeId; } int64_t getNodeHash() { return m_hash; } char *getNode() { return m_node; } // m_nodeLen is in bytes int32_t getNodeLen() { return m_nodeLen; } bool isBreaking() { return m_isBreaking; } bool isVisible() { return m_isVisible; } bool hasBackTag() { return m_hasBackTag; } // exclude meta tags and comment tags (they are not front or back) bool isFrontTag() { return m_nodeId > 0 && m_node[1] != '/' && m_nodeId != TAG_META && m_nodeId != TAG_COMMENT; } char* getAttrValue(const char *field, int32_t fieldLen, int32_t *valueLen ); // . get the value of a field like "href" in the <a href="blah"> tag char *getFieldValue ( const char *fieldName , int32_t *valueLen ); // . used exclusively by Xml class which contains an array of XmlNodes // . "node" points to the beginning of the node, the '<' if it's a tag // . sets m_node,m_nodeLen,m_hash,m_isBreaking,m_nodeId // . returns the length of the node // . pureXml is true if node cannot be an html tag, except comment //int32_t set ( char *node , bool pureXml ); int32_t set(char *node, int maxNodeLen, bool pureXml); // . called by set() to get the length of a COMMENT node (and set it) int32_t setCommentNode ( char *node ); int32_t setCommentNode2 ( char *node ); // . called by set() to get the length of a CDATA node (and set it) int32_t setCDATANode ( char *node ); // . called by set() to get nodeId and isBreaking of a tag node // . returns the nodeId nodeid_t setNodeInfo ( int64_t nodeHash ); char *m_node; // tag data, or text data if not a tag int32_t m_nodeLen; // m_nodeLen is in bytes char *m_tagName; // iff this node is a tag int32_t m_tagNameLen; int64_t m_hash; // iff this node is a tag int16_t m_depth; // set by Xml class (xml depth only) nodeid_t m_nodeId; // 0 for text,1 for xml tag, 1+ for html unsigned char m_hasBackTag : 1; unsigned char m_isBreaking : 1; // does tag (if it is) line break? unsigned char m_isVisible : 1; unsigned char m_isSelfLink : 1; // an a href tag link to self? int32_t m_pairTagNum; // paired opening or closing tag class XmlNode *m_parent; }; enum NodeTagType { TAG_TYPE_UNKNOWN = 0, TAG_TYPE_XML, TAG_TYPE_HTML_VOID, TAG_TYPE_HTML_RAW, TAG_TYPE_HTML_ESCAPABLE_RAW, TAG_TYPE_HTML_FOREIGN, TAG_TYPE_HTML_NORMAL, LAST_TAG_TYPE }; // Now set up a structure for describing ALL the available HTML nodes. // . Each HTML node has a name, name length, does it break a word? // a format bit. (most HTML tags have 0 for their format bit // because we really don't care about what they do -- we use format // bits for extracting title, summaries, et al. // . the is indexable is false for tags like <script> <option> whose contents // are not visible/indexable class NodeType { public: const char *m_nodeName; bool m_hasBackTag; char m_isBreaking; char m_isVisible; char m_filterKeep1; // for &strip=1 option char m_filterKeep2; // for &strip=2 option nodeid_t m_nodeId; char m_tagType; }; extern const class NodeType g_nodes[]; #endif // GB_XMLNODE_H