// C++ header: XmlNode, NodeType and the HTML/XML tag-id enumeration.
#ifndef _XMLNODE_H_
#define _XMLNODE_H_

#include "gb-include.h"

// . an xml node can be text or tag (html or xml tag)
// . a nodeid_t says which: XmlNode treats 0 as text, 1 as a generic xml
//   tag and anything above 1 as an html tag
typedef int16_t nodeid_t;

// . get how many xml/html tags we have classified in our g_nodes[] array
// . used by Weights.cpp
int32_t getNumXmlNodes ( );

// . per-tag-id lookups
// . NOTE(review): presumably these report the matching g_nodes[] entry's
//   m_isBreaking / m_hasBackTag flags -- confirm in the implementation
bool isBreakingTagId ( nodeid_t tagId );
bool hasBackTag      ( nodeid_t tagId );

// . "node" points at the start of a tag
// . NOTE(review): presumably returns that tag's length in bytes -- confirm
int32_t getTagLen ( char *node );

// . does "s" start a tag? the inline definition is further down this file
bool isTagStart ( char *s );

// . s points to tag name - first char
// . NOTE(review): "retp", when given, presumably receives the NodeType
//   entry that matched -- confirm in the implementation
nodeid_t getTagId ( char *s , class NodeType **retp = NULL );

class XmlNode {
|
|
|
|
public:
|
|
|
|
friend class Xml; // needs to access our private parts ;)
|
|
friend class XmlDoc; // needs to access our private parts ;)
|
|
|
|
bool isText () { return m_nodeId == 0; };
|
|
bool isTag () { return m_nodeId > 0; };
|
|
bool isHtmlTag () { return m_nodeId > 1; };
|
|
bool isXmlTag () { return m_nodeId == 1; };
|
|
nodeid_t getNodeId () { return m_nodeId; };
|
|
int64_t getNodeHash() { return m_hash; };
|
|
char *getNode () { return m_node; };
|
|
// m_nodeLen is in bytes
|
|
int32_t getNodeLen () { return m_nodeLen; };
|
|
//int32_t getXmlParent () { return m_xmlParentTagNum; };
|
|
bool isBreaking () { return m_isBreaking; };
|
|
bool isVisible () { return m_isVisible; };
|
|
bool hasBackTag () { return m_hasBackTag; };
|
|
|
|
// exclude meta tags and comment tags (they are not front or back)
|
|
bool isFrontTag () {
|
|
return m_nodeId > 0 && m_node[1]!='/' &&
|
|
m_nodeId != 68 && m_nodeId != 109; };
|
|
|
|
// . get the value of a field like "href" in the <a href="blah"> tag
|
|
char *getFieldValue ( char *fieldName , int32_t *valueLen );
|
|
|
|
// . used exclusively by Xml class which contains an array of XmlNodes
|
|
// . "node" points to the beginning of the node, the '<' if it's a tag
|
|
// . sets m_node,m_nodeLen,m_hash,m_isBreaking,m_nodeId
|
|
// . returns the length of the node
|
|
// . pureXml is true if node cannot be an html tag, except comment
|
|
//int32_t set ( char *node , bool pureXml );
|
|
int32_t set ( char *node , bool pureXml , int32_t version );
|
|
|
|
// private:
|
|
|
|
// . called by set() to get the length of a tag node
|
|
//int32_t getTagLen ( char *node , int32_t version);
|
|
//int32_t getTagLen ( UChar *node , int32_t version );
|
|
|
|
// . called by set() to get the length of a TEXT node (and set it)
|
|
//int32_t setTextNode ( char *node );
|
|
|
|
// . called by set() to get the length of a COMMENT node (and set it)
|
|
int32_t setCommentNode ( char *node );
|
|
//int32_t setCommentNode ( UChar *node );
|
|
|
|
int32_t setCommentNode2 ( char *node );
|
|
|
|
// . called by set() to get the length of a CDATA node (and set it)
|
|
int32_t setCDATANode ( char *node );
|
|
//int32_t setCDATANode ( UChar *node );
|
|
|
|
// . called by set() to get nodeId and isBreaking of a tag node
|
|
// . returns the nodeId
|
|
nodeid_t setNodeInfo ( int64_t nodeHash );
|
|
|
|
char *m_node; // tag data, or text data if not a tag
|
|
int32_t m_nodeLen; // m_nodeLen is in bytes
|
|
char *m_tagName; // iff this node is a tag
|
|
int32_t m_tagNameLen;
|
|
int64_t m_hash; // iff this node is a tag
|
|
//int64_t m_compoundHash; // set by Xml class
|
|
//int32_t m_parentTagNum; // set by Xml class
|
|
//int32_t m_xmlParentTagNum; // set by Xml class
|
|
int16_t m_depth; // set by Xml class (xml depth only)
|
|
nodeid_t m_nodeId; // 0 for text,1 for xml tag, 1+ for html
|
|
char m_hasBackTag:1;
|
|
char m_isBreaking:1; // does tag (if it is) line break?
|
|
char m_isVisible:1;
|
|
char m_isSelfLink:1; // an a href tag link to self?
|
|
int32_t m_pairTagNum; // paired opening or closing tag
|
|
// . "m_linkNum" references a link in Links.cpp
|
|
// . use for <a href> xml nodes only right now
|
|
// . used so XmlDoc.cpp::getContactUsLink() works better
|
|
//int32_t m_linkNum;
|
|
class XmlNode *m_parent;
|
|
};
|
|
|
|
// . does "s" start a tag? (regular tag , back tag or comment tag)
|
|
inline bool isTagStart ( char *s ) { // , int32_t i, int32_t version ) {
|
|
// it must start with < to be a tag
|
|
if ( *s != '<' ) return false;
|
|
// a <gb is a fake tag because we now decode all html entities
|
|
// so in htmlDecode() in fctypes.cpp we decode < to
|
|
// "<gb"
|
|
//if ( s[i+1]=='g' && s[i+2]=='b') return false;
|
|
// minimal tag is 3 chars
|
|
// if ( !s[ii + 2 >= len ) return false;
|
|
// next char can be an alnum, !-- or / then alnum
|
|
if ( is_alnum_a ( s[1] ) ) return true;
|
|
// next char can be 1 of 3 things to be a tag
|
|
//switch ( s[1] ) {
|
|
// / is also acceptable, followed only by an alnum or >
|
|
if ( s[1]== '/' ) {
|
|
if ( is_alnum_a(s[2]) ) return true;
|
|
if ( s[2] == '>' ) return true;
|
|
return false;
|
|
}
|
|
// office.microsoft.com uses <?xml ...?> tags
|
|
if ( s[1]=='?' ) {
|
|
if ( is_alnum_a(s[2]) ) return true;
|
|
//if ( s[2] == '>' ) return true; <?> is tag???
|
|
return false;
|
|
}
|
|
// make sure the double hyphens follow the ! or alnum
|
|
if ( s[1]=='!' ) {
|
|
// this is for <!xml> i guess
|
|
if ( is_alnum_a(s[2]) ) return true;
|
|
// and the <![CDATA[
|
|
if ( s[2]=='[' && s[3]=='C' && s[4]=='D' &&
|
|
s[5]=='A' && s[6]=='T' && s[7]=='A' &&
|
|
s[8]=='[' ) return true;
|
|
// and the <!-- comment here--> famous comment tag
|
|
if ( s[2]=='-' && s[3]=='-' ) return true;
|
|
// and <![....]> i've seen too
|
|
// <![if gt IE 6]><script>.... for waterfordcoc.org
|
|
if ( s[2] == '[' ) return true;
|
|
}
|
|
return false;
|
|
};
|
|
|
|
|
|
// Now set up a structure for describing ALL the available HTML nodes.
|
|
// . Each HTML node has a name, name length, does it break a word?
|
|
// a format bit. (most HTML tags have 0 for their format bit
|
|
// because we really don't care about what they do -- we use format
|
|
// bits for extracting title, summaries, et al.
|
|
// . the is indexable is false for tags like <script> <option> whose contents
|
|
// are not visible/indexable
|
|
class NodeType {
|
|
public:
|
|
char *m_nodeName;
|
|
bool m_hasBackTag;
|
|
char m_isBreaking;
|
|
char m_isVisible;
|
|
char m_filterKeep1; // for &strip=1 option
|
|
char m_filterKeep2; // for &strip=2 option
|
|
nodeid_t m_nodeId;
|
|
char m_isXmlTag;
|
|
};
|
|
|
|
extern class NodeType g_nodes[];
|
|
|
|
inline char *getTagName ( nodeid_t tagId ) {return g_nodes[tagId].m_nodeName;};
|
|
|
|
// . each tag has a number
// . the numbers are positional: only TAG_TEXTNODE is given a value, every
//   other tag is one more than the tag before it
// . so add new tags just above the fake tags at the bottom and never
//   insert or reorder: XmlNode::isFrontTag() hard-codes 68 (TAG_META) and
//   109 (TAG_COMMENT)
// . NOTE(review): g_nodes[] is presumably laid out in this same order --
//   confirm before changing anything here
enum {
	TAG_TEXTNODE = 0,
	TAG_XMLTAG,
	TAG_A,
	TAG_ABBREV,
	TAG_ACRONYM,
	TAG_ADDRESS,
	TAG_APPLET,
	TAG_AREA,
	TAG_AU,
	TAG_AUTHOR,
	TAG_B,            // 10
	TAG_BANNER,
	TAG_BASE,
	TAG_BASEFONT,
	TAG_BGSOUND,
	TAG_BIG,
	TAG_BLINK,
	TAG_BLOCKQUOTE,
	TAG_BQ,
	TAG_BODY,
	TAG_BR,           // 20
	TAG_CAPTION,
	TAG_CENTER,
	TAG_CITE,
	TAG_CODE,
	TAG_COL,
	TAG_COLGROUP,
	TAG_CREDIT,
	TAG_DEL,
	TAG_DFN,
	TAG_DIR,          // 30
	TAG_DIV,
	TAG_DL,
	TAG_DT,
	TAG_DD,
	TAG_EM,
	TAG_EMBED,
	TAG_FIG,
	TAG_FN,
	TAG_FONT,
	TAG_FORM,         // 40
	TAG_FRAME,
	TAG_FRAMESET,
	TAG_H1,
	TAG_H2,
	TAG_H3,
	TAG_H4,
	TAG_H5,
	TAG_H6,
	TAG_HEAD,
	TAG_HR,           // 50
	TAG_HTML,
	TAG_I,
	TAG_IFRAME,
	TAG_IMG,
	TAG_INPUT,
	TAG_INS,
	TAG_ISINDEX,
	TAG_KBD,
	TAG_LANG,
	TAG_LH,           // 60
	TAG_LI,
	TAG_LINK,
	TAG_LISTING,
	TAG_MAP,
	TAG_MARQUEE,
	TAG_MATH,
	TAG_MENU,
	TAG_META,         // 68, hard-coded in XmlNode::isFrontTag()
	TAG_MULTICOL,
	TAG_NOBR,         // 70
	TAG_NOFRAMES,
	TAG_NOTE,
	TAG_OL,
	TAG_OVERLAY,
	TAG_P,
	TAG_PARAM,
	TAG_PERSON,
	TAG_PLAINTEXT,
	TAG_PRE,
	TAG_Q,            // 80
	TAG_RANGE,
	TAG_SAMP,
	TAG_SCRIPT,
	TAG_SELECT,
	TAG_SMALL,
	TAG_SPACER,
	TAG_SPOT,
	TAG_STRIKE,
	TAG_STRONG,
	TAG_SUB,          // 90
	TAG_SUP,
	TAG_TAB,
	TAG_TABLE,
	TAG_TBODY,

	TAG_TD,
	TAG_TEXTAREA,
	TAG_TEXTFLOW,
	TAG_TFOOT,
	TAG_TH,
	TAG_THEAD,        // 100
	TAG_TITLE,
	TAG_TR,
	TAG_TT,

	TAG_U,
	TAG_UL,
	TAG_VAR,
	TAG_WBR,
	TAG_XMP,
	TAG_COMMENT,      // 109, hard-coded in XmlNode::isFrontTag()

	TAG_OPTION,       // 110
	TAG_STYLE,
	TAG_DOCTYPE,
	TAG_XML,
	TAG_START,
	TAG_STOP,
	TAG_SPAN,
	TAG_LEGEND,
	TAG_S,

	TAG_ABBR,
	TAG_CDATA,        // 120
	TAG_NOSCRIPT,
	TAG_FIELDSET,
	TAG_FBORIGLINK,   // "feedburner:origlink" special feedburner link
	TAG_RDF,          // rdf:RDF
	TAG_RSS,          // rss
	TAG_FEED,         // atom feed tag

	TAG_ITEM,
	TAG_ENTRY,
	TAG_CHANNEL,
	TAG_ENCLOSURE,    // 130
	TAG_WEBLOG,
	// a tag we insert in XmlDoc.cpp to indicate expanded frame/iframe src
	TAG_GBFRAME,
	TAG_TC,
	TAG_GBXMLTITLE,

	// facebook xml tags
	TAG_FBSTARTTIME,  // 135
	TAG_FBENDTIME,    // 136
	TAG_FBNAME,
	TAG_FBPICSQUARE,
	TAG_FBHIDEGUESTLIST,

	// . do not parse this up into words!! it is text in <script> tags
	// . consider it a whole tag i guess
	TAG_SCRIPTTEXT,   // 140
	TAG_BUTTON,
	TAG_URLFROM,      // for ahrefs.com

	// support sitemap.xml
	TAG_LOC,

	//
	// fake tags below here
	//
	// a fake tag used by Sections.cpp
	TAG_SENTENCE,

	// not a tag: one more than the highest id above
	LAST_TAG
};

#endif
// _XMLNODE_H_