Parse XML docs as pure XML again, but set the node id to TAG_LINK etc. so Linkdb.cpp can get links again. Added the isparentsitemap URL filter to prioritize URLs from sitemaps. Added isrssext to the URL filters to prioritize new possible RSS feed URLs. Added numinlinks to the URL filters to prioritize popular URLs for spidering. Use those filters in the default web filter set. Fixed the filters that delete URLs from the index using the 'DELETE' priority; they weren't getting deleted.
339 lines
8.3 KiB
C++
339 lines
8.3 KiB
C++
#ifndef _XMLNODE_H_
|
|
#define _XMLNODE_H_
|
|
|
|
#include "gb-include.h"
|
|
// . an xml node can be text or tag (html or xml tag)
// . nodeid_t holds the numeric id of a node; the TAG_* enum later in this
//   file lists the ids
typedef int16_t nodeid_t;

// . get how many xml/html tags we have classified in our g_nodes[] array
// . used by Weights.cpp
int32_t getNumXmlNodes ( ) ;

// . per-tag-id lookups; defined outside this header
// . NOTE(review): presumably these read the matching flags of the
//   g_nodes[] entry for "tagId" -- confirm against the definitions
bool isBreakingTagId ( nodeid_t tagId ) ;
bool hasBackTag      ( nodeid_t tagId ) ;

// . "node" points at a tag
// . NOTE(review): presumably returns the length of that tag in bytes --
//   confirm against the definition
int32_t getTagLen ( char *node ) ;

// . does "s" start a tag? defined inline further down in this file
bool isTagStart ( char *s ) ;

// . s points to tag name - first char
// . NOTE(review): "retp", when given, presumably receives the matching
//   NodeType entry -- confirm against the definition
nodeid_t getTagId ( char *s , class NodeType **retp = NULL );
|
|
|
|
class XmlNode {
|
|
|
|
public:
|
|
|
|
friend class Xml; // needs to access our private parts ;)
|
|
friend class XmlDoc; // needs to access our private parts ;)
|
|
|
|
bool isText () { return m_nodeId == 0; };
|
|
bool isTag () { return m_nodeId > 0; };
|
|
bool isHtmlTag () { return m_nodeId > 1; };
|
|
bool isXmlTag () { return m_nodeId == 1; };
|
|
nodeid_t getNodeId () { return m_nodeId; };
|
|
int64_t getNodeHash() { return m_hash; };
|
|
char *getNode () { return m_node; };
|
|
// m_nodeLen is in bytes
|
|
int32_t getNodeLen () { return m_nodeLen; };
|
|
//int32_t getXmlParent () { return m_xmlParentTagNum; };
|
|
bool isBreaking () { return m_isBreaking; };
|
|
bool isVisible () { return m_isVisible; };
|
|
bool hasBackTag () { return m_hasBackTag; };
|
|
|
|
// exclude meta tags and comment tags (they are not front or back)
|
|
bool isFrontTag () {
|
|
return m_nodeId > 0 && m_node[1]!='/' &&
|
|
m_nodeId != 68 && m_nodeId != 109; };
|
|
|
|
// . get the value of a field like "href" in the <a href="blah"> tag
|
|
char *getFieldValue ( char *fieldName , int32_t *valueLen );
|
|
|
|
// . used exclusively by Xml class which contains an array of XmlNodes
|
|
// . "node" points to the beginning of the node, the '<' if it's a tag
|
|
// . sets m_node,m_nodeLen,m_hash,m_isBreaking,m_nodeId
|
|
// . returns the length of the node
|
|
// . pureXml is true if node cannot be an html tag, except comment
|
|
//int32_t set ( char *node , bool pureXml );
|
|
int32_t set ( char *node , bool pureXml , int32_t version );
|
|
|
|
// private:
|
|
|
|
// . called by set() to get the length of a tag node
|
|
//int32_t getTagLen ( char *node , int32_t version);
|
|
//int32_t getTagLen ( UChar *node , int32_t version );
|
|
|
|
// . called by set() to get the length of a TEXT node (and set it)
|
|
//int32_t setTextNode ( char *node );
|
|
|
|
// . called by set() to get the length of a COMMENT node (and set it)
|
|
int32_t setCommentNode ( char *node );
|
|
//int32_t setCommentNode ( UChar *node );
|
|
|
|
int32_t setCommentNode2 ( char *node );
|
|
|
|
// . called by set() to get the length of a CDATA node (and set it)
|
|
int32_t setCDATANode ( char *node );
|
|
//int32_t setCDATANode ( UChar *node );
|
|
|
|
// . called by set() to get nodeId and isBreaking of a tag node
|
|
// . returns the nodeId
|
|
nodeid_t setNodeInfo ( int64_t nodeHash );
|
|
|
|
char *m_node; // tag data, or text data if not a tag
|
|
int32_t m_nodeLen; // m_nodeLen is in bytes
|
|
char *m_tagName; // iff this node is a tag
|
|
int32_t m_tagNameLen;
|
|
int64_t m_hash; // iff this node is a tag
|
|
//int64_t m_compoundHash; // set by Xml class
|
|
//int32_t m_parentTagNum; // set by Xml class
|
|
//int32_t m_xmlParentTagNum; // set by Xml class
|
|
int16_t m_depth; // set by Xml class (xml depth only)
|
|
nodeid_t m_nodeId; // 0 for text,1 for xml tag, 1+ for html
|
|
char m_hasBackTag:1;
|
|
char m_isBreaking:1; // does tag (if it is) line break?
|
|
char m_isVisible:1;
|
|
char m_isSelfLink:1; // an a href tag link to self?
|
|
int32_t m_pairTagNum; // paired opening or closing tag
|
|
// . "m_linkNum" references a link in Links.cpp
|
|
// . use for <a href> xml nodes only right now
|
|
// . used so XmlDoc.cpp::getContactUsLink() works better
|
|
//int32_t m_linkNum;
|
|
class XmlNode *m_parent;
|
|
};
|
|
|
|
// . does "s" start a tag? (regular tag , back tag or comment tag)
|
|
inline bool isTagStart ( char *s ) { // , int32_t i, int32_t version ) {
|
|
// it must start with < to be a tag
|
|
if ( *s != '<' ) return false;
|
|
// a <gb is a fake tag because we now decode all html entites
|
|
// so in htmlDecode() in fctypes.cpp we decode < to
|
|
// "<gb"
|
|
//if ( s[i+1]=='g' && s[i+2]=='b') return false;
|
|
// minimal tag is 3 chars
|
|
// if ( !s[ii + 2 >= len ) return false;
|
|
// next char can be an alnum, !-- or / then alnum
|
|
if ( is_alnum_a ( s[1] ) ) return true;
|
|
// next char can be 1 of 3 things to be a tag
|
|
//switch ( s[1] ) {
|
|
// / is also acceptable, followed only by an alnum or >
|
|
if ( s[1]== '/' ) {
|
|
if ( is_alnum_a(s[2]) ) return true;
|
|
if ( s[2] == '>' ) return true;
|
|
return false;
|
|
}
|
|
// office.microsoft.com uses <?xml ...?> tags
|
|
if ( s[1]=='?' ) {
|
|
if ( is_alnum_a(s[2]) ) return true;
|
|
//if ( s[2] == '>' ) return true; <?> is tag???
|
|
return false;
|
|
}
|
|
// make sure the double hyphens follow the ! or alnum
|
|
if ( s[1]=='!' ) {
|
|
// this is for <!xml> i guess
|
|
if ( is_alnum_a(s[2]) ) return true;
|
|
// and the <![CDATA[
|
|
if ( s[2]=='[' && s[3]=='C' && s[4]=='D' &&
|
|
s[5]=='A' && s[6]=='T' && s[7]=='A' &&
|
|
s[8]=='[' ) return true;
|
|
// and the <!-- comment here--> famous comment tag
|
|
if ( s[2]=='-' && s[3]=='-' ) return true;
|
|
// and <![....]> i've seen too
|
|
// <![if gt IE 6]><script>.... for waterfordcoc.org
|
|
if ( s[2] == '[' ) return true;
|
|
}
|
|
return false;
|
|
};
|
|
|
|
|
|
// Now set up a structure for describing ALL the available HTML nodes.
|
|
// . Each HTML node has a name, name length, does it break a word?
|
|
// a format bit. (most HTML tags have 0 for their format bit
|
|
// because we really don't care about what they do -- we use format
|
|
// bits for extracting title, summaries, et al.
|
|
// . the is indexable is false for tags like <script> <option> whose contents
|
|
// are not visible/indexable
|
|
class NodeType {
|
|
public:
|
|
char *m_nodeName;
|
|
bool m_hasBackTag;
|
|
char m_isBreaking;
|
|
char m_isVisible;
|
|
char m_filterKeep1; // for &strip=1 option
|
|
char m_filterKeep2; // for &strip=2 option
|
|
nodeid_t m_nodeId;
|
|
char m_isXmlTag;
|
|
};
|
|
|
|
extern class NodeType g_nodes[];
|
|
|
|
inline char *getTagName ( nodeid_t tagId ) {return g_nodes[tagId].m_nodeName;};
|
|
|
|
// . each tag has a number
// . the numbers are assigned implicitly, counting up from TAG_TEXTNODE = 0,
//   so inserting or removing an entry renumbers everything after it
// . getTagName() indexes g_nodes[] with these numbers
// . NOTE(review): presumably g_nodes[] must therefore list its entries in
//   this same order -- verify where g_nodes[] is defined
enum {
	TAG_TEXTNODE = 0,
	TAG_XMLTAG,
	TAG_A,
	TAG_ABBREV,
	TAG_ACRONYM,
	TAG_ADDRESS,
	TAG_APPLET,
	TAG_AREA,
	TAG_AU,
	TAG_AUTHOR,
	TAG_B,            // 10
	TAG_BANNER,
	TAG_BASE,
	TAG_BASEFONT,
	TAG_BGSOUND,
	TAG_BIG,
	TAG_BLINK,
	TAG_BLOCKQUOTE,
	TAG_BQ,
	TAG_BODY,
	TAG_BR,           // 20
	TAG_CAPTION,
	TAG_CENTER,
	TAG_CITE,
	TAG_CODE,
	TAG_COL,
	TAG_COLGROUP,
	TAG_CREDIT,
	TAG_DEL,
	TAG_DFN,
	TAG_DIR,          // 30
	TAG_DIV,
	TAG_DL,
	TAG_DT,
	TAG_DD,
	TAG_EM,
	TAG_EMBED,
	TAG_FIG,
	TAG_FN,
	TAG_FONT,
	TAG_FORM,         // 40
	TAG_FRAME,
	TAG_FRAMESET,
	TAG_H1,
	TAG_H2,
	TAG_H3,
	TAG_H4,
	TAG_H5,
	TAG_H6,
	TAG_HEAD,
	TAG_HR,           // 50
	TAG_HTML,
	TAG_I,
	TAG_IFRAME,
	TAG_IMG,
	TAG_INPUT,
	TAG_INS,
	TAG_ISINDEX,
	TAG_KBD,
	TAG_LANG,
	TAG_LH,           // 60
	TAG_LI,
	TAG_LINK,
	TAG_LISTING,
	TAG_MAP,
	TAG_MARQUEE,
	TAG_MATH,
	TAG_MENU,
	TAG_META,         // 68 (hard-coded in XmlNode::isFrontTag())
	TAG_MULTICOL,
	TAG_NOBR,         // 70
	TAG_NOFRAMES,
	TAG_NOTE,
	TAG_OL,
	TAG_OVERLAY,
	TAG_P,
	TAG_PARAM,
	TAG_PERSON,
	TAG_PLAINTEXT,
	TAG_PRE,
	TAG_Q,            // 80
	TAG_RANGE,
	TAG_SAMP,
	TAG_SCRIPT,
	TAG_SELECT,
	TAG_SMALL,
	TAG_SPACER,
	TAG_SPOT,
	TAG_STRIKE,
	TAG_STRONG,
	TAG_SUB,          // 90
	TAG_SUP,
	TAG_TAB,
	TAG_TABLE,
	TAG_TBODY,

	TAG_TD,
	TAG_TEXTAREA,
	TAG_TEXTFLOW,
	TAG_TFOOT,
	TAG_TH,
	TAG_THEAD,        // 100
	TAG_TITLE,
	TAG_TR,
	TAG_TT,

	TAG_U,
	TAG_UL,
	TAG_VAR,
	TAG_WBR,
	TAG_XMP,
	TAG_COMMENT,      // 109 (hard-coded in XmlNode::isFrontTag())

	TAG_OPTION,       // 110
	TAG_STYLE,
	TAG_DOCTYPE,
	TAG_XML,
	TAG_START,
	TAG_STOP,
	TAG_SPAN,
	TAG_LEGEND,
	TAG_S,

	TAG_ABBR,
	TAG_CDATA,        // 120
	TAG_NOSCRIPT,
	TAG_FIELDSET,
	TAG_FBORIGLINK,   // "feedburner:origlink" special feedburner link
	TAG_RDF,          // rdf:RDF
	TAG_RSS,          // rss
	TAG_FEED,         // atom feed tag

	TAG_ITEM,
	TAG_ENTRY,
	TAG_CHANNEL,
	TAG_ENCLOSURE,    // 130
	TAG_WEBLOG,
	// a tag we insert in XmlDoc.cpp to indicate expanded frame/iframe src
	TAG_GBFRAME,
	TAG_TC,
	TAG_GBXMLTITLE,

	// facebook xml tags
	TAG_FBSTARTTIME,  // 135
	TAG_FBENDTIME,    // 136
	TAG_FBNAME,
	TAG_FBPICSQUARE,
	TAG_FBHIDEGUESTLIST,

	// . do not parse this up into words!! it is text in <script> tags
	// . consider it a whole tag i guess
	TAG_SCRIPTTEXT,   // 140
	TAG_BUTTON,
	TAG_URLFROM,      // for ahrefs.com

	// support sitemap.xml
	TAG_LOC,

	//
	// fake tags below here
	//
	// a fake tag used by Sections.cpp
	TAG_SENTENCE,

	LAST_TAG
};

// . XmlNode::isFrontTag() compares against the raw numbers 68 and 109
//   because this enum is declared after that class
// . break the compile (negative array size) if those two ids ever drift
typedef char gb_check_tag_meta_is_68     [ ( TAG_META    ==  68 ) ? 1 : -1 ];
typedef char gb_check_tag_comment_is_109 [ ( TAG_COMMENT == 109 ) ? 1 : -1 ];
|
|
#endif
|
|
|
|
|