privacore-open-source-searc…/XmlNode.cpp

#include "XmlNode.h"
#include "Mem.h"
#include "Sanity.h"
#include "utf8_fast.h"
#include "hash.h"

static int32_t getTagLen(const char *node, int maxNodeLen);

// . Table of every node type the parser knows about: name, whether it has a
//   back tag, whether it is a breaking node, and its node id
// . entries must appear in node id order: XmlNode::set() verifies on first use
//   that g_nodes[i].m_nodeId == i and shuts down otherwise
// . isVisible is true if text in between front and end tags is visible on page
// . isVisible is used by Xml::getText()
// . filterKeep is 1 if we should keep it when &strip=1 is given when getting
//   the cached document. i added this for faisal
// . a filterKeep of 0 means remove tag and text between it and its back tag.
// . a filterKeep of 1 means keep the tag and text between it and its back tag.
// . a filterKeep of 2 means remove tag BUT keep the text between
//   it and its back tag.
const NodeType g_nodes[] = {
    // Columns, in order:
    //   NAME, hasBackTag, isBreaking, isVisible, filterKeep1, filterKeep2,
    //   node id (must equal the row index), tag type (0, TAG_TYPE_HTML_VOID or TAG_TYPE_XML)
    // --------------------------
    //  -- text node    ---  0
	{"textNode" , 0, 0, 1, 1,1, TAG_TEXTNODE  , 0},
	//  -- xml tag node ---  1
	{"xmlTag"   , 1, 1, 1, 2,2, TAG_XMLTAG    , 0},
	{"A"        , 1, 0, 1, 1,1, TAG_A         , 0},
	{"ABBREV"   , 1, 1, 1, 2,2, TAG_ABBREV    , 0},
	{"ACRONYM"  , 1, 1, 1, 2,1, TAG_ACRONYM   , 0},
	{"ADDRESS"  , 1, 1, 1, 2,2, TAG_ADDRESS   , 0},
	{"APPLET"   , 1, 1, 1, 0,0, TAG_APPLET    , 0},
	{"AREA"     , 0, 1, 1, 0,0, TAG_AREA      , TAG_TYPE_HTML_VOID},
	{"AU"       , 1, 1, 1, 0,0, TAG_AU        , 0},
	{"AUTHOR"   , 1, 1, 1, 0,0, TAG_AUTHOR    , 0},
	{"B"        , 1, 0, 1, 1,1, TAG_B         , 0},
	{"BANNER"   , 1, 1, 1, 0,0, TAG_BANNER    , 0},
	{"BASE"     , 0, 1, 1, 0,0, TAG_BASE      , TAG_TYPE_HTML_VOID},
	{"BASEFONT" , 0, 1, 1, 2,2, TAG_BASEFONT  , 0},
	{"BGSOUND"  , 0, 1, 1, 0,0, TAG_BGSOUND   , 0},
	{"BIG"      , 1, 0, 1, 2,1, TAG_BIG       , 0},
	{"BLINK"    , 1, 0, 1, 2,2, TAG_BLINK     , 0},
	{"BLOCKQUOTE",1, 1, 1, 2,1, TAG_BLOCKQUOTE, 0},
	{"BQ"       , 1, 1, 1, 0,0, TAG_BQ        , 0},
	{"BODY"     , 1, 1, 1, 1,1, TAG_BODY      , 0},
	{"BR"       , 0, 1, 1, 1,1, TAG_BR        , TAG_TYPE_HTML_VOID},
	{"CAPTION"  , 1, 1, 1, 2,1, TAG_CAPTION   , 0},
	{"CENTER"   , 1, 1, 1, 1,1, TAG_CENTER    , 0},
	{"CITE"     , 1, 1, 1, 2,1, TAG_CITE      , 0},
	{"CODE"     , 1, 1, 1, 2,1, TAG_CODE      , 0},
	// NOTE(review): typed as a void element yet hasBackTag is 1, unlike the
	// other TAG_TYPE_HTML_VOID rows -- confirm which is intended
	{"COL"      , 1, 1, 1, 2,2, TAG_COL       , TAG_TYPE_HTML_VOID},
	{"COLGROUP" , 1, 1, 1, 0,0, TAG_COLGROUP  , 0},
	{"CREDIT"   , 1, 1, 1, 0,0, TAG_CREDIT    , 0},
	{"DEL"      , 1, 1, 1, 2,1, TAG_DEL       , 0},
	{"DFN"      , 1, 1, 1, 2,1, TAG_DFN       , 0},
	{"DIR"      , 1, 1, 1, 0,0, TAG_DIR       , 0},
	{"DIV"      , 1, 1, 1, 1,1, TAG_DIV       , 0},
	{"DL"       , 1, 1, 1, 1,1, TAG_DL        , 0},
	// this may not have a back tag!
	{"DT"       , 1, 1, 1, 1,1, TAG_DT        , 0},
	// this may not have a back tag!
	{"DD"       , 1, 1, 1, 1,1, TAG_DD        , 0},
	{"EM"       , 1, 0, 1, 2,1, TAG_EM        , 0}, // emphasized text
	{"EMBED"    , 0, 1, 1, 0,0, TAG_EMBED     , TAG_TYPE_HTML_VOID},
	{"FIG"      , 1, 1, 1, 0,0, TAG_FIG       , 0},
	{"FN"       , 1, 1, 1, 0,0, TAG_FN        , 0},
	{"FONT"     , 1, 0, 1, 1,1, TAG_FONT      , 0},
	{"FORM"     , 1, 1, 1, 2,2, TAG_FORM      , 0},
	// this may not have a back tag!
	{"FRAME"    , 1, 1, 1, 0,0, TAG_FRAME     , 0},
	{"FRAMESET" , 1, 1, 1, 0,0, TAG_FRAMESET  , 0},
	{"H1"       , 1, 1, 1, 1,1, TAG_H1        , 0},
	{"H2"       , 1, 1, 1, 1,1, TAG_H2        , 0},
	{"H3"       , 1, 1, 1, 1,1, TAG_H3        , 0},
	{"H4"       , 1, 1, 1, 1,1, TAG_H4        , 0},
	{"H5"       , 1, 1, 1, 1,1, TAG_H5        , 0},
	{"H6"       , 1, 1, 1, 1,1, TAG_H6        , 0},
	{"HEAD"     , 1, 1, 1, 1,1, TAG_HEAD      , 0},
	{"HR"       , 0, 1, 1, 1,1, TAG_HR        , TAG_TYPE_HTML_VOID},
	{"HTML"     , 1, 1, 1, 1,1, TAG_HTML      , 0},
	{"I"        , 1, 0, 1, 2,1, TAG_I         , 0},
	{"IFRAME"   , 1, 1, 1, 2,2, TAG_IFRAME    , 0},
	// filter = 1, but the tag is turned into its alt text
	{"IMG"      , 0, 1, 1, 1,1, TAG_IMG       , TAG_TYPE_HTML_VOID},
	{"INPUT"    , 0, 1, 1, 0,0, TAG_INPUT     , TAG_TYPE_HTML_VOID},
	{"INS"      , 1, 1, 1, 2,1, TAG_INS       , 0},
	{"ISINDEX"  , 0, 1, 1, 0,0, TAG_ISINDEX   , 0},
	{"KBD"      , 1, 1, 1, 2,1, TAG_KBD       , 0},
	{"LANG"     , 1, 1, 1, 0,0, TAG_LANG      , 0},
	{"LH"       , 1, 1, 1, 0,0, TAG_LH        , 0},
	// this may or may not have a back tag
	{"LI"       , 1, 1, 1, 1,1, TAG_LI        , 0},
	// NOTE(review): the original comment here repeated the LI remark ("may or
	// may not have a back tag"); the row itself declares no back tag
	{"LINK"     , 0, 1, 1, 0,0, TAG_LINK      , TAG_TYPE_HTML_VOID},
	{"LISTING"  , 1, 1, 1, 0,0, TAG_LISTING   , 0},
	{"MAP"      , 1, 1, 1, 0,0, TAG_MAP       , 0},
	// don't index marquee text
	{"MARQUEE"  , 1, 1, 0, 2,2, TAG_MARQUEE   , 0},
	{"MATH"     , 1, 1, 1, 0,0, TAG_MATH      , 0},
	{"MENU"     , 1, 1, 1, 1,1, TAG_MENU      , 0},
    // TAG_MENUITEM (TAG_TYPE_HTML_VOID)
	{"META"     , 0, 1, 1, 1,1, TAG_META      , TAG_TYPE_HTML_VOID},
	{"MULTICOL" , 0, 1, 1, 0,0, TAG_MULTICOL  , 0},
	{"NOBR"     , 1, 0, 1, 0,0, TAG_NOBR      , 0},
	{"NOFRAMES" , 1, 1, 1, 0,0, TAG_NOFRAMES  , 0},
	{"NOTE"     , 1, 1, 1, 0,0, TAG_NOTE      , 0},
	{"OL"       , 1, 1, 1, 1,1, TAG_OL        , 0},
	{"OVERLAY"  , 0, 1, 1, 0,0, TAG_OVERLAY   , 0},
	// this may not have a back tag!
	{"P"        , 0, 1, 1, 1,1, TAG_P         , 0},
	{"PARAM"    , 0, 1, 1, 0,0, TAG_PARAM     , TAG_TYPE_HTML_VOID},
	{"PERSON"   , 1, 1, 1, 0,0, TAG_PERSON    , 0},
	{"PLAINTEXT", 1, 1, 1, 0,0, TAG_PLAINTEXT , 0},
	{"PRE"      , 1, 1, 1, 2,1, TAG_PRE       , 0},
	{"Q"        , 1, 1, 1, 2,1, TAG_Q         , 0},
	{"RANGE"    , 0, 1, 1, 0,0, TAG_RANGE     , 0},
	{"SAMP"     , 1, 1, 1, 2,1, TAG_SAMP      , 0},
	{"SCRIPT"   , 1, 1, 0, 0,0, TAG_SCRIPT    , 0},
	{"SELECT"   , 1, 1, 0, 0,0, TAG_SELECT    , 0},
	{"SMALL"    , 1, 0, 1, 2,1, TAG_SMALL     , 0},
    // TAG_SOURCE (TAG_TYPE_HTML_VOID)
	{"SPACER"   , 0, 1, 1, 2,1, TAG_SPACER    , 0},
	{"SPOT"     , 0, 1, 1, 0,0, TAG_SPOT      , 0},
	{"STRIKE"   , 1, 1, 1, 2,1, TAG_STRIKE    , 0},
	{"STRONG"   , 1, 0, 1, 2,1, TAG_STRONG    , 0},
	{"SUB"      , 1, 0, 1, 2,2, TAG_SUB       , 0},
	{"SUP"      , 1, 0, 1, 2,2, TAG_SUP       , 0},
	{"TAB"      , 0, 1, 1, 0,0, TAG_TAB       , 0},
	{"TABLE"    , 1, 1, 1, 1,1, TAG_TABLE     , 0},
	{"TBODY"    , 1, 1, 1, 1,1, TAG_TBODY     , 0},

	// this may not have a back tag!
	{"TD"       , 1, 1, 1, 1,1, TAG_TD        , 0},
	{"TEXTAREA" , 1, 1, 1, 2,2, TAG_TEXTAREA  , 0},
	{"TEXTFLOW" , 0, 1, 1, 0,0, TAG_TEXTFLOW  , 0},
	{"TFOOT"    , 0, 1, 1, 0,0, TAG_TFOOT     , 0},
	// this DOES have a back tag
	{"TH"       , 1, 1, 1, 0,0, TAG_TH        , 0},
	{"THEAD"    , 0, 1, 1, 0,0, TAG_THEAD     , 0},
	{"TITLE"    , 1, 1, 1, 1,1, TAG_TITLE     , 0},

	// this may not have a back tag!
	{"TR"       , 1, 1, 1, 1,1, TAG_TR        , 0},
    // TAG_TRACK (TAG_TYPE_HTML_VOID)
	{"TT"       , 1, 1, 1, 2,1, TAG_TT        , 0},

	{"U"        , 1, 0, 1, 1,1, TAG_U         , 0},
	{"UL"       , 1, 0, 1, 1,1, TAG_UL        , 0},
	{"VAR"      , 1, 1, 1, 2,1, TAG_VAR       , 0},
	{"WBR"      , 0, 1, 1, 0,0, TAG_WBR       , TAG_TYPE_HTML_VOID},
	{"XMP"      , 1, 1, 1, 0,0, TAG_XMP       , 0},
	{"!--"      , 0, 1, 1, 0,0, TAG_COMMENT   , 0}, // comment tag!

	{"OPTION"   , 0, 1, 1, 2,2, TAG_OPTION    , 0},
	{"STYLE"    , 1, 1, 0, 0,1, TAG_STYLE     , 0},

	// doctype tag <!DOCTYPE ...>
	{"DOCTYPE"  , 0, 1, 1, 0,0, TAG_DOCTYPE   , 0},

	// used in office.microsoft.com <?xml ...>
	{"XML"      , 0, 1, 1, 0,0, TAG_XML       , 0},

	// <start index> <stop index>
	{"START"    , 0, 1, 1, 0,0, TAG_START     , 0},
	{"STOP"     , 0, 1, 1, 0,0, TAG_STOP      , 0},

	// . i added these tags for faisal, but don't really need them
	//   since our XML tag condition handles this case
	// . we can no longer treat them as generic XML tags since faisal wanted
	//   the strip=2 option
	{"SPAN"     , 1, 0, 1, 2,1, TAG_SPAN      , 0}, // not breaking!
	{"LEGEND"   , 1, 1, 1, 2,1, TAG_LEGEND    , 0},
	{"S"        , 1, 1, 1, 2,1, TAG_S         , 0}, // strike tag

	{"ABBR"     , 1, 0, 1, 2,1, TAG_ABBR      , 0},
	{"![CDATA[" , 0, 1, 1, 0,0, TAG_CDATA     , 0}, // <![CDATA[ tag
	{"NOSCRIPT" , 1, 1, 0, 0,0, TAG_NOSCRIPT  , 0},
    {"FIELDSET" , 1, 1, 1, 0,0, TAG_FIELDSET  , 0},

	// feedburner uses these in the xml
	{"FEEDBURNER:ORIGLINK", 0, 1, 1, 0,0, TAG_FBORIGLINK , TAG_TYPE_XML},

	// ahrefs uses these as links
	{"RDF:RDF"  , 0, 1, 1, 0,0, TAG_RDF       , TAG_TYPE_XML},
    {"RSS"      , 0, 1, 1, 0,0, TAG_RSS       , TAG_TYPE_XML},
	{"FEED"     , 0, 1, 1, 0,0, TAG_FEED      , TAG_TYPE_XML},

	{"ITEM"     , 1, 1, 0, 0,0, TAG_ITEM      , TAG_TYPE_XML},
	{"ENTRY"    , 1, 1, 0, 0,0, TAG_ENTRY     , TAG_TYPE_XML},
	{"CHANNEL"  , 1, 1, 0, 0,0, TAG_CHANNEL   , TAG_TYPE_XML},
	{"ENCLOSURE", 1, 1, 0, 0,0, TAG_ENCLOSURE , 0},
	{"WEBLOG"   , 0, 1, 0, 0,0, TAG_WEBLOG    , TAG_TYPE_XML},

	{"GBFRAME"  , 1, 1, 1, 1,1, TAG_GBFRAME   , 0},
	{"TC"       , 1, 1, 1, 1,1, TAG_TC        , 0},// HACK: tbl column section
	{"GBXMLTITLE", 1, 1, 1, 1,1, TAG_GBXMLTITLE, TAG_TYPE_XML},

	// facebook xml
	{"START_TIME", 1, 1, 1, 1,1, TAG_FBSTARTTIME, TAG_TYPE_XML},
	{"END_TIME" , 1, 1, 1, 1,1, TAG_FBENDTIME, TAG_TYPE_XML},
	{"NAME"     , 1, 1, 1, 1,1, TAG_FBNAME, TAG_TYPE_XML},
	{"PIC_SQUARE", 1, 1, 1, 1,1, TAG_FBPICSQUARE, TAG_TYPE_XML},
	{"HIDE_GUEST_LIST", 1, 1, 1, 1,1, TAG_FBHIDEGUESTLIST, TAG_TYPE_XML},


	{"scriptText",0, 1, 0, 0,0, TAG_SCRIPTTEXT,0 },
	{"BUTTON"   , 1, 1, 1, 0,0, TAG_BUTTON,0},
	{"UrlFrom"  , 0, 1, 1, 0,0, TAG_URLFROM, TAG_TYPE_XML},

	// for sitemap.xml
	{"LOC"      , 0, 1, 1, 0,0, TAG_LOC, 0}
};
// NAME hasBackTag brk? isVisible? filterKeep1? filterKeep2 type/m_nodeId[i]


// . does "s" start a tag? (regular tag, back tag, <?...> tag, <!...> tag or comment)
// . "s" must be \0 terminated; at most s[0]..s[3] are examined and a \0
//   always stops the inspection before the next character is read
static bool isTagStart(const char *s) {
	// every tag begins with '<'
	if( s[0] != '<' ) {
		return false;
	}

	// Extensible Markup Language (XML) 1.0 (Fifth Edition)
	// https://www.w3.org/TR/REC-xml/#NT-Name
	// NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
	//                   [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
	//                   [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
	//                   [#x10000-#xEFFFF]

	/// @todo ALC cater for other start characters
	/// regex: "^<[A-Za-z]"
	if( is_alpha_a(s[1]) ) {
		return true;
	}

	// otherwise the character after '<' selects which tag form we accept
	switch( s[1] ) {
		case '/':
			// back tag: '/' followed by an alnum or '>'
			/// @todo ALC "</>" a valid tag?
			/// regex: "^</[A-Za-z0-9>]"
			return ( is_alnum_a(s[2]) || (s[2] == '>') );

		case '?':
			// office.microsoft.com uses <?xml ...?> tags
			/// regex: "^<\?[A-Za-z0-9]"
			return is_alnum_a(s[2]);

		case '!':
			// three accepted forms:
			//  - <!xml> style, '!' followed by an alnum
			//  - <![CDATA[ and <![....]>, e.g.
			//    <![if gt IE 6]><script>.... for waterfordcoc.org
			//  - the <!-- comment here--> comment tag
			return ( is_alnum_a(s[2]) ||
			         (s[2] == '[') ||
			         (s[2] == '-' && s[3] == '-') );

		default:
			return false;
	}
}


// . called by Xml class to initialise this node from the text at "node"
// . classifies the text as a CDATA section, a text node, a comment or a
//   regular tag and fills in the matching members
// . "maxNodeLen" is only forwarded to getTagLen() for regular tags
// . returns the length of the node
// . TODO: "node" is now guaranteed to be \0 terminated -- make this faster
int32_t XmlNode::set( char *node, int maxNodeLen, bool pureXml ) {
	// remember where the node begins
	m_node = node;

	// one-time sanity check: every g_nodes entry must be stored at the
	// index equal to its own node id
	static bool s_check = false;
	if ( ! s_check ) {
		s_check = true;

		// how many NodeTypes do we have in g_nodes?
		static const int32_t nn = sizeof(g_nodes) / sizeof(NodeType);

		for ( int32_t idx = 0 ; idx < nn ; idx++ ) {
			if ( g_nodes[idx].m_nodeId != idx ) {
				gbshutdownLogicError();
			}
		}
	}

	// . reset this
	// . need to do here instead of in Links.cpp because sometimes
	//   we think an anchor tag indicates a link, but it is really
	//   just an <a href="javascript:..."> function call and Links.cpp
	//   ignored it but we are expecting this to be valid!
	m_isSelfLink = 0;

	// CDATA tag was identified in earlier versions as a text node. Now
	// it is identified as a CDATA tag node. But gb.conf and others always
	// pass their version as 0
	if ( strncmp( node, "<![CDATA[", 9 ) == 0 ) {
		return setCDATANode ( node );
	}

	// anything that does not open a tag is a text node (nodeId 0) running
	// up to the next real tag start or the end of the buffer
	if ( ! isTagStart ( node ) ) {
		m_nodeId     = TAG_TEXTNODE;
		m_node       = node;
		m_hasBackTag = false;
		m_hash       = 0;

		int32_t textLen = 0;
		for ( ; node[textLen] ; ++textLen ) {
			// a '<' only ends the text if it really begins a tag
			if ( node[textLen] == '<' && isTagStart( node + textLen ) ) {
				break;
			}
		}

		m_nodeLen = textLen;
		m_pairTagNum = -1;

		return m_nodeLen;
	}

	// . comments are special cases (node end is "-->" for comments)
	// . <![if ....]> is treated as a comment too
	if  ( node[1] == '!' ) {
		if ( node[2]=='-' && node[3]=='-' ) {
			return setCommentNode ( node );
		}

		if ( node[2]=='[' ) {
			return setCommentNode2 ( node );
		}
	}

	// . otherwise it's a regular tag
	// . might be <!DOCTYPE ...> or something though
	m_nodeLen = getTagLen(node, maxNodeLen);

	// . the tag name normally starts right after the '<'
	// . if that character is not an alnum it is one of / ? ! so skip it
	// . tag names must start with a letter, fwiw
	const int32_t tagNameStart = is_alnum_a(node[1]) ? 1 : 2;

	// . walk to the end of the tag name, never past the node itself
	// . only ascii "tag name chars" count
	int32_t nameEnd = tagNameStart;
	while ( nameEnd < m_nodeLen && is_tagname_char(node[nameEnd]) ) {
		nameEnd++;
	}

	m_tagName    = &node [ tagNameStart ];
	m_tagNameLen = nameEnd - tagNameStart;

	// . set the node's hash -- used cuz it's faster than strcmp
	// . just hash the letters as upper case
	// . tag names are never utf8, so use the ascii hash
	m_hash = hash64Upper_a ( m_tagName , m_tagNameLen , 0LL);

	// looks up the node id and the default flags for this tag name
	m_nodeId = setNodeInfo ( m_hash );

	// in pure xml every tag gets the same flags, overriding the lookup
	if ( pureXml ) {
		m_hasBackTag = true;
		m_isBreaking = true;
		m_isVisible  = true;
	}

	// . no back tag if / or ? precedes the last character of the node
	// . this was only for "pureXml" but now i do it for all tags!
	const char beforeLast = m_node[m_nodeLen - 2];
	if ( beforeLast == '/' || beforeLast == '?' ) {
		m_hasBackTag = false;
	}

	return m_nodeLen;
}

// . return the length of the tag starting at "node" (node[0] is its '<')
// . the returned length includes the closing '>' when one is found
// . a '<' or '>' inside a double-quoted attribute value (a '"' preceded,
//   ignoring whitespace, by '=') does not end the tag
// . "maxNodeLen" bounds the outer scan only
// . NOTE(review): the inner quote-skipping loops and the fallback scan at
//   the bottom stop only at \0, not at maxNodeLen
static int32_t getTagLen ( const char *node, int maxNodeLen) {
	// index into node; scanning starts at 1 to skip over the first <
	int32_t i ;

	// . keep looping until we hit a < or > OR while we're in quotes
	// . ignore < and > when they're in quotes
	for ( i = 1 ; node[i] && i<maxNodeLen; i++ ) {
		// fast path: only < > " and ' need any handling
		if ( node[i] != '<'  &&
		     node[i] != '>'  &&
		     node[i] != '\"' &&
		     node[i] != '\''  ) {
			continue;
		}

		// an unquoted < or > ends the tag
		if ( ( node[i] == '<' ) || ( node[i] == '>' ) ) {
			break;
		}

		// we can have double quotes within single quotes
		if ( node [ i ] == '\"' ) {
			// scan back over whitespace looking for an equal sign...
			int32_t k;

			for ( k = i - 1 ; k > 1 ; k-- ) {
				if ( is_wspace_a(node[k]) ) continue;
				break;
			}
			// nothing but whitespace back to the start: not a value quote
			if ( k <= 1 ) continue;
			// . if an equal sign did not immediately precede
			//   this double quote then ignore the double quote
			// . this now fixes the harwoodmuseum.org issue
			//   talked about below
			if ( node[k] != '=' ) continue;
			// skip over this first quote
			i++;
			while ( node[i] && node[i]!='\"' ) {
				// some pages have unbalanced quotes.
				// see /test/doc.14541556377486183454.html
				if(node[i]=='>') {
					// a '>' right after the opening quote, or after
					// the opening quote plus one space
					if((node[i-1]=='\"') ||
					   (node[i-1]==' ' && node[i-2]=='\"'))
					{
						// What about tags that do have balanced quotes and just happen to have a '>' first in an attribute value?
						// Scan forward and check whether '<' or '>' comes first. If '>' comes first then this (node[i])
						// greater-than sign is treated as part of the attribute value.
						// NOTE(review): if i could ever exceed maxNodeLen here, the length below would be
						// negative and become a huge size_t in memchr -- confirm callers pass the remaining buffer length.
						int max_bytes_to_scan = std::min(maxNodeLen-i,100);
						const char *next_gt = (const char*)memchr(node+i+1,'>',max_bytes_to_scan);
						const char *next_lt = (const char*)memchr(node+i+1,'<',max_bytes_to_scan);
						if(!next_lt || (next_gt && next_gt<next_lt))
							; // greater-than comes first
						else {
							// '<' comes first: the quote is unbalanced, so step back
							// and leave the quoted scan with i just before this '>'
							i--;
							break;
						}
					}
				}

				// skip this char
				i++;
			}

			// return the length if the tag ended abruptly
			if ( !node[i] ) {
				return i;
			}

			// back-to-back quotes? common mistake
			if ( node[i + 1] == '\"' ) {
				i++;
			}
			continue;
		}

		// continue if we don't have a " '" or "='"
		if ( node[i] != '\'' ) {
			continue;
		}

		if ( node[i - 1] != '=' && !is_wspace_a( node[i - 1] ) ) {
			continue;
		}

		// skip to end of quote
		// NOTE(review): node[i] is the opening single quote at this point, so
		// this loop exits immediately and single-quoted values are not
		// actually skipped -- confirm whether that is intended
		while ( node[i] && node[i] != '\'' ) {
			i++;
		}
	}

	// skip i over the >
	if ( node[i] == '>' ) {
		i++;
	} else {
		// . else we stopped on something other than '>' (a '<', the end of
		//   the text or maxNodeLen)
		// . rescan from the start and stop at the first < or >, this time
		//   paying no attention to quotes
		for ( i = 1; node[i] && node[i] != '>' && node[i] != '<'; i++ );
	}

	// return the LENGTH of the whole node
	return i;
}

// . set this node up as a "<!-- ... -->" comment starting at "node"
// . the node ends just after the first "-->" or, failing that, at the \0
// . returns the node length
int32_t XmlNode::setCommentNode ( char *node ) {
	m_node        = node;
	m_nodeId      = TAG_COMMENT;
	m_tagName     = node + 1; // !--
	m_tagNameLen  = 3;
	m_hash        = hash64 ( "!--" , 3 , 0LL );
	m_isBreaking  = true;
	m_isVisible   = true;
	m_hasBackTag  = false;

	// . compute node length
	// . TODO: do we have to deal with quotes????
	// . TODO: what about nested comments?
	int32_t end = 3;
	while ( node[end] ) {
		// stop on a '>' that directly follows "--"
		if ( node[end] == '>' && node[end-1] == '-' && node[end-2] == '-' ) {
			break;
		}
		end++;
	}

	// include the > in the node, if any (could be end of doc)
	if ( node[end] == '>' ) {
		end++;
	}

	m_nodeLen = end;

	return end;
}

// . set this node up as a "<![ ... ]>" style comment starting at "node",
//   e.g. <![if gt IE 6]> or <![endif]-->
// . unlike setCommentNode() the node is marked non-breaking and not visible
// . returns the node length
int32_t XmlNode::setCommentNode2 ( char *node ) {
	m_node        = node;
	m_nodeId      = TAG_COMMENT;
	m_tagName     = node + 1;
	m_tagNameLen  = 2;
	m_hash        = hash64 ( "![" , 2 , 0LL );
	m_isBreaking  = false;
	m_isVisible   = false;
	m_hasBackTag  = false;

	// . compute node length
	// . TODO: do we have to deal with quotes????
	// . TODO: what about nested comments?
	int32_t end = 2;
	while ( node[end] ) {
		if ( node[end] == '>' ) {
			// ending of ]> like for <![if gt IE 6]>
			const bool closesBracket = ( node[end-1] == ']' );
			// ending of --> like for <![endif]-->
			const bool closesDashes  = ( node[end-1] == '-' && node[end-2] == '-' );

			if ( closesBracket || closesDashes ) {
				break;
			}
		}
		end++;
	}

	// include the > in the node, if any (could be end of doc)
	if ( node[end] == '>' ) {
		end++;
	}

	m_nodeLen = end;

	return end;
}

// . set this node up as a "<![CDATA[ ... ]]>" section starting at "node"
// . the node ends just after the first "]]>" or, failing that, at the \0
// . returns the node length
int32_t XmlNode::setCDATANode ( char *node ) {
	m_node        = node;
	m_nodeId      = TAG_CDATA;
	m_tagName     = node + 1; // ![CDATA[
	m_tagNameLen  = 8;
	m_hash        = hash64 ( "![CDATA[" , 8 , 0LL );
	m_isBreaking  = true;
	m_isVisible   = true;
	m_hasBackTag  = false;

	// . compute node length
	// . TODO: do we have to deal with quotes????
	// . TODO: what about nested CDATA sections?
	int32_t pos = 8;
	while ( node[pos] ) {
		// only the full "]]>" terminator ends the section; the && chain
		// stops at a \0 before reading any further
		if ( node[pos] == ']' && node[pos + 1] == ']' && node[pos + 2] == '>' ) {
			// the terminator is part of the node
			pos += 3;
			break;
		}
		pos++;
	}

	m_nodeLen = pos;

	return pos;
}

// . case-insensitively compare "nodeChar" with "expectedChar" and, when it is
//   not '\0', with "expectedOtherChar"
// . on a match the original (un-lowered) nodeChar is stored in *foundChar,
//   if foundChar is not NULL
static bool findCharSingle( char nodeChar, char expectedChar, char expectedOtherChar, char *foundChar ) {
	const bool matchesPrimary = ( to_lower_a(nodeChar) == to_lower_a(expectedChar) );

	// the alternate is only considered when one was supplied
	const bool matchesAlternate = !matchesPrimary &&
	                              expectedOtherChar &&
	                              ( to_lower_a(nodeChar) == to_lower_a(expectedOtherChar) );

	if ( !matchesPrimary && !matchesAlternate ) {
		// invalid char
		return false;
	}

	if ( foundChar ) {
		*foundChar = nodeChar;
	}

	return true;
}

// . scan node[start..end) forwards for expectedChar (or expectedOtherChar),
//   skipping whitespace
// . with onlyAllowWhiteSpace set, the first non-whitespace character that is
//   not a match ends the search unsuccessfully
// . *pos receives the index where the scan stopped: the match, the offending
//   character, or the loop's final index when nothing was found
static bool findChar( const char *node, int32_t start, int32_t end, int32_t *pos, char expectedChar,
					  char expectedOtherChar, char *foundChar, bool onlyAllowWhiteSpace ) {
	int32_t idx = start;

	while ( idx < end ) {
		const char current = node[idx];

		if ( !is_wspace_a( current ) ) {
			if ( findCharSingle( current, expectedChar, expectedOtherChar, foundChar ) ) {
				*pos = idx;
				return true;
			}

			// a non-matching, non-whitespace character
			if ( onlyAllowWhiteSpace ) {
				*pos = idx;
				return false;
			}
		}

		++idx;
	}

	*pos = idx;
	return false;
}

// convenience overload: a single expected character, found character not reported
static bool findChar( const char *node, int32_t start, int32_t end, int32_t *pos, char expectedChar,
					  bool onlyAllowWhiteSpace ) {
	char * const noFoundChar = NULL;
	return findChar(node, start, end, pos, expectedChar, '\0', noFoundChar, onlyAllowWhiteSpace);
}

// . scan backwards from node[start] while the index is above both "end" and 0,
//   looking for expectedChar (or expectedOtherChar) and skipping whitespace
// . with onlyAllowWhiteSpace set, the first non-whitespace character that is
//   not a match ends the search unsuccessfully
// . *pos receives the index where the scan stopped
static bool findCharReverse( const char *node, int32_t start, int32_t end, int32_t *pos, char expectedChar,
                             char expectedOtherChar, char *foundChar, bool onlyAllowWhiteSpace ) {
	int32_t idx = start;

	while ( idx > end && idx > 0 ) {
		const char current = node[idx];

		if ( !is_wspace_a( current ) ) {
			if ( findCharSingle( current, expectedChar, expectedOtherChar, foundChar ) ) {
				*pos = idx;
				return true;
			}

			// a non-matching, non-whitespace character
			if ( onlyAllowWhiteSpace ) {
				*pos = idx;
				return false;
			}
		}

		--idx;
	}

	*pos = idx;
	return false;
}


// convenience overload: a single expected character, only whitespace may be
// skipped, found character not reported
static bool findCharReverse( const char *node, int32_t start, int32_t end, int32_t *pos, char expectedChar) {
	char * const noFoundChar = NULL;
	return findCharReverse(node, start, end, pos, expectedChar, '\0', noFoundChar, true);
}

// . find the next single or double quote, skipping over any other characters
// . searches backwards when start > end, forwards otherwise
// . the quote character that was found is stored in *foundChar
static bool findQuoteChar( const char *node, int32_t start, int32_t end, int32_t *pos, char *foundChar) {
	const bool searchBackwards = ( start > end );

	return searchBackwards
	       ? findCharReverse(node, start, end, pos, '\'', '"', foundChar, false)
	       : findChar(node, start, end, pos, '\'', '"', foundChar, false);
}

// . find an '=' with nothing but whitespace before it in the scanned range
// . searches backwards when start > end, forwards otherwise
static bool findEqualChar( const char *node, int32_t start, int32_t end, int32_t *pos ) {
	const bool searchBackwards = ( start > end );

	return searchBackwards
	       ? findCharReverse(node, start, end, pos, '=')
	       : findChar(node, start, end, pos, '=', true);
}

// . may "nodeChar" appear in an attribute name?
// . rejected: whitespace, characters is_binary_a() flags, quotes, < > / and =
static bool isValidAttrNameChar(char nodeChar) {
	switch ( nodeChar ) {
		case '"':
		case '\'':
		case '<':
		case '>':
		case '/':
		case '=':
			return false;
		default:
			break;
	}

	return !( is_wspace_a( nodeChar ) || is_binary_a( nodeChar ) );
}

/**
 * Get attribute value
 *
 * The difference between XmlNode::getAttrValue and XmlNode::getFieldValue is that we try to recover
 * from bad attribute value
 *
 * Only quoted values are handled. The returned pointer points into this node's
 * own text and is not \0 terminated at the end of the value.
 *
 * @param[in] field Attribute name (only ascii supported), compared case-insensitively
 * @param[in] fieldLen Length of field in bytes
 * @param[out] valueLen Attribute value length (may be NULL); set to 0 when nothing is found
 *
 * @return Attribute value, or NULL if field is NULL/empty or no matching quoted attribute is found
 */
char *XmlNode::getAttrValue( const char *field, int32_t fieldLen, int32_t *valueLen ) {
	if (valueLen) {
		*valueLen = 0;
	}

	// an empty name can never match, and field[fieldLen - 1] below would
	// otherwise read before the start of the buffer
	if ( !field || fieldLen <= 0 ) {
		return NULL;
	}

	/*
	 * https://www.w3.org/TR/html-markup/syntax.html#syntax-attributes
	 *
	 * attribute names:
	 *   must consist of one or more characters other than the space characters,
	 *   U+0000 NULL, """, "'", ">", "/", "=", the control characters, and any characters that are not defined by Unicode.
	 *
	 * attribute values:
	 *   can contain text and character references, with additional restrictions depending on whether they are
	 *   unquoted attribute values, single-quoted attribute values, or double-quoted attribute values.
	 *   Also, the HTML elements section of this reference describes further restrictions on the allowed values of
	 *   particular attributes, and attributes must have values that conform to those restrictions.
	 *
	 *   attributes can be specified in four different ways:
	 *     - empty attribute syntax (not handled)
	 *     - unquoted attribute-value syntax (not handled)
	 *     - single-quoted attribute-value syntax
	 *     - double-quoted attribute-value syntax
	 */

	bool found = false;

	// where the next search for an opening quote begins
	int32_t startPos = 0;
	// end of the previously examined attribute; name scans never go back past it
	int32_t prevEndPos = 0;

	int32_t startQuotePos = 0;
	int32_t endQuotePos = 0;

	while ( startPos < m_nodeLen ) {
		char foundQuoteChar = '\0';

		// look for start quote (forward)
		found = findQuoteChar(m_node, startPos, m_nodeLen, &startQuotePos, &foundQuoteChar);
		if ( !found ) {
			// we should have at least one quote char to be able to have any value
			return NULL;
		}

		int32_t equalsPos = 0;

		// look for equals (reverse)
		found = findEqualChar( m_node, startQuotePos - 1, startPos, &equalsPos);
		if ( !found ) {
			// unable to find equals, assume dangling quote
			startPos = startQuotePos + 1;
			continue;
		}

		// look for end quote (forward); it must be the same kind of quote
		found = findChar(m_node, startQuotePos + 1, m_nodeLen, &endQuotePos, foundQuoteChar, false);
		if ( !found ) {
			// no end quote
			return NULL;
		}

		while (endQuotePos < m_nodeLen) {
			// validate the end quote: if another quote follows and it is
			// not introduced by '=', treat that one as the real end quote

			// look for another quote
			int32_t nextQuotePos = 0;
			char nextFoundQuoteChar = '\0';

			found = findQuoteChar(m_node, endQuotePos + 1, m_nodeLen, &nextQuotePos, &nextFoundQuoteChar);
			if ( found ) {
				// let's see if it's preceded by equals
				int32_t nextEqualsPos = 0;

				found = findEqualChar(m_node, nextQuotePos - 1, endQuotePos, &nextEqualsPos);
				if ( !found ) {
					// no preceding equals sign (assume invalid meta tag, try to recover from it)
					endQuotePos = nextQuotePos;
					continue;
				}
			}

			// assume found valid end quote
			break;
		}

		int32_t endAttrNamePos = equalsPos;

		// look for the last character of the attr name (reverse from the equals)
		found = findCharReverse(m_node, equalsPos - 1, prevEndPos, &endAttrNamePos, field[fieldLen - 1]);
		if ( !found ) {
			// not our field
			prevEndPos = endQuotePos;
			startPos = endQuotePos + 1;
			continue;
		}

		int32_t startAttrNamePos = endAttrNamePos;

		// walk back over the attr name; stops ON the character just before its first character
		while ( isValidAttrNameChar( m_node[startAttrNamePos] ) && startAttrNamePos > prevEndPos ) {
			--startAttrNamePos;
		}

		// the name in the node must have exactly the requested length and spelling
		if ( endAttrNamePos - startAttrNamePos != fieldLen ||
			 strncasecmp( &m_node[startAttrNamePos + 1], field, fieldLen ) != 0 ) {
			// no match
			found = false;
			prevEndPos = endQuotePos;
			startPos = endQuotePos + 1;
			continue;
		}

		// found match!
		found = true;
		break;
	}

	if (!found) {
		return NULL;
	}

	// set the length of the value (the text between the two quotes)
	if( valueLen ) {
		*valueLen = endQuotePos - startQuotePos - 1;
	}

	// return a ptr to the value
	return m_node + startQuotePos + 1;
}

// . Return the value of the specified "field" within this node.
// . the case of "field" does not matter.
// . the value may be single-quoted, double-quoted or unquoted
// . *valueLen (if valueLen is not NULL) receives the value's length, 0 on failure
// . returns a pointer into this node's text (not \0 terminated at the end of
//   the value), or NULL if the field is absent or is not followed by '='
char *XmlNode::getFieldValue ( const char *field , int32_t *valueLen ) {
	// reset this to 0
	if ( valueLen ) {
		*valueLen = 0;
	}

	// nothing to look for
	if ( !field ) {
		return NULL;
	}

	// scan for the field name in our node
	int32_t flen = strlen(field);
	char inQuotes = '\0';
	int32_t i;

	// scan the characters in the node, looking for the field name in ascii
	for ( i = 1; i + flen < m_nodeLen ; i++ ) {
		// skip the field if it's quoted
		if ( inQuotes) {
			if (m_node[i] == inQuotes ) {
				inQuotes = 0;
			}
			continue;
		}

		// set inQuotes to the quote if we're in quotes
		if ( (m_node[i]=='\"' || m_node[i]=='\'')) {
			inQuotes = m_node[i];
			continue;
		}

		// a field name must be preceded by non-alnum
		if ( is_alnum_a ( m_node[i-1] ) ) {
			continue;
		}

		// the first character of this field should match field[0]
		if ( to_lower_a( m_node[i] ) != to_lower_a( field[0] ) ) {
			continue;
		}

		// field must be immediately followed by an = or space
		if ( m_node[i + flen] != '=' && !is_wspace_a( m_node[i + flen] ) ) {
			continue;
		}

		// field names must match
		if ( strncasecmp ( &m_node[i], field, flen ) != 0 ) {
			continue;
		}

		// break cuz we got a match for our field name
		break;
	}


	// return NULL if no matching field
	if ( i + flen >= m_nodeLen ) {
		return NULL;
	}

	// advance i over the fieldname so it pts to = or space
	i += flen;

	// advance i over spaces
	while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) {
		i++;
	}

	// . the next character must be the equal sign, return NULL if it is not
	// . running off the end of the node also counts as "no equal sign";
	//   without this check we would go on to read m_node[m_nodeLen] and
	//   hand back a pointer past the node
	if ( i >= m_nodeLen || m_node[i] != '=' ) {
		return NULL;
	}

	// advance over the equal sign
	i++;

	// advance i over spaces after the equal sign
	while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) {
		i++;
	}

	// now parse out the value of this field (could be in quotes)
	inQuotes = '\0';

	// remember the opening quote, if any, and step over it
	if ( i < m_nodeLen && ( m_node[i] == '\"' || m_node[i] == '\'' ) ) {
		inQuotes = m_node[i++];
	}

	// mark this as the start of the value
	int start = i;

	// advance i until we hit a space or >, or the matching quote if inQuotes
	if ( inQuotes ) {
		while ( i < m_nodeLen && m_node[i] != inQuotes ) {
			++i;
		}
	} else {
		while ( i < m_nodeLen && !is_wspace_a( m_node[i] ) && m_node[i] != '>' ) {
			++i;
		}
	}

	// set the length of the value
	if ( valueLen ) {
		*valueLen = i - start;
	}

	// return a ptr to the value
	return m_node + start;
}

#include "HashTableX.h"

// . look up the node id for the tag name at the start of "s"
// . the name ends at the first character that is not an alnum, '-' or '_';
//   it is hashed as upper case, so the lookup ignores case
// . if retp is not NULL, *retp is set to the matching g_nodes entry, or to
//   NULL when the name is unknown
// . returns the node id, or 0 when the name is unknown
nodeid_t getTagId ( const char *s , NodeType **retp ) {
	// init table?
	static bool s_init = false;
	static HashTableX  s_ht;
	static char s_buf[10000];

	if ( ! s_init ) {
		// set before filling so the self-test call below does not recurse
		// back into this initialisation
		s_init = true;
		s_ht.set ( 4 ,4,1024,s_buf,10000,false,"tagids");

		// how many NodeTypes do we have in g_nodes?
		static const int32_t nn = sizeof(g_nodes) / sizeof(NodeType);

		// set the hash table
		for ( int32_t i = 0 ; i < nn ; i++ ) {
			const char *name = g_nodes[i].m_nodeName;
			int32_t  nlen = strlen(name);
			int64_t h = hash64Upper_a ( name,nlen,0LL );

			// . store the 32-bit index into g_nodes rather than a pointer
			// . NOTE(review): the table is created with set(4, 4, ...),
			//   presumably 4-byte keys and 4-byte values -- confirm against
			//   HashTableX. Under that reading a pointer does not fit in a
			//   value slot on 64-bit builds, while an int32_t index does.
			int32_t idx = i;
			if ( ! s_ht.addKey(&h,&idx) ) {
				gbshutdownLogicError();
			}
		}

		// sanity: the table must not have been resized away from s_buf's sizing
		if ( s_ht.getNumSlots() != 1024 ) gbshutdownLogicError();

		// sanity test
		nodeid_t tt = getTagId ( "br" );
		if ( tt != TAG_BR ) {
			gbshutdownLogicError();
		}
	}

	// find end of tag name. hyphens are ok to be in name.
	// facebook uses underscores like <start_time>
	const char *e = s;
	for ( ; *e && (is_alnum_a(*e) || *e=='-'|| *e=='_'); e++);

	// hash it for lookup
	int64_t h = hash64Upper_a ( s , e - s , 0 );

	// look it up; the stored value is an index into g_nodes
	const int32_t *idxp = (const int32_t *)s_ht.getValue(&h);

	// assume none
	if ( retp ) {
		*retp = NULL;
	}

	// none?
	if ( ! idxp ) {
		return 0;
	}

	const NodeType *nt = &g_nodes[*idxp];

	// got one; the out-parameter type is non-const, as before
	if ( retp ) {
		*retp = const_cast<NodeType *>(nt);
	}

	// get id otherwise
	return nt->m_nodeId;
}

// . look up "nodeHash" (hash of the upper-cased tag name) among the g_nodes
//   names and set m_hasBackTag, m_isBreaking and m_isVisible accordingly
// . returns the node id of the matching g_nodes entry
// . returns 1 (xml node) when the hash is unknown; such nodes default to
//   breaking, having a back tag and being visible
nodeid_t XmlNode::setNodeInfo ( int64_t  nodeHash ){
	// . open-addressed hash table over the names in "g_nodes"
	// . s_hash holds the name hashes (0 marks an empty bucket) and s_num the
	//   index of the corresponding g_nodes entry
	// . 512 buckets, probed linearly with wrap-around
	static bool      s_isHashed = false;
	static int64_t s_hash [512];
	static nodeid_t  s_num  [512];

	// how many NodeTypes do we have in g_nodes?
	static const int32_t s_numNodeTypes = sizeof( g_nodes ) / sizeof( NodeType );

	// the table is static so it only needs to be filled on the first call
	if ( !s_isHashed ) {
		s_isHashed = true;

		// every bucket starts out empty
		memset ( s_hash , 0 , sizeof( s_hash ) );

		for ( int32_t n = 0 ; n < s_numNodeTypes ; n++ ) {
			const char *name = g_nodes[n].m_nodeName;
			const int64_t h = hash64Upper_a( name, strlen( name ), 0LL );

			// first free bucket at or after the home bucket
			int32_t slot = (uint64_t)h & 511;
			while ( s_hash[slot] ) {
				slot = ( slot + 1 ) & 511;
			}

			s_hash [ slot ] = h;
			s_num  [ slot ] = n;
		}
	}

	// probe until we hit the hash we want or an empty bucket
	int32_t slot = (uint64_t)nodeHash & 511;
	while ( s_hash[slot] && s_hash[slot] != nodeHash ) {
		slot = ( slot + 1 ) & 511;
	}

	// an empty bucket means an xml node (or unrecognized html node)
	if ( ! s_hash[slot] ) {
		// default is breaking, has back tag and is indexable
		m_isBreaking = true;
		m_hasBackTag = true;
		m_isVisible  = true;
		return 1;
	}

	// otherwise copy the flags from the matching g_nodes entry
	const int32_t n = s_num[slot];
	const NodeType &entry = g_nodes [ n ];

	m_hasBackTag = entry.m_hasBackTag;
	m_isBreaking = entry.m_isBreaking;
	m_isVisible  = entry.m_isVisible;

	// return the tag/node Id
	return entry.m_nodeId;
}

int32_t getNumXmlNodes ( ) {
	return (int32_t)sizeof(g_nodes) / sizeof(XmlNode);
}


// . is the tag with this id a breaking tag according to g_nodes?
// . tagId is masked with BACKBITCOMP before indexing the table
//   (presumably this clears a back-tag flag bit -- confirm against its definition)
bool isBreakingTagId ( nodeid_t tagId ) {
	const NodeType &entry = g_nodes [ tagId & BACKBITCOMP ];
	return entry.m_isBreaking;
}

// . does the tag with this id have a back tag according to g_nodes?
// . tagId is masked with BACKBITCOMP before indexing the table
//   (presumably this clears a back-tag flag bit -- confirm against its definition)
bool hasBackTag ( nodeid_t tagId ) {
	const NodeType &entry = g_nodes [ tagId & BACKBITCOMP ];
	return entry.m_hasBackTag;
}