privacore-open-source-searc…/Xml.h

#ifndef GB_XML_H
#define GB_XML_H

// . this is used for parsing tagdb records
// . also used for parsing conf files and html/xml documents
// . NOTE: ALL tags are case INsensitive so <myTag> equals <MYTAG>

#include "XmlNode.h"
#include "Lang.h"
#include "Sanity.h"


// . holds a content buffer (m_xml / m_xmlLen) together with the array of
//   XmlNode entries (m_nodes / m_numNodes) describing it
// . call set() first; the accessors below index straight into m_nodes and,
//   unless stated otherwise, do NOT range-check the node number
// . NOTE(review): the class has a user-declared destructor but no
//   copy-control declarations are visible here; confirm instances are never
//   copied, or what the implicit copy would share
class Xml {
public:
	Xml();

	// . NOTE(review): an older comment here said "should free m_xml if m_copy
	//   is true", but this class has no m_copy member; check Xml.cpp for what
	//   the destructor actually releases
	~Xml();

	// do we have any xml in here? (true while m_xml is null)
	bool isEmpty() const {
		return ( m_xml == nullptr );
	}

	// . set this xml class from a string
	// . should be called before calling anything else
	// . s must be NULL terminated
	// . slen is the length of "s"
	// . "version" is what getVersion() reports afterwards (presumably stored
	//   as given; confirm in Xml.cpp)
	// . "contentType" selects how the content is interpreted; an older
	//   comment spoke of a "pureXml" flag and a "copy" flag, neither of which
	//   is a parameter any more
	bool set( char *s, int32_t slen, int32_t version, char contentType );

	void  reset ( );

	int32_t getVersion() const {
		return m_version;
	}

	// start of the content buffer
	char *getContent() {
		return m_xml;
	}
	const char *getContent() const {
		return m_xml;
	}

	// one past the last content byte (m_xml + m_xmlLen)
	char *getContentEnd() {
		return m_xml + m_xmlLen;
	}
	const char *getContentEnd() const {
		return m_xml + m_xmlLen;
	}

	// content length in bytes
	int32_t getContentLen() const {
		return m_xmlLen;
	}

	int32_t getNumNodes() const {
		return m_numNodes;
	}

	// . tagName is compound for xml tags, simple for html tags
	// . xml compound tag name example = "myhouse.bedroom.nightstand"
	// . html simple  tag name example = "title" or "table"
	// . obsolete compound name = myhouse[0].bedroom[2].nightstand[1]
	// . returns -1 if not found
	// . only searches nodes in [n0,n1] node range
	int32_t getNodeNum( int32_t n0, int32_t n1, const char *tagName, int32_t tagNameLen ) const;

	// . get the back tag node for a given node
	int32_t getEndNode(int32_t num) const;

	// . per-node accessors; "n" is not range-checked, callers must pass
	//   a valid node number
	bool isTag( int32_t n ) const {
		return m_nodes[n].isTag();
	}

	// . true if the second character of node n's text is '/'
	// . does not itself check that node n is a tag
	bool isBackTag( int32_t n ) const {
		return m_nodes[n].m_node[1] == '/';
	}

	// pointer to the text of node n
	char *getNode( int32_t n ) {
		return m_nodes[n].m_node;
	}
	const char *getNode( int32_t n ) const {
		return m_nodes[n].m_node;
	}

	// length of the text of node n
	int32_t getNodeLen( int32_t n ) const {
		return m_nodes[n].m_nodeLen;
	}

	nodeid_t getNodeId( int32_t n ) const {
		return m_nodes[n].m_nodeId;
	}

	// get all nodes!
	XmlNode *getNodes() {
		return m_nodes;
	}
	const XmlNode *getNodes() const {
		return m_nodes;
	}

	// address of node n (not range-checked)
	XmlNode *getNodePtr( int32_t n ) {
		return &m_nodes[n];
	}
	const XmlNode *getNodePtr( int32_t n ) const {
		return &m_nodes[n];
	}

	// get like compound name like "node1.node2.node3\0"
	bool getCompoundName( int32_t node, class SafeBuf *sb );

	// . used for parsing xml conf files
	// . used for getting the title in an html doc, etc.
	// . gets the value of the text field immediately following the tag
	// . "tagName" is always compound
	// . only searches nodes in [n0,n1] node range

	int32_t getLong( int32_t n0, int32_t n1, const char *tagName, int32_t defaultLong = 0 );

	char *getString( int32_t n0, int32_t n1, const char *tagName, int32_t *len,
					 bool skipLeadingSpaces = true ) const;

	// like above routines but we search all nodes
	int32_t getLong( const char *tagName, int32_t defaultLong = 0 ) {
		return getLong( 0, m_numNodes, tagName, defaultLong );
	}

	char *getString( const char *tagName, int32_t *len, bool skipLeadingSpaces = true ) {
		return getString( 0, m_numNodes, tagName, len, skipLeadingSpaces );
	}
	const char *getString( const char *tagName, int32_t *len, bool skipLeadingSpaces = true ) const {
		return getString( 0, m_numNodes, tagName, len, skipLeadingSpaces );
	}

	// . used for getting links in the <a href=...> tag
	// . used for getting data from meta tags
	// . returns whatever XmlNode::getFieldValue() yields for "field" on
	//   node #"node"
	// . aborts the process (gbshutdownAbort) if "node" is outside
	//   [0, m_numNodes)
	char *getString( int32_t node, const char *field, int32_t *valueLen ) {
		// a negative index is just as out of bounds as one past the end
		if ( node < 0 || node >= m_numNodes ) {
			gbshutdownAbort(true);
		}

		return m_nodes[node].getFieldValue( field, valueLen );
	}
	// const flavour; forwards to the overload above so the range check
	// lives in exactly one place
	const char *getString(int32_t node, const char *field, int32_t *valueLen) const {
		return const_cast<Xml*>(this)->getString(node,field,valueLen);
	}

	// called by getTextForXmlTag() below
	char *getString( int32_t node, bool skipLeadingSpaces, int32_t *len ) const;

	// used for title/summary generation
	bool getTagContent( const char *fieldName, const char *fieldContent, char *buf, int32_t bufLen,
						int32_t minLength, int32_t maxLength, int32_t *contentLenPtr,
						bool ignoreExpandedIframe = false, nodeid_t expectedNodeId = LAST_TAG );

	bool getTagValue(const char *fieldName, const char *fieldContent, const char *fieldValueName,
	                 const char **valuePtr, int32_t *valueLenPtr, bool ignoreExpandedIframe,
	                 nodeid_t expectedNodeId, int32_t *startNode = nullptr);

	// . like getText() below but gets the content from a meta tag
	// . stores it in "buf"  and NULL terminates it
	// . returns the length
	// . field can be stuff like "summary","description","keywords",...
	// . use "http-equiv" for "name" for meta redirect tags
	// . NOTE(review): an older comment mentioned a "convertHtmlEntites"
	//   flag; it is not part of this signature any more
	int32_t getMetaContent( char *buf, int32_t bufLen, const char *field, int32_t fieldLen, const char *name = "name",
							int32_t startNode = 0, int32_t *matchedNode = nullptr );

	// just get a pointer to it
	char *getMetaContentPointer( const char *field, int32_t fieldLen, const char *name = "name", int32_t *len = nullptr );

	// . filters out tags (uses html entities) and stores in "buf"
	// . replaces "line breaking" html tags with 2 returns
	// . only get text of nodes in [node1,node2]
	// . node2 defaults to -1 (presumably "through the last node"; confirm
	//   in Xml.cpp)
	// . returns # chars written to buf
	// . buf is NULL terminated
	// . bufMaxSize is usually at least getContentLen() + 1 (m_xmlLen+1)
	// . if "filterSpaces" then don't allow back to back spaces or \n's
	//   and replace tags with ".." not \n (but no back to back ..'s)
	// . NOTE(review): older comments also described "maxDepth", "filter" and
	//   "nodeNumber"; none of those are parameters of this signature
	int32_t getText( char *buf, int32_t bufMaxSize, int32_t node1 = 0, int32_t node2 = -1,
					 bool filterSpaces = false );

	int32_t isRSSFeed();

	char *getRSSTitle( int32_t *titleLen, bool *isHtmlEncoded );
	const char *getRSSTitle( int32_t *titleLen, bool *isHtmlEncoded ) const;
	char *getRSSDescription( int32_t *titleLen, bool *isHtmlEncoded );

	// . used by getValueAsBool/Long/String()
	// . tagName is compound for xml tags, simple for html tags
	char *getTextForXmlTag( int32_t n0, int32_t n1, const char *tagName, int32_t *len,
							bool skipLeadingSpaces ) const;

private:
	// used because "s" may have words separated by periods
	int64_t getCompoundHash( const char *s, int32_t len ) const;

	// node array, number of nodes in use, and (judging by the name) the
	// allocated capacity
	XmlNode *m_nodes;
	int32_t m_numNodes;
	int32_t m_maxNumNodes;

	// If this is a unicode buffer, then m_xml is encoded in UTF-16
	// m_xmlLen is still the size of the buffer IN BYTES
	char *m_xml;
	int32_t m_xmlLen;

	// version handed to set(); reported by getVersion()
	int32_t m_version;
};

#endif // GB_XML_H
Standardize header guards 2016-03-08 22:14:30 +01:00			`#ifndef GB_XML_H`
			`#define GB_XML_H`
Initial file population. 2013-08-02 13:12:24 -07:00
			`// . this is used for parsing tagdb records`
			`// . used for pasrsing tagdb records, conf file, and html/xml documents`
			`// . NOTE: ALL tags are case INsensitive so <myTag> equals <MYTAG>`

			`#include "XmlNode.h"`
			`#include "Lang.h"`
Fix unreachable-code warnings 2016-06-28 11:01:47 +02:00			`#include "Sanity.h"`
More emergency shutdown streamlining 2016-06-20 19:29:10 +02:00
Initial file population. 2013-08-02 13:12:24 -07:00
			`class Xml {`
Remove unused variable 2016-01-14 13:52:33 +01:00			`public:`
			`Xml();`
Initial file population. 2013-08-02 13:12:24 -07:00
			`// . should free m_xml if m_copy is true`
Remove unused variable 2016-01-14 13:52:33 +01:00			`~Xml();`
Initial file population. 2013-08-02 13:12:24 -07:00
			`// do we have any xml in here?`
More constness in Xml 2016-12-01 11:38:23 +01:00			`bool isEmpty() const {`
Skip getting meta tags from inside gbframe (expanded iframe) 2016-01-13 13:26:37 +01:00			`return ( m_xml == NULL );`
			`}`
Initial file population. 2013-08-02 13:12:24 -07:00
			`// . set this xml class from a string`
			`// . should be called before calling anything else`
			`// . if "copy" makes a copy of the string "s" and references into that`
			`// . s must be NULL terminated`
			`// . if it's pure xml then set pureXml to true otherwise we assume it`
			`// is html or xhtml`
Remove niceness and quickpoll from Xml 2016-09-23 00:54:04 +02:00			`bool set( char *s, int32_t slen, int32_t version, char contentType );`
Skip getting meta tags from inside gbframe (expanded iframe) 2016-01-13 13:26:37 +01:00
			`void reset ( );`
for json docs only give them a single xmlnode in the Xml.cpp class. hopefully will not get "malformed sections" error anymore. i think that was a result of the json having html tags in it and making unnested html structures which the sections class did not like. TODO: probably do this for CT_TEXT etc. as well. 2014-01-25 08:17:38 -08:00
More constness in Xml 2016-12-01 11:38:23 +01:00			`int32_t getVersion() const {`
Don't replace '>' & '<' to '\|' when converting from HTML entities 2016-01-29 19:18:22 +01:00			`return m_version;`
			`}`

Skip getting meta tags from inside gbframe (expanded iframe) 2016-01-13 13:26:37 +01:00			`char *getContent() {`
			`return m_xml;`
			`}`
Initial file population. 2013-08-02 13:12:24 -07:00
Skip getting meta tags from inside gbframe (expanded iframe) 2016-01-13 13:26:37 +01:00			`char *getContentEnd() {`
			`return m_xml + m_xmlLen;`
			`}`
Initial file population. 2013-08-02 13:12:24 -07:00
Skip getting meta tags from inside gbframe (expanded iframe) 2016-01-13 13:26:37 +01:00			`int32_t getContentLen() {`
			`return m_xmlLen;`
			`}`

More constness in Xml 2016-12-01 11:38:23 +01:00			`int32_t getNumNodes() const {`
Skip getting meta tags from inside gbframe (expanded iframe) 2016-01-13 13:26:37 +01:00			`return m_numNodes;`
			`}`

Initial file population. 2013-08-02 13:12:24 -07:00			`// . tagName is compound for xml tags, simple for html tags`
			`// . xml compound tag name example = "myhouse.bedroom.nightstand"`
			`// . html simple tag name example = "title" or "table"`
			`// . obsolete compound name = myhouse[0].bedroom[2].nightstand[1]`
			`// . returns -1 if not found`
			`// . only searches nodes in [n0,n1] node range`
Add constness to Parms::m_xml 2016-04-17 23:25:19 +02:00			`int32_t getNodeNum( int32_t n0, int32_t n1, const char *tagName, int32_t tagNameLen ) const;`
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// . get the back tag node for a given node`
More constness in Xml 2016-12-01 11:38:23 +01:00			`int32_t getEndNode(int32_t num) const;`
Initial file population. 2013-08-02 13:12:24 -07:00
More constness in Xml 2016-12-01 11:38:23 +01:00			`bool isTag( int32_t n ) const {`
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00			`return m_nodes[n].isTag();`
			`}`
Initial file population. 2013-08-02 13:12:24 -07:00
More constness in Xml 2016-12-01 11:38:23 +01:00			`bool isBackTag( int32_t n ) const {`
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00			`return m_nodes[n].m_node[1] == '/';`
			`}`
Initial file population. 2013-08-02 13:12:24 -07:00
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00			`char *getNode( int32_t n ) {`
			`return m_nodes[n].m_node;`
			`}`
More constness in Xml 2016-05-29 18:18:26 +02:00			`const char *getNode( int32_t n ) const {`
			`return m_nodes[n].m_node;`
			`}`
index xml docs properly like we do json 2014-09-28 09:20:16 -07:00
More constness in Xml 2016-12-01 11:38:23 +01:00			`int32_t getNodeLen( int32_t n ) const {`
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00			`return m_nodes[n].m_nodeLen;`
			`}`

More constness in Xml 2016-12-01 11:38:23 +01:00			`nodeid_t getNodeId( int32_t n ) const {`
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00			`return m_nodes[n].m_nodeId;`
			`}`

			`// get all nodes!`
			`XmlNode *getNodes() {`
			`return m_nodes;`
			`}`
Make some Xml member variable private 2016-01-15 11:11:38 +01:00
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00			`XmlNode *getNodePtr( int32_t n ) {`
			`return &m_nodes[n];`
			`}`

			`// get like compound name like "node1.node2.node3\0"`
			`bool getCompoundName( int32_t node, class SafeBuf *sb );`
Initial file population. 2013-08-02 13:12:24 -07:00
			`// . used for parsing xml conf files`
			`// . used for getting the title in an html doc, etc.`
			`// . gets the value of the text field immediately following the tag`
			`// . "tagName" is always compound`
			`// . only searches nodes in [n0,n1] node range`
Remove unused methods 2016-01-11 23:12:54 +01:00
constness in Xml.* 2016-05-22 01:02:30 +02:00			`int32_t getLong( int32_t n0, int32_t n1, const char *tagName, int32_t defaultLong = 0 );`
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00
constness in Xml.* 2016-05-22 01:02:30 +02:00			`char getString( int32_t n0, int32_t n1, const char tagName, int32_t *len,`
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00			`bool skipLeadingSpaces = true ) const;`
Remove unused methods 2016-01-11 23:12:54 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// like above routines but we search all nodes`
More constness in Xml 2016-05-29 18:18:26 +02:00			`int32_t getLong( const char *tagName, int32_t defaultLong = 0 ) {`
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00			`return getLong( 0, m_numNodes, tagName, defaultLong );`
			`}`
Remove unused methods 2016-01-11 23:12:54 +01:00
More constness in Xml 2016-05-29 18:18:26 +02:00			`char getString( const char tagName, int32_t *len, bool skipLeadingSpaces = true ) {`
			`return getString( 0, m_numNodes, tagName, len, skipLeadingSpaces );`
			`}`
			`const char getString( const char tagName, int32_t *len, bool skipLeadingSpaces = true ) const {`
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00			`return getString( 0, m_numNodes, tagName, len, skipLeadingSpaces );`
			`}`
Initial file population. 2013-08-02 13:12:24 -07:00
			`// . used for getting links in the <a href=...> tag`
			`// . used for getting data from meta tags`
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00			`char getString( int32_t node, const char field, int32_t *valueLen ) {`
			`if ( node >= m_numNodes ) {`
More emergency shutdown streamlining 2016-06-20 19:29:10 +02:00			`gbshutdownAbort(true);`
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00			`}`

			`return m_nodes[node].getFieldValue( field, valueLen );`
Try to get a nicer summary by using what the website set as description Use the following in priority order (highest first) - itemprop = "description" - meta name = "og:description" - meta name = "description" 2016-01-12 15:33:42 +01:00			`}`
const version of Xml::getString 2016-12-01 12:15:32 +01:00			`const char getString(int32_t node, const char field, int32_t *valueLen) const {`
			`return const_cast<Xml*>(this)->getString(node,field,valueLen);`
			`}`
Initial file population. 2013-08-02 13:12:24 -07:00
			`// called by getTextForXmlTag() below`
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00			`char getString( int32_t node, bool skipLeadingSpaces, int32_t len ) const;`
Initial file population. 2013-08-02 13:12:24 -07:00
Use meta tags (og:title & title) & title tag when available for generating title 2016-01-15 15:52:17 +01:00			`// used for title/summary generation`
			`bool getTagContent( const char fieldName, const char fieldContent, char *buf, int32_t bufLen,`
			`int32_t minLength, int32_t maxLength, int32_t *contentLenPtr,`
			`bool ignoreExpandedIframe = false, nodeid_t expectedNodeId = LAST_TAG );`

Various bug fixes on canonical url - canonical url with base url - canonical url that redirects 2017-09-12 16:37:28 +02:00			`bool getTagValue(const char fieldName, const char fieldContent, const char *fieldValueName,`
			`const char *valuePtr, int32_t valueLenPtr, bool ignoreExpandedIframe,`
Fix potential infiniteloop when multiple canonical link is found 2017-09-15 12:05:51 +02:00			`nodeid_t expectedNodeId, int32_t *startNode = nullptr);`
Various bug fixes on canonical url - canonical url with base url - canonical url that redirects 2017-09-12 16:37:28 +02:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// . like getText() below but gets the content from a meta tag`
			`// . stores it in "buf" and NULL terminates it`
			`// . returns the length`
			`// . field can be stuff like "summary","description","keywords",...`
			`// . use "http-equiv" for "name" for meta redirect tags`
			`// . if "convertHtmlEntites" is true we change < to < and > to >`
More constness in Xml 2016-05-29 18:18:26 +02:00			`int32_t getMetaContent( char buf, int32_t bufLen, const char field, int32_t fieldLen, const char *name = "name",`
Remove always false variable 2016-01-15 11:20:11 +01:00			`int32_t startNode = 0, int32_t *matchedNode = NULL );`
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00
Initial file population. 2013-08-02 13:12:24 -07:00			`// just get a pointer to it`
constness in Xml.* 2016-05-22 01:02:30 +02:00			`char getMetaContentPointer( const char field, int32_t fieldLen, const char name = "name", int32_t len = NULL );`
Initial file population. 2013-08-02 13:12:24 -07:00
			`// . filters out tags (uses html entities) and stores in "buf"`
			`// . replaces "line breaking" html tags with 2 returns`
			`// . only get text of nodes in [node1,node2]`
			`// . returns # chars written to buf`
			`// . buf is NULL terminated`
			`// . bufMaxSize is usually at least getContentLen() + 1 (m_xmlLen+1)`
			`// . maxDepth is RELATIVE to node # nodeNumber's depth`
			`// . if "filter" then convert html entities and \r's to spaces`
			`// . get kid text of node #"nodeNumber" unless it's -1`
			`// . if "filterSpaces" then don't allow back to back spaces or \n's`
			`// and replace tags with ".." not \n (but no back to back ..'s)`
Use -1 for max node instead of arbitary 999999 2017-06-22 15:01:33 +02:00			`int32_t getText( char *buf, int32_t bufMaxSize, int32_t node1 = 0, int32_t node2 = -1,`
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00			`bool filterSpaces = false );`
Initial file population. 2013-08-02 13:12:24 -07:00
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00			`int32_t isRSSFeed();`
Remove unused variable 2016-01-14 13:52:33 +01:00
More constness in Xml 2016-05-29 18:18:26 +02:00			`char getRSSTitle( int32_t titleLen, bool *isHtmlEncoded );`
			`const char getRSSTitle( int32_t titleLen, bool *isHtmlEncoded ) const;`
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00			`char getRSSDescription( int32_t titleLen, bool *isHtmlEncoded );`
Initial file population. 2013-08-02 13:12:24 -07:00
			`// . used by getValueAsBool/Long/String()`
			`// . tagName is compound for xml tags, simple for html tags`
constness in Xml.* 2016-05-22 01:02:30 +02:00			`char getTextForXmlTag( int32_t n0, int32_t n1, const char tagName, int32_t *len,`
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00			`bool skipLeadingSpaces ) const;`
Initial file population. 2013-08-02 13:12:24 -07:00
Remove unused variable 2016-01-14 13:52:33 +01:00			`private:`
			`// used because "s" may have words separated by periods`
Add constness to Parms::m_xml 2016-04-17 23:25:19 +02:00			`int64_t getCompoundHash( const char *s, int32_t len ) const;`
fix links parser so it harvests outlinks from rss feeds' <link> tags. it was doing this before, now it is doing it again. 2015-03-12 17:35:47 -07:00
Make some Xml member variable private 2016-01-15 11:11:38 +01:00			`XmlNode *m_nodes;`
			`int32_t m_numNodes;`
			`int32_t m_maxNumNodes;`

Remove write only Xml::m_version variable 2016-01-27 11:23:40 +01:00			`// If this is a unicode buffer, then m_xml is encoded in UTF-16`
			`// m_xmlLen is still the size of the buffer IN BYTES`
Fix calculation of end XML node. Ignore itemprop description if it's too long. Remove commented out codes. 2016-01-14 17:12:52 +01:00			`char *m_xml;`
			`int32_t m_xmlLen;`
Initial file population. 2013-08-02 13:12:24 -07:00
Don't replace '>' & '<' to '\|' when converting from HTML entities 2016-01-29 19:18:22 +01:00			`int32_t m_version;`
Initial file population. 2013-08-02 13:12:24 -07:00			`};`

Standardize header guards 2016-03-08 22:14:30 +01:00			`#endif // GB_XML_H`