Files
privacore-open-source-searc…/Xml.h

203 lines
6.4 KiB
C
Raw Normal View History

2016-03-08 22:14:30 +01:00
#ifndef GB_XML_H
#define GB_XML_H
2013-08-02 13:12:24 -07:00
// . this is used for parsing tagdb records
// . used for parsing tagdb records, conf file, and html/xml documents
// . NOTE: ALL tags are case INsensitive so <myTag> equals <MYTAG>
#include "XmlNode.h"
#include "Lang.h"
2016-06-28 11:01:47 +02:00
#include "Sanity.h"
2016-06-20 19:29:10 +02:00
2013-08-02 13:12:24 -07:00
class Xml {
2016-01-14 13:52:33 +01:00
public:
Xml();
2013-08-02 13:12:24 -07:00
// . should free m_xml if m_copy is true
2016-01-14 13:52:33 +01:00
~Xml();
2013-08-02 13:12:24 -07:00
// do we have any xml in here?
2016-12-01 11:38:23 +01:00
bool isEmpty() const {
return ( m_xml == NULL );
}
2013-08-02 13:12:24 -07:00
// . set this xml class from a string
// . should be called before calling anything else
// . if "copy" makes a copy of the string "s" and references into that
// . s must be NULL terminated
// . if it's pure xml then set pureXml to true otherwise we assume it
// is html or xhtml
2016-09-23 00:54:04 +02:00
bool set( char *s, int32_t slen, int32_t version, char contentType );
void reset ( );
2016-12-01 11:38:23 +01:00
int32_t getVersion() const {
return m_version;
}
char *getContent() {
return m_xml;
}
2013-08-02 13:12:24 -07:00
char *getContentEnd() {
return m_xml + m_xmlLen;
}
2013-08-02 13:12:24 -07:00
int32_t getContentLen() {
return m_xmlLen;
}
2016-12-01 11:38:23 +01:00
int32_t getNumNodes() const {
return m_numNodes;
}
2013-08-02 13:12:24 -07:00
// . tagName is compound for xml tags, simple for html tags
// . xml compound tag name example = "myhouse.bedroom.nightstand"
// . html simple tag name example = "title" or "table"
// . obsolete compound name = myhouse[0].bedroom[2].nightstand[1]
// . returns -1 if not found
// . only searches nodes in [n0,n1] node range
2016-04-17 23:25:19 +02:00
int32_t getNodeNum( int32_t n0, int32_t n1, const char *tagName, int32_t tagNameLen ) const;
2013-08-02 13:12:24 -07:00
// . get the back tag node for a given node
2016-12-01 11:38:23 +01:00
int32_t getEndNode(int32_t num) const;
2013-08-02 13:12:24 -07:00
2016-12-01 11:38:23 +01:00
bool isTag( int32_t n ) const {
return m_nodes[n].isTag();
}
2013-08-02 13:12:24 -07:00
2016-12-01 11:38:23 +01:00
bool isBackTag( int32_t n ) const {
return m_nodes[n].m_node[1] == '/';
}
2013-08-02 13:12:24 -07:00
char *getNode( int32_t n ) {
return m_nodes[n].m_node;
}
2016-05-29 18:18:26 +02:00
const char *getNode( int32_t n ) const {
return m_nodes[n].m_node;
}
2016-12-01 11:38:23 +01:00
int32_t getNodeLen( int32_t n ) const {
return m_nodes[n].m_nodeLen;
}
2016-12-01 11:38:23 +01:00
nodeid_t getNodeId( int32_t n ) const {
return m_nodes[n].m_nodeId;
}
// get all nodes!
XmlNode *getNodes() {
return m_nodes;
}
2016-01-15 11:11:38 +01:00
XmlNode *getNodePtr( int32_t n ) {
return &m_nodes[n];
}
// get like compound name like "node1.node2.node3\0"
bool getCompoundName( int32_t node, class SafeBuf *sb );
2013-08-02 13:12:24 -07:00
// . used for parsing xml conf files
// . used for getting the title in an html doc, etc.
// . gets the value of the text field immediately following the tag
// . "tagName" is always compound
// . only searches nodes in [n0,n1] node range
2016-01-11 23:12:54 +01:00
2016-05-22 01:02:30 +02:00
int32_t getLong( int32_t n0, int32_t n1, const char *tagName, int32_t defaultLong = 0 );
2016-05-22 01:02:30 +02:00
char *getString( int32_t n0, int32_t n1, const char *tagName, int32_t *len,
bool skipLeadingSpaces = true ) const;
2016-01-11 23:12:54 +01:00
2013-08-02 13:12:24 -07:00
// like above routines but we search all nodes
2016-05-29 18:18:26 +02:00
int32_t getLong( const char *tagName, int32_t defaultLong = 0 ) {
return getLong( 0, m_numNodes, tagName, defaultLong );
}
2016-01-11 23:12:54 +01:00
2016-05-29 18:18:26 +02:00
char *getString( const char *tagName, int32_t *len, bool skipLeadingSpaces = true ) {
return getString( 0, m_numNodes, tagName, len, skipLeadingSpaces );
}
const char *getString( const char *tagName, int32_t *len, bool skipLeadingSpaces = true ) const {
return getString( 0, m_numNodes, tagName, len, skipLeadingSpaces );
}
2013-08-02 13:12:24 -07:00
// . used for getting links in the <a href=...> tag
// . used for getting data from meta tags
char *getString( int32_t node, const char *field, int32_t *valueLen ) {
if ( node >= m_numNodes ) {
2016-06-20 19:29:10 +02:00
gbshutdownAbort(true);
}
return m_nodes[node].getFieldValue( field, valueLen );
}
2016-12-01 12:15:32 +01:00
const char *getString(int32_t node, const char *field, int32_t *valueLen) const {
return const_cast<Xml*>(this)->getString(node,field,valueLen);
}
2013-08-02 13:12:24 -07:00
// called by getTextForXmlTag() below
char *getString( int32_t node, bool skipLeadingSpaces, int32_t *len ) const;
2013-08-02 13:12:24 -07:00
// used for title/summary generation
bool getTagContent( const char *fieldName, const char *fieldContent, char *buf, int32_t bufLen,
int32_t minLength, int32_t maxLength, int32_t *contentLenPtr,
bool ignoreExpandedIframe = false, nodeid_t expectedNodeId = LAST_TAG );
bool getTagValue(const char *fieldName, const char *fieldContent, const char *fieldValueName,
const char **valuePtr, int32_t *valueLenPtr, bool ignoreExpandedIframe,
nodeid_t expectedNodeId, int32_t *startNode = nullptr);
2013-08-02 13:12:24 -07:00
// . like getText() below but gets the content from a meta tag
// . stores it in "buf" and NULL terminates it
// . returns the length
// . field can be stuff like "summary","description","keywords",...
// . use "http-equiv" for "name" for meta redirect tags
// . if "convertHtmlEntites" is true we change < to &lt; and > to &gt;
2016-05-29 18:18:26 +02:00
int32_t getMetaContent( char *buf, int32_t bufLen, const char *field, int32_t fieldLen, const char *name = "name",
2016-01-15 11:20:11 +01:00
int32_t startNode = 0, int32_t *matchedNode = NULL );
2013-08-02 13:12:24 -07:00
// just get a pointer to it
2016-05-22 01:02:30 +02:00
char *getMetaContentPointer( const char *field, int32_t fieldLen, const char *name = "name", int32_t *len = NULL );
2013-08-02 13:12:24 -07:00
// . filters out tags (uses html entities) and stores in "buf"
// . replaces "line breaking" html tags with 2 returns
// . only get text of nodes in [node1,node2]
// . returns # chars written to buf
// . buf is NULL terminated
// . bufMaxSize is usually at least getContentLen() + 1 (m_xmlLen+1)
// . maxDepth is RELATIVE to node # nodeNumber's depth
// . if "filter" then convert html entities and \r's to spaces
// . get kid text of node #"nodeNumber" unless it's -1
// . if "filterSpaces" then don't allow back to back spaces or \n's
// and replace tags with ".." not \n (but no back to back ..'s)
int32_t getText( char *buf, int32_t bufMaxSize, int32_t node1 = 0, int32_t node2 = -1,
bool filterSpaces = false );
2013-08-02 13:12:24 -07:00
int32_t isRSSFeed();
2016-01-14 13:52:33 +01:00
2016-05-29 18:18:26 +02:00
char *getRSSTitle( int32_t *titleLen, bool *isHtmlEncoded );
const char *getRSSTitle( int32_t *titleLen, bool *isHtmlEncoded ) const;
char *getRSSDescription( int32_t *titleLen, bool *isHtmlEncoded );
2013-08-02 13:12:24 -07:00
// . used by getValueAsBool/Long/String()
// . tagName is compound for xml tags, simple for html tags
2016-05-22 01:02:30 +02:00
char *getTextForXmlTag( int32_t n0, int32_t n1, const char *tagName, int32_t *len,
bool skipLeadingSpaces ) const;
2013-08-02 13:12:24 -07:00
2016-01-14 13:52:33 +01:00
private:
// used because "s" may have words separated by periods
2016-04-17 23:25:19 +02:00
int64_t getCompoundHash( const char *s, int32_t len ) const;
2016-01-15 11:11:38 +01:00
XmlNode *m_nodes;
int32_t m_numNodes;
int32_t m_maxNumNodes;
// If this is a unicode buffer, then m_xml is encoded in UTF-16
// m_xmlLen is still the size of the buffer IN BYTES
char *m_xml;
int32_t m_xmlLen;
2013-08-02 13:12:24 -07:00
int32_t m_version;
2013-08-02 13:12:24 -07:00
};
2016-03-08 22:14:30 +01:00
#endif // GB_XML_H