privacore-open-source-searc.../Xml.cpp
2018-08-31 12:11:16 +02:00

1163 lines
32 KiB
C++

#include "Xml.h"
#include "Mem.h" // mfree(), mmalloc()
#include "Titledb.h"
#include "tokenizer.h"
#include "Pos.h"
#include "Sanity.h"
#include "Conf.h"
#include "Errno.h"
#include "fctypes.h"
#include "utf8_fast.h"
#include "hash.h"
#include "gbmemcpy.h"
Xml::Xml () {
m_xml = NULL;
m_xmlLen = 0;
m_nodes = NULL;
m_numNodes=0;
m_maxNumNodes = 0;
m_version = 0;
}
// . should free m_xml if m_copy is true
Xml::~Xml () {
reset();
}
// . for parsing xml conf files
int32_t Xml::getLong ( int32_t n0, int32_t n1, const char *tagName, int32_t defaultLong ) {
int32_t len;
char *s = getTextForXmlTag ( n0 , n1 , tagName , &len , false );
if ( s ) return atol2 ( s , len );
// return the default if no non-white-space text
return defaultLong;
}
char *Xml::getString ( int32_t n0, int32_t n1, const char *tagName, int32_t *len ,
bool skipLeadingSpaces ) const {
char *s = getTextForXmlTag ( n0, n1, tagName, len, skipLeadingSpaces );
if ( s ) return s;
// return the default if s is null
return NULL;
}
// . used by getValueAsBool/Long/String()
// . tagName is compound for xml tags, simple for html tags
// . NOTE: we skip over leading spaces
char *Xml::getTextForXmlTag ( int32_t n0, int32_t n1, const char *tagName, int32_t *len,
bool skipLeadingSpaces ) const {
// assume len is 0
*len = 0;
// get a matching xml TAG
int32_t num = getNodeNum ( n0 , n1 , tagName , strlen(tagName) );
if ( num < 0 ) return NULL;
return getString ( num , skipLeadingSpaces , len );
}
char *Xml::getString ( int32_t num , bool skipLeadingSpaces , int32_t *len ) const {
// get the text of this tag (if any)
if ( ++num >= m_numNodes ) { *len = 0; return NULL; }
if ( ! m_nodes[num].isText() ) { *len = 0; return NULL; }
// if we don't skip leading spaces return it as is
if ( ! skipLeadingSpaces ) {
*len = m_nodes[num].m_nodeLen;
return m_nodes[num].m_node;
}
// get the string
char *s = m_nodes[num].m_node;
// set the length and return the string
int32_t slen = m_nodes[num].m_nodeLen;
// skip leading spaces
while ( is_wspace_utf8 ( s ) && slen > 0 ) { s++; slen--; }
// set len
*len = slen;
// return NULL if slen is 0
if ( slen == 0 ) return NULL;
// otherwise return s
return s;
}
int32_t Xml::getEndNode ( int32_t num ) const {
if ( (num < 0) || (num >= m_numNodes) ) {
return -1;
}
XmlNode *node = &m_nodes[num];
// we can't use hasBackTag() because some tags has back tag but it's not mandatory
// so we check for void elements
if ( !node->isTag() || g_nodes[node->getNodeId()].m_tagType == TAG_TYPE_HTML_VOID ) {
return -1;
}
int innerTagCount = 1;
// scan for ending back tag
int32_t i;
for ( i = num + 1 ; i < m_numNodes ; ++i ) {
if ( m_nodes[i].m_hash == node->m_hash ) {
if ( m_nodes[i].isFrontTag() ) {
++innerTagCount;
} else {
--innerTagCount;
}
if ( innerTagCount == 0 ) {
break;
}
}
}
if ( i >= m_numNodes ) {
return -1;
}
return i;
}
int64_t Xml::getCompoundHash ( const char *s , int32_t len ) const {
// setup
const char *p = s;
const char *start = s;
int32_t i = 0;
int64_t h = 0;
loop:
// find fisrt .
while ( i < len && p[i] != '.' ) i++;
// . hash from p to p[i]
// . tag names are always ascii, so use the ascii hasher, not utf8
h = hash64Upper_a ( start , &p[i] - start , h );
// bail if done
if ( i >= len ) return h;
// then period
h = hash64 ( "." , 1 , h );
// skip period
i++;
// start now points to next word
start = &p[i];
// continue
goto loop;
}
// . return -1 if not found
// . "tagName" is compound (i.e. "myhouse.myroom" )
int32_t Xml::getNodeNum ( int32_t n0 , int32_t n1 , const char *tagName , int32_t tagNameLen ) const {
// . since i changed the hash to a zobrist hash, hashing
// "dns.ip" is not the same as hashing "dns" then "." then "ip"
// by passing the hash of the last to the next as the startHash
// . therefore, i now parse it up
int64_t h = getCompoundHash ( tagName , tagNameLen );
int32_t i;
if ( n1 > m_numNodes ) n1 = m_numNodes;
if ( n0 > m_numNodes ) n0 = m_numNodes;
if ( n1 < 0 ) n1 = 0;
if ( n0 < 0 ) n0 = 0;
for ( i = n0 ; i < n1; i++ ) {
// if node is text (non-tag) then skip
if ( ! m_nodes[i].isTag() ) continue;
//if ( m_nodes[i].m_compoundHash == h ) break;
if ( m_nodes[i].m_hash == h ) break;
}
// return -1 if not found at all
if ( i >= n1 ) return -1;
return i;
}
void Xml::reset ( ) {
// free old nodes array if any
if ( m_nodes ) {
mfree ( m_nodes, m_maxNumNodes*sizeof(XmlNode),"Xml1");
}
m_xml = NULL;
m_nodes = NULL;
m_numNodes = 0;
m_maxNumNodes = 0;
}
bool Xml::getCompoundName ( int32_t node , SafeBuf *sb ) {
XmlNode *buf[256];
XmlNode *xn = &m_nodes[node];
int32_t np = 0;
for ( ; xn ; xn = xn->m_parent ) {
if ( ! xn->m_nodeId ) continue;
if ( np >= 256 ) {g_errno = EBUFTOOSMALL;return false;}
buf[np++] = xn;
}
// ignore that initial <?xml ..> tag they all have
if ( np > 0 &&
buf[np-1]->m_tagNameLen == 3 &&
strncasecmp(buf[np-1]->m_tagName,"xml",3) == 0 )
np--;
for ( int32_t i = np - 1 ; i >= 0 ; i-- ) {
XmlNode *xn = buf[i];
sb->safeMemcpy ( xn->m_tagName , xn->m_tagNameLen );
sb->pushChar('.');
}
// remove last '.'
if ( sb->length() ) sb->m_length--;
sb->nullTerm();
return true;
}
#include "HttpMime.h" // CT_JSON
// "s" must be in utf8
bool Xml::set( char *s, int32_t slen, int32_t version, char contentType ) {
// just in case
reset();
m_version = version;
// clear it
g_errno = 0;
// make pointers to data
m_xml = s;
m_xmlLen = slen;
// debug msg time
if ( g_conf.m_logTimingBuild ) {
logf( LOG_TIMING, "build: xml: set: 4a. %" PRIu64 "", gettimeofdayInMilliseconds() );
}
// sanity check
if ( !s || slen <= 0 ) {
return true;
}
if ( s[slen] != '\0' ) {
log(LOG_LOGIC,"build: Xml: Content is not null terminated.");
gbshutdownLogicError();
}
// if json go no further. TODO: also do this for CT_TEXT etc.
if ( contentType == CT_JSON ) {
m_numNodes = 0;
// make the array
m_maxNumNodes = 1;
m_nodes =(XmlNode *)mmalloc(sizeof(XmlNode)*m_maxNumNodes,"x");
if ( ! m_nodes ) return false;
XmlNode *xd = &m_nodes[m_numNodes];
// hack the node
xd->m_node = s;
xd->m_nodeLen = slen;
xd->m_isSelfLink = 0;
// . nodeId for text nodes is 0
xd->m_nodeId = TAG_TEXTNODE;
xd->m_hasBackTag = false;
xd->m_hash = 0;
xd->m_pairTagNum = -1;
m_numNodes++;
return true;
}
// override. no don't it hurts when parsing CT_XML docs!!
// we need XmlNode.cpp's setNodeInfo() to identify xml tags in
// an rss feed. No, this was here for XmlDoc::hashXml() i think
// so let's just fix Links.cpp to get links from pure xml.
// we can't do this any more. it's easier to fix xmldoc::hashxml()
// some other way... because Links.cpp and Xml::isRSSFeed()
// depend on having regular tagids. but without this here
// then XmlDoc::hashXml() breaks.
bool pureXml = ( contentType == CT_XML );
int32_t i;
/// @todo ALC why are we replacing NULL bytes here?
/// Shouldn't all string be valid utf-8 at this point?
// . replacing NULL bytes with spaces in the buffer
// . utf8 should never have any 0 bytes in it either!
for ( i = 0 ; i < slen ; i++ ) {
if ( !s[i] ) {
s[i] = ' ';
}
}
// counting the max num nodes
for ( i = 0 ; s[i] ; i++ ) {
if ( s[i] == '<' ) {
m_maxNumNodes++;
}
}
// account for the text (non-tag) nodes (padding nodes between tags)
m_maxNumNodes *= 2 ;
// if we only have one tag we can still have 3 nodes!
m_maxNumNodes++;
// debug msg time
if ( g_conf.m_logTimingBuild ) {
logf( LOG_TIMING, "build: xml: set: 4b. %" PRIu64 "", gettimeofdayInMilliseconds() );
}
// . truncate it to avoid spammers
// . now i limit to 30k nodes because of those damned xls docs!
// . they have 300,000+ nodes some of 'em
// now allow 35k nodes for every 100k doclen
int32_t num100k = slen/(100*1024);
if (num100k <= 0) num100k = 1;
int32_t bigMax = 35*1024 * num100k;
if (m_maxNumNodes > bigMax){
log(LOG_WARN, "build: xml: doclen %" PRId32", "
"too many nodes: counted %" PRId32", max %" PRId32" "
"...truncating", slen, m_maxNumNodes, bigMax);
m_maxNumNodes = bigMax;
}
m_nodes = (XmlNode *)mmalloc( sizeof( XmlNode ) * m_maxNumNodes, "Xml1" );
if ( ! m_nodes ) {
reset();
log(LOG_WARN, "build: Could not allocate %" PRId32 " bytes need to parse document.",
(int32_t) sizeof(XmlNode) * m_maxNumNodes);
return false;
}
// debug msg time
if ( g_conf.m_logTimingBuild ) {
logf( LOG_TIMING, "build: xml: set: 4c. %" PRIu64 "", gettimeofdayInMilliseconds() );
}
XmlNode *parent = NULL;
XmlNode *parentStackStart[256];
XmlNode **parentStackPtr = &parentStackStart[0];
XmlNode **parentStackEnd = &parentStackStart[256];
// . TODO: do this on demand
// . now fill our nodes array
// . loop over the xml
// . i is byte-index in buffer
for ( i = 0 ; i < m_xmlLen && m_numNodes < m_maxNumNodes ; ) {
// convenience ptr
XmlNode *xi = &m_nodes[m_numNodes];
// set that node
i += xi->set( &m_xml[i], m_xmlLen-i, pureXml );
// set his parent xml node if is xml
xi->m_parent = parent;
bool endsInSlash = false;
if ( xi->m_node[xi->m_nodeLen-2] == '/' ) {
endsInSlash = true;
}
if ( xi->m_node[xi->m_nodeLen-2] == '?' ) {
endsInSlash = true;
}
// disregard </> in the conf files
if ( xi->m_nodeLen == 3 && endsInSlash ) {
endsInSlash = false;
}
// if not text node then he's the new parent
// if we don't do this for xhtml then we don't pop the parent
// and run out of parent stack space very quickly.
if ( pureXml && xi->m_nodeId &&
xi->m_nodeId != TAG_COMMENT && xi->m_nodeId != TAG_CDATA &&
!endsInSlash ) {
// if we are a back tag pop the stack
if ( ! xi->isFrontTag() ) {
// pop old parent
if ( parentStackPtr > parentStackStart ) {
parent = *(--parentStackPtr);
}
}
// we are a front tag...
else {
// did we overflow?
if ( parentStackPtr >= parentStackEnd ) {
log("xml: xml parent overflow");
g_errno = EBUFTOOSMALL;
return false;
}
// push the old parent ptr
if ( parent ) {
*parentStackPtr++ = parent;
}
// set the new parent to us
parent = xi;
}
}
if ( xi->m_nodeId != TAG_SCRIPT || !xi->isFrontTag() ) {
++m_numNodes;
continue;
}
// ok, we got a <script> tag now
++m_numNodes;
// use this for parsing consistency when deleting records
// so they equal what we added.
bool newVersion = (version > 120);
// retry:
// scan for </script>
char *pstart = &m_xml[i];
char *p = pstart;
char *pend = &m_xml[0] + m_xmlLen;
bool inDoubles = false;
bool inSingles = false;
bool inComment1 = false;
bool inComment2 = false;
bool inComment3 = false;
bool inComment4 = false;
bool escaped = false;
// scan -- 5 continues -- node 1570 is text of script
for ( ; p < pend ; p++ ) {
//
// adding these new quote checks may cause a few
// parsing inconsistencies for pages a hanful of pages
//
// windows-based html pages use 13 sometimes and no
// \n at all...
if ( p[0] =='\n' || p[0] == 13 ) { // ^m = 13 = CR
//newLine = true;
inComment1 = false;
}
if ( p[0] == '\\' ) {
escaped = ! escaped;
continue;
}
//if ( newLine && is_wspace_a(p[0]) )
// continue;
if ( p[0] == '<' && p[1] == '!' &&
p[2] == '-' && p[3] == '-' &&
! inSingles && ! inDoubles &&
! inComment1 &&
! inComment2 &&
! inComment4 )
inComment3 = true;
if ( p[0] == '-' && p[1] == '-' &&
p[2] == '>' &&
inComment3 )
inComment3 = false;
// no. i saw <script>//</script> and </script> was
// not considered to be in a comment
if ( p[0] == '/' && p[1]=='/'&&
! inSingles && ! inDoubles &&
! inComment2 &&
! inComment3 &&
// allow for "//<![CDATA[..." to end in
// "//]]>" so ignore if inComment4 is true.
// i'd say these are the weaker of all 4
// comment types in that regard.
! inComment4 )
inComment1 = true;
// handle /* */ comments
if ( p[0] == '/' && p[1]=='*' &&
! inSingles && ! inDoubles &&
! inComment1 &&
! inComment3 &&
! inComment4 )
inComment2 = true;
// <![CDATA[...]]> "comments" in <script> tags
// are common. CDATA tags seem to prevail even if
// within another comment tag, like i am seeing
// "//<![CDATA[..." a lot.
if ( p[0] == '<' &&
p[1] == '!' &&
p[2] == '[' &&
p[3] == 'C' &&
p[4] == 'D' &&
p[5] == 'A' &&
p[6] == 'T' &&
p[7] == 'A' &&
p[8] == '['
//! inComment1 &&
//! inComment2 &&
//! inComment3 )
)
inComment4 = true;
if ( p[0] == ']' &&
p[1] == ']' &&
p[2] == '>' )
inComment4 = false;
if ( p[0] == '*' &&
p[1]=='/' &&
! inComment4 )
inComment2 = false;
// no longer the start of a newLine
//newLine = false;
// don't check for quotes or </script> if in comment
// no, if've seen <script>//</script> on ibm.com pages,
// so just ignore ' and " for // comments
if ( inComment1 && newVersion ) {
escaped = false;
//continue;
}
if ( inComment2 && newVersion ) {
escaped = false;
continue;
}
if ( inComment3 && newVersion ) {
escaped = false;
continue;
}
if ( inComment4 && newVersion ) {
escaped = false;
continue;
}
// if an unescaped double quote
if ( p[0] == '\"' && ! escaped && ! inSingles &&
// i've seen <script>//</script> on ibm.com pages,
// so just ignore ' and " for // comments
! inComment1 )
inDoubles = ! inDoubles;
// if an unescaped single quote.
if ( p[0] == '\'' && ! escaped && ! inDoubles &&
// i've seen <script>//</script> on ibm.com pages,
// so just ignore ' and " for // comments
! inComment1 )
inSingles = ! inSingles;
// no longer escaped
escaped = false;
// keep going if not a tag
if ( p[0] != '<' ) continue;
// </script> or </gbframe> stops it
if ( p[1] == '/' ) {
if ( to_lower_a(p[2]) == 's' &&
to_lower_a(p[3]) == 'c' &&
to_lower_a(p[4]) == 'r' &&
to_lower_a(p[5]) == 'i' &&
to_lower_a(p[6]) == 'p' &&
to_lower_a(p[7]) == 't' ) {
if((inDoubles||inSingles)&& newVersion)
continue;
break;
}
if ( to_lower_a(p[2]) == 'g' &&
to_lower_a(p[3]) == 'b' &&
to_lower_a(p[4]) == 'f' &&
to_lower_a(p[5]) == 'r' &&
to_lower_a(p[6]) == 'a' &&
to_lower_a(p[7]) == 'm' )
break;
}
// another <script> stops it
if ( to_lower_a(p[1]) == 's' &&
to_lower_a(p[2]) == 'c' &&
to_lower_a(p[3]) == 'r' &&
to_lower_a(p[4]) == 'i' &&
to_lower_a(p[5]) == 'p' &&
to_lower_a(p[6]) == 't' ) {
if ( (inDoubles || inSingles) && newVersion )
continue;
break;
}
}
// make sure we do not breach! i saw this happen once!
if ( m_numNodes >= m_maxNumNodes ) break;
// was it like <script></script> then no scripttext tag?
if ( p - pstart == 0 )
continue;
XmlNode *xn = &m_nodes[m_numNodes++];
xn->m_nodeId = TAG_SCRIPTTEXT;
xn->m_node = pstart;
xn->m_nodeLen = p - pstart;
xn->m_tagName = NULL;
xn->m_tagNameLen = 0;
xn->m_hasBackTag = false;
xn->m_hash = 0;
xn->m_isVisible = false;
xn->m_isBreaking = false;
// advance i to get to the </script> or <gbframe> etc.
i = p - &m_xml[0] ;
}
// sanity
if ( m_numNodes > m_maxNumNodes ) gbshutdownLogicError();
// trim off last node if empty! it is causing a core in isBackTag()
if ( m_numNodes > 0 && m_nodes[m_numNodes-1].m_nodeLen == 0 ) {
m_numNodes--;
}
// debug msg time
if ( g_conf.m_logTimingBuild ) {
logf( LOG_TIMING, "build: xml: set: 4d. %" PRIu64 "", gettimeofdayInMilliseconds() );
}
return true;
}
// . replaces line-breaking html tags with 2 returns if "includeTags" is false
// . stores tags too if "includeTags" is true
// . returns # chars written to buf
// . NOTE: see XmlNode.cpp for list of tag types in "NodeType" structure
// . used to get xml subtrees as text
// . used to get <TITLE>'s
// . must write to your buf rather than just return a pointer since we may
// have to concatenate several nodes together, we may have to replace tags,..
// . TODO: nuke this in favor of Pos.cpp::filter() -- but that needs Words.cpp
int32_t Xml::getText( char *buf, int32_t bufMaxSize, int32_t node1, int32_t node2, bool filterSpaces ) {
// init some vars
int32_t i = node1;
int32_t n = node2;
// truncate n to the # of nodes we have
if (n > m_numNodes || n == -1) {
n = m_numNodes;
}
// keep a non visible tag stack
int32_t notVisible = 0;
// the destination
char *dst = buf;
char *dstEnd = buf + bufMaxSize;
char cs = -1;
// loop through all nodes from here on until we run outta nodes...
// or until we hit a tag with the same depth as us.
for ( ; i < n ; i++ ) {
// . set skipText to true if this tag has inivisble text
// . examples: <option> <script> ...
if ( m_nodes[i].isTag() && ! m_nodes[i].isVisible() &&
m_nodes[i].hasBackTag() ) {
if ( m_nodes[i].isFrontTag() ) notVisible++;
else notVisible--;
if ( notVisible < 0 ) notVisible = 0;
}
// . if it's a tag then write a \n\n or \n to the buf
// . do this only if we do not include tags
// . do it only if there's something already in the buf
if ( m_nodes[i].isTag() ) {
// do nothing if buf still empty
if ( dst <= buf ) continue;
// or not a breaking tag
if ( ! m_nodes[i].isBreaking() ) continue;
// forgot this check! leave room for terminating \0
if ( dst + 3 >= dstEnd ) break;
// if we're not junk filtering just add 2 \n's
if ( ! filterSpaces ) {
*dst++='\n';
*dst++='\n';
continue;
}
// need at least 2 chars in the dst buf so far
if ( dst - 1 <= buf ) continue;
if ( cs == -1 ) continue;
// . if prev char is punct, do nothing.
// . check prev prev char to make sure not a single chr
// . TODO: fix this!
if ( is_punct_a( *(dst - cs))) continue;
//if ( is_punct_utf8( dst[-1]) ) continue;
if ( i+1 >= n ) continue;
if ( is_punct_utf8 ( &m_nodes[i+1].m_node[0] )
&& !m_nodes[i+1].isTag() ) continue;
// . watch out for punct before space(s) though
// . it also ensures that this char is the first char
// of any potential multi-byte sequence
if ( is_wspace_utf8 ( dst - cs ) ) {
// back up one before that even
char *f = dst - cs - 1;
// don't do a while loop on this
// cuz with those xls docs we can
// have a TON of spaces cuz their
// just a bunch of <td></td>&nbsp;'s
if ( f > buf && is_wspace_a ( *f ) ) f--;
if ( f > buf && is_wspace_a ( *f ) ) f--;
if ( f > buf && is_wspace_a ( *f ) ) f--;
if ( f > buf && is_ascii(*f)&&is_punct_a(*f) )
continue;
}
// ok, add the ".."
*dst++='.';
*dst++='.';
continue;
}
// if this tag/text is not visible then continue
if ( notVisible ) continue;
// . get a ptr to the node's data
// . is 1 of 3 things: a text blob, xml tag or html tag
char *nodeData = m_nodes[i].getNode ();
int nodeDataLen = m_nodes[i].getNodeLen();
// . truncate the node if it's too big
// . make sure we truncate at a non alphanumeric character
// . avoid breaking in the middle of a word
// . we cannot truncate tags
if ( dst + nodeDataLen >= dstEnd ) { // bufMaxSize ) {
// cannot truncate tags
if ( m_nodes[i].isTag() ) break;
nodeDataLen = dstEnd - dst - 2;//bufMaxSize - blen;
while ( nodeDataLen > 0 &&
! is_wspace_a(nodeData[ nodeDataLen-1 ]))
nodeDataLen--;
}
// if we truncated the whole thing just break out, we're done.
if ( nodeDataLen <= 0 ) break;
// . copy the node data into our buffer
// . translate HTML entities to iso characters
// . translate \r's into spaces
// point to it
char *src = nodeData;
char *srcEnd = nodeData + nodeDataLen;
// copy the node @src into "dst"
for ( ; src < srcEnd ; src += cs , dst += cs ) {
// get the character size in bytes
cs = getUtf8CharSize ( src );
// no back to back spaces if we're filtering junk
if ( filterSpaces && is_wspace_utf8 ( src ) ) {
if ( dst <= buf ) {dst -= cs; continue;}
if ( dst[-1] == ' ' ) {dst -= cs; continue;}
// ok, do not filter it
//goto simplecopy;
}
// if more than 1 byte in char, use gbmemcpy
if ( cs > 1 ) {gbmemcpy ( dst , src , cs );}
else *dst = *src;
}
// continue looping over nodes (text and tag nodes)
}
// . strip trailing spaces
// . is_wspace_utf8 will be false if it is not the first character
// of a utf8 char sequence, and i don't count any multi-byte
// spaces i guess...
while ( dst > buf && is_wspace_a ( dst[-1] ) ) dst--;
// null term it
*dst = '\0';
// return the # of bytes we've written into the buffer.
return dst - buf;
}
// just get a pointer to it
char *Xml::getMetaContentPointer( const char *field, int32_t fieldLen, const char *name, int32_t *slen ) {
// find the first meta summary node
for ( int32_t i = 0 ; i < m_numNodes ; i++ ) {
// continue if not a meta tag
if ( m_nodes[i].m_nodeId != TAG_META ) continue;
// . does it have a type field that's "summary"
// . <meta name=summary content="...">
// . <meta http-equiv="refresh" content="0;URL=http://y.com/">
int32_t len;
char *s = getString ( i , name , &len );
// continue if name doesn't match field
if ( len != fieldLen ) continue;
// field can be "summary","description","keywords",...
if ( strncasecmp ( s , field , fieldLen ) != 0 ) continue;
// point to the summary itself
s = getString ( i , "content" , &len );
if ( ! s || len <= 0 ) continue;
// return the pointer (and set the length of what it points to)
*slen = len;
return s;
}
*slen = 0;
return NULL;
}
// . extract the content from a meta tag
// . null terminate it and store it into "buf"
// . field can be stuff like "summary","description","keywords",...
// . TODO: have a filter option to filter out back-to-back spaces for summary
// generation purposes in Summary class
// . "name" is usually "name" or "http-equiv"
// . if "convertHtmlEntities" is true we turn < into &lt; and > in &gt;
int32_t Xml::getMetaContent( char *buf, int32_t bufLen, const char *field, int32_t fieldLen, const char *name,
int32_t startNode, int32_t *matchedNode ) {
// return 0 length if no buffer space
if ( bufLen <= 0 ) return 0;
// assume it's empty
buf[0] = '\0';
// assume no tag matched
if ( matchedNode ) *matchedNode = -1;
// store output into "dst"
char *dst = buf;
char *dstEnd = buf + bufLen;
// find the first meta summary node
for ( int32_t i = startNode ; i < m_numNodes ; i++ ) {
// continue if not a meta tag
if ( m_nodes[i].m_nodeId != TAG_META ) {
continue;
}
// . does it have a type field that's "summary"
// . <meta name=summary content="...">
// . <meta http-equiv="refresh" content="0;URL=http://y.com/">
int32_t len;
char *s = getString ( i , name , &len );
// continue if name doesn't match field
// field can be "summary","description","keywords",...
if ( len != fieldLen ) {
continue;
}
if ( strncasecmp ( s , field , fieldLen ) != 0 ) {
continue;
}
// point to the summary itself
s = getString ( i , "content" , &len );
if ( ! s || len <= 0 ) {
continue;
}
// point to it
char *src = s;
char *srcEnd = s + len;
// size of character in bytes, usually 1
char cs ;
// bookmark
char *lastp = NULL;
// copy the node @p into "dst"
for ( ; src < srcEnd ; src+= cs ) {
// get the character size in bytes
cs = getUtf8CharSize ( src );
// break if we are full! (save room for \0)
if ( dst + 5 >= dstEnd ) break;
// remember last punct for cutting purposes
if ( ! is_alnum_utf8 ( src ) ) lastp = dst;
// if more than 1 byte in char, use gbmemcpy
if ( cs > 1 ) {gbmemcpy ( dst , src , cs );}
else *dst = *src;
dst += cs;
}
// continue looping over nodes (text and tag nodes)
// do not split a word in the middle! so if we had to
// truncate, at least try to truncate at last punctuation
// mark if we had one.
if ( dst + 5 >= dstEnd && lastp ) {
*lastp = '\0';
len = lastp - buf;
}
// end at dst as well
else {
*dst = '\0';
len = dst - buf;
}
// store node number
if ( matchedNode ) {
*matchedNode = i;
}
return len;
}
return 0;
}
static bool inTag ( XmlNode *node, nodeid_t tagId, int *count ) {
if ( !count ) {
return false;
}
if ( node->getNodeId() == tagId ) {
if ( node->isFrontTag() ) {
++(*count);
return true;
}
// back tag
if ( *count ) {
--(*count);
}
}
return (*count > 0);
}
static int32_t filterContent(TokenizerResult *tr, Pos *pp, char *buf, int32_t bufLen, unsigned minLength,
unsigned maxLength, int32_t version) {
unsigned contentLen = 0;
/// @todo ALC configurable maxNumWord so we can tweak this as needed
const unsigned maxNumWord = maxLength * 2;
if ( tr->size() > maxNumWord ) {
// ignore too long snippet
// it may not be that useful to get the first x characters from a long snippet
contentLen = 0;
buf[0] = '\0';
return contentLen;
}
contentLen = pp->filter( tr, 0, tr->size(), true, buf, buf + maxLength, version );
if ( contentLen < minLength ) {
// ignore too short descriptions
// it may not be a good summary if it's too short
contentLen = 0;
buf[0] = '\0';
return contentLen;
}
return contentLen;
}
bool Xml::getTagContent(const char *fieldName, const char *fieldContent, char *buf, int32_t bufLen,
unsigned minLength, unsigned maxLength, int32_t *contentLenPtr,
bool ignoreExpandedIframe, nodeid_t expectedNodeId ) {
int32_t fieldNameLen = strlen( fieldName );
int32_t fieldContentLen = strlen(fieldContent);
int32_t contentLen = 0;
int inTagCount = 0;
for (int32_t i = 0; i < getNumNodes(); ++i ) {
// don't get tag from gbframe (expanded iframe content)
if ( ignoreExpandedIframe && inTag( getNodePtr( i ), TAG_GBFRAME, &inTagCount ) ) {
continue;
}
if ( expectedNodeId != LAST_TAG && getNodeId(i) != expectedNodeId ) {
continue;
}
bool found = false;
if ( fieldNameLen > 0 ) {
int32_t tagLen = 0;
char *tag = getNodePtr(i)->getAttrValue(fieldName, fieldNameLen, &tagLen);
if ( tagLen == fieldContentLen && strncasecmp( tag, fieldContent, fieldContentLen ) == 0 ) {
found = true;
}
} else {
found = true;
}
if ( found ) {
int32_t end_node = getEndNode(i);
TokenizerResult tr;
Pos pp;
if (end_node < 0) {
if ( getNodeId(i) != TAG_META ) {
// no end tag
continue;
}
// extract content from meta tag
int32_t len = 0;
char *s = getNodePtr(i)->getAttrValue("content", 7, &len);
if ( ! s || len <= 0 ) {
// no content
continue;
}
Xml xml;
{
/// @todo ALC workaround until we fix xml to use len instead of '\0'
char saved = s[len];
s[len] = '\0';
if ( !xml.set( s, len, m_version, CT_HTML ) ) {
s[len] = saved;
return false;
}
s[len] = saved;
}
xml_tokenizer_phase_1(&xml,&tr);
} else {
xml_tokenizer_phase_1_subset(this, i,end_node, &tr);
}
calculate_tokens_hashes(&tr);
if ( !pp.set( &tr ) ) {
// unable to allocate buffer
return false;
}
contentLen = filterContent( &tr, &pp, buf, bufLen, minLength, maxLength, m_version );
if ( contentLen > 0 ) {
if (contentLenPtr) {
*contentLenPtr = contentLen;
}
/// @todo ALC we may want to loop through the whole doc and get the best.
/// Only get the first for now
break;
}
}
}
return (contentLen > 0);
}
bool Xml::getTagValue(const char *fieldName, const char *fieldContent, const char *fieldValueName,
const char **valuePtr, int32_t *valueLenPtr, bool ignoreExpandedIframe,
nodeid_t expectedNodeId, int32_t *startNodePtr) {
int32_t fieldNameLen = strlen(fieldName);
int32_t fieldContentLen = strlen(fieldContent);
int32_t fieldValueNameLen = strlen(fieldValueName);
int32_t startNode = (startNodePtr != nullptr) ? *startNodePtr : 0;
int inTagCount = 0;
// clear value
*valuePtr = nullptr;
*valueLenPtr = 0;
for (int32_t i = startNode; i < getNumNodes(); ++i) {
// don't get tag from gbframe (expanded iframe content)
if (ignoreExpandedIframe && inTag(getNodePtr(i), TAG_GBFRAME, &inTagCount)) {
continue;
}
if (expectedNodeId != LAST_TAG && getNodeId(i) != expectedNodeId) {
continue;
}
bool found = false;
if (fieldNameLen > 0) {
int32_t tagLen = 0;
char *tag = getNodePtr(i)->getAttrValue(fieldName, fieldNameLen, &tagLen);
if (tagLen == fieldContentLen && strncasecmp(tag, fieldContent, fieldContentLen) == 0) {
found = true;
}
} else {
found = true;
}
if (found) {
// extract value
*valuePtr = getNodePtr(i)->getAttrValue(fieldValueName, fieldValueNameLen, valueLenPtr);
if (!*valuePtr || *valueLenPtr <= 0) {
// no content
continue;
}
if (startNodePtr) {
*startNodePtr = i;
}
break;
}
}
return (*valueLenPtr > 0);
}
// TEST CASES:
//. this is NOT rss, but has an rdf:rdf tag in it!
// http://www.silverstripe.com/silverstripe-adds-a-touch-of-design-and-a-whole-lot-more/
// http://government.zdnet.com/?p=4245
int32_t Xml::isRSSFeed ( ) {
int32_t type = 0;
int32_t tag = 0;
int32_t i;
for ( i = 0; i < m_numNodes; i++ ) {
// skip text nodes (nodeId is 0)
if ( m_nodes[i].m_nodeId == TAG_TEXTNODE ) continue;
// check for RSS/FEED/RDF node
if ( m_nodes[i].m_nodeId == TAG_RDF ) {
tag = TAG_RDF; type = 1; }
if ( m_nodes[i].m_nodeId == TAG_RSS ) {
tag = TAG_RSS; type = 1; }
if ( m_nodes[i].m_nodeId == TAG_FEED ) {
tag = TAG_FEED; type = 6; }
if ( tag ) break;
}
// if no such tag we are definitely not rss
if ( ! tag ) return 0;
// i have only seen rdf tags embedded in html
if ( tag != TAG_RDF ) return type;
// . now check for a <channel>, <item> or <link> tag
// . we need one of those to be useful
for ( i = 0; i < m_numNodes; i++ ) {
if ( m_nodes[i].m_nodeId == TAG_CHANNEL ) return type;
if ( m_nodes[i].m_nodeId == TAG_ITEM ) return type;
if ( m_nodes[i].m_nodeId == TAG_ENTRY ) return type;
//if ( m_nodes[i].m_nodeId == TAG_LINK ) return type;
}
return 0;
}
char *Xml::getRSSTitle ( int32_t *titleLen , bool *isHtmlEncoded ) {
// assume it is html encoded (i.e. <'s are encoded as &lt;'s)
*isHtmlEncoded = true;
// . extract the RSS/Atom title
// rss/rdf
int32_t tLen;
char *title = getString( "title", &tLen, true );
// watch out for <![CDATA[]]> block
if ( tLen >= 12 && strncasecmp(title, "<![CDATA[", 9) == 0 ) {
title += 9;
tLen -= 12;
*isHtmlEncoded = false;
}
// return
*titleLen = tLen;
return title;
}
const char *Xml::getRSSTitle ( int32_t *titleLen , bool *isHtmlEncoded ) const {
return const_cast<Xml*>(this)->getRSSTitle(titleLen,isHtmlEncoded);
}
char *Xml::getRSSDescription ( int32_t *descLen , bool *isHtmlEncoded ) {
// assume it is html encoded (i.e. <'s are encoded as &lt;'s)
*isHtmlEncoded = true;
// . extract the RSS/Atom description
// rss/rdf
int32_t dLen;
// "item.description"
char *desc = getString( "description", &dLen, true );
// get content first, it is usually more inclusive than the summary
if ( ! desc ) {
// "entry.content"
desc = getString( "content", &dLen, true );
}
// atom
if ( ! desc ) {
// "entry.summary"
desc = getString( "summary", &dLen, true );
}
// watch out for <![CDATA[]]> block
if ( dLen >= 12 && strncasecmp(desc, "<![CDATA[", 9) == 0 ) {
desc += 9;
dLen -= 12;
*isHtmlEncoded = false;
}
// return
*descLen = dLen;
return desc;
}