Improve title/summary for youtube

This commit is contained in:
Ai Lin Chia
2016-01-20 13:32:13 +01:00
parent 40af20df8d
commit ac8249e07d
11 changed files with 332 additions and 259 deletions

466
Pos.cpp

@ -17,62 +17,19 @@ void Pos::reset() {
if ( m_buf && m_needsFree )
mfree ( m_buf , m_bufSize , "Pos" );
m_buf = NULL;
}
// . the interval is half-open [a,b)
// . do not print out any alnum word with negative score
int32_t Pos::filter( char *p, char *pend, Words *words, int32_t a, int32_t b, bool addEllipsis ) {
int32_t plen = 0;
set ( words , addEllipsis, p , pend, &plen , a , b );
return plen;
}
// . set the filtered position of each word
// . used by Summary.cpp to determine how many chars are in the summary,
// be those chars single byte or utf8 chars that are 4 bytes
// . returns false and sets g_errno on error
// . if f is non-NULL store filtered words into there. back to back spaces
// are eliminated.
bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len, int32_t a, int32_t b ) {
// free m_buf in case this is a second call
if ( ! f ) {
reset();
}
int32_t nw = words->getNumWords();
int32_t *wlens = words->m_wordLens;
int32_t Pos::filter( Words *words, int32_t a, int32_t b, bool addEllipsis, char *f, char *fend ) {
nodeid_t *tids = words->getTagIds(); // m_tagIds;
char **wp = words->m_words;
// save start point for filtering
char *fstart = f;
// -1 is the default value
if ( b == -1 ) {
b = nw;
b = words->getNumWords();
}
// alloc array if need to
int32_t need = (nw+1) * 4;
// do not destroy m_pos/m_numWords if only filtering into a buffer
if ( !f ) {
m_needsFree = false;
m_buf = m_localBuf;
if ( need > POS_LOCALBUFSIZE ) {
m_buf = (char *)mmalloc(need,"Pos");
m_needsFree = true;
}
// bail on error
if ( ! m_buf ) return false;
m_bufSize = need;
m_pos = (int32_t *)m_buf;
m_numWords = nw;
}
// this is the CHARACTER count.
int32_t pos = 0;
bool trunc = false;
static const int32_t maxCharSize = 4; // we are utf8
@ -91,31 +48,23 @@ bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len
int dotCount = 0; // store last encountered total consecutive dots
char* dotPrevChar = NULL; // store char before dot which is not a space
for ( int32_t i = a ; i < b ; i++ ) {
for ( int32_t i = a ; i < b ; ++i ) {
if (trunc) {
break;
}
// set pos for the ith word to "pos"
if ( ! f ) {
m_pos[i] = pos;
}
// is tag?
if ( tids && tids[i] ) {
// filtering into buffer (when generating summaries)
if ( f ) {
// let's not get from bad tags
if ( ( tids[i] == TAG_STYLE ) || ( tids[i] == TAG_SCRIPT ) ) {
++inBadTags;
continue;
}
// let's not get from bad tags
if ( ( tids[i] == TAG_STYLE ) || ( tids[i] == TAG_SCRIPT ) ) {
++inBadTags;
continue;
}
if ( inBadTags ) {
if ( ( ( tids[i] & BACKBITCOMP ) == TAG_STYLE ) ||
( ( tids[i] & BACKBITCOMP ) == TAG_SCRIPT ) ) {
--inBadTags;
}
if ( inBadTags ) {
if ( ( ( tids[i] & BACKBITCOMP ) == TAG_STYLE ) ||
( ( tids[i] & BACKBITCOMP ) == TAG_SCRIPT ) ) {
--inBadTags;
}
}
@ -126,14 +75,15 @@ bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len
// list tag? <li>
if ( tids[i] == TAG_LI ) {
if ( f ) {
if ( ( fend - f > maxCharSize ) ) {
*f++ = '*';
} else {
trunc = true;
}
if ( ( fend - f > maxCharSize ) ) {
*f++ = '*';
// counted as caps because we're detecting all caps for a sentence
++capCount;
} else {
trunc = true;
}
pos++;
lastSpace = false;
continue;
}
@ -146,35 +96,29 @@ bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len
// if had a br tag count it as a '.'
if ( tids[i] ) { // <br>
// are we filtering?
if ( f && f != fstart ) {
if ( f != fstart ) {
if ( ( fend - f > 2 * maxCharSize ) ) {
*f++ = '.';
*f++ = ' ';
// counted as caps because we're detecting all caps for a sentence
capCount += 2;
} else {
trunc = true;
}
}
// no, just single period.
pos += 2;
lastSpace = true;
continue;
}
// are we filtering?
if ( f ) {
if ( ( fend - f > maxCharSize ) ) {
*f++ = ' ';
} else {
trunc = true;
}
if ( ( fend - f > maxCharSize ) ) {
*f++ = ' ';
} else {
trunc = true;
}
// count as a single space
pos++;
// do not allow back-to-back spaces
lastSpace = true;
@ -187,22 +131,19 @@ bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len
}
// scan through all chars discounting back-to-back spaces
char *pend = wp[i] + wlens[i];
char *pend = words->getWord(i) + words->getWordLen(i);
unsigned char cs = 0;
char *p = NULL ;
// assume filters out to the same # of chars
for ( p = wp[i]; p < pend; p += cs ) {
for ( p = words->getWord(i); p < pend; p += cs ) {
// get size
cs = getUtf8CharSize(p);
// filtering into buffer (when generating summaries)
if ( f ) {
// skip unwanted character
if ( isUtf8UnwantedSymbols( p ) ) {
continue;
}
// skip unwanted character
if ( isUtf8UnwantedSymbols( p ) ) {
continue;
}
// do not count space if one before
@ -213,155 +154,151 @@ bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len
lastSpace = true;
// are we filtering?
if ( f ) {
if ( fend - f > 1 ) {
lastBreakPrevChar = prevChar;
if ( fend - f > 1 ) {
lastBreakPrevChar = prevChar;
lastBreak = f;
*f++ = ' ';
lastBreak = f;
*f++ = ' ';
// space is counted as caps as well because we're detecting all caps for a sentence
++capCount;
// counted as caps because we're detecting all caps for a sentence
++capCount;
dotCount = 0;
dotCount = 0;
// we don't store space as dotPreviousChar because we want to strip ' ...' as well
} else {
trunc = true;
}
}
++pos;
continue;
}
if ( f ) {
if ( fend - f > cs ) {
prevChar = f;
if ( cs == 1 ) {
// we only do it for ascii to avoid catering for different rules in different languages
// https://en.wikipedia.org/wiki/Letter_case#Exceptional_letters_and_digraphs
// eg:
// The Greek upper-case letter "Σ" has two different lower-case forms:
// "ς" in word-final position and "σ" elsewhere
if ( !is_alpha_a( *p ) || is_upper_a( *p ) ) {
// non-alpha is counted as caps as well because we're detecting all caps for a sentence
// and comma/quotes/etc. is included
++capCount;
}
// some sites try to be smart and truncate for us, let's remove that
// if if there are no space between dots and letter
if ( *p == '.' ) {
++dotCount;
} else {
dotCount = 0;
dotPrevChar = f;
}
*f++ = *p;
} else {
dotCount = 0;
dotPrevChar = f;
gbmemcpy( f, p, cs );
f += cs;
}
// we don't store space as dotPreviousChar because we want to strip ' ...' as well
} else {
trunc = true;
}
continue;
}
if ( fend - f > cs ) {
prevChar = f;
if ( cs == 1 ) {
// we only do it for ascii to avoid catering for different rules in different languages
// https://en.wikipedia.org/wiki/Letter_case#Exceptional_letters_and_digraphs
// eg:
// The Greek upper-case letter "Σ" has two different lower-case forms:
// "ς" in word-final position and "σ" elsewhere
if ( !is_alpha_a( *p ) || is_upper_a( *p ) ) {
// non-alpha is counted as caps as well because we're detecting all caps for a sentence
// and comma/quotes/etc. is included
++capCount;
}
// some sites try to be smart and truncate for us, let's remove that
// if there is no space between the dots and the letter
if ( *p == '.' ) {
++dotCount;
} else {
dotCount = 0;
dotPrevChar = f;
}
*f++ = *p;
} else {
dotCount = 0;
dotPrevChar = f;
gbmemcpy( f, p, cs );
f += cs;
}
} else {
trunc = true;
}
pos++;
lastSpace = false;
}
}
/// @todo ALC simplify logic/break into smaller functions
if ( f ) {
// only capitalize first letter in a word for a sentence with all caps
if ( capCount == ( f - fstart ) ) {
bool isFirstLetter = true;
unsigned char cs = 0;
for ( char *c = fstart; c < f; c += cs ) {
cs = getUtf8CharSize(c);
/// @todo ALC configurable minCapCount so we can tweak this as needed
const int minCapCount = 5;
bool isAlpha = is_alpha_utf8( c );
// only capitalize first letter in a word for a sentence with all caps
if ( capCount > minCapCount && capCount == ( f - fstart ) ) {
bool isFirstLetter = true;
if ( isAlpha ) {
if (isFirstLetter) {
isFirstLetter = false;
continue;
}
} else {
isFirstLetter = true;
unsigned char cs = 0;
for ( char *c = fstart; c < f; c += cs ) {
cs = getUtf8CharSize(c);
bool isAlpha = is_alpha_utf8( c );
if ( isAlpha ) {
if (isFirstLetter) {
isFirstLetter = false;
continue;
}
} else {
isFirstLetter = true;
continue;
}
if ( !isFirstLetter ) {
to_lower_utf8(c, c);
}
if ( !isFirstLetter ) {
to_lower_utf8(c, c);
}
}
}
// let's remove ellipsis (...) at the end
if ( dotCount == 3 ) {
if ( is_ascii3( *dotPrevChar ) ) {
switch ( *dotPrevChar ) {
case ',':
trunc = true;
lastBreak = dotPrevChar + 1;
break;
case '!':
case '.':
trunc = false;
f = dotPrevChar + 1;
break;
case ' ':
trunc = false;
/// @todo ALC configurable minRemoveEllipsisLen so we can tweak this as needed
const int minRemoveEllipsisLen = 120;
if ( lastBreak ) {
f = lastBreak;
}
break;
default:
trunc = true;
// let's remove ellipsis (...) at the end
if ( (f - fstart) >= minRemoveEllipsisLen && dotCount == 3 ) {
if ( is_ascii3( *dotPrevChar ) ) {
switch ( *dotPrevChar ) {
case ',':
trunc = true;
lastBreak = dotPrevChar + 1;
break;
case '!':
case '.':
trunc = false;
f = dotPrevChar + 1;
break;
case ' ':
trunc = false;
if ( lastBreakPrevChar ) {
if ( is_ascii3( *( lastBreakPrevChar ) ) ) {
switch ( *( lastBreakPrevChar ) ) {
case '!':
case '.':
trunc = false;
if ( lastBreak ) {
f = lastBreak;
}
break;
default:
trunc = true;
if (lastBreak) {
f = lastBreak;
}
break;
default:
break;
}
if ( lastBreakPrevChar ) {
if ( is_ascii3( *( lastBreakPrevChar ) ) ) {
switch ( *( lastBreakPrevChar ) ) {
case '!':
case '.':
trunc = false;
if (lastBreak) {
f = lastBreak;
}
break;
default:
break;
}
}
break;
}
}
break;
}
}
}
if ( trunc ) {
if ( lastBreak == NULL ) {
*len = 0;
return false;
return 0;
}
if ( f ) {
f = lastBreak;
}
f = lastBreak;
/// @todo ALC we should cater ellipsis for different languages
if ( addEllipsis ) {
if ( (fend - f) > 4 ) {
gbmemcpy ( f , " ..." , 4 );
@ -370,14 +307,121 @@ bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len
}
}
// set pos for the END of the last word here (used in Summary.cpp)
if ( !f ) {
m_pos[nw] = pos;
} else { // NULL terminate f
*len = f - fstart;
*f = '\0';
// NULL terminate f
*f = '\0';
return (f - fstart);
}
// . set m_pos[i] to the position, in CHARACTERS, of word #i for every word
//   in the half-open interval [a,b)
// . this is a character count, not a byte count: every utf8 character counts
//   as one, however many bytes it occupies
// . a run of back-to-back whitespace counts as a single character, an <li>
//   tag counts as one character, any other breaking tag counts as two and
//   non-breaking tags count as nothing
// . m_pos[nw] receives the position of the END of the last word
// . words before "a" get position 0 and words in [b,nw) get the end position,
//   so every entry of m_pos is defined after a successful call
// . "a" and "b" are clamped to [0,nw]; a "b" of -1 means "up to the last word"
// . returns false if the position array could not be allocated
bool Pos::set( Words *words, int32_t a, int32_t b ) {
	// free m_buf in case this is a second call
	reset();

	int32_t nw = words->getNumWords();
	int32_t *wlens = words->m_wordLens;
	nodeid_t *tids = words->getTagIds();
	char **wp = words->m_words;

	// -1 is the default value
	if ( b == -1 ) {
		b = nw;
	}

	// m_pos only has nw+1 slots, so keep the interval inside of them.
	// an out of range "a" or "b" must never make us write outside m_buf.
	if ( a < 0 ) {
		a = 0;
	}
	if ( a > nw ) {
		a = nw;
	}
	if ( b > nw ) {
		b = nw;
	}

	// one slot per word plus one slot for the end of the last word
	int32_t need = (nw + 1) * (int32_t)sizeof(int32_t);

	// use the local buffer when it is big enough, otherwise allocate
	m_needsFree = false;
	m_buf = m_localBuf;
	if ( need > POS_LOCALBUFSIZE ) {
		m_buf = (char *)mmalloc(need,"Pos");
		m_needsFree = true;
	}

	// bail on error
	if ( ! m_buf ) {
		return false;
	}

	m_bufSize = need;
	m_pos = (int32_t *)m_buf;

	// this is the CHARACTER count.
	int32_t pos = 0;

	// flag for stopping back-to-back spaces. only count those as one char.
	bool lastSpace = false;

	// words in front of the interval all sit at the very start
	for ( int32_t i = 0 ; i < a ; ++i ) {
		m_pos[i] = 0;
	}

	for ( int32_t i = a ; i < b ; ++i ) {
		// set pos for the ith word to "pos"
		m_pos[i] = pos;

		// is tag?
		if ( tids && tids[i] ) {
			// if not breaking, does nothing
			if ( !g_nodes[tids[i] & 0x7f].m_isBreaking ) {
				continue;
			}

			// list tag? <li>
			if ( tids[i] == TAG_LI ) {
				++pos;
				lastSpace = false;
				continue;
			}

			// if had a previous breaking tag and no non-tag
			// word after it, do not count back-to-back spaces
			if ( lastSpace ) {
				continue;
			}

			// every remaining breaking tag is counted like a <br>, as the
			// two characters ". " (tids[i] is known to be non-zero here,
			// so there is no separate "single space" case to handle)
			pos += 2;
			lastSpace = true;
			continue;
		}

		// scan through all chars discounting back-to-back spaces
		char *pend = wp[i] + wlens[i];
		unsigned char cs = 0;

		for ( char *p = wp[i]; p < pend; p += cs ) {
			// get size so we step over the whole utf8 character
			cs = getUtf8CharSize(p);

			const bool isSpace = is_wspace_utf8 (p);

			// do not count space if one before
			if ( isSpace && lastSpace ) {
				continue;
			}

			++pos;
			lastSpace = isSpace;
		}
	}

	// words behind the interval, and the slot for the END of the last word,
	// all get the final character count
	for ( int32_t i = ( b > a ? b : a ) ; i <= nw ; ++i ) {
		m_pos[i] = pos;
	}

	return true;
}

16
Pos.h

@ -3,6 +3,8 @@
#ifndef _POS_H_
#define _POS_H_
#include <stdint.h>
// this class is used to measure the number of characters between two "words"
// (as defined in the Words.cpp class) in units of "characters". A utf8
// character can be 1, 2, 3 or 4 bytes, so be careful.
@ -19,22 +21,20 @@ class Pos {
~Pos();
void reset();
bool set(Words *words, bool addEllipsis = false, char *f = NULL, char *fend = NULL,
int32_t *flen = NULL, int32_t a = 0, int32_t b = -1 );
bool set(Words *words, int32_t a = 0, int32_t b = -1 );
// . filter out xml words [a,b] into plain text, stores into "p"
// . will not exceed "pend"
// . returns number of BYTES stored into "p"
int32_t filter(char *p, char *pend, Words *words, int32_t a = 0, int32_t b = -1,
bool addEllipsis = false );
// . filter out xml words in the half-open interval [a,b) into plain text, stores into "f"
// . will not exceed "fend"
// . returns number of BYTES stored into "f"
int32_t filter(Words *words, int32_t a, int32_t b, bool addEllipsis, char *f, char *fend);
// . the position in CHARACTERS of word i is given by m_pos[i]
// . this is NOT the byte position. you can have 2, 3 or even 4
// byte characters in utf8. the purpose here is for counting
// "letters" or "characters" for formatting purposes.
int32_t *m_pos;
int32_t m_numWords;
private:
char m_localBuf [ POS_LOCALBUFSIZE ];
char *m_buf;
int32_t m_bufSize;

@ -75,8 +75,8 @@ bool Summary::verifySummary( char *titleBuf, int32_t titleBufLen ) {
// - meta name = "og:description"
// - meta name = "description"
bool Summary::setFromTags( Xml *xml, int32_t maxSummaryLen, char *titleBuf, int32_t titleBufLen ) {
/// @todo ALC we may want this to be configurable so we can tweak this as needed
int minSummaryLen = (maxSummaryLen / 3);
/// @todo ALC configurable minSummaryLen so we can tweak this as needed
const int minSummaryLen = (maxSummaryLen / 3);
// itemprop = "description"
if ( xml->getTagContent("itemprop", "description", m_summary, MAX_SUMMARY_LEN, minSummaryLen, maxSummaryLen, &m_summaryLen) ) {
@ -512,7 +512,7 @@ bool Summary::set (Xml *xml, Words *words, Sections *sections, Pos *pos, Query *
// . removes back to back spaces
// . converts html entities
// . filters in stores words in [a,b) interval
int32_t len = pos->filter( p, pend, ww, maxa, maxb );
int32_t len = pos->filter( ww, maxa, maxb, false, p, pend );
// break out if did not fit
if ( len == 0 ) {
@ -1078,7 +1078,7 @@ bool Summary::getDefaultSummary ( Xml *xml, Words *words, Sections *sections, Po
}
if (bestStart >= 0 && bestEnd > bestStart){
int32_t len = pos->filter( p, pend - 10, words, bestStart, bestEnd );
int32_t len = pos->filter( words, bestStart, bestEnd, false, p, pend - 10 );
p += len;
if ( len > 0 && p + 3 + 2 < pend ){
// space first?

@ -52,8 +52,8 @@ void Title::reset() {
}
bool Title::setFromTags( Xml *xml, int32_t maxTitleLen ) {
/// @todo ALC we may want this to be configurable so we can tweak this as needed
int minTitleLen = 3;
/// @todo ALC configurable minTitleLen so we can tweak this as needed
const int minTitleLen = 3;
// meta property = "og:title"
if ( xml->getTagContent("property", "og:title", m_title, MAX_TITLE_LEN, minTitleLen, maxTitleLen, &m_titleLen, true, TAG_META) ) {
@ -555,7 +555,7 @@ bool Title::setTitle4 ( XmlDoc *xd, Xml *XML, Words *WW, int32_t maxTitleChars,
}
}
/// @todo we should allow more tags than just title/link
/// @todo ALC we should allow more tags than just title/link
// skip if not a good tag.
if (tid != TAG_TITLE && tid != TAG_A) {
continue;

@ -20,7 +20,7 @@ public:
void reset();
/// @todo correct comments
/// @todo ALC correct comments
// . set m_title to the title of the document represented by "xd"
// . if getHardTitle is true will always use the title in the <title>
// tag, but if that is not present, will try dmoz titles before

@ -66,7 +66,7 @@ iconv_t gbiconv_open( const char *tocode, const char *fromcode) {
}
int gbiconv_close(iconv_t cd) {
/// @todo gbiconv_close currently does nothing
/// @todo ALC gbiconv_close currently does nothing
//int val = iconv_close(cd);
//if (val == 0) g_mem.rmMem((void*)cd, 1, "iconv", 1);
//return val;

@ -128,6 +128,7 @@ bool inline isValidUtf8Char(const char *s) {
// Refer to:
// http://www.unicode.org/charts/
// http://www.unicode.org/Public/UNIDATA/Blocks.txt
// http://www.utf8-chartable.de/
// Emoji & Pictographs
// 2600-26FF: Miscellaneous Symbols
@ -143,31 +144,45 @@ bool inline isValidUtf8Char(const char *s) {
// 1F030-1F09F: Domino Tiles
// 1F0A0-1F0FF: Playing Cards
// Enclosed Alphanumeric Supplement
// 1F1E6-1F1FF: Regional indicator symbols
// Geometric Shapes
// 25A0-25FF: Geometric Shapes
// +--------------------+----------+----------+----------+----------+
// | Code Points | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
// +--------------------+----------+----------+----------+----------+
// | U+2600..U+27BF | E2 | 98..9E | 80..BF | |
// | U+25A0..U+25BF | E2 | 96 | A0..BF | |
// | U+25C0..U+27BF | E2 | 97..9E | 80..BF | |
// | U+1F000..U+1F0FF | F0 | 9F | 80..83 | 80..BF |
// | U+1F1E6..U+1F1FF | F0 | 9F | 87 | A6..BF |
// | U+1F300..U+1F6FF | F0 | 9F | 8C..9B | 80..BF |
// | U+1F900..U+1F9FF | F0 | 9F | A4..A7 | 80..BF |
// +--------------------+----------+----------+----------+----------+
bool inline isUtf8UnwantedSymbols(const char *s) {
const uint8_t *u = (uint8_t*)s;
const uint8_t *u = (uint8_t *)s;
if (u[0] == 0xE2) { // U+2600..U+27BF
if ((u[1] >= 0x98 && u[1] <= 0x9E) &&
(u[2] >= 0x80 && u[2] <= 0xBF)) {
return true;
}
} else if (u[0] == 0xF0 && u[1] == 0x9F) {
if ((u[2] >= 0x80 && u[2] <= 0x83) &&
(u[3] >= 0x80 && u[3] <= 0xBF)) { // U+1F000..U+1F0FF
return true;
} else if ((u[2] >= 0x8C && u[2] <= 0x9B) &&
(u[3] >= 0x80 && u[3] <= 0xBF)) { // U+1F300..U+1F6FF
if ( u[0] == 0xE2 ) {
if ( ( u[1] == 0x96 ) &&
( u[2] >= 0xA0 && u[2] <= 0xBF ) ) {
return true;
} else if ((u[2] >= 0xA4 && u[2] <= 0xA7) &&
(u[3] >= 0x80 && u[3] <= 0xBF)) { // U+1F900..U+1F9FF
} else if ( ( u[1] >= 0x97 && u[1] <= 0x9E ) &&
( u[2] >= 0x80 && u[2] <= 0xBF ) ) { // U+25C0..U+27BF
return true;
}
} else if ( u[0] == 0xF0 && u[1] == 0x9F ) {
if ( ( u[2] >= 0x80 && u[2] <= 0x83 ) &&
( u[3] >= 0x80 && u[3] <= 0xBF ) ) { // U+1F000..U+1F0FF
return true;
} else if ( ( u[2] == 0x87 ) &&
( u[3] >= 0xA6 && u[3] <= 0xBF ) ) { // U+1F1E6..U+1F1FF
return true;
} else if ( ( u[2] >= 0x8C && u[2] <= 0x9B ) &&
( u[3] >= 0x80 && u[3] <= 0xBF ) ) { // U+1F300..U+1F6FF
return true;
} else if ( ( u[2] >= 0xA4 && u[2] <= 0xA7 ) &&
( u[3] >= 0x80 && u[3] <= 0xBF ) ) { // U+1F900..U+1F9FF
return true;
}
}

13
Xml.cpp

@ -959,8 +959,10 @@ static bool inTag ( XmlNode *node, nodeid_t tagId, int *count ) {
static int32_t filterContent ( Words *wp, Pos *pp, char *buf, int32_t bufLen, int32_t minLength, int32_t maxLength ) {
int32_t contentLen = 0;
/// @todo ALC we may want this to be configurable so we can tweak this as needed
if ( wp->getNumWords() > (maxLength * 2) ) {
/// @todo ALC configurable maxNumWord so we can tweak this as needed
const int32_t maxNumWord = (maxLength * 2);
if ( wp->getNumWords() > maxNumWord ) {
// ignore too long snippet
// it may not be that useful to get the first x characters from a long snippet
contentLen = 0;
@ -969,12 +971,7 @@ static int32_t filterContent ( Words *wp, Pos *pp, char *buf, int32_t bufLen, in
return contentLen;
}
char *bufEnd = buf + maxLength + 4; // plus ellipsis
if ( bufEnd > buf + bufLen ) {
bufEnd = buf + bufLen;
}
contentLen = pp->filter( buf, bufEnd, wp, 0, wp->getNumWords(), true );
contentLen = pp->filter( wp, 0, wp->getNumWords(), true, buf, buf + maxLength );
if ( contentLen < minLength ) {
// ignore too short descriptions

@ -29839,7 +29839,7 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
if ( p + len + 1 < pend ) {
// store it
// FILTER the html entities!!
int32_t len2 = pos->filter( p, pend, ww, a, b );
int32_t len2 = pos->filter( ww, a, b, false, p, pend );
// ensure NULL terminated
p[len2] = '\0';
@ -30207,7 +30207,7 @@ Summary *XmlDoc::getSummary () {
return (Summary *)ct;
}
/// @todo fill in summary for XML document
/// @todo ALC fill in summary for XML document
// xml and json docs have empty summaries for now
if ( *ct == CT_JSON || *ct == CT_XML ) {
m_summaryValid = true;
@ -30614,7 +30614,7 @@ SafeBuf *XmlDoc::getSampleForGigabits ( ) {
// if match would send us over, we are done
if ( p + len >= pend ) break;
len = pos->filter( p, pend, m->m_words, a, b );
len = pos->filter( m->m_words, a, b, false, p, pend );
// for debug (mdw)
//log("query: gigabitsample#%"INT32"=%s",i,p);

@ -29,7 +29,7 @@ TEST( PosTest, FilterAllCaps ) {
ASSERT_TRUE( words.set( input_strs[i], true, 0 ) );
int32_t len = pos.filter( buf, buf + MAX_BUF_SIZE, &words );
int32_t len = pos.filter( &words, 0, words.getNumWords(), false, buf, buf + MAX_BUF_SIZE );
EXPECT_EQ( strlen( expected_output[i] ), len );
EXPECT_STREQ( expected_output[i], buf );
@ -54,7 +54,12 @@ TEST( PosTest, FilterEnding ) {
"Computer programming is tremendous fun. Li...",
"Premature optimization is the root of all evil."
"Premature optimization is the root of all evil.",
"As soon as we started programming, we found to our surprise that it wasn't as easy to get programs "
"right as we had thought. Debugging had to be discovered. I can remember the exact instant when I "
"realized that a large part of my life from then on was going to be spent in finding mistakes in my "
"own programs. "
};
const char *expected_output[] = {
@ -74,7 +79,10 @@ TEST( PosTest, FilterEnding ) {
"Computer programming is tremendous fun.",
"Premature optimization is the root of all evil."
"Premature optimization is the root of all evil.",
"As soon as we started programming, we found to our surprise that it wasn't as easy to get programs "
"right as we had thought. Debugging had to be discovered. I can remember the ..."
};
ASSERT_EQ( sizeof( input_strs ) / sizeof( input_strs[0] ),
@ -88,9 +96,9 @@ TEST( PosTest, FilterEnding ) {
ASSERT_TRUE( words.set( input_strs[i], true, 0 ) );
int32_t len = pos.filter( buf, buf + 180 + 4, &words, 0, -1, true );
int32_t len = pos.filter( &words, 0, -1, true, buf, buf + 180 );
//EXPECT_EQ( strlen( expected_output[i] ), len );
EXPECT_EQ( strlen( expected_output[i] ), len );
EXPECT_STREQ( expected_output[i], buf );
}
}

@ -173,6 +173,15 @@ TEST(UnicodeTest, UnwantedSymbols) {
"🂠",
"🃿",
// Enclosed Alphanumeric Supplement
// 1F1E6-1F1FF: Regional indicator symbols
"🇦",
"🇿",
// Geometric Shapes
// 25A0-25FF: Geometric Shapes
"",
"",
};
size_t len = sizeof(inputs) / sizeof(inputs[0]);