Improve title/summary for youtube

2025-07-16 02:46:08 -04:00 · 2016-01-20 13:32:13 +01:00
parent 40af20df8d
commit ac8249e07d
11 changed files with 332 additions and 259 deletions
--- a/Pos.cpp
+++ b/Pos.cpp
@ -17,62 +17,19 @@ void Pos::reset() {
 	if ( m_buf && m_needsFree )
 		mfree ( m_buf , m_bufSize , "Pos" );
 	m_buf = NULL;
-}	
-
-// . the interval is half-open [a,b)
-// . do not print out any alnum word with negative score
-int32_t Pos::filter( char *p, char *pend, Words *words, int32_t a, int32_t b, bool addEllipsis ) {
-	int32_t plen = 0;
-	set ( words , addEllipsis, p , pend, &plen , a , b );
-	return plen;
 }

-// . set the filtered position of each word
-// . used by Summary.cpp to determine how many chars are in the summary,
-//   be those chars single byte or utf8 chars that are 4 bytes 
-// . returns false and sets g_errno on error
-// . if f is non-NULL store filtered words into there. back to back spaces
-//   are eliminated.
-bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len, int32_t a, int32_t b ) {
-	// free m_buf in case this is a second call
-	if ( ! f ) {
-		reset();
-	}
-
-	int32_t nw = words->getNumWords();
-	int32_t *wlens = words->m_wordLens;
+int32_t Pos::filter( Words *words, int32_t a, int32_t b, bool addEllipsis, char *f, char *fend ) {
 	nodeid_t *tids = words->getTagIds(); // m_tagIds;
-	char **wp = words->m_words;

 	// save start point for filtering
 	char *fstart = f;

 	// -1 is the default value
 	if ( b == -1 ) {
-		b = nw;
+		b = words->getNumWords();
 	}

-	// alloc array if need to
-	int32_t need = (nw+1) * 4;
-
-	// do not destroy m_pos/m_numWords if only filtering into a buffer
-	if ( !f ) {
-		m_needsFree = false;
-
-		m_buf = m_localBuf;
-		if ( need > POS_LOCALBUFSIZE ) {
-			m_buf = (char *)mmalloc(need,"Pos");
-			m_needsFree = true;
-		}
-		// bail on error
-		if ( ! m_buf ) return false;
-		m_bufSize = need;
-		m_pos      = (int32_t *)m_buf;
-		m_numWords = nw;
-	}
-
-	// this is the CHARACTER count. 
-	int32_t pos = 0;
 	bool trunc = false;

 	static const int32_t maxCharSize = 4; // we are utf8
@ -91,31 +48,23 @@ bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len
 	int dotCount = 0; // store last encountered total consecutive dots
 	char* dotPrevChar = NULL; // store char before dot which is not a space

-	for ( int32_t i = a ; i < b ; i++ ) {
+	for ( int32_t i = a ; i < b ; ++i ) {
 		if (trunc) {
 			break;
 		}

-		// set pos for the ith word to "pos"
-		if ( ! f ) {
-			m_pos[i] = pos;
-		}
-
 		// is tag?
 		if ( tids && tids[i] ) {
-			// filtering into buffer (when generating summaries)
-			if ( f ) {
-				// let's not get from bad tags
-				if ( ( tids[i] == TAG_STYLE ) || ( tids[i] == TAG_SCRIPT ) ) {
-					++inBadTags;
-					continue;
-				}
+			// let's not get from bad tags
+			if ( ( tids[i] == TAG_STYLE ) || ( tids[i] == TAG_SCRIPT ) ) {
+				++inBadTags;
+				continue;
+			}

-				if ( inBadTags ) {
-					if ( ( ( tids[i] & BACKBITCOMP ) == TAG_STYLE ) ||
-					     ( ( tids[i] & BACKBITCOMP ) == TAG_SCRIPT ) ) {
-						--inBadTags;
-					}
+			if ( inBadTags ) {
+				if ( ( ( tids[i] & BACKBITCOMP ) == TAG_STYLE ) ||
+				     ( ( tids[i] & BACKBITCOMP ) == TAG_SCRIPT ) ) {
+					--inBadTags;
 				}
 			}

@ -126,14 +75,15 @@ bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len

 			// list tag? <li>
 			if ( tids[i] == TAG_LI ) {
-				if ( f ) {
-					if ( ( fend - f > maxCharSize ) ) {
-						*f++ = '*';
-					} else {
-						trunc = true;
-					}
+				if ( ( fend - f > maxCharSize ) ) {
+					*f++ = '*';
+
+					// counted as caps because we're detecting all caps for a sentence
+					++capCount;
+				} else {
+					trunc = true;
 				}
-				pos++;
+
 				lastSpace = false;
 				continue;
 			}
@ -146,35 +96,29 @@ bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len

 			// if had a br tag count it as a '.'
 			if ( tids[i] ) { // <br>
-				// are we filtering?
-				if ( f && f != fstart ) {
+				if ( f != fstart ) {
 					if ( ( fend - f > 2 * maxCharSize ) ) {
 						*f++ = '.';
 						*f++ = ' ';
+
+						// counted as caps because we're detecting all caps for a sentence
+						capCount += 2;
 					} else {
 						trunc = true;
 					}
 				}

-				// no, just single period.
-				pos += 2;
 				lastSpace = true;

 				continue;
 			}

-			// are we filtering?
-			if ( f ) {
-				if ( ( fend - f > maxCharSize ) ) {
-					*f++ = ' ';
-				} else {
-					trunc = true;
-				}
+			if ( ( fend - f > maxCharSize ) ) {
+				*f++ = ' ';
+			} else {
+				trunc = true;
 			}

-			// count as a single space
-			pos++;
-
 			// do not allow back-to-back spaces
 			lastSpace = true;

@ -187,22 +131,19 @@ bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len
 		}

 		// scan through all chars discounting back-to-back spaces
-		char *pend = wp[i] + wlens[i];
+		char *pend = words->getWord(i) + words->getWordLen(i);
 		unsigned char cs = 0;

 		char *p    = NULL ;

 		// assume filters out to the same # of chars
-		for ( p = wp[i]; p < pend; p += cs ) {
+		for ( p = words->getWord(i); p < pend; p += cs ) {
 			// get size
 			cs = getUtf8CharSize(p);

-			// filtering into buffer (when generating summaries)
-			if ( f ) {
-				// skip unwanted character
-				if ( isUtf8UnwantedSymbols( p ) ) {
-					continue;
-				}
+			// skip unwanted character
+			if ( isUtf8UnwantedSymbols( p ) ) {
+				continue;
 			}

 			// do not count space if one before
@ -213,155 +154,151 @@ bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len

 				lastSpace = true;

-				// are we filtering?
-				if ( f ) {
-					if ( fend - f > 1 ) {
-						lastBreakPrevChar = prevChar;
+				if ( fend - f > 1 ) {
+					lastBreakPrevChar = prevChar;

-						lastBreak = f;
-						*f++ = ' ';
+					lastBreak = f;
+					*f++ = ' ';

-						// space is counted as caps as well because we're detecting all caps for a sentence
-						++capCount;
+					// counted as caps because we're detecting all caps for a sentence
+					++capCount;

-						dotCount = 0;
+					dotCount = 0;

-						// we don't store space as dotPreviousChar because we want to strip ' ...' as well
-					} else {
-						trunc = true;
-					}
-				}
-
-				++pos;
-				continue;
-			}
-
-			if ( f ) {
-				if ( fend - f > cs ) {
-					prevChar = f;
-
-					if ( cs == 1 ) {
-						// we only do it for ascii to avoid catering for different rules in different languages
-						// https://en.wikipedia.org/wiki/Letter_case#Exceptional_letters_and_digraphs
-						// eg:
-						//   The Greek upper-case letter "Σ" has two different lower-case forms:
-						//     "ς" in word-final position and "σ" elsewhere
-						if ( !is_alpha_a( *p ) || is_upper_a( *p ) ) {
-							// non-alpha is counted as caps as well because we're detecting all caps for a sentence
-							// and comma/quotes/etc. is included
-							++capCount;
-						}
-
-						// some sites try to be smart and truncate for us, let's remove that
-						// if if there are no space between dots and letter
-						if ( *p == '.' ) {
-							++dotCount;
-						} else {
-							dotCount = 0;
-							dotPrevChar = f;
-						}
-
-						*f++ = *p;
-					} else {
-						dotCount = 0;
-						dotPrevChar = f;
-
-						gbmemcpy( f, p, cs );
-						f += cs;
-					}
+					// we don't store space as dotPreviousChar because we want to strip ' ...' as well
 				} else {
 					trunc = true;
 				}
+
+				continue;
+			}
+
+			if ( fend - f > cs ) {
+				prevChar = f;
+
+				if ( cs == 1 ) {
+					// we only do it for ascii to avoid catering for different rules in different languages
+					// https://en.wikipedia.org/wiki/Letter_case#Exceptional_letters_and_digraphs
+					// eg:
+					//   The Greek upper-case letter "Σ" has two different lower-case forms:
+					//     "ς" in word-final position and "σ" elsewhere
+					if ( !is_alpha_a( *p ) || is_upper_a( *p ) ) {
+						// non-alpha is counted as caps as well because we're detecting all caps for a sentence
+						// and comma/quotes/etc. is included
+						++capCount;
+					}
+
+					// some sites try to be smart and truncate for us, let's remove that
+					// if there is no space between the dots and the letter
+					if ( *p == '.' ) {
+						++dotCount;
+					} else {
+						dotCount = 0;
+						dotPrevChar = f;
+					}
+
+					*f++ = *p;
+				} else {
+					dotCount = 0;
+					dotPrevChar = f;
+
+					gbmemcpy( f, p, cs );
+					f += cs;
+				}
+			} else {
+				trunc = true;
 			}

-			pos++; 
 			lastSpace = false;
 		}
 	}

 	/// @todo ALC simplify logic/break into smaller functions
-	if ( f ) {
-		// only capitalize first letter in a word for a sentence with all caps
-		if ( capCount == ( f - fstart ) ) {
-			bool isFirstLetter = true;

-			unsigned char cs = 0;
-			for ( char *c = fstart; c < f; c += cs ) {
-				cs = getUtf8CharSize(c);
+	/// @todo ALC configurable minCapCount so we can tweak this as needed
+	const int minCapCount = 5;

-				bool isAlpha = is_alpha_utf8( c );
+	// only capitalize first letter in a word for a sentence with all caps
+	if ( capCount > minCapCount && capCount == ( f - fstart ) ) {
+		bool isFirstLetter = true;

-				if ( isAlpha ) {
-					if (isFirstLetter) {
-						isFirstLetter = false;
-						continue;
-					}
-				} else {
-					isFirstLetter = true;
+		unsigned char cs = 0;
+		for ( char *c = fstart; c < f; c += cs ) {
+			cs = getUtf8CharSize(c);
+
+			bool isAlpha = is_alpha_utf8( c );
+
+			if ( isAlpha ) {
+				if (isFirstLetter) {
+					isFirstLetter = false;
 					continue;
 				}
+			} else {
+				isFirstLetter = true;
+				continue;
+			}

-				if ( !isFirstLetter ) {
-					to_lower_utf8(c, c);
-				}
+			if ( !isFirstLetter ) {
+				to_lower_utf8(c, c);
 			}
 		}
+	}

-		// let's remove ellipsis (...) at the end
-		if ( dotCount == 3 ) {
-			if ( is_ascii3( *dotPrevChar ) ) {
-				switch ( *dotPrevChar ) {
-					case ',':
-						trunc = true;
-						lastBreak = dotPrevChar + 1;
-						break;
-					case '!':
-					case '.':
-						trunc = false;
-						f = dotPrevChar + 1;
-						break;
-					case ' ':
-						trunc = false;
+	/// @todo ALC configurable minRemoveEllipsisLen so we can tweak this as needed
+	const int minRemoveEllipsisLen = 120;

-						if ( lastBreak ) {
-							f = lastBreak;
-						}
-						break;
-					default:
-						trunc = true;
+	// let's remove ellipsis (...) at the end
+	if ( (f - fstart) >= minRemoveEllipsisLen && dotCount == 3 ) {
+		if ( is_ascii3( *dotPrevChar ) ) {
+			switch ( *dotPrevChar ) {
+				case ',':
+					trunc = true;
+					lastBreak = dotPrevChar + 1;
+					break;
+				case '!':
+				case '.':
+					trunc = false;
+					f = dotPrevChar + 1;
+					break;
+				case ' ':
+					trunc = false;

-						if ( lastBreakPrevChar ) {
-							if ( is_ascii3( *( lastBreakPrevChar ) ) ) {
-								switch ( *( lastBreakPrevChar ) ) {
-									case '!':
-									case '.':
-										trunc = false;
+					if ( lastBreak ) {
+						f = lastBreak;
+					}
+					break;
+				default:
+					trunc = true;

-										if (lastBreak) {
-											f = lastBreak;
-										}
-										break;
-									default:
-										break;
-								}
+					if ( lastBreakPrevChar ) {
+						if ( is_ascii3( *( lastBreakPrevChar ) ) ) {
+							switch ( *( lastBreakPrevChar ) ) {
+								case '!':
+								case '.':
+									trunc = false;
+
+									if (lastBreak) {
+										f = lastBreak;
+									}
+									break;
+								default:
+									break;
 							}
 						}
-						break;
-				}
+					}
+					break;
 			}
 		}
 	}

 	if ( trunc ) {
 		if ( lastBreak == NULL ) {
-			*len = 0;
-			return false;
+			return 0;
 		}

-		if ( f ) {
-			f = lastBreak;
-		}
+		f = lastBreak;

+		/// @todo ALC we should cater for ellipsis in different languages
 		if ( addEllipsis ) {
 			if ( (fend - f) > 4 ) {
 				gbmemcpy ( f , " ..." , 4 );
@ -370,14 +307,121 @@ bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len
 		}
 	}

-	// set pos for the END of the last word here (used in Summary.cpp)
-	if ( !f ) {
-		m_pos[nw] = pos;
-	} else { // NULL terminate f
-		*len = f - fstart;
-		*f = '\0';
+	// NULL terminate f
+	*f = '\0';
+
+	return (f - fstart);
+}
+
+bool Pos::set( Words *words, int32_t a, int32_t b ) {
+	// free m_buf in case this is a second call
+	reset();
+
+	int32_t nw = words->getNumWords();
+	int32_t *wlens = words->m_wordLens;
+	nodeid_t *tids = words->getTagIds(); // m_tagIds;
+	char **wp = words->m_words;
+
+	// -1 is the default value
+	if ( b == -1 ) {
+		b = nw;
 	}

-	// Success
+	// alloc array if need to
+	int32_t need = (nw+1) * 4;
+
+	// default to the local buffer (nothing to free); only heap-allocate if it is too small
+	m_needsFree = false;
+
+	m_buf = m_localBuf;
+	if ( need > POS_LOCALBUFSIZE ) {
+		m_buf = (char *)mmalloc(need,"Pos");
+		m_needsFree = true;
+	}
+
+	// bail on error
+	if ( ! m_buf ) {
+		return false;
+	}
+
+	m_bufSize = need;
+	m_pos      = (int32_t *)m_buf;
+
+	// this is the CHARACTER count.
+	int32_t pos = 0;
+
+	// flag for stopping back-to-back spaces. only count those as one char.
+	bool lastSpace = false;
+
+	for ( int32_t i = a ; i < b ; i++ ) {
+		// set pos for the ith word to "pos"
+		m_pos[i] = pos;
+
+		// is tag?
+		if ( tids && tids[i] ) {
+			// if not breaking, does nothing
+			if ( !g_nodes[tids[i] & 0x7f].m_isBreaking ) {
+				continue;
+			}
+
+			// list tag? <li>
+			if ( tids[i] == TAG_LI ) {
+				++pos;
+				lastSpace = false;
+				continue;
+			}
+
+			// if had a previous breaking tag and no non-tag
+			// word after it, do not count back-to-back spaces
+			if ( lastSpace ) {
+				continue;
+			}
+
+			// if had a br tag count it as a '. '
+			if ( tids[i] ) { // <br>
+				pos += 2;
+				lastSpace = true;
+
+				continue;
+			}
+
+			// count as a single space
+			pos++;
+
+			// do not allow back-to-back spaces
+			lastSpace = true;
+
+			continue;
+		}
+
+		// scan through all chars discounting back-to-back spaces
+		char *pend = wp[i] + wlens[i];
+		unsigned char cs = 0;
+
+		// assume filters out to the same # of chars
+		for ( char *p = wp[i]; p < pend; p += cs ) {
+			// get size
+			cs = getUtf8CharSize(p);
+
+			// do not count space if one before
+			if ( is_wspace_utf8 (p) ) {
+				if ( lastSpace ) {
+					continue;
+				}
+
+				lastSpace = true;
+
+				++pos;
+				continue;
+			}
+
+			++pos;
+			lastSpace = false;
+		}
+	}
+
+	// set pos for the END of the last word here
+	m_pos[nw] = pos;
+
 	return true;
 }
--- a/Pos.h
+++ b/Pos.h
@ -3,6 +3,8 @@
 #ifndef _POS_H_
 #define _POS_H_

+#include <stdint.h>
+
 // this class is used to measure the number of characters between two "words"
 // (as defined in the Words.cpp class) in units of "characters". A utf8
 // character can be 1, 2, 3 or 4 bytes, so be careful.
@ -19,22 +21,20 @@ class Pos {
 	~Pos();
 	void reset();

-	bool set(Words *words, bool addEllipsis = false, char *f = NULL, char *fend = NULL,
-			  int32_t *flen = NULL, int32_t a = 0, int32_t b = -1 );
+	bool set(Words *words, int32_t a = 0, int32_t b = -1 );

-	// . filter out xml words [a,b] into plain text, stores into "p"
-	// . will not exceed "pend"
-	// . returns number of BYTES stored into "p"
-	int32_t filter(char *p, char *pend, Words *words, int32_t a = 0, int32_t b = -1,
-					bool addEllipsis = false );
+	// . filter out xml words [a,b] into plain text, stores into "f"
+	// . will not exceed "fend"
+	// . returns number of BYTES stored into "f"
+	int32_t filter(Words *words, int32_t a, int32_t b, bool addEllipsis, char *f, char *fend);

 	// . the position in CHARACTERS of word i is given by m_pos[i]
 	// . this is NOT the byte position. you can have 2, 3 or even 4
 	//   byte characters in utf8. the purpose here is for counting 
 	//   "letters" or "characters" for formatting purposes.
 	int32_t *m_pos;
-	int32_t  m_numWords;

+private:
 	char  m_localBuf [ POS_LOCALBUFSIZE ];
 	char *m_buf;
 	int32_t  m_bufSize;
--- a/Summary.cpp
+++ b/Summary.cpp
@ -75,8 +75,8 @@ bool Summary::verifySummary( char *titleBuf, int32_t titleBufLen ) {
 // - meta name = "og:description"
 // - meta name = "description"
 bool Summary::setFromTags( Xml *xml, int32_t maxSummaryLen, char *titleBuf, int32_t titleBufLen ) {
-	/// @todo ALC we may want this to be configurable so we can tweak this as needed
-	int minSummaryLen = (maxSummaryLen / 3);
+	/// @todo ALC configurable minSummaryLen so we can tweak this as needed
+	const int minSummaryLen = (maxSummaryLen / 3);

 	// itemprop = "description"
 	if ( xml->getTagContent("itemprop", "description", m_summary, MAX_SUMMARY_LEN, minSummaryLen, maxSummaryLen, &m_summaryLen) ) {
@ -512,7 +512,7 @@ bool Summary::set (Xml *xml, Words *words, Sections *sections, Pos *pos, Query *
 		// . removes back to back spaces
 		// . converts html entities
 		// . filters in stores words in [a,b) interval
-		int32_t len = pos->filter( p, pend, ww, maxa, maxb );
+		int32_t len = pos->filter( ww, maxa, maxb, false, p, pend );

 		// break out if did not fit
 		if ( len == 0 ) {
@ -1078,7 +1078,7 @@ bool Summary::getDefaultSummary ( Xml *xml, Words *words, Sections *sections, Po
 	}

 	if (bestStart >= 0 && bestEnd > bestStart){
-		int32_t len = pos->filter( p, pend - 10, words, bestStart, bestEnd );
+		int32_t len = pos->filter( words, bestStart, bestEnd, false, p, pend - 10 );
 		p += len;
 		if ( len > 0 && p + 3 + 2 < pend ){
 			// space first?
--- a/Title.cpp
+++ b/Title.cpp
@ -52,8 +52,8 @@ void Title::reset() {
 }

 bool Title::setFromTags( Xml *xml, int32_t maxTitleLen ) {
-	/// @todo ALC we may want this to be configurable so we can tweak this as needed
-	int minTitleLen = 3;
+	/// @todo ALC configurable minTitleLen so we can tweak this as needed
+	const int minTitleLen = 3;

 	// meta property = "og:title"
 	if ( xml->getTagContent("property", "og:title", m_title, MAX_TITLE_LEN, minTitleLen, maxTitleLen, &m_titleLen, true, TAG_META) ) {
@ -555,7 +555,7 @@ bool Title::setTitle4 ( XmlDoc *xd, Xml *XML, Words *WW, int32_t maxTitleChars,
 			}
 		}

-		/// @todo we should allow more tags than just title/link
+		/// @todo ALC we should allow more tags than just title/link
 		// skip if not a good tag.
 		if (tid != TAG_TITLE && tid != TAG_A) {
 			continue;
--- a/Title.h
+++ b/Title.h
@ -20,7 +20,7 @@ public:

 	void reset();

-	/// @todo correct comments
+	/// @todo ALC correct comments
 	// . set m_title to the title of the document represented by "xd"
 	// . if getHardTitle is true will always use the title in the <title>
 	//   tag, but if that is not present, will try dmoz titles before
--- a/Unicode.cpp
+++ b/Unicode.cpp
@ -66,7 +66,7 @@ iconv_t gbiconv_open( const char *tocode, const char *fromcode) {
 }

 int gbiconv_close(iconv_t cd) {
-	/// @todo gbiconv_close currently does nothing
+	/// @todo ALC gbiconv_close currently does nothing
 	//int val = iconv_close(cd);
 	//if (val  == 0) g_mem.rmMem((void*)cd, 1, "iconv", 1);
 	//return val;	
--- a/Unicode.h
+++ b/Unicode.h
@ -128,6 +128,7 @@ bool inline isValidUtf8Char(const char *s) {
 // Refer to:
 // http://www.unicode.org/charts/
 // http://www.unicode.org/Public/UNIDATA/Blocks.txt
+// http://www.utf8-chartable.de/

 // Emoji & Pictographs
 // 2600–26FF: Miscellaneous Symbols
@ -143,31 +144,45 @@ bool inline isValidUtf8Char(const char *s) {
 // 1F030–1F09F: Domino Tiles
 // 1F0A0–1F0FF: Playing Cards

+// Enclosed Alphanumeric Supplement
+// 1F1E6–1F1FF: Regional indicator symbols
+
+// Geometric Shapes
+// 25A0–25FF: Geometric Shapes
+
 // +--------------------+----------+----------+----------+----------+
 // | Code Points        | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
 // +--------------------+----------+----------+----------+----------+
-// | U+2600..U+27BF     | E2       | 98..9E   | 80..BF   |          |
+// | U+25A0..U+25BF     | E2       | 96       | A0..BF   |          |
+// | U+25C0..U+27BF     | E2       | 97..9E   | 80..BF   |          |
 // | U+1F000..U+1F0FF   | F0       | 9F       | 80..83   | 80..BF   |
+// | U+1F1E6..U+1F1FF   | F0       | 9F       | 87       | A6..BF   |
 // | U+1F300..U+1F6FF   | F0       | 9F       | 8C..9B   | 80..BF   |
 // | U+1F900..U+1F9FF   | F0       | 9F       | A4..A7   | 80..BF   |
 // +--------------------+----------+----------+----------+----------+
 bool inline isUtf8UnwantedSymbols(const char *s) {
-	const uint8_t *u = (uint8_t*)s;
+	const uint8_t *u = (uint8_t *)s;

-	if (u[0] == 0xE2) { // U+2600..U+27BF
-		if ((u[1] >= 0x98 && u[1] <= 0x9E) &&
-		    (u[2] >= 0x80 && u[2] <= 0xBF)) {
-		    return true;
-		}
-	} else if (u[0] == 0xF0 && u[1] == 0x9F) {
-		if ((u[2] >= 0x80 && u[2] <= 0x83) &&
-		    (u[3] >= 0x80 && u[3] <= 0xBF)) { // U+1F000..U+1F0FF
-		    return true;
-		} else if ((u[2] >= 0x8C && u[2] <= 0x9B) &&
-		           (u[3] >= 0x80 && u[3] <= 0xBF)) { // U+1F300..U+1F6FF
+	if ( u[0] == 0xE2 ) {
+		if ( ( u[1] == 0x96 ) &&
+		     ( u[2] >= 0xA0 && u[2] <= 0xBF ) ) {
 			return true;
-		} else if ((u[2] >= 0xA4 && u[2] <= 0xA7) &&
-		           (u[3] >= 0x80 && u[3] <= 0xBF)) { // U+1F900..U+1F9FF
+		} else if ( ( u[1] >= 0x97 && u[1] <= 0x9E ) &&
+		            ( u[2] >= 0x80 && u[2] <= 0xBF ) ) { // U+25C0..U+27BF
+			return true;
+		}
+	} else if ( u[0] == 0xF0 && u[1] == 0x9F ) {
+		if ( ( u[2] >= 0x80 && u[2] <= 0x83 ) &&
+		     ( u[3] >= 0x80 && u[3] <= 0xBF ) ) { // U+1F000..U+1F0FF
+			return true;
+		} else if ( ( u[2] == 0x87 ) &&
+		            ( u[3] >= 0xA6 && u[3] <= 0xBF ) ) { // U+1F1E6..U+1F1FF
+			return true;
+		} else if ( ( u[2] >= 0x8C && u[2] <= 0x9B ) &&
+					( u[3] >= 0x80 && u[3] <= 0xBF ) ) { // U+1F300..U+1F6FF
+			return true;
+		} else if ( ( u[2] >= 0xA4 && u[2] <= 0xA7 ) &&
+					( u[3] >= 0x80 && u[3] <= 0xBF ) ) { // U+1F900..U+1F9FF
 			return true;
 		}
 	}
--- a/Xml.cpp
+++ b/Xml.cpp
@ -959,8 +959,10 @@ static bool inTag ( XmlNode *node, nodeid_t tagId, int *count ) {
 static int32_t filterContent ( Words *wp, Pos *pp, char *buf, int32_t bufLen, int32_t minLength, int32_t maxLength ) {
 	int32_t contentLen = 0;

-	/// @todo ALC we may want this to be configurable so we can tweak this as needed
-	if ( wp->getNumWords() > (maxLength * 2) ) {
+	/// @todo ALC configurable maxNumWord so we can tweak this as needed
+	const int32_t maxNumWord = (maxLength * 2);
+
+	if ( wp->getNumWords() > maxNumWord ) {
 		// ignore too long snippet
 		// it may not be that useful to get the first x characters from a long snippet
 		contentLen = 0;
@ -969,12 +971,7 @@ static int32_t filterContent ( Words *wp, Pos *pp, char *buf, int32_t bufLen, in
 		return contentLen;
 	}

-	char *bufEnd = buf + maxLength + 4; // plus ellipsis
-	if ( bufEnd > buf + bufLen ) {
-		bufEnd = buf + bufLen;
-	}
-
-	contentLen = pp->filter( buf, bufEnd, wp, 0, wp->getNumWords(), true );
+	contentLen = pp->filter( wp, 0, wp->getNumWords(), true, buf, buf + maxLength );

 	if ( contentLen < minLength ) {
 		// ignore too short descriptions
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -29839,7 +29839,7 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
 	if ( p + len + 1 < pend ) {
 		// store it
 		// FILTER the html entities!!
-		int32_t len2 = pos->filter( p, pend, ww, a, b );
+		int32_t len2 = pos->filter( ww, a, b, false, p, pend );

 		// ensure NULL terminated
 		p[len2] = '\0';
@ -30207,7 +30207,7 @@ Summary *XmlDoc::getSummary () {
 		return (Summary *)ct;
 	}

-	/// @todo fill in summary for XML document
+	/// @todo ALC fill in summary for XML document
 	// xml and json docs have empty summaries for now
 	if ( *ct == CT_JSON || *ct == CT_XML ) {
 		m_summaryValid = true;
@ -30614,7 +30614,7 @@ SafeBuf *XmlDoc::getSampleForGigabits ( ) {
 		// if match would send us over, we are done
 		if ( p + len >= pend ) break;

-		len = pos->filter( p, pend, m->m_words, a, b );
+		len = pos->filter( m->m_words, a, b, false, p, pend );

 		// for debug (mdw)
 		//log("query: gigabitsample#%"INT32"=%s",i,p);
--- a/test/unit/PosTest.cpp
+++ b/test/unit/PosTest.cpp
@ -29,7 +29,7 @@ TEST( PosTest, FilterAllCaps ) {

 		ASSERT_TRUE( words.set( input_strs[i], true, 0 ) );

-		int32_t len = pos.filter( buf, buf + MAX_BUF_SIZE, &words );
+		int32_t len = pos.filter( &words, 0, words.getNumWords(), false, buf, buf + MAX_BUF_SIZE );

 		EXPECT_EQ( strlen( expected_output[i] ), len );
 		EXPECT_STREQ( expected_output[i], buf );
@ -54,7 +54,12 @@ TEST( PosTest, FilterEnding ) {

 		"Computer programming is tremendous fun. Li...",

-		"Premature optimization is the root of all evil."
+		"Premature optimization is the root of all evil.",
+
+		"As soon as we started programming, we found to our surprise that it wasn't as easy to get programs "
+		"right as we had thought. Debugging had to be discovered. I can remember the exact instant when I "
+		"realized that a large part of my life from then on was going to be spent in finding mistakes in my "
+		"own programs. "
 	};

 	const char *expected_output[] = {
@ -74,7 +79,10 @@ TEST( PosTest, FilterEnding ) {

 		"Computer programming is tremendous fun.",

-		"Premature optimization is the root of all evil."
+		"Premature optimization is the root of all evil.",
+
+		"As soon as we started programming, we found to our surprise that it wasn't as easy to get programs "
+		"right as we had thought. Debugging had to be discovered. I can remember the ..."
 	};

 	ASSERT_EQ( sizeof( input_strs ) / sizeof( input_strs[0] ),
@ -88,9 +96,9 @@ TEST( PosTest, FilterEnding ) {

 		ASSERT_TRUE( words.set( input_strs[i], true, 0 ) );

-		int32_t len = pos.filter( buf, buf + 180 + 4, &words, 0, -1, true );
+		int32_t len = pos.filter( &words, 0, -1, true, buf, buf + 180 );

-		//EXPECT_EQ( strlen( expected_output[i] ), len );
+		EXPECT_EQ( strlen( expected_output[i] ), len );
 		EXPECT_STREQ( expected_output[i], buf );
 	}
 }
--- a/test/unit/UnicodeTest.cpp
+++ b/test/unit/UnicodeTest.cpp
@ -173,6 +173,15 @@ TEST(UnicodeTest, UnwantedSymbols) {
 	    "🂠",
 	    "🃿",

+	    // Enclosed Alphanumeric Supplement
+		// 1F1E6–1F1FF: Regional indicator symbols
+	    "🇦",
+	    "🇿",
+
+	    // Geometric Shapes
+		// 25A0–25FF: Geometric Shapes
+	    "■",
+	    "◿",
 	};

 	size_t len = sizeof(inputs) / sizeof(inputs[0]);