mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-16 02:46:08 -04:00
Improve title/summary for youtube
This commit is contained in:
466
Pos.cpp
466
Pos.cpp
@ -17,62 +17,19 @@ void Pos::reset() {
|
||||
if ( m_buf && m_needsFree )
|
||||
mfree ( m_buf , m_bufSize , "Pos" );
|
||||
m_buf = NULL;
|
||||
}
|
||||
|
||||
// . the interval is half-open [a,b)
|
||||
// . do not print out any alnum word with negative score
|
||||
int32_t Pos::filter( char *p, char *pend, Words *words, int32_t a, int32_t b, bool addEllipsis ) {
|
||||
int32_t plen = 0;
|
||||
set ( words , addEllipsis, p , pend, &plen , a , b );
|
||||
return plen;
|
||||
}
|
||||
|
||||
// . set the filtered position of each word
|
||||
// . used by Summary.cpp to determine how many chars are in the summary,
|
||||
// be those chars single byte or utf8 chars that are 4 bytes
|
||||
// . returns false and sets g_errno on error
|
||||
// . if f is non-NULL store filtered words into there. back to back spaces
|
||||
// are eliminated.
|
||||
bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len, int32_t a, int32_t b ) {
|
||||
// free m_buf in case this is a second call
|
||||
if ( ! f ) {
|
||||
reset();
|
||||
}
|
||||
|
||||
int32_t nw = words->getNumWords();
|
||||
int32_t *wlens = words->m_wordLens;
|
||||
int32_t Pos::filter( Words *words, int32_t a, int32_t b, bool addEllipsis, char *f, char *fend ) {
|
||||
nodeid_t *tids = words->getTagIds(); // m_tagIds;
|
||||
char **wp = words->m_words;
|
||||
|
||||
// save start point for filtering
|
||||
char *fstart = f;
|
||||
|
||||
// -1 is the default value
|
||||
if ( b == -1 ) {
|
||||
b = nw;
|
||||
b = words->getNumWords();
|
||||
}
|
||||
|
||||
// alloc array if need to
|
||||
int32_t need = (nw+1) * 4;
|
||||
|
||||
// do not destroy m_pos/m_numWords if only filtering into a buffer
|
||||
if ( !f ) {
|
||||
m_needsFree = false;
|
||||
|
||||
m_buf = m_localBuf;
|
||||
if ( need > POS_LOCALBUFSIZE ) {
|
||||
m_buf = (char *)mmalloc(need,"Pos");
|
||||
m_needsFree = true;
|
||||
}
|
||||
// bail on error
|
||||
if ( ! m_buf ) return false;
|
||||
m_bufSize = need;
|
||||
m_pos = (int32_t *)m_buf;
|
||||
m_numWords = nw;
|
||||
}
|
||||
|
||||
// this is the CHARACTER count.
|
||||
int32_t pos = 0;
|
||||
bool trunc = false;
|
||||
|
||||
static const int32_t maxCharSize = 4; // we are utf8
|
||||
@ -91,31 +48,23 @@ bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len
|
||||
int dotCount = 0; // store last encountered total consecutive dots
|
||||
char* dotPrevChar = NULL; // store char before dot which is not a space
|
||||
|
||||
for ( int32_t i = a ; i < b ; i++ ) {
|
||||
for ( int32_t i = a ; i < b ; ++i ) {
|
||||
if (trunc) {
|
||||
break;
|
||||
}
|
||||
|
||||
// set pos for the ith word to "pos"
|
||||
if ( ! f ) {
|
||||
m_pos[i] = pos;
|
||||
}
|
||||
|
||||
// is tag?
|
||||
if ( tids && tids[i] ) {
|
||||
// filtering into buffer (when generating summaries)
|
||||
if ( f ) {
|
||||
// let's not get from bad tags
|
||||
if ( ( tids[i] == TAG_STYLE ) || ( tids[i] == TAG_SCRIPT ) ) {
|
||||
++inBadTags;
|
||||
continue;
|
||||
}
|
||||
// let's not get from bad tags
|
||||
if ( ( tids[i] == TAG_STYLE ) || ( tids[i] == TAG_SCRIPT ) ) {
|
||||
++inBadTags;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( inBadTags ) {
|
||||
if ( ( ( tids[i] & BACKBITCOMP ) == TAG_STYLE ) ||
|
||||
( ( tids[i] & BACKBITCOMP ) == TAG_SCRIPT ) ) {
|
||||
--inBadTags;
|
||||
}
|
||||
if ( inBadTags ) {
|
||||
if ( ( ( tids[i] & BACKBITCOMP ) == TAG_STYLE ) ||
|
||||
( ( tids[i] & BACKBITCOMP ) == TAG_SCRIPT ) ) {
|
||||
--inBadTags;
|
||||
}
|
||||
}
|
||||
|
||||
@ -126,14 +75,15 @@ bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len
|
||||
|
||||
// list tag? <li>
|
||||
if ( tids[i] == TAG_LI ) {
|
||||
if ( f ) {
|
||||
if ( ( fend - f > maxCharSize ) ) {
|
||||
*f++ = '*';
|
||||
} else {
|
||||
trunc = true;
|
||||
}
|
||||
if ( ( fend - f > maxCharSize ) ) {
|
||||
*f++ = '*';
|
||||
|
||||
// counted as caps because we're detecting all caps for a sentence
|
||||
++capCount;
|
||||
} else {
|
||||
trunc = true;
|
||||
}
|
||||
pos++;
|
||||
|
||||
lastSpace = false;
|
||||
continue;
|
||||
}
|
||||
@ -146,35 +96,29 @@ bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len
|
||||
|
||||
// if had a br tag count it as a '.'
|
||||
if ( tids[i] ) { // <br>
|
||||
// are we filtering?
|
||||
if ( f && f != fstart ) {
|
||||
if ( f != fstart ) {
|
||||
if ( ( fend - f > 2 * maxCharSize ) ) {
|
||||
*f++ = '.';
|
||||
*f++ = ' ';
|
||||
|
||||
// counted as caps because we're detecting all caps for a sentence
|
||||
capCount += 2;
|
||||
} else {
|
||||
trunc = true;
|
||||
}
|
||||
}
|
||||
|
||||
// no, just single period.
|
||||
pos += 2;
|
||||
lastSpace = true;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// are we filtering?
|
||||
if ( f ) {
|
||||
if ( ( fend - f > maxCharSize ) ) {
|
||||
*f++ = ' ';
|
||||
} else {
|
||||
trunc = true;
|
||||
}
|
||||
if ( ( fend - f > maxCharSize ) ) {
|
||||
*f++ = ' ';
|
||||
} else {
|
||||
trunc = true;
|
||||
}
|
||||
|
||||
// count as a single space
|
||||
pos++;
|
||||
|
||||
// do not allow back-to-back spaces
|
||||
lastSpace = true;
|
||||
|
||||
@ -187,22 +131,19 @@ bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len
|
||||
}
|
||||
|
||||
// scan through all chars discounting back-to-back spaces
|
||||
char *pend = wp[i] + wlens[i];
|
||||
char *pend = words->getWord(i) + words->getWordLen(i);
|
||||
unsigned char cs = 0;
|
||||
|
||||
char *p = NULL ;
|
||||
|
||||
// assume filters out to the same # of chars
|
||||
for ( p = wp[i]; p < pend; p += cs ) {
|
||||
for ( p = words->getWord(i); p < pend; p += cs ) {
|
||||
// get size
|
||||
cs = getUtf8CharSize(p);
|
||||
|
||||
// filtering into buffer (when generating summaries)
|
||||
if ( f ) {
|
||||
// skip unwanted character
|
||||
if ( isUtf8UnwantedSymbols( p ) ) {
|
||||
continue;
|
||||
}
|
||||
// skip unwanted character
|
||||
if ( isUtf8UnwantedSymbols( p ) ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// do not count space if one before
|
||||
@ -213,155 +154,151 @@ bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len
|
||||
|
||||
lastSpace = true;
|
||||
|
||||
// are we filtering?
|
||||
if ( f ) {
|
||||
if ( fend - f > 1 ) {
|
||||
lastBreakPrevChar = prevChar;
|
||||
if ( fend - f > 1 ) {
|
||||
lastBreakPrevChar = prevChar;
|
||||
|
||||
lastBreak = f;
|
||||
*f++ = ' ';
|
||||
lastBreak = f;
|
||||
*f++ = ' ';
|
||||
|
||||
// space is counted as caps as well because we're detecting all caps for a sentence
|
||||
++capCount;
|
||||
// counted as caps because we're detecting all caps for a sentence
|
||||
++capCount;
|
||||
|
||||
dotCount = 0;
|
||||
dotCount = 0;
|
||||
|
||||
// we don't store space as dotPreviousChar because we want to strip ' ...' as well
|
||||
} else {
|
||||
trunc = true;
|
||||
}
|
||||
}
|
||||
|
||||
++pos;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( f ) {
|
||||
if ( fend - f > cs ) {
|
||||
prevChar = f;
|
||||
|
||||
if ( cs == 1 ) {
|
||||
// we only do it for ascii to avoid catering for different rules in different languages
|
||||
// https://en.wikipedia.org/wiki/Letter_case#Exceptional_letters_and_digraphs
|
||||
// eg:
|
||||
// The Greek upper-case letter "Σ" has two different lower-case forms:
|
||||
// "ς" in word-final position and "σ" elsewhere
|
||||
if ( !is_alpha_a( *p ) || is_upper_a( *p ) ) {
|
||||
// non-alpha is counted as caps as well because we're detecting all caps for a sentence
|
||||
// and comma/quotes/etc. is included
|
||||
++capCount;
|
||||
}
|
||||
|
||||
// some sites try to be smart and truncate for us, let's remove that
|
||||
// if if there are no space between dots and letter
|
||||
if ( *p == '.' ) {
|
||||
++dotCount;
|
||||
} else {
|
||||
dotCount = 0;
|
||||
dotPrevChar = f;
|
||||
}
|
||||
|
||||
*f++ = *p;
|
||||
} else {
|
||||
dotCount = 0;
|
||||
dotPrevChar = f;
|
||||
|
||||
gbmemcpy( f, p, cs );
|
||||
f += cs;
|
||||
}
|
||||
// we don't store space as dotPreviousChar because we want to strip ' ...' as well
|
||||
} else {
|
||||
trunc = true;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( fend - f > cs ) {
|
||||
prevChar = f;
|
||||
|
||||
if ( cs == 1 ) {
|
||||
// we only do it for ascii to avoid catering for different rules in different languages
|
||||
// https://en.wikipedia.org/wiki/Letter_case#Exceptional_letters_and_digraphs
|
||||
// eg:
|
||||
// The Greek upper-case letter "Σ" has two different lower-case forms:
|
||||
// "ς" in word-final position and "σ" elsewhere
|
||||
if ( !is_alpha_a( *p ) || is_upper_a( *p ) ) {
|
||||
// non-alpha is counted as caps as well because we're detecting all caps for a sentence
|
||||
// and comma/quotes/etc. is included
|
||||
++capCount;
|
||||
}
|
||||
|
||||
// some sites try to be smart and truncate for us, let's remove that
|
||||
// if if there are no space between dots and letter
|
||||
if ( *p == '.' ) {
|
||||
++dotCount;
|
||||
} else {
|
||||
dotCount = 0;
|
||||
dotPrevChar = f;
|
||||
}
|
||||
|
||||
*f++ = *p;
|
||||
} else {
|
||||
dotCount = 0;
|
||||
dotPrevChar = f;
|
||||
|
||||
gbmemcpy( f, p, cs );
|
||||
f += cs;
|
||||
}
|
||||
} else {
|
||||
trunc = true;
|
||||
}
|
||||
|
||||
pos++;
|
||||
lastSpace = false;
|
||||
}
|
||||
}
|
||||
|
||||
/// @todo ALC simplify logic/break into smaller functions
|
||||
if ( f ) {
|
||||
// only capitalize first letter in a word for a sentence with all caps
|
||||
if ( capCount == ( f - fstart ) ) {
|
||||
bool isFirstLetter = true;
|
||||
|
||||
unsigned char cs = 0;
|
||||
for ( char *c = fstart; c < f; c += cs ) {
|
||||
cs = getUtf8CharSize(c);
|
||||
/// @todo ALC configurable minCapCount so we can tweak this as needed
|
||||
const int minCapCount = 5;
|
||||
|
||||
bool isAlpha = is_alpha_utf8( c );
|
||||
// only capitalize first letter in a word for a sentence with all caps
|
||||
if ( capCount > minCapCount && capCount == ( f - fstart ) ) {
|
||||
bool isFirstLetter = true;
|
||||
|
||||
if ( isAlpha ) {
|
||||
if (isFirstLetter) {
|
||||
isFirstLetter = false;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
isFirstLetter = true;
|
||||
unsigned char cs = 0;
|
||||
for ( char *c = fstart; c < f; c += cs ) {
|
||||
cs = getUtf8CharSize(c);
|
||||
|
||||
bool isAlpha = is_alpha_utf8( c );
|
||||
|
||||
if ( isAlpha ) {
|
||||
if (isFirstLetter) {
|
||||
isFirstLetter = false;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
isFirstLetter = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( !isFirstLetter ) {
|
||||
to_lower_utf8(c, c);
|
||||
}
|
||||
if ( !isFirstLetter ) {
|
||||
to_lower_utf8(c, c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// let's remove ellipsis (...) at the end
|
||||
if ( dotCount == 3 ) {
|
||||
if ( is_ascii3( *dotPrevChar ) ) {
|
||||
switch ( *dotPrevChar ) {
|
||||
case ',':
|
||||
trunc = true;
|
||||
lastBreak = dotPrevChar + 1;
|
||||
break;
|
||||
case '!':
|
||||
case '.':
|
||||
trunc = false;
|
||||
f = dotPrevChar + 1;
|
||||
break;
|
||||
case ' ':
|
||||
trunc = false;
|
||||
/// @todo ALC configurable minRemoveEllipsisLen so we can tweak this as needed
|
||||
const int minRemoveEllipsisLen = 120;
|
||||
|
||||
if ( lastBreak ) {
|
||||
f = lastBreak;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
trunc = true;
|
||||
// let's remove ellipsis (...) at the end
|
||||
if ( (f - fstart) >= minRemoveEllipsisLen && dotCount == 3 ) {
|
||||
if ( is_ascii3( *dotPrevChar ) ) {
|
||||
switch ( *dotPrevChar ) {
|
||||
case ',':
|
||||
trunc = true;
|
||||
lastBreak = dotPrevChar + 1;
|
||||
break;
|
||||
case '!':
|
||||
case '.':
|
||||
trunc = false;
|
||||
f = dotPrevChar + 1;
|
||||
break;
|
||||
case ' ':
|
||||
trunc = false;
|
||||
|
||||
if ( lastBreakPrevChar ) {
|
||||
if ( is_ascii3( *( lastBreakPrevChar ) ) ) {
|
||||
switch ( *( lastBreakPrevChar ) ) {
|
||||
case '!':
|
||||
case '.':
|
||||
trunc = false;
|
||||
if ( lastBreak ) {
|
||||
f = lastBreak;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
trunc = true;
|
||||
|
||||
if (lastBreak) {
|
||||
f = lastBreak;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if ( lastBreakPrevChar ) {
|
||||
if ( is_ascii3( *( lastBreakPrevChar ) ) ) {
|
||||
switch ( *( lastBreakPrevChar ) ) {
|
||||
case '!':
|
||||
case '.':
|
||||
trunc = false;
|
||||
|
||||
if (lastBreak) {
|
||||
f = lastBreak;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( trunc ) {
|
||||
if ( lastBreak == NULL ) {
|
||||
*len = 0;
|
||||
return false;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if ( f ) {
|
||||
f = lastBreak;
|
||||
}
|
||||
f = lastBreak;
|
||||
|
||||
/// @todo ALC we should cater ellipsis for different languages
|
||||
if ( addEllipsis ) {
|
||||
if ( (fend - f) > 4 ) {
|
||||
gbmemcpy ( f , " ..." , 4 );
|
||||
@ -370,14 +307,121 @@ bool Pos::set( Words *words, bool addEllipsis, char *f, char *fend, int32_t *len
|
||||
}
|
||||
}
|
||||
|
||||
// set pos for the END of the last word here (used in Summary.cpp)
|
||||
if ( !f ) {
|
||||
m_pos[nw] = pos;
|
||||
} else { // NULL terminate f
|
||||
*len = f - fstart;
|
||||
*f = '\0';
|
||||
// NULL terminate f
|
||||
*f = '\0';
|
||||
|
||||
return (f - fstart);
|
||||
}
|
||||
|
||||
bool Pos::set( Words *words, int32_t a, int32_t b ) {
|
||||
// free m_buf in case this is a second call
|
||||
reset();
|
||||
|
||||
int32_t nw = words->getNumWords();
|
||||
int32_t *wlens = words->m_wordLens;
|
||||
nodeid_t *tids = words->getTagIds(); // m_tagIds;
|
||||
char **wp = words->m_words;
|
||||
|
||||
// -1 is the default value
|
||||
if ( b == -1 ) {
|
||||
b = nw;
|
||||
}
|
||||
|
||||
// Success
|
||||
// alloc array if need to
|
||||
int32_t need = (nw+1) * 4;
|
||||
|
||||
// do not destroy m_pos/m_numWords if only filtering into a buffer
|
||||
m_needsFree = false;
|
||||
|
||||
m_buf = m_localBuf;
|
||||
if ( need > POS_LOCALBUFSIZE ) {
|
||||
m_buf = (char *)mmalloc(need,"Pos");
|
||||
m_needsFree = true;
|
||||
}
|
||||
|
||||
// bail on error
|
||||
if ( ! m_buf ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
m_bufSize = need;
|
||||
m_pos = (int32_t *)m_buf;
|
||||
|
||||
// this is the CHARACTER count.
|
||||
int32_t pos = 0;
|
||||
|
||||
// flag for stopping back-to-back spaces. only count those as one char.
|
||||
bool lastSpace = false;
|
||||
|
||||
for ( int32_t i = a ; i < b ; i++ ) {
|
||||
// set pos for the ith word to "pos"
|
||||
m_pos[i] = pos;
|
||||
|
||||
// is tag?
|
||||
if ( tids && tids[i] ) {
|
||||
// if not breaking, does nothing
|
||||
if ( !g_nodes[tids[i] & 0x7f].m_isBreaking ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// list tag? <li>
|
||||
if ( tids[i] == TAG_LI ) {
|
||||
++pos;
|
||||
lastSpace = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
// if had a previous breaking tag and no non-tag
|
||||
// word after it, do not count back-to-back spaces
|
||||
if ( lastSpace ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// if had a br tag count it as a '. '
|
||||
if ( tids[i] ) { // <br>
|
||||
pos += 2;
|
||||
lastSpace = true;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// count as a single space
|
||||
pos++;
|
||||
|
||||
// do not allow back-to-back spaces
|
||||
lastSpace = true;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// scan through all chars discounting back-to-back spaces
|
||||
char *pend = wp[i] + wlens[i];
|
||||
unsigned char cs = 0;
|
||||
|
||||
// assume filters out to the same # of chars
|
||||
for ( char *p = wp[i]; p < pend; p += cs ) {
|
||||
// get size
|
||||
cs = getUtf8CharSize(p);
|
||||
|
||||
// do not count space if one before
|
||||
if ( is_wspace_utf8 (p) ) {
|
||||
if ( lastSpace ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
lastSpace = true;
|
||||
|
||||
++pos;
|
||||
continue;
|
||||
}
|
||||
|
||||
++pos;
|
||||
lastSpace = false;
|
||||
}
|
||||
}
|
||||
|
||||
// set pos for the END of the last word here
|
||||
m_pos[nw] = pos;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
16
Pos.h
16
Pos.h
@ -3,6 +3,8 @@
|
||||
#ifndef _POS_H_
|
||||
#define _POS_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
// this class is used to measure the number of characters between two "words"
|
||||
// (as defined in the Words.cpp class) in units of "characters". A utf8
|
||||
// character can be 1, 2, 3 or 4 bytes, so be careful.
|
||||
@ -19,22 +21,20 @@ class Pos {
|
||||
~Pos();
|
||||
void reset();
|
||||
|
||||
bool set(Words *words, bool addEllipsis = false, char *f = NULL, char *fend = NULL,
|
||||
int32_t *flen = NULL, int32_t a = 0, int32_t b = -1 );
|
||||
bool set(Words *words, int32_t a = 0, int32_t b = -1 );
|
||||
|
||||
// . filter out xml words [a,b] into plain text, stores into "p"
|
||||
// . will not exceed "pend"
|
||||
// . returns number of BYTES stored into "p"
|
||||
int32_t filter(char *p, char *pend, Words *words, int32_t a = 0, int32_t b = -1,
|
||||
bool addEllipsis = false );
|
||||
// . filter out xml words [a,b] into plain text, stores into "f"
|
||||
// . will not exceed "fend"
|
||||
// . returns number of BYTES stored into "f"
|
||||
int32_t filter(Words *words, int32_t a, int32_t b, bool addEllipsis, char *f, char *fend);
|
||||
|
||||
// . the position in CHARACTERS of word i is given by m_pos[i]
|
||||
// . this is NOT the byte position. you can have 2, 3 or even 4
|
||||
// byte characters in utf8. the purpose here is for counting
|
||||
// "letters" or "characters" for formatting purposes.
|
||||
int32_t *m_pos;
|
||||
int32_t m_numWords;
|
||||
|
||||
private:
|
||||
char m_localBuf [ POS_LOCALBUFSIZE ];
|
||||
char *m_buf;
|
||||
int32_t m_bufSize;
|
||||
|
@ -75,8 +75,8 @@ bool Summary::verifySummary( char *titleBuf, int32_t titleBufLen ) {
|
||||
// - meta name = "og:description"
|
||||
// - meta name = "description"
|
||||
bool Summary::setFromTags( Xml *xml, int32_t maxSummaryLen, char *titleBuf, int32_t titleBufLen ) {
|
||||
/// @todo ALC we may want this to be configurable so we can tweak this as needed
|
||||
int minSummaryLen = (maxSummaryLen / 3);
|
||||
/// @todo ALC configurable minSummaryLen so we can tweak this as needed
|
||||
const int minSummaryLen = (maxSummaryLen / 3);
|
||||
|
||||
// itemprop = "description"
|
||||
if ( xml->getTagContent("itemprop", "description", m_summary, MAX_SUMMARY_LEN, minSummaryLen, maxSummaryLen, &m_summaryLen) ) {
|
||||
@ -512,7 +512,7 @@ bool Summary::set (Xml *xml, Words *words, Sections *sections, Pos *pos, Query *
|
||||
// . removes back to back spaces
|
||||
// . converts html entities
|
||||
// . filters in stores words in [a,b) interval
|
||||
int32_t len = pos->filter( p, pend, ww, maxa, maxb );
|
||||
int32_t len = pos->filter( ww, maxa, maxb, false, p, pend );
|
||||
|
||||
// break out if did not fit
|
||||
if ( len == 0 ) {
|
||||
@ -1078,7 +1078,7 @@ bool Summary::getDefaultSummary ( Xml *xml, Words *words, Sections *sections, Po
|
||||
}
|
||||
|
||||
if (bestStart >= 0 && bestEnd > bestStart){
|
||||
int32_t len = pos->filter( p, pend - 10, words, bestStart, bestEnd );
|
||||
int32_t len = pos->filter( words, bestStart, bestEnd, false, p, pend - 10 );
|
||||
p += len;
|
||||
if ( len > 0 && p + 3 + 2 < pend ){
|
||||
// space first?
|
||||
|
@ -52,8 +52,8 @@ void Title::reset() {
|
||||
}
|
||||
|
||||
bool Title::setFromTags( Xml *xml, int32_t maxTitleLen ) {
|
||||
/// @todo ALC we may want this to be configurable so we can tweak this as needed
|
||||
int minTitleLen = 3;
|
||||
/// @todo ALC configurable minTitleLen so we can tweak this as needed
|
||||
const int minTitleLen = 3;
|
||||
|
||||
// meta property = "og:title"
|
||||
if ( xml->getTagContent("property", "og:title", m_title, MAX_TITLE_LEN, minTitleLen, maxTitleLen, &m_titleLen, true, TAG_META) ) {
|
||||
@ -555,7 +555,7 @@ bool Title::setTitle4 ( XmlDoc *xd, Xml *XML, Words *WW, int32_t maxTitleChars,
|
||||
}
|
||||
}
|
||||
|
||||
/// @todo we should allow more tags than just title/link
|
||||
/// @todo ALC we should allow more tags than just title/link
|
||||
// skip if not a good tag.
|
||||
if (tid != TAG_TITLE && tid != TAG_A) {
|
||||
continue;
|
||||
|
2
Title.h
2
Title.h
@ -20,7 +20,7 @@ public:
|
||||
|
||||
void reset();
|
||||
|
||||
/// @todo correct comments
|
||||
/// @todo ALC correct comments
|
||||
// . set m_title to the title of the document represented by "xd"
|
||||
// . if getHardTitle is true will always use the title in the <title>
|
||||
// tag, but if that is not present, will try dmoz titles before
|
||||
|
@ -66,7 +66,7 @@ iconv_t gbiconv_open( const char *tocode, const char *fromcode) {
|
||||
}
|
||||
|
||||
int gbiconv_close(iconv_t cd) {
|
||||
/// @todo gbiconv_close currently does nothing
|
||||
/// @todo ALC gbiconv_close currently does nothing
|
||||
//int val = iconv_close(cd);
|
||||
//if (val == 0) g_mem.rmMem((void*)cd, 1, "iconv", 1);
|
||||
//return val;
|
||||
|
45
Unicode.h
45
Unicode.h
@ -128,6 +128,7 @@ bool inline isValidUtf8Char(const char *s) {
|
||||
// Refer to:
|
||||
// http://www.unicode.org/charts/
|
||||
// http://www.unicode.org/Public/UNIDATA/Blocks.txt
|
||||
// http://www.utf8-chartable.de/
|
||||
|
||||
// Emoji & Pictographs
|
||||
// 2600–26FF: Miscellaneous Symbols
|
||||
@ -143,31 +144,45 @@ bool inline isValidUtf8Char(const char *s) {
|
||||
// 1F030–1F09F: Domino Tiles
|
||||
// 1F0A0–1F0FF: Playing Cards
|
||||
|
||||
// Enclosed Alphanumeric Supplement
|
||||
// 1F1E6–1F1FF: Regional indicator symbols
|
||||
|
||||
// Geometric Shapes
|
||||
// 25A0–25FF: Geometric Shapes
|
||||
|
||||
// +--------------------+----------+----------+----------+----------+
|
||||
// | Code Points | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
|
||||
// +--------------------+----------+----------+----------+----------+
|
||||
// | U+2600..U+27BF | E2 | 98..9E | 80..BF | |
|
||||
// | U+25A0..U+25BF | E2 | 96 | A0..BF | |
|
||||
// | U+25C0..U+27BF | E2 | 97..9E | 80..BF | |
|
||||
// | U+1F000..U+1F0FF | F0 | 9F | 80..83 | 80..BF |
|
||||
// | U+1F1E6..U+1F1FF | F0 | 9F | 87 | A6..BF |
|
||||
// | U+1F300..U+1F6FF | F0 | 9F | 8C..9B | 80..BF |
|
||||
// | U+1F900..U+1F9FF | F0 | 9F | A4..A7 | 80..BF |
|
||||
// +--------------------+----------+----------+----------+----------+
|
||||
bool inline isUtf8UnwantedSymbols(const char *s) {
|
||||
const uint8_t *u = (uint8_t*)s;
|
||||
const uint8_t *u = (uint8_t *)s;
|
||||
|
||||
if (u[0] == 0xE2) { // U+2600..U+27BF
|
||||
if ((u[1] >= 0x98 && u[1] <= 0x9E) &&
|
||||
(u[2] >= 0x80 && u[2] <= 0xBF)) {
|
||||
return true;
|
||||
}
|
||||
} else if (u[0] == 0xF0 && u[1] == 0x9F) {
|
||||
if ((u[2] >= 0x80 && u[2] <= 0x83) &&
|
||||
(u[3] >= 0x80 && u[3] <= 0xBF)) { // U+1F000..U+1F0FF
|
||||
return true;
|
||||
} else if ((u[2] >= 0x8C && u[2] <= 0x9B) &&
|
||||
(u[3] >= 0x80 && u[3] <= 0xBF)) { // U+1F300..U+1F6FF
|
||||
if ( u[0] == 0xE2 ) {
|
||||
if ( ( u[1] == 0x96 ) &&
|
||||
( u[2] >= 0xA0 && u[2] <= 0xBF ) ) {
|
||||
return true;
|
||||
} else if ((u[2] >= 0xA4 && u[2] <= 0xA7) &&
|
||||
(u[3] >= 0x80 && u[3] <= 0xBF)) { // U+1F900..U+1F9FF
|
||||
} else if ( ( u[1] >= 0x97 && u[1] <= 0x9E ) &&
|
||||
( u[2] >= 0x80 && u[2] <= 0xBF ) ) { // U+25C0..U+27BF
|
||||
return true;
|
||||
}
|
||||
} else if ( u[0] == 0xF0 && u[1] == 0x9F ) {
|
||||
if ( ( u[2] >= 0x80 && u[2] <= 0x83 ) &&
|
||||
( u[3] >= 0x80 && u[3] <= 0xBF ) ) { // U+1F000..U+1F0FF
|
||||
return true;
|
||||
} else if ( ( u[2] == 0x87 ) &&
|
||||
( u[3] >= 0xA6 && u[3] <= 0xBF ) ) { // U+1F1E6..U+1F1FF
|
||||
return true;
|
||||
} else if ( ( u[2] >= 0x8C && u[2] <= 0x9B ) &&
|
||||
( u[3] >= 0x80 && u[3] <= 0xBF ) ) { // U+1F300..U+1F6FF
|
||||
return true;
|
||||
} else if ( ( u[2] >= 0xA4 && u[2] <= 0xA7 ) &&
|
||||
( u[3] >= 0x80 && u[3] <= 0xBF ) ) { // U+1F900..U+1F9FF
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
13
Xml.cpp
13
Xml.cpp
@ -959,8 +959,10 @@ static bool inTag ( XmlNode *node, nodeid_t tagId, int *count ) {
|
||||
static int32_t filterContent ( Words *wp, Pos *pp, char *buf, int32_t bufLen, int32_t minLength, int32_t maxLength ) {
|
||||
int32_t contentLen = 0;
|
||||
|
||||
/// @todo ALC we may want this to be configurable so we can tweak this as needed
|
||||
if ( wp->getNumWords() > (maxLength * 2) ) {
|
||||
/// @todo ALC configurable maxNumWord so we can tweak this as needed
|
||||
const int32_t maxNumWord = (maxLength * 2);
|
||||
|
||||
if ( wp->getNumWords() > maxNumWord ) {
|
||||
// ignore too long snippet
|
||||
// it may not be that useful to get the first x characters from a long snippet
|
||||
contentLen = 0;
|
||||
@ -969,12 +971,7 @@ static int32_t filterContent ( Words *wp, Pos *pp, char *buf, int32_t bufLen, in
|
||||
return contentLen;
|
||||
}
|
||||
|
||||
char *bufEnd = buf + maxLength + 4; // plus ellipsis
|
||||
if ( bufEnd > buf + bufLen ) {
|
||||
bufEnd = buf + bufLen;
|
||||
}
|
||||
|
||||
contentLen = pp->filter( buf, bufEnd, wp, 0, wp->getNumWords(), true );
|
||||
contentLen = pp->filter( wp, 0, wp->getNumWords(), true, buf, buf + maxLength );
|
||||
|
||||
if ( contentLen < minLength ) {
|
||||
// ignore too short descriptions
|
||||
|
@ -29839,7 +29839,7 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
|
||||
if ( p + len + 1 < pend ) {
|
||||
// store it
|
||||
// FILTER the html entities!!
|
||||
int32_t len2 = pos->filter( p, pend, ww, a, b );
|
||||
int32_t len2 = pos->filter( ww, a, b, false, p, pend );
|
||||
|
||||
// ensure NULL terminated
|
||||
p[len2] = '\0';
|
||||
@ -30207,7 +30207,7 @@ Summary *XmlDoc::getSummary () {
|
||||
return (Summary *)ct;
|
||||
}
|
||||
|
||||
/// @todo fill in summary for XML document
|
||||
/// @todo ALC fill in summary for XML document
|
||||
// xml and json docs have empty summaries for now
|
||||
if ( *ct == CT_JSON || *ct == CT_XML ) {
|
||||
m_summaryValid = true;
|
||||
@ -30614,7 +30614,7 @@ SafeBuf *XmlDoc::getSampleForGigabits ( ) {
|
||||
// if match would send us over, we are done
|
||||
if ( p + len >= pend ) break;
|
||||
|
||||
len = pos->filter( p, pend, m->m_words, a, b );
|
||||
len = pos->filter( m->m_words, a, b, false, p, pend );
|
||||
|
||||
// for debug (mdw)
|
||||
//log("query: gigabitsample#%"INT32"=%s",i,p);
|
||||
|
@ -29,7 +29,7 @@ TEST( PosTest, FilterAllCaps ) {
|
||||
|
||||
ASSERT_TRUE( words.set( input_strs[i], true, 0 ) );
|
||||
|
||||
int32_t len = pos.filter( buf, buf + MAX_BUF_SIZE, &words );
|
||||
int32_t len = pos.filter( &words, 0, words.getNumWords(), false, buf, buf + MAX_BUF_SIZE );
|
||||
|
||||
EXPECT_EQ( strlen( expected_output[i] ), len );
|
||||
EXPECT_STREQ( expected_output[i], buf );
|
||||
@ -54,7 +54,12 @@ TEST( PosTest, FilterEnding ) {
|
||||
|
||||
"Computer programming is tremendous fun. Li...",
|
||||
|
||||
"Premature optimization is the root of all evil."
|
||||
"Premature optimization is the root of all evil.",
|
||||
|
||||
"As soon as we started programming, we found to our surprise that it wasn't as easy to get programs "
|
||||
"right as we had thought. Debugging had to be discovered. I can remember the exact instant when I "
|
||||
"realized that a large part of my life from then on was going to be spent in finding mistakes in my "
|
||||
"own programs. "
|
||||
};
|
||||
|
||||
const char *expected_output[] = {
|
||||
@ -74,7 +79,10 @@ TEST( PosTest, FilterEnding ) {
|
||||
|
||||
"Computer programming is tremendous fun.",
|
||||
|
||||
"Premature optimization is the root of all evil."
|
||||
"Premature optimization is the root of all evil.",
|
||||
|
||||
"As soon as we started programming, we found to our surprise that it wasn't as easy to get programs "
|
||||
"right as we had thought. Debugging had to be discovered. I can remember the ..."
|
||||
};
|
||||
|
||||
ASSERT_EQ( sizeof( input_strs ) / sizeof( input_strs[0] ),
|
||||
@ -88,9 +96,9 @@ TEST( PosTest, FilterEnding ) {
|
||||
|
||||
ASSERT_TRUE( words.set( input_strs[i], true, 0 ) );
|
||||
|
||||
int32_t len = pos.filter( buf, buf + 180 + 4, &words, 0, -1, true );
|
||||
int32_t len = pos.filter( &words, 0, -1, true, buf, buf + 180 );
|
||||
|
||||
//EXPECT_EQ( strlen( expected_output[i] ), len );
|
||||
EXPECT_EQ( strlen( expected_output[i] ), len );
|
||||
EXPECT_STREQ( expected_output[i], buf );
|
||||
}
|
||||
}
|
||||
|
@ -173,6 +173,15 @@ TEST(UnicodeTest, UnwantedSymbols) {
|
||||
"🂠",
|
||||
"",
|
||||
|
||||
// Enclosed Alphanumeric Supplement
|
||||
// 1F1E6–1F1FF: Regional indicator symbols
|
||||
"🇦",
|
||||
"🇿",
|
||||
|
||||
// Geometric Shapes
|
||||
// 25A0–25FF: Geometric Shapes
|
||||
"■",
|
||||
"◿",
|
||||
};
|
||||
|
||||
size_t len = sizeof(inputs) / sizeof(inputs[0]);
|
||||
|
Reference in New Issue
Block a user