// Listing metadata left over from the source viewer: 653 lines, 14 KiB, C++
#include "Pos.h"
|
||
#include "tokenizer.h"
|
||
#include "XmlNode.h"
|
||
#include "Sections.h"
|
||
#include "TitleSummaryCodepointFilter.h"
|
||
#include "Conf.h"
|
||
#include "Mem.h"
|
||
#include "Errno.h"
|
||
#include "Log.h"
|
||
#include "utf8_fast.h"
|
||
|
||
|
||
/// Construct an empty Pos: no buffer attached, no position table, and the
/// member array m_localBuf zero-filled.
Pos::Pos() {
	memset( m_localBuf, 0, sizeof( m_localBuf ) );

	m_bufSize   = 0;
	m_needsFree = false;
	m_buf       = nullptr;
	m_pos       = nullptr;
}
|
||
|
||
/// Destructor: delegates all cleanup to reset().
Pos::~Pos() {
	reset();
}
|
||
|
||
void Pos::reset() {
|
||
if ( m_buf && m_needsFree )
|
||
mfree ( m_buf , m_bufSize , "Pos" );
|
||
m_buf = NULL;
|
||
}
|
||
|
||
/// Update a nesting counter for one tag kind and report whether we are inside it.
///
/// @param tagId          node id of the tag token being looked at
/// @param expectedTagId  node id of the tag kind being tracked
/// @param count          in/out nesting counter; a NULL pointer yields false
/// @return true when the counter is greater than zero after the update
///
/// NOTE(review): when tagId == expectedTagId the counter is incremented and is
/// therefore non-zero for the masked comparison that follows. If expectedTagId
/// has no bits outside BACKBITCOMP, that comparison matches the very same tag
/// and undoes the increment. Confirm whether this is intended before relying
/// on the counter; callers that share one counter between several tag kinds
/// would also need reviewing.
static bool inTag( nodeid_t tagId, nodeid_t expectedTagId, int *count ) {
	if ( count == NULL ) {
		return false;
	}

	int depth = *count;

	// exact match with the tracked id: one level deeper
	if ( tagId == expectedTagId ) {
		++depth;
	}

	// back tag (compared with BACKBITCOMP applied): one level up,
	// but only while the counter is non-zero
	if ( depth != 0 && ( tagId & BACKBITCOMP ) == expectedTagId ) {
		--depth;
	}

	*count = depth;
	return depth > 0;
}
|
||
|
||
unsigned Pos::filter( const TokenizerResult *tr, int32_t a, int32_t b, bool addEllipsis, char *f, char *fend, int32_t version ) {
|
||
logTrace(g_conf.m_logTracePos, "BEGIN");
|
||
|
||
// save start point for filtering
|
||
char *fstart = f;
|
||
|
||
// -1 is the default value
|
||
if ( b == -1 ) {
|
||
b = tr->size();
|
||
}
|
||
|
||
bool trunc = false;
|
||
|
||
static const int32_t maxCharSize = 4; // we are utf8
|
||
|
||
char* prevChar = NULL;
|
||
|
||
char* lastBreak = NULL;
|
||
char* lastBreakPrevChar = NULL; // store char before space
|
||
|
||
// flag for stopping back-to-back spaces. only count those as one char.
|
||
bool lastSpace = false;
|
||
|
||
int inBadTags = 0;
|
||
int capCount = 0;
|
||
|
||
const char *lastPunct = NULL;
|
||
unsigned char lastPunctSize = 0;
|
||
int samePunctCount = 0;
|
||
|
||
int dotCount = 0; // store last encountered total consecutive dots
|
||
char* dotPrevChar = NULL; // store char before dot which is not a space
|
||
|
||
const char* entityPos[32];
|
||
int32_t entityLen[32];
|
||
char entityChar[32];
|
||
int32_t entityCount = 0;
|
||
|
||
// we need to decode HTML entities for version above 122 because we stop decoding
|
||
// & > < to avoid losing information
|
||
if (version >= 122) { // TITLEREC_CURRENT_VERSION
|
||
int32_t maxWord = b;
|
||
|
||
if ((unsigned)maxWord == tr->size()) {
|
||
maxWord -= 1;
|
||
}
|
||
|
||
const char *pos = (*tr)[a].token_start;
|
||
const char *endPos = (*tr)[maxWord].token_end();
|
||
|
||
for ( ; ( pos + 3 ) < endPos; ++pos ) {
|
||
if (*pos == '&') {
|
||
if (*(pos + 3) == ';') {
|
||
if (*(pos + 2) == 't') {
|
||
char c = *(pos + 1);
|
||
if ( c == 'g' || c == 'l' ) {
|
||
// > / <
|
||
entityPos[entityCount] = pos;
|
||
entityLen[entityCount] = 4;
|
||
if ( c == 'g' ) {
|
||
entityChar[entityCount] = '>';
|
||
} else {
|
||
entityChar[entityCount] = '<';
|
||
}
|
||
++entityCount;
|
||
}
|
||
}
|
||
} else if ((pos + 4 < endPos) && *(pos + 4) == ';') {
|
||
if (*(pos + 1) == 'a' && *(pos + 2) == 'm' && *(pos + 3) == 'p') {
|
||
// &
|
||
entityPos[entityCount] = pos;
|
||
entityLen[entityCount] = 5;
|
||
entityChar[entityCount] = '&';
|
||
++entityCount;
|
||
}
|
||
}
|
||
}
|
||
|
||
// make sure we don't overflow
|
||
if (entityCount >= 32) {
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
|
||
int32_t currentEntityPos = 0;
|
||
|
||
for ( int32_t i = a ; i < b ; ++i ) {
|
||
if (trunc) {
|
||
break;
|
||
}
|
||
|
||
// is tag?
|
||
nodeid_t tid = (*tr)[i].nodeid;
|
||
if ( tid ) {
|
||
logTrace(g_conf.m_logTracePos, "tags");
|
||
|
||
// let's not get from bad tags
|
||
if ( inTag( tid, TAG_STYLE, &inBadTags ) ) {
|
||
continue;
|
||
}
|
||
|
||
if ( inTag( tid, TAG_SCRIPT, &inBadTags ) ) {
|
||
continue;
|
||
}
|
||
|
||
// if not breaking, does nothing
|
||
if ( !g_nodes[tid & 0x7f].m_isBreaking ) {
|
||
continue;
|
||
}
|
||
|
||
// list tag? <li>
|
||
if ( tid == TAG_LI ) {
|
||
if ( ( fend - f > maxCharSize ) ) {
|
||
*f++ = '*';
|
||
|
||
// counted as caps because we're detecting all caps for a sentence
|
||
++capCount;
|
||
} else {
|
||
trunc = true;
|
||
}
|
||
|
||
lastSpace = false;
|
||
continue;
|
||
}
|
||
|
||
// if had a previous breaking tag and no non-tag
|
||
// word after it, do not count back-to-back spaces
|
||
if ( lastSpace ) {
|
||
continue;
|
||
}
|
||
|
||
// if had a br tag count it as a '.'
|
||
if ( tid ) { // <br>
|
||
if ( f != fstart ) {
|
||
if ( ( fend - f > 2 * maxCharSize ) ) {
|
||
if ( prevChar && is_ascii(*prevChar) && (*prevChar != '.') ) {
|
||
*f++ = '.';
|
||
|
||
// counted as caps because we're detecting all caps for a sentence
|
||
++capCount;
|
||
}
|
||
|
||
*f++ = ' ';
|
||
++capCount;
|
||
} else {
|
||
trunc = true;
|
||
}
|
||
}
|
||
|
||
lastSpace = true;
|
||
|
||
continue;
|
||
}
|
||
|
||
if ( ( fend - f > maxCharSize ) ) {
|
||
*f++ = ' ';
|
||
} else {
|
||
trunc = true;
|
||
}
|
||
|
||
// do not allow back-to-back spaces
|
||
lastSpace = true;
|
||
|
||
continue;
|
||
}
|
||
|
||
// scan through all chars discounting back-to-back spaces
|
||
unsigned char cs = 0;
|
||
const char *p = (*tr)[i].token_start;
|
||
const char *pend = (*tr)[i].token_end();
|
||
|
||
|
||
const char *currentEntity = NULL;
|
||
int32_t currentEntityLen = 0;
|
||
char currentEntityChar = '\0';
|
||
const char *nextEntity = NULL;
|
||
int32_t nextEntityLen = 0;
|
||
char nextEntityChar = '\0';
|
||
|
||
bool hasEntity = false;
|
||
while (currentEntityPos < entityCount) {
|
||
currentEntity = entityPos[currentEntityPos];
|
||
currentEntityLen = entityLen[currentEntityPos];
|
||
currentEntityChar = entityChar[currentEntityPos];
|
||
|
||
if ( currentEntityPos + 1 < entityCount ) {
|
||
nextEntity = entityPos[currentEntityPos + 1];
|
||
nextEntityLen = entityLen[currentEntityPos + 1];
|
||
nextEntityChar = entityChar[currentEntityPos + 1];
|
||
}
|
||
|
||
if ( p <= currentEntity || p <= (currentEntity + currentEntityLen) ) {
|
||
hasEntity = true;
|
||
break;
|
||
} else {
|
||
if (p > currentEntity) {
|
||
++currentEntityPos;
|
||
} else {
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
|
||
/// @todo ALC configurable maxSamePunctCount so we can tweak this as needed
|
||
const int maxSamePunctCount = 5;
|
||
char *lastEllipsis = NULL;
|
||
|
||
// assume filters out to the same # of chars
|
||
for ( ; p < pend; p += cs ) {
|
||
// get size
|
||
cs = getUtf8CharSize(p);
|
||
|
||
// skip entity
|
||
if ( hasEntity ) {
|
||
if (p >= currentEntity && p < (currentEntity + currentEntityLen)) {
|
||
if (p == currentEntity) {
|
||
*f++ = currentEntityChar;
|
||
lastSpace = false;
|
||
}
|
||
continue;
|
||
}
|
||
|
||
if (nextEntity && p >= nextEntity && p < (nextEntity + nextEntityLen)) {
|
||
if (p == nextEntity) {
|
||
*f++ = nextEntityChar;
|
||
lastSpace = false;
|
||
}
|
||
continue;
|
||
}
|
||
}
|
||
|
||
// skip unwanted character
|
||
if ( isUtf8UnwantedSymbols( p ) ) {
|
||
continue;
|
||
}
|
||
|
||
|
||
bool resetPunctCount = true;
|
||
if (is_punct_utf8(p) && !is_wspace_utf8(p)) {
|
||
if ( ( cs == lastPunctSize) && ( memcmp(lastPunct, p, cs) == 0 ) ) {
|
||
resetPunctCount = false;
|
||
++samePunctCount;
|
||
}
|
||
}
|
||
|
||
if ( resetPunctCount ) {
|
||
if (samePunctCount >= maxSamePunctCount) {
|
||
f -= (maxSamePunctCount);
|
||
|
||
bool addEllipsis = false;
|
||
if ( lastEllipsis ) {
|
||
// if all from f to last ellipsis are punctuation, skip to last ellipsis
|
||
for ( char *c = lastEllipsis + 1; c < f; ++c) {
|
||
if ( is_alnum_utf8( c ) ) {
|
||
logTrace(g_conf.m_logTracePos, "addEllipsis=true");
|
||
addEllipsis = true;
|
||
break;
|
||
}
|
||
}
|
||
|
||
if ( !addEllipsis ) {
|
||
f = lastEllipsis;
|
||
}
|
||
} else {
|
||
logTrace(g_conf.m_logTracePos, "addEllipsis=true");
|
||
addEllipsis = true;
|
||
}
|
||
|
||
if (addEllipsis) {
|
||
logTrace(g_conf.m_logTracePos, "addEllipsis");
|
||
|
||
if ( f != fstart && *(f - 1) != ' ' ) {
|
||
*f++ = ' ';
|
||
}
|
||
|
||
lastSpace = true;
|
||
memcpy ( f, "\342\200\246 ", 4 ); //horizontal ellipsis, code point 0x2026
|
||
f += 4;
|
||
|
||
lastEllipsis = f;
|
||
}
|
||
}
|
||
|
||
lastPunct = p;
|
||
lastPunctSize = cs;
|
||
samePunctCount = 0;
|
||
}
|
||
|
||
if ( samePunctCount >= maxSamePunctCount ) {
|
||
continue;
|
||
}
|
||
|
||
// do not count space if one before
|
||
if ( is_wspace_utf8 (p) ) {
|
||
if ( lastSpace ) {
|
||
continue;
|
||
}
|
||
|
||
lastSpace = true;
|
||
|
||
if ( fend - f > 1 ) {
|
||
lastBreakPrevChar = prevChar;
|
||
|
||
// don't store lastBreak if we have less than ellipsis length ' ...'
|
||
if ( fend - f > 4 ) {
|
||
lastBreak = f;
|
||
}
|
||
|
||
*f++ = ' ';
|
||
|
||
// counted as caps because we're detecting all caps for a sentence
|
||
++capCount;
|
||
|
||
dotCount = 0;
|
||
|
||
// we don't store space as dotPreviousChar because we want to strip ' ...' as well
|
||
} else {
|
||
trunc = true;
|
||
}
|
||
|
||
continue;
|
||
}
|
||
|
||
if ( fend - f > cs ) {
|
||
prevChar = f;
|
||
|
||
if ( cs == 1 ) {
|
||
// we only do it for ascii to avoid catering for different rules in different languages
|
||
// https://en.wikipedia.org/wiki/Letter_case#Exceptional_letters_and_digraphs
|
||
// eg:
|
||
// The Greek upper-case letter "Σ" has two different lower-case forms:
|
||
// "ς" in word-final position and "σ" elsewhere
|
||
if ( !is_alpha_a( *p ) || is_upper_a( *p ) ) {
|
||
// non-alpha is counted as caps as well because we're detecting all caps for a sentence
|
||
// and comma/quotes/etc. is included
|
||
++capCount;
|
||
}
|
||
|
||
// some sites try to be smart and truncate for us, let's remove that
|
||
// if if there are no space between dots and letter
|
||
if ( *p == '.' ) {
|
||
++dotCount;
|
||
} else {
|
||
dotCount = 0;
|
||
dotPrevChar = f;
|
||
}
|
||
|
||
*f++ = *p;
|
||
} else {
|
||
dotCount = 0;
|
||
dotPrevChar = f;
|
||
|
||
memcpy( f, p, cs );
|
||
f += cs;
|
||
}
|
||
} else {
|
||
trunc = true;
|
||
}
|
||
|
||
lastSpace = false;
|
||
}
|
||
}
|
||
|
||
/// @todo ALC simplify logic/break into smaller functions
|
||
|
||
/// @todo ALC configurable minCapCount so we can tweak this as needed
|
||
const int minCapCount = 5;
|
||
|
||
// only capitalize first letter in a word for a sentence with all caps
|
||
//TODO: assumes we want a us-centric title capitilization. There are other styles.
|
||
//FIXME: Assumes lowercasing a codepoint doesn't change its utf8-encoding length. This is not true (eg. Turkish U+0130 İ -> U+0069 i)
|
||
if ( capCount > minCapCount && capCount == ( f - fstart ) ) {
|
||
logTrace(g_conf.m_logTracePos, "all caps");
|
||
|
||
bool isFirstLetter = true;
|
||
|
||
unsigned char cs = 0;
|
||
for ( char *c = fstart; c < f; c += cs ) {
|
||
cs = getUtf8CharSize(c);
|
||
|
||
bool isAlpha = is_alpha_utf8( c );
|
||
|
||
if ( isAlpha ) {
|
||
if (isFirstLetter) {
|
||
isFirstLetter = false;
|
||
continue;
|
||
}
|
||
} else {
|
||
// some hard coded punctuation that we don't want to treat as first letter
|
||
// eg: Program's instead of Program'S
|
||
if ( cs == 1 && *c == '\'' ) {
|
||
isFirstLetter = false;
|
||
} else {
|
||
isFirstLetter = true;
|
||
}
|
||
continue;
|
||
}
|
||
|
||
if ( !isFirstLetter ) {
|
||
to_lower_utf8(c, c);
|
||
//TODO: do titlecase on the first letter - don't leave it as uppercase
|
||
}
|
||
}
|
||
}
|
||
|
||
/// @todo ALC configurable minRemoveEllipsisLen so we can tweak this as needed
|
||
const int minRemoveEllipsisLen = 90;
|
||
|
||
logTrace(g_conf.m_logTracePos, "len=%ld", (f - fstart));
|
||
|
||
// let's remove ellipsis (...) at the end
|
||
if ( (f - fstart) >= minRemoveEllipsisLen && dotCount == 3 ) {
|
||
logTrace(g_conf.m_logTracePos, "remove ellipsis");
|
||
if ( dotPrevChar ) {
|
||
if ( is_ascii3( *dotPrevChar ) ) {
|
||
logTrace(g_conf.m_logTracePos, "dotPrevChar=%c", *dotPrevChar);
|
||
switch ( *dotPrevChar ) {
|
||
case ',':
|
||
trunc = true;
|
||
lastBreak = dotPrevChar + 1;
|
||
break;
|
||
case '!':
|
||
case '.':
|
||
trunc = false;
|
||
f = dotPrevChar + 1;
|
||
break;
|
||
case ' ':
|
||
trunc = false;
|
||
|
||
if ( lastBreak ) {
|
||
f = lastBreak;
|
||
}
|
||
break;
|
||
default:
|
||
trunc = true;
|
||
|
||
if ( lastBreakPrevChar ) {
|
||
logTrace(g_conf.m_logTracePos, "lastBreakPrevChar=%c", *lastBreakPrevChar);
|
||
if ( is_ascii( *( lastBreakPrevChar ) ) ) {
|
||
switch ( *( lastBreakPrevChar ) ) {
|
||
case '!':
|
||
case '.':
|
||
trunc = false;
|
||
|
||
if (lastBreak) {
|
||
f = lastBreak;
|
||
}
|
||
break;
|
||
default:
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
break;
|
||
}
|
||
}
|
||
} else {
|
||
trunc = true;
|
||
lastBreak = nullptr;
|
||
}
|
||
}
|
||
if ( trunc ) {
|
||
logTrace(g_conf.m_logTracePos, "trunc");
|
||
|
||
if ( lastBreak == NULL ) {
|
||
logTrace(g_conf.m_logTracePos, "END. Return 0");
|
||
return 0;
|
||
}
|
||
|
||
f = lastBreak;
|
||
|
||
/// @todo ALC we should cater ellipsis for different languages
|
||
if ( addEllipsis ) {
|
||
logTrace(g_conf.m_logTracePos, "addEllipsis");
|
||
if ( (fend - f) > 4 ) {
|
||
memcpy ( f, " \342\200\246", 4 ); //horizontal ellipsis, code point 0x2026
|
||
f += 4;
|
||
}
|
||
}
|
||
}
|
||
|
||
// NULL terminate f
|
||
*f = '\0';
|
||
|
||
int bytesStored = static_cast<int>(f - fstart);
|
||
|
||
logTrace(g_conf.m_logTracePos, "END. Return %d", bytesStored);
|
||
|
||
return bytesStored;
|
||
}
|
||
|
||
bool Pos::set(const TokenizerResult *tr, int32_t a, int32_t b) {
|
||
// free m_buf in case this is a second call
|
||
reset();
|
||
|
||
int32_t nw = tr->size();
|
||
|
||
// -1 is the default value
|
||
if ( b == -1 ) {
|
||
b = nw;
|
||
}
|
||
|
||
// alloc array if need to
|
||
int32_t need = (nw+1) * 4;
|
||
|
||
// do not destroy m_pos/m_numWords if only filtering into a buffer
|
||
m_needsFree = false;
|
||
|
||
m_buf = m_localBuf;
|
||
if ( need > POS_LOCALBUFSIZE ) {
|
||
m_buf = (char *)mmalloc(need,"Pos");
|
||
m_needsFree = true;
|
||
}
|
||
|
||
// bail on error
|
||
if ( ! m_buf ) {
|
||
return false;
|
||
}
|
||
|
||
m_bufSize = need;
|
||
m_pos = (int32_t *)m_buf;
|
||
|
||
// this is the CHARACTER count.
|
||
int32_t pos = 0;
|
||
|
||
// flag for stopping back-to-back spaces. only count those as one char.
|
||
bool lastSpace = false;
|
||
|
||
for ( int32_t i = a ; i < b ; i++ ) {
|
||
// set pos for the ith word to "pos"
|
||
m_pos[i] = pos;
|
||
|
||
nodeid_t tid = (*tr)[i].nodeid;
|
||
// is tag?
|
||
if ( tid ) {
|
||
// if not breaking, does nothing
|
||
if ( !g_nodes[tid & 0x7f].m_isBreaking ) {
|
||
continue;
|
||
}
|
||
|
||
// list tag? <li>
|
||
if ( tid == TAG_LI ) {
|
||
++pos;
|
||
lastSpace = false;
|
||
continue;
|
||
}
|
||
|
||
// if had a previous breaking tag and no non-tag
|
||
// word after it, do not count back-to-back spaces
|
||
if ( lastSpace ) {
|
||
continue;
|
||
}
|
||
|
||
// if had a br tag count it as a '. '
|
||
if ( tid ) { // <br>
|
||
pos += 2;
|
||
lastSpace = true;
|
||
|
||
continue;
|
||
}
|
||
|
||
// count as a single space
|
||
pos++;
|
||
|
||
// do not allow back-to-back spaces
|
||
lastSpace = true;
|
||
|
||
continue;
|
||
}
|
||
|
||
// scan through all chars discounting back-to-back spaces
|
||
const char *wp = (*tr)[i].token_start;
|
||
const char *pend = wp + (*tr)[i].token_len;
|
||
unsigned char cs = 0;
|
||
|
||
// assume filters out to the same # of chars
|
||
for ( const char *p = wp; p < pend; p += cs ) {
|
||
// get size
|
||
cs = getUtf8CharSize(p);
|
||
|
||
// do not count space if one before
|
||
if ( is_wspace_utf8 (p) ) {
|
||
if ( lastSpace ) {
|
||
continue;
|
||
}
|
||
|
||
lastSpace = true;
|
||
|
||
++pos;
|
||
continue;
|
||
}
|
||
|
||
++pos;
|
||
lastSpace = false;
|
||
}
|
||
}
|
||
|
||
// set pos for the END of the last word here
|
||
m_pos[nw] = pos;
|
||
|
||
return true;
|
||
}
|