2013-08-02 16:12:24 -04:00
|
|
|
|
#include "Pos.h"
|
2018-03-09 10:24:39 -05:00
|
|
|
|
#include "tokenizer.h"
|
2018-02-27 08:50:28 -05:00
|
|
|
|
#include "XmlNode.h"
|
2013-08-02 16:12:24 -04:00
|
|
|
|
#include "Sections.h"
|
2018-02-03 14:58:45 -05:00
|
|
|
|
#include "TitleSummaryCodepointFilter.h"
|
2016-11-12 14:24:20 -05:00
|
|
|
|
#include "Conf.h"
|
2016-12-08 10:56:09 -05:00
|
|
|
|
#include "Mem.h"
|
2018-07-26 11:29:51 -04:00
|
|
|
|
#include "Errno.h"
|
|
|
|
|
#include "Log.h"
|
|
|
|
|
#include "utf8_fast.h"
|
2016-11-12 14:24:20 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
/// Construct an empty Pos: no position array, no heap buffer to release,
/// and a zero-filled local buffer.
Pos::Pos() {
	// nothing has been computed yet
	m_pos = nullptr;

	// no backing storage selected, so nothing for reset() to free
	m_buf = nullptr;
	m_bufSize = 0;
	m_needsFree = false;

	// start from a clean local buffer
	memset(m_localBuf, 0, sizeof(m_localBuf));
}
|
|
|
|
|
|
|
|
|
|
/// Destructor: hands cleanup of any buffer still held to reset().
Pos::~Pos() {
	this->reset();
}
|
|
|
|
|
|
|
|
|
|
void Pos::reset() {
|
|
|
|
|
if ( m_buf && m_needsFree )
|
|
|
|
|
mfree ( m_buf , m_bufSize , "Pos" );
|
|
|
|
|
m_buf = NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2016-01-28 06:10:42 -05:00
|
|
|
|
/// Update a tag-nesting counter for one tag token and report whether the
/// counter is positive afterwards.
///
/// @param tagId          node id of the tag token being examined
/// @param expectedTagId  node id of the tag being tracked
/// @param count          in/out nesting counter; a null pointer yields false
/// @return true when *count is greater than zero after the update
///
/// NOTE(review): when tagId == expectedTagId the counter is incremented and,
/// assuming BACKBITCOMP only masks off the back-tag bit, the masked compare
/// below matches too and decrements it again straight away. Confirm whether
/// the second test was meant to fire for back tags only.
static bool inTag( nodeid_t tagId, nodeid_t expectedTagId, int *count ) {
	if ( count == nullptr ) {
		return false;
	}

	int &depth = *count;

	// exact match on the tracked tag id
	if ( tagId == expectedTagId ) {
		depth += 1;
	}

	// match on the tag id with the back-tag bit masked off
	const bool matchesMasked = ( ( tagId & BACKBITCOMP ) == expectedTagId );
	if ( depth != 0 && matchesMasked ) {
		depth -= 1;
	}

	return depth > 0;
}
|
|
|
|
|
|
2018-03-19 10:33:26 -04:00
|
|
|
|
unsigned Pos::filter( const TokenizerResult *tr, int32_t a, int32_t b, bool addEllipsis, char *f, char *fend, int32_t version ) {
|
2016-11-11 07:51:55 -05:00
|
|
|
|
logTrace(g_conf.m_logTracePos, "BEGIN");
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// save start point for filtering
|
|
|
|
|
char *fstart = f;
|
|
|
|
|
|
|
|
|
|
// -1 is the default value
|
2015-12-01 06:38:51 -05:00
|
|
|
|
if ( b == -1 ) {
|
2018-03-09 10:24:39 -05:00
|
|
|
|
b = tr->size();
|
2015-12-01 06:38:51 -05:00
|
|
|
|
}
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
bool trunc = false;
|
|
|
|
|
|
2016-01-19 08:26:35 -05:00
|
|
|
|
static const int32_t maxCharSize = 4; // we are utf8
|
|
|
|
|
|
|
|
|
|
char* prevChar = NULL;
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
char* lastBreak = NULL;
|
2016-01-19 08:26:35 -05:00
|
|
|
|
char* lastBreakPrevChar = NULL; // store char before space
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// flag for stopping back-to-back spaces. only count those as one char.
|
|
|
|
|
bool lastSpace = false;
|
2016-01-19 08:26:35 -05:00
|
|
|
|
|
2016-01-13 07:26:37 -05:00
|
|
|
|
int inBadTags = 0;
|
2016-01-18 13:09:38 -05:00
|
|
|
|
int capCount = 0;
|
2016-01-19 08:26:35 -05:00
|
|
|
|
|
2016-05-24 10:55:50 -04:00
|
|
|
|
const char *lastPunct = NULL;
|
2016-02-18 16:18:42 -05:00
|
|
|
|
unsigned char lastPunctSize = 0;
|
|
|
|
|
int samePunctCount = 0;
|
|
|
|
|
|
2016-01-19 08:26:35 -05:00
|
|
|
|
int dotCount = 0; // store last encountered total consecutive dots
|
|
|
|
|
char* dotPrevChar = NULL; // store char before dot which is not a space
|
|
|
|
|
|
2016-05-24 10:55:50 -04:00
|
|
|
|
const char* entityPos[32];
|
2016-01-29 13:18:22 -05:00
|
|
|
|
int32_t entityLen[32];
|
|
|
|
|
char entityChar[32];
|
|
|
|
|
int32_t entityCount = 0;
|
|
|
|
|
|
|
|
|
|
// we need to decode HTML entities for version above 122 because we stop decoding
|
|
|
|
|
// & > < to avoid losing information
|
|
|
|
|
if (version >= 122) { // TITLEREC_CURRENT_VERSION
|
|
|
|
|
int32_t maxWord = b;
|
|
|
|
|
|
2018-03-19 10:33:26 -04:00
|
|
|
|
if ((unsigned)maxWord == tr->size()) {
|
2016-01-29 13:18:22 -05:00
|
|
|
|
maxWord -= 1;
|
|
|
|
|
}
|
|
|
|
|
|
2018-03-09 10:24:39 -05:00
|
|
|
|
const char *pos = (*tr)[a].token_start;
|
|
|
|
|
const char *endPos = (*tr)[maxWord].token_end();
|
2016-01-29 13:18:22 -05:00
|
|
|
|
|
|
|
|
|
for ( ; ( pos + 3 ) < endPos; ++pos ) {
|
|
|
|
|
if (*pos == '&') {
|
|
|
|
|
if (*(pos + 3) == ';') {
|
|
|
|
|
if (*(pos + 2) == 't') {
|
|
|
|
|
char c = *(pos + 1);
|
|
|
|
|
if ( c == 'g' || c == 'l' ) {
|
|
|
|
|
// > / <
|
|
|
|
|
entityPos[entityCount] = pos;
|
|
|
|
|
entityLen[entityCount] = 4;
|
|
|
|
|
if ( c == 'g' ) {
|
|
|
|
|
entityChar[entityCount] = '>';
|
|
|
|
|
} else {
|
|
|
|
|
entityChar[entityCount] = '<';
|
|
|
|
|
}
|
|
|
|
|
++entityCount;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else if ((pos + 4 < endPos) && *(pos + 4) == ';') {
|
|
|
|
|
if (*(pos + 1) == 'a' && *(pos + 2) == 'm' && *(pos + 3) == 'p') {
|
|
|
|
|
// &
|
|
|
|
|
entityPos[entityCount] = pos;
|
|
|
|
|
entityLen[entityCount] = 5;
|
|
|
|
|
entityChar[entityCount] = '&';
|
|
|
|
|
++entityCount;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// make sure we don't overflow
|
|
|
|
|
if (entityCount >= 32) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int32_t currentEntityPos = 0;
|
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
for ( int32_t i = a ; i < b ; ++i ) {
|
2015-12-01 06:38:51 -05:00
|
|
|
|
if (trunc) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// is tag?
|
2018-03-09 10:24:39 -05:00
|
|
|
|
nodeid_t tid = (*tr)[i].nodeid;
|
|
|
|
|
if ( tid ) {
|
2016-11-11 10:40:39 -05:00
|
|
|
|
logTrace(g_conf.m_logTracePos, "tags");
|
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
// let's not get from bad tags
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( inTag( tid, TAG_STYLE, &inBadTags ) ) {
|
2016-01-20 07:32:13 -05:00
|
|
|
|
continue;
|
|
|
|
|
}
|
2016-01-07 05:50:56 -05:00
|
|
|
|
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( inTag( tid, TAG_SCRIPT, &inBadTags ) ) {
|
2016-01-28 06:10:42 -05:00
|
|
|
|
continue;
|
2016-01-07 05:50:56 -05:00
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// if not breaking, does nothing
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( !g_nodes[tid & 0x7f].m_isBreaking ) {
|
2016-01-07 05:50:56 -05:00
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// list tag? <li>
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( tid == TAG_LI ) {
|
2016-01-20 07:32:13 -05:00
|
|
|
|
if ( ( fend - f > maxCharSize ) ) {
|
|
|
|
|
*f++ = '*';
|
|
|
|
|
|
|
|
|
|
// counted as caps because we're detecting all caps for a sentence
|
|
|
|
|
++capCount;
|
|
|
|
|
} else {
|
|
|
|
|
trunc = true;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
2016-01-20 07:32:13 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
lastSpace = false;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
2016-01-11 09:46:09 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// if had a previous breaking tag and no non-tag
|
|
|
|
|
// word after it, do not count back-to-back spaces
|
2015-12-01 06:38:51 -05:00
|
|
|
|
if ( lastSpace ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// if had a br tag count it as a '.'
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( tid ) { // <br>
|
2016-01-20 07:32:13 -05:00
|
|
|
|
if ( f != fstart ) {
|
2016-01-11 09:46:09 -05:00
|
|
|
|
if ( ( fend - f > 2 * maxCharSize ) ) {
|
2016-02-04 07:47:49 -05:00
|
|
|
|
if ( prevChar && is_ascii(*prevChar) && (*prevChar != '.') ) {
|
2016-01-28 06:10:42 -05:00
|
|
|
|
*f++ = '.';
|
|
|
|
|
|
|
|
|
|
// counted as caps because we're detecting all caps for a sentence
|
|
|
|
|
++capCount;
|
|
|
|
|
}
|
2016-01-20 07:32:13 -05:00
|
|
|
|
|
2016-01-28 06:10:42 -05:00
|
|
|
|
*f++ = ' ';
|
|
|
|
|
++capCount;
|
2015-12-01 06:38:51 -05:00
|
|
|
|
} else {
|
|
|
|
|
trunc = true;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
|
|
|
|
}
|
2015-12-01 06:38:51 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
lastSpace = true;
|
2015-12-01 06:38:51 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
continue;
|
|
|
|
|
}
|
2015-12-01 06:38:51 -05:00
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
if ( ( fend - f > maxCharSize ) ) {
|
|
|
|
|
*f++ = ' ';
|
|
|
|
|
} else {
|
|
|
|
|
trunc = true;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
2015-12-01 06:38:51 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// do not allow back-to-back spaces
|
|
|
|
|
lastSpace = true;
|
2015-12-01 06:38:51 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
continue;
|
|
|
|
|
}
|
2016-01-07 05:50:56 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// scan through all chars discounting back-to-back spaces
|
|
|
|
|
unsigned char cs = 0;
|
2018-03-09 10:24:39 -05:00
|
|
|
|
const char *p = (*tr)[i].token_start;
|
|
|
|
|
const char *pend = (*tr)[i].token_end();
|
2016-01-29 13:18:22 -05:00
|
|
|
|
|
2016-01-18 11:16:45 -05:00
|
|
|
|
|
2016-05-24 10:55:50 -04:00
|
|
|
|
const char *currentEntity = NULL;
|
2016-01-29 13:18:22 -05:00
|
|
|
|
int32_t currentEntityLen = 0;
|
|
|
|
|
char currentEntityChar = '\0';
|
2016-05-24 10:55:50 -04:00
|
|
|
|
const char *nextEntity = NULL;
|
2016-01-29 13:18:22 -05:00
|
|
|
|
int32_t nextEntityLen = 0;
|
|
|
|
|
char nextEntityChar = '\0';
|
|
|
|
|
|
|
|
|
|
bool hasEntity = false;
|
|
|
|
|
while (currentEntityPos < entityCount) {
|
|
|
|
|
currentEntity = entityPos[currentEntityPos];
|
|
|
|
|
currentEntityLen = entityLen[currentEntityPos];
|
|
|
|
|
currentEntityChar = entityChar[currentEntityPos];
|
|
|
|
|
|
|
|
|
|
if ( currentEntityPos + 1 < entityCount ) {
|
|
|
|
|
nextEntity = entityPos[currentEntityPos + 1];
|
|
|
|
|
nextEntityLen = entityLen[currentEntityPos + 1];
|
|
|
|
|
nextEntityChar = entityChar[currentEntityPos + 1];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ( p <= currentEntity || p <= (currentEntity + currentEntityLen) ) {
|
|
|
|
|
hasEntity = true;
|
|
|
|
|
break;
|
|
|
|
|
} else {
|
|
|
|
|
if (p > currentEntity) {
|
|
|
|
|
++currentEntityPos;
|
|
|
|
|
} else {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2016-01-18 11:16:45 -05:00
|
|
|
|
|
2016-02-18 16:18:42 -05:00
|
|
|
|
/// @todo ALC configurable maxSamePunctCount so we can tweak this as needed
|
|
|
|
|
const int maxSamePunctCount = 5;
|
|
|
|
|
char *lastEllipsis = NULL;
|
|
|
|
|
|
2016-01-18 11:16:45 -05:00
|
|
|
|
// assume filters out to the same # of chars
|
2016-01-29 13:18:22 -05:00
|
|
|
|
for ( ; p < pend; p += cs ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get size
|
|
|
|
|
cs = getUtf8CharSize(p);
|
2015-12-01 06:38:51 -05:00
|
|
|
|
|
2016-01-29 13:18:22 -05:00
|
|
|
|
// skip entity
|
|
|
|
|
if ( hasEntity ) {
|
|
|
|
|
if (p >= currentEntity && p < (currentEntity + currentEntityLen)) {
|
|
|
|
|
if (p == currentEntity) {
|
|
|
|
|
*f++ = currentEntityChar;
|
|
|
|
|
lastSpace = false;
|
|
|
|
|
}
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (nextEntity && p >= nextEntity && p < (nextEntity + nextEntityLen)) {
|
|
|
|
|
if (p == nextEntity) {
|
|
|
|
|
*f++ = nextEntityChar;
|
|
|
|
|
lastSpace = false;
|
|
|
|
|
}
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
// skip unwanted character
|
|
|
|
|
if ( isUtf8UnwantedSymbols( p ) ) {
|
|
|
|
|
continue;
|
2016-01-08 09:20:42 -05:00
|
|
|
|
}
|
|
|
|
|
|
2016-02-18 16:18:42 -05:00
|
|
|
|
|
|
|
|
|
bool resetPunctCount = true;
|
2018-06-06 06:12:47 -04:00
|
|
|
|
if (is_punct_utf8(p) && !is_wspace_utf8(p)) {
|
2016-02-18 16:18:42 -05:00
|
|
|
|
if ( ( cs == lastPunctSize) && ( memcmp(lastPunct, p, cs) == 0 ) ) {
|
|
|
|
|
resetPunctCount = false;
|
|
|
|
|
++samePunctCount;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ( resetPunctCount ) {
|
|
|
|
|
if (samePunctCount >= maxSamePunctCount) {
|
|
|
|
|
f -= (maxSamePunctCount);
|
|
|
|
|
|
|
|
|
|
bool addEllipsis = false;
|
|
|
|
|
if ( lastEllipsis ) {
|
|
|
|
|
// if all from f to last ellipsis are punctuation, skip to last ellipsis
|
|
|
|
|
for ( char *c = lastEllipsis + 1; c < f; ++c) {
|
|
|
|
|
if ( is_alnum_utf8( c ) ) {
|
2016-11-11 10:40:39 -05:00
|
|
|
|
logTrace(g_conf.m_logTracePos, "addEllipsis=true");
|
2016-02-18 16:18:42 -05:00
|
|
|
|
addEllipsis = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ( !addEllipsis ) {
|
|
|
|
|
f = lastEllipsis;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
2016-11-11 10:32:11 -05:00
|
|
|
|
logTrace(g_conf.m_logTracePos, "addEllipsis=true");
|
2016-02-18 16:18:42 -05:00
|
|
|
|
addEllipsis = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (addEllipsis) {
|
2016-11-11 10:32:11 -05:00
|
|
|
|
logTrace(g_conf.m_logTracePos, "addEllipsis");
|
|
|
|
|
|
2016-02-18 16:18:42 -05:00
|
|
|
|
if ( f != fstart && *(f - 1) != ' ' ) {
|
|
|
|
|
*f++ = ' ';
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
lastSpace = true;
|
2016-11-11 10:32:11 -05:00
|
|
|
|
memcpy ( f, "\342\200\246 ", 4 ); //horizontal ellipsis, code point 0x2026
|
2016-02-18 16:18:42 -05:00
|
|
|
|
f += 4;
|
|
|
|
|
|
|
|
|
|
lastEllipsis = f;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
lastPunct = p;
|
|
|
|
|
lastPunctSize = cs;
|
|
|
|
|
samePunctCount = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ( samePunctCount >= maxSamePunctCount ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// do not count space if one before
|
|
|
|
|
if ( is_wspace_utf8 (p) ) {
|
2015-12-01 06:38:51 -05:00
|
|
|
|
if ( lastSpace ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
lastSpace = true;
|
2015-12-01 06:38:51 -05:00
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
if ( fend - f > 1 ) {
|
|
|
|
|
lastBreakPrevChar = prevChar;
|
2016-01-19 08:26:35 -05:00
|
|
|
|
|
2016-01-20 07:49:30 -05:00
|
|
|
|
// don't store lastBreak if we have less than ellipsis length ' ...'
|
|
|
|
|
if ( fend - f > 4 ) {
|
|
|
|
|
lastBreak = f;
|
|
|
|
|
}
|
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
*f++ = ' ';
|
2016-01-18 13:09:38 -05:00
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
// counted as caps because we're detecting all caps for a sentence
|
|
|
|
|
++capCount;
|
2016-01-19 08:26:35 -05:00
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
dotCount = 0;
|
2016-01-19 08:26:35 -05:00
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
// we don't store space as dotPreviousChar because we want to strip ' ...' as well
|
|
|
|
|
} else {
|
|
|
|
|
trunc = true;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
2015-12-01 06:38:51 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
continue;
|
|
|
|
|
}
|
2016-01-08 09:20:42 -05:00
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
if ( fend - f > cs ) {
|
|
|
|
|
prevChar = f;
|
|
|
|
|
|
|
|
|
|
if ( cs == 1 ) {
|
|
|
|
|
// we only do it for ascii to avoid catering for different rules in different languages
|
|
|
|
|
// https://en.wikipedia.org/wiki/Letter_case#Exceptional_letters_and_digraphs
|
|
|
|
|
// eg:
|
|
|
|
|
// The Greek upper-case letter "Σ" has two different lower-case forms:
|
|
|
|
|
// "ς" in word-final position and "σ" elsewhere
|
|
|
|
|
if ( !is_alpha_a( *p ) || is_upper_a( *p ) ) {
|
|
|
|
|
// non-alpha is counted as caps as well because we're detecting all caps for a sentence
|
|
|
|
|
// and comma/quotes/etc. is included
|
|
|
|
|
++capCount;
|
|
|
|
|
}
|
2016-01-19 08:26:35 -05:00
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
// some sites try to be smart and truncate for us, let's remove that
|
|
|
|
|
// if if there are no space between dots and letter
|
|
|
|
|
if ( *p == '.' ) {
|
|
|
|
|
++dotCount;
|
2015-12-01 06:38:51 -05:00
|
|
|
|
} else {
|
2016-01-19 08:26:35 -05:00
|
|
|
|
dotCount = 0;
|
|
|
|
|
dotPrevChar = f;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
2016-01-20 07:32:13 -05:00
|
|
|
|
|
|
|
|
|
*f++ = *p;
|
2016-01-11 09:46:09 -05:00
|
|
|
|
} else {
|
2016-01-20 07:32:13 -05:00
|
|
|
|
dotCount = 0;
|
|
|
|
|
dotPrevChar = f;
|
|
|
|
|
|
2018-07-26 10:19:54 -04:00
|
|
|
|
memcpy( f, p, cs );
|
2016-01-20 07:32:13 -05:00
|
|
|
|
f += cs;
|
2015-12-01 06:38:51 -05:00
|
|
|
|
}
|
2016-01-20 07:32:13 -05:00
|
|
|
|
} else {
|
|
|
|
|
trunc = true;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
lastSpace = false;
|
|
|
|
|
}
|
|
|
|
|
}
|
2015-12-01 06:38:51 -05:00
|
|
|
|
|
2016-01-19 08:26:35 -05:00
|
|
|
|
/// @todo ALC simplify logic/break into smaller functions
|
2016-01-18 13:09:38 -05:00
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
/// @todo ALC configurable minCapCount so we can tweak this as needed
|
|
|
|
|
const int minCapCount = 5;
|
2016-01-18 13:09:38 -05:00
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
// only capitalize first letter in a word for a sentence with all caps
|
2018-03-09 10:24:39 -05:00
|
|
|
|
//TODO: assumes we want a us-centric title capitilization. There are other styles.
|
|
|
|
|
//FIXME: Assumes lowercasing a codepoint doesn't change its utf8-encoding length. This is not true (eg. Turkish U+0130 İ -> U+0069 i)
|
2016-01-20 07:32:13 -05:00
|
|
|
|
if ( capCount > minCapCount && capCount == ( f - fstart ) ) {
|
2016-11-11 10:40:39 -05:00
|
|
|
|
logTrace(g_conf.m_logTracePos, "all caps");
|
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
bool isFirstLetter = true;
|
2016-01-18 13:09:38 -05:00
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
unsigned char cs = 0;
|
|
|
|
|
for ( char *c = fstart; c < f; c += cs ) {
|
|
|
|
|
cs = getUtf8CharSize(c);
|
|
|
|
|
|
|
|
|
|
bool isAlpha = is_alpha_utf8( c );
|
|
|
|
|
|
|
|
|
|
if ( isAlpha ) {
|
|
|
|
|
if (isFirstLetter) {
|
|
|
|
|
isFirstLetter = false;
|
2016-01-18 13:09:38 -05:00
|
|
|
|
continue;
|
|
|
|
|
}
|
2016-01-20 07:32:13 -05:00
|
|
|
|
} else {
|
2016-06-09 08:51:52 -04:00
|
|
|
|
// some hard coded punctuation that we don't want to treat as first letter
|
|
|
|
|
// eg: Program's instead of Program'S
|
|
|
|
|
if ( cs == 1 && *c == '\'' ) {
|
|
|
|
|
isFirstLetter = false;
|
|
|
|
|
} else {
|
|
|
|
|
isFirstLetter = true;
|
|
|
|
|
}
|
2016-01-20 07:32:13 -05:00
|
|
|
|
continue;
|
|
|
|
|
}
|
2016-01-18 13:09:38 -05:00
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
if ( !isFirstLetter ) {
|
|
|
|
|
to_lower_utf8(c, c);
|
2018-03-09 10:24:39 -05:00
|
|
|
|
//TODO: do titlecase on the first letter - don't leave it as uppercase
|
2016-01-18 13:09:38 -05:00
|
|
|
|
}
|
|
|
|
|
}
|
2016-01-20 07:32:13 -05:00
|
|
|
|
}
|
2016-01-18 13:09:38 -05:00
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
/// @todo ALC configurable minRemoveEllipsisLen so we can tweak this as needed
|
2016-11-11 07:57:47 -05:00
|
|
|
|
const int minRemoveEllipsisLen = 90;
|
2016-01-20 07:32:13 -05:00
|
|
|
|
|
2016-11-11 07:51:55 -05:00
|
|
|
|
logTrace(g_conf.m_logTracePos, "len=%ld", (f - fstart));
|
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
// let's remove ellipsis (...) at the end
|
|
|
|
|
if ( (f - fstart) >= minRemoveEllipsisLen && dotCount == 3 ) {
|
2016-11-11 07:51:55 -05:00
|
|
|
|
logTrace(g_conf.m_logTracePos, "remove ellipsis");
|
2018-02-04 04:43:15 -05:00
|
|
|
|
if ( dotPrevChar ) {
|
|
|
|
|
if ( is_ascii3( *dotPrevChar ) ) {
|
|
|
|
|
logTrace(g_conf.m_logTracePos, "dotPrevChar=%c", *dotPrevChar);
|
|
|
|
|
switch ( *dotPrevChar ) {
|
|
|
|
|
case ',':
|
|
|
|
|
trunc = true;
|
|
|
|
|
lastBreak = dotPrevChar + 1;
|
|
|
|
|
break;
|
|
|
|
|
case '!':
|
|
|
|
|
case '.':
|
|
|
|
|
trunc = false;
|
|
|
|
|
f = dotPrevChar + 1;
|
|
|
|
|
break;
|
|
|
|
|
case ' ':
|
|
|
|
|
trunc = false;
|
|
|
|
|
|
|
|
|
|
if ( lastBreak ) {
|
|
|
|
|
f = lastBreak;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
trunc = true;
|
2016-01-19 08:26:35 -05:00
|
|
|
|
|
2018-02-04 04:43:15 -05:00
|
|
|
|
if ( lastBreakPrevChar ) {
|
|
|
|
|
logTrace(g_conf.m_logTracePos, "lastBreakPrevChar=%c", *lastBreakPrevChar);
|
|
|
|
|
if ( is_ascii( *( lastBreakPrevChar ) ) ) {
|
|
|
|
|
switch ( *( lastBreakPrevChar ) ) {
|
|
|
|
|
case '!':
|
|
|
|
|
case '.':
|
|
|
|
|
trunc = false;
|
|
|
|
|
|
|
|
|
|
if (lastBreak) {
|
|
|
|
|
f = lastBreak;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
2016-01-19 08:26:35 -05:00
|
|
|
|
}
|
|
|
|
|
}
|
2018-02-04 04:43:15 -05:00
|
|
|
|
break;
|
|
|
|
|
}
|
2016-01-19 08:26:35 -05:00
|
|
|
|
}
|
2018-02-04 04:43:15 -05:00
|
|
|
|
} else {
|
|
|
|
|
trunc = true;
|
|
|
|
|
lastBreak = nullptr;
|
2016-01-19 08:26:35 -05:00
|
|
|
|
}
|
2016-01-12 09:33:42 -05:00
|
|
|
|
}
|
2016-01-11 09:46:09 -05:00
|
|
|
|
if ( trunc ) {
|
2016-11-11 07:51:55 -05:00
|
|
|
|
logTrace(g_conf.m_logTracePos, "trunc");
|
|
|
|
|
|
2016-01-11 09:46:09 -05:00
|
|
|
|
if ( lastBreak == NULL ) {
|
2016-11-11 07:51:55 -05:00
|
|
|
|
logTrace(g_conf.m_logTracePos, "END. Return 0");
|
2016-01-20 07:32:13 -05:00
|
|
|
|
return 0;
|
2016-01-19 08:26:35 -05:00
|
|
|
|
}
|
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
f = lastBreak;
|
2015-12-01 06:38:51 -05:00
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
/// @todo ALC we should cater ellipsis for different languages
|
2016-01-19 08:26:35 -05:00
|
|
|
|
if ( addEllipsis ) {
|
2016-11-11 07:51:55 -05:00
|
|
|
|
logTrace(g_conf.m_logTracePos, "addEllipsis");
|
2016-01-19 08:26:35 -05:00
|
|
|
|
if ( (fend - f) > 4 ) {
|
2016-05-13 05:31:28 -04:00
|
|
|
|
memcpy ( f, " \342\200\246", 4 ); //horizontal ellipsis, code point 0x2026
|
2016-01-19 08:26:35 -05:00
|
|
|
|
f += 4;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2016-01-18 13:09:38 -05:00
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
// NULL terminate f
|
|
|
|
|
*f = '\0';
|
|
|
|
|
|
2016-11-11 07:51:55 -05:00
|
|
|
|
int bytesStored = static_cast<int>(f - fstart);
|
|
|
|
|
|
|
|
|
|
logTrace(g_conf.m_logTracePos, "END. Return %d", bytesStored);
|
|
|
|
|
|
|
|
|
|
return bytesStored;
|
2016-01-20 07:32:13 -05:00
|
|
|
|
}
|
|
|
|
|
|
2018-03-09 10:24:39 -05:00
|
|
|
|
bool Pos::set(const TokenizerResult *tr, int32_t a, int32_t b) {
|
2016-01-20 07:32:13 -05:00
|
|
|
|
// free m_buf in case this is a second call
|
|
|
|
|
reset();
|
|
|
|
|
|
2018-03-09 10:24:39 -05:00
|
|
|
|
int32_t nw = tr->size();
|
2016-01-20 07:32:13 -05:00
|
|
|
|
|
|
|
|
|
// -1 is the default value
|
|
|
|
|
if ( b == -1 ) {
|
|
|
|
|
b = nw;
|
2015-12-01 06:38:51 -05:00
|
|
|
|
}
|
|
|
|
|
|
2016-01-20 07:32:13 -05:00
|
|
|
|
// alloc array if need to
|
|
|
|
|
int32_t need = (nw+1) * 4;
|
|
|
|
|
|
|
|
|
|
// do not destroy m_pos/m_numWords if only filtering into a buffer
|
|
|
|
|
m_needsFree = false;
|
|
|
|
|
|
|
|
|
|
m_buf = m_localBuf;
|
|
|
|
|
if ( need > POS_LOCALBUFSIZE ) {
|
|
|
|
|
m_buf = (char *)mmalloc(need,"Pos");
|
|
|
|
|
m_needsFree = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// bail on error
|
|
|
|
|
if ( ! m_buf ) {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
m_bufSize = need;
|
|
|
|
|
m_pos = (int32_t *)m_buf;
|
|
|
|
|
|
|
|
|
|
// this is the CHARACTER count.
|
|
|
|
|
int32_t pos = 0;
|
|
|
|
|
|
|
|
|
|
// flag for stopping back-to-back spaces. only count those as one char.
|
|
|
|
|
bool lastSpace = false;
|
|
|
|
|
|
|
|
|
|
for ( int32_t i = a ; i < b ; i++ ) {
|
|
|
|
|
// set pos for the ith word to "pos"
|
|
|
|
|
m_pos[i] = pos;
|
|
|
|
|
|
2018-03-09 10:24:39 -05:00
|
|
|
|
nodeid_t tid = (*tr)[i].nodeid;
|
2016-01-20 07:32:13 -05:00
|
|
|
|
// is tag?
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( tid ) {
|
2016-01-20 07:32:13 -05:00
|
|
|
|
// if not breaking, does nothing
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( !g_nodes[tid & 0x7f].m_isBreaking ) {
|
2016-01-20 07:32:13 -05:00
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// list tag? <li>
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( tid == TAG_LI ) {
|
2016-01-20 07:32:13 -05:00
|
|
|
|
++pos;
|
|
|
|
|
lastSpace = false;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// if had a previous breaking tag and no non-tag
|
|
|
|
|
// word after it, do not count back-to-back spaces
|
|
|
|
|
if ( lastSpace ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// if had a br tag count it as a '. '
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( tid ) { // <br>
|
2016-01-20 07:32:13 -05:00
|
|
|
|
pos += 2;
|
|
|
|
|
lastSpace = true;
|
|
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// count as a single space
|
|
|
|
|
pos++;
|
|
|
|
|
|
|
|
|
|
// do not allow back-to-back spaces
|
|
|
|
|
lastSpace = true;
|
|
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// scan through all chars discounting back-to-back spaces
|
2018-03-09 10:24:39 -05:00
|
|
|
|
const char *wp = (*tr)[i].token_start;
|
|
|
|
|
const char *pend = wp + (*tr)[i].token_len;
|
2016-01-20 07:32:13 -05:00
|
|
|
|
unsigned char cs = 0;
|
|
|
|
|
|
|
|
|
|
// assume filters out to the same # of chars
|
2016-05-23 10:39:52 -04:00
|
|
|
|
for ( const char *p = wp; p < pend; p += cs ) {
|
2016-01-20 07:32:13 -05:00
|
|
|
|
// get size
|
|
|
|
|
cs = getUtf8CharSize(p);
|
|
|
|
|
|
|
|
|
|
// do not count space if one before
|
|
|
|
|
if ( is_wspace_utf8 (p) ) {
|
|
|
|
|
if ( lastSpace ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
lastSpace = true;
|
|
|
|
|
|
|
|
|
|
++pos;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
++pos;
|
|
|
|
|
lastSpace = false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// set pos for the END of the last word here
|
|
|
|
|
m_pos[nw] = pos;
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
return true;
|
|
|
|
|
}
|