2013-08-02 13:12:24 -07:00
|
|
|
|
#include "gb-include.h"
|
|
|
|
|
|
|
|
|
|
#include "Pos.h"
|
2016-01-12 15:33:42 +01:00
|
|
|
|
#include "Words.h"
|
2013-08-02 13:12:24 -07:00
|
|
|
|
#include "Sections.h"
|
|
|
|
|
|
|
|
|
|
/// Construct an empty Pos: no position buffer attached and nothing to free.
Pos::Pos() {
	// nothing has been allocated yet, so reset() must not try to free
	m_needsFree = false;
	m_buf       = NULL;
}
|
|
|
|
|
|
|
|
|
|
/// Destructor: hands cleanup of the position buffer to reset().
Pos::~Pos() {
	// reset() frees m_buf when it was heap-allocated
	reset();
}
|
|
|
|
|
|
|
|
|
|
void Pos::reset() {
|
|
|
|
|
if ( m_buf && m_needsFree )
|
|
|
|
|
mfree ( m_buf , m_bufSize , "Pos" );
|
|
|
|
|
m_buf = NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2016-01-28 12:10:42 +01:00
|
|
|
|
/// Track whether the tag stream is currently inside an element of type
/// expectedTagId.
///
/// @param tagId          id of the tag being visited
/// @param expectedTagId  id of the (front) tag we are tracking
/// @param count          in/out nesting counter owned by the caller; NULL
///                       makes the function return false
/// @return true while the counter is positive after processing tagId
///
/// The front-tag and back-tag cases are mutually exclusive. Previously a front
/// tag matched both tests (tagId == expectedTagId also implies
/// (tagId & BACKBITCOMP) == expectedTagId), so the counter was incremented and
/// immediately decremented and the function could never report "inside".
///
/// NOTE(review): assumes BACKBITCOMP masks off the closing-tag bit of a
/// nodeid_t -- confirm against its definition.
static bool inTag( nodeid_t tagId, nodeid_t expectedTagId, int *count ) {
	if ( !count ) {
		return false;
	}

	if ( tagId == expectedTagId ) {
		// front tag: one level deeper
		++( *count );
	} else if ( *count && ( tagId & BACKBITCOMP ) == expectedTagId ) {
		// back tag: only unwind if we are actually inside one
		--( *count );
	}

	return ( *count > 0 );
}
|
|
|
|
|
|
2016-05-24 16:55:50 +02:00
|
|
|
|
/// Filter the words in [a, b) into the buffer [f, fend) as display text.
///
/// What the filter does while copying:
///  - tags emit no markup; a breaking tag contributes '*' (for <li>) or
///    ". " / " ", non-breaking tags contribute nothing
///  - runs of whitespace collapse to a single space
///  - characters rejected by isUtf8UnwantedSymbols() are dropped
///  - long runs of the same punctuation character are replaced by an ellipsis
///  - for version >= 122 the entities &gt; &lt; &amp; are decoded
///  - an output that is entirely "caps" is lower-cased except for the first
///    letter of each word
///  - a trailing "..." on a long (>= 120 byte) output is stripped
///
/// @param words        source words; getTagIds()/getWord()/getWordLen() are read
/// @param a            first word to filter
/// @param b            one past the last word to filter; -1 means all words
/// @param addEllipsis  when the output had to be truncated, append " …" if it fits
/// @param f            start of the output buffer
/// @param fend         end of the output buffer
/// @param version      title rec version; >= 122 enables entity decoding
/// @return number of bytes written (excluding the terminating NUL), or 0 when
///         the text was truncated and no word break was available to cut at
int32_t Pos::filter( const Words *words, int32_t a, int32_t b, bool addEllipsis, char *f, char *fend, int32_t version ) {
	const nodeid_t *tids = words->getTagIds();

	// save start point for filtering
	char *fstart = f;

	// -1 is the default value
	if ( b == -1 ) {
		b = words->getNumWords();
	}

	bool trunc = false;

	static const int32_t maxCharSize = 4; // we are utf8

	char* prevChar = NULL;

	char* lastBreak = NULL;
	char* lastBreakPrevChar = NULL; // store char before space

	// flag for stopping back-to-back spaces. only count those as one char.
	bool lastSpace = false;

	int inBadTags = 0;
	int capCount = 0;

	const char *lastPunct = NULL;
	unsigned char lastPunctSize = 0;
	int samePunctCount = 0;

	int dotCount = 0; // store last encountered total consecutive dots
	char* dotPrevChar = NULL; // store char before dot which is not a space

	// entities found by the pre-scan below, in source order
	const char* entityPos[32];
	int32_t entityLen[32];
	char entityChar[32];
	int32_t entityCount = 0;

	// we need to decode HTML entities for version 122 and above because we
	// stop decoding &amp; &gt; &lt; to avoid losing information.
	// skip the scan for an empty range: there is no word to take the bounds
	// from (the old code ended up calling getWord(-1) in that case)
	if ( version >= 122 && a < b ) { // TITLEREC_CURRENT_VERSION
		int32_t maxWord = b;

		if (maxWord == words->getNumWords()) {
			maxWord -= 1;
		}

		int32_t totalLen = (words->getWord(maxWord) + words->getWordLen(maxWord)) - words->getWord(a);
		const char *pos = words->getWord(a);
		const char *endPos = pos + totalLen;

		for ( ; ( pos + 3 ) < endPos; ++pos ) {
			if (*pos == '&') {
				if (*(pos + 3) == ';') {
					if (*(pos + 2) == 't') {
						char c = *(pos + 1);
						if ( c == 'g' || c == 'l' ) {
							// &gt; / &lt;
							entityPos[entityCount] = pos;
							entityLen[entityCount] = 4;
							if ( c == 'g' ) {
								entityChar[entityCount] = '>';
							} else {
								entityChar[entityCount] = '<';
							}
							++entityCount;
						}
					}
				} else if ((pos + 4 < endPos) && *(pos + 4) == ';') {
					if (*(pos + 1) == 'a' && *(pos + 2) == 'm' && *(pos + 3) == 'p') {
						// &amp;
						entityPos[entityCount] = pos;
						entityLen[entityCount] = 5;
						entityChar[entityCount] = '&';
						++entityCount;
					}
				}
			}

			// make sure we don't overflow
			if (entityCount >= 32) {
				break;
			}
		}
	}

	// index of the first recorded entity that has not been fully passed yet
	int32_t currentEntityPos = 0;

	for ( int32_t i = a ; i < b ; ++i ) {
		if (trunc) {
			break;
		}

		// is tag?
		if ( tids && tids[i] ) {
			// let's not get from bad tags
			if ( inTag( tids[i], TAG_STYLE, &inBadTags ) ) {
				continue;
			}

			if ( inTag( tids[i], TAG_SCRIPT, &inBadTags ) ) {
				continue;
			}

			// if not breaking, does nothing
			if ( !g_nodes[tids[i] & 0x7f].m_isBreaking ) {
				continue;
			}

			// list tag? <li>
			if ( tids[i] == TAG_LI ) {
				if ( ( fend - f > maxCharSize ) ) {
					*f++ = '*';

					// counted as caps because we're detecting all caps for a sentence
					++capCount;
				} else {
					trunc = true;
				}

				lastSpace = false;
				continue;
			}

			// if had a previous breaking tag and no non-tag
			// word after it, do not count back-to-back spaces
			if ( lastSpace ) {
				continue;
			}

			// if had a br tag count it as a '.'
			// NOTE(review): this test is always true here (we are inside
			// "tids[i] != 0"), so EVERY breaking tag is treated like <br> and
			// the single-space fallback below is never reached. Left as is
			// because Pos::set() mirrors this logic; confirm intent before
			// changing either.
			if ( tids[i] ) { // <br>
				if ( f != fstart ) {
					if ( ( fend - f > 2 * maxCharSize ) ) {
						if ( prevChar && is_ascii(*prevChar) && (*prevChar != '.') ) {
							*f++ = '.';

							// counted as caps because we're detecting all caps for a sentence
							++capCount;
						}

						*f++ = ' ';
						++capCount;
					} else {
						trunc = true;
					}
				}

				lastSpace = true;

				continue;
			}

			if ( ( fend - f > maxCharSize ) ) {
				*f++ = ' ';
			} else {
				trunc = true;
			}

			// do not allow back-to-back spaces
			lastSpace = true;

			continue;
		}

		// scan through all chars discounting back-to-back spaces
		unsigned char cs = 0;
		const char *p = words->getWord(i) ;
		const char *pend = words->getWord(i) + words->getWordLen(i);

		const char *currentEntity = NULL;
		int32_t currentEntityLen = 0;
		char currentEntityChar = '\0';
		const char *nextEntity = NULL;
		int32_t nextEntityLen = 0;
		char nextEntityChar = '\0';

		// find the first entity that ends at or after the start of this word.
		// entities recorded earlier in the text are skipped for good.
		bool hasEntity = false;
		while (currentEntityPos < entityCount) {
			currentEntity = entityPos[currentEntityPos];
			currentEntityLen = entityLen[currentEntityPos];
			currentEntityChar = entityChar[currentEntityPos];

			if ( currentEntityPos + 1 < entityCount ) {
				nextEntity = entityPos[currentEntityPos + 1];
				nextEntityLen = entityLen[currentEntityPos + 1];
				nextEntityChar = entityChar[currentEntityPos + 1];
			}

			if ( p <= (currentEntity + currentEntityLen) ) {
				hasEntity = true;
				break;
			}

			// this word starts after the entity; try the next one
			++currentEntityPos;
		}

		/// @todo ALC configurable maxSamePunctCount so we can tweak this as needed
		const int maxSamePunctCount = 5;
		char *lastEllipsis = NULL;

		// assume filters out to the same # of chars
		for ( ; p < pend; p += cs ) {
			// get size
			cs = getUtf8CharSize(p);

			// replace each entity by its decoded char, emitted once at the
			// entity's first byte; the remaining entity bytes are skipped
			if ( hasEntity ) {
				if (p >= currentEntity && p < (currentEntity + currentEntityLen)) {
					if (p == currentEntity) {
						// need room for the char plus the terminating NUL
						if ( fend - f > 1 ) {
							*f++ = currentEntityChar;
							lastSpace = false;
						} else {
							trunc = true;
						}
					}
					continue;
				}

				if (nextEntity && p >= nextEntity && p < (nextEntity + nextEntityLen)) {
					if (p == nextEntity) {
						// need room for the char plus the terminating NUL
						if ( fend - f > 1 ) {
							*f++ = nextEntityChar;
							lastSpace = false;
						} else {
							trunc = true;
						}
					}
					continue;
				}
			}

			// skip unwanted character
			if ( isUtf8UnwantedSymbols( p ) ) {
				continue;
			}

			// count how many times in a row we see the same punctuation char
			bool resetPunctCount = true;
			if ( is_punct_utf8( p ) ) {
				if ( ( cs == lastPunctSize) && ( memcmp(lastPunct, p, cs) == 0 ) ) {
					resetPunctCount = false;
					++samePunctCount;
				}
			}

			if ( resetPunctCount ) {
				// a long run just ended: replace what we emitted of it by an ellipsis
				if (samePunctCount >= maxSamePunctCount) {
					// NOTE(review): backing up by maxSamePunctCount bytes assumes
					// that many single-byte copies of the run were written --
					// confirm for multi-byte punctuation.
					// never back up past the start of the output
					if ( ( f - fstart ) >= maxSamePunctCount ) {
						f -= maxSamePunctCount;
					} else {
						f = fstart;
					}

					bool insertEllipsis = false;
					if ( lastEllipsis ) {
						// if all from f to last ellipsis are punctuation, skip to last ellipsis
						for ( char *c = lastEllipsis + 1; c < f; ++c) {
							if ( is_alnum_utf8( c ) ) {
								insertEllipsis = true;
								break;
							}
						}

						if ( !insertEllipsis ) {
							f = lastEllipsis;
						}
					} else {
						insertEllipsis = true;
					}

					if (insertEllipsis) {
						const bool needSpace = ( f != fstart && *(f - 1) != ' ' );
						const int32_t needBytes = needSpace ? 5 : 4;

						// keep one byte spare for the terminating NUL
						if ( ( fend - f ) > needBytes ) {
							if ( needSpace ) {
								*f++ = ' ';
							}

							lastSpace = true;
							memcpy ( f, " \342\200\246", 4 ); //horizontal ellipsis, code point 0x2026
							f += 4;

							lastEllipsis = f;
						} else {
							trunc = true;
						}
					}
				}

				lastPunct = p;
				lastPunctSize = cs;
				samePunctCount = 0;
			}

			// still inside an over-long punctuation run: emit nothing
			if ( samePunctCount >= maxSamePunctCount ) {
				continue;
			}

			// do not count space if one before
			if ( is_wspace_utf8 (p) ) {
				if ( lastSpace ) {
					continue;
				}

				lastSpace = true;

				if ( fend - f > 1 ) {
					lastBreakPrevChar = prevChar;

					// don't store lastBreak if we have less than ellipsis length ' ...'
					if ( fend - f > 4 ) {
						lastBreak = f;
					}

					*f++ = ' ';

					// counted as caps because we're detecting all caps for a sentence
					++capCount;

					dotCount = 0;

					// we don't store space as dotPreviousChar because we want to strip ' ...' as well
				} else {
					trunc = true;
				}

				continue;
			}

			if ( fend - f > cs ) {
				prevChar = f;

				if ( cs == 1 ) {
					// we only do it for ascii to avoid catering for different rules in different languages
					// https://en.wikipedia.org/wiki/Letter_case#Exceptional_letters_and_digraphs
					// eg:
					//   The Greek upper-case letter "Σ" has two different lower-case forms:
					//   "ς" in word-final position and "σ" elsewhere
					if ( !is_alpha_a( *p ) || is_upper_a( *p ) ) {
						// non-alpha is counted as caps as well because we're detecting all caps for a sentence
						// and comma/quotes/etc. is included
						++capCount;
					}

					// some sites try to be smart and truncate for us, let's remove that
					// if there are no space between dots and letter
					if ( *p == '.' ) {
						++dotCount;
					} else {
						dotCount = 0;
						dotPrevChar = f;
					}

					*f++ = *p;
				} else {
					dotCount = 0;
					dotPrevChar = f;

					gbmemcpy( f, p, cs );
					f += cs;
				}
			} else {
				trunc = true;
			}

			lastSpace = false;
		}
	}

	/// @todo ALC simplify logic/break into smaller functions

	/// @todo ALC configurable minCapCount so we can tweak this as needed
	const int minCapCount = 5;

	// only capitalize first letter in a word for a sentence with all caps
	if ( capCount > minCapCount && capCount == ( f - fstart ) ) {
		bool isFirstLetter = true;

		unsigned char cs = 0;
		for ( char *c = fstart; c < f; c += cs ) {
			cs = getUtf8CharSize(c);

			bool isAlpha = is_alpha_utf8( c );

			if ( isAlpha ) {
				if (isFirstLetter) {
					isFirstLetter = false;
					continue;
				}
			} else {
				// some hard coded punctuation that we don't want to treat as first letter
				// eg: Program's instead of Program'S
				if ( cs == 1 && *c == '\'' ) {
					isFirstLetter = false;
				} else {
					isFirstLetter = true;
				}
				continue;
			}

			if ( !isFirstLetter ) {
				to_lower_utf8(c, c);
			}
		}
	}

	/// @todo ALC configurable minRemoveEllipsisLen so we can tweak this as needed
	const int minRemoveEllipsisLen = 120;

	// let's remove ellipsis (...) at the end.
	// dotPrevChar stays NULL when no non-dot, non-space char was ever written,
	// so it must be checked before it is dereferenced
	if ( (f - fstart) >= minRemoveEllipsisLen && dotCount == 3 && dotPrevChar ) {
		if ( is_ascii3( *dotPrevChar ) ) {
			switch ( *dotPrevChar ) {
				case ',':
					trunc = true;
					lastBreak = dotPrevChar + 1;
					break;
				case '!':
				case '.':
					trunc = false;
					f = dotPrevChar + 1;
					break;
				case ' ':
					trunc = false;

					if ( lastBreak ) {
						f = lastBreak;
					}
					break;
				default:
					trunc = true;

					// if the last word break follows a sentence end, cut there
					// and do not treat the result as truncated
					if ( lastBreakPrevChar && is_ascii( *lastBreakPrevChar ) &&
					     ( *lastBreakPrevChar == '!' || *lastBreakPrevChar == '.' ) ) {
						trunc = false;

						if ( lastBreak ) {
							f = lastBreak;
						}
					}
					break;
			}
		}
	}

	if ( trunc ) {
		if ( lastBreak == NULL ) {
			// nowhere to cut; hand back an empty, terminated string
			if ( fstart < fend ) {
				*fstart = '\0';
			}
			return 0;
		}

		f = lastBreak;

		/// @todo ALC we should cater ellipsis for different languages
		if ( addEllipsis ) {
			if ( (fend - f) > 4 ) {
				memcpy ( f, " \342\200\246", 4 ); //horizontal ellipsis, code point 0x2026
				f += 4;
			}
		}
	}

	// NULL terminate f
	*f = '\0';

	return (f - fstart);
}
|
|
|
|
|
|
2016-05-23 16:55:18 +02:00
|
|
|
|
bool Pos::set( const Words *words, int32_t a, int32_t b ) {
|
2016-01-20 13:32:13 +01:00
|
|
|
|
// free m_buf in case this is a second call
|
|
|
|
|
reset();
|
|
|
|
|
|
|
|
|
|
int32_t nw = words->getNumWords();
|
2016-05-23 16:39:52 +02:00
|
|
|
|
const nodeid_t *tids = words->getTagIds();
|
2016-01-20 13:32:13 +01:00
|
|
|
|
|
|
|
|
|
// -1 is the default value
|
|
|
|
|
if ( b == -1 ) {
|
|
|
|
|
b = nw;
|
2015-12-01 12:38:51 +01:00
|
|
|
|
}
|
|
|
|
|
|
2016-01-20 13:32:13 +01:00
|
|
|
|
// alloc array if need to
|
|
|
|
|
int32_t need = (nw+1) * 4;
|
|
|
|
|
|
|
|
|
|
// do not destroy m_pos/m_numWords if only filtering into a buffer
|
|
|
|
|
m_needsFree = false;
|
|
|
|
|
|
|
|
|
|
m_buf = m_localBuf;
|
|
|
|
|
if ( need > POS_LOCALBUFSIZE ) {
|
|
|
|
|
m_buf = (char *)mmalloc(need,"Pos");
|
|
|
|
|
m_needsFree = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// bail on error
|
|
|
|
|
if ( ! m_buf ) {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
m_bufSize = need;
|
|
|
|
|
m_pos = (int32_t *)m_buf;
|
|
|
|
|
|
|
|
|
|
// this is the CHARACTER count.
|
|
|
|
|
int32_t pos = 0;
|
|
|
|
|
|
|
|
|
|
// flag for stopping back-to-back spaces. only count those as one char.
|
|
|
|
|
bool lastSpace = false;
|
|
|
|
|
|
|
|
|
|
for ( int32_t i = a ; i < b ; i++ ) {
|
|
|
|
|
// set pos for the ith word to "pos"
|
|
|
|
|
m_pos[i] = pos;
|
|
|
|
|
|
|
|
|
|
// is tag?
|
|
|
|
|
if ( tids && tids[i] ) {
|
|
|
|
|
// if not breaking, does nothing
|
|
|
|
|
if ( !g_nodes[tids[i] & 0x7f].m_isBreaking ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// list tag? <li>
|
|
|
|
|
if ( tids[i] == TAG_LI ) {
|
|
|
|
|
++pos;
|
|
|
|
|
lastSpace = false;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// if had a previous breaking tag and no non-tag
|
|
|
|
|
// word after it, do not count back-to-back spaces
|
|
|
|
|
if ( lastSpace ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// if had a br tag count it as a '. '
|
|
|
|
|
if ( tids[i] ) { // <br>
|
|
|
|
|
pos += 2;
|
|
|
|
|
lastSpace = true;
|
|
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// count as a single space
|
|
|
|
|
pos++;
|
|
|
|
|
|
|
|
|
|
// do not allow back-to-back spaces
|
|
|
|
|
lastSpace = true;
|
|
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// scan through all chars discounting back-to-back spaces
|
2016-05-23 16:39:52 +02:00
|
|
|
|
const char *wp = words->getWord(i);
|
|
|
|
|
const char *pend = wp + words->getWordLen(i);
|
2016-01-20 13:32:13 +01:00
|
|
|
|
unsigned char cs = 0;
|
|
|
|
|
|
|
|
|
|
// assume filters out to the same # of chars
|
2016-05-23 16:39:52 +02:00
|
|
|
|
for ( const char *p = wp; p < pend; p += cs ) {
|
2016-01-20 13:32:13 +01:00
|
|
|
|
// get size
|
|
|
|
|
cs = getUtf8CharSize(p);
|
|
|
|
|
|
|
|
|
|
// do not count space if one before
|
|
|
|
|
if ( is_wspace_utf8 (p) ) {
|
|
|
|
|
if ( lastSpace ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
lastSpace = true;
|
|
|
|
|
|
|
|
|
|
++pos;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
++pos;
|
|
|
|
|
lastSpace = false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// set pos for the END of the last word here
|
|
|
|
|
m_pos[nw] = pos;
|
|
|
|
|
|
2013-08-02 13:12:24 -07:00
|
|
|
|
return true;
|
|
|
|
|
}
|