privacore-open-source-searc.../Pos.cpp

653 lines
14 KiB
C++
Raw Normal View History

2013-08-02 16:12:24 -04:00
#include "Pos.h"
#include "tokenizer.h"
#include "XmlNode.h"
2013-08-02 16:12:24 -04:00
#include "Sections.h"
#include "TitleSummaryCodepointFilter.h"
#include "Conf.h"
2016-12-08 10:56:09 -05:00
#include "Mem.h"
2018-07-26 11:29:51 -04:00
#include "Errno.h"
#include "Log.h"
#include "utf8_fast.h"
2013-08-02 16:12:24 -04:00
/// Construct an empty Pos: no buffer attached, nothing to free, and the
/// in-object scratch buffer zero-filled.
Pos::Pos() {
	// zero the built-in scratch buffer up front
	memset(m_localBuf, 0, sizeof(m_localBuf));

	// nothing is attached yet
	m_bufSize   = 0;
	m_pos       = nullptr;
	m_buf       = nullptr;
	m_needsFree = false;
}
/// Destructor: hands cleanup of the position buffer over to reset().
Pos::~Pos() {
	this->reset();
}
void Pos::reset() {
if ( m_buf && m_needsFree )
mfree ( m_buf , m_bufSize , "Pos" );
m_buf = NULL;
}
/// Track whether we are currently inside an <expectedTagId> ... </expectedTagId> pair.
///
/// @param tagId         node id of the tag being visited
/// @param expectedTagId node id (front-tag form) of the tag being tracked
/// @param count         in/out nesting counter; incremented when the front tag
///                      is seen, decremented when its back tag is seen
/// @return true while the counter is above zero after processing tagId,
///         false otherwise (and always false when count is NULL)
static bool inTag( nodeid_t tagId, nodeid_t expectedTagId, int *count ) {
	if ( !count ) {
		return false;
	}

	if ( tagId == expectedTagId ) {
		// front tag
		// This must not fall through to the back-tag test below: masking a
		// front tag with BACKBITCOMP leaves it equal to expectedTagId, which
		// would undo the increment right away and make this function always
		// return false.
		++( *count );
	} else if ( *count > 0 && ( tagId & BACKBITCOMP ) == expectedTagId ) {
		// back tag
		--( *count );
	}

	return ( *count > 0 );
}
unsigned Pos::filter( const TokenizerResult *tr, int32_t a, int32_t b, bool addEllipsis, char *f, char *fend, int32_t version ) {
2016-11-11 07:51:55 -05:00
logTrace(g_conf.m_logTracePos, "BEGIN");
2013-08-02 16:12:24 -04:00
// save start point for filtering
char *fstart = f;
// -1 is the default value
if ( b == -1 ) {
b = tr->size();
}
2013-08-02 16:12:24 -04:00
bool trunc = false;
static const int32_t maxCharSize = 4; // we are utf8
char* prevChar = NULL;
2013-08-02 16:12:24 -04:00
char* lastBreak = NULL;
char* lastBreakPrevChar = NULL; // store char before space
2013-08-02 16:12:24 -04:00
// flag for stopping back-to-back spaces. only count those as one char.
bool lastSpace = false;
int inBadTags = 0;
int capCount = 0;
2016-05-24 10:55:50 -04:00
const char *lastPunct = NULL;
2016-02-18 16:18:42 -05:00
unsigned char lastPunctSize = 0;
int samePunctCount = 0;
int dotCount = 0; // store last encountered total consecutive dots
char* dotPrevChar = NULL; // store char before dot which is not a space
2016-05-24 10:55:50 -04:00
const char* entityPos[32];
int32_t entityLen[32];
char entityChar[32];
int32_t entityCount = 0;
// we need to decode HTML entities for version above 122 because we stop decoding
// & > < to avoid losing information
if (version >= 122) { // TITLEREC_CURRENT_VERSION
int32_t maxWord = b;
if ((unsigned)maxWord == tr->size()) {
maxWord -= 1;
}
const char *pos = (*tr)[a].token_start;
const char *endPos = (*tr)[maxWord].token_end();
for ( ; ( pos + 3 ) < endPos; ++pos ) {
if (*pos == '&') {
if (*(pos + 3) == ';') {
if (*(pos + 2) == 't') {
char c = *(pos + 1);
if ( c == 'g' || c == 'l' ) {
// &gt; / &lt;
entityPos[entityCount] = pos;
entityLen[entityCount] = 4;
if ( c == 'g' ) {
entityChar[entityCount] = '>';
} else {
entityChar[entityCount] = '<';
}
++entityCount;
}
}
} else if ((pos + 4 < endPos) && *(pos + 4) == ';') {
if (*(pos + 1) == 'a' && *(pos + 2) == 'm' && *(pos + 3) == 'p') {
// &amp;
entityPos[entityCount] = pos;
entityLen[entityCount] = 5;
entityChar[entityCount] = '&';
++entityCount;
}
}
}
// make sure we don't overflow
if (entityCount >= 32) {
break;
}
}
}
int32_t currentEntityPos = 0;
2016-01-20 07:32:13 -05:00
for ( int32_t i = a ; i < b ; ++i ) {
if (trunc) {
break;
}
2013-08-02 16:12:24 -04:00
// is tag?
nodeid_t tid = (*tr)[i].nodeid;
if ( tid ) {
2016-11-11 10:40:39 -05:00
logTrace(g_conf.m_logTracePos, "tags");
2016-01-20 07:32:13 -05:00
// let's not get from bad tags
if ( inTag( tid, TAG_STYLE, &inBadTags ) ) {
2016-01-20 07:32:13 -05:00
continue;
}
if ( inTag( tid, TAG_SCRIPT, &inBadTags ) ) {
continue;
}
2013-08-02 16:12:24 -04:00
// if not breaking, does nothing
if ( !g_nodes[tid & 0x7f].m_isBreaking ) {
continue;
}
2013-08-02 16:12:24 -04:00
// list tag? <li>
if ( tid == TAG_LI ) {
2016-01-20 07:32:13 -05:00
if ( ( fend - f > maxCharSize ) ) {
*f++ = '*';
// counted as caps because we're detecting all caps for a sentence
++capCount;
} else {
trunc = true;
2013-08-02 16:12:24 -04:00
}
2016-01-20 07:32:13 -05:00
2013-08-02 16:12:24 -04:00
lastSpace = false;
continue;
}
2013-08-02 16:12:24 -04:00
// if had a previous breaking tag and no non-tag
// word after it, do not count back-to-back spaces
if ( lastSpace ) {
continue;
}
2013-08-02 16:12:24 -04:00
// if had a br tag count it as a '.'
if ( tid ) { // <br>
2016-01-20 07:32:13 -05:00
if ( f != fstart ) {
if ( ( fend - f > 2 * maxCharSize ) ) {
if ( prevChar && is_ascii(*prevChar) && (*prevChar != '.') ) {
*f++ = '.';
// counted as caps because we're detecting all caps for a sentence
++capCount;
}
2016-01-20 07:32:13 -05:00
*f++ = ' ';
++capCount;
} else {
trunc = true;
2013-08-02 16:12:24 -04:00
}
}
2013-08-02 16:12:24 -04:00
lastSpace = true;
2013-08-02 16:12:24 -04:00
continue;
}
2016-01-20 07:32:13 -05:00
if ( ( fend - f > maxCharSize ) ) {
*f++ = ' ';
} else {
trunc = true;
2013-08-02 16:12:24 -04:00
}
2013-08-02 16:12:24 -04:00
// do not allow back-to-back spaces
lastSpace = true;
2013-08-02 16:12:24 -04:00
continue;
}
2013-08-02 16:12:24 -04:00
// scan through all chars discounting back-to-back spaces
unsigned char cs = 0;
const char *p = (*tr)[i].token_start;
const char *pend = (*tr)[i].token_end();
2016-05-24 10:55:50 -04:00
const char *currentEntity = NULL;
int32_t currentEntityLen = 0;
char currentEntityChar = '\0';
2016-05-24 10:55:50 -04:00
const char *nextEntity = NULL;
int32_t nextEntityLen = 0;
char nextEntityChar = '\0';
bool hasEntity = false;
while (currentEntityPos < entityCount) {
currentEntity = entityPos[currentEntityPos];
currentEntityLen = entityLen[currentEntityPos];
currentEntityChar = entityChar[currentEntityPos];
if ( currentEntityPos + 1 < entityCount ) {
nextEntity = entityPos[currentEntityPos + 1];
nextEntityLen = entityLen[currentEntityPos + 1];
nextEntityChar = entityChar[currentEntityPos + 1];
}
if ( p <= currentEntity || p <= (currentEntity + currentEntityLen) ) {
hasEntity = true;
break;
} else {
if (p > currentEntity) {
++currentEntityPos;
} else {
break;
}
}
}
2016-02-18 16:18:42 -05:00
/// @todo ALC configurable maxSamePunctCount so we can tweak this as needed
const int maxSamePunctCount = 5;
char *lastEllipsis = NULL;
// assume filters out to the same # of chars
for ( ; p < pend; p += cs ) {
2013-08-02 16:12:24 -04:00
// get size
cs = getUtf8CharSize(p);
// skip entity
if ( hasEntity ) {
if (p >= currentEntity && p < (currentEntity + currentEntityLen)) {
if (p == currentEntity) {
*f++ = currentEntityChar;
lastSpace = false;
}
continue;
}
if (nextEntity && p >= nextEntity && p < (nextEntity + nextEntityLen)) {
if (p == nextEntity) {
*f++ = nextEntityChar;
lastSpace = false;
}
continue;
}
}
2016-01-20 07:32:13 -05:00
// skip unwanted character
if ( isUtf8UnwantedSymbols( p ) ) {
continue;
}
2016-02-18 16:18:42 -05:00
bool resetPunctCount = true;
if (is_punct_utf8(p) && !is_wspace_utf8(p)) {
2016-02-18 16:18:42 -05:00
if ( ( cs == lastPunctSize) && ( memcmp(lastPunct, p, cs) == 0 ) ) {
resetPunctCount = false;
++samePunctCount;
}
}
if ( resetPunctCount ) {
if (samePunctCount >= maxSamePunctCount) {
f -= (maxSamePunctCount);
bool addEllipsis = false;
if ( lastEllipsis ) {
// if all from f to last ellipsis are punctuation, skip to last ellipsis
for ( char *c = lastEllipsis + 1; c < f; ++c) {
if ( is_alnum_utf8( c ) ) {
2016-11-11 10:40:39 -05:00
logTrace(g_conf.m_logTracePos, "addEllipsis=true");
2016-02-18 16:18:42 -05:00
addEllipsis = true;
break;
}
}
if ( !addEllipsis ) {
f = lastEllipsis;
}
} else {
2016-11-11 10:32:11 -05:00
logTrace(g_conf.m_logTracePos, "addEllipsis=true");
2016-02-18 16:18:42 -05:00
addEllipsis = true;
}
if (addEllipsis) {
2016-11-11 10:32:11 -05:00
logTrace(g_conf.m_logTracePos, "addEllipsis");
2016-02-18 16:18:42 -05:00
if ( f != fstart && *(f - 1) != ' ' ) {
*f++ = ' ';
}
lastSpace = true;
2016-11-11 10:32:11 -05:00
memcpy ( f, "\342\200\246 ", 4 ); //horizontal ellipsis, code point 0x2026
2016-02-18 16:18:42 -05:00
f += 4;
lastEllipsis = f;
}
}
lastPunct = p;
lastPunctSize = cs;
samePunctCount = 0;
}
if ( samePunctCount >= maxSamePunctCount ) {
continue;
}
2013-08-02 16:12:24 -04:00
// do not count space if one before
if ( is_wspace_utf8 (p) ) {
if ( lastSpace ) {
continue;
}
2013-08-02 16:12:24 -04:00
lastSpace = true;
2016-01-20 07:32:13 -05:00
if ( fend - f > 1 ) {
lastBreakPrevChar = prevChar;
// don't store lastBreak if we have less than ellipsis length ' ...'
if ( fend - f > 4 ) {
lastBreak = f;
}
2016-01-20 07:32:13 -05:00
*f++ = ' ';
2016-01-20 07:32:13 -05:00
// counted as caps because we're detecting all caps for a sentence
++capCount;
2016-01-20 07:32:13 -05:00
dotCount = 0;
2016-01-20 07:32:13 -05:00
// we don't store space as dotPreviousChar because we want to strip ' ...' as well
} else {
trunc = true;
2013-08-02 16:12:24 -04:00
}
2013-08-02 16:12:24 -04:00
continue;
}
2016-01-20 07:32:13 -05:00
if ( fend - f > cs ) {
prevChar = f;
if ( cs == 1 ) {
// we only do it for ascii to avoid catering for different rules in different languages
// https://en.wikipedia.org/wiki/Letter_case#Exceptional_letters_and_digraphs
// eg:
// The Greek upper-case letter "Σ" has two different lower-case forms:
// "ς" in word-final position and "σ" elsewhere
if ( !is_alpha_a( *p ) || is_upper_a( *p ) ) {
// non-alpha is counted as caps as well because we're detecting all caps for a sentence
// and comma/quotes/etc. is included
++capCount;
}
2016-01-20 07:32:13 -05:00
// some sites try to be smart and truncate for us, let's remove that
// if if there are no space between dots and letter
if ( *p == '.' ) {
++dotCount;
} else {
dotCount = 0;
dotPrevChar = f;
2013-08-02 16:12:24 -04:00
}
2016-01-20 07:32:13 -05:00
*f++ = *p;
} else {
2016-01-20 07:32:13 -05:00
dotCount = 0;
dotPrevChar = f;
memcpy( f, p, cs );
2016-01-20 07:32:13 -05:00
f += cs;
}
2016-01-20 07:32:13 -05:00
} else {
trunc = true;
2013-08-02 16:12:24 -04:00
}
lastSpace = false;
}
}
/// @todo ALC simplify logic/break into smaller functions
2016-01-20 07:32:13 -05:00
/// @todo ALC configurable minCapCount so we can tweak this as needed
const int minCapCount = 5;
2016-01-20 07:32:13 -05:00
// only capitalize first letter in a word for a sentence with all caps
//TODO: assumes we want a us-centric title capitilization. There are other styles.
//FIXME: Assumes lowercasing a codepoint doesn't change its utf8-encoding length. This is not true (eg. Turkish U+0130 İ -> U+0069 i)
2016-01-20 07:32:13 -05:00
if ( capCount > minCapCount && capCount == ( f - fstart ) ) {
2016-11-11 10:40:39 -05:00
logTrace(g_conf.m_logTracePos, "all caps");
2016-01-20 07:32:13 -05:00
bool isFirstLetter = true;
2016-01-20 07:32:13 -05:00
unsigned char cs = 0;
for ( char *c = fstart; c < f; c += cs ) {
cs = getUtf8CharSize(c);
bool isAlpha = is_alpha_utf8( c );
if ( isAlpha ) {
if (isFirstLetter) {
isFirstLetter = false;
continue;
}
2016-01-20 07:32:13 -05:00
} else {
2016-06-09 08:51:52 -04:00
// some hard coded punctuation that we don't want to treat as first letter
// eg: Program's instead of Program'S
if ( cs == 1 && *c == '\'' ) {
isFirstLetter = false;
} else {
isFirstLetter = true;
}
2016-01-20 07:32:13 -05:00
continue;
}
2016-01-20 07:32:13 -05:00
if ( !isFirstLetter ) {
to_lower_utf8(c, c);
//TODO: do titlecase on the first letter - don't leave it as uppercase
}
}
2016-01-20 07:32:13 -05:00
}
2016-01-20 07:32:13 -05:00
/// @todo ALC configurable minRemoveEllipsisLen so we can tweak this as needed
const int minRemoveEllipsisLen = 90;
2016-01-20 07:32:13 -05:00
2016-11-11 07:51:55 -05:00
logTrace(g_conf.m_logTracePos, "len=%ld", (f - fstart));
2016-01-20 07:32:13 -05:00
// let's remove ellipsis (...) at the end
if ( (f - fstart) >= minRemoveEllipsisLen && dotCount == 3 ) {
2016-11-11 07:51:55 -05:00
logTrace(g_conf.m_logTracePos, "remove ellipsis");
if ( dotPrevChar ) {
if ( is_ascii3( *dotPrevChar ) ) {
logTrace(g_conf.m_logTracePos, "dotPrevChar=%c", *dotPrevChar);
switch ( *dotPrevChar ) {
case ',':
trunc = true;
lastBreak = dotPrevChar + 1;
break;
case '!':
case '.':
trunc = false;
f = dotPrevChar + 1;
break;
case ' ':
trunc = false;
if ( lastBreak ) {
f = lastBreak;
}
break;
default:
trunc = true;
if ( lastBreakPrevChar ) {
logTrace(g_conf.m_logTracePos, "lastBreakPrevChar=%c", *lastBreakPrevChar);
if ( is_ascii( *( lastBreakPrevChar ) ) ) {
switch ( *( lastBreakPrevChar ) ) {
case '!':
case '.':
trunc = false;
if (lastBreak) {
f = lastBreak;
}
break;
default:
break;
}
}
}
break;
}
}
} else {
trunc = true;
lastBreak = nullptr;
}
}
if ( trunc ) {
2016-11-11 07:51:55 -05:00
logTrace(g_conf.m_logTracePos, "trunc");
if ( lastBreak == NULL ) {
2016-11-11 07:51:55 -05:00
logTrace(g_conf.m_logTracePos, "END. Return 0");
2016-01-20 07:32:13 -05:00
return 0;
}
2016-01-20 07:32:13 -05:00
f = lastBreak;
2016-01-20 07:32:13 -05:00
/// @todo ALC we should cater ellipsis for different languages
if ( addEllipsis ) {
2016-11-11 07:51:55 -05:00
logTrace(g_conf.m_logTracePos, "addEllipsis");
if ( (fend - f) > 4 ) {
memcpy ( f, " \342\200\246", 4 ); //horizontal ellipsis, code point 0x2026
f += 4;
}
}
}
2016-01-20 07:32:13 -05:00
// NULL terminate f
*f = '\0';
2016-11-11 07:51:55 -05:00
int bytesStored = static_cast<int>(f - fstart);
logTrace(g_conf.m_logTracePos, "END. Return %d", bytesStored);
return bytesStored;
2016-01-20 07:32:13 -05:00
}
bool Pos::set(const TokenizerResult *tr, int32_t a, int32_t b) {
2016-01-20 07:32:13 -05:00
// free m_buf in case this is a second call
reset();
int32_t nw = tr->size();
2016-01-20 07:32:13 -05:00
// -1 is the default value
if ( b == -1 ) {
b = nw;
}
2016-01-20 07:32:13 -05:00
// alloc array if need to
int32_t need = (nw+1) * 4;
// do not destroy m_pos/m_numWords if only filtering into a buffer
m_needsFree = false;
m_buf = m_localBuf;
if ( need > POS_LOCALBUFSIZE ) {
m_buf = (char *)mmalloc(need,"Pos");
m_needsFree = true;
}
// bail on error
if ( ! m_buf ) {
return false;
}
m_bufSize = need;
m_pos = (int32_t *)m_buf;
// this is the CHARACTER count.
int32_t pos = 0;
// flag for stopping back-to-back spaces. only count those as one char.
bool lastSpace = false;
for ( int32_t i = a ; i < b ; i++ ) {
// set pos for the ith word to "pos"
m_pos[i] = pos;
nodeid_t tid = (*tr)[i].nodeid;
2016-01-20 07:32:13 -05:00
// is tag?
if ( tid ) {
2016-01-20 07:32:13 -05:00
// if not breaking, does nothing
if ( !g_nodes[tid & 0x7f].m_isBreaking ) {
2016-01-20 07:32:13 -05:00
continue;
}
// list tag? <li>
if ( tid == TAG_LI ) {
2016-01-20 07:32:13 -05:00
++pos;
lastSpace = false;
continue;
}
// if had a previous breaking tag and no non-tag
// word after it, do not count back-to-back spaces
if ( lastSpace ) {
continue;
}
// if had a br tag count it as a '. '
if ( tid ) { // <br>
2016-01-20 07:32:13 -05:00
pos += 2;
lastSpace = true;
continue;
}
// count as a single space
pos++;
// do not allow back-to-back spaces
lastSpace = true;
continue;
}
// scan through all chars discounting back-to-back spaces
const char *wp = (*tr)[i].token_start;
const char *pend = wp + (*tr)[i].token_len;
2016-01-20 07:32:13 -05:00
unsigned char cs = 0;
// assume filters out to the same # of chars
for ( const char *p = wp; p < pend; p += cs ) {
2016-01-20 07:32:13 -05:00
// get size
cs = getUtf8CharSize(p);
// do not count space if one before
if ( is_wspace_utf8 (p) ) {
if ( lastSpace ) {
continue;
}
lastSpace = true;
++pos;
continue;
}
++pos;
lastSpace = false;
}
}
// set pos for the END of the last word here
m_pos[nw] = pos;
2013-08-02 16:12:24 -04:00
return true;
}