#include "Summary.h"
|
|
|
|
#include "Sections.h"
|
|
#include "Query.h"
|
|
#include "Xml.h"
|
|
#include "Pos.h"
|
|
#include "Matches.h"
|
|
#include "Process.h"
|
|
#include "Conf.h"
|
|
#include "Url.h"
|
|
#include "Mem.h"
|
|
#include "Errno.h"
|
|
#include "Log.h"
|
|
#include "fctypes.h"
|
|
#include "GbMutex.h"
|
|
#include "ScopedLock.h"
|
|
#include "hash.h"
|
|
|
|
|
|
static void findCookieWarningLessSubrange(const TokenizerResult &tr, int start_word, int end_word, int *cookie_warning_less_start, int *cookie_warning_less_end);
static bool looksLikeCookieWarning(const TokenizerResult &tr, int start_word_, int end_word_);

//List of words/tokens that, when present in a query, signal that we should not filter out cookie-warning-looking text segments. So if you search for "oat meal cookie" you may
//get a document summary that includes the cookie warning.
static struct {
	const char *token;
	int64_t token_hash;
} query_cookie_words[] = {
	{ "cookie",0},    //many languages
	{ "cookies",0},   //many languages
	{ "kakor",0},     //swedish&norwegian
	{ "koekies",0},   //afrikaans
	{ "sīkfailus",0}, //latvian
	{ "küpsiseid",0}, //estonian
	{ "slapukus",0},  //lithuanian
	{ "kolačiće",0},  //croatian
};
static GbMutex mtx_query_cookie_words;
static bool query_cookie_words_initialized = false;

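// Lazily computes the 64-bit lowercase hash of each token above so it can be
// compared against query term ids. Thread-safe via mtx_query_cookie_words.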
static void initializeQueryCookieWords() {
	ScopedLock sl(mtx_query_cookie_words);
	if(!query_cookie_words_initialized) {
		for(auto &e : query_cookie_words) {
			e.token_hash = hash64Lower_utf8(e.token,strlen(e.token));
		}
		query_cookie_words_initialized = true;
	}
}

//does the query have any terms that indicate the user may be searching for cookies (the ones you eat)?
static bool queryHasCookieWords(const Query *q) {
	initializeQueryCookieWords();
	for(int i=0; i<q->getNumTerms(); i++) {
		for(const auto &e : query_cookie_words) {
			if(e.token_hash==q->getRawTermId(i))
				return true;
		}
	}
	return false;
}

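// For example, a query like "chocolate chip cookies" matches the "cookies"
// entry above, so cookie-warning excerpts are allowed into the summary.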
Summary::Summary()
	: m_summaryLen(0)
	, m_numExcerpts(0)
	, m_numDisplayLines(0)
	, m_displayLen(0)
	, m_maxNumCharsPerLine(0)
	, m_isSetFromTags(false)
	, m_q(NULL)
	, m_avoid_cookie_warnings(true)
	, m_wordWeights(NULL)
	, m_wordWeightSize(0)
	, m_buf4(NULL)
	, m_buf4Size(0) {
	// PVS-Studio
	m_summary[0] = '\0';
	memset(m_summaryExcerptLen, 0, sizeof(m_summaryExcerptLen));
	m_tmpWordWeightsBuf[0] = '\0';
	m_tmpBuf4[0] = '\0';
}

Summary::~Summary() {
	if ( m_wordWeights && m_wordWeights != (float *)m_tmpWordWeightsBuf ) {
		mfree ( m_wordWeights , m_wordWeightSize , "sumww");
		m_wordWeights = NULL;
	}

	if ( m_buf4 && m_buf4 != m_tmpBuf4 ) {
		mfree ( m_buf4 , m_buf4Size , "ssstkb" );
		m_buf4 = NULL;
	}
}

char* Summary::getSummary() {
	return m_summary;
}

const char *Summary::getSummary() const {
	return m_summary;
}

int32_t Summary::getSummaryDisplayLen() const {
	return m_displayLen;
}

int32_t Summary::getSummaryLen() const {
	return m_summaryLen;
}

bool Summary::isSetFromTags() const {
	return m_isSetFromTags;
}

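// Sanity-checks the summary candidate currently held in m_summary: rejects it
// if it merely repeats the title or still contains tell-tale html markup.
// On success the candidate is recorded as a single excerpt and true is returned.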
bool Summary::verifySummary(const char *titleBuf, int32_t titleBufLen) {
	logTrace(g_conf.m_logTraceSummary, "BEGIN");
	//logHexTrace(g_conf.m_logTraceSummary,m_summary,m_summaryLen,"summary candidate:");

	if ( m_summaryLen > 0 ) {
		// trim ellipsis
		if ( ( titleBufLen > 4 ) && ( memcmp( (titleBuf + titleBufLen - 4), " ...", 4 ) == 0 ) ) {
			titleBufLen -= 4;
		}

		// verify that it's not the same as the title
		if ( titleBufLen == m_summaryLen && strncasestr( m_summary, titleBuf, m_summaryLen, titleBufLen ) ) {
			m_summaryLen = 0;
			m_summary[0] = '\0';

			logTrace(g_conf.m_logTraceSummary, "END. Returning false");
			return false;
		}

		//Verify that it isn't html junk. Some pages do have html code such as <img... in their description which is most certainly an error.
		//maybeRemoveHtmlFormatting() has already taken care of single- and double-encoded html entities so if there is html in the summary
		//then it is already pure. Unless it was triple-encoded - but some pages just can't be helped.
		if(memchr(m_summary, '<', m_summaryLen)!=NULL) {
			//If the summary mentions "html" or "tag" then it might be a page about html in which case the summary could intentionally contain html tags
			if(memmem(m_summary,m_summaryLen,"html",4)==0 &&
			   memmem(m_summary,m_summaryLen,"HTML",4)==0 &&
			   memmem(m_summary,m_summaryLen,"tag",3)==0 &&
			   memmem(m_summary,m_summaryLen,"TAG",3)==0)
			{
				//Only detect it on a few tell-tale markers. Otherwise we could incorrectly reject summaries with equations, eg "n<p" which may look like a starting html tag
				if(memmem(m_summary,m_summaryLen,"<img",4)!=0 ||
				   memmem(m_summary,m_summaryLen,"<IMG",4)!=0 ||
				   memmem(m_summary,m_summaryLen,"<style",6)!=0 ||
				   memmem(m_summary,m_summaryLen,"<STYLE",6)!=0 ||
				   memmem(m_summary,m_summaryLen,"<title",6)!=0 ||
				   memmem(m_summary,m_summaryLen,"<TITLE",6)!=0)
				{
					logTrace(g_conf.m_logTraceSummary,"Summary looks like html junk. END. Returning false");
					return false;
				}
			}
		}

		m_summaryExcerptLen[0] = m_summaryLen;
		m_numExcerpts = 1;
		m_displayLen = m_summaryLen;

		logTrace(g_conf.m_logTraceSummary, "END. Returning true");
		return true;
	}

	logTrace(g_conf.m_logTraceSummary, "END. Returning false");
	return false;
}

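// Sets the summary directly from an already-built string, truncating to
// MAX_SUMMARY_LEN bytes and registering the result as a single excerpt.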
void Summary::setSummary(const std::string &summary) {
	m_summaryLen = summary.copy(m_summary, MAX_SUMMARY_LEN);
	m_summary[m_summaryLen] = '\0';
	m_summaryExcerptLen[0] = m_summaryLen;
	m_numExcerpts = 1;
	m_displayLen = m_summaryLen;
}

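// A minimal caller-side sketch (hypothetical call site and parameter values;
// the real caller lives elsewhere, e.g. in the XmlDoc code):
//
//   Summary s;
//   if ( !s.setSummaryFromTags( xml, maxSummaryLen, titleBuf, titleBufLen ) )
//       s.setSummary( xml, &tr, &sections, &pos, &q, maxSummaryLen,
//                     3 /*maxNumLines*/, 2 /*numDisplayLines*/,
//                     80 /*maxNumCharsPerLine*/, &url, &matches,
//                     titleBuf, titleBufLen );
//   const char *text = s.getSummary();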
// let's try to get a nicer summary by using what the website set as description
// Use the following in priority order (highest first)
//   - itemprop = "description"
//   - meta property = "og:description"
//   - meta name = "description"
//   - meta property = "description"
bool Summary::setSummaryFromTags( Xml *xml, unsigned maxSummaryLen, const char *titleBuf, int32_t titleBufLen ) {
	logTrace(g_conf.m_logTraceSummary, "BEGIN");

	// sanity check
	if ( maxSummaryLen >= MAX_SUMMARY_LEN ) {
		g_errno = EBUFTOOSMALL;

		logDebug(g_conf.m_logDebugSummary, "sum: summary too big to hold in buffer of %" PRId32" bytes.",(int32_t)MAX_SUMMARY_LEN);
		logTrace(g_conf.m_logTraceSummary, "END. maxSummaryLen[%d] >= MAX_SUMMARY_LEN[%d]. Returning false", maxSummaryLen, MAX_SUMMARY_LEN);

		return false;
	}

	/// @todo ALC configurable minSummaryLen so we can tweak this as needed
	const int minSummaryLen = (maxSummaryLen / 3);

	// itemprop = "description"
	if ( xml->getTagContent("itemprop", "description", m_summary, MAX_SUMMARY_LEN, minSummaryLen, maxSummaryLen, &m_summaryLen) ) {
		maybeRemoveHtmlFormatting();
		if ( verifySummary( titleBuf, titleBufLen ) ) {
			m_isSetFromTags = true;

			logDebug(g_conf.m_logDebugSummary, "sum: generated from itemprop description. summary='%.*s'", m_summaryLen, m_summary);
			logTrace(g_conf.m_logTraceSummary, "END. Generated from itemprop description. Returning true");

			return true;
		}
	}

	// meta property = "og:description"
	if ( xml->getTagContent("property", "og:description", m_summary, MAX_SUMMARY_LEN, minSummaryLen, maxSummaryLen, &m_summaryLen, true, TAG_META ) ) {
		maybeRemoveHtmlFormatting();
		if ( verifySummary( titleBuf, titleBufLen ) ) {
			m_isSetFromTags = true;

			logDebug(g_conf.m_logDebugSummary, "sum: generated from meta property og:description. summary='%.*s'", m_summaryLen, m_summary);
			logTrace(g_conf.m_logTraceSummary, "END. Generated from meta property og:description. Returning true");

			return true;
		}
	}

	// meta name = "description"
	if ( xml->getTagContent("name", "description", m_summary, MAX_SUMMARY_LEN, minSummaryLen, maxSummaryLen, &m_summaryLen, true, TAG_META ) ) {
		maybeRemoveHtmlFormatting();
		if ( verifySummary( titleBuf, titleBufLen ) ) {
			m_isSetFromTags = true;

			logDebug(g_conf.m_logDebugSummary, "sum: generated from meta name description. summary='%.*s'", m_summaryLen, m_summary);
			logTrace(g_conf.m_logTraceSummary, "END. Generated from meta name description. Returning true");

			return true;
		}
	}

	// meta property = "description"
	if ( xml->getTagContent("property", "description", m_summary, MAX_SUMMARY_LEN, minSummaryLen, maxSummaryLen, &m_summaryLen, true, TAG_META ) ) {
		maybeRemoveHtmlFormatting();
		if ( verifySummary( titleBuf, titleBufLen ) ) {
			m_isSetFromTags = true;

			logDebug(g_conf.m_logDebugSummary, "sum: generated from meta property description. summary='%.*s'", m_summaryLen, m_summary);
			logTrace(g_conf.m_logTraceSummary, "END. Generated from meta property description. Returning true");

			return true;
		}
	}

	logDebug(g_conf.m_logDebugSummary, "sum: unable to generate summary from itemprop/meta tags");
	logTrace(g_conf.m_logTraceSummary, "END. Unable to generate summary. Returning false");

	return false;
}

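// The match-driven path below is roughly:
//   for up to maxNumLines excerpts:
//     for every body/meta/rss match:
//       score the best window around it (getBestWindow)
//     append the highest-scoring window, mark its words D_USED,
//     and "retire" the query words it covered so later excerpts
//     favor query terms not yet shown.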
// returns false and sets g_errno on error
bool Summary::setSummary(const Xml *xml, const TokenizerResult *tr, const Sections *sections, Pos *pos, const Query *q, unsigned maxSummaryLen,
                         int32_t maxNumLines, int32_t numDisplayLines, int32_t maxNumCharsPerLine, const Url *f,
                         const Matches *matches, const char *titleBuf, int32_t titleBufLen) {
	logTrace(g_conf.m_logTraceSummary, "BEGIN");

	m_numDisplayLines = numDisplayLines;
	m_displayLen = 0;

	if(q)
		m_avoid_cookie_warnings = !queryHasCookieWords(q);

	if ( maxNumCharsPerLine < 10 ) {
		maxNumCharsPerLine = 10;
	}

	// . sanity check
	// . summary must fit in m_summary[]
	// . leave room for trailing \0
	if ( maxSummaryLen >= MAX_SUMMARY_LEN ) {
		g_errno = EBUFTOOSMALL;
		log(LOG_WARN, "query: Summary too big to hold in buffer of %" PRId32" bytes.",(int32_t)MAX_SUMMARY_LEN);
		logTrace(g_conf.m_logTraceSummary, "END. maxSummaryLen[%d] >= MAX_SUMMARY_LEN[%d]. Returning false", maxSummaryLen, MAX_SUMMARY_LEN);
		return false;
	}

	// do not overrun the final*[] buffers
	if ( maxNumLines > 256 ) {
		g_errno = EBUFTOOSMALL;
		log(LOG_WARN, "query: More than 256 summary lines requested.");
		logTrace(g_conf.m_logTraceSummary, "END. maxNumLines[%d] > 256. Returning false", maxNumLines);
		return false;
	}

	// Nothing to match...print beginning of content as summary
	if ( matches->getNumMatches() == 0 && maxNumLines > 0 ) {
		bool status = getDefaultSummary(xml, tr, sections, pos, maxSummaryLen);
		logTrace(g_conf.m_logTraceSummary, "END. getDefaultSummary. Returning %s", status ? "true" : "false");
		return status;
	}

	int32_t need1 = q->m_numWords * sizeof(float);
	m_wordWeightSize = need1;
	if ( need1 < 128 ) {
		m_wordWeights = (float *)m_tmpWordWeightsBuf;
	} else {
		m_wordWeights = (float *)mmalloc ( need1 , "wwsum" );
	}

	if ( ! m_wordWeights ) {
		logTrace(g_conf.m_logTraceSummary, "END. !m_wordWeights. Returning false");
		return false;
	}

	/// @todo ALC fix word weights
	/// non-working logic is removed in commit 5eacee9063861e859b54ec62035a600aa8af25df

	// . compute our word weights wrt each query. words which are more rare
	//   have a higher weight. We use this to weight the terms importance
	//   when generating the summary.
	// . used by the proximity algo
	// . used in setSummaryScores() for scoring summaries
	for ( int32_t i = 0 ; i < q->m_numWords; i++ ) {
		m_wordWeights[i] = 1.0;
	}

	// convenience
	m_maxNumCharsPerLine = maxNumCharsPerLine;
	m_q = q;

	// set the max excerpt len to the max summary excerpt len
	int32_t maxExcerptLen = m_maxNumCharsPerLine;

	int32_t lastNumFinal = 0;
	int32_t maxLoops = 1024;

	// if just computing absScore2...
	if ( maxNumLines <= 0 ) {
		logTrace(g_conf.m_logTraceSummary, "END. maxNumLines <= 0. Returning true");
		return true;
	}

	char *p = m_summary;
	char *pend = m_summary + maxSummaryLen;

	m_numExcerpts = 0;

	int32_t need2 = (1+1+1) * m_q->m_numWords;
	m_buf4Size = need2;
	if ( need2 < 128 ) {
		m_buf4 = m_tmpBuf4;
	} else {
		m_buf4 = (char *)mmalloc ( need2 , "stkbuf" );
	}

	if ( ! m_buf4 ) {
		logTrace(g_conf.m_logTraceSummary, "END. !m_buf4. Returning false");
		return false;
	}

	char *x = m_buf4;
	char *retired = x;
	x += m_q->m_numWords;
	char *maxGotIt = x;
	x += m_q->m_numWords;
	char *gotIt = x;

	// . the "maxGotIt" count vector accumulates into "retired"
	// . that is how we keep track of what query words we used for previous
	//   summary excerpts so we try to get diversified excerpts with
	//   different query terms/words in them
	//char retired [ MAX_QUERY_WORDS ];
	memset ( retired, 0, m_q->m_numWords * sizeof(char) );

	// some query words are already matched in the title
	for ( int32_t i = 0 ; i < m_q->m_numWords ; i++ ) {
		if ( matches->isTitleMatch(i) ) {
			retired [ i ] = 1;
		}
	}

	bool hadEllipsis = false;

	//
	// Loop over all words that match a query term. The matching words
	// could be from any one of the 3 Words arrays above. Find the
	// highest scoring window around each term. And then find the highest
	// of those over all the matching terms.
	//
	int32_t numFinal;
	for ( numFinal = 0; numFinal < maxNumLines; numFinal++ ) {
		if ( numFinal == m_numDisplayLines ) {
			m_displayLen = p - m_summary;
		}

		// reset these at the top of each loop
		const Match *maxm;
		int64_t maxScore = 0;
		int32_t maxa = 0;
		int32_t maxb = 0;
		int32_t maxi = -1;
		int32_t lasta = -1;

		if(lastNumFinal == numFinal) {
			if(maxLoops-- <= 0) {
				log(LOG_WARN, "query: got infinite loop bug, query is %s url is %s", m_q->originalQuery(), f->getUrl());
				break;
			}
		}
		lastNumFinal = numFinal;

		// loop through all the matches and see which is best
		for ( int32_t i = 0 ; i < matches->getNumMatches() ; i++ ) {
			int32_t a , b;
			// reset lasta if we changed words class
			if ( i > 0 && matches->getMatch(i-1).m_tr != matches->getMatch(i).m_tr ) {
				lasta = -1;
			}

			// only use matches in the body, meta tags or rss description
			mf_t flags = matches->getMatch(i).m_flags;

			bool skip = true;
			if ( flags & MF_METASUMM ) {
				skip = false;
			}
			if ( flags & MF_METADESC ) {
				skip = false;
			}
			if ( flags & MF_BODY ) {
				skip = false;
			}
			if ( flags & MF_RSSDESC ) {
				skip = false;
			}

			if ( skip ) {
				continue;
			}

			// ask him for the query words he matched
			//char gotIt [ MAX_QUERY_WORDS ];
			// clear it for him
			memset ( gotIt, 0, m_q->m_numWords * sizeof(char) );

			// . get score of best window around this match
			// . do not allow left post of window to be <= lasta to
			//   avoid repeating the same window.
			int64_t score = getBestWindow (matches, i, &lasta, &a, &b, gotIt, retired, maxExcerptLen);

			// USE THIS BUF BELOW TO DEBUG THE ABOVE CODE.
			// PRINTS OUT THE SUMMARY
			/*
			//if ( score >=12000 ) {
				char buf[10*1024];
				char *xp = buf;
				if ( i == 0 )
					log (LOG_WARN,"=-=-=-=-=-=-=-=-=-=-=-=-=-=-=");
				sprintf(xp, "score=%08" PRId32" a=%05" PRId32" b=%05" PRId32" ",
					(int32_t)score,(int32_t)a,(int32_t)b);
				xp += strlen(xp);
				for ( int32_t j = a; j < b; j++ ){
					//int32_t s = scores->m_scores[j];
					int32_t s = 0;
					if ( s < 0 ) continue;
					char e = 1;
					int32_t len = words->getWordLen(j);
					for(int32_t k=0;k<len;k +=e){
						char c = words->m_words[j][k];
						//if ( is_binary( c ) ) continue;
						*xp = c;
						xp++;
					}
					//p += strlen(p);
					if ( s == 0 ) continue;
					sprintf ( xp ,"(%" PRId32")",s);
					xp += strlen(xp);
				}
				log (LOG_WARN,"query: summary: %s", buf);
			//}
			*/

			// prints out the best window with the score
			/*
			char buf[MAX_SUMMARY_LEN];
			char *bufPtr = buf;
			char *bufPtrEnd = p + MAX_SUMMARY_LEN;
			if ( i == 0 )
				log (LOG_WARN,"=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=");
			int32_t len = 0;
			Words *ww = matches->m_matches[i].m_words;
			//Sections *ss = matches->m_matches[i].m_sections;
			//if ( ss->m_numSections <= 0 ) ss = NULL;
			//len=pos->filter(bufPtr, bufPtrEnd, ww, a, b, NULL);
			//log(LOG_WARN,"summary: %" PRId32") %s - %" PRId64,i,bufPtr,
			//score);
			log(LOG_WARN,"summary: %" PRId32") %s - %" PRId64,i,bufPtr,
			    score);
			*/

			if(m_avoid_cookie_warnings && looksLikeCookieWarning(*tr,a,b)) {
				logTrace(g_conf.m_logTraceSummary, "Summary looks like cookie warning");
				score = 0;
			}

			// skip if was in title or something
			if ( score <= 0 ) {
				continue;
			}

			// skip if not a winner
			if ( maxi >= 0 && score <= maxScore ) {
				continue;
			}

			// we got a new winner
			maxi = i;
			maxa = a;
			maxb = b;
			maxScore = score;

			// save this too
			memmove ( maxGotIt , gotIt , m_q->m_numWords );
		}

		// retire the query words in the winning summary

		// all done if no winner was made
		if ( maxi == -1 || maxa == -1 || maxb == -1) {
			break;
		}

		// who is the winning match?
		maxm = &matches->getMatch(maxi);
		const TokenizerResult *tr = maxm->m_tr;
		Bits *swbits = const_cast<Bits*>(maxm->m_bits); //const-cast because although the matches are const, their m_bits->*swbit* belong to us

		// this should be impossible
		if ( maxa > (int32_t)tr->size() || maxb > (int32_t)tr->size() ) {
			log ( LOG_WARN,"query: summary starts or ends after "
			      "document is over! maxa=%" PRId32" maxb=%" PRId32" nw=%zu",
			      maxa, maxb, tr->size() );
			maxa = tr->size() - 1;
			maxb = tr->size();
		}

		// assume we precede this excerpt with an ellipsis "…"
		bool needEllipsis = true;

		const char *c = (*tr)[maxa].token_start;

		// rule of thumb: don't use an ellipsis if the first character is a capital or a non-letter,
		// or if the excerpt starts a sentence,
		// or if we are already into the sample and the previous excerpt ended with an ellipsis.
		if ( !is_alpha_utf8(c) || is_upper_utf8(c) ||
		     (swbits->querySWBits(maxa) & D_STARTS_SENTENCE) ||
		     (p > m_summary && hadEllipsis)) {
			needEllipsis = false;
		}

		if ( needEllipsis ) {
			// break out if no room for "..."
			if ( p + 4 + 2 > pend ) {
				break;
			}

			// space first?
			if ( p > m_summary ) {
				*p++ = ' ';
			}

			memcpy ( p, "\342\200\246 ", 4 ); //horizontal ellipsis, code point 0x2026
			p += 4;
		}

		// separate summary excerpts with a single space.
		if ( p > m_summary ) {
			if ( p + 2 > pend ) {
				break;
			}

			*p++ = ' ';
		}

		// assume we need a trailing ellipsis
		needEllipsis = true;

		// so the next excerpt does not need a leading ellipsis if we
		// have one at the end of this excerpt
		hadEllipsis = needEllipsis;

		// start with quote?
		if ( (swbits->querySWBits(maxa) & D_IN_QUOTES) && p + 1 < pend ) {
			// precede with quote
			*p++ = '\"';
		}

		// . filter the words into p
		// . removes back to back spaces
		// . converts html entities
		// . filters in and stores words in the [a,b) interval
		int32_t len = pos->filter( tr, maxa, maxb, false, p, pend, xml->getVersion() );

		// break out if did not fit
		if ( len == 0 ) {
			break;
		}

		// don't consider it if it is a substring of the title
		if ( len == titleBufLen && strncasestr(titleBuf, p, titleBufLen, len) ) {
			// don't consider this one
			numFinal--;
			goto skip;
		}

		// don't consider it if the length wasn't anything nice
		if ( len < 5 ){
			numFinal--;
			goto skip;
		}

		// otherwise, keep going
		p += len;

		// now we just indicate which query terms we got
		for ( int32_t i = 0 ; i < m_q->m_numWords ; i++ ) {
			// do not breach
			if ( retired[i] >= 100 ) {
				continue;
			}
			retired [ i ] += maxGotIt [ i ];
		}

		// mark the words of the winning sample as used so we don't
		// pick them up again for a later excerpt
		for ( int32_t j = maxa ; j < maxb ; j++ ) {
			// mark it as used
			swbits->setSWBits(j,D_USED);
		}

		// if we ended on punct that can be paired across we need
		// to add an ellipsis
		if ( needEllipsis ) {
			if ( p + 4 + 2 > pend ) {
				break;
			}
			memcpy ( p, " \342\200\246", 4 ); //horizontal ellipsis, code point 0x2026
			p += 4;
		}

		// try to put in a small summary excerpt if we have at least
		// half of the normal excerpt length left
		if ( maxExcerptLen == m_maxNumCharsPerLine && len <= ( m_maxNumCharsPerLine / 2 + 1 ) ) {
			maxExcerptLen = m_maxNumCharsPerLine / 2;

			// don't count it in the finals since we try to get a small excerpt
			numFinal--;
		} else if ( m_numExcerpts < MAX_SUMMARY_EXCERPTS && m_numExcerpts >= 0 ) {
			m_summaryExcerptLen[m_numExcerpts] = p - m_summary;
			m_numExcerpts++;

			// also reset maxExcerptLen
			maxExcerptLen = m_maxNumCharsPerLine;
		}

	skip:
		// mark the words as used so they will not appear in other excerpts
		for ( int32_t j = maxa ; j < maxb ; j++ ) {
			// mark it
			swbits->setSWBits(j,D_USED);
		}
	}

	if ( numFinal <= m_numDisplayLines ) {
		m_displayLen = p - m_summary;
	}

	// free the mem we used if we allocated it
	if ( m_buf4 && m_buf4 != m_tmpBuf4 ) {
		mfree ( m_buf4 , m_buf4Size , "ssstkb" );
		m_buf4 = NULL;
	}

	// If we still didn't find a summary, get the default summary
	if ( p == m_summary ) {
		bool status = getDefaultSummary ( xml, tr, sections, pos, maxSummaryLen );
		if ( m_numDisplayLines > 0 ) {
			m_displayLen = m_summaryLen;
		}

		logTrace(g_conf.m_logTraceSummary, "END. Unable to get summary. getDefaultSummary. Returning %s", status ? "true" : "false");
		return status;
	}

	// we found a summary, so NULL-terminate it
	*p++ = '\0';

	// set length
	m_summaryLen = p - m_summary;

	if ( m_summaryLen > 50000 ) { g_process.shutdownAbort(true); }

	logTrace(g_conf.m_logTraceSummary, "END. Returning true");
	return true;
}

// . return the score of the highest-scoring window containing match #m
// . window is defined by the half-open interval [a,b) where a and b are
//   word #'s in the Words array indicated by match #m
// . return -1 and set g_errno on error
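// Scoring sketch (mirroring the code below): every alphanumeric word is worth
// 100 pts (halved inside parentheses/sup/lists, doubled in bold/italics); a
// matched query word is worth 100000 pts scaled by its word weight, heavily
// discounted when it repeats within this window or was already shown in an
// earlier excerpt; bonuses apply for starting a sentence or fragment, and
// penalties for short windows, crossed block tags, and embedded urls.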
int64_t Summary::getBestWindow(const Matches *matches, int32_t mm, int32_t *lasta,
                               int32_t *besta, int32_t *bestb, char *gotIt,
                               char *retired, int32_t maxExcerptLen) {
	logTrace(g_conf.m_logTraceSummary, "BEGIN");

	// get the window around match #mm
	const Match *m = &matches->getMatch(mm);

	// what is the word # of match #mm?
	int32_t matchWordNum = m->m_wordNum;

	// what Words/Pos/Bits classes is this match in?
	const TokenizerResult *tr = m->m_tr;
	Section **sp = NULL;
	int32_t *pos = m->m_pos->m_pos;

	// shortcut
	if ( m->m_sections ) {
		sp = m->m_sections->m_sectionPtrs;
	}

	int32_t nw = tr->size();

	// . sanity check
	// . this prevents a core i've seen
	if ( matchWordNum >= nw ) {
		log("summary: got overflow condition for q=%s",m_q->originalQuery());

		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;

		logTrace(g_conf.m_logTraceSummary, "END. matchWordNum[%d] >= nw[%d]. Returning 0", matchWordNum, nw);
		return 0;
	}

	// . we NULLify the section ptrs if we already used the word in another summary.
	int32_t badFlags = NOINDEXFLAGS|SEC_IN_TITLE;
	if ( (m->m_bits->querySWBits(matchWordNum) & D_USED) || ( sp && (sp[matchWordNum]->m_flags & badFlags) ) ) {
		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;

		logTrace(g_conf.m_logTraceSummary, "END. word is used/bad. Returning 0");
		return 0;
	}

	// . "a" is the left fence post of the window (it is a word # in Words)
	// . go to the left as far as we can
	// . thus we decrement "a"
	int32_t a = matchWordNum;

	// "posa" is the character position of the END of word #a
	int32_t posa = pos[a+1];
	int32_t firstFrag = -1;
	bool startOnQuote = false;
	bool goodStart = false;
	int32_t wordCount = 0;

	// . decrease "a" as long as we stay within maxNumCharsPerLine
	// . avoid duplicating windows by using "lasta", the last "a" of the
	//   previous call to getBestWindow(). This can happen if our last
	//   central query term was close to this one.
	for ( ; a > 0 && posa - pos[a-1] < maxExcerptLen && a > *lasta; a-- ) {
		// . don't include any "dead zone"
		// . dead zones have already been used for the summary, and
		//   we are getting a second/third/... excerpt here now
		// stop if it's the start of a sentence, too
		// stop before a title word
		if ( (m->m_bits->querySWBits(a-1) & D_USED) || (m->m_bits->querySWBits(a) & D_STARTS_SENTENCE) || ( m->m_bits->querySWBits(a-1) & D_IN_TITLE )) {
			goodStart = true;
			break;
		}

		// don't go beyond an LI, TR, P or DIV tag
		nodeid_t tid = (*tr)[a-1].nodeid;
		if ( tid == TAG_LI ||
		     tid == TAG_TR ||
		     tid == TAG_P  ||
		     tid == TAG_DIV ) {
			goodStart = true;
			break;
		}

		// stop if it's the start of a quoted sentence
		if ( a+1<nw && (m->m_bits->querySWBits(a+1) & D_IN_QUOTES) &&
		     (*tr)[a].token_start[0] == '\"' ){
			startOnQuote = true;
			goodStart = true;
			break;
		}

		// find out the first instance of a fragment (comma, etc)
		// watch out! because frag also means 's' in there's
		if ( ( m->m_bits->querySWBits(a) & D_STARTS_FRAGMENT ) && !(m->m_bits->querySWBits(a-1) & D_IS_STRONG_CONNECTOR) && firstFrag == -1 ) {
			firstFrag = a;
		}

		if ( (*tr)[a].is_alfanum ) {
			wordCount++;
		}
	}

	// if we didn't find a good start, then start at the start of the frag
	if ( !goodStart && firstFrag != -1 ) {
		a = firstFrag;
	}

	// don't let a punct or tag word start a line, unless it is a quote
	if ( a < matchWordNum && !(*tr)[a].is_alfanum && (*tr)[a].token_start[0] != '\"' ){
		while ( a < matchWordNum && !(*tr)[a].is_alfanum ) a++;

		// do not break right after a "strong connector", like
		// apostrophe
		while ( a < matchWordNum && a > 0 &&
		        ( m->m_bits->querySWBits(a-1) & D_IS_STRONG_CONNECTOR ) )
			a++;

		// don't let punct or tag word start a line
		while ( a < matchWordNum && !(*tr)[a].is_alfanum ) a++;
	}

	// remember, b is not included in the summary, the summary is [a,b-1]
	// remember to include all words in a matched phrase
	int32_t b = matchWordNum + m->m_numWords ;
	int32_t endQuoteWordNum = -1;
	int32_t numTagsCrossed = 0;

	for ( ; b <= nw; b++ ) {
		if ( b == nw ) {
			break;
		}

		if ( pos[b+1] - pos[a] >= maxExcerptLen ) {
			break;
		}

		if ( startOnQuote && (*tr)[b].token_start[0] == '\"' ) {
			endQuoteWordNum = b;
		}

		// don't include any dead zone, those are already-used samples
		if ( m->m_bits->querySWBits(b) & D_USED ) {
			break;
		}

		// stop on a title word
		if ( m->m_bits->querySWBits(b) & D_IN_TITLE ) {
			break;
		}

		if ( (*tr)[b].is_alfanum ) {
			wordCount++;
		}

		// don't go beyond an LI or TR backtag
		if ( (*tr)[b].nodeid == (BACKBIT|TAG_LI) ||
		     (*tr)[b].nodeid == (BACKBIT|TAG_TR) ) {
			numTagsCrossed++;

			// try to have at least 10 words in the summary
			if ( wordCount > 10 ) {
				break;
			}
		}

		// go beyond a P or DIV backtag in case the earlier char is a
		// ':'. This came from a special case for wikipedia pages
		// eg. http://en.wikipedia.org/wiki/Flyover
		if ( (*tr)[b].nodeid == (BACKBIT|TAG_P) ||
		     (*tr)[b].nodeid == (BACKBIT|TAG_DIV) ) {
			numTagsCrossed++;

			// try to have at least 10 words in the summary
			if ( wordCount > 10 && (*tr)[b-1].token_start[0] != ':' ) {
				break;
			}
		}
	}

	// don't end on a lot of punct words
	if ( b > matchWordNum && !(*tr)[b-1].is_alfanum) {
		// remove more than one punct word. if we're ending on a quote
		// keep it
		while ( b > matchWordNum && !(*tr)[b-2].is_alfanum && endQuoteWordNum != -1 && b > endQuoteWordNum ) {
			b--;
		}

		// do not break right after a "strong connector", like apostrophe
		while ( b > matchWordNum && (m->m_bits->querySWBits(b-2) & D_IS_STRONG_CONNECTOR) ) {
			b--;
		}
	}

	// make m_matches.m_matches[mi] the first match in our [a,b) window
	int32_t mi ;

	// . the match at the center of the window is match #"mm", so that
	//   matches->m_matches[mm] is the Match class
	// . set "mi" to it and back up "mi" as long as >= a
	for ( mi = mm ; mi > 0 && matches->getMatch(mi-1).m_wordNum >=a ; mi-- )
		;

	// now get the score of this excerpt. Also mark all the represented
	// query words. Mark the represented query words in the array that
	// comes to us. also mark how many times the same word is repeated in
	// this summary.
	int64_t score = 0LL;

	// is a url contained in the summary? that looks bad! punish!
	bool hasUrl = false;

	// the word count we did above was just approximate. count it right
	wordCount = 0;

	// for debug
	//char buf[5000];
	//char *xp = buf;
	SafeBuf xp;

	// wtf?
	if ( b > nw ) {
		b = nw;
	}

	// now score the whole window [a,b), including the central match
	for ( int32_t i = a ; i < b ; i++ ) {
		const auto &token = (*tr)[i];
		// debug print out
		if ( g_conf.m_logDebugSummary ) {
			int32_t len = token.token_len;
			char cs;
			for (int32_t k=0;k<len; k+=cs ) {
				const char *c = token.token_start+k;
				cs = getUtf8CharSize(c);
				if ( is_binary_utf8 ( c ) ) {
					continue;
				}
				xp.safeMemcpy ( c , cs );
				xp.nullTerm();
			}
		}

		// skip if in bad section, marquee, select, script, style
		if ( sp && (sp[i]->m_flags & badFlags) ) {
			continue;
		}

		// don't count purely numeric words
		if ( is_ascii_digit_string(token.token_start, token.token_end()) ) {
			continue;
		}

		// check if there is a url. best way is to check for '://'
		if ( !token.is_alfanum ) {
			if ( token.token_len == 3 && token.token_start[0] == ':' && token.token_start[1] == '/' && token.token_start[2] == '/' ) {
				hasUrl = true;
			}
		}

		// skip if not an alphanumeric word
		if ( ! token.is_alfanum ) {
			continue;
		}

		// just make every word 100 pts
		int32_t t = 100;

		// penalize it if in one of these sections
		if ( m->m_bits->querySWBits(i) & ( D_IN_PARENTHESES | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		// boost it if in bold or italics
		if ( m->m_bits->querySWBits(i) & D_IN_BOLDORITALICS ) {
			t *= 2;
		}

		// add the score for this word
		score += t;

		// print the score, "t"
		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf("(%" PRId32")",t);
		}

		// count the alpha words we got
		wordCount++;

		// if no matches left, skip
		if ( mi >= matches->getNumMatches() ) {
			continue;
		}

		// get the match
		const Match *next = &matches->getMatch(mi);

		// skip if not a match
		if ( i != next->m_wordNum ) {
			continue;
		}

		// must be a match in this class
		if ( next->m_tr != tr ) {
			continue;
		}

		// advance it
		mi++;

		// which query word # does it match?
		int32_t qwn = next->m_qwordNum;

		if ( qwn < 0 || qwn >= m_q->m_numWords ){g_process.shutdownAbort(true);}

		// undo old score
		score -= t;

		// add 100000 per match
		t = 100000;

		// weight based on tf, goes from 0.1 to 1.0
		t = (int32_t)((float)t * m_wordWeights [ qwn ]);

		// if it is a query stop word, make it 0 pts
		if ( m_q->m_qwords[qwn].m_isQueryStopWord ) {
			t = 0;//10000;
		}

		// penalize it if in one of these sections
		if ( m->m_bits->querySWBits(i) & ( D_IN_PARENTHESES | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		if ( gotIt[qwn] > 0 ) {
			// have we matched it in this [a,b) already?
			if ( gotIt[qwn] == 1 ) {
				t /= 15;
			} else {
				// if we have more than 2 matches in the same window,
				// it may not give a good summary. give a heavy penalty
				t -= 200000;
			}
		} else if ( retired [qwn] > 0 ) {
			// have we matched it already in a winning window?
			t /= 12;
		}

		// add it back
		score += t;

		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf ("[%" PRId32"]{qwn=%" PRId32",ww=%f}",t,qwn,
			               m_wordWeights[qwn]);
		}

		// inc the query word count for this window
		if ( gotIt[qwn] < 100 ) {
			gotIt[qwn]++;
		}
	}

	int32_t oldScore = score;

	// apply a bonus if the window starts a sentence or fragment.
	// only apply if the score is positive and the word count is decent.
	if ( score > 0 && wordCount > 7 ){
		// a match can give us 10k to 100k pts based on the tf weights
		// so we don't want to overwhelm that too much; make this an
		// 8k bonus if it starts a sentence
		if ( m->m_bits->querySWBits(a) & D_STARTS_SENTENCE ) {
			score += 8000;
		} else if ( m->m_bits->querySWBits(a) & D_STARTS_FRAGMENT ) {
			// likewise, a fragment, like after a comma
			score += 4000;
		}

		// 1k if the match word is very close to the start of a
		// sentence, let's say within 3 alphawords (~7 tokens)
		if ( matchWordNum - a < 7 ) {
			score += 1000;
		}
	}

	// a summary isn't really a summary if it's less than 7 words.
	// reduce the score, but still give it a decent score.
	// minus 20k.
	if ( wordCount < 7 ) {
		score -= 20000;
	}

	// summaries that cross a lot of tags are usually bad, penalize them
	if ( numTagsCrossed > 1 ) {
		score -= (numTagsCrossed * 20000);
	}

	if ( hasUrl ) {
		score -= 8000;
	}

	// show it
	logDebug(g_conf.m_logDebugSummary, "sum: score=%08" PRId32" prescore=%08" PRId32" a=%05" PRId32" b=%05" PRId32" %s",
	         (int32_t)score,oldScore,(int32_t)a,(int32_t)b, xp.getBufStart());

	// set lasta, besta, bestb
	*lasta = a;
	*besta = a;
	*bestb = b;

	logTrace(g_conf.m_logTraceSummary, "END. Returning %" PRId64, score);
	return score;
}

// get summary when no search terms could be found
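// Scans the tokenized document for the longest run of consecutive alphanumeric
// words outside links, titles and other bad sections, optionally skipping
// cookie warnings, and filters that run into m_summary.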
bool Summary::getDefaultSummary(const Xml *xml, const TokenizerResult *tr, const Sections *sections, Pos *pos, unsigned maxSummaryLen) {
	logTrace(g_conf.m_logTraceSummary, "BEGIN");

	char *p = m_summary;

	if (MAX_SUMMARY_LEN < maxSummaryLen) {
		maxSummaryLen = MAX_SUMMARY_LEN;
	}

	// null it out
	m_summaryLen = 0;

	bool inLink = false;
	char *pend = m_summary + maxSummaryLen - 2;
	int32_t start = -1, numConsecutive = 0;
	int32_t bestStart = -1;
	int32_t bestEnd = -1;
	int32_t longestConsecutive = 0;
	int32_t lastAlnum = -1;
	int32_t badFlags = NOINDEXFLAGS|SEC_IN_TITLE|SEC_IN_HEAD;

	// get the section ptr array 1-1 with the words, "sp"
	Section **sp = NULL;
	if ( sections ) {
		sp = sections->m_sectionPtrs;
	}

	for (size_t i = 0; i < tr->size(); i++) {
		const auto &token = (*tr)[i];
		// skip if in bad section
		if ( sp && (sp[i]->m_flags & badFlags) ) {
			continue;
		}

		if (start > 0 && bestStart == start &&
		    ( token.start_pos - (*tr)[start].start_pos ) >= ( maxSummaryLen - 8 ) )
		{
			longestConsecutive = numConsecutive;
			bestStart = start;
			bestEnd = lastAlnum;//i-1;
			break;
		}
		if (token.is_alfanum ) {
			if (!inLink) {
				numConsecutive++;
			}
			lastAlnum = i;
			if (start < 0) start = i;
			continue;
		}
		nodeid_t tid = token.nodeid & BACKBITCOMP;
		// did we get a tag?
		if ( tid ) {
			// ignore <p> tags
			if ( tid == TAG_P ) {
				continue;
			}

			// is it a front tag?
			if ( ! (token.nodeid & BACKBIT) ) {
				if ( tid == TAG_A ) {
					inLink = true;
				}
			}
			else {
				if ( tid == TAG_A ) {
					inLink = false;
				}
			}

			if ( ! isBreakingTagId(tid) )
				continue;
		} else if ( ! token.is_alfanum ) {
			continue;
		}

		// end of consecutive words
		if ( numConsecutive > longestConsecutive ) {
			int cookieWarningLessStart, cookieWarningLessEnd;
			if(m_avoid_cookie_warnings)
				findCookieWarningLessSubrange(*tr, start, i, &cookieWarningLessStart, &cookieWarningLessEnd);
			else {
				cookieWarningLessStart = start;
				cookieWarningLessEnd = i;
			}
			if(cookieWarningLessEnd-cookieWarningLessStart > longestConsecutive) {
				longestConsecutive = cookieWarningLessEnd-cookieWarningLessStart;
				bestStart = cookieWarningLessStart;
				bestEnd = cookieWarningLessEnd-1;
			}
		}
		start = -1;
		numConsecutive = 0;
	}

	if (bestStart >= 0 && bestEnd > bestStart){
		p += pos->filter( tr, bestStart, bestEnd, true, p, pend - 10, xml->getVersion() );

		// NULL terminate
		*p++ = '\0';

		// set length
		m_summaryLen = p - m_summary;

		if ( m_numDisplayLines > 0 ) {
			m_displayLen = m_summaryLen;
		}

		if ( m_summaryLen > 50000 ) { g_process.shutdownAbort(true); }

		logTrace(g_conf.m_logTraceSummary, "END. Returning true");
		return true;
	}

	logTrace(g_conf.m_logTraceSummary, "END. Returning true");
	return true;
}

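// Illustrative example: a meta description of "Great<br>deal &amp; more"
// comes out as "Great deal & more" after tag stripping and entity decoding.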
void Summary::maybeRemoveHtmlFormatting() {
	//Some websites have junk in their meta tags. Eg <br> in the meta description.
	//We don't fix all cases as that could hurt correctly written pages about how to write proper html. But
	//if they don't mention "html", "tag" nor "element" then we remove the most common offenders br/b/i/p.
	//When changing this function consider keeping in sync with XmlDoc_Indexing.cpp:possiblyDecodeHtmlEntitiesAgain()
	if(memmem(m_summary,m_summaryLen,"html",4)==0 &&
	   memmem(m_summary,m_summaryLen,"HTML",4)==0 &&
	   memmem(m_summary,m_summaryLen,"tag",3)==0 &&
	   memmem(m_summary,m_summaryLen,"Tag",3)==0 &&
	   memmem(m_summary,m_summaryLen,"element",7)==0 &&
	   memmem(m_summary,m_summaryLen,"Element",7)==0)
	{
		for(int i=0; i<m_summaryLen; ) {
			char *p = (char*)memchr(m_summary+i,'<',m_summaryLen-i);
			if(!p)
				break;
			i = p-m_summary;
			if(i+4<m_summaryLen) {
				if(memcmp(p,"<br>",4)==0) {
					memmove(m_summary+i,m_summary+i+3,m_summaryLen-i-3);
					m_summaryLen -= 3;
					m_summary[i] = ' ';
				} else if(memcmp(p,"<b>",3)==0) {
					memmove(m_summary+i,m_summary+i+3,m_summaryLen-i-3);
					m_summaryLen -= 3;
				} else if(memcmp(p,"<i>",3)==0) {
					memmove(m_summary+i,m_summary+i+3,m_summaryLen-i-3);
					m_summaryLen -= 3;
				} else if(memcmp(p,"<p>",3)==0) {
					memmove(m_summary+i,m_summary+i+2,m_summaryLen-i-2);
					m_summaryLen -= 2;
					m_summary[i] = ' ';
				} else
					i++;
			} else
				break;
		}

		//also remove any double-encoded / unnecessarily-encoded html entities.
		if(memchr(m_summary,'&',m_summaryLen)!=0) {
			StackBuf<1024> tmpBuf;
			if(tmpBuf.reserve(m_summaryLen + m_summaryLen/2 + 4)) {
				int32_t tmpLen = htmlDecode(tmpBuf.getBufStart(), m_summary,m_summaryLen, false);
				//in some really stupid and artificial cases the result may be longer.
				if(tmpLen<(int)sizeof(m_summary)) {
					memcpy(m_summary, tmpBuf.getBufStart(), tmpLen);
					m_summaryLen = tmpLen;
					m_summary[m_summaryLen] = '\0';
				}
			}
		}
	}
}

//Hardcoded detection of cookie warnings.
//Various sites use many, many different ways of showing cookie warnings. So instead of matching on section/span/div id or class, we go directly for text.
//Note: the cookie warning may not contain the word "cookie" because it can be called something else (eg. kakor in Swedish)

//This list is incomplete but it's a start.
static const char *cookie_warning_excerpt[] = {
	//English
	"website uses certain cookies",
	"We use cookies on our website",
	"Our site uses cookies",
	//German
	"Website benötigt JavaScript und Cookies",
	"Webseite verwendet Cookies",
	//Danish
	"Vi anvender cookies på",
	"Vores cookies hjælper os",
	"Vi bruger cookies for",
	"Brug af cookies",
	"Accepter cookies fra",
	"accepterer du brugen af cookies",
	"bruger cookies til",
	//Swedish
	"Det krävs att cookies är aktiverat",
	"Vi använder kakor(cookies)",
	"godkänner du att vi använder cookies",
	"använder vi cookies för",
	"använder vi oss av kakor",
	//Norwegian
	"vårt benytter cookies for",
	"Vi bruker informasjonskapsler (cookies)",
	//Polish
	"korzysta z plików cookies",
	//French
	"acceptez l'utilisation de cookies",
	//Spanish
	"Esta página utiliza cookies propias",
};
static const size_t cookie_warning_excerpts = sizeof(cookie_warning_excerpt)/sizeof(cookie_warning_excerpt[0]);

static TokenizerResult tr_cookie_warning_excerpts[cookie_warning_excerpts];
static size_t min_excerpt_token_count = 0;
static bool tr_cookie_warning_excerpts_initialized = false;
static GbMutex mtx_tr_cookie_excerpts;

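// Tokenizes each cookie-warning phrase once (mutex-guarded) and records the
// shortest phrase's token count so callers can bail out early on short ranges.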
static void initialize_tr_cookie_warning_excerpts() {
	ScopedLock sl(mtx_tr_cookie_excerpts);
	if(!tr_cookie_warning_excerpts_initialized) {
		min_excerpt_token_count = INT32_MAX;
		for(unsigned i=0; i<cookie_warning_excerpts; i++) {
			plain_tokenizer_phase_1(cookie_warning_excerpt[i], strlen(cookie_warning_excerpt[i]), &(tr_cookie_warning_excerpts[i]));
			calculate_tokens_hashes(&(tr_cookie_warning_excerpts[i]));
			min_excerpt_token_count = std::min(min_excerpt_token_count,tr_cookie_warning_excerpts[i].size());
		}
		tr_cookie_warning_excerpts_initialized = true;
	}
}

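// Returns true if the token range [start_word_, end_word_) contains any of the
// known cookie-warning phrases. Tokens are compared by hash; non-alphanumeric
// pattern tokens act as wildcards at their position.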
static bool looksLikeCookieWarning(const TokenizerResult &tr, int start_word_, int end_word_) {
	initialize_tr_cookie_warning_excerpts();
	if(end_word_<=start_word_ || start_word_<0 || end_word_<0)
		return false;
	unsigned start_word = (unsigned)start_word_;
	unsigned end_word = (unsigned)end_word_;
	while(start_word<end_word && !tr[start_word].is_alfanum)
		start_word++;
	while(end_word > start_word && !tr[end_word-1].is_alfanum)
		end_word--;
	if(end_word-start_word < min_excerpt_token_count)
		return false;

	for(unsigned i = 0; i<cookie_warning_excerpts; i++) {
		const auto &cookie_tokens = tr_cookie_warning_excerpts[i].tokens;
		if(cookie_tokens.size() > end_word-start_word)
			continue;
		for(unsigned j=start_word; j<=end_word-cookie_tokens.size(); j++) {
			bool match = true;
			for(unsigned k=0; k<cookie_tokens.size(); k++) {
				const auto &cookie_word = cookie_tokens[k];
				const auto &summary_word = tr[j+k];
				if(cookie_word.is_alfanum && cookie_word.token_hash!=summary_word.token_hash)
				{
					match = false;
					break;
				}
			}
			if(match)
				return true;
		}
	}
	return false;
}

static unsigned num_alfanum_tokens_in_range(const TokenizerResult &tr, unsigned start, unsigned end) {
	unsigned count = 0;
	for(unsigned i=start; i<end; i++)
		if(tr[i].is_alfanum)
			count++;
	return count;
}

//In range [start_word..end_word) find a subrange that doesn't contain a cookie warning.
//In theory we could do binary search, range scans, etc., but testing whether something is a cookie warning is not cheap. So instead we use much simpler logic:
//sweep forward in blocks delimited by </p> or </div> and keep the largest block that is free of cookie warnings.
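//For example: if a page opens with a cookie-consent <div> followed by article
//text, the first swept block trips looksLikeCookieWarning() and is skipped,
//while the article block after it becomes the returned subrange.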
static void findCookieWarningLessSubrange(const TokenizerResult &tr, int start_word, int end_word, int *cookie_warning_less_start, int *cookie_warning_less_end)
{
	if(!looksLikeCookieWarning(tr,start_word,end_word)) {
		*cookie_warning_less_start = start_word;
		*cookie_warning_less_end = end_word;
		return;
	}

	int min_alfanum_tokens = std::max(4,(end_word-start_word)/50);
	//find the first subrange that does not contain a cookie warning and has at least 4 (or 2%) alfanum tokens
	*cookie_warning_less_start = start_word;
	*cookie_warning_less_end = start_word;
	int start = start_word;
	int i = start_word;
	while(i<end_word) {
		//sweep to </p> or </div> or end
		while(i<end_word && tr[i].nodeid!=(TAG_P|BACKBIT) && tr[i].nodeid!=(TAG_DIV|BACKBIT))
			i++;
		if(!looksLikeCookieWarning(tr,start,i)) {
			//candidate [start..i)
			if(i-start >= min_alfanum_tokens) {
				int c = num_alfanum_tokens_in_range(tr,start,i);
				if(c>=min_alfanum_tokens) {
					if(c > *cookie_warning_less_end-*cookie_warning_less_start) {
						*cookie_warning_less_start = start;
						*cookie_warning_less_end = i;
					}
				}
			}
		} else {
			//cookie warning detected. If we already have a good subrange then break out of the loop and live with the result we have.
			if(*cookie_warning_less_start!=*cookie_warning_less_end)
				break;
			start = i+1;
		}
		i++;
	}
}