1797 lines
54 KiB
C++
1797 lines
54 KiB
C++
#include "Summary.h"
|
|
#include "Speller.h"
|
|
#include "Words.h"
|
|
//#include "AppendingWordsWindow.h"
|
|
#include "Sections.h"
|
|
|
|
// Constructs m_summaryLocs and m_summaryLocsPops over the m_summaryLocBuf and
// m_summaryLocPopsBuf members (room for MAX_SUMMARY_LOCS entries each), then
// puts the remaining members into their empty state via reset().
Summary::Summary()
	: m_summaryLocs     ( m_summaryLocBuf ,
			      MAX_SUMMARY_LOCS*sizeof(uint64_t) ) ,
	  m_summaryLocsPops ( m_summaryLocPopsBuf ,
			      MAX_SUMMARY_LOCS*sizeof(int32_t) ) {
	// reset() frees whatever these point to, so they have to be
	// cleared before it is called for the first time
	m_bitScoresBufSize = 0;
	m_bitScoresBuf     = NULL;
	m_wordWeights      = NULL;
	m_buf4             = NULL;
	reset();
}
|
|
|
|
// Frees any buffers this object still owns; all of that logic is in reset().
Summary::~Summary() {
	reset();
}
|
|
|
|
void Summary::reset() {
|
|
//if ( m_buf && m_freeBuf )
|
|
// mfree ( m_buf, m_bufMaxLen, "Summary" );
|
|
if ( m_bitScoresBuf ){
|
|
mfree ( m_bitScoresBuf, m_bitScoresBufSize,
|
|
"SummaryBitScore" );
|
|
m_bitScoresBuf = NULL;
|
|
m_bitScoresBufSize = 0;
|
|
}
|
|
m_summaryLen = 0;
|
|
m_displayLen = 0;
|
|
//m_bufMaxLen = 0;
|
|
//m_bufLen = 0;
|
|
//m_buf = NULL;
|
|
m_isNormalized = false;
|
|
//m_freeBuf = true;
|
|
m_numExcerpts = 0;
|
|
m_summaryLocs.reset();
|
|
m_summaryLocsPops.reset();
|
|
if ( m_wordWeights && m_wordWeights != (float *)m_tmpBuf ) {
|
|
mfree ( m_wordWeights , m_wordWeightSize , "sumww");
|
|
m_wordWeights = NULL;
|
|
}
|
|
m_wordWeights = NULL;
|
|
if ( m_buf4 && m_buf4 != m_tmpBuf4 ) {
|
|
mfree ( m_buf4 , m_buf4Size , "ssstkb" );
|
|
m_buf4 = NULL;
|
|
}
|
|
}
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////
|
|
//
|
|
// THE NEW SUMMARY GENERATOR
|
|
//
|
|
//////////////////////////////////////////////////////////////////
|
|
|
|
// returns false and sets g_errno on error
|
|
bool Summary::set2 ( Xml *xml ,
|
|
Words *words ,
|
|
Bits *bits ,
|
|
Sections *sections ,
|
|
Pos *pos ,
|
|
Query *q ,
|
|
int64_t *termFreqs ,
|
|
float *affWeights , // 1-1 with qterms
|
|
//char *coll ,
|
|
//int32_t collLen ,
|
|
bool doStemming ,
|
|
int32_t maxSummaryLen ,
|
|
int32_t maxNumLines ,
|
|
int32_t numDisplayLines ,
|
|
int32_t maxNumCharsPerLine ,
|
|
//int32_t bigSampleRadius ,
|
|
//int32_t bigSampleMaxLen ,
|
|
bool ratInSummary ,
|
|
//TitleRec *tr ,
|
|
Url *f ,
|
|
//bool allowPunctInPhrase ,
|
|
//bool excludeLinkText ,
|
|
//bool excludeMetaText ,
|
|
//bool hackFixWords ,
|
|
//bool hackFixPhrases ,
|
|
//float *queryProximityScore,
|
|
Matches *matches ,
|
|
char *titleBuf ,
|
|
int32_t titleBufLen ) {
|
|
|
|
//m_proximityScore = -1;
|
|
|
|
// pointless, possibly caller in Msg20 is just interested in
|
|
// Msg20Request::m_computeLinkInfo or m_setLinkInfo. NO! we need
|
|
// to see if it has all the query terms...
|
|
//if ( maxNumLines <= 0 ) return true;
|
|
|
|
m_numDisplayLines = numDisplayLines;
|
|
m_displayLen = 0;
|
|
|
|
//m_useDateLists = useDateLists;
|
|
//m_exclDateList = exclDateList;
|
|
//m_begPubDateList = begPubDateList;
|
|
//m_endPubDateList = endPubDateList;
|
|
//m_diversity = 1.0;
|
|
// int64_t start = gettimeofdayInMilliseconds();
|
|
// assume we got maxnumlines of summary
|
|
if ( (maxNumCharsPerLine+6)*maxNumLines > maxSummaryLen ) {
|
|
//maxNumCharsPerLine = (maxSummaryLen-10)/maxNumLines;
|
|
if ( maxNumCharsPerLine < 10 ) maxNumCharsPerLine = 10;
|
|
static char s_flag = 1;
|
|
if ( s_flag ) {
|
|
s_flag = 0;
|
|
log("query: Warning. "
|
|
"Max summary excerpt length decreased to "
|
|
"%" INT32 " chars because max summary excerpts and "
|
|
"max summary length are too big.",
|
|
maxNumCharsPerLine);
|
|
}
|
|
}
|
|
|
|
// . sanity check
|
|
// . summary must fit in m_summary[]
|
|
// . leave room for tailing \0
|
|
if ( maxSummaryLen >= MAX_SUMMARY_LEN ) {
|
|
g_errno = EBUFTOOSMALL;
|
|
return log("query: Summary too big to hold in buffer of %" INT32 " "
|
|
"bytes.",(int32_t)MAX_SUMMARY_LEN);
|
|
}
|
|
|
|
// . hash query word ids into a small hash table
|
|
// . we use this to see what words in the document are query terms
|
|
//int32_t qscores [ MAX_QUERY_TERMS ];
|
|
|
|
// and if we found each query term or not
|
|
//int32_t nt = q->getNumNonFieldedSingletonTerms();
|
|
//int32_t nqt = q->getNumTerms();
|
|
|
|
// do not overrun the final*[] buffers
|
|
if ( maxNumLines > 256 ) {
|
|
g_errno = EBUFTOOSMALL;
|
|
return log("query: More than 256 summary lines requested.");
|
|
}
|
|
|
|
// . MORE BIG HACK
|
|
// . since we're working with fielded query terms we must check BIG
|
|
// HACK here in case the fielded query term is the ONLY query
|
|
// term.
|
|
// . LOGIC MOVED INTO MATCHES.CPP
|
|
|
|
// Nothing to match...print beginning of content as summary
|
|
if ( matches->m_numMatches == 0 && maxNumLines > 0 )
|
|
return getDefaultSummary ( xml,
|
|
words,
|
|
sections, // scores,
|
|
pos,
|
|
//bigSampleRadius,
|
|
maxSummaryLen );
|
|
|
|
/*int64_t end = gettimeofdayInMilliseconds();
|
|
if ( end - start > 2 )
|
|
log ( LOG_WARN,"summary: took %" INT64 " ms to finish big hack",
|
|
end - start );
|
|
start = gettimeofdayInMilliseconds();*/
|
|
//
|
|
int32_t need1 = q->m_numWords * sizeof(float);
|
|
m_wordWeightSize = need1;
|
|
if ( need1 < 128 )
|
|
m_wordWeights = (float *)m_tmpBuf;
|
|
else
|
|
m_wordWeights = (float *)mmalloc ( need1 , "wwsum" );
|
|
if ( ! m_wordWeights ) return false;
|
|
|
|
|
|
|
|
// zero out all word weights
|
|
for ( int32_t i = 0 ; i < q->m_numWords; i++ )
|
|
m_wordWeights[i] = 0.0;
|
|
|
|
// query terms
|
|
int32_t numTerms = q->getNumTerms();
|
|
|
|
// . compute our word weights wrt each query. words which are more rare
|
|
// have a higher weight. We use this to weight the terms importance
|
|
// when generating the summary.
|
|
// . used by the proximity algo
|
|
// . used in setSummaryScores() for scoring summaries
|
|
if ( termFreqs && q->m_numWords > 1 ) {
|
|
float maxTermFreq = 0;
|
|
for ( int32_t i = 0 ; i < numTerms ; i++ ) {
|
|
// www.abc.com --> treat www.abc as same term freq
|
|
// 'www.infonavit.gob.mx do de carne? mxa'
|
|
//if(q->m_qterms[i].m_isPhrase) continue;
|
|
if(termFreqs[i] > maxTermFreq)
|
|
maxTermFreq = termFreqs[i];
|
|
}
|
|
maxTermFreq++; //don't div by 0!
|
|
|
|
for ( int32_t i = 0 ; i < numTerms ; i++ ) {
|
|
//if(q->m_qterms[i].m_isPhrase) continue;
|
|
// if this is a phrase the other words following
|
|
// the first word will have a word weight of 0
|
|
// so should be ignored for that...
|
|
int32_t ndx = q->m_qterms[i].m_qword - q->m_qwords;
|
|
// oh it is already complemented up here
|
|
m_wordWeights[ndx] = 1.0 -
|
|
((float)termFreqs[i] / maxTermFreq);
|
|
//make sure everything has a little weight:
|
|
if(m_wordWeights[ndx] < .10) m_wordWeights[ndx] = .10;
|
|
//log(LOG_WARN,
|
|
//"query word num %" INT32 " termnum %" INT32 " freq %f max %f",
|
|
//ndx,i,m_wordWeights[ndx],maxTermFreq);
|
|
}
|
|
}
|
|
else {
|
|
for ( int32_t i = 0 ; i < q->m_numWords; i++ )
|
|
m_wordWeights[i] = 1.0;
|
|
}
|
|
|
|
if ( g_conf.m_logDebugSummary ) {
|
|
for ( int32_t i = 0 ; i < q->m_numWords; i++ ) {
|
|
int64_t tf = -1;
|
|
if ( termFreqs ) tf = termFreqs[i];
|
|
log("sum: u=%s wordWeights[%" INT32 "]=%f tf=%" INT64 "",
|
|
f->m_url,i,m_wordWeights[i],tf);
|
|
}
|
|
}
|
|
|
|
// convenience
|
|
m_maxNumCharsPerLine = maxNumCharsPerLine;
|
|
//m_qscores = qscores;
|
|
m_q = q;
|
|
|
|
//m_proximityScore = 0;
|
|
|
|
bool hadEllipsis = false;
|
|
|
|
// set the max excerpt len to the max summary excerpt len
|
|
int32_t maxExcerptLen = m_maxNumCharsPerLine;
|
|
|
|
int32_t lastNumFinal = 0;
|
|
int32_t maxLoops = 1024;
|
|
char *p, *pend;
|
|
|
|
// if just computing absScore2...
|
|
if ( maxNumLines <= 0 )//&& bigSampleRadius <= 0 )
|
|
return true;//return matches->m_hasAllQueryTerms;
|
|
|
|
p = m_summary;
|
|
pend = m_summary + maxSummaryLen;
|
|
m_numExcerpts = 0;
|
|
|
|
int32_t need2 = (1+1+1) * m_q->m_numWords;
|
|
m_buf4Size = need2;
|
|
if ( need2 < 128 )
|
|
m_buf4 = m_tmpBuf4;
|
|
else
|
|
m_buf4 = (char *)mmalloc ( need2 , "stkbuf" );
|
|
if ( ! m_buf4 ) return false;
|
|
char *x = m_buf4;
|
|
char *retired = x;
|
|
x += m_q->m_numWords;
|
|
char *maxGotIt = x;
|
|
x += m_q->m_numWords;
|
|
char *gotIt = x;
|
|
|
|
// . the "maxGotIt" count vector accumulates into "retired"
|
|
// . that is how we keep track of what query words we used for previous
|
|
// summary excerpts so we try to get diversified excerpts with
|
|
// different query terms/words in them
|
|
//char retired [ MAX_QUERY_WORDS ];
|
|
memset ( retired, 0, m_q->m_numWords * sizeof(char) );
|
|
|
|
// some query words are already matched in the title
|
|
for ( int32_t i = 0 ; i < m_q->m_numWords ; i++ )
|
|
if ( matches->m_qwordFlags[i] & MF_TITLEGEN )
|
|
retired [ i ] = 1;
|
|
|
|
//
|
|
// Loop over all words that match a query term. The matching words
|
|
// could be from any one of the 3 Words arrays above. Find the
|
|
// highest scoring window around each term. And then find the highest
|
|
// of those over all the matching terms.
|
|
//
|
|
int32_t numFinal;
|
|
for ( numFinal = 0; numFinal < maxNumLines; numFinal++ ){
|
|
|
|
if ( numFinal == m_numDisplayLines )
|
|
m_displayLen = p - m_summary;
|
|
|
|
// reset these at the top of each loop
|
|
Match *maxm;
|
|
int64_t maxScore = 0;
|
|
int32_t maxa = 0;
|
|
int32_t maxb = 0;
|
|
int32_t maxi = -1;
|
|
int32_t lasta = -1;
|
|
//char maxGotIt [ MAX_QUERY_WORDS ];
|
|
|
|
if(lastNumFinal == numFinal) {
|
|
if(maxLoops-- <= 0) {
|
|
log(LOG_WARN, "query: got infinite loop "
|
|
"bug, query is %s url is %s",
|
|
m_q->m_orig,
|
|
f->getUrl());
|
|
break;
|
|
}
|
|
}
|
|
lastNumFinal = numFinal;
|
|
// int64_t stget = gettimeofdayInMilliseconds();
|
|
// does the max that we found have a new query word that was
|
|
// not already in the summary?
|
|
//int32_t maxFoundNew = 0;
|
|
// loop through all the matches and see which is best
|
|
for ( int32_t i = 0 ; i < matches->m_numMatches ; i++ ) {
|
|
int32_t a , b;
|
|
// reset lasta if we changed words class
|
|
if ( i > 0 && matches->m_matches[i-1].m_words !=
|
|
matches->m_matches[i].m_words )
|
|
lasta = -1;
|
|
|
|
// only use matches in title, etc.
|
|
mf_t flags = matches->m_matches[i].m_flags;
|
|
|
|
bool skip = true;
|
|
if ( flags & MF_METASUMM ) skip = false;
|
|
if ( flags & MF_METADESC ) skip = false;
|
|
if ( flags & MF_BODY ) skip = false;
|
|
if ( flags & MF_DMOZSUMM ) skip = false;
|
|
if ( flags & MF_RSSDESC ) skip = false;
|
|
if ( skip ) continue;
|
|
|
|
// ask him for the query words he matched
|
|
//char gotIt [ MAX_QUERY_WORDS ];
|
|
// clear it for him
|
|
memset ( gotIt, 0, m_q->m_numWords * sizeof(char) );
|
|
|
|
// . get score of best window around this match
|
|
// . do not allow left post of window to be <= lasta to
|
|
// avoid repeating the same window.
|
|
int64_t score = getBestWindow (matches,
|
|
i,
|
|
&lasta,
|
|
&a, &b,
|
|
gotIt ,
|
|
retired ,
|
|
maxExcerptLen);
|
|
|
|
// USE THIS BUF BELOW TO DEBUG THE ABOVE CODE.
|
|
// PRINTS OUT THE SUMMARY
|
|
/*
|
|
//if ( score >=12000 ) {
|
|
char buf[10*1024];
|
|
char *xp = buf;
|
|
if ( i == 0 )
|
|
log (LOG_WARN,"=-=-=-=-=-=-=-=-=-=-=-=-=-=-=");
|
|
sprintf(xp, "score=%08" INT32 " a=%05" INT32 " b=%05" INT32 " ",
|
|
(int32_t)score,(int32_t)a,(int32_t)b);
|
|
xp += gbstrlen(xp);
|
|
for ( int32_t j = a; j < b; j++ ){
|
|
//int32_t s = scores->m_scores[j];
|
|
int32_t s = 0;
|
|
if ( s < 0 ) continue;
|
|
char e = 1;
|
|
int32_t len = words->getWordLen(j);
|
|
for(int32_t k=0;k<len;k +=e){
|
|
char c = words->m_words[j][k];
|
|
//if ( is_binary( c ) ) continue;
|
|
*xp = c;
|
|
xp++;
|
|
}
|
|
//p += gbstrlen(p);
|
|
if ( s == 0 ) continue;
|
|
sprintf ( xp ,"(%" INT32 ")",s);
|
|
xp += gbstrlen(xp);
|
|
}
|
|
log (LOG_WARN,"query: summary: %s", buf);
|
|
//}
|
|
*/
|
|
|
|
// prints out the best window with the score
|
|
/*
|
|
char buf[MAX_SUMMARY_LEN];
|
|
char *bufPtr = buf;
|
|
char *bufPtrEnd = p + MAX_SUMMARY_LEN;
|
|
if ( i == 0 )
|
|
log (LOG_WARN,"=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=");
|
|
int32_t len = 0;
|
|
Words *ww = matches->m_matches[i].m_words;
|
|
//Sections *ss = matches->m_matches[i].m_sections;
|
|
//if ( ss->m_numSections <= 0 ) ss = NULL;
|
|
//len=pos->filter(bufPtr, bufPtrEnd, ww, a, b, NULL);
|
|
//log(LOG_WARN,"summary: %" INT32 ") %s - %" INT64 "",i,bufPtr,
|
|
//score);
|
|
log(LOG_WARN,"summary: %" INT32 ") %s - %" INT64 "",i,bufPtr,
|
|
score);
|
|
*/
|
|
|
|
// skip if was in title or something
|
|
if ( score <= 0 ) continue;
|
|
// skip if not a winner
|
|
if ( maxi >= 0 && score <= maxScore ) continue;
|
|
|
|
// we got a new winner
|
|
maxi = i;
|
|
maxa = a;
|
|
maxb = b;
|
|
maxScore = score;
|
|
// save this too
|
|
gbmemcpy ( maxGotIt , gotIt , m_q->m_numWords );
|
|
|
|
}
|
|
|
|
// retire the query words in the winning summary
|
|
|
|
|
|
//log( LOG_WARN,"summary: took %" INT64 " ms to finish getbestwindo",
|
|
// gettimeofdayInMilliseconds() - stget );
|
|
|
|
|
|
// all done if no winner was made
|
|
if ( maxi == -1 ) break;
|
|
|
|
// sanity check
|
|
//if ( maxa == -1 || maxb == -1 ) { char *xx = NULL; *xx = 0; }
|
|
if ( maxa == -1 ) break;
|
|
if ( maxb == -1 ) break;
|
|
|
|
// who is the winning match?
|
|
maxm = &matches->m_matches[maxi];
|
|
Words *ww = maxm->m_words;
|
|
Sections *ss = maxm->m_sections;
|
|
// we now use "m_swbits" for the summary bits since they are
|
|
// of size sizeof(swbit_t), a int16_t at this point
|
|
swbit_t *bb = maxm->m_bits->m_swbits;
|
|
|
|
// this should be impossible
|
|
if ( maxa > ww->m_numWords || maxb > ww->m_numWords ){
|
|
log ( LOG_WARN,"query: summary starts or ends after "
|
|
"document is over! maxa=%" INT32 " maxb=%" INT32 " nw=%" INT32 "",
|
|
maxa, maxb, ww->m_numWords );
|
|
maxa = ww->m_numWords - 1;
|
|
maxb = ww->m_numWords;
|
|
//char *xx = NULL; *xx = 0;
|
|
}
|
|
|
|
// assume we do not preceed with ellipsis "..."
|
|
bool needEllipsis = true;
|
|
|
|
// rule of thumb, don't use ellipsis if the first letter is
|
|
// capital, or a non letter
|
|
char *c = ww->m_words[maxa]+0;
|
|
if ( ! is_alpha_utf8(c) ) needEllipsis = false;
|
|
else if ( is_upper_utf8(c) ) needEllipsis = false;
|
|
|
|
// is punct word before us pair acrossable? if so then we
|
|
// probably are not the start of a sentence.
|
|
if ( bb[maxa] & D_STARTS_SENTENCE ) needEllipsis = false;
|
|
|
|
// or if into the sample and previous excerpt had an ellipsis
|
|
// do not bother using one for us.
|
|
if ( p > m_summary && hadEllipsis ) needEllipsis = false;
|
|
|
|
if ( needEllipsis ) {
|
|
// break out if no room for "..."
|
|
//int32_t elen;
|
|
if ( p + 4 + 2 > pend ) break;
|
|
// space first?
|
|
if ( p > m_summary ) *p++ = ' ';
|
|
gbmemcpy ( p , "... " , 4 );
|
|
p += 4;
|
|
}
|
|
|
|
// separate summary excerpts with a single space.
|
|
if ( p > m_summary ) {
|
|
if ( p + 2 > pend ) break;
|
|
*p++ = ' ';
|
|
}
|
|
|
|
// assume we need a trailing ellipsis
|
|
needEllipsis = true;
|
|
|
|
// so next excerpt does not need to have an ellipsis if we
|
|
// have one at the end of this excerpt
|
|
hadEllipsis = needEllipsis;
|
|
|
|
// start with quote?
|
|
if ( (bb[maxa] & D_IN_QUOTES) && p + 1 < pend ) {
|
|
// preceed with quote
|
|
*p++ = '\"';
|
|
}
|
|
|
|
// . filter the words into p
|
|
// . removes back to back spaces
|
|
// . converts html entities
|
|
// . filters in stores words in [a,b) interval
|
|
int32_t len = pos->filter(p, pend, ww, maxa, maxb, ss);
|
|
|
|
// break out if did not fit
|
|
if ( len == 0 ) break;
|
|
// don't consider it if it is a substring of the title
|
|
if ( len == titleBufLen &&
|
|
strncasestr(titleBuf, p, titleBufLen, len) ) {
|
|
// don't consider this one
|
|
numFinal--;
|
|
goto skip;
|
|
}
|
|
|
|
// don't consider it if the length wasn't anything nice
|
|
if ( len < 5 ){
|
|
numFinal--;
|
|
goto skip;
|
|
}
|
|
|
|
// otherwise, keep going
|
|
p += len;
|
|
|
|
// now we just indicate which query terms we got
|
|
for ( int32_t i = 0 ; i < m_q->m_numWords ; i++ ) {
|
|
// do not breach
|
|
if ( retired[i] >= 100 ) continue;
|
|
retired [ i ] += maxGotIt [ i ];
|
|
}
|
|
|
|
// add all the scores of the excerpts to the doc summary score.
|
|
// zero out scores of the winning sample so we don't get them
|
|
// again. use negative one billion to ensure that we don't get
|
|
// them again
|
|
for ( int32_t j = maxa ; j < maxb ; j++ )
|
|
// mark it as used
|
|
bb[j] |= D_USED;
|
|
|
|
// if we ended on punct that can be paired across we need
|
|
// to add an ellipsis
|
|
if ( needEllipsis ) {
|
|
if ( p + 4 + 2 > pend ) break;
|
|
gbmemcpy ( p , " ..." , 4 );
|
|
p += 4;
|
|
}
|
|
|
|
// try to put in a small summary excerpt if we have at least
|
|
// half of the normal excerpt length left
|
|
if ( maxExcerptLen == m_maxNumCharsPerLine &&
|
|
//pos->m_pos[maxb] - pos->m_pos[maxa]
|
|
len <= ( m_maxNumCharsPerLine / 2 + 1 ) ){
|
|
maxExcerptLen = m_maxNumCharsPerLine / 2;
|
|
// don't count it in the finals since we try to get a
|
|
// small excerpt
|
|
numFinal--;
|
|
}
|
|
else if ( m_numExcerpts < MAX_SUMMARY_EXCERPTS &&
|
|
m_numExcerpts >= 0 ) {
|
|
m_summaryExcerptLen[m_numExcerpts] = p - m_summary;
|
|
m_numExcerpts++;
|
|
// also reset maxExcerptLen
|
|
maxExcerptLen = m_maxNumCharsPerLine;
|
|
}
|
|
|
|
skip:
|
|
// zero out the scores so they will not be used in others
|
|
for ( int32_t j = maxa ; j < maxb ; j++ )
|
|
// mark it
|
|
bb[j] |= D_USED;
|
|
}
|
|
|
|
if ( numFinal <= m_numDisplayLines )
|
|
m_displayLen = p - m_summary;
|
|
|
|
/*end = gettimeofdayInMilliseconds();
|
|
if ( end - start > 10 )
|
|
log ( LOG_WARN,"summary: took %" INT64 "ms to finish doing summary "
|
|
"numMatches=%" INT32 " maxNumLines=%" INT32 " url=%s", end - start,
|
|
matches.m_numMatches, maxNumLines, f->m_url );
|
|
start = gettimeofdayInMilliseconds();*/
|
|
|
|
// If we still didn't find a summary, directly use what's given in the
|
|
// meta summary or description.
|
|
if ( p == m_summary ){
|
|
Words *wp;
|
|
Pos *pp;
|
|
Sections *ss;
|
|
// get it from the summary
|
|
if ( matches->getMatchGroup(MF_METASUMM ,&wp,&pp,&ss) )
|
|
p += pp->filter(p,pend, wp, 0, wp->m_numWords, ss );
|
|
else if ( matches->getMatchGroup(MF_METADESC,&wp,&pp,&ss) )
|
|
p += pp->filter(p,pend, wp, 0, wp->m_numWords, ss );
|
|
if ( p != m_summary ){
|
|
m_summaryExcerptLen[0] = p - m_summary;
|
|
m_numExcerpts = 1;
|
|
}
|
|
// in this case we only have one summary line
|
|
if ( m_numDisplayLines > 0 )
|
|
m_displayLen = p - m_summary;
|
|
}
|
|
|
|
// free the mem we used if we allocated it
|
|
if ( m_buf4 && m_buf4 != m_tmpBuf4 ) {
|
|
mfree ( m_buf4 , m_buf4Size , "ssstkb" );
|
|
m_buf4 = NULL;
|
|
}
|
|
|
|
|
|
// If we still didn't find a summary, get the default summary
|
|
if ( p == m_summary ) {
|
|
// then return the default summary
|
|
bool status = getDefaultSummary ( xml,
|
|
words,
|
|
sections,
|
|
pos,
|
|
//bigSampleRadius,
|
|
maxSummaryLen );
|
|
if ( m_numDisplayLines > 0 )
|
|
m_displayLen = m_summaryLen;
|
|
|
|
return status;
|
|
}
|
|
|
|
// if we don't find a summary, theres no need to NULL terminate
|
|
if ( p != m_summary ) *p++ = '\0';
|
|
|
|
// set length
|
|
m_summaryLen = p - m_summary;
|
|
|
|
if ( m_summaryLen > 50000 ) { char*xx=NULL;*xx=0; }
|
|
|
|
// it may not have all query terms if rat=0 (Require All Terms=false)
|
|
// so use Matches::m_matchesQuery instead of Matches::m_hasAllQTerms
|
|
//if ( ! matches->m_matchesQuery )
|
|
// log("query: msg20: doc %s missing query terms for q=%s",
|
|
// f->getUrl(),m_q->m_orig );
|
|
|
|
return true;
|
|
}
|
|
|
|
// . usually we get more summary lines than displayed so that the summary
|
|
// deduped, XmlDoc::getSummaryVector(), has adequate sample space
|
|
// . "max excerpts". we truncate the summary if we need to.
|
|
// XmlDoc.cpp::getSummary(), likes to request more excerpts than are
|
|
// actually displayed so it has a bigger summary for deduping purposes.
|
|
int32_t Summary::getSummaryLen ( int32_t maxLines ) {
|
|
int32_t len = 0;
|
|
for ( int32_t i = 0 ; i < m_numExcerpts && i < maxLines ; i++ )
|
|
len += m_summaryExcerptLen[i];
|
|
return len;
|
|
}
|
|
|
|
// MDW: this logic moved mostly to Bits::setForSummary() and
|
|
// Summary::set2(). See the gigawiki url to see the rules for summary
|
|
// generation: http://10.5.1.202:237/eng_wiki/index.php/Eng:Projects
|
|
// i removed this whole function so use git diff to see it later if you
|
|
// need to. setSummaryScores() is obsoleted.
|
|
|
|
// . return the score of the highest-scoring window containing match #m
|
|
// . window is defined by the half-open interval [a,b) where a and b are
|
|
// word #'s in the Words array indicated by match #m
|
|
// . return -1 and set g_errno on error
|
|
int64_t Summary::getBestWindow ( Matches *matches ,
|
|
int32_t mm ,
|
|
int32_t *lasta ,
|
|
int32_t *besta ,
|
|
int32_t *bestb ,
|
|
char *gotIt ,
|
|
char *retired ,
|
|
int32_t maxExcerptLen ) {
|
|
|
|
|
|
// get the window around match #mm
|
|
Match *m = &matches->m_matches[mm];
|
|
// what is the word # of match #mm?
|
|
int32_t matchWordNum = m->m_wordNum;
|
|
|
|
// what Words/Pos/Bits classes is this match in?
|
|
Words *words = m->m_words;
|
|
Section **sp = NULL;
|
|
int32_t *pos = m->m_pos->m_pos;
|
|
// use "m_swbits" not "m_bits", that is what Bits::setForSummary() uses
|
|
swbit_t *bb = m->m_bits->m_swbits;
|
|
|
|
// int16_tcut
|
|
if ( m->m_sections ) sp = m->m_sections->m_sectionPtrs;
|
|
|
|
int32_t nw = words->getNumWords();
|
|
int64_t *wids = words->getWordIds();
|
|
nodeid_t *tids = words->getTagIds();
|
|
|
|
// . sanity check
|
|
// . this prevents a core i've seen
|
|
if ( matchWordNum >= nw ) {
|
|
log("summary: got overflow condition for q=%s",m_q->m_orig);
|
|
// assume no best window
|
|
*besta = -1;
|
|
*bestb = -1;
|
|
*lasta = matchWordNum;
|
|
return 0;
|
|
}
|
|
|
|
// . we NULLify the section ptrs if we already used the word in another
|
|
// summary.
|
|
// . google seems to index SEC_MARQUEE, so i took that out of here
|
|
int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
|
|
if ( (bb[matchWordNum] & D_USED) ||
|
|
( sp && (sp[matchWordNum]->m_flags & badFlags) ) ) {
|
|
// assume no best window
|
|
*besta = -1;
|
|
*bestb = -1;
|
|
*lasta = matchWordNum;
|
|
return 0;
|
|
}
|
|
|
|
// . "a" is the left fence post of the window (it is a word # in Words)
|
|
// . go to the left as far as we can
|
|
// . thus we decrement "a"
|
|
int32_t a = matchWordNum;
|
|
// "posa" is the character position of the END of word #a
|
|
int32_t posa = pos[a+1];
|
|
int32_t firstFrag = -1;
|
|
bool startOnQuote = false;
|
|
bool goodStart = false;
|
|
int32_t wordCount = 0;
|
|
// . decrease "a" as int32_t as we stay within maxNumCharsPerLine
|
|
// . avoid duplicating windows by using "lasta", the last "a" of the
|
|
// previous call to getBestWindow(). This can happen if our last
|
|
// central query term was close to this one.
|
|
for ( ; a > 0 && posa - pos[a-1] < maxExcerptLen && a > *lasta; a-- ) {
|
|
// . don't include any "dead zone",
|
|
// . dead zones have already been used for the summary, and
|
|
// we are getting a second/third/... excerpt here now then
|
|
//if ( wscores[a-1] == -1000000000 ||
|
|
if ( (bb[a-1]&D_USED) ||
|
|
// stop on a title word as well
|
|
//wscores[a-1] == -20000000 ||
|
|
// stop if its the start of a sentence, too
|
|
bb[a] & D_STARTS_SENTENCE ){
|
|
goodStart = true;
|
|
break;
|
|
}
|
|
// stop before title word
|
|
if ( bb[a-1] & D_IN_TITLE ) {
|
|
goodStart = true;
|
|
break;
|
|
}
|
|
// don't go beyond an LI, TR, P tag
|
|
if ( tids && ( tids[a-1] == TAG_LI ||
|
|
tids[a-1] == TAG_TR ||
|
|
tids[a-1] == TAG_P ||
|
|
tids[a-1] == TAG_DIV ) ){
|
|
goodStart = true;
|
|
break;
|
|
}
|
|
// stop if its the start of a quoted sentence
|
|
if ( a+1<nw && (bb[a+1] & D_IN_QUOTES) &&
|
|
words->m_words[a][0] == '\"' ){
|
|
startOnQuote = true;
|
|
goodStart = true;
|
|
break;
|
|
}
|
|
// find out the first instance of a fragment (comma, etc)
|
|
// watch out! because frag also means 's' in there's
|
|
if ( ( bb[a] & D_STARTS_FRAG ) &&
|
|
!(bb[a-1] & D_IS_STRONG_CONNECTOR) && firstFrag == -1 )
|
|
firstFrag = a;
|
|
if ( wids[a] ) wordCount++;
|
|
}
|
|
|
|
// if didn't find a good start, then start at the start of the frag
|
|
if ( !goodStart && firstFrag != -1 )
|
|
a = firstFrag;
|
|
|
|
// don't let punct or tag word start a line, unless a quote
|
|
if ( a < matchWordNum && !wids[a] && words->m_words[a][0] != '\"' ){
|
|
while ( a < matchWordNum && !wids[a] ) a++;
|
|
|
|
// do not break right after a "strong connector", like
|
|
// apostrophe
|
|
while ( a < matchWordNum && a > 0 &&
|
|
( bb[a-1] & D_IS_STRONG_CONNECTOR ) )
|
|
a++;
|
|
|
|
// don't let punct or tag word start a line
|
|
while ( a < matchWordNum && !wids[a] ) a++;
|
|
}
|
|
|
|
// remember, b is not included in the summary, the summary is [a,b-1]
|
|
// remember to include all words in a matched phrase
|
|
int32_t b = matchWordNum + m->m_numWords ;
|
|
int32_t endQuoteWordNum = -1;
|
|
int32_t numTagsCrossed = 0;
|
|
for ( ; b <= nw; b++ ){
|
|
if ( b == nw ) break;
|
|
if ( pos[b+1] - pos[a] >= maxExcerptLen ) break;
|
|
|
|
if ( startOnQuote && words->m_words[b][0] == '\"' )
|
|
endQuoteWordNum = b;
|
|
// don't include any dead zone, those are already-used samples
|
|
//if ( wscores[b] == -1000000000 ) break;
|
|
if ( bb[b]&D_USED ) break;
|
|
// stop on a title word
|
|
//if ( wscores[b] == -20000000 ) break;
|
|
// stop on a title word
|
|
if ( bb[b] & D_IN_TITLE ) break;
|
|
if ( wids[b] ) wordCount++;
|
|
// don't go beyond an LI or TR backtag
|
|
if ( tids && ( tids[b] == (BACKBIT|TAG_LI) ||
|
|
tids[b] == (BACKBIT|TAG_TR) ) ){
|
|
numTagsCrossed++;
|
|
// try to have at least 10 words in the summary
|
|
if ( wordCount > 10 )
|
|
break;
|
|
}
|
|
// go beyond a P or DIV backtag in case the earlier char is a
|
|
// ':'. This came from a special case for wikipedia pages
|
|
// eg. http://en.wikipedia.org/wiki/Flyover
|
|
if ( tids && ( tids[b] == (BACKBIT|TAG_P) ||
|
|
tids[b] == (BACKBIT|TAG_DIV) )){
|
|
numTagsCrossed++;
|
|
// try to have at least 10 words in the summary
|
|
if ( wordCount > 10 && words->m_words[b-1][0] != ':' )
|
|
break;
|
|
}
|
|
}
|
|
|
|
// don't end on a lot of punct words
|
|
if ( b > matchWordNum && !wids[b-1]){
|
|
// remove more than one punct words. if we're ending on a quote
|
|
// keep it
|
|
while ( b > matchWordNum && !wids[b-2] &&
|
|
endQuoteWordNum != -1 && b > endQuoteWordNum )
|
|
b--;
|
|
|
|
// do not break right after a "strong connector", like
|
|
// apostrophe
|
|
while ( b > matchWordNum && (bb[b-2] & D_IS_STRONG_CONNECTOR) )
|
|
b--;
|
|
|
|
}
|
|
|
|
// a int16_tcut
|
|
Match *ms = matches->m_matches;
|
|
// make m_matches.m_matches[mi] the first match in our [a,b) window
|
|
int32_t mi ;
|
|
// . the match at the center of the window is match #"mm", so that
|
|
// matches->m_matches[mm] is the Match class
|
|
// . set "mi" to it and back up "mi" as int32_t as >= a
|
|
for ( mi = mm ; mi > 0 && ms[mi-1].m_wordNum >=a ; mi-- ) ;
|
|
|
|
// now get the score of this excerpt. Also mark all the represented
|
|
// query words. Mark the represented query words in the array that
|
|
// comes to us. also mark how many times the same word is repeated in
|
|
// this summary.
|
|
int64_t score = 0LL;
|
|
// is a url contained in the summary, that looks bad! punish!
|
|
bool hasUrl = false;
|
|
// the word count we did above was just an approximate. count it right
|
|
wordCount = 0;
|
|
|
|
// for debug
|
|
//char buf[5000];
|
|
//char *xp = buf;
|
|
SafeBuf xp;
|
|
|
|
// wtf?
|
|
if ( b > nw ) b = nw;
|
|
|
|
// first score from the starting match down to a, including match
|
|
for ( int32_t i = a ; i < b ; i++ ) {
|
|
|
|
// debug print out
|
|
if ( g_conf.m_logDebugSummary ) {
|
|
int32_t len = words->getWordLen(i);
|
|
char cs;
|
|
for(int32_t k=0;k<len; k+=cs ) {
|
|
char *c = words->m_words[i]+k;
|
|
cs = getUtf8CharSize(c);
|
|
if ( is_binary_utf8 ( c ) ) continue;
|
|
xp.safeMemcpy ( c , cs );
|
|
xp.nullTerm();
|
|
}
|
|
}
|
|
|
|
//if ( wscores[i] < 0 ) continue;
|
|
// skip if in bad section, marquee, select, script, style
|
|
if ( sp && (sp[i]->m_flags & badFlags) ) continue;
|
|
// don't count just numeric words
|
|
if ( words->isNum(i) ) continue;
|
|
// check if there is a url. best way to check for '://'
|
|
if ( wids && !wids[i] ){
|
|
char *wrd = words->m_words[i];
|
|
int32_t wrdLen = words->m_wordLens[i];
|
|
if ( wrdLen == 3 &&
|
|
wrd[0] == ':' && wrd[1] == '/' && wrd[2] == '/' )
|
|
hasUrl = true;
|
|
}
|
|
// get the score
|
|
//int32_t t = wscores[i];
|
|
// just make every word 100 pts
|
|
int32_t t = 100;
|
|
// penalize it if in one of these sections
|
|
if ( bb[i] & ( D_IN_PARENS |
|
|
D_IN_HYPERLINK |
|
|
D_IN_LIST |
|
|
D_IN_SUP |
|
|
D_IN_BLOCKQUOTE ) )
|
|
//t /= 3;
|
|
// backoff since posbd has best window
|
|
// in some links, etc.
|
|
//t *= .85;
|
|
t *= 1;
|
|
// boost it if in bold or italics
|
|
if ( bb[i] & D_IN_BOLDORITALICS ) t *= 2;
|
|
// add the score for this word
|
|
score += t;
|
|
|
|
// print the score, "t"
|
|
if ( g_conf.m_logDebugSummary ) {
|
|
xp.safePrintf("(%" INT32 ")",t);
|
|
}
|
|
|
|
// skip if not wid
|
|
if ( ! wids[i] ) continue;
|
|
// count the alpha words we got
|
|
wordCount++;
|
|
// if no matches left, skip
|
|
if ( mi >= matches->m_numMatches ) continue;
|
|
// get the match
|
|
Match *next = &ms[mi];
|
|
// skip if not a match
|
|
if ( i != next->m_wordNum ) continue;
|
|
// must be a match in this class
|
|
if ( next->m_words != words ) continue;
|
|
// advance it
|
|
mi++;
|
|
// which query word # does it match
|
|
int32_t qwn = next->m_qwordNum;
|
|
|
|
if ( qwn < 0 || qwn >= m_q->m_numWords ){char*xx=NULL;*xx=0;}
|
|
|
|
// undo old score
|
|
score -= t;
|
|
// add 100000 per match
|
|
t = 100000;
|
|
// weight based on tf, goes from 0.1 to 1.0
|
|
t = (int32_t)((float)t * m_wordWeights [ qwn ]);
|
|
// if it is a query stop word, make it 10000 pts
|
|
if ( m_q->m_qwords[qwn].m_isQueryStopWord ) t = 0;//10000;
|
|
|
|
// have we matched it in this [a,b) already?
|
|
if ( gotIt[qwn] > 0 ) t /= 15;
|
|
// have we matched it already in a winning window?
|
|
else if ( retired [qwn] > 0 ) t /= 12;
|
|
|
|
// add it back
|
|
score += t;
|
|
|
|
if ( g_conf.m_logDebugSummary ) {
|
|
xp.safePrintf ("[%" INT32 "]{qwn=%" INT32 ",ww=%f}",t,qwn,
|
|
m_wordWeights[qwn]);
|
|
}
|
|
|
|
// inc the query word count for this window
|
|
if ( gotIt[qwn] < 100 ) gotIt[qwn]++;
|
|
}
|
|
|
|
int32_t oldScore = score;
|
|
|
|
// apply the bonus if it starts or a sentence
|
|
// only apply if the score is positive and if the wordcount is decent
|
|
if ( score > 0 && wordCount > 7 ){
|
|
// a match can give us 10k to 100k pts based on the tf weights
|
|
// so we don't want to overwhelm that too much, so let's make
|
|
// this a 20k bonus if it starts a sentence
|
|
if ( bb[a] & D_STARTS_SENTENCE ) score += 8000;
|
|
// likewise, a fragment, like after a comma
|
|
else if ( bb[a] & D_STARTS_FRAG ) score += 4000;
|
|
// 1k if the match word is very close to the
|
|
// start of a sentence, lets say 3 alphawords
|
|
if ( matchWordNum - a < 7 ) score += 1000;
|
|
// 20M in case of meta stuff, and rss description, which
|
|
// should be the best summary. so give a huge boost
|
|
if ( ! tids ) score += 20000000;
|
|
}
|
|
|
|
// a summary isn't really a summary if its less than 7 words.
|
|
// reduce the score, but still give it a decent score.
|
|
// minus 5M.
|
|
if ( wordCount < 7 ) score -= 20000;
|
|
|
|
// summaries that cross a lot of tags are usually bad, penalize them
|
|
if ( numTagsCrossed > 1 ) score -= (numTagsCrossed * 20000);
|
|
|
|
if ( hasUrl ) score -= 8000;
|
|
|
|
// show it
|
|
if ( g_conf.m_logDebugSummary )
|
|
logf(LOG_DEBUG,"score=%08" INT32 " prescore=%08" INT32 " a=%05" INT32 " b=%05" INT32 " %s",
|
|
(int32_t)score,oldScore,(int32_t)a,(int32_t)b,
|
|
xp.getBufStart());
|
|
|
|
// set lasta, besta, bestb
|
|
*lasta = a;
|
|
*besta = a;
|
|
*bestb = b;
|
|
|
|
return score;
|
|
}
|
|
|
|
// get summary when no search terms could be found
|
|
bool Summary::getDefaultSummary ( Xml *xml,
|
|
Words *words,
|
|
Sections *sections,
|
|
Pos *pos,
|
|
int32_t maxSummaryLen ){
|
|
|
|
char *p = m_summary;
|
|
if (MAX_SUMMARY_LEN < maxSummaryLen)
|
|
maxSummaryLen = MAX_SUMMARY_LEN;
|
|
|
|
// null it out
|
|
m_summaryLen = 0;
|
|
|
|
// try the meta summary tag
|
|
if ( m_summaryLen <= 0 )
|
|
m_summaryLen = xml->getMetaContent ( p , maxSummaryLen ,
|
|
"summary",7);
|
|
|
|
// the meta descr
|
|
if ( m_summaryLen <= 0 )
|
|
m_summaryLen = xml->getMetaContent(p,maxSummaryLen,
|
|
"description",11);
|
|
|
|
|
|
if ( m_numDisplayLines > 0 )
|
|
m_displayLen = m_summaryLen;
|
|
|
|
if ( m_summaryLen > 0 ) {
|
|
m_summaryExcerptLen[0] = m_summaryLen;
|
|
m_numExcerpts = 1;
|
|
return true;
|
|
}
|
|
|
|
bool inTitle = false;
|
|
//bool inHeader = false;
|
|
bool inTable = false;
|
|
bool inList = false;
|
|
bool inLink = false;
|
|
bool inStyle = false;
|
|
int scoreMult = 1;
|
|
char *pend = m_summary + maxSummaryLen - 2;
|
|
int32_t start = -1, numConsecutive = 0;
|
|
int32_t bestStart = -1, bestEnd = -1, longestConsecutive = 0;
|
|
int32_t lastAlnum = -1;
|
|
// google seems to index SEC_MARQUEE, so i took that out of here
|
|
int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
|
|
// int16_tcut
|
|
nodeid_t *tids = words->m_tagIds;
|
|
int64_t *wids = words->getWordIds();
|
|
// get the section ptr array 1-1 with the words, "sp"
|
|
Section **sp = NULL;
|
|
if ( sections ) sp = sections->m_sectionPtrs;
|
|
for (int32_t i = 0;i < words->getNumWords(); i++){
|
|
// skip if in bad section
|
|
if ( sp && (sp[i]->m_flags & badFlags) ) continue;
|
|
if (start > 0 && bestStart == start &&
|
|
( words->m_words[i] - words->m_words[start] ) >=
|
|
( maxSummaryLen - 8 )){
|
|
longestConsecutive = numConsecutive;
|
|
bestStart = start;
|
|
bestEnd = lastAlnum;//i-1;
|
|
break;
|
|
}
|
|
if (words->isAlnum(i) ) {
|
|
// (scores->getScore(i) * scoreMult) > 0){
|
|
if (!inLink)
|
|
numConsecutive++;
|
|
lastAlnum = i;
|
|
if (start < 0) start = i;
|
|
continue;
|
|
}
|
|
nodeid_t tid = tids[i] & BACKBITCOMP;
|
|
// we gotta tag?
|
|
if ( tid ) {
|
|
// ignore <p> tags
|
|
if ( tid == TAG_P ) continue;
|
|
// is it a front tag?
|
|
if ( tid && ! (tids[i] & BACKBIT) ) {
|
|
if ( tid == TAG_STYLE )
|
|
inStyle = true;
|
|
else if ( tid == TAG_TITLE )
|
|
inTitle = true;
|
|
else if ( tid == TAG_OL || tid == TAG_UL )
|
|
inList = true;
|
|
else if ( tid == TAG_A )
|
|
inLink = true;
|
|
}
|
|
else if ( tid ) {
|
|
if ( tid == TAG_STYLE )
|
|
inStyle = false;
|
|
else if ( tid == TAG_TITLE )
|
|
inTitle = false;
|
|
else if ( tid == TAG_OL || tid == TAG_UL )
|
|
inList = false;
|
|
else if ( tid == TAG_A )
|
|
inLink = false;
|
|
}
|
|
if (inTitle||inList||inTable||inStyle) scoreMult = -1;
|
|
else scoreMult = 1;
|
|
if ( ! isBreakingTagId(tid) )
|
|
continue;
|
|
}
|
|
else if ( ! wids[i] ) continue;
|
|
|
|
// end of consecutive words
|
|
if ( numConsecutive > longestConsecutive ) {
|
|
longestConsecutive = numConsecutive;
|
|
bestStart = start;
|
|
bestEnd = i-1;
|
|
}
|
|
start = -1;
|
|
numConsecutive = 0;
|
|
}
|
|
if (bestStart >= 0 && bestEnd > bestStart){
|
|
int32_t len = pos->filter(p, pend-10, words,
|
|
bestStart,
|
|
bestEnd,
|
|
sections);//cores);
|
|
p += len;
|
|
if ( len > 0 && p + 3 + 2 < pend ){
|
|
// space first?
|
|
if ( p > m_summary ) *p++ = ' ';
|
|
gbmemcpy ( p , "..." , 3 );
|
|
p += 3;
|
|
}
|
|
// NULL terminate
|
|
*p++ = '\0';
|
|
// set length
|
|
m_summaryLen = p - m_summary;
|
|
|
|
if ( m_numDisplayLines > 0 )
|
|
m_displayLen = m_summaryLen;
|
|
|
|
if ( m_summaryLen > 50000 ) { char*xx=NULL;*xx=0; }
|
|
return true;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
bool Summary::scanForLocations ( ) {
|
|
m_summaryLocs.reset();
|
|
m_summaryLocsPops.reset();
|
|
|
|
Words words;
|
|
if ( ! words.set( m_buf, m_bufLen, TITLEREC_CURRENT_VERSION,
|
|
false, // computeIds
|
|
false // hasHtmlEntities
|
|
) )
|
|
return false;
|
|
|
|
char locBuf[1024];
|
|
AppendingWordsWindow ww;
|
|
if ( ! ww.set( &words,
|
|
1, // minWindowSize
|
|
5, // maxWindowSize
|
|
1024, // buf size
|
|
locBuf // buf
|
|
) )
|
|
return false;
|
|
|
|
// find all phrases between length of 1 and 5
|
|
for (ww.processFirstWindow(); !ww.isDone(); ww.processNextWindow()) {
|
|
ww.act();
|
|
|
|
char *phrasePtr = ww.getPhrasePtr();
|
|
int32_t phraseLen = ww.getPhraseLen();
|
|
int32_t numPhraseWords = ww.getNumWords();
|
|
if ( numPhraseWords == 0 ) continue;
|
|
|
|
// see if buf phrase is a place
|
|
int32_t placePop = getPlacePop( phrasePtr, phraseLen);
|
|
if ( placePop > 50000 ) {
|
|
uint64_t place = hash64d( phrasePtr, phraseLen);
|
|
if (place == 0) continue;
|
|
|
|
log(LOG_DEBUG, "query: found place:'%s' (len:%" INT32 ") in "
|
|
"summary -- h:%" UINT64 " pop:%" INT32 "",
|
|
phrasePtr, phraseLen, place, placePop);
|
|
|
|
if (!m_summaryLocs.safeMemcpy((char *)&place,
|
|
sizeof(uint64_t)))
|
|
return false;
|
|
if (!m_summaryLocsPops.safeMemcpy((char *)&placePop,
|
|
sizeof(int32_t)))
|
|
return false;
|
|
}
|
|
}
|
|
// sanity check - should have same # of locs as loc pops
|
|
if (m_summaryLocs.length()/sizeof(uint64_t) !=
|
|
m_summaryLocsPops.length()/sizeof(int32_t)) {
|
|
char *xx = NULL; *xx = 0;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
|
|
///////////////
|
|
//
|
|
// YE OLDE SUMMARY GENERATOR of LORE
|
|
//
|
|
///////////////
|
|
|
|
// . i upped this from 300 to 3000 to better support the BIG HACK
// . this is the size of the wordPtrs[] and qterms[] arrays in
//   Summary::set1(), so at most this many matching words are recorded
//   per document
|
|
#define MAX_TO_MATCH 3000
|
|
|
|
bool Summary::set0 ( char *doc , int32_t docLen , Query *q, Msg20Request *mr ) {
|
|
return set1 ( doc ,
|
|
docLen ,
|
|
q ,
|
|
mr->m_summaryMaxLen ,
|
|
mr->m_numSummaryLines,
|
|
mr->m_maxNumCharsPerLine ,
|
|
mr->m_bigSampleRadius ,
|
|
mr->m_bigSampleMaxLen ,
|
|
NULL , // bigSampleLen ptr!
|
|
NULL ,
|
|
(int64_t *)mr->ptr_termFreqs );
|
|
}
|
|
|
|
// . doc must be NULL terminated
|
|
// . returns false and sets g_errno on error
|
|
// . CAUTION: writes into "doc"
|
|
bool Summary::set1 ( char *doc ,
|
|
int32_t docLen ,
|
|
Query *q ,
|
|
int32_t maxSummaryLen ,
|
|
int32_t maxNumLines ,
|
|
int32_t maxNumCharsPerLine ,
|
|
int32_t bigSampleRadius ,
|
|
int32_t bigSampleMaxLen ,
|
|
int32_t *bigSampleLen ,
|
|
char *foundTermVector ,
|
|
int64_t *termFreqs ) {
|
|
// reset summary
|
|
m_summaryLen = 0;
|
|
m_summary[0]='\0';
|
|
// boundary check
|
|
if ( MAX_SUMMARY_LEN < maxNumCharsPerLine * maxNumLines ) {
|
|
g_errno = EBUFTOOSMALL;
|
|
return log("query: Summary too big to hold in buffer of %" INT32 " "
|
|
"bytes.",(int32_t)MAX_SUMMARY_LEN);
|
|
}
|
|
// query terms
|
|
int32_t numTerms = q->getNumTerms();
|
|
// . now assign scores based on term frequencies
|
|
// . highest score is 10000, then 9900, 9800, 9700, ...
|
|
int32_t ptrs [ ABS_MAX_QUERY_TERMS ];
|
|
for ( int32_t i = 0 ; i < numTerms ; i++ ) ptrs[i] = i;
|
|
// convenience var
|
|
int64_t *freqs = termFreqs; // q->getTermFreqs();
|
|
// . this is taken from IndexTable.cpp
|
|
// . bubble sort so lower freqs (rare terms) are on top
|
|
bool flag = true;
|
|
while ( flag ) {
|
|
flag = false;
|
|
for ( int32_t i = 1 ; i < numTerms ; i++ ) {
|
|
if ( freqs[i] >= freqs[i-1] ) continue;
|
|
int32_t tmp = freqs[i];
|
|
freqs[i ] = freqs[i-1];
|
|
freqs[i-1] = tmp;
|
|
tmp = ptrs[i];
|
|
ptrs [i ] = ptrs [i-1];
|
|
ptrs [i-1] = tmp;
|
|
flag = true;
|
|
}
|
|
}
|
|
// assign scores, give rarest terms highest score
|
|
int32_t scores [ ABS_MAX_QUERY_TERMS ];
|
|
for ( int32_t i = 0 ; i < numTerms ; i++ )
|
|
scores[ptrs[i]] = 10000000 - (i*100);
|
|
// force QUERY stop words to have much lower scores at most 10000
|
|
for ( int32_t i = 0 ; i < numTerms ; i++ )
|
|
if ( q->isQueryStopWord(i) && q->getTermSign(i) == '\0' )
|
|
//scores[i] /= 100000;
|
|
scores[i] = 0;
|
|
// . don't bother with ignored terms (mostly stop words) but could be
|
|
// word from a compound word like cd-rom or some_file
|
|
// . typically they will just be represented by a phrase termId
|
|
// . we need to include so we can match on those words
|
|
//for ( int32_t i = 0 ; i < numTerms ; i++ )
|
|
// if ( q->m_ignore[i] ) scores[i] = 0;
|
|
// don't include if no word representation to match (like phrases)
|
|
for ( int32_t i = 0 ; i < numTerms ; i++ )
|
|
if ( q->isPhrase(i) ) scores[i] = 0;
|
|
// don't highlight '-' terms (or boolean terms in a NOT clause)
|
|
for ( int32_t i = 0 ; i < numTerms ; i++ ) {
|
|
if ( q->getTermSign(i) == '-' ) scores[i] = -1000000;
|
|
//if ( q->m_qterms[i].m_underNOT ) scores[i] = -1000000;
|
|
// don't highlight stuff in fields
|
|
if ( q->m_qterms[i].m_qword->m_fieldCode) scores[i] = -1000000;
|
|
}
|
|
|
|
// . set the "m" array
|
|
// . it helps us avoid excessive use of strcmp()
|
|
// . m [c] lets us know if a query term begins with the letter c
|
|
// . m2[c] lets us know if a query term's 2nd letter is c
|
|
char m [256];
|
|
char m2[256];
|
|
memset ( m , 0 , 256 );
|
|
memset ( m2 , 0 , 256 );
|
|
// populate
|
|
for ( int32_t i = 0 ; i < numTerms ; i++ ) {
|
|
if ( scores[i] <= 0 ) continue;
|
|
int32_t tlen = q->getTermLen ( i );
|
|
char *t = q->getTerm ( i );
|
|
// bitch if NULL!!!
|
|
if ( ! t || tlen <= 0 ) continue;
|
|
char t0 = t[0];
|
|
// count both upper and lower case!
|
|
if ( is_ascii(t0) ) {
|
|
m[(unsigned char)(to_upper_a(t0))] = 1;
|
|
m[(unsigned char)(to_lower_a(t0))] = 1;
|
|
}
|
|
else {
|
|
m[(unsigned char)t0] = 1;
|
|
}
|
|
// if we convert all chars to ascii before hashing, watch out
|
|
if ( tlen <= 2 ) { m2[0]=1; continue; }
|
|
char t1 = t[1];
|
|
// c++ et al are special cases
|
|
// but do we really need to call it '0'???
|
|
//if ( ! is_alnum_a(t1) ) { m2[0] = 1; continue; }
|
|
if ( is_ascii(t1) ) {
|
|
m2[(unsigned char)(to_upper_a((t1)))] = 1;
|
|
m2[(unsigned char)(to_lower_a((t1)))] = 1;
|
|
}
|
|
else {
|
|
m2[(unsigned char)t1] = 1;
|
|
}
|
|
}
|
|
|
|
// . score of each word matching a query term in doc
|
|
// . divide by 2 since we don't match on punctuation words, only alnum
|
|
// . wordPtrs pts into "doc" to the matching word
|
|
char *wordPtrs [MAX_TO_MATCH];
|
|
int32_t qterms [MAX_TO_MATCH];
|
|
int32_t numMatches = 0;
|
|
|
|
// . now find the matches by using strncasecmp()
|
|
// . we make sure the first 2 chars match before call strncasecmp()
|
|
// . we set the scores[] array
|
|
unsigned char *s = (unsigned char *)doc;
|
|
int32_t i = 0;
|
|
int32_t j;
|
|
unsigned char c;
|
|
// this flag is used to ensure we do phrases correctly.
|
|
// without it, the query "business development center" (in quotes)
|
|
// would match a doc with "business development" and
|
|
// "development center" as two separate phrases.
|
|
char cflag = 0;
|
|
while ( s[i] ) {
|
|
// skip non-alnum chars
|
|
// while ( s[i] && ! is_alnum(s[i]) ) i++;
|
|
for ( ; ! is_alnum_utf8 (s+i ) ; i += getUtf8CharSize(s+i) ) {
|
|
// if we hit start of a tag, skip the whole tag
|
|
//if ( s[i] == '<' ) i = skipTag ( s , i );
|
|
// else i += getUtf8CharSize(s+i);
|
|
}
|
|
// get length
|
|
j = i;
|
|
// while ( is_alnum (s[j] ) ) j++;
|
|
for ( ; is_alnum_utf8 (s+j ) ; j += getUtf8CharSize(s+j) );
|
|
// if no alnum after, bail
|
|
if ( j == i ) break;
|
|
// . does this word match a query word?
|
|
// . continue if first char matches no query term
|
|
if ( ! m[s[i]] ) { i = j; cflag = 0; continue; }
|
|
// get 2nd char
|
|
c = s[i+1];
|
|
// . if not alnum use \0
|
|
// . do we need this???
|
|
//if ( ! is_alnum_a ( c ) ) c = '\0';
|
|
// does 2nd char match a query term?
|
|
if ( ! m2[c] ) { i = j; cflag = 0; continue; }
|
|
// add in + or ++ (from Words.cpp)
|
|
if ( s[j] == '+' ) {
|
|
if ( s[j+1]=='+' && !is_alnum_utf8(s+j+2) ) j += 2;
|
|
else if ( !is_alnum_utf8(s+j+1) ) j++;
|
|
}
|
|
// c#
|
|
if ( s[j] == '#' && !is_alnum_utf8(s+j+1) ) j++;
|
|
// . check all the way here, it's probably a match
|
|
// . TODO: what about phrases?
|
|
int32_t k ;
|
|
for ( k = 0 ; k < numTerms ; k++ ) {
|
|
if ( scores[k] <= 0 ) continue;
|
|
if ( q->getTermLen(k) != (j-i)<<1 ) continue;
|
|
// . watch out for foreign chars on this compare
|
|
// . advance over first 2 letters which we know match
|
|
// . no, they could match different words!!! fixed!
|
|
unsigned char *s1 = &s[i] ;
|
|
unsigned char *s2 = (unsigned char *)q->getTerm(k) ;
|
|
//int32_t len = j - i ;
|
|
unsigned char *s1end = s1 + j - i;
|
|
char size1 ;
|
|
char size2 ;
|
|
// compare them independent of case in utf8
|
|
for ( ; s1 < s1end ; ) {
|
|
size1 = getUtf8CharSize(s1);
|
|
size2 = getUtf8CharSize(s2);
|
|
if ( size1 != size2 ) break;
|
|
int32_t low1 = to_lower_utf8_32 ( (char *)s1 );
|
|
int32_t low2 = to_lower_utf8_32 ( (char *)s2 );
|
|
if ( low1 != low2 ) break;
|
|
s1 += size1;
|
|
s2 += size2;
|
|
}
|
|
// if no match, try next term
|
|
if ( s1 < s1end ) continue;
|
|
// if it's matching a term involved in a compound
|
|
// phrase then we must have matched the prev word
|
|
if ( q->m_qterms[k].m_phrasePart >= 0 &&
|
|
! q->m_hasDupWords ) {
|
|
//if ( cflag > 0 && k == 7 )
|
|
// log("hey");
|
|
//if ( cflag > 0 && k == 6 )
|
|
// log("hey");
|
|
// are we the first in a compound phrase?
|
|
if ( k == 0 ||
|
|
q->m_qterms[k-1].m_isPhrase ||
|
|
q->m_qterms[k-1].m_phrasePart !=
|
|
q->m_qterms[k+0].m_phrasePart )
|
|
cflag = k;
|
|
// are we not the first in a compound phrase?
|
|
else if ( cflag == k-1 &&
|
|
q->m_qterms[k+0].m_phrasePart ==
|
|
q->m_qterms[k-1].m_phrasePart )
|
|
cflag = k;
|
|
// if query has dup words, do a strncmp!
|
|
//else if (strncasecmp(q->m_qterms[k].m_term,
|
|
// (char *)s1,j-i)==0)
|
|
// cflag = k;
|
|
// otherwise the phrase chain was broken
|
|
else {
|
|
cflag = 0;
|
|
// do not count as a match even
|
|
continue;
|
|
}
|
|
}
|
|
// set term vector for the BIG HACK
|
|
if ( foundTermVector ) foundTermVector[k] = 1;
|
|
// skip this if we got too many, but we still go
|
|
// through the ropes for the BIG HACK
|
|
if ( numMatches >= MAX_TO_MATCH ) continue;
|
|
// we got a match for sure
|
|
wordPtrs [ numMatches ] = (char *)&s[i];
|
|
qterms [ numMatches ] = k;
|
|
numMatches++;
|
|
//if ( numMatches >= MAX_TO_MATCH ) goto combine;
|
|
break;
|
|
}
|
|
if ( k == numTerms ) cflag = 0;
|
|
// advance to j now
|
|
i = j;
|
|
}
|
|
|
|
|
|
|
|
|
|
combine:
|
|
// if no summary request, we're done
|
|
if ( maxNumLines <= 0 || maxSummaryLen <= 0 ) goto getsample;
|
|
{
|
|
// combine neighbors scores to yours
|
|
int32_t score;
|
|
int32_t radius = maxNumCharsPerLine / 2 - 5;
|
|
// min of one
|
|
if ( radius <= 0 ) radius = 1;
|
|
// if a match is within maxNumCharsPerLine chars of it, add it in
|
|
int32_t a , b ;
|
|
int32_t ascore ;
|
|
int32_t qterm;
|
|
int32_t max = 0;
|
|
int32_t maxi = -1;
|
|
int32_t maxa = 0;
|
|
int32_t maxb = 0;
|
|
char gotIt [ ABS_MAX_QUERY_TERMS ];
|
|
char *maxleft = NULL;
|
|
char *maxright = NULL;
|
|
for ( int32_t i = 0 ; i < numMatches ; i++ ) {
|
|
// if word already used, skip it
|
|
if ( qterms[i] == -1 ) continue;
|
|
// set totalScore base
|
|
score = scores[qterms[i]];
|
|
// use this so we can decrease score of repeated query terms
|
|
for ( int32_t j = 0 ; j < numTerms ; j++ ) gotIt[j] = 0;
|
|
// add a got it for us
|
|
gotIt [qterms[i]] = 1;
|
|
// add in our left neighbors
|
|
a = i ;
|
|
while ( --a >= 0 ) {
|
|
// get distance from center
|
|
int32_t dist = wordPtrs[i] - wordPtrs[a] ;
|
|
// break out if too far away
|
|
if ( dist > radius ) break;
|
|
// stop if we hit start of sentence
|
|
|
|
|
|
// if we hit a term already used, stop
|
|
if ( qterms[a] == -1 ) break;
|
|
// date terms are required so make the score huge, 2B
|
|
if ( qterms[a] < 0 ) {
|
|
score = 2000000000;
|
|
continue;
|
|
}
|
|
// it's score
|
|
ascore = scores[qterms[a]];
|
|
// it's query term #
|
|
qterm = qterms[a];
|
|
// reduce score of this term if we already have it
|
|
if ( gotIt[qterm] ) ascore /= 100;
|
|
// reduce by how far away we are from center
|
|
ascore -= (ascore / radius * dist) / 2 ;
|
|
// ensure a min of 1
|
|
if ( ascore <= 0 ) ascore = 1;
|
|
// add it in
|
|
score += ascore;
|
|
// in case we get it again
|
|
gotIt[qterm]++;
|
|
}
|
|
// inc a so we're on the word to be included
|
|
a++;
|
|
// for summaries, keep going back until we hit some punctuation
|
|
// that delimits the sentence... if any.
|
|
char *pp = wordPtrs[a];
|
|
char *ppmin = pp - 2*radius;
|
|
if ( ppmin < doc ) ppmin = doc;
|
|
char sent = 0;
|
|
for ( ; pp > ppmin ; pp-- ) {
|
|
if ( pp[-1] == '.' ) { sent = 1; break; }
|
|
if ( pp[-1] == '?' ) { sent = 1; break; }
|
|
if ( pp[-1] == '!' ) { sent = 1; break; }
|
|
if ( pp[-1] == ':' ) { sent = 1; break; }
|
|
// Xml::getText() replaces breaking tags with double
|
|
// \n's, so assume it will also break a sentence.
|
|
if ( pp[-1] == '\n' &&
|
|
pp+2 > doc &&
|
|
pp[-2] == '\n' ) { sent = 1; break; }
|
|
}
|
|
// samples that start with a sentence beginning get more points
|
|
if ( sent || pp == doc ) score *= 2;
|
|
// otherwise, don't worry about it
|
|
else pp = wordPtrs[a];
|
|
// skip back over punct
|
|
// while ( ! is_alnum(*pp) && pp < wordPtrs[a] ) pp++;
|
|
for ( ; ! is_alnum_utf8(pp) && pp < wordPtrs[a] ;
|
|
pp += getUtf8CharSize(pp) );
|
|
// this may be smaller than normal if we had to extend the
|
|
// left radius to make sure it started at the beginning of
|
|
// a sentence.
|
|
int32_t bradius = 2*radius - (wordPtrs[a] - pp);
|
|
// do not go over doc end
|
|
if ( pp + bradius > doc + docLen ) bradius = doc + docLen - pp;
|
|
// add in our right neighbors
|
|
b = i ;
|
|
while ( ++b < numMatches ) {
|
|
// get distance from center
|
|
int32_t dist = wordPtrs[b] - wordPtrs[i] ;
|
|
// break out if too far away
|
|
//if ( dist > radius ) break;
|
|
if ( dist > bradius ) break;
|
|
// if we hit a term already used, stop
|
|
if ( qterms[b] == -1 ) break;
|
|
// it's score
|
|
ascore = scores[qterms[b]];
|
|
// it's query term #
|
|
qterm = qterms[b];
|
|
// reduce score of this term if we already have it
|
|
if ( gotIt[qterm] ) ascore /= 100;
|
|
// reduce by how far away we are from center
|
|
ascore -= (ascore / radius * dist) / 2 ;
|
|
// ensure a min of 1
|
|
if ( ascore <= 0 ) ascore = 1;
|
|
// add it in
|
|
score += ascore;
|
|
// in case we get it again
|
|
gotIt[qterm]++;
|
|
}
|
|
// samples with extra punctuation cruft are bad
|
|
char *s = pp;
|
|
char *send = wordPtrs[i] + bradius;
|
|
char ssize;
|
|
for ( ; s < send ; s += ssize ) {
|
|
ssize = getUtf8CharSize(s);
|
|
if ( !is_alnum_utf8(s) &&
|
|
*s!=',' &&
|
|
!is_alnum_utf8(s+ssize) &&
|
|
*(s+ssize)!='\"' )
|
|
score >>= 1;
|
|
}
|
|
// is this the new max? continue, if not
|
|
if ( score <= max && maxi >= 0 ) continue;
|
|
// otherwise, we got a winner
|
|
max = score;
|
|
maxi = i;
|
|
maxa = a;
|
|
maxb = b;
|
|
maxleft = pp;
|
|
maxright = wordPtrs[i] + bradius;
|
|
}
|
|
// if no matches, return
|
|
if ( maxi == -1 ) return true;
|
|
// the winning word, whose neighborhood scored the highest
|
|
//char *center = wordPtrs[maxi];
|
|
// set excerpt boundaries
|
|
//char *left = center - radius;
|
|
char *left = maxleft - 1;
|
|
if ( left < doc ) left = doc;
|
|
char *docLast = doc + docLen - 1;
|
|
//char *right = center + radius;
|
|
char *right = maxright;
|
|
if ( right > docLast ) right = docLast;
|
|
// don't let excerpt ptrs break a word
|
|
//while ( is_alnum (*left ) && left > doc ) left++;
|
|
//while ( is_alnum (*right) && right < docLast ) right--;
|
|
for ( ; is_alnum_utf8 (left ) && left > doc ; )
|
|
left += getUtf8CharSize(left);
|
|
for ( ; is_alnum_utf8 (right) && right < docLast ; )
|
|
// back up over all of utf8 char
|
|
for ( ; (*right & 0xc0) == 0x80 ; right-- );
|
|
// skip the over initial or ending non-alnum chars
|
|
//while ( ! is_alnum (*left ) ) left++;
|
|
//while ( ! is_alnum (*right) ) right--;
|
|
for ( ; ! is_alnum_utf8 (left ) ; )
|
|
left += getUtf8CharSize(left);
|
|
for ( ; ! is_alnum_utf8 (right) ; )
|
|
// back up over all of utf8 char
|
|
for ( ; (*right & 0xc0) == 0x80 ; right-- );
|
|
// get excerpt length
|
|
int32_t elen = right - left + 1;
|
|
// if 0 or less, no summary
|
|
if ( elen <= 0 ) return true;
|
|
// . store in m_summary[]
|
|
// . filter out \n \t \r (and multiple sequential spaces later?)
|
|
// . convert < and > to < and > respectively
|
|
char *p = m_summary + m_summaryLen;
|
|
// leave room for NULL termination and any html entities we insert
|
|
char *pend = m_summary + MAX_SUMMARY_LEN - 6;
|
|
char *pstart = p;
|
|
for ( int32_t i = 0 ; i < elen && p < pend ; i++ ) {
|
|
if ( left[i] == '<' ) {*p++='&';*p++='l';*p++='t';*p=';';}
|
|
else if ( left[i] == '>' ) {*p++='&';*p++='g';*p++='t';*p=';';}
|
|
else if ( left[i] == '\t' ) { *p=' '; }
|
|
else if ( left[i] == '\n' ) { *p=' '; }
|
|
else if ( left[i] == '\r' ) { *p=' '; }
|
|
else { *p = left[i]; }
|
|
// don't add it if it was a space and there's a space before it
|
|
if ( *p==' ' && p > pstart && *(p-1)==' ' ) continue;
|
|
// officially add it
|
|
p++;
|
|
}
|
|
// NULL terminate
|
|
*p++ = '\0';
|
|
// set m_summaryLen
|
|
m_summaryLen = p - m_summary;
|
|
// . now reduce the scores by what's in gotIt, so those terms are less
|
|
// likely to be matched again, it gives others a chance
|
|
// . clear the gotIt array
|
|
for ( int32_t j = 0 ; j < numTerms ; j++ ) gotIt[j] = 0;
|
|
// reduce scores of query terms included in this summary excerpt
|
|
for ( int32_t j = maxa ; j < maxb ; j++ ) {
|
|
qterm = qterms[j];
|
|
if ( gotIt[qterm] != 0 ) continue;
|
|
gotIt[qterm] = 1;
|
|
scores [qterm] /= 8;
|
|
}
|
|
// remove winning matches from our 2 arrays so we don't do again
|
|
for ( int32_t j = maxa ; j < maxb ; j++ ) qterms[j] = -1;
|
|
// clear out from "doc" so we don't dup any of summary, too
|
|
memset ( left , ' ' , elen );
|
|
// . do we have enough excerpts?
|
|
// . if not keep looping
|
|
if ( --maxNumLines > 0 ) goto combine;
|
|
}
|
|
|
|
getsample:
|
|
|
|
char *docEnd = doc + docLen;
|
|
char *p = doc;
|
|
char *oldright = (char *)0x7fffffff;
|
|
char *oldleft = NULL;
|
|
// if no big sample request, skip this part
|
|
if ( bigSampleRadius <= 0 || bigSampleMaxLen <= 0 ) return true;
|
|
// get text within a radius of bigSampleRadius words of every
|
|
// query term for generating related topics and what not
|
|
for ( int32_t i = 0 ; i < numMatches ; i++ ) {
|
|
// if it is a stop word or ignored, skip it, unless forced
|
|
// with a plus sign
|
|
int32_t qt = qterms[i];
|
|
if ( q->isQueryStopWord(qt) && q->getTermSign(qt) == '\0' )
|
|
continue;
|
|
// point to left extreme
|
|
char *left = wordPtrs[i] - bigSampleRadius ;
|
|
if ( left < doc ) left = doc;
|
|
char *right = wordPtrs[i] + bigSampleRadius ;
|
|
if ( right > docEnd ) right = docEnd;
|
|
// increase left to avoid splitting words
|
|
//while(is_alnum(*left ) && left > doc && is_alnum(left[-1] ))
|
|
// left--;
|
|
// decrease right to avoid splitting words
|
|
//while(is_alnum(*right) && right > doc && is_alnum(right[-1]))
|
|
// right--;
|
|
// don't let excerpt ptrs break a word
|
|
for ( ; is_alnum_utf8 (left ) && left > doc ; ) {
|
|
// get char to left
|
|
char *pre = left -1;
|
|
// back up over all of utf8 char
|
|
for ( ; (*pre & 0xc0) == 0x80 ; pre-- );
|
|
// stop if not alnum
|
|
if ( ! is_alnum_utf8(pre) ) break;
|
|
// back up left otherwise
|
|
left = pre;
|
|
}
|
|
for ( ; is_alnum_utf8 (right ) && right > doc ; ) {
|
|
// get char to right
|
|
char *pre = right -1;
|
|
// back up over all of utf8 char
|
|
for ( ; (*pre & 0xc0) == 0x80 ; pre-- );
|
|
// stop if not alnum
|
|
if ( ! is_alnum_utf8(pre) ) break;
|
|
// back up right otherwise
|
|
right = pre;
|
|
}
|
|
|
|
// if no previous sample claim it all
|
|
if ( oldright == (char *)0x7fffffff ) {
|
|
oldleft = left;
|
|
oldright = right;
|
|
}
|
|
// if disjoint with previous sample, write previous sample
|
|
else if ( left > oldright ) {
|
|
int32_t size = oldright - oldleft;
|
|
if ( p + size + 1 < docEnd ) {
|
|
gbmemcpy ( p , oldleft , size );
|
|
p += size ;
|
|
*p++ = '\0';
|
|
}
|
|
// we become the old left and right now
|
|
oldleft = left;
|
|
oldright = right;
|
|
// break out if we got enough
|
|
if ( p - doc >= bigSampleMaxLen ) break;
|
|
}
|
|
// otherwise merge with previous sample
|
|
else oldright = right;
|
|
}
|
|
// write out the last one here
|
|
if ( oldright != (char *)0x7fffffff ) {
|
|
int32_t size = oldright - oldleft;
|
|
if ( p + size + 1 < docEnd ) {
|
|
gbmemcpy ( p , oldleft , size );
|
|
p += size ;
|
|
*p++ = '\0';
|
|
}
|
|
}
|
|
// back up if we exceeded limit
|
|
if ( p > doc + bigSampleMaxLen ) p = doc + bigSampleMaxLen;
|
|
// don't split last word
|
|
//while ( p > doc && is_alnum(*p) && is_alnum(p[-1]) ) p--;
|
|
for ( ; p > doc && is_alnum_utf8 (p ) ; ) {
|
|
// get char to p
|
|
char *pre = p -1;
|
|
// back up over all of utf8 char
|
|
for ( ; (*pre & 0xc0) == 0x80 ; pre-- );
|
|
// stop if not alnum
|
|
if ( ! is_alnum_utf8(pre) ) break;
|
|
// back up p otherwise
|
|
p = pre;
|
|
}
|
|
|
|
|
|
// NULL terminate
|
|
//*p = '\0';
|
|
// debug msg
|
|
// print it all out
|
|
/*
|
|
char *tt = doc;
|
|
char *ttend = tt + (p - doc);
|
|
while ( tt < ttend ) {
|
|
log("%s",tt);
|
|
tt += gbstrlen(tt) + 1;
|
|
}
|
|
*/
|
|
// set sample length
|
|
*bigSampleLen = p - doc;
|
|
// success
|
|
return true;
|
|
}
|