2013-08-02 13:12:24 -07:00
|
|
|
// Matt Wells, copyright Jul 2001
|
|
|
|
|
|
|
|
// . gets xhtml filtered into plain text
|
|
|
|
// . parses plain text into Words
|
|
|
|
// . gets rawTermIds from the query
|
|
|
|
// . uses Matches class to find words that match rawTermIds
|
|
|
|
// . for each term in query, find the line with that term and the most
|
|
|
|
// other matching terms, and print that line
|
|
|
|
|
|
|
|
// . modifications...
|
|
|
|
// . exclude title from the plain text (call xml->getText() twice?)
|
|
|
|
// . find up to X lines
|
|
|
|
// . find phrases by setting the Phrases class as well
|
|
|
|
// . score lines by termfreqs of terms you highlight in the line
|
|
|
|
// . highlight terms in order of their termFreq, lowest first!
|
|
|
|
// . remove junk from start/end of summary (no back-to-back punct)
|
|
|
|
// . stop summary line on space, not non-alnum (no breaking on apostrophe)
|
|
|
|
// . don't highlight stop words?
|
|
|
|
|
2016-03-08 22:14:30 +01:00
|
|
|
#ifndef GB_SUMMARY_H
|
|
|
|
#define GB_SUMMARY_H
|
2013-08-02 13:12:24 -07:00
|
|
|
|
|
|
|
#include "gb-include.h"
|
|
|
|
|
2016-03-17 00:55:14 +01:00
|
|
|
// . size in bytes of the Summary::m_summary buffer (1024*20 = 20480)
#define MAX_SUMMARY_LEN (1024*20)
|
2013-08-02 13:12:24 -07:00
|
|
|
// . number of entries in the Summary::m_summaryExcerptLen array
#define MAX_SUMMARY_EXCERPTS 1024
|
|
|
|
|
2016-01-12 15:33:42 +01:00
|
|
|
class XmlDoc;
|
2015-11-26 18:45:21 +01:00
|
|
|
class Sections;
|
|
|
|
class Matches;
|
2015-12-10 10:59:26 +01:00
|
|
|
class Xml;
|
|
|
|
class Words;
|
|
|
|
class Pos;
|
|
|
|
class Query;
|
|
|
|
class Url;
|
2015-11-26 18:45:21 +01:00
|
|
|
|
2013-08-02 13:12:24 -07:00
|
|
|
class Summary {
|
2015-11-26 18:13:13 +01:00
|
|
|
public:
|
2013-08-02 13:12:24 -07:00
|
|
|
Summary();
|
|
|
|
~Summary();
|
2015-11-26 18:13:13 +01:00
|
|
|
|
2016-12-20 16:33:26 +01:00
|
|
|
bool setSummary(const Xml *xml, const Words *words, const Sections *sections, Pos *pos, const Query *q,
|
|
|
|
int32_t maxSummaryLen, int32_t numDisplayLines, int32_t maxNumLines, int32_t maxNumCharsPerLine,
|
2016-12-20 16:35:31 +01:00
|
|
|
const Url *f, const Matches *matches, const char *titleBuf, int32_t titleBufLen);
|
2016-02-26 11:06:52 +01:00
|
|
|
|
2016-12-20 16:33:26 +01:00
|
|
|
bool setSummaryFromTags(Xml *xml, int32_t maxSummaryLen, const char *titleBuf, int32_t titleBufLen);
|
2015-11-26 18:45:21 +01:00
|
|
|
|
2016-12-20 16:33:26 +01:00
|
|
|
char *getSummary();
|
|
|
|
const char *getSummary() const;
|
|
|
|
int32_t getSummaryDisplayLen() const;
|
|
|
|
int32_t getSummaryLen() const;
|
2013-08-02 13:12:24 -07:00
|
|
|
|
2016-12-20 16:33:26 +01:00
|
|
|
bool isSetFromTags() const;
|
2016-03-01 18:24:44 +01:00
|
|
|
|
2015-12-10 10:59:26 +01:00
|
|
|
private:
|
2016-12-20 16:33:26 +01:00
|
|
|
bool verifySummary(const char *titleBuf, int32_t titleBufLen);
|
2013-08-02 13:12:24 -07:00
|
|
|
|
2016-12-20 16:33:26 +01:00
|
|
|
bool getDefaultSummary(const Xml *xml, const Words *words, const Sections *sections, Pos *pos, int32_t maxSummaryLen);
|
2015-11-26 18:45:21 +01:00
|
|
|
|
2016-12-20 16:33:26 +01:00
|
|
|
int64_t getBestWindow (const Matches *matches, int32_t mn, int32_t *lasta, int32_t *besta, int32_t *bestb,
|
2015-12-10 10:59:26 +01:00
|
|
|
char *gotIt, char *retired, int32_t maxExcerptLen );
|
2013-08-02 13:12:24 -07:00
|
|
|
|
|
|
|
// null terminate and store the summary here.
|
2015-12-10 10:59:26 +01:00
|
|
|
char m_summary[ MAX_SUMMARY_LEN ];
|
2014-11-10 14:45:11 -08:00
|
|
|
int32_t m_summaryLen;
|
2015-12-10 10:59:26 +01:00
|
|
|
int32_t m_summaryExcerptLen[ MAX_SUMMARY_EXCERPTS ];
|
2014-11-10 14:45:11 -08:00
|
|
|
int32_t m_numExcerpts;
|
2013-08-02 13:12:24 -07:00
|
|
|
|
2014-07-03 07:29:44 -07:00
|
|
|
// if getting more lines for deduping than we need for displaying,
|
|
|
|
// how big is that part of the summary to display?
|
2014-11-10 14:45:11 -08:00
|
|
|
int32_t m_numDisplayLines;
|
|
|
|
int32_t m_displayLen;
|
2014-07-03 07:29:44 -07:00
|
|
|
|
2014-11-10 14:45:11 -08:00
|
|
|
int32_t m_maxNumCharsPerLine;
|
2013-08-02 13:12:24 -07:00
|
|
|
|
2016-03-01 18:24:44 +01:00
|
|
|
bool m_isSetFromTags;
|
|
|
|
|
2013-08-02 13:12:24 -07:00
|
|
|
// ptr to the query
|
2016-12-20 16:33:26 +01:00
|
|
|
const Query *m_q;
|
2013-08-02 13:12:24 -07:00
|
|
|
|
2015-07-13 14:59:44 -06:00
|
|
|
float *m_wordWeights;
|
|
|
|
int32_t m_wordWeightSize;
|
2015-12-10 10:59:26 +01:00
|
|
|
char m_tmpWordWeightsBuf[128];
|
2015-07-13 14:59:44 -06:00
|
|
|
|
2015-07-13 18:42:13 -06:00
|
|
|
char *m_buf4;
|
|
|
|
int32_t m_buf4Size;
|
|
|
|
char m_tmpBuf4[128];
|
2013-08-02 13:12:24 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|