// Source: open-source-search-engine/junkdrawer/Scores.h
// (C++ header; the repository viewer listed it as 101 lines, 3.2 KiB)

// Matt Wells, copyright Jul 2005
// . the Scores class holds a vector of weights that the scores of the words
//   in a Words class are weighted by
// . this was originally made to extract the news article from a web page
//   and discard the words in menu sections and other cruft.
// . words are weighted by the number of neighboring words in their "sections"
//   that are not in hyperlinks.
// . "sections" are determined by table/tr/td/div/... etc. tags
// . m_scores is 1-1 with the words in the supplied "words" class
#ifndef _SCORES_H_
#define _SCORES_H_
#include "Words.h"
// Presumably the "normal" (baseline) score of a word; the comment on
// Scores::m_scores describes the weights as running "from 0 to 128".
// NOTE(review): 128 is a power of two (1 << 7), which is presumably what the
// shift logic mentioned below relies on -- confirm in Phrases.cpp.
// if you change this you must also change the shift logic in Phrases.cpp
// for setting the "minScore"
#define NORM_WORD_SCORE 128
// Size, in bytes, of the Scores::m_localBuf char array that is embedded in
// every Scores object.
#define SCORES_LOCALBUFSIZE 20
// Holds one int32_t score per word (m_scores) for a supplied Words object,
// plus a set of public settings members that PageParser.cpp prints out.
//
// NOTE(review): m_buf / m_needsFree suggest that an instance may own a heap
// buffer, and m_localBuf is a buffer embedded in the object itself. If m_buf
// can point at either of those, the implicitly generated copy constructor and
// copy assignment would be unsafe (double free / dangling pointer). Confirm
// in Scores.cpp and avoid copying Scores objects until then.
class Scores {

 public:

	Scores();
	~Scores();

	void reset();

	// Historical note for the older, commented-out signature below:
	// if indexContentSectionsOnly is true, only the words in the most
	// relevant sections (presumably; the original comment said "scores")
	// will have positive scores, all other words are discarded.
	//bool set ( class Words *words , bool indexContentSectionsOnly );

	// . public entry point for computing the scores of "words"
	// . presumably returns false on error like the private overload
	//   below -- confirm in Scores.cpp
	// . "buf"/"bufSize" let the caller supply storage so that no malloc
	//   is needed
	// . NOTE(review): the meaning of the default minIndexableWords of -1
	//   is not visible from this header -- confirm in Scores.cpp
	bool set ( class Words    *words              ,
		   class Sections *sections           ,
		   int32_t         titleRecVersion    ,
		   // this is true to zero-out terms in the menus, otherwise
		   // we assign them a minimal score of 1
		   bool            eliminateMenus     ,
		   // provide it with a buffer to prevent a malloc
		   char           *buf               = NULL ,
		   int32_t         bufSize           = 0    ,
		   int32_t         minIndexableWords = -1   );

	// small buffer embedded in the object (SCORES_LOCALBUFSIZE bytes)
	//char m_localBuf [ MAX_WORDS*8*10 ];
	char     m_localBuf[SCORES_LOCALBUFSIZE];
	// buffer bookkeeping; getMemUsed() reports m_bufSize.
	// NOTE(review): m_needsFree presumably records whether m_buf was
	// allocated by this class and must be freed -- confirm in Scores.cpp
	char    *m_buf;
	int32_t  m_bufSize;
	bool     m_needsFree;

 private:

	// . fully parameterized worker
	// . returns false and sets g_errno on error
	bool set ( class Words    *words                   ,
		   class Sections *sections                ,
		   int32_t         titleRecVersion         ,
		   bool            scoreBySection          ,
		   bool            indexContentSectionOnly ,
		   int32_t         minSectionScore         ,
		   int32_t         minAvgWordScore         ,
		   int32_t         minIndexableWords       ,
		   // these are for weighting top part of news articles
		   int32_t         numTopWords             ,
		   float           topWordsWeight          ,
		   float           topSentenceWeight       ,
		   int32_t         maxWordsInSentence      ,
		   char           *buf     = NULL          ,
		   int32_t         bufSize = 0             ) ;

 public:

	// size of the score buffer as recorded in m_bufSize
	int32_t getMemUsed () const { return m_bufSize; }

	// . score of word #i
	// . no bounds checking is done here: "i" must be a valid index into
	//   m_scores and m_scores must be non-NULL
	int32_t getScore ( int32_t i ) const { return m_scores[i]; }

	// NOTE(review): the original marked the following with a
	// commented-out "private:", so it was presumably meant to be
	// internal; it is left public to keep the interface unchanged.
	// private:
	bool setScoresBySection ( class Words *words                   ,
				  bool         indexContentSectionOnly ,
				  int32_t      minSectionScore         ,
				  int32_t      minAvgWordScore         );

	// percent to weight word scores by... actually from 0 to 128
	// for speed reasons
	int32_t *m_scores;
	//int32_t *m_rerankScores;

	// these are printed out by PageParser.cpp in TermTable.cpp
	// (their names mirror the parameters of the private set() above)
	bool     m_scoreBySection          ;
	bool     m_indexContentSectionOnly ;
	int32_t  m_minSectionScore         ;
	int32_t  m_minAvgWordScore         ;
	int32_t  m_minIndexableWords       ;
	int32_t  m_numTopWords             ;
	float    m_topWordsWeight          ;
	float    m_topSentenceWeight       ;
	int32_t  m_maxWordsInSentence      ;
};
#endif