Files
privacore-open-source-searc…/Bits.h

145 lines
4.1 KiB
C
Raw Normal View History

2013-08-02 13:12:24 -07:00
// Matt Wells, copyright Jun 2001
// . each word has several bits of information we like to keep track of
// . these bits are used for making phrases in Phrases.h
// . also used by spam detector in Spam.h
// . TODO: rename this class to PhraseBits
// . TODO: separate words in phrases w/ period OR space so a search for
// "chicken.rib" gives you the renderman file, not a recipe or something
2016-03-08 22:14:30 +01:00
#ifndef GB_BITS_H
#define GB_BITS_H
2013-08-02 13:12:24 -07:00
// . here's the bit define's:
// . these flags are tested against the per-word entries of Bits::m_bits
// . used for phrasing
// . no punctuation or "big" numbers can be in a phrase
// . tested by Bits::canBeInPhrase()
#define D_CAN_BE_IN_PHRASE 0x0001
// . is this word a stop word?
// . tested by Bits::isStopWord()
2016-03-02 10:49:33 +01:00
#define D_IS_STOPWORD 0x0002
2016-02-18 18:22:52 +01:00
2016-03-02 10:49:33 +01:00
//#define D_UNUSED 0x0004
//#define D_UNUSED 0x0008
//#define D_UNUSED 0x0010
2016-02-18 18:22:52 +01:00
2013-08-02 13:12:24 -07:00
// . used for phrasing
// . can we continue forming our phrase after this word?
// . some punctuation words and all stop words can be paired across
// . tested by Bits::canPairAcross()
#define D_CAN_PAIR_ACROSS 0x0020
2016-02-18 18:22:52 +01:00
2016-03-02 10:49:33 +01:00
//#define D_UNUSED 0x0040
//#define D_UNUSED 0x0080
//#define D_UNUSED 0x0100
2016-03-02 11:03:12 +01:00
//#define D_UNUSED 0x0200
2016-02-18 18:22:52 +01:00
2013-08-02 13:12:24 -07:00
// set by Sections.cpp::setMenu() function
// NOTE(review): Bits::setInLinkBits() is declared in this header and
// presumably sets this flag as well -- confirm in Bits.cpp
#define D_IN_LINK 0x0400
2016-02-18 18:22:52 +01:00
2016-02-23 15:13:17 +01:00
//#define D_UNUSED 0x0800
2016-03-02 10:49:33 +01:00
//#define D_UNUSED 0x1000
//#define D_UNUSED 0x2000
2016-02-23 15:13:17 +01:00
//#define D_UNUSED 0x4000
2016-03-02 10:49:33 +01:00
//#define D_UNUSED 0x00008000
2016-02-23 15:13:17 +01:00
//#define D_UNUSED 0x00010000
// . this value does not fit in 16 bits, so it can only be stored in a
//   wbit_t (uint32_t) entry, never in a swbit_t (uint16_t) entry
// . NOTE(review): presumably set by Bits::setInUrlBits() -- confirm in
//   Bits.cpp
#define D_IS_IN_URL 0x00020000
2016-03-02 10:49:33 +01:00
//#define D_UNUSED 0x00040000
//#define D_UNUSED 0x00080000
2016-02-23 15:13:17 +01:00
2013-08-02 13:12:24 -07:00
//
// the bits below here are used for Summary.cpp when calling
// Bits::setForSummary()
//
// . NOTE: these values reuse the same numbers as the phrase bits defined
//   earlier in this file (e.g. 0x0001 is both D_CAN_BE_IN_PHRASE and
//   D_IS_STRONG_CONNECTOR), so the two sets cannot be told apart if they
//   are mixed in the same flag word
// . every value in this group fits in 16 bits (the largest is 0x1000)
// . NOTE(review): presumably these are the flags kept in Bits::m_swbits
//   (swbit_t) -- confirm in Bits.cpp
//
// . is this word a strong connector?
// . used by Summary.cpp so we don't split strongly connected things
// . right now, just single character punctuation that is not a space
// . i don't want to split possessive words at the apostrophe, or split
// ip addresses at the period, etc. applies to unicode as well.
#define D_IS_STRONG_CONNECTOR 0x0001
// . does it start a sentence?
// . if our summary excerpt starts with this then it will get bonus points
#define D_STARTS_SENTENCE 0x0002
// . or does it start a sentence fragment, like after a comma or something
// . the summary excerpt will get *some* bonus points for this
#define D_STARTS_FRAG 0x0004
// . does this word have a quote right before it?
#define D_IN_QUOTES 0x0008
// more bits so we can get rid of Summary::setSummaryScores() so that
// Summary::getBestWindow() just uses these bits to score the window now
// . NOTE(review): the names suggest each flag marks a word that lies inside
//   the corresponding kind of markup (title, parentheses, hyperlink, ...);
//   the exact rules are not visible in this header -- confirm in Bits.cpp
#define D_IN_TITLE 0x0010
#define D_IN_PARENS 0x0020
#define D_IN_HYPERLINK 0x0040
#define D_IN_BOLDORITALICS 0x0080
#define D_IN_LIST 0x0100
#define D_IN_SUP 0x0200
#define D_IN_PARAGRAPH 0x0400
#define D_IN_BLOCKQUOTE 0x0800
// for Summary.cpp
// . NOTE(review): meaning of "used" is not defined in this header -- see
//   Summary.cpp
#define D_USED 0x1000
//
// end summary bits
//
// . size of the Bits::m_localBuf member array, in chars
#define BITS_LOCALBUFSIZE 20

// . the fixed-width integer types used from here on (uint32_t, uint16_t and
//   the int32_t in class Bits) are declared by <stdint.h>
// . include it here so this header compiles on its own, regardless of what
//   the including file happened to include first
#include <stdint.h>

// Words class bits. the most common case
// . must be wide enough to hold D_IS_IN_URL (0x00020000)
typedef uint32_t wbit_t;
// summary bits used for doing summaries at query time
// . must be wide enough to hold D_USED (0x1000)
typedef uint16_t swbit_t;
2016-03-01 11:00:52 +01:00
// forward declaration: this header only uses pointers to Words, so the full
// class definition is not needed here
class Words;
2013-08-02 13:12:24 -07:00
// . holds per-word flag arrays for the words of a Words object
// . m_bits holds wbit_t flags (tested with D_IS_STOPWORD,
//   D_CAN_BE_IN_PHRASE, D_CAN_PAIR_ACROSS by the accessors below)
// . m_swbits holds swbit_t flags for setForSummary()
// . NOTE(review): this class declares a destructor and holds raw pointers
//   plus an m_needsFree flag, but declares no copy constructor or copy
//   assignment; copying a Bits object may be unsafe -- confirm against
//   Bits.cpp before copying instances
class Bits {
2016-02-18 17:07:23 +01:00
public:
2013-08-02 13:12:24 -07:00
Bits();
~Bits();
// . returns false and sets errno on error
2016-05-13 17:11:59 +02:00
// . words: the words to compute flags for
// . niceness: NOTE(review): presumably a scheduling priority hint kept in
//   m_niceness -- confirm in Bits.cpp
bool set( const Words *words, int32_t niceness );
// . summary variant of set(); see the m_swbits comment below
// . NOTE(review): presumably returns false on error like set() -- confirm
bool setForSummary( const Words *words );
2013-08-02 13:12:24 -07:00
void reset();
2016-05-12 12:49:38 +02:00
// . true if word #i has D_IS_STOPWORD set in m_bits
// . i is not range-checked and m_bits is not checked for NULL
bool isStopWord( int32_t i ) const {
2016-02-18 17:07:23 +01:00
return m_bits[i] & D_IS_STOPWORD;
}
2016-05-12 12:49:38 +02:00
// . true if word #i has D_CAN_BE_IN_PHRASE set in m_bits
// . i is not range-checked and m_bits is not checked for NULL
bool canBeInPhrase( int32_t i ) const {
2016-02-18 17:07:23 +01:00
return m_bits[i] & D_CAN_BE_IN_PHRASE;
}
2016-05-12 12:49:38 +02:00
// . true if word #i has D_CAN_PAIR_ACROSS set in m_bits
// . i is not range-checked and m_bits is not checked for NULL
bool canPairAcross( int32_t i ) const {
2016-02-18 17:07:23 +01:00
return m_bits[i] & D_CAN_PAIR_ACROSS;
}
2013-08-02 13:12:24 -07:00
// . NOTE(review): presumably sets D_IN_LINK using the given sections --
//   confirm in Bits.cpp
// . "class Sections" here also serves as the forward declaration of Sections
void setInLinkBits ( class Sections *ss ) ;
2014-11-10 14:45:11 -08:00
// . NOTE(review): presumably sets D_IS_IN_URL -- confirm in Bits.cpp
void setInUrlBits ( int32_t niceness );
2013-08-02 13:12:24 -07:00
// leave public so Query.cpp can tweak this
2016-02-18 17:07:23 +01:00
// . indexed by word number (see the accessors above)
wbit_t *m_bits;
// . NOTE(review): size of the m_bits storage; whether this is in bytes or
//   in entries is not visible in this header -- confirm in Bits.cpp
int32_t m_bitsSize;
2013-08-02 13:12:24 -07:00
// . wordbits
// . used only by setForSummary() now to avoid having to update a
// lot of code
swbit_t *m_swbits;
2016-02-18 17:07:23 +01:00
// . NOTE(review): size of the m_swbits storage; units not visible here
int32_t m_swbitsSize;
2013-08-02 13:12:24 -07:00
private:
2016-03-02 11:30:05 +01:00
int32_t m_niceness;
2016-05-13 17:11:59 +02:00
// . not modified through this pointer (pointer to const)
const Words *m_words;
2016-03-02 11:30:05 +01:00
// . NOTE(review): presumably record whether setInLinkBits() and
//   setInUrlBits() have already run -- confirm in Bits.cpp
bool m_inLinkBitsSet;
bool m_inUrlBitsSet;
// . NOTE(review): presumably true when the flag storage was heap-allocated
//   and must be released, false when it lives in m_localBuf -- confirm
bool m_needsFree;
// . fixed-size member buffer of BITS_LOCALBUFSIZE chars
// . NOTE(review): presumably backs the flag arrays for short inputs so no
//   allocation is needed -- confirm in Bits.cpp
char m_localBuf [ BITS_LOCALBUFSIZE ];
2013-08-02 13:12:24 -07:00
2016-03-02 11:30:05 +01:00
// get bits for the ith word
2016-05-12 12:49:38 +02:00
wbit_t getAlnumBits( int32_t i ) const;
2013-08-02 13:12:24 -07:00
};
2016-03-08 22:14:30 +01:00
#endif // GB_BITS_H