184 lines
6.1 KiB
C++
184 lines
6.1 KiB
C++
// Matt Wells, copyright Jun 2001

// . each word has several bits of information we like to keep track of
// . these bits are used for making phrases in Phrases.h
// . also used by spam detector in Spam.h
// . TODO: rename this class to PhraseBits
// . TODO: separate words in phrases w/ period OR space so a search for
//   "chicken.rib" gives you the renderman file, not a recipe or something
|
|
|
|
#ifndef _BITS_H_
|
|
#define _BITS_H_
|
|
|
|
#include "Words.h"
|
|
|
|
//
// word bit flags
//
// . here are the bit defines tested against the wbit_t values in
//   Bits::m_bits (see the inline accessors in class Bits)
// . every flag is a single distinct bit; together they occupy the low
//   20 bits (0x00000001 .. 0x00080000), so the storage type must be wider
//   than 16 bits
// . NOTE: the summary bit flags defined further down in this header reuse
//   the same numeric values, so the two flag sets must not be mixed in
//   one array
//

// . used for phrasing
// . no punctuation or "big" numbers can be in a phrase
#define D_CAN_BE_IN_PHRASE              0x0001

// is this word a stop word?
#define D_IS_STOPWORD                   0x0002

// . used for phrasing
// . stop words can have a period preceding them in the phrase
// . words preceded by "/" , "." or "/~" can have a period precede them
// . (the macro name keeps its historical spelling so callers still compile)
#define D_CAN_PERIOD_PRECEED            0x0004

// same as above (can we hash this word???)
//#define D_IS_INDEXABLE                0x08

// this means the word is in a verified address (bit set in Address.cpp)
#define D_IS_IN_ADDRESS                 0x0008

// . used for phrasing
// . stop words can only start a phrase if prev word could not "pair across"
#define D_CAN_START_PHRASE              0x0010

// . used for phrasing
// . can we continue forming our phrase after this word?
// . some punctuation words and all stop words can be paired across
#define D_CAN_PAIR_ACROSS               0x0020

// is it capitalized?
#define D_IS_CAP                        0x0040

// is it in a date?
#define D_IS_IN_DATE                    0x0080

// is it in a street name. set by Address.cpp code.
#define D_IS_IN_STREET                  0x0100

#define D_BREAKS_SENTENCE               0x0200

// set by Sections.cpp::setMenu() function
#define D_IN_LINK                       0x0400

// in the place name part of an address?
#define D_IS_IN_VERIFIED_ADDRESS_NAME   0x0800

// allow for dows for texasdrums.org, so TUESDAYS is set with this and
// we can keep it as part of the sentence and not split on the colon
//#define D_IS_IN_DATE_2                0x1000

// this is so we can still set EV_HASTITLEBYVOTES if a tod date is in the
// title, all other dates are no-no!
#define D_IS_DAYNUM                     0x1000

// for setting event titles in Events.cpp
#define D_GENERIC_WORD                  0x2000

#define D_CRUFTY                        0x4000

#define D_IS_NUM                        0x00008000

#define D_IS_IN_UNVERIFIED_ADDRESS_NAME 0x00010000

#define D_IS_IN_URL                     0x00020000

// like D_IS_DAYNUM above
// NOTE(review): this comment used to say "like D_IS_TOD above", but no
// D_IS_TOD is defined in this header -- confirm which flag was meant
#define D_IS_MONTH                      0x00040000

#define D_IS_HEX_NUM                    0x00080000
|
|
//
// the bits below here are used for Summary.cpp when calling
// Bits::setForSummary()
//
// . every flag is a single distinct bit; together they occupy the low
//   13 bits (0x0001 .. 0x1000), so they fit in a 16-bit value
// . NOTE: these reuse the numeric values of the word bit flags defined
//   earlier in this header (e.g. D_CAN_BE_IN_PHRASE is also 0x0001), so
//   the two flag sets must not be mixed in one array
//

// . is this word a strong connector?
// . used by Summary.cpp so we don't split strongly connected things
// . right now, just single character punctuation that is not a space
// . i don't want to split possessive words at the apostrophe, or split
//   ip addresses at the period, etc. applies to unicode as well.
#define D_IS_STRONG_CONNECTOR 0x0001

// . does it start a sentence?
// . if our summary excerpt starts with this then it will get bonus points
#define D_STARTS_SENTENCE     0x0002

// . or does it start a sentence fragment, like after a comma or something
// . the summary excerpt will get *some* bonus points for this
#define D_STARTS_FRAG         0x0004

// . does this word have a quote right before it?
#define D_IN_QUOTES           0x0008

// more bits so we can get rid of Summary::setSummaryScores() so that
// Summary::getBestWindow() just uses these bits to score the window now
#define D_IN_TITLE            0x0010
#define D_IN_PARENS           0x0020
#define D_IN_HYPERLINK        0x0040
#define D_IN_BOLDORITALICS    0x0080
#define D_IN_LIST             0x0100
#define D_IN_SUP              0x0200
#define D_IN_PARAGRAPH        0x0400
#define D_IN_BLOCKQUOTE       0x0800

// for Summary.cpp
#define D_USED                0x1000

//
// end summary bits
//
|
|
|
|
// size, in chars, of the Bits::m_localBuf array embedded in every Bits object
#define BITS_LOCALBUFSIZE 20

// . Words class bits. the most common case
// . 32 bits wide: the word bit flags in this header go up to 0x00080000,
//   which would not fit in 16 bits
typedef uint32_t wbit_t;

// . summary bits used for doing summaries at query time
// . 16 bits wide: the summary bit flags in this header only go up to 0x1000
typedef uint16_t swbit_t;
|
|
|
|
// . used by SimpleQuery.cpp
// . this isn't used for phrasing, it's just so a doc that has the same
//   # of query terms as another, but also one query stop word, won't be
//   ranked above the other doc just because of that
//#define D_IS_QUERY_STOPWORD 0x40
|
|
|
|
class Bits {
|
|
|
|
public:
|
|
|
|
Bits();
|
|
~Bits();
|
|
|
|
bool set2 ( Words *words, int32_t niceness ) {
|
|
return set ( words,TITLEREC_CURRENT_VERSION,niceness); };
|
|
|
|
// . returns false and sets errno on error
|
|
bool set ( Words *words ,
|
|
char titleRecVersion ,
|
|
int32_t niceness ,
|
|
// provide it with a buffer to prevent a malloc
|
|
char *buf = NULL ,
|
|
int32_t bufSize= 0 );
|
|
|
|
bool setForSummary ( Words *words ,
|
|
// provide it with a buffer to prevent a malloc
|
|
char *buf = NULL ,
|
|
int32_t bufSize= 0 );
|
|
|
|
void reset();
|
|
|
|
bool isStopWord (int32_t i) {return m_bits[i]&D_IS_STOPWORD;};
|
|
bool canBeInPhrase (int32_t i) {return m_bits[i]&D_CAN_BE_IN_PHRASE;};
|
|
bool canStartPhrase (int32_t i) {return m_bits[i]&D_CAN_START_PHRASE;};
|
|
bool canPeriodPreceed(int32_t i) {return m_bits[i]&D_CAN_PERIOD_PRECEED;};
|
|
bool canPairAcross (int32_t i) {return m_bits[i]&D_CAN_PAIR_ACROSS;};
|
|
//bool isIndexable (int32_t i) {return m_bits[i]&D_IS_INDEXABLE;};
|
|
bool isCap (int32_t i) {return m_bits[i]&D_IS_CAP;};
|
|
void printBits ( );
|
|
void printBit ( int32_t i );
|
|
|
|
void setInLinkBits ( class Sections *ss ) ;
|
|
void setInUrlBits ( int32_t niceness );
|
|
|
|
bool m_inLinkBitsSet;
|
|
bool m_inUrlBitsSet;
|
|
|
|
//char m_localBuf [MAX_WORDS*10];
|
|
char m_localBuf [ BITS_LOCALBUFSIZE ];
|
|
|
|
// leave public so Query.cpp can tweak this
|
|
wbit_t *m_bits ;
|
|
int32_t m_bitsSize;
|
|
|
|
int32_t m_niceness;
|
|
|
|
// . wordbits
|
|
// . used only by setForSummary() now to avoid having to update a
|
|
// lot of code
|
|
swbit_t *m_swbits;
|
|
int32_t m_swbitsSize;
|
|
|
|
private:
|
|
|
|
Words *m_words;
|
|
|
|
char m_titleRecVersion;
|
|
|
|
bool m_needsFree;
|
|
|
|
// get bits for the ith word
|
|
wbit_t getAlnumBits ( int32_t i , wbit_t prevBits );
|
|
|
|
// get bits for the ith word
|
|
wbit_t getPunctuationBits ( char *s , int32_t slen ) ;
|
|
};
|
|
|
|
#endif
|