// Matt Wells, copyright Jun 2001

// . each word has several bits of information we like to keep track of
// . these bits are used for making phrases in Phrases.h
// . also used by spam detector in Spam.h
// . TODO: rename this class to PhraseBits
// . TODO: separate words in phrases w/ period OR space so a search for
//   "chicken.rib" gives you the renderman file, not a recipe or something

#ifndef GB_BITS_H
#define GB_BITS_H

// fixed-width integer types (int32_t, uint16_t, uint32_t) used below; included
// here so this header does not depend on what the includer pulled in first
#include <stdint.h>

//
// word bits (wbit_t), stored in Bits::m_bits
//

// . here's the bit define's:
// . used for phrasing
// . no punctuation or "big" numbers can be in a phrase
#define D_CAN_BE_IN_PHRASE      0x0001

// is this word a stop word?
#define D_IS_STOPWORD           0x0002

//#define D_UNUSED              0x0004
//#define D_UNUSED              0x0008
//#define D_UNUSED              0x0010

// . used for phrasing
// . can we continue forming our phrase after this word?
// . some punctuation words and all stop words can be paired across
#define D_CAN_PAIR_ACROSS       0x0020

//#define D_UNUSED              0x0040
//#define D_UNUSED              0x0080
//#define D_UNUSED              0x0100
//#define D_UNUSED              0x0200

// set by Sections.cpp::setMenu() function
#define D_IN_LINK               0x0400

//#define D_UNUSED              0x0800
//#define D_UNUSED              0x1000
//#define D_UNUSED              0x2000
//#define D_UNUSED              0x4000
//#define D_UNUSED              0x00008000
//#define D_UNUSED              0x00010000

#define D_IS_IN_URL             0x00020000

//#define D_UNUSED              0x00040000
//#define D_UNUSED              0x00080000

//
// the bits below here are used for Summary.cpp when calling
// Bits::setForSummary()
//
// NOTE: their numeric values deliberately overlap the word bits above
// (e.g. D_IS_STRONG_CONNECTOR == D_CAN_BE_IN_PHRASE == 0x0001), so the two
// families must never be mixed in the same array. presumably these live in
// Bits::m_swbits only -- confirm against Bits.cpp.
//

// . is this word a strong connector?
// . used by Summary.cpp so we don't split strongly connected things
// . right now, just single character punctuation that is not a space
// . i don't want to split possessive words at the apostrophe, or split
//   ip addresses at the period, etc. applies to unicode as well.
#define D_IS_STRONG_CONNECTOR   0x0001

// . does it start a sentence?
// . if our summary excerpt starts with this then it will get bonus points
#define D_STARTS_SENTENCE       0x0002

// . or does it start a sentence fragment, like after a comma or something
// . the summary excerpt will get *some* bonus points for this
#define D_STARTS_FRAG           0x0004

// . does this word have a quote right before it?
#define D_IN_QUOTES             0x0008

// more bits so we can get rid of Summary::setSummaryScores() so that
// Summary::getBestWindow() just uses these bits to score the window now
#define D_IN_TITLE              0x0010
#define D_IN_PARENS             0x0020
#define D_IN_HYPERLINK          0x0040
#define D_IN_BOLDORITALICS      0x0080
#define D_IN_LIST               0x0100
#define D_IN_SUP                0x0200
#define D_IN_PARAGRAPH          0x0400
#define D_IN_BLOCKQUOTE         0x0800

// for Summary.cpp
#define D_USED                  0x1000

//
// end summary bits
//

// size in bytes of the buffer embedded in each Bits object (Bits::m_localBuf)
#define BITS_LOCALBUFSIZE 20

// Words class bits. the most common case
// (32 bits wide: D_IS_IN_URL, 0x00020000, does not fit in 16)
typedef uint32_t wbit_t;

// summary bits used for doing summaries at query time
// (16 bits wide: the highest summary bit, D_USED, is 0x1000)
typedef uint16_t swbit_t;

class Words;

// . holds one wbit_t (and, for summaries, one swbit_t) of flags per word
// . NOTE(review): this class declares a destructor and holds raw pointers but
//   declares no copy constructor/assignment; copying a Bits is presumably
//   unsafe -- confirm against Bits.cpp before passing one by value
class Bits {
public:
	Bits();
	~Bits();

	// . returns false and sets errno on error
	bool set( const Words *words, int32_t niceness );

	// . fills in the summary bits (the D_* summary defines above)
	bool setForSummary( const Words *words );

	void reset();

	// . accessors for the phrasing bits of the i-th word
	// . they index m_bits directly: no bounds or NULL check is done on
	//   "i" or m_bits, so the caller must pass a valid word index
	bool isStopWord( int32_t i ) const {
		return m_bits[i] & D_IS_STOPWORD;
	}

	bool canBeInPhrase( int32_t i ) const {
		return m_bits[i] & D_CAN_BE_IN_PHRASE;
	}

	bool canPairAcross( int32_t i ) const {
		return m_bits[i] & D_CAN_PAIR_ACROSS;
	}

	void setInLinkBits ( class Sections *ss ) ;
	void setInUrlBits  ( int32_t niceness );

	// leave public so Query.cpp can tweak this
	wbit_t *m_bits;
	int32_t m_bitsSize;

	// . wordbits
	// . used only by setForSummary() now to avoid having to update a
	//   lot of code
	swbit_t *m_swbits;
	int32_t  m_swbitsSize;

private:
	int32_t m_niceness;

	const Words *m_words;

	// presumably guard against redoing setInLinkBits()/setInUrlBits()
	// -- confirm against Bits.cpp
	bool m_inLinkBitsSet;
	bool m_inUrlBitsSet;

	// presumably true when the bit array was heap-allocated instead of
	// living in m_localBuf -- confirm against Bits.cpp
	bool m_needsFree;

	// NOTE(review): a char array has no alignment guarantee beyond 1; if
	// m_bits/m_swbits are ever pointed here, confirm the alignment is
	// acceptable for wbit_t/swbit_t
	char m_localBuf [ BITS_LOCALBUFSIZE ];

	// get bits for the ith word
	wbit_t getAlnumBits( int32_t i ) const;
};

#endif // GB_BITS_H