privacore-open-source-searc.../Phrases.h
2018-03-19 10:15:38 +01:00

61 lines
1.4 KiB
C++

// Matt Wells, copyright Jul 2001
// . generate phrases and store their hashes into m_phraseIds[] array
// . hash() will then hash the phraseIds into the TermTable (hashtable)
// . will it hash a word as a phrase if it's the only word? No, it will not.
// it only hashes 2+ word phrases
#ifndef GB_PHRASES_H
#define GB_PHRASES_H
#include <inttypes.h>
#include <stddef.h>
#include "max_words.h"
class TokenizerResult;
class Bits;
class Phrases {
public:
Phrases();
~Phrases();
void reset() ;
// . set the hashes (m_phraseIds) of the phrases for these words
// . "bits" describes the words in a phrasing context
bool set(const TokenizerResult &tr, const Bits &bits);
int64_t getPhraseId(int i) const {
return m_phraseIds2[i];
}
// . store phrase that starts with word #i into "buf"
// . we also NULL terminated it in "buf"
void getPhrase(int32_t i, const TokenizerResult &tr, char *buf, size_t bufsize, int32_t *phrLen) const;
int32_t getNumWordsInPhrase2( int32_t i ) const {
return m_numWordsTotal2[i];
}
int32_t getMinWordsInPhrase( int32_t i , int64_t *pid ) const;
private:
void setPhrase(unsigned i, const TokenizerResult &tr, const Bits &bits);
char m_localBuf [ MAX_WORDS * 14 ];
char *m_buf;
int32_t m_bufSize;
// the two word hash
int64_t *m_phraseIds2;
// for the two word phrases:
unsigned char *m_numWordsTotal2;
int32_t m_numPhrases; // should equal the # of words
};
#endif // GB_PHRASES_H