// . class for linking words together for query expansion
// . all character strings should be treated as utf8
|
|
|
|
#ifndef _THESAURUS_H_
|
|
#define _THESAURUS_H_
|
|
|
|
#include "HashTableT.h"
|
|
|
|
// . affinities are carried in int32_t throughout this header (DEF_AFFINITY's
//   cast, the getAffinity*() return types), so the ceiling is the int32_t max
// . this used to be LONG_MAX, which is 2^63-1 on LP64 targets and made the
//   DEF_AFFINITY conversion below out of range (undefined behavior)
// . kept as a long literal so arithmetic on it still happens in long
#define MAX_AFFINITY 0x7fffffffL
#define MAX_SYNS 32 // maximum number of synonym links
// default affinity: 90% of MAX_AFFINITY, truncated toward zero
#define DEF_AFFINITY ((int32_t)(MAX_AFFINITY * 0.9))
|
|
|
|
// . one suffix entry; Thesaurus keeps an array of these (m_suffixes)
// . plain aggregate: no constructor, members are not initialized for you
// . NOTE(review): the per-member notes are inferred from the names and from
//   the getStems() comment ("cuts off and/or attaches suffixes") -- confirm
//   against the .cpp before relying on them
struct Suffix {
	char     *m_suffix;   // suffix text (presumably utf8, not owned here)
	int32_t   m_len;      // presumably the byte length of m_suffix
	int32_t   m_numReps;  // number of entries in m_reps / m_repLens
	char    **m_reps;     // presumably the replacement strings for the suffix
	int32_t  *m_repLens;  // presumably the byte length of each m_reps entry
};
|
|
|
|
class StateAffinity {
|
|
public:
|
|
uint64_t m_time;
|
|
HashTableT<uint64_t, uint64_t> m_hitsTable;
|
|
HashTableT<uint64_t, int64_t> m_synTable;
|
|
HashTableT<uint64_t, int64_t> *m_newTable;
|
|
HashTableT<uint64_t, int64_t> *m_oldTable;
|
|
class Thesaurus *m_thes;
|
|
char *m_synstart;
|
|
char *m_syn;
|
|
char *m_synend;
|
|
char *m_synonymText;
|
|
char m_server[MAX_URL_LEN];
|
|
int32_t m_n;
|
|
int32_t m_next;
|
|
int32_t m_ip;
|
|
int32_t m_built;
|
|
int32_t m_skip;
|
|
int32_t m_sent;
|
|
int32_t m_recv;
|
|
int32_t m_errors;
|
|
int32_t m_cache;
|
|
int32_t m_old;
|
|
bool m_fullRebuild;
|
|
void (*m_callback)(void *);
|
|
};
|
|
|
|
// . for m_type field, ranges 0-15 for the current data structure
// . the trailing comment names the component that produces each type
// . SYN_INVALID (127) is deliberately outside the 0-15 range
#define SYN_SYNONYM   0   // thesaurus
#define SYN_STEM      1   // stemmer
#define SYN_SPELLING  2   // speller
#define SYN_ACRONYM   3   // thesaurus
#define SYN_NUMBER    4   // number parser
#define SYN_PHRASE    5   // phrase generator
#define SYN_TRANS     6   // word is a foreign translation
#define SYN_UNKNOWN   15
#define SYN_INVALID   127
|
|
|
|
// . bits: one flag per synonym type, each flag is (1 << type value),
//   e.g. type 1 (stem) -> 0x0002 and type 15 (unknown) -> 0x8000
// . these are what the "bits" arguments of the Thesaurus lookups take
#define SYNBIT_SYNONYM   0x0001
// historical misspelling of SYNBIT_SYNONYM, kept so existing users still build
#define SYNBIT_SYNOMYM   SYNBIT_SYNONYM
#define SYNBIT_STEM      0x0002
#define SYNBIT_SPELLING  0x0004
#define SYNBIT_ACRONYM   0x0008
#define SYNBIT_NUMBER    0x0010
#define SYNBIT_PHRASE    0x0020
#define SYNBIT_TRANS     0x0040
#define SYNBIT_UNKNOWN   0x8000
// Synonym, acronym, translation (== 0x0049)
#define SYNBIT_STATIC    (SYNBIT_SYNONYM | SYNBIT_ACRONYM | SYNBIT_TRANS)
// stem, spelling, number, phrase (== 0x0036)
#define SYNBIT_DYNAMIC   (SYNBIT_STEM | SYNBIT_SPELLING | \
			  SYNBIT_NUMBER | SYNBIT_PHRASE)
// every bit except SYNBIT_UNKNOWN
#define SYNBIT_ALL       0x7FFF
|
|
|
|
// TODO: Maybe make this a Msg class
// . result object that the Thesaurus lookups fill in (they all take a
//   SynonymInfo *)
// . the per-synonym data is stored as parallel arrays (m_syn, m_affinity,
//   m_offset, ...); m_numSyns is the synonym count
// . NOTE(review): this class has a destructor and raw buffer pointers
//   (m_balloc, m_talloc) but declares no copy constructor/assignment --
//   presumably it must not be copied; confirm against the .cpp
class SynonymInfo {
private:
	// NOTE(review): presumably enlarge the synonym arrays, the text
	// buffer and the termId buffer respectively -- confirm
	bool growSyns();
	bool growText();
	bool growTids();

public:
	SynonymInfo();
	~SynonymInfo();

	void reset();

	// s/len is the word, h is its hash (stored in m_h, presumably)
	bool setWord(char *s, int32_t len, uint64_t h);

	// the arguments line up with the per-synonym arrays declared below
	bool addSynonym(char *syn, int32_t affinity,
			int32_t offset, int32_t len,
			char type, char sort, bool hasSpace,
			int64_t leftWordId, int64_t rightWordId);

	char      *m_word;
	uint64_t   m_h;
	int32_t    m_wordLen;
	int32_t    m_numSyns;
	int32_t    m_numIds;       // not 1:1 with syns
	int32_t    m_slots;
	char       m_highSort;     // highest sort priority

	// parallel arrays, one entry per synonym
	char     **m_syn;
	int32_t   *m_affinity;
	int32_t   *m_offset;
	int32_t   *m_len;
	int32_t   *m_firstId;      // certain synonyms can have multiple termIds
	int32_t   *m_lastId;       // so this maps them, inclusive
	char      *m_type;         // one of the SYN_* values
	char      *m_sort;         // sort priority (lower = higher on the list)
	bool      *m_hasSpace;
	int64_t   *m_leftSynHash;  // for phrases, leftmost/rightmost syn hashes
	int64_t   *m_rightSynHash;
	uint64_t  *m_synHash;

	int64_t   *m_termId;       // for getTermFreqs (stored in its own buffer)
	int64_t    m_tidBuf[16];
	int32_t    m_tidSize;

	// NOTE(review): m_buf/m_tbuf look like fixed in-object buffers with
	// m_balloc/m_talloc as the allocated fallbacks -- confirm
	char      *m_balloc;
	int32_t    m_ballocSize;
	char       m_buf[1024];
	char      *m_talloc;
	int32_t    m_tallocSize;
	char       m_tbuf[2048];
	int32_t    m_tbufLen;
};
|
|
|
|
// only used for rebuilding the thesaurus, not saved or used elsewhere,
|
|
// but we need this for the HashTableT we use in rebuildSynonyms
|
|
// m_syn here is actually offsets, if positive they were in dict/words, if
|
|
// negative they were only in the add files
|
|
struct SynonymLinkGroup {
|
|
int32_t m_n;
|
|
uint64_t m_h[MAX_SYNS];
|
|
int32_t m_syn[MAX_SYNS];
|
|
char m_type[MAX_SYNS];
|
|
int32_t m_aff[MAX_SYNS];
|
|
};
|
|
|
|
class Thesaurus {
|
|
public:
|
|
Thesaurus();
|
|
~Thesaurus();
|
|
|
|
void reset();
|
|
|
|
// returns NULL if the offset does not point to the beginning of a word
|
|
char *getSynonymFromOffset(int32_t offset);
|
|
|
|
// fills a SynonymInfo object with EVERYTHING
|
|
bool getAllInfo(char *s, SynonymInfo *syn, int32_t slen, int32_t bits);
|
|
|
|
// fills a SynonymInfo with static data
|
|
bool getSynonymInfo(char *s, SynonymInfo *syn,
|
|
int32_t slen = 0, int32_t bits = SYNBIT_ALL);
|
|
bool getSynonymInfo(uint64_t h, SynonymInfo *syn,
|
|
int32_t bits = SYNBIT_ALL);
|
|
|
|
// all these functions are here for convenience/debugging but they're
|
|
// terribly slow (O(n^2/2) if you use them iteratively), it's better
|
|
// to use getSynonymInfo if you need to retrieve all the info at once
|
|
// for all the N functions, N = 0 is the word itself, and N = numSyns
|
|
// is the least popular synonym
|
|
int32_t getAffinity(char *s1, char *s2, int32_t l1 = 0, int32_t l2 = 0);
|
|
int32_t getAffinity(uint64_t h1, uint64_t h2);
|
|
int32_t getAffinityN(char *s1, int32_t n, int32_t l = 0);
|
|
int32_t getAffinityN(uint64_t h1, int32_t n);
|
|
|
|
// maybe this is pointless, but it could be used to verify that one
|
|
// word is a synonym of another, if we decide we want to do that
|
|
char *getSynonym(char *s1, char *s2, int32_t l1 = 0, int32_t l2 = 0);
|
|
char *getSynonym(uint64_t h1, uint64_t h2);
|
|
char *getSynonymN(char *s, int32_t n, int32_t l = 0);
|
|
char *getSynonymN(uint64_t h, int32_t n);
|
|
|
|
int32_t getNumSyns(char *s, int32_t l = 0);
|
|
int32_t getNumSyns(uint64_t h);
|
|
|
|
int32_t getSlot(char *s1, char *s2, int32_t l1 = 0, int32_t l2 = 0);
|
|
int32_t getSlot(uint64_t h1, uint64_t h2);
|
|
int32_t getSlotN(char *s, int32_t n, int32_t l = 0);
|
|
int32_t getSlotN(uint64_t h, int32_t n);
|
|
|
|
int32_t getOffset(char *s, int32_t l = 0);
|
|
int32_t getOffset(uint64_t h);
|
|
|
|
char getFlag(char *s1, char *s2, int32_t l1 = 0, int32_t l2 = 0);
|
|
char getFlag(uint64_t h1, uint64_t h2);
|
|
char getFlagN(char *s, int32_t n, int32_t l = 0);
|
|
char getFlagN(uint64_t h, int32_t n);
|
|
|
|
int64_t getValue(char *s1, char *s2, int32_t l1 = 0, int32_t l2 = 0);
|
|
int64_t getValue(uint64_t h1, uint64_t h2);
|
|
int64_t getValueN(char *s, int32_t n, int32_t l = 0);
|
|
int64_t getValueN(uint64_t h, int32_t n);
|
|
|
|
// cuts off and/or attaches suffixes to generate new words
|
|
bool getStems(char *s, int32_t slen, SynonymInfo *info);
|
|
|
|
// turns a string/int into the other form ("2" -> "two" & vice versa)
|
|
bool parseNumbers(int64_t n, SynonymInfo *syn);
|
|
bool parseNumbers(char *s, int32_t slen, SynonymInfo *syn);
|
|
|
|
// generates new phrase synonyms off two-term phrases
|
|
bool generatePhrases(char *s, int32_t slen,
|
|
SynonymInfo *info, int32_t bits);
|
|
|
|
bool rebuild(char *server, bool fullRebuild);
|
|
bool rebuildAffinity(char *server, bool fullRebuild);
|
|
|
|
bool save();
|
|
bool load();
|
|
|
|
inline bool init() {
|
|
return load();
|
|
}
|
|
|
|
void cancelRebuild();
|
|
|
|
StateAffinity *m_affinityState;
|
|
|
|
int32_t m_lastRebuild; // . gettimeofday() to force another
|
|
// full affinity rebuild
|
|
bool m_rebuilding; // is it currently rebuilding?
|
|
private:
|
|
HashTableT<uint64_t, int64_t> m_synonymTable;
|
|
char m_buf[2 * HT_BUF_SIZE]; // . synonym table buffer
|
|
|
|
char *m_synonymText;
|
|
int32_t m_synonymLen;
|
|
int32_t m_synonymSize;
|
|
int32_t m_numSynonyms;
|
|
int32_t m_totalPairs;
|
|
|
|
Suffix *m_suffixes;
|
|
int32_t m_numSuffixes;
|
|
SafeBuf m_suffixBuffer;
|
|
char **m_reps;
|
|
int32_t m_numReps;
|
|
int32_t *m_repLens;
|
|
SafeBuf m_stemBuffer;
|
|
HashTableT<uint32_t, char *> m_stemTable;
|
|
|
|
bool initStems();
|
|
bool rebuildSynonyms();
|
|
|
|
};
|
|
|
|
// global Thesaurus instance; only declared here, the definition lives in
// another translation unit (presumably Thesaurus.cpp -- confirm)
extern Thesaurus g_thesaurus;
|
|
|
|
#endif
|
|
|