privacore-open-source-searc.../Posdb.h
2018-06-25 15:36:39 +02:00

351 lines
11 KiB
C++

// Matt Wells, Copyright May 2012
// . format of an 18-byte posdb key
// tttttttt tttttttt tttttttt tttttttt t = termId (48bits)
// tttttttt tttttttt dddddddd dddddddd d = docId (38 bits)
// dddddddd dddddddd dddddd0r rrrggggg r = siterank, g = langid
// wwwwwwww wwwwwwww wwGGGGss ssvvvvFF w = word position , s = wordspamrank
// pppppb1N MMMMLZZD v = diversityrank, p = densityrank
// M = multiplier, b = in outlink text
// N = shard by termId
// L = langIdShiftBit (upper bit)
// G: 0 = body
// 1 = intitletag
// 2 = inheading
// 3 = inlist
// 4 = inmetatag
// 5 = inlinktext
// 6 = tag
// 7 = inneighborhood
// 8 = internalinlinktext
// 9 = inurl
// 10 = inmenu
// 11 = explicit keywords
// 12 = middomain
// 13 = lemma
//
// F: 0 = original term
// 1 = conjugate/sing/plural
// 2 = synonym
// 3 = hyponym
// NOTE: N bit is 1 if the shard of the record is determined by the
// termid (t bits) and NOT the docid (d bits). N stands for "nosplit"
// and you can find that logic in XmlDoc.cpp and Msg4.cpp. We store
// the hash of the content like this so we can see if it is a dup.
// NOTE: M bits hold scaling factor (logarithmic) for link text voting
// so we do not need to repeat the same link text over and over again.
// Use M bits to hold # of inlinks the page has for other terms.
// NOTE: for inlinktext terms the spam rank is the siterank of the
// inlinker!
// NOTE: densityrank for title is based on # of title words only. same goes
// for incoming inlink text.
// NOTE: now we can b-step into the termlist looking for a docid match
// and not worry about misalignment from the double compression scheme
// because if the 6th byte's low bit is clear that means it's a docid
// 12-byte key, otherwise it's the word position 6-byte key since the delbit
// can't be clear for those!
// THEN we can play with a tuner for how these various things affect
// the search results ranking.
#ifndef GB_POSDB_H
#define GB_POSDB_H
#include "Rdb.h"
#include "Titledb.h" // DOCID_MASK/MAX_DOCID
#include "HashTableX.h"
#include "Sanity.h"
#include "termid_mask.h"
#include "Lang.h"
// Maximum values of the individual posdb key fields. The Posdb setters use
// them as range limits and the getters use them as bit masks after shifting.
#define MAXSITERANK 0x0f // 4 bits
#define MAXLANGID 0x3f // 6 bits (5 bits go in 'g' the other in 'L')
#define MAXWORDPOS 0x0003ffff // 18 bits
#define MAXDENSITYRANK 0x1f // 5 bits
#define MAXWORDSPAMRANK 0x0f // 4 bits
#define MAXDIVERSITYRANK 0x0f // 4 bits
#define MAXHASHGROUP 0x0f // 4 bits
#define MAXMULTIPLIER 0x0f // 4 bits
#define MAXISSYNONYM 0x03 // 2 bits
// values for G bits in the posdb key
#define HASHGROUP_BODY 0 // body implied
#define HASHGROUP_TITLE 1
#define HASHGROUP_HEADING 2 // body implied
#define HASHGROUP_INLIST 3 // body implied, apparently not detected anymore
#define HASHGROUP_INMETATAG 4
#define HASHGROUP_INLINKTEXT 5
#define HASHGROUP_INTAG 6
#define HASHGROUP_NEIGHBORHOOD 7
#define HASHGROUP_INTERNALINLINKTEXT 8
#define HASHGROUP_INURL 9
#define HASHGROUP_INMENU 10 // body implied
#define HASHGROUP_EXPLICIT_KEYWORDS 11
#define HASHGROUP_MIDDOMAIN 12
#define HASHGROUP_LEMMA 13
// one past the last defined hash group; Posdb::getHashGroup() maps any stored
// value >= HASHGROUP_END to HASHGROUP_END-1
#define HASHGROUP_END 14
// termId used by Posdb::makeDeleteDocKey()
#define POSDB_DELETEDOC_TERMID 0
// Returns a string for hash group 'hg' (one of the HASHGROUP_* values).
// Defined elsewhere; presumably a human-readable name -- confirm in the .cpp.
const char *getHashGroupString ( unsigned char hg );
// a full (uncompressed) posdb key is a 144-bit, i.e. 18-byte, key
typedef key144_t posdbkey_t;
// Posdb wraps the Rdb that stores posdb keys and provides static helpers to
// build full keys and to read/write the individual bit fields of a key.
//
// Byte layout used by the accessors below (byte 0 is the first byte in memory;
// the multi-byte reads assume a little-endian host):
//   bytes 0-1  (key144_t::n0): MMMMLZZD | pppppb1N
//   bytes 2-9  (key144_t::n1): FF/vvvv/ssss/GGGG/word position/langid/siterank
//                              and the low 22 bits of the docid
//   bytes 10-17(key144_t::n2): high 16 bits of the docid, then the termid
class Posdb {
public:
	// resets rdb
	void reset();

	// sets up our m_rdb from g_conf (global conf class)
	bool init ( );

	// init the rebuild/secondary rdb, used by PageRepair.cpp
	bool init2 ( int32_t treeMem );

	// . make a full 18-byte key (posdbkey_t == key144_t) from all these
	//   components and store it at kp
	// . defined in the .cpp file
	static void makeKey ( void *kp ,
			      int64_t termId ,
			      uint64_t docId ,
			      int32_t wordPos ,
			      char densityRank ,
			      char diversityRank ,
			      char wordSpamRank ,
			      char siteRank ,
			      char hashGroup ,
			      char langId ,
			      // multiplier: ends up in the 4 'M' bits of the key
			      // (see MAXMULTIPLIER); any conversion happens in makeKey
			      int32_t multiplier ,
			      bool isSynonym ,
			      bool isDelKey ,
			      bool shardByTermId );

	static void printKey(const char *key);
	static int printList ( RdbList &list ) ;

	// Store the 4-bit multiplier 'mbits' in the M bits (bits 4-7 of n0).
	// Aborts if mbits exceeds MAXMULTIPLIER.
	static void setMultiplierBits ( void *vkp , unsigned char mbits ) {
		key144_t *kp = (key144_t *)vkp;
		if ( mbits > MAXMULTIPLIER ) { gbshutdownAbort(true); }
		// clear ONLY the four M bits. the old mask (0xfc0f) also cleared
		// bit 8 (shard-by-termid bit) and bit 9 (alignment bit), which
		// belong to isShardedByTermId()/setAlignmentBit().
		kp->n0 &= 0xff0f;
		// map score to bits
		kp->n0 |= ((uint16_t)mbits) << 4;
	}

	// Overwrite the docid bits: the low 22 docid bits go into the top of n1,
	// the remaining high bits into the low 16 bits of n2.
	static void setDocIdBits ( void *vkp , uint64_t docId ) {
		key144_t *kp = (key144_t *)vkp;
		kp->n1 &= 0x000003ffffffffffLL;
		kp->n1 |= (docId<<(32+10));
		kp->n2 &= 0xffffffffffff0000LL;
		kp->n2 |= docId>>22;
	}

	// Overwrite the 4 siterank bits (bits 37-40 of n1).
	// Aborts if siteRank is outside [0,MAXSITERANK].
	static void setSiteRankBits ( void *vkp , char siteRank ) {
		key144_t *kp = (key144_t *)vkp;
		// compare as unsigned so a negative (signed char) value is rejected
		// too instead of slipping past the range check
		if ( (unsigned char)siteRank > MAXSITERANK ) { gbshutdownAbort(true); }
		kp->n1 &= 0xfffffe1fffffffffLL;
		// mask before widening: a sign-extended negative value would
		// otherwise overwrite the docid bits above the siterank field
		kp->n1 |= ((uint64_t)(siteRank & MAXSITERANK))<<(32+5);
	}

	// Overwrite the 6-bit language id: the lower 5 bits live in n1 (the 'g'
	// bits), the 6th bit is the 'L' bit (0x08) in n0.
	static void setLangIdBits ( void *vkp , char langId ) {
		key144_t *kp = (key144_t *)vkp;
		if ( langId > MAXLANGID ) { gbshutdownAbort(true); }
		kp->n1 &= 0xffffffe0ffffffffLL;
		// put the lower 5 bits here
		kp->n1 |= ((uint64_t)(langId&0x1f))<<(32);
		// and the upper 6th bit here. n0 is a int16_t.
		// clear it first so a previously stored langId >= 32 does not leave
		// a stale bit behind when the new langId is < 32
		kp->n0 &= 0xfff7;
		if ( langId & 0x20 ) kp->n0 |= 0x08;
	}

	// The next four helpers overwrite/read the 4 bytes starting at byte 2 of
	// the key (where the word position bits et al normally live) as a raw
	// float or int32_t.
	static void setFloat ( void *vkp , float f ) {
		*(float *)(((char *)vkp) + 2) = f;
	}

	static void setInt ( void *vkp , int32_t x ) {
		*(int32_t *)(((char *)vkp) + 2) = x;
	}

	static float getFloat ( const void *vkp ) {
		return *(const float *)(((const char *)vkp) + 2);
	}

	static int32_t getInt ( const void *vkp ) {
		return *(const int32_t *)(((const char *)vkp) + 2);
	}

	// Set (val != 0) or clear (val == 0) the alignment bit, 0x02 of byte 1.
	static void setAlignmentBit ( void *vkp , char val ) {
		char *p = (char *)vkp;
		if ( val ) p[1] = p[1] | 0x02;
		else       p[1] = p[1] & 0xfd;
	}

	static bool isAlignmentBitClear ( const void *vkp ) {
		return ( ( ((const char *)vkp)[1] & 0x02 ) == 0x00 );
	}

	// Build the lowest key for termId (and optionally docId): every other
	// field is zero and the key is made as a delete key.
	static void makeStartKey ( void *kp, int64_t termId ,
				   int64_t docId=0LL){
		makeKey ( kp,
			  termId ,
			  docId,
			  0, // wordpos
			  0, // density
			  0, // diversity
			  0, // wordspam
			  0, // siterank
			  0, // hashgroup
			  0, // langid
			  0, // multiplier
			  0, // issynonym/etc.
			  true , // isdelkey
			  false ); // shardbytermid?
	}

	// Build the highest key for termId (and optionally docId): every other
	// field is set to its maximum and the key is not a delete key.
	static void makeEndKey ( void *kp,int64_t termId,
				 int64_t docId = MAX_DOCID ) {
		makeKey ( kp,
			  termId ,
			  docId,
			  MAXWORDPOS,
			  MAXDENSITYRANK,
			  MAXDIVERSITYRANK,
			  MAXWORDSPAMRANK,
			  MAXSITERANK,
			  MAXHASHGROUP,
			  MAXLANGID,
			  MAXMULTIPLIER,
			  true, // issynonym/etc.
			  false, // isdelkey
			  true);// shard by termid?
	}

	// Build the special key with termid POSDB_DELETEDOC_TERMID for docId;
	// all ranking fields are zero.
	static void makeDeleteDocKey(void *kp, uint64_t docId, bool isDelKey) {
		makeKey(kp, POSDB_DELETEDOC_TERMID, docId, 0, 0, 0, 0, 0, 0, 0, 0, 0, isDelKey, false);
	}

	// we got two compression bits!
	// 0x04 set in byte 0 -> 6-byte key, else 0x02 set -> 12-byte key,
	// else it is a full 18-byte key.
	static unsigned char getKeySize ( const void *key ) {
		if ( (((const char *)key)[0])&0x04 ) return 6;
		if ( (((const char *)key)[0])&0x02 ) return 12;
		return 18;
	}

	// termid is everything above the low 16 bits of n2
	static int64_t getTermId ( const void *key ) {
		return ((const key144_t *)key)->n2 >> 16;
	}

	// docid is the top 38 bits of the 8 bytes starting at byte 4 of the key
	// (unaligned 64-bit read).
	static uint64_t getDocId ( const void *key ) {
		const char *k = (const char*)key;
		uint64_t d = *(const uint64_t*)(k+4);
		d >>= (64-38);
		return d;
	}

	static unsigned char getSiteRank ( const void *key ) {
		return (((const key144_t *)key)->n1 >> 37) & MAXSITERANK;
	}

	// lower 5 bits come from n1, the 6th bit is the 0x08 bit of byte 0
	static lang_t getLangId ( const void *key ) {
		if ( ((const char *)key)[0] & 0x08 )
			return (lang_t)(((((const key144_t *)key)->n1 >> 32) & 0x1f) | 0x20);
		else
			return (lang_t)((((const key144_t *)key)->n1 >> 32) & 0x1f);
	}

	// hash group is stored in bits 2-5 of byte 3.
	// posdb sometimes have crap in it, so protect intersection from dealing
	// with undefined hash groups: anything >= HASHGROUP_END is reported as
	// HASHGROUP_END-1.
	static unsigned char getHashGroup ( const void *key ) {
		unsigned char tmp = ((((const unsigned char *)key)[3]) >>2) & MAXHASHGROUP;
		return tmp<HASHGROUP_END ? tmp : HASHGROUP_END-1;
	}

	// word position is the top 18 bits of the 4 bytes starting at byte 2
	static int32_t getWordPos ( const void *key ) {
		return (*((const uint32_t *)((unsigned char *)key+2))) >> (8+6);
	}

	// Store the 18-bit word position: its two lowest bits go into the top
	// two bits of key[3], the next 16 bits into key[4] and key[5].
	static inline void setWordPos ( char *key , uint32_t wpos ) {
		// truncate
		wpos &= MAXWORDPOS;
		if ( wpos & 0x01 ) key[3] |= 0x40;
		else               key[3] &= ~((unsigned char)0x40);
		if ( wpos & 0x02 ) key[3] |= 0x80;
		else               key[3] &= ~((unsigned char)0x80);
		wpos >>= 2;
		// copies the first two bytes of wpos as laid out in memory
		key[4] = ((char *)&wpos)[0];
		key[5] = ((char *)&wpos)[1];
	}

	static unsigned char getWordSpamRank ( const void *key ) {
		return ((((const uint16_t *)key)[1]) >>6) & MAXWORDSPAMRANK;
	}

	static unsigned char getDiversityRank ( const void *key ) {
		return ((((const unsigned char *)key)[2]) >>2) & MAXDIVERSITYRANK;
	}

	// the 2 'F' bits: lowest two bits of n1
	static unsigned char getIsSynonym ( const void *key ) {
		return (((const key144_t *)key)->n1 ) & 0x03;
	}

	// lowest bit of byte 2
	static unsigned char getIsHalfStopWikiBigram ( const void *key ) {
		return ((const char *)key)[2] & 0x01;
	}

	// density rank is the top 5 bits of the first 16 bits of the key
	static unsigned char getDensityRank ( const void *key ) {
		return ((*(const uint16_t *)key) >> 11) & MAXDENSITYRANK;
	}

	static inline void setDensityRank ( char *key , unsigned char dr ) {
		// shift up
		dr <<= 3;
		// clear out
		key[1] &= 0x07;
		// or in
		key[1] |= dr;
	}

	// the 'N' bit: lowest bit of byte 1
	static char isShardedByTermId ( const void *key ) {return ((const char *)key)[1] & 0x01; }

	static void setShardedByTermIdBit ( void *key ) {
		char *k = (char *)key;
		k[1] |= 0x01;
	}

	// the 4 'M' bits: bits 4-7 of the first 16 bits of the key
	static unsigned char getMultiplier ( const void *key ) {
		return ((*(const uint16_t *)key) >> 4) & MAXMULTIPLIER;
	}

	int64_t getTermFreq ( collnum_t collnum, int64_t termId ) ;
	int64_t estimateLocalTermListSize(collnum_t collnum, int64_t termId);

	Rdb *getRdb ( ) { return &m_rdb; }

	// Rdb init variables
	static inline int32_t getFixedDataSize() { return 0; }
	static inline bool getUseHalfKeys() { return true; }
	static inline char getKeySize() { return sizeof(posdbkey_t); }

private:
	Rdb m_rdb;
};
// forward declaration; only needed for the extern cache declarations below
class RdbCache;
// global Posdb instances, defined elsewhere.
// NOTE(review): g_posdb2 is presumably the rebuild/secondary instance set up
// through Posdb::init2() -- confirm against the .cpp
extern Posdb g_posdb;
extern Posdb g_posdb2;
// global caches, defined elsewhere.
// NOTE(review): judging by the names these back Posdb::getTermFreq() and
// Posdb::estimateLocalTermListSize() -- confirm against the .cpp
extern RdbCache g_termFreqCache;
extern RdbCache g_termListSize;
// defined elsewhere
void reinitializeRankingSettings();
#endif // GB_POSDB_H