Files

347 lines
10 KiB
C
Raw Permalink Normal View History

2013-08-02 13:12:24 -07:00
// Matt Wells, Copyright May 2012
// . format of an 18-byte posdb key
// tttttttt tttttttt tttttttt tttttttt t = termId (48bits)
// tttttttt tttttttt dddddddd dddddddd d = docId (38 bits)
// dddddddd dddddddd dddddd0r rrrggggg r = siterank, g = langid
// wwwwwwww wwwwwwww wwGGGGss ssvvvvFF w = word position , s = wordspamrank
// pppppb1N MMMMLZZD v = diversityrank, p = densityrank
2013-08-02 13:12:24 -07:00
// M = multiplier, b = in outlink text
2017-03-17 12:45:00 +01:00
// N = shard by termId
2013-08-02 13:12:24 -07:00
// L = langIdShiftBit (upper bit)
// G: 0 = body
// 1 = intitletag
// 2 = inheading
// 3 = inlist
// 4 = inmetatag
// 5 = inlinktext
// 6 = tag
// 7 = inneighborhood
// 8 = internalinlinktext
// 9 = inurl
2015-12-29 14:39:44 +01:00
// 10 = inmenu
2013-08-02 13:12:24 -07:00
//
// F: 0 = original term
// 1 = conjugate/sing/plural
// 2 = synonym
// 3 = hyponym
// NOTE: N bit is 1 if the shard of the record is determined by the
// termid (t bits) and NOT the docid (d bits). N stands for "nosplit"
// and you can find that logic in XmlDoc.cpp and Msg4.cpp. We store
// the hash of the content like this so we can see if it is a dup.
2013-08-02 13:12:24 -07:00
// NOTE: M bits hold scaling factor (logarithmic) for link text voting
// so we do not need to repeat the same link text over and over again.
// Use M bits to hold # of inlinks the page has for other terms.
2014-09-24 20:03:16 -07:00
// NOTE: for inlinktext terms the spam rank is the siterank of the
2013-08-02 13:12:24 -07:00
// inlinker!
// NOTE: densityrank for title is based on # of title words only. same goes
// for incoming inlink text.
// NOTE: now we can b-step into the termlist looking for a docid match
// and not worry about misalignment from the double compression scheme
// because if the 6th byte's low bit is clear that means it's a docid
// 12-byte key, otherwise it's the word position 6-byte key since the delbit
// can't be clear for those!
// THEN we can play with a tuner for how these various things affect
// the search results ranking.
2016-03-08 22:14:30 +01:00
#ifndef GB_POSDB_H
#define GB_POSDB_H
2013-08-02 13:12:24 -07:00
#include "Rdb.h"
2016-11-10 16:56:37 +01:00
#include "Titledb.h" // DOCID_MASK/MAX_DOCID
2013-08-02 13:12:24 -07:00
#include "HashTableX.h"
2016-06-28 11:01:47 +02:00
#include "Sanity.h"
2016-09-06 12:09:51 +02:00
#include "termid_mask.h"
2016-06-20 19:29:10 +02:00
2013-08-02 13:12:24 -07:00
// Largest value each posdb key field can hold. Each constant is also the
// bit mask for that field once it has been shifted down to bit 0; the
// getters in class Posdb use them that way.
#define MAXSITERANK 0x0f // 4 bits
#define MAXLANGID 0x3f // 6 bits (5 bits go in 'g' the other in 'L')
#define MAXWORDPOS 0x0003ffff // 18 bits
#define MAXDENSITYRANK 0x1f // 5 bits
#define MAXWORDSPAMRANK 0x0f // 4 bits
#define MAXDIVERSITYRANK 0x0f // 4 bits
#define MAXHASHGROUP 0x0f // 4 bits (only values below HASHGROUP_END are defined)
#define MAXMULTIPLIER 0x0f // 4 bits
2013-08-02 13:12:24 -07:00
#define MAXISSYNONYM 0x03 // 2 bits (the 'F' bits)
// values for G bits in the posdb key
// (the G field is 4 bits wide, see MAXHASHGROUP; values 0..10 are assigned)
#define HASHGROUP_BODY 0 // body implied
#define HASHGROUP_TITLE 1
#define HASHGROUP_HEADING 2 // body implied
#define HASHGROUP_INLIST 3 // body implied
#define HASHGROUP_INMETATAG 4
#define HASHGROUP_INLINKTEXT 5 // apparently not detected anymore
2013-08-02 13:12:24 -07:00
#define HASHGROUP_INTAG 6
#define HASHGROUP_NEIGHBORHOOD 7
#define HASHGROUP_INTERNALINLINKTEXT 8
#define HASHGROUP_INURL 9
#define HASHGROUP_INMENU 10 // body implied
// one past the highest hash group value defined above
#define HASHGROUP_END 11
// termId passed to makeKey() by Posdb::makeDeleteDocKey()
#define POSDB_DELETEDOC_TERMID 0
// returns a printable name for hash group 'hg' (defined outside this header)
const char *getHashGroupString ( unsigned char hg );
2013-08-02 13:12:24 -07:00
2016-09-02 16:07:53 +02:00
// a full-size posdb key; Posdb::getKeySize() reports sizeof(posdbkey_t)
typedef key144_t posdbkey_t;
2013-08-02 13:12:24 -07:00
class Posdb {
public:
// resets rdb
void reset();
// sets up our m_rdb from g_conf (global conf class)
bool init ( );
// init the rebuild/secondary rdb, used by PageRepair.cpp
2014-11-10 14:45:11 -08:00
bool init2 ( int32_t treeMem );
2013-08-02 13:12:24 -07:00
// . make a 16-byte key from all these components
// . since it is 16 bytes, the big bit will be set
static void makeKey ( void *kp ,
2014-10-30 13:36:39 -06:00
int64_t termId ,
uint64_t docId ,
2014-11-10 14:45:11 -08:00
int32_t wordPos ,
2013-08-02 13:12:24 -07:00
char densityRank ,
char diversityRank ,
char wordSpamRank ,
char siteRank ,
char hashGroup ,
char langId ,
// multiplier: we convert into 7 bits in this function
2014-11-10 14:45:11 -08:00
int32_t multiplier ,
2013-08-02 13:12:24 -07:00
bool isSynonym ,
bool isDelKey ,
bool shardByTermId );
2013-08-02 13:12:24 -07:00
2016-09-28 16:59:09 +02:00
static void printKey(const char *key);
static int printList ( RdbList &list ) ;
2013-08-02 13:12:24 -07:00
// we map the 32bit score to like 7 bits here
static void setMultiplierBits ( void *vkp , unsigned char mbits ) {
2013-08-02 13:12:24 -07:00
key144_t *kp = (key144_t *)vkp;
2016-06-20 19:29:10 +02:00
if ( mbits > MAXMULTIPLIER ) { gbshutdownAbort(true); }
2013-08-02 13:12:24 -07:00
kp->n0 &= 0xfc0f;
// map score to bits
2014-11-10 14:45:11 -08:00
kp->n0 |= ((uint16_t)mbits) << 4;
2013-08-02 13:12:24 -07:00
}
static void setDocIdBits ( void *vkp , uint64_t docId ) {
2013-08-02 13:12:24 -07:00
key144_t *kp = (key144_t *)vkp;
kp->n1 &= 0x000003ffffffffffLL;
kp->n1 |= (docId<<(32+10));
kp->n2 &= 0xffffffffffff0000LL;
kp->n2 |= docId>>22;
}
static void setSiteRankBits ( void *vkp , char siteRank ) {
2013-08-02 13:12:24 -07:00
key144_t *kp = (key144_t *)vkp;
2016-06-20 19:29:10 +02:00
if ( siteRank > MAXSITERANK ) { gbshutdownAbort(true); }
2013-08-02 13:12:24 -07:00
kp->n1 &= 0xfffffe1fffffffffLL;
kp->n1 |= ((uint64_t)siteRank)<<(32+5);
2013-08-02 13:12:24 -07:00
}
static void setLangIdBits ( void *vkp , char langId ) {
2013-08-02 13:12:24 -07:00
key144_t *kp = (key144_t *)vkp;
2016-06-20 19:29:10 +02:00
if ( langId > MAXLANGID ) { gbshutdownAbort(true); }
2013-08-02 13:12:24 -07:00
kp->n1 &= 0xffffffe0ffffffffLL;
// put the lower 5 bits here
kp->n1 |= ((uint64_t)(langId&0x1f))<<(32);
2014-11-10 14:45:11 -08:00
// and the upper 6th bit here. n0 is a int16_t.
2013-08-02 13:12:24 -07:00
// 0011 1111
if ( langId & 0x20 ) kp->n0 |= 0x08;
}
// set the word position bits et al to this float
static void setFloat ( void *vkp , float f ) {
2016-05-19 18:37:26 +02:00
*(float *)(((char *)vkp) + 2) = f; }
static void setInt ( void *vkp , int32_t x ) {
2016-05-19 18:37:26 +02:00
*(int32_t *)(((char *)vkp) + 2) = x; }
// and read the float as well
static float getFloat ( const void *vkp ) {
2016-05-19 18:37:26 +02:00
return *(const float *)(((char *)vkp) + 2); }
static int32_t getInt ( const void *vkp ) {
2016-05-19 18:37:26 +02:00
return *(const int32_t *)(((char *)vkp) + 2); }
static void setAlignmentBit ( void *vkp , char val ) {
2013-11-11 18:58:45 -08:00
char *p = (char *)vkp;
if ( val ) p[1] = p[1] | 0x02;
else p[1] = p[1] & 0xfd;
2016-05-19 18:37:26 +02:00
}
2013-11-11 18:58:45 -08:00
static bool isAlignmentBitClear ( const void *vkp ) {
2016-03-15 14:35:51 +01:00
return ( ( ((const char *)vkp)[1] & 0x02 ) == 0x00 );
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
static void makeStartKey ( void *kp, int64_t termId ,
2014-10-30 13:36:39 -06:00
int64_t docId=0LL){
2013-08-02 13:12:24 -07:00
return makeKey ( kp,
termId ,
docId,
0, // wordpos
0, // density
0, // diversity
0, // wordspam
0, // siterank
0, // hashgroup
0, // langid
0, // multiplier
0, // issynonym/etc.
true , // isdelkey
false ); // shardbytermid?
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
static void makeEndKey ( void *kp,int64_t termId,
2014-10-30 13:36:39 -06:00
int64_t docId = MAX_DOCID ) {
2013-08-02 13:12:24 -07:00
return makeKey ( kp,
termId ,
docId,
MAXWORDPOS,
MAXDENSITYRANK,
MAXDIVERSITYRANK,
MAXWORDSPAMRANK,
MAXSITERANK,
MAXHASHGROUP,
MAXLANGID,
MAXMULTIPLIER,
true, // issynonym/etc.
false, // isdelkey
true);// shard by termid?
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
static void makeDeleteDocKey(void *kp, uint64_t docId, bool isDelKey) {
return makeKey(kp, POSDB_DELETEDOC_TERMID, docId, 0, 0, 0, 0, 0, 0, 0, 0, 0, isDelKey, false);
}
2013-08-02 13:12:24 -07:00
// we got two compression bits!
static unsigned char getKeySize ( const void *key ) {
2016-02-15 15:59:16 +01:00
if ( (((const char *)key)[0])&0x04 ) return 6;
if ( (((const char *)key)[0])&0x02 ) return 12;
2013-08-02 13:12:24 -07:00
return 18;
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
static int64_t getTermId ( const void *key ) {
2016-02-15 15:59:16 +01:00
return ((const key144_t *)key)->n2 >> 16;
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
static uint64_t getDocId ( const void *key ) {
2016-07-05 14:03:09 +02:00
const char *k = (const char*)key;
uint64_t d = *(const uint64_t*)(k+4);
d >>= (64-38);
return d;
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
static unsigned char getSiteRank ( const void *key ) {
2016-02-15 15:59:16 +01:00
return (((const key144_t *)key)->n1 >> 37) & MAXSITERANK;
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
static unsigned char getLangId ( const void *key ) {
2016-02-15 15:59:16 +01:00
if ( ((const char *)key)[0] & 0x08 )
return ((((const key144_t *)key)->n1 >> 32) & 0x1f) | 0x20;
2013-08-02 13:12:24 -07:00
else
2016-02-15 15:59:16 +01:00
return ((((const key144_t *)key)->n1 >> 32) & 0x1f) ;
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
static unsigned char getHashGroup ( const void *key ) {
2013-08-02 13:12:24 -07:00
//return (((key144_t *)key)->n1 >> 10) & MAXHASHGROUP;
//return ((((const unsigned char *)key)[3]) >>2) & MAXHASHGROUP;
//posdb sometimes have crap in it, so protect intersection from dealing with undefined hash groups
unsigned char tmp = ((((const unsigned char *)key)[3]) >>2) & MAXHASHGROUP;
return tmp<=10 ? tmp : 10;
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
static int32_t getWordPos ( const void *key ) {
2013-08-02 13:12:24 -07:00
//return (((key144_t *)key)->n1 >> 14) & MAXWORDPOS;
2016-02-15 15:59:16 +01:00
return (*((const uint32_t *)((unsigned char *)key+2))) >> (8+6);
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
static inline void setWordPos ( char *key , uint32_t wpos ) {
2013-08-02 13:12:24 -07:00
// truncate
wpos &= MAXWORDPOS;
if ( wpos & 0x01 ) key[3] |= 0x40;
else key[3] &= ~((unsigned char)0x40);
if ( wpos & 0x02 ) key[3] |= 0x80;
else key[3] &= ~((unsigned char)0x80);
wpos >>= 2;
key[4] = ((char *)&wpos)[0];
key[5] = ((char *)&wpos)[1];
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
static unsigned char getWordSpamRank ( const void *key ) {
2016-02-15 15:59:16 +01:00
return ((((const uint16_t *)key)[1]) >>6) & MAXWORDSPAMRANK;
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
static unsigned char getDiversityRank ( const void *key ) {
2016-02-15 15:59:16 +01:00
return ((((const unsigned char *)key)[2]) >>2) & MAXDIVERSITYRANK;
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
static unsigned char getIsSynonym ( const void *key ) {
2016-02-15 15:59:16 +01:00
return (((const key144_t *)key)->n1 ) & 0x03;
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
static unsigned char getIsHalfStopWikiBigram ( const void *key ) {
2016-02-15 15:59:16 +01:00
return ((const char *)key)[2] & 0x01;
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
static unsigned char getDensityRank ( const void *key ) {
2016-02-15 15:59:16 +01:00
return ((*(const uint16_t *)key) >> 11) & MAXDENSITYRANK;
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
static inline void setDensityRank ( char *key , unsigned char dr ) {
2013-08-02 13:12:24 -07:00
// shift up
dr <<= 3;
// clear out
key[1] &= 0x07;
// or in
key[1] |= dr;
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
static char isShardedByTermId ( const void *key ) {return ((const char *)key)[1] & 0x01; }
static void setShardedByTermIdBit ( void *key ) {
char *k = (char *)key;
k[1] |= 0x01;
2016-05-19 18:37:26 +02:00
}
static unsigned char getMultiplier ( const void *key ) {
2016-05-19 18:37:26 +02:00
return ((*(const uint16_t *)key) >> 4) & MAXMULTIPLIER; }
2013-08-02 13:12:24 -07:00
2014-10-30 13:36:39 -06:00
int64_t getTermFreq ( collnum_t collnum, int64_t termId ) ;
int64_t estimateLocalTermListSize(collnum_t collnum, int64_t termId);
2013-08-02 13:12:24 -07:00
2016-05-19 18:37:26 +02:00
Rdb *getRdb ( ) { return &m_rdb; }
2013-08-02 13:12:24 -07:00
// Rdb init variables
static inline int32_t getFixedDataSize() { return 0; }
static inline bool getUseHalfKeys() { return true; }
static inline char getKeySize() { return sizeof(posdbkey_t); }
2016-03-29 15:09:43 +02:00
private:
2013-08-02 13:12:24 -07:00
Rdb m_rdb;
};
2013-08-02 13:12:24 -07:00
// forward declaration; only needed for the extern declarations below
class RdbCache;
2013-08-02 13:12:24 -07:00
// global Posdb instances, defined outside this header
// NOTE(review): g_posdb2 is presumably the rebuild/secondary instance set up
// through Posdb::init2() -- confirm against the definition
extern Posdb g_posdb;
extern Posdb g_posdb2;
// caches defined outside this header
// NOTE(review): presumably used by getTermFreq() and
// estimateLocalTermListSize() -- confirm
extern RdbCache g_termFreqCache;
extern RdbCache g_termListSize;
2013-08-02 13:12:24 -07:00
// defined outside this header; behaviour is not visible from here
void reinitializeRankingSettings();
2016-03-08 22:14:30 +01:00
#endif // GB_POSDB_H