Files

149 lines
4.2 KiB
C
Raw Permalink Normal View History

2013-08-02 13:12:24 -07:00
// Matt Wells, copyright Jun 2001
// . db of XmlDocs
2016-03-08 22:14:30 +01:00
#ifndef GB_TITLEDB_H
#define GB_TITLEDB_H
2013-08-02 13:12:24 -07:00
// width of a docId in bits (2^38 is roughly 275 billion possible docIds)
#define NUMDOCIDBITS 38
// low NUMDOCIDBITS bits set; doubles as the largest representable docId
#define DOCID_MASK ((1LL << NUMDOCIDBITS) - 1)
#define MAX_DOCID DOCID_MASK
#include "TitleRecVersion.h"
2013-08-02 13:12:24 -07:00
#include "Rdb.h"
#include "Url.h"
// new key format (96 bits total):
// . <docId>     - 38 bits
// . <urlHash48> - 48 bits (used when looking up by url and not docid)
// . <reserved>  -  9 bits
// . <delBit>    -  1 bit
//
// bit layout, most significant bit first:
// dddddddd dddddddd dddddddd dddddddd    d: docId
// dddddduu uuuuuuuu uuuuuuuu uuuuuuuu    u: urlHash48
// uuuuuuuu uuuuuuuu uuuuuuxx xxxxxxxD    x: reserved, D: delBit
2013-08-02 13:12:24 -07:00
class Titledb {
public:
2013-08-02 13:12:24 -07:00
// reset rdb
void reset();
2016-11-28 14:56:03 +01:00
bool verify(const char *coll);
2013-08-02 13:12:24 -07:00
// init m_rdb
bool init ();
// init secondary/rebuild titledb
2014-11-10 14:45:11 -08:00
bool init2 ( int32_t treeMem ) ;
2013-08-02 13:12:24 -07:00
2016-11-28 14:56:03 +01:00
Rdb *getRdb() { return &m_rdb; }
const Rdb *getRdb() const { return &m_rdb; }
// . this is an estimate of the number of docs in the WHOLE db network
// . we assume each group/cluster has about the same # of docs as us
int64_t estimateGlobalNumDocs() const {
return m_rdb.getNumTotalRecs() * (int64_t)g_hostdb.m_numShards;
}
2013-08-02 13:12:24 -07:00
// . get the probable docId from a url/coll
// . it's "probable" because it may not be the actual docId because
// in the case of a collision we pick a nearby docId that is
// different but guaranteed to be in the same group/cluster, so you
// can be assured the top 32 bits of the docId will be unchanged
static uint64_t getProbableDocId(const Url *url) {
return getProbableDocId(url->getUrl(), url->getDomain(), url->getDomainLen());
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
// a different way to do it
static uint64_t getProbableDocId(const char *url) {
2013-08-02 13:12:24 -07:00
Url u;
u.set( url );
return getProbableDocId(&u);
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
// a different way to do it
static uint64_t getProbableDocId(const char *url, const char *dom, int32_t domLen) {
uint64_t probableDocId = hash64b(url,0) &
2013-08-02 13:12:24 -07:00
DOCID_MASK;
// clear bits 6-13 because we want to put the domain hash there
// dddddddd dddddddd ddhhhhhh hhdddddd
2013-08-02 13:12:24 -07:00
probableDocId &= 0xffffffffffffc03fULL;
2014-11-10 14:45:11 -08:00
uint32_t h = hash8(dom,domLen);
2013-08-02 13:12:24 -07:00
//shift the hash by 6
h <<= 6;
// OR in the hash
probableDocId |= h;
return probableDocId;
2016-05-19 18:37:26 +02:00
}
2013-08-02 13:12:24 -07:00
// turn off the last 6 bits
static uint64_t getFirstProbableDocId(int64_t d) {
return d & 0xffffffffffffffc0ULL;
}
2013-08-02 13:12:24 -07:00
// turn on the last 6 bits for the end docId
static uint64_t getLastProbableDocId(int64_t d) {
return d | 0x000000000000003fULL;
}
2013-08-02 13:12:24 -07:00
// . the top NUMDOCIDBITs of "key" are the docId
// . we use the top X bits of the keys to partition the records
// . using the top bits to partition allows us to keep keys that
// are near each other (euclidean metric) in the same partition
static int64_t getDocIdFromKey(const key96_t *key) {
uint64_t docId = ((uint64_t)key->n1) << (NUMDOCIDBITS - 32);
docId |= key->n0 >> (64 - (NUMDOCIDBITS - 32));
2013-08-02 13:12:24 -07:00
return docId;
2016-05-19 18:37:26 +02:00
}
static int64_t getDocId(const key96_t *key) { return getDocIdFromKey(key); }
2013-08-02 13:12:24 -07:00
static uint8_t getDomHash8FromDocId (int64_t d) {
return (d & ~0xffffffffffffc03fULL) >> 6;
}
2013-08-02 13:12:24 -07:00
static int64_t getUrlHash48 ( key96_t *k ) {
return ((k->n0 >> 10) & 0x0000ffffffffffffLL);
}
2013-08-02 13:12:24 -07:00
// does this key/docId/url have it's titleRec stored locally?
static bool isLocal(int64_t docId);
2013-08-02 13:12:24 -07:00
2016-11-28 14:56:03 +01:00
static bool isLocal(const Url *url) {
return isLocal(getProbableDocId(url));
}
2013-08-02 13:12:24 -07:00
// . make the key of a TitleRec from a docId
// . remember to set the low bit so it's not a delete
// . hi bits are set in the key
static key96_t makeKey(int64_t docId, int64_t uh48, bool isDel);
2013-08-02 13:12:24 -07:00
static key96_t makeFirstKey(int64_t docId) {
return makeKey(docId, 0, true);
}
2013-08-02 13:12:24 -07:00
static key96_t makeLastKey(int64_t docId) {
return makeKey(docId, 0xffffffffffffLL, false);
}
2013-08-02 13:12:24 -07:00
static void printKey(const char *key);
static void validateSerializedRecord(const char *rec, int32_t recSize);
// Rdb init variables
static inline int32_t getFixedDataSize() { return -1; }
static inline bool getUseHalfKeys() { return false; }
static inline char getKeySize() { return 12; }
2016-09-12 15:47:09 +02:00
private:
2013-08-02 13:12:24 -07:00
// holds binary format title entries
Rdb m_rdb;
};
// NOTE(review): defined elsewhere; presumably filters the titledb records
// held in "list" in place -- confirm against the definition
void filterTitledbList(RdbList *list);
2013-08-02 13:12:24 -07:00
extern class Titledb g_titledb;
extern class Titledb g_titledb2;
2016-03-08 22:14:30 +01:00
#endif // GB_TITLEDB_H