Files
privacore-open-source-searc…/RdbList.h

326 lines
12 KiB
C
Raw Normal View History

2013-08-02 13:12:24 -07:00
// Matt Wells, Copyright May 2001
2016-03-08 22:14:30 +01:00
#ifndef GB_RDBLIST_H
#define GB_RDBLIST_H
2013-08-02 13:12:24 -07:00
2016-06-28 11:01:47 +02:00
#include "Sanity.h"
2016-09-02 12:17:46 +02:00
#include "types.h"
#include "GbSignature.h"
2016-09-04 22:11:53 +02:00
#include "rdbid_t.h"
2016-09-02 12:17:46 +02:00
#include <stdint.h>
#include <string.h>
2016-06-20 19:29:10 +02:00
/**
*
2015-05-15 14:47:47 -07:00
* Core of the storage, this implements a list of <key><dataSize><data>.
*
* Additional documentation by Sam, May 15th 2015
* Compared to a standard vector, this class offers a few low level optimizations
* it seems, like compression of the keys when successive keys start with the same
* bits.
* The size of the key seems to be defined during creation (with a maximum of 28 bytes,
* defined in types.h).
* Sometimes, this type of list is used without any <data> (I guess in this case dataSize is 0)
* This is the case for the term-lists used in Msg2 for instance.
*
*
* Original documentation by Matt (2001?)
* RdbList is the heart of Rdb, Record DataBase
* an RdbList is a list of rdb records sorted by their keys.
* An rdb record is just a key with an optional dataSize and/or data
* All records in the RdbList must have keys in [m_startKey, m_endKey].
* TODO: speed up by using templates or by having 2-3 different RdbLists:
* 1 for dataLess Rdb's, 1 for fixedDataSize Rdb's, 1 for var dataSize
* m_useHalfKeys is only for IndexLists
* it is a compression method for key-only lists (data-less)
* it allows use of 6-byte keys if the last 12-byte key before has the same
* most significant 6 bytes
* this saves space and time (35% of indexdb can be cut)
* we cannot just override skipCurrentRecord(), etc. in IndexList because
* it would have to be a virtual function thing (ptr to a function) when
* called in RdbMap, Msg1, merge_r(), ... OR the callers would have
* to have a separate routine just for IndexLists
* for speed I opted to just add the m_useHalfKeys option to the RdbList
* class rather than have a virtual function or having to write lots of
* additional support routines for IndexLists
*/
2016-09-05 13:27:54 +02:00
2013-08-02 13:12:24 -07:00
class RdbList {
declare_signature
2016-09-02 11:00:59 +02:00
public:
2016-09-05 14:23:38 +02:00
RdbList();
~RdbList();
void destructor();
2013-08-02 13:12:24 -07:00
// sets m_listSize to 0, keeps any allocated buffer (m_alloc), however
2016-09-05 14:23:38 +02:00
void reset();
2013-08-02 13:12:24 -07:00
// like reset, but frees m_alloc/m_allocSize and resets all to 0
2016-09-05 14:23:38 +02:00
void freeList();
2013-08-02 13:12:24 -07:00
// . set it to this list
// . "list" is a serialized sequence of rdb records sorted by key
// . startKey/endKey specifies the list's range
// . there may, however, be some keys in the list outside of the range
// . if "ownData" is true we free "list" on our reset/destruction
2016-09-05 11:38:57 +02:00
void set(char *list, int32_t listSize, char *alloc, int32_t allocSize, const char *startKey, const char *endKey,
int32_t fixedDataSize, bool ownData, bool useHalfKeys, char keySize);
2013-08-02 13:12:24 -07:00
// like above but uses 0/maxKey for startKey/endKey
2016-09-05 11:38:57 +02:00
void set(char *list, int32_t listSize, char *alloc, int32_t allocSize,
int32_t fixedDataSize, bool ownData, bool useHalfKeys, char keySize = sizeof(key96_t));
2013-08-02 13:12:24 -07:00
2016-09-05 14:23:38 +02:00
// just set the start and end keys
void set(const char *startKey, const char *endKey);
2013-08-02 13:12:24 -07:00
2016-09-04 22:11:53 +02:00
void setFromPtr(char *p, int32_t psize, rdbid_t rdbId);
2014-02-04 16:15:27 -08:00
void stealFromOtherList(RdbList *other_list);
2016-09-05 14:23:38 +02:00
// these operate on the whole list
char *getList() { return m_list; }
void setList(char *list) { m_list = list; }
2013-08-02 13:12:24 -07:00
2016-09-05 14:23:38 +02:00
int32_t getListSize() const { return m_listSize; }
void setListSize(int32_t size) { m_listSize = size; }
2013-08-02 13:12:24 -07:00
2016-09-05 14:23:38 +02:00
const char *getStartKey() const { return m_startKey; }
void getStartKey(char *k) const { KEYSET(k, m_startKey, m_ks); }
2016-09-05 13:27:54 +02:00
void setStartKey(const char *startKey) { KEYSET(m_startKey, startKey, m_ks); }
2013-08-02 13:12:24 -07:00
2016-09-05 14:23:38 +02:00
const char *getEndKey() const { return m_endKey; }
void getEndKey(char *k) const { KEYSET(k, m_endKey, m_ks); }
2016-09-05 13:27:54 +02:00
void setEndKey(const char *endKey) { KEYSET(m_endKey, endKey, m_ks); }
2013-08-02 13:12:24 -07:00
2016-09-05 14:23:38 +02:00
/// @todo ALC why getListEnd does not return m_listEnd?
// getListEnd() recomputes the end from m_list + m_listSize on every call, whereas
// getListEndPtr() returns the stored m_listEnd member. setListEnd() writes only
// m_listEnd, so the two values differ whenever the member is not kept in sync
// with m_list/m_listSize.
char *getListEnd() { return m_list + m_listSize; }
char *getListEndPtr() { return m_listEnd; }
void setListEnd(char *listEnd) { m_listEnd = listEnd; }
2013-08-02 13:12:24 -07:00
2016-09-05 14:23:38 +02:00
char *getCurrentRec() { return m_listPtr; }
char *getListPtr() { return m_listPtr; }
void setListPtr(char *listPtr) { m_listPtr = listPtr; }
const char* getListPtrHi() const { return m_listPtrHi; }
void setListPtrHi(const char *listPtrHi) { m_listPtrHi = listPtrHi; }
const char* getListPtrLo() const { return m_listPtrLo; }
void setListPtrLo(const char *listPtrLo) { m_listPtrLo = listPtrLo; }
2016-09-05 14:23:38 +02:00
void resetListPtr();
2016-09-05 13:27:54 +02:00
2016-09-05 14:23:38 +02:00
// often these equal m_list/m_listSize, but the allocated chunk may encompass more than the list
char *getAlloc() { return m_alloc; }
void setAlloc(char *alloc) { m_alloc = alloc; }
2016-09-05 13:27:54 +02:00
2016-09-05 14:23:38 +02:00
int32_t getAllocSize() const { return m_allocSize; }
void setAllocSize(int32_t allocSize) { m_allocSize = allocSize; }
2013-08-02 13:12:24 -07:00
2016-09-12 16:49:39 +02:00
int32_t getFixedDataSize() const { return m_fixedDataSize; }
2016-09-05 14:23:38 +02:00
void setFixedDataSize(int32_t fixedDataSize) { m_fixedDataSize = fixedDataSize; }
2013-08-02 13:12:24 -07:00
2016-09-05 14:23:38 +02:00
// . merge_r() sets m_lastKey for the list it merges the others into
// . otherwise, this may be invalid
const char *getLastKey() const;
void getLastKey(char *k) {
if (!m_lastKeyIsValid) {
gbshutdownAbort(true);
}
KEYSET(k, getLastKey(), m_ks);
}
2013-08-02 13:12:24 -07:00
2016-09-05 14:23:38 +02:00
void setLastKey(const char *k);
2013-08-02 13:12:24 -07:00
2016-09-05 14:23:38 +02:00
// sometimes we don't have a valid m_lastKey because it is only
// set in calls to constrain(), merge_r() and indexMerge_r()
bool isLastKeyValid() const { return m_lastKeyIsValid; }
void setLastKeyIsValid(bool lastKeyIsValid) { m_lastKeyIsValid = lastKeyIsValid; }
2013-08-02 13:12:24 -07:00
2016-09-12 16:49:39 +02:00
bool getOwnData() const { return m_ownData; }
2016-09-05 14:23:38 +02:00
// if you don't want data to be freed on destruction then don't own it
void setOwnData(bool ownData) { m_ownData = ownData; }
2013-08-02 13:12:24 -07:00
2016-09-05 14:23:38 +02:00
bool getUseHalfKeys() const { return m_useHalfKeys; }
void setUseHalfKeys(bool use) { m_useHalfKeys = use; }
2013-08-02 13:12:24 -07:00
2016-09-05 14:23:38 +02:00
char getKeySize() const { return m_ks; }
void setKeySize(char ks) { m_ks = ks; }
2013-08-02 13:12:24 -07:00
// will scan through each record if record size is variable
2016-09-05 14:23:38 +02:00
int32_t getNumRecs();
2013-08-02 13:12:24 -07:00
2016-09-05 14:23:38 +02:00
int32_t getCurrentRecSize() const { return getRecSize(m_listPtr); }
2013-08-02 13:12:24 -07:00
2016-09-05 14:23:38 +02:00
// . returns, by value, the key of the record the cursor (m_listPtr) is on
// . NOTE(review): the destination is a fixed-size key96_t and getKey() is not
//   told its size; presumably only meant for lists whose key size is
//   sizeof(key96_t) -- confirm against getKey()
key96_t getCurrentKey() const {
	key96_t currentKey;
	char *dst = reinterpret_cast<char *>(&currentKey);
	getKey(m_listPtr, dst);
	return currentKey;
}
2013-08-02 13:12:24 -07:00
2016-09-05 14:23:38 +02:00
void getCurrentKey(void *key) const { getKey(m_listPtr, (char *)key); }
2017-03-27 14:40:37 +02:00
char *getCurrentData() { return getData(m_listPtr); }
const char *getCurrentData() const { return getData(m_listPtr); }
2016-09-05 14:23:38 +02:00
int32_t getCurrentDataSize() const { return getDataSize(m_listPtr); }
2013-08-02 13:12:24 -07:00
// . skip over the current record and point to the next one
// . returns false if we skipped into a black hole (end of list)
2016-09-05 13:27:54 +02:00
bool skipCurrentRecord() {
return skipCurrentRec(getRecSize(m_listPtr));
}
2013-08-02 13:12:24 -07:00
2016-09-05 14:23:38 +02:00
bool isExhausted() const { return (m_listPtr >= m_listEnd); }
2013-08-02 13:12:24 -07:00
// are there any records in the list?
2016-09-05 14:23:38 +02:00
bool isEmpty() const { return (m_listSize == 0); }
2013-08-02 13:12:24 -07:00
// . add this record to the end of the list, @ m_list+m_listSize
// . returns false and sets errno on error
// . grows list (m_allocSize) if we need more space
2016-09-05 14:23:38 +02:00
bool addRecord(const char *key, int32_t dataSize, const char *data, bool bitch = true);
2013-08-02 13:12:24 -07:00
// . constrain a list to [startKey,endKey]
// . returns false and sets g_errno on error
// . only called by Msg3.cpp for 1 list reads to avoid memmov()'ing
// and malloc()'ing
// . may change m_list and/or m_listSize
bool constrain(const char *startKey, char *endKey, int32_t minRecSizes,
2016-09-04 22:11:53 +02:00
int32_t hintOffset, const char *hintKey, rdbid_t rdbId, const char *filename);
2016-08-02 16:17:43 +02:00
2013-08-02 13:12:24 -07:00
// . this MUST be called before calling merge_r()
// . will alloc enough space for m_listSize + sizes of "lists"
2016-09-05 13:27:54 +02:00
bool prepareForMerge(RdbList **lists, int32_t numLists, int32_t minRecSizes = -1);
2013-08-02 13:12:24 -07:00
// . merge the lists into this list
// . set our startKey/endKey to "startKey"/"endKey"
// . exclude any records from lists not in that range
2016-09-05 14:23:38 +02:00
void merge_r(RdbList **lists, int32_t numLists, const char *startKey, const char *endKey, int32_t minRecSizes,
bool removeNegRecs, rdbid_t rdbId, collnum_t collNum, int32_t startFileNum, bool isRealMerge);
2013-08-02 13:12:24 -07:00
2016-09-05 14:23:38 +02:00
bool growList(int32_t newSize);
2013-08-02 13:12:24 -07:00
// . check to see if keys in order
// . logs any problems
// . sleeps if any problems encountered
2016-09-04 22:11:53 +02:00
bool checkList_r(bool abortOnProblem = true, rdbid_t rdbId = RDB_NONE);
2013-08-02 13:12:24 -07:00
// . removes records whose keys aren't in proper range (corruption)
// . returns false and sets errno on error/problem
2016-09-05 14:23:38 +02:00
bool removeBadData_r();
2013-08-02 13:12:24 -07:00
// . print out the list (uses log())
2016-09-05 17:03:03 +02:00
int printList();
int printPosdbList();
2013-08-02 13:12:24 -07:00
2016-09-05 14:23:38 +02:00
// . tests the half bit (0x02) in the first byte of a record
// . on half-key lists getRecSize() treats a record with this bit set as a
//   compressed key of m_ks - 6 bytes, and one without it as a full m_ks key
// . used for index lists (indexdb); see Indexdb.h for the key formats
static bool isHalfBitOn(const char *rec) {
	return (rec[0] & 0x02) != 0;
}
2013-08-02 13:12:24 -07:00
2016-09-05 13:38:46 +02:00
private:
2016-09-05 14:23:38 +02:00
// . returns the size in bytes of the record that starts at "rec"
// . posdb lists (m_ks == 18): bits 0x04/0x02 of the first byte select a
//   6, 12 or full 18 byte key
// . half-key lists: a set half bit means the key is stored in m_ks - 6 bytes
// . otherwise the record is the key plus its data:
//     m_fixedDataSize == 0 -> key only
//     m_fixedDataSize  > 0 -> key + m_fixedDataSize bytes
//     variable size        -> key + 4-byte dataSize field + dataSize bytes
// . negative keys (low bit of the first byte clear) never carry data
int32_t getRecSize(const char *rec) const {
	// posdb?
	if (m_ks == 18) {
		if (rec[0] & 0x04) return 6;
		if (rec[0] & 0x02) return 12;
		return 18;
	}
	if (m_useHalfKeys) {
		return isHalfBitOn(rec) ? m_ks - 6 : m_ks;
	}
	if (m_fixedDataSize == 0) return m_ks;
	// negative keys always have no datasize entry
	if ((rec[0] & 0x01) == 0) return m_ks;
	if (m_fixedDataSize > 0) return m_ks + m_fixedDataSize;
	// variable-sized data: the dataSize field sits right after the key.
	// rec + m_ks carries no alignment guarantee for an int32_t, so copy the
	// bytes out instead of dereferencing a cast pointer (which also dropped
	// the const qualifier of "rec").
	int32_t dataSize;
	memcpy(&dataSize, rec + m_ks, sizeof(dataSize));
	return dataSize + m_ks + 4;
}
// . advances the cursor (m_listPtr) by "recSize" bytes
// . returns false if that puts the cursor at or past m_listEnd, true otherwise
// . when a record is still available, refreshes m_listPtrHi/m_listPtrLo from
//   the compression bits of the record now under the cursor
// . this is specially-made for RdbMap's processing of IndexLists
bool skipCurrentRec(int32_t recSize) {
	m_listPtr += recSize;
	if (m_listPtr >= m_listEnd) {
		return false;
	}

	if (m_ks == 18) {
		const char flags = m_listPtr[0];
		// a 6 byte key (0x04 set) changes neither listPtrHi nor listPtrLo
		if (!(flags & 0x04)) {
			// 12 byte and full 18 byte keys both move the Lo pointer
			m_listPtrLo = m_listPtr + 6;
			// only a full 18 byte key (0x02 clear) moves the Hi pointer too
			if (!(flags & 0x02)) {
				m_listPtrHi = m_listPtr + 12;
			}
		}
		return true;
	}

	// half-key lists: a full key (half bit off) re-anchors the Hi pointer
	if (m_useHalfKeys && !isHalfBitOn(m_listPtr)) {
		m_listPtrHi = m_listPtr + (m_ks - 6);
	}
	return true;
}
2016-09-05 14:23:38 +02:00
void getKey(const char *rec, char *key) const;
2013-08-02 13:12:24 -07:00
2017-03-27 14:40:37 +02:00
char *getData(char *rec) const;
// const-pointer overload: forwards to getData(char *) and hands the result
// back as a pointer-to-const
const char *getData(const char *rec) const {
	char *mutableRec = const_cast<char *>(rec);
	return getData(mutableRec);
}
2016-09-05 14:23:38 +02:00
int32_t getDataSize(const char *rec) const;
2013-08-02 13:12:24 -07:00
bool posdbConstrain(const char *startKey, char *endKey, int32_t minRecSizes,
int32_t hintOffset, const char *hintKey, const char *filename);
bool posdbMerge_r(RdbList **lists, int32_t numLists, const char *startKey, const char *endKey, int32_t minRecSizes,
rdbid_t rdbId, bool removeNegKeys, bool useIndexFile, collnum_t collNum, int32_t startFileIndex, bool isRealMerge);
2013-08-02 13:12:24 -07:00
// the unaltered raw list. keys may be outside of [m_startKey,m_endKey]
2016-09-05 13:27:54 +02:00
char *m_list;
int32_t m_listSize; // how many bytes we're using for a list
2013-08-02 13:12:24 -07:00
// the list contains all the keys in [m_startKey,m_endKey] so make
// sure if the list is truncated by minrecsizes that you decrease
// m_endKey so this is still true. seems like zak did not do that
// for rdbbuckets code.
2016-09-05 14:23:38 +02:00
char m_startKey[MAX_KEY_BYTES];
2016-09-05 13:27:54 +02:00
char m_endKey[MAX_KEY_BYTES];
2013-08-02 13:12:24 -07:00
2016-09-05 13:27:54 +02:00
char *m_listEnd; // = m_list + m_listSize
char *m_listPtr; // points to current record in list
2013-08-02 13:12:24 -07:00
2016-09-05 14:23:38 +02:00
// . this points to the most significant 6 bytes of a key
// . only valid if m_useHalfKeys is true
// . points to start of termId (for 6-bytes/12-bytes posdbkey)
2016-09-05 14:23:38 +02:00
const char *m_listPtrHi;
// for the secondary compression bit for posdb
// points to start of langid (for 6-bytes posdbkey)
2016-09-05 14:23:38 +02:00
const char *m_listPtrLo;
2016-09-05 13:27:54 +02:00
int32_t m_allocSize; // how many bytes we've allocated at m_alloc
char *m_alloc; // start of chunk that was allocated
2013-08-02 13:12:24 -07:00
// m_fixedDataSize is -1 if records are variable length,
// 0 for data-less records (keys only) and N for records of dataSize N
2016-09-05 14:23:38 +02:00
int32_t m_fixedDataSize;
2013-08-02 13:12:24 -07:00
// this is set to the last key in this list if we were made by merge()
2016-09-05 13:27:54 +02:00
char m_lastKey[MAX_KEY_BYTES];
bool m_lastKeyIsValid;
2013-08-02 13:12:24 -07:00
// max list rec sizes to merge as set by prepareForMerge()
2016-09-05 13:27:54 +02:00
int32_t m_mergeMinListSize;
2013-08-02 13:12:24 -07:00
// do we own the list data (m_list)? if so free it on destruction
2016-09-05 13:27:54 +02:00
bool m_ownData;
2013-08-02 13:12:24 -07:00
// are keys compressed? only used for index lists right now
2016-09-05 13:27:54 +02:00
bool m_useHalfKeys;
2013-08-02 13:12:24 -07:00
// keysize, usually 12, for 12 bytes. can be 16 for date index (datedb)
2016-09-05 13:27:54 +02:00
char m_ks;
2013-08-02 13:12:24 -07:00
};
2016-03-08 22:14:30 +01:00
#endif // GB_RDBLIST_H