privacore-open-source-searc…/RdbBuckets.h

// Zak Betz,  Copyright, Jun 2008

// A sorted list of sorted lists for storing rdb keys.  Faster and more
// cache friendly than RdbTree.  Optimized for batched adds, amortized O(1)
// add operation, O(log n) for retrival, ranged getList for k keys is
// O(log(n) + k) where as RdbTree is O(k * log(n)).
// Memory is allocated and used on a on demand basis rather than all up
// front as with RdbTree, so memory usage is much lower most of the time.

// Collections are handled as a linked list, each RdbBuckets has a nextColl
// pointer.  The front bucket acts as the gatekeeper for all of the
// other buckets, Only it's values for needsSave and isWritable are
// significant

//when selecting bucketnum and also when deduping, use KEYCMPNEGEQ which
//will mask off the delbit, that way pos and neg keys will be in the same
//bucket and only the newest key will survive.  When getting or deleting
//a list, use KEYCMP within a bucket and use KEYCMPNEGEQ to select the
//bucket nums.  This is because iterators in rdb dump get a list, then
//add 1 to a key and get the next list and adding 1 to a pos key will get
//the negative one.

#ifndef GB_RDBBUCKETS_H
#define GB_RDBBUCKETS_H

#include <cstdint>
#include <functional>
#include <atomic>
#include "rdbid_t.h"
#include "collnum_t.h"
#include "types.h"
#include "GbMutex.h"
#include "JobScheduler.h"

class BigFile;
class RdbList;
class RdbTree;
class RdbBucket;

class RdbBuckets {
	friend class RdbBucket;

public:
	RdbBuckets( );
	~RdbBuckets( );
	void clear();
	void reset();

	bool set(int32_t fixedDataSize, int32_t maxMem, const char *allocName, rdbid_t rdbId, const char *dbname,
	         char keySize);

	bool addNode(collnum_t collnum, const char *key, const char *data, int32_t dataSize);

	bool getList(collnum_t collnum, const char *startKey, const char *endKey, int32_t minRecSizes, RdbList *list,
	             int32_t *numPosRecs, int32_t *numNegRecs, bool useHalfKeys) const;

	bool deleteNode(collnum_t collnum, const char *key);

	int64_t estimateListSize(collnum_t collnum, const char *startKey, const char *endKey, char *minKey, char *maxKey) const;

	bool collExists(collnum_t coll) const;

	// don't need to lock. only set in RdbBuckets::set
	int32_t getRecSize() const { return m_recSize; }

	/// @todo ALC verify saving/writable logic is okay with multithread
	bool isSaving() const;
	bool needsSave() const;
	void setNeedsSave(bool s);

	// don't need to lock. only set in RdbBuckets::set
	int32_t getMaxMem() const { return m_maxMem; }

	int32_t getMemAllocated() const;
	bool needsDump() const;
	bool hasRoom(int32_t numRecs) const;

	int32_t getNumKeys() const;
	int32_t getMemOccupied() const;

	int32_t getNumNegativeKeys() const;
	int32_t getNumPositiveKeys() const;

	void cleanBuckets();
	bool delColl(collnum_t collnum);

	//just for this collection
	int32_t getNumKeys(collnum_t collnum) const;

	//DEBUG
	void verifyIntegrity();
	int32_t addTree(RdbTree *rt);
	void printBuckets(std::function<void(const char *, int32_t)> print_fn = nullptr);
	void printBucketsStartEnd();

	bool testAndRepair();

	//Save/Load/Dump
	bool fastSave(const char *dir, bool useThread, void *state, void (*callback)(void *state));
	bool loadBuckets(const char *dbname);

private:
	GbMutex& getLock() { return m_mtx; }

	static void saveWrapper(void *state);
	static void saveDoneWrapper(void *state, job_exit_t exit_type);

	void reset_unlocked();

	bool resizeTable_unlocked(int32_t numNeeded);
	bool selfTest_unlocked(bool thorough, bool core);
	bool repair_unlocked();

	bool addNode_unlocked(collnum_t collnum, const char *key, const char *data, int32_t dataSize);

	bool getList_unlocked(collnum_t collnum, const char *startKey, const char *endKey, int32_t minRecSizes, RdbList *list,
	                      int32_t *numPosRecs, int32_t *numNegRecs, bool useHalfKeys) const;

	bool deleteList_unlocked(collnum_t collnum, RdbList *list);

	bool delColl_unlocked(collnum_t collnum);

	void addBucket_unlocked(RdbBucket *newBucket, int32_t i);
	int32_t getBucketNum_unlocked(collnum_t collnum, const char *key) const;
	char bucketCmp_unlocked(collnum_t acoll, const char *akey, const RdbBucket *b) const;

	//syntactic sugar
	RdbBucket* bucketFactory_unlocked();
	void updateNumRecs_unlocked(int32_t n, int32_t bytes, int32_t numNeg);
	int32_t getNumKeys_unlocked() const;

	bool fastSave_unlocked();
	int64_t fastSaveColl_unlocked(int fd);
	bool fastLoad_unlocked(BigFile *f, const char *dbname);
	int64_t fastLoadColl_unlocked(BigFile *f, const char *dbname);

	mutable GbMutex m_mtx;
	RdbBucket **m_buckets;
	RdbBucket *m_bucketsSpace;
	char *m_masterPtr;
	int32_t m_masterSize;
	int32_t m_firstOpenSlot;	//first slot in m_bucketSpace that is available (never-used or empty)
	int32_t m_numBuckets;		//number of used buckets
	int32_t m_maxBuckets;		//current number of (pre-)allocated buckets
	uint8_t m_ks;
	int32_t m_fixedDataSize;
	int32_t m_recSize;
	int32_t m_numKeysApprox;//includes dups
	int32_t m_numNegKeys;
	int32_t m_maxMem;
	int32_t m_maxBucketsCapacity;	//max number of buckets given the memory limit
	int32_t m_dataMemOccupied;

	rdbid_t m_rdbId;
	const char *m_dbname;
	char *m_swapBuf;
	char *m_sortBuf;
	int32_t m_sortBufSize;

	bool m_repairMode;

	std::atomic<bool> m_isSaving;
	// true if buckets was modified and needs to be saved
	bool m_needsSave;

	const char *m_dir;
	void *m_state;

	void (*m_callback)(void *state);

	int64_t m_bytesWritten;

	int32_t m_errno;
	const char *m_allocName;
};

#endif // GB_RDBBUCKETS_H