// Copyright Sep 2000 Matt Wells // . an array of key/pageOffset pairs indexed by disk page number // . slots in Slot (on disk) must be sorted by key from smallest to largest // . used for quick btree lookup of a key to find the page where that slot // resides on disk // . disk page size is system's disk page size (8k) for best performance // . TODO: split up map into several arrays so like the first 1meg of // map can be easily freed like during a merge // . TODO: use a getKey(),getOffset(),getDataSize() to make this easier to do #ifndef GB_RDBMAP_H #define GB_RDBMAP_H #include <atomic> #include "BigFile.h" #include "RdbList.h" #include "Sanity.h" // . this can be increased to provide greater disk coverage but it will // increase delays because each seek will have to read more // . 8k of disk corresponds to 18 bytes of map // . 8megs of disk needs 18k of map (w/ dataSize) // . 8megs of disk needs 14k of map (w/o dataSize) // . a 32gig index would need 14megs*4 = 56 megs of map!!! // . then merging would mean we'd need twice that, 112 megs of map in memory // unless we dumped the map to disk periodically // . 256 megs per machine would be excellent, but costly? // . we need 66 megabytes of mem for every 80-gigs of index (actually 40gigs // considering half the space is for merging) // . PAGE_SIZE is often called the block size // . a page or block is read in "IDE Block Mode" by the drive // . it's the amount of disk that can be read with one i/o (interrupt) // . try to make sure PAGE_SIZE matches your "multiple sector count" // . use hdparm to configure (hdparm -m16 /dev/hda) will set it to 8k since // each sector is 512bytes // . hdparm -u1 -X66 -d1 -c3 -m16 /dev/hda is pretty agressive // . actually "block size" in context of the file system can be 1024,... 4096 // on ext2fs ... set it as high as possible since we have very large files // and want to avoid external fragmentation for the fastest reading/writing // . we now set it to 16k to make map smaller in memory // . NOTE: 80gigs of disk is 80,000,000,000 bytes NOT 80*1024*1024*1024 // . mapping 80 gigs should now take 80G/(16k) = 4.8 million pages // . 4.8 million pages at 16 bytes a page is 74.5 megs of memory // . mapping 320 gigs, at 8k pages is 686 megabytes of RAM (w/ crc) // . mapping 320 gigs, at 16k pages is half that // . mapping 640 gigs, at 16k pages is 686 megabytes of RAM (w/ crc) // . mapping 640 gigs, at 32k pages is 343 megabytes of RAM (w/ crc) #ifndef PRIVACORE_TEST_VERSION #define GB_INDEXDB_PAGE_SIZE (32*1024) #else #define GB_INDEXDB_PAGE_SIZE (1*1024) #endif #define GB_TFNDB_PAGE_SIZE ( 1*1024) //#define PAGE_SIZE (16*1024) //#define PAGE_SIZE (8*1024) // . I define a segment to be a group of pages // . I use 2k pages per segment // . each page represents m_pageSize bytes on disk // . the BigFile's MAX_PART_SIZE should be evenly divisible by PAGES_PER_SEG // . that way, when a part file is removed we can remove an even amount of // segments (we chop the leading Files of a BigFile during merges) #ifndef PRIVACORE_TEST_VERSION #define PAGES_PER_SEGMENT (2*1024) #else #define PAGES_PER_SEGMENT (1*1024) #endif #define PAGES_PER_SEG (PAGES_PER_SEGMENT) class RdbMap { public: RdbMap (); ~RdbMap (); // . does not write data to disk // . frees all void reset ( ); // set the filename, and if it's fixed data size or not void set ( const char *dir, const char *mapFilename, //int32_t fixedDataSize , bool useHalfKeys ); int32_t fixedDataSize , bool useHalfKeys , char keySize , int32_t pageSize ); bool rename ( const char *newMapFilename ) { return m_file.rename ( newMapFilename, NULL); } bool rename ( const char *newMapFilename, const char *newDir, void (* callback)(void *state) , void *state) { return m_file.rename(newMapFilename, newDir, callback, state); } const char *getFilename() const { return m_file.getFilename(); } BigFile *getFile ( ) { return &m_file; } // . writes the map to disk if any slot was added // . returns false if File::close() returns false // . should free up all mem // . resets m_numPages and m_maxNumPages to 0 // . a file's version of the popular reset() function // . if it's urgent we do not call mfree() bool close ( bool urgent ); // . we store the fixed dataSize in the map file // . if it's -1 then each record's data is of variable size int32_t getFixedDataSize() { return m_fixedDataSize; } void reduceMemFootPrint(); // . this is called automatically when close() is called // . however, we may wish to call it externally to ensure no data loss // . return false if any write failes // . returns true when done dumping m_keys and m_offsets to file // . write out the m_keys and m_offsets arrays // . this is totally MTUnsafe // . don't be calling addRecord with this is dumping // . flushes when done bool writeMap ( bool allDone ); bool writeMap2 ( ); int64_t writeSegment ( int32_t segment , int64_t offset ); // . calls addRecord() for each record in the list // . returns false and sets errno on error // . TODO: implement transactional rollback feature bool addList ( RdbList *list ); bool prealloc ( RdbList *list ); // get the number of non-deleted records in the data file we map int64_t getNumPositiveRecs() const { return m_numPositiveRecs; } // get the number of "delete" records in the data file we map int64_t getNumNegativeRecs() const { return m_numNegativeRecs; } // total int64_t getNumRecs() const { return m_numPositiveRecs + m_numNegativeRecs; } // get the size of the file we are mapping int64_t getFileSize () const { return m_offset; } int64_t findNextFullPosdbKeyOffset ( char *buf, int32_t bufSize ) ; // . gets total size of all recs in this page range // . if subtract is true we subtract the sizes of pages that begin // with a delete key (low bit is clear) int64_t getRecSizes ( int32_t startPage , int32_t endPage , bool subtract) const; // like above, but recSizes is guaranteed to be in [startKey,endKey] int64_t getMinRecSizes ( int32_t sp , int32_t ep , const char *startKey , const char *endKey , bool subtract) const; // like above, but sets an upper bound for recs in [startKey,endKey] int64_t getMaxRecSizes ( int32_t sp , int32_t ep , const char *startKey , const char *endKey , bool subtract) const; // get a key range from a page range void getKeyRange ( int32_t startPage , int32_t endPage , char *minKey , char *maxKey ); // . get a page range from a key range // . returns false if no records exist in that key range // . maxKey will be sampled under "oldTruncationLimit" so you // can increase the trunc limit w/o messing up Indexdb::getTermFreq() bool getPageRange ( const char *startKey, const char *endKey, int32_t *startPage , int32_t *endPage , char *maxKey , int64_t oldTruncationLimit = -1) const; // get the ending page so that [startPage,endPage] has ALL the recs // whose keys are in [startKey,endKey] int32_t getEndPage(int32_t startPage, const char *endKey) const; // . offset of first key wholly on page # "page" // . return length of the whole mapped file if "page" > m_numPages // . use m_offset as the size of the file that we're mapping int64_t getAbsoluteOffset(int32_t page) const; // . the offset of a page after "page" that is a different key // . returns m_offset if page >= m_numPages int64_t getNextAbsoluteOffset(int32_t page) const; //char *getLastKey ( ) { return m_lastKey; } void getLastKey(char *key) const { KEYSET(key,m_lastKey,m_ks); } // . these functions operate on one page // . get the first key wholly on page # "page" // . if page >= m_numPages use the lastKey in the file void getKey(int32_t page, char *k) const { if ( page >= m_numPages ) {KEYSET(k,m_lastKey,m_ks);return;} //return m_keys[page/PAGES_PER_SEG][page%PAGES_PER_SEG]; KEYSET(k,&m_keys[page/PAGES_PER_SEG][(page%PAGES_PER_SEG)*m_ks],m_ks); return; } char *getKeyPtr(int32_t page) { if ( page >= m_numPages ) return m_lastKey; return &m_keys[page/PAGES_PER_SEG][(page%PAGES_PER_SEG)*m_ks]; } const char *getKeyPtr(int32_t page) const { return const_cast<RdbMap*>(this)->getKeyPtr(page); } int16_t getOffset(int32_t page) const { if ( page >= m_numPages ) { log(LOG_LOGIC,"RdbMap::getOffset: bad engineer"); return 0; } return m_offsets [page/PAGES_PER_SEG][page%PAGES_PER_SEG]; } void setKey ( int32_t page, const char *k ) { //#ifdef GBSANITYCHECK if ( page >= m_maxNumPages ) { log(LOG_LOGIC,"RdbMap::setKey: bad engineer"); gbshutdownAbort(true); } //#endif KEYSET(&m_keys[page/PAGES_PER_SEG][(page%PAGES_PER_SEG)*m_ks], k,m_ks); } void setOffset ( int32_t page , int16_t offset ) { m_offsets[page/PAGES_PER_SEG][page%PAGES_PER_SEG] = offset;} // . returns true on success // . returns false on i/o error. // . calls allocMap() to get memory for m_keys/m_offsets // . The format of the map on disk is described in Map.h // . sets "m_numPages", "m_keys", and "m_offsets". // . reads the keys and offsets into buffers allocated during open(). bool readMap ( BigFile *dataFile ); bool readMap2 ( ); int64_t readSegment ( int32_t segment, int64_t offset, int32_t fileSize); // due to disk corruption keys or offsets can be out of order in map bool verifyMap ( BigFile *dataFile ); bool verifyMap2 ( ); bool unlink ( ) { return m_file.unlink ( ); } bool unlink ( void (* callback)(void *state) , void *state ) { return m_file.unlink ( callback , state ); } int32_t getNumPages() const { return m_numPages; } // . return first page #, "N", to read to get the record w/ this key // if it exists // . if m_keys[N] < startKey then m_keys[N+1] is > startKey // . if m_keys[N] > startKey then all keys before m_keys[N] in the file // are strictly less than "startKey" and "startKey" does not exist // . if m_keys[N] > startKey then m_keys[N-1] spans multiple pages so // that the key immediately after it on disk is in fact, m_keys[N] int32_t getPage(const char *startKey) const; bool addSegmentPtr ( int32_t n ) ; bool addSegment ( ) ; // . remove and bury (shift over) all segments below the one that // contains page # "pageNum" // . used by RdbMerge when unlinking part files // . returns false and sets errno on error // . the first "fileSize" bytes of the BigFile was chopped off // . we must remove our segments bool chopHead (int32_t fileSize ); // how much mem is being used by this map? int64_t getMemAllocated() const; // . attempts to auto-generate from data file, f // . returns false and sets g_errno on error bool generateMap ( BigFile *f ) ; // . add a slot to the map // . returns false if map size would be exceed by adding this slot bool addRecord ( char *key, char *rec , int32_t recSize ); bool truncateFile ( BigFile *f ) ; void printMap (); private: // the map file BigFile m_file; // . we divide the map up into segments now // . this facilitates merges so one map can shrink while another grows // . these 3 arrays define the map // . see explanation at top of this file for map description // . IMPORTANT: if growing m_pageSize might need to change m_offsets // from int16_t to int32_t char **m_keys; int32_t m_numSegmentPtrs; int16_t **m_offsets; int32_t m_numSegmentOffs; bool m_reducedMem; // number of valid pages in the map. int32_t m_numPages; // . size of m_keys, m_offsets arrays // . not all slots are used, however // . this is sum of all pages in all segments int32_t m_maxNumPages; // each segment holds PAGES_PER_SEGMENT pages of info int32_t m_numSegments; // is the rdb file's dataSize fixed? -1 means it's not. int32_t m_fixedDataSize; // . to keep track of disk offsets of added records // . if this is > 0 we know a key was added to map so we should call // writeMap() on close or destroy // . NOTE: also used as the file size of the file we're mapping int64_t m_offset; long m_newPagesPerSegment; // we keep global tallies on the number of non-deleted records // and deleted records std::atomic<int64_t> m_numPositiveRecs; std::atomic<int64_t> m_numNegativeRecs; // . the last key in the file itself // . getKey(pageNum) returns this when pageNum == m_numPages // . used by Msg3::getSmallestEndKey() char m_lastKey[MAX_KEY_BYTES]; // when close is called, must we write the map? bool m_needToWrite; // when a BigFile gets chopped, keep up a start offset for it int64_t m_fileStartOffset; // are we mapping a data file that supports 6-byte keys? bool m_useHalfKeys; char m_ks; int32_t m_pageSize; int32_t m_pageSizeBits; int64_t m_badKeys ; bool m_needVerify ; }; #endif // GB_RDBMAP_H