Try to fix core dumps; not sure how yet.

Memory is getting corrupted.
This commit is contained in:
Matt Wells
2015-08-22 08:52:28 -07:00
parent 0f7910125b
commit bb16341f51
8 changed files with 67 additions and 45 deletions

@ -53,6 +53,14 @@ BigFile::BigFile () {
log("file: littlebufsize too small.");
char *xx=NULL;*xx=0;
}
memset ( m_littleBuf , 0 , LITTLEBUFSIZE );
// avoid a malloc for small files.
// this way we can save in memory RdbMaps upon a core, even malloc/free
// related cores, cuz we won't have to do a malloc to save!
m_fileBuf.setBuf ( m_littleBuf,LITTLEBUFSIZE,0,false);
// for this make the length always equal the capacity so when we
// call reserve it builds on the whole thing
m_fileBuf.setLength ( m_fileBuf.getCapacity() );
}
// we alternate parts into "dirname" and "stripeDir"
@ -173,31 +181,27 @@ bool BigFile::addPart ( int32_t n ) {
char *xx=NULL;*xx=0;}
// how much more mem do we need?
int32_t delta = need - m_fileBuf.getLength();
// avoid a malloc for small files.
// this way we can save in memory RdbMaps upon a core, even malloc/free
// related cores, cuz we won't have to do a malloc to save!
if ( delta <= LITTLEBUFSIZE && ! m_fileBuf.getBufStart() ) {
m_fileBuf.setBuf ( m_littleBuf,LITTLEBUFSIZE,0,false);
// do not call reserve() below:
delta = 0;
}
// . make sure our CAPACITY is increased by what we need
// . SafeBuf::reserve() ADDS this much to current capacity
// . true = clear new mem so File::m_calledSet is false for Files
// that may be gaps or not exist because the BigFile was being
// merged.
if ( delta > 0 && ! m_fileBuf.reserve ( delta ,"bfbuf",true ) )
if ( delta > 0 && ! m_fileBuf.reserve ( delta ,"bfbuf",true ) ) {
log("file: failed to reserve %i more mem for part",delta);
return false;
}
// make length the capacity. so if buf is resized in call to
// SafeBuf::reserve() it will copy over all of the old buf to new buf
if ( m_fileBuf.getLength() < m_fileBuf.getCapacity() )
m_fileBuf.setLength ( m_fileBuf.getCapacity() );
m_fileBuf.setLength ( m_fileBuf.getCapacity() );
File *files = (File *)m_fileBuf.getBufStart();
File *f = &files[n];
// sanity to ensure we do not breach the buffer
char *fend = ((char *)f) + sizeof(File);
if ( fend > m_fileBuf.getBuf() ) { char *xx=NULL;*xx=0; }
// we have to call constructor ourself then
f->constructor();
// File *f ;
@ -228,8 +232,10 @@ bool BigFile::doesExist ( ) {
bool BigFile::doesPartExist ( int32_t n ) {
//if ( n >= MAX_PART_FILES ) return false;
if ( n >= m_maxParts ) return false;
File *f = getFile(n);
return f->calledSet();
// f will be null if part does not exist
File *f = getFile2(n);
if ( f ) return true;
return false;
}
static int64_t s_vfd = 0;
@ -307,10 +313,10 @@ int BigFile::getfd ( int32_t n , bool forReading ) { // , int64_t *vfd ) {
}
// get the File ptr from the table
File *f = getFile(n);
File *f = getFile2(n);
// if part does not exist then create it! addPart(n) will call
// File::set() on it and set m_setCalled to true.
if ( ! f->calledSet() ) {
if ( ! f ) {
// don't create File if we're getting it for reading
if ( forReading ) return -1;
if ( ! addPart (n) ) return -1;
@ -344,11 +350,11 @@ int64_t BigFile::getFileSize ( ) {
int64_t totalSize = 0;
for ( int32_t n = 0 ; n < m_maxParts ; n++ ) {
// shortcut
File *f = getFile(n);
File *f = getFile2(n);
// we can have headless big files... count the heads.
// this can happen if the first Files were deleted because
// of an ongoing merge operation.
if ( ! f->calledSet() ) {
if ( ! f ) {
totalSize += MAX_PART_SIZE;
continue;
}
@ -375,9 +381,9 @@ time_t BigFile::getLastModifiedTime ( ) {
time_t min = -1;
for ( int32_t n = 0 ; n < m_maxParts ; n++ ) {
// shortcut
File *f = getFile(n);
File *f = getFile2(n);
// we can have headless big files... count the heads
if ( ! f->calledSet() ) continue;
if ( ! f ) continue;
// returns -1 on error, 0 if file does not exist
time_t date = f->getLastModifiedTime();
if ( date == -1 ) return -2;
@ -1097,9 +1103,9 @@ void *readwriteWrapper_r ( void *state , ThreadEntry *t ) {
File *f2 = NULL;
// when we exit, m_this is invalid!!!
if ( fstate->m_filenum1 < fstate->m_this->m_maxParts )
f1 = fstate->m_this->getFile(fstate->m_filenum1);
f1 = fstate->m_this->getFile2(fstate->m_filenum1);
if ( fstate->m_filenum2 < fstate->m_this->m_maxParts )
f2 = fstate->m_this->getFile(fstate->m_filenum2);
f2 = fstate->m_this->getFile2(fstate->m_filenum2);
// . if open count changed on us our file got unlinked from under us
// and another file was opened with that same fd!!!
@ -1488,8 +1494,8 @@ bool BigFile::unlinkRename ( // non-NULL for renames, NULL for unlinks
// break out if we should only unlink one part
if ( m_part >= 0 && i != m_part ) break;
// get the ith file to rename/unlink
File *f = getFile(i);
if ( ! f->calledSet() ) {
File *f = getFile2(i);
if ( ! f ) {
// one less part to do
m_partsRemaining--;
continue;
@ -1674,7 +1680,7 @@ void doneRenameWrapper ( void *state , ThreadEntry *t ) {
THIS->getFilename(),mstrerror(g_errno));
// get the ith file we just unlinked
int32_t i = f->m_i;
File *fi = THIS->getFile ( i );
File *fi = THIS->getFile2 ( i );
// rename the part if it checks out
if ( f == fi ) {
// set his new name
@ -1722,7 +1728,7 @@ void doneUnlinkWrapper ( void *state , ThreadEntry *t ) {
int32_t i = f->m_i;
// . remove the part if it checks out
// . this will also close the file when it deletes it
File *fi = THIS->getFile(i);
File *fi = THIS->getFile2(i);
if ( f == fi ) THIS->removePart ( i );
// otherwise bitch about it
else log(LOG_LOGIC,"disk: Unlink had bad file ptr.");
@ -1737,7 +1743,7 @@ void doneUnlinkWrapper ( void *state , ThreadEntry *t ) {
void BigFile::removePart ( int32_t i ) {
File *f = getFile(i);
File *f = getFile2(i);
// . thread should have stored the filename for unlinking
// . now delete it from memory
f->destructor();
@ -1752,8 +1758,8 @@ void BigFile::removePart ( int32_t i ) {
// set m_maxParts
int32_t j;
for ( j = i ; j >= 0 ; j-- ) {
File *fj = getFile(j);
if ( fj->calledSet() ) { m_maxParts = j+1; break; }
File *fj = getFile2(j);
if ( fj ) { m_maxParts = j+1; break; }
}
// may have no more part files left which means no max part num
if ( j < 0 ) m_maxParts = 0;
@ -1764,8 +1770,8 @@ void BigFile::removePart ( int32_t i ) {
// doesn't work.
bool BigFile::closeFds ( ) {
for ( int32_t i = 0 ; i < m_maxParts ; i++ ) {
File *f = getFile(i);
if ( ! f->calledSet() ) continue;
File *f = getFile2(i);
if ( ! f ) continue;
f->close();
}
return true;
@ -1778,8 +1784,8 @@ bool BigFile::close ( ) {
// subroutines, so put a stop to that circle
m_isClosing = true;
for ( int32_t i = 0 ; i < m_maxParts ; i++ ) {
File *f = getFile(i);
if ( ! f->calledSet() ) continue;
File *f = getFile2(i);
if ( ! f ) continue;
f->close();
f->destructor();
// mdelete ( m_files[i] , sizeof(File) , "BigFile" );

@ -304,13 +304,16 @@ class BigFile {
//bool unlinkPart ( int32_t n , bool block );
File *getFile ( int32_t n ) {
File *getFile2 ( int32_t n ) {
if ( n >= m_maxParts ) return NULL;
File *files = (File *)m_fileBuf.getBufStart();
return &files[n];
File *f = &files[n];
if ( ! f->calledSet() ) return NULL;
return f;
};
// if part file not created, will create it
File *getPartFile ( int32_t n ) { return getFile(n); }
//File *getPartFile2 ( int32_t n ) { return getFile2(n); }
// . put a signal on the queue to do reading/writing
// . we call readwrite ( FileState *) when we handle the signal

@ -50,7 +50,10 @@ bool Dir::open ( ) {
close ( );
if ( ! m_dirname ) return false;
retry8:
// opendir() calls malloc
g_inMemFunction = true;
m_dir = opendir ( m_dirname );
g_inMemFunction = false;
// interrupted system call
if ( ! m_dir && errno == EINTR ) goto retry8;

13
Rdb.cpp

@ -1794,6 +1794,8 @@ void attemptMergeAll2 ( ) {
CollectionRec *last = NULL;
CollectionRec *cr;
rebuild:
//
// . if the first time then build the linked list
// . or if we set s_needsBuild to false, like above, re-build it
@ -1805,7 +1807,7 @@ void attemptMergeAll2 ( ) {
for ( int32_t i=0 ; s_needsBuild && i<g_collectiondb.m_numRecs ; i++) {
// we need this quickpoll for when we got 20,000+ collections
QUICKPOLL ( niceness );
cr = g_collectiondb.m_recs[i];
cr = g_collectiondb.getRec(i);//m_recs[i];
if ( ! cr ) continue;
// add it
if ( ! s_mergeHead ) s_mergeHead = cr;
@ -1827,6 +1829,15 @@ void attemptMergeAll2 ( ) {
// this is a requirement in RdbBase::attemptMerge() so check
// for it here so we can bail out early
if ( g_numThreads > 0 ) break;
// sanity
CollectionRec *vr = g_collectiondb.getRec(cr->m_collnum);
if ( vr != cr ) {
log("rdb: attemptmergeall: bad collnum %i. how "
"did this happen?",
(int)cr->m_collnum);
s_needsBuild = true;
goto rebuild;
}
// pre advance
CollectionRec *next = cr->m_nextLink;
// try to merge the next guy in line, in the linked list

@ -822,10 +822,8 @@ int32_t RdbBase::addFile ( int32_t id , bool isNew , int32_t mergeNum ,
// if not a new file sanity check it
for ( int32_t j = 0 ; ! isNew && j < f->m_maxParts - 1 ; j++ ) {
// might be headless
File *ff = f->getFile(j);//m_files[j];
File *ff = f->getFile2(j);//m_files[j];
if ( ! ff ) continue;
// now we don't use NULL, but m_calledSet flag
if ( ! ff->calledSet() ) continue;
if ( ff->getFileSize() == MAX_PART_SIZE ) continue;
log ( "db: File %s/%s has length %"INT64", but it should be %"INT64". "
"You should move it to a temporary directory "

@ -295,7 +295,7 @@ bool RdbMap::verifyMap ( BigFile *dataFile ) {
dataFile->doesPartExist ( numMissingParts-1 ) )
numMissingParts--;
if ( numMissingParts > 0 ) {
File *f = dataFile->getFile ( numMissingParts );
File *f = dataFile->getFile2 ( numMissingParts );
if ( f ) log("db: Missing part file before %s.",
f->getFilename());
}
@ -1678,7 +1678,7 @@ bool RdbMap::truncateFile ( BigFile *f ) {
int32_t numParts = f->getNumParts();
// what part num are we on?
int32_t partnum = f->getPartNum ( m_offset );
File *p = f->getFile ( partnum );
File *p = f->getFile2 ( partnum );
if ( ! p ) return log("db: Unable to get part file.");
// get offset relative to the part file
int32_t newSize = m_offset % (int64_t)MAX_PART_SIZE;
@ -1699,7 +1699,7 @@ bool RdbMap::truncateFile ( BigFile *f ) {
// MAX_TRUNC_SIZE bytes big
File *p2 = NULL;
if ( partnum == numParts-2 ) {
p2 = f->getFile ( partnum + 1 );
p2 = f->getFile2 ( partnum + 1 );
if ( ! p2 ) return log("db: Could not get next part in line.");
if ( p2->getFileSize() > MAX_TRUNC_SIZE )
return log("db: Next part file is bigger than %"INT32" "

@ -479,7 +479,8 @@ int32_t SafeBuf::safeSave (char *filename ) {
retry22:
// first write to tmp file
SafeBuf fn;
char tmp[1024];
SafeBuf fn(tmp,1024);
fn.safePrintf( "%s.saving",filename );
int32_t fd = open ( fn.getBufStart() ,

@ -1349,7 +1349,7 @@ void Syncdb::syncStart_r ( bool amThread ) {
for ( int32_t m = 0 ; m < f->m_numParts ; m++ ) {
// get part file
File *p = f->getFile(m);//m_files[m];
File *p = f->getFile2(m);//m_files[m];
// copy that
sprintf ( cmd , "rcp %s %s:%scoll.%s.%"INT32"/'",
p->getFilename(),ips,dir,coll,collnum);