a lot of the time the rdb tree has invalid collection

numbers in it, so fix our counting algo in case
the collection rec no longer exists!
This commit is contained in:
Matt Wells
2014-01-21 19:01:44 -08:00
parent 45cb5c9a0c
commit 33c5d9c07f
14 changed files with 169 additions and 70 deletions

@ -569,7 +569,9 @@ bool BigFile::readwrite ( void *buf ,
}
// otherwise, thread spawn failed, do it blocking then
g_errno = 0;
if ( ! g_threads.m_disabled ) {
// if threads are manually disabled don't print these msgs because
// we redbox the fact above the controls in Pages.cpp
if ( g_conf.m_useThreads ) { // ! g_threads.m_disabled ) {
static long s_lastTime = 0;
long now = getTime();
if ( now - s_lastTime >= 1 ) {

@ -686,7 +686,8 @@ void Collectiondb::deleteSpiderColl ( SpiderColl *sc ) {
if ( ! sc->m_msg5.m_waitingForList &&
! sc->m_msg5.m_waitingForMerge &&
! sc->m_msg5b.m_waitingForList &&
! sc->m_msg5b.m_waitingForMerge ) {
! sc->m_msg5b.m_waitingForMerge &&
! sc->m_msg1.m_mcast.m_inUse ) {
mdelete ( sc, sizeof(SpiderColl),"nukecr2");
delete ( sc );
return;

@ -108,6 +108,9 @@ bool DiskPageCache::init ( const char *dbname ,
// void (*rmVfd2)(DiskPageCache*, long) ) {
reset();
// fix cores while rebalancing
maxMem = 0;
m_rdbId = rdbId;
bool *tog = NULL;

93
Mem.cpp

@ -13,9 +13,10 @@
#include "Pages.h"
// put me back
//#define _EFENCE_
//#define EFENCE
#define EFENCE_SIZE 100000
// uncomment this for _EFENCE_ to do underflow checks instead of the
// uncomment this for EFENCE to do underflow checks instead of the
// default overflow checks
//#define _CHECKUNDERFLOW_
@ -51,7 +52,7 @@
// there because it will hit a different PAGE, to be more sure we could
// make UNDERPAD and OVERPAD PAGE bytes, although the overrun could still write
// to another allocated area of memory and we can never catch it.
#ifdef _EFENCE_
#ifdef EFENCE
#define UNDERPAD 0
#define OVERPAD 0
#else
@ -67,7 +68,7 @@ extern bool g_isYippy;
bool freeCacheMem();
#ifdef _EFENCE_
#ifdef EFENCE
static void *getElecMem ( long size ) ;
static void freeElecMem ( void *p ) ;
#endif
@ -249,7 +250,7 @@ void * operator new (size_t size) throw (std::bad_alloc) {
throw std::bad_alloc();
//throw 1;
}
#ifdef _EFENCE_
#ifdef EFENCE
void *mem = getElecMem(size);
#else
//void *mem = dlmalloc ( size );
@ -267,7 +268,7 @@ newmemloop:
//return NULL;
}
if ( (unsigned long)mem < 0x00010000 ) {
#ifdef _EFENCE_
#ifdef EFENCE
void *remem = getElecMem(size);
#else
void *remem = sysmalloc(size);
@ -275,7 +276,7 @@ newmemloop:
log ( LOG_WARN, "mem: Caught low memory allocation at %08lx, "
"reallocated to %08lx", (unsigned long)mem,
(unsigned long)remem );
#ifdef _EFENCE_
#ifdef EFENCE
freeElecMem (mem);
#else
sysfree(mem);
@ -327,7 +328,7 @@ void * operator new [] (size_t size) throw (std::bad_alloc) {
throw std::bad_alloc();
//throw 1;
}
#ifdef _EFENCE_
#ifdef EFENCE
void *mem = getElecMem(size);
#else
//void *mem = dlmalloc ( size );
@ -346,7 +347,7 @@ newmemloop:
//return NULL;
}
if ( (unsigned long)mem < 0x00010000 ) {
#ifdef _EFENCE_
#ifdef EFENCE
void *remem = getElecMem(size);
#else
void *remem = sysmalloc(size);
@ -354,7 +355,7 @@ newmemloop:
log ( LOG_WARN, "mem: Caught low memory allocation at %08lx, "
"reallocated to %08lx",
(long)mem, (long)remem );
#ifdef _EFENCE_
#ifdef EFENCE
freeElecMem (mem);
#else
sysfree(mem);
@ -424,6 +425,7 @@ pid_t Mem::getPid() {
bool Mem::init ( long long maxMem ) {
// set main process pid
s_pid = getpid();
// . don't swap our memory out, man...
// . damn, linux 2.4.17 seems to crash the kernel sometimes w/ this
//if ( mlockall( MCL_CURRENT | MCL_FUTURE ) == -1 ) {
@ -441,10 +443,37 @@ bool Mem::init ( long long maxMem ) {
if ( g_conf.m_detectMemLeaks )
log(LOG_INIT,"mem: Memory leak checking is enabled.");
#ifdef _EFENCE_
#ifdef EFENCE
log(LOG_INIT,"mem: using electric fence!!!!!!!");
#endif
// if we can't alloc 3gb exit and retry
long long start = gettimeofdayInMilliseconds();
char *pools[30];
long long count = 0LL;
long long chunk = 100000000LL; // 100MB chunks
long long need = 3000000000LL; // 3GB
long i = 0; for ( i = 0 ; i < 30 ; i++ ) {
pools[i] = (char *)mmalloc(chunk,"testmem");
count += chunk;
if ( pools[i] ) continue;
count -= chunk;
log("mem: could only alloc %lli bytes of the "
"%lli required to run gigablast. exiting.",
count , need );
}
for ( long j = 0 ; j < i ; j++ )
mfree ( pools[j] , chunk , "testmem" );
long long now = gettimeofdayInMilliseconds();
long long took = now - start;
if ( took > 20 ) log("mem: took %lli ms to check memory ceiling",took);
// return if could not alloc the full 3GB
if ( i < 30 ) return false;
// reset our all-time max allocated memory because we don't
// want the mem test we did above to count towards it
m_maxAlloced = 0;
// init our own malloc stuff in malloc.c (from Doug Lea)
//if ( mdw_init_sbrk ( maxMem ) ) return true;
// bitch
@ -653,7 +682,7 @@ bool Mem::printMemBreakdownTable ( SafeBuf* sb,
// make sure the admin viewing this table knows that there will be
// frees in here that are delayed if electric fence is enabled.
#ifdef _EFENCE_
#ifdef EFENCE
ss = " <font color=red>*DELAYED FREES ENABLED*</font>";
#endif
@ -1244,14 +1273,24 @@ void *Mem::gbmalloc ( int size , const char *note ) {
return NULL;
}
void *mem;
// to find bug that cores on malloc do this
//printBreeches(true);
//g_errno=ENOMEM;return (void *)log("Mem::malloc: reached mem limit");}
#ifdef _EFENCE_
void *mem = getElecMem(size+UNDERPAD+OVERPAD);
#else
#ifdef EFENCE
mem = getElecMem(size+UNDERPAD+OVERPAD);
// conditional electric fence?
#elif EFENCE_BIG
if ( size >= EFENCE_SIZE )
mem = getElecMem(size+0+0);
else
mem = (void *)sysmalloc ( size + UNDERPAD + OVERPAD );
#else
//void *mem = dlmalloc ( size );
void *mem = (void *)sysmalloc ( size + UNDERPAD + OVERPAD );
mem = (void *)sysmalloc ( size + UNDERPAD + OVERPAD );
#endif
// initialization debug
//char *pend = (char *)mem + UNDERPAD + size;
@ -1323,7 +1362,7 @@ mallocmemloop:
return NULL;
}
if ( (unsigned long)mem < 0x00010000 ) {
#ifdef _EFENCE_
#ifdef EFENCE
void *remem = getElecMem(size);
#else
void *remem = sysmalloc(size);
@ -1331,7 +1370,7 @@ mallocmemloop:
log ( LOG_WARN, "mem: Caught low memory allocation at %08lx, "
"reallocated to %08lx",
(unsigned long)mem, (unsigned long)remem );
#ifdef _EFENCE_
#ifdef EFENCE
freeElecMem (mem);
#else
sysfree(mem);
@ -1394,7 +1433,9 @@ void *Mem::gbrealloc ( void *ptr , int oldSize , int newSize ,
char *mem;
#ifdef _EFENCE_
// even though size may be < 100k for EFENCE_BIG, do it this way
// for simplicity...
#if defined(EFENCE) || defined(EFENCE_BIG)
mem = (char *)mmalloc ( newSize , note );
if ( ! mem ) return NULL;
// copy over to it
@ -1473,10 +1514,19 @@ void Mem::gbfree ( void *ptr , int size , const char *note ) {
char *xx = NULL; *xx = 0;
}
#ifdef _EFENCE_
#ifdef EFENCE
// this does a delayed free so do not call rmMem() just yet
freeElecMem ((char *)ptr - UNDERPAD );
#else
return;
#endif
#ifdef EFENCE_BIG
if ( size >= EFENCE_SIZE ) {
freeElecMem ((char *)ptr - 0 );
return;
}
#endif
bool isnew = s_isnew[slot];
// if this returns false it was an unbalanced free
@ -1484,7 +1534,6 @@ void Mem::gbfree ( void *ptr , int size , const char *note ) {
if ( isnew ) sysfree ( (char *)ptr );
else sysfree ( (char *)ptr - UNDERPAD );
#endif
}
long getLowestLitBitLL ( unsigned long long bits ) {

@ -541,8 +541,9 @@ bool Msg4::addMetaList ( char *metaList ,
s_msg4Tail->m_next = this;
// we are the new tail
s_msg4Tail = this;
// debug log
log("msg4: queueing body msg4=0x%lx",(long)this);
// debug log. seems to happen a lot if not using threads..
if ( g_conf.m_useThreads )
log("msg4: queueing body msg4=0x%lx",(long)this);
// mark it
m_inUse = true;
// all done then, but return false so caller does not free
@ -556,8 +557,10 @@ bool Msg4::addMetaList ( char *metaList ,
// sanity check
if ( s_msg4Head || s_msg4Tail ) { char *xx=NULL; *xx=0; }
// spider hang bug
logf(LOG_DEBUG,"msg4: queueing head msg4=0x%lx",(long)this);
// . spider hang bug
// . debug log. seems to happen a lot if not using threads..
if ( g_conf.m_useThreads )
logf(LOG_DEBUG,"msg4: queueing head msg4=0x%lx",(long)this);
// mark it
m_inUse = true;
@ -1062,8 +1065,10 @@ void storeLineWaiters ( ) {
// . if his callback was NULL, then was loaded in loadAddsInProgress()
// . we no longer do that so callback should never be null now
if ( ! msg4->m_callback ) { char *xx=NULL;*xx=0; }
// log this now i guess
logf(LOG_DEBUG,"msg4: calling callback for msg4=0x%lx",(long)msg4);
// log this now i guess. seems to happen a lot if not using threads
if ( g_conf.m_useThreads )
logf(LOG_DEBUG,"msg4: calling callback for msg4=0x%lx",
(long)msg4);
// release it
msg4->m_inUse = false;
// call his callback

@ -1393,7 +1393,7 @@ bool Msg5::gotList2 ( ) {
m_waitingForMerge = false;
// thread creation failed
if ( ! g_threads.areThreadsDisabled() )
if ( g_conf.m_useThreads )
log(LOG_INFO,
"net: Failed to create thread to merge lists. Doing "
"blocking merge. Hurts performance.");

@ -443,6 +443,8 @@ void Multicast::gotReply2 ( UdpSlot *slot ) {
long now = getTime();
if (now - s_elastTime > 10) {s_elastTime = now; logIt=true;}
}
// don't log ETRYAGAIN, may come across as bad when it is normal
if ( m_errnos[i] == ETRYAGAIN ) logIt = false;
// log a failure msg
if ( logIt ) { // m_errnos[i] != ETRYAGAIN ) {
Host *h = m_hostdb->getHost ( slot->m_ip ,slot->m_port );

@ -397,13 +397,14 @@ skipReplaceHost:
"in disagreement with ours.\">H</b></font>");
// rebalancing?
if ( h->m_flags & PFLAG_REBALANCING )
fb.safePrintf("<b title=\"Current rebalancing\">R</b>");
fb.safePrintf("<b title=\"Currently "
"rebalancing\">R</b>");
// has recs that should be in another shard? indicates
// we need to rebalance or there is a bad hosts.conf
if ( h->m_flags & PFLAG_FOREIGNRECS )
fb.safePrintf("<font color=red><b title=\"Foreign data "
"detected. Needs rebalance.\">F"
"</b></font");
"</b></font>");
// if it has spiders going on say "S"
if ( h->m_flags & PFLAG_HASSPIDERS )
fb.safePrintf ( "<span title=\"Spidering\">S</span>");

@ -320,13 +320,15 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
DARK_BLUE );
for ( long i = 0; i < 96; i++ ) {
if ( msgCount0[i] <= 0 ) continue;
p->safePrintf("<tr><td>0</td><td>0x%lx</td><td>%li</td></tr>",
i, msgCount0[i]);
p->safePrintf("<tr bgcolor=#%s>"
"<td>0</td><td>0x%lx</td><td>%li</td></tr>",
LIGHT_BLUE,i, msgCount0[i]);
}
for ( long i = 0; i < 96; i++ ) {
if ( msgCount1[i] <= 0 ) continue;
p->safePrintf("<tr><td>1</td><td>0x%lx</td><td>%li</td></tr>",
i, msgCount1[i]);
p->safePrintf("<tr bgcolor=#%s>"
"<td>1</td><td>0x%lx</td><td>%li</td></tr>",
LIGHT_BLUE,i, msgCount1[i]);
}
p->safePrintf ( "</table><br>" );

@ -963,14 +963,14 @@ bool Pages::printAdminTop ( SafeBuf *sb ,
coll, NULL, fromIp, qs );
}
// end table
sb->safePrintf ("</td></tr></table><br/><br/>\n");
sb->safePrintf ("</td></tr></table><br/>\n");//<br/>\n");
SafeBuf mb;
long adds = 0;
PingServer *ps = &g_pingServer;
mb.safePrintf("<center>"
mb.safePrintf(//"<center>"
"<table cellpadding=5 "
"style=\""
"background-color:#ff6666;"
@ -1015,7 +1015,16 @@ bool Pages::printAdminTop ( SafeBuf *sb ,
"pings.",ps->m_numHostsDead ,s );
}
mb.safePrintf("</td></tr></table></center><br>");
if ( ! g_conf.m_useThreads ) {
if ( adds ) mb.safePrintf("<br><br>");
adds++;
mb.safePrintf("Threads are disabled. Severely hurts "
"performance.");
}
mb.safePrintf("</td></tr></table>"
//"</center>"
"<br>");
// a new table. on the left is collections, on right is other stuff
sb->safePrintf("<TABLE "
@ -1096,7 +1105,7 @@ bool Pages::printAdminTop ( SafeBuf *sb ,
sb->safePrintf("</div></TD>");
// the controls will go here
sb->safePrintf("<TD>");
sb->safePrintf("<TD valign=top>");
return true;
}

@ -1964,8 +1964,8 @@ void RdbBase::gotTokenForMerge ( ) {
// . if we have no g_errno that is bad!!!
// . we should dump core here or something cuz we have to remove the
// merge file still to be correct
if ( ! g_errno )
log(LOG_INFO,"merge: Got token without blocking.");
//if ( ! g_errno )
// log(LOG_INFO,"merge: Got token without blocking.");
// we now set this in init() by calling m_merge.init() so it
// can pre-alloc its lists in its s_msg3 class
// g_conf.m_mergeMaxBufSize ) ) return ;

@ -279,7 +279,7 @@ long RdbTree::clear ( ) {
for ( long i = 0 ; i < nc ; i++ ) {
CollectionRec *cr = g_collectiondb.getRec(i);
if ( ! cr ) continue;
//if ( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
//if (((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
cr->m_numNegKeysInTree[(unsigned char)m_rdbId] = 0;
cr->m_numPosKeysInTree[(unsigned char)m_rdbId] = 0;
}
@ -633,9 +633,11 @@ long RdbTree::addNode ( collnum_t collnum ,
// crap, when fixing a tree this will segfault because
// m_recs[collnum] is NULL.
if ( m_rdbId >= 0 && g_collectiondb.m_recs[collnum] ) {
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[collnum]->
m_numNegKeysInTree[(unsigned char)m_rdbId]++;
//if( ((unsigned char)m_rdbId)>=RDB_END){
//char *xx=NULL;*xx=0; }
CollectionRec *cr ;
cr = g_collectiondb.m_recs[collnum];
if(cr)cr->m_numNegKeysInTree[(unsigned char)m_rdbId]++;
}
}
else {
@ -644,9 +646,11 @@ long RdbTree::addNode ( collnum_t collnum ,
// crap, when fixing a tree this will segfault because
// m_recs[collnum] is NULL.
if ( m_rdbId >= 0 && g_collectiondb.m_recs[collnum] ) {
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[collnum]->
m_numPosKeysInTree[(unsigned char)m_rdbId]++;
//if( ((unsigned char)m_rdbId)>=RDB_END){
//char *xx=NULL;*xx=0; }
CollectionRec *cr ;
cr = g_collectiondb.m_recs[collnum];
if(cr)cr->m_numPosKeysInTree[(unsigned char)m_rdbId]++;
}
}
// debug2 msg
@ -839,16 +843,20 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
if ( KEYNEG(m_keys,i,m_ks) ) {
m_numNegativeKeys--;
//m_numNegKeysPerColl[m_collnums[i]]--;
if ( m_rdbId >= 0 )
g_collectiondb.m_recs[m_collnums[i]]->
m_numPosKeysInTree[(unsigned char)m_rdbId]--;
if ( m_rdbId >= 0 ) {
CollectionRec *cr;
cr = g_collectiondb.m_recs[m_collnums[i]];
if(cr)cr->m_numNegKeysInTree[(unsigned char)m_rdbId]--;
}
}
else {
m_numPositiveKeys--;
//m_numPosKeysPerColl[m_collnums[i]]--;
if ( m_rdbId >= 0 )
g_collectiondb.m_recs[m_collnums[i]]->
m_numPosKeysInTree[(unsigned char)m_rdbId]--;
if ( m_rdbId >= 0 ) {
CollectionRec *cr;
cr = g_collectiondb.m_recs[m_collnums[i]];
if(cr)cr->m_numPosKeysInTree[(unsigned char)m_rdbId]--;
}
}
// debug step -- check chain from iparent down making sure that
//printTree();
@ -945,8 +953,9 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
//m_numNegKeysPerColl[m_collnums[i]]--;
if ( m_rdbId >= 0 ) {
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[m_collnums[i]]->
m_numNegKeysInTree[(unsigned char)m_rdbId]--;
CollectionRec *cr ;
cr = g_collectiondb.m_recs[m_collnums[i]];
if(cr)cr->m_numNegKeysInTree[(unsigned char)m_rdbId]--;
}
}
else {
@ -954,8 +963,9 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
//m_numPosKeysPerColl[m_collnums[i]]--;
if ( m_rdbId >= 0 ) {
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[m_collnums[i]]->
m_numPosKeysInTree[(unsigned char)m_rdbId]--;
CollectionRec *cr ;
cr = g_collectiondb.m_recs[m_collnums[i]];
if(cr)cr->m_numPosKeysInTree[(unsigned char)m_rdbId]--;
}
}
// debug step -- check chain from iparent down making sure that

@ -2444,7 +2444,8 @@ static void gotSpiderdbListWrapper2( void *state , RdbList *list,Msg5 *msg5) {
// m_deleteMyself flag will have been set.
if ( THIS->m_deleteMyself &&
! THIS->m_msg5b.m_waitingForMerge &&
! THIS->m_msg5b.m_waitingForList ) {
! THIS->m_msg5b.m_waitingForList &&
! THIS->m_msg1.m_mcast.m_inUse ) {
mdelete ( THIS , sizeof(SpiderColl),"postdel1");
delete ( THIS );
return;
@ -2882,6 +2883,19 @@ static void doledWrapper ( void *state ) {
// we are done!! that was the final step...
THIS->m_isPopulating = false;
// did collection get nuked while we were waiting for msg1 reply?
if ( THIS->m_deleteMyself &&
! THIS->m_msg5.m_waitingForMerge &&
! THIS->m_msg5.m_waitingForList &&
! THIS->m_msg5b.m_waitingForMerge &&
! THIS->m_msg5b.m_waitingForList ) {
mdelete ( THIS , sizeof(SpiderColl),"postdel1");
delete ( THIS );
return;
}
// . we added a rec to doledb for the firstIp in m_waitingTreeKey, so
// now go to the next node in the wait tree.
// . it will get the next key after m_waitingTreeKey
@ -2967,7 +2981,8 @@ bool SpiderColl::evalIpLoop ( ) {
// m_deleteMyself flag will have been set.
if ( m_deleteMyself &&
! m_msg5b.m_waitingForMerge &&
! m_msg5b.m_waitingForList ) {
! m_msg5b.m_waitingForList &&
! m_msg1.m_mcast.m_inUse ) {
mdelete ( this , sizeof(SpiderColl),"postdel1");
delete ( this );
// pretend to block since we got deleted!!!
@ -7476,15 +7491,15 @@ bool sendPage ( State11 *st ) {
// begin the table
sb.safePrintf ( "<table %s>\n"
"<tr><td colspan=50>"
"<center>"
//"<center>"
"<b>Currently Spidering on This Host</b>"
//" (%li spiders)"
" (%li spiders)"
//" (%li locks)"
"</center>"
"</td></tr>\n" ,
TABLE_STYLE
//(long)g_spiderLoop.m_numSpidersOut
//g_spiderLoop.m_lockTable.m_numSlotsUsed);
//"</center>"
"</td></tr>\n"
, TABLE_STYLE
, (long)g_spiderLoop.m_numSpidersOut
//, g_spiderLoop.m_lockTable.m_numSlotsUsed
);
// the table headers so SpiderRequest::printToTable() works
if ( ! SpiderRequest::printTableHeader ( &sb , true ) ) return false;

@ -51,7 +51,7 @@
<readOnlyMode>0</>
# Controls all spidering for all collections
<spideringEnabled>1</>
<spideringEnabled>0</>
# Can people use the add url interface to add urls to the index?
<addUrlEnabled>1</>