Make CollectionRec::m_needsSave thread safe

This commit is contained in:
Ai Lin Chia
2017-03-29 17:42:28 +02:00
parent 096523bd25
commit 3b75814945
8 changed files with 38 additions and 64 deletions

@ -36,7 +36,6 @@ Collectiondb::Collectiondb ( ) {
m_numRecsUsed = 0;
m_numCollsSwappedOut = 0;
m_initializing = false;
m_needsSave = false;
m_recs = NULL;
// sanity
@ -80,17 +79,14 @@ bool Collectiondb::save ( ) {
}
// which collection rec needs a save
for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
if ( ! m_recs[i] ) continue;
// temp debug message
//logf(LOG_DEBUG,"admin: SAVING collection #%" PRId32" ANYWAY",i);
if ( ! m_recs[i]->m_needsSave ) {
for (int32_t i = 0; i < m_numRecs; i++) {
if (!m_recs[i]) {
continue;
}
//log(LOG_INFO,"admin: Saving collection #%" PRId32".",i);
m_recs[i]->save ( );
m_recs[i]->save();
}
// oh well
return true;
}
@ -207,14 +203,6 @@ bool Collectiondb::addExistingColl ( const char *coll, collnum_t collnum ) {
}
mnew ( cr , sizeof(CollectionRec) , "CollectionRec" );
// set collnum right for g_parms.setToDefault() call just in case
// because before it was calling CollectionRec::reset() which
// was resetting the RdbBases for the m_collnum which was garbage
// and ended up resetting random collections' rdb. but now
// CollectionRec::CollectionRec() sets m_collnum to -1 so we should
// not need this!
//cr->m_collnum = oldCollnum;
// get the default.conf from working dir if there
g_parms.setToDefault( (char *)cr , OBJ_COLL , cr );
@ -225,7 +213,6 @@ bool Collectiondb::addExistingColl ( const char *coll, collnum_t collnum ) {
// point to this, so Rdb and RdbBase can reference it
coll = cr->m_coll;
cr->m_needsSave = false;
//log("admin: loaded old coll \"%s\"",coll);
// load coll.conf file
@ -373,8 +360,6 @@ bool Collectiondb::addNewColl ( const char *coll,
// collection. NO, default is in Parms.cpp.
//cr->m_maxNumSpiders = 10;
//cr->m_needsSave = 1;
// start the spiders!
cr->m_spideringEnabled = true;
@ -525,9 +510,6 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) {
log(LOG_INFO,"db: deleting coll \"%s\" (%" PRId32")",coll,
(int32_t)cr->m_collnum);
// we need a save
m_needsSave = true;
// CAUTION: tree might be in the middle of saving
// we deal with this in Process.cpp now
@ -2498,12 +2480,19 @@ bool CollectionRec::save ( ) {
return true;
}
// only save if we need to
bool needsSave = m_needsSave.exchange(false);
if (!needsSave) {
return true;
}
//File f;
char tmp[1024];
snprintf ( tmp , 1023, "%scoll.%s.%" PRId32"/coll.conf",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
if ( ! g_parms.saveToXml ( (char *)this , tmp ,OBJ_COLL)) {
snprintf(tmp, 1023, "%scoll.%s.%" PRId32"/coll.conf", g_hostdb.m_dir, m_coll, (int32_t)m_collnum);
if (!g_parms.saveToXml((char *)this, tmp, OBJ_COLL)) {
// we didn't save successfully
m_needsSave = true;
return false;
}
@ -2511,44 +2500,33 @@ bool CollectionRec::save ( ) {
// save the crawlinfo class in the collectionrec for diffbot
//
// SAVE LOCAL
snprintf ( tmp , 1023, "%scoll.%s.%" PRId32"/localcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
snprintf(tmp, 1023, "%scoll.%s.%" PRId32"/localcrawlinfo.dat", g_hostdb.m_dir, m_coll, (int32_t)m_collnum);
// in case emergency save from malloc core, do not alloc
StackBuf<1024> sb;
// binary now
sb.safeMemcpy ( &m_localCrawlInfo , sizeof(CrawlInfo) );
if ( sb.safeSave ( tmp ) == -1 ) {
log(LOG_WARN, "db: failed to save file %s : %s",
tmp,mstrerror(g_errno));
sb.safeMemcpy(&m_localCrawlInfo, sizeof(CrawlInfo));
if (sb.safeSave(tmp) == -1) {
log(LOG_WARN, "db: failed to save file %s : %s", tmp,mstrerror(g_errno));
g_errno = 0;
}
// SAVE GLOBAL
snprintf ( tmp , 1023, "%scoll.%s.%" PRId32"/globalcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
snprintf(tmp, 1023, "%scoll.%s.%" PRId32"/globalcrawlinfo.dat", g_hostdb.m_dir, m_coll, (int32_t)m_collnum);
sb.reset();
// binary now
sb.safeMemcpy ( &m_globalCrawlInfo , sizeof(CrawlInfo) );
if ( sb.safeSave ( tmp ) == -1 ) {
log(LOG_WARN, "db: failed to save file %s : %s",
tmp,mstrerror(g_errno));
if (sb.safeSave(tmp) == -1) {
log(LOG_WARN, "db: failed to save file %s : %s", tmp,mstrerror(g_errno));
g_errno = 0;
}
// the list of ip addresses that we have detected as being throttled
// and therefore backoff and use proxies for
sb.reset();
sb.safePrintf("%scoll.%s.%" PRId32"/",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
sb.safePrintf("%scoll.%s.%" PRId32"/", g_hostdb.m_dir, m_coll, (int32_t)m_collnum);
m_twitchyTable.save ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
// do not need a save now
m_needsSave = false;
return true;
}

@ -5,6 +5,7 @@
#ifndef GB_COLLECTIONDB_H
#define GB_COLLECTIONDB_H
#include <atomic>
#include "SafeBuf.h"
#include "rdbid_t.h"
#include "GbMutex.h"
@ -91,8 +92,6 @@ private:
bool growRecPtrBuf(collnum_t collnum);
bool setRecPtr(collnum_t collnum, CollectionRec *cr);
bool m_needsSave;
class CollectionRec **m_recs;
// m_recs[] points into a safebuf that is just an array
@ -219,9 +218,13 @@ class CollectionRec {
// . stuff used by Collectiondb
// . do we need a save or not?
bool save ();
bool m_needsSave;
bool save();
// Raise the dirty flag; save() consumes it and only writes to disk when set.
void setNeedsSave() {
	m_needsSave.store(true);
}
private:
std::atomic<bool> m_needsSave;
public:
bool load ( const char *coll , int32_t collNum ) ;
void reset();

@ -257,7 +257,7 @@ void DailyMerge::dailyMergeLoop ( ) {
// getting pushed back.
m_cr->m_dailyMergeStarted = m_savedStartTime; // nowSynced;
// tell it to save, otherwise this might not get saved
m_cr->m_needsSave = true;
m_cr->setNeedsSave();
// initiate dumps
g_spiderdb.getRdb ()->dumpTree();
g_linkdb.getRdb ()->dumpTree();

@ -11108,7 +11108,7 @@ bool Parms::updateParm(const char *rec, WaitEntry *we) {
val1.getBufStart(),
val2.getBufStart());
if ( cr ) cr->m_needsSave = true;
if ( cr ) cr->setNeedsSave();
// HACK #2
if ( base == cr && dst == (char *)&cr->m_importEnabled )

@ -254,12 +254,6 @@ bool Rdb::updateToRebuildFiles ( Rdb *rdb2 , char *coll ) {
// clear it in case it existed
g_errno = 0;
// if some things need to be saved, how did that happen?
// we saved everything before we entered repair mode and did not
// allow anything more to be added... and we do not allow any
// collections to be deleted via Collectiondb::deleteRec() when
// in repair mode... how could this happen?
//if ( m_needsSave ) { g_process.shutdownAbort(true); }
// delete old collection recs
CollectionRec *cr = g_collectiondb.getRec ( coll );
if ( ! cr ) {
@ -740,7 +734,6 @@ void Rdb::doneSaving ( ) {
if ( g_errno ) {
log(LOG_WARN, "db: Had error saving %s-saved.dat: %s.", m_dbname,mstrerror(g_errno));
g_errno = 0;
//m_needsSave = true;
m_isSaving = false;
return;
}

@ -783,7 +783,7 @@ bool SpiderColl::addSpiderRequest(const SpiderRequest *sreq, int64_t nowGlobalMS
if ( cr ) {
cr->m_localCrawlInfo .m_urlsHarvested++;
cr->m_globalCrawlInfo.m_urlsHarvested++;
cr->m_needsSave = true;
cr->setNeedsSave();
}
// . we can't do this because we do not have the spiderReply!!!???

@ -1147,7 +1147,7 @@ skipDoledbRec:
cr->m_spiderStatus = SP_INPROGRESS; // this is 7
// be sure to save state so we do not re-send emails
cr->m_needsSave = 1;
cr->setNeedsSave();
// sometimes the spider coll is reset/deleted while we are
// trying to get the lock in spiderUrl9() so let's use collnum
@ -1861,7 +1861,7 @@ void spiderRoundIncremented ( CollectionRec *cr ) {
cr->localCrawlInfoUpdate();
cr->m_needsSave = true;
cr->setNeedsSave();
}
static void gotCrawlInfoReply(void *state, UdpSlot *slot) {
@ -2057,7 +2057,7 @@ static void gotCrawlInfoReply(void *state, UdpSlot *slot) {
cr->m_globalCrawlInfo.m_lastUpdateTime = getTime();
// make it save to disk i guess
cr->m_needsSave = true;
cr->setNeedsSave();
// if spidering disabled in master controls then send no
// notifications
@ -2217,7 +2217,7 @@ void handleRequestc1(UdpSlot *slot, int32_t /*niceness*/) {
// if changing status, resend local crawl info to all
cr->localCrawlInfoUpdate();
// save that!
cr->m_needsSave = true;
cr->setNeedsSave();
}
doNotEnd:

@ -1609,7 +1609,7 @@ bool XmlDoc::indexDoc ( ) {
cr->localCrawlInfoUpdate();
}
// need to save collection rec now during auto save
cr->m_needsSave = true;
cr->setNeedsSave();
// update this just in case we are the last url crawled
//int64_t now = gettimeofdayInMillisecondsGlobal();
//cr->m_diffbotCrawlEndTime = now;
@ -8105,7 +8105,7 @@ char **XmlDoc::gotHttpReply ( ) {
cr->m_localCrawlInfo.m_pageDownloadSuccessesThisRound++;
cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound++;
m_incrementedDownloadCount = true;
cr->m_needsSave = true;
cr->setNeedsSave();
// changing status, resend local crawl info to all
cr->localCrawlInfoUpdate();
}