mirror of https://github.com/privacore/open-source-search-engine.git
synced 2025-05-29 20:09:34 -04:00
It compiles, links, starts, and can put seeds into the sqlite database
This commit is contained in:
parent 50b363b8fe
commit 9785fec43b
Changed files: Collectiondb.cpp, ConvertSpiderdb.cpp, DailyMerge.cpp, Hostdb.cpp, Makefile, Msg0.cpp, Msg3.cpp, Msg4In.cpp, Msg5.cpp, PageCrawlBot.cpp, PageReindex.cpp, PageSpiderdbLookup.cpp, PageStats.cpp, Parms.cpp, Process.cpp, Rdb.cpp, RdbBase.cpp, RdbList.cpp, RdbMerge.cpp, RdbTree.cpp, Repair.cpp, Spider.cpp, Spider.h, SpiderColl.cpp, SpiderdbHostDelete.cpp, SpiderdbRdbSqliteBridge.cpp, SpiderdbRdbSqliteBridge.h, SpiderdbSqlite.cpp, SpiderdbSqlite.h, Statistics.cpp, XmlDoc.cpp, main.cpp, rdbid_t.h
Collectiondb.cpp
@@ -150,7 +150,7 @@ bool Collectiondb::cleanTrees() {
g_posdb.getRdb()->cleanTree();
g_titledb.getRdb()->cleanTree();
g_tagdb.getRdb()->cleanTree();
g_spiderdb.getRdb()->cleanTree();
g_spiderdb.getRdb_deprecated()->cleanTree();
g_doledb.getRdb()->cleanTree();
g_clusterdb.getRdb()->cleanTree();
g_linkdb.getRdb()->cleanTree();
@@ -436,7 +436,7 @@ bool Collectiondb::addRdbBasesForCollRec ( CollectionRec *cr ) {
if ( ! g_tagdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_clusterdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_linkdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_spiderdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_spiderdb.getRdb_deprecated()->addRdbBase1(coll) ) goto hadError;
if ( ! g_doledb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;

// now clean the trees
@@ -504,7 +504,6 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) {

g_titledb.getRdb()->delColl ( coll );
g_tagdb.getRdb()->delColl ( coll );
g_spiderdb.getRdb()->delColl ( coll );
g_doledb.getRdb()->delColl ( coll );
g_clusterdb.getRdb()->delColl ( coll );
g_linkdb.getRdb()->delColl ( coll );
@@ -765,7 +764,6 @@ bool Collectiondb::resetColl2(collnum_t oldCollnum, collnum_t newCollnum) {
g_posdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_titledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_tagdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_spiderdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_doledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_clusterdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_linkdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
ConvertSpiderdb.cpp
@@ -64,7 +64,7 @@ static const char update_statement_duplicate_request[] =
int convertSpiderDb(const char *collname) {
if(!g_spiderdb.init())
return 1;
if(!g_spiderdb.getRdb()->addRdbBase1(collname))
if(!g_spiderdb.getRdb_deprecated()->addRdbBase1(collname))
return 2;

collnum_t collnum = g_collectiondb.getRec(collname)->m_collnum;
@@ -134,7 +134,7 @@ int convertSpiderDb(const char *collname) {
printf("Starting conversion\n");
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if(!msg5.getList(RDB_SPIDERDB,
if(!msg5.getList(RDB_SPIDERDB_DEPRECATED,
collnum,
&list,
&startKey,
@@ -221,7 +221,8 @@ int convertSpiderDb(const char *collname) {
(spiderRequest->m_hadReply ? (1<<10) : 0) |
(spiderRequest->m_fakeFirstIp ? (1<<11) : 0) |
(spiderRequest->m_hasAuthorityInlink ? (1<<12) : 0) |
(spiderRequest->m_avoidSpiderLinks ? (1<<13) : 0);
(spiderRequest->m_hasAuthorityInlinkValid ? (1<<13) : 0) |
(spiderRequest->m_avoidSpiderLinks ? (1<<14) : 0);
sqlite3_bind_int(stmt, 11, rqf);
if(spiderRequest->m_priority>=0)
sqlite3_bind_int(stmt, 12, spiderRequest->m_priority);
@@ -240,7 +241,8 @@ int convertSpiderDb(const char *collname) {
(prevSpiderReply->m_isPermalink ? (1<<1) : 0) |
(prevSpiderReply->m_isIndexed ? (1<<2) : 0) |
(prevSpiderReply->m_hasAuthorityInlink ? (1<<3) : 0) |
(prevSpiderReply->m_fromInjectionRequest ? (1<<4) : 0);
(prevSpiderReply->m_fromInjectionRequest ? (1<<4) : 0) |
(prevSpiderReply->m_isIndexedINValid ? (1<<5) : 0);
sqlite3_bind_int(stmt, 21, rpf);
}
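Note: the hunks above pack each boolean of the spider request into one int32 column. A minimal sketch of the round trip, assuming the new bit layout from this hunk (m_hasAuthorityInlinkValid at bit 13, m_avoidSpiderLinks at bit 14); the plain Flags struct is illustrative, not the real SpiderRequest.

#include <cstdint>
#include <cassert>

struct Flags {
	bool hadReply, fakeFirstIp, hasAuthorityInlink,
	     hasAuthorityInlinkValid, avoidSpiderLinks;
};

// fold the booleans into one word, same bit positions as the diff above
static int32_t pack(const Flags &f) {
	return (f.hadReply                ? (1 << 10) : 0) |
	       (f.fakeFirstIp             ? (1 << 11) : 0) |
	       (f.hasAuthorityInlink      ? (1 << 12) : 0) |
	       (f.hasAuthorityInlinkValid ? (1 << 13) : 0) |
	       (f.avoidSpiderLinks        ? (1 << 14) : 0);
}

// reverse mapping, one test per bit
static Flags unpack(int32_t rqf) {
	return Flags{ (rqf & (1 << 10)) != 0, (rqf & (1 << 11)) != 0,
	              (rqf & (1 << 12)) != 0, (rqf & (1 << 13)) != 0,
	              (rqf & (1 << 14)) != 0 };
}

int main() {
	Flags f{true, false, true, false, true};
	Flags g = unpack(pack(f));
	assert(g.hadReply && !g.fakeFirstIp && g.hasAuthorityInlink &&
	       !g.hasAuthorityInlinkValid && g.avoidSpiderLinks);
}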
DailyMerge.cpp
@@ -252,10 +252,8 @@ void DailyMerge::dailyMergeLoop ( ) {
// tell it to save, otherwise this might not get saved
m_cr->setNeedsSave();
// initiate dumps
g_spiderdb.getRdb ()->submitRdbDumpJob(true);
g_linkdb.getRdb ()->submitRdbDumpJob(true);
// if neither has recs in tree, go to next mode
if(g_spiderdb.getRdb()->getNumUsedNodes()>0) return;
if(g_linkdb .getRdb()->getNumUsedNodes()>0) return;
// ok, all trees are clear and dumped
m_mergeMode = 5;
@@ -267,9 +265,6 @@ void DailyMerge::dailyMergeLoop ( ) {
if ( m_mergeMode == 5 ) {
// kick off the merges if not already going

if(g_spiderdb.getRdb()->getBase(m_cr->m_collnum)->attemptMerge(1,true,2))
return;

if(g_linkdb.getRdb()->getBase(m_cr->m_collnum)->attemptMerge(1,true,2))
return;
Hostdb.cpp
@@ -1518,8 +1518,10 @@ uint32_t Hostdb::getShardNum(rdbid_t rdbId, const void *k) const {
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
}

case RDB_SPIDERDB:
case RDB2_SPIDERDB2: {
case RDB_SPIDERDB_DEPRECATED:
case RDB2_SPIDERDB2_DEPRECATED:
case RDB_SPIDERDB_SQLITE:
case RDB2_SPIDERDB2_SQLITE: {
int32_t firstIp = Spiderdb::getFirstIp((key128_t *)k);
// do what Spider.h getGroupId() used to do so we are
// backwards compatible
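Note: all spiderdb rdbid variants fall through to the same case above because shard routing is keyed on firstIp, which keeps every URL of a given site on the same shard regardless of which spiderdb backend holds it. A minimal sketch under that assumption; the modulo mapping is a stand-in for the real m_map lookup, not the actual hash.

#include <cstdint>
#include <cassert>

struct Key { int32_t firstIp; int64_t uh48; };  // illustrative, not key128_t

// stand-in for the m_map lookup: only firstIp participates
static uint32_t shardOf(const Key &k, uint32_t numShards) {
	return (uint32_t)k.firstIp % numShards;
}

int main() {
	// two different URLs (different uh48) on the same firstIp
	Key a{0x0A000001, 111}, b{0x0A000001, 222};
	assert(shardOf(a, 16) == shardOf(b, 16));  // same site, same shard
}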
Makefile (2 changes)
@@ -30,6 +30,8 @@ OBJS_O0 = \
Query.o \
RdbCache.o RdbDump.o RdbMem.o RdbMerge.o RdbScan.o RdbTree.o \
Rebalance.o Repair.o RobotRule.o Robots.o \
SpiderdbSqlite.o \
SpiderdbRdbSqliteBridge.o \
Sanity.o ScalingFunctions.o SearchInput.o SiteGetter.o Speller.o SpiderProxy.o Stats.o SummaryCache.o Synonyms.o \
Tagdb.o TcpServer.o Titledb.o \
Version.o \
Msg0.cpp (4 changes)
@@ -164,7 +164,7 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
// get groupid from hostid here lest we core in getGroupId() below.
// it does that for dumping spiderdb to the client browser. they
// can download the whole enchilada.
if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB_DEPRECATED )
m_shardNum = 0;
// did they force it? core until i figure out what this is
else if ( forceParitySplit >= 0 )
@@ -197,7 +197,7 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;

//if it is spiderdb then we only have it it we are a spider host too
if((rdbId == RDB_SPIDERDB || rdbId == RDB2_SPIDERDB2) &&
if((rdbId == RDB_SPIDERDB_DEPRECATED || rdbId == RDB2_SPIDERDB2_DEPRECATED) &&
isLocal &&
!g_hostdb.getMyHost()->m_spiderEnabled)
{
Msg3.cpp (2 changes)
@@ -160,7 +160,7 @@ class RdbCache *getDiskPageCache ( rdbid_t rdbId ) {
maxRecs = maxMem / 3000;
dbname = "titdbcache";
break;
case RDB_SPIDERDB:
case RDB_SPIDERDB_DEPRECATED:
rpc = &g_rdbCaches[4];
maxMem = g_conf.m_spiderdbFileCacheSize;
maxRecs = maxMem / 3000;
Msg4In.cpp (204 changes)
@@ -11,6 +11,7 @@
#include "ip.h"
#include "Mem.h"
#include "Titledb.h" // for Titledb::validateSerializedRecord
#include "SpiderdbRdbSqliteBridge.h"
#include <sys/stat.h> //stat()
#include <fcntl.h>

@@ -240,59 +241,71 @@ static bool Msg4In::addMetaList(const char *p, UdpSlot *slot) {
Titledb::validateSerializedRecord( rec, recSize );
}

// . get the rdb to which it belongs, use Msg0::getRdb()
// . do not call this for every rec if we do not have to
if (rdbId != lastRdbId || !rdb) {
rdb = getRdbFromId(rdbId);
if(rdbId!=RDB_SPIDERDB_DEPRECATED && rdbId!=RDB2_SPIDERDB2_DEPRECATED) {
// . get the rdb to which it belongs, use Msg0::getRdb()
// . do not call this for every rec if we do not have to
if (rdbId != lastRdbId || !rdb) {
rdb = getRdbFromId(rdbId);

if (!rdb) {
char ipbuf[16];
log(LOG_WARN, "msg4: rdbId of %" PRId32" unrecognized from hostip=%s. dropping WHOLE request",
(int32_t)rdbId, slot ? iptoa(slot->getIp(),ipbuf) : "unknown");
if (!rdb) {
char ipbuf[16];
log(LOG_WARN, "msg4: rdbId of %" PRId32" unrecognized from hostip=%s. dropping WHOLE request",
(int32_t)rdbId, slot ? iptoa(slot->getIp(),ipbuf) : "unknown");
gbshutdownAbort(true);
}

// an uninitialized secondary rdb?
// don't core any more, we probably restarted this shard
// and it needs to wait for host #0 to syncs its
// g_conf.m_repairingEnabled to '1' so it can start its
// Repair.cpp repairWrapper() loop and init the secondary
// rdbs so "rdb" here won't be NULL any more.
if (!rdb->isInitialized()) {
time_t currentTime = getTime();
static time_t s_lastTime = 0;
if (currentTime > s_lastTime + 10) {
s_lastTime = currentTime;
log(LOG_WARN, "msg4: oops. got an rdbId key for a secondary "
"rdb and not in repair mode. waiting to be in repair mode.");
}
g_errno = ETRYAGAIN;
return false;
}
}

// if we don't have data, recSize must be the same with keySize
if (rdb->getFixedDataSize() == 0 && recSize != rdb->getKeySize()) {
gbshutdownAbort(true);
}

// an uninitialized secondary rdb?
// don't core any more, we probably restarted this shard
// and it needs to wait for host #0 to syncs its
// g_conf.m_repairingEnabled to '1' so it can start its
// Repair.cpp repairWrapper() loop and init the secondary
// rdbs so "rdb" here won't be NULL any more.
if (!rdb->isInitialized()) {
time_t currentTime = getTime();
static time_t s_lastTime = 0;
if (currentTime > s_lastTime + 10) {
s_lastTime = currentTime;
log(LOG_WARN, "msg4: oops. got an rdbId key for a secondary "
"rdb and not in repair mode. waiting to be in repair mode.");
}
g_errno = ETRYAGAIN;
return false;
auto &rdbItem = rdbItems[rdbId];
++rdbItem.m_numRecs;

int32_t dataSize = recSize - rdb->getKeySize();
if (rdb->getFixedDataSize() == -1) {
dataSize -= 4;
}

rdbItem.m_dataSizes += dataSize;

rdbItem.m_items.emplace_back(collnum, rec, recSize);
} else {
//spiderdb records no longer reside in an Rdb

// don't add to spiderdb when we're nospider host
if (!g_hostdb.getMyHost()->m_spiderEnabled)
continue;

auto &rdbItem = rdbItems[rdbId];
++rdbItem.m_numRecs;

int32_t dataSize = recSize - sizeof(key128_t) - 4;

rdbItem.m_dataSizes += dataSize;

rdbItem.m_items.emplace_back(collnum, rec, recSize);
}

// if we don't have data, recSize must be the same with keySize
if (rdb->getFixedDataSize() == 0 && recSize != rdb->getKeySize()) {
gbshutdownAbort(true);
}

// don't add to spiderdb when we're nospider host
if (!g_hostdb.getMyHost()->m_spiderEnabled && (rdbId == RDB_SPIDERDB || rdbId == RDB2_SPIDERDB2)) {
continue;
}

auto &rdbItem = rdbItems[rdbId];
++rdbItem.m_numRecs;

int32_t dataSize = recSize - rdb->getKeySize();
if (rdb->getFixedDataSize() == -1) {
dataSize -= 4;
}

rdbItem.m_dataSizes += dataSize;

rdbItem.m_items.emplace_back(collnum, rec, recSize);

// advance over the rec data to point to next entry
p += recSize;
}
@@ -300,12 +313,14 @@ static bool Msg4In::addMetaList(const char *p, UdpSlot *slot) {
bool hasRoom = true;
bool anyDumping = false;
for (auto const &rdbItem : rdbItems) {
Rdb *rdb = getRdbFromId(rdbItem.first);
if (rdb->isDumping()) {
anyDumping = true;
} else if (!rdb->hasRoom(rdbItem.second.m_numRecs, rdbItem.second.m_dataSizes)) {
rdb->submitRdbDumpJob(true);
hasRoom = false;
if(rdbItem.first!=RDB_SPIDERDB_DEPRECATED && rdbItem.first!=RDB2_SPIDERDB2_DEPRECATED) {
Rdb *rdb = getRdbFromId(rdbItem.first);
if (rdb->isDumping()) {
anyDumping = true;
} else if (!rdb->hasRoom(rdbItem.second.m_numRecs, rdbItem.second.m_dataSizes)) {
rdb->submitRdbDumpJob(true);
hasRoom = false;
}
}
}

@@ -323,53 +338,66 @@ static bool Msg4In::addMetaList(const char *p, UdpSlot *slot) {

for (auto const &rdbItem : rdbItems) {
Rdb *rdb = getRdbFromId(rdbItem.first);
if(rdbItem.first!=RDB_SPIDERDB_DEPRECATED && rdbItem.first!=RDB2_SPIDERDB2_DEPRECATED) {
Rdb *rdb = getRdbFromId(rdbItem.first);

bool status = false;
for (auto const &item : rdbItem.second.m_items) {
// reset g_errno
g_errno = 0;

// . make a list from this data
// . skip over the first 4 bytes which is the rdbId
// . TODO: embed the rdbId in the msgtype or something...
RdbList list;

// set the list
// todo: dodgy cast to char*. RdbList should be fixed
list.set((char *)item.m_rec, item.m_recSize, (char *)item.m_rec, item.m_recSize,
rdb->getFixedDataSize(), false, rdb->useHalfKeys(), rdb->getKeySize());

// keep track of stats
rdb->readRequestAdd(item.m_recSize);

// this returns false and sets g_errno on error
status = rdb->addListNoSpaceCheck(item.m_collNum, &list);

// bad coll #? ignore it. common when deleting and resetting
// collections using crawlbot. but there are other recs in this
// list from different collections, so do not abandon the whole
// meta list!! otherwise we lose data!!
if (g_errno == ENOCOLLREC && !status) {
bool status = true;
for (auto const &item : rdbItem.second.m_items) {
// reset g_errno
g_errno = 0;
status = true;

// . make a list from this data
// . skip over the first 4 bytes which is the rdbId
// . TODO: embed the rdbId in the msgtype or something...
RdbList list;

// set the list
// todo: dodgy cast to char*. RdbList should be fixed
list.set((char *)item.m_rec, item.m_recSize, (char *)item.m_rec, item.m_recSize,
rdb->getFixedDataSize(), false, rdb->useHalfKeys(), rdb->getKeySize());

// keep track of stats
rdb->readRequestAdd(item.m_recSize);

// this returns false and sets g_errno on error
status = rdb->addListNoSpaceCheck(item.m_collNum, &list);

// bad coll #? ignore it. common when deleting and resetting
// collections using crawlbot. but there are other recs in this
// list from different collections, so do not abandon the whole
// meta list!! otherwise we lose data!!
if (g_errno == ENOCOLLREC && !status) {
g_errno = 0;
status = true;
}

if (!status) {
break;
}
}

if (!status) {
break;
}
}

if (!status) {
break;
} else {
bool status = true;
for(auto const &item : rdbItem.second.m_items) {
status = SpiderdbRdbSqliteBridge::addRecord(item.m_collNum, item.m_rec, item.m_recSize);
if(!status)
break;
}
if(!status)
break;
}
}

// verify integrity if wanted
if (g_conf.m_verifyTreeIntegrity) {
for (auto const &rdbItem : rdbItems) {
Rdb *rdb = getRdbFromId(rdbItem.first);
rdb->verifyTreeIntegrity();
for(auto const &rdbItem : rdbItems) {
if(rdbItem.first!=RDB_SPIDERDB_DEPRECATED && rdbItem.first!=RDB2_SPIDERDB2_DEPRECATED) {
Rdb *rdb = getRdbFromId(rdbItem.first);
rdb->verifyTreeIntegrity();
}
}
}
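Note: the net effect of the restructure above is a two-way dispatch: meta-list records tagged with the deprecated spiderdb rdbids bypass the Rdb tree and go to the sqlite bridge, while everything else takes the old path. A condensed, runnable sketch; the enum values and handler functions are stand-ins, not the real Msg4In code.

#include <cstdio>

enum rdbid_t { RDB_POSDB, RDB_SPIDERDB_DEPRECATED, RDB2_SPIDERDB2_DEPRECATED };

static bool addToRdbTree(rdbid_t id)      { printf("rdb tree add: %d\n", id); return true; }
static bool addToSqliteBridge(rdbid_t id) { printf("sqlite add: %d\n", id);   return true; }

static bool route(rdbid_t id) {
	if (id != RDB_SPIDERDB_DEPRECATED && id != RDB2_SPIDERDB2_DEPRECATED)
		return addToRdbTree(id);      // posdb, titledb, ... unchanged path
	return addToSqliteBridge(id);         // spiderdb records now live in sqlite
}

int main() { route(RDB_POSDB); route(RDB_SPIDERDB_DEPRECATED); }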
Msg5.cpp (4 changes)
@@ -411,7 +411,7 @@ bool Msg5::readList ( ) {
// . we set endKey for spiderdb when reading from tree above
// based on the current minRecSizes so do not mess with it
// in that case.
if ( m_rdbId != RDB_SPIDERDB ) {
if ( m_rdbId != RDB_SPIDERDB_DEPRECATED ) {
//m_newMinRecSizes += rs * numNegativeRecs;
int32_t nn = m_newMinRecSizes + rs * numNegativeRecs;
if ( rs > 0 && nn < m_newMinRecSizes ) nn = 0x7fffffff;
@@ -540,7 +540,7 @@ bool Msg5::needsRecall() {
}

// limit to just doledb for now in case it results in data loss
if( rc && m_readAbsolutelyNothing && (m_rdbId==RDB_DOLEDB||m_rdbId==RDB_SPIDERDB) ) {
if( rc && m_readAbsolutelyNothing && (m_rdbId==RDB_DOLEDB||m_rdbId==RDB_SPIDERDB_DEPRECATED) ) {
rc = false;
}
PageCrawlBot.cpp
@@ -140,7 +140,7 @@ bool getSpiderRequestMetaList ( const char *doc, SafeBuf *listBuf, bool spiderLi
}

// store rdbid first
if ( ! listBuf->pushChar(RDB_SPIDERDB) ) {
if ( ! listBuf->pushChar(RDB_SPIDERDB_DEPRECATED) ) {
// return false with g_errno set
return false;
}
PageReindex.cpp
@@ -453,7 +453,7 @@ bool Msg1c::gotList ( ) {

log("reindex: adding docid list (docids:%d) to spiderdb", m_numDocIdsAdded);

return m_msg4.addMetaList(&m_sb, m_collnum, this, addedListWrapper, RDB_SPIDERDB);
return m_msg4.addMetaList(&m_sb, m_collnum, this, addedListWrapper, RDB_SPIDERDB_DEPRECATED);
}

void addedListWrapper ( void *state ) {
PageSpiderdbLookup.cpp
@@ -152,7 +152,7 @@ static bool getSpiderRecs(State *st) {
key128_t endKey = Spiderdb::makeLastKey(st->m_firstip, uh48);
log(LOG_TRACE,"PageSpiderdbLookup: getSpiderRecs(%p): Calling Msg0::getList()", st);
if(!st->m_msg0.getList(-1, //hostId
RDB_SPIDERDB,
RDB_SPIDERDB_DEPRECATED, //TODO: use rdb_spiderdb_sqlite and new record format (also much simpler)
st->m_collnum,
&st->m_rdbList,
(const char*)&startKey,
@@ -226,7 +226,7 @@ static bool sendResult(State *st) {
if(st->m_url_str[0]) {
int64_t uh48 = hash64b(st->m_url_str);
key128_t startKey = Spiderdb::makeFirstKey(st->m_firstip, uh48);
uint32_t shardNum = g_hostdb.getShardNum(RDB_SPIDERDB, &startKey);
uint32_t shardNum = g_hostdb.getShardNum(RDB_SPIDERDB_SQLITE, &startKey);
sb.safePrintf("<p>Shard: %u</p>\n", shardNum);
int32_t numHosts;
const Host *host = g_hostdb.getShard(shardNum, &numHosts);
PageStats.cpp
@@ -1420,13 +1420,13 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
const Rdb *rdbs[] = {
g_posdb.getRdb(),
g_titledb.getRdb(),
g_spiderdb.getRdb(),
g_doledb.getRdb() ,
g_tagdb.getRdb(),
g_clusterdb.getRdb(),
g_linkdb.getRdb(),
};
int32_t nr = sizeof(rdbs) / sizeof(Rdb *);
//TODO: sqlite: show statistics for sqlite database(s)

// print dbname
p.safePrintf("<tr class=poo><td> </td>");
Parms.cpp (19 changes)
@@ -700,11 +700,6 @@ static bool CommandMergeTitledb(const char *rec) {
}

static bool CommandMergeSpiderdb(const char *rec) {
forceMergeAll(RDB_SPIDERDB);
return true;
}

static bool CommandMergeLinkdb(const char *rec) {
forceMergeAll(RDB_LINKDB);
return true;
@@ -729,7 +724,7 @@ static bool CommandForceIt(const char *rec) {
static bool CommandDiskDump(const char *rec) {
g_clusterdb.getRdb()->submitRdbDumpJob(true);
g_tagdb.getRdb()->submitRdbDumpJob(true);
g_spiderdb.getRdb()->submitRdbDumpJob(true);
g_spiderdb.getRdb_deprecated()->submitRdbDumpJob(true);
g_posdb.getRdb()->submitRdbDumpJob(true);
g_titledb.getRdb()->submitRdbDumpJob(true);
g_linkdb.getRdb()->submitRdbDumpJob(true);
@@ -5071,18 +5066,6 @@ void Parms::init ( ) {
m->m_obj = OBJ_CONF;
m++;

m->m_title = "tight merge spiderdb";
m->m_desc = "Merges all outstanding spiderdb files.";
m->m_cgi = "spmerge";
m->m_type = TYPE_CMD;
m->m_func = CommandMergeSpiderdb;
m->m_cast = true;
m->m_group = false;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;

m->m_title = "tight merge linkdb";
m->m_desc = "Merges all outstanding linkdb files.";
m->m_cgi = "lmerge";
Process.cpp
@@ -265,7 +265,7 @@ bool Process::init ( ) {
// followed by titledb perhaps...
m_rdbs[m_numRdbs++] = g_titledb.getRdb ();
m_rdbs[m_numRdbs++] = g_posdb.getRdb ();
m_rdbs[m_numRdbs++] = g_spiderdb.getRdb ();
m_rdbs[m_numRdbs++] = g_spiderdb.getRdb_deprecated();
m_rdbs[m_numRdbs++] = g_clusterdb.getRdb ();
m_rdbs[m_numRdbs++] = g_tagdb.getRdb ();
m_rdbs[m_numRdbs++] = g_linkdb.getRdb ();
@@ -274,7 +274,7 @@ bool Process::init ( ) {
m_rdbs[m_numRdbs++] = g_doledb.getRdb ();
m_rdbs[m_numRdbs++] = g_titledb2.getRdb ();
m_rdbs[m_numRdbs++] = g_posdb2.getRdb ();
m_rdbs[m_numRdbs++] = g_spiderdb2.getRdb ();
m_rdbs[m_numRdbs++] = g_spiderdb2.getRdb_deprecated();
m_rdbs[m_numRdbs++] = g_clusterdb2.getRdb ();
m_rdbs[m_numRdbs++] = g_linkdb2.getRdb ();
m_rdbs[m_numRdbs++] = g_tagdb2.getRdb ();
Rdb.cpp (166 changes)
@@ -137,13 +137,15 @@ bool Rdb::init(const char *dbname,
case RDB2_POSDB2:
case RDB_TITLEDB:
case RDB2_TITLEDB2:
case RDB_SPIDERDB:
case RDB_SPIDERDB_DEPRECATED:
case RDB_DOLEDB:
case RDB2_SPIDERDB2:
case RDB2_SPIDERDB2_DEPRECATED:
case RDB_LINKDB:
case RDB2_LINKDB2:
m_pageSize = GB_INDEXDB_PAGE_SIZE;
break;
// Not a real rdb: case RDB_SPIDERDB_SQLITE:
// Not a real rdb: case RDB2_SPIDERDB2_SQLITE:
default:
m_pageSize = GB_TFNDB_PAGE_SIZE;
}
@@ -1062,14 +1064,14 @@ void attemptMergeAll() {
RDB_TITLEDB,
RDB_TAGDB,
RDB_LINKDB,
RDB_SPIDERDB,
RDB_SPIDERDB_DEPRECATED,
RDB_CLUSTERDB,
// also try to merge on rdbs being rebuilt
RDB2_POSDB2,
RDB2_TITLEDB2,
RDB2_TAGDB2,
RDB2_LINKDB2,
RDB2_SPIDERDB2,
RDB2_SPIDERDB2_DEPRECATED,
RDB2_CLUSTERDB2
};
static const unsigned numRdbs = sizeof(rdbid)/sizeof(rdbid[0]);
@@ -1130,7 +1132,7 @@ bool Rdb::addList(collnum_t collnum, RdbList *list, bool checkForRoom) {
m_rdbId == RDB_CLUSTERDB ||
m_rdbId == RDB_LINKDB ||
m_rdbId == RDB_DOLEDB ||
m_rdbId == RDB_SPIDERDB ) ) {
m_rdbId == RDB_SPIDERDB_DEPRECATED ) ) {

// allow banning of sites still
log(LOG_WARN, "db: How did an add come in while in repair mode? rdbName=%s", getDbnameFromId(m_rdbId));
@@ -1512,46 +1514,6 @@ bool Rdb::addRecord(collnum_t collnum, const char *key, const char *data, int32_
}
}

// . cancel any spider request that is a dup in the dupcache to save disk space
// . twins might have different dupcaches so they might have different dups,
// but it shouldn't be a big deal because they are dups!
if (m_rdbId == RDB_SPIDERDB && !KEYNEG(key)) {
// . this will create it if spiders are on and its NULL
// . even if spiders are off we need to create it so
// that the request can adds its ip to the waitingTree
SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);

// skip if not there
if (!sc) {
logTrace(g_conf.m_logTraceRdb, "END. %s: No spider coll. Returning true", m_dbname);
return true;
}

/// @todo ALC we're making an assumption that data passed in is part of a SpiderRequest (fix this!)
const SpiderRequest *sreq = reinterpret_cast<const SpiderRequest *>(data - 4 - sizeof(key128_t));

// is it really a request and not a SpiderReply?
if (Spiderdb::isSpiderRequest(&(sreq->m_key))) {
// skip if in dup cache. do NOT add to cache since
// addToWaitingTree() in Spider.cpp will do that when called
// from addSpiderRequest() below
if (sc->isInDupCache(sreq, false)) {
logDebug(g_conf.m_logDebugSpider, "spider: adding spider req %s is dup. skipping.", sreq->m_url);
logTrace(g_conf.m_logTraceRdb, "END. %s: Duplicated spider req. Returning true", m_dbname);
return true;
}

// if we are overflowing...
if (!sreq->m_isAddUrl && !sreq->m_isPageReindex && !sreq->m_urlIsDocId && !sreq->m_forceDelete &&
sc->isFirstIpInOverflowList(sreq->m_firstIp)) {
g_stats.m_totalOverflows++;
logDebug(g_conf.m_logDebugSpider, "spider: skipping for overflow url %s ", sreq->m_url);
logTrace(g_conf.m_logTraceRdb, "END. %s: Overflow. Returning true", m_dbname);
return true;
}
}
}

if (m_useTree) {
if (!m_tree.addNode(collnum, key, dataCopy, dataSize)) {
log(LOG_INFO, "db: Had error adding data to %s: %s", m_dbname, mstrerror(g_errno));
@@ -1577,7 +1539,7 @@ bool Rdb::addRecord(collnum_t collnum, const char *key, const char *data, int32_
}

// if adding to spiderdb, add to cache, too (except negative key)
if ((m_rdbId == RDB_SPIDERDB || m_rdbId == RDB_DOLEDB) && !KEYNEG(key)) {
if (m_rdbId == RDB_DOLEDB && !KEYNEG(key)) {
// . this will create it if spiders are on and its NULL
// . even if spiders are off we need to create it so
// that the request can adds its ip to the waitingTree
@@ -1588,91 +1550,27 @@ bool Rdb::addRecord(collnum_t collnum, const char *key, const char *data, int32_
return true;
}

// if doing doledb...
if (m_rdbId == RDB_DOLEDB) {
int32_t pri = Doledb::getPriority((key96_t *)key);
// skip over corruption
if (pri < 0 || pri >= MAX_SPIDER_PRIORITIES) {
logTrace(g_conf.m_logTraceRdb, "END. %s: Done. Skip over corruption", m_dbname);
return true;
}
// if added positive key is before cursor, update curso
if (KEYCMP(key, (char *)&sc->m_nextKeys[pri], sizeof(key96_t)) < 0) {
KEYSET((char *)&sc->m_nextKeys[pri], key, sizeof(key96_t));

if (g_conf.m_logDebugSpider) {
char keyStrBuf[MAX_KEYSTR_BYTES];
KEYSTR(key, 12, keyStrBuf);
logDebug(g_conf.m_logDebugSpider, "spider: cursor reset pri=%" PRId32" to %s", pri, keyStrBuf);
}
}

logTrace(g_conf.m_logTraceRdb, "END. %s: Done. For doledb. Returning true", m_dbname);

// that's it for doledb mods
int32_t pri = Doledb::getPriority((key96_t *)key);
// skip over corruption
if (pri < 0 || pri >= MAX_SPIDER_PRIORITIES) {
logTrace(g_conf.m_logTraceRdb, "END. %s: Done. Skip over corruption", m_dbname);
return true;
}

// . ok, now add that reply to the cache

/// @todo ALC we're making an assumption that data passed in is part of a SpiderRequest (fix this!)
// assume this is the rec (4 byte dataSize,spiderdb key is now 16 bytes)
const SpiderRequest *sreq = reinterpret_cast<const SpiderRequest *>(data - 4 - sizeof(key128_t));

// is it really a request and not a SpiderReply?
if (Spiderdb::isSpiderRequest(&sreq->m_key)) {
// add the request
// if added positive key is before cursor, update curso
if (KEYCMP(key, (char *)&sc->m_nextKeys[pri], sizeof(key96_t)) < 0) {
KEYSET((char *)&sc->m_nextKeys[pri], key, sizeof(key96_t));

if (g_conf.m_logDebugSpider) {
// log that. why isn't this undoling always
char keyStrBuf[MAX_KEYSTR_BYTES];
KEYSTR((const char *)&sreq->m_key, sizeof(key128_t), keyStrBuf);

char ipbuf[16];
logDebug(g_conf.m_logDebugSpider, "spider: rdb: added spider request to spiderdb rdb tree"
" request for uh48=%" PRIu64" prntdocid=%" PRIu64" firstIp=%s spiderdbkey=%s",
sreq->getUrlHash48(), sreq->getParentDocId(), iptoa(sreq->m_firstIp,ipbuf), keyStrBuf);
}

// false means to NOT call evaluateAllRequests()
// because we call it below. the reason we do this
// is because it does not always get called
// in addSpiderRequest(), like if its a dup and
// gets "nuked". (removed callEval arg since not
// really needed)
sc->addSpiderRequest(sreq, gettimeofdayInMilliseconds());
} else {
// otherwise repl
SpiderReply *rr = (SpiderReply *)sreq;

// log that. why isn't this undoling always
logDebug(g_conf.m_logDebugSpider, "rdb: rdb: got spider reply for uh48=%" PRIu64, rr->getUrlHash48());

// add the reply
sc->addSpiderReply(rr);

/// @todo ALC why are we removing this here? this check should be at where we're trying to insert this
// don't actually add it if "fake". i.e. if it
// was an internal error of some sort... this will
// make it try over and over again i guess...
// no because we need some kinda reply so that gb knows
// the pagereindex docid-based spider requests are done,
// at least for now, because the replies were not being
// added for now. just for internal errors at least...
// we were not adding spider replies to the page reindexes
// as they completed and when i tried to rerun it
// the title recs were not found since they were deleted,
// so we gotta add the replies now.
int32_t indexCode = rr->m_errCode;
if (indexCode == EABANDONED) {
log(LOG_WARN, "rdb: not adding spiderreply to rdb because it was an internal error for uh48=%" PRIu64
" errCode = %s", rr->getUrlHash48(), mstrerror(indexCode));
m_tree.deleteNode(collnum, key, false);
KEYSTR(key, 12, keyStrBuf);
logDebug(g_conf.m_logDebugSpider, "spider: cursor reset pri=%" PRId32" to %s", pri, keyStrBuf);
}
}

// clear errors from adding to SpiderCache
g_errno = 0;
logTrace(g_conf.m_logTraceRdb, "END. %s: Done. For doledb. Returning true", m_dbname);

// that's it for doledb mods
return true;
}

logTrace(g_conf.m_logTraceRdb, "END. %s: Done. Returning true", m_dbname);
@@ -1867,14 +1765,14 @@ Rdb *getRdbFromId ( rdbid_t rdbId ) {
case RDB_TAGDB: return g_tagdb.getRdb();
case RDB_POSDB: return g_posdb.getRdb();
case RDB_TITLEDB: return g_titledb.getRdb();
case RDB_SPIDERDB: return g_spiderdb.getRdb();
case RDB_SPIDERDB_DEPRECATED: return g_spiderdb.getRdb_deprecated();
case RDB_DOLEDB: return g_doledb.getRdb();
case RDB_CLUSTERDB: return g_clusterdb.getRdb();
case RDB_LINKDB: return g_linkdb.getRdb();

case RDB2_POSDB2: return g_posdb2.getRdb();
case RDB2_TITLEDB2: return g_titledb2.getRdb();
case RDB2_SPIDERDB2: return g_spiderdb2.getRdb();
case RDB2_SPIDERDB2_DEPRECATED: return g_spiderdb2.getRdb_deprecated();
case RDB2_CLUSTERDB2: return g_clusterdb2.getRdb();
case RDB2_LINKDB2: return g_linkdb2.getRdb();
case RDB2_TAGDB2: return g_tagdb2.getRdb();
@@ -1888,14 +1786,14 @@ rdbid_t getIdFromRdb ( Rdb *rdb ) {
if ( rdb == g_tagdb.getRdb () ) return RDB_TAGDB;
if ( rdb == g_posdb.getRdb () ) return RDB_POSDB;
if ( rdb == g_titledb.getRdb () ) return RDB_TITLEDB;
if ( rdb == g_spiderdb.getRdb () ) return RDB_SPIDERDB;
if ( rdb == g_spiderdb.getRdb_deprecated() ) return RDB_SPIDERDB_DEPRECATED;
if ( rdb == g_doledb.getRdb () ) return RDB_DOLEDB;
if ( rdb == g_clusterdb.getRdb () ) return RDB_CLUSTERDB;
if ( rdb == g_linkdb.getRdb () ) return RDB_LINKDB;
if ( rdb == g_posdb2.getRdb () ) return RDB2_POSDB2;
if ( rdb == g_tagdb2.getRdb () ) return RDB2_TAGDB2;
if ( rdb == g_titledb2.getRdb () ) return RDB2_TITLEDB2;
if ( rdb == g_spiderdb2.getRdb () ) return RDB2_SPIDERDB2;
if ( rdb == g_spiderdb2.getRdb_deprecated() ) return RDB2_SPIDERDB2_DEPRECATED;
if ( rdb == g_clusterdb2.getRdb () ) return RDB2_CLUSTERDB2;
if ( rdb == g_linkdb2.getRdb () ) return RDB2_LINKDB2;

@@ -1908,9 +1806,11 @@ bool isSecondaryRdb ( rdbid_t rdbId ) {
case RDB2_POSDB2 : return true;
case RDB2_TAGDB2 : return true;
case RDB2_TITLEDB2 : return true;
case RDB2_SPIDERDB2 : return true;
case RDB2_SPIDERDB2_DEPRECATED : return true;
case RDB2_CLUSTERDB2 : return true;
case RDB2_LINKDB2 : return true;
case RDB2_SPIDERDB2_SQLITE : return true;
//(todo?) rdb2_spiderdb2_sqlite
default:
return false;
}
@@ -1919,8 +1819,8 @@ bool isSecondaryRdb ( rdbid_t rdbId ) {
// use a quick table now...
char getKeySizeFromRdbId(rdbid_t rdbId) {
switch(rdbId) {
case RDB_SPIDERDB:
case RDB2_SPIDERDB2:
case RDB_SPIDERDB_DEPRECATED:
case RDB2_SPIDERDB2_DEPRECATED:
case RDB_TAGDB:
case RDB2_TAGDB2:
return sizeof(key128_t); // 16
@@ -1958,7 +1858,8 @@ int32_t getDataSizeFromRdbId ( rdbid_t rdbId ) {
ds = 0;
else if ( i == RDB_TITLEDB ||
i == RDB_TAGDB ||
i == RDB_SPIDERDB ||
i == RDB_SPIDERDB_DEPRECATED ||
i == RDB_SPIDERDB_SQLITE ||
i == RDB_DOLEDB )
ds = -1;
else if ( i == RDB2_POSDB2 ||
@@ -1967,7 +1868,8 @@ int32_t getDataSizeFromRdbId ( rdbid_t rdbId ) {
ds = 0;
else if ( i == RDB2_TITLEDB2 ||
i == RDB2_TAGDB2 ||
i == RDB2_SPIDERDB2 )
i == RDB2_SPIDERDB2_DEPRECATED ||
i == RDB2_SPIDERDB2_SQLITE )
ds = -1;
else {
continue;
RdbBase.cpp (12 changes)
@@ -470,7 +470,7 @@ bool RdbBase::setFiles ( ) {
return false;

// spiderdb should start with file 0001.dat or 0000.dat
if ( m_numFiles > 0 && m_fileInfo[0].m_fileId > 1 && m_rdb->getRdbId() == RDB_SPIDERDB ) {
if ( m_numFiles > 0 && m_fileInfo[0].m_fileId > 1 && m_rdb->getRdbId() == RDB_SPIDERDB_DEPRECATED ) {
//isj: is that even true anymore? Ok, crashed merges and lost file0000* are not a
//good thing but I don't see why it should affect spiderdb especially bad.
return fixNonfirstSpiderdbFiles();
@@ -1194,8 +1194,8 @@ static int32_t getMaxLostPositivesPercentage(rdbid_t rdbId) {
case RDB_TITLEDB:
case RDB2_TITLEDB2:
return g_conf.m_titledbMaxLostPositivesPercentage;
case RDB_SPIDERDB:
case RDB2_SPIDERDB2:
case RDB_SPIDERDB_DEPRECATED:
case RDB2_SPIDERDB2_DEPRECATED:
return g_conf.m_spiderdbMaxLostPositivesPercentage;
case RDB_LINKDB:
case RDB2_LINKDB2:
@@ -1675,7 +1675,7 @@ int32_t RdbBase::getMinToMerge(const CollectionRec *cr, rdbid_t rdbId, int32_t m
result = cr->m_titledbMinFilesToMerge;
logTrace(g_conf.m_logTraceRdbBase, "titledb. m_minToMerge: %d", m_minToMerge);
break;
case RDB_SPIDERDB:
case RDB_SPIDERDB_DEPRECATED:
result = cr->m_spiderdbMinFilesToMerge;
logTrace(g_conf.m_logTraceRdbBase, "spiderdb. m_minToMerge: %d", m_minToMerge);
break;
@@ -1745,7 +1745,7 @@ bool RdbBase::attemptMerge(int32_t niceness, bool forceMergeAll, int32_t minToMe
if (forceMergeAll) {
log(LOG_INFO,"merge: forcing merge for %s. (collnum=%" PRId32")",m_dbname,(int32_t)m_collnum);

if (m_rdb->getRdbId() == RDB_SPIDERDB) {
if (m_rdb->getRdbId() == RDB_SPIDERDB_DEPRECATED) {
minMergeFileCount = 0;
}
}
@@ -1907,7 +1907,7 @@ bool RdbBase::attemptMerge(int32_t niceness, bool forceMergeAll, int32_t minToMe
}

if (mergeFileCount == 1) {
int logLevel = (m_rdb->getRdbId() == RDB_SPIDERDB) ? LOG_INFO : LOG_LOGIC;
int logLevel = (m_rdb->getRdbId() == RDB_SPIDERDB_DEPRECATED) ? LOG_INFO : LOG_LOGIC;
log(logLevel,"merge:attemptMerge:resuming: filename with single file merge for %s coll=%s file=%s",m_dbname,m_coll,m_fileInfo[i].m_file->getFilename());
}
RdbList.cpp
@@ -735,7 +735,7 @@ bool RdbList::checkList_r(bool abortOnProblem, rdbid_t rdbId) {
gbshutdownAbort(true); }
}
}
if ( rdbId == RDB_SPIDERDB && ! KEYNEG(k) &&
if ( rdbId == RDB_SPIDERDB_DEPRECATED && ! KEYNEG(k) &&
getCurrentDataSize() > 0 ) {
char *rec = getCurrentRec();
// bad url in spider request?
RdbMerge.cpp
@@ -527,7 +527,7 @@ void RdbMerge::filterListWrapper(void *state) {

logTrace(g_conf.m_logTraceRdbMerge, "BEGIN. list=%p m_startKey=%s", &THIS->m_list, KEYSTR(THIS->m_startKey, THIS->m_ks));

if (THIS->m_rdbId == RDB_SPIDERDB) {
if (THIS->m_rdbId == RDB_SPIDERDB_DEPRECATED) {
dedupSpiderdbList(&(THIS->m_list));
} else if (THIS->m_rdbId == RDB_TITLEDB) {
// filterTitledbList(&(THIS->m_list));
@@ -610,7 +610,7 @@ bool RdbMerge::filterList() {
// dedup for spiderdb before we dump it. try to save disk space.
//
/////
if (m_rdbId == RDB_SPIDERDB || m_rdbId == RDB_TITLEDB) {
if (m_rdbId == RDB_SPIDERDB_DEPRECATED || m_rdbId == RDB_TITLEDB) {
if (g_jobScheduler.submit(filterListWrapper, filterDoneWrapper, this, thread_type_merge_filter, 0)) {
return false;
}
@@ -618,7 +618,7 @@ bool RdbMerge::filterList() {
log(LOG_WARN, "db: Unable to submit job for merge filter. Will run in main thread");

// fall back to filter without thread
if (m_rdbId == RDB_SPIDERDB) {
if (m_rdbId == RDB_SPIDERDB_DEPRECATED) {
dedupSpiderdbList(&m_list);
} else {
// filterTitledbList(&m_list);
RdbTree.cpp
@@ -921,7 +921,7 @@ bool RdbTree::fixTree_unlocked() {

/// @todo ALC should we check repair RDB as well?
bool isTitledb = (m_rdbId == RDB_TITLEDB || m_rdbId == RDB2_TITLEDB2);
bool isSpiderdb = (m_rdbId == RDB_SPIDERDB || m_rdbId == RDB2_SPIDERDB2);
bool isSpiderdb = (m_rdbId == RDB_SPIDERDB_DEPRECATED || m_rdbId == RDB2_SPIDERDB2_DEPRECATED);

// now re-add the old nods to the tree, they should not be overwritten
// by addNode()
@@ -1007,7 +1007,7 @@ bool RdbTree::checkTree_unlocked(bool printMsgs, bool doChainTest) const {

/// @todo ALC should we check repair RDB as well?
bool isTitledb = (m_rdbId == RDB_TITLEDB || m_rdbId == RDB2_TITLEDB2);
bool isSpiderdb = (m_rdbId == RDB_SPIDERDB || m_rdbId == RDB2_SPIDERDB2);
bool isSpiderdb = (m_rdbId == RDB_SPIDERDB_DEPRECATED || m_rdbId == RDB2_SPIDERDB2_DEPRECATED);

// now check parent kid correlations
for ( int32_t i = 0 ; i < m_minUnusedNode ; i++ ) {
Repair.cpp (51 changes)
@@ -60,7 +60,6 @@ static Rdb **getSecondaryRdbs ( int32_t *nsr ) {

s_rdbs[s_nsr++] = g_titledb2.getRdb ();
s_rdbs[s_nsr++] = g_posdb2.getRdb ();
s_rdbs[s_nsr++] = g_spiderdb2.getRdb ();
s_rdbs[s_nsr++] = g_clusterdb2.getRdb ();
s_rdbs[s_nsr++] = g_linkdb2.getRdb ();
s_rdbs[s_nsr++] = g_tagdb2.getRdb ();
@@ -529,8 +528,6 @@ void Repair::initScan ( ) {

if ( m_rebuildClusterdb )
if ( ! g_clusterdb2.init2 ( clusterdbMem ) ) goto hadError;
if ( m_rebuildSpiderdb )
if ( ! g_spiderdb2.init2 ( spiderdbMem ) ) goto hadError;
if ( m_rebuildLinkdb )
if ( ! g_linkdb2.init2 ( linkdbMem ) ) goto hadError;

@@ -638,11 +635,6 @@ void Repair::getNextCollToRepair ( ) {
g_errno != EEXIST ) goto hadError;
}

if ( m_rebuildSpiderdb ) {
if ( ! g_spiderdb2.getRdb()->addRdbBase1 ( coll ) &&
g_errno != EEXIST ) goto hadError;
}

if ( m_rebuildLinkdb ) {
if ( ! g_linkdb2.getRdb()->addRdbBase1 ( coll ) &&
g_errno != EEXIST ) goto hadError;
@@ -924,11 +916,6 @@ void Repair::updateRdbs ( ) {
rdb2 = g_clusterdb2.getRdb();
rdb1->updateToRebuildFiles ( rdb2 , m_cr->m_coll );
}
if ( m_rebuildSpiderdb ) {
rdb1 = g_spiderdb.getRdb();
rdb2 = g_spiderdb2.getRdb();
rdb1->updateToRebuildFiles ( rdb2 , m_cr->m_coll );
}
if ( m_rebuildLinkdb ) {
rdb1 = g_linkdb.getRdb();
rdb2 = g_linkdb2.getRdb();
@@ -1356,12 +1343,6 @@ bool Repair::printRepairStatus(SafeBuf *sb) {
m_recsCorruptErrors +
m_recsDupDocIds ;

// the spiderdb scan stats (phase 2)
int64_t ns2 = m_spiderRecsScanned ;
int64_t nr2 = g_spiderdb.getRdb()->getNumTotalRecs() ;
float ratio2 = nr2 ? ((float)ns2 * 100.0) / (float)nr2 : 0.0;
int64_t errors2 = m_spiderRecSetErrors;

const char *newColl = " ";

const char *oldColl = " ";
@@ -1519,38 +1500,6 @@ bool Repair::printRepairStatus(SafeBuf *sb) {
);

sb->safePrintf(
// spider recs done
"<tr bgcolor=#%s><td><b>spider recs scanned</b></td>"
"<td>%" PRId64" of %" PRId64" (%.2f%%)</td></tr>\n"

// spider recs set errors, parsing errors, etc.
"<tr bgcolor=#%s><td><b>spider rec not "
"assigned to us</b></td>"
"<td>%" PRId32"</td></tr>\n"

// spider recs set errors, parsing errors, etc.
"<tr bgcolor=#%s><td><b>spider rec errors</b></td>"
"<td>%" PRId64"</td></tr>\n"

// spider recs set errors, parsing errors, etc.
"<tr bgcolor=#%s><td><b>spider rec bad tld</b></td>"
"<td>%" PRId32"</td></tr>\n"

,
LIGHT_BLUE ,
ns2 ,
nr2 ,
ratio2 ,
LIGHT_BLUE ,
m_spiderRecNotAssigned ,
LIGHT_BLUE ,
errors2,
LIGHT_BLUE ,
m_spiderRecBadTLD
);

int32_t nsr;
Rdb **rdbs = getSecondaryRdbs ( &nsr );
Spider.cpp
@@ -74,7 +74,7 @@ int32_t SpiderRequest::print(SafeBuf *sbarg) const {
SafeBuf tmp;
SafeBuf *sb = sbarg ? sbarg : &tmp;

sb->safePrintf("k=%s ", KEYSTR( this, getKeySizeFromRdbId( RDB_SPIDERDB ) ) );
sb->safePrintf("k=%s ", KEYSTR( this, getKeySizeFromRdbId( RDB_SPIDERDB_SQLITE ) ) );

// indicate it's a request not a reply
sb->safePrintf("REQ ");
@@ -128,7 +128,7 @@ int32_t SpiderRequest::print(SafeBuf *sbarg) const {

if ( m_avoidSpiderLinks ) sb->safePrintf("AVOIDSPIDERLINKS ");

int32_t shardNum = g_hostdb.getShardNum( RDB_SPIDERDB, this );
int32_t shardNum = g_hostdb.getShardNum( RDB_SPIDERDB_SQLITE, this );
sb->safePrintf("shardnum=%" PRIu32" ",(uint32_t)shardNum);

sb->safePrintf("url=%s",m_url);
@@ -999,7 +999,7 @@ bool updateSiteListBuf ( collnum_t collnum ,

// use spidercoll to contain this msg4 but if in use it
// won't be able to be deleted until it comes back..
if(!sc->m_msg4x.addMetaList(spiderReqBuf, sc->m_collnum, spiderReqBuf, doneAddingSeedsWrapper, RDB_SPIDERDB))
if(!sc->m_msg4x.addMetaList(spiderReqBuf, sc->m_collnum, spiderReqBuf, doneAddingSeedsWrapper, RDB_SPIDERDB_DEPRECATED))
return false;
else {
delete spiderReqBuf;
Spider.h (3 changes)
@@ -362,7 +362,8 @@ public:
// init the rebuild/secondary rdb, used by PageRepair.cpp
bool init2 ( int32_t treeMem );

Rdb *getRdb ( ) { return &m_rdb; }
// Rdb *getRdb ( ) { return &m_rdb; }
Rdb *getRdb_deprecated() { return &m_rdb; }

static int64_t getUrlHash48(const key128_t *k ) {
return (((k->n1)<<16) | k->n0>>(64-16)) & 0xffffffffffffLL;
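Note: a worked example of the uh48 extraction shown above. The key128_t layout here (n0 = lower 64 bits, n1 = upper 64 bits) is assumed for illustration only; the arithmetic is exactly the expression in the header.

#include <cstdint>
#include <cassert>

struct key128_t { uint64_t n0, n1; };  // layout assumed for illustration

// verbatim expression from Spider.h above
static int64_t getUrlHash48(const key128_t *k) {
	return (((k->n1) << 16) | k->n0 >> (64 - 16)) & 0xffffffffffffLL;
}

int main() {
	// place a 48-bit hash so its upper 32 bits sit in n1 and its
	// lower 16 bits sit in the top of n0
	uint64_t uh48 = 0x0000123456789ABCULL;
	key128_t k;
	k.n1 = uh48 >> 16;               // upper 32 bits of the hash
	k.n0 = (uh48 & 0xFFFF) << 48;    // lower 16 bits, left-aligned in n0
	assert(getUrlHash48(&k) == (int64_t)uh48);
}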
SpiderColl.cpp
@@ -13,6 +13,7 @@
#include "ip.h"
#include "Conf.h"
#include "Mem.h"
#include "SpiderdbRdbSqliteBridge.h"
#include "ScopedLock.h"
#include "Sanity.h"

@@ -1188,25 +1189,17 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
// make state
//int32_t state2 = (int32_t)m_cr->m_collnum;
// read the list from local disk
if ( !m_msg5b.getList(RDB_SPIDERDB,
m_cr->m_collnum,
&m_waitingTreeList,
&m_waitingTreeNextKey,
KEYMAX(),
SR_READ_SIZE, // minRecSizes (512k)
true, // includeTree
0, // startFileNum
-1, // numFiles (all)
this,//(void *)state2,//this//state
gotSpiderdbWaitingTreeListWrapper,
MAX_NICENESS, // niceness
true, // do error correct?
-1, // maxRetries
false)) // isRealMerge
if(!SpiderdbRdbSqliteBridge::getList(m_cr->m_collnum,
&m_waitingTreeList,
m_waitingTreeNextKey,
*(const key128_t*)KEYMAX(),
SR_READ_SIZE))
{
// return if blocked
logTrace( g_conf.m_logTraceSpider, "END, msg5b.getList blocked" );
return;
if(!g_errno) {
g_errno = EIO; //imprecise
logTrace( g_conf.m_logTraceSpider, "END, got io-error from sqlite" );
return;
}
}
}

@@ -2014,27 +2007,18 @@ bool SpiderColl::readListFromSpiderdb ( ) {
// end up timing out the round. so try checking for
// m_gettingList in spiderDoledUrls() and setting
// m_lastSpiderCouldLaunch
if ( ! m_msg5.getList ( RDB_SPIDERDB ,
m_cr->m_collnum ,
&m_list ,
&m_nextKey ,
&m_endKey ,
SR_READ_SIZE , // minRecSizes (512k)
true , // includeTree
0 , // startFileNum
-1 , // numFiles (all)
this,//(void *)state2,//this,//state
gotSpiderdbListWrapper ,
MAX_NICENESS , // niceness
true, // do error correct?
-1, // maxRetries
false)) // isRealMerge
if(!SpiderdbRdbSqliteBridge::getList(m_cr->m_collnum,
&m_list,
m_nextKey,
m_endKey,
SR_READ_SIZE))
{
// return false if blocked
logTrace( g_conf.m_logTraceSpider, "END, msg5.getList blocked" );
return false ;
if(!g_errno)
g_errno = EIO; //imprecise
logTrace( g_conf.m_logTraceSpider, "END, got io-error from sqlite" );
return true;
}

// note its return
logDebug( g_conf.m_logDebugSpider, "spider: back from msg5 spiderdb read of %" PRId32" bytes",m_list.getListSize());
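Note: both hunks above replace a callback-based Msg5 read with a synchronous bridge call that reports failure through its return value, so the caller synthesizes an errno-style code when the bridge did not set one. A minimal sketch of that convention with stand-in names; bridgeGetList and the global are not the real symbols.

#include <cerrno>

static int g_errno = 0;                       // stand-in for the global
static bool bridgeGetList() { return false; } // stand-in for the bridge call

static void readList() {
	if (!bridgeGetList()) {
		if (!g_errno)
			g_errno = EIO;  // "imprecise", as the hunks above say
	}
}

int main() { readList(); return g_errno ? 1 : 0; }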
SpiderdbHostDelete.cpp
@@ -151,70 +151,70 @@ void SpiderdbHostDelete::reload(int /*fd*/, void */*state*/) {
}

void SpiderdbHostDelete::processFile(void *item) {
FileItem *fileItem = static_cast<FileItem*>(item);

log(LOG_INFO, "Processing %s", fileItem->m_tmpFilename);

g_urlHostBlackList.load(fileItem->m_tmpFilename, fileItem->m_matchHost);

CollectionRec *collRec = g_collectiondb.getRec("main");
if (!collRec) {
gbshutdownLogicError();
}
RdbBase *base = collRec->getBase(RDB_SPIDERDB);
Rdb *rdb = g_spiderdb.getRdb();

if (!fileItem->m_resume) {
// dump tree
rdb->submitRdbDumpJob(true);

{
ScopedLock sl(s_sleepMtx);
while (!s_stop && rdb->hasPendingRdbDumpJob()) {
timespec ts;
clock_gettime(CLOCK_REALTIME, &ts);
ts.tv_sec += 1;

pthread_cond_timedwait(&s_sleepCond, &s_sleepMtx.mtx, &ts);
}

if (s_stop) {
delete fileItem;
return;
}
}
}

// tight merge (only force merge all when not resuming)
if (!base->attemptMerge(0, !fileItem->m_resume)) {
// unable to start merge
g_urlHostBlackList.unload();
delete fileItem;
return;
}

{
ScopedLock sl(s_sleepMtx);
while (!s_stop && rdb->isMerging()) {
timespec ts;
clock_gettime(CLOCK_REALTIME, &ts);
ts.tv_sec += 60;

pthread_cond_timedwait(&s_sleepCond, &s_sleepMtx.mtx, &ts);
}

if (s_stop) {
delete fileItem;
return;
}
}

log(LOG_INFO, "Processed %s", fileItem->m_tmpFilename);

g_urlHostBlackList.unload();

// delete files
unlink(fileItem->m_tmpFilename);

delete fileItem;
}
// FileItem *fileItem = static_cast<FileItem*>(item);
//
// log(LOG_INFO, "Processing %s", fileItem->m_tmpFilename);
//
// g_urlHostBlackList.load(fileItem->m_tmpFilename, fileItem->m_matchHost);
//
// CollectionRec *collRec = g_collectiondb.getRec("main");
// if (!collRec) {
// gbshutdownLogicError();
// }
// RdbBase *base = collRec->getBase(RDB_SPIDERDB);
// Rdb *rdb = g_spiderdb.getRdb();
//
// if (!fileItem->m_resume) {
// // dump tree
// rdb->submitRdbDumpJob(true);
//
// {
// ScopedLock sl(s_sleepMtx);
// while (!s_stop && rdb->hasPendingRdbDumpJob()) {
// timespec ts;
// clock_gettime(CLOCK_REALTIME, &ts);
// ts.tv_sec += 1;
//
// pthread_cond_timedwait(&s_sleepCond, &s_sleepMtx.mtx, &ts);
// }
//
// if (s_stop) {
// delete fileItem;
// return;
// }
// }
// }
//
// // tight merge (only force merge all when not resuming)
// if (!base->attemptMerge(0, !fileItem->m_resume)) {
// // unable to start merge
// g_urlHostBlackList.unload();
// delete fileItem;
// return;
// }
//
// {
// ScopedLock sl(s_sleepMtx);
// while (!s_stop && rdb->isMerging()) {
// timespec ts;
// clock_gettime(CLOCK_REALTIME, &ts);
// ts.tv_sec += 60;
//
// pthread_cond_timedwait(&s_sleepCond, &s_sleepMtx.mtx, &ts);
// }
//
// if (s_stop) {
// delete fileItem;
// return;
// }
// }
//
// log(LOG_INFO, "Processed %s", fileItem->m_tmpFilename);
//
// g_urlHostBlackList.unload();
//
// // delete files
// unlink(fileItem->m_tmpFilename);
//
// delete fileItem;
}
SpiderdbRdbSqliteBridge.cpp (new file, 466 lines)
@@ -0,0 +1,466 @@
#include "SpiderdbRdbSqliteBridge.h"
#include "Spider.h"
#include "SpiderdbSqlite.h"
#include "types.h"
#include "Sanity.h"
#include "Log.h"
#include "IOBuffer.h"
#include "Mem.h"
#include "Conf.h"


static bool addRequestRecord(sqlite3 *db, const void *record, size_t record_len);
static bool addReplyRecord(sqlite3 *db, const void *record, size_t record_len);


bool SpiderdbRdbSqliteBridge::addRecord(collnum_t collnum, const void *record, size_t record_len) {
if(KEYNEG((const char*)record)) {
log(LOG_ERROR,"sqlitespider: Got negative spiderrecord");
gbshutdownCorrupted();
}
sqlite3 *db = g_spiderdb_sqlite.getOrCreateDb(collnum);
if(!db) {
log(LOG_ERROR,"sqlitespider: Could not get sqlite db for collection %d", collnum);
return false;
}
if(Spiderdb::isSpiderRequest(reinterpret_cast<const key128_t *>(record)))
return addRequestRecord(db,record,record_len);
else
return addReplyRecord(db,record,record_len);
}

static bool addRequestRecord(sqlite3 *db, const void *record, size_t record_len) {
|
||||
if(record_len<(unsigned)SpiderRequest::getNeededSize(0)) {
|
||||
log(LOG_ERROR,"sqlitespider: Got spiderrequest with record_len=%zu and SpiderRequest::getNeededSize(0)=%d", record_len, SpiderRequest::getNeededSize(0));
|
||||
gbshutdownCorrupted();
|
||||
}
|
||||
//last byte should be the terminating NUL in m_url
|
||||
if(reinterpret_cast<const char*>(record)[record_len-1] != '\0') {
|
||||
log(LOG_ERROR,"sqlitespider: Got spiderrequest where last byte was not ascii-nul");
|
||||
gbshutdownCorrupted();
|
||||
}
|
||||
|
||||
const SpiderRequest *sreq = reinterpret_cast<const SpiderRequest*>(record);
|
||||
int32_t firstIp = Spiderdb::getFirstIp(&sreq->m_key);
|
||||
int64_t uh48 = Spiderdb::getUrlHash48(&sreq->m_key);
|
||||
|
||||
//Create or update record. Possible streategies:
|
||||
// insert-then-detect-unique-key-violatione-and-update
|
||||
// select-then-insert-or-update
|
||||
//We go for select-then-insert-or-update
|
||||
const char *pzTail="";
|
||||
sqlite3_stmt *selectStatement = NULL;
|
||||
if(sqlite3_prepare_v2(db, "select 1 from spiderdb where m_firstIp=? and m_uh48=?", -1, &selectStatement, &pzTail) != SQLITE_OK) {
|
||||
log(LOG_ERROR,"sqlitespider: Statement preparation error %s at or near %s",sqlite3_errmsg(db),pzTail);
|
||||
sqlite3_close(db);
|
||||
return false;
|
||||
}
|
||||
|
||||
sqlite3_bind_int(selectStatement, 1, firstIp);
|
||||
sqlite3_bind_int64(selectStatement, 2, uh48);
|
||||
int select_rc = sqlite3_step(selectStatement);
|
||||
if(select_rc==SQLITE_DONE) {
|
||||
//statement is finished - so the record currently doesn't exist
|
||||
static const char insert_statement[] =
|
||||
"INSERT INTO spiderdb (m_firstIp, m_uh48, m_hostHash32, m_domHash32, m_siteHash32,"
|
||||
" m_siteNumInlinks, m_pageNumInlinks, m_addedTime, m_discoveryTime, m_contentHash32,"
|
||||
" m_requestFlags, m_priority, m_errCount, m_sameErrCount, m_url)"
|
||||
"VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)";
|
||||
sqlite3_stmt *insertStatement = NULL;
|
||||
if(sqlite3_prepare_v2(db, insert_statement, -1, &insertStatement, &pzTail) != SQLITE_OK) {
|
||||
log(LOG_ERROR,"sqlitespider: Statement preparation error %s at or near %s",sqlite3_errmsg(db),pzTail);
|
||||
sqlite3_finalize(selectStatement);
|
||||
return false;
|
||||
}

		sqlite3_bind_int64(insertStatement, 1, (uint32_t)firstIp);
		sqlite3_bind_int64(insertStatement, 2, uh48);
		sqlite3_bind_int(insertStatement, 3, sreq->m_hostHash32);
		sqlite3_bind_int(insertStatement, 4, sreq->m_domHash32);
		sqlite3_bind_int(insertStatement, 5, sreq->m_siteHash32);
		sqlite3_bind_int(insertStatement, 6, sreq->m_siteNumInlinks);
		sqlite3_bind_int(insertStatement, 7, sreq->m_pageNumInlinks);
		sqlite3_bind_int(insertStatement, 8, sreq->m_addedTime);
		sqlite3_bind_int(insertStatement, 9, sreq->m_discoveryTime);
		if(sreq->m_contentHash32!=0)
			sqlite3_bind_int(insertStatement, 10, sreq->m_contentHash32);
		else
			sqlite3_bind_null(insertStatement, 10); //0 means unknown/invalid
		//pack the request flags into one integer column (bit layout must match getList())
		int32_t rqf = (sreq->m_recycleContent ? (1<<0) : 0) |
		              (sreq->m_isAddUrl ? (1<<1) : 0) |
		              (sreq->m_isPageReindex ? (1<<2) : 0) |
		              (sreq->m_isUrlCanonical ? (1<<3) : 0) |
		              (sreq->m_isPageParser ? (1<<4) : 0) |
		              (sreq->m_urlIsDocId ? (1<<5) : 0) |
		              (sreq->m_isRSSExt ? (1<<6) : 0) |
		              (sreq->m_isUrlPermalinkFormat ? (1<<7) : 0) |
		              (sreq->m_forceDelete ? (1<<8) : 0) |
		              (sreq->m_isInjecting ? (1<<9) : 0) |
		              (sreq->m_hadReply ? (1<<10) : 0) |
		              (sreq->m_fakeFirstIp ? (1<<11) : 0) |
		              (sreq->m_hasAuthorityInlink ? (1<<12) : 0) |
		              (sreq->m_avoidSpiderLinks ? (1<<13) : 0);
		sqlite3_bind_int(insertStatement, 11, rqf);
		if(sreq->m_priority>=0)
			sqlite3_bind_int(insertStatement, 12, sreq->m_priority);
		else
			sqlite3_bind_null(insertStatement, 12); //negative priority means "not set yet"
		sqlite3_bind_int(insertStatement, 13, sreq->m_errCount);
		sqlite3_bind_int(insertStatement, 14, sreq->m_sameErrCount);
		sqlite3_bind_text(insertStatement, 15, sreq->m_url, -1, SQLITE_TRANSIENT);

		if(sqlite3_step(insertStatement) != SQLITE_DONE) {
			log(LOG_ERROR,"sqlitespider: Insert error: %s",sqlite3_errmsg(db));
			sqlite3_finalize(insertStatement);
			sqlite3_finalize(selectStatement);
			return false;
		}
		sqlite3_finalize(insertStatement);
		sqlite3_finalize(selectStatement);
		return true;
	} else if(select_rc==SQLITE_ROW) {
		//we got at least one result row, so the record must already be there
		static const char update_statement[] =
			"UPDATE spiderdb"
			" SET m_siteNumInlinks=MAX(m_siteNumInlinks,?),"
			" m_pageNumInlinks=MAX(m_pageNumInlinks,?),"
			" m_addedTime=MIN(m_addedTime,?),"
			" m_discoveryTime=MIN(m_discoveryTime,?),"
			" m_priority=MAX(m_priority,?)"
			" WHERE m_firstIp=? AND m_uh48=?";

		sqlite3_stmt *updateStatement = NULL;
		if(sqlite3_prepare_v2(db, update_statement, -1, &updateStatement, &pzTail) != SQLITE_OK) {
			log(LOG_ERROR,"sqlitespider: Statement preparation error %s at or near %s",sqlite3_errmsg(db),pzTail);
			sqlite3_finalize(selectStatement);
			return false;
		}

		sqlite3_bind_int(updateStatement, 1, sreq->m_siteNumInlinks);
		sqlite3_bind_int(updateStatement, 2, sreq->m_pageNumInlinks);
		sqlite3_bind_int(updateStatement, 3, sreq->m_addedTime);
		sqlite3_bind_int(updateStatement, 4, sreq->m_discoveryTime);
		sqlite3_bind_int(updateStatement, 5, sreq->m_priority);
		sqlite3_bind_int64(updateStatement, 6, (uint32_t)firstIp);
		sqlite3_bind_int64(updateStatement, 7, uh48);

		if(sqlite3_step(updateStatement) != SQLITE_DONE) {
			log(LOG_ERROR,"sqlitespider: Update error: %s",sqlite3_errmsg(db));
			sqlite3_finalize(updateStatement);
			sqlite3_finalize(selectStatement);
			return false;
		}
		sqlite3_finalize(updateStatement);
		sqlite3_finalize(selectStatement);
		return true;
	} else {
		log(LOG_WARN,"sqlitespider: sqlite3_step(...select...) failed with %s", sqlite3_errmsg(db));
		sqlite3_finalize(selectStatement);
		return false;
	}
}
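
//Aside: on SQLite >= 3.24 the select-then-insert-or-update above could be
//collapsed into a single UPSERT, saving one round-trip per record. A minimal
//sketch of the pattern (not what this bridge currently does; columns elided):
//
//  INSERT INTO spiderdb (m_firstIp, m_uh48, m_siteNumInlinks, m_addedTime, ...)
//  VALUES (?,?,?,?, ...)
//  ON CONFLICT(m_firstIp, m_uh48) DO UPDATE SET
//      m_siteNumInlinks = MAX(m_siteNumInlinks, excluded.m_siteNumInlinks),
//      m_addedTime      = MIN(m_addedTime,      excluded.m_addedTime);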


static bool addReplyRecord(sqlite3 *db, const void *record, size_t record_len) {
	if(record_len!=sizeof(SpiderReply)) {
		log(LOG_ERROR,"sqlitespider: Got spiderreply with record_len=%zu and sizeof(SpiderReply)=%zu", record_len, sizeof(SpiderReply));
		gbshutdownCorrupted();
	}

	//assumption: the record is already there

	const SpiderReply *srep = reinterpret_cast<const SpiderReply*>(record);
	int32_t firstIp = Spiderdb::getFirstIp(&srep->m_key);
	int64_t uh48 = Spiderdb::getUrlHash48(&srep->m_key);

	const char *pzTail="";
	if(srep->m_errCode==0) {
		//successful spidering: record the reply fields and clear the error counters
		static const char update_statement[] =
			"UPDATE spiderdb"
			" SET m_percentChangedPerDay = ?,"
			" m_spideredTime = ?,"
			" m_errCode = ?,"
			" m_httpStatus = ?,"
			" m_langId = ?,"
			" m_replyFlags = ?,"
			" m_errCount = 0,"
			" m_sameErrCount = 0,"
			" m_contentHash32 = ?"
			" WHERE m_firstIp=? and m_uh48=?";
		sqlite3_stmt *updateStatement = NULL;
		if(sqlite3_prepare_v2(db, update_statement, -1, &updateStatement, &pzTail) != SQLITE_OK) {
			log(LOG_ERROR,"sqlitespider: Statement preparation error %s at or near %s",sqlite3_errmsg(db),pzTail);
			return false;
		}

		sqlite3_bind_double(updateStatement, 1, srep->m_percentChangedPerDay);
		sqlite3_bind_int(updateStatement, 2, srep->m_spideredTime);
		sqlite3_bind_int(updateStatement, 3, srep->m_errCode);
		sqlite3_bind_int(updateStatement, 4, srep->m_httpStatus);
		sqlite3_bind_int(updateStatement, 5, srep->m_langId);
		//pack the reply flags into one integer column (bit layout must match getList())
		int32_t rpf = (srep->m_isRSS ? (1<<0) : 0) |
		              (srep->m_isPermalink ? (1<<1) : 0) |
		              (srep->m_isIndexed ? (1<<2) : 0) |
		              (srep->m_hasAuthorityInlink ? (1<<3) : 0) |
		              (srep->m_fromInjectionRequest ? (1<<4) : 0);
		sqlite3_bind_int(updateStatement, 6, rpf);
		sqlite3_bind_int(updateStatement, 7, srep->m_contentHash32);
		sqlite3_bind_int64(updateStatement, 8, (uint32_t)firstIp);
		sqlite3_bind_int64(updateStatement, 9, uh48);

		if(sqlite3_step(updateStatement) != SQLITE_DONE) {
			log(LOG_ERROR,"sqlitespider: Update error: %s",sqlite3_errmsg(db));
			sqlite3_finalize(updateStatement);
			return false;
		}
		sqlite3_finalize(updateStatement);
		return true;
	} else {
		static const char update_statement[] =
			"UPDATE spiderdb"
			" SET m_spideredTime = ?,"
			" m_errCode = ?,"
			" m_httpStatus = ?,"
			" m_errCount = m_errCount + 1,"
			" m_sameErrCount = CASE WHEN m_errCode=? THEN IFNULL(m_sameErrCount,0) + 1 ELSE 0 END"
			" WHERE m_firstIp=? and m_uh48=?";
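		//Note on the CASE above: m_errCount counts all failures, while
		//m_sameErrCount only grows while the new m_errCode (bound twice, as
		//parameters 2 and 4) matches the previously stored one; a different
		//error code resets the consecutive-same-error counter.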
		sqlite3_stmt *updateStatement = NULL;
		if(sqlite3_prepare_v2(db, update_statement, -1, &updateStatement, &pzTail) != SQLITE_OK) {
			log(LOG_ERROR,"sqlitespider: Statement preparation error %s at or near %s",sqlite3_errmsg(db),pzTail);
			return false;
		}

		sqlite3_bind_int(updateStatement, 1, srep->m_spideredTime);
		sqlite3_bind_int(updateStatement, 2, srep->m_errCode);
		sqlite3_bind_int(updateStatement, 3, srep->m_httpStatus);
		sqlite3_bind_int(updateStatement, 4, srep->m_errCode);
		sqlite3_bind_int64(updateStatement, 5, (uint32_t)firstIp);
		sqlite3_bind_int64(updateStatement, 6, uh48);

		if(sqlite3_step(updateStatement) != SQLITE_DONE) {
			log(LOG_ERROR,"sqlitespider: Update error: %s",sqlite3_errmsg(db));
			sqlite3_finalize(updateStatement);
			return false;
		}
		sqlite3_finalize(updateStatement);
		return true;
	}
}



bool SpiderdbRdbSqliteBridge::getList(collnum_t collnum,
                                      RdbList *list,
                                      const key128_t &startKey,
                                      const key128_t &endKey,
                                      int32_t minRecSizes)
{
	sqlite3 *db = g_conf.m_readOnlyMode ? g_spiderdb_sqlite.getDb(collnum) : g_spiderdb_sqlite.getOrCreateDb(collnum);
	if(!db) {
		log(LOG_ERROR,"sqlitespider: Could not get sqlite db for collection %d", collnum);
		g_errno = ENOCOLLREC;
		return false;
	}

	int32_t firstIpStart = Spiderdb::getFirstIp(&startKey);
	int32_t firstIpEnd = Spiderdb::getFirstIp(&endKey);
	int64_t uh48Start = Spiderdb::getUrlHash48(&startKey);
	int64_t uh48End = Spiderdb::getUrlHash48(&endKey);

	bool breakMidIPAddressAllowed;
	const char *pzTail="";
	sqlite3_stmt *stmt;
	if(firstIpStart==firstIpEnd) {
		//single-IP range: we may cut the list short once minRecSizes is reached
		breakMidIPAddressAllowed = true;
		static const char statement_text[] =
			"SELECT m_firstIp, m_uh48, m_hostHash32, m_domHash32, m_siteHash32,"
			" m_siteNumInlinks, m_pageNumInlinks, m_addedTime, m_discoveryTime, m_contentHash32,"
			" m_requestFlags, m_priority, m_errCount, m_sameErrCount, m_url,"
			" m_percentChangedPerDay, m_spideredTime, m_errCode, m_httpStatus, m_langId,"
			" m_replyFlags"
			" FROM spiderdb"
			" WHERE m_firstIp=? and m_uh48>=? and m_uh48<=?"
			" ORDER BY m_firstIp, m_uh48";
		if(sqlite3_prepare_v2(db, statement_text, -1, &stmt, &pzTail) != SQLITE_OK) {
			log(LOG_ERROR,"sqlitespider: Statement preparation error %s at or near %s",sqlite3_errmsg(db),pzTail);
			g_errno = EBADENGINEER;
			return false;
		}
		sqlite3_bind_int64(stmt, 1, (uint32_t)firstIpStart);
		sqlite3_bind_int64(stmt, 2, uh48Start);
		sqlite3_bind_int64(stmt, 3, uh48End);
	} else {
		if(uh48Start!=0) {
			log(LOG_ERROR, "SpiderdbRdbSqliteBridge::getList(): startip!=endip, and uh48Start!=0");
			gbshutdownLogicError();
		}
		breakMidIPAddressAllowed = false;
		static const char statement_text[] =
			"SELECT m_firstIp, m_uh48, m_hostHash32, m_domHash32, m_siteHash32,"
			" m_siteNumInlinks, m_pageNumInlinks, m_addedTime, m_discoveryTime, m_contentHash32,"
			" m_requestFlags, m_priority, m_errCount, m_sameErrCount, m_url,"
			" m_percentChangedPerDay, m_spideredTime, m_errCode, m_httpStatus, m_langId,"
			" m_replyFlags"
			" FROM spiderdb"
			" WHERE m_firstIp>=? and m_firstIp<=?"
			" ORDER BY m_firstIp, m_uh48";
		if(sqlite3_prepare_v2(db, statement_text, -1, &stmt, &pzTail) != SQLITE_OK) {
			log(LOG_ERROR,"sqlitespider: Statement preparation error %s at or near %s",sqlite3_errmsg(db),pzTail);
			g_errno = EBADENGINEER;
			return false;
		}
		sqlite3_bind_int64(stmt, 1, (uint32_t)firstIpStart);
		sqlite3_bind_int64(stmt, 2, (uint32_t)firstIpEnd);
	}

	key128_t listLastKey;
	IOBuffer io_buffer;
	int rc;
	while((rc=sqlite3_step(stmt))==SQLITE_ROW) {
		//fetch all columns (0-based column indexes). null checks are done later
		int32_t firstIp = sqlite3_column_int(stmt, 0);
		int64_t uh48 = sqlite3_column_int64(stmt, 1);
		int32_t hosthash32 = sqlite3_column_int(stmt, 2);
		int32_t domHash32 = sqlite3_column_int(stmt, 3);
		int32_t siteHash32 = sqlite3_column_int(stmt, 4);
		int32_t siteNumInlinks = sqlite3_column_int(stmt, 5);
		int32_t pageNumInlinks = sqlite3_column_int(stmt, 6);
		int32_t addedTime = sqlite3_column_int(stmt, 7);
		int32_t discoveryTime = sqlite3_column_int(stmt, 8);
		int32_t contentHash32 = sqlite3_column_int(stmt, 9);
		int32_t requestFlags = sqlite3_column_int(stmt, 10);
		int32_t priority = sqlite3_column_int(stmt, 11);
		int32_t errCount = sqlite3_column_int(stmt, 12);
		int32_t sameErrCount = sqlite3_column_int(stmt, 13);
		const unsigned char *url = sqlite3_column_text(stmt, 14);
		double percentChangedPerDay = sqlite3_column_double(stmt, 15);
		int32_t spideredTime = sqlite3_column_int(stmt, 16);
		int32_t errCode = sqlite3_column_int(stmt, 17);
		int32_t httpStatus = sqlite3_column_int(stmt, 18);
		int32_t langId = sqlite3_column_int(stmt, 19);
		int32_t replyFlags = sqlite3_column_int(stmt, 20);

		if(breakMidIPAddressAllowed) {
			if(minRecSizes>0 && io_buffer.used() >= (size_t)minRecSizes)
				break;
		} else {
			//the range spans multiple IPs: callers expect complete per-IP
			//runs, so we must not truncate the list in the middle of one
		}
		if(sqlite3_column_type(stmt,20)!=SQLITE_NULL) {
			//replyflags are non-null so there must be a reply
			SpiderReply srep;
			srep.reset();
			srep.m_key = Spiderdb::makeKey(firstIp,uh48,false,0,false);
			srep.m_dataSize = sizeof(srep) - sizeof(srep.m_key) - sizeof(srep.m_dataSize);
			srep.m_firstIp = firstIp;
			srep.m_siteHash32 = siteHash32;
			srep.m_domHash32 = domHash32;
			srep.m_percentChangedPerDay = percentChangedPerDay;
			srep.m_spideredTime = spideredTime;
			srep.m_errCode = errCode;
			srep.m_siteNumInlinks = siteNumInlinks;
			srep.m_sameErrCount = sameErrCount;
			srep.m_contentHash32 = contentHash32;
			srep.m_crawlDelayMS = 1; //probably only used in-memory.
			srep.m_downloadEndTime = 0; //probably only used in-memory.
			srep.m_httpStatus = httpStatus;
			srep.m_errCount = errCount;
			srep.m_langId = langId;
			//bit layout must match the packing in addReplyRecord()
			srep.m_isRSS = (replyFlags&(1<<0))!=0;
			srep.m_isPermalink = (replyFlags&(1<<1))!=0;
			srep.m_isIndexed = (replyFlags&(1<<2))!=0;
			srep.m_hasAuthorityInlink = (replyFlags&(1<<3))!=0;
			srep.m_fromInjectionRequest = (replyFlags&(1<<4))!=0;
			srep.m_isIndexedINValid = false; //m_isIndexed (bit 2) is always persisted, so it is never invalid
			srep.m_hasAuthorityInlinkValid = true; //m_hasAuthorityInlink (bit 3) is always persisted
			srep.m_siteNumInlinksValid = sqlite3_column_type(stmt,5)!=SQLITE_NULL;

			io_buffer.reserve_extra(sizeof(srep));
			memcpy(io_buffer.end(), &srep, sizeof(srep));
			io_buffer.push_back(sizeof(srep));
		} else
			replyFlags = 0; //no reply columns stored for this url

		SpiderRequest sreq;
		sreq.reset();
		sreq.m_key = Spiderdb::makeKey(firstIp,uh48,true,0,false); //isRequest=true, so the key differs from the reply's
		//sreq.m_dataSize is set by setDataSize() below
		sreq.m_firstIp = firstIp;
		sreq.m_hostHash32 = hosthash32;
		sreq.m_domHash32 = domHash32;
		sreq.m_siteHash32 = siteHash32;
		sreq.m_siteNumInlinks = siteNumInlinks;
		sreq.m_addedTime = addedTime;
		sreq.m_pageNumInlinks = pageNumInlinks;
		sreq.m_sameErrCount = sameErrCount;
		sreq.m_discoveryTime = discoveryTime;
		sreq.m_prevErrCode = 0; //done differently now.
		sreq.m_contentHash32 = contentHash32;
		sreq.m_hopCount = 0;
		sreq.m_hopCountValid = 0;
		//bit layout must match the packing in addRequestRecord()
		sreq.m_recycleContent = (requestFlags&(1<<0))!=0;
		sreq.m_isAddUrl = (requestFlags&(1<<1))!=0;
		sreq.m_isPageReindex = (requestFlags&(1<<2))!=0;
		sreq.m_isUrlCanonical = (requestFlags&(1<<3))!=0;
		sreq.m_isPageParser = (requestFlags&(1<<4))!=0;
		sreq.m_urlIsDocId = (requestFlags&(1<<5))!=0;
		sreq.m_isRSSExt = (requestFlags&(1<<6))!=0;
		sreq.m_isUrlPermalinkFormat = (requestFlags&(1<<7))!=0;
		sreq.m_forceDelete = (requestFlags&(1<<8))!=0;
		sreq.m_isInjecting = (requestFlags&(1<<9))!=0;
		sreq.m_hadReply = (requestFlags&(1<<10))!=0;
		sreq.m_fakeFirstIp = (requestFlags&(1<<11))!=0;
		sreq.m_hasAuthorityInlink = (requestFlags&(1<<12))!=0;
		sreq.m_hasAuthorityInlinkValid = true; //bit 12 is always persisted by the writers
		sreq.m_siteNumInlinksValid = sqlite3_column_type(stmt,5)!=SQLITE_NULL;
		sreq.m_avoidSpiderLinks = (requestFlags&(1<<13))!=0;
		sreq.m_ufn = 0; //only used in-memory
		sreq.m_priority = priority;
		sreq.m_errCount = errCount;
		strcpy(sreq.m_url,(const char*)url);
		sreq.setDataSize();

		io_buffer.reserve_extra(sreq.getRecSize());
		memcpy(io_buffer.end(), &sreq, sreq.getRecSize());
		io_buffer.push_back(sreq.getRecSize());

		listLastKey = sreq.m_key;
	}
	if(rc!=SQLITE_DONE && rc!=SQLITE_ROW) {
		log(LOG_ERROR,"sqlitespider: Fetch error: %s",sqlite3_errmsg(db));
		sqlite3_finalize(stmt);
		g_errno = EBADENGINEER; //TODO: find a better error code
		return false;
	}
	sqlite3_finalize(stmt);

	int32_t listSize = io_buffer.used();
	char *listMemory;
	if(listSize>0) {
		listMemory = (char*)mmalloc(listSize, "sqliterdblist");
		if(!listMemory) {
			log(LOG_ERROR,"sqlitespider: OOM allocating spiderdb rdblist (%d bytes)", listSize);
			return false;
		}
		memcpy(listMemory, io_buffer.begin(), io_buffer.used());
	} else
		listMemory = NULL;
	key128_t listFirstKey = Spiderdb::makeFirstKey(firstIpStart, uh48Start);
	if(rc==SQLITE_ROW) {
		//early break: listLastKey is the key of the last record we actually copied
	} else {
		//select exhaustion, so jump to the last key the caller specified
		listLastKey = Spiderdb::makeFirstKey(firstIpEnd, uh48End);
	}
	list->set(listMemory, listSize,
	          listMemory, listSize,
	          (const char*)&listFirstKey, (const char*)&listLastKey,
	          -1,    //datasize (variable)
	          true,  //owndata
	          false, //halfkeys
	          sizeof(key128_t)); //keysize

	return true;
}

SpiderdbRdbSqliteBridge.h (Normal file, 26 lines)
@ -0,0 +1,26 @@
#ifndef SPIDERDB_RDB_SQLITE_BRIDGE_H_
#define SPIDERDB_RDB_SQLITE_BRIDGE_H_
#include "collnum_t.h"
#include <stddef.h>
#include <inttypes.h>

class RdbList;
class u_int128_t;

//Helper functions for bridging the old Rdb-style spiderdb records to the new sqlite-based database

namespace SpiderdbRdbSqliteBridge {

	//Add a record (request or reply) to spiderdb. Returns false if something fails
	bool addRecord(collnum_t collnum, const void *record, size_t record_len);

	//Fetch all records, or the subset of the records with startKey<=key<=endKey, trying to limit the rdblist size to minRecSizes
	//Returns false on error
	bool getList(collnum_t collnum,
	             RdbList *list,
	             const u_int128_t &startKey,
	             const u_int128_t &endKey,
	             int32_t minRecSizes);

}

#endif
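
//A minimal usage sketch (hypothetical caller; the Msg4/Msg5 plumbing that
//normally wraps these calls, and all error handling, are omitted):
//
//  SpiderRequest sreq;
//  //... fill in sreq and its url ...
//  SpiderdbRdbSqliteBridge::addRecord(collnum, &sreq, sreq.getRecSize());
//
//  RdbList list;
//  key128_t startKey = Spiderdb::makeFirstKey(firstIp, 0);
//  key128_t endKey   = Spiderdb::makeFirstKey(firstIp, MAX_UH48); //MAX_UH48 is an assumed constant
//  SpiderdbRdbSqliteBridge::getList(collnum, &list, startKey, endKey, 512*1024);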

SpiderdbSqlite.cpp (Normal file, 127 lines)
@ -0,0 +1,127 @@
#include "SpiderdbSqlite.h"
|
||||
#include "ScopedLock.h"
|
||||
#include "Hostdb.h"
|
||||
#include "Collectiondb.h"
|
||||
#include "Conf.h"
|
||||
#include "Log.h"
|
||||
#include <stddef.h>
|
||||
|
||||
|
||||
static sqlite3 *createDb(const char *sqlitedbName);
|
||||
|
||||
SpiderdbSqlite g_spiderdb_sqlite(RDB_SPIDERDB_SQLITE);
|
||||
SpiderdbSqlite g_spiderdb_sqlite2(RDB2_SPIDERDB2_SQLITE);
|
||||
|
||||
|
||||
|
||||
void SpiderdbSqlite::finalize() {
|
||||
ScopedLock sl(mtx);
|
||||
for(auto e : dbs)
|
||||
sqlite3_close(e.second);
|
||||
dbs.clear();
|
||||
}
|
||||
|
||||
|
||||
sqlite3 *SpiderdbSqlite::getDb(collnum_t collnum) {
|
||||
ScopedLock sl(mtx);
|
||||
auto iter = dbs.find(collnum);
|
||||
if(iter!=dbs.end())
|
||||
return iter->second;
|
||||
else
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
sqlite3 *SpiderdbSqlite::getOrCreateDb(collnum_t collnum) {
|
||||
ScopedLock sl(mtx);
|
||||
auto iter = dbs.find(collnum);
|
||||
if(iter!=dbs.end())
|
||||
return iter->second;
|
||||
|
||||
//not found, open or create it
|
||||
const auto cr = g_collectiondb.getRec(collnum);
|
||||
|
||||
char collectionDirName[1024];
|
||||
sprintf(collectionDirName, "%scoll.%s.%d", g_hostdb.m_dir, cr->m_coll, (int)collnum);
|
||||
|
||||
char sqlitedbName[1024];
|
||||
if(rdbid==RDB_SPIDERDB_SQLITE)
|
||||
sprintf(sqlitedbName, "%s/spiderdb.sqlite3", collectionDirName);
|
||||
else
|
||||
sprintf(sqlitedbName, "%s/spiderdbRebuild.sqlite3", collectionDirName);
|
||||
|
||||
sqlite3 *db = createDb(sqlitedbName);
|
||||
|
||||
dbs[collnum] = db;
|
||||
|
||||
return db;
|
||||
}


static const char create_table_statement[] =
	"CREATE TABLE spiderdb ("
	" m_firstIp INT NOT NULL,"
	" m_uh48 INT NOT NULL,"
	" m_hostHash32 INT NOT NULL,"
	" m_domHash32 INT NOT NULL,"
	" m_siteHash32 INT NOT NULL,"
	" m_siteNumInlinks INT NOT NULL,"
	" m_pageNumInlinks INT NOT NULL,"
	" m_addedTime INT NOT NULL,"
	" m_discoveryTime INT NOT NULL,"
	" m_contentHash32 INT,"
	" m_requestFlags INT NOT NULL,"
	" m_priority INT,"
	" m_errCount INT NOT NULL,"
	" m_sameErrCount INT NOT NULL,"
	" m_url TEXT NOT NULL,"
	" m_percentChangedPerDay REAL,"
	" m_spideredTime INT,"
	" m_errCode INT,"
	" m_httpStatus INT,"
	" m_langId INT,"
	" m_replyFlags INT,"
	" PRIMARY KEY (m_firstIp,m_uh48)"
	");"
	;


static sqlite3 *createDb(const char *sqlitedbName) {
	sqlite3 *db;
	if(g_conf.m_readOnlyMode) {
		//read-only, creation is not allowed
		int rc = sqlite3_open_v2(sqlitedbName,&db,SQLITE_OPEN_READONLY,NULL);
		if(rc!=SQLITE_OK) {
			log(LOG_ERROR,"sqlite: Could not open %s: %s", sqlitedbName, sqlite3_errmsg(db));
			sqlite3_close(db); //a handle is allocated even on failure and must be released
			return NULL;
		}
		return db;
	}
	//read-write, creation is allowed

	if(access(sqlitedbName,F_OK)==0) {
		int rc = sqlite3_open_v2(sqlitedbName,&db,SQLITE_OPEN_READWRITE,NULL);
		if(rc!=SQLITE_OK) {
			log(LOG_ERROR,"sqlite: Could not open %s: %s", sqlitedbName, sqlite3_errmsg(db));
			sqlite3_close(db);
			return NULL;
		}
		return db;
	}

	int rc = sqlite3_open_v2(sqlitedbName,&db,SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE,NULL);
	if(rc!=SQLITE_OK) {
		log(LOG_ERROR,"sqlite: Could not create %s: %s", sqlitedbName, sqlite3_errmsg(db));
		sqlite3_close(db);
		return NULL;
	}

	char *errmsg = NULL;
	if(sqlite3_exec(db, create_table_statement, NULL, NULL, &errmsg) != SQLITE_OK) {
		log(LOG_ERROR,"sqlite: %s", errmsg ? errmsg : sqlite3_errmsg(db));
		sqlite3_free(errmsg);
		sqlite3_close(db);
		unlink(sqlitedbName); //don't leave a half-initialized database file behind
		return NULL;
	}

	return db;
}

SpiderdbSqlite.h (Normal file, 99 lines)
@ -0,0 +1,99 @@
#ifndef SPIDERDB_SQLITE_H_
#define SPIDERDB_SQLITE_H_
#include "GbMutex.h"
#include "collnum_t.h"
#include "rdbid_t.h"
#include <inttypes.h>
#include <string>
#include <map>
#include "sqlite3.h"


class SpiderdbSqlite {
	std::map<collnum_t,sqlite3*> dbs;
	GbMutex mtx;
	rdbid_t rdbid;
public:
	SpiderdbSqlite(rdbid_t rdbid_) : dbs(), mtx(), rdbid(rdbid_) {}
	~SpiderdbSqlite() { finalize(); }
	SpiderdbSqlite(const SpiderdbSqlite&) = delete;
	SpiderdbSqlite& operator=(const SpiderdbSqlite&) = delete;

	void finalize(); //closes all DBs

	sqlite3 *getDb(collnum_t collnum);
	sqlite3 *getOrCreateDb(collnum_t collnum);
};


extern SpiderdbSqlite g_spiderdb_sqlite;
extern SpiderdbSqlite g_spiderdb_sqlite2;
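
//Typical access pattern (a sketch): handles are cached per collection, so
//callers just fetch them on every use and must never sqlite3_close() one
//themselves - finalize() owns the shutdown of all cached handles.
//
//  sqlite3 *db = g_spiderdb_sqlite.getOrCreateDb(collnum);
//  if(!db) { /*unknown collection or the open/create failed*/ }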


//see Spider.h for field definitions/comments/caveats

struct RawSpiderdbRecord {
	int32_t m_firstIp;
	int64_t m_uh48; //48-bit url hash, so it must be wider than 32 bits
	//Request fields:
	int32_t m_hostHash32;
	int32_t m_domHash32;
	int32_t m_siteHash32;
	int32_t m_siteNumInlinks;
	int32_t m_pageNumInlinks;
	int32_t m_addedTime;
	int32_t m_discoveryTime;
	int32_t m_contentHash32; //0 = unknown/invalid
	union {
		struct {
			bool m_recycleContent:1;
			bool m_isAddUrl:1;
			bool m_isPageReindex:1;
			bool m_isUrlCanonical:1;
			bool m_isPageParser:1;
			bool m_urlIsDocId:1;
			bool m_isRSSExt:1;
			bool m_isUrlPermalinkFormat:1;
			bool m_forceDelete:1;
			bool m_isInjecting:1;
			bool m_hadReply:1;
			bool m_fakeFirstIp:1;
			bool m_hasAuthorityInlink:1;
			bool m_hasAuthorityInlinkValid:1;
			bool m_siteNumInlinksValid:1;
			bool m_avoidSpiderLinks:1;
		} requestFlags;
		uint32_t u32_request;
	};
	int32_t m_priority;
	bool m_priorityValid;
	int32_t m_errCount;
	bool m_errCountValid;
	int32_t m_sameErrCount;
	std::string m_url;
	//Reply fields
	float m_percentChangedPerDay;
	bool m_percentChangedPerDayValid;
	int32_t m_spideredTime;
	bool m_spideredTimeValid;
	int32_t m_errCode;
	bool m_errCodeValid;
	int32_t m_httpStatus;
	bool m_httpStatusValid;
	int32_t m_langId;
	bool m_langIdValid;
	union {
		struct {
			bool m_isRSS:1;
			bool m_isPermalink:1;
			bool m_isIndexed:1;
			bool m_hasAuthorityInlink:1;
			bool m_isIndexedINValid:1;
			bool m_hasAuthorityInlinkValid:1;
			bool m_fromInjectionRequest:1;
		} replyFlags;
		uint32_t u32_reply;
	};
};
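
//A minimal sketch (hypothetical usage) of how the unions let the flag bits
//round-trip through the INTEGER flag columns; note that bitfield layout is
//implementation-defined, so writer and reader must be built with the same ABI:
//
//  RawSpiderdbRecord r;
//  r.u32_request = 0;                //clear all request flags at once
//  r.requestFlags.m_isAddUrl = true; //then set individual flags
//  //persist: bind (int)r.u32_request to the m_requestFlags column
//  //load:    r.u32_request = (uint32_t)sqlite3_column_int(stmt, ...)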
#endif

Statistics.cpp
@ -404,7 +404,7 @@ static RdbCacheHistory rdb_cache_history[] = {
{RDB_TAGDB, "tagdb", 0,0},
{RDB_CLUSTERDB,"clusterdb",0,0},
{RDB_TITLEDB, "titledb", 0,0},
{RDB_SPIDERDB, "spiderdb", 0,0},
{RDB_SPIDERDB_DEPRECATED, "spiderdb", 0,0},
{RDB_NONE,0,0,0}
};

XmlDoc.cpp (30 changed lines)
@ -1771,7 +1771,7 @@ bool XmlDoc::indexDoc ( ) {
}

// store the new request (store reply for this below)
rdbid_t rd = m_useSecondaryRdbs ? RDB2_SPIDERDB2 : RDB_SPIDERDB;
rdbid_t rd = m_useSecondaryRdbs ? RDB2_SPIDERDB2_DEPRECATED : RDB_SPIDERDB_DEPRECATED;
if (!m_metaList2.pushChar(rd)) {
logTrace( g_conf.m_logTraceXmlDoc, "END, return true, metaList2 pushChar returned false" );
return true;
@ -1812,7 +1812,7 @@ skipNewAdd1:
return true;
}

rdbid_t rd = m_useSecondaryRdbs ? RDB2_SPIDERDB2 : RDB_SPIDERDB;
rdbid_t rd = m_useSecondaryRdbs ? RDB2_SPIDERDB2_DEPRECATED : RDB_SPIDERDB_DEPRECATED;
if (!m_metaList2.pushChar(rd)) {
logTrace( g_conf.m_logTraceXmlDoc, "END, return true, metaList2 pushChar returned false" );
return true;
@ -11998,7 +11998,7 @@ void XmlDoc::printMetaList ( char *p , char *pend , SafeBuf *sb ) {
docId );
}
// key parsing logic taken from Address::makePlacedbKey
else if ( rdbId == RDB_SPIDERDB ) {
else if ( rdbId == RDB_SPIDERDB_DEPRECATED ) {
sb->safePrintf("<td><nobr>");
key128_t *k2 = (key128_t *)k;
if ( Spiderdb::isSpiderRequest(k2) ) {
@ -12078,7 +12078,7 @@ bool XmlDoc::verifyMetaList ( char *p , char *pend , bool forDelete ) {
// positive and a spiderdoc
// no, this is no longer the case because we add spider
// replies to the index when deleting or rejecting a doc.
//if ( m_deleteFromIndex && ! del && rdbId != RDB_SPIDERDB) {
//if ( m_deleteFromIndex && ! del && rdbId != RDB_SPIDERDB_DEPRECATED) {
//	g_process.shutdownAbort(true); }

// get the key size. a table lookup in Rdb.cpp.
@ -12131,7 +12131,7 @@ bool XmlDoc::verifyMetaList ( char *p , char *pend , bool forDelete ) {
if ( del ) dataSize = 0;

// ensure spiderdb request recs have data/url in them
if ( (rdbId == RDB_SPIDERDB || rdbId == RDB2_SPIDERDB2) &&
if ( (rdbId == RDB_SPIDERDB_DEPRECATED || rdbId == RDB2_SPIDERDB2_DEPRECATED) &&
g_spiderdb.isSpiderRequest ( (spiderdbkey_t *)rec ) &&
! forDelete &&
! del &&
@ -12202,8 +12202,8 @@ bool XmlDoc::hashMetaList ( HashTableX *ht ,
// skip the data
p += dataSize;
// ignore spiderdb recs for parsing consistency check
if ( rdbId == RDB_SPIDERDB ) continue;
if ( rdbId == RDB2_SPIDERDB2 ) continue;
if ( rdbId == RDB_SPIDERDB_DEPRECATED ) continue;
if ( rdbId == RDB2_SPIDERDB2_DEPRECATED ) continue;
// ignore tagdb as well!
if ( rdbId == RDB_TAGDB || rdbId == RDB2_TAGDB2 ) continue;

@ -12304,7 +12304,7 @@ bool XmlDoc::hashMetaList ( HashTableX *ht ,
SafeBuf sb2;

// print it out
if ( rdbId == RDB_SPIDERDB ) {
if ( rdbId == RDB_SPIDERDB_DEPRECATED ) {
// get rec
if ( Spiderdb::isSpiderRequest((key128_t *)rec) ) {
SpiderRequest *sreq1 = (SpiderRequest *)rec;
@ -12652,7 +12652,7 @@ char *XmlDoc::getMetaList(bool forDelete) {
logTrace(g_conf.m_logTraceXmlDoc, "Adding spider reply to spiderdb");

// rdbid first
rdbid_t rd = m_useSecondaryRdbs ? RDB2_SPIDERDB2 : RDB_SPIDERDB;
rdbid_t rd = m_useSecondaryRdbs ? RDB2_SPIDERDB2_DEPRECATED : RDB_SPIDERDB_DEPRECATED;
*m_p++ = (char)rd;

// get this
@ -13356,7 +13356,7 @@ char *XmlDoc::getMetaList(bool forDelete) {
setStatus("adding SpiderReply to spiderdb");

// rdbid first
*m_p++ = (m_useSecondaryRdbs) ? RDB2_SPIDERDB2 : RDB_SPIDERDB;
*m_p++ = (m_useSecondaryRdbs) ? RDB2_SPIDERDB2_DEPRECATED : RDB_SPIDERDB_DEPRECATED;

// get this
if (!m_srepValid) {
@ -13423,7 +13423,7 @@ char *XmlDoc::getMetaList(bool forDelete) {
}

// copy it
*m_p++ = (m_useSecondaryRdbs) ? RDB2_SPIDERDB2 : RDB_SPIDERDB;
*m_p++ = (m_useSecondaryRdbs) ? RDB2_SPIDERDB2_DEPRECATED : RDB_SPIDERDB_DEPRECATED;

// store it back
gbmemcpy (m_p, &revisedReq, revisedReq.getRecSize());
@ -14849,8 +14849,8 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
// sanity check
if ( p + 1 + need > m_pend ) { g_process.shutdownAbort(true); }
// store the rdbId
if ( m_useSecondaryRdbs ) *p++ = RDB2_SPIDERDB2;
else *p++ = RDB_SPIDERDB;
if ( m_useSecondaryRdbs ) *p++ = RDB2_SPIDERDB2_DEPRECATED;
else *p++ = RDB_SPIDERDB_DEPRECATED;

// store the spider rec
gbmemcpy ( p , &ksr , need );
@ -16972,7 +16972,7 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
if ( m_sreqValid ) {
// must not block
SpiderRequest *oldsr = &m_sreq;
uint32_t shard = g_hostdb.getShardNum(RDB_SPIDERDB,oldsr);
uint32_t shard = g_hostdb.getShardNum(RDB_SPIDERDB_DEPRECATED,oldsr);
sb->safePrintf ("<tr><td><b>assigned spider shard</b>"
"</td>\n"
"<td><b>%" PRIu32"</b></td></tr>\n",shard);
@ -17490,7 +17490,7 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
int32_t spiderHostId = -1;
if (firstIp && firstIp != (int32_t *)-1) {
key128_t spiderKey = Spiderdb::makeFirstKey(*firstIp);
int32_t spiderShardNum = getShardNum(RDB_SPIDERDB, &spiderKey);
int32_t spiderShardNum = getShardNum(RDB_SPIDERDB_DEPRECATED, &spiderKey);
spiderHostId = g_hostdb.getHostIdWithSpideringEnabled(spiderShardNum, false);
}

main.cpp (164 changed lines)
@ -106,8 +106,8 @@ static const int32_t commandLineDumpdbRecSize = 10 * 1024 * 1024; //recSizes par

static void dumpTitledb (const char *coll, int32_t sfn, int32_t numFiles, bool includeTree,
int64_t docId , bool justPrintDups );
static int32_t dumpSpiderdb(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, int printStats, int32_t firstIp);
static int32_t dumpSpiderdbCsv(const char *coll);
//static int32_t dumpSpiderdb(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, int printStats, int32_t firstIp);
//static int32_t dumpSpiderdbCsv(const char *coll);

static void dumpTagdb(const char *coll, int32_t sfn, int32_t numFiles, bool includeTree, char req,
const char *site);
@ -121,9 +121,9 @@ static void dumpLinkdb(const char *coll, int32_t sfn, int32_t numFiles, bool inc

static void dumpUnwantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
static void dumpWantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
static void dumpUnwantedSpiderdbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);

static int32_t verifySpiderdb(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, int32_t firstIp);
//static void dumpUnwantedSpiderdbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
//
//static int32_t verifySpiderdb(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, int32_t firstIp);

static int copyFiles(const char *dstDir);

@ -1291,19 +1291,19 @@ int main2 ( int argc , char *argv[] ) {
}
else if ( argv[cmdarg+1][0] == 'x' )
dumpDoledb (coll,startFileNum,numFiles,includeTree);
else if ( argv[cmdarg+1][0] == 's' ) {
int printStats = 0;
int32_t firstIp = 0;
if(cmdarg+6 < argc)
printStats = atol(argv[cmdarg+6]);
if(cmdarg+7 < argc)
firstIp = atoip(argv[cmdarg+7]);

int32_t ret = dumpSpiderdb ( coll, startFileNum, numFiles, includeTree, printStats, firstIp );
if ( ret == -1 ) {
fprintf(stdout,"error dumping spiderdb\n");
}
}
// else if ( argv[cmdarg+1][0] == 's' ) {
// 	int printStats = 0;
// 	int32_t firstIp = 0;
// 	if(cmdarg+6 < argc)
// 		printStats = atol(argv[cmdarg+6]);
// 	if(cmdarg+7 < argc)
// 		firstIp = atoip(argv[cmdarg+7]);
//
// 	int32_t ret = dumpSpiderdb ( coll, startFileNum, numFiles, includeTree, printStats, firstIp );
// 	if ( ret == -1 ) {
// 		fprintf(stdout,"error dumping spiderdb\n");
// 	}
// }
else if ( argv[cmdarg+1][0] == 'S' ) {
char *site = NULL;
if ( cmdarg+6 < argc ) {
@ -1332,8 +1332,8 @@ int main2 ( int argc , char *argv[] ) {
dumpUnwantedTitledbRecs(coll, startFileNum, numFiles, includeTree);
} else if (strcmp(argv[cmdarg+1], "wt") == 0) {
dumpWantedTitledbRecs(coll, startFileNum, numFiles, includeTree);
} else if (strcmp(argv[cmdarg+1], "us") == 0) {
dumpUnwantedSpiderdbRecs(coll, startFileNum, numFiles, includeTree);
// } else if (strcmp(argv[cmdarg+1], "us") == 0) {
// 	dumpUnwantedSpiderdbRecs(coll, startFileNum, numFiles, includeTree);
} else {
goto printHelp;
}
@ -1343,18 +1343,18 @@ int main2 ( int argc , char *argv[] ) {
return 0;
}

if(strcmp(cmd, "dumpcsv") == 0) {
g_conf.m_doingCommandLine = true; // so we do not log every collection coll.conf we load
if( !g_collectiondb.loadAllCollRecs()) {
log("db: Collectiondb init failed.");
return 1;
}
if(argv[cmdarg+1][0] == 's')
dumpSpiderdbCsv(argv[cmdarg+2]);
g_log.m_disabled = true;
g_collectiondb.reset();
return 0;
}
// if(strcmp(cmd, "dumpcsv") == 0) {
// 	g_conf.m_doingCommandLine = true; // so we do not log every collection coll.conf we load
// 	if( !g_collectiondb.loadAllCollRecs()) {
// 		log("db: Collectiondb init failed.");
// 		return 1;
// 	}
// 	if(argv[cmdarg+1][0] == 's')
// 		dumpSpiderdbCsv(argv[cmdarg+2]);
// 	g_log.m_disabled = true;
// 	g_collectiondb.reset();
// 	return 0;
// }

if(strcmp(cmd, "convertspiderdb") == 0) {
g_conf.m_doingCommandLine = true; // so we do not log every collection coll.conf we load
@ -1368,53 +1368,53 @@ int main2 ( int argc , char *argv[] ) {
return 0;
}

// . gb dump [dbLetter][coll][fileNum] [numFiles] [includeTree][termId]
// . spiderdb is special:
//   gb dump s [coll][fileNum] [numFiles] [includeTree] [0=old|1=new]
//   [priority] [printStats?]
if ( strcmp ( cmd , "verify" ) == 0 ) {
//
// tell Collectiondb, not to verify each rdb's data
//
g_dumpMode = true;

if ( cmdarg+1 >= argc ) goto printHelp;
int32_t startFileNum = 0;
int32_t numFiles = -1;
bool includeTree = true;
const char *coll = "";

// so we do not log every collection coll.conf we load
g_conf.m_doingCommandLine = true;

// we have to init collection db because we need to know if
// the collnum is legit or not in the tree
if ( ! g_collectiondb.loadAllCollRecs() ) {
log("db: Collectiondb init failed." ); return 1; }

if ( cmdarg+2 < argc ) coll = argv[cmdarg+2];
if ( cmdarg+3 < argc ) startFileNum = atoi(argv[cmdarg+3]);
if ( cmdarg+4 < argc ) numFiles = atoi(argv[cmdarg+4]);
if ( cmdarg+5 < argc ) includeTree = argToBoolean(argv[cmdarg+5]);

if ( argv[cmdarg+1][0] == 's' ) {
int32_t firstIp = 0;
if(cmdarg+6 < argc)
firstIp = atoip(argv[cmdarg+6]);

int32_t ret = verifySpiderdb ( coll, startFileNum, numFiles, includeTree, firstIp );
if ( ret == -1 ) {
fprintf(stdout,"error verifying spiderdb\n");
}
}
else {
goto printHelp;
}
// disable any further logging so final log msg is clear
g_log.m_disabled = true;
g_collectiondb.reset();
return 0;
}
// // . gb dump [dbLetter][coll][fileNum] [numFiles] [includeTree][termId]
// // . spiderdb is special:
// //   gb dump s [coll][fileNum] [numFiles] [includeTree] [0=old|1=new]
// //   [priority] [printStats?]
// if ( strcmp ( cmd , "verify" ) == 0 ) {
// 	//
// 	// tell Collectiondb, not to verify each rdb's data
// 	//
// 	g_dumpMode = true;
//
// 	if ( cmdarg+1 >= argc ) goto printHelp;
// 	int32_t startFileNum = 0;
// 	int32_t numFiles = -1;
// 	bool includeTree = true;
// 	const char *coll = "";
//
// 	// so we do not log every collection coll.conf we load
// 	g_conf.m_doingCommandLine = true;
//
// 	// we have to init collection db because we need to know if
// 	// the collnum is legit or not in the tree
// 	if ( ! g_collectiondb.loadAllCollRecs() ) {
// 		log("db: Collectiondb init failed." ); return 1; }
//
// 	if ( cmdarg+2 < argc ) coll = argv[cmdarg+2];
// 	if ( cmdarg+3 < argc ) startFileNum = atoi(argv[cmdarg+3]);
// 	if ( cmdarg+4 < argc ) numFiles = atoi(argv[cmdarg+4]);
// 	if ( cmdarg+5 < argc ) includeTree = argToBoolean(argv[cmdarg+5]);
//
// 	if ( argv[cmdarg+1][0] == 's' ) {
// 		int32_t firstIp = 0;
// 		if(cmdarg+6 < argc)
// 			firstIp = atoip(argv[cmdarg+6]);
//
// 		int32_t ret = verifySpiderdb ( coll, startFileNum, numFiles, includeTree, firstIp );
// 		if ( ret == -1 ) {
// 			fprintf(stdout,"error verifying spiderdb\n");
// 		}
// 	}
// 	else {
// 		goto printHelp;
// 	}
// 	// disable any further logging so final log msg is clear
// 	g_log.m_disabled = true;
// 	g_collectiondb.reset();
// 	return 0;
// }



@ -2706,6 +2706,7 @@ public:
int32_t m_numErrorReplies;
};

#if 0
static HashTableX g_ut;

static void addUStat1(const SpiderRequest *sreq, bool hadReply , int32_t now) {
@ -3356,7 +3357,7 @@ static int32_t dumpSpiderdbCsv(const char *coll) {
}
return 0;
}

#endif

// time speed of inserts into RdbTree for indexdb
static bool hashtest() {
@ -3874,6 +3875,7 @@ static void dumpWantedTitledbRecs(const char *coll, int32_t startFileNum, int32_



#if 0
static void dumpUnwantedSpiderdbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree) {
if (startFileNum < 0) {
log(LOG_LOGIC, "db: Start file number is < 0. Must be >= 0.");
@ -4081,7 +4083,7 @@ static int32_t verifySpiderdb(const char *coll, int32_t startFileNum, int32_t nu
done:
return 0;
}

#endif


static bool parseTest(const char *coll, int64_t docId, const char *query) {
rdbid_t.h
@ -11,7 +11,7 @@ enum rdbid_t {
RDB_TITLEDB = 3,
// RDB_SECTIONDB = 4,
// RDB_SYNCDB = 5,
RDB_SPIDERDB = 6,
RDB_SPIDERDB_DEPRECATED = 6,
RDB_DOLEDB = 7,
// RDB_TFNDB = 8,
RDB_CLUSTERDB = 9,
@ -33,7 +33,7 @@ enum rdbid_t {
// RDB2_INDEXDB2 = 21,
RDB2_TITLEDB2 = 22,
// RDB2_SECTIONDB2 = 23,
RDB2_SPIDERDB2 = 24,
RDB2_SPIDERDB2_DEPRECATED = 24,
// RDB2_TFNDB2 = 25,
RDB2_CLUSTERDB2 = 26,
// RDB2_DATEDB2 = 27,
@ -43,6 +43,8 @@ enum rdbid_t {
RDB2_TAGDB2 = 31,
RDB2_POSDB2 = 32,
// RDB2_CATDB2 = 33,
RDB_SPIDERDB_SQLITE = 34,
RDB2_SPIDERDB2_SQLITE = 35,
RDB_END
};
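
//Note: the renamed enumerators deliberately keep their old numeric values
//(RDB_SPIDERDB_DEPRECATED = 6, RDB2_SPIDERDB2_DEPRECATED = 24) so existing
//on-disk data stays readable during the migration, while the sqlite-backed
//variants get the new ids 34 and 35.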