It compiles, links, starts, and can put seeds into the sqlite database

This commit is contained in:
Ivan Skytte Jørgensen 2017-10-10 15:54:16 +02:00
parent 50b363b8fe
commit 9785fec43b
33 changed files with 1103 additions and 535 deletions

@ -150,7 +150,7 @@ bool Collectiondb::cleanTrees() {
g_posdb.getRdb()->cleanTree();
g_titledb.getRdb()->cleanTree();
g_tagdb.getRdb()->cleanTree();
g_spiderdb.getRdb()->cleanTree();
g_spiderdb.getRdb_deprecated()->cleanTree();
g_doledb.getRdb()->cleanTree();
g_clusterdb.getRdb()->cleanTree();
g_linkdb.getRdb()->cleanTree();
@ -436,7 +436,7 @@ bool Collectiondb::addRdbBasesForCollRec ( CollectionRec *cr ) {
if ( ! g_tagdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_clusterdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_linkdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_spiderdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_spiderdb.getRdb_deprecated()->addRdbBase1(coll) ) goto hadError;
if ( ! g_doledb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
// now clean the trees
@ -504,7 +504,6 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) {
g_titledb.getRdb()->delColl ( coll );
g_tagdb.getRdb()->delColl ( coll );
g_spiderdb.getRdb()->delColl ( coll );
g_doledb.getRdb()->delColl ( coll );
g_clusterdb.getRdb()->delColl ( coll );
g_linkdb.getRdb()->delColl ( coll );
@ -765,7 +764,6 @@ bool Collectiondb::resetColl2(collnum_t oldCollnum, collnum_t newCollnum) {
g_posdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_titledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_tagdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_spiderdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_doledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_clusterdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_linkdb.getRdb()->deleteColl ( oldCollnum , newCollnum );

@ -64,7 +64,7 @@ static const char update_statement_duplicate_request[] =
int convertSpiderDb(const char *collname) {
if(!g_spiderdb.init())
return 1;
if(!g_spiderdb.getRdb()->addRdbBase1(collname))
if(!g_spiderdb.getRdb_deprecated()->addRdbBase1(collname))
return 2;
collnum_t collnum = g_collectiondb.getRec(collname)->m_collnum;
@ -134,7 +134,7 @@ int convertSpiderDb(const char *collname) {
printf("Starting conversion\n");
for(;;) {
// use msg5 to get the list, should ALWAYS block since no threads
if(!msg5.getList(RDB_SPIDERDB,
if(!msg5.getList(RDB_SPIDERDB_DEPRECATED,
collnum,
&list,
&startKey,
@ -221,7 +221,8 @@ int convertSpiderDb(const char *collname) {
(spiderRequest->m_hadReply ? (1<<10) : 0) |
(spiderRequest->m_fakeFirstIp ? (1<<11) : 0) |
(spiderRequest->m_hasAuthorityInlink ? (1<<12) : 0) |
(spiderRequest->m_avoidSpiderLinks ? (1<<13) : 0);
(spiderRequest->m_hasAuthorityInlinkValid ? (1<<13) : 0) |
(spiderRequest->m_avoidSpiderLinks ? (1<<14) : 0);
sqlite3_bind_int(stmt, 11, rqf);
if(spiderRequest->m_priority>=0)
sqlite3_bind_int(stmt, 12, spiderRequest->m_priority);
@ -240,7 +241,8 @@ int convertSpiderDb(const char *collname) {
(prevSpiderReply->m_isPermalink ? (1<<1) : 0) |
(prevSpiderReply->m_isIndexed ? (1<<2) : 0) |
(prevSpiderReply->m_hasAuthorityInlink ? (1<<3) : 0) |
(prevSpiderReply->m_fromInjectionRequest ? (1<<4) : 0);
(prevSpiderReply->m_fromInjectionRequest ? (1<<4) : 0) |
(prevSpiderReply->m_isIndexedINValid ? (1<<5) : 0);
sqlite3_bind_int(stmt, 21, rpf);
}
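//Note: the converter above packs SpiderRequest flags into bits 0-14 of
//m_requestFlags and SpiderReply flags into bits 0-5 of m_replyFlags; the decoder
//in SpiderdbRdbSqliteBridge::getList() must mirror exactly the same layout.
//A shared enum would make that coupling explicit (hypothetical sketch, not part
//of this commit):
enum SpiderdbSqliteRequestFlag {
RQF_RECYCLE_CONTENT = 1<<0, RQF_IS_ADD_URL = 1<<1,
RQF_IS_PAGE_REINDEX = 1<<2, RQF_IS_URL_CANONICAL = 1<<3,
RQF_IS_PAGE_PARSER = 1<<4, RQF_URL_IS_DOCID = 1<<5,
RQF_IS_RSS_EXT = 1<<6, RQF_IS_URL_PERMALINK = 1<<7,
RQF_FORCE_DELETE = 1<<8, RQF_IS_INJECTING = 1<<9,
RQF_HAD_REPLY = 1<<10, RQF_FAKE_FIRST_IP = 1<<11,
RQF_HAS_AUTHORITY_INLINK = 1<<12, RQF_HAS_AUTHORITY_INLINK_VALID = 1<<13,
RQF_AVOID_SPIDER_LINKS = 1<<14,
};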

@ -252,10 +252,8 @@ void DailyMerge::dailyMergeLoop ( ) {
// tell it to save, otherwise this might not get saved
m_cr->setNeedsSave();
// initiate dumps
g_spiderdb.getRdb ()->submitRdbDumpJob(true);
g_linkdb.getRdb ()->submitRdbDumpJob(true);
// if neither has recs in tree, go to next mode
if(g_spiderdb.getRdb()->getNumUsedNodes()>0) return;
if(g_linkdb .getRdb()->getNumUsedNodes()>0) return;
// ok, all trees are clear and dumped
m_mergeMode = 5;
@ -267,9 +265,6 @@ void DailyMerge::dailyMergeLoop ( ) {
if ( m_mergeMode == 5 ) {
// kick off the merges if not already going
if(g_spiderdb.getRdb()->getBase(m_cr->m_collnum)->attemptMerge(1,true,2))
return;
if(g_linkdb.getRdb()->getBase(m_cr->m_collnum)->attemptMerge(1,true,2))
return;

@ -1518,8 +1518,10 @@ uint32_t Hostdb::getShardNum(rdbid_t rdbId, const void *k) const {
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
}
case RDB_SPIDERDB:
case RDB2_SPIDERDB2: {
case RDB_SPIDERDB_DEPRECATED:
case RDB2_SPIDERDB2_DEPRECATED:
case RDB_SPIDERDB_SQLITE:
case RDB2_SPIDERDB2_SQLITE: {
int32_t firstIp = Spiderdb::getFirstIp((key128_t *)k);
// do what Spider.h getGroupId() used to do so we are
// backwards compatible

@ -30,6 +30,8 @@ OBJS_O0 = \
Query.o \
RdbCache.o RdbDump.o RdbMem.o RdbMerge.o RdbScan.o RdbTree.o \
Rebalance.o Repair.o RobotRule.o Robots.o \
SpiderdbSqlite.o \
SpiderdbRdbSqliteBridge.o \
Sanity.o ScalingFunctions.o SearchInput.o SiteGetter.o Speller.o SpiderProxy.o Stats.o SummaryCache.o Synonyms.o \
Tagdb.o TcpServer.o Titledb.o \
Version.o \

@ -164,7 +164,7 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
// get groupid from hostid here lest we core in getGroupId() below.
// it does that for dumping spiderdb to the client browser. they
// can download the whole enchilada.
if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB_DEPRECATED )
m_shardNum = 0;
// did they force it? core until i figure out what this is
else if ( forceParitySplit >= 0 )
@ -197,7 +197,7 @@ bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;
//if it is spiderdb then we only have it if we are a spider host too
if((rdbId == RDB_SPIDERDB || rdbId == RDB2_SPIDERDB2) &&
if((rdbId == RDB_SPIDERDB_DEPRECATED || rdbId == RDB2_SPIDERDB2_DEPRECATED) &&
isLocal &&
!g_hostdb.getMyHost()->m_spiderEnabled)
{

@ -160,7 +160,7 @@ class RdbCache *getDiskPageCache ( rdbid_t rdbId ) {
maxRecs = maxMem / 3000;
dbname = "titdbcache";
break;
case RDB_SPIDERDB:
case RDB_SPIDERDB_DEPRECATED:
rpc = &g_rdbCaches[4];
maxMem = g_conf.m_spiderdbFileCacheSize;
maxRecs = maxMem / 3000;

@ -11,6 +11,7 @@
#include "ip.h"
#include "Mem.h"
#include "Titledb.h" // for Titledb::validateSerializedRecord
#include "SpiderdbRdbSqliteBridge.h"
#include <sys/stat.h> //stat()
#include <fcntl.h>
@ -240,59 +241,71 @@ static bool Msg4In::addMetaList(const char *p, UdpSlot *slot) {
Titledb::validateSerializedRecord( rec, recSize );
}
// . get the rdb to which it belongs, use Msg0::getRdb()
// . do not call this for every rec if we do not have to
if (rdbId != lastRdbId || !rdb) {
rdb = getRdbFromId(rdbId);
if(rdbId!=RDB_SPIDERDB_DEPRECATED && rdbId!=RDB2_SPIDERDB2_DEPRECATED) {
// . get the rdb to which it belongs, use Msg0::getRdb()
// . do not call this for every rec if we do not have to
if (rdbId != lastRdbId || !rdb) {
rdb = getRdbFromId(rdbId);
if (!rdb) {
char ipbuf[16];
log(LOG_WARN, "msg4: rdbId of %" PRId32" unrecognized from hostip=%s. dropping WHOLE request",
(int32_t)rdbId, slot ? iptoa(slot->getIp(),ipbuf) : "unknown");
if (!rdb) {
char ipbuf[16];
log(LOG_WARN, "msg4: rdbId of %" PRId32" unrecognized from hostip=%s. dropping WHOLE request",
(int32_t)rdbId, slot ? iptoa(slot->getIp(),ipbuf) : "unknown");
gbshutdownAbort(true);
}
// an uninitialized secondary rdb?
// don't core any more, we probably restarted this shard
// and it needs to wait for host #0 to syncs its
// g_conf.m_repairingEnabled to '1' so it can start its
// Repair.cpp repairWrapper() loop and init the secondary
// rdbs so "rdb" here won't be NULL any more.
if (!rdb->isInitialized()) {
time_t currentTime = getTime();
static time_t s_lastTime = 0;
if (currentTime > s_lastTime + 10) {
s_lastTime = currentTime;
log(LOG_WARN, "msg4: oops. got an rdbId key for a secondary "
"rdb and not in repair mode. waiting to be in repair mode.");
}
g_errno = ETRYAGAIN;
return false;
}
}
// if we don't have data, recSize must be the same with keySize
if (rdb->getFixedDataSize() == 0 && recSize != rdb->getKeySize()) {
gbshutdownAbort(true);
}
// an uninitialized secondary rdb?
// don't core any more, we probably restarted this shard
// and it needs to wait for host #0 to syncs its
// g_conf.m_repairingEnabled to '1' so it can start its
// Repair.cpp repairWrapper() loop and init the secondary
// rdbs so "rdb" here won't be NULL any more.
if (!rdb->isInitialized()) {
time_t currentTime = getTime();
static time_t s_lastTime = 0;
if (currentTime > s_lastTime + 10) {
s_lastTime = currentTime;
log(LOG_WARN, "msg4: oops. got an rdbId key for a secondary "
"rdb and not in repair mode. waiting to be in repair mode.");
}
g_errno = ETRYAGAIN;
return false;
auto &rdbItem = rdbItems[rdbId];
++rdbItem.m_numRecs;
int32_t dataSize = recSize - rdb->getKeySize();
if (rdb->getFixedDataSize() == -1) {
dataSize -= 4;
}
rdbItem.m_dataSizes += dataSize;
rdbItem.m_items.emplace_back(collnum, rec, recSize);
} else {
//spiderdb records no longer reside in an Rdb
// don't add to spiderdb when we're nospider host
if (!g_hostdb.getMyHost()->m_spiderEnabled)
continue;
auto &rdbItem = rdbItems[rdbId];
++rdbItem.m_numRecs;
int32_t dataSize = recSize - sizeof(key128_t) - 4;
rdbItem.m_dataSizes += dataSize;
rdbItem.m_items.emplace_back(collnum, rec, recSize);
}
// if we don't have data, recSize must be the same with keySize
if (rdb->getFixedDataSize() == 0 && recSize != rdb->getKeySize()) {
gbshutdownAbort(true);
}
// don't add to spiderdb when we're nospider host
if (!g_hostdb.getMyHost()->m_spiderEnabled && (rdbId == RDB_SPIDERDB || rdbId == RDB2_SPIDERDB2)) {
continue;
}
auto &rdbItem = rdbItems[rdbId];
++rdbItem.m_numRecs;
int32_t dataSize = recSize - rdb->getKeySize();
if (rdb->getFixedDataSize() == -1) {
dataSize -= 4;
}
rdbItem.m_dataSizes += dataSize;
rdbItem.m_items.emplace_back(collnum, rec, recSize);
// advance over the rec data to point to next entry
p += recSize;
}
@ -300,12 +313,14 @@ static bool Msg4In::addMetaList(const char *p, UdpSlot *slot) {
bool hasRoom = true;
bool anyDumping = false;
for (auto const &rdbItem : rdbItems) {
Rdb *rdb = getRdbFromId(rdbItem.first);
if (rdb->isDumping()) {
anyDumping = true;
} else if (!rdb->hasRoom(rdbItem.second.m_numRecs, rdbItem.second.m_dataSizes)) {
rdb->submitRdbDumpJob(true);
hasRoom = false;
if(rdbItem.first!=RDB_SPIDERDB_DEPRECATED && rdbItem.first!=RDB2_SPIDERDB2_DEPRECATED) {
Rdb *rdb = getRdbFromId(rdbItem.first);
if (rdb->isDumping()) {
anyDumping = true;
} else if (!rdb->hasRoom(rdbItem.second.m_numRecs, rdbItem.second.m_dataSizes)) {
rdb->submitRdbDumpJob(true);
hasRoom = false;
}
}
}
@ -323,53 +338,66 @@ static bool Msg4In::addMetaList(const char *p, UdpSlot *slot) {
for (auto const &rdbItem : rdbItems) {
Rdb *rdb = getRdbFromId(rdbItem.first);
if(rdbItem.first!=RDB_SPIDERDB_DEPRECATED && rdbItem.first!=RDB2_SPIDERDB2_DEPRECATED) {
Rdb *rdb = getRdbFromId(rdbItem.first);
bool status = false;
for (auto const &item : rdbItem.second.m_items) {
// reset g_errno
g_errno = 0;
// . make a list from this data
// . skip over the first 4 bytes which is the rdbId
// . TODO: embed the rdbId in the msgtype or something...
RdbList list;
// set the list
// todo: dodgy cast to char*. RdbList should be fixed
list.set((char *)item.m_rec, item.m_recSize, (char *)item.m_rec, item.m_recSize,
rdb->getFixedDataSize(), false, rdb->useHalfKeys(), rdb->getKeySize());
// keep track of stats
rdb->readRequestAdd(item.m_recSize);
// this returns false and sets g_errno on error
status = rdb->addListNoSpaceCheck(item.m_collNum, &list);
// bad coll #? ignore it. common when deleting and resetting
// collections using crawlbot. but there are other recs in this
// list from different collections, so do not abandon the whole
// meta list!! otherwise we lose data!!
if (g_errno == ENOCOLLREC && !status) {
bool status = true;
for (auto const &item : rdbItem.second.m_items) {
// reset g_errno
g_errno = 0;
status = true;
// . make a list from this data
// . skip over the first 4 bytes which is the rdbId
// . TODO: embed the rdbId in the msgtype or something...
RdbList list;
// set the list
// todo: dodgy cast to char*. RdbList should be fixed
list.set((char *)item.m_rec, item.m_recSize, (char *)item.m_rec, item.m_recSize,
rdb->getFixedDataSize(), false, rdb->useHalfKeys(), rdb->getKeySize());
// keep track of stats
rdb->readRequestAdd(item.m_recSize);
// this returns false and sets g_errno on error
status = rdb->addListNoSpaceCheck(item.m_collNum, &list);
// bad coll #? ignore it. common when deleting and resetting
// collections using crawlbot. but there are other recs in this
// list from different collections, so do not abandon the whole
// meta list!! otherwise we lose data!!
if (g_errno == ENOCOLLREC && !status) {
g_errno = 0;
status = true;
}
if (!status) {
break;
}
}
if (!status) {
break;
}
}
if (!status) {
break;
} else {
bool status = true;
for(auto const &item : rdbItem.second.m_items) {
status = SpiderdbRdbSqliteBridge::addRecord(item.m_collNum, item.m_rec, item.m_recSize);
if(!status)
break;
}
if(!status)
break;
}
}
// verify integrity if wanted
if (g_conf.m_verifyTreeIntegrity) {
for (auto const &rdbItem : rdbItems) {
Rdb *rdb = getRdbFromId(rdbItem.first);
rdb->verifyTreeIntegrity();
for(auto const &rdbItem : rdbItems) {
if(rdbItem.first!=RDB_SPIDERDB_DEPRECATED && rdbItem.first!=RDB2_SPIDERDB2_DEPRECATED) {
Rdb *rdb = getRdbFromId(rdbItem.first);
rdb->verifyTreeIntegrity();
}
}
}

@ -411,7 +411,7 @@ bool Msg5::readList ( ) {
// . we set endKey for spiderdb when reading from tree above
// based on the current minRecSizes so do not mess with it
// in that case.
if ( m_rdbId != RDB_SPIDERDB ) {
if ( m_rdbId != RDB_SPIDERDB_DEPRECATED ) {
//m_newMinRecSizes += rs * numNegativeRecs;
int32_t nn = m_newMinRecSizes + rs * numNegativeRecs;
if ( rs > 0 && nn < m_newMinRecSizes ) nn = 0x7fffffff;
@ -540,7 +540,7 @@ bool Msg5::needsRecall() {
}
// limit to just doledb for now in case it results in data loss
if( rc && m_readAbsolutelyNothing && (m_rdbId==RDB_DOLEDB||m_rdbId==RDB_SPIDERDB) ) {
if( rc && m_readAbsolutelyNothing && (m_rdbId==RDB_DOLEDB||m_rdbId==RDB_SPIDERDB_DEPRECATED) ) {
rc = false;
}

@ -140,7 +140,7 @@ bool getSpiderRequestMetaList ( const char *doc, SafeBuf *listBuf, bool spiderLi
}
// store rdbid first
if ( ! listBuf->pushChar(RDB_SPIDERDB) ) {
if ( ! listBuf->pushChar(RDB_SPIDERDB_DEPRECATED) ) {
// return false with g_errno set
return false;
}

@ -453,7 +453,7 @@ bool Msg1c::gotList ( ) {
log("reindex: adding docid list (docids:%d) to spiderdb", m_numDocIdsAdded);
return m_msg4.addMetaList(&m_sb, m_collnum, this, addedListWrapper, RDB_SPIDERDB);
return m_msg4.addMetaList(&m_sb, m_collnum, this, addedListWrapper, RDB_SPIDERDB_DEPRECATED);
}
void addedListWrapper ( void *state ) {

@ -152,7 +152,7 @@ static bool getSpiderRecs(State *st) {
key128_t endKey = Spiderdb::makeLastKey(st->m_firstip, uh48);
log(LOG_TRACE,"PageSpiderdbLookup: getSpiderRecs(%p): Calling Msg0::getList()", st);
if(!st->m_msg0.getList(-1, //hostId
RDB_SPIDERDB,
RDB_SPIDERDB_DEPRECATED, //TODO: use rdb_spiderdb_sqlite and new record format (also much simpler)
st->m_collnum,
&st->m_rdbList,
(const char*)&startKey,
@ -226,7 +226,7 @@ static bool sendResult(State *st) {
if(st->m_url_str[0]) {
int64_t uh48 = hash64b(st->m_url_str);
key128_t startKey = Spiderdb::makeFirstKey(st->m_firstip, uh48);
uint32_t shardNum = g_hostdb.getShardNum(RDB_SPIDERDB, &startKey);
uint32_t shardNum = g_hostdb.getShardNum(RDB_SPIDERDB_SQLITE, &startKey);
sb.safePrintf("<p>Shard: %u</p>\n", shardNum);
int32_t numHosts;
const Host *host = g_hostdb.getShard(shardNum, &numHosts);

@ -1420,13 +1420,13 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
const Rdb *rdbs[] = {
g_posdb.getRdb(),
g_titledb.getRdb(),
g_spiderdb.getRdb(),
g_doledb.getRdb() ,
g_tagdb.getRdb(),
g_clusterdb.getRdb(),
g_linkdb.getRdb(),
};
int32_t nr = sizeof(rdbs) / sizeof(Rdb *);
//TODO: sqlite: show statistics for sqlite database(s)
// print dbname
p.safePrintf("<tr class=poo><td>&nbsp;</td>");

@ -700,11 +700,6 @@ static bool CommandMergeTitledb(const char *rec) {
}
static bool CommandMergeSpiderdb(const char *rec) {
forceMergeAll(RDB_SPIDERDB);
return true;
}
static bool CommandMergeLinkdb(const char *rec) {
forceMergeAll(RDB_LINKDB);
return true;
@ -729,7 +724,7 @@ static bool CommandForceIt(const char *rec) {
static bool CommandDiskDump(const char *rec) {
g_clusterdb.getRdb()->submitRdbDumpJob(true);
g_tagdb.getRdb()->submitRdbDumpJob(true);
g_spiderdb.getRdb()->submitRdbDumpJob(true);
g_spiderdb.getRdb_deprecated()->submitRdbDumpJob(true);
g_posdb.getRdb()->submitRdbDumpJob(true);
g_titledb.getRdb()->submitRdbDumpJob(true);
g_linkdb.getRdb()->submitRdbDumpJob(true);
@ -5071,18 +5066,6 @@ void Parms::init ( ) {
m->m_obj = OBJ_CONF;
m++;
m->m_title = "tight merge spiderdb";
m->m_desc = "Merges all outstanding spiderdb files.";
m->m_cgi = "spmerge";
m->m_type = TYPE_CMD;
m->m_func = CommandMergeSpiderdb;
m->m_cast = true;
m->m_group = false;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "tight merge linkdb";
m->m_desc = "Merges all outstanding linkdb files.";
m->m_cgi = "lmerge";

@ -265,7 +265,7 @@ bool Process::init ( ) {
// followed by titledb perhaps...
m_rdbs[m_numRdbs++] = g_titledb.getRdb ();
m_rdbs[m_numRdbs++] = g_posdb.getRdb ();
m_rdbs[m_numRdbs++] = g_spiderdb.getRdb ();
m_rdbs[m_numRdbs++] = g_spiderdb.getRdb_deprecated();
m_rdbs[m_numRdbs++] = g_clusterdb.getRdb ();
m_rdbs[m_numRdbs++] = g_tagdb.getRdb ();
m_rdbs[m_numRdbs++] = g_linkdb.getRdb ();
@ -274,7 +274,7 @@ bool Process::init ( ) {
m_rdbs[m_numRdbs++] = g_doledb.getRdb ();
m_rdbs[m_numRdbs++] = g_titledb2.getRdb ();
m_rdbs[m_numRdbs++] = g_posdb2.getRdb ();
m_rdbs[m_numRdbs++] = g_spiderdb2.getRdb ();
m_rdbs[m_numRdbs++] = g_spiderdb2.getRdb_deprecated();
m_rdbs[m_numRdbs++] = g_clusterdb2.getRdb ();
m_rdbs[m_numRdbs++] = g_linkdb2.getRdb ();
m_rdbs[m_numRdbs++] = g_tagdb2.getRdb ();

Rdb.cpp (166 changed lines)

@ -137,13 +137,15 @@ bool Rdb::init(const char *dbname,
case RDB2_POSDB2:
case RDB_TITLEDB:
case RDB2_TITLEDB2:
case RDB_SPIDERDB:
case RDB_SPIDERDB_DEPRECATED:
case RDB_DOLEDB:
case RDB2_SPIDERDB2:
case RDB2_SPIDERDB2_DEPRECATED:
case RDB_LINKDB:
case RDB2_LINKDB2:
m_pageSize = GB_INDEXDB_PAGE_SIZE;
break;
// Not a real rdb: case RDB_SPIDERDB_SQLITE:
// Not a real rdb: case RDB2_SPIDERDB2_SQLITE:
default:
m_pageSize = GB_TFNDB_PAGE_SIZE;
}
@ -1062,14 +1064,14 @@ void attemptMergeAll() {
RDB_TITLEDB,
RDB_TAGDB,
RDB_LINKDB,
RDB_SPIDERDB,
RDB_SPIDERDB_DEPRECATED,
RDB_CLUSTERDB,
// also try to merge on rdbs being rebuilt
RDB2_POSDB2,
RDB2_TITLEDB2,
RDB2_TAGDB2,
RDB2_LINKDB2,
RDB2_SPIDERDB2,
RDB2_SPIDERDB2_DEPRECATED,
RDB2_CLUSTERDB2
};
static const unsigned numRdbs = sizeof(rdbid)/sizeof(rdbid[0]);
@ -1130,7 +1132,7 @@ bool Rdb::addList(collnum_t collnum, RdbList *list, bool checkForRoom) {
m_rdbId == RDB_CLUSTERDB ||
m_rdbId == RDB_LINKDB ||
m_rdbId == RDB_DOLEDB ||
m_rdbId == RDB_SPIDERDB ) ) {
m_rdbId == RDB_SPIDERDB_DEPRECATED ) ) {
// allow banning of sites still
log(LOG_WARN, "db: How did an add come in while in repair mode? rdbName=%s", getDbnameFromId(m_rdbId));
@ -1512,46 +1514,6 @@ bool Rdb::addRecord(collnum_t collnum, const char *key, const char *data, int32_
}
}
// . cancel any spider request that is a dup in the dupcache to save disk space
// . twins might have different dupcaches so they might have different dups,
// but it shouldn't be a big deal because they are dups!
if (m_rdbId == RDB_SPIDERDB && !KEYNEG(key)) {
// . this will create it if spiders are on and its NULL
// . even if spiders are off we need to create it so
// that the request can adds its ip to the waitingTree
SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
// skip if not there
if (!sc) {
logTrace(g_conf.m_logTraceRdb, "END. %s: No spider coll. Returning true", m_dbname);
return true;
}
/// @todo ALC we're making an assumption that data passed in is part of a SpiderRequest (fix this!)
const SpiderRequest *sreq = reinterpret_cast<const SpiderRequest *>(data - 4 - sizeof(key128_t));
// is it really a request and not a SpiderReply?
if (Spiderdb::isSpiderRequest(&(sreq->m_key))) {
// skip if in dup cache. do NOT add to cache since
// addToWaitingTree() in Spider.cpp will do that when called
// from addSpiderRequest() below
if (sc->isInDupCache(sreq, false)) {
logDebug(g_conf.m_logDebugSpider, "spider: adding spider req %s is dup. skipping.", sreq->m_url);
logTrace(g_conf.m_logTraceRdb, "END. %s: Duplicated spider req. Returning true", m_dbname);
return true;
}
// if we are overflowing...
if (!sreq->m_isAddUrl && !sreq->m_isPageReindex && !sreq->m_urlIsDocId && !sreq->m_forceDelete &&
sc->isFirstIpInOverflowList(sreq->m_firstIp)) {
g_stats.m_totalOverflows++;
logDebug(g_conf.m_logDebugSpider, "spider: skipping for overflow url %s ", sreq->m_url);
logTrace(g_conf.m_logTraceRdb, "END. %s: Overflow. Returning true", m_dbname);
return true;
}
}
}
if (m_useTree) {
if (!m_tree.addNode(collnum, key, dataCopy, dataSize)) {
log(LOG_INFO, "db: Had error adding data to %s: %s", m_dbname, mstrerror(g_errno));
@ -1577,7 +1539,7 @@ bool Rdb::addRecord(collnum_t collnum, const char *key, const char *data, int32_
}
// if adding to spiderdb, add to cache, too (except negative key)
if ((m_rdbId == RDB_SPIDERDB || m_rdbId == RDB_DOLEDB) && !KEYNEG(key)) {
if (m_rdbId == RDB_DOLEDB && !KEYNEG(key)) {
// . this will create it if spiders are on and its NULL
// . even if spiders are off we need to create it so
// that the request can adds its ip to the waitingTree
@ -1588,91 +1550,27 @@ bool Rdb::addRecord(collnum_t collnum, const char *key, const char *data, int32_
return true;
}
// if doing doledb...
if (m_rdbId == RDB_DOLEDB) {
int32_t pri = Doledb::getPriority((key96_t *)key);
// skip over corruption
if (pri < 0 || pri >= MAX_SPIDER_PRIORITIES) {
logTrace(g_conf.m_logTraceRdb, "END. %s: Done. Skip over corruption", m_dbname);
return true;
}
// if added positive key is before cursor, update cursor
if (KEYCMP(key, (char *)&sc->m_nextKeys[pri], sizeof(key96_t)) < 0) {
KEYSET((char *)&sc->m_nextKeys[pri], key, sizeof(key96_t));
if (g_conf.m_logDebugSpider) {
char keyStrBuf[MAX_KEYSTR_BYTES];
KEYSTR(key, 12, keyStrBuf);
logDebug(g_conf.m_logDebugSpider, "spider: cursor reset pri=%" PRId32" to %s", pri, keyStrBuf);
}
}
logTrace(g_conf.m_logTraceRdb, "END. %s: Done. For doledb. Returning true", m_dbname);
// that's it for doledb mods
int32_t pri = Doledb::getPriority((key96_t *)key);
// skip over corruption
if (pri < 0 || pri >= MAX_SPIDER_PRIORITIES) {
logTrace(g_conf.m_logTraceRdb, "END. %s: Done. Skip over corruption", m_dbname);
return true;
}
// . ok, now add that reply to the cache
/// @todo ALC we're making an assumption that data passed in is part of a SpiderRequest (fix this!)
// assume this is the rec (4 byte dataSize,spiderdb key is now 16 bytes)
const SpiderRequest *sreq = reinterpret_cast<const SpiderRequest *>(data - 4 - sizeof(key128_t));
// is it really a request and not a SpiderReply?
if (Spiderdb::isSpiderRequest(&sreq->m_key)) {
// add the request
// if added positive key is before cursor, update cursor
if (KEYCMP(key, (char *)&sc->m_nextKeys[pri], sizeof(key96_t)) < 0) {
KEYSET((char *)&sc->m_nextKeys[pri], key, sizeof(key96_t));
if (g_conf.m_logDebugSpider) {
// log that. why isn't this undoling always
char keyStrBuf[MAX_KEYSTR_BYTES];
KEYSTR((const char *)&sreq->m_key, sizeof(key128_t), keyStrBuf);
char ipbuf[16];
logDebug(g_conf.m_logDebugSpider, "spider: rdb: added spider request to spiderdb rdb tree"
" request for uh48=%" PRIu64" prntdocid=%" PRIu64" firstIp=%s spiderdbkey=%s",
sreq->getUrlHash48(), sreq->getParentDocId(), iptoa(sreq->m_firstIp,ipbuf), keyStrBuf);
}
// false means to NOT call evaluateAllRequests()
// because we call it below. the reason we do this
// is because it does not always get called
// in addSpiderRequest(), like if its a dup and
// gets "nuked". (removed callEval arg since not
// really needed)
sc->addSpiderRequest(sreq, gettimeofdayInMilliseconds());
} else {
// otherwise reply
SpiderReply *rr = (SpiderReply *)sreq;
// log that. why isn't this undoling always
logDebug(g_conf.m_logDebugSpider, "rdb: rdb: got spider reply for uh48=%" PRIu64, rr->getUrlHash48());
// add the reply
sc->addSpiderReply(rr);
/// @todo ALC why are we removing this here? this check should be at where we're trying to insert this
// don't actually add it if "fake". i.e. if it
// was an internal error of some sort... this will
// make it try over and over again i guess...
// no because we need some kinda reply so that gb knows
// the pagereindex docid-based spider requests are done,
// at least for now, because the replies were not being
// added for now. just for internal errors at least...
// we were not adding spider replies to the page reindexes
// as they completed and when i tried to rerun it
// the title recs were not found since they were deleted,
// so we gotta add the replies now.
int32_t indexCode = rr->m_errCode;
if (indexCode == EABANDONED) {
log(LOG_WARN, "rdb: not adding spiderreply to rdb because it was an internal error for uh48=%" PRIu64
" errCode = %s", rr->getUrlHash48(), mstrerror(indexCode));
m_tree.deleteNode(collnum, key, false);
KEYSTR(key, 12, keyStrBuf);
logDebug(g_conf.m_logDebugSpider, "spider: cursor reset pri=%" PRId32" to %s", pri, keyStrBuf);
}
}
// clear errors from adding to SpiderCache
g_errno = 0;
logTrace(g_conf.m_logTraceRdb, "END. %s: Done. For doledb. Returning true", m_dbname);
// that's it for doledb mods
return true;
}
logTrace(g_conf.m_logTraceRdb, "END. %s: Done. Returning true", m_dbname);
@ -1867,14 +1765,14 @@ Rdb *getRdbFromId ( rdbid_t rdbId ) {
case RDB_TAGDB: return g_tagdb.getRdb();
case RDB_POSDB: return g_posdb.getRdb();
case RDB_TITLEDB: return g_titledb.getRdb();
case RDB_SPIDERDB: return g_spiderdb.getRdb();
case RDB_SPIDERDB_DEPRECATED: return g_spiderdb.getRdb_deprecated();
case RDB_DOLEDB: return g_doledb.getRdb();
case RDB_CLUSTERDB: return g_clusterdb.getRdb();
case RDB_LINKDB: return g_linkdb.getRdb();
case RDB2_POSDB2: return g_posdb2.getRdb();
case RDB2_TITLEDB2: return g_titledb2.getRdb();
case RDB2_SPIDERDB2: return g_spiderdb2.getRdb();
case RDB2_SPIDERDB2_DEPRECATED: return g_spiderdb2.getRdb_deprecated();
case RDB2_CLUSTERDB2: return g_clusterdb2.getRdb();
case RDB2_LINKDB2: return g_linkdb2.getRdb();
case RDB2_TAGDB2: return g_tagdb2.getRdb();
@ -1888,14 +1786,14 @@ rdbid_t getIdFromRdb ( Rdb *rdb ) {
if ( rdb == g_tagdb.getRdb () ) return RDB_TAGDB;
if ( rdb == g_posdb.getRdb () ) return RDB_POSDB;
if ( rdb == g_titledb.getRdb () ) return RDB_TITLEDB;
if ( rdb == g_spiderdb.getRdb () ) return RDB_SPIDERDB;
if ( rdb == g_spiderdb.getRdb_deprecated() ) return RDB_SPIDERDB_DEPRECATED;
if ( rdb == g_doledb.getRdb () ) return RDB_DOLEDB;
if ( rdb == g_clusterdb.getRdb () ) return RDB_CLUSTERDB;
if ( rdb == g_linkdb.getRdb () ) return RDB_LINKDB;
if ( rdb == g_posdb2.getRdb () ) return RDB2_POSDB2;
if ( rdb == g_tagdb2.getRdb () ) return RDB2_TAGDB2;
if ( rdb == g_titledb2.getRdb () ) return RDB2_TITLEDB2;
if ( rdb == g_spiderdb2.getRdb () ) return RDB2_SPIDERDB2;
if ( rdb == g_spiderdb2.getRdb_deprecated() ) return RDB2_SPIDERDB2_DEPRECATED;
if ( rdb == g_clusterdb2.getRdb () ) return RDB2_CLUSTERDB2;
if ( rdb == g_linkdb2.getRdb () ) return RDB2_LINKDB2;
@ -1908,9 +1806,11 @@ bool isSecondaryRdb ( rdbid_t rdbId ) {
case RDB2_POSDB2 : return true;
case RDB2_TAGDB2 : return true;
case RDB2_TITLEDB2 : return true;
case RDB2_SPIDERDB2 : return true;
case RDB2_SPIDERDB2_DEPRECATED : return true;
case RDB2_CLUSTERDB2 : return true;
case RDB2_LINKDB2 : return true;
case RDB2_SPIDERDB2_SQLITE : return true;
//(todo?) rdb2_spiderdb2_sqlite
default:
return false;
}
@ -1919,8 +1819,8 @@ bool isSecondaryRdb ( rdbid_t rdbId ) {
// use a quick table now...
char getKeySizeFromRdbId(rdbid_t rdbId) {
switch(rdbId) {
case RDB_SPIDERDB:
case RDB2_SPIDERDB2:
case RDB_SPIDERDB_DEPRECATED:
case RDB2_SPIDERDB2_DEPRECATED:
case RDB_TAGDB:
case RDB2_TAGDB2:
return sizeof(key128_t); // 16
@ -1958,7 +1858,8 @@ int32_t getDataSizeFromRdbId ( rdbid_t rdbId ) {
ds = 0;
else if ( i == RDB_TITLEDB ||
i == RDB_TAGDB ||
i == RDB_SPIDERDB ||
i == RDB_SPIDERDB_DEPRECATED ||
i == RDB_SPIDERDB_SQLITE ||
i == RDB_DOLEDB )
ds = -1;
else if ( i == RDB2_POSDB2 ||
@ -1967,7 +1868,8 @@ int32_t getDataSizeFromRdbId ( rdbid_t rdbId ) {
ds = 0;
else if ( i == RDB2_TITLEDB2 ||
i == RDB2_TAGDB2 ||
i == RDB2_SPIDERDB2 )
i == RDB2_SPIDERDB2_DEPRECATED ||
i == RDB2_SPIDERDB2_SQLITE )
ds = -1;
else {
continue;

@ -470,7 +470,7 @@ bool RdbBase::setFiles ( ) {
return false;
// spiderdb should start with file 0001.dat or 0000.dat
if ( m_numFiles > 0 && m_fileInfo[0].m_fileId > 1 && m_rdb->getRdbId() == RDB_SPIDERDB ) {
if ( m_numFiles > 0 && m_fileInfo[0].m_fileId > 1 && m_rdb->getRdbId() == RDB_SPIDERDB_DEPRECATED ) {
//isj: is that even true anymore? Ok, crashed merges and lost file0000* are not a
//good thing but I don't see why it should affect spiderdb especially bad.
return fixNonfirstSpiderdbFiles();
@ -1194,8 +1194,8 @@ static int32_t getMaxLostPositivesPercentage(rdbid_t rdbId) {
case RDB_TITLEDB:
case RDB2_TITLEDB2:
return g_conf.m_titledbMaxLostPositivesPercentage;
case RDB_SPIDERDB:
case RDB2_SPIDERDB2:
case RDB_SPIDERDB_DEPRECATED:
case RDB2_SPIDERDB2_DEPRECATED:
return g_conf.m_spiderdbMaxLostPositivesPercentage;
case RDB_LINKDB:
case RDB2_LINKDB2:
@ -1675,7 +1675,7 @@ int32_t RdbBase::getMinToMerge(const CollectionRec *cr, rdbid_t rdbId, int32_t m
result = cr->m_titledbMinFilesToMerge;
logTrace(g_conf.m_logTraceRdbBase, "titledb. m_minToMerge: %d", m_minToMerge);
break;
case RDB_SPIDERDB:
case RDB_SPIDERDB_DEPRECATED:
result = cr->m_spiderdbMinFilesToMerge;
logTrace(g_conf.m_logTraceRdbBase, "spiderdb. m_minToMerge: %d", m_minToMerge);
break;
@ -1745,7 +1745,7 @@ bool RdbBase::attemptMerge(int32_t niceness, bool forceMergeAll, int32_t minToMe
if (forceMergeAll) {
log(LOG_INFO,"merge: forcing merge for %s. (collnum=%" PRId32")",m_dbname,(int32_t)m_collnum);
if (m_rdb->getRdbId() == RDB_SPIDERDB) {
if (m_rdb->getRdbId() == RDB_SPIDERDB_DEPRECATED) {
minMergeFileCount = 0;
}
}
@ -1907,7 +1907,7 @@ bool RdbBase::attemptMerge(int32_t niceness, bool forceMergeAll, int32_t minToMe
}
if (mergeFileCount == 1) {
int logLevel = (m_rdb->getRdbId() == RDB_SPIDERDB) ? LOG_INFO : LOG_LOGIC;
int logLevel = (m_rdb->getRdbId() == RDB_SPIDERDB_DEPRECATED) ? LOG_INFO : LOG_LOGIC;
log(logLevel,"merge:attemptMerge:resuming: filename with single file merge for %s coll=%s file=%s",m_dbname,m_coll,m_fileInfo[i].m_file->getFilename());
}

@ -735,7 +735,7 @@ bool RdbList::checkList_r(bool abortOnProblem, rdbid_t rdbId) {
gbshutdownAbort(true); }
}
}
if ( rdbId == RDB_SPIDERDB && ! KEYNEG(k) &&
if ( rdbId == RDB_SPIDERDB_DEPRECATED && ! KEYNEG(k) &&
getCurrentDataSize() > 0 ) {
char *rec = getCurrentRec();
// bad url in spider request?

@ -527,7 +527,7 @@ void RdbMerge::filterListWrapper(void *state) {
logTrace(g_conf.m_logTraceRdbMerge, "BEGIN. list=%p m_startKey=%s", &THIS->m_list, KEYSTR(THIS->m_startKey, THIS->m_ks));
if (THIS->m_rdbId == RDB_SPIDERDB) {
if (THIS->m_rdbId == RDB_SPIDERDB_DEPRECATED) {
dedupSpiderdbList(&(THIS->m_list));
} else if (THIS->m_rdbId == RDB_TITLEDB) {
// filterTitledbList(&(THIS->m_list));
@ -610,7 +610,7 @@ bool RdbMerge::filterList() {
// dedup for spiderdb before we dump it. try to save disk space.
//
/////
if (m_rdbId == RDB_SPIDERDB || m_rdbId == RDB_TITLEDB) {
if (m_rdbId == RDB_SPIDERDB_DEPRECATED || m_rdbId == RDB_TITLEDB) {
if (g_jobScheduler.submit(filterListWrapper, filterDoneWrapper, this, thread_type_merge_filter, 0)) {
return false;
}
@ -618,7 +618,7 @@ bool RdbMerge::filterList() {
log(LOG_WARN, "db: Unable to submit job for merge filter. Will run in main thread");
// fall back to filter without thread
if (m_rdbId == RDB_SPIDERDB) {
if (m_rdbId == RDB_SPIDERDB_DEPRECATED) {
dedupSpiderdbList(&m_list);
} else {
// filterTitledbList(&m_list);

@ -921,7 +921,7 @@ bool RdbTree::fixTree_unlocked() {
/// @todo ALC should we check repair RDB as well?
bool isTitledb = (m_rdbId == RDB_TITLEDB || m_rdbId == RDB2_TITLEDB2);
bool isSpiderdb = (m_rdbId == RDB_SPIDERDB || m_rdbId == RDB2_SPIDERDB2);
bool isSpiderdb = (m_rdbId == RDB_SPIDERDB_DEPRECATED || m_rdbId == RDB2_SPIDERDB2_DEPRECATED);
// now re-add the old nods to the tree, they should not be overwritten
// by addNode()
@ -1007,7 +1007,7 @@ bool RdbTree::checkTree_unlocked(bool printMsgs, bool doChainTest) const {
/// @todo ALC should we check repair RDB as well?
bool isTitledb = (m_rdbId == RDB_TITLEDB || m_rdbId == RDB2_TITLEDB2);
bool isSpiderdb = (m_rdbId == RDB_SPIDERDB || m_rdbId == RDB2_SPIDERDB2);
bool isSpiderdb = (m_rdbId == RDB_SPIDERDB_DEPRECATED || m_rdbId == RDB2_SPIDERDB2_DEPRECATED);
// now check parent kid correlations
for ( int32_t i = 0 ; i < m_minUnusedNode ; i++ ) {

@ -60,7 +60,6 @@ static Rdb **getSecondaryRdbs ( int32_t *nsr ) {
s_rdbs[s_nsr++] = g_titledb2.getRdb ();
s_rdbs[s_nsr++] = g_posdb2.getRdb ();
s_rdbs[s_nsr++] = g_spiderdb2.getRdb ();
s_rdbs[s_nsr++] = g_clusterdb2.getRdb ();
s_rdbs[s_nsr++] = g_linkdb2.getRdb ();
s_rdbs[s_nsr++] = g_tagdb2.getRdb ();
@ -529,8 +528,6 @@ void Repair::initScan ( ) {
if ( m_rebuildClusterdb )
if ( ! g_clusterdb2.init2 ( clusterdbMem ) ) goto hadError;
if ( m_rebuildSpiderdb )
if ( ! g_spiderdb2.init2 ( spiderdbMem ) ) goto hadError;
if ( m_rebuildLinkdb )
if ( ! g_linkdb2.init2 ( linkdbMem ) ) goto hadError;
@ -638,11 +635,6 @@ void Repair::getNextCollToRepair ( ) {
g_errno != EEXIST ) goto hadError;
}
if ( m_rebuildSpiderdb ) {
if ( ! g_spiderdb2.getRdb()->addRdbBase1 ( coll ) &&
g_errno != EEXIST ) goto hadError;
}
if ( m_rebuildLinkdb ) {
if ( ! g_linkdb2.getRdb()->addRdbBase1 ( coll ) &&
g_errno != EEXIST ) goto hadError;
@ -924,11 +916,6 @@ void Repair::updateRdbs ( ) {
rdb2 = g_clusterdb2.getRdb();
rdb1->updateToRebuildFiles ( rdb2 , m_cr->m_coll );
}
if ( m_rebuildSpiderdb ) {
rdb1 = g_spiderdb.getRdb();
rdb2 = g_spiderdb2.getRdb();
rdb1->updateToRebuildFiles ( rdb2 , m_cr->m_coll );
}
if ( m_rebuildLinkdb ) {
rdb1 = g_linkdb.getRdb();
rdb2 = g_linkdb2.getRdb();
@ -1356,12 +1343,6 @@ bool Repair::printRepairStatus(SafeBuf *sb) {
m_recsCorruptErrors +
m_recsDupDocIds ;
// the spiderdb scan stats (phase 2)
int64_t ns2 = m_spiderRecsScanned ;
int64_t nr2 = g_spiderdb.getRdb()->getNumTotalRecs() ;
float ratio2 = nr2 ? ((float)ns2 * 100.0) / (float)nr2 : 0.0;
int64_t errors2 = m_spiderRecSetErrors;
const char *newColl = " &nbsp; ";
const char *oldColl = " &nbsp; ";
@ -1519,38 +1500,6 @@ bool Repair::printRepairStatus(SafeBuf *sb) {
);
sb->safePrintf(
// spider recs done
"<tr bgcolor=#%s><td><b>spider recs scanned</b></td>"
"<td>%" PRId64" of %" PRId64" (%.2f%%)</td></tr>\n"
// spider recs set errors, parsing errors, etc.
"<tr bgcolor=#%s><td><b>spider rec not "
"assigned to us</b></td>"
"<td>%" PRId32"</td></tr>\n"
// spider recs set errors, parsing errors, etc.
"<tr bgcolor=#%s><td><b>spider rec errors</b></td>"
"<td>%" PRId64"</td></tr>\n"
// spider recs set errors, parsing errors, etc.
"<tr bgcolor=#%s><td><b>spider rec bad tld</b></td>"
"<td>%" PRId32"</td></tr>\n"
,
LIGHT_BLUE ,
ns2 ,
nr2 ,
ratio2 ,
LIGHT_BLUE ,
m_spiderRecNotAssigned ,
LIGHT_BLUE ,
errors2,
LIGHT_BLUE ,
m_spiderRecBadTLD
);
int32_t nsr;
Rdb **rdbs = getSecondaryRdbs ( &nsr );

@ -74,7 +74,7 @@ int32_t SpiderRequest::print(SafeBuf *sbarg) const {
SafeBuf tmp;
SafeBuf *sb = sbarg ? sbarg : &tmp;
sb->safePrintf("k=%s ", KEYSTR( this, getKeySizeFromRdbId( RDB_SPIDERDB ) ) );
sb->safePrintf("k=%s ", KEYSTR( this, getKeySizeFromRdbId( RDB_SPIDERDB_SQLITE ) ) );
// indicate it's a request not a reply
sb->safePrintf("REQ ");
@ -128,7 +128,7 @@ int32_t SpiderRequest::print(SafeBuf *sbarg) const {
if ( m_avoidSpiderLinks ) sb->safePrintf("AVOIDSPIDERLINKS ");
int32_t shardNum = g_hostdb.getShardNum( RDB_SPIDERDB, this );
int32_t shardNum = g_hostdb.getShardNum( RDB_SPIDERDB_SQLITE, this );
sb->safePrintf("shardnum=%" PRIu32" ",(uint32_t)shardNum);
sb->safePrintf("url=%s",m_url);
@ -999,7 +999,7 @@ bool updateSiteListBuf ( collnum_t collnum ,
// use spidercoll to contain this msg4 but if in use it
// won't be able to be deleted until it comes back..
if(!sc->m_msg4x.addMetaList(spiderReqBuf, sc->m_collnum, spiderReqBuf, doneAddingSeedsWrapper, RDB_SPIDERDB))
if(!sc->m_msg4x.addMetaList(spiderReqBuf, sc->m_collnum, spiderReqBuf, doneAddingSeedsWrapper, RDB_SPIDERDB_DEPRECATED))
return false;
else {
delete spiderReqBuf;

@ -362,7 +362,8 @@ public:
// init the rebuild/secondary rdb, used by PageRepair.cpp
bool init2 ( int32_t treeMem );
Rdb *getRdb ( ) { return &m_rdb; }
// Rdb *getRdb ( ) { return &m_rdb; }
Rdb *getRdb_deprecated() { return &m_rdb; }
static int64_t getUrlHash48(const key128_t *k ) {
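//i.e. uh48 = (low 32 bits of k->n1) << 16 | (top 16 bits of k->n0), masked to 48 bits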
return (((k->n1)<<16) | k->n0>>(64-16)) & 0xffffffffffffLL;

@ -13,6 +13,7 @@
#include "ip.h"
#include "Conf.h"
#include "Mem.h"
#include "SpiderdbRdbSqliteBridge.h"
#include "ScopedLock.h"
#include "Sanity.h"
@ -1188,25 +1189,17 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
// make state
//int32_t state2 = (int32_t)m_cr->m_collnum;
// read the list from local disk
if ( !m_msg5b.getList(RDB_SPIDERDB,
m_cr->m_collnum,
&m_waitingTreeList,
&m_waitingTreeNextKey,
KEYMAX(),
SR_READ_SIZE, // minRecSizes (512k)
true, // includeTree
0, // startFileNum
-1, // numFiles (all)
this,//(void *)state2,//this//state
gotSpiderdbWaitingTreeListWrapper,
MAX_NICENESS, // niceness
true, // do error correct?
-1, // maxRetries
false)) // isRealMerge
if(!SpiderdbRdbSqliteBridge::getList(m_cr->m_collnum,
&m_waitingTreeList,
m_waitingTreeNextKey,
*(const key128_t*)KEYMAX(),
SR_READ_SIZE))
{
// return if blocked
logTrace( g_conf.m_logTraceSpider, "END, msg5b.getList blocked" );
return;
if(!g_errno) {
g_errno = EIO; //imprecise
logTrace( g_conf.m_logTraceSpider, "END, got io-error from sqlite" );
return;
}
}
}
@ -2014,27 +2007,18 @@ bool SpiderColl::readListFromSpiderdb ( ) {
// end up timing out the round. so try checking for
// m_gettingList in spiderDoledUrls() and setting
// m_lastSpiderCouldLaunch
if ( ! m_msg5.getList ( RDB_SPIDERDB ,
m_cr->m_collnum ,
&m_list ,
&m_nextKey ,
&m_endKey ,
SR_READ_SIZE , // minRecSizes (512k)
true , // includeTree
0 , // startFileNum
-1 , // numFiles (all)
this,//(void *)state2,//this,//state
gotSpiderdbListWrapper ,
MAX_NICENESS , // niceness
true, // do error correct?
-1, // maxRetries
false)) // isRealMerge
if(!SpiderdbRdbSqliteBridge::getList(m_cr->m_collnum,
&m_list,
m_nextKey,
m_endKey,
SR_READ_SIZE))
{
// return false if blocked
logTrace( g_conf.m_logTraceSpider, "END, msg5.getList blocked" );
return false ;
if(!g_errno)
g_errno = EIO; //imprecise
logTrace( g_conf.m_logTraceSpider, "END, got io-error from sqlite" );
return true;
}
// note its return
logDebug( g_conf.m_logDebugSpider, "spider: back from msg5 spiderdb read of %" PRId32" bytes",m_list.getListSize());

@ -151,70 +151,70 @@ void SpiderdbHostDelete::reload(int /*fd*/, void */*state*/) {
}
void SpiderdbHostDelete::processFile(void *item) {
FileItem *fileItem = static_cast<FileItem*>(item);
log(LOG_INFO, "Processing %s", fileItem->m_tmpFilename);
g_urlHostBlackList.load(fileItem->m_tmpFilename, fileItem->m_matchHost);
CollectionRec *collRec = g_collectiondb.getRec("main");
if (!collRec) {
gbshutdownLogicError();
}
RdbBase *base = collRec->getBase(RDB_SPIDERDB);
Rdb *rdb = g_spiderdb.getRdb();
if (!fileItem->m_resume) {
// dump tree
rdb->submitRdbDumpJob(true);
{
ScopedLock sl(s_sleepMtx);
while (!s_stop && rdb->hasPendingRdbDumpJob()) {
timespec ts;
clock_gettime(CLOCK_REALTIME, &ts);
ts.tv_sec += 1;
pthread_cond_timedwait(&s_sleepCond, &s_sleepMtx.mtx, &ts);
}
if (s_stop) {
delete fileItem;
return;
}
}
}
// tight merge (only force merge all when not resuming)
if (!base->attemptMerge(0, !fileItem->m_resume)) {
// unable to start merge
g_urlHostBlackList.unload();
delete fileItem;
return;
}
{
ScopedLock sl(s_sleepMtx);
while (!s_stop && rdb->isMerging()) {
timespec ts;
clock_gettime(CLOCK_REALTIME, &ts);
ts.tv_sec += 60;
pthread_cond_timedwait(&s_sleepCond, &s_sleepMtx.mtx, &ts);
}
if (s_stop) {
delete fileItem;
return;
}
}
log(LOG_INFO, "Processed %s", fileItem->m_tmpFilename);
g_urlHostBlackList.unload();
// delete files
unlink(fileItem->m_tmpFilename);
delete fileItem;
}
// FileItem *fileItem = static_cast<FileItem*>(item);
//
// log(LOG_INFO, "Processing %s", fileItem->m_tmpFilename);
//
// g_urlHostBlackList.load(fileItem->m_tmpFilename, fileItem->m_matchHost);
//
// CollectionRec *collRec = g_collectiondb.getRec("main");
// if (!collRec) {
// gbshutdownLogicError();
// }
// RdbBase *base = collRec->getBase(RDB_SPIDERDB);
// Rdb *rdb = g_spiderdb.getRdb();
//
// if (!fileItem->m_resume) {
// // dump tree
// rdb->submitRdbDumpJob(true);
//
// {
// ScopedLock sl(s_sleepMtx);
// while (!s_stop && rdb->hasPendingRdbDumpJob()) {
// timespec ts;
// clock_gettime(CLOCK_REALTIME, &ts);
// ts.tv_sec += 1;
//
// pthread_cond_timedwait(&s_sleepCond, &s_sleepMtx.mtx, &ts);
// }
//
// if (s_stop) {
// delete fileItem;
// return;
// }
// }
// }
//
// // tight merge (only force merge all when not resuming)
// if (!base->attemptMerge(0, !fileItem->m_resume)) {
// // unable to start merge
// g_urlHostBlackList.unload();
// delete fileItem;
// return;
// }
//
// {
// ScopedLock sl(s_sleepMtx);
// while (!s_stop && rdb->isMerging()) {
// timespec ts;
// clock_gettime(CLOCK_REALTIME, &ts);
// ts.tv_sec += 60;
//
// pthread_cond_timedwait(&s_sleepCond, &s_sleepMtx.mtx, &ts);
// }
//
// if (s_stop) {
// delete fileItem;
// return;
// }
// }
//
// log(LOG_INFO, "Processed %s", fileItem->m_tmpFilename);
//
// g_urlHostBlackList.unload();
//
// // delete files
// unlink(fileItem->m_tmpFilename);
//
// delete fileItem;
}

SpiderdbRdbSqliteBridge.cpp (new file, 466 lines)

@ -0,0 +1,466 @@
#include "SpiderdbRdbSqliteBridge.h"
#include "Spider.h"
#include "SpiderdbSqlite.h"
#include "types.h"
#include "Sanity.h"
#include "Log.h"
#include "IOBuffer.h"
#include "Mem.h"
#include "Conf.h"
static bool addRequestRecord(sqlite3 *db, const void *record, size_t record_len);
static bool addReplyRecord(sqlite3 *db, const void *record, size_t record_len);
bool SpiderdbRdbSqliteBridge::addRecord(collnum_t collnum, const void *record, size_t record_len) {
if(KEYNEG((const char*)record)) {
log(LOG_ERROR,"sqlitespider: Got negative spiderrecord");
gbshutdownCorrupted();
}
sqlite3 *db = g_spiderdb_sqlite.getOrCreateDb(collnum);
if(!db) {
log(LOG_ERROR,"sqlitespider: Could not get sqlite db for collection %d", collnum);
return false;
}
if(Spiderdb::isSpiderRequest(reinterpret_cast<const key128_t *>(record)))
return addRequestRecord(db,record,record_len);
else
return addReplyRecord(db,record,record_len);
}
static bool addRequestRecord(sqlite3 *db, const void *record, size_t record_len) {
if(record_len<(unsigned)SpiderRequest::getNeededSize(0)) {
log(LOG_ERROR,"sqlitespider: Got spiderrequest with record_len=%zu and SpiderRequest::getNeededSize(0)=%d", record_len, SpiderRequest::getNeededSize(0));
gbshutdownCorrupted();
}
//last byte should be the terminating NUL in m_url
if(reinterpret_cast<const char*>(record)[record_len-1] != '\0') {
log(LOG_ERROR,"sqlitespider: Got spiderrequest where last byte was not ascii-nul");
gbshutdownCorrupted();
}
const SpiderRequest *sreq = reinterpret_cast<const SpiderRequest*>(record);
int32_t firstIp = Spiderdb::getFirstIp(&sreq->m_key);
int64_t uh48 = Spiderdb::getUrlHash48(&sreq->m_key);
//Create or update record. Possible strategies:
// insert-then-detect-unique-key-violation-and-update
// select-then-insert-or-update
//We go for select-then-insert-or-update
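//A single-statement alternative on SQLite >= 3.24 would be an upsert, sketched
//here for comparison only (column list abbreviated; assumes a unique index on
//(m_firstIp,m_uh48)):
//  INSERT INTO spiderdb (m_firstIp, m_uh48, ...) VALUES (?,?,...)
//  ON CONFLICT(m_firstIp,m_uh48) DO UPDATE SET
//    m_siteNumInlinks = MAX(m_siteNumInlinks, excluded.m_siteNumInlinks), ...;
//select-then-insert-or-update keeps us independent of the sqlite3 version.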
const char *pzTail="";
sqlite3_stmt *selectStatement = NULL;
if(sqlite3_prepare_v2(db, "select 1 from spiderdb where m_firstIp=? and m_uh48=?", -1, &selectStatement, &pzTail) != SQLITE_OK) {
log(LOG_ERROR,"sqlitespider: Statement preparation error %s at or near %s",sqlite3_errmsg(db),pzTail);
return false;
}
sqlite3_bind_int(selectStatement, 1, firstIp);
sqlite3_bind_int64(selectStatement, 2, uh48);
int select_rc = sqlite3_step(selectStatement);
if(select_rc==SQLITE_DONE) {
//statement is finished - so the record currently doesn't exist
static const char insert_statement[] =
"INSERT INTO spiderdb (m_firstIp, m_uh48, m_hostHash32, m_domHash32, m_siteHash32,"
" m_siteNumInlinks, m_pageNumInlinks, m_addedTime, m_discoveryTime, m_contentHash32,"
" m_requestFlags, m_priority, m_errCount, m_sameErrCount, m_url)"
"VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)";
sqlite3_stmt *insertStatement = NULL;
if(sqlite3_prepare_v2(db, insert_statement, -1, &insertStatement, &pzTail) != SQLITE_OK) {
log(LOG_ERROR,"sqlitespider: Statement preparation error %s at or near %s",sqlite3_errmsg(db),pzTail);
sqlite3_finalize(selectStatement);
return false;
}
sqlite3_bind_int(insertStatement, 1, firstIp);
sqlite3_bind_int64(insertStatement, 2, uh48);
sqlite3_bind_int(insertStatement, 3, sreq->m_hostHash32);
sqlite3_bind_int(insertStatement, 4, sreq->m_domHash32);
sqlite3_bind_int(insertStatement, 5, sreq->m_siteHash32);
sqlite3_bind_int(insertStatement, 6, sreq->m_siteNumInlinks);
sqlite3_bind_int(insertStatement, 7, sreq->m_pageNumInlinks);
sqlite3_bind_int(insertStatement, 8, sreq->m_addedTime);
sqlite3_bind_int(insertStatement, 9, sreq->m_discoveryTime);
if(sreq->m_contentHash32!=0)
sqlite3_bind_int(insertStatement, 10, sreq->m_contentHash32);
else
sqlite3_bind_null(insertStatement, 10);
int32_t rqf = (sreq->m_recycleContent ? (1<<0) : 0) |
(sreq->m_isAddUrl ? (1<<1) : 0) |
(sreq->m_isPageReindex ? (1<<2) : 0) |
(sreq->m_isUrlCanonical ? (1<<3) : 0) |
(sreq->m_isPageParser ? (1<<4) : 0) |
(sreq->m_urlIsDocId ? (1<<5) : 0) |
(sreq->m_isRSSExt ? (1<<6) : 0) |
(sreq->m_isUrlPermalinkFormat ? (1<<7) : 0) |
(sreq->m_forceDelete ? (1<<8) : 0) |
(sreq->m_isInjecting ? (1<<9) : 0) |
(sreq->m_hadReply ? (1<<10) : 0) |
(sreq->m_fakeFirstIp ? (1<<11) : 0) |
(sreq->m_hasAuthorityInlink ? (1<<12) : 0) |
(sreq->m_avoidSpiderLinks ? (1<<13) : 0);
sqlite3_bind_int(insertStatement, 11, rqf);
if(sreq->m_priority>=0)
sqlite3_bind_int(insertStatement, 12, sreq->m_priority);
else
sqlite3_bind_null(insertStatement, 12);
sqlite3_bind_int(insertStatement, 13, sreq->m_errCount);
sqlite3_bind_int(insertStatement, 14, sreq->m_sameErrCount);
sqlite3_bind_text(insertStatement, 15, sreq->m_url,-1,SQLITE_TRANSIENT);
if(sqlite3_step(insertStatement) != SQLITE_DONE) {
log(LOG_ERROR,"sqlitespider: Insert error: %s",sqlite3_errmsg(db));
sqlite3_finalize(insertStatement);
sqlite3_finalize(selectStatement);
return false;
}
sqlite3_finalize(insertStatement);
sqlite3_finalize(selectStatement);
return true;
} else if(select_rc==SQLITE_ROW) {
//at least one result, so the record must already be there
static const char update_statement[] =
"UPDATE spiderdb"
" SET m_siteNumInlinks=MAX(m_siteNumInlinks,?),"
" m_pageNumInlinks=MAX(m_pageNumInlinks,?),"
" m_addedTime=MIN(m_addedTime,?),"
" m_discoveryTime=MIN(m_discoveryTime,?),"
" m_priority=MAX(m_priority,?)"
" WHERE m_firstIp=? AND m_uh48=?";
sqlite3_stmt *updateStatement = NULL;
if(sqlite3_prepare_v2(db, update_statement, -1, &updateStatement, &pzTail) != SQLITE_OK) {
log(LOG_ERROR,"sqlitespider: Statement preparation error %s at or near %s",sqlite3_errmsg(db),pzTail);
sqlite3_finalize(selectStatement);
return false;
}
sqlite3_bind_int(updateStatement, 1, sreq->m_siteNumInlinks);
sqlite3_bind_int(updateStatement, 2, sreq->m_pageNumInlinks);
sqlite3_bind_int(updateStatement, 3, sreq->m_addedTime);
sqlite3_bind_int(updateStatement, 4, sreq->m_discoveryTime);
sqlite3_bind_int(updateStatement, 5, sreq->m_priority);
sqlite3_bind_int(updateStatement, 6, firstIp);
sqlite3_bind_int64(updateStatement, 7, uh48);
if(sqlite3_step(updateStatement) != SQLITE_DONE) {
log(LOG_ERROR,"sqlitespider: Update error: %s",sqlite3_errmsg(db));
sqlite3_finalize(updateStatement);
sqlite3_finalize(selectStatement);
return false;
}
sqlite3_finalize(updateStatement);
sqlite3_finalize(selectStatement);
return true;
} else {
log(LOG_WARN,"sqlitespider: sqlite3_step(...select...) failed with %s", sqlite3_errmsg(db));
sqlite3_finalize(selectStatement);
return false;
}
}
static bool addReplyRecord(sqlite3 *db, const void *record, size_t record_len) {
if(record_len!=sizeof(SpiderReply)) {
log(LOG_ERROR,"sqlitespider: Got spiderreply with record_len=%zu and sizeof(SpiderReply)=%zu", record_len, sizeof(SpiderReply));
gbshutdownCorrupted();
}
//assumption: the record is already there
const SpiderReply *srep = reinterpret_cast<const SpiderReply*>(record);
int32_t firstIp = Spiderdb::getFirstIp(&srep->m_key);
int64_t uh48 = Spiderdb::getUrlHash48(&srep->m_key);
const char *pzTail="";
if(srep->m_errCode==0) {
static const char update_statement[] =
"UPDATE spiderdb"
" SET m_percentChangedPerDay = ?,"
" m_spideredTime = ?,"
" m_errCode = ?,"
" m_httpStatus = ?,"
" m_langId = ?,"
" m_replyFlags = ?,"
" m_errCount = 0,"
" m_sameErrCount = 0,"
" m_contentHash32 = ?"
" WHERE m_firstIp=? and m_uh48=?";
sqlite3_stmt *updateStatement = NULL;
if(sqlite3_prepare_v2(db, update_statement, -1, &updateStatement, &pzTail) != SQLITE_OK) {
log(LOG_ERROR,"sqlitespider: Statement preparation error %s at or near %s",sqlite3_errmsg(db),pzTail);
return false;
}
sqlite3_bind_double(updateStatement, 1, srep->m_percentChangedPerDay);
sqlite3_bind_int(updateStatement, 2, srep->m_spideredTime);
sqlite3_bind_int(updateStatement, 3, srep->m_errCode);
sqlite3_bind_int(updateStatement, 4, srep->m_httpStatus);
sqlite3_bind_int(updateStatement, 5, srep->m_langId);
int32_t rpf = (srep->m_isRSS ? (1<<0) : 0) |
(srep->m_isPermalink ? (1<<1) : 0) |
(srep->m_isIndexed ? (1<<2) : 0) |
(srep->m_hasAuthorityInlink ? (1<<3) : 0) |
(srep->m_fromInjectionRequest ? (1<<4) : 0);
sqlite3_bind_int(updateStatement, 6, rpf);
sqlite3_bind_int(updateStatement, 7, srep->m_contentHash32);
sqlite3_bind_int(updateStatement, 8, firstIp);
sqlite3_bind_int64(updateStatement, 9, uh48);
if(sqlite3_step(updateStatement) != SQLITE_DONE) {
log(LOG_ERROR,"sqlitespider: Update error: %s",sqlite3_errmsg(db));
sqlite3_finalize(updateStatement);
return false;
}
sqlite3_finalize(updateStatement);
return true;
} else {
static const char update_statement[] =
"UPDATE spiderdb"
" SET m_spideredTime = ?,"
" m_errCode = ?,"
" m_httpStatus = ?,"
" m_errCount = m_errCount + 1,"
" m_sameErrCount = CASE WHEN m_errCode=? THEN IFNULL(m_sameErrCount,0) + 1 ELSE 0 END"
" WHERE m_firstIp=? and m_uh48=?";
sqlite3_stmt *updateStatement = NULL;
if(sqlite3_prepare_v2(db, update_statement, -1, &updateStatement, &pzTail) != SQLITE_OK) {
log(LOG_ERROR,"sqlitespider: Statement preparation error %s at or near %s",sqlite3_errmsg(db),pzTail);
return false;
}
sqlite3_bind_int(updateStatement, 1, srep->m_spideredTime);
sqlite3_bind_int(updateStatement, 2, srep->m_errCode);
sqlite3_bind_int(updateStatement, 3, srep->m_httpStatus);
sqlite3_bind_int(updateStatement, 4, srep->m_errCode);
sqlite3_bind_int(updateStatement, 5, firstIp);
sqlite3_bind_int64(updateStatement, 6, uh48);
if(sqlite3_step(updateStatement) != SQLITE_DONE) {
log(LOG_ERROR,"sqlitespider: Update error: %s",sqlite3_errmsg(db));
sqlite3_finalize(updateStatement);
return false;
}
sqlite3_finalize(updateStatement);
return true;
}
}
bool SpiderdbRdbSqliteBridge::getList(collnum_t collnum,
RdbList *list,
const key128_t &startKey,
const key128_t &endKey,
int32_t minRecSizes)
{
sqlite3 *db = g_conf.m_readOnlyMode ? g_spiderdb_sqlite.getDb(collnum) : g_spiderdb_sqlite.getOrCreateDb(collnum);
if(!db) {
log(LOG_ERROR,"sqlitespider: Could not get sqlite db for collection %d", collnum);
g_errno = ENOCOLLREC;
return false;
}
int32_t firstIpStart = Spiderdb::getFirstIp(&startKey);
int32_t firstIpEnd = Spiderdb::getFirstIp(&endKey);
int64_t uh48Start = Spiderdb::getUrlHash48(&startKey);
int64_t uh48End = Spiderdb::getUrlHash48(&endKey);
bool breakMidIPAddressAllowed;
const char *pzTail="";
sqlite3_stmt *stmt;
if(firstIpStart==firstIpEnd) {
breakMidIPAddressAllowed = true;
static const char statement_text[] =
"SELECT m_firstIp, m_uh48, m_hostHash32, m_domHash32, m_siteHash32,"
" m_siteNumInlinks, m_pageNumInlinks, m_addedTime, m_discoveryTime, m_contentHash32,"
" m_requestFlags, m_priority, m_errCount, m_sameErrCount, m_url,"
" m_percentChangedPerDay, m_spideredTime, m_errCode, m_httpStatus, m_langId,"
" m_replyFlags"
" FROM spiderdb"
" WHERE m_firstIp=? and m_uh48>=? and m_uh48<=?"
" ORDER BY m_firstIp, m_uh48";
if(sqlite3_prepare_v2(db, statement_text, -1, &stmt, &pzTail) != SQLITE_OK) {
log(LOG_ERROR,"sqlitespider: Statement preparation error %s at or near %s",sqlite3_errmsg(db),pzTail);
g_errno = EBADENGINEER;
return false;
}
sqlite3_bind_int64(stmt, 1, (uint32_t)firstIpStart);
sqlite3_bind_int64(stmt, 2, uh48Start);
sqlite3_bind_int64(stmt, 3, uh48End);
} else {
if(uh48Start!=0) {
log(LOG_ERROR, " SpiderdbRdbSqliteBridge::getList(): startip!=endip, and uh48Start!=0");
gbshutdownLogicError();
}
breakMidIPAddressAllowed = false;
static const char statement_text[] =
"SELECT m_firstIp, m_uh48, m_hostHash32, m_domHash32, m_siteHash32,"
" m_siteNumInlinks, m_pageNumInlinks, m_addedTime, m_discoveryTime, m_contentHash32,"
" m_requestFlags, m_priority, m_errCount, m_sameErrCount, m_url,"
" m_percentChangedPerDay, m_spideredTime, m_errCode, m_httpStatus, m_langId,"
" m_replyFlags"
" FROM spiderdb"
" WHERE m_firstIp>=? and m_firstIp<=?"
" ORDER BY m_firstIp, m_uh48";
if(sqlite3_prepare_v2(db, statement_text, -1, &stmt, &pzTail) != SQLITE_OK) {
log(LOG_ERROR,"sqlitespider: Statement preparation error %s at or near %s",sqlite3_errmsg(db),pzTail);
g_errno = EBADENGINEER;
return false;
}
sqlite3_bind_int64(stmt, 1, (uint32_t)firstIpStart);
sqlite3_bind_int64(stmt, 2, (uint32_t)firstIpEnd);
}
key128_t listLastKey;
IOBuffer io_buffer;
int rc;
while((rc=sqlite3_step(stmt))==SQLITE_ROW) {
//fetch all columns (sqlite3_column_* indices are 0-based, unlike sqlite3_bind_*). null checks are done later
int32_t firstIp = sqlite3_column_int(stmt, 0);
int64_t uh48 = sqlite3_column_int64(stmt, 1);
int32_t hosthash32 = sqlite3_column_int(stmt, 2);
int32_t domHash32 = sqlite3_column_int(stmt, 3);
int32_t siteHash32 = sqlite3_column_int(stmt, 4);
int32_t siteNumInlinks = sqlite3_column_int(stmt, 5);
int32_t pageNumInlinks = sqlite3_column_int(stmt, 6);
int32_t addedTime = sqlite3_column_int(stmt, 7);
int32_t discoveryTime = sqlite3_column_int(stmt, 8);
int32_t contentHash32 = sqlite3_column_int(stmt, 9);
int32_t requestFlags = sqlite3_column_int(stmt, 10);
int32_t priority = sqlite3_column_int(stmt, 11);
int32_t errCount = sqlite3_column_int(stmt, 12);
int32_t sameErrCount = sqlite3_column_int(stmt, 13);
const unsigned char *url = sqlite3_column_text(stmt, 14);
double percentChangedPerDay = sqlite3_column_double(stmt, 15);
int32_t spideredTime = sqlite3_column_int(stmt, 16);
int32_t errCode = sqlite3_column_int(stmt, 17);
int32_t httpStatus = sqlite3_column_int(stmt, 18);
int32_t langId = sqlite3_column_int(stmt, 19);
int32_t replyFlags = sqlite3_column_int(stmt, 20);
if(breakMidIPAddressAllowed) {
if(minRecSizes>0 && io_buffer.used() >= (size_t)minRecSizes)
break;
}
//when the range spans multiple IPs we must not break mid-IP, so read until the select is exhausted
if(sqlite3_column_type(stmt,20)!=SQLITE_NULL) {
//replyflags are non-null so there must be a reply
SpiderReply srep;
srep.reset();
srep.m_key = Spiderdb::makeKey(firstIp,uh48,false,0,false);
srep.m_dataSize = sizeof(srep) - sizeof(srep.m_key) - sizeof(srep.m_dataSize);
srep.m_firstIp = firstIp;
srep.m_siteHash32 = siteHash32;
srep.m_domHash32 = domHash32;
srep.m_percentChangedPerDay = percentChangedPerDay;
srep.m_spideredTime = spideredTime;
srep.m_errCode = errCode;
srep.m_siteNumInlinks = siteNumInlinks;
srep.m_sameErrCount = sameErrCount;
srep.m_contentHash32 = contentHash32;
srep.m_crawlDelayMS = 1; //probably only used in-memory.
srep.m_downloadEndTime = 0; //probably only used in-memory.
srep.m_httpStatus = httpStatus;
srep.m_errCount = errCount;
srep.m_langId = langId;
srep.m_isRSS = (replyFlags&(1<<0))!=0;
srep.m_isPermalink = (replyFlags&(1<<1))!=0;
srep.m_isIndexed = (replyFlags&(1<<2))!=0;
srep.m_hasAuthorityInlink = (replyFlags&(1<<3))!=0;
srep.m_fromInjectionRequest = (replyFlags&(1<<4))!=0;
srep.m_isIndexedINValid = (replyFlags&(1<<5))!=0;
srep.m_hasAuthorityInlinkValid = (requestFlags&(1<<15))!=0; //TODO: no writer currently sets bit 15 of m_requestFlags
srep.m_siteNumInlinksValid = sqlite3_column_type(stmt,5)!=SQLITE_NULL;
io_buffer.reserve_extra(sizeof(srep));
memcpy(io_buffer.end(), &srep, sizeof(srep));
io_buffer.push_back(sizeof(srep));
} else
replyFlags = 0;
SpiderRequest sreq;
sreq.reset();
sreq.m_key = Spiderdb::makeKey(firstIp,uh48,false,0,false);
//sreq.m_dataSize
sreq.m_firstIp = firstIp;
sreq.m_hostHash32 = hosthash32;
sreq.m_domHash32 = domHash32;
sreq.m_siteHash32 = siteHash32;
sreq.m_siteNumInlinks = siteNumInlinks;
sreq.m_addedTime = addedTime;
sreq.m_pageNumInlinks = pageNumInlinks;
sreq.m_sameErrCount = sameErrCount;
sreq.m_discoveryTime = discoveryTime;
sreq.m_prevErrCode = 0; //done differently now.
sreq.m_contentHash32 = contentHash32;
sreq.m_hopCount = 0;
sreq.m_hopCountValid = 0;
sreq.m_isAddUrl = (requestFlags&(1<<1))!=0;
sreq.m_isPageReindex = (requestFlags&(1<<2))!=0;
sreq.m_isUrlCanonical = (requestFlags&(1<<3))!=0;
sreq.m_isPageParser = (requestFlags&(1<<4))!=0;
sreq.m_urlIsDocId = (requestFlags&(1<<5))!=0;
sreq.m_isRSSExt = (requestFlags&(1<<6))!=0;
sreq.m_isUrlPermalinkFormat = (requestFlags&(1<<7))!=0;
sreq.m_recycleContent = (requestFlags&(1<<0))!=0;
sreq.m_forceDelete = (requestFlags&(1<<8))!=0;
sreq.m_isInjecting = (requestFlags&(1<<9))!=0;
sreq.m_hadReply = (requestFlags&(1<<10))!=0;
sreq.m_fakeFirstIp = (requestFlags&(1<<11))!=0;
sreq.m_hasAuthorityInlink = (requestFlags&(1<<12))!=0;
sreq.m_hasAuthorityInlinkValid = (requestFlags&(1<<13))!=0;
sreq.m_siteNumInlinksValid = sqlite3_column_type(stmt,6)!=SQLITE_NULL;
sreq.m_avoidSpiderLinks = (requestFlags&(1<<14))!=0;
sreq.m_ufn = 0; //only used in-memory
sreq.m_priority = priority;
sreq.m_errCount = errCount;
strcpy(sreq.m_url,(const char*)url);
sreq.setDataSize();
io_buffer.reserve_extra(sreq.getRecSize());
memcpy(io_buffer.end(), &sreq, sreq.getRecSize());
io_buffer.push_back(sreq.getRecSize());
listLastKey = sreq.m_key;
}
if(rc!=SQLITE_DONE && rc!=SQLITE_ROW) {
log(LOG_ERROR,"sqlitespider: Fetch error: %s",sqlite3_errmsg(db));
sqlite3_finalize(stmt);
g_errno = EBADENGINEER; //TODO
return false;
}
sqlite3_finalize(stmt);
int32_t listSize = io_buffer.used();
char *listMemory;
if(listSize>0) {
listMemory = (char*)mmalloc(listSize, "sqliterdblist");
if(!listMemory) {
log(LOG_ERROR,"sqlitespider: OOM allocating spiderdb rdblist (%d bytes)", listSize);
return false;
}
memcpy(listMemory, io_buffer.begin(), io_buffer.used());
} else
listMemory = NULL;
key128_t listFirstKey = Spiderdb::makeFirstKey(firstIpStart, uh48Start);
if(rc==SQLITE_ROW) {
//early break
} else {
//select exhaustion, so jump to last specified key
listLastKey = Spiderdb::makeFirstKey(firstIpEnd, uh48End);
}
list->set(listMemory, listSize,
listMemory, listSize,
(const char*)&listFirstKey, (const char*)&listLastKey,
-1, //datasize(variable)
true, //owndata
false, //halfkeys
sizeof(key128_t)); //keysize
return true;
}
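The flag words above are the contract between the converter and this bridge: convertSpiderDb packs them with explicit shifts and getList() unpacks them with the same shifts. A minimal, self-contained round-trip sketch (illustrative only, not part of the commit; bit positions copied from the converter):

// roundtrip_flags_example.cpp - illustrative only
#include <cassert>
#include <cstdint>

int main() {
	// pack request flags the way convertSpiderDb does
	uint32_t requestFlags = (1u<<0)     // m_recycleContent
	                      | (1u<<10)    // m_hadReply
	                      | (1u<<14);   // m_avoidSpiderLinks
	// unpack the way getList() does above
	bool recycleContent   = (requestFlags&(1u<<0))!=0;
	bool hadReply         = (requestFlags&(1u<<10))!=0;
	bool avoidSpiderLinks = (requestFlags&(1u<<14))!=0;
	assert(recycleContent && hadReply && avoidSpiderLinks);
	assert((requestFlags&(1u<<11))==0);  // m_fakeFirstIp was never set
	return 0;
}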

26
SpiderdbRdbSqliteBridge.h Normal file

@ -0,0 +1,26 @@
#ifndef SPIDERDB_RDB_SQLITE_BRIDGE_H_
#define SPIDERDB_RDB_SQLITE_BRIDGE_H_
#include "collnum_t.h"
#include <stddef.h>
class RdbList;
class u_int128_t;
//Helper functions for bridging the old Rdb-style spiderdb records to the new sqlite-based database
namespace SpiderdbRdbSqliteBridge {
//Add a record (request or reply) to spiderdb. Returns false if something fails
bool addRecord(collnum_t collnum, const void *record, size_t record_len);
//Fetch all records, or the subset of records with startKey<=key<=endKey, and try to limit the rdblist size to minRecSizes
//Returns false on error
bool getList(collnum_t collnum,
RdbList *list,
const u_int128_t &startKey,
const u_int128_t &endKey,
int32_t minRecSizes);
}
#endif
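For orientation, a hypothetical caller of the bridge might look like the sketch below. It assumes the existing Spiderdb::makeFirstKey/makeLastKey helpers and the usual RdbList iteration methods; the function name and size budget are illustrative, not part of the commit.

// hypothetical caller - walk one IP's spiderdb records via the bridge
#include "SpiderdbRdbSqliteBridge.h"
#include "RdbList.h"
#include "Spider.h"
#include "Log.h"

static bool dumpOneIp(collnum_t collnum, int32_t firstIp) {
	RdbList list;
	key128_t startKey = Spiderdb::makeFirstKey(firstIp);
	key128_t endKey   = Spiderdb::makeLastKey(firstIp);
	if(!SpiderdbRdbSqliteBridge::getList(collnum, &list, startKey, endKey, 1000000))
		return false; //g_errno has been set
	for(list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord()) {
		char *rec = list.getCurrentRec();
		if(Spiderdb::isSpiderRequest((key128_t*)rec))
			log("spider: request url=%s", ((SpiderRequest*)rec)->m_url);
	}
	return true;
}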

127
SpiderdbSqlite.cpp Normal file

@ -0,0 +1,127 @@
#include "SpiderdbSqlite.h"
#include "ScopedLock.h"
#include "Hostdb.h"
#include "Collectiondb.h"
#include "Conf.h"
#include "Log.h"
#include <stddef.h>
#include <unistd.h>	//access(), unlink()
static sqlite3 *createDb(const char *sqlitedbName);
SpiderdbSqlite g_spiderdb_sqlite(RDB_SPIDERDB_SQLITE);
SpiderdbSqlite g_spiderdb_sqlite2(RDB2_SPIDERDB2_SQLITE);
void SpiderdbSqlite::finalize() {
ScopedLock sl(mtx);
for(auto e : dbs)
sqlite3_close(e.second);
dbs.clear();
}
sqlite3 *SpiderdbSqlite::getDb(collnum_t collnum) {
ScopedLock sl(mtx);
auto iter = dbs.find(collnum);
if(iter!=dbs.end())
return iter->second;
else
return NULL;
}
sqlite3 *SpiderdbSqlite::getOrCreateDb(collnum_t collnum) {
ScopedLock sl(mtx);
auto iter = dbs.find(collnum);
if(iter!=dbs.end())
return iter->second;
//not found, open or create it
const auto cr = g_collectiondb.getRec(collnum);
char collectionDirName[1024];
sprintf(collectionDirName, "%scoll.%s.%d", g_hostdb.m_dir, cr->m_coll, (int)collnum);
char sqlitedbName[1024];
if(rdbid==RDB_SPIDERDB_SQLITE)
sprintf(sqlitedbName, "%s/spiderdb.sqlite3", collectionDirName);
else
sprintf(sqlitedbName, "%s/spiderdbRebuild.sqlite3", collectionDirName);
sqlite3 *db = createDb(sqlitedbName);
dbs[collnum] = db;
return db;
}
static const char create_table_statement[] =
"CREATE TABLE spiderdb ("
" m_firstIp INT NOT NULL,"
" m_uh48 INT NOT NULL,"
" m_hostHash32 INT NOT NULL,"
" m_domHash32 INT NOT NULL,"
" m_siteHash32 INT NOT NULL,"
" m_siteNumInlinks INT NOT NULL,"
" m_pageNumInlinks INT NOT NULL,"
" m_addedTime INT NOT NULL,"
" m_discoveryTime INT NOT NULL,"
" m_contentHash32 INT,"
" m_requestFlags INT NOT NULL,"
" m_priority INT,"
" m_errCount INT NOT NULL,"
" m_sameErrCount INT NOT NULL,"
" m_url TEXT NOT NULL,"
" m_percentChangedPerDay REAL,"
" m_spideredTime INT,"
" m_errCode INT,"
" m_httpStatus INT,"
" m_langId INT,"
" m_replyFlags INT,"
" PRIMARY KEY (m_firstIp,m_uh48)"
");"
;
static sqlite3 *createDb(const char *sqlitedbName) {
sqlite3 *db;
if(g_conf.m_readOnlyMode) {
//read-only, creation is not allowed
int rc = sqlite3_open_v2(sqlitedbName,&db,SQLITE_OPEN_READONLY,NULL);
if(rc!=SQLITE_OK) {
log(LOG_ERROR,"sqlite: Could not open %s: %s", sqlitedbName, sqlite3_errmsg(db));
sqlite3_close(db); //sqlite allocates a handle even on failure
return NULL;
}
return db;
}
//read-write, creation is allowed
if(access(sqlitedbName,F_OK)==0) {
int rc = sqlite3_open_v2(sqlitedbName,&db,SQLITE_OPEN_READWRITE,NULL);
if(rc!=SQLITE_OK) {
log(LOG_ERROR,"sqlite: Could not open %s: %s", sqlitedbName, sqlite3_errmsg(db));
sqlite3_close(db);
return NULL;
}
return db;
}
int rc = sqlite3_open_v2(sqlitedbName,&db,SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE,NULL);
if(rc!=SQLITE_OK) {
log(LOG_ERROR,"sqlite: Could not create %s: %s", sqlitedbName, sqlite3_errmsg(db));
sqlite3_close(db);
return NULL;
}
char *errmsg = NULL;
if(sqlite3_exec(db, create_table_statement, NULL, NULL, &errmsg) != SQLITE_OK) {
log(LOG_ERROR,"sqlite: %s",errmsg);
sqlite3_free(errmsg);
sqlite3_close(db);
unlink(sqlitedbName);
return NULL;
}
return db;
}
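As a quick sanity check against the schema above, a debugging helper along these lines can be pointed at a per-collection handle; the COUNT query and function name are illustrative only, not part of the commit:

// hypothetical debugging helper - count rows in a collection's spiderdb
#include "SpiderdbSqlite.h"
#include "Log.h"

static int64_t countSpiderdbRows(collnum_t collnum) {
	sqlite3 *db = g_spiderdb_sqlite.getOrCreateDb(collnum);
	if(!db)
		return -1;
	sqlite3_stmt *stmt = NULL;
	if(sqlite3_prepare_v2(db, "SELECT COUNT(*) FROM spiderdb", -1, &stmt, NULL) != SQLITE_OK) {
		log(LOG_ERROR,"sqlite: %s", sqlite3_errmsg(db));
		return -1;
	}
	int64_t count = -1;
	if(sqlite3_step(stmt)==SQLITE_ROW)
		count = sqlite3_column_int64(stmt, 0);
	sqlite3_finalize(stmt);
	return count;
}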

99
SpiderdbSqlite.h Normal file

@ -0,0 +1,99 @@
#ifndef SPIDERDB_SQLITE_H_
#define SPIDERDB_SQLITE_H_
#include "GbMutex.h"
#include "collnum_t.h"
#include "rdbid_t.h"
#include <inttypes.h>
#include <string>
#include <map>
#include "sqlite3.h"
class SpiderdbSqlite {
std::map<collnum_t,sqlite3*> dbs;
GbMutex mtx;
rdbid_t rdbid;
public:
SpiderdbSqlite(rdbid_t rdbid_) : dbs(), mtx(), rdbid(rdbid_) {}
~SpiderdbSqlite() { finalize(); }
SpiderdbSqlite(const SpiderdbSqlite&) = delete;
SpiderdbSqlite& operator=(const SpiderdbSqlite&) = delete;
void finalize(); //closes all DBs
sqlite3 *getDb(collnum_t collnum);
sqlite3 *getOrCreateDb(collnum_t collnum);
};
extern SpiderdbSqlite g_spiderdb_sqlite;
extern SpiderdbSqlite g_spiderdb_sqlite2;
//see Spider.h for field definitions/comments/caveats
struct RawSpiderdbRecord {
int32_t m_firstIp;
int64_t m_uh48; //48-bit url hash needs a 64-bit field
//Request fields:
int32_t m_hostHash32;
int32_t m_domHash32;
int32_t m_siteHash32;
int32_t m_siteNumInlinks;
int32_t m_pageNumInlinks;
int32_t m_addedTime;
int32_t m_discoveryTime;
int32_t m_contentHash32; //0 = unknown/invalid
union {
struct {
bool m_recycleContent:1;
bool m_isAddUrl:1;
bool m_isPageReindex:1;
bool m_isUrlCanonical:1;
bool m_isPageParser:1;
bool m_urlIsDocId:1;
bool m_isRSSExt:1;
bool m_isUrlPermalinkFormat:1;
bool m_forceDelete:1;
bool m_isInjecting:1;
bool m_hadReply:1;
bool m_fakeFirstIp:1;
bool m_hasAuthorityInlink:1;
bool m_hasAuthorityInlinkValid:1;
bool m_avoidSpiderLinks:1;	//bit 14 - must match the (1<<14) encoding in convertSpiderDb
bool m_siteNumInlinksValid:1;	//bit 15 - not persisted by the converter; derived from column NULL-ness
} requestFlags;
uint32_t u32_request;
};
int32_t m_priority;
bool m_priorityValid;
int32_t m_errCount;
bool m_errCountValid;
int32_t m_sameErrCount;
std::string m_url;
//Reply fields
float m_percentChangedPerDay;
bool m_percentChangedPerDayValid;
int32_t m_spideredTime;
bool m_spideredTimeValid;
int32_t m_errCode;
bool m_errCodeValid;
int32_t m_httpStatus;
bool m_httpStatusValid;
int32_t m_langId;
bool m_langIdValid;
union {
struct {
//bit positions must match the (1<<n) encoding in convertSpiderDb
bool m_isRSS:1;
bool m_isPermalink:1;
bool m_isIndexed:1;
bool m_hasAuthorityInlink:1;
bool m_fromInjectionRequest:1;
bool m_isIndexedINValid:1;
bool m_hasAuthorityInlinkValid:1;
} replyFlags;
uint32_t u32_reply;
};
};
#endif
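The unions above expose the same 32 bits both as named flags and as a raw integer for persisting. C++ leaves bitfield layout to the compiler (GCC on little-endian platforms assigns the first member to bit 0), so a cheap startup self-test like the sketch below can catch a mismatch with the converter's explicit shifts; it is illustrative, not part of the commit:

// illustrative layout self-test (relies on GCC-style union type punning)
#include <cassert>
#include "SpiderdbSqlite.h"

static void checkRequestFlagLayout() {
	RawSpiderdbRecord r;
	r.u32_request = 0;
	r.requestFlags.m_recycleContent = true;    // convertSpiderDb writes this at bit 0
	assert(r.u32_request == (1u<<0));
	r.u32_request = 0;
	r.requestFlags.m_avoidSpiderLinks = true;  // convertSpiderDb writes this at bit 14
	assert(r.u32_request == (1u<<14));
}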

@ -404,7 +404,7 @@ static RdbCacheHistory rdb_cache_history[] = {
{RDB_TAGDB, "tagdb", 0,0},
{RDB_CLUSTERDB,"clusterdb",0,0},
{RDB_TITLEDB, "titledb", 0,0},
{RDB_SPIDERDB, "spiderdb", 0,0},
{RDB_SPIDERDB_DEPRECATED, "spiderdb", 0,0},
{RDB_NONE,0,0,0}
};

@ -1771,7 +1771,7 @@ bool XmlDoc::indexDoc ( ) {
}
// store the new request (store reply for this below)
rdbid_t rd = m_useSecondaryRdbs ? RDB2_SPIDERDB2 : RDB_SPIDERDB;
rdbid_t rd = m_useSecondaryRdbs ? RDB2_SPIDERDB2_DEPRECATED : RDB_SPIDERDB_DEPRECATED;
if (!m_metaList2.pushChar(rd)) {
logTrace( g_conf.m_logTraceXmlDoc, "END, return true, metaList2 pushChar returned false" );
return true;
@ -1812,7 +1812,7 @@ skipNewAdd1:
return true;
}
rdbid_t rd = m_useSecondaryRdbs ? RDB2_SPIDERDB2 : RDB_SPIDERDB;
rdbid_t rd = m_useSecondaryRdbs ? RDB2_SPIDERDB2_DEPRECATED : RDB_SPIDERDB_DEPRECATED;
if (!m_metaList2.pushChar(rd)) {
logTrace( g_conf.m_logTraceXmlDoc, "END, return true, metaList2 pushChar returned false" );
return true;
@ -11998,7 +11998,7 @@ void XmlDoc::printMetaList ( char *p , char *pend , SafeBuf *sb ) {
docId );
}
// key parsing logic taken from Address::makePlacedbKey
else if ( rdbId == RDB_SPIDERDB ) {
else if ( rdbId == RDB_SPIDERDB_DEPRECATED ) {
sb->safePrintf("<td><nobr>");
key128_t *k2 = (key128_t *)k;
if ( Spiderdb::isSpiderRequest(k2) ) {
@ -12078,7 +12078,7 @@ bool XmlDoc::verifyMetaList ( char *p , char *pend , bool forDelete ) {
// positive and a spiderdoc
// no, this is no longer the case because we add spider
// replies to the index when deleting or rejecting a doc.
//if ( m_deleteFromIndex && ! del && rdbId != RDB_SPIDERDB) {
//if ( m_deleteFromIndex && ! del && rdbId != RDB_SPIDERDB_DEPRECATED) {
// g_process.shutdownAbort(true); }
// get the key size. a table lookup in Rdb.cpp.
@ -12131,7 +12131,7 @@ bool XmlDoc::verifyMetaList ( char *p , char *pend , bool forDelete ) {
if ( del ) dataSize = 0;
// ensure spiderdb request recs have data/url in them
if ( (rdbId == RDB_SPIDERDB || rdbId == RDB2_SPIDERDB2) &&
if ( (rdbId == RDB_SPIDERDB_DEPRECATED || rdbId == RDB2_SPIDERDB2_DEPRECATED) &&
g_spiderdb.isSpiderRequest ( (spiderdbkey_t *)rec ) &&
! forDelete &&
! del &&
@ -12202,8 +12202,8 @@ bool XmlDoc::hashMetaList ( HashTableX *ht ,
// skip the data
p += dataSize;
// ignore spiderdb recs for parsing consistency check
if ( rdbId == RDB_SPIDERDB ) continue;
if ( rdbId == RDB2_SPIDERDB2 ) continue;
if ( rdbId == RDB_SPIDERDB_DEPRECATED ) continue;
if ( rdbId == RDB2_SPIDERDB2_DEPRECATED ) continue;
// ignore tagdb as well!
if ( rdbId == RDB_TAGDB || rdbId == RDB2_TAGDB2 ) continue;
@ -12304,7 +12304,7 @@ bool XmlDoc::hashMetaList ( HashTableX *ht ,
SafeBuf sb2;
// print it out
if ( rdbId == RDB_SPIDERDB ) {
if ( rdbId == RDB_SPIDERDB_DEPRECATED ) {
// get rec
if ( Spiderdb::isSpiderRequest((key128_t *)rec) ) {
SpiderRequest *sreq1 = (SpiderRequest *)rec;
@ -12652,7 +12652,7 @@ char *XmlDoc::getMetaList(bool forDelete) {
logTrace(g_conf.m_logTraceXmlDoc, "Adding spider reply to spiderdb");
// rdbid first
rdbid_t rd = m_useSecondaryRdbs ? RDB2_SPIDERDB2 : RDB_SPIDERDB;
rdbid_t rd = m_useSecondaryRdbs ? RDB2_SPIDERDB2_DEPRECATED : RDB_SPIDERDB_DEPRECATED;
*m_p++ = (char)rd;
// get this
@ -13356,7 +13356,7 @@ char *XmlDoc::getMetaList(bool forDelete) {
setStatus("adding SpiderReply to spiderdb");
// rdbid first
*m_p++ = (m_useSecondaryRdbs) ? RDB2_SPIDERDB2 : RDB_SPIDERDB;
*m_p++ = (m_useSecondaryRdbs) ? RDB2_SPIDERDB2_DEPRECATED : RDB_SPIDERDB_DEPRECATED;
// get this
if (!m_srepValid) {
@ -13423,7 +13423,7 @@ char *XmlDoc::getMetaList(bool forDelete) {
}
// copy it
*m_p++ = (m_useSecondaryRdbs) ? RDB2_SPIDERDB2 : RDB_SPIDERDB;
*m_p++ = (m_useSecondaryRdbs) ? RDB2_SPIDERDB2_DEPRECATED : RDB_SPIDERDB_DEPRECATED;
// store it back
gbmemcpy (m_p, &revisedReq, revisedReq.getRecSize());
@ -14849,8 +14849,8 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
// sanity check
if ( p + 1 + need > m_pend ) { g_process.shutdownAbort(true); }
// store the rdbId
if ( m_useSecondaryRdbs ) *p++ = RDB2_SPIDERDB2;
else *p++ = RDB_SPIDERDB;
if ( m_useSecondaryRdbs ) *p++ = RDB2_SPIDERDB2_DEPRECATED;
else *p++ = RDB_SPIDERDB_DEPRECATED;
// store the spider rec
gbmemcpy ( p , &ksr , need );
@ -16972,7 +16972,7 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
if ( m_sreqValid ) {
// must not block
SpiderRequest *oldsr = &m_sreq;
uint32_t shard = g_hostdb.getShardNum(RDB_SPIDERDB,oldsr);
uint32_t shard = g_hostdb.getShardNum(RDB_SPIDERDB_DEPRECATED,oldsr);
sb->safePrintf ("<tr><td><b>assigned spider shard</b>"
"</td>\n"
"<td><b>%" PRIu32"</b></td></tr>\n",shard);
@ -17490,7 +17490,7 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
int32_t spiderHostId = -1;
if (firstIp && firstIp != (int32_t *)-1) {
key128_t spiderKey = Spiderdb::makeFirstKey(*firstIp);
int32_t spiderShardNum = getShardNum(RDB_SPIDERDB, &spiderKey);
int32_t spiderShardNum = getShardNum(RDB_SPIDERDB_DEPRECATED, &spiderKey);
spiderHostId = g_hostdb.getHostIdWithSpideringEnabled(spiderShardNum, false);
}

164
main.cpp

@ -106,8 +106,8 @@ static const int32_t commandLineDumpdbRecSize = 10 * 1024 * 1024; //recSizes par
static void dumpTitledb (const char *coll, int32_t sfn, int32_t numFiles, bool includeTree,
int64_t docId , bool justPrintDups );
static int32_t dumpSpiderdb(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, int printStats, int32_t firstIp);
static int32_t dumpSpiderdbCsv(const char *coll);
//static int32_t dumpSpiderdb(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, int printStats, int32_t firstIp);
//static int32_t dumpSpiderdbCsv(const char *coll);
static void dumpTagdb(const char *coll, int32_t sfn, int32_t numFiles, bool includeTree, char req,
const char *site);
@ -121,9 +121,9 @@ static void dumpLinkdb(const char *coll, int32_t sfn, int32_t numFiles, bool inc
static void dumpUnwantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
static void dumpWantedTitledbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
static void dumpUnwantedSpiderdbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
static int32_t verifySpiderdb(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, int32_t firstIp);
//static void dumpUnwantedSpiderdbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree);
//
//static int32_t verifySpiderdb(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree, int32_t firstIp);
static int copyFiles(const char *dstDir);
@ -1291,19 +1291,19 @@ int main2 ( int argc , char *argv[] ) {
}
else if ( argv[cmdarg+1][0] == 'x' )
dumpDoledb (coll,startFileNum,numFiles,includeTree);
else if ( argv[cmdarg+1][0] == 's' ) {
int printStats = 0;
int32_t firstIp = 0;
if(cmdarg+6 < argc)
printStats = atol(argv[cmdarg+6]);
if(cmdarg+7 < argc)
firstIp = atoip(argv[cmdarg+7]);
int32_t ret = dumpSpiderdb ( coll, startFileNum, numFiles, includeTree, printStats, firstIp );
if ( ret == -1 ) {
fprintf(stdout,"error dumping spiderdb\n");
}
}
// else if ( argv[cmdarg+1][0] == 's' ) {
// int printStats = 0;
// int32_t firstIp = 0;
// if(cmdarg+6 < argc)
// printStats = atol(argv[cmdarg+6]);
// if(cmdarg+7 < argc)
// firstIp = atoip(argv[cmdarg+7]);
//
// int32_t ret = dumpSpiderdb ( coll, startFileNum, numFiles, includeTree, printStats, firstIp );
// if ( ret == -1 ) {
// fprintf(stdout,"error dumping spiderdb\n");
// }
// }
else if ( argv[cmdarg+1][0] == 'S' ) {
char *site = NULL;
if ( cmdarg+6 < argc ) {
@ -1332,8 +1332,8 @@ int main2 ( int argc , char *argv[] ) {
dumpUnwantedTitledbRecs(coll, startFileNum, numFiles, includeTree);
} else if (strcmp(argv[cmdarg+1], "wt") == 0) {
dumpWantedTitledbRecs(coll, startFileNum, numFiles, includeTree);
} else if (strcmp(argv[cmdarg+1], "us") == 0) {
dumpUnwantedSpiderdbRecs(coll, startFileNum, numFiles, includeTree);
// } else if (strcmp(argv[cmdarg+1], "us") == 0) {
// dumpUnwantedSpiderdbRecs(coll, startFileNum, numFiles, includeTree);
} else {
goto printHelp;
}
@ -1343,18 +1343,18 @@ int main2 ( int argc , char *argv[] ) {
return 0;
}
if(strcmp(cmd, "dumpcsv") == 0) {
g_conf.m_doingCommandLine = true; // so we do not log every collection coll.conf we load
if( !g_collectiondb.loadAllCollRecs()) {
log("db: Collectiondb init failed.");
return 1;
}
if(argv[cmdarg+1][0] == 's')
dumpSpiderdbCsv(argv[cmdarg+2]);
g_log.m_disabled = true;
g_collectiondb.reset();
return 0;
}
// if(strcmp(cmd, "dumpcsv") == 0) {
// g_conf.m_doingCommandLine = true; // so we do not log every collection coll.conf we load
// if( !g_collectiondb.loadAllCollRecs()) {
// log("db: Collectiondb init failed.");
// return 1;
// }
// if(argv[cmdarg+1][0] == 's')
// dumpSpiderdbCsv(argv[cmdarg+2]);
// g_log.m_disabled = true;
// g_collectiondb.reset();
// return 0;
// }
if(strcmp(cmd, "convertspiderdb") == 0) {
g_conf.m_doingCommandLine = true; // so we do not log every collection coll.conf we load
@ -1368,53 +1368,53 @@ int main2 ( int argc , char *argv[] ) {
return 0;
}
// . gb dump [dbLetter][coll][fileNum] [numFiles] [includeTree][termId]
// . spiderdb is special:
// gb dump s [coll][fileNum] [numFiles] [includeTree] [0=old|1=new]
// [priority] [printStats?]
if ( strcmp ( cmd , "verify" ) == 0 ) {
//
// tell Collectiondb, not to verify each rdb's data
//
g_dumpMode = true;
if ( cmdarg+1 >= argc ) goto printHelp;
int32_t startFileNum = 0;
int32_t numFiles = -1;
bool includeTree = true;
const char *coll = "";
// so we do not log every collection coll.conf we load
g_conf.m_doingCommandLine = true;
// we have to init collection db because we need to know if
// the collnum is legit or not in the tree
if ( ! g_collectiondb.loadAllCollRecs() ) {
log("db: Collectiondb init failed." ); return 1; }
if ( cmdarg+2 < argc ) coll = argv[cmdarg+2];
if ( cmdarg+3 < argc ) startFileNum = atoi(argv[cmdarg+3]);
if ( cmdarg+4 < argc ) numFiles = atoi(argv[cmdarg+4]);
if ( cmdarg+5 < argc ) includeTree = argToBoolean(argv[cmdarg+5]);
if ( argv[cmdarg+1][0] == 's' ) {
int32_t firstIp = 0;
if(cmdarg+6 < argc)
firstIp = atoip(argv[cmdarg+6]);
int32_t ret = verifySpiderdb ( coll, startFileNum, numFiles, includeTree, firstIp );
if ( ret == -1 ) {
fprintf(stdout,"error verifying spiderdb\n");
}
}
else {
goto printHelp;
}
// disable any further logging so final log msg is clear
g_log.m_disabled = true;
g_collectiondb.reset();
return 0;
}
// // . gb dump [dbLetter][coll][fileNum] [numFiles] [includeTree][termId]
// // . spiderdb is special:
// // gb dump s [coll][fileNum] [numFiles] [includeTree] [0=old|1=new]
// // [priority] [printStats?]
// if ( strcmp ( cmd , "verify" ) == 0 ) {
// //
// // tell Collectiondb, not to verify each rdb's data
// //
// g_dumpMode = true;
//
// if ( cmdarg+1 >= argc ) goto printHelp;
// int32_t startFileNum = 0;
// int32_t numFiles = -1;
// bool includeTree = true;
// const char *coll = "";
//
// // so we do not log every collection coll.conf we load
// g_conf.m_doingCommandLine = true;
//
// // we have to init collection db because we need to know if
// // the collnum is legit or not in the tree
// if ( ! g_collectiondb.loadAllCollRecs() ) {
// log("db: Collectiondb init failed." ); return 1; }
//
// if ( cmdarg+2 < argc ) coll = argv[cmdarg+2];
// if ( cmdarg+3 < argc ) startFileNum = atoi(argv[cmdarg+3]);
// if ( cmdarg+4 < argc ) numFiles = atoi(argv[cmdarg+4]);
// if ( cmdarg+5 < argc ) includeTree = argToBoolean(argv[cmdarg+5]);
//
// if ( argv[cmdarg+1][0] == 's' ) {
// int32_t firstIp = 0;
// if(cmdarg+6 < argc)
// firstIp = atoip(argv[cmdarg+6]);
//
// int32_t ret = verifySpiderdb ( coll, startFileNum, numFiles, includeTree, firstIp );
// if ( ret == -1 ) {
// fprintf(stdout,"error verifying spiderdb\n");
// }
// }
// else {
// goto printHelp;
// }
// // disable any further logging so final log msg is clear
// g_log.m_disabled = true;
// g_collectiondb.reset();
// return 0;
// }
@ -2706,6 +2706,7 @@ public:
int32_t m_numErrorReplies;
};
#if 0
static HashTableX g_ut;
static void addUStat1(const SpiderRequest *sreq, bool hadReply , int32_t now) {
@ -3356,7 +3357,7 @@ static int32_t dumpSpiderdbCsv(const char *coll) {
}
return 0;
}
#endif
// time speed of inserts into RdbTree for indexdb
static bool hashtest() {
@ -3874,6 +3875,7 @@ static void dumpWantedTitledbRecs(const char *coll, int32_t startFileNum, int32_
#if 0
static void dumpUnwantedSpiderdbRecs(const char *coll, int32_t startFileNum, int32_t numFiles, bool includeTree) {
if (startFileNum < 0) {
log(LOG_LOGIC, "db: Start file number is < 0. Must be >= 0.");
@ -4081,7 +4083,7 @@ static int32_t verifySpiderdb(const char *coll, int32_t startFileNum, int32_t nu
done:
return 0;
}
#endif
static bool parseTest(const char *coll, int64_t docId, const char *query) {

@ -11,7 +11,7 @@ enum rdbid_t {
RDB_TITLEDB = 3,
// RDB_SECTIONDB = 4,
// RDB_SYNCDB = 5,
RDB_SPIDERDB = 6,
RDB_SPIDERDB_DEPRECATED = 6,
RDB_DOLEDB = 7,
// RDB_TFNDB = 8,
RDB_CLUSTERDB = 9,
@ -33,7 +33,7 @@ enum rdbid_t {
// RDB2_INDEXDB2 = 21,
RDB2_TITLEDB2 = 22,
// RDB2_SECTIONDB2 = 23,
RDB2_SPIDERDB2 = 24,
RDB2_SPIDERDB2_DEPRECATED = 24,
// RDB2_TFNDB2 = 25,
RDB2_CLUSTERDB2 = 26,
// RDB2_DATEDB2 = 27,
@ -43,6 +43,8 @@ enum rdbid_t {
RDB2_TAGDB2 = 31,
RDB2_POSDB2 = 32,
// RDB2_CATDB2 = 33,
RDB_SPIDERDB_SQLITE = 34,
RDB2_SPIDERDB2_SQLITE = 35,
RDB_END
};