forked from Mirrors/privacore-open-source-search-engine
fix problem scanning spiderdb.
move dedup spiderdb code to RdbMerge.cpp where it really should be.
This commit is contained in:
@ -2061,8 +2061,8 @@ void RdbList::merge_r ( RdbList **lists ,
|
||||
char*xx=NULL;*xx=0; }
|
||||
|
||||
// dedup for spiderdb
|
||||
if ( rdbId == RDB_SPIDERDB )
|
||||
dedupSpiderdbList ( this , niceness , removeNegRecs );
|
||||
//if ( rdbId == RDB_SPIDERDB )
|
||||
// dedupSpiderdbList ( this , niceness , removeNegRecs );
|
||||
|
||||
/*
|
||||
if ( rdbId == RDB_POSDB ) {
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include "Msg3.h"
|
||||
#include "Indexdb.h"
|
||||
#include "Process.h"
|
||||
#include "Spider.h"
|
||||
|
||||
// declare the lock unlocked
|
||||
//static bool s_isMergeLocked = false;
|
||||
@ -565,6 +566,14 @@ bool RdbMerge::dumpList ( ) {
|
||||
//m_startKey += (unsigned long)1;
|
||||
KEYADD(m_startKey,1,m_ks);
|
||||
|
||||
/////
|
||||
//
|
||||
// dedup for spiderdb before we dump it. try to save disk space.
|
||||
//
|
||||
/////
|
||||
if ( m_rdbId == RDB_SPIDERDB )
|
||||
// removeNegRecs? = false
|
||||
dedupSpiderdbList(&m_list,m_niceness,false);
|
||||
|
||||
// if the startKey rolled over we're done
|
||||
//if ( m_startKey.n0 == 0LL && m_startKey.n1 == 0 ) m_doneMerging=true;
|
||||
|
@ -25,6 +25,7 @@ Rebalance::Rebalance ( ) {
|
||||
m_inRebalanceLoop = false;
|
||||
m_numForeignRecs = 0;
|
||||
m_rebalanceCount = 0LL;
|
||||
m_scannedCount = 0LL;
|
||||
// reset
|
||||
m_rdbNum = 0;
|
||||
m_collnum = 0;
|
||||
@ -234,8 +235,10 @@ void Rebalance::scanLoop ( ) {
|
||||
// scan it. returns true if done, false if blocked
|
||||
if ( ! scanRdb ( ) ) return;
|
||||
// note it
|
||||
log("rebal: moved %lli recs",m_rebalanceCount);
|
||||
log("rebal: moved %lli of %lli recs scanned",
|
||||
m_rebalanceCount,m_scannedCount);
|
||||
m_rebalanceCount = 0;
|
||||
m_scannedCount = 0;
|
||||
m_lastPercent = -1;
|
||||
}
|
||||
// reset it for next colls
|
||||
@ -315,6 +318,8 @@ bool Rebalance::scanRdb ( ) {
|
||||
|
||||
readAnother:
|
||||
|
||||
//log("rebal: loading list start = %s",KEYSTR(m_nextKey,rdb->m_ks));
|
||||
|
||||
if ( ! m_msg5.getList ( rdb->m_rdbId ,
|
||||
coll ,
|
||||
&m_list ,
|
||||
@ -376,6 +381,8 @@ bool Rebalance::gotList ( ) {
|
||||
|
||||
m_list.resetListPtr();
|
||||
|
||||
//log("rebal: got list of %li bytes",m_list.getListSize());
|
||||
|
||||
m_posMetaList.reset();
|
||||
m_negMetaList.reset();
|
||||
|
||||
@ -393,6 +400,11 @@ bool Rebalance::gotList ( ) {
|
||||
long shard = getShardNum ( rdbId , rec );
|
||||
// save last ptr
|
||||
last = rec;
|
||||
// debug!
|
||||
//m_list.getKey ( rec , m_nextKey );
|
||||
//log("rebal: checking key %s",KEYSTR(m_nextKey,ks));
|
||||
// count as scanned
|
||||
m_scannedCount++;
|
||||
// skip it if it belongs with us
|
||||
if ( shard == myShard ) continue;
|
||||
// count it
|
||||
@ -424,10 +436,14 @@ bool Rebalance::gotList ( ) {
|
||||
//m_negMetaList.pushChar ( rdbId );
|
||||
// make key a delete
|
||||
key[0] &= 0xfe;
|
||||
// for debug...
|
||||
//log("rebal: rm key %s",KEYSTR(key,ks));
|
||||
// and store that negative key
|
||||
m_negMetaList.safeMemcpy ( key , ks );
|
||||
}
|
||||
|
||||
//log("rebal: done reading list");
|
||||
|
||||
// update nextkey
|
||||
if ( last ) {
|
||||
// get the last key we scanned, all "ks" bytes of it.
|
||||
|
@ -26,6 +26,7 @@ class Rebalance {
|
||||
bool m_inRebalanceLoop;
|
||||
long m_numForeignRecs;
|
||||
long long m_rebalanceCount;
|
||||
long long m_scannedCount;
|
||||
|
||||
long m_rdbNum;
|
||||
collnum_t m_collnum;
|
||||
|
12
Spider.cpp
12
Spider.cpp
@ -62,11 +62,12 @@ long SpiderRequest::print ( SafeBuf *sbarg ) {
|
||||
SafeBuf tmp;
|
||||
if ( ! sb ) sb = &tmp;
|
||||
|
||||
sb->safePrintf("k.n1=0x%llx ",m_key.n1);
|
||||
sb->safePrintf("k.n0=0x%llx ",m_key.n0);
|
||||
//sb->safePrintf("k.n1=0x%llx ",m_key.n1);
|
||||
//sb->safePrintf("k.n0=0x%llx ",m_key.n0);
|
||||
sb->safePrintf("k=%s ",KEYSTR(this,
|
||||
getKeySizeFromRdbId(RDB_SPIDERDB)));
|
||||
|
||||
sb->safePrintf("uh48=%llu ",getUrlHash48());
|
||||
sb->safePrintf("parentDocId=%llu ",getParentDocId());
|
||||
// if negative bail early now
|
||||
if ( (m_key.n0 & 0x01) == 0x00 ) {
|
||||
sb->safePrintf("[DELETE]");
|
||||
@ -74,6 +75,9 @@ long SpiderRequest::print ( SafeBuf *sbarg ) {
|
||||
return sb->length();
|
||||
}
|
||||
|
||||
sb->safePrintf("recsize=%li ",getRecSize());
|
||||
sb->safePrintf("parentDocId=%llu ",getParentDocId());
|
||||
|
||||
sb->safePrintf("firstip=%s ",iptoa(m_firstIp) );
|
||||
sb->safePrintf("hostHash32=0x%lx ",m_hostHash32 );
|
||||
sb->safePrintf("domHash32=0x%lx ",m_domHash32 );
|
||||
@ -691,6 +695,8 @@ bool Spiderdb::verify ( char *coll ) {
|
||||
if ( shardNum == g_hostdb.getMyShardNum() ) got++;
|
||||
}
|
||||
if ( got != count ) {
|
||||
// tally it up
|
||||
g_rebalance.m_numForeignRecs += count - got;
|
||||
log ("db: Out of first %li records in spiderdb, "
|
||||
"only %li belong to our shard.",count,got);
|
||||
// exit if NONE, we probably got the wrong data
|
||||
|
4
main.cpp
4
main.cpp
@ -6332,7 +6332,9 @@ long dumpSpiderdb ( char *coll,
|
||||
printf( "offset=%lli ",curOff);
|
||||
g_spiderdb.print ( srec );
|
||||
printf(" age=%lis",now-sreq->m_addedTime);
|
||||
printf(" hadReply=%li\n",(long)hadReply);
|
||||
printf(" hadReply=%li",(long)hadReply);
|
||||
printf(" shard=%li\n",
|
||||
(long)g_hostdb.getShardNum(RDB_SPIDERDB,sreq));
|
||||
}
|
||||
|
||||
// print a counter
|
||||
|
Reference in New Issue
Block a user