privacore-open-source-searc.../Titledb.cpp

300 lines
8.8 KiB
C++

#include "Titledb.h"
#include "Collectiondb.h"
#include "JobScheduler.h"
#include "Rebalance.h"
#include "Process.h"
#include "Conf.h"
#include "XmlDoc.h"
#include "UrlBlockCheck.h"
Titledb g_titledb;
Titledb g_titledb2;
// reset rdb
void Titledb::reset() { m_rdb.reset(); }
// init our rdb
bool Titledb::init ( ) {
// key sanity tests
int64_t uh48 = 0x1234567887654321LL & 0x0000ffffffffffffLL;
int64_t docId = 123456789;
key96_t k = makeKey(docId,uh48,false);
if ( getDocId(&k) != docId ) { g_process.shutdownAbort(true);}
if ( getUrlHash48(&k) != uh48 ) { g_process.shutdownAbort(true);}
const char *url = "http://.ezinemark.com/int32_t-island-child-custody-attorneys-new-york-visitation-lawyers-melville-legal-custody-law-firm-45f00bbed18.html";
Url uu;
uu.set(url);
const char *d1 = uu.getDomain();
int32_t dlen1 = uu.getDomainLen();
int32_t dlen2 = 0;
const char *d2 = getDomFast ( url , &dlen2 );
if ( !d1 || !d2 ) { g_process.shutdownAbort(true); }
if ( dlen1 != dlen2 ) { g_process.shutdownAbort(true); }
// another one
url = "http://ok/";
uu.set(url);
const char *d1a = uu.getDomain();
dlen1 = uu.getDomainLen();
dlen2 = 0;
const char *d2a = getDomFast ( url , &dlen2 );
if ( d1a || d2a ) { g_process.shutdownAbort(true); }
if ( dlen1 != dlen2 ) { g_process.shutdownAbort(true); }
// another one
url = "http://www.example.net/";
uu.set(url);
d1a = uu.getDomain();
dlen1 = uu.getDomainLen();
dlen2 = 0;
d2a = getDomFast ( url , &dlen2 );
if ( !d1 || !d2 ) { g_process.shutdownAbort(true); }
if ( dlen1 != dlen2 ) { g_process.shutdownAbort(true); }
// another one
url = "http://www.example.ac.uk/";
uu.set(url);
d1a = uu.getDomain();
dlen1 = uu.getDomainLen();
dlen2 = 0;
d2a = getDomFast ( url , &dlen2 );
if ( !d1 || !d2 ) { g_process.shutdownAbort(true); }
if ( dlen1 != dlen2 ) { g_process.shutdownAbort(true); }
// . what's max # of tree nodes?
// . assume avg TitleRec size (compressed html doc) is about 1k we get:
// . NOTE: overhead is about 32 bytes per node
int32_t maxTreeNodes = g_conf.m_titledbMaxTreeMem / (1*1024);
// initialize our own internal rdb
return m_rdb.init("titledb",
getFixedDataSize(),
//g_conf.m_titledbMinFilesToMerge ,
// this should not really be changed...
-1,
g_conf.m_titledbMaxTreeMem,
maxTreeNodes,
getUseHalfKeys(),
getKeySize(),
false); //useIndexFile
}
// init the rebuild/secondary rdb, used by PageRepair.cpp
bool Titledb::init2 ( int32_t treeMem ) {
// . what's max # of tree nodes?
// . assume avg TitleRec size (compressed html doc) is about 1k we get:
// . NOTE: overhead is about 32 bytes per node
int32_t maxTreeNodes = treeMem / (1*1024);
// initialize our own internal rdb
return m_rdb.init("titledbRebuild",
getFixedDataSize(),
240, // MinFilesToMerge
treeMem,
maxTreeNodes,
getUseHalfKeys(),
getKeySize(),
false); //useIndexFile
}
bool Titledb::verify(const char *coll) {
log ( LOG_DEBUG, "db: Verifying Titledb for coll %s...", coll );
Msg5 msg5;
RdbList list;
key96_t startKey;
key96_t endKey;
startKey.setMin();
endKey.setMax();
const CollectionRec *cr = g_collectiondb.getRec(coll);
if ( ! msg5.getList ( RDB_TITLEDB ,
cr->m_collnum ,
&list ,
&startKey ,
&endKey ,
1024*1024 , // minRecSizes ,
true , // includeTree ,
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
-1 , // maxRetries
false)) // isRealMerge
{
log(LOG_DEBUG, "db: HEY! it did not block");
return false;
}
int32_t count = 0;
int32_t got = 0;
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key96_t k = list.getCurrentKey();
// skip negative keys
if ( (k.n0 & 0x01) == 0x00 ) continue;
count++;
uint32_t shardNum = getShardNum ( RDB_TITLEDB, &k );
if ( shardNum == getMyShardNum() ) got++;
}
if ( got != count ) {
// tally it up
g_rebalance.m_numForeignRecs += count - got;
log ("db: Out of first %" PRId32" records in titledb, "
"only %" PRId32" belong to our shard. c=%s",count,got,coll);
// exit if NONE, we probably got the wrong data
if ( count > 10 && got == 0 )
log("db: Are you sure you have the right "
"data in the right directory? "
"coll=%s "
"Exiting.",
coll);
// repeat with log
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key96_t k = list.getCurrentKey();
int32_t shardNum = getShardNum ( RDB_TITLEDB, &k );
log("db: docid=%" PRId64" shard=%" PRId32,
getDocId(&k),shardNum);
}
// don't exit any more, allow it, but do not delete
// recs that belong to different shards when we merge now!
log ( "db: db shards unbalanced. "
"Click autoscale in master controls.");
//return false;
return true;
}
log ( LOG_DEBUG, "db: Titledb passed verification successfully for %" PRId32
" recs.", count );
// DONE
return true;
}
bool Titledb::isLocal ( int64_t docId ) {
return ( getShardNumFromDocId(docId) == getMyShardNum() );
}
// . make the key of a TitleRec from a docId
// . remember to set the low bit so it's not a delete
// . hi bits are set in the key
key96_t Titledb::makeKey ( int64_t docId, int64_t uh48, bool isDel ){
key96_t key ;
key.n1 = (uint32_t)(docId >> 6); // (NUMDOCIDBITS-32));
int64_t n0 = (uint64_t)(docId&0x3f);
// sanity check
if ( uh48 & 0xffff000000000000LL ) { g_process.shutdownAbort(true); }
// make room for uh48
n0 <<= 48;
n0 |= uh48;
// 9 bits reserved
n0 <<= 9;
// final del bit
n0 <<= 1;
if ( ! isDel ) n0 |= 0x01;
// store it
key.n0 = n0;
return key;
};
void Titledb::printKey(const char *k) {
logf(LOG_TRACE, "k=%s "
"docId=%012" PRId64" "
"urlHash48=%02" PRId64" "
"isDel=%d",
KEYSTR(k, sizeof(key96_t)),
getDocId((key96_t*)k),
getUrlHash48((key96_t*)k),
KEYNEG(k));
}
void Titledb::validateSerializedRecord(const char *rec, int32_t recSize) {
char *debugp = (char*)rec;
key96_t debug_titleRecKey = *(key96_t *)debugp;
bool debug_keyneg = (debug_titleRecKey.n0 & 0x01) == 0x00;
int64_t debug_docId = Titledb::getDocIdFromKey(&debug_titleRecKey);
if ( debug_keyneg ) {
logTrace(g_conf.m_logTraceTitledb, "TitleDB rec verified. Delete key for DocId=%" PRId64 "", debug_docId);
}
else {
debugp += sizeof(key96_t);
// the size of the data that follows
int32_t debug_dataSize = *(int32_t *) debugp;
if( debug_dataSize < 4 ) {
log(LOG_ERROR, "TITLEDB CORRUPTION. Record shows size of %" PRId32" which is too small. DocId=%" PRId64 "", debug_dataSize, debug_docId);
gbshutdownLogicError();
}
debugp += 4;
// what's the size of the uncompressed compressed stuff below here?
int32_t debug_ubufSize = *(int32_t *) debugp;
if( debug_ubufSize <= 0 ) {
log(LOG_ERROR, "TITLEDB CORRUPTION. Record shows uncompressed size of %" PRId32". DocId=%" PRId64 "", debug_ubufSize, debug_docId);
gbshutdownLogicError();
}
logTrace(g_conf.m_logTraceTitledb, "TitleDB rec verified. recSize %" PRId32 ", uncompressed %" PRId32". DocId=%" PRId64 "", recSize, debug_ubufSize, debug_docId);
}
}
void filterTitledbList(RdbList *list) {
char *newList = list->getList();
char *dst = newList;
char *lastKey = NULL;
int32_t oldSize = list->getListSize();
int32_t filteredCount = 0;
for (list->resetListPtr(); !list->isExhausted();) {
char *rec = list->getCurrentRec();
int32_t recSize = list->getCurrentRecSize();
// pre skip it (necessary because we manipulate the raw list below)
list->skipCurrentRecord();
if (!KEYNEG(rec)) {
XmlDoc xd;
if (xd.set2(rec, recSize, "main", 0)) {
if (isUrlBlocked(*(xd.getFirstUrl()))) {
++filteredCount;
continue;
}
}
}
lastKey = dst;
memmove(dst, rec, recSize);
dst += recSize;
}
// sanity check
if ( dst < list->getList() || dst > list->getListEnd() ) {
g_process.shutdownAbort(true);
}
// and stick our newly filtered list in there
list->setListSize(dst - newList);
// set to end i guess
list->setListEnd(list->getList() + list->getListSize());
list->setListPtr(dst);
list->setListPtrHi(NULL);
log(LOG_DEBUG, "db: filtered %" PRId32" entries of %" PRId32 " bytes out of %" PRId32 " bytes.",
filteredCount, oldSize - list->getListSize(), oldSize);
if( !lastKey ) {
logError("lastKey is null. Should not happen?");
} else {
list->setLastKey(lastKey);
}
}