300 lines
8.8 KiB
C++
300 lines
8.8 KiB
C++
|
|
#include "Titledb.h"
|
|
#include "Collectiondb.h"
|
|
#include "JobScheduler.h"
|
|
#include "Rebalance.h"
|
|
#include "Process.h"
|
|
#include "Conf.h"
|
|
#include "XmlDoc.h"
|
|
#include "UrlBlockCheck.h"
|
|
|
|
Titledb g_titledb;
|
|
Titledb g_titledb2;
|
|
|
|
// reset rdb
|
|
void Titledb::reset() { m_rdb.reset(); }
|
|
|
|
// init our rdb
|
|
bool Titledb::init ( ) {
|
|
|
|
// key sanity tests
|
|
int64_t uh48 = 0x1234567887654321LL & 0x0000ffffffffffffLL;
|
|
int64_t docId = 123456789;
|
|
key96_t k = makeKey(docId,uh48,false);
|
|
if ( getDocId(&k) != docId ) { g_process.shutdownAbort(true);}
|
|
if ( getUrlHash48(&k) != uh48 ) { g_process.shutdownAbort(true);}
|
|
|
|
const char *url = "http://.ezinemark.com/int32_t-island-child-custody-attorneys-new-york-visitation-lawyers-melville-legal-custody-law-firm-45f00bbed18.html";
|
|
Url uu;
|
|
uu.set(url);
|
|
const char *d1 = uu.getDomain();
|
|
int32_t dlen1 = uu.getDomainLen();
|
|
int32_t dlen2 = 0;
|
|
const char *d2 = getDomFast ( url , &dlen2 );
|
|
if ( !d1 || !d2 ) { g_process.shutdownAbort(true); }
|
|
if ( dlen1 != dlen2 ) { g_process.shutdownAbort(true); }
|
|
|
|
// another one
|
|
url = "http://ok/";
|
|
uu.set(url);
|
|
const char *d1a = uu.getDomain();
|
|
dlen1 = uu.getDomainLen();
|
|
dlen2 = 0;
|
|
const char *d2a = getDomFast ( url , &dlen2 );
|
|
if ( d1a || d2a ) { g_process.shutdownAbort(true); }
|
|
if ( dlen1 != dlen2 ) { g_process.shutdownAbort(true); }
|
|
|
|
// another one
|
|
url = "http://www.example.net/";
|
|
uu.set(url);
|
|
d1a = uu.getDomain();
|
|
dlen1 = uu.getDomainLen();
|
|
dlen2 = 0;
|
|
d2a = getDomFast ( url , &dlen2 );
|
|
if ( !d1 || !d2 ) { g_process.shutdownAbort(true); }
|
|
if ( dlen1 != dlen2 ) { g_process.shutdownAbort(true); }
|
|
|
|
// another one
|
|
url = "http://www.example.ac.uk/";
|
|
uu.set(url);
|
|
d1a = uu.getDomain();
|
|
dlen1 = uu.getDomainLen();
|
|
dlen2 = 0;
|
|
d2a = getDomFast ( url , &dlen2 );
|
|
if ( !d1 || !d2 ) { g_process.shutdownAbort(true); }
|
|
if ( dlen1 != dlen2 ) { g_process.shutdownAbort(true); }
|
|
|
|
// . what's max # of tree nodes?
|
|
// . assume avg TitleRec size (compressed html doc) is about 1k we get:
|
|
// . NOTE: overhead is about 32 bytes per node
|
|
int32_t maxTreeNodes = g_conf.m_titledbMaxTreeMem / (1*1024);
|
|
|
|
// initialize our own internal rdb
|
|
return m_rdb.init("titledb",
|
|
getFixedDataSize(),
|
|
//g_conf.m_titledbMinFilesToMerge ,
|
|
// this should not really be changed...
|
|
-1,
|
|
g_conf.m_titledbMaxTreeMem,
|
|
maxTreeNodes,
|
|
getUseHalfKeys(),
|
|
getKeySize(),
|
|
false); //useIndexFile
|
|
}
|
|
|
|
// init the rebuild/secondary rdb, used by PageRepair.cpp
|
|
bool Titledb::init2 ( int32_t treeMem ) {
|
|
// . what's max # of tree nodes?
|
|
// . assume avg TitleRec size (compressed html doc) is about 1k we get:
|
|
// . NOTE: overhead is about 32 bytes per node
|
|
int32_t maxTreeNodes = treeMem / (1*1024);
|
|
// initialize our own internal rdb
|
|
return m_rdb.init("titledbRebuild",
|
|
getFixedDataSize(),
|
|
240, // MinFilesToMerge
|
|
treeMem,
|
|
maxTreeNodes,
|
|
getUseHalfKeys(),
|
|
getKeySize(),
|
|
false); //useIndexFile
|
|
}
|
|
|
|
bool Titledb::verify(const char *coll) {
|
|
log ( LOG_DEBUG, "db: Verifying Titledb for coll %s...", coll );
|
|
|
|
Msg5 msg5;
|
|
RdbList list;
|
|
key96_t startKey;
|
|
key96_t endKey;
|
|
startKey.setMin();
|
|
endKey.setMax();
|
|
const CollectionRec *cr = g_collectiondb.getRec(coll);
|
|
|
|
if ( ! msg5.getList ( RDB_TITLEDB ,
|
|
cr->m_collnum ,
|
|
&list ,
|
|
&startKey ,
|
|
&endKey ,
|
|
1024*1024 , // minRecSizes ,
|
|
true , // includeTree ,
|
|
0 , // startFileNum ,
|
|
-1 , // numFiles ,
|
|
NULL , // state
|
|
NULL , // callback
|
|
0 , // niceness
|
|
false , // err correction?
|
|
-1 , // maxRetries
|
|
false)) // isRealMerge
|
|
{
|
|
log(LOG_DEBUG, "db: HEY! it did not block");
|
|
return false;
|
|
}
|
|
|
|
int32_t count = 0;
|
|
int32_t got = 0;
|
|
for ( list.resetListPtr() ; ! list.isExhausted() ;
|
|
list.skipCurrentRecord() ) {
|
|
key96_t k = list.getCurrentKey();
|
|
// skip negative keys
|
|
if ( (k.n0 & 0x01) == 0x00 ) continue;
|
|
count++;
|
|
uint32_t shardNum = getShardNum ( RDB_TITLEDB, &k );
|
|
if ( shardNum == getMyShardNum() ) got++;
|
|
}
|
|
if ( got != count ) {
|
|
// tally it up
|
|
g_rebalance.m_numForeignRecs += count - got;
|
|
log ("db: Out of first %" PRId32" records in titledb, "
|
|
"only %" PRId32" belong to our shard. c=%s",count,got,coll);
|
|
// exit if NONE, we probably got the wrong data
|
|
if ( count > 10 && got == 0 )
|
|
log("db: Are you sure you have the right "
|
|
"data in the right directory? "
|
|
"coll=%s "
|
|
"Exiting.",
|
|
coll);
|
|
// repeat with log
|
|
for ( list.resetListPtr() ; ! list.isExhausted() ;
|
|
list.skipCurrentRecord() ) {
|
|
key96_t k = list.getCurrentKey();
|
|
int32_t shardNum = getShardNum ( RDB_TITLEDB, &k );
|
|
log("db: docid=%" PRId64" shard=%" PRId32,
|
|
getDocId(&k),shardNum);
|
|
}
|
|
|
|
// don't exit any more, allow it, but do not delete
|
|
// recs that belong to different shards when we merge now!
|
|
log ( "db: db shards unbalanced. "
|
|
"Click autoscale in master controls.");
|
|
//return false;
|
|
return true;
|
|
}
|
|
|
|
log ( LOG_DEBUG, "db: Titledb passed verification successfully for %" PRId32
|
|
" recs.", count );
|
|
// DONE
|
|
return true;
|
|
}
|
|
|
|
bool Titledb::isLocal ( int64_t docId ) {
|
|
return ( getShardNumFromDocId(docId) == getMyShardNum() );
|
|
}
|
|
|
|
// . make the key of a TitleRec from a docId
|
|
// . remember to set the low bit so it's not a delete
|
|
// . hi bits are set in the key
|
|
key96_t Titledb::makeKey ( int64_t docId, int64_t uh48, bool isDel ){
|
|
key96_t key ;
|
|
key.n1 = (uint32_t)(docId >> 6); // (NUMDOCIDBITS-32));
|
|
|
|
int64_t n0 = (uint64_t)(docId&0x3f);
|
|
// sanity check
|
|
if ( uh48 & 0xffff000000000000LL ) { g_process.shutdownAbort(true); }
|
|
// make room for uh48
|
|
n0 <<= 48;
|
|
n0 |= uh48;
|
|
// 9 bits reserved
|
|
n0 <<= 9;
|
|
// final del bit
|
|
n0 <<= 1;
|
|
if ( ! isDel ) n0 |= 0x01;
|
|
// store it
|
|
key.n0 = n0;
|
|
return key;
|
|
};
|
|
|
|
void Titledb::printKey(const char *k) {
|
|
logf(LOG_TRACE, "k=%s "
|
|
"docId=%012" PRId64" "
|
|
"urlHash48=%02" PRId64" "
|
|
"isDel=%d",
|
|
KEYSTR(k, sizeof(key96_t)),
|
|
getDocId((key96_t*)k),
|
|
getUrlHash48((key96_t*)k),
|
|
KEYNEG(k));
|
|
}
|
|
|
|
void Titledb::validateSerializedRecord(const char *rec, int32_t recSize) {
|
|
char *debugp = (char*)rec;
|
|
key96_t debug_titleRecKey = *(key96_t *)debugp;
|
|
bool debug_keyneg = (debug_titleRecKey.n0 & 0x01) == 0x00;
|
|
int64_t debug_docId = Titledb::getDocIdFromKey(&debug_titleRecKey);
|
|
|
|
if ( debug_keyneg ) {
|
|
logTrace(g_conf.m_logTraceTitledb, "TitleDB rec verified. Delete key for DocId=%" PRId64 "", debug_docId);
|
|
}
|
|
else {
|
|
debugp += sizeof(key96_t);
|
|
|
|
// the size of the data that follows
|
|
int32_t debug_dataSize = *(int32_t *) debugp;
|
|
if( debug_dataSize < 4 ) {
|
|
log(LOG_ERROR, "TITLEDB CORRUPTION. Record shows size of %" PRId32" which is too small. DocId=%" PRId64 "", debug_dataSize, debug_docId);
|
|
gbshutdownLogicError();
|
|
}
|
|
debugp += 4;
|
|
|
|
// what's the size of the uncompressed compressed stuff below here?
|
|
int32_t debug_ubufSize = *(int32_t *) debugp;
|
|
if( debug_ubufSize <= 0 ) {
|
|
log(LOG_ERROR, "TITLEDB CORRUPTION. Record shows uncompressed size of %" PRId32". DocId=%" PRId64 "", debug_ubufSize, debug_docId);
|
|
gbshutdownLogicError();
|
|
}
|
|
logTrace(g_conf.m_logTraceTitledb, "TitleDB rec verified. recSize %" PRId32 ", uncompressed %" PRId32". DocId=%" PRId64 "", recSize, debug_ubufSize, debug_docId);
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Compact a titledb list in place, dropping blocked documents.
//
// A record is dropped only when its key is positive, XmlDoc::set2() accepts
// it, and isUrlBlocked() returns true for the doc's first url. Every other
// record (negative keys, records set2() rejects) is kept and moved down to
// close the gaps. Afterwards the list size, end, ptr, ptrHi and last key are
// updated to describe the compacted data.
void filterTitledbList(RdbList *list) {
	char *listStart = list->getList();
	char *writePos  = listStart;
	char *lastKept  = NULL;

	int32_t sizeBefore = list->getListSize();
	int32_t numDropped = 0;

	list->resetListPtr();
	while ( !list->isExhausted() ) {
		char *rec = list->getCurrentRec();
		int32_t recSize = list->getCurrentRecSize();

		// advance first: the raw bytes behind the read position get
		// overwritten by the memmove below
		list->skipCurrentRecord();

		if ( !KEYNEG(rec) ) {
			XmlDoc xd;
			if ( xd.set2(rec, recSize, "main", 0) &&
			     isUrlBlocked(*(xd.getFirstUrl())) ) {
				++numDropped;
				continue;
			}
		}

		// keep this record: slide it down to the write position
		lastKept = writePos;
		memmove(writePos, rec, recSize);
		writePos += recSize;
	}

	// sanity check: the write position must still lie within the list
	if ( writePos < list->getList() || writePos > list->getListEnd() ) {
		g_process.shutdownAbort(true);
	}

	// make the list describe the filtered data
	list->setListSize(writePos - listStart);
	// set to end i guess
	list->setListEnd(list->getList() + list->getListSize());
	list->setListPtr(writePos);
	list->setListPtrHi(NULL);

	log(LOG_DEBUG, "db: filtered %" PRId32" entries of %" PRId32 " bytes out of %" PRId32 " bytes.",
	    numDropped, sizeBefore - list->getListSize(), sizeBefore);

	if ( lastKept ) {
		list->setLastKey(lastKept);
	} else {
		// nothing was kept (or the list was empty to begin with)
		logError("lastKey is null. Should not happen?");
	}
}
|