privacore-open-source-searc.../Linkdb.cpp
2018-07-20 16:12:23 +02:00

242 lines
7.2 KiB
C++

#include "Linkdb.h"
#include "Conf.h"
#include "Titledb.h"
#include "linkspam.h"
#include "Collectiondb.h"
#include "Rebalance.h"
#include "Process.h"
#include "ip.h"
// Global Linkdb instance.
Linkdb g_linkdb;
// Second global Linkdb instance.
// NOTE(review): presumably the rebuild/secondary linkdb that is set up through
// init2() -- confirm against the callers, which are not visible in this file.
Linkdb g_linkdb2;
// Resets this Linkdb by delegating to reset() on the wrapped m_rdb.
void Linkdb::reset ( ) {
	m_rdb.reset();
}
// Self-tests the "url" key packing helpers and then initializes the primary
// "linkdb" Rdb.
//
// A key is built with makeKey_uk() from pseudo-random field values and every
// field is read back through its *_uk accessor; any mismatch calls
// g_process.shutdownAbort(true). The setters setDiscoveryDate_uk() and
// setIp32_uk() are exercised the same way.
//
// Returns the result of m_rdb.init().
bool Linkdb::init ( ) {
	// --- sanity tests: build a key from pseudo-random field values ---
	uint32_t linkeeSiteHash32 = (uint32_t)rand();
	uint32_t linkerSiteHash32 = (uint32_t)rand();
	uint64_t linkeeUrlHash64 = (uint64_t)rand() << 32LL | rand();
	// mask it to 32+15 bits
	linkeeUrlHash64 &= 0x00007fffffffffffLL;
	unsigned char linkerSiteRank = 13;
	int32_t ip = rand();
	int32_t ipdom3 = ipdom(ip);
	int64_t docId = ((uint64_t)rand() << 32 | rand()) & DOCID_MASK;
	int32_t discoveryDate = 1339784732;
	int32_t lostDate = discoveryDate + 86400*23;
	char linkSpam = 1;

	key224_t k = makeKey_uk ( linkeeSiteHash32 ,
				  linkeeUrlHash64 ,
				  linkSpam , // islinkspam?
				  linkerSiteRank ,
				  ip ,
				  docId ,
				  discoveryDate ,
				  lostDate ,
				  false , // newaddtooldpage?
				  linkerSiteHash32 ,
				  false ); // is del?

	// The key stores dates as whole days relative to LINKDBEPOCH, so the
	// value expected back is the discovery date rounded down to a day
	// boundary.
	// NOTE: the lost date is passed into the key but is not read back and
	// verified by this self test.
	uint32_t epoch = LINKDBEPOCH;
	int32_t dd2 = (discoveryDate - epoch) / 86400;
	dd2 = dd2 * 86400 + epoch;

	// --- read every field back and compare ---
	if ( getLinkeeSiteHash32_uk(&k) != linkeeSiteHash32 ) { g_process.shutdownAbort(true); }
	if ( getLinkeeUrlHash64_uk(&k) != linkeeUrlHash64 ) { g_process.shutdownAbort(true); }
	if ( isLinkSpam_uk ( &k ) != linkSpam ) { g_process.shutdownAbort(true); }
	if ( getLinkerSiteHash32_uk(&k) != linkerSiteHash32 ) { g_process.shutdownAbort(true); }
	if ( getLinkerSiteRank_uk(&k) != linkerSiteRank ) { g_process.shutdownAbort(true); }
	if ( getLinkerIp24_uk ( &k ) != ipdom3 ) { g_process.shutdownAbort(true); }
	if ( getLinkerIp_uk ( &k ) != ip ) { g_process.shutdownAbort(true); }
	if ( getLinkerDocId_uk( &k ) != docId ) { g_process.shutdownAbort(true); }
	if ( getDiscoveryDate_uk(&k) != dd2 ) { g_process.shutdownAbort(true); }

	// --- setters must round-trip as well ---
	setDiscoveryDate_uk ( &k , discoveryDate );
	if ( getDiscoveryDate_uk(&k) != dd2 ) { g_process.shutdownAbort(true); }
	int32_t ip3 = 0xabcdef12;
	setIp32_uk ( &k , ip3 );
	int32_t ip4 = getLinkerIp_uk ( &k );
	if ( ip3 != ip4 ) { g_process.shutdownAbort(true); }

	// --- size the in-memory tree ---
	int64_t maxTreeMem = g_conf.m_linkdbMaxTreeMem;
	// Each tree node is budgeted as the key (sizeof(key224_t)) plus 16
	// bytes of per-node overhead.
	// NOTE(review): the 16 presumably covers the tree's per-node links and
	// bookkeeping -- confirm against the tree implementation.
	int32_t maxTreeNodes = maxTreeMem / (sizeof(key224_t) + 16);

	// init the rdb
	return m_rdb.init ( "linkdb" ,
			    0 , // fixeddatasize is 0 since no data
			    // keep it high since we are mostly ssds now and
			    // the reads are small...
			    -1 , // was g_conf.m_linkdbMinFilesToMerge
			    maxTreeMem ,
			    maxTreeNodes ,
			    false , // use half keys? no
			    sizeof(key224_t) , // key size
			    false ); // useIndexFile
}
// Initializes the rebuild/secondary rdb ("linkdbRebuild"); per the original
// note this is used by PageRepair.cpp.
//
// treeMem is the memory budget handed to the rdb for its in-memory tree; the
// node limit is derived from it below. Returns the result of m_rdb.init().
bool Linkdb::init2 ( int32_t treeMem ) {
	// Per-node budget: the key, 12 + 4 bytes of per-node overhead and one
	// collnum_t.
	const int32_t bytesPerNode = sizeof(key224_t) + 12 + 4 + sizeof(collnum_t);
	const int32_t nodeLimit = treeMem / bytesPerNode;

	// initialize our own internal rdb
	return m_rdb.init ( "linkdbRebuild" ,
			    0 ,                // fixed data size: keys only, no data
			    50 ,               // min files to merge
			    treeMem ,          // max tree memory
			    nodeLimit ,        // max tree nodes
			    false ,            // half keys? no
			    sizeof(key224_t) , // key size
			    false );           // useIndexFile
}
// Packs the description of one link into a linkdb "url" key.
//
// Field layout, most significant word first:
//   n3: linkeeSiteHash32 (32 bits) | bits 15..46 of the masked linkee url hash
//   n2: low 15 bits of the url hash | link-spam bit | ~linkerSiteRank (8 bits,
//       only 4 are really needed) | full linkerIp (32 bits) |
//       linkerDocId >> 30
//   n1: low 30 bits of linkerDocId | 2 reserved bits | discovery day (14 bits)
//       | 1 reserved bit | newAddToOldPage bit | lost day (14 bits) |
//       top 2 bits of linkerSiteHash32
//   n0: linkerSiteHash32 shifted left by two | unused half bit | del bit
//       (set when the key is NOT a delete)
//
// Dates are stored as whole days since LINKDBEPOCH: a date of 0 is stored as
// 0, a non-zero date earlier than LINKDBEPOCH is treated as LINKDBEPOCH, and
// the day count is capped at 0x3fff (the original notes that makeEndKey_uk()
// passes maxed-out dates).
key224_t Linkdb::makeKey_uk ( uint32_t linkeeSiteHash32 ,
			      uint64_t linkeeUrlHash64 ,
			      bool isLinkSpam ,
			      unsigned char linkerSiteRank ,
			      uint32_t linkerIp ,
			      int64_t linkerDocId ,
			      uint32_t discoveryDate ,
			      uint32_t lostDate ,
			      bool newAddToOldPage ,
			      uint32_t linkerSiteHash32 ,
			      bool isDelete ) {
	const uint32_t epoch = LINKDBEPOCH;
	const uint32_t maxDays = 0x3fff;
	const uint64_t docIdBits = static_cast<uint64_t>(linkerDocId);

	// only LDB_MAXURLHASH worth of the url hash is kept
	const uint64_t urlHash = linkeeUrlHash64 & LDB_MAXURLHASH;

	// discovery date -> days since the epoch (0 stays 0)
	uint32_t discoveryDays = 0;
	if ( discoveryDate != 0 ) {
		const uint32_t when = ( discoveryDate < epoch ) ? epoch : discoveryDate;
		discoveryDays = ( when - epoch ) / 86400;
		if ( discoveryDays > maxDays ) discoveryDays = maxDays;
	}

	// the "lost" date -> days since the epoch. 0 if not yet lost.
	uint32_t lostDays = 0;
	if ( lostDate != 0 ) {
		const uint32_t when = ( lostDate < epoch ) ? epoch : lostDate;
		lostDays = ( when - epoch ) / 86400;
		if ( lostDays > maxDays ) lostDays = maxDays;
	}

	// most significant word: site hash + upper part of the url hash
	uint64_t word3 = linkeeSiteHash32;
	word3 = ( word3 << 32 ) | ( ( urlHash >> 15 ) & 0xffffffff );

	// rest of the url hash, spam bit, inverted site rank, ip, docid top
	uint64_t word2 = urlHash & 0x7fff;
	word2 = ( word2 << 1 ) | ( isLinkSpam ? 1ULL : 0ULL );
	word2 = ( word2 << 8 ) | static_cast<unsigned char>( ~linkerSiteRank );
	// the full 32-bit ip (top byte first, then the low 24 bits)
	word2 = ( word2 << 32 ) | linkerIp;
	word2 = ( word2 << 8 ) | ( docIdBits >> 30 );

	// rest of the docid, the dates and flags, top of the linker site hash
	uint64_t word1 = docIdBits & 0x3fffffffULL;
	// two reserved bits, then 14 bits of discovery day
	word1 = ( word1 << 16 ) | discoveryDays;
	// one reserved bit, then the newAddToOldPage flag
	word1 = ( word1 << 2 ) | ( newAddToOldPage ? 1ULL : 0ULL );
	word1 = ( word1 << 14 ) | lostDays;
	// 2 bits of linker site hash
	word1 = ( word1 << 2 ) | ( linkerSiteHash32 >> 30 );

	key224_t k;
	k.n3 = word3;
	k.n2 = word2;
	k.n1 = word1;
	// rest of the linker site hash, followed by the (now unused) half bit
	// and the del bit
	k.n0 = linkerSiteHash32;
	k.n0 <<= 2;
	if ( ! isDelete ) k.n0 |= 0x01;
	return k;
}
// Logs every field of a linkdb "url" key at LOG_TRACE level.
//
// k is reinterpreted as a key224_t and decoded through the *_uk accessors; it
// must therefore point at sizeof(key224_t) readable bytes. The line also
// carries the raw key string, the shard number reported by getShardNum() and a
// " (delete)" suffix when KEYNEG() reports a negative key.
void Linkdb::printKey(const char *k) {
	key224_t *key = (key224_t*)k;
	char ipbuf[16];
	logf(LOG_TRACE, "k=%s "
	     "linkeesitehash32=0x%08" PRIx32" "
	     "linkeeurlhash=0x%012" PRIx64" "
	     "linkspam=%" PRId32" "
	     "siterank=%02" PRId32" "
	     "ip32=%s "
	     "docId=%012" PRIu64" "
	     "discovered=%" PRIu32" "
	     "lost=%" PRIu32" "
	     "sitehash32=0x%08" PRIx32" "
	     "shardNum=%" PRIu32" "
	     "%s",
	     // KEYSTR must be handed the key bytes themselves; taking the
	     // address of the pointer parameter (&k) would dump the pointer
	     // variable and whatever follows it in memory instead of the key.
	     KEYSTR(key, sizeof(key224_t)),
	     // hex conversions take unsigned arguments
	     static_cast<uint32_t>(Linkdb::getLinkeeSiteHash32_uk(key)),
	     static_cast<uint64_t>(Linkdb::getLinkeeUrlHash64_uk(key)),
	     static_cast<int32_t>(Linkdb::isLinkSpam_uk(key)),
	     static_cast<int32_t>(Linkdb::getLinkerSiteRank_uk(key)),
	     iptoa(static_cast<int32_t>(Linkdb::getLinkerIp_uk(key)), ipbuf),
	     static_cast<uint64_t>(Linkdb::getLinkerDocId_uk(key)),
	     static_cast<uint32_t>(Linkdb::getDiscoveryDate_uk(key)),
	     static_cast<uint32_t>(Linkdb::getLostDate_uk(key)),
	     static_cast<uint32_t>(Linkdb::getLinkerSiteHash32_uk(key)),
	     static_cast<uint32_t>(getShardNum(RDB_LINKDB, k)),
	     KEYNEG(k) ? " (delete)" : "");
}