541 lines
17 KiB
C++
541 lines
17 KiB
C++
#include "gb-include.h"
|
|
|
|
#include "Catdb.h"
|
|
#include "Categories.h"
|
|
#include "CatRec.h"
|
|
#include "Threads.h"
|
|
|
|
// use this to query delete all the banned sites in tagdb
|
|
// dsh -a '/a/gb -c /a/hosts.conf dump S main 0 -1 1 | grep =19' | awk '{print $6}' | sort | uniq | grep "\." > banned
|
|
// cat banned | awk -F'http://' '{print $2}' | grep "[A-z]" | awk '{print "http://64.62.168.52:8000/admin/reindex?c=main&cast=0&sq=site%3A"$1"&delbox=1&f=-1&srn=0&ern=2000000&sto=0&sp=7&action=OK"}' > URLS
|
|
// cat banned | awk -F'/' '{print $3}' | grep -v "[A-z]" | awk '{print "http://64.62.168.52:8000/admin/reindex?c=main&cast=0&sq=ip%3A"$1"&delbox=1&f=-1&srn=0&ern=2000000&sto=0&sp=7&action=OK"}' >> URLS
|
|
// nohup wget -i /a/URLS -O /dev/null &
|
|
|
|
// a global class extern'd in .h file
|
|
Catdb g_catdb;
|
|
|
|
// reset rdb and Xmls
|
|
void Catdb::reset() {
|
|
m_rdb.reset();
|
|
}
|
|
|
|
|
|
bool Catdb::init ( ) {
|
|
// clear our m_keys/m_bufs arrays
|
|
// memset ( m_xml , 0 , MAXNUMSITEFILES * sizeof(Xml *) );
|
|
//memset ( m_keys, 0 , MAXNUMSITEFILES * sizeof(int64_t) );
|
|
// . what's max # of tree nodes?
|
|
// . assume avg tagdb rec size (siteUrl) is about 82 bytes we get:
|
|
// . NOTE: 32 bytes of the 82 are overhead
|
|
//int32_t treeMem = g_conf.m_catdbMaxTreeMem;
|
|
// speed up gen catdb, use 15MB. later maybe once gen is complete
|
|
// we can free this tree or something...
|
|
// TODO!
|
|
int32_t treeMem = 15000000;
|
|
//int32_t treeMem = 100000000;
|
|
//int32_t maxTreeNodes = g_conf.m_catdbMaxTreeMem / 82;
|
|
int32_t maxTreeNodes = treeMem / 82;
|
|
// do not use any page cache if doing tmp cluster in order to
|
|
// prevent swapping
|
|
// int32_t pcmem = g_conf.m_catdbMaxDiskPageCacheMem;
|
|
// if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
|
|
|
|
// pcmem = 0;
|
|
// each entry in the cache is usually just a single record, no lists,
|
|
// unless a hostname has multiple sites in it. has 24 bytes more
|
|
// overhead in cache.
|
|
//int32_t maxCacheNodes = g_conf.m_tagdbMaxCacheMem / 106;
|
|
// we now use a page cache
|
|
// if ( ! m_pc.init ("catdb",RDB_CATDB,pcmem,
|
|
// GB_TFNDB_PAGE_SIZE) )
|
|
// return log("db: Catdb init failed.");
|
|
|
|
// . initialize our own internal rdb
|
|
// . i no longer use cache so changes to tagdb are instant
|
|
// . we still use page cache however, which is good enough!
|
|
//if ( this == &g_catdb )
|
|
if ( ! m_rdb.init ( g_hostdb.m_dir ,
|
|
"catdb" ,
|
|
true , // dedup same keys?
|
|
-1 , // fixed record size
|
|
//g_hostdb.m_groupMask ,
|
|
//g_hostdb.m_groupId ,
|
|
2,//g_conf.m_catdbMinFilesToMerge ,
|
|
treeMem ,//g_conf.m_catdbMaxTreeMem ,
|
|
maxTreeNodes ,
|
|
// now we balance so Sync.cpp can ordered huge list
|
|
true , // balance tree?
|
|
0 , //g_conf.m_tagdbMaxCacheMem ,
|
|
0 , //maxCacheNodes ,
|
|
false , // half keys?
|
|
false , //m_tagdbSaveCache
|
|
NULL, // &m_pc ,
|
|
false,
|
|
false,
|
|
12, // keysize
|
|
false,
|
|
true )) // is collectionless?
|
|
return false;
|
|
|
|
// normally Collectiondb.addColl() will call Rdb::addColl() which
|
|
// will init the CollectionRec::m_rdbBase, which is what
|
|
// Rdb::getBase(collnum_t) will return. however, for collectionless
|
|
// rdb databases we set Rdb::m_collectionlessBase special here.
|
|
// This was in Rdb.cpp::init().
|
|
return m_rdb.addRdbBase1 ( NULL );
|
|
}
|
|
|
|
bool Catdb::init2 ( int32_t treeMem ) {
|
|
// . what's max # of tree nodes?
|
|
// . assume avg tagdb rec size (siteUrl) is about 82 bytes we get:
|
|
// . NOTE: 32 bytes of the 82 are overhead
|
|
int32_t maxTreeNodes = 0;
|
|
|
|
return m_rdb.init ( g_hostdb.m_dir ,
|
|
"tagdbRebuild" ,
|
|
true , // dedup same keys?
|
|
-1 , // fixed record size
|
|
10 , // min to merge
|
|
treeMem ,
|
|
maxTreeNodes ,
|
|
true , // balance tree?
|
|
0 , //g_conf.m_tagdbMaxCacheMem ,
|
|
0 , //maxCacheNodes ,
|
|
false , // half keys?
|
|
false , //m_tagdbSaveCache
|
|
NULL ); //&m_pc
|
|
}
|
|
|
|
//
|
|
// end support for "cache recs"
|
|
//
|
|
|
|
/*
|
|
bool Catdb::addColl ( char *coll, bool doVerify ) {
|
|
if ( ! m_rdb.addColl ( coll ) ) return false;
|
|
// verify
|
|
return true;
|
|
if ( verify(coll) ) return true;
|
|
// if not allowing scale, return false
|
|
if ( ! g_conf.m_allowScale ) return false;
|
|
// otherwise let it go
|
|
log ( "db: Verify failed, but scaling is allowed, passing." );
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
bool Catdb::verify ( char *coll ) {
|
|
char *rdbName = "Catdb";
|
|
log ( LOG_INFO, "db: Verifying %s for coll %s...", rdbName, coll );
|
|
g_threads.disableThreads();
|
|
|
|
Msg5 msg5;
|
|
//Msg5 msg5b;
|
|
RdbList list;
|
|
key_t startKey;
|
|
key_t endKey;
|
|
startKey.setMin();
|
|
endKey.setMax();
|
|
//int32_t minRecSizes = 64000;
|
|
|
|
if ( ! msg5.getList ( RDB_CATDB ,
|
|
0,//collnum ,
|
|
&list ,
|
|
startKey ,
|
|
endKey ,
|
|
64000 , // minRecSizes ,
|
|
true , // includeTree ,
|
|
false , // add to cache?
|
|
0 , // max cache age
|
|
0 , // startFileNum ,
|
|
-1 , // numFiles ,
|
|
NULL , // state
|
|
NULL , // callback
|
|
0 , // niceness
|
|
false , // err correction?
|
|
NULL ,
|
|
0 ,
|
|
-1 ,
|
|
true ,
|
|
-1LL ,
|
|
NULL,//&msg5b ,
|
|
true )) {
|
|
g_threads.enableThreads();
|
|
return log("db: HEY! it did not block");
|
|
}
|
|
|
|
int32_t count = 0;
|
|
int32_t got = 0;
|
|
for ( list.resetListPtr() ; ! list.isExhausted() ;
|
|
list.skipCurrentRecord() ) {
|
|
key_t k = list.getCurrentKey();
|
|
count++;
|
|
//uint32_t groupId = g_catdb.getGroupId ( &k );
|
|
//uint32_t shardNum = getShardNum ( RDB_CATDB , &k );
|
|
//if ( groupId == g_hostdb.m_groupId ) got++;
|
|
uint32_t shardNum = getShardNum( RDB_CATDB , &k );
|
|
if ( shardNum == getMyShardNum() ) got++;
|
|
}
|
|
if ( got != count ) {
|
|
log ("db: Out of first %" INT32 " records in %s, only %" INT32 " belong "
|
|
"to our group.",count,rdbName,got);
|
|
// exit if NONE, we probably got the wrong data
|
|
if ( got == 0 ) log("db: Are you sure you have the "
|
|
"right "
|
|
"data in the right directory? "
|
|
"Exiting.");
|
|
log ( "db: Exiting due to %s inconsistency.", rdbName );
|
|
g_threads.enableThreads();
|
|
return g_conf.m_bypassValidation;
|
|
}
|
|
log ( LOG_INFO, "db: %s passed verification successfully for %" INT32 " recs.",
|
|
rdbName, count );
|
|
// DONE
|
|
g_threads.enableThreads();
|
|
return true;
|
|
}
|
|
|
|
void Catdb::normalizeUrl ( Url *srcUrl, Url *dstUrl ) {
|
|
char urlStr[MAX_URL_LEN];
|
|
int32_t urlStrLen = srcUrl->getUrlLen();
|
|
gbmemcpy(urlStr, srcUrl->getUrl(), urlStrLen);
|
|
// fix the url
|
|
urlStrLen = g_categories->fixUrl(urlStr, urlStrLen);
|
|
// create the normalized url
|
|
dstUrl->set(urlStr, urlStrLen, true, false, false, true);
|
|
}
|
|
|
|
// . dddddddd dddddddd dddddddd dddddddd d = domain hash w/o collection
|
|
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu u = url hash
|
|
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu
|
|
key_t Catdb::makeKey ( Url *site, bool isDelete ) {
|
|
|
|
key_t k;
|
|
// . get startKey based on "site"'s domain
|
|
// . if "site"'s domain is an ip address (non-canonical) then use ip
|
|
getKeyRange ( site->isIp() , site , &k , NULL);
|
|
// set lower 64 bits of key to hash of this url
|
|
k.n0 = hash64 ( site->getUrl() , site->getUrlLen() );
|
|
// clear low bit if we're a delete, otherwise set it
|
|
if ( isDelete ) k.n0 &= 0xfffffffffffffffeLL;
|
|
else k.n0 |= 0x0000000000000001LL;
|
|
return k;
|
|
}
|
|
|
|
// . get startKey,endKey for all SiteRecs from "url"'s domain
|
|
// . key has the following format:
|
|
// . dddddddd dddddddd dddddddd dddddddd d = domain hash w/ collection
|
|
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu u = url hash
|
|
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu
|
|
// . putting domain as first 32bits will cluster all SiteRecs from the
|
|
// same domain together on the same machine
|
|
void Catdb::getKeyRange ( bool useIp , Url *url,
|
|
key_t *startKey , key_t *endKey ) {
|
|
// log warning msg if we need to
|
|
if ( useIp && ! url->hasIp() )
|
|
log(LOG_LOGIC,"db: tagdb: getKeyRange: useIp is true, "
|
|
"but url has no ip");
|
|
// . the upper 32 bits of the key is basically hash of the domain
|
|
// . mask out the low-order byte (hi byte in little endian order)
|
|
uint32_t h;
|
|
// . make sure we use htonl() on ip domain so top byte is not zero!
|
|
// . this made all our ip-based sites stored in group #0 before
|
|
// if ( useIp ) h = htonl ( url->getIpDomain() ) ;
|
|
// . only hash first 3 bytes of ip domain to keep together w/ ip
|
|
// . if rdbid is tagdb then use hostname as key else use domain
|
|
if ( useIp ) {
|
|
// do htonl so most significant byte is first
|
|
int32_t ipdom = htonl(url->getIpDomain());
|
|
h = hash32 ( (char *)&ipdom , 3 ) ;
|
|
}
|
|
else
|
|
h = hash32 (url->getDomain(), url->getDomainLen());
|
|
|
|
// incorporate collection into "h"
|
|
//h = hash32 ( coll , collLen , h );
|
|
// now make the keys
|
|
key_t k;
|
|
// top 4 bytes is always the domain hash (ip or canonical domain)
|
|
k.n1 = h;
|
|
// don't set the low del bit for startKey
|
|
k.n0 = 0x0000000000000000LL;
|
|
// assign the startKey
|
|
if ( startKey ) *startKey = k;
|
|
// set the low del bit for startKey
|
|
k.n0 = 0xffffffffffffffffLL;
|
|
// endkey is just as simple
|
|
if ( endKey ) *endKey = k;
|
|
}
|
|
|
|
// move the current list pointer back until we hit the start of
|
|
// a valid key
|
|
char *Catdb::moveToCorrectKey ( char *listPtr,
|
|
RdbList *list,
|
|
uint32_t domainHash ) {
|
|
char *listEnd = list->getListEnd();
|
|
char *listStart = list->getList();
|
|
char *p = listPtr;
|
|
// move back from the end
|
|
if (listEnd - p < (int32_t)sizeof(key_t))
|
|
p -= sizeof(key_t);
|
|
// loop until we get it
|
|
for ( ; p > listStart; p--){
|
|
// check for the domain hash in the key
|
|
if ( ((key_t*)p)->n1 == domainHash ) {
|
|
// . verify the match
|
|
// get the current rec size and check
|
|
// the next rec for correct data
|
|
int32_t recSize = list->getRecSize(p);
|
|
char *checkp = p + recSize;
|
|
// step 1, verify the start of the next rec is good
|
|
if ( recSize >= 0 && ( checkp == listEnd ||
|
|
( checkp < listEnd &&
|
|
((key_t*)checkp)->n1 == domainHash ) ) ) {
|
|
// return if we're at end
|
|
if ( checkp == listEnd )
|
|
return p;
|
|
// step 2, verify good rec on next rec
|
|
recSize = list->getRecSize(checkp);
|
|
checkp = checkp + recSize;
|
|
if ( recSize >= 0 && ( checkp == listEnd ||
|
|
( checkp < listEnd &&
|
|
((key_t*)checkp)->n1 == domainHash )))
|
|
// good match, return it
|
|
return p;
|
|
}
|
|
}
|
|
// otherwise backup a byte
|
|
}
|
|
// we'll get here if p == listStart
|
|
return p;
|
|
}
|
|
|
|
// binary search on the given list for the given key
|
|
void Catdb::listSearch ( RdbList *list,
|
|
key_t exactKey,
|
|
char **data,
|
|
int32_t *dataSize ) {
|
|
// init the data
|
|
*data = NULL;
|
|
*dataSize = 0;
|
|
list->resetListPtr();
|
|
// for small lists, just loop through the list
|
|
if (list->getListSize() < 16*1024) {
|
|
while ( ! list->isExhausted() ) {
|
|
// for debug!
|
|
/*
|
|
CatRec crec;
|
|
crec.set ( NULL,
|
|
list->getCurrentData(),
|
|
list->getCurrentDataSize(),
|
|
false);
|
|
log("catdb: caturl=%s #catid=%" INT32 " version=%" INT32 ""
|
|
,crec.m_url
|
|
,(int32_t)crec.m_numCatids
|
|
,(int32_t)crec.m_version
|
|
);
|
|
*/
|
|
// check the current key
|
|
if ( list->getCurrentKey() != exactKey ) {
|
|
// miss, next
|
|
list->skipCurrentRecord();
|
|
continue;
|
|
}
|
|
// get site from this rec
|
|
*data = list->getCurrentData();
|
|
*dataSize = list->getCurrentDataSize();
|
|
break;
|
|
}
|
|
}
|
|
// otherwise do a binary search on the large lists
|
|
else {
|
|
// init the low and high
|
|
char *low = list->getList();
|
|
char *high = list->getListEnd();
|
|
// move the high ptr to the start of last rec
|
|
high = moveToCorrectKey(high, list, exactKey.n1);
|
|
// binary search
|
|
char *currRec;
|
|
while ( low <= high ) {
|
|
// next check spot
|
|
int32_t delta = high - low;
|
|
currRec = low + (delta / 2);
|
|
//currRec = (char*)(((uint64_t)low +
|
|
// (uint64_t)high)/2);
|
|
// do correction
|
|
currRec = moveToCorrectKey( currRec,
|
|
list,
|
|
exactKey.n1 );
|
|
// check for hit
|
|
if (list->getKey(currRec) == exactKey) {
|
|
// hit, save it and get out
|
|
*data = list->getData(currRec);
|
|
*dataSize = list->getDataSize(currRec);
|
|
break;
|
|
}
|
|
else if (list->getKey(currRec) > exactKey) {
|
|
// move high to currRec - one rec
|
|
high = moveToCorrectKey ( currRec - 1,
|
|
list,
|
|
exactKey.n1 );
|
|
}
|
|
else {
|
|
// move low to currRec + one rec
|
|
low = currRec + list->getRecSize(currRec);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// now given an RdbList of SiteRecs can we find the best matching rec
|
|
// for our site?
|
|
char *Catdb::getRec ( RdbList *list , Url *url , int32_t *recSize,
|
|
char* coll, int32_t collLen ) {
|
|
key_t exactKey;
|
|
int64_t startTime = gettimeofdayInMilliseconds();
|
|
int64_t took;
|
|
char *data;
|
|
int32_t dataSize;
|
|
// for now, only get exact hits for catdb
|
|
// check for an exact key/url match
|
|
exactKey = makeKey(url, false);
|
|
// go throught the list looking for the exact key
|
|
//list->resetListPtr();
|
|
data = NULL;
|
|
dataSize = 0;
|
|
// call the search
|
|
listSearch ( list, exactKey, &data, &dataSize );
|
|
// make sure the url matches
|
|
if (data && dataSize > 0) {
|
|
// get the url
|
|
/*
|
|
char *x;
|
|
int32_t xlen;
|
|
// hit, check the url
|
|
// for catdb, skip over the catids
|
|
if (m_rdbid == RDB_CATDB) {
|
|
unsigned char numCatids = *data;
|
|
// . point to stored url/site
|
|
// . skip dataSize/fileNum
|
|
int32_t skip = 1 + (4 * numCatids) + 4;
|
|
x = data + skip;
|
|
xlen = dataSize - skip;
|
|
}
|
|
else {
|
|
// . point to stored url/site
|
|
// . skip dataSize/fileNum
|
|
x = data + 4;
|
|
xlen = dataSize - 4;
|
|
}
|
|
// set the site
|
|
Url site;
|
|
site.set ( x , xlen , false );
|
|
*/
|
|
CatRec site;
|
|
site.set (url, data, dataSize, false);
|
|
// check for an exact match against the full url
|
|
int32_t uflen = url->getUrlLen();
|
|
char *ufull = url->getUrl();
|
|
//int32_t sflen = site.getUrlLen();
|
|
//char *sfull = site.getUrl();
|
|
int32_t sflen = site.m_urlLen;
|
|
char *sfull = site.m_url;
|
|
// if we match, return this rec
|
|
if ( sflen == uflen &&
|
|
strncmp ( sfull, ufull, sflen ) == 0 ) {
|
|
*recSize = dataSize;
|
|
}
|
|
else {
|
|
*recSize = 0;
|
|
data = NULL;
|
|
}
|
|
}
|
|
else {
|
|
*recSize = 0;
|
|
data = NULL;
|
|
}
|
|
took = gettimeofdayInMilliseconds() - startTime;
|
|
if ( took > 10 )
|
|
log(LOG_INFO, "catdb: catdb lookup took %" INT64 " ms, "
|
|
"listSize=%" INT32 "", took, list->getListSize() );
|
|
return data;
|
|
}
|
|
|
|
// . find the indirect matches in the list which match a sub path
|
|
// of the url
|
|
int32_t Catdb::getIndirectMatches ( RdbList *list ,
|
|
Url *url ,
|
|
char **matchRecs ,
|
|
int32_t *matchRecSizes ,
|
|
int32_t maxMatches ,
|
|
char *coll,
|
|
int32_t collLen) {
|
|
char path[MAX_URL_LEN+1];
|
|
int32_t pathLen;
|
|
Url partialUrl;
|
|
key_t partialUrlKey;
|
|
// start with the whole url...include real catid in indirect
|
|
gbmemcpy(path, url->getUrl(), url->getUrlLen());
|
|
pathLen = url->getUrlLen();
|
|
// loop looking for partial matches
|
|
char *data = NULL;
|
|
int32_t dataSize = 0;
|
|
int32_t numMatches = 0;
|
|
while ( numMatches < maxMatches ) {
|
|
// make the partial url
|
|
partialUrl.set(path, pathLen, true);
|
|
normalizeUrl(&partialUrl, &partialUrl);
|
|
// make the next key
|
|
partialUrlKey = makeKey ( &partialUrl, false );
|
|
// search for it
|
|
listSearch ( list, partialUrlKey, &data, &dataSize );
|
|
// store a hit
|
|
if ( data && dataSize > 0 ) {
|
|
// get the url
|
|
char *x;
|
|
int32_t xlen;
|
|
// hit, check the url
|
|
// for catdb, skip over the catids
|
|
/*
|
|
unsigned char numCatids = *data;
|
|
// . point to stored url/site
|
|
// . skip dataSize/fileNum
|
|
int32_t skip = 1 + (4 * numCatids) + 4;
|
|
x = data + skip;
|
|
xlen = dataSize - skip;
|
|
*/
|
|
CatRec sr;
|
|
sr.set ( url, data, dataSize, false);
|
|
x = sr.m_url;
|
|
xlen = sr.m_urlLen;
|
|
// ensure it's a sub-path
|
|
if ( xlen <= url->getUrlLen() &&
|
|
strncasecmp(x, url->getUrl(), xlen) == 0 ) {
|
|
//char msg[4096];
|
|
//char *mp = msg;
|
|
//mp += sprintf(mp, "For ");
|
|
//gbmemcpy(mp, url->getUrl(), url->getUrlLen());
|
|
//mp += url->getUrlLen();
|
|
//mp += sprintf(mp, " , got Indirect: ");
|
|
//gbmemcpy(mp, x, xlen);
|
|
//mp += xlen;
|
|
//*mp = '\0';
|
|
//log ( LOG_INFO, "tagdb: %s", msg );
|
|
matchRecs [numMatches] = data;
|
|
matchRecSizes[numMatches] = dataSize;
|
|
numMatches++;
|
|
}
|
|
}
|
|
// make the next partial url
|
|
pathLen--;
|
|
while ( pathLen > 3 && path[pathLen-1] != '/' )
|
|
pathLen--;
|
|
// check for end
|
|
if ( pathLen <= 3 || strncmp(&path[pathLen-3], "://", 3) == 0 )
|
|
break;
|
|
// chop off the trailing /
|
|
pathLen--;
|
|
}
|
|
return numMatches;
|
|
}
|