Merge branch 'master' into nomerge2

Conflicts:
	Msg40.cpp
	Tagdb.cpp
This commit is contained in:
Ivan Skytte Jørgensen
2016-09-15 13:37:50 +02:00
51 changed files with 1679 additions and 1737 deletions

@ -28,9 +28,6 @@
#define GB_CLUSTERDB_H
#include "Rdb.h"
#include "Url.h"
#include "Conf.h"
#include "Titledb.h"
// these are now just TitleRec keys
#define CLUSTER_REC_SIZE (sizeof(key96_t))
@ -69,23 +66,19 @@ public:
false, true ); }
// NOTE: THESE NOW USE THE REAL CLUSTERDB REC
// // docId occupies the most significant bytes of the key
// docId occupies the most significant bytes of the key
// now docId occupies the bits after the first 23
static int64_t getDocId ( const void *k ) {
//int64_t docId = (k.n0) >> (32+24);
//docId |= ( ((uint64_t)(k.n1)) << 8 );
int64_t docId = (((const key96_t *)k)->n0) >> 35;
docId |= ( ((uint64_t)(((const key96_t *)k)->n1)) << 29 );
return docId;
}
static uint32_t getSiteHash26 ( const char *r ) {
//return g_titledb.getSiteHash ( (key_t *)r ); }
return ((uint32_t)(((const key96_t*)r)->n0 >> 2) & 0x03FFFFFF);
}
static uint32_t hasAdultContent ( const char *r ) {
//return g_titledb.hasAdultContent ( *(key_t *)r ); }
return ((uint32_t)(((const key96_t*)r)->n0 >> 34) & 0x00000001);
}

@ -268,7 +268,7 @@ void DailyMerge::dailyMergeLoop ( ) {
// ok, all trees are clear and dumped
m_mergeMode = 5;
// log it
log("daily: Merging indexdb and datedb files.");
log("daily: Merging indexdb files.");
}
// start the merge

@ -1,6 +1,6 @@
// Copyright Gigablast, Inc. Apr 2008
// tight merge indexdb and datedb at the given time every day
// tight merge indexdb at the given time every day
#ifndef GB_DAILYMERGE_H
#define GB_DAILYMERGE_H

@ -81,7 +81,6 @@ void HashTableX::reset ( ) {
m_flags = NULL;
m_numSlots = 0;
m_numSlotsUsed = 0;
m_addIffNotUnique = false;
m_maskKeyOffset = 0;
//m_useKeyMagic = false;
// we should free it in reset()
@ -621,3 +620,17 @@ int32_t HashTableX::getKeyChecksum32 () const {
}
return checksum;
}
// for debugging: log the key of every non-empty slot via logf() at LOG_WARN
void HashTableX::print() {
	for (int32_t slot = 0; slot < m_numSlots; ++slot) {
		// a zero flag marks an empty bucket; only occupied ones are logged
		if (m_flags[slot]) {
			// fetch the key stored in this slot and log it as text
			char *key = (char *)getKeyFromSlot(slot);
			logf(LOG_WARN, "key=%s", KEYSTR(key, m_ks));
		}
	}
}

@ -298,6 +298,9 @@ class HashTableX {
bool setTableSize ( int32_t numSlots , char *buf , int32_t bufSize );
// for debugging
void print();
void disableWrites () { m_isWritable = false; }
void enableWrites () { m_isWritable = true ; }
bool m_isWritable;
@ -318,18 +321,15 @@ class HashTableX {
int32_t m_numSlotsUsed;
uint32_t m_mask;
char m_doFree;
bool m_doFree;
char *m_buf;
int32_t m_bufSize;
char m_useKeyMagic;
bool m_useKeyMagic;
int32_t m_ks;
int32_t m_ds;
char m_allowDups;
// a flag used by XmlDoc.cpp
bool m_addIffNotUnique;
bool m_allowDups;
bool m_isSaving;
bool m_needsSave;

@ -1571,7 +1571,7 @@ uint32_t Hostdb::getShardNum(rdbid_t rdbId, const void *k) {
return m_map [(*(uint16_t *)((char *)k + 26))>>3];
}
else if ( rdbId == RDB_TITLEDB || rdbId == RDB2_TITLEDB2 ) {
uint64_t d = g_titledb.getDocId ( (key96_t *)k );
uint64_t d = Titledb::getDocId ( (key96_t *)k );
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
}
else if ( rdbId == RDB_SPIDERDB || rdbId == RDB2_SPIDERDB2 ) {

@ -536,9 +536,9 @@ bool getLinkInfo ( SafeBuf *reqBuf ,
//int32_t siteHash32 = hash32n ( req->ptr_site );
// access different parts of linkdb depending on the "mode"
if ( req->m_mode == MODE_SITELINKINFO )
startKey = g_linkdb.makeStartKey_uk ( req->m_siteHash32 );
startKey = Linkdb::makeStartKey_uk ( req->m_siteHash32 );
else
startKey = g_linkdb.makeStartKey_uk (req->m_siteHash32,
startKey = Linkdb::makeStartKey_uk (req->m_siteHash32,
req->m_linkHash64 );
// what group has this linkdb list?
uint32_t shardNum = getShardNum ( RDB_LINKDB, &startKey );
@ -999,14 +999,14 @@ bool Msg25::doReadLoop ( ) {
// access different parts of linkdb depending on the "mode"
if ( m_mode == MODE_SITELINKINFO ) {
startKey = g_linkdb.makeStartKey_uk ( siteHash32 );
endKey = g_linkdb.makeEndKey_uk ( siteHash32 );
startKey = Linkdb::makeStartKey_uk ( siteHash32 );
endKey = Linkdb::makeEndKey_uk ( siteHash32 );
//log("linkdb: getlinkinfo: "
// "site=%s sitehash32=%" PRIu32,site,siteHash32);
}
else {
startKey = g_linkdb.makeStartKey_uk (siteHash32,m_linkHash64 );
endKey = g_linkdb.makeEndKey_uk (siteHash32,m_linkHash64 );
startKey = Linkdb::makeStartKey_uk (siteHash32,m_linkHash64 );
endKey = Linkdb::makeEndKey_uk (siteHash32,m_linkHash64 );
}
// resume from where we left off?
@ -1329,13 +1329,13 @@ bool Msg25::sendRequests ( ) {
// get the current key if list has more left
key224_t key; m_list.getCurrentKey( &key );
itop = g_linkdb.getLinkerIp24_uk ( &key );
ip32 = g_linkdb.getLinkerIp_uk ( &key );
isLinkSpam = g_linkdb.isLinkSpam_uk ( &key );
docId = g_linkdb.getLinkerDocId_uk ( &key );
discovered = g_linkdb.getDiscoveryDate_uk(&key);
itop = Linkdb::getLinkerIp24_uk ( &key );
ip32 = Linkdb::getLinkerIp_uk ( &key );
isLinkSpam = Linkdb::isLinkSpam_uk ( &key );
docId = Linkdb::getLinkerDocId_uk ( &key );
discovered = Linkdb::getDiscoveryDate_uk(&key);
// is it expired?
lostDate = g_linkdb.getLostDate_uk(&key);
lostDate = Linkdb::getLostDate_uk(&key);
// update this
gbmemcpy ( &m_nextKey , &key , LDBKS );
@ -1347,15 +1347,15 @@ bool Msg25::sendRequests ( ) {
// get the current key if list has more left
key224_t key; m_list.getCurrentKey( &key );
itop = g_linkdb.getLinkerIp24_uk ( &key );
ip32 = g_linkdb.getLinkerIp_uk ( &key );
itop = Linkdb::getLinkerIp24_uk ( &key );
ip32 = Linkdb::getLinkerIp_uk ( &key );
isLinkSpam = false;
docId = g_linkdb.getLinkerDocId_uk ( &key );
docId = Linkdb::getLinkerDocId_uk ( &key );
discovered = g_linkdb.getDiscoveryDate_uk(&key);
discovered = Linkdb::getDiscoveryDate_uk(&key);
// is it expired?
lostDate = g_linkdb.getLostDate_uk(&key);
lostDate = Linkdb::getLostDate_uk(&key);
// update this
gbmemcpy ( &m_nextKey , &key , LDBKS );

@ -159,16 +159,18 @@ bool getLinkInfo ( SafeBuf *reqBuf , // store msg25 request in here
int32_t getSiteRank ( int32_t sni ) ;
class Linkdb {
public:
public:
void reset();
bool init ( );
bool init2 ( int32_t treeMem );
bool verify ( char *coll );
bool addColl ( char *coll, bool doVerify = true );
bool init();
bool init2(int32_t treeMem);
bool verify(char *coll);
Rdb *getRdb() { return &m_rdb; }
// this makes a "url" key
key224_t makeKey_uk ( uint32_t linkeeSiteHash32 ,
static key224_t makeKey_uk ( uint32_t linkeeSiteHash32 ,
uint64_t linkeeUrlHash64 ,
bool isLinkSpam ,
unsigned char linkerSiteRank , // 0-15 i guess
@ -182,7 +184,7 @@ class Linkdb {
bool isDelete );
key224_t makeStartKey_uk ( uint32_t linkeeSiteHash32 ,
static key224_t makeStartKey_uk ( uint32_t linkeeSiteHash32 ,
uint64_t linkeeUrlHash64 = 0LL ) {
return makeKey_uk ( linkeeSiteHash32,
linkeeUrlHash64,
@ -198,7 +200,7 @@ class Linkdb {
true); // is delete?
}
key224_t makeEndKey_uk ( uint32_t linkeeSiteHash32 ,
static key224_t makeEndKey_uk ( uint32_t linkeeSiteHash32 ,
uint64_t linkeeUrlHash64 =
0xffffffffffffffffLL ) {
return makeKey_uk ( linkeeSiteHash32,
@ -219,10 +221,11 @@ class Linkdb {
// accessors for "url" keys in linkdb
//
uint32_t getLinkeeSiteHash32_uk ( key224_t *key ) {
return (key->n3) >> 32; }
static uint32_t getLinkeeSiteHash32_uk ( key224_t *key ) {
return (key->n3) >> 32;
}
uint64_t getLinkeeUrlHash64_uk ( key224_t *key ) {
static uint64_t getLinkeeUrlHash64_uk ( key224_t *key ) {
uint64_t h = key->n3;
h &= 0x00000000ffffffffLL;
h <<= 15;
@ -230,19 +233,19 @@ class Linkdb {
return h;
}
char isLinkSpam_uk (key224_t *key ) {
static char isLinkSpam_uk (key224_t *key ) {
if ((key->n2) & 0x1000000000000LL) return true;
return false;
}
unsigned char getLinkerSiteRank_uk ( key224_t *k ) {
static unsigned char getLinkerSiteRank_uk ( key224_t *k ) {
unsigned char rank = (k->n2 >> 40) & 0xff;
// complement it back
rank = (unsigned char)~rank;//LDB_MAXSITERANK - rank;
return rank;
}
int32_t getLinkerIp_uk ( key224_t *k ) {
static int32_t getLinkerIp_uk ( key224_t *k ) {
uint32_t ip ;
// the most significant part of the ip is the lower byte!!!
ip = (uint32_t)((k->n2>>8)&0x00ffffff);
@ -250,7 +253,7 @@ class Linkdb {
return ip;
}
void setIp32_uk ( void *k , uint32_t ip ) {
static void setIp32_uk ( void *k , uint32_t ip ) {
char *ips = (char *)&ip;
char *ks = (char *)k;
ks[16] = ips[3];
@ -261,11 +264,11 @@ class Linkdb {
// we are missing the lower byte, it will be zero
int32_t getLinkerIp24_uk ( key224_t *k ) {
static int32_t getLinkerIp24_uk ( key224_t *k ) {
return (int32_t)((k->n2>>8)&0x00ffffff);
}
int64_t getLinkerDocId_uk( key224_t *k ) {
static int64_t getLinkerDocId_uk( key224_t *k ) {
uint64_t d = k->n2 & 0xff;
d <<= 30;
d |= k->n1 >>34;
@ -274,7 +277,7 @@ class Linkdb {
// . in days since jan 1, 2012 utc
// . timestamp of jan 1, 2012 utc is 1325376000
int32_t getDiscoveryDate_uk ( void *k ) {
static int32_t getDiscoveryDate_uk ( void *k ) {
uint32_t date = ((key224_t *)k)->n1 >> 18;
date &= 0x00003fff;
// if 0 return that
@ -289,7 +292,7 @@ class Linkdb {
// . in days since jan 1, 2012 utc
// . timestamp of jan 1, 2012 utc is 1325376000
void setDiscoveryDate_uk ( void *k , int32_t date ) {
static void setDiscoveryDate_uk ( void *k , int32_t date ) {
// subtract jan 1 2012
date -= LINKDBEPOCH;
// convert into days
@ -302,7 +305,7 @@ class Linkdb {
((key224_t *)k)->n1 |= ((uint64_t)date) << 18;
}
int32_t getLostDate_uk ( void *k ) {
static int32_t getLostDate_uk ( void *k ) {
uint32_t date = ((key224_t *)k)->n1 >> 2;
date &= 0x00003fff;
// if 0 return that
@ -317,7 +320,7 @@ class Linkdb {
// . in days since jan 1, 2012 utc
// . timestamp of jan 1, 2012 utc is 1325376000
void setLostDate_uk ( void *k , int32_t date ) {
static void setLostDate_uk ( void *k , int32_t date ) {
// subtract jan 1 2012
date -= LINKDBEPOCH;
// convert into days
@ -330,18 +333,15 @@ class Linkdb {
((key224_t *)k)->n1 |= ((uint64_t)date) << 2;
}
uint32_t getLinkerSiteHash32_uk( void *k ) {
static uint32_t getLinkerSiteHash32_uk( void *k ) {
uint32_t sh32 = ((key224_t *)k)->n1 & 0x00000003;
sh32 <<= 30;
sh32 |= ((key224_t *)k)->n0 >> 2;
return sh32;
}
Rdb *getRdb() { return &m_rdb; }
private:
Rdb m_rdb;
private:
Rdb m_rdb;
};
extern class Linkdb g_linkdb;

@ -208,6 +208,7 @@ bool Loop::registerSleepCallback ( int32_t tick, void *state, void (* callback)(
return false;
}
ScopedLock sl(m_slotMutex);
if ( tick < m_minTick ) {
m_minTick = tick;
}

@ -775,7 +775,7 @@ void gotListWrapper ( void *state , RdbList *listb , Msg5 *msg5xx ) {
totalOrigLinks++;
// get rec
char *rec = list->getCurrentRec();
int32_t ip32 = g_linkdb.getLinkerIp_uk((key224_t *)rec );
int32_t ip32 = Linkdb::getLinkerIp_uk((key224_t *)rec );
// same as one before?
if ( ip32 == lastIp32 &&
// are we the last rec? include that for

@ -141,7 +141,7 @@ bool Msg20::getSummary ( Msg20Request *req ) {
if ( req->m_docId >= 0 )
shardNum = g_hostdb.getShardNumFromDocId(req->m_docId);
else {
int64_t pdocId = g_titledb.getProbableDocId(req->ptr_ubuf);
int64_t pdocId = Titledb::getProbableDocId(req->ptr_ubuf);
shardNum = getShardNumFromDocId(pdocId);
}
@ -193,7 +193,7 @@ bool Msg20::getSummary ( Msg20Request *req ) {
int64_t probDocId = req->m_docId;
// i think reference pages just pass in a url to get the summary
if ( probDocId < 0 && req->size_ubuf )
probDocId = g_titledb.getProbableDocId ( req->ptr_ubuf );
probDocId = Titledb::getProbableDocId ( req->ptr_ubuf );
if ( probDocId < 0 ) {
log("query: Got bad docid/url combo.");
probDocId = 0;
@ -363,7 +363,7 @@ static void handleRequest20(UdpSlot *slot, int32_t netnice) {
log(LOG_DEBUG, "query: Summary cache miss");
// if it's not stored locally that's an error
if ( req->m_docId >= 0 && ! g_titledb.isLocal ( req->m_docId ) ) {
if ( req->m_docId >= 0 && ! Titledb::isLocal ( req->m_docId ) ) {
log(LOG_WARN, "query: Got msg20 request for non-local docId %" PRId64, req->m_docId);
log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
g_udpServer.sendErrorReply ( slot , ENOTLOCAL );

@ -134,7 +134,7 @@ bool Msg22::getTitleRec ( Msg22Request *r ,
// if no docid provided, use probable docid
if ( ! docId )
docId = g_titledb.getProbableDocId ( url );
docId = Titledb::getProbableDocId ( url );
// get groupId from docId
uint32_t shardNum = getShardNumFromDocId ( docId );
@ -359,8 +359,8 @@ void handleRequest22 ( UdpSlot *slot , int32_t netnice ) {
// so try the range
if ( r->m_getAvailDocIdOnly ) {
int64_t pd = r->m_docId;
int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
int64_t d2 = g_titledb.getLastProbableDocId ( pd );
int64_t d1 = Titledb::getFirstProbableDocId ( pd );
int64_t d2 = Titledb::getLastProbableDocId ( pd );
// sanity - bad url with bad subdomain?
if ( pd < d1 || pd > d2 ) { g_process.shutdownAbort(true); }
// make sure we get a decent sample in titledb then in
@ -388,9 +388,9 @@ void handleRequest22 ( UdpSlot *slot , int32_t netnice ) {
delete ( st );
return;
}
int64_t pd = g_titledb.getProbableDocId (r->m_url,dom,dlen);
int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
int64_t d2 = g_titledb.getLastProbableDocId ( pd );
int64_t pd = Titledb::getProbableDocId (r->m_url,dom,dlen);
int64_t d1 = Titledb::getFirstProbableDocId ( pd );
int64_t d2 = Titledb::getLastProbableDocId ( pd );
// sanity - bad url with bad subdomain?
if ( pd < d1 || pd > d2 ) { g_process.shutdownAbort(true); }
// store these
@ -406,8 +406,8 @@ void handleRequest22 ( UdpSlot *slot , int32_t netnice ) {
// since it would base it on startFileNum and numFiles
key96_t cacheKey ; cacheKey.n1 = 0; cacheKey.n0 = r->m_docId;
// make titledb keys
key96_t startKey = g_titledb.makeFirstKey ( st->m_docId1 );
key96_t endKey = g_titledb.makeLastKey ( st->m_docId2 );
key96_t startKey = Titledb::makeFirstKey ( st->m_docId1 );
key96_t endKey = Titledb::makeLastKey ( st->m_docId2 );
// . load the list of title recs from disk now
// . our file range should be solid
@ -468,7 +468,7 @@ void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {
// set probable docid
int64_t pd = 0LL;
if ( r->m_url[0] ) {
pd = g_titledb.getProbableDocId(r->m_url);
pd = Titledb::getProbableDocId(r->m_url);
if ( pd != st->m_pd ) {
log("db: crap probable docids do not match! u=%s",
r->m_url);
@ -500,7 +500,7 @@ void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {
if ( ( k->n0 & 0x01 ) == 0x00 ) continue;
// get docid of that titlerec
int64_t dd = g_titledb.getDocId(k);
int64_t dd = Titledb::getDocId(k);
if ( r->m_getAvailDocIdOnly ) {
// make sure our available docids are availble!
@ -511,7 +511,7 @@ void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {
// if we had a url make sure uh48 matches
else if ( r->m_url[0] ) {
// get it
int64_t uh48 = g_titledb.getUrlHash48(k);
int64_t uh48 = Titledb::getUrlHash48(k);
// make sure our available docids are availble!
if ( dd == ad1 ) ad1++;

@ -639,7 +639,7 @@ bool Msg3a::gotAllShardReplies ( ) {
j ,
i ,
docIds [j] ,
(int32_t)g_titledb.getDomHash8FromDocId(docIds[j]),
(int32_t)Titledb::getDomHash8FromDocId(docIds[j]),
scores[j] );
}
}

156
Msg4.cpp

@ -65,22 +65,15 @@ static Msg4 *s_msg4Tail = NULL;
// . also, need to update spiderdb rec for the url in Msg14 using Msg4 too!
// . need to add support for passing in array of lists for Msg14
static bool addMetaList ( const char *p , class UdpSlot *slot = NULL );
static void gotReplyWrapper4 ( void *state , void *state2 ) ;
static void storeLineWaiters ( ) ;
static void handleRequest4 ( UdpSlot *slot , int32_t niceness ) ;
static void sleepCallback4 ( int bogusfd , void *state ) ;
static bool sendBuffer ( int32_t hostId , int32_t niceness ) ;
static Multicast *getMulticast ( ) ;
static void returnMulticast ( Multicast *mcast ) ;
static bool storeRec ( collnum_t collnum ,
char rdbId ,
uint32_t gid ,
int32_t hostId ,
const char *rec ,
int32_t recSize ,
int32_t niceness ) ;
static bool addMetaList(const char *p, class UdpSlot *slot = NULL);
static void gotReplyWrapper4(void *state, void *state2);
static void handleRequest4(UdpSlot *slot, int32_t niceness);
static void sleepCallback4(int bogusfd, void *state);
static void flushLocal();
static bool sendBuffer(int32_t hostId);
static Multicast *getMulticast();
static void returnMulticast(Multicast *mcast);
static bool storeRec(collnum_t collnum, char rdbId, uint32_t gid, int32_t hostId, const char *rec, int32_t recSize);
// all these parameters should be preset
bool Msg4::registerHandler() {
@ -128,9 +121,6 @@ bool Msg4::registerHandler() {
return rc;
}
static void flushLocal ( ) ;
// scan all host bufs and try to send on them
void sleepCallback4 ( int bogusfd , void *state ) {
// wait for clock to be in sync
@ -145,7 +135,7 @@ void flushLocal ( ) {
//storeLineWaiters();
// now try to send the buffers
for ( int32_t i = 0 ; i < s_numHostBufs ; i++ )
sendBuffer ( i , MAX_NICENESS );
sendBuffer ( i );
g_errno = 0;
}
@ -183,12 +173,12 @@ bool hasAddsInQueue ( ) {
}
bool Msg4::addMetaList ( SafeBuf *sb, collnum_t collnum, void *state, void (* callback)(void *state),
int32_t niceness, char rdbId, int32_t shardOverride ) {
return addMetaList ( sb->getBufStart(), sb->length(), collnum, state, callback, niceness, rdbId, shardOverride );
rdbid_t rdbId, int32_t shardOverride ) {
return addMetaList ( sb->getBufStart(), sb->length(), collnum, state, callback, rdbId, shardOverride );
}
bool Msg4::addMetaList ( const char *metaList, int32_t metaListSize, collnum_t collnum, void *state,
void (* callback)(void *state), int32_t niceness, char rdbId,
void (* callback)(void *state), rdbid_t rdbId,
// Rebalance.cpp needs to add negative keys to
// remove foreign records from where they no
// longer belong because of a new hosts.conf file.
@ -212,7 +202,6 @@ bool Msg4::addMetaList ( const char *metaList, int32_t metaListSize, collnum_t c
m_state = state;
m_callback = callback;
m_rdbId = rdbId;
m_niceness = niceness;
m_next = NULL;
m_shardOverride = shardOverride;
@ -279,7 +268,7 @@ bool Msg4::addMetaList ( const char *metaList, int32_t metaListSize, collnum_t c
return false;
}
bool isInMsg4LinkedList ( Msg4 *msg4 ) {
bool Msg4::isInLinkedList ( Msg4 *msg4 ) {
Msg4 *m = s_msg4Head;
for ( ; m ; m = m->m_next )
if ( m == msg4 ) return true;
@ -300,12 +289,10 @@ bool Msg4::addMetaList2 ( ) {
// store each record in the list into the send buffers
for ( ; p < pend ; ) {
// first is rdbId
char rdbId = m_rdbId;
if ( rdbId < 0 ) rdbId = *p++;
// mask off rdbId
rdbId &= 0x7f;
logTrace( g_conf.m_logTraceMsg4, " rdbId: %02x", rdbId);
rdbid_t rdbId = m_rdbId;
if ( rdbId == RDB_NONE ) {
rdbId = (rdbid_t)(*p++ & 0x7f);
}
// get the key of the current record
const char *key = p;
@ -313,12 +300,8 @@ bool Msg4::addMetaList2 ( ) {
// get the key size. a table lookup in Rdb.cpp.
int32_t ks = getKeySizeFromRdbId ( rdbId );
logTrace( g_conf.m_logTraceMsg4, " Key: %s", KEYSTR(key, ks) );
logTrace( g_conf.m_logTraceMsg4, " Key size: %" PRId32, ks);
// negative key?
bool del = !( *key & 0x01 );
logTrace( g_conf.m_logTraceMsg4, " Negative key: %s", del?"true":"false");
// skip key
p += ks;
@ -330,16 +313,12 @@ bool Msg4::addMetaList2 ( ) {
if ( m_shardOverride >= 0 ) {
shardNum = m_shardOverride;
}
logTrace( g_conf.m_logTraceMsg4, " shardNum: %" PRId32, shardNum);
// get the record, is -1 if variable. a table lookup.
// . negative keys have no data
// . this unfortunately is not true according to RdbList.cpp
int32_t dataSize = del ? 0 : getDataSizeFromRdbId ( rdbId );
logTrace( g_conf.m_logTraceMsg4, " dataSize: %" PRId32, dataSize);
// if variable read that in
if ( dataSize == -1 ) {
// -1 means to read it in
@ -349,8 +328,6 @@ bool Msg4::addMetaList2 ( ) {
// skip dataSize
p += 4;
logTrace( g_conf.m_logTraceMsg4, " dataSize: %" PRId32" (variable size read)", dataSize);
}
// skip over the data, if any
@ -358,18 +335,15 @@ bool Msg4::addMetaList2 ( ) {
// breach us?
if ( p > pend ) { g_process.shutdownAbort(true); }
// i fixed UdpServer.cpp to NOT call msg4 handlers when in
// a quickpoll, in case we receive a niceness 0 msg4 request
QUICKPOLL(m_niceness);
// convert the gid to the hostid of the first host in this
// group. uses a quick hash table.
Host *hosts = g_hostdb.getShard ( shardNum );
int32_t hostId = hosts[0].m_hostId;
logTrace( g_conf.m_logTraceMsg4, " hostId: %" PRId32, hostId);
// one consolidated trace line per record; the argument order must follow the
// format string, which prints dataSize before shardNum
logTrace(g_conf.m_logTraceMsg4, " rdb=%s key=%s keySize=%" PRId32" isDel=%d dataSize=%" PRId32" shardNum=%" PRId32" hostId=%" PRId32,
getDbnameFromId(rdbId), KEYSTR(key, ks), ks, del, dataSize, shardNum, hostId);
// . add that rec to this groupId, gid, includes the key
// . these are NOT allowed to be compressed (half bit set)
// and this point
@ -377,7 +351,7 @@ bool Msg4::addMetaList2 ( ) {
#ifdef _VALGRIND_
VALGRIND_CHECK_MEM_IS_DEFINED(key,p-key);
#endif
if ( storeRec ( m_collnum, rdbId, shardNum, hostId, key, p - key, m_niceness )) {
if ( storeRec ( m_collnum, rdbId, shardNum, hostId, key, p - key )) {
// . point to next record
// . will point past records if no more left!
m_currentPtr = p;
@ -416,8 +390,7 @@ bool storeRec ( collnum_t collnum ,
uint32_t shardNum,
int32_t hostId ,
const char *rec ,
int32_t recSize ,
int32_t niceness ) {
int32_t recSize ) {
#ifdef _VALGRIND_
VALGRIND_CHECK_MEM_IS_DEFINED(&collnum,sizeof(collnum));
VALGRIND_CHECK_MEM_IS_DEFINED(&rdbId,sizeof(rdbId));
@ -491,7 +464,7 @@ bool storeRec ( collnum_t collnum ,
// will he be able to proceed. we will call his callback
// as soon as we can copy... use this->m_msg1 to add the
// list that was passed in...
if ( ! sendBuffer ( hostId , niceness ) ) return false;
if ( ! sendBuffer ( hostId ) ) return false;
// now the buffer should be empty, try again
goto retry;
}
@ -515,7 +488,7 @@ bool storeRec ( collnum_t collnum ,
// . returns false if we were UNable to get a multicast to launch the buffer,
// true otherwise
// . returns false and sets g_errno on error
bool sendBuffer ( int32_t hostId , int32_t niceness ) {
bool sendBuffer ( int32_t hostId ) {
//logf(LOG_DEBUG,"build: sending buf");
// how many bytes of the buffer are occupied or "in use"?
char *buf = s_hostBufs [hostId];
@ -673,10 +646,10 @@ void gotReplyWrapper4 ( void *state , void *state2 ) {
returnMulticast(mcast);
storeLineWaiters(); // try to launch more msg4 requests in waiting
Msg4::storeLineWaiters(); // try to launch more msg4 requests in waiting
}
void storeLineWaiters ( ) {
void Msg4::storeLineWaiters ( ) {
// try to store all the msg4's lists that are waiting in line
for (;;) {
Msg4 *msg4 = s_msg4Head;
@ -731,17 +704,14 @@ void storeLineWaiters ( ) {
void handleRequest4 ( UdpSlot *slot , int32_t netnice ) {
logTrace( g_conf.m_logTraceMsg4, "BEGIN" );
// easy var
UdpServer *us = &g_udpServer;
// if we just came up we need to make sure our hosts.conf is in
// sync with everyone else before accepting this! it might have
// been the case that the sender thinks our hosts.conf is the same
// since last time we were up, so it is up to us to check this
if ( g_pingServer.m_hostsConfInDisagreement ) {
g_errno = EBADHOSTSCONF;
log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
us->sendErrorReply ( slot , g_errno );
logError("call sendErrorReply");
g_udpServer.sendErrorReply ( slot , g_errno );
log(LOG_WARN,"%s:%s: END - hostsConfInDisagreement", __FILE__, __func__ );
return;
@ -753,8 +723,8 @@ void handleRequest4 ( UdpSlot *slot , int32_t netnice ) {
// . this is 0 if not received yet
if (!slot->m_host->m_pingInfo.m_hostsConfCRC) {
g_errno = EWAITINGTOSYNCHOSTSCONF;
log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
us->sendErrorReply ( slot , g_errno );
logError("call sendErrorReply");
g_udpServer.sendErrorReply ( slot , g_errno );
log(LOG_WARN,"%s:%s: END - EWAITINGTOSYNCHOSTCONF", __FILE__, __func__ );
return;
@ -763,8 +733,8 @@ void handleRequest4 ( UdpSlot *slot , int32_t netnice ) {
// compare our hosts.conf to sender's otherwise
if (slot->m_host->m_pingInfo.m_hostsConfCRC != g_hostdb.getCRC()) {
g_errno = EBADHOSTSCONF;
log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
us->sendErrorReply ( slot , g_errno );
logError("call sendErrorReply");
g_udpServer.sendErrorReply ( slot , g_errno );
log(LOG_WARN,"%s:%s: END - EBADHOSTSCONF", __FILE__, __func__ );
return;
@ -778,8 +748,8 @@ void handleRequest4 ( UdpSlot *slot , int32_t netnice ) {
// must at least have an rdbId
if (readBufSize < 7) {
g_errno = EREQUESTTOOSHORT;
log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
us->sendErrorReply ( slot , g_errno );
logError("call sendErrorReply");
g_udpServer.sendErrorReply ( slot , g_errno );
log(LOG_ERROR,"%s:%s: END - EREQUESTTOOSHORT", __FILE__, __func__ );
return;
@ -793,20 +763,15 @@ void handleRequest4 ( UdpSlot *slot , int32_t netnice ) {
if ( used != readBufSize ) {
// if we send back a g_errno then multicast retries forever
// so just absorb it!
log(LOG_ERROR,"%s:%s: msg4: got corrupted request from hostid %" PRId32" "
"used [%" PRId32"] != readBufSize [%" PRId32"]",
__FILE__,
__func__,
slot->m_host->m_hostId,
used,
readBufSize);
logError("msg4: got corrupted request from hostid %" PRId32" used [%" PRId32"] != readBufSize [%" PRId32"]",
slot->m_host->m_hostId, used, readBufSize);
loghex(LOG_ERROR, readBuf, (readBufSize < 160 ? readBufSize : 160), "readBuf (first max. 160 bytes)");
us->sendReply(NULL, 0, NULL, 0, slot);
//us->sendErrorReply(slot,ECORRUPTDATA);return;}
log(LOG_ERROR,"%s:%s: END", __FILE__, __func__ );
g_udpServer.sendReply(NULL, 0, NULL, 0, slot);
//g_udpServer.sendErrorReply(slot,ECORRUPTDATA);return;}
logError("END");
return;
}
@ -821,8 +786,8 @@ void handleRequest4 ( UdpSlot *slot , int32_t netnice ) {
}
// tell send to try again shortly
g_errno = ETRYAGAIN;
log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
us->sendErrorReply(slot,g_errno);
logError("call sendErrorReply");
g_udpServer.sendErrorReply(slot,g_errno);
logTrace( g_conf.m_logTraceMsg4, "END - ETRYAGAIN. Waiting to sync with host #0" );
return;
@ -830,15 +795,15 @@ void handleRequest4 ( UdpSlot *slot , int32_t netnice ) {
// this returns false with g_errno set on error
if (!addMetaList(readBuf, slot)) {
log(LOG_ERROR, "%s:%s:%d: call sendErrorReply. error='%s'", __FILE__, __func__, __LINE__, mstrerror(g_errno));
us->sendErrorReply(slot,g_errno);
logError("call sendErrorReply error='%s", mstrerror(g_errno));
g_udpServer.sendErrorReply(slot,g_errno);
logTrace(g_conf.m_logTraceMsg4, "END - addMetaList returned false. g_errno=%d", g_errno);
return;
}
// good to go
us->sendReply(NULL, 0, NULL, 0, slot);
g_udpServer.sendReply(NULL, 0, NULL, 0, slot);
logTrace(g_conf.m_logTraceMsg4, "END - OK");
}
@ -934,23 +899,20 @@ bool addMetaList ( const char *p , UdpSlot *slot ) {
log(LOG_WARN, "seems like a stray /e/repair-addsinprogress.dat file "
"rdbId=%" PRId32". waiting to be in repair mode."
,(int32_t)rdbId);
//not in repair mode. dropping.",(int32_t)rdbId);
g_errno = ETRYAGAIN;
return false;
}
// set the list
list.set ( (char*)p , //todo: dodgy cast. RdbList should be fixed
recSize ,
(char*)p , //todo: dodgy cast. RdbList should be fixed
recSize ,
rdb->getFixedDataSize() ,
false , // ownData?
rdb->useHalfKeys() ,
rdb->getKeySize () );
// todo: dodgy cast to char*. RdbList should be fixed
list.set((char *)p, recSize, (char *)p, recSize, rdb->getFixedDataSize(), false, rdb->useHalfKeys(), rdb->getKeySize());
// advance over the rec data to point to next entry
p += recSize;
// keep track of stats
rdb->readRequestAdd ( recSize );
// this returns false and sets g_errno on error
bool status =rdb->addList(collnum, &list, MAX_NICENESS );
@ -966,16 +928,12 @@ bool addMetaList ( const char *p , UdpSlot *slot ) {
// no memory means to try again
if ( g_errno == ENOMEM ) g_errno = ETRYAGAIN;
// doing a full rebuild will add collections
if ( g_errno == ENOCOLLREC &&
g_repairMode > 0 )
//g_repair.m_fullRebuild )
if ( g_errno == ENOCOLLREC && g_repairMode > 0 )
g_errno = ETRYAGAIN;
// ignore enocollrec errors since collection can be reset while
// spiders are on now.
//if ( g_errno == ENOCOLLREC )
// g_errno = 0;
// are we done
if ( g_errno ) return false;
// success
return true;
}

64
Msg4.h

@ -10,58 +10,56 @@ bool loadAddsInProgress ( const char *filenamePrefix );
// used by Repair.cpp to make sure we are not adding any more data ("writing")
bool hasAddsInQueue ( ) ;
bool isInMsg4LinkedList ( class Msg4 *msg4 ) ;
#include "SafeBuf.h"
#include "rdbid_t.h"
class Msg4 {
public:
Msg4()
: m_inUse(false) {
}
public:
static bool registerHandler();
// meta list format =
// (rdbId | 0x08) then rdb record [if nosplit]
// (rdbId | 0x00) then rdb record [if split ]
bool addMetaList( class SafeBuf *sb, collnum_t collnum, void *state,
void (* callback)(void *state), int32_t niceness, char rdbId = -1, int32_t shardOverride = -1 );
bool addMetaList( class SafeBuf *sb, collnum_t collnum, void *state,
void (* callback)(void *state), int32_t niceness, rdbid_t rdbId, int32_t shardOverride = -1 )
{ return addMetaList(sb,collnum,state,callback,niceness,(char)rdbId,shardOverride); }
// why wasn't this saved in addsinprogress.dat file?
~Msg4() {
if (m_inUse) {
log(LOG_ERROR, "BAD: MSG4 in use!!!!!! this=%p", this);
}
}
bool addMetaList(SafeBuf *sb, collnum_t collnum, void *state,
void (*callback)(void *state), rdbid_t rdbId = RDB_NONE, int32_t shardOverride = -1);
// this one is faster...
// returns false if blocked
bool addMetaList( const char *metaList, int32_t metaListSize, collnum_t collnum, void *state,
void (* callback)(void *state), int32_t niceness, char rdbId = -1, int32_t shardOverride = -1 );
bool addMetaList( const char *metaList, int32_t metaListSize, collnum_t collnum, void *state,
void (* callback)(void *state), int32_t niceness, rdbid_t rdbId, int32_t shardOverride = -1 )
{ return addMetaList(metaList,metaListSize,collnum,state,callback,niceness,(char)rdbId,shardOverride); }
bool addMetaList(const char *metaList, int32_t metaListSize, collnum_t collnum, void *state,
void (*callback)(void *state), rdbid_t rdbId = RDB_NONE, int32_t shardOverride = -1);
bool addMetaList2();
bool isInUse() const { return m_inUse; }
Msg4() { m_inUse = false; }
// why wasn't this saved in addsinprogress.dat file?
~Msg4() { if ( m_inUse ) log("BAD: MSG4 in use!!!!!!"); }
// private:
void (*m_callback ) ( void *state );
void *m_state;
static bool registerHandler();
static bool isInLinkedList(Msg4 *msg4);
static void storeLineWaiters();
SafeBuf m_tmpBuf;
char m_rdbId;
char m_inUse;
private:
bool addMetaList2();
void (*m_callback )(void *state);
void *m_state;
rdbid_t m_rdbId;
bool m_inUse;
collnum_t m_collnum;
int32_t m_niceness;
int32_t m_shardOverride;
const char *m_metaList ;
int32_t m_metaListSize ;
const char *m_currentPtr ; // into m_metaList
const char *m_metaList;
int32_t m_metaListSize;
const char *m_currentPtr; // into m_metaList
// the linked list for waiting in line
class Msg4 *m_next;
Msg4 *m_next;
};
#endif // GB_MSG4_H

@ -16,6 +16,7 @@
#include "Process.h"
#include "GbMutex.h"
#include "ScopedLock.h"
#include <new>
// increasing this doesn't seem to improve performance any on a single
@ -62,7 +63,7 @@ void Msg40::resetBuf2 ( ) {
// cast it
Msg20 *m = (Msg20 *)p;
// free its stuff
m->destructor();
m->~Msg20();
// advance
p += sizeof(Msg20);
}
@ -629,7 +630,7 @@ bool Msg40::reallocMsg20Buf ( ) {
// point to the next Msg20
p += sizeof(Msg20);
// init it
tmp[i]->constructor();
new (tmp[i]) Msg20();
// count it
pcount++;
// skip it if it is a new docid, we do not have a Msg20
@ -740,7 +741,7 @@ bool Msg40::reallocMsg20Buf ( ) {
// point it to its memory
m_msg20[i] = (Msg20 *)p;
// call its constructor
m_msg20[i]->constructor();
new (m_msg20[i]) Msg20();
// point to the next Msg20
p += sizeof(Msg20);
// remember num to free in reset() function

@ -12,7 +12,7 @@
#include "RdbCache.h"
#include "ScopedLock.h"
#include "Sanity.h"
#include "Titledb.h"
// how many Msg0 requests can we launch at the same time?
#define MSG51_MAX_REQUESTS 60
@ -511,7 +511,7 @@ bool setClusterLevels ( const key96_t *clusterRecs,
// . get the site hash
// . these are only 32 bits!
if(fakeIt)
h = g_titledb.getDomHash8FromDocId(docIds[i]);
h = Titledb::getDomHash8FromDocId(docIds[i]);
else
h = g_clusterdb.getSiteHash26 ( crec );

@ -1,15 +1,10 @@
#include "gb-include.h"
// i guess both msg0 send requests failed with no route to host,
//and they got retired... why didnt they switch to eth1????
#include "Multicast.h"
#include "Rdb.h" // RDB_TITLEDB
#include "Msg20.h"
#include "Profiler.h"
#include "UdpServer.h"
#include "Hostdb.h"
#include "Stats.h"
#include "Conf.h"
#include "Loop.h" // registerSleepCallback()
#include "ScopedLock.h"
#include "Process.h"
// up to 10 twins in a group
@ -19,20 +14,66 @@
// to send we should send as much as we can and save the remaining
// slots to disk for sending later??
static void sleepWrapper1 ( int bogusfd , void *state ) ;
static void sleepWrapper2 ( int bogusfd , void *state ) ;
static void gotReplyWrapperM1 ( void *state , UdpSlot *slot ) ;
static void gotReplyWrapperM2 ( void *state , UdpSlot *slot ) ;
void Multicast::constructor ( ) {
void Multicast::constructor() {
m_msg = NULL;
m_readBuf = NULL;
m_inUse = false;
}
void Multicast::destructor ( ) { reset(); }
Multicast::Multicast ( ) { constructor(); }
Multicast::~Multicast ( ) { reset(); }
void Multicast::destructor() {
reset();
}
// Default constructor.
//
// Puts every scalar member into a known idle state: pointers NULL, counters
// and timestamps 0, flags false, and m_msgType set to the out-of-range
// value (msg_type_t)-1. The per-host arrays are intentionally NOT
// initialized here (see the notes inside the initializer list).
Multicast::Multicast()
: m_msg(NULL),
m_msgSize(0),
m_msgType((msg_type_t)-1),
m_ownMsg(false),
m_slot(NULL),
m_inUse(false),
m_next(NULL),
m_replyingHost(NULL),
m_replyLaunchTime(0),
m_hackFileId(0),
m_hackFileOff(0),
m_importState(NULL),
m_mtx(),
m_state(NULL), m_state2(NULL),
m_callback(NULL),
m_totalTimeout(0),
m_startTime(0),
m_numReplies(0),
// m_hostPtrs[] is left uninitialized here.
// NOTE(review): presumably filled in by send() before first use - confirm.
m_numHosts(0),
// m_retired[], m_slots[], m_errnos[] and m_inProgress[] are left
// uninitialized here; send() memset()s all four to zero.
// m_launchTime[] is also left uninitialized here.
// NOTE(review): confirm nothing reads these arrays before send() runs.
m_readBuf(NULL),
m_readBufSize(0),
m_readBufMaxSize(0),
m_ownReadBuf(false),
m_registeredSleep(false),
m_niceness(0),
m_lastLaunch(0),
m_lastLaunchHost(NULL),
m_freeReadBuf(false),
m_key(0),
m_sendToSelf(false),
m_retryCount(0),
m_sentToTwin(false)
{
// constructor() only re-assigns m_msg, m_readBuf and m_inUse to the same
// values the initializer list already gave them, so this call is redundant
// but harmless.
constructor();
}
// Destructor: all cleanup is delegated to reset().
Multicast::~Multicast() { this->reset(); }
// free the send/read (request/reply) bufs we pirated from a UdpSlot or
// got from the caller
@ -103,10 +144,10 @@ bool Multicast::send(char *msg, int32_t msgSize, msg_type_t msgType, bool ownMsg
m_key = key;
// clear m_retired, m_errnos, m_slots
memset ( m_retired , 0 , sizeof(bool ) * MAX_HOSTS_PER_GROUP );
memset ( m_errnos , 0 , sizeof(int32_t ) * MAX_HOSTS_PER_GROUP );
memset ( m_slots , 0 , sizeof(UdpSlot *) * MAX_HOSTS_PER_GROUP );
memset ( m_inProgress , 0 , sizeof(char ) * MAX_HOSTS_PER_GROUP );
memset(m_retired, 0, sizeof(m_retired));
memset(m_errnos, 0, sizeof(m_errnos));
memset(m_slots, 0, sizeof(m_slots));
memset(m_inProgress, 0, sizeof(m_inProgress));
// . get the list of hosts in this group
// . returns false if blocked, true otherwise
@ -160,6 +201,7 @@ bool Multicast::send(char *msg, int32_t msgSize, msg_type_t msgType, bool ownMsg
// . TODO: deal with errors from g_udpServer::sendRequest() better
// . returns false and sets g_errno on error
void Multicast::sendToGroup() {
ScopedLock sl(m_mtx);
// see if anyone gets an error
bool hadError = false;
// . cast the msg to ALL hosts in the m_hosts group of hosts
@ -203,7 +245,7 @@ void Multicast::sendToGroup() {
// . send to a single host
// . this creates a transaction control slot, "udpSlot"
// . returns false and sets g_errno on error
if (us->sendRequest(m_msg, m_msgSize, m_msgType, bestIp, destPort, hid, &m_slots[i], this, gotReplyWrapperM2, m_totalTimeout, m_niceness)) {
if (us->sendRequest(m_msg, m_msgSize, m_msgType, bestIp, destPort, hid, &m_slots[i], this, gotReply2, m_totalTimeout, m_niceness)) {
continue;
}
// g_errno must have been set, remember it
@ -237,22 +279,23 @@ void Multicast::sendToGroup() {
}
}
void sleepWrapper2 ( int bogusfd , void *state ) {
Multicast *THIS = (Multicast *)state;
void Multicast::sleepWrapper2(int bogusfd, void *state) {
Multicast *THIS = static_cast<Multicast*>(state);
// try another round of sending to see if hosts had errors or not
THIS->sendToGroup ( );
THIS->sendToGroup();
}
// C wrapper for the C++ callback
void gotReplyWrapperM2 ( void *state , UdpSlot *slot ) {
Multicast *THIS = (Multicast *)state;
THIS->gotReply2 ( slot );
void Multicast::gotReply2(void *state, UdpSlot *slot) {
Multicast *THIS = static_cast<Multicast*>(state);
THIS->gotReply2(slot);
}
// . otherwise, we were sending to a whole group so ALL HOSTS must produce a
// successful reply
// . we keep re-trying forever until they do
void Multicast::gotReply2 ( UdpSlot *slot ) {
ScopedLock sl(m_mtx);
// don't ever let UdpServer free this send buf (it is m_msg)
slot->m_sendBufAlloc = NULL;
// save this for msg4 logic that calls injection callback
@ -290,6 +333,7 @@ void Multicast::gotReply2 ( UdpSlot *slot ) {
// allow us to be re-used now, callback might relaunch
m_inUse = false;
if ( m_callback ) {
sl.unlock();
m_callback ( m_state , m_state2 );
}
return;
@ -626,7 +670,8 @@ bool Multicast::sendToHost ( int32_t i ) {
// . this creates a transaction control slot, "udpSlot"
// . return false and sets g_errno on error
// . returns true on successful launch and calls callback on completion
if (!us->sendRequest(m_msg, m_msgSize, m_msgType, bestIp, destPort, hid, &m_slots[i], this, gotReplyWrapperM1, timeRemaining, m_niceness, NULL, -1, -1, maxResends)) {
ScopedLock sl(m_mtx);
if (!us->sendRequest(m_msg, m_msgSize, m_msgType, bestIp, destPort, hid, &m_slots[i], this, gotReply1, timeRemaining, m_niceness, NULL, -1, -1, maxResends)) {
log(LOG_WARN, "net: Had error sending msgtype 0x%02x to host #%" PRId32": %s. Not retrying.",
m_msgType,h->m_hostId,mstrerror(g_errno));
// i've seen ENOUDPSLOTS available msg here along with oom
@ -635,7 +680,7 @@ bool Multicast::sendToHost ( int32_t i ) {
return false;
}
// mark it as outstanding
m_inProgress[i] = 1;
m_inProgress[i] = true;
// set our last launch date
m_lastLaunch = nowms ; // gettimeofdayInMilliseconds();
// save the host, too
@ -657,7 +702,7 @@ bool Multicast::sendToHost ( int32_t i ) {
// this is called every 50 ms so we have the chance to launch our request
// to a more responsive host
void sleepWrapper1 ( int bogusfd , void *state ) {
void Multicast::sleepWrapper1 ( int bogusfd , void *state ) {
Multicast *THIS = (Multicast *) state;
// . if our last launch was less than X seconds ago, wait another tick
// . we often send out 2+ requests and end up getting one reply before
@ -851,14 +896,16 @@ void sleepWrapper1 ( int bogusfd , void *state ) {
// THIS->m_msgType);
}
// C wrapper for the C++ callback
void gotReplyWrapperM1 ( void *state , UdpSlot *slot ) {
Multicast *THIS = (Multicast *)state;
THIS->gotReply1 ( slot );
void Multicast::gotReply1(void *state, UdpSlot *slot) {
Multicast *THIS = static_cast<Multicast*>(state);
THIS->gotReply1(slot);
}
// come here if we've got a reply from a host that's not part of a group send
void Multicast::gotReply1 ( UdpSlot *slot ) {
ScopedLock sl(m_mtx);
// don't ever let UdpServer free this send buf (it is m_msg)
slot->m_sendBufAlloc = NULL;
@ -887,7 +934,7 @@ void Multicast::gotReply1 ( UdpSlot *slot ) {
}
// mark it as no longer in progress
m_inProgress[i] = 0;
m_inProgress[i] = false;
Host *h = m_hostPtrs[i];
@ -900,6 +947,8 @@ void Multicast::gotReply1 ( UdpSlot *slot ) {
(int32_t) m_msgType, (PTRTYPE) this, mstrerror(g_errno));
}
sl.unlock();
// on error try sending the request to another host
// return if we kicked another request off ok
if ( g_errno ) {
@ -1069,7 +1118,7 @@ void Multicast::destroySlotsInProgress ( UdpSlot *slot ) {
// destroy this slot that's in progress
g_udpServer.destroySlot ( m_slots[i] );
// do not re-destroy. consider no longer in progress.
m_inProgress[i] = 0;
m_inProgress[i] = false;
}
}

@ -17,9 +17,11 @@
#ifndef GB_MULTICAST_H
#define GB_MULTICAST_H
#include "Hostdb.h" // getGroup(), getTimes(), stampHost()
#include "UdpServer.h" // sendRequest()
#include "Loop.h" // registerSleepCallback()
#include "MsgType.h"
#include "GbMutex.h"
#include <inttypes.h>
#include <stddef.h>
#define MAX_HOSTS_PER_GROUP 10
@ -31,6 +33,9 @@ static const int64_t multicast_msg3a_default_timeout = 10000;
static const int64_t multicast_msg3a_maximum_timeout = 60000;
static const int64_t multicast_msg1c_getip_default_timeout = 60000;
class UdpSlot;
class Host;
class Multicast {
@ -103,31 +108,37 @@ class Multicast {
// private:
void destroySlotsInProgress ( UdpSlot *slot );
// keep these public so C wrapper can call them
bool sendToHostLoop(int32_t key, int32_t hostNumToTry, int32_t firstHostId);
bool sendToHost ( int32_t i );
int32_t pickBestHost ( uint32_t key , int32_t hostNumToTry );
void gotReply1 ( UdpSlot *slot ) ;
void closeUpShop ( UdpSlot *slot ) ;
void sendToGroup();
void gotReply2 ( UdpSlot *slot ) ;
// . stuff set directly by send() parameters
char *m_msg;
int32_t m_msgSize;
msg_type_t m_msgType;
bool m_ownMsg;
//uint32_t m_groupId;
class UdpSlot *m_slot;
bool m_inUse;
// for linked list of available Multicasts in Msg4.cpp
class Multicast *m_next;
// host we got reply from. used by Msg3a for timing.
Host *m_replyingHost;
// when the request was launched to the m_replyingHost
int64_t m_replyLaunchTime;
// more hack stuff used by PageInject.cpp
int32_t m_hackFileId;
int64_t m_hackFileOff;
class ImportState *m_importState;
private:
GbMutex m_mtx;
void *m_state;
void *m_state2;
void (* m_callback)( void *state , void *state2 );
int64_t m_totalTimeout; // in milliseconds
class UdpSlot *m_slot;
// . m_slots[] is our list of concurrent transactions
// . we delete all the slots only after cast is done
int64_t m_startTime; // milliseconds since the epoch
@ -150,7 +161,7 @@ class Multicast {
// did we have an errno with this slot?
int32_t m_errnos [MAX_HOSTS_PER_GROUP];
// transaction in progress?
char m_inProgress [MAX_HOSTS_PER_GROUP];
bool m_inProgress [MAX_HOSTS_PER_GROUP];
int64_t m_launchTime [MAX_HOSTS_PER_GROUP];
// steal this from the slot(s) we get
@ -168,6 +179,7 @@ class Multicast {
// . last sending of the request to ONE host in a group (pick & send)
// . in milliseconds
int64_t m_lastLaunch;
Host *m_lastLaunchHost;
// only free m_reply if this is true
@ -180,22 +192,23 @@ class Multicast {
int32_t m_retryCount;
char m_sentToTwin;
bool m_sentToTwin;
char m_inUse;
void destroySlotsInProgress ( UdpSlot *slot );
// for linked list of available Multicasts in Msg4.cpp
class Multicast *m_next;
void sendToGroup();
// host we got reply from. used by Msg3a for timing.
Host *m_replyingHost;
// when the request was launched to the m_replyingHost
int64_t m_replyLaunchTime;
static void sleepWrapper1(int bogusfd, void *state);
static void sleepWrapper2(int bogusfd, void *state);
static void gotReply1(void *state, UdpSlot *slot);
void gotReply1(UdpSlot *slot);
static void gotReply2(void *state, UdpSlot *slot);
void gotReply2(UdpSlot *slot);
// more hack stuff used by PageInject.cpp
int32_t m_hackFileId;
int64_t m_hackFileOff;
class ImportState *m_importState;
bool sendToHostLoop(int32_t key, int32_t hostNumToTry, int32_t firstHostId);
bool sendToHost ( int32_t i );
int32_t pickBestHost ( uint32_t key , int32_t hostNumToTry );
void closeUpShop ( UdpSlot *slot ) ;
};
#endif // GB_MULTICAST_H

@ -95,7 +95,7 @@ bool sendPageAddUrl2 ( TcpSocket *sock , HttpRequest *hr ) {
}
// add to spiderdb
if ( ! gr->m_msg4.addMetaList( &(gr->m_listBuf), cr->m_collnum, gr, addedUrlsToSpiderdbWrapper, 0 ) ) {
if (!gr->m_msg4.addMetaList(&(gr->m_listBuf), cr->m_collnum, gr, addedUrlsToSpiderdbWrapper)) {
// blocked!
return false;
}

@ -188,7 +188,7 @@ bool getSpiderRequestMetaList ( const char *doc, SafeBuf *listBuf, bool spiderLi
if ( url.getUrlLen() <= 0 ) continue;
// need this
int64_t probDocId = g_titledb.getProbableDocId(&url);
int64_t probDocId = Titledb::getProbableDocId(&url);
// make it
SpiderRequest sreq;

@ -107,7 +107,7 @@ Host *getHostToHandleInjection ( char *url ) {
Url norm;
norm.set(url);
int64_t docId = g_titledb.getProbableDocId ( &norm );
int64_t docId = Titledb::getProbableDocId ( &norm );
uint32_t shardNum = getShardNumFromDocId(docId);
Host *host = g_hostdb.getHostWithSpideringEnabled(shardNum);
@ -1057,7 +1057,7 @@ bool ImportState::importLoop ( ) {
mcast->m_hackFileId = m_bfFileId;
// get docid from key
docId = g_titledb.getDocIdFromKey ( &tkey );
docId = Titledb::getDocIdFromKey ( &tkey );
// get shard that holds the titlerec for it
shardNum = g_hostdb.getShardNumFromDocId ( docId );

@ -594,7 +594,7 @@ static bool sendPageParser2 ( TcpSocket *s ,
// if facebook, load xml content from title rec...
bool isFacebook = (bool)strstr(st->m_u,"http://www.facebook.com/");
if ( isFacebook && ! content ) {
int64_t docId = g_titledb.getProbableDocId((char*)st->m_u);
int64_t docId = Titledb::getProbableDocId((char*)st->m_u);
sprintf(sreq.m_url ,"%" PRIu64, docId );
sreq.m_isPageReindex = true;
}

@ -419,7 +419,7 @@ bool Msg1c::gotList ( ) {
log("reindex: adding docid list to spiderdb");
return m_msg4.addMetaList ( &m_sb, m_collnum, this, addedListWrapper, 0, RDB_SPIDERDB );
return m_msg4.addMetaList(&m_sb, m_collnum, this, addedListWrapper, RDB_SPIDERDB);
}
void addedListWrapper ( void *state ) {

@ -11867,8 +11867,8 @@ bool Parms::syncParmsWithHost0 ( ) {
void handleRequest3e ( UdpSlot *slot , int32_t niceness ) {
// right now we must be host #0
if ( g_hostdb.m_hostId != 0 ) {
hadError:
g_errno = EBADENGINEER;
hadError:
g_udpServer.sendErrorReply( slot, g_errno );
return;
}

@ -143,7 +143,7 @@ bool Posdb::init ( ) {
false , // istitledb?
getKeySize(),
false,
true);
g_conf.m_noInMemoryPosdbMerge);
}
// init the rebuild/secondary rdb, used by PageRepair.cpp
@ -160,15 +160,17 @@ bool Posdb::init2 ( int32_t treeMem ) {
// must be able to fit all bins in memory
// . we do not want posdb's bin tree to ever hit disk since we
// dump it to rdb files when it is 90% full (90% of bins in use)
return m_rdb.init ( g_hostdb.m_dir ,
"posdbRebuild" ,
getFixedDataSize(),
1000 , // min files to merge
treeMem ,
maxTreeNodes ,
getUseHalfKeys(),
false ,
getKeySize());
return m_rdb.init(g_hostdb.m_dir,
"posdbRebuild",
getFixedDataSize(),
1000, // min files to merge
treeMem,
maxTreeNodes,
getUseHalfKeys(),
false,
getKeySize(),
false,
g_conf.m_noInMemoryPosdbMerge);
}
@ -546,7 +548,7 @@ int Posdb::printList ( RdbList &list ) {
const char *dd = "";
if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)";
int64_t d = g_posdb.getDocId(&k);
uint8_t dh = g_titledb.getDomHash8FromDocId(d);
uint8_t dh = Titledb::getDomHash8FromDocId(d);
char *rec = list.getCurrentRec();
int32_t recSize = 18;
if ( rec[0] & 0x04 ) recSize = 6;

@ -2605,6 +2605,309 @@ VALGRIND_CHECK_MEM_IS_DEFINED(&dcs,sizeof(dcs));
}
// Pre-advance each termlist's cursor to skip to next docid.
//
// Set QueryTermInfo::m_matchingSubListCursor to NEXT docid
// Set QueryTermInfo::m_matchingSubListSavedCursor to CURRENT docid
// of each termlist so we are ready for a quick skip over this docid.
//
// TODO: use just a single array of termlist ptrs perhaps,
// then we can remove them when they go NULL. and we'd save a little
// time not having a nested loop.
bool PosdbTable::advanceTermListCursors(const char *docIdPtr, QueryTermInfo *qtibuf) {
logTrace(g_conf.m_logTracePosdb, "BEGIN");
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// get it
QueryTermInfo *qti = &qtibuf[i];
// do not advance negative termlist cursor
if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) {
continue;
}
//
// In first pass, sublists data is initialized by delNonMatchingDocIdsFromSubLists.
// In second pass (to get detailed scoring info for UI output), they are initialized above
//
for ( int32_t j = 0 ; j < qti->m_numMatchingSubLists ; j++ ) {
// shortcuts
char *xc = qti->m_matchingSubListCursor[j];
char *xcEnd = qti->m_matchingSubListEnd[j];
// exhausted? (we can't make cursor NULL because
// getMaxPossibleScore() needs the last ptr)
// must match docid
if ( xc >= xcEnd ||
*(int32_t *)(xc+8) != *(int32_t *)(docIdPtr+1) ||
(*(char *)(xc+7)&0xfc) != (*(char *)(docIdPtr)&0xfc) ) {
// flag it as not having the docid
qti->m_matchingSubListSavedCursor[j] = NULL;
// skip this sublist if does not have our docid
continue;
}
// save it
qti->m_matchingSubListSavedCursor[j] = xc;
// get new docid
//log("new docid %" PRId64,Posdb::getDocId(xc) );
// advance the cursors. skip our 12
xc += 12;
// then skip any following 6 byte keys because they
// share the same docid
for ( ; ; xc += 6 ) {
// end of whole termlist?
if ( xc >= xcEnd ) {
break;
}
// sanity. no 18 byte keys allowed
if ( (*xc & 0x06) == 0x00 ) {
// i've seen this triggered on gk28.
// a dump of posdb for the termlist
// for 'post' had corruption in it,
// yet its twin, gk92 did not. the
// corruption could have occurred
// anywhere from nov 2012 to may 2013,
// and the posdb file was never
// re-merged! must have been blatant
// disk malfunction?
log("posdb: encountered corrupt posdb list. bailing.");
logTrace(g_conf.m_logTracePosdb, "END.");
return false;
//gbshutdownAbort(true);
}
// the next docid? it will be a 12 byte key.
if ( ! (*xc & 0x04) ) {
break;
}
}
// assign to next docid word position list
qti->m_matchingSubListCursor[j] = xc;
}
}
logTrace(g_conf.m_logTracePosdb, "END");
return true;
}
#define RINGBUFSIZE 4096
//
// Distance-based pre-filter for the current docid.
//
// Word positions of the query term at index m_minTermListIdx are written into
// a ring buffer of RINGBUFSIZE slots. Then, for every other non-negative
// query term i, that term's positions are written into the same buffer, the
// buffer is scanned for the smallest slot distance between the two terms, and
// getMaxPossibleScore() is asked for an upper bound on the pair's score.
//
// Parameters:
//   qtibuf           - array of m_numQueryTermInfos QueryTermInfo entries whose
//                      m_matchingSubListSavedCursor[] / m_matchingSubListCursor[]
//                      delimit the current docid's keys in each sublist
//   qpos             - per query term value, indexed like qtibuf; only the
//                      difference qpos[m_minTermListIdx] - qpos[i] is used
//   minWinningScore  - score a docid must exceed to stay in the running
//
// TODO: consider skipping this pre-filter if it sucks, as it does
// for 'search engine'. it might save time!
//
// Returns:
// false - docid does not meet minimum score requirement
// true - docid can potentially be a top scoring docid
//
bool PosdbTable::prefilterMaxPossibleScoreByDistance(QueryTermInfo *qtibuf, const int32_t *qpos, float minWinningScore) {
//#define RINGBUFSIZE 1024
// 10 bytes of padding so the 4-byte-at-a-time scan below may read past
// slot RINGBUFSIZE-1 without leaving the array
unsigned char ringBuf[RINGBUFSIZE+10];
// for overflow conditions in loops below
ringBuf[RINGBUFSIZE+0] = 0xff;
ringBuf[RINGBUFSIZE+1] = 0xff;
ringBuf[RINGBUFSIZE+2] = 0xff;
ringBuf[RINGBUFSIZE+3] = 0xff;
unsigned char qt;
QueryTermInfo *qtx;
uint32_t wx;
// slot of the first key of the m_minTermListIdx term. overwritten for every
// sublist that has the docid, so it ends up describing the last such
// sublist; stays -1 if no sublist has the docid.
int32_t ourFirstPos = -1;
int32_t qdist;
logTrace(g_conf.m_logTracePosdb, "BEGIN");
// reset ring buf. make all slots 0xff (= empty). should be 1000 cycles or so.
// note: this is the only reset; slots written for one query term stay in
// the buffer while later query terms are processed.
memset ( ringBuf, 0xff, RINGBUFSIZE );
// now to speed up 'time enough for love' query which does not
// have many super high scoring guys on top we need a more restrictive
// filter than getMaxPossibleScore() so let's pick one query term,
// the one with the shortest termlist, and see how close it gets to
// each of the other query terms. then score each of those pairs.
// so quickly record the word positions of each query term into
// a ring buffer of 4096 slots where each slot contains the
// query term # (the index itself, not index+1; 0xff marks an empty slot).
logTrace(g_conf.m_logTracePosdb, "Ring buffer generation");
qtx = &qtibuf[m_minTermListIdx];
// populate ring buf just for this query term
for ( int32_t k = 0 ; k < qtx->m_numMatchingSubLists ; k++ ) {
// scan that sublist and add word positions
char *sub = qtx->m_matchingSubListSavedCursor[k];
// skip sublist if its saved cursor is NULL (sublist lacks this docid)
if ( ! sub ) {
continue;
}
// the keys for this docid run from the saved cursor up to the cursor
char *end = qtx->m_matchingSubListCursor[k];
// add first key
//int32_t wx = Posdb::getWordPos(sub);
wx = (*((uint32_t *)(sub+3))) >> 6;
// mod with 4096
wx &= (RINGBUFSIZE-1);
// store it. 0 is legit.
ringBuf[wx] = m_minTermListIdx;
// set this
ourFirstPos = wx;
// skip first key
sub += 12;
// then 6 byte keys
for ( ; sub < end ; sub += 6 ) {
// get word position
//wx = Posdb::getWordPos(sub);
wx = (*((uint32_t *)(sub+3))) >> 6;
// mod with 4096
wx &= (RINGBUFSIZE-1);
// store it. 0 is legit.
ringBuf[wx] = m_minTermListIdx;
}
}
// now get query term closest to query term # m_minTermListIdx which
// is the query term # with the shortest termlist
// get closest term to m_minTermListIdx and the distance
logTrace(g_conf.m_logTracePosdb, "Ring buffer generation 2");
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// skip the reference term itself
if ( i == m_minTermListIdx ) {
continue;
}
// get the query term info
QueryTermInfo *qti = &qtibuf[i];
// if we have a negative term, skip it
if ( qti->m_bigramFlags[0] & (BF_NEGATIVE) ) {
continue;
}
// store all his word positions into ring buffer AS WELL
// (a slot already holding another term's index is simply overwritten)
for ( int32_t k = 0 ; k < qti->m_numMatchingSubLists ; k++ ) {
// scan that sublist and add word positions
char *sub = qti->m_matchingSubListSavedCursor[k];
// skip sublist if its saved cursor is NULL (sublist lacks this docid)
if ( ! sub ) {
continue;
}
char *end = qti->m_matchingSubListCursor[k];
// add first key
//int32_t wx = Posdb::getWordPos(sub);
wx = (*((uint32_t *)(sub+3))) >> 6;
// mod with 4096
wx &= (RINGBUFSIZE-1);
// store it. 0 is legit.
ringBuf[wx] = i;
// skip first key
sub += 12;
// then 6 byte keys
for ( ; sub < end ; sub += 6 ) {
// get word position
//wx = Posdb::getWordPos(sub);
wx = (*((uint32_t *)(sub+3))) >> 6;
// mod with 4096
wx &= (RINGBUFSIZE-1);
// store it. 0 is legit.
ringBuf[wx] = i;
}
}
// reset
// ourLastPos: most recent slot holding term i
// hisLastPos: most recent slot holding term m_minTermListIdx
int32_t ourLastPos = -1;
int32_t hisLastPos = -1;
int32_t bestDist = 0x7fffffff;
// how far is this guy from the man?
for ( int32_t x = 0 ; x < (int32_t)RINGBUFSIZE ; ) {
// skip next 4 slots if all empty. fast?
// (x is not necessarily 4-byte aligned here; near the end of the
// buffer this read extends into the 0xff padding set up above)
if (*(uint32_t *)(ringBuf+x) == 0xffffffff) {
x+=4;
continue;
}
// skip if nobody
if ( ringBuf[x] == 0xff ) {
x++;
continue;
}
// get query term #
qt = ringBuf[x];
// if it's the man
if ( qt == m_minTermListIdx ) {
// record
hisLastPos = x;
// skip if we are not there yet
if ( ourLastPos == -1 ) {
x++;
continue;
}
// try distance fix
if ( x - ourLastPos < bestDist ) {
bestDist = x - ourLastPos;
}
}
// if us
else
if ( qt == i ) {
// record
ourLastPos = x;
// skip if he's not recorded yet
if ( hisLastPos == -1 ) {
x++;
continue;
}
// update
ourLastPos = x;
// NOTE(review): the assignment above is redundant; ourLastPos was
// already set to x at the top of this branch.
// check dist
if ( x - hisLastPos < bestDist ) {
bestDist = x - hisLastPos;
}
}
// slots holding any other term's index fall through to here
x++;
continue; // NOTE(review): redundant, this is the last statement of the loop body
}
// compare last occurrence with first occurrence across the wrap-around,
// since this is a RING buffer.
// NOTE(review): ourFirstPos and hisLastPos are BOTH taken from the
// m_minTermListIdx term (see where each is assigned), so this measures
// that term against itself rather than against term i. It looks like one
// of the two was meant to come from term i - confirm before relying on it.
// If either value is still -1 the result is at least RINGBUFSIZE-1.
int32_t wrapDist = ourFirstPos + ((int32_t)RINGBUFSIZE-hisLastPos);
if ( wrapDist < bestDist ) {
bestDist = wrapDist;
}
// query distance
qdist = qpos[m_minTermListIdx] - qpos[i];
// compute it
float maxScore2 = getMaxPossibleScore(&qtibuf[i],
bestDist,
qdist,
&qtibuf[m_minTermListIdx]);
// -1 means it has inlink text so do not apply this constraint
// to this docid because it is too difficult because we
// sum up the inlink text
if ( maxScore2 < 0.0 ) {
continue;
}
// if any one of these terms have a max score below the
// worst score of the 10th result, then it can not win.
// @todo: BR. Really? ANY of them?
if ( maxScore2 <= minWinningScore ) {
logTrace(g_conf.m_logTracePosdb, "END - docid score too low");
return false;
}
}
logTrace(g_conf.m_logTracePosdb, "END - docid score high enough");
return true;
}
// . compare the output of this to intersectLists9_r()
// . hopefully this will be easier to understand and faster
@ -2720,8 +3023,8 @@ void PosdbTable::intersectLists10_r ( ) {
float pss;
// scan the posdb keys in the smallest list
// raised from 200 to 300,000 for 'da da da' query
char mbuf[300000];
char *mptrEnd = mbuf + 299000;
char miniMergeBuf[300000];
char *mptrEnd = miniMergeBuf + 299000;
char *mptr;
char *docIdPtr;
char *docIdEnd = m_docIdVoteBuf.getBufStart()+m_docIdVoteBuf.length();
@ -2732,22 +3035,11 @@ void PosdbTable::intersectLists10_r ( ) {
char *lastMptr = NULL;
int32_t topCursor = -9;
int32_t numProcessed = 0;
#define RINGBUFSIZE 4096
//#define RINGBUFSIZE 1024
unsigned char ringBuf[RINGBUFSIZE+10];
// for overflow conditions in loops below
ringBuf[RINGBUFSIZE+0] = 0xff;
ringBuf[RINGBUFSIZE+1] = 0xff;
ringBuf[RINGBUFSIZE+2] = 0xff;
ringBuf[RINGBUFSIZE+3] = 0xff;
unsigned char qt;
QueryTermInfo *qtx;
uint32_t wx;
int32_t fail0 = 0;
int32_t pass0 = 0;
int32_t fail = 0;
int32_t pass = 0;
int32_t ourFirstPos = -1;
int32_t prefiltMaxPossScoreFail = 0;
int32_t prefiltMaxPossScorePass = 0;
int32_t prefiltBestDistMaxPossScoreFail = 0;
int32_t prefiltBestDistMaxPossScorePass = 0;
// populate the cursors for each sublist
@ -2819,6 +3111,8 @@ void PosdbTable::intersectLists10_r ( ) {
bool allDone = false;
while( !allDone && docIdPtr < docIdEnd ) {
logTrace(g_conf.m_logTracePosdb, "Handling next docID");
bool skipToNextDocId = false;
// second pass? for printing out transparency info.
@ -2830,101 +3124,34 @@ void PosdbTable::intersectLists10_r ( ) {
}
}
if( currPassNum == INTERSECT_SCORING ) {
//
// Pre-advance each termlist's cursor to skip to next docid.
//
// Set QueryTermInfo::m_matchingSubListCursor to NEXT docid
// Set QueryTermInfo::m_matchingSubListSavedCursor to CURRENT docid
// of each termlist so we are ready for a quick skip over this docid.
//
// TODO: use just a single array of termlist ptrs perhaps,
// then we can remove them when they go NULL. and we'd save a little
// time not having a nested loop.
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// get it
QueryTermInfo *qti = &qtibuf[i];
// do not advance negative termlist cursor
if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) {
continue;
}
//
// In first pass, sublists data is initialized by delNonMatchingDocIdsFromSubLists.
// In second pass (to get detailed scoring info for UI output), they are initialized above
//
for ( int32_t j = 0 ; j < qti->m_numMatchingSubLists ; j++ ) {
// shortcuts
char *xc = qti->m_matchingSubListCursor[j];
char *xcEnd = qti->m_matchingSubListEnd[j];
// exhausted? (we can't make cursor NULL because
// getMaxPossibleScore() needs the last ptr)
// must match docid
if ( xc >= xcEnd ||
*(int32_t *)(xc+8) != *(int32_t *)(docIdPtr+1) ||
(*(char *)(xc+7)&0xfc) != (*(char *)(docIdPtr)&0xfc) ) {
// flag it as not having the docid
qti->m_matchingSubListSavedCursor[j] = NULL;
// skip this sublist if does not have our docid
continue;
}
// save it
qti->m_matchingSubListSavedCursor[j] = xc;
// get new docid
//log("new docid %" PRId64,Posdb::getDocId(xc) );
// advance the cursors. skip our 12
xc += 12;
// then skip any following 6 byte keys because they
// share the same docid
for ( ; ; xc += 6 ) {
// end of whole termlist?
if ( xc >= xcEnd ) {
break;
}
// sanity. no 18 byte keys allowed
if ( (*xc & 0x06) == 0x00 ) {
// i've seen this triggered on gk28.
// a dump of posdb for the termlist
// for 'post' had corruption in it,
// yet its twin, gk92 did not. the
// corruption could have occurred
// anywhere from nov 2012 to may 2013,
// and the posdb file was never
// re-merged! must have been blatant
// disk malfunction?
log("posdb: encountered corrupt posdb list. bailing.");
logTrace(g_conf.m_logTracePosdb, "END.");
return;
//gbshutdownAbort(true);
}
// the next docid? it will be a 12 byte key.
if ( ! (*xc & 0x04) ) {
break;
}
}
// assign to next docid word position list
qti->m_matchingSubListCursor[j] = xc;
}
if( !advanceTermListCursors(docIdPtr, qtibuf) ) {
logTrace(g_conf.m_logTracePosdb, "END. advanceTermListCursors failed");
return;
}
if( !m_q->m_isBoolean ) {
//##
//## PRE-FILTERS. Discard DocIDs that cannot meet the minimum required
//## score, before entering the main scoring loop below
//##
// TODO: consider skipping this pre-filter if it sucks, as it does
// for 'time enough for love'. it might save time!
//
// Calculate maximum possible score for a document. If the max score
// is lower than the current minimum winning score, give up already
// now and skip to the next docid.
//
// Only go through this if we actually have a minimum score to compare with ...
// No need if minWinningScore is still -1
if ( minWinningScore >= 0 ) {
if ( minWinningScore >= 0.0 ) {
logTrace(g_conf.m_logTracePosdb, "Compute 'upper bound' for each query term");
// If there's no way we can break into the winner's circle, give up!
@ -2949,7 +3176,7 @@ void PosdbTable::intersectLists10_r ( ) {
// worst score of the 10th result, then it can not win.
if ( maxScore <= minWinningScore ) {
docIdPtr += 6;
fail0++;
prefiltMaxPossScoreFail++;
skipToNextDocId = true;
break; // break out of numQueryTermsToHandle loop
}
@ -2961,197 +3188,14 @@ void PosdbTable::intersectLists10_r ( ) {
continue;
}
pass0++;
prefiltMaxPossScorePass++;
if ( minWinningScore >= 0.0 && m_sortByTermNum < 0 && m_sortByTermNumInt < 0 ) {
if ( m_sortByTermNum < 0 && m_sortByTermNumInt < 0 ) {
// TODO: consider skipping this pre-filter if it sucks, as it does
// for 'search engine'. it might save time!
// reset ring buf. make all slots 0xff. should be 1000 cycles or so.
memset ( ringBuf, 0xff, RINGBUFSIZE );
// now to speed up 'time enough for love' query which does not
// have many super high scoring guys on top we need a more restrictive
// filter than getMaxPossibleScore() so let's pick one query term,
// the one with the shortest termlist, and see how close it gets to
// each of the other query terms. then score each of those pairs.
// so quickly record the word positions of each query term into
// a ring buffer of 4096 slots where each slot contains the
// query term # plus 1.
logTrace(g_conf.m_logTracePosdb, "Ring buffer generation");
qtx = &qtibuf[m_minTermListIdx];
// populate ring buf just for this query term
for ( int32_t k = 0 ; k < qtx->m_numMatchingSubLists ; k++ ) {
// scan that sublist and add word positions
char *sub = qtx->m_matchingSubListSavedCursor[k];
// skip sublist if it's cursor is exhausted
if ( ! sub ) {
continue;
}
char *end = qtx->m_matchingSubListCursor[k];
// add first key
//int32_t wx = Posdb::getWordPos(sub);
wx = (*((uint32_t *)(sub+3))) >> 6;
// mod with 4096
wx &= (RINGBUFSIZE-1);
// store it. 0 is legit.
ringBuf[wx] = m_minTermListIdx;
// set this
ourFirstPos = wx;
// skip first key
sub += 12;
// then 6 byte keys
for ( ; sub < end ; sub += 6 ) {
// get word position
//wx = Posdb::getWordPos(sub);
wx = (*((uint32_t *)(sub+3))) >> 6;
// mod with 4096
wx &= (RINGBUFSIZE-1);
// store it. 0 is legit.
ringBuf[wx] = m_minTermListIdx;
}
}
// now get query term closest to query term # m_minTermListIdx which
// is the query term # with the shortest termlist
// get closest term to m_minTermListIdx and the distance
logTrace(g_conf.m_logTracePosdb, "Ring buffer generation 2");
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// skip the man
if ( i == m_minTermListIdx ) {
continue;
}
// get the query term info
QueryTermInfo *qti = &qtibuf[i];
// if we have a negative term, skip it
if ( qti->m_bigramFlags[0] & (BF_NEGATIVE) ) {
// if its empty, that's good!
continue;
}
// store all his word positions into ring buffer AS WELL
for ( int32_t k = 0 ; k < qti->m_numMatchingSubLists ; k++ ) {
// scan that sublist and add word positions
char *sub = qti->m_matchingSubListSavedCursor[k];
// skip sublist if it's cursor is exhausted
if ( ! sub ) {
continue;
}
char *end = qti->m_matchingSubListCursor[k];
// add first key
//int32_t wx = Posdb::getWordPos(sub);
wx = (*((uint32_t *)(sub+3))) >> 6;
// mod with 4096
wx &= (RINGBUFSIZE-1);
// store it. 0 is legit.
ringBuf[wx] = i;
// skip first key
sub += 12;
// then 6 byte keys
for ( ; sub < end ; sub += 6 ) {
// get word position
//wx = Posdb::getWordPos(sub);
wx = (*((uint32_t *)(sub+3))) >> 6;
// mod with 4096
wx &= (RINGBUFSIZE-1);
// store it. 0 is legit.
ringBuf[wx] = i;
}
}
// reset
int32_t ourLastPos = -1;
int32_t hisLastPos = -1;
int32_t bestDist = 0x7fffffff;
// how far is this guy from the man?
for ( int32_t x = 0 ; x < (int32_t)RINGBUFSIZE ; ) {
// skip next 4 slots if all empty. fast?
if (*(uint32_t *)(ringBuf+x) == 0xffffffff) {
x+=4;
continue;
}
// skip if nobody
if ( ringBuf[x] == 0xff ) {
x++;
continue;
}
// get query term #
qt = ringBuf[x];
// if it's the man
if ( qt == m_minTermListIdx ) {
// record
hisLastPos = x;
// skip if we are not there yet
if ( ourLastPos == -1 ) {
x++;
continue;
}
// try distance fix
if ( x - ourLastPos < bestDist ) {
bestDist = x - ourLastPos;
}
}
// if us
else
if ( qt == i ) {
// record
ourLastPos = x;
// skip if he's not recorded yet
if ( hisLastPos == -1 ) {
x++;
continue;
}
// update
ourLastPos = x;
// check dist
if ( x - hisLastPos < bestDist ) {
bestDist = x - hisLastPos;
}
}
x++;
continue;
}
// compare last occurence of query term #x with our first occ.
// since this is a RING buffer
int32_t wrapDist = ourFirstPos + ((int32_t)RINGBUFSIZE-hisLastPos);
if ( wrapDist < bestDist ) {
bestDist = wrapDist;
}
// query distance
qdist = qpos[m_minTermListIdx] - qpos[i];
// compute it
float maxScore2 = getMaxPossibleScore(&qtibuf[i],
bestDist,
qdist,
&qtibuf[m_minTermListIdx]);
// -1 means it has inlink text so do not apply this constraint
// to this docid because it is too difficult because we
// sum up the inlink text
if ( maxScore2 < 0.0 ) {
continue;
}
// if any one of these terms have a max score below the
// worst score of the 10th result, then it can not win.
if ( maxScore2 <= minWinningScore ) {
docIdPtr += 6;
fail++;
skipToNextDocId = true;
break; // break out of numQueryTermsToHandle loop
}
if( !prefilterMaxPossibleScoreByDistance(qtibuf, qpos, minWinningScore) ) {
docIdPtr += 6;
prefiltBestDistMaxPossScoreFail++;
skipToNextDocId = true;
}
} // not m_sortByTermNum or m_sortByTermNumInt
@ -3159,7 +3203,7 @@ void PosdbTable::intersectLists10_r ( ) {
// Continue docIdPtr < docIdEnd loop
continue;
}
pass++;
prefiltBestDistMaxPossScorePass++;
} // !m_q->m_isBoolean
} // currPassNum == INTERSECT_SCORING
@ -3186,6 +3230,8 @@ void PosdbTable::intersectLists10_r ( ) {
}
}
//
// PERFORMANCE HACK:
//
@ -3196,7 +3242,7 @@ void PosdbTable::intersectLists10_r ( ) {
// all posdb keys for this docid should fit in here, the
// mini merge buf:
mptr = mbuf;
mptr = miniMergeBuf;
// . merge each set of sublists
// . like we merge a term's list with its two associated bigram
@ -3204,10 +3250,11 @@ void PosdbTable::intersectLists10_r ( ) {
// . and merge all the synonym lists for that term together as well.
// so if the term is 'run' we merge it with the lists for
// 'running' 'ran' etc.
logTrace(g_conf.m_logTracePosdb, "Merge sublists");
logTrace(g_conf.m_logTracePosdb, "Merge sublists into a single list per query term");
for ( int32_t j = 0 ; j < m_numQueryTermInfos ; j++ ) {
// get the query term info
QueryTermInfo *qti = &qtibuf[j];
// just use the flags from first term i guess
// NO! this loses the wikihalfstopbigram bit! so we gotta
// add that in for the key i guess the same way we add in
@ -3220,9 +3267,11 @@ void PosdbTable::intersectLists10_r ( ) {
// if its empty, that's good!
continue;
}
// the merged list for term #j is here:
miniMergedList [j] = mptr;
miniMergedList[j] = mptr;
bool isFirstKey = true;
// populate the nwp[] arrays for merging
int32_t nsub = 0;
for ( int32_t k = 0 ; k < qti->m_numMatchingSubLists ; k++ ) {
@ -3261,7 +3310,7 @@ void PosdbTable::intersectLists10_r ( ) {
bflags [j] = nwpFlags[0];
continue;
}
// . ok, merge the lists into a list in mbuf
// . ok, merge the lists into a list in miniMergeBuf
// . get the min of each list
bool currTermDone = false;
@ -3396,7 +3445,7 @@ void PosdbTable::intersectLists10_r ( ) {
}
// breach?
if ( mptr > mbuf + 300000 ) {
if ( mptr > miniMergeBuf + 300000 ) {
gbshutdownAbort(true);
}
@ -4074,9 +4123,7 @@ void PosdbTable::intersectLists10_r ( ) {
// advance to next docid
docIdPtr += 6;
logTrace(g_conf.m_logTracePosdb, "^ Now repeat for next docID");
}
} // docIdPtr < docIdEnd loop
if ( m_debug ) {
@ -4091,10 +4138,10 @@ void PosdbTable::intersectLists10_r ( ) {
if ( m_debug ) {
log(LOG_INFO, "posdb: # fail0 = %" PRId32" ", fail0 );
log(LOG_INFO, "posdb: # pass0 = %" PRId32" ", pass0 );
log(LOG_INFO, "posdb: # fail = %" PRId32" ", fail );
log(LOG_INFO, "posdb: # pass = %" PRId32" ", pass );
log(LOG_INFO, "posdb: # prefiltMaxPossScoreFail........: %" PRId32" ", prefiltMaxPossScoreFail );
log(LOG_INFO, "posdb: # prefiltMaxPossScorePass........: %" PRId32" ", prefiltMaxPossScorePass );
log(LOG_INFO, "posdb: # prefiltBestDistMaxPossScoreFail: %" PRId32" ", prefiltBestDistMaxPossScoreFail );
log(LOG_INFO, "posdb: # prefiltBestDistMaxPossScorePass: %" PRId32" ", prefiltBestDistMaxPossScorePass );
}
// get time now

@ -126,8 +126,11 @@ class PosdbTable {
return m_initialized;
}
// functions used by intersectlist
bool genDebugScoreInfo1(int32_t &numProcessed, int32_t &topCursor, QueryTermInfo *qtibuf);
bool genDebugScoreInfo2(DocIdScore &dcs, int32_t &lastLen, uint64_t &lastDocId, char siteRank, float score, int32_t intScore, char docLang);
bool advanceTermListCursors(const char *docIdPtr, QueryTermInfo *qtibuf);
bool prefilterMaxPossibleScoreByDistance(QueryTermInfo *qtibuf, const int32_t *qpos, float minWinningScore);
uint64_t m_docId;

@ -123,6 +123,7 @@ bool Rdb::init ( const char *dir ,
m_useHalfKeys = useHalfKeys;
m_isTitledb = isTitledb;
m_ks = keySize;
m_useIndexFile = useIndexFile;
m_inDumpLoop = false;
// set our id
@ -137,12 +138,6 @@ bool Rdb::init ( const char *dir ,
g_process.shutdownAbort(true);
}
if (m_rdbId == RDB_POSDB || m_rdbId == RDB2_POSDB2) {
m_useIndexFile = g_conf.m_noInMemoryPosdbMerge ? useIndexFile : false;
} else {
m_useIndexFile = useIndexFile;
}
// get page size
m_pageSize = GB_TFNDB_PAGE_SIZE;
if ( m_rdbId == RDB_POSDB ) m_pageSize = GB_INDEXDB_PAGE_SIZE;

2
Rdb.h

@ -239,6 +239,8 @@ public:
m_inDumpLoop = inDumpLoop;
}
// accessor: returns the current value of m_useIndexFile
bool isUseIndexFile() const { return m_useIndexFile; }
// accessor: returns the current value of m_inAddList
bool inAddList() const { return m_inAddList; }
// . you'll lose your data in this class if you call this

@ -1408,7 +1408,7 @@ bool RdbBase::attemptMerge( int32_t niceness, bool forceMergeAll, bool doLog , i
// then do not do the merge, we do not want to overwrite tfndb via
// RdbDump::updateTfndbLoop()
rdbid_t rdbId = getIdFromRdb ( m_rdb );
if ( rdbId == RDB_TITLEDB && g_titledb.m_rdb.isDumping() ) {
if ( rdbId == RDB_TITLEDB && g_titledb.getRdb()->isDumping() ) {
if ( doLog ) {
log( LOG_INFO, "db: Can not merge titledb while it is dumping." );
}

@ -133,26 +133,32 @@ void RdbList::set(char *list, int32_t listSize, char *alloc, int32_t allocSize,
verify_signature();
logTrace(g_conf.m_logTraceRdbList, "BEGIN. list=%p listSize=%" PRId32" alloc=%p allocSize=%" PRId32,
list, listSize, alloc, allocSize);
logTrace(g_conf.m_logTraceRdbList, "startKey=%s endKey=%s keySize=%hhu fixedDataSize=%" PRId32,
KEYSTR(startKey, keySize), KEYSTR(endKey, keySize), keySize, fixedDataSize);
// free and NULLify any old m_list we had to make room for our new list
freeList();
// set this first since others depend on it
m_ks = keySize;
// sanity check (happens when IndexReadInfo exhausts a list to Msg2)
if ( KEYCMP(startKey,endKey,m_ks) > 0 )
log(LOG_REMIND,"db: rdblist: set: startKey > endKey.");
if (KEYCMP(startKey, endKey, m_ks) > 0) {
log(LOG_WARN, "db: rdblist: set: startKey > endKey.");
}
// safety check
if ( fixedDataSize != 0 && useHalfKeys ) {
log(LOG_LOGIC,"db: rdblist: set: useHalfKeys 1 when "
"fixedDataSize not 0.");
if (fixedDataSize != 0 && useHalfKeys) {
log(LOG_LOGIC, "db: rdblist: set: useHalfKeys 1 when fixedDataSize not 0.");
useHalfKeys = false;
}
// got an extremely ugly corrupt stack core without this check
if ( m_list && m_listSize == 0 ){
log ( LOG_WARN, "rdblist: listSize of 0 but list pointer not "
"NULL!" );
if (m_list && m_listSize == 0) {
log(LOG_WARN, "rdblist: listSize of 0 but list pointer not NULL!");
m_list = NULL;
}
// set our list parms
m_list = list;
m_listSize = listSize;
@ -164,8 +170,11 @@ void RdbList::set(char *list, int32_t listSize, char *alloc, int32_t allocSize,
m_fixedDataSize = fixedDataSize;
m_ownData = ownData;
m_useHalfKeys = useHalfKeys;
// use this call now to set m_listPtr and m_listPtrHi based on m_list
resetListPtr();
logTrace(g_conf.m_logTraceRdbList, "END");
}
// like above but uses 0/maxKey for startKey/endKey
@ -976,9 +985,6 @@ bool RdbList::removeBadData_r ( ) {
int RdbList::printPosdbList() {
logf(LOG_DEBUG, "%s:%s: BEGIN",__FILE__,__func__);
// save
char *oldp = m_listPtr;
const char *oldphi = m_listPtrHi;
@ -1074,7 +1080,6 @@ int RdbList::printPosdbList() {
m_listPtr = oldp;
m_listPtrHi = oldphi;
logf(LOG_DEBUG, "%s:%s: END",__FILE__,__func__);
return 0;
}
@ -1084,9 +1089,6 @@ int RdbList::printList() {
return printPosdbList();
}
logf(LOG_DEBUG, "%s:%s: BEGIN",__FILE__,__func__);
//log("m_list=%" PRId32,(int32_t)m_list);
// save
char *oldp = m_listPtr;
const char *oldphi = m_listPtrHi;
@ -1120,7 +1122,6 @@ int RdbList::printList() {
m_listPtr = oldp;
m_listPtrHi = oldphi;
logf(LOG_DEBUG, "%s:%s: END",__FILE__,__func__);
return 0;
}
@ -1570,11 +1571,6 @@ bool RdbList::posdbConstrain(const char *startKey, char *endKey, int32_t minRecS
}
// write the full key back into "p"
KEYSET(p, k, 18);
} else if (p[0] & 0x02) {
// write the key back 6 bytes
p -= 6;
KEYSET(p, k, 18);
}
@ -1755,8 +1751,7 @@ void RdbList::merge_r(RdbList **lists, int32_t numLists, const char *startKey, c
// did they call prepareForMerge()?
if ( m_mergeMinListSize == -1 ) {
log(LOG_LOGIC,"db: rdblist: merge_r: prepareForMerge() not "
"called. ignoring error and returning emtpy list.");
log(LOG_LOGIC,"db: rdblist: merge_r: prepareForMerge() not called. ignoring error and returning emtpy list.");
// this happens if we nuke doledb during a merge of it. it is just bad timing
return;
// save state and dump core, sigBadHandler will catch this
@ -1770,8 +1765,8 @@ void RdbList::merge_r(RdbList **lists, int32_t numLists, const char *startKey, c
// warning msg
if ( m_listPtr != m_listEnd ) {
log( LOG_LOGIC, "db: rdblist: merge_r: warning. merge not storing at end of list for %s.",
getDbnameFromId( ( uint8_t ) rdbId ) );
log(LOG_LOGIC, "db: rdblist: merge_r: warning. merge not storing at end of list for %s.",
getDbnameFromId((uint8_t)rdbId));
}
// set our key range
@ -1782,8 +1777,6 @@ void RdbList::merge_r(RdbList **lists, int32_t numLists, const char *startKey, c
// deletes all the urls then does a dump of just negative keys.
// so let's comment it out for now
if ( KEYCMP(m_startKey,m_endKey,m_ks)!=0 && KEYNEG(m_endKey) ) {
// log(LOG_LOGIC,"db: rdblist: merge_r: Illegal endKey for "
// "merging rdb=%s. fixing.",getDbnameFromId(rdbId));
// make it legal so it will be read first NEXT time
KEYDEC(m_endKey,m_ks);
}
@ -1814,6 +1807,13 @@ void RdbList::merge_r(RdbList **lists, int32_t numLists, const char *startKey, c
return;
}
// check that we're not using index for other rdb file than posdb
Rdb* rdb = getRdbFromId(rdbId);
if (rdb->isUseIndexFile()) {
/// @todo ALC logic to use index file is not implemented for any rdb other than posdb. add it below if required
gbshutdownLogicError();
}
int32_t required = -1;
// . if merge not necessary, print a warning message.
// . caller should have just called constrain() then
@ -2133,6 +2133,7 @@ skip:
///////
bool RdbList::posdbMerge_r(RdbList **lists, int32_t numLists, const char *startKey, const char *endKey, int32_t minRecSizes, bool removeNegKeys) {
logTrace(g_conf.m_logTraceRdbList, "BEGIN");
// sanity
if (m_ks != sizeof(key144_t)) {
gbshutdownAbort(true);
@ -2264,6 +2265,7 @@ bool RdbList::posdbMerge_r(RdbList **lists, int32_t numLists, const char *startK
// . continue if tie, so we get the oldest first
// . treat negative and positive keys as identical for this
if (ss < 0) {
logTrace(g_conf.m_logTraceRdbList, "ss < 0. continue");
continue;
}
@ -2271,9 +2273,12 @@ bool RdbList::posdbMerge_r(RdbList **lists, int32_t numLists, const char *startK
// and minPtrBase/Lo/Hi was a negative key! so this is
// the annihilation. skip the positive key.
if (ss == 0) {
logTrace(g_conf.m_logTraceRdbList, "ss == 0. skip");
goto skip;
}
logTrace(g_conf.m_logTraceRdbList, "new min i=%" PRId32, i);
// we got a new min
minPtrBase = ptrs [i];
minPtrLo = loKeys[i];
@ -2283,6 +2288,7 @@ bool RdbList::posdbMerge_r(RdbList **lists, int32_t numLists, const char *startK
// ignore if negative i guess, just skip it
if (removeNegKeys && (minPtrBase[0] & 0x01) == 0x00) {
logTrace(g_conf.m_logTraceRdbList, "removeNegKeys. skip");
goto skip;
}
@ -2293,11 +2299,13 @@ bool RdbList::posdbMerge_r(RdbList **lists, int32_t numLists, const char *startK
if (m_listPtrHi && cmp_6bytes_equal(minPtrHi, m_listPtrHi)) {
if (m_listPtrLo && cmp_6bytes_equal(minPtrLo, m_listPtrLo)) {
// 6-byte entry
logTrace(g_conf.m_logTraceRdbList, "store 6-byte key");
memcpy(new_listPtr, minPtrBase, 6);
new_listPtr += 6;
*pp |= 0x06; //turn on both compression bits
} else {
// 12-byte entry
logTrace(g_conf.m_logTraceRdbList, "store 12-byte key");
memcpy(new_listPtr, minPtrBase, 6);
new_listPtr += 6;
memcpy(new_listPtr, minPtrLo, 6);
@ -2307,6 +2315,7 @@ bool RdbList::posdbMerge_r(RdbList **lists, int32_t numLists, const char *startK
}
} else {
// 18-byte entry
logTrace(g_conf.m_logTraceRdbList, "store 18-byte key");
memcpy(new_listPtr, minPtrBase, 6);
new_listPtr += 6;
memcpy(new_listPtr, minPtrLo, 6);
@ -2336,11 +2345,14 @@ skip:
// is new key 6 bytes? then do not touch hi/lo ptrs
if ( ptrs[mini][0] & 0x04 ) {
// no-op
logTrace(g_conf.m_logTraceRdbList, "new 6-byte key");
} else if ( ptrs[mini][0] & 0x02 ) {
// is new key 12 bytes?
logTrace(g_conf.m_logTraceRdbList, "new 12-byte key");
memcpy(loKeys[mini], ptrs[mini] + 6, 6);
} else {
// is new key 18 bytes? full key.
logTrace(g_conf.m_logTraceRdbList, "new 18-byte key");
memcpy(hiKeys[mini], ptrs[mini] + 12, 6);
memcpy(loKeys[mini], ptrs[mini] + 6, 6);
}
@ -2348,6 +2360,7 @@ skip:
//
// REMOVE THE LIST at mini
//
logTrace(g_conf.m_logTraceRdbList, "remove list at mini=%" PRId32, mini);
// otherwise, remove him from array
for (int32_t i = mini; i < numLists - 1; i++) {
@ -2375,6 +2388,7 @@ skip:
// return now if we're empty... all our recs annihilated?
if (m_listSize <= 0) {
logTrace(g_conf.m_logTraceRdbList, "END. no more list");
return true;
}
@ -2410,6 +2424,7 @@ skip:
if (g_conf.m_logTraceRdbList) {
printList();
}
logTrace(g_conf.m_logTraceRdbList, "END. Less than requested");
return true;
}
@ -2419,6 +2434,7 @@ skip:
if (g_conf.m_logTraceRdbList) {
printList();
}
logTrace(g_conf.m_logTraceRdbList, "END. No more list");
return true;
}
@ -2449,6 +2465,7 @@ skip:
printList();
}
logTrace(g_conf.m_logTraceRdbList, "END. Done");
return true;
}

@ -115,7 +115,7 @@ public:
int32_t getAllocSize() const { return m_allocSize; }
void setAllocSize(int32_t allocSize) { m_allocSize = allocSize; }
int32_t getFixedDataSize() { return m_fixedDataSize; }
int32_t getFixedDataSize() const { return m_fixedDataSize; }
void setFixedDataSize(int32_t fixedDataSize) { m_fixedDataSize = fixedDataSize; }
// . merge_r() sets m_lastKey for the list it merges the others into
@ -135,7 +135,7 @@ public:
bool isLastKeyValid() const { return m_lastKeyIsValid; }
void setLastKeyIsValid(bool lastKeyIsValid) { m_lastKeyIsValid = lastKeyIsValid; }
bool getOwnData() { return m_ownData; }
bool getOwnData() const { return m_ownData; }
// if you don't want data to be freed on destruction then don't own it
void setOwnData(bool ownData) { m_ownData = ownData; }

@ -1196,13 +1196,10 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
// these guys always use a collnum of 0
bool doCollRecCheck = true;
if ( !strcmp(m_dbname,"catdb") ) doCollRecCheck = false;
if ( !strcmp(m_dbname,"statsdb") ) doCollRecCheck = false;
if ( !strcmp(m_dbname,"indexdb") ) useHalfKeys = true;
if ( !strcmp(m_dbname,"datedb" ) ) useHalfKeys = true;
if ( !strcmp(m_dbname,"tfndb" ) ) useHalfKeys = true;
if ( !strcmp(m_dbname,"linkdb" ) ) useHalfKeys = true;
bool isTitledb = false;

@ -525,11 +525,11 @@ bool Rebalance::gotList ( ) {
KEYINC ( m_nextKey , ks );
}
if ( ! m_msg4a.addMetaList( &m_posMetaList, m_collnum, this, doneAddingMetaWrapper, MAX_NICENESS, rdb->getRdbId(), -1 ) ) { // shard override, not!
if (!m_msg4a.addMetaList(&m_posMetaList, m_collnum, this, doneAddingMetaWrapper, rdb->getRdbId(), -1)) { // shard override, not!
++m_blocked;
}
if ( ! m_msg4b.addMetaList( &m_negMetaList, m_collnum, this, doneAddingMetaWrapper, MAX_NICENESS, rdb->getRdbId(), myShard ) ) { // shard override, not!
if (!m_msg4b.addMetaList(&m_negMetaList, m_collnum, this, doneAddingMetaWrapper, rdb->getRdbId(), myShard)) { // shard override, not!
++m_blocked;
}

@ -1157,11 +1157,11 @@ bool Repair::gotScanRecList ( ) {
m_nextTitledbKey = next;
*/
// get the docid
//int64_t dd = g_titledb.getDocIdFromKey(&m_nextTitledbKey);
//int64_t dd = Titledb::getDocIdFromKey(&m_nextTitledbKey);
// inc it
//dd++;
// re-make key
//m_nextTitledbKey = g_titledb.makeFirstTitleRecKey ( dd );
//m_nextTitledbKey = Titledb::makeFirstTitleRecKey ( dd );
// advance one if positive, must always start on a neg
if ( (m_nextTitledbKey.n0 & 0x01) == 0x01 )
m_nextTitledbKey += (uint32_t)1;
@ -1209,7 +1209,7 @@ bool Repair::gotScanRecList ( ) {
// nextRec2:
key96_t tkey = m_titleRecList.getCurrentKey();
int64_t docId = g_titledb.getDocId ( &tkey );
int64_t docId = Titledb::getDocId ( &tkey );
// save it
//m_currentTitleRecKey = tkey;
@ -1372,7 +1372,7 @@ bool Repair::injectTitleRec ( ) {
// skip negative recs, first one should not be negative however
if ( ( k->n0 & 0x01 ) == 0x00 ) continue;
// get docid of that guy
int64_t dd = g_titledb.getDocId(k);
int64_t dd = Titledb::getDocId(k);
// compare that
if ( m_docId != dd ) continue;
// we got it!

@ -13,10 +13,6 @@
// hhhhhhhh hhhhhhhh tttttttt dddddddd t = tag type
// dddddddd dddddddd dddddddd ddddddHD d = docid
// DATA:
// SSSSSSSS SSSSSSSS SSSSSSSS SSSSSSSS S = SectionVote::m_score
// NNNNNNNN NNNNNNNN NNNNNNNN NNNNNNNN N = SectionVote::m_numSampled
// h: hash value. typically the lower 32 bits of the
// Section::m_contentHash64 vars. we
// do not need the full 64 bits because we have the 48 bit site hash included
@ -277,22 +273,4 @@ public:
class Section *m_firstSent;
};
// . SectionVote is the data portion of a sectiondb Rdb record; the key of
//   that record is basically the Section::m_tagHash (with a docId)
// . the Sections::m_nsvt and m_osvt hash tables contain SectionVotes
//   as their data value and use a tagHash key as well
// . declared packed with 4-byte alignment, so the two floats below are laid
//   out back to back; do not reorder or add members without checking every
//   reader/writer of the record
class SectionVote {
public:
// . seems like addVote*() always uses a score of 1.0
// . seems to be a weight used when setting Section::m_votesFor[Not]Dup
// . not sure if we really use this now
float m_score;
// . how many times does this tagHash occur in this doc?
// . this eliminates the need for the SV_UNIQUE section type
// . this is not used for tags of type contenthash or taghash
// . seems like pastdate and futuredate and eurdatefmt
// are the only vote types that actually really use this...
// . NOTE(review): stored as a float even though it is described as a
//   count -- confirm before changing the type
float m_numSampled;
} __attribute__((packed, aligned(4)));
#endif // GB_SECTIONS_H

@ -1674,7 +1674,7 @@ bool updateSiteListBuf ( collnum_t collnum ,
SpiderColl *sc = g_spiderCache.getSpiderColl ( cr->m_collnum );
// sanity. if in use we should not even be here
if ( sc->m_msg4x.m_inUse ) {
if ( sc->m_msg4x.isInUse() ) {
log( LOG_WARN, "basic: trying to update site list while previous update still outstanding.");
g_errno = EBADENGINEER;
return true;
@ -2005,7 +2005,7 @@ bool updateSiteListBuf ( collnum_t collnum ,
// use spidercoll to contain this msg4 but if in use it
// won't be able to be deleted until it comes back..
return sc->m_msg4x.addMetaList ( spiderReqBuf, sc->m_collnum, sc, doneAddingSeedsWrapper, MAX_NICENESS, RDB_SPIDERDB );
return sc->m_msg4x.addMetaList(spiderReqBuf, sc->m_collnum, sc, doneAddingSeedsWrapper, RDB_SPIDERDB);
}
// . Spider.cpp calls this to see if a url it wants to spider is
@ -4139,7 +4139,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
static int32_t getFakeIpForUrl2(Url *url2) {
// make the probable docid
int64_t probDocId = g_titledb.getProbableDocId ( url2 );
int64_t probDocId = Titledb::getProbableDocId ( url2 );
// make one up, like we do in PageReindex.cpp
int32_t firstIp = (probDocId & 0xffffffff);
return firstIp;
@ -4154,7 +4154,7 @@ bool SpiderRequest::setFromAddUrl(const char *url) {
// reset it
reset();
// make the probable docid
int64_t probDocId = g_titledb.getProbableDocId ( url );
int64_t probDocId = Titledb::getProbableDocId ( url );
// make one up, like we do in PageReindex.cpp
int32_t firstIp = (probDocId & 0xffffffff);

@ -3174,9 +3174,8 @@ bool SpiderColl::scanListForWinners ( ) {
// mdw: for testing take this out!
if ( m_totalBytesScanned < 25000 ) maxWinners = 1;
// sanity. make sure read is somewhat hefty for our
// maxWinners=1 thing
if ( (int32_t)SR_READ_SIZE < 500000 ) { g_process.shutdownAbort(true); }
// sanity. make sure read is somewhat hefty for our maxWinners=1 thing
static_assert(SR_READ_SIZE >= 500000, "ensure read size is big enough");
// only compare to min winner in tree if tree is full
if ( m_winnerTree.getNumUsedNodes() >= maxWinners ) {

@ -1327,10 +1327,6 @@ bool SpiderLoop::spiderUrl9 ( SpiderRequest *sreq ,
// shortcut
int64_t lockKeyUh48 = makeLockTableKey ( sreq );
//uint64_t lockKey ;
//lockKey = g_titledb.getFirstProbableDocId(sreq->m_probDocId);
//lockKey = g_titledb.getFirstProbableDocId(sreq->m_probDocId);
// . now that we have to use msg12 to see if the thing is locked
// to avoid spidering it.. (see comment in above function)
// we often try to spider something we are already spidering. that

276
Tagdb.cpp

@ -16,7 +16,6 @@
#include "GbMutex.h"
#include "ScopedLock.h"
static void gotMsg0ReplyWrapper ( void *state );
static HashTableX s_ht;
static bool s_initialized = false;
@ -1204,9 +1203,27 @@ static bool s_cacheInitialized = false;
static RdbCache s_cache;
static GbMutex s_cacheInitializedMutex;
// Construct an idle Msg8a: nothing bound, nothing outstanding.
// . every pointer and scalar member gets a definite value in the initializer
//   list, instead of only zeroing the two request/reply counters in the body
// . the list is written in the order the members are declared in the class so
//   it matches the actual initialization order
// . m_collnum starts at -1 and m_errno at 0
Msg8a::Msg8a()
  : m_url(NULL),
    m_collnum(-1),
    m_callback(NULL),
    m_state(NULL),
    // m_msg0s, m_siteStartKey and m_siteEndKey are intentionally not listed;
    // they are left to their own default initialization
    m_niceness(0),
    m_dom(NULL),
    m_hostEnd(NULL),
    m_p(NULL),
    m_requests(0),
    m_replies(0),
    m_doneLaunching(false),
    m_mtx(),
    m_errno(0),
    m_tagRec(NULL),
    m_state2(NULL),
    m_state3(NULL)
{
}
Msg8a::~Msg8a ( ) {
@ -1378,43 +1395,9 @@ struct Msg8aState {
bool Msg8a::launchGetRequests ( ) {
// clear it
g_errno = 0;
bool tryDomain = false;
loop:
// return true if nothing to launch
if ( m_doneLaunching )
return (m_requests == m_replies);
// don't bother if already got an error
if ( m_errno )
return (m_requests == m_replies);
// limit max to 5ish
if (m_requests >= MAX_TAGDB_REQUESTS)
return (m_requests == m_replies);
// take a breath
QUICKPOLL(m_niceness);
key128_t startKey ;
key128_t endKey ;
if ( tryDomain ) {
startKey = g_tagdb.makeDomainStartKey ( m_url );
endKey = g_tagdb.makeDomainEndKey ( m_url );
log( LOG_DEBUG, "tagdb: looking up domain tags for %.*s", m_url->getDomainLen(), m_url->getDomain() );
}
else {
// usually the site is the hostname but sometimes it is like
// "www.last.fm/user/breendaxx/"
startKey = m_siteStartKey;
endKey = m_siteEndKey;
log( LOG_DEBUG, "tagdb: looking up site tags for %s", m_url->getUrl() );
}
// initialize cache
ScopedLock sl(s_cacheInitializedMutex);
ScopedLock sl_cache(s_cacheInitializedMutex);
if ( !s_cacheInitialized ) {
int64_t maxCacheSize = g_conf.m_tagRecCacheSize;
int64_t maxCacheNodes = ( maxCacheSize / 200 );
@ -1422,104 +1405,120 @@ bool Msg8a::launchGetRequests ( ) {
s_cacheInitialized = true;
s_cache.init( maxCacheSize, -1, true, maxCacheNodes, false, "tagreccache", false, 16, 16, -1 );
}
sl.unlock();
sl_cache.unlock();
// get the next mcast
Msg0 *m = &m_msg0s[m_requests];
//get tag for url and then domain
for(int getLoop = 0; getLoop<1; getLoop++) {
// and the list
RdbList *listPtr = &m_tagRec->m_lists[m_requests];
key128_t startKey;
key128_t endKey;
// try to get from cache
RdbCacheLock rcl(s_cache);
if ( s_cache.getList( m_collnum, (char*)&startKey, (char*)&startKey, listPtr, true,
g_conf.m_tagRecCacheMaxAge, true) ) {
// got from cache
log( LOG_DEBUG, "tagdb: got key=%s from cache", KEYSTR(&startKey, sizeof(startKey)) );
if(getLoop==1) {
startKey = g_tagdb.makeDomainStartKey ( m_url );
endKey = g_tagdb.makeDomainEndKey ( m_url );
log( LOG_DEBUG, "tagdb: looking up domain tags for %.*s", m_url->getDomainLen(), m_url->getDomain() );
} else {
// usually the site is the hostname but sometimes it is like
// "www.last.fm/user/breendaxx/"
startKey = m_siteStartKey;
endKey = m_siteEndKey;
rcl.unlock();
m_requests++;
m_replies++;
} else {
rcl.unlock();
// bias based on the top 64 bits which is the hash of the "site" now
int32_t shardNum = getShardNum ( RDB_TAGDB , &startKey );
Host *firstHost ;
// if niceness 0 can't pick noquery host.
// if niceness 1 can't pick nospider host.
firstHost = g_hostdb.getLeastLoadedInShard ( shardNum , m_niceness );
int32_t firstHostId = firstHost->m_hostId;
Msg8aState *state = NULL;
try {
state = new Msg8aState(this, startKey, endKey, m_requests);
} catch (...) {
g_errno = ENOMEM;
log(LOG_WARN, "tagdb: unable to allocate memory for Msg8aState");
return false;
}
mnew(state, sizeof(*state), "msg8astate");
// . launch this request, even if to ourselves
// . TODO: just use msg0!!
bool status = m->getList ( firstHostId , // hostId
0 , // ip
0 , // port
0 , // maxCacheAge
false , // addToCache
RDB_TAGDB ,
m_collnum ,
listPtr ,
(char *) &startKey ,
(char *) &endKey ,
10000000 , // minRecSizes
state , // state
gotMsg0ReplyWrapper ,
m_niceness ,
true , // error correction?
true , // include tree?
true , // doMerge?
firstHostId , // firstHostId
0 , // startFileNum
-1 , // numFiles
msg0_getlist_infinite_timeout );// timeout
// error?
if ( status && g_errno ) {
// g_errno should be set, we had an error
m_errno = g_errno;
return (m_requests == m_replies);
log( LOG_DEBUG, "tagdb: looking up site tags for %s", m_url->getUrl() );
}
// successfully launched
m_requests++;
// get the next mcast
Msg0 *m = &m_msg0s[m_requests];
// if we got a reply instantly
if ( status ) {
// and the list
RdbList *listPtr = &m_tagRec->m_lists[m_requests];
// try to get from cache
RdbCacheLock rcl(s_cache);
if ( s_cache.getList( m_collnum, (char*)&startKey, (char*)&startKey, listPtr, true,
g_conf.m_tagRecCacheMaxAge, true) ) {
// got from cache
log( LOG_DEBUG, "tagdb: got key=%s from cache", KEYSTR(&startKey, sizeof(startKey)) );
rcl.unlock();
ScopedLock sl(m_mtx);
m_requests++;
m_replies++;
} else {
rcl.unlock();
// bias based on the top 64 bits which is the hash of the "site" now
int32_t shardNum = getShardNum ( RDB_TAGDB , &startKey );
Host *firstHost ;
// if niceness 0 can't pick noquery host.
// if niceness 1 can't pick nospider host.
firstHost = g_hostdb.getLeastLoadedInShard ( shardNum , m_niceness );
int32_t firstHostId = firstHost->m_hostId;
Msg8aState *state = NULL;
try {
state = new Msg8aState(this, startKey, endKey, m_requests);
} catch (...) {
g_errno = m_errno = ENOMEM;
log(LOG_WARN, "tagdb: unable to allocate memory for Msg8aState");
break;
}
mnew(state, sizeof(*state), "msg8astate");
// . launch this request, even if to ourselves
// . TODO: just use msg0!!
bool status = m->getList ( firstHostId , // hostId
0 , // ip
0 , // port
0 , // maxCacheAge
false , // addToCache
RDB_TAGDB ,
m_collnum ,
listPtr ,
(char *) &startKey ,
(char *) &endKey ,
10000000 , // minRecSizes
state , // state
gotMsg0ReplyWrapper ,
m_niceness ,
true , // error correction?
true , // include tree?
true , // doMerge?
firstHostId , // firstHostId
0 , // startFileNum
-1 , // numFiles
msg0_getlist_infinite_timeout );// timeout
// error?
if ( status && g_errno ) {
// g_errno should be set, we had an error
m_errno = g_errno;
break;
}
ScopedLock sl(m_mtx);
// successfully launched
m_requests++;
// if we got a reply instantly
if ( status ) {
m_replies++;
}
}
}
if ( ! tryDomain ) {
tryDomain = true;
goto loop;
}
ScopedLock sl(m_mtx);
//
// no more looping!
//
// i don't think we need to loop any more because we got all the
// tags for this hostname. then the lower bits of the Tag key
// corresponds to the actual SITE hash. so we gotta filter those
// out i guess after we read the whole list.
//
m_doneLaunching = true;
return (m_requests == m_replies);
if(m_requests == m_replies)
return true; // all requests done
else
return false; // some requests weren't immediate
}
static void gotMsg0ReplyWrapper ( void *state ) {
void Msg8a::gotMsg0ReplyWrapper ( void *state ) {
Msg8aState *msg8aState = (Msg8aState*)state;
Msg8a *msg8a = msg8aState->m_msg8a;
@ -1529,9 +1528,6 @@ static void gotMsg0ReplyWrapper ( void *state ) {
mdelete( msg8aState, sizeof(*msg8aState), "msg8astate" );
delete msg8aState;
// we got one
msg8a->m_replies++;
// error?
if ( g_errno ) {
msg8a->m_errno = g_errno;
@ -1548,21 +1544,23 @@ static void gotMsg0ReplyWrapper ( void *state ) {
s_cache.addList( msg8a->m_collnum, (char*)&startKey, list);
}
// launchGetRequests() returns false if still waiting for replies...
if ( ! msg8a->launchGetRequests() ) {
return;
ScopedLock sl(msg8a->m_mtx);
msg8a->m_replies++;
if(msg8a->m_doneLaunching && msg8a->m_requests==msg8a->m_replies) {
sl.unlock();
// got all the replies
msg8a->gotAllReplies();
// set g_errno for the callback
if ( msg8a->m_errno ) {
g_errno = msg8a->m_errno;
}
// call callback
msg8a->m_callback ( msg8a->m_state );
}
// get all the replies
msg8a->gotAllReplies();
// set g_errno for the callback
if ( msg8a->m_errno ) {
g_errno = msg8a->m_errno;
}
// otherwise, call callback
msg8a->m_callback ( msg8a->m_state );
}
// get the TagRec from the reply

@ -10,6 +10,7 @@
#include "Loop.h"
#include "SafeBuf.h"
#include "Msg0.h"
#include "GbMutex.h"
// . Tag::m_type is this if its a dup in the TagRec
// . so if www.xyz.com has one tag and xyz.com has another, then
@ -214,9 +215,12 @@ class Msg8a {
bool getTagRec( Url *url, collnum_t collnum, int32_t niceness, void *state, void (*callback)( void * ),
TagRec *tagRec );
private:
bool launchGetRequests();
void gotAllReplies ( ) ;
static void gotMsg0ReplyWrapper(void *);
// some specified input
Url *m_url;
@ -238,13 +242,15 @@ class Msg8a {
int32_t m_requests;
int32_t m_replies;
char m_doneLaunching;
bool m_doneLaunching;
GbMutex m_mtx;
int32_t m_errno;
// we set this for the caller
TagRec *m_tagRec;
public:
// hack for MsgE
void *m_state2;
void *m_state3;

@ -13,8 +13,6 @@
#include "TitleRecVersion.h"
#include "Rdb.h"
#include "Url.h"
#include "Conf.h"
#include "Xml.h"
// new key format:
// . <docId> - 38 bits
@ -22,15 +20,11 @@
// . <delBit> - 1 bit
class Titledb {
public:
public:
// reset rdb
void reset();
bool verify ( char *coll );
//bool addColl ( char *coll, bool doVerify = true );
bool verify(char *coll);
// init m_rdb
bool init ();
@ -38,12 +32,20 @@ class Titledb {
// init secondary/rebuild titledb
bool init2 ( int32_t treeMem ) ;
Rdb* getRdb() { return &m_rdb; }
// . this is an estimate of the number of docs in the WHOLE db network
// . we assume each group/cluster has about the same # of docs as us
int64_t getGlobalNumDocs() {
return m_rdb.getNumTotalRecs() * (int64_t)g_hostdb.m_numShards;
}
// . get the probable docId from a url/coll
// . it's "probable" because it may not be the actual docId because
// in the case of a collision we pick a nearby docId that is
// different but guaranteed to be in the same group/cluster, so you
// can be assured the top 32 bits of the docId will be unchanged
uint64_t getProbableDocId ( Url *url , bool mask = true ) {
static uint64_t getProbableDocId(const Url *url, bool mask = true) {
uint64_t probableDocId = hash64b(url->getUrl(),0);
// Linkdb::getUrlHash() does not mask it
if ( mask ) probableDocId = probableDocId & DOCID_MASK;
@ -59,14 +61,14 @@ class Titledb {
}
// a different way to do it
uint64_t getProbableDocId ( const char *url ) {
static uint64_t getProbableDocId(const char *url) {
Url u;
u.set( url );
return getProbableDocId ( &u );
return getProbableDocId(&u);
}
// a different way to do it
uint64_t getProbableDocId(const char *url,const char *dom,int32_t domLen) {
static uint64_t getProbableDocId(const char *url, const char *dom, int32_t domLen) {
uint64_t probableDocId = hash64b(url,0) &
DOCID_MASK;
// clear bits 6-13 because we want to put the domain hash there
@ -80,73 +82,56 @@ class Titledb {
}
// turn off the last 6 bits
uint64_t getFirstProbableDocId ( int64_t d ) {
return d & 0xffffffffffffffc0LL; }
static uint64_t getFirstProbableDocId(int64_t d) {
return d & 0xffffffffffffffc0ULL;
}
// turn on the last 6 bits for the end docId
uint64_t getLastProbableDocId ( int64_t d ) {
return d | 0x000000000000003fLL; }
static uint64_t getLastProbableDocId(int64_t d) {
return d | 0x000000000000003fULL;
}
// . the top NUMDOCIDBITs of "key" are the docId
// . we use the top X bits of the keys to partition the records
// . using the top bits to partition allows us to keep keys that
// are near each other (euclidean metric) in the same partition
int64_t getDocIdFromKey ( key96_t *key ) {
uint64_t docId;
docId = ((uint64_t)key->n1)<<(NUMDOCIDBITS - 32);
docId|= key->n0 >>(64-(NUMDOCIDBITS-32));
// Extracts the docId from a titledb key: all of key->n1 forms the high part
// and the top (NUMDOCIDBITS - 32) bits of key->n0 form the low part.
static int64_t getDocIdFromKey(const key96_t *key) {
	// how many docId bits are taken from the n0 word
	const int bitsFromN0 = NUMDOCIDBITS - 32;
	const uint64_t highPart = static_cast<uint64_t>(key->n1) << bitsFromN0;
	const uint64_t lowPart = key->n0 >> (64 - bitsFromN0);
	return highPart | lowPart;
}
int64_t getDocId ( key96_t *key ) { return getDocIdFromKey(key); }
int64_t getDocIdFromKey ( key96_t key ) {
return getDocIdFromKey(&key);}
uint8_t getDomHash8FromDocId (int64_t d) {
return (d & ~0xffffffffffffc03fULL) >> 6; }
static int64_t getDocId(const key96_t *key) { return getDocIdFromKey(key); }
int64_t getUrlHash48 ( key96_t *k ) {
return ((k->n0 >> 10) & 0x0000ffffffffffffLL); }
// Returns bits 6..13 of docId "d" as an 8-bit value (the low 6 bits are
// skipped, the next 8 bits are returned).
static uint8_t getDomHash8FromDocId (int64_t d) {
	const uint64_t shifted = static_cast<uint64_t>(d) >> 6;
	return static_cast<uint8_t>(shifted & 0xffULL);
}
// . dptr is a char ptr to the docid
// . used by IndexTable2.cpp
// . "dptr" is pointing into a 6-byte indexdb key
// . see IndexTable2.cpp, grep for gbmemcpy() to see
// how the docid is parsed out of this key (or see
// Indexdb.h)
// . return ((*((uint16_t *)dptr)) >> 8) & 0xff; }
uint8_t getDomHash8 ( uint8_t *dptr ) { return dptr[1]; }
static int64_t getUrlHash48 ( key96_t *k ) {
return ((k->n0 >> 10) & 0x0000ffffffffffffLL);
}
// does this key/docId/url have its titleRec stored locally?
bool isLocal ( int64_t docId );
bool isLocal ( Url *url ) {
return isLocal ( getProbableDocId(url) ); }
bool isLocal ( key96_t key ) {
return isLocal (getDocIdFromKey(&key));}
static bool isLocal(int64_t docId);
Rdb *getRdb() { return &m_rdb; }
static bool isLocal(Url *url) {
return isLocal(getProbableDocId(url));
}
// . make the key of a TitleRec from a docId
// . remember to set the low bit so it's not a delete
// . hi bits are set in the key
key96_t makeKey ( int64_t docId, int64_t uh48, bool isDel );
static key96_t makeKey(int64_t docId, int64_t uh48, bool isDel);
key96_t makeFirstKey ( int64_t docId ) {
return makeKey ( docId , 0, true ); }
// Builds the key for "docId" with a url hash of 0 and isDel set to true.
// NOTE(review): presumably the lowest key for this docId; confirm against
// makeKey().
static key96_t makeFirstKey(int64_t docId) {
	const int64_t minUrlHash48 = 0;
	return makeKey(docId, minUrlHash48, true);
}
key96_t makeLastKey ( int64_t docId ) {
return makeKey ( docId , 0xffffffffffffLL, false ); }
// . this is an estimate of the number of docs in the WHOLE db network
// . we assume each group/cluster has about the same # of docs as us
int64_t getGlobalNumDocs ( ) {
return m_rdb.getNumTotalRecs()*
(int64_t)g_hostdb.m_numShards;}
int32_t getLocalNumDocs () { return m_rdb.getNumTotalRecs(); }
int32_t getNumDocsInMem () { return m_rdb.getNumUsedNodes(); }
int32_t getMemUsed () { return m_rdb.getTreeMemOccupied(); }
// Builds the key for "docId" with all 48 url-hash bits set and isDel false.
// NOTE(review): presumably the highest key for this docId; confirm against
// makeKey().
static key96_t makeLastKey(int64_t docId) {
	const int64_t maxUrlHash48 = 0xffffffffffffLL;
	return makeKey(docId, maxUrlHash48, false);
}
private:
// holds binary format title entries
Rdb m_rdb;
};

@ -203,8 +203,7 @@ int32_t TopTree::getHighNode ( ) {
bool TopTree::addNode ( TopNode *t , int32_t tnn ) {
// respect the dom hashes
//uint8_t domHash = g_titledb.getDomHash8((uint8_t*)t->m_docIdPtr);
uint8_t domHash = g_titledb.getDomHash8FromDocId(t->m_docId);
uint8_t domHash = Titledb::getDomHash8FromDocId(t->m_docId);
// if vcount is satisfied, only add if better score than tail
if ( m_vcount >= m_docsWanted ) {
@ -449,9 +448,7 @@ bool TopTree::addNode ( TopNode *t , int32_t tnn ) {
//if ( getNext(tn) == -1 ) gbshutdownLogicError();
// get the min node
TopNode *t = &m_nodes[tn];
// get its docid ptr
//uint8_t domHash2 = g_titledb.getDomHash8((ui)t->m_docIdPtr);
uint8_t domHash2 = g_titledb.getDomHash8FromDocId(t->m_docId);
uint8_t domHash2 = Titledb::getDomHash8FromDocId(t->m_docId);
// . also must delete from m_t2
// . make the key
key96_t k;

1128
XmlDoc.cpp

File diff suppressed because it is too large Load Diff

109
XmlDoc.h

@ -361,7 +361,7 @@ public:
char *getIsPermalink ( ) ;
char *getIsUrlPermalinkFormat ( ) ;
char *getIsRSS ( ) ;
char *getIsSiteMap ( ) ;
bool *getIsSiteMap ( ) ;
class Xml *getXml ( ) ;
uint8_t *getLangVector ( ) ;
uint8_t *getLangId ( ) ;
@ -693,62 +693,59 @@ public:
// validity flags. on reset() all these are set to false.
char m_VALIDSTART;
// DO NOT add validity flags above this line!
char m_metaListValid;
char m_addedSpiderRequestSizeValid;
char m_addedSpiderReplySizeValid;
char m_addedStatusDocSizeValid;
char m_downloadStartTimeValid;
char m_siteValid;
char m_startTimeValid;
char m_currentUrlValid;
char m_useTimeAxisValid;
char m_timeAxisUrlValid;
char m_firstUrlValid;
char m_firstUrlHash48Valid;
char m_firstUrlHash64Valid;
char m_lastUrlValid;
char m_docIdValid;
char m_availDocIdValid;
char m_tagRecValid;
char m_robotsTxtLenValid;
char m_tagRecDataValid;
char m_newTagBufValid;
char m_rootTitleBufValid;
char m_filteredRootTitleBufValid;
char m_titleBufValid;
char m_fragBufValid;
char m_isRobotsTxtUrlValid;
char m_wordSpamBufValid;
char m_finalSummaryBufValid;
bool m_metaListValid;
bool m_addedSpiderRequestSizeValid;
bool m_addedSpiderReplySizeValid;
bool m_addedStatusDocSizeValid;
bool m_downloadStartTimeValid;
bool m_siteValid;
bool m_startTimeValid;
bool m_currentUrlValid;
bool m_useTimeAxisValid;
bool m_timeAxisUrlValid;
bool m_firstUrlValid;
bool m_firstUrlHash48Valid;
bool m_firstUrlHash64Valid;
bool m_lastUrlValid;
bool m_docIdValid;
bool m_availDocIdValid;
bool m_tagRecValid;
bool m_robotsTxtLenValid;
bool m_tagRecDataValid;
bool m_newTagBufValid;
bool m_rootTitleBufValid;
bool m_filteredRootTitleBufValid;
bool m_titleBufValid;
bool m_fragBufValid;
bool m_isRobotsTxtUrlValid;
bool m_wordSpamBufValid;
bool m_finalSummaryBufValid;
char m_hopCountValid;
char m_isInjectingValid;
char m_isImportingValid;
char m_metaListCheckSum8Valid;
char m_contentValid;
char m_filteredContentValid;
char m_charsetValid;
char m_langVectorValid;
char m_langIdValid;
char m_datedbDateValid;
char m_isRSSValid;
char m_isSiteMapValid;
char m_isContentTruncatedValid;
char m_xmlValid;
char m_linksValid;
char m_wordsValid;
char m_bitsValid;
char m_bits2Valid;
char m_posValid;
char m_phrasesValid;
char m_sectionsValid;
bool m_hopCountValid;
bool m_isInjectingValid;
bool m_isImportingValid;
bool m_metaListCheckSum8Valid;
bool m_contentValid;
bool m_filteredContentValid;
bool m_charsetValid;
bool m_langVectorValid;
bool m_langIdValid;
bool m_isRSSValid;
bool m_isSiteMapValid;
bool m_isContentTruncatedValid;
bool m_xmlValid;
bool m_linksValid;
bool m_wordsValid;
bool m_bitsValid;
bool m_bits2Valid;
bool m_posValid;
bool m_phrasesValid;
bool m_sectionsValid;
char m_imageDataValid;
char m_imagesValid;
char m_msge0Valid;
char m_msge1Valid;
char m_sreqValid;
char m_srepValid;
bool m_imageDataValid;
bool m_imagesValid;
bool m_sreqValid;
bool m_srepValid;
bool m_ipValid;
bool m_firstIpValid;
@ -851,7 +848,7 @@ public:
bool m_exactContentHash64Valid;
bool m_jpValid;
char m_isSiteMap;
bool m_isSiteMap;
// shadows
char m_isRSS2;

@ -142,7 +142,7 @@ static bool storeTerm ( const char *s ,
// . hash terms that are sharded by TERMID not DOCID!!
//
// . returns false and sets g_errno on error
// . these terms are stored in indexdb/datedb, but all terms with the same
// . these terms are stored in indexdb, but all terms with the same
// termId reside in one and only one group. whereas normally the records
// are split based on docid and every group gets 1/nth of the termlist.
// . we do this "no splitting" so that only one disk seek is required, and
@ -289,155 +289,143 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
// . returns -1 if blocked, returns NULL and sets g_errno on error
// . "sr" is the tagdb Record
// . "ws" store the terms for PageParser.cpp display
char *XmlDoc::hashAll ( HashTableX *table ) {
char *XmlDoc::hashAll(HashTableX *table) {
logTrace(g_conf.m_logTraceXmlDoc, "BEGIN");
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: BEGIN", __FILE__,__func__, __LINE__);
setStatus ( "hashing document" );
setStatus("hashing document");
if ( m_allHashed ) return (char *)1;
if (m_allHashed) {
return (char *)1;
}
// sanity checks
if ( table->m_ks != 18 ) { g_process.shutdownAbort(true); }
if ( table->m_ds != 4 ) { g_process.shutdownAbort(true); }
if (table->m_ks != 18 || table->m_ds != 4) {
g_process.shutdownAbort(true);
}
if ( m_wts && m_wts->m_ks != 12 ) { g_process.shutdownAbort(true); }
// ptr to term = 4 + score = 4 + ptr to sec = 4
if ( m_wts && m_wts->m_ds!=sizeof(TermDebugInfo)){g_process.shutdownAbort(true);}
if (m_wts && (m_wts->m_ks != 12 || m_wts->m_ds != sizeof(TermDebugInfo))) {
g_process.shutdownAbort(true);
}
uint8_t *ct = getContentType();
if ( ! ct )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getContentType failed", __FILE__,__func__, __LINE__);
if (!ct) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getContentType failed");
return NULL;
}
// BR 20160127: Never index JSON and XML content
if ( *ct == CT_JSON || *ct == CT_XML )
{
if (*ct == CT_JSON || *ct == CT_XML) {
// For XML (JSON should not get here as it should be filtered out during spidering)
// store the URL as the only thing in posdb so we are able to find it, and
// eventually ban it.
if ( !hashUrl( table, true ) ) // urlOnly (skip IP and term generation)
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashUrl failed", __FILE__,__func__, __LINE__);
if (!hashUrl(table, true)) { // urlOnly (skip IP and term generation)
logTrace(g_conf.m_logTraceXmlDoc, "END, hashUrl failed");
return NULL;
}
m_allHashed = true;
return (char *)1;
}
unsigned char *hc = (unsigned char *)getHopCount();
if ( ! hc || hc == (void *)-1 )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getHopCount returned -1", __FILE__,__func__, __LINE__);
if (!hc || hc == (void *)-1) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getHopCount returned -1");
return (char *)hc;
}
// need this for hashing
HashTableX *cnt = getCountTable();
if ( ! cnt )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getCountTable failed", __FILE__,__func__, __LINE__);
if (!cnt) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getCountTable failed");
return (char *)cnt;
}
if ( cnt == (void *)-1 ) { g_process.shutdownAbort(true); }
if (cnt == (void *)-1) {
g_process.shutdownAbort(true);
}
// and this
Links *links = getLinks();
if ( ! links )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getLinks failed", __FILE__,__func__, __LINE__);
if (!links) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getLinks failed");
return (char *)links;
}
if ( links == (Links *)-1 ) { g_process.shutdownAbort(true); }
if (links == (Links *)-1) {
g_process.shutdownAbort(true);
}
char *wordSpamVec = getWordSpamVec();
if (!wordSpamVec)
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getWordSpamVec failed", __FILE__,__func__, __LINE__);
return (char *)wordSpamVec;
if (!wordSpamVec) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getWordSpamVec failed");
return wordSpamVec;
}
if (wordSpamVec == (void *)-1) {
g_process.shutdownAbort(true);
}
if (wordSpamVec==(void *)-1) {g_process.shutdownAbort(true);}
char *fragVec = getFragVec();//m_fragBuf.getBufStart();
if ( ! fragVec )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getFragVec failed", __FILE__,__func__, __LINE__);
return (char *)fragVec;
char *fragVec = getFragVec();
if (!fragVec) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getFragVec failed");
return fragVec;
}
if (fragVec == (void *)-1) {
g_process.shutdownAbort(true);
}
if ( fragVec == (void *)-1 ) { g_process.shutdownAbort(true); }
// why do we need this?
if ( m_wts ) {
uint8_t *lv = getLangVector();
if ( ! lv )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getLangVector failed", __FILE__,__func__, __LINE__);
if (!lv) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getLangVector failed");
return (char *)lv;
}
if ( lv == (void *)-1 ) { g_process.shutdownAbort(true); }
if (lv == (void *)-1) {
g_process.shutdownAbort(true);
}
}
CollectionRec *cr = getCollRec();
if ( ! cr )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getCollRec failed", __FILE__,__func__, __LINE__);
if ( ! cr ) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getCollRec failed");
return NULL;
}
// do not repeat this if the cachedb storage call blocks
m_allHashed = true;
// reset distance cursor
m_dist = 0;
if ( ! hashContentType ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashContentType failed", __FILE__,__func__, __LINE__);
return NULL;
}
if ( ! hashUrl ( table, false ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashUrl failed", __FILE__,__func__, __LINE__);
return NULL;
}
if ( ! hashLanguage ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashLanguage failed", __FILE__,__func__, __LINE__);
return NULL;
}
if ( ! hashCountry ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashCountry failed", __FILE__,__func__, __LINE__);
if (!hashContentType(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashContentType failed");
return NULL;
}
// BR 20160106 removed: if ( ! hashAds ( table ) ) return NULL;
// BR 20160106 removed: if ( ! hashSubmitUrls ( table ) ) return NULL;
if ( ! hashIsAdult ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashIsAdult failed", __FILE__,__func__, __LINE__);
if (!hashUrl(table, false)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashUrl failed");
return NULL;
}
// has gbhasthumbnail:1 or 0
// BR 20160106 removed: if ( ! hashImageStuff ( table ) ) return NULL;
if (!hashLanguage(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLanguage failed");
return NULL;
}
if (!hashCountry(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashCountry failed");
return NULL;
}
if (!hashIsAdult(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashIsAdult failed");
return NULL;
}
// now hash the terms sharded by termid and not docid here since they
// just set a special bit in posdb key so Rebalance.cpp can work.
// this will hash the content checksum which we need for deduping
// which we use for diffbot custom crawls as well.
if ( ! hashNoSplit ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashNoSplit failed", __FILE__,__func__, __LINE__);
if (!hashNoSplit(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashNoSplit failed");
return NULL;
}
@ -445,16 +433,13 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
// global index now, so don't need this... 9/28/2014
// stop indexing xml docs
bool indexDoc = true;
if ( ! cr->m_indexBody ) indexDoc = false;
bool indexDoc = cr->m_indexBody;
// global index unless this is a json object in which case it is
// hashed above in the call to hashJSON(). this will decrease disk
// usage by about half, posdb* files are pretty big.
if ( ! indexDoc )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, !indexDoc", __FILE__,__func__, __LINE__);
if (!indexDoc) {
logTrace(g_conf.m_logTraceXmlDoc, "END, !indexDoc");
return (char *)1;
}
@ -464,9 +449,8 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
// hash the body of the doc first so m_dist is 0 to match
// the rainbow display of sections
if ( ! hashBody2 (table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashBody2 failed", __FILE__,__func__, __LINE__);
if (!hashBody2(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashBody2 failed");
return NULL;
}
@ -476,18 +460,16 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
// repeated title terms because we do not do spam detection
// on them. thus, we need to hash these first before anything
// else. give them triple the body score
if ( ! hashTitle ( table ))
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashTitle failed", __FILE__,__func__, __LINE__);
if (!hashTitle(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashTitle failed");
return NULL;
}
// . hash the keywords tag, limited to first 2k of them so far
// . hash above the neighborhoods so the neighborhoods only index
// what is already in the hash table
if ( ! hashMetaKeywords(table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaKeywords failed", __FILE__,__func__, __LINE__);
if (!hashMetaKeywords(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaKeywords failed");
return NULL;
}
@ -495,18 +477,16 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
// we index the single words in the neighborhoods next, and
// we had songfacts.com coming up for the 'street light facts'
// query because it had a bunch of anomalous inlink text.
if ( ! hashIncomingLinkText(table,false,true))
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashIncomingLinkText failed", __FILE__,__func__, __LINE__);
if (!hashIncomingLinkText(table, false, true)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashIncomingLinkText failed");
return NULL;
}
// then the meta summary and description tags with half the score of
// the body, and only hash a term if was not already hashed above
// somewhere.
if ( ! hashMetaSummary(table) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaSummary failed", __FILE__,__func__, __LINE__);
if (!hashMetaSummary(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaSummary failed");
return NULL;
}
@ -514,68 +494,48 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
// BR 20160220
// Store value of meta tag "geo.placename" to help aid searches for
// location specific sites, e.g. 'Restaurant in London'
if ( ! hashMetaGeoPlacename(table) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaGeoPlacename failed", __FILE__,__func__, __LINE__);
if (!hashMetaGeoPlacename(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaGeoPlacename failed");
return NULL;
}
skip:
skip:
// this will only increment the scores of terms already in the table
// because neighborhoods are not technically in the document
// necessarily and we do not want to ruin our precision
if ( ! hashNeighborhoods ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashNeighborhoods failed", __FILE__,__func__, __LINE__);
if (!hashNeighborhoods(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashNeighborhoods failed");
return NULL;
}
if ( ! hashLinks ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashLinks failed", __FILE__,__func__, __LINE__);
return NULL;
}
if ( ! hashDateNumbers ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashDateNumbers failed", __FILE__,__func__, __LINE__);
return NULL;
}
if ( ! hashMetaTags ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaTags failed", __FILE__,__func__, __LINE__);
if (!hashLinks(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLinks failed");
return NULL;
}
if ( ! hashPermalink ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashPermaLink failed", __FILE__,__func__, __LINE__);
if (!hashDateNumbers(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashDateNumbers failed");
return NULL;
}
if (!hashMetaTags(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaTags failed");
return NULL;
}
if (!hashPermalink(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashPermaLink failed");
return NULL;
}
// hash gblang:de last for parsing consistency
if ( ! hashLanguageString ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashLanguageString failed", __FILE__,__func__, __LINE__);
if (!hashLanguageString(table)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLanguageString failed");
return NULL;
}
// . hash gbkeyword:gbmininlinks where the score is the inlink count
// . the inlink count can go from 1 to 255
// . an ip neighborhood can vote no more than once
// . this is in LinkInfo::hash
//if ( ! hashMinInlinks ( table , linkInfo ) ) return NULL;
// return true if we don't need to print parser info
//if ( ! m_pbuf ) return true;
// print out the table into g_bufPtr now if we need to
//table->print ( );
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, OK", __FILE__,__func__, __LINE__);
logTrace(g_conf.m_logTraceXmlDoc, "END, OK");
return (char *)1;
}
@ -640,7 +600,6 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
// only get content for <meta name=..> not <meta http-equiv=..>
int32_t tagLen;
char *tag = m_xml.getString ( i , "name" , &tagLen );
char *tptr = tag;
char tagLower[128];
int32_t j ;
int32_t code;
@ -697,13 +656,6 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
continue;
}
// . don't allow reserved names: site, url, suburl, link and ip
// . actually, the colon is included as part of those
// field names, so we really lucked out...!
// . index this converted tag name
tptr = tagLower;
// get the content
int32_t len;
char *s = m_xml.getString ( i , "content" , &len );
@ -742,22 +694,13 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
// NULL terminate the buffer
buf[len] = '\0';
// temp null term
char c = tptr[tagLen];
tptr[tagLen] = 0;
// BR 20160220
// Now index the wanted meta tags as normal text without prefix so they
// are used in user searches automatically.
// custom
//hi.m_prefix = tptr;
hi.m_prefix = NULL;
// desc is NULL, prefix will be used as desc
bool status = hashString ( buf,len,&hi );
// put it back
tptr[tagLen] = c;
// bail on error, g_errno should be set
if ( ! status ) return false;
@ -1088,7 +1031,7 @@ bool XmlDoc::hashLinksForLinkdb ( HashTableX *dt ) {
#endif
// set this key, it is the entire record
key224_t k;
k = g_linkdb.makeKey_uk ( linkeeSiteHash32 ,
k = Linkdb::makeKey_uk ( linkeeSiteHash32 ,
m_links.getLinkHash64(i) ,
spam , // link spam?
siteRank , // was quality
@ -1509,8 +1452,7 @@ bool XmlDoc::hashIncomingLinkText ( HashTableX *tt ,
// sanity check
if ( hashAnomalies == hashNonAnomalies ) { g_process.shutdownAbort(true); }
// display this note in page parser
const char *note = "hashing incoming link text";
// sanity
if ( ! m_linkInfo1Valid ) { g_process.shutdownAbort(true); }
@ -1531,8 +1473,6 @@ bool XmlDoc::hashIncomingLinkText ( HashTableX *tt ,
// brought the following code in from LinkInfo.cpp
//
int32_t noteLen = 0;
if ( note ) noteLen = strlen ( note );
// count "external" inlinkers
int32_t ecount = 0;
@ -1631,11 +1571,6 @@ bool XmlDoc::hashNeighborhoods ( HashTableX *tt ) {
//int32_t inlinks = *getSiteNumInlinks();
// HACK: to avoid having to pass a flag to TermTable, then to
// Words::hash(), Phrases::hash(), etc. just flip a bit in the
// table to make it not add anything unless it is already in there.
tt->m_addIffNotUnique = true;
// update hash parms
HashInfo hi;
hi.m_tt = tt;
@ -1647,9 +1582,6 @@ bool XmlDoc::hashNeighborhoods ( HashTableX *tt ) {
int32_t len = k->size_surroundingText - 1;
if ( ! hashString ( s, len, &hi ) ) return false;
// now turn it back off
tt->m_addIffNotUnique = false;
// get the next Inlink
goto loop;
}
@ -1992,7 +1924,7 @@ bool XmlDoc::hashSingleTerm( const char *s, int32_t slen, HashInfo *hi ) {
key144_t k;
g_posdb.makeKey ( &k ,
Posdb::makeKey ( &k ,
final,
0LL, // docid
0, // dist
@ -2355,7 +2287,7 @@ bool XmlDoc::hashWords3( HashInfo *hi, const Words *words, Phrases *phrases, Sec
// if using posdb
key144_t k;
g_posdb.makeKey ( &k ,
Posdb::makeKey ( &k ,
h ,
0LL,//docid
wposvec[i], // dist,
@ -2405,7 +2337,7 @@ bool XmlDoc::hashWords3( HashInfo *hi, const Words *words, Phrases *phrases, Sec
int64_t nah ;
nah = hash64Lower_utf8 ( wptrs[i], wlens[i]-2 );
if ( plen>0 ) nah = hash64 ( nah , prefixHash );
g_posdb.makeKey ( &k ,
Posdb::makeKey ( &k ,
nah,
0LL,//docid
wposvec[i], // dist,
@ -2462,7 +2394,7 @@ skipsingleword:
// hash with prefix
if ( plen > 0 ) ph2 = hash64 ( npid , prefixHash );
else ph2 = npid;
g_posdb.makeKey ( &k ,
Posdb::makeKey ( &k ,
ph2 ,
0LL,//docid
wposvec[i],//dist,
@ -2565,7 +2497,7 @@ bool XmlDoc::hashFieldMatchTerm ( char *val , int32_t vlen , HashInfo *hi ) {
// a prefix hash
// . use mostly fake value otherwise
key144_t k;
g_posdb.makeKey ( &k ,
Posdb::makeKey ( &k ,
ph2 ,
0,//docid
0,// word pos #
@ -2696,7 +2628,7 @@ bool XmlDoc::hashNumberForSortingAsInt32 ( int32_t n , HashInfo *hi , const char
// a prefix hash
// . use mostly fake value otherwise
key144_t k;
g_posdb.makeKey ( &k ,
Posdb::makeKey ( &k ,
ph2 ,
0,//docid
0,// word pos #
@ -2719,14 +2651,7 @@ bool XmlDoc::hashNumberForSortingAsInt32 ( int32_t n , HashInfo *hi , const char
false , // delkey?
hi->m_shardByTermId );
//int64_t final = hash64n("products.offerprice",0);
//int64_t prefix = hash64n("gbsortby",0);
//int64_t h64 = hash64 ( final , prefix);
//if ( ph2 == h64 )
// log("hey: got offer price");
// now set the float in that key
//g_posdb.setFloat ( &k , f );
g_posdb.setInt ( &k , n );
Posdb::setInt ( &k , n );
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
// so that we can b-step into a posdb list and make sure
@ -2736,11 +2661,11 @@ bool XmlDoc::hashNumberForSortingAsInt32 ( int32_t n , HashInfo *hi , const char
// key that has a float stored in it. then it will NOT
// set the siterank and langid bits which throw our sorting
// off!!
g_posdb.setAlignmentBit ( &k , 0 );
Posdb::setAlignmentBit ( &k , 0 );
// sanity
//float t = g_posdb.getFloat ( &k );
int32_t x = g_posdb.getInt ( &k );
//float t = Posdb::getFloat ( &k );
int32_t x = Posdb::getInt ( &k );
if ( x != n ) { g_process.shutdownAbort(true); }
HashTableX *dt = hi->m_tt;

@ -2844,7 +2844,7 @@ void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool
startKey.setMin();
endKey.setMax();
lastKey.setMin();
startKey = g_titledb.makeFirstKey ( docid );
startKey = Titledb::makeFirstKey ( docid );
// turn off threads
g_jobScheduler.disallow_new_jobs();
// get a meg at a time
@ -2909,7 +2909,7 @@ void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool
key96_t k = list.getCurrentKey();
char *rec = list.getCurrentRec();
int32_t recSize = list.getCurrentRecSize();
int64_t docId = g_titledb.getDocIdFromKey ( k );
int64_t docId = Titledb::getDocIdFromKey ( &k );
if ( k <= lastKey )
log("key out of order. "
"lastKey.n1=%" PRIx32" n0=%" PRIx64" "
@ -4292,8 +4292,8 @@ bool parseTest ( const char *coll, int64_t docId, const char *query ) {
// get a title rec
g_jobScheduler.disallow_new_jobs();
RdbList tlist;
key96_t startKey = g_titledb.makeFirstKey ( docId );
key96_t endKey = g_titledb.makeLastKey ( docId );
key96_t startKey = Titledb::makeFirstKey ( docId );
key96_t endKey = Titledb::makeLastKey ( docId );
// a niceness of 0 tells it to block until it gets results!!
Msg5 msg5;
@ -4722,7 +4722,7 @@ void dumpPosdb (const char *coll, int32_t startFileNum, int32_t numFiles, bool i
const char *dd = "";
if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)";
int64_t d = g_posdb.getDocId(&k);
uint8_t dh = g_titledb.getDomHash8FromDocId(d);
uint8_t dh = Titledb::getDomHash8FromDocId(d);
char *rec = list.getCurrentRec();
int32_t recSize = 18;
if ( rec[0] & 0x04 ) recSize = 6;
@ -4947,10 +4947,10 @@ void dumpLinkdb ( const char *coll,
if ( url ) {
Url u;
u.set( url, strlen( url ), true, false );
uint32_t h32 = u.getHostHash32();//g_linkdb.getUrlHash(&u)
uint32_t h32 = u.getHostHash32();
int64_t uh64 = hash64n(url,0);
startKey = g_linkdb.makeStartKey_uk ( h32 , uh64 );
endKey = g_linkdb.makeEndKey_uk ( h32 , uh64 );
startKey = Linkdb::makeStartKey_uk ( h32 , uh64 );
endKey = Linkdb::makeEndKey_uk ( h32 , uh64 );
}
// turn off threads
g_jobScheduler.disallow_new_jobs();
@ -5006,7 +5006,7 @@ void dumpLinkdb ( const char *coll,
// is it a delete?
const char *dd = "";
if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)";
int64_t docId = (int64_t)g_linkdb.getLinkerDocId_uk(&k);
int64_t docId = (int64_t)Linkdb::getLinkerDocId_uk(&k);
int32_t shardNum = getShardNum(RDB_LINKDB,&k);
printf("k=%s "
"linkeesitehash32=0x%08" PRIx32" "
@ -5022,16 +5022,16 @@ void dumpLinkdb ( const char *coll,
"shardNum=%" PRIu32" "
"%s\n",
KEYSTR(&k,sizeof(key224_t)),
(int32_t)g_linkdb.getLinkeeSiteHash32_uk(&k),
(int64_t)g_linkdb.getLinkeeUrlHash64_uk(&k),
(int32_t)g_linkdb.isLinkSpam_uk(&k),
(int32_t)g_linkdb.getLinkerSiteRank_uk(&k),
//hc,//g_linkdb.getLinkerHopCount_uk(&k),
iptoa((int32_t)g_linkdb.getLinkerIp_uk(&k)),
(int32_t)Linkdb::getLinkeeSiteHash32_uk(&k),
(int64_t)Linkdb::getLinkeeUrlHash64_uk(&k),
(int32_t)Linkdb::isLinkSpam_uk(&k),
(int32_t)Linkdb::getLinkerSiteRank_uk(&k),
//hc,//Linkdb::getLinkerHopCount_uk(&k),
iptoa((int32_t)Linkdb::getLinkerIp_uk(&k)),
docId,
(int32_t)g_linkdb.getDiscoveryDate_uk(&k),
(int32_t)g_linkdb.getLostDate_uk(&k),
(int32_t)g_linkdb.getLinkerSiteHash32_uk(&k),
(int32_t)Linkdb::getDiscoveryDate_uk(&k),
(int32_t)Linkdb::getLostDate_uk(&k),
(int32_t)Linkdb::getLinkerSiteHash32_uk(&k),
shardNum,
dd );
}
@ -5441,7 +5441,7 @@ int injectFile ( const char *filename , char *ips , const char *coll ) {
}
if ( startDocId != 0LL )
s_titledbKey = g_titledb.makeFirstKey(startDocId);
s_titledbKey = Titledb::makeFirstKey(startDocId);
s_endDocId = endDocId;
@ -5569,7 +5569,7 @@ void doInject ( int fd , void *state ) {
// turn off threads so this happens right away
g_jobScheduler.disallow_new_jobs();
key96_t endKey; //endKey.setMax();
endKey = g_titledb.makeFirstKey(s_endDocId);
endKey = Titledb::makeFirstKey(s_endDocId);
RdbList list;
Msg5 msg5;
const char *coll = "main";
@ -7160,7 +7160,7 @@ void countdomains( const char* coll, int32_t numRecs, int32_t verbosity, int32_t
key96_t k = list.getCurrentKey();
char *rec = list.getCurrentRec();
int32_t recSize = list.getCurrentRecSize();
int64_t docId = g_titledb.getDocId ( &k );
int64_t docId = Titledb::getDocId ( &k );
attempts++;
if ( k <= lastKey )

@ -928,7 +928,7 @@ bool Test::injectLoop ( ) {
m_sreq.m_domHash32 = fakeIp;
m_sreq.m_hostHash32 = fakeIp;
m_sreq.m_siteHash32 = fakeIp;
//m_sreq.m_probDocId = g_titledb.getProbableDocId( m_sreq.m_url );
//m_sreq.m_probDocId = Titledb::getProbableDocId( m_sreq.m_url );
// this crap is fake
m_sreq.m_isInjecting = 1;
// use test-spider subdir for storing pages and spider times?
@ -973,7 +973,6 @@ bool Test::injectLoop ( ) {
m_coll ,
NULL ,
injectedWrapper ,
MAX_NICENESS ,
RDB_SPIDERDB ) )
// return false if blocked
return false;

@ -165,9 +165,9 @@ int main ( int argc , char *argv[] ) {
printf("encoded: %s\n",dst);
// the probable docid
int64_t pd = g_titledb.getProbableDocId(&u);
int64_t pd = Titledb::getProbableDocId(&u);
printf("pdocid: %"UINT64"\n", pd );
printf("dom8: 0x%"XINT32"\n", (int32_t)g_titledb.getDomHash8FromDocId(pd) );
printf("dom8: 0x%"XINT32"\n", (int32_t)Titledb::getDomHash8FromDocId(pd) );
if ( u.isLinkLoop() ) printf("islinkloop: yes\n");
else printf("islinkloop: no\n");
int64_t hh64 = u.getHostHash64();

@ -10,6 +10,7 @@ static const char* makePosdbKey(char *key, int64_t termId, uint64_t docId, int32
}
TEST(RdbListTest, MergeTestPosdbEmptyAll) {
g_conf.m_logTraceRdbList = true;
// setup test
RdbList list1;
list1.set(NULL, 0, NULL, 0, 0, true, Posdb::getUseHalfKeys(), Posdb::getKeySize());
@ -32,6 +33,7 @@ TEST(RdbListTest, MergeTestPosdbEmptyAll) {
}
TEST(RdbListTest, MergeTestPosdbEmptyOne) {
g_conf.m_logTraceRdbList = true;
char key[MAX_KEY_BYTES];
// setup test
@ -83,6 +85,7 @@ TEST(RdbListTest, MergeTestPosdbEmptyOne) {
// verify that list order is from oldest to newest (last list will override first list)
TEST(RdbListTest, MergeTestPosdbVerifyListOrder) {
g_conf.m_logTraceRdbList = true;
char key[MAX_KEY_BYTES];
// setup test