mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-13 02:36:06 -04:00
Merge branch 'master' into nomerge2
Conflicts: Msg40.cpp Tagdb.cpp
This commit is contained in:
Clusterdb.h DailyMerge.cpp DailyMerge.h HashTableX.cpp HashTableX.h Hostdb.cpp Linkdb.cpp Linkdb.h Loop.cpp Msg0.cpp Msg20.cpp Msg22.cpp Msg3a.cpp Msg4.cpp Msg4.h Msg40.cpp Msg51.cpp Multicast.cpp Multicast.h PageAddUrl.cpp PageCrawlBot.cpp PageInject.cpp PageParser.cpp PageReindex.cpp Parms.cpp Posdb.cpp PosdbTable.cpp PosdbTable.h Rdb.cpp Rdb.h RdbBase.cpp RdbList.cpp RdbList.h RdbTree.cpp Rebalance.cpp Repair.cpp Sections.h Spider.cpp SpiderColl.cpp SpiderLoop.cpp Tagdb.cpp Tagdb.h Titledb.h TopTree.cpp XmlDoc.cpp XmlDoc.h XmlDoc_Indexing.cpp main.cpp
misc
test/unit
@ -28,9 +28,6 @@
|
||||
#define GB_CLUSTERDB_H
|
||||
|
||||
#include "Rdb.h"
|
||||
#include "Url.h"
|
||||
#include "Conf.h"
|
||||
#include "Titledb.h"
|
||||
|
||||
// these are now just TitleRec keys
|
||||
#define CLUSTER_REC_SIZE (sizeof(key96_t))
|
||||
@ -69,23 +66,19 @@ public:
|
||||
false, true ); }
|
||||
|
||||
// NOTE: THESE NOW USE THE REAL CLUSTERDB REC
|
||||
// // docId occupies the most significant bytes of the key
|
||||
// docId occupies the most significant bytes of the key
|
||||
// now docId occupies the bits after the first 23
|
||||
static int64_t getDocId ( const void *k ) {
|
||||
//int64_t docId = (k.n0) >> (32+24);
|
||||
//docId |= ( ((uint64_t)(k.n1)) << 8 );
|
||||
int64_t docId = (((const key96_t *)k)->n0) >> 35;
|
||||
docId |= ( ((uint64_t)(((const key96_t *)k)->n1)) << 29 );
|
||||
return docId;
|
||||
}
|
||||
|
||||
static uint32_t getSiteHash26 ( const char *r ) {
|
||||
//return g_titledb.getSiteHash ( (key_t *)r ); }
|
||||
return ((uint32_t)(((const key96_t*)r)->n0 >> 2) & 0x03FFFFFF);
|
||||
}
|
||||
|
||||
static uint32_t hasAdultContent ( const char *r ) {
|
||||
//return g_titledb.hasAdultContent ( *(key_t *)r ); }
|
||||
return ((uint32_t)(((const key96_t*)r)->n0 >> 34) & 0x00000001);
|
||||
}
|
||||
|
||||
|
@ -268,7 +268,7 @@ void DailyMerge::dailyMergeLoop ( ) {
|
||||
// ok, all trees are clear and dumped
|
||||
m_mergeMode = 5;
|
||||
// log it
|
||||
log("daily: Merging indexdb and datedb files.");
|
||||
log("daily: Merging indexdb files.");
|
||||
}
|
||||
|
||||
// start the merge
|
||||
|
@ -1,6 +1,6 @@
|
||||
// Copyright Gigablast, Inc. Apr 2008
|
||||
|
||||
// tight merge indexdb and datedb at the given time every day
|
||||
// tight merge indexdb at the given time every day
|
||||
|
||||
#ifndef GB_DAILYMERGE_H
|
||||
#define GB_DAILYMERGE_H
|
||||
|
@ -81,7 +81,6 @@ void HashTableX::reset ( ) {
|
||||
m_flags = NULL;
|
||||
m_numSlots = 0;
|
||||
m_numSlotsUsed = 0;
|
||||
m_addIffNotUnique = false;
|
||||
m_maskKeyOffset = 0;
|
||||
//m_useKeyMagic = false;
|
||||
// we should free it in reset()
|
||||
@ -621,3 +620,17 @@ int32_t HashTableX::getKeyChecksum32 () const {
|
||||
}
|
||||
return checksum;
|
||||
}
|
||||
|
||||
// print as text into sb for debugging
|
||||
void HashTableX::print() {
|
||||
for (int32_t i = 0; i < m_numSlots; i++) {
|
||||
// skip empty bucket
|
||||
if (!m_flags[i]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// get the key
|
||||
char *kp = (char *)getKeyFromSlot(i);
|
||||
logf(LOG_WARN, "key=%s", KEYSTR(kp, m_ks));
|
||||
}
|
||||
}
|
||||
|
12
HashTableX.h
12
HashTableX.h
@ -298,6 +298,9 @@ class HashTableX {
|
||||
|
||||
bool setTableSize ( int32_t numSlots , char *buf , int32_t bufSize );
|
||||
|
||||
// for debugging
|
||||
void print();
|
||||
|
||||
void disableWrites () { m_isWritable = false; }
|
||||
void enableWrites () { m_isWritable = true ; }
|
||||
bool m_isWritable;
|
||||
@ -318,18 +321,15 @@ class HashTableX {
|
||||
int32_t m_numSlotsUsed;
|
||||
uint32_t m_mask;
|
||||
|
||||
char m_doFree;
|
||||
bool m_doFree;
|
||||
char *m_buf;
|
||||
int32_t m_bufSize;
|
||||
|
||||
char m_useKeyMagic;
|
||||
bool m_useKeyMagic;
|
||||
|
||||
int32_t m_ks;
|
||||
int32_t m_ds;
|
||||
char m_allowDups;
|
||||
|
||||
// a flag used by XmlDoc.cpp
|
||||
bool m_addIffNotUnique;
|
||||
bool m_allowDups;
|
||||
|
||||
bool m_isSaving;
|
||||
bool m_needsSave;
|
||||
|
@ -1571,7 +1571,7 @@ uint32_t Hostdb::getShardNum(rdbid_t rdbId, const void *k) {
|
||||
return m_map [(*(uint16_t *)((char *)k + 26))>>3];
|
||||
}
|
||||
else if ( rdbId == RDB_TITLEDB || rdbId == RDB2_TITLEDB2 ) {
|
||||
uint64_t d = g_titledb.getDocId ( (key96_t *)k );
|
||||
uint64_t d = Titledb::getDocId ( (key96_t *)k );
|
||||
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
|
||||
}
|
||||
else if ( rdbId == RDB_SPIDERDB || rdbId == RDB2_SPIDERDB2 ) {
|
||||
|
34
Linkdb.cpp
34
Linkdb.cpp
@ -536,9 +536,9 @@ bool getLinkInfo ( SafeBuf *reqBuf ,
|
||||
//int32_t siteHash32 = hash32n ( req->ptr_site );
|
||||
// access different parts of linkdb depending on the "mode"
|
||||
if ( req->m_mode == MODE_SITELINKINFO )
|
||||
startKey = g_linkdb.makeStartKey_uk ( req->m_siteHash32 );
|
||||
startKey = Linkdb::makeStartKey_uk ( req->m_siteHash32 );
|
||||
else
|
||||
startKey = g_linkdb.makeStartKey_uk (req->m_siteHash32,
|
||||
startKey = Linkdb::makeStartKey_uk (req->m_siteHash32,
|
||||
req->m_linkHash64 );
|
||||
// what group has this linkdb list?
|
||||
uint32_t shardNum = getShardNum ( RDB_LINKDB, &startKey );
|
||||
@ -999,14 +999,14 @@ bool Msg25::doReadLoop ( ) {
|
||||
|
||||
// access different parts of linkdb depending on the "mode"
|
||||
if ( m_mode == MODE_SITELINKINFO ) {
|
||||
startKey = g_linkdb.makeStartKey_uk ( siteHash32 );
|
||||
endKey = g_linkdb.makeEndKey_uk ( siteHash32 );
|
||||
startKey = Linkdb::makeStartKey_uk ( siteHash32 );
|
||||
endKey = Linkdb::makeEndKey_uk ( siteHash32 );
|
||||
//log("linkdb: getlinkinfo: "
|
||||
// "site=%s sitehash32=%" PRIu32,site,siteHash32);
|
||||
}
|
||||
else {
|
||||
startKey = g_linkdb.makeStartKey_uk (siteHash32,m_linkHash64 );
|
||||
endKey = g_linkdb.makeEndKey_uk (siteHash32,m_linkHash64 );
|
||||
startKey = Linkdb::makeStartKey_uk (siteHash32,m_linkHash64 );
|
||||
endKey = Linkdb::makeEndKey_uk (siteHash32,m_linkHash64 );
|
||||
}
|
||||
|
||||
// resume from where we left off?
|
||||
@ -1329,13 +1329,13 @@ bool Msg25::sendRequests ( ) {
|
||||
// get the current key if list has more left
|
||||
key224_t key; m_list.getCurrentKey( &key );
|
||||
|
||||
itop = g_linkdb.getLinkerIp24_uk ( &key );
|
||||
ip32 = g_linkdb.getLinkerIp_uk ( &key );
|
||||
isLinkSpam = g_linkdb.isLinkSpam_uk ( &key );
|
||||
docId = g_linkdb.getLinkerDocId_uk ( &key );
|
||||
discovered = g_linkdb.getDiscoveryDate_uk(&key);
|
||||
itop = Linkdb::getLinkerIp24_uk ( &key );
|
||||
ip32 = Linkdb::getLinkerIp_uk ( &key );
|
||||
isLinkSpam = Linkdb::isLinkSpam_uk ( &key );
|
||||
docId = Linkdb::getLinkerDocId_uk ( &key );
|
||||
discovered = Linkdb::getDiscoveryDate_uk(&key);
|
||||
// is it expired?
|
||||
lostDate = g_linkdb.getLostDate_uk(&key);
|
||||
lostDate = Linkdb::getLostDate_uk(&key);
|
||||
// update this
|
||||
gbmemcpy ( &m_nextKey , &key , LDBKS );
|
||||
|
||||
@ -1347,15 +1347,15 @@ bool Msg25::sendRequests ( ) {
|
||||
// get the current key if list has more left
|
||||
key224_t key; m_list.getCurrentKey( &key );
|
||||
|
||||
itop = g_linkdb.getLinkerIp24_uk ( &key );
|
||||
ip32 = g_linkdb.getLinkerIp_uk ( &key );
|
||||
itop = Linkdb::getLinkerIp24_uk ( &key );
|
||||
ip32 = Linkdb::getLinkerIp_uk ( &key );
|
||||
|
||||
isLinkSpam = false;
|
||||
docId = g_linkdb.getLinkerDocId_uk ( &key );
|
||||
docId = Linkdb::getLinkerDocId_uk ( &key );
|
||||
|
||||
discovered = g_linkdb.getDiscoveryDate_uk(&key);
|
||||
discovered = Linkdb::getDiscoveryDate_uk(&key);
|
||||
// is it expired?
|
||||
lostDate = g_linkdb.getLostDate_uk(&key);
|
||||
lostDate = Linkdb::getLostDate_uk(&key);
|
||||
// update this
|
||||
gbmemcpy ( &m_nextKey , &key , LDBKS );
|
||||
|
||||
|
56
Linkdb.h
56
Linkdb.h
@ -159,16 +159,18 @@ bool getLinkInfo ( SafeBuf *reqBuf , // store msg25 request in here
|
||||
int32_t getSiteRank ( int32_t sni ) ;
|
||||
|
||||
class Linkdb {
|
||||
public:
|
||||
public:
|
||||
void reset();
|
||||
|
||||
bool init ( );
|
||||
bool init2 ( int32_t treeMem );
|
||||
bool verify ( char *coll );
|
||||
bool addColl ( char *coll, bool doVerify = true );
|
||||
bool init();
|
||||
bool init2(int32_t treeMem);
|
||||
|
||||
bool verify(char *coll);
|
||||
|
||||
Rdb *getRdb() { return &m_rdb; }
|
||||
|
||||
// this makes a "url" key
|
||||
key224_t makeKey_uk ( uint32_t linkeeSiteHash32 ,
|
||||
static key224_t makeKey_uk ( uint32_t linkeeSiteHash32 ,
|
||||
uint64_t linkeeUrlHash64 ,
|
||||
bool isLinkSpam ,
|
||||
unsigned char linkerSiteRank , // 0-15 i guess
|
||||
@ -182,7 +184,7 @@ class Linkdb {
|
||||
bool isDelete );
|
||||
|
||||
|
||||
key224_t makeStartKey_uk ( uint32_t linkeeSiteHash32 ,
|
||||
static key224_t makeStartKey_uk ( uint32_t linkeeSiteHash32 ,
|
||||
uint64_t linkeeUrlHash64 = 0LL ) {
|
||||
return makeKey_uk ( linkeeSiteHash32,
|
||||
linkeeUrlHash64,
|
||||
@ -198,7 +200,7 @@ class Linkdb {
|
||||
true); // is delete?
|
||||
}
|
||||
|
||||
key224_t makeEndKey_uk ( uint32_t linkeeSiteHash32 ,
|
||||
static key224_t makeEndKey_uk ( uint32_t linkeeSiteHash32 ,
|
||||
uint64_t linkeeUrlHash64 =
|
||||
0xffffffffffffffffLL ) {
|
||||
return makeKey_uk ( linkeeSiteHash32,
|
||||
@ -219,10 +221,11 @@ class Linkdb {
|
||||
// accessors for "url" keys in linkdb
|
||||
//
|
||||
|
||||
// Returns the linkee site hash of a linkdb "url" key: the key's n3 word
// shifted right by 32, narrowed to 32 bits.
static uint32_t getLinkeeSiteHash32_uk ( key224_t *key ) {
	return (uint32_t)( key->n3 >> 32 );
}
|
||||
|
||||
uint64_t getLinkeeUrlHash64_uk ( key224_t *key ) {
|
||||
static uint64_t getLinkeeUrlHash64_uk ( key224_t *key ) {
|
||||
uint64_t h = key->n3;
|
||||
h &= 0x00000000ffffffffLL;
|
||||
h <<= 15;
|
||||
@ -230,19 +233,19 @@ class Linkdb {
|
||||
return h;
|
||||
}
|
||||
|
||||
// Reports whether the link-spam bit (mask 0x1000000000000 in n2) is set
// in a linkdb "url" key. Returns 1 when set, 0 otherwise.
static char isLinkSpam_uk (key224_t *key ) {
	const bool spamBitSet = ( key->n2 & 0x1000000000000LL ) != 0;
	return spamBitSet;
}
|
||||
|
||||
// Returns the linker's site rank from a linkdb "url" key. The byte read
// from bits 40..47 of n2 is bitwise-complemented before it is returned.
static unsigned char getLinkerSiteRank_uk ( key224_t *k ) {
	const unsigned char stored = (k->n2 >> 40) & 0xff;
	// complement it back to recover the rank
	return (unsigned char)~stored;
}
|
||||
|
||||
int32_t getLinkerIp_uk ( key224_t *k ) {
|
||||
|
||||
static int32_t getLinkerIp_uk ( key224_t *k ) {
|
||||
uint32_t ip ;
|
||||
// the most significant part of the ip is the lower byte!!!
|
||||
ip = (uint32_t)((k->n2>>8)&0x00ffffff);
|
||||
@ -250,7 +253,7 @@ class Linkdb {
|
||||
return ip;
|
||||
}
|
||||
|
||||
void setIp32_uk ( void *k , uint32_t ip ) {
|
||||
static void setIp32_uk ( void *k , uint32_t ip ) {
|
||||
char *ips = (char *)&ip;
|
||||
char *ks = (char *)k;
|
||||
ks[16] = ips[3];
|
||||
@ -261,11 +264,11 @@ class Linkdb {
|
||||
|
||||
|
||||
// we are missing the lower byte, it will be zero
|
||||
int32_t getLinkerIp24_uk ( key224_t *k ) {
|
||||
static int32_t getLinkerIp24_uk ( key224_t *k ) {
|
||||
return (int32_t)((k->n2>>8)&0x00ffffff);
|
||||
}
|
||||
|
||||
int64_t getLinkerDocId_uk( key224_t *k ) {
|
||||
static int64_t getLinkerDocId_uk( key224_t *k ) {
|
||||
uint64_t d = k->n2 & 0xff;
|
||||
d <<= 30;
|
||||
d |= k->n1 >>34;
|
||||
@ -274,7 +277,7 @@ class Linkdb {
|
||||
|
||||
// . in days since jan 1, 2012 utc
|
||||
// . timestamp of jan 1, 2012 utc is 1325376000
|
||||
int32_t getDiscoveryDate_uk ( void *k ) {
|
||||
static int32_t getDiscoveryDate_uk ( void *k ) {
|
||||
uint32_t date = ((key224_t *)k)->n1 >> 18;
|
||||
date &= 0x00003fff;
|
||||
// if 0 return that
|
||||
@ -289,7 +292,7 @@ class Linkdb {
|
||||
|
||||
// . in days since jan 1, 2012 utc
|
||||
// . timestamp of jan 1, 2012 utc is 1325376000
|
||||
void setDiscoveryDate_uk ( void *k , int32_t date ) {
|
||||
static void setDiscoveryDate_uk ( void *k , int32_t date ) {
|
||||
// subtract jan 1 2012
|
||||
date -= LINKDBEPOCH;
|
||||
// convert into days
|
||||
@ -302,7 +305,7 @@ class Linkdb {
|
||||
((key224_t *)k)->n1 |= ((uint64_t)date) << 18;
|
||||
}
|
||||
|
||||
int32_t getLostDate_uk ( void *k ) {
|
||||
static int32_t getLostDate_uk ( void *k ) {
|
||||
uint32_t date = ((key224_t *)k)->n1 >> 2;
|
||||
date &= 0x00003fff;
|
||||
// if 0 return that
|
||||
@ -317,7 +320,7 @@ class Linkdb {
|
||||
|
||||
// . in days since jan 1, 2012 utc
|
||||
// . timestamp of jan 1, 2012 utc is 1325376000
|
||||
void setLostDate_uk ( void *k , int32_t date ) {
|
||||
static void setLostDate_uk ( void *k , int32_t date ) {
|
||||
// subtract jan 1 2012
|
||||
date -= LINKDBEPOCH;
|
||||
// convert into days
|
||||
@ -330,18 +333,15 @@ class Linkdb {
|
||||
((key224_t *)k)->n1 |= ((uint64_t)date) << 2;
|
||||
}
|
||||
|
||||
uint32_t getLinkerSiteHash32_uk( void *k ) {
|
||||
static uint32_t getLinkerSiteHash32_uk( void *k ) {
|
||||
uint32_t sh32 = ((key224_t *)k)->n1 & 0x00000003;
|
||||
sh32 <<= 30;
|
||||
sh32 |= ((key224_t *)k)->n0 >> 2;
|
||||
return sh32;
|
||||
}
|
||||
|
||||
Rdb *getRdb() { return &m_rdb; }
|
||||
|
||||
private:
|
||||
Rdb m_rdb;
|
||||
|
||||
private:
|
||||
Rdb m_rdb;
|
||||
};
|
||||
|
||||
extern class Linkdb g_linkdb;
|
||||
|
1
Loop.cpp
1
Loop.cpp
@ -208,6 +208,7 @@ bool Loop::registerSleepCallback ( int32_t tick, void *state, void (* callback)(
|
||||
return false;
|
||||
}
|
||||
|
||||
ScopedLock sl(m_slotMutex);
|
||||
if ( tick < m_minTick ) {
|
||||
m_minTick = tick;
|
||||
}
|
||||
|
2
Msg0.cpp
2
Msg0.cpp
@ -775,7 +775,7 @@ void gotListWrapper ( void *state , RdbList *listb , Msg5 *msg5xx ) {
|
||||
totalOrigLinks++;
|
||||
// get rec
|
||||
char *rec = list->getCurrentRec();
|
||||
int32_t ip32 = g_linkdb.getLinkerIp_uk((key224_t *)rec );
|
||||
int32_t ip32 = Linkdb::getLinkerIp_uk((key224_t *)rec );
|
||||
// same as one before?
|
||||
if ( ip32 == lastIp32 &&
|
||||
// are we the last rec? include that for
|
||||
|
@ -141,7 +141,7 @@ bool Msg20::getSummary ( Msg20Request *req ) {
|
||||
if ( req->m_docId >= 0 )
|
||||
shardNum = g_hostdb.getShardNumFromDocId(req->m_docId);
|
||||
else {
|
||||
int64_t pdocId = g_titledb.getProbableDocId(req->ptr_ubuf);
|
||||
int64_t pdocId = Titledb::getProbableDocId(req->ptr_ubuf);
|
||||
shardNum = getShardNumFromDocId(pdocId);
|
||||
}
|
||||
|
||||
@ -193,7 +193,7 @@ bool Msg20::getSummary ( Msg20Request *req ) {
|
||||
int64_t probDocId = req->m_docId;
|
||||
// i think reference pages just pass in a url to get the summary
|
||||
if ( probDocId < 0 && req->size_ubuf )
|
||||
probDocId = g_titledb.getProbableDocId ( req->ptr_ubuf );
|
||||
probDocId = Titledb::getProbableDocId ( req->ptr_ubuf );
|
||||
if ( probDocId < 0 ) {
|
||||
log("query: Got bad docid/url combo.");
|
||||
probDocId = 0;
|
||||
@ -363,7 +363,7 @@ static void handleRequest20(UdpSlot *slot, int32_t netnice) {
|
||||
log(LOG_DEBUG, "query: Summary cache miss");
|
||||
|
||||
// if it's not stored locally that's an error
|
||||
if ( req->m_docId >= 0 && ! g_titledb.isLocal ( req->m_docId ) ) {
|
||||
if ( req->m_docId >= 0 && ! Titledb::isLocal ( req->m_docId ) ) {
|
||||
log(LOG_WARN, "query: Got msg20 request for non-local docId %" PRId64, req->m_docId);
|
||||
log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
|
||||
g_udpServer.sendErrorReply ( slot , ENOTLOCAL );
|
||||
|
22
Msg22.cpp
22
Msg22.cpp
@ -134,7 +134,7 @@ bool Msg22::getTitleRec ( Msg22Request *r ,
|
||||
|
||||
// if no docid provided, use probable docid
|
||||
if ( ! docId )
|
||||
docId = g_titledb.getProbableDocId ( url );
|
||||
docId = Titledb::getProbableDocId ( url );
|
||||
|
||||
// get groupId from docId
|
||||
uint32_t shardNum = getShardNumFromDocId ( docId );
|
||||
@ -359,8 +359,8 @@ void handleRequest22 ( UdpSlot *slot , int32_t netnice ) {
|
||||
// so try the range
|
||||
if ( r->m_getAvailDocIdOnly ) {
|
||||
int64_t pd = r->m_docId;
|
||||
int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
|
||||
int64_t d2 = g_titledb.getLastProbableDocId ( pd );
|
||||
int64_t d1 = Titledb::getFirstProbableDocId ( pd );
|
||||
int64_t d2 = Titledb::getLastProbableDocId ( pd );
|
||||
// sanity - bad url with bad subdomain?
|
||||
if ( pd < d1 || pd > d2 ) { g_process.shutdownAbort(true); }
|
||||
// make sure we get a decent sample in titledb then in
|
||||
@ -388,9 +388,9 @@ void handleRequest22 ( UdpSlot *slot , int32_t netnice ) {
|
||||
delete ( st );
|
||||
return;
|
||||
}
|
||||
int64_t pd = g_titledb.getProbableDocId (r->m_url,dom,dlen);
|
||||
int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
|
||||
int64_t d2 = g_titledb.getLastProbableDocId ( pd );
|
||||
int64_t pd = Titledb::getProbableDocId (r->m_url,dom,dlen);
|
||||
int64_t d1 = Titledb::getFirstProbableDocId ( pd );
|
||||
int64_t d2 = Titledb::getLastProbableDocId ( pd );
|
||||
// sanity - bad url with bad subdomain?
|
||||
if ( pd < d1 || pd > d2 ) { g_process.shutdownAbort(true); }
|
||||
// store these
|
||||
@ -406,8 +406,8 @@ void handleRequest22 ( UdpSlot *slot , int32_t netnice ) {
|
||||
// since it would base it on startFileNum and numFiles
|
||||
key96_t cacheKey ; cacheKey.n1 = 0; cacheKey.n0 = r->m_docId;
|
||||
// make titledb keys
|
||||
key96_t startKey = g_titledb.makeFirstKey ( st->m_docId1 );
|
||||
key96_t endKey = g_titledb.makeLastKey ( st->m_docId2 );
|
||||
key96_t startKey = Titledb::makeFirstKey ( st->m_docId1 );
|
||||
key96_t endKey = Titledb::makeLastKey ( st->m_docId2 );
|
||||
|
||||
// . load the list of title recs from disk now
|
||||
// . our file range should be solid
|
||||
@ -468,7 +468,7 @@ void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {
|
||||
// set probable docid
|
||||
int64_t pd = 0LL;
|
||||
if ( r->m_url[0] ) {
|
||||
pd = g_titledb.getProbableDocId(r->m_url);
|
||||
pd = Titledb::getProbableDocId(r->m_url);
|
||||
if ( pd != st->m_pd ) {
|
||||
log("db: crap probable docids do not match! u=%s",
|
||||
r->m_url);
|
||||
@ -500,7 +500,7 @@ void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {
|
||||
if ( ( k->n0 & 0x01 ) == 0x00 ) continue;
|
||||
|
||||
// get docid of that titlerec
|
||||
int64_t dd = g_titledb.getDocId(k);
|
||||
int64_t dd = Titledb::getDocId(k);
|
||||
|
||||
if ( r->m_getAvailDocIdOnly ) {
|
||||
// make sure our available docids are available!
|
||||
@ -511,7 +511,7 @@ void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {
|
||||
// if we had a url make sure uh48 matches
|
||||
else if ( r->m_url[0] ) {
|
||||
// get it
|
||||
int64_t uh48 = g_titledb.getUrlHash48(k);
|
||||
int64_t uh48 = Titledb::getUrlHash48(k);
|
||||
|
||||
// make sure our available docids are available!
|
||||
if ( dd == ad1 ) ad1++;
|
||||
|
@ -639,7 +639,7 @@ bool Msg3a::gotAllShardReplies ( ) {
|
||||
j ,
|
||||
i ,
|
||||
docIds [j] ,
|
||||
(int32_t)g_titledb.getDomHash8FromDocId(docIds[j]),
|
||||
(int32_t)Titledb::getDomHash8FromDocId(docIds[j]),
|
||||
scores[j] );
|
||||
}
|
||||
}
|
||||
|
156
Msg4.cpp
156
Msg4.cpp
@ -65,22 +65,15 @@ static Msg4 *s_msg4Tail = NULL;
|
||||
// . also, need to update spiderdb rec for the url in Msg14 using Msg4 too!
|
||||
// . need to add support for passing in array of lists for Msg14
|
||||
|
||||
static bool addMetaList ( const char *p , class UdpSlot *slot = NULL );
|
||||
static void gotReplyWrapper4 ( void *state , void *state2 ) ;
|
||||
static void storeLineWaiters ( ) ;
|
||||
static void handleRequest4 ( UdpSlot *slot , int32_t niceness ) ;
|
||||
static void sleepCallback4 ( int bogusfd , void *state ) ;
|
||||
static bool sendBuffer ( int32_t hostId , int32_t niceness ) ;
|
||||
static Multicast *getMulticast ( ) ;
|
||||
static void returnMulticast ( Multicast *mcast ) ;
|
||||
|
||||
static bool storeRec ( collnum_t collnum ,
|
||||
char rdbId ,
|
||||
uint32_t gid ,
|
||||
int32_t hostId ,
|
||||
const char *rec ,
|
||||
int32_t recSize ,
|
||||
int32_t niceness ) ;
|
||||
static bool addMetaList(const char *p, class UdpSlot *slot = NULL);
|
||||
static void gotReplyWrapper4(void *state, void *state2);
|
||||
static void handleRequest4(UdpSlot *slot, int32_t niceness);
|
||||
static void sleepCallback4(int bogusfd, void *state);
|
||||
static void flushLocal();
|
||||
static bool sendBuffer(int32_t hostId);
|
||||
static Multicast *getMulticast();
|
||||
static void returnMulticast(Multicast *mcast);
|
||||
static bool storeRec(collnum_t collnum, char rdbId, uint32_t gid, int32_t hostId, const char *rec, int32_t recSize);
|
||||
|
||||
// all these parameters should be preset
|
||||
bool Msg4::registerHandler() {
|
||||
@ -128,9 +121,6 @@ bool Msg4::registerHandler() {
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
static void flushLocal ( ) ;
|
||||
|
||||
// scan all host bufs and try to send on them
|
||||
void sleepCallback4 ( int bogusfd , void *state ) {
|
||||
// wait for clock to be in sync
|
||||
@ -145,7 +135,7 @@ void flushLocal ( ) {
|
||||
//storeLineWaiters();
|
||||
// now try to send the buffers
|
||||
for ( int32_t i = 0 ; i < s_numHostBufs ; i++ )
|
||||
sendBuffer ( i , MAX_NICENESS );
|
||||
sendBuffer ( i );
|
||||
g_errno = 0;
|
||||
}
|
||||
|
||||
@ -183,12 +173,12 @@ bool hasAddsInQueue ( ) {
|
||||
}
|
||||
|
||||
bool Msg4::addMetaList ( SafeBuf *sb, collnum_t collnum, void *state, void (* callback)(void *state),
|
||||
int32_t niceness, char rdbId, int32_t shardOverride ) {
|
||||
return addMetaList ( sb->getBufStart(), sb->length(), collnum, state, callback, niceness, rdbId, shardOverride );
|
||||
rdbid_t rdbId, int32_t shardOverride ) {
|
||||
return addMetaList ( sb->getBufStart(), sb->length(), collnum, state, callback, rdbId, shardOverride );
|
||||
}
|
||||
|
||||
bool Msg4::addMetaList ( const char *metaList, int32_t metaListSize, collnum_t collnum, void *state,
|
||||
void (* callback)(void *state), int32_t niceness, char rdbId,
|
||||
void (* callback)(void *state), rdbid_t rdbId,
|
||||
// Rebalance.cpp needs to add negative keys to
|
||||
// remove foreign records from where they no
|
||||
// longer belong because of a new hosts.conf file.
|
||||
@ -212,7 +202,6 @@ bool Msg4::addMetaList ( const char *metaList, int32_t metaListSize, collnum_t c
|
||||
m_state = state;
|
||||
m_callback = callback;
|
||||
m_rdbId = rdbId;
|
||||
m_niceness = niceness;
|
||||
m_next = NULL;
|
||||
m_shardOverride = shardOverride;
|
||||
|
||||
@ -279,7 +268,7 @@ bool Msg4::addMetaList ( const char *metaList, int32_t metaListSize, collnum_t c
|
||||
return false;
|
||||
}
|
||||
|
||||
bool isInMsg4LinkedList ( Msg4 *msg4 ) {
|
||||
bool Msg4::isInLinkedList ( Msg4 *msg4 ) {
|
||||
Msg4 *m = s_msg4Head;
|
||||
for ( ; m ; m = m->m_next )
|
||||
if ( m == msg4 ) return true;
|
||||
@ -300,12 +289,10 @@ bool Msg4::addMetaList2 ( ) {
|
||||
// store each record in the list into the send buffers
|
||||
for ( ; p < pend ; ) {
|
||||
// first is rdbId
|
||||
char rdbId = m_rdbId;
|
||||
if ( rdbId < 0 ) rdbId = *p++;
|
||||
// mask off rdbId
|
||||
rdbId &= 0x7f;
|
||||
|
||||
logTrace( g_conf.m_logTraceMsg4, " rdbId: %02x", rdbId);
|
||||
rdbid_t rdbId = m_rdbId;
|
||||
if ( rdbId == RDB_NONE ) {
|
||||
rdbId = (rdbid_t)(*p++ & 0x7f);
|
||||
}
|
||||
|
||||
// get the key of the current record
|
||||
const char *key = p;
|
||||
@ -313,12 +300,8 @@ bool Msg4::addMetaList2 ( ) {
|
||||
// get the key size. a table lookup in Rdb.cpp.
|
||||
int32_t ks = getKeySizeFromRdbId ( rdbId );
|
||||
|
||||
logTrace( g_conf.m_logTraceMsg4, " Key: %s", KEYSTR(key, ks) );
|
||||
logTrace( g_conf.m_logTraceMsg4, " Key size: %" PRId32, ks);
|
||||
|
||||
// negative key?
|
||||
bool del = !( *key & 0x01 );
|
||||
logTrace( g_conf.m_logTraceMsg4, " Negative key: %s", del?"true":"false");
|
||||
|
||||
// skip key
|
||||
p += ks;
|
||||
@ -330,16 +313,12 @@ bool Msg4::addMetaList2 ( ) {
|
||||
if ( m_shardOverride >= 0 ) {
|
||||
shardNum = m_shardOverride;
|
||||
}
|
||||
|
||||
logTrace( g_conf.m_logTraceMsg4, " shardNum: %" PRId32, shardNum);
|
||||
|
||||
// get the record, is -1 if variable. a table lookup.
|
||||
// . negative keys have no data
|
||||
// . this unfortunately is not true according to RdbList.cpp
|
||||
int32_t dataSize = del ? 0 : getDataSizeFromRdbId ( rdbId );
|
||||
|
||||
logTrace( g_conf.m_logTraceMsg4, " dataSize: %" PRId32, dataSize);
|
||||
|
||||
// if variable read that in
|
||||
if ( dataSize == -1 ) {
|
||||
// -1 means to read it in
|
||||
@ -349,8 +328,6 @@ bool Msg4::addMetaList2 ( ) {
|
||||
|
||||
// skip dataSize
|
||||
p += 4;
|
||||
|
||||
logTrace( g_conf.m_logTraceMsg4, " dataSize: %" PRId32" (variable size read)", dataSize);
|
||||
}
|
||||
|
||||
// skip over the data, if any
|
||||
@ -358,18 +335,15 @@ bool Msg4::addMetaList2 ( ) {
|
||||
|
||||
// breach us?
|
||||
if ( p > pend ) { g_process.shutdownAbort(true); }
|
||||
|
||||
// i fixed UdpServer.cpp to NOT call msg4 handlers when in
|
||||
// a quickpoll, in case we receive a niceness 0 msg4 request
|
||||
QUICKPOLL(m_niceness);
|
||||
|
||||
// convert the gid to the hostid of the first host in this
|
||||
// group. uses a quick hash table.
|
||||
Host *hosts = g_hostdb.getShard ( shardNum );
|
||||
int32_t hostId = hosts[0].m_hostId;
|
||||
logTrace( g_conf.m_logTraceMsg4, " hostId: %" PRId32, hostId);
|
||||
|
||||
|
||||
|
||||
// Trace one summary line per record. The arguments must follow the order of
// the format string: rdb, key, keySize, isDel, dataSize, shardNum, hostId.
// (dataSize and shardNum were previously passed in swapped order, so each
// value was logged under the other's label.)
logTrace(g_conf.m_logTraceMsg4, " rdb=%s key=%s keySize=%" PRId32" isDel=%d dataSize=%" PRId32" shardNum=%" PRId32" hostId=%" PRId32,
         getDbnameFromId(rdbId), KEYSTR(key, ks), ks, del, dataSize, shardNum, hostId);
|
||||
|
||||
// . add that rec to this groupId, gid, includes the key
|
||||
// . these are NOT allowed to be compressed (half bit set)
|
||||
// and this point
|
||||
@ -377,7 +351,7 @@ bool Msg4::addMetaList2 ( ) {
|
||||
#ifdef _VALGRIND_
|
||||
VALGRIND_CHECK_MEM_IS_DEFINED(key,p-key);
|
||||
#endif
|
||||
if ( storeRec ( m_collnum, rdbId, shardNum, hostId, key, p - key, m_niceness )) {
|
||||
if ( storeRec ( m_collnum, rdbId, shardNum, hostId, key, p - key )) {
|
||||
// . point to next record
|
||||
// . will point past records if no more left!
|
||||
m_currentPtr = p;
|
||||
@ -416,8 +390,7 @@ bool storeRec ( collnum_t collnum ,
|
||||
uint32_t shardNum,
|
||||
int32_t hostId ,
|
||||
const char *rec ,
|
||||
int32_t recSize ,
|
||||
int32_t niceness ) {
|
||||
int32_t recSize ) {
|
||||
#ifdef _VALGRIND_
|
||||
VALGRIND_CHECK_MEM_IS_DEFINED(&collnum,sizeof(collnum));
|
||||
VALGRIND_CHECK_MEM_IS_DEFINED(&rdbId,sizeof(rdbId));
|
||||
@ -491,7 +464,7 @@ bool storeRec ( collnum_t collnum ,
|
||||
// will he be able to proceed. we will call his callback
|
||||
// as soon as we can copy... use this->m_msg1 to add the
|
||||
// list that was passed in...
|
||||
if ( ! sendBuffer ( hostId , niceness ) ) return false;
|
||||
if ( ! sendBuffer ( hostId ) ) return false;
|
||||
// now the buffer should be empty, try again
|
||||
goto retry;
|
||||
}
|
||||
@ -515,7 +488,7 @@ bool storeRec ( collnum_t collnum ,
|
||||
// . returns false if we were UNable to get a multicast to launch the buffer,
|
||||
// true otherwise
|
||||
// . returns false and sets g_errno on error
|
||||
bool sendBuffer ( int32_t hostId , int32_t niceness ) {
|
||||
bool sendBuffer ( int32_t hostId ) {
|
||||
//logf(LOG_DEBUG,"build: sending buf");
|
||||
// how many bytes of the buffer are occupied or "in use"?
|
||||
char *buf = s_hostBufs [hostId];
|
||||
@ -673,10 +646,10 @@ void gotReplyWrapper4 ( void *state , void *state2 ) {
|
||||
|
||||
returnMulticast(mcast);
|
||||
|
||||
storeLineWaiters(); // try to launch more msg4 requests in waiting
|
||||
Msg4::storeLineWaiters(); // try to launch more msg4 requests in waiting
|
||||
}
|
||||
|
||||
void storeLineWaiters ( ) {
|
||||
void Msg4::storeLineWaiters ( ) {
|
||||
// try to store all the msg4's lists that are waiting in line
|
||||
for (;;) {
|
||||
Msg4 *msg4 = s_msg4Head;
|
||||
@ -731,17 +704,14 @@ void storeLineWaiters ( ) {
|
||||
void handleRequest4 ( UdpSlot *slot , int32_t netnice ) {
|
||||
logTrace( g_conf.m_logTraceMsg4, "BEGIN" );
|
||||
|
||||
// easy var
|
||||
UdpServer *us = &g_udpServer;
|
||||
|
||||
// if we just came up we need to make sure our hosts.conf is in
|
||||
// sync with everyone else before accepting this! it might have
|
||||
// been the case that the sender thinks our hosts.conf is the same
|
||||
// since last time we were up, so it is up to us to check this
|
||||
if ( g_pingServer.m_hostsConfInDisagreement ) {
|
||||
g_errno = EBADHOSTSCONF;
|
||||
log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
|
||||
us->sendErrorReply ( slot , g_errno );
|
||||
logError("call sendErrorReply");
|
||||
g_udpServer.sendErrorReply ( slot , g_errno );
|
||||
|
||||
log(LOG_WARN,"%s:%s: END - hostsConfInDisagreement", __FILE__, __func__ );
|
||||
return;
|
||||
@ -753,8 +723,8 @@ void handleRequest4 ( UdpSlot *slot , int32_t netnice ) {
|
||||
// . this is 0 if not received yet
|
||||
if (!slot->m_host->m_pingInfo.m_hostsConfCRC) {
|
||||
g_errno = EWAITINGTOSYNCHOSTSCONF;
|
||||
log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
|
||||
us->sendErrorReply ( slot , g_errno );
|
||||
logError("call sendErrorReply");
|
||||
g_udpServer.sendErrorReply ( slot , g_errno );
|
||||
|
||||
log(LOG_WARN,"%s:%s: END - EWAITINGTOSYNCHOSTCONF", __FILE__, __func__ );
|
||||
return;
|
||||
@ -763,8 +733,8 @@ void handleRequest4 ( UdpSlot *slot , int32_t netnice ) {
|
||||
// compare our hosts.conf to sender's otherwise
|
||||
if (slot->m_host->m_pingInfo.m_hostsConfCRC != g_hostdb.getCRC()) {
|
||||
g_errno = EBADHOSTSCONF;
|
||||
log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
|
||||
us->sendErrorReply ( slot , g_errno );
|
||||
logError("call sendErrorReply");
|
||||
g_udpServer.sendErrorReply ( slot , g_errno );
|
||||
|
||||
log(LOG_WARN,"%s:%s: END - EBADHOSTSCONF", __FILE__, __func__ );
|
||||
return;
|
||||
@ -778,8 +748,8 @@ void handleRequest4 ( UdpSlot *slot , int32_t netnice ) {
|
||||
// must at least have an rdbId
|
||||
if (readBufSize < 7) {
|
||||
g_errno = EREQUESTTOOSHORT;
|
||||
log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
|
||||
us->sendErrorReply ( slot , g_errno );
|
||||
logError("call sendErrorReply");
|
||||
g_udpServer.sendErrorReply ( slot , g_errno );
|
||||
|
||||
log(LOG_ERROR,"%s:%s: END - EREQUESTTOOSHORT", __FILE__, __func__ );
|
||||
return;
|
||||
@ -793,20 +763,15 @@ void handleRequest4 ( UdpSlot *slot , int32_t netnice ) {
|
||||
if ( used != readBufSize ) {
|
||||
// if we send back a g_errno then multicast retries forever
|
||||
// so just absorb it!
|
||||
log(LOG_ERROR,"%s:%s: msg4: got corrupted request from hostid %" PRId32" "
|
||||
"used [%" PRId32"] != readBufSize [%" PRId32"]",
|
||||
__FILE__,
|
||||
__func__,
|
||||
slot->m_host->m_hostId,
|
||||
used,
|
||||
readBufSize);
|
||||
logError("msg4: got corrupted request from hostid %" PRId32" used [%" PRId32"] != readBufSize [%" PRId32"]",
|
||||
slot->m_host->m_hostId, used, readBufSize);
|
||||
|
||||
loghex(LOG_ERROR, readBuf, (readBufSize < 160 ? readBufSize : 160), "readBuf (first max. 160 bytes)");
|
||||
|
||||
us->sendReply(NULL, 0, NULL, 0, slot);
|
||||
//us->sendErrorReply(slot,ECORRUPTDATA);return;}
|
||||
|
||||
log(LOG_ERROR,"%s:%s: END", __FILE__, __func__ );
|
||||
|
||||
g_udpServer.sendReply(NULL, 0, NULL, 0, slot);
|
||||
//g_udpServer.sendErrorReply(slot,ECORRUPTDATA);return;}
|
||||
|
||||
logError("END");
|
||||
return;
|
||||
}
|
||||
|
||||
@ -821,8 +786,8 @@ void handleRequest4 ( UdpSlot *slot , int32_t netnice ) {
|
||||
}
|
||||
// tell send to try again shortly
|
||||
g_errno = ETRYAGAIN;
|
||||
log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
|
||||
us->sendErrorReply(slot,g_errno);
|
||||
logError("call sendErrorReply");
|
||||
g_udpServer.sendErrorReply(slot,g_errno);
|
||||
|
||||
logTrace( g_conf.m_logTraceMsg4, "END - ETRYAGAIN. Waiting to sync with host #0" );
|
||||
return;
|
||||
@ -830,15 +795,15 @@ void handleRequest4 ( UdpSlot *slot , int32_t netnice ) {
|
||||
|
||||
// this returns false with g_errno set on error
|
||||
if (!addMetaList(readBuf, slot)) {
|
||||
log(LOG_ERROR, "%s:%s:%d: call sendErrorReply. error='%s'", __FILE__, __func__, __LINE__, mstrerror(g_errno));
|
||||
us->sendErrorReply(slot,g_errno);
|
||||
logError("call sendErrorReply error='%s", mstrerror(g_errno));
|
||||
g_udpServer.sendErrorReply(slot,g_errno);
|
||||
|
||||
logTrace(g_conf.m_logTraceMsg4, "END - addMetaList returned false. g_errno=%d", g_errno);
|
||||
return;
|
||||
}
|
||||
|
||||
// good to go
|
||||
us->sendReply(NULL, 0, NULL, 0, slot);
|
||||
g_udpServer.sendReply(NULL, 0, NULL, 0, slot);
|
||||
|
||||
logTrace(g_conf.m_logTraceMsg4, "END - OK");
|
||||
}
|
||||
@ -934,23 +899,20 @@ bool addMetaList ( const char *p , UdpSlot *slot ) {
|
||||
log(LOG_WARN, "seems like a stray /e/repair-addsinprogress.dat file "
|
||||
"rdbId=%" PRId32". waiting to be in repair mode."
|
||||
,(int32_t)rdbId);
|
||||
//not in repair mode. dropping.",(int32_t)rdbId);
|
||||
g_errno = ETRYAGAIN;
|
||||
return false;
|
||||
}
|
||||
|
||||
// set the list
|
||||
list.set ( (char*)p , //todo: dodgy cast. RdbList should be fixed
|
||||
recSize ,
|
||||
(char*)p , //todo: dodgy cast. RdbList should be fixed
|
||||
recSize ,
|
||||
rdb->getFixedDataSize() ,
|
||||
false , // ownData?
|
||||
rdb->useHalfKeys() ,
|
||||
rdb->getKeySize () );
|
||||
// todo: dodgy cast to char*. RdbList should be fixed
|
||||
list.set((char *)p, recSize, (char *)p, recSize, rdb->getFixedDataSize(), false, rdb->useHalfKeys(), rdb->getKeySize());
|
||||
|
||||
// advance over the rec data to point to next entry
|
||||
p += recSize;
|
||||
|
||||
// keep track of stats
|
||||
rdb->readRequestAdd ( recSize );
|
||||
|
||||
// this returns false and sets g_errno on error
|
||||
bool status =rdb->addList(collnum, &list, MAX_NICENESS );
|
||||
|
||||
@ -966,16 +928,12 @@ bool addMetaList ( const char *p , UdpSlot *slot ) {
|
||||
// no memory means to try again
|
||||
if ( g_errno == ENOMEM ) g_errno = ETRYAGAIN;
|
||||
// doing a full rebuild will add collections
|
||||
if ( g_errno == ENOCOLLREC &&
|
||||
g_repairMode > 0 )
|
||||
//g_repair.m_fullRebuild )
|
||||
if ( g_errno == ENOCOLLREC && g_repairMode > 0 )
|
||||
g_errno = ETRYAGAIN;
|
||||
// ignore enocollrec errors since collection can be reset while
|
||||
// spiders are on now.
|
||||
//if ( g_errno == ENOCOLLREC )
|
||||
// g_errno = 0;
|
||||
|
||||
// are we done
|
||||
if ( g_errno ) return false;
|
||||
|
||||
// success
|
||||
return true;
|
||||
}
|
||||
|
64
Msg4.h
64
Msg4.h
@ -10,58 +10,56 @@ bool loadAddsInProgress ( const char *filenamePrefix );
|
||||
// used by Repair.cpp to make sure we are not adding any more data ("writing")
|
||||
bool hasAddsInQueue ( ) ;
|
||||
|
||||
bool isInMsg4LinkedList ( class Msg4 *msg4 ) ;
|
||||
|
||||
#include "SafeBuf.h"
|
||||
#include "rdbid_t.h"
|
||||
|
||||
class Msg4 {
|
||||
public:
|
||||
Msg4()
|
||||
: m_inUse(false) {
|
||||
}
|
||||
|
||||
public:
|
||||
static bool registerHandler();
|
||||
// meta list format =
|
||||
// (rdbId | 0x08) then rdb record [if nosplit]
|
||||
// (rdbId | 0x00) then rdb record [if split ]
|
||||
bool addMetaList( class SafeBuf *sb, collnum_t collnum, void *state,
|
||||
void (* callback)(void *state), int32_t niceness, char rdbId = -1, int32_t shardOverride = -1 );
|
||||
bool addMetaList( class SafeBuf *sb, collnum_t collnum, void *state,
|
||||
void (* callback)(void *state), int32_t niceness, rdbid_t rdbId, int32_t shardOverride = -1 )
|
||||
{ return addMetaList(sb,collnum,state,callback,niceness,(char)rdbId,shardOverride); }
|
||||
// why wasn't this saved in addsinprogress.dat file?
|
||||
~Msg4() {
|
||||
if (m_inUse) {
|
||||
log(LOG_ERROR, "BAD: MSG4 in use!!!!!! this=%p", this);
|
||||
}
|
||||
}
|
||||
|
||||
bool addMetaList(SafeBuf *sb, collnum_t collnum, void *state,
|
||||
void (*callback)(void *state), rdbid_t rdbId = RDB_NONE, int32_t shardOverride = -1);
|
||||
|
||||
// this one is faster...
|
||||
// returns false if blocked
|
||||
bool addMetaList( const char *metaList, int32_t metaListSize, collnum_t collnum, void *state,
|
||||
void (* callback)(void *state), int32_t niceness, char rdbId = -1, int32_t shardOverride = -1 );
|
||||
bool addMetaList( const char *metaList, int32_t metaListSize, collnum_t collnum, void *state,
|
||||
void (* callback)(void *state), int32_t niceness, rdbid_t rdbId, int32_t shardOverride = -1 )
|
||||
{ return addMetaList(metaList,metaListSize,collnum,state,callback,niceness,(char)rdbId,shardOverride); }
|
||||
bool addMetaList(const char *metaList, int32_t metaListSize, collnum_t collnum, void *state,
|
||||
void (*callback)(void *state), rdbid_t rdbId = RDB_NONE, int32_t shardOverride = -1);
|
||||
|
||||
bool addMetaList2();
|
||||
bool isInUse() const { return m_inUse; }
|
||||
|
||||
Msg4() { m_inUse = false; }
|
||||
// why wasn't this saved in addsinprogress.dat file?
|
||||
~Msg4() { if ( m_inUse ) log("BAD: MSG4 in use!!!!!!"); }
|
||||
|
||||
// private:
|
||||
|
||||
void (*m_callback ) ( void *state );
|
||||
void *m_state;
|
||||
static bool registerHandler();
|
||||
static bool isInLinkedList(Msg4 *msg4);
|
||||
static void storeLineWaiters();
|
||||
|
||||
SafeBuf m_tmpBuf;
|
||||
|
||||
char m_rdbId;
|
||||
char m_inUse;
|
||||
private:
|
||||
bool addMetaList2();
|
||||
|
||||
void (*m_callback )(void *state);
|
||||
void *m_state;
|
||||
|
||||
rdbid_t m_rdbId;
|
||||
bool m_inUse;
|
||||
collnum_t m_collnum;
|
||||
int32_t m_niceness;
|
||||
|
||||
int32_t m_shardOverride;
|
||||
|
||||
const char *m_metaList ;
|
||||
int32_t m_metaListSize ;
|
||||
const char *m_currentPtr ; // into m_metaList
|
||||
const char *m_metaList;
|
||||
int32_t m_metaListSize;
|
||||
const char *m_currentPtr; // into m_metaList
|
||||
|
||||
// the linked list for waiting in line
|
||||
class Msg4 *m_next;
|
||||
Msg4 *m_next;
|
||||
};
|
||||
|
||||
#endif // GB_MSG4_H
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include "Process.h"
|
||||
#include "GbMutex.h"
|
||||
#include "ScopedLock.h"
|
||||
#include <new>
|
||||
|
||||
|
||||
// increasing this doesn't seem to improve performance any on a single
|
||||
@ -62,7 +63,7 @@ void Msg40::resetBuf2 ( ) {
|
||||
// cast it
|
||||
Msg20 *m = (Msg20 *)p;
|
||||
// free its stuff
|
||||
m->destructor();
|
||||
m->~Msg20();
|
||||
// advance
|
||||
p += sizeof(Msg20);
|
||||
}
|
||||
@ -629,7 +630,7 @@ bool Msg40::reallocMsg20Buf ( ) {
|
||||
// point to the next Msg20
|
||||
p += sizeof(Msg20);
|
||||
// init it
|
||||
tmp[i]->constructor();
|
||||
new (tmp[i]) Msg20();
|
||||
// count it
|
||||
pcount++;
|
||||
// skip it if it is a new docid, we do not have a Msg20
|
||||
@ -740,7 +741,7 @@ bool Msg40::reallocMsg20Buf ( ) {
|
||||
// point it to its memory
|
||||
m_msg20[i] = (Msg20 *)p;
|
||||
// call its constructor
|
||||
m_msg20[i]->constructor();
|
||||
new (m_msg20[i]) Msg20();
|
||||
// point to the next Msg20
|
||||
p += sizeof(Msg20);
|
||||
// remember num to free in reset() function
|
||||
|
@ -12,7 +12,7 @@
|
||||
#include "RdbCache.h"
|
||||
#include "ScopedLock.h"
|
||||
#include "Sanity.h"
|
||||
|
||||
#include "Titledb.h"
|
||||
|
||||
// how many Msg0 requests can we launch at the same time?
|
||||
#define MSG51_MAX_REQUESTS 60
|
||||
@ -511,7 +511,7 @@ bool setClusterLevels ( const key96_t *clusterRecs,
|
||||
// . get the site hash
|
||||
// . these are only 32 bits!
|
||||
if(fakeIt)
|
||||
h = g_titledb.getDomHash8FromDocId(docIds[i]);
|
||||
h = Titledb::getDomHash8FromDocId(docIds[i]);
|
||||
else
|
||||
h = g_clusterdb.getSiteHash26 ( crec );
|
||||
|
||||
|
127
Multicast.cpp
127
Multicast.cpp
@ -1,15 +1,10 @@
|
||||
#include "gb-include.h"
|
||||
|
||||
|
||||
// i guess both msg0 send requests failed with no route to host,
|
||||
//and they got retired... why didnt they switch to eth1????
|
||||
|
||||
|
||||
#include "Multicast.h"
|
||||
#include "Rdb.h" // RDB_TITLEDB
|
||||
#include "Msg20.h"
|
||||
#include "Profiler.h"
|
||||
#include "UdpServer.h"
|
||||
#include "Hostdb.h"
|
||||
#include "Stats.h"
|
||||
#include "Conf.h"
|
||||
#include "Loop.h" // registerSleepCallback()
|
||||
#include "ScopedLock.h"
|
||||
#include "Process.h"
|
||||
|
||||
// up to 10 twins in a group
|
||||
@ -19,20 +14,66 @@
|
||||
// to send we should send as much as we can and save the remaining
|
||||
// slots to disk for sending later??
|
||||
|
||||
static void sleepWrapper1 ( int bogusfd , void *state ) ;
|
||||
static void sleepWrapper2 ( int bogusfd , void *state ) ;
|
||||
static void gotReplyWrapperM1 ( void *state , UdpSlot *slot ) ;
|
||||
static void gotReplyWrapperM2 ( void *state , UdpSlot *slot ) ;
|
||||
|
||||
void Multicast::constructor ( ) {
|
||||
|
||||
void Multicast::constructor() {
|
||||
m_msg = NULL;
|
||||
m_readBuf = NULL;
|
||||
m_inUse = false;
|
||||
}
|
||||
void Multicast::destructor ( ) { reset(); }
|
||||
|
||||
Multicast::Multicast ( ) { constructor(); }
|
||||
Multicast::~Multicast ( ) { reset(); }
|
||||
void Multicast::destructor() {
|
||||
reset();
|
||||
}
|
||||
|
||||
Multicast::Multicast()
|
||||
: m_msg(NULL),
|
||||
m_msgSize(0),
|
||||
m_msgType((msg_type_t)-1),
|
||||
m_ownMsg(false),
|
||||
m_slot(NULL),
|
||||
m_inUse(false),
|
||||
m_next(NULL),
|
||||
m_replyingHost(NULL),
|
||||
m_replyLaunchTime(0),
|
||||
m_hackFileId(0),
|
||||
m_hackFileOff(0),
|
||||
m_importState(NULL),
|
||||
m_mtx(),
|
||||
m_state(NULL), m_state2(NULL),
|
||||
m_callback(NULL),
|
||||
m_totalTimeout(0),
|
||||
m_startTime(0),
|
||||
m_numReplies(0),
|
||||
//m_hostPtrs
|
||||
m_numHosts(0),
|
||||
//m_retired
|
||||
//m_slots
|
||||
//m_errnos
|
||||
//m_inProgress
|
||||
//m_launchTime
|
||||
m_readBuf(NULL),
|
||||
m_readBufSize(0),
|
||||
m_readBufMaxSize(0),
|
||||
m_ownReadBuf(false),
|
||||
m_registeredSleep(false),
|
||||
m_niceness(0),
|
||||
m_lastLaunch(0),
|
||||
m_lastLaunchHost(NULL),
|
||||
m_freeReadBuf(false),
|
||||
m_key(0),
|
||||
m_sendToSelf(false),
|
||||
m_retryCount(0),
|
||||
m_sentToTwin(false)
|
||||
{
|
||||
constructor();
|
||||
|
||||
}
|
||||
|
||||
Multicast::~Multicast() {
|
||||
reset();
|
||||
}
|
||||
|
||||
|
||||
// free the send/read (request/reply) bufs we pirated from a UdpSlot or
|
||||
// got from the caller
|
||||
@ -103,10 +144,10 @@ bool Multicast::send(char *msg, int32_t msgSize, msg_type_t msgType, bool ownMsg
|
||||
m_key = key;
|
||||
|
||||
// clear m_retired, m_errnos, m_slots
|
||||
memset ( m_retired , 0 , sizeof(bool ) * MAX_HOSTS_PER_GROUP );
|
||||
memset ( m_errnos , 0 , sizeof(int32_t ) * MAX_HOSTS_PER_GROUP );
|
||||
memset ( m_slots , 0 , sizeof(UdpSlot *) * MAX_HOSTS_PER_GROUP );
|
||||
memset ( m_inProgress , 0 , sizeof(char ) * MAX_HOSTS_PER_GROUP );
|
||||
memset(m_retired, 0, sizeof(m_retired));
|
||||
memset(m_errnos, 0, sizeof(m_errnos));
|
||||
memset(m_slots, 0, sizeof(m_slots));
|
||||
memset(m_inProgress, 0, sizeof(m_inProgress));
|
||||
|
||||
// . get the list of hosts in this group
|
||||
// . returns false if blocked, true otherwise
|
||||
@ -160,6 +201,7 @@ bool Multicast::send(char *msg, int32_t msgSize, msg_type_t msgType, bool ownMsg
|
||||
// . TODO: deal with errors from g_udpServer::sendRequest() better
|
||||
// . returns false and sets g_errno on error
|
||||
void Multicast::sendToGroup() {
|
||||
ScopedLock sl(m_mtx);
|
||||
// see if anyone gets an error
|
||||
bool hadError = false;
|
||||
// . cast the msg to ALL hosts in the m_hosts group of hosts
|
||||
@ -203,7 +245,7 @@ void Multicast::sendToGroup() {
|
||||
// . send to a single host
|
||||
// . this creates a transaction control slot, "udpSlot"
|
||||
// . returns false and sets g_errno on error
|
||||
if (us->sendRequest(m_msg, m_msgSize, m_msgType, bestIp, destPort, hid, &m_slots[i], this, gotReplyWrapperM2, m_totalTimeout, m_niceness)) {
|
||||
if (us->sendRequest(m_msg, m_msgSize, m_msgType, bestIp, destPort, hid, &m_slots[i], this, gotReply2, m_totalTimeout, m_niceness)) {
|
||||
continue;
|
||||
}
|
||||
// g_errno must have been set, remember it
|
||||
@ -237,22 +279,23 @@ void Multicast::sendToGroup() {
|
||||
}
|
||||
}
|
||||
|
||||
void sleepWrapper2 ( int bogusfd , void *state ) {
|
||||
Multicast *THIS = (Multicast *)state;
|
||||
void Multicast::sleepWrapper2(int bogusfd, void *state) {
|
||||
Multicast *THIS = static_cast<Multicast*>(state);
|
||||
// try another round of sending to see if hosts had errors or not
|
||||
THIS->sendToGroup ( );
|
||||
THIS->sendToGroup();
|
||||
}
|
||||
|
||||
// C wrapper for the C++ callback
|
||||
void gotReplyWrapperM2 ( void *state , UdpSlot *slot ) {
|
||||
Multicast *THIS = (Multicast *)state;
|
||||
THIS->gotReply2 ( slot );
|
||||
|
||||
void Multicast::gotReply2(void *state, UdpSlot *slot) {
|
||||
Multicast *THIS = static_cast<Multicast*>(state);
|
||||
THIS->gotReply2(slot);
|
||||
}
|
||||
|
||||
// . otherwise, we were sending to a whole group so ALL HOSTS must produce a
|
||||
// successful reply
|
||||
// . we keep re-trying forever until they do
|
||||
void Multicast::gotReply2 ( UdpSlot *slot ) {
|
||||
ScopedLock sl(m_mtx);
|
||||
// don't ever let UdpServer free this send buf (it is m_msg)
|
||||
slot->m_sendBufAlloc = NULL;
|
||||
// save this for msg4 logic that calls injection callback
|
||||
@ -290,6 +333,7 @@ void Multicast::gotReply2 ( UdpSlot *slot ) {
|
||||
// allow us to be re-used now, callback might relaunch
|
||||
m_inUse = false;
|
||||
if ( m_callback ) {
|
||||
sl.unlock();
|
||||
m_callback ( m_state , m_state2 );
|
||||
}
|
||||
return;
|
||||
@ -626,7 +670,8 @@ bool Multicast::sendToHost ( int32_t i ) {
|
||||
// . this creates a transaction control slot, "udpSlot"
|
||||
// . return false and sets g_errno on error
|
||||
// . returns true on successful launch and calls callback on completion
|
||||
if (!us->sendRequest(m_msg, m_msgSize, m_msgType, bestIp, destPort, hid, &m_slots[i], this, gotReplyWrapperM1, timeRemaining, m_niceness, NULL, -1, -1, maxResends)) {
|
||||
ScopedLock sl(m_mtx);
|
||||
if (!us->sendRequest(m_msg, m_msgSize, m_msgType, bestIp, destPort, hid, &m_slots[i], this, gotReply1, timeRemaining, m_niceness, NULL, -1, -1, maxResends)) {
|
||||
log(LOG_WARN, "net: Had error sending msgtype 0x%02x to host #%" PRId32": %s. Not retrying.",
|
||||
m_msgType,h->m_hostId,mstrerror(g_errno));
|
||||
// i've seen ENOUDPSLOTS available msg here along with oom
|
||||
@ -635,7 +680,7 @@ bool Multicast::sendToHost ( int32_t i ) {
|
||||
return false;
|
||||
}
|
||||
// mark it as outstanding
|
||||
m_inProgress[i] = 1;
|
||||
m_inProgress[i] = true;
|
||||
// set our last launch date
|
||||
m_lastLaunch = nowms ; // gettimeofdayInMilliseconds();
|
||||
// save the host, too
|
||||
@ -657,7 +702,7 @@ bool Multicast::sendToHost ( int32_t i ) {
|
||||
|
||||
// this is called every 50 ms so we have the chance to launch our request
|
||||
// to a more responsive host
|
||||
void sleepWrapper1 ( int bogusfd , void *state ) {
|
||||
void Multicast::sleepWrapper1 ( int bogusfd , void *state ) {
|
||||
Multicast *THIS = (Multicast *) state;
|
||||
// . if our last launch was less than X seconds ago, wait another tick
|
||||
// . we often send out 2+ requests and end up getting one reply before
|
||||
@ -851,14 +896,16 @@ void sleepWrapper1 ( int bogusfd , void *state ) {
|
||||
// THIS->m_msgType);
|
||||
}
|
||||
|
||||
// C wrapper for the C++ callback
|
||||
void gotReplyWrapperM1 ( void *state , UdpSlot *slot ) {
|
||||
Multicast *THIS = (Multicast *)state;
|
||||
THIS->gotReply1 ( slot );
|
||||
|
||||
void Multicast::gotReply1(void *state, UdpSlot *slot) {
|
||||
Multicast *THIS = static_cast<Multicast*>(state);
|
||||
THIS->gotReply1(slot);
|
||||
}
|
||||
|
||||
// come here if we've got a reply from a host that's not part of a group send
|
||||
void Multicast::gotReply1 ( UdpSlot *slot ) {
|
||||
ScopedLock sl(m_mtx);
|
||||
|
||||
// don't ever let UdpServer free this send buf (it is m_msg)
|
||||
slot->m_sendBufAlloc = NULL;
|
||||
|
||||
@ -887,7 +934,7 @@ void Multicast::gotReply1 ( UdpSlot *slot ) {
|
||||
}
|
||||
|
||||
// mark it as no longer in progress
|
||||
m_inProgress[i] = 0;
|
||||
m_inProgress[i] = false;
|
||||
|
||||
Host *h = m_hostPtrs[i];
|
||||
|
||||
@ -900,6 +947,8 @@ void Multicast::gotReply1 ( UdpSlot *slot ) {
|
||||
(int32_t) m_msgType, (PTRTYPE) this, mstrerror(g_errno));
|
||||
}
|
||||
|
||||
sl.unlock();
|
||||
|
||||
// on error try sending the request to another host
|
||||
// return if we kicked another request off ok
|
||||
if ( g_errno ) {
|
||||
@ -1069,7 +1118,7 @@ void Multicast::destroySlotsInProgress ( UdpSlot *slot ) {
|
||||
// destroy this slot that's in progress
|
||||
g_udpServer.destroySlot ( m_slots[i] );
|
||||
// do not re-destroy. consider no longer in progress.
|
||||
m_inProgress[i] = 0;
|
||||
m_inProgress[i] = false;
|
||||
}
|
||||
}
|
||||
|
||||
|
75
Multicast.h
75
Multicast.h
@ -17,9 +17,11 @@
|
||||
#ifndef GB_MULTICAST_H
|
||||
#define GB_MULTICAST_H
|
||||
|
||||
#include "Hostdb.h" // getGroup(), getTimes(), stampHost()
|
||||
#include "UdpServer.h" // sendRequest()
|
||||
#include "Loop.h" // registerSleepCallback()
|
||||
#include "MsgType.h"
|
||||
#include "GbMutex.h"
|
||||
#include <inttypes.h>
|
||||
#include <stddef.h>
|
||||
|
||||
|
||||
#define MAX_HOSTS_PER_GROUP 10
|
||||
|
||||
@ -31,6 +33,9 @@ static const int64_t multicast_msg3a_default_timeout = 10000;
|
||||
static const int64_t multicast_msg3a_maximum_timeout = 60000;
|
||||
static const int64_t multicast_msg1c_getip_default_timeout = 60000;
|
||||
|
||||
class UdpSlot;
|
||||
class Host;
|
||||
|
||||
|
||||
class Multicast {
|
||||
|
||||
@ -103,31 +108,37 @@ class Multicast {
|
||||
|
||||
// private:
|
||||
|
||||
void destroySlotsInProgress ( UdpSlot *slot );
|
||||
|
||||
// keep these public so C wrapper can call them
|
||||
bool sendToHostLoop(int32_t key, int32_t hostNumToTry, int32_t firstHostId);
|
||||
bool sendToHost ( int32_t i );
|
||||
int32_t pickBestHost ( uint32_t key , int32_t hostNumToTry );
|
||||
void gotReply1 ( UdpSlot *slot ) ;
|
||||
void closeUpShop ( UdpSlot *slot ) ;
|
||||
|
||||
void sendToGroup();
|
||||
void gotReply2 ( UdpSlot *slot ) ;
|
||||
|
||||
// . stuff set directly by send() parameters
|
||||
char *m_msg;
|
||||
int32_t m_msgSize;
|
||||
msg_type_t m_msgType;
|
||||
bool m_ownMsg;
|
||||
//uint32_t m_groupId;
|
||||
|
||||
class UdpSlot *m_slot;
|
||||
|
||||
bool m_inUse;
|
||||
|
||||
// for linked list of available Multicasts in Msg4.cpp
|
||||
class Multicast *m_next;
|
||||
|
||||
// host we got reply from. used by Msg3a for timing.
|
||||
Host *m_replyingHost;
|
||||
// when the request was launched to the m_replyingHost
|
||||
int64_t m_replyLaunchTime;
|
||||
|
||||
// more hack stuff used by PageInject.cpp
|
||||
int32_t m_hackFileId;
|
||||
int64_t m_hackFileOff;
|
||||
class ImportState *m_importState;
|
||||
|
||||
private:
|
||||
GbMutex m_mtx;
|
||||
|
||||
void *m_state;
|
||||
void *m_state2;
|
||||
void (* m_callback)( void *state , void *state2 );
|
||||
int64_t m_totalTimeout; // in milliseconds
|
||||
|
||||
class UdpSlot *m_slot;
|
||||
|
||||
// . m_slots[] is our list of concurrent transactions
|
||||
// . we delete all the slots only after cast is done
|
||||
int64_t m_startTime; // milliseconds since the epoch
|
||||
@ -150,7 +161,7 @@ class Multicast {
|
||||
// did we have an errno with this slot?
|
||||
int32_t m_errnos [MAX_HOSTS_PER_GROUP];
|
||||
// transaction in progress?
|
||||
char m_inProgress [MAX_HOSTS_PER_GROUP];
|
||||
bool m_inProgress [MAX_HOSTS_PER_GROUP];
|
||||
int64_t m_launchTime [MAX_HOSTS_PER_GROUP];
|
||||
|
||||
// steal this from the slot(s) we get
|
||||
@ -168,6 +179,7 @@ class Multicast {
|
||||
// . last sending of the request to ONE host in a group (pick & send)
|
||||
// . in milliseconds
|
||||
int64_t m_lastLaunch;
|
||||
|
||||
Host *m_lastLaunchHost;
|
||||
|
||||
// only free m_reply if this is true
|
||||
@ -180,22 +192,23 @@ class Multicast {
|
||||
|
||||
int32_t m_retryCount;
|
||||
|
||||
char m_sentToTwin;
|
||||
bool m_sentToTwin;
|
||||
|
||||
char m_inUse;
|
||||
void destroySlotsInProgress ( UdpSlot *slot );
|
||||
|
||||
// for linked list of available Multicasts in Msg4.cpp
|
||||
class Multicast *m_next;
|
||||
void sendToGroup();
|
||||
|
||||
// host we got reply from. used by Msg3a for timing.
|
||||
Host *m_replyingHost;
|
||||
// when the request was launched to the m_replyingHost
|
||||
int64_t m_replyLaunchTime;
|
||||
static void sleepWrapper1(int bogusfd, void *state);
|
||||
static void sleepWrapper2(int bogusfd, void *state);
|
||||
static void gotReply1(void *state, UdpSlot *slot);
|
||||
void gotReply1(UdpSlot *slot);
|
||||
static void gotReply2(void *state, UdpSlot *slot);
|
||||
void gotReply2(UdpSlot *slot);
|
||||
|
||||
// more hack stuff used by PageInject.cpp
|
||||
int32_t m_hackFileId;
|
||||
int64_t m_hackFileOff;
|
||||
class ImportState *m_importState;
|
||||
bool sendToHostLoop(int32_t key, int32_t hostNumToTry, int32_t firstHostId);
|
||||
bool sendToHost ( int32_t i );
|
||||
int32_t pickBestHost ( uint32_t key , int32_t hostNumToTry );
|
||||
void closeUpShop ( UdpSlot *slot ) ;
|
||||
};
|
||||
|
||||
#endif // GB_MULTICAST_H
|
||||
|
@ -95,7 +95,7 @@ bool sendPageAddUrl2 ( TcpSocket *sock , HttpRequest *hr ) {
|
||||
}
|
||||
|
||||
// add to spiderdb
|
||||
if ( ! gr->m_msg4.addMetaList( &(gr->m_listBuf), cr->m_collnum, gr, addedUrlsToSpiderdbWrapper, 0 ) ) {
|
||||
if (!gr->m_msg4.addMetaList(&(gr->m_listBuf), cr->m_collnum, gr, addedUrlsToSpiderdbWrapper)) {
|
||||
// blocked!
|
||||
return false;
|
||||
}
|
||||
|
@ -188,7 +188,7 @@ bool getSpiderRequestMetaList ( const char *doc, SafeBuf *listBuf, bool spiderLi
|
||||
if ( url.getUrlLen() <= 0 ) continue;
|
||||
|
||||
// need this
|
||||
int64_t probDocId = g_titledb.getProbableDocId(&url);
|
||||
int64_t probDocId = Titledb::getProbableDocId(&url);
|
||||
|
||||
// make it
|
||||
SpiderRequest sreq;
|
||||
|
@ -107,7 +107,7 @@ Host *getHostToHandleInjection ( char *url ) {
|
||||
Url norm;
|
||||
norm.set(url);
|
||||
|
||||
int64_t docId = g_titledb.getProbableDocId ( &norm );
|
||||
int64_t docId = Titledb::getProbableDocId ( &norm );
|
||||
uint32_t shardNum = getShardNumFromDocId(docId);
|
||||
Host *host = g_hostdb.getHostWithSpideringEnabled(shardNum);
|
||||
|
||||
@ -1057,7 +1057,7 @@ bool ImportState::importLoop ( ) {
|
||||
mcast->m_hackFileId = m_bfFileId;
|
||||
|
||||
// get docid from key
|
||||
docId = g_titledb.getDocIdFromKey ( &tkey );
|
||||
docId = Titledb::getDocIdFromKey ( &tkey );
|
||||
|
||||
// get shard that holds the titlerec for it
|
||||
shardNum = g_hostdb.getShardNumFromDocId ( docId );
|
||||
|
@ -594,7 +594,7 @@ static bool sendPageParser2 ( TcpSocket *s ,
|
||||
// if facebook, load xml content from title rec...
|
||||
bool isFacebook = (bool)strstr(st->m_u,"http://www.facebook.com/");
|
||||
if ( isFacebook && ! content ) {
|
||||
int64_t docId = g_titledb.getProbableDocId((char*)st->m_u);
|
||||
int64_t docId = Titledb::getProbableDocId((char*)st->m_u);
|
||||
sprintf(sreq.m_url ,"%" PRIu64, docId );
|
||||
sreq.m_isPageReindex = true;
|
||||
}
|
||||
|
@ -419,7 +419,7 @@ bool Msg1c::gotList ( ) {
|
||||
|
||||
log("reindex: adding docid list to spiderdb");
|
||||
|
||||
return m_msg4.addMetaList ( &m_sb, m_collnum, this, addedListWrapper, 0, RDB_SPIDERDB );
|
||||
return m_msg4.addMetaList(&m_sb, m_collnum, this, addedListWrapper, RDB_SPIDERDB);
|
||||
}
|
||||
|
||||
void addedListWrapper ( void *state ) {
|
||||
|
@ -11867,8 +11867,8 @@ bool Parms::syncParmsWithHost0 ( ) {
|
||||
void handleRequest3e ( UdpSlot *slot , int32_t niceness ) {
|
||||
// right now we must be host #0
|
||||
if ( g_hostdb.m_hostId != 0 ) {
|
||||
hadError:
|
||||
g_errno = EBADENGINEER;
|
||||
hadError:
|
||||
g_udpServer.sendErrorReply( slot, g_errno );
|
||||
return;
|
||||
}
|
||||
|
24
Posdb.cpp
24
Posdb.cpp
@ -143,7 +143,7 @@ bool Posdb::init ( ) {
|
||||
false , // istitledb?
|
||||
getKeySize(),
|
||||
false,
|
||||
true);
|
||||
g_conf.m_noInMemoryPosdbMerge);
|
||||
}
|
||||
|
||||
// init the rebuild/secondary rdb, used by PageRepair.cpp
|
||||
@ -160,15 +160,17 @@ bool Posdb::init2 ( int32_t treeMem ) {
|
||||
// must be able to fit all bins in memory
|
||||
// . we do not want posdb's bin tree to ever hit disk since we
|
||||
// dump it to rdb files when it is 90% full (90% of bins in use)
|
||||
return m_rdb.init ( g_hostdb.m_dir ,
|
||||
"posdbRebuild" ,
|
||||
getFixedDataSize(),
|
||||
1000 , // min files to merge
|
||||
treeMem ,
|
||||
maxTreeNodes ,
|
||||
getUseHalfKeys(),
|
||||
false ,
|
||||
getKeySize());
|
||||
return m_rdb.init(g_hostdb.m_dir,
|
||||
"posdbRebuild",
|
||||
getFixedDataSize(),
|
||||
1000, // min files to merge
|
||||
treeMem,
|
||||
maxTreeNodes,
|
||||
getUseHalfKeys(),
|
||||
false,
|
||||
getKeySize(),
|
||||
false,
|
||||
g_conf.m_noInMemoryPosdbMerge);
|
||||
}
|
||||
|
||||
|
||||
@ -546,7 +548,7 @@ int Posdb::printList ( RdbList &list ) {
|
||||
const char *dd = "";
|
||||
if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)";
|
||||
int64_t d = g_posdb.getDocId(&k);
|
||||
uint8_t dh = g_titledb.getDomHash8FromDocId(d);
|
||||
uint8_t dh = Titledb::getDomHash8FromDocId(d);
|
||||
char *rec = list.getCurrentRec();
|
||||
int32_t recSize = 18;
|
||||
if ( rec[0] & 0x04 ) recSize = 6;
|
||||
|
647
PosdbTable.cpp
647
PosdbTable.cpp
@ -2605,6 +2605,309 @@ VALGRIND_CHECK_MEM_IS_DEFINED(&dcs,sizeof(dcs));
|
||||
}
|
||||
|
||||
|
||||
// Pre-advance each termlist's cursor to skip to next docid.
|
||||
//
|
||||
// Set QueryTermInfo::m_matchingSubListCursor to NEXT docid
|
||||
// Set QueryTermInfo::m_matchingSubListSavedCursor to CURRENT docid
|
||||
// of each termlist so we are ready for a quick skip over this docid.
|
||||
//
|
||||
// TODO: use just a single array of termlist ptrs perhaps,
|
||||
// then we can remove them when they go NULL. and we'd save a little
|
||||
// time not having a nested loop.
|
||||
bool PosdbTable::advanceTermListCursors(const char *docIdPtr, QueryTermInfo *qtibuf) {
|
||||
logTrace(g_conf.m_logTracePosdb, "BEGIN");
|
||||
|
||||
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
|
||||
// get it
|
||||
QueryTermInfo *qti = &qtibuf[i];
|
||||
// do not advance negative termlist cursor
|
||||
if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
//
|
||||
// In first pass, sublists data is initialized by delNonMatchingDocIdsFromSubLists.
|
||||
// In second pass (to get detailed scoring info for UI output), they are initialized above
|
||||
//
|
||||
for ( int32_t j = 0 ; j < qti->m_numMatchingSubLists ; j++ ) {
|
||||
// shortcuts
|
||||
char *xc = qti->m_matchingSubListCursor[j];
|
||||
char *xcEnd = qti->m_matchingSubListEnd[j];
|
||||
|
||||
// exhausted? (we can't make cursor NULL because
|
||||
// getMaxPossibleScore() needs the last ptr)
|
||||
// must match docid
|
||||
if ( xc >= xcEnd ||
|
||||
*(int32_t *)(xc+8) != *(int32_t *)(docIdPtr+1) ||
|
||||
(*(char *)(xc+7)&0xfc) != (*(char *)(docIdPtr)&0xfc) ) {
|
||||
// flag it as not having the docid
|
||||
qti->m_matchingSubListSavedCursor[j] = NULL;
|
||||
// skip this sublist if it does not have our docid
|
||||
continue;
|
||||
}
|
||||
|
||||
// save it
|
||||
qti->m_matchingSubListSavedCursor[j] = xc;
|
||||
// get new docid
|
||||
//log("new docid %" PRId64,Posdb::getDocId(xc) );
|
||||
// advance the cursors. skip our 12
|
||||
xc += 12;
|
||||
// then skip any following 6 byte keys because they
|
||||
// share the same docid
|
||||
for ( ; ; xc += 6 ) {
|
||||
// end of whole termlist?
|
||||
if ( xc >= xcEnd ) {
|
||||
break;
|
||||
}
|
||||
|
||||
// sanity. no 18 byte keys allowed
|
||||
if ( (*xc & 0x06) == 0x00 ) {
|
||||
// i've seen this triggered on gk28.
|
||||
// a dump of posdb for the termlist
|
||||
// for 'post' had corruption in it,
|
||||
// yet its twin, gk92 did not. the
|
||||
// corruption could have occurred
|
||||
// anywhere from nov 2012 to may 2013,
|
||||
// and the posdb file was never
|
||||
// re-merged! must have been blatant
|
||||
// disk malfunction?
|
||||
log("posdb: encountered corrupt posdb list. bailing.");
|
||||
logTrace(g_conf.m_logTracePosdb, "END.");
|
||||
return false;
|
||||
//gbshutdownAbort(true);
|
||||
}
|
||||
// the next docid? it will be a 12 byte key.
|
||||
if ( ! (*xc & 0x04) ) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// assign to next docid word position list
|
||||
qti->m_matchingSubListCursor[j] = xc;
|
||||
}
|
||||
}
|
||||
|
||||
logTrace(g_conf.m_logTracePosdb, "END");
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
#define RINGBUFSIZE 4096
|
||||
|
||||
//
|
||||
// TODO: consider skipping this pre-filter if it sucks, as it does
|
||||
// for 'search engine'. it might save time!
|
||||
//
|
||||
// Returns:
|
||||
// false - docid does not meet minimum score requirement
|
||||
// true - docid can potentially be a top scoring docid
|
||||
//
|
||||
bool PosdbTable::prefilterMaxPossibleScoreByDistance(QueryTermInfo *qtibuf, const int32_t *qpos, float minWinningScore) {
|
||||
//#define RINGBUFSIZE 1024
|
||||
unsigned char ringBuf[RINGBUFSIZE+10];
|
||||
// for overflow conditions in loops below
|
||||
ringBuf[RINGBUFSIZE+0] = 0xff;
|
||||
ringBuf[RINGBUFSIZE+1] = 0xff;
|
||||
ringBuf[RINGBUFSIZE+2] = 0xff;
|
||||
ringBuf[RINGBUFSIZE+3] = 0xff;
|
||||
unsigned char qt;
|
||||
QueryTermInfo *qtx;
|
||||
uint32_t wx;
|
||||
int32_t ourFirstPos = -1;
|
||||
int32_t qdist;
|
||||
|
||||
logTrace(g_conf.m_logTracePosdb, "BEGIN");
|
||||
|
||||
|
||||
// reset ring buf. make all slots 0xff. should be 1000 cycles or so.
|
||||
memset ( ringBuf, 0xff, RINGBUFSIZE );
|
||||
|
||||
// now to speed up 'time enough for love' query which does not
|
||||
// have many super high scoring guys on top we need a more restrictive
|
||||
// filter than getMaxPossibleScore() so let's pick one query term,
|
||||
// the one with the shortest termlist, and see how close it gets to
|
||||
// each of the other query terms. then score each of those pairs.
|
||||
// so quickly record the word positions of each query term into
|
||||
// a ring buffer of 4096 slots where each slot contains the
|
||||
// query term # plus 1.
|
||||
|
||||
logTrace(g_conf.m_logTracePosdb, "Ring buffer generation");
|
||||
qtx = &qtibuf[m_minTermListIdx];
|
||||
// populate ring buf just for this query term
|
||||
for ( int32_t k = 0 ; k < qtx->m_numMatchingSubLists ; k++ ) {
|
||||
// scan that sublist and add word positions
|
||||
char *sub = qtx->m_matchingSubListSavedCursor[k];
|
||||
// skip sublist if its cursor is exhausted
|
||||
if ( ! sub ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
char *end = qtx->m_matchingSubListCursor[k];
|
||||
// add first key
|
||||
//int32_t wx = Posdb::getWordPos(sub);
|
||||
wx = (*((uint32_t *)(sub+3))) >> 6;
|
||||
// mod with 4096
|
||||
wx &= (RINGBUFSIZE-1);
|
||||
// store it. 0 is legit.
|
||||
ringBuf[wx] = m_minTermListIdx;
|
||||
// set this
|
||||
ourFirstPos = wx;
|
||||
// skip first key
|
||||
sub += 12;
|
||||
// then 6 byte keys
|
||||
for ( ; sub < end ; sub += 6 ) {
|
||||
// get word position
|
||||
//wx = Posdb::getWordPos(sub);
|
||||
wx = (*((uint32_t *)(sub+3))) >> 6;
|
||||
// mod with 4096
|
||||
wx &= (RINGBUFSIZE-1);
|
||||
// store it. 0 is legit.
|
||||
ringBuf[wx] = m_minTermListIdx;
|
||||
}
|
||||
}
|
||||
|
||||
// now get query term closest to query term # m_minTermListIdx which
|
||||
// is the query term # with the shortest termlist
|
||||
// get closest term to m_minTermListIdx and the distance
|
||||
logTrace(g_conf.m_logTracePosdb, "Ring buffer generation 2");
|
||||
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
|
||||
if ( i == m_minTermListIdx ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// get the query term info
|
||||
QueryTermInfo *qti = &qtibuf[i];
|
||||
|
||||
// if we have a negative term, skip it
|
||||
if ( qti->m_bigramFlags[0] & (BF_NEGATIVE) ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// store all his word positions into ring buffer AS WELL
|
||||
for ( int32_t k = 0 ; k < qti->m_numMatchingSubLists ; k++ ) {
|
||||
// scan that sublist and add word positions
|
||||
char *sub = qti->m_matchingSubListSavedCursor[k];
|
||||
// skip sublist if its cursor is exhausted
|
||||
if ( ! sub ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
char *end = qti->m_matchingSubListCursor[k];
|
||||
// add first key
|
||||
//int32_t wx = Posdb::getWordPos(sub);
|
||||
wx = (*((uint32_t *)(sub+3))) >> 6;
|
||||
// mod with 4096
|
||||
wx &= (RINGBUFSIZE-1);
|
||||
// store it. 0 is legit.
|
||||
ringBuf[wx] = i;
|
||||
// skip first key
|
||||
sub += 12;
|
||||
// then 6 byte keys
|
||||
for ( ; sub < end ; sub += 6 ) {
|
||||
// get word position
|
||||
//wx = Posdb::getWordPos(sub);
|
||||
wx = (*((uint32_t *)(sub+3))) >> 6;
|
||||
// mod with 4096
|
||||
wx &= (RINGBUFSIZE-1);
|
||||
// store it. 0 is legit.
|
||||
ringBuf[wx] = i;
|
||||
}
|
||||
}
|
||||
|
||||
// reset
|
||||
int32_t ourLastPos = -1;
|
||||
int32_t hisLastPos = -1;
|
||||
int32_t bestDist = 0x7fffffff;
|
||||
// how far is this guy from the man?
|
||||
for ( int32_t x = 0 ; x < (int32_t)RINGBUFSIZE ; ) {
|
||||
// skip next 4 slots if all empty. fast?
|
||||
if (*(uint32_t *)(ringBuf+x) == 0xffffffff) {
|
||||
x+=4;
|
||||
continue;
|
||||
}
|
||||
|
||||
// skip if nobody
|
||||
if ( ringBuf[x] == 0xff ) {
|
||||
x++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// get query term #
|
||||
qt = ringBuf[x];
|
||||
|
||||
// if it's the man
|
||||
if ( qt == m_minTermListIdx ) {
|
||||
// record
|
||||
hisLastPos = x;
|
||||
// skip if we are not there yet
|
||||
if ( ourLastPos == -1 ) {
|
||||
x++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// try distance fix
|
||||
if ( x - ourLastPos < bestDist ) {
|
||||
bestDist = x - ourLastPos;
|
||||
}
|
||||
}
|
||||
// if us
|
||||
else
|
||||
if ( qt == i ) {
|
||||
// record
|
||||
ourLastPos = x;
|
||||
// skip if he's not recorded yet
|
||||
if ( hisLastPos == -1 ) {
|
||||
x++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// update
|
||||
ourLastPos = x;
|
||||
//@@@ ^^ dupe
|
||||
// check dist
|
||||
if ( x - hisLastPos < bestDist ) {
|
||||
bestDist = x - hisLastPos;
|
||||
}
|
||||
}
|
||||
x++;
|
||||
continue; //@@@ doh...
|
||||
}
|
||||
|
||||
// compare last occurrence of query term #x with our first occurrence
|
||||
// since this is a RING buffer
|
||||
int32_t wrapDist = ourFirstPos + ((int32_t)RINGBUFSIZE-hisLastPos);
|
||||
if ( wrapDist < bestDist ) {
|
||||
bestDist = wrapDist;
|
||||
}
|
||||
|
||||
// query distance
|
||||
qdist = qpos[m_minTermListIdx] - qpos[i];
|
||||
// compute it
|
||||
float maxScore2 = getMaxPossibleScore(&qtibuf[i],
|
||||
bestDist,
|
||||
qdist,
|
||||
&qtibuf[m_minTermListIdx]);
|
||||
// -1 means it has inlink text so do not apply this constraint
|
||||
// to this docid because it is too difficult because we
|
||||
// sum up the inlink text
|
||||
if ( maxScore2 < 0.0 ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// if any one of these terms have a max score below the
|
||||
// worst score of the 10th result, then it can not win.
|
||||
// @todo: BR. Really? ANY of them?
|
||||
if ( maxScore2 <= minWinningScore ) {
|
||||
logTrace(g_conf.m_logTracePosdb, "END - docid score too low");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
logTrace(g_conf.m_logTracePosdb, "END - docid score high enough");
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// . compare the output of this to intersectLists9_r()
|
||||
// . hopefully this will be easier to understand and faster
|
||||
@ -2720,8 +3023,8 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
float pss;
|
||||
// scan the posdb keys in the smallest list
|
||||
// raised from 200 to 300,000 for 'da da da' query
|
||||
char mbuf[300000];
|
||||
char *mptrEnd = mbuf + 299000;
|
||||
char miniMergeBuf[300000];
|
||||
char *mptrEnd = miniMergeBuf + 299000;
|
||||
char *mptr;
|
||||
char *docIdPtr;
|
||||
char *docIdEnd = m_docIdVoteBuf.getBufStart()+m_docIdVoteBuf.length();
|
||||
@ -2732,22 +3035,11 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
char *lastMptr = NULL;
|
||||
int32_t topCursor = -9;
|
||||
int32_t numProcessed = 0;
|
||||
#define RINGBUFSIZE 4096
|
||||
//#define RINGBUFSIZE 1024
|
||||
unsigned char ringBuf[RINGBUFSIZE+10];
|
||||
// for overflow conditions in loops below
|
||||
ringBuf[RINGBUFSIZE+0] = 0xff;
|
||||
ringBuf[RINGBUFSIZE+1] = 0xff;
|
||||
ringBuf[RINGBUFSIZE+2] = 0xff;
|
||||
ringBuf[RINGBUFSIZE+3] = 0xff;
|
||||
unsigned char qt;
|
||||
QueryTermInfo *qtx;
|
||||
uint32_t wx;
|
||||
int32_t fail0 = 0;
|
||||
int32_t pass0 = 0;
|
||||
int32_t fail = 0;
|
||||
int32_t pass = 0;
|
||||
int32_t ourFirstPos = -1;
|
||||
|
||||
int32_t prefiltMaxPossScoreFail = 0;
|
||||
int32_t prefiltMaxPossScorePass = 0;
|
||||
int32_t prefiltBestDistMaxPossScoreFail = 0;
|
||||
int32_t prefiltBestDistMaxPossScorePass = 0;
|
||||
|
||||
|
||||
// populate the cursors for each sublist
|
||||
@ -2819,6 +3111,8 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
bool allDone = false;
|
||||
|
||||
while( !allDone && docIdPtr < docIdEnd ) {
|
||||
logTrace(g_conf.m_logTracePosdb, "Handling next docID");
|
||||
|
||||
bool skipToNextDocId = false;
|
||||
|
||||
// second pass? for printing out transparency info.
|
||||
@ -2830,101 +3124,34 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if( currPassNum == INTERSECT_SCORING ) {
|
||||
//
|
||||
// Pre-advance each termlist's cursor to skip to next docid.
|
||||
//
|
||||
// Set QueryTermInfo::m_matchingSubListCursor to NEXT docid
|
||||
// Set QueryTermInfo::m_matchingSubListSavedCursor to CURRENT docid
|
||||
// of each termlist so we are ready for a quick skip over this docid.
|
||||
//
|
||||
// TODO: use just a single array of termlist ptrs perhaps,
|
||||
// then we can remove them when they go NULL. and we'd save a little
|
||||
// time not having a nested loop.
|
||||
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
|
||||
// get it
|
||||
QueryTermInfo *qti = &qtibuf[i];
|
||||
// do not advance negative termlist cursor
|
||||
if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
//
|
||||
// In first pass, sublists data is initialized by delNonMatchingDocIdsFromSubLists.
|
||||
// In second pass (to get detailed scoring info for UI output), they are initialized above
|
||||
//
|
||||
for ( int32_t j = 0 ; j < qti->m_numMatchingSubLists ; j++ ) {
|
||||
// shortcuts
|
||||
char *xc = qti->m_matchingSubListCursor[j];
|
||||
char *xcEnd = qti->m_matchingSubListEnd[j];
|
||||
|
||||
// exhausted? (we can't make cursor NULL because
|
||||
// getMaxPossibleScore() needs the last ptr)
|
||||
// must match docid
|
||||
if ( xc >= xcEnd ||
|
||||
*(int32_t *)(xc+8) != *(int32_t *)(docIdPtr+1) ||
|
||||
(*(char *)(xc+7)&0xfc) != (*(char *)(docIdPtr)&0xfc) ) {
|
||||
// flag it as not having the docid
|
||||
qti->m_matchingSubListSavedCursor[j] = NULL;
|
||||
// skip this sublist if does not have our docid
|
||||
continue;
|
||||
}
|
||||
|
||||
// save it
|
||||
qti->m_matchingSubListSavedCursor[j] = xc;
|
||||
// get new docid
|
||||
//log("new docid %" PRId64,Posdb::getDocId(xc) );
|
||||
// advance the cursors. skip our 12
|
||||
xc += 12;
|
||||
// then skip any following 6 byte keys because they
|
||||
// share the same docid
|
||||
for ( ; ; xc += 6 ) {
|
||||
// end of whole termlist?
|
||||
if ( xc >= xcEnd ) {
|
||||
break;
|
||||
}
|
||||
|
||||
// sanity. no 18 byte keys allowed
|
||||
if ( (*xc & 0x06) == 0x00 ) {
|
||||
// i've seen this triggered on gk28.
|
||||
// a dump of posdb for the termlist
|
||||
// for 'post' had corruption in it,
|
||||
// yet its twin, gk92 did not. the
|
||||
// corruption could have occurred
|
||||
// anywhere from nov 2012 to may 2013,
|
||||
// and the posdb file was never
|
||||
// re-merged! must have been blatant
|
||||
// disk malfunction?
|
||||
log("posdb: encountered corrupt posdb list. bailing.");
|
||||
logTrace(g_conf.m_logTracePosdb, "END.");
|
||||
return;
|
||||
//gbshutdownAbort(true);
|
||||
}
|
||||
// the next docid? it will be a 12 byte key.
|
||||
if ( ! (*xc & 0x04) ) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// assign to next docid word position list
|
||||
qti->m_matchingSubListCursor[j] = xc;
|
||||
}
|
||||
if( !advanceTermListCursors(docIdPtr, qtibuf) ) {
|
||||
logTrace(g_conf.m_logTracePosdb, "END. advanceTermListCursors failed");
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
if( !m_q->m_isBoolean ) {
|
||||
|
||||
//##
|
||||
//## PRE-FILTERS. Discard DocIDs that cannot meet the minimum required
|
||||
//## score, before entering the main scoring loop below
|
||||
//##
|
||||
|
||||
|
||||
// TODO: consider skipping this pre-filter if it sucks, as it does
|
||||
// for 'time enough for love'. it might save time!
|
||||
|
||||
//
|
||||
// Calculate maximum possible score for a document. If the max score
|
||||
// is lower than the current minimum winning score, give up already
|
||||
// now and skip to the next docid.
|
||||
//
|
||||
|
||||
// Only go through this if we actually have a minimum score to compare with ...
|
||||
// No need if minWinningScore is still -1
|
||||
if ( minWinningScore >= 0 ) {
|
||||
if ( minWinningScore >= 0.0 ) {
|
||||
logTrace(g_conf.m_logTracePosdb, "Compute 'upper bound' for each query term");
|
||||
|
||||
// If there's no way we can break into the winner's circle, give up!
|
||||
@ -2949,7 +3176,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
// worst score of the 10th result, then it can not win.
|
||||
if ( maxScore <= minWinningScore ) {
|
||||
docIdPtr += 6;
|
||||
fail0++;
|
||||
prefiltMaxPossScoreFail++;
|
||||
skipToNextDocId = true;
|
||||
break; // break out of numQueryTermsToHandle loop
|
||||
}
|
||||
@ -2961,197 +3188,14 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
pass0++;
|
||||
prefiltMaxPossScorePass++;
|
||||
|
||||
if ( minWinningScore >= 0.0 && m_sortByTermNum < 0 && m_sortByTermNumInt < 0 ) {
|
||||
|
||||
if ( m_sortByTermNum < 0 && m_sortByTermNumInt < 0 ) {
|
||||
// TODO: consider skipping this pre-filter if it sucks, as it does
|
||||
// for 'search engine'. it might save time!
|
||||
|
||||
// reset ring buf. make all slots 0xff. should be 1000 cycles or so.
|
||||
memset ( ringBuf, 0xff, RINGBUFSIZE );
|
||||
|
||||
// now to speed up 'time enough for love' query which does not
|
||||
// have many super high scoring guys on top we need a more restrictive
|
||||
// filter than getMaxPossibleScore() so let's pick one query term,
|
||||
// the one with the shortest termlist, and see how close it gets to
|
||||
// each of the other query terms. then score each of those pairs.
|
||||
// so quickly record the word positions of each query term into
|
||||
// a ring buffer of 4096 slots where each slot contains the
|
||||
// query term # (0 is a legit value; 0xff marks an empty slot).
|
||||
|
||||
logTrace(g_conf.m_logTracePosdb, "Ring buffer generation");
|
||||
qtx = &qtibuf[m_minTermListIdx];
|
||||
// populate ring buf just for this query term
|
||||
for ( int32_t k = 0 ; k < qtx->m_numMatchingSubLists ; k++ ) {
|
||||
// scan that sublist and add word positions
|
||||
char *sub = qtx->m_matchingSubListSavedCursor[k];
|
||||
// skip sublist if its cursor is exhausted
|
||||
if ( ! sub ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
char *end = qtx->m_matchingSubListCursor[k];
|
||||
// add first key
|
||||
//int32_t wx = Posdb::getWordPos(sub);
|
||||
wx = (*((uint32_t *)(sub+3))) >> 6;
|
||||
// mod with 4096
|
||||
wx &= (RINGBUFSIZE-1);
|
||||
// store it. 0 is legit.
|
||||
ringBuf[wx] = m_minTermListIdx;
|
||||
// set this
|
||||
ourFirstPos = wx;
|
||||
// skip first key
|
||||
sub += 12;
|
||||
// then 6 byte keys
|
||||
for ( ; sub < end ; sub += 6 ) {
|
||||
// get word position
|
||||
//wx = Posdb::getWordPos(sub);
|
||||
wx = (*((uint32_t *)(sub+3))) >> 6;
|
||||
// mod with 4096
|
||||
wx &= (RINGBUFSIZE-1);
|
||||
// store it. 0 is legit.
|
||||
ringBuf[wx] = m_minTermListIdx;
|
||||
}
|
||||
}
|
||||
|
||||
// now get query term closest to query term # m_minTermListIdx which
|
||||
// is the query term # with the shortest termlist
|
||||
// get closest term to m_minTermListIdx and the distance
|
||||
logTrace(g_conf.m_logTracePosdb, "Ring buffer generation 2");
|
||||
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
|
||||
// skip the man
|
||||
if ( i == m_minTermListIdx ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// get the query term info
|
||||
QueryTermInfo *qti = &qtibuf[i];
|
||||
// if we have a negative term, skip it
|
||||
if ( qti->m_bigramFlags[0] & (BF_NEGATIVE) ) {
|
||||
// if its empty, that's good!
|
||||
continue;
|
||||
}
|
||||
|
||||
// store all his word positions into ring buffer AS WELL
|
||||
for ( int32_t k = 0 ; k < qti->m_numMatchingSubLists ; k++ ) {
|
||||
// scan that sublist and add word positions
|
||||
char *sub = qti->m_matchingSubListSavedCursor[k];
|
||||
// skip sublist if its cursor is exhausted
|
||||
if ( ! sub ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
char *end = qti->m_matchingSubListCursor[k];
|
||||
// add first key
|
||||
//int32_t wx = Posdb::getWordPos(sub);
|
||||
wx = (*((uint32_t *)(sub+3))) >> 6;
|
||||
// mod with 4096
|
||||
wx &= (RINGBUFSIZE-1);
|
||||
// store it. 0 is legit.
|
||||
ringBuf[wx] = i;
|
||||
// skip first key
|
||||
sub += 12;
|
||||
// then 6 byte keys
|
||||
for ( ; sub < end ; sub += 6 ) {
|
||||
// get word position
|
||||
//wx = Posdb::getWordPos(sub);
|
||||
wx = (*((uint32_t *)(sub+3))) >> 6;
|
||||
// mod with 4096
|
||||
wx &= (RINGBUFSIZE-1);
|
||||
// store it. 0 is legit.
|
||||
ringBuf[wx] = i;
|
||||
}
|
||||
}
|
||||
|
||||
// reset
|
||||
int32_t ourLastPos = -1;
|
||||
int32_t hisLastPos = -1;
|
||||
int32_t bestDist = 0x7fffffff;
|
||||
// how far is this guy from the man?
|
||||
for ( int32_t x = 0 ; x < (int32_t)RINGBUFSIZE ; ) {
|
||||
// skip next 4 slots if all empty. fast?
|
||||
if (*(uint32_t *)(ringBuf+x) == 0xffffffff) {
|
||||
x+=4;
|
||||
continue;
|
||||
}
|
||||
|
||||
// skip if nobody
|
||||
if ( ringBuf[x] == 0xff ) {
|
||||
x++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// get query term #
|
||||
qt = ringBuf[x];
|
||||
|
||||
// if it's the man
|
||||
if ( qt == m_minTermListIdx ) {
|
||||
// record
|
||||
hisLastPos = x;
|
||||
// skip if we are not there yet
|
||||
if ( ourLastPos == -1 ) {
|
||||
x++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// try distance fix
|
||||
if ( x - ourLastPos < bestDist ) {
|
||||
bestDist = x - ourLastPos;
|
||||
}
|
||||
}
|
||||
// if us
|
||||
else
|
||||
if ( qt == i ) {
|
||||
// record
|
||||
ourLastPos = x;
|
||||
// skip if he's not recorded yet
|
||||
if ( hisLastPos == -1 ) {
|
||||
x++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// update
|
||||
ourLastPos = x;
|
||||
|
||||
// check dist
|
||||
if ( x - hisLastPos < bestDist ) {
|
||||
bestDist = x - hisLastPos;
|
||||
}
|
||||
}
|
||||
x++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// compare last occurrence of query term #x with our first occurrence
|
||||
// since this is a RING buffer
|
||||
int32_t wrapDist = ourFirstPos + ((int32_t)RINGBUFSIZE-hisLastPos);
|
||||
if ( wrapDist < bestDist ) {
|
||||
bestDist = wrapDist;
|
||||
}
|
||||
|
||||
// query distance
|
||||
qdist = qpos[m_minTermListIdx] - qpos[i];
|
||||
// compute it
|
||||
float maxScore2 = getMaxPossibleScore(&qtibuf[i],
|
||||
bestDist,
|
||||
qdist,
|
||||
&qtibuf[m_minTermListIdx]);
|
||||
// -1 means it has inlink text so do not apply this constraint
|
||||
// to this docid because it is too difficult because we
|
||||
// sum up the inlink text
|
||||
if ( maxScore2 < 0.0 ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// if any one of these terms have a max score below the
|
||||
// worst score of the 10th result, then it can not win.
|
||||
if ( maxScore2 <= minWinningScore ) {
|
||||
docIdPtr += 6;
|
||||
fail++;
|
||||
skipToNextDocId = true;
|
||||
break; // break out of numQueryTermsToHandle loop
|
||||
}
|
||||
if( !prefilterMaxPossibleScoreByDistance(qtibuf, qpos, minWinningScore) ) {
|
||||
docIdPtr += 6;
|
||||
prefiltBestDistMaxPossScoreFail++;
|
||||
skipToNextDocId = true;
|
||||
}
|
||||
} // not m_sortByTermNum or m_sortByTermNumInt
|
||||
|
||||
@ -3159,7 +3203,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
// Continue docIdPtr < docIdEnd loop
|
||||
continue;
|
||||
}
|
||||
pass++;
|
||||
prefiltBestDistMaxPossScorePass++;
|
||||
} // !m_q->m_isBoolean
|
||||
} // currPassNum == INTERSECT_SCORING
|
||||
|
||||
@ -3186,6 +3230,8 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// PERFORMANCE HACK:
|
||||
//
|
||||
@ -3196,7 +3242,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
|
||||
// all posdb keys for this docid should fit in here, the
|
||||
// mini merge buf:
|
||||
mptr = mbuf;
|
||||
mptr = miniMergeBuf;
|
||||
|
||||
// . merge each set of sublists
|
||||
// . like we merge a term's list with its two associated bigram
|
||||
@ -3204,10 +3250,11 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
// . and merge all the synonym lists for that term together as well.
|
||||
// so if the term is 'run' we merge it with the lists for
|
||||
// 'running' 'ran' etc.
|
||||
logTrace(g_conf.m_logTracePosdb, "Merge sublists");
|
||||
logTrace(g_conf.m_logTracePosdb, "Merge sublists into a single list per query term");
|
||||
for ( int32_t j = 0 ; j < m_numQueryTermInfos ; j++ ) {
|
||||
// get the query term info
|
||||
QueryTermInfo *qti = &qtibuf[j];
|
||||
|
||||
// just use the flags from first term i guess
|
||||
// NO! this loses the wikihalfstopbigram bit! so we gotta
|
||||
// add that in for the key i guess the same way we add in
|
||||
@ -3220,9 +3267,11 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
// if its empty, that's good!
|
||||
continue;
|
||||
}
|
||||
|
||||
// the merged list for term #j is here:
|
||||
miniMergedList [j] = mptr;
|
||||
miniMergedList[j] = mptr;
|
||||
bool isFirstKey = true;
|
||||
|
||||
// populate the nwp[] arrays for merging
|
||||
int32_t nsub = 0;
|
||||
for ( int32_t k = 0 ; k < qti->m_numMatchingSubLists ; k++ ) {
|
||||
@ -3261,7 +3310,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
bflags [j] = nwpFlags[0];
|
||||
continue;
|
||||
}
|
||||
// . ok, merge the lists into a list in mbuf
|
||||
// . ok, merge the lists into a list in miniMergeBuf
|
||||
// . get the min of each list
|
||||
|
||||
bool currTermDone = false;
|
||||
@ -3396,7 +3445,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
}
|
||||
|
||||
// breach?
|
||||
if ( mptr > mbuf + 300000 ) {
|
||||
if ( mptr > miniMergeBuf + 300000 ) {
|
||||
gbshutdownAbort(true);
|
||||
}
|
||||
|
||||
@ -4074,9 +4123,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
|
||||
// advance to next docid
|
||||
docIdPtr += 6;
|
||||
|
||||
logTrace(g_conf.m_logTracePosdb, "^ Now repeat for next docID");
|
||||
}
|
||||
} // docIdPtr < docIdEnd loop
|
||||
|
||||
|
||||
if ( m_debug ) {
|
||||
@ -4091,10 +4138,10 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
|
||||
|
||||
if ( m_debug ) {
|
||||
log(LOG_INFO, "posdb: # fail0 = %" PRId32" ", fail0 );
|
||||
log(LOG_INFO, "posdb: # pass0 = %" PRId32" ", pass0 );
|
||||
log(LOG_INFO, "posdb: # fail = %" PRId32" ", fail );
|
||||
log(LOG_INFO, "posdb: # pass = %" PRId32" ", pass );
|
||||
log(LOG_INFO, "posdb: # prefiltMaxPossScoreFail........: %" PRId32" ", prefiltMaxPossScoreFail );
|
||||
log(LOG_INFO, "posdb: # prefiltMaxPossScorePass........: %" PRId32" ", prefiltMaxPossScorePass );
|
||||
log(LOG_INFO, "posdb: # prefiltBestDistMaxPossScoreFail: %" PRId32" ", prefiltBestDistMaxPossScoreFail );
|
||||
log(LOG_INFO, "posdb: # prefiltBestDistMaxPossScorePass: %" PRId32" ", prefiltBestDistMaxPossScorePass );
|
||||
}
|
||||
|
||||
// get time now
|
||||
|
@ -126,8 +126,11 @@ class PosdbTable {
|
||||
return m_initialized;
|
||||
}
|
||||
|
||||
// functions used by intersectlist
|
||||
bool genDebugScoreInfo1(int32_t &numProcessed, int32_t &topCursor, QueryTermInfo *qtibuf);
|
||||
bool genDebugScoreInfo2(DocIdScore &dcs, int32_t &lastLen, uint64_t &lastDocId, char siteRank, float score, int32_t intScore, char docLang);
|
||||
bool advanceTermListCursors(const char *docIdPtr, QueryTermInfo *qtibuf);
|
||||
bool prefilterMaxPossibleScoreByDistance(QueryTermInfo *qtibuf, const int32_t *qpos, float minWinningScore);
|
||||
|
||||
uint64_t m_docId;
|
||||
|
||||
|
7
Rdb.cpp
7
Rdb.cpp
@ -123,6 +123,7 @@ bool Rdb::init ( const char *dir ,
|
||||
m_useHalfKeys = useHalfKeys;
|
||||
m_isTitledb = isTitledb;
|
||||
m_ks = keySize;
|
||||
m_useIndexFile = useIndexFile;
|
||||
m_inDumpLoop = false;
|
||||
|
||||
// set our id
|
||||
@ -137,12 +138,6 @@ bool Rdb::init ( const char *dir ,
|
||||
g_process.shutdownAbort(true);
|
||||
}
|
||||
|
||||
if (m_rdbId == RDB_POSDB || m_rdbId == RDB2_POSDB2) {
|
||||
m_useIndexFile = g_conf.m_noInMemoryPosdbMerge ? useIndexFile : false;
|
||||
} else {
|
||||
m_useIndexFile = useIndexFile;
|
||||
}
|
||||
|
||||
// get page size
|
||||
m_pageSize = GB_TFNDB_PAGE_SIZE;
|
||||
if ( m_rdbId == RDB_POSDB ) m_pageSize = GB_INDEXDB_PAGE_SIZE;
|
||||
|
2
Rdb.h
2
Rdb.h
@ -239,6 +239,8 @@ public:
|
||||
m_inDumpLoop = inDumpLoop;
|
||||
}
|
||||
|
||||
// Accessor: returns the current value of the m_useIndexFile flag.
bool isUseIndexFile() const { return m_useIndexFile; }
|
||||
|
||||
bool inAddList() const { return m_inAddList; }
|
||||
|
||||
// . you'll lose your data in this class if you call this
|
||||
|
@ -1408,7 +1408,7 @@ bool RdbBase::attemptMerge( int32_t niceness, bool forceMergeAll, bool doLog , i
|
||||
// then do not do the merge, we do not want to overwrite tfndb via
|
||||
// RdbDump::updateTfndbLoop()
|
||||
rdbid_t rdbId = getIdFromRdb ( m_rdb );
|
||||
if ( rdbId == RDB_TITLEDB && g_titledb.m_rdb.isDumping() ) {
|
||||
if ( rdbId == RDB_TITLEDB && g_titledb.getRdb()->isDumping() ) {
|
||||
if ( doLog ) {
|
||||
log( LOG_INFO, "db: Can not merge titledb while it is dumping." );
|
||||
}
|
||||
|
71
RdbList.cpp
71
RdbList.cpp
@ -133,26 +133,32 @@ void RdbList::set(char *list, int32_t listSize, char *alloc, int32_t allocSize,
|
||||
verify_signature();
|
||||
logTrace(g_conf.m_logTraceRdbList, "BEGIN. list=%p listSize=%" PRId32" alloc=%p allocSize=%" PRId32,
|
||||
list, listSize, alloc, allocSize);
|
||||
logTrace(g_conf.m_logTraceRdbList, "startKey=%s endKey=%s keySize=%hhu fixedDataSize=%" PRId32,
|
||||
KEYSTR(startKey, keySize), KEYSTR(endKey, keySize), keySize, fixedDataSize);
|
||||
|
||||
// free and NULLify any old m_list we had to make room for our new list
|
||||
freeList();
|
||||
|
||||
// set this first since others depend on it
|
||||
m_ks = keySize;
|
||||
|
||||
// sanity check (happens when IndexReadInfo exhausts a list to Msg2)
|
||||
if ( KEYCMP(startKey,endKey,m_ks) > 0 )
|
||||
log(LOG_REMIND,"db: rdblist: set: startKey > endKey.");
|
||||
if (KEYCMP(startKey, endKey, m_ks) > 0) {
|
||||
log(LOG_WARN, "db: rdblist: set: startKey > endKey.");
|
||||
}
|
||||
|
||||
// safety check
|
||||
if ( fixedDataSize != 0 && useHalfKeys ) {
|
||||
log(LOG_LOGIC,"db: rdblist: set: useHalfKeys 1 when "
|
||||
"fixedDataSize not 0.");
|
||||
if (fixedDataSize != 0 && useHalfKeys) {
|
||||
log(LOG_LOGIC, "db: rdblist: set: useHalfKeys 1 when fixedDataSize not 0.");
|
||||
useHalfKeys = false;
|
||||
}
|
||||
|
||||
// got an extremely ugly corrupt stack core without this check
|
||||
if ( m_list && m_listSize == 0 ){
|
||||
log ( LOG_WARN, "rdblist: listSize of 0 but list pointer not "
|
||||
"NULL!" );
|
||||
if (m_list && m_listSize == 0) {
|
||||
log(LOG_WARN, "rdblist: listSize of 0 but list pointer not NULL!");
|
||||
m_list = NULL;
|
||||
}
|
||||
|
||||
// set our list parms
|
||||
m_list = list;
|
||||
m_listSize = listSize;
|
||||
@ -164,8 +170,11 @@ void RdbList::set(char *list, int32_t listSize, char *alloc, int32_t allocSize,
|
||||
m_fixedDataSize = fixedDataSize;
|
||||
m_ownData = ownData;
|
||||
m_useHalfKeys = useHalfKeys;
|
||||
|
||||
// use this call now to set m_listPtr and m_listPtrHi based on m_list
|
||||
resetListPtr();
|
||||
|
||||
logTrace(g_conf.m_logTraceRdbList, "END");
|
||||
}
|
||||
|
||||
// like above but uses 0/maxKey for startKey/endKey
|
||||
@ -976,9 +985,6 @@ bool RdbList::removeBadData_r ( ) {
|
||||
|
||||
|
||||
int RdbList::printPosdbList() {
|
||||
|
||||
logf(LOG_DEBUG, "%s:%s: BEGIN",__FILE__,__func__);
|
||||
|
||||
// save
|
||||
char *oldp = m_listPtr;
|
||||
const char *oldphi = m_listPtrHi;
|
||||
@ -1074,7 +1080,6 @@ int RdbList::printPosdbList() {
|
||||
m_listPtr = oldp;
|
||||
m_listPtrHi = oldphi;
|
||||
|
||||
logf(LOG_DEBUG, "%s:%s: END",__FILE__,__func__);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1084,9 +1089,6 @@ int RdbList::printList() {
|
||||
return printPosdbList();
|
||||
}
|
||||
|
||||
logf(LOG_DEBUG, "%s:%s: BEGIN",__FILE__,__func__);
|
||||
|
||||
//log("m_list=%" PRId32,(int32_t)m_list);
|
||||
// save
|
||||
char *oldp = m_listPtr;
|
||||
const char *oldphi = m_listPtrHi;
|
||||
@ -1120,7 +1122,6 @@ int RdbList::printList() {
|
||||
m_listPtr = oldp;
|
||||
m_listPtrHi = oldphi;
|
||||
|
||||
logf(LOG_DEBUG, "%s:%s: END",__FILE__,__func__);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1570,11 +1571,6 @@ bool RdbList::posdbConstrain(const char *startKey, char *endKey, int32_t minRecS
|
||||
}
|
||||
|
||||
// write the full key back into "p"
|
||||
KEYSET(p, k, 18);
|
||||
} else if (p[0] & 0x02) {
|
||||
// write the key back 6 bytes
|
||||
p -= 6;
|
||||
|
||||
KEYSET(p, k, 18);
|
||||
}
|
||||
|
||||
@ -1755,8 +1751,7 @@ void RdbList::merge_r(RdbList **lists, int32_t numLists, const char *startKey, c
|
||||
|
||||
// did they call prepareForMerge()?
|
||||
if ( m_mergeMinListSize == -1 ) {
|
||||
log(LOG_LOGIC,"db: rdblist: merge_r: prepareForMerge() not "
|
||||
"called. ignoring error and returning emtpy list.");
|
||||
log(LOG_LOGIC,"db: rdblist: merge_r: prepareForMerge() not called. ignoring error and returning emtpy list.");
|
||||
// this happens if we nuke doledb during a merge of it. it is just bad timing
|
||||
return;
|
||||
// save state and dump core, sigBadHandler will catch this
|
||||
@ -1770,8 +1765,8 @@ void RdbList::merge_r(RdbList **lists, int32_t numLists, const char *startKey, c
|
||||
|
||||
// warning msg
|
||||
if ( m_listPtr != m_listEnd ) {
|
||||
log( LOG_LOGIC, "db: rdblist: merge_r: warning. merge not storing at end of list for %s.",
|
||||
getDbnameFromId( ( uint8_t ) rdbId ) );
|
||||
log(LOG_LOGIC, "db: rdblist: merge_r: warning. merge not storing at end of list for %s.",
|
||||
getDbnameFromId((uint8_t)rdbId));
|
||||
}
|
||||
|
||||
// set our key range
|
||||
@ -1782,8 +1777,6 @@ void RdbList::merge_r(RdbList **lists, int32_t numLists, const char *startKey, c
|
||||
// deletes all the urls then does a dump of just negative keys.
|
||||
// so let's comment it out for now
|
||||
if ( KEYCMP(m_startKey,m_endKey,m_ks)!=0 && KEYNEG(m_endKey) ) {
|
||||
// log(LOG_LOGIC,"db: rdblist: merge_r: Illegal endKey for "
|
||||
// "merging rdb=%s. fixing.",getDbnameFromId(rdbId));
|
||||
// make it legal so it will be read first NEXT time
|
||||
KEYDEC(m_endKey,m_ks);
|
||||
}
|
||||
@ -1814,6 +1807,13 @@ void RdbList::merge_r(RdbList **lists, int32_t numLists, const char *startKey, c
|
||||
return;
|
||||
}
|
||||
|
||||
// check that we're not using index for other rdb file than posdb
|
||||
Rdb* rdb = getRdbFromId(rdbId);
|
||||
if (rdb->isUseIndexFile()) {
|
||||
/// @todo ALC logic to use index file is not implemented for any rdb other than posdb. add it below if required
|
||||
gbshutdownLogicError();
|
||||
}
|
||||
|
||||
int32_t required = -1;
|
||||
// . if merge not necessary, print a warning message.
|
||||
// . caller should have just called constrain() then
|
||||
@ -2133,6 +2133,7 @@ skip:
|
||||
///////
|
||||
|
||||
bool RdbList::posdbMerge_r(RdbList **lists, int32_t numLists, const char *startKey, const char *endKey, int32_t minRecSizes, bool removeNegKeys) {
|
||||
logTrace(g_conf.m_logTraceRdbList, "BEGIN");
|
||||
// sanity
|
||||
if (m_ks != sizeof(key144_t)) {
|
||||
gbshutdownAbort(true);
|
||||
@ -2264,6 +2265,7 @@ bool RdbList::posdbMerge_r(RdbList **lists, int32_t numLists, const char *startK
|
||||
// . continue if tie, so we get the oldest first
|
||||
// . treat negative and positive keys as identical for this
|
||||
if (ss < 0) {
|
||||
logTrace(g_conf.m_logTraceRdbList, "ss < 0. continue");
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -2271,9 +2273,12 @@ bool RdbList::posdbMerge_r(RdbList **lists, int32_t numLists, const char *startK
|
||||
// and minPtrBase/Lo/Hi was a negative key! so this is
|
||||
// the annihilation. skip the positive key.
|
||||
if (ss == 0) {
|
||||
logTrace(g_conf.m_logTraceRdbList, "ss == 0. skip");
|
||||
goto skip;
|
||||
}
|
||||
|
||||
logTrace(g_conf.m_logTraceRdbList, "new min i=%" PRId32, i);
|
||||
|
||||
// we got a new min
|
||||
minPtrBase = ptrs [i];
|
||||
minPtrLo = loKeys[i];
|
||||
@ -2283,6 +2288,7 @@ bool RdbList::posdbMerge_r(RdbList **lists, int32_t numLists, const char *startK
|
||||
|
||||
// ignore if negative i guess, just skip it
|
||||
if (removeNegKeys && (minPtrBase[0] & 0x01) == 0x00) {
|
||||
logTrace(g_conf.m_logTraceRdbList, "removeNegKeys. skip");
|
||||
goto skip;
|
||||
}
|
||||
|
||||
@ -2293,11 +2299,13 @@ bool RdbList::posdbMerge_r(RdbList **lists, int32_t numLists, const char *startK
|
||||
if (m_listPtrHi && cmp_6bytes_equal(minPtrHi, m_listPtrHi)) {
|
||||
if (m_listPtrLo && cmp_6bytes_equal(minPtrLo, m_listPtrLo)) {
|
||||
// 6-byte entry
|
||||
logTrace(g_conf.m_logTraceRdbList, "store 6-byte key");
|
||||
memcpy(new_listPtr, minPtrBase, 6);
|
||||
new_listPtr += 6;
|
||||
*pp |= 0x06; //turn on both compression bits
|
||||
} else {
|
||||
// 12-byte entry
|
||||
logTrace(g_conf.m_logTraceRdbList, "store 12-byte key");
|
||||
memcpy(new_listPtr, minPtrBase, 6);
|
||||
new_listPtr += 6;
|
||||
memcpy(new_listPtr, minPtrLo, 6);
|
||||
@ -2307,6 +2315,7 @@ bool RdbList::posdbMerge_r(RdbList **lists, int32_t numLists, const char *startK
|
||||
}
|
||||
} else {
|
||||
// 18-byte entry
|
||||
logTrace(g_conf.m_logTraceRdbList, "store 18-byte key");
|
||||
memcpy(new_listPtr, minPtrBase, 6);
|
||||
new_listPtr += 6;
|
||||
memcpy(new_listPtr, minPtrLo, 6);
|
||||
@ -2336,11 +2345,14 @@ skip:
|
||||
// is new key 6 bytes? then do not touch hi/lo ptrs
|
||||
if ( ptrs[mini][0] & 0x04 ) {
|
||||
// no-op
|
||||
logTrace(g_conf.m_logTraceRdbList, "new 6-byte key");
|
||||
} else if ( ptrs[mini][0] & 0x02 ) {
|
||||
// is new key 12 bytes?
|
||||
logTrace(g_conf.m_logTraceRdbList, "new 12-byte key");
|
||||
memcpy(loKeys[mini], ptrs[mini] + 6, 6);
|
||||
} else {
|
||||
// is new key 18 bytes? full key.
|
||||
logTrace(g_conf.m_logTraceRdbList, "new 18-byte key");
|
||||
memcpy(hiKeys[mini], ptrs[mini] + 12, 6);
|
||||
memcpy(loKeys[mini], ptrs[mini] + 6, 6);
|
||||
}
|
||||
@ -2348,6 +2360,7 @@ skip:
|
||||
//
|
||||
// REMOVE THE LIST at mini
|
||||
//
|
||||
logTrace(g_conf.m_logTraceRdbList, "remove list at mini=%" PRId32, mini);
|
||||
|
||||
// otherwise, remove him from array
|
||||
for (int32_t i = mini; i < numLists - 1; i++) {
|
||||
@ -2375,6 +2388,7 @@ skip:
|
||||
|
||||
// return now if we're empty... all our recs annihilated?
|
||||
if (m_listSize <= 0) {
|
||||
logTrace(g_conf.m_logTraceRdbList, "END. no more list");
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -2410,6 +2424,7 @@ skip:
|
||||
if (g_conf.m_logTraceRdbList) {
|
||||
printList();
|
||||
}
|
||||
logTrace(g_conf.m_logTraceRdbList, "END. Less than requested");
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -2419,6 +2434,7 @@ skip:
|
||||
if (g_conf.m_logTraceRdbList) {
|
||||
printList();
|
||||
}
|
||||
logTrace(g_conf.m_logTraceRdbList, "END. No more list");
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -2449,6 +2465,7 @@ skip:
|
||||
printList();
|
||||
}
|
||||
|
||||
logTrace(g_conf.m_logTraceRdbList, "END. Done");
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -115,7 +115,7 @@ public:
|
||||
int32_t getAllocSize() const { return m_allocSize; }
|
||||
void setAllocSize(int32_t allocSize) { m_allocSize = allocSize; }
|
||||
|
||||
int32_t getFixedDataSize() { return m_fixedDataSize; }
|
||||
int32_t getFixedDataSize() const { return m_fixedDataSize; }
|
||||
void setFixedDataSize(int32_t fixedDataSize) { m_fixedDataSize = fixedDataSize; }
|
||||
|
||||
// . merge_r() sets m_lastKey for the list it merges the others into
|
||||
@ -135,7 +135,7 @@ public:
|
||||
bool isLastKeyValid() const { return m_lastKeyIsValid; }
|
||||
void setLastKeyIsValid(bool lastKeyIsValid) { m_lastKeyIsValid = lastKeyIsValid; }
|
||||
|
||||
bool getOwnData() { return m_ownData; }
|
||||
bool getOwnData() const { return m_ownData; }
|
||||
// if you don't want data to be freed on destruction then don't own it
|
||||
void setOwnData(bool ownData) { m_ownData = ownData; }
|
||||
|
||||
|
@ -1196,13 +1196,10 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
|
||||
|
||||
// these guy always use a collnum of 0
|
||||
bool doCollRecCheck = true;
|
||||
if ( !strcmp(m_dbname,"catdb") ) doCollRecCheck = false;
|
||||
if ( !strcmp(m_dbname,"statsdb") ) doCollRecCheck = false;
|
||||
|
||||
|
||||
if ( !strcmp(m_dbname,"indexdb") ) useHalfKeys = true;
|
||||
if ( !strcmp(m_dbname,"datedb" ) ) useHalfKeys = true;
|
||||
if ( !strcmp(m_dbname,"tfndb" ) ) useHalfKeys = true;
|
||||
if ( !strcmp(m_dbname,"linkdb" ) ) useHalfKeys = true;
|
||||
|
||||
bool isTitledb = false;
|
||||
|
@ -525,11 +525,11 @@ bool Rebalance::gotList ( ) {
|
||||
KEYINC ( m_nextKey , ks );
|
||||
}
|
||||
|
||||
if ( ! m_msg4a.addMetaList( &m_posMetaList, m_collnum, this, doneAddingMetaWrapper, MAX_NICENESS, rdb->getRdbId(), -1 ) ) { // shard override, not!
|
||||
if (!m_msg4a.addMetaList(&m_posMetaList, m_collnum, this, doneAddingMetaWrapper, rdb->getRdbId(), -1)) { // shard override, not!
|
||||
++m_blocked;
|
||||
}
|
||||
|
||||
if ( ! m_msg4b.addMetaList( &m_negMetaList, m_collnum, this, doneAddingMetaWrapper, MAX_NICENESS, rdb->getRdbId(), myShard ) ) { // shard override, not!
|
||||
if (!m_msg4b.addMetaList(&m_negMetaList, m_collnum, this, doneAddingMetaWrapper, rdb->getRdbId(), myShard)) { // shard override, not!
|
||||
++m_blocked;
|
||||
}
|
||||
|
||||
|
@ -1157,11 +1157,11 @@ bool Repair::gotScanRecList ( ) {
|
||||
m_nextTitledbKey = next;
|
||||
*/
|
||||
// get the docid
|
||||
//int64_t dd = g_titledb.getDocIdFromKey(&m_nextTitledbKey);
|
||||
//int64_t dd = Titledb::getDocIdFromKey(&m_nextTitledbKey);
|
||||
// inc it
|
||||
//dd++;
|
||||
// re-make key
|
||||
//m_nextTitledbKey = g_titledb.makeFirstTitleRecKey ( dd );
|
||||
//m_nextTitledbKey = Titledb::makeFirstTitleRecKey ( dd );
|
||||
// advance one if positive, must always start on a neg
|
||||
if ( (m_nextTitledbKey.n0 & 0x01) == 0x01 )
|
||||
m_nextTitledbKey += (uint32_t)1;
|
||||
@ -1209,7 +1209,7 @@ bool Repair::gotScanRecList ( ) {
|
||||
|
||||
// nextRec2:
|
||||
key96_t tkey = m_titleRecList.getCurrentKey();
|
||||
int64_t docId = g_titledb.getDocId ( &tkey );
|
||||
int64_t docId = Titledb::getDocId ( &tkey );
|
||||
// save it
|
||||
//m_currentTitleRecKey = tkey;
|
||||
|
||||
@ -1372,7 +1372,7 @@ bool Repair::injectTitleRec ( ) {
|
||||
// skip negative recs, first one should not be negative however
|
||||
if ( ( k->n0 & 0x01 ) == 0x00 ) continue;
|
||||
// get docid of that guy
|
||||
int64_t dd = g_titledb.getDocId(k);
|
||||
int64_t dd = Titledb::getDocId(k);
|
||||
// compare that
|
||||
if ( m_docId != dd ) continue;
|
||||
// we got it!
|
||||
|
22
Sections.h
22
Sections.h
@ -13,10 +13,6 @@
|
||||
// hhhhhhhh hhhhhhhh tttttttt dddddddd t = tag type
|
||||
// dddddddd dddddddd dddddddd ddddddHD d = docid
|
||||
|
||||
// DATA:
|
||||
// SSSSSSSS SSSSSSSS SSSSSSSS SSSSSSSS S = SectionVote::m_score
|
||||
// NNNNNNNN NNNNNNNN NNNNNNNN NNNNNNNN N = SectionVote::m_numSampled
|
||||
|
||||
// h: hash value. typically the lower 32 bits of the
|
||||
// Section::m_contentHash64 vars. we
|
||||
// do not need the full 64 bits because we have the 48 bit site hash included
|
||||
@ -277,22 +273,4 @@ public:
|
||||
class Section *m_firstSent;
|
||||
};
|
||||
|
||||
// . the key in sectiondb is basically the Section::m_tagHash
|
||||
// (with a docId) and the data portion of the Rdb record is this SectionVote
|
||||
// . the Sections::m_nsvt and m_osvt hash tables contain SectionVotes
|
||||
// as their data value and use an tagHash key as well
|
||||
class SectionVote {
|
||||
public:
|
||||
// . seems like addVote*() always uses a score of 1.0
|
||||
// . seems to be a weight used when setting Section::m_votesFor[Not]Dup
|
||||
// . not sure if we really use this now
|
||||
float m_score;
|
||||
// . how many times does this tagHash occur in this doc?
|
||||
// . this eliminates the need for the SV_UNIQUE section type
|
||||
// . this is not used for tags of type contenthash or taghash
|
||||
// . seems like pastdate and futuredate and eurdatefmt
|
||||
// are the only vote types that actually really use this...
|
||||
float m_numSampled;
|
||||
} __attribute__((packed, aligned(4)));
|
||||
|
||||
#endif // GB_SECTIONS_H
|
||||
|
@ -1674,7 +1674,7 @@ bool updateSiteListBuf ( collnum_t collnum ,
|
||||
SpiderColl *sc = g_spiderCache.getSpiderColl ( cr->m_collnum );
|
||||
|
||||
// sanity. if in use we should not even be here
|
||||
if ( sc->m_msg4x.m_inUse ) {
|
||||
if ( sc->m_msg4x.isInUse() ) {
|
||||
log( LOG_WARN, "basic: trying to update site list while previous update still outstanding.");
|
||||
g_errno = EBADENGINEER;
|
||||
return true;
|
||||
@ -2005,7 +2005,7 @@ bool updateSiteListBuf ( collnum_t collnum ,
|
||||
|
||||
// use spidercoll to contain this msg4 but if in use it
|
||||
// won't be able to be deleted until it comes back..
|
||||
return sc->m_msg4x.addMetaList ( spiderReqBuf, sc->m_collnum, sc, doneAddingSeedsWrapper, MAX_NICENESS, RDB_SPIDERDB );
|
||||
return sc->m_msg4x.addMetaList(spiderReqBuf, sc->m_collnum, sc, doneAddingSeedsWrapper, RDB_SPIDERDB);
|
||||
}
|
||||
|
||||
// . Spider.cpp calls this to see if a url it wants to spider is
|
||||
@ -4139,7 +4139,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
|
||||
|
||||
static int32_t getFakeIpForUrl2(Url *url2) {
|
||||
// make the probable docid
|
||||
int64_t probDocId = g_titledb.getProbableDocId ( url2 );
|
||||
int64_t probDocId = Titledb::getProbableDocId ( url2 );
|
||||
// make one up, like we do in PageReindex.cpp
|
||||
int32_t firstIp = (probDocId & 0xffffffff);
|
||||
return firstIp;
|
||||
@ -4154,7 +4154,7 @@ bool SpiderRequest::setFromAddUrl(const char *url) {
|
||||
// reset it
|
||||
reset();
|
||||
// make the probable docid
|
||||
int64_t probDocId = g_titledb.getProbableDocId ( url );
|
||||
int64_t probDocId = Titledb::getProbableDocId ( url );
|
||||
|
||||
// make one up, like we do in PageReindex.cpp
|
||||
int32_t firstIp = (probDocId & 0xffffffff);
|
||||
|
@ -3174,9 +3174,8 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
// mdw: for testing take this out!
|
||||
if ( m_totalBytesScanned < 25000 ) maxWinners = 1;
|
||||
|
||||
// sanity. make sure read is somewhat hefty for our
|
||||
// maxWinners=1 thing
|
||||
if ( (int32_t)SR_READ_SIZE < 500000 ) { g_process.shutdownAbort(true); }
|
||||
// sanity. make sure read is somewhat hefty for our maxWinners=1 thing
|
||||
static_assert(SR_READ_SIZE >= 500000, "ensure read size is big enough");
|
||||
|
||||
// only compare to min winner in tree if tree is full
|
||||
if ( m_winnerTree.getNumUsedNodes() >= maxWinners ) {
|
||||
|
@ -1327,10 +1327,6 @@ bool SpiderLoop::spiderUrl9 ( SpiderRequest *sreq ,
|
||||
// shortcut
|
||||
int64_t lockKeyUh48 = makeLockTableKey ( sreq );
|
||||
|
||||
//uint64_t lockKey ;
|
||||
//lockKey = g_titledb.getFirstProbableDocId(sreq->m_probDocId);
|
||||
//lockKey = g_titledb.getFirstProbableDocId(sreq->m_probDocId);
|
||||
|
||||
// . now that we have to use msg12 to see if the thing is locked
|
||||
// to avoid spidering it.. (see comment in above function)
|
||||
// we often try to spider something we are already spidering. that
|
||||
|
276
Tagdb.cpp
276
Tagdb.cpp
@ -16,7 +16,6 @@
|
||||
#include "GbMutex.h"
|
||||
#include "ScopedLock.h"
|
||||
|
||||
static void gotMsg0ReplyWrapper ( void *state );
|
||||
|
||||
static HashTableX s_ht;
|
||||
static bool s_initialized = false;
|
||||
@ -1204,9 +1203,27 @@ static bool s_cacheInitialized = false;
|
||||
static RdbCache s_cache;
|
||||
static GbMutex s_cacheInitializedMutex;
|
||||
|
||||
Msg8a::Msg8a() {
|
||||
m_replies = 0;
|
||||
m_requests = 0;
|
||||
|
||||
Msg8a::Msg8a()
|
||||
: m_url(NULL),
|
||||
m_collnum(-1),
|
||||
m_callback(NULL),
|
||||
m_state(NULL),
|
||||
//m_msg0s
|
||||
//m_siteStartKey
|
||||
//m_siteEndKey
|
||||
m_niceness(0),
|
||||
m_dom(NULL),
|
||||
m_hostEnd(NULL),
|
||||
m_p(NULL),
|
||||
m_requests(0), m_replies(0),
|
||||
m_doneLaunching(false),
|
||||
m_mtx(),
|
||||
m_errno(0),
|
||||
m_tagRec(NULL),
|
||||
m_state2(NULL),
|
||||
m_state3(NULL)
|
||||
{
|
||||
}
|
||||
|
||||
Msg8a::~Msg8a ( ) {
|
||||
@ -1378,43 +1395,9 @@ struct Msg8aState {
|
||||
bool Msg8a::launchGetRequests ( ) {
|
||||
// clear it
|
||||
g_errno = 0;
|
||||
bool tryDomain = false;
|
||||
|
||||
loop:
|
||||
// return true if nothing to launch
|
||||
if ( m_doneLaunching )
|
||||
return (m_requests == m_replies);
|
||||
|
||||
// don't bother if already got an error
|
||||
if ( m_errno )
|
||||
return (m_requests == m_replies);
|
||||
|
||||
// limit max to 5ish
|
||||
if (m_requests >= MAX_TAGDB_REQUESTS)
|
||||
return (m_requests == m_replies);
|
||||
|
||||
// take a breath
|
||||
QUICKPOLL(m_niceness);
|
||||
|
||||
key128_t startKey ;
|
||||
key128_t endKey ;
|
||||
|
||||
if ( tryDomain ) {
|
||||
startKey = g_tagdb.makeDomainStartKey ( m_url );
|
||||
endKey = g_tagdb.makeDomainEndKey ( m_url );
|
||||
log( LOG_DEBUG, "tagdb: looking up domain tags for %.*s", m_url->getDomainLen(), m_url->getDomain() );
|
||||
}
|
||||
else {
|
||||
// usually the site is the hostname but sometimes it is like
|
||||
// "www.last.fm/user/breendaxx/"
|
||||
startKey = m_siteStartKey;
|
||||
endKey = m_siteEndKey;
|
||||
|
||||
log( LOG_DEBUG, "tagdb: looking up site tags for %s", m_url->getUrl() );
|
||||
}
|
||||
|
||||
// initialize cache
|
||||
ScopedLock sl(s_cacheInitializedMutex);
|
||||
ScopedLock sl_cache(s_cacheInitializedMutex);
|
||||
if ( !s_cacheInitialized ) {
|
||||
int64_t maxCacheSize = g_conf.m_tagRecCacheSize;
|
||||
int64_t maxCacheNodes = ( maxCacheSize / 200 );
|
||||
@ -1422,104 +1405,120 @@ bool Msg8a::launchGetRequests ( ) {
|
||||
s_cacheInitialized = true;
|
||||
s_cache.init( maxCacheSize, -1, true, maxCacheNodes, false, "tagreccache", false, 16, 16, -1 );
|
||||
}
|
||||
sl.unlock();
|
||||
sl_cache.unlock();
|
||||
|
||||
// get the next mcast
|
||||
Msg0 *m = &m_msg0s[m_requests];
|
||||
//get tag for url and then domain
|
||||
for(int getLoop = 0; getLoop<1; getLoop++) {
|
||||
|
||||
// and the list
|
||||
RdbList *listPtr = &m_tagRec->m_lists[m_requests];
|
||||
key128_t startKey;
|
||||
key128_t endKey;
|
||||
|
||||
// try to get from cache
|
||||
RdbCacheLock rcl(s_cache);
|
||||
if ( s_cache.getList( m_collnum, (char*)&startKey, (char*)&startKey, listPtr, true,
|
||||
g_conf.m_tagRecCacheMaxAge, true) ) {
|
||||
// got from cache
|
||||
log( LOG_DEBUG, "tagdb: got key=%s from cache", KEYSTR(&startKey, sizeof(startKey)) );
|
||||
if(getLoop==1) {
|
||||
startKey = g_tagdb.makeDomainStartKey ( m_url );
|
||||
endKey = g_tagdb.makeDomainEndKey ( m_url );
|
||||
log( LOG_DEBUG, "tagdb: looking up domain tags for %.*s", m_url->getDomainLen(), m_url->getDomain() );
|
||||
} else {
|
||||
// usually the site is the hostname but sometimes it is like
|
||||
// "www.last.fm/user/breendaxx/"
|
||||
startKey = m_siteStartKey;
|
||||
endKey = m_siteEndKey;
|
||||
|
||||
rcl.unlock();
|
||||
m_requests++;
|
||||
m_replies++;
|
||||
} else {
|
||||
rcl.unlock();
|
||||
// bias based on the top 64 bits which is the hash of the "site" now
|
||||
int32_t shardNum = getShardNum ( RDB_TAGDB , &startKey );
|
||||
Host *firstHost ;
|
||||
|
||||
// if niceness 0 can't pick noquery host.
|
||||
// if niceness 1 can't pick nospider host.
|
||||
firstHost = g_hostdb.getLeastLoadedInShard ( shardNum , m_niceness );
|
||||
int32_t firstHostId = firstHost->m_hostId;
|
||||
|
||||
Msg8aState *state = NULL;
|
||||
try {
|
||||
state = new Msg8aState(this, startKey, endKey, m_requests);
|
||||
} catch (...) {
|
||||
g_errno = ENOMEM;
|
||||
log(LOG_WARN, "tagdb: unable to allocate memory for Msg8aState");
|
||||
return false;
|
||||
}
|
||||
mnew(state, sizeof(*state), "msg8astate");
|
||||
|
||||
// . launch this request, even if to ourselves
|
||||
// . TODO: just use msg0!!
|
||||
bool status = m->getList ( firstHostId , // hostId
|
||||
0 , // ip
|
||||
0 , // port
|
||||
0 , // maxCacheAge
|
||||
false , // addToCache
|
||||
RDB_TAGDB ,
|
||||
m_collnum ,
|
||||
listPtr ,
|
||||
(char *) &startKey ,
|
||||
(char *) &endKey ,
|
||||
10000000 , // minRecSizes
|
||||
state , // state
|
||||
gotMsg0ReplyWrapper ,
|
||||
m_niceness ,
|
||||
true , // error correction?
|
||||
true , // include tree?
|
||||
true , // doMerge?
|
||||
firstHostId , // firstHostId
|
||||
0 , // startFileNum
|
||||
-1 , // numFiles
|
||||
msg0_getlist_infinite_timeout );// timeout
|
||||
|
||||
// error?
|
||||
if ( status && g_errno ) {
|
||||
// g_errno should be set, we had an error
|
||||
m_errno = g_errno;
|
||||
return (m_requests == m_replies);
|
||||
log( LOG_DEBUG, "tagdb: looking up site tags for %s", m_url->getUrl() );
|
||||
}
|
||||
|
||||
// successfully launched
|
||||
m_requests++;
|
||||
// get the next mcast
|
||||
Msg0 *m = &m_msg0s[m_requests];
|
||||
|
||||
// if we got a reply instantly
|
||||
if ( status ) {
|
||||
// and the list
|
||||
RdbList *listPtr = &m_tagRec->m_lists[m_requests];
|
||||
|
||||
// try to get from cache
|
||||
RdbCacheLock rcl(s_cache);
|
||||
if ( s_cache.getList( m_collnum, (char*)&startKey, (char*)&startKey, listPtr, true,
|
||||
g_conf.m_tagRecCacheMaxAge, true) ) {
|
||||
// got from cache
|
||||
log( LOG_DEBUG, "tagdb: got key=%s from cache", KEYSTR(&startKey, sizeof(startKey)) );
|
||||
|
||||
rcl.unlock();
|
||||
ScopedLock sl(m_mtx);
|
||||
m_requests++;
|
||||
m_replies++;
|
||||
} else {
|
||||
rcl.unlock();
|
||||
// bias based on the top 64 bits which is the hash of the "site" now
|
||||
int32_t shardNum = getShardNum ( RDB_TAGDB , &startKey );
|
||||
Host *firstHost ;
|
||||
|
||||
// if niceness 0 can't pick noquery host.
|
||||
// if niceness 1 can't pick nospider host.
|
||||
firstHost = g_hostdb.getLeastLoadedInShard ( shardNum , m_niceness );
|
||||
int32_t firstHostId = firstHost->m_hostId;
|
||||
|
||||
Msg8aState *state = NULL;
|
||||
try {
|
||||
state = new Msg8aState(this, startKey, endKey, m_requests);
|
||||
} catch (...) {
|
||||
g_errno = m_errno = ENOMEM;
|
||||
log(LOG_WARN, "tagdb: unable to allocate memory for Msg8aState");
|
||||
break;
|
||||
}
|
||||
mnew(state, sizeof(*state), "msg8astate");
|
||||
|
||||
// . launch this request, even if to ourselves
|
||||
// . TODO: just use msg0!!
|
||||
bool status = m->getList ( firstHostId , // hostId
|
||||
0 , // ip
|
||||
0 , // port
|
||||
0 , // maxCacheAge
|
||||
false , // addToCache
|
||||
RDB_TAGDB ,
|
||||
m_collnum ,
|
||||
listPtr ,
|
||||
(char *) &startKey ,
|
||||
(char *) &endKey ,
|
||||
10000000 , // minRecSizes
|
||||
state , // state
|
||||
gotMsg0ReplyWrapper ,
|
||||
m_niceness ,
|
||||
true , // error correction?
|
||||
true , // include tree?
|
||||
true , // doMerge?
|
||||
firstHostId , // firstHostId
|
||||
0 , // startFileNum
|
||||
-1 , // numFiles
|
||||
msg0_getlist_infinite_timeout );// timeout
|
||||
|
||||
// error?
|
||||
if ( status && g_errno ) {
|
||||
// g_errno should be set, we had an error
|
||||
m_errno = g_errno;
|
||||
break;
|
||||
}
|
||||
|
||||
ScopedLock sl(m_mtx);
|
||||
|
||||
// successfully launched
|
||||
m_requests++;
|
||||
|
||||
// if we got a reply instantly
|
||||
if ( status ) {
|
||||
m_replies++;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if ( ! tryDomain ) {
|
||||
tryDomain = true;
|
||||
goto loop;
|
||||
}
|
||||
ScopedLock sl(m_mtx);
|
||||
|
||||
//
|
||||
// no more looping!
|
||||
//
|
||||
// i don't think we need to loop any more because we got all the
|
||||
// tags for this hostname. then the lower bits of the Tag key
|
||||
// corresponds to the actual SITE hash. so we gotta filter those
|
||||
// out i guess after we read the whole list.
|
||||
//
|
||||
m_doneLaunching = true;
|
||||
|
||||
return (m_requests == m_replies);
|
||||
|
||||
if(m_requests == m_replies)
|
||||
return true; // all requests done
|
||||
else
|
||||
return false; // some requests weren't immediate
|
||||
}
|
||||
|
||||
static void gotMsg0ReplyWrapper ( void *state ) {
|
||||
void Msg8a::gotMsg0ReplyWrapper ( void *state ) {
|
||||
Msg8aState *msg8aState = (Msg8aState*)state;
|
||||
|
||||
Msg8a *msg8a = msg8aState->m_msg8a;
|
||||
@ -1529,9 +1528,6 @@ static void gotMsg0ReplyWrapper ( void *state ) {
|
||||
mdelete( msg8aState, sizeof(*msg8aState), "msg8astate" );
|
||||
delete msg8aState;
|
||||
|
||||
// we got one
|
||||
msg8a->m_replies++;
|
||||
|
||||
// error?
|
||||
if ( g_errno ) {
|
||||
msg8a->m_errno = g_errno;
|
||||
@ -1548,21 +1544,23 @@ static void gotMsg0ReplyWrapper ( void *state ) {
|
||||
s_cache.addList( msg8a->m_collnum, (char*)&startKey, list);
|
||||
}
|
||||
|
||||
// launchGetRequests() returns false if still waiting for replies...
|
||||
if ( ! msg8a->launchGetRequests() ) {
|
||||
return;
|
||||
ScopedLock sl(msg8a->m_mtx);
|
||||
|
||||
msg8a->m_replies++;
|
||||
|
||||
if(msg8a->m_doneLaunching && msg8a->m_requests==msg8a->m_replies) {
|
||||
sl.unlock();
|
||||
// got all the replies
|
||||
msg8a->gotAllReplies();
|
||||
|
||||
// set g_errno for the callback
|
||||
if ( msg8a->m_errno ) {
|
||||
g_errno = msg8a->m_errno;
|
||||
}
|
||||
|
||||
// call callback
|
||||
msg8a->m_callback ( msg8a->m_state );
|
||||
}
|
||||
|
||||
// get all the replies
|
||||
msg8a->gotAllReplies();
|
||||
|
||||
// set g_errno for the callback
|
||||
if ( msg8a->m_errno ) {
|
||||
g_errno = msg8a->m_errno;
|
||||
}
|
||||
|
||||
// otherwise, call callback
|
||||
msg8a->m_callback ( msg8a->m_state );
|
||||
}
|
||||
|
||||
// get the TagRec from the reply
|
||||
|
8
Tagdb.h
8
Tagdb.h
@ -10,6 +10,7 @@
|
||||
#include "Loop.h"
|
||||
#include "SafeBuf.h"
|
||||
#include "Msg0.h"
|
||||
#include "GbMutex.h"
|
||||
|
||||
// . Tag::m_type is this if its a dup in the TagRec
|
||||
// . so if www.xyz.com has one tag and xyz.com has another, then
|
||||
@ -214,9 +215,12 @@ class Msg8a {
|
||||
bool getTagRec( Url *url, collnum_t collnum, int32_t niceness, void *state, void (*callback)( void * ),
|
||||
TagRec *tagRec );
|
||||
|
||||
private:
|
||||
bool launchGetRequests();
|
||||
void gotAllReplies ( ) ;
|
||||
|
||||
static void gotMsg0ReplyWrapper(void *);
|
||||
|
||||
// some specified input
|
||||
Url *m_url;
|
||||
|
||||
@ -238,13 +242,15 @@ class Msg8a {
|
||||
|
||||
int32_t m_requests;
|
||||
int32_t m_replies;
|
||||
char m_doneLaunching;
|
||||
bool m_doneLaunching;
|
||||
GbMutex m_mtx;
|
||||
|
||||
int32_t m_errno;
|
||||
|
||||
// we set this for the caller
|
||||
TagRec *m_tagRec;
|
||||
|
||||
public:
|
||||
// hack for MsgE
|
||||
void *m_state2;
|
||||
void *m_state3;
|
||||
|
99
Titledb.h
99
Titledb.h
@ -13,8 +13,6 @@
|
||||
#include "TitleRecVersion.h"
|
||||
#include "Rdb.h"
|
||||
#include "Url.h"
|
||||
#include "Conf.h"
|
||||
#include "Xml.h"
|
||||
|
||||
// new key format:
|
||||
// . <docId> - 38 bits
|
||||
@ -22,15 +20,11 @@
|
||||
// . <delBit> - 1 bit
|
||||
|
||||
class Titledb {
|
||||
|
||||
public:
|
||||
|
||||
public:
|
||||
// reset rdb
|
||||
void reset();
|
||||
|
||||
bool verify ( char *coll );
|
||||
|
||||
//bool addColl ( char *coll, bool doVerify = true );
|
||||
bool verify(char *coll);
|
||||
|
||||
// init m_rdb
|
||||
bool init ();
|
||||
@ -38,12 +32,20 @@ class Titledb {
|
||||
// init secondary/rebuild titledb
|
||||
bool init2 ( int32_t treeMem ) ;
|
||||
|
||||
Rdb* getRdb() { return &m_rdb; }
|
||||
|
||||
// . this is an estimate of the number of docs in the WHOLE db network
|
||||
// . we assume each group/cluster has about the same # of docs as us
|
||||
int64_t getGlobalNumDocs() {
|
||||
return m_rdb.getNumTotalRecs() * (int64_t)g_hostdb.m_numShards;
|
||||
}
|
||||
|
||||
// . get the probable docId from a url/coll
|
||||
// . it's "probable" because it may not be the actual docId because
|
||||
// in the case of a collision we pick a nearby docId that is
|
||||
// different but guaranteed to be in the same group/cluster, so you
|
||||
// can be assured the top 32 bits of the docId will be unchanged
|
||||
uint64_t getProbableDocId ( Url *url , bool mask = true ) {
|
||||
static uint64_t getProbableDocId(const Url *url, bool mask = true) {
|
||||
uint64_t probableDocId = hash64b(url->getUrl(),0);
|
||||
// Linkdb::getUrlHash() does not mask it
|
||||
if ( mask ) probableDocId = probableDocId & DOCID_MASK;
|
||||
@ -59,14 +61,14 @@ class Titledb {
|
||||
}
|
||||
|
||||
// a different way to do it
|
||||
uint64_t getProbableDocId ( const char *url ) {
|
||||
static uint64_t getProbableDocId(const char *url) {
|
||||
Url u;
|
||||
u.set( url );
|
||||
return getProbableDocId ( &u );
|
||||
return getProbableDocId(&u);
|
||||
}
|
||||
|
||||
// a different way to do it
|
||||
uint64_t getProbableDocId(const char *url,const char *dom,int32_t domLen) {
|
||||
static uint64_t getProbableDocId(const char *url, const char *dom, int32_t domLen) {
|
||||
uint64_t probableDocId = hash64b(url,0) &
|
||||
DOCID_MASK;
|
||||
// clear bits 6-13 because we want to put the domain hash there
|
||||
@ -80,73 +82,56 @@ class Titledb {
|
||||
}
|
||||
|
||||
// turn off the last 6 bits
|
||||
uint64_t getFirstProbableDocId ( int64_t d ) {
|
||||
return d & 0xffffffffffffffc0LL; }
|
||||
static uint64_t getFirstProbableDocId(int64_t d) {
|
||||
return d & 0xffffffffffffffc0ULL;
|
||||
}
|
||||
|
||||
// turn on the last 6 bits for the end docId
|
||||
uint64_t getLastProbableDocId ( int64_t d ) {
|
||||
return d | 0x000000000000003fLL; }
|
||||
static uint64_t getLastProbableDocId(int64_t d) {
|
||||
return d | 0x000000000000003fULL;
|
||||
}
|
||||
|
||||
// . the top NUMDOCIDBITs of "key" are the docId
|
||||
// . we use the top X bits of the keys to partition the records
|
||||
// . using the top bits to partition allows us to keep keys that
|
||||
// are near each other (euclidean metric) in the same partition
|
||||
int64_t getDocIdFromKey ( key96_t *key ) {
|
||||
uint64_t docId;
|
||||
docId = ((uint64_t)key->n1)<<(NUMDOCIDBITS - 32);
|
||||
docId|= key->n0 >>(64-(NUMDOCIDBITS-32));
|
||||
static int64_t getDocIdFromKey(const key96_t *key) {
|
||||
uint64_t docId = ((uint64_t)key->n1) << (NUMDOCIDBITS - 32);
|
||||
docId |= key->n0 >> (64 - (NUMDOCIDBITS - 32));
|
||||
return docId;
|
||||
}
|
||||
int64_t getDocId ( key96_t *key ) { return getDocIdFromKey(key); }
|
||||
int64_t getDocIdFromKey ( key96_t key ) {
|
||||
return getDocIdFromKey(&key);}
|
||||
|
||||
uint8_t getDomHash8FromDocId (int64_t d) {
|
||||
return (d & ~0xffffffffffffc03fULL) >> 6; }
|
||||
static int64_t getDocId(const key96_t *key) { return getDocIdFromKey(key); }
|
||||
|
||||
int64_t getUrlHash48 ( key96_t *k ) {
|
||||
return ((k->n0 >> 10) & 0x0000ffffffffffffLL); }
|
||||
static uint8_t getDomHash8FromDocId (int64_t d) {
|
||||
return (d & ~0xffffffffffffc03fULL) >> 6;
|
||||
}
|
||||
|
||||
// . dptr is a char ptr to the docid
|
||||
// . used by IndexTable2.cpp
|
||||
// . "dptr" is pointing into a 6-byte indexdb key
|
||||
// . see IndexTable2.cpp, grep for gbmemcpy() to see
|
||||
// how the docid is parsed out of this key (or see
|
||||
// Indexdb.h)
|
||||
// . return ((*((uint16_t *)dptr)) >> 8) & 0xff; }
|
||||
uint8_t getDomHash8 ( uint8_t *dptr ) { return dptr[1]; }
|
||||
static int64_t getUrlHash48 ( key96_t *k ) {
|
||||
return ((k->n0 >> 10) & 0x0000ffffffffffffLL);
|
||||
}
|
||||
|
||||
// does this key/docId/url have it's titleRec stored locally?
|
||||
bool isLocal ( int64_t docId );
|
||||
bool isLocal ( Url *url ) {
|
||||
return isLocal ( getProbableDocId(url) ); }
|
||||
bool isLocal ( key96_t key ) {
|
||||
return isLocal (getDocIdFromKey(&key));}
|
||||
static bool isLocal(int64_t docId);
|
||||
|
||||
|
||||
Rdb *getRdb() { return &m_rdb; }
|
||||
static bool isLocal(Url *url) {
|
||||
return isLocal(getProbableDocId(url));
|
||||
}
|
||||
|
||||
// . make the key of a TitleRec from a docId
|
||||
// . remember to set the low bit so it's not a delete
|
||||
// . hi bits are set in the key
|
||||
key96_t makeKey ( int64_t docId, int64_t uh48, bool isDel );
|
||||
static key96_t makeKey(int64_t docId, int64_t uh48, bool isDel);
|
||||
|
||||
key96_t makeFirstKey ( int64_t docId ) {
|
||||
return makeKey ( docId , 0, true ); }
|
||||
static key96_t makeFirstKey(int64_t docId) {
|
||||
return makeKey(docId, 0, true);
|
||||
}
|
||||
|
||||
key96_t makeLastKey ( int64_t docId ) {
|
||||
return makeKey ( docId , 0xffffffffffffLL, false ); }
|
||||
|
||||
// . this is an estimate of the number of docs in the WHOLE db network
|
||||
// . we assume each group/cluster has about the same # of docs as us
|
||||
int64_t getGlobalNumDocs ( ) {
|
||||
return m_rdb.getNumTotalRecs()*
|
||||
(int64_t)g_hostdb.m_numShards;}
|
||||
|
||||
int32_t getLocalNumDocs () { return m_rdb.getNumTotalRecs(); }
|
||||
int32_t getNumDocsInMem () { return m_rdb.getNumUsedNodes(); }
|
||||
int32_t getMemUsed () { return m_rdb.getTreeMemOccupied(); }
|
||||
static key96_t makeLastKey(int64_t docId) {
|
||||
return makeKey(docId, 0xffffffffffffLL, false);
|
||||
}
|
||||
|
||||
private:
|
||||
// holds binary format title entries
|
||||
Rdb m_rdb;
|
||||
};
|
||||
|
@ -203,8 +203,7 @@ int32_t TopTree::getHighNode ( ) {
|
||||
bool TopTree::addNode ( TopNode *t , int32_t tnn ) {
|
||||
|
||||
// respect the dom hashes
|
||||
//uint8_t domHash = g_titledb.getDomHash8((uint8_t*)t->m_docIdPtr);
|
||||
uint8_t domHash = g_titledb.getDomHash8FromDocId(t->m_docId);
|
||||
uint8_t domHash = Titledb::getDomHash8FromDocId(t->m_docId);
|
||||
|
||||
// if vcount is satisfied, only add if better score than tail
|
||||
if ( m_vcount >= m_docsWanted ) {
|
||||
@ -449,9 +448,7 @@ bool TopTree::addNode ( TopNode *t , int32_t tnn ) {
|
||||
//if ( getNext(tn) == -1 ) gbshutdownLogicError();
|
||||
// get the min node
|
||||
TopNode *t = &m_nodes[tn];
|
||||
// get its docid ptr
|
||||
//uint8_t domHash2 = g_titledb.getDomHash8((ui)t->m_docIdPtr);
|
||||
uint8_t domHash2 = g_titledb.getDomHash8FromDocId(t->m_docId);
|
||||
uint8_t domHash2 = Titledb::getDomHash8FromDocId(t->m_docId);
|
||||
// . also must delete from m_t2
|
||||
// . make the key
|
||||
key96_t k;
|
||||
|
1128
XmlDoc.cpp
1128
XmlDoc.cpp
File diff suppressed because it is too large
Load Diff
109
XmlDoc.h
109
XmlDoc.h
@ -361,7 +361,7 @@ public:
|
||||
char *getIsPermalink ( ) ;
|
||||
char *getIsUrlPermalinkFormat ( ) ;
|
||||
char *getIsRSS ( ) ;
|
||||
char *getIsSiteMap ( ) ;
|
||||
bool *getIsSiteMap ( ) ;
|
||||
class Xml *getXml ( ) ;
|
||||
uint8_t *getLangVector ( ) ;
|
||||
uint8_t *getLangId ( ) ;
|
||||
@ -693,62 +693,59 @@ public:
|
||||
// validity flags. on reset() all these are set to false.
|
||||
char m_VALIDSTART;
|
||||
// DO NOT add validity flags above this line!
|
||||
char m_metaListValid;
|
||||
char m_addedSpiderRequestSizeValid;
|
||||
char m_addedSpiderReplySizeValid;
|
||||
char m_addedStatusDocSizeValid;
|
||||
char m_downloadStartTimeValid;
|
||||
char m_siteValid;
|
||||
char m_startTimeValid;
|
||||
char m_currentUrlValid;
|
||||
char m_useTimeAxisValid;
|
||||
char m_timeAxisUrlValid;
|
||||
char m_firstUrlValid;
|
||||
char m_firstUrlHash48Valid;
|
||||
char m_firstUrlHash64Valid;
|
||||
char m_lastUrlValid;
|
||||
char m_docIdValid;
|
||||
char m_availDocIdValid;
|
||||
char m_tagRecValid;
|
||||
char m_robotsTxtLenValid;
|
||||
char m_tagRecDataValid;
|
||||
char m_newTagBufValid;
|
||||
char m_rootTitleBufValid;
|
||||
char m_filteredRootTitleBufValid;
|
||||
char m_titleBufValid;
|
||||
char m_fragBufValid;
|
||||
char m_isRobotsTxtUrlValid;
|
||||
char m_wordSpamBufValid;
|
||||
char m_finalSummaryBufValid;
|
||||
bool m_metaListValid;
|
||||
bool m_addedSpiderRequestSizeValid;
|
||||
bool m_addedSpiderReplySizeValid;
|
||||
bool m_addedStatusDocSizeValid;
|
||||
bool m_downloadStartTimeValid;
|
||||
bool m_siteValid;
|
||||
bool m_startTimeValid;
|
||||
bool m_currentUrlValid;
|
||||
bool m_useTimeAxisValid;
|
||||
bool m_timeAxisUrlValid;
|
||||
bool m_firstUrlValid;
|
||||
bool m_firstUrlHash48Valid;
|
||||
bool m_firstUrlHash64Valid;
|
||||
bool m_lastUrlValid;
|
||||
bool m_docIdValid;
|
||||
bool m_availDocIdValid;
|
||||
bool m_tagRecValid;
|
||||
bool m_robotsTxtLenValid;
|
||||
bool m_tagRecDataValid;
|
||||
bool m_newTagBufValid;
|
||||
bool m_rootTitleBufValid;
|
||||
bool m_filteredRootTitleBufValid;
|
||||
bool m_titleBufValid;
|
||||
bool m_fragBufValid;
|
||||
bool m_isRobotsTxtUrlValid;
|
||||
bool m_wordSpamBufValid;
|
||||
bool m_finalSummaryBufValid;
|
||||
|
||||
char m_hopCountValid;
|
||||
char m_isInjectingValid;
|
||||
char m_isImportingValid;
|
||||
char m_metaListCheckSum8Valid;
|
||||
char m_contentValid;
|
||||
char m_filteredContentValid;
|
||||
char m_charsetValid;
|
||||
char m_langVectorValid;
|
||||
char m_langIdValid;
|
||||
char m_datedbDateValid;
|
||||
char m_isRSSValid;
|
||||
char m_isSiteMapValid;
|
||||
char m_isContentTruncatedValid;
|
||||
char m_xmlValid;
|
||||
char m_linksValid;
|
||||
char m_wordsValid;
|
||||
char m_bitsValid;
|
||||
char m_bits2Valid;
|
||||
char m_posValid;
|
||||
char m_phrasesValid;
|
||||
char m_sectionsValid;
|
||||
bool m_hopCountValid;
|
||||
bool m_isInjectingValid;
|
||||
bool m_isImportingValid;
|
||||
bool m_metaListCheckSum8Valid;
|
||||
bool m_contentValid;
|
||||
bool m_filteredContentValid;
|
||||
bool m_charsetValid;
|
||||
bool m_langVectorValid;
|
||||
bool m_langIdValid;
|
||||
bool m_isRSSValid;
|
||||
bool m_isSiteMapValid;
|
||||
bool m_isContentTruncatedValid;
|
||||
bool m_xmlValid;
|
||||
bool m_linksValid;
|
||||
bool m_wordsValid;
|
||||
bool m_bitsValid;
|
||||
bool m_bits2Valid;
|
||||
bool m_posValid;
|
||||
bool m_phrasesValid;
|
||||
bool m_sectionsValid;
|
||||
|
||||
char m_imageDataValid;
|
||||
char m_imagesValid;
|
||||
char m_msge0Valid;
|
||||
char m_msge1Valid;
|
||||
char m_sreqValid;
|
||||
char m_srepValid;
|
||||
bool m_imageDataValid;
|
||||
bool m_imagesValid;
|
||||
bool m_sreqValid;
|
||||
bool m_srepValid;
|
||||
|
||||
bool m_ipValid;
|
||||
bool m_firstIpValid;
|
||||
@ -851,7 +848,7 @@ public:
|
||||
bool m_exactContentHash64Valid;
|
||||
bool m_jpValid;
|
||||
|
||||
char m_isSiteMap;
|
||||
bool m_isSiteMap;
|
||||
|
||||
// shadows
|
||||
char m_isRSS2;
|
||||
|
@ -142,7 +142,7 @@ static bool storeTerm ( const char *s ,
|
||||
// . hash terms that are sharded by TERMID not DOCID!!
|
||||
//
|
||||
// . returns false and sets g_errno on error
|
||||
// . these terms are stored in indexdb/datedb, but all terms with the same
|
||||
// . these terms are stored in indexdb, but all terms with the same
|
||||
// termId reside in one and only one group. whereas normally the records
|
||||
// are split based on docid and every group gets 1/nth of the termlist.
|
||||
// . we do this "no splitting" so that only one disk seek is required, and
|
||||
@ -289,155 +289,143 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
|
||||
// . returns -1 if blocked, returns NULL and sets g_errno on error
|
||||
// . "sr" is the tagdb Record
|
||||
// . "ws" store the terms for PageParser.cpp display
|
||||
char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
char *XmlDoc::hashAll(HashTableX *table) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "BEGIN");
|
||||
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: BEGIN", __FILE__,__func__, __LINE__);
|
||||
|
||||
setStatus ( "hashing document" );
|
||||
setStatus("hashing document");
|
||||
|
||||
if ( m_allHashed ) return (char *)1;
|
||||
if (m_allHashed) {
|
||||
return (char *)1;
|
||||
}
|
||||
|
||||
// sanity checks
|
||||
if ( table->m_ks != 18 ) { g_process.shutdownAbort(true); }
|
||||
if ( table->m_ds != 4 ) { g_process.shutdownAbort(true); }
|
||||
if (table->m_ks != 18 || table->m_ds != 4) {
|
||||
g_process.shutdownAbort(true);
|
||||
}
|
||||
|
||||
if ( m_wts && m_wts->m_ks != 12 ) { g_process.shutdownAbort(true); }
|
||||
// ptr to term = 4 + score = 4 + ptr to sec = 4
|
||||
if ( m_wts && m_wts->m_ds!=sizeof(TermDebugInfo)){g_process.shutdownAbort(true);}
|
||||
if (m_wts && (m_wts->m_ks != 12 || m_wts->m_ds != sizeof(TermDebugInfo))) {
|
||||
g_process.shutdownAbort(true);
|
||||
}
|
||||
|
||||
uint8_t *ct = getContentType();
|
||||
if ( ! ct )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getContentType failed", __FILE__,__func__, __LINE__);
|
||||
if (!ct) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, getContentType failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// BR 20160127: Never index JSON and XML content
|
||||
if ( *ct == CT_JSON || *ct == CT_XML )
|
||||
{
|
||||
if (*ct == CT_JSON || *ct == CT_XML) {
|
||||
// For XML (JSON should not get here as it should be filtered out during spidering)
|
||||
// store the URL as the only thing in posdb so we are able to find it, and
|
||||
// eventually ban it.
|
||||
if ( !hashUrl( table, true ) ) // urlOnly (skip IP and term generation)
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashUrl failed", __FILE__,__func__, __LINE__);
|
||||
if (!hashUrl(table, true)) { // urlOnly (skip IP and term generation)
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashUrl failed");
|
||||
return NULL;
|
||||
}
|
||||
m_allHashed = true;
|
||||
return (char *)1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
unsigned char *hc = (unsigned char *)getHopCount();
|
||||
if ( ! hc || hc == (void *)-1 )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getHopCount returned -1", __FILE__,__func__, __LINE__);
|
||||
if (!hc || hc == (void *)-1) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, getHopCount returned -1");
|
||||
return (char *)hc;
|
||||
}
|
||||
|
||||
// need this for hashing
|
||||
HashTableX *cnt = getCountTable();
|
||||
if ( ! cnt )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getCountTable failed", __FILE__,__func__, __LINE__);
|
||||
if (!cnt) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, getCountTable failed");
|
||||
return (char *)cnt;
|
||||
}
|
||||
if ( cnt == (void *)-1 ) { g_process.shutdownAbort(true); }
|
||||
if (cnt == (void *)-1) {
|
||||
g_process.shutdownAbort(true);
|
||||
}
|
||||
|
||||
// and this
|
||||
Links *links = getLinks();
|
||||
if ( ! links )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getLinks failed", __FILE__,__func__, __LINE__);
|
||||
if (!links) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, getLinks failed");
|
||||
return (char *)links;
|
||||
}
|
||||
if ( links == (Links *)-1 ) { g_process.shutdownAbort(true); }
|
||||
if (links == (Links *)-1) {
|
||||
g_process.shutdownAbort(true);
|
||||
}
|
||||
|
||||
char *wordSpamVec = getWordSpamVec();
|
||||
if (!wordSpamVec)
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getWordSpamVec failed", __FILE__,__func__, __LINE__);
|
||||
return (char *)wordSpamVec;
|
||||
if (!wordSpamVec) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, getWordSpamVec failed");
|
||||
return wordSpamVec;
|
||||
}
|
||||
if (wordSpamVec == (void *)-1) {
|
||||
g_process.shutdownAbort(true);
|
||||
}
|
||||
if (wordSpamVec==(void *)-1) {g_process.shutdownAbort(true);}
|
||||
|
||||
char *fragVec = getFragVec();//m_fragBuf.getBufStart();
|
||||
if ( ! fragVec )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getFragVec failed", __FILE__,__func__, __LINE__);
|
||||
return (char *)fragVec;
|
||||
char *fragVec = getFragVec();
|
||||
if (!fragVec) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, getFragVec failed");
|
||||
return fragVec;
|
||||
}
|
||||
if (fragVec == (void *)-1) {
|
||||
g_process.shutdownAbort(true);
|
||||
}
|
||||
if ( fragVec == (void *)-1 ) { g_process.shutdownAbort(true); }
|
||||
|
||||
// why do we need this?
|
||||
if ( m_wts ) {
|
||||
uint8_t *lv = getLangVector();
|
||||
if ( ! lv )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getLangVector failed", __FILE__,__func__, __LINE__);
|
||||
if (!lv) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, getLangVector failed");
|
||||
return (char *)lv;
|
||||
}
|
||||
if ( lv == (void *)-1 ) { g_process.shutdownAbort(true); }
|
||||
if (lv == (void *)-1) {
|
||||
g_process.shutdownAbort(true);
|
||||
}
|
||||
}
|
||||
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getCollRec failed", __FILE__,__func__, __LINE__);
|
||||
if ( ! cr ) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, getCollRec failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
// do not repeat this if the cachedb storage call blocks
|
||||
m_allHashed = true;
|
||||
|
||||
// reset distance cursor
|
||||
m_dist = 0;
|
||||
|
||||
|
||||
if ( ! hashContentType ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashContentType failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( ! hashUrl ( table, false ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashUrl failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( ! hashLanguage ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashLanguage failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( ! hashCountry ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashCountry failed", __FILE__,__func__, __LINE__);
|
||||
if (!hashContentType(table)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashContentType failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// BR 20160106 removed: if ( ! hashAds ( table ) ) return NULL;
|
||||
// BR 20160106 removed: if ( ! hashSubmitUrls ( table ) ) return NULL;
|
||||
if ( ! hashIsAdult ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashIsAdult failed", __FILE__,__func__, __LINE__);
|
||||
if (!hashUrl(table, false)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashUrl failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// has gbhasthumbnail:1 or 0
|
||||
// BR 20160106 removed: if ( ! hashImageStuff ( table ) ) return NULL;
|
||||
if (!hashLanguage(table)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLanguage failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!hashCountry(table)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashCountry failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!hashIsAdult(table)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashIsAdult failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// now hash the terms sharded by termid and not docid here since they
|
||||
// just set a special bit in posdb key so Rebalance.cpp can work.
|
||||
// this will hash the content checksum which we need for deduping
|
||||
// which we use for diffbot custom crawls as well.
|
||||
if ( ! hashNoSplit ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashNoSplit failed", __FILE__,__func__, __LINE__);
|
||||
if (!hashNoSplit(table)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashNoSplit failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -445,16 +433,13 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
// global index now, so don't need this... 9/28/2014
|
||||
|
||||
// stop indexing xml docs
|
||||
bool indexDoc = true;
|
||||
if ( ! cr->m_indexBody ) indexDoc = false;
|
||||
|
||||
bool indexDoc = cr->m_indexBody;
|
||||
|
||||
// global index unless this is a json object in which case it is
|
||||
// hashed above in the call to hashJSON(). this will decrease disk
|
||||
// usage by about half, posdb* files are pretty big.
|
||||
if ( ! indexDoc )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, !indexDoc", __FILE__,__func__, __LINE__);
|
||||
if (!indexDoc) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, !indexDoc");
|
||||
return (char *)1;
|
||||
}
|
||||
|
||||
@ -464,9 +449,8 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
|
||||
// hash the body of the doc first so m_dist is 0 to match
|
||||
// the rainbow display of sections
|
||||
if ( ! hashBody2 (table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashBody2 failed", __FILE__,__func__, __LINE__);
|
||||
if (!hashBody2(table)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashBody2 failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -476,18 +460,16 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
// repeated title terms because we do not do spam detection
|
||||
// on them. thus, we need to hash these first before anything
|
||||
// else. give them triple the body score
|
||||
if ( ! hashTitle ( table ))
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashTitle failed", __FILE__,__func__, __LINE__);
|
||||
if (!hashTitle(table)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashTitle failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// . hash the keywords tag, limited to first 2k of them so far
|
||||
// . hash above the neighborhoods so the neighborhoods only index
|
||||
// what is already in the hash table
|
||||
if ( ! hashMetaKeywords(table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaKeywords failed", __FILE__,__func__, __LINE__);
|
||||
if (!hashMetaKeywords(table)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaKeywords failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -495,18 +477,16 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
// we index the single words in the neighborhoods next, and
|
||||
// we had songfacts.com coming up for the 'street light facts'
|
||||
// query because it had a bunch of anomalous inlink text.
|
||||
if ( ! hashIncomingLinkText(table,false,true))
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashIncomingLinkText failed", __FILE__,__func__, __LINE__);
|
||||
if (!hashIncomingLinkText(table, false, true)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashIncomingLinkText failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// then the meta summary and description tags with half the score of
|
||||
// the body, and only hash a term if was not already hashed above
|
||||
// somewhere.
|
||||
if ( ! hashMetaSummary(table) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaSummary failed", __FILE__,__func__, __LINE__);
|
||||
if (!hashMetaSummary(table)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaSummary failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -514,68 +494,48 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
// BR 20160220
|
||||
// Store value of meta tag "geo.placename" to help aid searches for
|
||||
// location specific sites, e.g. 'Restaurant in London'
|
||||
if ( ! hashMetaGeoPlacename(table) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaGeoPlacename failed", __FILE__,__func__, __LINE__);
|
||||
if (!hashMetaGeoPlacename(table)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaGeoPlacename failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
|
||||
skip:
|
||||
skip:
|
||||
|
||||
// this will only increment the scores of terms already in the table
|
||||
// because we neighborhoods are not techincally in the document
|
||||
// necessarily and we do not want to ruin our precision
|
||||
if ( ! hashNeighborhoods ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashNeighborhoods failed", __FILE__,__func__, __LINE__);
|
||||
if (!hashNeighborhoods(table)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashNeighborhoods failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( ! hashLinks ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashLinks failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( ! hashDateNumbers ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashDateNumbers failed", __FILE__,__func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( ! hashMetaTags ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaTags failed", __FILE__,__func__, __LINE__);
|
||||
if (!hashLinks(table)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLinks failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( ! hashPermalink ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashPermaLink failed", __FILE__,__func__, __LINE__);
|
||||
if (!hashDateNumbers(table)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashDateNumbers failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!hashMetaTags(table)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashMetaTags failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!hashPermalink(table)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashPermaLink failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// hash gblang:de last for parsing consistency
|
||||
if ( ! hashLanguageString ( table ) )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashLanguageString failed", __FILE__,__func__, __LINE__);
|
||||
if (!hashLanguageString(table)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, hashLanguageString failed");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// . hash gbkeyword:gbmininlinks where the score is the inlink count
|
||||
// . the inlink count can go from 1 to 255
|
||||
// . an ip neighborhood can vote no more than once
|
||||
// . this is in LinkInfo::hash
|
||||
//if ( ! hashMinInlinks ( table , linkInfo ) ) return NULL;
|
||||
|
||||
|
||||
// return true if we don't need to print parser info
|
||||
//if ( ! m_pbuf ) return true;
|
||||
// print out the table into g_bufPtr now if we need to
|
||||
//table->print ( );
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, OK", __FILE__,__func__, __LINE__);
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, OK");
|
||||
return (char *)1;
|
||||
}
|
||||
|
||||
@ -640,7 +600,6 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
|
||||
// only get content for <meta name=..> not <meta http-equiv=..>
|
||||
int32_t tagLen;
|
||||
char *tag = m_xml.getString ( i , "name" , &tagLen );
|
||||
char *tptr = tag;
|
||||
char tagLower[128];
|
||||
int32_t j ;
|
||||
int32_t code;
|
||||
@ -697,13 +656,6 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// . don't allow reserved names: site, url, suburl, link and ip
|
||||
// . actually, the colon is included as part of those
|
||||
// field names, so we really lucked out...!
|
||||
// . index this converted tag name
|
||||
tptr = tagLower;
|
||||
|
||||
// get the content
|
||||
int32_t len;
|
||||
char *s = m_xml.getString ( i , "content" , &len );
|
||||
@ -742,22 +694,13 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
|
||||
// NULL terminate the buffer
|
||||
buf[len] = '\0';
|
||||
|
||||
// temp null term
|
||||
char c = tptr[tagLen];
|
||||
tptr[tagLen] = 0;
|
||||
|
||||
|
||||
// BR 20160220
|
||||
// Now index the wanted meta tags as normal text without prefix so they
|
||||
// are used in user searches automatically.
|
||||
// custom
|
||||
//hi.m_prefix = tptr;
|
||||
hi.m_prefix = NULL;
|
||||
|
||||
// desc is NULL, prefix will be used as desc
|
||||
bool status = hashString ( buf,len,&hi );
|
||||
// put it back
|
||||
tptr[tagLen] = c;
|
||||
|
||||
// bail on error, g_errno should be set
|
||||
if ( ! status ) return false;
|
||||
|
||||
@ -1088,7 +1031,7 @@ bool XmlDoc::hashLinksForLinkdb ( HashTableX *dt ) {
|
||||
#endif
|
||||
// set this key, it is the entire record
|
||||
key224_t k;
|
||||
k = g_linkdb.makeKey_uk ( linkeeSiteHash32 ,
|
||||
k = Linkdb::makeKey_uk ( linkeeSiteHash32 ,
|
||||
m_links.getLinkHash64(i) ,
|
||||
spam , // link spam?
|
||||
siteRank , // was quality
|
||||
@ -1509,8 +1452,7 @@ bool XmlDoc::hashIncomingLinkText ( HashTableX *tt ,
|
||||
|
||||
// sanity check
|
||||
if ( hashAnomalies == hashNonAnomalies ) { g_process.shutdownAbort(true); }
|
||||
// display this note in page parser
|
||||
const char *note = "hashing incoming link text";
|
||||
|
||||
// sanity
|
||||
if ( ! m_linkInfo1Valid ) { g_process.shutdownAbort(true); }
|
||||
|
||||
@ -1531,8 +1473,6 @@ bool XmlDoc::hashIncomingLinkText ( HashTableX *tt ,
|
||||
// brought the following code in from LinkInfo.cpp
|
||||
//
|
||||
|
||||
int32_t noteLen = 0;
|
||||
if ( note ) noteLen = strlen ( note );
|
||||
// count "external" inlinkers
|
||||
int32_t ecount = 0;
|
||||
|
||||
@ -1631,11 +1571,6 @@ bool XmlDoc::hashNeighborhoods ( HashTableX *tt ) {
|
||||
|
||||
//int32_t inlinks = *getSiteNumInlinks();
|
||||
|
||||
// HACK: to avoid having to pass a flag to TermTable, then to
|
||||
// Words::hash(), Phrases::hash(), etc. just flip a bit in the
|
||||
// table to make it not add anything unless it is already in there.
|
||||
tt->m_addIffNotUnique = true;
|
||||
|
||||
// update hash parms
|
||||
HashInfo hi;
|
||||
hi.m_tt = tt;
|
||||
@ -1647,9 +1582,6 @@ bool XmlDoc::hashNeighborhoods ( HashTableX *tt ) {
|
||||
int32_t len = k->size_surroundingText - 1;
|
||||
if ( ! hashString ( s, len, &hi ) ) return false;
|
||||
|
||||
// now turn it back off
|
||||
tt->m_addIffNotUnique = false;
|
||||
|
||||
// get the next Inlink
|
||||
goto loop;
|
||||
}
|
||||
@ -1992,7 +1924,7 @@ bool XmlDoc::hashSingleTerm( const char *s, int32_t slen, HashInfo *hi ) {
|
||||
|
||||
|
||||
key144_t k;
|
||||
g_posdb.makeKey ( &k ,
|
||||
Posdb::makeKey ( &k ,
|
||||
final,
|
||||
0LL, // docid
|
||||
0, // dist
|
||||
@ -2355,7 +2287,7 @@ bool XmlDoc::hashWords3( HashInfo *hi, const Words *words, Phrases *phrases, Sec
|
||||
// if using posdb
|
||||
key144_t k;
|
||||
|
||||
g_posdb.makeKey ( &k ,
|
||||
Posdb::makeKey ( &k ,
|
||||
h ,
|
||||
0LL,//docid
|
||||
wposvec[i], // dist,
|
||||
@ -2405,7 +2337,7 @@ bool XmlDoc::hashWords3( HashInfo *hi, const Words *words, Phrases *phrases, Sec
|
||||
int64_t nah ;
|
||||
nah = hash64Lower_utf8 ( wptrs[i], wlens[i]-2 );
|
||||
if ( plen>0 ) nah = hash64 ( nah , prefixHash );
|
||||
g_posdb.makeKey ( &k ,
|
||||
Posdb::makeKey ( &k ,
|
||||
nah,
|
||||
0LL,//docid
|
||||
wposvec[i], // dist,
|
||||
@ -2462,7 +2394,7 @@ skipsingleword:
|
||||
// hash with prefix
|
||||
if ( plen > 0 ) ph2 = hash64 ( npid , prefixHash );
|
||||
else ph2 = npid;
|
||||
g_posdb.makeKey ( &k ,
|
||||
Posdb::makeKey ( &k ,
|
||||
ph2 ,
|
||||
0LL,//docid
|
||||
wposvec[i],//dist,
|
||||
@ -2565,7 +2497,7 @@ bool XmlDoc::hashFieldMatchTerm ( char *val , int32_t vlen , HashInfo *hi ) {
|
||||
// a prefix hash
|
||||
// . use mostly fake value otherwise
|
||||
key144_t k;
|
||||
g_posdb.makeKey ( &k ,
|
||||
Posdb::makeKey ( &k ,
|
||||
ph2 ,
|
||||
0,//docid
|
||||
0,// word pos #
|
||||
@ -2696,7 +2628,7 @@ bool XmlDoc::hashNumberForSortingAsInt32 ( int32_t n , HashInfo *hi , const char
|
||||
// a prefix hash
|
||||
// . use mostly fake value otherwise
|
||||
key144_t k;
|
||||
g_posdb.makeKey ( &k ,
|
||||
Posdb::makeKey ( &k ,
|
||||
ph2 ,
|
||||
0,//docid
|
||||
0,// word pos #
|
||||
@ -2719,14 +2651,7 @@ bool XmlDoc::hashNumberForSortingAsInt32 ( int32_t n , HashInfo *hi , const char
|
||||
false , // delkey?
|
||||
hi->m_shardByTermId );
|
||||
|
||||
//int64_t final = hash64n("products.offerprice",0);
|
||||
//int64_t prefix = hash64n("gbsortby",0);
|
||||
//int64_t h64 = hash64 ( final , prefix);
|
||||
//if ( ph2 == h64 )
|
||||
// log("hey: got offer price");
|
||||
// now set the float in that key
|
||||
//g_posdb.setFloat ( &k , f );
|
||||
g_posdb.setInt ( &k , n );
|
||||
Posdb::setInt ( &k , n );
|
||||
|
||||
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
|
||||
// so that we can b-step into a posdb list and make sure
|
||||
@ -2736,11 +2661,11 @@ bool XmlDoc::hashNumberForSortingAsInt32 ( int32_t n , HashInfo *hi , const char
|
||||
// key that has a float stored in it. then it will NOT
|
||||
// set the siterank and langid bits which throw our sorting
|
||||
// off!!
|
||||
g_posdb.setAlignmentBit ( &k , 0 );
|
||||
Posdb::setAlignmentBit ( &k , 0 );
|
||||
|
||||
// sanity
|
||||
//float t = g_posdb.getFloat ( &k );
|
||||
int32_t x = g_posdb.getInt ( &k );
|
||||
//float t = Posdb::getFloat ( &k );
|
||||
int32_t x = Posdb::getInt ( &k );
|
||||
if ( x != n ) { g_process.shutdownAbort(true); }
|
||||
|
||||
HashTableX *dt = hi->m_tt;
|
||||
|
42
main.cpp
42
main.cpp
@ -2844,7 +2844,7 @@ void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool
|
||||
startKey.setMin();
|
||||
endKey.setMax();
|
||||
lastKey.setMin();
|
||||
startKey = g_titledb.makeFirstKey ( docid );
|
||||
startKey = Titledb::makeFirstKey ( docid );
|
||||
// turn off threads
|
||||
g_jobScheduler.disallow_new_jobs();
|
||||
// get a meg at a time
|
||||
@ -2909,7 +2909,7 @@ void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool
|
||||
key96_t k = list.getCurrentKey();
|
||||
char *rec = list.getCurrentRec();
|
||||
int32_t recSize = list.getCurrentRecSize();
|
||||
int64_t docId = g_titledb.getDocIdFromKey ( k );
|
||||
int64_t docId = Titledb::getDocIdFromKey ( &k );
|
||||
if ( k <= lastKey )
|
||||
log("key out of order. "
|
||||
"lastKey.n1=%" PRIx32" n0=%" PRIx64" "
|
||||
@ -4292,8 +4292,8 @@ bool parseTest ( const char *coll, int64_t docId, const char *query ) {
|
||||
// get a title rec
|
||||
g_jobScheduler.disallow_new_jobs();
|
||||
RdbList tlist;
|
||||
key96_t startKey = g_titledb.makeFirstKey ( docId );
|
||||
key96_t endKey = g_titledb.makeLastKey ( docId );
|
||||
key96_t startKey = Titledb::makeFirstKey ( docId );
|
||||
key96_t endKey = Titledb::makeLastKey ( docId );
|
||||
// a niceness of 0 tells it to block until it gets results!!
|
||||
Msg5 msg5;
|
||||
|
||||
@ -4722,7 +4722,7 @@ void dumpPosdb (const char *coll, int32_t startFileNum, int32_t numFiles, bool i
|
||||
const char *dd = "";
|
||||
if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)";
|
||||
int64_t d = g_posdb.getDocId(&k);
|
||||
uint8_t dh = g_titledb.getDomHash8FromDocId(d);
|
||||
uint8_t dh = Titledb::getDomHash8FromDocId(d);
|
||||
char *rec = list.getCurrentRec();
|
||||
int32_t recSize = 18;
|
||||
if ( rec[0] & 0x04 ) recSize = 6;
|
||||
@ -4947,10 +4947,10 @@ void dumpLinkdb ( const char *coll,
|
||||
if ( url ) {
|
||||
Url u;
|
||||
u.set( url, strlen( url ), true, false );
|
||||
uint32_t h32 = u.getHostHash32();//g_linkdb.getUrlHash(&u)
|
||||
uint32_t h32 = u.getHostHash32();
|
||||
int64_t uh64 = hash64n(url,0);
|
||||
startKey = g_linkdb.makeStartKey_uk ( h32 , uh64 );
|
||||
endKey = g_linkdb.makeEndKey_uk ( h32 , uh64 );
|
||||
startKey = Linkdb::makeStartKey_uk ( h32 , uh64 );
|
||||
endKey = Linkdb::makeEndKey_uk ( h32 , uh64 );
|
||||
}
|
||||
// turn off threads
|
||||
g_jobScheduler.disallow_new_jobs();
|
||||
@ -5006,7 +5006,7 @@ void dumpLinkdb ( const char *coll,
|
||||
// is it a delete?
|
||||
const char *dd = "";
|
||||
if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)";
|
||||
int64_t docId = (int64_t)g_linkdb.getLinkerDocId_uk(&k);
|
||||
int64_t docId = (int64_t)Linkdb::getLinkerDocId_uk(&k);
|
||||
int32_t shardNum = getShardNum(RDB_LINKDB,&k);
|
||||
printf("k=%s "
|
||||
"linkeesitehash32=0x%08" PRIx32" "
|
||||
@ -5022,16 +5022,16 @@ void dumpLinkdb ( const char *coll,
|
||||
"shardNum=%" PRIu32" "
|
||||
"%s\n",
|
||||
KEYSTR(&k,sizeof(key224_t)),
|
||||
(int32_t)g_linkdb.getLinkeeSiteHash32_uk(&k),
|
||||
(int64_t)g_linkdb.getLinkeeUrlHash64_uk(&k),
|
||||
(int32_t)g_linkdb.isLinkSpam_uk(&k),
|
||||
(int32_t)g_linkdb.getLinkerSiteRank_uk(&k),
|
||||
//hc,//g_linkdb.getLinkerHopCount_uk(&k),
|
||||
iptoa((int32_t)g_linkdb.getLinkerIp_uk(&k)),
|
||||
(int32_t)Linkdb::getLinkeeSiteHash32_uk(&k),
|
||||
(int64_t)Linkdb::getLinkeeUrlHash64_uk(&k),
|
||||
(int32_t)Linkdb::isLinkSpam_uk(&k),
|
||||
(int32_t)Linkdb::getLinkerSiteRank_uk(&k),
|
||||
//hc,//Linkdb::getLinkerHopCount_uk(&k),
|
||||
iptoa((int32_t)Linkdb::getLinkerIp_uk(&k)),
|
||||
docId,
|
||||
(int32_t)g_linkdb.getDiscoveryDate_uk(&k),
|
||||
(int32_t)g_linkdb.getLostDate_uk(&k),
|
||||
(int32_t)g_linkdb.getLinkerSiteHash32_uk(&k),
|
||||
(int32_t)Linkdb::getDiscoveryDate_uk(&k),
|
||||
(int32_t)Linkdb::getLostDate_uk(&k),
|
||||
(int32_t)Linkdb::getLinkerSiteHash32_uk(&k),
|
||||
shardNum,
|
||||
dd );
|
||||
}
|
||||
@ -5441,7 +5441,7 @@ int injectFile ( const char *filename , char *ips , const char *coll ) {
|
||||
}
|
||||
|
||||
if ( startDocId != 0LL )
|
||||
s_titledbKey = g_titledb.makeFirstKey(startDocId);
|
||||
s_titledbKey = Titledb::makeFirstKey(startDocId);
|
||||
|
||||
s_endDocId = endDocId;
|
||||
|
||||
@ -5569,7 +5569,7 @@ void doInject ( int fd , void *state ) {
|
||||
// turn off threads so this happens right away
|
||||
g_jobScheduler.disallow_new_jobs();
|
||||
key96_t endKey; //endKey.setMax();
|
||||
endKey = g_titledb.makeFirstKey(s_endDocId);
|
||||
endKey = Titledb::makeFirstKey(s_endDocId);
|
||||
RdbList list;
|
||||
Msg5 msg5;
|
||||
const char *coll = "main";
|
||||
@ -7160,7 +7160,7 @@ void countdomains( const char* coll, int32_t numRecs, int32_t verbosity, int32_t
|
||||
key96_t k = list.getCurrentKey();
|
||||
char *rec = list.getCurrentRec();
|
||||
int32_t recSize = list.getCurrentRecSize();
|
||||
int64_t docId = g_titledb.getDocId ( &k );
|
||||
int64_t docId = Titledb::getDocId ( &k );
|
||||
attempts++;
|
||||
|
||||
if ( k <= lastKey )
|
||||
|
@ -928,7 +928,7 @@ bool Test::injectLoop ( ) {
|
||||
m_sreq.m_domHash32 = fakeIp;
|
||||
m_sreq.m_hostHash32 = fakeIp;
|
||||
m_sreq.m_siteHash32 = fakeIp;
|
||||
//m_sreq.m_probDocId = g_titledb.getProbableDocId( m_sreq.m_url );
|
||||
//m_sreq.m_probDocId = Titledb::getProbableDocId( m_sreq.m_url );
|
||||
// this crap is fake
|
||||
m_sreq.m_isInjecting = 1;
|
||||
// use test-spider subdir for storing pages and spider times?
|
||||
@ -973,7 +973,6 @@ bool Test::injectLoop ( ) {
|
||||
m_coll ,
|
||||
NULL ,
|
||||
injectedWrapper ,
|
||||
MAX_NICENESS ,
|
||||
RDB_SPIDERDB ) )
|
||||
// return false if blocked
|
||||
return false;
|
||||
|
@ -165,9 +165,9 @@ int main ( int argc , char *argv[] ) {
|
||||
printf("encoded: %s\n",dst);
|
||||
|
||||
// the probable docid
|
||||
int64_t pd = g_titledb.getProbableDocId(&u);
|
||||
int64_t pd = Titledb::getProbableDocId(&u);
|
||||
printf("pdocid: %"UINT64"\n", pd );
|
||||
printf("dom8: 0x%"XINT32"\n", (int32_t)g_titledb.getDomHash8FromDocId(pd) );
|
||||
printf("dom8: 0x%"XINT32"\n", (int32_t)Titledb::getDomHash8FromDocId(pd) );
|
||||
if ( u.isLinkLoop() ) printf("islinkloop: yes\n");
|
||||
else printf("islinkloop: no\n");
|
||||
int64_t hh64 = u.getHostHash64();
|
||||
|
@ -10,6 +10,7 @@ static const char* makePosdbKey(char *key, int64_t termId, uint64_t docId, int32
|
||||
}
|
||||
|
||||
TEST(RdbListTest, MergeTestPosdbEmptyAll) {
|
||||
g_conf.m_logTraceRdbList = true;
|
||||
// setup test
|
||||
RdbList list1;
|
||||
list1.set(NULL, 0, NULL, 0, 0, true, Posdb::getUseHalfKeys(), Posdb::getKeySize());
|
||||
@ -32,6 +33,7 @@ TEST(RdbListTest, MergeTestPosdbEmptyAll) {
|
||||
}
|
||||
|
||||
TEST(RdbListTest, MergeTestPosdbEmptyOne) {
|
||||
g_conf.m_logTraceRdbList = true;
|
||||
char key[MAX_KEY_BYTES];
|
||||
|
||||
// setup test
|
||||
@ -83,6 +85,7 @@ TEST(RdbListTest, MergeTestPosdbEmptyOne) {
|
||||
|
||||
// verify that list order is from oldest to newest (last list will override first list)
|
||||
TEST(RdbListTest, MergeTestPosdbVerifyListOrder) {
|
||||
g_conf.m_logTraceRdbList = true;
|
||||
char key[MAX_KEY_BYTES];
|
||||
|
||||
// setup test
|
||||
|
Reference in New Issue
Block a user