#include "gb-include.h"
|
|
|
|
#include "Clusterdb.h"
|
|
#include "Threads.h"
|
|
#include "Rebalance.h"
|
|
|
|
// a global class extern'd in .h file
|
|
Clusterdb g_clusterdb;
|
|
Clusterdb g_clusterdb2;
|
|
|
|
/*
// for making the cluster cache key
static key_t makeClusterCacheKey ( uint32_t vfd,
				   uint32_t pageNum ) {
	key_t key;
	key.n1 = vfd + 1;
	key.n0 = (uint64_t)pageNum + 1;
	return key;
}

// DiskPageCache override functions
static void clusterGetPages ( DiskPageCache *pc,
			      int32_t vfd,
			      char *buf,
			      int32_t numBytes,
			      int64_t offset,
			      int32_t *newNumBytes,
			      int64_t *newOffset ) {
	bool cacheMiss = false;
	// return new disk offset, assume unchanged
	*newOffset = offset;
	*newNumBytes = numBytes;
	// what is the page range?
	int64_t sp = offset / GB_PAGE_SIZE ;
	int64_t ep = (offset + (numBytes-1)) / GB_PAGE_SIZE ;
	// setup the cache list
	RdbList cacheList;
	key_t startKey;
	startKey.n1 = 0;
	startKey.n0 = 0;
	// point to the buffer to fill
	char *bufPtr = buf;
	char *bufEnd = buf + numBytes;
	// read in the pages
	while ( sp <= ep && bufPtr < bufEnd ) {
		cacheList.reset();
		// get the cache key for the page
		key_t cacheKey = makeClusterCacheKey ( vfd, sp );
		// read in the list from cache
		collnum_t collnum = 0;
		g_clusterdb.getRdb()->m_cache.getList ( collnum,
							(char *)&cacheKey,
							(char *)&startKey,
							&cacheList,
							false,
							3600*24*365, // max age: one year
							true );
		//cacheList.checkList_r ( false, true );
		//log ( LOG_INFO, "cache: got list [%" INT32 ", %" INT64 "] [%" INT32 "]",
		//	vfd, sp, cacheList.m_listSize );
		int32_t size = cacheList.m_listSize;
		if ( size == 0 ) {
			cacheMiss = true;
			goto getPagesEnd;
		}
		//log ( LOG_INFO, "cache: got list [%" INT32 ", %" INT32 "] [%" INT32 "]",
		//	vfd, sp, size );
		if ( bufPtr + size >= bufEnd )
			size = bufEnd - bufPtr;
		// copy the list into the buffer
		gbmemcpy ( bufPtr, cacheList.m_list, size );
		// advance to the next page
		bufPtr += size;
		*newOffset += size;
		*newNumBytes -= size;
		sp++;
	}
 getPagesEnd:
	if ( !cacheMiss ) {
		pc->m_hits++;
		// *newNumBytes = -(*newNumBytes);
	}
	else
		pc->m_misses++;
}

static void clusterAddPages ( DiskPageCache *pc,
			      int32_t vfd,
			      char *buf,
			      int32_t numBytes,
			      int64_t offset ) {
	// make sure we have a clean vfd
	if ( vfd < 0 || vfd >= MAX_NUM_VFDS2 )
		return;
	// make sure the file didn't get unlinked
	if ( ! pc->m_memOff[vfd] )
		return;
	// get the number of twins, used for filtering
	int32_t numTwins = g_hostdb.getNumHostsPerShard();
	int32_t thisTwin = g_hostdb.m_hostId / g_hostdb.m_numShards;
	// get the bias range for this twin
	int64_t biasStart = ((0x0000003fffffffffLL)/(int64_t)numTwins) *
			    (int64_t)thisTwin;
	int64_t biasEnd;
	if ( thisTwin == numTwins - 1 )
		biasEnd = 0x0000003fffffffffLL + 1LL;
	else
		biasEnd = ((0x0000003fffffffffLL)/(int64_t)numTwins) *
			  (int64_t)(thisTwin+1);
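	// worked example (not in the original source): with numTwins = 2
	// the 38-bit docId space [0,0x4000000000) splits at 0x1fffffffff,
	// so twin 0 biases docIds in [0,0x1fffffffff) and twin 1 biases
	// docIds in [0x1fffffffff,0x4000000000).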
	// get the page range
	int64_t sp = offset / GB_PAGE_SIZE;
	// point to it
	char *bufPtr = buf;
	char *bufEnd = buf + numBytes;
	// how much did we exceed the boundary by?
	int32_t skip = (int32_t)(offset - sp * GB_PAGE_SIZE);
	int32_t size = GB_PAGE_SIZE - skip;
	// setup the cache lists, may need to merge with an old list
	RdbList cacheList1;
	cacheList1.set ( NULL,
			 0,
			 NULL,
			 0,
			 0,
			 true,
			 true,
			 g_clusterdb.getRdb()->m_ks );
	cacheList1.growList(GB_PAGE_SIZE);
	// set the buffer data to a list so we can read it nicely
	key_t startKey;
	key_t endKey;
	startKey.n1 = 0;
	startKey.n0 = 0;
	endKey.n1 = 0xffffffff;
	endKey.n0 = 0xffffffffffffffffULL;
	// setup our source list
	RdbList dataList;
	dataList.set ( bufPtr,
		       numBytes,
		       bufPtr,
		       numBytes,
		       (char *)&startKey,
		       (char *)&endKey,
		       0,
		       false,
		       true,
		       g_clusterdb.getRdb()->m_ks );
	dataList.resetListPtr();
	// add pages to the cache
	while ( bufPtr < bufEnd ) {
		int32_t filled = 0;
		// ensure "size" is not too big
		if ( bufPtr + size > bufEnd )
			size = bufEnd - bufPtr;
		// . add the page to the cache
		cacheList1.reset();
		// check the first key; if it's too large, we're all done here
		key_t key = dataList.getCurrentKey();
		int64_t docId = g_clusterdb.getDocId ( key );
		//if ( docId >= biasEnd ) {
		//	log ( "clusterdb: DocId after bias end, key.n1=%" XINT32 " key.n0=%" XINT64 "", key.n1, key.n0 );
		//	log ( "clusterdb: DocId after bias end, %" XINT64 " >= %" XINT64 "", docId, biasEnd );
		//	return;
		//}
		// make the cache key using vfd and page number
		key_t cacheKey = makeClusterCacheKey ( vfd, sp );
		// filter the data into a list to be cached
		while ( filled < size && !dataList.isExhausted() ) {
			key = dataList.getCurrentKey();
			// check the key for filtering
			//int64_t docId = g_clusterdb.getDocId ( key );
			//int32_t twin = hashLong((int32_t)docId) % numTwins;
			//if ( twin == thisTwin ) {
			// add the key to the rdb list
			cacheList1.addRecord(key, 0, NULL);
			//}
			// next key
			filled += dataList.getCurrentRecSize();
			dataList.skipCurrentRecord();
		}
		collnum_t collnum = 0;
		// if the last key is too small, don't add the page
		docId = g_clusterdb.getDocId ( key );
		if ( docId >= biasStart )
			g_clusterdb.getRdb()->m_cache.addList ( collnum,
								cacheKey,
								&cacheList1 );
		//else
		//	log ( "clusterdb: DocId before bias start, %" INT64 " >= %" INT64 "", docId, biasStart );
		//cacheList1.checkList_r ( false, true );
		//log ( LOG_INFO, "cache: add list [%" INT32 ", %" INT64 "] [%" INT32 "]",
		//	vfd, sp, cacheList1.m_listSize );
		// advance
		bufPtr += filled;
		sp++;
		size = GB_PAGE_SIZE;
		skip = 0;
	}
}

static int32_t clusterGetVfd ( DiskPageCache *pc,
			       int64_t maxFileSize ) {
	// pick a vfd for this file, will be used in the cache key
	int32_t i;
	int32_t count = MAX_NUM_VFDS2;
	for ( i = pc->m_nexti; count-- > 0; i++ ) {
		if ( i >= MAX_NUM_VFDS2 ) i = 0;
		if ( ! pc->m_memOff[i] ) break;
	}
	// bail if none left (count goes negative when the scan above
	// finds no free slot)
	if ( count < 0 ) {
		g_errno = EBADENGINEER;
		log ( LOG_LOGIC, "db: pagecache: clusterGetVfd: "
		      "no vfds remaining." );
		return -1;
	}
	// start looking here next time
	pc->m_nexti = i + 1;
	// set m_memOff[i] to something to hold the vfd
	pc->m_memOff[i] = (int32_t *)0x7fffffff;
	// return the vfd
	return i;
}

static void clusterRmVfd ( DiskPageCache *pc,
			   int32_t vfd ) {
	// make sure it's a clean vfd
	if ( vfd < 0 || vfd >= MAX_NUM_VFDS2 )
		return;
	// clear the vfd for use
	pc->m_memOff[vfd] = NULL;
	// need to clear out the cache records using this vfd
	collnum_t collnum = 0;
	key_t startKey, endKey;
	startKey.n1 = vfd + 1;
	startKey.n0 = 0;
	endKey.n1 = vfd + 1;
	endKey.n0 = 0xffffffffffffffffULL;
	g_clusterdb.getRdb()->m_cache.removeKeyRange ( collnum,
						       (char *)&startKey,
						       (char *)&endKey );
	//log ( LOG_INFO, "cache: BIASED CACHE REMOVED VFD!!" );
}
*/

// reset rdb
void Clusterdb::reset() { m_rdb.reset(); }

// . this no longer maintains an rdb of cluster recs
// . Msg22 now just uses the cache to hold cluster recs that it computes
//   from titleRecs
// . clusterRecs are now just TitleRec keys...
// . we can load one the same from titledb as we could from clusterdb
//   and we still don't need to uncompress the titleRec to get the info
bool Clusterdb::init ( ) {
	// this should be about 200/4 = 50 megs per host on my current setup
	int32_t maxTreeMem = g_conf.m_clusterdbMaxTreeMem;
	// . what's max # of tree nodes?
	// . key+4+left+right+parents+dataPtr = 12+4 +4+4+4+4 = 32
	// . 28 bytes per record when in the tree
	int32_t maxTreeNodes = maxTreeMem / ( 16 + CLUSTER_REC_SIZE );
	// . each cached list is just one key in the tree...
	// . 28(tree space) + 24(cache overhead) = 52
	//int32_t maxCacheMem = g_conf.m_clusterdbMaxCacheMem ;
	// do not use any page cache if doing tmp cluster in order to
	// prevent swapping
	//int32_t pcmem = g_conf.m_clusterdbMaxDiskPageCacheMem;
	int32_t pcmem = 0;
	if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
	// we need that 100MB for termlists! they are >90MB now!!
	pcmem = 10000000; // 10MB
	// temp hack for rebuild
	//pcmem = 0;
	// RdbCache has a 4 byte ptr to each rec in the cache
	//int32_t maxCacheNodes = maxCacheMem / ( 4 + CLUSTER_REC_SIZE );
	//int32_t nodeSize = sizeof(key_t) + sizeof(collnum_t);
	//int32_t pageSize = GB_TFNDB_PAGE_SIZE;
	//int32_t nodeSize = (pageSize + 12) + sizeof(collnum_t) + 20;
	//int32_t maxCacheNodes = maxCacheMem / nodeSize ;
	// init the page cache
	// if ( ! m_pc.init ( "clusterdb",
	// 		    RDB_CLUSTERDB,
	// 		    pcmem ,
	// 		    pageSize ) )
	// 	//g_conf.m_clusterdbMaxDiskPageCacheMem,
	// 	//clusterGetPages,
	// 	//clusterAddPages,
	// 	//clusterGetVfd,
	// 	//clusterRmVfd ))
	// 	return log("db: Clusterdb init failed.");
	//bool bias = true;
	//if ( g_conf.m_fullSplit ) bias = false;
	bool bias = false;
	// initialize our own internal rdb
	return m_rdb.init ( g_hostdb.m_dir ,
			    "clusterdb" ,
			    true , // dedup
			    //CLUSTER_REC_SIZE - sizeof(key_t),//fixedDataSize
			    0 , // no data now! just docid/s/c
			    2 , // g_conf.m_clusterdbMinFilesToMerge ,
			    g_conf.m_clusterdbMaxTreeMem ,
			    maxTreeNodes , // maxTreeNodes
			    true , //false , // balance tree?
			    0 , // maxCacheMem
			    0 , // maxCacheNodes
			    true , // half keys?
			    g_conf.m_clusterdbSaveCache ,
			    NULL , //&m_pc ,
			    false , // is titledb?
			    true , // preload disk page cache?
			    12 , // key size
			    bias ); // bias disk page cache?
}
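
// A minimal, hypothetical startup sketch (not from the original source; the
// real call sites live elsewhere), shown commented out like the other
// reference code in this file, just to illustrate the init()/verify()
// sequence this file provides. "main" is a made-up collection name.
/*
	char coll[] = "main";
	if      ( ! g_clusterdb.init()          )
		log("db: Clusterdb init failed.");
	else if ( ! g_clusterdb.verify ( coll ) )
		log("db: Clusterdb verify failed for coll %s.", coll);
*/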

// init the rebuild/secondary rdb, used by PageRepair.cpp
bool Clusterdb::init2 ( int32_t treeMem ) {
	// . what's max # of tree nodes?
	// . key+4+left+right+parents+dataPtr = 12+4 +4+4+4+4 = 32
	// . 28 bytes per record when in the tree
	int32_t maxTreeNodes = treeMem / ( 16 + CLUSTER_REC_SIZE );
	// initialize our own internal rdb
	return m_rdb.init ( g_hostdb.m_dir ,
			    "clusterdbRebuild" ,
			    true , // dedup
			    0 , // no data now! just docid/s/c
			    50 , // m_clusterdbMinFilesToMerge
			    treeMem , // g_conf.m_clusterdbMaxTreeMem
			    maxTreeNodes ,
			    true , // balance tree?
			    0 , // maxCacheMem
			    0 , // maxCacheNodes
			    true , // half keys?
			    false , // g_conf.m_clusterdbSaveCache
			    NULL , // &m_pc
			    false , // is titledb?
			    false , // preload disk page cache?
			    12 , // key size
			    true ); // bias disk page cache?
}

/*
bool Clusterdb::addColl ( char *coll, bool doVerify ) {
	if ( ! m_rdb.addColl ( coll ) ) return false;
	if ( ! doVerify ) return true;
	// verify
	if ( verify(coll) ) return true;
	// if not allowing scale, return false
	if ( ! g_conf.m_allowScale ) return false;
	// otherwise let it go
	log ( "db: Verify failed, but scaling is allowed, passing." );
	return true;
}
*/

bool Clusterdb::verify ( char *coll ) {
	log ( LOG_DEBUG, "db: Verifying Clusterdb for coll %s...", coll );
	g_threads.disableThreads();

	Msg5 msg5;
	Msg5 msg5b;
	RdbList list;
	key_t startKey;
	key_t endKey;
	startKey.setMin();
	endKey.setMax();
	//int32_t minRecSizes = 64000;
	CollectionRec *cr = g_collectiondb.getRec(coll);

	if ( ! msg5.getList ( RDB_CLUSTERDB ,
			      cr->m_collnum ,
			      &list ,
			      startKey ,
			      endKey ,
			      64000 , // minRecSizes
			      true , // includeTree?
			      false , // add to cache?
			      0 , // max cache age
			      0 , // startFileNum
			      -1 , // numFiles
			      NULL , // state
			      NULL , // callback
			      0 , // niceness
			      false , // err correction?
			      NULL ,
			      0 ,
			      -1 ,
			      true ,
			      -1LL ,
			      &msg5b ,
			      true )) {
		g_threads.enableThreads();
		return log("db: HEY! it did not block");
	}

	int32_t count = 0;
	int32_t got = 0;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		// skip negative keys
		if ( (k.n0 & 0x01) == 0x00 ) continue;
		count++;
		//uint32_t groupId = getGroupId ( RDB_CLUSTERDB , &k );
		//if ( groupId == g_hostdb.m_groupId ) got++;
		uint32_t shardNum = getShardNum( RDB_CLUSTERDB , &k );
		if ( shardNum == getMyShardNum() ) got++;
	}
	if ( got != count ) {
		// tally it up
		g_rebalance.m_numForeignRecs += count - got;
		log ("db: Out of first %" INT32 " records in clusterdb, "
		     "only %" INT32 " belong to our shard.",count,got);
		// exit if NONE, we probably got the wrong data
		if ( got == 0 ) log("db: Are you sure you have the right "
				    "data in the right directory? "
				    "Exiting.");
		log ( "db: Exiting due to Clusterdb inconsistency." );
		g_threads.enableThreads();
		return g_conf.m_bypassValidation;
	}
	log ( LOG_DEBUG, "db: Clusterdb passed verification successfully for "
	      "%" INT32 " recs.", count );
	// DONE
	g_threads.enableThreads();
	return true;
}

#include "IndexList.h"

// . this routine is very slow...
// . it is used to get a titleRec's (document's) sample vector at query time,
//   but we should really compute this vector at build time and store it in
//   the titleRec itself, to avoid having to compute it at query time.
// . vector must have at least VECTOR_SIZE bytes available
/*
void Clusterdb::getSampleVector ( char *vec ,
				  class Doc *doc,
				  char *coll ,
				  int32_t collLen ,
				  int32_t niceness) {
	int64_t startTime = gettimeofdayInMilliseconds();
	TitleRec *tr = doc->getTitleRec();
	SiteRec *sr = doc->getSiteRec();
	//sr->set ( tr->getSite() , tr->getColl() , tr->getCollLen() ,
	sr->set ( tr->getSite() , coll , collLen ,
		  tr->getSiteFilenum() , SITEREC_CURRENT_VERSION );
	// hashes the whole doc, but more importantly for us, computes
	// XmlDoc::m_vector
	//doc->set ( niceness );
	XmlDoc *xd = doc->getXmlDoc();
	xd->set ( tr , sr , NULL, niceness);
	// this just sets the vector
	doc->getIndexList(NULL,true,true,false,NULL,NULL,NULL, niceness);
	// log the time
	int64_t took = gettimeofdayInMilliseconds() - startTime;
	if ( took > 3 )
		log(LOG_INFO,"query: Took %" INT64 " ms to make indexlist.",took);
	// so get it
	char *p = doc->getSampleVector ( );
	// and store it. short vectors are padded with 0's.
	gbmemcpy ( vec , p , SAMPLE_VECTOR_SIZE );
}
*/

// if VECTOR_SIZE is 128 bytes then that is 32 termIds (4 bytes each) that we
// use to make this vector. these 32 termids are the lowest 32 termids out of
// all the termids for the document. we can further hash pairs to reduce the
// vector size from 128 to 64 bytes. but we must hash the pairs strategically.
// What are the odds of two things being 90% similar when they are not?
#define SAMPLE_VECTOR_LEN (SAMPLE_VECTOR_SIZE / 4)
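// worked example (not from the original source): with SAMPLE_VECTOR_SIZE =
// 128 bytes, SAMPLE_VECTOR_LEN = 128/4 = 32 uint32_t components, one of
// which is reserved for the 0 terminator written by getSampleVector() below.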

// . it would be nice to use the new addition to the Words class that allows
//   a word to be a tag. this kinda replaces the xml class.
// . returns false and sets g_errno on error
/*
bool Clusterdb::getGigabitVector ( char *vec , Xml *xml ) {
	// . get filtered text, no link text since that is usually for menus
	// . get first 64k
	char buf[64*1024];
	xml->getText ( buf , 64*1024 );
	// hash into this table
	TermTable table;
	Query q;
	TopicGroup t;
	t.m_numTopics = 32;
	t.m_maxTopics = 32;
	t.m_docsToScanForTopics = 1;
	t.m_minTopicScore = 0;
	t.m_maxWordsPerTopic = 4;
	t.m_meta[0] = '\0';
	t.m_delimeter = 0;
	t.m_useIdfForTopics = true;
	t.m_dedup = false;
	t.m_minDocCount = 1;
	t.m_ipRestrict = false;
	t.m_dedupSamplePercent = 0;
	t.m_topicRemoveOverlaps = true;
	t.m_topicSampleSize = 64*1024;
	t.m_topicMaxPunctLen = 3;
	State23 st;
	st.m_numRequests = 1;
	st.m_msg20[0].m_bufSampleBuf = buf;
	st.m_msg20[0].m_bufSampleBufLen = bufLen;
	st.m_returnDocIdCount = false;
	st.m_returnDocIds = false;
	st.m_returnPops = false;
	Msg24 msg24;
	if ( ! msg24.getTopics ( &st , // State23
				 &t ,
				 &table ,
				 &q ,
				 0 , // gid
				 &buf ,
				 &bufLen ) )
		return false;
	// now hash the winning topics into our vector

}
*/

/*
void Clusterdb::getSampleVector ( char *vec , TermTable *table ) {
	// no compression is used in this list so each docId/termId is 12 bytes
	int32_t numTerms = table->getNumTermsUsed();
	// . how many can we hold? we'll just use 4 bytes per vector component
	// . let's get 2x as many termids as required, then we will combine
	//   every 2 termids into one via hashing... this makes falsely high
	//   similarities less likely, but makes truly high similarities less
	//   likely to be detected as well.
	int32_t maxTerms = (1 * SAMPLE_VECTOR_LEN) - 1;
	// what portion of them do we want to mask out from the rest?
	int32_t ratio = numTerms / maxTerms ;
	unsigned char mask = 0x00;
	while ( ratio >= 2 ) {
		// shift the mask down, ensure hi bit is set
		mask >>= 1;
		mask |= 0x80;
		ratio >>= 1; // /2
	}
	uint32_t d [ 3000 ];
	// if we don't have enough, make them 0's
	memset ( d , 0 , SAMPLE_VECTOR_SIZE );
	memset ( vec , 0 , SAMPLE_VECTOR_SIZE );
 again:
	// a buffer to hold the top termIds
	int32_t nd = 0;
	// . buffer should have at least "maxTerms" in it
	// . these should all be 12 byte keys
	int32_t i = 0 ;
	int32_t n = table->getNumTerms();
	int64_t *termIds = table->getTermIds();
	uint32_t *scores = table->getScores ();
	for ( ; i < n ; i++ ) {
		// skip if empty bucket
		if ( ! scores[i] ) continue;
		// skip if negative key, since we can be deleting old keys
		// from call from Msg14.cpp
		// NO! this should be the indexlist directly from Msg16, not
		// after subtracting the one from Msg15
		//if ( (*p & 0x01) == 0x00 ) continue;
		// skip if it's not to be considered
		//fprintf(stderr,"%hhu\n",p[11]);
		//if ( (p[11] & mask) != 0 ) continue;
		if ( ((termIds[i]>>(NUMTERMIDBITS-8)) & mask) != 0 ) continue;
		// add it
		//d[nd++] = *(int32_t *)(p+12-5); // last byte has del bit, etc.
		d[nd] = (uint32_t)(termIds[i] >> (NUMTERMIDBITS-32));
		// 0 has special meaning, it terminates the vector
		if ( d[nd] == 0 ) d[nd] = 1;
		if ( ++nd < 3000 ) continue;
		// bitch and break out on error
		log(LOG_INFO,"build: Sample vector overflow. Slight "
		    "performance hit.");
		break;
	}
	// if nd was too small, don't use a mask to save time
	if ( nd < maxTerms && nd < numTerms && mask ) {
		// sanity check
		if ( mask == 0 ) {
			log (LOG_LOGIC,"build: Clusterdb sample vector mask "
			     "is already at 0.");
			char *xx = NULL; *xx = 0;
		}
		// debug msg
		//log("AGAIN");
		//val >>= 1;
		// shift the mask UP, allow more termIds to pass through
		mask <<= 1;
		goto again;
	}

	// bubble sort them
	bool flag = true;
	while ( flag ) {
		flag = false;
		for ( int32_t i = 1 ; i < nd ; i++ ) {
			if ( d[i-1] <= d[i] ) continue;
			uint32_t tmp = d[i-1];
			d[i-1] = d[i];
			d[i] = tmp;
			flag = true;
		}
	}

	if ( nd > SAMPLE_VECTOR_LEN - 1 ) nd = SAMPLE_VECTOR_LEN - 1;
	// make sure last component is a 0
	d [ nd ] = 0;
	gbmemcpy ( vec , (char *)d , (nd+1) * 4 );
}
*/

// return the percent similar
char Clusterdb::getSampleSimilarity ( char *vec0 , char *vec1 , int32_t size ) {
	// . the termIds are sorted
	// . point to each rec's sample vector of termIds
	//int32_t *t0 = (int32_t *)(vec0 + sizeof(key_t) + 3*4);
	//int32_t *t1 = (int32_t *)(vec1 + sizeof(key_t) + 3*4);
	// . we sorted them above as uint32_ts, so we must make sure
	//   we use uint32_ts here, too
	uint32_t *t0 = (uint32_t *)vec0;
	uint32_t *t1 = (uint32_t *)vec1;
	// if either is empty, return 0 to be on the safe side
	if ( *t0 == 0 ) return 0;
	if ( *t1 == 0 ) return 0;
	//int32_t size0 = *(int32_t *)(rec + sizeof(key_t));
	//int32_t *end0 = (int32_t *)(vec0 + *(int32_t *)(vec0+12));
	//int32_t *end1 = (int32_t *)(vec1 + *(int32_t *)(vec1+12));
	// how many total termIds?
	//int32_t total = (end0 - t0 + end1 - t1) / 2;
	//if ( total <= 0 ) return 0;
	// count matches between the sample vectors
	int32_t count = 0;
 loop:
	if ( ((char *)t0 - vec0) > size ) {
		log( LOG_INFO, "query: sample vector 0 is malformed. "
		     "Returning 0%% similarity." );
		return 0;
	}
	if ( ((char *)t1 - vec1) > size ) {
		log( LOG_INFO, "query: sample vector 1 is malformed. "
		     "Returning 0%% similarity." );
		return 0;
	}

	// terminate on a 0
	if      ( *t0 < *t1 ) { if ( *++t0 == 0 ) goto done; }
	else if ( *t1 < *t0 ) { if ( *++t1 == 0 ) goto done; }
	else {
		// if both are zero... do not inc count
		if ( *t0 == 0 ) goto done;
		count++;
		t0++;
		t1++;
		if ( *t0 == 0 ) goto done;
		if ( *t1 == 0 ) goto done;
	}
	goto loop;

 done:
	// count total components in each sample vector
	while ( *t0 ) {
		t0++;
		if ( ((char *)t0 - vec0) > size ) {
			log( LOG_INFO, "query: sample vector 0 is malformed. "
			     "Returning 0%% similarity." );
			return 0;
		}
	}
	while ( *t1 ) {
		t1++;
		if ( ((char *)t1 - vec1) > size ) {
			log( LOG_INFO, "query: sample vector 1 is malformed. "
			     "Returning 0%% similarity." );
			return 0;
		}
	}
	int32_t total = 0;
	total += t0 - ((uint32_t *)vec0);
	total += t1 - ((uint32_t *)vec1);
	// how similar are they?
	// if both are empty, assume not similar at all. this happens if we
	// do not have a content vector for either, or if both are small docs
	// with no words or links in them (framesets?)
	if ( total == 0 ) return 0;
	int32_t sim = (count * 2 * 100) / total;
	if ( sim > 100 ) sim = 100;
	return (char)sim;
}
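
// A minimal usage sketch (not from the original source; the component values
// are made up purely for illustration). Each sample vector is a list of
// ascending uint32_t termId components terminated by a 0, and "size" is the
// byte size of a vector buffer. With 2 shared components out of 3 + 4 total,
// similarity = (2 * 2 * 100) / 7 = 57 percent:
/*
	uint32_t v0[] = { 3, 7, 9, 0 };
	uint32_t v1[] = { 3, 9, 11, 12, 0 };
	char sim = g_clusterdb.getSampleSimilarity ( (char *)v0 ,
						     (char *)v1 ,
						     sizeof(v1) );
	// sim is now 57
*/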

/*
// return the percent similar
char Clusterdb::getGigabitSimilarity ( char *vec0 , char *vec1 ,
				       int32_t *qtable , int32_t numSlots ) {
	// . the termIds are sorted
	// . point to each rec's sample vector of termIds
	//int32_t *t0 = (int32_t *)(vec0 + sizeof(key_t) + 3*4);
	//int32_t *t1 = (int32_t *)(vec1 + sizeof(key_t) + 3*4);
	uint32_t *t0 = (uint32_t *)vec0;
	uint32_t *t1 = (uint32_t *)vec1;
	int16_t *s0 = (int16_t *)(vec0 + 4*GIGABITS_IN_VECTOR);
	int16_t *s1 = (int16_t *)(vec1 + 4*GIGABITS_IN_VECTOR);
	int32_t i0 = 0;
	int32_t i1 = 0;
	// if both empty, cluster together... assume same topic
	//if ( *t0 == 0 && *t1 == 0 ) return 100;
	if ( *t0 == 0 && *t1 == 0 ) return 0;
	// if either is empty, return 0 to be on the safe side
	if ( *t0 == 0 ) return 0;
	if ( *t1 == 0 ) return 0;
	if ( numSlots == 0 ) return 0;
	//int32_t size0 = *(int32_t *)(rec + sizeof(key_t));
	//int32_t *end0 = (int32_t *)(vec0 + *(int32_t *)(vec0+12));
	//int32_t *end1 = (int32_t *)(vec1 + *(int32_t *)(vec1+12));
	// how many total termIds?
	//int32_t total = (end0 - t0 + end1 - t1) / 2;
	//if ( total <= 0 ) return 0;
	// count matches between the sample vectors
	int32_t count = 0;
	int32_t n;
	uint32_t mask = numSlots - 1;
 loop:
	// skip if t0[i0] matches a query term
	n = t0[i0] & mask;
	while ( qtable[n] && qtable[n] != (int32_t)t0[i0] )
		if ( ++n >= numSlots ) n = 0;
	if ( qtable[n] ) {
		s0[i0] = 0; // remove score for tallying up total
		i0++; if (t0[i0] == 0 || i0>=GIGABITS_IN_VECTOR) goto done; }
	// skip if t1[i1] matches a query term
	n = t1[i1] & mask;
	while ( qtable[n] && qtable[n] != (int32_t)t1[i1] )
		if ( ++n >= numSlots ) n = 0;
	if ( qtable[n] ) {
		s1[i1] = 0; // remove score for tallying up total
		i1++; if (t1[i1] == 0 || i1>=GIGABITS_IN_VECTOR) goto done; }
	// terminate on a 0
	if ( t0[i0] < t1[i1] ) {
		i0++; if (t0[i0] == 0 || i0>=GIGABITS_IN_VECTOR) goto done; }
	else if ( t1[i1] < t0[i0] ) {
		i1++; if (t1[i1] == 0 || i1>=GIGABITS_IN_VECTOR) goto done; }
	else {
		// if both are zero... do not inc count
		if ( t0[i0] == 0 ) goto done;
		//count++;
		// now we do a weighted count
		count += s0[i0] + s1[i1];
		i0++;
		i1++;
		if ( t0[i0] == 0 || i0>=GIGABITS_IN_VECTOR) goto done;
		if ( t1[i1] == 0 || i1>=GIGABITS_IN_VECTOR) goto done;
	}
	goto loop;

 done:
	// count total components in each sample vector
	while ( t0[i0] && i0 < GIGABITS_IN_VECTOR ) i0++;
	while ( t1[i1] && i1 < GIGABITS_IN_VECTOR ) i1++;
	int32_t total = 0;
	//total += t0 - ((int32_t *)vec0);
	//total += t1 - ((int32_t *)vec1);
	// get total score
	for ( int32_t i = 0 ; i < i0 ; i++ ) total += s0[i] ;
	for ( int32_t i = 0 ; i < i1 ; i++ ) total += s1[i] ;
	// how similar are they?
	// if both are empty, assume not similar at all. this happens if we
	// do not have a content vector for either, or if both are small docs
	// with no words or links in them (framesets?)
	if ( total == 0 ) return 0;
	//int32_t sim = (count * 2 * 100) / total;
	int32_t sim = (count * 100) / total;
	if ( sim > 100 ) sim = 100;
	return (char)sim;
}
*/

key_t Clusterdb::makeClusterRecKey ( int64_t docId,
				     bool familyFilter,
				     uint8_t languageBits,
				     int32_t siteHash,
				     bool isDelKey,
				     bool isHalfKey ) {
	key_t key;
	// set the docId upper bits
	key.n1 = (uint32_t)(docId >> 29);
	key.n1 &= 0x000001ff;
	// set the docId lower bits
	key.n0 = docId;
	key.n0 <<= 35;
	// set the family filter bit
	if ( familyFilter ) key.n0 |= 0x0000000400000000ULL;
	else                key.n0 &= 0xfffffffbffffffffULL;
	// set the language bits
	key.n0 |= ((uint64_t)(languageBits & 0x3f)) << 28;
	// set the site hash
	key.n0 |= (uint64_t)(siteHash & 0x03ffffff) << 2;
	// set the del bit
	if ( isDelKey ) key.n0 &= 0xfffffffffffffffeULL;
	else            key.n0 |= 0x0000000000000001ULL;
	// set half bit
	if ( !isHalfKey ) key.n0 &= 0xfffffffffffffffdULL;
	else              key.n0 |= 0x0000000000000002ULL;
	// return the key
	return key;
}
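
// A reference sketch (not in the original source) of the 96-bit cluster rec
// key layout that makeClusterRecKey() above encodes:
//
//	n1: bits  0..8   docId bits 29..37 (9 high bits)
//	n0: bits 35..63  docId bits  0..28 (29 low bits)
//	n0: bit  34      family filter bit
//	n0: bits 28..33  language bits
//	n0: bits  2..27  site hash (26 bits)
//	n0: bit   1      half bit
//	n0: bit   0      del bit (clear means delete)
//
// A hypothetical decoder for the docId, shown commented out like the other
// reference code in this file:
/*
static int64_t decodeClusterRecDocId ( key_t key ) {
	return ((int64_t)(key.n1 & 0x000001ff) << 29) |
	        (int64_t)(key.n0 >> 35);
}
*/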

/*
key_t Clusterdb::convertTitleRecKey ( key_t titleKey ) {
	// extract the docid
	int64_t docId;
	docId = titleKey.n1;
	docId <<= 6;
	docId |= titleKey.n0 >> 58;
	// extract the family filter
	bool familyFilter;
	if ( ( titleKey.n1 & 0x0100000000000000ULL ) ||
	     ( titleKey.n1 & 0x0200000000000000ULL ) )
		familyFilter = true;
	else
		familyFilter = false;
	// extract the site hash
	uint32_t siteHash;
	siteHash = (uint32_t)((titleKey.n0 >> 30) & 0x0000000003ffffffULL);
	// make and return the key
	return makeClusterRecKey ( docId, familyFilter, 0, siteHash, false );
}

void Clusterdb::makeRecFromTitleRec ( char *rec,
				      TitleRec *titleRec,
				      bool isDelKey ) {
	// get the docId
	int64_t docId = titleRec->getDocId();
	// get the family filter
	bool familyFilter = titleRec->hasAdultContent();
	// get the language byte
	unsigned char lang = titleRec->getLanguage();
	// . get the site hash
	// . this is really the host hash because the tfndb key must use
	//   the host hash in case the site changes in tagdb
	uint32_t siteHash = titleRec->getHostHash();
	// make the key and copy it to rec
	key_t key = makeClusterRecKey ( docId,
					familyFilter,
					lang,
					siteHash,
					false );
	gbmemcpy(rec, &key, sizeof(key_t));
}

void Clusterdb::makeRecFromTitleRecKey ( char *rec,
					 char *key,
					 bool isDelKey ) {
	// get the docId
	int64_t docId = g_titledb.getDocIdFromKey((key_t *)key);
	// get the family filter
	bool familyFilter = g_titledb.hasAdultContent(*(key_t *)key);
	// . get the site hash
	// . this is really the host hash because the tfndb key must use
	//   the host hash in case the site changes in tagdb
	uint32_t siteHash = g_titledb.getHostHash((key_t *)key);
	// make the key and copy it to rec
	key_t ckey = makeClusterRecKey ( docId,
					 familyFilter,
					 0,
					 siteHash,
					 false );
	gbmemcpy(rec, &ckey, sizeof(key_t));
}
*/