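Speed up the whitelist hash table by roughly 20x (about 27000ms down to 1444ms)
by enabling HashTableX key magic (m_useKeyMagic), which XORs the key with a g_hashtab entry before masking.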
This commit is contained in:
@ -14,6 +14,7 @@ void HashTableX::constructor() {
|
||||
m_doFree = false;
|
||||
m_isWritable = true;
|
||||
m_txtBuf = NULL;
|
||||
m_useKeyMagic = false;
|
||||
}
|
||||
|
||||
void HashTableX::destructor() {
|
||||
@ -82,6 +83,7 @@ void HashTableX::reset ( ) {
|
||||
m_numSlotsUsed = 0;
|
||||
m_addIffNotUnique = false;
|
||||
m_maskKeyOffset = 0;
|
||||
m_useKeyMagic = false;
|
||||
// we should free it in reset()
|
||||
if ( m_doFree && m_txtBuf ) {
|
||||
mfree ( m_txtBuf , m_txtBufSize,"ftxtbuf");
|
||||
@ -135,8 +137,16 @@ long HashTableX::getCount ( void *key ) {
|
||||
// . returns -1 if key not in hash table
|
||||
long HashTableX::getOccupiedSlotNum ( void *key ) {
|
||||
if ( m_numSlots <= 0 ) return -1;
|
||||
|
||||
long n = *(unsigned long *)(((char *)key)+m_maskKeyOffset);
|
||||
|
||||
// use magic to "randomize" key a little
|
||||
if ( m_useKeyMagic )
|
||||
n^=g_hashtab[(unsigned char)((char *)key)[m_maskKeyOffset]][0];
|
||||
|
||||
// mask on the lower 32 bits i guess
|
||||
long n = (*(unsigned long *)(((char *)key)+m_maskKeyOffset)) & m_mask;
|
||||
n &= m_mask;
|
||||
|
||||
long count = 0;
|
||||
while ( count++ < m_numSlots ) {
|
||||
// this is set to 0x01 if non-empty
|
||||
@ -176,7 +186,18 @@ bool HashTableX::addKey ( void *key , void *val , long *slot ) {
|
||||
if ( growTo > m_maxSlots ) growTo = m_maxSlots;
|
||||
if ( ! setTableSize ( (long)growTo , NULL , 0 ) ) return false;
|
||||
}
|
||||
long n = (*(unsigned long *)(((char *)key)+m_maskKeyOffset)) & m_mask;
|
||||
|
||||
//long n=(*(unsigned long *)(((char *)key)+m_maskKeyOffset)) & m_mask;
|
||||
|
||||
long n = *(unsigned long *)(((char *)key)+m_maskKeyOffset);
|
||||
|
||||
// use magic to "randomize" key a little
|
||||
if ( m_useKeyMagic )
|
||||
n^=g_hashtab[(unsigned char)((char *)key)[m_maskKeyOffset]][0];
|
||||
|
||||
// mask on the lower 32 bits i guess
|
||||
n &= m_mask;
|
||||
|
||||
long count = 0;
|
||||
m_needsSave = true;
|
||||
while ( count++ < m_numSlots ) {
|
||||
|
@ -408,6 +408,8 @@ class HashTableX {
|
||||
char *m_buf;
|
||||
long m_bufSize;
|
||||
|
||||
char m_useKeyMagic;
|
||||
|
||||
long m_ks;
|
||||
long m_ds;
|
||||
char m_allowDups;
|
||||
|
@ -1368,6 +1368,9 @@ bool gotResults ( void *state ) {
|
||||
args.safePrintf("&sb=1");
|
||||
if ( ! si->m_showBanned && si->m_isAdmin )
|
||||
args.safePrintf("&sb=0");
|
||||
// carry over the sites we are restricting the search results to
|
||||
if ( si->m_whiteListBuf.length() )
|
||||
args.safePrintf("&sites=%s",si->m_whiteListBuf.getBufStart());
|
||||
|
||||
|
||||
if ( firstNum > 0 && ! si->m_xml ) {
|
||||
|
24
Posdb.cpp
24
Posdb.cpp
@ -562,7 +562,7 @@ void PosdbTable::reset() {
|
||||
m_qiBuf.reset();
|
||||
// assume no-op
|
||||
m_t1 = 0LL;
|
||||
m_whiteTable.reset();
|
||||
m_whiteListTable.reset();
|
||||
m_addedSites = false;
|
||||
}
|
||||
|
||||
@ -664,14 +664,26 @@ bool PosdbTable::allocWhiteListTable ( ) {
|
||||
sum += size / 12 + 1;
|
||||
}
|
||||
if ( sum ) {
|
||||
// making this sum * 3 does not show a speedup... hmmm...
|
||||
long numSlots = sum * 2;
|
||||
// keep it restricted to 5 byte keys so we do not have to
|
||||
// extract the docid, we can just hash the ptr to those
|
||||
// 5 bytes (which includes 1 siterank bit as the lowbit,
|
||||
// but should be ok since it should be set the same in
|
||||
// all termlists that have that docid)
|
||||
if ( ! m_whiteTable.set(5,0,numSlots,NULL,0,false,0,"wtall"))
|
||||
if ( ! m_whiteListTable.set(5,0,numSlots,NULL,0,false,
|
||||
0,"wtall"))
|
||||
return false;
|
||||
// try to speed up. wow, this slowed it down about 4x!!
|
||||
//m_whiteListTable.m_maskKeyOffset = 1;
|
||||
//
|
||||
////////////
|
||||
//
|
||||
// this seems to make it like 20x faster... 1444ms vs 27000ms:
|
||||
//
|
||||
////////////
|
||||
//
|
||||
m_whiteListTable.m_useKeyMagic = true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@ -4672,7 +4684,7 @@ void PosdbTable::addDocIdVotes ( QueryTermInfo *qti , long listGroupNum ) {
|
||||
// the 6 bytes of the docid ptr as is though since the siterank
|
||||
// should be the same for the site: terms we indexed for the same
|
||||
// docid!!
|
||||
if ( m_useWhiteTable && ! m_whiteTable.isInTable(minRecPtr+7) )
|
||||
if ( m_useWhiteTable && ! m_whiteListTable.isInTable(minRecPtr+7) )
|
||||
goto getMin;
|
||||
|
||||
|
||||
@ -4903,12 +4915,16 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
for ( long i = 0 ; ! m_addedSites && i < nw ; i++ ) {
|
||||
RdbList *list = &whiteLists[i];
|
||||
if ( list->isEmpty() ) continue;
|
||||
// sanity test
|
||||
long long d1 = g_posdb.getDocId(list->getList());
|
||||
if ( d1 > m_msg2->m_docIdEnd ) { char *xx=NULL;*xx=0; }
|
||||
if ( d1 < m_msg2->m_docIdStart ) { char *xx=NULL;*xx=0; }
|
||||
// first key is always 18 bytes cuz it has the termid
|
||||
// scan recs in the list
|
||||
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
|
||||
char *rec = list->getCurrentRec();
|
||||
// point to the 5 bytes of docid
|
||||
m_whiteTable.addKey ( rec + 7 );
|
||||
m_whiteListTable.addKey ( rec + 7 );
|
||||
}
|
||||
}
|
||||
m_addedSites = true;
|
||||
|
4
Posdb.h
4
Posdb.h
@ -446,7 +446,7 @@ class PosdbTable {
|
||||
class Msg2 *msg2,
|
||||
class Msg39Request *r );
|
||||
|
||||
// pre-allocate m_whiteTable
|
||||
// pre-allocate m_whiteListTable
|
||||
bool allocWhiteListTable ( ) ;
|
||||
|
||||
// pre-allocate memory since intersection runs in a thread
|
||||
@ -579,7 +579,7 @@ class PosdbTable {
|
||||
// the new intersection/scoring algo
|
||||
void intersectLists10_r ( );
|
||||
|
||||
HashTableX m_whiteTable;
|
||||
HashTableX m_whiteListTable;
|
||||
bool m_useWhiteTable;
|
||||
bool m_addedSites;
|
||||
|
||||
|
@ -1637,7 +1637,7 @@ float Title::getSimilarity ( Words *w1 , long i0 , long i1 ,
|
||||
// prepare for next link, it may never come if we're last one!
|
||||
//oldi = i;
|
||||
// add to table
|
||||
if ( ! table.addKey ( (long)wid , score , NULL ) )
|
||||
if ( ! table.addKey ( (long)wid , (long)score , NULL ) )
|
||||
return -1.0;
|
||||
// if no last wid, continue
|
||||
if ( lastWid == -1LL ) {lastWid=wid;lastScore=score;continue; }
|
||||
@ -1657,7 +1657,7 @@ float Title::getSimilarity ( Words *w1 , long i0 , long i1 ,
|
||||
// "adding pid=%li score=%.02f sum=%.02f",
|
||||
// (long)pid,phrScore,sum);
|
||||
// now add that
|
||||
if ( ! table.addKey ( (long)pid , phrScore , NULL ) )
|
||||
if ( ! table.addKey ( (long)pid , (long)phrScore , NULL ) )
|
||||
return -1.0;
|
||||
// we are now the last wid
|
||||
lastWid = wid;
|
||||
|
@ -1068,7 +1068,8 @@ long Words::getLanguage( Sections *sections ,
|
||||
//long wordBase = 0;
|
||||
long wordi = 0;
|
||||
//if ( ! ht.set(maxSamples*1.5) ) return -1;
|
||||
if ( ! ht.set(8,1,maxSamples*8.0,NULL,0,false,niceness,"wordslang"))
|
||||
if ( ! ht.set(8,1,(long)(maxSamples*8.0),NULL,0,false,
|
||||
niceness,"wordslang"))
|
||||
return -1;
|
||||
|
||||
// . avoid words in these bad sections
|
||||
|
Reference in New Issue
Block a user