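Speed up the whitelist hash table by roughly 20x

by using hash table "key magic": when m_useKeyMagic is set, HashTableX XORs the key word with a g_hashtab entry (selected by the key's first byte at m_maskKeyOffset) before applying the slot mask. Measured at about 1444ms vs 27000ms for the whitelist table.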
This commit is contained in:
Matt Wells
2013-09-15 21:10:53 -07:00
parent 928dc36a03
commit 991e2f30f7
7 changed files with 54 additions and 11 deletions

@ -14,6 +14,7 @@ void HashTableX::constructor() {
m_doFree = false;
m_isWritable = true;
m_txtBuf = NULL;
m_useKeyMagic = false;
}
void HashTableX::destructor() {
@ -82,6 +83,7 @@ void HashTableX::reset ( ) {
m_numSlotsUsed = 0;
m_addIffNotUnique = false;
m_maskKeyOffset = 0;
m_useKeyMagic = false;
// we should free it in reset()
if ( m_doFree && m_txtBuf ) {
mfree ( m_txtBuf , m_txtBufSize,"ftxtbuf");
@ -135,8 +137,16 @@ long HashTableX::getCount ( void *key ) {
// . returns -1 if key not in hash table
long HashTableX::getOccupiedSlotNum ( void *key ) {
if ( m_numSlots <= 0 ) return -1;
long n = *(unsigned long *)(((char *)key)+m_maskKeyOffset);
// use magic to "randomize" key a little
if ( m_useKeyMagic )
n^=g_hashtab[(unsigned char)((char *)key)[m_maskKeyOffset]][0];
// mask on the lower 32 bits i guess
long n = (*(unsigned long *)(((char *)key)+m_maskKeyOffset)) & m_mask;
n &= m_mask;
long count = 0;
while ( count++ < m_numSlots ) {
// this is set to 0x01 if non-empty
@ -176,7 +186,18 @@ bool HashTableX::addKey ( void *key , void *val , long *slot ) {
if ( growTo > m_maxSlots ) growTo = m_maxSlots;
if ( ! setTableSize ( (long)growTo , NULL , 0 ) ) return false;
}
long n = (*(unsigned long *)(((char *)key)+m_maskKeyOffset)) & m_mask;
//long n=(*(unsigned long *)(((char *)key)+m_maskKeyOffset)) & m_mask;
long n = *(unsigned long *)(((char *)key)+m_maskKeyOffset);
// use magic to "randomize" key a little
if ( m_useKeyMagic )
n^=g_hashtab[(unsigned char)((char *)key)[m_maskKeyOffset]][0];
// mask on the lower 32 bits i guess
n &= m_mask;
long count = 0;
m_needsSave = true;
while ( count++ < m_numSlots ) {

@ -408,6 +408,8 @@ class HashTableX {
char *m_buf;
long m_bufSize;
char m_useKeyMagic;
long m_ks;
long m_ds;
char m_allowDups;

@ -1368,6 +1368,9 @@ bool gotResults ( void *state ) {
args.safePrintf("&sb=1");
if ( ! si->m_showBanned && si->m_isAdmin )
args.safePrintf("&sb=0");
// carry over the sites we are restricting the search results to
if ( si->m_whiteListBuf.length() )
args.safePrintf("&sites=%s",si->m_whiteListBuf.getBufStart());
if ( firstNum > 0 && ! si->m_xml ) {

@ -562,7 +562,7 @@ void PosdbTable::reset() {
m_qiBuf.reset();
// assume no-op
m_t1 = 0LL;
m_whiteTable.reset();
m_whiteListTable.reset();
m_addedSites = false;
}
@ -664,14 +664,26 @@ bool PosdbTable::allocWhiteListTable ( ) {
sum += size / 12 + 1;
}
if ( sum ) {
// making this sum * 3 does not show a speedup... hmmm...
long numSlots = sum * 2;
// keep it restricted to 5 byte keys so we do not have to
// extract the docid, we can just hash the ptr to those
// 5 bytes (which includes 1 siterank bit as the lowbit,
// but should be ok since it should be set the same in
// all termlists that have that docid)
if ( ! m_whiteTable.set(5,0,numSlots,NULL,0,false,0,"wtall"))
if ( ! m_whiteListTable.set(5,0,numSlots,NULL,0,false,
0,"wtall"))
return false;
// try to speed up. wow, this slowed it down about 4x!!
//m_whiteListTable.m_maskKeyOffset = 1;
//
////////////
//
// this seems to make it like 20x faster... 1444ms vs 27000ms:
//
////////////
//
m_whiteListTable.m_useKeyMagic = true;
}
return true;
}
@ -4672,7 +4684,7 @@ void PosdbTable::addDocIdVotes ( QueryTermInfo *qti , long listGroupNum ) {
// the 6 bytes of the docid ptr as is though since the siterank
// should be the same for the site: terms we indexed for the same
// docid!!
if ( m_useWhiteTable && ! m_whiteTable.isInTable(minRecPtr+7) )
if ( m_useWhiteTable && ! m_whiteListTable.isInTable(minRecPtr+7) )
goto getMin;
@ -4903,12 +4915,16 @@ void PosdbTable::intersectLists10_r ( ) {
for ( long i = 0 ; ! m_addedSites && i < nw ; i++ ) {
RdbList *list = &whiteLists[i];
if ( list->isEmpty() ) continue;
// sanity test
long long d1 = g_posdb.getDocId(list->getList());
if ( d1 > m_msg2->m_docIdEnd ) { char *xx=NULL;*xx=0; }
if ( d1 < m_msg2->m_docIdStart ) { char *xx=NULL;*xx=0; }
// first key is always 18 bytes cuz it has the termid
// scan recs in the list
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
char *rec = list->getCurrentRec();
// point to the 5 bytes of docid
m_whiteTable.addKey ( rec + 7 );
m_whiteListTable.addKey ( rec + 7 );
}
}
m_addedSites = true;

@ -446,7 +446,7 @@ class PosdbTable {
class Msg2 *msg2,
class Msg39Request *r );
// pre-allocate m_whiteTable
// pre-allocate m_whiteListTable
bool allocWhiteListTable ( ) ;
// pre-allocate memory since intersection runs in a thread
@ -579,7 +579,7 @@ class PosdbTable {
// the new intersection/scoring algo
void intersectLists10_r ( );
HashTableX m_whiteTable;
HashTableX m_whiteListTable;
bool m_useWhiteTable;
bool m_addedSites;

@ -1637,7 +1637,7 @@ float Title::getSimilarity ( Words *w1 , long i0 , long i1 ,
// prepare for next link, it may never come if we're last one!
//oldi = i;
// add to table
if ( ! table.addKey ( (long)wid , score , NULL ) )
if ( ! table.addKey ( (long)wid , (long)score , NULL ) )
return -1.0;
// if no last wid, continue
if ( lastWid == -1LL ) {lastWid=wid;lastScore=score;continue; }
@ -1657,7 +1657,7 @@ float Title::getSimilarity ( Words *w1 , long i0 , long i1 ,
// "adding pid=%li score=%.02f sum=%.02f",
// (long)pid,phrScore,sum);
// now add that
if ( ! table.addKey ( (long)pid , phrScore , NULL ) )
if ( ! table.addKey ( (long)pid , (long)phrScore , NULL ) )
return -1.0;
// we are now the last wid
lastWid = wid;

@ -1068,7 +1068,8 @@ long Words::getLanguage( Sections *sections ,
//long wordBase = 0;
long wordi = 0;
//if ( ! ht.set(maxSamples*1.5) ) return -1;
if ( ! ht.set(8,1,maxSamples*8.0,NULL,0,false,niceness,"wordslang"))
if ( ! ht.set(8,1,(long)(maxSamples*8.0),NULL,0,false,
niceness,"wordslang"))
return -1;
// . avoid words in these bad sections