Speed up whitelist hashtable lookups by roughly 20x

by using HashTableX key magic: XOR the key with a g_hashtab entry before masking it into a slot number.
2013-09-15 21:10:53 -07:00
parent 928dc36a03
commit 991e2f30f7
7 changed files with 54 additions and 11 deletions
--- a/HashTableX.cpp
+++ b/HashTableX.cpp
@ -14,6 +14,7 @@ void HashTableX::constructor() {
 	m_doFree = false;
 	m_isWritable = true;
 	m_txtBuf = NULL;
+	m_useKeyMagic = false;
 }

 void HashTableX::destructor() {
@ -82,6 +83,7 @@ void HashTableX::reset ( ) {
 	m_numSlotsUsed = 0;
 	m_addIffNotUnique = false;
 	m_maskKeyOffset = 0;
+	m_useKeyMagic = false;
 	// we should free it in reset()
 	if ( m_doFree && m_txtBuf ) {
 		mfree ( m_txtBuf , m_txtBufSize,"ftxtbuf");
@ -135,8 +137,16 @@ long HashTableX::getCount ( void *key ) {
 // . returns -1 if key not in hash table
 long HashTableX::getOccupiedSlotNum ( void *key ) {
 	if ( m_numSlots <= 0 ) return -1;
+
+        long n = *(unsigned long *)(((char *)key)+m_maskKeyOffset);
+
+	// use magic to "randomize" key a little
+	if ( m_useKeyMagic ) 
+		n^=g_hashtab[(unsigned char)((char *)key)[m_maskKeyOffset]][0];
+
 	// mask on the lower 32 bits i guess
-        long n = (*(unsigned long *)(((char *)key)+m_maskKeyOffset)) & m_mask;
+        n &= m_mask;
+
        long count = 0;
        while ( count++ < m_numSlots ) {
 		// this is set to 0x01 if non-empty
@ -176,7 +186,18 @@ bool HashTableX::addKey ( void *key , void *val , long *slot ) {
 		if ( growTo > m_maxSlots ) growTo = m_maxSlots;
 		if ( ! setTableSize ( (long)growTo , NULL , 0 ) ) return false;
 	}
-        long n = (*(unsigned long *)(((char *)key)+m_maskKeyOffset)) & m_mask;
+
+        //long n=(*(unsigned long *)(((char *)key)+m_maskKeyOffset)) & m_mask;
+
+        long n = *(unsigned long *)(((char *)key)+m_maskKeyOffset);
+
+	// use magic to "randomize" key a little
+	if ( m_useKeyMagic ) 
+		n^=g_hashtab[(unsigned char)((char *)key)[m_maskKeyOffset]][0];
+
+	// mask on the lower 32 bits i guess
+        n &= m_mask;
+
        long count = 0;
 	m_needsSave = true;
        while ( count++ < m_numSlots ) {
--- a/HashTableX.h
+++ b/HashTableX.h
@ -408,6 +408,8 @@ class HashTableX {
 	char *m_buf;
 	long  m_bufSize;

+	char m_useKeyMagic;
+
 	long m_ks;
 	long m_ds;
 	char m_allowDups;
--- a/PageResults.cpp
+++ b/PageResults.cpp
@ -1368,6 +1368,9 @@ bool gotResults ( void *state ) {
 		args.safePrintf("&sb=1");
 	if ( ! si->m_showBanned && si->m_isAdmin )
 		args.safePrintf("&sb=0");
+	// carry over the sites we are restricting the search results to
+	if ( si->m_whiteListBuf.length() )
+		args.safePrintf("&sites=%s",si->m_whiteListBuf.getBufStart());
 	

 	if ( firstNum > 0 && ! si->m_xml ) {
--- a/Posdb.cpp
+++ b/Posdb.cpp
@ -562,7 +562,7 @@ void PosdbTable::reset() {
 	m_qiBuf.reset();
 	// assume no-op
 	m_t1 = 0LL;
-	m_whiteTable.reset();
+	m_whiteListTable.reset();
 	m_addedSites = false;
 }

@ -664,14 +664,26 @@ bool PosdbTable::allocWhiteListTable ( ) {
 		sum += size / 12 + 1;
 	}
 	if ( sum ) {
+		// making this sum * 3 does not show a speedup... hmmm...
 		long numSlots = sum * 2;
 		// keep it restricted to 5 byte keys so we do not have to
 		// extract the docid, we can just hash the ptr to those
 		// 5 bytes (which includes 1 siterank bit as the lowbit,
 		// but should be ok since it should be set the same in
 		// all termlists that have that docid)
-		if ( ! m_whiteTable.set(5,0,numSlots,NULL,0,false,0,"wtall"))
+		if ( ! m_whiteListTable.set(5,0,numSlots,NULL,0,false,
+					    0,"wtall"))
 			return false;
+		// try to speed up. wow, this slowed it down about 4x!!
+		//m_whiteListTable.m_maskKeyOffset = 1;
+		//
+		////////////
+		//
+		// this seems to make it like 20x faster... 1444ms vs 27000ms:
+		//
+		////////////
+		//
+		m_whiteListTable.m_useKeyMagic = true;
 	}
 	return true;
 }
@ -4672,7 +4684,7 @@ void PosdbTable::addDocIdVotes ( QueryTermInfo *qti , long   listGroupNum ) {
 	//   the 6 bytes of the docid ptr as is though since the siterank
 	//   should be the same for the site: terms we indexed for the same
 	//   docid!!
-	if ( m_useWhiteTable && ! m_whiteTable.isInTable(minRecPtr+7) )
+	if ( m_useWhiteTable && ! m_whiteListTable.isInTable(minRecPtr+7) )
 		goto getMin;
 		

@ -4903,12 +4915,16 @@ void PosdbTable::intersectLists10_r ( ) {
 	for ( long i = 0 ; ! m_addedSites && i < nw ; i++ ) {
 		RdbList *list = &whiteLists[i];
 		if ( list->isEmpty() ) continue;
+		// sanity test
+		long long d1 = g_posdb.getDocId(list->getList());
+		if ( d1 > m_msg2->m_docIdEnd ) { char *xx=NULL;*xx=0; }
+		if ( d1 < m_msg2->m_docIdStart ) { char *xx=NULL;*xx=0; }
 		// first key is always 18 bytes cuz it has the termid
 		// scan recs in the list
 		for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
 			char *rec = list->getCurrentRec();
 			// point to the 5 bytes of docid
-			m_whiteTable.addKey ( rec + 7 );
+			m_whiteListTable.addKey ( rec + 7 );
 		}
 	}
 	m_addedSites = true;
--- a/Posdb.h
+++ b/Posdb.h
@ -446,7 +446,7 @@ class PosdbTable {
 		   class Msg2 *msg2, 
 		   class          Msg39Request *r );

-	// pre-allocate m_whiteTable
+	// pre-allocate m_whiteListTable
 	bool allocWhiteListTable ( ) ;

 	// pre-allocate memory since intersection runs in a thread
@ -579,7 +579,7 @@ class PosdbTable {
 	// the new intersection/scoring algo
 	void intersectLists10_r ( );	

-	HashTableX m_whiteTable;
+	HashTableX m_whiteListTable;
 	bool m_useWhiteTable;
 	bool m_addedSites;

--- a/Title.cpp
+++ b/Title.cpp
@ -1637,7 +1637,7 @@ float Title::getSimilarity ( Words  *w1 , long i0 , long i1 ,
 		// prepare for next link, it may never come if we're last one!
 		//oldi = i;
 		// add to table
-		if ( ! table.addKey ( (long)wid , score , NULL ) ) 
+		if ( ! table.addKey ( (long)wid , (long)score , NULL ) ) 
 			return -1.0;
 		// if no last wid, continue
 		if ( lastWid == -1LL ) {lastWid=wid;lastScore=score;continue; }
@ -1657,7 +1657,7 @@ float Title::getSimilarity ( Words  *w1 , long i0 , long i1 ,
 		//     "adding pid=%li score=%.02f sum=%.02f",
 		//	     (long)pid,phrScore,sum);
 		// now add that
-		if ( ! table.addKey ( (long)pid , phrScore , NULL ) )
+		if ( ! table.addKey ( (long)pid , (long)phrScore , NULL ) )
 			return -1.0;
 		// we are now the last wid
 		lastWid   = wid;
--- a/Words.cpp
+++ b/Words.cpp
@ -1068,7 +1068,8 @@ long Words::getLanguage( Sections *sections ,
 	//long wordBase  = 0;
 	long wordi     = 0;
 	//if ( ! ht.set(maxSamples*1.5) ) return -1;
-	if ( ! ht.set(8,1,maxSamples*8.0,NULL,0,false,niceness,"wordslang")) 
+	if ( ! ht.set(8,1,(long)(maxSamples*8.0),NULL,0,false,
+		      niceness,"wordslang")) 
 		return -1;
 
 	// . avoid words in these bad sections