mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-01-22 02:18:42 -05:00
376 lines
9.6 KiB
C
376 lines
9.6 KiB
C
// Matt Wells, Copyright Apr 2001
|
|
|
|
// . don't use XOR for hashing, "dog" would be the same as "god"
|
|
|
|
#ifndef GB_HASH_H
|
|
#define GB_HASH_H
|
|
#include <inttypes.h>
|
|
#include "utf8_fast.h"
|
|
|
|
|
|
//#define SEED8 148
|
|
//#define SEED16 22081
|
|
//#define SEED32 987654321
|
|
//#define SEED64 5148502070294393521LL
|
|
//#define SEED8 9876
|
|
//#define SEED32 87654321
|
|
//#define SEED64 7651331LL
|
|
|
|
// call this before calling any hash*() routines so we can fill our table
|
|
extern const uint64_t g_hashtab[256][256];
|
|
|
|
#include "types.h"
|
|
#include "fctypes.h"
|
|
#include "Sanity.h"
|
|
|
|
bool hashinit ();
|
|
|
|
unsigned char hash8 ( const char *s , int32_t len ) ;
|
|
uint64_t hash64n_nospaces ( const char *s , int32_t len ) ;
|
|
uint32_t hash32n ( const char *s ) ;
|
|
uint32_t hash32 ( const char *s, int32_t len,uint32_t startHash=0);
|
|
uint32_t hash32h ( uint32_t h1 , uint32_t h2 ) ;
|
|
uint64_t hash64h ( uint64_t h1 , uint64_t h2 );
|
|
uint32_t hash32Lower_a ( const char *s, int32_t len,uint32_t startHash=0);
|
|
uint64_t hash64n(const char *s);
|
|
uint64_t hash64n(const char *s, int32_t len);
|
|
uint96_t hash96 ( const char *s, int32_t slen);
|
|
|
|
// . these convert \n to \0 when hashing
|
|
// . these hash all punct as a space, except for hyphen and single quote!
|
|
// . these lower-case all alnum chars, even crazy utf8 chars that can be cap'd
|
|
// . these only take utf8 strings
|
|
uint64_t hash64d ( const char *s, int32_t slen );
|
|
|
|
|
|
|
|
static inline uint64_t hash64b ( const char *s , uint64_t startHash = 0) {
|
|
uint64_t h = startHash;
|
|
int32_t i = 0;
|
|
while ( s[i] ) {
|
|
h ^= g_hashtab [(unsigned char)i] [(unsigned char)s[i]];
|
|
i++;
|
|
}
|
|
return h;
|
|
}
|
|
|
|
static inline uint64_t hash64 ( const char *s, int32_t len,
|
|
uint64_t startHash = 0 ) {
|
|
uint64_t h = startHash;
|
|
int32_t i = 0;
|
|
while ( i < len ) {
|
|
h ^= g_hashtab [(unsigned char)i] [(unsigned char)s[i]];
|
|
i++;
|
|
}
|
|
return h;
|
|
}
|
|
|
|
static inline uint64_t hash64_cont ( const char *s, int32_t len,
|
|
uint64_t startHash ,
|
|
int32_t *conti ) {
|
|
uint64_t h = startHash;
|
|
int32_t i = 0;//*conti;
|
|
while ( i < len ) {
|
|
h ^= g_hashtab [(unsigned char)(i+*conti)][(unsigned char)s[i]];
|
|
i++;
|
|
}
|
|
*conti = *conti + i;
|
|
return h;
|
|
}
|
|
|
|
static inline uint32_t hash32Fast ( uint32_t h1 , uint32_t h2 ) {
|
|
return (h2 << 1) ^ h1;
|
|
}
|
|
|
|
// . combine 2 hashes into 1
|
|
// . TODO: ensure this is a good way
|
|
// . used for combining words' hashes into phrases (also fields,collections)..
|
|
static inline uint64_t hash64 (uint64_t h1,uint64_t h2){
|
|
// treat the 16 bytes as a string now instead of multiplying them
|
|
uint64_t h = 0;
|
|
|
|
h ^= g_hashtab [ 0] [ *((unsigned char *)(&h1)+0) ] ;
|
|
h ^= g_hashtab [ 1] [ *((unsigned char *)(&h1)+1) ] ;
|
|
h ^= g_hashtab [ 2] [ *((unsigned char *)(&h1)+2) ] ;
|
|
h ^= g_hashtab [ 3] [ *((unsigned char *)(&h1)+3) ] ;
|
|
h ^= g_hashtab [ 4] [ *((unsigned char *)(&h1)+4) ] ;
|
|
h ^= g_hashtab [ 5] [ *((unsigned char *)(&h1)+5) ] ;
|
|
h ^= g_hashtab [ 6] [ *((unsigned char *)(&h1)+6) ] ;
|
|
h ^= g_hashtab [ 7] [ *((unsigned char *)(&h1)+7) ] ;
|
|
|
|
h ^= g_hashtab [ 8] [ *((unsigned char *)(&h2)+0) ] ;
|
|
h ^= g_hashtab [ 9] [ *((unsigned char *)(&h2)+1) ] ;
|
|
h ^= g_hashtab [10] [ *((unsigned char *)(&h2)+2) ] ;
|
|
h ^= g_hashtab [11] [ *((unsigned char *)(&h2)+3) ] ;
|
|
h ^= g_hashtab [12] [ *((unsigned char *)(&h2)+4) ] ;
|
|
h ^= g_hashtab [13] [ *((unsigned char *)(&h2)+5) ] ;
|
|
h ^= g_hashtab [14] [ *((unsigned char *)(&h2)+6) ] ;
|
|
h ^= g_hashtab [15] [ *((unsigned char *)(&h2)+7) ] ;
|
|
|
|
return h;
|
|
}
|
|
|
|
|
|
static inline uint64_t hash64Lower_a ( const char *s, int32_t len,
|
|
uint64_t startHash = 0) {
|
|
uint64_t h = startHash;
|
|
int32_t i = 0;
|
|
while ( i < len ) {
|
|
h ^= g_hashtab [(unsigned char)i]
|
|
[(unsigned char)to_lower_a(s[i])];
|
|
i++;
|
|
}
|
|
return h;
|
|
}
|
|
|
|
// utf8
|
|
static inline uint64_t hash64Lower_utf8 ( const char *p, int32_t len, uint64_t startHash = 0) {
|
|
uint64_t h = startHash;
|
|
uint8_t i = 0;
|
|
const char *pend = p + len;
|
|
char cs;
|
|
UChar32 x;
|
|
UChar32 y;
|
|
for ( ; p < pend ; p += cs ) {
|
|
// get the size
|
|
cs = getUtf8CharSize(p);
|
|
// deal with one ascii char quickly
|
|
if ( cs == 1 ) {
|
|
h ^= g_hashtab [i++]
|
|
[(uint8_t)to_lower_a(*p)];
|
|
continue;
|
|
}
|
|
// otherwise, lower case it
|
|
x = utf8Decode(p);
|
|
// convert to lower
|
|
y = UnicodeMaps::to_lower(x);
|
|
// back to utf8
|
|
char tmp[4];
|
|
char ncs = utf8Encode ( y , tmp );
|
|
// sanity check
|
|
if ( ncs > 4 ) { gbshutdownAbort(true); }
|
|
// i've seen this happen for 4 byte char =
|
|
// -16,-112,-51,-125 which has x=66371 and y=66371
|
|
// but utf8Encode() returned 0!
|
|
if ( ncs == 0 ) {
|
|
// let's just hash it as-is then
|
|
tmp[0] = p[0];
|
|
if ( cs >= 1 ) tmp[1] = p[1];
|
|
if ( cs >= 2 ) tmp[2] = p[2];
|
|
if ( cs >= 3 ) tmp[3] = p[3];
|
|
ncs = cs;
|
|
}
|
|
// hash it up
|
|
h ^= g_hashtab [i++][(uint8_t)tmp[0]];
|
|
if ( ncs == 1 ) continue;
|
|
h ^= g_hashtab [i++][(uint8_t)tmp[1]];
|
|
if ( ncs == 2 ) continue;
|
|
h ^= g_hashtab [i++][(uint8_t)tmp[2]];
|
|
if ( ncs == 3 ) continue;
|
|
h ^= g_hashtab [i++][(uint8_t)tmp[3]];
|
|
}
|
|
return h;
|
|
}
|
|
|
|
static inline uint64_t hash64Lower_utf8_nospaces ( const char *p, int32_t len ) {
|
|
uint64_t h = 0LL;
|
|
uint8_t i = 0;
|
|
const char *pend = p + len;
|
|
char cs;
|
|
UChar32 x;
|
|
UChar32 y;
|
|
for ( ; p < pend ; p += cs ) {
|
|
// get the size
|
|
cs = getUtf8CharSize(p);
|
|
// deal with one ascii char quickly
|
|
if ( cs == 1 ) {
|
|
// skip spaces
|
|
if ( is_wspace_a(*p) ) continue;
|
|
h ^= g_hashtab [i++]
|
|
[(uint8_t)to_lower_a(*p)];
|
|
continue;
|
|
}
|
|
// otherwise, lower case it
|
|
x = utf8Decode(p);
|
|
// convert to lower
|
|
y = UnicodeMaps::to_lower(x);
|
|
// back to utf8
|
|
char tmp[4];
|
|
char ncs = utf8Encode ( y , tmp );
|
|
// sanity check
|
|
if ( ncs > 4 ) { gbshutdownAbort(true); }
|
|
// i've seen this happen for 4 byte char =
|
|
// -16,-112,-51,-125 which has x=66371 and y=66371
|
|
// but utf8Encode() returned 0!
|
|
if ( ncs == 0 ) {
|
|
// let's just hash it as-is then
|
|
tmp[0] = p[0];
|
|
if ( cs >= 1 ) tmp[1] = p[1];
|
|
if ( cs >= 2 ) tmp[2] = p[2];
|
|
if ( cs >= 3 ) tmp[3] = p[3];
|
|
ncs = cs;
|
|
}
|
|
// hash it up
|
|
h ^= g_hashtab [i++][(uint8_t)tmp[0]];
|
|
if ( ncs == 1 ) continue;
|
|
h ^= g_hashtab [i++][(uint8_t)tmp[1]];
|
|
if ( ncs == 2 ) continue;
|
|
h ^= g_hashtab [i++][(uint8_t)tmp[2]];
|
|
if ( ncs == 3 ) continue;
|
|
h ^= g_hashtab [i++][(uint8_t)tmp[3]];
|
|
}
|
|
return h;
|
|
}
|
|
|
|
|
|
static inline uint64_t hash64Lower_utf8_cont ( const char *p,
|
|
int32_t len,
|
|
uint64_t startHash ,
|
|
int32_t *conti ) {
|
|
uint64_t h = startHash;
|
|
uint8_t i = *conti;
|
|
const char *pend = p + len;
|
|
char cs;
|
|
UChar32 x;
|
|
UChar32 y;
|
|
for ( ; p < pend ; p += cs ) {
|
|
// get the size
|
|
cs = getUtf8CharSize(p);
|
|
// deal with one ascii char quickly
|
|
if ( cs == 1 ) {
|
|
h ^= g_hashtab [i++][(uint8_t)to_lower_a(*p)];
|
|
continue;
|
|
}
|
|
|
|
// otherwise, lower case it
|
|
x = utf8Decode(p);
|
|
|
|
// convert to lower
|
|
y = UnicodeMaps::to_lower(x);
|
|
|
|
// back to utf8
|
|
char tmp[4];
|
|
char ncs = utf8Encode ( y , tmp );
|
|
|
|
// sanity check
|
|
if ( ncs > 4 ) { gbshutdownAbort(true); }
|
|
|
|
// i've seen this happen for 4 byte char =
|
|
// -16,-112,-51,-125 which has x=66371 and y=66371
|
|
// but utf8Encode() returned 0!
|
|
if ( ncs == 0 ) {
|
|
// let's just hash it as-is then
|
|
tmp[0] = p[0];
|
|
if ( cs >= 1 ) tmp[1] = p[1];
|
|
if ( cs >= 2 ) tmp[2] = p[2];
|
|
if ( cs >= 3 ) tmp[3] = p[3];
|
|
ncs = cs;
|
|
}
|
|
|
|
// hash it up
|
|
h ^= g_hashtab [i++][(uint8_t)tmp[0]];
|
|
if ( ncs == 1 )
|
|
continue;
|
|
h ^= g_hashtab [i++][(uint8_t)tmp[1]];
|
|
if ( ncs == 2 )
|
|
continue;
|
|
h ^= g_hashtab [i++][(uint8_t)tmp[2]];
|
|
if ( ncs == 3 )
|
|
continue;
|
|
h ^= g_hashtab [i++][(uint8_t)tmp[3]];
|
|
}
|
|
// update this so caller can re-call with the right i
|
|
*conti = i;
|
|
return h;
|
|
}
|
|
|
|
static inline uint32_t hash32_cont ( const char *p, int32_t plen,
|
|
uint32_t startHash , int32_t *conti ) {
|
|
uint32_t h = startHash;
|
|
uint8_t i = *conti;
|
|
const char *pend = p + plen;
|
|
for ( ; p < pend ; p++ ) {
|
|
h ^= (uint32_t)g_hashtab [i++] [(uint8_t)(*p)];
|
|
}
|
|
// update this so caller can re-call with the right i
|
|
*conti = i;
|
|
return h;
|
|
}
|
|
|
|
|
|
// utf8
|
|
|
|
// exactly like above but p is NULL terminated for sure
|
|
static inline uint64_t hash64Lower_utf8 ( const char *p ) {
|
|
uint64_t h = 0;
|
|
uint8_t i = 0;
|
|
UChar32 x;
|
|
UChar32 y;
|
|
char cs;
|
|
for ( ; *p ; p += cs ) {
|
|
// get the size
|
|
cs = getUtf8CharSize(p);
|
|
// deal with one ascii char quickly
|
|
if ( cs == 1 ) {
|
|
h ^= g_hashtab [i++]
|
|
[(uint8_t)to_lower_a(*p)];
|
|
continue;
|
|
}
|
|
// otherwise, lower case it
|
|
x = utf8Decode(p);
|
|
// convert to lower
|
|
y = UnicodeMaps::to_lower(x);
|
|
// back to utf8
|
|
char tmp[4];
|
|
char ncs = utf8Encode(y, tmp);
|
|
// sanity check
|
|
if ( ncs > 4 ) { gbshutdownAbort(true); }
|
|
// i've seen this happen for 4 byte char =
|
|
// -16,-112,-51,-125 which has x=66371 and y=66371
|
|
// but utf8Encode() returned 0!
|
|
if ( ncs == 0 ) {
|
|
// let's just hash it as-is then
|
|
tmp[0] = p[0];
|
|
if ( cs >= 1 ) tmp[1] = p[1];
|
|
if ( cs >= 2 ) tmp[2] = p[2];
|
|
if ( cs >= 3 ) tmp[3] = p[3];
|
|
ncs = cs;
|
|
}
|
|
// hash it up
|
|
h ^= g_hashtab [i++][(uint8_t)tmp[0]];
|
|
if ( ncs == 1 ) continue;
|
|
h ^= g_hashtab [i++][(uint8_t)tmp[1]];
|
|
if ( ncs == 2 ) continue;
|
|
h ^= g_hashtab [i++][(uint8_t)tmp[2]];
|
|
if ( ncs == 3 ) continue;
|
|
h ^= g_hashtab [i++][(uint8_t)tmp[3]];
|
|
}
|
|
return h;
|
|
}
|
|
|
|
|
|
static inline uint64_t hash64Upper_a ( const char *s , int32_t len ,
|
|
uint64_t startHash ) {
|
|
uint64_t h = startHash;
|
|
int32_t i = 0;
|
|
while ( i < len ) {
|
|
h ^= g_hashtab [(unsigned char)i]
|
|
[(unsigned char)to_upper_a(s[i])];
|
|
i++;
|
|
}
|
|
return h;
|
|
}
|
|
|
|
|
|
static inline uint32_t hashLong ( uint32_t x ) {
|
|
uint32_t h = 0;
|
|
unsigned char *p = (unsigned char *)&x;
|
|
h ^= (uint32_t) g_hashtab [0][p[0]];
|
|
h ^= (uint32_t) g_hashtab [1][p[1]];
|
|
h ^= (uint32_t) g_hashtab [2][p[2]];
|
|
h ^= (uint32_t) g_hashtab [3][p[3]];
|
|
return h;
|
|
}
|
|
|
|
#endif // GB_HASH_H
|