privacore-open-source-searc.../matches2.cpp
2018-08-07 15:44:56 +02:00

339 lines
11 KiB
C++

#include "matches2.h"
#include "fctypes.h"
#include "utf8_fast.h"
// . get the first substring in "haystack" that matches a string in "needles"
// . set *needleNum to the # of the needle in "needles" that matched
// . return a ptr into "haystack" that matches, NULL if none
// . if "linkPos" is non-NULL then certain types of matches must occur
// BEFORE "linkPos" in order to be counted. those certain types are indicated
// by a Needle::m_section of 1. this is for allowing links before comment
// sections to be able to vote, and links in/after comment section to not
// vote. (see LinkText.cpp's call to getMatch() to better understand this)
// . ALL needles.m_strings MUST BE IN LOWER CASE!! otherwise, we should convert
// to lower and store into tmp[]. TODO.
// . a space (includes \r \n) in a needle will match a consecutive sequence
// of spaces in the haystack
typedef uint64_t BITVEC;
char *getMatches2(const Needle *needles, NeedleMatch *needlesMatch, int32_t numNeedles,
const char *haystack, int32_t haystackSize, char *linkPos, bool *hadPreMatch) {
// assume not
if ( hadPreMatch ) *hadPreMatch = false;
// empty haystack? then no matches
if ( ! haystack || haystackSize <= 0 ) return NULL;
// JAB: no needles? then no matches
if ( ! needles || numNeedles <= 0 ) return NULL;
// . set up the quick tables.
// . utf16 is not as effective here because half the bytes are zeroes!
// . TODO: use a static cache of like 4 of these tables where the key
// is the Needles ptr ... done
int32_t numNeedlesToInit = numNeedles;
char space[256 * 4 * sizeof(BITVEC)];
char *buf = NULL;
BITVEC *s0;
BITVEC *s1;
BITVEC *s2;
BITVEC *s3;
if(!buf) {
buf = space;
memset ( buf , 0 , sizeof(BITVEC)*256*4);
}
// try 64 bit bit vectors now since we doubled # of needles
int32_t offset = 0;
s0 = (BITVEC *)(buf + offset);
offset += sizeof(BITVEC)*256;
s1 = (BITVEC *)(buf + offset);
offset += sizeof(BITVEC)*256;
s2 = (BITVEC *)(buf + offset);
offset += sizeof(BITVEC)*256;
s3 = (BITVEC *)(buf + offset);
offset += sizeof(BITVEC)*256;
// set the letter tables, s0[] through sN[], for each needle
for ( int32_t i = 0 ; i < numNeedlesToInit ; i++ ) {
unsigned char *w = (unsigned char *)needles[i].m_string;
unsigned char *wend = w + needles[i].m_stringSize;
// BITVEC is now 64 bits
BITVEC mask = ((BITVEC)1)<<(i&0x3f); // (1<<(i%64));
// if the needle is small, fill up the remaining letter tables
// with its mask... so it matches any character in haystack.
s0[(unsigned char)to_lower_a(*w)] |= mask;
s0[(unsigned char)to_upper_a(*w)] |= mask;
w += 1;//step;
if ( w >= wend ) {
for ( int32_t j = 0 ; j < 256 ; j++ ) {
s1[j] |= mask;
s2[j] |= mask;
s3[j] |= mask;
}
continue;
}
s1[(unsigned char)to_lower_a(*w)] |= mask;
s1[(unsigned char)to_upper_a(*w)] |= mask;
w += 1;//step;
if ( w >= wend ) {
for ( int32_t j = 0 ; j < 256 ; j++ ) {
s2[j] |= mask;
s3[j] |= mask;
}
continue;
}
s2[(unsigned char)to_lower_a(*w)] |= mask;
s2[(unsigned char)to_upper_a(*w)] |= mask;
w += 1;//step;
if ( w >= wend ) {
for ( int32_t j = 0 ; j < 256 ; j++ ) {
s3[j] |= mask;
}
continue;
}
s3[(unsigned char)to_lower_a(*w)] |= mask;
s3[(unsigned char)to_upper_a(*w)] |= mask;
w += 1;//step;
}
// return a ptr to the first match if we should, this is it
char *retVal = NULL;
// now find the first needle in the haystack
unsigned char *p = (unsigned char *)haystack;
unsigned char *pend = (unsigned char *)haystack + haystackSize;
char *dend = (char *)pend;
// do not breach!
pend -= 4;
for ( ; p < pend ; p++ ) {
// analytics...
// is this a possible match? (this should be VERY fast)
BITVEC mask = s0[*(p+0)];
if ( ! mask ) continue;
mask &= s1[*(p+1)];
if ( ! mask ) continue;
mask &= s2[*(p+2)];
if ( ! mask ) continue;
mask &= s3[*(p+3)];
if ( ! mask ) continue;
// XXX: do a hashtable lookup here so we have the candidate
// matches in a chain...
// XXX: for small needles which match frequently let's have
// a single char hash table, a 2 byte char hash table,
// etc. so if we have small needles we check the hash
// in those tables first, but only if mask & SMALL_NEEDLE
// is true! the single byte needle hash table can just
// be a lookup table. just XOR the bytes together for
// the hash.
// XXX: just hash the mask into a table to get candidate
// matches in a chain? but there's 4B hashes!!
// we got a good candidate, loop through all the needles
for ( int32_t j = 0 ; j < numNeedles ; j++ ) {
// skip if does not match mask, will save time
if ( ! ((1<<(j&0x3f)) & mask) ) continue;
if( needles[j].m_stringSize > 3) {
// ensure first 4 bytes matches this needle's
if (needles[j].m_string[0]!=to_lower_a(*(p+0)))
continue;
if (needles[j].m_string[1]!=to_lower_a(*(p+1)))
continue;
if (needles[j].m_string[2]!=to_lower_a(*(p+2)))
continue;
if (needles[j].m_string[3]!=to_lower_a(*(p+3)))
continue;
}
// get needle size
int32_t msize = needles[j].m_stringSize;
// can p possibly be big enough?
if ( pend - p < msize ) continue;
// needle is "m" now
const char *m = needles[j].m_string;
const char *mend = needles[j].m_stringSize + m;
// use a tmp ptr for ptr into haystack
char *d = (char *)p;
// skip first 4 bytes since we know they match
if(msize > 3) {
d += 4;
m += 4;
}
// loop over each char in "m"
for ( ; m < mend ; m++ ) {
// if we are a non alnum, that will match
// any string of non-alnums, like a space
// for instance. the 0 byte does not count
// because it is used in utf16 a lot. this
// may trigger some false matches in utf16
// but, oh well... this way "link partner"
// will match "link - partner" in the haystk
if ( is_wspace_a(*m) && m < mend ) {
// skip all in "d" then.
while (d<dend&&is_wspace_a(*d)) d++;
// advance m then
continue;
}
// make sure we match otherwise
if ( *m != to_lower_a(*d) ) break;
// ok, we matched, go to next
d++;
}
// if not null, keep going
if ( m < mend ) continue;
// if this needle is "special" AND it occurs AFTER
// linkPos, then do not consider it a match. this is
// if we have a comment section indicator, like
// "div id=\"comment" AND it occurs AFTER linkPos
// (the char ptr to our link in the haystack) then
// the match does not count.
if ( linkPos && needles[j].m_isSection &&
(char *)p>linkPos ) {
// record this for LinkText.cpp
if ( hadPreMatch ) *hadPreMatch = true;
continue;
}
// store ptr if NULL
if ( ! needlesMatch[j].m_firstMatch )
needlesMatch[j].m_firstMatch = (char *)p;
// otherwise, just count it
needlesMatch[j].m_count++;
// see if we match another needle, fixes bug
// of matching "anal" but not "analy[tics]"
continue;
}
// ok, we did not match any needles, advance p and try again
}
//
// HACK:
//
// repeat above loop but for the last 4 characters in haystack!!
// this fixes a electric fence mem breach core
//
// it is slower because we check for \0
//
pend += 4;
for ( ; p < pend ; p++ ) {
// is this a possible match? (this should be VERY fast)
BITVEC mask = s0[*(p+0)];
if ( ! mask ) continue;
if ( p+1 < pend ) {
mask &= s1[*(p+1)];
if ( ! mask ) continue;
}
if ( p+2 < pend ) {
mask &= s2[*(p+2)];
if ( ! mask ) continue;
}
if ( p+3 < pend ) {
mask &= s3[*(p+3)];
if ( ! mask ) continue;
}
// XXX: do a hashtable lookup here so we have the candidate
// matches in a chain...
// XXX: for small needles which match frequently let's have
// a single char hash table, a 2 byte char hash table,
// etc. so if we have small needles we check the hash
// in those tables first, but only if mask & SMALL_NEEDLE
// is true! the single byte needle hash table can just
// be a lookup table. just XOR the bytes together for
// the hash.
// XXX: just hash the mask into a table to get candidate
// matches in a chain? but there's 4B hashes!!
// we got a good candidate, loop through all the needles
for ( int32_t j = 0 ; j < numNeedles ; j++ ) {
// skip if does not match mask, will save time
if ( ! ((1<<(j&0x3f)) & mask) ) continue;
if( needles[j].m_stringSize > 3) {
// ensure first 4 bytes matches this needle's
if (needles[j].m_string[0]!=to_lower_a(*(p+0)))
continue;
if (!p[1] ||
needles[j].m_string[1]!=to_lower_a(*(p+1)))
continue;
if (!p[2] ||
needles[j].m_string[2]!=to_lower_a(*(p+2)))
continue;
if (!p[3] ||
needles[j].m_string[3]!=to_lower_a(*(p+3)))
continue;
}
// get needle size
int32_t msize = needles[j].m_stringSize;
// can p possibly be big enough?
if ( pend - p < msize ) continue;
// needle is "m" now
const char *m = needles[j].m_string;
const char *mend = needles[j].m_stringSize + m;
// use a tmp ptr for ptr into haystack
char *d = (char *)p;
// skip first 4 bytes since we know they match
if(msize > 3) {
d += 4;
m += 4;
}
// loop over each char in "m"
//for ( ; *m ; m++ ) {
for ( ; m < mend ; m++ ) {
// if we are a non alnum, that will match
// any string of non-alnums, like a space
// for instance. the 0 byte does not count
// because it is used in utf16 a lot. this
// may trigger some false matches in utf16
// but, oh well... this way "link partner"
// will match "link - partner" in the haystk
if ( is_wspace_a(*m) && m < mend ) {
// skip all in "d" then.
while (d<dend&&is_wspace_a(*d)) d++;
// advance m then
continue;
}
// make sure we match otherwise
if ( *m != to_lower_a(*d) ) break;
// ok, we matched, go to next
d++;
}
// if not null, keep going
if ( m < mend ) continue;
// if this needle is "special" AND it occurs AFTER
// linkPos, then do not consider it a match. this is
// if we have a comment section indicator, like
// "div id=\"comment" AND it occurs AFTER linkPos
// (the char ptr to our link in the haystack) then
// the match does not count.
if ( linkPos && needles[j].m_isSection &&
(char *)p>linkPos ) {
// record this for LinkText.cpp
if ( hadPreMatch ) *hadPreMatch = true;
continue;
}
// store ptr if NULL
if ( ! needlesMatch[j].m_firstMatch )
needlesMatch[j].m_firstMatch = (char *)p;
// otherwise, just count it
needlesMatch[j].m_count++;
// advance to next char in the haystack
break;
}
// ok, we did not match any needles, advance p and try again
}
// before we exit, clean up
return retVal;
}