forked from Mirrors/privacore-open-source-search-engine
323 lines
9.0 KiB
C++
323 lines
9.0 KiB
C++
//#include "gb-include.h"
|
|
|
|
#include "Phrases.h"
|
|
#include "Words.h"
|
|
#include "Bits.h"
|
|
#include "Mem.h"
|
|
#include "Conf.h"
|
|
#include "Sanity.h"
|
|
|
|
|
|
Phrases::Phrases() : m_buf(NULL) {
|
|
|
|
memset(m_localBuf, 0, sizeof(m_localBuf));
|
|
|
|
// Coverity
|
|
m_bufSize = 0;
|
|
m_phraseIds2 = NULL;
|
|
m_numWordsTotal2 = NULL;
|
|
m_numPhrases = 0;
|
|
m_words = NULL;
|
|
m_wids = NULL;
|
|
m_wptrs = NULL;
|
|
m_wlens = NULL;
|
|
m_bits = NULL;
|
|
|
|
reset();
|
|
}
|
|
|
|
Phrases::~Phrases ( ) {
|
|
reset();
|
|
}
|
|
|
|
void Phrases::reset() {
|
|
if ( m_buf && m_buf != m_localBuf ) {
|
|
mfree ( m_buf , m_bufSize , "Phrases" );
|
|
}
|
|
m_buf = NULL;
|
|
}
|
|
|
|
|
|
// initialize this token array with the string, "s" of length, "len".
|
|
bool Phrases::set( const Words *words, const Bits *bits ) {
|
|
// reset in case being re-used
|
|
reset();
|
|
|
|
// ensure we have words
|
|
if ( ! words ) return true;
|
|
|
|
// . we have one phrase per word
|
|
// . a phrase #n is "empty" if spam[n] == PSKIP
|
|
m_numPhrases = words->getNumWords();
|
|
|
|
// how much mem do we need?
|
|
int32_t need = m_numPhrases * (8+1);
|
|
|
|
// alloc if we need to
|
|
if ( (unsigned)need > sizeof(m_localBuf) )
|
|
m_buf = (char *)mmalloc ( need , "Phrases" );
|
|
else
|
|
m_buf = m_localBuf;
|
|
|
|
if ( ! m_buf ) {
|
|
log(LOG_WARN, "query: Phrases::set: %s",mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
|
|
m_bufSize = need;
|
|
|
|
// set up arrays
|
|
char *p = m_buf;
|
|
|
|
// phrase not using stop words
|
|
m_phraseIds2 = (int64_t *)p;
|
|
p += m_numPhrases * 8;
|
|
|
|
m_numWordsTotal2 = (unsigned char *)p;
|
|
p += m_numPhrases * 1;
|
|
|
|
// sanity
|
|
if ( p != m_buf + need ) gbshutdownLogicError();
|
|
|
|
// point to this info while we parse
|
|
m_words = words;
|
|
m_wptrs = words->getWordPtrs();
|
|
m_wlens = words->getWordLens();
|
|
m_wids = words->getWordIds();
|
|
m_bits = bits;
|
|
|
|
// . set the phrases
|
|
// . sets m_phraseIds [i]
|
|
// . sets m_phraseSpam[i] to PSKIP if NO phrase exists
|
|
for ( int32_t i = 0 ; i < words->getNumWords() ; ++i ) {
|
|
if ( ! m_wids[i] ) {
|
|
continue;
|
|
}
|
|
|
|
setPhrase ( i );
|
|
}
|
|
|
|
// success
|
|
return true;
|
|
}
|
|
|
|
// . add the phrase that starts with the ith word
|
|
// . "read Of Mice and Men" should make 3 phrases:
|
|
// . read.ofmice
|
|
// . ofmice
|
|
// . mice.andmen
|
|
void Phrases::setPhrase ( int32_t i ) {
|
|
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 " BEGIN", i);
|
|
|
|
// hash of the phrase
|
|
int64_t h = 0LL;
|
|
|
|
// the hash of the two-word phrase
|
|
int64_t h2 = 0LL;
|
|
|
|
// reset
|
|
unsigned char pos = 0;
|
|
|
|
// now look for other tokens that should follow the ith token
|
|
int32_t nw = m_words->getNumWords();
|
|
int32_t numWordsInPhrase = 1;
|
|
|
|
// we need to hash "1 / 8" differently from "1.8" from "1,000" etc.
|
|
char isNum = is_digit(m_wptrs[i][0]);
|
|
|
|
// do not include punct/tag words in the m_numWordsTotal[j] count
|
|
// of the total words in the phrase. these are just usesless tails.
|
|
int32_t lastWordj = -1;
|
|
|
|
// loop over following words
|
|
bool hasHyphen;
|
|
bool hasStopWord2 ;
|
|
|
|
// . NOTE: a token can start a phrase but NOT be in it.
|
|
// . like a large number for example.
|
|
// . wordId is the lower ascii hash of the ith word
|
|
// . NO... this is allowing the query operator PiiPe to start
|
|
// a phrase but not be in it, then the phrase id ends up just
|
|
// being the following word's id. causing the synonyms code to
|
|
// give a synonym which it should not un Synonyms::set()
|
|
if ( ! m_bits->canBeInPhrase(i) ) {
|
|
// so indeed, skip it then
|
|
goto nophrase;
|
|
}
|
|
|
|
h = m_wids[i];
|
|
|
|
// set position
|
|
pos = (unsigned char)m_wlens[i];
|
|
|
|
hasHyphen = false;
|
|
hasStopWord2 = m_bits->isStopWord(i);
|
|
|
|
for( int32_t j = i + 1 ; j < nw ; j++ ) {
|
|
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". LOOP START", i, j, m_wids[i], m_wids[j] );
|
|
|
|
// Do not allow more than 32 alnum/punct "words" in a phrase.
|
|
// Tthis prevents phrases with 100,000 words from slowing
|
|
// us down. would put us in a huge double-nested for loop
|
|
// BR: But it will never happen? It breaks out of the loop
|
|
// when the phrase contains 2 (real) words?
|
|
if ( j > i + 32 ) {
|
|
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". j > i+32. no phrase", i, j, m_wids[i], m_wids[j] );
|
|
goto nophrase;
|
|
}
|
|
|
|
// deal with punct words
|
|
if ( ! m_wids[j] ) {
|
|
// if we cannot pair across word j then break
|
|
if ( !m_bits->canPairAcross( j ) ) {
|
|
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Pair cannot cross. Breaking.", i, j, m_wids[i], m_wids[j] );
|
|
break;
|
|
}
|
|
|
|
// does it have a hyphen?
|
|
if ( j == i + 1 && m_words->hasChar( j, '-' ) ) {
|
|
hasHyphen = true;
|
|
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64 ". j is hyphen, NOT adding to phrase", i, j, m_wids[i], m_wids[j] );
|
|
}
|
|
else {
|
|
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64 ". j is space, NOT adding to phrase", i, j, m_wids[i], m_wids[j] );
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// record lastWordj to indicate that word #j was a true word
|
|
lastWordj = j;
|
|
|
|
// if word #j can be in phrase then incorporate it's hash
|
|
if ( m_bits->canBeInPhrase (j) ) {
|
|
int32_t conti = pos;
|
|
|
|
// hash the jth word into the hash
|
|
h = hash64Lower_utf8_cont( m_wptrs[j], m_wlens[j], h, &conti );
|
|
|
|
pos = conti;
|
|
|
|
++numWordsInPhrase;
|
|
|
|
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". CAN be in phrase. Adding j's hash. numWordsInPhrase=%" PRId32 "", i, j, m_wids[i], m_wids[j], numWordsInPhrase);
|
|
|
|
|
|
// N-word phrases?
|
|
if ( numWordsInPhrase == 2 ) {
|
|
h2 = h;
|
|
m_numWordsTotal2[i] = j - i + 1;
|
|
hasStopWord2 = m_bits->isStopWord(j);
|
|
|
|
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Words in phrase is 2. Breaking.", i, j, m_wids[i], m_wids[j] );
|
|
break;
|
|
}
|
|
}
|
|
else {
|
|
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". j cannot be in a phrase.", i, j, m_wids[i], m_wids[j] );
|
|
}
|
|
|
|
|
|
// if we cannot pair across word j then break
|
|
if ( ! m_bits->canPairAcross (j) ) {
|
|
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Cannot pair across. Breaking.", i, j, m_wids[i], m_wids[j] );
|
|
break;
|
|
}
|
|
|
|
// otherwise, get the next word
|
|
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Get next word", i, j, m_wids[i], m_wids[j] );
|
|
}
|
|
|
|
// if we had no phrase then use 0 as id (need 2+ words to be a phrase)
|
|
if ( numWordsInPhrase <= 1 ) {
|
|
nophrase:
|
|
m_phraseIds2[i] = 0LL;
|
|
m_numWordsTotal2[i] = 0;
|
|
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", wids[i]=%20" PRIu64". END. Not a phrase. m_phraseIds2[i]=%" PRIu64 "", i, m_wids[i], m_phraseIds2[i]);
|
|
return;
|
|
}
|
|
|
|
// sanity check
|
|
if ( lastWordj == -1 ) gbshutdownLogicError();
|
|
|
|
// sanity check
|
|
if ( lastWordj - i + 1 > 255 ) gbshutdownLogicError();
|
|
|
|
// hyphen between numbers does not count (so 1-2 != 12)
|
|
if ( isNum ) hasHyphen = false;
|
|
|
|
// . the two word phrase id
|
|
// . "cd rom" -> cdrom
|
|
// . "fly paper" -> flypaper
|
|
// . "i-phone" -> iphone
|
|
// . "e-mail" -> email
|
|
if ( hasHyphen || ! hasStopWord2 ) {
|
|
m_phraseIds2[i] = h2;
|
|
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", wids[i]=%20" PRIu64". END. Has hyphen or no stopword. m_phraseIds2[i]=%" PRIu64 "", i, m_wids[i], m_phraseIds2[i] );
|
|
}
|
|
// . "st. and" !-> stand
|
|
// . "the rapist" !-> therapist
|
|
else {
|
|
m_phraseIds2[i] = h2 ^ 0x768867;
|
|
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", wids[i]=%20" PRIu64". END. either no hyphen or a stopword. m_phraseIds2[i]=%" PRIu64 "", i, m_wids[i], m_phraseIds2[i]);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// . store phrase that starts with word #i into "printBuf"
|
|
// . return bytes stored in "printBuf"
|
|
void Phrases::getPhrase(int32_t i, char *buf, size_t bufsize, int32_t *phrLen) const {
|
|
// return 0 if no phrase
|
|
if ( m_phraseIds2[i] == 0LL ) {
|
|
*buf='\0';
|
|
return;
|
|
}
|
|
|
|
// . how many words, including punct words, are in phrase?
|
|
// . this should never be 1 or less
|
|
int32_t n = m_numWordsTotal2[i] ;
|
|
|
|
char *s = buf;
|
|
char *send = buf + bufsize - 1;
|
|
for (int32_t w = i;w<i+n;w++){
|
|
if (!m_words->isAlnum(w)){
|
|
// skip spaces for now since we has altogether now
|
|
*s++ = ' ';
|
|
continue;
|
|
}
|
|
const char *w1 = m_words->getWord(w);
|
|
const char *wend = w1 + m_words->getWordLen(w);
|
|
for ( int32_t j = 0 ; j < m_words->getWordLen(w) && s<send ; j++){
|
|
// make sure not to overflow destination buffer
|
|
if( s + m_words->getWordLen(w) >= send ) {
|
|
*phrLen=0;
|
|
*buf='\0';
|
|
return;
|
|
}
|
|
|
|
// write the lower case char from w1+j into "s"
|
|
int32_t size = to_lower_utf8 ( s , send , w1 + j , wend );
|
|
// advance
|
|
j += size;
|
|
s += size;
|
|
}
|
|
}
|
|
// null terminate
|
|
*s = '\0';
|
|
|
|
// set length we wrote into "buf"
|
|
*phrLen = s - buf;
|
|
}
|
|
|
|
int32_t Phrases::getMinWordsInPhrase ( int32_t i , int64_t *pid ) const {
|
|
*pid = 0LL;
|
|
|
|
if ( m_numWordsTotal2[i] ) {
|
|
*pid = m_phraseIds2[i];
|
|
return m_numWordsTotal2[i];
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|