Files
privacore-open-source-searc…/Phrases.cpp
2017-10-01 18:04:56 +02:00

323 lines
9.0 KiB
C++

//#include "gb-include.h"
#include "Phrases.h"
#include "Words.h"
#include "Bits.h"
#include "Mem.h"
#include "Conf.h"
#include "Sanity.h"
// . construct an empty Phrases object
// . every member is given a defined value up front (keeps Coverity quiet)
//   and the built-in buffer is zeroed
Phrases::Phrases()
	: m_buf(NULL)
{
	// views onto the word list; assigned by set()
	m_words = NULL;
	m_wptrs = NULL;
	m_wlens = NULL;
	m_wids  = NULL;
	m_bits  = NULL;

	// phrase arrays and their bookkeeping; assigned by set()
	m_numPhrases     = 0;
	m_numWordsTotal2 = NULL;
	m_phraseIds2     = NULL;
	m_bufSize        = 0;

	// start with a clean local buffer
	memset(m_localBuf, 0, sizeof(m_localBuf));

	// m_buf is NULL at this point, so this releases nothing
	reset();
}
// . destructor
// . all cleanup is delegated to reset(), which frees the buffer if it was
//   allocated rather than borrowed from m_localBuf
Phrases::~Phrases()
{
	reset();
}
// . release the working buffer
// . only a buffer that is not m_localBuf is handed back to mfree()
// . m_buf is NULL afterwards in every case
void Phrases::reset()
{
	const bool allocated = ( m_buf != NULL ) && ( m_buf != m_localBuf );

	if ( allocated ) {
		mfree ( m_buf , m_bufSize , "Phrases" );
	}

	m_buf = NULL;
}
// . compute the two-word phrase id for every word in "words"
// . "bits" supplies the per-word flags (canBeInPhrase, canPairAcross,
//   isStopWord) that setPhrase() consults
// . "words" and "bits" are stored as pointers, not copied; setPhrase() and
//   getPhrase() dereference them, so they must stay valid while this object
//   is in use
// . returns false (after logging) if the buffer cannot be allocated,
//   true otherwise
// . a NULL "words" is not an error: the object is left empty and true
//   is returned
bool Phrases::set( const Words *words, const Bits *bits ) {
	// reset in case being re-used. this lets go of any previous buffer
	reset();

	// the phrase arrays pointed into the buffer reset() just released, so
	// go back to the empty state before any early exit below can leave
	// them dangling
	m_numPhrases     = 0;
	m_phraseIds2     = NULL;
	m_numWordsTotal2 = NULL;

	// ensure we have words
	if ( ! words ) return true;

	// . we have one phrase slot per word
	// . a slot is "empty" if its phrase id is 0
	const int32_t numWords = words->getNumWords();

	// how much mem do we need? per word: 8 bytes of phrase id plus
	// 1 byte of word count
	int32_t need = numWords * (8+1);

	// alloc if it does not fit into the local buffer
	if ( (unsigned)need > sizeof(m_localBuf) )
		m_buf = (char *)mmalloc ( need , "Phrases" );
	else
		m_buf = m_localBuf;

	if ( ! m_buf ) {
		log(LOG_WARN, "query: Phrases::set: %s",mstrerror(g_errno));
		return false;
	}

	m_bufSize    = need;
	m_numPhrases = numWords;

	// . setPhrase() is only called for words with a non-zero word id, so
	//   the slots of all other words would otherwise keep whatever the
	//   buffer held before (fresh heap memory or a previous run's data)
	// . zero everything so those slots read as "no phrase"
	memset ( m_buf , 0 , need );

	// set up arrays
	char *p = m_buf;

	// the 64-bit phrase ids come first ...
	m_phraseIds2 = (int64_t *)p;
	p += m_numPhrases * 8;

	// ... followed by the one-byte word counts
	m_numWordsTotal2 = (unsigned char *)p;
	p += m_numPhrases * 1;

	// sanity: the two arrays must use up exactly what we asked for
	if ( p != m_buf + need ) gbshutdownLogicError();

	// point to this info while we parse
	m_words = words;
	m_wptrs = words->getWordPtrs();
	m_wlens = words->getWordLens();
	m_wids  = words->getWordIds();
	m_bits  = bits;

	// . set the phrases
	// . sets m_phraseIds2[i] and m_numWordsTotal2[i]
	// . words with a zero word id are skipped and keep the zeroed slot
	for ( int32_t i = 0 ; i < numWords ; ++i ) {
		if ( ! m_wids[i] ) {
			continue;
		}
		setPhrase ( i );
	}

	// success
	return true;
}
// . compute the two-word phrase that starts with the ith word
// . stores the phrase hash in m_phraseIds2[i] and the number of words the
//   phrase spans, counting the words skipped in between, in
//   m_numWordsTotal2[i]
// . both are set to 0 if no phrase starts at word #i
// . example kept from the original comment:
//   "read Of Mice and Men" should make 3 phrases:
// . read.ofmice
// . ofmice
// . mice.andmen
// . NOTE(review): that example does not obviously match the two-word logic
//   below; confirm before relying on it
void Phrases::setPhrase ( int32_t i ) {
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 " BEGIN", i);
// running hash of the phrase, seeded with word #i's id further down
int64_t h = 0LL;
// the hash of the two-word phrase
int64_t h2 = 0LL;
// position value handed to and updated by hash64Lower_utf8_cont()
unsigned char pos = 0;
// now look for other tokens that should follow the ith token
int32_t nw = m_words->getNumWords();
// word #i itself counts as the first word
int32_t numWordsInPhrase = 1;
// we need to hash "1 / 8" differently from "1.8" from "1,000" etc.
char isNum = is_digit(m_wptrs[i][0]);
// do not include punct/tag words in the m_numWordsTotal[j] count
// of the total words in the phrase. these are just useless tails.
// stays -1 until the loop below reaches a word with a non-zero id
int32_t lastWordj = -1;
// . both flags are assigned before the loop below
// . they are left uninitialized here so that "goto nophrase" does not
//   jump over an initialization
bool hasHyphen;
bool hasStopWord2 ;
// . NOTE: a token can start a phrase but NOT be in it.
// . like a large number for example.
// . wordId is the lower ascii hash of the ith word
// . NO... this is allowing the query operator PiiPe to start
// a phrase but not be in it, then the phrase id ends up just
// being the following word's id. causing the synonyms code to
// give a synonym which it should not in Synonyms::set()
if ( ! m_bits->canBeInPhrase(i) ) {
// so indeed, skip it then
goto nophrase;
}
// seed the hash with the id of word #i
h = m_wids[i];
// set position to the length of word #i (the cast keeps the low 8 bits)
pos = (unsigned char)m_wlens[i];
hasHyphen = false;
// provisional value; replaced by word #j's flag once a second word is found
hasStopWord2 = m_bits->isStopWord(i);
// loop over following words
for( int32_t j = i + 1 ; j < nw ; j++ ) {
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". LOOP START", i, j, m_wids[i], m_wids[j] );
// Do not allow more than 32 alnum/punct "words" in a phrase.
// This prevents phrases with 100,000 words from slowing
// us down. would put us in a huge double-nested for loop
// BR: But it will never happen? It breaks out of the loop
// when the phrase contains 2 (real) words?
if ( j > i + 32 ) {
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". j > i+32. no phrase", i, j, m_wids[i], m_wids[j] );
goto nophrase;
}
// deal with punct words (word id of 0)
if ( ! m_wids[j] ) {
// if we cannot pair across word j then break
if ( !m_bits->canPairAcross( j ) ) {
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Pair cannot cross. Breaking.", i, j, m_wids[i], m_wids[j] );
break;
}
// does it have a hyphen? only the word directly after word #i counts
if ( j == i + 1 && m_words->hasChar( j, '-' ) ) {
hasHyphen = true;
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64 ". j is hyphen, NOT adding to phrase", i, j, m_wids[i], m_wids[j] );
}
else {
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64 ". j is space, NOT adding to phrase", i, j, m_wids[i], m_wids[j] );
}
// punct words never contribute to the hash
continue;
}
// record lastWordj to indicate that word #j was a true word
lastWordj = j;
// if word #j can be in phrase then incorporate its hash
if ( m_bits->canBeInPhrase (j) ) {
// the hash function takes the position as an int32_t pointer
int32_t conti = pos;
// hash the jth word into the hash
h = hash64Lower_utf8_cont( m_wptrs[j], m_wlens[j], h, &conti );
// stored back into an unsigned char, so only the low 8 bits survive
pos = conti;
++numWordsInPhrase;
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". CAN be in phrase. Adding j's hash. numWordsInPhrase=%" PRId32 "", i, j, m_wids[i], m_wids[j], numWordsInPhrase);
// N-word phrases? only two-word phrases are produced here
if ( numWordsInPhrase == 2 ) {
h2 = h;
// span from word #i through word #j, skipped words included
m_numWordsTotal2[i] = j - i + 1;
// from here on the flag describes word #j, not word #i
hasStopWord2 = m_bits->isStopWord(j);
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Words in phrase is 2. Breaking.", i, j, m_wids[i], m_wids[j] );
break;
}
}
else {
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". j cannot be in a phrase.", i, j, m_wids[i], m_wids[j] );
}
// if we cannot pair across word j then break
if ( ! m_bits->canPairAcross (j) ) {
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Cannot pair across. Breaking.", i, j, m_wids[i], m_wids[j] );
break;
}
// otherwise, get the next word
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Get next word", i, j, m_wids[i], m_wids[j] );
}
// if we had no phrase then use 0 as id (need 2+ words to be a phrase)
if ( numWordsInPhrase <= 1 ) {
// also the target of the "goto nophrase" statements above
nophrase:
m_phraseIds2[i] = 0LL;
m_numWordsTotal2[i] = 0;
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", wids[i]=%20" PRIu64". END. Not a phrase. m_phraseIds2[i]=%" PRIu64 "", i, m_wids[i], m_phraseIds2[i]);
return;
}
// sanity check: a second word was added, so a true word must have been seen
if ( lastWordj == -1 ) gbshutdownLogicError();
// sanity check: the span has to fit into the unsigned char count
if ( lastWordj - i + 1 > 255 ) gbshutdownLogicError();
// hyphen between numbers does not count (so 1-2 != 12)
if ( isNum ) hasHyphen = false;
// . the two word phrase id
// . "cd rom" -> cdrom
// . "fly paper" -> flypaper
// . "i-phone" -> iphone
// . "e-mail" -> email
if ( hasHyphen || ! hasStopWord2 ) {
m_phraseIds2[i] = h2;
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", wids[i]=%20" PRIu64". END. Has hyphen or no stopword. m_phraseIds2[i]=%" PRIu64 "", i, m_wids[i], m_phraseIds2[i] );
}
// . "st. and" !-> stand
// . "the rapist" !-> therapist
// . reached when there is no hyphen and word #j is a stop word; the
//   hash is xor'ed with a constant so it differs from the plain hash
else {
m_phraseIds2[i] = h2 ^ 0x768867;
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", wids[i]=%20" PRIu64". END. either no hyphen or a stopword. m_phraseIds2[i]=%" PRIu64 "", i, m_wids[i], m_phraseIds2[i]);
}
}
// . write the lower-cased text of the phrase that starts with word #i
//   into "buf"
// . "buf" holds "bufsize" bytes; one of them is reserved for the
//   terminating NUL
// . each non-alnum word inside the phrase is written as a single space
// . *phrLen receives the number of bytes written, not counting the NUL
// . if no phrase starts at word #i, or the phrase does not fit, "buf" is
//   set to the empty string and *phrLen to 0
void Phrases::getPhrase(int32_t i, char *buf, size_t bufsize, int32_t *phrLen) const {
	// without room for even the terminator there is nothing we may write
	if ( bufsize == 0 ) {
		*phrLen = 0;
		return;
	}

	// empty result if no phrase. report the length too so the caller
	// never reads an unset value
	if ( m_phraseIds2[i] == 0LL ) {
		*phrLen = 0;
		*buf = '\0';
		return;
	}

	// . how many words, including punct words, are in phrase?
	// . this should never be 1 or less
	int32_t n = m_numWordsTotal2[i] ;

	char *s = buf;
	// last writable position; kept free for the terminating NUL
	char *send = buf + bufsize - 1;

	for ( int32_t w = i ; w < i + n ; w++ ) {
		if ( ! m_words->isAlnum(w) ) {
			// same policy as for alnum words below: if it does
			// not fit, give back an empty phrase
			if ( s >= send ) {
				*phrLen = 0;
				*buf = '\0';
				return;
			}
			// a non-alnum word becomes one space
			*s++ = ' ';
			continue;
		}

		const char   *w1   = m_words->getWord(w);
		const int32_t wlen = m_words->getWordLen(w);
		const char   *wend = w1 + wlen;

		// NOTE(review): to_lower_utf8() is defined elsewhere. "j" moves
		// forward by its return value plus the loop's own increment;
		// confirm this matches what that function consumes per call
		for ( int32_t j = 0 ; j < wlen && s < send ; j++ ) {
			// make sure not to overflow destination buffer
			if ( s + wlen >= send ) {
				*phrLen = 0;
				*buf = '\0';
				return;
			}
			// write the lower case form of w1+j .. wend into "s"
			int32_t size = to_lower_utf8 ( s , send , w1 + j , wend );
			// advance
			j += size;
			s += size;
		}
	}

	// null terminate
	*s = '\0';
	// set length we wrote into "buf"
	*phrLen = s - buf;
}
// . look up the phrase that starts with word #i
// . returns the number of words stored for it in m_numWordsTotal2[i] and
//   puts its id from m_phraseIds2[i] into *pid
// . returns 0 and sets *pid to 0 if that count is 0
int32_t Phrases::getMinWordsInPhrase ( int32_t i , int64_t *pid ) const {
	// assume there is no phrase until shown otherwise
	*pid = 0LL;

	const int32_t total = m_numWordsTotal2[i];
	if ( total == 0 ) {
		return 0;
	}

	*pid = m_phraseIds2[i];
	return total;
}