privacore-open-source-searc.../Phrases.cpp
Ivan Skytte Jørgensen 8853a156ac bugfix/workaround for bigram hashes
If a bigram contained one of the 109 eglish stopwords then the hash was XORed with 0x768867 in an attempt to distinquish bigrams that were split compound words and bigrams that couldn't form a compound word. eg:
  Not compound word: "the rapist" versus "therapist"
  Possibly a compound word: "light footed" vs. "lightfooted"
However, the code was english-specific and could hurt resutls for other languages. And you need a POS-tagger to do it correctly.
2018-08-02 13:14:44 +02:00

321 lines
9.6 KiB
C++

#include "Phrases.h"
#include "tokenizer.h"
#include "Bits.h"
#include "Mem.h"
#include "Conf.h"
#include "Sanity.h"
#include "GbUtil.h"
#include "Errno.h"
#include "Log.h"
#include "fctypes.h"
#include "utf8_fast.h"
#include "hash.h"
Phrases::Phrases() : m_buf(NULL) {
memset(m_localBuf, 0, sizeof(m_localBuf));
// Coverity
m_bufSize = 0;
m_phraseIds2 = NULL;
m_numWordsTotal2 = NULL;
m_numPhrases = 0;
reset();
}
Phrases::~Phrases ( ) {
reset();
}
void Phrases::reset() {
if ( m_buf && m_buf != m_localBuf ) {
mfree ( m_buf , m_bufSize , "Phrases" );
}
m_buf = NULL;
}
// initialize this token array with the string, "s" of length, "len".
bool Phrases::set(const TokenizerResult &tr, const Bits &bits) {
// reset in case being re-used
reset();
// . we have one phrase per word
// . a phrase #n is "empty" if spam[n] == PSKIP
m_numPhrases = tr.size();
// how much mem do we need?
int32_t need = m_numPhrases * (8+1);
// alloc if we need to
if ( (unsigned)need > sizeof(m_localBuf) )
m_buf = (char *)mmalloc ( need , "Phrases" );
else
m_buf = m_localBuf;
if ( ! m_buf ) {
log(LOG_WARN, "query: Phrases::set: %s",mstrerror(g_errno));
return false;
}
m_bufSize = need;
// set up arrays
char *p = m_buf;
// phrase not using stop words
m_phraseIds2 = (int64_t *)p;
p += m_numPhrases * 8;
m_numWordsTotal2 = (unsigned char *)p;
p += m_numPhrases * 1;
// sanity
if ( p != m_buf + need ) gbshutdownLogicError();
// . set the phrases
// . sets m_phraseIds [i]
// . sets m_phraseSpam[i] to PSKIP if NO phrase exists
for ( unsigned i = 0; i < tr.size(); ++i ) {
if ( !tr[i].is_alfanum ) {
continue;
}
setPhrase(i,tr,bits);
}
// success
return true;
}
// . add the phrase that starts with the ith word
// . "read Of Mice and Men" should make 3 phrases:
// . read.ofmice
// . ofmice
// . mice.andmen
void Phrases::setPhrase(unsigned i, const TokenizerResult &tr, const Bits &bits) {
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 " BEGIN", i);
// hash of the phrase
int64_t h = 0LL;
// the hash of the two-word phrase
int64_t h2 = 0LL;
// now look for other tokens that should follow the ith token
int32_t numWordsInPhrase = 1;
const auto &token1 = tr[i];
// we need to hash "1 / 8" differently from "1.8" from "1,000" etc.
bool isNum = is_digit( token1.token_start[0] );
// do not include punct/tag words in the m_numWordsTotal[j] count
// of the total words in the phrase. these are just usesless tails.
int32_t lastWordj = -1;
// loop over following words
bool hasHyphen;
bool hasStopWord2 ;
unsigned pos; //decl. here due to goto
// . NOTE: a token can start a phrase but NOT be in it.
// . like a large number for example.
// . wordId is the lower ascii hash of the ith word
// . NO... this is allowing the query operator PiiPe to start
// a phrase but not be in it, then the phrase id ends up just
// being the following word's id. causing the synonyms code to
// give a synonym which it should not un Synonyms::set()
if ( ! bits.canBeInPhrase(i) ) {
// so indeed, skip it then
goto nophrase;
}
h = token1.token_hash;
// set position
pos = token1.token_len;
hasHyphen = false;
hasStopWord2 = bits.isStopWord(i);
for( unsigned j = i + 1 ; j < tr.size(); j++ ) {
const auto &token2 = tr[j];
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". LOOP START", i, j, token1.token_hash, token2.token_hash );
// Do not allow more than 32 alnum/punct "words" in a phrase.
// Tthis prevents phrases with 100,000 words from slowing
// us down. would put us in a huge double-nested for loop
// BR: But it will never happen? It breaks out of the loop
// when the phrase contains 2 (real) words?
if ( j > i + 32 ) {
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". j > i+32. no phrase", i, j, token1.token_hash, token2.token_hash );
goto nophrase;
}
// deal with punct words
if ( !token2.is_alfanum ) {
// if we cannot pair across word j then break
if ( !bits.canPairAcross( j ) ) {
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Pair cannot cross. Breaking.", i, j, token1.token_hash, token2.token_hash );
break;
}
// does it have a hyphen?
if ( j == i + 1 && has_char(token2.token_start,token2.token_end(), '-' ) ) {
hasHyphen = true;
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64 ". j is hyphen, NOT adding to phrase", i, j, token1.token_hash, token2.token_hash );
}
else {
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64 ". j is space, NOT adding to phrase", i, j, token1.token_hash, token2.token_hash );
}
continue;
}
// record lastWordj to indicate that word #j was a true word
lastWordj = j;
// if word #j can be in phrase then incorporate it's hash
if ( bits.canBeInPhrase (j) ) {
int32_t conti = pos;
// hash the jth word into the hash
h = hash64Lower_utf8_cont( token2.token_start, token2.token_len, h, &conti );
pos = conti;
++numWordsInPhrase;
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". CAN be in phrase. Adding j's hash. h=%ld. numWordsInPhrase=%" PRId32 "", i, j, token1.token_hash, token2.token_hash, h, numWordsInPhrase);
// N-word phrases?
if ( numWordsInPhrase == 2 ) {
h2 = h;
m_numWordsTotal2[i] = j - i + 1;
hasStopWord2 = bits.isStopWord(j);
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Words in phrase is 2. Breaking.", i, j, token1.token_hash, token2.token_hash );
break;
}
}
else {
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". j cannot be in a phrase.", i, j, token1.token_hash, token2.token_hash );
}
// if we cannot pair across word j then break
if ( ! bits.canPairAcross (j) ) {
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Cannot pair across. Breaking.", i, j, token1.token_hash, token2.token_hash );
break;
}
// otherwise, get the next word
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", j=%3" PRId32 ", wids[i]=%20" PRIu64", wids[j]=%20" PRIu64". Get next word", i, j, token1.token_hash, token2.token_hash );
}
// if we had no phrase then use 0 as id (need 2+ words to be a phrase)
if ( numWordsInPhrase <= 1 ) {
nophrase:
m_phraseIds2[i] = 0LL;
m_numWordsTotal2[i] = 0;
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", wids[i]=%20" PRIu64". END. Not a phrase. m_phraseIds2[i]=%" PRIu64 "", i, token1.token_hash, m_phraseIds2[i]);
return;
}
// sanity check
if ( lastWordj == -1 ) gbshutdownLogicError();
// sanity check
if ( lastWordj - i + 1 > 255 ) gbshutdownLogicError();
// hyphen between numbers does not count (so 1-2 != 12)
if ( isNum ) hasHyphen = false;
// . the two word phrase id
// . "cd rom" -> cdrom
// . "fly paper" -> flypaper
// . "i-phone" -> iphone
// . "e-mail" -> email
if ( hasHyphen || ! hasStopWord2 ) {
m_phraseIds2[i] = h2;
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", wids[i]=%20" PRIu64". END. Has hyphen or no stopword. m_phraseIds2[i]=%" PRIu64 "", i, token1.token_hash, m_phraseIds2[i] );
}
// . "st. and" !-> stand
// . "the rapist" !-> therapist
else {
m_phraseIds2[i] = h2;
//The XORing with 0x768867 was lost in the code that uses the new tokenizer instead of the old "Words" class.
//So the above detection of stopwords doesn't have any effect currently. I'm not even sure if it has any
//benefits for languages that compound words more often than English (german/danish/norwegian/greek/…)
//We keep the code around if we ever want to do something similar, but possibly for more than just English,
//and hopefully with detection of part-of-speech.
//Old code:
//m_phraseIds2[i] = h2 ^ 0x768867;
//note: Synonyms.cpp also does this
logTrace( g_conf.m_logTracePhrases, "i=%3" PRId32 ", wids[i]=%20" PRIu64". END. either no hyphen or a stopword. m_phraseIds2[i]=%" PRIu64 "", i, token1.token_hash, m_phraseIds2[i]);
}
}
// . store phrase that starts with word #i into "printBuf"
// . return bytes stored in "printBuf"
void Phrases::getPhrase(int32_t i, const TokenizerResult &tr, char *buf, size_t bufsize, int32_t *phrLen) const {
// return 0 if no phrase
if ( m_phraseIds2[i] == 0LL ) {
*buf='\0';
return;
}
// . how many words, including punct words, are in phrase?
// . this should never be 1 or less
int32_t n = m_numWordsTotal2[i] ;
char *s = buf;
char *send = buf + bufsize - 1;
for (int32_t w = i; w<i+n; w++) {
const auto &token = tr[w];
if (!token.is_alfanum) {
// skip spaces for now since we has altogether now
*s++ = ' ';
continue;
}
const char *w1 = token.token_start;
const char *wend = w1 + token.token_len;
for ( unsigned j = 0 ; j < token.token_len && s<send ; j++) {
// make sure not to overflow destination buffer
if( s + token.token_len >= send ) {
*phrLen=0;
*buf='\0';
return;
}
// write the lower case char from w1+j into "s"
int32_t size = to_lower_utf8 ( s , send , w1 + j , wend );
// advance
j += size;
s += size;
}
}
// null terminate
*s = '\0';
// set length we wrote into "buf"
*phrLen = s - buf;
}
int32_t Phrases::getMinWordsInPhrase ( int32_t i , int64_t *pid ) const {
*pid = 0LL;
if ( m_numWordsTotal2[i] ) {
*pid = m_phraseIds2[i];
return m_numWordsTotal2[i];
}
return 0;
}