2016-08-05 14:31:18 +02:00
//#include "gb-include.h"
2013-08-02 13:12:24 -07:00
# include "Phrases.h"
2016-08-05 14:31:18 +02:00
# include "Words.h"
# include "Bits.h"
2013-08-02 13:12:24 -07:00
# include "Mem.h"
2016-11-03 12:19:59 +01:00
# include "Conf.h"
2016-08-05 14:31:18 +02:00
# include "Sanity.h"
2016-06-20 12:30:26 +02:00
2013-08-02 13:12:24 -07:00
2016-09-26 17:33:45 +02:00
// Construct an empty Phrases object. Every member is given an explicit
// value (this also keeps Coverity quiet) and reset() is run last.
Phrases::Phrases()
	: m_buf(NULL) {
	// not bound to any Words/Bits yet; set() fills these in
	m_words = NULL;
	m_bits  = NULL;
	m_wptrs = NULL;
	m_wlens = NULL;
	m_wids  = NULL;

	// no per-word phrase arrays yet
	m_numPhrases     = 0;
	m_phraseIds2     = NULL;
	m_numWordsTotal2 = NULL;
	m_bufSize        = 0;

	// start out with a zeroed in-object buffer
	memset(m_localBuf, 0, sizeof(m_localBuf));

	reset();
}
// Tear down the object; all cleanup is delegated to reset().
Phrases::~Phrases() {
	reset();
}
void Phrases : : reset ( ) {
if ( m_buf & & m_buf ! = m_localBuf ) {
mfree ( m_buf , m_bufSize , " Phrases " ) ;
}
m_buf = NULL ;
2013-08-02 13:12:24 -07:00
}
2016-09-26 17:33:45 +02:00
2013-08-02 13:12:24 -07:00
// initialize this token array with the string, "s" of length, "len".
2016-09-23 02:40:50 +02:00
bool Phrases : : set ( const Words * words , const Bits * bits ) {
2013-08-02 13:12:24 -07:00
// reset in case being re-used
reset ( ) ;
2016-02-18 17:07:23 +01:00
2013-08-02 13:12:24 -07:00
// ensure we have words
if ( ! words ) return true ;
2016-02-18 17:07:23 +01:00
2013-08-02 13:12:24 -07:00
// . we have one phrase per word
// . a phrase #n is "empty" if spam[n] == PSKIP
m_numPhrases = words - > getNumWords ( ) ;
// how much mem do we need?
2016-02-24 16:12:05 +01:00
int32_t need = m_numPhrases * ( 8 + 1 ) ;
2013-08-02 13:12:24 -07:00
// alloc if we need to
2016-08-05 14:31:18 +02:00
if ( ( unsigned ) need > sizeof ( m_localBuf ) )
2013-08-02 13:12:24 -07:00
m_buf = ( char * ) mmalloc ( need , " Phrases " ) ;
else
m_buf = m_localBuf ;
2016-02-25 16:59:06 +01:00
if ( ! m_buf ) {
2016-08-01 15:29:03 +02:00
log ( LOG_WARN , " query: Phrases::set: %s " , mstrerror ( g_errno ) ) ;
return false ;
2016-02-25 16:59:06 +01:00
}
2013-08-02 13:12:24 -07:00
m_bufSize = need ;
2016-02-25 16:59:06 +01:00
2013-08-02 13:12:24 -07:00
// set up arrays
char * p = m_buf ;
2016-02-18 17:07:23 +01:00
2013-08-02 13:12:24 -07:00
// phrase not using stop words
2016-02-25 16:59:06 +01:00
m_phraseIds2 = ( int64_t * ) p ;
p + = m_numPhrases * 8 ;
m_numWordsTotal2 = ( unsigned char * ) p ;
p + = m_numPhrases * 1 ;
2013-08-02 13:12:24 -07:00
// sanity
2016-08-05 14:31:18 +02:00
if ( p ! = m_buf + need ) gbshutdownLogicError ( ) ;
2013-08-02 13:12:24 -07:00
// point to this info while we parse
m_words = words ;
2016-10-19 19:53:00 +02:00
m_wptrs = words - > getWordPtrs ( ) ;
2013-08-02 13:12:24 -07:00
m_wlens = words - > getWordLens ( ) ;
m_wids = words - > getWordIds ( ) ;
m_bits = bits ;
2016-02-18 17:07:23 +01:00
2013-08-02 13:12:24 -07:00
// . set the phrases
// . sets m_phraseIds [i]
// . sets m_phraseSpam[i] to PSKIP if NO phrase exists
2016-02-25 16:59:06 +01:00
for ( int32_t i = 0 ; i < words - > getNumWords ( ) ; + + i ) {
2016-02-24 16:12:05 +01:00
if ( ! m_wids [ i ] ) {
continue ;
}
2016-02-25 17:12:23 +01:00
setPhrase ( i ) ;
2013-08-02 13:12:24 -07:00
}
2016-02-25 16:59:06 +01:00
2013-08-02 13:12:24 -07:00
// success
return true ;
}
// . add the phrase that starts with the ith word
// . "read Of Mice and Men" should make 3 phrases:
// . read.ofmice
// . ofmice
// . mice.andmen
2016-02-25 17:12:23 +01:00
void Phrases : : setPhrase ( int32_t i ) {
2016-11-03 12:19:59 +01:00
logTrace ( g_conf . m_logTracePhrases , " i=%3 " PRId32 " BEGIN " , i ) ;
2013-08-02 13:12:24 -07:00
// hash of the phrase
2014-10-30 13:36:39 -06:00
int64_t h = 0LL ;
2016-02-18 17:07:23 +01:00
2016-02-24 16:12:05 +01:00
// the hash of the two-word phrase
2014-10-30 13:36:39 -06:00
int64_t h2 = 0LL ;
2016-02-18 17:07:23 +01:00
2013-08-02 13:12:24 -07:00
// reset
unsigned char pos = 0 ;
2016-02-18 17:07:23 +01:00
2013-08-02 13:12:24 -07:00
// now look for other tokens that should follow the ith token
2016-02-24 16:12:05 +01:00
int32_t nw = m_words - > getNumWords ( ) ;
int32_t numWordsInPhrase = 1 ;
2016-02-18 17:07:23 +01:00
2013-08-02 13:12:24 -07:00
// we need to hash "1 / 8" differently from "1.8" from "1,000" etc.
char isNum = is_digit ( m_wptrs [ i ] [ 0 ] ) ;
2016-02-18 17:07:23 +01:00
2013-08-02 13:12:24 -07:00
// do not include punct/tag words in the m_numWordsTotal[j] count
// of the total words in the phrase. these are just usesless tails.
2014-11-10 14:45:11 -08:00
int32_t lastWordj = - 1 ;
2016-02-18 17:07:23 +01:00
2013-08-02 13:12:24 -07:00
// loop over following words
2016-11-04 12:39:47 +01:00
bool hasHyphen ;
2013-08-02 13:12:24 -07:00
bool hasStopWord2 ;
// . NOTE: a token can start a phrase but NOT be in it.
// . like a large number for example.
// . wordId is the lower ascii hash of the ith word
// . NO... this is allowing the query operator PiiPe to start
// a phrase but not be in it, then the phrase id ends up just
// being the following word's id. causing the synonyms code to
// give a synonym which it should not un Synonyms::set()
2016-02-24 16:12:05 +01:00
if ( ! m_bits - > canBeInPhrase ( i ) ) {
2013-08-02 13:12:24 -07:00
// so indeed, skip it then
goto nophrase ;
2016-02-24 16:12:05 +01:00
}
2013-08-02 13:12:24 -07:00
h = m_wids [ i ] ;
2016-02-18 17:07:23 +01:00
2013-08-02 13:12:24 -07:00
// set position
pos = ( unsigned char ) m_wlens [ i ] ;
hasHyphen = false ;
hasStopWord2 = m_bits - > isStopWord ( i ) ;
2016-11-04 12:39:47 +01:00
for ( int32_t j = i + 1 ; j < nw ; j + + ) {
2016-11-03 12:19:59 +01:00
logTrace ( g_conf . m_logTracePhrases , " i=%3 " PRId32 " , j=%3 " PRId32 " , wids[i]=%20 " PRIu64 " , wids[j]=%20 " PRIu64 " . LOOP START " , i , j , m_wids [ i ] , m_wids [ j ] ) ;
// Do not allow more than 32 alnum/punct "words" in a phrase.
// Tthis prevents phrases with 100,000 words from slowing
// us down. would put us in a huge double-nested for loop
// BR: But it will never happen? It breaks out of the loop
// when the phrase contains 2 (real) words?
2016-02-24 16:12:05 +01:00
if ( j > i + 32 ) {
2016-11-03 12:19:59 +01:00
logTrace ( g_conf . m_logTracePhrases , " i=%3 " PRId32 " , j=%3 " PRId32 " , wids[i]=%20 " PRIu64 " , wids[j]=%20 " PRIu64 " . j > i+32. no phrase " , i , j , m_wids [ i ] , m_wids [ j ] ) ;
2016-02-24 16:12:05 +01:00
goto nophrase ;
}
2013-08-02 13:12:24 -07:00
// deal with punct words
if ( ! m_wids [ j ] ) {
// if we cannot pair across word j then break
2016-02-24 16:12:05 +01:00
if ( ! m_bits - > canPairAcross ( j ) ) {
2016-11-03 12:19:59 +01:00
logTrace ( g_conf . m_logTracePhrases , " i=%3 " PRId32 " , j=%3 " PRId32 " , wids[i]=%20 " PRIu64 " , wids[j]=%20 " PRIu64 " . Pair cannot cross. Breaking. " , i , j , m_wids [ i ] , m_wids [ j ] ) ;
2016-02-24 16:12:05 +01:00
break ;
}
2016-02-18 17:07:23 +01:00
2013-08-02 13:12:24 -07:00
// does it have a hyphen?
2016-02-24 16:12:05 +01:00
if ( j = = i + 1 & & m_words - > hasChar ( j , ' - ' ) ) {
hasHyphen = true ;
2016-11-03 12:19:59 +01:00
logTrace ( g_conf . m_logTracePhrases , " i=%3 " PRId32 " , j=%3 " PRId32 " , wids[i]=%20 " PRIu64 " , wids[j]=%20 " PRIu64 " . j is hyphen, NOT adding to phrase " , i , j , m_wids [ i ] , m_wids [ j ] ) ;
}
else {
logTrace ( g_conf . m_logTracePhrases , " i=%3 " PRId32 " , j=%3 " PRId32 " , wids[i]=%20 " PRIu64 " , wids[j]=%20 " PRIu64 " . j is space, NOT adding to phrase " , i , j , m_wids [ i ] , m_wids [ j ] ) ;
2016-02-24 16:12:05 +01:00
}
2013-08-02 13:12:24 -07:00
continue ;
}
// record lastWordj to indicate that word #j was a true word
lastWordj = j ;
2016-02-18 17:07:23 +01:00
2013-08-02 13:12:24 -07:00
// if word #j can be in phrase then incorporate it's hash
if ( m_bits - > canBeInPhrase ( j ) ) {
2014-11-10 14:45:11 -08:00
int32_t conti = pos ;
2013-08-02 13:12:24 -07:00
// hash the jth word into the hash
2016-02-24 16:12:05 +01:00
h = hash64Lower_utf8_cont ( m_wptrs [ j ] , m_wlens [ j ] , h , & conti ) ;
2013-08-02 13:12:24 -07:00
pos = conti ;
2016-02-18 17:07:23 +01:00
2016-02-24 16:12:05 +01:00
+ + numWordsInPhrase ;
2013-08-02 13:12:24 -07:00
2016-11-03 12:19:59 +01:00
logTrace ( g_conf . m_logTracePhrases , " i=%3 " PRId32 " , j=%3 " PRId32 " , wids[i]=%20 " PRIu64 " , wids[j]=%20 " PRIu64 " . CAN be in phrase. Adding j's hash. numWordsInPhrase=% " PRId32 " " , i , j , m_wids [ i ] , m_wids [ j ] , numWordsInPhrase ) ;
2013-08-02 13:12:24 -07:00
// N-word phrases?
2016-02-18 17:07:23 +01:00
if ( numWordsInPhrase = = 2 ) {
2013-08-02 13:12:24 -07:00
h2 = h ;
2016-02-24 16:12:05 +01:00
m_numWordsTotal2 [ i ] = j - i + 1 ;
hasStopWord2 = m_bits - > isStopWord ( j ) ;
2016-11-03 12:19:59 +01:00
logTrace ( g_conf . m_logTracePhrases , " i=%3 " PRId32 " , j=%3 " PRId32 " , wids[i]=%20 " PRIu64 " , wids[j]=%20 " PRIu64 " . Words in phrase is 2. Breaking. " , i , j , m_wids [ i ] , m_wids [ j ] ) ;
2013-08-02 13:12:24 -07:00
break ;
}
}
2016-11-03 12:26:58 +01:00
else {
logTrace ( g_conf . m_logTracePhrases , " i=%3 " PRId32 " , j=%3 " PRId32 " , wids[i]=%20 " PRIu64 " , wids[j]=%20 " PRIu64 " . j cannot be in a phrase. " , i , j , m_wids [ i ] , m_wids [ j ] ) ;
}
2016-02-18 17:07:23 +01:00
2013-08-02 13:12:24 -07:00
// if we cannot pair across word j then break
2016-02-24 16:12:05 +01:00
if ( ! m_bits - > canPairAcross ( j ) ) {
2016-11-03 12:19:59 +01:00
logTrace ( g_conf . m_logTracePhrases , " i=%3 " PRId32 " , j=%3 " PRId32 " , wids[i]=%20 " PRIu64 " , wids[j]=%20 " PRIu64 " . Cannot pair across. Breaking. " , i , j , m_wids [ i ] , m_wids [ j ] ) ;
2016-02-24 16:12:05 +01:00
break ;
2013-08-02 13:12:24 -07:00
}
2016-02-24 16:12:05 +01:00
2013-08-02 13:12:24 -07:00
// otherwise, get the next word
2016-11-03 12:19:59 +01:00
logTrace ( g_conf . m_logTracePhrases , " i=%3 " PRId32 " , j=%3 " PRId32 " , wids[i]=%20 " PRIu64 " , wids[j]=%20 " PRIu64 " . Get next word " , i , j , m_wids [ i ] , m_wids [ j ] ) ;
2013-08-02 13:12:24 -07:00
}
2016-02-18 17:07:23 +01:00
2016-11-03 12:19:59 +01:00
// if we had no phrase then use 0 as id (need 2+ words to be a phrase)
2013-08-02 13:12:24 -07:00
if ( numWordsInPhrase < = 1 ) {
nophrase :
m_phraseIds2 [ i ] = 0LL ;
m_numWordsTotal2 [ i ] = 0 ;
2016-11-04 12:39:47 +01:00
logTrace ( g_conf . m_logTracePhrases , " i=%3 " PRId32 " , wids[i]=%20 " PRIu64 " . END. Not a phrase. m_phraseIds2[i]=% " PRIu64 " " , i , m_wids [ i ] , m_phraseIds2 [ i ] ) ;
2013-08-02 13:12:24 -07:00
return ;
}
2016-02-18 17:07:23 +01:00
2013-08-02 13:12:24 -07:00
// sanity check
2016-08-05 14:31:18 +02:00
if ( lastWordj = = - 1 ) gbshutdownLogicError ( ) ;
2016-02-18 17:07:23 +01:00
2013-08-02 13:12:24 -07:00
// sanity check
2016-08-05 14:31:18 +02:00
if ( lastWordj - i + 1 > 255 ) gbshutdownLogicError ( ) ;
2016-02-18 17:07:23 +01:00
2013-08-02 13:12:24 -07:00
// hyphen between numbers does not count (so 1-2 != 12)
if ( isNum ) hasHyphen = false ;
2016-02-18 17:07:23 +01:00
2013-08-02 13:12:24 -07:00
// . the two word phrase id
// . "cd rom" -> cdrom
// . "fly paper" -> flypaper
// . "i-phone" -> iphone
// . "e-mail" -> email
if ( hasHyphen | | ! hasStopWord2 ) {
m_phraseIds2 [ i ] = h2 ;
2016-11-04 12:39:47 +01:00
logTrace ( g_conf . m_logTracePhrases , " i=%3 " PRId32 " , wids[i]=%20 " PRIu64 " . END. Has hyphen or no stopword. m_phraseIds2[i]=% " PRIu64 " " , i , m_wids [ i ] , m_phraseIds2 [ i ] ) ;
2013-08-02 13:12:24 -07:00
}
// . "st. and" !-> stand
// . "the rapist" !-> therapist
else {
m_phraseIds2 [ i ] = h2 ^ 0x768867 ;
2016-11-04 12:39:47 +01:00
logTrace ( g_conf . m_logTracePhrases , " i=%3 " PRId32 " , wids[i]=%20 " PRIu64 " . END. either no hyphen or a stopword. m_phraseIds2[i]=% " PRIu64 " " , i , m_wids [ i ] , m_phraseIds2 [ i ] ) ;
2013-08-02 13:12:24 -07:00
}
}
2016-11-03 12:19:59 +01:00
2013-08-02 13:12:24 -07:00
// . store phrase that starts with word #i into "printBuf"
// . return bytes stored in "printBuf"
2016-05-27 16:02:21 +02:00
void Phrases : : getPhrase ( int32_t i , char * buf , size_t bufsize , int32_t * phrLen ) const {
2013-08-02 13:12:24 -07:00
// return 0 if no phrase
2016-02-24 16:12:05 +01:00
if ( m_phraseIds2 [ i ] = = 0LL ) {
2016-05-27 16:02:21 +02:00
* buf = ' \0 ' ;
return ;
2016-02-24 16:12:05 +01:00
}
2013-08-02 13:12:24 -07:00
// . how many words, including punct words, are in phrase?
// . this should never be 1 or less
2016-03-01 11:42:30 +01:00
int32_t n = m_numWordsTotal2 [ i ] ;
2013-08-02 13:12:24 -07:00
char * s = buf ;
2016-05-27 16:02:21 +02:00
char * send = buf + bufsize - 1 ;
2014-11-10 14:45:11 -08:00
for ( int32_t w = i ; w < i + n ; w + + ) {
2013-08-02 13:12:24 -07:00
if ( ! m_words - > isAlnum ( w ) ) {
// skip spaces for now since we has altogether now
* s + + = ' ' ;
continue ;
}
2016-05-27 16:10:31 +02:00
const char * w1 = m_words - > getWord ( w ) ;
const char * wend = w1 + m_words - > getWordLen ( w ) ;
2014-11-10 14:45:11 -08:00
for ( int32_t j = 0 ; j < m_words - > getWordLen ( w ) & & s < send ; j + + ) {
2017-10-01 18:04:56 +02:00
// make sure not to overflow destination buffer
if ( s + m_words - > getWordLen ( w ) > = send ) {
* phrLen = 0 ;
* buf = ' \0 ' ;
return ;
}
2013-08-02 13:12:24 -07:00
// write the lower case char from w1+j into "s"
2014-11-10 14:45:11 -08:00
int32_t size = to_lower_utf8 ( s , send , w1 + j , wend ) ;
2013-08-02 13:12:24 -07:00
// advance
j + = size ;
s + = size ;
}
}
// null terminate
* s = ' \0 ' ;
2016-03-01 11:42:30 +01:00
2013-08-02 13:12:24 -07:00
// set length we wrote into "buf"
* phrLen = s - buf ;
}
2016-03-07 17:06:21 +01:00
2016-05-27 16:10:31 +02:00
int32_t Phrases : : getMinWordsInPhrase ( int32_t i , int64_t * pid ) const {
2016-03-07 17:06:21 +01:00
* pid = 0LL ;
if ( m_numWordsTotal2 [ i ] ) {
* pid = m_phraseIds2 [ i ] ;
return m_numWordsTotal2 [ i ] ;
}
return 0 ;
}