777 lines
23 KiB
C++
777 lines
23 KiB
C++
#include "gb-include.h"
|
|
|
|
#include "Phrases.h"
|
|
#include "Mem.h"
|
|
|
|
// Construct an empty Phrases object: no buffer attached and no spam array,
// so a later reset() (or the destructor) has nothing to free.
Phrases::Phrases ( ) : m_buf ( NULL ) , m_phraseSpam ( NULL ) {
}
|
|
|
|
// Destructor: all cleanup is delegated to reset(), which frees m_buf when
// it is not pointing at m_localBuf.
Phrases::~Phrases ( ) { reset(); }
|
|
|
|
void Phrases::reset() {
|
|
if ( m_buf && m_buf != m_localBuf )
|
|
mfree ( m_buf , m_bufSize , "Phrases" );
|
|
m_buf = NULL;
|
|
//m_phraseScores = NULL;
|
|
m_phraseSpam = NULL;
|
|
//m_phraseIds = NULL;
|
|
}
|
|
|
|
// initialize this token array with the string, "s" of length, "len".
|
|
bool Phrases::set( Words *words,
|
|
Bits *bits ,
|
|
bool useStopWords ,
|
|
bool useStems ,
|
|
int32_t titleRecVersion,
|
|
int32_t niceness) {
|
|
// reset in case being re-used
|
|
reset();
|
|
// always reset this
|
|
//m_phraseScores = NULL;
|
|
m_phraseSpam = NULL;
|
|
//m_phraseIds = NULL;
|
|
// now we never use stop words and we just index two-word phrases
|
|
// so that a search for "get a" in quotes will match a doc that has
|
|
// the phrase "get a clue". it might impact performance, but it should
|
|
// be insignificant... but we need to have this level of precision.
|
|
// ok -- but what about 'kick a ball'. we might not have that phrase
|
|
// in the results for "kick a" AND "a ball"!! so we really need to
|
|
// index "kick a ball" as well as "kick a" and "a ball". i don't think
|
|
// that will cause too much bloat.
|
|
//useStopWords = false;
|
|
// ensure we have words
|
|
if ( ! words ) return true;
|
|
// set the words' scores array, m_wordScores
|
|
//if ( scores ) m_wordScores = scores->m_scores;
|
|
//else m_wordScores = NULL;
|
|
// . we have one phrase per word
|
|
// . a phrase #n is "empty" if spam[n] == PSKIP
|
|
m_numPhrases = words->getNumWords();
|
|
|
|
// replaces scores
|
|
//m_sections = m_sections;
|
|
//m_sectionPtrs = NULL;
|
|
//if ( m_sections ) m_sectionPtrs = m_sections->m_sectionPtrs;
|
|
|
|
// how much mem do we need?
|
|
//int32_t need = (18+1+(3+8*3)) * m_numPhrases;
|
|
int32_t need = m_numPhrases * (8+8+1+1+1);
|
|
//if ( m_wordScores ) need += 4 * m_numPhrases;
|
|
|
|
// alloc if we need to
|
|
if ( need > PHRASE_BUF_SIZE )
|
|
m_buf = (char *)mmalloc ( need , "Phrases" );
|
|
else
|
|
m_buf = m_localBuf;
|
|
|
|
if ( ! m_buf )
|
|
return log("query: Phrases::set: %s",mstrerror(g_errno));
|
|
m_bufSize = need;
|
|
// set up arrays
|
|
char *p = m_buf;
|
|
//m_phraseIds = (int64_t *)p ; p += m_numPhrases * 8;
|
|
// phrase not using stop words
|
|
m_phraseIds2 = (int64_t *)p ; p += m_numPhrases * 8;
|
|
m_phraseIds3 = (int64_t *)p ; p += m_numPhrases * 8;
|
|
//m_phraseIds4 = (int64_t *)p ; p += m_numPhrases * 8;
|
|
//m_phraseIds5 = (int64_t *)p ; p += m_numPhrases * 8;
|
|
//m_stripPhraseIds = (int64_t *)p ; p += m_numPhrases * 8;
|
|
//if ( m_wordScores ) {
|
|
// m_phraseScores = (int32_t *)p ;
|
|
// p += m_numPhrases * 4;
|
|
//}
|
|
m_phraseSpam = (unsigned char *)p ; p += m_numPhrases * 1;
|
|
//m_numWordsTotal = (unsigned char *)p ; p += m_numPhrases * 1;
|
|
m_numWordsTotal2= (unsigned char *)p ; p += m_numPhrases * 1;
|
|
m_numWordsTotal3= (unsigned char *)p ; p += m_numPhrases * 1;
|
|
//m_numWordsTotal4= (unsigned char *)p ; p += m_numPhrases * 1;
|
|
//m_numWordsTotal5= (unsigned char *)p ; p += m_numPhrases * 1;
|
|
|
|
// sanity
|
|
if ( p != m_buf + need ) { char *xx=NULL;*xx=0; }
|
|
|
|
// clear this
|
|
//memset ( m_numWordsTotal , 0 , m_numPhrases );
|
|
|
|
memset ( m_numWordsTotal2 , 0 , m_numPhrases );
|
|
memset ( m_numWordsTotal3 , 0 , m_numPhrases );
|
|
//memset ( m_numWordsTotal4 , 0 , m_numPhrases );
|
|
//memset ( m_numWordsTotal5 , 0 , m_numPhrases );
|
|
|
|
// point to this info while we parse
|
|
m_words = words;
|
|
m_wptrs = words->getWords();
|
|
m_wlens = words->getWordLens();
|
|
m_wids = words->getWordIds();
|
|
m_bits = bits;
|
|
m_useStopWords = useStopWords;
|
|
m_useStems = useStems;
|
|
// we now are dependent on this
|
|
m_titleRecVersion = titleRecVersion;
|
|
// . set the phrases
|
|
// . sets m_phraseIds [i]
|
|
// . sets m_phraseSpam[i] to PSKIP if NO phrase exists
|
|
for ( int32_t i = 0 ; i < words->getNumWords() ; i++ ) {
|
|
if ( ! m_wids[i] ) continue;
|
|
setPhrase ( i , niceness);
|
|
}
|
|
// success
|
|
return true;
|
|
}
|
|
|
|
// . add the phrase that starts with the ith word
|
|
// . "read Of Mice and Men" should make 3 phrases:
|
|
// . read.ofmice
|
|
// . ofmice
|
|
// . mice.andmen
|
|
void Phrases::setPhrase ( int32_t i, int32_t niceness ) {
|
|
// . if the ith word cannot start a phrase then we have no phrase
|
|
// . we indicate NULL phrasesIds with a spam of PSKIP
|
|
// . we now index all regardless! we want to be able to search
|
|
// for "a thing" or something. so do it!
|
|
//if ( ! m_bits->canStartPhrase ( i ) ) {
|
|
// m_phraseSpam[i] = PSKIP;
|
|
// m_phraseIds [i] = 0LL;
|
|
// return;
|
|
//}
|
|
|
|
// MDW: now Weights.cpp should encompass all this logic
|
|
// or if score <= 0, set in Scores.cpp
|
|
//if ( m_wordScores && m_wordScores[i] <= 0 ) {
|
|
// m_phraseSpam[i] = PSKIP;
|
|
// m_phraseIds [i] = 0LL;
|
|
// return;
|
|
//}
|
|
|
|
// hash of the phrase
|
|
int64_t h = 0LL;
|
|
// the hash of the two-word phrase (now we do 3,4 and 5 word phrases)
|
|
int64_t h2 = 0LL;
|
|
int64_t h3 = 0LL;
|
|
//int64_t h4 = 0LL;
|
|
//int64_t h5 = 0LL;
|
|
// reset
|
|
unsigned char pos = 0;
|
|
// now look for other tokens that should follow the ith token
|
|
int32_t nw = m_words->getNumWords();
|
|
int32_t numWordsInPhrase = 1;
|
|
// use the min spam from all words in the phrase as the spam for phrase
|
|
char minSpam = -1;
|
|
// we need to hash "1 / 8" differently from "1.8" from "1,000" etc.
|
|
char isNum = is_digit(m_wptrs[i][0]);
|
|
// min score
|
|
//int32_t minScore ;
|
|
//if ( m_wordScores ) minScore = m_wordScores[i];
|
|
// if i is not a stop word, it can set the min spam initially
|
|
//if ( ! m_bits->isStopWord(i) &&m_spam ) minSpam = m_spam->getSpam(i);
|
|
// do not include punct/tag words in the m_numWordsTotal[j] count
|
|
// of the total words in the phrase. these are just usesless tails.
|
|
int32_t lastWordj = -1;
|
|
// loop over following words
|
|
int32_t j;
|
|
bool hasHyphen ;
|
|
bool hasStopWord2 ;
|
|
|
|
// . NOTE: a token can start a phrase but NOT be in it.
|
|
// . like a large number for example.
|
|
// . wordId is the lower ascii hash of the ith word
|
|
// . NO... this is allowing the query operator PiiPe to start
|
|
// a phrase but not be in it, then the phrase id ends up just
|
|
// being the following word's id. causing the synonyms code to
|
|
// give a synonym which it should not un Synonyms::set()
|
|
if ( ! m_bits->canBeInPhrase(i) )
|
|
// so indeed, skip it then
|
|
goto nophrase;
|
|
|
|
//h = hash64 ( h, m_words->getWordId(i));
|
|
h = m_wids[i];
|
|
// set position
|
|
pos = (unsigned char)m_wlens[i];
|
|
//if (m_words->getStripWordId(i))
|
|
// h2 = hash64 ( h2, m_words->getStripWordId(i));
|
|
//else h2 = h;
|
|
|
|
hasHyphen = false;
|
|
hasStopWord2 = m_bits->isStopWord(i);
|
|
// this makes it true now too
|
|
//if ( m_wlens[i] <= 2 ) hasStopWord = true;
|
|
|
|
for ( j = i + 1 ; j < nw ; j++ ) {
|
|
QUICKPOLL(niceness);
|
|
|
|
// . do not allow more than 32 alnum/punct "words" in a phrase
|
|
// . this prevents phrases with 100,000 words from slowing
|
|
// us down. would put us in a huge double-nested for loop
|
|
if ( j > i + 32 ) goto nophrase;
|
|
// deal with punct words
|
|
if ( ! m_wids[j] ) {
|
|
// if we cannot pair across word j then break
|
|
if ( ! m_bits->canPairAcross (j) ) break;
|
|
// does it have a hyphen?
|
|
if (j==i+1 && m_words->hasChar(j,'-')) hasHyphen=true;
|
|
/*
|
|
// "D & B" --> dandb
|
|
if (j==i+1 && m_words->hasChar(j,'&')) {
|
|
// set this
|
|
hasStopWord = true;
|
|
// insert "and"
|
|
int32_t conti=pos;
|
|
h = hash64Lower_utf8_cont("and",3,h,&conti);
|
|
pos=conti;
|
|
// the two-word phrase, set it if we need to
|
|
h2 = h;
|
|
m_numWordsTotal2[i] = j-i+1;
|
|
}
|
|
*/
|
|
continue;
|
|
}
|
|
// . if this word can not be in a phrase then continue our
|
|
// search for a word that can
|
|
// . no punctuation can be in a phrase currently (++?)
|
|
//if ( m_bits->canBeInPhrase (j) ) {
|
|
//}
|
|
|
|
// keep this set right
|
|
//if (m_bits->isStopWord(j)||m_wlens[j]<=2) hasStopWord = true;
|
|
//if ( m_bits->isStopWord(j) ) hasStopWord = true;
|
|
|
|
// record lastWordj to indicate that word #j was a true word
|
|
lastWordj = j;
|
|
// . stop words should have a 0 spam value so don't count those
|
|
// . added by mdw in march 2002
|
|
/*
|
|
if ( ! m_bits->isStopWord(j) && m_spam ) {
|
|
// maintain the min spam
|
|
char spam = m_spam->getSpam ( j );
|
|
if ( minSpam == -1 || spam < minSpam ) minSpam = spam;
|
|
// . min weight from score vector
|
|
// . normal score here is 256, not 128, so shift
|
|
// down 3 to normalize it relatively
|
|
//if ( m_wordScores && (m_wordScores[j]>>3)<minScore)
|
|
// minScore = m_wordScores[j]>>3;
|
|
//if ( m_wordScores && m_wordScores[j] < minScore )
|
|
// minScore = m_wordScores[j];
|
|
}
|
|
*/
|
|
// if word #j can be in phrase then incorporate it's hash
|
|
if ( m_bits->canBeInPhrase (j) ) {
|
|
// continue the hash
|
|
//unsigned char *p= (unsigned char *)m_wptrs[j];
|
|
//unsigned char *pend = p + m_wlens[j];
|
|
//for ( ; p < pend ; p++ )
|
|
// h ^= g_hashtab[pos++][*p];
|
|
|
|
int32_t conti = pos;
|
|
|
|
// . get the punctuation mark separating two numbers
|
|
// . use space if can't find one
|
|
// . 1/234 1,234 1.234 10/11 "1 234" 1-5
|
|
//if (isNum && j==i + 2 && is_digit(m_wptrs[j][0]) ) {
|
|
// // get punct mark
|
|
// char c = m_wptrs[i+1][0];
|
|
// // if space try next
|
|
// if(c==' '&&m_wlens[i+1]>1) c=m_wptrs[i+1][1];
|
|
// // treat comma as nothing
|
|
// if ( c==',' ) c='\0';
|
|
// // treat / and . and - as they are, everything
|
|
// // else should be treated as a space
|
|
// else if(c!='/'&&c !='.'&& c!='-'&&c!=':')c=' ';
|
|
// // incorporate into hash if c is there
|
|
// if (c)h=hash64Lower_utf8_cont(&c,1,h,&conti);
|
|
//}
|
|
|
|
// hash the jth word into the hash
|
|
h = hash64Lower_utf8_cont(m_wptrs[j],
|
|
m_wlens[j],
|
|
h,
|
|
&conti );
|
|
pos = conti;
|
|
//h = hash64 ( h , m_words->getWordId (j) );
|
|
//if (m_words->getStripWordId(j))
|
|
// h2 = hash64 ( h2, m_words->getStripWordId(j));
|
|
//else h2 = hash64(h2, m_words->getWordId(j));
|
|
numWordsInPhrase++;
|
|
|
|
// N-word phrases?
|
|
if ( numWordsInPhrase == 2 ) { // h != h2 ) {
|
|
h2 = h;
|
|
m_numWordsTotal2[i] = j-i+1;
|
|
if ( m_bits->isStopWord(j) )
|
|
hasStopWord2 = true;
|
|
continue;
|
|
}
|
|
if ( numWordsInPhrase == 3 ) {
|
|
h3 = h;
|
|
m_numWordsTotal3[i] = j-i+1;
|
|
//continue;
|
|
break;
|
|
}
|
|
/*
|
|
if ( numWordsInPhrase == 4 ) {
|
|
h4 = h;
|
|
m_numWordsTotal4[i] = j-i+1;
|
|
continue;
|
|
}
|
|
if ( numWordsInPhrase == 5 ) {
|
|
h5 = h;
|
|
m_numWordsTotal5[i] = j-i+1;
|
|
continue;
|
|
}
|
|
*/
|
|
}
|
|
// if we cannot pair across word j then break
|
|
if ( ! m_bits->canPairAcross (j) ) break;
|
|
// keep chugging?
|
|
if ( numWordsInPhrase >= 5 ) {
|
|
// if we're not using stop words then break
|
|
if ( ! m_useStopWords ) break;
|
|
// if it's not a stop word then break
|
|
if ( ! m_bits->isStopWord (j) ) break;
|
|
}
|
|
// otherwise, get the next word
|
|
}
|
|
// if we had no phrase then use 0 as id (need 2+ words to be a pharse)
|
|
if ( numWordsInPhrase <= 1 ) {
|
|
nophrase:
|
|
m_phraseSpam[i] = PSKIP;
|
|
//m_phraseIds [i] = 0LL;
|
|
m_phraseIds2[i] = 0LL;
|
|
m_phraseIds3[i] = 0LL;
|
|
//m_stripPhraseIds [i] = 0LL;
|
|
//m_numWordsTotal[i] = 0;
|
|
m_numWordsTotal2[i] = 0;
|
|
m_numWordsTotal3[i] = 0;
|
|
return;
|
|
}
|
|
// don't jump the edge
|
|
//if ( j >= nw ) j = nw - 1;
|
|
// sanity check
|
|
if ( lastWordj == -1 ) { char *xx = NULL; *xx = 0; }
|
|
// set the phrase length (from word #i up to & including word #j)
|
|
//m_numWordsTotal[i] = j - i + 1;
|
|
//m_numWordsTotal [i] = lastWordj - i + 1;
|
|
// sanity check
|
|
if ( lastWordj - i + 1 > 255 ) { char *xx=NULL;*xx=0; }
|
|
// set the phrase spam
|
|
if ( minSpam == -1 ) minSpam = 0;
|
|
m_phraseSpam[i] = minSpam;
|
|
// return the phraseId
|
|
//m_phraseIds [i] = h;
|
|
// hyphen between numbers does not count (so 1-2 != 12)
|
|
if ( isNum ) hasHyphen = false;
|
|
// . the two word phrase id
|
|
// . "cd rom" -> cdrom
|
|
// . "fly paper" -> flypaper
|
|
// . "i-phone" -> iphone
|
|
// . "e-mail" -> email
|
|
if ( hasHyphen || ! hasStopWord2 ) {
|
|
//m_phraseIds [i] = h;
|
|
m_phraseIds2[i] = h2;
|
|
}
|
|
// . "st. and" !-> stand
|
|
// . "the rapist" !-> therapist
|
|
else {
|
|
//m_phraseIds [i] = h ^ 0x768867;
|
|
m_phraseIds2[i] = h2 ^ 0x768867;
|
|
}
|
|
// forget hyphen logic for these
|
|
m_phraseIds3[i] = h3;
|
|
//m_phraseIds4[i] = h4;
|
|
//m_phraseIds5[i] = h5;
|
|
|
|
//if ( h != h2 ) m_stripPhraseIds[i] = h2;
|
|
//else m_stripPhraseIds[i] = 0LL;
|
|
|
|
// the score weight, if any
|
|
//if ( m_phraseScores ) m_phraseScores [i] = minScore;
|
|
// sanity check
|
|
//if(m_phraseScores && minScore == 0x7fffffff ) {char *xx =NULL;*xx=0;}
|
|
// debug msg
|
|
//char *w = m_words->getWord(i) ;
|
|
//int32_t wlen = m_words->getWordLen(i) ;
|
|
//for ( int32_t k = 0 ; k < wlen ; k++ )
|
|
// fprintf(stderr,"%c",w[k]);
|
|
//fprintf(stderr,"--> hash=%"UINT64"\n",(uint64_t)h);
|
|
}
|
|
|
|
// . store phrase that starts with word #i into "printBuf"
|
|
// . return bytes stored in "printBuf"
|
|
char *Phrases::getPhrase ( int32_t i , int32_t *phrLen , int32_t npw ) {
|
|
// return 0 if no phrase
|
|
if ( m_phraseSpam[i] == PSKIP ) return NULL;
|
|
// store the phrase in here
|
|
static char buf[256];
|
|
// . how many words, including punct words, are in phrase?
|
|
// . this should never be 1 or less
|
|
//int32_t n = m_numWordsTotal[i] ;
|
|
int32_t n ;
|
|
if ( npw == 2 ) n = m_numWordsTotal2[i] ;
|
|
else if ( npw == 3 ) n = m_numWordsTotal3[i] ;
|
|
else { char *xx=NULL; *xx=0; }
|
|
//char *w1 = m_words->getWord(i);
|
|
//char *w2 = m_words->getWord(i+n-1);
|
|
//int32_t wlen2 = m_words->getWordLen(i+n-1);
|
|
//int32_t plen = ( w2 - w1 ) + wlen2;
|
|
|
|
char *s = buf;
|
|
char *send = buf + 255;
|
|
for (int32_t w = i;w<i+n;w++){
|
|
if (!m_words->isAlnum(w)){
|
|
// skip spaces for now since we has altogether now
|
|
*s++ = ' ';
|
|
/*
|
|
// . get the punctuation mark separating two numbers
|
|
// . use space if can't find one
|
|
// . 1/234 1,234 1.234 10/11 "1 234" 1-5
|
|
if ( is_digit(m_wptrs[i][0]) && w == i + 1 &&
|
|
is_digit(m_wptrs[i+2][0]) ) {
|
|
// get punct mark
|
|
char c = m_wptrs[i+1][0];
|
|
// if space try next
|
|
if(c==' '&&m_wlens[i+1]>1) c=m_wptrs[i+1][1];
|
|
// treat comma as nothing
|
|
if ( c==',' ) continue;//c='\0';
|
|
// treat / and . and - as they are, everything
|
|
// else should be treated as a space
|
|
else if(c!='/'&&c !='.'&& c!='-'&&c!=':')c=' ';
|
|
// print that
|
|
*s++ = c;
|
|
}
|
|
*/
|
|
continue;
|
|
}
|
|
char *w1 = m_words->getWord(w);
|
|
char *wend = w1 + m_words->getWordLen(w);
|
|
for ( int32_t j = 0 ; j < m_words->getWordLen(w) && s<send ; j++){
|
|
// write the lower case char from w1+j into "s"
|
|
int32_t size = to_lower_utf8 ( s , send , w1 + j , wend );
|
|
// advance
|
|
j += size;
|
|
s += size;
|
|
}
|
|
}
|
|
// null terminate
|
|
*s = '\0';
|
|
// set length we wrote into "buf"
|
|
*phrLen = s - buf;
|
|
|
|
// return ptr to buf
|
|
return buf;
|
|
}
|
|
|
|
/*
|
|
// . store phrase that starts with word #i into "printBuf"
|
|
// . return bytes stored in "printBuf"
|
|
char *Phrases::getNWordPhrase ( int32_t i , int32_t *phrLen , int32_t npw ) {
|
|
// return 0 if no phrase
|
|
if ( m_phraseSpam[i] == PSKIP ) return NULL;
|
|
// store the phrase in here
|
|
static char buf[512];
|
|
// . how many words, including punct words, are in phrase?
|
|
// . this should never be 1 or less
|
|
int32_t n = m_numWordsTotal[i] ;
|
|
char *dst = buf;
|
|
char *dend = buf + 255;
|
|
int32_t count = 0;
|
|
for (int32_t w = i; w<i+n && count<npw; w++ ) {
|
|
// do not breach the buffer
|
|
if ( dst + 4 >= dend ) break;
|
|
// all non alnum chars are spaces now
|
|
if ( ! m_words->isAlnum(w) ) {
|
|
// skip spaces for now since we has altogether now
|
|
*dst++ = ' ';
|
|
continue;
|
|
}
|
|
count++;
|
|
char *w1 = m_words->getWord(w);
|
|
int32_t wlen = m_words->getWordLen(w);
|
|
// store the word in lower case into "dst"
|
|
to_lower_utf8 ( dst , dend , w1 , w1 + wlen );
|
|
// advance destination cursor
|
|
dst += wlen;
|
|
}
|
|
// null terminate
|
|
*dst = '\0';
|
|
// set length we wrote into "buf"
|
|
*phrLen = dst - buf;
|
|
// return ptr to buf
|
|
return buf;
|
|
}
|
|
*/
|
|
|
|
/*
|
|
char *Phrases::getStripPhrase ( int32_t i , int32_t *phrLen ) {
|
|
// return 0 if no phrase
|
|
if ( m_phraseSpam[i] == PSKIP ) return NULL;
|
|
// store the phrase in here
|
|
static char buf[512];
|
|
// . how many words, including punct words, are in phrase?
|
|
// . this should never be 1 or less
|
|
int32_t n = m_numWordsTotal[i] ;
|
|
//char *w1 = m_words->getWord(i);
|
|
//char *w2 = m_words->getWord(i+n-1);
|
|
//int32_t wlen2 = m_words->getWordLen(i+n-1);
|
|
//int32_t plen = ( w2 - w1 ) + wlen2;
|
|
|
|
char *s = buf;
|
|
char *send = buf + 255;
|
|
for (int32_t w = i;w<i+n;w++){
|
|
if (!m_words->isAlnum(w)){
|
|
*s++ = ' ';
|
|
continue;
|
|
}
|
|
char *w1 = m_words->getWord(w);
|
|
|
|
for ( int32_t j = 0 ; j < m_words->getWordLen(w) && s<send ; j++){
|
|
// write the lower case char from w1+j into "s"
|
|
int32_t size = to_lower_ascii_utf8 ( s , send , w1 + j );
|
|
// advance
|
|
j += size;
|
|
s += size;
|
|
}
|
|
}
|
|
// null terminate
|
|
*s = '\0';
|
|
// set length we wrote into "buf"
|
|
*phrLen = s - buf;
|
|
|
|
// return ptr to buf
|
|
return buf;
|
|
}
|
|
*/
|
|
|
|
/*
|
|
// for getTermId()
|
|
#include "Indexdb.h"
|
|
|
|
// . hash all the words into "table"
|
|
bool Phrases::hash ( TermTable *table ,
|
|
Weights *weightsPtr ,
|
|
uint32_t baseScore ,
|
|
uint32_t maxScore ,
|
|
int64_t startHash ,
|
|
char *prefix1 ,
|
|
int32_t prefixLen1 ,
|
|
char *prefix2 ,
|
|
int32_t prefixLen2 ,
|
|
bool hashUniqueOnly ,
|
|
int32_t titleRecVersion,
|
|
int32_t niceness ) {
|
|
|
|
// don't hash if score is 0 or less.
|
|
if (baseScore <= 0) return true;
|
|
|
|
// point to the phrase weights array, m_pw[]
|
|
int32_t *weights = NULL;
|
|
if ( weightsPtr ) weights = weightsPtr->m_pw;
|
|
|
|
// is the table storing the terms as strings, too? used by
|
|
// PageParser.cpp
|
|
SafeBuf *pbuf = table->getParserBuf();
|
|
|
|
// . now add each phraseId to the index table
|
|
// . TODO: might want to add w/ uniqueOnly on if spam is 100%
|
|
uint32_t score;
|
|
bool huo;
|
|
for (int32_t i =0; i < m_numPhrases; i++) {
|
|
// should we hash this phraseId only if it's not hashed yet?
|
|
huo = hashUniqueOnly;
|
|
// a phraseSpam of PSKIP means word #i does not start a phrase
|
|
if ( m_phraseSpam[i] == PSKIP ) continue;
|
|
// don't hash it if it's heavily spammed (spam of 100%)
|
|
score = baseScore - ( baseScore * m_phraseSpam[i] ) / 100;
|
|
// . use weights instead if we have them
|
|
// . default weight should be 128!
|
|
if ( weights ) {
|
|
// skip if the weight is 0, we probably have menu
|
|
// elimination technology turned on...
|
|
if ( weights[i] == 0 ) continue;
|
|
// . the old way: we used a signed int32_t which could
|
|
// overflow before the divide and make artificially
|
|
// high term scores
|
|
//if(titleRecVersion < 85)
|
|
// score = (int32_t)((int32_t)score * weights[i]) / DW;
|
|
//else score = (score * weights[i]) / DW;
|
|
score = (score * weights[i]) / DW;
|
|
}
|
|
|
|
// weight by score if we need to
|
|
|
|
// if score is 0 because it's heavily spammed then we
|
|
// should hash just enough to index the phrase
|
|
if ( ! score ) { score = 1; huo = true; }
|
|
// get the phrase hash (includes coll,field prefixes)
|
|
int64_t h = g_indexdb.getTermId (startHash ,m_phraseIds[i]) ;
|
|
|
|
//int64_t h2 = 0LL;
|
|
//if (m_stripPhraseIds[i])
|
|
// h2 = g_indexdb.getTermId (startHash ,
|
|
// m_stripPhraseIds[i]) ;
|
|
int64_t h2 = g_indexdb.getTermId(startHash,m_phraseIds2[i]);
|
|
// we must mask it before adding it to the table because
|
|
// this table is also used to hash IndexLists into that come
|
|
// from LinkInfo classes (incoming link text). And when
|
|
// those IndexLists are hashed they used masked termIds.
|
|
// So we should too...
|
|
//h = h & TERMID_MASK;
|
|
// add to table
|
|
//int32_t score2;
|
|
//if ( titleRecVersion >= 36 ) {
|
|
//score2 = score >> 1;
|
|
//if ( score2 <= 0 ) score2 = 1;
|
|
//}
|
|
//else
|
|
// score2 = score;
|
|
|
|
QUICKPOLL(niceness);
|
|
|
|
if ( ! pbuf ) {
|
|
if ( ! table->addTerm ( h, score, maxScore, huo,
|
|
titleRecVersion ))
|
|
return false;
|
|
// hash the two-word phrase if h is not two words
|
|
if ( h2 && h2 != h &&
|
|
! table->addTerm ( h2, score, maxScore,
|
|
huo, titleRecVersion ))
|
|
return false;
|
|
continue;
|
|
}
|
|
// add phrase as string to hash table if we need to as well
|
|
int32_t plen;
|
|
char *p = getPhrase ( i , &plen );
|
|
int32_t slen;
|
|
//#if 1
|
|
char *s = table->storeTerm ( p , plen ,
|
|
prefix1 , prefixLen1 ,
|
|
prefix2 , prefixLen2 ,true,&slen);
|
|
//#else
|
|
//char *s = table->storeTerm ( p , plen ,
|
|
// "phrase" , 6 ,
|
|
// prefix2 , prefixLen2 , true, &slen );
|
|
//#endif
|
|
if ( ! table->addTerm( h, score, maxScore, huo ,
|
|
titleRecVersion, s, slen ) )
|
|
return false;
|
|
|
|
// if no strippable chars in phrase, we're done
|
|
if ( ! h2 || h2 == h ) continue;
|
|
|
|
p = getTwoWordPhrase(i, &plen);
|
|
|
|
s = table->storeTerm ( p , plen ,
|
|
prefix1 , prefixLen1 ,
|
|
prefix2 , prefixLen2 ,true,&slen);
|
|
if ( ! table->addTerm( h2, score , maxScore, huo ,
|
|
titleRecVersion , s, slen ) )
|
|
return false;
|
|
}
|
|
// . TODO: print spam %'s for phrases!!!
|
|
// . see Words.cpp for template code to do this
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
// . word #n is in a phrase if he has [word][punct] or [punct][word]
|
|
// before/after him and you can pair across the punct and include both
|
|
// in a phrase
|
|
// . used by SimpleQuery class to see if a word is in a phrase or not
|
|
// . if it is then the query may choose not to represent the word by itself
|
|
bool Phrases::isInPhrase ( int32_t n ) {
|
|
// returns true if we started a phrase (our phraseSpam is not PSKIP)
|
|
if ( m_phraseSpam[n] != PSKIP ) return true;
|
|
// . see if we were in a phrase started by a word before us
|
|
// . this only words since stop words - whose previous word cannot be
|
|
// paired across - are able to start phrases
|
|
if ( n < 2 ) return false;
|
|
if ( ! m_bits->canPairAcross(n-1) ) return false;
|
|
if ( ! m_bits->canBeInPhrase(n-2) ) return false;
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
// . get the index of the word that starts this phrase
|
|
// . returns -1 if none...factored out for
|
|
// . getLeftPhraseId and getLeftStripPhraseId
|
|
int32_t Phrases::getLeftPhraseIndex( int32_t i ) {
|
|
// return 0 if we no words before us
|
|
while ( i > 0 ) {
|
|
// check punct before
|
|
i--;
|
|
// can he be paired across
|
|
if ( m_words->isPunct(i)){
|
|
if ( ! m_bits->canPairAcross(i) ) return -1;
|
|
}
|
|
else{
|
|
// if word before him not in a phrase, bail
|
|
if ( ! isInPhrase ( i ) ) return -1;
|
|
// can he start ?
|
|
if ( ! m_bits->canStartPhrase ( i ) ) continue;
|
|
// yes he can
|
|
return i;
|
|
}
|
|
}
|
|
// none
|
|
return -1;
|
|
}
|
|
// . get the id of the phrase we are in that we do not start
|
|
// . returns 0 if none, even though 0 may be a valid phraseId!! TODO: fix
|
|
|
|
int64_t Phrases::getLeftPhraseId ( int32_t i ) {
|
|
int32_t index = getLeftPhraseIndex(i);
|
|
if ( index < 0 ) return 0LL;
|
|
return getPhraseId(index);
|
|
}
|
|
|
|
int64_t Phrases::getLeftStripPhraseId ( int32_t i ) {
|
|
int32_t index = getLeftPhraseIndex(i);
|
|
if ( index < 0 ) return 0LL;
|
|
return getStripPhraseId(index);
|
|
}
|
|
*/
|
|
int32_t Phrases::getMaxWordsInPhrase ( int32_t i , int64_t *pid ) {
|
|
|
|
*pid = 0LL;
|
|
|
|
/*
|
|
if ( m_numWordsTotal5[i] ) {
|
|
*pid = m_phraseIds5[i];
|
|
return m_numWordsTotal5[i];
|
|
}
|
|
|
|
if ( m_numWordsTotal4[i] ) {
|
|
*pid = m_phraseIds4[i];
|
|
return m_numWordsTotal4[i];
|
|
}
|
|
*/
|
|
if ( m_numWordsTotal3[i] ) {
|
|
*pid = m_phraseIds3[i];
|
|
return m_numWordsTotal3[i];
|
|
}
|
|
|
|
if ( m_numWordsTotal2[i] ) {
|
|
*pid = m_phraseIds2[i];
|
|
return m_numWordsTotal2[i];
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
int32_t Phrases::getMinWordsInPhrase ( int32_t i , int64_t *pid ) {
|
|
|
|
*pid = 0LL;
|
|
|
|
if ( m_numWordsTotal2[i] ) {
|
|
*pid = m_phraseIds2[i];
|
|
return m_numWordsTotal2[i];
|
|
}
|
|
|
|
return 0;
|
|
}
|