forked from Mirrors/privacore-open-source-search-engine
Remove commented out codes
This commit is contained in:
81
Bits.h
81
Bits.h
@ -103,44 +103,48 @@ typedef uint32_t wbit_t;
|
||||
// summary bits used for doing summaries at query time
|
||||
typedef uint16_t swbit_t;
|
||||
|
||||
// . used by SimpleQuery.cpp
|
||||
// . this isn't used for phrasing, it's just so a doc that has the same
|
||||
// # of query terms as another, but also one query stop word, won't be
|
||||
// ranked above the other doc just because of that
|
||||
//#define D_IS_QUERY_STOPWORD 0x40
|
||||
|
||||
class Bits {
|
||||
|
||||
public:
|
||||
|
||||
public:
|
||||
Bits();
|
||||
~Bits();
|
||||
|
||||
bool set2 ( Words *words, int32_t niceness ) {
|
||||
return set ( words,TITLEREC_CURRENT_VERSION,niceness); };
|
||||
bool set2( Words *words, int32_t niceness ) {
|
||||
return set( words, TITLEREC_CURRENT_VERSION, niceness );
|
||||
}
|
||||
|
||||
// . returns false and sets errno on error
|
||||
bool set ( Words *words ,
|
||||
char titleRecVersion ,
|
||||
int32_t niceness ,
|
||||
// provide it with a buffer to prevent a malloc
|
||||
char *buf = NULL ,
|
||||
int32_t bufSize= 0 );
|
||||
// provide it with a buffer to prevent a malloc
|
||||
bool set( Words *words, char titleRecVersion, int32_t niceness, char *buf = NULL, int32_t bufSize = 0 );
|
||||
|
||||
bool setForSummary ( Words *words ,
|
||||
// provide it with a buffer to prevent a malloc
|
||||
char *buf = NULL ,
|
||||
int32_t bufSize= 0 );
|
||||
// provide it with a buffer to prevent a malloc
|
||||
bool setForSummary( Words *words, char *buf = NULL, int32_t bufSize = 0 );
|
||||
|
||||
void reset();
|
||||
|
||||
bool isStopWord (int32_t i) {return m_bits[i]&D_IS_STOPWORD;};
|
||||
bool canBeInPhrase (int32_t i) {return m_bits[i]&D_CAN_BE_IN_PHRASE;};
|
||||
bool canStartPhrase (int32_t i) {return m_bits[i]&D_CAN_START_PHRASE;};
|
||||
bool canPeriodPreceed(int32_t i) {return m_bits[i]&D_CAN_PERIOD_PRECEED;};
|
||||
bool canPairAcross (int32_t i) {return m_bits[i]&D_CAN_PAIR_ACROSS;};
|
||||
//bool isIndexable (int32_t i) {return m_bits[i]&D_IS_INDEXABLE;};
|
||||
bool isCap (int32_t i) {return m_bits[i]&D_IS_CAP;};
|
||||
bool isStopWord( int32_t i ) {
|
||||
return m_bits[i] & D_IS_STOPWORD;
|
||||
}
|
||||
|
||||
bool canBeInPhrase( int32_t i ) {
|
||||
return m_bits[i] & D_CAN_BE_IN_PHRASE;
|
||||
}
|
||||
|
||||
bool canStartPhrase( int32_t i ) {
|
||||
return m_bits[i] & D_CAN_START_PHRASE;
|
||||
}
|
||||
|
||||
bool canPeriodPreceed( int32_t i ) {
|
||||
return m_bits[i] & D_CAN_PERIOD_PRECEED;
|
||||
}
|
||||
|
||||
bool canPairAcross( int32_t i ) {
|
||||
return m_bits[i] & D_CAN_PAIR_ACROSS;
|
||||
}
|
||||
|
||||
bool isCap( int32_t i ) {
|
||||
return m_bits[i] & D_IS_CAP;
|
||||
}
|
||||
|
||||
void printBits ( );
|
||||
void printBit ( int32_t i );
|
||||
|
||||
@ -150,12 +154,11 @@ class Bits {
|
||||
bool m_inLinkBitsSet;
|
||||
bool m_inUrlBitsSet;
|
||||
|
||||
//char m_localBuf [MAX_WORDS*10];
|
||||
char m_localBuf [ BITS_LOCALBUFSIZE ];
|
||||
|
||||
// leave public so Query.cpp can tweak this
|
||||
wbit_t *m_bits ;
|
||||
int32_t m_bitsSize;
|
||||
wbit_t *m_bits;
|
||||
int32_t m_bitsSize;
|
||||
|
||||
int32_t m_niceness;
|
||||
|
||||
@ -163,19 +166,15 @@ class Bits {
|
||||
// . used only by setForSummary() now to avoid having to update a
|
||||
// lot of code
|
||||
swbit_t *m_swbits;
|
||||
int32_t m_swbitsSize;
|
||||
int32_t m_swbitsSize;
|
||||
|
||||
private:
|
||||
Words *m_words;
|
||||
char m_titleRecVersion;
|
||||
bool m_needsFree;
|
||||
|
||||
Words *m_words;
|
||||
|
||||
char m_titleRecVersion;
|
||||
|
||||
bool m_needsFree;
|
||||
|
||||
// get bits for the ith word
|
||||
wbit_t getAlnumBits ( int32_t i , wbit_t prevBits );
|
||||
|
||||
// get bits for the ith word
|
||||
wbit_t getAlnumBits( int32_t i, wbit_t prevBits );
|
||||
};
|
||||
|
||||
#endif
|
||||
|
5
Linkdb.h
5
Linkdb.h
@ -505,12 +505,9 @@ class Msg25 {
|
||||
|
||||
// hack for seo pipeline in xmldoc.cpp
|
||||
int32_t m_hackrd;
|
||||
|
||||
// . we use Msg0 to get an indexList for href: terms
|
||||
// . the href: IndexList's docIds are docs that link to us
|
||||
|
||||
// . we now use Msg2 since it has "restrictIndexdb" support to limit
|
||||
// indexdb searches to just the root file to decrease disk seeks
|
||||
//Msg0 m_msg0;
|
||||
Msg5 m_msg5;
|
||||
RdbList m_list;
|
||||
|
||||
|
493
Phrases.cpp
493
Phrases.cpp
@ -5,9 +5,7 @@
|
||||
|
||||
Phrases::Phrases ( ) {
|
||||
m_buf = NULL;
|
||||
//m_phraseScores = NULL;
|
||||
m_phraseSpam = NULL;
|
||||
//m_phraseIds = NULL;
|
||||
}
|
||||
|
||||
Phrases::~Phrases ( ) {
|
||||
@ -15,12 +13,12 @@ Phrases::~Phrases ( ) {
|
||||
}
|
||||
|
||||
void Phrases::reset() {
|
||||
if ( m_buf && m_buf != m_localBuf )
|
||||
if ( m_buf && m_buf != m_localBuf ) {
|
||||
mfree ( m_buf , m_bufSize , "Phrases" );
|
||||
}
|
||||
|
||||
m_buf = NULL;
|
||||
//m_phraseScores = NULL;
|
||||
m_phraseSpam = NULL;
|
||||
//m_phraseIds = NULL;
|
||||
}
|
||||
|
||||
// initialize this token array with the string, "s" of length, "len".
|
||||
@ -32,10 +30,7 @@ bool Phrases::set( Words *words,
|
||||
int32_t niceness) {
|
||||
// reset in case being re-used
|
||||
reset();
|
||||
// always reset this
|
||||
//m_phraseScores = NULL;
|
||||
m_phraseSpam = NULL;
|
||||
//m_phraseIds = NULL;
|
||||
|
||||
// now we never use stop words and we just index two-word phrases
|
||||
// so that a search for "get a" in quotes will match a doc that has
|
||||
// the phrase "get a clue". it might impact performance, but it should
|
||||
@ -45,24 +40,16 @@ bool Phrases::set( Words *words,
|
||||
// index "kick a ball" as well as "kick a" and "a ball". i don't think
|
||||
// that will cause too much bloat.
|
||||
//useStopWords = false;
|
||||
|
||||
// ensure we have words
|
||||
if ( ! words ) return true;
|
||||
// set the words' scores array, m_wordScores
|
||||
//if ( scores ) m_wordScores = scores->m_scores;
|
||||
//else m_wordScores = NULL;
|
||||
|
||||
// . we have one phrase per word
|
||||
// . a phrase #n is "empty" if spam[n] == PSKIP
|
||||
m_numPhrases = words->getNumWords();
|
||||
|
||||
// replaces scores
|
||||
//m_sections = m_sections;
|
||||
//m_sectionPtrs = NULL;
|
||||
//if ( m_sections ) m_sectionPtrs = m_sections->m_sectionPtrs;
|
||||
|
||||
// how much mem do we need?
|
||||
//int32_t need = (18+1+(3+8*3)) * m_numPhrases;
|
||||
int32_t need = m_numPhrases * (8+8+1+1+1);
|
||||
//if ( m_wordScores ) need += 4 * m_numPhrases;
|
||||
|
||||
// alloc if we need to
|
||||
if ( need > PHRASE_BUF_SIZE )
|
||||
@ -75,35 +62,21 @@ bool Phrases::set( Words *words,
|
||||
m_bufSize = need;
|
||||
// set up arrays
|
||||
char *p = m_buf;
|
||||
//m_phraseIds = (int64_t *)p ; p += m_numPhrases * 8;
|
||||
|
||||
// phrase not using stop words
|
||||
m_phraseIds2 = (int64_t *)p ; p += m_numPhrases * 8;
|
||||
m_phraseIds3 = (int64_t *)p ; p += m_numPhrases * 8;
|
||||
//m_phraseIds4 = (int64_t *)p ; p += m_numPhrases * 8;
|
||||
//m_phraseIds5 = (int64_t *)p ; p += m_numPhrases * 8;
|
||||
//m_stripPhraseIds = (int64_t *)p ; p += m_numPhrases * 8;
|
||||
//if ( m_wordScores ) {
|
||||
// m_phraseScores = (int32_t *)p ;
|
||||
// p += m_numPhrases * 4;
|
||||
//}
|
||||
m_phraseSpam = (unsigned char *)p ; p += m_numPhrases * 1;
|
||||
//m_numWordsTotal = (unsigned char *)p ; p += m_numPhrases * 1;
|
||||
m_numWordsTotal2= (unsigned char *)p ; p += m_numPhrases * 1;
|
||||
m_numWordsTotal3= (unsigned char *)p ; p += m_numPhrases * 1;
|
||||
//m_numWordsTotal4= (unsigned char *)p ; p += m_numPhrases * 1;
|
||||
//m_numWordsTotal5= (unsigned char *)p ; p += m_numPhrases * 1;
|
||||
|
||||
// sanity
|
||||
if ( p != m_buf + need ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// clear this
|
||||
//memset ( m_numWordsTotal , 0 , m_numPhrases );
|
||||
|
||||
memset ( m_numWordsTotal2 , 0 , m_numPhrases );
|
||||
memset ( m_numWordsTotal3 , 0 , m_numPhrases );
|
||||
//memset ( m_numWordsTotal4 , 0 , m_numPhrases );
|
||||
//memset ( m_numWordsTotal5 , 0 , m_numPhrases );
|
||||
|
||||
|
||||
// point to this info while we parse
|
||||
m_words = words;
|
||||
m_wptrs = words->getWords();
|
||||
@ -112,8 +85,10 @@ bool Phrases::set( Words *words,
|
||||
m_bits = bits;
|
||||
m_useStopWords = useStopWords;
|
||||
m_useStems = useStems;
|
||||
|
||||
// we now are dependent on this
|
||||
m_titleRecVersion = titleRecVersion;
|
||||
|
||||
// . set the phrases
|
||||
// . sets m_phraseIds [i]
|
||||
// . sets m_phraseSpam[i] to PSKIP if NO phrase exists
|
||||
@ -131,48 +106,30 @@ bool Phrases::set( Words *words,
|
||||
// . ofmice
|
||||
// . mice.andmen
|
||||
void Phrases::setPhrase ( int32_t i, int32_t niceness ) {
|
||||
// . if the ith word cannot start a phrase then we have no phrase
|
||||
// . we indicate NULL phrasesIds with a spam of PSKIP
|
||||
// . we now index all regardless! we want to be able to search
|
||||
// for "a thing" or something. so do it!
|
||||
//if ( ! m_bits->canStartPhrase ( i ) ) {
|
||||
// m_phraseSpam[i] = PSKIP;
|
||||
// m_phraseIds [i] = 0LL;
|
||||
// return;
|
||||
//}
|
||||
|
||||
// MDW: now Weights.cpp should encompass all this logic
|
||||
// or if score <= 0, set in Scores.cpp
|
||||
//if ( m_wordScores && m_wordScores[i] <= 0 ) {
|
||||
// m_phraseSpam[i] = PSKIP;
|
||||
// m_phraseIds [i] = 0LL;
|
||||
// return;
|
||||
//}
|
||||
|
||||
// hash of the phrase
|
||||
int64_t h = 0LL;
|
||||
|
||||
// the hash of the two-word phrase (now we do 3,4 and 5 word phrases)
|
||||
int64_t h2 = 0LL;
|
||||
int64_t h3 = 0LL;
|
||||
//int64_t h4 = 0LL;
|
||||
//int64_t h5 = 0LL;
|
||||
|
||||
// reset
|
||||
unsigned char pos = 0;
|
||||
|
||||
// now look for other tokens that should follow the ith token
|
||||
int32_t nw = m_words->getNumWords();
|
||||
int32_t numWordsInPhrase = 1;
|
||||
|
||||
// use the min spam from all words in the phrase as the spam for phrase
|
||||
char minSpam = -1;
|
||||
|
||||
// we need to hash "1 / 8" differently from "1.8" from "1,000" etc.
|
||||
char isNum = is_digit(m_wptrs[i][0]);
|
||||
// min score
|
||||
//int32_t minScore ;
|
||||
//if ( m_wordScores ) minScore = m_wordScores[i];
|
||||
// if i is not a stop word, it can set the min spam initially
|
||||
//if ( ! m_bits->isStopWord(i) &&m_spam ) minSpam = m_spam->getSpam(i);
|
||||
|
||||
// do not include punct/tag words in the m_numWordsTotal[j] count
|
||||
// of the total words in the phrase. these are just usesless tails.
|
||||
int32_t lastWordj = -1;
|
||||
|
||||
// loop over following words
|
||||
int32_t j;
|
||||
bool hasHyphen ;
|
||||
@ -189,18 +146,13 @@ void Phrases::setPhrase ( int32_t i, int32_t niceness ) {
|
||||
// so indeed, skip it then
|
||||
goto nophrase;
|
||||
|
||||
//h = hash64 ( h, m_words->getWordId(i));
|
||||
h = m_wids[i];
|
||||
|
||||
// set position
|
||||
pos = (unsigned char)m_wlens[i];
|
||||
//if (m_words->getStripWordId(i))
|
||||
// h2 = hash64 ( h2, m_words->getStripWordId(i));
|
||||
//else h2 = h;
|
||||
|
||||
hasHyphen = false;
|
||||
hasStopWord2 = m_bits->isStopWord(i);
|
||||
// this makes it true now too
|
||||
//if ( m_wlens[i] <= 2 ) hasStopWord = true;
|
||||
|
||||
for ( j = i + 1 ; j < nw ; j++ ) {
|
||||
QUICKPOLL(niceness);
|
||||
@ -213,93 +165,31 @@ void Phrases::setPhrase ( int32_t i, int32_t niceness ) {
|
||||
if ( ! m_wids[j] ) {
|
||||
// if we cannot pair across word j then break
|
||||
if ( ! m_bits->canPairAcross (j) ) break;
|
||||
|
||||
// does it have a hyphen?
|
||||
if (j==i+1 && m_words->hasChar(j,'-')) hasHyphen=true;
|
||||
/*
|
||||
// "D & B" --> dandb
|
||||
if (j==i+1 && m_words->hasChar(j,'&')) {
|
||||
// set this
|
||||
hasStopWord = true;
|
||||
// insert "and"
|
||||
int32_t conti=pos;
|
||||
h = hash64Lower_utf8_cont("and",3,h,&conti);
|
||||
pos=conti;
|
||||
// the two-word phrase, set it if we need to
|
||||
h2 = h;
|
||||
m_numWordsTotal2[i] = j-i+1;
|
||||
}
|
||||
*/
|
||||
|
||||
continue;
|
||||
}
|
||||
// . if this word can not be in a phrase then continue our
|
||||
// search for a word that can
|
||||
// . no punctuation can be in a phrase currently (++?)
|
||||
//if ( m_bits->canBeInPhrase (j) ) {
|
||||
//}
|
||||
|
||||
// keep this set right
|
||||
//if (m_bits->isStopWord(j)||m_wlens[j]<=2) hasStopWord = true;
|
||||
//if ( m_bits->isStopWord(j) ) hasStopWord = true;
|
||||
|
||||
// record lastWordj to indicate that word #j was a true word
|
||||
lastWordj = j;
|
||||
// . stop words should have a 0 spam value so don't count those
|
||||
// . added by mdw in march 2002
|
||||
/*
|
||||
if ( ! m_bits->isStopWord(j) && m_spam ) {
|
||||
// maintain the min spam
|
||||
char spam = m_spam->getSpam ( j );
|
||||
if ( minSpam == -1 || spam < minSpam ) minSpam = spam;
|
||||
// . min weight from score vector
|
||||
// . normal score here is 256, not 128, so shift
|
||||
// down 3 to normalize it relatively
|
||||
//if ( m_wordScores && (m_wordScores[j]>>3)<minScore)
|
||||
// minScore = m_wordScores[j]>>3;
|
||||
//if ( m_wordScores && m_wordScores[j] < minScore )
|
||||
// minScore = m_wordScores[j];
|
||||
}
|
||||
*/
|
||||
|
||||
// if word #j can be in phrase then incorporate it's hash
|
||||
if ( m_bits->canBeInPhrase (j) ) {
|
||||
// continue the hash
|
||||
//unsigned char *p= (unsigned char *)m_wptrs[j];
|
||||
//unsigned char *pend = p + m_wlens[j];
|
||||
//for ( ; p < pend ; p++ )
|
||||
// h ^= g_hashtab[pos++][*p];
|
||||
|
||||
int32_t conti = pos;
|
||||
|
||||
// . get the punctuation mark separting two numbers
|
||||
// . use space if can't find one
|
||||
// . 1/234 1,234 1.234 10/11 "1 234" 1-5
|
||||
//if (isNum && j==i + 2 && is_digit(m_wptrs[j][0]) ) {
|
||||
// // get punct mark
|
||||
// char c = m_wptrs[i+1][0];
|
||||
// // if space try next
|
||||
// if(c==' '&&m_wlens[i+1]>1) c=m_wptrs[i+1][1];
|
||||
// // treat comma as nothing
|
||||
// if ( c==',' ) c='\0';
|
||||
// // treat / and . and - as they are, everything
|
||||
// // else should be treated as a space
|
||||
// else if(c!='/'&&c !='.'&& c!='-'&&c!=':')c=' ';
|
||||
// // incorporate into hash if c is there
|
||||
// if (c)h=hash64Lower_utf8_cont(&c,1,h,&conti);
|
||||
//}
|
||||
|
||||
// hash the jth word into the hash
|
||||
h = hash64Lower_utf8_cont(m_wptrs[j],
|
||||
m_wlens[j],
|
||||
h,
|
||||
&conti );
|
||||
pos = conti;
|
||||
//h = hash64 ( h , m_words->getWordId (j) );
|
||||
//if (m_words->getStripWordId(j))
|
||||
// h2 = hash64 ( h2, m_words->getStripWordId(j));
|
||||
//else h2 = hash64(h2, m_words->getWordId(j));
|
||||
|
||||
numWordsInPhrase++;
|
||||
|
||||
// N-word phrases?
|
||||
if ( numWordsInPhrase == 2 ) { // h != h2 ) {
|
||||
if ( numWordsInPhrase == 2 ) {
|
||||
h2 = h;
|
||||
m_numWordsTotal2[i] = j-i+1;
|
||||
if ( m_bits->isStopWord(j) )
|
||||
@ -312,21 +202,11 @@ void Phrases::setPhrase ( int32_t i, int32_t niceness ) {
|
||||
//continue;
|
||||
break;
|
||||
}
|
||||
/*
|
||||
if ( numWordsInPhrase == 4 ) {
|
||||
h4 = h;
|
||||
m_numWordsTotal4[i] = j-i+1;
|
||||
continue;
|
||||
}
|
||||
if ( numWordsInPhrase == 5 ) {
|
||||
h5 = h;
|
||||
m_numWordsTotal5[i] = j-i+1;
|
||||
continue;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
// if we cannot pair across word j then break
|
||||
if ( ! m_bits->canPairAcross (j) ) break;
|
||||
|
||||
// keep chugging?
|
||||
if ( numWordsInPhrase >= 5 ) {
|
||||
// if we're not using stop words then break
|
||||
@ -336,35 +216,31 @@ void Phrases::setPhrase ( int32_t i, int32_t niceness ) {
|
||||
}
|
||||
// otherwise, get the next word
|
||||
}
|
||||
|
||||
// if we had no phrase then use 0 as id (need 2+ words to be a pharse)
|
||||
if ( numWordsInPhrase <= 1 ) {
|
||||
nophrase:
|
||||
m_phraseSpam[i] = PSKIP;
|
||||
//m_phraseIds [i] = 0LL;
|
||||
m_phraseIds2[i] = 0LL;
|
||||
m_phraseIds3[i] = 0LL;
|
||||
//m_stripPhraseIds [i] = 0LL;
|
||||
//m_numWordsTotal[i] = 0;
|
||||
m_phraseIds3[i] = 0LL;
|
||||
m_numWordsTotal2[i] = 0;
|
||||
m_numWordsTotal3[i] = 0;
|
||||
return;
|
||||
}
|
||||
// don't jump the edge
|
||||
//if ( j >= nw ) j = nw - 1;
|
||||
|
||||
// sanity check
|
||||
if ( lastWordj == -1 ) { char *xx = NULL; *xx = 0; }
|
||||
// set the phrase length (from word #i upto & including word #j)
|
||||
//m_numWordsTotal[i] = j - i + 1;
|
||||
//m_numWordsTotal [i] = lastWordj - i + 1;
|
||||
|
||||
// sanity check
|
||||
if ( lastWordj - i + 1 > 255 ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// set the phrase spam
|
||||
if ( minSpam == -1 ) minSpam = 0;
|
||||
m_phraseSpam[i] = minSpam;
|
||||
// return the phraseId
|
||||
//m_phraseIds [i] = h;
|
||||
|
||||
// hyphen between numbers does not count (so 1-2 != 12)
|
||||
if ( isNum ) hasHyphen = false;
|
||||
|
||||
// . the two word phrase id
|
||||
// . "cd rom" -> cdrom
|
||||
// . "fly paper" -> flypaper
|
||||
@ -380,24 +256,9 @@ void Phrases::setPhrase ( int32_t i, int32_t niceness ) {
|
||||
//m_phraseIds [i] = h ^ 0x768867;
|
||||
m_phraseIds2[i] = h2 ^ 0x768867;
|
||||
}
|
||||
|
||||
// forget hyphen logic for these
|
||||
m_phraseIds3[i] = h3;
|
||||
//m_phraseIds4[i] = h4;
|
||||
//m_phraseIds5[i] = h5;
|
||||
|
||||
//if ( h != h2 ) m_stripPhraseIds[i] = h2;
|
||||
//else m_stripPhraseIds[i] = 0LL;
|
||||
|
||||
// the score weight, if any
|
||||
//if ( m_phraseScores ) m_phraseScores [i] = minScore;
|
||||
// sanity check
|
||||
//if(m_phraseScores && minScore == 0x7fffffff ) {char *xx =NULL;*xx=0;}
|
||||
// debug msg
|
||||
//char *w = m_words->getWord(i) ;
|
||||
//int32_t wlen = m_words->getWordLen(i) ;
|
||||
//for ( int32_t k = 0 ; k < wlen ; k++ )
|
||||
// fprintf(stderr,"%c",w[k]);
|
||||
//fprintf(stderr,"--> hash=%"UINT64"\n",(uint64_t)h);
|
||||
}
|
||||
|
||||
// . store phrase that starts with word #i into "printBuf"
|
||||
@ -414,10 +275,6 @@ char *Phrases::getPhrase ( int32_t i , int32_t *phrLen , int32_t npw ) {
|
||||
if ( npw == 2 ) n = m_numWordsTotal2[i] ;
|
||||
else if ( npw == 3 ) n = m_numWordsTotal3[i] ;
|
||||
else { char *xx=NULL; *xx=0; }
|
||||
//char *w1 = m_words->getWord(i);
|
||||
//char *w2 = m_words->getWord(i+n-1);
|
||||
//int32_t wlen2 = m_words->getWordLen(i+n-1);
|
||||
//int32_t plen = ( w2 - w1 ) + wlen2;
|
||||
|
||||
char *s = buf;
|
||||
char *send = buf + 255;
|
||||
@ -425,25 +282,6 @@ char *Phrases::getPhrase ( int32_t i , int32_t *phrLen , int32_t npw ) {
|
||||
if (!m_words->isAlnum(w)){
|
||||
// skip spaces for now since we has altogether now
|
||||
*s++ = ' ';
|
||||
/*
|
||||
// . get the punctuation mark separting two numbers
|
||||
// . use space if can't find one
|
||||
// . 1/234 1,234 1.234 10/11 "1 234" 1-5
|
||||
if ( is_digit(m_wptrs[i][0]) && w == i + 1 &&
|
||||
is_digit(m_wptrs[i+2][0]) ) {
|
||||
// get punct mark
|
||||
char c = m_wptrs[i+1][0];
|
||||
// if space try next
|
||||
if(c==' '&&m_wlens[i+1]>1) c=m_wptrs[i+1][1];
|
||||
// treat comma as nothing
|
||||
if ( c==',' ) continue;//c='\0';
|
||||
// treat / and . and - as they are, everything
|
||||
// else should be treated as a space
|
||||
else if(c!='/'&&c !='.'&& c!='-'&&c!=':')c=' ';
|
||||
// print that
|
||||
*s++ = c;
|
||||
}
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
char *w1 = m_words->getWord(w);
|
||||
@ -465,218 +303,6 @@ char *Phrases::getPhrase ( int32_t i , int32_t *phrLen , int32_t npw ) {
|
||||
return buf;
|
||||
}
|
||||
|
||||
/*
|
||||
// . store phrase that starts with word #i into "printBuf"
|
||||
// . return bytes stored in "printBuf"
|
||||
char *Phrases::getNWordPhrase ( int32_t i , int32_t *phrLen , int32_t npw ) {
|
||||
// return 0 if no phrase
|
||||
if ( m_phraseSpam[i] == PSKIP ) return NULL;
|
||||
// store the phrase in here
|
||||
static char buf[512];
|
||||
// . how many words, including punct words, are in phrase?
|
||||
// . this should never be 1 or less
|
||||
int32_t n = m_numWordsTotal[i] ;
|
||||
char *dst = buf;
|
||||
char *dend = buf + 255;
|
||||
int32_t count = 0;
|
||||
for (int32_t w = i; w<i+n && count<npw; w++ ) {
|
||||
// do not breach the buffer
|
||||
if ( dst + 4 >= dend ) break;
|
||||
// all non alnum chars are spaces now
|
||||
if ( ! m_words->isAlnum(w) ) {
|
||||
// skip spaces for now since we has altogether now
|
||||
*dst++ = ' ';
|
||||
continue;
|
||||
}
|
||||
count++;
|
||||
char *w1 = m_words->getWord(w);
|
||||
int32_t wlen = m_words->getWordLen(w);
|
||||
// store the word in lower case into "dst"
|
||||
to_lower_utf8 ( dst , dend , w1 , w1 + wlen );
|
||||
// advance destination cursor
|
||||
dst += wlen;
|
||||
}
|
||||
// null terminate
|
||||
*dst = '\0';
|
||||
// set length we wrote into "buf"
|
||||
*phrLen = dst - buf;
|
||||
// return ptr to buf
|
||||
return buf;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
char *Phrases::getStripPhrase ( int32_t i , int32_t *phrLen ) {
|
||||
// return 0 if no phrase
|
||||
if ( m_phraseSpam[i] == PSKIP ) return NULL;
|
||||
// store the phrase in here
|
||||
static char buf[512];
|
||||
// . how many words, including punct words, are in phrase?
|
||||
// . this should never be 1 or less
|
||||
int32_t n = m_numWordsTotal[i] ;
|
||||
//char *w1 = m_words->getWord(i);
|
||||
//char *w2 = m_words->getWord(i+n-1);
|
||||
//int32_t wlen2 = m_words->getWordLen(i+n-1);
|
||||
//int32_t plen = ( w2 - w1 ) + wlen2;
|
||||
|
||||
char *s = buf;
|
||||
char *send = buf + 255;
|
||||
for (int32_t w = i;w<i+n;w++){
|
||||
if (!m_words->isAlnum(w)){
|
||||
*s++ = ' ';
|
||||
continue;
|
||||
}
|
||||
char *w1 = m_words->getWord(w);
|
||||
|
||||
for ( int32_t j = 0 ; j < m_words->getWordLen(w) && s<send ; j++){
|
||||
// write the lower case char from w1+j into "s"
|
||||
int32_t size = to_lower_ascii_utf8 ( s , send , w1 + j );
|
||||
// advance
|
||||
j += size;
|
||||
s += size;
|
||||
}
|
||||
}
|
||||
// null terminate
|
||||
*s = '\0';
|
||||
// set length we wrote into "buf"
|
||||
*phrLen = s - buf;
|
||||
|
||||
// return ptr to buf
|
||||
return buf;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
// for getTermId()
|
||||
#include "Indexdb.h"
|
||||
|
||||
// . hash all the words into "table"
|
||||
bool Phrases::hash ( TermTable *table ,
|
||||
Weights *weightsPtr ,
|
||||
uint32_t baseScore ,
|
||||
uint32_t maxScore ,
|
||||
int64_t startHash ,
|
||||
char *prefix1 ,
|
||||
int32_t prefixLen1 ,
|
||||
char *prefix2 ,
|
||||
int32_t prefixLen2 ,
|
||||
bool hashUniqueOnly ,
|
||||
int32_t titleRecVersion,
|
||||
int32_t niceness ) {
|
||||
|
||||
// don't hash if score is 0 or less.
|
||||
if (baseScore <= 0) return true;
|
||||
|
||||
// point to the phrase weights array, m_pw[]
|
||||
int32_t *weights = NULL;
|
||||
if ( weightsPtr ) weights = weightsPtr->m_pw;
|
||||
|
||||
// is the table storing the terms as strings, too? used by
|
||||
// PageParser.cpp
|
||||
SafeBuf *pbuf = table->getParserBuf();
|
||||
|
||||
// . now add each phraseId to the index table
|
||||
// . TODO: might want to add w/ uniqueOnly on if spam is 100%
|
||||
uint32_t score;
|
||||
bool huo;
|
||||
for (int32_t i =0; i < m_numPhrases; i++) {
|
||||
// should we hash this phraseId only if it's not hashed yet?
|
||||
huo = hashUniqueOnly;
|
||||
// a phraseSpam of PSKIP means word #i does not start a phrase
|
||||
if ( m_phraseSpam[i] == PSKIP ) continue;
|
||||
// don't hash it if it's heavily spammed (spam of 100%)
|
||||
score = baseScore - ( baseScore * m_phraseSpam[i] ) / 100;
|
||||
// . use weights instead if we have them
|
||||
// . default weight should be 128!
|
||||
if ( weights ) {
|
||||
// skip if the weight is 0, we probably have menu
|
||||
// eelimination technology turned on...
|
||||
if ( weights[i] == 0 ) continue;
|
||||
// . the old way: we used a signed int32_t which could
|
||||
// overflow before the divide and make artificially
|
||||
// high term scores
|
||||
//if(titleRecVersion < 85)
|
||||
// score = (int32_t)((int32_t)score * weights[i]) / DW;
|
||||
//else score = (score * weights[i]) / DW;
|
||||
score = (score * weights[i]) / DW;
|
||||
}
|
||||
|
||||
// weight by score if we need to
|
||||
|
||||
// if score is 0 because it's heavily spammed then we
|
||||
// should hash just enough to index the phrase
|
||||
if ( ! score ) { score = 1; huo = true; }
|
||||
// get the phrase hash (includes coll,field prefixes)
|
||||
int64_t h = g_indexdb.getTermId (startHash ,m_phraseIds[i]) ;
|
||||
|
||||
//int64_t h2 = 0LL;
|
||||
//if (m_stripPhraseIds[i])
|
||||
// h2 = g_indexdb.getTermId (startHash ,
|
||||
// m_stripPhraseIds[i]) ;
|
||||
int64_t h2 = g_indexdb.getTermId(startHash,m_phraseIds2[i]);
|
||||
// we must mask it before adding it to the table because
|
||||
// this table is also used to hash IndexLists into that come
|
||||
// from LinkInfo classes (incoming link text). And when
|
||||
// those IndexLists are hashed they used masked termIds.
|
||||
// So we should too...
|
||||
//h = h & TERMID_MASK;
|
||||
// add to table
|
||||
//int32_t score2;
|
||||
//if ( titleRecVersion >= 36 ) {
|
||||
//score2 = score >> 1;
|
||||
//if ( score2 <= 0 ) score2 = 1;
|
||||
//}
|
||||
//else
|
||||
// score2 = score;
|
||||
|
||||
QUICKPOLL(niceness);
|
||||
|
||||
if ( ! pbuf ) {
|
||||
if ( ! table->addTerm ( h, score, maxScore, huo,
|
||||
titleRecVersion ))
|
||||
return false;
|
||||
// hash the two-word phrase if h is not two words
|
||||
if ( h2 && h2 != h &&
|
||||
! table->addTerm ( h2, score, maxScore,
|
||||
huo, titleRecVersion ))
|
||||
return false;
|
||||
continue;
|
||||
}
|
||||
// add phrase as string to hash table if we need to as well
|
||||
int32_t plen;
|
||||
char *p = getPhrase ( i , &plen );
|
||||
int32_t slen;
|
||||
//#if 1
|
||||
char *s = table->storeTerm ( p , plen ,
|
||||
prefix1 , prefixLen1 ,
|
||||
prefix2 , prefixLen2 ,true,&slen);
|
||||
//#else
|
||||
//char *s = table->storeTerm ( p , plen ,
|
||||
// "phrase" , 6 ,
|
||||
// prefix2 , prefixLen2 , true, &slen );
|
||||
//#endif
|
||||
if ( ! table->addTerm( h, score, maxScore, huo ,
|
||||
titleRecVersion, s, slen ) )
|
||||
return false;
|
||||
|
||||
// if no strippable chars in phrase, we're done
|
||||
if ( ! h2 || h2 == h ) continue;
|
||||
|
||||
p = getTwoWordPhrase(i, &plen);
|
||||
|
||||
s = table->storeTerm ( p , plen ,
|
||||
prefix1 , prefixLen1 ,
|
||||
prefix2 , prefixLen2 ,true,&slen);
|
||||
if ( ! table->addTerm( h2, score , maxScore, huo ,
|
||||
titleRecVersion , s, slen ) )
|
||||
return false;
|
||||
}
|
||||
// . TODO: print spam %'s for phrases!!!
|
||||
// . see Words.cpp for template code to do this
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
|
||||
// . word #n is in a phrase if he has [word][punct] or [punct][word]
|
||||
// before/after him and you can pair across the punct and include both
|
||||
// in a phrase
|
||||
@ -685,6 +311,7 @@ bool Phrases::hash ( TermTable *table ,
|
||||
bool Phrases::isInPhrase ( int32_t n ) {
|
||||
// returns true if we started a phrase (our phraseSpam is not PSKIP)
|
||||
if ( m_phraseSpam[n] != PSKIP ) return true;
|
||||
|
||||
// . see if we were in a phrase started by a word before us
|
||||
// . this only words since stop words - whose previous word cannot be
|
||||
// paired across - are able to start phrases
|
||||
@ -694,61 +321,10 @@ bool Phrases::isInPhrase ( int32_t n ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
// . get the index of the word that starts this phrase
|
||||
// . returns -1 if none...factored out for
|
||||
// . getLeftPhraseId and getLeftStripPhraseId
|
||||
int32_t Phrases::getLeftPhraseIndex( int32_t i ) {
|
||||
// return 0 if we no words before us
|
||||
while ( i > 0 ) {
|
||||
// check punct before
|
||||
i--;
|
||||
// can he be paired across
|
||||
if ( m_words->isPunct(i)){
|
||||
if ( ! m_bits->canPairAcross(i) ) return -1;
|
||||
}
|
||||
else{
|
||||
// if word before him not in a phrase, bail
|
||||
if ( ! isInPhrase ( i ) ) return -1;
|
||||
// can he start ?
|
||||
if ( ! m_bits->canStartPhrase ( i ) ) continue;
|
||||
// yes he can
|
||||
return i;
|
||||
}
|
||||
}
|
||||
// none
|
||||
return -1;
|
||||
}
|
||||
// . get the id of the phrase we are in that we do not start
|
||||
// . returns 0 if none, even though 0 may be a valid phraseId!! TODO: fix
|
||||
|
||||
int64_t Phrases::getLeftPhraseId ( int32_t i ) {
|
||||
int32_t index = getLeftPhraseIndex(i);
|
||||
if ( index < 0 ) return 0LL;
|
||||
return getPhraseId(index);
|
||||
}
|
||||
|
||||
int64_t Phrases::getLeftStripPhraseId ( int32_t i ) {
|
||||
int32_t index = getLeftPhraseIndex(i);
|
||||
if ( index < 0 ) return 0LL;
|
||||
return getStripPhraseId(index);
|
||||
}
|
||||
*/
|
||||
int32_t Phrases::getMaxWordsInPhrase ( int32_t i , int64_t *pid ) {
|
||||
|
||||
*pid = 0LL;
|
||||
|
||||
/*
|
||||
if ( m_numWordsTotal5[i] ) {
|
||||
*pid = m_phraseIds5[i];
|
||||
return m_numWordsTotal5[i];
|
||||
}
|
||||
|
||||
if ( m_numWordsTotal4[i] ) {
|
||||
*pid = m_phraseIds4[i];
|
||||
return m_numWordsTotal4[i];
|
||||
}
|
||||
*/
|
||||
if ( m_numWordsTotal3[i] ) {
|
||||
*pid = m_phraseIds3[i];
|
||||
return m_numWordsTotal3[i];
|
||||
@ -764,7 +340,6 @@ int32_t Phrases::getMaxWordsInPhrase ( int32_t i , int64_t *pid ) {
|
||||
|
||||
|
||||
int32_t Phrases::getMinWordsInPhrase ( int32_t i , int64_t *pid ) {
|
||||
|
||||
*pid = 0LL;
|
||||
|
||||
if ( m_numWordsTotal2[i] ) {
|
||||
|
Reference in New Issue
Block a user