mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-14 02:36:06 -04:00
Remove commented out code
This commit is contained in:
34
XmlDoc.cpp
34
XmlDoc.cpp
@ -5062,13 +5062,6 @@ HashTableX *XmlDoc::getCountTable ( ) {
|
||||
char *fv = getFragVec();
|
||||
if ( ! fv || fv == (void *)-1 ) return (HashTableX *)fv;
|
||||
|
||||
//LinkInfo *info2 = getLinkInfo2();
|
||||
//if ( ! info2 || info2 == (LinkInfo *)-1 ) return (HashTableX *)info2;
|
||||
|
||||
// init our count table otherwise
|
||||
//if(! m_countTable.set( 8,4,1024,NULL,0,false,m_niceness,"xmlcnttbl"))
|
||||
// return NULL;
|
||||
|
||||
// breathe
|
||||
QUICKPOLL ( m_niceness );
|
||||
|
||||
@ -5079,9 +5072,6 @@ HashTableX *XmlDoc::getCountTable ( ) {
|
||||
// shortcut
|
||||
HashTableX *ct = &m_countTable;
|
||||
|
||||
// reset the counts, just in case set() below does not
|
||||
//ct->reset();
|
||||
|
||||
// ez var
|
||||
int64_t *wids = words->getWordIds ();
|
||||
nodeid_t *tids = words->getTagIds ();
|
||||
@ -5096,9 +5086,6 @@ HashTableX *XmlDoc::getCountTable ( ) {
|
||||
if (!ct->set(8,4,numSlots,NULL,0,false,m_niceness,"xmlct"))
|
||||
return (HashTableX *)NULL;
|
||||
|
||||
//char *ff = getFragVec ( ) ;
|
||||
//if ( ! ff ) return false;
|
||||
|
||||
// . now hash all the phrase ids we have in order to see if the phrase
|
||||
// is unique or not. if phrase is repeated a lot we punish the scores
|
||||
// of the individual words in the phrase and boost the score of the
|
||||
@ -5108,8 +5095,7 @@ HashTableX *XmlDoc::getCountTable ( ) {
|
||||
QUICKPOLL ( m_niceness );
|
||||
// add the word
|
||||
if ( wids[i] == 0LL ) continue;
|
||||
//if ( wids[i] == 708411945052722517LL )
|
||||
// log("hey4 got new pid=%"INT64" i=%"INT32"",pids[i],i);
|
||||
|
||||
// . skip if in repeated fragment
|
||||
// . unfortunately we truncate the frag vec to like
|
||||
// the first 80,000 words for performance reasons
|
||||
@ -5133,7 +5119,7 @@ HashTableX *XmlDoc::getCountTable ( ) {
|
||||
// breathe
|
||||
QUICKPOLL ( m_niceness );
|
||||
// skip if not a meta tag
|
||||
if ( tids[i] != 68 ) continue;
|
||||
if ( tids[i] != TAG_META ) continue;
|
||||
// find the "content=" word
|
||||
char *w = wptrs[i];
|
||||
int32_t wlen = wlens[i];
|
||||
@ -5146,9 +5132,10 @@ HashTableX *XmlDoc::getCountTable ( ) {
|
||||
p += 8;
|
||||
// skip if empty meta content
|
||||
if ( wend - p <= 0 ) continue;
|
||||
|
||||
// our own hash
|
||||
if ( ! hashString_ct ( ct , p , wend - p ) )
|
||||
return (HashTableX *)NULL;
|
||||
return (HashTableX *)NULL;
|
||||
}
|
||||
// add each incoming link text
|
||||
for ( Inlink *k=NULL ; info1 && (k=info1->getNextInlink(k)) ; ) {
|
||||
@ -24294,15 +24281,9 @@ bool getDensityRanks ( int64_t *wids ,
|
||||
// . string is usually the document body or inlink text of an inlinker or
|
||||
// perhaps meta keywords. it could be anything. so we need to create this
|
||||
// vector based on that string, which is represented by words/phrases here.
|
||||
bool getDiversityVec ( Words *words ,
|
||||
Phrases *phrases ,
|
||||
HashTableX *countTable ,
|
||||
SafeBuf *sbWordVec ,
|
||||
//SafeBuf *sbPhraseVec ,
|
||||
int32_t niceness ) {
|
||||
|
||||
bool getDiversityVec( Words *words, Phrases *phrases, HashTableX *countTable, SafeBuf *sbWordVec,
|
||||
int32_t niceness ) {
|
||||
int64_t *wids = words->getWordIds ();
|
||||
//nodeid_t *tids = words->getTagIds ();
|
||||
int32_t nw = words->getNumWords();
|
||||
int64_t *pids = phrases->getPhraseIds2();
|
||||
|
||||
@ -24337,7 +24318,7 @@ bool getDiversityVec ( Words *words ,
|
||||
int64_t pid = pids[i];
|
||||
// get the word and phrase weights for term #i
|
||||
float ww2;
|
||||
//float pw2;
|
||||
|
||||
getWordToPhraseRatioWeights ( lastPid , // pids[i-1],
|
||||
wids[i] ,
|
||||
pid ,
|
||||
@ -24687,7 +24668,6 @@ void getWordToPhraseRatioWeights ( int64_t pid1 , // pre phrase
|
||||
int64_t pid2 ,
|
||||
int64_t wid2 , // post word
|
||||
float *retww ,
|
||||
//float *retpw ,
|
||||
HashTableX *tt1 ,
|
||||
int32_t titleRecVersion ) {
|
||||
|
||||
|
@ -2675,107 +2675,12 @@ bool XmlDoc::hashIsAdult ( HashTableX *tt ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
BR 20160106 removed. We don't want to store this in posdb as we don't use it.
|
||||
|
||||
// hash destination urls for embedded gb search boxes
|
||||
bool XmlDoc::hashSubmitUrls ( HashTableX *tt ) {
|
||||
|
||||
setStatus ( "hashing submit urls" );
|
||||
|
||||
Url *baseUrl = getBaseUrl();
|
||||
if ( ! baseUrl || baseUrl == (Url *)-1) { char*xx=NULL;*xx=0;}
|
||||
|
||||
for ( int32_t i = 0 ; i < m_xml.getNumNodes() ; i++ ) {
|
||||
// Find forms
|
||||
if ( m_xml.getNodeId(i) != TAG_FORM ) continue;
|
||||
if ( m_xml.isBackTag(i) ) continue;
|
||||
int32_t score = *getSiteNumInlinks8() * 256;
|
||||
if ( score <= 0 ) score = 1;
|
||||
int32_t len;
|
||||
char *s = m_xml.getString ( i , "action" , &len );
|
||||
if (!s || len == 0) continue;
|
||||
Url url; url.set(baseUrl, s, len, true);
|
||||
|
||||
char *buf = url.getUrl();
|
||||
int32_t blen = url.getUrlLen();
|
||||
|
||||
// update hash parms
|
||||
HashInfo hi;
|
||||
hi.m_tt = tt;
|
||||
hi.m_hashGroup = HASHGROUP_INTAG;
|
||||
hi.m_prefix = "gbsubmiturl";
|
||||
hi.m_desc = "submit url for form";
|
||||
|
||||
// this returns false on failure
|
||||
if ( ! hashString ( buf,blen,&hi ) ) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
bool XmlDoc::hashSingleTerm ( int64_t termId , HashInfo *hi ) {
|
||||
// combine with a non-NULL prefix
|
||||
if ( hi->m_prefix ) {
|
||||
int64_t prefixHash = hash64b ( hi->m_prefix );
|
||||
// sanity test, make sure it is in supported list
|
||||
if ( getFieldCode3 ( prefixHash ) == FIELD_GENERIC ) {
|
||||
char *xx=NULL;*xx=0; }
|
||||
termId = hash64 ( termId , prefixHash );
|
||||
}
|
||||
|
||||
// save it?
|
||||
if ( m_wts && ! ::storeTerm ( "binary",6,termId,hi,0,0,
|
||||
MAXDENSITYRANK,
|
||||
MAXDIVERSITYRANK,
|
||||
MAXWORDSPAMRANK,
|
||||
hi->m_hashGroup,
|
||||
false,&m_wbuf,m_wts,false) )
|
||||
return false;
|
||||
|
||||
// shortcut
|
||||
HashTableX *dt = hi->m_tt;
|
||||
// sanity check
|
||||
if ( dt->m_ks != sizeof(key_t) ) { char *xx=NULL;*xx=0; }
|
||||
// make the key like we do in hashWords()
|
||||
key96_t k;
|
||||
k.n1 = hi->m_date;
|
||||
k.n0 = termId;
|
||||
// get current score for this wordid
|
||||
int32_t slot = dt->getSlot ( &k );
|
||||
// does this termid/date already exist?
|
||||
if ( slot >= 0 ) {
|
||||
// done
|
||||
return true;
|
||||
}
|
||||
// otherwise, add a new slot
|
||||
char val = 1;
|
||||
if ( ! hi->m_tt->addKey ( (char *)k , &val ) )
|
||||
return false;
|
||||
// return true on success
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
bool XmlDoc::hashSingleTerm ( char *s ,
|
||||
int32_t slen ,
|
||||
HashInfo *hi ) {
|
||||
bool XmlDoc::hashSingleTerm( char *s, int32_t slen, HashInfo *hi ) {
|
||||
// empty?
|
||||
if ( slen <= 0 ) return true;
|
||||
if ( ! m_versionValid ) { char *xx=NULL;*xx=0; }
|
||||
if ( hi->m_useCountTable && ! m_countTableValid){char *xx=NULL;*xx=0; }
|
||||
|
||||
//
|
||||
// POSDB HACK: temporarily turn off posdb until we hit 1B pages!
|
||||
//
|
||||
//if ( ! m_storeTermListInfo )
|
||||
// return true;
|
||||
|
||||
|
||||
// a single blob hash
|
||||
int64_t termId = hash64 ( s , slen );
|
||||
// combine with prefix
|
||||
@ -2841,12 +2746,13 @@ bool XmlDoc::hashSingleTerm ( char *s ,
|
||||
bool XmlDoc::hashString ( char *s, HashInfo *hi ) {
|
||||
return hashString ( s , gbstrlen(s), hi ); }
|
||||
|
||||
bool XmlDoc::hashString ( char *s ,
|
||||
int32_t slen ,
|
||||
HashInfo *hi ) {
|
||||
bool XmlDoc::hashString( char *s, int32_t slen, HashInfo *hi ) {
|
||||
if ( ! m_versionValid ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
if ( hi->m_useCountTable && ! m_countTableValid){char *xx=NULL;*xx=0; }
|
||||
|
||||
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
int32_t *sni = getSiteNumInlinks();
|
||||
return hashString3( s ,
|
||||
slen ,
|
||||
@ -2874,8 +2780,7 @@ bool XmlDoc::hashString3( char *s ,
|
||||
Words words;
|
||||
Bits bits;
|
||||
Phrases phrases;
|
||||
//Weights weights;
|
||||
//Synonyms synonyms;
|
||||
|
||||
if ( ! words.set ( s , slen , true , niceness ) )
|
||||
return false;
|
||||
if ( ! bits.set ( &words , version , niceness ) )
|
||||
@ -3021,8 +2926,9 @@ bool XmlDoc::hashWords3 ( //int32_t wordStart ,
|
||||
// phrase score. thus, a search for 'mexico' should not bring up
|
||||
// the page for university of new mexico!
|
||||
SafeBuf dwbuf;
|
||||
if(!getDiversityVec ( words,phrases,countTable,&dwbuf,niceness))
|
||||
if ( !getDiversityVec( words, phrases, countTable, &dwbuf, niceness ) ) {
|
||||
return false;
|
||||
}
|
||||
char *wdv = dwbuf.getBufStart();
|
||||
|
||||
int32_t nw = words->getNumWords();
|
||||
@ -3154,10 +3060,11 @@ bool XmlDoc::hashWords3 ( //int32_t wordStart ,
|
||||
if ( m_wts && langVec ) langId = langVec[i];
|
||||
|
||||
char wd;
|
||||
if ( hi->m_useCountTable ) wd = wdv[i];
|
||||
else wd = MAXDIVERSITYRANK;
|
||||
|
||||
|
||||
if ( hi->m_useCountTable ) {
|
||||
wd = wdv[i];
|
||||
} else {
|
||||
wd = MAXDIVERSITYRANK;
|
||||
}
|
||||
|
||||
// BR 20160115: Don't hash 'junk' words
|
||||
bool skipword = false;
|
||||
|
Reference in New Issue
Block a user