Remove more commented-out code & unused sections code

Ai Lin Chia
2016-02-24 11:56:54 +01:00
parent df2d952e1f
commit 32147c302c
10 changed files with 30 additions and 1237 deletions

@ -1068,7 +1068,7 @@ void Msg39::estimateHitsAndSendReply ( ) {
mr.size_pairScoreBuf = 0;
mr.size_singleScoreBuf = 0;
}
//mr.m_sectionStats = pt->m_sectionStats;
// reserve space for these guys, we fill them in below
mr.ptr_docIds = NULL;
mr.ptr_scores = NULL;

@ -131,13 +131,6 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
log(LOG_LOGIC,"net: bad collection. msg3a. %"INT32"",
(int32_t)m_r->m_collnum);
//m_indexdbSplit = g_hostdb.m_indexSplits;
// certain query term, like, gbdom:xyz.com, are NOT split
// at all in order to keep performance high because such
// terms are looked up by the spider. if a query contains
// multiple "no split" terms, then it becomes split unfortunately...
//if ( ! m_q->isSplit() ) m_indexdbSplit = 1;
// for a sanity check in Msg39.cpp
r->m_nqt = m_q->getNumTerms();
@ -154,10 +147,7 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
// . return now if query empty, no docids, or none wanted...
// . if query terms = 0, might have been "x AND NOT x"
if ( m_q->getNumTerms() <= 0 ) return true;
// sometimes we want to get section stats from the hacked
// sectionhash: posdb termlists
//if ( m_docsToGet <= 0 && ! m_r->m_getSectionStats )
// return true;
// . set g_errno if not found and return true
// . coll is null terminated
CollectionRec *cr = g_collectiondb.getRec(r->m_collnum);
@ -234,24 +224,17 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
// update our read info
for ( int32_t j = 0; j < n ; j++ ) {
// the read size for THIS query term
int32_t rs = 300000000; // toRead; 300MB i guess...
// limit to 50MB man! this was 30MB but the
// 'time enough for love' query was hitting 30MB termlists.
//rs = 50000000;
rs = DEFAULT_POSDB_READSIZE;//90000000; // 90MB!
// it is better to go oom then leave users scratching their
// heads as to why some results are not being returned.
// no, because we are going out of mem for queries like
// 'www.disney.nl' etc.
//rs = -1;
// if section stats, limit to 1MB
//if ( m_r->m_getSectionStats ) rs = 1000000;
int32_t rs = DEFAULT_POSDB_READSIZE;//90000000; // 90MB!
// get the jth query term
QueryTerm *qt = &m_q->m_qterms[j];
// if query term is ignored, skip it
if ( qt->m_ignored ) rs = 0;
// set it
readSizes[j] = rs;
// serialize these too
tfw[j] = qt->m_termFreqWeight;
}
@ -265,8 +248,7 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
// Query::expandQuery() above
m_r->ptr_query = m_q->m_orig;
m_r->size_query = m_q->m_origLen+1;
// the white list now too...
//m_r->ptr_whiteList = si->m_whiteListBuf.getBufStart();
// free us?
if ( m_rbufPtr && m_rbufPtr != m_rbuf ) {
mfree ( m_rbufPtr , m_rbufSize, "Msg3a" );
@ -774,64 +756,6 @@ bool Msg3a::mergeLists ( ) {
//m_totalDocCount = 0; // int32_t docCount = 0;
m_moreDocIdsAvail = true;
/*
this version is too simple. now each query term can be a
gbfacet:price or gbfacet:type term and each has a
list in the Msg39Reply::ptr_facetHashList for its termid
//
// compile facet stats
//
for ( int32_t j = 0; j < m_numHosts ; j++ ) {
Msg39Reply *mr =m_reply[j];
// one table for each query term
char *p = mr->ptr_facetHashList;
// loop over all query terms
int32_t n = m_q->getNumTerms();
// use this
HashTableX tmp;
// do the loop
for ( int32_t i = 0 ; i < n ; i++ ) {
// size of it
int32_t psize = *(int32_t *)p;
p += 4;
tmp.deserialize ( p , psize );
p += psize;
// now compile the stats into a master table
for ( int32_t k = 0 ; k < tmp.m_numSlots ; k++ ) {
if ( ! tmp.m_flags[k] ) continue;
// get the vlaue
int32_t v32 = *(int32_t *)tmp.getKeyFromSlot(k);
// and how many of them there where
int32_t count = *(int32_t *)tmp.getValueFromSlot(k);
// add to master
master.addScore32 ( v32 , count );
}
}
}
////////
//
// now set m_facetStats
//
////////
// add up all counts
int64_t count = 0LL;
for ( int32_t i = 0 ; i < master.getNumSlots() ; i++ ) {
if ( ! master.m_flags[i] ) continue;
int64_t slotCount = *(int32_t *)master.getValueFromSlot(i);
int32_t h32 = *(int32_t *)master.getKeyFromSlot(i);
if ( h32 == m_r->m_myFacetVal32 )
m_facetStats.m_myValCount = slotCount;
count += slotCount;
}
m_facetStats.m_totalUniqueValues = master.getNumUsedSlots();
m_facetStats.m_totalValues = count;
*/
// shortcut
//int32_t numSplits = m_numHosts;//indexdbSplit;
// . point to the various docids, etc. in each shard reply
// . tcPtr = term count. how many required query terms does the doc
@ -920,11 +844,6 @@ bool Msg3a::mergeLists ( ) {
for ( int32_t j = 0; j < m_numQueriedHosts ; j++ ) {
Msg39Reply *mr =m_reply[j];
if ( ! mr ) continue;
//SectionStats *src = &mr->m_sectionStats;
//dst->m_onSiteDocIds += src->m_onSiteDocIds;
//dst->m_offSiteDocIds += src->m_offSiteDocIds;
//dst->m_totalMatches += src->m_totalMatches;
//dst->m_totalEntries += src->m_totalEntries;
// now the list should be the unique site hashes that
// had the section hash. we need to uniquify them again
// here.
@ -1036,7 +955,6 @@ bool Msg3a::mergeLists ( ) {
if ( ! sortFacetEntries() )
return true;
//if ( m_r->m_getSectionStats ) return true;
//
// HACK: END section stats merge
//

Msg3a.h (11 lines changed)

@ -145,17 +145,6 @@ public:
// when merging this list of docids into a final list keep
// track of the cursor into m_docIds[]
int32_t m_cursor;
// what collection # are these docids from if m_collnums[] is NULL
//collnum_t m_collnum;
// we don't have FacetStats because we have the actual
// Msg39Reply::ptr_facetHashList from each shard which contains
// all the facet hash lists for each gbfacet: query term we had
// and the query "Msg3a::m_q.m_qterms[].m_dt" is the hash table
// where each key is a facethash for that gbfacet:xxxx term and
// the value if the # of occurences.
//SectionStats m_sectionStats;
};
#endif
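The removed comment above describes what replaced SectionStats: each shard sends back a (facet hash -> occurrence count) list in Msg39Reply::ptr_facetHashList, and the aggregator folds them into the per-term table QueryTerm::m_dt. A minimal stand-alone sketch of that fold, using std::unordered_map as a stand-in for the real HashTableX (all names here are illustrative, not the Gigablast API):

#include <cstdint>
#include <unordered_map>
#include <vector>

// Illustrative only: merge each shard's (32-bit facet value hash -> count)
// histogram into one master histogram, the role QueryTerm::m_dt plays for a
// single gbfacet: query term.
typedef std::unordered_map<uint32_t, int64_t> FacetHistogram;

static FacetHistogram mergeFacetHistograms(const std::vector<FacetHistogram> &perShard) {
    FacetHistogram master;
    for (const FacetHistogram &shard : perShard)
        for (const auto &kv : shard)
            master[kv.first] += kv.second; // key = facet hash, value = # of occurrences
    return master;
}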

@ -3839,95 +3839,6 @@ void PosdbTable::intersectLists10_r ( ) {
if( g_conf.m_logTracePosdb ) log(LOG_TRACE,"%s:%s:%d: seoHack: %s, numTerms: %"INT32"", __FILE__,__func__, __LINE__, seoHack?"true":"false", m_q->m_numTerms);
// if we are just a sitehash:xxxxx list and m_getSectionStats is
// true then assume the list is one of hacked posdb keys where
// the wordposition bits and others are really a 32-bit site hash
// and we have to see how many different docids and sites have
// this term. and we compare to our site hash,
// m_r->m_sectionSiteHash32 to determine if the posdb key is
// onsite or offsite. then XmlDoc::printRainbowSections()
// can print out how many page/sites duplicate your section's content.
// MDW: TODO: for the facet terms just compile the stats and do not
// send to intersecting. they are ignored for those purposes. send
// the hashtable back so msg3a can integrate the stats. keep in mind
// we have multiple docid ranges sometimes for one query!!!!
/*
MDW: take this out. now treat as a normal termlist but
do not use for scoring. so it is kinda like gbmin: gbmax:
query operators but it will just add the facet values to
QueryTerm::m_facetHashList for transmission back to the aggregator
node. however, it is only for docids in the final result set!
if ( m_r->m_getFacetStats ) {
// reset
m_facetStats.m_totalMatches = 0;
m_facetStats.m_totalEntries = 0;
m_dt.clear();
// scan the posdb keys
//for ( int32_t i = 0 ; i < m_msg2->getNumListsInGroup(0); i++) {
// get the sublist
RdbList *list = m_msg2->getList(0);//Group(0)[i];
char *p = list->getList ();
char *pend = p + list->getListSize();
// test
//int64_t final = 5663137686803656554LL;
//final &= TERMID_MASK;
//if ( p<pend && g_posdb.getTermId(p) == final )
// log("boo");
// scan it
for ( ; p < pend ; ) {
// . first key is the full size
// . uses the w,G,s,v and F bits to hold this
// . this is no longer necessarily sitehash, but
// can be any val, like now FacetStats is using
// it for the innerHtml sentence content hash32
int32_t sh32 = g_posdb.getFacetVal32 ( p );
//int64_t d = g_posdb.getDocId(p);
//int32_t rs = list->getRecSize(p);
// this will not update listptrlo, watch out!
p += list->getRecSize ( p );
// does this xpath from another docid have the
// same inner html as us?
if ( sh32 == m_r->m_myFacetVal32 ) // m_siteHash32 )
m_facetStats.m_totalMatches++;
// always this
m_facetStats.m_totalEntries++;
// unique site count
if ( m_dt.isInTable ( &sh32 ) ) continue;
// count it
m_facetStats.m_numUniqueVals++;
// only once
m_dt.addKey ( &sh32 );
// log it
//log("usite: %08"XINT32" %"INT64" rs=%"INT32"",sh32,d,rs);
// stop if too much so we do not try to
// re-alloc in a thread!
if ( m_dt.m_numSlotsUsed >= 1000000 ) break;
}
// and return the list of merging
int32_t *s = (int32_t *)m_facetHashList.getBufStart();
int32_t *send = (int32_t *)m_facetHashList.getBufEnd();
//if ( m_facetStats.m_numUniqueSites == 17 ) {
// log("q=%s",m_r->ptr_query);
// log("hey");
// //char *xx = NULL;*xx=0;
//}
//if(!strcmp(m_r->ptr_query,"gbsectionhash:3335323672699668766"
// log("boo");
int32_t *orig = s;
for ( int32_t i = 0 ; i < m_dt.m_numSlots ; i++ ) {
if ( ! m_dt.m_flags[i] ) continue;
*s++ = *(int32_t *)m_dt.getKeyFromSlot(i);
if ( s >= send ) break;
}
m_facetHashList.setLength((char *)s-(char *)orig);
return;
}
*/
//
// hash the docids in the whitelist termlists into a hashtable.
// every docid in the search results must be in there. the

Posdb.h (106 lines changed)

@ -440,80 +440,8 @@ public:
int32_t m_quotedStartId;
};
/*
#include "RdbList.h"
class PosdbList : public RdbList {
public:
// why do i have to repeat this for LinkInfo::set() calling our set()??
void set ( char *list , int32_t listSize , bool ownData ) {
RdbList::set ( list ,
listSize ,
list , // alloc
listSize , // alloc size
0 , // fixed data size
ownData ,
true , // use half keys?
sizeof(key_t));// 12 bytes per key
};
// clear the low bits on the keys so terms are DELETED
void clearDelBits ( );
void print();
// . these are made for special IndexLists, too
// . getTermId() assumes as 12 byte key
int64_t getCurrentTermId12 ( ) {
return getTermId12 ( m_listPtr ); };
int64_t getTermId12 ( char *rec ) {
return (*(uint64_t *)(&rec[4])) >> 16 ;
};
int64_t getTermId16 ( char *rec ) {
return (*(uint64_t *)(&rec[8])) >> 16 ;
};
// these 2 assume 12 and 6 byte keys respectively
int64_t getCurrentDocId () {
if ( isHalfBitOn ( m_listPtr ) ) return getDocId6 (m_listPtr);
else return getDocId12(m_listPtr);
};
int64_t getDocId ( char *rec ) {
if ( isHalfBitOn ( rec ) ) return getDocId6 (rec);
else return getDocId12(rec);
};
int64_t getCurrentDocId12 ( ) {
return getDocId12 ( m_listPtr ); };
int64_t getDocId12 ( char *rec ) {
return ((*(uint64_t *)(rec)) >> 2) & DOCID_MASK; };
int64_t getDocId6 ( char *rec ) {
int64_t docid;
*(int32_t *)(&docid) = *(int32_t *)rec;
((char *)&docid)[4] = rec[4];
docid >>= 2;
return docid & DOCID_MASK;
};
// this works with either 12 or 6 byte keys
unsigned char getCurrentScore ( ) {
return getScore(m_listPtr); };
unsigned char getScore ( char *rec ) { return ~rec[5]; };
// uncomplemented...
void setScore ( char *rec , char score ) { rec[5] = score; };
// for date lists only...
int32_t getCurrentDate ( ) { return ~*(int32_t *)(m_listPtr+6); };
};
*/
#include "Query.h" // MAX_QUERY_TERMS, qvec_t
// max # search results that can be viewed without using TopTree
//#define MAX_RESULTS 1000
class PosdbTable {
public:
@ -525,10 +453,7 @@ class PosdbTable {
char debug ,
void *logstate ,
class TopTree *topTree ,
//char *coll ,
collnum_t collnum ,
//IndexList *lists ,
//int32_t numLists ,
class Msg2 *msg2,
class Msg39Request *r );
@ -538,12 +463,6 @@ class PosdbTable {
// pre-allocate memory since intersection runs in a thread
bool allocTopTree ( );
// . returns false on error and sets errno
// . we assume there are "m_numTerms" lists passed in (see set() above)
//void intersectLists_r ( );
//void intersectLists9_r ( );
void getTermPairScoreForNonBody ( int32_t i, int32_t j,
char *wpi, char *wpj,
char *endi, char *endj,
@ -580,7 +499,9 @@ class PosdbTable {
void freeMem ( ) ;
// has init already been called?
bool isInitialized ( ) { return m_initialized; };
bool isInitialized() {
return m_initialized;
}
uint64_t m_docId;
@ -609,56 +530,37 @@ class PosdbTable {
int32_t m_maxScores;
//char *m_coll;
collnum_t m_collnum;
int32_t *m_qpos;
int32_t *m_wikiPhraseIds;
int32_t *m_quotedStartIds;
//class DocIdScore *m_ds;
int32_t m_qdist;
float *m_freqWeights;
//int64_t *m_freqs;
char *m_bflags;
int32_t *m_qtermNums;
float m_bestWindowScore;
//char **m_finalWinners1;
//char **m_finalWinners2;
//float *m_finalScores;
char **m_windowTermPtrs;
// how many docs in the collection?
int64_t m_docsInColl;
//SectionStats m_sectionStats;
//SafeBuf m_facetHashList;
//HashTableX m_dt;
class Msg2 *m_msg2;
// if getting more than MAX_RESULTS results, use this top tree to hold
// them rather than the m_top*[] arrays above
class TopTree *m_topTree;
//HashTableX m_docIdTable;
SafeBuf m_scoreInfoBuf;
SafeBuf m_pairScoreBuf;
SafeBuf m_singleScoreBuf;
SafeBuf m_stackBuf;
//SafeBuf m_mergeBuf;
// a reference to the query
Query *m_q;
int32_t m_nqt;
// these are NOT in imap space, but in query term space, 1-1 with
// Query::m_qterms[]
//IndexList *m_lists;
//int32_t m_numLists;
// has init() been called?
bool m_initialized;
@ -668,8 +570,6 @@ class PosdbTable {
// for debug msgs
void *m_logstate;
//int64_t m_numDocsInColl;
class Msg39Request *m_r;
// for gbsortby:item.price ...

@ -3798,26 +3798,6 @@ bool Sections::printSectionDiv ( Section *sk , char format ) {
,mod);
}
SectionStats *ss = &sk->m_stats;
// also the value of the inner html hashed
if ( sk->m_flags & SEC_HASHXPATH ) {
uint32_t val ;
val = (uint32_t) sk->m_indirectSentHash64 ;
m_sbuf->safePrintf("xpathsitehashval=%"UINT32" ", val );
}
// some voting stats
if ( sk->m_flags & SEC_HASHXPATH ) {
m_sbuf->safePrintf("_s=M%"INT32"D%"INT32"n%"INT32"u%"INT32"h%"UINT32" "
,(int32_t)ss->m_totalMatches
,(int32_t)ss->m_totalDocIds
,(int32_t)ss->m_totalEntries
,(int32_t)ss->m_numUniqueVals
,(uint32_t)mod
);
}
printFlags ( m_sbuf , sk );
if ( isHardSection(sk) )

@ -167,10 +167,6 @@ public:
// are a sentence section then this points to itself.
class Section *m_sentenceSection;
// . set in XmlDoc::getSectionsWithDupStats()
// . voting info for this section over all indexed pages from this site
SectionStats m_stats;
// position of the first and last alnum word contained directly OR
// indirectly in this section. use -1 if no text contained...
int32_t m_firstWordPos;
@ -324,9 +320,6 @@ class Sections {
bool printSectionDiv ( class Section *sk , char format = FMT_HTML );
class SafeBuf *m_sbuf;
char *getSectionsReply ( int32_t *size );
char *getSectionsVotes ( int32_t *size );
bool isHardSection ( class Section *sn );
bool setMenus ( );

@ -89,8 +89,6 @@ XmlDoc::XmlDoc() {
m_msg22Request.m_inUse = 0;
m_msg4Waiting = false;
m_msg4Launched = false;
//m_sectiondbData = NULL;
//m_placedbData = NULL;
m_dupTrPtr = NULL;
m_oldTitleRec = NULL;
m_filteredContent = NULL;
@ -98,40 +96,27 @@ XmlDoc::XmlDoc() {
m_metaList = NULL;
m_metaListSize = 0;
m_metaListAllocSize = 0;
//m_titleRec = NULL;
//m_freeTitleRec = true;
m_rootTitleRec = NULL;
m_isIndexed = false;
m_isInIndex = false;
m_wasInIndex = false;
m_outlinkHopCountVector = NULL;
//m_gsbuf = NULL;
m_extraDoc = NULL;
m_wikiqbuf = NULL;
//m_cr = NULL;
//m_notifyBlocked = 0;
//m_mcasts = NULL;
//for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ )
// m_currentBinPtrs[i] = NULL;
reset();
};
}
XmlDoc::~XmlDoc() {
setStatus("freeing this xmldoc");
reset();
m_freed = true;
};
}
static int64_t s_lastTimeStart = 0LL;
void XmlDoc::reset ( ) {
m_savedChar = '\0';
m_redirUrl.reset();
m_updatedMetaData = false;
@ -148,8 +133,6 @@ void XmlDoc::reset ( ) {
m_bodyStartPos = 0;
m_mcastArray = NULL;
m_skipIframeExpansion = false;
m_indexedTime = 0;
@ -187,19 +170,9 @@ void XmlDoc::reset ( ) {
m_allHashed = false;
// reset this crap
m_beginTimeAllMatch = 0LL;
m_beginTimeMatchUrl = 0LL;
m_beginTimeFullQueries = 0LL;
m_beginTimeLinks = 0LL;
//m_beginMsg98s = 0LL;
m_beginRelatedQueries = 0LL;
m_doledbKey.n0 = 0LL;
m_doledbKey.n1 = 0;
m_sortedPosdbListBuf.purge();
m_termListBuf.purge();
@ -219,9 +192,6 @@ void XmlDoc::reset ( ) {
m_domDedupTablePtr = NULL;
m_storeTermListInfo = false;
m_gotDupStats = false;
//m_nextSection = (Section *)-1;
m_si = (Section *)-1;
// for limiting # of iframe tag expansions
m_numExpansions = 0;
@ -273,16 +243,6 @@ void XmlDoc::reset ( ) {
// if this is true, then only index if new
m_newOnly = 0;
//if ( m_sectiondbData ) {
// mfree ( m_sectiondbData , m_sectiondbDataSize ,"sdbdata" );
// m_sectiondbData = NULL;
//}
//if ( m_placedbData ) {
// mfree ( m_placedbData , m_placedbDataSize ,"pdbdata" );
// m_placedbData = NULL;
//}
if ( m_httpReplyValid && m_httpReply ) {
mfree(m_httpReply,m_httpReplyAllocSize,"httprep");
m_httpReply = NULL;
@ -295,10 +255,6 @@ void XmlDoc::reset ( ) {
m_filteredContentAllocSize = 0;
}
//if ( m_utf8ContentValid && ! m_setFromTitleRec && ptr_utf8Content )
// mfree ( ptr_utf8Content , m_utf8ContentAllocSize,"Xml3");
if ( m_metaList ) { // m_metaListValid && m_metaList ) {
mfree ( m_metaList , m_metaListAllocSize , "metalist");
m_metaList = NULL;
@ -307,18 +263,10 @@ void XmlDoc::reset ( ) {
}
if ( m_ubuf ) {
// log("xmldoc: delete m_ubuf=%"PTRFMT" this=%"PTRFMT
// , (PTRTYPE) m_ubuf
// , (PTRTYPE) this
// );
mfree ( m_ubuf , m_ubufAlloc , "ubuf");
m_ubuf = NULL;
}
//if ( m_freeTitleRec && m_titleRec ) { // && m_titleRecValid ) {
// mfree ( m_titleRec , m_titleRecAllocSize , "trec" );
//}
//m_titleRec = NULL;
m_titleRecBuf.purge();
if ( m_dupTrPtr ) {
@ -345,10 +293,6 @@ void XmlDoc::reset ( ) {
}
m_outlinkHopCountVector = NULL;
//if ( m_gsbufValid && m_gsbuf ) {
// mfree ( m_gsbuf , m_gsbufAllocSize , "gsbuf" );
//}
//m_gsbuf = NULL;
m_gsbuf.reset();
@ -359,7 +303,6 @@ void XmlDoc::reset ( ) {
m_hashedMetas = false;
m_mcastBuf.purge();
m_serpBuf.purge();
// Doc.cpp:
@ -376,11 +319,6 @@ void XmlDoc::reset ( ) {
m_bits2.reset();
m_pos.reset();
m_synBuf.reset();
m_turkVotingTable.reset();
m_turkBitsTable.reset();
m_vtr.reset();
m_vctab.reset();
m_vcduptab.reset();
m_images.reset();
m_countTable.reset();
m_mime.reset();
@ -466,10 +404,6 @@ void XmlDoc::reset ( ) {
m_launchedSpecialMsg8a = false;
m_launchedMsg8a2 = false;
m_numSectiondbReads = 0;
m_numSectiondbNeeds = 0;
m_sectiondbRecall = 0;
m_setTr = false;
m_triedTagRec = false;
m_didGatewayPage = false;
@ -4843,606 +4777,6 @@ Sections *XmlDoc::getSections ( ) {
return &m_sections;
}
// . scan every section and look up its tag and content hashes in
// sectiondb to find out how many pages and sites have the same hash
// . use the secondary sectiondb key, key2
// . then store the stats in the Sections::m_stats class
Sections *XmlDoc::getSectionsWithDupStats ( ) {
Sections *ss = getSections();
if ( !ss ||ss==(Sections *)-1) return(Sections *)ss;
if ( m_gotDupStats ) return ss;
int32_t *sh32 = getSiteHash32();
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (Sections *)sh32;
uint32_t siteHash32 = (uint32_t)*sh32;
// if this is -1, we are called for the first time
if ( m_si == (void *)-1 ) {
m_si = ss->m_rootSection;
m_mcastRequestsIn = 0;
m_mcastRequestsOut = 0;
m_secStatsErrno = 0;
}
for ( ; m_si ; m_si = m_si->m_next ) {
// breathe
QUICKPOLL(m_niceness);
// don't bother with the section if it doesn't have this set
// because this eliminates parent dupage to reduce amount
// of gbxpathsitehash123456 terms we index.
if ( ! ( m_si->m_flags & SEC_HASHXPATH ) )
continue;
// get hash of sentences this tag contains indirectly
uint32_t val32 = (uint32_t)m_si->m_indirectSentHash64;
if ( ! val32 )
continue;
// get section xpath hash combined with sitehash
uint32_t secHash32 = m_si->m_turkTagHash32 ^ siteHash32;
// convert this to 32 bits
uint32_t innerHash32 ;
//sentHash32 = (uint32_t)m_si->m_sentenceContentHash64;
innerHash32 = (uint32_t)m_si->m_indirectSentHash64;
// save in case we need to read more than 5MB
//m_lastSection = si;
// . does a gbfacets:gbxpathsitehashxxxxxx query on secHash32
// . we hack the "sentContentHash32" into each posdb key
// as the "value" so we can do a facet-like histogram
// over all the possible values this xpath has for this site
SectionStats *stats = getSectionStats ( secHash32,
innerHash32,
false ); // cache only?
// it returns -1 if would block
if ( stats == (void *)-1 ) {
// count it as outstanding
//m_mcastRequestsOut++;
// launch more if we have room
// UdpServer.cpp has a limit of 10 on 0x39 requests
if ( m_mcastRequestsOut - m_mcastRequestsIn < 10)
continue;
// advance m_si so we do not repeat
m_si = m_si->m_next;
// otherwise, return -1 to indicate blocked
return (Sections *)-1;
}
// NULL means g_errno
if ( ! stats ) {
// ensure g_errno is set
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// save it
m_secStatsErrno = g_errno;
// clear it
g_errno = 0;
// if still waiting though return -1
if ( m_mcastRequestsOut > m_mcastRequestsIn )
return (Sections *)-1;
// otherwise, all done i guess
return NULL;
}
// if already in the table, skip it!
}
// waiting for more replies to come back?
if ( m_mcastRequestsOut > m_mcastRequestsIn )
return (Sections *) -1;
// now scan the sections and copy the stats from the table
// into Section::m_stats of each sentence section.
// use the key hash as the the hash of the tag/xpath and the innerhtml
// and the val instead of being site hash will be hash of the
// content. then we can get the histogram of our content hash
// for this xpath on our site.
Section *si = ss->m_rootSection;
for ( ; si ; si = si->m_next ) {
// breathe
QUICKPOLL(m_niceness);
// skip if no content to hash
//if ( ! si->m_sentenceContentHash64 ) continue;
// don't bother with the section if it doesn't have this set
// because this eliminates parent dupage to reduce amount
// of gbxpathsitehash123456 terms we index
if ( ! ( si->m_flags & SEC_HASHXPATH ) )
continue;
// skip if sentence, only hash tags now i guess for diffbot
//if ( si->m_sentenceContentHash64 )
// continue;
// get hash of sentences this tag contains indirectly
uint32_t val32 = (uint32_t)si->m_indirectSentHash64;
if ( ! val32 )
continue;
// skip if menu!
//if ( si->m_flags & menuFlags ) continue;
// get section xpath hash combined with sitehash
uint32_t secHash32 = si->m_turkTagHash32 ^ siteHash32;
// convert this to 32 bits
uint32_t innerHash32 ;
innerHash32 = (uint32_t)si->m_indirectSentHash64;
// the "stats" class should be in the table from
// the lookups above!!
SectionStats *stats = getSectionStats ( secHash32,
innerHash32,
true ); // cache only?
// sanity
//if ( ! stats || stats == (void *)-1 ) { char *xx=NULL;*xx=0;}
// must have had a network error or something
if ( ! stats ) continue;
// copy
gbmemcpy ( &si->m_stats , stats, sizeof(SectionStats) );
}
//
// now if a section has no stats but has the same
// m_indirectSentHash64 as a kid, take his stats
//
Section *sx = ss->m_rootSection;
for ( ; sx ; sx = sx->m_next ) {
// breathe
QUICKPOLL(m_niceness);
// don't bother with the section if it doesn't have this set
// because this eliminates parent dupage to reduce amount
// of gbxpathsitehash123456 terms we index
if ( ! ( sx->m_flags & SEC_HASHXPATH ) )
continue;
// scan up parents and set their stats to ours as int32_t as
// they have the same indirect sent hash64
Section *p = sx->m_parent;
for ( ; p ; p = p->m_parent ) {
// if parent is like an img tag, skip it
if ( p->m_tagId == TAG_IMG )
continue;
if ( p ->m_indirectSentHash64 !=
sx->m_indirectSentHash64 )
break;
// copy it to parent with the same inner html hash
gbmemcpy (&p->m_stats,&sx->m_stats,sizeof(SectionStats));
}
}
// now free the table's mem
m_sectionStatsTable.reset();
m_gotDupStats = true;
return ss;
}
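getSectionsWithDupStats() above is re-entered by m_masterLoop until it finishes: m_si is the cursor it resumes from, m_mcastRequestsOut/In count requests in flight, and it returns (Sections *)-1 while up to 10 multicasts are outstanding. A generic sketch of that bounded fan-out / resume pattern, with purely hypothetical names (this is not the XmlDoc API):

#include <cstddef>

// Illustrative only: launch at most maxOut async requests at a time and
// report "blocked" until every reply has drained, mirroring the flow above.
// st.in is expected to be incremented by the reply handler, the way
// gotSectionFacets() increments m_mcastRequestsIn.
struct FanOutState {
    size_t cursor;   // like m_si: where to resume on the next call
    int    out;      // like m_mcastRequestsOut
    int    in;       // like m_mcastRequestsIn
    FanOutState() : cursor(0), out(0), in(0) {}
};

enum PumpResult { PUMP_BLOCKED, PUMP_DONE };

// launch() returns true only if it actually started an async request for item i.
PumpResult pump(FanOutState &st, size_t numItems, bool (*launch)(size_t i), int maxOut = 10) {
    for (; st.cursor < numItems; st.cursor++) {
        if (!launch(st.cursor)) continue;   // answered from cache, nothing in flight
        st.out++;
        if (st.out - st.in >= maxOut) {     // hit the in-flight limit:
            st.cursor++;                    // advance so we do not repeat this item,
            return PUMP_BLOCKED;            // then wait for replies to drain
        }
    }
    // everything launched; still blocked until all replies are back
    return (st.out > st.in) ? PUMP_BLOCKED : PUMP_DONE;
}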
static void gotReplyWrapper39 ( void *state1 , void *state2 ) {
//XmlDoc *THIS = (XmlDoc *)state;
XmlDoc *THIS = (XmlDoc *)state1;
Multicast *mcast = (Multicast *)state2;
THIS->gotSectionFacets ( mcast );
// this will end up calling getSectionsWithDupStats() again
// which will call getSectionStats() some more on new sections
// until m_gotDupStats is set to true.
THIS->m_masterLoop ( THIS->m_masterState );
}
// . launch a single msg3a::getDocIds() for a section hash, secHash32
SectionStats *XmlDoc::getSectionStats ( uint32_t secHash32 ,
uint32_t innerHash32 ,
bool cacheOnly ) {
// init cache?
if ( m_sectionStatsTable.m_numSlots == 0 &&
! m_sectionStatsTable.set(4,
sizeof(SectionStats),
32,
NULL,
0,
false,
m_niceness,
"secstatsch"))
return NULL;
// check in cache...
SectionStats *stats ;
stats = (SectionStats *)m_sectionStatsTable.getValue ( &secHash32 );
// if there, return it
if ( stats ) return stats;
// if cache only do not launch
if ( cacheOnly ) return NULL;
//
// TODO: shard gbxpathsitehashxxxxx by termid
// and make sure msg3a only sends to that single shard and sends
// the stats back. should make us much faster to sectionize
// a web page. but for now try without it...
//
//int32_t *sh32 = getSiteHash32();
//if ( ! sh32 || sh32 == (int32_t *)-1 ) return (SectionStats *)sh32;
int32_t maxOut = 32;
// . need to make new msg39Request and a new Multicast arrays
// . only need multicast since these gbfacetstr:gbxpathsitehash123456
// terms are sharded by termid, otherwise we'd have to use msg3a
if ( ! m_mcastArray ) {
// how much mem to alloc?
int32_t need = 0;
need += sizeof(Multicast);
need += sizeof(Msg39Request);
// query buf str
need += 100;
need *= maxOut;
// a single query now to be shared
//need += sizeof(Query);
// just in case we are being re-used
m_mcastBuf.reset();
// alloc space
if ( ! m_mcastBuf.reserve(need) ) return NULL;
// point to buf
char *p = m_mcastBuf.getBufStart();
// set them up
m_mcastArray = (Multicast *)p;
p += sizeof(Multicast) * maxOut;
m_msg39RequestArray = (Msg39Request *)p;
p += sizeof(Msg39Request) * maxOut;
//m_queryArray = (Query *)p;
//p += sizeof(Query) * maxOut;
//m_sharedQuery = (Query *)p;
//p += sizeof(Query);
// for holding the query string
// assume query will not exceed 100 bytes incuding \0
m_queryBuf = p;
p += 100 * maxOut;
// initialize all!
for ( int32_t i = 0 ; i < maxOut ; i++ ) {
m_mcastArray [i].constructor();
m_msg39RequestArray[i].reset();//constructor();
//m_queryArray [i].constructor();
m_queryBuf[100*i] = '\0';
//m_inUse[i] = 0;
}
}
// get first available
int32_t i;
for ( i = 0 ; i < maxOut ; i++ )
if ( ! m_mcastArray[i].m_inUse ) break;
// wtf?
if ( i >= maxOut ) { char *xx=NULL;*xx=0; }
// and our vehicle
Multicast *mcast = &m_mcastArray[i];
// mark as in use up here in case we quickpoll into this same code?!
// yeah, i guess set2() calls quickpoll?
//mcast->m_inUse = 1;
// save this for reply
//mcast->m_hack = this;
char *qbuf = m_queryBuf + 100 * i;
// . hash this special term (was gbsectionhash)
// . the wordbits etc will be a number though, the hash of the content
// of the xpath, the inner html hash
// . preceeding this term with gbfacet: will make gigablast return
// the statistics for all the values in the posdb keys of this
// termlist, which happen to be innerHTML hashes for all pages
// with this same xpath and on this same site.
sprintf(qbuf,"gbfacetstr:gbxpathsitehash%"UINT32"",
(uint32_t)secHash32);
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// set the msg39 request
Msg39Request *r = &m_msg39RequestArray[i];
// reset all to defaults
r->reset();
//r-> ptr_coll = cr->m_coll;
//r->size_coll = gbstrlen(cr->m_coll)+1;
r->m_collnum = cr->m_collnum;
r->m_maxAge = 60; // cache timeout?
r->m_addToCache = true;
r->m_docsToGet = 0; // just calc stats
r->m_niceness = m_niceness;
r->m_debug = 0;
r->m_doSiteClustering = false;
//r->m_doIpClustering = false;
r->m_doDupContentRemoval = false;
r->m_boolFlag = 2;
r->m_familyFilter = 0;
r->m_language = 0;
r->ptr_query = qbuf;//m_sectionHashQueryBuf;
r->size_query = gbstrlen(r->ptr_query)+1;
r->m_timeout = 3600*1000; //todo: do we really want to wait an hour for this?
r->m_maxQueryTerms = 10;
// how much of each termlist to read in bytes
int32_t readList = 10000;
r-> ptr_readSizes = (char *)&readList;
r->size_readSizes = 4;
// term freqs
float tfw = 1.0;
r-> ptr_termFreqWeights = (char *)&tfw;
r->size_termFreqWeights = 4;
// speed it up some with this flag
r->m_forSectionStats = true;
// only do a single read of docids... do not split up
r->m_numDocIdSplits = 1;
// 1 query term
r->m_nqt = 1;
///////////////////////
//
// this tells msg3a/msg39/posdbtable its a hack! no need to do this
// because it's implied by the query.
// BUT REALLY let's eliminate this and just make our queries like
// gbfacet:gbxpathsitehash1234567 where 1234567 is the hash of
// the section's xpath with the site. the values of that term in
// the posdb key will be 32-bit hashes of the innerHtml for such
// sections from all pages with the same xpath on the same site.
// so no need for this now, comment out.
//
//r->m_getFacetStats = true;
//
/////////////////////////
// we need to know what site is the base site so the section stats
// can set m_onSiteDocIds and m_offSiteDocIds correctly
//r->m_siteHash32 = *sh32;
// . now we use the hash of the innerHtml of the xpath
// . this is our value for the facet field of gbxpathsitehash12345678
// which is the hash of the innerHTML for that xpath on this site.
// 12345678 is the hash of the xpath and the site.
//r->m_myFacetVal32 = sentHash32;
//Query *qq = &m_queryArray[i];
// set query for msg3a. queryExpansion=false
//qq->set2 ( r->ptr_query , langUnknown , false );
Query qq;
qq.set2 ( r->ptr_query , langUnknown , false );
// TODO: ensure this just hits the one host since it is sharded
// by termid...
// what shard owns this termlist. we shard these
// gbfacetstr:gbxpathsitehash123456 terms by termid.
int64_t termId = qq.getTermId(0);
int32_t shardNum = getShardNumFromTermId ( termId );
// hack in our inner html content hash for this xpath
mcast->m_hack32 = innerHash32;
mcast->m_hack64 = secHash32;
// malloc and store the request. mcast will free it when done.
int32_t reqSize;
char *req = serializeMsg ( sizeof(Msg39Request),
&r->size_readSizes,
&r->size_whiteList,
&r->ptr_readSizes,
r,
&reqSize,
NULL,
0,
false);
// . send out a msg39 request to each shard
// . multicasts to a host in group "groupId"
// . we always block waiting for the reply with a multicast
// . returns false and sets g_errno on error
// . sends the request to fastest host in group "groupId"
// . if that host takes more than about 5 secs then sends to
// next host
// . key should be largest termId in group we're sending to
bool status;
status = mcast->send ( req , // m_rbufPtr ,
reqSize , // request size
0x39 , // msgType 0x39
true , // mcast owns m_request?
shardNum , // group to send to
false , // send to whole group?
0,//(int32_t)qh , // 0 // startKey.n1
this , // state1 data
mcast , // state2 data
gotReplyWrapper39 ,
multicast_xmldoc_sectionstats_timeout, //timeout
m_niceness,//m_r->m_niceness ,
-1, // firstHostId, // -1// bestHandlingHostId ,
NULL , // m_replyBuf ,
0 , // MSG39REPLYSIZE,
// this is true if multicast should free the
// reply, otherwise caller is responsible
// for freeing it after calling
// getBestReply().
// actually, this should always be false,
// there is a bug in Multicast.cpp.
// no, if we error out and never steal
// the buffers then they will go unfreed
// so they are freed by multicast by default
// then we steal control explicitly
true );
m_mcastRequestsOut++;
// if successfully launch, wait...
if ( status ) return (SectionStats *) -1;
// error?
if ( g_errno ) return NULL;//{ mcast->m_inUse = 0; return NULL; }
// sets &m_sectionStats and adds to the table
gotSectionFacets ( mcast );
// i guess did not block...
//return &msg3a->m_sectionStats;
return &m_sectionStats;
}
// . come here when msg39 got the ptr_faceHashList for our single
// gbfacet:gbxpathsitehash
// . returns false and sets g_errno on error
bool XmlDoc::gotSectionFacets ( Multicast *mcast ) {
//SectionStats *stats = &msg39->m_sectionStats;
if ( mcast->m_inUse ) { char *xx=NULL;*xx=0;}
// count it as returned
m_mcastRequestsIn++;
// mark it as available now
int32_t num = mcast - m_mcastArray;
// sanity
//if ( ! msg39->m_inUse ) { char *xx=NULL;*xx=0; }
// grab the xpath/site hash
uint32_t secHash32 = mcast->m_hack64;
// and our innher html for that xpath
int32_t myFacetVal32 = mcast->m_hack32;
// sanity. should only be a gbfacet:gbxpathsitehash12345567 term.
//if ( mcast->m_q->m_numTerms != 1 ) { char *xx=NULL;*xx=0; }
// reset all counts to 0
m_sectionStats.reset();
//////
//
// compile m_sectionStats
//
///////
// set m_sectionStats from the list of facet values for this
// gbfacet:xpathsitehash term...
// Query::m_queryTerm.m_facetHashTable has the facets merged
// from all the shards. so now compute the stats from them.
// set the section stats.
//QueryTerm *qt = &msg3a->m_q->m_qterms[0];
//HashTableX *ft = &qt->m_facetHashTable;
// . get the list of facet field/value pairs.
// . see how Msg3a.cpp merges these to see how they are stored
Msg39Reply *mr = (Msg39Reply *)mcast->m_readBuf;//getBestReply();
// this is NULL with g_errno set on error
if ( ! mr ) {
log("xmldoc: got error from sec stats mcast: %s",
mstrerror(g_errno));
return false;
}
deserializeMsg ( sizeof(Msg39Reply) ,
&mr->size_docIds,
&mr->size_clusterRecs,
&mr->ptr_docIds,
mr->m_buf );
char *p = (char *)(mr->ptr_facetHashList);
//char *pfinal = p + mr->size_facetHashList;
//
// should only be one termid of facets in here, so no need to re-loop
//
int32_t nh = 0;
// "matches" is how many docids with this facet field had our facet val
int32_t matches = 0;
// "totalDocIds" is how many docids had this facet field
int32_t totalFields = 0;
if ( p ) {
// first is the termid
//int64_t termId = *(int64_t *)p;
// skip that
p += 8;
// the # of unique 32-bit facet values
nh = *(int32_t *)p;
p += 4;
// the end point
char *pend = p + (8 * nh);
// now compile the facet hash list into there
for ( ; p < pend ; ) {
// does this facet value match ours?
// (i.e. same inner html?)
if ( *(int32_t *)p == myFacetVal32 )
matches += *(int32_t *)(p+4);
p += 4;
// now how many docids had this facet value?
totalFields += *(int32_t *)p;
p += 4;
}
}
// how many unique inner html content hashes for this xpath/site
// hash were there?
m_sectionStats.m_numUniqueVals = nh;//ft->m_numSlotsUsed;
// how many xpaths existsed over all docs. doc can have multiple.
m_sectionStats.m_totalEntries = totalFields;
// total # unique docids that had this facet
m_sectionStats.m_totalDocIds = mr->m_estimatedHits;//totalHits;
// how many had the same inner html content hash for
// this xpath/site as we did?
m_sectionStats.m_totalMatches = matches;
////////
//
// store m_sectionStats in cache
//
////////
// cache them. this does a copy of m_sectionStats
if ( ! m_sectionStatsTable.addKey ( &secHash32 , &m_sectionStats ) )
log("xmldoc: failed to add sections stats: %s",
mstrerror(g_errno));
// reset that msg39 to free its data
//msg39->reset();
if ( mcast != &m_mcastArray[num] ) { char *xx=NULL;*xx=0; }
// . make it available again
// . do this after all in case we were in quickpoll interruptting
// the getSectionStats() function below
//mcast->m_inUse = 0;
// free query Query::m_qwords array etc. to stop mem leaks
m_mcastArray [num].reset();
m_msg39RequestArray[num].reset();
//m_queryArray [num].reset();
// now when the master loop calls getSectionsWithDupStats() it
// should find the stats class in the cache!
return true;
}
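For reference, the ptr_facetHashList that gotSectionFacets() walks above is laid out as an 8-byte termId, a 4-byte count of unique facet values, then that many (facetVal32, count32) pairs. A hedged sketch of the matching writer side under that assumption; buildFacetHashList() is a hypothetical helper, not Gigablast code:

#include <cstdint>
#include <cstring>
#include <map>
#include <vector>

// Serialize one term's facet histogram as:
//   int64 termId | int32 numUniqueVals | numUniqueVals x (int32 facetVal32, int32 count)
// which is the layout the parse loop above steps through.
static std::vector<char> buildFacetHashList(int64_t termId,
                                            const std::map<uint32_t, int32_t> &hist) {
    std::vector<char> buf(8 + 4 + hist.size() * 8);
    char *p = &buf[0];
    memcpy(p, &termId, 8);            p += 8;
    int32_t nh = (int32_t)hist.size();
    memcpy(p, &nh, 4);                p += 4;
    for (const auto &kv : hist) {
        uint32_t val32 = kv.first;    // e.g. the innerHTML hash for this xpath/site
        int32_t  cnt   = kv.second;   // how many docids had that value
        memcpy(p, &val32, 4);         p += 4;
        memcpy(p, &cnt,   4);         p += 4;
    }
    return buf;
}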
int32_t *XmlDoc::getLinkSiteHashes ( ) {
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: BEGIN", __FILE__, __func__, __LINE__);
@ -20338,15 +19672,6 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
if ( ! cr ) { g_errno = ENOCOLLREC; return NULL; }
//CollectionRec *cr = getCollRec();
//if ( ! cr ) return NULL;
// set this important member var
//if (!cr ) cr=g_collectiondb.getRec(cr->m_coll,gbstrlen(cr->m_coll));
// return NULL with g_errno set on error
//if ( ! cr ) return NULL;
// . cache it for one hour
// . this will set our ptr_ and size_ member vars
char **otr = getOldTitleRec ( );
@ -22760,9 +22085,6 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
printRainbowSections ( sb , NULL );
// cut it short for debugging
logf(LOG_DEBUG,"xmldoc: FIX ME remove return");
//
// PRINT LINKINFO
//
@ -23645,11 +22967,7 @@ bool XmlDoc::printRainbowSections ( SafeBuf *sb , HttpRequest *hr ) {
//
// PRINT SECTIONS
//
Sections *sections ;
// hr is NULL if being called from page parser which does not have the
// dup stats! and we core if we block here!
if ( hr ) sections = getSectionsWithDupStats();
else sections = getSections();
Sections *sections = getSections();
if ( ! sections) return true;if (sections==(Sections *)-1)return false;
Words *words = getWords();
@ -23661,18 +22979,14 @@ bool XmlDoc::printRainbowSections ( SafeBuf *sb , HttpRequest *hr ) {
int32_t nw = words->getNumWords();
//int32_t wordStart = 0;
//int32_t wordEnd = nw;
int64_t *wids = words->getWordIds();
int32_t isXml = false;
if ( hr ) isXml = (bool)hr->getLong("xml",0);
//if ( ! isXml ) printMenu ( sb );
// now complement, cuz bigger is better in the ranking world
//int32_t densityRank = getDensityRank ( wids , 0 , nw , HASHGROUP_BODY );
SafeBuf densBuf;
// returns false and sets g_errno on error
if ( ! getDensityRanks((int64_t *)wids,
nw,
@ -23683,20 +22997,6 @@ bool XmlDoc::printRainbowSections ( SafeBuf *sb , HttpRequest *hr ) {
return true;
// a handy ptr
char *densityVec = (char *)densBuf.getBufStart();
/*
if ( ! isXml )
sb->safePrintf("<br><b>density rank of body = %"INT32"</b> "
"(out of %"INT32")"
"<br>"
"<br>"
, densityRank
, (int32_t)MAXDENSITYRANK
);
*/
char *wordSpamVec = getWordSpamVec();
char *fragVec = m_fragBuf.getBufStart();
@ -23704,18 +23004,13 @@ bool XmlDoc::printRainbowSections ( SafeBuf *sb , HttpRequest *hr ) {
if(!getDiversityVec(words,phrases,cnt,&dwbuf,m_niceness))return true;
char *diversityVec = dwbuf.getBufStart();
// hack fack debug
//m_bodyStartPos =2136;
SafeBuf wpos;
if ( ! getWordPosVec ( words ,
sections,
//wordStart,
//wordEnd,
// we save this in the titlerec, when we
// start hashing the body. we have the url
// terms before the body, so this is necessary.
m_bodyStartPos,//0, // hi->m_startDist,
m_bodyStartPos,
fragVec,
m_niceness,
&wpos) ) return true;

XmlDoc.h (209 lines changed)

@ -401,16 +401,9 @@ public:
class Bits *getBitsForSummary ( ) ;
class Pos *getPos ( );
class Phrases *getPhrases ( ) ;
//class Synonyms *getSynonyms ( );
class Sections *getExplicitSections ( ) ;
class Sections *getImpliedSections ( ) ;
class Sections *getSections ( ) ;
class Sections *getSectionsWithDupStats ( );
//BR 20160106 removed: class SafeBuf *getInlineSectionVotingBuf();
bool gotSectionFacets( class Multicast *mcast );
class SectionStats *getSectionStats ( uint32_t secHash32, uint32_t sentHash32, bool cacheOnly );
char **getSectionsReply ( ) ;
char **getSectionsVotes ( ) ;
int32_t *getLinkSiteHashes ( );
class Links *getLinks ( bool doQuickSet = false ) ;
class HashTableX *getCountTable ( ) ;
@ -591,10 +584,6 @@ public:
char *addOutlinkSpiderRecsToMetaList ( );
//bool addTable96 ( class HashTableX *tt1 ,
// int32_t date1 ,
// bool nosplit ) ;
int32_t getSiteRank ();
bool addTable144 ( class HashTableX *tt1 ,
int64_t docId ,
@ -602,11 +591,6 @@ public:
bool addTable224 ( HashTableX *tt1 ) ;
//bool addTableDate ( class HashTableX *tt1 , //T<key128_t,char> *tt1
// uint64_t docId ,
// uint8_t rdbId ,
// bool nosplit ) ;
bool addTable128 ( class HashTableX *tt1 , // T <key128_t,char>*tt1
uint8_t rdbId ,
bool forDelete ) ;
@ -627,10 +611,7 @@ public:
bool hashUrl ( class HashTableX *table, bool urlOnly );
bool hashDateNumbers ( class HashTableX *tt );
bool hashSections ( class HashTableX *table ) ;
bool hashIncomingLinkText ( class HashTableX *table ,
bool hashAnomalies ,
bool hashNonAnomalies ) ;
bool hashIncomingLinkText( class HashTableX *table, bool hashAnomalies, bool hashNonAnomalies );
bool hashLinksForLinkdb ( class HashTableX *table ) ;
bool hashNeighborhoods ( class HashTableX *table ) ;
bool hashRSSInfo ( class HashTableX *table ) ;
@ -648,11 +629,8 @@ public:
bool hashTagRec ( class HashTableX *table ) ;
bool hashPermalink ( class HashTableX *table ) ;
bool hashVectors(class HashTableX *table ) ;
// BR 20160106 removed: bool hashAds(class HashTableX *table ) ;
class Url *getBaseUrl ( ) ;
// BR 20160106 removed: bool hashSubmitUrls ( class HashTableX *table ) ;
// BR 20160106 removed: bool hashImageStuff ( class HashTableX *table ) ;
bool hashIsAdult ( class HashTableX *table ) ;
void set20 ( Msg20Request *req ) ;
@ -672,59 +650,21 @@ public:
char *getIsErrorPage ( ) ;
char* matchErrorMsg(char* p, char* pend );
bool hashWords ( //int32_t wordStart ,
//int32_t wordEnd ,
class HashInfo *hi ) ;
bool hashSingleTerm ( int64_t termId ,
class HashInfo *hi ) ;
bool hashSingleTerm ( char *s ,
int32_t slen ,
class HashInfo *hi );
bool hashString ( class HashTableX *ht ,
//class Weights *we ,
class Bits *bits ,
char *s ,
int32_t slen ) ;
bool hashString ( char *s ,
int32_t slen ,
class HashInfo *hi ) ;
bool hashString ( char *s ,
class HashInfo *hi ) ;
bool hashWords( class HashInfo *hi );
bool hashSingleTerm( int64_t termId, class HashInfo *hi );
bool hashSingleTerm( char *s, int32_t slen, class HashInfo *hi );
bool hashString( class HashTableX *ht, class Bits *bits, char *s, int32_t slen );
bool hashString( char *s, int32_t slen, class HashInfo *hi );
bool hashString( char *s, class HashInfo *hi );
bool hashWords3( class HashInfo *hi, class Words *words, class Phrases *phrases, class Synonyms *synonyms,
class Sections *sections, class HashTableX *countTable, char *fragVec, char *wordSpamVec,
char *langVec, char docLangId, class SafeBuf *pbuf, class HashTableX *wts,
class SafeBuf *wbuf, int32_t niceness );
bool hashWords3 ( //int32_t wordStart ,
//int32_t wordEnd ,
class HashInfo *hi ,
class Words *words ,
class Phrases *phrases ,
class Synonyms *synonyms ,
class Sections *sections ,
class HashTableX *countTable ,
char *fragVec ,
char *wordSpamVec ,
char *langVec ,
char docLangId , // default lang id
class SafeBuf *pbuf ,
class HashTableX *wts ,
class SafeBuf *wbuf ,
int32_t niceness );
bool hashString3 ( char *s ,
int32_t slen ,
class HashInfo *hi ,
class HashTableX *countTable ,
class SafeBuf *pbuf ,
class HashTableX *wts ,
class SafeBuf *wbuf ,
int32_t version ,
int32_t siteNumInlinks ,
int32_t niceness );
//bool hashSectionTerm ( char *term ,
// class HashInfo *hi ,
// int32_t sentHash32 ) ;
bool hashString3( char *s, int32_t slen, class HashInfo *hi, class HashTableX *countTable,
class SafeBuf *pbuf, class HashTableX *wts, class SafeBuf *wbuf, int32_t version,
int32_t siteNumInlinks, int32_t niceness );
bool hashFacet1 ( char *term, class Words *words , HashTableX *dt) ;
@ -782,16 +722,12 @@ public:
public:
// stuff set from the key of the titleRec, above the compression area
//key_t m_key;
int64_t m_docId;
char *m_ubuf;
int32_t m_ubufSize;
int32_t m_ubufAlloc;
// does this page link to gigablast, or has a search form to it?
//bool searchboxToGigablast();
// private:
// we we started spidering it, in milliseconds since the epoch
@ -806,16 +742,6 @@ public:
int64_t m_setTime;
int64_t m_cpuSummaryStartTime;
// timers
int64_t m_beginSEOTime;
int64_t m_beginTimeAllMatch;
int64_t m_beginTimeMatchUrl;
int64_t m_beginTimeFullQueries;
int64_t m_beginTimeLinks;
//int64_t m_beginMsg98s;
int64_t m_beginRelatedQueries;
int64_t m_beginMsg95s;
// . these should all be set using set*() function calls so their
// individual validity flags can bet set to true, and successive
// calls to their corresponding get*() functions will not core
@ -836,8 +762,6 @@ public:
int64_t m_firstUrlHash64;
Url m_currentUrl;
//char *m_coll;
//char m_collBuf[MAX_COLL_LEN+1]; // include \0
CollectionRec *m_lastcr;
collnum_t m_collnum;
int32_t m_lastCollRecResetCount;
@ -871,88 +795,24 @@ public:
Bits m_bits2;
Pos m_pos;
Phrases m_phrases;
//Synonyms m_synonyms;
SafeBuf m_synBuf;
//Weights m_weights;
Sections m_sections;
// a hack storage thing used by Msg13.cpp
class Msg13Request *m_hsr;
Section *m_si;
//Section *m_nextSection;
//Section *m_lastSection;
int32_t m_mcastRequestsOut;
int32_t m_mcastRequestsIn;
int32_t m_secStatsErrno;
char *m_queryBuf;
Msg39Request *m_msg39RequestArray;
SafeBuf m_mcastBuf;
Multicast *m_mcastArray;
//char *m_inUse;
//Query *m_queryArray;
//Query *m_sharedQuery;
bool m_gotDupStats;
//Query m_q4;
//Msg3a m_msg3a;
//Msg39Request m_r39;
Msg39Request m_mr2;
SectionStats m_sectionStats;
HashTableX m_sectionStatsTable;
//char m_sectionHashQueryBuf[128];
// also set in getSections()
int32_t m_maxVotesForDup;
// . for rebuild logging of what's changed
// . Repair.cpp sets these based on titlerec
char m_logLangId;
int32_t m_logSiteNumInlinks;
int32_t m_numSectiondbReads;
int32_t m_numSectiondbNeeds;
key128_t m_sectiondbStartKey;
RdbList m_secdbList;
int32_t m_sectiondbRecall;
bool m_gotFacets;
SafeBuf m_tmpBuf2;
SafeBuf m_inlineSectionVotingBuf;
//HashTableX m_rvt;
//Msg17 m_msg17;
//char *m_cachedRootVoteRec;
//int32_t m_cachedRootVoteRecSize;
//bool m_triedVoteCache;
//bool m_storedVoteCache;
//SafeBuf m_cacheRecBuf;
SafeBuf m_timeAxisUrl;
HashTableX m_turkVotingTable;
HashTableX m_turkBitsTable;
uint32_t m_confirmedTitleContentHash ;
uint32_t m_confirmedTitleTagHash ;
// turk voting tag rec
TagRec m_vtr;
// tagrec of banned turks
TagRec m_bannedTurkRec;
// and the table of the hashed banned turk users
HashTableX m_turkBanTable;
// used for displaying turk votes...
HashTableX m_vctab;
HashTableX m_vcduptab;
Images m_images;
HashTableX m_countTable;
HttpMime m_mime;
TagRec m_tagRec;
SafeBuf m_tagRecBuf;
// copy of m_oldTagRec but with our modifications, if any
//TagRec m_newTagRec;
SafeBuf m_newTagBuf;
SafeBuf m_fragBuf;
SafeBuf m_wordSpamBuf;
@ -962,9 +822,6 @@ public:
class SafeBuf *m_savedSb;
class HttpRequest *m_savedHr;
char m_savedChar;
// validity flags. on reset() all these are set to false.
char m_VALIDSTART;
// DO NOT add validity flags above this line!
@ -992,7 +849,6 @@ public:
char m_filteredRootTitleBufValid;
char m_titleBufValid;
char m_fragBufValid;
char m_inlineSectionVotingBufValid;
char m_wordSpamBufValid;
char m_finalSummaryBufValid;
char m_matchingQueryBufValid;
@ -1044,10 +900,6 @@ public:
char m_sectionsValid;
char m_subSentsValid;
char m_turkVotingTableValid;
char m_turkBitsTableValid;
char m_turkBanTableValid;
char m_vctabValid;
char m_explicitSectionsValid;
char m_impliedSectionsValid;
char m_imageDataValid;
@ -1132,9 +984,6 @@ public:
bool m_isWWWDupValid;
bool m_linkInfo1Valid;
bool m_linkSiteHashesValid;
bool m_sectionsReplyValid;
bool m_sectionsVotesValid;
bool m_sectiondbDataValid;
bool m_placedbDataValid;
bool m_siteHash64Valid;
bool m_siteHash32Valid;
@ -1197,9 +1046,6 @@ public:
// DO NOT add validity flags below this line!
char m_VALIDEND;
// more stuff
//char *m_utf8Content;
//int32_t m_utf8ContentLen;
bool m_printedMenu;
int32_t m_urlPubDate;
@ -1253,11 +1099,9 @@ public:
int32_t m_siteSpiderQuota;
//int32_t m_numBannedOutlinks;
class XmlDoc *m_oldDoc;
class XmlDoc *m_extraDoc;
class XmlDoc *m_rootDoc;
//class XmlDoc *m_gatewayDoc;
RdbList m_oldMetaList;
char *m_oldTitleRec;
int32_t m_oldTitleRecSize;
@ -1275,10 +1119,7 @@ public:
int32_t m_tagdbCollLen;
Url m_extraUrl;
//int32_t m_siteNumInlinksFresh;
//int32_t m_sitePop;
uint8_t m_siteNumInlinks8;
//int32_t m_siteNumInlinks;
LinkInfo m_siteLinkInfo;
SafeBuf m_mySiteLinkInfoBuf;
SafeBuf m_myPageLinkInfoBuf;
@ -1289,7 +1130,6 @@ public:
char m_useSiteLinkBuf;
char m_usePageLinkBuf;
char m_printInXml;
//Msg25 m_msg25;
SafeBuf m_tmpBuf11;
SafeBuf m_tmpBuf12;
Multicast m_mcast11;
@ -1297,7 +1137,6 @@ public:
// lists from cachedb for msg25's msg20 replies serialized
RdbList m_siteReplyList;
RdbList m_pageReplyList;
//void (* m_masterLoopWrapper) (void *state);
MsgC m_msgc;
bool m_isAllowed;
bool m_forwardDownloadRequest;
@ -1308,10 +1147,6 @@ public:
// for limiting # of iframe tag expansions
int32_t m_numExpansions;
char m_newOnly;
//int32_t m_tryAgainTimeDelta;
//int32_t m_sameIpWait;
//int32_t m_sameDomainWait;
//int32_t m_maxSpidersPerDomain;
char m_isWWWDup;
char m_calledMsg0b;
@ -1322,24 +1157,14 @@ public:
class RdbList *m_ulist;
void *m_hack;
class XmlDoc *m_hackxd;
//class LinkInfo *m_linkInfo1Ptr;
char *m_linkInfoColl;
//char m_injectedReply;
//int32_t m_minInlinkerHopCount;
//class LinkInfo *m_linkInfo2Ptr;
SiteGetter m_siteGetter;
int64_t m_siteHash64;
//char *m_site;
//int32_t m_siteLen;
//Url m_siteUrl;
int32_t m_siteHash32;
char *m_httpReply;
//char m_downloadAttempted;
char m_incrementedAttemptsCount;
char m_incrementedDownloadCount;
char m_redirectFlag;
//char m_isScraping;
//char m_throttleDownload;
char m_spamCheckDisabled;
char m_useRobotsTxt;
int32_t m_robotsTxtLen;
@ -1353,15 +1178,12 @@ public:
int32_t m_filteredContentMaxSize;
char m_calledThread;
int32_t m_errno;
//class CollectionRec *m_cr;
//int32_t m_utf8ContentAllocSize;
int32_t m_hostHash32a;
int32_t m_hostHash32b;
int32_t m_domHash32;
int32_t m_priorityQueueNum;
// this points into m_msge0 i guess
//class TagRec **m_outlinkTagRecVector;
Msge0 m_msge0;
// this points into m_msge1 i guess
@ -1657,10 +1479,7 @@ public:
bool m_storeTermListInfo;
char m_sortTermListBy;
SafeBuf m_sectiondbData;
//char *m_sectiondbData;
char *m_placedbData;
//int32_t m_sectiondbDataSize;
int32_t m_placedbDataSize;
// we now have HashInfo to replace this

@ -1869,18 +1869,7 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
}
/////////////
//
// CHROME DETECTION
//
// we search for these terms we hash here in getSectionsWithDupStats()
// so we can remove chrome.
//
/////////////
// . returns false and sets g_errno on error
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
bool XmlDoc::hashSections ( HashTableX *tt ) {
// BR 20160106: No longer store xpath-hashes in posdb as we do not use them.
return true;
@ -3539,7 +3528,6 @@ skipsingleword:
// between calls? i.e. hashTitle() and hashBody()
//if ( wc > 0 ) m_dist = wposvec[wc-1] + 100;
if ( i > 0 ) m_dist = wposvec[i-1] + 100;
return true;