mirror of https://github.com/privacore/open-source-search-engine.git
synced 2025-07-15 02:36:08 -04:00

Remove more commented-out code & unused sections code
Msg39.cpp
@@ -1068,7 +1068,7 @@ void Msg39::estimateHitsAndSendReply ( ) {
		mr.size_pairScoreBuf   = 0;
		mr.size_singleScoreBuf = 0;
	}
	//mr.m_sectionStats = pt->m_sectionStats;

	// reserve space for these guys, we fill them in below
	mr.ptr_docIds = NULL;
	mr.ptr_scores = NULL;
Msg3a.cpp (96 changed lines)
@@ -131,13 +131,6 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
		log(LOG_LOGIC,"net: bad collection. msg3a. %"INT32"",
		    (int32_t)m_r->m_collnum);

	//m_indexdbSplit = g_hostdb.m_indexSplits;
	// certain query terms, like gbdom:xyz.com, are NOT split
	// at all in order to keep performance high because such
	// terms are looked up by the spider. if a query contains
	// multiple "no split" terms, then it becomes split unfortunately...
	//if ( ! m_q->isSplit() ) m_indexdbSplit = 1;

	// for a sanity check in Msg39.cpp
	r->m_nqt = m_q->getNumTerms();

@@ -154,10 +147,7 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
	// . return now if query empty, no docids, or none wanted...
	// . if query terms = 0, might have been "x AND NOT x"
	if ( m_q->getNumTerms() <= 0 ) return true;
	// sometimes we want to get section stats from the hacked
	// sectionhash: posdb termlists
	//if ( m_docsToGet <= 0 && ! m_r->m_getSectionStats )
	//	return true;

	// . set g_errno if not found and return true
	// . coll is null terminated
	CollectionRec *cr = g_collectiondb.getRec(r->m_collnum);
@@ -234,24 +224,17 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
	// update our read info
	for ( int32_t j = 0; j < n ; j++ ) {
		// the read size for THIS query term
		int32_t rs = 300000000; // toRead; 300MB i guess...
		// limit to 50MB man! this was 30MB but the
		// 'time enough for love' query was hitting 30MB termlists.
		//rs = 50000000;
		rs = DEFAULT_POSDB_READSIZE;//90000000; // 90MB!
		// it is better to go oom than leave users scratching their
		// heads as to why some results are not being returned.
		// no, because we are going out of mem for queries like
		// 'www.disney.nl' etc.
		//rs = -1;
		// if section stats, limit to 1MB
		//if ( m_r->m_getSectionStats ) rs = 1000000;
		int32_t rs = DEFAULT_POSDB_READSIZE;//90000000; // 90MB!

		// get the jth query term
		QueryTerm *qt = &m_q->m_qterms[j];

		// if the query term is ignored, skip it
		if ( qt->m_ignored ) rs = 0;

		// set it
		readSizes[j] = rs;

		// serialize these too
		tfw[j] = qt->m_termFreqWeight;
	}
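
The hunk above collapses the per-term read budget to a single flat cap. A minimal sketch of the resulting logic, where kDefaultPosdbReadSize and TermInfo are hypothetical stand-ins for the repo's DEFAULT_POSDB_READSIZE and QueryTerm:

	#include <cstdint>

	// Illustrative stand-ins, not the repo's actual types/constants.
	static const int32_t kDefaultPosdbReadSize = 90000000; // ~90MB

	struct TermInfo { bool ignored; };

	// One read budget per query term: a flat cap, zero for ignored terms.
	void setReadSizes(int32_t *readSizes, const TermInfo *terms, int32_t n) {
		for (int32_t j = 0; j < n; j++)
			readSizes[j] = terms[j].ignored ? 0 : kDefaultPosdbReadSize;
	}
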
@@ -265,8 +248,7 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
	// Query::expandQuery() above
	m_r->ptr_query  = m_q->m_orig;
	m_r->size_query = m_q->m_origLen+1;
	// the white list now too...
	//m_r->ptr_whiteList = si->m_whiteListBuf.getBufStart();

	// free us?
	if ( m_rbufPtr && m_rbufPtr != m_rbuf ) {
		mfree ( m_rbufPtr , m_rbufSize, "Msg3a" );
@@ -774,64 +756,6 @@ bool Msg3a::mergeLists ( ) {
	//m_totalDocCount = 0; // int32_t docCount = 0;
	m_moreDocIdsAvail = true;

	/*

	This version is too simple. Now each query term can be a
	gbfacet:price or gbfacet:type term and each has a
	list in the Msg39Reply::ptr_facetHashList for its termid.

	//
	// compile facet stats
	//
	for ( int32_t j = 0; j < m_numHosts ; j++ ) {
		Msg39Reply *mr = m_reply[j];
		// one table for each query term
		char *p = mr->ptr_facetHashList;
		// loop over all query terms
		int32_t n = m_q->getNumTerms();
		// use this
		HashTableX tmp;
		// do the loop
		for ( int32_t i = 0 ; i < n ; i++ ) {
			// size of it
			int32_t psize = *(int32_t *)p;
			p += 4;
			tmp.deserialize ( p , psize );
			p += psize;
			// now compile the stats into a master table
			for ( int32_t k = 0 ; k < tmp.m_numSlots ; k++ ) {
				if ( ! tmp.m_flags[k] ) continue;
				// get the value
				int32_t v32 = *(int32_t *)tmp.getKeyFromSlot(k);
				// and how many of them there were
				int32_t count = *(int32_t *)tmp.getValueFromSlot(k);
				// add to master
				master.addScore32 ( v32 , count );
			}
		}
	}
	////////
	//
	// now set m_facetStats
	//
	////////
	// add up all counts
	int64_t count = 0LL;
	for ( int32_t i = 0 ; i < master.getNumSlots() ; i++ ) {
		if ( ! master.m_flags[i] ) continue;
		int64_t slotCount = *(int32_t *)master.getValueFromSlot(i);
		int32_t h32 = *(int32_t *)master.getKeyFromSlot(i);
		if ( h32 == m_r->m_myFacetVal32 )
			m_facetStats.m_myValCount = slotCount;
		count += slotCount;
	}
	m_facetStats.m_totalUniqueValues = master.getNumUsedSlots();
	m_facetStats.m_totalValues = count;
	*/


	// shortcut
	//int32_t numSplits = m_numHosts;//indexdbSplit;

	// . point to the various docids, etc. in each shard reply
	// . tcPtr = term count. how many required query terms does the doc
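
The removed block above merged per-shard facet histograms (32-bit value -> count) into one master table before deriving m_facetStats. A sketch of that merge with standard containers in place of HashTableX; names are illustrative, not the repo's API:

	#include <cstdint>
	#include <unordered_map>
	#include <vector>

	// Per-shard facet histogram: facet value -> occurrence count.
	using FacetHist = std::unordered_map<int32_t, int64_t>;

	// Accumulate every shard's counts per value, as the deleted loop
	// did with HashTableX::addScore32().
	FacetHist mergeFacetHistograms(const std::vector<FacetHist> &shards) {
		FacetHist master;
		for (const FacetHist &h : shards)
			for (const auto &kv : h)
				master[kv.first] += kv.second;
		return master;
	}

The stats then fall out of the master table: total values is the sum of all counts, total unique values is the table size, and our own value's count is a single lookup.
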
@@ -920,11 +844,6 @@ bool Msg3a::mergeLists ( ) {
	for ( int32_t j = 0; j < m_numQueriedHosts ; j++ ) {
		Msg39Reply *mr = m_reply[j];
		if ( ! mr ) continue;
		//SectionStats *src = &mr->m_sectionStats;
		//dst->m_onSiteDocIds  += src->m_onSiteDocIds;
		//dst->m_offSiteDocIds += src->m_offSiteDocIds;
		//dst->m_totalMatches  += src->m_totalMatches;
		//dst->m_totalEntries  += src->m_totalEntries;
		// now the list should be the unique site hashes that
		// had the section hash. we need to uniquify them again
		// here.

@@ -1036,7 +955,6 @@ bool Msg3a::mergeLists ( ) {
	if ( ! sortFacetEntries() )
		return true;

	//if ( m_r->m_getSectionStats ) return true;
	//
	// HACK: END section stats merge
	//

Msg3a.h (11 changed lines)
@@ -145,17 +145,6 @@ public:
	// when merging this list of docids into a final list keep
	// track of the cursor into m_docIds[]
	int32_t m_cursor;

	// what collection # are these docids from if m_collnums[] is NULL
	//collnum_t m_collnum;

	// we don't have FacetStats because we have the actual
	// Msg39Reply::ptr_facetHashList from each shard which contains
	// all the facet hash lists for each gbfacet: query term we had,
	// and the query "Msg3a::m_q.m_qterms[].m_dt" is the hash table
	// where each key is a facet hash for that gbfacet:xxxx term and
	// the value is the # of occurrences.
	//SectionStats m_sectionStats;
};

#endif
Posdb.cpp (89 changed lines)
@@ -3839,95 +3839,6 @@ void PosdbTable::intersectLists10_r ( ) {

	if( g_conf.m_logTracePosdb ) log(LOG_TRACE,"%s:%s:%d: seoHack: %s, numTerms: %"INT32"", __FILE__,__func__, __LINE__, seoHack?"true":"false", m_q->m_numTerms);

	// if we are just a sitehash:xxxxx list and m_getSectionStats is
	// true then assume the list is one of hacked posdb keys where
	// the wordposition bits and others are really a 32-bit site hash,
	// and we have to see how many different docids and sites have
	// this term. and we compare to our site hash,
	// m_r->m_sectionSiteHash32 to determine if the posdb key is
	// onsite or offsite. then XmlDoc::printRainbowSections()
	// can print out how many pages/sites duplicate your section's content.

	// MDW: TODO: for the facet terms just compile the stats and do not
	// send to intersecting. they are ignored for those purposes. send
	// the hashtable back so msg3a can integrate the stats. keep in mind
	// we have multiple docid ranges sometimes for one query!!!!

	/*

	MDW: take this out. now treat as a normal termlist but
	do not use for scoring. so it is kinda like the gbmin: gbmax:
	query operators but it will just add the facet values to
	QueryTerm::m_facetHashList for transmission back to the aggregator
	node. however, it is only for docids in the final result set!

	if ( m_r->m_getFacetStats ) {
		// reset
		m_facetStats.m_totalMatches = 0;
		m_facetStats.m_totalEntries = 0;
		m_dt.clear();
		// scan the posdb keys
		//for ( int32_t i = 0 ; i < m_msg2->getNumListsInGroup(0); i++) {
		// get the sublist
		RdbList *list = m_msg2->getList(0);//Group(0)[i];
		char *p    = list->getList ();
		char *pend = p + list->getListSize();
		// test
		//int64_t final = 5663137686803656554LL;
		//final &= TERMID_MASK;
		//if ( p<pend && g_posdb.getTermId(p) == final )
		//	log("boo");
		// scan it
		for ( ; p < pend ; ) {
			// . first key is the full size
			// . uses the w,G,s,v and F bits to hold this
			// . this is no longer necessarily sitehash, but
			//   can be any val, like now FacetStats is using
			//   it for the innerHtml sentence content hash32
			int32_t sh32 = g_posdb.getFacetVal32 ( p );
			//int64_t d = g_posdb.getDocId(p);
			//int32_t rs = list->getRecSize(p);
			// this will not update listptrlo, watch out!
			p += list->getRecSize ( p );
			// does this xpath from another docid have the
			// same inner html as us?
			if ( sh32 == m_r->m_myFacetVal32 ) // m_siteHash32 )
				m_facetStats.m_totalMatches++;
			// always this
			m_facetStats.m_totalEntries++;
			// unique site count
			if ( m_dt.isInTable ( &sh32 ) ) continue;
			// count it
			m_facetStats.m_numUniqueVals++;
			// only once
			m_dt.addKey ( &sh32 );
			// log it
			//log("usite: %08"XINT32" %"INT64" rs=%"INT32"",sh32,d,rs);
			// stop if too much so we do not try to
			// re-alloc in a thread!
			if ( m_dt.m_numSlotsUsed >= 1000000 ) break;
		}
		// and return the list for merging
		int32_t *s    = (int32_t *)m_facetHashList.getBufStart();
		int32_t *send = (int32_t *)m_facetHashList.getBufEnd();
		//if ( m_facetStats.m_numUniqueSites == 17 ) {
		//	log("q=%s",m_r->ptr_query);
		//	log("hey");
		//	//char *xx = NULL;*xx=0;
		//}
		//if(!strcmp(m_r->ptr_query,"gbsectionhash:3335323672699668766"
		//	log("boo");
		int32_t *orig = s;
		for ( int32_t i = 0 ; i < m_dt.m_numSlots ; i++ ) {
			if ( ! m_dt.m_flags[i] ) continue;
			*s++ = *(int32_t *)m_dt.getKeyFromSlot(i);
			if ( s >= send ) break;
		}
		m_facetHashList.setLength((char *)s-(char *)orig);
		return;
	}
	*/

	//
	// hash the docids in the whitelist termlists into a hashtable.
	// every docid in the search results must be in there. the
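
For reference, the removed m_getFacetStats scan boils down to: pull a 32-bit facet value out of each posdb key, count total entries, count entries matching our own value, and count the unique values. A standalone sketch of that tally, with the RdbList walk abstracted to a vector of already-extracted values (illustrative names only):

	#include <cstdint>
	#include <unordered_set>
	#include <vector>

	struct FacetTally { int64_t matches = 0, entries = 0, uniqueVals = 0; };

	// vals holds the 32-bit value each key carried (g_posdb.getFacetVal32()
	// in the real list walk); myVal32 is this page's own value, e.g. its
	// inner-html hash for the xpath in question.
	FacetTally tallyFacetVals(const std::vector<int32_t> &vals, int32_t myVal32) {
		FacetTally t;
		std::unordered_set<int32_t> seen;
		for (int32_t v : vals) {
			if (v == myVal32) t.matches++; // another page with identical content
			t.entries++;
			if (seen.insert(v).second) t.uniqueVals++;
		}
		return t;
	}
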
Posdb.h (106 changed lines)
@@ -440,80 +440,8 @@ public:
	int32_t m_quotedStartId;
};


/*
#include "RdbList.h"

class PosdbList : public RdbList {

 public:

	// why do i have to repeat this for LinkInfo::set() calling our set()??
	void set ( char *list , int32_t listSize , bool ownData ) {
		RdbList::set ( list ,
			       listSize ,
			       list ,          // alloc
			       listSize ,      // alloc size
			       0 ,             // fixed data size
			       ownData ,
			       true ,          // use half keys?
			       sizeof(key_t)); // 12 bytes per key
	};

	// clear the low bits on the keys so terms are DELETED
	void clearDelBits ( );

	void print();


	// . these are made for special IndexLists, too
	// . getTermId() assumes a 12 byte key
	int64_t getCurrentTermId12 ( ) {
		return getTermId12 ( m_listPtr ); };
	int64_t getTermId12 ( char *rec ) {
		return (*(uint64_t *)(&rec[4])) >> 16 ;
	};
	int64_t getTermId16 ( char *rec ) {
		return (*(uint64_t *)(&rec[8])) >> 16 ;
	};
	// these 2 assume 12 and 6 byte keys respectively
	int64_t getCurrentDocId () {
		if ( isHalfBitOn ( m_listPtr ) ) return getDocId6 (m_listPtr);
		else                             return getDocId12(m_listPtr);
	};
	int64_t getDocId ( char *rec ) {
		if ( isHalfBitOn ( rec ) ) return getDocId6 (rec);
		else                       return getDocId12(rec);
	};
	int64_t getCurrentDocId12 ( ) {
		return getDocId12 ( m_listPtr ); };
	int64_t getDocId12 ( char *rec ) {
		return ((*(uint64_t *)(rec)) >> 2) & DOCID_MASK; };
	int64_t getDocId6 ( char *rec ) {
		int64_t docid;
		*(int32_t *)(&docid) = *(int32_t *)rec;
		((char *)&docid)[4] = rec[4];
		docid >>= 2;
		return docid & DOCID_MASK;
	};
	// this works with either 12 or 6 byte keys
	unsigned char getCurrentScore ( ) {
		return getScore(m_listPtr); };
	unsigned char getScore ( char *rec ) { return ~rec[5]; };

	// uncomplemented...
	void setScore ( char *rec , char score ) { rec[5] = score; };

	// for date lists only...
	int32_t getCurrentDate ( ) { return ~*(int32_t *)(m_listPtr+6); };
};
*/
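
The deleted accessors above decode docids straight out of raw key bytes. A sketch of the same extraction using memcpy instead of the unaligned pointer casts; kDocIdMask is a stand-in for DOCID_MASK (assuming the usual 38-bit docid), and like the original this assumes a little-endian host:

	#include <cstdint>
	#include <cstring>

	static const uint64_t kDocIdMask = (1ULL << 38) - 1; // stand-in for DOCID_MASK

	// 12-byte full key: the docid field sits in the low bytes, shifted up 2 bits.
	int64_t docIdFrom12ByteKey(const char *rec) {
		uint64_t w;
		std::memcpy(&w, rec, 8);
		return (w >> 2) & kDocIdMask;
	}

	// 6-byte half key: only 5 bytes of the docid field are present.
	int64_t docIdFrom6ByteKey(const char *rec) {
		uint64_t w = 0;
		std::memcpy(&w, rec, 5);
		return (w >> 2) & kDocIdMask;
	}

memcpy sidesteps the undefined behavior of dereferencing an unaligned char* as uint64_t*, which the removed code relied on working by accident.
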
#include "Query.h" // MAX_QUERY_TERMS, qvec_t
|
||||
|
||||
// max # search results that can be viewed without using TopTree
|
||||
//#define MAX_RESULTS 1000
|
||||
|
||||
class PosdbTable {
|
||||
|
||||
public:
|
||||
@ -525,10 +453,7 @@ class PosdbTable {
|
||||
char debug ,
|
||||
void *logstate ,
|
||||
class TopTree *topTree ,
|
||||
//char *coll ,
|
||||
collnum_t collnum ,
|
||||
//IndexList *lists ,
|
||||
//int32_t numLists ,
|
||||
class Msg2 *msg2,
|
||||
class Msg39Request *r );
|
||||
|
||||
@ -538,12 +463,6 @@ class PosdbTable {
|
||||
// pre-allocate memory since intersection runs in a thread
|
||||
bool allocTopTree ( );
|
||||
|
||||
// . returns false on error and sets errno
|
||||
// . we assume there are "m_numTerms" lists passed in (see set() above)
|
||||
//void intersectLists_r ( );
|
||||
|
||||
//void intersectLists9_r ( );
|
||||
|
||||
void getTermPairScoreForNonBody ( int32_t i, int32_t j,
|
||||
char *wpi, char *wpj,
|
||||
char *endi, char *endj,
|
||||
@ -580,7 +499,9 @@ class PosdbTable {
|
||||
void freeMem ( ) ;
|
||||
|
||||
// has init already been called?
|
||||
bool isInitialized ( ) { return m_initialized; };
|
||||
bool isInitialized() {
|
||||
return m_initialized;
|
||||
}
|
||||
|
||||
uint64_t m_docId;
|
||||
|
||||
@ -609,56 +530,37 @@ class PosdbTable {
|
||||
|
||||
int32_t m_maxScores;
|
||||
|
||||
//char *m_coll;
|
||||
collnum_t m_collnum;
|
||||
|
||||
int32_t *m_qpos;
|
||||
int32_t *m_wikiPhraseIds;
|
||||
int32_t *m_quotedStartIds;
|
||||
//class DocIdScore *m_ds;
|
||||
int32_t m_qdist;
|
||||
float *m_freqWeights;
|
||||
//int64_t *m_freqs;
|
||||
char *m_bflags;
|
||||
int32_t *m_qtermNums;
|
||||
float m_bestWindowScore;
|
||||
//char **m_finalWinners1;
|
||||
//char **m_finalWinners2;
|
||||
//float *m_finalScores;
|
||||
char **m_windowTermPtrs;
|
||||
|
||||
// how many docs in the collection?
|
||||
int64_t m_docsInColl;
|
||||
|
||||
//SectionStats m_sectionStats;
|
||||
//SafeBuf m_facetHashList;
|
||||
//HashTableX m_dt;
|
||||
|
||||
class Msg2 *m_msg2;
|
||||
|
||||
// if getting more than MAX_RESULTS results, use this top tree to hold
|
||||
// them rather than the m_top*[] arrays above
|
||||
class TopTree *m_topTree;
|
||||
|
||||
//HashTableX m_docIdTable;
|
||||
|
||||
SafeBuf m_scoreInfoBuf;
|
||||
SafeBuf m_pairScoreBuf;
|
||||
SafeBuf m_singleScoreBuf;
|
||||
|
||||
SafeBuf m_stackBuf;
|
||||
|
||||
//SafeBuf m_mergeBuf;
|
||||
|
||||
// a reference to the query
|
||||
Query *m_q;
|
||||
int32_t m_nqt;
|
||||
|
||||
// these are NOT in imap space, but in query term space, 1-1 with
|
||||
// Query::m_qterms[]
|
||||
//IndexList *m_lists;
|
||||
//int32_t m_numLists;
|
||||
|
||||
// has init() been called?
|
||||
bool m_initialized;
|
||||
|
||||
@ -668,8 +570,6 @@ class PosdbTable {
|
||||
// for debug msgs
|
||||
void *m_logstate;
|
||||
|
||||
//int64_t m_numDocsInColl;
|
||||
|
||||
class Msg39Request *m_r;
|
||||
|
||||
// for gbsortby:item.price ...
|
||||
|
Sections.cpp (20 changed lines)
@@ -3798,26 +3798,6 @@ bool Sections::printSectionDiv ( Section *sk , char format ) {
			   ,mod);
	}

	SectionStats *ss = &sk->m_stats;

	// also the value of the inner html hashed
	if ( sk->m_flags & SEC_HASHXPATH ) {
		uint32_t val ;
		val = (uint32_t) sk->m_indirectSentHash64 ;
		m_sbuf->safePrintf("xpathsitehashval=%"UINT32" ", val );
	}

	// some voting stats
	if ( sk->m_flags & SEC_HASHXPATH ) {
		m_sbuf->safePrintf("_s=M%"INT32"D%"INT32"n%"INT32"u%"INT32"h%"UINT32" "
				   ,(int32_t)ss->m_totalMatches
				   ,(int32_t)ss->m_totalDocIds
				   ,(int32_t)ss->m_totalEntries
				   ,(int32_t)ss->m_numUniqueVals
				   ,(uint32_t)mod
				   );
	}

	printFlags ( m_sbuf , sk );

	if ( isHardSection(sk) )

Sections.h
@@ -167,10 +167,6 @@ public:
	// are a sentence section then this points to itself.
	class Section *m_sentenceSection;

	// . set in XmlDoc::getSectionsWithDupStats()
	// . voting info for this section over all indexed pages from this site
	SectionStats m_stats;

	// position of the first and last alnum word contained directly OR
	// indirectly in this section. use -1 if no text contained...
	int32_t m_firstWordPos;

@@ -324,9 +320,6 @@ class Sections {
	bool printSectionDiv ( class Section *sk , char format = FMT_HTML );
	class SafeBuf *m_sbuf;

	char *getSectionsReply ( int32_t *size );
	char *getSectionsVotes ( int32_t *size );

	bool isHardSection ( class Section *sn );

	bool setMenus ( );
XmlDoc.cpp (715 changed lines)
@@ -89,8 +89,6 @@ XmlDoc::XmlDoc() {
	m_msg22Request.m_inUse = 0;
	m_msg4Waiting = false;
	m_msg4Launched = false;
	//m_sectiondbData = NULL;
	//m_placedbData = NULL;
	m_dupTrPtr = NULL;
	m_oldTitleRec = NULL;
	m_filteredContent = NULL;
@@ -98,40 +96,27 @@ XmlDoc::XmlDoc() {
	m_metaList = NULL;
	m_metaListSize = 0;
	m_metaListAllocSize = 0;
	//m_titleRec = NULL;
	//m_freeTitleRec = true;
	m_rootTitleRec = NULL;
	m_isIndexed = false;
	m_isInIndex = false;
	m_wasInIndex = false;
	m_outlinkHopCountVector = NULL;
	//m_gsbuf = NULL;
	m_extraDoc = NULL;
	m_wikiqbuf = NULL;
	//m_cr = NULL;

	//m_notifyBlocked = 0;
	//m_mcasts = NULL;
	//for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ )
	//	m_currentBinPtrs[i] = NULL;
	reset();
};
}

XmlDoc::~XmlDoc() {
	setStatus("freeing this xmldoc");
	reset();
	m_freed = true;
};
}

static int64_t s_lastTimeStart = 0LL;


void XmlDoc::reset ( ) {

	m_savedChar = '\0';


	m_redirUrl.reset();

	m_updatedMetaData = false;
@@ -148,8 +133,6 @@ void XmlDoc::reset ( ) {

	m_bodyStartPos = 0;

	m_mcastArray = NULL;

	m_skipIframeExpansion = false;
	m_indexedTime = 0;

@@ -187,19 +170,9 @@ void XmlDoc::reset ( ) {

	m_allHashed = false;


	// reset this crap
	m_beginTimeAllMatch = 0LL;
	m_beginTimeMatchUrl = 0LL;
	m_beginTimeFullQueries = 0LL;
	m_beginTimeLinks = 0LL;
	//m_beginMsg98s = 0LL;
	m_beginRelatedQueries = 0LL;

	m_doledbKey.n0 = 0LL;
	m_doledbKey.n1 = 0;


	m_sortedPosdbListBuf.purge();
	m_termListBuf.purge();

@@ -219,9 +192,6 @@ void XmlDoc::reset ( ) {
	m_domDedupTablePtr = NULL;

	m_storeTermListInfo = false;
	m_gotDupStats = false;
	//m_nextSection = (Section *)-1;
	m_si = (Section *)-1;

	// for limiting # of iframe tag expansions
	m_numExpansions = 0;
@@ -273,16 +243,6 @@ void XmlDoc::reset ( ) {
	// if this is true, then only index if new
	m_newOnly = 0;

	//if ( m_sectiondbData ) {
	//	mfree ( m_sectiondbData , m_sectiondbDataSize ,"sdbdata" );
	//	m_sectiondbData = NULL;
	//}

	//if ( m_placedbData ) {
	//	mfree ( m_placedbData , m_placedbDataSize ,"pdbdata" );
	//	m_placedbData = NULL;
	//}

	if ( m_httpReplyValid && m_httpReply ) {
		mfree(m_httpReply,m_httpReplyAllocSize,"httprep");
		m_httpReply = NULL;
@@ -295,10 +255,6 @@ void XmlDoc::reset ( ) {
		m_filteredContentAllocSize = 0;
	}

	//if ( m_utf8ContentValid && ! m_setFromTitleRec && ptr_utf8Content )
	//	mfree ( ptr_utf8Content , m_utf8ContentAllocSize,"Xml3");


	if ( m_metaList ) { // m_metaListValid && m_metaList ) {
		mfree ( m_metaList , m_metaListAllocSize , "metalist");
		m_metaList = NULL;
@@ -307,18 +263,10 @@ void XmlDoc::reset ( ) {
	}

	if ( m_ubuf ) {
		// log("xmldoc: delete m_ubuf=%"PTRFMT" this=%"PTRFMT
		//     , (PTRTYPE) m_ubuf
		//     , (PTRTYPE) this
		//     );
		mfree ( m_ubuf , m_ubufAlloc , "ubuf");
		m_ubuf = NULL;
	}

	//if ( m_freeTitleRec && m_titleRec ) { // && m_titleRecValid ) {
	//	mfree ( m_titleRec , m_titleRecAllocSize , "trec" );
	//}
	//m_titleRec = NULL;
	m_titleRecBuf.purge();

	if ( m_dupTrPtr ) {
@@ -345,10 +293,6 @@ void XmlDoc::reset ( ) {
	}
	m_outlinkHopCountVector = NULL;

	//if ( m_gsbufValid && m_gsbuf ) {
	//	mfree ( m_gsbuf , m_gsbufAllocSize , "gsbuf" );
	//}
	//m_gsbuf = NULL;
	m_gsbuf.reset();


@@ -359,7 +303,6 @@ void XmlDoc::reset ( ) {

	m_hashedMetas = false;

	m_mcastBuf.purge();
	m_serpBuf.purge();

	// Doc.cpp:
@@ -376,11 +319,6 @@ void XmlDoc::reset ( ) {
	m_bits2.reset();
	m_pos.reset();
	m_synBuf.reset();
	m_turkVotingTable.reset();
	m_turkBitsTable.reset();
	m_vtr.reset();
	m_vctab.reset();
	m_vcduptab.reset();
	m_images.reset();
	m_countTable.reset();
	m_mime.reset();
@@ -466,10 +404,6 @@ void XmlDoc::reset ( ) {
	m_launchedSpecialMsg8a = false;
	m_launchedMsg8a2 = false;

	m_numSectiondbReads = 0;
	m_numSectiondbNeeds = 0;
	m_sectiondbRecall = 0;

	m_setTr = false;
	m_triedTagRec = false;
	m_didGatewayPage = false;
@@ -4843,606 +4777,6 @@ Sections *XmlDoc::getSections ( ) {
	return &m_sections;
}

// . scan every section and look up its tag and content hashes in
//   sectiondb to find out how many pages and sites have the same hash
// . use the secondary sectiondb key, key2
// . then store the stats in the Sections::m_stats class
Sections *XmlDoc::getSectionsWithDupStats ( ) {

	Sections *ss = getSections();
	if ( ! ss || ss == (Sections *)-1 ) return (Sections *)ss;

	if ( m_gotDupStats ) return ss;

	int32_t *sh32 = getSiteHash32();
	if ( ! sh32 || sh32 == (int32_t *)-1 ) return (Sections *)sh32;
	uint32_t siteHash32 = (uint32_t)*sh32;

	// if this is -1, we are called for the first time
	if ( m_si == (void *)-1 ) {
		m_si = ss->m_rootSection;
		m_mcastRequestsIn = 0;
		m_mcastRequestsOut = 0;
		m_secStatsErrno = 0;
	}

	for ( ; m_si ; m_si = m_si->m_next ) {
		// breathe
		QUICKPOLL(m_niceness);

		// don't bother with the section if it doesn't have this set
		// because this eliminates parent dupage to reduce the amount
		// of gbxpathsitehash123456 terms we index.
		if ( ! ( m_si->m_flags & SEC_HASHXPATH ) )
			continue;

		// get hash of sentences this tag contains indirectly
		uint32_t val32 = (uint32_t)m_si->m_indirectSentHash64;
		if ( ! val32 )
			continue;

		// get section xpath hash combined with sitehash
		uint32_t secHash32 = m_si->m_turkTagHash32 ^ siteHash32;

		// convert this to 32 bits
		uint32_t innerHash32 ;
		//sentHash32 = (uint32_t)m_si->m_sentenceContentHash64;
		innerHash32 = (uint32_t)m_si->m_indirectSentHash64;

		// save in case we need to read more than 5MB
		//m_lastSection = si;
		// . does a gbfacets:gbxpathsitehashxxxxxx query on secHash32
		// . we hack the "sentContentHash32" into each posdb key
		//   as the "value" so we can do a facet-like histogram
		//   over all the possible values this xpath has for this site
		SectionStats *stats = getSectionStats ( secHash32,
							innerHash32,
							false ); // cache only?
		// it returns -1 if it would block
		if ( stats == (void *)-1 ) {
			// count it as outstanding
			//m_mcastRequestsOut++;
			// launch more if we have room.
			// UdpServer.cpp has a limit of 10 on 0x39 requests.
			if ( m_mcastRequestsOut - m_mcastRequestsIn < 10 )
				continue;
			// advance m_si so we do not repeat
			m_si = m_si->m_next;
			// otherwise, return -1 to indicate blocked
			return (Sections *)-1;
		}
		// NULL means g_errno
		if ( ! stats ) {
			// ensure g_errno is set
			if ( ! g_errno ) { char *xx=NULL;*xx=0; }
			// save it
			m_secStatsErrno = g_errno;
			// clear it
			g_errno = 0;
			// if still waiting though, return -1
			if ( m_mcastRequestsOut > m_mcastRequestsIn )
				return (Sections *)-1;
			// otherwise, all done i guess
			return NULL;
		}
		// if already in the table, skip it!
	}

	// waiting for more replies to come back?
	if ( m_mcastRequestsOut > m_mcastRequestsIn )
		return (Sections *) -1;

	// now scan the sections and copy the stats from the table
	// into Section::m_stats of each sentence section.
	// use the key hash as the hash of the tag/xpath and the innerhtml,
	// and the val, instead of being the site hash, will be the hash of
	// the content. then we can get the histogram of our content hash
	// for this xpath on our site.
	Section *si = ss->m_rootSection;
	for ( ; si ; si = si->m_next ) {
		// breathe
		QUICKPOLL(m_niceness);
		// skip if no content to hash
		//if ( ! si->m_sentenceContentHash64 ) continue;

		// don't bother with the section if it doesn't have this set
		// because this eliminates parent dupage to reduce the amount
		// of gbxpathsitehash123456 terms we index
		if ( ! ( si->m_flags & SEC_HASHXPATH ) )
			continue;

		// skip if sentence, only hash tags now i guess for diffbot
		//if ( si->m_sentenceContentHash64 )
		//	continue;

		// get hash of sentences this tag contains indirectly
		uint32_t val32 = (uint32_t)si->m_indirectSentHash64;
		if ( ! val32 )
			continue;

		// skip if menu!
		//if ( si->m_flags & menuFlags ) continue;


		// get section xpath hash combined with sitehash
		uint32_t secHash32 = si->m_turkTagHash32 ^ siteHash32;

		// convert this to 32 bits
		uint32_t innerHash32 ;
		innerHash32 = (uint32_t)si->m_indirectSentHash64;

		// the "stats" class should be in the table from
		// the lookups above!!
		SectionStats *stats = getSectionStats ( secHash32,
							innerHash32,
							true ); // cache only?
		// sanity
		//if ( ! stats || stats == (void *)-1 ) { char *xx=NULL;*xx=0;}
		// must have had a network error or something
		if ( ! stats ) continue;
		// copy
		gbmemcpy ( &si->m_stats , stats, sizeof(SectionStats) );
	}

	//
	// now if a section has no stats but has the same
	// m_indirectSentHash64 as a kid, take his stats
	//
	Section *sx = ss->m_rootSection;
	for ( ; sx ; sx = sx->m_next ) {
		// breathe
		QUICKPOLL(m_niceness);
		// don't bother with the section if it doesn't have this set
		// because this eliminates parent dupage to reduce the amount
		// of gbxpathsitehash123456 terms we index
		if ( ! ( sx->m_flags & SEC_HASHXPATH ) )
			continue;
		// scan up parents and set their stats to ours as long as
		// they have the same indirect sent hash64
		Section *p = sx->m_parent;
		for ( ; p ; p = p->m_parent ) {

			// if parent is like an img tag, skip it
			if ( p->m_tagId == TAG_IMG )
				continue;

			if ( p ->m_indirectSentHash64 !=
			     sx->m_indirectSentHash64 )
				break;

			// copy it to parent with the same inner html hash
			gbmemcpy (&p->m_stats,&sx->m_stats,sizeof(SectionStats));
		}
	}

	// now free the table's mem
	m_sectionStatsTable.reset();

	m_gotDupStats = true;
	return ss;
}
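
getSectionsWithDupStats() is a resumable state machine: m_si is a cursor that persists across invocations, and the function returns -1 ("would block") once 10 multicasts are in flight; each reply re-enters via m_masterLoop until everything is answered. A stripped-down sketch of that pattern, with illustrative names only:

	#include <cstdint>

	struct SectionScan {
		int32_t cursor = 0;   // like m_si: where to resume next call
		int32_t out = 0;      // requests launched (m_mcastRequestsOut)
		int32_t in  = 0;      // replies received (m_mcastRequestsIn)
	};

	// Returns false ("would block") when the in-flight cap is hit; the
	// reply handler bumps s->in and re-calls this until it returns true.
	bool scanSections(SectionScan *s, int32_t numSections,
	                  void (*launchLookup)(int32_t section)) {
		for ( ; s->cursor < numSections ; s->cursor++ ) {
			launchLookup(s->cursor);
			s->out++;
			if (s->out - s->in >= 10) { // cap of 10 concurrent 0x39 requests
				s->cursor++;        // advance so we do not repeat this one
				return false;
			}
		}
		return s->out == s->in;             // done only once every reply is back
	}
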
static void gotReplyWrapper39 ( void *state1 , void *state2 ) {
	//XmlDoc *THIS = (XmlDoc *)state;
	XmlDoc *THIS = (XmlDoc *)state1;
	Multicast *mcast = (Multicast *)state2;
	THIS->gotSectionFacets ( mcast );
	// this will end up calling getSectionsWithDupStats() again
	// which will call getSectionStats() some more on new sections
	// until m_gotDupStats is set to true.
	THIS->m_masterLoop ( THIS->m_masterState );
}


// . launch a single msg3a::getDocIds() for a section hash, secHash32
SectionStats *XmlDoc::getSectionStats ( uint32_t secHash32 ,
					uint32_t innerHash32 ,
					bool cacheOnly ) {

	// init cache?
	if ( m_sectionStatsTable.m_numSlots == 0 &&
	     ! m_sectionStatsTable.set(4,
				       sizeof(SectionStats),
				       32,
				       NULL,
				       0,
				       false,
				       m_niceness,
				       "secstatsch"))
		return NULL;

	// check in cache...
	SectionStats *stats ;
	stats = (SectionStats *)m_sectionStatsTable.getValue ( &secHash32 );
	// if there, return it
	if ( stats ) return stats;

	// if cache only, do not launch
	if ( cacheOnly ) return NULL;

	//
	// TODO: shard gbxpathsitehashxxxxx by termid
	// and make sure msg3a only sends to that single shard and sends
	// the stats back. should make us much faster to sectionize
	// a web page. but for now try without it...
	//

	//int32_t *sh32 = getSiteHash32();
	//if ( ! sh32 || sh32 == (int32_t *)-1 ) return (SectionStats *)sh32;

	int32_t maxOut = 32;

	// . need to make new Msg39Request and Multicast arrays
	// . only need multicast since these gbfacetstr:gbxpathsitehash123456
	//   terms are sharded by termid, otherwise we'd have to use msg3a
	if ( ! m_mcastArray ) {
		// how much mem to alloc?
		int32_t need = 0;
		need += sizeof(Multicast);
		need += sizeof(Msg39Request);
		// query buf str
		need += 100;
		need *= maxOut;
		// a single query now to be shared
		//need += sizeof(Query);
		// just in case we are being re-used
		m_mcastBuf.reset();
		// alloc space
		if ( ! m_mcastBuf.reserve(need) ) return NULL;
		// point to buf
		char *p = m_mcastBuf.getBufStart();
		// set them up
		m_mcastArray = (Multicast *)p;
		p += sizeof(Multicast) * maxOut;
		m_msg39RequestArray = (Msg39Request *)p;
		p += sizeof(Msg39Request) * maxOut;
		//m_queryArray = (Query *)p;
		//p += sizeof(Query) * maxOut;
		//m_sharedQuery = (Query *)p;
		//p += sizeof(Query);
		// for holding the query string.
		// assume the query will not exceed 100 bytes including \0
		m_queryBuf = p;
		p += 100 * maxOut;
		// initialize all!
		for ( int32_t i = 0 ; i < maxOut ; i++ ) {
			m_mcastArray       [i].constructor();
			m_msg39RequestArray[i].reset();//constructor();
			//m_queryArray     [i].constructor();
			m_queryBuf[100*i] = '\0';
			//m_inUse[i] = 0;
		}
	}

	// get first available
	int32_t i;
	for ( i = 0 ; i < maxOut ; i++ )
		if ( ! m_mcastArray[i].m_inUse ) break;

	// wtf?
	if ( i >= maxOut ) { char *xx=NULL;*xx=0; }

	// and our vehicle
	Multicast *mcast = &m_mcastArray[i];

	// mark as in use up here in case we quickpoll into this same code?!
	// yeah, i guess set2() calls quickpoll?
	//mcast->m_inUse = 1;

	// save this for the reply
	//mcast->m_hack = this;

	char *qbuf = m_queryBuf + 100 * i;

	// . hash this special term (was gbsectionhash)
	// . the wordbits etc. will be a number though: the hash of the content
	//   of the xpath, the inner html hash
	// . preceding this term with gbfacet: will make gigablast return
	//   the statistics for all the values in the posdb keys of this
	//   termlist, which happen to be innerHTML hashes for all pages
	//   with this same xpath and on this same site.
	sprintf(qbuf,"gbfacetstr:gbxpathsitehash%"UINT32"",
		(uint32_t)secHash32);

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	// set the msg39 request
	Msg39Request *r = &m_msg39RequestArray[i];

	// reset all to defaults
	r->reset();

	//r-> ptr_coll = cr->m_coll;
	//r->size_coll = gbstrlen(cr->m_coll)+1;
	r->m_collnum = cr->m_collnum;
	r->m_maxAge = 60; // cache timeout?
	r->m_addToCache = true;
	r->m_docsToGet = 0; // just calc stats
	r->m_niceness = m_niceness;
	r->m_debug = 0;
	r->m_doSiteClustering = false;
	//r->m_doIpClustering = false;
	r->m_doDupContentRemoval = false;
	r->m_boolFlag = 2;
	r->m_familyFilter = 0;
	r->m_language = 0;
	r->ptr_query = qbuf;//m_sectionHashQueryBuf;
	r->size_query = gbstrlen(r->ptr_query)+1;
	r->m_timeout = 3600*1000; //todo: do we really want to wait an hour for this?
	r->m_maxQueryTerms = 10;

	// how much of each termlist to read in bytes
	int32_t readList = 10000;
	r-> ptr_readSizes = (char *)&readList;
	r->size_readSizes = 4;

	// term freqs
	float tfw = 1.0;
	r-> ptr_termFreqWeights = (char *)&tfw;
	r->size_termFreqWeights = 4;

	// speed it up some with this flag
	r->m_forSectionStats = true;

	// only do a single read of docids... do not split up
	r->m_numDocIdSplits = 1;

	// 1 query term
	r->m_nqt = 1;

	///////////////////////
	//
	// this tells msg3a/msg39/posdbtable it's a hack! no need to do this
	// because it's implied by the query.
	// BUT REALLY let's eliminate this and just make our queries like
	// gbfacet:gbxpathsitehash1234567 where 1234567 is the hash of
	// the section's xpath with the site. the values of that term in
	// the posdb key will be 32-bit hashes of the innerHtml for such
	// sections from all pages with the same xpath on the same site.
	// so no need for this now, comment out.
	//
	//r->m_getFacetStats = true;
	//
	/////////////////////////


	// we need to know what site is the base site so the section stats
	// can set m_onSiteDocIds and m_offSiteDocIds correctly
	//r->m_siteHash32 = *sh32;

	// . now we use the hash of the innerHtml of the xpath
	// . this is our value for the facet field of gbxpathsitehash12345678
	//   which is the hash of the innerHTML for that xpath on this site.
	//   12345678 is the hash of the xpath and the site.
	//r->m_myFacetVal32 = sentHash32;


	//Query *qq = &m_queryArray[i];
	// set query for msg3a. queryExpansion=false
	//qq->set2 ( r->ptr_query , langUnknown , false );

	Query qq;
	qq.set2 ( r->ptr_query , langUnknown , false );

	// TODO: ensure this just hits the one host since it is sharded
	// by termid...

	// what shard owns this termlist? we shard these
	// gbfacetstr:gbxpathsitehash123456 terms by termid.
	int64_t termId = qq.getTermId(0);
	int32_t shardNum = getShardNumFromTermId ( termId );

	// hack in our inner html content hash for this xpath
	mcast->m_hack32 = innerHash32;
	mcast->m_hack64 = secHash32;

	// malloc and store the request. mcast will free it when done.
	int32_t reqSize;
	char *req = serializeMsg ( sizeof(Msg39Request),
				   &r->size_readSizes,
				   &r->size_whiteList,
				   &r->ptr_readSizes,
				   r,
				   &reqSize,
				   NULL,
				   0,
				   false);

	// . send out a msg39 request to each shard
	// . multicasts to a host in group "groupId"
	// . we always block waiting for the reply with a multicast
	// . returns false and sets g_errno on error
	// . sends the request to the fastest host in group "groupId"
	// . if that host takes more than about 5 secs then sends to
	//   the next host
	// . key should be the largest termId in the group we're sending to
	bool status;
	status = mcast->send ( req , // m_rbufPtr ,
			       reqSize , // request size
			       0x39 , // msgType 0x39
			       true , // mcast owns m_request?
			       shardNum , // group to send to
			       false , // send to whole group?
			       0,//(int32_t)qh , // 0 // startKey.n1
			       this , // state1 data
			       mcast , // state2 data
			       gotReplyWrapper39 ,
			       multicast_xmldoc_sectionstats_timeout, //timeout
			       m_niceness,//m_r->m_niceness ,
			       -1, // firstHostId, // -1// bestHandlingHostId ,
			       NULL , // m_replyBuf ,
			       0 , // MSG39REPLYSIZE,
			       // this is true if multicast should free the
			       // reply, otherwise the caller is responsible
			       // for freeing it after calling
			       // getBestReply().
			       // actually, this should always be false,
			       // there is a bug in Multicast.cpp.
			       // no, if we error out and never steal
			       // the buffers then they will go unfreed,
			       // so they are freed by multicast by default
			       // and then we steal control explicitly
			       true );

	m_mcastRequestsOut++;

	// if successfully launched, wait...
	if ( status ) return (SectionStats *) -1;

	// error?
	if ( g_errno ) return NULL;//{ mcast->m_inUse = 0; return NULL; }

	// sets &m_sectionStats and adds to the table
	gotSectionFacets ( mcast );

	// i guess it did not block...
	//return &msg3a->m_sectionStats;
	return &m_sectionStats;
}
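
The key idea in getSectionStats() is that each xpath/site hash yields exactly one query term, and termlists for gbfacetstr:gbxpathsitehashNNN terms are sharded by termid, so a single multicast to a single shard suffices. A sketch of the query construction and shard selection; the two function pointers stand in for the repo's Query::getTermId() and getShardNumFromTermId():

	#include <cstdint>
	#include <cstdio>

	// Build the one-term facet query for this xpath/site hash and pick
	// the shard that owns its termlist. Illustrative signatures only.
	int32_t pickShardForSectionStats(uint32_t secHash32,
	                                 int64_t (*getTermId)(const char *qstr),
	                                 int32_t (*getShardNumFromTermId)(int64_t)) {
		char qbuf[100]; // same 100-byte budget the code above reserves
		snprintf(qbuf, sizeof(qbuf),
		         "gbfacetstr:gbxpathsitehash%u", (unsigned)secHash32);
		int64_t termId = getTermId(qbuf);     // termid of the single term
		return getShardNumFromTermId(termId); // shard owning the termlist
	}
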
// . come here when msg39 got the ptr_facetHashList for our single
//   gbfacet:gbxpathsitehash term
// . returns false and sets g_errno on error
bool XmlDoc::gotSectionFacets ( Multicast *mcast ) {
	//SectionStats *stats = &msg39->m_sectionStats;

	if ( mcast->m_inUse ) { char *xx=NULL;*xx=0;}

	// count it as returned
	m_mcastRequestsIn++;
	// mark it as available now
	int32_t num = mcast - m_mcastArray;
	// sanity
	//if ( ! msg39->m_inUse ) { char *xx=NULL;*xx=0; }

	// grab the xpath/site hash
	uint32_t secHash32 = mcast->m_hack64;

	// and our inner html for that xpath
	int32_t myFacetVal32 = mcast->m_hack32;

	// sanity. should only be a gbfacet:gbxpathsitehash12345567 term.
	//if ( mcast->m_q->m_numTerms != 1 ) { char *xx=NULL;*xx=0; }

	// reset all counts to 0
	m_sectionStats.reset();

	//////
	//
	// compile m_sectionStats
	//
	///////

	// set m_sectionStats from the list of facet values for this
	// gbfacet:xpathsitehash term...
	// Query::m_queryTerm.m_facetHashTable has the facets merged
	// from all the shards. so now compute the stats from them.
	// set the section stats.
	//QueryTerm *qt = &msg3a->m_q->m_qterms[0];
	//HashTableX *ft = &qt->m_facetHashTable;

	// . get the list of facet field/value pairs.
	// . see how Msg3a.cpp merges these to see how they are stored
	Msg39Reply *mr = (Msg39Reply *)mcast->m_readBuf;//getBestReply();

	// this is NULL with g_errno set on error
	if ( ! mr ) {
		log("xmldoc: got error from sec stats mcast: %s",
		    mstrerror(g_errno));
		return false;
	}

	deserializeMsg ( sizeof(Msg39Reply) ,
			 &mr->size_docIds,
			 &mr->size_clusterRecs,
			 &mr->ptr_docIds,
			 mr->m_buf );

	char *p = (char *)(mr->ptr_facetHashList);
	//char *pfinal = p + mr->size_facetHashList;

	//
	// should only be one termid of facets in here, so no need to re-loop
	//
	int32_t nh = 0;
	// "matches" is how many docids with this facet field had our facet val
	int32_t matches = 0;
	// "totalFields" is how many docids had this facet field
	int32_t totalFields = 0;

	if ( p ) {
		// first is the termid
		//int64_t termId = *(int64_t *)p;
		// skip that
		p += 8;
		// the # of unique 32-bit facet values
		nh = *(int32_t *)p;
		p += 4;
		// the end point
		char *pend = p + (8 * nh);
		// now compile the facet hash list into there
		for ( ; p < pend ; ) {
			// does this facet value match ours?
			// (i.e. same inner html?)
			if ( *(int32_t *)p == myFacetVal32 )
				matches += *(int32_t *)(p+4);
			p += 4;
			// now how many docids had this facet value?
			totalFields += *(int32_t *)p;
			p += 4;
		}
	}

	// how many unique inner html content hashes for this xpath/site
	// hash were there?
	m_sectionStats.m_numUniqueVals = nh;//ft->m_numSlotsUsed;

	// how many xpaths existed over all docs. a doc can have multiple.
	m_sectionStats.m_totalEntries = totalFields;

	// total # unique docids that had this facet
	m_sectionStats.m_totalDocIds = mr->m_estimatedHits;//totalHits;

	// how many had the same inner html content hash for
	// this xpath/site as we did?
	m_sectionStats.m_totalMatches = matches;

	////////
	//
	// store m_sectionStats in cache
	//
	////////

	// cache them. this does a copy of m_sectionStats
	if ( ! m_sectionStatsTable.addKey ( &secHash32 , &m_sectionStats ) )
		log("xmldoc: failed to add section stats: %s",
		    mstrerror(g_errno));

	// reset that msg39 to free its data
	//msg39->reset();

	if ( mcast != &m_mcastArray[num] ) { char *xx=NULL;*xx=0; }

	// . make it available again
	// . do this after all in case we were in a quickpoll interrupting
	//   the getSectionStats() function below
	//mcast->m_inUse = 0;

	// free the query Query::m_qwords array etc. to stop mem leaks
	m_mcastArray       [num].reset();
	m_msg39RequestArray[num].reset();
	//m_queryArray     [num].reset();
	// now when the master loop calls getSectionsWithDupStats() it
	// should find the stats class in the cache!
	return true;
}
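
The loop above walks a serialized facet list laid out as: an 8-byte termid, a 4-byte count of unique values, then that many (value, count) pairs of 4 bytes each. A self-contained decode sketch with illustrative names:

	#include <cstdint>
	#include <cstring>

	// Sums derived from one ptr_facetHashList:
	//   int64 termId | int32 nUnique | nUnique x { int32 value, int32 count }
	struct FacetSums { int32_t matches = 0, totalFields = 0, nUnique = 0; };

	FacetSums readFacetHashList(const char *p, int32_t myFacetVal32) {
		FacetSums s;
		if (!p) return s;
		p += 8;                                // skip the 8-byte termid
		std::memcpy(&s.nUnique, p, 4); p += 4; // # of unique 32-bit values
		for (int32_t i = 0; i < s.nUnique; i++) {
			int32_t val, cnt;
			std::memcpy(&val, p, 4); p += 4;
			std::memcpy(&cnt, p, 4); p += 4;
			if (val == myFacetVal32) s.matches += cnt; // docids sharing our value
			s.totalFields += cnt;                      // docids with this facet
		}
		return s;
	}
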
int32_t *XmlDoc::getLinkSiteHashes ( ) {
	if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: BEGIN", __FILE__, __func__, __LINE__);

@@ -20338,15 +19672,6 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
	CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
	if ( ! cr ) { g_errno = ENOCOLLREC; return NULL; }


	//CollectionRec *cr = getCollRec();
	//if ( ! cr ) return NULL;

	// set this important member var
	//if (!cr ) cr=g_collectiondb.getRec(cr->m_coll,gbstrlen(cr->m_coll));
	// return NULL with g_errno set on error
	//if ( ! cr ) return NULL;

	// . cache it for one hour
	// . this will set our ptr_ and size_ member vars
	char **otr = getOldTitleRec ( );
@@ -22760,9 +22085,6 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {

	printRainbowSections ( sb , NULL );

	// cut it short for debugging
	logf(LOG_DEBUG,"xmldoc: FIX ME remove return");

	//
	// PRINT LINKINFO
	//
@@ -23645,11 +22967,7 @@ bool XmlDoc::printRainbowSections ( SafeBuf *sb , HttpRequest *hr ) {
	//
	// PRINT SECTIONS
	//
	Sections *sections ;
	// hr is NULL if being called from the page parser, which does not have
	// the dup stats! and we core if we block here!
	if ( hr ) sections = getSectionsWithDupStats();
	else      sections = getSections();
	Sections *sections = getSections();
	if ( ! sections) return true;if (sections==(Sections *)-1)return false;

	Words *words = getWords();
@@ -23661,18 +22979,14 @@ bool XmlDoc::printRainbowSections ( SafeBuf *sb , HttpRequest *hr ) {


	int32_t nw = words->getNumWords();
	//int32_t wordStart = 0;
	//int32_t wordEnd = nw;
	int64_t *wids = words->getWordIds();

	int32_t isXml = false;
	if ( hr ) isXml = (bool)hr->getLong("xml",0);

	//if ( ! isXml ) printMenu ( sb );

	// now complement, cuz bigger is better in the ranking world
	//int32_t densityRank = getDensityRank ( wids , 0 , nw , HASHGROUP_BODY );
	SafeBuf densBuf;

	// returns false and sets g_errno on error
	if ( ! getDensityRanks((int64_t *)wids,
			       nw,
@@ -23683,20 +22997,6 @@ bool XmlDoc::printRainbowSections ( SafeBuf *sb , HttpRequest *hr ) {
		return true;
	// a handy ptr
	char *densityVec = (char *)densBuf.getBufStart();


	/*
	if ( ! isXml )
		sb->safePrintf("<br><b>density rank of body = %"INT32"</b> "
			       "(out of %"INT32")"
			       "<br>"
			       "<br>"
			       , densityRank
			       , (int32_t)MAXDENSITYRANK
			       );
	*/


	char *wordSpamVec = getWordSpamVec();
	char *fragVec = m_fragBuf.getBufStart();

@@ -23704,18 +23004,13 @@ bool XmlDoc::printRainbowSections ( SafeBuf *sb , HttpRequest *hr ) {
	if(!getDiversityVec(words,phrases,cnt,&dwbuf,m_niceness))return true;
	char *diversityVec = dwbuf.getBufStart();

	// hack fack debug
	//m_bodyStartPos =2136;

	SafeBuf wpos;
	if ( ! getWordPosVec ( words ,
			       sections,
			       //wordStart,
			       //wordEnd,
			       // we save this in the titlerec when we
			       // start hashing the body. we have the url
			       // terms before the body, so this is necessary.
			       m_bodyStartPos,//0, // hi->m_startDist,
			       m_bodyStartPos,
			       fragVec,
			       m_niceness,
			       &wpos) ) return true;
XmlDoc.h (209 changed lines)
@@ -401,16 +401,9 @@ public:
	class Bits *getBitsForSummary ( ) ;
	class Pos *getPos ( );
	class Phrases *getPhrases ( ) ;
	//class Synonyms *getSynonyms ( );
	class Sections *getExplicitSections ( ) ;
	class Sections *getImpliedSections ( ) ;
	class Sections *getSections ( ) ;
	class Sections *getSectionsWithDupStats ( );
	//BR 20160106 removed: class SafeBuf *getInlineSectionVotingBuf();
	bool gotSectionFacets( class Multicast *mcast );
	class SectionStats *getSectionStats ( uint32_t secHash32, uint32_t sentHash32, bool cacheOnly );
	char **getSectionsReply ( ) ;
	char **getSectionsVotes ( ) ;
	int32_t *getLinkSiteHashes ( );
	class Links *getLinks ( bool doQuickSet = false ) ;
	class HashTableX *getCountTable ( ) ;
@@ -591,10 +584,6 @@ public:

	char *addOutlinkSpiderRecsToMetaList ( );

	//bool addTable96 ( class HashTableX *tt1 ,
	//		  int32_t date1 ,
	//		  bool nosplit ) ;

	int32_t getSiteRank ();
	bool addTable144 ( class HashTableX *tt1 ,
			   int64_t docId ,
@@ -602,11 +591,6 @@ public:

	bool addTable224 ( HashTableX *tt1 ) ;

	//bool addTableDate ( class HashTableX *tt1 , //T<key128_t,char> *tt1
	//		    uint64_t docId ,
	//		    uint8_t rdbId ,
	//		    bool nosplit ) ;

	bool addTable128 ( class HashTableX *tt1 , // T <key128_t,char>*tt1
			   uint8_t rdbId ,
			   bool forDelete ) ;
@@ -627,10 +611,7 @@ public:
	bool hashUrl ( class HashTableX *table, bool urlOnly );
	bool hashDateNumbers ( class HashTableX *tt );
	bool hashSections ( class HashTableX *table ) ;
	bool hashIncomingLinkText ( class HashTableX *table ,
				    bool hashAnomalies ,
				    bool hashNonAnomalies ) ;

	bool hashIncomingLinkText( class HashTableX *table, bool hashAnomalies, bool hashNonAnomalies );
	bool hashLinksForLinkdb ( class HashTableX *table ) ;
	bool hashNeighborhoods ( class HashTableX *table ) ;
	bool hashRSSInfo ( class HashTableX *table ) ;
@@ -648,11 +629,8 @@ public:
	bool hashTagRec ( class HashTableX *table ) ;
	bool hashPermalink ( class HashTableX *table ) ;
	bool hashVectors(class HashTableX *table ) ;
	// BR 20160106 removed: bool hashAds(class HashTableX *table ) ;

	class Url *getBaseUrl ( ) ;
	// BR 20160106 removed: bool hashSubmitUrls ( class HashTableX *table ) ;
	// BR 20160106 removed: bool hashImageStuff ( class HashTableX *table ) ;
	bool hashIsAdult ( class HashTableX *table ) ;

	void set20 ( Msg20Request *req ) ;
@@ -672,59 +650,21 @@ public:
	char *getIsErrorPage ( ) ;
	char* matchErrorMsg(char* p, char* pend );

	bool hashWords ( //int32_t wordStart ,
			 //int32_t wordEnd ,
			 class HashInfo *hi ) ;
	bool hashSingleTerm ( int64_t termId ,
			      class HashInfo *hi ) ;
	bool hashSingleTerm ( char *s ,
			      int32_t slen ,
			      class HashInfo *hi );
	bool hashString ( class HashTableX *ht ,
			  //class Weights *we ,
			  class Bits *bits ,
			  char *s ,
			  int32_t slen ) ;
	bool hashString ( char *s ,
			  int32_t slen ,
			  class HashInfo *hi ) ;
	bool hashString ( char *s ,
			  class HashInfo *hi ) ;
	bool hashWords( class HashInfo *hi );
	bool hashSingleTerm( int64_t termId, class HashInfo *hi );
	bool hashSingleTerm( char *s, int32_t slen, class HashInfo *hi );
	bool hashString( class HashTableX *ht, class Bits *bits, char *s, int32_t slen );
	bool hashString( char *s, int32_t slen, class HashInfo *hi );
	bool hashString( char *s, class HashInfo *hi );

	bool hashWords3( class HashInfo *hi, class Words *words, class Phrases *phrases, class Synonyms *synonyms,
			 class Sections *sections, class HashTableX *countTable, char *fragVec, char *wordSpamVec,
			 char *langVec, char docLangId, class SafeBuf *pbuf, class HashTableX *wts,
			 class SafeBuf *wbuf, int32_t niceness );


	bool hashWords3 ( //int32_t wordStart ,
			  //int32_t wordEnd ,
			  class HashInfo *hi ,
			  class Words *words ,
			  class Phrases *phrases ,
			  class Synonyms *synonyms ,
			  class Sections *sections ,
			  class HashTableX *countTable ,
			  char *fragVec ,
			  char *wordSpamVec ,
			  char *langVec ,
			  char docLangId , // default lang id
			  class SafeBuf *pbuf ,
			  class HashTableX *wts ,
			  class SafeBuf *wbuf ,
			  int32_t niceness );

	bool hashString3 ( char *s ,
			   int32_t slen ,
			   class HashInfo *hi ,
			   class HashTableX *countTable ,
			   class SafeBuf *pbuf ,
			   class HashTableX *wts ,
			   class SafeBuf *wbuf ,
			   int32_t version ,
			   int32_t siteNumInlinks ,
			   int32_t niceness );


	//bool hashSectionTerm ( char *term ,
	//		       class HashInfo *hi ,
	//		       int32_t sentHash32 ) ;
	bool hashString3( char *s, int32_t slen, class HashInfo *hi, class HashTableX *countTable,
			  class SafeBuf *pbuf, class HashTableX *wts, class SafeBuf *wbuf, int32_t version,
			  int32_t siteNumInlinks, int32_t niceness );

	bool hashFacet1 ( char *term, class Words *words , HashTableX *dt) ;

@@ -782,16 +722,12 @@ public:
 public:

	// stuff set from the key of the titleRec, above the compression area
	//key_t m_key;
	int64_t m_docId;

	char *m_ubuf;
	int32_t m_ubufSize;
	int32_t m_ubufAlloc;

	// does this page link to gigablast, or has a search form to it?
	//bool searchboxToGigablast();

	// private:

	// when we started spidering it, in milliseconds since the epoch
@@ -806,16 +742,6 @@ public:
	int64_t m_setTime;
	int64_t m_cpuSummaryStartTime;

	// timers
	int64_t m_beginSEOTime;
	int64_t m_beginTimeAllMatch;
	int64_t m_beginTimeMatchUrl;
	int64_t m_beginTimeFullQueries;
	int64_t m_beginTimeLinks;
	//int64_t m_beginMsg98s;
	int64_t m_beginRelatedQueries;
	int64_t m_beginMsg95s;

	// . these should all be set using set*() function calls so their
	//   individual validity flags can be set to true, and successive
	//   calls to their corresponding get*() functions will not core
@@ -836,8 +762,6 @@ public:
	int64_t m_firstUrlHash64;
	Url m_currentUrl;

	//char *m_coll;
	//char m_collBuf[MAX_COLL_LEN+1]; // include \0
	CollectionRec *m_lastcr;
	collnum_t m_collnum;
	int32_t m_lastCollRecResetCount;
@@ -871,88 +795,24 @@ public:
	Bits m_bits2;
	Pos m_pos;
	Phrases m_phrases;
	//Synonyms m_synonyms;
	SafeBuf m_synBuf;
	//Weights m_weights;
	Sections m_sections;

	// a hack storage thing used by Msg13.cpp
	class Msg13Request *m_hsr;

	Section *m_si;
	//Section *m_nextSection;
	//Section *m_lastSection;
	int32_t m_mcastRequestsOut;
	int32_t m_mcastRequestsIn;
	int32_t m_secStatsErrno;
	char *m_queryBuf;
	Msg39Request *m_msg39RequestArray;
	SafeBuf m_mcastBuf;
	Multicast *m_mcastArray;
	//char *m_inUse;
	//Query *m_queryArray;
	//Query *m_sharedQuery;
	bool m_gotDupStats;
	//Query m_q4;
	//Msg3a m_msg3a;
	//Msg39Request m_r39;
	Msg39Request m_mr2;
	SectionStats m_sectionStats;
	HashTableX m_sectionStatsTable;
	//char m_sectionHashQueryBuf[128];

	// also set in getSections()
	int32_t m_maxVotesForDup;

	// . for rebuild logging of what's changed
	// . Repair.cpp sets these based on the titlerec
	char m_logLangId;
	int32_t m_logSiteNumInlinks;

	int32_t m_numSectiondbReads;
	int32_t m_numSectiondbNeeds;
	key128_t m_sectiondbStartKey;
	RdbList m_secdbList;
	int32_t m_sectiondbRecall;

	bool m_gotFacets;
	SafeBuf m_tmpBuf2;

	SafeBuf m_inlineSectionVotingBuf;

	//HashTableX m_rvt;
	//Msg17 m_msg17;
	//char *m_cachedRootVoteRec;
	//int32_t m_cachedRootVoteRecSize;
	//bool m_triedVoteCache;
	//bool m_storedVoteCache;
	//SafeBuf m_cacheRecBuf;

	SafeBuf m_timeAxisUrl;

	HashTableX m_turkVotingTable;
	HashTableX m_turkBitsTable;
	uint32_t m_confirmedTitleContentHash ;
	uint32_t m_confirmedTitleTagHash ;

	// turk voting tag rec
	TagRec m_vtr;
	// tagrec of banned turks
	TagRec m_bannedTurkRec;
	// and the table of the hashed banned turk users
	HashTableX m_turkBanTable;

	// used for displaying turk votes...
	HashTableX m_vctab;
	HashTableX m_vcduptab;

	Images m_images;
	HashTableX m_countTable;
	HttpMime m_mime;
	TagRec m_tagRec;
	SafeBuf m_tagRecBuf;
	// copy of m_oldTagRec but with our modifications, if any
	//TagRec m_newTagRec;
	SafeBuf m_newTagBuf;
	SafeBuf m_fragBuf;
	SafeBuf m_wordSpamBuf;
@@ -962,9 +822,6 @@ public:
	class SafeBuf *m_savedSb;
	class HttpRequest *m_savedHr;

	char m_savedChar;


	// validity flags. on reset() all these are set to false.
	char m_VALIDSTART;
	// DO NOT add validity flags above this line!
@@ -992,7 +849,6 @@ public:
	char m_filteredRootTitleBufValid;
	char m_titleBufValid;
	char m_fragBufValid;
	char m_inlineSectionVotingBufValid;
	char m_wordSpamBufValid;
	char m_finalSummaryBufValid;
	char m_matchingQueryBufValid;
@@ -1044,10 +900,6 @@ public:
	char m_sectionsValid;
	char m_subSentsValid;

	char m_turkVotingTableValid;
	char m_turkBitsTableValid;
	char m_turkBanTableValid;
	char m_vctabValid;
	char m_explicitSectionsValid;
	char m_impliedSectionsValid;
	char m_imageDataValid;
@@ -1132,9 +984,6 @@ public:
	bool m_isWWWDupValid;
	bool m_linkInfo1Valid;
	bool m_linkSiteHashesValid;
	bool m_sectionsReplyValid;
	bool m_sectionsVotesValid;
	bool m_sectiondbDataValid;
	bool m_placedbDataValid;
	bool m_siteHash64Valid;
	bool m_siteHash32Valid;
@@ -1197,9 +1046,6 @@ public:
	// DO NOT add validity flags below this line!
	char m_VALIDEND;

	// more stuff
	//char *m_utf8Content;
	//int32_t m_utf8ContentLen;

	bool m_printedMenu;
	int32_t m_urlPubDate;
@@ -1253,11 +1099,9 @@ public:


	int32_t m_siteSpiderQuota;
	//int32_t m_numBannedOutlinks;
	class XmlDoc *m_oldDoc;
	class XmlDoc *m_extraDoc;
	class XmlDoc *m_rootDoc;
	//class XmlDoc *m_gatewayDoc;
	RdbList m_oldMetaList;
	char *m_oldTitleRec;
	int32_t m_oldTitleRecSize;
@@ -1275,10 +1119,7 @@ public:
	int32_t m_tagdbCollLen;

	Url m_extraUrl;
	//int32_t m_siteNumInlinksFresh;
	//int32_t m_sitePop;
	uint8_t m_siteNumInlinks8;
	//int32_t m_siteNumInlinks;
	LinkInfo m_siteLinkInfo;
	SafeBuf m_mySiteLinkInfoBuf;
	SafeBuf m_myPageLinkInfoBuf;
@@ -1289,7 +1130,6 @@ public:
	char m_useSiteLinkBuf;
	char m_usePageLinkBuf;
	char m_printInXml;
	//Msg25 m_msg25;
	SafeBuf m_tmpBuf11;
	SafeBuf m_tmpBuf12;
	Multicast m_mcast11;
@@ -1297,7 +1137,6 @@ public:
	// lists from cachedb for msg25's msg20 replies serialized
	RdbList m_siteReplyList;
	RdbList m_pageReplyList;
	//void (* m_masterLoopWrapper) (void *state);
	MsgC m_msgc;
	bool m_isAllowed;
	bool m_forwardDownloadRequest;
@@ -1308,10 +1147,6 @@ public:
	// for limiting # of iframe tag expansions
	int32_t m_numExpansions;
	char m_newOnly;
	//int32_t m_tryAgainTimeDelta;
	//int32_t m_sameIpWait;
	//int32_t m_sameDomainWait;
	//int32_t m_maxSpidersPerDomain;
	char m_isWWWDup;
	char m_calledMsg0b;

@@ -1322,24 +1157,14 @@ public:
	class RdbList *m_ulist;
	void *m_hack;
	class XmlDoc *m_hackxd;
	//class LinkInfo *m_linkInfo1Ptr;
	char *m_linkInfoColl;
	//char m_injectedReply;
	//int32_t m_minInlinkerHopCount;
	//class LinkInfo *m_linkInfo2Ptr;
	SiteGetter m_siteGetter;
	int64_t m_siteHash64;
	//char *m_site;
	//int32_t m_siteLen;
	//Url m_siteUrl;
	int32_t m_siteHash32;
	char *m_httpReply;
	//char m_downloadAttempted;
	char m_incrementedAttemptsCount;
	char m_incrementedDownloadCount;
	char m_redirectFlag;
	//char m_isScraping;
	//char m_throttleDownload;
	char m_spamCheckDisabled;
	char m_useRobotsTxt;
	int32_t m_robotsTxtLen;
@@ -1353,15 +1178,12 @@ public:
	int32_t m_filteredContentMaxSize;
	char m_calledThread;
	int32_t m_errno;
	//class CollectionRec *m_cr;
	//int32_t m_utf8ContentAllocSize;
	int32_t m_hostHash32a;
	int32_t m_hostHash32b;
	int32_t m_domHash32;
	int32_t m_priorityQueueNum;

	// this points into m_msge0 i guess
	//class TagRec **m_outlinkTagRecVector;
	Msge0 m_msge0;

	// this points into m_msge1 i guess
@@ -1657,10 +1479,7 @@ public:
	bool m_storeTermListInfo;
	char m_sortTermListBy;

	SafeBuf m_sectiondbData;
	//char *m_sectiondbData;
	char *m_placedbData;
	//int32_t m_sectiondbDataSize;
	int32_t m_placedbDataSize;

	// we now have HashInfo to replace this
@@ -1869,18 +1869,7 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
}



/////////////
//
// CHROME DETECTION
//
// we search for these terms we hash here in getSectionsWithDupStats()
// so we can remove chrome.
//
/////////////

// . returns false and sets g_errno on error
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
bool XmlDoc::hashSections ( HashTableX *tt ) {
	// BR 20160106: No longer store xpath-hashes in posdb as we do not use them.
	return true;

@@ -3539,7 +3528,6 @@ skipsingleword:


	// between calls? i.e. hashTitle() and hashBody()
	//if ( wc > 0 ) m_dist = wposvec[wc-1] + 100;
	if ( i > 0 ) m_dist = wposvec[i-1] + 100;

	return true;