Remove SectionVotingTable & passing Phrases into Sections

Ai Lin Chia
2016-02-23 14:35:40 +01:00
parent 0f91da871e
commit 0efdf88387
7 changed files with 34 additions and 532 deletions

@@ -2104,7 +2104,6 @@ bool Repair::injectTitleRec ( ) {
xd->m_useSpiderdb = m_rebuildSpiderdb;
xd->m_useTitledb = m_rebuildTitledb;
//xd->m_usePlacedb = m_rebuildPlacedb;
//xd->m_useSectiondb = m_rebuildSectiondb;
//xd->m_useRevdb = m_rebuildRevdb;
xd->m_useSecondaryRdbs = addToSecondaryRdbs;

@@ -102,7 +102,7 @@ public:
// . sets m_sections[] array, 1-1 with words array "w"
// . the Weights class can look at these sections and zero out the weights
// for words in script, style, select and marquee sections
bool Sections::set( Words *w, Phrases *phrases, Bits *bits, Url *url, int64_t siteHash64,
bool Sections::set( Words *w, Bits *bits, Url *url, int64_t siteHash64,
char *coll, int32_t niceness, uint8_t contentType ) {
reset();
@@ -141,8 +141,6 @@ bool Sections::set( Words *w, Phrases *phrases, Bits *bits, Url *url, int64_t si
m_wlens = wlens;
m_wptrs = wptrs;
m_tids = tids;
m_pids = phrases->getPhraseIds2();
m_isRSSExt = false;
char *ext = m_url->getExtension();

@@ -483,7 +483,7 @@ class Sections {
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
// . sets m_sections[] array, 1-1 with words array "w"
bool set(class Words *w, class Phrases *phrases, class Bits *bits, class Url *url,
bool set(class Words *w, class Bits *bits, class Url *url,
int64_t siteHash64, char *coll, int32_t niceness, uint8_t contentType );
bool addVotes(class SectionVotingTable *nsvt, uint32_t tagPairHash );
@@ -644,7 +644,6 @@ class Sections {
bool m_badHtml;
int64_t *m_wids;
int64_t *m_pids;
int32_t *m_wlens;
char **m_wptrs;
nodeid_t *m_tids;

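A migrated call site drops only the Phrases argument; nothing else changes. A minimal sketch with assumed surrounding variables (per the Sections.h comment above, set() returns false when it blocks, and returns true with g_errno set on error):

    // before:
    // sections.set( &words, &phrases, &bits, &url, siteHash64,
    //               coll, niceness, contentType );
    // after: Phrases is no longer computed for, or passed to, Sections
    sections.set( &words, &bits, &url, siteHash64,
                  coll, niceness, contentType );
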
@@ -384,7 +384,6 @@ void XmlDoc::reset ( ) {
m_phrases.reset();
m_bits.reset();
m_sections.reset();
//m_weights.reset();
m_countTable.reset();
// other crap
@@ -392,10 +391,7 @@ void XmlDoc::reset ( ) {
m_links.reset();
m_bits2.reset();
m_pos.reset();
//m_synonyms.reset();
m_synBuf.reset();
//m_nsvt.reset();
//m_osvt.reset();
m_turkVotingTable.reset();
m_turkBitsTable.reset();
m_vtr.reset();
@@ -406,17 +402,10 @@ void XmlDoc::reset ( ) {
m_mime.reset();
m_tagRec.reset();
m_newTagBuf.reset();
//m_clockCandidatesTable.reset();
//m_cctbuf.reset();
m_dupList.reset();
//m_oldMetaList.reset();
m_msg8a.reset();
//m_siteLinkInfo.reset();
//m_msg25.reset();
//m_msgc.reset();
m_msg13.reset();
m_msg0b.reset();
//m_siteGetter.reset();
m_msge0.reset();
m_msge1.reset();
m_reply.reset();
@@ -430,9 +419,6 @@ void XmlDoc::reset ( ) {
m_xbuf.reset();
m_tagRecBuf.reset();
//m_titleRec = NULL;
//m_titleRecSize = 0;
// origin of this XmlDoc
m_setFromTitleRec = false;
m_setFromUrl = false;
@@ -463,11 +449,6 @@ void XmlDoc::reset ( ) {
m_listFlushed = false;
m_updatedCounts = false;
m_updatedCounts2 = false;
//m_updatedTagdb1 = false;
//m_updatedTagdb2 = false;
//m_updatedTagdb3 = false;
//m_updatedTagdb4 = false;
//m_updatedTagdb5 = false;
m_copied1 = false;
m_updatingSiteLinkInfoTags = false;
m_addressSetCalled = false;
@@ -478,18 +459,12 @@ void XmlDoc::reset ( ) {
m_numRedirects = 0;
m_numOutlinksAdded = 0;
// . use sameDomain and sameIp waits?
// . these may be bypassed in getContactDoc()
//m_throttleDownload = true;
m_spamCheckDisabled = false;
m_useRobotsTxt = true;
m_redirectFlag = false;
m_allowSimplifiedRedirs = false;
//m_calledMsg22a = false;
//m_calledMsg22b = false;
//m_calledMsg22c = false;
m_didDelay = false;
m_didDelayUnregister = false;
m_calledMsg22d = 0LL;
@@ -511,25 +486,17 @@ void XmlDoc::reset ( ) {
m_numSectiondbNeeds = 0;
m_sectiondbRecall = 0;
//m_triedVoteCache = false;
//m_storedVoteCache = false;
m_setTr = false;
//m_checkedRobots = false;
m_triedTagRec = false;
m_didGatewayPage = false;
m_didQuickDupCheck = false;
m_calledMsg8b = false;
m_recycleContent = false;
//m_loadFromOldTitleRec = false;
m_callback1 = NULL;
m_callback2 = NULL;
m_state = NULL;
//m_checkForRedir = true;
m_processedLang = false;
m_doingConsistencyCheck = false;
@@ -550,21 +517,14 @@ void XmlDoc::reset ( ) {
// Repair.cpp now explicitly sets these to false if it needs to
m_usePosdb = true;
//m_useDatedb = true;
m_useClusterdb = true;
m_useLinkdb = true;
m_useSpiderdb = true;
m_useTitledb = true;
m_useTagdb = true;
m_usePlacedb = true;
//m_useTimedb = true;
// only use for custom crawls for now to save disk space
m_useSectiondb = false;
//m_useRevdb = true;
m_useSecondaryRdbs = false;
//m_useIpsTxtFile = true;
// used by Msg13.cpp only. kind of a hack.
m_isSpiderProxy = false;
@@ -593,29 +553,11 @@ void XmlDoc::reset ( ) {
char *XmlDoc::getTestDir ( ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// return NULL if we are not the "qatest123" collection
if ( strcmp(cr->m_coll,"qatest123") ) return NULL;
// if Test.cpp explicitly set SpiderRequest::m_useTestSpiderDir bit
// then return "test-spider" otherwise...
//if ( m_sreqValid && m_sreq.m_useTestSpiderDir )
// return "qa";//"test-spider";
// ... default to "test-parser"
//return "test-parser";
return "qa";
/*
if ( getIsPageParser() )
return "test-page-parser";
//if ( m_sreqValid && m_sreq.m_isInjecting )
// return "test-page-inject";
else if ( g_conf.m_testParserEnabled )
return "test-parser";
else if ( g_conf.m_testSpiderEnabled )
return "test-spider";
// default to being from PageInject
return "test-page-inject";
*/
//else { char *xx=NULL;*xx=0; }
//return NULL;
}
int32_t XmlDoc::getSpideredTime ( ) {
@@ -4934,23 +4876,25 @@ Pos *XmlDoc::getPos ( ) {
Phrases *XmlDoc::getPhrases ( ) {
// return it if it is set
if ( m_phrasesValid ) return &m_phrases;
if ( m_phrasesValid ) {
return &m_phrases;
}
// this will set it if necessary
Words *words = getWords();
// returns NULL on error, -1 if blocked
if ( ! words || words == (Words *)-1 ) return (Phrases *)words;
// get this
Bits *bits = getBits();
// bail on error
if ( ! bits ) return NULL;
// now set what we need
if ( ! m_phrases.set ( words ,
bits ,
true , // use stop words
false , // use stems
m_version ,
m_niceness ) )
if ( !m_phrases.set( words, bits, true, false, m_version, m_niceness ) ) {
return NULL;
}
// we got it
m_phrasesValid = true;
return &m_phrases;
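
The rewritten getPhrases() keeps XmlDoc's tri-state getter convention: NULL means error with g_errno set, (Phrases *)-1 means the call blocked and will re-enter through a callback, and any other pointer is the cached result. A minimal consumer sketch under that assumption:

    Phrases *phrases = xd->getPhrases();
    if      ( ! phrases )                { /* error: g_errno says why    */ }
    else if ( phrases == (Phrases *)-1 ) { /* blocked: await callback    */ }
    else                                 { /* ready: cached in m_phrases */ }
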
@@ -4970,6 +4914,7 @@ Sections *XmlDoc::getExplicitSections ( ) {
//}
setStatus ( "getting explicit sections" );
// use the old title rec to make sure we parse consistently!
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (XmlDoc **)-1 ) return (Sections *)pod;
@@ -4977,21 +4922,22 @@ Sections *XmlDoc::getExplicitSections ( ) {
Words *words = getWords();
// returns NULL on error, -1 if blocked
if ( ! words || words == (Words *)-1 ) return (Sections *)words;
// need these too now
Phrases *phrases = getPhrases();
if ( ! phrases || phrases == (void *)-1 ) return (Sections *)phrases;
// get this
Bits *bits = getBits();
// bail on error
if ( ! bits ) return NULL;
// the site hash
int64_t *sh64 = getSiteHash64();
// sanity check
if ( ! sh64 && ! g_errno ) { char *xx=NULL; *xx=0; }
if ( ! sh64 || sh64 == (void *)-1 ) return (Sections *)sh64;
// the docid
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (Sections *)d;
// get the content type
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
@@ -5007,26 +4953,23 @@ Sections *XmlDoc::getExplicitSections ( ) {
// this uses the sectionsReply to see which sections are "text", etc.
// rather than compute it expensively
if ( !m_calledSections &&
!m_sections.set( &m_words, &m_phrases, bits, getFirstUrl(), *sh64, cr->m_coll, m_niceness, *ct ) ) {
!m_sections.set( &m_words, bits, getFirstUrl(), *sh64, cr->m_coll, m_niceness, *ct ) ) {
m_calledSections = true;
// sanity check, this should not block, we are setting
// exclusively from the titleRec
//if ( sd ) { char *xx=NULL;*xx=0; }
// it blocked, return -1
return (Sections *) -1;
}
int64_t end = gettimeofdayInMillisecondsLocal();
if ( end - start > 1000 )
if ( end - start > 100 )
log("build: %s section set took %"INT64" ms",
m_firstUrl.m_url,end -start);
// error? ETAGBREACH for example... or maybe ENOMEM
if ( g_errno ) return NULL;
// set inlink bits
m_bits.setInLinkBits ( &m_sections );
// we got it
m_explicitSectionsValid = true;
return &m_sections;
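
One behavioral tweak hides in this hunk: the slow-parse log threshold drops from 1000 ms to 100 ms, so a slow Sections::set() gets logged an order of magnitude sooner. In context (a sketch; the start timestamp is assumed to be taken the same way just above the hunk):

    int64_t start = gettimeofdayInMillisecondsLocal();
    // ... m_sections.set( &m_words, bits, ... ) ...
    int64_t end = gettimeofdayInMillisecondsLocal();
    if ( end - start > 100 ) // was 1000
        log("build: %s section set took %"INT64" ms",
            m_firstUrl.m_url, end - start);
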
@@ -5047,105 +4990,14 @@ Sections *XmlDoc::getImpliedSections ( ) {
// add in Section::m_sentFlags bits having to do with our voting tables
Sections *XmlDoc::getSections ( ) {
setStatus("getting sections");
// get the sections without implied sections
Sections *ss = getImpliedSections();
if ( ! ss || ss==(void *)-1) return (Sections *)ss;
// returns NULL if our url is root!
//HashTableX *rvt = getRootVotingTable();
//if ( ! rvt || rvt == (void *)-1 ) return (Sections *)rvt;
SectionVotingTable *osvt = getOldSectionVotingTable();
if ( ! osvt || osvt == (void *)-1 ) return (Sections *)osvt;
uint32_t *tph = getTagPairHash32();
if ( ! tph || tph == (uint32_t *)-1 ) return (Sections *)tph;
// need a getUseSectiondb() function...
if ( ! m_useSectiondb ) {
m_sectionsValid = true;
return &m_sections;
}
// start here
Section *si;
// get first sentence in doc
si = ss->m_firstSent;
// do not bother scanning if no votes
if ( osvt->getNumVotes() <= 0 ) si = NULL;
// assume no dups
m_maxVotesForDup = 0;
// scan the sentence sections and OR in the bits we should
for ( ; si ; si = si->m_nextSent ) {
// breathe
QUICKPOLL ( m_niceness );
// sanity check
if ( ! si->m_sentenceContentHash64 ) { char *xx=NULL;*xx=0; }
// how many pages from this site have this taghash for
// a sentence
float nt;
nt = osvt->getNumSampled(si->m_turkTagHash32,SV_TURKTAGHASH);
// skip if nobody! (except us)
if ( nt <= 0.0 ) continue;
// . get our tag content hash
// . for some reason m_contentHash is 0 for like menu-y sections
int32_t modified = si->m_turkTagHash32 ^ si->m_sentenceContentHash64;
// . now how many pages also had same content in that tag?
// . TODO: make sure numSampled counts each docid only once,
// and not each time it occurs on that page.
float nsam = osvt->getNumSampled(modified,SV_TAGCONTENTHASH);
// cast it to a int32_t
int32_t votes1 = (int32_t)nsam;
// by default, complement
int32_t votes2 = (int32_t)nt - votes1;
// store votes
si->m_votesForDup = votes1;
si->m_votesForNotDup = votes2;
// what's the most dup votes we had...
if ( votes1 > m_maxVotesForDup ) m_maxVotesForDup = votes1;
}
m_sectionsValid = true;
return &m_sections;
}
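
The loop deleted above converted sectiondb tallies into per-sentence dup votes. Worked example: if 10 pages on the site sampled a sentence under the same turk tag hash, and 7 of those also matched on the tag hash XORed with the sentence content hash, the sentence ended up with m_votesForDup = 7 and m_votesForNotDup = 3. The arithmetic, isolated from the removed code:

    // votes1: pages with the same xpath AND the same sentence content
    int32_t votes1 = (int32_t)osvt->getNumSampled (
            si->m_turkTagHash32 ^ si->m_sentenceContentHash64 ,
            SV_TAGCONTENTHASH );
    // votes2: pages with the same xpath but different content there
    int32_t votes2 = (int32_t)osvt->getNumSampled (
            si->m_turkTagHash32 , SV_TURKTAGHASH ) - votes1;
    si->m_votesForDup    = votes1;
    si->m_votesForNotDup = votes2;
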
SectionVotingTable *XmlDoc::getNewSectionVotingTable ( ) {
if ( m_nsvtValid ) return &m_nsvt;
// need sections
Sections *ss = getSections();
if ( ! ss || ss==(Sections *)-1 ) return (SectionVotingTable *)ss;
// hash of all adjacent tag pairs
uint32_t *tph = getTagPairHash32 ( ) ;
if ( ! tph || tph == (uint32_t *)-1 ) return (SectionVotingTable *)tph;
// are we a site root url?
//char *isRoot = getIsSiteRoot();
//if ( ! isRoot || isRoot == (char *)-1 )
// return (SectionVotingTable *)isRoot;
// init table
if ( ! m_nsvt.init ( 4096,"nsvt",m_niceness) ) return NULL;
// . tally the section votes from the sections class
// . only add the date votes, not the taghash/contenthash keys
// from the root, since we add those from the root voting table
// into m_osvt directly!
// . we no longer have root voting table!
// . this adds keys of the hash of each tag xpath
// . and it adds keys of the hash of each tag path PLUS its innerhtml
if ( ! ss->addVotes ( &m_nsvt , *tph ) ) return NULL;
// our new section voting table is now valid, and ready to be added
// to sectiondb by calling SectionVotingTable::hash()
m_nsvtValid = true;
return &m_nsvt;
}
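
As the comment block above spells out, each section contributed two kinds of keys to the new voting table: a hash of its tag xpath, and that hash combined with a hash of its innerHTML. A sketch of the key construction only (names assumed; the addVotes() internals sit outside this diff):

    // key 1: "a section with this xpath exists on this page"
    int32_t tagKey     = si->m_turkTagHash32;
    // key 2: "...and it contains exactly this content"
    // (XORed with the content hash and truncated, as in the removed getSections())
    int32_t contentKey = tagKey ^ si->m_sentenceContentHash64;
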
// . scan every section and look up its tag and content hashes in
// sectiondb to find out how many pages and sites have the same hash
// . use the secondary sectiondb key, key2
@@ -5756,262 +5608,6 @@ bool XmlDoc::gotSectionFacets ( Multicast *mcast ) {
return true;
}
// . for all urls from this subdomain...
// . EXCEPT root url since we use msg17 to cache that, etc.
SectionVotingTable *XmlDoc::getOldSectionVotingTable ( ) {
if ( m_osvtValid ) return &m_osvt;
// do not consult sectiondb if we are set from the title rec,
// that way we avoid parsing inconsistencies since sectiondb changes!
if ( m_setFromTitleRec ) {
char *p = ptr_sectiondbData;
m_osvtValid = true;
m_osvt.m_totalSiteVoters = 0;
if ( size_sectiondbData <= 4 ) return &m_osvt;
m_osvt.m_totalSiteVoters = *(int32_t *)p;
p += 4;
int32_t remaining = size_sectiondbData - 4;
m_osvt.m_svt.deserialize(p,remaining,m_niceness);
return &m_osvt;
}
// returns empty table if WE are the site root url!
//HashTableX *rvt = getRootVotingTable();
//if ( ! rvt || rvt == (void *)-1 ) return (Sections *)rvt;
// need sections
//Sections *ss = getSections();
//if ( ! ss || ss==(Sections *)-1 ) return (SectionVotingTable *)ss;
// hash of all adjacent tag pairs
uint32_t *tph = getTagPairHash32 ( ) ;
if ( ! tph || tph == (uint32_t *)-1 ) return (SectionVotingTable *)tph;
int64_t *siteHash64 = getSiteHash64();
if ( ! siteHash64 || siteHash64 == (void *)-1 )
return (SectionVotingTable *)siteHash64;
// the docid
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (SectionVotingTable *)d;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// . for us, dates are really containers of the flags and tag hash
// . init this up here, it is re-set if we re-call getSectiondbList()
// because there were too many records in it to handle in one read
if ( m_numSectiondbReads == 0 ) {
// init table
if ( ! m_osvt.init ( 8192,"osvt",m_niceness) ) return NULL;
// use site hash as the main thing
int64_t termId = *siteHash64 & TERMID_MASK;
// . start key for reading list from sectiondb
// . read all the section votes for this site
m_sectiondbStartKey = g_datedb.makeStartKey(termId,0xffffffff);
// how many reads we have to do...
m_numSectiondbNeeds = 1;
}
//bool skipRecall = false;
// always read 5MB at a time from sectiondb
int32_t minRecSizes = 5000000;
// crap! host #28 is being totally slammed!!!!!
// why?????? in the meantime do this
//minRecSizes = 100000;
//skipRecall = true;
// is it facebook?
bool limitSectiondb = false;
// limit now to speed up repair rebuild
// limit now to speed up injection!
limitSectiondb = true;
// facebook lists often clog the tree, and when we read 2MB worth of
// it, it takes 100ms, so reduce to 50k so it takes 2.5ms...
// because facebook is a well structured xml feed so why read any
// really!
if ( limitSectiondb ) minRecSizes = 50000;
key128_t *lastKey = NULL;
// if msg0 blocked and came back with g_errno set, e.g. it
// got an OOM while preparing to merge
if ( g_errno ) {
log("build: sectiondb read2: %s",mstrerror(g_errno));
return NULL;
}
readLoop:
// before looking up TitleRecs using Msg20, let's first consult
// datedb to see if we got adequate data as to what sections
// are the article sections
// only get the list once
if ( m_numSectiondbReads < m_numSectiondbNeeds ) {
// only do this once
m_numSectiondbReads++;
// make the termid
uint64_t termId = *siteHash64 & TERMID_MASK;
// end key is always the same
key128_t end = g_datedb.makeEndKey ( termId , 0 );
// shortcut
Msg0 *m = &m_msg0;
// get the group this list is in (split = false)
uint32_t shardNum;
shardNum = getShardNum ( RDB_SECTIONDB,(char *)&m_sectiondbStartKey);
// we need a group # from the groupId
//int32_t split = g_hostdb.getGroupNum ( gid );
// note it
//logf(LOG_DEBUG,"sections: "
// "reading list from sectiondb: "
// "sk.n1=0x%"XINT64" sk.n0=0x%"XINT64" "
// "ek.n1=0x%"XINT64" ek.n0=0x%"XINT64" "
// ,m_sectiondbStartKey.n1
// ,m_sectiondbStartKey.n0
// ,end.n1
// ,end.n0
// );
// . get the list
// . gets all votes for one particular site
if ( ! m->getList ( -1 , // hostId
0 , // ip
0 , // port
0 , // maxCacheAge
false , // addToCache
RDB_SECTIONDB , // was RDB_DATEDB
cr->m_collnum ,
&m_secdbList ,
(char *)&m_sectiondbStartKey ,
(char *)&end ,
minRecSizes ,
m_masterState ,
m_masterLoop ,
m_niceness , // MAX_NICENESS
// default parms follow
true , // doErrorCorrection?
true , // includeTree?
true , // doMerge?
-1 , // firstHostId
0 , // startFileNum
-1 , // numFiles
msg0_getlist_infinite_timeout , // timeout
-1 , // syncPoint
-1 , // preferLocalReads
NULL , // msg5
NULL , // msg5b
false , // isrealmerge?
true , // allowpagecache?
false , // forceLocalIndexdb?
false , // doIndexdbSplit?
shardNum ) )//split ))
// return -1 if blocks
return (SectionVotingTable *)-1;
// error?
if ( g_errno ) {
log("build: sectiondb read: %s",mstrerror(g_errno));
return NULL;
}
}
// it also returns the lastKey in the list so we can use that to
// set the startKey for a re-call if we read >= 5MB
lastKey = NULL;
//logf(LOG_DEBUG,"sections: read list of %"INT32" bytes",
// m_secdbList.m_listSize);
bool recall = true;
if ( m_secdbList.m_listSize + 24 < minRecSizes ) recall = false;
// . unless it had special byte set in Msg0.cpp HACK
// . we send back a compressed list and tack on an extra 0 byte at
// the end so that we know we had a full list!
if ( (m_secdbList.m_listSize % 2) == 1 ) {
m_secdbList.m_listSize--;
m_secdbList.m_listEnd --;
recall = true;
}
// no longer bother re-calling, because facebook is way slow...
if ( limitSectiondb ) recall = false;
// . returns false and sets g_errno on error
// . compile the votes from sectiondb for this site into a hashtable
// . m_osvt is a SectionVotingTable and each entry in the hashtable
// is a SectionVote class.
// . the taghash is the key of the vote and is a hash of all the
// nested tags the section is in.
// . another vote uses the tag hash hashed with the hash of the
// content contained by the section
// . using these two vote counts we set Section::m_votesForDup
// or Section::m_votesForNotDup counts which let us know how the
// section is repeated or not repeated on the site
// . SectionVote::m_score is always 1.0 from what i can tell
// cuz it seems like addVote*() always uses a score of 1.0
// . SectionVote::m_numSampled is how many times that tagHash
// occurs in the document.
if ( ! m_osvt.addListOfVotes(&m_secdbList,
&lastKey,
*d , // docid
m_niceness))
return NULL;
// why does this always seem to be zero?
if ( g_conf.m_logDebugBuild )
log("xmldoc: added sectiondblist size=%"INT32" recall=%"INT32"",
m_secdbList.m_listSize,(int32_t)recall);
// . recall? yes if we had to truncate our list...
// . we need to be able to scan all votes for the website... that is
// why we recall here
// . limit votes by a special sectiondb key then that is a vote...
if ( recall ) {
// another debug
//logf(LOG_DEBUG,"sections: recalling read");
// just note it for now
//if ( m_sectiondbRecall > 5 )
if ( m_numSectiondbNeeds > 5 )
logf(LOG_DEBUG,"sect: msg0 sectiondb recall #%"INT32"",
m_sectiondbRecall++);
// we should really limit voting per site! we do now!
//if ( m_recall > 5 ) { char *xx=NULL;*xx=0; }
// update our start key
if ( lastKey ) m_sectiondbStartKey = *lastKey;
// inc by 2 since we already had this key
m_sectiondbStartKey += 2;
// unflag
m_numSectiondbNeeds++;
// and repeat
goto readLoop;
}
//
// set ptr_sectiondbData so this can be set from a title rec without
// having to lookup in sectiondb again which might have changed!
//
m_sectiondbData.purge();
// alloc
int32_t need = m_osvt.m_svt.getStoredSize() + 4;
if ( ! m_sectiondbData.reserve(need) )
// oom error?
return NULL;
// serialize this number
m_sectiondbData.pushLong(m_osvt.m_totalSiteVoters);
// serialize the hashtablex
m_osvt.m_svt.serialize ( &m_sectiondbData );
// reference it for title rec serialization
ptr_sectiondbData = m_sectiondbData.getBufStart();
size_sectiondbData = m_sectiondbData.length();
m_osvtValid = true;
return &m_osvt;
}
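
ptr_sectiondbData exists so that a document rebuilt from its title rec never re-reads sectiondb, which is the parsing-consistency concern noted at the top of the function. Its layout, matching the serialize code at the bottom and the deserialize code at the top, is a 4-byte voter count followed by the serialized vote hash table:

    // [ int32_t m_totalSiteVoters ][ serialized HashTableX m_svt ... ]
    char *p = ptr_sectiondbData;
    m_osvt.m_totalSiteVoters = *(int32_t *)p;
    p += 4;
    m_osvt.m_svt.deserialize ( p , size_sectiondbData - 4 , m_niceness );
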
int32_t *XmlDoc::getLinkSiteHashes ( ) {
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: BEGIN", __FILE__, __func__, __LINE__);
@@ -17770,30 +17366,6 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// get the voting table which we will add to sectiondb
SectionVotingTable *nsvt = NULL;
SectionVotingTable *osvt = NULL;
// seems like
// sectiondb takes up about 15% of the disk space like this. no!
// cuz then there is revdb, so we are 30%. so that's a no go.
bool addSectionVotes = false;
if ( nd ) addSectionVotes = true;
if ( ! m_useSectiondb ) addSectionVotes = false;
// to save disk space no longer add the roots! not only saves sectiondb
// but also saves space in revdb
//if ( nd && *isRoot ) addSectionVotes = true;
if ( addSectionVotes ) {
nsvt = getNewSectionVotingTable();
if ( ! nsvt || nsvt == (void *)-1 )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getNewSectionVotingTable returned -1", __FILE__, __func__, __LINE__);
return (char *)nsvt;
}
// get the old table too!
osvt = getOldSectionVotingTable();
if ( ! osvt || osvt == (void *)-1 )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getOldSectionVotingTable returned -1", __FILE__, __func__, __LINE__);
return (char *)osvt;
}
}
// need firstip if adding a rebuilt spider request
if ( m_useSecondaryRdbs && m_useSpiderdb ) {
@@ -18435,12 +18007,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
setStatus ( "adding sectiondb keys");
// checkpoint
saved = m_p;
// add that table to the metalist
if ( m_useSectiondb && !addTable128(&st1,RDB_SECTIONDB,forDelete))
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: addTable128 failed", __FILE__, __func__, __LINE__);
return NULL;
}
//if(! addTable128 (&st2,&st1, RDB_SECTIONDB,true ,true))return NULL;
// sanity check
if ( m_p - saved > needSectiondb ) { char *xx=NULL;*xx=0; }
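
The removed block followed getMetaList()'s standard overflow guard: space for the sectiondb keys (needSectiondb) is reserved up front, the table is appended, and an overrun is treated as a fatal logic bug. The pattern, with the names from the hunk:

    char *saved = m_p;                    // checkpoint the metalist cursor
    if ( m_useSectiondb &&
         ! addTable128 ( &st1 , RDB_SECTIONDB , forDelete ) )
        return NULL;                      // g_errno set by addTable128
    if ( m_p - saved > needSectiondb ) {  // wrote more than was reserved
        char *xx = NULL; *xx = 0;         // gb-style deliberate crash
    }
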
@@ -23419,7 +22986,6 @@ Summary *XmlDoc::getSummary () {
if ( ! ct || ct == (void *)-1 ) {
return (Summary *)ct;
}
// xml and json docs have empty summaries
if ( *ct == CT_JSON || *ct == CT_XML ) {
m_summaryValid = true;
@@ -23446,6 +23012,7 @@ Summary *XmlDoc::getSummary () {
if ( ! pos || pos == (Pos *)-1 ) {
return (Summary *)pos;
}
char *site = getSite();
if ( ! site || site == (char *)-1 ) {
return (Summary *)site;
@@ -23829,12 +23396,11 @@ SafeBuf *XmlDoc::getSampleForGigabitsJSON ( ) {
if ( ! ww.set ( &xml , true , m_niceness ) ) return NULL;
Bits bb;
if ( ! bb.set ( &ww ,0 ,m_niceness ) ) return NULL;
Phrases pp;
if ( ! pp.set ( &ww , &bb , true,false,0,m_niceness) ) return NULL;
// this uses the sectionsReply to see which sections are
// "text", etc. rather than compute it expensively
Sections sec;
if ( !sec.set( &ww, &pp, &bb, getFirstUrl(), 0, "", m_niceness, CT_JSON ) ) {
if ( !sec.set( &ww, &bb, getFirstUrl(), 0, "", m_niceness, CT_JSON ) ) {
return NULL;
}
@@ -24882,11 +24448,6 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
//
Sections *sections = getSections();
if ( ! sections ||sections==(Sections *)-1) {char*xx=NULL;*xx=0;}
//SectionVotingTable *nsvt = getNewSectionVotingTable();
//if ( ! nsvt || nsvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
//SectionVotingTable *osvt = getOldSectionVotingTable();
//if ( ! osvt || osvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
// these are nice
//HashTableX *pt = dp->getPhoneTable();
@@ -26128,10 +25689,7 @@ bool XmlDoc::printRainbowSections ( SafeBuf *sb , HttpRequest *hr ) {
if ( hr ) sections = getSectionsWithDupStats();
else sections = getSections();
if ( ! sections) return true;if (sections==(Sections *)-1)return false;
//SectionVotingTable *nsvt = getNewSectionVotingTable();
//if ( ! nsvt || nsvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
//SectionVotingTable *osvt = getOldSectionVotingTable();
//if ( ! osvt || osvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
Words *words = getWords();
if ( ! words ) return true; if ( words == (Words *)-1 ) return false;
Phrases *phrases = getPhrases();
@@ -29537,13 +29095,6 @@ SafeBuf *XmlDoc::getTermListBuf ( ) {
m_termListBufValid = true;
return &m_termListBuf;
// print timing
//int64_t now = gettimeofdayInMilliseconds();
//int64_t took = now - m_cacheStartTime;
//log("seopipe: took %"INT64" ms to parse docid %"INT64"",took,m_docId);
// . flag it as being completely cached now
// . returns false and sets g_errno on error
//return addDocIdToTermListCache ( m_docId , cr->m_coll );
}

@@ -429,8 +429,6 @@ public:
//BR 20160106 removed: class SafeBuf *getInlineSectionVotingBuf();
bool gotSectionFacets( class Multicast *mcast );
class SectionStats *getSectionStats ( uint32_t secHash32, uint32_t sentHash32, bool cacheOnly );
class SectionVotingTable *getOldSectionVotingTable();
class SectionVotingTable *getNewSectionVotingTable();
char **getSectionsReply ( ) ;
char **getSectionsVotes ( ) ;
HashTableX *getSectionVotingTable();
@@ -946,9 +944,6 @@ public:
char m_logLangId;
int32_t m_logSiteNumInlinks;
SectionVotingTable m_nsvt;
SectionVotingTable m_osvt;
int32_t m_numSectiondbReads;
int32_t m_numSectiondbNeeds;
key128_t m_sectiondbStartKey;
@@ -1013,7 +1008,6 @@ public:
char m_addedSpiderReplySizeValid;
char m_addedStatusDocSizeValid;
char m_downloadStartTimeValid;
//char m_docQualityValid;
char m_siteValid;
char m_startTimeValid;
char m_currentUrlValid;
@@ -1025,7 +1019,6 @@ public:
char m_lastUrlValid;
char m_docIdValid;
char m_availDocIdValid;
//char m_collValid;
char m_tagRecValid;
char m_robotsTxtLenValid;
char m_tagRecDataValid;
@@ -1042,32 +1035,24 @@ public:
char m_relatedQueryBufValid;
char m_queryLinkBufValid;
char m_redirSpiderRequestValid;
//char m_queryPtrsValid;
char m_queryOffsetsValid;
//char m_queryPtrsSortedValid;
char m_queryPtrsWholeValid;
char m_relatedDocIdBufValid;
char m_topMatchingQueryBufValid;
char m_relatedDocIdsScoredBufValid;
char m_relatedDocIdsWithTitlesValid;
char m_relatedTitleBufValid;
//char m_queryLinkBufValid;
char m_missingTermBufValid;
char m_matchingTermBufValid;
//char m_relPtrsValid;
char m_sortedPosdbListBufValid;
char m_wpSortedPosdbListBufValid;
char m_termListBufValid;
char m_insertableTermsBufValid;
char m_scoredInsertableTermsBufValid;
//char m_iwfiBufValid; // for holding WordFreqInfo instances
char m_wordPosInfoBufValid;
char m_recommendedLinksBufValid;
//char m_queryHashTableValid;
char m_queryOffsetTableValid;
//char m_socketWriteBufValid;
//char m_numBannedOutlinksValid;
char m_hopCountValid;
char m_isInjectingValid;
char m_isImportingValid;
@@ -1091,13 +1076,9 @@ public:
char m_posValid;
char m_isUrlBadYearValid;
char m_phrasesValid;
//char m_synonymsValid;
//char m_weightsValid;
char m_sectionsValid;
char m_subSentsValid;
char m_osvtValid;
char m_nsvtValid;
//char m_rvtValid;
char m_turkVotingTableValid;
char m_turkBitsTableValid;
char m_turkBanTableValid;
@@ -1109,17 +1090,13 @@ public:
char m_imagesValid;
char m_msge0Valid;
char m_msge1Valid;
//char m_msge2Valid;
//char m_sampleVectorValid;
char m_gigabitHashesValid;
//char m_oldsrValid;
char m_sreqValid;
char m_srepValid;
bool m_ipValid;
bool m_firstIpValid;
bool m_spideredTimeValid;
//bool m_nextSpiderTimeValid;
bool m_indexedTimeValid;
bool m_firstIndexedValid;
bool m_isInIndexValid;
@@ -1127,14 +1104,7 @@ public:
bool m_outlinksAddedDateValid;
bool m_countryIdValid;
bool m_bodyStartPosValid;
/*
bool m_titleWeightValid;
bool m_headerWeightValid;
bool m_urlPathWeightValid;
bool m_externalLinkTextWeightValid;
bool m_internalLinkTextWeightValid;
bool m_conceptWeightValid;
*/
bool m_httpStatusValid;
bool m_crawlDelayValid;
bool m_finalCrawlDelayValid;
@@ -1145,8 +1115,6 @@ public:
bool m_expandedUtf8ContentValid;
bool m_utf8ContentValid;
bool m_isAllowedValid;
//bool m_tryAgainTimeDeltaValid;
//bool m_eliminateMenusValid;
bool m_redirUrlValid;
bool m_redirCookieBufValid;
bool m_metaRedirUrlValid;
@@ -1163,11 +1131,9 @@ public:
bool m_redirErrorValid;
bool m_domHash32Valid;
bool m_contentHash32Valid;
//bool m_tagHash32Valid;
bool m_tagPairHash32Valid;
bool m_spiderLinksValid;
//bool m_nextSpiderPriorityValid;
bool m_firstIndexedDateValid;
bool m_isPermalinkValid;
@@ -1196,15 +1162,11 @@ public:
bool m_oldDocValid;
bool m_extraDocValid;
bool m_rootDocValid;
//bool m_gatewayDocValid;
bool m_oldMetaListValid;
bool m_oldTitleRecValid;
bool m_rootTitleRecValid;
bool m_isIndexedValid;
bool m_siteNumInlinksValid;
//bool m_siteNumInlinksUniqueIpValid;//FreshValid;
//bool m_siteNumInlinksUniqueCBlockValid;//sitePopValid
//bool m_siteNumInlinksTotalValid;
bool m_siteNumInlinks8Valid;
bool m_siteLinkInfoValid;
bool m_isWWWDupValid;
@@ -1228,7 +1190,6 @@ public:
bool m_isSiteRootValid;
bool m_wasContentInjectedValid;
bool m_outlinkHopCountVectorValid;
//bool m_isSpamValid;
bool m_isFilteredValid;
bool m_urlFilterNumValid;
bool m_numOutlinksAddedValid;
@@ -1245,7 +1206,6 @@ public:
bool m_titleValid;
bool m_htbValid;
bool m_collnumValid;
//bool m_twidsValid;
bool m_termId32BufValid;
bool m_termInfoBufValid;
bool m_newTermInfoBufValid;
@@ -1254,9 +1214,6 @@ public:
bool m_spiderStatusDocMetaListValid;
bool m_isCompromisedValid;
bool m_isNoArchiveValid;
//bool m_isVisibleValid;
//bool m_clockCandidatesTableValid;
//bool m_clockCandidatesDataValid;
bool m_titleRecBufValid;
bool m_isLinkSpamValid;
bool m_isErrorPageValid;
@@ -1749,7 +1706,6 @@ public:
bool m_useTagdb ;
bool m_usePlacedb ;
//bool m_useTimedb ;
bool m_useSectiondb ;
//bool m_useRevdb ;
bool m_useSecondaryRdbs ;

@@ -8473,13 +8473,11 @@ bool parseTest ( char *coll , int64_t docId , char *query ) {
// computeWordIds from xml
words.set ( &xml , true , true ) ;
bits.set ( &words ,TITLEREC_CURRENT_VERSION, 0);
Phrases phrases;
phrases.set ( &words,&bits,true,true,TITLEREC_CURRENT_VERSION,0);
t = gettimeofdayInMilliseconds_force();
for ( int32_t i = 0 ; i < 100 ; i++ )
//if ( ! words.set ( &xml , true , true ) )
// do not supply xd so it will be set from scratch
if ( !sections.set( &words, &phrases, &bits, NULL, 0, NULL, 0, 0 ) )
if ( !sections.set( &words, &bits, NULL, 0, NULL, 0, 0 ) )
return log("build: speedtestxml: sections set: %s",
mstrerror(g_errno));
@@ -8493,6 +8491,7 @@ bool parseTest ( char *coll , int64_t docId , char *query ) {
//Phrases phrases;
Phrases phrases;
t = gettimeofdayInMilliseconds_force();
for ( int32_t i = 0 ; i < 100 ; i++ )
if ( ! phrases.set ( &words ,

@@ -28,14 +28,11 @@ static void generateSummary(Summary &summary, char *htmlInput, char *queryStr, c
Bits bits;
ASSERT_TRUE(bits.set(&words, TITLEREC_CURRENT_VERSION, 0));
Phrases phrases;
ASSERT_TRUE(phrases.set(&words, &bits, true, false, TITLEREC_CURRENT_VERSION, 0));
Url url;
url.set(urlStr);
Sections sections;
ASSERT_TRUE(sections.set(&words, &phrases, &bits, &url, 0, "", 0, CT_HTML));
ASSERT_TRUE(sections.set(&words, &bits, &url, 0, "", 0, CT_HTML));
Query query;
ASSERT_TRUE(query.set2(queryStr, langEnglish, true));
@@ -53,6 +50,9 @@ static void generateSummary(Summary &summary, char *htmlInput, char *queryStr, c
Bits bitsForSummary;
ASSERT_TRUE(bitsForSummary.setForSummary(&words));
Phrases phrases;
ASSERT_TRUE(phrases.set(&words, &bits, true, false, TITLEREC_CURRENT_VERSION, 0));
Matches matches;
matches.setQuery(&query);
ASSERT_TRUE(matches.set(&words, &phrases, &sections, &bitsForSummary, &pos, &xml, &title, &url, &linkInfo, 0));
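
After this change the summary test builds Sections directly from Words and Bits, and Phrases moves down to just before Matches, its only remaining consumer. The resulting setup order, sketched from the two hunks above:

    Words words;        ASSERT_TRUE(words.set(&xml, true, 0));
    Bits bits;          ASSERT_TRUE(bits.set(&words, TITLEREC_CURRENT_VERSION, 0));
    Sections sections;  ASSERT_TRUE(sections.set(&words, &bits, &url, 0, "", 0, CT_HTML));
    // ... query, summary bits and positions are set up in between ...
    Phrases phrases;    // still needed by Matches
    ASSERT_TRUE(phrases.set(&words, &bits, true, false, TITLEREC_CURRENT_VERSION, 0));
    Matches matches;
    matches.setQuery(&query);
    ASSERT_TRUE(matches.set(&words, &phrases, &sections, &bitsForSummary, &pos,
                            &xml, &title, &url, &linkInfo, 0));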