mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-16 02:46:08 -04:00
Remove SectionVotingTable & passing Phrases into Sections
This commit is contained in:
@ -2104,7 +2104,6 @@ bool Repair::injectTitleRec ( ) {
|
||||
xd->m_useSpiderdb = m_rebuildSpiderdb;
|
||||
xd->m_useTitledb = m_rebuildTitledb;
|
||||
//xd->m_usePlacedb = m_rebuildPlacedb;
|
||||
//xd->m_useSectiondb = m_rebuildSectiondb;
|
||||
//xd->m_useRevdb = m_rebuildRevdb;
|
||||
xd->m_useSecondaryRdbs = addToSecondaryRdbs;
|
||||
|
||||
|
@ -102,7 +102,7 @@ public:
|
||||
// . sets m_sections[] array, 1-1 with words array "w"
|
||||
// . the Weights class can look at these sections and zero out the weights
|
||||
// for words in script, style, select and marquee sections
|
||||
bool Sections::set( Words *w, Phrases *phrases, Bits *bits, Url *url, int64_t siteHash64,
|
||||
bool Sections::set( Words *w, Bits *bits, Url *url, int64_t siteHash64,
|
||||
char *coll, int32_t niceness, uint8_t contentType ) {
|
||||
reset();
|
||||
|
||||
@ -141,8 +141,6 @@ bool Sections::set( Words *w, Phrases *phrases, Bits *bits, Url *url, int64_t si
|
||||
m_wlens = wlens;
|
||||
m_wptrs = wptrs;
|
||||
m_tids = tids;
|
||||
m_pids = phrases->getPhraseIds2();
|
||||
|
||||
|
||||
m_isRSSExt = false;
|
||||
char *ext = m_url->getExtension();
|
||||
|
@ -483,7 +483,7 @@ class Sections {
|
||||
// . returns false if blocked, true otherwise
|
||||
// . returns true and sets g_errno on error
|
||||
// . sets m_sections[] array, 1-1 with words array "w"
|
||||
bool set(class Words *w, class Phrases *phrases, class Bits *bits, class Url *url,
|
||||
bool set(class Words *w, class Bits *bits, class Url *url,
|
||||
int64_t siteHash64, char *coll, int32_t niceness, uint8_t contentType );
|
||||
|
||||
bool addVotes(class SectionVotingTable *nsvt, uint32_t tagPairHash );
|
||||
@ -644,7 +644,6 @@ class Sections {
|
||||
bool m_badHtml;
|
||||
|
||||
int64_t *m_wids;
|
||||
int64_t *m_pids;
|
||||
int32_t *m_wlens;
|
||||
char **m_wptrs;
|
||||
nodeid_t *m_tids;
|
||||
|
497
XmlDoc.cpp
497
XmlDoc.cpp
@ -384,7 +384,6 @@ void XmlDoc::reset ( ) {
|
||||
m_phrases.reset();
|
||||
m_bits.reset();
|
||||
m_sections.reset();
|
||||
//m_weights.reset();
|
||||
m_countTable.reset();
|
||||
|
||||
// other crap
|
||||
@ -392,10 +391,7 @@ void XmlDoc::reset ( ) {
|
||||
m_links.reset();
|
||||
m_bits2.reset();
|
||||
m_pos.reset();
|
||||
//m_synonyms.reset();
|
||||
m_synBuf.reset();
|
||||
//m_nsvt.reset();
|
||||
//m_osvt.reset();
|
||||
m_turkVotingTable.reset();
|
||||
m_turkBitsTable.reset();
|
||||
m_vtr.reset();
|
||||
@ -406,17 +402,10 @@ void XmlDoc::reset ( ) {
|
||||
m_mime.reset();
|
||||
m_tagRec.reset();
|
||||
m_newTagBuf.reset();
|
||||
//m_clockCandidatesTable.reset();
|
||||
//m_cctbuf.reset();
|
||||
m_dupList.reset();
|
||||
//m_oldMetaList.reset();
|
||||
m_msg8a.reset();
|
||||
//m_siteLinkInfo.reset();
|
||||
//m_msg25.reset();
|
||||
//m_msgc.reset();
|
||||
m_msg13.reset();
|
||||
m_msg0b.reset();
|
||||
//m_siteGetter.reset();
|
||||
m_msge0.reset();
|
||||
m_msge1.reset();
|
||||
m_reply.reset();
|
||||
@ -430,9 +419,6 @@ void XmlDoc::reset ( ) {
|
||||
m_xbuf.reset();
|
||||
m_tagRecBuf.reset();
|
||||
|
||||
//m_titleRec = NULL;
|
||||
//m_titleRecSize = 0;
|
||||
|
||||
// origin of this XmlDoc
|
||||
m_setFromTitleRec = false;
|
||||
m_setFromUrl = false;
|
||||
@ -463,11 +449,6 @@ void XmlDoc::reset ( ) {
|
||||
m_listFlushed = false;
|
||||
m_updatedCounts = false;
|
||||
m_updatedCounts2 = false;
|
||||
//m_updatedTagdb1 = false;
|
||||
//m_updatedTagdb2 = false;
|
||||
//m_updatedTagdb3 = false;
|
||||
//m_updatedTagdb4 = false;
|
||||
//m_updatedTagdb5 = false;
|
||||
m_copied1 = false;
|
||||
m_updatingSiteLinkInfoTags = false;
|
||||
m_addressSetCalled = false;
|
||||
@ -478,18 +459,12 @@ void XmlDoc::reset ( ) {
|
||||
|
||||
m_numRedirects = 0;
|
||||
m_numOutlinksAdded = 0;
|
||||
// . use sameDomain and sameIp waits?
|
||||
// . these may be bypassed in getContactDoc()
|
||||
//m_throttleDownload = true;
|
||||
m_spamCheckDisabled = false;
|
||||
m_useRobotsTxt = true;
|
||||
m_redirectFlag = false;
|
||||
|
||||
m_allowSimplifiedRedirs = false;
|
||||
|
||||
//m_calledMsg22a = false;
|
||||
//m_calledMsg22b = false;
|
||||
//m_calledMsg22c = false;
|
||||
m_didDelay = false;
|
||||
m_didDelayUnregister = false;
|
||||
m_calledMsg22d = 0LL;
|
||||
@ -511,25 +486,17 @@ void XmlDoc::reset ( ) {
|
||||
m_numSectiondbNeeds = 0;
|
||||
m_sectiondbRecall = 0;
|
||||
|
||||
//m_triedVoteCache = false;
|
||||
//m_storedVoteCache = false;
|
||||
|
||||
m_setTr = false;
|
||||
//m_checkedRobots = false;
|
||||
m_triedTagRec = false;
|
||||
m_didGatewayPage = false;
|
||||
m_didQuickDupCheck = false;
|
||||
m_calledMsg8b = false;
|
||||
|
||||
m_recycleContent = false;
|
||||
//m_loadFromOldTitleRec = false;
|
||||
m_callback1 = NULL;
|
||||
m_callback2 = NULL;
|
||||
m_state = NULL;
|
||||
|
||||
|
||||
//m_checkForRedir = true;
|
||||
|
||||
m_processedLang = false;
|
||||
|
||||
m_doingConsistencyCheck = false;
|
||||
@ -550,21 +517,14 @@ void XmlDoc::reset ( ) {
|
||||
|
||||
// Repair.cpp now explicitly sets these to false if needs to
|
||||
m_usePosdb = true;
|
||||
//m_useDatedb = true;
|
||||
m_useClusterdb = true;
|
||||
m_useLinkdb = true;
|
||||
m_useSpiderdb = true;
|
||||
m_useTitledb = true;
|
||||
m_useTagdb = true;
|
||||
m_usePlacedb = true;
|
||||
//m_useTimedb = true;
|
||||
// only use for custom crawls for now to save disk space
|
||||
m_useSectiondb = false;
|
||||
//m_useRevdb = true;
|
||||
m_useSecondaryRdbs = false;
|
||||
|
||||
//m_useIpsTxtFile = true;
|
||||
|
||||
// used by Msg13.cpp only. kinda a hack.
|
||||
m_isSpiderProxy = false;
|
||||
|
||||
@ -593,29 +553,11 @@ void XmlDoc::reset ( ) {
|
||||
char *XmlDoc::getTestDir ( ) {
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr ) return NULL;
|
||||
|
||||
// return NULL if we are not the "qatest123" collection
|
||||
if ( strcmp(cr->m_coll,"qatest123") ) return NULL;
|
||||
// if Test.cpp explicitly set SpiderRequest::m_useTestSpiderDir bit
|
||||
// then return "test-spider" otherwise...
|
||||
//if ( m_sreqValid && m_sreq.m_useTestSpiderDir )
|
||||
// return "qa";//"test-spider";
|
||||
// ... default to "test-parser"
|
||||
//return "test-parser";
|
||||
|
||||
return "qa";
|
||||
/*
|
||||
if ( getIsPageParser() )
|
||||
return "test-page-parser";
|
||||
//if ( m_sreqValid && m_sreq.m_isInjecting )
|
||||
// return "test-page-inject";
|
||||
else if ( g_conf.m_testParserEnabled )
|
||||
return "test-parser";
|
||||
else if ( g_conf.m_testSpiderEnabled )
|
||||
return "test-spider";
|
||||
// default to being from PageInject
|
||||
return "test-page-inject";
|
||||
*/
|
||||
//else { char *xx=NULL;*xx=0; }
|
||||
//return NULL;
|
||||
}
|
||||
|
||||
int32_t XmlDoc::getSpideredTime ( ) {
|
||||
@ -4934,23 +4876,25 @@ Pos *XmlDoc::getPos ( ) {
|
||||
|
||||
Phrases *XmlDoc::getPhrases ( ) {
|
||||
// return it if it is set
|
||||
if ( m_phrasesValid ) return &m_phrases;
|
||||
if ( m_phrasesValid ) {
|
||||
return &m_phrases;
|
||||
}
|
||||
|
||||
// this will set it if necessary
|
||||
Words *words = getWords();
|
||||
// returns NULL on error, -1 if blocked
|
||||
if ( ! words || words == (Words *)-1 ) return (Phrases *)words;
|
||||
|
||||
// get this
|
||||
Bits *bits = getBits();
|
||||
// bail on error
|
||||
if ( ! bits ) return NULL;
|
||||
|
||||
// now set what we need
|
||||
if ( ! m_phrases.set ( words ,
|
||||
bits ,
|
||||
true , // use stop words
|
||||
false , // use stems
|
||||
m_version ,
|
||||
m_niceness ) )
|
||||
if ( !m_phrases.set( words, bits, true, false, m_version, m_niceness ) ) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// we got it
|
||||
m_phrasesValid = true;
|
||||
return &m_phrases;
|
||||
@ -4970,6 +4914,7 @@ Sections *XmlDoc::getExplicitSections ( ) {
|
||||
//}
|
||||
|
||||
setStatus ( "getting explicit sections" );
|
||||
|
||||
// use the old title rec to make sure we parse consistently!
|
||||
XmlDoc **pod = getOldXmlDoc ( );
|
||||
if ( ! pod || pod == (XmlDoc **)-1 ) return (Sections *)pod;
|
||||
@ -4977,21 +4922,22 @@ Sections *XmlDoc::getExplicitSections ( ) {
|
||||
Words *words = getWords();
|
||||
// returns NULL on error, -1 if blocked
|
||||
if ( ! words || words == (Words *)-1 ) return (Sections *)words;
|
||||
// need these too now
|
||||
Phrases *phrases = getPhrases();
|
||||
if ( ! phrases || phrases == (void *)-1 ) return (Sections *)phrases;
|
||||
|
||||
// get this
|
||||
Bits *bits = getBits();
|
||||
// bail on error
|
||||
if ( ! bits ) return NULL;
|
||||
|
||||
// the site hash
|
||||
int64_t *sh64 = getSiteHash64();
|
||||
// sanity check
|
||||
if ( ! sh64 && ! g_errno ) { char *xx=NULL; *xx=0; }
|
||||
if ( ! sh64 || sh64 == (void *)-1 ) return (Sections *)sh64;
|
||||
|
||||
// the docid
|
||||
int64_t *d = getDocId();
|
||||
if ( ! d || d == (int64_t *)-1 ) return (Sections *)d;
|
||||
|
||||
// get the content type
|
||||
uint8_t *ct = getContentType();
|
||||
if ( ! ct ) return NULL;
|
||||
@ -5007,26 +4953,23 @@ Sections *XmlDoc::getExplicitSections ( ) {
|
||||
// this uses the sectionsReply to see which sections are "text", etc.
|
||||
// rather than compute it expensively
|
||||
if ( !m_calledSections &&
|
||||
!m_sections.set( &m_words, &m_phrases, bits, getFirstUrl(), *sh64, cr->m_coll, m_niceness, *ct ) ) {
|
||||
!m_sections.set( &m_words, bits, getFirstUrl(), *sh64, cr->m_coll, m_niceness, *ct ) ) {
|
||||
m_calledSections = true;
|
||||
// sanity check, this should not block, we are setting
|
||||
// exclusively from the titleRec
|
||||
//if ( sd ) { char *xx=NULL;*xx=0; }
|
||||
// it blocked, return -1
|
||||
return (Sections *) -1;
|
||||
}
|
||||
|
||||
int64_t end = gettimeofdayInMillisecondsLocal();
|
||||
|
||||
if ( end - start > 1000 )
|
||||
if ( end - start > 100 )
|
||||
log("build: %s section set took %"INT64" ms",
|
||||
m_firstUrl.m_url,end -start);
|
||||
|
||||
|
||||
// error? ETAGBREACH for example... or maybe ENOMEM
|
||||
if ( g_errno ) return NULL;
|
||||
// set inlink bits
|
||||
m_bits.setInLinkBits ( &m_sections );
|
||||
|
||||
// we got it
|
||||
m_explicitSectionsValid = true;
|
||||
return &m_sections;
|
||||
@ -5047,105 +4990,14 @@ Sections *XmlDoc::getImpliedSections ( ) {
|
||||
|
||||
// add in Section::m_sentFlags bits having to do with our voting tables
|
||||
Sections *XmlDoc::getSections ( ) {
|
||||
|
||||
setStatus("getting sections");
|
||||
|
||||
// get the sections without implied sections
|
||||
Sections *ss = getImpliedSections();
|
||||
if ( ! ss || ss==(void *)-1) return (Sections *)ss;
|
||||
|
||||
// returns NULL if our url is root!
|
||||
//HashTableX *rvt = getRootVotingTable();
|
||||
//if ( ! rvt || rvt == (void *)-1 ) return (Sections *)rvt;
|
||||
|
||||
SectionVotingTable *osvt = getOldSectionVotingTable();
|
||||
if ( ! osvt || osvt == (void *)-1 ) return (Sections *)osvt;
|
||||
|
||||
uint32_t *tph = getTagPairHash32();
|
||||
if ( ! tph || tph == (uint32_t *)-1 ) return (Sections *)tph;
|
||||
|
||||
// need a getUseSectiondb() function...
|
||||
|
||||
if ( ! m_useSectiondb ) {
|
||||
m_sectionsValid = true;
|
||||
return &m_sections;
|
||||
}
|
||||
|
||||
// start here
|
||||
Section *si;
|
||||
|
||||
|
||||
// get first sentence in doc
|
||||
si = ss->m_firstSent;
|
||||
// do not bother scanning if no votes
|
||||
if ( osvt->getNumVotes() <= 0 ) si = NULL;
|
||||
// assume no dups
|
||||
m_maxVotesForDup = 0;
|
||||
// scan the sentence sections and or in the bits we should
|
||||
for ( ; si ; si = si->m_nextSent ) {
|
||||
// breathe
|
||||
QUICKPOLL ( m_niceness );
|
||||
// sanity check
|
||||
if ( ! si->m_sentenceContentHash64 ) { char *xx=NULL;*xx=0; }
|
||||
// how many pages from this site have this taghash for
|
||||
// a sentence
|
||||
float nt;
|
||||
nt = osvt->getNumSampled(si->m_turkTagHash32,SV_TURKTAGHASH);
|
||||
// skip if nobody! (except us)
|
||||
if ( nt <= 0.0 ) continue;
|
||||
// . get out tag content hash
|
||||
// . for some reason m_contentHash is 0 for like menu-y sectns
|
||||
int32_t modified =si->m_turkTagHash32^si->m_sentenceContentHash64;
|
||||
// . now how many pages also had same content in that tag?
|
||||
// . TODO: make sure numsampled only counts a docid once!
|
||||
// and this is not each time it occurs on that page.
|
||||
float nsam = osvt->getNumSampled(modified,SV_TAGCONTENTHASH);
|
||||
// cast it to a int32_t
|
||||
int32_t votes1 = (int32_t)nsam;
|
||||
// by default, complement
|
||||
int32_t votes2 = (int32_t)nt - votes1;
|
||||
// store votes
|
||||
si->m_votesForDup = votes1;
|
||||
si->m_votesForNotDup = votes2;
|
||||
// what's the most dup votes we had...
|
||||
if ( votes1 > m_maxVotesForDup ) m_maxVotesForDup = votes1;
|
||||
}
|
||||
|
||||
m_sectionsValid = true;
|
||||
return &m_sections;
|
||||
}
|
||||
|
||||
SectionVotingTable *XmlDoc::getNewSectionVotingTable ( ) {
|
||||
if ( m_nsvtValid ) return &m_nsvt;
|
||||
// need sections
|
||||
Sections *ss = getSections();
|
||||
if ( ! ss || ss==(Sections *)-1 ) return (SectionVotingTable *)ss;
|
||||
// hash of all adjacent tag pairs
|
||||
uint32_t *tph = getTagPairHash32 ( ) ;
|
||||
if ( ! tph || tph == (uint32_t *)-1 ) return (SectionVotingTable *)tph;
|
||||
// are we a site root url?
|
||||
//char *isRoot = getIsSiteRoot();
|
||||
//if ( ! isRoot || isRoot == (char *)-1 )
|
||||
// return (SectionVotingTable *)isRoot;
|
||||
|
||||
// init table
|
||||
if ( ! m_nsvt.init ( 4096,"nsvt",m_niceness) ) return NULL;
|
||||
// . tally the section votes from the sections class
|
||||
// . only add the date votes, not the taghash/contenthash keys
|
||||
// from the root, since we add those from the root voting table
|
||||
// into m_osvt directly!
|
||||
// . we no longer have root voting table!
|
||||
// . this adds keys of the hash of each tag xpath
|
||||
// . and it adds keys of the hash of each tag path PLUS its innerhtml
|
||||
if ( ! ss->addVotes ( &m_nsvt , *tph ) ) return NULL;
|
||||
// our new section voting table is now valid, and ready to be added
|
||||
// to sectiondb by calling SectionVotingTable::hash()
|
||||
m_nsvtValid = true;
|
||||
return &m_nsvt;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// . scan every section and look up its tag and content hashes in
|
||||
// sectiondb to find out how many pages and sites have the same hash
|
||||
// . use the secondary sectiondb key, key2
|
||||
@ -5756,262 +5608,6 @@ bool XmlDoc::gotSectionFacets ( Multicast *mcast ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// . for all urls from this subdomain...
|
||||
// . EXCEPT root url since we use msg17 to cache that, etc.
|
||||
SectionVotingTable *XmlDoc::getOldSectionVotingTable ( ) {
|
||||
|
||||
if ( m_osvtValid ) return &m_osvt;
|
||||
|
||||
// do not consult sectiondb if we are set from the title rec,
|
||||
// that way we avoid parsining inconsistencies since sectiondb changes!
|
||||
if ( m_setFromTitleRec ) {
|
||||
char *p = ptr_sectiondbData;
|
||||
m_osvtValid = true;
|
||||
m_osvt.m_totalSiteVoters = 0;
|
||||
if ( size_sectiondbData <= 4 ) return &m_osvt;
|
||||
m_osvt.m_totalSiteVoters = *(int32_t *)p;
|
||||
p += 4;
|
||||
int32_t remaining = size_sectiondbData - 4;
|
||||
m_osvt.m_svt.deserialize(p,remaining,m_niceness);
|
||||
return &m_osvt;
|
||||
}
|
||||
|
||||
// returns empty table if WE are the site root url!
|
||||
//HashTableX *rvt = getRootVotingTable();
|
||||
//if ( ! rvt || rvt == (void *)-1 ) return (Sections *)rvt;
|
||||
|
||||
// need sections
|
||||
//Sections *ss = getSections();
|
||||
//if ( ! ss || ss==(Sections *)-1 ) return (SectionVotingTable *)ss;
|
||||
|
||||
// hash of all adjacent tag pairs
|
||||
uint32_t *tph = getTagPairHash32 ( ) ;
|
||||
if ( ! tph || tph == (uint32_t *)-1 ) return (SectionVotingTable *)tph;
|
||||
|
||||
int64_t *siteHash64 = getSiteHash64();
|
||||
if ( ! siteHash64 || siteHash64 == (void *)-1 )
|
||||
return (SectionVotingTable *)siteHash64;
|
||||
|
||||
// the docid
|
||||
int64_t *d = getDocId();
|
||||
if ( ! d || d == (int64_t *)-1 ) return (SectionVotingTable *)d;
|
||||
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr ) return NULL;
|
||||
|
||||
// . for us, dates are really containers of the flags and tag hash
|
||||
// . init this up here, it is re-set if we re-call getSectiondbList()
|
||||
// because there were too many records in it to handle in one read
|
||||
if ( m_numSectiondbReads == 0 ) {
|
||||
// init table
|
||||
if ( ! m_osvt.init ( 8192,"osvt",m_niceness) ) return NULL;
|
||||
// use site hash as the main thing
|
||||
int64_t termId = *siteHash64 & TERMID_MASK;
|
||||
// . start key for reading list from sectiondb
|
||||
// . read all the section votes for this site
|
||||
m_sectiondbStartKey = g_datedb.makeStartKey(termId,0xffffffff);
|
||||
// how many reads we have to do...
|
||||
m_numSectiondbNeeds = 1;
|
||||
}
|
||||
|
||||
//bool skipRecall = false;
|
||||
// always read 5MB at a time from sectiondb
|
||||
int32_t minRecSizes = 5000000;
|
||||
|
||||
// crap! host #28 is being totall slammed!!!!!
|
||||
// why?????? in the meantime do this
|
||||
//minRecSizes = 100000;
|
||||
//skipRecall = true;
|
||||
|
||||
// is it facebook?
|
||||
bool limitSectiondb = false;
|
||||
// limit now to speed up repair rebuild
|
||||
// limit now to speed up injection!
|
||||
limitSectiondb = true;
|
||||
// facebook lists often clog the tree, and when we read 2MB worth of
|
||||
// it, it takes 100ms, so reduce to 50k to so it takes 2.5ms...
|
||||
// because facebook is a well structured xml feed so why read any
|
||||
// really!
|
||||
if ( limitSectiondb ) minRecSizes = 50000;
|
||||
|
||||
key128_t *lastKey = NULL;
|
||||
|
||||
// if msg0 blocked and came back with g_errno set, like
|
||||
// in preparing to merge it got an OOM
|
||||
if ( g_errno ) {
|
||||
log("build: sectiondb read2: %s",mstrerror(g_errno));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
readLoop:
|
||||
// before looking up TitleRecs using Msg20, let's first consult
|
||||
// datedb to see if we got adequate data as to what sections
|
||||
// are the article sections
|
||||
|
||||
// only get the list once
|
||||
if ( m_numSectiondbReads < m_numSectiondbNeeds ) {
|
||||
// only do this once
|
||||
m_numSectiondbReads++;
|
||||
// make the termid
|
||||
uint64_t termId = *siteHash64 & TERMID_MASK;
|
||||
// end key is always the same
|
||||
key128_t end = g_datedb.makeEndKey ( termId , 0 );
|
||||
// shortcut
|
||||
Msg0 *m = &m_msg0;
|
||||
// get the group this list is in (split = false)
|
||||
uint32_t shardNum;
|
||||
shardNum = getShardNum ( RDB_SECTIONDB,(char *)&m_sectiondbStartKey);
|
||||
// we need a group # from the groupId
|
||||
//int32_t split = g_hostdb.getGroupNum ( gid );
|
||||
// note it
|
||||
//logf(LOG_DEBUG,"sections: "
|
||||
// "reading list from sectiondb: "
|
||||
// "sk.n1=0x%"XINT64" sk.n0=0x%"XINT64" "
|
||||
// "ek.n1=0x%"XINT64" ek.n0=0x%"XINT64" "
|
||||
// ,m_sectiondbStartKey.n1
|
||||
// ,m_sectiondbStartKey.n0
|
||||
// ,end.n1
|
||||
// ,end.n0
|
||||
// );
|
||||
// . get the list
|
||||
// . gets all votes for one particular site
|
||||
if ( ! m->getList ( -1 , // hostId
|
||||
0 , // ip
|
||||
0 , // port
|
||||
0 , // maxCacheAge
|
||||
false , // addToCache
|
||||
RDB_SECTIONDB , // was RDB_DATEDB
|
||||
cr->m_collnum ,
|
||||
&m_secdbList ,
|
||||
(char *)&m_sectiondbStartKey ,
|
||||
(char *)&end ,
|
||||
minRecSizes ,
|
||||
m_masterState ,
|
||||
m_masterLoop ,
|
||||
m_niceness , // MAX_NICENESS
|
||||
// default parms follow
|
||||
true , // doErrorCorrection?
|
||||
true , // includeTree?
|
||||
true , // doMerge?
|
||||
-1 , // firstHostId
|
||||
0 , // startFileNum
|
||||
-1 , // numFiles
|
||||
msg0_getlist_infinite_timeout , // timeout
|
||||
-1 , // syncPoint
|
||||
-1 , // preferLocalReads
|
||||
NULL , // msg5
|
||||
NULL , // msg5b
|
||||
false , // isrealmerge?
|
||||
true , // allowpagecache?
|
||||
false , // forceLocalIndexdb?
|
||||
false , // doIndexdbSplit?
|
||||
shardNum ) )//split ))
|
||||
// return -1 if blocks
|
||||
return (SectionVotingTable *)-1;
|
||||
// error?
|
||||
if ( g_errno ) {
|
||||
log("build: sectiondb read: %s",mstrerror(g_errno));
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// it also returns the lastKey in the list so we can use that to
|
||||
// set the startKey for a re-call if we read >= 5MB
|
||||
lastKey = NULL;
|
||||
|
||||
//logf(LOG_DEBUG,"sections: read list of %"INT32" bytes",
|
||||
// m_secdbList.m_listSize);
|
||||
|
||||
bool recall = true;
|
||||
|
||||
if ( m_secdbList.m_listSize + 24 < minRecSizes ) recall = false;
|
||||
|
||||
// . unless it had special byte set in Msg0.cpp HACK
|
||||
// . we send back a compressed list and tack on an extra 0 byte at
|
||||
// the end so that we know we had a full list!
|
||||
if ( (m_secdbList.m_listSize % 2) == 1 ) {
|
||||
m_secdbList.m_listSize--;
|
||||
m_secdbList.m_listEnd --;
|
||||
recall = true;
|
||||
}
|
||||
|
||||
// no longer bother re-calling, because facebook is way slow...
|
||||
if ( limitSectiondb ) recall = false;
|
||||
|
||||
// . returns false and sets g_errno on error
|
||||
// . compile the votes from sectiondb for this site into a hashtable
|
||||
// . m_osvt is a SectionVotingTable and each entry in the hashtable
|
||||
// is a SectionVote class.
|
||||
// . the taghash is the key of the vote and is a hash of all the
|
||||
// nested tags the section is in.
|
||||
// . another vote uses the tag hash hashed with the hash of the
|
||||
// content contained by the section
|
||||
// . using these two vote counts we set Section::m_votesForDup
|
||||
// or Section::m_votesForNotDup counts which let us know how the
|
||||
// section is repeated or not repeated on the site
|
||||
// . SectionVote::m_score is always 1.0 from what i can tell
|
||||
// cuz it seems like addVote*() always uses a score of 1.0
|
||||
// . SectionVote::m_numSampled is how many times that tagHash
|
||||
// occurs in the document.
|
||||
if ( ! m_osvt.addListOfVotes(&m_secdbList,
|
||||
&lastKey,
|
||||
*d , // docid
|
||||
m_niceness))
|
||||
return NULL;
|
||||
|
||||
// why is this always zero it seems?
|
||||
if ( g_conf.m_logDebugBuild )
|
||||
log("xmldoc: added sectiondblist size=%"INT32" recall=%"INT32"",
|
||||
m_secdbList.m_listSize,(int32_t)recall);
|
||||
|
||||
// . recall? yes if we had to truncate our list...
|
||||
// . we need to be able to scan all votes for the website... that is
|
||||
// why we recall here
|
||||
// . limit votes by a special sectiondb key then that is a vote...
|
||||
if ( recall ) {
|
||||
// another debug
|
||||
//logf(LOG_DEBUG,"sections: recallling read");
|
||||
// just note it for now
|
||||
//if ( m_sectiondbRecall > 5 )
|
||||
if ( m_numSectiondbNeeds > 5 )
|
||||
logf(LOG_DEBUG,"sect: msg0 sectiondb recall #%"INT32"",
|
||||
m_sectiondbRecall++);
|
||||
// we should really limit voting per site! we do now!
|
||||
//if ( m_recall > 5 ) { char *xx=NULL;*xx=0; }
|
||||
// update our start key
|
||||
if ( lastKey ) m_sectiondbStartKey = *lastKey;
|
||||
// inc by 2 since we already had this key
|
||||
m_sectiondbStartKey += 2;
|
||||
// unflag
|
||||
m_numSectiondbNeeds++;
|
||||
// and repeat
|
||||
goto readLoop;
|
||||
}
|
||||
|
||||
//
|
||||
// set ptr_sectiondbData so this can be set from a title rec without
|
||||
// having to lookup in sectiondb again which might have changed!
|
||||
//
|
||||
m_sectiondbData.purge();
|
||||
// alloc
|
||||
int32_t need = m_osvt.m_svt.getStoredSize() + 4;
|
||||
if ( ! m_sectiondbData.reserve(need) )
|
||||
// oom error?
|
||||
return NULL;
|
||||
// serialize this number
|
||||
m_sectiondbData.pushLong(m_osvt.m_totalSiteVoters);
|
||||
// serialize the hashtablex
|
||||
m_osvt.m_svt.serialize ( &m_sectiondbData );
|
||||
// reference it for title rec serialization
|
||||
ptr_sectiondbData = m_sectiondbData.getBufStart();
|
||||
size_sectiondbData = m_sectiondbData.length();
|
||||
|
||||
m_osvtValid = true;
|
||||
return &m_osvt;
|
||||
}
|
||||
|
||||
int32_t *XmlDoc::getLinkSiteHashes ( ) {
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: BEGIN", __FILE__, __func__, __LINE__);
|
||||
|
||||
@ -17770,30 +17366,6 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
|
||||
// get the voting table which we will add to sectiondb
|
||||
SectionVotingTable *nsvt = NULL;
|
||||
SectionVotingTable *osvt = NULL;
|
||||
// seems like
|
||||
// sectiondb takes up abotu 15% of the disk space like this. no!
|
||||
// cuz then there is revdb, so we are 30%. so that's a no go.
|
||||
bool addSectionVotes = false;
|
||||
if ( nd ) addSectionVotes = true;
|
||||
if ( ! m_useSectiondb ) addSectionVotes = false;
|
||||
// to save disk space no longer add the roots! nto only saves sectiondb
|
||||
// but also saves space in revdb
|
||||
//if ( nd && *isRoot ) addSectionVotes = true;
|
||||
if ( addSectionVotes ) {
|
||||
nsvt = getNewSectionVotingTable();
|
||||
if ( ! nsvt || nsvt == (void *)-1 )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getNewSectionVotingTable returned -1", __FILE__, __func__, __LINE__);
|
||||
return (char *)nsvt;
|
||||
}
|
||||
// get the old table too!
|
||||
osvt = getNewSectionVotingTable();
|
||||
if ( ! osvt || osvt == (void *)-1 )
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, getNewSectionVotingTable returned -1", __FILE__, __func__, __LINE__);
|
||||
return (char *)osvt;
|
||||
}
|
||||
}
|
||||
|
||||
// need firstip if adding a rebuilt spider request
|
||||
if ( m_useSecondaryRdbs && m_useSpiderdb ) {
|
||||
@ -18435,12 +18007,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
|
||||
setStatus ( "adding sectiondb keys");
|
||||
// checkpoint
|
||||
saved = m_p;
|
||||
// add that table to the metalist
|
||||
if ( m_useSectiondb && !addTable128(&st1,RDB_SECTIONDB,forDelete))
|
||||
{
|
||||
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: addTable128 failed", __FILE__, __func__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//if(! addTable128 (&st2,&st1, RDB_SECTIONDB,true ,true))return NULL;
|
||||
// sanity check
|
||||
if ( m_p - saved > needSectiondb ) { char *xx=NULL;*xx=0; }
|
||||
@ -23419,7 +22986,6 @@ Summary *XmlDoc::getSummary () {
|
||||
if ( ! ct || ct == (void *)-1 ) {
|
||||
return (Summary *)ct;
|
||||
}
|
||||
|
||||
// xml and json docs have empty summaries
|
||||
if ( *ct == CT_JSON || *ct == CT_XML ) {
|
||||
m_summaryValid = true;
|
||||
@ -23446,6 +23012,7 @@ Summary *XmlDoc::getSummary () {
|
||||
if ( ! pos || pos == (Pos *)-1 ) {
|
||||
return (Summary *)pos;
|
||||
}
|
||||
|
||||
char *site = getSite();
|
||||
if ( ! site || site == (char *)-1 ) {
|
||||
return (Summary *)site;
|
||||
@ -23829,12 +23396,11 @@ SafeBuf *XmlDoc::getSampleForGigabitsJSON ( ) {
|
||||
if ( ! ww.set ( &xml , true , m_niceness ) ) return NULL;
|
||||
Bits bb;
|
||||
if ( ! bb.set ( &ww ,0 ,m_niceness ) ) return NULL;
|
||||
Phrases pp;
|
||||
if ( ! pp.set ( &ww , &bb , true,false,0,m_niceness) ) return NULL;
|
||||
|
||||
// this uses the sectionsReply to see which sections are
|
||||
// "text", etc. rather than compute it expensively
|
||||
Sections sec;
|
||||
if ( !sec.set( &ww, &pp, &bb, getFirstUrl(), 0, "", m_niceness, CT_JSON ) ) {
|
||||
if ( !sec.set( &ww, &bb, getFirstUrl(), 0, "", m_niceness, CT_JSON ) ) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -24882,11 +24448,6 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
|
||||
//
|
||||
Sections *sections = getSections();
|
||||
if ( ! sections ||sections==(Sections *)-1) {char*xx=NULL;*xx=0;}
|
||||
//SectionVotingTable *nsvt = getNewSectionVotingTable();
|
||||
//if ( ! nsvt || nsvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
|
||||
//SectionVotingTable *osvt = getOldSectionVotingTable();
|
||||
//if ( ! osvt || osvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
|
||||
|
||||
|
||||
// these are nice
|
||||
//HashTableX *pt = dp->getPhoneTable();
|
||||
@ -26128,10 +25689,7 @@ bool XmlDoc::printRainbowSections ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
if ( hr ) sections = getSectionsWithDupStats();
|
||||
else sections = getSections();
|
||||
if ( ! sections) return true;if (sections==(Sections *)-1)return false;
|
||||
//SectionVotingTable *nsvt = getNewSectionVotingTable();
|
||||
//if ( ! nsvt || nsvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
|
||||
//SectionVotingTable *osvt = getOldSectionVotingTable();
|
||||
//if ( ! osvt || osvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
|
||||
|
||||
Words *words = getWords();
|
||||
if ( ! words ) return true; if ( words == (Words *)-1 ) return false;
|
||||
Phrases *phrases = getPhrases();
|
||||
@ -29537,13 +29095,6 @@ SafeBuf *XmlDoc::getTermListBuf ( ) {
|
||||
m_termListBufValid = true;
|
||||
|
||||
return &m_termListBuf;
|
||||
// print timing
|
||||
//int64_t now = gettimeofdayInMilliseconds();
|
||||
//int64_t took = now - m_cacheStartTime;
|
||||
//log("seopipe: took %"INT64" ms to parse docid %"INT64"",took,m_docId);
|
||||
// . flag it as being completely cached now
|
||||
// . returns false and sets g_errno on error
|
||||
//return addDocIdToTermListCache ( m_docId , cr->m_coll );
|
||||
}
|
||||
|
||||
|
||||
|
48
XmlDoc.h
48
XmlDoc.h
@ -429,8 +429,6 @@ public:
|
||||
//BR 20160106 removed: class SafeBuf *getInlineSectionVotingBuf();
|
||||
bool gotSectionFacets( class Multicast *mcast );
|
||||
class SectionStats *getSectionStats ( uint32_t secHash32, uint32_t sentHash32, bool cacheOnly );
|
||||
class SectionVotingTable *getOldSectionVotingTable();
|
||||
class SectionVotingTable *getNewSectionVotingTable();
|
||||
char **getSectionsReply ( ) ;
|
||||
char **getSectionsVotes ( ) ;
|
||||
HashTableX *getSectionVotingTable();
|
||||
@ -946,9 +944,6 @@ public:
|
||||
char m_logLangId;
|
||||
int32_t m_logSiteNumInlinks;
|
||||
|
||||
SectionVotingTable m_nsvt;
|
||||
|
||||
SectionVotingTable m_osvt;
|
||||
int32_t m_numSectiondbReads;
|
||||
int32_t m_numSectiondbNeeds;
|
||||
key128_t m_sectiondbStartKey;
|
||||
@ -1013,7 +1008,6 @@ public:
|
||||
char m_addedSpiderReplySizeValid;
|
||||
char m_addedStatusDocSizeValid;
|
||||
char m_downloadStartTimeValid;
|
||||
//char m_docQualityValid;
|
||||
char m_siteValid;
|
||||
char m_startTimeValid;
|
||||
char m_currentUrlValid;
|
||||
@ -1025,7 +1019,6 @@ public:
|
||||
char m_lastUrlValid;
|
||||
char m_docIdValid;
|
||||
char m_availDocIdValid;
|
||||
//char m_collValid;
|
||||
char m_tagRecValid;
|
||||
char m_robotsTxtLenValid;
|
||||
char m_tagRecDataValid;
|
||||
@ -1042,32 +1035,24 @@ public:
|
||||
char m_relatedQueryBufValid;
|
||||
char m_queryLinkBufValid;
|
||||
char m_redirSpiderRequestValid;
|
||||
//char m_queryPtrsValid;
|
||||
char m_queryOffsetsValid;
|
||||
//char m_queryPtrsSortedValid;
|
||||
char m_queryPtrsWholeValid;
|
||||
char m_relatedDocIdBufValid;
|
||||
char m_topMatchingQueryBufValid;
|
||||
char m_relatedDocIdsScoredBufValid;
|
||||
char m_relatedDocIdsWithTitlesValid;
|
||||
char m_relatedTitleBufValid;
|
||||
//char m_queryLinkBufValid;
|
||||
char m_missingTermBufValid;
|
||||
char m_matchingTermBufValid;
|
||||
//char m_relPtrsValid;
|
||||
char m_sortedPosdbListBufValid;
|
||||
char m_wpSortedPosdbListBufValid;
|
||||
char m_termListBufValid;
|
||||
char m_insertableTermsBufValid;
|
||||
char m_scoredInsertableTermsBufValid;
|
||||
//char m_iwfiBufValid; // for holding WordFreqInfo instances
|
||||
char m_wordPosInfoBufValid;
|
||||
char m_recommendedLinksBufValid;
|
||||
|
||||
//char m_queryHashTableValid;
|
||||
char m_queryOffsetTableValid;
|
||||
//char m_socketWriteBufValid;
|
||||
//char m_numBannedOutlinksValid;
|
||||
char m_hopCountValid;
|
||||
char m_isInjectingValid;
|
||||
char m_isImportingValid;
|
||||
@ -1091,13 +1076,9 @@ public:
|
||||
char m_posValid;
|
||||
char m_isUrlBadYearValid;
|
||||
char m_phrasesValid;
|
||||
//char m_synonymsValid;
|
||||
//char m_weightsValid;
|
||||
char m_sectionsValid;
|
||||
char m_subSentsValid;
|
||||
char m_osvtValid;
|
||||
char m_nsvtValid;
|
||||
//char m_rvtValid;
|
||||
|
||||
char m_turkVotingTableValid;
|
||||
char m_turkBitsTableValid;
|
||||
char m_turkBanTableValid;
|
||||
@ -1109,17 +1090,13 @@ public:
|
||||
char m_imagesValid;
|
||||
char m_msge0Valid;
|
||||
char m_msge1Valid;
|
||||
//char m_msge2Valid;
|
||||
//char m_sampleVectorValid;
|
||||
char m_gigabitHashesValid;
|
||||
//char m_oldsrValid;
|
||||
char m_sreqValid;
|
||||
char m_srepValid;
|
||||
|
||||
bool m_ipValid;
|
||||
bool m_firstIpValid;
|
||||
bool m_spideredTimeValid;
|
||||
//bool m_nextSpiderTimeValid;
|
||||
bool m_indexedTimeValid;
|
||||
bool m_firstIndexedValid;
|
||||
bool m_isInIndexValid;
|
||||
@ -1127,14 +1104,7 @@ public:
|
||||
bool m_outlinksAddedDateValid;
|
||||
bool m_countryIdValid;
|
||||
bool m_bodyStartPosValid;
|
||||
/*
|
||||
bool m_titleWeightValid;
|
||||
bool m_headerWeightValid;
|
||||
bool m_urlPathWeightValid;
|
||||
bool m_externalLinkTextWeightValid;
|
||||
bool m_internalLinkTextWeightValid;
|
||||
bool m_conceptWeightValid;
|
||||
*/
|
||||
|
||||
bool m_httpStatusValid;
|
||||
bool m_crawlDelayValid;
|
||||
bool m_finalCrawlDelayValid;
|
||||
@ -1145,8 +1115,6 @@ public:
|
||||
bool m_expandedUtf8ContentValid;
|
||||
bool m_utf8ContentValid;
|
||||
bool m_isAllowedValid;
|
||||
//bool m_tryAgainTimeDeltaValid;
|
||||
//bool m_eliminateMenusValid;
|
||||
bool m_redirUrlValid;
|
||||
bool m_redirCookieBufValid;
|
||||
bool m_metaRedirUrlValid;
|
||||
@ -1163,11 +1131,9 @@ public:
|
||||
bool m_redirErrorValid;
|
||||
bool m_domHash32Valid;
|
||||
bool m_contentHash32Valid;
|
||||
//bool m_tagHash32Valid;
|
||||
bool m_tagPairHash32Valid;
|
||||
|
||||
bool m_spiderLinksValid;
|
||||
//bool m_nextSpiderPriorityValid;
|
||||
bool m_firstIndexedDateValid;
|
||||
bool m_isPermalinkValid;
|
||||
|
||||
@ -1196,15 +1162,11 @@ public:
|
||||
bool m_oldDocValid;
|
||||
bool m_extraDocValid;
|
||||
bool m_rootDocValid;
|
||||
//bool m_gatewayDocValid;
|
||||
bool m_oldMetaListValid;
|
||||
bool m_oldTitleRecValid;
|
||||
bool m_rootTitleRecValid;
|
||||
bool m_isIndexedValid;
|
||||
bool m_siteNumInlinksValid;
|
||||
//bool m_siteNumInlinksUniqueIpValid;//FreshValid;
|
||||
//bool m_siteNumInlinksUniqueCBlockValid;//sitePopValid
|
||||
//bool m_siteNumInlinksTotalValid;
|
||||
bool m_siteNumInlinks8Valid;
|
||||
bool m_siteLinkInfoValid;
|
||||
bool m_isWWWDupValid;
|
||||
@ -1228,7 +1190,6 @@ public:
|
||||
bool m_isSiteRootValid;
|
||||
bool m_wasContentInjectedValid;
|
||||
bool m_outlinkHopCountVectorValid;
|
||||
//bool m_isSpamValid;
|
||||
bool m_isFilteredValid;
|
||||
bool m_urlFilterNumValid;
|
||||
bool m_numOutlinksAddedValid;
|
||||
@ -1245,7 +1206,6 @@ public:
|
||||
bool m_titleValid;
|
||||
bool m_htbValid;
|
||||
bool m_collnumValid;
|
||||
//bool m_twidsValid;
|
||||
bool m_termId32BufValid;
|
||||
bool m_termInfoBufValid;
|
||||
bool m_newTermInfoBufValid;
|
||||
@ -1254,9 +1214,6 @@ public:
|
||||
bool m_spiderStatusDocMetaListValid;
|
||||
bool m_isCompromisedValid;
|
||||
bool m_isNoArchiveValid;
|
||||
//bool m_isVisibleValid;
|
||||
//bool m_clockCandidatesTableValid;
|
||||
//bool m_clockCandidatesDataValid;
|
||||
bool m_titleRecBufValid;
|
||||
bool m_isLinkSpamValid;
|
||||
bool m_isErrorPageValid;
|
||||
@ -1749,7 +1706,6 @@ public:
|
||||
bool m_useTagdb ;
|
||||
bool m_usePlacedb ;
|
||||
//bool m_useTimedb ;
|
||||
bool m_useSectiondb ;
|
||||
//bool m_useRevdb ;
|
||||
bool m_useSecondaryRdbs ;
|
||||
|
||||
|
5
main.cpp
5
main.cpp
@ -8473,13 +8473,11 @@ bool parseTest ( char *coll , int64_t docId , char *query ) {
|
||||
// computeWordIds from xml
|
||||
words.set ( &xml , true , true ) ;
|
||||
bits.set ( &words ,TITLEREC_CURRENT_VERSION, 0);
|
||||
Phrases phrases;
|
||||
phrases.set ( &words,&bits,true,true,TITLEREC_CURRENT_VERSION,0);
|
||||
t = gettimeofdayInMilliseconds_force();
|
||||
for ( int32_t i = 0 ; i < 100 ; i++ )
|
||||
//if ( ! words.set ( &xml , true , true ) )
|
||||
// do not supply xd so it will be set from scratch
|
||||
if ( !sections.set( &words, &phrases, &bits, NULL, 0, NULL, 0, 0 ) )
|
||||
if ( !sections.set( &words, &bits, NULL, 0, NULL, 0, 0 ) )
|
||||
return log("build: speedtestxml: sections set: %s",
|
||||
mstrerror(g_errno));
|
||||
|
||||
@ -8493,6 +8491,7 @@ bool parseTest ( char *coll , int64_t docId , char *query ) {
|
||||
|
||||
|
||||
//Phrases phrases;
|
||||
Phrases phrases;
|
||||
t = gettimeofdayInMilliseconds_force();
|
||||
for ( int32_t i = 0 ; i < 100 ; i++ )
|
||||
if ( ! phrases.set ( &words ,
|
||||
|
@ -28,14 +28,11 @@ static void generateSummary(Summary &summary, char *htmlInput, char *queryStr, c
|
||||
Bits bits;
|
||||
ASSERT_TRUE(bits.set(&words, TITLEREC_CURRENT_VERSION, 0));
|
||||
|
||||
Phrases phrases;
|
||||
ASSERT_TRUE(phrases.set(&words, &bits, true, false, TITLEREC_CURRENT_VERSION, 0));
|
||||
|
||||
Url url;
|
||||
url.set(urlStr);
|
||||
|
||||
Sections sections;
|
||||
ASSERT_TRUE(sections.set(&words, &phrases, &bits, &url, 0, "", 0, CT_HTML));
|
||||
ASSERT_TRUE(sections.set(&words, &bits, &url, 0, "", 0, CT_HTML));
|
||||
|
||||
Query query;
|
||||
ASSERT_TRUE(query.set2(queryStr, langEnglish, true));
|
||||
@ -53,6 +50,9 @@ static void generateSummary(Summary &summary, char *htmlInput, char *queryStr, c
|
||||
Bits bitsForSummary;
|
||||
ASSERT_TRUE(bitsForSummary.setForSummary(&words));
|
||||
|
||||
Phrases phrases;
|
||||
ASSERT_TRUE(phrases.set(&words, &bits, true, false, TITLEREC_CURRENT_VERSION, 0));
|
||||
|
||||
Matches matches;
|
||||
matches.setQuery(&query);
|
||||
ASSERT_TRUE(matches.set(&words, &phrases, &sections, &bitsForSummary, &pos, &xml, &title, &url, &linkInfo, 0));
|
||||
|
Reference in New Issue
Block a user