mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-14 02:36:06 -04:00
Merge branch 'master' of https://github.com/privacore/open-source-search-engine
This commit is contained in:
47
Matches.cpp
47
Matches.cpp
@ -555,37 +555,7 @@ bool Matches::addMatches( Words *words, Phrases *phrases, Sections *sections, Bi
|
||||
int32_t nextMatchWordPos = 0;
|
||||
int32_t lasti = -3;
|
||||
|
||||
int32_t dist = 0;
|
||||
|
||||
// . every tag increments "dist" by a value
|
||||
// . rather than use a switch/case statement, which does a binary
|
||||
// lookup thing which is really slow, let's use a 256 bucket table
|
||||
// for constant lookup, rather than log(N).
|
||||
static char s_tableInit = false;
|
||||
static int8_t s_tab[512];
|
||||
if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; }
|
||||
for ( int32_t i = 0 ; ! s_tableInit && i < 128 ; i++ ) {
|
||||
char step = 0;
|
||||
if ( i == TAG_TR ) step = 2;
|
||||
if ( i == TAG_P ) step = 10;
|
||||
if ( i == TAG_HR ) step = 10;
|
||||
if ( i == TAG_H1 ) step = 10;
|
||||
if ( i == TAG_H2 ) step = 10;
|
||||
if ( i == TAG_H3 ) step = 10;
|
||||
if ( i == TAG_H4 ) step = 10;
|
||||
if ( i == TAG_H5 ) step = 10;
|
||||
if ( i == TAG_H6 ) step = 10;
|
||||
if ( i == TAG_TABLE ) step = 30;
|
||||
if ( i == TAG_BLOCKQUOTE ) step = 10;
|
||||
// default
|
||||
if ( step == 0 ) {
|
||||
if ( g_nodes[i].m_isBreaking ) step = 10;
|
||||
else step = 1;
|
||||
}
|
||||
// account for both the back and the front tags
|
||||
s_tab[i ] = step;
|
||||
}
|
||||
s_tableInit = true;
|
||||
|
||||
// google seems to index SEC_MARQUEE so i took that out of here
|
||||
int32_t badFlags =SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
|
||||
@ -604,29 +574,13 @@ bool Matches::addMatches( Words *words, Phrases *phrases, Sections *sections, Bi
|
||||
//else if (tids && (tids[i]&BACKBITCOMP) == TAG_A)
|
||||
// inAnchTag = false;
|
||||
|
||||
// for each word increment distance
|
||||
dist++;
|
||||
|
||||
//if ( addToMatches && tids && tids[i] ){
|
||||
if ( tids && tids[i] ){
|
||||
int32_t tid = tids[i] & BACKBITCOMP;
|
||||
// accumulate distance
|
||||
dist += s_tab[tid];
|
||||
// monitor boundaries so that the proximity algo
|
||||
// knows when two matches are separated by such tags
|
||||
// MDW: isn't the "dist" good enough for this?????
|
||||
// let's try just using "dist" then.
|
||||
// "crossedSection" is hereby replaced by "dist".
|
||||
//if ( s_tab[tid]
|
||||
// tagIds don't have wids and are skipped
|
||||
continue;
|
||||
}
|
||||
|
||||
// skip if wid is 0, it is not an alnum word then
|
||||
if ( ! wids[i] ) {
|
||||
// and extra unit if it starts with \n i guess
|
||||
if ( words->m_words[i][0] == '\n' ) dist++;
|
||||
// dist += words->m_wordLens[i] / 3;
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -859,7 +813,6 @@ bool Matches::addMatches( Words *words, Phrases *phrases, Sections *sections, Bi
|
||||
m->m_sections = sections;
|
||||
m->m_bits = bits;
|
||||
m->m_pos = pos;
|
||||
m->m_dist = dist;
|
||||
m->m_flags = flags | eflag ;
|
||||
|
||||
// add to our vector. we want to know where each QueryWord
|
||||
|
@ -62,10 +62,6 @@ class Match {
|
||||
// "match group" or type of match. i.e. MF_TITLETAG, MF_METASUMM, ...
|
||||
mf_t m_flags;
|
||||
|
||||
// improve summary generation parms
|
||||
int32_t m_dist;
|
||||
//bool m_crossedSection;
|
||||
|
||||
// . for convenience, these four class ptrs are used by Summary.cpp
|
||||
// . m_wordNum is relative to this "words" class (and scores,bits,pos)
|
||||
class Words *m_words;
|
||||
|
27
Pos.cpp
27
Pos.cpp
@ -82,6 +82,8 @@ bool Pos::set (Words *words, char *f, char *fend, int32_t *len , int32_t a , int
|
||||
// flag for stopping back-to-back spaces. only count those as one char.
|
||||
bool lastSpace = false;
|
||||
int32_t maxCharSize = 4; // we are utf8
|
||||
int in_bad_tags = 0;
|
||||
|
||||
for ( int32_t i = a ; i < b ; i++ ) {
|
||||
if (trunc) {
|
||||
break;
|
||||
@ -94,8 +96,26 @@ bool Pos::set (Words *words, char *f, char *fend, int32_t *len , int32_t a , int
|
||||
|
||||
// is tag?
|
||||
if ( tids && tids[i] ) {
|
||||
if ( f ) {
|
||||
// let's not get from bad tags when filtering into buffer (used for generating summaries)
|
||||
if ( ( tids[i] == TAG_STYLE ) || ( tids[i] == TAG_SCRIPT ) ) {
|
||||
++in_bad_tags;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( in_bad_tags ) {
|
||||
if ( ( ( tids[i] & BACKBITCOMP ) == TAG_STYLE ) ||
|
||||
( ( tids[i] & BACKBITCOMP ) == TAG_SCRIPT ) ) {
|
||||
--in_bad_tags;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// if not breaking, does nothing
|
||||
if ( ! g_nodes[tids[i]&0x7f].m_isBreaking ) continue;
|
||||
if ( ! g_nodes[tids[i]&0x7f].m_isBreaking ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// list tag? <li>
|
||||
if ( tids[i] == TAG_LI ) {
|
||||
if ( f ){
|
||||
@ -153,6 +173,11 @@ bool Pos::set (Words *words, char *f, char *fend, int32_t *len , int32_t a , int
|
||||
continue;
|
||||
}
|
||||
|
||||
// skip words if we're in 'bad' tags
|
||||
if ( in_bad_tags ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// scan through all chars discounting back-to-back spaces
|
||||
|
||||
// assume filters out to the same # of chars
|
||||
|
491
Sections.cpp
491
Sections.cpp
@ -124,14 +124,7 @@ bool Sections::set ( Words *w ,
|
||||
int32_t niceness ,
|
||||
void *state ,
|
||||
void (*callback)(void *state) ,
|
||||
uint8_t contentType ,
|
||||
// from XmlDoc::ptr_sectionsData in a title rec
|
||||
char *sectionsData ,
|
||||
bool sectionsDataValid ,
|
||||
char *sectionsVotes ,
|
||||
//uint64_t tagPairHash ,
|
||||
char *buf ,
|
||||
int32_t bufSize ) {
|
||||
uint8_t contentType ) {
|
||||
|
||||
reset();
|
||||
|
||||
@ -1821,23 +1814,6 @@ bool Sections::set ( Words *w ,
|
||||
///////////////////////////////////////
|
||||
setMenus();
|
||||
|
||||
///////////////////////////////////////
|
||||
//
|
||||
// now set SENT_LIST flags on m_sentFlags
|
||||
//
|
||||
// try to capture sentences that are not menus but are a list of
|
||||
// things. if the sentence itself has a list of short items, or a bunch
|
||||
// of commas, then also set the SEC_LIST flag on it. or if sentence
|
||||
// is part of a sequence of sentences that are a list of sentences then
|
||||
// set it for them as well. typically such sentences will be separated
|
||||
// by a vertical space, have no periods, maybe have an <li> tag or only
|
||||
// have a few words per sentence. this will help us demote search results
|
||||
// that have the query terms in such a list because it is usually not
|
||||
// very useful information.
|
||||
//
|
||||
///////////////////////////////////////
|
||||
setListFlags();
|
||||
|
||||
//verifySections();
|
||||
|
||||
// don't use nsvt/osvt now
|
||||
@ -1856,16 +1832,6 @@ bool Sections::addImpliedSections ( Addresses *aa ) {
|
||||
// no point in going any further if we have nothing
|
||||
if ( m_numSections == 0 ) return true;
|
||||
|
||||
// set this
|
||||
//m_osvt = osvt;
|
||||
|
||||
|
||||
// as part of a replacement for table swoggling which is confusing
|
||||
// and didn't really work right, especially when we had both
|
||||
// table header row and column, we set these on each table cell:
|
||||
// SEC_HASDATEHEADERROW and SEC_HASDATEHEADERCOL
|
||||
if ( ! setTableStuff ( ) ) return false;
|
||||
|
||||
m_aa = aa;
|
||||
|
||||
|
||||
@ -4060,7 +4026,6 @@ bool Sections::setSentFlagsPart2 ( ) {
|
||||
inParens = false;
|
||||
int32_t dollarCount = 0;
|
||||
int32_t priceWordCount = 0;
|
||||
bool hadAt = false;
|
||||
|
||||
// watchout if in a table. the last table column header
|
||||
// should not be applied to the first table cell in the
|
||||
@ -4435,8 +4400,7 @@ bool Sections::setSentFlagsPart2 ( ) {
|
||||
isStopWord = true;
|
||||
// count them
|
||||
if ( isStopWord ) stops++;
|
||||
// set this
|
||||
if ( m_wids[i] == h_at ) hadAt = true;
|
||||
|
||||
// if we end on a stop word that is usually indicative
|
||||
// of something like
|
||||
// "Search Results for <h1>Doughnuts</h1>" as for
|
||||
@ -5270,8 +5234,6 @@ int32_t hasTitleWords ( sentflags_t sflags ,
|
||||
static int64_t h_tickets;
|
||||
static int64_t h_events;
|
||||
static int64_t h_jobs;
|
||||
static int64_t h_this;
|
||||
static int64_t h_series;
|
||||
static int64_t h_total;
|
||||
static int64_t h_times;
|
||||
static int64_t h_purchase;
|
||||
@ -5321,8 +5283,6 @@ int32_t hasTitleWords ( sentflags_t sflags ,
|
||||
h_tickets = hash64n("tickets");
|
||||
h_events = hash64n("events");
|
||||
h_jobs = hash64n("jobs");
|
||||
h_this = hash64n("this");
|
||||
h_series = hash64n("series");
|
||||
h_total = hash64n("total");
|
||||
h_times = hash64n("times");
|
||||
h_purchase = hash64n("purchase");
|
||||
@ -6537,10 +6497,6 @@ int32_t hasTitleWords ( sentflags_t sflags ,
|
||||
to_lower_a(wptrs[i][wlens[i]-1]) == 't' )
|
||||
hadAthon = true;
|
||||
|
||||
//if ( wids[i] == h_this &&
|
||||
// i+2<nw && wids[i+2] == h_series )
|
||||
// log("hey");
|
||||
|
||||
// save it
|
||||
int64_t savedWid = lastWid;
|
||||
// assign
|
||||
@ -7088,7 +7044,6 @@ int32_t Sections::addImpliedSections3 ( ) {
|
||||
//bro = sk;
|
||||
// assume no winner
|
||||
int32_t bestScore = 0;
|
||||
Section *bestBro = NULL;
|
||||
char bestMethod = -1;
|
||||
Partition *bestPart = NULL;
|
||||
// loop over all enumerated methods
|
||||
@ -7147,7 +7102,6 @@ int32_t Sections::addImpliedSections3 ( ) {
|
||||
if ( score <= bestScore ) continue;
|
||||
// is best of all methods so far?
|
||||
bestScore = score;
|
||||
bestBro = bro;
|
||||
bestMethod = m;
|
||||
bestPart = &parts[m];
|
||||
}
|
||||
@ -7186,11 +7140,9 @@ int32_t Sections::addImpliedSections3 ( ) {
|
||||
Section *cc = m_sectionPtrs[a];
|
||||
if ( cc && cc->m_a == a && cc->m_b == b ) continue;
|
||||
// this returns false and sets g_errno on error
|
||||
if ( ! insertSubSection( sk->m_parent ,
|
||||
winnerPart->m_a[i],
|
||||
winnerPart->m_b[i],
|
||||
BH_IMPLIED ) )
|
||||
if ( ! insertSubSection( winnerPart->m_a[i], winnerPart->m_b[i], BH_IMPLIED ) ) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// ok, flag it
|
||||
@ -7214,8 +7166,7 @@ float computeSimilarity2 ( int32_t *vec0 ,
|
||||
int32_t niceness ,
|
||||
SafeBuf *pbuf ,
|
||||
HashTableX *labelTable ,
|
||||
int32_t nv0 ,
|
||||
int32_t nv1 ) {
|
||||
int32_t nv0 ) {
|
||||
// if both empty, assume not similar at all
|
||||
if ( *vec0 == 0 && *vec1 == 0 ) return 0;
|
||||
// if either is empty, return 0 to be on the safe side
|
||||
@ -7379,7 +7330,7 @@ int32_t Sections::getDelimScore ( Section *bro ,
|
||||
// save it
|
||||
Section *start = bro;
|
||||
|
||||
int32_t dh = getDelimHash ( method , delim , start );
|
||||
int32_t dh = getDelimHash ( method , delim );
|
||||
|
||||
// bro must be certain type for some methods
|
||||
if ( dh == -1 ) return -2;
|
||||
@ -7395,8 +7346,6 @@ int32_t Sections::getDelimScore ( Section *bro ,
|
||||
// sanity check... should all be brothers (same parent)
|
||||
if ( delim->m_parent != container ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// the head section of a particular partition's section
|
||||
Section *currDelim = bro;
|
||||
// scores
|
||||
int32_t brosWithWords = 0;
|
||||
int32_t maxBrosWithWords = 0;
|
||||
@ -7453,7 +7402,6 @@ int32_t Sections::getDelimScore ( Section *bro ,
|
||||
|
||||
// reset prev sentence
|
||||
Section *prevSent = NULL;
|
||||
Section *lastBro = NULL;
|
||||
|
||||
// scan the brothers
|
||||
for ( ; ; bro = bro->m_nextBrother ) {
|
||||
@ -7467,7 +7415,7 @@ int32_t Sections::getDelimScore ( Section *bro ,
|
||||
|
||||
// get its hash
|
||||
int32_t h = 0LL ;
|
||||
if ( bro ) h = getDelimHash ( method , bro , start );
|
||||
if ( bro ) h = getDelimHash ( method , bro );
|
||||
|
||||
// . check this out
|
||||
// . don't return 0 because we make a vector of these hashes
|
||||
@ -7480,12 +7428,6 @@ int32_t Sections::getDelimScore ( Section *bro ,
|
||||
// if first time, ignore crap above the first delimeter occurnc
|
||||
if ( ignoreAbove ) continue;
|
||||
|
||||
// update this for insertSubSection()
|
||||
lastBro = bro;
|
||||
|
||||
if ( h == dh )
|
||||
currDelim = bro;
|
||||
|
||||
// count non delimeter sections. at least one section
|
||||
// must have text and not be a delimeter section
|
||||
if ( h != dh && bro && bro->m_firstWordPos >= 0 )
|
||||
@ -7592,8 +7534,7 @@ int32_t Sections::getDelimScore ( Section *bro ,
|
||||
m_niceness ,
|
||||
pbuf ,
|
||||
dbt ,
|
||||
nva ,
|
||||
nvb );
|
||||
nva );
|
||||
// add up all sims
|
||||
if ( cellCount >= 2 ) { // ! firstTime ) {
|
||||
simTotal += sim;
|
||||
@ -7610,8 +7551,6 @@ int32_t Sections::getDelimScore ( Section *bro ,
|
||||
if ( pbuf )
|
||||
minBuf.safeMemcpy ( pbuf );
|
||||
}
|
||||
// a new head
|
||||
//currDelim = bro;
|
||||
// reset vht for next partition cell to call
|
||||
// hashSentenceBits() into
|
||||
vht.clear();
|
||||
@ -8171,7 +8110,7 @@ bool Sections::hashSentPairs (Section *sx ,
|
||||
|
||||
// . don't return 0 because we make a vector of these hashes
|
||||
// and computeSimilarity() assumes vectors are NULL term'd. return -1 instead
|
||||
int32_t Sections::getDelimHash ( char method , Section *bro , Section *head ) {
|
||||
int32_t Sections::getDelimHash ( char method , Section *bro ) {
|
||||
|
||||
// now all must have text!
|
||||
//if ( bro->m_firstWordPos < 0 ) return -1;
|
||||
@ -8370,31 +8309,6 @@ int32_t Sections::getDelimHash ( char method , Section *bro , Section *head ) {
|
||||
// was not getting an implied section set, so let's do away
|
||||
// with the pure dow algo and see what happens.
|
||||
return -1;
|
||||
// must be a sort of heading like "Jul 24"
|
||||
//if ( !(bro->m_flags & SEC_HEADING_CONTAINER) &&
|
||||
// !(bro->m_flags & SEC_HEADING ) )
|
||||
// return -1;
|
||||
if ( ! (bro->m_flags & SEC_HAS_DOW) )
|
||||
return -1;
|
||||
// this is causing core
|
||||
if ( bro->m_tagId == TAG_TC ) return -1;
|
||||
// now it must be all date words
|
||||
int32_t a = bro->m_firstWordPos;
|
||||
int32_t b = bro->m_lastWordPos;
|
||||
// sanity check
|
||||
if ( a < 0 ) { char *xx=NULL;*xx=0; }
|
||||
// scan
|
||||
for ( int32_t i = a ; i <= b ; i++ ) {
|
||||
// breathe
|
||||
QUICKPOLL(m_niceness);
|
||||
// skip if not wid
|
||||
if ( ! m_wids[i] ) continue;
|
||||
// must be in date
|
||||
if ( ! ( m_bits->m_bits[i] & D_IS_IN_DATE ) )
|
||||
return -1;
|
||||
}
|
||||
// do not collide with tagids
|
||||
return 66666;
|
||||
}
|
||||
if ( method == METHOD_ABOVE_DOW ) {
|
||||
// must be a sort of heading like "Jul 24"
|
||||
@ -8503,10 +8417,6 @@ bool Sections::addSentenceSections ( ) {
|
||||
static int64_t h_the;
|
||||
static int64_t h_and;
|
||||
static int64_t h_a;
|
||||
static int64_t h_p;
|
||||
static int64_t h_m;
|
||||
static int64_t h_am;
|
||||
static int64_t h_pm;
|
||||
static int64_t h_http;
|
||||
static int64_t h_https;
|
||||
static int64_t h_room;
|
||||
@ -8516,8 +8426,6 @@ bool Sections::addSentenceSections ( ) {
|
||||
static int64_t h_suite;
|
||||
static int64_t h_ste;
|
||||
static int64_t h_tags;
|
||||
//static int64_t h_noon;
|
||||
//static int64_t h_midnight;
|
||||
if ( ! s_init ) {
|
||||
s_init = true;
|
||||
h_tags = hash64n("tags");
|
||||
@ -8525,10 +8433,6 @@ bool Sections::addSentenceSections ( ) {
|
||||
h_the = hash64n("the");
|
||||
h_and = hash64n("and");
|
||||
h_a = hash64n("a");
|
||||
h_p = hash64n("p");
|
||||
h_m = hash64n("m");
|
||||
h_am = hash64n("am");
|
||||
h_pm = hash64n("pm");
|
||||
h_a = hash64n("a");
|
||||
h_at = hash64n("at");
|
||||
h_for = hash64n("for");
|
||||
@ -8549,8 +8453,6 @@ bool Sections::addSentenceSections ( ) {
|
||||
h_building = hash64n("building");
|
||||
h_suite = hash64n("suite");
|
||||
h_ste = hash64n("ste");
|
||||
//h_noon = hash64n("noon");
|
||||
//h_midnight = hash64n("midnight");
|
||||
}
|
||||
|
||||
// need D_IS_IN_URL bits to be valid
|
||||
@ -8578,7 +8480,6 @@ bool Sections::addSentenceSections ( ) {
|
||||
bool lastWasComma = false;
|
||||
nodeid_t includedTag = -2;
|
||||
int32_t lastbr = -1;
|
||||
bool hasColon = false;
|
||||
bool endOnBr = false;
|
||||
bool endOnBold = false;
|
||||
bool capped = true;
|
||||
@ -8842,11 +8743,6 @@ bool Sections::addSentenceSections ( ) {
|
||||
j == i )
|
||||
break;
|
||||
|
||||
// flag it, but only if not in
|
||||
// a time format like "8:30"
|
||||
if ( j>0 && !is_digit(m_wptrs[j][-1]))
|
||||
hasColon = true;
|
||||
|
||||
// a "::" is used in breadcrumbs,
|
||||
// so break on that.
|
||||
// fixes "Dining :: Visit ::
|
||||
@ -8881,39 +8777,9 @@ bool Sections::addSentenceSections ( ) {
|
||||
|
||||
if ( tagBefore ) break;
|
||||
if ( tagAfter ) break;
|
||||
|
||||
// for now allow it!
|
||||
continue;
|
||||
// do not break http://... though
|
||||
if ( p[1] == '/' ) continue;
|
||||
// or 10:30 etc.
|
||||
if ( is_digit(p[1]) ) continue;
|
||||
if ( j>0 && is_digit(p[-1]) ) continue;
|
||||
// allow trumba titles to have colons
|
||||
// so they can get the TSF_TITLEY
|
||||
// event title boost in Events.cpp
|
||||
if ( m_isTrumba &&
|
||||
sp[j]->m_tagId == TAG_TITLE )
|
||||
continue;
|
||||
// fix guysndollsllc.com which has
|
||||
// "Battle of the Bands with: The Cincy
|
||||
// Rockers, Second Wind, ..."
|
||||
// if last word was a lowercase
|
||||
// and one of these, let it in the
|
||||
// sentence
|
||||
//if ( lastWidPos < 0 )
|
||||
// break;
|
||||
// must have been lowercase
|
||||
if(!is_lower_utf8(m_wptrs[lastWidPos]))
|
||||
break;
|
||||
// and must be one of these words:
|
||||
if ( prevWid == h_with ||
|
||||
// "Send info to: Booking"
|
||||
// from guysndollsllc.com/page4.ht
|
||||
prevWid == h_to ||
|
||||
prevWid == h_and )
|
||||
continue;
|
||||
// otherwise, break it
|
||||
break;
|
||||
}
|
||||
// . special hyphen
|
||||
// . breaks up title for peachpundit.com
|
||||
@ -8961,28 +8827,12 @@ bool Sections::addSentenceSections ( ) {
|
||||
|
||||
// set "next" to next alnum word after us
|
||||
int32_t next = j+1;
|
||||
int64_t nwid = 0LL;
|
||||
int32_t max = next + 10;
|
||||
if ( max > m_nw ) max = m_nw;
|
||||
for ( ; next < max ; next++ ) {
|
||||
if ( ! m_wids[next] ) continue;
|
||||
nwid = m_wids[next];
|
||||
break;
|
||||
}
|
||||
// am. pm.
|
||||
// if prev word was like 'm' as in am or pm
|
||||
// then assume a cap word following ends sent.
|
||||
// although if we got
|
||||
// "At 1 p.m. Bob Jones plays"
|
||||
// then we'd be wrong.
|
||||
bool isAmPm = false;
|
||||
if ( prevWid == h_m &&
|
||||
(prevPrevWid == h_a ||
|
||||
prevPrevWid == h_p ) )
|
||||
isAmPm = true;
|
||||
if ( (prevWid == h_am ||
|
||||
prevWid == h_pm ) )
|
||||
isAmPm = true;
|
||||
|
||||
// was previous word/abbr capitalized?
|
||||
// if so, assume period does not end sentence.
|
||||
@ -9206,7 +9056,7 @@ bool Sections::addSentenceSections ( ) {
|
||||
if ( addb >= m_nw ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// ok, now add the split sentence
|
||||
Section *is =insertSubSection(parent,adda,addb+1,bh);
|
||||
Section *is =insertSubSection(adda,addb+1,bh);
|
||||
// panic?
|
||||
if ( ! is ) return false;
|
||||
// set sentence flag on it
|
||||
@ -9270,8 +9120,7 @@ bool Sections::addSentenceSections ( ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
Section *Sections::insertSubSection ( Section *parentArg , int32_t a , int32_t b ,
|
||||
int32_t newBaseHash ) {
|
||||
Section *Sections::insertSubSection ( int32_t a, int32_t b, int32_t newBaseHash ) {
|
||||
// debug
|
||||
//log("sect: inserting subsection [%"INT32",%"INT32")",a,b);
|
||||
|
||||
@ -9373,11 +9222,11 @@ Section *Sections::insertSubSection ( Section *parentArg , int32_t a , int32_t b
|
||||
m_numSections--;
|
||||
char *xx=NULL;*xx=0;
|
||||
return NULL;
|
||||
sk->m_next = m_rootSection;//m_rootSection;
|
||||
sk->m_prev = NULL;
|
||||
//m_sections[0].m_prev = sk;
|
||||
m_rootSection->m_prev = sk;
|
||||
m_rootSection = sk;
|
||||
// sk->m_next = m_rootSection;//m_rootSection;
|
||||
// sk->m_prev = NULL;
|
||||
// //m_sections[0].m_prev = sk;
|
||||
// m_rootSection->m_prev = sk;
|
||||
// m_rootSection = sk;
|
||||
} else {
|
||||
// insert us into the linked list of sections
|
||||
if ( si->m_next ) si->m_next->m_prev = sk;
|
||||
@ -9541,96 +9390,6 @@ Section *Sections::insertSubSection ( Section *parentArg , int32_t a , int32_t b
|
||||
}
|
||||
|
||||
return sk;
|
||||
|
||||
// start scanning here
|
||||
Section *start = parent->m_next;
|
||||
|
||||
int32_t lastb = -1;
|
||||
// try just scanning sections in parent
|
||||
for ( Section *sx = start ; sx ; sx = sx->m_next ) {
|
||||
// breathe
|
||||
QUICKPOLL ( m_niceness );
|
||||
// get it
|
||||
//Section *sx = &m_sections[xx];
|
||||
// skip if section ends before our sentence begins
|
||||
if ( sx->m_b <= a ) continue;
|
||||
// stop if beyond sk
|
||||
if ( sx->m_a >= b ) break;
|
||||
// skip if sn not parent
|
||||
if ( sx->m_parent != parent ) continue;
|
||||
// when splitting a section do not reparent if
|
||||
// not in our split...
|
||||
//if ( sx->m_a >= b ) continue;
|
||||
// do not reparent if it contains us
|
||||
if ( sx->m_a <= a && sx->m_b >= b ) continue;
|
||||
// reset his parent to the newly added section
|
||||
sx->m_parent = sk;
|
||||
// and or his flags into us. SEC_HAS_DOM, etc.
|
||||
sk->m_flags |= sx->m_flags & mask;
|
||||
// sanity check
|
||||
if ( sx->m_b > sk->m_b ) { char *xy=NULL;*xy=0; }
|
||||
if ( sx->m_a < sk->m_a ) { char *xy=NULL;*xy=0; }
|
||||
// skip if already got the xor for this section
|
||||
if ( sx->m_a < lastb ) continue;
|
||||
// set this
|
||||
lastb = sx->m_b;
|
||||
// add all the entries from this child section from the
|
||||
// phone/email/etc. tables
|
||||
sk->m_phoneXor ^= sx->m_phoneXor;
|
||||
sk->m_emailXor ^= sx->m_emailXor;
|
||||
sk->m_priceXor ^= sx->m_priceXor;
|
||||
sk->m_todXor ^= sx->m_todXor;
|
||||
sk->m_dayXor ^= sx->m_dayXor;
|
||||
sk->m_addrXor ^= sx->m_addrXor;
|
||||
// make sure did not make it zero
|
||||
if ( sx->m_phoneXor && sk->m_phoneXor == 0 )
|
||||
sk->m_phoneXor = sx->m_phoneXor;
|
||||
if ( sx->m_emailXor && sk->m_emailXor == 0 )
|
||||
sk->m_emailXor = sx->m_emailXor;
|
||||
if ( sx->m_priceXor && sk->m_priceXor == 0 )
|
||||
sk->m_priceXor = sx->m_priceXor;
|
||||
if ( sx->m_todXor && sk->m_todXor == 0 )
|
||||
sk->m_todXor = sx->m_todXor;
|
||||
if ( sx->m_dayXor && sk->m_dayXor == 0 )
|
||||
sk->m_dayXor = sx->m_dayXor;
|
||||
if ( sx->m_addrXor && sk->m_addrXor == 0 )
|
||||
sk->m_addrXor = sx->m_addrXor;
|
||||
// set this perhaps
|
||||
if ( sk->m_firstPlaceNum < 0 )
|
||||
sk->m_firstPlaceNum = sx->m_firstPlaceNum;
|
||||
// update this?
|
||||
if ( sx->m_alnumPosA < 0 ) continue;
|
||||
// take the first one we get
|
||||
if ( sk->m_alnumPosA == -1 )
|
||||
sk->m_alnumPosA = sx->m_alnumPosA;
|
||||
// update to the last one always
|
||||
sk->m_alnumPosB = sx->m_alnumPosB;
|
||||
}
|
||||
|
||||
// a flag
|
||||
bool needsFirst = true;
|
||||
// . set the words ptrs to it
|
||||
// . TODO: can later speed up with ptr to ptr logic
|
||||
for ( int32_t yy = a ; yy < b ; yy++ ) {
|
||||
// breathe
|
||||
QUICKPOLL ( m_niceness );
|
||||
// and first/last word pos
|
||||
if ( m_wids[yy] ) {
|
||||
// mark this
|
||||
if ( needsFirst ) {
|
||||
sk->m_firstWordPos = yy;
|
||||
needsFirst = false;
|
||||
}
|
||||
// remember last
|
||||
sk->m_lastWordPos = yy;
|
||||
}
|
||||
// must have had sn as parent
|
||||
if ( m_sectionPtrs[yy] != parent ) continue;
|
||||
// "sk" becomes the new parent
|
||||
m_sectionPtrs[yy] = sk;
|
||||
}
|
||||
|
||||
return sk;
|
||||
}
|
||||
|
||||
// for brbr and hr splitting delimiters
|
||||
@ -9666,8 +9425,6 @@ int32_t Sections::splitSectionsByTag ( nodeid_t tagid ) {
|
||||
for ( ; first->m_prevBrother ; first = first->m_prevBrother )
|
||||
// breathe
|
||||
QUICKPOLL(m_niceness);
|
||||
// save parent
|
||||
Section *parent = first->m_parent;
|
||||
|
||||
subloop:
|
||||
// mark it
|
||||
@ -9715,7 +9472,7 @@ int32_t Sections::splitSectionsByTag ( nodeid_t tagid ) {
|
||||
// and must group together something meaningful
|
||||
numTextSections >= 2 ) {
|
||||
// do the insertion
|
||||
Section *sk = insertSubSection (parent,a,b,BH_IMPLIED);
|
||||
Section *sk = insertSubSection (a,b,BH_IMPLIED);
|
||||
// error?
|
||||
if ( ! sk ) return -1;
|
||||
// fix it
|
||||
@ -9792,7 +9549,7 @@ bool Sections::splitSections ( char *delimeter , int32_t dh ) {
|
||||
//
|
||||
// try this now
|
||||
//
|
||||
Section *sk = insertSubSection ( sn , start , i , dh );
|
||||
Section *sk = insertSubSection ( start , i , dh );
|
||||
|
||||
// do not resplit this split section with same delimeter!!
|
||||
if ( sk ) sk->m_processedHash = dh;
|
||||
@ -9900,7 +9657,6 @@ SectionVotingTable::SectionVotingTable ( ) {
|
||||
//bool Sections::gotSectiondbList ( bool *needsRecall ) {
|
||||
bool SectionVotingTable::addListOfVotes ( RdbList *list,
|
||||
key128_t **lastKey ,
|
||||
uint32_t tagPairHash ,
|
||||
int64_t myDocId ,
|
||||
int32_t niceness ) {
|
||||
|
||||
@ -11296,27 +11052,6 @@ bool Sections::containsTagId ( Section *si, nodeid_t tagId ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Stub: does no table processing; the body is a single unconditional
// "return true", so callers always see success.
bool Sections::setTableStuff ( ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// . does table have a date header row or column?
|
||||
// . we need this for our weekly schedule detection algorithm
|
||||
// . many sites have a row header this is the days of the week
|
||||
// . sometimes they have tods in the first column, and sometimes they
|
||||
// just put the tods and tod ranges in the table cells directly.
|
||||
// . sets Section::m_flags SEC_HASDATEHEADERCOL/ROW for JUST the table
|
||||
// section if it indeed has such date headers
|
||||
// . Dates::isCompatible() looks at that table flag to see if it should
|
||||
// apply special processing when deciding if two dates should be paired
|
||||
// . then we set DF_TABLEDATEHEADERROW/COL for the dates in those
|
||||
// header rows/cols so that we can set SF_RECURRING_DOW if the dow date
|
||||
// was in the header row/col
|
||||
// NOTE: this body is a stub. The parameter "ts" is never read and the
// function unconditionally returns true; none of the header row/column
// detection described in the comment preceding this function is
// performed here.
bool Sections::setTableDateHeaders ( Section *ts ) {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// . just the voting info for passing into diffbot in json
|
||||
// . along w/ the title/summary/etc. we can return this json blob for each search result
|
||||
bool Sections::printVotingInfoInJSON ( SafeBuf *sb ) {
|
||||
@ -11354,10 +11089,8 @@ bool Sections::print2 ( SafeBuf *sbuf ,
|
||||
char *diversityVec,
|
||||
char *wordSpamVec,
|
||||
char *fragVec,
|
||||
HashTableX *st2 ,
|
||||
HashTableX *tt ,
|
||||
Addresses *aa ,
|
||||
char format ) { // bool forProCog ){
|
||||
char format ) {
|
||||
//FORMAT_PROCOG FORMAT_JSON HTML
|
||||
|
||||
//sbuf->safePrintf("<b>Sections in Document</b>\n");
|
||||
@ -11367,10 +11100,6 @@ bool Sections::print2 ( SafeBuf *sbuf ,
|
||||
|
||||
m_sbuf->setLabel ("sectprnt");
|
||||
|
||||
//m_pt = pt;
|
||||
//m_et = et;
|
||||
//m_at = at;
|
||||
//m_priceTable = priceTable;
|
||||
m_aa = aa;
|
||||
m_hiPos = hiPos;
|
||||
|
||||
@ -11991,7 +11720,6 @@ bool Sections::setRegistrationBits ( ) {
|
||||
static int64_t h_request;
|
||||
static int64_t h_requesting;
|
||||
static int64_t h_get;
|
||||
static int64_t h_enroll;
|
||||
static int64_t h_buy;
|
||||
static int64_t h_presale ;
|
||||
static int64_t h_pre ;
|
||||
@ -12004,7 +11732,6 @@ bool Sections::setRegistrationBits ( ) {
|
||||
static int64_t h_box; // box office for newmexicojazzfestival.org
|
||||
static int64_t h_office;
|
||||
static int64_t h_ticket;//ticket window for newmexicojazzfestival.org
|
||||
static int64_t h_online;
|
||||
static int64_t h_window;
|
||||
static int64_t h_patron;
|
||||
static int64_t h_service;
|
||||
@ -12043,7 +11770,6 @@ bool Sections::setRegistrationBits ( ) {
|
||||
h_requesting = hash64n("requesting");
|
||||
h_request = hash64n("request");
|
||||
h_get = hash64n("get");
|
||||
h_enroll = hash64n("enroll");
|
||||
h_buy = hash64n("buy");
|
||||
h_presale = hash64n("presale");
|
||||
h_pre = hash64n("pre");
|
||||
@ -12069,7 +11795,6 @@ bool Sections::setRegistrationBits ( ) {
|
||||
h_box = hash64n("box");
|
||||
h_office = hash64n("office");
|
||||
h_ticket = hash64n("ticket");
|
||||
h_online = hash64n("online");
|
||||
h_window = hash64n("window");
|
||||
h_patron = hash64n("patron");
|
||||
h_service = hash64n("service");
|
||||
@ -12186,7 +11911,6 @@ bool Sections::setRegistrationBits ( ) {
|
||||
if ( wid == h_sign && nextWid == h_up ) gotIt = 1;
|
||||
if ( wid == h_signup ) gotIt = 1;
|
||||
if ( wid == h_buy && nextWid == h_ticket ) gotIt = 1;
|
||||
//if ( wid == h_buy && nextWid == h_online ) gotIt = 1;
|
||||
if ( wid == h_purchase&&nextWid==h_ticket ) gotIt = 1;
|
||||
if ( wid == h_get && nextWid==h_ticket ) gotIt = 1;
|
||||
// for that jimmy kimmel live url "requesting tickets online"
|
||||
@ -13102,100 +12826,95 @@ bool Sectiondb::verify ( char *coll ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Stub: sets no flags on any section or sentence; the body is a single
// unconditional "return true".
bool Sections::setListFlags ( ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Sections::growSections ( ) {
|
||||
// make a log note b/c this should not happen a lot because it's slow
|
||||
log("build: growing sections!");
|
||||
g_errno = EDOCBADSECTIONS;
|
||||
return true;
|
||||
// record old buf start
|
||||
char *oldBuf = m_sectionBuf.getBufStart();
|
||||
// grow by 20MB at a time
|
||||
if ( ! m_sectionBuf.reserve ( 20000000 ) ) return false;
|
||||
// for fixing ptrs:
|
||||
char *newBuf = m_sectionBuf.getBufStart();
|
||||
// set the new max
|
||||
m_maxNumSections = m_sectionBuf.getCapacity() / sizeof(Section);
|
||||
// update ptrs in the old sections
|
||||
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
||||
// breathe
|
||||
QUICKPOLL(m_niceness);
|
||||
Section *si = &m_sections[i];
|
||||
if ( si->m_parent ) {
|
||||
char *np = (char *)si->m_parent;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_parent = (Section *)np;
|
||||
}
|
||||
if ( si->m_next ) {
|
||||
char *np = (char *)si->m_next;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_next = (Section *)np;
|
||||
}
|
||||
if ( si->m_prev ) {
|
||||
char *np = (char *)si->m_prev;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_prev = (Section *)np;
|
||||
}
|
||||
if ( si->m_listContainer ) {
|
||||
char *np = (char *)si->m_listContainer;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_listContainer = (Section *)np;
|
||||
}
|
||||
if ( si->m_prevBrother ) {
|
||||
char *np = (char *)si->m_prevBrother;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_prevBrother = (Section *)np;
|
||||
}
|
||||
if ( si->m_nextBrother ) {
|
||||
char *np = (char *)si->m_nextBrother;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_nextBrother = (Section *)np;
|
||||
}
|
||||
if ( si->m_sentenceSection ) {
|
||||
char *np = (char *)si->m_sentenceSection;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_sentenceSection = (Section *)np;
|
||||
}
|
||||
if ( si->m_prevSent ) {
|
||||
char *np = (char *)si->m_prevSent;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_prevSent = (Section *)np;
|
||||
}
|
||||
if ( si->m_nextSent ) {
|
||||
char *np = (char *)si->m_nextSent;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_nextSent = (Section *)np;
|
||||
}
|
||||
if ( si->m_tableSec ) {
|
||||
char *np = (char *)si->m_tableSec;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_tableSec = (Section *)np;
|
||||
}
|
||||
if ( si->m_headColSection ) {
|
||||
char *np = (char *)si->m_headColSection;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_headColSection = (Section *)np;
|
||||
}
|
||||
if ( si->m_headRowSection ) {
|
||||
char *np = (char *)si->m_headRowSection;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_headRowSection = (Section *)np;
|
||||
}
|
||||
if ( si->m_leftCell ) {
|
||||
char *np = (char *)si->m_leftCell;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_leftCell = (Section *)np;
|
||||
}
|
||||
if ( si->m_aboveCell ) {
|
||||
char *np = (char *)si->m_aboveCell;
|
||||
np = np - oldBuf + newBuf;
|
||||
si->m_aboveCell = (Section *)np;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
// // record old buf start
|
||||
// char *oldBuf = m_sectionBuf.getBufStart();
|
||||
// // grow by 20MB at a time
|
||||
// if ( ! m_sectionBuf.reserve ( 20000000 ) ) return false;
|
||||
// // for fixing ptrs:
|
||||
// char *newBuf = m_sectionBuf.getBufStart();
|
||||
// // set the new max
|
||||
// m_maxNumSections = m_sectionBuf.getCapacity() / sizeof(Section);
|
||||
// // update ptrs in the old sections
|
||||
// for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
||||
// // breathe
|
||||
// QUICKPOLL(m_niceness);
|
||||
// Section *si = &m_sections[i];
|
||||
// if ( si->m_parent ) {
|
||||
// char *np = (char *)si->m_parent;
|
||||
// np = np - oldBuf + newBuf;
|
||||
// si->m_parent = (Section *)np;
|
||||
// }
|
||||
// if ( si->m_next ) {
|
||||
// char *np = (char *)si->m_next;
|
||||
// np = np - oldBuf + newBuf;
|
||||
// si->m_next = (Section *)np;
|
||||
// }
|
||||
// if ( si->m_prev ) {
|
||||
// char *np = (char *)si->m_prev;
|
||||
// np = np - oldBuf + newBuf;
|
||||
// si->m_prev = (Section *)np;
|
||||
// }
|
||||
// if ( si->m_listContainer ) {
|
||||
// char *np = (char *)si->m_listContainer;
|
||||
// np = np - oldBuf + newBuf;
|
||||
// si->m_listContainer = (Section *)np;
|
||||
// }
|
||||
// if ( si->m_prevBrother ) {
|
||||
// char *np = (char *)si->m_prevBrother;
|
||||
// np = np - oldBuf + newBuf;
|
||||
// si->m_prevBrother = (Section *)np;
|
||||
// }
|
||||
// if ( si->m_nextBrother ) {
|
||||
// char *np = (char *)si->m_nextBrother;
|
||||
// np = np - oldBuf + newBuf;
|
||||
// si->m_nextBrother = (Section *)np;
|
||||
// }
|
||||
// if ( si->m_sentenceSection ) {
|
||||
// char *np = (char *)si->m_sentenceSection;
|
||||
// np = np - oldBuf + newBuf;
|
||||
// si->m_sentenceSection = (Section *)np;
|
||||
// }
|
||||
// if ( si->m_prevSent ) {
|
||||
// char *np = (char *)si->m_prevSent;
|
||||
// np = np - oldBuf + newBuf;
|
||||
// si->m_prevSent = (Section *)np;
|
||||
// }
|
||||
// if ( si->m_nextSent ) {
|
||||
// char *np = (char *)si->m_nextSent;
|
||||
// np = np - oldBuf + newBuf;
|
||||
// si->m_nextSent = (Section *)np;
|
||||
// }
|
||||
// if ( si->m_tableSec ) {
|
||||
// char *np = (char *)si->m_tableSec;
|
||||
// np = np - oldBuf + newBuf;
|
||||
// si->m_tableSec = (Section *)np;
|
||||
// }
|
||||
// if ( si->m_headColSection ) {
|
||||
// char *np = (char *)si->m_headColSection;
|
||||
// np = np - oldBuf + newBuf;
|
||||
// si->m_headColSection = (Section *)np;
|
||||
// }
|
||||
// if ( si->m_headRowSection ) {
|
||||
// char *np = (char *)si->m_headRowSection;
|
||||
// np = np - oldBuf + newBuf;
|
||||
// si->m_headRowSection = (Section *)np;
|
||||
// }
|
||||
// if ( si->m_leftCell ) {
|
||||
// char *np = (char *)si->m_leftCell;
|
||||
// np = np - oldBuf + newBuf;
|
||||
// si->m_leftCell = (Section *)np;
|
||||
// }
|
||||
// if ( si->m_aboveCell ) {
|
||||
// char *np = (char *)si->m_aboveCell;
|
||||
// np = np - oldBuf + newBuf;
|
||||
// si->m_aboveCell = (Section *)np;
|
||||
// }
|
||||
// }
|
||||
// return true;
|
||||
}
|
||||
|
||||
|
219
Sections.h
219
Sections.h
@ -33,8 +33,6 @@
|
||||
// . these are descriptive flags, they are computed when Sections is set
|
||||
// . SEC_NOTEXT sections do not vote, i.e. they are not stored in Sectiondb
|
||||
#define SEC_NOTEXT 0x0001 // implies section has no alnum words
|
||||
//#define SEC_ARTICLE 0x0002 // section is SV_UNIQUE and SV_TEXTY
|
||||
//#define SEC_DUP 0x0004 // content hash repeated on same site
|
||||
|
||||
// . Weights.cpp zeroes out the weights for these types of sections
|
||||
// . is section delimited by the <script> tag, <marquee> tag, etc.
|
||||
@ -43,9 +41,6 @@
|
||||
#define SEC_SELECT 0x0020
|
||||
#define SEC_MARQUEE 0x0040
|
||||
#define SEC_CONTAINER 0x0080
|
||||
// . is section in anchor text
|
||||
// . is section delimeted by the <a href...> tag
|
||||
//#define SEC_A 0x0080
|
||||
|
||||
// . in title/header. for gigabits in XmlDoc.cpp
|
||||
// . is section delimited by <title> or <hN> tags?
|
||||
@ -69,8 +64,6 @@
|
||||
#define SEC_HEADING 0x200000
|
||||
|
||||
// reasons why a section is not an event
|
||||
//#define SEC_MULT_PLACES 0x008000
|
||||
//#define SEC_IS_MENUITEM 0x00040000 // in a list of menu items?
|
||||
#define SEC_UNBALANCED 0x00400000 // interlaced section/tags
|
||||
#define SEC_OPEN_ENDED 0x00800000 // no closing tag found
|
||||
#define SEC_SENTENCE 0x01000000 // made by a sentence?
|
||||
@ -115,26 +108,11 @@
|
||||
#define SEC_MULTIDIMS 0x0008000000000000LL
|
||||
#define SEC_HASHXPATH 0x0010000000000000LL
|
||||
|
||||
//#define SEC_HAS_ADDRESS 0x08000000
|
||||
//#define SEC_ADDRESS_CONTAINER 0x40000000
|
||||
//#define SEC_HAS_STOREHOURS 0x01000000 // event is really just store hours
|
||||
//#define SEC_HAS_NONSTOREHOURS 0x02000000
|
||||
//#define SEC_HAS_NON_EVENT_DATE 0x04000000
|
||||
|
||||
|
||||
// . some random-y numbers for Section::m_baseHash
|
||||
// . used by splitSection() function
|
||||
//#define BH_BR -1113348753
|
||||
//#define BH_BRBR 3947503
|
||||
//#define BH_HR 1378153634
|
||||
//#define BH_H1 -1788814047
|
||||
//#define BH_H2 -1170023066
|
||||
//#define BH_H3 -132582659
|
||||
//#define BH_H4 2095609929
|
||||
#define BH_BULLET 7845934
|
||||
#define BH_SENTENCE 4590649
|
||||
#define BH_IMPLIED 95468323
|
||||
//#define BH_IMPLIED_LIST 9434499
|
||||
|
||||
// values for Section::m_sentFlags (sentence flags)
|
||||
#define SENT_HAS_COLON 0x00000001
|
||||
@ -342,12 +320,7 @@ public:
|
||||
// used by Events.cpp to count # of timeofdays in section
|
||||
//class Event *m_event;
|
||||
|
||||
// for Events class
|
||||
//uint8_t m_numAddresses;
|
||||
//class Address *m_address;
|
||||
// for Events class, usually streets!
|
||||
//uint8_t m_numPlaces;
|
||||
//class Place *m_place;
|
||||
class Addresses *m_aa;
|
||||
|
||||
// . if we are an element in a list, what is the list container section
|
||||
@ -357,10 +330,6 @@ public:
|
||||
// . used to set SEC_HAS_MENUBROTHER flag
|
||||
class Section *m_listContainer;
|
||||
|
||||
// if we are a header, of what list are we a header of?
|
||||
//class Section *m_headerOfList;
|
||||
|
||||
|
||||
// the sibling section before/after us. can be NULL.
|
||||
class Section *m_prevBrother;
|
||||
class Section *m_nextBrother;
|
||||
@ -453,30 +422,9 @@ public:
|
||||
// for debug output display of color coded nested sections
|
||||
uint32_t m_colorHash;
|
||||
|
||||
// like tag hash but only the tag ids, no hashed attributes or
|
||||
// virtual section base hashes
|
||||
//int32_t m_formatHash;
|
||||
|
||||
// tagid of this section, 0 means none (like sentence section, etc.)
|
||||
nodeid_t m_tagId;
|
||||
|
||||
/*
|
||||
// used by addImpliedSections()
|
||||
int32_t getBaseHash2 ( ) {
|
||||
// fix for funkefiredarts.com since one of the header tags
|
||||
// has a different tag attribute, but it says "Monday". so
|
||||
// treat all these special headers the same since it is
|
||||
// critical we get these type of implied sections right, lest
|
||||
// we hurt our date telscoping.
|
||||
if ( m_flags & SEC_HAS_DOM_DOW ) return 22222;
|
||||
if ( m_flags&SEC_HEADING_CONTAINER) return m_baseHash^0x789123;
|
||||
else return m_baseHash;
|
||||
};
|
||||
*/
|
||||
|
||||
//int32_t getBaseHash3 ();
|
||||
|
||||
|
||||
// usually just the m_tagId, but hashes in the class attributes of
|
||||
// div and span tags, etc. to make them unique
|
||||
uint32_t m_baseHash;
|
||||
@ -491,25 +439,12 @@ public:
|
||||
// these deal with enumerated tags and are used by Events.cpp
|
||||
int32_t m_occNum;
|
||||
int32_t m_numOccurences;
|
||||
// section with same m_tagHash and before you
|
||||
//class Section *m_prevSibling;
|
||||
|
||||
// used by XmlDoc.cpp to set a topological distance
|
||||
int32_t m_topDist;
|
||||
//int32_t m_sortedIndex;
|
||||
|
||||
// all the parent tags are enumerated, but the kid (youngest tag)
|
||||
// is not enumerated
|
||||
//int32_t m_enumTagHash;
|
||||
|
||||
// . tag hash which disregards non-breaking or tags with no back tags
|
||||
// . used by Events.cpp
|
||||
//int32_t m_hardTagHash;
|
||||
|
||||
// hash of all the alnum words DIRECTLY in this section
|
||||
uint64_t m_contentHash64;
|
||||
// if section contains words indirectly, then store xor'ed wids in here
|
||||
//int32_t m_contentHash2;
|
||||
|
||||
uint64_t m_sentenceContentHash64;
|
||||
|
||||
@ -523,12 +458,6 @@ public:
|
||||
// uses m_sentenceContentHash64 (for sentences)
|
||||
uint64_t m_indirectSentHash64;
|
||||
|
||||
// for voting! we basically ignore numbers and dates, months, etc.
|
||||
// for doing this hash so that if the date changes from page to page
|
||||
// it will still be recognized as a "dup section" and m_votesForDup
|
||||
// should be high
|
||||
//uint32_t m_voteHash32;
|
||||
|
||||
// . range of words in Words class we encompass
|
||||
// . m_wordStart and m_wordEnd are the tag word #'s
|
||||
// . ACTUALLY it is a half-closed interval [a,b) like all else
|
||||
@ -539,31 +468,12 @@ public:
|
||||
int32_t m_b;//wordEnd;
|
||||
|
||||
// for event titles and descriptions
|
||||
//float m_titleScore;
|
||||
//float m_descScore;
|
||||
//titleflags_t m_titleFlags;
|
||||
sentflags_t m_sentFlags;
|
||||
|
||||
// bits set based on turk votes. see the TB_* bits in XmlDoc.h
|
||||
//turkbits_t m_turkBits;
|
||||
|
||||
// alnum count for us and all sections we contain
|
||||
//int32_t m_alnumCount;
|
||||
|
||||
// . # alnum words only in this and only this section
|
||||
// . if we have none, we are SEC_NOTEXT
|
||||
int32_t m_exclusive;
|
||||
|
||||
// like above, but word must also NOT be in a hyperlink
|
||||
//int32_t m_plain;
|
||||
|
||||
// Address.cpp uses this
|
||||
//char m_numBackToBackSubsections;
|
||||
//nodeid_t m_lastTid;
|
||||
|
||||
// # of times this section appears in this doc
|
||||
//int32_t m_totalOccurences;
|
||||
|
||||
// our depth. # of tags in the hash
|
||||
int32_t m_depth;
|
||||
|
||||
@ -574,61 +484,15 @@ public:
|
||||
int32_t m_mark;
|
||||
|
||||
// Events.cpp assigns a date to each section
|
||||
//int32_t m_fullDate;
|
||||
//class Date *m_datePtr;
|
||||
int32_t m_firstDate;
|
||||
|
||||
char m_used;
|
||||
|
||||
//int32_t m_numTods;
|
||||
|
||||
// the event section we contain. used by Events.cpp
|
||||
//class Section *m_eventSec;
|
||||
|
||||
// used by Events.cpp for determining what range of events a section
|
||||
// contains. we store that range in Events::hash() when we index each
|
||||
// word into datedb for events.
|
||||
//int32_t m_minEventId;
|
||||
//int32_t m_maxEventId;
|
||||
|
||||
// used in Sections::splitSections() function
|
||||
int32_t m_processedHash;
|
||||
|
||||
int32_t m_gbFrameNum;
|
||||
|
||||
// . support event ids from 0 to 255
|
||||
// . this increases the sizeof this class from 160 to 192 bytes
|
||||
//char m_evIdBits[32];
|
||||
// how many bits in the above array are set?
|
||||
//int16_t m_numEventIdBits;
|
||||
|
||||
/*
|
||||
bool hasEventId ( int32_t evId ) {
|
||||
// this is an overflow condition...
|
||||
if ( evId > 255 ) return false;
|
||||
// -1 or 0 means not associated with any event id since
|
||||
// all eventIds are >= 1
|
||||
if ( m_minEventId <= 0 ) return false;
|
||||
if ( evId < m_minEventId ) return false;
|
||||
if ( evId > m_maxEventId ) return false;
|
||||
unsigned char bitMask = 1 << (evId % 8);
|
||||
return m_evIdBits[evId/8] & bitMask;
|
||||
};
|
||||
|
||||
void addEventId ( int32_t eid ) {
|
||||
if ( eid >= 256 ) return;
|
||||
unsigned char bitMask = 1 << (eid % 8);
|
||||
unsigned char byteOff = eid / 8;
|
||||
if ( m_evIdBits[byteOff] & bitMask ) return;
|
||||
m_evIdBits[byteOff] |= bitMask;
|
||||
m_numEventIdBits++;
|
||||
if ( m_minEventId <= 0 || m_minEventId > eid )
|
||||
m_minEventId = eid;
|
||||
if ( m_maxEventId <= 0 || m_maxEventId < eid )
|
||||
m_maxEventId = eid;
|
||||
};
|
||||
*/
|
||||
|
||||
// do we contain section "arg"?
|
||||
bool contains ( class Section *arg ) {
|
||||
return ( m_a <= arg->m_a && m_b >= arg->m_b ); };
|
||||
@ -675,12 +539,7 @@ class Sections {
|
||||
int32_t niceness ,
|
||||
void *state ,
|
||||
void (*callback)(void *state) ,
|
||||
uint8_t contentType ,
|
||||
char *sectionsData,
|
||||
bool sectionsDataValid ,
|
||||
char *sectionsData2,
|
||||
char *buf ,
|
||||
int32_t bufSize ) ;
|
||||
uint8_t contentType ) ;
|
||||
|
||||
|
||||
bool addVotes(class SectionVotingTable *nsvt, uint32_t tagPairHash );
|
||||
@ -721,17 +580,10 @@ class Sections {
|
||||
char *diversityVec,
|
||||
char *wordSpamVec,
|
||||
char *fragVec,
|
||||
class HashTableX *st2 ,
|
||||
class HashTableX *tt ,
|
||||
class Addresses *aa ,
|
||||
char format = FMT_HTML ); // bool forProCog );
|
||||
char format = FMT_HTML );
|
||||
bool printSectionDiv ( class Section *sk , char format = FMT_HTML );
|
||||
//bool forProCog = false ) ;
|
||||
class SafeBuf *m_sbuf;
|
||||
//class HashTableX *m_pt;
|
||||
//class HashTableX *m_et;
|
||||
//class HashTableX *m_at;
|
||||
//class HashTableX *m_priceTable;
|
||||
|
||||
char *getSectionsReply ( int32_t *size );
|
||||
char *getSectionsVotes ( int32_t *size );
|
||||
@ -739,13 +591,10 @@ class Sections {
|
||||
bool isHardSection ( class Section *sn );
|
||||
|
||||
bool setMenus ( );
|
||||
bool setListFlags ( );
|
||||
|
||||
bool setFormTableBits ( ) ;
|
||||
bool setTableRowsAndCols ( class Section *tableSec ) ;
|
||||
bool setTableHeaderBits ( class Section *table );
|
||||
bool setTableStuff ( ) ;
|
||||
bool setTableDateHeaders ( class Section *ts ) ;
|
||||
bool setTableScanPtrs ( class Section *ts ) ;
|
||||
|
||||
void setHeader ( int32_t r , class Section *first , sec_t flag ) ;
|
||||
@ -765,7 +614,6 @@ class Sections {
|
||||
class Url *m_url ;
|
||||
int64_t m_docId ;
|
||||
int64_t m_siteHash64 ;
|
||||
//int64_t m_tagPairHash;
|
||||
char *m_coll ;
|
||||
void *m_state ;
|
||||
void (*m_callback) ( void *state );
|
||||
@ -797,7 +645,6 @@ class Sections {
|
||||
bool m_waitInLine;
|
||||
int32_t m_articleStartWord;
|
||||
int32_t m_articleEndWord;
|
||||
//int32_t m_totalSimilarLayouts;
|
||||
bool m_hadArticle;
|
||||
int32_t m_numInvalids;
|
||||
int32_t m_totalSiteVoters;
|
||||
@ -848,13 +695,6 @@ class Sections {
|
||||
|
||||
int32_t m_numSentenceSections;
|
||||
|
||||
// . the section ptrs sorted by Section::m_a
|
||||
// . since we set SEC_FAKE from splitSections() those new sections
|
||||
// are appended on m_sections[] array and are out of order, so
|
||||
// we merge sort the two sublists of m_sections[] and put the
|
||||
// pointers into here...
|
||||
//class Section **m_sorted;
|
||||
|
||||
bool m_isTestColl;
|
||||
|
||||
// assume no malloc
|
||||
@ -870,15 +710,8 @@ class Sections {
|
||||
char **m_wptrs;
|
||||
nodeid_t *m_tids;
|
||||
|
||||
//int32_t addImpliedSections ( bool needHR );
|
||||
//int32_t addHeaderImpliedSections ( );
|
||||
|
||||
//int32_t addImpliedSectionsOld ( );
|
||||
//int32_t getHeadingScore ( class Section *sk , int32_t baseHash );
|
||||
|
||||
// the new way
|
||||
bool addImpliedSections ( class Addresses *aa );//, HashTableX *svt );
|
||||
//HashTableX *m_svt;
|
||||
|
||||
bool setSentFlagsPart1 ( );
|
||||
bool setSentFlagsPart2 ( );
|
||||
@ -899,10 +732,7 @@ class Sections {
|
||||
char method,
|
||||
class Section *delim ,
|
||||
class Partition *part );
|
||||
int32_t getDelimHash ( char method , class Section *bro ,
|
||||
class Section *head ) ;
|
||||
//int32_t m_totalHdrCount;
|
||||
//bool m_called;
|
||||
int32_t getDelimHash ( char method , class Section *bro ) ;
|
||||
|
||||
bool addImpliedLists ( ) ;
|
||||
int32_t getDelimScore2 ( class Section *bro,
|
||||
@ -926,10 +756,7 @@ class Sections {
|
||||
|
||||
bool addSentenceSections ( ) ;
|
||||
|
||||
class Section *insertSubSection ( class Section *parent ,
|
||||
int32_t a ,
|
||||
int32_t b ,
|
||||
int32_t newBaseHash ) ;
|
||||
class Section *insertSubSection ( int32_t a, int32_t b, int32_t newBaseHash ) ;
|
||||
|
||||
int32_t splitSectionsByTag ( nodeid_t tagid ) ;
|
||||
bool splitSections ( char *delimeter , int32_t dh );
|
||||
@ -1040,7 +867,6 @@ class SectionVotingTable {
|
||||
// stock table from a sectiondb rdblist
|
||||
bool addListOfVotes ( RdbList *list,
|
||||
key128_t **lastKey ,
|
||||
uint32_t tagPairHash ,
|
||||
int64_t docId ,
|
||||
int32_t niceness ) ;
|
||||
|
||||
@ -1105,26 +931,7 @@ class SectionVotingTable {
|
||||
#define SV_EURDATEFMT 3 // DateParse2.cpp. contains european date fmt
|
||||
#define SV_EVENT 4 // used in Events.cpp to indicate event container
|
||||
#define SV_ADDRESS 5 // used in Events.cpp to indicate address container
|
||||
// . place types here
|
||||
// . these #define's are used for values of Place::m_type in Events.cpp too!
|
||||
// . score is from 0 to 1.0 which is probability section is a place container
|
||||
// for the specified place type
|
||||
// . used by Events.cpp for address extraction
|
||||
/*
|
||||
#define SV_PLACE_NAME_1 7 // places now have two names
|
||||
#define SV_PLACE_NAME_2 8 // places now have two names
|
||||
#define SV_PLACE_STREET 9
|
||||
#define SV_PLACE_CITY 10
|
||||
#define SV_PLACE_ZIP 11
|
||||
#define SV_PLACE_SUITE 12
|
||||
#define SV_PLACE_ADM1 13
|
||||
#define SV_PLACE_ADM2 14
|
||||
#define SV_PLACE_ADM3 15
|
||||
#define SV_PLACE_ADM4 16
|
||||
#define SV_PLACE_CTRY 17
|
||||
#define SV_PLACE_SCH 18
|
||||
#define SV_PLACE_PRK 19
|
||||
*/
|
||||
|
||||
// . HACK: the "date" is not the enum tag hash, but is the tagPairHash for this
|
||||
// . every doc has just one of these describing the entire layout of the page
|
||||
// . basically looking for these is same as doing a gbtaghash: query
|
||||
@ -1133,25 +940,11 @@ class SectionVotingTable {
|
||||
// . this allows us to detect a duplicate section even though the layout
|
||||
// of the web page is not quite the same, but is from the same site
|
||||
#define SV_TAGCONTENTHASH 21
|
||||
// . HACK: a statistic
|
||||
// . the voter that had the max SectionVote::m_numSampled
|
||||
// . the m_numSampled for this statistic is his m_numSampled
|
||||
// . if we find that a section is not unique (i.e. repeated) on just one
|
||||
// voting document, then we think it is probably a comment and we do not
|
||||
// set the SEC_ARTICLE flag for that section
|
||||
//#define SV_TEXTY_MAX_SAMPLED 22
|
||||
// . HACK: the "date" is not the enum tag hash, but is the tagPairHash!
|
||||
// . indicates this doc is waiting in line for enough docs from its site
|
||||
// with the same page layout (tagpairhash) to become indexed so that it can
|
||||
// make an informed decision in regards to eliminating comment sections
|
||||
// and determining article sections
|
||||
//#define SV_WAITINLINE 23
|
||||
|
||||
// now Dates.cpp sets these too
|
||||
#define SV_FUTURE_DATE 24
|
||||
#define SV_PAST_DATE 25
|
||||
#define SV_CURRENT_DATE 26
|
||||
//#define SV_DUP 27
|
||||
//#define SV_NOT_DUP 28
|
||||
#define SV_SITE_VOTER 29
|
||||
#define SV_TURKTAGHASH 30
|
||||
|
||||
|
18
XmlDoc.cpp
18
XmlDoc.cpp
@ -5424,13 +5424,7 @@ Sections *XmlDoc::getExplicitSections ( ) {
|
||||
m_niceness ,
|
||||
m_masterState , // state
|
||||
m_masterLoop , // callback
|
||||
*ct ,
|
||||
NULL , // sd // sections data
|
||||
true , // sections data valid?
|
||||
NULL , // sv // for m_nsvt
|
||||
//*tph ,
|
||||
NULL , // buf
|
||||
0 )) { // bufSize
|
||||
*ct )) {
|
||||
m_calledSections = true;
|
||||
// sanity check, this should not block, we are setting
|
||||
// exclusively from the titleRec
|
||||
@ -6438,7 +6432,6 @@ SectionVotingTable *XmlDoc::getOldSectionVotingTable ( ) {
|
||||
// occurs in the document.
|
||||
if ( ! m_osvt.addListOfVotes(&m_secdbList,
|
||||
&lastKey,
|
||||
*tph,
|
||||
*d , // docid
|
||||
m_niceness))
|
||||
return NULL;
|
||||
@ -30703,12 +30696,7 @@ SafeBuf *XmlDoc::getSampleForGigabitsJSON ( ) {
|
||||
m_niceness ,
|
||||
NULL,//m_masterState , // state
|
||||
NULL,//m_masterLoop , // callback
|
||||
CT_JSON, // *ct ,
|
||||
NULL , // sd // sections data
|
||||
true , // sections data valid?
|
||||
NULL , // sv // for m_nsvt
|
||||
NULL , // buf
|
||||
0 )) { // bufSize
|
||||
CT_JSON )) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -34963,8 +34951,6 @@ bool XmlDoc::printRainbowSections ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
diversityVec,
|
||||
wordSpamVec,
|
||||
fragVec,
|
||||
NULL,
|
||||
NULL ,
|
||||
&m_addresses ,
|
||||
true );
|
||||
return true;
|
||||
|
8
main.cpp
8
main.cpp
@ -8806,13 +8806,7 @@ bool parseTest ( char *coll , int64_t docId , char *query ) {
|
||||
// do not supply xd so it will be set from scratch
|
||||
if ( ! sections.set (&words,&phrases,&bits,NULL,0,0,
|
||||
NULL,0,NULL,NULL,
|
||||
0, // contenttype
|
||||
NULL, // sectionsdata
|
||||
false, // sectionsdatavalid
|
||||
NULL, // sectionsdata2
|
||||
//0, // tagpairhash
|
||||
NULL, // buf
|
||||
0)) // bufSize
|
||||
0))
|
||||
return log("build: speedtestxml: sections set: %s",
|
||||
mstrerror(g_errno));
|
||||
|
||||
|
Reference in New Issue
Block a user