privacore-open-source-searc.../Sections.cpp

3450 lines
100 KiB
C++
Raw Permalink Blame History

// print events should print &nbsp; if nothing else to print
// when a div tag's parent truncates its section, it may have been
// paired up with a div back tag which then should become free...
// that is the problem... because those back tags are unpaired.
// so your parent should constrain you as SOON as it is constrained and
// close you up at that point. that way you cannot falsely pair-claim
// a div back tag.
#include "Sections.h"
#include "Url.h"
#include "tokenizer.h"
#include "Conf.h"
#include "XmlDoc.h"
#include "Bits.h"
#include "sort.h"
#include "Abbreviations.h"
#include "StopWords.h"
#include "Process.h"
#include "Posdb.h"
#include "GbUtil.h"
#include "Errno.h"
// Builds an empty Sections object; all real work happens later in set().
Sections::Sections() {
	// give m_sections a defined value up front, then let reset() put every
	// member (this one included) into its empty state
	m_sections = nullptr;
	reset();
}
// Returns the object to its freshly-constructed state: both backing buffers
// are purged and every pointer/counter describing the last parsed document
// is cleared. Called by the constructor, the destructor and at the top of
// set().
void Sections::reset() {
	// release the backing storage first; m_sections and m_sectionPtrs are
	// set from these buffers in set(), so they are nulled right after
	m_sectionBuf.purge();
	m_sectionPtrBuf.purge();
	m_sections    = nullptr;
	m_sectionPtrs = nullptr;

	// bookkeeping for the section array
	m_numSections    = 0;
	m_maxNumSections = 0;
	m_nw             = 0;

	// cursors into the section list
	m_rootSection   = nullptr;
	m_lastSection   = nullptr;
	m_lastAdded     = nullptr;
	m_firstSentence = nullptr;

	// inputs remembered from the last set() call
	// (m_tr, m_contentType, m_isRSSExt, m_maxNumSections were flagged by Coverity)
	m_tr          = nullptr;
	m_bits        = nullptr;
	m_contentType = 0;
	m_isRSSExt    = false;
}
// Destructor: reset() purges both buffers and clears all remaining state.
Sections::~Sections() {
	reset();
}
#define TXF_MATCHED 1

// One entry of the open-tag stack used by Sections::set(): it records a front
// tag that was pushed and has not been popped yet.
struct Tagx {
	// tag id of the front tag that was pushed
	nodeid_t m_tid;
	// index into m_sections[] of the section opened by that tag
	int32_t  m_secNum;
	// bit flags; TXF_MATCHED is set once a back tag has claimed this entry
	char     m_flags;
};
// I lowered this from 1000 to 300 so that we are more sensitive to malformed
// pages, because they typically seem to take longer to parse. I also added
// some new logic for dealing with table <tr> and <td> back tags that allows us
// to pop the other contained tags right away rather than delaying it until
// we are done, because that delay would often breach this stack.
#define MAXTAGSTACK 300
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
// . sets m_sections[] array, 1-1 with words array "w"
// . the Weights class can look at these sections and zero out the weights
// for words in script, style, select and marquee sections
bool Sections::set(const TokenizerResult *tr, Bits *bits, const Url *url, uint8_t contentType ) {
reset();
if ( ! tr ) return true;
if ( tr->size() > 1000000 ) {
log("sections: over 1M words. skipping sections set for "
"performance.");
return true;
}
// save it
m_tr = tr;
m_bits = bits;
m_contentType = contentType;
// reset this just in case
g_errno = 0;
if ( tr->empty() ) return true;
// shortcuts
int32_t nw = tr->size();
m_isRSSExt = false;
const char *ext = url->getExtension();
if ( ext && strcasecmp(ext,"rss") == 0 ) m_isRSSExt = true;
if ( m_contentType == CT_XML ) m_isRSSExt = true;
// . how many sections do we have max?
// . init at one to count the root section
int32_t max = 1;
for ( int32_t i = 0 ; i < nw ; i++ ) {
const auto &token = (*tr)[i];
// . count all front tags
// count back tags too since some url
// http://www.tedxhz.com/tags.asp?id=3919&id2=494 had a bunch
// of </p> tags with no front tags and it cored us because
// m_numSections > m_maxNumSections!
if ( token.nodeid ) {
max += 2;
// . any punct tag could have a bullet in it...
// . or if its a period could make a sentence section
} else if ( !token.is_alfanum ) {
// only do not count simple spaces
if ( token.token_len == 1 && is_wspace_a(token.token_start[0]))
continue;
// otherwise count it as sentence delimeter
max++;
}
}
// . then \0 allows for a sentence too!
// . fix doc that was just "localize-sf-prod\n"
max++;
// and each section may create a sentence section
max *= 2;
// truncate if excessive.
if ( max > 1000000 ) {
log("sections: truncating max sections to 1000000");
max = 1000000;
}
int32_t need = max * sizeof(Section);
// set this
m_maxNumSections = max;
m_sectionPtrBuf.setLabel("psectbuf");
// separate buf now for section ptr for each word
if ( ! m_sectionPtrBuf.reserve ( nw *sizeof(Section *)) ) return true;
m_sectionPtrs = (Section **)m_sectionPtrBuf.getBufStart();
// allocate m_sectionBuf
m_sections = NULL;
m_sectionBuf.setLabel ( "sectbuf" );
if ( ! m_sectionBuf.reserve ( need ) )
return true;
// point into it
m_sections = (Section *)m_sectionBuf.getBufStart();
// save this too
m_nw = nw;
// stack of front tags we encounter
Tagx stack[MAXTAGSTACK];
Tagx *stackPtr = stack;
Section *current = NULL;
Section *rootSection = NULL;
// assume none
m_rootSection = NULL;
// only add root section if we got some words
if ( nw > 0 ) {
// record this i guess
rootSection = &m_sections[m_numSections];
// clear
memset ( rootSection , 0 , sizeof(Section) );
// . the current section we are in
// . let's use a root section
current = rootSection;
// init that to be the whole page
rootSection->m_b = nw;
// save it
m_rootSection = rootSection;
// to fix a core dump
rootSection->m_baseHash = 1;
// advance
m_numSections++;
}
// Sections are no longer 1-1 with words, just with front tags
for ( int32_t i = 0 ; i < nw ; i++ ) {
const auto &token = (*tr)[i];
nodeid_t fullTid = token.nodeid;
// are we a non-tag?
if ( ! fullTid ) {
continue;
}
// make a single section for input tags
if ( fullTid == TAG_INPUT ||
fullTid == TAG_HR ||
fullTid == TAG_COMMENT ) {
// try to realloc i guess. should keep ptrs in tact.
if ( m_numSections >= m_maxNumSections) {
g_errno = EDOCBADSECTIONS;
return true;
}
// get the section
Section *sn = &m_sections[m_numSections];
// clear
memset ( sn , 0 , sizeof(Section) );
// inc it
m_numSections++;
// sanity check - breach check
if ( m_numSections > max ) { g_process.shutdownAbort(true); }
// set our parent
sn->m_parent = current;
// need to keep a word range that the section covers
sn->m_a = i;
// section consists of just this tag
sn->m_b = i + 1;
// go on to next
continue;
}
// a section of multiple br tags in a sequence
if ( fullTid == TAG_BR ) {
// try to realloc i guess. should keep ptrs in tact.
if ( m_numSections >= m_maxNumSections) {
g_errno = EDOCBADSECTIONS;
return true;
}
// get the section
Section *sn = &m_sections[m_numSections];
// clear
memset ( sn , 0 , sizeof(Section) );
// inc it
m_numSections++;
// sanity check - breach check
if ( m_numSections > max ) { g_process.shutdownAbort(true); }
// set our parent
sn->m_parent = current;
// need to keep a word range that the section covers
sn->m_a = i;
// count em up
int32_t brcnt = 1;
// scan for whole sequence
int32_t lastBrPos = i;
for ( int32_t j = i + 1 ; j < nw ; j++ ) {
const auto &token2 = (*tr)[j];
// claim br tags
if ( token2.nodeid == TAG_BR ) {
lastBrPos = j;
brcnt++;
continue;
}
// break on words
if ( token2.is_alfanum ) break;
// all spaces is ok
if ( is_wspace_utf8_string(token2.token_start,token2.token_end()) ) continue;
// otherwise, stop on other punct
break;
}
// section consists of just this tag
sn->m_b = lastBrPos + 1;
// advance
i = lastBrPos;
// set this for later so that getDelimHash() returns
// something different based on the br count for
// METHOD_ATTRIBUTE
sn->m_baseHash = 19999 + brcnt;
// go on to next
continue;
}
// get the tag id without the back bit
nodeid_t tid = fullTid & BACKBITCOMP;
// . ignore tags with no corresponding back tags
// . if they have bad html and have front tags
// with no corresponding back tags, that will hurt!
// . make exception for <li> tag!!!
// . was messing up:
// http://events.kqed.org/events/index.php?com=detail&
// eID=9812&year=2009&month=11
// for parsing out events
// . make excpetion for <p> tag too! most ppl use </p>
if ( ( ! hasBackTag ( tid ) ||
token.token_start[1] =='!' || // <!ENTITY rdfns...>
token.token_start[1] =='?' ) &&
tid != TAG_P &&
tid != TAG_LI )
continue;
// . these imply no back tag
// . <description />
// . fixes inconsistency in
// www.trumba.com/calendars/KRQE_Calendar.rss
if ( token.token_start[token.token_len-2] == '/' && tid == TAG_XMLTAG )
continue;
// do not breach the stack
if ( stackPtr - stack >= MAXTAGSTACK ) {
log( LOG_WARN, "html: stack breach for %s",url->getUrl());
// if we set g_errno and return then the url just
// ends up getting retried once the spider lock
// in Spider.cpp expires in MAX_LOCK_AGE seconds.
// about an hour. but really we should completely
// give up on this. whereas we should retry OOM errors
// etc. but this just means bad html really.
// just reset to 0 sections then
reset();
return true;
}
char gotBackTag ;
if ( fullTid != tid ) gotBackTag = 1;
else gotBackTag = 0;
// "pop tid", tid to pop off stack
nodeid_t ptid = tid;
nodeid_t fullPopTid = fullTid;
// no nested <li> tags allowed
if ( fullTid == TAG_LI &&
stackPtr > stack &&
((stackPtr-1)->m_tid)==TAG_LI )
gotBackTag = 2;
// no nested <b> tags allowed
if ( fullTid == TAG_B &&
stackPtr > stack &&
((stackPtr-1)->m_tid)==TAG_B )
gotBackTag = 2;
// no nested <a> tags allowed
if ( fullTid == TAG_A &&
stackPtr > stack &&
((stackPtr-1)->m_tid)==TAG_A )
gotBackTag = 2;
// no nested <p> tags allowed
if ( fullTid == TAG_P &&
stackPtr > stack &&
((stackPtr-1)->m_tid)==TAG_P )
gotBackTag = 2;
// no <hN> tags inside a <p> tag
// fixes http://www.law.berkeley.edu/140.htm
if ( fullTid >= TAG_H1 &&
fullTid <= TAG_H5 &&
stackPtr > stack &&
((stackPtr-1)->m_tid)==TAG_P ) {
// match this on stack
ptid = TAG_P;
fullPopTid = TAG_P;
gotBackTag = 2;
}
// no nested <td> tags allowed
if ( fullTid == TAG_TD &&
stackPtr > stack &&
((stackPtr-1)->m_tid)==TAG_TD )
gotBackTag = 2;
// encountering <tr> when in a <td> closes the <td> AND
// should also close the <tr>!!
if ( fullTid == TAG_TR &&
stackPtr > stack &&
((stackPtr-1)->m_tid)==TAG_TD )
gotBackTag = 2;
// no nested <tr> tags allowed
if ( fullTid == TAG_TR &&
stackPtr > stack &&
((stackPtr-1)->m_tid)==TAG_TR )
gotBackTag = 2;
// this is true if we are a BACK TAG
if ( gotBackTag ) {
// ignore span tags that are non-breaking because they
// do not change the grouping/sectioning behavior of
// the web page and are often abused.
if ( ptid == TAG_SPAN ) continue;
// fix for gwair.org
if ( ptid == TAG_FONT ) continue;
// too many people use these like a <br> tag or
// make them open-ended or unbalanced
//if ( tid == TAG_P ) continue;
if ( ptid == TAG_CENTER ) continue;
subloop:
// don't blow the stack
if ( stackPtr == stack ) continue;
// point to it
Tagx *spp = (stackPtr - 1);
// init it
Tagx *p ;
// scan through the stack until we find a
// front tag that matches this back tag
//for(p = spp ; p >= stack && gotBackTag == 1 ; p-- ) {
for ( p = spp ; p >= stack ; p-- ) {
// no match?
if ( p->m_tid != ptid ) {
// matched before? we can pop
if ( p->m_flags & TXF_MATCHED )
continue;
// keep on going
continue;
}
// do not double match
if ( p->m_flags & TXF_MATCHED )
continue;
// flag it cuz we matched it
p->m_flags |= TXF_MATCHED;
// set the stack ptr to it
spp = p;
// and stop
break;
}
// no matching front tag at all?
// then just ignore this back tag
if ( p < stack ) continue;
// get section number of the front tag
//int32_t xn = *(secNumPtr-1);
int32_t xn = spp->m_secNum;
// sanity
if ( xn<0 || xn>=m_numSections ) {g_process.shutdownAbort(true);}
// get it
Section *sn = &m_sections[xn];
// record the word range of the secion we complete
sn->m_b = i+1;
// do not include the <li> tag as part of it
// otherwise we end up with overlapping section since
// this tag ALSO starts a section!!
if ( gotBackTag == 2 ) sn->m_b = i;
// if our parent got closed before "sn" closed because
// it hit its back tag before we hit ours, then we
// must cut ourselves short and try to match this
// back tag to another front tag on the stack
Section *ps = sn->m_parent;
for ( ; ps != rootSection ; ps = ps->m_parent ) {
// skip if parent no longer contains us!
if ( ps->m_b <= sn->m_a ) continue;
// skip if this parent is still open
if ( ps->m_b <= 0 ) continue;
// parent must have closed before us
if ( ps->m_b > sn->m_b ) {g_process.shutdownAbort(true);}
// cut our end shorter
sn->m_b = ps->m_b;
// our TXF_MATCHED bit should still be set
// for spp->m_flags, so try to match ANOTHER
// front tag with this back tag now
if ( ! ( spp->m_flags & TXF_MATCHED ) ) {
g_process.shutdownAbort(true); }
// ok, try to match this back tag with another
// front tag on the stack, because the front
// tag we had selected got cut short because
// its parent forced it to cut short.
goto subloop;
}
// sanity check
if ( sn->m_b <= sn->m_a ) { g_process.shutdownAbort(true);}
// revert it to this guy, may not equal stackPtr-1 !!
stackPtr = spp;
// get parent section
if ( stackPtr > stack ) {
// get parent section now
xn = (stackPtr-1)->m_secNum;
// set current to that
current = &m_sections[xn];
}
else {
// i guess this is bad html!
current = rootSection;
}
// debug log
if ( g_conf.m_logDebugSections ) {
const char *ms = "";
if ( stackPtr->m_tid != ptid) ms =" UNMATCHED";
const char *back ="";
if ( fullPopTid & BACKBIT ) back = "/";
logf(LOG_DEBUG,"section: pop tid=%" PRId32" "
"i=%" PRId32" "
"level=%" PRId32" "
"%s%s "
//"h=0x%" PRIx32
"%s",(int32_t)tid,
i,
(int32_t)(stackPtr - stack),
back,g_nodes[tid].m_nodeName,
//h,
ms);
}
// . if we were a back tag, we are done... but if we
// were a front tag, we must add ourselves below...
// . MDW: this seems more logical than the if-statement
// below...
if ( fullTid != tid ) continue;
}
if ( tid == TAG_CENTER ) continue;
if ( tid == TAG_SPAN ) continue;
// gwair.org has font tags the pair up a date "1st Sundays"
// with the address above it, and it shouldn't do that!
if ( tid == TAG_FONT ) continue;
// try to realloc i guess. should keep ptrs in tact.
if ( m_numSections >= m_maxNumSections) {
g_errno = EDOCBADSECTIONS;
return true;
}
// get the section
Section *sn = &m_sections[m_numSections];
// clear
memset ( sn , 0 , sizeof(Section) );
// inc it
m_numSections++;
// sanity check - breach check
if ( m_numSections > max ) { g_process.shutdownAbort(true); }
// set our parent
sn->m_parent = current;
// set this
current = sn;
// need to keep a word range that the section covers
sn->m_a = i;
// assume no terminating bookend
sn->m_b = -1;
// push a unique id on the stack so we can pop if we
// enter a subsection
stackPtr->m_tid = tid;
stackPtr->m_secNum = m_numSections - 1;
stackPtr->m_flags = 0;
stackPtr++;
// debug log
if ( ! g_conf.m_logDebugSections ) continue;
logf(LOG_DEBUG,"section: push tid=%" PRId32" "
"i=%" PRId32" "
"level=%" PRId32" "
"%s "
,
(int32_t)tid,
i,
(int32_t)(stackPtr - stack)-1,
g_nodes[(int32_t)tid].m_nodeName
);
}
// if first word in a section false outside of the parent section
// then reparent to the grandparent. this can happen when we end
// up closing a parent section before ???????
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
// get it
Section *si = &m_sections[i];
// skip if we are still open-ended
if ( si->m_b < 0 ) continue;
// get parent
Section *sp = si->m_parent;
// skip if no parent
if ( ! sp ) continue;
// skip if parent still open ended
if ( sp->m_b < 0 ) continue;
// subloop it
doagain:
// skip if no parent
if ( ! sp ) continue;
// parent must start before us
if ( sp->m_a > si->m_a ) { g_process.shutdownAbort(true); }
// . does parent contain our first word?
// . it need not fully contain our last word!!!
if ( sp->m_a <= si->m_a && sp->m_b > si->m_a ) continue;
// if parent is open ended, then it is ok for now
if ( sp->m_a <= si->m_a && sp->m_b == -1 ) continue;
// get grandparent
sp = sp->m_parent;
// set
si->m_parent = sp;
// try again
goto doagain;
}
bool inGbFrame = false;
int32_t gbFrameNum = 0;
bool inIFrame = false;
//
// . set Section::m_xmlNameHash for xml tags here
// . set Section::m_frameNum and SEC_IN_GBFRAME bit
//
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
// get it
Section *sn = &m_sections[i];
// get it
int32_t ws = sn->m_a;
const auto &token = (*m_tr)[ws];
// shortcut
nodeid_t tid = token.nodeid;
if (tid == TAG_IFRAME) {
//if the section doesn't have the closing iframe tag then set inIFrame
bool hasClosingIframeTag = false;
for(int j=sn->m_b-1; j>i; j--) {
if((*m_tr)[j].nodeid == (TAG_IFRAME|BACKBIT)) {
hasClosingIframeTag = true;
break;
}
}
if(!hasClosingIframeTag)
inIFrame = true;
else if(!inGbFrame)
sn->m_flags |= SEC_IN_IFRAME;
} else if (tid == (TAG_IFRAME | BACKBIT)) { //never happens how sentences are currently split
inIFrame = false;
} else if ( tid == TAG_GBFRAME ) {
// start or end?
gbFrameNum++;
inGbFrame = true;
} else if ( tid == (TAG_GBFRAME | BACKBIT) ) {
inGbFrame = false;
}
if (inIFrame && !inGbFrame)
sn->m_flags |= SEC_IN_IFRAME;
// mark it
if (inGbFrame)
sn->m_gbFrameNum = gbFrameNum;
// custom xml tag, hash the tag itself
if ( tid != TAG_XMLTAG ) continue;
// stop at first space to avoid fields!!
const char *p = token.token_start + 1;
const char *pend = p + token.token_len;
// skip back tags
if ( *p == '/' ) continue;
// reset hash
int64_t xh = 0;
// and hash char count
unsigned char cnt = 0;
// hash till space or / or >
for ( ; p < pend ; p++ ) {
// stop on space or / or >
if ( is_wspace_a(*p) ) break;
if ( *p == '/' ) break;
if ( *p == '>' ) break;
// hash it in
xh ^= g_hashtab[cnt++][(unsigned char )*p];
}
// if it is a string of the same chars it can be 0
if ( ! xh ) xh = 1;
// store that
sn->m_xmlNameHash = (int32_t)xh;
}
//TODO: implement section m_flags inheritance correctly. Currently SEC_IN_IFRAME/SEC_HIDDEN/... are not inherited by child sections.
// find any open ended tags and constrain them based on their parent
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
// get it
Section *si = &m_sections[i];
// get its parent
Section *ps = si->m_parent;
// if parent is open-ended panic!
if ( ps && ps->m_b < 0 ) { g_process.shutdownAbort(true); }
// if our parent got constrained from under us, we need
// to telescope to a new parent
for ( ; ps && ps->m_b >= 0 && ps->m_b <= si->m_a ; ) {
ps = ps->m_parent;
si->m_parent = ps;
}
// assume end is end of doc
int32_t end = m_tr->size();
// get end of parent
if ( ps ) end = ps->m_b;
// shrink our section if parent ends before us OR if we
// are open ended
if ( si->m_b != -1 && si->m_b <= end ) continue;
// this might constrain someone's parent such that
// that someone no longer can use that parent!!
si->m_b = end;
// . get our tag type
// . use int32_t instead of nodeid_t so we can re-set this
// to the xml tag hash if we need to
int32_t tid1 = (*m_tr)[si->m_a].nodeid;
// use the tag hash if this is an xml tag
if ( tid1 == TAG_XMLTAG ) {
// we computed this above
tid1 = si->m_xmlNameHash;
// skip if zero!
if ( ! tid1 ) continue;
}
// must be there to be open ended
if ( ! tid1 ) { g_process.shutdownAbort(true); }
// NOW, see if within that parent there is actually another
// tag after us of our same tag type, then use that to
// constrain us instead!!
// this hurts <p><table><tr><td><p>.... because it
// uses that 2nd <p> tag to constrain si->m_b of the first
// <p> tag which is not right! sunsetpromotions.com has that.
for ( int32_t j = i + 1 ; j < m_numSections ; j++ ) {
// get it
Section *sj = &m_sections[j];
// get word start
int32_t a = sj->m_a;
// skip if ties with us already
if ( a == si->m_a ) continue;
// stop if out
if ( a >= end ) break;
// . it must be in the same expanded frame src, if any
// . this fixes trulia.com which was ending our html
// tag, which was open-ended, with the html tag in
// a frame src expansion
if ( sj->m_gbFrameNum != si->m_gbFrameNum ) continue;
// fix sunsetpromotions.com bug. see above.
if ( sj->m_parent != si->m_parent ) continue;
// get its tid
int32_t tid2 = (*m_tr)[a].nodeid;
// use base hash if xml tag
if ( tid2 == TAG_XMLTAG )
tid2 = sj->m_xmlNameHash;
// must be our tag type!
if ( tid2 != tid1 ) continue;
// ok end us there instead!
si->m_b = a;
// stop
break;
}
}
// reparent again now that things are closed
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
// get it
Section *si = &m_sections[i];
// skip if we are still open-ended
if ( si->m_b < 0 ) { g_process.shutdownAbort(true); }
// get parent
Section *sp = si->m_parent;
// skip if null
if ( ! sp ) continue;
// skip if parent still open ended
if ( sp->m_b < 0 ) { g_process.shutdownAbort(true); }
// subloop it
doagain2:
// skip if no parent
if ( ! sp ) continue;
// . does parent contain our first word?
// . it need not fully contain our last word!!!
if ( sp->m_a <= si->m_a && sp->m_b > si->m_a ) continue;
// if parent is open ended, then it is ok for now
if ( sp->m_a <= si->m_a && sp->m_b == -1 ) continue;
// if parent is open ended, then it is ok for now
if ( sp->m_b == -1 ) { g_process.shutdownAbort(true); }
// get grandparent
sp = sp->m_parent;
// set
si->m_parent = sp;
// try again
goto doagain2;
}
//
//
// now assign m_sectionPtrs[] which map a word to the first
// section that contains it
//
//
Section *dstack[MAXTAGSTACK];
int32_t ns = 0;
int32_t j = 0;
current = m_rootSection;//&m_sections[0];
Section *next = m_rootSection;//&m_sections[0];
// first print the html lines out
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
// pop all off the stack that match us
for ( ; ns>0 && dstack[ns-1]->m_b == i ; ) {
ns--;
current = dstack[ns-1];
}
// push our current section onto the stack if i equals
// its first word #
for ( ; next && i == next->m_a ; ) {
dstack[ns++] = next;
// set our current section to this now
current = next;
// get next section for setting "next"
j++;
// if no more left, set "next" to NULL and stop loop
if ( j >= m_numSections ) { next=NULL; break; }
// grab it
next = &m_sections[j];
}
// assign
m_sectionPtrs[i] = current;
}
// . addImpliedSections() requires Section::m_baseHash
// . set Section::m_baseHash
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
// these have to be in order of sn->m_a to work right
// because we rely on the parent tag hash, which would not
// necessarily be set if we were not sorted, because the
// parent section could have SEC_FAKE flag set because it is
// a br section added afterwards.
Section *sn = &m_sections[i];
// get word start into "ws"
int32_t ws = sn->m_a;
const auto &token = (*m_tr)[ws];
// shortcut
nodeid_t tid = token.nodeid;
// sanity check, <a> guys are not sections
//if ( tid == TAG_A &&
// !(sn->m_flags & SEC_SENTENCE) ) { g_process.shutdownAbort(true); }
// use a modified tid as the tag hash?
int64_t mtid = tid;
// custom xml tag, hash the tag itself
if ( tid == TAG_XMLTAG )
mtid = hash32 ( token.token_start,token.token_len );
// an unknown tag like <!! ...->
if ( tid == 0 )
mtid = 1;
// . if we are a div tag, mod it
// . treat the fields in the div tag as
// part of the tag hash.
// . helps Events.cpp be more precise about
// section identification!!!!
// . we now do this for TD and TR so Nov 2009 can telescope for
// http://10.5.1.203:8000/test/doc.17096238520293298312.html
// so the calendar title "Nov 2009" can affect all dates
// below the calendar.
if ( tid == TAG_DIV ||
tid == TAG_TD ||
tid == TAG_TR ||
tid == TAG_LI || // newmexico.org urls class=xxx
tid == TAG_UL || // newmexico.org urls class=xxx
tid == TAG_P || // <p class="pstrg"> stjohnscollege.edu
tid == TAG_SPAN ) {
// get ptr
const char *p = token.token_start;
// skip <
p++;
// skip following alnums, that is the tag name
for ( ; is_alnum_a(*p) ; p++ );
// scan for "id" or "class" in it
// . i had to increase this because we were missing
// some stuff causing us to get the wrong implied
// sections for
// www.guysndollsllc.com/page5/page4/page4.html
// causing "The Remains" to be paired up with
// "Aug 7, 2010" in an implied section which was
// just wrong. it was 20, i made it 100...
const char *pend = p + 100;
// position ptr
unsigned char cnt = 0;
// a flag
bool skipTillSpace = false;
// . just hash every freakin char i guess
// . TODO: maybe don't hash "width" for <td><tr>
for ( ; *p && *p !='>' && p < pend ; p++ ) {
// skip bgcolor= tags because panjea.org
// interlaces different colored <tr>s in the
// table and i want them to be seen as brother
// sections, mostly for the benefit of the
// setting of lastBrother1/2 in Events.cpp
if ( is_wspace_a(p[0]) &&
to_lower_a (p[1])=='b' &&
to_lower_a (p[2])=='g' ) {
skipTillSpace = true;
continue;
}
// if not a space continue
if ( skipTillSpace ) {
if ( ! is_wspace_a(*p) ) continue;
skipTillSpace = false;
}
// do not hash until we get a space
if ( skipTillSpace ) continue;
// skip if not alnum
if ( !is_alnum_a(*p)) continue;
// hash it in
mtid ^= g_hashtab[cnt++][(unsigned char)*p];
}
}
// should not have either of these yet!
if ( sn->m_flags & SEC_FAKE ) { g_process.shutdownAbort(true); }
if ( sn->m_flags & SEC_SENTENCE ) { g_process.shutdownAbort(true); }
// sanity check
if ( mtid == 0 ) { g_process.shutdownAbort(true); }
// . set the base hash, usually just tid
// . usually base hash is zero but if it is a br tag
// we set it to something special to indicate the number
// of br tags in the sequence
sn->m_baseHash ^= mtid;
// fix this
if ( sn == rootSection ) sn->m_baseHash = 1;
// fix root section i guess
if ( sn->m_baseHash == 0 ) {
// fix core on gk21
sn->m_baseHash = 2;
}
// set this now too WHY? should already be set!!! was
// causing the root section to become a title section
// because first word was "<title>". then every word in
// the doc got SEC_IN_TITLE set and did not get hashed
// in XmlDoc::hashBody()... NOR in XmlDoc::hashTitle()!!!
if ( sn != rootSection )
sn->m_tagId = tid;
}
// set up our linked list, the functions below will insert sections
// and modify this linked list
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
// set it
if ( i + 1 < m_numSections )
m_sections[i].m_next = &m_sections[i+1];
if ( i - 1 >= 0 )
m_sections[i].m_prev = &m_sections[i-1];
}
// init to -1 to indicate none
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
// reset it
si->m_firstWordPos = -1;
si->m_lastWordPos = -1;
si->m_senta = -1;
si->m_sentb = -1;
}
// now set position of first word each section contains
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
// skip if not alnum word
if ( ! (*m_tr)[i].is_alfanum ) continue;
// get smallest section containing
Section *si = m_sectionPtrs[i];
// do each parent as well
for ( ; si ; si = si->m_parent ) {
// skip if already had one!
if ( si->m_firstWordPos >= 0 ) break;
// otherwise, we are it
si->m_firstWordPos = i;
// . set format hash of it
// . do it manually since tagHash not set yet
}
}
// and last word position
for ( int32_t i = m_nw - 1 ; i > 0 ; i-- ) {
// skip if not alnum word
if ( ! (*m_tr)[i].is_alfanum ) continue;
// get smallest section containing
Section *si = m_sectionPtrs[i];
// do each parent as well
for ( ; si ; si = si->m_parent ) {
// skip if already had one!
if ( si->m_lastWordPos >= 0 ) break;
// otherwise, we are it
si->m_lastWordPos = i;
}
}
sec_t inFlag = 0;
int32_t istack[1000];
sec_t iflags[1000];
int32_t ni = 0;
//
// now set the inFlags here because the tags might not have all
// been closed, making tags like SEC_STYLE overflow from where
// they should be...
//
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
// did we exceed a tag boundary?
for ( ; ni>0 && si->m_a >= istack[ni-1] ; ) {
// undo flag
inFlag &= ~iflags[ni-1];
// pop off
ni--;
}
// get the flag if any into mf
sec_t mf = 0;
// skip if not special tag id
nodeid_t tid = si->m_tagId;
if ( tid == TAG_SCRIPT ) mf = SEC_SCRIPT;
else if ( tid == TAG_NOSCRIPT) mf = SEC_NOSCRIPT;
else if ( tid == TAG_STYLE ) mf = SEC_STYLE;
else if ( tid == TAG_SELECT ) mf = SEC_SELECT;
else if ( tid == TAG_H1 ) mf = SEC_IN_HEADER;
else if ( tid == TAG_H2 ) mf = SEC_IN_HEADER;
else if ( tid == TAG_H3 ) mf = SEC_IN_HEADER;
else if ( tid == TAG_H4 ) mf = SEC_IN_HEADER;
else if ( tid == TAG_TITLE ) mf = SEC_IN_TITLE;
else if ( tid == TAG_HEAD ) mf = SEC_IN_HEAD;
// accumulate
inFlag |= mf;
// add in the flags
si->m_flags |= inFlag;
// skip if nothing special
if ( ! mf ) continue;
// sanity
if ( ni >= 1000 ) { g_process.shutdownAbort(true); }
// otherwise, store on stack
istack[ni] = si->m_b;
iflags[ni] = mf;
ni++;
}
// . now we insert sentence sections
// . find the smallest section containing the first and last
// word of each sentence and inserts a subsection into that
// . we have to be careful to reparent, etc.
// . kinda copy splitSections() function
// . maybe add an "insertSection()" function???
if ( m_contentType != CT_JS ) {
// add sentence sections
if ( ! addSentenceSections() ) return true;
// this is needed by setSentFlags()
setNextSentPtrs();
}
// . set m_nextBrother
// . we call this now to aid in setHeadingBit() and for adding the
// implied sections, but it is ultimately
// called a second time once all the new sections are inserted
setNextBrotherPtrs ( false );
// . set SEC_HEADING bit
// . need this before implied sections
setHeadingBit ();
setTagHashes();
//
//
// TODO TODO
//
// TAKE OUT THESE SANITY CHECKS TO SPEED UP!!!!!!
//
//
// clear this
bool isHidden = false;
int32_t startHide = 0x7fffffff;
int32_t endHide = 0 ;
// now that we have closed any open tag, set the SEC_HIDDEN bit
// for all sections that are like <div style=display:none>
for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) {
// set m_lastSection so we can scan backwards
m_lastSection = sn;
// set this
int32_t wn = sn->m_a;
// stop hiding it?
if ( isHidden ) {
// turn it off if not contained
if ( wn >= endHide ) isHidden = false;
else sn->m_flags |= SEC_HIDDEN;
}
// get tag id
nodeid_t tid = sn->m_tagId;
// is div, td or tr tag start?
if ( tid!=TAG_DIV &&
tid!=TAG_TD &&
tid!=TAG_TR &&
tid!=TAG_UL &&
tid!=TAG_SPAN) continue;
// . if we are a div tag, mod it
// . treat the fields in the div tag as
// part of the tag hash.
// . helps Events.cpp be more precise about
// section identification!!!!
// . we now do this for TD and TR so Nov 2009 can telescope for
// http://10.5.1.203:8000/test/doc.17096238520293298312.html
// so the calendar title "Nov 2009" can affect all dates
// below the calendar.
// get the style tag in there and check it for "display: none"!
int32_t slen = (*m_tr)[wn].token_len;
const char *s = (*m_tr)[wn].token_start;
const char *send = s + slen;
// check out any div tag that has a style
const char *style = gb_strncasestr(s,slen,"style=") ;
if ( ! style ) continue;
// . check for hidden
// . if no hidden tag assume it is UNhidden
// . TODO: later push & pop on stack
const char *ds = gb_strncasestr(style,send-style,"display:");
// if display:none not found turn off SEC_HIDDEN
if ( ! ds || ! gb_strncasestr(s,slen,"none") ) {
// turn off the hiding
isHidden = false;
// off in us too
sn->m_flags &= ~SEC_HIDDEN;
continue;
}
// mark all sections in this with the tag
isHidden = true;
// on in us
sn->m_flags |= SEC_HIDDEN;
// stop it after this word for sure
if ( sn->m_b > endHide ) endHide = sn->m_b;
if ( sn->m_a < startHide ) startHide = sn->m_a;
}
// now set the content hash of each section
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
// must be an alnum word
if ( ! (*m_tr)[i].is_alfanum ) continue;
// get its section
m_sectionPtrs[i]->m_contentHash64 ^= (*m_tr)[i].token_hash;
// fix "smooth smooth!"
if ( m_sectionPtrs[i]->m_contentHash64 == 0 )
m_sectionPtrs[i]->m_contentHash64 = 123456;
}
// now set SEC_NOTEXT flag if content hash is zero!
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
// get it
Section *sn = &m_sections[i];
// skip if had text
if ( sn->m_contentHash64 ) continue;
// no text!
sn->m_flags |= SEC_NOTEXT;
}
//
// set Section::m_alnumPosA/m_alnumPosB
//
int32_t alnumCount2 = 0;
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
// get it
Section *sn = &m_sections[i];
// skip if had text
if ( ! ( sn->m_flags & SEC_SENTENCE ) ) continue;
// save this
sn->m_alnumPosA = alnumCount2;
// scan the wids of the whole sentence, which may not
// be completely contained in the "sn" section!!
int32_t a = sn->m_senta;
int32_t b = sn->m_sentb;
for ( int32_t j = a ; j < b ; j++ ) {
// must be an alnum word
if ( ! (*m_tr)[j].is_alfanum ) continue;
// alnumcount
alnumCount2++;
}
// so we contain the range [a,b), typical half-open interval
sn->m_alnumPosB = alnumCount2;
// sanity check
if ( sn->m_alnumPosA == sn->m_alnumPosB ){g_process.shutdownAbort(true);}
// propagate through parents
Section *si = sn->m_parent;
// do each parent as well
for ( ; si ; si = si->m_parent ) {
// skip if already had one!
if ( si->m_alnumPosA > 0 ) break;
// otherwise, we are it
si->m_alnumPosA = sn->m_alnumPosA;
}
}
// propagate up alnumPosB now
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
// get it
Section *sn = &m_sections[i];
// skip if had text
if ( ! ( sn->m_flags & SEC_SENTENCE ) ) continue;
// propagate through parents
Section *si = sn->m_parent;
// do each parent as well
for ( ; si ; si = si->m_parent ) {
// skip if already had one! no, because we need to
// get the MAX of all of our kids!!
//if ( si->m_alnumPosB > 0 ) break;
// otherwise, we are it
si->m_alnumPosB = sn->m_alnumPosB;
}
}
///////////////////////////////////////
//
// now set Section::m_listContainer
//
// . a containing section is a section containing
// MULTIPLE smaller sections
// . so if a section has a containing section set its m_listContainer
// to that containing section
// . we limit this to sections that directly contain text for now
// . Events.cpp::getRegistrationTable() uses m_nextBrother so we
// need this now!!
//
///////////////////////////////////////
setNextBrotherPtrs ( true );
///////////////////////////////////////
//
// now set SEC_MENU and SEC_LINK_TEXT flags
//
///////////////////////////////////////
setMenus();
//verifySections();
return true;
}
// . adds a SEC_SENTENCE section (or several split fragments of one) for
//   every sentence found in the token stream
// . works in three phases:
//   1. for each alnum token that is not in a style/script/select/hidden/
//      noscript section, scan forward (the for-j loop) to find where the
//      sentence that starts there ends
//   2. trim trailing non-alnum tokens so the sentence is [senta,sentb)
//   3. cut [senta,sentb) into one or more fragments (the for-k loop) and
//      insert each one with insertSubSection(), then, after all sentences
//      are added, set Section::m_sentenceSection for the section list
// . PROBLEM: because we ignore non-breaking tags we often get sections
//   that are really not sentences, but we are forced into them because
//   we cannot split span or bold tags
//   i.e. "<div>This is <b>a sentence. And this</b> is a sentence.</div>"
//   forces us to treat the entire div tag as a sentence section.
// . i did add some logic to ignore those (the two for-k loops below) but then
//   Address.cpp cores because it expects every alnum word to be in a sentence
// . now make sure to shrink into our current parent if we would not lose
//   alnum chars!! fixes sentence flip flopping
// . the documented contract is "returns false and sets g_errno on error"
// . NOTE(review): as written this function only ever returns true. when
//   insertSubSection() returns NULL we just stop adding fragments for the
//   current sentence and carry on with the next one.
bool Sections::addSentenceSections ( ) {
// an alnum token whose section has any of these flags never starts a sentence
sec_t badFlags = SEC_STYLE | SEC_SCRIPT | SEC_SELECT | SEC_HIDDEN | SEC_NOSCRIPT;
// shortcut
Section **sp = m_sectionPtrs;
// hashes of the words we special-case below. they are computed once, on the
// first call, and cached in these function-local statics.
// NOTE(review): s_init is a plain bool with no locking visible here
static bool s_init = false;
static int64_t h_in;
static int64_t h_at;
static int64_t h_for;
static int64_t h_to;
static int64_t h_on;
static int64_t h_under;
static int64_t h_with;
static int64_t h_along;
static int64_t h_from;
static int64_t h_by;
static int64_t h_of;
static int64_t h_some;
static int64_t h_the;
static int64_t h_and;
static int64_t h_a;
static int64_t h_http;
static int64_t h_https;
static int64_t h_room;
static int64_t h_rm;
static int64_t h_bldg;
static int64_t h_building;
static int64_t h_suite;
static int64_t h_ste;
static int64_t h_tags;
if ( ! s_init ) {
s_init = true;
h_tags = hash64n("tags");
h_in = hash64n("in");
h_the = hash64n("the");
h_and = hash64n("and");
h_a = hash64n("a");
h_at = hash64n("at");
h_for = hash64n("for");
h_to = hash64n("to");
h_on = hash64n("on");
h_under = hash64n("under");
h_with = hash64n("with");
h_along = hash64n("along");
h_from = hash64n("from");
h_by = hash64n("by");
h_of = hash64n("of");
h_some = hash64n("some");
h_http = hash64n("http");
h_https = hash64n("https");
h_room = hash64n("room");
h_rm = hash64n("rm");
h_bldg = hash64n("bldg");
h_building = hash64n("building");
h_suite = hash64n("suite");
h_ste = hash64n("ste");
}
// need D_IS_IN_URL bits to be valid
m_bits->setInUrlBits ( );
// is the abbr. a noun? like "appt."
// (filled in by isAbbr() below; true for abbreviations like "vs." that
// expect another word to follow)
bool hasWordAfter = false;
//
// PHASE 1+2: outer loop. "i" is the first alnum token of the next sentence
//
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
// need a wid
if ( ! (*m_tr)[i].is_alfanum ) continue;
// get section we are currently in
Section *cs = m_sectionPtrs[i];
// skip if its bad! i.e. style or script or whatever
if ( cs->m_flags & badFlags ) continue;
// hash of the most recent alnum word seen, and the one before that
int64_t prevWid = (*m_tr)[i].token_hash;
int64_t prevPrevWid = 0LL;
// token index of the most recent alnum word seen
int32_t lastWidPos = i;//-1;
// did the most recent punctuation token contain a comma?
bool lastWasComma = false;
// tag id of a span/br/p/div front tag we decided to keep in the sentence
// (-2 means none)
nodeid_t includedTag = -2;
// index of the last <br> tag seen in this sentence, -1 if none
int32_t lastbr = -1;
// have we seen a <br> / <b> tag since the last alnum word?
bool endOnBr = false;
bool endOnBold = false;
// "capped" stays true while every counted word starts with an upper case
// char; "upper" is how many such words we counted. stop words, words of
// length <= 1 and words starting with a digit are not counted.
bool capped = true;
int32_t upper = 0;
// number of alnum words seen so far in this sentence
int32_t numAlnums = 0;
// scan for sentence end
int32_t j;
for ( j = i ; j < m_nw ; j++ ) {
const auto &token2 = (*m_tr)[j];
// alnum words never end a sentence. just update our state.
if ( token2.is_alfanum ) {
// prev prev
prevPrevWid = prevWid;
// assume not a word like "vs."
hasWordAfter = false;
// set prev
prevWid = token2.token_hash;
lastWidPos = j;
lastWasComma = false;
endOnBr = false;
endOnBold = false;
numAlnums++;
// skip if stop word and need not be
// capitalized
if ( m_bits->queryBits(j) & D_IS_STOPWORD ) continue;
if ( token2.token_len <= 1 ) continue;
if ( is_digit(token2.token_start[0]) ) continue;
if ( !is_upper_utf8(token2.token_start)) capped=false;
else upper++;
continue;
}
// tag?
if ( token2.nodeid ) {
// shortcut: the tag id with the back-tag bit masked off
nodeid_t tid = token2.nodeid & BACKBITCOMP;
// treat nobr as breaking to fix ceder.net
// which has it after the group title
if ( tid == TAG_NOBR ) break;
if ( tid == TAG_BR ) endOnBr = true;
if ( tid == TAG_B ) endOnBold = true;
// a </b><br> is usually like a header
// (all counted words so far were capitalized, and both a
// bold tag and a br tag followed the last word)
if ( capped && upper && endOnBr && endOnBold )
break;
// if it is <span style="display:none"> or
// div or whatever, that is breaking!
// fixes http://chuckprophet.com/gigs/
if ( (tid == TAG_DIV ||
tid == TAG_SPAN ) &&
token2.token_len > 14 &&
strncasestr(token2.token_start,"display:none",
token2.token_len) )
break;
// ok, treat span as non-breaking for a second
if ( tid == TAG_SPAN ) continue;
// mark this
if ( tid == TAG_BR ) lastbr = j;
//
// certain tags like span and br sometimes
// do and sometimes do not break a sentence.
// so by default assume they do, but check
// for certain indicators...
//
// NOTE(review): the TAG_SPAN test here cannot be true because
// span tags already hit the "continue" just above
if ( tid == TAG_SPAN ||
tid == TAG_BR ||
// fixes guysndollsllc.com:
// causes core dump:
tid == TAG_P || // villr.com
// fixes americantowns.com
tid == TAG_DIV ) {
// if nothing after, moot point
if ( j+1 >= m_nw ) break;
// if we already included this tag
// then keep including it. but some
// span tags will break and some won't
// even when in or around the same
// sentence. see that local.yahoo.com
// food delivery services url for
// the first street address,
// 5013 Miramar
if ( includedTag == tid &&
(token2.nodeid & BACKBIT) ) {
// reset it in case next
// <span> tag is not connective
includedTag = -2;
continue;
}
// if we included this tag type
// as a front tag, then include its
// back tag in sentence as well.
// fixes nonamejustfriends.com
// which has a span tag in sentence:
// ".. Club holds a <span>FREE</span>
// Cruise Night..." and we allow
// "<span>" because it follows "a",
// but we were breaking on </span>!
if ( !(token2.nodeid&BACKBIT))
includedTag = tid;
// if prev punct was comma and not
// an alnum word
if ( lastWasComma ) continue;
// if the token right after this tag is punctuation
// that contains a comma, keep going as well
if ( ! (*m_tr)[j+1].is_alfanum &&
! (*m_tr)[j+1].nodeid &&
has_char((*m_tr)[j+1].token_start,(*m_tr)[j+1].token_end(),',') )
continue;
// if prevwid is like "vs." then
// that means keep going even if
// we hit one of these tags. fixes
// "new york knicks vs.<br>orlando
// magic"
if ( hasWordAfter )
continue;
// if first alnum word after tag
// is lower case, that is good too.
// we only look at the 11 tokens after the tag (j+1..j+11).
int32_t aw = j + 1;
int32_t maxaw = j + 12;
if ( maxaw > m_nw ) maxaw = m_nw;
for ( ; aw < maxaw ; aw++ )
if ( (*m_tr)[aw].is_alfanum ) break;
bool isLower = false;
if ( aw < maxaw &&
is_lower_utf8((*m_tr)[aw].token_start) )
isLower = true;
// http or https is not to be
// considered as such! fixes
// webnetdesign.com from getting
// sentences continued by an http://
// url below them.
if ( aw < maxaw &&
((*m_tr)[aw].token_hash == h_http ||
(*m_tr)[aw].token_hash == h_https) )
isLower = false;
// across a <p> tag a lower case word only continues the
// sentence if that word is "along" or "with"
if ( tid == TAG_P &&
isLower &&
// Oscar G<p>along with xxxx
(*m_tr)[aw].token_hash != h_along &&
(*m_tr)[aw].token_hash != h_with )
isLower = false;
if ( isLower ) continue;
// get pre word, prepositional
// phrase starter?
if ( prevWid == h_in ||
prevWid == h_the ||
prevWid == h_and ||
// fix for ending on "(Room A)"
(prevWid == h_a &&
prevPrevWid != h_rm &&
prevPrevWid != h_room &&
prevPrevWid != h_bldg &&
prevPrevWid != h_building &&
prevPrevWid != h_suite &&
prevPrevWid != h_ste ) ||
prevWid == h_for ||
prevWid == h_to ||
prevWid == h_on ||
prevWid == h_under ||
prevWid == h_with ||
prevWid == h_from ||
prevWid == h_by ||
prevWid == h_of ||
// "some ... Wednesdays"
prevWid == h_some ||
prevWid == h_at )
continue;
}
// seems like span breaks for meetup.com
// et al and not for abqtango.com maybe, we
// need to download the css??? or what???
// by default span tags do not seem to break
// the line but ppl maybe configure them to
// NOTE(review): unreachable as written, span tags already hit
// the "continue" near the top of this tag branch
if ( tid == TAG_SPAN ) break;
// if like <font> ignore it
if ( ! isBreakingTagId(token2.nodeid) ) continue;
// only break on xml tags if in rss feed to
// fix <st1:State w:st="on">Arizona</st1>
// for gwair.org
if ( tid==TAG_XMLTAG && !m_isRSSExt) continue;
// otherwise, stop!
break;
}
// from here on token2 is a punctuation token
// skip simple spaces for speed
if ( token2.token_len == 1 && is_wspace_a(token2.token_start[0]))
continue;
// do not allow punctuation that is in a url
// to be split up or used as a splitter. we want
// to keep the full url intact.
if ( j > i && j+1 < m_nw &&
(m_bits->queryBits(j-1) & D_IS_IN_URL) &&
(m_bits->queryBits(j ) & D_IS_IN_URL) &&
(m_bits->queryBits(j+1) & D_IS_IN_URL) )
continue;
// was last punct containing a comma?
lastWasComma = false;
// scan the punct chars, stop if we hit a sent breaker.
// when this loop exits early, "p" points at the breaking char.
const char *p = token2.token_start;
const char *pend = p + token2.token_len;
for ( ; p < pend ; p++ ) {
// punct word...
if ( *p == '.' ) break;
if ( *p == ',' ) lastWasComma =true;
// allow this too for now... no...
if ( *p == ';' ) break;
// now hyphen breaks, mostly for stuff
// in title tags like dukecityfix.com
if ( sp[j]->m_tagId == TAG_TITLE &&
*p == '-' &&
is_wspace_a(p[-1]) &&
is_wspace_a(p[+1]) &&
lastWidPos >= 0 &&
! m_isRSSExt &&
j+1<m_nw &&
(*m_tr)[j+1].is_alfanum &&
//( ! (bb[lastWidPos] & D_IS_IN_DATE) ||
// ! (bb[j+1] & D_IS_IN_DATE) ) &&
// fix for $10 - $12
( ! is_digit ( (*m_tr)[lastWidPos].token_start[0]) ||
! is_digit ( (*m_tr)[j+1].token_start[0]) ) )
break;
// . treat colon like comma now
// . for unm.edu we have
// "Summer Hours: March 15 - Oct15:
// 8 am. Mon - Fri, 7:30 am - 10 am Sun.,
// Winter Hours: Oct. 15 - March 15:
// 8 am., seven days a week"
// . and we don't want "winter hours" being
// topologically closer to the summer hours
// . that is, the colon is a stronger binder
// than the comma?
// . but for villr.com Hours: May-Aug.. gets
// made into two sentences and Hours is
// seen as a heading section and causes
// addImpliedSections() to be wrong.
// . why not the colon?
if ( *p == ':' ) {
// Tags: music,concert,fun
if ( prevWid == h_tags &&
// just Tags: so far in sentence
j == i )
break;
// a "::" is used in breadcrumbs,
// so break on that.
// fixes "Dining :: Visit ::
// Cal Performances" title
if ( p[1] == ':' )
break;
// if "with" precedes, allow
if ( prevWid == h_with ) continue;
// NOTE(review): these two names are swapped relative to what
// they test. "tagAfter" is true when the token BEFORE this
// punctuation (j-1) is a tag, like "blah</b>:...", and
// "tagBefore" is true when the token AFTER it (j+1) is a tag.
// both are treated the same way below so behavior is symmetric.
bool tagAfter = (j-1>=0 && (*m_tr)[j-1].nodeid);
// do not allow if next word is tag
bool tagBefore = (j+1<m_nw && (*m_tr)[j+1].nodeid);
// do not allow
// "<br>...:<br>" or
// "<br>...<br>:" or
// since such things are usually
// somewhat like headers. isolated
// lines ending on a colon.
// should fix st. martin's center
// for unm.edu "Summer Hours: ..."
if ( lastbr >= 0 &&
( tagBefore || tagAfter ) ) {
// end sentence there then
j = lastbr;
break;
}
if ( tagBefore ) break;
if ( tagAfter ) break;
// for now allow it!
continue;
}
// . special hyphen
// . breaks up title for peachpundit.com
// so we get better event title generation
// since peachpundit.com will be a repeat sec
// . BUT it did not work!
// . these are the three bytes 0xE2 0x80 0x94
if ( p[0] == (char)-30 &&
p[1] == (char)-128 &&
p[2] == (char)-108 )
break;
// this for sure
// "Home > Albuquerque Events > Love Song ..."
if ( *p == '>' ) break;
if ( *p == '!' ) break;
if ( *p == '?' ) break;
if ( *p == '|' )
break;
// bullets (the three bytes 0xE2 0x80 0xA2)
if ( p[0] == (char)226 &&
p[1] == (char)128 &&
p[2] == (char)162 )
break;
// the code after this loop jumps back in here with "goto redo"
// when it decides the breaking char at "p" does not end the
// sentence after all. the "continue" then resumes the scan at
// the next punctuation char.
redo:
continue;
}
// if none, keep going
if ( p == pend ) continue;
// if an alnum char follows the ., it is ok
// probably a hostname or ip or phone #
if ( is_alnum_utf8(p+1) &&
// "venue:ABQ Sq Dance Center..." for
// americantowns.com has no space after the colon!
*p !=':' )
goto redo;
// if abbreviation before we are ok too
if ( *p == '.' && isAbbr(prevWid,&hasWordAfter) ) {
// but the period may serve a double purpose
// to end the abbr and terminate the sentence
// if the word that follows is capitalized,
// and if the abbr is a lower-case noun.
//
// if abbr is like "vs" then do not end sentence
if ( hasWordAfter )
goto redo;
// set "next" to next alnum word after us
// (only the 10 tokens j+1..j+10 are examined)
int32_t next = j+1;
int32_t max = next + 10;
if ( max > m_nw ) max = m_nw;
for ( ; next < max ; next++ ) {
if ( ! (*m_tr)[next].is_alfanum ) continue;
break;
}
// was previous word/abbr capitalized?
// if so, assume period does not end sentence.
if ( is_capitalized_utf8((*m_tr)[lastWidPos].token_start) )
goto redo;
// if next word is NOT capitalized, assume
// period does not end sentence...
if ( next < max &&
! is_capitalized_utf8((*m_tr)[next].token_start) )
goto redo;
// otherwise, abbr is NOT capitalized and
// next word IS capitalized (or no next word was found),
// so the period DOES end the sentence: we fall through
// to the "break" below
}
// fix "1. library name" for cabq.gov
// (the only word so far is all ascii digits, so this period is
// a list number and not a sentence end)
if ( *p == '.' &&
lastWidPos == i) {
auto const &t = (*m_tr)[lastWidPos];
if(is_ascii_digit_string(t.token_start, t.token_end()))
goto redo;
}
// ok, stop otherwise
break;
}
// do not include tag at end. try to fix sentence flip flop.
// back j up until token j-1 is an alnum word.
for ( ; j > i ; j-- )
// stop when we just contain the last word
if ( (*m_tr)[j-1].is_alfanum ) break;
// make our sentence endpoints now
int32_t senta = i;
// make the sentence defined by [senta,sentb) where sentb
// defines a half-open interval like we do for almost
// everything else
int32_t sentb = j;
// update i for next iteration
// (the outer loop's i++ then makes the next scan start at sentb)
i = sentb - 1;
//
// PHASE 3: insert the sentence as one or more sub-sections
//
// crap, but now sentences intersect with our tag-based
// sections because they can now split tags because of websites
// like aliconference.com and abqtango.com whose sentences
// do not align with the tag sections. therefore we introduce
// the SEC_TOP_SPLIT and SEC_BOTTOM_SPLIT to indicate
// that the section is a top/bottom piece of a split sentence.
// if both bits are set we assume SEC_MIDDLE_SPLIT.
// then we set the Section::m_senta and m_sentb to
// indicate the whole sentence of which it is a split.
// but the vast majority of the time m_senta and m_sentb
// will equal m_firstWordPos and m_lastWordPos respectively.
// so scan the words in the sentence and as we scan we have
// to determine the parent section we are inserting the sentence
// into as a child section.
//Section *parent = NULL;
// first alnum word of the fragment being built, -1 if none yet
int32_t start = -1;
Section *pp;
// last alnum word accepted into the fragment being built
int32_t lastk = 0;
// the one section this fragment is allowed to split
Section *splitSection = NULL;
// section we telescoped from on the previous word
Section *lastGuy = NULL;
for ( int32_t k = senta ; k <= sentb ; k++ ) {
// add final piece
if ( k == sentb ) {
// stop if no final piece
if ( start == -1 ) break;
// otherwise, add it
goto addit;
}
// need a real alnum word
if ( ! (*m_tr)[k].is_alfanum ) continue;
// get his parent
pp = m_sectionPtrs[k];
// set parent if need to
//if ( ! parent ) parent = pp;
// and start sentence if need to
if ( start == -1 ) start = k;
// if same as exact section as last guy, save some time
if ( pp == lastGuy ) pp = NULL;
// store it
// NOTE(review): pp may just have been set to NULL above, which
// resets lastGuy too, so this shortcut only skips every other
// consecutive word that is in the same section
lastGuy = pp;
// . i'd say blow up "pp" until its contains "start"
// . but if before it contains start it breaches
// [senta,sentb) then we have to cut things short
for ( ; pp ; pp = pp->m_parent ) {
// we now have to split section "pp"
// when adding the sentence section.
// once we have such a section we
// cannot use a different parent...
if ( pp->m_firstWordPos < start ||
pp->m_lastWordPos >= sentb ) {
// set it
if ( ! splitSection ) splitSection =pp;
// WE ARE ONLY ALLOWED TO SPLIT ONE
// SECTION ONLY...
// so word k needs a second split: close the fragment
// at lastk now. word k gets redone below (k--).
if ( pp != splitSection)
goto addit;
break;
}
// keep telescoping until "parent" contains
// [senta,k] , and we already know that it
// contains k because that is what we set it to
//if ( pp->m_a <= senta ) break;
}
// mark it
if ( (*m_tr)[k].is_alfanum ) lastk = k;
// ok, keep chugging
continue;
// add the final piece if we go to this label.
// the fragment to add covers the alnum words [start,lastk].
addit:
// use this flag
int32_t bh = BH_SENTENCE;
// determine parent section, smallest section
// containing [start,lastk]
Section *parent = m_sectionPtrs[start];
for ( ; parent ; parent = parent->m_parent ) {
// stop if contains lastk
if ( parent->m_b > lastk ) break;
}
//
// for "<span>Albuquerque</span>, New Mexico"
// "start" points to "Albuquerque" but needs to
// point to the "<span>" so its parent is "parent"
int32_t adda = start;
int32_t addb = lastk;
// need to update "start" to so its parent is the new
// "parent" now so insertSubSection() does not core
for ( ; adda >= 0 ; ) {
// stop if we finally got the right parent
if ( m_sectionPtrs[adda]==parent ) break;
// or if he's a tag and his parent
// is "parent" we can stop.
// i.e. STOP on a proper subsection of
// the section containing the sentence.
if ( m_sectionPtrs[adda]->m_parent==parent &&
m_sectionPtrs[adda]->m_a == adda )
break;
// backup
adda--;
// check
if ( adda < 0 ) break;
// how can this happen?
// (we should never back up over another alnum word)
if ( (*m_tr)[adda].is_alfanum ) { g_process.shutdownAbort(true); }
}
// sanity
if ( adda < 0 ) { g_process.shutdownAbort(true); }
// same for right endpoint
for ( ; addb < m_nw ; ) {
// stop if we finally got the right parent
if ( m_sectionPtrs[addb]==parent ) break;
// get it
// NOTE: this local "sp" (Section *) shadows the "sp"
// (Section **) shortcut declared at the top of the function
Section *sp = m_sectionPtrs[addb];
// come back up here in the case of a section
// sharing its Section::m_b with its parent
subloop:
// or if he's a tag and his parent
// is "parent" we can stop
if ( sp->m_parent==parent &&
sp->m_b == addb+1 )
break;
// or if we ran into a brother section
// that does not contain the sentence...
// fix core dump for webnetdesign.com whose
// sentence consisted of 3 sections from
// A=7079 to B=7198. but now i am getting rid
// of allowing a lower case http(s):// on
// a separate line to indicate that the
// sentence continues... so we will not have
// this sentence anymore in case you are
// wondering why it is not there any more.
if ( sp->m_parent==parent &&
sp->m_a == addb ) {
// do not include that brother's tag
addb--;
break;
}
// when we have bad tag formations like for
// http://gocitykids.parentsconnect.com/catego
// ry/buffalo-ny-usa/places-to-go/tourist-stops
// like <a><b>...</div> with no ending </a> or
// </b> tags then we have to get the parent
// of the parent as long as its m_b is the
// same and check that before advancing addb
// otherwise we can miss the parent section
// that we want! (this is because the kid
// sections share the same m_b as their
// parent because of they have no ending tag)
if ( sp->m_parent &&
sp->m_parent->m_b == sp->m_b ) {
sp = sp->m_parent;
goto subloop;
}
// advance
addb++;
// stop if addb
if ( addb >= m_nw ) break;
// how can this happen?
// (we should never advance over another alnum word)
if ( (*m_tr)[addb].is_alfanum ) { g_process.shutdownAbort(true); }
}
// sanity
if ( addb >= m_nw ) { g_process.shutdownAbort(true); }
// ok, now add the split sentence as the section [adda,addb+1)
Section *is =insertSubSection(adda,addb+1,bh);
// panic? insertSubSection() returned NULL, so give up on the
// remaining fragments of this sentence. the outer loop continues.
if ( ! is )
break;
// set sentence flag on it
is->m_flags |= SEC_SENTENCE;
// . set this
// . sentence is from [senta,sentb)
// . every fragment records the range of the WHOLE sentence
is->m_senta = senta;//start;
is->m_sentb = sentb;//k;
// stop if that was it
if ( k == sentb ) break;
// go on to next fragment then
start = -1;
parent = NULL;
splitSection = NULL;
lastGuy = NULL;
// redo this same k
k--;
}
}
// m_b of the most recent sentence section seen in the list walk below
int32_t inSentTil = 0;
Section *lastSent = NULL;
// walk the section list in order. a sentence section points to itself.
// a non-sentence section that starts before the end of the most recent
// sentence section gets its m_sentenceSection pointed at that sentence.
for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) {
// need sentence
if ( ( sk->m_flags & SEC_SENTENCE ) ) {
inSentTil = sk->m_b;
lastSent = sk;
sk->m_sentenceSection = sk;
continue;
}
// skip if outside of the last sentence we had
if ( sk->m_a >= inSentTil ) continue;
// we are in that sentence
sk->m_sentenceSection = lastSent;
}
return true;
}
// . carve a new "fake" section covering the tokens [a,b) out of the existing
//   section tree and return it
// . "newBaseHash" becomes the new section's m_baseHash (e.g. BH_SENTENCE)
// . the new section is taken from the preallocated m_sections[] array,
//   spliced into the m_next/m_prev list, given a parent, and then every
//   token in [a,b) is re-pointed or re-parented so the tree stays nested
// . returns NULL and sets g_errno to EDOCBADSECTIONS when there are no free
//   slots left in m_sections[]
Section *Sections::insertSubSection ( int32_t a, int32_t b, int32_t newBaseHash ) {
	// no room left? we do not grow the array here.
	if ( m_numSections >= m_maxNumSections ) {
		g_errno = EDOCBADSECTIONS;
		return NULL;
	}

	// grab the next free slot and zero it out
	Section *fresh = &m_sections[m_numSections++];
	memset ( fresh , 0 , sizeof(Section) );
	fresh->m_a = a;
	fresh->m_b = b;

	// keep m_lastSection valid if we start after it
	if ( m_lastSection && a > m_lastSection->m_a )
		m_lastSection = fresh;

	//
	// find "anchor", the section we get linked in right after
	//

	// begin with the section of word #a and walk backwards until we hit
	// a section that starts before us, or starts with us and is at
	// least as big as us
	Section *anchor = m_sectionPtrs[a];
	while ( anchor ) {
		const bool startsBeforeUs = ( anchor->m_a < a );
		const bool sameStartAndCovers = ( anchor->m_a == a && anchor->m_b >= b );
		if ( startsBeforeUs || sameStartAndCovers ) break;
		anchor = anchor->m_prev;
	}

	// . shortcut: if the anchor chosen by the previous call lies between
	//   this anchor and us, resume from it
	// . otherwise, when word #a belongs to something like the root, the
	//   forward walk below may have to step over thousands of siblings
	if ( m_lastAdded &&
	     anchor &&
	     m_lastAdded->m_a > anchor->m_a &&
	     m_lastAdded->m_a < a )
		anchor = m_lastAdded;

	// . now walk forward to get as close to us as possible
	// . e.g. "<p> <strong>hey there!</strong> this is another
	//   sentence.</p>": anchor is the <p> section but we want the
	//   <strong> section to be our "prev"
	while ( anchor && anchor->m_next ) {
		const Section *ahead = anchor->m_next;
		// would overshoot us
		if ( ahead->m_a > a ) break;
		// tied on the start but it does not contain us
		if ( ahead->m_a == a && ahead->m_b < b ) break;
		// it is closer to us, or tied and contains us. take it.
		anchor = anchor->m_next;
	}

	// remember for the shortcut above
	m_lastAdded = anchor;

	// . a br tag can split the very first base html tag, like
	//   "<html>...</html> <br> ...." on mapsandatlases.org
	// . so we can end up with no anchor at all. give the slot back.
	if ( ! anchor ) {
		m_numSections--;
		g_process.shutdownAbort(true);
		return NULL;
	}

	// splice ourselves into the doubly linked list right after anchor
	if ( anchor->m_next ) anchor->m_next->m_prev = fresh;
	fresh->m_next = anchor->m_next;
	fresh->m_prev = anchor;
	anchor->m_next = fresh;

	// the initial parent is the first section, going up from the section
	// of word #a, whose range covers both a and b
	Section *parent = m_sectionPtrs[a];
	while ( parent->m_a > a || parent->m_b < b )
		parent = parent->m_parent;
	fresh->m_parent = parent;

	// sometimes an implied section is a subsection of a sentence!
	// like when there are a lot of brbr (double br) tags in it...
	fresh->m_sentenceSection = parent->m_sentenceSection;

	// inherit the parent's flags, minus the sentence flag, and mark us
	// as a fake section
	const sec_t inherited = ( parent->m_flags & ~SEC_SENTENCE ) | SEC_FAKE;
	fresh->m_flags = inherited;

	// the base hash (delimiter hash) the caller asked for
	fresh->m_baseHash = newBaseHash;

	// these default to "unset"
	fresh->m_firstWordPos = -1;
	fresh->m_lastWordPos  = -1;
	fresh->m_alnumPosA    = -1;
	fresh->m_alnumPosB    = -1;
	fresh->m_senta        = -1;
	fresh->m_sentb        = -1;

	// first alnum word in [a,b), if there is one
	for ( int32_t w = a ; w < b ; w++ ) {
		if ( (*m_tr)[w].is_alfanum ) {
			fresh->m_firstWordPos = w;
			break;
		}
	}
	// last alnum word in [a,b), if there is one
	for ( int32_t w = b - 1 ; w >= a ; w-- ) {
		if ( (*m_tr)[w].is_alfanum ) {
			fresh->m_lastWordPos = w;
			break;
		}
	}

	//
	// fix up the tree for every token we cover. we scan our own tokens
	// rather than our parent's kids because the parent can have a ton
	// of child sections.
	//
	for ( int32_t w = a ; w < b ; w++ ) {
		Section *owner = m_sectionPtrs[w];

		if ( fresh->strictlyContains ( owner ) ) {
			// the token's section is inside us. go up from it
			// until the next step up would contain us (contains()
			// can mean equal to, unlike strictlyContains()).
			while ( owner->m_parent &&
				! owner->m_parent->contains(fresh) )
				owner = owner->m_parent;
			// already hanging off of us?
			if ( owner->m_parent == fresh ) continue;
			// slide in between owner and its old parent
			fresh->m_parent = owner->m_parent;
			owner->m_parent = fresh;
			// sanity checks: owner must lie within us
			if ( owner->m_b > fresh->m_b ) { g_process.shutdownAbort(true); }
			if ( owner->m_a < fresh->m_a ) { g_process.shutdownAbort(true); }
		}
		else {
			// the token's section is not inside us (it is like a
			// root to us), so the token now belongs to us and
			// that section becomes our parent
			m_sectionPtrs[w] = fresh;
			fresh->m_parent = owner;
		}
	}

	return fresh;
}
// this is a function because we also call it from addImpliedSections()!
void Sections::setNextBrotherPtrs ( bool setContainer ) {
// clear out
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
si->m_nextBrother = NULL;
si->m_prevBrother = NULL;
}
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
Section *sj = NULL;
// get word after us
int32_t wn = si->m_b;
int32_t nw2 = m_nw;
// if we hit a word in our parent.. then increment wn
// PROBLEM "<root><t1>hey</t1> blah blah blah x 1 mill</root>"
// would exhaust the full word list when si is the "t1"
// section.
Section *j2 = si->m_next;
if ( j2 && j2->m_a >= si->m_b ) {
sj = j2;
nw2 = 0;
}
// try one more ahead for things like so we don't end up
// setting sj to the "t2" section as in:
// "<root><t1><t2>hey</t2></t1> ...."
if ( ! sj && j2 ) {
// try the next section then
j2 = j2->m_next;
// set "sj" if its a potential brother section
if ( j2 && j2->m_a >= si->m_b ) {
sj = j2;
nw2 = 0;
}
}
// ok, try the next word algo approach
for ( ; wn < nw2 ; wn++ ) {
sj = m_sectionPtrs[wn];
if ( sj->m_a >= si->m_b ) break;
}
// bail if none
if ( wn >= m_nw ) continue;
// telescope up until brother if possible
for ( ; sj ; sj = sj->m_parent )
if ( sj->m_parent == si->m_parent ) break;
// give up?
if ( ! sj || sj->m_parent != si->m_parent ) continue;
// sanity check
if ( sj->m_a < si->m_b &&
sj->m_tagId != TAG_TC &&
si->m_tagId != TAG_TC ) {
g_process.shutdownAbort(true); }
// set brother
si->m_nextBrother = sj;
// set his prev then
sj->m_prevBrother = si;
// sanity check
if ( sj->m_parent != si->m_parent ) { g_process.shutdownAbort(true); }
// sanity check
if ( sj->m_a < si->m_b &&
sj->m_tagId != TAG_TC &&
si->m_tagId != TAG_TC ) {
g_process.shutdownAbort(true); }
// do more?
if ( ! setContainer ) continue;
// telescope this
Section *te = sj;
// telescope up until it contains "si"
for ( ; te && te->m_a > si->m_a ; te = te->m_parent );
// only update list container if smaller than previous
if ( ! si->m_listContainer )
si->m_listContainer = te;
else if ( te && te->m_a > si->m_listContainer->m_a )
si->m_listContainer = te;
if ( ! sj->m_listContainer )
sj->m_listContainer = te;
else if ( te && te->m_a > sj->m_listContainer->m_a )
sj->m_listContainer = te;
// now
}
}
void Sections::setNextSentPtrs ( ) {
// kinda like m_rootSection
m_firstSentence = NULL;
Section *finalSec = NULL;
// scan the sentence sections and number them to set m_sentNum
for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) {
// record final section
finalSec = sk;
// need sentence
if ( ! ( sk->m_flags & SEC_SENTENCE ) ) {
continue;
}
// first one?
if ( ! m_firstSentence ) {
m_firstSentence = sk;
}
}
Section *lastSent = NULL;
// now set "m_nextSentence" of each section
for ( Section *sk = finalSec ; sk ; sk = sk->m_prev ) {
// set this
sk->m_nextSentence = lastSent;
// need sentence
if ( ! ( sk->m_flags & SEC_SENTENCE ) ) {
continue;
}
// we are the sentence now
lastSent = sk;
}
}
#define TABLE_ROWS 25
// . appends a space-terminated label to "sbuf" for each flag that is set in
//   sn->m_flags, in a fixed order
// . for SEC_FAKE sections the label depends on sn->m_baseHash, and an
//   unknown base hash aborts the process
void Sections::printFlags(SafeBuf *sbuf, const Section *sn) {
	const sec_t f = sn->m_flags;

	// print "label" if "bit" is set in the section's flags
	auto emitIf = [sbuf,f] ( sec_t bit , const char *label ) {
		if ( f & bit ) sbuf->safePrintf ( "%s" , label );
	};

	emitIf ( SEC_HEADING       , "heading " );
	emitIf ( SEC_MENU_SENTENCE , "menusentence " );
	emitIf ( SEC_MENU          , "ismenu " );
	emitIf ( SEC_MENU_HEADER   , "menuheader " );
	emitIf ( SEC_LINK_TEXT     , "linktext " );
	emitIf ( SEC_PLAIN_TEXT    , "plaintext " );

	// fake sections say what kind of fake they are
	if ( f & SEC_FAKE ) {
		const char *kind = NULL;
		if      ( sn->m_baseHash == BH_BULLET   ) kind = "bulletdelim ";
		else if ( sn->m_baseHash == BH_SENTENCE ) kind = "<b>sentence</b> ";
		else if ( sn->m_baseHash == BH_IMPLIED  ) kind = "<b>impliedsec</b> ";

		if ( kind ) sbuf->safePrintf ( "%s" , kind );
		// a fake section must be one of the three kinds above
		else { g_process.shutdownAbort(true); }
	}

	emitIf ( SEC_NOTEXT    , "notext " );
	emitIf ( SEC_SCRIPT    , "inscript " );
	emitIf ( SEC_NOSCRIPT  , "innoscript " );
	emitIf ( SEC_STYLE     , "instyle " );
	emitIf ( SEC_HIDDEN    , "indivhide " );
	emitIf ( SEC_SELECT    , "inselect " );
	emitIf ( SEC_IN_HEAD   , "inhead " );
	emitIf ( SEC_IN_TITLE  , "intitle " );
	emitIf ( SEC_IN_HEADER , "inheader " );
	emitIf ( SEC_IN_IFRAME , "iniframe " );
}
bool Sections::isHardSection(const Section *sn) const {
int32_t a = sn->m_a;
// . treat this as hard... kinda like a div section...
// fixes gwair.org date from stealing address of another date
// because the span tags are fucked up...
// . crap, no this prevents publicbroadcasting.net and other urls
// from telescoping to header dates they need to telescope to.
// the header dates are in span tags and if that is seen as a hard
// section bad things happen
//if ( m_tids[a] == TAG_SPAN ) return true;
if ( ! isBreakingTagId((*m_tr)[a].nodeid) ) {
// . if first child is hard that works!
// . fixes "<blockquote><p>..." for collectorsguide.com
if ( sn->m_next &&
sn->m_next->m_tagId &&
// fix "blah blah<br>blah blah" for sentence
sn->m_next->m_tagId != TAG_BR &&
sn->m_next->m_a < sn->m_b &&
isBreakingTagId(sn->m_next->m_tagId) )
return true;
// otherwise, forget it!
return false;
}
// trumba.com has sub dates in br-based implied sections that need
// to telescope to their parent above
if ( (*m_tr)[a].nodeid == TAG_BR ) return false;
if ( sn->m_flags & SEC_SENTENCE ) return false;
// xml tag exception for gwair.org. treat <st1:Place>... as soft
if ( ((*m_tr)[a].nodeid & BACKBITCOMP) == TAG_XMLTAG && ! m_isRSSExt )
return false;
return true;
}
bool Sections::setMenus ( ) {
// . this just returns if already set
// . sets Bits::m_bits[x].m_flags & D_IN_LINK if its in a link
// . this bits array is 1-1 with the words
m_bits->setInLinkBits(this);
sec_t flag;
// set SEC_PLAIN_TEXT and SEC_LINK_TEXT for all sections
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
// need alnum word
if ( ! (*m_tr)[i].is_alfanum ) continue;
// get our flag
if ( m_bits->queryBits(i) & D_IN_LINK ) flag = SEC_LINK_TEXT;
else flag = SEC_PLAIN_TEXT;
// get section ptr
Section *sk = m_sectionPtrs[i];
// loop for sk
for ( ; sk ; sk = sk->m_parent ) {
// skip if already set
if ( sk->m_flags & flag ) break;
// set it
sk->m_flags |= flag;
}
}
Section *last = NULL;
// . alernatively, scan through all anchor tags
// . compare to last anchor tag
// . and blow up each to their max non-intersection section and make
// sure no PLAIN text in either of those!
// . this is all to fix texasdrums.drums.org which has various span
// and bold tags throughout its menu at random
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
// . if we hit plain text, we kill our last
// . this was causing "geeks who drink" for blackbirdbuvette
// to get is SEC_MENU set because there was a link after it
if ( si->m_flags & SEC_PLAIN_TEXT ) {
last = NULL;
}
// skip if not a href section
if ( si->m_baseHash != TAG_A ) {
continue;
}
// . if it is a mailto link forget it
// . fixes abtango.com from detecting a bad menu
const char *ptr = (*m_tr)[si->m_a].token_start;
int32_t plen = (*m_tr)[si->m_a].token_len;
const char *mailto = strncasestr(ptr,plen,"mailto:");
if ( mailto ) {
last = NULL;
}
// bail if no last
if ( ! last ) { last = si; continue; }
// save last
Section *prev = last;
// set last for next round, used "saved" below
last = si;
// get first "hard" section encountered while telescoping
Section *prevHard = NULL;
// blow up last until right before it contains us
for ( ; prev ; prev = prev->m_parent ) {
// record?
if ( ! prevHard && isHardSection(prev) )
prevHard = prev;
// if parent contains us, stop
if ( prev->m_parent->contains ( si ) ) break;
}
// if it has plain text, forget it!
if ( prev && prev->m_flags & SEC_PLAIN_TEXT ) continue;
// use this for us
Section *sk = si;
// get first "hard" section encountered while telescoping
Section *skHard = NULL;
// same for us
for ( ; sk ; sk = sk->m_parent ) {
// record?
if ( ! skHard && isHardSection(sk) ) skHard = sk;
// if parent contains us, stop
if ( prev && sk->m_parent->contains ( prev ) ) break;
}
// if it has plain text, forget it!
if ( sk && sk->m_flags & SEC_PLAIN_TEXT ) continue;
// . first hard sections encountered must match!
// . otherwise for switchborad.com we lose "A B C ..." as
// title candidate because we think it is an SEC_MENU
// because the sections before it have links in them, but
// they have different hard sections
if ( prevHard && ! skHard ) continue;
if ( ! prevHard && skHard ) continue;
if ( prevHard && prevHard->m_tagId != skHard->m_tagId ) continue;
// ok, great that works!
if( prev ) {
prev->m_flags |= SEC_MENU;
}
if( sk ) {
sk->m_flags |= SEC_MENU;
}
}
int64_t h_copyright = hash64n("copyright");
// copyright check
// the copyright symbol in utf8 (see Entities.cpp for the code)
static const char copy[] = "<EFBFBD>";
// scan all years, lists and ranges of years, and look for
// a preceeding copyright sign. mark such years as DF_COPYRIGHT
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
// skip if tag
if ( (*m_tr)[i].nodeid ) continue;
// do we have an alnum word before us here?
if ( (*m_tr)[i].is_alfanum ) {
// if word check for copyright
if ( (*m_tr)[i].token_hash != h_copyright ) continue;
}
// must have copyright sign in it i guess
else if ( ! gb_strncasestr((*m_tr)[i].token_start, (*m_tr)[i].token_len, copy))
continue;
// mark section as copyright section then
Section *sp = m_sectionPtrs[i];
// flag as menu
sp->m_flags |= SEC_MENU;
}
sec_t ff = SEC_MENU;
// set SEC_MENU of child sections of SEC_MENU sections
for ( Section *si = m_rootSection; si; si = si->m_next ) {
// must be a link text only section
if ( !( si->m_flags & ff ) )
continue;
// ignore if went down this path
if ( si->m_used == 82 ) {
continue;
}
// get first potential kid
Section *sk = si->m_next;
// scan child sections
for ( ; sk; sk = sk->m_next ) {
// stop if not contained
if ( !si->contains( sk ) ) {
break;
}
// mark it
sk->m_flags |= ( si->m_flags & ff ); // SEC_MENU;
// ignore in big loop
sk->m_used = 82;
}
}
//
// set SEC_MENU_HEADER
//
for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) {
// skip if not in a menu
if ( ! ( sk->m_flags & SEC_MENU ) ) {
continue;
}
// get his list container
Section *c = sk->m_listContainer;
// skip if none
if ( !c ) {
continue;
}
// already flagged?
if ( c->m_used == 89 ) {
continue;
}
// do not repeat on any item in this list
c->m_used = 89;
// flag all its brothers!
Section *zz = sk;
for ( ; zz; zz = zz->m_nextBrother ) {
// bail if not in menu
if ( !( zz->m_flags & SEC_MENU ) ) {
break;
}
}
// if broked it, stop
if ( zz ) {
continue;
}
//
// ok, every item in list is a menu item, so try to set header
//
// get word before first item in list
int32_t r = sk->m_a - 1;
for ( ; r >= 0 && !(*m_tr)[r].is_alfanum; r-- )
;
// if no header, skip
if ( r < 0 ) {
continue;
}
// set SEC_MENU_HEADER
setHeader( r, sk, SEC_MENU_HEADER );
}
//
// set SEC_MENU_SENTENCE flag
//
for ( Section *si = m_rootSection; si; si = si->m_next ) {
// must be a link text only section
if ( !( si->m_flags & SEC_MENU ) ) {
continue;
}
// set this
bool gotSentence = ( si->m_flags & SEC_SENTENCE );
// set SEC_MENU of the sentence
if ( gotSentence ) {
continue;
}
// parent up otherwise
for ( Section *sk = si->m_parent; sk; sk = sk->m_parent ) {
// stop if sentence finally
if ( !( sk->m_flags & SEC_SENTENCE ) ) {
continue;
}
// not a menu sentence if it has plain text in it
// though! we have to make this exception to stop
// stuff like
// "Wedding Ceremonies, No preservatives, more... "
// from switchboard.com from being a menu sentence
// just because "more" is in a link.
if ( sk->m_flags & SEC_PLAIN_TEXT ) {
break;
}
// set it
sk->m_flags |= SEC_MENU_SENTENCE;
// and stop
break;
}
}
static bool s_init = false;
static int64_t h_close ;
static int64_t h_send ;
static int64_t h_map ;
static int64_t h_maps ;
static int64_t h_directions ;
static int64_t h_driving ;
static int64_t h_help ;
static int64_t h_more ;
static int64_t h_log ;
static int64_t h_sign ;
static int64_t h_change ;
static int64_t h_write ;
static int64_t h_save ;
static int64_t h_share ;
static int64_t h_forgot ;
static int64_t h_home ;
static int64_t h_sitemap ;
static int64_t h_advanced ;
static int64_t h_go ;
static int64_t h_website ;
static int64_t h_view;
static int64_t h_add;
static int64_t h_submit;
static int64_t h_get;
static int64_t h_about;
// new stuff
static int64_t h_back; // back to top
static int64_t h_next;
static int64_t h_buy; // buy tickets
static int64_t h_english; // english french german versions
static int64_t h_click;
if ( ! s_init ) {
s_init = true;
h_close = hash64n("close");
h_send = hash64n("send");
h_map = hash64n("map");
h_maps = hash64n("maps");
h_directions = hash64n("directions");
h_driving = hash64n("driving");
h_help = hash64n("help");
h_more = hash64n("more");
h_log = hash64n("log");
h_sign = hash64n("sign");
h_change = hash64n("change");
h_write = hash64n("write");
h_save = hash64n("save");
h_share = hash64n("share");
h_forgot = hash64n("forgot");
h_home = hash64n("home");
h_sitemap = hash64n("sitemap");
h_advanced = hash64n("advanced");
h_go = hash64n("go");
h_website = hash64n("website");
h_view = hash64n("view");
h_add = hash64n("add");
h_submit = hash64n("submit");
h_get = hash64n("get");
h_about = hash64n("about");
h_back = hash64n ("back");
h_next = hash64n ("next");
h_buy = hash64n ("buy");
h_english = hash64n ("english");
h_click = hash64n ("click");
}
// . when dup/non-dup voting info is not available because we are
// more or less an isolated page, guess that these links are
// menu links and not to be considered for title or event description
// . we completely exclude a word from title/description if its
// SEC_MENU is set.
// . set SEC_MENU for renegade links that start with an action
// verb like "close" or "add" etc. but if their # of non dup votes
// is high relative to their # of dup votes, then do not set this
// because it might be a name of a band like "More" or something
// and be in a link
// . scan all href sections
// set SEC_LINK_ONLY on sections that just contain a link
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
// skip if not a href section
if ( si->m_baseHash != TAG_A ) continue;
// set points to scan
int32_t a = si->m_a;
int32_t b = si->m_b;
// assume not bad
bool bad = false;
int32_t i;
// scan words if any
for ( i = a ; i < b ; i++ ) {
const auto &token = (*m_tr)[i];
// skip if not word
if ( ! token.is_alfanum ) continue;
// assume bad
bad = true;
// certain words are indicative of menus
if ( token.token_hash == h_close ) break;
if ( token.token_hash == h_send ) break;
if ( token.token_hash == h_map ) break;
if ( token.token_hash == h_maps ) break;
if ( token.token_hash == h_directions ) break;
if ( token.token_hash == h_driving ) break;
if ( token.token_hash == h_help ) break;
if ( token.token_hash == h_more ) break;
if ( token.token_hash == h_log ) break; // log in
if ( token.token_hash == h_sign ) break; // sign up/in
if ( token.token_hash == h_change ) break; // change my loc.
if ( token.token_hash == h_write ) break; // write a review
if ( token.token_hash == h_save ) break;
if ( token.token_hash == h_share ) break;
if ( token.token_hash == h_forgot ) break; // forgot your pwd
if ( token.token_hash == h_home ) break;
if ( token.token_hash == h_sitemap ) break;
if ( token.token_hash == h_advanced ) break; // adv search
if ( token.token_hash == h_go ) break; // go to top of page
if ( token.token_hash == h_website ) break;
if ( token.token_hash == h_view ) break;
if ( token.token_hash == h_add ) break;
if ( token.token_hash == h_submit ) break;
if ( token.token_hash == h_get ) break;
if ( token.token_hash == h_about ) break;
if ( token.token_hash == h_back ) break;
if ( token.token_hash == h_next ) break;
if ( token.token_hash == h_buy ) break;
if ( token.token_hash == h_english ) break;
if ( token.token_hash == h_click ) break;
bad = false;
break;
}
// skip if ok
if ( ! bad ) continue;
// get smallest section
Section *sm = m_sectionPtrs[i];
// if bad mark it!
sm->m_flags |= SEC_MENU;
}
return true;
}
// "first" is first item in the list we are getting header for
void Sections::setHeader ( int32_t r , Section *first , sec_t flag ) {
// get smallest section containing word #r
Section *sr = m_sectionPtrs[r];
// save orig
Section *orig = sr;
// blow up until just before "first" section
for ( ; sr ; sr = sr->m_parent ) {
// forget it if in title tag already!
if ( sr->m_flags & SEC_IN_TITLE ) return;
// stop if no parent
if ( ! sr->m_parent ) continue;
// parent must not contain first
if ( sr->m_parent->contains ( first ) ) break;
}
// if we failed to contain "first"... what does this mean? i dunno
// but its dropping core for
// http://tedserbinski.com/jcalendar/jcalendar.js
if ( ! sr ) return;
// save that
Section *biggest = sr;
// check out prev brother
Section *prev = biggest->m_prevBrother;
// if we are in a hard section and capitalized (part of the
// SEC_HEADING) requirements, then it should be ok if we have
// a prev brother of a different tagid.
// this will fix americantowns.com which has a list of header tags
// and ul tags intermingled, with menus in the ul tags.
// should also fix upcoming.yahoo.com which has alternating
// dd and dt tags for its menus. now that we got rid of
// addImpliedSections() we have to deal with this here, and it will
// be more accurate since addImpliedSections() was often wrong.
if ( prev &&
(orig->m_flags & SEC_HEADING) &&
prev->m_tagId != biggest->m_tagId )
prev = NULL;
// but if prev brother is a blank, we should view that as a delimeter
// BUT really we should have added those sections in with the new
// delimeter logic! but let's put this in for now anyway...
if ( prev && prev->m_firstWordPos < 0 )
prev = NULL;
// if the header section has a prev brother, forget it!
if ( prev ) return;
// . if we gained extra text, that is a no-no then
// . these two checks replaced the two commented out ones above
// . they allow for empty sections preceeding "sr" at any level as
// we telescope it up
if ( biggest->m_firstWordPos != orig->m_firstWordPos ) return;
if ( biggest->m_lastWordPos != orig->m_lastWordPos ) return;
// . now blow up first until just before it hits biggest as well
// . this fixes reverbnation on the nextBrother check below
for ( ; first ; first = first->m_parent ) {
// stop if parent is NULL
if ( ! first->m_parent ) break;
// stop if parent would contain biggest
if ( first->m_parent->contains ( biggest ) ) break;
}
// if after blowing it up "first" contains more than just menu
// sections, then bail. that really was not a menu header!
// fixes reverbnation url that thought "That 1 Guy" was a menu header.
if ( flag == SEC_MENU_HEADER ) {
Section *fx = first;
for ( ; fx ; fx = fx->m_next ) {
// stop when list is over
if ( fx->m_a >= first->m_b ) break;
// ignore if no next
if ( fx->m_flags & SEC_NOTEXT ) continue;
// thats bad if SEC_MENU not set, it should be for all!
if ( fx->m_flags & SEC_MENU ) continue;
// we got these now
if ( fx->m_flags & SEC_MENU_SENTENCE ) continue;
// otherwise, bad!
return;
}
}
// scan until outside biggest
int32_t lastb = biggest->m_b;
// . make sure sr does not contain any list in it
// . scan all sections between sr and "saved"
for ( ; sr ; sr = sr->m_next ) {
// stop if over
if ( sr->m_a >= lastb ) break;
// if we have a brother with same taghash we are
// part of a list
if ( sr->m_nextBrother &&
sr->m_nextBrother->m_tagHash == sr->m_tagHash &&
sr->m_nextBrother != first )
return;
if ( sr->m_prevBrother &&
sr->m_prevBrother->m_tagHash == sr->m_tagHash &&
// for footers
sr->m_prevBrother != first )
return;
}
// restart loop
sr = biggest;
// ok, not part of a list, flag it
for ( ; sr ; sr = sr->m_next ) {
// stop if over
if ( sr->m_a >= lastb ) break;
// flag each subsection
sr->m_flags |= flag; // SEC_MENU_HEADER;
}
}
// . sets the SEC_HEADING bit in Section::m_flags for sections that are
//   most likely headings
// . the WHOLE idea of this algo is to take a list of sections that are all
//   the same tagId/baseHash and differentiate them so we can insert implied
//   sections with headers.
// . always returns true
bool Sections::setHeadingBit ( ) {
	for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
		// word range of our text, skip sections without any
		const int32_t a = si->m_firstWordPos;
		if ( a == -1 ) continue;
		// we must be the smallest container around this text
		if ( m_sectionPtrs[a] != si ) continue;
		const int32_t b = si->m_lastWordPos;

		// . we need to be isolated in our own hard section container
		// . walk up through every parent that holds exactly our text
		// . TODO: what about "<b>Hi There <i>Bob</i></b>" as a heading
		// . i guess that will still work!
		bool     hasHard = false;
		bool     inLink  = false;
		Section *biggest = NULL;
		for ( Section *pp = si ; pp ; pp = pp->m_parent ) {
			// stop if breached
			if ( pp->m_firstWordPos != a ||
			     pp->m_lastWordPos  != b )
				break;
			// record this
			if ( pp->m_tagId == TAG_A ) inLink = true;
			// record the biggest section containing just our text
			biggest = pp;
			// . hard sections count, and so do bold and strong tags
			// . the latter fixes gwair.org which has the dates of
			//   the month in strong tags. so we need to set
			//   SEC_HEADING for those so getDelimHash() will
			//   recognize such tags as date header tags in the
			//   METHOD_DOM algorithm and we get the proper
			//   implied sections
			if ( isHardSection(pp) ||
			     pp->m_tagId == TAG_STRONG ||
			     pp->m_tagId == TAG_B )
				hasHard = true;
		}
		// need to be isolated in a hard section
		if ( ! hasHard ) continue;

		// now make sure the text is capitalized etc
		bool    hadUpper   = false;
		bool    hadYear    = false;
		bool    hadAlpha   = false;
		int32_t lowerCount = 0;
		// scan the alnum words we contain
		for ( int32_t i = a ; i <= b ; i++ ) {
			const auto &token = (*m_tr)[i];
			// . did we hit a breaking tag?
			// . "<div> blah <table><tr><td>blah... </div>"
			if ( token.nodeid && isBreakingTagId(token.nodeid) ) break;
			// skip if not alnum word
			if ( ! token.is_alfanum ) continue;
			// four digit numbers do not count as text
			if ( token.token_len == 4 &&
			     is_digit(token.token_start[0]) &&
			     is_digit(token.token_start[1]) &&
			     is_digit(token.token_start[2]) &&
			     is_digit(token.token_start[3]) ) {
				// . but if we had a year like "2010" that
				//   is allowed to be a header.
				// . this fixes 770kob.com because the events
				//   under the "2010" header were telescoping
				//   up into events in the "December 2009"
				//   section, when they should have been in
				//   their own section! and now they are in
				//   their own implied section...
				const int32_t num = atol2(token.token_start,token.token_len);
				if ( num >= 1800 && num <= 2100 ) hadYear = true;
				continue;
			}
			// mark this
			hadAlpha = true;
			// is it upper?
			if ( is_upper_utf8(token.token_start) ) {
				hadUpper = true;
				continue;
			}
			// skip stop words
			if ( isStopWord(token.token_start, token.token_len, token.token_hash) ) continue;
			// . skip short words
			// . November 4<sup>th</sup> for facebook.com
			if ( token.token_len <= 2 ) continue;
			// is it lower?
			if ( is_lower_utf8(token.token_start) ) lowerCount++;
			// stop now if bad
			if ( lowerCount >= 2 ) break;
		}

		// . a header needs an upper case word, though a single year
		//   by itself is ok too
		// . and we allow for only one lower case mistake
		const bool isHeader =
			( hadUpper || ( hadYear && ! hadAlpha ) ) &&
			lowerCount < 2;
		if ( ! isHeader ) continue;

		// ok, mark this section as a heading section
		si->m_flags |= SEC_HEADING;
		// a hack!
		if ( inLink ) biggest->m_flags |= SEC_LINK_TEXT;
	}
	return true;
}
void Sections::setTagHashes ( ) {
if ( m_numSections == 0 ) return;
// now recompute the tagHashes and depths and content hashes since
// we have eliminate open-ended sections in the loop above
for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) {
// these have to be in order of sn->m_a to work right
// because we rely on the parent tag hash, which would not
// necessarily be set if we were not sorted, because the
// parent section could have SEC_FAKE flag set because it is
// a br section added afterwards.
// shortcut
int64_t bh = (int64_t)sn->m_baseHash;
// sanity check
if ( bh == 0 ) { g_process.shutdownAbort(true); }
// if no parent, use initial values
if ( ! sn->m_parent ) {
sn->m_depth = 0;
sn->m_tagHash = bh;
// sanity check
if ( bh == 0 ) { g_process.shutdownAbort(true); }
continue;
}
// sanity check
if ( sn->m_parent->m_tagHash == 0 ) { g_process.shutdownAbort(true); }
// . update the cumulative front tag hash
// . do not include hyperlinks as part of the cumulative hash!
sn->m_tagHash = hash32h ( bh , sn->m_parent->m_tagHash );
sn->m_colorHash = hash32h ( bh , sn->m_parent->m_colorHash );
// if we are an implied section, just use the tag hash of
// our parent. that way since we add different implied
// sections for msichicago.com root than we do the kid,
// the section voting should still match up
if ( bh == BH_IMPLIED ) {
sn->m_tagHash = sn->m_parent->m_tagHash;
}
if ( sn->m_tagHash == 0 ) {
sn->m_tagHash = 1234567;
}
// depth based on parent, too
sn->m_depth = sn->m_parent->m_depth + 1;
}
}
// . make this replace ::print() when it works
// . convenience wrapper: packs the arguments into a PrintData and hands
//   it to print(PrintData *), returning whatever that returns
bool Sections::print( SafeBuf *sbuf, int32_t hiPos, const int32_t *wposVec, const char *densityVec, const char *wordSpamVec, const char *fragVec ) const {
	PrintData args;
	// where the output goes
	args.sbuf        = sbuf;
	// the per-word vectors
	args.wposVec     = wposVec;
	args.densityVec  = densityVec;
	args.wordSpamVec = wordSpamVec;
	args.fragVec     = fragVec;
	// words at this word position get highlighted
	args.hiPos       = hiPos;
	return print(&args);
}
// . appends an html dump of all the sections to pd->sbuf
// . first the nested section divs, then a table with one row per section
// . aborts if a word maps to a section that does not contain it
// . always returns true
bool Sections::print(PrintData *pd) const {
	SafeBuf *out = pd->sbuf;
	out->setLabel ("sectprnt");

	const int32_t nw = m_tr->size();

	// check words: each must lie in [m_a,m_b) of the section it maps to
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		const Section *sn = m_sectionPtrs[i];
		if ( sn->m_a >  i ) { g_process.shutdownAbort(true); }
		if ( sn->m_b <= i ) { g_process.shutdownAbort(true); }
	}

	// print sections out. each call prints a section and everything in
	// it, so afterwards hop to the section of the first word after it.
	Section *sk = m_rootSection;
	while ( sk ) {
		printSectionDiv(pd,sk);
		const int32_t end = sk->m_b;
		// stop if last
		if ( end >= m_nw ) break;
		sk = m_sectionPtrs[end];
	}

	// table header, also re-used whenever we start a new table
	const char *hdr =
		"<table border=1>"
		"<tr>"
		"<td><b>sec #</b></td>"
		"<td><b>wordStart</b></td>"
		"<td><b>wordEnd</b></td>"
		"<td><b>baseHash</b></td>"
		"<td><b>cumulTagHash</b></td>"
		"<td><b>contentHash</b></td>"
		"<td><b>contentTagHash</b></td>"
		"<td><b>XOR</b></td>" // only valid for contentHashes
		"<td><b>depth</b></td>"
		"<td><b>parent word range</b></td>"
		"<td><b>flags</b></td>"
		"<td><b>evIds</b></td>"
		"<td><b>text snippet</b></td>"
		"</tr>\n";
	out->safePrintf("%s",hdr);

	int32_t rcount = 0;
	int32_t scount = 0;
	// show word # of each section so we can look in PageParser.cpp's
	// output to see exactly where it starts, since we now label all
	// the words
	for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) {
		// see if one big table causes a browser slowdown
		if ( (++rcount % TABLE_ROWS ) == 0 )
			out->safePrintf("</table>%s\n",hdr);

		// the XOR column, "--" if there is no content hash
		char ttt[100];
		const char *xs = "--";
		if ( sn->m_contentHash64 ) {
			int32_t modified = sn->m_tagHash ^ sn->m_contentHash64;
			sprintf(ttt,"0x%" PRIx32,modified);
			xs = ttt;
		}

		// parent word range, -1 to -1 if no parent
		int32_t pswn = -1;
		int32_t pewn = -1;
		if ( const Section *parent = sn->m_parent ) {
			pswn = parent->m_a;
			pewn = parent->m_b;
		}

		// print the numeric columns and open the flags column
		const int32_t secNum = scount++;
		out->safePrintf("<tr><td>%" PRId32 "</td>\n"
				"<td>%" PRId32 "</td>"
				"<td>%" PRId32 "</td>"
				"<td>0x%" PRIx32 "</td>"
				"<td>0x%" PRIx32 "</td>"
				"<td>0x%" PRIx32 "</td>"
				"<td>0x%" PRIx32 "</td>"
				"<td>%s</td>"
				"<td>%" PRId32 "</td>"
				"<td><nobr>%" PRId32 " to %" PRId32 "</nobr></td>"
				"<td><nobr>" ,
				secNum,
				sn->m_a,
				sn->m_b,
				(int32_t)sn->m_baseHash,
				(int32_t)sn->m_tagHash,
				(int32_t)sn->m_contentHash64,
				(int32_t)(sn->m_contentHash64^sn->m_tagHash),
				xs,
				sn->m_depth,
				pswn,
				pewn);
		// now show the flags
		printFlags ( out , sn );

		// word range of the section
		const int32_t a = sn->m_a;
		const int32_t b = sn->m_b;
		// -1 means an unclosed tag!! should no longer be the case
		if ( b == -1 ) { g_process.shutdownAbort(true); }

		// close flags column, blank evIds column, open snippet column
		out->safePrintf("</nobr></td>"
				"<td>&nbsp;</td>"
				"<td><nobr>");

		// first few words of section, 70 chars max
		const int32_t max = 70;
		int32_t count     = 0;
		bool    truncated = false;
		// do not print last word/tag in section here
		for ( int32_t i = a ; i < b - 1 && count < max ; i++ ) {
			const char *s    = (*m_tr)[i].token_start;
			int32_t     slen = (*m_tr)[i].token_len;
			// clip the token that crosses the limit
			if ( count + slen > max ) {
				truncated = true;
				slen = max - count;
			}
			count += slen;
			// boldify front tag
			const bool isFront = ( i == a );
			if ( isFront ) out->safePrintf("<b>");
			out->htmlEncode(s,slen,false);
			if ( isFront ) out->safePrintf("</b>");
		}
		// if we truncated print a ...
		if ( truncated ) out->safePrintf("<b>…</b>");
		// then print ending tag, at most 20 chars of it
		if ( b < nw ) {
			// b is from m_b and always>0 so indexing b-1 is safe
			const auto &endToken = (*m_tr)[b-1];
			int32_t blen = endToken.token_len;
			if ( blen>20 ) blen = 20;
			out->safePrintf("<b>");
			out->htmlEncode(endToken.token_start,blen,false);
			out->safePrintf("</b>");
		}
		out->safePrintf("</nobr></td></tr>\n");
	}
	out->safePrintf("</table>\n<br>\n");
	return true;
}
// . appends section "sk" to pd->sbuf as a colored html div, recursing into
//   the sections it contains
// . the div starts with the section's front tag (if any) followed by its
//   word #, hashes and flags in italics, then come its words
// . alnum words are bolded and get a subscript with their word position
//   plus density rank and word spam rank when those are not the max
// . always returns true
bool Sections::printSectionDiv(PrintData *pd, const Section *sk) const {
	SafeBuf *out = pd->sbuf;

	// enter a new div section now
	out->safePrintf("<br>");

	// background color comes from the color hash
	const int32_t bcolor = (int32_t)sk->m_colorHash & 0x00ffffff;
	// pull out the three color bytes with shifts so the result does not
	// depend on the byte order of the machine
	const int32_t c0 =   bcolor         & 0xff;
	const int32_t c1 = ( bcolor >>  8 ) & 0xff;
	const int32_t c2 = ( bcolor >> 16 ) & 0xff;
	// dark if all three are below 128
	bool dark = ( c0<128 && c1<128 && c2<128 );
	// or if any two are less than 100
	if ( (c0<100 && c1<100) || (c1<100 && c2<100) || (c0<100 && c2<100) )
		dark = true;
	// if bg color is dark, make font and border color light
	const int32_t fcolor = dark ? 0x00ffffff : 0x000000;
	const int32_t rcolor = fcolor;

	// start the new div
	out->safePrintf("<div "
			"style=\""
			"background-color:#%06" PRIx32 ";"
			"margin-left:20px;"
			"border:#%06" PRIx32 " 1px solid;"
			"color:#%06" PRIx32 "\">",
			bcolor,
			rcolor,
			fcolor);

	// the root does not print its front tag if the next section starts
	// at the same word, that section will print it
	bool printWord = true;
	if ( ! sk->m_parent && sk->m_next && sk->m_next->m_a == sk->m_a )
		printWord = false;

	// print our front tag
	if ( !(sk->m_flags&SEC_FAKE) && sk->m_tagId && printWord )
		out->htmlEncode((*m_tr)[sk->m_a].token_start, (*m_tr)[sk->m_a].token_len, false);

	out->safePrintf("<i>");
	// print word # we start at
	out->safePrintf("A=%" PRId32 " ",sk->m_a);
	// print tag hash now
	out->safePrintf("taghash=%" PRIu32 " ",(uint32_t)sk->m_tagHash);
	if ( sk->m_contentHash64 )
		out->safePrintf("ch64=%" PRIu64 " ",sk->m_contentHash64);
	// print the flags
	printFlags ( out , sk );
	if ( isHardSection(sk) )
		out->safePrintf("hardsec ");
	out->safePrintf("</i>\n");

	// now print each word and subsection in this section
	const int32_t a = sk->m_a;
	const int32_t b = sk->m_b;
	for ( int32_t i = a ; i < b ; i++ ) {
		const auto &token = (*m_tr)[i];
		// . if it is our own front tag, skip it. it belongs to the
		//   div header printed above.
		// . BUT if we are root then really this tag belongs to
		//   our first child, so make an exception for root!
		// . this has to test for a tag (nodeid), not for an alnum
		//   word, otherwise the front tag shows up twice and
		//   sections that start on a word lose their first word
		if ( i == a && token.nodeid && sk->m_parent ) continue;

		// . get section of this word
		// . TODO: what if this was the tr tag we removed??? i guess
		//   maybe make it NULL now?
		Section *ws = m_sectionPtrs[i];
		// get top most parent that starts at the same word position
		// and is not "sk"
		for ( ; ; ws = ws->m_parent ) {
			if ( ws == sk ) break;
			if ( ! ws->m_parent ) break;
			if ( ws->m_parent->m_a != ws->m_a ) break;
			if ( ws->m_parent == sk ) break;
		}
		// if it belongs to another section, print that section
		if ( ws != sk ) {
			printSectionDiv(pd,ws);
			// advance to end of that then and try next word
			i = ws->m_b - 1;
			continue;
		}

		// ignore if in style section, etc. just print it out
		if ( sk->m_flags & NOINDEXFLAGS ) {
			out->htmlEncode(token.token_start,token.token_len,false );
			continue;
		}

		// tags and punctuation are printed as they are
		if ( ! token.is_alfanum ) {
			out->htmlEncode(token.token_start, token.token_len, false );
			continue;
		}

		//
		// alnum words get decorated
		//
		// is this the word position to highlight?
		const bool isHiPos = ( pd->wposVec[i] == pd->hiPos );
		// strike out the word if its fragVec entry is zero
		const bool strike  = ( i < MAXFRAGWORDS && pd->fragVec[i] == 0 );

		if ( isHiPos ) out->safePrintf("<a name=hipos></a>");
		// boldify alnum words
		out->safePrintf("<nobr><b>");
		if ( strike ) out->safePrintf("<strike>");
		if ( isHiPos )
			out->safePrintf("<blink style=\""
					"background-color:yellow;"
					"color:black;\">");
		// print that word
		out->htmlEncode(token.token_start, token.token_len, false );
		if ( isHiPos ) out->safePrintf("</blink>");
		if ( strike ) out->safePrintf("</strike>");
		out->safePrintf("</b>");

		// and print out their pos/density/spam sub
		out->safePrintf("<sub "
				"style=\"background-color:white;"
				"font-size:10px;"
				"border:black 1px solid;"
				"color:black;\">");
		out->safePrintf("%" PRId32, pd->wposVec[i]);
		if ( pd->densityVec[i] != MAXDENSITYRANK )
			out->safePrintf("/<font color=purple><b>%" PRId32
					"</b></font>"
					,
					(int32_t)pd->densityVec[i]);
		if ( pd->wordSpamVec[i] != MAXWORDSPAMRANK )
			out->safePrintf("/<font color=red><b>%" PRId32
					"</b></font>"
					,
					(int32_t)pd->wordSpamVec[i]);
		out->safePrintf("</sub></nobr>");
	}
	out->safePrintf("</div>\n");
	return true;
}
bool Sections::verifySections ( ) {
// make sure we map each word to a section that contains it at least
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
Section *si = m_sectionPtrs[i];
if ( si->m_a > i ) { g_process.shutdownAbort(true); }
if ( si->m_b <= i ) { g_process.shutdownAbort(true); }
// must have checksum
if ( (*m_tr)[i].is_alfanum && si->m_contentHash64==0) { g_process.shutdownAbort(true); }
// must have this set if 0
if ( ! si->m_contentHash64 && !(si->m_flags & SEC_NOTEXT)) {
g_process.shutdownAbort(true);}
if ( si->m_contentHash64 && (si->m_flags & SEC_NOTEXT)) {
g_process.shutdownAbort(true);}
}
// sanity check
for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) {
// get it
//Section *sn = &m_sections[i];
// get parent
for(const Section *sp = sn->m_parent; sp; sp = sp->m_parent) {
// make sure parent fully contains
if ( sp->m_a > sn->m_a ) { g_process.shutdownAbort(true); }
if ( sp->m_b < sn->m_b ) { g_process.shutdownAbort(true); }
// and make sure every grandparent fully contains us too!
}
}
// sanity check
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
Section *sn = &m_sections[i];
if ( sn->m_a >= sn->m_b ) { g_process.shutdownAbort(true); }
}
// sanity check, make sure each section is contained by the
// smallest section containing it
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
for ( Section *sj = m_rootSection ; sj ; sj = sj->m_next ) {
// skip if us
if ( sj == si ) continue;
// skip column sections because they are artificial
// and only truly contain some of the sections that
// their [a,b) interval says they contain.
if ( sj->m_tagId == TAG_TC ) continue;
// or if an implied section of td tags in a tc
if ( sj->m_baseHash == BH_IMPLIED &&
sj->m_parent &&
sj->m_parent->m_tagId == TAG_TC )
continue;
// skip if sj does not contain first word in si
if ( sj->m_a > si->m_a ) continue;
if ( sj->m_b <= si->m_a ) continue;
// ok, make sure in our parent path
Section *ps = si;
for ( ; ps ; ps = ps->m_parent )
if ( ps == sj ) break;
// ok if we found it
if ( ps ) continue;
// sometimes if sections are equal then the other
// is the parent
ps = sj;
for ( ; ps ; ps = ps->m_parent )
if ( ps == si ) break;
// must have had us
if ( ps ) continue;
g_process.shutdownAbort(true);
}
}
// make sure we map each word to a section that contains it at least
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
Section *si = m_sectionPtrs[i];
if ( si->m_a > i ) { g_process.shutdownAbort(true); }
if ( si->m_b <= i ) { g_process.shutdownAbort(true); }
}
return true;
}