2013-08-02 16:12:24 -04:00
|
|
|
|
// print events should print if nothing else to print
|
|
|
|
|
|
|
|
|
|
// When a div tag's parent truncates the div's section, the div may already
// have been paired up with a div back tag, which should then become free
// again. That is the problem: those back tags end up unpaired.
// So a parent should constrain its child as SOON as the parent itself is
// constrained, and close the child at that point. That way the child cannot
// falsely pair-claim a div back tag.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "Sections.h"
|
|
|
|
|
#include "Url.h"
|
2018-03-09 10:24:39 -05:00
|
|
|
|
#include "tokenizer.h"
|
2013-08-02 16:12:24 -04:00
|
|
|
|
#include "Conf.h"
|
|
|
|
|
#include "XmlDoc.h"
|
|
|
|
|
#include "Bits.h"
|
|
|
|
|
#include "sort.h"
|
|
|
|
|
#include "Abbreviations.h"
|
2018-02-26 07:33:26 -05:00
|
|
|
|
#include "StopWords.h"
|
2016-06-20 06:30:26 -04:00
|
|
|
|
#include "Process.h"
|
2016-07-05 06:54:26 -04:00
|
|
|
|
#include "Posdb.h"
|
2018-03-09 10:24:39 -05:00
|
|
|
|
#include "GbUtil.h"
|
2018-07-26 11:29:51 -04:00
|
|
|
|
#include "Errno.h"
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// Build an empty Sections object. m_sections is nulled up front and reset()
// then puts every member (including m_sections again) into its empty state.
Sections::Sections() {
	m_sections = nullptr;
	reset();
}
|
|
|
|
|
|
|
|
|
|
void Sections::reset() {
|
2013-10-06 13:04:10 -04:00
|
|
|
|
m_sectionBuf.purge();
|
|
|
|
|
m_sectionPtrBuf.purge();
|
2016-02-23 08:09:26 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
m_sections = NULL;
|
|
|
|
|
m_bits = NULL;
|
|
|
|
|
m_numSections = 0;
|
|
|
|
|
m_rootSection = NULL;
|
|
|
|
|
m_lastSection = NULL;
|
|
|
|
|
m_lastAdded = NULL;
|
|
|
|
|
m_nw = 0;
|
2018-02-23 07:53:03 -05:00
|
|
|
|
m_firstSentence = NULL;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
m_sectionPtrs = NULL;
|
2016-09-27 11:24:19 -04:00
|
|
|
|
|
|
|
|
|
// Coverity
|
2018-03-09 10:24:39 -05:00
|
|
|
|
m_tr = NULL;
|
2016-09-27 11:24:19 -04:00
|
|
|
|
m_contentType = 0;
|
|
|
|
|
m_isRSSExt = false;
|
|
|
|
|
m_maxNumSections = 0;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Tear down by delegating to reset(), which purges the section buffers and
// clears the remaining members.
Sections::~Sections() {
	reset();
}
|
|
|
|
|
|
|
|
|
|
#define TXF_MATCHED 1
|
|
|
|
|
|
|
|
|
|
// an element on the stack is a Tag
|
|
|
|
|
class Tagx {
|
|
|
|
|
public:
|
|
|
|
|
// id of the fron tag we pushed
|
|
|
|
|
nodeid_t m_tid;
|
|
|
|
|
// section number we represent
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t m_secNum;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// set to TXF_MATCHED
|
|
|
|
|
char m_flags;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// I lowered this from 1000 to 300 so that we are more sensitive to malformed
// pages, because they typically seem to take longer to parse. I also added
// some new logic for dealing with table tr and td back tags that allows us to
// pop off the other contained tags right away rather than delaying it until
// we are done, because that will often breach this stack.
|
|
|
|
|
#define MAXTAGSTACK 300
|
|
|
|
|
|
|
|
|
|
// . returns false if blocked, true otherwise
|
|
|
|
|
// . returns true and sets g_errno on error
|
|
|
|
|
// . sets m_sections[] array, 1-1 with words array "w"
|
|
|
|
|
// . the Weights class can look at these sections and zero out the weights
|
|
|
|
|
// for words in script, style, select and marquee sections
|
2018-03-09 10:24:39 -05:00
|
|
|
|
bool Sections::set(const TokenizerResult *tr, Bits *bits, const Url *url, uint8_t contentType ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
reset();
|
|
|
|
|
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( ! tr ) return true;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( tr->size() > 1000000 ) {
|
2014-02-25 14:09:07 -05:00
|
|
|
|
log("sections: over 1M words. skipping sections set for "
|
|
|
|
|
"performance.");
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// save it
|
2018-03-09 10:24:39 -05:00
|
|
|
|
m_tr = tr;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
m_bits = bits;
|
|
|
|
|
m_contentType = contentType;
|
|
|
|
|
|
|
|
|
|
// reset this just in case
|
|
|
|
|
g_errno = 0;
|
|
|
|
|
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( tr->empty() ) return true;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
2016-01-04 07:55:46 -05:00
|
|
|
|
// shortcuts
|
2018-03-09 10:24:39 -05:00
|
|
|
|
int32_t nw = tr->size();
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
m_isRSSExt = false;
|
2018-02-22 12:05:46 -05:00
|
|
|
|
const char *ext = url->getExtension();
|
2013-08-02 16:12:24 -04:00
|
|
|
|
if ( ext && strcasecmp(ext,"rss") == 0 ) m_isRSSExt = true;
|
|
|
|
|
if ( m_contentType == CT_XML ) m_isRSSExt = true;
|
|
|
|
|
|
|
|
|
|
// . how many sections do we have max?
|
|
|
|
|
// . init at one to count the root section
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t max = 1;
|
|
|
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
2018-03-09 10:24:39 -05:00
|
|
|
|
const auto &token = (*tr)[i];
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// . count all front tags
|
2016-02-24 10:12:05 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// count back tags too since some url
|
|
|
|
|
// http://www.tedxhz.com/tags.asp?id=3919&id2=494 had a bunch
|
|
|
|
|
// of </p> tags with no front tags and it cored us because
|
|
|
|
|
// m_numSections > m_maxNumSections!
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( token.nodeid ) {
|
2016-02-24 10:12:05 -05:00
|
|
|
|
max += 2;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// . any punct tag could have a bullet in it...
|
|
|
|
|
// . or if its a period could make a sentence section
|
2018-03-09 10:24:39 -05:00
|
|
|
|
} else if ( !token.is_alfanum ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// only do not count simple spaces
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( token.token_len == 1 && is_wspace_a(token.token_start[0]))
|
2013-08-02 16:12:24 -04:00
|
|
|
|
continue;
|
|
|
|
|
// otherwise count it as sentence delimeter
|
|
|
|
|
max++;
|
|
|
|
|
}
|
|
|
|
|
}
|
2016-02-24 10:12:05 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// . then \0 allows for a sentence too!
|
|
|
|
|
// . fix doc that was just "localize-sf-prod\n"
|
|
|
|
|
max++;
|
2016-02-24 10:12:05 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// and each section may create a sentence section
|
|
|
|
|
max *= 2;
|
2013-10-06 13:04:10 -04:00
|
|
|
|
|
2016-09-08 11:44:05 -04:00
|
|
|
|
// truncate if excessive.
|
2013-10-06 13:04:10 -04:00
|
|
|
|
if ( max > 1000000 ) {
|
|
|
|
|
log("sections: truncating max sections to 1000000");
|
|
|
|
|
max = 1000000;
|
|
|
|
|
}
|
|
|
|
|
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t need = max * sizeof(Section);
|
2013-10-06 13:04:10 -04:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// set this
|
|
|
|
|
m_maxNumSections = max;
|
|
|
|
|
|
2013-11-20 01:53:40 -05:00
|
|
|
|
m_sectionPtrBuf.setLabel("psectbuf");
|
|
|
|
|
|
2013-10-06 13:04:10 -04:00
|
|
|
|
// separate buf now for section ptr for each word
|
2014-11-14 20:30:32 -05:00
|
|
|
|
if ( ! m_sectionPtrBuf.reserve ( nw *sizeof(Section *)) ) return true;
|
2013-10-06 13:04:10 -04:00
|
|
|
|
m_sectionPtrs = (Section **)m_sectionPtrBuf.getBufStart();
|
|
|
|
|
|
|
|
|
|
// allocate m_sectionBuf
|
2013-08-02 16:12:24 -04:00
|
|
|
|
m_sections = NULL;
|
2013-10-06 13:04:10 -04:00
|
|
|
|
|
2013-11-20 01:53:40 -05:00
|
|
|
|
m_sectionBuf.setLabel ( "sectbuf" );
|
|
|
|
|
|
2013-10-06 13:04:10 -04:00
|
|
|
|
if ( ! m_sectionBuf.reserve ( need ) )
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
// point into it
|
|
|
|
|
m_sections = (Section *)m_sectionBuf.getBufStart();
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// save this too
|
|
|
|
|
m_nw = nw;
|
|
|
|
|
|
|
|
|
|
// stack of front tags we encounter
|
|
|
|
|
Tagx stack[MAXTAGSTACK];
|
|
|
|
|
Tagx *stackPtr = stack;
|
|
|
|
|
|
|
|
|
|
Section *current = NULL;
|
|
|
|
|
Section *rootSection = NULL;
|
|
|
|
|
|
|
|
|
|
// assume none
|
|
|
|
|
m_rootSection = NULL;
|
|
|
|
|
|
|
|
|
|
// only add root section if we got some words
|
|
|
|
|
if ( nw > 0 ) {
|
|
|
|
|
// record this i guess
|
|
|
|
|
rootSection = &m_sections[m_numSections];
|
|
|
|
|
// clear
|
|
|
|
|
memset ( rootSection , 0 , sizeof(Section) );
|
|
|
|
|
// . the current section we are in
|
|
|
|
|
// . let's use a root section
|
|
|
|
|
current = rootSection;
|
|
|
|
|
// init that to be the whole page
|
|
|
|
|
rootSection->m_b = nw;
|
|
|
|
|
// save it
|
|
|
|
|
m_rootSection = rootSection;
|
|
|
|
|
// to fix a core dump
|
|
|
|
|
rootSection->m_baseHash = 1;
|
|
|
|
|
// advance
|
|
|
|
|
m_numSections++;
|
|
|
|
|
}
|
|
|
|
|
|
2014-11-17 21:13:36 -05:00
|
|
|
|
// Sections are no longer 1-1 with words, just with front tags
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
2018-03-09 10:24:39 -05:00
|
|
|
|
const auto &token = (*tr)[i];
|
|
|
|
|
nodeid_t fullTid = token.nodeid;
|
2016-02-23 15:25:32 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// are we a non-tag?
|
|
|
|
|
if ( ! fullTid ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// make a single section for input tags
|
|
|
|
|
if ( fullTid == TAG_INPUT ||
|
|
|
|
|
fullTid == TAG_HR ||
|
|
|
|
|
fullTid == TAG_COMMENT ) {
|
2013-10-06 13:04:10 -04:00
|
|
|
|
// try to realloc i guess. should keep ptrs in tact.
|
2016-09-08 11:44:05 -04:00
|
|
|
|
if ( m_numSections >= m_maxNumSections) {
|
|
|
|
|
g_errno = EDOCBADSECTIONS;
|
2013-10-06 13:04:10 -04:00
|
|
|
|
return true;
|
2016-09-08 11:44:05 -04:00
|
|
|
|
}
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get the section
|
|
|
|
|
Section *sn = &m_sections[m_numSections];
|
|
|
|
|
// clear
|
|
|
|
|
memset ( sn , 0 , sizeof(Section) );
|
|
|
|
|
// inc it
|
|
|
|
|
m_numSections++;
|
|
|
|
|
// sanity check - breach check
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( m_numSections > max ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// set our parent
|
|
|
|
|
sn->m_parent = current;
|
|
|
|
|
// need to keep a word range that the section covers
|
|
|
|
|
sn->m_a = i;
|
|
|
|
|
// section consists of just this tag
|
|
|
|
|
sn->m_b = i + 1;
|
|
|
|
|
// go on to next
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// a section of multiple br tags in a sequence
|
|
|
|
|
if ( fullTid == TAG_BR ) {
|
2013-10-06 13:04:10 -04:00
|
|
|
|
// try to realloc i guess. should keep ptrs in tact.
|
2016-09-08 11:44:05 -04:00
|
|
|
|
if ( m_numSections >= m_maxNumSections) {
|
|
|
|
|
g_errno = EDOCBADSECTIONS;
|
2013-10-06 13:04:10 -04:00
|
|
|
|
return true;
|
2016-09-08 11:44:05 -04:00
|
|
|
|
}
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get the section
|
|
|
|
|
Section *sn = &m_sections[m_numSections];
|
|
|
|
|
// clear
|
|
|
|
|
memset ( sn , 0 , sizeof(Section) );
|
|
|
|
|
// inc it
|
|
|
|
|
m_numSections++;
|
|
|
|
|
// sanity check - breach check
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( m_numSections > max ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// set our parent
|
|
|
|
|
sn->m_parent = current;
|
|
|
|
|
// need to keep a word range that the section covers
|
|
|
|
|
sn->m_a = i;
|
|
|
|
|
// count em up
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t brcnt = 1;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// scan for whole sequence
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t lastBrPos = i;
|
|
|
|
|
for ( int32_t j = i + 1 ; j < nw ; j++ ) {
|
2018-03-09 10:24:39 -05:00
|
|
|
|
const auto &token2 = (*tr)[j];
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// claim br tags
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( token2.nodeid == TAG_BR ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
lastBrPos = j;
|
|
|
|
|
brcnt++;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
// break on words
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( token2.is_alfanum ) break;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// all spaces is ok
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( is_wspace_utf8_string(token2.token_start,token2.token_end()) ) continue;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// otherwise, stop on other punct
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
// section consists of just this tag
|
|
|
|
|
sn->m_b = lastBrPos + 1;
|
|
|
|
|
// advance
|
|
|
|
|
i = lastBrPos;
|
|
|
|
|
// set this for later so that getDelimHash() returns
|
|
|
|
|
// something different based on the br count for
|
|
|
|
|
// METHOD_ATTRIBUTE
|
|
|
|
|
sn->m_baseHash = 19999 + brcnt;
|
|
|
|
|
// go on to next
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// get the tag id without the back bit
|
|
|
|
|
nodeid_t tid = fullTid & BACKBITCOMP;
|
|
|
|
|
|
|
|
|
|
// . ignore tags with no corresponding back tags
|
|
|
|
|
// . if they have bad html and have front tags
|
|
|
|
|
// with no corresponding back tags, that will hurt!
|
|
|
|
|
// . make exception for <li> tag!!!
|
|
|
|
|
// . was messing up:
|
|
|
|
|
// http://events.kqed.org/events/index.php?com=detail&
|
|
|
|
|
// eID=9812&year=2009&month=11
|
|
|
|
|
// for parsing out events
|
|
|
|
|
// . make excpetion for <p> tag too! most ppl use </p>
|
|
|
|
|
if ( ( ! hasBackTag ( tid ) ||
|
2018-03-09 10:24:39 -05:00
|
|
|
|
token.token_start[1] =='!' || // <!ENTITY rdfns...>
|
|
|
|
|
token.token_start[1] =='?' ) &&
|
2013-08-02 16:12:24 -04:00
|
|
|
|
tid != TAG_P &&
|
|
|
|
|
tid != TAG_LI )
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
// . these imply no back tag
|
|
|
|
|
// . <description />
|
|
|
|
|
// . fixes inconsistency in
|
|
|
|
|
// www.trumba.com/calendars/KRQE_Calendar.rss
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( token.token_start[token.token_len-2] == '/' && tid == TAG_XMLTAG )
|
2013-08-02 16:12:24 -04:00
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
// do not breach the stack
|
|
|
|
|
if ( stackPtr - stack >= MAXTAGSTACK ) {
|
2016-06-29 10:27:18 -04:00
|
|
|
|
log( LOG_WARN, "html: stack breach for %s",url->getUrl());
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// if we set g_errno and return then the url just
|
|
|
|
|
// ends up getting retried once the spider lock
|
|
|
|
|
// in Spider.cpp expires in MAX_LOCK_AGE seconds.
|
|
|
|
|
// about an hour. but really we should completely
|
|
|
|
|
// give up on this. whereas we should retry OOM errors
|
|
|
|
|
// etc. but this just means bad html really.
|
2016-06-29 10:27:18 -04:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// just reset to 0 sections then
|
|
|
|
|
reset();
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
char gotBackTag ;
|
|
|
|
|
if ( fullTid != tid ) gotBackTag = 1;
|
|
|
|
|
else gotBackTag = 0;
|
|
|
|
|
|
|
|
|
|
// "pop tid", tid to pop off stack
|
|
|
|
|
nodeid_t ptid = tid;
|
|
|
|
|
nodeid_t fullPopTid = fullTid;
|
|
|
|
|
|
|
|
|
|
// no nested <li> tags allowed
|
|
|
|
|
if ( fullTid == TAG_LI &&
|
|
|
|
|
stackPtr > stack &&
|
|
|
|
|
((stackPtr-1)->m_tid)==TAG_LI )
|
|
|
|
|
gotBackTag = 2;
|
|
|
|
|
|
|
|
|
|
// no nested <b> tags allowed
|
|
|
|
|
if ( fullTid == TAG_B &&
|
|
|
|
|
stackPtr > stack &&
|
|
|
|
|
((stackPtr-1)->m_tid)==TAG_B )
|
|
|
|
|
gotBackTag = 2;
|
|
|
|
|
|
|
|
|
|
// no nested <a> tags allowed
|
|
|
|
|
if ( fullTid == TAG_A &&
|
|
|
|
|
stackPtr > stack &&
|
|
|
|
|
((stackPtr-1)->m_tid)==TAG_A )
|
|
|
|
|
gotBackTag = 2;
|
|
|
|
|
|
|
|
|
|
// no nested <p> tags allowed
|
|
|
|
|
if ( fullTid == TAG_P &&
|
|
|
|
|
stackPtr > stack &&
|
|
|
|
|
((stackPtr-1)->m_tid)==TAG_P )
|
|
|
|
|
gotBackTag = 2;
|
|
|
|
|
|
|
|
|
|
// no <hN> tags inside a <p> tag
|
|
|
|
|
// fixes http://www.law.berkeley.edu/140.htm
|
|
|
|
|
if ( fullTid >= TAG_H1 &&
|
|
|
|
|
fullTid <= TAG_H5 &&
|
|
|
|
|
stackPtr > stack &&
|
|
|
|
|
((stackPtr-1)->m_tid)==TAG_P ) {
|
|
|
|
|
// match this on stack
|
|
|
|
|
ptid = TAG_P;
|
|
|
|
|
fullPopTid = TAG_P;
|
|
|
|
|
gotBackTag = 2;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// no nested <td> tags allowed
|
|
|
|
|
if ( fullTid == TAG_TD &&
|
|
|
|
|
stackPtr > stack &&
|
|
|
|
|
((stackPtr-1)->m_tid)==TAG_TD )
|
|
|
|
|
gotBackTag = 2;
|
|
|
|
|
|
|
|
|
|
// encountering <tr> when in a <td> closes the <td> AND
|
|
|
|
|
// should also close the <tr>!!
|
|
|
|
|
if ( fullTid == TAG_TR &&
|
|
|
|
|
stackPtr > stack &&
|
|
|
|
|
((stackPtr-1)->m_tid)==TAG_TD )
|
|
|
|
|
gotBackTag = 2;
|
|
|
|
|
|
|
|
|
|
// no nested <tr> tags allowed
|
|
|
|
|
if ( fullTid == TAG_TR &&
|
|
|
|
|
stackPtr > stack &&
|
|
|
|
|
((stackPtr-1)->m_tid)==TAG_TR )
|
|
|
|
|
gotBackTag = 2;
|
|
|
|
|
|
|
|
|
|
// this is true if we are a BACK TAG
|
|
|
|
|
if ( gotBackTag ) {
|
|
|
|
|
|
|
|
|
|
// ignore span tags that are non-breaking because they
|
|
|
|
|
// do not change the grouping/sectioning behavior of
|
|
|
|
|
// the web page and are often abused.
|
|
|
|
|
if ( ptid == TAG_SPAN ) continue;
|
|
|
|
|
|
|
|
|
|
// fix for gwair.org
|
|
|
|
|
if ( ptid == TAG_FONT ) continue;
|
|
|
|
|
|
|
|
|
|
// too many people use these like a <br> tag or
|
|
|
|
|
// make them open-ended or unbalanced
|
|
|
|
|
//if ( tid == TAG_P ) continue;
|
|
|
|
|
if ( ptid == TAG_CENTER ) continue;
|
|
|
|
|
|
|
|
|
|
subloop:
|
|
|
|
|
// don't blow the stack
|
|
|
|
|
if ( stackPtr == stack ) continue;
|
|
|
|
|
|
|
|
|
|
// point to it
|
|
|
|
|
Tagx *spp = (stackPtr - 1);
|
|
|
|
|
|
|
|
|
|
// init it
|
|
|
|
|
Tagx *p ;
|
|
|
|
|
// scan through the stack until we find a
|
|
|
|
|
// front tag that matches this back tag
|
|
|
|
|
//for(p = spp ; p >= stack && gotBackTag == 1 ; p-- ) {
|
|
|
|
|
for ( p = spp ; p >= stack ; p-- ) {
|
|
|
|
|
// no match?
|
|
|
|
|
if ( p->m_tid != ptid ) {
|
|
|
|
|
// matched before? we can pop
|
|
|
|
|
if ( p->m_flags & TXF_MATCHED )
|
|
|
|
|
continue;
|
|
|
|
|
// keep on going
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
// do not double match
|
|
|
|
|
if ( p->m_flags & TXF_MATCHED )
|
|
|
|
|
continue;
|
|
|
|
|
// flag it cuz we matched it
|
|
|
|
|
p->m_flags |= TXF_MATCHED;
|
|
|
|
|
// set the stack ptr to it
|
|
|
|
|
spp = p;
|
|
|
|
|
// and stop
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// no matching front tag at all?
|
|
|
|
|
// then just ignore this back tag
|
|
|
|
|
if ( p < stack ) continue;
|
|
|
|
|
|
|
|
|
|
// get section number of the front tag
|
2014-11-10 17:45:11 -05:00
|
|
|
|
//int32_t xn = *(secNumPtr-1);
|
|
|
|
|
int32_t xn = spp->m_secNum;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// sanity
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( xn<0 || xn>=m_numSections ) {g_process.shutdownAbort(true);}
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get it
|
|
|
|
|
Section *sn = &m_sections[xn];
|
|
|
|
|
|
|
|
|
|
// record the word range of the secion we complete
|
|
|
|
|
sn->m_b = i+1;
|
|
|
|
|
|
|
|
|
|
// do not include the <li> tag as part of it
|
|
|
|
|
// otherwise we end up with overlapping section since
|
|
|
|
|
// this tag ALSO starts a section!!
|
|
|
|
|
if ( gotBackTag == 2 ) sn->m_b = i;
|
|
|
|
|
|
|
|
|
|
// if our parent got closed before "sn" closed because
|
|
|
|
|
// it hit its back tag before we hit ours, then we
|
2016-01-05 05:39:00 -05:00
|
|
|
|
// must cut ourselves short and try to match this
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// back tag to another front tag on the stack
|
|
|
|
|
Section *ps = sn->m_parent;
|
|
|
|
|
for ( ; ps != rootSection ; ps = ps->m_parent ) {
|
2014-11-17 21:13:36 -05:00
|
|
|
|
// skip if parent no longer contains us!
|
2013-08-02 16:12:24 -04:00
|
|
|
|
if ( ps->m_b <= sn->m_a ) continue;
|
|
|
|
|
// skip if this parent is still open
|
|
|
|
|
if ( ps->m_b <= 0 ) continue;
|
|
|
|
|
// parent must have closed before us
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( ps->m_b > sn->m_b ) {g_process.shutdownAbort(true);}
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2016-01-04 07:55:46 -05:00
|
|
|
|
// cut our end shorter
|
2013-08-02 16:12:24 -04:00
|
|
|
|
sn->m_b = ps->m_b;
|
|
|
|
|
// our TXF_MATCHED bit should still be set
|
|
|
|
|
// for spp->m_flags, so try to match ANOTHER
|
|
|
|
|
// front tag with this back tag now
|
|
|
|
|
if ( ! ( spp->m_flags & TXF_MATCHED ) ) {
|
2016-06-20 06:30:26 -04:00
|
|
|
|
g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// ok, try to match this back tag with another
|
|
|
|
|
// front tag on the stack, because the front
|
2016-01-05 05:39:00 -05:00
|
|
|
|
// tag we had selected got cut short because
|
|
|
|
|
// its parent forced it to cut short.
|
2013-08-02 16:12:24 -04:00
|
|
|
|
goto subloop;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// sanity check
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( sn->m_b <= sn->m_a ) { g_process.shutdownAbort(true);}
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// revert it to this guy, may not equal stackPtr-1 !!
|
|
|
|
|
stackPtr = spp;
|
|
|
|
|
|
|
|
|
|
// get parent section
|
|
|
|
|
if ( stackPtr > stack ) {
|
|
|
|
|
// get parent section now
|
|
|
|
|
xn = (stackPtr-1)->m_secNum;
|
|
|
|
|
// set current to that
|
|
|
|
|
current = &m_sections[xn];
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
// i guess this is bad html!
|
|
|
|
|
current = rootSection;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// debug log
|
|
|
|
|
if ( g_conf.m_logDebugSections ) {
|
2016-05-30 10:12:42 -04:00
|
|
|
|
const char *ms = "";
|
2013-08-02 16:12:24 -04:00
|
|
|
|
if ( stackPtr->m_tid != ptid) ms =" UNMATCHED";
|
2016-05-30 10:12:42 -04:00
|
|
|
|
const char *back ="";
|
2013-08-02 16:12:24 -04:00
|
|
|
|
if ( fullPopTid & BACKBIT ) back = "/";
|
2016-05-20 03:18:32 -04:00
|
|
|
|
logf(LOG_DEBUG,"section: pop tid=%" PRId32" "
|
|
|
|
|
"i=%" PRId32" "
|
|
|
|
|
"level=%" PRId32" "
|
2013-08-02 16:12:24 -04:00
|
|
|
|
"%s%s "
|
2016-05-20 03:18:32 -04:00
|
|
|
|
//"h=0x%" PRIx32
|
2014-11-10 17:45:11 -05:00
|
|
|
|
"%s",(int32_t)tid,
|
2013-08-02 16:12:24 -04:00
|
|
|
|
i,
|
2014-11-10 17:45:11 -05:00
|
|
|
|
(int32_t)(stackPtr - stack),
|
2013-08-02 16:12:24 -04:00
|
|
|
|
back,g_nodes[tid].m_nodeName,
|
|
|
|
|
//h,
|
|
|
|
|
ms);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// . if we were a back tag, we are done... but if we
|
|
|
|
|
// were a front tag, we must add ourselves below...
|
|
|
|
|
// . MDW: this seems more logical than the if-statement
|
|
|
|
|
// below...
|
|
|
|
|
if ( fullTid != tid ) continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ( tid == TAG_CENTER ) continue;
|
|
|
|
|
|
|
|
|
|
if ( tid == TAG_SPAN ) continue;
|
|
|
|
|
// gwair.org has font tags the pair up a date "1st Sundays"
|
|
|
|
|
// with the address above it, and it shouldn't do that!
|
|
|
|
|
if ( tid == TAG_FONT ) continue;
|
|
|
|
|
|
2013-10-06 13:04:10 -04:00
|
|
|
|
// try to realloc i guess. should keep ptrs in tact.
|
2016-09-08 11:44:05 -04:00
|
|
|
|
if ( m_numSections >= m_maxNumSections) {
|
|
|
|
|
g_errno = EDOCBADSECTIONS;
|
2013-10-06 13:04:10 -04:00
|
|
|
|
return true;
|
2016-09-08 11:44:05 -04:00
|
|
|
|
}
|
2016-03-01 11:32:15 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get the section
|
|
|
|
|
Section *sn = &m_sections[m_numSections];
|
2016-03-01 11:32:15 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// clear
|
|
|
|
|
memset ( sn , 0 , sizeof(Section) );
|
2016-03-01 11:32:15 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// inc it
|
|
|
|
|
m_numSections++;
|
2016-03-01 11:32:15 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// sanity check - breach check
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( m_numSections > max ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// set our parent
|
|
|
|
|
sn->m_parent = current;
|
2016-01-06 06:41:40 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// set this
|
|
|
|
|
current = sn;
|
|
|
|
|
|
|
|
|
|
// need to keep a word range that the section covers
|
|
|
|
|
sn->m_a = i;
|
2016-01-06 06:41:40 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// assume no terminating bookend
|
|
|
|
|
sn->m_b = -1;
|
|
|
|
|
|
|
|
|
|
// push a unique id on the stack so we can pop if we
|
|
|
|
|
// enter a subsection
|
|
|
|
|
stackPtr->m_tid = tid;
|
|
|
|
|
stackPtr->m_secNum = m_numSections - 1;
|
|
|
|
|
stackPtr->m_flags = 0;
|
|
|
|
|
stackPtr++;
|
|
|
|
|
|
|
|
|
|
// debug log
|
|
|
|
|
if ( ! g_conf.m_logDebugSections ) continue;
|
2016-01-06 06:41:40 -05:00
|
|
|
|
|
2016-05-20 03:18:32 -04:00
|
|
|
|
logf(LOG_DEBUG,"section: push tid=%" PRId32" "
|
|
|
|
|
"i=%" PRId32" "
|
|
|
|
|
"level=%" PRId32" "
|
2013-08-02 16:12:24 -04:00
|
|
|
|
"%s "
|
|
|
|
|
,
|
2014-11-10 17:45:11 -05:00
|
|
|
|
(int32_t)tid,
|
2013-08-02 16:12:24 -04:00
|
|
|
|
i,
|
2014-11-10 17:45:11 -05:00
|
|
|
|
(int32_t)(stackPtr - stack)-1,
|
|
|
|
|
g_nodes[(int32_t)tid].m_nodeName
|
2013-08-02 16:12:24 -04:00
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// if first word in a section false outside of the parent section
|
|
|
|
|
// then reparent to the grandparent. this can happen when we end
|
|
|
|
|
// up closing a parent section before ???????
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get it
|
|
|
|
|
Section *si = &m_sections[i];
|
|
|
|
|
// skip if we are still open-ended
|
|
|
|
|
if ( si->m_b < 0 ) continue;
|
|
|
|
|
// get parent
|
|
|
|
|
Section *sp = si->m_parent;
|
|
|
|
|
// skip if no parent
|
|
|
|
|
if ( ! sp ) continue;
|
|
|
|
|
// skip if parent still open ended
|
|
|
|
|
if ( sp->m_b < 0 ) continue;
|
|
|
|
|
// subloop it
|
|
|
|
|
doagain:
|
|
|
|
|
// skip if no parent
|
|
|
|
|
if ( ! sp ) continue;
|
|
|
|
|
// parent must start before us
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( sp->m_a > si->m_a ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// . does parent contain our first word?
|
|
|
|
|
// . it need not fully contain our last word!!!
|
|
|
|
|
if ( sp->m_a <= si->m_a && sp->m_b > si->m_a ) continue;
|
|
|
|
|
// if parent is open ended, then it is ok for now
|
|
|
|
|
if ( sp->m_a <= si->m_a && sp->m_b == -1 ) continue;
|
|
|
|
|
// get grandparent
|
|
|
|
|
sp = sp->m_parent;
|
|
|
|
|
// set
|
|
|
|
|
si->m_parent = sp;
|
|
|
|
|
// try again
|
|
|
|
|
goto doagain;
|
|
|
|
|
}
|
|
|
|
|
|
2017-09-18 12:56:46 -04:00
|
|
|
|
bool inGbFrame = false;
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t gbFrameNum = 0;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
2017-09-18 12:56:46 -04:00
|
|
|
|
bool inIFrame = false;
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
//
|
|
|
|
|
// . set Section::m_xmlNameHash for xml tags here
|
|
|
|
|
// . set Section::m_frameNum and SEC_IN_GBFRAME bit
|
|
|
|
|
//
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get it
|
|
|
|
|
Section *sn = &m_sections[i];
|
2017-09-18 12:56:46 -04:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get it
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t ws = sn->m_a;
|
2018-03-09 10:24:39 -05:00
|
|
|
|
const auto &token = (*m_tr)[ws];
|
2016-01-04 07:55:46 -05:00
|
|
|
|
// shortcut
|
2018-03-09 10:24:39 -05:00
|
|
|
|
nodeid_t tid = token.nodeid;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
2017-09-18 12:56:46 -04:00
|
|
|
|
if (tid == TAG_IFRAME) {
|
2018-03-22 12:19:06 -04:00
|
|
|
|
//if the section doesn't have the closing iframe tag then set inIFrame
|
|
|
|
|
bool hasClosingIframeTag = false;
|
|
|
|
|
for(int j=sn->m_b-1; j>i; j--) {
|
2018-03-22 12:36:21 -04:00
|
|
|
|
if((*m_tr)[j].nodeid == (TAG_IFRAME|BACKBIT)) {
|
2018-03-22 12:19:06 -04:00
|
|
|
|
hasClosingIframeTag = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if(!hasClosingIframeTag)
|
|
|
|
|
inIFrame = true;
|
2018-04-03 10:19:31 -04:00
|
|
|
|
else if(!inGbFrame)
|
|
|
|
|
sn->m_flags |= SEC_IN_IFRAME;
|
2018-03-22 12:19:06 -04:00
|
|
|
|
} else if (tid == (TAG_IFRAME | BACKBIT)) { //never happens how sentences are currently split
|
2017-09-18 12:56:46 -04:00
|
|
|
|
inIFrame = false;
|
|
|
|
|
} else if ( tid == TAG_GBFRAME ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// start or end?
|
|
|
|
|
gbFrameNum++;
|
2017-09-18 12:56:46 -04:00
|
|
|
|
inGbFrame = true;
|
|
|
|
|
} else if ( tid == (TAG_GBFRAME | BACKBIT) ) {
|
|
|
|
|
inGbFrame = false;
|
|
|
|
|
}
|
|
|
|
|
|
2018-03-22 12:19:06 -04:00
|
|
|
|
if (inIFrame && !inGbFrame)
|
2017-09-18 12:56:46 -04:00
|
|
|
|
sn->m_flags |= SEC_IN_IFRAME;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// mark it
|
2018-03-22 12:19:06 -04:00
|
|
|
|
if (inGbFrame)
|
2013-08-02 16:12:24 -04:00
|
|
|
|
sn->m_gbFrameNum = gbFrameNum;
|
|
|
|
|
|
|
|
|
|
// custom xml tag, hash the tag itself
|
|
|
|
|
if ( tid != TAG_XMLTAG ) continue;
|
|
|
|
|
// stop at first space to avoid fields!!
|
2018-03-09 10:24:39 -05:00
|
|
|
|
const char *p = token.token_start + 1;
|
|
|
|
|
const char *pend = p + token.token_len;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// skip back tags
|
|
|
|
|
if ( *p == '/' ) continue;
|
|
|
|
|
// reset hash
|
2014-10-30 15:36:39 -04:00
|
|
|
|
int64_t xh = 0;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// and hash char count
|
|
|
|
|
unsigned char cnt = 0;
|
|
|
|
|
// hash till space or / or >
|
|
|
|
|
for ( ; p < pend ; p++ ) {
|
|
|
|
|
// stop on space or / or >
|
|
|
|
|
if ( is_wspace_a(*p) ) break;
|
|
|
|
|
if ( *p == '/' ) break;
|
|
|
|
|
if ( *p == '>' ) break;
|
|
|
|
|
// hash it in
|
|
|
|
|
xh ^= g_hashtab[cnt++][(unsigned char )*p];
|
|
|
|
|
}
|
2015-07-08 10:15:30 -04:00
|
|
|
|
// if it is a string of the same chars it can be 0
|
|
|
|
|
if ( ! xh ) xh = 1;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// store that
|
2014-11-10 17:45:11 -05:00
|
|
|
|
sn->m_xmlNameHash = (int32_t)xh;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
2018-03-22 12:19:06 -04:00
|
|
|
|
|
|
|
|
|
//TODO: implement section m_flags inheritance correctly. Currently SEC_IN_IFRAME/SEC_HIDDEN/... are not inherited by child sections.
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// find any open ended tags and constrain them based on their parent
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get it
|
|
|
|
|
Section *si = &m_sections[i];
|
|
|
|
|
// get its parent
|
|
|
|
|
Section *ps = si->m_parent;
|
|
|
|
|
// if parent is open-ended panic!
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( ps && ps->m_b < 0 ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// if our parent got constrained from under us, we need
|
|
|
|
|
// to telescope to a new parent
|
|
|
|
|
for ( ; ps && ps->m_b >= 0 && ps->m_b <= si->m_a ; ) {
|
|
|
|
|
ps = ps->m_parent;
|
|
|
|
|
si->m_parent = ps;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// assume end is end of doc
|
2018-03-09 10:24:39 -05:00
|
|
|
|
int32_t end = m_tr->size();
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get end of parent
|
|
|
|
|
if ( ps ) end = ps->m_b;
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// shrink our section if parent ends before us OR if we
|
|
|
|
|
// are open ended
|
|
|
|
|
if ( si->m_b != -1 && si->m_b <= end ) continue;
|
|
|
|
|
// this might constrain someone's parent such that
|
2014-11-17 21:13:36 -05:00
|
|
|
|
// that someone no longer can use that parent!!
|
2013-08-02 16:12:24 -04:00
|
|
|
|
si->m_b = end;
|
|
|
|
|
// . get our tag type
|
2014-11-10 17:45:11 -05:00
|
|
|
|
// . use int32_t instead of nodeid_t so we can re-set this
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// to the xml tag hash if we need to
|
2018-03-09 10:24:39 -05:00
|
|
|
|
int32_t tid1 = (*m_tr)[si->m_a].nodeid;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// use the tag hash if this is an xml tag
|
|
|
|
|
if ( tid1 == TAG_XMLTAG ) {
|
|
|
|
|
// we computed this above
|
|
|
|
|
tid1 = si->m_xmlNameHash;
|
|
|
|
|
// skip if zero!
|
|
|
|
|
if ( ! tid1 ) continue;
|
|
|
|
|
}
|
|
|
|
|
// must be there to be open ended
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( ! tid1 ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// NOW, see if within that parent there is actually another
|
|
|
|
|
// tag after us of our same tag type, then use that to
|
|
|
|
|
// constrain us instead!!
|
|
|
|
|
// this hurts <p><table><tr><td><p>.... because it
|
|
|
|
|
// uses that 2nd <p> tag to constrain si->m_b of the first
|
|
|
|
|
// <p> tag which is not right! sunsetpromotions.com has that.
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t j = i + 1 ; j < m_numSections ; j++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get it
|
|
|
|
|
Section *sj = &m_sections[j];
|
|
|
|
|
// get word start
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t a = sj->m_a;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// skip if ties with us already
|
|
|
|
|
if ( a == si->m_a ) continue;
|
|
|
|
|
// stop if out
|
|
|
|
|
if ( a >= end ) break;
|
|
|
|
|
|
|
|
|
|
// . it must be in the same expanded frame src, if any
|
|
|
|
|
// . this fixes trulia.com which was ending our html
|
|
|
|
|
// tag, which was open-ended, with the html tag in
|
|
|
|
|
// a frame src expansion
|
|
|
|
|
if ( sj->m_gbFrameNum != si->m_gbFrameNum ) continue;
|
|
|
|
|
// fix sunsetpromotions.com bug. see above.
|
|
|
|
|
if ( sj->m_parent != si->m_parent ) continue;
|
|
|
|
|
// get its tid
|
2018-03-09 10:24:39 -05:00
|
|
|
|
int32_t tid2 = (*m_tr)[a].nodeid;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// use base hash if xml tag
|
|
|
|
|
if ( tid2 == TAG_XMLTAG )
|
|
|
|
|
tid2 = sj->m_xmlNameHash;
|
|
|
|
|
// must be our tag type!
|
|
|
|
|
if ( tid2 != tid1 ) continue;
|
|
|
|
|
// ok end us there instead!
|
|
|
|
|
si->m_b = a;
|
|
|
|
|
// stop
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// reparent again now that things are closed
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get it
|
|
|
|
|
Section *si = &m_sections[i];
|
|
|
|
|
// sanity check: no section may still be open-ended at this point
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( si->m_b < 0 ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get parent
|
|
|
|
|
Section *sp = si->m_parent;
|
|
|
|
|
// skip if null
|
|
|
|
|
if ( ! sp ) continue;
|
|
|
|
|
// sanity check: the parent must not be open-ended either
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( sp->m_b < 0 ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// subloop it
|
|
|
|
|
doagain2:
|
|
|
|
|
// skip if no parent
|
|
|
|
|
if ( ! sp ) continue;
|
|
|
|
|
// . does parent contain our first word?
|
|
|
|
|
// . it need not fully contain our last word!!!
|
|
|
|
|
if ( sp->m_a <= si->m_a && sp->m_b > si->m_a ) continue;
|
|
|
|
|
// if parent is open ended, then it is ok for now
|
|
|
|
|
if ( sp->m_a <= si->m_a && sp->m_b == -1 ) continue;
|
|
|
|
|
// an open-ended parent that does not start at or before us is a bug
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( sp->m_b == -1 ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get grandparent
|
|
|
|
|
sp = sp->m_parent;
|
|
|
|
|
// set
|
|
|
|
|
si->m_parent = sp;
|
|
|
|
|
// try again
|
|
|
|
|
goto doagain2;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//
|
|
|
|
|
//
|
|
|
|
|
// now assign m_sectionPtrs[] which map a word to the first
|
|
|
|
|
// section that contains it
|
|
|
|
|
//
|
|
|
|
|
//
|
|
|
|
|
Section *dstack[MAXTAGSTACK];
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t ns = 0;
|
|
|
|
|
int32_t j = 0;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
current = m_rootSection;//&m_sections[0];
|
|
|
|
|
Section *next = m_rootSection;//&m_sections[0];
|
|
|
|
|
// walk every word, tracking the innermost section that contains it
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// pop all off the stack that match us
|
|
|
|
|
for ( ; ns>0 && dstack[ns-1]->m_b == i ; ) {
|
|
|
|
|
ns--;
|
|
|
|
|
current = dstack[ns-1];
|
|
|
|
|
}
|
|
|
|
|
// push our current section onto the stack if i equals
|
|
|
|
|
// its first word #
|
|
|
|
|
for ( ; next && i == next->m_a ; ) {
|
|
|
|
|
dstack[ns++] = next;
|
|
|
|
|
// set our current section to this now
|
|
|
|
|
current = next;
|
|
|
|
|
// get next section for setting "next"
|
|
|
|
|
j++;
|
|
|
|
|
// if no more left, set "next" to NULL and stop loop
|
|
|
|
|
if ( j >= m_numSections ) { next=NULL; break; }
|
|
|
|
|
// grab it
|
|
|
|
|
next = &m_sections[j];
|
|
|
|
|
}
|
|
|
|
|
// assign
|
|
|
|
|
m_sectionPtrs[i] = current;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// . addImpliedSections() requires Section::m_baseHash
|
|
|
|
|
// . set Section::m_baseHash
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// these have to be in order of sn->m_a to work right
|
|
|
|
|
// because we rely on the parent tag hash, which would not
|
2016-03-02 05:30:05 -05:00
|
|
|
|
// necessarily be set if we were not sorted, because the
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// parent section could have SEC_FAKE flag set because it is
|
|
|
|
|
// a br section added afterwards.
|
|
|
|
|
Section *sn = &m_sections[i];
|
|
|
|
|
// get word start into "ws"
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t ws = sn->m_a;
|
2018-03-09 10:24:39 -05:00
|
|
|
|
const auto &token = (*m_tr)[ws];
|
2016-01-04 07:55:46 -05:00
|
|
|
|
// shortcut
|
2018-03-09 10:24:39 -05:00
|
|
|
|
nodeid_t tid = token.nodeid;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// sanity check, <a> guys are not sections
|
|
|
|
|
//if ( tid == TAG_A &&
|
2016-06-20 06:30:26 -04:00
|
|
|
|
// !(sn->m_flags & SEC_SENTENCE) ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// use a modified tid as the tag hash?
|
2014-10-30 15:36:39 -04:00
|
|
|
|
int64_t mtid = tid;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// custom xml tag, hash the tag itself
|
|
|
|
|
if ( tid == TAG_XMLTAG )
|
2018-03-09 10:24:39 -05:00
|
|
|
|
mtid = hash32 ( token.token_start,token.token_len );
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// an unknown tag like <!! ...->
|
|
|
|
|
if ( tid == 0 )
|
|
|
|
|
mtid = 1;
|
|
|
|
|
// . if we are a div tag, mod it
|
|
|
|
|
// . treat the fields in the div tag as
|
|
|
|
|
// part of the tag hash.
|
|
|
|
|
// . helps Events.cpp be more precise about
|
|
|
|
|
// section identification!!!!
|
|
|
|
|
// . we now do this for TD and TR so Nov 2009 can telescope for
|
|
|
|
|
// http://10.5.1.203:8000/test/doc.17096238520293298312.html
|
|
|
|
|
// so the calendar title "Nov 2009" can affect all dates
|
|
|
|
|
// below the calendar.
|
|
|
|
|
if ( tid == TAG_DIV ||
|
|
|
|
|
tid == TAG_TD ||
|
|
|
|
|
tid == TAG_TR ||
|
|
|
|
|
tid == TAG_LI || // newmexico.org urls class=xxx
|
|
|
|
|
tid == TAG_UL || // newmexico.org urls class=xxx
|
|
|
|
|
tid == TAG_P || // <p class="pstrg"> stjohnscollege.edu
|
|
|
|
|
tid == TAG_SPAN ) {
|
|
|
|
|
// get ptr
|
2018-03-09 10:24:39 -05:00
|
|
|
|
const char *p = token.token_start;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// skip <
|
|
|
|
|
p++;
|
|
|
|
|
// skip following alnums, that is the tag name
|
|
|
|
|
for ( ; is_alnum_a(*p) ; p++ );
|
|
|
|
|
// scan for "id" or "class" in it
|
|
|
|
|
// . i had to increase this because we were missing
|
|
|
|
|
// some stuff causing us to get the wrong implied
|
|
|
|
|
// sections for
|
|
|
|
|
// www.guysndollsllc.com/page5/page4/page4.html
|
|
|
|
|
// causing "The Remains" to be paired up with
|
|
|
|
|
// "Aug 7, 2010" in an implied section which was
|
|
|
|
|
// just wrong. it was 20, i made it 100...
|
2018-03-09 10:24:39 -05:00
|
|
|
|
const char *pend = p + 100;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// position ptr
|
|
|
|
|
unsigned char cnt = 0;
|
|
|
|
|
// a flag
|
|
|
|
|
bool skipTillSpace = false;
|
|
|
|
|
// . just hash every freakin char i guess
|
|
|
|
|
// . TODO: maybe don't hash "width" for <td><tr>
|
|
|
|
|
for ( ; *p && *p !='>' && p < pend ; p++ ) {
|
|
|
|
|
// skip bgcolor= tags because panjea.org
|
|
|
|
|
// interlaces different colored <tr>s in the
|
|
|
|
|
// table and i want them to be seen as brother
|
|
|
|
|
// sections, mostly for the benefit of the
|
|
|
|
|
// setting of lastBrother1/2 in Events.cpp
|
|
|
|
|
if ( is_wspace_a(p[0]) &&
|
|
|
|
|
to_lower_a (p[1])=='b' &&
|
|
|
|
|
to_lower_a (p[2])=='g' ) {
|
|
|
|
|
skipTillSpace = true;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// if not a space continue
|
|
|
|
|
if ( skipTillSpace ) {
|
|
|
|
|
if ( ! is_wspace_a(*p) ) continue;
|
|
|
|
|
skipTillSpace = false;
|
|
|
|
|
}
|
|
|
|
|
// do not hash until we get a space
|
|
|
|
|
if ( skipTillSpace ) continue;
|
|
|
|
|
// skip if not alnum
|
|
|
|
|
if ( !is_alnum_a(*p)) continue;
|
|
|
|
|
// hash it in
|
|
|
|
|
mtid ^= g_hashtab[cnt++][(unsigned char)*p];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// should not have either of these yet!
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( sn->m_flags & SEC_FAKE ) { g_process.shutdownAbort(true); }
|
|
|
|
|
if ( sn->m_flags & SEC_SENTENCE ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// sanity check
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( mtid == 0 ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// . set the base hash, usually just tid
|
|
|
|
|
// . usually base hash is zero but if it is a br tag
|
|
|
|
|
// we set it to something special to indicate the number
|
|
|
|
|
// of br tags in the sequence
|
|
|
|
|
sn->m_baseHash ^= mtid;
|
|
|
|
|
// fix this
|
|
|
|
|
if ( sn == rootSection ) sn->m_baseHash = 1;
|
|
|
|
|
// fix root section i guess
|
|
|
|
|
if ( sn->m_baseHash == 0 ) {
|
|
|
|
|
// fix core on gk21
|
|
|
|
|
sn->m_baseHash = 2;
|
|
|
|
|
}
|
2014-12-11 21:24:28 -05:00
|
|
|
|
// set this now too WHY? should already be set!!! was
|
|
|
|
|
// causing the root section to become a title section
|
|
|
|
|
// because first word was "<title>". then every word in
|
|
|
|
|
// the doc got SEC_IN_TITLE set and did not get hashed
|
|
|
|
|
// in XmlDoc::hashBody()... NOR in XmlDoc::hashTitle()!!!
|
2016-02-24 07:37:41 -05:00
|
|
|
|
if ( sn != rootSection )
|
2014-12-11 22:54:33 -05:00
|
|
|
|
sn->m_tagId = tid;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// set up our linked list, the functions below will insert sections
|
|
|
|
|
// and modify this linked list
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// set it
|
|
|
|
|
if ( i + 1 < m_numSections )
|
|
|
|
|
m_sections[i].m_next = &m_sections[i+1];
|
|
|
|
|
if ( i - 1 >= 0 )
|
|
|
|
|
m_sections[i].m_prev = &m_sections[i-1];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// init to -1 to indicate none
|
|
|
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
|
|
|
// reset it
|
|
|
|
|
si->m_firstWordPos = -1;
|
|
|
|
|
si->m_lastWordPos = -1;
|
|
|
|
|
si->m_senta = -1;
|
|
|
|
|
si->m_sentb = -1;
|
|
|
|
|
}
|
|
|
|
|
// now set position of first word each section contains
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// skip if not alnum word
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( ! (*m_tr)[i].is_alfanum ) continue;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get smallest section containing
|
|
|
|
|
Section *si = m_sectionPtrs[i];
|
|
|
|
|
// do each parent as well
|
|
|
|
|
for ( ; si ; si = si->m_parent ) {
|
|
|
|
|
// skip if already had one!
|
|
|
|
|
if ( si->m_firstWordPos >= 0 ) break;
|
|
|
|
|
// otherwise, we are it
|
|
|
|
|
si->m_firstWordPos = i;
|
|
|
|
|
// . set format hash of it
|
|
|
|
|
// . do it manually since tagHash not set yet
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// and last word position
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t i = m_nw - 1 ; i > 0 ; i-- ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// skip if not alnum word
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( ! (*m_tr)[i].is_alfanum ) continue;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get smallest section containing
|
|
|
|
|
Section *si = m_sectionPtrs[i];
|
|
|
|
|
// do each parent as well
|
|
|
|
|
for ( ; si ; si = si->m_parent ) {
|
|
|
|
|
// skip if already had one!
|
|
|
|
|
if ( si->m_lastWordPos >= 0 ) break;
|
|
|
|
|
// otherwise, we are it
|
|
|
|
|
si->m_lastWordPos = i;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sec_t inFlag = 0;
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t istack[1000];
|
2013-08-02 16:12:24 -04:00
|
|
|
|
sec_t iflags[1000];
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t ni = 0;
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
//
|
|
|
|
|
// now set the inFlags here because the tags might not have all
|
|
|
|
|
// been closed, making tags like SEC_STYLE overflow from where
|
|
|
|
|
// they should be...
|
|
|
|
|
//
|
|
|
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
|
|
|
// did we exceed a tag boundary?
|
|
|
|
|
for ( ; ni>0 && si->m_a >= istack[ni-1] ; ) {
|
|
|
|
|
// undo flag
|
|
|
|
|
inFlag &= ~iflags[ni-1];
|
|
|
|
|
// pop off
|
|
|
|
|
ni--;
|
|
|
|
|
}
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get the flag if any into mf
|
|
|
|
|
sec_t mf = 0;
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// skip if not special tag id
|
|
|
|
|
nodeid_t tid = si->m_tagId;
|
|
|
|
|
if ( tid == TAG_SCRIPT ) mf = SEC_SCRIPT;
|
|
|
|
|
else if ( tid == TAG_NOSCRIPT) mf = SEC_NOSCRIPT;
|
|
|
|
|
else if ( tid == TAG_STYLE ) mf = SEC_STYLE;
|
|
|
|
|
else if ( tid == TAG_SELECT ) mf = SEC_SELECT;
|
|
|
|
|
else if ( tid == TAG_H1 ) mf = SEC_IN_HEADER;
|
|
|
|
|
else if ( tid == TAG_H2 ) mf = SEC_IN_HEADER;
|
|
|
|
|
else if ( tid == TAG_H3 ) mf = SEC_IN_HEADER;
|
|
|
|
|
else if ( tid == TAG_H4 ) mf = SEC_IN_HEADER;
|
|
|
|
|
else if ( tid == TAG_TITLE ) mf = SEC_IN_TITLE;
|
2016-06-09 11:06:29 -04:00
|
|
|
|
else if ( tid == TAG_HEAD ) mf = SEC_IN_HEAD;
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// accumulate
|
|
|
|
|
inFlag |= mf;
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// add in the flags
|
|
|
|
|
si->m_flags |= inFlag;
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// skip if nothing special
|
|
|
|
|
if ( ! mf ) continue;
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// sanity
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( ni >= 1000 ) { g_process.shutdownAbort(true); }
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// otherwise, store on stack
|
|
|
|
|
istack[ni] = si->m_b;
|
|
|
|
|
iflags[ni] = mf;
|
|
|
|
|
ni++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// . now we insert sentence sections
|
|
|
|
|
// . find the smallest section containing the first and last
|
|
|
|
|
// word of each sentence and inserts a subsection into that
|
|
|
|
|
// . we have to be careful to reparent, etc.
|
|
|
|
|
// . kinda copy splitSections() function
|
|
|
|
|
// . maybe add an "insertSection()" function???
|
|
|
|
|
if ( m_contentType != CT_JS ) {
|
|
|
|
|
// add sentence sections
|
|
|
|
|
if ( ! addSentenceSections() ) return true;
|
|
|
|
|
// this is needed by setSentFlags()
|
|
|
|
|
setNextSentPtrs();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// . set m_nextBrother
|
|
|
|
|
// . we call this now to aid in setHeadingBit() and for adding the
|
|
|
|
|
// implied sections, but it is ultimately
|
|
|
|
|
// called a second time once all the new sections are inserted
|
2016-02-24 10:12:05 -05:00
|
|
|
|
setNextBrotherPtrs ( false );
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// . set SEC_HEADING bit
|
|
|
|
|
// . need this before implied sections
|
|
|
|
|
setHeadingBit ();
|
|
|
|
|
|
|
|
|
|
setTagHashes();
|
|
|
|
|
|
|
|
|
|
//
|
|
|
|
|
//
|
|
|
|
|
// TODO TODO
|
|
|
|
|
//
|
|
|
|
|
// TAKE OUT THESE SANITY CHECKS TO SPEED UP!!!!!!
|
|
|
|
|
//
|
|
|
|
|
//
|
|
|
|
|
|
|
|
|
|
// clear this
|
|
|
|
|
bool isHidden = false;
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t startHide = 0x7fffffff;
|
|
|
|
|
int32_t endHide = 0 ;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// now that we have closed any open tag, set the SEC_HIDDEN bit
|
|
|
|
|
// for all sections that are like <div style=display:none>
|
|
|
|
|
for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) {
|
|
|
|
|
// set m_lastSection so we can scan backwards
|
|
|
|
|
m_lastSection = sn;
|
2016-02-25 16:04:00 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// set this
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t wn = sn->m_a;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// stop hiding it?
|
|
|
|
|
if ( isHidden ) {
|
|
|
|
|
// turn it off if not contained
|
|
|
|
|
if ( wn >= endHide ) isHidden = false;
|
|
|
|
|
else sn->m_flags |= SEC_HIDDEN;
|
|
|
|
|
}
|
|
|
|
|
// get tag id
|
2016-03-01 11:32:15 -05:00
|
|
|
|
nodeid_t tid = sn->m_tagId;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// only div, td, tr, ul and span tags are checked for a style attribute
|
|
|
|
|
if ( tid!=TAG_DIV &&
|
|
|
|
|
tid!=TAG_TD &&
|
|
|
|
|
tid!=TAG_TR &&
|
|
|
|
|
tid!=TAG_UL &&
|
|
|
|
|
tid!=TAG_SPAN) continue;
|
|
|
|
|
|
|
|
|
|
// . if we are a div tag, mod it
|
|
|
|
|
// . treat the fields in the div tag as
|
|
|
|
|
// part of the tag hash.
|
|
|
|
|
// . helps Events.cpp be more precise about
|
|
|
|
|
// section identification!!!!
|
|
|
|
|
// . we now do this for TD and TR so Nov 2009 can telescope for
|
|
|
|
|
// http://10.5.1.203:8000/test/doc.17096238520293298312.html
|
|
|
|
|
// so the calendar title "Nov 2009" can affect all dates
|
|
|
|
|
// below the calendar.
|
|
|
|
|
|
|
|
|
|
// get the style tag in there and check it for "display: none"!
|
2018-03-09 10:24:39 -05:00
|
|
|
|
int32_t slen = (*m_tr)[wn].token_len;
|
|
|
|
|
const char *s = (*m_tr)[wn].token_start;
|
2018-02-09 11:52:47 -05:00
|
|
|
|
const char *send = s + slen;
|
2016-03-01 11:32:15 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// check out any div tag that has a style
|
2018-02-09 11:52:47 -05:00
|
|
|
|
const char *style = gb_strncasestr(s,slen,"style=") ;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
if ( ! style ) continue;
|
2016-03-01 11:32:15 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// . check for hidden
|
|
|
|
|
// . if no hidden tag assume it is UNhidden
|
|
|
|
|
// . TODO: later push & pop on stack
|
2018-02-09 11:52:47 -05:00
|
|
|
|
const char *ds = gb_strncasestr(style,send-style,"display:");
|
2013-08-02 16:12:24 -04:00
|
|
|
|
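// if "display:" or "none" is not found, turn off SEC_HIDDEN
// NOTE(review): "none" is searched for in the whole tag, not just after "display:" - confirm this is intended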
|
|
|
|
|
if ( ! ds || ! gb_strncasestr(s,slen,"none") ) {
|
|
|
|
|
// turn off the hiding
|
|
|
|
|
isHidden = false;
|
|
|
|
|
// off in us too
|
|
|
|
|
sn->m_flags &= ~SEC_HIDDEN;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
// mark all sections in this with the tag
|
|
|
|
|
isHidden = true;
|
|
|
|
|
// on in us
|
|
|
|
|
sn->m_flags |= SEC_HIDDEN;
|
|
|
|
|
// stop it after this word for sure
|
|
|
|
|
if ( sn->m_b > endHide ) endHide = sn->m_b;
|
|
|
|
|
if ( sn->m_a < startHide ) startHide = sn->m_a;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// now set the content hash of each section
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// must be an alnum word
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( ! (*m_tr)[i].is_alfanum ) continue;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get its section
|
2018-03-09 10:24:39 -05:00
|
|
|
|
m_sectionPtrs[i]->m_contentHash64 ^= (*m_tr)[i].token_hash;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// fix "smooth smooth!"
|
|
|
|
|
if ( m_sectionPtrs[i]->m_contentHash64 == 0 )
|
|
|
|
|
m_sectionPtrs[i]->m_contentHash64 = 123456;
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// now set SEC_NOTEXT flag if content hash is zero!
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get it
|
|
|
|
|
Section *sn = &m_sections[i];
|
|
|
|
|
// skip if had text
|
|
|
|
|
if ( sn->m_contentHash64 ) continue;
|
|
|
|
|
// no text!
|
|
|
|
|
sn->m_flags |= SEC_NOTEXT;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//
|
|
|
|
|
// set Section::m_alnumPosA/m_alnumPosB
|
|
|
|
|
//
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t alnumCount2 = 0;
|
|
|
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get it
|
|
|
|
|
Section *sn = &m_sections[i];
|
|
|
|
|
// skip if not a sentence section
|
|
|
|
|
if ( ! ( sn->m_flags & SEC_SENTENCE ) ) continue;
|
|
|
|
|
// save this
|
|
|
|
|
sn->m_alnumPosA = alnumCount2;
|
|
|
|
|
// scan the wids of the whole sentence, which may not
|
|
|
|
|
// be completely contained in the "sn" section!!
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t a = sn->m_senta;
|
|
|
|
|
int32_t b = sn->m_sentb;
|
|
|
|
|
for ( int32_t j = a ; j < b ; j++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// must be an alnum word
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( ! (*m_tr)[j].is_alfanum ) continue;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// alnumcount
|
|
|
|
|
alnumCount2++;
|
|
|
|
|
}
|
|
|
|
|
// so we contain the range [a,b), typical half-open interval
|
|
|
|
|
sn->m_alnumPosB = alnumCount2;
|
|
|
|
|
// sanity check
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( sn->m_alnumPosA == sn->m_alnumPosB ){g_process.shutdownAbort(true);}
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// propagate through parents
|
|
|
|
|
Section *si = sn->m_parent;
|
|
|
|
|
// do each parent as well
|
|
|
|
|
for ( ; si ; si = si->m_parent ) {
|
|
|
|
|
// skip if already had one!
|
|
|
|
|
if ( si->m_alnumPosA > 0 ) break;
|
|
|
|
|
// otherwise, we are it
|
|
|
|
|
si->m_alnumPosA = sn->m_alnumPosA;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
// propagate up alnumPosB now
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get it
|
|
|
|
|
Section *sn = &m_sections[i];
|
|
|
|
|
// skip if not a sentence section
|
|
|
|
|
if ( ! ( sn->m_flags & SEC_SENTENCE ) ) continue;
|
|
|
|
|
// propagate through parents
|
|
|
|
|
Section *si = sn->m_parent;
|
|
|
|
|
// do each parent as well
|
|
|
|
|
for ( ; si ; si = si->m_parent ) {
|
|
|
|
|
// skip if already had one! no, because we need to
|
|
|
|
|
// get the MAX of all of our kids!!
|
|
|
|
|
//if ( si->m_alnumPosB > 0 ) break;
|
|
|
|
|
// otherwise, we are it
|
|
|
|
|
si->m_alnumPosB = sn->m_alnumPosB;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
///////////////////////////////////////
|
|
|
|
|
//
|
|
|
|
|
// now set Section::m_listContainer
|
|
|
|
|
//
|
|
|
|
|
// . a containing section is a section containing
|
|
|
|
|
// MULTIPLE smaller sections
|
|
|
|
|
// . so if a section has a containing section set its m_listContainer
|
|
|
|
|
// to that containing section
|
|
|
|
|
// . we limit this to sections that directly contain text for now
|
|
|
|
|
// . Events.cpp::getRegistrationTable() uses m_nextBrother so we
|
|
|
|
|
// need this now!!
|
|
|
|
|
//
|
|
|
|
|
///////////////////////////////////////
|
2016-02-24 06:52:26 -05:00
|
|
|
|
setNextBrotherPtrs ( true );
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
///////////////////////////////////////
|
|
|
|
|
//
|
|
|
|
|
// now set SEC_MENU and SEC_LINK_TEXT flags
|
|
|
|
|
//
|
|
|
|
|
///////////////////////////////////////
|
|
|
|
|
setMenus();
|
|
|
|
|
|
|
|
|
|
//verifySections();
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// . PROBLEM: because we ignore non-breaking tags we often get sections
|
|
|
|
|
// that are really not sentences, but we are forced into them because
|
|
|
|
|
// we cannot split span or bold tags
|
|
|
|
|
// i.e. "<div>This is <b>a sentence. And this</b> is a sentence.</div>"
|
|
|
|
|
// forces us to treat the entire div tag as a sentence section.
|
|
|
|
|
// . i did add some logic to ignore those (the two for-k loops below) but then
|
|
|
|
|
// Address.cpp cores because it expects every alnum word to be in a sentence
|
|
|
|
|
// . now make sure to shrink into our current parent if we would not lose
|
|
|
|
|
// alnum chars!! fixes sentence flip flopping
|
|
|
|
|
// . returns false and sets g_errno on error
|
|
|
|
|
bool Sections::addSentenceSections ( ) {
|
2016-02-24 05:21:06 -05:00
|
|
|
|
sec_t badFlags = SEC_STYLE | SEC_SCRIPT | SEC_SELECT | SEC_HIDDEN | SEC_NOSCRIPT;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
2016-01-04 07:55:46 -05:00
|
|
|
|
// shortcut
|
2013-08-02 16:12:24 -04:00
|
|
|
|
Section **sp = m_sectionPtrs;
|
|
|
|
|
|
|
|
|
|
static bool s_init = false;
|
2014-10-30 15:36:39 -04:00
|
|
|
|
static int64_t h_in;
|
|
|
|
|
static int64_t h_at;
|
|
|
|
|
static int64_t h_for;
|
|
|
|
|
static int64_t h_to;
|
|
|
|
|
static int64_t h_on;
|
|
|
|
|
static int64_t h_under;
|
|
|
|
|
static int64_t h_with;
|
2014-11-17 21:13:36 -05:00
|
|
|
|
static int64_t h_along;
|
2014-10-30 15:36:39 -04:00
|
|
|
|
static int64_t h_from;
|
|
|
|
|
static int64_t h_by;
|
|
|
|
|
static int64_t h_of;
|
|
|
|
|
static int64_t h_some;
|
|
|
|
|
static int64_t h_the;
|
|
|
|
|
static int64_t h_and;
|
|
|
|
|
static int64_t h_a;
|
|
|
|
|
static int64_t h_http;
|
|
|
|
|
static int64_t h_https;
|
|
|
|
|
static int64_t h_room;
|
|
|
|
|
static int64_t h_rm;
|
|
|
|
|
static int64_t h_bldg;
|
|
|
|
|
static int64_t h_building;
|
|
|
|
|
static int64_t h_suite;
|
|
|
|
|
static int64_t h_ste;
|
|
|
|
|
static int64_t h_tags;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
if ( ! s_init ) {
|
|
|
|
|
s_init = true;
|
|
|
|
|
h_tags = hash64n("tags");
|
|
|
|
|
h_in = hash64n("in");
|
|
|
|
|
h_the = hash64n("the");
|
|
|
|
|
h_and = hash64n("and");
|
|
|
|
|
h_a = hash64n("a");
|
|
|
|
|
h_at = hash64n("at");
|
|
|
|
|
h_for = hash64n("for");
|
|
|
|
|
h_to = hash64n("to");
|
|
|
|
|
h_on = hash64n("on");
|
|
|
|
|
h_under = hash64n("under");
|
|
|
|
|
h_with = hash64n("with");
|
2014-11-17 21:13:36 -05:00
|
|
|
|
h_along = hash64n("along");
|
2013-08-02 16:12:24 -04:00
|
|
|
|
h_from = hash64n("from");
|
|
|
|
|
h_by = hash64n("by");
|
|
|
|
|
h_of = hash64n("of");
|
|
|
|
|
h_some = hash64n("some");
|
|
|
|
|
h_http = hash64n("http");
|
|
|
|
|
h_https = hash64n("https");
|
|
|
|
|
h_room = hash64n("room");
|
|
|
|
|
h_rm = hash64n("rm");
|
|
|
|
|
h_bldg = hash64n("bldg");
|
|
|
|
|
h_building = hash64n("building");
|
|
|
|
|
h_suite = hash64n("suite");
|
|
|
|
|
h_ste = hash64n("ste");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// need D_IS_IN_URL bits to be valid
|
2016-09-22 19:15:54 -04:00
|
|
|
|
m_bits->setInUrlBits ( );
|
2016-01-06 06:41:40 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// is the abbr. a noun? like "appt."
|
|
|
|
|
bool hasWordAfter = false;
|
|
|
|
|
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// need a wid
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( ! (*m_tr)[i].is_alfanum ) continue;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get section we are currently in
|
|
|
|
|
Section *cs = m_sectionPtrs[i];
|
|
|
|
|
// skip if its bad! i.e. style or script or whatever
|
|
|
|
|
if ( cs->m_flags & badFlags ) continue;
|
|
|
|
|
// set that
|
2018-03-09 10:24:39 -05:00
|
|
|
|
int64_t prevWid = (*m_tr)[i].token_hash;
|
2014-10-30 15:36:39 -04:00
|
|
|
|
int64_t prevPrevWid = 0LL;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// flag
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t lastWidPos = i;//-1;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
bool lastWasComma = false;
|
|
|
|
|
nodeid_t includedTag = -2;
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t lastbr = -1;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
bool endOnBr = false;
|
|
|
|
|
bool endOnBold = false;
|
|
|
|
|
bool capped = true;
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t upper = 0;
|
|
|
|
|
int32_t numAlnums = 0;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// scan for sentence end
|
2018-03-09 10:24:39 -05:00
|
|
|
|
int32_t j;
|
|
|
|
|
for ( j = i ; j < m_nw ; j++ ) {
|
|
|
|
|
const auto &token2 = (*m_tr)[j];
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// skip words
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( token2.is_alfanum ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// prev prev
|
|
|
|
|
prevPrevWid = prevWid;
|
|
|
|
|
// assume not a word like "vs."
|
|
|
|
|
hasWordAfter = false;
|
|
|
|
|
// set prev
|
2018-03-09 10:24:39 -05:00
|
|
|
|
prevWid = token2.token_hash;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
lastWidPos = j;
|
|
|
|
|
lastWasComma = false;
|
|
|
|
|
endOnBr = false;
|
|
|
|
|
endOnBold = false;
|
|
|
|
|
numAlnums++;
|
|
|
|
|
// skip if stop word and need not be
|
|
|
|
|
// capitalized
|
2018-09-04 08:50:38 -04:00
|
|
|
|
if ( m_bits->queryBits(j) & D_IS_STOPWORD ) continue;
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( token2.token_len <= 1 ) continue;
|
|
|
|
|
if ( is_digit(token2.token_start[0]) ) continue;
|
|
|
|
|
if ( !is_upper_utf8(token2.token_start)) capped=false;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
else upper++;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
// tag?
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( token2.nodeid ) {
|
2016-01-04 07:55:46 -05:00
|
|
|
|
// shortcut
|
2018-03-09 10:24:39 -05:00
|
|
|
|
nodeid_t tid = token2.nodeid & BACKBITCOMP;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// treat nobr as breaking to fix ceder.net
|
|
|
|
|
// which has it after the group title
|
|
|
|
|
if ( tid == TAG_NOBR ) break;
|
|
|
|
|
|
|
|
|
|
if ( tid == TAG_BR ) endOnBr = true;
|
|
|
|
|
if ( tid == TAG_B ) endOnBold = true;
|
|
|
|
|
|
|
|
|
|
// a </b><br> is usually like a header
|
|
|
|
|
if ( capped && upper && endOnBr && endOnBold )
|
|
|
|
|
break;
|
|
|
|
|
// if it is <span style="display:none"> or
|
|
|
|
|
// div or whatever, that is breaking!
|
|
|
|
|
// fixes http://chuckprophet.com/gigs/
|
|
|
|
|
if ( (tid == TAG_DIV ||
|
|
|
|
|
tid == TAG_SPAN ) &&
|
2018-03-09 10:24:39 -05:00
|
|
|
|
token2.token_len > 14 &&
|
|
|
|
|
strncasestr(token2.token_start,"display:none",
|
|
|
|
|
token2.token_len) )
|
2013-08-02 16:12:24 -04:00
|
|
|
|
break;
|
|
|
|
|
// ok, treat span as non-breaking for a second
|
|
|
|
|
if ( tid == TAG_SPAN ) continue;
|
|
|
|
|
// mark this
|
|
|
|
|
if ( tid == TAG_BR ) lastbr = j;
|
|
|
|
|
//
|
|
|
|
|
// certain tags like span and br sometimes
|
|
|
|
|
// do and sometimes do not break a sentence.
|
|
|
|
|
// so by default assume they do, but check
|
|
|
|
|
// for certain indicators...
|
|
|
|
|
//
|
|
|
|
|
if ( tid == TAG_SPAN ||
|
|
|
|
|
tid == TAG_BR ||
|
|
|
|
|
// fixes guysndollsllc.com:
|
|
|
|
|
// causes core dump:
|
|
|
|
|
tid == TAG_P || // villr.com
|
|
|
|
|
// fixes americantowns.com
|
|
|
|
|
tid == TAG_DIV ) {
|
|
|
|
|
// if nothing after, moot point
|
|
|
|
|
if ( j+1 >= m_nw ) break;
|
|
|
|
|
// if we already included this tag
|
|
|
|
|
// then keep including it. but some
|
|
|
|
|
// span tags will break and some won't
|
|
|
|
|
// even when in or around the same
|
|
|
|
|
// sentence. see that local.yahoo.com
|
|
|
|
|
// food delivery services url for
|
|
|
|
|
// the first street address,
|
|
|
|
|
// 5013 Miramar
|
|
|
|
|
if ( includedTag == tid &&
|
2018-03-09 10:24:39 -05:00
|
|
|
|
(token2.nodeid & BACKBIT) ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// reset it in case next
|
|
|
|
|
// <span> tag is not connective
|
|
|
|
|
includedTag = -2;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
// if we included this tag type
|
|
|
|
|
// as a front tag, then include its
|
|
|
|
|
// back tag in sentence as well.
|
|
|
|
|
// fixes nonamejustfriends.com
|
|
|
|
|
// which has a span tag in sentence:
|
|
|
|
|
// ".. Club holds a <span>FREE</span>
|
|
|
|
|
// Cruise Night..." and we allow
|
|
|
|
|
// "<span>" because it follows "a",
|
|
|
|
|
// but we were breaking on </span>!
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( !(token2.nodeid&BACKBIT))
|
2013-08-02 16:12:24 -04:00
|
|
|
|
includedTag = tid;
|
|
|
|
|
// if prev punct was comma and not
|
|
|
|
|
// an alnum word
|
|
|
|
|
if ( lastWasComma ) continue;
|
|
|
|
|
// get punct words bookcasing this tag
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( ! (*m_tr)[j+1].is_alfanum &&
|
|
|
|
|
! (*m_tr)[j+1].nodeid &&
|
|
|
|
|
has_char((*m_tr)[j+1].token_start,(*m_tr)[j+1].token_end(),',') )
|
2013-08-02 16:12:24 -04:00
|
|
|
|
continue;
|
|
|
|
|
// if prevwid is like "vs." then
|
|
|
|
|
// that means keep going even if
|
|
|
|
|
// we hit one of these tags. fixes
|
|
|
|
|
// "new york knicks vs.<br>orlando
|
|
|
|
|
// magic"
|
|
|
|
|
if ( hasWordAfter )
|
|
|
|
|
continue;
|
|
|
|
|
// if first alnum word after tag
|
|
|
|
|
// is lower case, that is good too
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t aw = j + 1;
|
|
|
|
|
int32_t maxaw = j + 12;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
if ( maxaw > m_nw ) maxaw = m_nw;
|
|
|
|
|
for ( ; aw < maxaw ; aw++ )
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( (*m_tr)[aw].is_alfanum ) break;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
bool isLower = false;
|
|
|
|
|
if ( aw < maxaw &&
|
2018-03-09 10:24:39 -05:00
|
|
|
|
is_lower_utf8((*m_tr)[aw].token_start) )
|
2013-08-02 16:12:24 -04:00
|
|
|
|
isLower = true;
|
|
|
|
|
|
|
|
|
|
// http or https is not to be
|
|
|
|
|
// considered as such! fixes
|
|
|
|
|
// webnetdesign.com from getting
|
|
|
|
|
// sentences continued by an http://
|
|
|
|
|
// url below them.
|
|
|
|
|
if ( aw < maxaw &&
|
2018-03-09 10:24:39 -05:00
|
|
|
|
((*m_tr)[aw].token_hash == h_http ||
|
|
|
|
|
(*m_tr)[aw].token_hash == h_https) )
|
2013-08-02 16:12:24 -04:00
|
|
|
|
isLower = false;
|
|
|
|
|
|
|
|
|
|
if ( tid == TAG_P &&
|
|
|
|
|
isLower &&
|
2014-11-17 21:13:36 -05:00
|
|
|
|
// Oscar G<p>along with xxxx
|
2018-03-09 10:24:39 -05:00
|
|
|
|
(*m_tr)[aw].token_hash != h_along &&
|
|
|
|
|
(*m_tr)[aw].token_hash != h_with )
|
2013-08-02 16:12:24 -04:00
|
|
|
|
isLower = false;
|
|
|
|
|
|
|
|
|
|
if ( isLower ) continue;
|
|
|
|
|
// get pre word, prepositional
|
|
|
|
|
// phrase starter?
|
|
|
|
|
if ( prevWid == h_in ||
|
|
|
|
|
prevWid == h_the ||
|
|
|
|
|
prevWid == h_and ||
|
|
|
|
|
// fix for ending on "(Room A)"
|
|
|
|
|
(prevWid == h_a &&
|
|
|
|
|
prevPrevWid != h_rm &&
|
|
|
|
|
prevPrevWid != h_room &&
|
|
|
|
|
prevPrevWid != h_bldg &&
|
|
|
|
|
prevPrevWid != h_building &&
|
|
|
|
|
prevPrevWid != h_suite &&
|
|
|
|
|
prevPrevWid != h_ste ) ||
|
|
|
|
|
prevWid == h_for ||
|
|
|
|
|
prevWid == h_to ||
|
|
|
|
|
prevWid == h_on ||
|
|
|
|
|
prevWid == h_under ||
|
|
|
|
|
prevWid == h_with ||
|
|
|
|
|
prevWid == h_from ||
|
|
|
|
|
prevWid == h_by ||
|
|
|
|
|
prevWid == h_of ||
|
|
|
|
|
// "some ... Wednesdays"
|
|
|
|
|
prevWid == h_some ||
|
|
|
|
|
prevWid == h_at )
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// seems like span breaks for meetup.com
|
|
|
|
|
// et al and not for abqtango.com maybe, we
|
|
|
|
|
// need to download the css??? or what???
|
|
|
|
|
// by default span tags do not seem to break
|
|
|
|
|
// the line but ppl maybe configure them to
|
|
|
|
|
if ( tid == TAG_SPAN ) break;
|
|
|
|
|
// if like <font> ignore it
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( ! isBreakingTagId(token2.nodeid) ) continue;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// only break on xml tags if in rss feed to
|
|
|
|
|
// fix <st1:State w:st="on">Arizona</st1>
|
|
|
|
|
// for gwair.org
|
|
|
|
|
if ( tid==TAG_XMLTAG && !m_isRSSExt) continue;
|
|
|
|
|
// otherwise, stop!
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
// skip simple spaces for speed
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( token2.token_len == 1 && is_wspace_a(token2.token_start[0]))
|
2013-08-02 16:12:24 -04:00
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
// do not allow punctuation that is in a url
|
|
|
|
|
// to be split up or used as a splitter. we want
|
|
|
|
|
// to keep the full url intact.
|
|
|
|
|
if ( j > i && j+1 < m_nw &&
|
2018-09-04 08:50:38 -04:00
|
|
|
|
(m_bits->queryBits(j-1) & D_IS_IN_URL) &&
|
|
|
|
|
(m_bits->queryBits(j ) & D_IS_IN_URL) &&
|
|
|
|
|
(m_bits->queryBits(j+1) & D_IS_IN_URL) )
|
2013-08-02 16:12:24 -04:00
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
// was last punct containing a comma?
|
|
|
|
|
lastWasComma = false;
|
|
|
|
|
// scan the punct chars, stop if we hit a sent breaker
|
2018-03-09 10:24:39 -05:00
|
|
|
|
const char *p = token2.token_start;
|
|
|
|
|
const char *pend = p + token2.token_len;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
for ( ; p < pend ; p++ ) {
|
|
|
|
|
// punct word...
|
|
|
|
|
if ( *p == '.' ) break;
|
|
|
|
|
if ( *p == ',' ) lastWasComma =true;
|
|
|
|
|
// allow this too for now... no...
|
|
|
|
|
if ( *p == ';' ) break;
|
|
|
|
|
// now hyphen breaks, mostly for stuff
|
|
|
|
|
// in title tags like dukecityfix.com
|
|
|
|
|
if ( sp[j]->m_tagId == TAG_TITLE &&
|
|
|
|
|
*p == '-' &&
|
|
|
|
|
is_wspace_a(p[-1]) &&
|
|
|
|
|
is_wspace_a(p[+1]) &&
|
|
|
|
|
lastWidPos >= 0 &&
|
|
|
|
|
! m_isRSSExt &&
|
|
|
|
|
j+1<m_nw &&
|
2018-03-09 10:24:39 -05:00
|
|
|
|
(*m_tr)[j+1].is_alfanum &&
|
2013-08-02 16:12:24 -04:00
|
|
|
|
//( ! (bb[lastWidPos] & D_IS_IN_DATE) ||
|
|
|
|
|
// ! (bb[j+1] & D_IS_IN_DATE) ) &&
|
|
|
|
|
// fix for $10 - $12
|
2018-03-09 10:24:39 -05:00
|
|
|
|
( ! is_digit ( (*m_tr)[lastWidPos].token_start[0]) ||
|
|
|
|
|
! is_digit ( (*m_tr)[j+1].token_start[0]) ) )
|
2013-08-02 16:12:24 -04:00
|
|
|
|
break;
|
|
|
|
|
// . treat colon like comma now
|
|
|
|
|
// . for unm.edu we have
|
|
|
|
|
// "Summer Hours: March 15 - Oct15:
|
|
|
|
|
// 8 am. Mon - Fri, 7:30 am - 10 am Sun.,
|
|
|
|
|
// Winter Hours: Oct. 15 - March 15:
|
|
|
|
|
// 8 am., seven days a week"
|
|
|
|
|
// . and we don't want "winter hours" being
|
|
|
|
|
// topologically closer to the summer hours
|
|
|
|
|
// . that is, the colon is a stronger binder
|
|
|
|
|
// than the comma?
|
|
|
|
|
// . but for villr.com Hours: May-Aug.. gets
|
|
|
|
|
// made into two sentences and Hours is
|
|
|
|
|
// seen as a heading section and causes
|
|
|
|
|
// addImpliedSections() to be wrong.
|
|
|
|
|
// . why not the colon?
|
|
|
|
|
if ( *p == ':' ) {
|
|
|
|
|
|
|
|
|
|
// Tags: music,concert,fun
|
|
|
|
|
if ( prevWid == h_tags &&
|
|
|
|
|
// just Tags: so far in sentence
|
|
|
|
|
j == i )
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
// a "::" is used in breadcrumbs,
|
|
|
|
|
// so break on that.
|
|
|
|
|
// fixes "Dining :: Visit ::
|
|
|
|
|
// Cal Performances" title
|
|
|
|
|
if ( p[1] == ':' )
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
// if "with" precedes, allow
|
|
|
|
|
if ( prevWid == h_with ) continue;
|
|
|
|
|
|
|
|
|
|
// or prev word was tag! like
|
|
|
|
|
// "blah</b>:..."
|
2018-03-09 10:24:39 -05:00
|
|
|
|
bool tagAfter = (j-1>=0 && (*m_tr)[j-1].nodeid);
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// do not allow if next word is tag
|
2018-03-09 10:24:39 -05:00
|
|
|
|
bool tagBefore = (j+1<m_nw && (*m_tr)[j+1].nodeid);
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// do not allow
|
|
|
|
|
// "<br>...:<br>" or
|
|
|
|
|
// "<br>...<br>:" or
|
|
|
|
|
// since such things are usually
|
|
|
|
|
// somewhat like headers. isolated
|
|
|
|
|
// lines ending on a colon.
|
|
|
|
|
// should fix st. martin's center
|
|
|
|
|
// for unm.edu "Summer Hours: ..."
|
|
|
|
|
if ( lastbr >= 0 &&
|
|
|
|
|
( tagBefore || tagAfter ) ) {
|
|
|
|
|
// end sentence there then
|
|
|
|
|
j = lastbr;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ( tagBefore ) break;
|
|
|
|
|
if ( tagAfter ) break;
|
2016-01-06 18:01:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// for now allow it!
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
// . special hyphen
|
|
|
|
|
// . breaks up title for peachpundit.com
|
|
|
|
|
// so we get better event title generation
|
|
|
|
|
// since peachpundit.com will be a repeat sec
|
|
|
|
|
// . BUT it did not work!
|
|
|
|
|
if ( p[0] == (char)-30 &&
|
|
|
|
|
p[1] == (char)-128 &&
|
|
|
|
|
p[2] == (char)-108 )
|
|
|
|
|
break;
|
|
|
|
|
// this for sure
|
|
|
|
|
// "Home > Albuquerque Events > Love Song ..."
|
|
|
|
|
if ( *p == '>' ) break;
|
|
|
|
|
if ( *p == '!' ) break;
|
|
|
|
|
if ( *p == '?' ) break;
|
|
|
|
|
if ( *p == '|' )
|
|
|
|
|
break;
|
|
|
|
|
// bullets
|
|
|
|
|
if ( p[0] == (char)226 &&
|
|
|
|
|
p[1] == (char)128 &&
|
|
|
|
|
p[2] == (char)162 )
|
|
|
|
|
break;
|
|
|
|
|
redo:
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
// if none, keep going
|
|
|
|
|
if ( p == pend ) continue;
|
|
|
|
|
// if an alnum char follows the ., it is ok
|
|
|
|
|
// probably a hostname or ip or phone #
|
|
|
|
|
if ( is_alnum_utf8(p+1) &&
|
|
|
|
|
// "venue:ABQ Sq Dance Center..." for
|
|
|
|
|
// americantowns.com has no space after the colon!
|
|
|
|
|
*p !=':' )
|
|
|
|
|
goto redo;
|
|
|
|
|
// if abbreviation before we are ok too
|
|
|
|
|
if ( *p == '.' && isAbbr(prevWid,&hasWordAfter) ) {
|
|
|
|
|
// but the period may serve a double purpose
|
|
|
|
|
// to end the abbr and terminate the sentence
|
|
|
|
|
// if the word that follows is capitalized,
|
|
|
|
|
// and if the abbr is a lower-case noun.
|
|
|
|
|
//
|
|
|
|
|
// if abbr is like "vs" then do not end sentence
|
|
|
|
|
if ( hasWordAfter )
|
|
|
|
|
goto redo;
|
|
|
|
|
|
|
|
|
|
// set "next" to next alnum word after us
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t next = j+1;
|
|
|
|
|
int32_t max = next + 10;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
if ( max > m_nw ) max = m_nw;
|
|
|
|
|
for ( ; next < max ; next++ ) {
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( ! (*m_tr)[next].is_alfanum ) continue;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// was previous word/abbr capitalized?
|
|
|
|
|
// if so, assume period does not end sentence.
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( is_capitalized_utf8((*m_tr)[lastWidPos].token_start) )
|
2013-08-02 16:12:24 -04:00
|
|
|
|
goto redo;
|
|
|
|
|
// if next word is NOT capitalized, assume
|
|
|
|
|
// period does not end sentence...
|
|
|
|
|
if ( next < max &&
|
2018-03-09 10:24:39 -05:00
|
|
|
|
! is_capitalized_utf8((*m_tr)[next].token_start) )
|
2013-08-02 16:12:24 -04:00
|
|
|
|
goto redo;
|
|
|
|
|
// otherwise, abbr is NOT capitalized and
|
|
|
|
|
// next word IS capitalized, so assume the
|
|
|
|
|
// period DOES end the sentence (we fall through to the break below)
|
|
|
|
|
}
|
|
|
|
|
// fix "1. library name" for cabq.gov
|
|
|
|
|
if ( *p == '.' &&
|
2018-03-09 10:24:39 -05:00
|
|
|
|
lastWidPos == i) {
|
|
|
|
|
auto const &t = (*m_tr)[lastWidPos];
|
|
|
|
|
if(is_ascii_digit_string(t.token_start, t.token_end()))
|
|
|
|
|
goto redo;
|
|
|
|
|
}
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// ok, stop otherwise
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// do not include tag at end. try to fix sentence flip flop.
|
|
|
|
|
for ( ; j > i ; j-- )
|
|
|
|
|
// stop when we just contain the last word
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( (*m_tr)[j-1].is_alfanum ) break;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// make our sentence endpoints now
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t senta = i;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// make the sentence defined by [senta,sentb) where sentb
|
|
|
|
|
// defines a half-open interval like we do for almost
|
|
|
|
|
// everything else
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t sentb = j;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// update i for next iteration
|
|
|
|
|
i = sentb - 1;
|
|
|
|
|
|
|
|
|
|
// crap, but now sentences intersect with our tag-based
|
|
|
|
|
// sections because they can now split tags because of websites
|
|
|
|
|
// like aliconference.com and abqtango.com whose sentences
|
|
|
|
|
// do not align with the tag sections. therefore we introduce
|
|
|
|
|
// the SEC_TOP_SPLIT and SEC_BOTTOM_SPLIT to indicate
|
|
|
|
|
// that the section is a top/bottom piece of a split sentence.
|
|
|
|
|
// if both bits are set we assume SEC_MIDDLE_SPLIT.
|
|
|
|
|
// then we set the Section::m_senta and m_sentb to
|
|
|
|
|
// indicate the whole sentence of which it is a split.
|
|
|
|
|
// but the vast majority of the time m_senta and m_sentb
|
|
|
|
|
// will equal m_firstWordPos and m_lastWordPos respectively.
|
|
|
|
|
// then, any routine that
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// so scan the words in the sentence and as we scan we have
|
|
|
|
|
// to determine the parent section we are inserting the sentence
|
|
|
|
|
// into as a child section.
|
|
|
|
|
//Section *parent = NULL;
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t start = -1;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
Section *pp;
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t lastk = 0;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
Section *splitSection = NULL;
|
|
|
|
|
Section *lastGuy = NULL;
|
|
|
|
|
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t k = senta ; k <= sentb ; k++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// add final piece
|
|
|
|
|
if ( k == sentb ) {
|
|
|
|
|
// stop if no final piece
|
|
|
|
|
if ( start == -1 ) break;
|
|
|
|
|
// otherwise, add it
|
|
|
|
|
goto addit;
|
|
|
|
|
}
|
|
|
|
|
// need a real alnum word
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( ! (*m_tr)[k].is_alfanum ) continue;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get his parent
|
|
|
|
|
pp = m_sectionPtrs[k];
|
|
|
|
|
// set parent if need to
|
|
|
|
|
//if ( ! parent ) parent = pp;
|
|
|
|
|
// and start sentence if need to
|
|
|
|
|
if ( start == -1 ) start = k;
|
|
|
|
|
// if same as exact section as last guy, save some time
|
|
|
|
|
if ( pp == lastGuy ) pp = NULL;
|
|
|
|
|
// store it
|
|
|
|
|
lastGuy = pp;
|
|
|
|
|
// . i'd say blow up "pp" until its contains "start"
|
|
|
|
|
// . but if before it contains start it breaches
|
2016-01-05 05:39:00 -05:00
|
|
|
|
// [senta,sentb) then we have to cut things short
|
2013-08-02 16:12:24 -04:00
|
|
|
|
for ( ; pp ; pp = pp->m_parent ) {
|
|
|
|
|
// we now have to split section "pp"
|
|
|
|
|
// when adding the sentence section.
|
|
|
|
|
// once we have such a section we
|
|
|
|
|
// cannot use a different parent...
|
|
|
|
|
if ( pp->m_firstWordPos < start ||
|
|
|
|
|
pp->m_lastWordPos >= sentb ) {
|
|
|
|
|
// set it
|
|
|
|
|
if ( ! splitSection ) splitSection =pp;
|
|
|
|
|
// WE ARE ONLY ALLOWED TO SPLIT ONE
|
|
|
|
|
// SECTION ONLY...
|
|
|
|
|
if ( pp != splitSection)
|
|
|
|
|
goto addit;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
// keep telescoping until "parent" contains
|
|
|
|
|
// [senta,k] , and we already know that it
|
|
|
|
|
// contains k because that is what we set it to
|
|
|
|
|
//if ( pp->m_a <= senta ) break;
|
|
|
|
|
}
|
|
|
|
|
// mark it
|
2018-03-19 12:25:36 -04:00
|
|
|
|
if ( (*m_tr)[k].is_alfanum ) lastk = k;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// ok, keep chugging
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
// add the final piece if we go to this label
|
|
|
|
|
addit:
|
|
|
|
|
// use this flag
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t bh = BH_SENTENCE;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// determine parent section, smallest section
|
|
|
|
|
// containing [start,lastk]
|
|
|
|
|
Section *parent = m_sectionPtrs[start];
|
|
|
|
|
for ( ; parent ; parent = parent->m_parent ) {
|
|
|
|
|
// stop if contains lastk
|
|
|
|
|
if ( parent->m_b > lastk ) break;
|
|
|
|
|
}
|
|
|
|
|
//
|
|
|
|
|
// for "<span>Albuquerque</span>, New Mexico"
|
|
|
|
|
// "start" points to "Albuquerque" but needs to
|
|
|
|
|
// point to the "<span>" so its parent is "parent"
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t adda = start;
|
|
|
|
|
int32_t addb = lastk;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// need to update "start" to so its parent is the new
|
|
|
|
|
// "parent" now so insertSubSection() does not core
|
|
|
|
|
for ( ; adda >= 0 ; ) {
|
|
|
|
|
// stop if we finally got the right parent
|
|
|
|
|
if ( m_sectionPtrs[adda]==parent ) break;
|
|
|
|
|
// or if he's a tag and his parent
|
|
|
|
|
// is "parent" we can stop.
|
|
|
|
|
// i.e. STOP on a proper subsection of
|
|
|
|
|
// the section containing the sentence.
|
|
|
|
|
if ( m_sectionPtrs[adda]->m_parent==parent &&
|
|
|
|
|
m_sectionPtrs[adda]->m_a == adda )
|
|
|
|
|
break;
|
|
|
|
|
// backup
|
|
|
|
|
adda--;
|
|
|
|
|
// check
|
|
|
|
|
if ( adda < 0 ) break;
|
|
|
|
|
// how can this happen?
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( (*m_tr)[adda].is_alfanum ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
|
|
|
|
// sanity
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( adda < 0 ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// same for right endpoint
|
|
|
|
|
for ( ; addb < m_nw ; ) {
|
|
|
|
|
// stop if we finally got the right parent
|
|
|
|
|
if ( m_sectionPtrs[addb]==parent ) break;
|
|
|
|
|
// get it
|
|
|
|
|
Section *sp = m_sectionPtrs[addb];
|
|
|
|
|
// come back up here in the case of a section
|
|
|
|
|
// sharing its Section::m_b with its parent
|
|
|
|
|
subloop:
|
|
|
|
|
// or if he's a tag and his parent
|
|
|
|
|
// is "parent" we can stop
|
|
|
|
|
if ( sp->m_parent==parent &&
|
|
|
|
|
sp->m_b == addb+1 )
|
|
|
|
|
break;
|
|
|
|
|
// or if we ran into a brother section
|
|
|
|
|
// that does not contain the sentence...
|
|
|
|
|
// fix core dump for webnetdesign.com whose
|
|
|
|
|
// sentence consisted of 3 sections from
|
|
|
|
|
// A=7079 to B=7198. but now i am getting rid
|
|
|
|
|
// of allowing a lower case http(s):// on
|
|
|
|
|
// a separate line to indicate that the
|
|
|
|
|
// sentence continues... so we will not have
|
|
|
|
|
// this sentence anymore in case you are
|
|
|
|
|
// wondering why it is not there any more.
|
|
|
|
|
if ( sp->m_parent==parent &&
|
|
|
|
|
sp->m_a == addb ) {
|
|
|
|
|
// do not include that brother's tag
|
|
|
|
|
addb--;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// when we have bad tag formations like for
|
|
|
|
|
// http://gocitykids.parentsconnect.com/catego
|
|
|
|
|
// ry/buffalo-ny-usa/places-to-go/tourist-stops
|
|
|
|
|
// like <a><b>...</div> with no ending </a> or
|
|
|
|
|
// </b> tags then we have to get the parent
|
2016-12-19 11:14:13 -05:00
|
|
|
|
// of the parent as long as its m_b is the
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// same and check that before advancing addb
|
|
|
|
|
// otherwise we can miss the parent section
|
|
|
|
|
// that we want! (this is because the kid
|
|
|
|
|
// sections share the same m_b as their
|
|
|
|
|
// parent because of they have no ending tag)
|
|
|
|
|
if ( sp->m_parent &&
|
|
|
|
|
sp->m_parent->m_b == sp->m_b ) {
|
|
|
|
|
sp = sp->m_parent;
|
|
|
|
|
goto subloop;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// advance
|
|
|
|
|
addb++;
|
|
|
|
|
// stop if addb
|
|
|
|
|
if ( addb >= m_nw ) break;
|
|
|
|
|
// how can this happen?
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( (*m_tr)[addb].is_alfanum ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
|
|
|
|
// sanity
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( addb >= m_nw ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// ok, now add the split sentence
|
2016-01-06 11:25:42 -05:00
|
|
|
|
Section *is =insertSubSection(adda,addb+1,bh);
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// panic?
|
2016-09-08 11:44:05 -04:00
|
|
|
|
if ( ! is )
|
|
|
|
|
break;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// set sentence flag on it
|
|
|
|
|
is->m_flags |= SEC_SENTENCE;
|
|
|
|
|
// . set this
|
|
|
|
|
// . sentence is from [senta,sentb)
|
|
|
|
|
is->m_senta = senta;//start;
|
|
|
|
|
is->m_sentb = sentb;//k;
|
|
|
|
|
// stop if that was it
|
|
|
|
|
if ( k == sentb ) break;
|
|
|
|
|
// go on to next fragment then
|
|
|
|
|
start = -1;
|
|
|
|
|
parent = NULL;
|
|
|
|
|
splitSection = NULL;
|
|
|
|
|
lastGuy = NULL;
|
|
|
|
|
// redo this same k
|
|
|
|
|
k--;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t inSentTil = 0;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
Section *lastSent = NULL;
|
|
|
|
|
// get the section of each word. if not a sentence section then
|
|
|
|
|
// make its m_sentenceSection point to its parent that is a sentence
|
|
|
|
|
for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) {
|
|
|
|
|
// need sentence
|
|
|
|
|
if ( ( sk->m_flags & SEC_SENTENCE ) ) {
|
|
|
|
|
inSentTil = sk->m_b;
|
|
|
|
|
lastSent = sk;
|
|
|
|
|
sk->m_sentenceSection = sk;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
// skip if outside of the last sentence we had
|
|
|
|
|
if ( sk->m_a >= inSentTil ) continue;
|
|
|
|
|
// we are in that sentence
|
|
|
|
|
sk->m_sentenceSection = lastSent;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2016-01-06 11:25:42 -05:00
|
|
|
|
// Create a new "fake" section covering words [a,b) and splice it into the
// existing section structures.
//
// . a, b         half-open word range of the new section (m_a = a, m_b = b)
// . newBaseHash  value stored in the new section's m_baseHash
// . returns the new section, or NULL if no slot is left (g_errno is set to
//   EDOCBADSECTIONS) or if no predecessor in the section list could be found
//
// The new section inherits its parent's flags minus SEC_SENTENCE, plus
// SEC_FAKE. Words in [a,b) whose current section is not strictly contained
// by the new section are re-pointed at it; otherwise the outermost contained
// ancestor of the word's section is re-parented under the new section.
Section *Sections::insertSubSection ( int32_t a, int32_t b, int32_t newBaseHash ) {
	// all preallocated slots are used up
	if ( m_numSections >= m_maxNumSections ) {
		g_errno = EDOCBADSECTIONS;
		return NULL;
	}

	// take the next free slot, zeroed, and give it its word range
	Section *added = &m_sections[m_numSections++];
	memset ( added , 0 , sizeof(Section) );
	added->m_a = a;
	added->m_b = b;

	// keep m_lastSection pointing at the section that starts last
	if ( m_lastSection && m_lastSection->m_a < a )
		m_lastSection = added;

	// . find the list node the new section goes right after
	// . start at the section of word #a and walk backwards until we hit
	//   a section that starts before us, or starts with us and is at
	//   least as long
	Section *anchor = m_sectionPtrs[a];
	while ( anchor ) {
		if ( anchor->m_a < a ) break;
		if ( anchor->m_a == a && anchor->m_b >= b ) break;
		anchor = anchor->m_prev;
	}

	// . shortcut: the anchor chosen on the previous call may already be
	//   nearer to "a" than what we just found
	// . saves walking thousands of siblings when word #a sits directly
	//   in a huge section like the root
	if ( anchor &&
	     m_lastAdded &&
	     m_lastAdded->m_a > anchor->m_a &&
	     m_lastAdded->m_a < a )
		anchor = m_lastAdded;

	// . now creep forward as far as possible
	// . e.g. "<p> <strong>hey there!</strong> this is another sentence.</p>"
	//   for the 2nd sentence the anchor would be the <p> section, but
	//   the <strong> section has to be our predecessor
	while ( anchor && anchor->m_next ) {
		Section *cand = anchor->m_next;
		// starts after us, so it cannot precede us
		if ( cand->m_a > a ) break;
		// same start but shorter than us: it belongs after us
		if ( cand->m_a == a && cand->m_b < b ) break;
		// starts before us, or starts with us and contains us
		anchor = cand;
	}

	// remember it for the shortcut above
	m_lastAdded = anchor;

	// . a br tag can split the very first base html tag, like
	//   "<html>...</html> <br> ...." on mapsandatlases.org
	// . then there is no predecessor, so give the slot back and abort
	if ( ! anchor ) {
		m_numSections--;
		g_process.shutdownAbort(true);
		return NULL;
	}

	// link the new section into the list right after the anchor
	Section *follower = anchor->m_next;
	if ( follower ) follower->m_prev = added;
	added->m_next  = follower;
	added->m_prev  = anchor;
	anchor->m_next = added;

	// parent: telescope up from the section of word #a until [a,b) fits
	Section *parent = m_sectionPtrs[a];
	while ( parent->m_a > a || parent->m_b < b )
		parent = parent->m_parent;
	added->m_parent = parent;

	// sometimes an implied section is a subsection of a sentence, like
	// when there are a lot of double br tags in it
	added->m_sentenceSection = parent->m_sentenceSection;

	// inherit the parent's flags, but never SEC_SENTENCE, and always
	// mark the new section as fake
	sec_t inherited = parent->m_flags;
	inherited &= ~SEC_SENTENCE;
	inherited |= SEC_FAKE;
	added->m_flags    = inherited;
	added->m_baseHash = newBaseHash;

	// positions default to "none"
	added->m_firstWordPos = -1;
	added->m_lastWordPos  = -1;
	added->m_alnumPosA    = -1;
	added->m_alnumPosB    = -1;
	added->m_senta        = -1;
	added->m_sentb        = -1;

	// first alnum word in [a,b)
	for ( int32_t w = a ; w < b ; w++ ) {
		if ( (*m_tr)[w].is_alfanum ) {
			added->m_firstWordPos = w;
			break;
		}
	}

	// last alnum word in [a,b)
	for ( int32_t w = b - 1 ; w >= a ; w-- ) {
		if ( (*m_tr)[w].is_alfanum ) {
			added->m_lastWordPos = w;
			break;
		}
	}

	// . fix up the per-word section pointers and the parent links
	// . we scan our own words rather than the parent's children because
	//   the parent can have a ton of child sections
	for ( int32_t w = a ; w < b ; w++ ) {
		Section *owner = m_sectionPtrs[w];
		// the word's section is not strictly inside us: we become
		// the word's section and that section becomes our parent
		if ( ! added->strictlyContains ( owner ) ) {
			m_sectionPtrs[w] = added;
			added->m_parent  = owner;
			continue;
		}
		// climb to the ancestor whose parent contains us
		// (contains() also accepts an equal range)
		while ( owner->m_parent && ! owner->m_parent->contains(added) )
			owner = owner->m_parent;
		// already hangs off of us
		if ( owner->m_parent == added ) continue;
		// slide in between that ancestor and its parent
		added->m_parent = owner->m_parent;
		owner->m_parent = added;
		// sanity: our new child may not stick out of us
		if ( owner->m_b > added->m_b ) { g_process.shutdownAbort(true); }
		if ( owner->m_a < added->m_a ) { g_process.shutdownAbort(true); }
	}

	return added;
}
|
|
|
|
|
|
|
|
|
|
// this is a function because we also call it from addImpliedSections()!
|
|
|
|
|
void Sections::setNextBrotherPtrs ( bool setContainer ) {
|
|
|
|
|
|
|
|
|
|
// clear out
|
|
|
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
|
|
|
si->m_nextBrother = NULL;
|
|
|
|
|
si->m_prevBrother = NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
|
|
|
Section *sj = NULL;
|
|
|
|
|
|
|
|
|
|
// get word after us
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t wn = si->m_b;
|
|
|
|
|
int32_t nw2 = m_nw;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// if we hit a word in our parent.. then increment wn
|
|
|
|
|
// PROBLEM "<root><t1>hey</t1> blah blah blah x 1 mill</root>"
|
|
|
|
|
// would exhaust the full word list when si is the "t1"
|
|
|
|
|
// section.
|
|
|
|
|
Section *j2 = si->m_next;
|
|
|
|
|
if ( j2 && j2->m_a >= si->m_b ) {
|
|
|
|
|
sj = j2;
|
|
|
|
|
nw2 = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// try one more ahead for things like so we don't end up
|
|
|
|
|
// setting sj to the "t2" section as in:
|
|
|
|
|
// "<root><t1><t2>hey</t2></t1> ...."
|
|
|
|
|
if ( ! sj && j2 ) {
|
|
|
|
|
// try the next section then
|
|
|
|
|
j2 = j2->m_next;
|
|
|
|
|
// set "sj" if its a potential brother section
|
|
|
|
|
if ( j2 && j2->m_a >= si->m_b ) {
|
|
|
|
|
sj = j2;
|
|
|
|
|
nw2 = 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ok, try the next word algo approach
|
|
|
|
|
for ( ; wn < nw2 ; wn++ ) {
|
|
|
|
|
sj = m_sectionPtrs[wn];
|
|
|
|
|
if ( sj->m_a >= si->m_b ) break;
|
|
|
|
|
}
|
|
|
|
|
// bail if none
|
|
|
|
|
if ( wn >= m_nw ) continue;
|
|
|
|
|
|
|
|
|
|
// telescope up until brother if possible
|
|
|
|
|
for ( ; sj ; sj = sj->m_parent )
|
|
|
|
|
if ( sj->m_parent == si->m_parent ) break;
|
2016-01-06 06:41:40 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// give up?
|
|
|
|
|
if ( ! sj || sj->m_parent != si->m_parent ) continue;
|
2016-01-06 06:41:40 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// sanity check
|
|
|
|
|
if ( sj->m_a < si->m_b &&
|
|
|
|
|
sj->m_tagId != TAG_TC &&
|
|
|
|
|
si->m_tagId != TAG_TC ) {
|
2016-06-20 06:30:26 -04:00
|
|
|
|
g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// set brother
|
|
|
|
|
si->m_nextBrother = sj;
|
|
|
|
|
// set his prev then
|
|
|
|
|
sj->m_prevBrother = si;
|
|
|
|
|
// sanity check
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( sj->m_parent != si->m_parent ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// sanity check
|
|
|
|
|
if ( sj->m_a < si->m_b &&
|
|
|
|
|
sj->m_tagId != TAG_TC &&
|
|
|
|
|
si->m_tagId != TAG_TC ) {
|
2016-06-20 06:30:26 -04:00
|
|
|
|
g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// do more?
|
|
|
|
|
if ( ! setContainer ) continue;
|
|
|
|
|
// telescope this
|
|
|
|
|
Section *te = sj;
|
|
|
|
|
// telescope up until it contains "si"
|
|
|
|
|
for ( ; te && te->m_a > si->m_a ; te = te->m_parent );
|
|
|
|
|
// only update list container if smaller than previous
|
|
|
|
|
if ( ! si->m_listContainer )
|
|
|
|
|
si->m_listContainer = te;
|
2016-09-30 16:50:32 -04:00
|
|
|
|
else if ( te && te->m_a > si->m_listContainer->m_a )
|
2013-08-02 16:12:24 -04:00
|
|
|
|
si->m_listContainer = te;
|
|
|
|
|
if ( ! sj->m_listContainer )
|
|
|
|
|
sj->m_listContainer = te;
|
2016-09-30 16:50:32 -04:00
|
|
|
|
else if ( te && te->m_a > sj->m_listContainer->m_a )
|
2013-08-02 16:12:24 -04:00
|
|
|
|
sj->m_listContainer = te;
|
|
|
|
|
|
|
|
|
|
// now
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void Sections::setNextSentPtrs ( ) {
|
|
|
|
|
// kinda like m_rootSection
|
2018-02-23 07:53:03 -05:00
|
|
|
|
m_firstSentence = NULL;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
Section *finalSec = NULL;
|
2016-02-24 10:12:05 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// scan the sentence sections and number them to set m_sentNum
|
|
|
|
|
for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) {
|
|
|
|
|
// record final section
|
|
|
|
|
finalSec = sk;
|
2016-02-24 10:12:05 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// need sentence
|
2016-02-24 10:12:05 -05:00
|
|
|
|
if ( ! ( sk->m_flags & SEC_SENTENCE ) ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// first one?
|
2018-02-23 07:53:03 -05:00
|
|
|
|
if ( ! m_firstSentence ) {
|
|
|
|
|
m_firstSentence = sk;
|
2016-02-24 10:12:05 -05:00
|
|
|
|
}
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
2016-02-24 10:12:05 -05:00
|
|
|
|
|
|
|
|
|
Section *lastSent = NULL;
|
|
|
|
|
|
2018-02-23 07:53:03 -05:00
|
|
|
|
// now set "m_nextSentence" of each section
|
2013-08-02 16:12:24 -04:00
|
|
|
|
for ( Section *sk = finalSec ; sk ; sk = sk->m_prev ) {
|
|
|
|
|
// set this
|
2018-02-23 07:53:03 -05:00
|
|
|
|
sk->m_nextSentence = lastSent;
|
2016-02-24 10:12:05 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// need sentence
|
2016-02-24 10:12:05 -05:00
|
|
|
|
if ( ! ( sk->m_flags & SEC_SENTENCE ) ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// we are the sentence now
|
|
|
|
|
lastSent = sk;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#define TABLE_ROWS 25
|
|
|
|
|
|
2018-02-22 11:15:18 -05:00
|
|
|
|
// . appends a space-terminated label to "sbuf" for each flag that is set in
//   sn->m_flags, in a fixed order
// . for SEC_FAKE sections the label is chosen from sn->m_baseHash; an
//   unrecognized base hash aborts the process
void Sections::printFlags(SafeBuf *sbuf, const Section *sn) {
	const sec_t bits = sn->m_flags;

	// print "label" if any bit of "mask" is set on this section
	auto emitIf = [sbuf, bits] ( sec_t mask , const char *label ) {
		if ( bits & mask ) sbuf->safePrintf ( "%s" , label );
	};

	emitIf ( SEC_HEADING       , "heading "      );
	emitIf ( SEC_MENU_SENTENCE , "menusentence " );
	emitIf ( SEC_MENU          , "ismenu "       );
	emitIf ( SEC_MENU_HEADER   , "menuheader "   );
	emitIf ( SEC_LINK_TEXT     , "linktext "     );
	emitIf ( SEC_PLAIN_TEXT    , "plaintext "    );

	// fake sections carry their kind in m_baseHash
	if ( bits & SEC_FAKE ) {
		const char *kind = NULL;
		if      ( sn->m_baseHash == BH_BULLET   ) kind = "bulletdelim ";
		else if ( sn->m_baseHash == BH_SENTENCE ) kind = "<b>sentence</b> ";
		else if ( sn->m_baseHash == BH_IMPLIED  ) kind = "<b>impliedsec</b> ";

		if ( kind ) sbuf->safePrintf ( "%s" , kind );
		else { g_process.shutdownAbort(true); }
	}

	emitIf ( SEC_NOTEXT    , "notext "     );
	emitIf ( SEC_SCRIPT    , "inscript "   );
	emitIf ( SEC_NOSCRIPT  , "innoscript " );
	emitIf ( SEC_STYLE     , "instyle "    );
	emitIf ( SEC_HIDDEN    , "indivhide "  );
	emitIf ( SEC_SELECT    , "inselect "   );
	emitIf ( SEC_IN_HEAD   , "inhead "     );
	emitIf ( SEC_IN_TITLE  , "intitle "    );
	emitIf ( SEC_IN_HEADER , "inheader "   );
	emitIf ( SEC_IN_IFRAME , "iniframe "   );
}
|
|
|
|
|
|
2018-02-22 11:15:18 -05:00
|
|
|
|
bool Sections::isHardSection(const Section *sn) const {
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t a = sn->m_a;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// . treat this as hard... kinda like a div section...
|
|
|
|
|
// fixes gwair.org date from stealing address of another date
|
|
|
|
|
// because the span tags are fucked up...
|
|
|
|
|
// . crap, no this prevents publicbroadcasting.net and other urls
|
|
|
|
|
// from telescoping to header dates they need to telescope to.
|
|
|
|
|
// the header dates are in span tags and if that is seen as a hard
|
|
|
|
|
// section bad things happen
|
|
|
|
|
//if ( m_tids[a] == TAG_SPAN ) return true;
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( ! isBreakingTagId((*m_tr)[a].nodeid) ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// . if first child is hard that works!
|
|
|
|
|
// . fixes "<blockquote><p>..." for collectorsguide.com
|
|
|
|
|
if ( sn->m_next &&
|
|
|
|
|
sn->m_next->m_tagId &&
|
|
|
|
|
// fix "blah blah<br>blah blah" for sentence
|
|
|
|
|
sn->m_next->m_tagId != TAG_BR &&
|
|
|
|
|
sn->m_next->m_a < sn->m_b &&
|
|
|
|
|
isBreakingTagId(sn->m_next->m_tagId) )
|
|
|
|
|
return true;
|
|
|
|
|
// otherwise, forget it!
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
// trumba.com has sub dates in br-based implied sections that need
|
|
|
|
|
// to telescope to their parent above
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( (*m_tr)[a].nodeid == TAG_BR ) return false;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
if ( sn->m_flags & SEC_SENTENCE ) return false;
|
2016-01-06 06:41:40 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// xml tag exception for gwair.org. treat <st1:Place>... as soft
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( ((*m_tr)[a].nodeid & BACKBITCOMP) == TAG_XMLTAG && ! m_isRSSExt )
|
2013-08-02 16:12:24 -04:00
|
|
|
|
return false;
|
2016-01-06 06:41:40 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool Sections::setMenus ( ) {
|
|
|
|
|
// . this just returns if already set
|
|
|
|
|
// . sets Bits::m_bits[x].m_flags & D_IN_LINK if its in a link
|
|
|
|
|
// . this bits array is 1-1 with the words
|
|
|
|
|
m_bits->setInLinkBits(this);
|
|
|
|
|
|
|
|
|
|
sec_t flag;
|
|
|
|
|
// set SEC_PLAIN_TEXT and SEC_LINK_TEXT for all sections
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// need alnum word
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( ! (*m_tr)[i].is_alfanum ) continue;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get our flag
|
2018-09-04 08:50:38 -04:00
|
|
|
|
if ( m_bits->queryBits(i) & D_IN_LINK ) flag = SEC_LINK_TEXT;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
else flag = SEC_PLAIN_TEXT;
|
|
|
|
|
// get section ptr
|
|
|
|
|
Section *sk = m_sectionPtrs[i];
|
|
|
|
|
// loop for sk
|
|
|
|
|
for ( ; sk ; sk = sk->m_parent ) {
|
|
|
|
|
// skip if already set
|
|
|
|
|
if ( sk->m_flags & flag ) break;
|
|
|
|
|
// set it
|
|
|
|
|
sk->m_flags |= flag;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Section *last = NULL;
|
|
|
|
|
// . alernatively, scan through all anchor tags
|
|
|
|
|
// . compare to last anchor tag
|
|
|
|
|
// . and blow up each to their max non-intersection section and make
|
|
|
|
|
// sure no PLAIN text in either of those!
|
|
|
|
|
// . this is all to fix texasdrums.drums.org which has various span
|
|
|
|
|
// and bold tags throughout its menu at random
|
|
|
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
|
|
|
// . if we hit plain text, we kill our last
|
|
|
|
|
// . this was causing "geeks who drink" for blackbirdbuvette
|
|
|
|
|
// to get is SEC_MENU set because there was a link after it
|
2016-02-25 16:04:00 -05:00
|
|
|
|
if ( si->m_flags & SEC_PLAIN_TEXT ) {
|
|
|
|
|
last = NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// skip if not a href section
|
2016-02-25 16:04:00 -05:00
|
|
|
|
if ( si->m_baseHash != TAG_A ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// . if it is a mailto link forget it
|
|
|
|
|
// . fixes abtango.com from detecting a bad menu
|
2018-03-09 10:24:39 -05:00
|
|
|
|
const char *ptr = (*m_tr)[si->m_a].token_start;
|
|
|
|
|
int32_t plen = (*m_tr)[si->m_a].token_len;
|
2016-02-25 16:04:00 -05:00
|
|
|
|
|
2018-02-09 11:52:47 -05:00
|
|
|
|
const char *mailto = strncasestr(ptr,plen,"mailto:");
|
2016-02-25 16:04:00 -05:00
|
|
|
|
if ( mailto ) {
|
|
|
|
|
last = NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// bail if no last
|
|
|
|
|
if ( ! last ) { last = si; continue; }
|
2016-02-25 16:04:00 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// save last
|
|
|
|
|
Section *prev = last;
|
2016-02-25 16:04:00 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// set last for next round, used "saved" below
|
|
|
|
|
last = si;
|
2016-02-25 16:04:00 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get first "hard" section encountered while telescoping
|
|
|
|
|
Section *prevHard = NULL;
|
2016-02-25 16:04:00 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// blow up last until right before it contains us
|
|
|
|
|
for ( ; prev ; prev = prev->m_parent ) {
|
|
|
|
|
// record?
|
|
|
|
|
if ( ! prevHard && isHardSection(prev) )
|
|
|
|
|
prevHard = prev;
|
|
|
|
|
// if parent contains us, stop
|
|
|
|
|
if ( prev->m_parent->contains ( si ) ) break;
|
|
|
|
|
}
|
2016-02-25 16:04:00 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// if it has plain text, forget it!
|
2016-09-30 16:57:33 -04:00
|
|
|
|
if ( prev && prev->m_flags & SEC_PLAIN_TEXT ) continue;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// use this for us
|
|
|
|
|
Section *sk = si;
|
|
|
|
|
// get first "hard" section encountered while telescoping
|
|
|
|
|
Section *skHard = NULL;
|
|
|
|
|
// same for us
|
|
|
|
|
for ( ; sk ; sk = sk->m_parent ) {
|
|
|
|
|
// record?
|
|
|
|
|
if ( ! skHard && isHardSection(sk) ) skHard = sk;
|
|
|
|
|
// if parent contains us, stop
|
2016-10-02 03:44:50 -04:00
|
|
|
|
if ( prev && sk->m_parent->contains ( prev ) ) break;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
|
|
|
|
// if it has plain text, forget it!
|
2016-09-30 16:54:20 -04:00
|
|
|
|
if ( sk && sk->m_flags & SEC_PLAIN_TEXT ) continue;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// . first hard sections encountered must match!
|
|
|
|
|
// . otherwise for switchborad.com we lose "A B C ..." as
|
|
|
|
|
// title candidate because we think it is an SEC_MENU
|
|
|
|
|
// because the sections before it have links in them, but
|
|
|
|
|
// they have different hard sections
|
|
|
|
|
if ( prevHard && ! skHard ) continue;
|
|
|
|
|
if ( ! prevHard && skHard ) continue;
|
2016-09-30 16:57:33 -04:00
|
|
|
|
if ( prevHard && prevHard->m_tagId != skHard->m_tagId ) continue;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// ok, great that works!
|
2016-09-30 16:57:33 -04:00
|
|
|
|
if( prev ) {
|
|
|
|
|
prev->m_flags |= SEC_MENU;
|
|
|
|
|
}
|
2016-09-30 16:54:20 -04:00
|
|
|
|
if( sk ) {
|
|
|
|
|
sk->m_flags |= SEC_MENU;
|
|
|
|
|
}
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
|
|
|
|
|
2014-10-30 15:36:39 -04:00
|
|
|
|
int64_t h_copyright = hash64n("copyright");
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// copyright check
|
|
|
|
|
// the copyright symbol in utf8 (see Entities.cpp for the code)
|
2018-03-19 12:42:06 -04:00
|
|
|
|
static const char copy[] = "<EFBFBD>";
|
2016-02-25 16:04:00 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// scan all years, lists and ranges of years, and look for
|
|
|
|
|
// a preceeding copyright sign. mark such years as DF_COPYRIGHT
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// skip if tag
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( (*m_tr)[i].nodeid ) continue;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// do we have an alnum word before us here?
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( (*m_tr)[i].is_alfanum ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// if word check for copyright
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( (*m_tr)[i].token_hash != h_copyright ) continue;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
|
|
|
|
// must have copyright sign in it i guess
|
2018-03-20 09:09:39 -04:00
|
|
|
|
else if ( ! gb_strncasestr((*m_tr)[i].token_start, (*m_tr)[i].token_len, copy))
|
2013-08-02 16:12:24 -04:00
|
|
|
|
continue;
|
|
|
|
|
// mark section as copyright section then
|
|
|
|
|
Section *sp = m_sectionPtrs[i];
|
|
|
|
|
// flag as menu
|
|
|
|
|
sp->m_flags |= SEC_MENU;
|
|
|
|
|
}
|
|
|
|
|
|
2016-03-03 10:07:28 -05:00
|
|
|
|
sec_t ff = SEC_MENU;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// set SEC_MENU of child sections of SEC_MENU sections
|
2016-02-24 05:21:06 -05:00
|
|
|
|
for ( Section *si = m_rootSection; si; si = si->m_next ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// must be a link text only section
|
2016-02-24 05:21:06 -05:00
|
|
|
|
if ( !( si->m_flags & ff ) )
|
|
|
|
|
continue;
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// ignore if went down this path
|
2016-02-24 05:21:06 -05:00
|
|
|
|
if ( si->m_used == 82 ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get first potential kid
|
|
|
|
|
Section *sk = si->m_next;
|
|
|
|
|
// scan child sections
|
2016-02-24 05:21:06 -05:00
|
|
|
|
for ( ; sk; sk = sk->m_next ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// stop if not contained
|
2016-02-24 05:21:06 -05:00
|
|
|
|
if ( !si->contains( sk ) ) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// mark it
|
2016-02-24 05:21:06 -05:00
|
|
|
|
sk->m_flags |= ( si->m_flags & ff ); // SEC_MENU;
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// ignore in big loop
|
|
|
|
|
sk->m_used = 82;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//
|
|
|
|
|
// set SEC_MENU_HEADER
|
|
|
|
|
//
|
|
|
|
|
for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) {
|
|
|
|
|
// skip if not in a menu
|
2016-02-24 05:21:06 -05:00
|
|
|
|
if ( ! ( sk->m_flags & SEC_MENU ) ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get his list container
|
|
|
|
|
Section *c = sk->m_listContainer;
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// skip if none
|
2016-02-24 05:21:06 -05:00
|
|
|
|
if ( !c ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// already flagged?
|
2016-02-24 05:21:06 -05:00
|
|
|
|
if ( c->m_used == 89 ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// do not repeat on any item in this list
|
|
|
|
|
c->m_used = 89;
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// flag all its brothers!
|
|
|
|
|
Section *zz = sk;
|
2016-02-24 05:21:06 -05:00
|
|
|
|
for ( ; zz; zz = zz->m_nextBrother ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// bail if not in menu
|
2016-02-24 05:21:06 -05:00
|
|
|
|
if ( !( zz->m_flags & SEC_MENU ) ) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// if broked it, stop
|
2016-02-24 05:21:06 -05:00
|
|
|
|
if ( zz ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
//
|
|
|
|
|
// ok, every item in list is a menu item, so try to set header
|
|
|
|
|
//
|
|
|
|
|
// get word before first item in list
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t r = sk->m_a - 1;
|
2018-03-09 10:24:39 -05:00
|
|
|
|
for ( ; r >= 0 && !(*m_tr)[r].is_alfanum; r-- )
|
2016-07-26 16:59:59 -04:00
|
|
|
|
;
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// if no header, skip
|
2016-02-24 05:21:06 -05:00
|
|
|
|
if ( r < 0 ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// set SEC_MENU_HEADER
|
2016-02-24 05:21:06 -05:00
|
|
|
|
setHeader( r, sk, SEC_MENU_HEADER );
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//
|
|
|
|
|
// set SEC_MENU_SENTENCE flag
|
|
|
|
|
//
|
2016-02-24 05:21:06 -05:00
|
|
|
|
for ( Section *si = m_rootSection; si; si = si->m_next ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// must be a link text only section
|
2016-02-24 05:21:06 -05:00
|
|
|
|
if ( !( si->m_flags & SEC_MENU ) ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// set this
|
|
|
|
|
bool gotSentence = ( si->m_flags & SEC_SENTENCE );
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// set SEC_MENU of the sentence
|
2016-02-24 05:21:06 -05:00
|
|
|
|
if ( gotSentence ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// parent up otherwise
|
2016-02-24 05:21:06 -05:00
|
|
|
|
for ( Section *sk = si->m_parent; sk; sk = sk->m_parent ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// stop if sentence finally
|
2016-02-24 05:21:06 -05:00
|
|
|
|
if ( !( sk->m_flags & SEC_SENTENCE ) ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// not a menu sentence if it has plain text in it
|
|
|
|
|
// though! we have to make this exception to stop
|
2016-02-24 05:21:06 -05:00
|
|
|
|
// stuff like
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// "Wedding Ceremonies, No preservatives, more... "
|
|
|
|
|
// from switchboard.com from being a menu sentence
|
|
|
|
|
// just because "more" is in a link.
|
2016-02-24 05:21:06 -05:00
|
|
|
|
if ( sk->m_flags & SEC_PLAIN_TEXT ) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// set it
|
|
|
|
|
sk->m_flags |= SEC_MENU_SENTENCE;
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// and stop
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static bool s_init = false;
|
2014-10-30 15:36:39 -04:00
|
|
|
|
static int64_t h_close ;
|
|
|
|
|
static int64_t h_send ;
|
|
|
|
|
static int64_t h_map ;
|
|
|
|
|
static int64_t h_maps ;
|
|
|
|
|
static int64_t h_directions ;
|
|
|
|
|
static int64_t h_driving ;
|
|
|
|
|
static int64_t h_help ;
|
|
|
|
|
static int64_t h_more ;
|
|
|
|
|
static int64_t h_log ;
|
|
|
|
|
static int64_t h_sign ;
|
|
|
|
|
static int64_t h_change ;
|
|
|
|
|
static int64_t h_write ;
|
|
|
|
|
static int64_t h_save ;
|
|
|
|
|
static int64_t h_share ;
|
|
|
|
|
static int64_t h_forgot ;
|
|
|
|
|
static int64_t h_home ;
|
|
|
|
|
static int64_t h_sitemap ;
|
|
|
|
|
static int64_t h_advanced ;
|
|
|
|
|
static int64_t h_go ;
|
|
|
|
|
static int64_t h_website ;
|
|
|
|
|
static int64_t h_view;
|
|
|
|
|
static int64_t h_add;
|
|
|
|
|
static int64_t h_submit;
|
|
|
|
|
static int64_t h_get;
|
|
|
|
|
static int64_t h_about;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// new stuff
|
2014-10-30 15:36:39 -04:00
|
|
|
|
static int64_t h_back; // back to top
|
|
|
|
|
static int64_t h_next;
|
|
|
|
|
static int64_t h_buy; // buy tickets
|
|
|
|
|
static int64_t h_english; // english french german versions
|
|
|
|
|
static int64_t h_click;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
if ( ! s_init ) {
|
|
|
|
|
s_init = true;
|
|
|
|
|
h_close = hash64n("close");
|
|
|
|
|
h_send = hash64n("send");
|
|
|
|
|
h_map = hash64n("map");
|
|
|
|
|
h_maps = hash64n("maps");
|
|
|
|
|
h_directions = hash64n("directions");
|
|
|
|
|
h_driving = hash64n("driving");
|
|
|
|
|
h_help = hash64n("help");
|
|
|
|
|
h_more = hash64n("more");
|
|
|
|
|
h_log = hash64n("log");
|
|
|
|
|
h_sign = hash64n("sign");
|
|
|
|
|
h_change = hash64n("change");
|
|
|
|
|
h_write = hash64n("write");
|
|
|
|
|
h_save = hash64n("save");
|
|
|
|
|
h_share = hash64n("share");
|
|
|
|
|
h_forgot = hash64n("forgot");
|
|
|
|
|
h_home = hash64n("home");
|
|
|
|
|
h_sitemap = hash64n("sitemap");
|
|
|
|
|
h_advanced = hash64n("advanced");
|
|
|
|
|
h_go = hash64n("go");
|
|
|
|
|
h_website = hash64n("website");
|
|
|
|
|
h_view = hash64n("view");
|
|
|
|
|
h_add = hash64n("add");
|
|
|
|
|
h_submit = hash64n("submit");
|
|
|
|
|
h_get = hash64n("get");
|
|
|
|
|
h_about = hash64n("about");
|
|
|
|
|
h_back = hash64n ("back");
|
|
|
|
|
h_next = hash64n ("next");
|
|
|
|
|
h_buy = hash64n ("buy");
|
|
|
|
|
h_english = hash64n ("english");
|
|
|
|
|
h_click = hash64n ("click");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// . when dup/non-dup voting info is not available because we are
|
|
|
|
|
// more or less an isolated page, guess that these links are
|
|
|
|
|
// menu links and not to be considered for title or event description
|
|
|
|
|
// . we completely exclude a word from title/description if its
|
|
|
|
|
// SEC_MENU is set.
|
|
|
|
|
// . set SEC_MENU for renegade links that start with an action
|
|
|
|
|
// verb like "close" or "add" etc. but if their # of non dup votes
|
|
|
|
|
// is high relative to their # of dup votes, then do not set this
|
|
|
|
|
// because it might be a name of a band like "More" or something
|
|
|
|
|
// and be in a link
|
|
|
|
|
// . scan all href sections
|
|
|
|
|
// set SEC_LINK_ONLY on sections that just contain a link
|
|
|
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
|
|
|
// skip if not a href section
|
|
|
|
|
if ( si->m_baseHash != TAG_A ) continue;
|
|
|
|
|
// set points to scan
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t a = si->m_a;
|
|
|
|
|
int32_t b = si->m_b;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// assume not bad
|
|
|
|
|
bool bad = false;
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t i;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// scan words if any
|
|
|
|
|
for ( i = a ; i < b ; i++ ) {
|
2018-03-09 10:24:39 -05:00
|
|
|
|
const auto &token = (*m_tr)[i];
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// skip if not word
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( ! token.is_alfanum ) continue;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// assume bad
|
|
|
|
|
bad = true;
|
|
|
|
|
// certain words are indicative of menus
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( token.token_hash == h_close ) break;
|
|
|
|
|
if ( token.token_hash == h_send ) break;
|
|
|
|
|
if ( token.token_hash == h_map ) break;
|
|
|
|
|
if ( token.token_hash == h_maps ) break;
|
|
|
|
|
if ( token.token_hash == h_directions ) break;
|
|
|
|
|
if ( token.token_hash == h_driving ) break;
|
|
|
|
|
if ( token.token_hash == h_help ) break;
|
|
|
|
|
if ( token.token_hash == h_more ) break;
|
|
|
|
|
if ( token.token_hash == h_log ) break; // log in
|
|
|
|
|
if ( token.token_hash == h_sign ) break; // sign up/in
|
|
|
|
|
if ( token.token_hash == h_change ) break; // change my loc.
|
|
|
|
|
if ( token.token_hash == h_write ) break; // write a review
|
|
|
|
|
if ( token.token_hash == h_save ) break;
|
|
|
|
|
if ( token.token_hash == h_share ) break;
|
|
|
|
|
if ( token.token_hash == h_forgot ) break; // forgot your pwd
|
|
|
|
|
if ( token.token_hash == h_home ) break;
|
|
|
|
|
if ( token.token_hash == h_sitemap ) break;
|
|
|
|
|
if ( token.token_hash == h_advanced ) break; // adv search
|
|
|
|
|
if ( token.token_hash == h_go ) break; // go to top of page
|
|
|
|
|
if ( token.token_hash == h_website ) break;
|
|
|
|
|
if ( token.token_hash == h_view ) break;
|
|
|
|
|
if ( token.token_hash == h_add ) break;
|
|
|
|
|
if ( token.token_hash == h_submit ) break;
|
|
|
|
|
if ( token.token_hash == h_get ) break;
|
|
|
|
|
if ( token.token_hash == h_about ) break;
|
|
|
|
|
if ( token.token_hash == h_back ) break;
|
|
|
|
|
if ( token.token_hash == h_next ) break;
|
|
|
|
|
if ( token.token_hash == h_buy ) break;
|
|
|
|
|
if ( token.token_hash == h_english ) break;
|
|
|
|
|
if ( token.token_hash == h_click ) break;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
bad = false;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
// skip if ok
|
|
|
|
|
if ( ! bad ) continue;
|
|
|
|
|
// get smallest section
|
|
|
|
|
Section *sm = m_sectionPtrs[i];
|
|
|
|
|
// if bad mark it!
|
|
|
|
|
sm->m_flags |= SEC_MENU;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// "first" is first item in the list we are getting header for
|
2014-11-10 17:45:11 -05:00
|
|
|
|
void Sections::setHeader ( int32_t r , Section *first , sec_t flag ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// get smallest section containing word #r
|
|
|
|
|
Section *sr = m_sectionPtrs[r];
|
|
|
|
|
// save orig
|
|
|
|
|
Section *orig = sr;
|
|
|
|
|
|
|
|
|
|
// blow up until just before "first" section
|
|
|
|
|
for ( ; sr ; sr = sr->m_parent ) {
|
|
|
|
|
// forget it if in title tag already!
|
|
|
|
|
if ( sr->m_flags & SEC_IN_TITLE ) return;
|
|
|
|
|
// stop if no parent
|
|
|
|
|
if ( ! sr->m_parent ) continue;
|
|
|
|
|
// parent must not contain first
|
|
|
|
|
if ( sr->m_parent->contains ( first ) ) break;
|
|
|
|
|
}
|
|
|
|
|
// if we failed to contain "first"... what does this mean? i dunno
|
|
|
|
|
// but its dropping core for
|
|
|
|
|
// http://tedserbinski.com/jcalendar/jcalendar.js
|
|
|
|
|
if ( ! sr ) return;
|
|
|
|
|
|
|
|
|
|
// save that
|
|
|
|
|
Section *biggest = sr;
|
|
|
|
|
|
|
|
|
|
// check out prev brother
|
|
|
|
|
Section *prev = biggest->m_prevBrother;
|
|
|
|
|
|
|
|
|
|
// if we are in a hard section and capitalized (part of the
|
|
|
|
|
// SEC_HEADING) requirements, then it should be ok if we have
|
|
|
|
|
// a prev brother of a different tagid.
|
|
|
|
|
// this will fix americantowns.com which has a list of header tags
|
|
|
|
|
// and ul tags intermingled, with menus in the ul tags.
|
|
|
|
|
// should also fix upcoming.yahoo.com which has alternating
|
|
|
|
|
// dd and dt tags for its menus. now that we got rid of
|
|
|
|
|
// addImpliedSections() we have to deal with this here, and it will
|
|
|
|
|
// be more accurate since addImpliedSections() was often wrong.
|
|
|
|
|
if ( prev &&
|
|
|
|
|
(orig->m_flags & SEC_HEADING) &&
|
|
|
|
|
prev->m_tagId != biggest->m_tagId )
|
|
|
|
|
prev = NULL;
|
|
|
|
|
|
|
|
|
|
// but if prev brother is a blank, we should view that as a delimeter
|
|
|
|
|
// BUT really we should have added those sections in with the new
|
|
|
|
|
// delimeter logic! but let's put this in for now anyway...
|
|
|
|
|
if ( prev && prev->m_firstWordPos < 0 )
|
|
|
|
|
prev = NULL;
|
|
|
|
|
|
|
|
|
|
// if the header section has a prev brother, forget it!
|
|
|
|
|
if ( prev ) return;
|
|
|
|
|
|
|
|
|
|
// . if we gained extra text, that is a no-no then
|
|
|
|
|
// . these two checks replaced the two commented out ones above
|
|
|
|
|
// . they allow for empty sections preceeding "sr" at any level as
|
|
|
|
|
// we telescope it up
|
|
|
|
|
if ( biggest->m_firstWordPos != orig->m_firstWordPos ) return;
|
|
|
|
|
if ( biggest->m_lastWordPos != orig->m_lastWordPos ) return;
|
|
|
|
|
|
|
|
|
|
// . now blow up first until just before it hits biggest as well
|
|
|
|
|
// . this fixes reverbnation on the nextBrother check below
|
|
|
|
|
for ( ; first ; first = first->m_parent ) {
|
|
|
|
|
// stop if parent is NULL
|
|
|
|
|
if ( ! first->m_parent ) break;
|
|
|
|
|
// stop if parent would contain biggest
|
|
|
|
|
if ( first->m_parent->contains ( biggest ) ) break;
|
|
|
|
|
}
|
|
|
|
|
// if after blowing it up "first" contains more than just menu
|
|
|
|
|
// sections, then bail. that really was not a menu header!
|
|
|
|
|
// fixes reverbnation url that thought "That 1 Guy" was a menu header.
|
|
|
|
|
if ( flag == SEC_MENU_HEADER ) {
|
|
|
|
|
Section *fx = first;
|
|
|
|
|
for ( ; fx ; fx = fx->m_next ) {
|
|
|
|
|
// stop when list is over
|
|
|
|
|
if ( fx->m_a >= first->m_b ) break;
|
|
|
|
|
// ignore if no next
|
|
|
|
|
if ( fx->m_flags & SEC_NOTEXT ) continue;
|
|
|
|
|
// thats bad if SEC_MENU not set, it should be for all!
|
|
|
|
|
if ( fx->m_flags & SEC_MENU ) continue;
|
|
|
|
|
// we got these now
|
|
|
|
|
if ( fx->m_flags & SEC_MENU_SENTENCE ) continue;
|
|
|
|
|
// otherwise, bad!
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// scan until outside biggest
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t lastb = biggest->m_b;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// . make sure sr does not contain any list in it
|
|
|
|
|
// . scan all sections between sr and "saved"
|
|
|
|
|
for ( ; sr ; sr = sr->m_next ) {
|
|
|
|
|
// stop if over
|
|
|
|
|
if ( sr->m_a >= lastb ) break;
|
|
|
|
|
// if we have a brother with same taghash we are
|
|
|
|
|
// part of a list
|
|
|
|
|
if ( sr->m_nextBrother &&
|
|
|
|
|
sr->m_nextBrother->m_tagHash == sr->m_tagHash &&
|
|
|
|
|
sr->m_nextBrother != first )
|
|
|
|
|
return;
|
|
|
|
|
if ( sr->m_prevBrother &&
|
|
|
|
|
sr->m_prevBrother->m_tagHash == sr->m_tagHash &&
|
|
|
|
|
// for footers
|
|
|
|
|
sr->m_prevBrother != first )
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// restart loop
|
|
|
|
|
sr = biggest;
|
|
|
|
|
// ok, not part of a list, flag it
|
|
|
|
|
for ( ; sr ; sr = sr->m_next ) {
|
|
|
|
|
// stop if over
|
|
|
|
|
if ( sr->m_a >= lastb ) break;
|
|
|
|
|
// flag each subsection
|
|
|
|
|
sr->m_flags |= flag; // SEC_MENU_HEADER;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
2016-02-23 15:59:42 -05:00
|
|
|
|
// . set SEC_HEADING bits in Section::m_flags
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// . identifies sections that are most likely headings
|
|
|
|
|
// . the WHOLE idea of this algo is to take a list of sections that are all
|
|
|
|
|
// the same tagId/baseHash and differentiate them so we can insert implied
|
|
|
|
|
// sections with headers.
|
|
|
|
|
bool Sections::setHeadingBit ( ) {
|
|
|
|
|
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t headings = 0;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// scan the sections
|
|
|
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t fwp = si->m_firstWordPos;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
if ( fwp == -1 ) continue;
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// we must be the smallest container around this text
|
|
|
|
|
if ( m_sectionPtrs[fwp] != si ) continue;
|
|
|
|
|
|
|
|
|
|
// . make sure we are in our own hard section
|
|
|
|
|
// . TODO: allow for bold or strong, etc. tags as well
|
|
|
|
|
bool hasHard = false;
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t a = si->m_firstWordPos;
|
|
|
|
|
int32_t b = si->m_lastWordPos;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// go to parent
|
|
|
|
|
Section *pp = si;
|
|
|
|
|
Section *biggest = NULL;
|
|
|
|
|
bool inLink = false;
|
|
|
|
|
// . we need to be isolated in our own hard section container
|
|
|
|
|
// . TODO: what about "<b>Hi There <i>Bob</i></b>" as a heading
|
|
|
|
|
// . i guess that will still work!
|
|
|
|
|
for ( ; pp ; pp = pp->m_parent ) {
|
|
|
|
|
// stop if breached
|
|
|
|
|
if ( pp->m_firstWordPos != a ) break;
|
|
|
|
|
if ( pp->m_lastWordPos != b ) break;
|
|
|
|
|
// record this
|
|
|
|
|
if ( pp->m_tagId == TAG_A ) inLink = true;
|
|
|
|
|
// record the biggest section containing just our text
|
|
|
|
|
biggest = pp;
|
|
|
|
|
// is it a hard section?
|
|
|
|
|
if ( isHardSection(pp) ) hasHard = true;
|
|
|
|
|
// . allow bold and strong tags
|
|
|
|
|
// . fixes gwair.org which has the dates of the
|
2016-03-02 05:30:05 -05:00
|
|
|
|
// month in strong tags. so we need to set
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// SEC_HEADING for those so getDelimHash() will
|
|
|
|
|
// recognize such tags as date header tags in the
|
|
|
|
|
// METHOD_DOM algorithm and we get the proper
|
|
|
|
|
// implied sections
|
|
|
|
|
if ( pp->m_tagId == TAG_STRONG ) hasHard = true;
|
|
|
|
|
if ( pp->m_tagId == TAG_B ) hasHard = true;
|
|
|
|
|
}
|
|
|
|
|
// need to be isolated in a hard section
|
|
|
|
|
if ( ! hasHard ) continue;
|
|
|
|
|
|
|
|
|
|
// now make sure the text is capitalized etc
|
|
|
|
|
bool hadUpper = false;
|
|
|
|
|
//bool hadLower = false;
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t lowerCount = 0;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
bool hadYear = false;
|
|
|
|
|
bool hadAlpha = false;
|
2014-11-10 17:45:11 -05:00
|
|
|
|
int32_t i;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// scan the alnum words we contain
|
|
|
|
|
for ( i = a ; i <= b ; i++ ) {
|
2018-03-09 10:24:39 -05:00
|
|
|
|
const auto &token = (*m_tr)[i];
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// . did we hit a breaking tag?
|
|
|
|
|
// . "<div> blah <table><tr><td>blah... </div>"
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( token.nodeid && isBreakingTagId(token.nodeid) ) break;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// skip if not alnum word
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( ! token.is_alfanum ) continue;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// skip digits
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if(token.token_len == 4 &&
|
|
|
|
|
is_digit(token.token_start[0]) &&
|
|
|
|
|
is_digit(token.token_start[1]) &&
|
|
|
|
|
is_digit(token.token_start[2]) &&
|
|
|
|
|
is_digit(token.token_start[3])) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// . but if we had a year like "2010" that
|
|
|
|
|
// is allowed to be a header.
|
|
|
|
|
// . this fixes 770kob.com because the events
|
|
|
|
|
// under the "2010" header were telescoping
|
|
|
|
|
// up into events in the "December 2009"
|
|
|
|
|
// section, when they should have been in
|
|
|
|
|
// their own section! and now they are in
|
|
|
|
|
// their own implied section...
|
2018-03-09 10:24:39 -05:00
|
|
|
|
int32_t num = atol2(token.token_start,token.token_len);
|
2013-08-02 16:12:24 -04:00
|
|
|
|
if ( num < 1800 ) continue;
|
|
|
|
|
if ( num > 2100 ) continue;
|
|
|
|
|
// mark it
|
|
|
|
|
hadYear = true;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
// mark this
|
|
|
|
|
hadAlpha = true;
|
|
|
|
|
// is it upper?
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( is_upper_utf8(token.token_start) ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
hadUpper = true;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
// skip stop words
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if(isStopWord(token.token_start, token.token_len, token.token_hash)) continue;
|
2016-01-05 05:39:00 -05:00
|
|
|
|
// . skip short words
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// . November 4<sup>th</sup> for facebook.com
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( token.token_len <= 2 ) continue;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// is it lower?
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( is_lower_utf8(token.token_start) ) lowerCount++;
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// stop now if bad
|
|
|
|
|
//if ( hadUpper ) break;
|
|
|
|
|
if ( lowerCount >= 2 ) break;
|
|
|
|
|
}
|
|
|
|
|
// is it a header?
|
|
|
|
|
bool isHeader = hadUpper;
|
|
|
|
|
// a single year by itself is ok though too
|
|
|
|
|
if ( hadYear && ! hadAlpha ) isHeader = true;
|
|
|
|
|
// allow for one mistake like we do in Events.cpp for titles
|
|
|
|
|
if ( lowerCount >= 2 ) isHeader = false;
|
|
|
|
|
if ( ! isHeader ) continue;
|
|
|
|
|
|
|
|
|
|
// ok, mark this section as a heading section
|
|
|
|
|
si->m_flags |= SEC_HEADING;
|
|
|
|
|
|
|
|
|
|
// a hack!
|
|
|
|
|
if ( inLink ) biggest->m_flags |= SEC_LINK_TEXT;
|
|
|
|
|
|
|
|
|
|
// count them
|
|
|
|
|
headings++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// bail now if no headings were set
|
|
|
|
|
if ( ! headings ) return true;
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void Sections::setTagHashes ( ) {
|
|
|
|
|
if ( m_numSections == 0 ) return;
|
|
|
|
|
|
|
|
|
|
// now recompute the tagHashes and depths and content hashes since
|
|
|
|
|
// we have eliminate open-ended sections in the loop above
|
|
|
|
|
for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) {
|
|
|
|
|
// these have to be in order of sn->m_a to work right
|
|
|
|
|
// because we rely on the parent tag hash, which would not
|
2016-03-02 05:30:05 -05:00
|
|
|
|
// necessarily be set if we were not sorted, because the
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// parent section could have SEC_FAKE flag set because it is
|
|
|
|
|
// a br section added afterwards.
|
2016-02-24 10:12:05 -05:00
|
|
|
|
|
2016-01-04 07:55:46 -05:00
|
|
|
|
// shortcut
|
2014-10-30 15:36:39 -04:00
|
|
|
|
int64_t bh = (int64_t)sn->m_baseHash;
|
2016-02-24 10:12:05 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// sanity check
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( bh == 0 ) { g_process.shutdownAbort(true); }
|
2016-02-24 10:12:05 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// if no parent, use initial values
|
|
|
|
|
if ( ! sn->m_parent ) {
|
|
|
|
|
sn->m_depth = 0;
|
|
|
|
|
sn->m_tagHash = bh;
|
2016-02-24 07:37:41 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// sanity check
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( bh == 0 ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
continue;
|
|
|
|
|
}
|
2016-02-24 10:12:05 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// sanity check
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( sn->m_parent->m_tagHash == 0 ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
|
|
|
|
// . update the cumulative front tag hash
|
|
|
|
|
// . do not include hyperlinks as part of the cumulative hash!
|
|
|
|
|
sn->m_tagHash = hash32h ( bh , sn->m_parent->m_tagHash );
|
|
|
|
|
|
|
|
|
|
sn->m_colorHash = hash32h ( bh , sn->m_parent->m_colorHash );
|
|
|
|
|
|
|
|
|
|
// if we are an implied section, just use the tag hash of
|
|
|
|
|
// our parent. that way since we add different implied
|
|
|
|
|
// sections for msichicago.com root than we do the kid,
|
|
|
|
|
// the section voting should still match up
|
|
|
|
|
if ( bh == BH_IMPLIED ) {
|
|
|
|
|
sn->m_tagHash = sn->m_parent->m_tagHash;
|
|
|
|
|
}
|
|
|
|
|
|
2016-02-24 10:12:05 -05:00
|
|
|
|
if ( sn->m_tagHash == 0 ) {
|
|
|
|
|
sn->m_tagHash = 1234567;
|
|
|
|
|
}
|
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// depth based on parent, too
|
|
|
|
|
sn->m_depth = sn->m_parent->m_depth + 1;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// make this replace ::print() when it works
|
2018-02-22 12:05:46 -05:00
|
|
|
|
// Convenience overload: packs the output buffer, the highlighted word position
// and the per-word vectors into a PrintData and forwards to print(PrintData*).
// Returns whatever that overload returns.
bool Sections::print( SafeBuf *sbuf, int32_t hiPos, const int32_t *wposVec, const char *densityVec, const char *wordSpamVec, const char *fragVec ) const {
	PrintData printArgs;

	// where the output goes
	printArgs.sbuf        = sbuf;

	// per-word vectors consulted while printing each word
	printArgs.wposVec     = wposVec;
	printArgs.densityVec  = densityVec;
	printArgs.wordSpamVec = wordSpamVec;
	printArgs.fragVec     = fragVec;

	// word position that gets highlighted
	printArgs.hiPos       = hiPos;

	return print(&printArgs);
}
|
2013-08-02 16:12:24 -04:00
|
|
|
|
|
2018-02-22 12:05:46 -05:00
|
|
|
|
// Render the section structure as HTML into pd->sbuf.
//
// Output consists of two parts:
//   1. the document itself, printed as nested colored <div>s, one per
//      section (see printSectionDiv()), and
//   2. one or more tables with a row per section (word range, hashes, depth,
//      parent word range, flags and a short text snippet).
//
// Aborts the process (g_process.shutdownAbort) if a word maps to a section
// that does not contain it, or if a section still has m_b == -1.
// Always returns true.
bool Sections::print(PrintData *pd) const {
	pd->sbuf->setLabel ("sectprnt");

	//verifySections();

	int32_t nw = m_tr->size();

	// back a byte count off to the start of a UTF-8 character so that a
	// clipped string never ends in the middle of a multi-byte sequence.
	// only call this with len smaller than the full string length, so
	// that s[len] is readable.
	auto clipToCharStart = []( const char *s , int32_t len ) -> int32_t {
		while ( len > 0 && ( (unsigned char)s[len] & 0xC0 ) == 0x80 )
			len--;
		return len;
	};

	// check words: each one must lie inside [m_a,m_b) of its section
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// get section
		Section *sn = m_sectionPtrs[i];
		if ( sn->m_a > i ) { g_process.shutdownAbort(true); }
		if ( sn->m_b <= i ) { g_process.shutdownAbort(true); }
	}

	// print sections out
	for ( Section *sk = m_rootSection ; sk ; ) {
		// print this section (and, recursively, everything inside it)
		printSectionDiv(pd,sk);
		// advance
		int32_t b = sk->m_b;
		// stop if last
		// NOTE(review): this bound is m_nw while the check loop above
		// uses m_tr->size(); presumably the two are equal - confirm
		if ( b >= m_nw ) break;
		// get section after that
		sk = m_sectionPtrs[b];
	}

	// print header
	const char *hdr =
		"<table border=1>"
		"<tr>"
		"<td><b>sec #</b></td>"
		"<td><b>wordStart</b></td>"
		"<td><b>wordEnd</b></td>"
		"<td><b>baseHash</b></td>"
		"<td><b>cumulTagHash</b></td>"
		"<td><b>contentHash</b></td>"
		"<td><b>contentTagHash</b></td>"
		"<td><b>XOR</b></td>" // only valid for contentHashes
		"<td><b>depth</b></td>"
		"<td><b>parent word range</b></td>"
		"<td><b>flags</b></td>"
		"<td><b>evIds</b></td>"
		"<td><b>text snippet</b></td>"
		"</tr>\n";
	pd->sbuf->safePrintf("%s",hdr);

	int32_t rcount = 0;
	int32_t scount = 0;
	// show word # of each section so we can look in PageParser.cpp's
	// output to see exactly where it starts, since we now label all
	// the words
	for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) {
		// start a fresh table every TABLE_ROWS rows so that one huge
		// table does not slow the browser down
		if ( (++rcount % TABLE_ROWS ) == 0 )
			pd->sbuf->safePrintf("</table>%s\n",hdr);

		// "XOR" column: tag hash ^ content hash, or "--" when the
		// section has no content hash
		const char *xs = "--";
		char ttt[100];
		if ( sn->m_contentHash64 ) {
			// only the low 32 bits are shown
			const uint32_t modified = (uint32_t)(sn->m_tagHash ^ sn->m_contentHash64);
			snprintf(ttt,sizeof(ttt),"0x%" PRIx32,modified);
			xs = ttt;
		}

		// parent word range, -1/-1 if there is no parent
		Section *parent = sn->m_parent;
		int32_t pswn = -1;
		int32_t pewn = -1;
		if ( parent ) {
			pswn = parent->m_a;
			pewn = parent->m_b;
		}

		// print it. hex columns take uint32_t to match PRIx32; the
		// hashes are deliberately truncated to their low 32 bits.
		pd->sbuf->safePrintf("<tr><td>%" PRId32 "</td>\n"
				     "<td>%" PRId32 "</td>"
				     "<td>%" PRId32 "</td>"
				     "<td>0x%" PRIx32 "</td>"
				     "<td>0x%" PRIx32 "</td>"
				     "<td>0x%" PRIx32 "</td>"
				     "<td>0x%" PRIx32 "</td>"
				     "<td>%s</td>"
				     "<td>%" PRId32 "</td>"
				     "<td><nobr>%" PRId32 " to %" PRId32 "</nobr></td>"
				     "<td><nobr>" ,
				     scount++,
				     sn->m_a,
				     sn->m_b,
				     (uint32_t)sn->m_baseHash,
				     (uint32_t)sn->m_tagHash,
				     (uint32_t)sn->m_contentHash64,
				     (uint32_t)(sn->m_contentHash64^sn->m_tagHash),
				     xs,
				     sn->m_depth,
				     pswn,
				     pewn);
		// now show the flags
		printFlags ( pd->sbuf , sn );
		// first few words of section
		int32_t a = sn->m_a;
		int32_t b = sn->m_b;
		// -1 means an unclosed tag!! should no longer be the case
		if ( b == -1 ) { g_process.shutdownAbort(true); }//b=m_words->m_numWords;
		pd->sbuf->safePrintf("</nobr></td>");

		// the "evIds" column is always empty
		pd->sbuf->safePrintf("<td> </td>");

		pd->sbuf->safePrintf("<td><nobr>");
		// 70 bytes max
		const int32_t max = 70;
		int32_t count = 0;
		bool truncated = false;
		// do not print last word/tag in section
		for ( int32_t i = a ; i < b - 1 && count < max ; i++ ) {
			const char *s = (*m_tr)[i].token_start;
			int32_t slen = (*m_tr)[i].token_len;
			// bytes actually emitted for this token
			int32_t printLen = slen;
			if ( count + slen > max ) {
				truncated = true;
				slen = max - count;
				// do not cut a multi-byte character in half
				printLen = clipToCharStart ( s , slen );
			}
			// budget is charged by the unclipped amount so the
			// loop ends right after a truncation
			count += slen;
			// boldify front tag
			if ( i == a ) pd->sbuf->safePrintf("<b>");
			pd->sbuf->htmlEncode(s,printLen,false);
			// boldify front tag
			if ( i == a ) pd->sbuf->safePrintf("</b>");
		}
		// if we truncated print a ...
		if ( truncated ) pd->sbuf->safePrintf("<b>…</b>");
		// then print ending tag
		if ( b < nw ) {
			//b is from m_b and always>0 so indexing b-1 is safe
			const char *bs = (*m_tr)[b-1].token_start;
			int32_t blen = (*m_tr)[b-1].token_len;
			// at most 20 bytes, again ending on a char boundary
			if ( blen>20 ) blen = clipToCharStart ( bs , 20 );
			pd->sbuf->safePrintf("<b>");
			pd->sbuf->htmlEncode(bs,blen,false);
			pd->sbuf->safePrintf("</b>");
		}

		pd->sbuf->safePrintf("</nobr></td></tr>\n");
	}

	pd->sbuf->safePrintf("</table>\n<br>\n");

	return true;
}
|
|
|
|
|
|
2018-02-22 12:05:46 -05:00
|
|
|
|
// Print section "sk" as a colored HTML <div> into pd->sbuf, recursing into
// its subsections.
//
// The div's background color comes from the low 24 bits of sk->m_colorHash;
// text and border are switched to white when that background is dark. Inside
// the div we print a short italic summary (start word, tag hash, content
// hash, flags) followed by every token in [sk->m_a, sk->m_b). Alnum tokens
// are bolded and annotated with their word position and, when not at the
// maximum, their density rank and word spam rank taken from pd.
//
// Always returns true.
bool Sections::printSectionDiv(PrintData *pd, const Section *sk) const {
	// enter a new div section now
	pd->sbuf->safePrintf("<br>");

	// only make font color different
	int32_t bcolor = (int32_t)sk->m_colorHash& 0x00ffffff;
	int32_t fcolor = 0x000000;
	int32_t rcolor = 0x000000;

	// split the background into its three 8-bit channels with shifts so
	// the result does not depend on the machine's byte order
	const int32_t c0 =  bcolor        & 0xff;
	const int32_t c1 = (bcolor >>  8) & 0xff;
	const int32_t c2 = (bcolor >> 16) & 0xff;

	bool dark = false;
	// all three channels in the lower half
	if ( c0<128 && c1<128 && c2<128 )
		dark = true;
	// or if any two channels are less than 100
	if ( (c0<100 && c1<100) || (c1<100 && c2<100) || (c0<100 && c2<100) ) dark = true;

	// if bg color is dark, make font color light
	if ( dark ) {
		fcolor = 0x00ffffff;
		rcolor = 0x00ffffff;
	}

	// start the new div (PRIx32 takes uint32_t)
	pd->sbuf->safePrintf("<div "
			     "style=\""
			     "background-color:#%06" PRIx32 ";"
			     "margin-left:20px;"
			     "border:#%06" PRIx32 " 1px solid;"
			     "color:#%06" PRIx32 "\">",
			     (uint32_t)bcolor,
			     (uint32_t)rcolor,
			     (uint32_t)fcolor);

	// a parentless section whose next section starts at the same word
	// leaves that first token to the next section
	bool printWord = true;
	if ( ! sk->m_parent && sk->m_next && sk->m_next->m_a == sk->m_a )
		printWord = false;

	// print the section's first token if it is a real (non-fake) tag
	if ( !(sk->m_flags&SEC_FAKE) && sk->m_tagId && printWord )
		// only encode if it is a tag
		pd->sbuf->htmlEncode((*m_tr)[sk->m_a].token_start, (*m_tr)[sk->m_a].token_len, false);

	pd->sbuf->safePrintf("<i>");

	// starting word number
	pd->sbuf->safePrintf("A=%" PRId32 " ",sk->m_a);

	// print tag hash now (PRIu32 takes uint32_t)
	pd->sbuf->safePrintf("taghash=%" PRIu32 " ",(uint32_t)sk->m_tagHash);

	if ( sk->m_contentHash64 )
		pd->sbuf->safePrintf("ch64=%" PRIu64 " ",sk->m_contentHash64);

	// print the flags
	printFlags ( pd->sbuf , sk );

	if ( isHardSection(sk) )
		pd->sbuf->safePrintf("hardsec ");

	pd->sbuf->safePrintf("</i>\n");

	// now print each word and subsections in this section
	int32_t a = sk->m_a;
	int32_t b = sk->m_b;
	for ( int32_t i = a ; i < b ; i++ ) {
		const auto &token = (*m_tr)[i];
		// . skip our own first token, except for the root section
		//   (no parent), where that token really belongs to our
		//   first child
		// . NOTE(review): the original comment talks about skipping
		//   the section's own *tag*, yet the test here is
		//   token.is_alfanum - confirm this is intended
		if ( i == a && token.is_alfanum && (sk->m_parent) ) continue;

		// . get section of this word
		// . TODO: what if this was the tr tag we removed??? i guess
		//   maybe make it NULL now?
		Section *ws = m_sectionPtrs[i];
		// climb to the top-most ancestor that still starts at the same
		// word as ws and lies strictly below "sk"; stop right away if
		// ws is "sk" itself
		for ( ; ; ws = ws->m_parent ) {
			if ( ws == sk ) break;
			if ( ! ws->m_parent ) break;
			if ( ws->m_parent->m_a != ws->m_a ) break;
			if ( ws->m_parent == sk ) break;
		}
		// if it belongs to another section, print that section
		if ( ws != sk ) {
			// print out this subsection
			printSectionDiv(pd,ws);
			// advance to end of that then
			i = ws->m_b - 1;
			// and try next word
			continue;
		}

		// ignore if in style section, etc. just print it out
		if ( sk->m_flags & NOINDEXFLAGS ) {
			pd->sbuf->htmlEncode(token.token_start,token.token_len,false );
			continue;
		}

		// non-alnum tokens are printed verbatim with no decoration
		if ( ! token.is_alfanum ) {
			pd->sbuf->htmlEncode(token.token_start, token.token_len, false );
			continue;
		}

		// is this the word position the caller asked to highlight?
		const bool isHiPos = ( pd->wposVec[i] == pd->hiPos );
		// strike the word through when its fragVec entry is 0
		// (fragVec is only consulted for the first MAXFRAGWORDS words)
		const bool struck = ( i < MAXFRAGWORDS && pd->fragVec[i] == 0 );

		// anchor so the page can jump to the highlighted word
		if ( isHiPos )
			pd->sbuf->safePrintf("<a name=hipos></a>");
		// boldify alnum words
		pd->sbuf->safePrintf("<nobr><b>");
		if ( struck )
			pd->sbuf->safePrintf("<strike>");
		if ( isHiPos )
			pd->sbuf->safePrintf("<blink style=\""
					     "background-color:yellow;"
					     "color:black;\">");
		// print that word
		pd->sbuf->htmlEncode(token.token_start, token.token_len, false );
		if ( isHiPos )
			pd->sbuf->safePrintf("</blink>");
		if ( struck )
			pd->sbuf->safePrintf("</strike>");
		pd->sbuf->safePrintf("</b>");

		// and print out their pos/div/spam sub
		pd->sbuf->safePrintf("<sub "
				     "style=\"background-color:white;"
				     "font-size:10px;"
				     "border:black 1px solid;"
				     "color:black;\">");
		pd->sbuf->safePrintf("%" PRId32, pd->wposVec[i]);
		// density rank, only when below the maximum
		if ( pd->densityVec[i] != MAXDENSITYRANK )
			pd->sbuf->safePrintf("/<font color=purple><b>%" PRId32
					     "</b></font>"
					     ,
					     (int32_t)pd->densityVec[i]);
		// word spam rank, only when below the maximum
		if ( pd->wordSpamVec[i] != MAXWORDSPAMRANK )
			pd->sbuf->safePrintf("/<font color=red><b>%" PRId32
					     "</b></font>"
					     ,
					     (int32_t)pd->wordSpamVec[i]);
		pd->sbuf->safePrintf("</sub></nobr>");
	}
	pd->sbuf->safePrintf("</div>\n");

	return true;
}
|
|
|
|
|
|
|
|
|
|
bool Sections::verifySections ( ) {
|
|
|
|
|
|
|
|
|
|
// make sure we map each word to a section that contains it at least
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
Section *si = m_sectionPtrs[i];
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( si->m_a > i ) { g_process.shutdownAbort(true); }
|
|
|
|
|
if ( si->m_b <= i ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// must have checksum
|
2018-03-09 10:24:39 -05:00
|
|
|
|
if ( (*m_tr)[i].is_alfanum && si->m_contentHash64==0) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// must have this set if 0
|
|
|
|
|
if ( ! si->m_contentHash64 && !(si->m_flags & SEC_NOTEXT)) {
|
2016-06-20 06:30:26 -04:00
|
|
|
|
g_process.shutdownAbort(true);}
|
2013-08-02 16:12:24 -04:00
|
|
|
|
if ( si->m_contentHash64 && (si->m_flags & SEC_NOTEXT)) {
|
2016-06-20 06:30:26 -04:00
|
|
|
|
g_process.shutdownAbort(true);}
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// sanity check
|
|
|
|
|
for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) {
|
|
|
|
|
// get it
|
|
|
|
|
//Section *sn = &m_sections[i];
|
|
|
|
|
// get parent
|
2018-02-22 12:09:05 -05:00
|
|
|
|
for(const Section *sp = sn->m_parent; sp; sp = sp->m_parent) {
|
|
|
|
|
// make sure parent fully contains
|
|
|
|
|
if ( sp->m_a > sn->m_a ) { g_process.shutdownAbort(true); }
|
|
|
|
|
if ( sp->m_b < sn->m_b ) { g_process.shutdownAbort(true); }
|
|
|
|
|
// and make sure every grandparent fully contains us too!
|
|
|
|
|
}
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// sanity check
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
Section *sn = &m_sections[i];
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( sn->m_a >= sn->m_b ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// sanity check, make sure each section is contained by the
|
|
|
|
|
// smallest section containing it
|
|
|
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
|
|
|
for ( Section *sj = m_rootSection ; sj ; sj = sj->m_next ) {
|
|
|
|
|
// skip if us
|
|
|
|
|
if ( sj == si ) continue;
|
|
|
|
|
// skip column sections because they are artificial
|
|
|
|
|
// and only truly contain some of the sections that
|
|
|
|
|
// their [a,b) interval says they contain.
|
|
|
|
|
if ( sj->m_tagId == TAG_TC ) continue;
|
|
|
|
|
// or if an implied section of td tags in a tc
|
|
|
|
|
if ( sj->m_baseHash == BH_IMPLIED &&
|
|
|
|
|
sj->m_parent &&
|
|
|
|
|
sj->m_parent->m_tagId == TAG_TC )
|
|
|
|
|
continue;
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// skip if sj does not contain first word in si
|
|
|
|
|
if ( sj->m_a > si->m_a ) continue;
|
|
|
|
|
if ( sj->m_b <= si->m_a ) continue;
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// ok, make sure in our parent path
|
|
|
|
|
Section *ps = si;
|
|
|
|
|
for ( ; ps ; ps = ps->m_parent )
|
|
|
|
|
if ( ps == sj ) break;
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// ok if we found it
|
|
|
|
|
if ( ps ) continue;
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// sometimes if sections are equal then the other
|
|
|
|
|
// is the parent
|
|
|
|
|
ps = sj;
|
|
|
|
|
for ( ; ps ; ps = ps->m_parent )
|
|
|
|
|
if ( ps == si ) break;
|
2016-02-24 05:21:06 -05:00
|
|
|
|
|
2013-08-02 16:12:24 -04:00
|
|
|
|
// must have had us
|
|
|
|
|
if ( ps ) continue;
|
2016-06-20 06:30:26 -04:00
|
|
|
|
g_process.shutdownAbort(true);
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// make sure we map each word to a section that contains it at least
|
2014-11-10 17:45:11 -05:00
|
|
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
2013-08-02 16:12:24 -04:00
|
|
|
|
Section *si = m_sectionPtrs[i];
|
2016-06-20 06:30:26 -04:00
|
|
|
|
if ( si->m_a > i ) { g_process.shutdownAbort(true); }
|
|
|
|
|
if ( si->m_b <= i ) { g_process.shutdownAbort(true); }
|
2013-08-02 16:12:24 -04:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|