privacore-open-source-searc.../Sections.cpp

// print events should print &nbsp; if nothing else to print

// when a div tag's parent truncates its section, it may have been
// paired up with a div back tag which then should become free...
// that is the problem... because those back tags are unpaired.
// so your parent should constrain you as SOON as it is constrained and
// close you up at that point. that way you cannot falsely pair-claim
// a div back tag.


#include "Sections.h"
#include "Url.h"
#include "tokenizer.h"
#include "Conf.h"
#include "XmlDoc.h"
#include "Bits.h"
#include "sort.h"
#include "Abbreviations.h"
#include "StopWords.h"
#include "Process.h"
#include "Posdb.h"
#include "GbUtil.h"
#include "Errno.h"

Sections::Sections ( ) {
	m_sections = NULL;
	reset();
}

void Sections::reset() {
	m_sectionBuf.purge();
	m_sectionPtrBuf.purge();

	m_sections         = NULL;
	m_bits             = NULL;
	m_numSections      = 0;
	m_rootSection      = NULL;
	m_lastSection      = NULL;
	m_lastAdded        = NULL;
	m_nw = 0;
	m_firstSentence = NULL;
	m_sectionPtrs = NULL;
	
	// Coverity
	m_tr = NULL;
	m_contentType = 0;
	m_isRSSExt = false;
	m_maxNumSections = 0;
}

Sections::~Sections ( ) {
	reset();
}

#define TXF_MATCHED 1

// an element on the stack is a Tag
class Tagx {
public:
	// id of the fron tag we pushed
	nodeid_t m_tid;
	// section number we represent
	int32_t     m_secNum;
	// set to TXF_MATCHED
	char     m_flags;
};

// i lowered from 1000 to 300 so that we more sensitive to malformed pages
// because typically they seem to take longer to parse. i also added some
// new logic for dealing with table tr and td back tags that allow us to
// pop off the other contained tags right away rather than delaying it until
// we are done because that will often breach this stack.
#define MAXTAGSTACK 300

// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
// . sets m_sections[] array, 1-1 with words array "w"
// . the Weights class can look at these sections and zero out the weights
//   for words in script, style, select and marquee sections
bool Sections::set(const TokenizerResult *tr, Bits *bits, const Url *url, uint8_t contentType ) {
	reset();

	if ( ! tr ) return true;

	if ( tr->size() > 1000000 ) {
		log("sections: over 1M words. skipping sections set for "
		    "performance.");
		return true;
	}

	// save it
	m_tr              = tr;
	m_bits            = bits;
	m_contentType     = contentType;

	// reset this just in case
	g_errno = 0;

	if ( tr->empty() ) return true;

	// shortcuts
	int32_t           nw  = tr->size();

	m_isRSSExt = false;
	const char *ext = url->getExtension();
	if ( ext && strcasecmp(ext,"rss") == 0 ) m_isRSSExt = true;
	if ( m_contentType == CT_XML ) m_isRSSExt = true;

	// . how many sections do we have max?
	// . init at one to count the root section
	int32_t max = 1;
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		const auto &token = (*tr)[i];
		// . count all front tags

		// count back tags too since some url 
		// http://www.tedxhz.com/tags.asp?id=3919&id2=494 had a bunch
		// of </p> tags with no front tags and it cored us because
		// m_numSections > m_maxNumSections!
		if ( token.nodeid ) {
			max += 2;
		// . any punct tag could have a bullet in it...
		// . or if its a period could make a sentence section
		} else if ( !token.is_alfanum ) {
			// only do not count simple spaces
			if ( token.token_len == 1 && is_wspace_a(token.token_start[0]))
				continue;
			// otherwise count it as sentence delimeter
			max++;
		}
	}

	// . then \0 allows for a sentence too!
	// . fix doc that was just "localize-sf-prod\n"
	max++;

	// and each section may create a sentence section
	max *= 2;

	// truncate if excessive.
	if ( max > 1000000 ) {
		log("sections: truncating max sections to 1000000");
		max = 1000000;
	}

	int32_t need = max * sizeof(Section);

	// set this
	m_maxNumSections = max;

	m_sectionPtrBuf.setLabel("psectbuf");

	// separate buf now for section ptr for each word
	if ( ! m_sectionPtrBuf.reserve ( nw *sizeof(Section *)) ) return true;
	m_sectionPtrs = (Section **)m_sectionPtrBuf.getBufStart();

	// allocate m_sectionBuf
	m_sections = NULL;

	m_sectionBuf.setLabel ( "sectbuf" );

	if ( ! m_sectionBuf.reserve ( need ) )
		return true;

	// point into it
	m_sections = (Section *)m_sectionBuf.getBufStart();

	// save this too
	m_nw = nw;

	// stack of front tags we encounter
	Tagx stack[MAXTAGSTACK];
	Tagx *stackPtr = stack;

	Section *current     = NULL;
	Section *rootSection = NULL;

	// assume none
	m_rootSection = NULL;

	// only add root section if we got some words
	if ( nw > 0 ) {
		// record this i guess
		rootSection = &m_sections[m_numSections];
		// clear
		memset ( rootSection , 0 , sizeof(Section) );
		// . the current section we are in
		// . let's use a root section
		current = rootSection;
		// init that to be the whole page
		rootSection->m_b = nw;
		// save it
		m_rootSection = rootSection;
		// to fix a core dump
		rootSection->m_baseHash = 1;
		// advance
		m_numSections++;
	}

	// Sections are no longer 1-1 with words, just with front tags
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		const auto &token = (*tr)[i];
		nodeid_t fullTid = token.nodeid;

		// are we a non-tag?
		if ( ! fullTid ) { 
			continue;
		}

		// make a single section for input tags
		if ( fullTid == TAG_INPUT ||
		     fullTid == TAG_HR    ||
		     fullTid == TAG_COMMENT ) {
			// try to realloc i guess. should keep ptrs in tact.
			if ( m_numSections >= m_maxNumSections) {
				g_errno = EDOCBADSECTIONS;
				return true;
			}
			// get the section
			Section *sn = &m_sections[m_numSections];
			// clear
			memset ( sn , 0 , sizeof(Section) );
			// inc it
			m_numSections++;
			// sanity check - breach check
			if ( m_numSections > max ) { g_process.shutdownAbort(true); }
			// set our parent
			sn->m_parent = current;
			// need to keep a word range that the section covers
			sn->m_a = i;
			// section consists of just this tag
			sn->m_b = i + 1;
			// go on to next
			continue;
		}

		// a section of multiple br tags in a sequence
		if ( fullTid == TAG_BR ) {
			// try to realloc i guess. should keep ptrs in tact.
			if ( m_numSections >= m_maxNumSections) {
				g_errno = EDOCBADSECTIONS;
				return true;
			}
			// get the section
			Section *sn = &m_sections[m_numSections];
			// clear
			memset ( sn , 0 , sizeof(Section) );
			// inc it
			m_numSections++;
			// sanity check - breach check
			if ( m_numSections > max ) { g_process.shutdownAbort(true); }
			// set our parent
			sn->m_parent = current;
			// need to keep a word range that the section covers
			sn->m_a = i;
			// count em up
			int32_t brcnt = 1;
			// scan for whole sequence
			int32_t lastBrPos = i;
			for ( int32_t j = i + 1 ; j < nw ; j++ ) {
				const auto &token2 = (*tr)[j];
				// claim br tags
				if ( token2.nodeid == TAG_BR ) { 
					lastBrPos = j;
					brcnt++; 
					continue; 
				}
				// break on words
				if ( token2.is_alfanum ) break;
				// all spaces is ok
				if ( is_wspace_utf8_string(token2.token_start,token2.token_end()) ) continue;
				// otherwise, stop on other punct
				break;
			}
			// section consists of just this tag
			sn->m_b = lastBrPos + 1;
			// advance
			i = lastBrPos;
			// set this for later so that getDelimHash() returns
			// something different based on the br count for
			// METHOD_ATTRIBUTE
			sn->m_baseHash = 19999 + brcnt;
			// go on to next
			continue;
		}

		// get the tag id without the back bit
		nodeid_t tid = fullTid & BACKBITCOMP;

		// . ignore tags with no corresponding back tags
		// . if they have bad html and have front tags
		//   with no corresponding back tags, that will hurt!
		// . make exception for <li> tag!!!
		// . was messing up:
		//   http://events.kqed.org/events/index.php?com=detail&
		//   eID=9812&year=2009&month=11
		//   for parsing out events
		// . make excpetion for <p> tag too! most ppl use </p>
		if ( ( ! hasBackTag ( tid ) || 
		       token.token_start[1] =='!'    || // <!ENTITY rdfns...>
		       token.token_start[1] =='?'    ) &&
		     tid != TAG_P &&
		     tid != TAG_LI )
			continue;

		// . these imply no back tag
		// . <description />
		// . fixes inconsistency in 
		//   www.trumba.com/calendars/KRQE_Calendar.rss
		if ( token.token_start[token.token_len-2] == '/' && tid == TAG_XMLTAG )
			continue;

		// do not breach the stack
		if ( stackPtr - stack >= MAXTAGSTACK ) {
			log( LOG_WARN, "html: stack breach for %s",url->getUrl());
			// if we set g_errno and return then the url just
			// ends up getting retried once the spider lock
			// in Spider.cpp expires in MAX_LOCK_AGE seconds.
			// about an hour. but really we should completely
			// give up on this. whereas we should retry OOM errors
			// etc. but this just means bad html really.

			// just reset to 0 sections then
			reset();
			return true;
		}

		char gotBackTag ;
		if ( fullTid != tid ) gotBackTag = 1;
		else                  gotBackTag = 0;

		// "pop tid", tid to pop off stack
		nodeid_t ptid       = tid;
		nodeid_t fullPopTid = fullTid;

		// no nested <li> tags allowed
		if ( fullTid == TAG_LI &&  
		     stackPtr > stack && 
		     ((stackPtr-1)->m_tid)==TAG_LI ) 
			gotBackTag = 2;

		// no nested <b> tags allowed
		if ( fullTid == TAG_B &&
		     stackPtr > stack && 
		     ((stackPtr-1)->m_tid)==TAG_B ) 
			gotBackTag = 2;

		// no nested <a> tags allowed
		if ( fullTid == TAG_A &&
		     stackPtr > stack && 
		     ((stackPtr-1)->m_tid)==TAG_A ) 
			gotBackTag = 2;

		// no nested <p> tags allowed
		if ( fullTid == TAG_P &&
		     stackPtr > stack && 
		     ((stackPtr-1)->m_tid)==TAG_P ) 
			gotBackTag = 2;

		// no <hN> tags inside a <p> tag
		// fixes http://www.law.berkeley.edu/140.htm
		if ( fullTid >= TAG_H1 &&
		     fullTid <= TAG_H5 &&
		     stackPtr > stack && 
		     ((stackPtr-1)->m_tid)==TAG_P ) {
			// match this on stack
			ptid       = TAG_P;
			fullPopTid = TAG_P;
			gotBackTag = 2;
		}

		// no nested <td> tags allowed
		if ( fullTid == TAG_TD &&
		     stackPtr > stack && 
		     ((stackPtr-1)->m_tid)==TAG_TD ) 
			gotBackTag = 2;

		// encountering <tr> when in a <td> closes the <td> AND
		// should also close the <tr>!!
		if ( fullTid == TAG_TR &&
		     stackPtr > stack && 
		     ((stackPtr-1)->m_tid)==TAG_TD )
			gotBackTag = 2;

		// no nested <tr> tags allowed
		if ( fullTid == TAG_TR &&
		     stackPtr > stack && 
		     ((stackPtr-1)->m_tid)==TAG_TR )
			gotBackTag = 2;

		// this is true if we are a BACK TAG
		if ( gotBackTag ) {

			// ignore span tags that are non-breaking because they
			// do not change the grouping/sectioning behavior of
			// the web page and are often abused.
			if ( ptid == TAG_SPAN ) continue;

			// fix for gwair.org
			if ( ptid == TAG_FONT ) continue;

			// too many people use these like a <br> tag or
			// make them open-ended or unbalanced
			//if ( tid == TAG_P      ) continue;
			if ( ptid == TAG_CENTER ) continue;
			
		subloop:
			// don't blow the stack
			if ( stackPtr == stack ) continue;

			// point to it
			Tagx *spp = (stackPtr - 1);

			// init it
			Tagx *p ;
			// scan through the stack until we find a
			// front tag that matches this back tag
			//for(p = spp ; p >= stack && gotBackTag == 1 ; p-- ) {
			for ( p = spp ; p >= stack ; p-- ) {
				// no match?
				if ( p->m_tid != ptid ) {
					// matched before? we can pop
					if ( p->m_flags & TXF_MATCHED )
						continue;
					// keep on going
					continue;
				}
				// do not double match
				if ( p->m_flags & TXF_MATCHED )
					continue;
				// flag it cuz we matched it
				p->m_flags |= TXF_MATCHED;
				// set the stack ptr to it
				spp = p;
				// and stop
				break;
			}

			// no matching front tag at all?
			// then just ignore this back tag
			if ( p < stack ) continue;

			// get section number of the front tag
			//int32_t xn = *(secNumPtr-1);
			int32_t xn = spp->m_secNum;
			// sanity
			if ( xn<0 || xn>=m_numSections ) {g_process.shutdownAbort(true);}
			// get it
			Section *sn = &m_sections[xn];

			// record the word range of the secion we complete
			sn->m_b = i+1;

			// do not include the <li> tag as part of it
			// otherwise we end up with overlapping section since
			// this tag ALSO starts a section!!
			if ( gotBackTag == 2 ) sn->m_b = i;

			// if our parent got closed before "sn" closed because
			// it hit its back tag before we hit ours, then we
			// must cut ourselves short and try to match this
			// back tag to another front tag on the stack
			Section *ps = sn->m_parent;
			for ( ; ps != rootSection ; ps = ps->m_parent ) {
				// skip if parent no longer contains us!
				if ( ps->m_b <= sn->m_a ) continue;
				// skip if this parent is still open
				if ( ps->m_b <= 0 ) continue;
				// parent must have closed before us
				if ( ps->m_b > sn->m_b ) {g_process.shutdownAbort(true);}

				// cut our end shorter
				sn->m_b = ps->m_b;
				// our TXF_MATCHED bit should still be set
				// for spp->m_flags, so try to match ANOTHER
				// front tag with this back tag now
				if ( ! ( spp->m_flags & TXF_MATCHED ) ) {
					g_process.shutdownAbort(true); }
				// ok, try to match this back tag with another
				// front tag on the stack, because the front
				// tag we had selected got cut short because
				// its parent forced it to cut short.
				goto subloop;
			}
   
			// sanity check
			if ( sn->m_b <= sn->m_a ) { g_process.shutdownAbort(true);}

			// revert it to this guy, may not equal stackPtr-1 !!
			stackPtr = spp;
			
			// get parent section
			if ( stackPtr > stack ) {
				// get parent section now
				xn = (stackPtr-1)->m_secNum;
				// set current to that
				current = &m_sections[xn];
			}
			else {
				// i guess this is bad html!
				current = rootSection;
			}
			
			// debug log
			if ( g_conf.m_logDebugSections ) {
				const char *ms = "";
				if ( stackPtr->m_tid != ptid) ms =" UNMATCHED";
				const char *back ="";
				if ( fullPopTid & BACKBIT ) back = "/";
				logf(LOG_DEBUG,"section: pop tid=%" PRId32" "
				     "i=%" PRId32" "
				     "level=%" PRId32" "
				     "%s%s "
				     //"h=0x%" PRIx32
				     "%s",(int32_t)tid,
				     i,
				     (int32_t)(stackPtr - stack),
				     back,g_nodes[tid].m_nodeName,
				     //h,
				     ms);
			}
			
			// . if we were a back tag, we are done... but if we
			//   were a front tag, we must add ourselves below...
			// . MDW: this seems more logical than the if-statement
			//        below...
			if ( fullTid != tid ) continue;
		}

		if ( tid == TAG_CENTER ) continue;

		if ( tid == TAG_SPAN ) continue;
		// gwair.org has font tags the pair up a date "1st Sundays"
		// with the address above it, and it shouldn't do that!
		if ( tid == TAG_FONT ) continue;

		// try to realloc i guess. should keep ptrs in tact.
		if ( m_numSections >= m_maxNumSections) {
			g_errno = EDOCBADSECTIONS;
			return true;
		}

		// get the section
		Section *sn = &m_sections[m_numSections];

		// clear
		memset ( sn , 0 , sizeof(Section) );

		// inc it
		m_numSections++;

		// sanity check - breach check
		if ( m_numSections > max ) { g_process.shutdownAbort(true); }
		
		// set our parent
		sn->m_parent = current;

		// set this
		current = sn;
		
		// need to keep a word range that the section covers
		sn->m_a = i;

		// assume no terminating bookend
		sn->m_b = -1;

		// push a unique id on the stack so we can pop if we
		// enter a subsection
		stackPtr->m_tid         = tid;
		stackPtr->m_secNum      = m_numSections - 1;
		stackPtr->m_flags       = 0;
		stackPtr++;

		// debug log
		if ( ! g_conf.m_logDebugSections ) continue;

		logf(LOG_DEBUG,"section: push tid=%" PRId32" "
		     "i=%" PRId32" "
		     "level=%" PRId32" "
		     "%s "
		     ,
		     (int32_t)tid,
		     i,
		     (int32_t)(stackPtr - stack)-1,
		     g_nodes[(int32_t)tid].m_nodeName
		     );
	}

	// if first word in a section false outside of the parent section
	// then reparent to the grandparent. this can happen when we end
	// up closing a parent section before ???????
	for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
		// get it
		Section *si = &m_sections[i];
		// skip if we are still open-ended
		if ( si->m_b < 0 ) continue;
		// get parent
		Section *sp = si->m_parent;
		// skip if no parent
		if ( ! sp ) continue;
		// skip if parent still open ended
		if ( sp->m_b < 0 ) continue;
		// subloop it
	doagain:
		// skip if no parent
		if ( ! sp ) continue;
		// parent must start before us
		if ( sp->m_a > si->m_a ) { g_process.shutdownAbort(true); }
		// . does parent contain our first word?
		// . it need not fully contain our last word!!!
		if ( sp->m_a <= si->m_a && sp->m_b > si->m_a ) continue;
		// if parent is open ended, then it is ok for now
		if ( sp->m_a <= si->m_a && sp->m_b == -1 ) continue;
		// get grandparent
		sp = sp->m_parent;
		// set
		si->m_parent = sp;
		// try again
		goto doagain;
	}

	bool inGbFrame = false;
	int32_t gbFrameNum = 0;

	bool inIFrame = false;

	//
	// . set Section::m_xmlNameHash for xml tags here
	// . set Section::m_frameNum and SEC_IN_GBFRAME bit
	//
	for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
		// get it
		Section *sn = &m_sections[i];

		// get it
		int32_t ws = sn->m_a;
		const auto &token = (*m_tr)[ws];
		// shortcut
		nodeid_t tid = token.nodeid;

		if (tid == TAG_IFRAME) {
			//if the section doesn't have the closing iframe tag then set inIFrame
			bool hasClosingIframeTag = false;
			for(int j=sn->m_b-1; j>i; j--) {
				if((*m_tr)[j].nodeid == (TAG_IFRAME|BACKBIT)) {
					hasClosingIframeTag = true;
					break;
				}
			}
			if(!hasClosingIframeTag)
				inIFrame = true;
			else if(!inGbFrame)
				sn->m_flags |= SEC_IN_IFRAME;
		} else if (tid == (TAG_IFRAME | BACKBIT)) { //never happens how sentences are currently split
			inIFrame = false;
		} else if ( tid == TAG_GBFRAME ) {
			// start or end?
			gbFrameNum++;
			inGbFrame = true;
		} else if ( tid == (TAG_GBFRAME | BACKBIT) ) {
			inGbFrame = false;
		}

		if (inIFrame && !inGbFrame)
			sn->m_flags |= SEC_IN_IFRAME;

		// mark it
		if (inGbFrame)
			sn->m_gbFrameNum = gbFrameNum;

		// custom xml tag, hash the tag itself
		if ( tid != TAG_XMLTAG ) continue;
		// stop at first space to avoid fields!!
		const char *p    =     token.token_start + 1;
		const char *pend = p + token.token_len;
		// skip back tags
		if ( *p == '/' ) continue;
		// reset hash
		int64_t xh = 0;
		// and hash char count
		unsigned char cnt = 0;
		// hash till space or / or >
		for ( ; p < pend ; p++ ) {
			// stop on space or / or >
			if ( is_wspace_a(*p) ) break;
			if ( *p == '/' ) break;
			if ( *p == '>' ) break;
			// hash it in
			xh ^= g_hashtab[cnt++][(unsigned char )*p];
		}
		// if it is a string of the same chars it can be 0
		if ( ! xh ) xh = 1;
		// store that
		sn->m_xmlNameHash = (int32_t)xh;
	}
	
	//TODO: implement section m_flags inheritance correctly. Currently SEC_IN_IFRAME/SEC_HIDDEN/... are not inherited by  child sections.

	// find any open ended tags and constrain them based on their parent
	for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
		// get it
		Section *si = &m_sections[i];
		// get its parent
		Section *ps = si->m_parent;
		// if parent is open-ended panic!
		if ( ps && ps->m_b < 0 ) { g_process.shutdownAbort(true); }

		// if our parent got constrained from under us, we need
		// to telescope to a new parent
		for  ( ; ps && ps->m_b >= 0 && ps->m_b <= si->m_a ; ) {
			ps = ps->m_parent;
			si->m_parent = ps;
		}

		// assume end is end of doc
		int32_t end = m_tr->size();
		// get end of parent
		if ( ps ) end = ps->m_b;

		// shrink our section if parent ends before us OR if we
		// are open ended
		if ( si->m_b != -1 && si->m_b <= end ) continue;
		// this might constrain someone's parent such that
		// that someone no longer can use that parent!!
		si->m_b = end;
		// . get our tag type
		// . use int32_t instead of nodeid_t so we can re-set this
		//   to the xml tag hash if we need to
		int32_t tid1 = (*m_tr)[si->m_a].nodeid;
		// use the tag hash if this is an xml tag
		if ( tid1 == TAG_XMLTAG ) {
			// we computed this above
			tid1 = si->m_xmlNameHash;
			// skip if zero!
			if ( ! tid1 ) continue;
		}
		// must be there to be open ended
		if ( ! tid1 ) { g_process.shutdownAbort(true); }
		// NOW, see if within that parent there is actually another
		// tag after us of our same tag type, then use that to
		// constrain us instead!!
		// this hurts <p><table><tr><td><p>.... because it
		// uses that 2nd <p> tag to constrain si->m_b of the first
		// <p> tag which is not right! sunsetpromotions.com has that.
		for ( int32_t j = i + 1 ; j < m_numSections ; j++ ) {
			// get it
			Section *sj = &m_sections[j];
			// get word start
			int32_t a = sj->m_a;
			// skip if ties with us already
			if ( a == si->m_a ) continue;
			// stop if out
			if ( a >= end ) break;

			// . it must be in the same expanded frame src, if any
			// . this fixes trulia.com which was ending our html
			//   tag, which was open-ended, with the html tag in
			//   a frame src expansion
			if ( sj->m_gbFrameNum != si->m_gbFrameNum ) continue;
			// fix sunsetpromotions.com bug. see above.
			if ( sj->m_parent != si->m_parent ) continue;
			// get its tid
			int32_t tid2 = (*m_tr)[a].nodeid;
			// use base hash if xml tag
			if ( tid2 == TAG_XMLTAG )
				tid2 = sj->m_xmlNameHash;
			// must be our tag type!
			if ( tid2 != tid1 ) continue;
			// ok end us there instead!
			si->m_b = a;
			// stop
			break;
		}
	}


	// reparent again now that things are closed
	for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
		// get it
		Section *si = &m_sections[i];
		// skip if we are still open-ended
		if ( si->m_b < 0 ) { g_process.shutdownAbort(true); }
		// get parent
		Section *sp = si->m_parent;
		// skip if null
		if ( ! sp ) continue;
		// skip if parent still open ended
		if ( sp->m_b < 0 ) { g_process.shutdownAbort(true); }
		// subloop it
	doagain2:
		// skip if no parent
		if ( ! sp ) continue;
		// . does parent contain our first word?
		// . it need not fully contain our last word!!!
		if ( sp->m_a <= si->m_a && sp->m_b > si->m_a ) continue;
		// if parent is open ended, then it is ok for now
		if ( sp->m_a <= si->m_a && sp->m_b == -1 ) continue;
		// if parent is open ended, then it is ok for now
		if ( sp->m_b == -1 ) { g_process.shutdownAbort(true); }
		// get grandparent
		sp = sp->m_parent;
		// set
		si->m_parent = sp;
		// try again
		goto doagain2;
	}

	//
	//
	// now assign m_sectionPtrs[] which map a word to the first
	// section that contains it
	//
	//
	Section *dstack[MAXTAGSTACK];
	int32_t     ns = 0;
	int32_t      j = 0;
	current       = m_rootSection;//&m_sections[0];
	Section *next = m_rootSection;//&m_sections[0];
	// first print the html lines out
	for ( int32_t i = 0 ; i < m_nw ; i++ ) {
		// pop all off the stack that match us
		for ( ; ns>0 && dstack[ns-1]->m_b == i ; ) {
			ns--;
			current = dstack[ns-1];
		}
		// push our current section onto the stack if i equals
		// its first word #
		for ( ; next && i == next->m_a ; ) {
			dstack[ns++] = next;
			// set our current section to this now
			current = next;
			// get next section for setting "next"
			j++;
			// if no more left, set "next" to NULL and stop loop
			if ( j >= m_numSections ) { next=NULL; break; }
			// grab it
			next = &m_sections[j];
		}
		// assign
		m_sectionPtrs[i] = current;
	}

	// . addImpliedSections() requires Section::m_baseHash
	// . set Section::m_baseHash
	for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
		// these have to be in order of sn->m_a to work right
		// because we rely on the parent tag hash, which would not
		// necessarily be set if we were not sorted, because the
		// parent section could have SEC_FAKE flag set because it is
		// a br section added afterwards.
		Section *sn = &m_sections[i];
		// get word start into "ws"
		int32_t ws = sn->m_a;
		const auto &token = (*m_tr)[ws];
		// shortcut
		nodeid_t tid = token.nodeid;
		// sanity check, <a> guys are not sections
		//if ( tid == TAG_A &&
		//     !(sn->m_flags & SEC_SENTENCE) ) { g_process.shutdownAbort(true); }
		// use a modified tid as the tag hash?
		int64_t mtid = tid;
		// custom xml tag, hash the tag itself
		if ( tid == TAG_XMLTAG )
			mtid = hash32 ( token.token_start,token.token_len );
		// an unknown tag like <!! ...->
		if ( tid == 0 )
			mtid = 1;
		// . if we are a div tag, mod it
		// . treat the fields in the div tag as 
		//   part of the tag hash. 
		// . helps Events.cpp be more precise about
		//   section identification!!!!
		// . we now do this for TD and TR so Nov 2009 can telescope for
		//   http://10.5.1.203:8000/test/doc.17096238520293298312.html
		//   so the calendar title "Nov 2009" can affect all dates
		//   below the calendar.
		if ( tid == TAG_DIV  || 
		     tid == TAG_TD   || 
		     tid == TAG_TR   ||
		     tid == TAG_LI   || // newmexico.org urls class=xxx
		     tid == TAG_UL   || // newmexico.org urls class=xxx
		     tid == TAG_P    || // <p class="pstrg"> stjohnscollege.edu
		     tid == TAG_SPAN ) {
			// get ptr
		        const char *p = token.token_start;
			// skip <
			p++;
			// skip following alnums, that is the tag name
			for ( ; is_alnum_a(*p) ; p++ );
			// scan for "id" or "class" in it
			// . i had to increase this because we were missing
			//   some stuff causing us to get the wrong implied
			//   sections for 
			//   www.guysndollsllc.com/page5/page4/page4.html
			//   causing "The Remains" to be paired up with
			//   "Aug 7, 2010" in an implied section which was
			//   just wrong. it was 20, i made it 100...
			const char *pend = p + 100;
			// position ptr
			unsigned char cnt = 0;
			// a flag
			bool skipTillSpace = false;
			// . just hash every freakin char i guess
			// . TODO: maybe don't hash "width" for <td><tr>
			for ( ; *p && *p !='>' && p < pend ; p++ ) {
				// skip bgcolor= tags because panjea.org
				// interlaces different colored <tr>s in the
				// table and i want them to be seen as brother
				// sections, mostly for the benefit of the
				// setting of lastBrother1/2 in Events.cpp
				if ( is_wspace_a(p[0])      &&
				     to_lower_a (p[1])=='b' &&
				     to_lower_a (p[2])=='g' ) {
					skipTillSpace = true;
					continue;
				}

				// if not a space continue
				if ( skipTillSpace ) {
					if ( ! is_wspace_a(*p) ) continue;
					skipTillSpace = false;
				}
				// do not hash until we get a space
				if ( skipTillSpace ) continue;
				// skip if not alnum
				if ( !is_alnum_a(*p)) continue;
				// hash it in
				mtid ^= g_hashtab[cnt++][(unsigned char)*p];
			}
		}
		// should not have either of these yet!
		if ( sn->m_flags & SEC_FAKE     ) { g_process.shutdownAbort(true); }
		if ( sn->m_flags & SEC_SENTENCE ) { g_process.shutdownAbort(true); }
		// sanity check
		if ( mtid == 0 ) { g_process.shutdownAbort(true); }
		// . set the base hash, usually just tid
		// . usually base hash is zero but if it is a br tag
		//   we set it to something special to indicate the number
		//   of br tags in the sequence
		sn->m_baseHash ^= mtid;
		// fix this
		if ( sn == rootSection ) sn->m_baseHash = 1;
		// fix root section i guess
		if ( sn->m_baseHash == 0 ) { 
			// fix core on gk21
			sn->m_baseHash = 2;
		}
		// set this now too WHY? should already be set!!! was
		// causing the root section to become a title section
		// because first word was "<title>". then every word in
		// the doc got SEC_IN_TITLE set and did not get hashed
		// in XmlDoc::hashBody()... NOR in XmlDoc::hashTitle()!!!
		if ( sn != rootSection )
			sn->m_tagId = tid;
	}

	// set up our linked list, the functions below will insert sections
	// and modify this linked list
	for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
		// set it
		if ( i + 1 < m_numSections )
			m_sections[i].m_next = &m_sections[i+1];
		if ( i - 1 >= 0 )
			m_sections[i].m_prev = &m_sections[i-1];
	}

	// init to -1 to indicate none
	for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
		// reset it
		si->m_firstWordPos = -1;
		si->m_lastWordPos  = -1;
		si->m_senta        = -1;
		si->m_sentb        = -1;
	}
	// now set position of first word each section contains
	for ( int32_t i = 0 ; i < m_nw ; i++ ) {
		// skip if not alnum word
		if ( ! (*m_tr)[i].is_alfanum ) continue;
		// get smallest section containing
		Section *si = m_sectionPtrs[i];
		// do each parent as well
		for ( ; si ; si = si->m_parent ) {
			// skip if already had one!
			if ( si->m_firstWordPos >= 0 ) break;
			// otherwise, we are it
			si->m_firstWordPos = i;
			// . set format hash of it
			// . do it manually since tagHash not set yet
		}
	}
	// and last word position
	for ( int32_t i = m_nw - 1 ; i > 0 ; i-- ) {
		// skip if not alnum word
		if ( ! (*m_tr)[i].is_alfanum ) continue;
		// get smallest section containing
		Section *si = m_sectionPtrs[i];
		// do each parent as well
		for ( ; si ; si = si->m_parent ) {
			// skip if already had one!
			if ( si->m_lastWordPos >= 0 ) break;
			// otherwise, we are it
			si->m_lastWordPos = i;
		}
	}

	sec_t inFlag = 0;
	int32_t  istack[1000];
	sec_t iflags[1000];
	int32_t  ni = 0;

	// 
	// now set the inFlags here because the tags might not have all
	// been closed, making tags like SEC_STYLE overflow from where
	// they should be...
	//
	for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
		// did we exceed a tag boundary?
		for ( ; ni>0 && si->m_a >= istack[ni-1] ; ) {
			// undo flag
			inFlag &= ~iflags[ni-1];
			// pop off
			ni--;
		}

		// get the flag if any into mf
		sec_t mf = 0;

		// skip if not special tag id
		nodeid_t tid = si->m_tagId;
		if      ( tid == TAG_SCRIPT  ) mf = SEC_SCRIPT;
		else if ( tid == TAG_NOSCRIPT) mf = SEC_NOSCRIPT;
		else if ( tid == TAG_STYLE   ) mf = SEC_STYLE;
		else if ( tid == TAG_SELECT  ) mf = SEC_SELECT;
		else if ( tid == TAG_H1      ) mf = SEC_IN_HEADER;
		else if ( tid == TAG_H2      ) mf = SEC_IN_HEADER;
		else if ( tid == TAG_H3      ) mf = SEC_IN_HEADER;
		else if ( tid == TAG_H4      ) mf = SEC_IN_HEADER;
		else if ( tid == TAG_TITLE   ) mf = SEC_IN_TITLE;
		else if ( tid == TAG_HEAD    ) mf = SEC_IN_HEAD;

		// accumulate
		inFlag |= mf;

		// add in the flags
		si->m_flags |= inFlag;

		// skip if nothing special
		if ( ! mf ) continue;

		// sanity
		if ( ni >= 1000 ) { g_process.shutdownAbort(true); }

		// otherwise, store on stack
		istack[ni] = si->m_b;
		iflags[ni] = mf;
		ni++;
	}

	// . now we insert sentence sections
	// . find the smallest section containing the first and last
	//   word of each sentence and inserts a subsection into that
	// . we have to be careful to reparent, etc.
	// . kinda copy splitSections() function
	// . maybe add an "insertSection()" function???
	if ( m_contentType != CT_JS ) {
		// add sentence sections
		if ( ! addSentenceSections() ) return true;
		// this is needed by setSentFlags()
		setNextSentPtrs();
	}

	// . set m_nextBrother
	// . we call this now to aid in setHeadingBit() and for adding the
	//   implied sections, but it is ultimately
	//   called a second time once all the new sections are inserted
	setNextBrotherPtrs ( false );

	// . set SEC_HEADING bit
	// . need this before implied sections
	setHeadingBit ();

	setTagHashes();

	//
	//
	// TODO TODO
	//
	// TAKE OUT THESE SANITY CHECKS TO SPEED UP!!!!!!
	//
	//

	// clear this
	bool isHidden  = false;
	int32_t startHide = 0x7fffffff;
	int32_t endHide   = 0 ;
	// now that we have closed any open tag, set the SEC_HIDDEN bit
	// for all sections that are like <div style=display:none>
	for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) {
		// set m_lastSection so we can scan backwards
		m_lastSection = sn;

		// set this
		int32_t wn = sn->m_a;
		// stop hiding it?
		if ( isHidden ) {
			// turn it off if not contained
			if      ( wn >= endHide   ) isHidden = false;
			else    sn->m_flags |= SEC_HIDDEN;
		}
		// get tag id
		nodeid_t tid = sn->m_tagId;
		// is div, td or tr tag start?
		if ( tid!=TAG_DIV && 
		     tid!=TAG_TD && 
		     tid!=TAG_TR &&
		     tid!=TAG_UL &&
		     tid!=TAG_SPAN) continue;

		// . if we are a div tag, mod it
		// . treat the fields in the div tag as 
		//   part of the tag hash. 
		// . helps Events.cpp be more precise about
		//   section identification!!!!
		// . we now do this for TD and TR so Nov 2009 can telescope for
		//   http://10.5.1.203:8000/test/doc.17096238520293298312.html
		//   so the calendar title "Nov 2009" can affect all dates
		//   below the calendar.

		// get the style tag in there and check it for "display: none"!
		int32_t  slen = (*m_tr)[wn].token_len;
		const char *s    = (*m_tr)[wn].token_start;
		const char *send = s + slen;

		// check out any div tag that has a style
		const char *style = gb_strncasestr(s,slen,"style=") ;
		if ( ! style ) continue;

		// . check for hidden
		// . if no hidden tag assume it is UNhidden
		// . TODO: later push & pop on stack
		const char *ds = gb_strncasestr(style,send-style,"display:");
		// if display:none not found turn off SEC_HIDDEN
		if ( ! ds || ! gb_strncasestr(s,slen,"none") ) {
			// turn off the hiding
			isHidden = false;
			// off in us too
			sn->m_flags &= ~SEC_HIDDEN;
			continue;
		}
		// mark all sections in this with the tag
		isHidden = true;
		// on in us
		sn->m_flags |= SEC_HIDDEN;
		// stop it after this word for sure
		if ( sn->m_b   > endHide   ) endHide   = sn->m_b;
		if ( sn->m_a < startHide ) startHide = sn->m_a;
	}

	// now set the content hash of each section
	for ( int32_t i = 0 ; i < m_nw ; i++ ) {
		// must be an alnum word
		if ( ! (*m_tr)[i].is_alfanum ) continue;
		// get its section
		m_sectionPtrs[i]->m_contentHash64 ^= (*m_tr)[i].token_hash;
		// fix "smooth smooth!"
		if ( m_sectionPtrs[i]->m_contentHash64 == 0 )
			m_sectionPtrs[i]->m_contentHash64 = 123456;

	}

	// now set SEC_NOTEXT flag if content hash is zero!
	for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
		// get it
		Section *sn = &m_sections[i];
		// skip if had text
		if ( sn->m_contentHash64 ) continue;
		// no text!
		sn->m_flags |= SEC_NOTEXT;
	}

	//
	// set Section::m_alnumPosA/m_alnumPosB
	//
	int32_t alnumCount2 = 0;
	for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
		// get it
		Section *sn = &m_sections[i];
		// skip if had text
		if ( ! ( sn->m_flags & SEC_SENTENCE ) ) continue;
		// save this
		sn->m_alnumPosA = alnumCount2;
		// scan the wids of the whole sentence, which may not
		// be completely contained in the "sn" section!!
		int32_t a = sn->m_senta;
		int32_t b = sn->m_sentb;
		for ( int32_t j = a ; j < b ; j++ ) {
			// must be an alnum word
			if ( ! (*m_tr)[j].is_alfanum ) continue;
			// alnumcount
			alnumCount2++;
		}
		// so we contain the range [a,b), typical half-open interval
		sn->m_alnumPosB = alnumCount2;
		// sanity check
		if ( sn->m_alnumPosA == sn->m_alnumPosB ){g_process.shutdownAbort(true);}

		// propagate through parents
		Section *si = sn->m_parent;
		// do each parent as well
		for ( ; si ; si = si->m_parent ) {
			// skip if already had one!
			if ( si->m_alnumPosA > 0 ) break;
			// otherwise, we are it
			si->m_alnumPosA = sn->m_alnumPosA;
		}

	}
	// propagate up alnumPosB now
	for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
		// get it
		Section *sn = &m_sections[i];
		// skip if had text
		if ( ! ( sn->m_flags & SEC_SENTENCE ) ) continue;
		// propagate through parents
		Section *si = sn->m_parent;
		// do each parent as well
		for ( ; si ; si = si->m_parent ) {
			// skip if already had one! no, because we need to
			// get the MAX of all of our kids!!
			//if ( si->m_alnumPosB > 0 ) break;
			// otherwise, we are it
			si->m_alnumPosB = sn->m_alnumPosB;
		}
	}

	///////////////////////////////////////
	//
	// now set Section::m_listContainer
	//
	// . a containing section is a section containing
	//   MULTIPLE smaller sections
	// . so if a section has a containing section set its m_listContainer
	//   to that containing section
	// . we limit this to sections that directly contain text for now
	// . Events.cpp::getRegistrationTable() uses m_nextBrother so we
	//   need this now!!
	//
	///////////////////////////////////////
	setNextBrotherPtrs ( true );

	///////////////////////////////////////
	//
	// now set SEC_MENU and SEC_LINK_TEXT flags
	//
	///////////////////////////////////////
	setMenus();

	//verifySections();

	return true;
}

// . PROBLEM: because we ignore non-breaking tags we often get sections
//   that are really not sentences, but we are forced into them because
//   we cannot split span or bold tags
//   i.e. "<div>This is <b>a sentence. And this</b> is a sentence.</div>"
//   forces us to treat the entire div tag as a sentence section.
// . i did add some logic to ignore those (the two for-k loops below) but then
//   Address.cpp cores because it expects every alnum word to be in a sentence
// . now make sure to shrink into our current parent if we would not lose
//   alnum chars!! fixes sentence flip flopping
// . returns false and sets g_errno on error
bool Sections::addSentenceSections ( ) {
	sec_t badFlags = SEC_STYLE | SEC_SCRIPT | SEC_SELECT | SEC_HIDDEN | SEC_NOSCRIPT;

	// shortcut
	Section **sp = m_sectionPtrs;

	static bool s_init = false;
	static int64_t h_in;
	static int64_t h_at;
	static int64_t h_for;
	static int64_t h_to;
	static int64_t h_on;
	static int64_t h_under;
	static int64_t h_with;
	static int64_t h_along;
	static int64_t h_from;
	static int64_t h_by;
	static int64_t h_of;
	static int64_t h_some;
	static int64_t h_the;
	static int64_t h_and;
	static int64_t h_a;
	static int64_t h_http;
	static int64_t h_https;
	static int64_t h_room;
	static int64_t h_rm;
	static int64_t h_bldg;
	static int64_t h_building;
	static int64_t h_suite;
	static int64_t h_ste;
	static int64_t h_tags;
	if ( ! s_init ) {
		s_init = true;
		h_tags = hash64n("tags");
		h_in = hash64n("in");
		h_the = hash64n("the");
		h_and = hash64n("and");
		h_a = hash64n("a");
		h_at = hash64n("at");
		h_for = hash64n("for");
		h_to = hash64n("to");
		h_on = hash64n("on");
		h_under = hash64n("under");
		h_with = hash64n("with");
		h_along = hash64n("along");
		h_from = hash64n("from");
		h_by = hash64n("by");
		h_of = hash64n("of");
		h_some = hash64n("some");
		h_http = hash64n("http");
		h_https = hash64n("https");
		h_room = hash64n("room");
		h_rm = hash64n("rm");
		h_bldg = hash64n("bldg");
		h_building = hash64n("building");
		h_suite = hash64n("suite");
		h_ste = hash64n("ste");
	}

	// need D_IS_IN_URL bits to be valid
	m_bits->setInUrlBits ( );

	// is the abbr. a noun? like "appt."
	bool hasWordAfter = false;

	for ( int32_t i = 0 ; i < m_nw ; i++ ) {
		// need a wid
		if ( ! (*m_tr)[i].is_alfanum ) continue;
		// get section we are currently in
		Section *cs = m_sectionPtrs[i];
		// skip if its bad! i.e. style or script or whatever
		if ( cs->m_flags & badFlags ) continue;
		// set that
		int64_t prevWid = (*m_tr)[i].token_hash;
		int64_t prevPrevWid = 0LL;
		// flag
		int32_t lastWidPos = i;//-1;
		bool lastWasComma = false;
		nodeid_t includedTag = -2;
		int32_t lastbr = -1;
		bool endOnBr = false;
		bool endOnBold = false;
		bool capped = true;
		int32_t upper = 0;
		int32_t numAlnums = 0;
		// scan for sentence end
		int32_t j;
		for ( j = i ; j < m_nw ; j++ ) {
			const auto &token2 = (*m_tr)[j];
			// skip words
			if ( token2.is_alfanum ) {
				// prev prev
				prevPrevWid = prevWid;
				// assume not a word like "vs."
				hasWordAfter = false;
				// set prev
				prevWid = token2.token_hash;
				lastWidPos = j;
				lastWasComma = false;
				endOnBr = false;
				endOnBold = false;
				numAlnums++;
				// skip if stop word and need not be
				// capitalized
				if ( m_bits->queryBits(j) & D_IS_STOPWORD ) continue;
				if ( token2.token_len <= 1 ) continue;
				if ( is_digit(token2.token_start[0]) ) continue;
				if ( !is_upper_utf8(token2.token_start)) capped=false;
				else                           upper++;
				continue;
			}
			// tag?
			if ( token2.nodeid ) {
				// shortcut
				nodeid_t tid = token2.nodeid & BACKBITCOMP;

				// treat nobr as breaking to fix ceder.net
				// which has it after the group title
				if ( tid == TAG_NOBR ) break;

				if ( tid == TAG_BR ) endOnBr = true;
				if ( tid == TAG_B  ) endOnBold = true;

				// a </b><br> is usually like a header
				if ( capped && upper && endOnBr && endOnBold )
					break;
				// if it is <span style="display:none"> or
				// div or whatever, that is breaking!
				// fixes http://chuckprophet.com/gigs/ 
				if ( (tid == TAG_DIV ||
				      tid == TAG_SPAN ) &&
				    token2.token_len > 14 &&
				     strncasestr(token2.token_start,"display:none",
						 token2.token_len) )
					break;
				// ok, treat span as non-breaking for a second
				if ( tid == TAG_SPAN ) continue;
				// mark this
				if ( tid == TAG_BR ) lastbr = j;
				//
				// certain tags like span and br sometimes
				// do and sometimes do not break a sentence.
				// so by default assume they do, but check
				// for certain indicators...
				//
				if ( tid == TAG_SPAN || 
				     tid == TAG_BR   ||
				     // fixes guysndollsllc.com:
				     // causes core dump:
				     tid == TAG_P    || // villr.com
				     // fixes americantowns.com
				     tid == TAG_DIV     ) {
					// if nothing after, moot point
					if ( j+1 >= m_nw ) break;
					// if we already included this tag
					// then keep including it. but some
					// span tags will break and some won't
					// even when in or around the same
					// sentence. see that local.yahoo.com
					// food delivery services url for
					// the first street address, 
					// 5013 Miramar
					if ( includedTag == tid &&
					     (token2.nodeid & BACKBIT) ) {
						// reset it in case next
						// <span> tag is not connective
						includedTag = -2;
						continue;
					}
					// if we included this tag type
					// as a front tag, then include its
					// back tag in sentence as well.
					// fixes nonamejustfriends.com
					// which has a span tag in sentence:
					// ".. Club holds a <span>FREE</span> 
					//  Cruise Night..." and we allow
					// "<span>" because it follows "a",
					// but we were breaking on </span>!
					if ( !(token2.nodeid&BACKBIT))
						includedTag = tid;
					// if prev punct was comma and not
					// an alnum word
					if ( lastWasComma ) continue;
					// get punct words bookcasing this tag
					if ( ! (*m_tr)[j+1].is_alfanum && 
					     ! (*m_tr)[j+1].nodeid &&
					     has_char((*m_tr)[j+1].token_start,(*m_tr)[j+1].token_end(),',') )
						continue;
					// if prevwid is like "vs." then
					// that means keep going even if
					// we hit one of these tags. fixes
					// "new york knicks vs.<br>orlando
					//  magic"
					if ( hasWordAfter )
						continue;
					// if first alnum word after tag
					// is lower case, that is good too
					int32_t aw = j + 1;
					int32_t maxaw = j + 12;
					if ( maxaw > m_nw ) maxaw = m_nw;
					for ( ; aw < maxaw ; aw++ )
						if ( (*m_tr)[aw].is_alfanum ) break;
					bool isLower = false;
					if ( aw < maxaw &&
					     is_lower_utf8((*m_tr)[aw].token_start) ) 
						isLower = true;

					// http or https is not to be
					// considered as such! fixes
					// webnetdesign.com from getting
					// sentences continued by an http://
					// url below them.
					if ( aw < maxaw &&
					     ((*m_tr)[aw].token_hash == h_http ||
					      (*m_tr)[aw].token_hash == h_https) )
						isLower = false;

					if ( tid == TAG_P &&
					     isLower &&
					     // Oscar G<p>along with xxxx
					     (*m_tr)[aw].token_hash != h_along &&
					     (*m_tr)[aw].token_hash != h_with )
						isLower = false;

					if ( isLower ) continue;
					// get pre word, preopsitional
					// phrase starter?
					if ( prevWid == h_in ||
					     prevWid == h_the ||
					     prevWid == h_and ||
					     // fix for ending on "(Room A)"
					     (prevWid == h_a &&
					      prevPrevWid != h_rm &&
					      prevPrevWid != h_room &&
					      prevPrevWid != h_bldg &&
					      prevPrevWid != h_building &&
					      prevPrevWid != h_suite &&
					      prevPrevWid != h_ste ) ||
					     prevWid == h_for ||
					     prevWid == h_to ||
					     prevWid == h_on ||
					     prevWid == h_under ||
					     prevWid == h_with ||
					     prevWid == h_from ||
					     prevWid == h_by ||
					     prevWid == h_of ||
					     // "some ... Wednesdays"
					     prevWid == h_some ||
					     prevWid == h_at )
						continue;
				}


				// seems like span breaks for meetup.com
				// et al and not for abqtango.com maybe, we
				// need to download the css??? or what???
				// by default span tags do not seem to break
				// the line but ppl maybe configure them to
				if ( tid == TAG_SPAN ) break;
				// if like <font> ignore it
				if ( ! isBreakingTagId(token2.nodeid) ) continue;
				// only break on xml tags if in rss feed to
				// fix <st1:State w:st="on">Arizona</st1>
				// for gwair.org
				if ( tid==TAG_XMLTAG && !m_isRSSExt) continue;
				// otherwise, stop!
				break;
			}
			// skip simple spaces for speed
			if ( token2.token_len == 1 && is_wspace_a(token2.token_start[0]))
				continue;

			// do not allow punctuation that is in a url
			// to be split up or used as a splitter. we want
			// to keep the full url intact.
			if ( j > i && j+1 < m_nw &&
			     (m_bits->queryBits(j-1) & D_IS_IN_URL) &&
			     (m_bits->queryBits(j  ) & D_IS_IN_URL) &&
			     (m_bits->queryBits(j+1) & D_IS_IN_URL) )
				continue;

			// was last punct containing a comma?
			lastWasComma = false;
			// scan the punct chars, stop if we hit a sent breaker
			const char *p    =     token2.token_start;
			const char *pend = p + token2.token_len;
			for ( ; p < pend ; p++ ) {
				// punct word...
				if ( *p == '.' ) break;
				if ( *p == ',' ) lastWasComma =true;
				// allow this too for now... no...
				if ( *p == ';' ) break;
				// now hyphen breaks, mostly for stuff
				// in title tags like dukecityfix.com
				if ( sp[j]->m_tagId == TAG_TITLE &&
				     *p == '-' &&
				     is_wspace_a(p[-1]) &&
				     is_wspace_a(p[+1]) &&
				     lastWidPos >= 0 &&
				     ! m_isRSSExt &&
				     j+1<m_nw &&
				     (*m_tr)[j+1].is_alfanum &&
				     //( ! (bb[lastWidPos] & D_IS_IN_DATE) ||
				     //  ! (bb[j+1] & D_IS_IN_DATE)       ) &&
				     // fix for $10 - $12
				     ( ! is_digit ( (*m_tr)[lastWidPos].token_start[0]) ||
				       ! is_digit ( (*m_tr)[j+1].token_start[0]) ) )
					break;
				// . treat colon like comma now
				// . for unm.edu we have 
				//   "Summer Hours: March 15 - Oct15:
				//    8 am. Mon - Fri, 7:30 am - 10 am Sun.,
				//    Winter Hours: Oct. 15 - March 15:
				//    8 am., seven days a week"
				// . and we don't want "winter hours" being
				//   toplogically closer to the summer hours
				// . that is, the colon is a stronger binder
				//   than the comma?
				// . but for villr.com Hours: May-Aug.. gets
				//   made into two sentences and Hours is
				//   seen as a heading section and causes
				//   addImpliedSections() to be wrong.
				// . why not the colon?
				if ( *p == ':' ) {

					// Tags: music,concert,fun
					if ( prevWid == h_tags &&
					     // just Tags: so far in sentence
					     j == i )
						break;

					// a "::" is used in breadcrumbs,
					// so break on that.
					// fixes "Dining :: Visit :: 
					// Cal Performacnes" title
					if ( p[1] == ':' ) 
						break;

					// if "with" preceeds, allow
					if ( prevWid == h_with ) continue;

					// or prev word was tag! like
					// "blah</b>:..."
					bool tagAfter = (j-1>=0 && (*m_tr)[j-1].nodeid);

					// do not allow if next word is tag
					bool tagBefore = (j+1<m_nw && (*m_tr)[j+1].nodeid);

					// do not allow 
					// "<br>...:<br>" or
					// "<br>...<br>:" or
					// since such things are usually
					// somewhat like headers. isolated
					// lines ending on a colon.
					// should fix st. martin's center
					// for unm.edu "Summer Hours: ..."
					if ( lastbr >= 0 && 
					     ( tagBefore || tagAfter ) ) {
						// end sentence there then
						j = lastbr;
						break;
					}
					     
					if ( tagBefore ) break;
					if ( tagAfter  ) break;

					// for now allow it!
					continue;
				}
				// . special hyphen
				// . breaks up title for peachpundit.com
				//   so we get better event title generation
				//   since peachpundit.com will be a reepat sec
				// . BUT it did not work!
				if ( p[0] == (char)-30 &&
				     p[1] == (char)-128 &&
				     p[2] == (char)-108 )
					break;
				// this for sure
				// "Home > Albuquerque Events > Love Song ..."
				if ( *p == '>' ) break;
				if ( *p == '!' ) break;
				if ( *p == '?' ) break;
				if ( *p == '|' ) 
					break;
				// bullets
				if ( p[0] == (char)226 &&
				     p[1] == (char)128 &&
				     p[2] == (char)162 )
					break;
			redo:
				continue;
			}
			// if none, keep going
			if ( p == pend ) continue;
			// if an alnum char follows the ., it is ok
			// probably a hostname or ip or phone #
			if ( is_alnum_utf8(p+1) &&
			     // "venue:ABQ Sq Dance Center..." for
			     // americantowns.com has no space after the colon!
			     *p !=':' ) 
				goto redo;
			// if abbreviation before we are ok too
			if ( *p == '.' && isAbbr(prevWid,&hasWordAfter) ) {
				// but the period may serve a double purpose
				// to end the abbr and terminate the sentence
				// if the word that follows is capitalized,
				// and if the abbr is a lower-case noun.
				//
				// if abbr is like "vs" then do not end sentenc
				if ( hasWordAfter )
					goto redo;

				// set "next" to next alnum word after us
				int32_t next = j+1;
				int32_t max  = next + 10;
				if ( max > m_nw ) max = m_nw;
				for ( ; next < max ; next++ ) {
					if ( ! (*m_tr)[next].is_alfanum ) continue;
					break;
				}

				// was previous word/abbr capitalized?
				// if so, assume period does not end sentence.
				if ( is_capitalized_utf8((*m_tr)[lastWidPos].token_start) )
					goto redo;
				// if next word is NOT capitalized, assume
				// period does not end sentence...
				if ( next < max &&
				     ! is_capitalized_utf8((*m_tr)[next].token_start) )
					goto redo;
				// otherwise, abbr is NOT capitalized and
				// next word IS capitalized, so assume the
				// period does NOT end the sentence
			}
			// fix "1. library name" for cabq.gov
			if ( *p == '.' && 
			     lastWidPos == i) {
				auto const &t = (*m_tr)[lastWidPos];
				if(is_ascii_digit_string(t.token_start, t.token_end()))
					goto redo;
			}
			// ok, stop otherwise
			break;
		}

		// do not include tag at end. try to fix sentence flip flop.
		for ( ; j > i ; j-- ) 
			// stop when we just contain the last word
			if ( (*m_tr)[j-1].is_alfanum ) break;

		// make our sentence endpoints now
		int32_t senta = i;
		// make the sentence defined by [senta,sentb) where sentb
		// defines a half-open interval like we do for almost 
		// everything else
		int32_t sentb = j;

		// update i for next iteration
		i = sentb - 1;

		// crap, but now sentences intersect with our tag-based
		// sections because they can now split tags because of websites
		// like aliconference.com and abqtango.com whose sentences
		// do not align with the tag sections. therefore we introduce
		// the SEC_TOP_SPLIT and SEC_BOTTOM_SPLIT to indicate 
		// that the section is a top/bottom piece of a split sentence.
		// if both bits are set we assume SEC_MIDDLE_SPLIT.
		// then we set the Section::m_senta and m_sentb to
		// indicate the whole sentence of which it is a split.
		// but the vast majority of the time m_senta and m_sentb
		// will equal m_firstWordPos and m_lastWordPos respectively.
		// then, any routine that


		// so scan the words in the sentence and as we scan we have
		// to determine the parent section we inserting the sentence
		// into as a child section.
		//Section *parent = NULL;
		int32_t     start  = -1;
		Section *pp;
		int32_t     lastk = 0;
		Section *splitSection = NULL;
		Section *lastGuy = NULL;

		for ( int32_t k = senta ; k <= sentb ; k++ ) {
			// add final piece
			if ( k == sentb ) {
				// stop i no final piece
				if ( start == -1 ) break;
				// otherwise, add it
				goto addit;
			}
			// need a real alnum word
			if ( ! (*m_tr)[k].is_alfanum ) continue;
			// get his parent
			pp = m_sectionPtrs[k];
			// set parent if need to
			//if ( ! parent ) parent = pp;
			// and start sentence if need to
			if ( start == -1 ) start = k;
			// if same as exact section as last guy, save some time
			if ( pp == lastGuy ) pp = NULL;
			// store it
			lastGuy = pp;
			// . i'd say blow up "pp" until its contains "start"
			// . but if before it contains start it breaches
			//   [senta,sentb) then we have to cut things short
			for ( ; pp ; pp = pp->m_parent ) {
				// we now have to split section "pp"
				// when adding the sentence section.
				// once we have such a section we
				// cannot use a different parent...
				if ( pp->m_firstWordPos < start ||
				     pp->m_lastWordPos >= sentb ) {
					// set it
					if ( ! splitSection ) splitSection =pp;
					// WE ARE ONLY ALLOWED TO SPLIT ONE
					// SECTION ONLY...
					if ( pp != splitSection)
						goto addit;
					break;
				}
				// keep telescoping until "parent" contains
				// [senta,k] , and we already know that it
				// contains k because that is what we set it to
				//if ( pp->m_a <= senta ) break;
			}
			// mark it
			if ( (*m_tr)[k].is_alfanum ) lastk = k;
			// ok, keep chugging
			continue;

			// add the final piece if we go to this label
		addit:
			// use this flag
			int32_t bh = BH_SENTENCE;
			// determine parent section, smallest section 
			// containing [start,lastk]
			Section *parent = m_sectionPtrs[start];
			for ( ; parent ; parent = parent->m_parent ) {
				// stop if contains lastk
				if ( parent->m_b > lastk ) break;
			}
			// 
			// for "<span>Albuquerque</span>, New Mexico" 
			// "start" points to "Albuquerque" but needs to 
			// point to the "<span>" so its parent is "parent"
			int32_t adda = start;
			int32_t addb = lastk;
			// need to update "start" to so its parent is the new 
			// "parent" now so insertSubSection() does not core
			for ( ; adda >= 0 ; ) {
				// stop if we finally got the right parent
				if ( m_sectionPtrs[adda]==parent ) break;
				// or if he's a tag and his parent
				// is "parent" we can stop.
				// i.e. STOP on a proper subsection of
				// the section containing the sentence.
				if ( m_sectionPtrs[adda]->m_parent==parent &&
				     m_sectionPtrs[adda]->m_a == adda )
					break;
				// backup
				adda--;
				// check
				if ( adda < 0 ) break;
				// how can this happen?
				if ( (*m_tr)[adda].is_alfanum ) { g_process.shutdownAbort(true); }
			}
			// sanity
			if ( adda < 0 ) { g_process.shutdownAbort(true); }

			// same for right endpoint
			for ( ; addb < m_nw ; ) {
				// stop if we finally got the right parent
				if ( m_sectionPtrs[addb]==parent ) break;
				// get it
				Section *sp = m_sectionPtrs[addb];
				// come back up here in the case of a section
				// sharing its Section::m_b with its parent
			subloop:
				// or if he's a tag and his parent
				// is "parent" we can stop
				if ( sp->m_parent==parent &&
				     sp->m_b == addb+1 )
					break;
				// or if we ran into a brother section
				// that does not contain the sentence...
				// fix core dump for webnetdesign.com whose
				// sentence consisted of 3 sections from
				// A=7079 to B=7198. but now i am getting rid
				// of allowing a lower case http(s):// on
				// a separate line to indicate that the
				// sentence continues... so we will not have
				// this sentence anymore in case  you are
				// wondering why it is not there any more.
				if ( sp->m_parent==parent &&
				     sp->m_a == addb ) {
					// do not include that brother's tag
					addb--;
					break;
				}

				// when we have bad tag formations like for
				// http://gocitykids.parentsconnect.com/catego
				// ry/buffalo-ny-usa/places-to-go/tourist-stops
				// like <a><b>...</div> with no ending </a> or
				// </b> tags then we have to get the parent
				// of the parent as long as its m_b is the
				// same and check that before advancing addb
				// otherwise we can miss the parent section
				// that we want! (this is because the kid
				// sections share the same m_b as their 
				// parent because of they have no ending tag)
				if ( sp->m_parent &&
				     sp->m_parent->m_b == sp->m_b ) {
					sp = sp->m_parent;
					goto subloop;
				}

				// advance
				addb++;
				// stop if addb 
				if ( addb >= m_nw ) break;
				// how can this happen?
				if ( (*m_tr)[addb].is_alfanum ) { g_process.shutdownAbort(true); }
			}
			// sanity
			if ( addb >= m_nw ) { g_process.shutdownAbort(true); }

			// ok, now add the split sentence
			Section *is =insertSubSection(adda,addb+1,bh);
			// panic?
			if ( ! is )
				break;
			// set sentence flag on it
			is->m_flags |= SEC_SENTENCE;
			// . set this
			// . sentence is from [senta,sentb)
			is->m_senta = senta;//start;
			is->m_sentb = sentb;//k;
			// stop if that was it
			if ( k == sentb ) break;
			// go on to next fragment then
			start = -1;
			parent = NULL;
			splitSection = NULL;
			lastGuy = NULL;
			// redo this same k
			k--;
		}
	}

	int32_t     inSentTil = 0;
	Section *lastSent = NULL;
	// get the section of each word. if not a sentence section then
	// make its m_sentenceSection point to its parent that is a sentence
	for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) {
		// need sentence
		if ( ( sk->m_flags & SEC_SENTENCE ) ) {
			inSentTil = sk->m_b;
			lastSent  = sk;
			sk->m_sentenceSection = sk;
			continue;
		}
		// skip if outside of the last sentence we had
		if ( sk->m_a >= inSentTil ) continue;
		// we are in that sentence
		sk->m_sentenceSection = lastSent;
	}

	return true;
}

Section *Sections::insertSubSection ( int32_t a, int32_t b, int32_t newBaseHash ) {
	// try to realloc i guess. should keep ptrs in tact.
	if ( m_numSections >= m_maxNumSections ) {
		g_errno = EDOCBADSECTIONS;
		return NULL;
	}

	//
	// make a new section
	//
	Section *sk = &m_sections[m_numSections];
	// clear
	memset ( sk , 0 , sizeof(Section) );
	// inc it
	m_numSections++;
	// now set it
	sk->m_a   = a;
	sk->m_b   = b;

	// don't mess this up!
	if ( m_lastSection && a > m_lastSection->m_a )
		m_lastSection = sk;

	// the base hash (delimeter hash) hack
	sk->m_baseHash = 0;// dh; ????????????????????

	// get first section containing word #a
	Section *si = m_sectionPtrs[a];

	for ( ; si ; si = si->m_prev ) {
		// we become his child if this is true
		if ( si->m_a < a ) {
			break;
		}

		// if he is bigger (or equal) we become his child
		// and are after him
		if ( si->m_a == a && si->m_b >= b ) {
			break;
		}
	}

	// . try using section before us if it is contained by "si"
	// . like in the case when word #a belongs to the root section
	//   and there are thousands of child sections of the root before "a"
	//   we really want to get the child section of the root before us
	//   as the prev section, "si", otherwise the 2nd for loop below here
	//   will hafta loop through thousands of sibling sections
	// . this will fail if word before a is part of our same section
	// . what if we ignored this for now and set m_sectionPtrs[a] to point
	//   to the newly inserted section, then when done adding sentence
	//   sections we scanned all the words, keeping track of the last
	//   html section we entered and used that to insert the sentence sections
	if ( m_lastAdded && si && m_lastAdded->m_a > si->m_a && m_lastAdded->m_a < a ) {
		si = m_lastAdded;
	}

	// crap we may have 
	// "<p> <strong>hey there!</strong> this is another sentence.</p>"
	// then "si" will be pointing at the "<p>" section, and we will
	// not get the "<strong>" section as the "prev" to sk, which we should!
	// that is where sk is the "this is another sentence." sentence
	// section. so to fix that try iterating over si->m_next to get si to
	// be closer to sk.
	for ( ; si ; si = si->m_next ) {
		// stop if no more eavailable
		if ( ! si->m_next ) break;
		// stop if would break
		if ( si->m_next->m_a > a ) break;
		// if it gets closer to us without exceeding us, use it
		if ( si->m_next->m_a < a ) continue;
		// if tied, check b. if it contains us, go to it
		if ( si->m_next->m_b >= b ) continue;
		// otherwise, stop
		break;
	}

	// set this
	m_lastAdded = si;

	// a br tag can split the very first base html tag like for
	// mapsandatlases.org we have
	// "<html>...</html> <br> ...." so the br tag splits the first
	// section!
	// SO we need to check for NULL si's!
	if ( ! si ) {
		// skip this until we figure it out
		m_numSections--;
		g_process.shutdownAbort(true);
		return NULL;
	} else {
		// insert us into the linked list of sections
		if ( si->m_next ) si->m_next->m_prev = sk;
		sk->m_next   = si->m_next;
		sk->m_prev   = si;
		si->m_next   = sk;
	}

	// now set the parent
	Section *parent = m_sectionPtrs[a];
	// expand until it encompasses both a and b
	for ( ; ; parent = parent->m_parent ) {
		if ( parent->m_a > a ) continue;
		if ( parent->m_b < b ) continue;
		break;
	}
	// now we assign the parent to you
	sk->m_parent    = parent;
	// sometimes an implied section is a subsection of a sentence!
	// like when there are a lot of brbr (double br) tags in it...
	sk->m_sentenceSection = parent->m_sentenceSection;
	// take out certain flags from parent
	sec_t flags = parent->m_flags;
	flags &= ~SEC_SENTENCE;

	// add in fake
	flags |= SEC_FAKE;

	// flag it as a fake section
	sk->m_flags = flags ;

	// need this
	sk->m_baseHash = newBaseHash;

	// reset these
	sk->m_firstWordPos = -1;
	sk->m_lastWordPos  = -1;
	sk->m_alnumPosA    = -1;
	sk->m_alnumPosB    = -1;
	sk->m_senta        = -1;
	sk->m_sentb        = -1;

	// set sk->m_firstWordPos
	for ( int32_t i = a ; i < b ; i++ ) {
		// and first/last word pos
		if ( ! (*m_tr)[i].is_alfanum ) continue;
		// mark this
		sk->m_firstWordPos = i;
		break;
	}

	// set sk->m_lastWordPos
	for ( int32_t i = b-1 ; i >= a ; i-- ) {
		// and first/last word pos
		if ( ! (*m_tr)[i].is_alfanum ) continue;
		// mark this
		sk->m_lastWordPos = i;
		break;
	}


	//
	// to speed up scan the words in our inserted section, usually
	// a sentence section i guess, because our parent can have a ton
	// of children sections!!
	//
	for ( int32_t i = a ; i < b ; i++ ) {
		// get current parent of that word
		Section *wp = m_sectionPtrs[i];
		// if sentence section does NOT contain the word's current
		// section then the sentence section becomes the new section
		// for that word.
		if ( ! sk->strictlyContains ( wp ) ) {
			// now if "wp" is like a root, then sk becomes the kid
			m_sectionPtrs[i] = sk;
			// our parent is wp
			sk->m_parent = wp;
			continue;
		}
		// we gotta blow up wp until right before it is bigger
		// than "sk" and use that
		for ( ; wp->m_parent ; wp = wp->m_parent )
			// this could be equal to, not just contains
			// otherwise we use strictlyContains()
			if ( wp->m_parent->contains(sk) ) break;
		// already parented to us?
		if ( wp->m_parent == sk ) continue;
		// sentence's parent is now wp's parent
		sk->m_parent = wp->m_parent;
		// and we become wp's parent
		wp->m_parent = sk;
		// sanity check
		if ( wp->m_b > sk->m_b ) { g_process.shutdownAbort(true); }
		if ( wp->m_a < sk->m_a ) { g_process.shutdownAbort(true); }
	}

	return sk;
}

// this is a function because we also call it from addImpliedSections()!
void Sections::setNextBrotherPtrs ( bool setContainer ) {

	// clear out
	for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
		si->m_nextBrother = NULL;
		si->m_prevBrother = NULL;
	}

	for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
		Section *sj = NULL;

		// get word after us
		int32_t wn = si->m_b;
		int32_t nw2 = m_nw;

		// if we hit a word in our parent.. then increment wn
		// PROBLEM "<root><t1>hey</t1> blah blah blah x 1 mill</root>"
		// would exhaust the full word list when si is the "t1"
		// section. 
		Section *j2 = si->m_next;
		if ( j2 && j2->m_a >= si->m_b ) {
			sj = j2;
			nw2 = 0;
		}

		// try one more ahead for things like so we don't end up
		// setting sj to the "t2" section as in:
		// "<root><t1><t2>hey</t2></t1> ...."
		if ( ! sj && j2 ) {
			// try the next section then
			j2 = j2->m_next;
			// set "sj" if its a potential brother section
			if ( j2 && j2->m_a >= si->m_b ) {
				sj = j2;
				nw2 = 0;
			}
		}

		// ok, try the next word algo approach
		for ( ; wn < nw2 ; wn++ ) {
			sj = m_sectionPtrs[wn];
			if ( sj->m_a >= si->m_b ) break;
		}
		// bail if none
		if ( wn >= m_nw ) continue;

		// telescope up until brother if possible
		for ( ; sj ; sj = sj->m_parent )
			if ( sj->m_parent == si->m_parent ) break;

		// give up?
		if ( ! sj || sj->m_parent != si->m_parent ) continue;

		// sanity check
		if ( sj->m_a < si->m_b && 
		     sj->m_tagId != TAG_TC &&
		     si->m_tagId != TAG_TC ) {
			g_process.shutdownAbort(true); }
		// set brother
		si->m_nextBrother = sj;
		// set his prev then
		sj->m_prevBrother = si;
		// sanity check
		if ( sj->m_parent != si->m_parent ) { g_process.shutdownAbort(true); }
		// sanity check
		if ( sj->m_a < si->m_b &&
		     sj->m_tagId != TAG_TC &&
		     si->m_tagId != TAG_TC ) { 
			g_process.shutdownAbort(true); }
		// do more?
		if ( ! setContainer ) continue;
		// telescope this
		Section *te = sj;
		// telescope up until it contains "si"
		for ( ; te && te->m_a > si->m_a ; te = te->m_parent );
		// only update list container if smaller than previous
		if ( ! si->m_listContainer )
			si->m_listContainer = te;
		else if ( te && te->m_a > si->m_listContainer->m_a )
			si->m_listContainer = te;
		if ( ! sj->m_listContainer )
			sj->m_listContainer = te;
		else if ( te && te->m_a > sj->m_listContainer->m_a )
			sj->m_listContainer = te;

		// now 
	}
}

void Sections::setNextSentPtrs ( ) {
	// kinda like m_rootSection
	m_firstSentence = NULL;

	Section *finalSec = NULL;

	// scan the sentence sections and number them to set m_sentNum
	for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) {
		// record final section
		finalSec = sk;

		// need sentence
		if ( ! ( sk->m_flags & SEC_SENTENCE ) ) {
			continue;
		}

		// first one?
		if ( ! m_firstSentence ) {
			m_firstSentence = sk;
		}
	}

	Section *lastSent = NULL;

	// now set "m_nextSentence" of each section
	for ( Section *sk = finalSec ; sk ; sk = sk->m_prev ) {
		// set this
		sk->m_nextSentence = lastSent;

		// need sentence
		if ( ! ( sk->m_flags & SEC_SENTENCE ) ) {
			continue;
		}

		// we are the sentence now
		lastSent = sk;
	}
}

#define TABLE_ROWS 25

void Sections::printFlags(SafeBuf *sbuf, const Section *sn) {
	sec_t f = sn->m_flags;

	if ( f & SEC_HEADING )
		sbuf->safePrintf("heading ");

	if ( f & SEC_MENU_SENTENCE )
		sbuf->safePrintf("menusentence " );
	if ( f & SEC_MENU )
		sbuf->safePrintf("ismenu " );
	if ( f & SEC_MENU_HEADER )
		sbuf->safePrintf("menuheader " );

	if ( f & SEC_LINK_TEXT )
		sbuf->safePrintf("linktext " );
	if ( f & SEC_PLAIN_TEXT )
		sbuf->safePrintf("plaintext " );

	if ( f & SEC_FAKE ) {
		if ( sn->m_baseHash == BH_BULLET )
			sbuf->safePrintf("bulletdelim ");
		else if ( sn->m_baseHash == BH_SENTENCE )
			sbuf->safePrintf("<b>sentence</b> ");
		else if ( sn->m_baseHash == BH_IMPLIED )
			sbuf->safePrintf("<b>impliedsec</b> ");
		else { g_process.shutdownAbort(true); }
	}

	if ( f & SEC_NOTEXT )
		sbuf->safePrintf("notext ");

	if ( f & SEC_SCRIPT )
		sbuf->safePrintf("inscript ");

	if ( f & SEC_NOSCRIPT )
		sbuf->safePrintf("innoscript ");

	if ( f & SEC_STYLE )
		sbuf->safePrintf("instyle ");

	if ( f & SEC_HIDDEN )
		sbuf->safePrintf("indivhide ");

	if ( f & SEC_SELECT )
		sbuf->safePrintf("inselect ");

	if ( f & SEC_IN_HEAD )
		sbuf->safePrintf("inhead ");

	if ( f & SEC_IN_TITLE )
		sbuf->safePrintf("intitle ");

	if ( f & SEC_IN_HEADER )
		sbuf->safePrintf("inheader ");

	if ( f & SEC_IN_IFRAME )
		sbuf->safePrintf("iniframe ");
}

bool Sections::isHardSection(const Section *sn) const {
	int32_t a = sn->m_a;
	// . treat this as hard... kinda like a div section...
	//   fixes gwair.org date from stealing address of another date
	//   because the span tags are fucked up...
	// . crap, no this prevents publicbroadcasting.net and other urls
	//   from telescoping to header dates they need to telescope to.
	//   the header dates are in span tags and if that is seen as a hard
	//   section bad things happen
	//if ( m_tids[a] == TAG_SPAN ) return true;
	if ( ! isBreakingTagId((*m_tr)[a].nodeid) ) {
		// . if first child is hard that works!
		// . fixes "<blockquote><p>..." for collectorsguide.com
		if ( sn->m_next && 
		     sn->m_next->m_tagId &&
		     // fix "blah blah<br>blah blah" for sentence
		     sn->m_next->m_tagId != TAG_BR &&
		     sn->m_next->m_a < sn->m_b &&
		     isBreakingTagId(sn->m_next->m_tagId) )
			return true;
		// otherwise, forget it!
		return false;
	}
	// trumba.com has sub dates in br-based implied sections that need
	// to telescope to their parent above
	if ( (*m_tr)[a].nodeid == TAG_BR ) return false;
	if ( sn->m_flags & SEC_SENTENCE ) return false;

	// xml tag exception for gwair.org. treat <st1:Place>... as soft
	if ( ((*m_tr)[a].nodeid & BACKBITCOMP) == TAG_XMLTAG && ! m_isRSSExt )
		return false;

	return true;
}

bool Sections::setMenus ( ) {
	// . this just returns if already set
	// . sets Bits::m_bits[x].m_flags & D_IN_LINK if its in a link
	// . this bits array is 1-1 with the words
	m_bits->setInLinkBits(this);

	sec_t flag;
	// set SEC_PLAIN_TEXT and SEC_LINK_TEXT for all sections
	for ( int32_t i = 0 ; i < m_nw ; i++ ) {
		// need alnum word
		if ( ! (*m_tr)[i].is_alfanum ) continue;
		// get our flag
		if ( m_bits->queryBits(i) & D_IN_LINK ) flag = SEC_LINK_TEXT;
		else                     flag = SEC_PLAIN_TEXT;
		// get section ptr
		Section *sk = m_sectionPtrs[i];
		// loop for sk
		for ( ; sk ; sk = sk->m_parent ) {
			// skip if already set
			if ( sk->m_flags & flag ) break;
			// set it
			sk->m_flags |= flag;
		}
	}

	Section *last = NULL;
	// . alernatively, scan through all anchor tags
	// . compare to last anchor tag
	// . and blow up each to their max non-intersection section and make
	//   sure no PLAIN text in either of those!
	// . this is all to fix texasdrums.drums.org which has various span
	//   and bold tags throughout its menu at random
	for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
		// . if we hit plain text, we kill our last
		// . this was causing "geeks who drink" for blackbirdbuvette
		//   to get is SEC_MENU set because there was a link after it
		if ( si->m_flags & SEC_PLAIN_TEXT ) {
			last = NULL;
		}

		// skip if not a href section
		if ( si->m_baseHash != TAG_A ) {
			continue;
		}

		// . if it is a mailto link forget it
		// . fixes abtango.com from detecting a bad menu
		const char *ptr  = (*m_tr)[si->m_a].token_start;
		int32_t  plen = (*m_tr)[si->m_a].token_len;

		const char *mailto = strncasestr(ptr,plen,"mailto:");
		if ( mailto ) {
			last = NULL;
		}

		// bail if no last
		if ( ! last ) { last = si; continue; }

		// save last
		Section *prev = last;

		// set last for next round, used "saved" below
		last = si;

		// get first "hard" section encountered while telescoping
		Section *prevHard = NULL;

		// blow up last until right before it contains us
		for ( ; prev ; prev = prev->m_parent ) {
			// record?
			if ( ! prevHard && isHardSection(prev) ) 
				prevHard = prev;
			// if parent contains us, stop
			if ( prev->m_parent->contains ( si ) ) break;
		}

		// if it has plain text, forget it!
		if ( prev && prev->m_flags & SEC_PLAIN_TEXT ) continue;
		// use this for us
		Section *sk = si;
		// get first "hard" section encountered while telescoping
		Section *skHard = NULL;
		// same for us
		for ( ; sk ; sk = sk->m_parent ) {
			// record?
			if ( ! skHard && isHardSection(sk) ) skHard = sk;
			// if parent contains us, stop
			if ( prev && sk->m_parent->contains ( prev ) ) break;
		}
		// if it has plain text, forget it!
		if ( sk && sk->m_flags & SEC_PLAIN_TEXT ) continue;

		// . first hard sections encountered must match!
		// . otherwise for switchborad.com we lose "A B C ..." as
		//   title candidate because we think it is an SEC_MENU
		//   because the sections before it have links in them, but
		//   they have different hard sections
		if (   prevHard && ! skHard ) continue;
		if ( ! prevHard &&   skHard ) continue;
		if ( prevHard && prevHard->m_tagId != skHard->m_tagId ) continue;

		// ok, great that works!
		if( prev ) {
			prev->m_flags |= SEC_MENU;
		}
		if( sk ) {
			sk->m_flags |= SEC_MENU;
		}
	}

	int64_t h_copyright = hash64n("copyright");
	// copyright check
	// the copyright symbol in utf8 (see Entities.cpp for the code)
	static const char copy[] = "<EFBFBD>";

	// scan all years, lists and ranges of years, and look for
	// a preceeding copyright sign. mark such years as DF_COPYRIGHT
	for ( int32_t i = 0 ; i < m_nw ; i++ ) {
		// skip if tag
		if ( (*m_tr)[i].nodeid ) continue;
		// do we have an alnum word before us here?
		if ( (*m_tr)[i].is_alfanum ) {
			// if word check for copyright
			if ( (*m_tr)[i].token_hash != h_copyright ) continue;
		}
		// must have copyright sign in it i guess
		else if ( ! gb_strncasestr((*m_tr)[i].token_start, (*m_tr)[i].token_len, copy)) 
			continue;
		// mark section as copyright section then
		Section *sp = m_sectionPtrs[i];
		// flag as menu
		sp->m_flags |= SEC_MENU;
	}

	sec_t ff = SEC_MENU;

	// set SEC_MENU of child sections of SEC_MENU sections
	for ( Section *si = m_rootSection; si; si = si->m_next ) {
		// must be a link text only section
		if ( !( si->m_flags & ff ) )
			continue;

		// ignore if went down this path
		if ( si->m_used == 82 ) {
			continue;
		}

		// get first potential kid
		Section *sk = si->m_next;
		// scan child sections
		for ( ; sk; sk = sk->m_next ) {
			// stop if not contained
			if ( !si->contains( sk ) ) {
				break;
			}

			// mark it
			sk->m_flags |= ( si->m_flags & ff ); // SEC_MENU;

			// ignore in big loop
			sk->m_used = 82;
		}
	}

	//
	// set SEC_MENU_HEADER
	//
	for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) {
		// skip if not in a menu
		if ( ! ( sk->m_flags & SEC_MENU ) ) {
			continue;
		}

		// get his list container
		Section *c = sk->m_listContainer;

		// skip if none
		if ( !c ) {
			continue;
		}

		// already flagged?
		if ( c->m_used == 89 ) {
			continue;
		}

		// do not repeat on any item in this list
		c->m_used = 89;

		// flag all its brothers!
		Section *zz = sk;
		for ( ; zz; zz = zz->m_nextBrother ) {
			// bail if not in menu
			if ( !( zz->m_flags & SEC_MENU ) ) {
				break;
			}
		}

		// if broked it, stop
		if ( zz ) {
			continue;
		}

		//
		// ok, every item in list is a menu item, so try to set header
		//
		// get word before first item in list
		int32_t r = sk->m_a - 1;
		for ( ; r >= 0 && !(*m_tr)[r].is_alfanum; r-- )
			;

		// if no header, skip
		if ( r < 0 ) {
			continue;
		}

		// set SEC_MENU_HEADER
		setHeader( r, sk, SEC_MENU_HEADER );
	}

	//
	// set SEC_MENU_SENTENCE flag
	//
	for ( Section *si = m_rootSection; si; si = si->m_next ) {
		// must be a link text only section
		if ( !( si->m_flags & SEC_MENU ) ) {
			continue;
		}

		// set this
		bool gotSentence = ( si->m_flags & SEC_SENTENCE );

		// set SEC_MENU of the sentence
		if ( gotSentence ) {
			continue;
		}

		// parent up otherwise
		for ( Section *sk = si->m_parent; sk; sk = sk->m_parent ) {
			// stop if sentence finally
			if ( !( sk->m_flags & SEC_SENTENCE ) ) {
				continue;
			}

			// not a menu sentence if it has plain text in it
			// though! we have to make this exception to stop
			// stuff like
			// "Wedding Ceremonies, No preservatives, more... "
			// from switchboard.com from being a menu sentence
			// just because "more" is in a link.
			if ( sk->m_flags & SEC_PLAIN_TEXT ) {
				break;
			}

			// set it
			sk->m_flags |= SEC_MENU_SENTENCE;

			// and stop
			break;
		}
	}

	static bool s_init = false;
	static int64_t h_close ;
	static int64_t h_send ;
	static int64_t h_map ;
	static int64_t h_maps ;
	static int64_t h_directions ;
	static int64_t h_driving ;
	static int64_t h_help ;
	static int64_t h_more ;
	static int64_t h_log ;
	static int64_t h_sign ;
	static int64_t h_change ;
	static int64_t h_write ;
	static int64_t h_save ;
	static int64_t h_share ;
	static int64_t h_forgot ;
	static int64_t h_home ;
	static int64_t h_sitemap ;
	static int64_t h_advanced ;
	static int64_t h_go ;
	static int64_t h_website ;
	static int64_t h_view;
	static int64_t h_add;
	static int64_t h_submit;
	static int64_t h_get;
	static int64_t h_about;
	// new stuff
	static int64_t h_back; // back to top
	static int64_t h_next;
	static int64_t h_buy; // buy tickets
	static int64_t h_english; // english french german versions
	static int64_t h_click;

	if ( ! s_init ) {
		s_init = true;
		h_close = hash64n("close");
		h_send = hash64n("send");
		h_map = hash64n("map");
		h_maps = hash64n("maps");
		h_directions = hash64n("directions");
		h_driving = hash64n("driving");
		h_help = hash64n("help");
		h_more = hash64n("more");
		h_log = hash64n("log");
		h_sign = hash64n("sign");
		h_change = hash64n("change");
		h_write = hash64n("write");
		h_save = hash64n("save");
		h_share = hash64n("share");
		h_forgot = hash64n("forgot");
		h_home = hash64n("home");
		h_sitemap = hash64n("sitemap");
		h_advanced = hash64n("advanced");
		h_go = hash64n("go");
		h_website = hash64n("website");
		h_view = hash64n("view");
		h_add = hash64n("add");
		h_submit = hash64n("submit");
		h_get = hash64n("get");
		h_about = hash64n("about");
		h_back = hash64n ("back");
		h_next = hash64n ("next");
		h_buy = hash64n ("buy");
		h_english = hash64n ("english");
		h_click = hash64n ("click");
	}

	// . when dup/non-dup voting info is not available because we are
	//   more or less an isolated page, guess that these links are
	//   menu links and not to be considered for title or event description
	// . we completely exclude a word from title/description if its 
	//   SEC_MENU is set.
	// . set SEC_MENU for renegade links that start with an action
	//   verb like "close" or "add" etc. but if their # of non dup votes
	//   is high relative to their # of dup votes, then do not set this
	//   because it might be a name of a band like "More" or something
	//   and be in a link
	// . scan all href sections
	// set SEC_LINK_ONLY on sections that just contain a link
	for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
		// skip if not a href section
		if ( si->m_baseHash != TAG_A ) continue;
		// set points to scan
		int32_t a = si->m_a;
		int32_t b = si->m_b;
		// assume not bad
		bool bad = false;
		int32_t i;
		// scan words if any
		for ( i = a ; i < b ; i++ ) {
			const auto &token = (*m_tr)[i];
			// skip if not word
			if ( ! token.is_alfanum ) continue;
			// assume bad
			bad = true;
			// certain words are indicative of menus
			if ( token.token_hash == h_close ) break;
			if ( token.token_hash == h_send ) break;
			if ( token.token_hash == h_map ) break;
			if ( token.token_hash == h_maps ) break;
			if ( token.token_hash == h_directions ) break;
			if ( token.token_hash == h_driving ) break;
			if ( token.token_hash == h_help ) break;
			if ( token.token_hash == h_more ) break;
			if ( token.token_hash == h_log ) break; // log in
			if ( token.token_hash == h_sign ) break; // sign up/in
			if ( token.token_hash == h_change ) break; // change my loc.
			if ( token.token_hash == h_write ) break; // write a review
			if ( token.token_hash == h_save ) break;
			if ( token.token_hash == h_share ) break;
			if ( token.token_hash == h_forgot ) break; // forgot your pwd
			if ( token.token_hash == h_home ) break;
			if ( token.token_hash == h_sitemap ) break;
			if ( token.token_hash == h_advanced ) break; // adv search
			if ( token.token_hash == h_go ) break; // go to top of page
			if ( token.token_hash == h_website ) break;
			if ( token.token_hash == h_view ) break;
			if ( token.token_hash == h_add ) break;
			if ( token.token_hash == h_submit ) break;
			if ( token.token_hash == h_get ) break;
			if ( token.token_hash == h_about ) break;
			if ( token.token_hash == h_back ) break;
			if ( token.token_hash == h_next ) break;
			if ( token.token_hash == h_buy ) break;
			if ( token.token_hash == h_english ) break;
			if ( token.token_hash == h_click ) break;
			bad = false;
			break;
		}
		// skip if ok
		if ( ! bad ) continue;
		// get smallest section
		Section *sm = m_sectionPtrs[i];
		// if bad mark it!
		sm->m_flags |= SEC_MENU;
	}			

	return true;
}

// "first" is first item in the list we are getting header for
void Sections::setHeader ( int32_t r , Section *first , sec_t flag ) {
	// get smallest section containing word #r
	Section *sr = m_sectionPtrs[r];
	// save orig
	Section *orig = sr;

	// blow up until just before "first" section
	for ( ; sr ; sr = sr->m_parent ) {
		// forget it if in title tag already!
		if ( sr->m_flags & SEC_IN_TITLE ) return;
		// stop if no parent
		if ( ! sr->m_parent ) continue;
		// parent must not contain first
		if ( sr->m_parent->contains ( first ) ) break;
	}
	// if we failed to contain "first"... what does this mean? i dunno
	// but its dropping core for
	// http://tedserbinski.com/jcalendar/jcalendar.js
	if ( ! sr ) return;
	
	// save that
	Section *biggest = sr;

	// check out prev brother
	Section *prev = biggest->m_prevBrother;

	// if we are in a hard section and capitalized (part of the 
	// SEC_HEADING) requirements, then it should be ok if we have
	// a prev brother of a different tagid.
	// this will fix americantowns.com which has a list of header tags
	// and ul tags intermingled, with menus in the ul tags.
	// should also fix upcoming.yahoo.com which has alternating
	// dd and dt tags for its menus. now that we got rid of
	// addImpliedSections() we have to deal with this here, and it will
	// be more accurate since addImpliedSections() was often wrong.
	if ( prev &&
	     (orig->m_flags & SEC_HEADING) &&
	     prev->m_tagId != biggest->m_tagId )
		prev = NULL;

	// but if prev brother is a blank, we should view that as a delimeter
	// BUT really we should have added those sections in with the new
	// delimeter logic! but let's put this in for now anyway...
	if ( prev && prev->m_firstWordPos < 0 )
		prev = NULL;

	// if the header section has a prev brother, forget it!
	if ( prev ) return;

	// . if we gained extra text, that is a no-no then
	// . these two checks replaced the two commented out ones above
	// . they allow for empty sections preceeding "sr" at any level as
	//   we telescope it up
	if ( biggest->m_firstWordPos != orig->m_firstWordPos ) return;
	if ( biggest->m_lastWordPos  != orig->m_lastWordPos  ) return;

	// . now blow up first until just before it hits biggest as well
	// . this fixes reverbnation on the nextBrother check below
	for ( ; first ; first = first->m_parent ) {
		// stop if parent is NULL
		if ( ! first->m_parent ) break;
		// stop if parent would contain biggest
		if ( first->m_parent->contains ( biggest ) ) break;
	}
	// if after blowing it up "first" contains more than just menu
	// sections, then bail. that really was not a menu header!
	// fixes reverbnation url that thought "That 1 Guy" was a menu header.
	if ( flag == SEC_MENU_HEADER ) {
		Section *fx = first;
		for ( ; fx ; fx = fx->m_next ) {
			// stop when list is over
			if ( fx->m_a >= first->m_b ) break;
			// ignore if no next
			if ( fx->m_flags & SEC_NOTEXT ) continue;
			// thats bad if SEC_MENU not set, it should be for all!
			if ( fx->m_flags & SEC_MENU ) continue;
			// we got these now
			if ( fx->m_flags & SEC_MENU_SENTENCE ) continue;
			// otherwise, bad!
			return;
		}
	}

	// scan until outside biggest
	int32_t lastb = biggest->m_b;
	// . make sure sr does not contain any list in it
	// . scan all sections between sr and "saved"
	for ( ; sr ; sr = sr->m_next ) {
		// stop if over
		if ( sr->m_a >= lastb ) break;
		// if we have a brother with same taghash we are
		// part of a list
		if ( sr->m_nextBrother &&
		     sr->m_nextBrother->m_tagHash == sr->m_tagHash &&
		     sr->m_nextBrother != first )
			return;
		if ( sr->m_prevBrother &&
		     sr->m_prevBrother->m_tagHash == sr->m_tagHash &&
		     // for footers
		     sr->m_prevBrother != first )
			return;
	}

	// restart loop
	sr = biggest;
	// ok, not part of a list, flag it
	for ( ; sr ; sr = sr->m_next ) {
		// stop if over
		if ( sr->m_a >= lastb ) break;
		// flag each subsection
		sr->m_flags |= flag; // SEC_MENU_HEADER;
	}
}


// . set SEC_HEADING bits in Section::m_flags
// . identifies sections that are most likely headings
// . the WHOLE idea of this algo is to take a list of sections that are all 
//   the same tagId/baseHash and differentiate them so we can insert implied 
//   sections with headers. 
bool Sections::setHeadingBit ( ) {

	int32_t headings = 0;
	// scan the sections
	for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
		int32_t fwp = si->m_firstWordPos;
		if ( fwp == -1 ) continue;

		// we must be the smallest container around this text
		if ( m_sectionPtrs[fwp] != si ) continue;

		// . make sure we are in our own hard section
		// . TODO: allow for bold or strong, etc. tags as well
		bool hasHard = false;
		int32_t a = si->m_firstWordPos;
		int32_t b = si->m_lastWordPos;
		// go to parent
		Section *pp = si;
		Section *biggest = NULL;
		bool inLink = false;
		// . we need to be isolated in our own hard section container
		// . TODO: what about "<b>Hi There <i>Bob</i></b>" as a heading
		// . i guess that will still work!
		for ( ; pp ; pp = pp->m_parent ) {
			// stop if breached
			if ( pp->m_firstWordPos != a ) break;
			if ( pp->m_lastWordPos  != b ) break;
			// record this
			if ( pp->m_tagId == TAG_A ) inLink = true;
			// record the biggest section containing just our text
			biggest = pp;
			// is it a hard section?
			if ( isHardSection(pp) )  hasHard = true;
			// . allow bold and strong tags
			// . fixes gwair.org which has the dates of the
			//   month in strong tags. so we need to set
			//   SEC_HEADING for those so getDelimHash() will
			//   recognize such tags as date header tags in the
			//   METHOD_DOM algorithm and we get the proper
			//   implied sections
			if ( pp->m_tagId == TAG_STRONG ) hasHard = true;
			if ( pp->m_tagId == TAG_B      ) hasHard = true;
		}
		// need to be isolated in a hard section
		if ( ! hasHard ) continue;

		// now make sure the text is capitalized etc
		bool hadUpper = false;
		//bool hadLower = false;
		int32_t lowerCount = 0;
		bool hadYear  = false;
		bool hadAlpha = false;
		int32_t i;
		// scan the alnum words we contain
		for ( i = a ; i <= b ; i++ ) {
			const auto &token = (*m_tr)[i];
			// . did we hit a breaking tag?
			// . "<div> blah <table><tr><td>blah... </div>"
			if ( token.nodeid && isBreakingTagId(token.nodeid) ) break;
			// skip if not alnum word
			if ( ! token.is_alfanum ) continue;
			// skip digits
			if(token.token_len == 4 &&
			   is_digit(token.token_start[0]) &&
			   is_digit(token.token_start[1]) &&
			   is_digit(token.token_start[2]) &&
			   is_digit(token.token_start[3])) {
				// . but if we had a year like "2010" that
				//   is allowed to be a header.
				// . this fixes 770kob.com because the events
				//   under the "2010" header were telescoping
				//   up into events in the "December 2009"
				//   section, when they should have been in
				//   their own section! and now they are in
				//   their own implied section...
				int32_t num = atol2(token.token_start,token.token_len);
				if ( num < 1800 ) continue;
				if ( num > 2100 ) continue;
				// mark it
				hadYear = true;
				continue;
			}
			// mark this
			hadAlpha = true;
			// is it upper?
			if ( is_upper_utf8(token.token_start) ) {
				hadUpper = true;
				continue;
			}
			// skip stop words
			if(isStopWord(token.token_start, token.token_len, token.token_hash)) continue;
			// . skip short words
			// . November 4<sup>th</sup> for facebook.com
			if ( token.token_len <= 2 ) continue;
			// is it lower?
			if ( is_lower_utf8(token.token_start) ) lowerCount++;
			// stop now if bad
			//if ( hadUpper ) break;
			if ( lowerCount >= 2 ) break;
		}
		// is it a header?
		bool isHeader = hadUpper;
		// a single year by itself is ok though too
		if ( hadYear && ! hadAlpha ) isHeader = true;
		// allow for one mistake like we do in Events.cpp for titles
		if ( lowerCount >= 2 ) isHeader = false;
		if ( ! isHeader ) continue;

		// ok, mark this section as a heading section
		si->m_flags |= SEC_HEADING;

		// a hack!
		if ( inLink ) biggest->m_flags |= SEC_LINK_TEXT;

		// count them
		headings++;
	}

	// bail now if no headings were set
	if ( ! headings ) return true;

	return true;
}

void Sections::setTagHashes ( ) {
	if ( m_numSections == 0 ) return;

	// now recompute the tagHashes and depths and content hashes since
	// we have eliminate open-ended sections in the loop above
	for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) {
		// these have to be in order of sn->m_a to work right
		// because we rely on the parent tag hash, which would not
		// necessarily be set if we were not sorted, because the
		// parent section could have SEC_FAKE flag set because it is
		// a br section added afterwards.

		// shortcut
		int64_t bh = (int64_t)sn->m_baseHash;

		// sanity check
		if ( bh == 0 ) { g_process.shutdownAbort(true); }

		// if no parent, use initial values
		if ( ! sn->m_parent ) {
			sn->m_depth   = 0;
			sn->m_tagHash = bh;

			// sanity check
			if ( bh == 0 ) { g_process.shutdownAbort(true); }
			continue;
		}

		// sanity check
		if ( sn->m_parent->m_tagHash == 0 ) { g_process.shutdownAbort(true); }

		// . update the cumulative front tag hash
		// . do not include hyperlinks as part of the cumulative hash!
		sn->m_tagHash = hash32h ( bh , sn->m_parent->m_tagHash );

		sn->m_colorHash = hash32h ( bh , sn->m_parent->m_colorHash );

		// if we are an implied section, just use the tag hash of
		// our parent. that way since we add different implied
		// sections for msichicago.com root than we do the kid,
		// the section voting should still match up
		if ( bh == BH_IMPLIED ) {
			sn->m_tagHash     = sn->m_parent->m_tagHash;
		}

		if ( sn->m_tagHash == 0 ) {
			sn->m_tagHash = 1234567;
		}

		// depth based on parent, too
		sn->m_depth = sn->m_parent->m_depth + 1;
	}
}

// make this replace ::print() when it works
bool Sections::print( SafeBuf *sbuf, int32_t hiPos, const int32_t *wposVec, const char *densityVec, const char *wordSpamVec, const char *fragVec ) const {
	PrintData pd;
	pd.sbuf = sbuf;
	pd.hiPos = hiPos;
	pd.wposVec = wposVec;
	pd.densityVec = densityVec;
	pd.wordSpamVec = wordSpamVec;
	pd.fragVec = fragVec;
	return print(&pd);
}

bool Sections::print(PrintData *pd) const {
	pd->sbuf->setLabel ("sectprnt");

	//verifySections();

	int32_t    nw    = m_tr->size();

	// check words
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// get section
		Section *sn = m_sectionPtrs[i];
		if ( sn->m_a >  i ) { g_process.shutdownAbort(true); }
		if ( sn->m_b <= i ) { g_process.shutdownAbort(true); }
	}


	// print sections out
	for ( Section *sk = m_rootSection ; sk ; ) {
		// print this section
		printSectionDiv(pd,sk);
		// advance
		int32_t b = sk->m_b;
		// stop if last
		if ( b >= m_nw ) break;
		// get section after that
		sk = m_sectionPtrs[b];
	}

	// print header
	const char *hdr =
		"<table border=1>"
		"<tr>"
		"<td><b>sec #</b></td>"
		"<td><b>wordStart</b></td>"
		"<td><b>wordEnd</b></td>"
		"<td><b>baseHash</b></td>"
		"<td><b>cumulTagHash</b></td>"
		"<td><b>contentHash</b></td>"
		"<td><b>contentTagHash</b></td>"
		"<td><b>XOR</b></td>" // only valid for contentHashes
		"<td><b>depth</b></td>"
		"<td><b>parent word range</b></td>"
		"<td><b>flags</b></td>"
		"<td><b>evIds</b></td>"
		"<td><b>text snippet</b></td>"
		"</tr>\n";
	pd->sbuf->safePrintf("%s",hdr);

	int32_t rcount = 0;
	int32_t scount = 0;
	// show word # of each section so we can look in PageParser.cpp's
	// output to see exactly where it starts, since we now label all
	// the words
	for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) {
		// see if one big table causes a browser slowdown
		if ( (++rcount % TABLE_ROWS ) == 0 ) 
			pd->sbuf->safePrintf("</table>%s\n",hdr);
		const char *xs = "--";
		char ttt[100];
		if ( sn->m_contentHash64 ) {
			int32_t modified = sn->m_tagHash ^ sn->m_contentHash64;
			sprintf(ttt,"0x%" PRIx32,modified);
			xs = ttt;
		}
		// shortcut
		Section *parent = sn->m_parent;
		int32_t pswn = -1;
		int32_t pewn = -1;
		if ( parent ) {
			pswn = parent->m_a;
			pewn = parent->m_b;
		}

		// print it
		pd->sbuf->safePrintf("<tr><td>%" PRId32"</td>\n"
				 "<td>%" PRId32"</td>"
				 "<td>%" PRId32"</td>"
				 "<td>0x%" PRIx32"</td>"
				 "<td>0x%" PRIx32"</td>"
				 "<td>0x%" PRIx32"</td>"
				 "<td>0x%" PRIx32"</td>"
				 "<td>%s</td>"
				 "<td>%" PRId32"</td>"
				 "<td><nobr>%" PRId32" to %" PRId32"</nobr></td>"
				 "<td><nobr>" ,
				 scount++,
				 sn->m_a,
				 sn->m_b,
				 (int32_t)sn->m_baseHash,
				 (int32_t)sn->m_tagHash,
				 (int32_t)sn->m_contentHash64,
				 (int32_t)(sn->m_contentHash64^sn->m_tagHash),
				 xs,
				 sn->m_depth,
				 pswn,
				 pewn);
		// now show the flags
		printFlags ( pd->sbuf , sn );
		// first few words of section
		int32_t a = sn->m_a;
		int32_t b = sn->m_b;
		// -1 means an unclosed tag!! should no longer be the case
		if ( b == -1 ) { g_process.shutdownAbort(true); }//b=m_words->m_numWords;
		pd->sbuf->safePrintf("</nobr></td>");

		pd->sbuf->safePrintf("<td>&nbsp;</td>");

		pd->sbuf->safePrintf("<td><nobr>");
		// 70 chars max
		int32_t   max   = 70; 
		int32_t   count = 0;
		char   truncated = 0;
		// do not print last word/tag in section
		for ( int32_t i = a ; i < b - 1 && count < max ; i++ ) {
			const char *s = (*m_tr)[i].token_start;
			int32_t  slen = (*m_tr)[i].token_len;
			if ( count + slen > max ) {
				truncated = 1; 
				slen = max - count;
			}
			count += slen;
			// boldify front tag
			if ( i == a ) pd->sbuf->safePrintf("<b>");
			pd->sbuf->htmlEncode(s,slen,false);
			// boldify front tag
			if ( i == a ) pd->sbuf->safePrintf("</b>");
		}
		// if we truncated print a ...
		if ( truncated ) pd->sbuf->safePrintf("<b>…</b>");
		// then print ending tag
		if ( b < nw ) {
			int32_t blen = (*m_tr)[b-1].token_len;             //b is from m_b and always>0 so indexing b-1 is safe
			if ( blen>20 ) blen = 20;
			pd->sbuf->safePrintf("<b>");
			pd->sbuf->htmlEncode((*m_tr)[b-1].token_start,blen,false);
			pd->sbuf->safePrintf("</b>");
		}

		pd->sbuf->safePrintf("</nobr></td></tr>\n");
	}
			 
	pd->sbuf->safePrintf("</table>\n<br>\n");


	return true;
}

bool Sections::printSectionDiv(PrintData *pd, const Section *sk) const {
	// enter a new div section now
	pd->sbuf->safePrintf("<br>");
	// only make font color different
	int32_t bcolor = (int32_t)sk->m_colorHash& 0x00ffffff;
	int32_t fcolor = 0x000000;
	int32_t rcolor = 0x000000;
	uint8_t *bp = (uint8_t *)&bcolor;
	bool dark = false;
	if ( bp[0]<128 && bp[1]<128 && bp[2]<128 ) 
		dark = true;
	// or if two are less than 50
	if ( (bp[0]<100 && bp[1]<100) || (bp[1]<100 && bp[2]<100) || (bp[0]<100 && bp[2]<100) ) dark = true;
		
	// if bg color is dark, make font color light
	if ( dark ) {
		fcolor = 0x00ffffff;
		rcolor = 0x00ffffff;
	}
	// start the new div
	pd->sbuf->safePrintf("<div "
			 "style=\""
			 "background-color:#%06" PRIx32";"
			 "margin-left:20px;"
			 "border:#%06" PRIx32" 1px solid;"
			 "color:#%06" PRIx32"\">",
			 //(int32_t)sk,
			 bcolor,
			 rcolor,
			 fcolor);

	bool printWord = true;
	if ( ! sk->m_parent && sk->m_next && sk->m_next->m_a == sk->m_a )
		printWord = false;

	// print word/tag #i
	if ( !(sk->m_flags&SEC_FAKE) && sk->m_tagId && printWord )
		// only encode if it is a tag
		pd->sbuf->htmlEncode((*m_tr)[sk->m_a].token_start, (*m_tr)[sk->m_a].token_len, false);

	pd->sbuf->safePrintf("<i>");

	// print the flags
	pd->sbuf->safePrintf("A=%" PRId32" ",sk->m_a);

	// print tag hash now
	pd->sbuf->safePrintf("taghash=%" PRIu32" ",(int32_t)sk->m_tagHash);

	if ( sk->m_contentHash64 )
		pd->sbuf->safePrintf("ch64=%" PRIu64" ",sk->m_contentHash64);

	printFlags ( pd->sbuf , sk );
	
	if ( isHardSection(sk) )
		pd->sbuf->safePrintf("hardsec ");
	
	pd->sbuf->safePrintf("</i>\n");

	// now print each word and subsections in this section
	int32_t a = sk->m_a;
	int32_t b = sk->m_b;
	for ( int32_t i = a ; i < b ; i++ ) {
		const auto &token = (*m_tr)[i];
		// . if its a and us, skip
		// . BUT if we are root then really this tag belongs to
		//   our first child, so make an exception for root!
		if ( i == a && token.is_alfanum && (sk->m_parent) ) continue;

		// . get section of this word
		// . TODO: what if this was the tr tag we removed??? i guess
		//   maybe make it NULL now?
		Section *ws = m_sectionPtrs[i];
		// get top most parent that starts at word position #a and
		// is not "sk"
		for ( ; ; ws = ws->m_parent ) {
			if ( ws == sk ) break;
			if ( ! ws->m_parent ) break;
			if ( ws->m_parent->m_a != ws->m_a ) break;
			if ( ws->m_parent == sk ) break;
		}
		// if it belongs to another sections, print that section
		if ( ws != sk ) {
			// print out this subsection
			printSectionDiv(pd,ws);
			// advance to end of that then
			i = ws->m_b - 1;
			// and try next word
			continue;
		}

		// ignore if in style section, etc. just print it out
		if ( sk->m_flags & NOINDEXFLAGS ) {
			pd->sbuf->htmlEncode(token.token_start,token.token_len,false );
			continue;
		}

		// boldify alnum words
		if ( token.is_alfanum ) {
			if ( pd->wposVec[i] == pd->hiPos )
				pd->sbuf->safePrintf("<a name=hipos></a>");
			pd->sbuf->safePrintf("<nobr><b>");
			if ( i <  MAXFRAGWORDS && pd->fragVec[i] == 0 )
				pd->sbuf->safePrintf("<strike>");
		}
		if ( token.is_alfanum && pd->wposVec[i] == pd->hiPos )
			pd->sbuf->safePrintf("<blink style=\""
					   "background-color:yellow;"
					   "color:black;\">");
		// print that word
		pd->sbuf->htmlEncode(token.token_start, token.token_len, false );
		if ( token.is_alfanum && pd->wposVec[i] == pd->hiPos )
			pd->sbuf->safePrintf("</blink>");
		// boldify alnum words
		if ( token.is_alfanum ) {
			if ( i < MAXFRAGWORDS && pd->fragVec[i] == 0 )
				pd->sbuf->safePrintf("</strike>");
			pd->sbuf->safePrintf("</b>");
		}
		// and print out their pos/div/spam sub
		if ( token.is_alfanum ) {
			pd->sbuf->safePrintf("<sub "
					   "style=\"background-color:white;"
					   "font-size:10px;"
					   "border:black 1px solid;"
					   "color:black;\">");
			pd->sbuf->safePrintf("%" PRId32, pd->wposVec[i]);
			if ( pd->densityVec[i] != MAXDENSITYRANK )
				pd->sbuf->safePrintf("/<font color=purple><b>%" PRId32
						   "</b></font>"
						   ,
						   (int32_t)pd->densityVec[i]);

			if ( pd->wordSpamVec[i] != MAXWORDSPAMRANK )
				pd->sbuf->safePrintf("/<font color=red><b>%" PRId32
						   "</b></font>"
						   ,
						   (int32_t)pd->wordSpamVec[i]);
			pd->sbuf->safePrintf("</sub></nobr>");
		}
	}
	pd->sbuf->safePrintf("</div>\n");

	return true;
}

bool Sections::verifySections ( ) {

	// make sure we map each word to a section that contains it at least
	for ( int32_t i = 0 ; i < m_nw ; i++ ) {
		Section *si = m_sectionPtrs[i];
		if ( si->m_a >  i ) { g_process.shutdownAbort(true); }
		if ( si->m_b <= i ) { g_process.shutdownAbort(true); }
		// must have checksum
		if ( (*m_tr)[i].is_alfanum && si->m_contentHash64==0) { g_process.shutdownAbort(true); }
		// must have this set if 0
		if ( ! si->m_contentHash64 && !(si->m_flags & SEC_NOTEXT)) {
			g_process.shutdownAbort(true);}
		if (   si->m_contentHash64 &&  (si->m_flags & SEC_NOTEXT)) {
			g_process.shutdownAbort(true);}
	}

	// sanity check
	for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) {
		// get it
		//Section *sn = &m_sections[i];
		// get parent
		for(const Section *sp = sn->m_parent; sp; sp = sp->m_parent) {
			// make sure parent fully contains
			if ( sp->m_a > sn->m_a ) { g_process.shutdownAbort(true); }
			if ( sp->m_b < sn->m_b ) { g_process.shutdownAbort(true); }
			// and make sure every grandparent fully contains us too!
		}
	}

	// sanity check
	for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
		Section *sn = &m_sections[i];
		if ( sn->m_a >= sn->m_b ) { g_process.shutdownAbort(true); }
	}

	// sanity check, make sure each section is contained by the
	// smallest section containing it
	for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
		for ( Section *sj = m_rootSection ; sj ; sj = sj->m_next ) {
			// skip if us
			if ( sj == si ) continue;
			// skip column sections because they are artificial
			// and only truly contain some of the sections that
			// their [a,b) interval says they contain.
			if ( sj->m_tagId == TAG_TC ) continue;
			// or if an implied section of td tags in a tc
			if ( sj->m_baseHash == BH_IMPLIED &&
			     sj->m_parent &&
			     sj->m_parent->m_tagId == TAG_TC ) 
				continue;

			// skip if sj does not contain first word in si
			if ( sj->m_a >  si->m_a ) continue;
			if ( sj->m_b <= si->m_a ) continue;

			// ok, make sure in our parent path
			Section *ps = si;
			for ( ; ps ; ps = ps->m_parent ) 
				if ( ps == sj ) break;

			// ok if we found it
			if ( ps ) continue;

			// sometimes if sections are equal then the other
			// is the parent
			ps = sj;
			for ( ; ps ; ps = ps->m_parent ) 
				if ( ps == si ) break;

			// must have had us
			if ( ps ) continue;
			g_process.shutdownAbort(true);
		}
	}
	
	// make sure we map each word to a section that contains it at least
	for ( int32_t i = 0 ; i < m_nw ; i++ ) {
		Section *si = m_sectionPtrs[i];
		if ( si->m_a >  i ) { g_process.shutdownAbort(true); }
		if ( si->m_b <= i ) { g_process.shutdownAbort(true); }
	}

	return true;
}
No results found.