// print events should print if nothing else to print // when a div tag's parent truncates its section, it may have been // paired up with a div back tag which then should become free... // that is the problem... because those back tags are unpaired. // so your parent should constrain you as SOON as it is constrained and // close you up at that point. that way you cannot falsely pair-claim // a div back tag. #include "Sections.h" #include "Url.h" #include "tokenizer.h" #include "Conf.h" #include "XmlDoc.h" #include "Bits.h" #include "sort.h" #include "Abbreviations.h" #include "StopWords.h" #include "Process.h" #include "Posdb.h" #include "GbUtil.h" #include "Errno.h" Sections::Sections ( ) { m_sections = NULL; reset(); } void Sections::reset() { m_sectionBuf.purge(); m_sectionPtrBuf.purge(); m_sections = NULL; m_bits = NULL; m_numSections = 0; m_rootSection = NULL; m_lastSection = NULL; m_lastAdded = NULL; m_nw = 0; m_firstSentence = NULL; m_sectionPtrs = NULL; // Coverity m_tr = NULL; m_contentType = 0; m_isRSSExt = false; m_maxNumSections = 0; } Sections::~Sections ( ) { reset(); } #define TXF_MATCHED 1 // an element on the stack is a Tag class Tagx { public: // id of the fron tag we pushed nodeid_t m_tid; // section number we represent int32_t m_secNum; // set to TXF_MATCHED char m_flags; }; // i lowered from 1000 to 300 so that we more sensitive to malformed pages // because typically they seem to take longer to parse. i also added some // new logic for dealing with table tr and td back tags that allow us to // pop off the other contained tags right away rather than delaying it until // we are done because that will often breach this stack. #define MAXTAGSTACK 300 // . returns false if blocked, true otherwise // . returns true and sets g_errno on error // . sets m_sections[] array, 1-1 with words array "w" // . the Weights class can look at these sections and zero out the weights // for words in script, style, select and marquee sections bool Sections::set(const TokenizerResult *tr, Bits *bits, const Url *url, uint8_t contentType ) { reset(); if ( ! tr ) return true; if ( tr->size() > 1000000 ) { log("sections: over 1M words. skipping sections set for " "performance."); return true; } // save it m_tr = tr; m_bits = bits; m_contentType = contentType; // reset this just in case g_errno = 0; if ( tr->empty() ) return true; // shortcuts int32_t nw = tr->size(); m_isRSSExt = false; const char *ext = url->getExtension(); if ( ext && strcasecmp(ext,"rss") == 0 ) m_isRSSExt = true; if ( m_contentType == CT_XML ) m_isRSSExt = true; // . how many sections do we have max? // . init at one to count the root section int32_t max = 1; for ( int32_t i = 0 ; i < nw ; i++ ) { const auto &token = (*tr)[i]; // . count all front tags // count back tags too since some url // http://www.tedxhz.com/tags.asp?id=3919&id2=494 had a bunch // of </p> tags with no front tags and it cored us because // m_numSections > m_maxNumSections! if ( token.nodeid ) { max += 2; // . any punct tag could have a bullet in it... // . or if its a period could make a sentence section } else if ( !token.is_alfanum ) { // only do not count simple spaces if ( token.token_len == 1 && is_wspace_a(token.token_start[0])) continue; // otherwise count it as sentence delimeter max++; } } // . then \0 allows for a sentence too! // . fix doc that was just "localize-sf-prod\n" max++; // and each section may create a sentence section max *= 2; // truncate if excessive. if ( max > 1000000 ) { log("sections: truncating max sections to 1000000"); max = 1000000; } int32_t need = max * sizeof(Section); // set this m_maxNumSections = max; m_sectionPtrBuf.setLabel("psectbuf"); // separate buf now for section ptr for each word if ( ! m_sectionPtrBuf.reserve ( nw *sizeof(Section *)) ) return true; m_sectionPtrs = (Section **)m_sectionPtrBuf.getBufStart(); // allocate m_sectionBuf m_sections = NULL; m_sectionBuf.setLabel ( "sectbuf" ); if ( ! m_sectionBuf.reserve ( need ) ) return true; // point into it m_sections = (Section *)m_sectionBuf.getBufStart(); // save this too m_nw = nw; // stack of front tags we encounter Tagx stack[MAXTAGSTACK]; Tagx *stackPtr = stack; Section *current = NULL; Section *rootSection = NULL; // assume none m_rootSection = NULL; // only add root section if we got some words if ( nw > 0 ) { // record this i guess rootSection = &m_sections[m_numSections]; // clear memset ( rootSection , 0 , sizeof(Section) ); // . the current section we are in // . let's use a root section current = rootSection; // init that to be the whole page rootSection->m_b = nw; // save it m_rootSection = rootSection; // to fix a core dump rootSection->m_baseHash = 1; // advance m_numSections++; } // Sections are no longer 1-1 with words, just with front tags for ( int32_t i = 0 ; i < nw ; i++ ) { const auto &token = (*tr)[i]; nodeid_t fullTid = token.nodeid; // are we a non-tag? if ( ! fullTid ) { continue; } // make a single section for input tags if ( fullTid == TAG_INPUT || fullTid == TAG_HR || fullTid == TAG_COMMENT ) { // try to realloc i guess. should keep ptrs in tact. if ( m_numSections >= m_maxNumSections) { g_errno = EDOCBADSECTIONS; return true; } // get the section Section *sn = &m_sections[m_numSections]; // clear memset ( sn , 0 , sizeof(Section) ); // inc it m_numSections++; // sanity check - breach check if ( m_numSections > max ) { g_process.shutdownAbort(true); } // set our parent sn->m_parent = current; // need to keep a word range that the section covers sn->m_a = i; // section consists of just this tag sn->m_b = i + 1; // go on to next continue; } // a section of multiple br tags in a sequence if ( fullTid == TAG_BR ) { // try to realloc i guess. should keep ptrs in tact. if ( m_numSections >= m_maxNumSections) { g_errno = EDOCBADSECTIONS; return true; } // get the section Section *sn = &m_sections[m_numSections]; // clear memset ( sn , 0 , sizeof(Section) ); // inc it m_numSections++; // sanity check - breach check if ( m_numSections > max ) { g_process.shutdownAbort(true); } // set our parent sn->m_parent = current; // need to keep a word range that the section covers sn->m_a = i; // count em up int32_t brcnt = 1; // scan for whole sequence int32_t lastBrPos = i; for ( int32_t j = i + 1 ; j < nw ; j++ ) { const auto &token2 = (*tr)[j]; // claim br tags if ( token2.nodeid == TAG_BR ) { lastBrPos = j; brcnt++; continue; } // break on words if ( token2.is_alfanum ) break; // all spaces is ok if ( is_wspace_utf8_string(token2.token_start,token2.token_end()) ) continue; // otherwise, stop on other punct break; } // section consists of just this tag sn->m_b = lastBrPos + 1; // advance i = lastBrPos; // set this for later so that getDelimHash() returns // something different based on the br count for // METHOD_ATTRIBUTE sn->m_baseHash = 19999 + brcnt; // go on to next continue; } // get the tag id without the back bit nodeid_t tid = fullTid & BACKBITCOMP; // . ignore tags with no corresponding back tags // . if they have bad html and have front tags // with no corresponding back tags, that will hurt! // . make exception for <li> tag!!! // . was messing up: // http://events.kqed.org/events/index.php?com=detail& // eID=9812&year=2009&month=11 // for parsing out events // . make excpetion for <p> tag too! most ppl use </p> if ( ( ! hasBackTag ( tid ) || token.token_start[1] =='!' || // <!ENTITY rdfns...> token.token_start[1] =='?' ) && tid != TAG_P && tid != TAG_LI ) continue; // . these imply no back tag // . <description /> // . fixes inconsistency in // www.trumba.com/calendars/KRQE_Calendar.rss if ( token.token_start[token.token_len-2] == '/' && tid == TAG_XMLTAG ) continue; // do not breach the stack if ( stackPtr - stack >= MAXTAGSTACK ) { log( LOG_WARN, "html: stack breach for %s",url->getUrl()); // if we set g_errno and return then the url just // ends up getting retried once the spider lock // in Spider.cpp expires in MAX_LOCK_AGE seconds. // about an hour. but really we should completely // give up on this. whereas we should retry OOM errors // etc. but this just means bad html really. // just reset to 0 sections then reset(); return true; } char gotBackTag ; if ( fullTid != tid ) gotBackTag = 1; else gotBackTag = 0; // "pop tid", tid to pop off stack nodeid_t ptid = tid; nodeid_t fullPopTid = fullTid; // no nested <li> tags allowed if ( fullTid == TAG_LI && stackPtr > stack && ((stackPtr-1)->m_tid)==TAG_LI ) gotBackTag = 2; // no nested <b> tags allowed if ( fullTid == TAG_B && stackPtr > stack && ((stackPtr-1)->m_tid)==TAG_B ) gotBackTag = 2; // no nested <a> tags allowed if ( fullTid == TAG_A && stackPtr > stack && ((stackPtr-1)->m_tid)==TAG_A ) gotBackTag = 2; // no nested <p> tags allowed if ( fullTid == TAG_P && stackPtr > stack && ((stackPtr-1)->m_tid)==TAG_P ) gotBackTag = 2; // no <hN> tags inside a <p> tag // fixes http://www.law.berkeley.edu/140.htm if ( fullTid >= TAG_H1 && fullTid <= TAG_H5 && stackPtr > stack && ((stackPtr-1)->m_tid)==TAG_P ) { // match this on stack ptid = TAG_P; fullPopTid = TAG_P; gotBackTag = 2; } // no nested <td> tags allowed if ( fullTid == TAG_TD && stackPtr > stack && ((stackPtr-1)->m_tid)==TAG_TD ) gotBackTag = 2; // encountering <tr> when in a <td> closes the <td> AND // should also close the <tr>!! if ( fullTid == TAG_TR && stackPtr > stack && ((stackPtr-1)->m_tid)==TAG_TD ) gotBackTag = 2; // no nested <tr> tags allowed if ( fullTid == TAG_TR && stackPtr > stack && ((stackPtr-1)->m_tid)==TAG_TR ) gotBackTag = 2; // this is true if we are a BACK TAG if ( gotBackTag ) { // ignore span tags that are non-breaking because they // do not change the grouping/sectioning behavior of // the web page and are often abused. if ( ptid == TAG_SPAN ) continue; // fix for gwair.org if ( ptid == TAG_FONT ) continue; // too many people use these like a <br> tag or // make them open-ended or unbalanced //if ( tid == TAG_P ) continue; if ( ptid == TAG_CENTER ) continue; subloop: // don't blow the stack if ( stackPtr == stack ) continue; // point to it Tagx *spp = (stackPtr - 1); // init it Tagx *p ; // scan through the stack until we find a // front tag that matches this back tag //for(p = spp ; p >= stack && gotBackTag == 1 ; p-- ) { for ( p = spp ; p >= stack ; p-- ) { // no match? if ( p->m_tid != ptid ) { // matched before? we can pop if ( p->m_flags & TXF_MATCHED ) continue; // keep on going continue; } // do not double match if ( p->m_flags & TXF_MATCHED ) continue; // flag it cuz we matched it p->m_flags |= TXF_MATCHED; // set the stack ptr to it spp = p; // and stop break; } // no matching front tag at all? // then just ignore this back tag if ( p < stack ) continue; // get section number of the front tag //int32_t xn = *(secNumPtr-1); int32_t xn = spp->m_secNum; // sanity if ( xn<0 || xn>=m_numSections ) {g_process.shutdownAbort(true);} // get it Section *sn = &m_sections[xn]; // record the word range of the secion we complete sn->m_b = i+1; // do not include the <li> tag as part of it // otherwise we end up with overlapping section since // this tag ALSO starts a section!! if ( gotBackTag == 2 ) sn->m_b = i; // if our parent got closed before "sn" closed because // it hit its back tag before we hit ours, then we // must cut ourselves short and try to match this // back tag to another front tag on the stack Section *ps = sn->m_parent; for ( ; ps != rootSection ; ps = ps->m_parent ) { // skip if parent no longer contains us! if ( ps->m_b <= sn->m_a ) continue; // skip if this parent is still open if ( ps->m_b <= 0 ) continue; // parent must have closed before us if ( ps->m_b > sn->m_b ) {g_process.shutdownAbort(true);} // cut our end shorter sn->m_b = ps->m_b; // our TXF_MATCHED bit should still be set // for spp->m_flags, so try to match ANOTHER // front tag with this back tag now if ( ! ( spp->m_flags & TXF_MATCHED ) ) { g_process.shutdownAbort(true); } // ok, try to match this back tag with another // front tag on the stack, because the front // tag we had selected got cut short because // its parent forced it to cut short. goto subloop; } // sanity check if ( sn->m_b <= sn->m_a ) { g_process.shutdownAbort(true);} // revert it to this guy, may not equal stackPtr-1 !! stackPtr = spp; // get parent section if ( stackPtr > stack ) { // get parent section now xn = (stackPtr-1)->m_secNum; // set current to that current = &m_sections[xn]; } else { // i guess this is bad html! current = rootSection; } // debug log if ( g_conf.m_logDebugSections ) { const char *ms = ""; if ( stackPtr->m_tid != ptid) ms =" UNMATCHED"; const char *back =""; if ( fullPopTid & BACKBIT ) back = "/"; logf(LOG_DEBUG,"section: pop tid=%" PRId32" " "i=%" PRId32" " "level=%" PRId32" " "%s%s " //"h=0x%" PRIx32 "%s",(int32_t)tid, i, (int32_t)(stackPtr - stack), back,g_nodes[tid].m_nodeName, //h, ms); } // . if we were a back tag, we are done... but if we // were a front tag, we must add ourselves below... // . MDW: this seems more logical than the if-statement // below... if ( fullTid != tid ) continue; } if ( tid == TAG_CENTER ) continue; if ( tid == TAG_SPAN ) continue; // gwair.org has font tags the pair up a date "1st Sundays" // with the address above it, and it shouldn't do that! if ( tid == TAG_FONT ) continue; // try to realloc i guess. should keep ptrs in tact. if ( m_numSections >= m_maxNumSections) { g_errno = EDOCBADSECTIONS; return true; } // get the section Section *sn = &m_sections[m_numSections]; // clear memset ( sn , 0 , sizeof(Section) ); // inc it m_numSections++; // sanity check - breach check if ( m_numSections > max ) { g_process.shutdownAbort(true); } // set our parent sn->m_parent = current; // set this current = sn; // need to keep a word range that the section covers sn->m_a = i; // assume no terminating bookend sn->m_b = -1; // push a unique id on the stack so we can pop if we // enter a subsection stackPtr->m_tid = tid; stackPtr->m_secNum = m_numSections - 1; stackPtr->m_flags = 0; stackPtr++; // debug log if ( ! g_conf.m_logDebugSections ) continue; logf(LOG_DEBUG,"section: push tid=%" PRId32" " "i=%" PRId32" " "level=%" PRId32" " "%s " , (int32_t)tid, i, (int32_t)(stackPtr - stack)-1, g_nodes[(int32_t)tid].m_nodeName ); } // if first word in a section false outside of the parent section // then reparent to the grandparent. this can happen when we end // up closing a parent section before ??????? for ( int32_t i = 0 ; i < m_numSections ; i++ ) { // get it Section *si = &m_sections[i]; // skip if we are still open-ended if ( si->m_b < 0 ) continue; // get parent Section *sp = si->m_parent; // skip if no parent if ( ! sp ) continue; // skip if parent still open ended if ( sp->m_b < 0 ) continue; // subloop it doagain: // skip if no parent if ( ! sp ) continue; // parent must start before us if ( sp->m_a > si->m_a ) { g_process.shutdownAbort(true); } // . does parent contain our first word? // . it need not fully contain our last word!!! if ( sp->m_a <= si->m_a && sp->m_b > si->m_a ) continue; // if parent is open ended, then it is ok for now if ( sp->m_a <= si->m_a && sp->m_b == -1 ) continue; // get grandparent sp = sp->m_parent; // set si->m_parent = sp; // try again goto doagain; } bool inGbFrame = false; int32_t gbFrameNum = 0; bool inIFrame = false; // // . set Section::m_xmlNameHash for xml tags here // . set Section::m_frameNum and SEC_IN_GBFRAME bit // for ( int32_t i = 0 ; i < m_numSections ; i++ ) { // get it Section *sn = &m_sections[i]; // get it int32_t ws = sn->m_a; const auto &token = (*m_tr)[ws]; // shortcut nodeid_t tid = token.nodeid; if (tid == TAG_IFRAME) { //if the section doesn't have the closing iframe tag then set inIFrame bool hasClosingIframeTag = false; for(int j=sn->m_b-1; j>i; j--) { if((*m_tr)[j].nodeid == (TAG_IFRAME|BACKBIT)) { hasClosingIframeTag = true; break; } } if(!hasClosingIframeTag) inIFrame = true; else if(!inGbFrame) sn->m_flags |= SEC_IN_IFRAME; } else if (tid == (TAG_IFRAME | BACKBIT)) { //never happens how sentences are currently split inIFrame = false; } else if ( tid == TAG_GBFRAME ) { // start or end? gbFrameNum++; inGbFrame = true; } else if ( tid == (TAG_GBFRAME | BACKBIT) ) { inGbFrame = false; } if (inIFrame && !inGbFrame) sn->m_flags |= SEC_IN_IFRAME; // mark it if (inGbFrame) sn->m_gbFrameNum = gbFrameNum; // custom xml tag, hash the tag itself if ( tid != TAG_XMLTAG ) continue; // stop at first space to avoid fields!! const char *p = token.token_start + 1; const char *pend = p + token.token_len; // skip back tags if ( *p == '/' ) continue; // reset hash int64_t xh = 0; // and hash char count unsigned char cnt = 0; // hash till space or / or > for ( ; p < pend ; p++ ) { // stop on space or / or > if ( is_wspace_a(*p) ) break; if ( *p == '/' ) break; if ( *p == '>' ) break; // hash it in xh ^= g_hashtab[cnt++][(unsigned char )*p]; } // if it is a string of the same chars it can be 0 if ( ! xh ) xh = 1; // store that sn->m_xmlNameHash = (int32_t)xh; } //TODO: implement section m_flags inheritance correctly. Currently SEC_IN_IFRAME/SEC_HIDDEN/... are not inherited by child sections. // find any open ended tags and constrain them based on their parent for ( int32_t i = 0 ; i < m_numSections ; i++ ) { // get it Section *si = &m_sections[i]; // get its parent Section *ps = si->m_parent; // if parent is open-ended panic! if ( ps && ps->m_b < 0 ) { g_process.shutdownAbort(true); } // if our parent got constrained from under us, we need // to telescope to a new parent for ( ; ps && ps->m_b >= 0 && ps->m_b <= si->m_a ; ) { ps = ps->m_parent; si->m_parent = ps; } // assume end is end of doc int32_t end = m_tr->size(); // get end of parent if ( ps ) end = ps->m_b; // shrink our section if parent ends before us OR if we // are open ended if ( si->m_b != -1 && si->m_b <= end ) continue; // this might constrain someone's parent such that // that someone no longer can use that parent!! si->m_b = end; // . get our tag type // . use int32_t instead of nodeid_t so we can re-set this // to the xml tag hash if we need to int32_t tid1 = (*m_tr)[si->m_a].nodeid; // use the tag hash if this is an xml tag if ( tid1 == TAG_XMLTAG ) { // we computed this above tid1 = si->m_xmlNameHash; // skip if zero! if ( ! tid1 ) continue; } // must be there to be open ended if ( ! tid1 ) { g_process.shutdownAbort(true); } // NOW, see if within that parent there is actually another // tag after us of our same tag type, then use that to // constrain us instead!! // this hurts <p><table><tr><td><p>.... because it // uses that 2nd <p> tag to constrain si->m_b of the first // <p> tag which is not right! sunsetpromotions.com has that. for ( int32_t j = i + 1 ; j < m_numSections ; j++ ) { // get it Section *sj = &m_sections[j]; // get word start int32_t a = sj->m_a; // skip if ties with us already if ( a == si->m_a ) continue; // stop if out if ( a >= end ) break; // . it must be in the same expanded frame src, if any // . this fixes trulia.com which was ending our html // tag, which was open-ended, with the html tag in // a frame src expansion if ( sj->m_gbFrameNum != si->m_gbFrameNum ) continue; // fix sunsetpromotions.com bug. see above. if ( sj->m_parent != si->m_parent ) continue; // get its tid int32_t tid2 = (*m_tr)[a].nodeid; // use base hash if xml tag if ( tid2 == TAG_XMLTAG ) tid2 = sj->m_xmlNameHash; // must be our tag type! if ( tid2 != tid1 ) continue; // ok end us there instead! si->m_b = a; // stop break; } } // reparent again now that things are closed for ( int32_t i = 0 ; i < m_numSections ; i++ ) { // get it Section *si = &m_sections[i]; // skip if we are still open-ended if ( si->m_b < 0 ) { g_process.shutdownAbort(true); } // get parent Section *sp = si->m_parent; // skip if null if ( ! sp ) continue; // skip if parent still open ended if ( sp->m_b < 0 ) { g_process.shutdownAbort(true); } // subloop it doagain2: // skip if no parent if ( ! sp ) continue; // . does parent contain our first word? // . it need not fully contain our last word!!! if ( sp->m_a <= si->m_a && sp->m_b > si->m_a ) continue; // if parent is open ended, then it is ok for now if ( sp->m_a <= si->m_a && sp->m_b == -1 ) continue; // if parent is open ended, then it is ok for now if ( sp->m_b == -1 ) { g_process.shutdownAbort(true); } // get grandparent sp = sp->m_parent; // set si->m_parent = sp; // try again goto doagain2; } // // // now assign m_sectionPtrs[] which map a word to the first // section that contains it // // Section *dstack[MAXTAGSTACK]; int32_t ns = 0; int32_t j = 0; current = m_rootSection;//&m_sections[0]; Section *next = m_rootSection;//&m_sections[0]; // first print the html lines out for ( int32_t i = 0 ; i < m_nw ; i++ ) { // pop all off the stack that match us for ( ; ns>0 && dstack[ns-1]->m_b == i ; ) { ns--; current = dstack[ns-1]; } // push our current section onto the stack if i equals // its first word # for ( ; next && i == next->m_a ; ) { dstack[ns++] = next; // set our current section to this now current = next; // get next section for setting "next" j++; // if no more left, set "next" to NULL and stop loop if ( j >= m_numSections ) { next=NULL; break; } // grab it next = &m_sections[j]; } // assign m_sectionPtrs[i] = current; } // . addImpliedSections() requires Section::m_baseHash // . set Section::m_baseHash for ( int32_t i = 0 ; i < m_numSections ; i++ ) { // these have to be in order of sn->m_a to work right // because we rely on the parent tag hash, which would not // necessarily be set if we were not sorted, because the // parent section could have SEC_FAKE flag set because it is // a br section added afterwards. Section *sn = &m_sections[i]; // get word start into "ws" int32_t ws = sn->m_a; const auto &token = (*m_tr)[ws]; // shortcut nodeid_t tid = token.nodeid; // sanity check, <a> guys are not sections //if ( tid == TAG_A && // !(sn->m_flags & SEC_SENTENCE) ) { g_process.shutdownAbort(true); } // use a modified tid as the tag hash? int64_t mtid = tid; // custom xml tag, hash the tag itself if ( tid == TAG_XMLTAG ) mtid = hash32 ( token.token_start,token.token_len ); // an unknown tag like <!! ...-> if ( tid == 0 ) mtid = 1; // . if we are a div tag, mod it // . treat the fields in the div tag as // part of the tag hash. // . helps Events.cpp be more precise about // section identification!!!! // . we now do this for TD and TR so Nov 2009 can telescope for // http://10.5.1.203:8000/test/doc.17096238520293298312.html // so the calendar title "Nov 2009" can affect all dates // below the calendar. if ( tid == TAG_DIV || tid == TAG_TD || tid == TAG_TR || tid == TAG_LI || // newmexico.org urls class=xxx tid == TAG_UL || // newmexico.org urls class=xxx tid == TAG_P || // <p class="pstrg"> stjohnscollege.edu tid == TAG_SPAN ) { // get ptr const char *p = token.token_start; // skip < p++; // skip following alnums, that is the tag name for ( ; is_alnum_a(*p) ; p++ ); // scan for "id" or "class" in it // . i had to increase this because we were missing // some stuff causing us to get the wrong implied // sections for // www.guysndollsllc.com/page5/page4/page4.html // causing "The Remains" to be paired up with // "Aug 7, 2010" in an implied section which was // just wrong. it was 20, i made it 100... const char *pend = p + 100; // position ptr unsigned char cnt = 0; // a flag bool skipTillSpace = false; // . just hash every freakin char i guess // . TODO: maybe don't hash "width" for <td><tr> for ( ; *p && *p !='>' && p < pend ; p++ ) { // skip bgcolor= tags because panjea.org // interlaces different colored <tr>s in the // table and i want them to be seen as brother // sections, mostly for the benefit of the // setting of lastBrother1/2 in Events.cpp if ( is_wspace_a(p[0]) && to_lower_a (p[1])=='b' && to_lower_a (p[2])=='g' ) { skipTillSpace = true; continue; } // if not a space continue if ( skipTillSpace ) { if ( ! is_wspace_a(*p) ) continue; skipTillSpace = false; } // do not hash until we get a space if ( skipTillSpace ) continue; // skip if not alnum if ( !is_alnum_a(*p)) continue; // hash it in mtid ^= g_hashtab[cnt++][(unsigned char)*p]; } } // should not have either of these yet! if ( sn->m_flags & SEC_FAKE ) { g_process.shutdownAbort(true); } if ( sn->m_flags & SEC_SENTENCE ) { g_process.shutdownAbort(true); } // sanity check if ( mtid == 0 ) { g_process.shutdownAbort(true); } // . set the base hash, usually just tid // . usually base hash is zero but if it is a br tag // we set it to something special to indicate the number // of br tags in the sequence sn->m_baseHash ^= mtid; // fix this if ( sn == rootSection ) sn->m_baseHash = 1; // fix root section i guess if ( sn->m_baseHash == 0 ) { // fix core on gk21 sn->m_baseHash = 2; } // set this now too WHY? should already be set!!! was // causing the root section to become a title section // because first word was "<title>". then every word in // the doc got SEC_IN_TITLE set and did not get hashed // in XmlDoc::hashBody()... NOR in XmlDoc::hashTitle()!!! if ( sn != rootSection ) sn->m_tagId = tid; } // set up our linked list, the functions below will insert sections // and modify this linked list for ( int32_t i = 0 ; i < m_numSections ; i++ ) { // set it if ( i + 1 < m_numSections ) m_sections[i].m_next = &m_sections[i+1]; if ( i - 1 >= 0 ) m_sections[i].m_prev = &m_sections[i-1]; } // init to -1 to indicate none for ( Section *si = m_rootSection ; si ; si = si->m_next ) { // reset it si->m_firstWordPos = -1; si->m_lastWordPos = -1; si->m_senta = -1; si->m_sentb = -1; } // now set position of first word each section contains for ( int32_t i = 0 ; i < m_nw ; i++ ) { // skip if not alnum word if ( ! (*m_tr)[i].is_alfanum ) continue; // get smallest section containing Section *si = m_sectionPtrs[i]; // do each parent as well for ( ; si ; si = si->m_parent ) { // skip if already had one! if ( si->m_firstWordPos >= 0 ) break; // otherwise, we are it si->m_firstWordPos = i; // . set format hash of it // . do it manually since tagHash not set yet } } // and last word position for ( int32_t i = m_nw - 1 ; i > 0 ; i-- ) { // skip if not alnum word if ( ! (*m_tr)[i].is_alfanum ) continue; // get smallest section containing Section *si = m_sectionPtrs[i]; // do each parent as well for ( ; si ; si = si->m_parent ) { // skip if already had one! if ( si->m_lastWordPos >= 0 ) break; // otherwise, we are it si->m_lastWordPos = i; } } sec_t inFlag = 0; int32_t istack[1000]; sec_t iflags[1000]; int32_t ni = 0; // // now set the inFlags here because the tags might not have all // been closed, making tags like SEC_STYLE overflow from where // they should be... // for ( Section *si = m_rootSection ; si ; si = si->m_next ) { // did we exceed a tag boundary? for ( ; ni>0 && si->m_a >= istack[ni-1] ; ) { // undo flag inFlag &= ~iflags[ni-1]; // pop off ni--; } // get the flag if any into mf sec_t mf = 0; // skip if not special tag id nodeid_t tid = si->m_tagId; if ( tid == TAG_SCRIPT ) mf = SEC_SCRIPT; else if ( tid == TAG_NOSCRIPT) mf = SEC_NOSCRIPT; else if ( tid == TAG_STYLE ) mf = SEC_STYLE; else if ( tid == TAG_SELECT ) mf = SEC_SELECT; else if ( tid == TAG_H1 ) mf = SEC_IN_HEADER; else if ( tid == TAG_H2 ) mf = SEC_IN_HEADER; else if ( tid == TAG_H3 ) mf = SEC_IN_HEADER; else if ( tid == TAG_H4 ) mf = SEC_IN_HEADER; else if ( tid == TAG_TITLE ) mf = SEC_IN_TITLE; else if ( tid == TAG_HEAD ) mf = SEC_IN_HEAD; // accumulate inFlag |= mf; // add in the flags si->m_flags |= inFlag; // skip if nothing special if ( ! mf ) continue; // sanity if ( ni >= 1000 ) { g_process.shutdownAbort(true); } // otherwise, store on stack istack[ni] = si->m_b; iflags[ni] = mf; ni++; } // . now we insert sentence sections // . find the smallest section containing the first and last // word of each sentence and inserts a subsection into that // . we have to be careful to reparent, etc. // . kinda copy splitSections() function // . maybe add an "insertSection()" function??? if ( m_contentType != CT_JS ) { // add sentence sections if ( ! addSentenceSections() ) return true; // this is needed by setSentFlags() setNextSentPtrs(); } // . set m_nextBrother // . we call this now to aid in setHeadingBit() and for adding the // implied sections, but it is ultimately // called a second time once all the new sections are inserted setNextBrotherPtrs ( false ); // . set SEC_HEADING bit // . need this before implied sections setHeadingBit (); setTagHashes(); // // // TODO TODO // // TAKE OUT THESE SANITY CHECKS TO SPEED UP!!!!!! // // // clear this bool isHidden = false; int32_t startHide = 0x7fffffff; int32_t endHide = 0 ; // now that we have closed any open tag, set the SEC_HIDDEN bit // for all sections that are like <div style=display:none> for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) { // set m_lastSection so we can scan backwards m_lastSection = sn; // set this int32_t wn = sn->m_a; // stop hiding it? if ( isHidden ) { // turn it off if not contained if ( wn >= endHide ) isHidden = false; else sn->m_flags |= SEC_HIDDEN; } // get tag id nodeid_t tid = sn->m_tagId; // is div, td or tr tag start? if ( tid!=TAG_DIV && tid!=TAG_TD && tid!=TAG_TR && tid!=TAG_UL && tid!=TAG_SPAN) continue; // . if we are a div tag, mod it // . treat the fields in the div tag as // part of the tag hash. // . helps Events.cpp be more precise about // section identification!!!! // . we now do this for TD and TR so Nov 2009 can telescope for // http://10.5.1.203:8000/test/doc.17096238520293298312.html // so the calendar title "Nov 2009" can affect all dates // below the calendar. // get the style tag in there and check it for "display: none"! int32_t slen = (*m_tr)[wn].token_len; const char *s = (*m_tr)[wn].token_start; const char *send = s + slen; // check out any div tag that has a style const char *style = gb_strncasestr(s,slen,"style=") ; if ( ! style ) continue; // . check for hidden // . if no hidden tag assume it is UNhidden // . TODO: later push & pop on stack const char *ds = gb_strncasestr(style,send-style,"display:"); // if display:none not found turn off SEC_HIDDEN if ( ! ds || ! gb_strncasestr(s,slen,"none") ) { // turn off the hiding isHidden = false; // off in us too sn->m_flags &= ~SEC_HIDDEN; continue; } // mark all sections in this with the tag isHidden = true; // on in us sn->m_flags |= SEC_HIDDEN; // stop it after this word for sure if ( sn->m_b > endHide ) endHide = sn->m_b; if ( sn->m_a < startHide ) startHide = sn->m_a; } // now set the content hash of each section for ( int32_t i = 0 ; i < m_nw ; i++ ) { // must be an alnum word if ( ! (*m_tr)[i].is_alfanum ) continue; // get its section m_sectionPtrs[i]->m_contentHash64 ^= (*m_tr)[i].token_hash; // fix "smooth smooth!" if ( m_sectionPtrs[i]->m_contentHash64 == 0 ) m_sectionPtrs[i]->m_contentHash64 = 123456; } // now set SEC_NOTEXT flag if content hash is zero! for ( int32_t i = 0 ; i < m_numSections ; i++ ) { // get it Section *sn = &m_sections[i]; // skip if had text if ( sn->m_contentHash64 ) continue; // no text! sn->m_flags |= SEC_NOTEXT; } // // set Section::m_alnumPosA/m_alnumPosB // int32_t alnumCount2 = 0; for ( int32_t i = 0 ; i < m_numSections ; i++ ) { // get it Section *sn = &m_sections[i]; // skip if had text if ( ! ( sn->m_flags & SEC_SENTENCE ) ) continue; // save this sn->m_alnumPosA = alnumCount2; // scan the wids of the whole sentence, which may not // be completely contained in the "sn" section!! int32_t a = sn->m_senta; int32_t b = sn->m_sentb; for ( int32_t j = a ; j < b ; j++ ) { // must be an alnum word if ( ! (*m_tr)[j].is_alfanum ) continue; // alnumcount alnumCount2++; } // so we contain the range [a,b), typical half-open interval sn->m_alnumPosB = alnumCount2; // sanity check if ( sn->m_alnumPosA == sn->m_alnumPosB ){g_process.shutdownAbort(true);} // propagate through parents Section *si = sn->m_parent; // do each parent as well for ( ; si ; si = si->m_parent ) { // skip if already had one! if ( si->m_alnumPosA > 0 ) break; // otherwise, we are it si->m_alnumPosA = sn->m_alnumPosA; } } // propagate up alnumPosB now for ( int32_t i = 0 ; i < m_numSections ; i++ ) { // get it Section *sn = &m_sections[i]; // skip if had text if ( ! ( sn->m_flags & SEC_SENTENCE ) ) continue; // propagate through parents Section *si = sn->m_parent; // do each parent as well for ( ; si ; si = si->m_parent ) { // skip if already had one! no, because we need to // get the MAX of all of our kids!! //if ( si->m_alnumPosB > 0 ) break; // otherwise, we are it si->m_alnumPosB = sn->m_alnumPosB; } } /////////////////////////////////////// // // now set Section::m_listContainer // // . a containing section is a section containing // MULTIPLE smaller sections // . so if a section has a containing section set its m_listContainer // to that containing section // . we limit this to sections that directly contain text for now // . Events.cpp::getRegistrationTable() uses m_nextBrother so we // need this now!! // /////////////////////////////////////// setNextBrotherPtrs ( true ); /////////////////////////////////////// // // now set SEC_MENU and SEC_LINK_TEXT flags // /////////////////////////////////////// setMenus(); //verifySections(); return true; } // . PROBLEM: because we ignore non-breaking tags we often get sections // that are really not sentences, but we are forced into them because // we cannot split span or bold tags // i.e. "<div>This is <b>a sentence. And this</b> is a sentence.</div>" // forces us to treat the entire div tag as a sentence section. // . i did add some logic to ignore those (the two for-k loops below) but then // Address.cpp cores because it expects every alnum word to be in a sentence // . now make sure to shrink into our current parent if we would not lose // alnum chars!! fixes sentence flip flopping // . returns false and sets g_errno on error bool Sections::addSentenceSections ( ) { sec_t badFlags = SEC_STYLE | SEC_SCRIPT | SEC_SELECT | SEC_HIDDEN | SEC_NOSCRIPT; // shortcut Section **sp = m_sectionPtrs; static bool s_init = false; static int64_t h_in; static int64_t h_at; static int64_t h_for; static int64_t h_to; static int64_t h_on; static int64_t h_under; static int64_t h_with; static int64_t h_along; static int64_t h_from; static int64_t h_by; static int64_t h_of; static int64_t h_some; static int64_t h_the; static int64_t h_and; static int64_t h_a; static int64_t h_http; static int64_t h_https; static int64_t h_room; static int64_t h_rm; static int64_t h_bldg; static int64_t h_building; static int64_t h_suite; static int64_t h_ste; static int64_t h_tags; if ( ! s_init ) { s_init = true; h_tags = hash64n("tags"); h_in = hash64n("in"); h_the = hash64n("the"); h_and = hash64n("and"); h_a = hash64n("a"); h_at = hash64n("at"); h_for = hash64n("for"); h_to = hash64n("to"); h_on = hash64n("on"); h_under = hash64n("under"); h_with = hash64n("with"); h_along = hash64n("along"); h_from = hash64n("from"); h_by = hash64n("by"); h_of = hash64n("of"); h_some = hash64n("some"); h_http = hash64n("http"); h_https = hash64n("https"); h_room = hash64n("room"); h_rm = hash64n("rm"); h_bldg = hash64n("bldg"); h_building = hash64n("building"); h_suite = hash64n("suite"); h_ste = hash64n("ste"); } // need D_IS_IN_URL bits to be valid m_bits->setInUrlBits ( ); // is the abbr. a noun? like "appt." bool hasWordAfter = false; for ( int32_t i = 0 ; i < m_nw ; i++ ) { // need a wid if ( ! (*m_tr)[i].is_alfanum ) continue; // get section we are currently in Section *cs = m_sectionPtrs[i]; // skip if its bad! i.e. style or script or whatever if ( cs->m_flags & badFlags ) continue; // set that int64_t prevWid = (*m_tr)[i].token_hash; int64_t prevPrevWid = 0LL; // flag int32_t lastWidPos = i;//-1; bool lastWasComma = false; nodeid_t includedTag = -2; int32_t lastbr = -1; bool endOnBr = false; bool endOnBold = false; bool capped = true; int32_t upper = 0; int32_t numAlnums = 0; // scan for sentence end int32_t j; for ( j = i ; j < m_nw ; j++ ) { const auto &token2 = (*m_tr)[j]; // skip words if ( token2.is_alfanum ) { // prev prev prevPrevWid = prevWid; // assume not a word like "vs." hasWordAfter = false; // set prev prevWid = token2.token_hash; lastWidPos = j; lastWasComma = false; endOnBr = false; endOnBold = false; numAlnums++; // skip if stop word and need not be // capitalized if ( m_bits->queryBits(j) & D_IS_STOPWORD ) continue; if ( token2.token_len <= 1 ) continue; if ( is_digit(token2.token_start[0]) ) continue; if ( !is_upper_utf8(token2.token_start)) capped=false; else upper++; continue; } // tag? if ( token2.nodeid ) { // shortcut nodeid_t tid = token2.nodeid & BACKBITCOMP; // treat nobr as breaking to fix ceder.net // which has it after the group title if ( tid == TAG_NOBR ) break; if ( tid == TAG_BR ) endOnBr = true; if ( tid == TAG_B ) endOnBold = true; // a </b><br> is usually like a header if ( capped && upper && endOnBr && endOnBold ) break; // if it is <span style="display:none"> or // div or whatever, that is breaking! // fixes http://chuckprophet.com/gigs/ if ( (tid == TAG_DIV || tid == TAG_SPAN ) && token2.token_len > 14 && strncasestr(token2.token_start,"display:none", token2.token_len) ) break; // ok, treat span as non-breaking for a second if ( tid == TAG_SPAN ) continue; // mark this if ( tid == TAG_BR ) lastbr = j; // // certain tags like span and br sometimes // do and sometimes do not break a sentence. // so by default assume they do, but check // for certain indicators... // if ( tid == TAG_SPAN || tid == TAG_BR || // fixes guysndollsllc.com: // causes core dump: tid == TAG_P || // villr.com // fixes americantowns.com tid == TAG_DIV ) { // if nothing after, moot point if ( j+1 >= m_nw ) break; // if we already included this tag // then keep including it. but some // span tags will break and some won't // even when in or around the same // sentence. see that local.yahoo.com // food delivery services url for // the first street address, // 5013 Miramar if ( includedTag == tid && (token2.nodeid & BACKBIT) ) { // reset it in case next // <span> tag is not connective includedTag = -2; continue; } // if we included this tag type // as a front tag, then include its // back tag in sentence as well. // fixes nonamejustfriends.com // which has a span tag in sentence: // ".. Club holds a <span>FREE</span> // Cruise Night..." and we allow // "<span>" because it follows "a", // but we were breaking on </span>! if ( !(token2.nodeid&BACKBIT)) includedTag = tid; // if prev punct was comma and not // an alnum word if ( lastWasComma ) continue; // get punct words bookcasing this tag if ( ! (*m_tr)[j+1].is_alfanum && ! (*m_tr)[j+1].nodeid && has_char((*m_tr)[j+1].token_start,(*m_tr)[j+1].token_end(),',') ) continue; // if prevwid is like "vs." then // that means keep going even if // we hit one of these tags. fixes // "new york knicks vs.<br>orlando // magic" if ( hasWordAfter ) continue; // if first alnum word after tag // is lower case, that is good too int32_t aw = j + 1; int32_t maxaw = j + 12; if ( maxaw > m_nw ) maxaw = m_nw; for ( ; aw < maxaw ; aw++ ) if ( (*m_tr)[aw].is_alfanum ) break; bool isLower = false; if ( aw < maxaw && is_lower_utf8((*m_tr)[aw].token_start) ) isLower = true; // http or https is not to be // considered as such! fixes // webnetdesign.com from getting // sentences continued by an http:// // url below them. if ( aw < maxaw && ((*m_tr)[aw].token_hash == h_http || (*m_tr)[aw].token_hash == h_https) ) isLower = false; if ( tid == TAG_P && isLower && // Oscar G<p>along with xxxx (*m_tr)[aw].token_hash != h_along && (*m_tr)[aw].token_hash != h_with ) isLower = false; if ( isLower ) continue; // get pre word, preopsitional // phrase starter? if ( prevWid == h_in || prevWid == h_the || prevWid == h_and || // fix for ending on "(Room A)" (prevWid == h_a && prevPrevWid != h_rm && prevPrevWid != h_room && prevPrevWid != h_bldg && prevPrevWid != h_building && prevPrevWid != h_suite && prevPrevWid != h_ste ) || prevWid == h_for || prevWid == h_to || prevWid == h_on || prevWid == h_under || prevWid == h_with || prevWid == h_from || prevWid == h_by || prevWid == h_of || // "some ... Wednesdays" prevWid == h_some || prevWid == h_at ) continue; } // seems like span breaks for meetup.com // et al and not for abqtango.com maybe, we // need to download the css??? or what??? // by default span tags do not seem to break // the line but ppl maybe configure them to if ( tid == TAG_SPAN ) break; // if like <font> ignore it if ( ! isBreakingTagId(token2.nodeid) ) continue; // only break on xml tags if in rss feed to // fix <st1:State w:st="on">Arizona</st1> // for gwair.org if ( tid==TAG_XMLTAG && !m_isRSSExt) continue; // otherwise, stop! break; } // skip simple spaces for speed if ( token2.token_len == 1 && is_wspace_a(token2.token_start[0])) continue; // do not allow punctuation that is in a url // to be split up or used as a splitter. we want // to keep the full url intact. if ( j > i && j+1 < m_nw && (m_bits->queryBits(j-1) & D_IS_IN_URL) && (m_bits->queryBits(j ) & D_IS_IN_URL) && (m_bits->queryBits(j+1) & D_IS_IN_URL) ) continue; // was last punct containing a comma? lastWasComma = false; // scan the punct chars, stop if we hit a sent breaker const char *p = token2.token_start; const char *pend = p + token2.token_len; for ( ; p < pend ; p++ ) { // punct word... if ( *p == '.' ) break; if ( *p == ',' ) lastWasComma =true; // allow this too for now... no... if ( *p == ';' ) break; // now hyphen breaks, mostly for stuff // in title tags like dukecityfix.com if ( sp[j]->m_tagId == TAG_TITLE && *p == '-' && is_wspace_a(p[-1]) && is_wspace_a(p[+1]) && lastWidPos >= 0 && ! m_isRSSExt && j+1<m_nw && (*m_tr)[j+1].is_alfanum && //( ! (bb[lastWidPos] & D_IS_IN_DATE) || // ! (bb[j+1] & D_IS_IN_DATE) ) && // fix for $10 - $12 ( ! is_digit ( (*m_tr)[lastWidPos].token_start[0]) || ! is_digit ( (*m_tr)[j+1].token_start[0]) ) ) break; // . treat colon like comma now // . for unm.edu we have // "Summer Hours: March 15 - Oct15: // 8 am. Mon - Fri, 7:30 am - 10 am Sun., // Winter Hours: Oct. 15 - March 15: // 8 am., seven days a week" // . and we don't want "winter hours" being // toplogically closer to the summer hours // . that is, the colon is a stronger binder // than the comma? // . but for villr.com Hours: May-Aug.. gets // made into two sentences and Hours is // seen as a heading section and causes // addImpliedSections() to be wrong. // . why not the colon? if ( *p == ':' ) { // Tags: music,concert,fun if ( prevWid == h_tags && // just Tags: so far in sentence j == i ) break; // a "::" is used in breadcrumbs, // so break on that. // fixes "Dining :: Visit :: // Cal Performacnes" title if ( p[1] == ':' ) break; // if "with" preceeds, allow if ( prevWid == h_with ) continue; // or prev word was tag! like // "blah</b>:..." bool tagAfter = (j-1>=0 && (*m_tr)[j-1].nodeid); // do not allow if next word is tag bool tagBefore = (j+1<m_nw && (*m_tr)[j+1].nodeid); // do not allow // "<br>...:<br>" or // "<br>...<br>:" or // since such things are usually // somewhat like headers. isolated // lines ending on a colon. // should fix st. martin's center // for unm.edu "Summer Hours: ..." if ( lastbr >= 0 && ( tagBefore || tagAfter ) ) { // end sentence there then j = lastbr; break; } if ( tagBefore ) break; if ( tagAfter ) break; // for now allow it! continue; } // . special hyphen // . breaks up title for peachpundit.com // so we get better event title generation // since peachpundit.com will be a reepat sec // . BUT it did not work! if ( p[0] == (char)-30 && p[1] == (char)-128 && p[2] == (char)-108 ) break; // this for sure // "Home > Albuquerque Events > Love Song ..." if ( *p == '>' ) break; if ( *p == '!' ) break; if ( *p == '?' ) break; if ( *p == '|' ) break; // bullets if ( p[0] == (char)226 && p[1] == (char)128 && p[2] == (char)162 ) break; redo: continue; } // if none, keep going if ( p == pend ) continue; // if an alnum char follows the ., it is ok // probably a hostname or ip or phone # if ( is_alnum_utf8(p+1) && // "venue:ABQ Sq Dance Center..." for // americantowns.com has no space after the colon! *p !=':' ) goto redo; // if abbreviation before we are ok too if ( *p == '.' && isAbbr(prevWid,&hasWordAfter) ) { // but the period may serve a double purpose // to end the abbr and terminate the sentence // if the word that follows is capitalized, // and if the abbr is a lower-case noun. // // if abbr is like "vs" then do not end sentenc if ( hasWordAfter ) goto redo; // set "next" to next alnum word after us int32_t next = j+1; int32_t max = next + 10; if ( max > m_nw ) max = m_nw; for ( ; next < max ; next++ ) { if ( ! (*m_tr)[next].is_alfanum ) continue; break; } // was previous word/abbr capitalized? // if so, assume period does not end sentence. if ( is_capitalized_utf8((*m_tr)[lastWidPos].token_start) ) goto redo; // if next word is NOT capitalized, assume // period does not end sentence... if ( next < max && ! is_capitalized_utf8((*m_tr)[next].token_start) ) goto redo; // otherwise, abbr is NOT capitalized and // next word IS capitalized, so assume the // period does NOT end the sentence } // fix "1. library name" for cabq.gov if ( *p == '.' && lastWidPos == i) { auto const &t = (*m_tr)[lastWidPos]; if(is_ascii_digit_string(t.token_start, t.token_end())) goto redo; } // ok, stop otherwise break; } // do not include tag at end. try to fix sentence flip flop. for ( ; j > i ; j-- ) // stop when we just contain the last word if ( (*m_tr)[j-1].is_alfanum ) break; // make our sentence endpoints now int32_t senta = i; // make the sentence defined by [senta,sentb) where sentb // defines a half-open interval like we do for almost // everything else int32_t sentb = j; // update i for next iteration i = sentb - 1; // crap, but now sentences intersect with our tag-based // sections because they can now split tags because of websites // like aliconference.com and abqtango.com whose sentences // do not align with the tag sections. therefore we introduce // the SEC_TOP_SPLIT and SEC_BOTTOM_SPLIT to indicate // that the section is a top/bottom piece of a split sentence. // if both bits are set we assume SEC_MIDDLE_SPLIT. // then we set the Section::m_senta and m_sentb to // indicate the whole sentence of which it is a split. // but the vast majority of the time m_senta and m_sentb // will equal m_firstWordPos and m_lastWordPos respectively. // then, any routine that // so scan the words in the sentence and as we scan we have // to determine the parent section we inserting the sentence // into as a child section. //Section *parent = NULL; int32_t start = -1; Section *pp; int32_t lastk = 0; Section *splitSection = NULL; Section *lastGuy = NULL; for ( int32_t k = senta ; k <= sentb ; k++ ) { // add final piece if ( k == sentb ) { // stop i no final piece if ( start == -1 ) break; // otherwise, add it goto addit; } // need a real alnum word if ( ! (*m_tr)[k].is_alfanum ) continue; // get his parent pp = m_sectionPtrs[k]; // set parent if need to //if ( ! parent ) parent = pp; // and start sentence if need to if ( start == -1 ) start = k; // if same as exact section as last guy, save some time if ( pp == lastGuy ) pp = NULL; // store it lastGuy = pp; // . i'd say blow up "pp" until its contains "start" // . but if before it contains start it breaches // [senta,sentb) then we have to cut things short for ( ; pp ; pp = pp->m_parent ) { // we now have to split section "pp" // when adding the sentence section. // once we have such a section we // cannot use a different parent... if ( pp->m_firstWordPos < start || pp->m_lastWordPos >= sentb ) { // set it if ( ! splitSection ) splitSection =pp; // WE ARE ONLY ALLOWED TO SPLIT ONE // SECTION ONLY... if ( pp != splitSection) goto addit; break; } // keep telescoping until "parent" contains // [senta,k] , and we already know that it // contains k because that is what we set it to //if ( pp->m_a <= senta ) break; } // mark it if ( (*m_tr)[k].is_alfanum ) lastk = k; // ok, keep chugging continue; // add the final piece if we go to this label addit: // use this flag int32_t bh = BH_SENTENCE; // determine parent section, smallest section // containing [start,lastk] Section *parent = m_sectionPtrs[start]; for ( ; parent ; parent = parent->m_parent ) { // stop if contains lastk if ( parent->m_b > lastk ) break; } // // for "<span>Albuquerque</span>, New Mexico" // "start" points to "Albuquerque" but needs to // point to the "<span>" so its parent is "parent" int32_t adda = start; int32_t addb = lastk; // need to update "start" to so its parent is the new // "parent" now so insertSubSection() does not core for ( ; adda >= 0 ; ) { // stop if we finally got the right parent if ( m_sectionPtrs[adda]==parent ) break; // or if he's a tag and his parent // is "parent" we can stop. // i.e. STOP on a proper subsection of // the section containing the sentence. if ( m_sectionPtrs[adda]->m_parent==parent && m_sectionPtrs[adda]->m_a == adda ) break; // backup adda--; // check if ( adda < 0 ) break; // how can this happen? if ( (*m_tr)[adda].is_alfanum ) { g_process.shutdownAbort(true); } } // sanity if ( adda < 0 ) { g_process.shutdownAbort(true); } // same for right endpoint for ( ; addb < m_nw ; ) { // stop if we finally got the right parent if ( m_sectionPtrs[addb]==parent ) break; // get it Section *sp = m_sectionPtrs[addb]; // come back up here in the case of a section // sharing its Section::m_b with its parent subloop: // or if he's a tag and his parent // is "parent" we can stop if ( sp->m_parent==parent && sp->m_b == addb+1 ) break; // or if we ran into a brother section // that does not contain the sentence... // fix core dump for webnetdesign.com whose // sentence consisted of 3 sections from // A=7079 to B=7198. but now i am getting rid // of allowing a lower case http(s):// on // a separate line to indicate that the // sentence continues... so we will not have // this sentence anymore in case you are // wondering why it is not there any more. if ( sp->m_parent==parent && sp->m_a == addb ) { // do not include that brother's tag addb--; break; } // when we have bad tag formations like for // http://gocitykids.parentsconnect.com/catego // ry/buffalo-ny-usa/places-to-go/tourist-stops // like <a><b>...</div> with no ending </a> or // </b> tags then we have to get the parent // of the parent as long as its m_b is the // same and check that before advancing addb // otherwise we can miss the parent section // that we want! (this is because the kid // sections share the same m_b as their // parent because of they have no ending tag) if ( sp->m_parent && sp->m_parent->m_b == sp->m_b ) { sp = sp->m_parent; goto subloop; } // advance addb++; // stop if addb if ( addb >= m_nw ) break; // how can this happen? if ( (*m_tr)[addb].is_alfanum ) { g_process.shutdownAbort(true); } } // sanity if ( addb >= m_nw ) { g_process.shutdownAbort(true); } // ok, now add the split sentence Section *is =insertSubSection(adda,addb+1,bh); // panic? if ( ! is ) break; // set sentence flag on it is->m_flags |= SEC_SENTENCE; // . set this // . sentence is from [senta,sentb) is->m_senta = senta;//start; is->m_sentb = sentb;//k; // stop if that was it if ( k == sentb ) break; // go on to next fragment then start = -1; parent = NULL; splitSection = NULL; lastGuy = NULL; // redo this same k k--; } } int32_t inSentTil = 0; Section *lastSent = NULL; // get the section of each word. if not a sentence section then // make its m_sentenceSection point to its parent that is a sentence for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) { // need sentence if ( ( sk->m_flags & SEC_SENTENCE ) ) { inSentTil = sk->m_b; lastSent = sk; sk->m_sentenceSection = sk; continue; } // skip if outside of the last sentence we had if ( sk->m_a >= inSentTil ) continue; // we are in that sentence sk->m_sentenceSection = lastSent; } return true; } Section *Sections::insertSubSection ( int32_t a, int32_t b, int32_t newBaseHash ) { // try to realloc i guess. should keep ptrs in tact. if ( m_numSections >= m_maxNumSections ) { g_errno = EDOCBADSECTIONS; return NULL; } // // make a new section // Section *sk = &m_sections[m_numSections]; // clear memset ( sk , 0 , sizeof(Section) ); // inc it m_numSections++; // now set it sk->m_a = a; sk->m_b = b; // don't mess this up! if ( m_lastSection && a > m_lastSection->m_a ) m_lastSection = sk; // the base hash (delimeter hash) hack sk->m_baseHash = 0;// dh; ???????????????????? // get first section containing word #a Section *si = m_sectionPtrs[a]; for ( ; si ; si = si->m_prev ) { // we become his child if this is true if ( si->m_a < a ) { break; } // if he is bigger (or equal) we become his child // and are after him if ( si->m_a == a && si->m_b >= b ) { break; } } // . try using section before us if it is contained by "si" // . like in the case when word #a belongs to the root section // and there are thousands of child sections of the root before "a" // we really want to get the child section of the root before us // as the prev section, "si", otherwise the 2nd for loop below here // will hafta loop through thousands of sibling sections // . this will fail if word before a is part of our same section // . what if we ignored this for now and set m_sectionPtrs[a] to point // to the newly inserted section, then when done adding sentence // sections we scanned all the words, keeping track of the last // html section we entered and used that to insert the sentence sections if ( m_lastAdded && si && m_lastAdded->m_a > si->m_a && m_lastAdded->m_a < a ) { si = m_lastAdded; } // crap we may have // "<p> <strong>hey there!</strong> this is another sentence.</p>" // then "si" will be pointing at the "<p>" section, and we will // not get the "<strong>" section as the "prev" to sk, which we should! // that is where sk is the "this is another sentence." sentence // section. so to fix that try iterating over si->m_next to get si to // be closer to sk. for ( ; si ; si = si->m_next ) { // stop if no more eavailable if ( ! si->m_next ) break; // stop if would break if ( si->m_next->m_a > a ) break; // if it gets closer to us without exceeding us, use it if ( si->m_next->m_a < a ) continue; // if tied, check b. if it contains us, go to it if ( si->m_next->m_b >= b ) continue; // otherwise, stop break; } // set this m_lastAdded = si; // a br tag can split the very first base html tag like for // mapsandatlases.org we have // "<html>...</html> <br> ...." so the br tag splits the first // section! // SO we need to check for NULL si's! if ( ! si ) { // skip this until we figure it out m_numSections--; g_process.shutdownAbort(true); return NULL; } else { // insert us into the linked list of sections if ( si->m_next ) si->m_next->m_prev = sk; sk->m_next = si->m_next; sk->m_prev = si; si->m_next = sk; } // now set the parent Section *parent = m_sectionPtrs[a]; // expand until it encompasses both a and b for ( ; ; parent = parent->m_parent ) { if ( parent->m_a > a ) continue; if ( parent->m_b < b ) continue; break; } // now we assign the parent to you sk->m_parent = parent; // sometimes an implied section is a subsection of a sentence! // like when there are a lot of brbr (double br) tags in it... sk->m_sentenceSection = parent->m_sentenceSection; // take out certain flags from parent sec_t flags = parent->m_flags; flags &= ~SEC_SENTENCE; // add in fake flags |= SEC_FAKE; // flag it as a fake section sk->m_flags = flags ; // need this sk->m_baseHash = newBaseHash; // reset these sk->m_firstWordPos = -1; sk->m_lastWordPos = -1; sk->m_alnumPosA = -1; sk->m_alnumPosB = -1; sk->m_senta = -1; sk->m_sentb = -1; // set sk->m_firstWordPos for ( int32_t i = a ; i < b ; i++ ) { // and first/last word pos if ( ! (*m_tr)[i].is_alfanum ) continue; // mark this sk->m_firstWordPos = i; break; } // set sk->m_lastWordPos for ( int32_t i = b-1 ; i >= a ; i-- ) { // and first/last word pos if ( ! (*m_tr)[i].is_alfanum ) continue; // mark this sk->m_lastWordPos = i; break; } // // to speed up scan the words in our inserted section, usually // a sentence section i guess, because our parent can have a ton // of children sections!! // for ( int32_t i = a ; i < b ; i++ ) { // get current parent of that word Section *wp = m_sectionPtrs[i]; // if sentence section does NOT contain the word's current // section then the sentence section becomes the new section // for that word. if ( ! sk->strictlyContains ( wp ) ) { // now if "wp" is like a root, then sk becomes the kid m_sectionPtrs[i] = sk; // our parent is wp sk->m_parent = wp; continue; } // we gotta blow up wp until right before it is bigger // than "sk" and use that for ( ; wp->m_parent ; wp = wp->m_parent ) // this could be equal to, not just contains // otherwise we use strictlyContains() if ( wp->m_parent->contains(sk) ) break; // already parented to us? if ( wp->m_parent == sk ) continue; // sentence's parent is now wp's parent sk->m_parent = wp->m_parent; // and we become wp's parent wp->m_parent = sk; // sanity check if ( wp->m_b > sk->m_b ) { g_process.shutdownAbort(true); } if ( wp->m_a < sk->m_a ) { g_process.shutdownAbort(true); } } return sk; } // this is a function because we also call it from addImpliedSections()! void Sections::setNextBrotherPtrs ( bool setContainer ) { // clear out for ( Section *si = m_rootSection ; si ; si = si->m_next ) { si->m_nextBrother = NULL; si->m_prevBrother = NULL; } for ( Section *si = m_rootSection ; si ; si = si->m_next ) { Section *sj = NULL; // get word after us int32_t wn = si->m_b; int32_t nw2 = m_nw; // if we hit a word in our parent.. then increment wn // PROBLEM "<root><t1>hey</t1> blah blah blah x 1 mill</root>" // would exhaust the full word list when si is the "t1" // section. Section *j2 = si->m_next; if ( j2 && j2->m_a >= si->m_b ) { sj = j2; nw2 = 0; } // try one more ahead for things like so we don't end up // setting sj to the "t2" section as in: // "<root><t1><t2>hey</t2></t1> ...." if ( ! sj && j2 ) { // try the next section then j2 = j2->m_next; // set "sj" if its a potential brother section if ( j2 && j2->m_a >= si->m_b ) { sj = j2; nw2 = 0; } } // ok, try the next word algo approach for ( ; wn < nw2 ; wn++ ) { sj = m_sectionPtrs[wn]; if ( sj->m_a >= si->m_b ) break; } // bail if none if ( wn >= m_nw ) continue; // telescope up until brother if possible for ( ; sj ; sj = sj->m_parent ) if ( sj->m_parent == si->m_parent ) break; // give up? if ( ! sj || sj->m_parent != si->m_parent ) continue; // sanity check if ( sj->m_a < si->m_b && sj->m_tagId != TAG_TC && si->m_tagId != TAG_TC ) { g_process.shutdownAbort(true); } // set brother si->m_nextBrother = sj; // set his prev then sj->m_prevBrother = si; // sanity check if ( sj->m_parent != si->m_parent ) { g_process.shutdownAbort(true); } // sanity check if ( sj->m_a < si->m_b && sj->m_tagId != TAG_TC && si->m_tagId != TAG_TC ) { g_process.shutdownAbort(true); } // do more? if ( ! setContainer ) continue; // telescope this Section *te = sj; // telescope up until it contains "si" for ( ; te && te->m_a > si->m_a ; te = te->m_parent ); // only update list container if smaller than previous if ( ! si->m_listContainer ) si->m_listContainer = te; else if ( te && te->m_a > si->m_listContainer->m_a ) si->m_listContainer = te; if ( ! sj->m_listContainer ) sj->m_listContainer = te; else if ( te && te->m_a > sj->m_listContainer->m_a ) sj->m_listContainer = te; // now } } void Sections::setNextSentPtrs ( ) { // kinda like m_rootSection m_firstSentence = NULL; Section *finalSec = NULL; // scan the sentence sections and number them to set m_sentNum for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) { // record final section finalSec = sk; // need sentence if ( ! ( sk->m_flags & SEC_SENTENCE ) ) { continue; } // first one? if ( ! m_firstSentence ) { m_firstSentence = sk; } } Section *lastSent = NULL; // now set "m_nextSentence" of each section for ( Section *sk = finalSec ; sk ; sk = sk->m_prev ) { // set this sk->m_nextSentence = lastSent; // need sentence if ( ! ( sk->m_flags & SEC_SENTENCE ) ) { continue; } // we are the sentence now lastSent = sk; } } #define TABLE_ROWS 25 void Sections::printFlags(SafeBuf *sbuf, const Section *sn) { sec_t f = sn->m_flags; if ( f & SEC_HEADING ) sbuf->safePrintf("heading "); if ( f & SEC_MENU_SENTENCE ) sbuf->safePrintf("menusentence " ); if ( f & SEC_MENU ) sbuf->safePrintf("ismenu " ); if ( f & SEC_MENU_HEADER ) sbuf->safePrintf("menuheader " ); if ( f & SEC_LINK_TEXT ) sbuf->safePrintf("linktext " ); if ( f & SEC_PLAIN_TEXT ) sbuf->safePrintf("plaintext " ); if ( f & SEC_FAKE ) { if ( sn->m_baseHash == BH_BULLET ) sbuf->safePrintf("bulletdelim "); else if ( sn->m_baseHash == BH_SENTENCE ) sbuf->safePrintf("<b>sentence</b> "); else if ( sn->m_baseHash == BH_IMPLIED ) sbuf->safePrintf("<b>impliedsec</b> "); else { g_process.shutdownAbort(true); } } if ( f & SEC_NOTEXT ) sbuf->safePrintf("notext "); if ( f & SEC_SCRIPT ) sbuf->safePrintf("inscript "); if ( f & SEC_NOSCRIPT ) sbuf->safePrintf("innoscript "); if ( f & SEC_STYLE ) sbuf->safePrintf("instyle "); if ( f & SEC_HIDDEN ) sbuf->safePrintf("indivhide "); if ( f & SEC_SELECT ) sbuf->safePrintf("inselect "); if ( f & SEC_IN_HEAD ) sbuf->safePrintf("inhead "); if ( f & SEC_IN_TITLE ) sbuf->safePrintf("intitle "); if ( f & SEC_IN_HEADER ) sbuf->safePrintf("inheader "); if ( f & SEC_IN_IFRAME ) sbuf->safePrintf("iniframe "); } bool Sections::isHardSection(const Section *sn) const { int32_t a = sn->m_a; // . treat this as hard... kinda like a div section... // fixes gwair.org date from stealing address of another date // because the span tags are fucked up... // . crap, no this prevents publicbroadcasting.net and other urls // from telescoping to header dates they need to telescope to. // the header dates are in span tags and if that is seen as a hard // section bad things happen //if ( m_tids[a] == TAG_SPAN ) return true; if ( ! isBreakingTagId((*m_tr)[a].nodeid) ) { // . if first child is hard that works! // . fixes "<blockquote><p>..." for collectorsguide.com if ( sn->m_next && sn->m_next->m_tagId && // fix "blah blah<br>blah blah" for sentence sn->m_next->m_tagId != TAG_BR && sn->m_next->m_a < sn->m_b && isBreakingTagId(sn->m_next->m_tagId) ) return true; // otherwise, forget it! return false; } // trumba.com has sub dates in br-based implied sections that need // to telescope to their parent above if ( (*m_tr)[a].nodeid == TAG_BR ) return false; if ( sn->m_flags & SEC_SENTENCE ) return false; // xml tag exception for gwair.org. treat <st1:Place>... as soft if ( ((*m_tr)[a].nodeid & BACKBITCOMP) == TAG_XMLTAG && ! m_isRSSExt ) return false; return true; } bool Sections::setMenus ( ) { // . this just returns if already set // . sets Bits::m_bits[x].m_flags & D_IN_LINK if its in a link // . this bits array is 1-1 with the words m_bits->setInLinkBits(this); sec_t flag; // set SEC_PLAIN_TEXT and SEC_LINK_TEXT for all sections for ( int32_t i = 0 ; i < m_nw ; i++ ) { // need alnum word if ( ! (*m_tr)[i].is_alfanum ) continue; // get our flag if ( m_bits->queryBits(i) & D_IN_LINK ) flag = SEC_LINK_TEXT; else flag = SEC_PLAIN_TEXT; // get section ptr Section *sk = m_sectionPtrs[i]; // loop for sk for ( ; sk ; sk = sk->m_parent ) { // skip if already set if ( sk->m_flags & flag ) break; // set it sk->m_flags |= flag; } } Section *last = NULL; // . alernatively, scan through all anchor tags // . compare to last anchor tag // . and blow up each to their max non-intersection section and make // sure no PLAIN text in either of those! // . this is all to fix texasdrums.drums.org which has various span // and bold tags throughout its menu at random for ( Section *si = m_rootSection ; si ; si = si->m_next ) { // . if we hit plain text, we kill our last // . this was causing "geeks who drink" for blackbirdbuvette // to get is SEC_MENU set because there was a link after it if ( si->m_flags & SEC_PLAIN_TEXT ) { last = NULL; } // skip if not a href section if ( si->m_baseHash != TAG_A ) { continue; } // . if it is a mailto link forget it // . fixes abtango.com from detecting a bad menu const char *ptr = (*m_tr)[si->m_a].token_start; int32_t plen = (*m_tr)[si->m_a].token_len; const char *mailto = strncasestr(ptr,plen,"mailto:"); if ( mailto ) { last = NULL; } // bail if no last if ( ! last ) { last = si; continue; } // save last Section *prev = last; // set last for next round, used "saved" below last = si; // get first "hard" section encountered while telescoping Section *prevHard = NULL; // blow up last until right before it contains us for ( ; prev ; prev = prev->m_parent ) { // record? if ( ! prevHard && isHardSection(prev) ) prevHard = prev; // if parent contains us, stop if ( prev->m_parent->contains ( si ) ) break; } // if it has plain text, forget it! if ( prev && prev->m_flags & SEC_PLAIN_TEXT ) continue; // use this for us Section *sk = si; // get first "hard" section encountered while telescoping Section *skHard = NULL; // same for us for ( ; sk ; sk = sk->m_parent ) { // record? if ( ! skHard && isHardSection(sk) ) skHard = sk; // if parent contains us, stop if ( prev && sk->m_parent->contains ( prev ) ) break; } // if it has plain text, forget it! if ( sk && sk->m_flags & SEC_PLAIN_TEXT ) continue; // . first hard sections encountered must match! // . otherwise for switchborad.com we lose "A B C ..." as // title candidate because we think it is an SEC_MENU // because the sections before it have links in them, but // they have different hard sections if ( prevHard && ! skHard ) continue; if ( ! prevHard && skHard ) continue; if ( prevHard && prevHard->m_tagId != skHard->m_tagId ) continue; // ok, great that works! if( prev ) { prev->m_flags |= SEC_MENU; } if( sk ) { sk->m_flags |= SEC_MENU; } } int64_t h_copyright = hash64n("copyright"); // copyright check // the copyright symbol in utf8 (see Entities.cpp for the code) static const char copy[] = "�"; // scan all years, lists and ranges of years, and look for // a preceeding copyright sign. mark such years as DF_COPYRIGHT for ( int32_t i = 0 ; i < m_nw ; i++ ) { // skip if tag if ( (*m_tr)[i].nodeid ) continue; // do we have an alnum word before us here? if ( (*m_tr)[i].is_alfanum ) { // if word check for copyright if ( (*m_tr)[i].token_hash != h_copyright ) continue; } // must have copyright sign in it i guess else if ( ! gb_strncasestr((*m_tr)[i].token_start, (*m_tr)[i].token_len, copy)) continue; // mark section as copyright section then Section *sp = m_sectionPtrs[i]; // flag as menu sp->m_flags |= SEC_MENU; } sec_t ff = SEC_MENU; // set SEC_MENU of child sections of SEC_MENU sections for ( Section *si = m_rootSection; si; si = si->m_next ) { // must be a link text only section if ( !( si->m_flags & ff ) ) continue; // ignore if went down this path if ( si->m_used == 82 ) { continue; } // get first potential kid Section *sk = si->m_next; // scan child sections for ( ; sk; sk = sk->m_next ) { // stop if not contained if ( !si->contains( sk ) ) { break; } // mark it sk->m_flags |= ( si->m_flags & ff ); // SEC_MENU; // ignore in big loop sk->m_used = 82; } } // // set SEC_MENU_HEADER // for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) { // skip if not in a menu if ( ! ( sk->m_flags & SEC_MENU ) ) { continue; } // get his list container Section *c = sk->m_listContainer; // skip if none if ( !c ) { continue; } // already flagged? if ( c->m_used == 89 ) { continue; } // do not repeat on any item in this list c->m_used = 89; // flag all its brothers! Section *zz = sk; for ( ; zz; zz = zz->m_nextBrother ) { // bail if not in menu if ( !( zz->m_flags & SEC_MENU ) ) { break; } } // if broked it, stop if ( zz ) { continue; } // // ok, every item in list is a menu item, so try to set header // // get word before first item in list int32_t r = sk->m_a - 1; for ( ; r >= 0 && !(*m_tr)[r].is_alfanum; r-- ) ; // if no header, skip if ( r < 0 ) { continue; } // set SEC_MENU_HEADER setHeader( r, sk, SEC_MENU_HEADER ); } // // set SEC_MENU_SENTENCE flag // for ( Section *si = m_rootSection; si; si = si->m_next ) { // must be a link text only section if ( !( si->m_flags & SEC_MENU ) ) { continue; } // set this bool gotSentence = ( si->m_flags & SEC_SENTENCE ); // set SEC_MENU of the sentence if ( gotSentence ) { continue; } // parent up otherwise for ( Section *sk = si->m_parent; sk; sk = sk->m_parent ) { // stop if sentence finally if ( !( sk->m_flags & SEC_SENTENCE ) ) { continue; } // not a menu sentence if it has plain text in it // though! we have to make this exception to stop // stuff like // "Wedding Ceremonies, No preservatives, more... " // from switchboard.com from being a menu sentence // just because "more" is in a link. if ( sk->m_flags & SEC_PLAIN_TEXT ) { break; } // set it sk->m_flags |= SEC_MENU_SENTENCE; // and stop break; } } static bool s_init = false; static int64_t h_close ; static int64_t h_send ; static int64_t h_map ; static int64_t h_maps ; static int64_t h_directions ; static int64_t h_driving ; static int64_t h_help ; static int64_t h_more ; static int64_t h_log ; static int64_t h_sign ; static int64_t h_change ; static int64_t h_write ; static int64_t h_save ; static int64_t h_share ; static int64_t h_forgot ; static int64_t h_home ; static int64_t h_sitemap ; static int64_t h_advanced ; static int64_t h_go ; static int64_t h_website ; static int64_t h_view; static int64_t h_add; static int64_t h_submit; static int64_t h_get; static int64_t h_about; // new stuff static int64_t h_back; // back to top static int64_t h_next; static int64_t h_buy; // buy tickets static int64_t h_english; // english french german versions static int64_t h_click; if ( ! s_init ) { s_init = true; h_close = hash64n("close"); h_send = hash64n("send"); h_map = hash64n("map"); h_maps = hash64n("maps"); h_directions = hash64n("directions"); h_driving = hash64n("driving"); h_help = hash64n("help"); h_more = hash64n("more"); h_log = hash64n("log"); h_sign = hash64n("sign"); h_change = hash64n("change"); h_write = hash64n("write"); h_save = hash64n("save"); h_share = hash64n("share"); h_forgot = hash64n("forgot"); h_home = hash64n("home"); h_sitemap = hash64n("sitemap"); h_advanced = hash64n("advanced"); h_go = hash64n("go"); h_website = hash64n("website"); h_view = hash64n("view"); h_add = hash64n("add"); h_submit = hash64n("submit"); h_get = hash64n("get"); h_about = hash64n("about"); h_back = hash64n ("back"); h_next = hash64n ("next"); h_buy = hash64n ("buy"); h_english = hash64n ("english"); h_click = hash64n ("click"); } // . when dup/non-dup voting info is not available because we are // more or less an isolated page, guess that these links are // menu links and not to be considered for title or event description // . we completely exclude a word from title/description if its // SEC_MENU is set. // . set SEC_MENU for renegade links that start with an action // verb like "close" or "add" etc. but if their # of non dup votes // is high relative to their # of dup votes, then do not set this // because it might be a name of a band like "More" or something // and be in a link // . scan all href sections // set SEC_LINK_ONLY on sections that just contain a link for ( Section *si = m_rootSection ; si ; si = si->m_next ) { // skip if not a href section if ( si->m_baseHash != TAG_A ) continue; // set points to scan int32_t a = si->m_a; int32_t b = si->m_b; // assume not bad bool bad = false; int32_t i; // scan words if any for ( i = a ; i < b ; i++ ) { const auto &token = (*m_tr)[i]; // skip if not word if ( ! token.is_alfanum ) continue; // assume bad bad = true; // certain words are indicative of menus if ( token.token_hash == h_close ) break; if ( token.token_hash == h_send ) break; if ( token.token_hash == h_map ) break; if ( token.token_hash == h_maps ) break; if ( token.token_hash == h_directions ) break; if ( token.token_hash == h_driving ) break; if ( token.token_hash == h_help ) break; if ( token.token_hash == h_more ) break; if ( token.token_hash == h_log ) break; // log in if ( token.token_hash == h_sign ) break; // sign up/in if ( token.token_hash == h_change ) break; // change my loc. if ( token.token_hash == h_write ) break; // write a review if ( token.token_hash == h_save ) break; if ( token.token_hash == h_share ) break; if ( token.token_hash == h_forgot ) break; // forgot your pwd if ( token.token_hash == h_home ) break; if ( token.token_hash == h_sitemap ) break; if ( token.token_hash == h_advanced ) break; // adv search if ( token.token_hash == h_go ) break; // go to top of page if ( token.token_hash == h_website ) break; if ( token.token_hash == h_view ) break; if ( token.token_hash == h_add ) break; if ( token.token_hash == h_submit ) break; if ( token.token_hash == h_get ) break; if ( token.token_hash == h_about ) break; if ( token.token_hash == h_back ) break; if ( token.token_hash == h_next ) break; if ( token.token_hash == h_buy ) break; if ( token.token_hash == h_english ) break; if ( token.token_hash == h_click ) break; bad = false; break; } // skip if ok if ( ! bad ) continue; // get smallest section Section *sm = m_sectionPtrs[i]; // if bad mark it! sm->m_flags |= SEC_MENU; } return true; } // "first" is first item in the list we are getting header for void Sections::setHeader ( int32_t r , Section *first , sec_t flag ) { // get smallest section containing word #r Section *sr = m_sectionPtrs[r]; // save orig Section *orig = sr; // blow up until just before "first" section for ( ; sr ; sr = sr->m_parent ) { // forget it if in title tag already! if ( sr->m_flags & SEC_IN_TITLE ) return; // stop if no parent if ( ! sr->m_parent ) continue; // parent must not contain first if ( sr->m_parent->contains ( first ) ) break; } // if we failed to contain "first"... what does this mean? i dunno // but its dropping core for // http://tedserbinski.com/jcalendar/jcalendar.js if ( ! sr ) return; // save that Section *biggest = sr; // check out prev brother Section *prev = biggest->m_prevBrother; // if we are in a hard section and capitalized (part of the // SEC_HEADING) requirements, then it should be ok if we have // a prev brother of a different tagid. // this will fix americantowns.com which has a list of header tags // and ul tags intermingled, with menus in the ul tags. // should also fix upcoming.yahoo.com which has alternating // dd and dt tags for its menus. now that we got rid of // addImpliedSections() we have to deal with this here, and it will // be more accurate since addImpliedSections() was often wrong. if ( prev && (orig->m_flags & SEC_HEADING) && prev->m_tagId != biggest->m_tagId ) prev = NULL; // but if prev brother is a blank, we should view that as a delimeter // BUT really we should have added those sections in with the new // delimeter logic! but let's put this in for now anyway... if ( prev && prev->m_firstWordPos < 0 ) prev = NULL; // if the header section has a prev brother, forget it! if ( prev ) return; // . if we gained extra text, that is a no-no then // . these two checks replaced the two commented out ones above // . they allow for empty sections preceeding "sr" at any level as // we telescope it up if ( biggest->m_firstWordPos != orig->m_firstWordPos ) return; if ( biggest->m_lastWordPos != orig->m_lastWordPos ) return; // . now blow up first until just before it hits biggest as well // . this fixes reverbnation on the nextBrother check below for ( ; first ; first = first->m_parent ) { // stop if parent is NULL if ( ! first->m_parent ) break; // stop if parent would contain biggest if ( first->m_parent->contains ( biggest ) ) break; } // if after blowing it up "first" contains more than just menu // sections, then bail. that really was not a menu header! // fixes reverbnation url that thought "That 1 Guy" was a menu header. if ( flag == SEC_MENU_HEADER ) { Section *fx = first; for ( ; fx ; fx = fx->m_next ) { // stop when list is over if ( fx->m_a >= first->m_b ) break; // ignore if no next if ( fx->m_flags & SEC_NOTEXT ) continue; // thats bad if SEC_MENU not set, it should be for all! if ( fx->m_flags & SEC_MENU ) continue; // we got these now if ( fx->m_flags & SEC_MENU_SENTENCE ) continue; // otherwise, bad! return; } } // scan until outside biggest int32_t lastb = biggest->m_b; // . make sure sr does not contain any list in it // . scan all sections between sr and "saved" for ( ; sr ; sr = sr->m_next ) { // stop if over if ( sr->m_a >= lastb ) break; // if we have a brother with same taghash we are // part of a list if ( sr->m_nextBrother && sr->m_nextBrother->m_tagHash == sr->m_tagHash && sr->m_nextBrother != first ) return; if ( sr->m_prevBrother && sr->m_prevBrother->m_tagHash == sr->m_tagHash && // for footers sr->m_prevBrother != first ) return; } // restart loop sr = biggest; // ok, not part of a list, flag it for ( ; sr ; sr = sr->m_next ) { // stop if over if ( sr->m_a >= lastb ) break; // flag each subsection sr->m_flags |= flag; // SEC_MENU_HEADER; } } // . set SEC_HEADING bits in Section::m_flags // . identifies sections that are most likely headings // . the WHOLE idea of this algo is to take a list of sections that are all // the same tagId/baseHash and differentiate them so we can insert implied // sections with headers. bool Sections::setHeadingBit ( ) { int32_t headings = 0; // scan the sections for ( Section *si = m_rootSection ; si ; si = si->m_next ) { int32_t fwp = si->m_firstWordPos; if ( fwp == -1 ) continue; // we must be the smallest container around this text if ( m_sectionPtrs[fwp] != si ) continue; // . make sure we are in our own hard section // . TODO: allow for bold or strong, etc. tags as well bool hasHard = false; int32_t a = si->m_firstWordPos; int32_t b = si->m_lastWordPos; // go to parent Section *pp = si; Section *biggest = NULL; bool inLink = false; // . we need to be isolated in our own hard section container // . TODO: what about "<b>Hi There <i>Bob</i></b>" as a heading // . i guess that will still work! for ( ; pp ; pp = pp->m_parent ) { // stop if breached if ( pp->m_firstWordPos != a ) break; if ( pp->m_lastWordPos != b ) break; // record this if ( pp->m_tagId == TAG_A ) inLink = true; // record the biggest section containing just our text biggest = pp; // is it a hard section? if ( isHardSection(pp) ) hasHard = true; // . allow bold and strong tags // . fixes gwair.org which has the dates of the // month in strong tags. so we need to set // SEC_HEADING for those so getDelimHash() will // recognize such tags as date header tags in the // METHOD_DOM algorithm and we get the proper // implied sections if ( pp->m_tagId == TAG_STRONG ) hasHard = true; if ( pp->m_tagId == TAG_B ) hasHard = true; } // need to be isolated in a hard section if ( ! hasHard ) continue; // now make sure the text is capitalized etc bool hadUpper = false; //bool hadLower = false; int32_t lowerCount = 0; bool hadYear = false; bool hadAlpha = false; int32_t i; // scan the alnum words we contain for ( i = a ; i <= b ; i++ ) { const auto &token = (*m_tr)[i]; // . did we hit a breaking tag? // . "<div> blah <table><tr><td>blah... </div>" if ( token.nodeid && isBreakingTagId(token.nodeid) ) break; // skip if not alnum word if ( ! token.is_alfanum ) continue; // skip digits if(token.token_len == 4 && is_digit(token.token_start[0]) && is_digit(token.token_start[1]) && is_digit(token.token_start[2]) && is_digit(token.token_start[3])) { // . but if we had a year like "2010" that // is allowed to be a header. // . this fixes 770kob.com because the events // under the "2010" header were telescoping // up into events in the "December 2009" // section, when they should have been in // their own section! and now they are in // their own implied section... int32_t num = atol2(token.token_start,token.token_len); if ( num < 1800 ) continue; if ( num > 2100 ) continue; // mark it hadYear = true; continue; } // mark this hadAlpha = true; // is it upper? if ( is_upper_utf8(token.token_start) ) { hadUpper = true; continue; } // skip stop words if(isStopWord(token.token_start, token.token_len, token.token_hash)) continue; // . skip short words // . November 4<sup>th</sup> for facebook.com if ( token.token_len <= 2 ) continue; // is it lower? if ( is_lower_utf8(token.token_start) ) lowerCount++; // stop now if bad //if ( hadUpper ) break; if ( lowerCount >= 2 ) break; } // is it a header? bool isHeader = hadUpper; // a single year by itself is ok though too if ( hadYear && ! hadAlpha ) isHeader = true; // allow for one mistake like we do in Events.cpp for titles if ( lowerCount >= 2 ) isHeader = false; if ( ! isHeader ) continue; // ok, mark this section as a heading section si->m_flags |= SEC_HEADING; // a hack! if ( inLink ) biggest->m_flags |= SEC_LINK_TEXT; // count them headings++; } // bail now if no headings were set if ( ! headings ) return true; return true; } void Sections::setTagHashes ( ) { if ( m_numSections == 0 ) return; // now recompute the tagHashes and depths and content hashes since // we have eliminate open-ended sections in the loop above for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) { // these have to be in order of sn->m_a to work right // because we rely on the parent tag hash, which would not // necessarily be set if we were not sorted, because the // parent section could have SEC_FAKE flag set because it is // a br section added afterwards. // shortcut int64_t bh = (int64_t)sn->m_baseHash; // sanity check if ( bh == 0 ) { g_process.shutdownAbort(true); } // if no parent, use initial values if ( ! sn->m_parent ) { sn->m_depth = 0; sn->m_tagHash = bh; // sanity check if ( bh == 0 ) { g_process.shutdownAbort(true); } continue; } // sanity check if ( sn->m_parent->m_tagHash == 0 ) { g_process.shutdownAbort(true); } // . update the cumulative front tag hash // . do not include hyperlinks as part of the cumulative hash! sn->m_tagHash = hash32h ( bh , sn->m_parent->m_tagHash ); sn->m_colorHash = hash32h ( bh , sn->m_parent->m_colorHash ); // if we are an implied section, just use the tag hash of // our parent. that way since we add different implied // sections for msichicago.com root than we do the kid, // the section voting should still match up if ( bh == BH_IMPLIED ) { sn->m_tagHash = sn->m_parent->m_tagHash; } if ( sn->m_tagHash == 0 ) { sn->m_tagHash = 1234567; } // depth based on parent, too sn->m_depth = sn->m_parent->m_depth + 1; } } // make this replace ::print() when it works bool Sections::print( SafeBuf *sbuf, int32_t hiPos, const int32_t *wposVec, const char *densityVec, const char *wordSpamVec, const char *fragVec ) const { PrintData pd; pd.sbuf = sbuf; pd.hiPos = hiPos; pd.wposVec = wposVec; pd.densityVec = densityVec; pd.wordSpamVec = wordSpamVec; pd.fragVec = fragVec; return print(&pd); } bool Sections::print(PrintData *pd) const { pd->sbuf->setLabel ("sectprnt"); //verifySections(); int32_t nw = m_tr->size(); // check words for ( int32_t i = 0 ; i < nw ; i++ ) { // get section Section *sn = m_sectionPtrs[i]; if ( sn->m_a > i ) { g_process.shutdownAbort(true); } if ( sn->m_b <= i ) { g_process.shutdownAbort(true); } } // print sections out for ( Section *sk = m_rootSection ; sk ; ) { // print this section printSectionDiv(pd,sk); // advance int32_t b = sk->m_b; // stop if last if ( b >= m_nw ) break; // get section after that sk = m_sectionPtrs[b]; } // print header const char *hdr = "<table border=1>" "<tr>" "<td><b>sec #</b></td>" "<td><b>wordStart</b></td>" "<td><b>wordEnd</b></td>" "<td><b>baseHash</b></td>" "<td><b>cumulTagHash</b></td>" "<td><b>contentHash</b></td>" "<td><b>contentTagHash</b></td>" "<td><b>XOR</b></td>" // only valid for contentHashes "<td><b>depth</b></td>" "<td><b>parent word range</b></td>" "<td><b>flags</b></td>" "<td><b>evIds</b></td>" "<td><b>text snippet</b></td>" "</tr>\n"; pd->sbuf->safePrintf("%s",hdr); int32_t rcount = 0; int32_t scount = 0; // show word # of each section so we can look in PageParser.cpp's // output to see exactly where it starts, since we now label all // the words for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) { // see if one big table causes a browser slowdown if ( (++rcount % TABLE_ROWS ) == 0 ) pd->sbuf->safePrintf("</table>%s\n",hdr); const char *xs = "--"; char ttt[100]; if ( sn->m_contentHash64 ) { int32_t modified = sn->m_tagHash ^ sn->m_contentHash64; sprintf(ttt,"0x%" PRIx32,modified); xs = ttt; } // shortcut Section *parent = sn->m_parent; int32_t pswn = -1; int32_t pewn = -1; if ( parent ) { pswn = parent->m_a; pewn = parent->m_b; } // print it pd->sbuf->safePrintf("<tr><td>%" PRId32"</td>\n" "<td>%" PRId32"</td>" "<td>%" PRId32"</td>" "<td>0x%" PRIx32"</td>" "<td>0x%" PRIx32"</td>" "<td>0x%" PRIx32"</td>" "<td>0x%" PRIx32"</td>" "<td>%s</td>" "<td>%" PRId32"</td>" "<td><nobr>%" PRId32" to %" PRId32"</nobr></td>" "<td><nobr>" , scount++, sn->m_a, sn->m_b, (int32_t)sn->m_baseHash, (int32_t)sn->m_tagHash, (int32_t)sn->m_contentHash64, (int32_t)(sn->m_contentHash64^sn->m_tagHash), xs, sn->m_depth, pswn, pewn); // now show the flags printFlags ( pd->sbuf , sn ); // first few words of section int32_t a = sn->m_a; int32_t b = sn->m_b; // -1 means an unclosed tag!! should no longer be the case if ( b == -1 ) { g_process.shutdownAbort(true); }//b=m_words->m_numWords; pd->sbuf->safePrintf("</nobr></td>"); pd->sbuf->safePrintf("<td> </td>"); pd->sbuf->safePrintf("<td><nobr>"); // 70 chars max int32_t max = 70; int32_t count = 0; char truncated = 0; // do not print last word/tag in section for ( int32_t i = a ; i < b - 1 && count < max ; i++ ) { const char *s = (*m_tr)[i].token_start; int32_t slen = (*m_tr)[i].token_len; if ( count + slen > max ) { truncated = 1; slen = max - count; } count += slen; // boldify front tag if ( i == a ) pd->sbuf->safePrintf("<b>"); pd->sbuf->htmlEncode(s,slen,false); // boldify front tag if ( i == a ) pd->sbuf->safePrintf("</b>"); } // if we truncated print a ... if ( truncated ) pd->sbuf->safePrintf("<b>…</b>"); // then print ending tag if ( b < nw ) { int32_t blen = (*m_tr)[b-1].token_len; //b is from m_b and always>0 so indexing b-1 is safe if ( blen>20 ) blen = 20; pd->sbuf->safePrintf("<b>"); pd->sbuf->htmlEncode((*m_tr)[b-1].token_start,blen,false); pd->sbuf->safePrintf("</b>"); } pd->sbuf->safePrintf("</nobr></td></tr>\n"); } pd->sbuf->safePrintf("</table>\n<br>\n"); return true; } bool Sections::printSectionDiv(PrintData *pd, const Section *sk) const { // enter a new div section now pd->sbuf->safePrintf("<br>"); // only make font color different int32_t bcolor = (int32_t)sk->m_colorHash& 0x00ffffff; int32_t fcolor = 0x000000; int32_t rcolor = 0x000000; uint8_t *bp = (uint8_t *)&bcolor; bool dark = false; if ( bp[0]<128 && bp[1]<128 && bp[2]<128 ) dark = true; // or if two are less than 50 if ( (bp[0]<100 && bp[1]<100) || (bp[1]<100 && bp[2]<100) || (bp[0]<100 && bp[2]<100) ) dark = true; // if bg color is dark, make font color light if ( dark ) { fcolor = 0x00ffffff; rcolor = 0x00ffffff; } // start the new div pd->sbuf->safePrintf("<div " "style=\"" "background-color:#%06" PRIx32";" "margin-left:20px;" "border:#%06" PRIx32" 1px solid;" "color:#%06" PRIx32"\">", //(int32_t)sk, bcolor, rcolor, fcolor); bool printWord = true; if ( ! sk->m_parent && sk->m_next && sk->m_next->m_a == sk->m_a ) printWord = false; // print word/tag #i if ( !(sk->m_flags&SEC_FAKE) && sk->m_tagId && printWord ) // only encode if it is a tag pd->sbuf->htmlEncode((*m_tr)[sk->m_a].token_start, (*m_tr)[sk->m_a].token_len, false); pd->sbuf->safePrintf("<i>"); // print the flags pd->sbuf->safePrintf("A=%" PRId32" ",sk->m_a); // print tag hash now pd->sbuf->safePrintf("taghash=%" PRIu32" ",(int32_t)sk->m_tagHash); if ( sk->m_contentHash64 ) pd->sbuf->safePrintf("ch64=%" PRIu64" ",sk->m_contentHash64); printFlags ( pd->sbuf , sk ); if ( isHardSection(sk) ) pd->sbuf->safePrintf("hardsec "); pd->sbuf->safePrintf("</i>\n"); // now print each word and subsections in this section int32_t a = sk->m_a; int32_t b = sk->m_b; for ( int32_t i = a ; i < b ; i++ ) { const auto &token = (*m_tr)[i]; // . if its a and us, skip // . BUT if we are root then really this tag belongs to // our first child, so make an exception for root! if ( i == a && token.is_alfanum && (sk->m_parent) ) continue; // . get section of this word // . TODO: what if this was the tr tag we removed??? i guess // maybe make it NULL now? Section *ws = m_sectionPtrs[i]; // get top most parent that starts at word position #a and // is not "sk" for ( ; ; ws = ws->m_parent ) { if ( ws == sk ) break; if ( ! ws->m_parent ) break; if ( ws->m_parent->m_a != ws->m_a ) break; if ( ws->m_parent == sk ) break; } // if it belongs to another sections, print that section if ( ws != sk ) { // print out this subsection printSectionDiv(pd,ws); // advance to end of that then i = ws->m_b - 1; // and try next word continue; } // ignore if in style section, etc. just print it out if ( sk->m_flags & NOINDEXFLAGS ) { pd->sbuf->htmlEncode(token.token_start,token.token_len,false ); continue; } // boldify alnum words if ( token.is_alfanum ) { if ( pd->wposVec[i] == pd->hiPos ) pd->sbuf->safePrintf("<a name=hipos></a>"); pd->sbuf->safePrintf("<nobr><b>"); if ( i < MAXFRAGWORDS && pd->fragVec[i] == 0 ) pd->sbuf->safePrintf("<strike>"); } if ( token.is_alfanum && pd->wposVec[i] == pd->hiPos ) pd->sbuf->safePrintf("<blink style=\"" "background-color:yellow;" "color:black;\">"); // print that word pd->sbuf->htmlEncode(token.token_start, token.token_len, false ); if ( token.is_alfanum && pd->wposVec[i] == pd->hiPos ) pd->sbuf->safePrintf("</blink>"); // boldify alnum words if ( token.is_alfanum ) { if ( i < MAXFRAGWORDS && pd->fragVec[i] == 0 ) pd->sbuf->safePrintf("</strike>"); pd->sbuf->safePrintf("</b>"); } // and print out their pos/div/spam sub if ( token.is_alfanum ) { pd->sbuf->safePrintf("<sub " "style=\"background-color:white;" "font-size:10px;" "border:black 1px solid;" "color:black;\">"); pd->sbuf->safePrintf("%" PRId32, pd->wposVec[i]); if ( pd->densityVec[i] != MAXDENSITYRANK ) pd->sbuf->safePrintf("/<font color=purple><b>%" PRId32 "</b></font>" , (int32_t)pd->densityVec[i]); if ( pd->wordSpamVec[i] != MAXWORDSPAMRANK ) pd->sbuf->safePrintf("/<font color=red><b>%" PRId32 "</b></font>" , (int32_t)pd->wordSpamVec[i]); pd->sbuf->safePrintf("</sub></nobr>"); } } pd->sbuf->safePrintf("</div>\n"); return true; } bool Sections::verifySections ( ) { // make sure we map each word to a section that contains it at least for ( int32_t i = 0 ; i < m_nw ; i++ ) { Section *si = m_sectionPtrs[i]; if ( si->m_a > i ) { g_process.shutdownAbort(true); } if ( si->m_b <= i ) { g_process.shutdownAbort(true); } // must have checksum if ( (*m_tr)[i].is_alfanum && si->m_contentHash64==0) { g_process.shutdownAbort(true); } // must have this set if 0 if ( ! si->m_contentHash64 && !(si->m_flags & SEC_NOTEXT)) { g_process.shutdownAbort(true);} if ( si->m_contentHash64 && (si->m_flags & SEC_NOTEXT)) { g_process.shutdownAbort(true);} } // sanity check for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) { // get it //Section *sn = &m_sections[i]; // get parent for(const Section *sp = sn->m_parent; sp; sp = sp->m_parent) { // make sure parent fully contains if ( sp->m_a > sn->m_a ) { g_process.shutdownAbort(true); } if ( sp->m_b < sn->m_b ) { g_process.shutdownAbort(true); } // and make sure every grandparent fully contains us too! } } // sanity check for ( int32_t i = 0 ; i < m_numSections ; i++ ) { Section *sn = &m_sections[i]; if ( sn->m_a >= sn->m_b ) { g_process.shutdownAbort(true); } } // sanity check, make sure each section is contained by the // smallest section containing it for ( Section *si = m_rootSection ; si ; si = si->m_next ) { for ( Section *sj = m_rootSection ; sj ; sj = sj->m_next ) { // skip if us if ( sj == si ) continue; // skip column sections because they are artificial // and only truly contain some of the sections that // their [a,b) interval says they contain. if ( sj->m_tagId == TAG_TC ) continue; // or if an implied section of td tags in a tc if ( sj->m_baseHash == BH_IMPLIED && sj->m_parent && sj->m_parent->m_tagId == TAG_TC ) continue; // skip if sj does not contain first word in si if ( sj->m_a > si->m_a ) continue; if ( sj->m_b <= si->m_a ) continue; // ok, make sure in our parent path Section *ps = si; for ( ; ps ; ps = ps->m_parent ) if ( ps == sj ) break; // ok if we found it if ( ps ) continue; // sometimes if sections are equal then the other // is the parent ps = sj; for ( ; ps ; ps = ps->m_parent ) if ( ps == si ) break; // must have had us if ( ps ) continue; g_process.shutdownAbort(true); } } // make sure we map each word to a section that contains it at least for ( int32_t i = 0 ; i < m_nw ; i++ ) { Section *si = m_sectionPtrs[i]; if ( si->m_a > i ) { g_process.shutdownAbort(true); } if ( si->m_b <= i ) { g_process.shutdownAbort(true); } } return true; }