17587 lines
515 KiB
C++
17587 lines
515 KiB
C++
// print events should print if nothing else to print
|
|
|
|
// when a div tag's parent truncates its section, it may have been
|
|
// paired up with a div back tag which then should become free...
|
|
// that is the problem... because those back tags are unpaired.
|
|
// so your parent should constrain you as SOON as it is constrained and
|
|
// close you up at that point. that way you cannot falsely pair-claim
|
|
// a div back tag.
|
|
|
|
|
|
#include "Sections.h"
|
|
#include "Url.h"
|
|
#include "Words.h"
|
|
#include "Msg40.h"
|
|
#include "Conf.h"
|
|
#include "Msg1.h" // getGroupId()
|
|
//#include "HashTableX.h"
|
|
#include "XmlDoc.h"
|
|
#include "Bits.h"
|
|
#include "sort.h"
|
|
#include "Abbreviations.h"
|
|
|
|
// voter threshold constant (NOTE(review): not referenced in the part of the
// file visible here -- confirm it is still used before relying on it)
#define MIN_VOTERS 10
// retired upper bound, kept for reference
//#define MAX_VOTERS 200

// retired forward declarations, kept for reference
//static void gotSectiondbListWrapper ( void *state );
//static int cmp ( const void *s1 , const void *s2 ) ;

// sectiondb record-size constant (NOTE(review): presumably the minRecSizes
// passed to sectiondb list reads -- confirm against the callers)
#define SECTIONDB_MINRECSIZES 5000000
|
|
|
|
// . build an empty Sections object
// . the buffer pointers MUST be nulled before reset() is called because
//   reset() frees m_buf and m_buf2 whenever they are non-NULL
Sections::Sections ( ) {
	m_buf2     = NULL;
	m_buf      = NULL;
	m_sections = NULL;
	// everything else gets its default value in here
	reset();
}
|
|
|
|
void Sections::reset() {
|
|
//if ( m_sections && m_needsFree )
|
|
// mfree ( m_sections , m_sectionsBufSize , "Sections" );
|
|
m_sectionBuf.purge();
|
|
m_sectionPtrBuf.purge();
|
|
if ( m_buf && m_bufSize )
|
|
mfree ( m_buf , m_bufSize , "sdata" );
|
|
if ( m_buf2 && m_bufSize2 )
|
|
mfree ( m_buf2 , m_bufSize2 , "sdata2" );
|
|
// reset the old and new section voting tables
|
|
//m_osvt.reset();
|
|
//m_nsvt.reset();
|
|
m_sections = NULL;
|
|
m_buf = NULL;
|
|
m_buf2 = NULL;
|
|
m_bits = NULL;
|
|
m_numSections = 0;
|
|
m_numSentenceSections = 0;
|
|
m_badHtml = false;
|
|
m_aa = NULL;
|
|
m_sentFlagsAreSet = false;
|
|
m_addedImpliedSections = false;
|
|
m_setRegBits = false;
|
|
m_rootSection = NULL;
|
|
m_lastSection = NULL;
|
|
m_lastAdded = NULL;
|
|
|
|
m_numLineWaiters = 0;
|
|
m_waitInLine = false;
|
|
m_hadArticle = false;
|
|
m_articleStartWord = -2;
|
|
m_articleEndWord = -2;
|
|
m_recall = 0;
|
|
//m_totalSimilarLayouts = 0;
|
|
m_numVotes = 0;
|
|
m_nw = 0;
|
|
m_firstSent = NULL;
|
|
m_lastSent = NULL;
|
|
m_sectionPtrs = NULL;
|
|
m_firstDateValid = false;
|
|
m_alnumPosValid = false;
|
|
}
|
|
|
|
// the destructor just hands everything back through reset()
Sections::~Sections ( ) { reset(); }
|
|
|
|
// . globals that exist only so a debugger watch point can be set on them
// . Sections::set() points g_sections at the instance being set
class Sections *g_sections = NULL;
// NOTE(review): g_sec is not assigned in the code visible here
class Section  *g_sec      = NULL;
|
|
|
|
#define TXF_MATCHED 1
|
|
|
|
// an element on the stack is a Tag
|
|
class Tagx {
|
|
public:
|
|
// id of the fron tag we pushed
|
|
nodeid_t m_tid;
|
|
// cumulative hash of all tag ids containing this one, includes us
|
|
//int32_t m_cumHash;
|
|
// section number we represent
|
|
int32_t m_secNum;
|
|
// hash of all the alnum words in this section
|
|
//int32_t m_contentHash;
|
|
// set to TXF_MATCHED
|
|
char m_flags;
|
|
};
|
|
|
|
// . max number of front tags that may be open on the tag stack at once
// . lowered from 1000 to 300 so that we are more sensitive to malformed
//   pages, because typically they seem to take longer to parse
// . there is also newer logic for table tr and td back tags that lets us
//   pop off the other contained tags right away, rather than delaying it
//   until we are done, because delaying will often breach this stack
#define MAXTAGSTACK 300
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . returns true and sets g_errno on error
|
|
// . sets m_sections[] array, 1-1 with words array "w"
|
|
// . the Weights class can look at these sections and zero out the weights
|
|
// for words in script, style, select and marquee sections
|
|
bool Sections::set ( Words *w ,
|
|
Phrases *phrases ,
|
|
Bits *bits ,
|
|
Url *url ,
|
|
int64_t docId ,
|
|
int64_t siteHash64 ,
|
|
char *coll ,
|
|
int32_t niceness ,
|
|
void *state ,
|
|
void (*callback)(void *state) ,
|
|
uint8_t contentType ,
|
|
Dates *dates ,
|
|
// from XmlDoc::ptr_sectionsData in a title rec
|
|
char *sectionsData ,
|
|
bool sectionsDataValid ,
|
|
char *sectionsVotes ,
|
|
//uint64_t tagPairHash ,
|
|
char *buf ,
|
|
int32_t bufSize ) {
|
|
|
|
reset();
|
|
|
|
if ( ! w ) return true;
|
|
|
|
if ( w->m_numWords > 1000000 ) {
|
|
log("sections: over 1M words. skipping sections set for "
|
|
"performance.");
|
|
return true;
|
|
}
|
|
|
|
// save it
|
|
m_words = w;
|
|
m_bits = bits;
|
|
m_dates = dates;
|
|
m_url = url;
|
|
m_docId = docId;
|
|
m_siteHash64 = siteHash64;
|
|
m_coll = coll;
|
|
m_state = state;
|
|
m_callback = callback;
|
|
m_niceness = niceness;
|
|
//m_tagPairHash = tagPairHash;
|
|
m_contentType = contentType;
|
|
|
|
// reset this just in case
|
|
g_errno = 0;
|
|
|
|
if ( w->getNumWords() <= 0 ) return true;
|
|
|
|
// shortcuts
|
|
int64_t *wids = w->getWordIds ();
|
|
nodeid_t *tids = w->getTagIds ();
|
|
int32_t nw = w->getNumWords ();
|
|
char **wptrs = w->getWords ();
|
|
int32_t *wlens = w->getWordLens ();
|
|
|
|
// set these up for isDelimeter() function to use and for
|
|
// isCompatible() as well
|
|
m_wids = wids;
|
|
m_wlens = wlens;
|
|
m_wptrs = wptrs;
|
|
m_tids = tids;
|
|
m_pids = phrases->getPhraseIds2();
|
|
|
|
|
|
m_isRSSExt = false;
|
|
char *ext = m_url->getExtension();
|
|
if ( ext && strcasecmp(ext,"rss") == 0 ) m_isRSSExt = true;
|
|
if ( m_contentType == CT_XML ) m_isRSSExt = true;
|
|
|
|
// are we a trumba.com url? we allow colons in sentences for its
|
|
// event titles so that we get the correct event title. fixes
|
|
// trumba.com "Guided Nature Walk : ..." title
|
|
char *dom = m_url->getDomain();
|
|
int32_t dlen = m_url->getDomainLen();
|
|
m_isTrumba = false;
|
|
m_isFacebook = false;
|
|
m_isEventBrite = false;
|
|
m_isStubHub = false;
|
|
if ( dlen == 10 && strncmp ( dom , "trumba.com" , 10 ) == 0 )
|
|
m_isTrumba = true;
|
|
if ( dlen == 12 && strncmp ( dom , "facebook.com" , 12 ) == 0 )
|
|
m_isFacebook = true;
|
|
if ( dlen == 11 && strncmp ( dom , "stubhub.com" , 11 ) == 0 )
|
|
m_isStubHub = true;
|
|
if ( dlen == 14 && strncmp ( dom , "eventbrite.com" , 14 ) == 0 )
|
|
m_isEventBrite = true;
|
|
|
|
// . how many sections do we have max?
|
|
// . init at one to count the root section
|
|
int32_t max = 1;
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// . count all front tags
|
|
// . count twice since it can terminate a SEC_SENT sentence sec
|
|
//if ( tids[i] && !(tids[i]&BACKBIT) ) max += 2;
|
|
// count back tags too since some url
|
|
// http://www.tedxhz.com/tags.asp?id=3919&id2=494 had a bunch
|
|
// of </p> tags with no front tags and it cored us because
|
|
// m_numSections > m_maxNumSections!
|
|
if ( tids[i] ) max += 2; // && !(tids[i]&BACKBIT) ) max += 2;
|
|
// or any hr tag
|
|
//else if ( tids[i] == (BACKBIT|TAG_HR) ) max += 2;
|
|
//else if ( tids[i] == (BACKBIT|TAG_BR) ) max += 2;
|
|
// . any punct tag could have a bullet in it...
|
|
// . or if its a period could make a sentence section
|
|
//else if ( ! tids[i] && ! wids[i] ) {
|
|
else if ( ! wids[i] ) {
|
|
// only do not count simple spaces
|
|
if ( m_wlens[i] == 1 && is_wspace_a(m_wptrs[i][0]))
|
|
continue;
|
|
// otherwise count it as sentence delimeter
|
|
max++;
|
|
}
|
|
}
|
|
// . then \0 allows for a sentence too!
|
|
// . fix doc that was just "localize-sf-prod\n"
|
|
max++;
|
|
// and each section may create a sentence section
|
|
max *= 2;
|
|
|
|
// truncate if excessive. growSections() will kick in then i guess
|
|
// if we need more sections.
|
|
if ( max > 1000000 ) {
|
|
log("sections: truncating max sections to 1000000");
|
|
max = 1000000;
|
|
}
|
|
|
|
//max += 5000;
|
|
int32_t need = max * sizeof(Section);
|
|
|
|
|
|
// and we need one section ptr for every word!
|
|
//need += nw * 4;
|
|
// and a section ptr for m_sorted[]
|
|
//need += max * sizeof(Section *);
|
|
// set this
|
|
m_maxNumSections = max;
|
|
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
m_sectionPtrBuf.setLabel("psectbuf");
|
|
|
|
// separate buf now for section ptr for each word
|
|
if ( ! m_sectionPtrBuf.reserve ( nw *sizeof(Section *)) ) return true;
|
|
m_sectionPtrs = (Section **)m_sectionPtrBuf.getBufStart();
|
|
m_sectionPtrsEnd = (Section **)m_sectionPtrBuf.getBufEnd();
|
|
|
|
// allocate m_sectionBuf
|
|
m_sections = NULL;
|
|
|
|
m_sectionBuf.setLabel ( "sectbuf" );
|
|
|
|
if ( ! m_sectionBuf.reserve ( need ) )
|
|
return true;
|
|
|
|
// point into it
|
|
m_sections = (Section *)m_sectionBuf.getBufStart();
|
|
|
|
/*
|
|
// assume no malloc
|
|
m_needsFree = false;
|
|
if ( need < SECTIONS_LOCALBUFSIZE ) {
|
|
m_sections = (Section *)m_localBuf;
|
|
m_sectionsBufSize = SECTIONS_LOCALBUFSIZE;
|
|
//memset ( m_sections , 0 , m_sectionsBufSize );
|
|
}
|
|
else if ( need < bufSize ) {
|
|
m_sections = (Section *)buf;
|
|
m_sectionsBufSize = bufSize;
|
|
//memset ( m_sections , 0 , m_sectionsBufSize );
|
|
}
|
|
else {
|
|
m_sections = (Section *)mmalloc ( need , "Sections" );
|
|
m_sectionsBufSize = need;
|
|
m_needsFree = true;
|
|
}
|
|
*/
|
|
|
|
// clear it nicely
|
|
//memset_nice ( m_sections , 0 , m_sectionsBufSize, m_niceness );
|
|
|
|
// reset
|
|
m_numAlnumWordsInArticle = 0;
|
|
|
|
m_titleStart = -1;
|
|
m_titleEnd = -1;
|
|
|
|
// bail if no luck
|
|
//if ( ! m_sections ) return true;
|
|
|
|
// point to buf
|
|
//char *ppp = (char *)m_sections;
|
|
// skip Sections array
|
|
//ppp += max * sizeof(Section);
|
|
// assign space for m_sorted
|
|
//m_sorted = (Section **)ppp;
|
|
// skip that
|
|
//ppp += max * sizeof(Section *);
|
|
// assign space for our ptrs that are 1-1 with the words array
|
|
//m_sectionPtrs = (Section **)ppp;
|
|
// the end
|
|
//m_sectionPtrsEnd = (Section **)(ppp + nw * 4);
|
|
// save this too
|
|
m_nw = nw;
|
|
|
|
// reset just in case
|
|
//m_numSections = 0;
|
|
|
|
// initialize the ongoing cumulative tag hash
|
|
//int32_t h = 0;
|
|
//int32_t h2 = 0;
|
|
// the content hash ongoing
|
|
//int32_t ch = 0;
|
|
// stack of front tags we encounter
|
|
Tagx stack[MAXTAGSTACK];
|
|
Tagx *stackPtr = stack;
|
|
|
|
/*
|
|
// stack of cumulative tag hashes
|
|
int32_t stack [1000];
|
|
int32_t cumHash [1000];
|
|
int32_t secNums [1000];
|
|
int32_t contentHash[1000];
|
|
int32_t *cumHashPtr = cumHash;
|
|
int32_t *secNumPtr = secNums;
|
|
int32_t *chPtr = contentHash;
|
|
*/
|
|
|
|
// determine what tags we are within
|
|
//sec_t inFlag = 0;
|
|
|
|
Section *current = NULL;
|
|
Section *rootSection = NULL;
|
|
|
|
// assume none
|
|
m_rootSection = NULL;
|
|
|
|
// only add root section if we got some words
|
|
if ( nw > 0 ) {
|
|
// record this i guess
|
|
rootSection = &m_sections[m_numSections];
|
|
// clear
|
|
memset ( rootSection , 0 , sizeof(Section) );
|
|
// . the current section we are in
|
|
// . let's use a root section
|
|
current = rootSection;
|
|
// init that to be the whole page
|
|
rootSection->m_b = nw;
|
|
// save it
|
|
m_rootSection = rootSection;
|
|
// to fix a core dump
|
|
rootSection->m_baseHash = 1;
|
|
// advance
|
|
m_numSections++;
|
|
}
|
|
|
|
// count this
|
|
int32_t alnumCount = 0;
|
|
// for debug
|
|
g_sections = this;
|
|
|
|
// Sections are no longer 1-1 with words, just with front tags
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// we got it
|
|
//m_sectionPtrs[i] = current;
|
|
nodeid_t fullTid = tids[i];
|
|
// are we a non-tag?
|
|
if ( ! fullTid ) {
|
|
// skip if not alnum word
|
|
if ( ! wids[i] ) continue;
|
|
// count it raw
|
|
alnumCount++;
|
|
// must be in a section at this point
|
|
if ( ! current ) { char *xx=NULL;*xx=0; }
|
|
// . hash it up for our content hash
|
|
// . this only hashes words DIRECTLY in a
|
|
// section because we can have a large menu
|
|
// section with a little bit of text, and we
|
|
// contain some non-menu sections.
|
|
//ch = hash32h ( (int32_t)wids[i] , ch );
|
|
// if not in an anchor, script, etc. tag
|
|
//if ( ! inFlag ) current->m_plain++;
|
|
// inc count in current section
|
|
current->m_exclusive++;
|
|
continue;
|
|
}
|
|
|
|
// make a single section for input tags
|
|
if ( fullTid == TAG_INPUT ||
|
|
fullTid == TAG_HR ||
|
|
fullTid == TAG_COMMENT ) {
|
|
// try to realloc i guess. should keep ptrs in tact.
|
|
if ( m_numSections >= m_maxNumSections &&
|
|
! growSections() )
|
|
return true;
|
|
// get the section
|
|
Section *sn = &m_sections[m_numSections];
|
|
// clear
|
|
memset ( sn , 0 , sizeof(Section) );
|
|
// inc it
|
|
m_numSections++;
|
|
// sanity check - breach check
|
|
if ( m_numSections > max ) { char *xx=NULL;*xx=0; }
|
|
// set our parent
|
|
sn->m_parent = current;
|
|
// need to keep a word range that the section covers
|
|
sn->m_a = i;
|
|
// init the flags of the section
|
|
//sn->m_flags = inFlag ;
|
|
// section consists of just this tag
|
|
sn->m_b = i + 1;
|
|
// go on to next
|
|
continue;
|
|
}
|
|
|
|
// a section of multiple br tags in a sequence
|
|
if ( fullTid == TAG_BR ) {
|
|
// try to realloc i guess. should keep ptrs in tact.
|
|
if ( m_numSections >= m_maxNumSections &&
|
|
! growSections() )
|
|
return true;
|
|
// get the section
|
|
Section *sn = &m_sections[m_numSections];
|
|
// clear
|
|
memset ( sn , 0 , sizeof(Section) );
|
|
// inc it
|
|
m_numSections++;
|
|
// sanity check - breach check
|
|
if ( m_numSections > max ) { char *xx=NULL;*xx=0; }
|
|
// set our parent
|
|
sn->m_parent = current;
|
|
// need to keep a word range that the section covers
|
|
sn->m_a = i;
|
|
// init the flags of the section
|
|
//sn->m_flags = inFlag ;
|
|
// count em up
|
|
int32_t brcnt = 1;
|
|
// scan for whole sequence
|
|
int32_t lastBrPos = i;
|
|
for ( int32_t j = i + 1 ; j < nw ; j++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// claim br tags
|
|
if ( tids[j] == TAG_BR ) {
|
|
lastBrPos = j;
|
|
brcnt++;
|
|
continue;
|
|
}
|
|
// break on words
|
|
if ( wids[j] ) break;
|
|
// all spaces is ok
|
|
if ( w->isSpaces(j) ) continue;
|
|
// otherwise, stop on other punct
|
|
break;
|
|
}
|
|
// section consists of just this tag
|
|
sn->m_b = lastBrPos + 1;
|
|
// advance
|
|
i = lastBrPos;
|
|
// set this for later so that getDelimHash() returns
|
|
// something different based on the br count for
|
|
// METHOD_ATTRIBUTE
|
|
sn->m_baseHash = 19999 + brcnt;
|
|
// go on to next
|
|
continue;
|
|
}
|
|
|
|
// get the tag id without the back bit
|
|
nodeid_t tid = fullTid & BACKBITCOMP;
|
|
|
|
// . ignore tags with no corresponding back tags
|
|
// . if they have bad html and have front tags
|
|
// with no corresponding back tags, that will hurt!
|
|
// . make exception for <li> tag!!!
|
|
// . was messing up:
|
|
// http://events.kqed.org/events/index.php?com=detail&
|
|
// eID=9812&year=2009&month=11
|
|
// for parsing out events
|
|
// . make exception for <p> tag too! most ppl use </p>
|
|
if ( ( ! hasBackTag ( tid ) ||
|
|
wptrs[i][1] =='!' || // <!ENTITY rdfns...>
|
|
wptrs[i][1] =='?' ) &&
|
|
tid != TAG_P &&
|
|
tid != TAG_LI )
|
|
continue;
|
|
|
|
// . these imply no back tag
|
|
// . <description />
|
|
// . fixes inconsistency in
|
|
// www.trumba.com/calendars/KRQE_Calendar.rss
|
|
if ( wptrs[i][wlens[i]-2] == '/' && tid == TAG_XMLTAG )
|
|
continue;
|
|
|
|
// ignore it cuz we decided it was unbalanced
|
|
//if ( bits->m_bits[i] & D_UNBAL ) continue;
|
|
|
|
// and these often don't have back tags, like <objectid .> WTF!
|
|
// no! we need these for the xml-based rss feeds
|
|
// TODO: we should parse the whole doc up front and determine
|
|
// which tags are missing back tags...
|
|
//if ( tid == TAG_XMLTAG ) continue;
|
|
|
|
// wtf, embed has no back tags. i fixed this in XmlNode.cpp
|
|
//if ( tid == TAG_EMBED ) continue;
|
|
// do not breach the stack
|
|
if ( stackPtr - stack >= MAXTAGSTACK ) {
|
|
log("html: stack breach for %s",url->getUrl());
|
|
// if we set g_errno and return then the url just
|
|
// ends up getting retried once the spider lock
|
|
// in Spider.cpp expires in MAX_LOCK_AGE seconds.
|
|
// about an hour. but really we should completely
|
|
// give up on this. whereas we should retry OOM errors
|
|
// etc. but this just means bad html really.
|
|
//g_errno = ETAGBREACH;
|
|
// just reset to 0 sections then
|
|
reset();
|
|
return true;
|
|
}
|
|
|
|
char gotBackTag ;
|
|
if ( fullTid != tid ) gotBackTag = 1;
|
|
else gotBackTag = 0;
|
|
|
|
// "pop tid", tid to pop off stack
|
|
nodeid_t ptid = tid;
|
|
nodeid_t fullPopTid = fullTid;
|
|
|
|
// no nested <li> tags allowed
|
|
if ( fullTid == TAG_LI &&
|
|
stackPtr > stack &&
|
|
((stackPtr-1)->m_tid)==TAG_LI )
|
|
gotBackTag = 2;
|
|
|
|
// no nested <b> tags allowed
|
|
if ( fullTid == TAG_B &&
|
|
stackPtr > stack &&
|
|
((stackPtr-1)->m_tid)==TAG_B )
|
|
gotBackTag = 2;
|
|
|
|
// no nested <a> tags allowed
|
|
if ( fullTid == TAG_A &&
|
|
stackPtr > stack &&
|
|
((stackPtr-1)->m_tid)==TAG_A )
|
|
gotBackTag = 2;
|
|
|
|
// no nested <p> tags allowed
|
|
if ( fullTid == TAG_P &&
|
|
stackPtr > stack &&
|
|
((stackPtr-1)->m_tid)==TAG_P )
|
|
gotBackTag = 2;
|
|
|
|
// no <hN> tags inside a <p> tag
|
|
// fixes http://www.law.berkeley.edu/140.htm
|
|
if ( fullTid >= TAG_H1 &&
|
|
fullTid <= TAG_H5 &&
|
|
stackPtr > stack &&
|
|
((stackPtr-1)->m_tid)==TAG_P ) {
|
|
// match this on stack
|
|
ptid = TAG_P;
|
|
fullPopTid = TAG_P;
|
|
gotBackTag = 2;
|
|
}
|
|
|
|
// no nested <td> tags allowed
|
|
if ( fullTid == TAG_TD &&
|
|
stackPtr > stack &&
|
|
((stackPtr-1)->m_tid)==TAG_TD )
|
|
gotBackTag = 2;
|
|
|
|
// encountering <tr> when in a <td> closes the <td> AND
|
|
// should also close the <tr>!!
|
|
if ( fullTid == TAG_TR &&
|
|
stackPtr > stack &&
|
|
((stackPtr-1)->m_tid)==TAG_TD )
|
|
gotBackTag = 2;
|
|
|
|
// no nested <tr> tags allowed
|
|
if ( fullTid == TAG_TR &&
|
|
stackPtr > stack &&
|
|
((stackPtr-1)->m_tid)==TAG_TR )
|
|
gotBackTag = 2;
|
|
|
|
// this is true if we are a BACK TAG
|
|
if ( gotBackTag ) {
|
|
/*
|
|
// . ok, at this point, we are a BACK TAG
|
|
// . turn this shit on and off
|
|
if ( tid == TAG_SCRIPT ) inFlag &= ~SEC_SCRIPT;
|
|
if ( tid == TAG_TABLE ) inFlag &= ~SEC_IN_TABLE;
|
|
if ( tid == TAG_NOSCRIPT) inFlag &= ~SEC_NOSCRIPT;
|
|
if ( tid == TAG_STYLE ) inFlag &= ~SEC_STYLE;
|
|
if ( tid == TAG_MARQUEE ) inFlag &= ~SEC_MARQUEE;
|
|
if ( tid == TAG_SELECT ) inFlag &= ~SEC_SELECT;
|
|
if ( tid == TAG_STRIKE ) inFlag &= ~SEC_STRIKE;
|
|
if ( tid == TAG_S ) inFlag &= ~SEC_STRIKE2;
|
|
//if ( tid == TAG_A ) inFlag &= ~SEC_A;
|
|
if ( tid == TAG_TITLE ) {
|
|
inFlag &= ~SEC_IN_TITLE;
|
|
// first time?
|
|
if ( m_titleEnd == -1 ) m_titleEnd = i;
|
|
}
|
|
if ( tid == TAG_H1 ) inFlag &= ~SEC_IN_HEADER;
|
|
if ( tid == TAG_H2 ) inFlag &= ~SEC_IN_HEADER;
|
|
if ( tid == TAG_H3 ) inFlag &= ~SEC_IN_HEADER;
|
|
if ( tid == TAG_H4 ) inFlag &= ~SEC_IN_HEADER;
|
|
*/
|
|
|
|
// . ignore anchor tags as sections
|
|
// . was causing double the number of its parent tag
|
|
// hashes and messing up Events.cpp's counting using
|
|
// "ct" hashtbl
|
|
//if ( tid == TAG_A ) continue;
|
|
|
|
// ignore span tags that are non-breaking because they
|
|
// do not change the grouping/sectioning behavior of
|
|
// the web page and are often abused.
|
|
if ( ptid == TAG_SPAN ) continue;
|
|
|
|
// fix for gwair.org
|
|
if ( ptid == TAG_FONT ) continue;
|
|
|
|
// too many people use these like a <br> tag or
|
|
// make them open-ended or unbalanced
|
|
//if ( tid == TAG_P ) continue;
|
|
if ( ptid == TAG_CENTER ) continue;
|
|
|
|
subloop:
|
|
// don't blow the stack
|
|
if ( stackPtr == stack ) continue;
|
|
|
|
// point to it
|
|
Tagx *spp = (stackPtr - 1);
|
|
|
|
// assume we should pop stack
|
|
//bool popStack = true;
|
|
|
|
// init it
|
|
Tagx *p ;
|
|
// scan through the stack until we find a
|
|
// front tag that matches this back tag
|
|
//for(p = spp ; p >= stack && gotBackTag == 1 ; p-- ) {
|
|
for ( p = spp ; p >= stack ; p-- ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// no match?
|
|
if ( p->m_tid != ptid ) {
|
|
// matched before? we can pop
|
|
if ( p->m_flags & TXF_MATCHED )
|
|
continue;
|
|
// do not pop it off the stack!
|
|
//popStack = false;
|
|
// keep on going
|
|
continue;
|
|
}
|
|
// do not double match
|
|
if ( p->m_flags & TXF_MATCHED )
|
|
continue;
|
|
// flag it cuz we matched it
|
|
p->m_flags |= TXF_MATCHED;
|
|
// set the stack ptr to it
|
|
spp = p;
|
|
// and stop
|
|
break;
|
|
}
|
|
|
|
// no matching front tag at all?
|
|
// then just ignore this back tag
|
|
if ( p < stack ) continue;
|
|
|
|
// get section number of the front tag
|
|
//int32_t xn = *(secNumPtr-1);
|
|
int32_t xn = spp->m_secNum;
|
|
// sanity
|
|
if ( xn<0 || xn>=m_numSections ) {char*xx=NULL;*xx=0;}
|
|
// get it
|
|
Section *sn = &m_sections[xn];
|
|
|
|
// we are now back in the parent section
|
|
//current = sn;
|
|
// record the word range of the section we complete
|
|
sn->m_b = i+1;
|
|
|
|
// do not include the <li> tag as part of it
|
|
// otherwise we end up with overlapping section since
|
|
// this tag ALSO starts a section!!
|
|
if ( gotBackTag == 2 ) sn->m_b = i;
|
|
|
|
// sanity check
|
|
//if ( sn->m_b <= sn->m_a ) { char *xx=NULL;*xx=0;}
|
|
|
|
/*
|
|
// if our parent got closed before "sn" closed because
|
|
// of an out-of-order back tag issue, then if it
|
|
// no longer contains sn->m_a, then reparent "sn"
|
|
// to its grandparent.
|
|
Section *ps = sn->m_parent;
|
|
for ( ; ps && ps->m_b >= 0 && ps->m_b <= sn->m_a ;
|
|
ps = ps->m_parent );
|
|
// assign it
|
|
sn->m_parent = ps;
|
|
|
|
// . force our parent section to fully contain us
|
|
// . fixes interlaced tag bugs
|
|
if ( sn->m_parent &&
|
|
sn->m_parent->m_b < i &&
|
|
sn->m_parent->m_b >= 0 )
|
|
sn->m_b = sn->m_parent->m_b;
|
|
|
|
// sanity check
|
|
if ( sn->m_b <= sn->m_a ) { char *xx=NULL;*xx=0;}
|
|
*/
|
|
|
|
// if our parent got closed before "sn" closed because
|
|
// it hit its back tag before we hit ours, then we
|
|
// must cut ourselves short and try to match this
|
|
// back tag to another front tag on the stack
|
|
Section *ps = sn->m_parent;
|
|
for ( ; ps != rootSection ; ps = ps->m_parent ) {
|
|
// skip if parent no longer contains us!
|
|
if ( ps->m_b <= sn->m_a ) continue;
|
|
// skip if this parent is still open
|
|
if ( ps->m_b <= 0 ) continue;
|
|
// parent must have closed before us
|
|
if ( ps->m_b > sn->m_b ) {char *xx=NULL;*xx=0;}
|
|
// we had no matching tag, or it was unbalanced
|
|
// but i do not know which...!
|
|
sn->m_flags |= SEC_OPEN_ENDED;
|
|
// cut our end shorter
|
|
sn->m_b = ps->m_b;
|
|
// our TXF_MATCHED bit should still be set
|
|
// for spp->m_flags, so try to match ANOTHER
|
|
// front tag with this back tag now
|
|
if ( ! ( spp->m_flags & TXF_MATCHED ) ) {
|
|
char *xx=NULL;*xx=0; }
|
|
// ok, try to match this back tag with another
|
|
// front tag on the stack, because the front
|
|
// tag we had selected got cut short because
|
|
// its parent forced it to cut short.
|
|
goto subloop;
|
|
}
|
|
|
|
// sanity check
|
|
if ( sn->m_b <= sn->m_a ) { char *xx=NULL;*xx=0;}
|
|
|
|
// mark the section as unbalanced
|
|
if ( spp != (stackPtr - 1) )
|
|
sn->m_flags |= SEC_UNBALANCED;
|
|
|
|
// start this
|
|
//sn->m_alnumCount = alnumCount - sn->m_alnumCount ;
|
|
// . store the final content hash for this section
|
|
// . should be 0 if no alnum words encountered
|
|
//sn->m_contentHash = ch;
|
|
|
|
// . when this was uncommented
|
|
// resources.umwhisp.org/census/Spotsylvania/
|
|
// 1860_berkley_sch1.htm
|
|
// which had <tr><td></tr><tr><td></tr>... was
|
|
// keeping a ton of <td>'s on the stack and
|
|
// never popping them off when it hit a </tr>.
|
|
// and the <tr>s remained on the stack too..
|
|
// . this fixes sfmusictech.com from truncating the
|
|
// table that starts at A=535 just because there
|
|
// is an unclosed <td> tag that has to be closed
|
|
// by a </tr> tag. but for some reason we are coring
|
|
// in sunsetpromotions.com
|
|
// . i think the idea was to let the guys remain
|
|
// on the stack, and close the sections below...
|
|
//if ( ! popStack ) goto skipme;
|
|
|
|
// revert it to this guy, may not equal stackPtr-1 !!
|
|
stackPtr = spp;
|
|
|
|
// if perfect decrement the stack ptr
|
|
//stackPtr--;
|
|
//cumHashPtr--;
|
|
//secNumPtr--;
|
|
//chPtr--;
|
|
// now pop it back
|
|
//ch = *chPtr;
|
|
//ch = stackPtr->m_contentHash;
|
|
// get parent section
|
|
if ( stackPtr > stack ) {
|
|
// get parent section now
|
|
//xn = *(secNumPtr-1);
|
|
xn = (stackPtr-1)->m_secNum;
|
|
// set current to that
|
|
current = &m_sections[xn];
|
|
}
|
|
else {
|
|
//current = NULL;
|
|
//char *xx=NULL;*xx=0;
|
|
// i guess this is bad html!
|
|
current = rootSection;
|
|
}
|
|
|
|
// revert the hash
|
|
//if ( stackPtr > stack ) h = *(cumHashPtr-1);
|
|
//if ( stackPtr > stack ) h = (stackPtr-1)->m_cumHash;
|
|
//else h = 0;
|
|
// debug log
|
|
if ( g_conf.m_logDebugSections ) {
|
|
char *ms = "";
|
|
if ( stackPtr->m_tid != ptid) ms =" UNMATCHED";
|
|
char *back ="";
|
|
if ( fullPopTid & BACKBIT ) back = "/";
|
|
logf(LOG_DEBUG,"section: pop tid=%"INT32" "
|
|
"i=%"INT32" "
|
|
"level=%"INT32" "
|
|
"%s%s "
|
|
//"h=0x%"XINT32""
|
|
"%s",(int32_t)tid,
|
|
i,
|
|
(int32_t)(stackPtr - stack),
|
|
back,g_nodes[tid].m_nodeName,
|
|
//h,
|
|
ms);
|
|
}
|
|
|
|
// . hmmm. did we hit a back tag with no front tag?
|
|
// or we could have been missing another back tag!
|
|
// . if nested properly, this should equal here
|
|
// . if it was a match, we are done, so continue
|
|
//if ( *stackPtr == tid ) continue;
|
|
//if ( stackPtr->m_tid == tid ) continue;
|
|
|
|
// for </table> and </tr> keep popping off until we
|
|
// find it!
|
|
//if ( tid == TAG_TABLE || tid == TAG_TR) goto subloop;
|
|
|
|
// set a flag
|
|
//m_badHtml = true;
|
|
|
|
// if we are a <li> or <ul> tag and just closing
|
|
// an <li> tag on the stack, then we still have
|
|
// to add ourselves as a front tag below...
|
|
//if(fullTid != TAG_LI && fullTid != TAG_UL ) continue;
|
|
//skipme:
|
|
// . if we were a back tag, we are done... but if we
|
|
// were a front tag, we must add ourselves below...
|
|
// . MDW: this seems more logical than the if-statement
|
|
// below...
|
|
if ( fullTid != tid ) continue;
|
|
/*
|
|
if ( fullPopTid != TAG_LI &&
|
|
fullPopTid != TAG_B &&
|
|
fullPopTid != TAG_TR &&
|
|
fullPopTid != TAG_TD &&
|
|
fullPopTid != TAG_P ) continue;
|
|
*/
|
|
}
|
|
|
|
|
|
//
|
|
// if front tag, update "h" and throw it on the stack
|
|
//
|
|
|
|
/*
|
|
// turn this shit on and off
|
|
if ( tid == TAG_SCRIPT ) inFlag |= SEC_SCRIPT;
|
|
else if ( tid == TAG_TABLE ) inFlag |= SEC_IN_TABLE;
|
|
else if ( tid == TAG_NOSCRIPT) inFlag |= SEC_NOSCRIPT;
|
|
else if ( tid == TAG_STYLE ) inFlag |= SEC_STYLE;
|
|
else if ( tid == TAG_MARQUEE ) inFlag |= SEC_MARQUEE;
|
|
else if ( tid == TAG_SELECT ) inFlag |= SEC_SELECT;
|
|
else if ( tid == TAG_STRIKE ) inFlag |= SEC_STRIKE;
|
|
else if ( tid == TAG_S ) inFlag |= SEC_STRIKE2;
|
|
//else if ( tid == TAG_A ) inFlag |= SEC_A;
|
|
else if ( tid == TAG_TITLE ) {
|
|
inFlag |= SEC_IN_TITLE;
|
|
// first time?
|
|
if ( m_titleStart == -1 ) {
|
|
m_titleStart = i;
|
|
// Address.cpp uses this
|
|
m_titleStartAlnumPos = alnumCount;
|
|
}
|
|
}
|
|
else if ( tid == TAG_H1 ) inFlag |= SEC_IN_HEADER;
|
|
else if ( tid == TAG_H2 ) inFlag |= SEC_IN_HEADER;
|
|
else if ( tid == TAG_H3 ) inFlag |= SEC_IN_HEADER;
|
|
else if ( tid == TAG_H4 ) inFlag |= SEC_IN_HEADER;
|
|
*/
|
|
|
|
// . ignore anchor tags as sections
|
|
// . was causing double the number of its parent tag
|
|
// hashes and messing up Events.cpp's counting using
|
|
// "ct" hashtbl
|
|
//if ( tid == TAG_A ) continue;
|
|
|
|
// ignore paragraph/center tags, too many people are careless
|
|
// with them... santafeplayhouse.com
|
|
// i can't ignore <p> tags anymore because for
|
|
// http://www.abqfolkfest.org/resources.shtml we are allowing
|
|
// "Halloween" to have "SEP-DEC" as a header even though
|
|
// that header is BELOW "Halloween" just because we THINK
|
|
// they are in the same section BUT in reality they were
|
|
// in different <p> tags. AND now it seems that the
|
|
// improvements we made to Sections.cpp for closing open
|
|
// ended tags are pretty solid that we can unignore <p>
|
|
// tags again, it only helped the qa run...
|
|
//if ( tid == TAG_P ) continue;
|
|
if ( tid == TAG_CENTER ) continue;
|
|
|
|
if ( tid == TAG_SPAN ) continue;
|
|
// gwair.org has font tags the pair up a date "1st Sundays"
|
|
// with the address above it, and it shouldn't do that!
|
|
if ( tid == TAG_FONT ) continue;
|
|
|
|
// try to realloc i guess. should keep ptrs in tact.
|
|
if ( m_numSections >= m_maxNumSections && ! growSections() )
|
|
return true;
|
|
// get the section
|
|
Section *sn = &m_sections[m_numSections];
|
|
// clear
|
|
memset ( sn , 0 , sizeof(Section) );
|
|
// inc it
|
|
m_numSections++;
|
|
// sanity check - breach check
|
|
if ( m_numSections > max ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . check the last tagId we encountered for this
|
|
// current section.
|
|
// . a basic "majority element" algorithm
|
|
// . the final "tid" if any may be the "majority
|
|
// element"
|
|
// . Address.cpp uses this to eliminate place names
|
|
// that are in a totally different section than the
|
|
// street address and should not be paired up
|
|
//if ( tid != TAG_B ) {
|
|
// if ( current->m_lastTid == tid )
|
|
// current->m_numBackToBackSubsections++;
|
|
// else {
|
|
// current->m_lastTid =tid;
|
|
// current->m_numBackToBackSubsections=1;
|
|
// }
|
|
//}
|
|
// set our parent
|
|
sn->m_parent = current;
|
|
// set this
|
|
current = sn;
|
|
// update this
|
|
//m_sectionPtrs[i] = current;
|
|
// clear this
|
|
//sec_t myFlag = 0;
|
|
|
|
// fix it, we don't like hashes of 0 cuz HashTableT
|
|
// uses a key of 0 to signify an empty bucket!
|
|
//if ( h == 0 ) h = 1;
|
|
// store it, post-hash
|
|
//sn->m_tagHash = h;
|
|
|
|
// add the content hash as well!
|
|
/*
|
|
// . get our parents enumerated tagHash
|
|
// . all tag hashes are enumerated except his tagHash
|
|
int32_t ph = sn->m_parent->m_tagHash;
|
|
// put his enumeration into it
|
|
int32_t eh = hash32h ( ph , sn->m_parent->m_occNum );
|
|
// cap it off with our tag hash with no enumeration
|
|
eh = hash32h ( h , eh );
|
|
// store that
|
|
sn->m_tagHash = eh;
|
|
// what kid # are we for this particular parent?
|
|
int32_t occNum = m_ot.getScore ( (int64_t *)&eh );
|
|
// save our kid #
|
|
sn->m_occNum = occNum;
|
|
// inc it for the next guy
|
|
occNum++;
|
|
// store back. return true with g_errno set on error
|
|
if ( ! m_ot.addTerm ( (int64_t *)&eh ) ) return true;
|
|
*/
|
|
|
|
|
|
/*
|
|
// assume this is invalid
|
|
sn->m_hardTagHash = 0;
|
|
// . this hash is just for breaking tags with back tags
|
|
// . this "section id" is used by Events.cpp
|
|
// . we can have multiple sections with the same
|
|
// m_hardTagHash since it is more restrictive
|
|
if ( isBreakingTagId(tid) && hasBackTag(tid) ) {
|
|
// accumulate
|
|
h2 = hash32h ( tid , h2 );
|
|
// fix it
|
|
//if ( h2 == 0 ) h2 = 1;
|
|
// set it
|
|
sn->m_hardTagHash = h2;
|
|
}
|
|
*/
|
|
// need to keep a word range that the section covers
|
|
sn->m_a = i;
|
|
// start this
|
|
//sn->m_alnumCount = alnumCount;
|
|
// init the flags of the section
|
|
//sn->m_flags = inFlag ;//| myFlag ;
|
|
// assume no terminating bookend
|
|
sn->m_b = -1;
|
|
|
|
// if tagged as an unbalanced tag, do not push onto stack
|
|
//if ( bits->m_bits[i] & D_UNBAL ) continue;
|
|
|
|
// push a unique id on the stack so we can pop if we
|
|
// enter a subsection
|
|
stackPtr->m_tid = tid;
|
|
//stackPtr->m_cumHash = h;
|
|
stackPtr->m_secNum = m_numSections - 1;
|
|
//stackPtr->m_contentHash = ch;
|
|
stackPtr->m_flags = 0;
|
|
stackPtr++;
|
|
|
|
//*stackPtr ++ = tid;
|
|
//*cumHashPtr++ = h;
|
|
//*secNumPtr ++ = m_numSections - 1;
|
|
|
|
// push this for when we hit the back tag we pop it
|
|
//*chPtr ++ = ch;
|
|
// reset it since starting a new section
|
|
//ch = 0;
|
|
// depth or level
|
|
//sn->m_depth = stackPtr - stack;
|
|
// debug log
|
|
if ( ! g_conf.m_logDebugSections ) continue;
|
|
//int32_t back = 0;
|
|
//if ( fullTid & BACKBIT ) back = 1;
|
|
logf(LOG_DEBUG,"section: push tid=%"INT32" "
|
|
"i=%"INT32" "
|
|
"level=%"INT32" "
|
|
"%s "
|
|
//"back=%"INT32""
|
|
//" h=0x%"XINT32"",
|
|
,
|
|
(int32_t)tid,
|
|
i,
|
|
(int32_t)(stackPtr - stack)-1,
|
|
g_nodes[(int32_t)tid].m_nodeName
|
|
//,back);//,h);
|
|
);
|
|
}
|
|
|
|
/*
|
|
// <center> tags should end at next <center>... like <li>
|
|
// OR we could end it after our FIRST CHILD... or if text
|
|
// follows, after that text... well the browser seems to not
|
|
// close these center tags... sooo. let's comment this out
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Section *sn = &m_sections[i];
|
|
// skip if not open ended
|
|
if ( sn->m_b >= 0 ) continue;
|
|
// skip if not a center or p tag
|
|
if ( sn->m_tagId != TAG_CENTER &&
|
|
sn->m_tagId != TAG_P ) continue;
|
|
// close if open ended on the next <center> or <p> tag
|
|
if ( sn->m_next && sn->m_next->m_a < sn->m_b )
|
|
sn->m_b = sn->m_next->m_a;
|
|
}
|
|
*/
|
|
|
|
// if first word in a section falls outside of the parent section
|
|
// then reparent to the grandparent. this can happen when we end
|
|
// up closing a parent section before ???????
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
// get it
|
|
Section *si = &m_sections[i];
|
|
// skip if we are still open-ended
|
|
if ( si->m_b < 0 ) continue;
|
|
// get parent
|
|
Section *sp = si->m_parent;
|
|
// skip if no parent
|
|
if ( ! sp ) continue;
|
|
// skip if parent still open ended
|
|
if ( sp->m_b < 0 ) continue;
|
|
// subloop it
|
|
doagain:
|
|
// skip if no parent
|
|
if ( ! sp ) continue;
|
|
// parent must start before us
|
|
if ( sp->m_a > si->m_a ) { char *xx=NULL;*xx=0; }
|
|
// . does parent contain our first word?
|
|
// . it need not fully contain our last word!!!
|
|
if ( sp->m_a <= si->m_a && sp->m_b > si->m_a ) continue;
|
|
// if parent is open ended, then it is ok for now
|
|
if ( sp->m_a <= si->m_a && sp->m_b == -1 ) continue;
|
|
// get grandparent
|
|
sp = sp->m_parent;
|
|
// set
|
|
si->m_parent = sp;
|
|
// try again
|
|
goto doagain;
|
|
}
|
|
|
|
bool inFrame = false;
|
|
int32_t gbFrameNum = 0;
|
|
|
|
//
|
|
// . set Section::m_xmlNameHash for xml tags here
|
|
// . set Section::m_frameNum and SEC_IN_GBFRAME bit
|
|
//
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Section *sn = &m_sections[i];
|
|
// get it
|
|
int32_t ws = sn->m_a;
|
|
// shortcut
|
|
nodeid_t tid = tids[ws];
|
|
|
|
// set SEC_IN_FRAME
|
|
if ( tid == TAG_GBFRAME ) {
|
|
// start or end?
|
|
gbFrameNum++;
|
|
inFrame = true;
|
|
}
|
|
if ( tid == (TAG_GBFRAME | BACKBIT) )
|
|
inFrame = false;
|
|
|
|
// mark it
|
|
if ( inFrame )
|
|
sn->m_gbFrameNum = gbFrameNum;
|
|
|
|
// custom xml tag, hash the tag itself
|
|
if ( tid != TAG_XMLTAG ) continue;
|
|
// stop at first space to avoid fields!!
|
|
char *p = m_wptrs[ws] + 1;
|
|
char *pend = p + m_wlens[ws];
|
|
// skip back tags
|
|
if ( *p == '/' ) continue;
|
|
// reset hash
|
|
int64_t xh = 0;
|
|
// and hash char count
|
|
unsigned char cnt = 0;
|
|
// hash till space or / or >
|
|
for ( ; p < pend ; p++ ) {
|
|
// stop on space or / or >
|
|
if ( is_wspace_a(*p) ) break;
|
|
if ( *p == '/' ) break;
|
|
if ( *p == '>' ) break;
|
|
// hash it in
|
|
xh ^= g_hashtab[cnt++][(unsigned char )*p];
|
|
}
|
|
// sanity check
|
|
//if ( ! xh ) { char *xx=NULL;*xx=0; }
|
|
// if it is a string of the same chars it can be 0
|
|
if ( ! xh ) xh = 1;
|
|
// store that
|
|
sn->m_xmlNameHash = (int32_t)xh;
|
|
}
|
|
|
|
|
|
|
|
// find any open ended tags and constrain them based on their parent
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Section *si = &m_sections[i];
|
|
// get its parent
|
|
Section *ps = si->m_parent;
|
|
// if parent is open-ended panic!
|
|
if ( ps && ps->m_b < 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// if our parent got constrained from under us, we need
|
|
// to telescope to a new parent
|
|
for ( ; ps && ps->m_b >= 0 && ps->m_b <= si->m_a ; ) {
|
|
ps = ps->m_parent;
|
|
si->m_parent = ps;
|
|
}
|
|
|
|
// assume end is end of doc
|
|
int32_t end = m_words->getNumWords();
|
|
// get end of parent
|
|
if ( ps ) end = ps->m_b;
|
|
// flag it
|
|
if ( si->m_b == -1 ) si->m_flags |= SEC_OPEN_ENDED;
|
|
// shrink our section if parent ends before us OR if we
|
|
// are open ended
|
|
if ( si->m_b != -1 && si->m_b <= end ) continue;
|
|
// this might constrain someone's parent such that
|
|
// that someone no longer can use that parent!!
|
|
si->m_b = end;
|
|
// . get our tag type
|
|
// . use int32_t instead of nodeid_t so we can re-set this
|
|
// to the xml tag hash if we need to
|
|
int32_t tid1 = m_tids[si->m_a];
|
|
// use the tag hash if this is an xml tag
|
|
if ( tid1 == TAG_XMLTAG ) {
|
|
// we computed this above
|
|
tid1 = si->m_xmlNameHash;
|
|
// skip if zero!
|
|
if ( ! tid1 ) continue;
|
|
}
|
|
// must be there to be open ended
|
|
if ( ! tid1 ) { char *xx=NULL;*xx=0; }
|
|
// flag it for later
|
|
//si->m_flags |= SEC_CONSTRAINED;
|
|
// NOW, see if within that parent there is actually another
|
|
// tag after us of our same tag type, then use that to
|
|
// constrain us instead!!
|
|
// this hurts <p><table><tr><td><p>.... because it
|
|
// uses that 2nd <p> tag to constrain si->m_b of the first
|
|
// <p> tag which is not right! sunsetpromotions.com has that.
|
|
for ( int32_t j = i + 1 ; j < m_numSections ; j++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get it
|
|
Section *sj = &m_sections[j];
|
|
// get word start
|
|
int32_t a = sj->m_a;
|
|
// skip if ties with us already
|
|
if ( a == si->m_a ) continue;
|
|
// stop if out
|
|
if ( a >= end ) break;
|
|
|
|
// . it must be in the same expanded frame src, if any
|
|
// . this fixes trulia.com which was ending our html
|
|
// tag, which was open-ended, with the html tag in
|
|
// a frame src expansion
|
|
if ( sj->m_gbFrameNum != si->m_gbFrameNum ) continue;
|
|
// fix sunsetpromotions.com bug. see above.
|
|
if ( sj->m_parent != si->m_parent ) continue;
|
|
// get its tid
|
|
int32_t tid2 = tids[a];
|
|
// use base hash if xml tag
|
|
if ( tid2 == TAG_XMLTAG )
|
|
tid2 = sj->m_xmlNameHash;
|
|
// must be our tag type!
|
|
if ( tid2 != tid1 ) continue;
|
|
// ok end us there instead!
|
|
si->m_b = a;
|
|
// stop
|
|
break;
|
|
}
|
|
}
|
|
|
|
|
|
// reparent again now that things are closed
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
// get it
|
|
Section *si = &m_sections[i];
|
|
// sanity check: no section may still be open-ended here (deliberate crash if so)
|
|
if ( si->m_b < 0 ) { char *xx=NULL;*xx=0; }
|
|
// get parent
|
|
Section *sp = si->m_parent;
|
|
// skip if null
|
|
if ( ! sp ) continue;
|
|
// sanity check: parent may not still be open-ended here (deliberate crash if so)
|
|
if ( sp->m_b < 0 ) { char *xx=NULL;*xx=0; }
|
|
// subloop it
|
|
doagain2:
|
|
// skip if no parent
|
|
if ( ! sp ) continue;
|
|
// . does parent contain our first word?
|
|
// . it need not fully contain our last word!!!
|
|
if ( sp->m_a <= si->m_a && sp->m_b > si->m_a ) continue;
|
|
// if parent is open ended, then it is ok for now
|
|
if ( sp->m_a <= si->m_a && sp->m_b == -1 ) continue;
|
|
// sanity check: an open-ended parent reaching this point is a bug (deliberate crash)
|
|
if ( sp->m_b == -1 ) { char *xx=NULL;*xx=0; }
|
|
// get grandparent
|
|
sp = sp->m_parent;
|
|
// set
|
|
si->m_parent = sp;
|
|
// try again
|
|
goto doagain2;
|
|
}
|
|
|
|
|
|
m_isTestColl = ! strcmp(m_coll,"qatest123") ;
|
|
|
|
//
|
|
//
|
|
// now assign m_sectionPtrs[] which map a word to the first
|
|
// section that contains it
|
|
//
|
|
//
|
|
Section *dstack[MAXTAGSTACK];
|
|
int32_t ns = 0;
|
|
int32_t j = 0;
|
|
current = m_rootSection;//&m_sections[0];
|
|
Section *next = m_rootSection;//&m_sections[0];
|
|
// scan every word, keeping a stack of the sections that are open at it
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
|
// pop all off the stack that match us
|
|
for ( ; ns>0 && dstack[ns-1]->m_b == i ; ) {
|
|
ns--;
|
|
current = dstack[ns-1];
|
|
}
|
|
// push our current section onto the stack if i equals
|
|
// its first word #
|
|
for ( ; next && i == next->m_a ; ) {
|
|
dstack[ns++] = next;
|
|
// sanity check
|
|
//if ( next->m_a == next->m_b ) { char *xx=NULL;*xx=0;}
|
|
// set our current section to this now
|
|
current = next;
|
|
// get next section for setting "next"
|
|
j++;
|
|
// if no more left, set "next" to NULL and stop loop
|
|
if ( j >= m_numSections ) { next=NULL; break; }
|
|
// grab it
|
|
next = &m_sections[j];
|
|
}
|
|
// assign
|
|
m_sectionPtrs[i] = current;
|
|
}
|
|
|
|
if ( m_isTestColl ) {
|
|
// map each word to a section that contains it at least
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
|
Section *si = m_sectionPtrs[i];
|
|
if ( si->m_a > i ) { char *xx=NULL;*xx=0; }
|
|
if ( si->m_b <= i ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
}
|
|
|
|
// . addImpliedSections() requires Section::m_baseHash
|
|
// . set Section::m_baseHash
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// these have to be in order of sn->m_a to work right
|
|
// because we rely on the parent tag hash, which would not
|
|
// necessarily be set if we were not sorted, because the
|
|
// parent section could have SEC_FAKE flag set because it is
|
|
// a br section added afterwards.
|
|
Section *sn = &m_sections[i];
|
|
// get word start into "ws"
|
|
int32_t ws = sn->m_a;
|
|
// shortcut
|
|
nodeid_t tid = tids[ws];
|
|
// sanity check, <a> guys are not sections
|
|
//if ( tid == TAG_A &&
|
|
// !(sn->m_flags & SEC_SENTENCE) ) { char *xx=NULL;*xx=0; }
|
|
// use a modified tid as the tag hash?
|
|
int64_t mtid = tid;
|
|
// custom xml tag, hash the tag itself
|
|
if ( tid == TAG_XMLTAG )
|
|
mtid = hash32 ( wptrs[ws], wlens[ws] );
|
|
// an unknown tag like <!! ...->
|
|
if ( tid == 0 )
|
|
mtid = 1;
|
|
// . if we are a div tag, mod it
|
|
// . treat the fields in the div tag as
|
|
// part of the tag hash.
|
|
// . helps Events.cpp be more precise about
|
|
// section identification!!!!
|
|
// . we now do this for TD and TR so Nov 2009 can telescope for
|
|
// http://10.5.1.203:8000/test/doc.17096238520293298312.html
|
|
// so the calendar title "Nov 2009" can affect all dates
|
|
// below the calendar.
|
|
if ( tid == TAG_DIV ||
|
|
tid == TAG_TD ||
|
|
tid == TAG_TR ||
|
|
tid == TAG_LI || // newmexico.org urls class=xxx
|
|
tid == TAG_UL || // newmexico.org urls class=xxx
|
|
tid == TAG_P || // <p class="pstrg"> stjohnscollege.edu
|
|
tid == TAG_SPAN ) {
|
|
// get ptr
|
|
uint8_t *p = (uint8_t *)wptrs[ws];
|
|
// skip <
|
|
p++;
|
|
// skip following alnums, that is the tag name
|
|
for ( ; is_alnum_a(*p) ; p++ );
|
|
//if ( tid == TAG_DIV ) p += 4;
|
|
// skip "<td" or "<tr"
|
|
//else p += 3;
|
|
// scan for "id" or "class" in it
|
|
// . i had to increase this because we were missing
|
|
// some stuff causing us to get the wrong implied
|
|
// sections for
|
|
// www.guysndollsllc.com/page5/page4/page4.html
|
|
// causing "The Remains" to be paired up with
|
|
// "Aug 7, 2010" in an implied section which was
|
|
// just wrong. it was 20, i made it 100...
|
|
uint8_t *pend = p + 100;
|
|
// position ptr
|
|
unsigned char cnt = 0;
|
|
// a flag
|
|
bool skipTillSpace = false;
|
|
// . just hash every freakin char i guess
|
|
// . TODO: maybe don't hash "width" for <td><tr>
|
|
for ( ; *p && *p !='>' && p < pend ; p++ ) {
|
|
// skip bgcolor= tags because panjea.org
|
|
// interlaces different colored <tr>s in the
|
|
// table and i want them to be seen as brother
|
|
// sections, mostly for the benefit of the
|
|
// setting of lastBrother1/2 in Events.cpp
|
|
if ( is_wspace_a(p[0]) &&
|
|
to_lower_a (p[1])=='b' &&
|
|
to_lower_a (p[2])=='g' ) {
|
|
skipTillSpace = true;
|
|
continue;
|
|
}
|
|
// and skip height=* tags so cabq.gov which
|
|
// uses varying <td> height attributes will
|
|
// have its google map links have the same
|
|
// tag hash so TSF_PAGE_REPEAT gets set
|
|
// in Events.cpp and they are not event titles.
|
|
// it has other chaotic nested tag issues
|
|
// so let's take this out.
|
|
/*
|
|
if ( is_wspace_a(p[0]) &&
|
|
to_lower_a (p[1])=='h' &&
|
|
to_lower_a (p[2])=='e' &&
|
|
to_lower_a (p[3])=='i' &&
|
|
to_lower_a (p[4])=='g' &&
|
|
to_lower_a (p[5])=='h' &&
|
|
to_lower_a (p[6])=='t' ) {
|
|
skipTillSpace = true;
|
|
continue;
|
|
}
|
|
*/
|
|
|
|
// if not a space continue
|
|
if ( skipTillSpace ) {
|
|
if ( ! is_wspace_a(*p) ) continue;
|
|
skipTillSpace = false;
|
|
}
|
|
// do not hash until we get a space
|
|
if ( skipTillSpace ) continue;
|
|
// skip if not alnum
|
|
if ( !is_alnum_a(*p)) continue;
|
|
// hash it in
|
|
mtid ^= g_hashtab[cnt++][(unsigned char)*p];
|
|
}
|
|
}
|
|
// should not have either of these yet!
|
|
if ( sn->m_flags & SEC_FAKE ) { char *xx=NULL;*xx=0; }
|
|
if ( sn->m_flags & SEC_SENTENCE ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
if ( mtid == 0 ) { char *xx=NULL;*xx=0; }
|
|
// . set the base hash, usually just tid
|
|
// . usually base hash is zero but if it is a br tag
|
|
// we set it to something special to indicate the number
|
|
// of br tags in the sequence
|
|
sn->m_baseHash ^= mtid;
|
|
// fix this
|
|
if ( sn == rootSection ) sn->m_baseHash = 1;
|
|
// fix root section i guess
|
|
if ( sn->m_baseHash == 0 ) {
|
|
// fix core on gk21
|
|
sn->m_baseHash = 2;
|
|
//char *xx=NULL;*xx=0; }
|
|
}
|
|
// set this now too WHY? should already be set!!! was
|
|
// causing the root section to become a title section
|
|
// because first word was "<title>". then every word in
|
|
// the doc got SEC_IN_TITLE set and did not get hashed
|
|
// in XmlDoc::hashBody()... NOR in XmlDoc::hashTitle()!!!
|
|
if ( sn != rootSection ) // || tid != TAG_TITLE )
|
|
sn->m_tagId = tid;
|
|
|
|
|
|
//
|
|
// set m_turkBaseHash
|
|
//
|
|
// . using just the tid based turkTagHash is not as good
|
|
// as incorporating the class of tags because we can then
|
|
// distinguish between more types of tags in a document and
|
|
// that is kind of what "class" is used for
|
|
//
|
|
// use this = "Class" tid
|
|
int64_t ctid = tid;
|
|
// get ptr
|
|
uint8_t *p = (uint8_t *)wptrs[ws];
|
|
// skip <
|
|
p++;
|
|
// skip following alnums, that is the tag name
|
|
for ( ; is_alnum_a(*p) ; p++ );
|
|
// scan for "class" in it
|
|
uint8_t *pend = p + 100;
|
|
// position ptr
|
|
unsigned char cnt = 0;
|
|
// a flag
|
|
bool inClass = false;
|
|
// . just hash every freakin char i guess
|
|
// . TODO: maybe don't hash "width" for <td><tr>
|
|
for ( ; *p && *p !='>' && p < pend ; p++ ) {
|
|
// hashing it up?
|
|
if ( inClass ) {
|
|
// all done if space
|
|
if ( is_wspace_a(*p) ) break;
|
|
// skip if not alnum
|
|
//if ( !is_alnum_a(*p)) continue;
|
|
// hash it in
|
|
ctid ^= g_hashtab[cnt++][(unsigned char)*p];
|
|
}
|
|
// look for " class="
|
|
if ( p[0] !=' ' ) continue;
|
|
if ( to_lower_a(p[1]) != 'c' ) continue;
|
|
if ( to_lower_a(p[2]) != 'l' ) continue;
|
|
if ( to_lower_a(p[3]) != 'a' ) continue;
|
|
if ( to_lower_a(p[4]) != 's' ) continue;
|
|
if ( to_lower_a(p[5]) != 's' ) continue;
|
|
if ( to_lower_a(p[6]) != '=' ) continue;
|
|
// mark it
|
|
inClass = true;
|
|
// skip over it
|
|
p += 6;
|
|
}
|
|
// if root section has no tag this is zero and will core
|
|
// in Dates.cpp where it checks m_turkTagHash32 to be zero
|
|
if ( ctid == 0 ) ctid = 999999;
|
|
// set it for setting m_turkTagHash32
|
|
sn->m_turkBaseHash = ctid;
|
|
// always make root turkbasehash be 999999.
|
|
// if root section did not start with tag it's turkBaseHash
|
|
// will be 999999. but a root section that did start with
|
|
// a tag will have a different turk base hash.
|
|
// will be the same, even if one leads with some punct.
|
|
// fix fandango.com and its kid.
|
|
if ( sn == m_rootSection )
|
|
sn->m_turkBaseHash = 999999;
|
|
}
|
|
|
|
|
|
// set up our linked list, the functions below will insert sections
|
|
// and modify this linked list
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// set it
|
|
if ( i + 1 < m_numSections )
|
|
m_sections[i].m_next = &m_sections[i+1];
|
|
if ( i - 1 >= 0 )
|
|
m_sections[i].m_prev = &m_sections[i-1];
|
|
}
|
|
|
|
// i would say <hr> is kinda like an <h0>, so do it first
|
|
//splitSections ( "<hr" , (int32_t)BH_HR );
|
|
|
|
// init to -1 to indicate none
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// reset it
|
|
si->m_firstWordPos = -1;
|
|
si->m_lastWordPos = -1;
|
|
si->m_senta = -1;
|
|
si->m_sentb = -1;
|
|
si->m_firstPlaceNum = -1;
|
|
si->m_headRowSection = NULL;
|
|
si->m_headColSection = NULL;
|
|
}
|
|
// now set position of first word each section contains
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if not alnum word
|
|
if ( ! m_wids[i] ) continue;
|
|
// get smallest section containing
|
|
Section *si = m_sectionPtrs[i];
|
|
// do each parent as well
|
|
for ( ; si ; si = si->m_parent ) {
|
|
// skip if already had one!
|
|
if ( si->m_firstWordPos >= 0 ) break;
|
|
// otherwise, we are it
|
|
si->m_firstWordPos = i;
|
|
// . set format hash of it
|
|
// . do it manually since tagHash not set yet
|
|
}
|
|
}
|
|
// and last word position
|
|
for ( int32_t i = m_nw - 1 ; i > 0 ; i-- ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if not alnum word
|
|
if ( ! m_wids[i] ) continue;
|
|
// get smallest section containing
|
|
Section *si = m_sectionPtrs[i];
|
|
// do each parent as well
|
|
for ( ; si ; si = si->m_parent ) {
|
|
// skip if already had one!
|
|
if ( si->m_lastWordPos >= 0 ) break;
|
|
// otherwise, we are it
|
|
si->m_lastWordPos = i;
|
|
}
|
|
}
|
|
|
|
sec_t inFlag = 0;
|
|
int32_t istack[1000];
|
|
sec_t iflags[1000];
|
|
int32_t ni = 0;
|
|
//
|
|
// now set the inFlags here because the tags might not have all
|
|
// been closed, making tags like SEC_STYLE overflow from where
|
|
// they should be...
|
|
//
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// did we exceed a tag boundary?
|
|
for ( ; ni>0 && si->m_a >= istack[ni-1] ; ) {
|
|
// undo flag
|
|
inFlag &= ~iflags[ni-1];
|
|
// pop off
|
|
ni--;
|
|
}
|
|
// get the flag if any into mf
|
|
sec_t mf = 0;
|
|
// skip if not special tag id
|
|
nodeid_t tid = si->m_tagId;
|
|
if ( tid == TAG_SCRIPT ) mf = SEC_SCRIPT;
|
|
else if ( tid == TAG_TABLE ) mf = SEC_IN_TABLE;
|
|
else if ( tid == TAG_NOSCRIPT) mf = SEC_NOSCRIPT;
|
|
else if ( tid == TAG_STYLE ) mf = SEC_STYLE;
|
|
else if ( tid == TAG_MARQUEE ) mf = SEC_MARQUEE;
|
|
else if ( tid == TAG_SELECT ) mf = SEC_SELECT;
|
|
else if ( tid == TAG_STRIKE ) mf = SEC_STRIKE;
|
|
else if ( tid == TAG_S ) mf = SEC_STRIKE2;
|
|
else if ( tid == TAG_H1 ) mf = SEC_IN_HEADER;
|
|
else if ( tid == TAG_H2 ) mf = SEC_IN_HEADER;
|
|
else if ( tid == TAG_H3 ) mf = SEC_IN_HEADER;
|
|
else if ( tid == TAG_H4 ) mf = SEC_IN_HEADER;
|
|
else if ( tid == TAG_TITLE ) mf = SEC_IN_TITLE;
|
|
// accumulate
|
|
inFlag |= mf;
|
|
// add in the flags
|
|
si->m_flags |= inFlag;
|
|
// skip if nothing special
|
|
if ( ! mf ) continue;
|
|
// sanity
|
|
if ( ni >= 1000 ) { char *xx=NULL;*xx=0; }
|
|
// otherwise, store on stack
|
|
istack[ni] = si->m_b;
|
|
iflags[ni] = mf;
|
|
ni++;
|
|
|
|
// title is special
|
|
if ( tid == TAG_TITLE && m_titleStart == -1 ) {
|
|
m_titleStart = si->m_a; // i;
|
|
// Address.cpp uses this
|
|
m_titleStartAlnumPos = alnumCount;
|
|
}
|
|
}
|
|
|
|
// . now we insert sentence sections
|
|
// . find the smallest section containing the first and last
|
|
// word of each sentence and inserts a subsection into that
|
|
// . we have to be careful to reparent, etc.
|
|
// . kinda copy splitSections() function
|
|
// . maybe add an "insertSection()" function???
|
|
if ( m_contentType != CT_JS ) {
|
|
// add sentence sections
|
|
if ( ! addSentenceSections() ) return true;
|
|
// this is needed by setSentFlags()
|
|
setNextSentPtrs();
|
|
// returns false and sets g_errno on error
|
|
if ( ! setSentFlagsPart1 ( ) ) return true;
|
|
}
|
|
|
|
|
|
//addSentenceSections();
|
|
|
|
// tuna texa satafexplayhouse.com page uses header tags to divide
|
|
// the page up into sections
|
|
// CRAP! but graffiti.org starts its event sections with h4 tags
|
|
// and those sections contain h3 tags! which results in us capturing
|
|
// the wrong event header in each event section, so i commented this
|
|
// out... and santafeplayhouse.com still seems to work!
|
|
//splitSections ("<h1", (int32_t)BH_H1 );
|
|
//splitSections ("<h2", (int32_t)BH_H2 );
|
|
//splitSections ("<h3", (int32_t)BH_H3 );
|
|
//splitSections ("<h4", (int32_t)BH_H4 );
|
|
|
|
|
|
// . set m_nextBrother
|
|
// . we call this now to aid in setHeadingBit() and for adding the
|
|
// implied sections, but it is ultimately
|
|
// called a second time once all the new sections are inserted
|
|
setNextBrotherPtrs ( false ); // setContainer = false
|
|
|
|
/*
|
|
// setHeadingBit() relies on Section::m_tagHash, so even though we
|
|
// have not added implied sections, do this here, and then do it again
|
|
// after adding implied sections
|
|
setTagHashes();
|
|
*/
|
|
|
|
// . set SEC_HEADING bit
|
|
// . need this before implied sections
|
|
setHeadingBit ();
|
|
|
|
//
|
|
// set SEC_HR_CONTAINER bit for use by addHeaderImpliedSections(true)
|
|
// fix for folkmads.org which has <tr><td><div><hr></div>...
|
|
//
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if not hr
|
|
if ( si->m_tagId != TAG_HR ) continue;
|
|
// cycle this
|
|
Section *sp = si;
|
|
// blow up right before it has text
|
|
for ( ; sp ; sp = sp->m_parent ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// set this
|
|
sp->m_flags |= SEC_HR_CONTAINER;
|
|
// stop if parent has text
|
|
if ( sp->m_parent &&
|
|
sp->m_parent->m_firstWordPos >= 0 ) break;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
// HACK: back to back br tags
|
|
//splitSections ( (char *)0x01 , (int32_t)BH_BRBR );
|
|
|
|
// single br tags
|
|
//splitSections ( "<br" , (int32_t)BH_BR );
|
|
|
|
|
|
// just in case we added more sections
|
|
//setNextBrotherPtrs ( false );
|
|
|
|
/*
|
|
//
|
|
// now add implied sections based on dates
|
|
//
|
|
again:
|
|
int32_t added = addDateBasedImpliedSections();
|
|
// g_errno should be set when we return -1 on error
|
|
if ( added < 0 ) return true;
|
|
// if added something, try again
|
|
if ( added > 0 ) goto again;
|
|
|
|
|
|
// •
|
|
char bullet[4];
|
|
bullet[0] = 226;
|
|
bullet[1] = 128;
|
|
bullet[2] = 162;
|
|
bullet[3] = 0;
|
|
splitSections ( bullet , BH_BULLET );
|
|
|
|
// just in case we added more sections
|
|
setNextBrotherPtrs ( false );
|
|
|
|
// now set it again after adding implied sections
|
|
*/
|
|
|
|
setTagHashes();
|
|
|
|
/*
|
|
// now set m_sorted since we added some sections out of order
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ )
|
|
m_sorted[i] = &m_sections[i];
|
|
// sort them
|
|
char tmpBuf[30000];
|
|
char *useBuf = tmpBuf;
|
|
// do not breach buffer
|
|
int32_t need2 = m_numSections * sizeof(Section *) + 32 ;
|
|
if ( need2 > 30000 ) {
|
|
useBuf = (char *)mmalloc ( need2, "gbmrgtmp");
|
|
if ( ! useBuf ) return true;
|
|
}
|
|
gbmergesort ( m_sorted ,
|
|
m_numSections ,
|
|
sizeof(Section *),
|
|
cmp ,
|
|
m_niceness ,
|
|
useBuf ,
|
|
need2 );
|
|
|
|
// free it now
|
|
if ( useBuf != tmpBuf ) mfree ( useBuf , need2 ,"gbmrgtmp");
|
|
|
|
// now set m_sortedIndex of each section so getSubfieldTable() can use
|
|
// that in Address.cpp to just check to see if each subsection has
|
|
// and address/email/fieldname/etc.
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ )
|
|
m_sorted[i]->m_sortedIndex = i;
|
|
*/
|
|
|
|
//
|
|
//
|
|
// TODO TODO
|
|
//
|
|
// TAKE OUT THESE SANITY CHECKS TO SPEED UP!!!!!!
|
|
//
|
|
//
|
|
|
|
// we seem to be pretty good, so remove these now. comment this
|
|
// goto out if you change sections.cpp and want to make sure its good
|
|
//if ( m_isTestColl ) verifySections();
|
|
|
|
// this caused us to not set m_nsvt from the sectionsVotes because
|
|
// the content was blanked out because XmlDoc::m_skipIndexing was
|
|
// set to true! but we still need the section votes to be consistent
|
|
// so we can delete them from Sectiondb if we need to.
|
|
//if ( m_numSections == 0 ) return true;
|
|
|
|
// now we call it twice, so make it a function
|
|
//setTagHashes();
|
|
|
|
// clear this
|
|
bool isHidden = false;
|
|
int32_t startHide = 0x7fffffff;
|
|
int32_t endHide = 0 ;
|
|
//int32_t numTitles = 0;
|
|
//Section *lastTitleParent = NULL;
|
|
Section *firstTitle = NULL;
|
|
// now that we have closed any open tag, set the SEC_HIDDEN bit
|
|
// for all sections that are like <div style=display:none>
|
|
//for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// set m_lastSection so we can scan backwards
|
|
m_lastSection = sn;
|
|
// set SEC_SECOND_TITLE flag
|
|
if ( sn->m_tagId == TAG_TITLE ) {
|
|
// . reset if different parent.
|
|
// . fixes trumba.com rss feed with <title> tags
|
|
if ( firstTitle &&
|
|
firstTitle->m_parent != sn->m_parent )
|
|
firstTitle = NULL;
|
|
// inc the count
|
|
//numTitles++;
|
|
// 2+ is bad. fixes wholefoodsmarket.com 2nd title
|
|
if ( firstTitle ) sn->m_flags |= SEC_SECOND_TITLE;
|
|
// set it if need to
|
|
if ( ! firstTitle ) firstTitle = sn;
|
|
// store our parent
|
|
//lastTitleParent = sn->m_parent;
|
|
}
|
|
// get it
|
|
//Section *sn = m_sorted[i]; // sections[i];
|
|
// set this
|
|
int32_t wn = sn->m_a;
|
|
// stop hiding it?
|
|
if ( isHidden ) {
|
|
// turn it off if not contained
|
|
if ( wn >= endHide ) isHidden = false;
|
|
//else if ( wn <= startHide ) isHidden = false;
|
|
else sn->m_flags |= SEC_HIDDEN;
|
|
}
|
|
// get tag id
|
|
nodeid_t tid = sn->m_tagId;//tids[wn];
|
|
// is div, td or tr tag start?
|
|
if ( tid!=TAG_DIV &&
|
|
tid!=TAG_TD &&
|
|
tid!=TAG_TR &&
|
|
tid!=TAG_UL &&
|
|
tid!=TAG_SPAN) continue;
|
|
|
|
// . if we are a div tag, mod it
|
|
// . treat the fields in the div tag as
|
|
// part of the tag hash.
|
|
// . helps Events.cpp be more precise about
|
|
// section identification!!!!
|
|
// . we now do this for TD and TR so Nov 2009 can telescope for
|
|
// http://10.5.1.203:8000/test/doc.17096238520293298312.html
|
|
// so the calendar title "Nov 2009" can affect all dates
|
|
// below the calendar.
|
|
|
|
// get the style tag in there and check it for "display: none"!
|
|
int32_t slen = wlens[wn];
|
|
char *s = wptrs[wn];
|
|
char *send = s + slen;
|
|
// check out any div tag that has a style
|
|
char *style = gb_strncasestr(s,slen,"style=") ;
|
|
if ( ! style ) continue;
|
|
// . check for hidden
|
|
// . if no hidden tag assume it is UNhidden
|
|
// . TODO: later push & pop on stack
|
|
char *ds = gb_strncasestr(style,send-style,"display:");
|
|
// if display:none not found turn off SEC_HIDDEN
|
|
if ( ! ds || ! gb_strncasestr(s,slen,"none") ) {
|
|
// turn off the hiding
|
|
isHidden = false;
|
|
// off in us too
|
|
sn->m_flags &= ~SEC_HIDDEN;
|
|
continue;
|
|
}
|
|
// mark all sections in this with the tag
|
|
isHidden = true;
|
|
// on in us
|
|
sn->m_flags |= SEC_HIDDEN;
|
|
// stop it after this word for sure
|
|
if ( sn->m_b > endHide ) endHide = sn->m_b;
|
|
if ( sn->m_a < startHide ) startHide = sn->m_a;
|
|
}
|
|
|
|
|
|
// now set the content hash of each section
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// must be an alnum word
|
|
if ( ! m_wids[i] ) continue;
|
|
// get its section
|
|
m_sectionPtrs[i]->m_contentHash64 ^= m_wids[i];
|
|
// fix "smooth smooth!"
|
|
if ( m_sectionPtrs[i]->m_contentHash64 == 0 )
|
|
m_sectionPtrs[i]->m_contentHash64 = 123456;
|
|
|
|
/*
|
|
// . also now set m_voteHash32 which we use for sectiondb now
|
|
// . so if the content is a date or number its hash will
|
|
// not change if the date changes. so if the date basically
|
|
// repeats on each page it won't matter! ticket prices
|
|
// mess it up too now...!!
|
|
if ( isMonth (m_wids [i] ) ) continue;
|
|
if ( is_digit(m_wptrs[i][0]) ) continue;
|
|
// is it 1 st, 2 nd, 3 rd, etc
|
|
if ( m_wlens[i]==2 ) {
|
|
if ( to_lower(m_wptrs[i][0]) =='s' &&
|
|
to_lower(m_wptrs[i][1]) =='t' )
|
|
continue;
|
|
if ( to_lower(m_wptrs[i][0]) =='n' &&
|
|
to_lower(m_wptrs[i][1]) =='d' )
|
|
continue;
|
|
if ( to_lower(m_wptrs[i][0]) =='r' &&
|
|
to_lower(m_wptrs[i][1]) =='d' )
|
|
continue;
|
|
}
|
|
// otherwise put it in
|
|
m_sectionPtrs[i]->m_voteHash32 ^= (uint32_t)m_wids[i];
|
|
// fix "smooth smooth!"
|
|
if ( m_sectionPtrs[i]->m_voteHash32 == 0 )
|
|
m_sectionPtrs[i]->m_voteHash32 = 123456;
|
|
*/
|
|
}
|
|
|
|
// reset
|
|
m_numInvalids = 0;
|
|
|
|
// now set SEC_NOTEXT flag if content hash is zero!
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Section *sn = &m_sections[i];
|
|
// skip if had text
|
|
if ( sn->m_contentHash64 ) continue;
|
|
// no text!
|
|
sn->m_flags |= SEC_NOTEXT;
|
|
// count it
|
|
m_numInvalids++;
|
|
}
|
|
|
|
//
|
|
// set m_sentenceContentHash for sentence that need it
|
|
//
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Section *sn = &m_sections[i];
|
|
// skip if not a sentence section
|
|
if ( ! ( sn->m_flags & SEC_SENTENCE ) ) continue;
|
|
// if already set, skip
|
|
//sn->m_sentenceContentHash64 = sn->m_contentHash64;
|
|
// skip?
|
|
//if ( sn->m_contentHash64 ) continue;
|
|
|
|
// no, m_contentHash64 is just words contained
|
|
// directly in the section... so since a sentence can
|
|
// contain like a bold subsection, we gotta scan the
|
|
// whole thing.
|
|
sn->m_sentenceContentHash64 = 0LL;
|
|
|
|
// scan the wids of the whole sentence, which may not
|
|
// be completely contained in the "sn" section!!
|
|
int32_t a = sn->m_senta;
|
|
int32_t b = sn->m_sentb;
|
|
for ( int32_t j = a ; j < b ; j++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// must be an alnum word
|
|
if ( ! m_wids[j] ) continue;
|
|
// get its section
|
|
sn->m_sentenceContentHash64 ^= m_wids[j];
|
|
// fix "smooth smooth!"
|
|
if ( sn->m_sentenceContentHash64 == 0 )
|
|
sn->m_sentenceContentHash64 = 123456;
|
|
}
|
|
}
|
|
|
|
|
|
////////
|
|
//
|
|
// set Section::m_indirectSentHash64
|
|
//
|
|
////////
|
|
for ( Section *sn = m_firstSent ; sn ; sn = sn->m_nextSent ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// sanity check
|
|
int64_t sc64 = sn->m_sentenceContentHash64;
|
|
if ( ! sc64 ) { char *xx=NULL;*xx=0; }
|
|
// propagate it upwards
|
|
Section *p = sn;
|
|
// TODO: because we use XOR for speed we might end up with
|
|
// a 0 if two sentence are repeated, they cancel out..
|
|
for ( ; p ; p = p->m_parent )
|
|
p->m_indirectSentHash64 ^= sc64;
|
|
}
|
|
|
|
/////
|
|
//
|
|
// set SEC_HASHXPATH
|
|
//
|
|
/////
|
|
for ( Section *sn = m_firstSent ; sn ; sn = sn->m_nextSent ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// sanity check
|
|
int64_t sc64 = sn->m_sentenceContentHash64;
|
|
if ( ! sc64 ) { char *xx=NULL;*xx=0; }
|
|
// propagate it upwards
|
|
Section *p = sn->m_parent;
|
|
// parent of sentence always gets it i guess
|
|
uint64_t lastVal = 0x7fffffffffffffffLL;
|
|
// TODO: because we use XOR for speed we might end up with
|
|
// a 0 if two sentence are repeated, they cancel out..
|
|
for ( ; p ; p = p->m_parent ) {
|
|
// how can this be a text node?
|
|
if ( p->m_tagId == TAG_TEXTNODE ) continue;
|
|
// if parent's hash is same as its kid then do not
|
|
// hash it separately in order to save index space
|
|
// from adding gbxpathsitehash1234567 terms
|
|
if ( p->m_indirectSentHash64 == lastVal ) continue;
|
|
// update this for deduping
|
|
lastVal = p->m_indirectSentHash64;
|
|
// this parent should be hashed with gbxpathsitehash123
|
|
p->m_flags |= SEC_HASHXPATH;
|
|
}
|
|
}
|
|
|
|
//
|
|
// set Section::m_alnumPosA/m_alnumPosB
|
|
//
|
|
int32_t alnumCount2 = 0;
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Section *sn = &m_sections[i];
|
|
// skip if had text
|
|
if ( ! ( sn->m_flags & SEC_SENTENCE ) ) continue;
|
|
// save this
|
|
sn->m_alnumPosA = alnumCount2;
|
|
// scan the wids of the whole sentence, which may not
|
|
// be completely contained in the "sn" section!!
|
|
int32_t a = sn->m_senta;
|
|
int32_t b = sn->m_sentb;
|
|
for ( int32_t j = a ; j < b ; j++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// must be an alnum word
|
|
if ( ! m_wids[j] ) continue;
|
|
// alnumcount
|
|
alnumCount2++;
|
|
}
|
|
// so we contain the range [a,b), typical half-open interval
|
|
sn->m_alnumPosB = alnumCount2;
|
|
// sanity check
|
|
if ( sn->m_alnumPosA == sn->m_alnumPosB ){char *xx=NULL;*xx=0;}
|
|
|
|
// propagate through parents
|
|
Section *si = sn->m_parent;
|
|
// do each parent as well
|
|
for ( ; si ; si = si->m_parent ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if already had one!
|
|
if ( si->m_alnumPosA > 0 ) break;
|
|
// otherwise, we are it
|
|
si->m_alnumPosA = sn->m_alnumPosA;
|
|
}
|
|
|
|
}
|
|
// propagate up alnumPosB now
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Section *sn = &m_sections[i];
|
|
// skip if had text
|
|
if ( ! ( sn->m_flags & SEC_SENTENCE ) ) continue;
|
|
// propagate through parents
|
|
Section *si = sn->m_parent;
|
|
// do each parent as well
|
|
for ( ; si ; si = si->m_parent ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if already had one! no, because we need to
|
|
// get the MAX of all of our kids!!
|
|
//if ( si->m_alnumPosB > 0 ) break;
|
|
// otherwise, we are it
|
|
si->m_alnumPosB = sn->m_alnumPosB;
|
|
}
|
|
}
|
|
m_alnumPosValid = true;
|
|
|
|
|
|
/////////////////////////////
|
|
//
|
|
// set Section::m_rowNum and m_colNum for sections in table
|
|
//
|
|
// (we use this in Dates.cpp to set Date::m_tableRow/ColHeaderSec
|
|
// for detecting certain col/row headers like "buy tickets" that
|
|
// we use to invalidate dates as event dates)
|
|
//
|
|
/////////////////////////////
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Section *si = &m_sections[i];
|
|
// need table tag
|
|
if ( si->m_tagId != TAG_TABLE ) continue;
|
|
// set all the headCol/RowSection ptrs rownum, colnum, etc.
|
|
// just for this table.
|
|
setTableRowsAndCols ( si );
|
|
}
|
|
|
|
// now NULLify either all headRow or all headColSections for this
|
|
// table. dmjuice.com actually uses headRows and not columns. so
|
|
// we need to detect if the table header is the first row or the first
|
|
// column.
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Section *si = &m_sections[i];
|
|
// need table tag
|
|
if ( si->m_tagId != TAG_TABLE ) continue;
|
|
// ok, now set table header bits for that table
|
|
setTableHeaderBits ( si );
|
|
}
|
|
|
|
// propagate m_headCol/RowSection ptr to all kids in the td cell
|
|
|
|
|
|
|
|
// . "ot" = occurrence table
|
|
// . we use this to set Section::m_occNum and m_numOccurences
|
|
if ( ! m_ot.set (4,8,5000,NULL, 0 , false ,m_niceness,"sect-occrnc") )
|
|
return true;
|
|
|
|
// this tells us if the section had text or not
|
|
//if ( ! m_cht2.set(4,0,5000,NULL, 0 , false ,m_niceness,"sect-cht2") )
|
|
// return true;
|
|
|
|
|
|
// set the m_ot hash table for use below
|
|
for ( int32_t i = 1 ; i < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Section *sn = &m_sections[i];
|
|
// assume no vote
|
|
uint64_t vote = 0;
|
|
// try to get it
|
|
uint64_t *vp = (uint64_t *)m_ot.getValue ( &sn->m_tagHash );
|
|
// assume these are zeroes
|
|
int32_t occNum = 0;
|
|
// if there, set these to something
|
|
if ( vp ) {
|
|
// set it
|
|
vote = *vp;
|
|
// what section # are we for this tag hash?
|
|
occNum = vote & 0xffffffff;
|
|
// save our kid #
|
|
sn->m_occNum = occNum;
|
|
// get ptr to last section to have this tagHash
|
|
//sn->m_prevSibling = (Section *)(vote>>32);
|
|
}
|
|
// mask our prevSibling
|
|
vote &= 0x00000000ffffffff;
|
|
// we are the new prevSiblinkg now
|
|
vote |= ((uint64_t)((uint32_t)i))<<32; // rplcd sn w/ i
|
|
// inc occNum for the next guy
|
|
vote++;
|
|
// store back. return true with g_errno set on error
|
|
if ( ! m_ot.addKey ( &sn->m_tagHash , &vote ) ) return true;
|
|
|
|
// use the secondary content hash which will be non-zero
|
|
// if the section indirectly contains some text, i.e.
|
|
// contains a subsection which directly contains text
|
|
//int32_t ch = sn->m_contentHash;
|
|
// skip if no content
|
|
//if ( ! ch ) continue;
|
|
// use this as the "modified taghash" now
|
|
//int32_t modified = sn->m_tagHash ^ ch;
|
|
// add the content hash to this table as well!
|
|
//if ( ! m_cht2.addKey ( &modified ) ) return true;
|
|
}
|
|
|
|
// . you are invalid in these sections
|
|
// . google seems to index SEC_MARQUEE so i took that out of here
|
|
//sec_t badFlags = SEC_SELECT|SEC_SCRIPT|SEC_STYLE;
|
|
|
|
// . now we define SEC_UNIQUE using the tagHash and "ot"
|
|
// . i think this is better than using m_tagHash
|
|
// . basically you are a unique section if you have no siblings
|
|
// in your container
|
|
|
|
// . set Section::m_numOccurences
|
|
// . the first section is the god section and has a 0 for tagHash
|
|
// so skip that!
|
|
for ( int32_t i = 1 ; i < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Section *sn = &m_sections[i];
|
|
// get it
|
|
uint64_t vote ;
|
|
// assign it
|
|
uint64_t *slot = (uint64_t *)m_ot.getValue ( &sn->m_tagHash );
|
|
// wtf? i've seen this happen once in a blue moon
|
|
if ( ! slot ) {
|
|
log("build: m_ot slot is NULL! wtf?");
|
|
continue;
|
|
}
|
|
// otherwise, use it
|
|
vote = *slot;
|
|
// get slot for it
|
|
int32_t numKids = vote & 0xffffffff;
|
|
// must be at least 1 i guess
|
|
if ( numKids < 1 && i > 0 ) { char *xx=NULL;*xx=0; }
|
|
// how many siblings do we have total?
|
|
sn->m_numOccurences = numKids;
|
|
// sanity check
|
|
if ( sn->m_a < 0 ) { char *xx=NULL;*xx=0; }
|
|
if ( sn->m_b > m_nw ) { char *xx=NULL;*xx=0; }
|
|
//
|
|
// !!!!!!!!!!!!!!!! HEART OF SECTIONS !!!!!!!!!!!!!!!!
|
|
//
|
|
// to be a valid section we must be the sole section
|
|
// containing at least one alnum word AND it must NOT be
|
|
// in a script/style/select/marquee tag
|
|
//if ( sn->m_exclusive>0 && !(sn->m_flags & badFlags) )
|
|
// continue;
|
|
// this means we are useless
|
|
//sn->m_flags |= SEC_NOTEXT;
|
|
// and count them
|
|
//m_numInvalids++;
|
|
}
|
|
|
|
///////////////////////////////////////
|
|
//
|
|
// now set Section::m_listContainer
|
|
//
|
|
// . a containing section is a section containing
|
|
// MULTIPLE smaller sections
|
|
// . so if a section has a containing section set its m_listContainer
|
|
// to that containing section
|
|
// . we limit this to sections that directly contain text for now
|
|
// . Events.cpp::getRegistrationTable() uses m_nextBrother so we
|
|
// need this now!!
|
|
//
|
|
///////////////////////////////////////
|
|
setNextBrotherPtrs ( true ); // setContainer = true
|
|
|
|
|
|
///////////////////////////////////////
|
|
//
|
|
// now set SEC_MENU and SEC_LINK_TEXT flags
|
|
//
|
|
///////////////////////////////////////
|
|
setMenus();
|
|
|
|
///////////////////////////////////////
|
|
//
|
|
// now set SENT_LIST flags on m_sentFlags
|
|
//
|
|
// try to capture sentences that are not menus but are a list of
|
|
// things. if the sentence itself has a list of int16_t items, or a bunch
|
|
// of commas, then also set the SEC_LIST flag on it. or if sentence
|
|
// is part of a sequence of sentences that are a list of sentences then
|
|
// set it for them as well. typically such sentences will be separated
|
|
// by a vertical space, have no periods, maybe have an <li> tag or only
|
|
// have a few words per sentence. this will help us demote search results
|
|
// that have the query terms in such a list because it is usually not
|
|
// very useful information.
|
|
//
|
|
///////////////////////////////////////
|
|
setListFlags();
|
|
|
|
|
|
//verifySections();
|
|
|
|
// don't use nsvt/osvt now
|
|
return true;
|
|
|
|
/*
|
|
// . init this hashtable here
|
|
// . osvt = old section voting table
|
|
// . this holds all the votes we got exclusively from sectiondb
|
|
// . sectiondb maps a siteHash/tagHash pair to a SectionVote
|
|
// . this table maps a tagHash to a single accumulated SectionVote
|
|
if ( ! m_osvt.set(8,sizeof(SectionVote),1000 ,NULL,0,false,m_niceness,
|
|
"osvt"))
|
|
return true;
|
|
|
|
|
|
// . init this hashtable here
|
|
// . nsvt = new section voting table
|
|
// . this holds all the votes we computed
|
|
if ( ! m_nsvt.set(8,sizeof(SectionVote),4096,NULL,0,false,m_niceness,
|
|
"nsvt"))
|
|
return true;
|
|
*/
|
|
|
|
/*
|
|
- mdw: i got rid of m_plain, so i commented this out
|
|
// put votes on SV_TEXTY section type into m_nsvt
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get the ith section
|
|
Section *sn = &m_sections[i];
|
|
// skip if not a section with its own words
|
|
if ( sn->m_flags & SEC_NOTEXT ) continue;
|
|
// . set SEC_TEXTY for "texty" sections
|
|
// . are we an article section?
|
|
float percent = //100.0 *
|
|
(float)(sn->m_plain) /
|
|
(float)(sn->m_exclusive);
|
|
// . update m_nsvt voting table now
|
|
// . this will return false with g_errno set
|
|
if ( ! addVote ( &m_nsvt ,
|
|
sn->m_tagHash ,
|
|
SV_TEXTY ,
|
|
percent ,
|
|
1.0 ) )
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
/*
|
|
// . add our SV_TAG_PAIR hash to the m_nsvt
|
|
// . every page adds this one
|
|
// . but don't screw up m_nsvt if we are deserializing from it
|
|
if ( ! sectionsVotes &&
|
|
! addVote(&m_nsvt,m_tagPairHash,SV_TAGPAIRHASH,1,1)) return true;
|
|
|
|
// . add our checksum votes as well
|
|
// . we use this for menu identification between web pages
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get the ith section
|
|
Section *sn = &m_sections[i];
|
|
// skip if not a section with its own words
|
|
if ( sn->m_flags & SEC_NOTEXT ) continue;
|
|
// combine the tag hash with the content hash #2
|
|
int32_t modified = sn->m_tagHash ^ sn->m_contentHash;
|
|
// . update m_nsvt voting table now
|
|
// . the tagHash is the content hash for this one
|
|
// . this will return false with g_errno set
|
|
if ( ! addVote ( &m_nsvt ,
|
|
//sn->m_contentHash ,
|
|
modified ,
|
|
SV_TAGCONTENTHASH ,
|
|
1.0 , // score
|
|
1.0 ,
|
|
true ) )
|
|
return true;
|
|
}
|
|
*/
|
|
/*
|
|
// if no url, must be being called from below around compare() on
|
|
// a title rec looked up in titledb that we are comparing to
|
|
if ( ! url ) return true;
|
|
|
|
// . now combine tag pair hash with the site hash
|
|
// . tag pair hash is a hash of the hashes of all the unique adjacent
|
|
// tag pairs. a great way to identify the html format of a doc
|
|
// . see XmlDoc::getTagPairHash() for that code...
|
|
// . tagPairHash is also used by DupDetector.cpp
|
|
//m_termId = hash64 ( tagPairHash , m_siteHash ) & TERMID_MASK;
|
|
|
|
// remove tagPairHash now that each SectionVote has a m_contentHash
|
|
m_termId = m_siteHash64 & TERMID_MASK;
|
|
|
|
|
|
if ( sectionsVotes ) {
|
|
// sanity check, this data is only used for when
|
|
// XmlDoc::m_skipIndexing is true and it zeroes out the content
|
|
// so we have to save our voting info since the content is
|
|
// gone. that way we can delete it from sectiondb later if
|
|
// we need to.
|
|
if ( m_numSections > 0 ) { char *xx=NULL;*xx=0; }
|
|
// . set our m_osvt table from what's left
|
|
// . this returns false and sets g_errno on error
|
|
if ( ! m_nsvt.deserialize(sectionsVotes,-1,m_niceness) )
|
|
return true;
|
|
}
|
|
|
|
// . if we are setting from a TitleRec stored in titledb, then
|
|
// do not consult datedb because it might have changed since we
|
|
// first indexed this document. instead deserialize m_osvt from
|
|
// TitleRec::ptr_sectionsData.
|
|
// . this ensures we parse exactly the same way as we did while we
|
|
// were creating this title rec
|
|
// . this ensures parsing consistency
|
|
// . basically, we m_osvt is a compressed version of the list we
|
|
// read from datedb, combined with any "votes" from the title rec
|
|
// comparisons we might have done in Sections::compare()
|
|
if ( sectionsDataValid ) { // xd && xd->ptr_sectionsData ) {
|
|
// point to it
|
|
char *p = sectionsData;
|
|
// . set our m_osvt table from what's left
|
|
// . this returns false and sets g_errno on error
|
|
if ( p && ! m_osvt.deserialize(p,-1,m_niceness) )
|
|
return true;
|
|
// now we can finalize, m_osvt is already set
|
|
return gotSectiondbList ( NULL );
|
|
}
|
|
|
|
// . for us, dates are really containers of the flags and tag hash
|
|
// . init this up here, it is re-set if we re-call getSectiondbList()
|
|
// because there were too many records in it to handle in one read
|
|
m_startKey = g_datedb.makeStartKey ( m_termId , 0xffffffff );
|
|
|
|
// reset this count
|
|
m_totalSiteVoters = 0;
|
|
|
|
// call it
|
|
return getSectiondbList ( );
|
|
*/
|
|
}
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . XmlDoc.cpp calls this separately from Sections::set because it needs
|
|
// to set Addresses and Dates class partially first so we can use them here.
|
|
bool Sections::addImpliedSections ( Addresses *aa ) {
|
|
|
|
// only call once
|
|
if ( m_addedImpliedSections ) return true;
|
|
m_addedImpliedSections = true;
|
|
|
|
// no point in going any further if we have nothing
|
|
if ( m_numSections == 0 ) return true;
|
|
|
|
// set this
|
|
//m_osvt = osvt;
|
|
|
|
//
|
|
// now do table swoggling
|
|
//
|
|
// turn off table rows and use columns if necessary.
|
|
// some tables are column based not row-based and we have to fix that.
|
|
//
|
|
//if ( ! swoggleTables ( ) ) return false;
|
|
|
|
// as part of a replacement for table swoggling which is confusing
|
|
// and didn't really work right, especially when we had both
|
|
// table header row and column, we set these on each table cell:
|
|
// SEC_HASDATEHEADERROW and SEC_HASDATEHEADERCOL
|
|
if ( ! setTableStuff ( ) ) return false;
|
|
|
|
m_aa = aa;
|
|
|
|
// int16_tcut
|
|
sec_t badFlags =SEC_MARQUEE|SEC_STYLE|SEC_SCRIPT|SEC_SELECT|
|
|
SEC_HIDDEN|SEC_NOSCRIPT;
|
|
|
|
|
|
// now we use m_numTotalPtrs, not m_numDatePtrs:
|
|
// for santafeplayhouse.org one compound date spans
|
|
// multiple sections. the first section is the DOW like
|
|
// "Friday" and the next section is the "day of months list".
|
|
// we want to add implied sections whose header is above
|
|
// the "day of months list" using the _ABOVE_DOM method,
|
|
// but it won't work unless we set these bits here properly.
|
|
// if we can't add that implied section everyone now uses
|
|
// the "Thursdays Pay for Performance" as their title because
|
|
// Thursdays is now fuzzy since we added the isEventEnding()
|
|
// algo to fix the themixtress.com which has Wednesdays in a
|
|
// band name and it should be fuzzy. there are other ways to
|
|
// fix themixtress.com, mainly ignore "Wednesdays" as a
|
|
// recurring date unless following "every", etc., but
|
|
// santafeplayhouse.org might not have had "Thursdays" in
|
|
// that event title in which case we'd have to rely on
|
|
// implied sections... so let's do this right...
|
|
|
|
// scan the dates and set a section's SEC_HAS_DOM/DOW
|
|
// and SEC_HAS_TOD for all the sections containing it either directly
|
|
// or indirectly
|
|
for ( int32_t i = 0 ; i < m_dates->m_numTotalPtrs ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Date *di = m_dates->m_totalPtrs[i];
|
|
// skip if nuked. part of a compound, or in bad section.
|
|
if ( ! di ) continue;
|
|
// must be in body
|
|
if ( ! ( di->m_flags & DF_FROM_BODY ) ) continue;
|
|
// ignore, otherwise the trumba sections get messed up
|
|
// because we use the official timestamps (with the "T" in
|
|
// them) as a month for adding implied sections!
|
|
if ( di->m_flags5 & DF5_IGNORE )
|
|
continue;
|
|
// skip if compound, list, range, telescope
|
|
if ( di->m_numPtrs != 0 ) continue;
|
|
// sanity check
|
|
if ( di->m_a < 0 ) { char *xx=NULL;*xx=0; }
|
|
// int16_tcut
|
|
datetype_t dt = di->m_hasType;
|
|
// need a DOW, DOM or TOD
|
|
sec_t flags = 0;
|
|
// get smallesst containing section
|
|
Section *sa = m_sectionPtrs [ di->m_a];
|
|
// . ignore <option> tags etc. to fix burtstikilounge.com
|
|
// . was causing "Nov 2009" to be a date brother with
|
|
// its daynums in the calendar select tag dropdown
|
|
if ( sa->m_flags & badFlags ) continue;
|
|
|
|
// are we a header?
|
|
//bool header = false;
|
|
//if ( sa->m_flags & SEC_HEADING ) header = true;
|
|
//if ( sa->m_flags & SEC_HEADING_CONTAINER) header = true;
|
|
|
|
// day of month is good
|
|
//if ( header && (dt & DT_MONTH) && (dt & DT_DAYNUM) )
|
|
// allow non-header guys in so we can pick up setence-y
|
|
// sections. the additional logic in getDelimScore() should
|
|
// cover us...
|
|
// ALLOWING just daynum messes up stubhub.com because we
|
|
// think every little number in every xml tag is a daynum!
|
|
if ( (dt & DT_DAYNUM) ) // && (dt != DT_DAYNUM) ) // && (dt & DT_MONTH) )
|
|
flags |= SEC_HAS_DOM;
|
|
if ( dt & DT_MONTH )
|
|
flags |= SEC_HAS_MONTH;
|
|
// or day of week
|
|
//if ( header && (dt & DT_DOW) )
|
|
// require it be strong so things like "Hillbilly Thursday"
|
|
// in southgatehouse.com which is a band name i think does
|
|
// not cause SEC_DATE_BOUNDARY to be set for that list of
|
|
// brothers, thereby killing its sections ability to telescope
|
|
// to its day of month.
|
|
if ( (di->m_flags & DF_HAS_STRONG_DOW) && // dt & DT_DOW)
|
|
// do not count "before 4th Saturday" for folkmads.org
|
|
// because it is not really a dow in this sense
|
|
!(di->m_flags & DF_ONGOING) )
|
|
flags |= SEC_HAS_DOW;
|
|
// to fix santafeplayhouse.org they have some headers
|
|
// that are like "Thursday Pay What You Wish Performances"
|
|
if ( dt & DT_DOW )
|
|
flags |= SEC_HAS_DOW;
|
|
// tod is good (ignore "before 11pm" for this)
|
|
if ( di->m_flags & (DF_AFTER_TOD|DF_EXACT_TOD) )
|
|
flags |= SEC_HAS_TOD;
|
|
// skip fuzzies - is DF_FUZZY set for this? in parseDates?
|
|
if ( dt == DT_DAYNUM && (di->m_flags & DF_FUZZY) )
|
|
continue;
|
|
// any date?
|
|
//flags |= SEC_HAS_DATE;
|
|
// skip if none
|
|
if ( ! flags ) continue;
|
|
// mark all sections
|
|
for ( ; sa ; sa = sa->m_parent ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// if already set, stop
|
|
if ( (sa->m_flags & flags) == flags ) break;
|
|
// set it
|
|
sa->m_flags |= flags;
|
|
}
|
|
}
|
|
|
|
// this is needed by setSentFlags()
|
|
//setNextSentPtrs();
|
|
|
|
// returns false and sets g_errno on error
|
|
if ( ! setSentFlagsPart2 ( ) ) return false;
|
|
|
|
|
|
// i would say <hr> is kinda like an <h0>, so do it first
|
|
if ( splitSectionsByTag ( TAG_HR ) > 0 ) setNextBrotherPtrs ( false );
|
|
|
|
// split by double br tags AFTER we've add all the text-based
|
|
// implied sections that we can. 0x01 = <br><br>.
|
|
// really instead of using back to back br tags they could have
|
|
// put everything into <p> tags which would imply these sections
|
|
// anyhow! this will fix sunsetpromotions which has no real
|
|
// sections except these brbr things.
|
|
//splitSections ( (char *)0x01 , (int32_t)BH_BRBR );
|
|
// HACK: back to back br tags
|
|
// . this also splits on any double space really:
|
|
// including text-free tr tags
|
|
if ( splitSectionsByTag ( TAG_BR ) > 0 ) setNextBrotherPtrs ( false );
|
|
|
|
// we now have METHOD_ABOVE_ADDR so need addr xor
|
|
setAddrXors( m_aa );
|
|
|
|
//
|
|
// now add implied sections based on dates
|
|
//
|
|
again:
|
|
int32_t added = addImpliedSections3();
|
|
// g_errno should be set when we return -1 on error
|
|
if ( added < 0 ) return true;
|
|
// if added something, try again
|
|
if ( added > 0 ) goto again;
|
|
// just in case we added more sections
|
|
setNextBrotherPtrs ( false );
|
|
|
|
// . the header tags
|
|
// . no! graffiti.org uses header tags for just making font size
|
|
// large... doing these messes up our implied sections so that
|
|
// the monthday range is excised from the event description
|
|
// . So that means we have to be damn sure to get those implied
|
|
// date based sections in for graffiti.org then! but we do need
|
|
// these header tag splitters for unm.edu which has them above
|
|
// some tables, and for abqfolkfest.org which also does, like
|
|
// for "JAM SESSIONS"
|
|
if (splitSectionsByTag ( TAG_H1 ) > 0 ) setNextBrotherPtrs ( false );
|
|
if (splitSectionsByTag ( TAG_H2 ) > 0 ) setNextBrotherPtrs ( false );
|
|
if (splitSectionsByTag ( TAG_H3 ) > 0 ) setNextBrotherPtrs ( false );
|
|
if (splitSectionsByTag ( TAG_H4 ) > 0 ) setNextBrotherPtrs ( false );
|
|
if (splitSectionsByTag ( TAG_H5 ) > 0 ) setNextBrotherPtrs ( false );
|
|
|
|
// fix pacificmedicalcenters.org
|
|
if ( splitSectionsByTag ( TAG_DT ) > 0 ) setNextBrotherPtrs ( false );
|
|
|
|
// •
|
|
char bullet[4];
|
|
bullet[0] = 226;
|
|
bullet[1] = 128;
|
|
bullet[2] = 162;
|
|
bullet[3] = 0;
|
|
splitSections ( bullet , BH_BULLET );
|
|
|
|
// just in case we added more sections
|
|
setNextBrotherPtrs ( false );
|
|
|
|
// . try adding implied sections again, might getter better
|
|
// similarities now that we have different subsections to divide
|
|
// . should fix santafeplayhouse.org
|
|
again2:
|
|
int32_t added2 = addImpliedSections3();
|
|
// g_errno should be set when we return -1 on error
|
|
if ( added2 < 0 ) return true;
|
|
// if added something, try again
|
|
if ( added2 > 0 ) goto again2;
|
|
// just in case we added more sections
|
|
setNextBrotherPtrs ( false );
|
|
|
|
|
|
|
|
// now set it again after adding implied sections
|
|
setTagHashes();
|
|
|
|
// this is used by Events.cpp Section::m_nextSent
|
|
setNextSentPtrs();
|
|
|
|
//
|
|
// redo the content hash now
|
|
//
|
|
// set checksum of implied section
|
|
for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// reset
|
|
sk->m_contentHash64 = 0;
|
|
// remove this flag
|
|
sk->m_flags &= ~SEC_NOTEXT;
|
|
}
|
|
// now set the content hash of each section
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// must be an alnum word
|
|
if ( ! m_wids[i] ) continue;
|
|
// get its section
|
|
m_sectionPtrs[i]->m_contentHash64 ^= m_wids[i];
|
|
// fix "smooth smooth!"
|
|
if ( m_sectionPtrs[i]->m_contentHash64 == 0 )
|
|
m_sectionPtrs[i]->m_contentHash64 = 123456;
|
|
}
|
|
// now set SEC_NOTEXT flag if content hash is zero!
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Section *sn = &m_sections[i];
|
|
// skip if had text
|
|
if ( sn->m_contentHash64 ) continue;
|
|
// no text!
|
|
sn->m_flags |= SEC_NOTEXT;
|
|
}
|
|
|
|
///////////////////
|
|
//
|
|
// set form tables
|
|
//
|
|
// set SENT_FORMTABLE_FIELD and SENT_FORMTABLE_VALUE on sentences
|
|
// like "Price:</td><td> $10-$20\n" or "Good For Kids:</td><td>Yes"
|
|
//
|
|
///////////////////
|
|
setFormTableBits();
|
|
|
|
// now set the stuff that depends on the voting table now that
|
|
// all our implied sections are added since XmlDoc::getVotingTable()
|
|
// adds the votes/hashes after all the implied sections are in.
|
|
|
|
/*
|
|
/////////////////////////////
|
|
//
|
|
// set SENT_ONROOTPAGE
|
|
//
|
|
/////////////////////////////
|
|
// . now use svt
|
|
// . do this before telescoping because if one date in a telescope
|
|
// does not have DF_ONROOTPAGE set then we clear that bit for the
|
|
// telescope
|
|
// . we hashed all the sections containing any tod ranges on the root
|
|
// page, assuming it to be store hours.
|
|
// . we hash the tagid with its content hash for each section as we
|
|
// telescoped up hashing in more tagids, up to 5.
|
|
// . so how many layers of the onion can we match?
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop if not there
|
|
if ( ! m_osvt ) break;
|
|
// stop if empty
|
|
if ( m_osvt->isTableEmpty() ) break;
|
|
// only works on sentences for now
|
|
if ( ! ( si->m_flags & SEC_SENTENCE ) ) continue;
|
|
// get the date root hash
|
|
uint32_t modified = getDateSectionHash ( si );
|
|
// and see if it exists in the root doc
|
|
if ( ! m_osvt->isInTable ( &modified ) ) continue;
|
|
// it does! so set date flag as being from root
|
|
si->m_sentFlags |= SENT_ONROOTPAGE;
|
|
}
|
|
*/
|
|
return true;
|
|
}
|
|
|
|
// place name indicators
|
|
static HashTableX s_pit;
|
|
static char s_pitbuf[2000];
|
|
static bool s_init9 = false;
|
|
|
|
// . fills s_pit with the 64-bit hash (hash64n) of every "place name
//   indicator" word below, e.g. "theater", "park", "cafe"
// . does nothing if the table was already built (s_init9 is true)
// . returns false if the table could not be initialized or a key could not
//   be added. s_init9 is only set once every word is in, so a failed build
//   is retried on the next call instead of leaving a partial table that
//   looks complete.
// . NOTE(review): a retry calls s_pit.set() on a partly filled table; that
//   presumably resets it first -- confirm in HashTableX
bool initPlaceIndicatorTable ( ) {
	if ( s_init9 ) return true;
	// init place name indicators
	static char *s_pindicators [] = {
		"airport",
		"airstrip",
		"auditorium",
		//"area",
		"arena",
		"arroyo",
		// wonderland ballroom thewonderlandballroom.com
		"ballroom",
		"bank",
		"banks",
		"bar",
		"base",
		"basin",
		"bay",
		"beach",
		"bluff",
		"bog",
		//"boundary",
		"branch",
		"bridge",
		"brook",
		"building",
		"bunker",
		"burro",
		"butte",
		"bookstore", // st john's college bookstore
		"enterprises", // J & M Enterprises constantcontact.com
		"llc", // why did we remove these before???
		"inc",
		"incorporated",
		"cabin",
		"camp",
		"campground",
		"campgrounds",
		// broke "Tennis on Campus" event
		//"campus",
		"canal",
		"canyon",
		"casa",
		"castle",
		"cathedral",
		"cave",
		"cemetery",
		"cellar", // the wine cellar
		"center",
		"centre",
		// channel 13 news
		//"channel",
		"chapel",
		"church",
		// bible study circle
		//"circle",
		"cliffs",
		"clinic",
		"college",
		"company",
		"complex",
		"corner",
		"cottage",
		"course",
		"courthouse",
		"courtyard",
		"cove",
		"creek",
		"dam",
		"den",
		// subplace
		//"department",
		"depot",
		"dome",
		"downs",
		//"fair", // xxx fair is more of an event name!!
		"fairgrounds",
		"fairground",
		"forum",
		"jcc",

		"playground",
		"playgrounds",
		"falls",
		"farm",
		"farms",
		"field",
		"fields",
		"flat",
		"flats",
		"forest",
		"fort",
		//"fountain",
		"garden",
		"gardens",
		//"gate",
		"glacier",
		"graveyard",
		"gulch",
		"gully",
		"hacienda",
		"hall",
		//"halls",
		"harbor",
		"harbour",
		"hatchery",
		"headquarters",
		//"heights",
		"heliport",
		// nob hill
		//"hill",
		"hillside",
		"hilton",
		"historical",
		"historic",
		"holy",
		// members home
		//"home",
		"homestead",
		"horn",
		"hospital",
		"hotel",
		"house",
		"howard",
		"inlet",
		"inn",
		"institute",
		"international",
		"isla",
		"island",
		"isle",
		"islet",
		"junction",
		"knoll",
		"lagoon",
		"laguna",
		"lake",
		"landing",
		"ledge",
		"lighthouse",
		"lodge",
		"lookout",
		"mall",
		"manor",
		"marina",
		"meadow",
		"mine",
		"mines",
		"monument",
		"motel",
		"museum",
		// subplace
		//"office",
		"outlet",
		"palace",
		"park",
		"peaks",
		"peninsula",
		"pit",
		"plains",
		"plant",
		"plantation",
		"plateau",
		"playa",
		"plaza",
		"point",
		"pointe",
		"pond",
		"pool", // swimming pool
		"port",
		"railroad",
		"ramada",
		"ranch",
		// rio rancho
		//"rancho",
		// date range
		//"range",
		"reef",
		"refuge", // was misspelled "refure" and could never match
		"preserve", // nature preserve
		"reserve",
		"reservoir",
		"residence",
		"resort",
		//"rio",
		//"river",
		//"riverside",
		//"riverview",
		//"rock",
		"sands",
		"sawmill",
		"school",
		"schoolhouse",
		"shore",
		"spa",
		"speedway",
		"spring",
		"springs",
		"stadium",
		"station",
		"strip",
		"suite",
		"suites",
		"temple",
		"terrace",
		"tower",
		//"trail",
		"travelodge",
		"triangle",
		"tunnel",
		"university",
		//"valley",
		"wall",
		"ward",
		"waterhole",
		"waters",
		"well",
		"wells",
		"wilderness",
		"windmill",
		"woodland",
		"woods",
		"gallery",
		"theater",
		"theatre",
		"cinema",
		"cinemas",
		"playhouse",
		"saloon",
		"nightclub", // guys n' dolls restaurant and night club
		"lounge",
		"ultralounge",
		"brewery",
		"chophouse",
		"tavern",
		"company",
		"rotisserie",
		"bistro",
		"parlor",
		"studio",
		"studios",
		// denver, co
		//"co",
		"bureau",
		//"estates",
		"dockyard",
		"gym",
		"synagogue",
		"shrine",
		"mosque",
		"store",
		"mercantile",
		"mart",
		"amphitheatre",
		"kitchen",
		"casino",
		"diner",
		"eatery",
		"shop",
		//"inc",
		//"incorporated",
		//"corporation",
		//"limited",
		//"llc",
		//"foundation",
		"warehouse",
		"roadhouse",
		"foods",
		"cantina",
		"steakhouse",
		"smokehouse",
		"deli",
		//"enterprises",
		//"repair",
		//"service",
		// group services
		//"services",
		//"systems",
		"salon",
		"boutique",
		"preschool",
		//"galleries",
		"bakery",
		"factory",
		//"llp",
		//"attorney",
		//"association",
		//"solutions",
		"facility",
		"cannery",
		"winery",
		"mill",
		"quarry",
		"monastery",
		"observatory",
		"nursery",
		"pagoda",
		"pier",
		"prison",
		//"post",
		"ruin",
		"ruins",
		"storehouse",
		"square",
		"tomb",
		"wharf",
		"zoo",
		"mesa",
		// five day pass
		//"pass",
		"passage",
		"peak",
		"vineyard",
		"vineyards",
		"grove",
		"space",
		"library",
		"bakery", // a b c bakery
		"school",
		"church",
		"park",
		"house",
		//"market", hurt Los Ranchos Growers' Market
		//"marketplace",
		"university",
		"center",
		"restaurant",
		"bar",
		"grill",
		"grille",
		"cafe",
		"cabana",
		"shack",
		"shoppe",
		// the correct spellings, which were missing
		"coliseum",
		"colosseum",
		// misspelled variants, kept so existing matches still work
		"collesium",
		"colliseum",
		"pavilion"
		//"club"
	};
	// . store these words into table
	// . 8 byte keys (the int64 word hash) and no data, backed by the
	//   static s_pitbuf buffer
	// . bail on failure rather than silently using an unset table
	if ( ! s_pit.set(8,0,128,s_pitbuf,2000,false,0,"ti1tab") )
		return false;
	int32_t n = (int32_t)(sizeof(s_pindicators) / sizeof(s_pindicators[0]));
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// hash the word
		char *s = s_pindicators[i];
		int64_t h = hash64n(s);
		// the list has some repeated words, those just map to
		// the same key. a false return means the add failed.
		if ( ! s_pit.addKey ( &h ) ) return false;
	}
	// only now is the table complete
	s_init9 = true;
	return true;
}
|
|
|
|
bool isPlaceIndicator ( int64_t *widp ) {
|
|
if ( ! s_init9 ) initPlaceIndicatorTable();
|
|
return s_pit.isInTable ( widp );
|
|
}
|
|
|
|
// . hash table, a 10000-byte static buffer and an "initialized" flag
// . NOTE(review): presumably the backing storage for initGenericTable()
//   below, mirroring s_pit/s_pitbuf/s_init9 -- confirm against its body
static bool       s_init3 = false;
static char       s_igtbuf[10000];
static HashTableX s_igt;
|
|
|
|
void initGenericTable ( int32_t niceness ) {
|
|
|
|
// . now we create strings of things that are somewhat generic
|
|
// and should be excluded from titles.
|
|
// . we set the GENERIC bit for each word or phrase in the sentence
|
|
// that matches one in this list of generic words/phrases
|
|
// . then we ignore the longest generic phrase that is two words or
|
|
// more, within the sentence, for the purposes of forming titles
|
|
// . so if we had "buy tickets for Spiderman" we would ignore
|
|
// "buy tickets for" and the title would just be Spiderman
|
|
// . we also set the GENERIC bit for dates and places that match
|
|
// that of the event in addition to these phrases/words
|
|
static char *s_ignore[] = {
|
|
"repeats",
|
|
"various", // this event repeats on various days
|
|
"feed",
|
|
"size", // small class size: hadcolon algo fix
|
|
"readers",
|
|
"rating",
|
|
"publish",
|
|
"category",
|
|
"special",
|
|
"guest",
|
|
"guests",
|
|
"sold", // sold out
|
|
"out",
|
|
"tba",
|
|
"promotional", // promotional partners
|
|
"partners",
|
|
"regular", // regular ticket price
|
|
"fee",
|
|
"fees",
|
|
"purchase",
|
|
"how",
|
|
"order", // how to order
|
|
"redeem",
|
|
"promo", // promo code
|
|
"code",
|
|
"genre",
|
|
"type",
|
|
"price",
|
|
"prices", // ticket prices
|
|
"priced",
|
|
"pricing", // pricing information
|
|
"bid", // minimum bid: $50,000
|
|
"bids",
|
|
"attn",
|
|
"pm",
|
|
"am",
|
|
"store", // fix store: open blah blah... for hadcolon
|
|
"default", // fix default: all about... title for had colon
|
|
"id", // event id: 1234
|
|
"buffer", // fix losing Life of Brian to Buffer:33% title
|
|
"coming",
|
|
"soon",
|
|
"deadline", // price deadline
|
|
"place", // place order
|
|
"order",
|
|
"users",
|
|
"claim",
|
|
"it",
|
|
"set", // set: 10:30pm (when they start the set i guess)
|
|
"transportation",
|
|
"organization",
|
|
"company",
|
|
"important",
|
|
"faq",
|
|
"faqs",
|
|
"instructions",
|
|
"instruction",
|
|
"advertise",
|
|
"advertisement",
|
|
"name",
|
|
"upcoming",
|
|
"attraction",
|
|
"attractions",
|
|
"events",
|
|
"news", // news & events
|
|
"posted", // posted by, posted in news
|
|
"is",
|
|
"your",
|
|
"user",
|
|
"reviews",
|
|
"most",
|
|
"recent",
|
|
"latest",
|
|
"comments",
|
|
"bookmark",
|
|
"creator",
|
|
"tags",
|
|
"close",
|
|
"closes",
|
|
"closed",
|
|
"send",
|
|
"map",
|
|
"maps",
|
|
"directions",
|
|
"driving",
|
|
"help",
|
|
"read", // read more
|
|
"more",
|
|
"every", // every other wednesday
|
|
"availability",
|
|
"schedule",
|
|
"scheduling", // scheduling a program
|
|
"program",
|
|
"other",
|
|
"log",
|
|
"sign",
|
|
"up",
|
|
"login",
|
|
"logged", // you're not logged in
|
|
"you're",
|
|
"should", // you should register
|
|
"register",
|
|
"registration", // registration fees
|
|
"back",
|
|
"change",
|
|
"write",
|
|
"save",
|
|
"add",
|
|
"share",
|
|
"forgot",
|
|
"password",
|
|
"home",
|
|
"hourly", // hourly fee
|
|
"hours",
|
|
"operation", // hours of operation
|
|
"public", // hours open to the public.. public tours
|
|
"what's",
|
|
"happening",
|
|
"menu",
|
|
"plan", // plan a meeting
|
|
"sitemap",
|
|
"advanced",
|
|
"beginner", // dance classes
|
|
"intermediate",
|
|
"basic",
|
|
"beginners",
|
|
// beginning hebrew conversation class?
|
|
// 6 Mondays beginning Feb 7th - Mar 14th...
|
|
"beginning",
|
|
|
|
"calendar", // calendar for jan 2012
|
|
"full", // full calendar
|
|
"hello", // <date> hello there guest
|
|
"level", // dance classes
|
|
"open", // open level
|
|
"go",
|
|
"homepage",
|
|
"website",
|
|
"view",
|
|
"submit",
|
|
"get",
|
|
"subscribe",
|
|
"loading",
|
|
"last",
|
|
"modified",
|
|
"updated",
|
|
|
|
"untitled", // untitled document for nonamejustfriends.com
|
|
"document",
|
|
"none", // none specified
|
|
"specified",
|
|
"age", // age suitability
|
|
"suitability",
|
|
"requirement", // age requirement
|
|
"average", // average ratings
|
|
"ratings",
|
|
"business", // business categories
|
|
"categories",
|
|
"narrow", // narrow search
|
|
"related", // related topics/search
|
|
"topics",
|
|
"searches",
|
|
"sort", // sort by
|
|
|
|
"there",
|
|
"seating", // there will be no late seating
|
|
"seats", // seats are limited
|
|
"accomodations", // find accomodations
|
|
"request", // request registration
|
|
"requests",
|
|
|
|
"are",
|
|
"maximum", // maximum capacity
|
|
"max", // maximum capacity
|
|
"capacity", // capcity 150 guests
|
|
"early",
|
|
"late", // Wed - Thurs 1am to Late Morning
|
|
"latest", // latest performance
|
|
"performance",
|
|
"performances", // performances: fridays & saturdays at 8 PM..
|
|
"rsvp", // rsvp for free tickets
|
|
"larger", // view larger map
|
|
|
|
"special",
|
|
"guest",
|
|
"guests",
|
|
"venue",
|
|
"additional",
|
|
"general",
|
|
"admission",
|
|
"information",
|
|
"info",
|
|
"what",
|
|
"who",
|
|
"where",
|
|
"when",
|
|
"tickets",
|
|
"ticket",
|
|
"tix", // buy tix
|
|
"tuition",
|
|
"tuitions",
|
|
"donate",
|
|
"donation", // $4 donation - drivechicago.com
|
|
"donations",
|
|
"booking",
|
|
"now",
|
|
"vip",
|
|
"student",
|
|
"students",
|
|
"senior",
|
|
"seniors",
|
|
"sr",
|
|
"mil",
|
|
"military",
|
|
"adult",
|
|
"adults",
|
|
"teen",
|
|
"teens",
|
|
"tween",
|
|
"tweens",
|
|
"elementary",
|
|
"preschool",
|
|
"preschl",
|
|
"toddler",
|
|
"toddlers",
|
|
"tdlr",
|
|
"family",
|
|
"families",
|
|
"children",
|
|
"youth",
|
|
"youngsters",
|
|
"kids",
|
|
"kid",
|
|
"child",
|
|
|
|
"find", // find us
|
|
"next",
|
|
"prev",
|
|
"previous",
|
|
"series", // next series
|
|
|
|
"round", // open year round
|
|
|
|
// weather reports!
|
|
"currently", // currently: cloudy (hadcolon fix)
|
|
"humidity",
|
|
"light",
|
|
"rain",
|
|
"rainy",
|
|
"snow",
|
|
"cloudy",
|
|
"sunny",
|
|
"fair",
|
|
"windy",
|
|
"mostly", // mostly cloudy
|
|
"partly", // partly cloudy
|
|
"fahrenheit",
|
|
"celsius",
|
|
// "Wind: N at 7 mph"
|
|
"wind",
|
|
"mph",
|
|
"n",
|
|
"s",
|
|
"e",
|
|
"w",
|
|
"ne",
|
|
"nw",
|
|
"se",
|
|
"sw",
|
|
|
|
// lottery title: %16.0 million
|
|
"million",
|
|
|
|
// 6th street
|
|
"1st",
|
|
"2nd",
|
|
"3rd",
|
|
"4th",
|
|
"5th",
|
|
"6th",
|
|
"7th",
|
|
"8th",
|
|
"9th",
|
|
|
|
"thanksgiving",
|
|
"year",
|
|
"yearly",
|
|
"year's",
|
|
"day",
|
|
"night",
|
|
"nightly",
|
|
"daily",
|
|
"christmas",
|
|
|
|
"fall",
|
|
"autumn",
|
|
"winter",
|
|
"spring",
|
|
"summer",
|
|
"season", // 2011 sesason schedule of events
|
|
|
|
"session", // fall session
|
|
"sessions", // fall sessions
|
|
|
|
"jan",
|
|
"feb",
|
|
"mar",
|
|
"apr",
|
|
"may",
|
|
"jun",
|
|
"jul",
|
|
"aug",
|
|
"sep",
|
|
"oct",
|
|
"nov",
|
|
"dec",
|
|
"january",
|
|
"february",
|
|
"march",
|
|
"april",
|
|
"may",
|
|
"june",
|
|
"july",
|
|
"august",
|
|
"september",
|
|
"october",
|
|
"novemeber",
|
|
"december",
|
|
"gmt",
|
|
"utc",
|
|
"cst",
|
|
"cdt",
|
|
"est",
|
|
"edt",
|
|
"pst",
|
|
"pdt",
|
|
"mst",
|
|
"mdt",
|
|
|
|
"morning",
|
|
"evening",
|
|
"afternoon",
|
|
"mornings",
|
|
"evenings",
|
|
"afternoons",
|
|
|
|
"monday",
|
|
"tuesday",
|
|
"wednesday",
|
|
"thursday",
|
|
"friday",
|
|
"saturday",
|
|
"sunday",
|
|
|
|
"mondays",
|
|
"tuesdays",
|
|
"wednesdays",
|
|
"thursdays",
|
|
//"fridays", // hurts "5,000 Fridays" title for when.com
|
|
"saturdays",
|
|
"sundays", // all sundays
|
|
|
|
"mwth",
|
|
"mw",
|
|
"mtw",
|
|
"tuf",
|
|
"thf",
|
|
|
|
"tomorrow",
|
|
"tomorrow's",
|
|
"today",
|
|
"today's",
|
|
|
|
// we should probably treat these times like dusk/dawn
|
|
"noon",
|
|
"midday",
|
|
"midnight",
|
|
"sunset",
|
|
"sundown",
|
|
"dusk",
|
|
"sunrise",
|
|
"dawn",
|
|
"sunup",
|
|
|
|
"m",
|
|
"mo",
|
|
"mon",
|
|
"tu",
|
|
"tue",
|
|
"tues",
|
|
"wed",
|
|
"weds",
|
|
"wednes",
|
|
"th",
|
|
"thu",
|
|
"thur",
|
|
"thr",
|
|
"thurs",
|
|
"f",
|
|
"fr",
|
|
"fri",
|
|
"sa",
|
|
"sat",
|
|
"su",
|
|
"sun",
|
|
|
|
"discount",
|
|
"support",
|
|
"featuring",
|
|
"featured", // featured events
|
|
"features", // more features
|
|
"featuring",
|
|
"presents",
|
|
"presented",
|
|
"presenting",
|
|
"miscellaneous",
|
|
|
|
"usa",
|
|
"relevancy",
|
|
"date",
|
|
"time", // time:
|
|
"showtime",
|
|
"showtimes",
|
|
"distance",
|
|
"pacific", // pacific time
|
|
"eastern",
|
|
"central",
|
|
"mountain",
|
|
"cost", // cost:
|
|
"per", // $50 per ticket
|
|
"description",
|
|
"buy",
|
|
"become", // become a sponsor
|
|
"twitter",
|
|
"hashtag",
|
|
"digg",
|
|
"facebook",
|
|
"like",
|
|
"you",
|
|
"fan", // become a facebook fan
|
|
"media", // media sponsor
|
|
"charity",
|
|
"target", // Target:
|
|
"now", // buy tickets now
|
|
|
|
|
|
"reminder", // event remind
|
|
"sponsors", // event sponsors
|
|
"sponsor",
|
|
"questions", // questions or comments
|
|
"question",
|
|
"comment", // question/comment
|
|
"message",
|
|
"wall", // "comment wall"
|
|
"board", // "comment board" message board
|
|
"other", // other events
|
|
"ongoing", // other ongoing events
|
|
"recurring",
|
|
"repeating", // more repeating events
|
|
"need", // need more information
|
|
"quick", // quick links
|
|
"links", // quick links
|
|
"link",
|
|
"calendar", // calendar of events
|
|
"class", // class calendar
|
|
"classes", // events & classes
|
|
"schedule", // class schedule
|
|
"activity", // activity calendar
|
|
"typically",
|
|
"usually",
|
|
"normally",
|
|
"some", // some saturdays
|
|
"first",
|
|
"second",
|
|
"third",
|
|
"fourth",
|
|
"fifth",
|
|
|
|
"city", // "city of albuquerque title
|
|
|
|
"instructors", // 9/29 instructors:
|
|
"instructor",
|
|
"advisor", // hadcolon algo fix: Parent+Advisor:+Mrs.+Boey
|
|
"advisors",
|
|
"adviser",
|
|
"advisers",
|
|
"caller", // square dance field
|
|
"callers",
|
|
"dj", // might be dj:
|
|
"browse", // browse nearby
|
|
"nearby",
|
|
"restaurants", // nearby restaurants
|
|
"restaurant",
|
|
"bar",
|
|
"bars",
|
|
|
|
// why did i take these out??? leave them in the list, i
|
|
// think these events are too generic. maybe because sometimes
|
|
// that is the correct title and we shouldn't punish the
|
|
// title score, but rather just filter the entire event
|
|
// because of that...?
|
|
//"dinner", // fix "6:30 to 8PM: Dinner" title for la-bike.org
|
|
//"lunch",
|
|
//"breakfast",
|
|
|
|
"served",
|
|
"serving",
|
|
"serves",
|
|
"notes",
|
|
"note",
|
|
"announcement",
|
|
"announcing",
|
|
"first", // first to review
|
|
"things", // things you must see
|
|
"must",
|
|
"see",
|
|
"discover", // zvents: discover things to do
|
|
"do",
|
|
"touring", // touring region
|
|
"region",
|
|
"food",
|
|
"counties",
|
|
"tours",
|
|
"tour",
|
|
"tell", // tell a friend
|
|
"friend", // tell a friend
|
|
"about",
|
|
"this",
|
|
"occurs", // this event occurs weekly
|
|
"weekly",
|
|
|
|
"person",
|
|
"group",
|
|
"groups", // groups (15 or more)
|
|
"our", // our story (about us)
|
|
"story", // our story
|
|
"special", // special offers
|
|
"offers",
|
|
//"bars", // nearby bars
|
|
"people", // people who viewed this also viewed
|
|
"also",
|
|
"viewed",
|
|
"additional", // additional service
|
|
"real", // real reviews
|
|
"united",
|
|
"states",
|
|
"over", // 21 and over only
|
|
"advance", // $12 in adance / $4 at the door
|
|
"list", // event list
|
|
"mi",
|
|
"miles",
|
|
"km",
|
|
"kilometers",
|
|
"yes",
|
|
"no",
|
|
"false", // <visible>false</visible>
|
|
"true",
|
|
|
|
"usd", // currency units (eventbrite)
|
|
"gbp", // currency units (eventbrite)
|
|
|
|
"st", // sunday july 31 st
|
|
"chat", // chat live
|
|
"live",
|
|
"have",
|
|
"feedback",
|
|
"dining", // dining and nightlife
|
|
"nightlife",
|
|
"yet", // no comments yet
|
|
"welcome", // stmargarets.com gets WELCOME as title
|
|
"cancellation",
|
|
"cancellations",
|
|
"review",
|
|
"preview",
|
|
"previews",
|
|
"overview",
|
|
"overviews",
|
|
"gallery", // gallery concerts
|
|
"premium",
|
|
"listing",
|
|
"listings",
|
|
"press",
|
|
"releases", // press releases
|
|
"release", // press release
|
|
"opening",
|
|
"openings",
|
|
"vip",
|
|
"video",
|
|
"audio",
|
|
"radio",
|
|
"yelp",
|
|
"yahoo",
|
|
"google",
|
|
"mapquest",
|
|
"quest", // map quest
|
|
"microsoft",
|
|
"eventbrite",
|
|
"zvents",
|
|
"zipscene",
|
|
"eventful",
|
|
"com", // .com
|
|
"org", // .org
|
|
"areas", // areas covered
|
|
"covered",
|
|
"cover", // cover charge
|
|
"charge",
|
|
"detail",
|
|
"details",
|
|
"phone",
|
|
"tel",
|
|
"telephone",
|
|
"voice",
|
|
"data",
|
|
"ph",
|
|
"tty",
|
|
"tdd",
|
|
"fax",
|
|
"email",
|
|
"e",
|
|
"sale", // on sale now
|
|
"sales", // Sales 7:30 pm Fridays
|
|
"club", // club members
|
|
"join",
|
|
"please", // please join us at our monthly meetings
|
|
"official", // official site link
|
|
"site",
|
|
"blog",
|
|
"blogs", // blogs & sites
|
|
"sites",
|
|
"mail",
|
|
"mailing", // mailing address
|
|
"postal",
|
|
"statewide", // preceeds phone # in unm.edu
|
|
"toll",
|
|
"tollfree",
|
|
"call",
|
|
"number",
|
|
"parking",
|
|
"limited", // limited parking available
|
|
"available", // limited parking available
|
|
"accepts", // accepts credit cards
|
|
"accept",
|
|
"visa", // we accept visa and mastercard
|
|
"mastercard",
|
|
//"jump", // jump to navigation
|
|
"credit",
|
|
"method",
|
|
"methods", // methods of payment
|
|
"payment",
|
|
"payments",
|
|
"cards",
|
|
"zip", // POB 4321 zip 87197
|
|
"admin", // 1201 third nw admin
|
|
"meetings",
|
|
"meeting",
|
|
"meetup",
|
|
"meets",
|
|
"meet",
|
|
"future", // other future dates & times (zvents)
|
|
"dates",
|
|
"times",
|
|
"range", // price range (yelp.com blackbird buvette)
|
|
"write" , // write a review
|
|
"a",
|
|
"performers", // performers at this event
|
|
"band",
|
|
"bands",
|
|
"concert",
|
|
"concerts",
|
|
"hide" , // menu cruft from zvents
|
|
"usa", // address junk
|
|
"musicians", // category
|
|
"musician", // category
|
|
"current", // current weather
|
|
"weather",
|
|
"forecast", // 7-day forecast
|
|
"contact",
|
|
"us",
|
|
"member",
|
|
"members",
|
|
"get", // get involved
|
|
"involved",
|
|
"press", // press room
|
|
"room",
|
|
"back", // back a page
|
|
"page",
|
|
"take", // take me to the top
|
|
"me",
|
|
"top", // top of page
|
|
"print", // print friendly page
|
|
"friendly",
|
|
"description",
|
|
"location",
|
|
"locations",
|
|
"street",
|
|
"address",
|
|
"neighborhood",
|
|
"neighborhoods",
|
|
"guide", // albuquerque guide for bird walk
|
|
"ticketing",
|
|
"software", // ticketing software
|
|
"download",// software download
|
|
"search",
|
|
"results",
|
|
"navigation", // main navigation
|
|
"breadcrumb", // breadcrumb navigation
|
|
"main",
|
|
"skip", // skip to main content
|
|
"content",
|
|
"start",
|
|
"starts", // starts 9/17/2011
|
|
"starting", // starting in 2010
|
|
"ends",
|
|
"end",
|
|
"ending",
|
|
"begin",
|
|
"begins" // concert begins at 8
|
|
"beginning",
|
|
"promptly", // starts promptly
|
|
"will", // will begin
|
|
"visit", // visit our website
|
|
"visitors",
|
|
"visitor",
|
|
"visiting", // visiting hours
|
|
"come", // come visit
|
|
"check", // check us out
|
|
"site",
|
|
"website",
|
|
"select", // select saturdays
|
|
"begin",
|
|
"ends", // ends 9/17/2011
|
|
"multiple", // multiple dates
|
|
"hottest", // hottest upcoming event
|
|
"cancel",
|
|
"displaying",
|
|
"ordering", // ordering info from stjohnscollege
|
|
"edit", // edit business info from yelp
|
|
"of",
|
|
"the",
|
|
"and",
|
|
"at",
|
|
"to",
|
|
"be",
|
|
"or",
|
|
"not",
|
|
"in",
|
|
"on",
|
|
"only", // saturday and sunday only
|
|
"winter", // winter hours: oct 15 - march 15 (unm.edu)
|
|
"summer", // summer hours
|
|
"spring",
|
|
"fall",
|
|
|
|
"by",
|
|
"under",
|
|
"for",
|
|
"from",
|
|
"click",
|
|
"here",
|
|
"new",
|
|
"free",
|
|
"title",
|
|
"event",
|
|
"tbd", // title tbd
|
|
"adv",
|
|
"dos",
|
|
"day",
|
|
"days", // this event repeats on various days
|
|
"week", // day(s) of the week
|
|
"weekend",
|
|
"weekends",
|
|
"two", // two shows
|
|
"runs", // show runs blah to blah (woodencowgallery.com)
|
|
"show",
|
|
"shows",
|
|
"door",
|
|
"doors",
|
|
"gate", // gates open at 8
|
|
"gates",
|
|
"all",
|
|
"ages",
|
|
"admitted", // all ages admitted until 5pm (groundkontrol.com)
|
|
"admittance",
|
|
"until",
|
|
"rights", // all rights reserved
|
|
"reserved", // all rights reserved
|
|
"reservations",
|
|
"reserve",
|
|
"permit", // special event permit application
|
|
"application",
|
|
"shipping", // shipping policy for tickets
|
|
"policy",
|
|
"policies",
|
|
"package", // package includes
|
|
"packages",
|
|
"includes",
|
|
"include",
|
|
"including",
|
|
"tweet",
|
|
"print", // print entire month of events
|
|
"entire",
|
|
"month",
|
|
"monthly",
|
|
"21",
|
|
"21+",
|
|
"both",
|
|
"nights",
|
|
"box",
|
|
"office",
|
|
"this",
|
|
"week",
|
|
"tonight",
|
|
"today",
|
|
"http",
|
|
"https",
|
|
"open",
|
|
"opens"
|
|
};
|
|
|
|
s_init3 = true;
|
|
s_igt.set(8,0,512,s_igtbuf,10000,false,niceness,"igt-tab");
|
|
int32_t n = (int32_t)sizeof(s_ignore)/ sizeof(char *);
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
char *w = s_ignore[i];
|
|
int32_t wlen = gbstrlen ( w );
|
|
int64_t h = hash64Lower_utf8 ( w , wlen );
|
|
if ( ! s_igt.addKey (&h) ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
}
|
|
|
|
// so Dates.cpp DF_FUZZY algorithm can see if the DT_YEAR date is in
|
|
// a mixed case and period-ending sentence, in which case it will consider
|
|
// it to be fuzzy since it is not header material
|
|
bool Sections::setSentFlagsPart1 ( ) {
|
|
|
|
// int16_tcut
|
|
wbit_t *bits = m_bits->m_bits;
|
|
|
|
static int64_t h_i;
|
|
static int64_t h_com;
|
|
static int64_t h_org;
|
|
static int64_t h_net;
|
|
static int64_t h_pg;
|
|
static int64_t h_pg13;
|
|
static bool s_init38 = false;
|
|
if ( ! s_init38 ) {
|
|
s_init38 = true;
|
|
h_i = hash64n("i");
|
|
h_com = hash64n("com");
|
|
h_org = hash64n("org");
|
|
h_net = hash64n("net");
|
|
h_pg = hash64n("pg");
|
|
h_pg13 = hash64n("pg13");
|
|
}
|
|
|
|
// . score each section that directly contains text.
|
|
// . have a score for title and score for description
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// now we require the sentence
|
|
if ( ! ( si->m_flags & SEC_SENTENCE ) ) continue;
|
|
///////////////////
|
|
//
|
|
// SENT_MIXED_CASE
|
|
//
|
|
///////////////////
|
|
si->m_sentFlags |= getMixedCaseFlags ( m_words ,
|
|
bits ,
|
|
si->m_senta ,
|
|
si->m_sentb ,
|
|
m_niceness );
|
|
|
|
|
|
bool firstWord = true;
|
|
int32_t lowerCount = 0;
|
|
for ( int32_t i = si->m_senta ; i < si->m_sentb ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not alnum
|
|
if ( ! m_wids[i] ) continue;
|
|
// are we a stop word?
|
|
bool isStopWord = m_words->isStopWord(i);
|
|
// .com is stop word
|
|
if ( m_wids[i] == h_com ||
|
|
m_wids[i] == h_org ||
|
|
m_wids[i] == h_net ||
|
|
// fixes mr. movie times "PG-13"
|
|
m_wids[i] == h_pg ||
|
|
m_wids[i] == h_pg13 )
|
|
isStopWord = true;
|
|
// are we upper case?
|
|
bool upper = is_upper_utf8(m_wptrs[i]) ;
|
|
// . are we all upper case?
|
|
// . is every single letter upper case?
|
|
// . this is a good tie break sometimes like for
|
|
// the santafeplayhouse.org A TUNA CHRISTMAS
|
|
//if ( megaCaps ) {
|
|
// if ( ! upper ) megaCaps = false;
|
|
// allow if hyphen preceedes like for
|
|
// abqfolkfest.org's "Kay-lee"
|
|
if ( i>0 && m_wptrs[i][-1]=='-' ) upper = true;
|
|
// if we got mixed case, note that!
|
|
if ( m_wids[i] &&
|
|
/*
|
|
( m_wids[i] == h_of ||
|
|
m_wids[i] == h_the ||
|
|
m_wids[i] == h_at ||
|
|
m_wids[i] == h_to ||
|
|
m_wids[i] == h_be ||
|
|
m_wids[i] == h_or ||
|
|
m_wids[i] == h_not ||
|
|
m_wids[i] == h_in ||
|
|
m_wids[i] == h_on ||
|
|
m_wids[i] == h_for ||
|
|
m_wids[i] == h_from ||
|
|
*/
|
|
//! ww->isStopWord(i) &&
|
|
! is_digit(m_wptrs[i][0]) &&
|
|
! upper &&
|
|
(! isStopWord || firstWord ) &&
|
|
// . November 4<sup>th</sup> for facebook.com
|
|
// . added "firstword" for "on AmericanTowns.com"
|
|
// title prevention for americantowns.com
|
|
(m_wlens[i] >= 3 || firstWord) )
|
|
lowerCount++;
|
|
// no longer first word in sentence
|
|
firstWord = false;
|
|
}
|
|
// does it end in period? slight penalty for that since
|
|
// the ideal event title will not.
|
|
// fixes events.kgoradio.com which was selecting the
|
|
// first sentence in the description and not the performers
|
|
// name for "Ragnar Bohlin" and "Malin Christennsson" whose
|
|
// first sentence was for the most part properly capitalized
|
|
// just by sheer luck because it used proper nouns and was
|
|
// int16_t.
|
|
bool endsInPeriod = false;
|
|
char *p = NULL;
|
|
//if ( si->m_b < m_nw ) p = m_wptrs[si->m_b];
|
|
int32_t lastPunct = si->m_sentb;
|
|
// skip over tags to fix nonamejustfriends.com sentence
|
|
for ( ; lastPunct < m_nw && m_tids[lastPunct] ; lastPunct++);
|
|
// now assume, possibly incorrectly, that it is punct
|
|
if ( lastPunct < m_nw ) p = m_wptrs[lastPunct];
|
|
// scan properly to
|
|
char *send = p + m_wlens[lastPunct];
|
|
char *s = p;
|
|
for ( ; s && s < send ; s++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// check. might have ". or ).
|
|
if ( *s == '.' ||
|
|
*s == ';' ||
|
|
*s == '?' ||
|
|
*s == '!' ) {
|
|
// do not count ellipsis for this though
|
|
// to fix eventbrite.com
|
|
// "NYC iPhone Boot Camp: ..."
|
|
if ( s[1] != '.' ) endsInPeriod = true;
|
|
break;
|
|
}
|
|
}
|
|
//if ( p && p[0] == '.' ) endsInPeriod = true;
|
|
//if ( p && p[0] == '?' ) endsInPeriod = true;
|
|
//if ( p && p[0] == '!' ) endsInPeriod = true;
|
|
//if ( p && p[1] == '.' ) endsInPeriod = false; // ellipsis
|
|
if ( isAbbr(m_wids[si->m_b-1]) && m_wlens[si->m_b-1]>1 )
|
|
endsInPeriod = false;
|
|
if ( m_wlens[si->m_b-1] <= 1 &&
|
|
// fix "world war I"
|
|
m_wids[si->m_b-1] != h_i )
|
|
endsInPeriod = false;
|
|
if ( endsInPeriod ) {
|
|
si->m_sentFlags |= SENT_PERIOD_ENDS;
|
|
// double punish if also has a lower case word
|
|
// that should not be lower case in a title
|
|
if ( lowerCount > 0 )
|
|
si->m_sentFlags |= SENT_PERIOD_ENDS_HARD;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// returns false and sets g_errno on error
|
|
bool Sections::setSentFlagsPart2 ( ) {
|
|
|
|
static bool s_init2 = false;
|
|
static int64_t h_close ;
|
|
static int64_t h_send ;
|
|
static int64_t h_map ;
|
|
static int64_t h_maps ;
|
|
static int64_t h_directions ;
|
|
static int64_t h_driving ;
|
|
static int64_t h_help ;
|
|
static int64_t h_more ;
|
|
static int64_t h_log ;
|
|
static int64_t h_sign ;
|
|
static int64_t h_login ;
|
|
static int64_t h_back ;
|
|
static int64_t h_change ;
|
|
static int64_t h_write ;
|
|
static int64_t h_save ;
|
|
static int64_t h_share ;
|
|
static int64_t h_forgot ;
|
|
static int64_t h_home ;
|
|
static int64_t h_hours;
|
|
static int64_t h_sitemap ;
|
|
static int64_t h_advanced ;
|
|
static int64_t h_go ;
|
|
static int64_t h_website ;
|
|
static int64_t h_view;
|
|
static int64_t h_add;
|
|
static int64_t h_submit;
|
|
static int64_t h_get;
|
|
static int64_t h_subscribe;
|
|
static int64_t h_loading;
|
|
static int64_t h_last;
|
|
static int64_t h_modified;
|
|
static int64_t h_updated;
|
|
static int64_t h_special;
|
|
static int64_t h_guest ;
|
|
static int64_t h_guests ;
|
|
static int64_t h_directed;
|
|
static int64_t h_venue ;
|
|
static int64_t h_instructor ;
|
|
static int64_t h_general;
|
|
static int64_t h_information;
|
|
static int64_t h_info ;
|
|
static int64_t h_i ;
|
|
static int64_t h_what ;
|
|
static int64_t h_who ;
|
|
static int64_t h_tickets;
|
|
static int64_t h_support ;
|
|
static int64_t h_featuring;
|
|
static int64_t h_presents;
|
|
static int64_t h_phone;
|
|
static int64_t h_usa;
|
|
static int64_t h_relevancy; // sort by option
|
|
static int64_t h_buy;
|
|
static int64_t h_where;
|
|
static int64_t h_when;
|
|
static int64_t h_contact;
|
|
static int64_t h_description;
|
|
static int64_t h_location;
|
|
static int64_t h_located;
|
|
static int64_t h_of;
|
|
static int64_t h_the;
|
|
static int64_t h_and;
|
|
static int64_t h_at;
|
|
static int64_t h_to;
|
|
static int64_t h_be;
|
|
static int64_t h_or;
|
|
static int64_t h_not;
|
|
static int64_t h_in;
|
|
static int64_t h_on;
|
|
static int64_t h_for;
|
|
static int64_t h_with;
|
|
static int64_t h_from;
|
|
static int64_t h_click;
|
|
static int64_t h_here;
|
|
static int64_t h_new;
|
|
static int64_t h_free;
|
|
static int64_t h_title;
|
|
static int64_t h_event;
|
|
static int64_t h_adv;
|
|
static int64_t h_dos;
|
|
static int64_t h_advance;
|
|
static int64_t h_day;
|
|
static int64_t h_show;
|
|
static int64_t h_box;
|
|
static int64_t h_office;
|
|
static int64_t h_this;
|
|
static int64_t h_week;
|
|
static int64_t h_tonight;
|
|
static int64_t h_today;
|
|
static int64_t h_http;
|
|
static int64_t h_https;
|
|
static int64_t h_claim;
|
|
static int64_t h_it;
|
|
static int64_t h_upcoming;
|
|
static int64_t h_events;
|
|
static int64_t h_is;
|
|
static int64_t h_your;
|
|
static int64_t h_user;
|
|
static int64_t h_reviews;
|
|
static int64_t h_comments;
|
|
static int64_t h_bookmark;
|
|
static int64_t h_creator;
|
|
static int64_t h_tags;
|
|
static int64_t h_repeats;
|
|
static int64_t h_feed;
|
|
static int64_t h_readers;
|
|
static int64_t h_no;
|
|
static int64_t h_rating;
|
|
static int64_t h_publish;
|
|
static int64_t h_category;
|
|
static int64_t h_genre;
|
|
static int64_t h_type;
|
|
static int64_t h_price;
|
|
static int64_t h_rate;
|
|
static int64_t h_rates;
|
|
static int64_t h_users;
|
|
|
|
static int64_t h_date ;
|
|
static int64_t h_time ;
|
|
static int64_t h_other ;
|
|
static int64_t h_future ;
|
|
static int64_t h_dates ;
|
|
static int64_t h_times ;
|
|
static int64_t h_hide ;
|
|
static int64_t h_print ;
|
|
static int64_t h_powered;
|
|
static int64_t h_provided;
|
|
static int64_t h_admission;
|
|
static int64_t h_by;
|
|
static int64_t h_com;
|
|
static int64_t h_org;
|
|
static int64_t h_net;
|
|
static int64_t h_pg;
|
|
static int64_t h_pg13;
|
|
|
|
static int64_t h_a;
|
|
static int64_t h_use;
|
|
static int64_t h_search;
|
|
static int64_t h_find;
|
|
static int64_t h_school;
|
|
static int64_t h_shop;
|
|
static int64_t h_gift;
|
|
static int64_t h_gallery;
|
|
static int64_t h_library;
|
|
static int64_t h_photo;
|
|
static int64_t h_image;
|
|
static int64_t h_picture;
|
|
static int64_t h_video;
|
|
static int64_t h_media;
|
|
static int64_t h_copyright;
|
|
static int64_t h_review;
|
|
static int64_t h_join;
|
|
static int64_t h_request;
|
|
static int64_t h_promote;
|
|
static int64_t h_open;
|
|
static int64_t h_house;
|
|
static int64_t h_million;
|
|
static int64_t h_billion;
|
|
static int64_t h_thousand;
|
|
|
|
if ( ! s_init2 ) {
|
|
s_init2 = true;
|
|
h_repeats = hash64n("repeats");
|
|
h_feed = hash64n("feed");
|
|
h_readers = hash64n("readers");
|
|
h_no = hash64n("no");
|
|
h_rating = hash64n("rating");
|
|
h_publish = hash64n("publish");
|
|
h_category = hash64n("category");
|
|
h_genre = hash64n("genre");
|
|
h_type = hash64n("type");
|
|
h_price = hash64n("price");
|
|
h_rate = hash64n("rate");
|
|
h_rates = hash64n("rates");
|
|
h_users = hash64n("users");
|
|
h_claim = hash64n("claim");
|
|
h_it = hash64n("it");
|
|
h_upcoming = hash64n("upcoming");
|
|
h_events = hash64n("events");
|
|
h_is = hash64n("is");
|
|
h_your = hash64n("your");
|
|
h_user = hash64n("user");
|
|
h_reviews = hash64n("reviews");
|
|
h_comments = hash64n("comments");
|
|
h_bookmark = hash64n("bookmark");
|
|
h_creator = hash64n("creator");
|
|
h_close = hash64n("close");
|
|
h_tags = hash64n("tags");
|
|
h_send = hash64n("send");
|
|
h_map = hash64n("map");
|
|
h_maps = hash64n("maps");
|
|
h_directions = hash64n("directions");
|
|
h_driving = hash64n("driving");
|
|
h_help = hash64n("help");
|
|
h_more = hash64n("more");
|
|
h_log = hash64n("log");
|
|
h_sign = hash64n("sign");
|
|
h_login = hash64n("login");
|
|
h_back = hash64n("back");
|
|
h_change = hash64n("change");
|
|
h_write = hash64n("write");
|
|
h_save = hash64n("save");
|
|
h_add = hash64n("add");
|
|
h_share = hash64n("share");
|
|
h_forgot = hash64n("forgot");
|
|
h_home = hash64n("home");
|
|
h_hours = hash64n("hours");
|
|
h_sitemap = hash64n("sitemap");
|
|
h_advanced = hash64n("advanced");
|
|
h_go = hash64n("go");
|
|
h_website = hash64n("website");
|
|
h_view = hash64n("view");
|
|
h_submit = hash64n("submit");
|
|
h_get = hash64n("get");
|
|
h_subscribe = hash64n("subscribe");
|
|
h_loading = hash64n("loading");
|
|
h_last = hash64n("last");
|
|
h_modified = hash64n("modified");
|
|
h_updated = hash64n("updated");
|
|
|
|
h_special = hash64n("special");
|
|
h_guest = hash64n("guest");
|
|
h_guests = hash64n("guests");
|
|
h_directed = hash64n("directed");
|
|
h_venue = hash64n("venue");
|
|
h_instructor = hash64n("instructor");
|
|
h_general = hash64n("general");
|
|
h_information = hash64n("information");
|
|
h_info = hash64n("info");
|
|
h_what = hash64n("what");
|
|
h_who = hash64n("who");
|
|
h_tickets = hash64n("tickets");
|
|
h_support = hash64n("support");
|
|
h_featuring = hash64n("featuring");
|
|
h_presents = hash64n("presents");
|
|
|
|
h_phone = hash64n("phone");
|
|
h_usa = hash64n("usa");
|
|
h_relevancy = hash64n("relevancy");
|
|
h_date = hash64n("date");
|
|
h_description = hash64n("description");
|
|
h_buy = hash64n("buy");
|
|
h_where = hash64n("where");
|
|
h_when = hash64n("when");
|
|
h_contact = hash64n("contact");
|
|
h_description = hash64n("description");
|
|
h_location = hash64n("location");
|
|
h_located = hash64n("located");
|
|
h_of = hash64n("of");
|
|
h_the = hash64n("the");
|
|
h_and = hash64n("and");
|
|
h_at = hash64n("at");
|
|
h_to = hash64n("to");
|
|
h_be = hash64n("be");
|
|
h_or = hash64n("or");
|
|
h_not = hash64n("not");
|
|
h_in = hash64n("in");
|
|
h_on = hash64n("on");
|
|
h_for = hash64n("for");
|
|
h_with = hash64n("with");
|
|
h_from = hash64n("from");
|
|
h_click = hash64n("click");
|
|
h_here = hash64n("here");
|
|
h_new = hash64n("new");
|
|
h_free = hash64n("free");
|
|
h_title = hash64n("title");
|
|
h_event = hash64n("event");
|
|
h_adv = hash64n("adv");
|
|
h_dos = hash64n("dos");
|
|
h_day = hash64n("day");
|
|
h_show = hash64n("show");
|
|
h_box = hash64n("box");
|
|
h_i = hash64n("i");
|
|
h_office = hash64n("office");
|
|
h_this = hash64n("this");
|
|
h_week = hash64n("week");
|
|
h_tonight = hash64n("tonight");
|
|
h_today = hash64n("today");
|
|
h_http = hash64n("http");
|
|
h_https = hash64n("https");
|
|
h_date = hash64n("date");
|
|
h_time = hash64n("time");
|
|
h_other = hash64n("other");
|
|
h_future = hash64n("future");
|
|
h_dates = hash64n("dates");
|
|
h_times = hash64n("times");
|
|
h_hide = hash64n("hide");
|
|
h_print = hash64n("print");
|
|
h_powered = hash64n("powered");
|
|
h_provided = hash64n("provided");
|
|
h_admission = hash64n("admission");
|
|
h_by = hash64n("by");
|
|
h_com = hash64n("com");
|
|
h_org = hash64n("org");
|
|
h_net = hash64n("net");
|
|
h_pg = hash64n("pg");
|
|
h_pg13 = hash64n("pg13");
|
|
|
|
h_a = hash64n("a");
|
|
h_use = hash64n("use");
|
|
h_search = hash64n("search");
|
|
h_find = hash64n("find");
|
|
h_school = hash64n("school");
|
|
h_shop = hash64n("shop");
|
|
h_gift = hash64n("gift");
|
|
h_gallery = hash64n("gallery");
|
|
h_library = hash64n("library");
|
|
h_photo = hash64n("photo");
|
|
h_image = hash64n("image");
|
|
h_picture = hash64n("picture");
|
|
h_video = hash64n("video");
|
|
h_media = hash64n("media");
|
|
h_copyright = hash64n("copyright");
|
|
h_review = hash64n("review");
|
|
h_join = hash64n("join");
|
|
h_request = hash64n("request");
|
|
h_promote = hash64n("promote");
|
|
h_open = hash64n("open");
|
|
h_house = hash64n("house");
|
|
h_million = hash64n("million");
|
|
h_billion = hash64n("billion");
|
|
h_thousand = hash64n("thousand");
|
|
}
|
|
|
|
// . title fields!
|
|
// . if entire previous sentence matches this you are[not] a title
|
|
// . uses + to mean is a title, - to mean is NOT a title following
|
|
// . use '$' to indicate, "ends in that" (substring match)
|
|
// . or if previous sentence ends in something like this, it is
|
|
// the same as saying that next sentence begins with this, that
|
|
// should fix "Presented By Colorado Symphony Orchestra" from
|
|
// being a good event title
|
|
static char *s_titleFields [] = {
|
|
|
|
// . these have no '$' so they are exact/full matches
|
|
// . table column headers MUST match full matches and are not
|
|
// allowed to match substring matches
|
|
"+who",
|
|
"+what",
|
|
"+event",
|
|
"+title",
|
|
|
|
// use '$' to indicate sentence ENDS in this
|
|
"-genre",
|
|
"-type",
|
|
"-category",
|
|
"-categories",
|
|
"@where",
|
|
"@location",
|
|
"-contact", // contact: john
|
|
"-neighborhood", // yelp uses this field
|
|
"@venue",
|
|
"-instructor",
|
|
"-instructors",
|
|
"-advisor",
|
|
"-advisors",
|
|
"-leader",
|
|
"-leaders",
|
|
"-chair",
|
|
"-chairman",
|
|
"-chairperson",
|
|
"-designer",
|
|
"-designers",
|
|
"-convenor", // graypanthers uses convenor:
|
|
"-convenors", // graypanthers uses convenor:
|
|
"-caller", // square dancing
|
|
"-callers",
|
|
"-call", // phone number
|
|
"-price",
|
|
"-price range",
|
|
"@event location",
|
|
|
|
// put colon after to indicate we need a colon after!
|
|
// or this can be in a column header.
|
|
// try to fix "business categories: " for switchboard.com
|
|
//"-$categories:"
|
|
|
|
// use '$' to indicate sentence ENDS in this
|
|
"+$presents",
|
|
"+$present",
|
|
"+$featuring",
|
|
|
|
// use '$' to indicate sentence ENDS in this
|
|
"-$presented by",
|
|
"-$brought to you by",
|
|
"-$sponsored by",
|
|
"-$hosted by"
|
|
};
|
|
// store these words into table
|
|
static HashTableX s_ti1;
|
|
static HashTableX s_ti2;
|
|
static char s_tibuf1[2000];
|
|
static char s_tibuf2[2000];
|
|
static bool s_init6 = false;
|
|
if ( ! s_init6 ) {
|
|
s_init6 = true;
|
|
s_ti1.set(8,4,128,s_tibuf1,2000,false,m_niceness,"ti1tab");
|
|
s_ti2.set(8,4,128,s_tibuf2,2000,false,m_niceness,"ti2tab");
|
|
int32_t n = (int32_t)sizeof(s_titleFields)/ sizeof(char *);
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// set words
|
|
char *s = s_titleFields[i];
|
|
Words w; w.set3 ( s );
|
|
int64_t *wi = w.getWordIds();
|
|
int64_t h = 0;
|
|
// scan words
|
|
for ( int32_t j = 0 ; j < w.getNumWords(); j++ )
|
|
if ( wi[j] ) h ^= wi[j];
|
|
// . store hash of all words, value is ptr to it
|
|
// . put all exact matches into ti1 and the substring
|
|
// matches into ti2
|
|
if ( s[1] != '$' )
|
|
s_ti1.addKey ( &h , &s );
|
|
else
|
|
s_ti2.addKey ( &h , &s );
|
|
}
|
|
}
|
|
|
|
// store these words into table
|
|
if ( ! s_init3 ) initGenericTable ( m_niceness );
|
|
|
|
//
|
|
// phrase exceptions to the ignore words
|
|
//
|
|
static char *s_exceptPhrases [] = {
|
|
// this title was used by exploratorium on a zvents page
|
|
"free day",
|
|
"concert band", // the abq concert band
|
|
"band concert",
|
|
"the voice", // "the voice of yes"
|
|
"voice of", // maybe just eliminate voice by itself...?
|
|
"voice for"
|
|
};
|
|
static HashTableX s_ext;
|
|
static char s_ebuf[10000];
|
|
static bool s_init4 = false;
|
|
if ( ! s_init4 ) {
|
|
s_init4 = true;
|
|
s_ext.set(8,0,512,s_ebuf,10000,false,m_niceness,"excp-tab");
|
|
int32_t n = (int32_t)sizeof(s_exceptPhrases)/ sizeof(char *);
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// set words
|
|
Words w; w.set3 ( s_exceptPhrases[i] );
|
|
int64_t *wi = w.getWordIds();
|
|
int64_t h = 0;
|
|
// scan words
|
|
for ( int32_t j = 0 ; j < w.getNumWords(); j++ )
|
|
if ( wi[j] ) h ^= wi[j];
|
|
// store hash of all words
|
|
s_ext.addKey ( &h );
|
|
}
|
|
}
|
|
// shortcut
|
|
Sections *ss = this;
|
|
|
|
// init table
|
|
HashTableX cht;
|
|
char chtbuf[10000];
|
|
cht.set(8,4,512,chtbuf,10000,false,m_niceness,"event-chash");
|
|
// hash the content hash of each section and penalize the title
|
|
// score if it is a repeat on this page
|
|
for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if no text
|
|
//if ( ! si->m_contentHash ) continue;
|
|
// get content hash
|
|
int64_t ch64 = si->m_contentHash64;
|
|
// fix for sentences
|
|
if ( ch64 == 0 ) ch64 = si->m_sentenceContentHash64;
|
|
// if not there in either one, skip it
|
|
if ( ! ch64 ) continue;
|
|
// combine the tag hash with the content hash #2 because
|
|
// a lot of times it is repeated in like a different tag like
|
|
// the title tag
|
|
int64_t modified = si->m_tagHash ^ ch64;
|
|
// store it. return false with g_errno set on error
|
|
if ( ! cht.addTerm ( &modified ) ) return false;
|
|
}
|
|
|
|
// for checking if title contains phone #
|
|
//HashTableX *pt = m_dates->getPhoneTable ();
|
|
m_dates->setPhoneXors();
|
|
// shortcut
|
|
wbit_t *bits = m_bits->m_bits;
|
|
|
|
bool afterColon = false;
|
|
Section *lastsi = NULL;
|
|
// . score each section that directly contains text.
|
|
// . have a score for title and score for description
|
|
for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// if tag breaks turn this off
|
|
if ( afterColon &&
|
|
si->m_tagId &&
|
|
isBreakingTagId ( si->m_tagId ) )
|
|
afterColon = false;
|
|
|
|
// now we require the sentence
|
|
if ( ! ( si->m_flags & SEC_SENTENCE ) ) continue;
|
|
// skip if not in description right now, no need to score it!
|
|
//if ( si->m_minEventId <= 0 ) continue;
|
|
|
|
// if after a colon add that
|
|
if ( afterColon )
|
|
si->m_sentFlags |= SENT_AFTER_COLON;
|
|
|
|
// now with our new logic in Sections::addSentence() [a,b) may
|
|
// actually not be completely contained in "si". this fixes
|
|
// such sentences in aliconference.com and abqtango.com
|
|
int32_t senta = si->m_senta;
|
|
int32_t sentb = si->m_sentb;
|
|
|
|
////////////
|
|
//
|
|
// a byline for a quote?
|
|
//
|
|
// fixes "a great pianist" -New York Times so we do not
|
|
// get "New York Times" as the title for terrence-wilson
|
|
//
|
|
///////////
|
|
char needChar = '-';
|
|
bool over = false;
|
|
// word # senta must be alnum!
|
|
if ( ! m_wids[senta] ) { char *xx=NULL;*xx=0; }
|
|
// start our backwards scan right before the first word
|
|
for ( int32_t i = senta - 1; i >= 0 ; i-- ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// need punct
|
|
if ( m_wids[i] ) {
|
|
// no words allowed between hyphen and quote
|
|
//if ( needChar == '-' ) continue;
|
|
// otherwise if we had the hyphen and need
|
|
// the quote, we can't have a word pop up here
|
|
over = true;
|
|
break;
|
|
}
|
|
if ( m_tids[i] ) continue;
|
|
// got punct now
|
|
char *pstart = m_wptrs[i];
|
|
char *p = m_wptrs[i] + m_wlens[i] - 1;
|
|
for ( ; p >= pstart ; p-- ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
if ( is_wspace_a(*p) )
|
|
continue;
|
|
if ( *p != needChar ) {
|
|
over = true;
|
|
break;
|
|
}
|
|
if ( needChar == '-' ) {
|
|
needChar = '\"';
|
|
continue;
|
|
}
|
|
// otherwise, we got it!
|
|
si->m_sentFlags |= SENT_IS_BYLINE;
|
|
over = true;
|
|
break;
|
|
}
|
|
// stop if not byline or we got byline
|
|
if ( over ) break;
|
|
}
|
|
|
|
|
|
// Tags:
|
|
if ( sentb - senta == 1 && m_wids[senta] == h_tags )
|
|
si->m_sentFlags |= SENT_TAG_INDICATOR;
|
|
|
|
bool tagInd = false;
|
|
if ( lastsi && (lastsi->m_sentFlags & SENT_TAG_INDICATOR) )
|
|
tagInd = true;
|
|
// tables: but if he was in different table row, forget it
|
|
if ( tagInd && lastsi->m_rowNum != si->m_rowNum )
|
|
tagInd = false;
|
|
// tables: or same row, but table is not <= 2 columns
|
|
if ( lastsi && (lastsi->m_flags & SEC_TABLE_HEADER) )
|
|
tagInd = false;
|
|
// if in a table, is our header a tags indicator?
|
|
Section *hdr = si->m_headColSection;
|
|
// must have table header set
|
|
if ( ! hdr ) hdr = si->m_headRowSection;
|
|
// check it out
|
|
if ( hdr &&
|
|
hdr->m_nextSent &&
|
|
hdr->m_nextSent->m_a < hdr->m_b &&
|
|
(hdr->m_nextSent->m_sentFlags & SENT_TAG_INDICATOR) )
|
|
tagInd = true;
|
|
// ok, it was a tag indicator before, so we must be tags
|
|
if ( tagInd )
|
|
si->m_sentFlags |= SENT_TAGS;
|
|
|
|
///////////////
|
|
//
|
|
// set D_IS_IN_PARENS, D_CRUFTY
|
|
//
|
|
///////////////
|
|
|
|
// sometimes title starts with a word and the ( or [
|
|
// is before that word! so back up one word to capture it
|
|
int32_t a = senta;
|
|
// if prev word is punct back up
|
|
if ( a-1>=0 && ! m_wids[a-1] && ! m_tids[a-1] ) a--;
|
|
// back up over prev front tag
|
|
if ( a-1>=0 && m_tids[a-1] && !(m_tids[a-1]&BACKBIT) ) a--;
|
|
// and punct
|
|
if ( a-1>=0 && ! m_wids[a-1] && ! m_tids[a-1] ) a--;
|
|
// init our flags
|
|
//int32_t nonGenerics = 0;
|
|
bool inParens = false;
|
|
bool inSquare = false;
|
|
|
|
for ( int32_t j = a ; j < sentb ; j++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip non alnum words
|
|
if ( ! m_wids[j] ) {
|
|
// skip tags
|
|
if ( m_tids[j] ) continue;
|
|
// count stuff in ()'s or []'s as generic
|
|
if ( m_words->hasChar(j,')') ) inParens =false;
|
|
if ( m_words->hasChar(j,'(') ) inParens =true;
|
|
if ( m_words->hasChar(j,']') ) inSquare =false;
|
|
if ( m_words->hasChar(j,'[') ) inSquare =true;
|
|
continue;
|
|
}
|
|
// generic if in ()'s or []'s
|
|
if ( inParens || inSquare ) {
|
|
bits[j] |= D_IN_PARENS;//GENERIC_WORD;
|
|
continue;
|
|
}
|
|
// skip if in date
|
|
if ( bits[j] & D_IS_IN_DATE ) {
|
|
// was this right?
|
|
//bits[j] |= D_IN_PARENS;//GENERIC_WORD;
|
|
continue;
|
|
}
|
|
// skip doors or show, etc.
|
|
if ( s_igt.isInTable(&m_wids[j]) ) {
|
|
// see if phrase is an exception
|
|
int64_t h = m_wids[j];
|
|
int32_t wcount = 1;
|
|
bool hadException = false;
|
|
for ( int32_t k = j + 1 ; k < sentb ; k++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not word
|
|
if ( ! m_wids[k] ) continue;
|
|
// hash it in otherwise
|
|
h ^= m_wids[k];
|
|
// check exceptions
|
|
if ( s_ext.isInTable(&h) ) {
|
|
hadException = true;
|
|
break;
|
|
}
|
|
// stop after 3 words
|
|
if ( ++wcount >= 3 ) break;
|
|
}
|
|
if ( ! hadException ) {
|
|
bits[j] |= D_CRUFTY;
|
|
continue;
|
|
}
|
|
}
|
|
// numbers are generic (but not if contains an alpha)
|
|
if ( m_words->isNum(j) ) {
|
|
bits[j] |= D_IS_NUM;
|
|
continue;
|
|
}
|
|
// hex num?
|
|
if ( m_words->isHexNum(j) ) {
|
|
bits[j] |= D_IS_HEX_NUM;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
//int32_t upperCount = 0;
|
|
int32_t alphas = 0;
|
|
bool lastStop = false;
|
|
bool inDate = true;
|
|
bool inAddress = false;
|
|
bool notInAddress = false;
|
|
bool inAddressName = false;
|
|
int32_t stops = 0;
|
|
inParens = false;
|
|
int32_t dollarCount = 0;
|
|
int32_t priceWordCount = 0;
|
|
bool hadAt = false;
|
|
|
|
// punish for huge # of words
|
|
//if ( si->m_alnumPosB - si->m_alnumPosA + 1 >= 15 ) {
|
|
// tflags |= SENT_TOO_MANY_WORDS;
|
|
//}
|
|
|
|
// watchout if in a table. the last table column header
|
|
// should not be applied to the first table cell in the
|
|
// next row! was screwing up
|
|
// www.carnegieconcerts.com/eventperformances.asp?evt=54
|
|
if ( si->m_tableSec &&
|
|
lastsi &&
|
|
lastsi->m_tableSec == si->m_tableSec &&
|
|
lastsi->m_rowNum != si->m_rowNum &&
|
|
lastsi->m_colNum > 1 )
|
|
lastsi = NULL;
|
|
|
|
////////////////////
|
|
//
|
|
// if prev sentence had a title field set, adjust us!
|
|
//
|
|
////////////////////
|
|
if ( lastsi && (lastsi->m_sentFlags & SENT_NON_TITLE_FIELD))
|
|
si->m_sentFlags |= SENT_INNONTITLEFIELD;
|
|
if ( lastsi && (lastsi->m_sentFlags & SENT_TITLE_FIELD) )
|
|
si->m_sentFlags |= SENT_INTITLEFIELD;
|
|
// we are the new lastsi now
|
|
lastsi = si;
|
|
|
|
|
|
|
|
//////////////////
|
|
//
|
|
// set SENT_NON_TITLE_FIELD
|
|
//
|
|
// - any sentence immediately following us will get its
|
|
// title score reduced and SENT_INNONTITLEFIELD flag set
|
|
// if we set this SENT_NON_TITLE_FIELD for this sentence
|
|
//
|
|
////////////////
|
|
int64_t h = 0;
|
|
int32_t wcount = 0;
|
|
// scan BACKWARDS
|
|
for ( int32_t i = sentb - 1 ; i >= senta ; i-- ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not alnum
|
|
if ( ! m_wids[i] ) continue;
|
|
// must have word after at least
|
|
if ( i + 2 >= m_nw ) continue;
|
|
// set it first time?
|
|
if ( h == 0 ) h = m_wids[i];
|
|
else h ^= m_wids[i];
|
|
// max word count
|
|
if ( ++wcount >= 5 ) break;
|
|
// get from table
|
|
char **msp1 = (char **)s_ti1.getValue(&h);
|
|
char **msp2 = (char **)s_ti2.getValue(&h);
|
|
// cancel out msp1 (exact match) if we do not have
|
|
// the full sentence hashed yet
|
|
if ( i != senta ) msp1 = NULL;
|
|
// if we are doing a substring match we must be
|
|
// filled with generic words! otherwise we get
|
|
// "...permission from the Yoga Instructor."
|
|
// becoming a non-title field and hurting the next
|
|
// sentence's capability of being a title.
|
|
//if ( !(si->m_sentFlags & SENT_GENERIC_WORDS) )
|
|
//msp2 = NULL;
|
|
// if not in table, skip
|
|
if ( ! msp1 && ! msp2 ) continue;
|
|
char *s = NULL;
|
|
// use exact match first if we got it
|
|
if ( msp1 ) s = *msp1;
|
|
// otherwise, use the substring match
|
|
else s = *msp2;
|
|
|
|
// Fix: "Sort by: Date | Title | Photo" so Title
|
|
// is not a title field in this case. scan for
|
|
// | after the word.
|
|
int32_t pcount = 0;
|
|
bool hadPipeAfter = false;
|
|
for ( int32_t j = i + 1 ; j < m_nw ; j++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( m_tids[j] ) continue;
|
|
if ( m_wids[j] ) break;
|
|
char *p = m_wptrs[j];
|
|
char *pend = p + m_wlens[j];
|
|
for ( ; p < pend ; p++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( *p == '|' ) break;
|
|
}
|
|
if ( p < pend ) {
|
|
hadPipeAfter = true;
|
|
break;
|
|
}
|
|
// scan no more than two punct words
|
|
if ( ++pcount >= 2 ) break;
|
|
}
|
|
// if we hit the '|' then forget it!
|
|
if ( hadPipeAfter ) continue;
|
|
// we matched then
|
|
if ( s[0] == '+' )
|
|
si->m_sentFlags |= SENT_TITLE_FIELD;
|
|
else
|
|
si->m_sentFlags |= SENT_NON_TITLE_FIELD;
|
|
// @Location
|
|
//if ( s[0] == '@' )
|
|
// si->m_sentFlags |= SENT_PLACE_FIELD;
|
|
break;
|
|
}
|
|
|
|
//////////////////////////
|
|
//
|
|
// USE TABLE HEADERS AS INDICATORS
|
|
//
|
|
//
|
|
// . repeat that same logic but for the table column header
|
|
// . if we are in a table and table header is "title"
|
|
// . for http://events.mapchannels.com/Index.aspx?venue=628
|
|
// . why isn't this kicking in for psfs.org which has
|
|
// "location" in the table column header
|
|
// . we set "m_headColSection" to NULL if not a header per se
|
|
// . a table can only have a header row or header column
|
|
|
|
hdr = si->m_headColSection;
|
|
// must have table header set
|
|
if ( ! hdr ) hdr = si->m_headRowSection;
|
|
|
|
// ok, set to it
|
|
int32_t csentb = 0;
|
|
int32_t csenta = 0;
|
|
if ( hdr && hdr->m_firstWordPos >= 0 &&
|
|
// do not allow the the header row to get its
|
|
// SENT_PLACE_INDICATED set, etc. it's a field not a value
|
|
! hdr->contains ( si ) ) {
|
|
csenta = hdr->m_firstWordPos;
|
|
csentb = hdr->m_lastWordPos+1;
|
|
}
|
|
h = 0;
|
|
wcount = 0;
|
|
// scan BACKWARDS
|
|
for ( int32_t i = csentb - 1 ; i >= csenta ; i-- ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not alnum
|
|
if ( ! m_wids[i] ) continue;
|
|
// set it first time?
|
|
if ( h == 0 ) h = m_wids[i];
|
|
else h ^= m_wids[i];
|
|
// max word count
|
|
if ( ++wcount >= 5 ) break;
|
|
// stop if not full yet
|
|
if ( i != csenta ) continue;
|
|
// get from table, only exact match since we are
|
|
// checking column headers
|
|
char **msp1 = (char **)s_ti1.getValue(&h);
|
|
//char **msp2 = (char **)s_ti1.getValue(j);
|
|
// skip if the hashed header text is not in the exact-match table
|
|
if ( ! msp1 ) continue;
|
|
// use exact match first if we got it
|
|
char *s = *msp1;
|
|
// we matched then
|
|
if ( s[0] == '+' ) {
|
|
si->m_sentFlags |= SENT_INTITLEFIELD;
|
|
}
|
|
else {
|
|
si->m_sentFlags |= SENT_INNONTITLEFIELD;
|
|
}
|
|
// @Location, like for psfs.org which has Location in the table column header
|
|
if ( s[0] == '@' )
|
|
si->m_sentFlags |= SENT_INPLACEFIELD;
|
|
break;
|
|
}
|
|
|
|
bool hadDollar = false;
|
|
// fix sentences that start with stuff like "$12 ..."
|
|
if ( senta>0 &&
|
|
! m_tids[senta-1] &&
|
|
m_words->hasChar(senta-1,'$') ) {
|
|
dollarCount++;
|
|
hadDollar = true;
|
|
}
|
|
|
|
bool hasSpace = false;
|
|
|
|
//////////////////
|
|
//
|
|
// SENT_TAGS
|
|
//
|
|
//////////////////
|
|
//
|
|
// . just check for <eventTags> for now
|
|
// . get parent tag
|
|
Section *ps = si->m_parent;
|
|
if ( ps &&
|
|
(m_isRSSExt || m_contentType == CT_XML) &&
|
|
m_tids[ps->m_a] == TAG_XMLTAG &&
|
|
strncasecmp(m_wptrs[ps->m_a],"<eventTags>",11)==0 )
|
|
si->m_sentFlags |= SENT_TAGS;
|
|
|
|
/////////////////
|
|
//
|
|
// SENT_INTITLEFIELD
|
|
//
|
|
/////////////////
|
|
if ( ps &&
|
|
(m_isRSSExt || m_contentType == CT_XML) &&
|
|
m_tids[ps->m_a] == TAG_XMLTAG &&
|
|
strncasecmp(m_wptrs[ps->m_a],"<eventTitle>",12)==0 )
|
|
si->m_sentFlags |= SENT_INTITLEFIELD;
|
|
// stubhub.com feed support
|
|
if ( ps &&
|
|
(m_isRSSExt || m_contentType == CT_XML) &&
|
|
m_tids[ps->m_a] == TAG_XMLTAG &&
|
|
strncasecmp(m_wptrs[ps->m_a],
|
|
"<str name=\"act_primary\">",24)==0 )
|
|
si->m_sentFlags |= SENT_INTITLEFIELD;
|
|
|
|
|
|
///////////////////
|
|
//
|
|
// SENT_STRANGE_PUNCT etc.
|
|
//
|
|
///////////////////
|
|
int64_t lastWid = 0LL;
|
|
for ( int32_t i = senta ; i < sentb ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not alnum
|
|
if ( ! m_wids[i] ) {
|
|
// skip tags right away
|
|
if ( m_tids[i] ) continue;
|
|
// check for dollar sign and give slight
|
|
// demotion so we can avoid ticket price
|
|
// titles
|
|
//if ( m_words->hasChar(i,'$') )
|
|
// dollarCount++;
|
|
// equal sign is strange. will fix
|
|
// SSID=Truman Library for cabq.gov
|
|
//if ( m_words->hasChar(i,'=' ) )
|
|
// si->m_sentFlags |= SENT_STRANGE_PUNCT;
|
|
// blackbird buvetter has '*''s in menu, so
|
|
// prevent those from being title.
|
|
//if ( m_words->hasChar(i,'*' ) )
|
|
// si->m_sentFlags |= SENT_STRANGE_PUNCT;
|
|
// Unnamed Server [68%] for piratecatradio
|
|
//if ( m_words->hasChar(i,'%' ) )
|
|
// si->m_sentFlags |= SENT_STRANGE_PUNCT;
|
|
// back to back _ might indicate form field
|
|
char *p = m_wptrs[i];
|
|
char *pend = p + m_wlens[i];
|
|
bool strange = false;
|
|
for ( ; p < pend ; p++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( *p == '$' ) {
|
|
hadDollar = true;
|
|
dollarCount++;
|
|
}
|
|
if ( *p == '=' ) strange = true;
|
|
if ( *p == '*' ) strange = true;
|
|
if ( *p == '%' ) strange = true;
|
|
if ( is_wspace_a(*p) ) hasSpace = true;
|
|
// following meats: ____chicken ___pork
|
|
if ( *p == '_' && p[1] == '_' )
|
|
strange = true;
|
|
// need one alnum b4 parens check
|
|
if ( alphas == 0 ) continue;
|
|
if ( *p == '(' ) inParens = true;
|
|
if ( *p == ')' ) inParens = false;
|
|
}
|
|
if ( strange )
|
|
si->m_sentFlags |= SENT_STRANGE_PUNCT;
|
|
// need at least one alnum before parens check
|
|
if ( alphas == 0 ) continue;
|
|
// has colon
|
|
if ( m_words->hasChar(i,':') &&
|
|
i>senta &&
|
|
! is_digit(m_wptrs[i][-1]) )
|
|
si->m_sentFlags |= SENT_HAS_COLON;
|
|
// check for end first in case of ") ("
|
|
//if ( m_words->hasChar(i,')') )
|
|
// inParens = false;
|
|
// check if in parens
|
|
//if ( m_words->hasChar(i,'(') )
|
|
// inParens = true;
|
|
continue;
|
|
}
|
|
|
|
//
|
|
// check for a sane PRICE after the dollar sign.
|
|
// we do not want large numbers after it, like for
|
|
// budgets or whatever. those are not tickets!!
|
|
//
|
|
if ( hadDollar ) {
|
|
// require a number after the dollar sign
|
|
if ( ! is_digit(m_wptrs[i][0]) )
|
|
dollarCount = 0;
|
|
// number can't be too big!
|
|
char *p = m_wptrs[i];
|
|
int32_t digits =0;
|
|
char *pmax = p + 30;
|
|
bool hadPeriod = false;
|
|
for ( ; *p && p < pmax ; p++ ) {
|
|
if ( *p == ',' ) continue;
|
|
if ( *p == '.' ) {
|
|
hadPeriod = true;
|
|
continue;
|
|
}
|
|
if ( is_wspace_a(*p) ) continue;
|
|
if ( is_digit(*p) ) {
|
|
if ( ! hadPeriod ) digits++;
|
|
continue;
|
|
}
|
|
// $20M? $20B?
|
|
if ( to_lower_a(*p) == 'm' )
|
|
dollarCount = 0;
|
|
if ( to_lower_a(*p) == 'b' )
|
|
dollarCount = 0;
|
|
break;
|
|
}
|
|
if ( digits >= 4 )
|
|
dollarCount = 0;
|
|
// if word after the digits is million
|
|
// thousand billion, etc. then nuke it
|
|
int32_t n = i + 1;
|
|
int32_t nmax = i + 10;
|
|
if ( nmax > m_nw ) nmax = m_nw;
|
|
for ( ; n < nmax ; n++ ) {
|
|
if ( ! m_wids[n] ) continue;
|
|
if ( m_wids[n] == h_million )
|
|
dollarCount = 0;
|
|
if ( m_wids[n] == h_billion )
|
|
dollarCount = 0;
|
|
if ( m_wids[n] == h_thousand )
|
|
dollarCount = 0;
|
|
break;
|
|
}
|
|
// reset for next one
|
|
hadDollar = false;
|
|
}
|
|
|
|
int64_t savedWid = lastWid;
|
|
lastWid = m_wids[i];
|
|
|
|
// skip if not ours directly
|
|
//if ( sp[i] != si ) continue;
|
|
// in address?
|
|
if ( bits[i] & D_IS_IN_VERIFIED_ADDRESS_NAME )
|
|
inAddressName = true;
|
|
else if ( bits[i] & D_IS_IN_UNVERIFIED_ADDRESS_NAME)
|
|
inAddressName = true;
|
|
else if ( bits[i] & D_IS_IN_ADDRESS )
|
|
inAddress = true;
|
|
else
|
|
notInAddress = true;
|
|
|
|
// . skip if in parens
|
|
// . but allow place name #1's in parens like
|
|
// (Albuquerque Rescue Mission) is for unm.edu
|
|
if ( inParens ) continue;
|
|
|
|
// . in date?
|
|
// . hurts "4-5pm Drumming for Dancers w/ Heidi" so
|
|
// make sure all are in date!
|
|
if ( ! ( bits[i] & D_IS_IN_DATE ) ) inDate = false;
|
|
// "provided/powered by"
|
|
if ( ( m_wids[i] == h_provided ||
|
|
m_wids[i] == h_powered ) &&
|
|
i + 2 < sentb &&
|
|
m_wids[i+2] == h_by )
|
|
si->m_sentFlags |= SENT_POWERED_BY;
|
|
// pricey? "free admission" is a price... be sure
|
|
// to include in the description!
|
|
if ( m_wids[i] == h_admission && savedWid == h_free )
|
|
si->m_sentFlags |= SENT_HAS_PRICE;
|
|
// count alphas
|
|
if ( ! is_digit(m_wptrs[i][0]) ) alphas++;
|
|
// "B-52" as in the band (exclude phone #'s!)
|
|
else if ( i-2>0 &&
|
|
m_wptrs[i][-1] =='-' &&
|
|
!is_digit(m_wptrs[i-2][0] ) )
|
|
alphas++;
|
|
// "B-52s", 52s has a letter in it!
|
|
else if ( ! m_words->isNum(i) ) alphas++;
|
|
// are we a stop word?
|
|
bool isStopWord = m_words->isStopWord(i);
|
|
// .com is stop word
|
|
if ( m_wids[i] == h_com ||
|
|
m_wids[i] == h_org ||
|
|
m_wids[i] == h_net ||
|
|
// fixes mr. movie times "PG-13"
|
|
m_wids[i] == h_pg ||
|
|
m_wids[i] == h_pg13 )
|
|
isStopWord = true;
|
|
// count them
|
|
if ( isStopWord ) stops++;
|
|
// set this
|
|
if ( m_wids[i] == h_at ) hadAt = true;
|
|
// if we end on a stop word that is usually indicative
|
|
// of something like
|
|
// "Search Results for <h1>Doughnuts</h1>" as for
|
|
// switchboard.com url
|
|
if ( m_wids[i] == h_of ||
|
|
m_wids[i] == h_the ||
|
|
m_wids[i] == h_and ||
|
|
m_wids[i] == h_at ||
|
|
m_wids[i] == h_to ||
|
|
m_wids[i] == h_be ||
|
|
m_wids[i] == h_or ||
|
|
m_wids[i] == h_not ||
|
|
m_wids[i] == h_in ||
|
|
m_wids[i] == h_by ||
|
|
m_wids[i] == h_on ||
|
|
m_wids[i] == h_for ||
|
|
m_wids[i] == h_with||
|
|
m_wids[i] == h_from )
|
|
lastStop = true;
|
|
else
|
|
lastStop = false;
|
|
// ticket pricey words
|
|
if ( m_wids[i] == h_adv ||
|
|
m_wids[i] == h_dos ||
|
|
m_wids[i] == h_advance ||
|
|
m_wids[i] == h_day ||
|
|
m_wids[i] == h_of ||
|
|
m_wids[i] == h_show ||
|
|
m_wids[i] == h_box ||
|
|
m_wids[i] == h_office )
|
|
priceWordCount++;
|
|
}
|
|
|
|
// set this
|
|
if ( ! hasSpace )
|
|
si->m_sentFlags |= SENT_HASNOSPACE;
|
|
|
|
/*
|
|
// if all words one in address, penalize
|
|
if ( inAddress && ! notInAddress ) {
|
|
//tscore *= .76;
|
|
//dscore *= .76;
|
|
// since it is a street or city, nuke it harder,
|
|
// harder than a mixed case sentence really... which
|
|
// is .15.
|
|
// now http://www.cabq.gov/library/branches.html is
|
|
// getting the street addresses as titles because
|
|
// the sub header is nuked so hard from MULT_EVENTS
|
|
// so nuke this even harder
|
|
// generic words
|
|
//tscore *= .001;//.12;
|
|
//dscore *= .001;//.12;
|
|
// if generic that means we are not store hours
|
|
// and this is something we do not want in the title
|
|
if ( tflags & SENT_GENERIC_WORDS )
|
|
tscore *= .99;
|
|
// a slight hit now, no hurts, blackbird
|
|
//tscore *= .99;
|
|
tflags |= SENT_IN_ADDRESS;
|
|
}
|
|
|
|
// . if some words in address, penalize less
|
|
// . fixes newmexico.org urls which have
|
|
// "45th Annual Rockhound Roundup - Gem & Mineral Show -
|
|
// Deming, NM - 09:00 AM" where Deming, NM is in address
|
|
// but the rest are not. it caused us to prefer the
|
|
// section "Thursday, 11 March, 2010" as the title and
|
|
// we wound up not getting an outlinked title bit set
|
|
// (EV_OUTLINKED_TITLE)
|
|
if ( inAddress && notInAddress ) {
|
|
// hurt salsapower.com title from
|
|
// "Intermediate Salsa class on Saturdays at 11 a.m.
|
|
// at the Harwood 1114 7th Street NW ts=50.0 ds=50.0
|
|
// inaddress" to "Alb, NM" with weight of .50, so i
|
|
// changed to no penalty.
|
|
// BUT without a penalty we get event titles like
|
|
// "Albuquerque, NM 87104-1133" from trumba that
|
|
// have extended zip codes not recognized as being
|
|
// 100% in an address. and we get
|
|
// "P.O.B. 4321, zip 87196" from unm.edu that are
|
|
// quite address-y but not 100% recognized as all
|
|
// address words by us!
|
|
// . BUT we are missing some nice titles for
|
|
// abqtango.com like "Dance to Live Tango Music,
|
|
// Fridays, 10:00pm-Midnight at the Roasted Bean
|
|
// Cafe", so let's only penalize if there is no
|
|
// "at" in the title
|
|
if ( ! hadAt ) {
|
|
// generic words
|
|
//tscore *= .90;
|
|
//dscore *= .90;
|
|
// slightly - no, hurts blackbird
|
|
//tscore *= .99;
|
|
}
|
|
tflags |= SENT_IN_ADDRESS;
|
|
}
|
|
*/
|
|
|
|
// punish just a tiny amount to fix
|
|
// http://www.when.com/albuquerque-nm/venues which is using
|
|
// the place name as the title and not the real event title
|
|
// which is actually AFTER the event date. BUT the problem
|
|
// is is that good titles are often mislabeled as being
|
|
// in an address name! so we can't really punish this at all
|
|
// because we will lose good titles... unless it is a
|
|
// verified place name i guess... so i changed the algo
|
|
// in Address.cpp to only set D_IS_IN_VERIFIED_ADDRESS_NAME if
|
|
// the place name is verified, not just inlined...
|
|
if ( inAddressName && ! notInAddress )
|
|
si->m_sentFlags |= SENT_PLACE_NAME;
|
|
|
|
// does it contain a place name?
|
|
if ( inAddressName )
|
|
si->m_sentFlags |= SENT_CONTAINS_PLACE_NAME;
|
|
|
|
// try to avoid multiple-ticket-price titles
|
|
if ( dollarCount >= 2 )
|
|
si->m_sentFlags |= SENT_PRICEY;
|
|
// . if all words in section are describing ticket price...
|
|
// . fix bad titles for southgatehouse.com because right title
|
|
// is in SEC_MENU and perchance is also repeated on the page
|
|
// so its score gets slammed and it doesn't even make it in
|
|
// the event description. instead the title we pick is this
|
|
// ticket pricey title, so let's penalize that here so the
|
|
// right title comes through
|
|
// . pricey title was "$17 ADV / $20 DOS"
|
|
// . title in SEC_MENU is now "Ricky Nye". we should have
|
|
// "Buckwheat zydeco" too, but i guess it didn't make the
|
|
// cut, but since we set EV_OUTLINKED_TITLE anyway, it
|
|
// doesn't matter for now.
|
|
if ( dollarCount >= 1 && alphas == priceWordCount )
|
|
si->m_sentFlags |= SENT_PRICEY;
|
|
// single dollar sign?
|
|
if ( dollarCount >= 1 )
|
|
si->m_sentFlags |= SENT_HAS_PRICE;
|
|
|
|
// . if ALL words in date, penalize
|
|
// . penalize by .50 to fix events.mapchannels.com
|
|
// . nuke even harder because www.zvents.com/albuquerque-nm/events/show/88543421-the-love-song-of-j-robert-oppenheimer-by-carson-kreitzer
|
|
// was using the date of the events as the title rather than
|
|
// resorting to "Other Future Dates & Times" header
|
|
// which was penalized down to 4.7 because of MULT_EVENTS
|
|
if ( inDate ) {
|
|
// generic words
|
|
//tscore *= .001;
|
|
//dscore *= .001;
|
|
// punish just a little so that if two titles are
|
|
// basically the same but one has a date, use
|
|
// the one without the date "single's day soiree"
|
|
// vs. "single's day soiree Monday Feb 14th..."
|
|
//tscore *= .99;
|
|
si->m_sentFlags |= SENT_IS_DATE;
|
|
}
|
|
// hurts "Jay-Z" but was helping "Results 1-10 of"
|
|
if ( lastStop )
|
|
si->m_sentFlags |= SENT_LAST_STOP;
|
|
|
|
// check to see if text is a "city, state"
|
|
// or has a city in previous section and state in this
|
|
if ( m_aa->isCityState ( si ) )
|
|
si->m_sentFlags |= SENT_CITY_STATE;
|
|
|
|
// punish if only numbers (excluding stop words)
|
|
if ( alphas - stops == 0 )
|
|
si->m_sentFlags |= SENT_NUMBERS_ONLY;
|
|
|
|
// do we contain a phone number?
|
|
//if ( pt->isInTable ( &si ) )
|
|
if ( si->m_phoneXor )
|
|
si->m_sentFlags |= SENT_HAS_PHONE;
|
|
|
|
//
|
|
// end case check
|
|
//
|
|
|
|
char *p = NULL;
|
|
int32_t lastPunct = si->m_sentb;
|
|
// skip over tags to fix nonamejustfriends.com sentence
|
|
for ( ; lastPunct < m_nw && m_tids[lastPunct] ; lastPunct++);
|
|
// now assume, possibly incorrectly, that it is punct
|
|
if ( lastPunct < m_nw ) p = m_wptrs[lastPunct];
|
|
|
|
|
|
if ( p && (p[0]==':' || p[1]==':' ) ) {
|
|
// commas also need to fix lacma.org?
|
|
//!(si->m_sentFlags & SENT_HAS_COMMA) &&
|
|
// . only set this if we are likely a field name
|
|
// . fix "Members of the Capitol Ensemble ...
|
|
// perform Schubert: <i>String Trio in B-flat..."
|
|
// for lacma.org
|
|
//(si->m_alnumPosB - si->m_alnumPosA) <= 5 ) {
|
|
si->m_sentFlags |= SENT_COLON_ENDS;
|
|
afterColon = true;
|
|
}
|
|
// starts with '(' or '[' is strange!
|
|
if ( senta>0 &&
|
|
( m_wptrs[senta][-1] == '(' ||
|
|
m_wptrs[senta][-1] == '[' ) )
|
|
si->m_sentFlags |= SENT_PARENS_START; // STRANGE_PUNCT;
|
|
|
|
// . if one lower case and most are upper, we can ignore
|
|
// . fixes mistakes
|
|
// . fixes "Welcome to the Academy store" for wholefoods.com
|
|
//if ( lowerCount == 1 && upperCount >= 2 ) lowerCount = 0;
|
|
// punish if not title case
|
|
//if ( lowerCount )
|
|
// si->m_sentFlags |= SENT_MIXED_CASE;
|
|
|
|
// if in a tag of its own that's great! like being in a header
|
|
// tag kind of
|
|
Section *sp = si->m_parent;
|
|
// skip paragraph tags
|
|
//if ( sp && sp->m_tagId == TAG_P ) sp = sp->m_parent;
|
|
|
|
if ( sp &&
|
|
//sp->m_tagId != TAG_P &&
|
|
sp->m_firstWordPos == si->m_firstWordPos &&
|
|
sp->m_lastWordPos == si->m_lastWordPos )
|
|
si->m_sentFlags |= SENT_IN_TAG;
|
|
|
|
// this is obvious
|
|
if ( si->m_sentFlags & SENT_IN_HEADER )
|
|
si->m_sentFlags |= SENT_IN_TAG;
|
|
|
|
// if sent to left of us starts the parent tag and a colon
|
|
// separates us, then set both our SENT_IN_TAG bits.
|
|
// fixes newmexico.org where event title is "Looking Ahead:
|
|
// Portraits from the Mott-Warsh Collection, Las Cruces"
|
|
/*
|
|
int32_t cc = si->m_a - 1;
|
|
Section *leftSent = NULL;
|
|
if ( cc - 1 >= 0 ) leftSent = ss->m_sectionPtrs [ cc - 1 ];
|
|
if ( leftSent &&
|
|
(leftSent->m_flags & SEC_SENTENCE) &&
|
|
m_words->hasChar(cc,':') &&
|
|
sp->m_firstWordPos == leftSent->m_firstWordPos &&
|
|
sp->m_lastWordPos == si->m_lastWordPos ) {
|
|
si->m_sentFlags |= SENT_IN_TAG;
|
|
leftSent->m_sentFlags |= SENT_IN_TAG;
|
|
}
|
|
*/
|
|
}
|
|
|
|
|
|
// are we a trumba.com url? we trust the event titles for those
|
|
//char *dom = m_url->getDomain();
|
|
//int32_t dlen = m_url->getDomainLen();
|
|
//bool isTrumba = false;
|
|
//bool isFacebook = false;
|
|
//if ( dlen == 10 && strncmp ( dom , "trumba.com" , 10 ) == 0 )
|
|
// isTrumba = true;
|
|
//if ( dlen == 12 && strncmp ( dom , "facebook.com" , 12 ) == 0 )
|
|
// isFacebook = true;
|
|
|
|
//bool lastSentenceHadColon = false;
|
|
// . score each section that directly contains text.
|
|
// . have a score for title and score for description
|
|
for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// now we require the sentence
|
|
if ( ! ( si->m_flags & SEC_SENTENCE ) ) continue;
|
|
// skip if not in description right now, no need to score it!
|
|
//if ( si->m_minEventId <= 0 ) continue;
|
|
|
|
// . if we are after a colon
|
|
// . hurts "Dragonfly Lecture: BIGHORN SHEEP SONGS" for
|
|
// http://www.kumeyaay.com/2009/11/dragonfly-lecture-bighorn
|
|
// -sheep-songs/
|
|
// . hurts "Four Centuries: a History of Albuquerque" for
|
|
// collectorsguide.com
|
|
// . but now i do not penalize if the sentence before the
|
|
// colon had two or more words... see below
|
|
//if ( lastSentenceHadColon )
|
|
// si->m_sentFlags |= SENT_AFTER_COLON;
|
|
|
|
if ( si->m_flags & SEC_MENU )
|
|
si->m_sentFlags |= SENT_IN_MENU;
|
|
|
|
if ( si->m_flags & SEC_MENU_SENTENCE )
|
|
si->m_sentFlags |= SENT_MENU_SENTENCE;
|
|
|
|
// why not check menu header too?
|
|
// would fix 'nearby bars' for terrence-wilson zvents url
|
|
if ( si->m_flags & SEC_MENU_HEADER )
|
|
si->m_sentFlags |= SENT_IN_MENU_HEADER;
|
|
|
|
|
|
// assume we do not end in a colon
|
|
//lastSentenceHadColon = false;
|
|
|
|
int32_t sa = si->m_senta;
|
|
int32_t sb = si->m_sentb;
|
|
|
|
// if breaking tag between "sa" and last word of prev sentence
|
|
// AND now breaking tag between our last word and beginning
|
|
// of next sentence, that's a "word sandwich" and not
|
|
// conducive to titles... treat format change tags as
|
|
// breaking tags for this purpose...
|
|
int32_t na = sb;
|
|
int32_t maxna = na + 40;
|
|
if ( maxna > m_nw ) maxna = m_nw;
|
|
bool hadRightTag = true;
|
|
for ( ; na < maxna ; na++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop on tag
|
|
if ( m_tids[na] ) break;
|
|
// or word
|
|
if ( ! m_wids[na] ) continue;
|
|
// heh...
|
|
hadRightTag = false;
|
|
break;
|
|
}
|
|
bool hadLeftTag = true;
|
|
na = sa - 1;
|
|
int32_t minna = na - 40;
|
|
if ( minna < 0 ) minna = 0;
|
|
for ( ; na >= minna ; na-- ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop if had a tag on right
|
|
if ( hadRightTag ) break;
|
|
// stop on tag
|
|
if ( m_tids[na] ) break;
|
|
// or word
|
|
if ( ! m_wids[na] ) continue;
|
|
// heh...
|
|
hadLeftTag = false;
|
|
break;
|
|
}
|
|
if ( ! hadRightTag && ! hadLeftTag )
|
|
si->m_sentFlags |= SENT_WORD_SANDWICH;
|
|
|
|
// <hr> or <p>...</p> before us with no text?
|
|
Section *pj = si->m_prev;
|
|
// keep backing up until does not contain us, if ever
|
|
for ( ; pj && pj->m_b > sa ; pj = pj->m_prev ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
}
|
|
// now check it out
|
|
if ( pj && pj->m_firstWordPos < 0 )
|
|
// must be p or hr tag etc.
|
|
si->m_sentFlags |= SENT_AFTER_SPACER;
|
|
|
|
// TODO: also set if first sentence in CONTAINER...!!
|
|
|
|
|
|
|
|
// likewise, before a spacer tag
|
|
pj = si->m_next;
|
|
// keep advancing until does not contain us, if ever
|
|
for ( ; pj && pj->m_a < sb ; pj = pj->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
}
|
|
// now check it out
|
|
if ( pj && pj->m_firstWordPos < 0 )
|
|
// must be p or hr tag etc.
|
|
si->m_sentFlags |= SENT_BEFORE_SPACER;
|
|
|
|
|
|
// location sandwich? punish
|
|
// "2000 Mountain Rd NW, Old Town, Albuquerque, NM 87104"
|
|
// for collectorsguide.com.
|
|
if ( (bits[sa ] & D_IS_IN_ADDRESS) &&
|
|
(bits[sb-1] & D_IS_IN_ADDRESS) )
|
|
si->m_sentFlags |= SENT_LOCATION_SANDWICH;
|
|
|
|
|
|
|
|
// . has colon at the end of it?
|
|
// . but do allow "hours:" etc to be a title...
|
|
// . that fixes www.thewoodencow.com's store hours "event"
|
|
/*
|
|
if ( m_words->hasChar(sb-1,':') &&
|
|
(sb-2<0 || m_wids[sb-2]!=h_hours) ) {
|
|
// just slight penalty, no hurts us!
|
|
// "*Practica*: 9-10pm"
|
|
si->m_sentFlags |= SENT_HAS_COLON;
|
|
lastSentenceHadColon = true;
|
|
}
|
|
else if ( sb<m_nw && m_words->hasChar(sb,':') &&
|
|
(sb-1<0 || m_wids[sb-1]!=h_hours) ) {
|
|
si->m_sentFlags |= SENT_HAS_COLON;
|
|
lastSentenceHadColon = true;
|
|
}
|
|
|
|
// negate if title: etc.
|
|
if ( m_wids[sb-1] == h_title )
|
|
lastSentenceHadColon = false;
|
|
if ( m_wids[sb-1] == h_event )
|
|
lastSentenceHadColon = false;
|
|
// . or if 2+ words in sentence
|
|
// . but still "Doves: For Divorced, Widowed and Separated"
|
|
// title suffers for http://www.trumba.com/calendars/
|
|
// KRQE_Calendar.rss
|
|
if ( sb - sa >= 2 )
|
|
lastSentenceHadColon = false;
|
|
// negate if "Saturdays: 5-6:30pm All Levels African w/ ..."
|
|
// to fix texasdrums.drums.org url
|
|
if ( bits[sa] & D_IS_IN_DATE )
|
|
lastSentenceHadColon = false;
|
|
*/
|
|
|
|
// . dup slam
|
|
// . make it 2 to 1 to fix trumba.com
|
|
//if ( si->m_votesForDup > 2 * si->m_votesForNotDup )
|
|
// si->m_sentFlags |= SENT_DUP_SECTION;
|
|
|
|
// this crap is set in XmlDoc.cpp
|
|
//if ( si->m_votesForDup > 2 * si->m_votesForNotDup &&
|
|
// si->m_votesForDup >= 1 &&
|
|
// ! (si->m_flags & SEC_HAS_NONFUZZYDATE) )
|
|
// si->m_sentFlags |= SENT_DUP_SECTION;
|
|
|
|
// . second title slam
|
|
// . need to telescope up for this i think
|
|
Section *sit = si;
|
|
for ( ; sit ; sit = sit->m_parent ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// test
|
|
if ( ! ( sit->m_flags & SEC_SECOND_TITLE ) ) continue;
|
|
si->m_sentFlags |= SENT_SECOND_TITLE;
|
|
break;
|
|
}
|
|
|
|
// are we in a facebook name tag?
|
|
sit = si;
|
|
//if ( ! m_isFacebook ) sit = NULL;
|
|
if ( m_contentType != CT_XML ) sit = NULL;
|
|
for ( ; sit ; sit = sit->m_parent ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// are we in a facebook <name> tag? that is event title
|
|
if ( m_isFacebook && sit->m_tagId == TAG_FBNAME ) {
|
|
si->m_sentFlags |= SENT_IN_TRUMBA_TITLE;
|
|
break;
|
|
}
|
|
// for trumba and eventbrite... etc.
|
|
if ( sit->m_tagId == TAG_GBXMLTITLE ) {
|
|
si->m_sentFlags |= SENT_IN_TRUMBA_TITLE;
|
|
break;
|
|
}
|
|
// stop if hit an xml tag. we do not support nested
|
|
// tags for this title algo.
|
|
if ( sit->m_tagId ) break;
|
|
}
|
|
|
|
// . in header tag boost
|
|
// . should beat noprev/nonextbrother combo
|
|
// . fixes meetup.com
|
|
if ( si->m_flags & SEC_IN_HEADER )
|
|
si->m_sentFlags |= SENT_IN_HEADER;
|
|
|
|
// lost a title because of SENT_MIXED_TEXT
|
|
// "Tango Club of Albuquerque (Argentine Tango)". should we
|
|
// split up the sentence when it ends in a parenthetical to
|
|
// fix that? let's take this out then see it can remove good
|
|
// titles...
|
|
//if ( (si->m_flags & SEC_LINK_TEXT) &&
|
|
// (si->m_flags & SEC_PLAIN_TEXT) ) {
|
|
// si->m_sentFlags |= SENT_MIXED_TEXT;
|
|
//}
|
|
|
|
// same goes for being in a menu sentence
|
|
//if ( si->m_flags & SEC_MENU_SENTENCE ) {
|
|
// si->m_sentFlags |= SENT_MIXED_TEXT;
|
|
//}
|
|
|
|
// . now fix trumba.com which has <title> tag for each event
|
|
// . if parent section is title tag or has "title" in it
|
|
// somewhere, give us a boost
|
|
Section *ip = si->m_parent;
|
|
// if counted as header do not re-count as title too
|
|
//if ( si->m_sentFlags & SENT_IN_HEADER ) ip = NULL;
|
|
// ignore <title> tags if we are not an rss feed (trumba fix)
|
|
if ( ip && ip->m_tagId == TAG_TITLE && ! m_isRSSExt) ip = NULL;
|
|
// keep telescoping up as long as parent just contains this
|
|
// sentence, si.
|
|
for ( ; ip ; ip = ip->m_parent ) {
|
|
// parent must only contain us
|
|
if ( ip->m_firstWordPos != si->m_firstWordPos ) break;
|
|
if ( ip->m_lastWordPos != si->m_lastWordPos ) break;
|
|
// do not allow urls that could have "title" in them
|
|
if ( ip->m_tagId == TAG_A ) break;
|
|
if ( ip->m_tagId == TAG_IFRAME ) break;
|
|
if ( ip->m_tagId == TAG_FRAME ) break;
|
|
// trumba title tag? need for eventbrite too!
|
|
if ( ip->m_tagId == TAG_GBXMLTITLE ) {//&& isTrumba ) {
|
|
si->m_sentFlags |= SENT_IN_TRUMBA_TITLE;
|
|
break;
|
|
}
|
|
// get tag limits
|
|
char *ta = m_wptrs[ip->m_a];
|
|
char *tb = m_wlens[ip->m_a] + ta;
|
|
// scan for "title"
|
|
char *ss = gb_strncasestr(ta,tb-ta,"title") ;
|
|
// skip if not there
|
|
if ( ! ss ) break;
|
|
// . stop if has equal sign after
|
|
// . exempts "<div title="blah">" to fix
|
|
// reverbnation.com from getting Lyrics as a title
|
|
// for an event
|
|
if ( ss[5] == '=' ) break;
|
|
// reward
|
|
//tflags |= SENT_IN_TITLEY_TAG;
|
|
// if we are trumba, we trust these 100% so
|
|
// make sure it is the title. but if we
|
|
// have multiple candidates we still want to
|
|
// rank them amongst themselves so just give
|
|
// a huge boost. problem was was that some event
|
|
// items repeated the event date in the brother
|
|
// section below the address section, thus causing the
|
|
// true event title to get SENT_MULT_EVENTS set while
|
|
// the address place name would be selected as the
|
|
// title for one, because the address section also
|
|
// contained the event time. and the other "event"
|
|
// would use a title from its section.
|
|
//if ( isTrumba ) {
|
|
// si->m_sentFlags |= SENT_IN_TRUMBA_TITLE;
|
|
// //tscore = (tscore + 100.0) * 2000.0;
|
|
// //dscore = (dscore + 100.0) * 2000.0;
|
|
//}
|
|
//else {
|
|
si->m_sentFlags |= SENT_IN_TITLEY_TAG;
|
|
//}
|
|
// once is good enough
|
|
break;
|
|
}
|
|
|
|
/*
|
|
// get biggest
|
|
Section *biggest = NULL;
|
|
// loop over this
|
|
Section *pp = si;
|
|
int32_t ca = si->m_firstWordPos;
|
|
int32_t cb = si->m_lastWordPos;
|
|
// blow up until contain prev
|
|
for ( ; pp ; pp = pp->m_parent ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop if contains prev guy
|
|
if ( pp->m_firstWordPos != ca ) break;
|
|
if ( pp->m_lastWordPos != cb ) break;
|
|
// otherwise, set it
|
|
biggest = pp;
|
|
}
|
|
|
|
|
|
if ( biggest && ! biggest->m_prevBrother &&
|
|
// we already give a bonus for being in header tag above
|
|
! (biggest->m_flags & SEC_IN_HEADER) ) {
|
|
//!(biggest->m_flags & SEC_HEADING_CONTAINER) &&
|
|
//!(biggest->m_flags & SEC_NIXED_HEADING_CONTAINER) ) {
|
|
tscore *= 1.2;
|
|
tflags |= SENT_NO_PREV_BROTHER;
|
|
// check this if we have no prev brother only
|
|
//if ( ! biggest->m_nextBrother ||
|
|
// // or we can differ too!
|
|
// (biggest->m_nextBrother->m_tagHash !=
|
|
// biggest->m_tagHash ) ) {
|
|
// tscore *= 1.1;
|
|
// tflags |= SENT_NO_NEXT_BROTHER;
|
|
//}
|
|
}
|
|
*/
|
|
|
|
/*
|
|
Section *bro = NULL;
|
|
if ( biggest ) bro = biggest->m_nextBrother;
|
|
// discard brother if not same tag hash
|
|
if ( bro && bro->m_tagHash != biggest->m_tagHash ) bro = NULL;
|
|
// get smallest section containing first word of bro
|
|
Section *smbro = NULL;
|
|
int32_t fwp = -1;
|
|
if ( bro ) fwp = bro->m_firstWordPos;
|
|
if ( fwp >= 0) smbro = sp[fwp];
|
|
// discard brother if its in lower case issues and we are not
|
|
if ( smbro &&
|
|
!(biggest->m_sentFlags & SENT_MIXED_CASE) &&
|
|
(smbro->m_sentFlags & SENT_MIXED_CASE)) bro=NULL;
|
|
// if no next brother, and no prev brother, reward
|
|
if ( ! bro && (tflags & SENT_NO_PREV_BROTHER) ) {
|
|
tscore *= 1.1;
|
|
tflags |= SENT_NO_NEXT_BROTHER;
|
|
}
|
|
*/
|
|
|
|
int64_t ch64 = si->m_contentHash64;
|
|
// fix for sentences
|
|
if ( ch64 == 0 ) ch64 = si->m_sentenceContentHash64;
|
|
// must be there
|
|
if ( ! ch64 ) { char *xx=NULL;*xx=0; }
|
|
// combine the tag hash with the content hash #2 because
|
|
// a lot of times it is repeated in like a different tag like
|
|
// the title tag
|
|
int64_t modified = si->m_tagHash ^ ch64;
|
|
// repeat on page?
|
|
// hurts "5:30-7pm Beginning African w/ Romy" which is
|
|
// legit and repeated for different days of the week for
|
|
// texasdrums.drums.org, so ease off a bit
|
|
int32_t chtscore = cht.getScore ( &modified ) ;
|
|
if ( chtscore > 1 )
|
|
si->m_sentFlags |= SENT_PAGE_REPEAT;
|
|
/*
|
|
for ( int32_t cc = chtscore ; cc > 1 ; cc-- ) {
|
|
//if ( chtscore == cc ) tscore *= .80;//.40;
|
|
// punish a little more for every repeat so that
|
|
// "HEADLINER" will lose to "That 1 Guy" for
|
|
// reverbnation.com
|
|
//else tscore *= .99;
|
|
si->m_sentFlags |= SENT_PAGE_REPEAT;
|
|
}
|
|
*/
|
|
// advance to first word
|
|
//int32_t f = a; for ( ; ! m_wids[f] && f < b ; f++ );
|
|
int32_t f = si->m_senta;//si->m_firstWordPos;
|
|
int32_t L = si->m_sentb;//si->m_lastWordPos;
|
|
if ( f < 0 ) { char *xx=NULL;*xx=0; }
|
|
if ( L < 0 ) { char *xx=NULL;*xx=0; }
|
|
// single word?
|
|
bool single = (f == (L-1));
|
|
/*
|
|
// skip f forward until it is text we contain directly!
|
|
for ( ; f < m_nw ; f++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop if we contain directly
|
|
if ( sp[f] == si ) break;
|
|
}
|
|
*/
|
|
|
|
// slight penalty if first word is action word or another
|
|
// word not very indicative of a title, and more indicative
|
|
// of a menu element
|
|
bool badFirst = false;
|
|
bool skip = false;
|
|
if ( m_wids[f] == h_send ) badFirst = true;
|
|
if ( m_wids[f] == h_save ) badFirst = true;
|
|
if ( m_wids[f] == h_add ) badFirst = true;
|
|
if ( m_wids[f] == h_share ) badFirst = true;
|
|
if ( m_wids[f] == h_join ) badFirst = true;
|
|
// request a visit from jim hammond: booktour.com
|
|
if ( m_wids[f] == h_request ) badFirst = true;
|
|
if ( m_wids[f] == h_contact ) badFirst = true;
|
|
// "promote your event" : booktour.com
|
|
if ( m_wids[f] == h_promote ) badFirst = true;
|
|
if ( m_wids[f] == h_subscribe ) badFirst = true;
|
|
if ( m_wids[f] == h_loading ) badFirst = true;
|
|
// "last modified|updated"
|
|
if ( m_wids[f] == h_last && f+2 < m_nw &&
|
|
( m_wids[f+2] == h_modified || m_wids[f+2]==h_updated))
|
|
badFirst = true;
|
|
// special guest
|
|
if ( m_wids[f] == h_special && f+2 < m_nw &&
|
|
( m_wids[f+2] == h_guest || m_wids[f+2]==h_guests))
|
|
badFirst = true;
|
|
// directed by ... darren dunbar (santafeplayhouse.org)
|
|
if ( m_wids[f] == h_directed && f+2 < m_nw &&
|
|
m_wids[f+2] == h_by )
|
|
badFirst = true;
|
|
// map of
|
|
if ( m_wids[f] == h_map && f+2 < m_nw &&
|
|
m_wids[f+2] == h_of )
|
|
badFirst = true;
|
|
// "more|other|venue information|info"
|
|
if ( ( m_wids[f] == h_more ||
|
|
m_wids[f] == h_venue ||
|
|
m_wids[f] == h_general ||
|
|
m_wids[f] == h_event ||
|
|
m_wids[f] == h_other ) &&
|
|
f+2 < m_nw &&
|
|
( m_wids[f+2] == h_information || m_wids[f+2]==h_info))
|
|
badFirst = true;
|
|
// phone number field
|
|
if ( m_wids[f] == h_phone ) badFirst = true;
|
|
// part of address, but we do not pick it up
|
|
if ( m_wids[f] == h_usa ) badFirst = true;
|
|
if ( m_wids[f] == h_date ) badFirst = true;
|
|
if ( m_wids[f] == h_description ) badFirst = true;
|
|
// "buy tickets from $54" for events.mapchannels.com!
|
|
if ( m_wids[f] == h_buy ) badFirst = true;
|
|
if ( m_wids[f] == h_where ) badFirst = true;
|
|
if ( m_wids[f] == h_location ) badFirst = true;
|
|
if ( m_wids[f] == h_located ) badFirst = true;
|
|
if ( m_wids[f] == h_click ) badFirst = true;
|
|
if ( m_wids[f] == h_here ) badFirst = true;
|
|
// "back to band profile" myspace.com
|
|
if ( m_wids[f] == h_back &&
|
|
f+2 < m_nw && m_wids[f+2] == h_to )
|
|
badFirst = true;
|
|
// southgatehouse.com "this week"
|
|
if ( m_wids[f] == h_this &&
|
|
f+2 < m_nw && m_wids[f+2] == h_week )
|
|
skip = true;
|
|
// "this event repeats 48 times" for
|
|
// http://www.zipscene.com/events/view/2848438-2-75-u-call-
|
|
// its-with-dj-johnny-b-cincinnati was getting hammered by
|
|
// this algo
|
|
if ( m_wids[f] == h_this &&
|
|
f+2 < m_nw && m_wids[f+2] == h_event &&
|
|
f+4 < m_nw && m_wids[f+4] == h_repeats )
|
|
skip = true;
|
|
// "claim it"
|
|
if ( m_wids[f] == h_claim &&
|
|
f+2 < m_nw && m_wids[f+2] == h_it )
|
|
skip = true;
|
|
// "claim this event"
|
|
if ( m_wids[f] == h_claim &&
|
|
f+2 < m_nw && m_wids[f+2] == h_this &&
|
|
f+4 < m_nw && m_wids[f+4] == h_event )
|
|
skip = true;
|
|
// "upcoming events"
|
|
if ( m_wids[f] == h_upcoming &&
|
|
f+2 < m_nw && m_wids[f+2] == h_events )
|
|
skip = true;
|
|
// "other upcoming events..."
|
|
if ( m_wids[f] == h_other &&
|
|
f+2 < m_nw && m_wids[f+2] == h_upcoming &&
|
|
f+4 < m_nw && m_wids[f+4] == h_events )
|
|
skip = true;
|
|
// "is this your [event|venue]?"
|
|
if ( m_wids[f] == h_is &&
|
|
f+2 < m_nw && m_wids[f+2] == h_this &&
|
|
f+4 < m_nw && m_wids[f+4] == h_your )
|
|
skip = true;
|
|
// "feed readers..."
|
|
if ( m_wids[f] == h_feed &&
|
|
f+2 < m_nw && m_wids[f+2] == h_readers )
|
|
skip = true;
|
|
// "no rating..."
|
|
if ( m_wids[f] == h_no &&
|
|
f+2 < m_nw && m_wids[f+2] == h_rating )
|
|
skip = true;
|
|
// user reviews
|
|
if ( m_wids[f] == h_user &&
|
|
f+2 < m_nw && m_wids[f+2] == h_reviews )
|
|
skip = true;
|
|
// reviews & comments
|
|
if ( m_wids[f] == h_reviews &&
|
|
f+2 < m_nw && m_wids[f+2] == h_comments )
|
|
skip = true;
|
|
// skip urls
|
|
if ( (m_wids[f] == h_http || m_wids[f] == h_https) &&
|
|
f+6 < m_nw &&
|
|
m_wptrs[f+1][0]==':' &&
|
|
m_wptrs[f+1][1]=='/' &&
|
|
m_wptrs[f+1][2]=='/' ) {
|
|
skip = true;
|
|
}
|
|
|
|
// single word baddies
|
|
if ( single ) {
|
|
if ( m_wids[f] == h_new ) {
|
|
badFirst = true;
|
|
// fix abqtango.com "New" as an event title
|
|
//tscore *= .50;
|
|
}
|
|
// "featuring"
|
|
if ( m_wids[f] == h_featuring ) badFirst = true;
|
|
// "what:"
|
|
if ( m_wids[f] == h_what ) badFirst = true;
|
|
if ( m_wids[f] == h_who ) badFirst = true;
|
|
if ( m_wids[f] == h_tickets) badFirst = true;
|
|
// a price point!
|
|
if ( m_wids[f] == h_free ) badFirst = true;
|
|
// navigation
|
|
if ( m_wids[f] == h_login ) badFirst = true;
|
|
if ( m_wids[f] == h_back ) badFirst = true;
|
|
if ( m_wids[f] == h_when ) badFirst = true;
|
|
if ( m_wids[f] == h_contact ) badFirst = true;
|
|
if ( m_wids[f] == h_phone ) badFirst = true;
|
|
// stop hours from being title
|
|
if ( m_wids[f] == h_hours ) badFirst = true;
|
|
// selection menu. sort by "relevancy"
|
|
if ( m_wids[f] == h_relevancy ) badFirst = true;
|
|
// from southgatehouse.com
|
|
if ( m_wids[f] == h_tonight ) skip = true;
|
|
if ( m_wids[f] == h_today ) skip = true;
|
|
if ( m_wids[f] == h_share ) skip = true;
|
|
if ( m_wids[f] == h_join ) skip = true;
|
|
if ( m_wids[f] == h_loading ) skip = true;
|
|
if ( m_wids[f] == h_bookmark ) skip = true;
|
|
if ( m_wids[f] == h_publish ) skip = true;
|
|
if ( m_wids[f] == h_subscribe ) skip = true;
|
|
if ( m_wids[f] == h_save ) skip = true;
|
|
if ( m_wids[f] == h_creator ) skip = true;
|
|
if ( m_wids[f] == h_tags ) skip = true;
|
|
if ( m_wids[f] == h_category ) skip = true;
|
|
if ( m_wids[f] == h_price ) skip = true;
|
|
if ( m_wids[f] == h_rate ) skip = true;
|
|
if ( m_wids[f] == h_rates ) skip = true;
|
|
if ( m_wids[f] == h_users ) skip = true;
|
|
if ( m_wids[f] == h_support ) skip = true;
|
|
}
|
|
|
|
|
|
if ( badFirst )
|
|
si->m_sentFlags |= SENT_BAD_FIRST_WORD;
|
|
|
|
if ( skip )
|
|
si->m_sentFlags |= SENT_NUKE_FIRST_WORD;
|
|
|
|
}
|
|
|
|
|
|
Section *prevSec = NULL;
|
|
//////////////
|
|
//
|
|
// now if SENT_HAS_COLON was set and the next sentence is
|
|
// a phone number, then we can assume we are the name of a person
|
|
// or some field for which that phone # applies
|
|
//
|
|
//////////////
|
|
for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// now we require the sentence
|
|
if ( ! ( si->m_flags & SEC_SENTENCE ) ) continue;
|
|
// skip if not in description right now, no need to score it!
|
|
//if ( si->m_minEventId <= 0 ) continue;
|
|
// save it
|
|
Section *lastSec = prevSec;
|
|
// update it
|
|
prevSec = si;
|
|
// skip if no last yet
|
|
if ( ! lastSec ) continue;
|
|
// ok, last guy must have had a colon
|
|
if ( ! (lastSec->m_sentFlags & SENT_COLON_ENDS) ) continue;
|
|
// we must be generic, end in a period or be mixed case
|
|
// no, this hurts "Bill Holman Big Band: Composer, arranger..."
|
|
//if ( ! (si->m_sentFlags & SENT_GENERIC_WORDS) &&
|
|
// ! (si->m_sentFlags & SENT_PERIOD_ENDS) &&
|
|
// ! (si->m_sentFlags & SENT_MIXED_CASE) )
|
|
// continue;
|
|
// this is good for cabq.gov libraries page "children's room:"
|
|
// which has a phone number after it
|
|
if ( ! ( si->m_sentFlags & SENT_HAS_PHONE ) )
|
|
// are we like "location:"
|
|
//! ( si->m_sentFlags & SENT_IN_ADDRESS ) )
|
|
continue;
|
|
// ok, punish last guy in that case for having a colon
|
|
// and preceeding a generic sentence
|
|
//lastSec->m_titleScore *= .02;
|
|
lastSec->m_sentFlags |= SENT_FIELD_NAME;
|
|
}
|
|
|
|
|
|
if ( ! s_init9 ) initPlaceIndicatorTable();
|
|
|
|
|
|
if ( ! m_alnumPosValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
/////////////////////////////
|
|
//
|
|
// set SENT_OBVIOUS_PLACE
|
|
//
|
|
// - indicate sentences that are place names
|
|
// - end in "theater", "hall", "bar", etc.
|
|
//
|
|
////////////////////////////
|
|
for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// only works on sentences for now
|
|
if ( ! ( si->m_flags & SEC_SENTENCE ) ) continue;
|
|
// sentence must not be mixed case
|
|
if ( si->m_sentFlags & SENT_MIXED_CASE ) continue;
|
|
// ignore if in menu ("the lower school" - maret.org)
|
|
if ( si->m_sentFlags & SENT_IN_MENU ) continue;
|
|
if ( si->m_sentFlags & SENT_IN_MENU_HEADER ) continue;
|
|
// how many alnum words we got?
|
|
int32_t na = si->m_alnumPosB - si->m_alnumPosA ;
|
|
// skip if only one word
|
|
if ( na <= 1 ) continue;
|
|
// skip if more than 7
|
|
if ( na > 7 ) continue;
|
|
// last word in sentence is "last"
|
|
int32_t last = si->m_sentb - 1;
|
|
// back up if "last" is numeric like "Studio 3" or "Room 127"
|
|
for ( ; last > si->m_senta ; last-- ) {
|
|
// skip until alnumword
|
|
if ( ! m_wids[last] ) continue;
|
|
// stop if last is NOT starting with a number
|
|
if ( ! is_digit(m_wptrs[last][0])) break;
|
|
}
|
|
// make a phrase wid for "night club"
|
|
int64_t pid = 0LL;
|
|
for ( int32_t k = last - 1 ; k >= si->m_senta ; k-- ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( ! m_wids[k] ) continue;
|
|
pid = m_pids[k];
|
|
break;
|
|
}
|
|
// last word must be indicator: "theater" "center" "school"...
|
|
bool inTable = false;
|
|
if ( s_pit.isInTable(&m_wids[last]) ) inTable = true;
|
|
// check phrase id for "nightclub" for guysndollsllc.com
|
|
if ( pid && s_pit.isInTable ( &pid ) )
|
|
inTable = true;
|
|
if ( ! inTable ) continue;
|
|
// the word "the" or "a" cannot preceed indicator
|
|
if ( m_wids[last-2] == h_the ) continue;
|
|
// "find a store"
|
|
if ( m_wids[last-2] == h_a ) continue;
|
|
// "use of museums"
|
|
if ( m_wids[last-2] == h_of ) continue;
|
|
// first word
|
|
int32_t i = si->m_senta;
|
|
// "add/contact/use/search store"
|
|
if ( m_wids[i] == h_add ) continue;
|
|
if ( m_wids[i] == h_contact ) continue;
|
|
if ( m_wids[i] == h_use ) continue;
|
|
if ( m_wids[i] == h_search ) continue;
|
|
if ( m_wids[i] == h_find ) continue;
|
|
// school needs 3 words at least to fix
|
|
// "drama/music/cooking/german school"
|
|
if ( m_wids[last] == h_school && na <= 2 ) continue;
|
|
// "gift shop" is a subplace name (collectorsguide.com)
|
|
if ( m_wids[last] == h_shop && m_wids[last-2]==h_gift)continue;
|
|
// "open house" is subplace name (maret.org)
|
|
if ( m_wids[last]==h_house&& m_wids[last-2]==h_open)continue;
|
|
// photo/image/video/media library/gallery
|
|
if ((m_wids[last] == h_gallery||m_wids[last]==h_library) &&
|
|
(m_wids[last-2] == h_photo ||
|
|
m_wids[last-2] == h_image ||
|
|
m_wids[last-2] == h_picture ||
|
|
m_wids[last-2] == h_video ||
|
|
m_wids[last-2] == h_media ) )
|
|
continue;
|
|
// if contains "at" or "by" or blah blah, stop it
|
|
int32_t j; for ( j = si->m_senta ; j < si->m_sentb ; j++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip tags
|
|
if ( m_tids[j] ) continue;
|
|
// strange punct?
|
|
if ( ! m_wids[j] ) {
|
|
char *p = m_wptrs[j];
|
|
char *pend = p + m_wlens[j];
|
|
for ( ; p < pend ; p++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( is_wspace_a(*p) ) continue;
|
|
if ( *p == '\'' ) continue;
|
|
// St. John's College Bookstore
|
|
if ( *p == '.' ) continue;
|
|
break;
|
|
}
|
|
// bad punct? do not set the flag then
|
|
if ( p < pend ) break;
|
|
// otherwise, we are good to go
|
|
continue;
|
|
}
|
|
// stop on these
|
|
if ( m_wids[j] == h_at ) break;
|
|
// presented by ....
|
|
if ( m_wids[j] == h_by ) break;
|
|
// copyright is bad
|
|
if ( m_wids[j] == h_copyright ) break;
|
|
// "proceed to west hartford center" (directs)
|
|
if ( m_wids[j] == h_to ) break;
|
|
// "... presents Swan Lake"
|
|
if ( m_wids[j] == h_presents ) break;
|
|
// "...review of ..."
|
|
if ( m_wids[j] == h_review ) break;
|
|
// folks from erda gardens
|
|
if ( m_wids[j] == h_from ) break;
|
|
// join us in crested butte
|
|
if ( m_wids[j] == h_join ) break;
|
|
}
|
|
// if hit a bad word, do not set flag then
|
|
if ( j < si->m_sentb ) continue;
|
|
// set it
|
|
si->m_sentFlags |= SENT_OBVIOUS_PLACE;
|
|
}
|
|
|
|
if ( ! m_alnumPosValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
//
|
|
// set SENT_EVENT_ENDING
|
|
//
|
|
// . if sentence ends in "festival" etc. set this bit
|
|
// . or stuff like "workshop a" is ok since 'a' is a stop word
|
|
// . or stuff like 'sunday services from 9am to 10am'
|
|
//
|
|
// scan the sentences
|
|
for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// only works on sentences for now
|
|
if ( ! ( si->m_flags & SEC_SENTENCE ) ) continue;
|
|
// compute this
|
|
int32_t alnumCount = si->m_alnumPosB - si->m_alnumPosA ;
|
|
// set event ending/beginning
|
|
int32_t val = hasTitleWords(si->m_sentFlags,si->m_a,si->m_b,
|
|
alnumCount,m_bits,m_words,true,
|
|
m_niceness);
|
|
if ( val == 1 )
|
|
si->m_sentFlags |= SENT_HASTITLEWORDS;
|
|
if ( val == -1 )
|
|
si->m_sentFlags |= SENT_BADEVENTSTART;
|
|
}
|
|
|
|
/*
|
|
supplanted by hasTitleWords()
|
|
|
|
//////////////////////
|
|
//
|
|
// set SENT_BADEVENTSTART
|
|
//
|
|
//////////////////////
|
|
//
|
|
// . '-': if a title sentences begins with one of these words then
|
|
// set SENT_BADEVENTSTART
|
|
// . do not give bonus for SENT_EVENT_ENDING if SENT_BADEVENTSTART
|
|
// is set
|
|
// . fixes "Presented by Colorado Symphony Orchestra"
|
|
static char *s_starters [] = {
|
|
"-presented", // presented by fred, bad
|
|
//"+festival", // festival of lights, good
|
|
//"+class", // class w/ caroline & constantine
|
|
//"+lecture", // lecture on physics
|
|
//"+beginning", // beginning painting constantcontact.com
|
|
//"+intermeditate",
|
|
//"+advanced"
|
|
};
|
|
// store these words into table
|
|
static HashTableX s_sw;
|
|
static char s_swbuf[2000];
|
|
static bool s_init8 = false;
|
|
if ( ! s_init8 ) {
|
|
s_init8 = true;
|
|
s_sw.set(8,4,128,s_swbuf,2000,false,m_niceness,"swtab");
|
|
int32_t n = (int32_t)sizeof(s_starters)/ sizeof(char *);
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// set words
|
|
char *s = s_starters[i];
|
|
Words w; w.set3 ( s );
|
|
int64_t *wi = w.getWordIds();
|
|
int64_t h = 0;
|
|
// scan words
|
|
for ( int32_t j = 0 ; j < w.getNumWords(); j++ )
|
|
if ( wi[j] ) h ^= wi[j];
|
|
// . store hash of all words, value is ptr to it
|
|
// . put all exact matches into sw1 and the substring
|
|
// matches into sw2
|
|
s_sw.addKey ( &h , &s );
|
|
}
|
|
}
|
|
// . use the same event ending table to see if the title sentence
|
|
// begins with one of these "endings"
|
|
// . should fix "Presented By Colorado Symphony Orchestra" from
|
|
// being a good event title
|
|
for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// only works on sentences for now
|
|
if ( ! ( si->m_flags & SEC_SENTENCE ) ) continue;
|
|
// how many alnum words in it?
|
|
int32_t na = si->m_alnumPosB - si->m_alnumPosA ;
|
|
// need at least two words
|
|
if ( na <= 1 ) continue;
|
|
// skip if too long and not capitalized
|
|
if ( na > 7 && (si->m_sentFlags & SENT_MIXED_CASE ) ) continue;
|
|
// get FIRST word in potential event title
|
|
int32_t i = si->m_senta;
|
|
int32_t a = si->m_senta;
|
|
int32_t b = si->m_sentb;
|
|
// skip cruft though
|
|
for ( ; i < a ; i++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( ! m_wids[i] ) continue;
|
|
if ( bits[i] & D_IS_IN_DATE ) continue;
|
|
if ( bits[i] & D_IS_STOPWORD ) continue;
|
|
if ( m_wlens[i] == 1 ) continue;
|
|
break;
|
|
}
|
|
// if all cruft, ignore
|
|
if ( i == b ) continue;
|
|
// go to next section if word not in our list
|
|
if ( ! s_sw.isInTable ( &m_wids[i] ) ) continue;
|
|
// must have at least one word after that
|
|
int32_t next = i + 2;
|
|
// skip tags
|
|
for ( ; next < b && ! m_wids[next] ; next++ );
|
|
// if no more words, forget it, go to next section
|
|
if ( next >= b ) continue;
|
|
// get the string
|
|
char *str = *(char **)s_sw.getValue ( &m_wids[i] );
|
|
// check sign
|
|
//if ( str[0] == '+' )
|
|
// si->m_sentFlags |= SENT_GOODEVENTSTART;
|
|
if ( str[0] == '-' )
|
|
si->m_sentFlags |= SENT_BADEVENTSTART;
|
|
// must have a sign
|
|
else { char *xx=NULL;*xx=0; }
|
|
}
|
|
*/
|
|
|
|
|
|
///////////////////
|
|
//
|
|
// set SENT_PRETTY
|
|
//
|
|
///////////////////
|
|
|
|
for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// only works on sentences for now
|
|
if ( ! ( si->m_flags & SEC_SENTENCE ) ) continue;
|
|
setSentPrettyFlag ( si );
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// returns +1 if has positive title words/phrases
|
|
// returns -1 if has negative title words/phrases
|
|
// returns 0 otherwise
|
|
int32_t hasTitleWords ( sentflags_t sflags ,
|
|
int32_t a ,
|
|
int32_t b ,
|
|
int32_t alnumCount ,
|
|
Bits *bitsClass ,
|
|
Words *words ,
|
|
bool useAsterisk ,
|
|
int32_t niceness ) {
|
|
|
|
// need at least one alnum word
|
|
if ( alnumCount <= 0 ) return 0;
|
|
// skip if too long and not capitalized
|
|
if ( alnumCount > 7 && (sflags & SENT_MIXED_CASE ) ) return 0;
|
|
|
|
// sanity. we need s_pit to be initialized
|
|
if ( ! s_init9 ) initPlaceIndicatorTable();
|
|
|
|
|
|
int64_t *wids = words->getWordIds();
|
|
nodeid_t *tids = words->getTagIds();
|
|
char **wptrs = words->getWords();
|
|
int32_t *wlens = words->getWordLens();
|
|
int32_t nw = words->getNumWords();
|
|
|
|
// . shortcut
|
|
// . we are also called from dates.cpp and m_bits is NULL!
|
|
wbit_t *bits = NULL;
|
|
if ( bitsClass ) bits = bitsClass->m_bits;
|
|
|
|
static bool s_flag = false;
|
|
static int64_t h_annual;
|
|
static int64_t h_anniversary;
|
|
static int64_t h_next;
|
|
static int64_t h_past;
|
|
static int64_t h_future;
|
|
static int64_t h_upcoming;
|
|
static int64_t h_other;
|
|
static int64_t h_more;
|
|
static int64_t h_weekly;
|
|
static int64_t h_daily;
|
|
static int64_t h_permanent; // fix permanent exhibit for collectorsg
|
|
static int64_t h_beginning ;
|
|
static int64_t h_every ;
|
|
static int64_t h_featuring ;
|
|
static int64_t h_for ;
|
|
static int64_t h_at ;
|
|
static int64_t h_by ;
|
|
static int64_t h_on ;
|
|
static int64_t h_no ;
|
|
static int64_t h_name ;
|
|
static int64_t h_in ;
|
|
static int64_t h_sponsored;
|
|
static int64_t h_sponsered;
|
|
static int64_t h_presented;
|
|
static int64_t h_i;
|
|
static int64_t h_id;
|
|
static int64_t h_begins ;
|
|
static int64_t h_meets ;
|
|
static int64_t h_benefitting ;
|
|
static int64_t h_benefiting ;
|
|
static int64_t h_with ;
|
|
static int64_t h_starring ;
|
|
static int64_t h_experience;
|
|
static int64_t h_w ;
|
|
static int64_t h_event;
|
|
static int64_t h_band;
|
|
static int64_t h_tickets;
|
|
static int64_t h_events;
|
|
static int64_t h_jobs;
|
|
static int64_t h_this;
|
|
static int64_t h_series;
|
|
static int64_t h_total;
|
|
static int64_t h_times;
|
|
static int64_t h_purchase;
|
|
static int64_t h_look;
|
|
static int64_t h_new;
|
|
static int64_t h_us;
|
|
static int64_t h_its;
|
|
|
|
if ( ! s_flag ) {
|
|
s_flag = true;
|
|
h_annual = hash64n("annual");
|
|
h_anniversary = hash64n("anniversary");
|
|
h_next = hash64n("next");
|
|
h_past = hash64n("past");
|
|
h_future = hash64n("future");
|
|
h_upcoming = hash64n("upcoming");
|
|
h_other = hash64n("other");
|
|
h_more = hash64n("more");
|
|
h_weekly = hash64n("weekly");
|
|
h_daily = hash64n("daily");
|
|
h_permanent = hash64n("permanent");
|
|
h_beginning = hash64n("beginning");
|
|
h_every = hash64n("every");
|
|
h_featuring = hash64n("featuring");
|
|
h_for = hash64n("for");
|
|
h_at = hash64n("at");
|
|
h_by = hash64n("by");
|
|
h_on = hash64n("on");
|
|
h_no = hash64n("no");
|
|
h_name = hash64n("name");
|
|
h_in = hash64n("in");
|
|
h_sponsored = hash64n("sponsored");
|
|
h_sponsered = hash64n("sponsered");
|
|
h_presented = hash64n("presented");
|
|
h_i = hash64n("i");
|
|
h_id = hash64n("id");
|
|
h_begins = hash64n("begins");
|
|
h_meets = hash64n("meets");
|
|
h_benefitting = hash64n("benefitting");
|
|
h_benefiting = hash64n("benefiting");
|
|
h_with = hash64n("with");
|
|
h_starring = hash64n("starring");
|
|
h_experience = hash64n("experience");
|
|
h_w = hash64n("w");
|
|
h_event = hash64n("event");
|
|
h_band = hash64n("band");
|
|
h_tickets = hash64n("tickets");
|
|
h_events = hash64n("events");
|
|
h_jobs = hash64n("jobs");
|
|
h_this = hash64n("this");
|
|
h_series = hash64n("series");
|
|
h_total = hash64n("total");
|
|
h_times = hash64n("times");
|
|
h_purchase = hash64n("purchase");
|
|
h_look = hash64n("look");
|
|
h_new = hash64n("new");
|
|
h_us = hash64n("us");
|
|
h_its = hash64n("its");
|
|
}
|
|
|
|
// . if it just consists of one non-stop word, forget it!
|
|
// . fixes "This series" for denver.org
|
|
// . fixes "Practica" for abqtango.org
|
|
// . we have to have another beefy word besides just the event ending
|
|
int32_t goodCount = 0;
|
|
for ( int32_t k = a ; k < b ; k++ ) {
|
|
QUICKPOLL(niceness);
|
|
if ( ! wids[k] ) continue;
|
|
if ( bits && (bits[k] & D_IS_IN_DATE) ) continue;
|
|
if ( bits && (bits[k] & D_IS_STOPWORD) ) continue;
|
|
if ( wlens[k] == 1 ) continue;
|
|
// treat "next" as stop word "next performance" etc.
|
|
// to fix "next auction"?
|
|
if ( wids[k] == h_next ) continue;
|
|
if ( wids[k] == h_past ) continue;
|
|
if ( wids[k] == h_future ) continue;
|
|
if ( wids[k] == h_upcoming ) continue;
|
|
if ( wids[k] == h_other ) continue;
|
|
if ( wids[k] == h_more ) continue;
|
|
if ( wids[k] == h_beginning ) continue; // beginning on ...
|
|
if ( wids[k] == h_weekly ) continue; // weekly shows
|
|
if ( wids[k] == h_daily ) continue;
|
|
if ( wids[k] == h_permanent ) continue;
|
|
if ( wids[k] == h_every ) continue; // every saturday
|
|
goodCount++;
|
|
if ( goodCount >= 2 ) break;
|
|
}
|
|
// . need at least 2 non-stopwords/non-dates
|
|
// . crap, what about "Bingo" - let's reduce to 1
|
|
//if ( goodCount <= 1 ) return 0;
|
|
if ( goodCount <= 0 ) return 0;
|
|
|
|
// "9th annual...."
|
|
// "THE 9th annual..."
|
|
// 2012 annual
|
|
for ( int32_t k = a ; k < b ; k++ ) {
|
|
if ( !is_digit(wptrs[k][0])) continue;
|
|
if ( k+2>=nw ) continue;
|
|
if ( wids[k+2] == h_annual ) return true;
|
|
if ( wids[k+2] == h_anniversary ) return true;
|
|
}
|
|
|
|
//
|
|
// TODO: * with <person name>
|
|
//
|
|
|
|
// . a host list of title-y words and phrases
|
|
// . event title indicators
|
|
// . ^ means must be in beginning
|
|
// . $ means must match the end
|
|
// . * means it's generic and must have 2+ words in title
|
|
static char *s_twords[] = {
|
|
"$jazzfest",
|
|
"$photofest",
|
|
"$fest", // Fan fest
|
|
"$fanfest",
|
|
"$brewfest",
|
|
"$brewfestivus",
|
|
"$musicfest",
|
|
"$slamfest",
|
|
"$songfest",
|
|
"$ozfest",
|
|
"$winefest",
|
|
"$beerfest",
|
|
"$winterfest",
|
|
"$summerfest",
|
|
"$springfest",
|
|
"|culturefest",
|
|
"$fest", // reggae fest
|
|
"$fallfest",
|
|
"$o-rama", // string-o-rama
|
|
"$jazzshow",
|
|
"$musicshow",
|
|
"$songshow",
|
|
"$wineshow",
|
|
"$beershow",
|
|
"$wintershow",
|
|
"$summershow",
|
|
"$springshow",
|
|
"$fallshow",
|
|
"$wintershow",
|
|
"$winter fantasy",
|
|
"$recital",
|
|
"$auditions",
|
|
"$audition",
|
|
"$festival",
|
|
"$the festival", // the festival of trees is held at...
|
|
"$festivals",
|
|
"$jubilee",
|
|
"$concert",
|
|
"$concerts",
|
|
"$concerto",
|
|
"$concertos",
|
|
"$bout", // fight
|
|
"*$series", // world series, concert series
|
|
"-this series", // fix denver.org
|
|
"-television series",
|
|
"-tv series",
|
|
"$hoedown",
|
|
"$launch", // album launch
|
|
"*$3d", // puss in boots 3d
|
|
|
|
"*$beginning", // beginning painting constantcontact.com
|
|
"*$intermediate",
|
|
"*$advanced",
|
|
"*^beginning", // beginning painting constantcontact.com
|
|
"-^beginning in", // beginning in january
|
|
"-^beginning on",
|
|
"-^beginning at",
|
|
"*|intermediate", // drawing, intermediate
|
|
"*^int",
|
|
"*^advanced",
|
|
"*^adv",
|
|
"*^beginner", // beginner tango
|
|
"*$beginners", // for beginners
|
|
"*^beg", // beg. II salsa
|
|
"*^adult", // Adult III
|
|
|
|
"*$con",
|
|
"$convention",
|
|
"$comiccon",
|
|
"$jobfair",
|
|
"$peepshow",
|
|
"$ballyhoo",
|
|
|
|
"$open", // french open
|
|
"-$half open", // fix
|
|
"-$we're open", // fix
|
|
"-$is open", // fix
|
|
"-$be open", // fix
|
|
"-$percent open",
|
|
"-$now open", // fix
|
|
"-$shop open", // fix
|
|
"-$office open",
|
|
"-$re open",
|
|
"-$building open", // TODO: use all place indicators!
|
|
"-$desk open", // help desk open
|
|
"open mic", // anywhere in the title
|
|
|
|
// . should fix "Presented By Colorado Symphony Orchestra" from
|
|
// being a good event title
|
|
"-^presented",
|
|
|
|
"*$opening", // grand opening, art opening
|
|
"$spectacular", // liberty belle spectacular
|
|
"$classic", // 2nd annual cigar club classic
|
|
"$carnival",
|
|
"$circus",
|
|
"$rodeo",
|
|
"$ride", // bike ride
|
|
"$rides", // char lift rides
|
|
"train ride",
|
|
"train rides",
|
|
|
|
"$summit", // tech summit
|
|
"$lecture",
|
|
"*$overview", // BeachBound+Hounds+2012+Overview
|
|
"$talk", // Curator's+Gallery+Talk
|
|
"$discussion", // panel discussion
|
|
"^panel discussion",
|
|
"|public hearing",
|
|
"$webinar",
|
|
"$teleseminar",
|
|
"$seminar",
|
|
"$seminars", // S.+Burlington+Saturday+Seminars
|
|
"$soiree", // single's day soiree
|
|
"$extravaganza",
|
|
"|reception",
|
|
"-|reception desk",
|
|
"$tribute",
|
|
"$parade",
|
|
"^2011 parade", // 2011 parade of homes
|
|
"^2012 parade",
|
|
"^2013 parade",
|
|
"^2014 parade",
|
|
"^2015 parade",
|
|
"$fireworks",
|
|
//"$car club", // social car club. "club" itself is too generic
|
|
//"$book club",
|
|
//"$yacht club",
|
|
//"$glee club",
|
|
//"$knitting club",
|
|
//"mountaineering club",
|
|
//"debate club",
|
|
//"chess club",
|
|
"*$club",
|
|
"club of", // rotary club of kirkland
|
|
"$forum", // The+bisexual+forum+meets+on+the+second+Tuesday
|
|
"$meet", // swap meet
|
|
"$swap", // solstice seed swap
|
|
"$board", // board meeting
|
|
"^board of", // board of selectmen
|
|
//"-board meeting", // i'm tired of these!!
|
|
//"-council meeting",
|
|
"*$meeting",
|
|
"*$mtg",
|
|
"*$mtng",
|
|
"*$meetings", // schedule of meetings
|
|
"*$meet", // stonegate speech meet/ track meet
|
|
"-no meeting",
|
|
"-no meetings",
|
|
"-no game",
|
|
"-no games",
|
|
"*$mtg", // State-Wide+Educational+Mtg
|
|
"$meetup",
|
|
"$meetups",
|
|
"$meet ups",
|
|
"$meet up",
|
|
"*meets on" , // grade 8 meets on thursday
|
|
"^meet the", // meet the doula
|
|
"$committee",
|
|
"$council", // parish council
|
|
"$hearing", // public hearing
|
|
"$band", // blues band
|
|
"$quartet",
|
|
"$trio",
|
|
"$networking event",
|
|
"$social", // polar express social
|
|
"tour of", // tour of soulard / The+Chocolate+Tour+of+New+York
|
|
"|tour",// tour boulevard brewery
|
|
"|tours", // tours leave the main bldg...
|
|
"$cruise",
|
|
"$cruises",
|
|
"cruise aboard", // dog paddle motor cruise abord the tupelo
|
|
"motor cruise",
|
|
"$safari", // fishing safari
|
|
"*$trip", // photography trip
|
|
"*$trips", // photography trip
|
|
"$slam", // poetry slam
|
|
"$readings", // poetry readings
|
|
"$expedition",
|
|
"$expeditions",
|
|
"orchestra", // middle school orchestra in the chapel
|
|
"$ensemble",
|
|
"$ensembles",
|
|
"$philharmonic",
|
|
"$chorale",
|
|
"$choir",
|
|
"$chorus",
|
|
"$prelude", // music prelude
|
|
|
|
"-$website", // welcome to the blah website
|
|
"-$blog",
|
|
"-$homepage",
|
|
|
|
"*$group", // pet loss group
|
|
"*$groups", // 12 marin art groups
|
|
"$sig", // special interest group
|
|
"-$your group", // promote your group
|
|
"-$your groups", // promote your group
|
|
"-sub group", // IPv6+Addressing+Sub-group
|
|
"-$a group", // not proper noun really
|
|
"-$the group",
|
|
"-$the groups",
|
|
"-$by group",
|
|
"-$by groups",
|
|
"-$or group",
|
|
"-$or groups",
|
|
"-$for group",
|
|
"-$for groups",
|
|
"-$sub groups",
|
|
"-$participating group",
|
|
"-$participating groups",
|
|
"-$large group",
|
|
"-$large groups",
|
|
"-$small group",
|
|
"-$small groups",
|
|
"-$age group",
|
|
"-$age groups",
|
|
"-$profit group",
|
|
"-$profit groups",
|
|
"-$dental group",
|
|
"-$dental groups",
|
|
"-$eyecare group",
|
|
"-$eyecare groups",
|
|
"-$medical group",
|
|
"-$medical groups",
|
|
"-$private group",
|
|
"-$private groups",
|
|
"-$media group", // locally owned by xxx media group
|
|
"conversation group",
|
|
"book group",
|
|
"reading group",
|
|
"support group",
|
|
"support groups",
|
|
"discussion group",
|
|
"discussion groups",
|
|
"$playgroup",
|
|
"$workgroup",
|
|
"$intergruopgroup",
|
|
|
|
"|orientation", // Orientation+to+Floortime
|
|
// no! "$all day" and "$minimum day" are too generic for
|
|
// mvusd.us
|
|
"$day", // heritage day. kite day
|
|
"$day out", // girl's day out
|
|
"$play day",
|
|
"*day of", // day of action
|
|
"*hour of", // hour of power
|
|
"$day 2012", // local history day 2012
|
|
"*$all day",
|
|
"*$every day", // happy hour every day vs. "every day"
|
|
"*$this day",
|
|
"-$per day",
|
|
"$caucus",
|
|
"$caucuses",
|
|
|
|
"*$days", // liberty lake days
|
|
"$expo",
|
|
"$exposition",
|
|
"*$session",// sautrday evening jazz session
|
|
"*$sessions",
|
|
//"-$current exhibition", // not a good title?
|
|
"-$current session", // fixes
|
|
"-$current sessions",
|
|
"-$event session",
|
|
"-$event sessions",
|
|
//"-$schedule",
|
|
"-$calendar",
|
|
"-$calendars",
|
|
"$revue",
|
|
"*$lesson",
|
|
"*$lessons",
|
|
"rehersal",
|
|
"rehearsal", // misspelling
|
|
"$audition",
|
|
"$auditions",
|
|
"*$practice",
|
|
"*$practices", // Countryside+Christian+Basketball+Practices
|
|
"*$practica", // common for dance
|
|
"^guided", // guided astral travel
|
|
"*|training", // training for leaders
|
|
"*$exercise", // Sit+&+Fit+Chair+Exercise+for+Seniors
|
|
"$performance",
|
|
"$performances",
|
|
"*$dinner",
|
|
"*$lunch",
|
|
"*$luncheon",
|
|
"*$brunch",
|
|
"$bbq",
|
|
"$barbeque",
|
|
"|auction", // auction begins at
|
|
"|auctions",
|
|
"$run", // fun run
|
|
"$trek", // turkey trek
|
|
"$trot", // turk trot
|
|
"$walk", // bird walk, tunnel walk
|
|
"$walks", // ghost walks
|
|
"$ramble", // plymouth rock ramble
|
|
"$crawl", // spring crawl, pub crawl
|
|
"$ramble", // turkey ramble
|
|
"$ceremony", // tree lighting ceremony
|
|
"$ceremoney", // misspelling
|
|
// "ceremonies" itself is too generic. gets
|
|
// "ballroom ceremonies" as part of a sales rental pitch
|
|
"$opening ceremonies", // opening ceremonies
|
|
"art opening", // Art+Opening+-+Jeff+Hagen+Watercolorist
|
|
"-certificate", // Coaching+Certificate
|
|
"-$supplies", // yoga supplies
|
|
"$awards", // 2011 Vision Awards
|
|
"$banquet",
|
|
"$banquets",
|
|
"*$ball",
|
|
"-$county", // always bad to have a county name in the title
|
|
"-$counties",
|
|
"scavenger hunt",
|
|
"$celebration", // a christmas celebration
|
|
"celebration of", // celebration of giving
|
|
"celebrates", // +Band+Celebrates+a+Season+of+Red,...
|
|
"celebrate", // Celebrate Recovery
|
|
"$showdown", // sunday showdown
|
|
"|yoga", // astha yoga
|
|
"|meditation", // simply sitting meditation, meditation 4a calm
|
|
"^family", // family climb, family drum time
|
|
"$taiko", // japanese drumming
|
|
"|karaoke", // karaoke with jimmy z
|
|
"$party", // best of the city party
|
|
"-$to party", // 18+ to party
|
|
"|symposium", // event where multiple speaches made
|
|
"|symposiums", // friday night symposiums
|
|
"|composium",
|
|
"|composiums",
|
|
"|colloquium",
|
|
"|colloquiums",
|
|
"afterparty",
|
|
"|blowout",
|
|
"$potluck",
|
|
"$pot luck",
|
|
"$bonanza",
|
|
"$night out", // girls' night out
|
|
"$showcase", // bridal showcase
|
|
"$show case", // bridal showcase
|
|
"$sideshow", // circus sideshow
|
|
"$hockey",
|
|
"|bad minton",
|
|
"|badminton",
|
|
"ping pong",
|
|
"pingpong",
|
|
"$pickleball",
|
|
"$swim", // open swim, lap swim, women only swim
|
|
"|swimming",
|
|
"|skiing",
|
|
"|carving", // wood carving
|
|
"|tai chi",
|
|
"|balance chi",
|
|
"|taichi",
|
|
"|karate",
|
|
"|judo", // kids judo
|
|
"|wrestling",
|
|
"|jujitsu",
|
|
"|ju jitsu",
|
|
"|walking",
|
|
"|pilates",
|
|
"|aerobics",
|
|
"|jazzercise",
|
|
"|birding",
|
|
"|kickboxing",
|
|
"|kick boxing",
|
|
"|billiards",
|
|
"$table tennis",
|
|
"$basketball",
|
|
"$fishing",
|
|
"^fishing in", // fishing in the midwest
|
|
"^cardiohoop",
|
|
"$crossfit",
|
|
"$zumba",
|
|
"$scouts", // cub/boy/girl scouts
|
|
"$webelos",
|
|
"$baseball",
|
|
"$softball",
|
|
"$football",
|
|
"$foosball",
|
|
"$soccer",
|
|
"$bb",
|
|
"$volleyball",
|
|
"$vb",
|
|
"$painting", // pastel painting
|
|
"$sculpting", // body sculpting
|
|
"^body sculpting",
|
|
"^chess", // chess for kids
|
|
"$campus visit",
|
|
"$tennis",
|
|
"*$sale", // jewelry sale: 10% off
|
|
"book sale",
|
|
"book clearance",
|
|
"$bash",
|
|
"$pow wow",
|
|
"$powwow",
|
|
"$camp", // iphone boot camp / space camp
|
|
"*bootcamp", // for one word occurrences
|
|
"*$tournament",
|
|
"*$tournaments", // daily poker tournaments
|
|
"$tourney",
|
|
"$tourneys",
|
|
"$competition",
|
|
"$contest",
|
|
"$cook off",
|
|
"$bake off",
|
|
"$kick off",
|
|
"$fair",
|
|
"$jam", // monday night jam
|
|
"$jamboree",
|
|
"$exhibition",
|
|
"$exhibit",
|
|
"^exhibition of",
|
|
"^exhibit of",
|
|
"^evening of", // evening of yoga & wine
|
|
"group exhibition",
|
|
"group exhibit",
|
|
"$therapy",
|
|
"$support", // Coshocton+County+Peer+Support
|
|
//"^exhibition", // exhibition: .... no. "exhibit hall"
|
|
// Graffiti Group Exhibition @ Chabot College Gallery
|
|
"exhibition",
|
|
"exhibitions",
|
|
"exhibit",
|
|
"exhibits",
|
|
"$retrospective",
|
|
"food drive",
|
|
"coat drive",
|
|
"toy drive",
|
|
"blood drive",
|
|
"recruitment drive",
|
|
"waste drive",
|
|
"donor drive",
|
|
"$christmas",
|
|
"$winterland",
|
|
"$wonderland",
|
|
"$christmasland",
|
|
"$carol",
|
|
"$carolling",
|
|
"$caroling",
|
|
"$caroles", // 3T Potluck & Caroles
|
|
"$carols",
|
|
"*$demo", // Spinning demo by margarete... villr.com
|
|
"*$demonstration",
|
|
"$debate",
|
|
"$racing",
|
|
"$race",
|
|
"|5k", // 5k run/walk ... otter run 5k
|
|
"|8k", // 5k run/walk
|
|
"|10k", // 5k run/walk
|
|
"$triathalon",
|
|
"$triathlon",
|
|
"$biathalon",
|
|
"$biathlon",
|
|
"$duathalon",
|
|
"$duathlon",
|
|
"$marathon",
|
|
"$thon", // hack-a-thon
|
|
"$athon",
|
|
"$runwalk", // run/walk
|
|
"*$relay", // Women's+Only+1/2+marathon+and+2+person+relay
|
|
"*$hunt", // egg hunt, scavenger hunt
|
|
|
|
// no, gets "write a review"
|
|
//"$review", // Bone+Densitometry+Comprehensive+Exam+Review
|
|
|
|
"|bingo",
|
|
"|poker",
|
|
"|billiards",
|
|
"|bunco",
|
|
"|crochet",
|
|
"|pinochle",
|
|
"|dominoes",
|
|
"|dominos",
|
|
"*|domino",
|
|
"*$game", // basketball game
|
|
"$game day", // senior game day
|
|
"^adoption day", // Adoption+Day,+Utopia+for+Pets
|
|
"*$gaming", // Teen+Free+Play+Gaming+at+the+Library
|
|
"*$program", // kids program, reading program
|
|
"school play",
|
|
"$experience", // the pink floyd experience
|
|
|
|
"*$programs", // children's programs
|
|
"-^register", // register for spring programs
|
|
"$101", // barnyard animals 101
|
|
"*$techniques", // Core+Training+++Stretch+Techniques
|
|
"*$technique", // course 503 raindrop technique
|
|
"*$basics", // wilton decorating basics
|
|
"^basics of",
|
|
"^the basics",
|
|
"*^basic" , // basic first aid (gets basic wire bracelet too!)
|
|
"$first aid",
|
|
"^fundamentals of",
|
|
"$fundamentals", // carving fundamentals
|
|
"^principles of",
|
|
"$principles",
|
|
"^intersections of",
|
|
"$gala", // the 17th annual nyc gala
|
|
"*$anonymous", // narcotics anonymous
|
|
"$substance abuse", // women's substance abuse
|
|
"$weight watchers",
|
|
"$mass", // vietnamese mass (watch out for massachussettes)
|
|
"midnight mass",
|
|
"$masses",
|
|
"|communion",
|
|
"|keynote",
|
|
"^opening keynote",
|
|
"spelling bee",
|
|
"on ice", // disney on ice
|
|
"for charity", // shopping for charity
|
|
"a charity", // shopping for a charity
|
|
"storytime", // children's storytime
|
|
"storytimes",
|
|
"$story", // the hershey story
|
|
"commencement", // sping commencement: undergraduate
|
|
"$walk to", // walk to end alzheimer's
|
|
"$walk 2", // walk to ...
|
|
"$walk for", // walk for ...
|
|
"$walk 4", // walk for ...
|
|
"$walk of", // walk of hope
|
|
"$encounters", // sea lion encounters
|
|
"$encounter", // sea lion encounter
|
|
"-visitor information", // Visitor+Information+@+Fort+Worth+Zoo
|
|
"*$breakfast", // 14th+Annual+Passaic+Breakfast
|
|
"presentation", // Annual+Banquet+&+Awards+Presentation
|
|
"presentations",
|
|
"-available soon",//Presentation+details+will+be+available+soon
|
|
"bike classic",
|
|
"$havdalah", // Children's+Donut+Making+Class+and+Havdalah
|
|
"|shabbat",
|
|
"|minyan", // what is this?
|
|
"|minyans", // what is this?
|
|
"fellowship",
|
|
"$benefit", // The+Play+Group+Theatre+6th+Annual+Benefit
|
|
"children's church",
|
|
"sunday school",
|
|
"*$event", // dog adoption event, networking event
|
|
"-$events", // Ongoing Tango Club of Albuquerque * events at...
|
|
"-private event",
|
|
"-view event",
|
|
"-view events",
|
|
"gathering",
|
|
"gatherings", // all-church gatherings
|
|
"$mixer", // dallas networking mixer
|
|
|
|
// put this into the isStoreHours() function in Dates.cpp
|
|
//"-is open", // Our+Website+is+Open+for+Shopping+24/7
|
|
//"-are open", // Our+Website+is+Open+for+Shopping+24/7
|
|
//"-store hours", // Winter+Store+Hours+@+13+Creek+St.
|
|
//"-shopping hours", // extended shopping hours
|
|
//"-shop hours",
|
|
//"-deadline",
|
|
//"-deadlines",
|
|
|
|
"-news", // urban planning news
|
|
"-posted", // posted in marketing at 8pm
|
|
"-driving directions",
|
|
|
|
// popular titles
|
|
"side story", // west-side story
|
|
"westside story", // west-side story
|
|
"doctor dolittle",
|
|
"nutcracker",
|
|
"mary poppins",
|
|
"harlem globetrotters",
|
|
"no chaser", // straight, no chaser
|
|
"snow white",
|
|
"charlie brown",
|
|
"pumpkin patch",
|
|
"marie osmond",
|
|
"hairspray",
|
|
"defending the", // defending the caveman
|
|
"lion king",
|
|
"ugly duckling",
|
|
"santa claus", // santa claus is coming to town
|
|
"stomp",
|
|
"chorus line",
|
|
"^cirque", // cirque do soleil
|
|
"red hot", // red hot chili peppers
|
|
"street live", // sesame street live
|
|
"the beast", // beauty and the beast
|
|
"lady gaga",
|
|
"led zeppelin",
|
|
"tom petty",
|
|
"adam ant",
|
|
"kid rock",
|
|
"|annie", // little orphan annie play
|
|
"swan lake",
|
|
|
|
// popular event names?
|
|
"crafty kids",
|
|
"sit knit", // sit & knit
|
|
|
|
// popular headliners
|
|
//"larry king",
|
|
|
|
// TODO: support "*ing club"(mountaineering club)(nursing club)
|
|
// TODO: blah blah 2012 is good!!
|
|
|
|
// gerunds (| means either $ or ^)
|
|
"*^learning",
|
|
"|bowling",
|
|
"*$bowl", // orange bowl, super bowl
|
|
"|singing", // Children's+Choirs+Singing
|
|
"|sing along", // Messiah+Sing-Along
|
|
"|singalong",
|
|
"^sing", // community sing
|
|
"$singers", // Lakeside+Singers+at+NCC
|
|
"|soapmaking", // Girls+Spa+Day:+Lip+balm,Perfume&Soapmaking:
|
|
"|scrapbooking", // What's+new+in+Scrapbooking
|
|
"|exhibiting", // Exhibiting+at+The+Center+for+Photography
|
|
"|healing", // Service+of+Healing
|
|
"^service of", // Service+of+Remembrance,+Healing+and+Hope
|
|
"^a healing", // a healing guide to renewing...
|
|
"^the healing",
|
|
"star gazing",
|
|
"stargazing",
|
|
"|meditating", // Meditating+With+The+Body
|
|
"*$showing",
|
|
"*$shooting", // Trap+shooting
|
|
"*$skills", // Resume+and+Interviewing+Skills
|
|
|
|
"|networking",
|
|
"|making", // making money
|
|
// no, was getting "serving wayne county since 1904"
|
|
// and a bunch of others like that
|
|
//"*^serving", // serving the children of the world
|
|
"-serving dinner",
|
|
"-serving breakfast",
|
|
"-serving lunch",
|
|
"-serving brunch",
|
|
"|diving",
|
|
"^hiking",
|
|
"$hike",
|
|
"*^varsity", // Varsity+Swim+&+Dive+-+ISL+Diving+@+Stone+Ridge
|
|
"*^junior varsity",
|
|
"*^jv",
|
|
"|judging", // plant judging
|
|
"rock climbing",
|
|
"|kayaking",
|
|
"|bellydancing",
|
|
"|bellydance",
|
|
"|belly dancing",
|
|
"|belly dance",
|
|
"|square dancing",
|
|
"|square dance",
|
|
"|swing dancing",
|
|
"|swing dance",
|
|
"swing night",
|
|
"$speaking", // Pastor+Mike+speaking+-+December+18,+2011
|
|
"|canoeing",
|
|
"|wrestling",
|
|
"|knitting",
|
|
"|needlework",
|
|
"|crocheting",
|
|
"$voting", // early voting
|
|
"|printmaking",
|
|
"|making", // paper bead making
|
|
"|writing", // ENG+2201:+Intermediate+College+Writing
|
|
"|sharing", // Wisdom+of+the+Sangha:+Sharing,+Reflection...
|
|
"|decorating", // wilton decorating basics
|
|
"|reading", // Tess+Gallagher+reading+at+Village+Books
|
|
"|readings", // Tess+Gallagher+reading+at+Village+Books
|
|
"-currently reading", // Currently Reading:
|
|
"|poetry",
|
|
"$and friends",
|
|
"*^celebrating", // Celebrating+Sankrant+2012
|
|
"*^interpreting", // intepreting for the deaf
|
|
"*^researching", // "Researching+and+Reflecting:+Sounds,+Sights
|
|
"*^reflections", // Stories:+Reflections+of+a+Civil+War+Historia
|
|
"*^reflecting",
|
|
"*^enhancing",
|
|
"*^mechanisms", // Mechanisms+Involved+in+Generating...
|
|
"*^finding", // finding+time+to...
|
|
"*^transforming", // Transforming+Transportation+2012
|
|
"*^reinventing",
|
|
"*^making", // Making+Good+on+the+Promise:+Effective+Board+...
|
|
"*^creating", // Creating+Cofident+Caregivers
|
|
"*^giving", // Giving+in+ALLAH's+Name
|
|
"*^getting", // Getting+Out+of+Debt, Getting+to+Know...
|
|
"-getting here", // directions! bad!
|
|
"-getting there",// directions! bad!
|
|
"*^turning", // Turning+Your+Strategic+Plan+into+an+Action+Plan
|
|
"*^engaging", // Engaging+Volunteers+with+Disabilties
|
|
"*^governing",
|
|
"*^managing", // Managing+Your+Citations+with+RefWorks
|
|
"*^entering", // entering the gates of judaism
|
|
"*^stregthening", // ...+Baptist+Church:+Strengthening+the+Core
|
|
"treeplanting",
|
|
"-managing director",
|
|
"-managing partner",
|
|
"^issues in",
|
|
"^the issues",
|
|
"*^countdown", // countdown to new year's eve sweepstakes
|
|
"*^navigating", // Navigating+the+Current+Volatility+
|
|
// defensive driving
|
|
"*|driving", // Driving+Innovation+&+Entrepreneurship
|
|
"*^using", // Using+OverDrive
|
|
"*^letting", // Letting+Poetry+Express+Your+Grief
|
|
"*^feeding", // Feeding+Children+Everywhere
|
|
"*^feeling", // Feeling+Out+of+Balance?
|
|
"*^educating", // "Educating+for+Eternity"
|
|
"*^demystifying", // like understanding (demystifying weightloss
|
|
"*^discovering",
|
|
"*^equipping", // Equipping+Ministry+-+Gospel+and+Culture
|
|
//"-^no", // no mass, no ... HURTS "no name club" too bad!!
|
|
"-$break", // fall break, winter break...
|
|
"-not be", // there will not be a dance in december
|
|
"-pollution advisory",
|
|
"-|closed", // holiday hours - museum closed, closed for xmas
|
|
"-is closing",
|
|
"-is closed",
|
|
"-be closing",
|
|
"-be closed",
|
|
"-are closing",
|
|
"-are closed",
|
|
"-$vacation", // vacation
|
|
"-cancelled",
|
|
"-canceled",
|
|
"-^calendar", // calendar of events
|
|
"-^view", // view upcoming events inlyons
|
|
"-^suggest", // suggest a new event
|
|
"-^find us", // find us on facebook...
|
|
"-^hosted by", // Hosted by Cross Country Education (CCE)
|
|
"*^comment", // Comment+in+George+Test+Group+group
|
|
"-^purchase", // Purchase Products from this Seminar
|
|
"-in recess", // class in recess
|
|
"*^playing", // Playing+For+Change
|
|
"*$drawing", // portrait drawing
|
|
"*^integrating",
|
|
"*^greening",
|
|
"*^dispelling",
|
|
"*^growing", // Growing+Up+Adopted
|
|
"*^looking",
|
|
"*^communicating",
|
|
"*^leasing",
|
|
"*^assessing",
|
|
"^quit smoking",
|
|
"*^exploring",
|
|
"history of", // "Headgear:+The+Natural+History+of+Horns+and+
|
|
"*^are your", // Are+Your+Hormones+Making+You+Sick?
|
|
"*^are you",
|
|
"*^when will", // When+Will+the+Outdoor+Warning+Sirens+Sound?
|
|
"*^concerned about", // Concerned+About+Outliving+Your+Income?
|
|
"*discover your", // discover your inner feelings
|
|
"*creating your", // creating your personal brand
|
|
"walking tour",
|
|
"|strategies", // speech topics
|
|
"*|coaching", // strategies for coaching
|
|
"*|watching", // watching paint dry
|
|
"ice skating",
|
|
"free screening",
|
|
"film screening",
|
|
"^screening of",
|
|
"^the screening",
|
|
"movie screening",
|
|
"$trilogy", // back to the future trilogy
|
|
"dj spinning",
|
|
"$screening", // Free+Memory+Screening
|
|
"$screenings", // Salt+Ghost+DVD+screenings+at+9+p.m
|
|
"$tastings", // Chocolate+Tastings+at+Oliver+Kita+Chocolates
|
|
"wine bar",
|
|
"open bar",
|
|
"open gym",
|
|
"open swim",
|
|
"*byob", // always a good indicator
|
|
"tea tastings",
|
|
"*coming to", // santa claus is coming to town, blah blah...
|
|
|
|
"^improv", // improv and standup
|
|
"^standup",
|
|
|
|
"*$circle", // past life healing circle
|
|
"^circle of", // circle of knitting (not "circle 8's" sq dnc)
|
|
"$invitational",
|
|
"$invitationals",
|
|
"-$the market", // new on the market (house sale)
|
|
"*$market", // villr.com Los Ranchos Growers' Market
|
|
"*$markets",// villr.com fix for "Markets ((subsent))"
|
|
"$bazaar", // holiday bazaar
|
|
"$bazzaar", // holiday bazaar
|
|
"$sailing", // boat sailing
|
|
"*$sail", // free public sail
|
|
"candle lighting",
|
|
"menorah lighting",
|
|
"*$lights", // river of lights, holiday lights
|
|
"*$lighting",
|
|
"tree lighting",
|
|
"tree trimming",
|
|
"book signing",
|
|
"booksigning",
|
|
"bookfair",
|
|
"ribbon cutting",
|
|
"*special guest", // +&+Hits+with+special+guest+Billy+Dean
|
|
// got Myspace.com:Music instead of the better title
|
|
// for music.myspace.com
|
|
//"$music", // live music
|
|
"^music at", // music at the mission
|
|
"$of music",
|
|
//"$of jazz", // evening of jazz, jazz music
|
|
"|jazz", // children's jazz
|
|
"$feast", // a bountiful burlesque feast
|
|
//"$live", // junkyard jane live - but gets a place name "Kobo live".. so bad
|
|
"*$spree", // let it snow holiday shopping spree
|
|
|
|
"|public skating",
|
|
"^public skate",
|
|
|
|
// no, this caused 'state emission inspection' to come up
|
|
//"$inspection", // building life safety inspections
|
|
//"$inspections",
|
|
|
|
"*^occupy", // occupy portland
|
|
"$fundraiser",
|
|
"^fundraising", // fundraising performance for ...
|
|
"$raffle",
|
|
"$giveaway", // anniversary quilt giveaway
|
|
"$townhall",
|
|
"town hall",
|
|
"open house",
|
|
"open houses", // then a list of times...
|
|
"pumpkin patch",
|
|
"happy hour",
|
|
"cook off",
|
|
"bake off",
|
|
"story time",
|
|
"story telling",
|
|
"storytelling",
|
|
"story hour",
|
|
"speed dating",
|
|
|
|
// this can be a place name too easily: the empty bottle
|
|
// The Florida Rowing Center
|
|
//"^the", // IS THIS GOOD???? not for "
|
|
|
|
"$worship",
|
|
"$rosary",
|
|
"bible study",
|
|
"bible studies",
|
|
"torah study",
|
|
"torah studies",
|
|
"$prayer", // bible study and prayer
|
|
"^pray", // pray in the new year
|
|
"$eve service", // christmas/thanksgiving eve service
|
|
"$penance service",
|
|
"$penance services",
|
|
"$candlelight service",
|
|
"$candlelight", // carols & candlelight
|
|
"eve worship",
|
|
"eucharist",
|
|
"worship service",
|
|
"worship services",
|
|
"morning service",
|
|
"morning services",
|
|
"night service",
|
|
"night services",
|
|
"evening service",
|
|
"evening services",
|
|
"sunday services", // church stuff
|
|
"monday services", // church stuff
|
|
"tuesday services", // church stuff
|
|
"wednesday services", // church stuff
|
|
"thursday services", // church stuff
|
|
"friday services", // church stuff
|
|
"saturday services", // church stuff
|
|
"worship services", // church stuff
|
|
"church services", // church stuff
|
|
"sunday service", // church stuff
|
|
"monday service", // church stuff
|
|
"tuesday service", // church stuff
|
|
"wednesday service", // church stuff
|
|
"thursday service", // church stuff
|
|
"friday service", // church stuff
|
|
"saturday service", // church stuff
|
|
"day service", // memorial day service, etc.
|
|
"candleight service",
|
|
"prayer service",
|
|
"traditional service",
|
|
"traditions service",
|
|
"blended service",
|
|
"shabbat service",
|
|
"shabbat services",
|
|
"contemporary service",
|
|
"lenten service",
|
|
"celebration service",
|
|
"worship service",
|
|
"eucharist service",
|
|
"service times",
|
|
"service time",
|
|
"sunday mass",
|
|
"monday mass",
|
|
"tuesday mass",
|
|
"wednesday mass",
|
|
"thursday mass",
|
|
"friday mass",
|
|
"saturday mass",
|
|
|
|
"$taco tuesdays",
|
|
|
|
"$tasting", // wine tasting
|
|
"$tastings", // wine tastings
|
|
"*$conference", // Parent/Teacher+Conference+
|
|
"*$conferences",
|
|
"*$retreat",
|
|
"$adventure", // big bird's adventure
|
|
"^the adventures", // the adventures of...
|
|
"$workshop", // "$workshop a" for aliconferences.com
|
|
"$workshops", // budget workshops
|
|
"$worksession", // city council worksession
|
|
"^rally", // rall to support the EPA's mercury health standards
|
|
"$rally", // pep rally
|
|
"$fair",
|
|
"$fairs",
|
|
"*$night", //biker night for guysndollsllc.com,family fun night
|
|
// dance club nights - hurts ritmo caliente because we lose
|
|
// that title and get "$main room, dance nights, 21+"
|
|
//"$nights",
|
|
"*^dj", // dj frosty
|
|
"$music nights",
|
|
"$championships",
|
|
"$championship",
|
|
"$challenge",
|
|
"$picnic",
|
|
"$dance",
|
|
"$dancing", // irish traditional dancing
|
|
"$dancers", // irish ceili dancers
|
|
"$freestyle", // type of dance
|
|
"^freestyle", // type of dance
|
|
"^cardio",
|
|
"*$fitness", // pre-natal fitness
|
|
"*$workout", // football workout
|
|
"-school of", // nix school of ballet
|
|
"$tango", // tango for beginners
|
|
"$ballet",
|
|
"$preballet",
|
|
"|linedancing",
|
|
"$modern",
|
|
"$waltz",
|
|
"$polka",
|
|
"$musical", // menopause the musical
|
|
"$swing",
|
|
"$milonga", // type of dance
|
|
"$bachata",
|
|
"|salsa", // Salsa in New Mexico
|
|
"$tap",
|
|
"$pagent",
|
|
"$pageant", // christmas pageant
|
|
"*$tutoring",
|
|
"*$tutorial",
|
|
"*$tutorials",
|
|
"*$instruction",
|
|
"*$education", // childbirth education
|
|
"-no class", // holday break - no classes
|
|
"-no classes",// holday break - no classes
|
|
"-unavailable",
|
|
"-last class",
|
|
"-no school",
|
|
"*$class",
|
|
"*$classes",
|
|
"*$teleclass",
|
|
"*$certification",
|
|
"*$class week", // massage class week 1
|
|
"*$class level", // massage class level 1
|
|
"*$mixed level", // Hatha+Basics+Mixed+Level
|
|
"*$mixed levels",
|
|
"*$part 1", // Recent+Acquisitions,+Part+I:+Contemporary+Photos
|
|
"*$part 2",
|
|
"*$part 3",
|
|
"*$part 4",
|
|
"*$part i",
|
|
"*$part ii",
|
|
"*$part iii",
|
|
"*$part iv",
|
|
"*$class session", // massage class session 1
|
|
"*$course",
|
|
"*$courses",
|
|
"-$golf course",
|
|
"-$golf courses",
|
|
"*$lessons", // Free Latin Dancing Lessons
|
|
"*$lesson",
|
|
"-$no shows", // we bill for no shows
|
|
"*$show", // steve chuke show
|
|
"*$shows",
|
|
"-no show",
|
|
"-no shows",
|
|
"-past shows",
|
|
"-future shows",
|
|
|
|
// stuff at start
|
|
"*^annual",
|
|
"|anniversary", // anniversary dinner, 2 year anniversary
|
|
"festival of", // Masonicare+Festival+of+Trees
|
|
"^learn to",
|
|
"*^understanding", // Understanding Heart Valves (lecture)
|
|
"*introduction to", // lecture
|
|
"*^introductory", // introductory potter's wheel
|
|
"^introduction", // ... : introduction climb
|
|
"*how to", // cheap how to succeed in business...
|
|
"-reach us", // how to reach us
|
|
"-contact us", // how to contact us
|
|
"*intro to", // lecture
|
|
"*^the story", // the story of carl kiger
|
|
"*^all about", // Class - All About Sleep Apnea
|
|
"*$an introduction", // lecture
|
|
"*$an intro", // lecture
|
|
"*^the wonder", // the wonder of kites
|
|
//"^dance", // 'dance location map' ceder.net!
|
|
"^graduation",
|
|
"*$special", // valentine's special
|
|
"*world premier",
|
|
|
|
// stuff in the middle
|
|
"*vs",
|
|
"*versus",
|
|
"*class with",
|
|
"*class w", // class w/
|
|
"*class at",
|
|
"*class on",
|
|
"*class in", // class in downtown palo alto
|
|
"*classes with",
|
|
"*classes w", // class w/
|
|
"*classes at",
|
|
"*classes on",
|
|
"-^half day", // half day of classes
|
|
"*$on display", // trains on display
|
|
"*show with",
|
|
"*dance with",
|
|
"*dancing with",
|
|
"^free dance", // free dance fusion class at ...
|
|
"*dance at",
|
|
"dancing lessons",
|
|
"*art of", // the art of bill smith
|
|
"*^art at", // art at the airport
|
|
"*meeting at",
|
|
"*meetings are", // meetings are from 7pm to 9pm
|
|
"*$and greet", // meet and greet
|
|
"$meet greet", // meet & greet
|
|
"*$and mingle", // mix and mingle
|
|
"*free lessons",
|
|
"*lessons at", // free dancing lessons at
|
|
"*dance for" // dance for the stars
|
|
};
|
|
|
|
// store these words into table
|
|
static HashTableX s_twt;
|
|
static char s_twtbuf[4000];
|
|
static bool s_init10 = false;
|
|
if ( ! s_init10 ) {
|
|
s_init10 = true;
|
|
s_twt.set(8,4,128,s_twtbuf,4000,false,0,"ti1tab");
|
|
int32_t n = (int32_t)sizeof(s_twords)/ sizeof(char *);
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
// set words
|
|
char *s = s_twords[i];
|
|
// set words
|
|
Words w; w.set3 ( s );
|
|
int64_t *wi = w.getWordIds();
|
|
int64_t h = 0;
|
|
// scan words
|
|
for ( int32_t j = 0 ; j < w.getNumWords(); j++ ) {
|
|
// fix "art of" = "of art"
|
|
if (! wi[j] ) continue;
|
|
h <<= 1LL;
|
|
h ^= wi[j];
|
|
}
|
|
// wtf?
|
|
if ( h == 0LL ) { char *xx=NULL;*xx=0; }
|
|
// . store hash of all words, value is ptr to it
|
|
// . put all exact matches into sw1 and the substring
|
|
// matches into sw2
|
|
if ( ! s_twt.addKey ( &h , &s ) ) return false;
|
|
}
|
|
}
|
|
|
|
// store these words into table
|
|
if ( ! s_init3 ) initGenericTable ( niceness );
|
|
|
|
// scan words in [a,b) for a match.
|
|
// skip if in date or stop word
|
|
|
|
// ok, now scan forward. the word can also be next to
|
|
// strange punct like a colon like
|
|
// "Hebrew Conversation Class: Beginning" for dailylobo.com
|
|
int32_t i = a;
|
|
int64_t firstWid = 0LL;
|
|
int64_t lastWid = 0LL;
|
|
bool hadAnnual = false;
|
|
bool hadFeaturing = false;
|
|
bool lastWordWasDate = false;
|
|
bool negMatch = false;
|
|
bool posMatch = false;
|
|
bool oneWordMatch = false;
|
|
bool hadAthon = false;
|
|
for ( ; i < b ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// skip if not alnum word
|
|
if ( ! wids[i] ) continue;
|
|
// first word?
|
|
if ( ! firstWid ) firstWid = wids[i];
|
|
// does it match a word in the table?
|
|
char **sp = (char **)s_twt.getValue ( &wids[i] );
|
|
// a match?
|
|
if ( sp ) oneWordMatch = true;
|
|
// or a two word phrase? even if we matched a one word
|
|
// phrase, try for the two because it might be a negative!
|
|
// i.e. "half open"
|
|
if ( lastWid ) {
|
|
int64_t combo = wids[i] ^ (lastWid<<1LL);
|
|
char **sp2 = (char **)s_twt.getValue ( &combo );
|
|
// if there use that! otherwise, leave sp alone
|
|
if ( sp2 ) sp = sp2;
|
|
if ( sp2 ) oneWordMatch = false;
|
|
}
|
|
// get next wid after us
|
|
int64_t nextWid = 0LL;
|
|
for ( int32_t k = i + 1 ; k < b ; k++ ) {
|
|
QUICKPOLL(niceness);
|
|
if ( ! wids[k] ) continue;
|
|
nextWid = wids[k];
|
|
break;
|
|
}
|
|
// "-getting there" to prevent "getting" from winning
|
|
if ( nextWid ) {
|
|
int64_t combo = (wids[i]<<1LL) ^ nextWid;
|
|
char **sp2 = (char **)s_twt.getValue ( &combo );
|
|
// if there use that! otherwise, leave sp alone
|
|
if ( sp2 ) sp = sp2;
|
|
}
|
|
|
|
if ( wids[i] == h_annual ) hadAnnual = true;
|
|
// must not be the first word or last word
|
|
if ( wids[i] == h_featuring && i > a && i < b-1 )
|
|
hadFeaturing = true;
|
|
|
|
// any kind of athon, hackathon, etc. sitathon
|
|
if ( wlens[i]>=8 &&
|
|
to_lower_a(wptrs[i][wlens[i]-5]) == 'a' &&
|
|
to_lower_a(wptrs[i][wlens[i]-4]) == 't' &&
|
|
to_lower_a(wptrs[i][wlens[i]-3]) == 'h' &&
|
|
to_lower_a(wptrs[i][wlens[i]-2]) == 'o' &&
|
|
to_lower_a(wptrs[i][wlens[i]-1]) == 'n' )
|
|
hadAthon = true;
|
|
|
|
// any *fest too!! assfest
|
|
if ( wlens[i]>=7 &&
|
|
to_lower_a(wptrs[i][wlens[i]-4]) == 'f' &&
|
|
to_lower_a(wptrs[i][wlens[i]-3]) == 'e' &&
|
|
to_lower_a(wptrs[i][wlens[i]-2]) == 's' &&
|
|
to_lower_a(wptrs[i][wlens[i]-1]) == 't' )
|
|
hadAthon = true;
|
|
|
|
//if ( wids[i] == h_this &&
|
|
// i+2<nw && wids[i+2] == h_series )
|
|
// log("hey");
|
|
|
|
// save it
|
|
int64_t savedWid = lastWid;
|
|
// assign
|
|
lastWid = wids[i];
|
|
// save this
|
|
bool savedLastWordWasDate = lastWordWasDate;
|
|
// and update
|
|
lastWordWasDate = (bool)(bits && (bits[i] & D_IS_IN_DATE));
|
|
// match?
|
|
if ( ! sp ) continue;
|
|
// get the char ptr
|
|
char *pp = *sp;
|
|
// or total like "total courses: xx. total sections: yy"
|
|
if ( savedWid == h_total ) return -1;
|
|
// . if prev word was "no" then return -1
|
|
// . fix "no mass" "no class" etc.
|
|
// . oneWordMatch fixes "Straight No Chaser" play title
|
|
// which has "no chaser" in the list above
|
|
if ( savedWid == h_no && oneWordMatch ) return -1;
|
|
// fix "past conferences"
|
|
if ( savedWid == h_past && oneWordMatch ) return -1;
|
|
// aynthing starting with no is generally bad...
|
|
// like "no class" "no service", etc.
|
|
if ( savedWid == h_no && firstWid == h_no &&
|
|
// UNLESS it's "no name" - like "the no name club"
|
|
lastWid != h_name )
|
|
return -1;
|
|
// return value is true by default
|
|
bool *matchp = &posMatch;
|
|
// . is it generic? "sesssions", etc. that means we need
|
|
// 2+ alnum words, we can't have a title that is just
|
|
// "sessions"
|
|
// . if it is generic and we only have one word, forget it!
|
|
if ( *pp == '*' && alnumCount == 1 && useAsterisk )
|
|
continue;
|
|
// skip asterisk
|
|
if ( *pp == '*' )
|
|
pp++;
|
|
// is it negative. complement return value then
|
|
if ( *pp == '-' ) { matchp = &negMatch; pp++; }
|
|
// anywhere?
|
|
if ( *pp != '$' && *pp != '^' && *pp != '|' ) {
|
|
*matchp = 1;
|
|
continue;
|
|
}
|
|
// the gerunds mismatch easily and therefore we require for
|
|
// the match to be complete that we not be mixed case. fixes
|
|
// "Looking forward to seeing you again"
|
|
int32_t pplen = gbstrlen(pp);
|
|
if ( pp[pplen-1]=='g' &&
|
|
pp[pplen-2]=='n' &&
|
|
pp[pplen-3]=='i' &&
|
|
( sflags & SENT_MIXED_CASE ) )
|
|
continue;
|
|
// yes! must match first part of sentence
|
|
if ( (*pp == '^' || *pp == '|' ) &&
|
|
( wids[i] == firstWid ||
|
|
lastWid == firstWid ) )
|
|
*matchp = 1;
|
|
// . or if a colon was right before us...
|
|
// . Hacking+the+Planet:+Demystifying+the+Hacker+Space
|
|
if ( (*pp == '^' || *pp == '|' ) &&
|
|
i>a &&
|
|
words->hasChar(i-1,':') )
|
|
*matchp = 1;
|
|
// or date right before us
|
|
if ( (*pp == '^' || *pp == '|' ) && savedLastWordWasDate )
|
|
*matchp = 1;
|
|
// . this is always good (HACK)
|
|
// . annual party / annual spring concert/annual nye afterparty
|
|
if ( hadAnnual && *pp != '-' ) *matchp = 1;
|
|
// keep chugging if must match first word in sentence
|
|
if ( *pp == '^' ) continue;
|
|
// stop if end of the line, that counts as well
|
|
if ( i + 2 >= b ) { *matchp = 1; continue; }
|
|
// tags are good
|
|
if ( tids[i+1] ) *matchp = 1;
|
|
// fix "...dance class,</strong>"
|
|
if ( tids[i+2] ) *matchp = 1;
|
|
// fix "workshop a" for aliconferences.com
|
|
if ( i == b - 3 && wlens[b-1] == 1 )
|
|
*matchp = 1;
|
|
// Dance III..., Adult II...
|
|
if ( i+2 < b && wptrs[i+2][0]=='I'&&wptrs[i+2][1] == 'I')
|
|
*matchp = 1;
|
|
// Dance I ...
|
|
if ( i+2 < b && wlens[i+2]==1 && wptrs[i+2][0] == 'I' )
|
|
*matchp = 1;
|
|
// Ballet V ...
|
|
if ( i+2 < b && wlens[i+2]==1 && wptrs[i+2][0] == 'V' )
|
|
*matchp = 1;
|
|
// Ballet VI ...
|
|
if ( i+2 < b && wptrs[i+2][0] == 'V'&&wptrs[i+2][1]=='I')
|
|
*matchp = 1;
|
|
// hitting a date is ok... (TODO: test "... every saturday"
|
|
if ( bits && (bits[i+2] & D_IS_IN_DATE) ) *matchp = 1;
|
|
// a lot of times it does not treat the year as a date!
|
|
// "Radio City Christmas Spectacular 2011"
|
|
if ( i+2 < b &&
|
|
is_digit(wptrs[i+2][0]) &&
|
|
wlens[i+2]==4 &&
|
|
words->getAsLong(i+2) >= 2005 &&
|
|
words->getAsLong(i+2) <= 2040 )
|
|
*matchp = 1;
|
|
// the word for is ok? "training for grades ..."
|
|
if ( wids[i+2] == h_for ) *matchp = 1;
|
|
// how about "of"... symposium of science. celebration of ...
|
|
//if ( wids[i+2] == h_of ) *matchp = 1;
|
|
// at following is ok. Tess+Gallagher+reading+at+Village+Book
|
|
if ( wids[i+2] == h_at ) *matchp = 1;
|
|
// music by the bay
|
|
if ( wids[i+2] == h_by ) *matchp = 1;
|
|
|
|
// "in" can be a delimeter if a place name follows it like
|
|
// "the conference room" or "the Martin Building" ...
|
|
// or "Zipperdome, 123 main street"
|
|
// fix subsent:
|
|
// ... Pre-Hearing Discussion in the Conference Room...
|
|
// for legals.abqjournal.com/legals/show/273616
|
|
// TODO: a city/state!!!
|
|
if ( wids[i+2] == h_in ) {
|
|
*matchp = 1;
|
|
/*
|
|
// assume none
|
|
bool gotPlaceInd = false;
|
|
// scan for place indicator or in address bit
|
|
for ( int32_t j = i + 4 ; j < b ; j++ ) {
|
|
if ( ! isPlaceIndicator(&wids[j]) ) continue;
|
|
gotPlaceInd = true;
|
|
break;
|
|
}
|
|
if ( gotPlaceInd ) *matchp = 1;
|
|
*/
|
|
}
|
|
|
|
// put in subsent code
|
|
//if ( i+4<b && wids[i+2] == h_directed && wids[i+4] == h_by )
|
|
// *matchp = 1;
|
|
// Mass on January 1
|
|
if ( i+4<b && wids[i+2]==h_on && bits &&
|
|
(bits[i+4]&D_IS_IN_DATE) )
|
|
*matchp = 1;
|
|
// blah blah in the chapel / in the Park / symposium in Boise..
|
|
if ( i+6<b && wids[i+2] == h_in )
|
|
*matchp = 1;
|
|
// beginning at
|
|
if ( i+4<b && wids[i+2]==h_beginning && wids[i+4]==h_at )
|
|
*matchp = 1;
|
|
// begins at
|
|
if ( i+4<b && wids[i+2]==h_begins && wids[i+4]==h_at )
|
|
*matchp = 1;
|
|
// club meets at
|
|
if ( i+4<b && wids[i+2]==h_meets && wids[i+4]==h_at )
|
|
*matchp = 1;
|
|
// rehersal times, concert times, blah blah times
|
|
if ( i+3 == b && wids[i+2]==h_times )
|
|
*matchp = 1;
|
|
// blah blah benefitting blah blah
|
|
if ( wids[i+2] == h_benefitting ) *matchp = 1;
|
|
if ( wids[i+2] == h_benefiting ) *matchp = 1;
|
|
// blah blah party sponsored by ...
|
|
if ( i+4<b && wids[i+2] == h_sponsored ) *matchp = 1;
|
|
if ( i+4<b && wids[i+2] == h_sponsered ) *matchp = 1;
|
|
// blah blah party presented by ...
|
|
if ( i+4<b && wids[i+2] == h_presented ) *matchp = 1;
|
|
// a colon is good "Soapmaking: how to"
|
|
if ( words->hasChar (i+1,':' ) &&
|
|
// watch out for field names
|
|
wids[i] != h_event &&
|
|
wids[i] != h_band )
|
|
*matchp = 1;
|
|
// likewise a hyphen "Class - All About Sleep Apnea"
|
|
if ( words->hasChar (i+1,'-' ) &&
|
|
// watch out for field names
|
|
wids[i] != h_event &&
|
|
wids[i] != h_band )
|
|
*matchp = 1;
|
|
// or parens: Million+Dollar+Quartet+(Touring)
|
|
if ( words->hasChar (i+1,'(' ) ) *matchp = 1;
|
|
|
|
/*
|
|
// strange punct follows?
|
|
char *p = wptrs[i+1];
|
|
char *pend = p + wlens[i+1];
|
|
for ( ; p < pend ; p++ ) {
|
|
QUICKPOLL(niceness);
|
|
if ( *p != ':' ) continue;
|
|
// . phone number not allowed after it!
|
|
// . fix "Adult Services: 881-001" for unm.edu
|
|
int32_t next = i + 1;
|
|
for ( ; next < b && ! wids[next]; next++ );
|
|
// no phone number, so allow this ending!
|
|
if ( next >= b ) break;
|
|
// phone number? if not, we're good!
|
|
if ( ! isdigit(wptrs[next][0]) ) break;
|
|
// crap, forget it... do not set ENDING bit
|
|
p = pend-1;
|
|
}
|
|
// there is no strange punct after it, so this word is
|
|
// not really the "last" word in this sentence
|
|
if ( p >= pend ) continue;
|
|
// we got some strange punct
|
|
return retVal;
|
|
*/
|
|
}
|
|
|
|
// return it if we got something
|
|
if ( negMatch ) return -1;
|
|
if ( posMatch ) return 1;
|
|
if ( hadAthon ) return 1;
|
|
|
|
// blah blah featuring blah blah
|
|
if ( hadFeaturing ) return 1;// true;
|
|
|
|
// . if it has quotes, with, at, @, *ing, "w/" set it
|
|
// . if has mixed case do not do this loop
|
|
if ( sflags & SENT_MIXED_CASE ) b = 0;
|
|
int32_t hadNonDateWord = 0;
|
|
int64_t lastWordId = 0LL;
|
|
bool lastWordPastTense = false;
|
|
// loop over all words in the title
|
|
for ( i = a ; i < b ; i++ ) {
|
|
QUICKPOLL(niceness);
|
|
if ( tids[i] ) continue;
|
|
// check for puncutation based title indicators
|
|
if ( ! wids[i] ) {
|
|
// MCL+Meet+@+Trego
|
|
if ( words->hasChar(i,'@' ) &&
|
|
// fix "Sunday Oct 4 2pm at ..." for zvents.com
|
|
hadNonDateWord &&
|
|
// last word is not "tickets"!! fix zvents.com
|
|
lastWordId != h_tickets &&
|
|
// only for place names not tods like "@ 2pm"
|
|
i+1<b && ! is_digit(words->m_words[i+1][0]) )
|
|
break;
|
|
// "Chicago"
|
|
if ( i>0 &&
|
|
! tids[i-1] &&
|
|
! wids[i-1] &&
|
|
words->hasChar(i-1,'\"') )
|
|
break;
|
|
// Event:+'Sign+Language+Interpreted+Mass'
|
|
if ( i>0 &&
|
|
! tids[i-1] &&
|
|
! wids[i-1] &&
|
|
words->hasChar(i,'\'') )
|
|
break;
|
|
continue;
|
|
}
|
|
|
|
// blah blah with Tom Smith
|
|
if ( i > a &&
|
|
i+2 < b &&
|
|
(wids[i] == h_with || wids[i] == h_starring) &&
|
|
// with id doesn't count
|
|
wids[i+2] != h_i &&
|
|
wids[i+2] != h_id &&
|
|
// experience with quickbooks
|
|
(i-2<a || wids[i-2] != h_experience) &&
|
|
// with purchase
|
|
wids[i+2] != h_purchase &&
|
|
// with $75 purchase
|
|
(i+4>=b || wids[i+4] != h_purchase) )
|
|
break;
|
|
// blah blah w/ Tom Smith
|
|
if ( i > a &&
|
|
i+2 < b &&
|
|
wids[i] == h_w &&
|
|
i+1<b &&
|
|
words->hasChar(i+1,'/') &&
|
|
// with id doesn't count
|
|
wids[i+2] != h_i &&
|
|
wids[i+2] != h_id )
|
|
break;
|
|
// "Lotsa Fun at McCarthy's"
|
|
if ( wids[i] == h_at &&
|
|
// fix "Sunday Oct 4 2pm at ..." for zvents.com
|
|
hadNonDateWord &&
|
|
// last word is not "tickets"!! fix zvents.com
|
|
lastWordId != h_tickets &&
|
|
// not look at.. take a look at the menu
|
|
lastWordId != h_look &&
|
|
// what's new at the farm
|
|
lastWordId != h_new &&
|
|
// fix "Events at Stone Brewing Co"
|
|
lastWordId != h_events &&
|
|
// search jobs at aria
|
|
lastWordId != h_jobs &&
|
|
// write us at ...
|
|
lastWordId != h_us &&
|
|
// at its best
|
|
(i+2>=b || wids[i+2] != h_its) &&
|
|
// she studied at the rcm
|
|
! lastWordPastTense &&
|
|
// . lastword can't be a place indicator
|
|
// . Aquarium at Albuquerque ...
|
|
// . Museums at Harvard
|
|
! s_pit.isInTable(&lastWordId) &&
|
|
// only for place names not tods like "at 2pm"
|
|
i+2<b && ! is_digit(words->m_words[i+2][0]) )
|
|
break;
|
|
|
|
bool isDateWord = (bool)(bits && (bits[i] & D_IS_IN_DATE));
|
|
// mark this
|
|
if ( ! isDateWord ) hadNonDateWord++;
|
|
// save this
|
|
lastWordId = wids[i];
|
|
|
|
// assume not past tense
|
|
lastWordPastTense = false;
|
|
// is it past tense like "studied"?
|
|
int32_t wlen = wlens[i];
|
|
if ( to_lower_a(wptrs[i][wlen-1]) != 'd' ) continue;
|
|
if ( to_lower_a(wptrs[i][wlen-2]) != 'e' ) continue;
|
|
// exceptions: feed need
|
|
if ( to_lower_a(wptrs[i][wlen-3]) == 'e' ) continue;
|
|
// exceptions: ned zed bed
|
|
if ( wlen == 3 ) continue;
|
|
// probably a ton more exceptions must let's see how it does
|
|
lastWordPastTense = true;
|
|
}
|
|
// i guess we had something nice...
|
|
if ( i < b ) return 1; // true;
|
|
|
|
// no match
|
|
return 0;
|
|
}
|
|
/*
|
|
bool Sections::isPlaceOrBusinessWord ( int32_t i ) {
|
|
|
|
isPlaceIndicator ( &wids[i] ) return true;
|
|
if ( wids[i] == h_news ) return true;
|
|
if ( wids[i] == h_network ) return true;
|
|
|
|
// generally, allow all gerunds, anywhere in the title
|
|
// but if a noun like association or center follows them
|
|
// they do not count, they are describing a bldg or
|
|
// organization... or "news" or "club" "room" (dining room)
|
|
// "co" "company" "llc" ...
|
|
}
|
|
*/
|
|
|
|
// . sets SENT_PRETTY in si->m_sentFlags if sentence section "si" qualifies
// . requires m_alnumPosValid to be true, otherwise we core
// . a sentence is a candidate if ANY of these hold:
//   . SENT_PERIOD_ENDS is set
//   . SENT_MIXED_CASE is set, it spans 6+ alnum positions and
//     SENT_HASNOSPACE is not set
//   . a punctuation word in [m_senta,m_sentb) has a comma and it spans
//     3+ alnum positions
// . a candidate is then vetoed if ANY of these hold:
//   . the generic-word scan below never hits a non-generic alnum word
//   . SENT_POWERED_BY is set
//   . the word at m_senta has D_IN_LINK set
//   . the word at m_a is "click"
// . never clears SENT_PRETTY, only sets it
void Sections::setSentPrettyFlag ( Section *si ) {
	// shortcut
	sentflags_t sflags = si->m_sentFlags;

	// shortcut
	wbit_t *bits = m_bits->m_bits;

	// word ids we compare against, hashed once on the first call
	static int64_t h_click;
	static int64_t h_here;
	static int64_t h_link;
	static int64_t h_the;
	static bool s_init = false;
	if ( ! s_init ) {
		s_init = true;
		h_click = hash64n("click");
		h_here  = hash64n("here");
		h_the   = hash64n("the");
		h_link  = hash64n("link");
	}

	// m_alnumPosA/B must have been set already, core if not
	if ( ! m_alnumPosValid ) { char *xx=NULL;*xx=0; }

	// assume not pretty
	bool pretty = false;
	// . we want to record votes on pretty sentences regardless
	// . the idea is that if a turk vote changes the event
	//   title then more sentences may be included as part of the
	//   event description without ever having been voted on!
	bool periodEnds = (sflags & SENT_PERIOD_ENDS);
	// if enough mixed case words, allow it through anyway
	int32_t numAlnums = si->m_alnumPosB - si->m_alnumPosA + 1;
	if ( (sflags & SENT_MIXED_CASE) && numAlnums >= 6 &&
	     // watch out for urls like get?q=gbeventhash668419539....
	     !(sflags & SENT_HASNOSPACE) )
		periodEnds = true;
	// or if it has commas
	bool hasComma = false;
	for ( int32_t i = si->m_senta ; i < si->m_sentb ; i++ ) {
		// breathe
		QUICKPOLL(m_niceness);
		// . skip tags, we want a comma in the sentence's own
		//   punctuation, not one inside an html tag
		// . this test was inverted before (it skipped every NON-tag)
		//   so only commas inside tags could ever be seen
		if ( m_tids[i] ) continue;
		// skip alnum words, only punctuation words are checked
		if ( m_wids[i] ) continue;
		if ( ! m_words->hasChar(i,',') ) continue;
		hasComma = true;
		break;
	}
	if ( hasComma && numAlnums >= 3 ) periodEnds = true;
	// only sentences that passed one of the tests above can be pretty
	if ( periodEnds ) pretty = true;
	// BUT if its all generic words it is not pretty!
	int32_t lastWidPos = -1;
	// assume all generic words
	bool allGeneric = true;
	// is all generic words?
	for ( int32_t i = si->m_senta ; i < si->m_sentb ; i++ ) {
		// breathe
		QUICKPOLL(m_niceness);
		// skip if not alnum word
		if ( ! m_wids[i] ) continue;
		// . stop on "click here"
		// . we only get here while allGeneric is still true, so
		//   breaking now leaves the sentence marked all generic
		if ( m_wids[i] == h_here &&
		     lastWidPos >= 0 &&
		     m_wids[lastWidPos] == h_click )
			break;
		// likewise stop on "[click|follow] the link"
		if ( m_wids[i] == h_link &&
		     lastWidPos >= 0 &&
		     m_wids[lastWidPos] == h_the )
			break;
		// remember the position of the last alnum word we saw
		lastWidPos = i;
		// skip if generic like
		if ( bits[i] & (D_CRUFTY|
				D_IS_NUM|
				D_IS_HEX_NUM|
				D_IN_PARENS|
				D_IS_IN_DATE) )
			continue;
		// otherwise, not generic
		allGeneric = false;
		break;
	}
	// if all generic, not pretty
	if ( allGeneric ) pretty = false;
	// powered by is not pretty
	if ( sflags & SENT_POWERED_BY ) pretty = false;
	// all in link is not pretty (only the first word is tested)
	if ( bits[si->m_senta] & D_IN_LINK ) pretty = false;
	// . starts with "click" is bad "Click the map to drag around..."
	//   or "Click the link to see more..."
	// . NOTE(review): this indexes m_a while the link test above
	//   indexes m_senta -- confirm they agree for sentence sections
	if ( m_wids[si->m_a] == h_click ) pretty = false;
	// bail if we did not make it
	if ( ! pretty ) return;
	// we are pretty
	si->m_sentFlags |= SENT_PRETTY;
}
|
|
|
|
/*
|
|
////////////////////////////////
|
|
//
|
|
// punish sections that have a repeated tag hash
|
|
//
|
|
//
|
|
// - the idea being title sections are somewhat unique
|
|
// - limit repeat table recordings for each individual event
|
|
//
|
|
////////////////////////////////
|
|
char rttbuf[4000];
|
|
HashTableX rtt;
|
|
rtt.set(4,4,256,rttbuf,4000,false,m_niceness,"rttt");
|
|
for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// now we require the sentence
|
|
if ( ! ( si->m_flags & SEC_SENTENCE ) ) continue;
|
|
// skip if not in description right now, no need to score it!
|
|
if ( si->m_minEventId <= 0 ) continue;
|
|
// insert into table, score one
|
|
if ( ! rtt.addTerm32((int32_t *)&si->m_tagHash) ) return false;
|
|
}
|
|
// now punish the repeaters
|
|
for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// now we require the sentence
|
|
if ( ! ( si->m_flags & SEC_SENTENCE ) ) continue;
|
|
// skip if not in description right now, no need to score it!
|
|
if ( si->m_minEventId <= 0 ) continue;
|
|
// insert into table, score one
|
|
int32_t score = rtt.getScore32((int32_t *)&si->m_tagHash) ;
|
|
// punish?
|
|
if ( score > 1 ) continue;
|
|
// yes
|
|
si->m_sentFlags |= SENT_UNIQUE_TAG_HASH;
|
|
si->m_titleScore *= 1.02;
|
|
}
|
|
|
|
|
|
/////////////
|
|
//
|
|
// title score boost for being part of title tag
|
|
//
|
|
// many event titles are repeated in the title tag
|
|
// that may be true, but also the title is generic and repeated
|
|
// again in the document body, and that hurts us!!
|
|
//
|
|
// TODO: consider doing this if only one event?
|
|
//
|
|
///////////
|
|
// get the title tag section
|
|
Section *titleSec = NULL;
|
|
for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// now we require the sentence
|
|
if ( si->m_tagId != TAG_TITLE ) continue;
|
|
titleSec = si;
|
|
break;
|
|
}
|
|
// . score each section that directly contains text.
|
|
// . have a score for title and score for description
|
|
for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// forget if no title
|
|
if ( ! titleSec ) break;
|
|
// now we require the sentence
|
|
if ( ! ( si->m_flags & SEC_SENTENCE ) ) continue;
|
|
// skip if not in description right now, no need to score it!
|
|
if ( si->m_minEventId <= 0 ) continue;
|
|
// get sentence
|
|
int32_t senta = si->m_senta;
|
|
int32_t sentb = si->m_sentb;
|
|
// do not match title tag with itself!
|
|
if ( senta >= titleSec->m_a && senta < titleSec->m_b )
|
|
continue;
|
|
// title sentence
|
|
int32_t ta = titleSec->m_a;
|
|
int32_t tb = titleSec->m_b;
|
|
// record max words matched
|
|
int32_t max = 0;
|
|
bool allMatched = false;
|
|
int32_t matchedWords = 0;
|
|
// compare to sentence in title
|
|
for ( int32_t i = ta ; i < tb ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not alnum
|
|
if ( ! m_wids[i] ) continue;
|
|
// check with title
|
|
if ( m_wids[i] != m_wids[senta] ) {
|
|
// reset matched word count
|
|
matchedWords = 0;
|
|
// reset this ptr too
|
|
senta = si->m_senta;
|
|
continue;
|
|
}
|
|
// ok, words match, see how many
|
|
matchedWords++;
|
|
// record max
|
|
if ( matchedWords > max ) max = matchedWords;
|
|
// advance over word we match
|
|
senta++;
|
|
// advance to next alnumword
|
|
for ( ; senta < sentb && !m_wids[senta] ; senta++);
|
|
// all done?
|
|
if ( senta < sentb ) continue;
|
|
// all matched!
|
|
allMatched = true;
|
|
break;
|
|
}
|
|
if ( ! allMatched ) continue;
|
|
// reward
|
|
si->m_sentFlags |= SENT_MATCHED_TITLE_TAG;
|
|
// give a little more the more words matched
|
|
// no, because the title often has the containing place name,
|
|
// like "Museum of Science and Industry" even when the page
|
|
// is talking about the "Bronzeville Blue Club" which will
|
|
// get less points because its only match 3 words...
|
|
//float minb = 2.0;
|
|
//float maxb = 3.0;
|
|
//if ( matchedWords > 10 ) matchedWords = 10;
|
|
//float bonus = minb+(maxb-minb) * ((float)matchedWords/10.0);
|
|
si->m_titleScore *= 3.0;//bonus;
|
|
si->m_descScore *= 3.0;//bonus;
|
|
}
|
|
}
|
|
*/
|
|
|
|
// . the delimiter "methods" that can be tried when partitioning a list of
//   brother sections
// . METHOD_MAX is the number of methods, valid method ids are
//   [0,METHOD_MAX)
#define METHOD_MONTH_PURE  0 // like "<h3>July</h3>"
#define METHOD_TAGID       1
#define METHOD_DOM         2
#define METHOD_ABOVE_DOM   3
#define METHOD_DOW         4
#define METHOD_DOM_PURE    5
#define METHOD_DOW_PURE    6
#define METHOD_ABOVE_DOW   7
#define METHOD_INNER_TAGID 8
#define METHOD_ABOVE_ADDR  9
#define METHOD_MAX         10

// methods that are no longer defined, kept here for reference only
//#define METHOD_ABOVE_TOD   8 // dailylobo.com
//#define METHOD_ABOVE_DOW   6
//#define METHOD_ATTRIBUTE   2
//#define METHOD_BR_DOM      10 // like "<br>Starting July 31..."
//#define METHOD_EMPTY_TAG   8 // like "<p></p>", now also <br><br>

// size of the per-cell arrays in the Partition class
#define MAXCELLS (1024*5)
|
|
|
|
class Partition {
|
|
public:
|
|
int32_t m_np;
|
|
int32_t m_a[MAXCELLS];
|
|
int32_t m_b[MAXCELLS];
|
|
class Section *m_firstBro[MAXCELLS];
|
|
};
|
|
|
|
// . employ the existing methods (add METHOD_MONTH_PURE,EMPTY_TAG, etc.)
|
|
// . but in scoring the partition use the SENT_* bits
|
|
// . import SEC_HAS_DOM/TOD/etc. in the SENT_* bits for simplicity
|
|
// . hash every combination of a sentence's SENT_* bits with his neighbor
|
|
// for making the scoring vector. or just hash the whole thing???
|
|
// like just all the bits of him with all the sent flags with next sentence?
|
|
// . i kinda want to manually control the methods rather than resorting to
|
|
// all combinations of sentence bits as possible delimeter hashes in order
|
|
// to keep performance fast.
|
|
|
|
// . scan each list of brothers
|
|
// . identify if it contains dom/dow dates and tod dates in brothers
|
|
// . then scan each brother in the list
|
|
// . take all combinations of up to 3 sentence flag bits from a brother section
|
|
// and use that as a delimiter
|
|
// . then evaluate the score of the resulting partition based on using that
|
|
// delimeter.
|
|
// . example: the first brother has SENT_IN_HEADER and SENT_HAS_PRICE set
|
|
// amongst other bits, so we try to use all brother sections in the list
|
|
// that have both SENT_IN_HEADER and SENT_HAS_PRICE as delimiters. and we
|
|
// score the partition based on that.
|
|
// . we score partitions by comparing each partition cell to the others.
|
|
// i.e. how similar are they in terms of the SENT_* bits they have set?
|
|
// . the idea is to create a partition that balances the SENT_* bits
|
|
// so that each partition has more or less the same SENT_* bits from all
|
|
// the sections that it contains.
|
|
// . example: like each partitioned section has SENT_PRICE and SENT_TOD, ...
|
|
// then that's pretty good.
|
|
// . we also hash all possible pairs of SENT_* flags between adjacent sentences
|
|
// in the same partitioned section. this gives us ordering information.
|
|
// . like a SENT_MIXED_CASE sentence following a SENT_IN_HEADER sentence would
|
|
// result in one vector component of the vector for the partitioned cell.
|
|
// and we compare vectors from each cell in the partition with all the other
|
|
// cells in the partition to get the similarity score.
|
|
// . returns -1 with g_errno set on error
|
|
int32_t Sections::addImpliedSections3 ( ) {
|
|
|
|
// . try skipping for xml
|
|
// . eventbrite.com has a bunch of dates per event item and
|
|
// we end up using METHOD_DOM on those!
|
|
// . i think the implied section algo is meant for html really
|
|
// or plain text
|
|
if ( m_contentType == CT_XML &&
|
|
( m_isEventBrite ||
|
|
m_isStubHub ||
|
|
m_isFacebook ) )
|
|
return 0;
|
|
|
|
|
|
sec_t badFlags =SEC_MARQUEE|SEC_STYLE|SEC_SCRIPT|SEC_SELECT|
|
|
SEC_HIDDEN|SEC_NOSCRIPT;
|
|
|
|
// for debugging -- sleep forever
|
|
//if ( g_hostdb.m_hostId == 44 ) {
|
|
// for ( ; ; ) {
|
|
// QUICKPOLL(m_niceness);
|
|
// }
|
|
//}
|
|
|
|
// ignore brothers that are one of these tagids
|
|
//sec_t badFlags = SEC_SELECT|SEC_SCRIPT|SEC_STYLE|SEC_HIDDEN;
|
|
|
|
bool addedPartition = false;
|
|
|
|
// scan the sections
|
|
for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// must be the first brother in a list of brothers
|
|
if ( sk->m_prevBrother ) continue;
|
|
// must have a brother after him
|
|
if ( ! sk->m_nextBrother ) continue;
|
|
// all least 3 brothers to make implied sections useful
|
|
if ( ! sk->m_nextBrother->m_nextBrother ) continue;
|
|
|
|
// use this to pass less args to functions then
|
|
//m_totalHdrCount = 0;
|
|
|
|
// tally some stats for this list of brothers
|
|
int32_t count1 = 0;
|
|
int32_t count2 = 0;
|
|
//int32_t bothCount = 0;
|
|
// scan the brothers now
|
|
Section *bro = sk;
|
|
// until the end of the list
|
|
for ( ; bro ; bro = bro->m_nextBrother ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// now at least two brothers must have the
|
|
// SEC_HAS_DOM/DOW set and at least two other brothers
|
|
// must have SEC_HAS_TOD set
|
|
sec_t flags = bro->m_flags;
|
|
// skip bad ones
|
|
if ( flags & badFlags ) continue;
|
|
// might also have tod set like folkmads.org does for
|
|
// one event title
|
|
if ( flags & (SEC_HAS_DOM|SEC_HAS_DOW) )
|
|
count1++;
|
|
// but for tod, must only just be tod i guess???
|
|
if ( flags & SEC_HAS_TOD )
|
|
count2++;
|
|
// how many headers with dom/dow dates do we have?
|
|
// like "Jan 23, 2010" or "Wednesdays"
|
|
//if ( ((flags & SEC_HEADING_CONTAINER) ||
|
|
// (flags & SEC_HEADING ) ) &&
|
|
// ( flags & (SEC_HAS_DOM|SEC_HAS_DOW) ) )
|
|
// m_totalHdrCount++;
|
|
//if ( (flags & (SEC_HAS_DOM|SEC_HAS_DOW) ) &&
|
|
// ( flags & SEC_HAS_TOD ) )
|
|
// bothCount++;
|
|
}
|
|
// . skip this list of brothers if we did not make the cut
|
|
// . i lowered to one to re-fix gwair.org, cabq.gov, etc.
|
|
// because i fixed the SEC_HAS_DOM/DOW/TOD for implied
|
|
// sections so it is actually accurate and not inheriting
|
|
// these flags from its parent. but to keep things fast
|
|
// we still have this constraint.
|
|
if ( count1 < 1 || count2 < 1 ) continue;
|
|
|
|
// can't all be both. the whole point in the partition is
|
|
// to group tods with doms/dows.
|
|
// this was hurting blackbirdbuvette.com which needed the
|
|
// hr partition to split up some sections so it got
|
|
// "Geeks Who Drink" as the title because it would no longer
|
|
// have the multevent penalty!
|
|
//if ( count1 + count2 == 2 * bothCount ) continue;
|
|
|
|
// hmmm... two brothers need to have dates. see what this hurts
|
|
if ( count1 < 2 && count2 < 2 ) continue;
|
|
|
|
// reset the cache
|
|
char cbuf[2048];
|
|
m_ct.set (4,0,32,cbuf , 2048 , false,m_niceness,"sect-cache");
|
|
|
|
// set this partition info for each method:
|
|
Partition parts[METHOD_MAX];
|
|
|
|
// reset these
|
|
int32_t bestMethodScore[METHOD_MAX];
|
|
memset ( bestMethodScore , 0 , METHOD_MAX * 4 );
|
|
|
|
//
|
|
// ok, now we made the cut, so scan list of brothers again
|
|
// for potential partition tags. then get score of each
|
|
// partition and prefer the highest scoring one.
|
|
//
|
|
// reset scan
|
|
//bro = sk;
|
|
// assume no winner
|
|
int32_t bestScore = 0;
|
|
Section *bestBro = NULL;
|
|
char bestMethod = -1;
|
|
Partition *bestPart = NULL;
|
|
// loop over all enumerated methods
|
|
for ( int32_t m = 0 ; m < METHOD_MAX ; m++ ) {
|
|
// try skipping these
|
|
if ( m == METHOD_TAGID ) continue;
|
|
if ( m == METHOD_INNER_TAGID ) continue;
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// reset for each method
|
|
bro = sk;
|
|
// until the end of the list
|
|
for ( ; bro ; bro = bro->m_nextBrother ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip bad ones
|
|
if ( bro->m_flags & badFlags ) continue;
|
|
// grab next partition
|
|
Partition tmpp;
|
|
// . try each tagid once i guess
|
|
// . returns -1 if we already did it!
|
|
int32_t score = getDelimScore(sk,m,bro,&tmpp);
|
|
// error? g_errno should be set
|
|
if ( score == -3 ) return -1;
|
|
// skip if not legit
|
|
if ( score == -2 ) {
|
|
// this happens we we have a TON
|
|
// of brothers!! so give up on them!
|
|
// not it happens when bro tag type
|
|
// is not compatible with method.
|
|
// like for santafeplayhouse.org!
|
|
// for sections around the performance
|
|
// date lines.
|
|
continue;
|
|
//break;
|
|
}
|
|
// strange?
|
|
if ( score == -1 ) continue;
|
|
// score now based on where you start
|
|
score = 1000000 - bro->m_a;
|
|
// is it best score for this method?
|
|
if ( score <= bestMethodScore[m] ) continue;
|
|
// store it
|
|
bestMethodScore[m] = score;
|
|
// if so, store it for this method
|
|
parts[m].m_np = tmpp.m_np;
|
|
for ( int32_t i = 0 ; i < tmpp.m_np ; i++ ) {
|
|
// rbeathe
|
|
QUICKPOLL(m_niceness);
|
|
parts[m].m_a[i] = tmpp.m_a[i];
|
|
parts[m].m_b[i] = tmpp.m_b[i];
|
|
parts[m].m_firstBro[i] =
|
|
tmpp.m_firstBro[i];
|
|
}
|
|
// skip if not best overall
|
|
if ( score <= bestScore ) continue;
|
|
// is best of all methods so far?
|
|
bestScore = score;
|
|
bestBro = bro;
|
|
bestMethod = m;
|
|
bestPart = &parts[m];
|
|
}
|
|
}
|
|
|
|
// if this list had no viable partitions, skip it
|
|
if ( bestMethod == -1 ) continue;
|
|
|
|
/*
|
|
Partition *bestSuper = NULL;
|
|
int32_t bestSuperMethod;
|
|
// . before inserting the winning partition see if another
|
|
// partition is a "super partition" of that and insert that
|
|
// first if it is.
|
|
// . a lot of times the super partition as a smaller score
|
|
// because one section in the partition is much larger
|
|
// than another, possibly empty, section...
|
|
// . prefer the super partition with the lest # of cells
|
|
for ( int32_t m = 0 ; m < METHOD_MAX ; m++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if winner
|
|
if ( m == bestMethod ) continue;
|
|
// if no partition here, skip it
|
|
if ( bestMethodScore[m] <= 0 ) continue;
|
|
// shorcut
|
|
Partition *super = &parts[m];
|
|
Partition *sub = bestPart;
|
|
// need at least one partition
|
|
if ( super->m_np <= 1 ) continue;
|
|
// skip if same number of partitions or more
|
|
if ( super->m_np >= sub->m_np ) continue;
|
|
// must start ABOVE first partition
|
|
if ( super->m_a[0] >= sub->m_a[0] ) continue;
|
|
// assume it is a super partition
|
|
bool isSuper = true;
|
|
// w is cursor into the subpartitions cells/intervals
|
|
int32_t w = 0;
|
|
// now every startpoint in the super partition must
|
|
// be right before a point in the subpartition
|
|
int32_t k; for ( k = 0 ; k < super->m_np ; k++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get last partition if we have a sequence
|
|
// of empties... like if the super partition
|
|
// was <h1> tags and the sub partition was
|
|
// <h2> if we had:
|
|
// <h1>..</h1>
|
|
// <h1>..</h1>
|
|
// <h2>..</h2>
|
|
// <h1>..</h1>
|
|
// <h2>..</h2>
|
|
// then that first two adjacent <h1> tags
|
|
// should not stop the <h1> tags from
|
|
// constituting a super partition over the
|
|
// subpartition of <h2> tags
|
|
if ( k+1 < super->m_np &&
|
|
super->m_firstBro[k ]->m_alnumPosB ==
|
|
super->m_firstBro[k+1]->m_alnumPosA )
|
|
continue;
|
|
// advance "w" is need to catch up
|
|
for ( ; w < sub->m_np ; w++ )
|
|
if ( sub->m_a[w] >= super->m_a[k] )
|
|
break;
|
|
// if w is exhausted we are done
|
|
if ( w >= sub->m_np ) break;
|
|
// . now compare to next guy in subpartition
|
|
// . the first bros must be adjacent
|
|
// . i.e. <h1> tag must be adjacent to <h2> tag
|
|
if ( super->m_firstBro[k]->m_alnumPosB ==
|
|
sub ->m_firstBro[w]->m_alnumPosA )
|
|
continue;
|
|
// crap, not a match, not a super partition
|
|
isSuper = false;
|
|
break;
|
|
}
|
|
// skip if not a super partition
|
|
if ( ! isSuper ) continue;
|
|
// the best super will have the fewest partitions
|
|
if ( bestSuper && super->m_np >= bestSuper->m_np )
|
|
continue;
|
|
// we are now the best super partition
|
|
bestSuper = super;
|
|
bestSuperMethod = m;
|
|
}
|
|
// turn off for now
|
|
//bestSuper = NULL;
|
|
*/
|
|
|
|
// select the winning partition
|
|
int32_t winnerMethod = bestMethod;
|
|
Partition *winnerPart = bestPart;
|
|
// if no super partition...
|
|
//if ( bestSuper ) {
|
|
// winnerMethod = bestSuperMethod;
|
|
// winnerPart = bestSuper;
|
|
//}
|
|
|
|
|
|
// log it
|
|
char *ms = "";
|
|
if ( winnerMethod == METHOD_TAGID ) ms = "tagid";
|
|
if ( winnerMethod == METHOD_DOM ) ms = "dom";
|
|
if ( winnerMethod == METHOD_DOM_PURE ) ms = "dompure";
|
|
if ( winnerMethod == METHOD_ABOVE_DOM ) ms = "abovedom";
|
|
if ( winnerMethod == METHOD_DOW ) ms = "dow";
|
|
if ( winnerMethod == METHOD_DOW_PURE ) ms = "dowpure";
|
|
if ( winnerMethod == METHOD_MONTH_PURE ) ms = "monthpure";
|
|
if ( winnerMethod == METHOD_ABOVE_DOW ) ms = "abovedow";
|
|
if ( winnerMethod == METHOD_INNER_TAGID ) ms = "innertagid";
|
|
if ( winnerMethod == METHOD_ABOVE_ADDR ) ms = "aboveaddr";
|
|
//if ( winnerMethod == METHOD_ABOVE_TOD ) ms = "abovetod";
|
|
//if ( winnerMethod == METHOD_EMPTY_TAG ) ms = "emptytag";
|
|
//if ( winnerMethod == METHOD_ATTRIBUTE ) ms = "attribute";
|
|
log("sections: inserting winner method=%s",ms);
|
|
|
|
// loop over his partitions and insert each one
|
|
for ( int32_t i = 0 ; i < winnerPart->m_np ; i++ ) {
|
|
// get partition
|
|
int32_t a = winnerPart->m_a[i];
|
|
int32_t b = winnerPart->m_b[i];
|
|
// if it is a container around another container
|
|
// then it is pointless
|
|
Section *cc = m_sectionPtrs[a];
|
|
if ( cc && cc->m_a == a && cc->m_b == b ) continue;
|
|
// this returns false and sets g_errno on error
|
|
if ( ! insertSubSection( sk->m_parent ,
|
|
winnerPart->m_a[i],
|
|
winnerPart->m_b[i],
|
|
BH_IMPLIED ) )
|
|
return -1;
|
|
}
|
|
|
|
// ok, flag it
|
|
addedPartition = true;
|
|
}
|
|
|
|
// return now if added no partitions
|
|
if ( ! addedPartition ) return 0;
|
|
|
|
// we gotta fix m_nextBrother,m_prevBrother after doing this
|
|
setNextBrotherPtrs ( false );
|
|
|
|
return 1;
|
|
}
|
|
|
|
// . vec? now has dups!
|
|
float computeSimilarity2 ( int32_t *vec0 ,
|
|
int32_t *vec1 ,
|
|
int32_t *s0 , // corresponding scores vector
|
|
int32_t *s1 , // corresponding scores vector
|
|
int32_t niceness ,
|
|
SafeBuf *pbuf ,
|
|
HashTableX *labelTable ,
|
|
int32_t nv0 ,
|
|
int32_t nv1 ) {
|
|
// if both empty, assume not similar at all
|
|
if ( *vec0 == 0 && *vec1 == 0 ) return 0;
|
|
// if either is empty, return 0 to be on the safe side
|
|
if ( *vec0 == 0 ) return 0;
|
|
if ( *vec1 == 0 ) return 0;
|
|
|
|
HashTableX ht;
|
|
char hbuf[200000];
|
|
int32_t phsize = 200000;
|
|
char *phbuf = hbuf;
|
|
// how many slots to allocate initially?
|
|
int32_t need = 1024;
|
|
if ( nv0 > 0 ) need = nv0 * 4;
|
|
// do not use the buf on stack if need more...
|
|
if ( need > 16384 ) { phbuf = NULL; phsize = 0; }
|
|
// allow dups!
|
|
if ( ! ht.set ( 4,4,need,phbuf,phsize,true,niceness,"xmlqvtbl2"))
|
|
return -1;
|
|
|
|
// for deduping labels
|
|
HashTableX dedupLabels;
|
|
if ( labelTable )
|
|
dedupLabels.set(4,4,need,NULL,0,false,niceness,"xmldelab");
|
|
|
|
bool useScores = (bool)s0;
|
|
|
|
int32_t matches = 0;
|
|
int32_t total = 0;
|
|
|
|
int32_t matchScore = 0;
|
|
int32_t totalScore = 0;
|
|
|
|
// hash first vector. accumulating score total and total count
|
|
for ( int32_t *p = vec0; *p ; p++ , s0++ ) {
|
|
// count it
|
|
total++;
|
|
// get it
|
|
int32_t score = 1;
|
|
// get the score if valid
|
|
if ( useScores ) score = *s0;
|
|
// total it up
|
|
totalScore += score;
|
|
// accumulate all the scores into this one bucket
|
|
// in the case of p being a dup
|
|
if ( ! ht.addTerm32 ( p , score ) ) return -1;
|
|
}
|
|
|
|
//int32_t zero = 0;
|
|
|
|
// see what components of this vector match
|
|
for ( int32_t *p = vec1; *p ; p++ , s1++ ) {
|
|
// count it
|
|
total++;
|
|
// get it
|
|
int32_t score = 1;
|
|
// get the score if valid
|
|
if ( useScores ) score = *s1;
|
|
// and total scores
|
|
totalScore += score;
|
|
// is it in there?
|
|
int32_t slot = ht.getSlot ( p );
|
|
// skip if unmatched
|
|
if ( slot < 0 ) {
|
|
// skip if not debugging
|
|
if ( ! pbuf ) continue;
|
|
// get the label ptr
|
|
char *pptr = (char *)labelTable->getValue((int32_t *)p);
|
|
// record label in safeBuf if provided
|
|
dedupLabels.addTerm32 ( (int32_t *)&pptr , score );
|
|
//dedupLabels.addTerm32 ( (int32_t *)p );
|
|
// and then skip
|
|
continue;
|
|
}
|
|
|
|
// otherwise, it is a match!
|
|
matches++;
|
|
|
|
// and score of what we matched
|
|
int32_t *val = (int32_t *)ht.getValueFromSlot ( slot );
|
|
// sanity check. does "vec1" have dups in it? shouldn't...
|
|
if ( *val == 0 ) { char *xx=NULL;*xx=0; }
|
|
// we get the min
|
|
int32_t minScore ;
|
|
if ( *val < score ) minScore = *val;
|
|
else minScore = score;
|
|
|
|
// only matched "min" of them times 2!
|
|
matchScore += 2 * minScore;
|
|
// he is hit too
|
|
//matchScore += *val;
|
|
// how many were unmatched?
|
|
int32_t unmatched = *val + score - (2*minScore);
|
|
|
|
// remove it as we match it to deal with dups
|
|
// once we match it once, do not match again, score was
|
|
// already accumulated
|
|
// otherwise, remove this dup and try to match any
|
|
// remaining dups in the table
|
|
ht.setValue ( slot , &unmatched ); // &zero
|
|
//ht.removeSlot ( slot );
|
|
}
|
|
|
|
// for debug add all remaining into dedup table
|
|
for ( int32_t i = 0 ; labelTable && i < ht.getNumSlots(); i++ ) {
|
|
QUICKPOLL(niceness);
|
|
if ( ! ht.m_flags[i] ) continue;
|
|
uint32_t *unmatched = (uint32_t *)ht.getValueFromSlot (i);
|
|
if ( *unmatched == 0 ) continue;
|
|
// use the key to get the label ptr
|
|
int32_t key = *(int32_t *)ht.getKeyFromSlot(i);
|
|
char *pptr = (char *)labelTable->getValue(&key);
|
|
dedupLabels.addTerm32 ( (int32_t *)&pptr , *unmatched );
|
|
}
|
|
|
|
|
|
// if after subtracting query terms we got no hits, return 0.framesets?
|
|
if ( useScores && totalScore == 0 ) return 0;
|
|
if ( total == 0 ) return 0;
|
|
// . what is the max possible score we coulda had?
|
|
// . subtract the vector components that matched a query term
|
|
float percent = 100 * (float)matchScore / (float)totalScore;
|
|
// sanity
|
|
if ( percent > 100 ) { char *xx=NULL;*xx=0; }
|
|
|
|
if ( ! labelTable ) return percent;
|
|
|
|
// scan label table for labels
|
|
for ( int32_t i = 0 ; i < dedupLabels.getNumSlots(); i++ ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// skip empties
|
|
if ( ! dedupLabels.m_flags[i] ) continue;
|
|
// get hash
|
|
char **pptr = (char **)dedupLabels.getKeyFromSlot(i);
|
|
// and count
|
|
int32_t count = dedupLabels.getScoreFromSlot(i);
|
|
// get label and count
|
|
char *str = *pptr;//(char *)labelTable->getValue(&h);
|
|
if ( count != 1 ) pbuf->safePrintf ( "%s(%"INT32") ",str,count);
|
|
else pbuf->safePrintf ( "%s ",str);
|
|
}
|
|
|
|
return percent;
|
|
}
|
|
|
|
// . returns -1 if already got score for the suggested delimeter
|
|
// . returns -2 if suggested section cannot be a delimeter for the
|
|
// suggested method
|
|
// . returns -3 with g_errno set on error
|
|
// . returns 0 if no viable partition exists which has a domdow/tod pair
|
|
// in one interval of the partition and a domdow/tod pair in another
|
|
// interval of the partition
|
|
// . otherwise returns a positive score of the strength of the partition
|
|
// . assumes all sections with the same "getDelimHash() as "delim" are the
|
|
// first section in a particular partition cell
|
|
int32_t Sections::getDelimScore ( Section *bro ,
|
|
char method ,
|
|
Section *delim ,
|
|
Partition *part ) {
|
|
|
|
// save it
|
|
Section *start = bro;
|
|
|
|
int32_t dh = getDelimHash ( method , delim , start );
|
|
|
|
// bro must be certain type for some methods
|
|
if ( dh == -1 ) return -2;
|
|
|
|
// did we already do this dh?
|
|
if ( m_ct.isInTable ( &dh ) ) return -1;
|
|
|
|
// ignore brothers that are one of these tagids
|
|
sec_t badFlags = SEC_SELECT|SEC_SCRIPT|SEC_STYLE|SEC_HIDDEN;
|
|
|
|
// get the containing section
|
|
Section *container = bro->m_parent;
|
|
// sanity check... should all be brothers (same parent)
|
|
if ( delim->m_parent != container ) { char *xx=NULL;*xx=0; }
|
|
|
|
// the head section of a particular partition's section
|
|
Section *currDelim = bro;
|
|
// scores
|
|
int32_t brosWithWords = 0;
|
|
int32_t maxBrosWithWords = 0;
|
|
int32_t bonus1 = 0;
|
|
int32_t bonus2 = 0;
|
|
int32_t bonus3 = 0;
|
|
#define MAX_COMPONENTS 15000
|
|
int32_t pva[MAX_COMPONENTS];
|
|
int32_t pvb[MAX_COMPONENTS];
|
|
int32_t sva[MAX_COMPONENTS];
|
|
int32_t svb[MAX_COMPONENTS];
|
|
int32_t nva = 0;
|
|
int32_t nvb = 0;
|
|
pva[0] = 0;
|
|
pvb[0] = 0;
|
|
int32_t *pvec = NULL;
|
|
int32_t *pnum = NULL;
|
|
int32_t *pscore = NULL;
|
|
bool firstDelim = true;
|
|
float simTotal = 0;
|
|
float minSim = 101.0;
|
|
int32_t mina1 = -2;
|
|
int32_t mina2 = -2;
|
|
int32_t minTotalComponents=0;
|
|
Section *prevPrevStart = NULL;
|
|
Section *prevStart = NULL;
|
|
int32_t simCount = 0;
|
|
int32_t inserts = 0;
|
|
int32_t skips = 0;
|
|
|
|
// no longer allow dups, keep a count of each hash now
|
|
char vhtbuf[92000];
|
|
HashTableX vht;
|
|
vht.set ( 4, 4 ,256,vhtbuf,92000,false,m_niceness,"vhttab");
|
|
int32_t cellCount = 0;
|
|
SafeBuf minBuf;
|
|
|
|
HashTableX labels;
|
|
labels.set ( 4,65,20000,NULL,0,false,m_niceness,"lbldbug");
|
|
HashTableX *dbt = NULL;
|
|
SafeBuf sb;
|
|
SafeBuf *pbuf = NULL;
|
|
|
|
//
|
|
// assign for debug DEBUG DEBUG implied sections
|
|
//
|
|
//dbt = &labels;
|
|
//pbuf = &sb;
|
|
|
|
int32_t np = 0;
|
|
int32_t nonDelims = 0;
|
|
|
|
bool ignoreAbove = true;
|
|
// if delimeter is like an hr tag without text in it, then do include
|
|
// the stuff above its first occurrence as part of the partition,
|
|
// otherwise, assume the stuff above the first occurrence of "delim"
|
|
// is just header junk and should not be included. fixes
|
|
// dailylobo.com which has an "<h2>" header to the brother list
|
|
// which should not be part of the partition we use.
|
|
//if ( delim->m_firstWordPos < 0 ) ignoreAbove = false;
|
|
|
|
// reset prev sentence
|
|
Section *prevSent = NULL;
|
|
Section *lastBro = NULL;
|
|
|
|
// scan the brothers
|
|
for ( ; ; bro = bro->m_nextBrother ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if bad
|
|
if ( bro && (bro->m_flags & badFlags) ) continue;
|
|
|
|
// if any brother is an implied section, stop!
|
|
if ( bro && (bro->m_baseHash == BH_IMPLIED ) ) return -2;
|
|
|
|
// get its hash
|
|
int32_t h = 0LL ;
|
|
if ( bro ) h = getDelimHash ( method , bro , start );
|
|
|
|
// . check this out
|
|
// . don't return 0 because we make a vector of these hashes
|
|
// and computeSimilarity() assumes vectors are NULL term'd
|
|
if ( bro && h == 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// once we hit the delimeter we stop ignoring
|
|
if ( h == dh ) ignoreAbove = false;
|
|
|
|
// if first time, ignore crap above the first delimeter occurnc
|
|
if ( ignoreAbove ) continue;
|
|
|
|
// count the section delimeter itself in this if not firstTime,
|
|
// and then we need two sections after that...?
|
|
//int32_t need = 3;
|
|
// i guess the first time is reduced since it started
|
|
// the secCount at 0, and normally it starts at 1 (see below)
|
|
// but if we have like an hr tag delimeter then we only
|
|
// need two sections above it. this fixed cabq.gov so it
|
|
// got the first implied section and no longer missed it.
|
|
//if ( firstTime ) need = 2;
|
|
|
|
// update this for insertSubSection()
|
|
lastBro = bro;
|
|
|
|
if ( h == dh )
|
|
currDelim = bro;
|
|
|
|
// count non delimeter sections. at least one section
|
|
// must have text and not be a delimeter section
|
|
if ( h != dh && bro && bro->m_firstWordPos >= 0 )
|
|
nonDelims++;
|
|
|
|
// start a new partition?
|
|
if ( h == dh ) {
|
|
// start new one
|
|
part->m_a [np] = bro->m_a;
|
|
part->m_b [np] = bro->m_b;
|
|
part->m_firstBro [np] = bro;
|
|
np++;
|
|
part->m_np = np;
|
|
// if out of buffer space, note it and just do not
|
|
// do this partition
|
|
if ( np >= MAXCELLS ) {
|
|
log("sec: partition too big!!!");
|
|
return -2;
|
|
}
|
|
}
|
|
// always extend current partition
|
|
else if ( np > 0 && bro ) {
|
|
part->m_b[np-1] = bro->m_b;
|
|
}
|
|
|
|
|
|
// did we finalize a cell in the partition?
|
|
bool getSimilarity = false;
|
|
// if we hit a delimiting brothers, calculate the similarity
|
|
// of the previous brothers
|
|
if ( h == dh ) getSimilarity = true;
|
|
// if we end the list of brothers...
|
|
if ( ! bro ) getSimilarity = true;
|
|
// empty partition?
|
|
if ( vht.isTableEmpty() ) getSimilarity = false;
|
|
// turn off for now
|
|
//mdwmdwgetSimilarity = false;
|
|
|
|
// only need this for tag based method now
|
|
//if ( method != METHOD_TAGID ) getSimilarity = false;
|
|
|
|
// convert our hashtable into a vector and compare to
|
|
// vector of previous partition cell if we hit a delimeter
|
|
// section or have overrun the list (bro == NULL)
|
|
if ( getSimilarity ) {
|
|
|
|
// if we have been hashing sentences in the previous
|
|
// brothers, then hash the last sentence as a previous
|
|
// sentence to a NULL sentence after it.
|
|
// as a kind of boundary thing. i.e. "*last* sentence
|
|
// is in header tag, etc." just like how we hash the
|
|
// first sentence with "NULL" as the previous sentence.
|
|
if (!hashSentBits(NULL,&vht,container,0,dbt,NULL))
|
|
return -3;
|
|
|
|
if (!hashSentPairs(prevSent,NULL,&vht,container,dbt))
|
|
return -3;
|
|
|
|
// reset this since it is talking about sentences
|
|
// just in the partition cell
|
|
prevSent = NULL;
|
|
// inc this for flip flopping which vector we use
|
|
cellCount++;
|
|
// what vector was used last?
|
|
if ( (cellCount & 0x01) == 0x00 ) {
|
|
pvec = pvb;
|
|
pnum = &nvb;
|
|
pscore = svb;
|
|
}
|
|
else {
|
|
pvec = pva;
|
|
pnum = &nva;
|
|
pscore = sva;
|
|
}
|
|
// reset vector size
|
|
*pnum = 0;
|
|
// convert vht to vector
|
|
for ( int32_t i = 0 ; i < vht.m_numSlots ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if empty
|
|
if ( ! vht.m_flags[i] ) continue;
|
|
// add it otherwise
|
|
pvec[*pnum] = *(int32_t *)(&vht.m_keys[i*4]);
|
|
// add score
|
|
pscore[*pnum] = *(int32_t *)(&vht.m_vals[i*4]);
|
|
// sanity
|
|
if ( pscore[*pnum] <= 0){char *xx=NULL;*xx=0;}
|
|
// inc component count
|
|
*pnum = *pnum + 1;
|
|
// how is this?
|
|
if(*pnum>=MAX_COMPONENTS){char *xx=NULL;*xx=0;}
|
|
}
|
|
// null temrinate
|
|
pvec[*pnum] = 0;
|
|
// . this func is defined in XmlDoc.cpp
|
|
// . vec0 is last partitions vector of delim hashes
|
|
// . this allows us to see if each section in our
|
|
// partition consists of the same sequence of
|
|
// section "types"
|
|
// . TODO: just compare pva to pvb and vice versa and
|
|
// then take the best one of those two compares!
|
|
// that way one can be a SUBSET of the other and
|
|
// we can get a 100% "sim"
|
|
float sim = computeSimilarity2( pva ,
|
|
pvb ,
|
|
sva , // scores
|
|
svb , // scores
|
|
m_niceness ,
|
|
pbuf ,
|
|
dbt ,
|
|
nva ,
|
|
nvb );
|
|
// add up all sims
|
|
if ( cellCount >= 2 ) { // ! firstTime ) {
|
|
simTotal += sim;
|
|
simCount++;
|
|
}
|
|
if ( cellCount >= 2 && sim < minSim ) {
|
|
minSim = sim;
|
|
if ( prevPrevStart ) mina1=prevPrevStart->m_a;
|
|
else mina1 = -1;
|
|
if ( prevStart ) mina2 = prevStart->m_a;
|
|
else mina2 = -1;
|
|
minTotalComponents = nva + nvb;
|
|
// copy to our buf then
|
|
if ( pbuf )
|
|
minBuf.safeMemcpy ( pbuf );
|
|
}
|
|
// a new head
|
|
//currDelim = bro;
|
|
// reset vht for next partition cell to call
|
|
// hashSentenceBits() into
|
|
vht.clear();
|
|
}
|
|
|
|
if ( h == dh && brosWithWords >= 1 && ! firstDelim )
|
|
inserts++;
|
|
else if ( h == dh && ! firstDelim )
|
|
skips++;
|
|
else if ( ! bro )
|
|
inserts++;
|
|
|
|
// sometimes we have a couple of back to back lines
|
|
// that are like "M-F 8-5\n" and "Saturdays 8-6" and we do not
|
|
// want them to make implied sections because it would
|
|
// split them up weird like for unm.edu.
|
|
// unm.edu had 3 sentences:
|
|
// "9 am. - 6 pm. Mon. - Sat.\n"
|
|
// "Thur. 9 am. - 7 pm. Sun. 10 am - 4 pm.\n"
|
|
// "Books, Furniture, Toys, TV's, Jewelry, Household Items\n"
|
|
// and we were making an implied section around the last
|
|
// two sentences, which messed everything up.
|
|
// so let's add this code here to fix that.
|
|
if ( h == dh &&
|
|
// this means basically back-to-back delimiters
|
|
brosWithWords <= 1 &&
|
|
// if we got a timeofday that is indicative of a schedule
|
|
(bro->m_flags & SEC_HAS_TOD) &&
|
|
! firstDelim )
|
|
return -2;
|
|
|
|
// reset some stuff
|
|
if ( h == dh ) {
|
|
firstDelim = false;
|
|
brosWithWords = 0;
|
|
prevPrevStart = prevStart;
|
|
prevStart = bro;
|
|
}
|
|
|
|
// . count sections.
|
|
// . now we only count if they have text to avoid pointless
|
|
// implied sections for folkmads.org
|
|
// . do not count delimeter sections towards this since
|
|
// delimeter sections START a partition cell
|
|
if ( bro && bro->m_firstWordPos >= 0 )
|
|
brosWithWords++;
|
|
|
|
// keep a max on # of brothers with words in a given
|
|
// partition cell. if all have just one such section
|
|
// then no need to partition at all!
|
|
if ( brosWithWords > maxBrosWithWords )
|
|
maxBrosWithWords = brosWithWords;
|
|
|
|
//if ( h == dh && delim->m_a==525 && bro->m_a>=521 &&
|
|
// container->m_a == 521 &&
|
|
// method == METHOD_ABOVE_DOM ) //ATTRIBUTE )
|
|
// log("hey");
|
|
//if ( h == dh &&
|
|
// delim->m_a==657 &&
|
|
// //bro->m_a>=521 &&
|
|
// container->m_a == 655 &&
|
|
// method == METHOD_INNER_TAGID )
|
|
// log("hey");
|
|
|
|
// scan all sentences in this section and use them to
|
|
// make a vector which we store in the hashtable "vht"
|
|
for ( Section *sx=bro; sx &&sx->m_a<bro->m_b;sx = sx->m_next) {
|
|
// mdwmdw break;
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not sentence
|
|
if ( ! (sx->m_flags & SEC_SENTENCE) ) continue;
|
|
// . store its vector components into hashtable
|
|
// . store tagids preceeding sentence
|
|
// . store tagid pairs preceeding sentence
|
|
// . store SENT_* flags
|
|
// . store SENT_* flags with prev sent SENT_* flags
|
|
// . store hash of first word? last word?
|
|
// . store SEC_HAS_MONTH/TOD/DOM/...
|
|
// . should particular singles or pairs be weighted
|
|
// more than normal when doing the final compare
|
|
// between two adjacent implied sections in
|
|
// the partition?
|
|
if (!hashSentBits(sx ,&vht,container,0,dbt,NULL))
|
|
return -3;
|
|
if (!hashSentPairs(prevSent,sx,&vht,container,dbt))
|
|
return -3;
|
|
// set this
|
|
prevSent = sx;
|
|
}
|
|
// stop when brother list exhausted
|
|
if ( ! bro ) break;
|
|
}
|
|
|
|
// cache it
|
|
if ( ! m_ct.addKey ( &dh ) ) return -3;
|
|
|
|
//if ( last != currDelim && brosWithWords >= 2 )
|
|
// inserts++;
|
|
//else if ( last != currDelim )
|
|
// skips++;
|
|
|
|
// if no winners mdwmdw
|
|
if ( minSim > 100.0 ) return -2;
|
|
|
|
if ( maxBrosWithWords <= 1 ) return -2;
|
|
|
|
if ( inserts <= 1 ) return -2;
|
|
|
|
// at least one brother must NOT be a delimeter section and have text
|
|
if ( nonDelims <= 0 ) return -2;
|
|
|
|
// . empty tags always win
|
|
// . this should not hurt anything...
|
|
// . do this AFTER we've inserted all the big sections because it
|
|
// hurts large partitions like for "AUGUST" for cabq.gov/museums...
|
|
// because that needs to be in one big section
|
|
// if ( method == METHOD_EMPTY_TAG ) minSim = 200.0;
|
|
|
|
//if ( minSim < 40.0 ) return -2;
|
|
|
|
//if ( minSim < 60.0 && inserts == 2 ) return -2;
|
|
|
|
// . METHOD_TAGID has to be really strong to work, since tags are
|
|
// so flaky in general
|
|
// . dailylobo.com depends on this but shold probably use a
|
|
// METHOD_ABOVE_TOD_PURE or something or do hr splits before
|
|
// adding implied sections?
|
|
// . this should also fix santafeplayhouse.org from adding tagid
|
|
// based implied sections that are screwing it up by grouping its
|
|
// tods up with the address below rather than the address above,
|
|
// since the address below is already claimed by the 6pm parking time
|
|
if ( minSim < 80.00 && method == METHOD_TAGID ) return -2;
|
|
|
|
if ( minSim < 80.00 && method == METHOD_INNER_TAGID ) return -2;
|
|
|
|
// special case. require at least two DOM sections for this method.
|
|
// this fixes
|
|
// www.trumba.com/calendars/albuquerque-area-events-calendar.rss
|
|
// where we were splitting its <description> tag for the win your
|
|
// own home raffle so that the date of the event got into the same
|
|
// section as a location of where the home you could win was, instead
|
|
// of being in the same implied section as the event address. now
|
|
// that i added this constraint we once again partition by the double
|
|
// br tags so it works right now. we needed the METHOD_BR_DOM to fix
|
|
// www.guysndollsllc.com/page5/page4/page4.html which was not
|
|
// splitting well with the METHOD_DOM i guess because "skips" was
|
|
// too high? because some sentences contained just the dom and the
|
|
// event description and perhaps were seen as contentless partitions??
|
|
// perhaps we could also fix METHOD_DOM to deal with that rather than
|
|
// add this new method, METHOD_BR_DOM
|
|
// MDW: see if trumba still works without this!
|
|
//if ( method == METHOD_BR_DOM && inserts <= 2 )
|
|
// return -2;
|
|
|
|
// add the last part of the list
|
|
//if ( partition && last != currDelim && brosWithWords >= 2 )
|
|
// // if inserting implied sections, add the previous one
|
|
// if ( ! insertSubSection ( currDelim->m_parent ,
|
|
// currDelim->m_a ,
|
|
// last->m_b , // end
|
|
// BH_IMPLIED ))
|
|
// // return -3 with g_errno set on error
|
|
// return -3;
|
|
|
|
// return -2 if no applicable partition of two or more intervals
|
|
//if ( inserts <= 1 ) return -2;
|
|
|
|
// super bonus if all 100% (exactly the same)
|
|
float avgSim = simTotal / simCount;
|
|
|
|
// use this now
|
|
// . very interesting: it only slightly insignificantly changes
|
|
// one url, graypanthers, if we use the avgSim vs. the minSim.
|
|
// . i prefer minSim because it does not give an advantage to
|
|
// many smaller sections vs. fewer larger sections in partition.
|
|
// . later we should probably consider doing a larger partition first
|
|
// then partitioning those larger sections further. like looking
|
|
// ahead a move in a chess game. should better partition
|
|
// santafeplayhouse.org methinks this way
|
|
//bonus1 = (int32_t)(avgSim * 100);
|
|
bonus1 = (int32_t)(minSim * 1000);
|
|
|
|
// i added the "* 2" to fix unm.edu because it was starting each
|
|
// section the the <table> tag section but had a high badCount,
|
|
// so this is kind of a hack...
|
|
//int32_t total = goodCount * 10000 - badCount * 2 * 9000;
|
|
int32_t total = 0;
|
|
total += bonus1;
|
|
|
|
// debug output
|
|
char *ms = "";
|
|
if ( method == METHOD_TAGID ) ms = "tagid";
|
|
if ( method == METHOD_DOM ) ms = "dom";
|
|
if ( method == METHOD_DOM_PURE ) ms = "dompure";
|
|
if ( method == METHOD_ABOVE_DOM ) ms = "abovedom";
|
|
if ( method == METHOD_DOW ) ms = "dow";
|
|
if ( method == METHOD_DOW_PURE ) ms = "dowpure";
|
|
if ( method == METHOD_MONTH_PURE ) ms = "monthpure";
|
|
if ( method == METHOD_ABOVE_DOW ) ms = "abovedow";
|
|
if ( method == METHOD_INNER_TAGID ) ms = "innertagid";
|
|
if ( method == METHOD_ABOVE_ADDR ) ms = "aboveaddr";
|
|
//if ( method == METHOD_ABOVE_TOD ) ms = "abovetod";
|
|
//if ( method == METHOD_EMPTY_TAG ) ms = "emptytag";
|
|
//if ( method == METHOD_ATTRIBUTE ) ms = "attribute";
|
|
|
|
// skip this for now
|
|
//return total;
|
|
|
|
logf(LOG_DEBUG,"sec: 1stbro=%"UINT32" "
|
|
"nondelims=%"INT32" "
|
|
"total=%"INT32" "
|
|
"bonus1=%"INT32" "
|
|
"bonus2=%"INT32" "
|
|
"bonus3=%"INT32" "
|
|
"avgSim=%.02f "
|
|
"minSim=%.02f "
|
|
"totalcomps=%"INT32" "
|
|
"inserts=%"INT32" "
|
|
"skips=%"INT32" "
|
|
"containera=%"INT32" "
|
|
//"goodcount=%"INT32" badcount=%"INT32" "
|
|
"dhA=%"INT32" method=%s",
|
|
start->m_a,
|
|
nonDelims,
|
|
total,
|
|
bonus1,
|
|
bonus2,
|
|
bonus3,
|
|
avgSim,
|
|
minSim,
|
|
minTotalComponents,
|
|
inserts,
|
|
skips,
|
|
(int32_t)container->m_a,
|
|
//goodCount,badCount,
|
|
delim->m_a,ms);
|
|
|
|
// show the difference in the two adjacent brother sections that
|
|
// resulted in the min similarity
|
|
// NOTE: using the log() it was truncating us!! so use stderr
|
|
fprintf(stderr,"sec: mina1=%"INT32" mina2=%"INT32" "
|
|
"missingbits=%s\n", mina1,mina2, minBuf.getBufStart());
|
|
|
|
// return score
|
|
return total;
|
|
}
|
|
|
|
|
|
// . maps a single SENT_* sentence flag to its short debug label
// . "sf" must be exactly one of the flags listed below. anything else,
//   including a combination of flags, cores on purpose.
char *getSentBitLabel ( sentflags_t sf ) {
	// flag/label pairs. searched from the top, first match wins.
	struct SentLabel {
		sentflags_t  m_flag;
		const char  *m_label;
	};
	static const SentLabel s_labels[] = {
		{ (sentflags_t)SENT_HAS_COLON           , "hascolon"          },
		{ (sentflags_t)SENT_AFTER_COLON         , "aftercolon"        },
		{ (sentflags_t)SENT_BAD_FIRST_WORD      , "badfirstword"      },
		{ (sentflags_t)SENT_MIXED_CASE          , "mixedcase"         },
		{ (sentflags_t)SENT_MIXED_CASE_STRICT   , "mixedcasestrict"   },
		{ (sentflags_t)SENT_POWERED_BY          , "poweredby"         },
		{ (sentflags_t)SENT_MULT_EVENTS         , "multevents"        },
		{ (sentflags_t)SENT_PAGE_REPEAT         , "pagerepeat"        },
		{ (sentflags_t)SENT_NUMBERS_ONLY        , "numbersonly"       },
		{ (sentflags_t)SENT_IN_ADDRESS          , "inaddr"            },
		{ (sentflags_t)SENT_SECOND_TITLE        , "secondtitle"       },
		{ (sentflags_t)SENT_IS_DATE             , "allwordsindate"    },
		{ (sentflags_t)SENT_LAST_STOP           , "laststop"          },
		{ (sentflags_t)SENT_NUMBER_START        , "numberstarts"      },
		{ (sentflags_t)SENT_IN_HEADER           , "inheader"          },
		{ (sentflags_t)SENT_IN_LIST             , "inlist"            },
		{ (sentflags_t)SENT_IN_BIG_LIST         , "inbiglist"         },
		{ (sentflags_t)SENT_COLON_ENDS          , "colonends"         },
		{ (sentflags_t)SENT_IN_ADDRESS_NAME     , "inaddressname"     },
		{ (sentflags_t)SENT_IN_TITLEY_TAG       , "intitleytag"       },
		{ (sentflags_t)SENT_CITY_STATE          , "citystate"         },
		{ (sentflags_t)SENT_PRICEY              , "pricey"            },
		{ (sentflags_t)SENT_HAS_PRICE           , "hasprice"          },
		{ (sentflags_t)SENT_PERIOD_ENDS         , "periodends"        },
		{ (sentflags_t)SENT_HAS_PHONE           , "hasphone"          },
		{ (sentflags_t)SENT_IN_MENU             , "inmenu"            },
		{ (sentflags_t)SENT_MIXED_TEXT          , "mixedtext"         },
		{ (sentflags_t)SENT_TAGS                , "senttags"          },
		{ (sentflags_t)SENT_INTITLEFIELD        , "intitlefield"      },
		{ (sentflags_t)SENT_INPLACEFIELD        , "inplacefield"      },
		{ (sentflags_t)SENT_STRANGE_PUNCT       , "strangepunct"      },
		{ (sentflags_t)SENT_TAG_INDICATOR       , "tagindicator"      },
		{ (sentflags_t)SENT_INNONTITLEFIELD     , "innontitlefield"   },
		{ (sentflags_t)SENT_HASNOSPACE          , "hasnospace"        },
		{ (sentflags_t)SENT_IS_BYLINE           , "isbyline"          },
		{ (sentflags_t)SENT_NON_TITLE_FIELD     , "nontitlefield"     },
		{ (sentflags_t)SENT_TITLE_FIELD         , "titlefield"        },
		{ (sentflags_t)SENT_UNIQUE_TAG_HASH     , "uniquetaghash"     },
		{ (sentflags_t)SENT_AFTER_SENTENCE      , "aftersentence"     },
		{ (sentflags_t)SENT_WORD_SANDWICH       , "wordsandwich"      },
		{ (sentflags_t)SENT_AFTER_SPACER        , "afterspacer"       },
		{ (sentflags_t)SENT_BEFORE_SPACER       , "beforespacer"      },
		{ (sentflags_t)SENT_LOCATION_SANDWICH   , "locationsandwich"  },
		{ (sentflags_t)SENT_NUKE_FIRST_WORD     , "nukefirstword"     },
		{ (sentflags_t)SENT_FIELD_NAME          , "fieldname"         },
		{ (sentflags_t)SENT_PERIOD_ENDS_HARD    , "periodends2"       },
		{ (sentflags_t)SENT_PARENS_START        , "parensstart"       },
		{ (sentflags_t)SENT_IN_MENU_HEADER      , "inmenuheader"      },
		{ (sentflags_t)SENT_IN_TRUMBA_TITLE     , "intrumbatitle"     },
		{ (sentflags_t)SENT_PLACE_NAME          , "placename"         },
		{ (sentflags_t)SENT_CONTAINS_PLACE_NAME , "containsplacename" },
		{ (sentflags_t)SENT_FORMTABLE_FIELD     , "formtablefield"    },
		{ (sentflags_t)SENT_FORMTABLE_VALUE     , "formtablevalue"    },
		{ (sentflags_t)SENT_IN_TAG              , "intag"             },
		{ (sentflags_t)SENT_OBVIOUS_PLACE       , "obviousplace"      },
		{ (sentflags_t)SENT_HASSOMEEVENTSDATE   , "hassomeeventsdate" },
		{ (sentflags_t)SENT_HASTITLEWORDS       , "hastitlewords"     },
		{ (sentflags_t)SENT_BADEVENTSTART       , "badeventstart"     },
		{ (sentflags_t)SENT_MENU_SENTENCE       , "menusentence"      },
		{ (sentflags_t)SENT_PRETTY              , "pretty"            }
	};
	int32_t numLabels = (int32_t)(sizeof(s_labels)/sizeof(s_labels[0]));
	for ( int32_t k = 0 ; k < numLabels ; k++ ) {
		if ( s_labels[k].m_flag != sf ) continue;
		return (char *)s_labels[k].m_label;
	}
	// unknown flag. core so whoever added it adds a label for it too.
	char *xx=NULL;*xx=0;
	return NULL;
}
|
|
|
|
// . the SENT_* flags that Sections::hashSentBits() turns into vector
//   components
// . DO NOT REORDER: hashSentBits() derives each component's hash key from
//   the flag's position in this array, so moving an entry changes its key
// . flags deliberately left out:
//   SENT_PERIOD_ENDS (already have periodend2), SENT_IS_BYLINE,
//   SENT_UNIQUE_TAG_HASH, SENT_IN_MENU_HEADER, SENT_FORMAT_DUP
static sentflags_t s_sentFlags[] = {
	SENT_AFTER_SPACER    , SENT_BEFORE_SPACER    ,
	SENT_IN_TAG          , SENT_IN_HEADER        ,
	SENT_NUMBERS_ONLY    , SENT_IN_ADDRESS       ,
	SENT_IS_DATE         , SENT_NUMBER_START     ,
	SENT_IN_TITLEY_TAG   , SENT_PRICEY           ,
	SENT_HAS_PHONE       , SENT_IN_MENU          ,
	SENT_INTITLEFIELD    , SENT_STRANGE_PUNCT    ,
	SENT_INNONTITLEFIELD , SENT_NON_TITLE_FIELD  ,
	SENT_TITLE_FIELD     , SENT_FIELD_NAME       ,
	SENT_PERIOD_ENDS_HARD, SENT_PLACE_NAME       ,
	SENT_MIXED_CASE
};
|
|
|
|
// . maps a single SEC_* section flag to its short debug label
// . only the flags listed below are supported. anything else cores on
//   purpose.
char *getSecBitLabel ( sec_t sf ) {
	// flag/label pairs. searched from the top, first match wins.
	struct SecLabel {
		sec_t        m_flag;
		const char  *m_label;
	};
	static const SecLabel s_labels[] = {
		{ (sec_t)SEC_HEADING_CONTAINER , "headingcontainer" },
		{ (sec_t)SEC_PLAIN_TEXT        , "plaintext"        },
		{ (sec_t)SEC_HAS_TOD           , "hastod"           },
		{ (sec_t)SEC_HAS_DOW           , "hasdow"           },
		{ (sec_t)SEC_HAS_MONTH         , "hasmonth"         },
		{ (sec_t)SEC_HAS_DOM           , "hasdom"           }
	};
	int32_t numLabels = (int32_t)(sizeof(s_labels)/sizeof(s_labels[0]));
	for ( int32_t k = 0 ; k < numLabels ; k++ ) {
		if ( s_labels[k].m_flag != sf ) continue;
		return (char *)s_labels[k].m_label;
	}
	// unknown flag. core so whoever added it adds a label for it too.
	char *xx=NULL;*xx=0;
	return NULL;
}
|
|
|
|
// . the SEC_* section flags we have debug labels for in getSecBitLabel()
// . NOTE(review): presumably walked by index like s_sentFlags is, so keep
//   the order stable until the users of this array have been checked
static sec_t s_secFlags[] = {
	SEC_HEADING_CONTAINER , SEC_PLAIN_TEXT ,
	SEC_HAS_TOD           , SEC_HAS_DOW    ,
	SEC_HAS_MONTH         , SEC_HAS_DOM
};
|
|
|
|
#define MAXLABELSIZE 64
|
|
|
|
bool addLabel ( HashTableX *labelTable ,
|
|
int32_t key ,
|
|
char *label ) {
|
|
|
|
if ( key == 1021574190 )
|
|
log("got it");
|
|
// if in there, make sure agrees
|
|
char *ptr = (char *)labelTable->getValue (&key);
|
|
if ( ptr ) {
|
|
// compare sanity check
|
|
if ( strcmp(ptr,label) ) { char *xx=NULL;*xx=0; }
|
|
return true;
|
|
}
|
|
|
|
// see if label already exists under different key
|
|
for ( int32_t i = 0 ; i < labelTable->m_numSlots ; i++ ) {
|
|
if ( ! labelTable->m_flags[i] ) continue;
|
|
char *v = (char *)labelTable->getValueFromSlot(i);
|
|
if ( strcmp(v,label) ) continue;
|
|
int32_t k1 = *(int32_t *)labelTable->getKeyFromSlot(i);
|
|
log("sec: key=%"INT32" oldk=%"INT32"",key,k1);
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
|
|
// add it otherwise
|
|
return labelTable->addKey ( &key, label );
|
|
}
|
|
|
|
// . if sx is NULL then prevSent will be the last sentence in partition cell
|
|
// . use the hashtable
|
|
bool Sections::hashSentBits (Section *sx ,
|
|
HashTableX *vht ,
|
|
Section *container ,
|
|
uint32_t mod ,
|
|
HashTableX *labelTable ,
|
|
char *modLabel ) {
|
|
|
|
int32_t n;
|
|
int32_t count = 0;
|
|
char sbuf [ MAXLABELSIZE ];
|
|
|
|
if ( ! sx ) {
|
|
// if no mod, we do not hash sent bits for NULL sentences
|
|
if ( ! mod ) return true;
|
|
// mix up
|
|
uint32_t key = (mod << 8) ^ 72263;
|
|
// use that
|
|
if ( ! vht->addTerm32 ( &key) ) return false;
|
|
// if no debug, done
|
|
if ( ! labelTable ) return true;
|
|
// for debug
|
|
sprintf(sbuf,"%sXXXX",modLabel);
|
|
addLabel(labelTable,key,sbuf);
|
|
return true;
|
|
}
|
|
|
|
n = sizeof(s_sentFlags)/sizeof(sentflags_t);
|
|
// handle SENT_* flags
|
|
for ( int32_t i = 0 ; i < n ; i++ , count++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if bit is off
|
|
if ( ! (sx->m_sentFlags & s_sentFlags[i] ) ) continue;
|
|
// set key
|
|
uint32_t key = g_hashtab[count/256][count%256];
|
|
// mod it?
|
|
key ^= mod;
|
|
// now put that into our vector
|
|
if ( ! vht->addTerm32 ( &key ) ) return false;
|
|
// add to our label table too
|
|
if ( ! labelTable ) continue;
|
|
// convert sentence bit to text description
|
|
char *str = getSentBitLabel ( s_sentFlags[i] );
|
|
// store in buffer
|
|
if ( modLabel )
|
|
sprintf ( sbuf,"%s%s", modLabel,str );
|
|
else
|
|
sprintf ( sbuf,"%s", str );
|
|
// make sure X chars or less
|
|
if ( strlen(sbuf)>= MAXLABELSIZE-1) { char *xx=NULL;*xx=0; }
|
|
// store
|
|
if ( ! addLabel(labelTable,key,sbuf ) ) return false;
|
|
}
|
|
|
|
n = sizeof(s_secFlags)/sizeof(sec_t);
|
|
// and for SEC_* flags
|
|
for ( int32_t i = 0 ; i < n ; i++ , count++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// mod it with the value for sentence "sx"
|
|
if ( sx && ! (sx->m_flags & s_secFlags[i]) ) continue;
|
|
// set key
|
|
uint32_t key = g_hashtab[count/256][count%256];
|
|
// mod it?
|
|
key ^= mod;
|
|
// now put that into our vector
|
|
if ( ! vht->addTerm32 ( &key ) ) return false;
|
|
// add to our label table too
|
|
if ( ! labelTable ) continue;
|
|
// convert sentence bit to text description
|
|
char *str = getSecBitLabel ( s_secFlags[i] );
|
|
// store in buffer
|
|
if ( modLabel )
|
|
sprintf ( sbuf,"%s%s", modLabel,str );
|
|
else
|
|
sprintf ( sbuf,"%s", str );
|
|
// make sure X chars or less
|
|
if ( strlen(sbuf)>= MAXLABELSIZE-1) { char *xx=NULL;*xx=0; }
|
|
// store
|
|
if ( ! addLabel(labelTable,key,sbuf ) ) return false;
|
|
}
|
|
|
|
// and tag sections we are in
|
|
for ( Section *sp = sx->m_parent ; sp ; sp = sp->m_parent ) {
|
|
// stop when not in container anymore
|
|
if ( container && ! container->contains ( sp ) ) break;
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if no tag id
|
|
if ( ! sp->m_tagId ) continue;
|
|
// otherwise, use it
|
|
uint32_t key = sp->m_tagId;
|
|
// mod it?
|
|
key ^= mod;
|
|
// now put that into our vector
|
|
if ( ! vht->addTerm32 ( &key ) ) return false;
|
|
// add to our label table too
|
|
if ( ! labelTable ) continue;
|
|
// convert sentence bit to text description
|
|
char *str = getTagName(sp->m_tagId);
|
|
if ( modLabel ) sprintf ( sbuf,"%s%s", modLabel,str);
|
|
else sprintf ( sbuf,"%s", str );
|
|
// make sure X chars or less
|
|
if ( strlen(sbuf)>= MAXLABELSIZE-1) { char *xx=NULL;*xx=0; }
|
|
// store
|
|
if ( ! addLabel(labelTable,key,sbuf ) ) return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// . if sx is NULL then prevSent will be the last sentence in partition cell
|
|
// . use the hashtable
|
|
bool Sections::hashSentPairs (Section *sx ,
|
|
Section *sb ,
|
|
HashTableX *vht ,
|
|
Section *container ,
|
|
HashTableX *labelTable ) {
|
|
|
|
// only one can be NULL
|
|
if ( ! sx && ! sb ) return true;
|
|
|
|
int32_t n;
|
|
int32_t count = 0;
|
|
char *str = NULL;
|
|
char sbuf [ MAXLABELSIZE ];
|
|
|
|
if ( ! sx ) {
|
|
// mix up
|
|
uint32_t mod = 9944812;
|
|
// for debug
|
|
sprintf(sbuf,"XXXX*");
|
|
// try to pair up with that
|
|
return hashSentBits ( sb, vht,container,mod,labelTable,sbuf);
|
|
}
|
|
|
|
|
|
n = sizeof(s_sentFlags)/sizeof(sentflags_t);
|
|
// handle SENT_* flags
|
|
for ( int32_t i = 0 ; i < n ; i++ , count++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// mod it with the value for sentence "sx"
|
|
if ( sx && ! ( sx->m_sentFlags & s_sentFlags[i]) ) continue;
|
|
// set key
|
|
uint32_t key = g_hashtab[count/256][count%256];
|
|
// mod that
|
|
uint32_t mod = key << 8;
|
|
// add to our label table too
|
|
if ( labelTable ) {
|
|
// convert sentence bit to text description
|
|
char *str = getSentBitLabel ( s_sentFlags[i] );
|
|
// store in buffer
|
|
sprintf ( sbuf , "%s*", str );
|
|
// make sure X chars or less
|
|
if(strlen(sbuf)>=MAXLABELSIZE-1){char *xx=NULL;*xx=0;}
|
|
}
|
|
// hash sentenceb with that mod
|
|
if ( ! hashSentBits ( sb, vht,container,mod,labelTable,sbuf))
|
|
return false;
|
|
}
|
|
|
|
n = sizeof(s_secFlags)/sizeof(sec_t);
|
|
// and for SEC_* flags
|
|
for ( int32_t i = 0 ; i < n ; i++ , count++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// mod it with the value for sentence "sx"
|
|
if ( sx && !(sx->m_flags & s_secFlags[i] ) ) continue;
|
|
// set key
|
|
uint32_t key = g_hashtab[count/256][count%256];
|
|
// mod that
|
|
uint32_t mod = key << 8;
|
|
// add to our label table too
|
|
if ( labelTable ) {
|
|
// convert sentence bit to text description
|
|
char *str = getSecBitLabel ( s_secFlags[i] );
|
|
// store in buffer
|
|
sprintf ( sbuf , "%s*", str );
|
|
// make sure X chars or less
|
|
if(strlen(sbuf)>=MAXLABELSIZE-1){char *xx=NULL;*xx=0;}
|
|
}
|
|
// hash sentenceb with that mod
|
|
if ( ! hashSentBits ( sb, vht, container,mod,labelTable,sbuf))
|
|
return false;
|
|
}
|
|
|
|
// and tag sections we are in
|
|
for ( Section *sp = sx->m_parent ; sp ; sp = sp->m_parent ) {
|
|
// stop when not in container anymore
|
|
if ( container && ! container->contains ( sp ) ) break;
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if no tag id
|
|
if ( ! sp->m_tagId ) continue;
|
|
// otherwise, use it
|
|
uint32_t key = sp->m_tagId;
|
|
// mod that
|
|
uint32_t mod = (key<<8) ^ 45644;
|
|
// fake it
|
|
if ( labelTable ) {
|
|
// convert sentence bit to text description
|
|
sprintf(sbuf,"%s*", getTagName(sp->m_tagId) );
|
|
str = sbuf;
|
|
}
|
|
// if no mod, then do the pairs now
|
|
if ( ! hashSentBits ( sb,vht,container,mod,labelTable,str))
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
// . don't return 0 because we make a vector of these hashes
|
|
// and computeSimilarity() assumes vectors are NULL term'd. return -1 instead
|
|
int32_t Sections::getDelimHash ( char method , Section *bro , Section *head ) {
|
|
|
|
// now all must have text!
|
|
//if ( bro->m_firstWordPos < 0 ) return -1;
|
|
|
|
// if has no text give it a slightly different hash because these
|
|
// sections are often seen as delimiters
|
|
int32_t mod = 0;
|
|
if ( bro->m_firstWordPos < 0 ) mod = 3405873;
|
|
|
|
// this is really a fix for the screwed up sections in
|
|
// www.guysndollsllc.com/page5/page4/page4.html which have
|
|
// a dom and then the tod. sometimes the dom is in the same sentence
|
|
// section as the tod and other times it isn't. so we have a list of
|
|
// dom/tod/domtod sections that are brothers. but they are partitioned
|
|
// by the br tag, but the dom following the br tag sometimes has
|
|
// extra text after it like "To be announced", so we just required
|
|
// that the first word be a date.
|
|
/*
|
|
if ( method == METHOD_BR_DOM ) {
|
|
if ( bro->m_tagId != TAG_BR ) return -1;
|
|
// get next
|
|
Section *next = bro->m_nextBrother;
|
|
// must have section after
|
|
if ( ! next ) return -1;
|
|
// must be a sort of heading like "Jul 24"
|
|
//if ( !(next->m_flags & SEC_HEADING_CONTAINER) &&
|
|
// !(next->m_flags & SEC_HEADING ) )
|
|
// return -1;
|
|
// section after must have dom
|
|
if ( ! (next->m_flags & SEC_HAS_DOM) ) return -1;
|
|
// now it must be all date words
|
|
int32_t a = next->m_firstWordPos;
|
|
int32_t b = next->m_lastWordPos;
|
|
// sanity check
|
|
if ( a < 0 ) { char *xx=NULL;*xx=0; }
|
|
// scan
|
|
for ( int32_t i = a ; i <= b ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not wid
|
|
if ( ! m_wids[i] ) continue;
|
|
// first word must be in date
|
|
if ( ! ( m_bits->m_bits[i] & D_IS_IN_DATE ) )
|
|
return -1;
|
|
// ok, that's good enough!
|
|
break;
|
|
}
|
|
// we're good!
|
|
return 8888;
|
|
}
|
|
*/
|
|
|
|
// . single br tags not allowed to be implied section delimiters any
|
|
// more for no specific reason but seems to be the right way to go
|
|
// . this hurts the guysndollsllc.com url, which has single br lines
|
|
// each with its own DOM, so let's allow br tags in that case
|
|
if ( m_tids[bro->m_a] == TAG_BR && bro->m_a + 1 == bro->m_b )
|
|
return -1;
|
|
|
|
/*
|
|
// <P></P>
|
|
if ( method == METHOD_EMPTY_TAG ) {
|
|
// . and cabq.gov/museums/events.html also uses <br><br> which
|
|
// looks the same as <p></p> when rendered
|
|
// . and this is not a single br tag since that is eliminated
|
|
// by the return statement right above us
|
|
// . damn, they had the brbr in a <p> tag with other text so
|
|
// this was not fixing that ... so i took it out to be safe
|
|
// . now i need this for sunsetpromotions.com which use
|
|
// double brs otherwise i get multiple locations error
|
|
if ( bro->m_tagId == TAG_BR )
|
|
return 777777;
|
|
// must be a p tag for now
|
|
if ( bro->m_tagId != TAG_P ) return -1;
|
|
// and empty. if it has alnum words in it, return -1
|
|
if ( bro->m_firstWordPos >= 0 ) return -1;
|
|
// images count as words i guess to fix
|
|
// cabq.gov/museum/events.html
|
|
if ( containsTagId ( bro , TAG_IMG ) ) return -1;
|
|
// use a special hash
|
|
return 777777;
|
|
}
|
|
*/
|
|
|
|
if ( method == METHOD_MONTH_PURE ) {
|
|
// must be a sort of heading like "Jul 24"
|
|
if ( !(bro->m_flags & SEC_HEADING_CONTAINER) &&
|
|
!(bro->m_flags & SEC_HEADING ) )
|
|
return -1;
|
|
if ( ! (bro->m_flags & SEC_HAS_MONTH) )
|
|
return -1;
|
|
// skip if also have daynum, we just want the month and
|
|
// maybe the year...
|
|
if ( (bro->m_flags & SEC_HAS_DOM) )
|
|
return -1;
|
|
// now it must be all date words
|
|
int32_t a = bro->m_firstWordPos;
|
|
int32_t b = bro->m_lastWordPos;
|
|
// sanity check
|
|
if ( a < 0 ) { char *xx=NULL;*xx=0; }
|
|
// scan
|
|
for ( int32_t i = a ; i <= b ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not wid
|
|
if ( ! m_wids[i] ) continue;
|
|
// must be in date
|
|
if ( ! ( m_bits->m_bits[i] & D_IS_IN_DATE ) )
|
|
return -1;
|
|
}
|
|
// do not collide with tagids
|
|
return 999999;
|
|
}
|
|
// are we partitioning by tagid?
|
|
if ( method == METHOD_TAGID ) {
|
|
int32_t tid = bro->m_tagId;
|
|
// . map a strong tag to h4 since they look the same!
|
|
// . should fix metropolis url
|
|
if ( tid == TAG_STRONG ) tid = TAG_H4;
|
|
if ( tid == TAG_B ) tid = TAG_H4;
|
|
if ( tid ) return tid;
|
|
// . use -1 to indicate can't be a delimeter
|
|
// . this should fix switchboard.com which lost
|
|
// "Wedding cakes..." as part of description because it
|
|
// called it an SEC_MENU_HEADER because an implied section
|
|
// was inserted and placed it at the top so it had nothing
|
|
// above it so-to-speak and was able to be a menu header.
|
|
if ( bro->m_baseHash == BH_SENTENCE ) return -1;
|
|
// if 0 use base hash
|
|
return bro->m_baseHash ^ mod;
|
|
}
|
|
/*
|
|
// stole this function logic from getBaseHash2()
|
|
if ( method == METHOD_ATTRIBUTE ) {
|
|
// assume this
|
|
int32_t bh = bro->m_tagHash; // Id;
|
|
// for sentence sections, etc...
|
|
if ( bh == 0 ) bh = bro->m_baseHash;
|
|
// do not allow sentences (see comment above)
|
|
if ( bro->m_baseHash == BH_SENTENCE ) return -1;
|
|
// . make heading sections different
|
|
// . this was wrong for christchurchcincinnati.org/worship
|
|
// so i commented it out to get the proper h3/h2 headers
|
|
//if ( bro->m_flags & SEC_HEADING_CONTAINER)
|
|
// bh ^= 0x789123;
|
|
return bh ^ mod;
|
|
}
|
|
*/
|
|
if ( method == METHOD_INNER_TAGID ) {
|
|
Section *last = bro;
|
|
// scan to right to find first kid
|
|
for ( Section *nn = bro->m_next ; nn ; nn = nn->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop if not contained
|
|
if ( ! last->contains ( nn ) ) break;
|
|
// ignore sentences because they alternate with
|
|
// the strong tags used in abqfolkfest.org and
|
|
// mess things up. some are <x><strong><sent> and
|
|
// some are <x><sent>blah blah<strong>.
|
|
// THIS IS NOT THE BEST WAY to fix this issue. i think
|
|
// we need to put the sentence sections outside the
|
|
// strong sections when possible? not sure.
|
|
if ( nn->m_baseHash == BH_SENTENCE ) continue;
|
|
// stop if it directly contains text DIRECTLY
|
|
// to fix <p> blah <a href> blah</a> </p> for
|
|
// abqtango.com, so we stop at the <p> tag and do
|
|
// not use the <a> tag as the inner tag id.
|
|
// CRAP, we do not have this set yet...
|
|
//if ( ! ( nn->m_flags & SEC_NOTEXT ) ) break;
|
|
// just do it this way then
|
|
if ( nn->m_a +1 < m_nw &&
|
|
m_wids[nn->m_a + 1] )
|
|
break;
|
|
if ( nn->m_a +2 < m_nw &&
|
|
! m_tids[nn->m_a+1] &&
|
|
m_wids[nn->m_a + 2] )
|
|
break;
|
|
// update
|
|
last = nn;
|
|
}
|
|
// do not allow sentences (see comment above)
|
|
if ( last->m_baseHash == BH_SENTENCE ) return -1;
|
|
// make it a little different from regular tagid
|
|
return last->m_tagId ^ 34908573 ^ mod;
|
|
}
|
|
if ( method == METHOD_DOM ) {
|
|
// need a dom of course (day of month)
|
|
if ( ! (bro->m_flags & SEC_HAS_DOM) )
|
|
return -1;
|
|
// we require it also have another type because DOM is
|
|
// quite fuzzy and matches all these little numbers in
|
|
// xml docs like stubhub.com's xml feed
|
|
if ( ! (bro->m_flags &
|
|
(SEC_HAS_MONTH|SEC_HAS_DOW|SEC_HAS_TOD)) )
|
|
return -1;
|
|
// if you aren't pure you should at least be in a tag
|
|
// or something. don't allow regular sentences for now...
|
|
// otherwise we get a few sections that aren't good for
|
|
// sunsetpromotions.com because it has a few sentences
|
|
// mentioning the day of month (april 16 2011). but those
|
|
// sections should be split by the double br tags.
|
|
if ( bro->m_sentFlags & SENT_MIXED_CASE )
|
|
return -1;
|
|
// . must be a sort of heading like "Jul 24"
|
|
// . without this we were getting bad implied sections
|
|
// for tennisoncampus because the section had a sentence
|
|
// with a bunch of sentences... BUT what does this hurt??
|
|
// . it hurts anja when we require this stuff here on
|
|
// texasdrums.drums.org/new_orleansdrums.htm because it is
|
|
// unclear that her event should be boxed in...
|
|
if ( !(bro->m_flags & SEC_SENTENCE) &&
|
|
!(bro->m_flags & SEC_HEADING_CONTAINER) &&
|
|
!(bro->m_flags & SEC_HEADING ) )
|
|
return -1;
|
|
// do not collide with tagids
|
|
return 11111;
|
|
}
|
|
if ( method == METHOD_DOM_PURE ) {
|
|
// must be a sort of heading like "Jul 24"
|
|
if ( !(bro->m_flags & SEC_HEADING_CONTAINER) &&
|
|
!(bro->m_flags & SEC_HEADING ) )
|
|
return -1;
|
|
if ( ! (bro->m_flags & SEC_HAS_DOM) )
|
|
return -1;
|
|
// now it must be all date words
|
|
int32_t a = bro->m_firstWordPos;
|
|
int32_t b = bro->m_lastWordPos;
|
|
// sanity check
|
|
if ( a < 0 ) { char *xx=NULL;*xx=0; }
|
|
// scan
|
|
for ( int32_t i = a ; i <= b ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not wid
|
|
if ( ! m_wids[i] ) continue;
|
|
// must be in date
|
|
if ( ! ( m_bits->m_bits[i] & D_IS_IN_DATE ) )
|
|
return -1;
|
|
}
|
|
// do not collide with tagids
|
|
return 55555;
|
|
}
|
|
if ( method == METHOD_ABOVE_DOM ) {
|
|
// we cannot have a dom ourselves to reduce the problem
|
|
// with cabq.gov/museums/events.html of repeating the dom
|
|
// in the body and getting a false header.
|
|
if ( bro->m_flags & SEC_HAS_DOM )
|
|
return -1;
|
|
Section *nb = bro->m_nextBrother;
|
|
if ( ! nb )
|
|
return -1;
|
|
// skip empties like image tags
|
|
if ( nb->m_firstWordPos < 0 )
|
|
nb = nb->m_nextBrother;
|
|
if ( ! nb )
|
|
return -1;
|
|
// must be a sort of heading like "Jul 24"
|
|
//if ( !(nb->m_flags & SEC_HEADING_CONTAINER) &&
|
|
// !(nb->m_flags & SEC_HEADING ) )
|
|
// return -1;
|
|
if ( ! ( nb->m_flags & SEC_HAS_DOM ) )
|
|
return -1;
|
|
// require we be in a tag
|
|
if ( !(bro->m_flags & SEC_HEADING_CONTAINER) &&
|
|
!(bro->m_flags & SEC_HEADING ) )
|
|
return -1;
|
|
// do not collide with tagids
|
|
return 22222;
|
|
}
|
|
if ( method == METHOD_DOW ) {
|
|
// must be a sort of heading like "Jul 24"
|
|
if ( !(bro->m_flags & SEC_HEADING_CONTAINER) &&
|
|
!(bro->m_flags & SEC_HEADING ) )
|
|
return -1;
|
|
if ( ! (bro->m_flags & SEC_HAS_DOW) )
|
|
return -1;
|
|
// do not collide with tagids
|
|
return 33333;
|
|
}
|
|
if ( method == METHOD_DOW_PURE ) {
|
|
// santafeplayhouse.org had
|
|
// "Thursdays Pay What You Wish Performances" and so it
|
|
// was not getting an implied section set, so let's do away
|
|
// with the pure dow algo and see what happens.
|
|
return -1;
|
|
// must be a sort of heading like "Jul 24"
|
|
//if ( !(bro->m_flags & SEC_HEADING_CONTAINER) &&
|
|
// !(bro->m_flags & SEC_HEADING ) )
|
|
// return -1;
|
|
if ( ! (bro->m_flags & SEC_HAS_DOW) )
|
|
return -1;
|
|
// this is causing core
|
|
if ( bro->m_tagId == TAG_TC ) return -1;
|
|
// now it must be all date words
|
|
int32_t a = bro->m_firstWordPos;
|
|
int32_t b = bro->m_lastWordPos;
|
|
// sanity check
|
|
if ( a < 0 ) { char *xx=NULL;*xx=0; }
|
|
// scan
|
|
for ( int32_t i = a ; i <= b ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not wid
|
|
if ( ! m_wids[i] ) continue;
|
|
// must be in date
|
|
if ( ! ( m_bits->m_bits[i] & D_IS_IN_DATE ) )
|
|
return -1;
|
|
}
|
|
// do not collide with tagids
|
|
return 66666;
|
|
}
|
|
if ( method == METHOD_ABOVE_DOW ) {
|
|
// must be a sort of heading like "Jul 24"
|
|
if ( !(bro->m_flags & SEC_HEADING_CONTAINER) &&
|
|
!(bro->m_flags & SEC_HEADING ) )
|
|
return -1;
|
|
Section *nb = bro->m_nextBrother;
|
|
if ( ! nb )
|
|
return -1;
|
|
if ( ! ( nb->m_flags & SEC_HAS_DOW ) )
|
|
return -1;
|
|
// next sentence not set yet, so figure it out
|
|
Section *sent = nb;
|
|
// scan for it
|
|
for ( ; sent ; sent = sent->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop we got a sentence section now
|
|
if ( sent->m_flags & SEC_SENTENCE ) break;
|
|
}
|
|
// might have been last sentence already
|
|
if ( ! sent )
|
|
return -1;
|
|
// . next SENTENCE must have the dow
|
|
// . should fix santafeplayhouse from making crazy
|
|
// implied sections
|
|
if ( ! (sent->m_flags & SEC_HAS_DOW) )
|
|
return -1;
|
|
// do not collide with tagids
|
|
return 44444;
|
|
}
|
|
if ( method == METHOD_ABOVE_ADDR ) {
|
|
// must be a sort of heading like "11. San Pedro Library"
|
|
// for cabq.gov
|
|
if ( !(bro->m_flags & SEC_HEADING_CONTAINER) &&
|
|
!(bro->m_flags & SEC_HEADING ) )
|
|
return -1;
|
|
Section *nb = bro->m_nextBrother;
|
|
if ( ! nb )
|
|
return -1;
|
|
if ( ! nb->m_addrXor )
|
|
return -1;
|
|
// next sentence not set yet, so figure it out
|
|
Section *sent = nb;
|
|
// scan for it
|
|
for ( ; sent ; sent = sent->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop we got a sentence section now
|
|
if ( sent->m_flags & SEC_SENTENCE ) break;
|
|
}
|
|
// might have been last sentence already
|
|
if ( ! sent )
|
|
return -1;
|
|
// . next SENTENCE must have the addr
|
|
// . should fix santafeplayhouse from making crazy
|
|
// implied sections
|
|
if ( ! sent->m_addrXor )
|
|
return -1;
|
|
// do not collide with tagids
|
|
return 77777;
|
|
}
|
|
/*
|
|
if ( method == METHOD_ABOVE_TOD ) {
|
|
// must be a sort of heading like "Jul 24"
|
|
if ( !(bro->m_flags & SEC_HEADING_CONTAINER) &&
|
|
!(bro->m_flags & SEC_HEADING ) )
|
|
return -1;
|
|
Section *nb = bro->m_nextBrother;
|
|
if ( ! nb )
|
|
return -1;
|
|
if ( ! ( nb->m_flags & SEC_HAS_TOD ) )
|
|
return -1;
|
|
// next sentence not set yet, so figure it out
|
|
Section *sent = nb;
|
|
// scan for it
|
|
for ( ; sent ; sent = sent->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop we got a sentence section now
|
|
if ( sent->m_flags & SEC_SENTENCE ) break;
|
|
}
|
|
// next SENTENCE must have the tod
|
|
if ( ! (sent->m_flags & SEC_HAS_TOD) )
|
|
return -1;
|
|
// do not collide with tagids
|
|
return 444333;
|
|
}
|
|
*/
|
|
char *xx=NULL;*xx=0;
|
|
return 0;
|
|
}
|
|
|
|
// . PROBLEM: because we ignore non-breaking tags we often get sections
|
|
// that are really not sentences, but we are forced into them because
|
|
// we cannot split span or bold tags
|
|
// i.e. "<div>This is <b>a sentence. And this</b> is a sentence.</div>"
|
|
// forces us to treat the entire div tag as a sentence section.
|
|
// . i did add some logic to ignore those (the two for-k loops below) but then
|
|
// Address.cpp cores because it expects every alnum word to be in a sentence
|
|
// . now make sure to shrink into our current parent if we would not lose
|
|
// alnum chars!! fixes sentence flip flopping
|
|
// . returns false and sets g_errno on error
|
|
bool Sections::addSentenceSections ( ) {
|
|
|
|
m_numSentenceSections = 0;
|
|
|
|
sec_t badFlags =
|
|
//SEC_MARQUEE|
|
|
SEC_STYLE|
|
|
SEC_SCRIPT|
|
|
SEC_SELECT|
|
|
SEC_HIDDEN|
|
|
SEC_NOSCRIPT;
|
|
|
|
// shortcut
|
|
Section **sp = m_sectionPtrs;
|
|
|
|
static bool s_init = false;
|
|
static int64_t h_in;
|
|
static int64_t h_at;
|
|
static int64_t h_for;
|
|
static int64_t h_to;
|
|
static int64_t h_on;
|
|
static int64_t h_under;
|
|
static int64_t h_with;
|
|
static int64_t h_along;
|
|
static int64_t h_from;
|
|
static int64_t h_by;
|
|
static int64_t h_of;
|
|
static int64_t h_some;
|
|
static int64_t h_the;
|
|
static int64_t h_and;
|
|
static int64_t h_a;
|
|
static int64_t h_p;
|
|
static int64_t h_m;
|
|
static int64_t h_am;
|
|
static int64_t h_pm;
|
|
static int64_t h_http;
|
|
static int64_t h_https;
|
|
static int64_t h_room;
|
|
static int64_t h_rm;
|
|
static int64_t h_bldg;
|
|
static int64_t h_building;
|
|
static int64_t h_suite;
|
|
static int64_t h_ste;
|
|
static int64_t h_tags;
|
|
//static int64_t h_noon;
|
|
//static int64_t h_midnight;
|
|
if ( ! s_init ) {
|
|
s_init = true;
|
|
h_tags = hash64n("tags");
|
|
h_in = hash64n("in");
|
|
h_the = hash64n("the");
|
|
h_and = hash64n("and");
|
|
h_a = hash64n("a");
|
|
h_p = hash64n("p");
|
|
h_m = hash64n("m");
|
|
h_am = hash64n("am");
|
|
h_pm = hash64n("pm");
|
|
h_a = hash64n("a");
|
|
h_at = hash64n("at");
|
|
h_for = hash64n("for");
|
|
h_to = hash64n("to");
|
|
h_on = hash64n("on");
|
|
h_under = hash64n("under");
|
|
h_with = hash64n("with");
|
|
h_along = hash64n("along");
|
|
h_from = hash64n("from");
|
|
h_by = hash64n("by");
|
|
h_of = hash64n("of");
|
|
h_some = hash64n("some");
|
|
h_http = hash64n("http");
|
|
h_https = hash64n("https");
|
|
h_room = hash64n("room");
|
|
h_rm = hash64n("rm");
|
|
h_bldg = hash64n("bldg");
|
|
h_building = hash64n("building");
|
|
h_suite = hash64n("suite");
|
|
h_ste = hash64n("ste");
|
|
//h_noon = hash64n("noon");
|
|
//h_midnight = hash64n("midnight");
|
|
}
|
|
|
|
// need D_IS_IN_URL bits to be valid
|
|
m_bits->setInUrlBits ( m_niceness );
|
|
// shortcut
|
|
wbit_t *bb = m_bits->m_bits;
|
|
/*
|
|
// int16_tcut
|
|
Date **datePtrs = m_dates->m_datePtrs;
|
|
// shortcut. this should not include telescoped dates!!!
|
|
int32_t maxnd = m_dates->m_numDatePtrs;
|
|
// set up nd to reference first date in the document
|
|
int32_t nd = 0;
|
|
// what date we are currently in
|
|
Date *inDate = NULL;
|
|
// get first date
|
|
for ( ; nd < maxnd ; nd++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if empty
|
|
if ( ! datePtrs[nd] ) continue;
|
|
// skip if not in doc
|
|
if ( datePtrs[nd]->m_a < 0 )
|
|
continue;
|
|
// stop when we got one
|
|
break;
|
|
}
|
|
*/
|
|
// is the abbr. a noun? like "appt."
|
|
bool hasWordAfter = false;
|
|
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// need a wid
|
|
if ( ! m_wids[i] ) continue;
|
|
// get section we are currently in
|
|
Section *cs = m_sectionPtrs[i];
|
|
// skip if its bad! i.e. style or script or whatever
|
|
if ( cs->m_flags & badFlags ) continue;
|
|
// set that
|
|
int64_t prevWid = m_wids[i];
|
|
int64_t prevPrevWid = 0LL;
|
|
// flag
|
|
int32_t lastWidPos = i;//-1;
|
|
bool lastWasComma = false;
|
|
nodeid_t includedTag = -2;
|
|
int32_t lastbr = -1;
|
|
bool hasColon = false;
|
|
bool endOnBr = false;
|
|
bool endOnBold = false;
|
|
bool capped = true;
|
|
int32_t upper = 0;
|
|
int32_t numAlnums = 0;
|
|
int32_t verbCount = 0;
|
|
// scan for sentence end
|
|
int32_t j; for ( j = i ; j < m_nw ; j++ ) {
|
|
// skip words
|
|
if ( m_wids[j] ) {
|
|
// prev prev
|
|
prevPrevWid = prevWid;
|
|
// assume not a word like "vs."
|
|
hasWordAfter = false;
|
|
// set prev
|
|
prevWid = m_wids[j];
|
|
lastWidPos = j;
|
|
lastWasComma = false;
|
|
endOnBr = false;
|
|
endOnBold = false;
|
|
numAlnums++;
|
|
// skip if stop word and need not be
|
|
// capitalized
|
|
if ( bb[j] & D_IS_STOPWORD ) continue;
|
|
if ( m_wlens[j] <= 1 ) continue;
|
|
if ( is_digit(m_wptrs[j][0]) ) continue;
|
|
if ( !is_upper_utf8(m_wptrs[j])) capped=false;
|
|
else upper++;
|
|
// is it a verb?
|
|
if ( isVerb( &m_wids[j] ) )
|
|
verbCount++;
|
|
//if ( bb[i] & D_IS_IN_DATE_2 ) inDate = true;
|
|
// it is in the sentence
|
|
continue;
|
|
}
|
|
// tag?
|
|
if ( m_tids[j] ) {
|
|
// shortcut
|
|
nodeid_t tid = m_tids[j] & BACKBITCOMP;
|
|
|
|
// "shelter: 123-4567"
|
|
// do not allow "<br>...:...<br>" to be
|
|
// the 2nd line of a sentence.
|
|
// they should be their own sentence.
|
|
// should fix st. martin's center
|
|
// for unm.edu which has that shelter: line.
|
|
// BUT it hurts "TUESDAYS: 7-8 song...<br>
|
|
// off-site." for texasdrums.
|
|
//if ( isBreakingTagId(tid) &&
|
|
// lastbr >= 0 &&
|
|
// hasColon ) {
|
|
// // end sentence at the prev br tag
|
|
// j = lastbr;
|
|
// break;
|
|
//}
|
|
|
|
// treat nobr as breaking to fix ceder.net
|
|
// which has it after the group title
|
|
if ( tid == TAG_NOBR ) break;
|
|
|
|
if ( tid == TAG_BR ) endOnBr = true;
|
|
if ( tid == TAG_B ) endOnBold = true;
|
|
|
|
// a </b><br> is usually like a header
|
|
if ( capped && upper && endOnBr && endOnBold )
|
|
break;
|
|
// if it is <span style="display:none"> or
|
|
// div or whatever, that is breaking!
|
|
// fixes http://chuckprophet.com/gigs/
|
|
if ( (tid == TAG_DIV ||
|
|
tid == TAG_SPAN ) &&
|
|
m_wlens[j] > 14 &&
|
|
strncasestr(m_wptrs[j],"display:none",
|
|
m_wlens[j]) )
|
|
break;
|
|
// ok, treat span as non-breaking for a second
|
|
if ( tid == TAG_SPAN ) continue;
|
|
// mark this
|
|
if ( tid == TAG_BR ) lastbr = j;
|
|
//
|
|
// certain tags like span and br sometimes
|
|
// do and sometimes do not break a sentence.
|
|
// so by default assume they do, but check
|
|
// for certain indicators...
|
|
//
|
|
if ( tid == TAG_SPAN ||
|
|
tid == TAG_BR ||
|
|
// fixes guysndollsllc.com:
|
|
// causes core dump:
|
|
tid == TAG_P || // villr.com
|
|
// fixes americantowns.com
|
|
tid == TAG_DIV ) {
|
|
// if nothing after, moot point
|
|
if ( j+1 >= m_nw ) break;
|
|
// if we already included this tag
|
|
// then keep including it. but some
|
|
// span tags will break and some won't
|
|
// even when in or around the same
|
|
// sentence. see that local.yahoo.com
|
|
// food delivery services url for
|
|
// the first street address,
|
|
// 5013 Miramar
|
|
if ( includedTag == tid &&
|
|
(m_tids[j] & BACKBIT) ) {
|
|
// reset it in case next
|
|
// <span> tag is not connective
|
|
includedTag = -2;
|
|
continue;
|
|
}
|
|
// if we included this tag type
|
|
// as a front tag, then include its
|
|
// back tag in sentence as well.
|
|
// fixes nonamejustfriends.com
|
|
// which has a span tag in sentence:
|
|
// ".. Club holds a <span>FREE</span>
|
|
// Cruise Night..." and we allow
|
|
// "<span>" because it follows "a",
|
|
// but we were breaking on </span>!
|
|
if ( !(m_tids[j]&BACKBIT))
|
|
includedTag = tid;
|
|
// if prev punct was comma and not
|
|
// an alnum word
|
|
if ( lastWasComma ) continue;
|
|
// get punct words bookcasing this tag
|
|
if ( ! m_wids[j+1] &&
|
|
! m_tids[j+1] &&
|
|
m_words->hasChar(j+1,',') )
|
|
continue;
|
|
// if prevwid is like "vs." then
|
|
// that means keep going even if
|
|
// we hit one of these tags. fixes
|
|
// "new york knicks vs.<br>orlando
|
|
// magic"
|
|
if ( hasWordAfter )
|
|
continue;
|
|
// if first alnum word after tag
|
|
// is lower case, that is good too
|
|
int32_t aw = j + 1;
|
|
int32_t maxaw = j + 12;
|
|
if ( maxaw > m_nw ) maxaw = m_nw;
|
|
for ( ; aw < maxaw ; aw++ )
|
|
if ( m_wids[aw] ) break;
|
|
bool isLower = false;
|
|
if ( aw < maxaw &&
|
|
is_lower_utf8(m_wptrs[aw]) )
|
|
isLower = true;
|
|
|
|
// http or https is not to be
|
|
// considered as such! fixes
|
|
// webnetdesign.com from getting
|
|
// sentences continued by an http://
|
|
// url below them.
|
|
if ( aw < maxaw &&
|
|
(m_wids[aw] == h_http ||
|
|
m_wids[aw] == h_https) )
|
|
isLower = false;
|
|
|
|
// this almost always breaks a sentence
|
|
// and adding this line here fixes
|
|
// "Sunday<p>noon" the thewoodencow.com
|
|
// and let's villr.com stay the same
|
|
// since its first part ends in "and"
|
|
//if ( m_wids[aw] == h_noon ||
|
|
// m_wids[aw] == h_midnight )
|
|
if ( tid == TAG_P &&
|
|
isLower &&
|
|
// Oscar G<p>along with xxxx
|
|
m_wids[aw] != h_along &&
|
|
m_wids[aw] != h_with )
|
|
isLower = false;
|
|
|
|
if ( isLower ) continue;
|
|
// get pre word, prepositional
|
|
// phrase starter?
|
|
if ( prevWid == h_in ||
|
|
prevWid == h_the ||
|
|
prevWid == h_and ||
|
|
// fix for ending on "(Room A)"
|
|
(prevWid == h_a &&
|
|
prevPrevWid != h_rm &&
|
|
prevPrevWid != h_room &&
|
|
prevPrevWid != h_bldg &&
|
|
prevPrevWid != h_building &&
|
|
prevPrevWid != h_suite &&
|
|
prevPrevWid != h_ste ) ||
|
|
prevWid == h_for ||
|
|
prevWid == h_to ||
|
|
prevWid == h_on ||
|
|
prevWid == h_under ||
|
|
prevWid == h_with ||
|
|
prevWid == h_from ||
|
|
prevWid == h_by ||
|
|
prevWid == h_of ||
|
|
// "some ... Wednesdays"
|
|
prevWid == h_some ||
|
|
prevWid == h_at )
|
|
continue;
|
|
}
|
|
|
|
|
|
// seems like span breaks for meetup.com
|
|
// et al and not for abqtango.com maybe, we
|
|
// need to download the css??? or what???
|
|
// by default span tags do not seem to break
|
|
// the line but ppl maybe configure them to
|
|
if ( tid == TAG_SPAN ) break;
|
|
// if like <font> ignore it
|
|
if ( ! isBreakingTagId(m_tids[j]) ) continue;
|
|
// only break on xml tags if in rss feed to
|
|
// fix <st1:State w:st="on">Arizona</st1>
|
|
// for gwair.org
|
|
if ( tid==TAG_XMLTAG && !m_isRSSExt) continue;
|
|
// otherwise, stop!
|
|
break;
|
|
}
|
|
// skip simple spaces for speed
|
|
if ( m_wlens[j] == 1 && is_wspace_a(m_wptrs[j][0]))
|
|
continue;
|
|
|
|
// do not allow punctuation that is in a url
|
|
// to be split up or used as a splitter. we want
|
|
// to keep the full url intact.
|
|
if ( j > i && j+1 < m_nw &&
|
|
(bb[j-1] & D_IS_IN_URL) &&
|
|
(bb[j ] & D_IS_IN_URL) &&
|
|
(bb[j+1] & D_IS_IN_URL) )
|
|
continue;
|
|
|
|
// was last punct containing a comma?
|
|
lastWasComma = false;
|
|
// scan the punct chars, stop if we hit a sent breaker
|
|
char *p = m_wptrs[j];
|
|
char *pend = p + m_wlens[j];
|
|
for ( ; p < pend ; p++ ) {
|
|
// punct word...
|
|
/*
|
|
if ( is_wspace_a(*p) ) continue;
|
|
if ( *p==',' ) continue;
|
|
if ( *p=='&' ) continue;
|
|
if ( *p=='_' ) continue;
|
|
if ( *p=='$' ) continue;
|
|
if ( *p=='#' ) continue;
|
|
if ( *p=='\"' ) continue;
|
|
if ( *p==';' ) continue;
|
|
if ( *p==':' ) continue; // 10:30
|
|
if ( *p=='@' ) continue;
|
|
if ( *p=='%' ) continue;
|
|
if ( *p=='%' ) continue;
|
|
if ( *p=='+' ) continue;
|
|
if ( *p=='-' ) continue;
|
|
if ( *p=='=' ) continue;
|
|
if ( *p=='/' ) continue;
|
|
if ( *p=='*' ) continue;
|
|
if ( *p=='\'') continue; // dish 'n' spoon
|
|
if ( is_wspace_utf8(p) ) continue;
|
|
break;
|
|
*/
|
|
if ( *p == '.' ) break;
|
|
if ( *p == ',' ) lastWasComma =true;
|
|
// allow this too for now... no...
|
|
if ( *p == ';' ) break;
|
|
// now hyphen breaks, mostly for stuff
|
|
// in title tags like dukecityfix.com
|
|
if ( sp[j]->m_tagId == TAG_TITLE &&
|
|
*p == '-' &&
|
|
is_wspace_a(p[-1]) &&
|
|
is_wspace_a(p[+1]) &&
|
|
lastWidPos >= 0 &&
|
|
! m_isRSSExt &&
|
|
j+1<m_nw &&
|
|
m_wids[j+1] &&
|
|
// can't be in a date range
|
|
// date bits are not valid here because
|
|
// we are now called from ::set() without
|
|
// dates because dates need sentence
|
|
// sections now to set DF_FUZZY for years
|
|
( !isDateType(&prevWid)||
|
|
!isDateType(&m_wids[j+1])) &&
|
|
//( ! (bb[lastWidPos] & D_IS_IN_DATE) ||
|
|
// ! (bb[j+1] & D_IS_IN_DATE) ) &&
|
|
// fix for $10 - $12
|
|
( ! is_digit ( m_wptrs[lastWidPos][0]) ||
|
|
! is_digit ( m_wptrs[j+1][0]) ) )
|
|
break;
|
|
// . treat colon like comma now
|
|
// . for unm.edu we have
|
|
// "Summer Hours: March 15 - Oct15:
|
|
// 8 am. Mon - Fri, 7:30 am - 10 am Sun.,
|
|
// Winter Hours: Oct. 15 - March 15:
|
|
// 8 am., seven days a week"
|
|
// . and we don't want "winter hours" being
|
|
// topologically closer to the summer hours
|
|
// . that is, the colon is a stronger binder
|
|
// than the comma?
|
|
// . but for villr.com Hours: May-Aug.. gets
|
|
// made into two sentences and Hours is
|
|
// seen as a heading section and causes
|
|
// addImpliedSections() to be wrong.
|
|
//if ( *p == ':' ) lastWasComma =true;
|
|
// . why not the colon?
|
|
if ( *p == ':' ) {
|
|
|
|
// Tags: music,concert,fun
|
|
if ( prevWid == h_tags &&
|
|
// just Tags: so far in sentence
|
|
j == i )
|
|
break;
|
|
|
|
// flag it, but only if not in
|
|
// a time format like "8:30"
|
|
if ( j>0 && !is_digit(m_wptrs[j][-1]))
|
|
hasColon = true;
|
|
|
|
// a "::" is used in breadcrumbs,
|
|
// so break on that.
|
|
// fixes "Dining :: Visit ::
|
|
// Cal Performacnes" title
|
|
if ( p[1] == ':' )
|
|
break;
|
|
|
|
// if "with" precedes, allow
|
|
if ( prevWid == h_with ) continue;
|
|
|
|
// or prev word was tag! like
|
|
// "blah</b>:..."
|
|
bool tagAfter=(j-1>=0&&m_tids[j-1]);
|
|
|
|
// do not allow if next word is tag
|
|
bool tagBefore=(j+1<m_nw&&m_tids[j+1]);
|
|
|
|
// do not allow
|
|
// "<br>...:<br>" or
|
|
// "<br>...<br>:" or
|
|
// since such things are usually
|
|
// somewhat like headers. isolated
|
|
// lines ending on a colon.
|
|
// should fix st. martin's center
|
|
// for unm.edu "Summer Hours: ..."
|
|
if ( lastbr >= 0 &&
|
|
( tagBefore || tagAfter ) ) {
|
|
// end sentence there then
|
|
j = lastbr;
|
|
break;
|
|
}
|
|
|
|
if ( tagBefore ) break;
|
|
if ( tagAfter ) break;
|
|
// for now allow it!
|
|
continue;
|
|
// do not break http://... though
|
|
if ( p[1] == '/' ) continue;
|
|
// or 10:30 etc.
|
|
if ( is_digit(p[1]) ) continue;
|
|
if ( j>0 && is_digit(p[-1]) ) continue;
|
|
// allow trumba titles to have colons
|
|
// so they can get the TSF_TITLEY
|
|
// event title boost in Events.cpp
|
|
if ( m_isTrumba &&
|
|
sp[j]->m_tagId == TAG_TITLE )
|
|
continue;
|
|
// fix guysndollsllc.com which has
|
|
// "Battle of the Bands with: The Cincy
|
|
// Rockers, Second Wind, ..."
|
|
// if last word was a lowercase
|
|
// and one of these, let it in the
|
|
// sentence
|
|
//if ( lastWidPos < 0 )
|
|
// break;
|
|
// must have been lowercase
|
|
if(!is_lower_utf8(m_wptrs[lastWidPos]))
|
|
break;
|
|
// and must be one of these words:
|
|
if ( prevWid == h_with ||
|
|
// "Send info to: Booking"
|
|
// from guysndollsllc.com/page4.ht
|
|
prevWid == h_to ||
|
|
prevWid == h_and )
|
|
continue;
|
|
// otherwise, break it
|
|
break;
|
|
}
|
|
// . special hyphen
|
|
// . breaks up title for peachpundit.com
|
|
// so we get better event title generation
|
|
// since peachpundit.com will be a repeat sec
|
|
// . BUT it did not work!
|
|
if ( p[0] == (char)-30 &&
|
|
p[1] == (char)-128 &&
|
|
p[2] == (char)-108 )
|
|
break;
|
|
// this for sure
|
|
// "Home > Albuquerque Events > Love Song ..."
|
|
if ( *p == '>' ) break;
|
|
if ( *p == '!' ) break;
|
|
if ( *p == '?' ) break;
|
|
if ( *p == '|' )
|
|
break;
|
|
// bullets
|
|
if ( p[0] == (char)226 &&
|
|
p[1] == (char)128 &&
|
|
p[2] == (char)162 )
|
|
break;
|
|
redo:
|
|
continue;
|
|
}
|
|
// if none, keep going
|
|
if ( p == pend ) continue;
|
|
// if an alnum char follows the ., it is ok
|
|
// probably a hostname or ip or phone #
|
|
if ( is_alnum_utf8(p+1) &&
|
|
// "venue:ABQ Sq Dance Center..." for
|
|
// americantowns.com has no space after the colon!
|
|
*p !=':' )
|
|
goto redo;
|
|
// if abbreviation before we are ok too
|
|
if ( *p == '.' && isAbbr(prevWid,&hasWordAfter) ) {
|
|
// but the period may serve a double purpose
|
|
// to end the abbr and terminate the sentence
|
|
// if the word that follows is capitalized,
|
|
// and if the abbr is a lower-case noun.
|
|
//
|
|
// if abbr is like "vs" then do not end sentenc
|
|
if ( hasWordAfter )
|
|
goto redo;
|
|
|
|
// set "next" to next alnum word after us
|
|
int32_t next = j+1;
|
|
int64_t nwid = 0LL;
|
|
int32_t max = next + 10;
|
|
if ( max > m_nw ) max = m_nw;
|
|
for ( ; next < max ; next++ ) {
|
|
if ( ! m_wids[next] ) continue;
|
|
nwid = m_wids[next];
|
|
break;
|
|
}
|
|
// am. pm.
|
|
// if prev word was like 'm' as in am or pm
|
|
// then assume a cap word following ends sent.
|
|
// although if we got
|
|
// "At 1 p.m. Bob Jones plays"
|
|
// then we'd be wrong.
|
|
bool isAmPm = false;
|
|
if ( prevWid == h_m &&
|
|
(prevPrevWid == h_a ||
|
|
prevPrevWid == h_p ) )
|
|
isAmPm = true;
|
|
if ( (prevWid == h_am ||
|
|
prevWid == h_pm ) )
|
|
isAmPm = true;
|
|
if ( isAmPm &&
|
|
verbCount >= 1 &&
|
|
next < max &&
|
|
m_words->isCapitalized(next) &&
|
|
// exclude "5 p.m. Friday" (santafe.org)
|
|
! isDateType(&nwid) )
|
|
// end the sentence
|
|
break;
|
|
// do not end sentence if like
|
|
// "8 am. Fridays", otherwise it would end
|
|
// without this statement since am is lower
|
|
// case and Fridays is capitalized
|
|
if ( isAmPm && isDateType(&nwid) )
|
|
goto redo;
|
|
|
|
// was previous word/abbr capitalized?
|
|
// if so, assume period does not end sentence.
|
|
if ( m_words->isCapitalized(lastWidPos) )
|
|
goto redo;
|
|
// if next word is NOT capitalized, assume
|
|
// period does not end sentence...
|
|
if ( next < max &&
|
|
! m_words->isCapitalized ( next ) )
|
|
goto redo;
|
|
// otherwise, abbr is NOT capitalized and
|
|
// next word IS capitalized, so assume the
|
|
// period DOES end the sentence (we fall through to the break below)
|
|
}
|
|
// fix "1. library name" for cabq.gov
|
|
if ( *p == '.' &&
|
|
lastWidPos == i &&
|
|
m_words->isNum(lastWidPos) )
|
|
goto redo;
|
|
// fix for www.<b>test</b>.com in google serps
|
|
//if ( j+3<m_nw &&
|
|
// *p == '.' &&
|
|
// ( (m_tids[j+1]&BACKBITCOMP) == TAG_B ||
|
|
// (m_tids[j+1]&BACKBITCOMP) == TAG_STRONG ||
|
|
// (m_tids[j+1]&BACKBITCOMP) == TAG_I ) &&
|
|
// m_wids[j+2] )
|
|
// goto redo;
|
|
// ok, stop otherwise
|
|
break;
|
|
}
|
|
// set j to this to exclude ending tags
|
|
//if ( lastWidPos >= 0 ) j = lastWidPos + 1;
|
|
|
|
// do not include tag at end. try to fix sentence flip flop.
|
|
for ( ; j > i ; j-- )
|
|
// stop when we just contain the last word
|
|
if ( m_wids[j-1] ) break;
|
|
|
|
// make our sentence endpoints now
|
|
int32_t senta = i;
|
|
// make the sentence defined by [senta,sentb) where sentb
|
|
// defines a half-open interval like we do for almost
|
|
// everything else
|
|
int32_t sentb = j;
|
|
|
|
// update i for next iteration
|
|
i = sentb - 1;
|
|
|
|
// if we ended on a punct word, include that in the sentence
|
|
//if ( j < m_nw && ! m_tids[j] ) sentb++;
|
|
|
|
|
|
// crap, but now sentences intersect with our tag-based
|
|
// sections because they can now split tags because of websites
|
|
// like aliconference.com and abqtango.com whose sentences
|
|
// do not align with the tag sections. therefore we introduce
|
|
// the SEC_TOP_SPLIT and SEC_BOTTOM_SPLIT to indicate
|
|
// that the section is a top/bottom piece of a split sentence.
|
|
// if both bits are set we assume SEC_MIDDLE_SPLIT.
|
|
// then we set the Section::m_senta and m_sentb to
|
|
// indicate the whole sentence of which it is a split.
|
|
// but the vast majority of the time m_senta and m_sentb
|
|
// will equal m_firstWordPos and m_lastWordPos respectively.
|
|
// then, any routine that
|
|
|
|
|
|
|
|
// so scan the words in the sentence and as we scan we have
|
|
// to determine the parent section we inserting the sentence
|
|
// into as a child section.
|
|
//Section *parent = NULL;
|
|
int32_t start = -1;
|
|
Section *pp;
|
|
//Section *np;
|
|
int32_t lastk = 0;
|
|
Section *splitSection = NULL;
|
|
Section *lastGuy = NULL;
|
|
|
|
for ( int32_t k = senta ; k <= sentb ; k++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// add final piece
|
|
if ( k == sentb ) {
|
|
// stop if no final piece
|
|
if ( start == -1 ) break;
|
|
// otherwise, add it
|
|
goto addit;
|
|
}
|
|
// need a real alnum word
|
|
if ( ! m_wids[k] ) continue;
|
|
// get his parent
|
|
pp = m_sectionPtrs[k];
|
|
// set parent if need to
|
|
//if ( ! parent ) parent = pp;
|
|
// and start sentence if need to
|
|
if ( start == -1 ) start = k;
|
|
// if same as exact section as last guy, save some time
|
|
if ( pp == lastGuy ) pp = NULL;
|
|
// store it
|
|
lastGuy = pp;
|
|
// . i'd say blow up "pp" until its contains "start"
|
|
// . but if before it contains start it breaches
|
|
// [senta,sentb) then we have to cut things short
|
|
for ( ; pp ; pp = pp->m_parent ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// we now have to split section "pp"
|
|
// when adding the sentence section.
|
|
// once we have such a section we
|
|
// cannot use a different parent...
|
|
if ( pp->m_firstWordPos < start ||
|
|
pp->m_lastWordPos >= sentb ) {
|
|
// set it
|
|
if ( ! splitSection ) splitSection =pp;
|
|
// WE ARE ONLY ALLOWED TO SPLIT ONE
|
|
// SECTION ONLY...
|
|
if ( pp != splitSection)
|
|
goto addit;
|
|
break;
|
|
}
|
|
// keep telescoping until "parent" contains
|
|
// [senta,k] , and we already know that it
|
|
// contains k because that is what we set it to
|
|
//if ( pp->m_a <= senta ) break;
|
|
}
|
|
// mark it
|
|
if ( m_wids[k] ) lastk = k;
|
|
// ok, keep chugging
|
|
continue;
|
|
|
|
// add the final piece if we go to this label
|
|
addit:
|
|
// use this flag
|
|
int32_t bh = BH_SENTENCE;
|
|
// determine parent section, smallest section
|
|
// containing [start,lastk]
|
|
Section *parent = m_sectionPtrs[start];
|
|
for ( ; parent ; parent = parent->m_parent ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop if contains lastk
|
|
if ( parent->m_b > lastk ) break;
|
|
}
|
|
//
|
|
// for "<span>Albuquerque</span>, New Mexico"
|
|
// "start" points to "Albuquerque" but needs to
|
|
// point to the "<span>" so its parent is "parent"
|
|
int32_t adda = start;
|
|
int32_t addb = lastk;
|
|
// need to update "start" to so its parent is the new
|
|
// "parent" now so insertSubSection() does not core
|
|
for ( ; adda >= 0 ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop if we finally got the right parent
|
|
if ( m_sectionPtrs[adda]==parent ) break;
|
|
// or if he's a tag and his parent
|
|
// is "parent" we can stop.
|
|
// i.e. STOP on a proper subsection of
|
|
// the section containing the sentence.
|
|
if ( m_sectionPtrs[adda]->m_parent==parent &&
|
|
m_sectionPtrs[adda]->m_a == adda )
|
|
break;
|
|
// backup
|
|
adda--;
|
|
// check
|
|
if ( adda < 0 ) break;
|
|
// how can this happen?
|
|
if ( m_wids[adda] ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
// sanity
|
|
if ( adda < 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// backup addb over any punct we don't need that
|
|
//if ( addb > 0 && addb < m_nw &&
|
|
// ! m_wids[addb] && ! m_tids[addb] ) addb--;
|
|
|
|
// same for right endpoint
|
|
for ( ; addb < m_nw ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop if we finally got the right parent
|
|
if ( m_sectionPtrs[addb]==parent ) break;
|
|
// get it
|
|
Section *sp = m_sectionPtrs[addb];
|
|
// come back up here in the case of a section
|
|
// sharing its Section::m_b with its parent
|
|
subloop:
|
|
// or if he's a tag and his parent
|
|
// is "parent" we can stop
|
|
if ( sp->m_parent==parent &&
|
|
sp->m_b == addb+1 )
|
|
break;
|
|
// or if we ran into a brother section
|
|
// that does not contain the sentence...
|
|
// fix core dump for webnetdesign.com whose
|
|
// sentence consisted of 3 sections from
|
|
// A=7079 to B=7198. but now i am getting rid
|
|
// of allowing a lower case http(s):// on
|
|
// a separate line to indicate that the
|
|
// sentence continues... so we will not have
|
|
// this sentence anymore in case you are
|
|
// wondering why it is not there any more.
|
|
if ( sp->m_parent==parent &&
|
|
sp->m_a == addb ) {
|
|
// do not include that brother's tag
|
|
addb--;
|
|
break;
|
|
}
|
|
|
|
// when we have bad tag formations like for
|
|
// http://gocitykids.parentsconnect.com/catego
|
|
// ry/buffalo-ny-usa/places-to-go/tourist-stops
|
|
// like <a><b>...</div> with no ending </a> or
|
|
// </b> tags then we have to get the parent
|
|
// of the parent as long as its m_b is the
|
|
// same and check that before advancing addb
|
|
// otherwise we can miss the parent section
|
|
// that we want! (this is because the kid
|
|
// sections share the same m_b as their
|
|
// parent because of they have no ending tag)
|
|
if ( sp->m_parent &&
|
|
sp->m_parent->m_b == sp->m_b ) {
|
|
sp = sp->m_parent;
|
|
goto subloop;
|
|
}
|
|
|
|
// advance
|
|
addb++;
|
|
// stop if addb
|
|
if ( addb >= m_nw ) break;
|
|
// how can this happen?
|
|
if ( m_wids[addb] ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
// sanity
|
|
if ( addb >= m_nw ) { char *xx=NULL;*xx=0; }
|
|
|
|
// ok, now add the split sentence
|
|
Section *is =insertSubSection(parent,adda,addb+1,bh);
|
|
// panic?
|
|
if ( ! is ) return false;
|
|
// set sentence flag on it
|
|
is->m_flags |= SEC_SENTENCE;
|
|
// count it
|
|
m_numSentenceSections++;
|
|
// print it out
|
|
/*
|
|
SafeBuf tt;
|
|
tt.safeMemcpy(m_wptrs[adda],
|
|
m_wptrs[addb]+m_wlens[addb]-
|
|
m_wptrs[adda]);
|
|
tt.safeReplace2 ( "\n",1,"*",1,m_niceness);
|
|
tt.safeReplace2 ( "\r",1,"*",1,m_niceness);
|
|
if ( is->m_flags & SEC_SPLIT_SENT )
|
|
tt.safePrintf(" [split]");
|
|
tt.pushChar(0);
|
|
fprintf(stderr,"a=%"INT32" %s\n",start,tt.m_buf);
|
|
*/
|
|
// . set this
|
|
// . sentence is from [senta,sentb)
|
|
is->m_senta = senta;//start;
|
|
is->m_sentb = sentb;//k;
|
|
// use this too if its a split of a sentence
|
|
if ( is->m_senta < is->m_a )
|
|
is->m_flags |= SEC_SPLIT_SENT;
|
|
if ( is->m_sentb > is->m_b )
|
|
is->m_flags |= SEC_SPLIT_SENT;
|
|
// stop if that was it
|
|
if ( k == sentb ) break;
|
|
// go on to next fragment then
|
|
start = -1;
|
|
parent = NULL;
|
|
splitSection = NULL;
|
|
lastGuy = NULL;
|
|
// redo this same k
|
|
k--;
|
|
}
|
|
}
|
|
|
|
int32_t inSentTil = 0;
|
|
Section *lastSent = NULL;
|
|
// get the section of each word. if not a sentence section then
|
|
// make its m_sentenceSection point to its parent that is a sentence
|
|
for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// need sentence
|
|
if ( ( sk->m_flags & SEC_SENTENCE ) ) {
|
|
inSentTil = sk->m_b;
|
|
lastSent = sk;
|
|
sk->m_sentenceSection = sk;
|
|
continue;
|
|
}
|
|
// skip if outside of the last sentence we had
|
|
if ( sk->m_a >= inSentTil ) continue;
|
|
// we are in that sentence
|
|
sk->m_sentenceSection = lastSent;
|
|
}
|
|
|
|
return true;
|
|
|
|
/*
|
|
// set this
|
|
Section *firstSection = m_sectionPtrs[i];
|
|
Section *lastSection = m_sectionPtrs[j-1];
|
|
|
|
// blow up before containing each other
|
|
for ( ; firstSection ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop if too much
|
|
Section *pp = firstSection->m_parent;
|
|
// stop if too much
|
|
if ( ! pp ) break;
|
|
// stop if too much
|
|
if ( pp->contains ( lastSection ) ) break;
|
|
// advance
|
|
firstSection = pp;
|
|
}
|
|
for ( ; lastSection ; ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop if too much
|
|
Section *pp = lastSection->m_parent;
|
|
// stop if too much
|
|
if ( ! pp ) break;
|
|
// stop if too much
|
|
if ( pp->contains ( firstSection ) ) break;
|
|
// advance
|
|
lastSection = pp;
|
|
}
|
|
|
|
// if last section overflowed, cut if off
|
|
if ( lastSection != firstSection &&
|
|
// make sure it doesn't just contain the front part
|
|
// of the sentence as well as firstSection!
|
|
lastSection->m_firstWordPos > i &&
|
|
lastSection->m_lastWordPos > j ) {
|
|
sentb = lastSection->m_a;
|
|
// set a flag
|
|
// no need to do a loop
|
|
|
|
// likewise, we can't split first section either, but in
|
|
// that case, drop everything not in the first section
|
|
if ( lastSection != firstSection &&
|
|
firstSection->m_firstWordPos < i &&
|
|
firstSection->m_lastWordPos < sentb )
|
|
sentb = firstSection->m_b;
|
|
|
|
// for reverbnation.com we have "Tools for <a>artists</a> |
|
|
// <a>labels</a> | ..." and the first sentence contains
|
|
// the anchor tag, but the 2nd sentence is a subsection of
|
|
// the anchor tag. thus giving inconsistent tag hashes
|
|
// and messing up our menu detection, so let's include
|
|
// anchor tags in the sentence to fix that!
|
|
// CRAP, this was hurting blackbirdbuvette because it was
|
|
// including the bullet delimeter and the "big" section
|
|
// computed below ended up spanning two bullet delimited
|
|
// sections to make the sentence!! that gave us a bad event
|
|
// title that consisted of that sentence..
|
|
//if ( j < m_nw && ! m_tids[j] ) sentb++;
|
|
|
|
// sent section needs to include the initial <a>, somehow
|
|
// that is flip flopping
|
|
//Section *sa = m_sectionPtrs[senta];
|
|
//if ( m_tids[senta] && sa->contains ( senta, sentb ) )
|
|
// senta++;
|
|
|
|
// A = m_sectionPtrs[senta] and
|
|
// B = m_sectionPtrs[sentb-1]
|
|
// find the smallest section that contains both A and B.
|
|
// if A and B are the same, we are done, we are just
|
|
// a child if that section.
|
|
// . telescope both A and B up until they contain
|
|
// each other like our lasta and lastb algo !!
|
|
|
|
Section *big = m_sectionPtrs[senta];
|
|
for ( ; big ; big = big->m_parent ) {
|
|
QUICKPOLL(m_niceness);
|
|
// stop when we contain the whole sentence
|
|
if ( big->m_a <= senta &&
|
|
big->m_b >= sentb ) break;
|
|
}
|
|
|
|
Section *sa = m_sectionPtrs[senta];
|
|
Section *sb = m_sectionPtrs[sentb-1];
|
|
|
|
// get the two sections that are children of "big" and cover
|
|
// the first and last words of the sentence.
|
|
Section *lasta = NULL;
|
|
Section *lastb = NULL;
|
|
// blow up sb until it contains senta, but its section right
|
|
// before it equals "big"
|
|
for ( ; sb ; sb = sb->m_parent ) {
|
|
if ( sb == big ) break;
|
|
lastb = sb;
|
|
}
|
|
for ( ; sa ; sa = sa->m_parent ) {
|
|
if ( sa == big ) break;
|
|
lasta = sa;
|
|
}
|
|
|
|
// set some simple constraints on [senta,sentb) so that we
|
|
// do no split the sections lasta and lastb. we need our new
|
|
// inserted section to contain "lasta" and "lastb" but be
|
|
// a child of "big"
|
|
int32_t maxa ;
|
|
int32_t minb ;
|
|
// but if these are the same as big
|
|
if ( ! lasta ) maxa = senta;
|
|
else maxa = lasta->m_a;
|
|
if ( ! lastb ) minb = sentb;
|
|
else minb = lastb->m_b;
|
|
|
|
// save for debug
|
|
//int32_t saveda = senta;
|
|
//int32_t savedb = sentb;
|
|
|
|
// apply the constraints
|
|
if ( senta > maxa ) senta = maxa;
|
|
if ( sentb < minb ) sentb = minb;
|
|
|
|
// ok, add the sentence sections as a subsection of "sa"
|
|
Section *is = insertSubSection ( big , senta , sentb,
|
|
BH_SENTENCE );
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// return false with g_errno set on error
|
|
if ( ! is ) return false;
|
|
// set sentence flag on it
|
|
is->m_flags |= SEC_SENTENCE;
|
|
// make it fake to now
|
|
//is->m_flags |= SEC_FAKE;
|
|
// set its base hash to something special
|
|
//is->m_baseHash = BH_SENTENCE;
|
|
// skip over
|
|
//i = j - 1;
|
|
i = sentb - 1;
|
|
}
|
|
return true;
|
|
*/
|
|
}
|
|
|
|
// . insert a new "fake" section covering the half-open word range [a,b)
//   and return a ptr to it
// . the new section is taken from the next free slot of m_sections[],
//   growing that array with growSections() if it is full
// . it is spliced into the m_next/m_prev linked list of sections, it
//   inherits m_exclusive, m_sentenceSection and most flags from the
//   smallest section (telescoping up from m_sectionPtrs[a]) that spans
//   [a,b), and then every word in [a,b) is visited so that either
//   m_sectionPtrs[i] points to the new section or the word's outermost
//   section inside [a,b) is re-parented to the new section
// . "parentArg" is accepted for the callers' sake but is NOT used; the
//   parent is always recomputed from m_sectionPtrs[]
// . "newBaseHash" is stored into the new section's m_baseHash
// . returns NULL if growSections() fails
Section *Sections::insertSubSection ( Section *parentArg , int32_t a , int32_t b ,
				      int32_t newBaseHash ) {
	// debug
	//log("sect: inserting subsection [%"INT32",%"INT32")",a,b);

	// try to realloc i guess. should keep ptrs in tact.
	if ( m_numSections >= m_maxNumSections )
		if ( ! growSections() ) return NULL;

	//
	// make a new section
	//
	Section *sk = &m_sections[m_numSections];
	// clear. this also zeroes m_baseHash until we set it for real below
	memset ( sk , 0 , sizeof(Section) );
	// inc it
	m_numSections++;
	// now set it
	sk->m_a = a;
	sk->m_b = b;

	// don't mess this up!
	if ( m_lastSection && a > m_lastSection->m_a )
		m_lastSection = sk;

	// do not resplit this split section with same delimeter!!
	sk->m_processedHash = 0;

	// get first section containing word #a
	Section *si = m_sectionPtrs[a];

	// back up through the linked list until we hit a section that
	// should precede us in the list
	for ( ; si ; si = si->m_prev ) {
		// breathe
		QUICKPOLL(m_niceness);
		// we become his child if this is true
		if ( si->m_a < a ) break;
		// if he is bigger (or equal) we become his child
		// and are after him
		if ( si->m_a == a && si->m_b >= b ) break;
	}

	// . try using the section we positioned after on the previous call
	//   if it lies between "si" and us
	// . like in the case when word #a belongs to the root section
	//   and there are thousands of child sections of the root before "a"
	//   we really want to get the child section of the root before us
	//   as the prev section, "si", otherwise the for loop below here
	//   will hafta loop through thousands of sibling sections
	// . "si" can be NULL here if the loop above ran off the front of
	//   the list, so check it before dereferencing it
	if ( si &&
	     m_lastAdded &&
	     m_lastAdded->m_a > si->m_a &&
	     m_lastAdded->m_a < a )
		si = m_lastAdded;

	// crap we may have
	// "<p> <strong>hey there!</strong> this is another sentence.</p>"
	// then "si" will be pointing at the "<p>" section, and we will
	// not get the "<strong>" section as the "prev" to sk, which we should!
	// that is where sk is the "this is another sentence." sentence
	// section. so to fix that try iterating over si->m_next to get si to
	// be closer to sk.
	for ( ; si ; si = si->m_next ) {
		// breathe
		QUICKPOLL(m_niceness);
		// stop if no more available
		if ( ! si->m_next ) break;
		// stop if would break
		if ( si->m_next->m_a > a ) break;
		// if it gets closer to us without exceeding us, use it
		if ( si->m_next->m_a < a ) continue;
		// if tied, check b. if it contains us, go to it
		if ( si->m_next->m_b >= b ) continue;
		// otherwise, stop
		break;
	}

	// remember where we ended up for the shortcut above
	m_lastAdded = si;

	// a br tag can split the very first base html tag like for
	// mapsandatlases.org we have
	// "<html>...</html> <br> ...." so the br tag splits the first
	// section!
	// SO we need to check for NULL si's!
	if ( ! si ) {
		// give the slot back and core until we figure this case out
		m_numSections--;
		char *xx=NULL;*xx=0;
		return NULL;
	}

	// insert us into the linked list of sections right after "si"
	if ( si->m_next ) si->m_next->m_prev = sk;
	sk->m_next = si->m_next;
	sk->m_prev = si;
	si->m_next = sk;

	// now set the parent
	Section *parent = m_sectionPtrs[a];
	// expand until it encompasses both a and b
	for ( ; ; parent = parent->m_parent ) {
		// breathe
		QUICKPOLL(m_niceness);
		// sanity. some section must span [a,b)
		if ( ! parent ) { char *xx=NULL;*xx=0; }
		if ( parent->m_a > a ) continue;
		if ( parent->m_b < b ) continue;
		break;
	}
	// now we assign the parent to you
	sk->m_parent    = parent;
	sk->m_exclusive = parent->m_exclusive;
	// sometimes an implied section is a subsection of a sentence!
	// like when there are a lot of brbr (double br) tags in it...
	sk->m_sentenceSection = parent->m_sentenceSection;

	// inherit the parent's flags, but take out certain ones
	sec_t flags = parent->m_flags;
	flags &= ~SEC_SENTENCE;
	flags &= ~SEC_SPLIT_SENT;
	// but take out unbalanced!
	flags &= ~SEC_UNBALANCED;
	// . remove SEC_HAS_DOM/DOW/TOD
	// . we scan our new kids for reparenting to us below, so OR these
	//   flags back in down there if we should
	flags &= ~SEC_HAS_DOM;
	flags &= ~SEC_HAS_DOW;
	flags &= ~SEC_HAS_TOD;
	//flags &= ~SEC_HAS_DATE;
	// flag it as a fake section
	flags |= SEC_FAKE;
	sk->m_flags = flags ;
	// need this
	sk->m_baseHash = newBaseHash;

	// reset these
	sk->m_firstWordPos   = -1;
	sk->m_lastWordPos    = -1;
	sk->m_alnumPosA      = -1;
	sk->m_alnumPosB      = -1;
	sk->m_senta          = -1;
	sk->m_sentb          = -1;
	sk->m_firstPlaceNum  = -1;
	sk->m_headColSection = NULL;
	sk->m_headRowSection = NULL;
	sk->m_tableSec       = NULL;
	sk->m_rowNum         = 0;
	sk->m_colNum         = 0;

#ifdef _DEBUG_SECTIONS_
	// interlaced section detection
	if ( m_isTestColl ) {
		// scan from words and telescope up
		Section *s1 = m_sectionPtrs[a];
		Section *s2 = m_sectionPtrs[b-1];
		// check for interlace
		for ( ; s1 ; s1 = s1->m_parent )
			if ( s1->m_a < a &&
			     s1->m_b > a &&
			     s1->m_b < b ) {char *xx=NULL;*xx=0;}
		// check for interlace
		for ( ; s2 ; s2 = s2->m_parent )
			if ( s2->m_a < b &&
			     s2->m_b > b &&
			     s2->m_a > a ) {char *xx=NULL;*xx=0;}
	}
#endif

	// for inheriting flags from our kids
	sec_t mask = (SEC_HAS_DOM|SEC_HAS_DOW|SEC_HAS_TOD);//|SEC_HAS_DATE);

	// set sk->m_firstWordPos to the first alnum word in [a,b)
	for ( int32_t i = a ; i < b ; i++ ) {
		if ( ! m_wids[i] ) continue;
		sk->m_firstWordPos = i;
		break;
	}

	// set sk->m_lastWordPos to the last alnum word in [a,b)
	for ( int32_t i = b-1 ; i >= a ; i-- ) {
		if ( ! m_wids[i] ) continue;
		sk->m_lastWordPos = i;
		break;
	}

	//
	// to speed up scan the words in our inserted section, usually
	// a sentence section i guess, because our parent can have a ton
	// of children sections!!
	//
	for ( int32_t i = a ; i < b ; i++ ) {
		// breathe
		QUICKPOLL ( m_niceness );
		// get current section of that word
		Section *wp = m_sectionPtrs[i];
		// if the new section does NOT strictly contain the word's
		// current section then the new section becomes the section
		// for that word, and the old one becomes our parent
		if ( ! sk->strictlyContains ( wp ) ) {
			// now if "wp" is like a root, then sk becomes the kid
			m_sectionPtrs[i] = sk;
			// our parent is wp
			sk->m_parent = wp;
			continue;
		}
		// we gotta blow up wp until right before it is bigger
		// than "sk" and use that. this could be equal to, not just
		// contains, otherwise we would use strictlyContains()
		for ( ; wp->m_parent ; wp = wp->m_parent )
			if ( wp->m_parent->contains(sk) ) break;
		// already parented to us?
		if ( wp->m_parent == sk ) continue;
		// our parent is now wp's parent
		sk->m_parent = wp->m_parent;
		// and we become wp's parent
		wp->m_parent = sk;
		// and or his flags into us. SEC_HAS_DOM, etc.
		sk->m_flags |= wp->m_flags & mask;
		// sanity check. the kid must fit inside of us.
		if ( wp->m_b > sk->m_b ) { char *xy=NULL;*xy=0; }
		if ( wp->m_a < sk->m_a ) { char *xy=NULL;*xy=0; }
	}

	return sk;
}
|
|
|
|
// for brbr and hr splitting delimiters
|
|
int32_t Sections::splitSectionsByTag ( nodeid_t tagid ) {
|
|
|
|
// . try skipping for xml
|
|
// . eventbrite.com has a bunch of dates per event item and
|
|
// we end up using METHOD_DOM on those!
|
|
// . i think the implied section algo is meant for html really
|
|
// or plain text
|
|
if ( m_contentType == CT_XML &&
|
|
( m_isEventBrite ||
|
|
m_isStubHub ||
|
|
m_isFacebook ) )
|
|
return 0;
|
|
|
|
int32_t numAdded = 0;
|
|
// . now, split sections up if they contain one or more <hr> tags
|
|
// . just append some "hr" sections under that parent to m_sections[]
|
|
// . need to update m_sectionPtrs[] after this of course!!!!!
|
|
// . now we also support various other delimiters, like bullets
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// must be brbr or hr
|
|
if ( ! isTagDelimeter ( si , tagid ) ) continue;
|
|
// must have a next brother to be useful
|
|
if ( ! si->m_nextBrother ) continue;
|
|
// skip if already did this section
|
|
if ( si->m_processedHash ) continue;
|
|
// set first brother
|
|
Section *first = si;
|
|
for ( ; first->m_prevBrother ; first = first->m_prevBrother )
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// save parent
|
|
Section *parent = first->m_parent;
|
|
|
|
subloop:
|
|
// mark it
|
|
first->m_processedHash = 1;
|
|
|
|
// start of insertion section is right after tag
|
|
int32_t a = first->m_b;
|
|
|
|
// but if first is not a tag delimeter than use m_a
|
|
if ( ! isTagDelimeter ( first , tagid ) ) a = first->m_a;
|
|
|
|
// or if first section has text, then include that, like
|
|
// in the case of h1 tags for example
|
|
if ( first->m_firstWordPos >= 0 ) a = first->m_a;
|
|
|
|
// end of inserted section is "b"
|
|
int32_t b = -1;
|
|
|
|
int32_t numTextSections = 0;
|
|
// count this
|
|
if ( first->m_firstWordPos >= 0 ) numTextSections++;
|
|
// start scanning right after "first"
|
|
Section *last = first->m_nextBrother;
|
|
// set last brother and "b"
|
|
for ( ; last ; last = last->m_nextBrother ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop on tag delimiters
|
|
if ( isTagDelimeter ( last , tagid ) ) {
|
|
// set endpoint of new subsection
|
|
b = last->m_a;
|
|
// and stop
|
|
break;
|
|
}
|
|
// assume we are the end of the line
|
|
b = last->m_b;
|
|
// count this
|
|
if ( last->m_firstWordPos >= 0 ) numTextSections++;
|
|
// must be brbr or hr
|
|
//if ( ! isTagDelimeter ( last , tagid ) ) continue;
|
|
// got it, use the start of us being a splitter tag
|
|
//b = last->m_a;
|
|
// stop
|
|
//break;
|
|
}
|
|
|
|
// . insert [first->m_b,b]
|
|
// . make sure it covers at least one "word" which means
|
|
// that a != b-1
|
|
if ( a < b - 1 &&
|
|
// and must group together something meaningful
|
|
numTextSections >= 2 ) {
|
|
// do the insertion
|
|
Section *sk = insertSubSection (parent,a,b,BH_IMPLIED);
|
|
// error?
|
|
if ( ! sk ) return -1;
|
|
// fix it
|
|
sk->m_processedHash = 1;
|
|
// count it
|
|
numAdded++;
|
|
}
|
|
|
|
// first is now last
|
|
first = last;
|
|
// loop up if there are more brothers
|
|
if ( first ) goto subloop;
|
|
}
|
|
return numAdded;
|
|
}
|
|
|
|
bool Sections::splitSections ( char *delimeter , int32_t dh ) {
|
|
|
|
// . try skipping for xml
|
|
// . eventbrite.com has a bunch of dates per event item and
|
|
// we end up using METHOD_DOM on those!
|
|
// . i think the implied section algo is meant for html really
|
|
// or plain text
|
|
if ( m_contentType == CT_XML &&
|
|
( m_isEventBrite ||
|
|
m_isStubHub ||
|
|
m_isFacebook ) )
|
|
return 0;
|
|
|
|
// use this for ultimately setting Section::m_tagHash in loop above
|
|
//int32_t dh ;
|
|
//if ( delimeter == (char *)0x01 ) dh = 3947503;
|
|
//else dh = hash32n ( delimeter );
|
|
|
|
//int32_t th = hash32n("<br");
|
|
|
|
// . is the delimeter a section starting tag itself?
|
|
// . right now, just <hN> tags
|
|
//bool delimIsSection = false;
|
|
//if ( delimeter != (char *)0x01 && is_digit(delimeter[2] ) )
|
|
// delimIsSection = true;
|
|
|
|
//int32_t ns = m_numSections;
|
|
|
|
|
|
int32_t saved = -1;
|
|
int32_t delimEnd = -1000;
|
|
|
|
// . now, split sections up if they contain one or more <hr> tags
|
|
// . just append some "hr" sections under that parent to m_sections[]
|
|
// . need to update m_sectionPtrs[] after this of course!!!!!
|
|
// . now we also support various other delimiters, like bullets
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// sanity check
|
|
if ( i < saved ) { char *xx=NULL;*xx=0; }
|
|
// a quicky
|
|
if ( ! isDelimeter ( i , delimeter , &delimEnd ) ) continue;
|
|
// get section it is in
|
|
Section *sn = m_sectionPtrs[i];
|
|
// save it
|
|
//Section *origsn = sn;
|
|
// for <h1> it must split its parent section!
|
|
//if ( sn->m_a == i && delimIsSection )
|
|
// sn = sn->m_parent;
|
|
// skip if already did this section
|
|
if ( sn->m_processedHash == dh ) continue;
|
|
/*
|
|
// or if we are <br> and the double br got it
|
|
if ( sn->m_processedHash == BH_BRBR && dh == BH_BR &&
|
|
// . might have "<br><br> stuff1 <br> stuff2 <br> ..."
|
|
// . so need to be at the start of it to ignore it then
|
|
i >= sn->m_a && i <= sn->m_a+2 ) continue;
|
|
*/
|
|
// . or if we equal start of section
|
|
// . it was created, not processed
|
|
//if ( sn->m_a == i ) continue;
|
|
// int16_tcut
|
|
//Section *sk ;
|
|
// what section # is section "sn"?
|
|
int32_t offset = sn - m_sections;
|
|
// sanity check
|
|
if ( &m_sections[offset] != sn ) { char *xx=NULL;*xx=0; }
|
|
// point to the section after "sn"
|
|
//int32_t xx = offset + 1;
|
|
// point to words in the new section
|
|
//int32_t yy = sn->m_a ;
|
|
// init this
|
|
int32_t start = sn->m_a;
|
|
// CAUTION: sn->m_a can equal "i" for something like:
|
|
// "<div><h2>blah</h2> <hr> </div>"
|
|
// where when splitting h2 sections we are at the start
|
|
// of an hr section. i think its best to just skip it!
|
|
// then if we find another <h2> within that same <hr> section
|
|
// it can split it into non-empty sections
|
|
if ( start == i ) continue;
|
|
// save it so we can rescan from delimeter right after this one
|
|
// because there might be more delimiters in DIFFERENT
|
|
// subsections
|
|
saved = i;
|
|
// sanity check
|
|
//if ( start == i ) { char *xx=NULL;*xx=0; }
|
|
|
|
subloop:
|
|
// sanity check
|
|
if ( m_numSections >= m_maxNumSections) {char *xx=NULL;*xx=0;}
|
|
//
|
|
// try this now
|
|
//
|
|
Section *sk = insertSubSection ( sn , start , i , dh );
|
|
/*
|
|
//
|
|
// make a new section
|
|
//
|
|
sk = &m_sections[m_numSections];
|
|
// inc it
|
|
m_numSections++;
|
|
// now set it
|
|
sk->m_a = start;
|
|
sk->m_b = i;
|
|
sk->m_parent = sn;
|
|
sk->m_exclusive = sn->m_exclusive;
|
|
// the base hash (delimeter hash) hack
|
|
sk->m_baseHash = dh;
|
|
// sanity check
|
|
if ( start == i ) { char *xx=NULL;*xx=0; }
|
|
// flag it as an <hr> section
|
|
sk->m_flags = sn->m_flags | SEC_FAKE;
|
|
// but take out unbalanced!
|
|
sk->m_flags &= ~SEC_UNBALANCED;
|
|
*/
|
|
|
|
// and take out sentence as well FROM THE PARENT!
|
|
//sn->m_flags &= ~SEC_SENTENCE;
|
|
|
|
//sk->m_baseHash = dh;
|
|
|
|
// do not resplit this split section with same delimeter!!
|
|
if ( sk ) sk->m_processedHash = dh;
|
|
|
|
// take out the processed flag
|
|
//sk->m_flags &= ~SEC_PROCESSED;
|
|
|
|
// quick hack sanity check
|
|
//for ( int32_t w = 0 ; w < m_numSections ; w++ ){
|
|
//for ( int32_t u = w+1 ; u < m_numSections ; u++ ) {
|
|
// if ( m_sections[w].m_a == m_sections[u].m_a &&
|
|
// m_sections[w].m_b == m_sections[u].m_b ) {
|
|
// char *xx=NULL;*xx=0; }
|
|
//}}
|
|
|
|
/*
|
|
// take out the SEC_NOTEXT flag if we should
|
|
//if ( ! notext ) sk->m_flags &= ~SEC_NOTEXT;
|
|
// . find any child section of "sn" and make us their parent
|
|
// . TODO: can later speed up with ptr to ptr logic
|
|
// . at this point sections are not sorted so we can't
|
|
// really iterate linearly through them ... !!!
|
|
// . TODO: speed this up!!!
|
|
for ( xx = 0 ; xx < ns ; xx++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Section *sx = &m_sections[xx];
|
|
// skip if sn not parent
|
|
if ( sx->m_parent != sn ) continue;
|
|
// when splitting a section do not reparent if
|
|
// not in our split...
|
|
if ( sx->m_a >= i ) continue;
|
|
if ( sx->m_b <= start ) continue;
|
|
// reset his parent to the hr faux section
|
|
sx->m_parent = sk;
|
|
}
|
|
// . set the words ptrs to it
|
|
// . TODO: can later speed up with ptr to ptr logic
|
|
for ( ; yy < i ; yy++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// must have had sn as parent
|
|
if ( m_sectionPtrs[yy] != sn ) continue;
|
|
// "sk" becomes the new parent
|
|
m_sectionPtrs[yy] = sk;
|
|
}
|
|
*/
|
|
// if we were it, no more sublooping!
|
|
if ( i >= sn->m_b ) {
|
|
// sn loses some stuff
|
|
sn->m_exclusive = 0;
|
|
// resume where we left off in case next delim is
|
|
// in a section different than "sn"
|
|
i = saved;
|
|
// do not process any more delimiters in this section
|
|
sn->m_processedHash = dh;
|
|
//i = sn->m_b - 1;
|
|
continue;
|
|
}
|
|
|
|
// update values in case we call subloop
|
|
start = i;
|
|
|
|
// skip over that delimeter at word #i
|
|
i++;
|
|
|
|
// if we had back-to-back br tags make i point to word
|
|
// after the last br tag
|
|
if ( delimeter == (char *)0x01 ) i = delimEnd;
|
|
|
|
// find the next <hr> tag, if any, stop at end of "sn"
|
|
for ( ; i < m_nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// stop at end of section "sn"
|
|
if ( i >= sn->m_b ) break;
|
|
// get his section
|
|
Section *si = m_sectionPtrs[i];
|
|
// delimiters that start their own sections must
|
|
// grow out to their parent
|
|
//if ( delimIsSection )
|
|
// si = si->m_parent;
|
|
// ignore if not the right parent
|
|
if ( si != sn ) continue;
|
|
// a quicky
|
|
if ( isDelimeter ( i , delimeter , &delimEnd ) ) break;
|
|
}
|
|
|
|
// if we were it, no more sublooping!
|
|
//if ( i >= sn->m_b ) { i = saved; continue; }
|
|
|
|
// now add the <hr> section above word #i
|
|
goto subloop;
|
|
}
|
|
return true;
|
|
}
|
|
/*
|
|
// . returns false if blocked, true otherwise
|
|
// . returns true and sets g_errno on error
|
|
bool Sections::...( ) {
|
|
|
|
key128_t end = g_datedb.makeEndKey ( m_termId , 0 );
|
|
|
|
// before looking up TitleRecs using Msg20, let's first consult
|
|
// datedb to see if we got adequate data as to what sections
|
|
// are the article sections
|
|
|
|
int32_t minRecSizes = SECTIONDB_MINRECSIZES;
|
|
// get the group this list is in
|
|
uint32_t gid ;
|
|
// split = false
|
|
gid = getGroupId ( RDB_SECTIONDB , (char *)&m_startKey, false );
|
|
// we need a group # from the groupId
|
|
int32_t split = g_hostdb.getGroupNum ( gid );
|
|
// shortcut
|
|
Msg0 *m = &m_msg0;
|
|
|
|
loop:
|
|
// get the list
|
|
if ( ! m->getList ( -1 , // hostId
|
|
0 , // ip
|
|
0 , // port
|
|
0 , // maxCacheAge
|
|
false , // addToCache
|
|
RDB_SECTIONDB , // was RDB_DATEDB
|
|
m_coll ,
|
|
&m_list ,
|
|
(char *)&m_startKey ,
|
|
(char *)&end ,
|
|
minRecSizes ,
|
|
this ,
|
|
gotSectiondbListWrapper ,
|
|
m_niceness , // MAX_NICENESS
|
|
// default parms follow
|
|
true , // doErrorCorrection?
|
|
true , // includeTree?
|
|
true , // doMerge?
|
|
-1 , // firstHostId
|
|
0 , // startFileNum
|
|
-1 , // numFiles
|
|
30 , // timeout
|
|
-1 , // syncPoint
|
|
-1 , // preferLocalReads
|
|
NULL , // msg5
|
|
NULL , // msg5b
|
|
false , // isrealmerge?
|
|
true , // allowpagecache?
|
|
false , // forceLocalIndexdb?
|
|
false , // doIndexdbSplit?
|
|
split ))
|
|
return false;
|
|
|
|
// record recall
|
|
bool needsRecall = false;
|
|
// return false if this blocked
|
|
if ( ! gotSectiondbList ( &needsRecall ) ) return false;
|
|
// if it needs a recall then loop back up!
|
|
if ( needsRecall ) goto loop;
|
|
// we are done
|
|
return true;
|
|
}
|
|
|
|
void gotSectiondbListWrapper ( void *state ) {
|
|
SectionVotingTable *THIS = (Sections *)state;
|
|
// record recall
|
|
bool needsRecall = false;
|
|
// return if this blocked
|
|
if ( ! THIS->gotSectiondbList( &needsRecall ) ) return;
|
|
// needs a recall?
|
|
if ( needsRecall && ! THIS->getSectiondbList() ) return;
|
|
// nothing blocked and no recall needed, so we are done
|
|
THIS->m_callback ( THIS->m_state );
|
|
}
|
|
*/
|
|
|
|
// returns false and sets g_errno on error
|
|
bool Sections::addVotes ( SectionVotingTable *nsvt , uint32_t tagPairHash ) {
|
|
|
|
// . add our SV_TAG_PAIR hash to the m_nsvt
|
|
// . every page adds this one
|
|
// . but don't screw up m_nsvt if we are deserializing from it
|
|
if ( ! nsvt->addVote2(tagPairHash,SV_TAGPAIRHASH,1))
|
|
return false;
|
|
|
|
// . add our checksum votes as well
|
|
// . we use this for menu identification between web pages
|
|
for ( Section *sn = m_firstSent ; sn ; sn = sn->m_nextSent ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// sanity check
|
|
if ( ! sn->m_sentenceContentHash64 ) { char *xx=NULL;*xx=0; }
|
|
// add the tag hash too!
|
|
if ( ! nsvt->addVote3 ( sn->m_turkTagHash32 ,
|
|
SV_TURKTAGHASH ,
|
|
1.0 , // score
|
|
1.0 ,
|
|
true ) )
|
|
return false;
|
|
// skip if not a section with its own words
|
|
//if ( sn->m_flags & SEC_NOTEXT ) continue;
|
|
// . combine the tag hash with the content hash #2
|
|
// . for some reason m_contentHash is 0 for like menu-y sectns
|
|
int32_t modified = sn->m_turkTagHash32;
|
|
modified ^= sn->m_sentenceContentHash64;
|
|
// now we use votehash32 which ignores dates and numbers
|
|
//int32_t modified = sn->m_turkTagHash32 ^ sn->m_voteHash32;
|
|
// . update m_nsvt voting table now
|
|
// . the tagHash is the content hash for this one
|
|
// . this will return false with g_errno set
|
|
if ( ! nsvt->addVote3 ( modified ,
|
|
SV_TAGCONTENTHASH ,
|
|
1.0 , // score
|
|
1.0 ,
|
|
true ) )
|
|
return false;
|
|
}
|
|
|
|
// success!
|
|
return true;
|
|
}
|
|
|
|
// a fresh table has not counted any site voters yet
SectionVotingTable::SectionVotingTable ( )
	: m_totalSiteVoters ( 0 ) {
}
|
|
|
|
// . returns false if blocked, returns true and sets g_errno on error
|
|
// . compile the datedb list into a hash table
|
|
// . we serialize this hash table into TitleRec::ptr_sectionsData during
|
|
// the reindexing process to ensure consistency
|
|
//bool Sections::gotSectiondbList ( bool *needsRecall ) {
|
|
bool SectionVotingTable::addListOfVotes ( RdbList *list,
|
|
key128_t **lastKey ,
|
|
uint32_t tagPairHash ,
|
|
int64_t myDocId ,
|
|
int32_t niceness ) {
|
|
|
|
int64_t lastDocId = 0LL;
|
|
int64_t lastsh48 = 0LL;
|
|
|
|
// . tally the votes
|
|
// . the "flags" are more significant than the tag hash because
|
|
// we don't want a single tag hash dominating our list, even though
|
|
// that can still happen!
|
|
// . TODO: limit indexing of taghashes that are well represented!
|
|
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
|
|
// breathe
|
|
QUICKPOLL ( niceness );
|
|
// get rec
|
|
char *rec = list->getCurrentRec();
|
|
// get the key ptr, key128_t
|
|
*lastKey = (key128_t *)rec;
|
|
// for ehre
|
|
key128_t *key = (key128_t *)rec;
|
|
// the score is the bit which is was set in Section::m_flags
|
|
// for that docid
|
|
int32_t secType = g_indexdb.getScore ( (char *)key );
|
|
// treat key like a datedb key and get the taghash
|
|
uint32_t turkTagHash32 = g_datedb.getDate ( key );
|
|
// get this
|
|
int64_t d = g_datedb.getDocId(key);
|
|
// skip this vote if from an old titlerec of ourselves!
|
|
if ( d == myDocId ) continue;
|
|
// sanity
|
|
int64_t sh48 = g_datedb.getTermId ( key );
|
|
if ( sh48 != lastsh48 && lastsh48 ) { char *xx=NULL;*xx=0;}
|
|
lastsh48 = sh48;
|
|
|
|
// now for every docid we index one sectiondb key with a
|
|
// tagHash of zero so we can keep tabs on how many section
|
|
// voters we have. we need to limit the # of voters per site
|
|
// otherwise we have a huge sectiondb explosion.
|
|
if ( turkTagHash32 == 0 ) {
|
|
// now that we do compression we strip these votes
|
|
// in Msg0.cppp
|
|
//char *xx=NULL;*xx=0;
|
|
m_totalSiteVoters++;
|
|
//log("sect: got site voter");
|
|
// sanity check - make sure only one per docid
|
|
if ( d == lastDocId ) { char *xx=NULL;*xx=0; }
|
|
lastDocId = d;
|
|
continue;
|
|
}
|
|
// . the enum tag hash is the tag pair hash here as well
|
|
// . let's us know who is waiting for a quick respider once
|
|
// enough docs from this site and with this particular
|
|
// tag pair hash (similar page layout) have been indexed
|
|
// and had their votes added to sectiondb
|
|
// . then we can make better decisions as to what sections
|
|
// are repeated or menu or comment sections by comparing
|
|
// to these other documents
|
|
//if ( secType == SV_WAITINLINE ) {
|
|
// // only count him if he is our same tag layout
|
|
// if ( tagHash == m_tagPairHash ) m_numLineWaiters++;
|
|
// // do not add this vote to m_osvt, it is special
|
|
// continue;
|
|
//}
|
|
// if we do not have this in our list of tagHashes then
|
|
// skip it. we do not want votes for what we do not have.
|
|
// this would just bloat m_osvt which we serialize into
|
|
// TitleRec::ptr_sectionsData
|
|
//if ( ! m_ot.isInTable ( &tagHash ) ) continue;
|
|
// get data/vote from the current record in the sectiondb list
|
|
SectionVote *sv = (SectionVote *)list->getCurrentData ( );
|
|
// get the avg score for this docid
|
|
float avg = 0;
|
|
if ( sv->m_numSampled > 0 )
|
|
avg = sv->m_score / sv->m_numSampled;
|
|
//
|
|
// PERFORMANCE HACK
|
|
//
|
|
// now we compile large sectiondb termlists in msg0.cpp
|
|
// which means we fold keys of the same tag type and
|
|
// hash together and increment their
|
|
// "SectionVote::m_numSampled" so that for the case of
|
|
// tags of type contenthash and taghash the m_numSampled
|
|
// value is the NUMBER OF DOCUMENTS that have that
|
|
// contenthash/taghash... but for the other tag types it is
|
|
// just the number of tags in that one document that have
|
|
// that contenthash/taghash.... otherwise, before this hack
|
|
// we were getting HUGE termlists for a handful of sites
|
|
// totally slowing us down when rebuilding the index using
|
|
// the "repair" tab.
|
|
//
|
|
float numSampled = sv->m_numSampled;
|
|
|
|
// incorporate this vote into m_osvt. the "old" voting table.
|
|
if ( ! addVote3 ( turkTagHash32 ,
|
|
secType ,
|
|
avg , // score
|
|
numSampled )) // numSampled
|
|
return true;
|
|
}
|
|
|
|
// we are done
|
|
return true;
|
|
}
|
|
|
|
// this is a function because we also call it from addImpliedSections()!
|
|
void Sections::setNextBrotherPtrs ( bool setContainer ) {
|
|
|
|
// clear out
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// no! when adding implied sections we need to recompute
|
|
// brothers for these tags to avoid an infinite loop in
|
|
// adding implied sections forever
|
|
// allow columns to keep theirs! since we set in
|
|
// swoggleTable() below
|
|
//if ( si->m_tagId == TAG_TC ) continue;
|
|
// or if a swoggled table cell
|
|
//if ( si->m_tagId == TAG_TH ) continue;
|
|
//if ( si->m_tagId == TAG_TD ) continue;
|
|
si->m_nextBrother = NULL;
|
|
si->m_prevBrother = NULL;
|
|
}
|
|
|
|
|
|
//for ( int32_t i = 0 ; i + 1 < m_numSections ; i++ ) {
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip column sections. their m_a and m_b overlap with
|
|
// one another which triggers a seg fault below... and
|
|
// we already set their m_next/prevBrother members in the
|
|
// swoggleTable() function
|
|
//if ( si->m_tagId == TAG_TC ) continue;
|
|
// int16_tcut
|
|
//Section *si = &m_sections[i];
|
|
// assume none
|
|
//si->m_nextBrother = NULL;
|
|
// crap this is overwriting our junk
|
|
//si->m_prevBrother = NULL;
|
|
// . skip if not a section with its own words
|
|
// . no, i need this for Events.cpp::getRegistrationTable()
|
|
//if ( si->m_flags & SEC_NOTEXT ) continue;
|
|
// must be a menu
|
|
//if ( ! ( si->m_flags & SEC_DUP ) ) continue;
|
|
// must NOT have a unique tag hash
|
|
//if ( si->m_numOccurences <= 1 ) continue;
|
|
//Section *sj = m_sectionPtrs[wn];
|
|
Section *sj = NULL;
|
|
|
|
// get word after us
|
|
int32_t wn = si->m_b;
|
|
int32_t nw2 = m_nw;
|
|
|
|
// if we hit a word in our parent.. then increment wn
|
|
// PROBLEM "<root><t1>hey</t1> blah blah blah x 1 mill</root>"
|
|
// would exhaust the full word list when si is the "t1"
|
|
// section.
|
|
Section *j2 = si->m_next;
|
|
if ( j2 && j2->m_a >= si->m_b ) {
|
|
sj = j2;
|
|
nw2 = 0;
|
|
}
|
|
|
|
// try one more ahead for things like so we don't end up
|
|
// setting sj to the "t2" section as in:
|
|
// "<root><t1><t2>hey</t2></t1> ...."
|
|
if ( ! sj && j2 ) {
|
|
// try the next section then
|
|
j2 = j2->m_next;
|
|
// set "sj" if its a potential brother section
|
|
if ( j2 && j2->m_a >= si->m_b ) {
|
|
sj = j2;
|
|
nw2 = 0;
|
|
}
|
|
}
|
|
|
|
// ok, try the next word algo approach
|
|
for ( ; wn < nw2 ; wn++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
sj = m_sectionPtrs[wn];
|
|
if ( sj->m_a >= si->m_b ) break;
|
|
}
|
|
// bail if none
|
|
if ( wn >= m_nw ) continue;
|
|
|
|
//
|
|
// NO! this is too slow!!!!! use the above loop!!!!
|
|
// yeah, in certain cases it is horrible... like if si
|
|
// is the root! then we gotta loop through all sections here
|
|
// until sj is NULL!
|
|
//
|
|
// try using sections, might be faster and it is better
|
|
// for our table column sections which have their td
|
|
// kids in an interlaced ordering
|
|
//for ( sj = si->m_next; sj ; sj = sj->m_next )
|
|
// if ( sj->m_a >= si->m_b ) break;
|
|
|
|
// telescope up until brother if possible
|
|
for ( ; sj ; sj = sj->m_parent )
|
|
if ( sj->m_parent == si->m_parent ) break;
|
|
// give up?
|
|
if ( ! sj || sj->m_parent != si->m_parent ) continue;
|
|
// sanity check
|
|
if ( sj->m_a < si->m_b &&
|
|
sj->m_tagId != TAG_TC &&
|
|
si->m_tagId != TAG_TC ) {
|
|
char *xx=NULL;*xx=0; }
|
|
// set brother
|
|
si->m_nextBrother = sj;
|
|
// set his prev then
|
|
sj->m_prevBrother = si;
|
|
// sanity check
|
|
if ( sj->m_parent != si->m_parent ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
if ( sj->m_a < si->m_b &&
|
|
sj->m_tagId != TAG_TC &&
|
|
si->m_tagId != TAG_TC ) {
|
|
char *xx=NULL;*xx=0; }
|
|
// do more?
|
|
if ( ! setContainer ) continue;
|
|
// telescope this
|
|
Section *te = sj;
|
|
// telescope up until it contains "si"
|
|
for ( ; te && te->m_a > si->m_a ; te = te->m_parent );
|
|
// only update list container if smaller than previous
|
|
if ( ! si->m_listContainer )
|
|
si->m_listContainer = te;
|
|
else if ( te->m_a > si->m_listContainer->m_a )
|
|
si->m_listContainer = te;
|
|
if ( ! sj->m_listContainer )
|
|
sj->m_listContainer = te;
|
|
else if ( te->m_a > sj->m_listContainer->m_a )
|
|
sj->m_listContainer = te;
|
|
|
|
// now
|
|
}
|
|
}
|
|
|
|
|
|
void Sections::setNextSentPtrs ( ) {
|
|
|
|
// kinda like m_rootSection
|
|
m_firstSent = NULL;
|
|
m_lastSent = NULL;
|
|
|
|
Section *finalSec = NULL;
|
|
Section *lastSent = NULL;
|
|
// scan the sentence sections and number them to set m_sentNum
|
|
for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// record final section
|
|
finalSec = sk;
|
|
// set this
|
|
sk->m_prevSent = lastSent;
|
|
// need sentence
|
|
if ( ! ( sk->m_flags & SEC_SENTENCE ) ) continue;
|
|
// first one?
|
|
if ( ! m_firstSent ) m_firstSent = sk;
|
|
// we are the sentence now
|
|
lastSent = sk;
|
|
}
|
|
// update
|
|
m_lastSent = lastSent;
|
|
// reset this
|
|
lastSent = NULL;
|
|
// now set "m_nextSent" of each section
|
|
for ( Section *sk = finalSec ; sk ; sk = sk->m_prev ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// set this
|
|
sk->m_nextSent = lastSent;
|
|
// need sentence
|
|
if ( ! ( sk->m_flags & SEC_SENTENCE ) ) continue;
|
|
// we are the sentence now
|
|
lastSent = sk;
|
|
}
|
|
}
|
|
|
|
// returns false and sets g_errno on error
|
|
bool SectionVotingTable::addVote3 ( int32_t turkTagHash ,
|
|
int32_t sectionType ,
|
|
float score ,
|
|
float numSampled ,
|
|
bool hackFix ) {
|
|
|
|
HashTableX *ttt = &m_svt;
|
|
// or in bitnum
|
|
//int64_t vk = turkTagHash;
|
|
uint64_t vk = sectionType;
|
|
// make room for bitnum
|
|
vk <<= 32;
|
|
// combine with turkTagHash
|
|
vk |= (uint32_t)turkTagHash;
|
|
//vk |= sectionType;
|
|
|
|
// sanity
|
|
if ( sectionType < 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// print out for debug
|
|
//log("section: adding vote #%"INT32") th=%"UINT32"-%"INT32"",
|
|
// m_numVotes++,tagHash,sectionType);
|
|
|
|
// get existing vote statistics for this vk from hash table
|
|
int32_t slot = ttt->getSlot ( &vk );
|
|
// return true with g_errno set on error
|
|
if ( slot < 0 ) {
|
|
SectionVote nv;
|
|
nv.m_score = score;
|
|
nv.m_numSampled = numSampled;
|
|
return ttt->addKey ( &vk , &nv );
|
|
}
|
|
// get the current voting statistics in table
|
|
SectionVote *sv = (SectionVote *)ttt->getValueFromSlot ( slot );
|
|
|
|
// incorporate this voter's voting info
|
|
sv->m_score += score;
|
|
sv->m_numSampled += numSampled;
|
|
|
|
// keep this to one
|
|
if ( hackFix )
|
|
sv->m_numSampled = 1;
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
// . no longer use single bit flags, sec_t
|
|
// . just use enumerated section types now
|
|
// . each section type has a score and number sampled to get that score
|
|
// . returns -1 if no data
|
|
// . otherwise returns score from 0.0 to 1.0 which is probability that
|
|
// sections with the given tagHash are of type "sectionType" where
|
|
// "sectionType" is like SEC_TEXTY, SEC_DUP, SEC_CLOCK, etc.
|
|
float SectionVotingTable::getScore ( int32_t turkTagHash , int32_t sectionType ) {
|
|
// make the vote key
|
|
int64_t vk = ((uint64_t)sectionType) << 32 | (uint32_t)turkTagHash;
|
|
//int64_t vk = ((uint64_t)tagHash) << 32 | (uint32_t)sectionType;
|
|
// get these
|
|
SectionVote *sv = (SectionVote *)m_svt.getValue ( &vk );
|
|
//SectionVote *osv = (SectionVote *)m_osvt.getValue ( &vk );
|
|
//SectionVote *nsv = (SectionVote *)m_nsvt.getValue ( &vk );
|
|
// return -1.0 if no voting data for this guy
|
|
//if ( ! osv && ! nsv ) return -1.0;
|
|
if ( ! sv ) return -1.0;
|
|
// combine
|
|
float score = 0.0;
|
|
float numSampled = 0.0;
|
|
if ( sv ) score += sv->m_score;
|
|
if ( sv ) numSampled += sv->m_numSampled;
|
|
// . only count ourselves as one sample
|
|
// . in the "old" voting table a single sample is a doc, as opposed
|
|
// to in the new voting table where we have one sample count every
|
|
// time the turkTagHash or contentHash occurs on the page.
|
|
//if ( nsv ) numSampled++;
|
|
// and use an average for the score
|
|
//if ( nsv ) score += nsv->m_score / nsv->m_numSampled;
|
|
// sanity check
|
|
if ( numSampled <= 0.0 ) { char *xx=NULL;*xx=0; }
|
|
// normalize
|
|
return score / numSampled;
|
|
}
|
|
*/
|
|
|
|
/*
|
|
float SectionVotingTable::getOldScore ( Section *sn , int32_t sectionType ) {
|
|
// make the vote key
|
|
int64_t vk = ((uint64_t)sectionType) << 32|(uint32_t)sn->m_tagHash;
|
|
//int64_t vk=((uint64_t)sn->m_tagHash) << 32 | (uint32_t)sectionType;
|
|
// get these
|
|
SectionVote *osv = (SectionVote *)m_osvt.getValue ( &vk );
|
|
// return -1.0 if no voting data for this guy
|
|
if ( ! osv ) return -1.0;
|
|
// combine
|
|
float score = 0.0;
|
|
float numSampled = 0.0;
|
|
if ( osv ) score += osv->m_score;
|
|
if ( osv ) numSampled += osv->m_numSampled;
|
|
// sanity check
|
|
if ( numSampled <= 0.0 ) { char *xx=NULL;*xx=0; }
|
|
// normalize
|
|
return score / numSampled;
|
|
}
|
|
|
|
float SectionVotingTable::getNewScore ( Section *sn , int32_t sectionType ) {
|
|
// make the vote key
|
|
int64_t vk = ((uint64_t)sectionType) << 32|(uint32_t)sn->m_tagHash;
|
|
//int64_t vk=((uint64_t)sn->m_tagHash) << 32 | (uint32_t)sectionType;
|
|
// get these
|
|
SectionVote *nsv = (SectionVote *)m_nsvt.getValue ( &vk );
|
|
// return -1.0 if no voting data for this guy
|
|
if ( ! nsv ) return -1.0;
|
|
// combine
|
|
float score = 0.0;
|
|
float numSampled = 0.0;
|
|
if ( nsv ) score += nsv->m_score;
|
|
if ( nsv ) numSampled += nsv->m_numSampled;
|
|
// sanity check
|
|
if ( numSampled <= 0.0 ) { char *xx=NULL;*xx=0; }
|
|
// normalize
|
|
return score / numSampled;
|
|
}
|
|
*/
|
|
|
|
// just like getScore() above basically
|
|
float SectionVotingTable::getNumSampled ( int32_t turkTagHash, int32_t sectionType) {
|
|
// make the vote key
|
|
int64_t vk = ((uint64_t)sectionType) << 32 | (uint32_t)turkTagHash;
|
|
//int64_t vk = ((uint64_t)tagHash) << 32 | (uint32_t)sectionType;
|
|
// get these
|
|
SectionVote *sv = (SectionVote *)m_svt.getValue ( &vk );
|
|
//SectionVote *osv = (SectionVote *)m_osvt.getValue ( &vk );
|
|
//SectionVote *nsv = (SectionVote *)m_nsvt.getValue ( &vk );
|
|
// return 0.0 if no voting data for this guy
|
|
//if ( ! osv && ! nsv ) return 0.0;
|
|
if ( ! sv ) return 0.0;
|
|
// combine
|
|
float numSampled = 0.0;
|
|
if ( sv ) numSampled += sv->m_numSampled;
|
|
// . only count ourselves as one sample
|
|
// . in the "old" voting table a single sample is a doc, as opposed
|
|
// to in the new voting table where we have one sample count every
|
|
// time the tagHash or contentHash occurs on the page.
|
|
//if ( nsv ) numSampled++;
|
|
// sanity check
|
|
if ( numSampled <= 0.0 ) { char *xx=NULL;*xx=0; }
|
|
// normalize
|
|
return numSampled;
|
|
}
|
|
|
|
/*
|
|
// just like getScore() above basically
|
|
float SectionVotingTable::getOldNumSampled ( Section *sn , int32_t sectionType ) {
|
|
// make the vote key
|
|
int64_t vk=((uint64_t)sectionType) << 32 | (uint32_t)sn->m_tagHash;
|
|
//int64_t vk=((uint64_t)sn->m_tagHash) << 32 | (uint32_t)sectionType;
|
|
// get these
|
|
SectionVote *osv = (SectionVote *)m_osvt.getValue ( &vk );
|
|
// return 0.0 if no voting data for this guy
|
|
if ( ! osv ) return 0.0;
|
|
// combine
|
|
float numSampled = 0.0;
|
|
if ( osv ) numSampled += osv->m_numSampled;
|
|
// sanity check
|
|
if ( numSampled <= 0.0 ) { char *xx=NULL;*xx=0; }
|
|
// normalize
|
|
return numSampled;
|
|
}
|
|
*/
|
|
/*
|
|
// . returns false if no article
|
|
// . otherwise sets a and b to the range in word #'s and returns true
|
|
void Sections::getArticleRange ( int32_t *start , int32_t *end ) {
|
|
// assume no section
|
|
*start = -1;
|
|
*end = -1;
|
|
// if no article, skip all!
|
|
if ( ! m_hadArticle ) return ;
|
|
// return if we got it
|
|
if ( m_articleEndWord != -2 ) {
|
|
*start = m_articleStartWord;
|
|
*end = m_articleEndWord;
|
|
return;
|
|
}
|
|
int32_t a = -1;
|
|
int32_t b = -1;
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// shortcut
|
|
Section *sn = &m_sections[i];
|
|
// skip if not article section
|
|
if ( ! ( sn->m_flags & SEC_ARTICLE ) ) continue;
|
|
// keep a range going
|
|
if ( sn->m_a < a || a == -1 ) a = sn->m_a;
|
|
// and the end of the range
|
|
if ( sn->m_b > b || b == -1 ) b = sn->m_b;
|
|
}
|
|
// set them
|
|
m_articleStartWord = a;
|
|
m_articleEndWord = b;
|
|
// sanity check. if m_hadArticle was set, we should have something
|
|
if ( (a == -1 || b == -1) && m_hadArticle ) { char *xx=NULL;*xx=0; }
|
|
// set the range to return it
|
|
*start = m_articleStartWord;
|
|
*end = m_articleEndWord;
|
|
}
|
|
*/
|
|
|
|
// . store our "votes" into datedb
|
|
// . each vote key is:
|
|
// siteAndTagPairHash(termId)|tagHash(date)|secTypeEnum(score)|docId
|
|
// . to index a tag hash the section's flags must NOT have SEC_NOTEXT set.
|
|
// . TODO: do not index a key if the secTypeEnum (section type) is already well
|
|
// represented in list we read from datedb
|
|
// . returns false and sets g_errno on error
|
|
// . returns true on success
|
|
bool SectionVotingTable::hash ( int64_t docId ,
|
|
HashTableX *dt ,
|
|
uint64_t siteHash64 ,
|
|
int32_t niceness ) {
|
|
|
|
// let's try skipping this always for now and going without using
|
|
// sectiondb. we do not use it for the test parser anyway and we
|
|
// seem to do ok... but verify that!
|
|
//return true;
|
|
|
|
// . do not index more recs to sectiondb if we have enough!
|
|
// . this is now in XmlDoc.cpp
|
|
//if ( m_totalSiteVoters >= MAX_SITE_VOTERS ) {
|
|
// //log("sect: got %"INT32" site voters. skipping.",m_totalSiteVoters);
|
|
// return true;
|
|
//}
|
|
|
|
// sanity check
|
|
if ( dt->m_ks != sizeof(key128_t) ) { char *xx=NULL;*xx=0; }
|
|
if ( dt->m_ds != sizeof(SectionVote) ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
if ( getKeySizeFromRdbId ( RDB_SECTIONDB ) != 16 ){char*xx=NULL;*xx=0;}
|
|
// if we did not have enough voters from our site with our same
|
|
// page layout, then wait in line. when we get enough such voters
|
|
// we will be respidered quickly theoretically.
|
|
// MDW: disable this for now since it is causing a parsing
|
|
// inconsistency since when we re-read the list from sectiondb
|
|
// for the parsing inconsistency check in XmlDoc.cpp, it sometimes
|
|
// sets m_waitInLine to true... which is strange... wtf?
|
|
// guilty url: www.lis.illinois.edu/newsroom/events?dt=2011-01-15
|
|
uint64_t termId = siteHash64 & TERMID_MASK;
|
|
|
|
// . now for tallying the # of voters per site we use tagHash of 0
|
|
// . we use this to check for a MAX_SITE_VOTERS breach in XmlDoc.cpp
|
|
key128_t k = g_datedb.makeKey ( termId ,
|
|
// 0 is the tagHash
|
|
0 ,
|
|
// score/secType
|
|
SV_SITE_VOTER ,
|
|
docId ,
|
|
false );
|
|
// make a fake section vote
|
|
SectionVote sv;
|
|
sv.m_score = 0;
|
|
sv.m_numSampled = 0;
|
|
// . hash it
|
|
// . returns false and sets g_errno on error
|
|
if ( ! dt->addKey ( &k , &sv ) ) return false;
|
|
|
|
|
|
// . add all votes in m_nsvt into Sectiondb
|
|
// . we don't add m_osvt because we got those from Sectiondb!
|
|
// . however, we do serialize both tables in the TitleRec and we
|
|
// only serialize m_nsvt right now because of SEC_DUP having
|
|
// been computed by volatile Msg20s.. to ensure parsing consistency
|
|
for ( int32_t i = 0 ; i < m_svt.m_numSlots ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( niceness );
|
|
// skip if empty
|
|
if ( m_svt.m_flags[i] == 0 ) continue;
|
|
// get key
|
|
uint64_t vk = *(uint64_t *)m_svt.getKey(i);
|
|
// get section type from key
|
|
int32_t sectionType = vk >> 32;
|
|
//int32_t sectionType = vk & 0xffffffff;
|
|
// sanity check
|
|
if ( sectionType > 255 ) { char *xx=NULL;*xx=0; }
|
|
if ( sectionType < 0 ) { char *xx=NULL;*xx=0; }
|
|
// and tagHash from key
|
|
uint32_t secHash32 = vk & 0xffffffff;
|
|
// check for "what"
|
|
//if ( tagHash == (uint32_t)-1024871986 )
|
|
// log("hey");
|
|
//int32_t tagHash = vk >> 32;
|
|
// make the record key first
|
|
key128_t k;
|
|
k = g_datedb.makeKey ( termId , // termId
|
|
secHash32 , // date field
|
|
sectionType , // score
|
|
docId ,
|
|
false ); // delete key?
|
|
// the data of the record is just the SectionVote
|
|
SectionVote *sv = (SectionVote *)m_svt.getValueFromSlot(i);
|
|
// sanity check. this is reserved for line waiters i guess
|
|
if ( sv->m_numSampled == 0 ) { char *xx=NULL;*xx=0; }
|
|
// sanity check
|
|
uint64_t sh = g_datedb.getTermId(&k);
|
|
if ( sh != termId ) { char *xx=NULL;*xx=0; }
|
|
int64_t d = g_datedb.getDocId(&k);
|
|
if ( d != docId ) { char *xx=NULL;*xx=0; }
|
|
// this returns false and sets g_errno on error
|
|
if ( ! dt->addKey ( &k , sv ) ) return false;
|
|
// log this for now! last hash is the date format hash!
|
|
//logf(LOG_DEBUG,"section: added tagHash=0x%"XINT32" "
|
|
// "sectionType=%"INT32" score=%.02f numSampled=%.02f",
|
|
// tagHash,sectionType,sv->m_score,sv->m_numSampled);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
// add docid-based forced spider recs into the metalist
|
|
char *Sections::respiderLineWaiters ( char *metaList ,
|
|
char *metaListEnd ) {
|
|
// these are from the parent
|
|
//Url *url ,
|
|
//int32_t ip ,
|
|
//int32_t priority ) {
|
|
|
|
// shortcut
|
|
char *p = metaList;
|
|
char *pend = metaListEnd;
|
|
|
|
// not if we ourselves had to wait in line! that means there were not
|
|
// enough voters!
|
|
if ( m_waitInLine ) return p;
|
|
|
|
// MDW: disable this for now since it is causing a parsing
|
|
// inconsistency since when we re-read the list from sectiondb
|
|
// for the parsing inconsistency check in XmlDoc.cpp, it sometimes
|
|
// sets m_waitInLine to true... which is strange... wtf?
|
|
// guilty url: www.lis.illinois.edu/newsroom/events?dt=2011-01-15
|
|
return p;
|
|
|
|
//int32_t now = getTimeSynced();
|
|
|
|
// host hash
|
|
//int32_t h = hash32 ( url->getHost() , url->getHostLen() );
|
|
// make sure not 0
|
|
//if ( h == 0 ) h = 1;
|
|
|
|
// now add the line waiters into spiderdb as forced spider recs
|
|
for ( ; ! m_list.isExhausted() ; m_list.skipCurrentRecord() ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// does it match us?
|
|
int32_t tagPairHash = m_list.getCurrentDate();
|
|
// skip if not line waiter
|
|
if ( tagPairHash != m_tagPairHash ) continue;
|
|
// key128_t
|
|
key128_t *key = (key128_t *)m_list.getCurrentRec();
|
|
// get section type of this sectiondb vote/key
|
|
int32_t secType = g_indexdb.getScore ( (char *)key );
|
|
// must be this
|
|
if ( secType != SV_WAITINLINE ) continue;
|
|
// get docid
|
|
int64_t docId = m_list.getCurrentDocId();
|
|
// store the rdbId
|
|
*p++ = RDB_SPIDERDB;
|
|
// . store in the meta list
|
|
// . similar to how we do it in Links::setMetaList()
|
|
SpiderRequest *sreq = (SpiderRequest *)p;
|
|
// reset it
|
|
sreq->reset();
|
|
// now set these
|
|
sreq->m_fromSections = 1;
|
|
sreq->m_urlIsDocId = 1;
|
|
sreq->m_fakeFirstIp = 1;
|
|
// copy url
|
|
sprintf(sreq->m_url,"%"UINT64"",docId);
|
|
// fake
|
|
int32_t firstIp = hash32n(sreq->m_url);
|
|
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
|
|
sreq->m_firstIp = firstIp;
|
|
// set the key!
|
|
sreq->setKey( firstIp, m_docId , false );
|
|
// advance p
|
|
p += sreq->getRecSize();
|
|
// sanity check
|
|
if ( p > pend ) { char *xx=NULL;*xx=0; }
|
|
// debug
|
|
logf(LOG_DEBUG,"section: respider line waiter d=%"INT64"",docId);
|
|
}
|
|
return p;
|
|
}
|
|
*/
|
|
// . row interval at which the html debug tables are broken up
// . every time the running row count hits a multiple of this, the printing
//   code closes the current <table> and starts a new one with the same
//   header, to see if one big table causes a browser slowdown
// . NOTE: the only uses visible near here are in the commented-out
//   Sections::print() below
#define TABLE_ROWS 25
|
|
|
|
// . print it out
|
|
// . we can't just loop over the sections because some sections have
|
|
// words between the subsections they contain,
|
|
// . so we have to loop over the individual "words"
|
|
/*
|
|
bool Sections::print ( SafeBuf *sbuf ,
|
|
HashTableX *pt ,
|
|
HashTableX *et ,
|
|
HashTableX *st2 ,
|
|
HashTableX *at ,
|
|
HashTableX *tt ,
|
|
//HashTableX *rt ,
|
|
HashTableX *priceTable ) {
|
|
|
|
sbuf->safePrintf("<b>Sections in Document</b>\n");
|
|
|
|
// section sanity checks
|
|
for ( int32_t i =0 ; i < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get section
|
|
Section *sn = &m_sections[i]; // Ptrs[i];
|
|
// get parent
|
|
Section *sp = sn->m_parent;
|
|
if ( ! sp ) continue;
|
|
// check if contained
|
|
if ( sn->m_a < sp->m_a ) { char *xx=NULL;*xx=0; }
|
|
if ( sn->m_b > sp->m_b ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
char **wptrs = m_words->getWords ();
|
|
int32_t *wlens = m_words->getWordLens ();
|
|
nodeid_t *tids = m_words->getTagIds();
|
|
int32_t nw = m_words->getNumWords ();
|
|
|
|
// check words
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get section
|
|
Section *sn = m_sectionPtrs[i];
|
|
if ( sn->m_a > i ) { char *xx=NULL;*xx=0; }
|
|
if ( sn->m_b <= i ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
bool hadWords = false;
|
|
Section *lastSection = NULL; // TagHash = -1;
|
|
Section *dstack[MAXTAGSTACK];
|
|
int32_t ns = 0;
|
|
int32_t printedi = -1;
|
|
//sbuf->safePrintf("<pre>\n");
|
|
// first print the html lines out
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get section
|
|
Section *sn = m_sectionPtrs[i];
|
|
// print if ending
|
|
if ( ns>0 && dstack[ns-1]->m_b == i && ! hadWords )
|
|
sbuf->safePrintf(" ");
|
|
// if this word is punct print that out
|
|
if ( ! m_wids[i] && ! m_tids[i] ) {
|
|
sbuf->safeMemcpy(m_wptrs[i],m_wlens[i]);
|
|
// do not reprint
|
|
printedi = i;
|
|
}
|
|
// punch out some </divs>
|
|
for ( ; ns>0 && dstack[ns-1]->m_b == i ; ns-- ) {
|
|
//sbuf->safePrintf("</div id=0x%"XINT32">\n",
|
|
//(int32_t)dstack[ns-1]);
|
|
sbuf->safePrintf("</div>\n");
|
|
}
|
|
// . put in a div if its a new section changing front tag
|
|
// . section might not start with a tag, like the bullet
|
|
if ( //tids[i] &&
|
|
//! ( tids[i] & BACKBIT ) &&
|
|
sn->m_a == i &&
|
|
lastSection != sn ) {
|
|
// punch out some </divs>
|
|
//for ( ; ns>0 && dstack[ns-1] == i ; ns-- )
|
|
// sbuf->safePrintf("</div>\n");
|
|
sbuf->safePrintf("<br>");
|
|
// store the sections in this array
|
|
Section *spbuf[20];
|
|
// one div per parent!
|
|
Section *sp = sn;
|
|
int32_t qq = 0;
|
|
for ( ; sp && sp->m_a == sn->m_a ; sp = sp->m_parent )
|
|
spbuf[qq++] = sp;
|
|
// sanity check
|
|
if ( qq > 20 ) { char *xx=NULL;*xx=0; }
|
|
// flag for printing
|
|
//bool printed = false;
|
|
// for sanity check
|
|
Section *lastk = NULL;
|
|
bool printFirstWord = false;
|
|
// now reverse loop for div sections
|
|
for ( int32_t k = qq-1 ; k >= 0 ; k-- ) {
|
|
// get it
|
|
Section *sk = spbuf[k];
|
|
// must be good
|
|
if ( lastk &&
|
|
( sk->m_a > lastk->m_a ||
|
|
sk->m_b > lastk->m_b )) {
|
|
char *xx=NULL;*xx=0; }
|
|
// save it
|
|
lastk = sk;
|
|
// flag
|
|
if ( sk->m_flags & SEC_SENTENCE )
|
|
printFirstWord = true;
|
|
// only make font color different
|
|
int32_t bcolor = (int32_t)sk->m_tagHash& 0x00ffffff;
|
|
int32_t fcolor = 0x000000;
|
|
int32_t rcolor = 0x000000;
|
|
uint8_t *bp = (uint8_t *)&bcolor;
|
|
bool dark = false;
|
|
if ( bp[0]<128 && bp[1]<128 && bp[2]<128 )
|
|
dark = true;
|
|
// or if two are less than 50
|
|
if ( bp[0]<100 && bp[1]<100 ) dark = true;
|
|
if ( bp[1]<100 && bp[2]<100 ) dark = true;
|
|
if ( bp[0]<100 && bp[2]<100 ) dark = true;
|
|
// if bg color is dark, make font color light
|
|
if ( dark ) {
|
|
fcolor = 0x00ffffff;
|
|
rcolor = 0x00ffffff;
|
|
}
|
|
// start the new div
|
|
//sbuf->safePrintf("<div id=0x%"XINT32" "//e%"INT32" "
|
|
sbuf->safePrintf("<div "
|
|
"style=\""
|
|
"background-color:#%06"XINT32";"
|
|
"margin-left:20px;"
|
|
"border:#%06"XINT32" 1px solid;"
|
|
"color:#%06"XINT32"\">",
|
|
//(int32_t)sk,
|
|
bcolor,
|
|
rcolor,
|
|
fcolor);
|
|
// print event id range
|
|
if ( sk->m_minEventId >= 1 )
|
|
sbuf->safePrintf("%"INT32"-%"INT32" ",
|
|
sk->m_minEventId,
|
|
sk->m_maxEventId);
|
|
// push that
|
|
//if ( (sk->m_flags & SEC_FAKE) &&
|
|
// sk->m_b > sk->m_a + 1 )
|
|
// dstack[ns++] = sk->m_b - 1;
|
|
//else
|
|
dstack[ns++] = sk;//->m_b;
|
|
// print word/tag #i
|
|
if ( printedi<i && !(sk->m_flags&SEC_FAKE)) {
|
|
// only print the contents once
|
|
printedi = i; // = true;
|
|
// "<br>" might be a sequence of brs
|
|
int32_t last = i;
|
|
if ( tids[i] == TAG_BR )
|
|
last = sk->m_b - 1;
|
|
char *end = m_wptrs[last] +
|
|
m_wlens[last];
|
|
int32_t tlen = end - wptrs[i];
|
|
// only encode if it is a tag
|
|
if ( tids[i] )
|
|
sbuf->htmlEncode(wptrs[i],
|
|
tlen,
|
|
false );
|
|
//else
|
|
//sbuf->safePrintf(wptrs[i],
|
|
//wlens[i],
|
|
//false );
|
|
}
|
|
// print the flags
|
|
sbuf->safePrintf("<i>");
|
|
|
|
if ( sk )
|
|
sbuf->safePrintf("A=%"INT32" ",sk->m_a);
|
|
|
|
//sbuf->
|
|
//safePrintf("fwp=%"INT32" ",sk->m_firstWordPos);
|
|
|
|
// print tag hash now
|
|
if ( sk )
|
|
sbuf->safePrintf("hash=0x%"XINT32" ",
|
|
(int32_t)sk->m_tagHash);
|
|
|
|
if ( sk->m_contentHash)
|
|
sbuf->safePrintf("ch=0x%"XINT32" ",
|
|
(int32_t)sk->m_contentHash);
|
|
//else if ( sk->m_contentHash2 )
|
|
// sbuf->safePrintf("ch2=0x%"XINT32" ",
|
|
// (int32_t)sk->m_contentHash2);
|
|
else if ( sk->m_sentenceContentHash )
|
|
sbuf->safePrintf("sch=0x%"XINT32" ",
|
|
(int32_t)sk->m_sentenceContentHash);
|
|
|
|
|
|
// show dup votes if any
|
|
if ( sk->m_votesForDup )
|
|
sbuf->safePrintf("dupvotes=%"INT32" ",
|
|
sk->m_votesForDup);
|
|
if ( sk->m_votesForNotDup )
|
|
sbuf->safePrintf("notdupvotes=%"INT32" ",
|
|
sk->m_votesForNotDup);
|
|
|
|
printFlags ( sbuf , sk , false );
|
|
|
|
// print the item tables out
|
|
int64_t *ph = (int64_t *)pt->getValue(&sk);
|
|
int64_t *eh = (int64_t *)et->getValue(&sk);
|
|
//bool isInRt = false;
|
|
//if ( rt ) isInRt = rt->isInTable(&sk);
|
|
bool inPriceTable = false;
|
|
if ( priceTable )
|
|
inPriceTable =
|
|
priceTable->isInTable(&sk);
|
|
// get addr index ptr if any (could be mult)
|
|
int32_t acount = 0;
|
|
int64_t sh = 0LL;
|
|
if ( at ) {
|
|
int32_t slot = at->getSlot(&sk);
|
|
for(;slot>=0;
|
|
slot=at->getNextSlot(slot,&sk)) {
|
|
// get min
|
|
Place **pp;
|
|
pp = (Place **)at->
|
|
getValueFromSlot(slot);
|
|
// get hash
|
|
int64_t tt=(*pp)->m_hash;
|
|
// get max
|
|
if ( ! sh || tt > sh ) sh = tt;
|
|
// count them
|
|
acount++;
|
|
}
|
|
}
|
|
// date #
|
|
//int32_t *ti = (int32_t *)tt->getValue(&sk);
|
|
// print those out
|
|
if ( ph )
|
|
sbuf->safePrintf("hasphone ");
|
|
if ( eh )
|
|
sbuf->safePrintf("hasemail ");
|
|
//if ( isInRt )
|
|
// sbuf->safePrintf("hasregistration ");
|
|
|
|
if ( inPriceTable )
|
|
sbuf->safePrintf("hasprice ");
|
|
|
|
//if ( sk )
|
|
//sbuf->safePrintf("dh=0x%"XINT32" ",
|
|
//getDelimHash(METHOD_INNER_TAGID,sk));
|
|
|
|
|
|
if ( sh )
|
|
sbuf->safePrintf("placehash=0x%"XINT64"",
|
|
sh);
|
|
if ( sh && acount >= 2 )
|
|
sbuf->safePrintf(" (%"INT32" total)",
|
|
acount);
|
|
|
|
if ( isHardSection(sk) )
|
|
sbuf->safePrintf("hardsec ");
|
|
|
|
//if ( ph )
|
|
// sbuf->safePrintf("phonehash=%"XINT64" ",*ph);
|
|
//if ( eh )
|
|
// sbuf->safePrintf("emailhash=%"XINT64" ",*eh);
|
|
// consider actually print the address out
|
|
// if you want to, but don't show the pointer
|
|
// because it causes problems in the diff
|
|
// that the qa loop does.
|
|
// addr #
|
|
//if ( ai )
|
|
//sbuf->safePrintf("addrindex=%"INT32" ",*ai);
|
|
//if ( ai )
|
|
//sbuf->safePrintf("hasaddress ");
|
|
// tod #
|
|
//if ( ti )
|
|
// sbuf->safePrintf("hastod ");//,*ti);
|
|
//if ( sk->m_numAddresses > 0 )
|
|
// sbuf->safePrintf("na=%i ",
|
|
// sk->m_numAddresses);
|
|
//if ( sk->m_numPlaces > 0 )
|
|
// sbuf->safePrintf("np=%i ",
|
|
// sk->m_numPlaces);
|
|
sbuf->safePrintf("</i> ");
|
|
}
|
|
// assume had no words
|
|
hadWords = false;
|
|
|
|
if (printFirstWord && !tids[i] )
|
|
sbuf->htmlEncode(wptrs[i],wlens[i],false );
|
|
|
|
// update it
|
|
lastSection = sn;
|
|
continue;
|
|
}
|
|
// based on depth
|
|
//for ( int32_t k = 0 ;
|
|
// lastSection != sn && k < sn->m_depth + 1 ;
|
|
// k++ )
|
|
// sbuf->safePrintf("-");
|
|
// print word/tag #i
|
|
if ( tids[i] != TAG_BR && i>printedi )
|
|
sbuf->htmlEncode(wptrs[i],wlens[i],false );
|
|
// update it
|
|
lastSection = sn;
|
|
// assume had words
|
|
if ( m_wids[i] || tids[i] ) hadWords = true;
|
|
}
|
|
bool unbal = (ns > 0 );
|
|
//sbuf->safePrintf("</pre>\n");
|
|
for ( int32_t i = 0 ; i < ns ; i++ )
|
|
sbuf->safePrintf("</div>\n");
|
|
// sanity check
|
|
if ( unbal )
|
|
sbuf->safePrintf("<br><b>%"INT32" UNBALANCED SECTIONS</b><br><br>",
|
|
ns);
|
|
|
|
|
|
// print header
|
|
char *hdr =
|
|
"<table border=1>"
|
|
"<tr>"
|
|
"<td><b>sec #</b></td>"
|
|
"<td><b>baseHash</b></td>"
|
|
"<td><b>cumulTagHash</b></td>"
|
|
"<td><b>wordStart</b></td>"
|
|
"<td><b>wordEnd</b></td>"
|
|
"<td><b>contentHash</b></td>"
|
|
"<td><b>XOR</b></td>" // only valid for contentHashes
|
|
"<td><b>alnum words</b></td>" // contained in section
|
|
"<td><b>depth</b></td>"
|
|
"<td><b>parent word range</b></td>"
|
|
"<td><b># siblings</b></td>"
|
|
"<td><b>flags</b></td>"
|
|
"<td><b>evIds</b></td>"
|
|
"<td><b>text snippet</b></td>"
|
|
//"<td>votes for static</td>"
|
|
//"<td>votes for dynamic</td>"
|
|
//"<td>votes for texty</td>"
|
|
//"<td>votes for unique</td>"
|
|
"</tr>\n";
|
|
sbuf->safePrintf("%s",hdr);
|
|
|
|
int32_t rcount = 0;
|
|
int32_t scount = 0;
|
|
// show word # of each section so we can look in PageParser.cpp's
|
|
// output to see exactly where it starts, since we now label all
|
|
// the words
|
|
//for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// see if one big table causes a browser slowdown
|
|
if ( (++rcount % TABLE_ROWS ) == 0 )
|
|
sbuf->safePrintf("<!--ignore--></table>%s\n",hdr);
|
|
// get it
|
|
//Section *sn = &m_sections[i];
|
|
//Section *sn = m_sorted[i];
|
|
// skip if not a section with its own words
|
|
//if ( sn->m_flags & SEC_NOTEXT ) continue;
|
|
char *xs = "--";
|
|
char ttt[100];
|
|
if ( sn->m_contentHash ) {
|
|
int32_t modified = sn->m_tagHash ^ sn->m_contentHash;
|
|
sprintf(ttt,"0x%"XINT32"",modified);
|
|
xs = ttt;
|
|
}
|
|
// shortcut
|
|
Section *parent = sn->m_parent;
|
|
int32_t pswn = -1;
|
|
int32_t pewn = -1;
|
|
if ( parent ) pswn = parent->m_a;
|
|
if ( parent ) pewn = parent->m_b;
|
|
// print it
|
|
sbuf->safePrintf("<!--ignore--><tr><td>%"INT32"</td>\n"
|
|
"<td>0x%"XINT32"</td>"
|
|
"<td>0x%"XINT32"</td>"
|
|
"<td>%"INT32"</td>"
|
|
"<td>%"INT32"</td>"
|
|
"<td>0x%"XINT32"</td>"
|
|
"<td>%s</td>"
|
|
"<td>%"INT32"</td>"
|
|
"<td>%"INT32"</td>"
|
|
"<td><nobr>%"INT32" to %"INT32"</nobr></td>"
|
|
"<td>%"INT32"</td>"
|
|
"<td><nobr>" ,
|
|
scount++,//i,
|
|
(int32_t)sn->m_baseHash,
|
|
(int32_t)sn->m_tagHash,
|
|
sn->m_a,
|
|
sn->m_b,
|
|
(int32_t)sn->m_contentHash,
|
|
xs,
|
|
sn->m_exclusive,
|
|
sn->m_depth,
|
|
pswn,
|
|
pewn,
|
|
sn->m_numOccurences);//totalOccurences );
|
|
// now show the flags
|
|
printFlags ( sbuf , sn , false );
|
|
// first few words of section
|
|
int32_t a = sn->m_a;
|
|
int32_t b = sn->m_b;
|
|
// -1 means an unclosed tag!! should no longer be the case
|
|
if ( b == -1 ) { char *xx=NULL;*xx=0; }//b=m_words->m_numWords;
|
|
sbuf->safePrintf("</nobr></td>");
|
|
|
|
if ( sn->m_minEventId >= 1 )
|
|
sbuf->safePrintf("<td><nobr>%"INT32"-%"INT32"</nobr></td>",
|
|
sn->m_minEventId,sn->m_maxEventId);
|
|
else
|
|
sbuf->safePrintf("<td> </td>");
|
|
|
|
sbuf->safePrintf("<td><nobr>");
|
|
// 70 chars max
|
|
int32_t max = 70;
|
|
int32_t count = 0;
|
|
char truncated = 0;
|
|
// do not print last word/tag in section
|
|
for ( int32_t i = a ; i < b - 1 && count < max ; i++ ) {
|
|
char *s = wptrs[i];
|
|
int32_t slen = wlens[i];
|
|
if ( count + slen > max ) {
|
|
truncated = 1;
|
|
slen = max - count;
|
|
}
|
|
count += slen;
|
|
// boldify front tag
|
|
if ( i == a ) sbuf->safePrintf("<b>");
|
|
sbuf->htmlEncode(s,slen,false);
|
|
// boldify front tag
|
|
if ( i == a ) sbuf->safePrintf("</b>");
|
|
}
|
|
// if we truncated print a ...
|
|
if ( truncated ) sbuf->safePrintf("<b>...</b>");
|
|
// then print ending tag
|
|
if ( b < nw ) {
|
|
int32_t blen = wlens[b-1];
|
|
if ( blen>20 ) blen = 20;
|
|
sbuf->safePrintf("<b>");
|
|
sbuf->htmlEncode(wptrs[b-1],blen,false);
|
|
sbuf->safePrintf("</b>");
|
|
}
|
|
|
|
sbuf->safePrintf("</nobr></td></tr>\n");
|
|
}
|
|
|
|
sbuf->safePrintf("</table>\n<br>\n");
|
|
|
|
|
|
|
|
// now print the NEW voting table
|
|
sbuf->safePrintf("<b>NEW Section Voting Table (nsvt)</b>\n");
|
|
sbuf->safePrintf("<br>");
|
|
sbuf->safePrintf("<i>tagHash is combined with sectionType to make the "
|
|
"key in sectiondb. The data is everything else. "
|
|
"<br>"
|
|
"*The tagHash is XOR'ed with the contentHash for the "
|
|
"contentHash "
|
|
"section type, and is the tagPairHash for the "
|
|
"tagPairHash section type.</i>");
|
|
// print table header
|
|
char *hdr2 =
|
|
"<table border=1>"
|
|
"<tr>"
|
|
"<td><b>siteHash</b></td>"
|
|
"<td><b>tagHash*</b></td>"
|
|
"<td><b>sectionType</b></td>"
|
|
"<td><b>scoreTotal</b></td>"
|
|
"<td><b>numSampled</b></td>"
|
|
"<td><b>avgScore</b></td>"
|
|
"</tr>\n";
|
|
sbuf->safePrintf("%s",hdr2);
|
|
HashTableX *st = &m_nsvt;
|
|
rcount = 0;
|
|
for ( int32_t i = 0 ; i < st->m_numSlots ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if empty
|
|
if ( ! st->m_flags[i] ) continue;
|
|
// see if one big table causes a browser slowdown
|
|
if ( (++rcount % TABLE_ROWS ) == 0 )
|
|
sbuf->safePrintf("<!--ignore--></table>%s\n",hdr2);
|
|
// get the key
|
|
uint64_t k = *(uint64_t *)st->getKey ( i );
|
|
// get the data
|
|
SectionVote *sv = (SectionVote *)st->getValueFromSlot ( i );
|
|
// parse key
|
|
int32_t tagHash = (int32_t)(k & 0xffffffff);
|
|
int32_t sectionType = (int32_t)(k >> 32);
|
|
//int32_t tagHash = (int32_t)(k >> 32);
|
|
//int32_t sectionType = (int32_t)(k & 0xffffffff);
|
|
// convert to string
|
|
char *st = getSectionTypeAsStr ( sectionType );
|
|
float avg = 0.0;
|
|
if ( sv->m_numSampled > 0 ) avg = sv->m_score/sv->m_numSampled;
|
|
sbuf->safePrintf("<tr>"
|
|
"<td>--</td>"
|
|
"<td>0x%"XINT32"</td>"
|
|
"<td>%s</td>"
|
|
"<td>%.02f</td>"
|
|
"<td>%.02f</td>"
|
|
"<td>%.02f</td>"
|
|
"</tr>\n",
|
|
tagHash,
|
|
st,
|
|
sv->m_score,
|
|
sv->m_numSampled ,
|
|
avg );
|
|
}
|
|
sbuf->safePrintf("</table>\n<br><br>\n");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// now print the OLD voting table
|
|
sbuf->safePrintf("<b>OLD Section Voting Table (osvt)</b>\n");
|
|
sbuf->safePrintf("<br>\n");
|
|
sbuf->safePrintf("<i>numSampled is # of docs that had tagHash or "
|
|
"contentHash.\n");
|
|
// print table header
|
|
sbuf->safePrintf("%s",hdr2);
|
|
st = &m_osvt;
|
|
rcount = 0;
|
|
for ( int32_t i = 0 ; i < st->m_numSlots ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if empty
|
|
if ( ! st->m_flags[i] ) continue;
|
|
// see if one big table causes a browser slowdown
|
|
if ( (++rcount % TABLE_ROWS ) == 0 )
|
|
sbuf->safePrintf("<!--ignore--></table>%s\n",hdr2);
|
|
// get the key
|
|
uint64_t k = *(uint64_t *)st->getKey ( i );
|
|
// get the data
|
|
SectionVote *sv = (SectionVote *)st->getValueFromSlot ( i );
|
|
// parse key
|
|
int32_t tagHash = (int32_t)(k & 0xffffffff);
|
|
int32_t sectionType = (int32_t)(k >> 32);
|
|
//int32_t tagHash = (int32_t)(k >> 32);
|
|
//int32_t sectionType = (int32_t)(k & 0xffffffff);
|
|
// convert to string
|
|
char *st = getSectionTypeAsStr ( sectionType );
|
|
float avg = 0.0;
|
|
if ( sv->m_numSampled > 0 ) avg = sv->m_score/sv->m_numSampled;
|
|
sbuf->safePrintf("<tr>"
|
|
"<td>--</td>"
|
|
"<td>0x%"XINT32"</td>"
|
|
"<td>%s</td>"
|
|
"<td>%.02f</td>"
|
|
"<td>%.02f</td>"
|
|
"<td>%.02f</td>"
|
|
"</tr>\n",
|
|
tagHash,
|
|
st,
|
|
sv->m_score,
|
|
sv->m_numSampled ,
|
|
avg );
|
|
}
|
|
sbuf->safePrintf("</table>\n<br><br>\n");
|
|
|
|
sbuf->safePrintf("<table border=1 cellpadding=3>\n");
|
|
sbuf->safePrintf(
|
|
"<tr><td><nobr><b>Section Type</b></nobr></td>"
|
|
"<td><b>Description</b></td></tr>\n"
|
|
|
|
"<tr><td>texty</td><td>An average "
|
|
"score of 0.0 means the "
|
|
"section does not have any words that are not "
|
|
"in anchor test. An average "
|
|
"score of 1.0 means the section "
|
|
"has words, none of which are in anchor text."
|
|
"</td></tr>\n"
|
|
|
|
"<tr><td>clock</td><td>An average "
|
|
"score of 1.0 means "
|
|
"section identified as "
|
|
"containing a clock. An average "
|
|
"score of 0.0 mean it contains "
|
|
"a date that is definitely not a clock."
|
|
"</td></tr>\n"
|
|
|
|
"<tr><td>eurdatefmt</td><td>An average "
|
|
"score of 1.0 means "
|
|
"the section contains a date in european format, "
|
|
"which means day first, not month. An average "
|
|
"score of 0.0 "
|
|
"means the section contains a date that is NOT in "
|
|
"european format.</td></tr>\n"
|
|
|
|
//"<tr><td>event</td><td>"
|
|
//"</td></tr>\n"
|
|
|
|
//"<tr><td>address</td><td></td></tr>\n"
|
|
|
|
"<tr><td>tagpairhash</td><td>"
|
|
"All unique adjacent tag ids are hashed to form "
|
|
"this number which represents the overal structure "
|
|
"of the document. The tagHash is the tagPairHash "
|
|
"in this case."
|
|
"</td></tr>\n"
|
|
|
|
"<tr><td>contenthash</td><td>"
|
|
"The tagHash in this case is really "
|
|
"the content hash of the words in the section XORed "
|
|
"with the original tagHash."
|
|
"</td></tr>\n"
|
|
|
|
//"<tr><td>textmaxsmpl</td><td></td></tr>\n"
|
|
//"<tr><td>waitinline</td><td></td></tr>\n"
|
|
//"<tr><td></td><td></td></tr>\n"
|
|
);
|
|
sbuf->safePrintf("</table>\n<br><br>\n");
|
|
// must be \0 terminated
|
|
if ( sbuf->m_buf[sbuf->m_length] ) { char *xx=NULL;*xx=0; }
|
|
|
|
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
void Sections::printFlags ( SafeBuf *sbuf , Section *sn , bool justEvents ) {
|
|
sec_t f = sn->m_flags;
|
|
|
|
|
|
//if ( f & SEC_ADDRESS_CONTAINER )
|
|
// sbuf->safePrintf("addresscontainer ");
|
|
//if ( f & SEC_EVENT )
|
|
// sbuf->safePrintf("hasevent ");
|
|
|
|
//if ( f & SEC_MULT_EVENTS_1 )
|
|
// sbuf->safePrintf("multevents1 ");
|
|
//if ( f & SEC_MULT_EVENTS_2 )
|
|
// sbuf->safePrintf("multevents2 ");
|
|
//if ( f & SEC_MULT_DATES )
|
|
// sbuf->safePrintf("multdates ");
|
|
//if ( f & SEC_SUBSECTION )
|
|
// sbuf->safePrintf("subsection ");
|
|
//if ( f & SEC_VENUE_PAGE )
|
|
// sbuf->safePrintf("venuepage ");
|
|
//if ( f & SEC_MULT_ADDRESSES )
|
|
// sbuf->safePrintf("multaddresses ");
|
|
//if ( f & SEC_MULT_PLACES )
|
|
// sbuf->safePrintf("multplaces ");
|
|
|
|
if ( justEvents ) return;
|
|
|
|
if ( f & SEC_HASHXPATH )
|
|
sbuf->safePrintf("hashxpath ");
|
|
|
|
sbuf->safePrintf("indsenthash64=%"UINT64" ",sn->m_indirectSentHash64);
|
|
|
|
|
|
if ( f & SEC_TOD_EVENT )
|
|
sbuf->safePrintf("todevent ");
|
|
|
|
if ( f & SEC_HR_CONTAINER )
|
|
sbuf->safePrintf("hrcontainer ");
|
|
|
|
//if ( f & SEC_TOD_EVENT_2 )
|
|
// sbuf->safePrintf("containsmulttodevents ");
|
|
//if ( f & SEC_TOD_EVENT_3 )
|
|
// sbuf->safePrintf("containsmulttodevents ");
|
|
//if ( sn->m_numTods >= 1 )
|
|
// sbuf->safePrintf("numtods=%"INT32" ",sn->m_numTods );
|
|
|
|
|
|
if ( f & SEC_HAS_REGISTRATION )
|
|
sbuf->safePrintf("hasregistration ");
|
|
if ( f & SEC_HAS_PARKING )
|
|
sbuf->safePrintf("hasparking ");
|
|
|
|
if ( f & SEC_NIXED_HEADING_CONTAINER )
|
|
sbuf->safePrintf("nixedheadingcontainer ");
|
|
if ( f & SEC_HEADING_CONTAINER )
|
|
sbuf->safePrintf("headingcontainer ");
|
|
if ( f & SEC_HEADING )
|
|
sbuf->safePrintf("heading ");
|
|
|
|
if ( f & SEC_HAS_NONFUZZYDATE )
|
|
sbuf->safePrintf("hasnonfuzzydate ");
|
|
|
|
if ( f & SEC_TAIL_CRAP )
|
|
sbuf->safePrintf("tailcrap ");
|
|
|
|
if ( f & SEC_STRIKE )
|
|
sbuf->safePrintf("strike ");
|
|
if ( f & SEC_STRIKE2 )
|
|
sbuf->safePrintf("strike2 ");
|
|
|
|
if ( f & SEC_SECOND_TITLE )
|
|
sbuf->safePrintf("secondtitle ");
|
|
|
|
//if ( f & SEC_HAS_EVENT_DATE )
|
|
// sbuf->safePrintf("haseventdate " );
|
|
//if ( f & SEC_HAS_NON_EVENT_DATE )
|
|
// sbuf->safePrintf("hasnoneventdate " );
|
|
if ( f & SEC_EVENT_BROTHER )
|
|
sbuf->safePrintf("eventbrother " );
|
|
if ( f & SEC_IGNOREEVENTBROTHER )
|
|
sbuf->safePrintf("ignoreeventbrother " );
|
|
if ( f & SEC_STOREHOURSCONTAINER )
|
|
sbuf->safePrintf("storehourscontainer ");
|
|
if ( f & SEC_DATE_LIST_CONTAINER )
|
|
sbuf->safePrintf("datelistcontainer " );
|
|
if ( f & SEC_PUBDATECONTAINER )
|
|
sbuf->safePrintf("pubdatecontainer ");
|
|
if ( f & SEC_TABLE_HEADER )
|
|
sbuf->safePrintf("tableheader ");
|
|
//if ( f & SEC_HAS_DATE )
|
|
// sbuf->safePrintf("hasdate " );
|
|
if ( f & SEC_HAS_DOM )
|
|
sbuf->safePrintf("hasdom " );
|
|
if ( f & SEC_HAS_MONTH )
|
|
sbuf->safePrintf("hasmonth " );
|
|
if ( f & SEC_HAS_DOW )
|
|
sbuf->safePrintf("hasdow " );
|
|
if ( f & SEC_HAS_TOD )
|
|
sbuf->safePrintf("hastod " );
|
|
if ( f & SEC_HASEVENTDOMDOW )
|
|
sbuf->safePrintf("haseventdomdow " );
|
|
//if ( f & SEC_HAS_TODINMENU )
|
|
// sbuf->safePrintf("hastodinmenu " );
|
|
if ( f & SEC_MENU_SENTENCE )
|
|
sbuf->safePrintf("menusentence " );
|
|
if ( f & SEC_MENU )
|
|
sbuf->safePrintf("ismenu " );
|
|
if ( f & SEC_MENU_HEADER )
|
|
sbuf->safePrintf("menuheader " );
|
|
//if ( f & SEC_LIST_HEADER )
|
|
// sbuf->safePrintf("listheader " );
|
|
if ( f & SEC_CONTAINER )
|
|
sbuf->safePrintf("listcontainer " );
|
|
if ( f & SEC_INPUT_HEADER )
|
|
sbuf->safePrintf("inputheader " );
|
|
if ( f & SEC_INPUT_FOOTER )
|
|
sbuf->safePrintf("inputfooter " );
|
|
if ( f & SEC_LINK_TEXT )
|
|
sbuf->safePrintf("linktext " );
|
|
if ( f & SEC_PLAIN_TEXT )
|
|
sbuf->safePrintf("plaintext " );
|
|
//if ( f & SEC_NOT_MENU )
|
|
// sbuf->safePrintf("notmenu " );
|
|
//if ( f & SEC_MAYBE_MENU )
|
|
// sbuf->safePrintf("maybemenu " );
|
|
//if ( f & SEC_DUP )
|
|
// sbuf->safePrintf("dupsection " );
|
|
//if ( f & SEC_IS_MENUITEM )
|
|
// sbuf->safePrintf("menuitem " );
|
|
|
|
|
|
//if ( f & SEC_SENTENCE )
|
|
// sbuf->safePrintf("sentence ");
|
|
|
|
if ( f & SEC_FAKE ) {
|
|
//sbuf->safePrintf("hrtag ");
|
|
// extra it
|
|
//if ( sn->m_baseHash == BH_HR )
|
|
// sbuf->safePrintf("hrtagdelim ");
|
|
//if ( sn->m_baseHash == BH_BR )
|
|
// sbuf->safePrintf("brtagdelim ");
|
|
//else if ( sn->m_baseHash == BH_H1 )
|
|
// sbuf->safePrintf("h1tagdelim ");
|
|
//else if ( sn->m_baseHash == BH_H2 )
|
|
// sbuf->safePrintf("h2tagdelim ");
|
|
//else if ( sn->m_baseHash == BH_H3 )
|
|
// sbuf->safePrintf("h3tagdelim ");
|
|
//else if ( sn->m_baseHash == BH_H4 )
|
|
// sbuf->safePrintf("h4tagdelim ");
|
|
//else if ( sn->m_baseHash == BH_BRBR )
|
|
// sbuf->safePrintf("brbrdelim ");
|
|
if ( sn->m_baseHash == BH_BULLET )
|
|
sbuf->safePrintf("bulletdelim ");
|
|
else if ( sn->m_baseHash == BH_SENTENCE )
|
|
sbuf->safePrintf("<b>sentence</b> ");
|
|
else if ( sn->m_baseHash == BH_IMPLIED )
|
|
sbuf->safePrintf("<b>impliedsec</b> ");
|
|
//else if ( sn->m_baseHash == BH_IMPLIED_LIST )
|
|
// sbuf->safePrintf("<b>impliedLIST</b> ");
|
|
else { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
if ( f & SEC_SPLIT_SENT )
|
|
sbuf->safePrintf("<b>splitsent</b> ");
|
|
|
|
if ( f & SEC_NOTEXT )
|
|
sbuf->safePrintf("notext ");
|
|
|
|
if ( f & SEC_MULTIDIMS )
|
|
sbuf->safePrintf("multidims ");
|
|
|
|
if ( f & SEC_HASDATEHEADERROW )
|
|
sbuf->safePrintf("hasdateheaderrow ");
|
|
if ( f & SEC_HASDATEHEADERCOL )
|
|
sbuf->safePrintf("hasdateheadercol ");
|
|
|
|
|
|
|
|
if ( sn->m_colNum )
|
|
sbuf->safePrintf("colnum=%"INT32" ",sn->m_colNum );
|
|
if ( sn->m_rowNum )
|
|
sbuf->safePrintf("rownum=%"INT32" ",sn->m_rowNum );
|
|
if ( sn->m_headColSection )
|
|
sbuf->safePrintf("headcola=%"INT32" ",sn->m_headColSection->m_a);
|
|
if ( sn->m_headRowSection )
|
|
sbuf->safePrintf("headrowa=%"INT32" ",sn->m_headRowSection->m_a);
|
|
|
|
if ( f & SEC_IN_TABLE )
|
|
sbuf->safePrintf("intable ");
|
|
//if ( f & SEC_ARTICLE )
|
|
// sbuf->safePrintf("inarticle ");
|
|
if ( f & SEC_SCRIPT )
|
|
sbuf->safePrintf("inscript ");
|
|
if ( f & SEC_NOSCRIPT )
|
|
sbuf->safePrintf("innoscript ");
|
|
if ( f & SEC_STYLE )
|
|
sbuf->safePrintf("instyle ");
|
|
//if ( f & SEC_FIRST_HIDDEN )
|
|
// sbuf->safePrintf("infirstdivhide ");
|
|
if ( f & SEC_HIDDEN )
|
|
sbuf->safePrintf("indivhide ");
|
|
if ( f & SEC_SELECT )
|
|
sbuf->safePrintf("inselect ");
|
|
if ( f & SEC_MARQUEE )
|
|
sbuf->safePrintf("inmarquee ");
|
|
//if ( f & SEC_A )
|
|
// sbuf->safePrintf("inhref ");
|
|
if ( f & SEC_IN_TITLE )
|
|
sbuf->safePrintf("intitle ");
|
|
if ( f & SEC_IN_HEADER )
|
|
sbuf->safePrintf("inheader ");
|
|
|
|
if ( f & SEC_UNBALANCED )
|
|
sbuf->safePrintf("unbalanced " );
|
|
if ( f & SEC_OPEN_ENDED )
|
|
sbuf->safePrintf("openended " );
|
|
|
|
//for ( int32_t i = 0 ; i < (int32_t)sizeof(turkbits_t)*8 ; i++ ) {
|
|
// uint64_t mask = ((turkbits_t)1) << (turkbits_t)i;
|
|
// if ( ! ((sn->m_turkBits) & mask ) ) continue;
|
|
// sbuf->safePrintf("%s ",getTurkBitLabel(mask));
|
|
//}
|
|
|
|
// sentence flags
|
|
sentflags_t sf = sn->m_sentFlags;
|
|
for ( int32_t i = 0 ; i < 64 ; i++ ) {
|
|
// get mask
|
|
uint64_t mask = ((uint64_t)1) << (uint64_t)i;
|
|
if ( sf & mask )
|
|
sbuf->safePrintf("%s ",getSentBitLabel(mask));
|
|
}
|
|
|
|
//if ( f & SEC_HAS_DATE )
|
|
// sbuf->safePrintf("hasdate ");
|
|
//if ( ! f ) sbuf->safePrintf(" ");
|
|
}
|
|
|
|
// . map a section vote type (one of the SV_* values) to a short printable
//   label
// . an unrecognized type is treated as a programming error: we deliberately
//   dereference NULL to drop core, just like the other sanity checks in
//   this file
// . the returned pointer refers to a string literal, do not modify or free it
char *getSectionTypeAsStr ( int32_t sectionType ) {
	// one entry per supported vote type. the table is scanned in order,
	// so the first matching entry wins.
	struct TypeLabel {
		int32_t     m_type;
		const char *m_label;
	};
	static const TypeLabel s_labels[] = {
		{ SV_CLOCK          , "clock"          } ,
		{ SV_EURDATEFMT     , "eurdatefmt"     } ,
		{ SV_EVENT          , "event"          } ,
		{ SV_ADDRESS        , "address"        } ,
		{ SV_TAGPAIRHASH    , "tagpairhash"    } ,
		{ SV_TAGCONTENTHASH , "tagcontenthash" } ,
		{ SV_TURKTAGHASH    , "turktaghash"    } ,
		{ SV_FUTURE_DATE    , "futuredate"     } ,
		{ SV_CURRENT_DATE   , "currentdate"    } ,
		{ SV_PAST_DATE      , "pastdate"       } ,
		{ SV_SITE_VOTER     , "sitevoter"      }
	};
	// retired types that no longer have a label here: SV_TEXTY, SV_DUP,
	// SV_NOT_DUP, SV_TEXTY_MAX_SAMPLED, SV_WAITINLINE
	int32_t numLabels = (int32_t)(sizeof(s_labels) / sizeof(s_labels[0]));
	for ( int32_t k = 0 ; k < numLabels ; k++ ) {
		if ( s_labels[k].m_type != sectionType ) continue;
		return const_cast<char *>(s_labels[k].m_label);
	}
	// sanity check - unknown type, core
	char *xx=NULL;*xx=0;
	return const_cast<char *>("unknown");
}
|
|
/*
|
|
// . returns (char *)-1 with g_errno set on error
|
|
// . XmlDoc.cpp calls this to fill in TitleRec::ptr_sectionsData for storage
|
|
// . because sectiondb is volatile we need to store m_osvt, which is a tally
|
|
// of all the relevant votes for our sections that we extracted from
|
|
// sectiondb
|
|
char *Sections::getSectionsReply ( int32_t *size ) {
|
|
// assume none
|
|
*size = 0;
|
|
// how much buf do we need?
|
|
m_bufSize = m_osvt.getStoredSize();
|
|
// returning NULL is valid
|
|
if ( m_bufSize == 0 ) return NULL;
|
|
// make buf
|
|
m_buf = (char *)mmalloc ( m_bufSize , "sdata" );
|
|
if ( ! m_buf ) return (char *)-1;
|
|
// store it
|
|
int32_t bytes = m_osvt.serialize ( m_buf , m_bufSize );
|
|
// sanity check
|
|
if ( bytes != m_bufSize ) { char *xx=NULL;*xx=0; }
|
|
// save this
|
|
*size = bytes;
|
|
// good
|
|
return m_buf;
|
|
}
|
|
|
|
// . returns (char *)-1 with g_errno set on error
|
|
// . XmlDoc.cpp calls this to fill in TitleRec::ptr_sectionsData for storage
|
|
// . because sectiondb is volatile we need to store m_osvt, which is a tally
|
|
// of all the relevant votes for our sections that we extracted from
|
|
// sectiondb
|
|
char *Sections::getSectionsVotes ( int32_t *size ) {
|
|
// assume none
|
|
*size = 0;
|
|
// how much buf do we need?
|
|
m_bufSize2 = m_nsvt.getStoredSize();
|
|
// returning NULL is valid
|
|
if ( m_bufSize2 == 0 ) return NULL;
|
|
// make buf
|
|
m_buf2 = (char *)mmalloc ( m_bufSize2 , "sdata" );
|
|
if ( ! m_buf2 ) return (char *)-1;
|
|
// store it
|
|
int32_t bytes = m_nsvt.serialize ( m_buf2 , m_bufSize2 );
|
|
// sanity check
|
|
if ( bytes != m_bufSize2 ) { char *xx=NULL;*xx=0; }
|
|
// save this
|
|
*size = bytes;
|
|
// good
|
|
return m_buf2;
|
|
}
|
|
*/
|
|
// . returns true if section "sn" should be treated as a "hard" section
// . a section opened by a breaking tag is hard, with three exceptions
//   (<br>, sentence sections, and xml tags on non-rss documents)
// . a section opened by a non-breaking tag is only hard if the section
//   right after it starts inside of it and is itself opened by a breaking
//   tag other than <br>
bool Sections::isHardSection ( Section *sn ) {
	// word number of the tag that opens this section
	int32_t start = sn->m_a;

	if ( isBreakingTagId(m_tids[start]) ) {
		// trumba.com has sub dates in br-based implied sections
		// that need to telescope to their parent above, so <br>
		// is always soft
		if ( m_tids[start] == TAG_BR ) return false;
		// sentence sections are soft
		if ( sn->m_flags & SEC_SENTENCE ) return false;
		// xml tag exception for gwair.org. treat <st1:Place>...
		// as soft, unless this is an rss-ish document
		if ( (m_tids[start] & BACKBITCOMP) == TAG_XMLTAG &&
		     ! m_isRSSExt )
			return false;
		// . NOTE: making <span> hard was tried to fix gwair.org
		//   but it stopped publicbroadcasting.net and others from
		//   telescoping to header dates that live in span tags
		// . making <p> soft was tried for abqcsl.org and dropped
		return true;
	}

	// . we are opened by a non-breaking tag
	// . if first child is hard that works!
	// . fixes "<blockquote><p>..." for collectorsguide.com
	Section *kid = sn->m_next;
	// no following section at all? forget it!
	if ( ! kid ) return false;
	// it must be opened by a real tag
	if ( ! kid->m_tagId ) return false;
	// fix "blah blah<br>blah blah" for sentence
	if ( kid->m_tagId == TAG_BR ) return false;
	// it must start before we end, i.e. be inside of us
	if ( kid->m_a >= sn->m_b ) return false;
	// and finally it must be a breaking tag itself
	return isBreakingTagId(kid->m_tagId);
}
|
|
|
|
|
|
bool Sections::setMenus ( ) {
|
|
|
|
/*
|
|
// set SEC_LINK_ONLY on sections that just contain a link
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if not a href section
|
|
if ( si->m_baseHash != TAG_A ) continue;
|
|
// try that out
|
|
Section *sk = si;
|
|
// loop back up here
|
|
subloop:
|
|
// mark that
|
|
sk->m_flags |= SEC_LINK_TEXT;
|
|
// set boundaries
|
|
int32_t a = sk->m_a;
|
|
int32_t b = sk->m_b;
|
|
// mark parents as int32_t as they have no more text than this!
|
|
sk = sk->m_parent;
|
|
// stop if no more
|
|
if ( ! sk ) continue;
|
|
// check
|
|
int32_t i;
|
|
for ( i = sk->m_a ; i < a ; i++ )
|
|
if ( m_wids[i] ) break;
|
|
// skip this section if has text outside
|
|
if ( i < a ) continue;
|
|
// try the right side
|
|
for ( i = b ; i < sk->m_b ; i++ )
|
|
if ( m_wids[i] ) break;
|
|
// skip this section if has text outside
|
|
if ( i < sk->m_b ) continue;
|
|
// try the next parent
|
|
goto subloop;
|
|
}
|
|
|
|
for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// we are the first item
|
|
Section *first = sk;
|
|
// must be a link text only section
|
|
if ( ! ( sk->m_flags & SEC_LINK_TEXT ) ) continue;
|
|
// skip if used already
|
|
if ( sk->m_used == 78 ) continue;
|
|
// reset
|
|
//bool added = false;
|
|
// use si now
|
|
Section *si = sk;
|
|
// loop back up here
|
|
subloop2:
|
|
// mark it
|
|
si->m_used = 78;
|
|
// save it
|
|
Section *last = si;
|
|
// scan brothers
|
|
si = si->m_nextBrother;
|
|
if ( ! si ) continue;
|
|
// . hard tag ids must match
|
|
// . texasdrums.drums.org likes to put span, bold, etc.
|
|
// tags throughout their menus, so this fixes that
|
|
//if ( si->m_hardSection->m_tagId !=
|
|
// last->m_hardSection->m_tagId ) continue;
|
|
// . must have same tag id, baseHash
|
|
// . no wholefoods has a different id for each, but we
|
|
// should match the tag id!
|
|
//if ( si->m_baseHash != last->m_baseHash ) continue;
|
|
if ( si->m_tagId != last->m_tagId ) continue;
|
|
// stop if not link text
|
|
if ( ! ( si->m_flags & SEC_LINK_TEXT ) ) continue;
|
|
// it is link text
|
|
si->m_flags |= SEC_MENU;
|
|
// and the first one too
|
|
first->m_flags |= SEC_MENU;
|
|
// repeat
|
|
goto subloop2;
|
|
}
|
|
*/
|
|
|
|
|
|
// . this just returns if already set
|
|
// . sets Bits::m_bits[x].m_flags & D_IN_LINK if its in a link
|
|
// . this bits array is 1-1 with the words
|
|
m_bits->setInLinkBits(this);
|
|
|
|
// int16_tcut
|
|
wbit_t *bb = m_bits->m_bits;
|
|
|
|
sec_t flag;
|
|
// set SEC_PLAIN_TEXT and SEC_LINK_TEXT for all sections
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// need alnum word
|
|
if ( ! m_wids[i] ) continue;
|
|
// get our flag
|
|
if ( bb[i] & D_IN_LINK ) flag = SEC_LINK_TEXT;
|
|
else flag = SEC_PLAIN_TEXT;
|
|
// get section ptr
|
|
Section *sk = m_sectionPtrs[i];
|
|
// loop for sk
|
|
for ( ; sk ; sk = sk->m_parent ) {
|
|
// skip if already set
|
|
if ( sk->m_flags & flag ) break;
|
|
// set it
|
|
sk->m_flags |= flag;
|
|
}
|
|
}
|
|
|
|
Section *last = NULL;
|
|
// . alernatively, scan through all anchor tags
|
|
// . compare to last anchor tag
|
|
// . and blow up each to their max non-intersection section and make
|
|
// sure no PLAIN text in either of those!
|
|
// . this is all to fix texasdrums.drums.org which has various span
|
|
// and bold tags throughout its menu at random
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// . if we hit plain text, we kill our last
|
|
// . this was causing "geeks who drink" for blackbirdbuvette
|
|
// to get is SEC_MENU set because there was a link after it
|
|
if ( si->m_flags & SEC_PLAIN_TEXT ) last = NULL;
|
|
// skip if not a href section
|
|
if ( si->m_baseHash != TAG_A ) continue;
|
|
// . if it is a mailto link forget it
|
|
// . fixes abtango.com from detecting a bad menu
|
|
char *ptr = m_wptrs[si->m_a];
|
|
int32_t plen = m_wlens[si->m_a];
|
|
char *mailto = strncasestr(ptr,plen,"mailto:");
|
|
if ( mailto ) last = NULL;
|
|
// bail if no last
|
|
if ( ! last ) { last = si; continue; }
|
|
// save last
|
|
Section *prev = last;
|
|
// set last for next round, used "saved" below
|
|
last = si;
|
|
// get first "hard" section encountered while telescoping
|
|
Section *prevHard = NULL;
|
|
// blow up last until right before it contains us
|
|
for ( ; prev ; prev = prev->m_parent ) {
|
|
// record?
|
|
if ( ! prevHard && isHardSection(prev) )
|
|
prevHard = prev;
|
|
// if parent contains us, stop
|
|
if ( prev->m_parent->contains ( si ) ) break;
|
|
}
|
|
// if it has plain text, forget it!
|
|
if ( prev->m_flags & SEC_PLAIN_TEXT ) continue;
|
|
// use this for us
|
|
Section *sk = si;
|
|
// get first "hard" section encountered while telescoping
|
|
Section *skHard = NULL;
|
|
// same for us
|
|
for ( ; sk ; sk = sk->m_parent ) {
|
|
// record?
|
|
if ( ! skHard && isHardSection(sk) ) skHard = sk;
|
|
// if parent contains us, stop
|
|
if ( sk->m_parent->contains ( prev ) ) break;
|
|
}
|
|
// if it has plain text, forget it!
|
|
if ( sk->m_flags & SEC_PLAIN_TEXT ) continue;
|
|
|
|
// . first hard sections encountered must match!
|
|
// . otherwise for switchborad.com we lose "A B C ..." as
|
|
// title candidate because we think it is an SEC_MENU
|
|
// because the sections before it have links in them, but
|
|
// they have different hard sections
|
|
if ( prevHard && ! skHard ) continue;
|
|
if ( ! prevHard && skHard ) continue;
|
|
if ( prevHard && prevHard->m_tagId!=skHard->m_tagId ) continue;
|
|
|
|
/*
|
|
// if this href section splits a sentence, then do not
|
|
// let it be a menu. fixes southgatehouse.com which has
|
|
// "<a>Songwriter Showcase and Open Mic</a>
|
|
// <a>with Mike Kuntz</a>"
|
|
// getting detected as a two-link menu system
|
|
// . NO! hurts some other menus that have just spaces
|
|
// separating the terms!
|
|
Section *se;
|
|
se = prev;
|
|
for ( ; se ; se = se->m_parent )
|
|
if ( se->m_flags & SEC_SENTENCE ) break;
|
|
if ( se &&
|
|
( se->m_firstWordPos != prev->m_firstWordPos ||
|
|
se->m_lastWordPos != prev->m_lastWordPos ) )
|
|
continue;
|
|
se = si;
|
|
for ( ; se ; se = se->m_parent )
|
|
if ( se->m_flags & SEC_SENTENCE ) break;
|
|
if ( se &&
|
|
( se->m_firstWordPos != sk->m_firstWordPos ||
|
|
se->m_lastWordPos != sk->m_lastWordPos ) )
|
|
continue;
|
|
*/
|
|
|
|
// ok, great that works!
|
|
prev->m_flags |= SEC_MENU;
|
|
sk ->m_flags |= SEC_MENU;
|
|
}
|
|
|
|
/*
|
|
// now remove menus that are just two links
|
|
// IF they are not exactly the same tag hash?????
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// need a menu section
|
|
if ( ! ( si->m_flags & SEC_MENU ) ) continue;
|
|
// skip if processed
|
|
if ( si->m_used == 101 ) continue;
|
|
// must be a list of menu sections
|
|
Section *next = si->m_nextBrother;
|
|
if ( ! next ) continue;
|
|
if ( ! ( next->m_flags & SEC_MENU ) ) continue;
|
|
// is 3rd set?
|
|
Section *third = next->m_nextBrother;
|
|
// unset?
|
|
bool unset = 0;
|
|
if ( ! third ) unset = true;
|
|
else if ( ! ( third->m_flags & SEC_MENU ) ) unset = true;
|
|
// init it
|
|
Section *sj = si;
|
|
// do a full loop
|
|
for ( ; sj ; sj = sj->m_nextBrother ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// mark it
|
|
sj->m_used = 101;
|
|
// stop if we hit third and unset is true
|
|
if ( sj == third && unset ) break;
|
|
// unset?
|
|
if ( unset ) sj->m_flags &= ~SEC_MENU;
|
|
}
|
|
}
|
|
*/
|
|
|
|
// . set text around input radio checkboxes text boxes and text areas
|
|
// . we need input tags to be their own sections though!
|
|
//for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) {
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// need a tag
|
|
if ( ! m_tids[i] ) continue;
|
|
// must be an input tag
|
|
if ( m_tids[i] != TAG_INPUT &&
|
|
m_tids[i] != TAG_TEXTAREA &&
|
|
m_tids[i] != TAG_SELECT )
|
|
continue;
|
|
// get tag as a word
|
|
char *tag = m_wptrs[i];
|
|
int32_t tagLen = m_wlens[i];
|
|
// what type of input tag is this? hidden? radio? checkbox?...
|
|
int32_t itlen;
|
|
char *it = getFieldValue ( tag , tagLen , "type" , &itlen );
|
|
// skip if hidden
|
|
if ( itlen==6 && !strncasecmp ( it,"hidden",6) ) continue;
|
|
// get word before first item in list
|
|
int32_t r = i - 1;
|
|
for ( ; r >= 0 ; r-- ) {
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not wordid
|
|
if ( ! m_wids[r] ) continue;
|
|
// get its section
|
|
Section *sr = m_sectionPtrs[r];
|
|
// . skip if in div hidden section
|
|
// . fixes www.switchboard.com/albuquerque-nm/doughnuts
|
|
if ( sr->m_flags & SEC_HIDDEN ) continue;
|
|
// ok, stop
|
|
break;
|
|
}
|
|
// if no header, skip
|
|
if ( r < 0 ) continue;
|
|
// we are the first item
|
|
//Section *first = sk;
|
|
//int32_t firsta = i;
|
|
Section *first = m_sectionPtrs[i];
|
|
// set SEC_INPUT_HEADER
|
|
setHeader ( r , first , SEC_INPUT_HEADER );
|
|
|
|
// and the footer, if any
|
|
r = i + 1;
|
|
for ( ; r < m_nw && ! m_wids[r] ; r++ ) QUICKPOLL(m_niceness);
|
|
// if no header, skip
|
|
if ( r >= m_nw ) continue;
|
|
// we are the first item
|
|
//Section *first = sk;
|
|
//int32_t firsta = i;
|
|
//Section *first = m_sectionPtrs[i];
|
|
// set SEC_INPUT_FOOTER
|
|
setHeader ( r , first , SEC_INPUT_FOOTER );
|
|
|
|
}
|
|
|
|
|
|
int64_t h_copyright = hash64n("copyright");
|
|
// copyright check
|
|
// the copyright symbol in utf8 (see Entities.cpp for the code)
|
|
char copy[3];
|
|
copy[0] = 0xc2;
|
|
copy[1] = 0xa9;
|
|
copy[2] = 0x00;
|
|
// scan all years, lists and ranges of years, and look for
|
|
// a preceeding copyright sign. mark such years as DF_COPYRIGHT
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
|
// skip if tag
|
|
if ( m_tids[i] ) continue;
|
|
// do we have an alnum word before us here?
|
|
if ( m_wids[i] ) {
|
|
// if word check for copyright
|
|
if ( m_wids[i] != h_copyright ) continue;
|
|
}
|
|
// must have copyright sign in it i guess
|
|
else if ( ! gb_strncasestr(m_wptrs[i],m_wlens[i],copy))
|
|
continue;
|
|
// mark section as copyright section then
|
|
Section *sp = m_sectionPtrs[i];
|
|
// flag as menu
|
|
sp->m_flags |= SEC_MENU;
|
|
}
|
|
|
|
|
|
sec_t ff = SEC_MENU |
|
|
SEC_INPUT_HEADER |
|
|
SEC_INPUT_FOOTER;
|
|
|
|
// set SEC_MENU of child sections of SEC_MENU sections
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// must be a link text only section
|
|
if ( ! ( si->m_flags & ff ) ) continue;
|
|
// ignore if went down this path
|
|
if ( si->m_used == 82 ) continue;
|
|
// save it
|
|
//Section *parent = si;
|
|
// get first potential kid
|
|
Section *sk = si->m_next;
|
|
// scan child sections
|
|
for ( ; sk ; sk = sk->m_next ) {
|
|
// stop if not contained
|
|
if ( ! si->contains ( sk ) ) break;
|
|
// mark it
|
|
sk->m_flags |= (si->m_flags & ff); // SEC_MENU;
|
|
// ignore in big loop
|
|
sk->m_used = 82;
|
|
}
|
|
}
|
|
|
|
//
|
|
// set SEC_MENU_HEADER
|
|
//
|
|
for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if not in a menu
|
|
if ( ! ( sk->m_flags & SEC_MENU ) ) continue;
|
|
// get his list container
|
|
Section *c = sk->m_listContainer;
|
|
// skip if none
|
|
if ( ! c ) continue;
|
|
// already flagged?
|
|
if ( c->m_used == 89 ) continue;
|
|
// do not repeat on any item in this list
|
|
c->m_used = 89;
|
|
// flag all its brothers!
|
|
Section *zz = sk;
|
|
for ( ; zz ; zz = zz->m_nextBrother )
|
|
// bail if not in menu
|
|
if ( ! ( zz->m_flags & SEC_MENU ) ) break;
|
|
// if broked it, stop
|
|
if ( zz ) continue;
|
|
//
|
|
// ok, every item in list is a menu item, so try to set header
|
|
//
|
|
// get word before first item in list
|
|
int32_t r = sk->m_a - 1;
|
|
for ( ; r >= 0 && ! m_wids[r] ; r-- )
|
|
QUICKPOLL(m_niceness);
|
|
// if no header, skip
|
|
if ( r < 0 ) continue;
|
|
// set SEC_MENU_HEADER
|
|
setHeader ( r , sk , SEC_MENU_HEADER );
|
|
}
|
|
|
|
//
|
|
// unset SEC_MENU if a lot of votes for not dup
|
|
//
|
|
/*
|
|
for ( Section *sk = m_rootSection ; sk ; sk = sk->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if not in a menu
|
|
if ( ! ( sk->m_flags & SEC_MENU ) ) continue;
|
|
// . check him out
|
|
// . this is only valid for sections that have SEC_NO_TEXT clr
|
|
if ( sk->m_votesForNotDup <= sk->m_votesForDup ) continue;
|
|
// ok, more likely not a dup i guess
|
|
sk->m_flags &= ~SEC_MENU;
|
|
// and every parent
|
|
for ( Section *sp = sk ; sp ; sp = sp->m_parent ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop if clear
|
|
if ( ! ( sp->m_flags & SEC_MENU ) ) break;
|
|
// unset it
|
|
sp->m_flags &= ~SEC_MENU;
|
|
}
|
|
}
|
|
*/
|
|
|
|
//
|
|
// set SEC_MENU_SENTENCE flag
|
|
//
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// must be a link text only section
|
|
if ( ! ( si->m_flags & SEC_MENU ) ) continue;
|
|
// set this
|
|
bool gotSentence = ( si->m_flags & SEC_SENTENCE );
|
|
// set SEC_MENU of the sentence
|
|
if ( gotSentence ) continue;
|
|
// parent up otherwise
|
|
for ( Section *sk = si->m_parent ; sk ; sk = sk->m_parent ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// stop if sentence finally
|
|
if ( ! ( sk->m_flags & SEC_SENTENCE ) ) continue;
|
|
// not a menu sentence if it has plain text in it
|
|
// though! we have to make this exception to stop
|
|
// stuff like
|
|
// "Wedding Ceremonies, No preservatives, more... "
|
|
// from switchboard.com from being a menu sentence
|
|
// just because "more" is in a link.
|
|
if ( sk->m_flags & SEC_PLAIN_TEXT ) break;
|
|
// set it
|
|
sk->m_flags |= SEC_MENU_SENTENCE;
|
|
// and stop
|
|
break;
|
|
}
|
|
// sanity check - must have sentence
|
|
//if ( ! sk ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
|
|
|
|
// just this here
|
|
//ff = SEC_MENU_HEADER;
|
|
|
|
// . now set generic list headers
|
|
// . list headers can only contain one hard section with text
|
|
// . list headers cannot have a previous brother section
|
|
for ( int32_t i = 0 ; i + 1 < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// int16_tcut
|
|
Section *si = &m_sections[i];
|
|
// get container
|
|
Section *c = si->m_listContainer;
|
|
// skip if no container
|
|
if ( ! c ) continue;
|
|
// skip if already did this container
|
|
if ( c->m_used == 55 ) continue;
|
|
// mark it
|
|
c->m_used = 55;
|
|
// flag it
|
|
c->m_flags |= SEC_CONTAINER;
|
|
// skip the rest for now
|
|
continue;
|
|
/*
|
|
// get first word before the list container
|
|
int32_t r = c->m_a - 1;
|
|
for ( ; r >= 0 && ! m_wids[r] ; r-- ) QUICKPOLL(m_niceness);
|
|
// if no header, skip
|
|
if ( r < 0 ) continue;
|
|
// now see if the containing list section has an outside header
|
|
//setHeader ( r , c , SEC_LIST_HEADER );
|
|
//Section *maxhdr = setHeader ( r , c , SEC_LIST_HEADER );
|
|
// skip if none
|
|
//if ( ! maxhdr ) continue;
|
|
// if it was a header, mark the list it is a header of
|
|
//maxhdr->m_headerOfList = c;
|
|
*/
|
|
}
|
|
|
|
static bool s_init = false;
|
|
static int64_t h_close ;
|
|
static int64_t h_send ;
|
|
static int64_t h_map ;
|
|
static int64_t h_maps ;
|
|
static int64_t h_directions ;
|
|
static int64_t h_driving ;
|
|
static int64_t h_help ;
|
|
static int64_t h_more ;
|
|
static int64_t h_log ;
|
|
static int64_t h_sign ;
|
|
static int64_t h_change ;
|
|
static int64_t h_write ;
|
|
static int64_t h_save ;
|
|
static int64_t h_share ;
|
|
static int64_t h_forgot ;
|
|
static int64_t h_home ;
|
|
static int64_t h_sitemap ;
|
|
static int64_t h_advanced ;
|
|
static int64_t h_go ;
|
|
static int64_t h_website ;
|
|
static int64_t h_view;
|
|
static int64_t h_add;
|
|
static int64_t h_submit;
|
|
static int64_t h_get;
|
|
static int64_t h_about;
|
|
// new stuff
|
|
static int64_t h_back; // back to top
|
|
static int64_t h_next;
|
|
static int64_t h_buy; // buy tickets
|
|
static int64_t h_english; // english french german versions
|
|
static int64_t h_click;
|
|
|
|
if ( ! s_init ) {
|
|
s_init = true;
|
|
h_close = hash64n("close");
|
|
h_send = hash64n("send");
|
|
h_map = hash64n("map");
|
|
h_maps = hash64n("maps");
|
|
h_directions = hash64n("directions");
|
|
h_driving = hash64n("driving");
|
|
h_help = hash64n("help");
|
|
h_more = hash64n("more");
|
|
h_log = hash64n("log");
|
|
h_sign = hash64n("sign");
|
|
h_change = hash64n("change");
|
|
h_write = hash64n("write");
|
|
h_save = hash64n("save");
|
|
h_share = hash64n("share");
|
|
h_forgot = hash64n("forgot");
|
|
h_home = hash64n("home");
|
|
h_sitemap = hash64n("sitemap");
|
|
h_advanced = hash64n("advanced");
|
|
h_go = hash64n("go");
|
|
h_website = hash64n("website");
|
|
h_view = hash64n("view");
|
|
h_add = hash64n("add");
|
|
h_submit = hash64n("submit");
|
|
h_get = hash64n("get");
|
|
h_about = hash64n("about");
|
|
h_back = hash64n ("back");
|
|
h_next = hash64n ("next");
|
|
h_buy = hash64n ("buy");
|
|
h_english = hash64n ("english");
|
|
h_click = hash64n ("click");
|
|
}
|
|
|
|
// . when dup/non-dup voting info is not available because we are
|
|
// more or less an isolated page, guess that these links are
|
|
// menu links and not to be considered for title or event description
|
|
// . we completely exclude a word from title/description if its
|
|
// SEC_MENU is set.
|
|
// . set SEC_MENU for renegade links that start with an action
|
|
// verb like "close" or "add" etc. but if their # of non dup votes
|
|
// is high relative to their # of dup votes, then do not set this
|
|
// because it might be a name of a band like "More" or something
|
|
// and be in a link
|
|
// . scan all href sections
|
|
// set SEC_LINK_ONLY on sections that just contain a link
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if not a href section
|
|
if ( si->m_baseHash != TAG_A ) continue;
|
|
// set points to scan
|
|
int32_t a = si->m_a;
|
|
int32_t b = si->m_b;
|
|
// assume not bad
|
|
bool bad = false;
|
|
int32_t i;
|
|
// scan words if any
|
|
for ( i = a ; i < b ; i++ ) {
|
|
// skip if not word
|
|
if ( ! m_wids[i] ) continue;
|
|
// assume bad
|
|
bad = true;
|
|
// certain words are indicative of menus
|
|
if ( m_wids[i] == h_close ) break;
|
|
if ( m_wids[i] == h_send ) break;
|
|
if ( m_wids[i] == h_map ) break;
|
|
if ( m_wids[i] == h_maps ) break;
|
|
if ( m_wids[i] == h_directions ) break;
|
|
if ( m_wids[i] == h_driving ) break;
|
|
if ( m_wids[i] == h_help ) break;
|
|
if ( m_wids[i] == h_more ) break;
|
|
if ( m_wids[i] == h_log ) break; // log in
|
|
if ( m_wids[i] == h_sign ) break; // sign up/in
|
|
if ( m_wids[i] == h_change ) break; // change my loc.
|
|
if ( m_wids[i] == h_write ) break; // write a review
|
|
if ( m_wids[i] == h_save ) break;
|
|
if ( m_wids[i] == h_share ) break;
|
|
if ( m_wids[i] == h_forgot ) break; // forgot your pwd
|
|
if ( m_wids[i] == h_home ) break;
|
|
if ( m_wids[i] == h_sitemap ) break;
|
|
if ( m_wids[i] == h_advanced ) break; // adv search
|
|
if ( m_wids[i] == h_go ) break; // go to top of page
|
|
if ( m_wids[i] == h_website ) break;
|
|
if ( m_wids[i] == h_view ) break;
|
|
if ( m_wids[i] == h_add ) break;
|
|
if ( m_wids[i] == h_submit ) break;
|
|
if ( m_wids[i] == h_get ) break;
|
|
if ( m_wids[i] == h_about ) break;
|
|
if ( m_wids[i] == h_back ) break;
|
|
if ( m_wids[i] == h_next ) break;
|
|
if ( m_wids[i] == h_buy ) break;
|
|
if ( m_wids[i] == h_english ) break;
|
|
if ( m_wids[i] == h_click ) break;
|
|
bad = false;
|
|
break;
|
|
}
|
|
// skip if ok
|
|
if ( ! bad ) continue;
|
|
// get smallest section
|
|
Section *sm = m_sectionPtrs[i];
|
|
// . exempt if lots of non-dup votes relatively
|
|
// . could be a band name of "More ..."
|
|
//if ( sm->m_votesForNotDup > sm->m_votesForDup ) continue;
|
|
// if bad mark it!
|
|
sm->m_flags |= SEC_MENU;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// "first" is first item in the list we are getting header for
|
|
// . try to flag the text around word #r as the header (or footer) of the
//   item/list that starts with section "first"
// . "r" is a word number, "flag" is the bit to OR into Section::m_flags,
//   like SEC_MENU_HEADER, SEC_INPUT_HEADER or SEC_INPUT_FOOTER
// . on success "flag" is set on the biggest section that holds just that
//   text and does not contain "first", and on the sections inside of it
// . returns without flagging anything if any of the checks below fail
void Sections::setHeader ( int32_t r , Section *first , sec_t flag ) {
	// smallest section containing word #r
	Section *orig = m_sectionPtrs[r];

	// blow it up until just before it would contain "first"
	Section *biggest = orig;
	while ( biggest ) {
		// forget it if in title tag already!
		if ( biggest->m_flags & SEC_IN_TITLE ) return;
		// . if we ran out of parents then nothing contained "first"
		// . what does this mean? i dunno but it was dropping core for
		//   http://tedserbinski.com/jcalendar/jcalendar.js
		if ( ! biggest->m_parent ) return;
		// stop right before the parent that contains "first"
		if ( biggest->m_parent->contains ( first ) ) break;
		biggest = biggest->m_parent;
	}
	if ( ! biggest ) return;

	// check out prev brother
	Section *prev = biggest->m_prevBrother;
	// . if the text is a heading (SEC_HEADING) then it should be ok if
	//   we have a prev brother of a different tagid.
	// . this will fix americantowns.com which has a list of header tags
	//   and ul tags intermingled, with menus in the ul tags.
	// . should also fix upcoming.yahoo.com which has alternating
	//   dd and dt tags for its menus. now that we got rid of
	//   addImpliedSections() we have to deal with this here, and it
	//   will be more accurate since addImpliedSections() was often
	//   wrong.
	if ( prev &&
	     (orig->m_flags & SEC_HEADING) &&
	     prev->m_tagId != biggest->m_tagId )
		prev = NULL;
	// . but if prev brother is a blank, we should view that as a
	//   delimiter
	// . really we should have added those sections in with the new
	//   delimiter logic! but let's put this in for now anyway...
	if ( prev && prev->m_firstWordPos < 0 )
		prev = NULL;
	// if the header section still has a prev brother, forget it!
	if ( prev ) return;

	// . if we gained extra text while telescoping, that is a no-no
	// . these two checks allow for empty sections preceding the text
	//   at any level as we telescope it up
	if ( biggest->m_firstWordPos != orig->m_firstWordPos ) return;
	if ( biggest->m_lastWordPos  != orig->m_lastWordPos  ) return;

	// . now blow up "first" until just before it hits biggest as well
	// . this fixes reverbnation on the nextBrother check below
	while ( first ) {
		Section *up = first->m_parent;
		// stop if parent is NULL
		if ( ! up ) break;
		// stop if parent would contain biggest
		if ( up->contains ( biggest ) ) break;
		first = up;
	}

	// . if after blowing it up "first" contains more than just menu
	//   sections, then bail. that really was not a menu header!
	// . fixes reverbnation url that thought "That 1 Guy" was a menu
	//   header.
	if ( flag == SEC_MENU_HEADER ) {
		for ( Section *fx = first ; fx ; fx = fx->m_next ) {
			// stop when we are past the end of "first"
			if ( fx->m_a >= first->m_b ) break;
			// sections flagged as having no text are fine
			if ( fx->m_flags & SEC_NOTEXT ) continue;
			// so are menu sections and menu sentences
			if ( fx->m_flags & SEC_MENU ) continue;
			if ( fx->m_flags & SEC_MENU_SENTENCE ) continue;
			// otherwise, bad!
			return;
		}
	}

	// we scan the sections that start before biggest ends
	int32_t lastb = biggest->m_b;

	// . make sure the header does not contain any list in it
	// . if a section has a brother with the same taghash it is part
	//   of a list, unless that brother is "first" itself
	for ( Section *sx = biggest ; sx ; sx = sx->m_next ) {
		// stop if over
		if ( sx->m_a >= lastb ) break;
		Section *nb = sx->m_nextBrother;
		if ( nb && nb->m_tagHash == sx->m_tagHash && nb != first )
			return;
		// this one is for footers
		Section *pb = sx->m_prevBrother;
		if ( pb && pb->m_tagHash == sx->m_tagHash && pb != first )
			return;
	}

	// ok, not part of a list, flag biggest and each subsection
	for ( Section *sx = biggest ; sx ; sx = sx->m_next ) {
		// stop if over
		if ( sx->m_a >= lastb ) break;
		sx->m_flags |= flag;
	}
}
|
|
|
|
|
|
// . set SEC_HEADING and SEC_HEADING_CONTAINER bits in Section::m_flags
|
|
// . identifies sections that are most likely headings
|
|
// . the WHOLE idea of this algo is to take a list of sections that are all
|
|
// the same tagId/baseHash and differentiate them so we can insert implied
|
|
// sections with headers.
|
|
// . then addImpliedSections() uses the SEC_HEADING_CONTAINER bit to
|
|
// modify the base hash to distinguish sections that would otherwise all
|
|
// be the same!
|
|
// . should fix salsapower.com and abqtango.com to have proper implied sections
|
|
// . it is better to not add any implied sections than to add the wrong ones
|
|
// so be extra strict in our rules here.
|
|
bool Sections::setHeadingBit ( ) {
|
|
|
|
int32_t headings = 0;
|
|
// scan the sections
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if directly contains no text
|
|
//if ( si->m_flags & SEC_NOTEXT ) continue;
|
|
// SEC_NOTEXT is not set at this point
|
|
int32_t fwp = si->m_firstWordPos;
|
|
if ( fwp == -1 ) continue;
|
|
// we must be the smallest container around this text
|
|
if ( m_sectionPtrs[fwp] != si ) continue;
|
|
|
|
// . make sure we are in our own hard section
|
|
// . TODO: allow for bold or strong, etc. tags as well
|
|
bool hasHard = false;
|
|
int32_t a = si->m_firstWordPos;
|
|
int32_t b = si->m_lastWordPos;
|
|
// go to parent
|
|
Section *pp = si;
|
|
Section *biggest = NULL;
|
|
bool inLink = false;
|
|
// . we need to be isolated in our own hard section container
|
|
// . TODO: what about "<b>Hi There <i>Bob</i></b>" as a heading
|
|
// . i guess that will still work!
|
|
for ( ; pp ; pp = pp->m_parent ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop if breached
|
|
if ( pp->m_firstWordPos != a ) break;
|
|
if ( pp->m_lastWordPos != b ) break;
|
|
// record this
|
|
if ( pp->m_tagId == TAG_A ) inLink = true;
|
|
// record the biggest section containing just our text
|
|
biggest = pp;
|
|
// is it a hard section?
|
|
if ( isHardSection(pp) ) hasHard = true;
|
|
// . allow bold and strong tags
|
|
// . fixes gwair.org which has the dates of the
|
|
// month in strong tags. so we need to set
|
|
// SEC_HEADING for those so getDelimHash() will
|
|
// recognize such tags as date header tags in the
|
|
// METHOD_DOM algorithm and we get the proper
|
|
// implied sections
|
|
if ( pp->m_tagId == TAG_STRONG ) hasHard = true;
|
|
if ( pp->m_tagId == TAG_B ) hasHard = true;
|
|
}
|
|
// need to be isolated in a hard section
|
|
if ( ! hasHard ) continue;
|
|
|
|
// . must be a <tr> or <p> tag... for now keep it constrained
|
|
// . fixes events.mapchannels.com where we were allowing
|
|
// <td> cells to be headings in a row... kinda strange
|
|
// . allow SEC_HEADING and SEC_HEADING_CONTAINER to be set
|
|
// here at least and nixed in the 2nd loop. that way we
|
|
// can use SEC_NIXED_HEADING_CONTAINER in the algo
|
|
// in Dates.cpp that sets SEC_TOD_EVENT
|
|
// . WELL, i turned this off 11/8/11 and didn't seem to
|
|
// be needed! this should be tagid independent anyhow.
|
|
/*
|
|
if ( biggest->m_tagId != TAG_TR &&
|
|
// if we don't allow <td> cells we miss out on
|
|
// "PAYNE NURSERIES INC" for
|
|
// www.magicyellow.com/category/Nurseries_Plants_Trees/
|
|
// Santa_Fe_NM.html
|
|
biggest->m_tagId != TAG_TD &&
|
|
// http://www.fabnyc.org/calendar.php?type=class
|
|
biggest->m_tagId != TAG_TABLE &&
|
|
// fix pacificmedicalcenters.org
|
|
biggest->m_tagId != TAG_DT &&
|
|
biggest->m_tagId != TAG_P &&
|
|
biggest->m_tagId != TAG_P2 &&
|
|
biggest->m_tagId != TAG_H1 &&
|
|
biggest->m_tagId != TAG_H2 &&
|
|
biggest->m_tagId != TAG_H3 &&
|
|
biggest->m_tagId != TAG_H4 &&
|
|
// fix gwair.org to allow dates in strong tags to be
|
|
// headings so our getDelimHash() works. see comment
|
|
// above regarding gwair.org
|
|
biggest->m_tagId != TAG_STRONG &&
|
|
biggest->m_tagId != TAG_B )
|
|
continue;
|
|
*/
|
|
|
|
// now make sure the text is capitalized etc
|
|
bool hadUpper = false;
|
|
//bool hadLower = false;
|
|
int32_t lowerCount = 0;
|
|
bool hadYear = false;
|
|
bool hadAlpha = false;
|
|
int32_t i;
|
|
// scan the alnum words we contain
|
|
for ( i = a ; i <= b ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// . did we hit a breaking tag?
|
|
// . "<div> blah <table><tr><td>blah... </div>"
|
|
if ( m_tids[i] && isBreakingTagId(m_tids[i]) ) break;
|
|
// skip if not alnum word
|
|
if ( ! m_wids[i] ) continue;
|
|
// skip digits
|
|
if ( is_digit(m_wptrs[i][0]) ) {
|
|
// skip if not 4-digit year
|
|
if ( m_wlens[i] != 4 ) continue;
|
|
// . but if we had a year like "2010" that
|
|
// is allowed to be a header.
|
|
// . this fixes 770kob.com because the events
|
|
// under the "2010" header were telescoping
|
|
// up into events in the "December 2009"
|
|
// section, when they should have been in
|
|
// their own section! and now they are in
|
|
// their own implied section...
|
|
int32_t num = m_words->getAsLong(i);
|
|
if ( num < 1800 ) continue;
|
|
if ( num > 2100 ) continue;
|
|
// mark it
|
|
hadYear = true;
|
|
continue;
|
|
}
|
|
// mark this
|
|
hadAlpha = true;
|
|
// is it upper?
|
|
if ( is_upper_utf8(m_wptrs[i]) ) {
|
|
hadUpper = true;
|
|
continue;
|
|
}
|
|
// skip stop words
|
|
if ( m_words->isStopWord(i) ) continue;
|
|
// . skip int16_t words
|
|
// . November 4<sup>th</sup> for facebook.com
|
|
if ( m_wlens[i] <= 2 ) continue;
|
|
// is it lower?
|
|
if ( is_lower_utf8(m_wptrs[i]) ) lowerCount++;
|
|
// stop now if bad
|
|
//if ( hadUpper ) break;
|
|
if ( lowerCount >= 2 ) break;
|
|
}
|
|
// is it a header?
|
|
bool isHeader = hadUpper;
|
|
// a single year by itself is ok though too
|
|
if ( hadYear && ! hadAlpha ) isHeader = true;
|
|
// not if we had a lower case guy
|
|
//if ( hadLower ) isHeader = false;
|
|
// allow for one mistake like we do in Events.cpp for titles
|
|
if ( lowerCount >= 2 ) isHeader = false;
|
|
// . mark as bad? we need at least one upper case word!
|
|
// . seeing too many all lower case non-headers!
|
|
//if ( ! hadUpper ) continue;
|
|
// if we broke out early, give up
|
|
//if ( i < b ) continue;
|
|
if ( ! isHeader ) continue;
|
|
|
|
// ok, mark this section as a heading section
|
|
si->m_flags |= SEC_HEADING;
|
|
|
|
// mark all parents to up to biggest
|
|
biggest->m_flags |= SEC_HEADING_CONTAINER;
|
|
|
|
// mark all up to biggest now too! that way the td section
|
|
// gets marked and if the tr section gets replaced with a
|
|
// fake tc section then we are ok for METHOD_ABOVE_DOW!
|
|
for ( Section *pp = si; // ->m_parent ; (bug!)
|
|
pp && pp != biggest ;
|
|
pp = pp->m_parent ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
pp->m_flags |= SEC_HEADING_CONTAINER;
|
|
}
|
|
|
|
// a hack!
|
|
if ( inLink ) biggest->m_flags |= SEC_LINK_TEXT;
|
|
|
|
// count them
|
|
headings++;
|
|
}
|
|
|
|
// bail now if no headings were set
|
|
if ( ! headings ) return true;
|
|
|
|
// . now scan sections again and scan each list of brothers
|
|
// whose m_used is not set to 33
|
|
// . if it is set to 33 that means we already scanned it
|
|
// . and unset SEC_HEADING if brothers are not all the same tr or p tag
|
|
/*
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// skip if not a heading container
|
|
if ( ! ( si->m_flags & SEC_HEADING_CONTAINER) ) continue;
|
|
// skip if already did
|
|
if ( si->m_used == 33 ) continue;
|
|
// we got one, get the brother list
|
|
Section *bro = si;
|
|
// back up to first brother
|
|
for ( ; bro->m_prevBrother ; bro = bro->m_prevBrother )
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip NIXED for now
|
|
//continue;
|
|
// save it
|
|
Section *first = bro;
|
|
// are we a bad list?
|
|
char bad = 0;
|
|
// now do the full scan
|
|
for ( ; bro ; bro = bro->m_nextBrother ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// . must all be the same baseHash (kinda like tagId)
|
|
// . the WHOLE idea of this algo is to take a list
|
|
// of sections that are all the same tagId/baseHash
|
|
// and differentiate them so we can insert
|
|
// implied sections with headers
|
|
//if ( bro->m_baseHash != si->m_baseHash ) bad = 1;
|
|
// abqtango.org has all <p> tags but with different
|
|
// attributes... so fix that...
|
|
if ( bro->m_tagId != si->m_tagId ) bad = 1;
|
|
// if has link text and is heading, that is bad
|
|
if ( ( bro->m_flags & SEC_HEADING_CONTAINER ) &&
|
|
( bro->m_flags & SEC_LINK_TEXT ) )
|
|
bad = 2;
|
|
// mark it
|
|
bro->m_used = 33;
|
|
}
|
|
// if not bad, keep it
|
|
if ( ! bad ) continue;
|
|
// clear the whole list
|
|
bro = first;
|
|
// scan again
|
|
for ( ; bro ; bro = bro->m_nextBrother ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not set
|
|
if ( ! ( bro->m_flags & SEC_HEADING_CONTAINER ) )
|
|
continue;
|
|
// unset
|
|
bro->m_flags &= ~SEC_HEADING_CONTAINER;
|
|
// note it for Dates.cpp to use
|
|
bro->m_flags |= SEC_NIXED_HEADING_CONTAINER;
|
|
}
|
|
}
|
|
*/
|
|
return true;
|
|
}
|
|
|
|
void Sections::setTagHashes ( ) {
|
|
|
|
if ( m_numSections == 0 ) return;
|
|
|
|
// now recompute the tagHashes and depths and content hashes since
|
|
// we have eliminate open-ended sections in the loop above
|
|
//for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// these have to be in order of sn->m_a to work right
|
|
// because we rely on the parent tag hash, which would not
|
|
// necessarily be set if we were not sorted, because the
|
|
// parent section could have SEC_FAKE flag set because it is
|
|
// a br section added afterwards.
|
|
//Section *sn = m_sorted[i]; // sections[i];
|
|
// int16_tcut
|
|
int64_t bh = (int64_t)sn->m_baseHash;
|
|
//int64_t fh = sn->m_tagId;
|
|
// sanity check
|
|
if ( bh == 0 ) { char *xx=NULL;*xx=0; }
|
|
// if no parent, use initial values
|
|
if ( ! sn->m_parent ) {
|
|
sn->m_depth = 0;
|
|
sn->m_tagHash = bh;
|
|
sn->m_turkTagHash32 = sn->m_turkBaseHash;//m_tagId;
|
|
//sn->m_turkTagHash32 = bh;
|
|
//sn->m_formatHash = fh;
|
|
// sanity check
|
|
if ( bh == 0 ) { char *xx=NULL;*xx=0; }
|
|
continue;
|
|
}
|
|
// sanity check
|
|
if ( sn->m_parent->m_tagHash == 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// . update the cumulative front tag hash
|
|
// . do not include hyperlinks as part of the cumulative hash!
|
|
sn->m_tagHash = hash32h ( bh , sn->m_parent->m_tagHash );
|
|
|
|
// now use this for setting Date::m_dateTagHash instead
|
|
// of using Section::m_tagHash since often the dates like
|
|
// for zvents.org are in a <tr id=xxxx> where xxxx changes
|
|
sn->m_turkTagHash32 =
|
|
//hash32h ( sn->m_tagId, sn->m_parent->m_turkTagHash );
|
|
hash32h ( sn->m_turkBaseHash,
|
|
sn->m_parent->m_turkTagHash32 );
|
|
|
|
sn->m_colorHash = hash32h ( bh , sn->m_parent->m_colorHash );
|
|
|
|
//if(fh)sn->m_formatHash=hash32h(fh,sn->m_parent->m_formatHash)
|
|
//else sn->m_formatHash=sn->m_parent->m_formatHash;
|
|
|
|
// if we are an implied section, just use the tag hash of
|
|
// our parent. that way since we add different implied
|
|
// sections for msichicago.com root than we do the kid,
|
|
// the section voting should still match up
|
|
if ( bh == BH_IMPLIED ) {
|
|
sn->m_tagHash = sn->m_parent->m_tagHash;
|
|
sn->m_turkTagHash32 = sn->m_parent->m_turkTagHash32;
|
|
}
|
|
|
|
// sanity check
|
|
// i've seen this happen once for
|
|
// sn->m_parent->m_tagHash = 791013962
|
|
// bh = 20020
|
|
if ( sn->m_tagHash == 0 ) sn->m_tagHash = 1234567;
|
|
// depth based on parent, too
|
|
//if ( tid != TAG_A ) sn->m_depth = sn->m_parent->m_depth + 1;
|
|
//else sn->m_depth = sn->m_parent->m_depth ;
|
|
sn->m_depth = sn->m_parent->m_depth + 1;
|
|
}
|
|
}
|
|
|
|
// . returns true if any word strictly after si->m_a and before si->m_b
//   (and before m_nw) has a tag id in m_tids[] equal to "tagId"
// . the section's own first word, at si->m_a, is not examined
bool Sections::containsTagId ( Section *si, nodeid_t tagId ) {
	// start just past our own first word
	int32_t pos = si->m_a + 1;
	// scan as long as we are still inside the section
	while ( pos < m_nw && pos < si->m_b ) {
		// breathe
		QUICKPOLL(m_niceness);
		// got a match?
		if ( m_tids[pos] == tagId ) return true;
		pos++;
	}
	// no such tag in there
	return false;
}
|
|
|
|
bool Sections::setTableStuff ( ) {
|
|
char tbuf[1024];
|
|
HashTableX tdups;
|
|
tdups.set(4,0,64,tbuf,1024,false,m_niceness,"tdupt");
|
|
for ( int32_t i = 0 ; i < m_dates->m_numDatePtrs ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Date *di = m_dates->m_datePtrs[i];
|
|
// skip if none
|
|
if ( ! di ) continue;
|
|
// get section
|
|
Section *si = di->m_section;
|
|
// skip if not in body
|
|
if ( ! si ) continue;
|
|
// need to be in a table
|
|
Section *ts = si->m_tableSec;
|
|
// skip if none
|
|
if ( ! ts ) continue;
|
|
// if already did, forget it!
|
|
if ( tdups.isInTable ( &ts ) ) continue;
|
|
// add it
|
|
if ( ! tdups.addKey ( &ts ) ) return false;
|
|
// do it
|
|
if ( ! setTableDateHeaders ( ts ) ) return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// . does table have a date header row or column?
|
|
// . we need this for our weekly schedule detection algorithm
|
|
// . many sites have a row header this is the days of the week
|
|
// . sometimes they have tods in the first column, and sometimes they
|
|
// just put the tods and tod ranges in the table cells directly.
|
|
// . sets Section::m_flags SEC_HASDATEHEADERCOL/ROW for JUST the table
|
|
// section if it indeed has such date headers
|
|
// . Dates::isCompatible() looks at that table flag to see if it should
|
|
// apply special processing when deciding if two dates should be paired
|
|
// . then we set DF_TABLEDATEHEADERROW/COL for the dates in those
|
|
// header rows/cols so that we can set SF_RECURRING_DOW if the dow date
|
|
// was in the header row/col
|
|
bool Sections::setTableDateHeaders ( Section *ts ) {
|
|
|
|
// skip if only one row and column
|
|
if ( ! ( ts->m_flags & SEC_MULTIDIMS ) ) return true;
|
|
|
|
char adtbuf[1000];
|
|
HashTableX adt;
|
|
adt.set ( 4 , 0 , 64 , adtbuf, 1000, false,m_niceness,"adttab");
|
|
|
|
// sanity check
|
|
//if ( ! m_firstDateValid ) { char *xx=NULL;*xx=0; }
|
|
// return right away if table contains no dates
|
|
//int32_t dn = ts->m_firstDate - 1;
|
|
//if ( dn < 0 ) return true;
|
|
int32_t dn = 0;
|
|
|
|
Section *headerCol = NULL;
|
|
Section *headerRow = NULL;
|
|
|
|
for ( int32_t i = dn ; i < m_dates->m_numDatePtrs ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Date *di = m_dates->m_datePtrs[i];
|
|
// skip if nuked. part of a compound, or in bad section.
|
|
if ( ! di ) continue;
|
|
// must be in table
|
|
if ( di->m_a < ts->m_a ) continue;
|
|
// stop if out of table
|
|
if ( di->m_a >= ts->m_b ) break;
|
|
// sanity check
|
|
if ( di->m_a < 0 ) continue;
|
|
// if it is a single tod then make sure not like "8.32"
|
|
if ( di->m_flags & DF_FUZZY ) continue;
|
|
// get date's section
|
|
Section *si = di->m_section;
|
|
// must be in that table.. in subtable?? then skip...
|
|
if ( si->m_tableSec != ts ) continue;
|
|
|
|
// get the table cell section
|
|
Section *cell = si;
|
|
for ( ; cell ; cell = cell->m_parent ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( cell->m_tagId == TAG_TD ) break;
|
|
if ( cell->m_tagId == TAG_TH ) break;
|
|
}
|
|
// how does this happen?
|
|
if ( ! cell ) continue;
|
|
// . do not add our hash if not just a date
|
|
// . we are not header material if more than just a date
|
|
wbit_t *bb = NULL;
|
|
if ( m_bits ) bb = m_bits->m_bits;
|
|
bool hasWord = false;
|
|
for ( int32_t j = cell->m_a ; j < cell->m_b ; j++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( ! bb ) break;
|
|
if ( ! m_wids[j] ) continue;
|
|
if ( bb[j] & D_IS_IN_DATE ) continue;
|
|
hasWord = true;
|
|
break;
|
|
}
|
|
// we are not date header material if we got more than
|
|
// just a date in our td/th cell
|
|
if ( hasWord ) continue;
|
|
|
|
// get date types
|
|
datetype_t dt = di->m_hasType;
|
|
// see if same date type in prev row or col
|
|
int32_t row = si->m_rowNum;
|
|
int32_t col = si->m_colNum;
|
|
int32_t prevRow = row - 1;
|
|
int32_t prevCol = col - 1;
|
|
int32_t h;
|
|
// zero is invalid row
|
|
if ( prevRow >= 1 && ! headerCol ) {
|
|
h = hash32h ( prevRow , col );
|
|
h = hash32h ( (int32_t)(PTRTYPE)ts , h );
|
|
h = hash32h ( dt , h ); // datetype
|
|
if ( adt.isInTable ( &h ) ) {
|
|
headerCol = cell;
|
|
//break;
|
|
}
|
|
}
|
|
// zero is invalid col
|
|
if ( prevCol >= 1 && ! headerRow ) {
|
|
h = hash32h ( row , prevCol );
|
|
h = hash32h ( (int32_t)(PTRTYPE)ts , h );
|
|
h = hash32h ( dt , h ); // datetype
|
|
if ( adt.isInTable ( &h ) ) {
|
|
headerRow = cell;
|
|
//break;
|
|
}
|
|
}
|
|
// add our hash
|
|
h = hash32h ( row , col );
|
|
h = hash32h ( (int32_t)(PTRTYPE)ts , h );
|
|
h = hash32h ( dt , h ); // datetype
|
|
if ( ! adt.addKey ( &h ) ) return false;
|
|
}
|
|
// set flags in all cells of table
|
|
sec_t sef = 0;
|
|
if ( headerCol ) sef |= SEC_HASDATEHEADERCOL;
|
|
if ( headerRow ) sef |= SEC_HASDATEHEADERROW;
|
|
// bail if none
|
|
if ( ! sef ) return true;
|
|
// just set on table now to avoid confusion
|
|
ts->m_flags |= sef;
|
|
|
|
// . set DF_ISTABLEHEADER on dates in the headrow and headcol
|
|
// . then we can set SF_RECURRING if its a dow because its a
|
|
// weekly schedule then
|
|
// . we are setting implied sections so we do not have telescoped
|
|
// dates at this point.
|
|
for ( int32_t i = dn ; i < m_dates->m_numDatePtrs ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Date *di = m_dates->m_datePtrs[i];
|
|
// skip if nuked. part of a compound, or in bad section.
|
|
if ( ! di ) continue;
|
|
// in row?
|
|
Section *ds = di->m_section;
|
|
// must be in body
|
|
if ( ! ds ) continue;
|
|
// stop if beyond table
|
|
if ( ds->m_a >= ts->m_b ) break;
|
|
// must be in table
|
|
if ( ds->m_tableSec != ts ) continue;
|
|
// is date in a header column?
|
|
if ( headerCol && headerCol->m_colNum == ds->m_colNum )
|
|
di->m_flags |= DF_TABLEDATEHEADERCOL;
|
|
// is date in a header row?
|
|
if ( headerRow && headerRow->m_rowNum == ds->m_rowNum )
|
|
di->m_flags |= DF_TABLEDATEHEADERROW;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
bool Sections::swoggleTables ( ) {
|
|
|
|
// . turn this off for now because it is too confusing
|
|
// . really just need to rewrite the entire table and re-set
|
|
// the sections class!!
|
|
return true;
|
|
|
|
Section *lastTable = NULL;
|
|
// scan dates until we find one in a table
|
|
for ( int32_t i = 0 ; i < m_dates->m_numDatePtrs ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Date *di = m_dates->m_datePtrs[i];
|
|
// skip if nuked. part of a compound, or in bad section.
|
|
if ( ! di ) continue;
|
|
// sanity check
|
|
if ( di->m_a < 0 ) continue;
|
|
// get section
|
|
Section *si = di->m_section;
|
|
// must be in a table
|
|
if ( ! ( si->m_flags & SEC_IN_TABLE ) ) continue;
|
|
// get our table we are swoggling
|
|
Section *ts = si->m_tableSec;
|
|
// seems like this happens for bad sections like
|
|
// display:none hidden sections etc.
|
|
if ( ! ts ) continue;
|
|
// bail if already did it
|
|
if ( ts == lastTable ) continue;
|
|
// set it
|
|
lastTable = ts;
|
|
// swoggle that table
|
|
if ( ! swoggleTable ( i , ts ) ) return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// . swoggle the table at section "ts"
|
|
// . start at date #dn
|
|
bool Sections::swoggleTable ( int32_t dn , Section *ts ) {
|
|
|
|
char adtbuf[1000];
|
|
HashTableX adt;
|
|
adt.set ( 4 , 0 , 64 , adtbuf, 1000, false,m_niceness,"adttab");
|
|
|
|
bool adjacentColumns = false;
|
|
bool adjacentRows = false;
|
|
for ( int32_t i = dn ; i < m_dates->m_numDatePtrs ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
Date *di = m_dates->m_datePtrs[i];
|
|
// skip if nuked. part of a compound, or in bad section.
|
|
if ( ! di ) continue;
|
|
// sanity check
|
|
if ( di->m_a < 0 ) continue;
|
|
// if it is a single tod then make sure not like "8.32"
|
|
if ( di->m_flags & DF_FUZZY ) continue;
|
|
// get date's section
|
|
Section *si = di->m_section;
|
|
// must be in that table
|
|
if ( si->m_tableSec != ts ) break;
|
|
// get date types
|
|
datetype_t dt = di->m_hasType;
|
|
// see if same date type in prev row or col
|
|
int32_t row = si->m_rowNum;
|
|
int32_t col = si->m_colNum;
|
|
int32_t prevRow = row - 1;
|
|
int32_t prevCol = col - 1;
|
|
int32_t h;
|
|
if ( prevRow >= 0 ) {
|
|
h = hash32h ( prevRow , col );
|
|
h = hash32h ( (int32_t)ts , h );
|
|
h = hash32h ( dt , h ); // datetype
|
|
if ( adt.isInTable ( &h ) ) {
|
|
adjacentColumns = true;
|
|
//break;
|
|
}
|
|
}
|
|
if ( prevCol >= 0 ) {
|
|
h = hash32h ( row , prevCol );
|
|
h = hash32h ( (int32_t)ts , h );
|
|
h = hash32h ( dt , h ); // datetype
|
|
if ( adt.isInTable ( &h ) ) {
|
|
adjacentRows = true;
|
|
//break;
|
|
}
|
|
}
|
|
// add our hash
|
|
h = hash32h ( row , col );
|
|
h = hash32h ( (int32_t)ts , h );
|
|
h = hash32h ( dt , h ); // datetype
|
|
if ( ! adt.addKey ( &h ) ) return false;
|
|
}
|
|
// bail if nobody adjacent or both!
|
|
if ( ! adjacentColumns && ! adjacentRows ) return true;
|
|
if ( adjacentColumns && adjacentRows ) return true;
|
|
if ( adjacentColumns ) return true;
|
|
|
|
Section *next = NULL;
|
|
Section *tailSec = NULL;
|
|
// . remove <tr> sections from the doubly linked list of sections
|
|
// . up <tr> children's m_parent in the loop below though
|
|
for ( Section *si = ts ; si ; si = next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get next in case excised
|
|
next = si->m_next;
|
|
// stop if out of table
|
|
if ( si->m_a >= ts->m_b ) break;
|
|
// remember next section after last row
|
|
tailSec = si->m_next;
|
|
// skip if not td section or th
|
|
if ( si->m_tagId != TAG_TR ) continue;
|
|
// excise from linked list
|
|
si->m_prev->m_next = si->m_next;
|
|
si->m_next->m_prev = si->m_prev;
|
|
// its a doubly linked list
|
|
si->m_next->m_prev = si->m_prev;
|
|
si->m_prev->m_next = si->m_next;
|
|
// sanity
|
|
si->m_next = NULL;
|
|
si->m_prev = NULL;
|
|
}
|
|
|
|
#define MAXCOLS 50
|
|
Section *colSections[MAXCOLS];
|
|
Section *lastColCell[MAXCOLS];
|
|
Section *lastColKid [MAXCOLS];
|
|
for ( int32_t i = 0 ; i < MAXCOLS ; i++ ) {
|
|
colSections[i] = NULL;
|
|
lastColCell[i] = NULL;
|
|
lastColKid [i] = NULL;
|
|
}
|
|
int32_t maxColnum = 0;
|
|
|
|
// scan the <td> sections in the table
|
|
for ( Section *si = ts ; si ; si = next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get next in case excised
|
|
next = si->m_next;
|
|
// stop if out of table
|
|
if ( si->m_a >= ts->m_b ) break;
|
|
// skip if not td section or th
|
|
if ( si->m_tagId != TAG_TD && si->m_tagId != TAG_TH ) continue;
|
|
// parent must be table now since we excised row
|
|
if ( ! si->m_parent ) continue;
|
|
// must be parent of excised row of THIS table, not a td
|
|
// in another table inside this one
|
|
if ( si->m_parent->m_parent != ts ) continue;
|
|
// excise ourselves so we can re-add in a better place
|
|
//si->m_prev->m_next = NULL; // si->m_next;
|
|
//si->m_next->m_prev = si->m_prev;
|
|
// its a doubly linked list
|
|
//si->m_next->m_prev = si->m_prev;
|
|
//si->m_prev->m_next = si->m_next;
|
|
// int16_tcut
|
|
int32_t colnum = si->m_colNum;
|
|
// cram everything into last column in case of too many cols
|
|
if ( colnum >= MAXCOLS ) colnum = MAXCOLS-1;
|
|
// sanity
|
|
if ( colnum < 0 ) { char *xx=NULL;*xx=0; }
|
|
// get col section. starts at colnum=1, not 0!
|
|
Section *cs = colSections[colnum];
|
|
// make a new col section if haven't already
|
|
if ( ! cs ) {
|
|
// overflow check
|
|
if ( m_numSections >= m_maxNumSections) {
|
|
char *xx=NULL;*xx=0;}
|
|
// insert it
|
|
cs = &m_sections[m_numSections++];
|
|
// count them as well
|
|
if ( colnum > maxColnum ) maxColnum = colnum;
|
|
// get the previous colnum from us that is non-NULL,
|
|
// but use the table itself if we are the first.
|
|
Section *prevCol = NULL;
|
|
for ( int32_t pc = colnum - 1; pc >= 1 ; pc-- ) {
|
|
if ( ! colSections[pc] ) continue;
|
|
prevCol = colSections[pc];
|
|
break;
|
|
}
|
|
// the column sections are in a brother list
|
|
cs->m_prevBrother = prevCol;
|
|
if ( prevCol ) prevCol->m_nextBrother = cs;
|
|
// make it an implied section
|
|
cs->m_baseHash = BH_IMPLIED;
|
|
// reset these
|
|
cs->m_headColSection = NULL;
|
|
cs->m_headRowSection = NULL;
|
|
cs->m_firstWordPos = -1;
|
|
cs->m_lastWordPos = -1;
|
|
cs->m_firstPlaceNum= -1;
|
|
cs->m_senta = -1;
|
|
cs->m_sentb = -1;
|
|
cs->m_a = si->m_a;
|
|
cs->m_b = si->m_b;
|
|
cs->m_parent = ts;
|
|
cs->m_tagId = TAG_TC;
|
|
cs->m_flags = 0;
|
|
cs->m_flags |= SEC_NOTEXT;
|
|
// store in temp array
|
|
colSections[colnum] = cs;
|
|
}
|
|
// get it
|
|
if ( si->m_a == 1156 && si->m_b == 1171 )
|
|
log("you");
|
|
// col sec is our new parent, not the <tr> section we erased
|
|
si->m_parent = cs;
|
|
// update col section's m_b
|
|
if ( si->m_b > cs->m_b ) cs->m_b = si->m_b;
|
|
// assume not in brother list
|
|
si->m_prevBrother = NULL;
|
|
si->m_nextBrother = NULL;
|
|
// and in the doubly linked list we now follow the
|
|
// last one in the colsection
|
|
Section *lastCell = lastColCell[colnum];
|
|
// insert into brother list if we are not the first
|
|
if ( lastCell ) {
|
|
lastCell->m_nextBrother = si;
|
|
si->m_prevBrother = lastCell;
|
|
}
|
|
// get last section in this th/td cell
|
|
Section *lastKid = si;
|
|
// endpoint of that th/td cell
|
|
int32_t b = lastKid->m_b;
|
|
// and find the true last kid in this cell
|
|
for ( ; lastKid ; lastKid = lastKid->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop if we are the last section period
|
|
if ( ! lastKid->m_next ) break;
|
|
// keep looping as int32_t as the "lastKid"
|
|
// section is contained in this td/th cell
|
|
if ( lastKid->m_next->m_a >= b ) break;
|
|
}
|
|
|
|
// stitch the th/td cells in the same column together
|
|
Section *prevLastKid = lastColKid[colnum];
|
|
// link this cell up with the previous section it should be w/
|
|
if ( prevLastKid ) {
|
|
prevLastKid->m_next = si;
|
|
si->m_prev = prevLastKid;
|
|
}
|
|
else {
|
|
si->m_prev = cs;
|
|
cs->m_next = si;
|
|
}
|
|
|
|
// this td/th section is not connected to anything else yet.
|
|
// if we do not connect it right above we will connect it up
|
|
// in the loop below.
|
|
// no, we can't do this because it messes up our scan for
|
|
// this for loop!
|
|
//lastKid->m_next = NULL;
|
|
// we are the new last kid for this col #
|
|
lastColCell[colnum] = si;
|
|
lastColKid [colnum] = lastKid;
|
|
}
|
|
|
|
//
|
|
// now set the m_prev/m_next members of each column section
|
|
//
|
|
Section *prevLastKid = NULL;
|
|
for ( int32_t i = 1 ; i <= maxColnum ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get it
|
|
Section *cs = colSections[i];
|
|
// skip if empty
|
|
if ( ! cs ) continue;
|
|
// get last section in this column
|
|
Section *lastKid = lastColKid[i];
|
|
// insert column into linked list with the lastKid
|
|
// of the previous column
|
|
if ( prevLastKid ) {
|
|
cs->m_prev = prevLastKid;
|
|
prevLastKid->m_next = cs;
|
|
}
|
|
else {
|
|
cs->m_prev = ts;
|
|
ts->m_next = cs;
|
|
}
|
|
// assume we are the final lastKid and will not be connected
|
|
// with the "prevLastKid" logic above
|
|
lastKid->m_next = tailSec;
|
|
tailSec->m_prev = lastKid;
|
|
// set this now
|
|
prevLastKid = lastKid;
|
|
}
|
|
|
|
//verifySections();
|
|
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
|
|
// . just the voting info for passing into diffbot in json
|
|
// . along w/ the title/summary/etc. we can return this json blob for each search result
|
|
bool Sections::printVotingInfoInJSON ( SafeBuf *sb ) {
|
|
|
|
// save ptrs
|
|
m_sbuf = sb;
|
|
m_sbuf->setLabel ("sectprnt2");
|
|
|
|
// print sections out
|
|
for ( Section *sk = m_rootSection ; sk ; ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// print this section
|
|
printSectionDiv ( sk , FORMAT_JSON ); // forProCog );
|
|
// advance
|
|
int32_t b = sk->m_b;
|
|
// stop if last
|
|
if ( b >= m_nw ) break;
|
|
// get section after that
|
|
sk = m_sectionPtrs[b];
|
|
}
|
|
|
|
// ensure ends in \0
|
|
if ( ! sb->nullTerm() ) return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
// make this replace ::print() when it works
|
|
bool Sections::print2 ( SafeBuf *sbuf ,
|
|
int32_t hiPos,
|
|
int32_t *wposVec,
|
|
char *densityVec,
|
|
char *diversityVec,
|
|
char *wordSpamVec,
|
|
char *fragVec,
|
|
HashTableX *st2 ,
|
|
HashTableX *tt ,
|
|
Addresses *aa ,
|
|
char format ) { // bool forProCog ){
|
|
//FORMAT_PROCOG FORMAT_JSON HTML
|
|
|
|
//sbuf->safePrintf("<b>Sections in Document</b>\n");
|
|
|
|
// save ptrs
|
|
m_sbuf = sbuf;
|
|
|
|
m_sbuf->setLabel ("sectprnt");
|
|
|
|
//m_pt = pt;
|
|
//m_et = et;
|
|
//m_at = at;
|
|
//m_priceTable = priceTable;
|
|
m_aa = aa;
|
|
m_hiPos = hiPos;
|
|
|
|
m_wposVec = wposVec;
|
|
m_densityVec = densityVec;
|
|
m_diversityVec = diversityVec;
|
|
m_wordSpamVec = wordSpamVec;
|
|
m_fragVec = fragVec;
|
|
|
|
//verifySections();
|
|
|
|
char **wptrs = m_words->getWords ();
|
|
int32_t *wlens = m_words->getWordLens ();
|
|
//nodeid_t *tids = m_words->getTagIds();
|
|
int32_t nw = m_words->getNumWords ();
|
|
|
|
// check words
|
|
for ( int32_t i = 0 ; i < nw ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get section
|
|
Section *sn = m_sectionPtrs[i];
|
|
if ( sn->m_a > i ) { char *xx=NULL;*xx=0; }
|
|
if ( sn->m_b <= i ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
|
|
// print sections out
|
|
for ( Section *sk = m_rootSection ; sk ; ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// print this section
|
|
printSectionDiv ( sk , format );//forProCog );
|
|
// advance
|
|
int32_t b = sk->m_b;
|
|
// stop if last
|
|
if ( b >= m_nw ) break;
|
|
// get section after that
|
|
sk = m_sectionPtrs[b];
|
|
}
|
|
|
|
if ( format != FORMAT_HTML ) return true; // forProCog
|
|
|
|
// print header
|
|
char *hdr =
|
|
"<table border=1>"
|
|
"<tr>"
|
|
"<td><b>sec #</b></td>"
|
|
"<td><b>wordStart</b></td>"
|
|
"<td><b>wordEnd</b></td>"
|
|
"<td><b>baseHash</b></td>"
|
|
"<td><b>cumulTagHash</b></td>"
|
|
"<td><b>contentHash</b></td>"
|
|
"<td><b>contentTagHash</b></td>"
|
|
"<td><b>XOR</b></td>" // only valid for contentHashes
|
|
"<td><b>alnum words</b></td>" // contained in section
|
|
"<td><b>depth</b></td>"
|
|
"<td><b>parent word range</b></td>"
|
|
"<td><b># siblings</b></td>"
|
|
"<td><b>flags</b></td>"
|
|
"<td><b>evIds</b></td>"
|
|
"<td><b>text snippet</b></td>"
|
|
//"<td>votes for static</td>"
|
|
//"<td>votes for dynamic</td>"
|
|
//"<td>votes for texty</td>"
|
|
//"<td>votes for unique</td>"
|
|
"</tr>\n";
|
|
sbuf->safePrintf("%s",hdr);
|
|
|
|
int32_t rcount = 0;
|
|
int32_t scount = 0;
|
|
// show word # of each section so we can look in PageParser.cpp's
|
|
// output to see exactly where it starts, since we now label all
|
|
// the words
|
|
//for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// see if one big table causes a browser slowdown
|
|
if ( (++rcount % TABLE_ROWS ) == 0 )
|
|
sbuf->safePrintf("<!--ignore--></table>%s\n",hdr);
|
|
// get it
|
|
//Section *sn = &m_sections[i];
|
|
//Section *sn = m_sorted[i];
|
|
// skip if not a section with its own words
|
|
//if ( sn->m_flags & SEC_NOTEXT ) continue;
|
|
char *xs = "--";
|
|
char ttt[100];
|
|
if ( sn->m_contentHash64 ) {
|
|
int32_t modified = sn->m_tagHash ^ sn->m_contentHash64;
|
|
sprintf(ttt,"0x%"XINT32"",modified);
|
|
xs = ttt;
|
|
}
|
|
// int16_tcut
|
|
Section *parent = sn->m_parent;
|
|
int32_t pswn = -1;
|
|
int32_t pewn = -1;
|
|
if ( parent ) pswn = parent->m_a;
|
|
if ( parent ) pewn = parent->m_b;
|
|
// print it
|
|
sbuf->safePrintf("<!--ignore--><tr><td>%"INT32"</td>\n"
|
|
"<td>%"INT32"</td>"
|
|
"<td>%"INT32"</td>"
|
|
"<td>0x%"XINT32"</td>"
|
|
"<td>0x%"XINT32"</td>"
|
|
"<td>0x%"XINT32"</td>"
|
|
"<td>0x%"XINT32"</td>"
|
|
"<td>%s</td>"
|
|
"<td>%"INT32"</td>"
|
|
"<td>%"INT32"</td>"
|
|
"<td><nobr>%"INT32" to %"INT32"</nobr></td>"
|
|
"<td>%"INT32"</td>"
|
|
"<td><nobr>" ,
|
|
scount++,//i,
|
|
sn->m_a,
|
|
sn->m_b,
|
|
(int32_t)sn->m_baseHash,
|
|
(int32_t)sn->m_tagHash,
|
|
(int32_t)sn->m_contentHash64,
|
|
(int32_t)(sn->m_contentHash64^sn->m_tagHash),
|
|
xs,
|
|
sn->m_exclusive,
|
|
sn->m_depth,
|
|
pswn,
|
|
pewn,
|
|
sn->m_numOccurences);//totalOccurences );
|
|
// now show the flags
|
|
printFlags ( sbuf , sn , false );
|
|
// first few words of section
|
|
int32_t a = sn->m_a;
|
|
int32_t b = sn->m_b;
|
|
// -1 means an unclosed tag!! should no longer be the case
|
|
if ( b == -1 ) { char *xx=NULL;*xx=0; }//b=m_words->m_numWords;
|
|
sbuf->safePrintf("</nobr></td>");
|
|
|
|
sbuf->safePrintf("<td> </td>");
|
|
|
|
sbuf->safePrintf("<td><nobr>");
|
|
// 70 chars max
|
|
int32_t max = 70;
|
|
int32_t count = 0;
|
|
char truncated = 0;
|
|
// do not print last word/tag in section
|
|
for ( int32_t i = a ; i < b - 1 && count < max ; i++ ) {
|
|
char *s = wptrs[i];
|
|
int32_t slen = wlens[i];
|
|
if ( count + slen > max ) {
|
|
truncated = 1;
|
|
slen = max - count;
|
|
}
|
|
count += slen;
|
|
// boldify front tag
|
|
if ( i == a ) sbuf->safePrintf("<b>");
|
|
sbuf->htmlEncode(s,slen,false);
|
|
// boldify front tag
|
|
if ( i == a ) sbuf->safePrintf("</b>");
|
|
}
|
|
// if we truncated print a ...
|
|
if ( truncated ) sbuf->safePrintf("<b>...</b>");
|
|
// then print ending tag
|
|
if ( b < nw ) {
|
|
int32_t blen = wlens[b-1];
|
|
if ( blen>20 ) blen = 20;
|
|
sbuf->safePrintf("<b>");
|
|
sbuf->htmlEncode(wptrs[b-1],blen,false);
|
|
sbuf->safePrintf("</b>");
|
|
}
|
|
|
|
sbuf->safePrintf("</nobr></td></tr>\n");
|
|
}
|
|
|
|
sbuf->safePrintf("</table>\n<br>\n");
|
|
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
bool SectionVotingTable::print ( SafeBuf *sbuf , char *title ) {
|
|
|
|
|
|
// title = "new/old section voting table"
|
|
sbuf->safePrintf("<b>%s</b>\n",title );
|
|
sbuf->safePrintf("<br>");
|
|
sbuf->safePrintf("<i>turkTagHash is combined with sectionType to "
|
|
"make the "
|
|
"key in sectiondb. The data is everything else. "
|
|
"<br>"
|
|
"*The turkTagHash is XOR'ed with the contentHash "
|
|
"for the contentHash "
|
|
"section type, and is the tagPairHash for the "
|
|
"tagPairHash section type.</i>");
|
|
// print table header
|
|
char *hdr2 =
|
|
"<table border=1>"
|
|
"<tr>"
|
|
"<td><b>siteHash</b></td>"
|
|
"<td><b>turkTagHash*</b></td>"
|
|
"<td><b>sectionType</b></td>"
|
|
"<td><b>scoreTotal</b></td>"
|
|
"<td><b>numSampled</b></td>"
|
|
"<td><b>avgScore</b></td>"
|
|
"</tr>\n";
|
|
sbuf->safePrintf("%s",hdr2);
|
|
HashTableX *st = &m_svt;
|
|
int32_t rcount = 0;
|
|
for ( int32_t i = 0 ; i < st->m_numSlots ; i++ ) {
|
|
// breathe
|
|
//QUICKPOLL(m_niceness);
|
|
// skip if empty
|
|
if ( ! st->m_flags[i] ) continue;
|
|
// see if one big table causes a browser slowdown
|
|
if ( (++rcount % TABLE_ROWS ) == 0 )
|
|
sbuf->safePrintf("<!--ignore--></table>%s\n",hdr2);
|
|
// get the key
|
|
uint64_t k = *(uint64_t *)st->getKey ( i );
|
|
// get the data
|
|
SectionVote *sv = (SectionVote *)st->getValueFromSlot ( i );
|
|
// parse key
|
|
int32_t turkTagHash = (int32_t)(k & 0xffffffff);
|
|
int32_t sectionType = (int32_t)(k >> 32);
|
|
//int32_t tagHash = (int32_t)(k >> 32);
|
|
//int32_t sectionType = (int32_t)(k & 0xffffffff);
|
|
// convert to string
|
|
char *st = getSectionTypeAsStr ( sectionType );
|
|
float avg = 0.0;
|
|
if ( sv->m_numSampled > 0 ) avg = sv->m_score/sv->m_numSampled;
|
|
sbuf->safePrintf("<tr>"
|
|
"<td>--</td>"
|
|
"<td>0x%"XINT32"</td>"
|
|
"<td>%s</td>"
|
|
"<td>%.02f</td>"
|
|
"<td>%.02f</td>"
|
|
"<td>%.02f</td>"
|
|
"</tr>\n",
|
|
turkTagHash,
|
|
st,
|
|
sv->m_score,
|
|
sv->m_numSampled ,
|
|
avg );
|
|
}
|
|
sbuf->safePrintf("</table>\n<br><br>\n");
|
|
|
|
|
|
|
|
sbuf->safePrintf("<table border=1 cellpadding=3>\n");
|
|
sbuf->safePrintf(
|
|
"<tr><td><nobr><b>Section Type</b></nobr></td>"
|
|
"<td><b>Description</b></td></tr>\n"
|
|
|
|
"<tr><td>texty</td><td>An average "
|
|
"score of 0.0 means the "
|
|
"section does not have any words that are not "
|
|
"in anchor test. An average "
|
|
"score of 1.0 means the section "
|
|
"has words, none of which are in anchor text."
|
|
"</td></tr>\n"
|
|
|
|
"<tr><td>clock</td><td>An average "
|
|
"score of 1.0 means "
|
|
"section identified as "
|
|
"containing a clock. An average "
|
|
"score of 0.0 mean it contains "
|
|
"a date that is definitely not a clock."
|
|
"</td></tr>\n"
|
|
|
|
"<tr><td>eurdatefmt</td><td>An average "
|
|
"score of 1.0 means "
|
|
"the section contains a date in european format, "
|
|
"which means day first, not month. An average "
|
|
"score of 0.0 "
|
|
"means the section contains a date that is NOT in "
|
|
"european format.</td></tr>\n"
|
|
|
|
//"<tr><td>event</td><td>"
|
|
//"</td></tr>\n"
|
|
|
|
//"<tr><td>address</td><td></td></tr>\n"
|
|
|
|
"<tr><td>tagpairhash</td><td>"
|
|
"All unique adjacent tag ids are hashed to form "
|
|
"this number which represents the overal structure "
|
|
"of the document. The turkTagHash is the tagPairHash "
|
|
"in this case."
|
|
"</td></tr>\n"
|
|
|
|
"<tr><td>contenthash</td><td>"
|
|
"The turkTagHash in this case is really "
|
|
"the content hash of the words in the section XORed "
|
|
"with the original turkTagHash."
|
|
"</td></tr>\n"
|
|
|
|
//"<tr><td>textmaxsmpl</td><td></td></tr>\n"
|
|
//"<tr><td>waitinline</td><td></td></tr>\n"
|
|
//"<tr><td></td><td></td></tr>\n"
|
|
);
|
|
sbuf->safePrintf("</table>\n<br><br>\n");
|
|
// must be \0 terminated
|
|
if ( sbuf->m_buf[sbuf->m_length] ) { char *xx=NULL;*xx=0; }
|
|
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
// . print section "sk" into m_sbuf as an indented, colored <div> and recurse
//   into every subsection that starts inside of [sk->m_a,sk->m_b)
// . "format" selects the amount of debug info printed in the italic header:
//   FORMAT_PROCOG only prints the menu flags of sentence sections, every
//   other format prints hashes, xpath stats, table cell links, flags, etc.
// . reads m_wptrs/m_wlens/m_wids/m_tids/m_sectionPtrs as well as the rank
//   vectors m_wposVec/m_densityVec/m_wordSpamVec/m_fragVec and m_hiPos
// . always returns true
bool Sections::printSectionDiv ( Section *sk , char format ) {
	// separate us from whatever was printed before
	m_sbuf->safePrintf("<br>");

	// the background color is derived from the section's color hash.
	// font color (fcolor) and border color (rcolor) default to black.
	int32_t bcolor = (int32_t)sk->m_colorHash& 0x00ffffff;
	int32_t fcolor = 0x000000;
	int32_t rcolor = 0x000000;
	// look at the first three bytes of bcolor in memory
	// NOTE(review): on a little-endian host these are the three color
	// components; on big-endian bp[0] would be the masked-out high byte
	uint8_t *bp = (uint8_t *)&bcolor;
	// dark if all three components are in the lower half...
	bool dark = ( bp[0]<128 && bp[1]<128 && bp[2]<128 );
	// ...or if any two components are less than 100
	if ( bp[0]<100 && bp[1]<100 ) dark = true;
	if ( bp[1]<100 && bp[2]<100 ) dark = true;
	if ( bp[0]<100 && bp[2]<100 ) dark = true;
	// if bg color is dark, make font and border color light
	if ( dark ) {
		fcolor = 0x00ffffff;
		rcolor = 0x00ffffff;
	}

	// start the new div
	m_sbuf->safePrintf("<div "
			   "style=\""
			   "background-color:#%06"XINT32";"
			   "margin-left:20px;"
			   "border:#%06"XINT32" 1px solid;"
			   "color:#%06"XINT32"\">",
			   bcolor,
			   rcolor,
			   fcolor);

	// a parentless (root) section does not print its first word if the
	// section right after it starts on that same word
	bool printWord = true;
	if ( ! sk->m_parent && sk->m_next && sk->m_next->m_a == sk->m_a )
		printWord = false;

	// print the tag that opens this section, but not for fake sections
	// or sections without a tag id
	if ( !(sk->m_flags&SEC_FAKE) && sk->m_tagId && printWord )
		m_sbuf->htmlEncode(m_wptrs[sk->m_a],m_wlens[sk->m_a],false );

	// the per-section info is printed in italics
	m_sbuf->safePrintf("<i>");

	// procog format only shows the menu flags of sentences
	if ( format == FORMAT_PROCOG && (sk->m_flags & SEC_SENTENCE) ) {
		sec_t f = sk->m_flags;
		if ( f & SEC_MENU_SENTENCE )
			m_sbuf->safePrintf("menusentence " );
		if ( f & SEC_MENU )
			m_sbuf->safePrintf("ismenu " );
		if ( f & SEC_MENU_HEADER )
			m_sbuf->safePrintf("menuheader " );
	}

	if ( format != FORMAT_PROCOG ) {
		// word # the section starts on
		m_sbuf->safePrintf("A=%"INT32" ",sk->m_a);

		// print tag hashes. casts are unsigned to agree with the
		// unsigned conversion specifier.
		m_sbuf->safePrintf("taghash=%"UINT32" ",
				   (uint32_t)sk->m_tagHash);
		m_sbuf->safePrintf("turktaghash=%"UINT32" ",
				   (uint32_t)sk->m_turkTagHash32);

		// content hashes, the sentence one only if it differs
		if ( sk->m_contentHash64 )
			m_sbuf->safePrintf("ch64=%"UINT64" ",
					   sk->m_contentHash64);
		if ( sk->m_sentenceContentHash64 &&
		     sk->m_sentenceContentHash64 != sk->m_contentHash64 )
			m_sbuf->safePrintf("sch=%"UINT64" ",
					   sk->m_sentenceContentHash64);

		// xpath voting info, only for sections flagged SEC_HASHXPATH
		if ( sk->m_flags & SEC_HASHXPATH ) {
			// turk tag hash xor'ed with low 32 bits of site hash
			uint64_t mod = (uint32_t)sk->m_turkTagHash32;
			mod ^= (uint32_t)(uint64_t)m_siteHash64;
			// link to a facet search on that hash
			// NOTE(review): "decoration:none" is presumably meant
			// to be "text-decoration:none" -- left as is so the
			// emitted html does not change
			m_sbuf->safePrintf("<a style=decoration:none; "
					   "href=/search?c=%s&"
					   "q=gbfacetstr%%3A"
					   "gbxpathsitehash%"UINT64">"
					   "xpathsitehash=%"UINT64""
					   "</a> "
					   ,m_coll
					   ,mod
					   ,mod);

			// low 32 bits of the indirect sentence hash
			uint32_t val = (uint32_t) sk->m_indirectSentHash64 ;
			m_sbuf->safePrintf("xpathsitehashval=%"UINT32" ", val );

			// some voting stats
			SectionStats *ss = &sk->m_stats;
			m_sbuf->safePrintf("_s=M%"INT32"D%"INT32"n%"INT32"u%"INT32"h%"UINT32" "
					   ,(int32_t)ss->m_totalMatches
					   ,(int32_t)ss->m_totalDocIds
					   ,(int32_t)ss->m_totalEntries
					   ,(int32_t)ss->m_numUniqueVals
					   ,(uint32_t)mod
					   );
		}

		if ( sk->m_lastLinkContentHash32 )
			m_sbuf->safePrintf("llch=%"UINT32" ",
					   (uint32_t)sk->m_lastLinkContentHash32);

		// word # of the neighboring cells, if set
		if ( sk->m_leftCell )
			m_sbuf->safePrintf("leftcellA=%"INT32" ",
					   (int32_t)sk->m_leftCell->m_a);
		if ( sk->m_aboveCell )
			m_sbuf->safePrintf("abovecellA=%"INT32" ",
					   (int32_t)sk->m_aboveCell->m_a);

		// now show the flags
		printFlags ( m_sbuf , sk , false );

		if ( isHardSection(sk) )
			m_sbuf->safePrintf("hardsec ");

		// . count the places that start in this section
		// . m_aa is NULL after reset() so do not dereference it
		//   blindly, just report no places in that case
		int32_t acount = 0;
		int32_t np = 0;
		if ( m_aa ) np = m_aa->m_numSorted;
		for ( int32_t pi = sk->m_firstPlaceNum ;
		      pi >= 0 && pi < np ;
		      pi++ ) {
			Place *p = m_aa->m_sorted[pi];
			// stop if not in section any more
			if ( p->m_a >= sk->m_b ) break;
			// count them
			acount++;
		}

		// print the non-zero item bits/xors
		if ( sk->m_dateBits )
			m_sbuf->safePrintf("datebits=0x%"XINT32" ",
					   (int32_t)sk->m_dateBits);
		if ( sk->m_phoneXor )
			m_sbuf->safePrintf("phonexor=0x%"XINT32" ",sk->m_phoneXor);
		if ( sk->m_emailXor )
			m_sbuf->safePrintf("emailxor=0x%"XINT32" ",sk->m_emailXor);
		if ( sk->m_priceXor )
			m_sbuf->safePrintf("pricexor=0x%"XINT32" ",sk->m_priceXor);
		if ( sk->m_todXor )
			m_sbuf->safePrintf("todxor=0x%"XINT32" ",sk->m_todXor);
		if ( sk->m_dayXor )
			m_sbuf->safePrintf("dayxor=0x%"XINT32" ",sk->m_dayXor);
		if ( sk->m_addrXor )
			m_sbuf->safePrintf("addrxor=0x%"XINT32" ",sk->m_addrXor);
		if ( acount >= 2 )
			m_sbuf->safePrintf(" (%"INT32" places)",acount);
	}

	m_sbuf->safePrintf("</i>\n");

	// now print each word and subsection in this section
	int32_t a = sk->m_a;
	int32_t b = sk->m_b;
	for ( int32_t i = a ; i < b ; i++ ) {
		// breathe
		QUICKPOLL(m_niceness);
		// . our own front tag was already printed above, skip it
		// . BUT if we are root then really this tag belongs to
		//   our first child, so make an exception for root!
		if ( i == a && m_tids[i] && (sk->m_parent) ) continue;

		// . get section of this word
		// . TODO: what if this was the tr tag we removed??? i guess
		//   maybe make it NULL now?
		Section *ws = m_sectionPtrs[i];
		// telescope up to the top most parent that starts at the
		// same word position and is not "sk"
		for ( ; ; ws = ws->m_parent ) {
			if ( ws == sk ) break;
			if ( ! ws->m_parent ) break;
			if ( ws->m_parent->m_a != ws->m_a ) break;
			if ( ws->m_parent == sk ) break;
		}
		// if it belongs to another section, print that section
		if ( ws != sk ) {
			// print out this subsection
			printSectionDiv ( ws , format );
			// advance to end of that then. loop does the i++.
			i = ws->m_b - 1;
			// and try next word
			continue;
		}

		// if in style section, etc. just print the word out plain
		if ( sk->m_flags & NOINDEXFLAGS ) {
			m_sbuf->htmlEncode(m_wptrs[i],m_wlens[i],false );
			continue;
		}

		// classify the word once. only alnum words have a word id.
		bool isAlnum  = ( m_wids[i] != 0 );
		// is it the word position we were asked to highlight?
		bool isHiPos  = ( isAlnum && m_wposVec[i] == m_hiPos );
		// strike it out if its entry in the frag vector is 0
		bool isStruck = ( isAlnum &&
				  i < MAXFRAGWORDS &&
				  m_fragVec[i] == 0 );

		// open the decoration for alnum words
		if ( isHiPos  ) m_sbuf->safePrintf("<a name=hipos></a>");
		if ( isAlnum  ) m_sbuf->safePrintf("<nobr><b>");
		if ( isStruck ) m_sbuf->safePrintf("<strike>");
		if ( isHiPos  )
			m_sbuf->safePrintf("<blink style=\""
					   "background-color:yellow;"
					   "color:black;\">");
		// print that word
		m_sbuf->htmlEncode(m_wptrs[i],m_wlens[i],false );
		// close the decoration again
		if ( isHiPos  ) m_sbuf->safePrintf("</blink>");
		if ( isStruck ) m_sbuf->safePrintf("</strike>");
		if ( isAlnum  ) m_sbuf->safePrintf("</b>");

		// punctuation words and tags are done
		if ( ! isAlnum ) continue;

		// and print out the pos/density/spam subscript
		m_sbuf->safePrintf("<sub "
				   "style=\"background-color:white;"
				   "font-size:10px;"
				   "border:black 1px solid;"
				   "color:black;\">");
		m_sbuf->safePrintf("%"INT32"",m_wposVec[i]);
		// ranks are only shown if they are not the max rank
		if ( m_densityVec[i] != MAXDENSITYRANK )
			m_sbuf->safePrintf("/<font color=purple><b>%"INT32""
					   "</b></font>"
					   ,
					   (int32_t)m_densityVec[i]);
		// (the diversity rank is no longer printed)
		if ( m_wordSpamVec[i] != MAXWORDSPAMRANK )
			m_sbuf->safePrintf("/<font color=red><b>%"INT32""
					   "</b></font>"
					   ,
					   (int32_t)m_wordSpamVec[i]);
		m_sbuf->safePrintf("</sub></nobr>");
	}
	m_sbuf->safePrintf("</div>\n");

	return true;
}
|
|
|
|
bool Sections::verifySections ( ) {
|
|
|
|
// make sure we map each word to a section that contains it at least
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
|
Section *si = m_sectionPtrs[i];
|
|
if ( si->m_a > i ) { char *xx=NULL;*xx=0; }
|
|
if ( si->m_b <= i ) { char *xx=NULL;*xx=0; }
|
|
// must have checksum
|
|
if ( m_wids[i] && si->m_contentHash64==0){char *xx=NULL;*xx=0;}
|
|
// must have this set if 0
|
|
if ( ! si->m_contentHash64 && !(si->m_flags & SEC_NOTEXT)) {
|
|
char *xx=NULL;*xx=0;}
|
|
if ( si->m_contentHash64 && (si->m_flags & SEC_NOTEXT)) {
|
|
char *xx=NULL;*xx=0;}
|
|
}
|
|
|
|
for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next )
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
|
|
// sanity check
|
|
//for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// get it
|
|
//Section *sn = &m_sections[i];
|
|
// get parent
|
|
Section *sp = sn->m_parent;
|
|
subloop3:
|
|
// skip if no parent
|
|
if ( ! sp ) continue;
|
|
// make sure parent fully contains
|
|
if ( sp->m_a > sn->m_a ) { char *xx=NULL;*xx=0; }
|
|
if ( sp->m_b < sn->m_b ) { char *xx=NULL;*xx=0; }
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// and make sure every grandparent fully contains us too!
|
|
sp = sp->m_parent;
|
|
goto subloop3;
|
|
}
|
|
|
|
// sanity check
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
Section *sn = &m_sections[i];
|
|
if ( sn->m_a >= sn->m_b ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
// sanity check, make sure each section is contained by the
|
|
// smallest section containing it
|
|
//for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
for ( Section *si = m_rootSection ; si ; si = si->m_next ) {
|
|
//Section *si = &m_sections[i];
|
|
//if ( ! si ) continue;
|
|
//for ( int32_t j = 0 ; j < m_numSections ; j++ ) {
|
|
for ( Section *sj = m_rootSection ; sj ; sj = sj->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if us
|
|
if ( sj == si ) continue;
|
|
// skip column sections because they are artificial
|
|
// and only truly contain some of the sections that
|
|
// their [a,b) interval says they contain.
|
|
if ( sj->m_tagId == TAG_TC ) continue;
|
|
// or if an implied section of td tags in a tc
|
|
if ( sj->m_baseHash == BH_IMPLIED &&
|
|
sj->m_parent &&
|
|
sj->m_parent->m_tagId == TAG_TC )
|
|
continue;
|
|
// get him
|
|
//Section *sj = &m_sections[j];
|
|
// skip if sj does not contain first word in si
|
|
if ( sj->m_a > si->m_a ) continue;
|
|
if ( sj->m_b <= si->m_a ) continue;
|
|
// ok, make sure in our parent path
|
|
Section *ps = si;
|
|
for ( ; ps ; ps = ps->m_parent )
|
|
if ( ps == sj ) break;
|
|
// ok if we found it
|
|
if ( ps ) continue;
|
|
// sometimes if sections are equal then the other
|
|
// is the parent
|
|
ps = sj;
|
|
for ( ; ps ; ps = ps->m_parent )
|
|
if ( ps == si ) break;
|
|
// must have had us
|
|
if ( ps ) continue;
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
}
|
|
|
|
// make sure we map each word to a section that contains it at least
|
|
for ( int32_t i = 0 ; i < m_nw ; i++ ) {
|
|
Section *si = m_sectionPtrs[i];
|
|
if ( si->m_a > i ) { char *xx=NULL;*xx=0; }
|
|
if ( si->m_b <= i ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool Sections::isTagDelimeter ( class Section *si , nodeid_t tagId ) {
|
|
// store
|
|
Section *saved = si;
|
|
// . people embed tags in other tags, so scan
|
|
// . fix "<strong><em><br></em><strong><span><br></span> for
|
|
// guysndollsllc.com homepage
|
|
for ( ; si ; si = si->m_next ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop if si not contained in saved
|
|
if ( ! saved->contains ( si ) ) return false;
|
|
// stop if got it
|
|
if ( tagId != TAG_BR && si->m_tagId == tagId ) break;
|
|
// tag br can be a pr or p tag (any vert whitespace really)
|
|
if ( tagId == TAG_BR ) {
|
|
if ( si->m_tagId == TAG_BR ) break;
|
|
if ( si->m_tagId == TAG_P ) break;
|
|
// treat <hN> and </hN> as single breaking tags too
|
|
nodeid_t ft = si->m_tagId & BACKBITCOMP;
|
|
if ( ft >= TAG_H1 && ft <= TAG_H5 ) break;
|
|
// fix tennisoncampus.com which has <p>...<table> as
|
|
// a double space
|
|
if ( si->m_tagId == TAG_TABLE ) break;
|
|
}
|
|
// . skip if has no text of its own
|
|
// . like if looking for header tags and we got:
|
|
// <tr><td></td><td><h1>stuff here</h1></td></tr>
|
|
// we do not want the <td></td> to stop us
|
|
// bluefin-cambridge.com
|
|
if ( si->m_firstWordPos < 0 ) continue;
|
|
// stop if hit alnum before tagid
|
|
//if ( si->m_alnumPosA != saved->m_alnumPosA )
|
|
// return false;
|
|
// stop if hit text before hitting tagid
|
|
if ( si->m_lastWordPos != saved->m_lastWordPos )
|
|
return false;
|
|
}
|
|
// done?
|
|
if ( ! si ) return false;
|
|
//if ( si->m_tagId != tagId ) return false;
|
|
if ( tagId != TAG_BR ) return true;
|
|
// need double brs (or p tags)
|
|
int32_t a = si->m_a + 1;
|
|
//if ( a + 2 >= m_nw ) return false;
|
|
//if ( m_tids[a+1] == TAG_BR ) return true;
|
|
//if ( m_wids[a+1] ) return false;
|
|
//if ( m_tids[a+2] == TAG_BR ) return true;
|
|
// guynsdollsllc.com homepage has some crazy br tags
|
|
// in their own em strong tags, etc.
|
|
int32_t kmax = a+10;
|
|
if ( kmax > m_nw ) kmax = m_nw;
|
|
for ( int32_t k = a ; k < kmax ; k++ ) {
|
|
if ( m_wids[k] ) return false;
|
|
if ( m_tids[k] == TAG_BR ) return true;
|
|
if ( m_tids[k] == TAG_P ) return true;
|
|
if ( m_tids[k] == TAG_TABLE ) return true;
|
|
// treat <hN> and </hN> as single breaking tags too
|
|
nodeid_t ft = m_tids[k] & BACKBITCOMP;
|
|
if ( ft >= TAG_H1 && ft <= TAG_H5 ) return true;
|
|
}
|
|
return false;
|
|
};
|
|
|
|
// . this maps a section ptr to if it contains registration info or not
|
|
// . we use this so we can ignore addresses in "registration" sections
|
|
// . now we also include old years to ignore addresses in sentences like
|
|
// "since its first opening in 2003 at gerschwin theater" talking about
|
|
// past events
|
|
// . returns false with g_errno set on error
|
|
bool Sections::setRegistrationBits ( ) {
|
|
|
|
//if ( m_regTableValid ) return &m_regTable;
|
|
if ( m_setRegBits ) return true;
|
|
|
|
//if ( ! m_regTable.set ( 4 , 0 , 128,NULL,0,false,m_niceness ) )
|
|
// return NULL;
|
|
|
|
// make a keyword table
|
|
static HashTableX s_kt;
|
|
static bool s_ktinit = false;
|
|
static char s_ktbuf[1200];
|
|
static int64_t h_registration;
|
|
static int64_t h_registrar;
|
|
static int64_t h_registrations;
|
|
static int64_t h_reservation;
|
|
static int64_t h_reservations;
|
|
static int64_t h_register;
|
|
static int64_t h_sign;
|
|
static int64_t h_up;
|
|
static int64_t h_signup;
|
|
static int64_t h_tickets;
|
|
static int64_t h_purchase;
|
|
static int64_t h_request;
|
|
static int64_t h_requesting;
|
|
static int64_t h_get;
|
|
static int64_t h_enroll;
|
|
static int64_t h_buy;
|
|
static int64_t h_presale ;
|
|
static int64_t h_pre ;
|
|
static int64_t h_sale ;
|
|
static int64_t h_on ;
|
|
static int64_t h_to ;
|
|
static int64_t h_sales ;
|
|
static int64_t h_deliver;
|
|
static int64_t h_picks;
|
|
static int64_t h_box; // box office for newmexicojazzfestival.org
|
|
static int64_t h_office;
|
|
static int64_t h_ticket;//ticket window for newmexicojazzfestival.org
|
|
static int64_t h_online;
|
|
static int64_t h_window;
|
|
static int64_t h_patron;
|
|
static int64_t h_service;
|
|
static int64_t h_services;
|
|
static int64_t h_phone;
|
|
static int64_t h_hours;
|
|
static int64_t h_end ;
|
|
static int64_t h_begin ;
|
|
static int64_t h_start ;
|
|
static int64_t h_stop ;
|
|
static int64_t h_parking;
|
|
static int64_t h_performance;
|
|
static int64_t h_dates;
|
|
static int64_t h_take;
|
|
static int64_t h_takes;
|
|
static int64_t h_place;
|
|
static int64_t h_doors;
|
|
static int64_t h_open;
|
|
static int64_t h_event;
|
|
static int64_t h_details;
|
|
if ( ! s_ktinit ) {
|
|
s_ktinit = true;
|
|
s_kt.set(8,0,128,s_ktbuf,1200,false,m_niceness,"evkeywrds");
|
|
// assign these
|
|
h_registrar = hash64n("registrar");
|
|
h_registration = hash64n("registration");
|
|
h_registrations= hash64n("registrations");
|
|
h_reservation = hash64n("reservation");
|
|
h_reservations = hash64n("reservations");
|
|
h_register = hash64n("register");
|
|
h_sign = hash64n("sign");
|
|
h_up = hash64n("up");
|
|
h_signup = hash64n("signup");
|
|
h_tickets = hash64n("tickets");
|
|
h_purchase = hash64n("purchase");
|
|
h_requesting = hash64n("requesting");
|
|
h_request = hash64n("request");
|
|
h_get = hash64n("get");
|
|
h_enroll = hash64n("enroll");
|
|
h_buy = hash64n("buy");
|
|
h_presale = hash64n("presale");
|
|
h_pre = hash64n("pre");
|
|
h_sale = hash64n("sale");
|
|
h_on = hash64n("on");
|
|
h_to = hash64n("to");
|
|
h_sales = hash64n("sales");
|
|
h_deliver = hash64n("deliver");
|
|
h_end = hash64n("end");
|
|
h_begin = hash64n("begin");
|
|
h_start = hash64n("start");
|
|
h_stop = hash64n("stop");
|
|
h_parking = hash64n("parking");
|
|
h_performance = hash64n("performance");
|
|
h_dates = hash64n("dates");
|
|
h_take = hash64n("take");
|
|
h_takes = hash64n("takes");
|
|
h_place = hash64n("place");
|
|
h_doors = hash64n("doors");
|
|
h_open = hash64n("open");
|
|
h_event = hash64n("event");
|
|
h_details = hash64n("details");
|
|
h_box = hash64n("box");
|
|
h_office = hash64n("office");
|
|
h_ticket = hash64n("ticket");
|
|
h_online = hash64n("online");
|
|
h_window = hash64n("window");
|
|
h_patron = hash64n("patron");
|
|
h_service = hash64n("service");
|
|
h_services = hash64n("services");
|
|
h_phone = hash64n("phone");
|
|
h_hours = hash64n("hours");
|
|
h_picks = hash64n("picks");
|
|
// populate it
|
|
s_kt.addKey(&h_registration);
|
|
s_kt.addKey(&h_registrar);
|
|
s_kt.addKey(&h_registrations);
|
|
s_kt.addKey(&h_reservation);
|
|
s_kt.addKey(&h_reservations);
|
|
s_kt.addKey(&h_register);
|
|
s_kt.addKey(&h_sign);
|
|
s_kt.addKey(&h_signup);
|
|
s_kt.addKey(&h_buy);
|
|
s_kt.addKey(&h_purchase);
|
|
s_kt.addKey(&h_requesting);
|
|
s_kt.addKey(&h_request);
|
|
s_kt.addKey(&h_get);
|
|
s_kt.addKey(&h_tickets);
|
|
s_kt.addKey(&h_presale);
|
|
s_kt.addKey(&h_on);
|
|
s_kt.addKey(&h_pre);
|
|
s_kt.addKey(&h_sales);
|
|
s_kt.addKey(&h_deliver);
|
|
s_kt.addKey(&h_picks);
|
|
s_kt.addKey(&h_box);
|
|
s_kt.addKey(&h_ticket);
|
|
s_kt.addKey(&h_patron);
|
|
s_kt.addKey(&h_phone);
|
|
// support parking stuff now
|
|
s_kt.addKey(&h_parking);
|
|
}
|
|
|
|
// this maps a section ptr to the places it contains
|
|
//HashTableX *at = m_addresses->getPlaceTable ();
|
|
// this maps a section ptr to the dates it contains
|
|
//HashTableX *tt = m_dates->getTODTable ();
|
|
|
|
// int16_tcut
|
|
//int64_t *wids = m_wids;
|
|
|
|
m_bits->setInLinkBits ( this ); // m_sections );
|
|
|
|
// scan all words
|
|
for ( int32_t j = 0 ; j < m_nw ; j++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip punct words
|
|
if ( ! m_wids[j] ) continue;
|
|
// if it is an old date with a year fixes "Since its openining
|
|
// at Gerschwin THeater in Oct 30, 2003 Wicked has sold out"
|
|
// for events.kgoradio.com/san-francisco-ca/events/show/
|
|
// 113475645-wicked
|
|
// Optionally we could check for "Since * opening at *"
|
|
// This seems to hurt to many other events, so let's
|
|
// try the option.
|
|
/*
|
|
if ( (m_bits[j] & D_IS_IN_DATE) ) {
|
|
// must be a year
|
|
int32_t num = m_words->getAsLong(j);
|
|
if ( num < 1700 ) continue;
|
|
if ( num >= 2009 ) continue;
|
|
// get section
|
|
Section *sk = m_sections->m_sectionPtrs[j];
|
|
// assume a year
|
|
if ( ! m_regTable.addKey ( &sk ) ) return false;
|
|
// skip it
|
|
continue;
|
|
}
|
|
*/
|
|
|
|
// int16_tcut
|
|
int64_t wid = m_wids[j];
|
|
|
|
// "ictickets office" for http://www.ictickets.com/Event/
|
|
// Default.aspx?id=1036
|
|
// CAUTION: this maps tickets,ticketing -> ticket
|
|
if ( m_wlens[j] >= 7 ) {
|
|
char *p = m_wptrs[j];
|
|
char *pend = p + m_wlens[j];
|
|
for ( ; p < pend ; p++ ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( to_lower_a(p[0]) == 't' &&
|
|
to_lower_a(p[1]) == 'i' &&
|
|
to_lower_a(p[2]) == 'c' &&
|
|
to_lower_a(p[3]) == 'k' &&
|
|
to_lower_a(p[4]) == 'e' &&
|
|
to_lower_a(p[5]) == 't' )
|
|
wid = h_ticket;
|
|
}
|
|
}
|
|
|
|
// skip if not in table
|
|
if( ! s_kt.isInTable(&wid)) continue;
|
|
|
|
// get next word
|
|
int32_t max = j + 10; if ( max > m_nw ) max = m_nw;
|
|
// assume right after us
|
|
int32_t next = j + 2;
|
|
// set nextWid to its id
|
|
int64_t nextWid = 0LL;
|
|
for ( ; next < max ; next++ ) {
|
|
if ( ! m_wids[next] ) continue;
|
|
// grab it
|
|
nextWid = m_wids[next];
|
|
// for loop below
|
|
next++;
|
|
// stop
|
|
break;
|
|
}
|
|
// and the next after that
|
|
int64_t nextWid2 = 0LL;
|
|
for ( ; next < max ; next++ ) {
|
|
if ( ! m_wids[next] ) continue;
|
|
// grab it
|
|
nextWid2 = m_wids[next];
|
|
// stop
|
|
break;
|
|
}
|
|
|
|
// . ignore if in a link
|
|
// . fixes southgatehouse.com "buy tickets"
|
|
// . i think that tingley colesium url has that too!
|
|
if ( (m_bits->m_bits[j]) & D_IN_LINK ) continue;
|
|
|
|
// synonyms
|
|
if ( nextWid == h_tickets ) nextWid = h_ticket;
|
|
if ( nextWid2 == h_tickets ) nextWid2 = h_ticket;
|
|
|
|
// reset flag
|
|
char gotIt = 0;
|
|
// is it register?
|
|
if ( wid == h_register ) gotIt = 1;
|
|
if ( wid == h_registrar ) gotIt = 1;
|
|
if ( wid == h_registration ) gotIt = 1;
|
|
if ( wid == h_registrations ) gotIt = 1;
|
|
if ( wid == h_reservation ) gotIt = 1;
|
|
if ( wid == h_reservations ) gotIt = 1;
|
|
if ( wid == h_phone && nextWid == h_hours ) gotIt = 1;
|
|
if ( wid == h_sign && nextWid == h_up ) gotIt = 1;
|
|
if ( wid == h_signup ) gotIt = 1;
|
|
if ( wid == h_buy && nextWid == h_ticket ) gotIt = 1;
|
|
//if ( wid == h_buy && nextWid == h_online ) gotIt = 1;
|
|
if ( wid == h_purchase&&nextWid==h_ticket ) gotIt = 1;
|
|
if ( wid == h_get && nextWid==h_ticket ) gotIt = 1;
|
|
// for that jimmy kimmel live url "requesting tickets online"
|
|
if ( wid == h_request && nextWid==h_ticket) gotIt = 1;
|
|
if ( wid == h_requesting && nextWid==h_ticket) gotIt = 1;
|
|
// "after requesting your tickets"
|
|
if ( wid == h_requesting && nextWid2==h_ticket) gotIt = 1;
|
|
if ( wid == h_ticket && nextWid==h_request) gotIt = 1;
|
|
// "purchase single tickets" or "purchase * tickets"
|
|
// to fix www.santarosasymphony.com/tickets/tickets_home.asp
|
|
if ( wid == h_purchase && nextWid2==h_ticket ) gotIt = 1;
|
|
if ( wid == h_patron && nextWid ==h_services) gotIt = 1;
|
|
if ( wid == h_patron && nextWid ==h_service ) gotIt = 1;
|
|
if ( wid == h_phone && nextWid ==h_hours ) gotIt = 1;
|
|
// "give them tickets to" for santafe playhouse url
|
|
// to cancel out "Max's or Dish n' Spoon" as a place
|
|
// (for Address.cpp's call to isTicketDate())
|
|
if ( wid == h_ticket && nextWid == h_to ) gotIt = 1;
|
|
if ( wid == h_presale ) gotIt = 1;
|
|
if ( wid == h_on && nextWid == h_sale ) gotIt = 1;
|
|
if ( wid == h_pre && nextWid == h_sale ) gotIt = 1;
|
|
if ( wid == h_sales && nextWid == h_end ) gotIt = 1;
|
|
if ( wid == h_sales && nextWid == h_stop ) gotIt = 1;
|
|
if ( wid == h_sales && nextWid == h_begin ) gotIt = 1;
|
|
if ( wid == h_sales && nextWid == h_start ) gotIt = 1;
|
|
// deliver saturdays and sundays - frenchtulips.com
|
|
if ( wid == h_deliver ) gotIt = 1;
|
|
// The Taos Ski Shuttle picks up guests daily at 9:15am
|
|
if ( wid == h_picks && nextWid == h_up ) gotIt = 1;
|
|
if ( wid == h_box && nextWid == h_office ) gotIt = 1;
|
|
if ( wid == h_ticket&& nextWid == h_window ) gotIt = 1;
|
|
if ( wid == h_ticket&& nextWid == h_office ) gotIt = 1;
|
|
// try to fix villr.com whose event is at a parking lot!
|
|
if ( wid == h_parking ) gotIt = 2;
|
|
// ticket: or tickets: fix for mesaartscenter.com
|
|
//if (wid==h_ticket && m_wptrs[j][m_wlens[j]]==':') gotIt = 1;
|
|
if ( ! gotIt ) continue;
|
|
// skip if in title
|
|
Section *sk = m_sectionPtrs[j];
|
|
|
|
// telescope up to sentence
|
|
for ( ; sk ; sk = sk->m_parent ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop if sentence
|
|
if ( sk->m_flags & SEC_SENTENCE ) break;
|
|
}
|
|
|
|
// sk is NULL if in "bad" section like an <option> tag
|
|
// in which case we skip it i guess
|
|
if ( ! sk ) continue;
|
|
|
|
// if in a menu like "register" for mrmovietimes.com continue
|
|
if ( sk->m_flags & SEC_MENU ) continue;
|
|
|
|
// save it
|
|
//Section *orig = sk;
|
|
|
|
int32_t sa = sk->m_firstWordPos;
|
|
//int32_t sb = sk->m_lastWordPos;
|
|
//bool expanded = false;
|
|
Section *lastsk = NULL;
|
|
|
|
// are we in a header section?
|
|
//bool heading = ( sk->m_flags & SEC_HEADING );
|
|
|
|
// ok, now telescope up this section
|
|
// get first word
|
|
//int32_t fw = sk->m_a - 1;
|
|
// find alnum right before that
|
|
//for ( ; fw >= 0 ; fw-- )
|
|
// if ( wids[fw] ) break;
|
|
// get first word below us
|
|
//int32_t fw2 = sk->m_b;
|
|
// scan for it
|
|
//for ( ; fw2 < m_nw ; fw2++ )
|
|
// if ( wids[fw2] ) break;
|
|
// telescope until we contain a good place
|
|
for ( ; sk ; sk = sk->m_parent ) {
|
|
// stop if includes words above our sentence
|
|
//if ( sk->m_a <= fw ) break;
|
|
if ( sk->m_firstWordPos < sa ) break;
|
|
|
|
// save it
|
|
lastsk = sk;
|
|
|
|
// did we expand?
|
|
//if ( sk->m_lastWordPos != sb ) expanded = true;
|
|
|
|
// stop if we hit a title tag
|
|
//if ( sk->m_tagId == TAG_TITLE ) break;
|
|
|
|
// . stop if this section is in a list of other
|
|
// . we should hash each sections tag hash along with
|
|
// their parent section ptr for this
|
|
//if ( sk->m_container ) break;
|
|
|
|
// add it
|
|
//if ( ! m_regTable.addKey ( &sk ) ) return false;
|
|
// flag it as well
|
|
if ( gotIt == 1 ) sk->m_flags |= SEC_HAS_REGISTRATION;
|
|
else sk->m_flags |= SEC_HAS_PARKING;
|
|
|
|
// stop if includes new words below
|
|
//if ( sk->m_b > fw2 ) break;
|
|
//if ( sk->m_lastWordPos > sb ) break;
|
|
|
|
// or td hit. fixes events.mapchannels.com
|
|
if ( sk->m_tagId == TAG_TD ) break;
|
|
if ( sk->m_tagId == TAG_TR ) break;
|
|
if ( sk->m_tagId == TAG_TABLE ) break;
|
|
if ( sk->m_tagId == TAG_FORM ) break;
|
|
|
|
// . we can extend maxsb if we are a heading only
|
|
// . this fixes signmeup.com which has "registration"
|
|
// in the first non-header sentence of a section and
|
|
// then in the second sentence has the critical date
|
|
// "Thanksgiving" which we need to telescope to to
|
|
// get our events.
|
|
// . now that we only telescope "sk" if the key word
|
|
// was in a "heading" we fix that.
|
|
//if ( sk->m_b > sb && ! heading ) break;
|
|
|
|
/*
|
|
Section *nb = sk->m_nextBrother;
|
|
|
|
// this did fix mapchannels.com, but now we only
|
|
// allow SEC_HEADING_CONAINER to be set on a list
|
|
// of brothers that have <tr> or <p> tags, and not
|
|
// the <td> tags as used in mapchannels.com.
|
|
// get next brother
|
|
|
|
// . skip implied sections as those are often wrong
|
|
// . this fixed events.mapchannels.com from allowing
|
|
// the "Buy Tickets..." heading to telescope up
|
|
// to the whole row in the registration table,
|
|
// because its brother got encapsulated in an
|
|
// implied section.
|
|
//for ( ; nb ; nb = nb->m_next )
|
|
// // stop if not implied
|
|
// if ( nb->m_baseHash != BH_IMPLIED ) break;
|
|
|
|
// do not telescope more if we have a next brother
|
|
if ( nb &&
|
|
// can't use tagHash because implied sections
|
|
// changed it!
|
|
//nb->m_tagHash == sk->m_tagHash &&
|
|
nb->m_tagId == sk->m_tagId &&
|
|
// . ignore br tag sections
|
|
// . often the header is "registration" then there
|
|
// is a br tag and the registration info like in
|
|
// signmeup.com
|
|
( sk->m_baseHash != BH_BR &&
|
|
sk->m_baseHash != BH_BRBR ) )
|
|
break;
|
|
*/
|
|
|
|
// stop if title tag hit
|
|
int32_t a = sk->m_a;
|
|
if ( m_tids[a] == TAG_TITLE ) break;
|
|
// . stop if we hit a place
|
|
// . this was causing signmeup.com which had the
|
|
// address then the times of the registration to
|
|
// mess up
|
|
//if ( at->isInTable(&sk) ) break;
|
|
// or a time!
|
|
// might have registration at a particular time
|
|
// on the same day as other events all at the same
|
|
// place like for
|
|
// http://www.breadnm.org/custom.html
|
|
// crap this breaks denver.org which just happens
|
|
// to have like everything in a table...
|
|
//if ( tt->isInTable(&sk) ) break;
|
|
|
|
// the whole thing might have been registration!
|
|
// so don't let sk become NULL. this went NULL for
|
|
// the txt doc:
|
|
// http://bywaysresourcecenter.org/events/uploads/AB
|
|
// RC_09ConfReg_bro_FNL.txt which basically had no
|
|
// sections
|
|
if ( ! sk->m_parent ) break;
|
|
}
|
|
|
|
// if the biggest section with the registration sentence
|
|
// at the top of it contains multiple sentences, then
|
|
// that should be good enough...
|
|
sk = lastsk;
|
|
// sanity check
|
|
if ( ! sk ) { char *xx=NULL;*xx=0; }
|
|
|
|
|
|
// shortcut
|
|
Section **sp = m_sectionPtrs;
|
|
|
|
//
|
|
// now unset SEC_HAS_REGISTRATION for sections that seem
|
|
// to contain sentences that are specifically about the
|
|
// event or actual performance
|
|
//
|
|
|
|
// now completely ignore "sb" and starting at word #i
|
|
// scan until we hit a breaking tag (excluding br) or
|
|
// we hit a phrase that identifies the end of the reservation
|
|
// sections. like "Performance Dates" for santafeplayhouse.org.
|
|
// and for adobetheater.org we should stop at the </tr> tag
|
|
// so we can pick up that "Friday from 9 AM to 4 PM" that
|
|
// we were missing because "sb" stopped short.
|
|
|
|
for ( int32_t i = j ; i < sk->m_b ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// stop if certain tag id
|
|
if ( m_tids[i] ) {
|
|
/*
|
|
// get tag id
|
|
nodeid_t tid = m_tids[i] & BACKBITCOMP;
|
|
// mapchannels has "buy tickets" in a td cell
|
|
if ( tid == TAG_TD ) clearIt = true;
|
|
if ( tid == TAG_TR ) clearIt = true;
|
|
if ( tid == TAG_TABLE ) clearIt = true;
|
|
if ( tid == TAG_FORM ) clearIt = true;
|
|
*/
|
|
continue;
|
|
}
|
|
// skip if punct
|
|
if ( ! m_wids[i] ) continue;
|
|
// get section
|
|
Section *si = sp[i];
|
|
// certain words will stop us
|
|
if ( i+2<m_nw ) {
|
|
// fix for santafeplayhouse.org
|
|
if ( m_wids[i ] == h_performance &&
|
|
m_wids[i+2] == h_dates )
|
|
goto clear;
|
|
// fix signmeup.com
|
|
if ( m_wids[i ] == h_take &&
|
|
m_wids[i+2] == h_place )
|
|
goto clear;
|
|
if ( m_wids[i ] == h_takes &&
|
|
m_wids[i+2] == h_place )
|
|
goto clear;
|
|
// fix reverbnation.com
|
|
if ( m_wids[i ] == h_doors &&
|
|
m_wids[i+2] == h_open )
|
|
goto clear;
|
|
// fix socialmediabeach.com
|
|
if ( m_wids[i ] == h_event &&
|
|
m_wids[i+2] == h_details )
|
|
goto clear;
|
|
}
|
|
// mark it
|
|
if ( gotIt == 1 ) si->m_flags |= SEC_HAS_REGISTRATION;
|
|
else si->m_flags |= SEC_HAS_PARKING;
|
|
continue;
|
|
clear:
|
|
for ( ; si ; si = si->m_parent ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// clear flag
|
|
si->m_flags &= ~SEC_HAS_REGISTRATION;
|
|
si->m_flags &= ~SEC_HAS_PARKING;
|
|
}
|
|
break;
|
|
}
|
|
|
|
/*
|
|
|
|
// this messes up too many other pages. the better approach
|
|
// to fix mesaartscenter.com is to consider those events
|
|
// outlinked titles because of the "DETAILS" link. then
|
|
// we can go to the correct page and get the correct tod
|
|
// ranges.
|
|
|
|
// get largest section containing us. just our sentence.
|
|
// then get next brother of that. that whole brother
|
|
// should be under our influence. assuming we do not have
|
|
// any dates in our section...
|
|
Section *gs = orig;
|
|
// require it to be a heading i guess
|
|
if ( ! ( gs->m_flags & SEC_HEADING ) ) gs = NULL;
|
|
// blow it up into the biggest container of just "orig"
|
|
for ( ; gs ; gs = gs->m_parent ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( ! gs->m_parent ) break;
|
|
if ( gs->m_parent->m_alnumPosA != gs->m_alnumPosA )
|
|
break;
|
|
if ( gs->m_parent->m_alnumPosB != gs->m_alnumPosB )
|
|
break;
|
|
}
|
|
// this is non-NULL if registration indicator was in a heading
|
|
if ( gs ) {
|
|
// get the brother section after that heading
|
|
Section *bro = gs->m_nextBrother;
|
|
// mark all sections in "bro" as registration
|
|
for ( Section *ss = bro ; ss ; ss = ss->m_next ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( ss->m_a >= bro->m_b ) break;
|
|
ss->m_flags |= SEC_HAS_REGISTRATION;
|
|
}
|
|
}
|
|
*/
|
|
}
|
|
|
|
// . try to fix adobetheater.org which has one sentence which
|
|
// we see as two:
|
|
// "Advance reservations are accepted on Monday - Thursday from "
|
|
// 9 AM to 5 PM" and "Friday from 9 AM to 4 PM."
|
|
// . assume hours are registration dates is better to be wrong
|
|
// and call them registration dates than the other way around because
|
|
// we do not want to give wrong dates
|
|
|
|
// . now scan the sections
|
|
// . look at the lists of brothers
|
|
// . if a brother is in the table , then assume all brothers after
|
|
// him are in the table as well
|
|
/*
|
|
for ( Section *si = m_sections->m_rootSection ; si ; si = si->m_next ){
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// skip if not in table
|
|
//if ( ! m_regTable.isInTable ( &si ) ) continue;
|
|
if ( ! (si->m_flags & SEC_HAS_REGISTRATION) ) continue;
|
|
// skip if already scanned
|
|
if ( si->m_used == 88 ) continue;
|
|
// scan brothers
|
|
for ( Section *nb = si ; nb ; nb = nb->m_nextBrother ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// mark all his brothers as reg too!
|
|
nb->m_flags |= SEC_HAS_REGISTRATION;
|
|
// mark it
|
|
nb->m_used = 88;
|
|
}
|
|
}
|
|
*/
|
|
|
|
m_setRegBits = true;
|
|
return true;
|
|
//m_regTableValid = true;
|
|
//return &m_regTable;
|
|
}
|
|
|
|
|
|
sentflags_t getMixedCaseFlags ( Words *words ,
				wbit_t *bits ,
				int32_t senta ,
				int32_t sentb ,
				int32_t niceness ) {

	// . classify the capitalization of the words in [senta,sentb)
	// . returns SENT_MIXED_CASE_STRICT if at least one word was counted
	//   as "lower case", and SENT_MIXED_CASE if that is still true
	//   after forgiving a single lower case word (see the end)
	// . returns both flags right away if the first counted word is a
	//   lower case stop word, like "by Ron Hutchinson"
	int64_t  *wordIds  = words->getWordIds ();
	int32_t  *wordLens = words->getWordLens();
	char    **wordPtrs = words->getWordPtrs();

	int32_t numLower = 0;
	int32_t numUpper = 0;
	// true until we have processed the first alnum word outside parens
	bool atFirstWord  = true;
	bool insideParens = false;

	for ( int32_t k = senta ; k < sentb ; k++ ) {
		// breathe
		QUICKPOLL(niceness);
		// non-alnum words only drive the parentheses tracking
		if ( ! wordIds[k] ) {
			// tags are ignored entirely
			if ( wordPtrs[k][0] == '<' ) continue;
			// close before open so ") (" leaves us inside
			if ( words->hasChar(k,')') ) insideParens = false;
			if ( words->hasChar(k,'(') ) insideParens = true;
			continue;
		}
		// parenthesized words do not count either way
		if ( insideParens ) continue;

		bool isUpper  = is_upper_utf8 ( wordPtrs[k] );
		bool stopWord = words->isStopWord ( k );

		// a leading lower case stop word settles it immediately
		if ( atFirstWord && stopWord && ! isUpper )
			return (SENT_MIXED_CASE | SENT_MIXED_CASE_STRICT);

		// a word right after a hyphen counts as upper case, like
		// the "lee" in "Kay-lee"
		if ( k > 0 && wordPtrs[k][-1] == '-' ) isUpper = true;

		// . count it as lower case if it is not upper, does not
		//   start with a digit, and is either the first word or a
		//   non-stop word of 3+ chars
		// . the length limit ignores stuff like the "th" in
		//   "November 4<sup>th</sup>"
		bool significant =
			atFirstWord ||
			( ! stopWord && wordLens[k] >= 3 );
		if ( ! isUpper &&
		     ! is_digit(wordPtrs[k][0]) &&
		     significant )
			numLower++;

		// every later word is no longer the first one
		atFirstWord = false;

		// words that are part of a date, like "Sunday", are never
		// counted as upper case
		if ( bits[k] & D_IS_IN_DATE ) isUpper = false;
		if ( isUpper ) numUpper++;
	}

	sentflags_t result = 0;

	// the strict flag looks at the raw count
	if ( numLower > 0 ) result |= SENT_MIXED_CASE_STRICT;

	// . forgive exactly one lower case word if there are at least two
	//   upper case words
	// . or at least one upper case word when the sentence starts in a
	//   date, as in "7-10:30pm Contra dance"
	if ( numLower == 1 &&
	     ( numUpper >= 2 ||
	       ( numUpper >= 1 && (bits[senta] & D_IS_IN_DATE) ) ) )
		numLower = 0;

	if ( numLower > 0 ) result |= SENT_MIXED_CASE;

	return result;
}
|
|
|
|
/*
|
|
uint32_t getSectionContentTagHash3 ( Section *sn ) {
|
|
// sanity check
|
|
if ( sn->m_firstWordPos < 0 ) { char *xx=NULL;*xx=0; }
|
|
// hash the tagid of this section together with its last
|
|
// 3 parents for a total of 4
|
|
uint32_t th = sn->m_tagId;
|
|
// telescope up through the next 3 parents
|
|
int32_t pcount = 0;
|
|
for ( Section *ps = sn->m_parent; ps ; ps = ps->m_parent ) {
|
|
// limit to 3
|
|
if ( ++pcount >= 4 ) break;
|
|
// incorporate it
|
|
th = hash32h ( (uint32_t)ps->m_tagId , th );
|
|
}
|
|
// accumulate the tag hash
|
|
//th = hash32 ( th , sn->m_tagHash );
|
|
// . hash that like in XmlDoc::getSectionVotingTable()
|
|
// . combine the tag hash with the content hash #2
|
|
return th ^ sn->m_contentHash;
|
|
}
|
|
*/
|
|
|
|
// identify crap like "Good For Kids: Yes" for yelp.com etc. so we don't
|
|
// use that crap as titles.
|
|
bool Sections::setFormTableBits ( ) {
|
|
|
|
if ( ! m_alnumPosValid ) { char *xx=NULL;*xx=0; }
|
|
|
|
sec_t sdf = SEC_HAS_DOM|SEC_HAS_MONTH|SEC_HAS_DOW|SEC_HAS_TOD;
|
|
// scan all sentences
|
|
for ( Section *si = m_firstSent ; si ; si = si->m_nextSent ) {
|
|
// breathe
|
|
QUICKPOLL ( m_niceness );
|
|
// must be sentence
|
|
//if ( ! ( si->m_flags & SEC_SENTENCE ) ) continue;
|
|
// int16_tcut
|
|
sentflags_t sf = si->m_sentFlags;
|
|
// . fortable field must not end in period
|
|
// . i.e. "Good For Kids:"
|
|
if ( sf & SENT_PERIOD_ENDS ) continue;
|
|
// get next sent
|
|
Section *next = si->m_nextSent;
|
|
// this stops things..
|
|
if ( ! next ) continue;
|
|
// must not end in period either
|
|
if ( next->m_sentFlags & SENT_PERIOD_ENDS ) continue;
|
|
Section *ps;
|
|
// must be the only sentences in a section
|
|
ps = si->m_parent;
|
|
for ( ; ps ; ps = ps->m_parent ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( ps->contains ( next ) ) break;
|
|
}
|
|
// how does this happen?
|
|
if ( ! ps ) continue;
|
|
// must not contain any other sentences, just these two
|
|
if ( ps->m_alnumPosA != si ->m_alnumPosA )
|
|
continue;
|
|
if ( ps->m_alnumPosB != next->m_alnumPosB )
|
|
continue;
|
|
// get first solid parent tag for "si"
|
|
Section *bs = si->m_parent;
|
|
for ( ; bs ; bs = bs->m_parent ) {
|
|
QUICKPOLL(m_niceness);
|
|
if ( bs->m_tagId == TAG_B ) continue;
|
|
if ( bs->m_tagId == TAG_STRONG ) continue;
|
|
if ( bs->m_tagId == TAG_FONT ) continue;
|
|
if ( bs->m_tagId == TAG_I ) continue;
|
|
break;
|
|
}
|
|
// if none, bail
|
|
if ( ! bs ) continue;
|
|
// get the containing tag then
|
|
nodeid_t tid = bs->m_tagId;
|
|
// must be some kind of field/value indicator
|
|
bool separated = false;
|
|
if ( tid == TAG_DT ) separated = true;
|
|
if ( tid == TAG_TD ) separated = true;
|
|
// if tr or dt tag or whatever contains "next" that is not
|
|
// good, we need next and si to be in their own dt or td
|
|
// section.
|
|
if ( bs->contains ( next ) ) separated = false;
|
|
// fix "Venue Type: Auditorium" for zvents.com kimo theater
|
|
if ( sf & SENT_COLON_ENDS ) separated = true;
|
|
// must be separated by a tag or colon to be field/value pair
|
|
if ( ! separated ) continue;
|
|
// if either one has dates, let is slide.
|
|
// fix "zumba</td><td>10/26/2011" for www.ci.tualatin.or.us
|
|
if ( si ->m_flags & sdf ) continue;
|
|
if ( next->m_flags & sdf ) continue;
|
|
// label them
|
|
si ->m_sentFlags |= SENT_FORMTABLE_FIELD;
|
|
next->m_sentFlags |= SENT_FORMTABLE_VALUE;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// just loop over Addresses::m_sortedPlaces
|
|
void Sections::setAddrXors ( Addresses *aa ) {
	// . for every place in aa->m_sorted[] fold a 32-bit hash of the
	//   place into Section::m_addrXor of every section containing the
	//   place's first word
	// . "aa" must already have its sorted place list set

	// sanity check
	if ( ! aa->m_sortedValid ) { char *xx=NULL;*xx=0; }

	int32_t numPlaces = aa->m_numSorted;
	for ( int32_t n = 0 ; n < numPlaces ; n++ ) {
		// breathe
		QUICKPOLL(m_niceness);
		Place *place = aa->m_sorted[n];
		// prefer the place's address, fall back to its alias
		Address *addr = place->m_address;
		if ( ! addr ) addr = place->m_alias;

		int32_t key;
		if ( ! addr || ( addr->m_flags3 & AF2_LATLON ) ) {
			// no address, or a lat/lon address: just use the
			// Place pointer itself as the hash
			key = (int32_t)(PTRTYPE)place;
		}
		else {
			// combine street, street number and city
			key  = (int32_t)addr->m_street->m_hash;
			key ^= (int32_t)addr->m_street->m_streetNumHash;
			key ^= (int32_t)addr->m_cityId32;
			// sanity check, the city id must be set
			if ( ! addr->m_cityId32 ) {
				char *xx=NULL;*xx=0; }
		}

		// start at the section holding the place's first word and
		// telescope all the way up
		Section *sec = m_sectionPtrs[place->m_a];
		while ( sec ) {
			// breathe
			QUICKPOLL(m_niceness);
			sec->m_addrXor ^= key;
			// do not let the xor cancel out to zero
			if ( ! sec->m_addrXor ) sec->m_addrXor = key;
			sec = sec->m_parent;
		}
	}
}
|
|
|
|
|
|
bool Sections::setTableRowsAndCols ( Section *tableSec ) {

	// . number the <td>/<th> cells that belong directly to the table
	//   section "tableSec" and propagate that to their child sections
	// . for each such cell we set m_rowNum and m_colNum (both start
	//   at 1), m_tableSec, m_headColSection, m_headRowSection,
	//   m_leftCell and m_aboveCell
	// . sets SEC_MULTIDIMS on "tableSec" if the cells that contain
	//   words span more than one row and more than one column
	// . always returns true

	// sizes of the fixed bookkeeping arrays below
	const int32_t kMaxCols     = 100; // headCol[] and above[]
	const int32_t kMaxRows     = 300; // headRow[]
	const int32_t kMaxSpanCols = 300; // rowspanAcc[]

	// 0-based column of the next cell in the current row. reset to 0
	// on every <tr>
	int32_t colCount = -1;
	// number of <tr> tags seen so far
	int32_t rowCount = 0;
	// highest index stored into headCol[] and headRow[] so far
	int32_t maxColCount = -1;
	int32_t maxRowCount = -1;
	Section *headCol[kMaxCols];
	Section *headRow[kMaxRows];
	// per column rowspan bookkeeping: set to a cell's rowspan when the
	// cell lands in that column and decremented each time a later cell
	// probes that column
	int32_t rowspanAcc[kMaxSpanCols];
	for ( int32_t k = 0 ; k < kMaxSpanCols ; k++ ) rowspanAcc[k] = 1;

	// bounding box of the cells that contain words. the initial values
	// are sentinels meaning "no such cell seen yet"
	int32_t maxCol = -1;
	int32_t minCol = 99999;
	int32_t maxRow = -1;
	int32_t minRow = 99999;

	// last cell we numbered
	Section *prev = NULL;
	// last cell we numbered in each (1-based) column
	Section *above[kMaxCols];
	memset ( above , 0 , sizeof(above) );

	// scan sections in the table
	for ( Section *ss = tableSec->m_next ; ss ; ss = ss->m_next ) {
		// breathe
		QUICKPOLL(m_niceness);
		// stop if went outside of table
		if ( ss->m_a >= tableSec->m_b ) break;
		// find the closest containing <table> section
		Section *p = ss->m_parent;
		for ( ; p ; p = p->m_parent ) {
			QUICKPOLL(m_niceness);
			if ( p->m_tagId == TAG_TABLE ) break;
		}
		// must not be within another table in our table
		if ( p != tableSec ) continue;
		// shortcut
		nodeid_t tid = ss->m_tagId;
		// a new row restarts the column count
		if ( tid == TAG_TR ) {
			rowCount++;
			colCount = 0;
			continue;
		}
		// . only cells from here on
		// . fix eviesays.com which has "what" in a <th> tag
		if ( tid != TAG_TD && tid != TAG_TH ) continue;
		// must have had a row
		if ( rowCount <= 0 ) continue;
		// . did a previous cell in this column span multiple rows?
		// . this cell goes into the first column that has
		//   exhausted its rowspan
		for ( ; colCount < kMaxSpanCols ; colCount++ )
			if ( --rowspanAcc[colCount] <= 0 ) break;

		// parse the rowspan attribute out of the cell's tag
		int32_t rslen;
		char *rs = getFieldValue ( m_wptrs[ss->m_a] , // tag
					   m_wlens[ss->m_a] , // tagLen
					   "rowspan" ,
					   &rslen );
		int32_t rowspan = 1;
		if ( rs ) rowspan = atol2(rs,rslen);
		if ( colCount < kMaxSpanCols )
			rowspanAcc[colCount] = rowspan;

		// cells in the first row are the column heads
		if ( rowCount == 1 && colCount < kMaxCols ) {
			headCol[colCount] = ss;
			// record a max since some tables have
			// fewer columns in first row! bad tables!
			maxColCount = colCount;
		}
		// cells in the first column are the row heads
		if ( colCount == 0 && rowCount < kMaxRows ) {
			headRow[rowCount] = ss;
			// same for this
			maxRowCount = rowCount;
		}
		// point the cell at its heads, but only at array slots
		// that have been filled in
		if ( colCount < kMaxCols && colCount <= maxColCount )
			ss->m_headColSection = headCol[colCount];
		if ( rowCount < kMaxRows && rowCount <= maxRowCount )
			ss->m_headRowSection = headRow[rowCount];
		// colCount is 1-based from here to the end of the loop so
		// that a row/col number of 0 means invalid
		colCount++;
		ss->m_colNum   = colCount;
		ss->m_rowNum   = rowCount;
		ss->m_tableSec = tableSec;

		// the previous cell is on our left if it is in our row
		if ( prev && prev->m_rowNum == rowCount )
			ss->m_leftCell = prev;
		// the last cell seen in our column is above us
		if ( colCount < kMaxCols &&
		     rowCount >= 2 &&
		     above[colCount] )
			ss->m_aboveCell = above[colCount];

		// update for row
		prev = ss;
		// update for col
		if ( colCount < kMaxCols ) above[colCount] = ss;

		// m_firstWordPos is negative if the cell has no words.
		// only cells with words grow the bounding box.
		if ( ss->m_firstWordPos >= 0 ) {
			if ( colCount > maxCol ) maxCol = colCount;
			if ( colCount < minCol ) minCol = colCount;
			if ( rowCount > maxRow ) maxRow = rowCount;
			if ( rowCount < minRow ) minRow = rowCount;
		}

		//
		// propagate to all child sections in our section
		//
		int32_t maxb = ss->m_b;
		Section *kid = ss->m_next;
		for ( ; kid && kid->m_b <= maxb ; kid = kid->m_next ) {
			// breathe
			QUICKPOLL(m_niceness);
			// skip if belongs to another table already
			if ( kid->m_tableSec &&
			     tableSec->contains ( kid->m_tableSec) )
				continue;
			// set his junk
			kid->m_colNum         = colCount;
			kid->m_rowNum         = rowCount;
			kid->m_headColSection = ss->m_headColSection;
			kid->m_headRowSection = ss->m_headRowSection;
			kid->m_tableSec       = ss->m_tableSec;
		}
	}

	// . require text in at least two rows and at least two columns
	// . maxRow stays at its -1 sentinel if no cell had any words. we
	//   must check for that, otherwise the sentinels (-1 vs 99999)
	//   compare as different and a table without any text would be
	//   flagged as multi-dimensional.
	// . TODO: allow addlist to combine dates across <td> or <th> tags
	//   provided the table is NOT SEC_MULTIDIMS...
	if ( maxRow >= 0 &&
	     maxRow != minRow &&
	     maxCol != minCol )
		tableSec->m_flags |= SEC_MULTIDIMS;

	return true;
}
|
|
|
|
// . "<table>" section tag is "ts"
|
|
// . set SEC_TABLE_HEADER on contained <td> or <th> cells that represent
|
|
// a header column or row
|
|
bool Sections::setTableHeaderBits ( Section *ts ) {

	// . decide whether row #1 or column #1 of the table "ts" is its
	//   header, set SEC_TABLE_HEADER on those cells and their kids,
	//   then NULL out m_headColSection/m_headRowSection ptrs that do
	//   not point to a header cell
	// . expects m_tableSec, m_rowNum and m_colNum to be set already
	// . always returns true

	// . words that are strong indicators of a header cell
	// . the leading punctuation char is not part of the hash since
	//   only the alnum words get hashed below
	static char *s_tableFields [] = {
		"+who",
		"+what",
		"+event",
		"+title",

		"-genre",
		"-type",
		"-category",
		"-categories",
		"@where",
		"@location",
		"-contact", // contact: john
		"-neighborhood", // yelp uses this field
		"@venue",
		"-instructor",
		"-instructors",
		"-convenor", // graypanthers uses convenor:
		"-convenors", // graypanthers uses convenor:
		"-caller", // square dancing
		"-callers",
		"-call", // phone number
		"-price",
		"-price range",
		"@event location",
	};
	// one-time load of those fields into the static table "s_ft"
	static HashTableX s_ft;
	static char s_ftbuf[2000];
	static bool s_init0 = false;
	if ( ! s_init0 ) {
		s_init0 = true;
		s_ft.set(8,4,128,s_ftbuf,2000,false,m_niceness,"ftbuf");
		const int32_t numFields = (int32_t)
			( sizeof(s_tableFields) / sizeof(s_tableFields[0]) );
		for ( int32_t f = 0 ; f < numFields ; f++ ) {
			char *field = s_tableFields[f];
			// break the field up into words
			Words fw; fw.set3 ( field );
			int64_t *fwids = fw.getWordIds();
			// . the key is the xor of all its word ids
			// . non-alnum words have a zero id so they do not
			//   change the key
			int64_t key = 0;
			for ( int32_t j = 0 ; j < fw.getNumWords() ; j++ )
				key ^= fwids[j];
			if ( ! s_ft.addKey ( &key , &field ) ) {
				char *xx=NULL;*xx=0;}
		}
	}

	//
	// pass 1: let the cells in row 1 and column 1 vote
	//
	int32_t colVotes = 0;
	int32_t rowVotes = 0;
	int32_t maxCol = 0;
	int32_t maxRow = 0;
	for ( Section *cell = ts ; cell ; cell = cell->m_next ) {
		// breathe
		QUICKPOLL ( m_niceness );
		// stop if leaves the table
		if ( cell->m_a >= ts->m_b ) break;
		// skip if not directly in the "ts" table
		if ( cell->m_tableSec != ts ) continue;
		// must be a TD section or TH section
		if ( cell->m_tagId != TAG_TD &&
		     cell->m_tagId != TAG_TH ) continue;
		// track the table dimensions
		if ( cell->m_rowNum > maxRow ) maxRow = cell->m_rowNum;
		if ( cell->m_colNum > maxCol ) maxCol = cell->m_colNum;
		bool inCol1 = ( cell->m_colNum == 1 );
		bool inRow1 = ( cell->m_rowNum == 1 );
		// only the first row and first column vote
		if ( ! inCol1 && ! inRow1 ) continue;
		// and only cells formatted like a heading
		if ( ! ( cell->m_flags & SEC_HEADING_CONTAINER ) ) continue;
		// one vote for the formatting
		if ( inCol1 ) colVotes++;
		if ( inRow1 ) rowVotes++;
		// and a big bonus for each word like "location" or "venue"
		for ( int32_t w = cell->m_a ; w < cell->m_b ; w++ ) {
			// breathe
			QUICKPOLL ( m_niceness );
			if ( ! s_ft.isInTable ( &m_wids[w] ) ) continue;
			if ( inCol1 ) colVotes += 10000;
			if ( inRow1 ) rowVotes += 10000;
		}
	}

	// colWins means col #1 is the table header, rowWins means row #1
	// is the table header. a tie means no header.
	bool colWins = ( colVotes > rowVotes );
	bool rowWins = ( colVotes < rowVotes );

	//
	// pass 2: flag the winning row or column
	//
	// . skipped if there was no winner or if the table has at most
	//   one row AND at most one column
	// . NOTE(review): the original comment here said "one row or col"
	//   but the test has always been an AND. confirm which is meant.
	bool markHeaders = ( rowWins || colWins );
	if ( maxRow <= 1 && maxCol <= 1 ) markHeaders = false;

	for ( Section *cell = ts ; markHeaders && cell ; cell=cell->m_next){
		// breathe
		QUICKPOLL ( m_niceness );
		// stop if leaves the table
		if ( cell->m_a >= ts->m_b ) break;
		// skip if not directly in the "ts" table
		if ( cell->m_tableSec != ts ) continue;
		// must be a TD section or TH section
		if ( cell->m_tagId != TAG_TD &&
		     cell->m_tagId != TAG_TH ) continue;
		// it must be in the winning row or column
		if ( rowWins && cell->m_rowNum != 1 ) continue;
		if ( colWins && cell->m_colNum != 1 ) continue;
		// flag it as a table header
		cell->m_flags |= SEC_TABLE_HEADER;
		// . and flag everything inside of it too
		// . that way the sentence sections themselves carry
		//   SEC_TABLE_HEADER for setSentFlags()
		Section *sub = cell->m_next;
		while ( sub && sub->m_b <= cell->m_b ) {
			// breathe
			QUICKPOLL(m_niceness);
			// only if not claimed by a different table
			bool foreign = ( sub->m_tableSec &&
					 sub->m_tableSec != cell->m_tableSec );
			if ( ! foreign ) sub->m_flags |= SEC_TABLE_HEADER;
			sub = sub->m_next;
		}
	}

	//
	// pass 3: drop head ptrs that do not point to a header cell
	//
	for ( Section *cell = ts ; cell ; cell = cell->m_next ) {
		// breathe
		QUICKPOLL ( m_niceness );
		// stop if leaves the table
		if ( cell->m_a >= ts->m_b ) break;
		// skip if not directly in the "ts" table
		if ( cell->m_tableSec != ts ) continue;
		// must be a TD section or TH section
		if ( cell->m_tagId != TAG_TD &&
		     cell->m_tagId != TAG_TH ) continue;
		// a header cell has no header of its own. we still fall
		// through so the NULLs reach its kids below.
		if ( cell->m_flags & SEC_TABLE_HEADER ) {
			cell->m_headColSection = NULL;
			cell->m_headRowSection = NULL;
		}
		// the column head must have been flagged as a header
		Section *colHead = cell->m_headColSection;
		if ( colHead && ! ( colHead->m_flags & SEC_TABLE_HEADER ) )
			cell->m_headColSection = NULL;
		// same for the row head
		Section *rowHead = cell->m_headRowSection;
		if ( rowHead && ! ( rowHead->m_flags & SEC_TABLE_HEADER ) )
			cell->m_headRowSection = NULL;
		// copy the result to all child sections in our table
		Section *sub = cell->m_next;
		while ( sub && sub->m_b <= cell->m_b ) {
			// breathe
			QUICKPOLL(m_niceness);
			if ( sub->m_tableSec == ts ) {
				sub->m_headColSection =
					cell->m_headColSection;
				sub->m_headRowSection =
					cell->m_headRowSection;
			}
			sub = sub->m_next;
		}
	}

	return true;
}
|
|
|
|
|
|
#include "gb-include.h"
|
|
#include "Threads.h"
|
|
|
|
// the global Sectiondb instance
Sectiondb g_sectiondb;
|
|
// a second global instance. presumably the rebuild/secondary db that
// Sectiondb::init2() is meant for -- TODO confirm against the callers
Sectiondb g_sectiondb2;
|
|
|
|
// reset rdb
|
|
void Sectiondb::reset() {
	// just forward to the underlying rdb
	m_rdb.reset();
}
|
|
|
|
// init our rdb
|
|
bool Sectiondb::init ( ) {
|
|
// . what's max # of tree nodes?
|
|
// . key+4+left+right+parents+dataPtr = sizeof(key128_t)+4 +4+4+4+4
|
|
// . 32 bytes per record when in the tree
|
|
int32_t node = 16+4+4+4+4+4 + sizeof(SectionVote);
|
|
// . assume avg TitleRec size (compressed html doc) is about 1k we get:
|
|
// . NOTE: overhead is about 32 bytes per node
|
|
//int32_t maxTreeNodes = g_conf.m_sectiondbMaxTreeMem / node;
|
|
int32_t maxTreeMem = 200000000;
|
|
int32_t maxTreeNodes = maxTreeMem / node;
|
|
// . we now use a disk page cache for sectiondb as opposed to the
|
|
// old rec cache. i am trying to do away with the Rdb::m_cache rec
|
|
// cache in favor of cleverly used disk page caches, because
|
|
// the rec caches are not real-time and get stale.
|
|
// . just hard-code 5MB for now
|
|
//int32_t pcmem = 5000000; // = g_conf.m_sectiondbMaxDiskPageCacheMem;
|
|
|
|
// do not use for now i think we use posdb and store the 32bit
|
|
// val in the key for facet type stuff
|
|
//pcmem = 0;
|
|
maxTreeMem = 100000;
|
|
maxTreeNodes = 1000;
|
|
|
|
/*
|
|
key128_t k;
|
|
uint32_t sech32 = (uint32_t)rand();
|
|
uint64_t sh64 = (uint64_t)rand() << 32LL | rand();
|
|
uint8_t secType = 23;
|
|
int64_t docId = ((uint64_t)rand() << 32 | rand()) & DOCID_MASK;
|
|
k = makeKey2 ( sech32,
|
|
sh64,
|
|
secType,
|
|
docId,
|
|
false ); // del key?
|
|
if ( g_sectiondb.getSectionHash(&k) != sech32 ){char*xx=NULL;*xx=0;}
|
|
if ( g_sectiondb.getSiteHash(&k) != sh64 ){char*xx=NULL;*xx=0;}
|
|
if ( g_sectiondb.getSectionType(&k) !=secType){char*xx=NULL;*xx=0;}
|
|
if ( g_sectiondb.getDocId(&k) != docId ){char*xx=NULL;*xx=0;}
|
|
*/
|
|
|
|
|
|
// do not use any page cache if doing tmp cluster in order to
|
|
// prevent swapping
|
|
// if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
|
|
// int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
|
|
// // init the page cache
|
|
// if ( ! m_pc.init ( "sectiondb",
|
|
// RDB_SECTIONDB,
|
|
// pcmem ,
|
|
// pageSize ) )
|
|
// return log("db: Sectiondb init failed.");
|
|
|
|
// initialize our own internal rdb
|
|
if ( ! m_rdb.init ( g_hostdb.m_dir ,
|
|
"sectiondb" ,
|
|
true , // dedup same keys?
|
|
sizeof(SectionVote) , // fixed record size
|
|
// avoid excessive merging!
|
|
-1 , // min files 2 merge
|
|
maxTreeMem ,
|
|
maxTreeNodes ,
|
|
true , // balance tree?
|
|
// turn off cache for now because the page cache
|
|
// is just as fast and does not get out of date
|
|
// so bad??
|
|
0 ,
|
|
0 , // maxCacheNodes
|
|
false , // half keys?
|
|
false , // saveCache?
|
|
NULL,//&m_pc , // page cache ptr
|
|
false , // is titledb?
|
|
false , // preloadcache?
|
|
16 ))// keySize
|
|
return false;
|
|
return true;
|
|
}
|
|
|
|
// init the rebuild/secondary rdb, used by PageRepair.cpp
|
|
bool Sectiondb::init2 ( int32_t treeMem ) {
|
|
// . what's max # of tree nodes?
|
|
// . key+4+left+right+parents+dataPtr = sizeof(key128_t)+4 +4+4+4+4
|
|
// . 32 bytes per record when in the tree
|
|
int32_t node = 16+4+4+4+4+4 + sizeof(SectionVote);
|
|
// . NOTE: overhead is about 32 bytes per node
|
|
int32_t maxTreeNodes = treeMem / node;
|
|
// initialize our own internal rdb
|
|
if ( ! m_rdb.init ( g_hostdb.m_dir ,
|
|
"sectiondbRebuild" ,
|
|
true , // dedup same keys?
|
|
sizeof(SectionVote) , // fixed record size
|
|
50 , // MinFilesToMerge
|
|
treeMem ,
|
|
maxTreeNodes ,
|
|
true , // balance tree?
|
|
0 , // MaxCacheMem ,
|
|
0 , // maxCacheNodes
|
|
false , // half keys?
|
|
false , // sectiondbSaveCache
|
|
NULL , // page cache ptr
|
|
false , // is titledb?
|
|
false , // prelaod cache?
|
|
16 ))// keySize
|
|
return false;
|
|
return true;
|
|
}
|
|
/*
|
|
bool Sectiondb::addColl ( char *coll, bool doVerify ) {
|
|
if ( ! m_rdb.addColl ( coll ) ) return false;
|
|
if ( ! doVerify ) return true;
|
|
// verify
|
|
if ( verify(coll) ) return true;
|
|
// if not allowing scale, return false
|
|
if ( ! g_conf.m_allowScale ) return false;
|
|
// otherwise let it go
|
|
log ( "db: sectiondb verify failed, but scaling is allowed, passing.");
|
|
return true;
|
|
}
|
|
*/
|
|
bool Sectiondb::verify ( char *coll ) {
|
|
log ( LOG_INFO, "db: Verifying Sectiondb for coll %s...", coll );
|
|
g_threads.disableThreads();
|
|
|
|
Msg5 msg5;
|
|
Msg5 msg5b;
|
|
RdbList list;
|
|
key128_t startKey;
|
|
key128_t endKey;
|
|
startKey.setMin();
|
|
endKey.setMax();
|
|
CollectionRec *cr = g_collectiondb.getRec(coll);
|
|
|
|
if ( ! msg5.getList ( RDB_SECTIONDB ,
|
|
cr->m_collnum ,
|
|
&list ,
|
|
&startKey ,
|
|
&endKey ,
|
|
1024*1024 , // minRecSizes ,
|
|
true , // includeTree ,
|
|
false , // add to cache?
|
|
0 , // max cache age
|
|
0 , // startFileNum ,
|
|
-1 , // numFiles ,
|
|
NULL , // state
|
|
NULL , // callback
|
|
0 , // niceness
|
|
false , // err correction?
|
|
NULL , // cache key ptr
|
|
0 , // retry num
|
|
-1 , // maxRetries
|
|
true , // compensate for merge
|
|
-1LL , // sync point
|
|
&msg5b ,
|
|
false )) {
|
|
g_threads.enableThreads();
|
|
return log("db: HEY! it did not block");
|
|
}
|
|
|
|
int32_t count = 0;
|
|
int32_t got = 0;
|
|
for ( list.resetListPtr() ; ! list.isExhausted() ;
|
|
list.skipCurrentRecord() ) {
|
|
key128_t k;
|
|
list.getCurrentKey(&k);
|
|
count++;
|
|
uint32_t shardNum = getShardNum ( RDB_SECTIONDB , &k );
|
|
if ( shardNum == getMyShardNum() ) got++;
|
|
}
|
|
if ( got != count ) {
|
|
log ("db: Out of first %"INT32" records in sectiondb, "
|
|
"only %"INT32" belong to our group.",count,got);
|
|
// exit if NONE, we probably got the wrong data
|
|
if ( count > 10 && got == 0 )
|
|
log("db: Are you sure you have the right "
|
|
"data in the right directory? "
|
|
"Exiting.");
|
|
log ( "db: Exiting due to Sectiondb inconsistency." );
|
|
g_threads.enableThreads();
|
|
return g_conf.m_bypassValidation;
|
|
}
|
|
|
|
log ( LOG_INFO, "db: Sectiondb passed verification successfully for "
|
|
"%"INT32" recs.", count );
|
|
// DONE
|
|
g_threads.enableThreads();
|
|
return true;
|
|
}
|
|
|
|
|
|
// List-flag detection is currently switched off: this is a stub that
// does no work and always reports success.
bool Sections::setListFlags ( ) {
	// The unfinished implementation is preserved below for reference
	// only; none of it is compiled.
	/*
	sec_t sdf = SEC_HAS_DOM|SEC_HAS_MONTH|SEC_HAS_DOW|SEC_HAS_TOD;
	// scan all sentences
	for ( Section *si = m_firstSent ; si ; si = si->m_nextSent ) {
		// breathe
		QUICKPOLL ( m_niceness );
		// shortcut
		sentflags_t sf = si->m_sentFlags;
		// . fortable field must not end in period
		// . i.e. "Good For Kids:"
		if ( sf & SENT_PERIOD_ENDS ) continue;
		// get next sent
		Section *next = si->m_nextSent;
		// this stops things..
		if ( ! next ) continue;
		// must not end in period either
		if ( next->m_sentFlags & SENT_PERIOD_ENDS ) continue;
		Section *ps;
	*/
	return true;
}
|
|
|
|
bool Sections::growSections ( ) {
|
|
// make a log note b/c this should not happen a lot because it's slow
|
|
log("build: growing sections!");
|
|
g_errno = EDOCBADSECTIONS;
|
|
return true;
|
|
// record old buf start
|
|
char *oldBuf = m_sectionBuf.getBufStart();
|
|
// grow by 20MB at a time
|
|
if ( ! m_sectionBuf.reserve ( 20000000 ) ) return false;
|
|
// for fixing ptrs:
|
|
char *newBuf = m_sectionBuf.getBufStart();
|
|
// set the new max
|
|
m_maxNumSections = m_sectionBuf.getCapacity() / sizeof(Section);
|
|
// update ptrs in the old sections
|
|
for ( int32_t i = 0 ; i < m_numSections ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
Section *si = &m_sections[i];
|
|
if ( si->m_parent ) {
|
|
char *np = (char *)si->m_parent;
|
|
np = np - oldBuf + newBuf;
|
|
si->m_parent = (Section *)np;
|
|
}
|
|
if ( si->m_next ) {
|
|
char *np = (char *)si->m_next;
|
|
np = np - oldBuf + newBuf;
|
|
si->m_next = (Section *)np;
|
|
}
|
|
if ( si->m_prev ) {
|
|
char *np = (char *)si->m_prev;
|
|
np = np - oldBuf + newBuf;
|
|
si->m_prev = (Section *)np;
|
|
}
|
|
if ( si->m_listContainer ) {
|
|
char *np = (char *)si->m_listContainer;
|
|
np = np - oldBuf + newBuf;
|
|
si->m_listContainer = (Section *)np;
|
|
}
|
|
if ( si->m_prevBrother ) {
|
|
char *np = (char *)si->m_prevBrother;
|
|
np = np - oldBuf + newBuf;
|
|
si->m_prevBrother = (Section *)np;
|
|
}
|
|
if ( si->m_nextBrother ) {
|
|
char *np = (char *)si->m_nextBrother;
|
|
np = np - oldBuf + newBuf;
|
|
si->m_nextBrother = (Section *)np;
|
|
}
|
|
if ( si->m_sentenceSection ) {
|
|
char *np = (char *)si->m_sentenceSection;
|
|
np = np - oldBuf + newBuf;
|
|
si->m_sentenceSection = (Section *)np;
|
|
}
|
|
if ( si->m_prevSent ) {
|
|
char *np = (char *)si->m_prevSent;
|
|
np = np - oldBuf + newBuf;
|
|
si->m_prevSent = (Section *)np;
|
|
}
|
|
if ( si->m_nextSent ) {
|
|
char *np = (char *)si->m_nextSent;
|
|
np = np - oldBuf + newBuf;
|
|
si->m_nextSent = (Section *)np;
|
|
}
|
|
if ( si->m_tableSec ) {
|
|
char *np = (char *)si->m_tableSec;
|
|
np = np - oldBuf + newBuf;
|
|
si->m_tableSec = (Section *)np;
|
|
}
|
|
if ( si->m_headColSection ) {
|
|
char *np = (char *)si->m_headColSection;
|
|
np = np - oldBuf + newBuf;
|
|
si->m_headColSection = (Section *)np;
|
|
}
|
|
if ( si->m_headRowSection ) {
|
|
char *np = (char *)si->m_headRowSection;
|
|
np = np - oldBuf + newBuf;
|
|
si->m_headRowSection = (Section *)np;
|
|
}
|
|
if ( si->m_leftCell ) {
|
|
char *np = (char *)si->m_leftCell;
|
|
np = np - oldBuf + newBuf;
|
|
si->m_leftCell = (Section *)np;
|
|
}
|
|
if ( si->m_aboveCell ) {
|
|
char *np = (char *)si->m_aboveCell;
|
|
np = np - oldBuf + newBuf;
|
|
si->m_aboveCell = (Section *)np;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
|