262 lines
7.5 KiB
C++
262 lines
7.5 KiB
C++
#ifndef GB_SECTIONS_H
|
|
#define GB_SECTIONS_H
|
|
|
|
#include "SafeBuf.h"
|
|
#include "nodeid_t.h"
|
|
|
|
class TokenizerResult;
|
|
class Bits;
|
|
class Url;
|
|
|
|
|
|
// h: hash value. Typically the lower 32 bits of the Section::m_contentHash64
// vars. We do not need the full 64 bits because we have the 48-bit site hash
// included to reduce collisions substantially.
|
|
|
|
//
|
|
// BEGIN SECTION BIT FLAGS (sec_t)
// values for Section::m_flags, of type sec_t
//

// . these are descriptive flags, they are computed when Sections is set
// . SEC_NOTEXT sections do not vote, i.e. they are not stored in Sectiondb
// . the commented-out SEC_UNUSED entries mark bit positions that are
//   currently not assigned to any flag
// . flags up to SEC_PLAIN_TEXT are plain int constants; SEC_MENU_SENTENCE
//   lies above bit 31 and therefore carries an LL suffix. always keep a
//   set of flags in a sec_t (64 bits), never in a 32-bit integer.
#define SEC_NOTEXT        0x0001 // implies section has no alnum words
//#define SEC_UNUSED      0x0002
//#define SEC_UNUSED      0x0004
#define SEC_SCRIPT        0x0008
#define SEC_STYLE         0x0010
#define SEC_SELECT        0x0020
//#define SEC_UNUSED      0x0040
#define SEC_IN_HEAD       0x0080 // in <head>
#define SEC_IN_TITLE      0x0100 // in title
#define SEC_IN_HEADER     0x0200 // in <hN> tags
#define SEC_IN_IFRAME     0x0400
#define SEC_HIDDEN        0x0800 // <div style="display: none">
//#define SEC_UNUSED      0x1000
#define SEC_FAKE          0x2000 // <hr>/<br>/sentence based faux section
#define SEC_NOSCRIPT      0x4000
//#define SEC_UNUSED      0x8000

#define SEC_MENU          0x00010000
#define SEC_LINK_TEXT     0x00020000
#define SEC_MENU_HEADER   0x00040000
//#define SEC_UNUSED      0x00080000
//#define SEC_UNUSED      0x00100000
#define SEC_HEADING       0x00200000
//#define SEC_UNUSED      0x00400000
//#define SEC_UNUSED      0x00800000
#define SEC_SENTENCE      0x01000000 // made by a sentence?
#define SEC_PLAIN_TEXT    0x02000000
//#define SEC_UNUSED      0x04000000
//#define SEC_UNUSED      0x00008000000LL
//#define SEC_UNUSED      0x00010000000LL
//#define SEC_UNUSED      0x00020000000LL
//#define SEC_UNUSED      0x00040000000LL
//#define SEC_UNUSED      0x00080000000LL

//#define SEC_UNUSED      0x00100000000LL
#define SEC_MENU_SENTENCE 0x00200000000LL
//#define SEC_UNUSED      0x00400000000LL
//#define SEC_UNUSED      0x00800000000LL

// . some random-y numbers for Section::m_baseHash
// . used by splitSection() function
#define BH_BULLET   7845934
#define BH_SENTENCE 4590649
#define BH_IMPLIED  95468323

// union of the script, style, select and iframe flags. judging by the name
// these mark sections whose content should not be indexed -- confirm
// against the users of this macro.
#define NOINDEXFLAGS (SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_IFRAME)

// type of Section::m_flags. 64 bits wide so that flags above bit 31
// (e.g. SEC_MENU_SENTENCE) fit.
typedef int64_t sec_t;
|
|
|
|
class Section {
|
|
public:
|
|
|
|
// . the section immediately containing us
|
|
// . used by Events.cpp to count # of timeofdays in section
|
|
Section *m_parent;
|
|
|
|
// . we are in a linked list of sections
|
|
// . this is how we keep order
|
|
Section *m_next;
|
|
Section *m_prev;
|
|
|
|
// . if we are an element in a list, what is the list container section
|
|
// . a containing section is a section containing MULTIPLE
|
|
// smaller sections
|
|
// . right now we limit such contained elements to text sections only
|
|
// . used to set SEC_HAS_MENUBROTHER flag
|
|
Section *m_listContainer;
|
|
|
|
// the sibling section before/after us. can be NULL.
|
|
Section *m_prevBrother;
|
|
Section *m_nextBrother;
|
|
|
|
// if we are in a bold section in a sentence section then this
|
|
// will point to the sentence section that contains us. if we already
|
|
// are a sentence section then this points to itself.
|
|
Section *m_sentenceSection;
|
|
|
|
// position of the first and last alnum word contained directly OR
|
|
// indirectly in this section. use -1 if no text contained...
|
|
int32_t m_firstWordPos;
|
|
int32_t m_lastWordPos;
|
|
|
|
// alnum positions for words contained directly OR indirectly as well
|
|
int32_t m_alnumPosA;
|
|
int32_t m_alnumPosB;
|
|
|
|
// . for sentences that span multiple sections UNEVENLY
|
|
// . see aliconference.com and abqtango.com for this crazy things
|
|
// . for like 99% of all sections these guys equal m_firstWordPos and
|
|
// m_lastWordPos respectively
|
|
int32_t m_senta;
|
|
int32_t m_sentb;
|
|
|
|
Section *m_nextSentence;
|
|
|
|
// hash of this tag's baseHash and all its parents baseHashes combined
|
|
uint32_t m_tagHash;
|
|
|
|
// for debug output display of color coded nested sections
|
|
uint32_t m_colorHash;
|
|
|
|
// tagid of this section, 0 means none (like sentence section, etc.)
|
|
nodeid_t m_tagId;
|
|
|
|
// usually just the m_tagId, but hashes in the class attributes of
|
|
// div and span tags, etc. to make them unique
|
|
uint32_t m_baseHash;
|
|
|
|
// kinda like m_baseHash but for xml tags and only hashes the
|
|
// tag name and none of the fields
|
|
uint32_t m_xmlNameHash;
|
|
|
|
// hash of all the alnum words DIRECTLY in this section
|
|
uint64_t m_contentHash64;
|
|
|
|
// . range of words in Words class we encompass
|
|
// . m_wordStart and m_wordEnd are the tag word #'s
|
|
// . ACTUALLY it is a half-closed interval [a,b) like all else
|
|
// so m_b-1 is the word # of the ending tag, BUT split sections
|
|
// do not include ending tags!!! (i.e. <hr>, <br>, &bull, etc.)
|
|
// that were made with a call to splitSection()
|
|
int32_t m_a;//wordStart;
|
|
int32_t m_b;//wordEnd;
|
|
|
|
// our depth. # of tags in the hash
|
|
int32_t m_depth;
|
|
|
|
// container for the #define'd SEC_* values above
|
|
sec_t m_flags;
|
|
|
|
char m_used;
|
|
|
|
int32_t m_gbFrameNum;
|
|
|
|
// do we contain section "arg"?
|
|
bool contains(const Section *arg) const {
|
|
return ( m_a <= arg->m_a && m_b >= arg->m_b );
|
|
}
|
|
|
|
// do we contain section "arg"?
|
|
bool strictlyContains(const Section *arg) const {
|
|
if ( m_a < arg->m_a && m_b >= arg->m_b ) return true;
|
|
if ( m_a <= arg->m_a && m_b > arg->m_b ) return true;
|
|
return false;
|
|
}
|
|
};
|
|
|
|
|
|
class Sections {
|
|
public:
|
|
Sections();
|
|
~Sections();
|
|
|
|
void reset();
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . returns true and sets g_errno on error
|
|
// . sets m_sections[] array, 1-1 with words array "w"
|
|
bool set(const TokenizerResult *tr, Bits *bits, const Url *url, uint8_t contentType);
|
|
|
|
private:
|
|
bool verifySections ( ) ;
|
|
|
|
void setNextBrotherPtrs ( bool setContainer ) ;
|
|
|
|
void setNextSentPtrs();
|
|
|
|
static void printFlags(SafeBuf *sbuf , const Section *sn );
|
|
|
|
public:
|
|
bool print(SafeBuf *sbuf, int32_t hiPos, const int32_t *wposVec, const char *densityVec, const char *wordSpamVec, const char *fragVec) const;
|
|
|
|
private:
|
|
struct PrintData {
|
|
SafeBuf *sbuf;
|
|
int32_t hiPos;
|
|
const int32_t *wposVec;
|
|
const char *densityVec;
|
|
const char *wordSpamVec;
|
|
const char *fragVec;
|
|
};
|
|
bool print(PrintData *pd) const;
|
|
bool printSectionDiv(PrintData *pd, const Section *) const;
|
|
|
|
bool isHardSection(const Section *sn) const;
|
|
|
|
bool setMenus ( );
|
|
|
|
void setHeader(int32_t r, Section *first, sec_t flag);
|
|
|
|
bool setHeadingBit ( ) ;
|
|
|
|
void setTagHashes ( ) ;
|
|
|
|
// save it
|
|
const TokenizerResult *m_tr;
|
|
int32_t m_nw; //from m_word->getNumWords()
|
|
Bits *m_bits;
|
|
uint8_t m_contentType;
|
|
|
|
// url ends in .rss or .xml ?
|
|
bool m_isRSSExt;
|
|
|
|
public:
|
|
// these are 1-1 with the TokenizerResult::tokens[] array
|
|
Section **m_sectionPtrs;
|
|
|
|
// allocate m_sections[] buffer
|
|
Section *m_sections;
|
|
int32_t m_numSections;
|
|
private:
|
|
int32_t m_maxNumSections;
|
|
|
|
// this holds the Sections instances in a growable array
|
|
SafeBuf m_sectionBuf;
|
|
|
|
// this holds ptrs to sections 1-1 with words array, so we can
|
|
// see what section a word is in.
|
|
SafeBuf m_sectionPtrBuf;
|
|
|
|
bool addSentenceSections ( ) ;
|
|
|
|
Section *insertSubSection ( int32_t a, int32_t b, int32_t newBaseHash ) ;
|
|
|
|
public:
|
|
Section *m_rootSection; // the first section, aka m_firstSection
|
|
private:
|
|
Section *m_lastSection;
|
|
|
|
Section *m_lastAdded;
|
|
|
|
public:
|
|
// kinda like m_rootSection, the first sentence section that occurs
|
|
// in the document, is NULL iff no sentences in document
|
|
Section *m_firstSentence;
|
|
};
|
|
|
|
#endif // GB_SECTIONS_H
|