mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-14 02:36:06 -04:00
More cleanup in Sections.*
This commit is contained in:
151
Sections.cpp
151
Sections.cpp
@ -39,23 +39,14 @@ void Sections::reset() {
|
||||
m_sectionPtrs = NULL;
|
||||
|
||||
// Coverity
|
||||
m_sbuf = NULL;
|
||||
m_words = NULL;
|
||||
m_url = NULL;
|
||||
m_coll = NULL;
|
||||
m_contentType = 0;
|
||||
m_wposVec = NULL;
|
||||
m_densityVec = NULL;
|
||||
m_wordSpamVec = NULL;
|
||||
m_fragVec = NULL;
|
||||
m_isRSSExt = false;
|
||||
m_titleStart = 0;
|
||||
m_maxNumSections = 0;
|
||||
m_wids = NULL;
|
||||
m_wlens = NULL;
|
||||
m_wptrs = NULL;
|
||||
m_tids = NULL;
|
||||
m_hiPos = 0;
|
||||
}
|
||||
|
||||
Sections::~Sections ( ) {
|
||||
@ -87,7 +78,7 @@ public:
|
||||
// . sets m_sections[] array, 1-1 with words array "w"
|
||||
// . the Weights class can look at these sections and zero out the weights
|
||||
// for words in script, style, select and marquee sections
|
||||
bool Sections::set(const Words *w, Bits *bits, const Url *url, const char *coll, uint8_t contentType ) {
|
||||
bool Sections::set(const Words *w, Bits *bits, const Url *url, uint8_t contentType ) {
|
||||
reset();
|
||||
|
||||
if ( ! w ) return true;
|
||||
@ -101,8 +92,6 @@ bool Sections::set(const Words *w, Bits *bits, const Url *url, const char *coll,
|
||||
// save it
|
||||
m_words = w;
|
||||
m_bits = bits;
|
||||
m_url = url;
|
||||
m_coll = coll;
|
||||
m_contentType = contentType;
|
||||
|
||||
// reset this just in case
|
||||
@ -124,7 +113,7 @@ bool Sections::set(const Words *w, Bits *bits, const Url *url, const char *coll,
|
||||
m_tids = tids;
|
||||
|
||||
m_isRSSExt = false;
|
||||
const char *ext = m_url->getExtension();
|
||||
const char *ext = url->getExtension();
|
||||
if ( ext && strcasecmp(ext,"rss") == 0 ) m_isRSSExt = true;
|
||||
if ( m_contentType == CT_XML ) m_isRSSExt = true;
|
||||
|
||||
@ -186,8 +175,6 @@ bool Sections::set(const Words *w, Bits *bits, const Url *url, const char *coll,
|
||||
// point into it
|
||||
m_sections = (Section *)m_sectionBuf.getBufStart();
|
||||
|
||||
m_titleStart = -1;
|
||||
|
||||
// save this too
|
||||
m_nw = nw;
|
||||
|
||||
@ -1062,11 +1049,6 @@ bool Sections::set(const Words *w, Bits *bits, const Url *url, const char *coll,
|
||||
istack[ni] = si->m_b;
|
||||
iflags[ni] = mf;
|
||||
ni++;
|
||||
|
||||
// title is special
|
||||
if ( tid == TAG_TITLE && m_titleStart == -1 ) {
|
||||
m_titleStart = si->m_a; // i;
|
||||
}
|
||||
}
|
||||
|
||||
// . now we insert sentence sections
|
||||
@ -3075,18 +3057,19 @@ void Sections::setTagHashes ( ) {
|
||||
}
|
||||
|
||||
// make this replace ::print() when it works
|
||||
bool Sections::print( SafeBuf *sbuf, int32_t hiPos, const int32_t *wposVec, const char *densityVec, const char *wordSpamVec, const char *fragVec ) {
|
||||
// save ptrs
|
||||
m_sbuf = sbuf;
|
||||
bool Sections::print( SafeBuf *sbuf, int32_t hiPos, const int32_t *wposVec, const char *densityVec, const char *wordSpamVec, const char *fragVec ) const {
|
||||
PrintData pd;
|
||||
pd.sbuf = sbuf;
|
||||
pd.hiPos = hiPos;
|
||||
pd.wposVec = wposVec;
|
||||
pd.densityVec = densityVec;
|
||||
pd.wordSpamVec = wordSpamVec;
|
||||
pd.fragVec = fragVec;
|
||||
return print(&pd);
|
||||
}
|
||||
|
||||
m_sbuf->setLabel ("sectprnt");
|
||||
|
||||
m_hiPos = hiPos;
|
||||
|
||||
m_wposVec = wposVec;
|
||||
m_densityVec = densityVec;
|
||||
m_wordSpamVec = wordSpamVec;
|
||||
m_fragVec = fragVec;
|
||||
bool Sections::print(PrintData *pd) const {
|
||||
pd->sbuf->setLabel ("sectprnt");
|
||||
|
||||
//verifySections();
|
||||
|
||||
@ -3106,7 +3089,7 @@ bool Sections::print( SafeBuf *sbuf, int32_t hiPos, const int32_t *wposVec, cons
|
||||
// print sections out
|
||||
for ( Section *sk = m_rootSection ; sk ; ) {
|
||||
// print this section
|
||||
printSectionDiv(sk);
|
||||
printSectionDiv(pd,sk);
|
||||
// advance
|
||||
int32_t b = sk->m_b;
|
||||
// stop if last
|
||||
@ -3133,7 +3116,7 @@ bool Sections::print( SafeBuf *sbuf, int32_t hiPos, const int32_t *wposVec, cons
|
||||
"<td><b>evIds</b></td>"
|
||||
"<td><b>text snippet</b></td>"
|
||||
"</tr>\n";
|
||||
sbuf->safePrintf("%s",hdr);
|
||||
pd->sbuf->safePrintf("%s",hdr);
|
||||
|
||||
int32_t rcount = 0;
|
||||
int32_t scount = 0;
|
||||
@ -3143,7 +3126,7 @@ bool Sections::print( SafeBuf *sbuf, int32_t hiPos, const int32_t *wposVec, cons
|
||||
for ( Section *sn = m_rootSection ; sn ; sn = sn->m_next ) {
|
||||
// see if one big table causes a browser slowdown
|
||||
if ( (++rcount % TABLE_ROWS ) == 0 )
|
||||
sbuf->safePrintf("</table>%s\n",hdr);
|
||||
pd->sbuf->safePrintf("</table>%s\n",hdr);
|
||||
const char *xs = "--";
|
||||
char ttt[100];
|
||||
if ( sn->m_contentHash64 ) {
|
||||
@ -3161,7 +3144,7 @@ bool Sections::print( SafeBuf *sbuf, int32_t hiPos, const int32_t *wposVec, cons
|
||||
}
|
||||
|
||||
// print it
|
||||
sbuf->safePrintf("<tr><td>%" PRId32"</td>\n"
|
||||
pd->sbuf->safePrintf("<tr><td>%" PRId32"</td>\n"
|
||||
"<td>%" PRId32"</td>"
|
||||
"<td>%" PRId32"</td>"
|
||||
"<td>0x%" PRIx32"</td>"
|
||||
@ -3184,17 +3167,17 @@ bool Sections::print( SafeBuf *sbuf, int32_t hiPos, const int32_t *wposVec, cons
|
||||
pswn,
|
||||
pewn);
|
||||
// now show the flags
|
||||
printFlags ( sbuf , sn );
|
||||
printFlags ( pd->sbuf , sn );
|
||||
// first few words of section
|
||||
int32_t a = sn->m_a;
|
||||
int32_t b = sn->m_b;
|
||||
// -1 means an unclosed tag!! should no longer be the case
|
||||
if ( b == -1 ) { g_process.shutdownAbort(true); }//b=m_words->m_numWords;
|
||||
sbuf->safePrintf("</nobr></td>");
|
||||
pd->sbuf->safePrintf("</nobr></td>");
|
||||
|
||||
sbuf->safePrintf("<td> </td>");
|
||||
pd->sbuf->safePrintf("<td> </td>");
|
||||
|
||||
sbuf->safePrintf("<td><nobr>");
|
||||
pd->sbuf->safePrintf("<td><nobr>");
|
||||
// 70 chars max
|
||||
int32_t max = 70;
|
||||
int32_t count = 0;
|
||||
@ -3209,34 +3192,34 @@ bool Sections::print( SafeBuf *sbuf, int32_t hiPos, const int32_t *wposVec, cons
|
||||
}
|
||||
count += slen;
|
||||
// boldify front tag
|
||||
if ( i == a ) sbuf->safePrintf("<b>");
|
||||
sbuf->htmlEncode(s,slen,false);
|
||||
if ( i == a ) pd->sbuf->safePrintf("<b>");
|
||||
pd->sbuf->htmlEncode(s,slen,false);
|
||||
// close the bold on the front tag
|
||||
if ( i == a ) sbuf->safePrintf("</b>");
|
||||
if ( i == a ) pd->sbuf->safePrintf("</b>");
|
||||
}
|
||||
// if we truncated print a ...
|
||||
if ( truncated ) sbuf->safePrintf("<b>...</b>");
|
||||
if ( truncated ) pd->sbuf->safePrintf("<b>...</b>");
|
||||
// then print ending tag
|
||||
if ( b < nw ) {
|
||||
int32_t blen = wlens[b-1];
|
||||
if ( blen>20 ) blen = 20;
|
||||
sbuf->safePrintf("<b>");
|
||||
sbuf->htmlEncode(wptrs[b-1],blen,false);
|
||||
sbuf->safePrintf("</b>");
|
||||
pd->sbuf->safePrintf("<b>");
|
||||
pd->sbuf->htmlEncode(wptrs[b-1],blen,false);
|
||||
pd->sbuf->safePrintf("</b>");
|
||||
}
|
||||
|
||||
sbuf->safePrintf("</nobr></td></tr>\n");
|
||||
pd->sbuf->safePrintf("</nobr></td></tr>\n");
|
||||
}
|
||||
|
||||
sbuf->safePrintf("</table>\n<br>\n");
|
||||
pd->sbuf->safePrintf("</table>\n<br>\n");
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Sections::printSectionDiv(const Section *sk) {
|
||||
bool Sections::printSectionDiv(PrintData *pd, const Section *sk) const {
|
||||
// enter a new div section now
|
||||
m_sbuf->safePrintf("<br>");
|
||||
pd->sbuf->safePrintf("<br>");
|
||||
// only make font color different
|
||||
int32_t bcolor = (int32_t)sk->m_colorHash& 0x00ffffff;
|
||||
int32_t fcolor = 0x000000;
|
||||
@ -3254,7 +3237,7 @@ bool Sections::printSectionDiv(const Section *sk) {
|
||||
rcolor = 0x00ffffff;
|
||||
}
|
||||
// start the new div
|
||||
m_sbuf->safePrintf("<div "
|
||||
pd->sbuf->safePrintf("<div "
|
||||
"style=\""
|
||||
"background-color:#%06" PRIx32";"
|
||||
"margin-left:20px;"
|
||||
@ -3272,25 +3255,25 @@ bool Sections::printSectionDiv(const Section *sk) {
|
||||
// print word/tag #i
|
||||
if ( !(sk->m_flags&SEC_FAKE) && sk->m_tagId && printWord )
|
||||
// only encode if it is a tag
|
||||
m_sbuf->htmlEncode(m_wptrs[sk->m_a],m_wlens[sk->m_a],false );
|
||||
pd->sbuf->htmlEncode(m_wptrs[sk->m_a],m_wlens[sk->m_a],false );
|
||||
|
||||
m_sbuf->safePrintf("<i>");
|
||||
pd->sbuf->safePrintf("<i>");
|
||||
|
||||
// print the flags
|
||||
m_sbuf->safePrintf("A=%" PRId32" ",sk->m_a);
|
||||
pd->sbuf->safePrintf("A=%" PRId32" ",sk->m_a);
|
||||
|
||||
// print tag hash now
|
||||
m_sbuf->safePrintf("taghash=%" PRIu32" ",(int32_t)sk->m_tagHash);
|
||||
pd->sbuf->safePrintf("taghash=%" PRIu32" ",(int32_t)sk->m_tagHash);
|
||||
|
||||
if ( sk->m_contentHash64 )
|
||||
m_sbuf->safePrintf("ch64=%" PRIu64" ",sk->m_contentHash64);
|
||||
pd->sbuf->safePrintf("ch64=%" PRIu64" ",sk->m_contentHash64);
|
||||
|
||||
printFlags ( m_sbuf , sk );
|
||||
printFlags ( pd->sbuf , sk );
|
||||
|
||||
if ( isHardSection(sk) )
|
||||
m_sbuf->safePrintf("hardsec ");
|
||||
pd->sbuf->safePrintf("hardsec ");
|
||||
|
||||
m_sbuf->safePrintf("</i>\n");
|
||||
pd->sbuf->safePrintf("</i>\n");
|
||||
|
||||
// now print each word and subsections in this section
|
||||
int32_t a = sk->m_a;
|
||||
@ -3316,7 +3299,7 @@ bool Sections::printSectionDiv(const Section *sk) {
|
||||
// if it belongs to another sections, print that section
|
||||
if ( ws != sk ) {
|
||||
// print out this subsection
|
||||
printSectionDiv(ws);
|
||||
printSectionDiv(pd,ws);
|
||||
// advance to end of that then
|
||||
i = ws->m_b - 1;
|
||||
// and try next word
|
||||
@ -3325,55 +3308,55 @@ bool Sections::printSectionDiv(const Section *sk) {
|
||||
|
||||
// ignore if in style section, etc. just print it out
|
||||
if ( sk->m_flags & NOINDEXFLAGS ) {
|
||||
m_sbuf->htmlEncode(m_wptrs[i],m_wlens[i],false );
|
||||
pd->sbuf->htmlEncode(m_wptrs[i],m_wlens[i],false );
|
||||
continue;
|
||||
}
|
||||
|
||||
// boldify alnum words
|
||||
if ( m_wids[i] ) {
|
||||
if ( m_wposVec[i] == m_hiPos )
|
||||
m_sbuf->safePrintf("<a name=hipos></a>");
|
||||
m_sbuf->safePrintf("<nobr><b>");
|
||||
if ( i < MAXFRAGWORDS && m_fragVec[i] == 0 )
|
||||
m_sbuf->safePrintf("<strike>");
|
||||
if ( pd->wposVec[i] == pd->hiPos )
|
||||
pd->sbuf->safePrintf("<a name=hipos></a>");
|
||||
pd->sbuf->safePrintf("<nobr><b>");
|
||||
if ( i < MAXFRAGWORDS && pd->fragVec[i] == 0 )
|
||||
pd->sbuf->safePrintf("<strike>");
|
||||
}
|
||||
if ( m_wids[i] && m_wposVec[i] == m_hiPos )
|
||||
m_sbuf->safePrintf("<blink style=\""
|
||||
if ( m_wids[i] && pd->wposVec[i] == pd->hiPos )
|
||||
pd->sbuf->safePrintf("<blink style=\""
|
||||
"background-color:yellow;"
|
||||
"color:black;\">");
|
||||
// print that word
|
||||
m_sbuf->htmlEncode(m_wptrs[i],m_wlens[i],false );
|
||||
if ( m_wids[i] && m_wposVec[i] == m_hiPos )
|
||||
m_sbuf->safePrintf("</blink>");
|
||||
pd->sbuf->htmlEncode(m_wptrs[i],m_wlens[i],false );
|
||||
if ( m_wids[i] && pd->wposVec[i] == pd->hiPos )
|
||||
pd->sbuf->safePrintf("</blink>");
|
||||
// close the strike/bold markup opened for alnum words
|
||||
if ( m_wids[i] ) {
|
||||
if ( i < MAXFRAGWORDS && m_fragVec[i] == 0 )
|
||||
m_sbuf->safePrintf("</strike>");
|
||||
m_sbuf->safePrintf("</b>");
|
||||
if ( i < MAXFRAGWORDS && pd->fragVec[i] == 0 )
|
||||
pd->sbuf->safePrintf("</strike>");
|
||||
pd->sbuf->safePrintf("</b>");
|
||||
}
|
||||
// and print out their pos/div/spam sub
|
||||
if ( m_wids[i] ) {
|
||||
m_sbuf->safePrintf("<sub "
|
||||
pd->sbuf->safePrintf("<sub "
|
||||
"style=\"background-color:white;"
|
||||
"font-size:10px;"
|
||||
"border:black 1px solid;"
|
||||
"color:black;\">");
|
||||
m_sbuf->safePrintf("%" PRId32,m_wposVec[i]);
|
||||
if ( m_densityVec[i] != MAXDENSITYRANK )
|
||||
m_sbuf->safePrintf("/<font color=purple><b>%" PRId32
|
||||
pd->sbuf->safePrintf("%" PRId32, pd->wposVec[i]);
|
||||
if ( pd->densityVec[i] != MAXDENSITYRANK )
|
||||
pd->sbuf->safePrintf("/<font color=purple><b>%" PRId32
|
||||
"</b></font>"
|
||||
,
|
||||
(int32_t)m_densityVec[i]);
|
||||
(int32_t)pd->densityVec[i]);
|
||||
|
||||
if ( m_wordSpamVec[i] != MAXWORDSPAMRANK )
|
||||
m_sbuf->safePrintf("/<font color=red><b>%" PRId32
|
||||
if ( pd->wordSpamVec[i] != MAXWORDSPAMRANK )
|
||||
pd->sbuf->safePrintf("/<font color=red><b>%" PRId32
|
||||
"</b></font>"
|
||||
,
|
||||
(int32_t)m_wordSpamVec[i]);
|
||||
m_sbuf->safePrintf("</sub></nobr>");
|
||||
(int32_t)pd->wordSpamVec[i]);
|
||||
pd->sbuf->safePrintf("</sub></nobr>");
|
||||
}
|
||||
}
|
||||
m_sbuf->safePrintf("</div>\n");
|
||||
pd->sbuf->safePrintf("</div>\n");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
37
Sections.h
37
Sections.h
@ -68,7 +68,6 @@ class Url;
|
||||
|
||||
#define NOINDEXFLAGS (SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_IFRAME)
|
||||
|
||||
// the section type (bit flag vector for SEC_*) is currently 64 bits
|
||||
typedef int64_t sec_t;
|
||||
|
||||
class Section {
|
||||
@ -180,7 +179,7 @@ public:
|
||||
// . returns false if blocked, true otherwise
|
||||
// . returns true and sets g_errno on error
|
||||
// . sets m_sections[] array, 1-1 with words array "w"
|
||||
bool set(const Words *w, Bits *bits, const Url *url, const char *coll, uint8_t contentType );
|
||||
bool set(const Words *w, Bits *bits, const Url *url, uint8_t contentType);
|
||||
|
||||
private:
|
||||
bool verifySections ( ) ;
|
||||
@ -193,17 +192,25 @@ private:
|
||||
static void printFlags(SafeBuf *sbuf , const Section *sn );
|
||||
|
||||
public:
|
||||
bool print(SafeBuf *sbuf, int32_t hiPos, const int32_t *wposVec, const char *densityVec, const char *wordSpamVec, const char *fragVec);
|
||||
bool print(SafeBuf *sbuf, int32_t hiPos, const int32_t *wposVec, const char *densityVec, const char *wordSpamVec, const char *fragVec) const;
|
||||
|
||||
private:
|
||||
bool printSectionDiv(const Section *);
|
||||
SafeBuf *m_sbuf;
|
||||
struct PrintData {
|
||||
SafeBuf *sbuf;
|
||||
int32_t hiPos;
|
||||
const int32_t *wposVec;
|
||||
const char *densityVec;
|
||||
const char *wordSpamVec;
|
||||
const char *fragVec;
|
||||
};
|
||||
bool print(PrintData *pd) const;
|
||||
bool printSectionDiv(PrintData *pd, const Section *) const;
|
||||
|
||||
bool isHardSection(const Section *sn) const;
|
||||
|
||||
bool setMenus ( );
|
||||
|
||||
void setHeader ( int32_t r , class Section *first , sec_t flag ) ;
|
||||
void setHeader(int32_t r, Section *first, sec_t flag);
|
||||
|
||||
bool setHeadingBit ( ) ;
|
||||
|
||||
@ -211,31 +218,17 @@ private:
|
||||
|
||||
// save it
|
||||
const Words *m_words;
|
||||
int32_t m_nw; //from m_word->getNumWords()
|
||||
Bits *m_bits;
|
||||
const Url *m_url;
|
||||
const char *m_coll;
|
||||
uint8_t m_contentType;
|
||||
|
||||
const int32_t *m_wposVec;
|
||||
const char *m_densityVec;
|
||||
const char *m_wordSpamVec;
|
||||
const char *m_fragVec;
|
||||
|
||||
// true if the url extension is "rss" or the content type is CT_XML
|
||||
bool m_isRSSExt;
|
||||
|
||||
// word #'s (-1 means invalid)
|
||||
int32_t m_titleStart;
|
||||
|
||||
public:
|
||||
// these are 1-1 with the Words::m_words[] array
|
||||
Section **m_sectionPtrs;
|
||||
|
||||
private:
|
||||
// save this too
|
||||
int32_t m_nw ;
|
||||
|
||||
public:
|
||||
// allocate m_sections[] buffer
|
||||
Section *m_sections;
|
||||
int32_t m_numSections;
|
||||
@ -254,8 +247,6 @@ private:
|
||||
const char * const *m_wptrs;
|
||||
const nodeid_t *m_tids;
|
||||
|
||||
int32_t m_hiPos;
|
||||
|
||||
bool addSentenceSections ( ) ;
|
||||
|
||||
Section *insertSubSection ( int32_t a, int32_t b, int32_t newBaseHash ) ;
|
||||
|
@ -4027,7 +4027,7 @@ Sections *XmlDoc::getSections ( ) {
|
||||
// this uses the sectionsReply to see which sections are "text", etc.
|
||||
// rather than compute it expensively
|
||||
if ( !m_calledSections &&
|
||||
!m_sections.set( &m_words, bits, getFirstUrl(), cr->m_coll, *ct ) ) {
|
||||
!m_sections.set( &m_words, bits, getFirstUrl(), *ct ) ) {
|
||||
m_calledSections = true;
|
||||
// it blocked, return -1
|
||||
return (Sections *) -1;
|
||||
|
2
main.cpp
2
main.cpp
@ -4121,7 +4121,7 @@ static bool parseTest(const char *coll, int64_t docId, const char *query) {
|
||||
t = gettimeofdayInMilliseconds();
|
||||
for ( int32_t i = 0 ; i < 100 ; i++ )
|
||||
// do not supply xd so it will be set from scratch
|
||||
if ( !sections.set( &words, &bits, NULL, NULL, 0 ) ) {
|
||||
if ( !sections.set( &words, &bits, NULL, 0 ) ) {
|
||||
log(LOG_WARN, "build: speedtestxml: sections set: %s", mstrerror(g_errno));
|
||||
return false;
|
||||
}
|
||||
|
Reference in New Issue
Block a user