mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-17 02:56:07 -04:00
Title.cpp: use enum instead of #defines
This commit is contained in:
135
Title.cpp
135
Title.cpp
@ -122,22 +122,23 @@ bool Title::setTitleFromTags( Xml *xml, int32_t maxTitleLen, uint8_t contentType
|
||||
}
|
||||
|
||||
// types of titles. indicates where they came from.
|
||||
#define TT_LINKTEXTLOCAL 1
|
||||
#define TT_LINKTEXTREMOTE 2
|
||||
#define TT_RSSITEMLOCAL 3
|
||||
#define TT_RSSITEMREMOTE 4
|
||||
#define TT_BOLDTAG 5
|
||||
#define TT_HTAG 6
|
||||
#define TT_TITLETAG 7
|
||||
#define TT_FIRSTLINE 9
|
||||
#define TT_DIVTAG 10
|
||||
#define TT_FONTTAG 11
|
||||
#define TT_ATAG 12
|
||||
#define TT_TDTAG 13
|
||||
#define TT_PTAG 14
|
||||
#define TT_URLPATH 15
|
||||
#define TT_TITLEATT 16
|
||||
|
||||
// Types of titles: indicates where a title candidate came from.
//
// The numeric values are pinned explicitly to the ones the former TT_*
// #defines used. Note that 8 was never assigned there (TT_TITLETAG was 7 and
// TT_FIRSTLINE was 9), so relying on implicit enumeration would silently
// renumber every source from TT_FIRSTLINE onwards.
enum class title_source_t {
	TT_LINKTEXTLOCAL  = 1,   // local inlink text
	TT_LINKTEXTREMOTE = 2,   // remote inlink text
	TT_RSSITEMLOCAL   = 3,   // local rss title
	TT_RSSITEMREMOTE  = 4,   // remote rss title
	TT_BOLDTAG        = 5,   // bold tag
	TT_HTAG           = 6,   // header tag
	TT_TITLETAG       = 7,   // title tag
	// 8 intentionally unused: it was never assigned by the old #defines
	TT_FIRSTLINE      = 9,   // first line in text
	TT_DIVTAG         = 10,  // div tag
	TT_FONTTAG        = 11,  // font tag
	TT_ATAG           = 12,  // anchor tag
	TT_TDTAG          = 13,  // td tag
	TT_PTAG           = 14,  // p tag
	TT_URLPATH        = 15,  // url path
	TT_TITLEATT       = 16,  // title attribute
};
|
||||
#define MAX_TIT_CANDIDATES 100
|
||||
|
||||
// does word qualify as a subtitle delimiter?
|
||||
@ -204,7 +205,7 @@ bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen, const Query
|
||||
int32_t bs[MAX_TIT_CANDIDATES];
|
||||
float scores[MAX_TIT_CANDIDATES];
|
||||
Words *cptrs[MAX_TIT_CANDIDATES];
|
||||
int32_t types[MAX_TIT_CANDIDATES];
|
||||
title_source_t types[MAX_TIT_CANDIDATES];
|
||||
int32_t parent[MAX_TIT_CANDIDATES];
|
||||
|
||||
// record the scoring algos effects
|
||||
@ -280,8 +281,8 @@ bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen, const Query
|
||||
// do not count so high if remote!
|
||||
else scores[n] = 0.80;
|
||||
// set the type
|
||||
if ( sh ) types [n] = TT_LINKTEXTLOCAL;
|
||||
else types [n] = TT_LINKTEXTREMOTE;
|
||||
if ( sh ) types [n] = title_source_t::TT_LINKTEXTLOCAL;
|
||||
else types [n] = title_source_t::TT_LINKTEXTREMOTE;
|
||||
// another candidate
|
||||
n++;
|
||||
// use xml and words
|
||||
@ -317,8 +318,8 @@ bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen, const Query
|
||||
// if not same host, treat like link text
|
||||
else scores[n] = 2.0;
|
||||
// set the type
|
||||
if ( sh ) types [n] = TT_RSSITEMLOCAL;
|
||||
else types [n] = TT_RSSITEMREMOTE;
|
||||
if ( sh ) types [n] = title_source_t::TT_RSSITEMLOCAL;
|
||||
else types [n] = title_source_t::TT_RSSITEMREMOTE;
|
||||
// advance
|
||||
n++;
|
||||
// break out if too many already. save some for below.
|
||||
@ -450,7 +451,7 @@ bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen, const Query
|
||||
as [n] = 0;
|
||||
bs [n] = tw[ti].getNumWords();
|
||||
scores [n] = 3.0; // not ALWAYS solid gold!
|
||||
types [n] = TT_TITLEATT;
|
||||
types [n] = title_source_t::TT_TITLEATT;
|
||||
|
||||
// we are using the words class
|
||||
ti++;
|
||||
@ -636,34 +637,34 @@ bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen, const Query
|
||||
as[n] = start;
|
||||
bs[n] = i;
|
||||
if ( tid == TAG_B ) {
|
||||
types[n] = TT_BOLDTAG;
|
||||
types[n] = title_source_t::TT_BOLDTAG;
|
||||
scores[n] = 1.0;
|
||||
} else if ( tid == TAG_H1 ) {
|
||||
types[n] = TT_HTAG;
|
||||
types[n] = title_source_t::TT_HTAG;
|
||||
scores[n] = 1.8;
|
||||
} else if ( tid == TAG_H2 ) {
|
||||
types[n] = TT_HTAG;
|
||||
types[n] = title_source_t::TT_HTAG;
|
||||
scores[n] = 1.7;
|
||||
} else if ( tid == TAG_H3 ) {
|
||||
types[n] = TT_HTAG;
|
||||
types[n] = title_source_t::TT_HTAG;
|
||||
scores[n] = 1.6;
|
||||
} else if ( tid == TAG_TITLE ) {
|
||||
types[n] = TT_TITLETAG;
|
||||
types[n] = title_source_t::TT_TITLETAG;
|
||||
scores[n] = 3.0;
|
||||
} else if ( tid == TAG_DIV ) {
|
||||
types[n] = TT_DIVTAG;
|
||||
types[n] = title_source_t::TT_DIVTAG;
|
||||
scores[n] = 1.0;
|
||||
} else if ( tid == TAG_TD ) {
|
||||
types[n] = TT_TDTAG;
|
||||
types[n] = title_source_t::TT_TDTAG;
|
||||
scores[n] = 1.0;
|
||||
} else if ( tid == TAG_P ) {
|
||||
types[n] = TT_PTAG;
|
||||
types[n] = title_source_t::TT_PTAG;
|
||||
scores[n] = 1.0;
|
||||
} else if ( tid == TAG_FONT ) {
|
||||
types[n] = TT_FONTTAG;
|
||||
types[n] = title_source_t::TT_FONTTAG;
|
||||
scores[n] = 1.0;
|
||||
} else if ( tid == TAG_A ) {
|
||||
types[n] = TT_ATAG;
|
||||
types[n] = title_source_t::TT_ATAG;
|
||||
// . self link is very powerful BUT
|
||||
// http://www.npr.org/templates/story/story.php?storyId=5417137
|
||||
// doesn't use it right! so use
|
||||
@ -731,7 +732,7 @@ bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen, const Query
|
||||
|
||||
// this is the last resort i guess...
|
||||
scores [n] = 0.5;
|
||||
types [n] = TT_FIRSTLINE;
|
||||
types [n] = title_source_t::TT_FIRSTLINE;
|
||||
as [n] = t0;
|
||||
bs [n] = t1;
|
||||
|
||||
@ -794,7 +795,7 @@ bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen, const Query
|
||||
as [n] = 0;
|
||||
bs [n] = tw[ti].getNumWords();
|
||||
scores [n] = 1.0;
|
||||
types [n] = TT_URLPATH;
|
||||
types [n] = title_source_t::TT_URLPATH;
|
||||
|
||||
// increment since we are using it
|
||||
ti++;
|
||||
@ -1130,8 +1131,8 @@ bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen, const Query
|
||||
// and its kids, subtitles...
|
||||
//
|
||||
// do not compare type X to type Y
|
||||
if ( types[i] == TT_TITLETAG ) {
|
||||
if ( types[j] == TT_TITLETAG ) {
|
||||
if ( types[i] == title_source_t::TT_TITLETAG ) {
|
||||
if ( types[j] == title_source_t::TT_TITLETAG ) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@ -1141,11 +1142,11 @@ bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen, const Query
|
||||
// likewise, a TD to another TD
|
||||
// http://content-uk.cricinfo.com/ausvrsa2008_09/engine/match/351681.html
|
||||
// ... etc.
|
||||
if ( types[i] == TT_BOLDTAG ||
|
||||
types[i] == TT_HTAG ||
|
||||
types[i] == TT_DIVTAG ||
|
||||
types[i] == TT_TDTAG ||
|
||||
types[i] == TT_FONTTAG ) {
|
||||
if ( types[i] == title_source_t::TT_BOLDTAG ||
|
||||
types[i] == title_source_t::TT_HTAG ||
|
||||
types[i] == title_source_t::TT_DIVTAG ||
|
||||
types[i] == title_source_t::TT_TDTAG ||
|
||||
types[i] == title_source_t::TT_FONTTAG ) {
|
||||
if ( types[j] == types[i] ) continue;
|
||||
}
|
||||
// . do not compare one kid to another kid
|
||||
@ -1161,31 +1162,31 @@ bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen, const Query
|
||||
// http://larvatusprodeo.net/2009/01/07/partisanship-politics-and-participation/
|
||||
// i put bold tags back
|
||||
|
||||
if ( types[i] == TT_LINKTEXTLOCAL ) {
|
||||
if ( types[j] == TT_LINKTEXTLOCAL ) continue;
|
||||
if ( types[i] == title_source_t::TT_LINKTEXTLOCAL ) {
|
||||
if ( types[j] == title_source_t::TT_LINKTEXTLOCAL ) continue;
|
||||
}
|
||||
if ( types[i] == TT_RSSITEMLOCAL ) {
|
||||
if ( types[j] == TT_RSSITEMLOCAL ) continue;
|
||||
if ( types[i] == title_source_t::TT_RSSITEMLOCAL ) {
|
||||
if ( types[j] == title_source_t::TT_RSSITEMLOCAL ) continue;
|
||||
}
|
||||
|
||||
// only compare to one local link text for each i
|
||||
if ( types[j] == TT_LINKTEXTLOCAL && localFlag1 ) {
|
||||
if ( types[j] == title_source_t::TT_LINKTEXTLOCAL && localFlag1 ) {
|
||||
continue;
|
||||
}
|
||||
if ( types[j] == TT_RSSITEMLOCAL && localFlag2 ) {
|
||||
if ( types[j] == title_source_t::TT_RSSITEMLOCAL && localFlag2 ) {
|
||||
continue;
|
||||
}
|
||||
if ( types[j] == TT_LINKTEXTLOCAL ) {
|
||||
if ( types[j] == title_source_t::TT_LINKTEXTLOCAL ) {
|
||||
localFlag1 = 1;
|
||||
}
|
||||
if ( types[j] == TT_RSSITEMLOCAL ) {
|
||||
if ( types[j] == title_source_t::TT_RSSITEMLOCAL ) {
|
||||
localFlag2 = 1;
|
||||
}
|
||||
|
||||
// not link title attr to link title attr either
|
||||
// fixes http://www.spiritualwoman.net/?cat=191
|
||||
if ( types[i] == TT_TITLEATT &&
|
||||
types[j] == TT_TITLEATT )
|
||||
if ( types[i] == title_source_t::TT_TITLEATT &&
|
||||
types[j] == title_source_t::TT_TITLEATT )
|
||||
continue;
|
||||
|
||||
// get our words
|
||||
@ -1255,7 +1256,7 @@ bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen, const Query
|
||||
}
|
||||
|
||||
// url paths cannot be titles in and of themselves
|
||||
if ( types[i] == TT_URLPATH ) {
|
||||
if ( types[i] == title_source_t::TT_URLPATH ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -1345,21 +1346,21 @@ bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen, const Query
|
||||
// print out all candidates
|
||||
for ( int32_t i = 0 ; i < n ; i++ ) {
|
||||
char *ts = "unknown";
|
||||
if ( types[i] == TT_LINKTEXTLOCAL ) ts = "local inlink text";
|
||||
if ( types[i] == TT_LINKTEXTREMOTE ) ts = "remote inlink text";
|
||||
if ( types[i] == TT_RSSITEMLOCAL ) ts = "local rss title";
|
||||
if ( types[i] == TT_RSSITEMREMOTE ) ts = "remote rss title";
|
||||
if ( types[i] == TT_BOLDTAG ) ts = "bold tag";
|
||||
if ( types[i] == TT_HTAG ) ts = "header tag";
|
||||
if ( types[i] == TT_TITLETAG ) ts = "title tag";
|
||||
if ( types[i] == TT_FIRSTLINE ) ts = "first line in text";
|
||||
if ( types[i] == TT_FONTTAG ) ts = "font tag";
|
||||
if ( types[i] == TT_ATAG ) ts = "anchor tag";
|
||||
if ( types[i] == TT_DIVTAG ) ts = "div tag";
|
||||
if ( types[i] == TT_TDTAG ) ts = "td tag";
|
||||
if ( types[i] == TT_PTAG ) ts = "p tag";
|
||||
if ( types[i] == TT_URLPATH ) ts = "url path";
|
||||
if ( types[i] == TT_TITLEATT ) ts = "title attribute";
|
||||
if ( types[i] == title_source_t::TT_LINKTEXTLOCAL ) ts = "local inlink text";
|
||||
if ( types[i] == title_source_t::TT_LINKTEXTREMOTE ) ts = "remote inlink text";
|
||||
if ( types[i] == title_source_t::TT_RSSITEMLOCAL ) ts = "local rss title";
|
||||
if ( types[i] == title_source_t::TT_RSSITEMREMOTE ) ts = "remote rss title";
|
||||
if ( types[i] == title_source_t::TT_BOLDTAG ) ts = "bold tag";
|
||||
if ( types[i] == title_source_t::TT_HTAG ) ts = "header tag";
|
||||
if ( types[i] == title_source_t::TT_TITLETAG ) ts = "title tag";
|
||||
if ( types[i] == title_source_t::TT_FIRSTLINE ) ts = "first line in text";
|
||||
if ( types[i] == title_source_t::TT_FONTTAG ) ts = "font tag";
|
||||
if ( types[i] == title_source_t::TT_ATAG ) ts = "anchor tag";
|
||||
if ( types[i] == title_source_t::TT_DIVTAG ) ts = "div tag";
|
||||
if ( types[i] == title_source_t::TT_TDTAG ) ts = "td tag";
|
||||
if ( types[i] == title_source_t::TT_PTAG ) ts = "p tag";
|
||||
if ( types[i] == title_source_t::TT_URLPATH ) ts = "url path";
|
||||
if ( types[i] == title_source_t::TT_TITLEATT ) ts = "title attribute";
|
||||
// get the title
|
||||
pbuf->safePrintf(
|
||||
"<tr>"
|
||||
|
Reference in New Issue
Block a user