mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-14 02:36:06 -04:00
Add allowindexpage & allowrootpages criteria to urlblocklist for domain
This commit is contained in:
committed by
Brian Rasmusson
parent
60b5ea11db
commit
3a890f2895
18
UrlBlock.cpp
18
UrlBlock.cpp
@ -11,9 +11,10 @@ urlblocktld_t::urlblocktld_t(const std::string &tlds)
|
||||
, m_tlds(split(tlds, ',')) {
|
||||
}
|
||||
|
||||
urlblockdomain_t::urlblockdomain_t(const std::string &domain, const std::string &allow)
|
||||
urlblockdomain_t::urlblockdomain_t(const std::string &domain, const std::string &allow, pathcriteria_t pathcriteria)
|
||||
: m_domain(domain)
|
||||
, m_allow(split(allow, ',')) {
|
||||
, m_allow(split(allow, ','))
|
||||
, m_pathcriteria(pathcriteria) {
|
||||
}
|
||||
|
||||
urlblockhost_t::urlblockhost_t(const std::string &host, const std::string &path)
|
||||
@ -66,7 +67,18 @@ bool UrlBlock::match(const Url &url) const {
|
||||
if (!m_domain->m_allow.empty()) {
|
||||
auto subDomainLen = (url.getDomain() == url.getHost()) ? 0 : url.getDomain() - url.getHost() - 1;
|
||||
std::string subDomain(url.getHost(), subDomainLen);
|
||||
return (std::find(m_domain->m_allow.cbegin(), m_domain->m_allow.cend(), subDomain) == m_domain->m_allow.cend());
|
||||
bool match = (std::find(m_domain->m_allow.cbegin(), m_domain->m_allow.cend(), subDomain) == m_domain->m_allow.cend());
|
||||
if (!match) {
|
||||
// check for pathcriteria
|
||||
switch (m_domain->m_pathcriteria) {
|
||||
case urlblockdomain_t::pathcriteria_allow_all:
|
||||
return false;
|
||||
case urlblockdomain_t::pathcriteria_allow_index_only:
|
||||
return (url.getPathLen() > 1);
|
||||
case urlblockdomain_t::pathcriteria_allow_rootpages_only:
|
||||
return (url.getPathDepth(false) > 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
|
@ -8,10 +8,17 @@
|
||||
#include "GbRegex.h"
|
||||
|
||||
struct urlblockdomain_t {
|
||||
urlblockdomain_t(const std::string &domain, const std::string &allow);
|
||||
enum pathcriteria_t {
|
||||
pathcriteria_allow_all,
|
||||
pathcriteria_allow_index_only,
|
||||
pathcriteria_allow_rootpages_only
|
||||
};
|
||||
|
||||
urlblockdomain_t(const std::string &domain, const std::string &allow, pathcriteria_t pathcriteria);
|
||||
|
||||
std::string m_domain;
|
||||
std::vector<std::string> m_allow;
|
||||
pathcriteria_t m_pathcriteria;
|
||||
};
|
||||
|
||||
struct urlblockhost_t {
|
||||
|
@ -34,6 +34,26 @@ void UrlBlockList::reload(int /*fd*/, void *state) {
|
||||
urlBlockList->load();
|
||||
}
|
||||
|
||||
// Builds a domain entry from the columns of a "domain" blocklist line and
// appends it to the list.
//
// urlBlockList  list receiving the new entry; dereferenced unconditionally
// col2          domain name
// col3          optional "allow=<comma separated values>" column; when it does
//               not carry that prefix the allow string is left empty
// col4          optional path criteria, "allowindexpage" or "allowrootpages";
//               any other value (including empty) keeps pathcriteria_allow_all
static void parseDomain(urlblocklist_ptr_t *urlBlockList, const std::string &col2, const std::string &col3, const std::string &col4) {
	static const char allowPrefix[] = "allow=";
	static const size_t allowPrefixLen = sizeof(allowPrefix) - 1;

	// keep only the value part after "allow="
	std::string allowStr;
	if (!col3.empty() && starts_with(col3.c_str(), allowPrefix)) {
		allowStr = col3.substr(allowPrefixLen);
	}

	// unrecognised criteria silently fall back to allowing all paths
	urlblockdomain_t::pathcriteria_t pathcriteria = urlblockdomain_t::pathcriteria_allow_all;
	if (col4 == "allowindexpage") {
		pathcriteria = urlblockdomain_t::pathcriteria_allow_index_only;
	} else if (col4 == "allowrootpages") {
		pathcriteria = urlblockdomain_t::pathcriteria_allow_rootpages_only;
	}

	(*urlBlockList)->emplace_back(std::make_shared<urlblockdomain_t>(col2, allowStr, pathcriteria));
}
|
||||
|
||||
bool UrlBlockList::load() {
|
||||
logTrace(g_conf.m_logTraceUrlBlockList, "Loading %s", m_filename);
|
||||
|
||||
@ -70,12 +90,22 @@ bool UrlBlockList::load() {
|
||||
}
|
||||
|
||||
size_t secondColEnd = line.find_first_of(" \t", secondCol);
|
||||
|
||||
size_t thirdCol = line.find_first_not_of(" \t", secondColEnd);
|
||||
size_t thirdColEnd = line.find_first_of(" \t", thirdCol);
|
||||
|
||||
size_t fourthCol = line.find_first_not_of(" \t", thirdColEnd);
|
||||
size_t fourthColEnd = line.find_first_of(" \t", fourthCol);
|
||||
|
||||
std::string col2(line, secondCol, secondColEnd - secondCol);
|
||||
std::string col3;
|
||||
if (thirdCol != std::string::npos) {
|
||||
col3 = std::string(line, thirdCol);
|
||||
col3 = std::string(line, thirdCol, thirdColEnd - thirdCol);
|
||||
}
|
||||
|
||||
std::string col4;
|
||||
if (fourthCol != std::string::npos) {
|
||||
col4 = std::string(line, fourthCol, fourthColEnd - fourthCol);
|
||||
}
|
||||
|
||||
switch (line[0]) {
|
||||
@ -86,12 +116,7 @@ bool UrlBlockList::load() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (starts_with(col3.c_str(), "allow=")) {
|
||||
col3.erase(0, 6);
|
||||
} else {
|
||||
col3.clear();
|
||||
}
|
||||
tmpUrlBlockList->emplace_back(std::shared_ptr<urlblockdomain_t>(new urlblockdomain_t(col2, col3)));
|
||||
parseDomain(&tmpUrlBlockList, col2, col3, col4);
|
||||
break;
|
||||
case 'h':
|
||||
// host
|
||||
|
@ -22,21 +22,25 @@ TEST(UrlBlockListTest, Domain) {
|
||||
TestUrlBlockList urlBlockList("blocklist/domain.txt");
|
||||
urlBlockList.load();
|
||||
|
||||
//regex badsite.com https?://www\.badsite\.com/
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.badsite.com/"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("https://www.badsite.com/"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("httpp://www.badsite.com/"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("https://www.badsite.com/page.html"));
|
||||
|
||||
//regex httponly.com http://www\.httponly\.com/
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.httponly.com/"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.httponly.com/page.html"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("https://www.httponly.com/"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://subdomain.httponly.com/"));
|
||||
|
||||
//regex httpsonly.com https://www\.httpsonly\.com/
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("https://www.httpsonly.com/"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("https://www.httpsonly.com/page.html"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.httpsonly.com/"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("https://subdomain.httpsonly.com/"));
|
||||
|
||||
//domain allsubdomain.com
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allsubdomain.com/"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://sub1.allsubdomain.com/"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://sub2.allsubdomain.com/"));
|
||||
@ -46,34 +50,88 @@ TEST(UrlBlockListTest, Domain) {
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://something.com/www.allsubdomain.com/"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://sub1.diffdomain.com/"));
|
||||
|
||||
//regex onlyroot.com http://www\.onlyroot\.com/$
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.onlyroot.com/"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.onlyroot.com/page.html"));
|
||||
|
||||
//domain example.com allow=,www
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://sub1.sub2.example.com/"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://sub1.example.com/"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.sub1.example.com/"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.example.com/"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://example.com/"));
|
||||
|
||||
//host specific.host.com
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://specific.host.com/"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("https://specific.host.com/"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("https://www.host.com/"));
|
||||
|
||||
//tld my,dk
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("https://specific.host.dk/"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("https://www.host.my/"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("https://www.host.com.my/"));
|
||||
|
||||
//host www.somesite.com /badpath/
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.somesite.com/badpath/"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.somesite.com/badpath/me.html"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.somesite.com/path/me.html"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://sub.somesite.com/badpath/"));
|
||||
|
||||
//regex itsybitsy.com ^https?://(www\.|nursery\.|)itsybitsy\.com/spider/.+
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.itsybitsy.com/spider/"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://itsybitsy.com/spider/waterspout.html"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.itsybitsy.com/spider/waterspout.html"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://nursery.itsybitsy.com/spider/waterspout.html"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://rhyme.itsybitsy.com/spider/waterspout.html"));
|
||||
|
||||
//domain allowrootdomainrootpages.com allow=, allowrootpages
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowrootdomainrootpages.com"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowrootdomainrootpages.com"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowrootdomainrootpages.com/abc.html"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowrootdomainrootpages.com/def.html?param1=value1"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowrootdomainrootpages.com/def.html"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowrootdomainrootpages.com/d1/abc.html"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowrootdomainrootpages.com/d1/def.html"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowrootdomainrootpages.com/d1/d2/xyz.html"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowrootdomainrootpages.com/d1/d2/jkl.html"));
|
||||
|
||||
//domain allowdomainrootpages.com allow=,www allowrootpages
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowdomainrootpages.com"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.allowdomainrootpages.com"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://sub.allowdomainrootpages.com"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowdomainrootpages.com/abc.html"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.allowdomainrootpages.com/def.html"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowdomainrootpages.com/abc.html?param1=value1"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.allowdomainrootpages.com/def.html?param1=value1"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowdomainrootpages.com/d1/abc.html"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowdomainrootpages.com/d1/def.html"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowdomainrootpages.com/d1/d2/xyz.html"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowdomainrootpages.com/d1/d2/jkl.html"));
|
||||
|
||||
//domain allowrootdomainindexpage.com allow=, allowindexpage
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowrootdomainindexpage.com"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowrootdomainindexpage.com"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://sub.allowrootdomainindexpage.com"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowrootdomainindexpage.com/?param=value"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowrootdomainindexpage.com/abc.html"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowrootdomainindexpage.com/def.html"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowrootdomainindexpage.com/d1/abc.html"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowrootdomainindexpage.com/d1/def.html"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowrootdomainindexpage.com/d1/d2/xyz.html"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowrootdomainindexpage.com/d1/d2/jkl.html"));
|
||||
|
||||
//domain allowdomainindexpage.com allow=,www allowindexpage
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowdomainindexpage.com"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.allowdomainindexpage.com"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://sub.allowdomainindexpage.com"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowdomainindexpage.com/?param=value"));
|
||||
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.allowdomainindexpage.com/?param=value"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowdomainindexpage.com/abc.html"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowdomainindexpage.com/def.html"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowdomainindexpage.com/d1/abc.html"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowdomainindexpage.com/d1/def.html"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowdomainindexpage.com/d1/d2/xyz.html"));
|
||||
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowdomainindexpage.com/d1/d2/jkl.html"));
|
||||
}
|
||||
|
||||
TEST(UrlBlockListTest, Path) {
|
||||
|
@ -8,4 +8,8 @@ domain example.com allow=,www
|
||||
host specific.host.com
|
||||
tld my,dk
|
||||
host www.somesite.com /badpath/
|
||||
regex itsybitsy.com ^https?://(www\.|nursery\.|)itsybitsy\.com/spider/.+
|
||||
regex itsybitsy.com ^https?://(www\.|nursery\.|)itsybitsy\.com/spider/.+
|
||||
domain allowrootdomainrootpages.com allow=, allowrootpages
|
||||
domain allowdomainrootpages.com allow=,www allowrootpages
|
||||
domain allowrootdomainindexpage.com allow=, allowindexpage
|
||||
domain allowdomainindexpage.com allow=,www allowindexpage
|
Reference in New Issue
Block a user