Add allowindexpage & allowrootpages criteria to urlblocklist for domain

This commit is contained in:
Ai Lin Chia
2017-07-06 13:17:20 +02:00
committed by Brian Rasmusson
parent 60b5ea11db
commit 3a890f2895
5 changed files with 118 additions and 12 deletions

@ -11,9 +11,10 @@ urlblocktld_t::urlblocktld_t(const std::string &tlds)
, m_tlds(split(tlds, ',')) {
}
urlblockdomain_t::urlblockdomain_t(const std::string &domain, const std::string &allow)
urlblockdomain_t::urlblockdomain_t(const std::string &domain, const std::string &allow, pathcriteria_t pathcriteria)
: m_domain(domain)
, m_allow(split(allow, ',')) {
, m_allow(split(allow, ','))
, m_pathcriteria(pathcriteria) {
}
urlblockhost_t::urlblockhost_t(const std::string &host, const std::string &path)
@ -66,7 +67,18 @@ bool UrlBlock::match(const Url &url) const {
if (!m_domain->m_allow.empty()) {
auto subDomainLen = (url.getDomain() == url.getHost()) ? 0 : url.getDomain() - url.getHost() - 1;
std::string subDomain(url.getHost(), subDomainLen);
return (std::find(m_domain->m_allow.cbegin(), m_domain->m_allow.cend(), subDomain) == m_domain->m_allow.cend());
bool match = (std::find(m_domain->m_allow.cbegin(), m_domain->m_allow.cend(), subDomain) == m_domain->m_allow.cend());
if (!match) {
// check for pathcriteria
switch (m_domain->m_pathcriteria) {
case urlblockdomain_t::pathcriteria_allow_all:
return false;
case urlblockdomain_t::pathcriteria_allow_index_only:
return (url.getPathLen() > 1);
case urlblockdomain_t::pathcriteria_allow_rootpages_only:
return (url.getPathDepth(false) > 0);
}
}
}
return true;

@ -8,10 +8,17 @@
#include "GbRegex.h"
// Data for one "domain" blocklist entry: the domain itself, a list of allowed
// sub-domains, and a path criteria that is consulted by UrlBlock::match only
// when the URL's sub-domain is found in the allow list.
struct urlblockdomain_t {
// NOTE(review): this two-argument form looks like the pre-change signature left
// over from the diff; the only constructor definition visible takes three
// arguments - confirm whether this overload still exists.
urlblockdomain_t(const std::string &domain, const std::string &allow);
// What UrlBlock::match does for a URL whose sub-domain is in the allow list.
enum pathcriteria_t {
// match returns false regardless of path
pathcriteria_allow_all,
// match returns true when url.getPathLen() > 1
pathcriteria_allow_index_only,
// match returns true when url.getPathDepth(false) > 0
pathcriteria_allow_rootpages_only
};
// @param domain        stored as-is in m_domain
// @param allow         comma separated list, split into m_allow
// @param pathcriteria  stored as-is in m_pathcriteria
urlblockdomain_t(const std::string &domain, const std::string &allow, pathcriteria_t pathcriteria);
// domain this entry applies to
std::string m_domain;
// allowed sub-domains; compared against the host part that precedes the domain
std::vector<std::string> m_allow;
// path restriction for allowed sub-domains (see pathcriteria_t)
pathcriteria_t m_pathcriteria;
};
struct urlblockhost_t {

@ -34,6 +34,26 @@ void UrlBlockList::reload(int /*fd*/, void *state) {
urlBlockList->load();
}
/// Build a domain entry from the columns of one blocklist line and append it
/// to the given list.
///
/// @param urlBlockList  list that receives the new entry
/// @param col2          handed to the entry as its domain
/// @param col3          optional "allow=<list>" column; any other content is ignored
/// @param col4          optional path criteria keyword ("allowindexpage" or
///                      "allowrootpages"); anything else keeps the default
static void parseDomain(urlblocklist_ptr_t *urlBlockList, const std::string &col2, const std::string &col3, const std::string &col4) {
	// only the text after the "allow=" prefix is kept; without the prefix the allow list stays empty
	std::string allowList;
	if (!col3.empty() && starts_with(col3.c_str(), "allow=")) {
		allowList = col3.substr(6);
	}

	// missing or unrecognised keyword leaves the default (allow all paths) in place
	urlblockdomain_t::pathcriteria_t criteria = urlblockdomain_t::pathcriteria_allow_all;
	if (col4 == "allowindexpage") {
		criteria = urlblockdomain_t::pathcriteria_allow_index_only;
	} else if (col4 == "allowrootpages") {
		criteria = urlblockdomain_t::pathcriteria_allow_rootpages_only;
	}

	(*urlBlockList)->emplace_back(std::make_shared<urlblockdomain_t>(col2, allowList, criteria));
}
bool UrlBlockList::load() {
logTrace(g_conf.m_logTraceUrlBlockList, "Loading %s", m_filename);
@ -70,12 +90,22 @@ bool UrlBlockList::load() {
}
size_t secondColEnd = line.find_first_of(" \t", secondCol);
size_t thirdCol = line.find_first_not_of(" \t", secondColEnd);
size_t thirdColEnd = line.find_first_of(" \t", thirdCol);
size_t fourthCol = line.find_first_not_of(" \t", thirdColEnd);
size_t fourthColEnd = line.find_first_of(" \t", fourthCol);
std::string col2(line, secondCol, secondColEnd - secondCol);
std::string col3;
if (thirdCol != std::string::npos) {
col3 = std::string(line, thirdCol);
col3 = std::string(line, thirdCol, thirdColEnd - thirdCol);
}
std::string col4;
if (fourthCol != std::string::npos) {
col4 = std::string(line, fourthCol, fourthColEnd - fourthCol);
}
switch (line[0]) {
@ -86,12 +116,7 @@ bool UrlBlockList::load() {
continue;
}
if (starts_with(col3.c_str(), "allow=")) {
col3.erase(0, 6);
} else {
col3.clear();
}
tmpUrlBlockList->emplace_back(std::shared_ptr<urlblockdomain_t>(new urlblockdomain_t(col2, col3)));
parseDomain(&tmpUrlBlockList, col2, col3, col4);
break;
case 'h':
// host

@ -22,21 +22,25 @@ TEST(UrlBlockListTest, Domain) {
TestUrlBlockList urlBlockList("blocklist/domain.txt");
urlBlockList.load();
//regex badsite.com https?://www\.badsite\.com/
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.badsite.com/"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("https://www.badsite.com/"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("httpp://www.badsite.com/"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("https://www.badsite.com/page.html"));
//regex httponly.com http://www\.httponly\.com/
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.httponly.com/"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.httponly.com/page.html"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("https://www.httponly.com/"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://subdomain.httponly.com/"));
//regex httpsonly.com https://www\.httpsonly\.com/
EXPECT_TRUE(urlBlockList.isUrlBlocked("https://www.httpsonly.com/"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("https://www.httpsonly.com/page.html"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.httpsonly.com/"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("https://subdomain.httpsonly.com/"));
//domain allsubdomain.com
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allsubdomain.com/"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://sub1.allsubdomain.com/"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://sub2.allsubdomain.com/"));
@ -46,34 +50,88 @@ TEST(UrlBlockListTest, Domain) {
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://something.com/www.allsubdomain.com/"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://sub1.diffdomain.com/"));
//regex onlyroot.com http://www\.onlyroot\.com/$
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.onlyroot.com/"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.onlyroot.com/page.html"));
//domain example.com allow=,www
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://sub1.sub2.example.com/"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://sub1.example.com/"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.sub1.example.com/"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.example.com/"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://example.com/"));
//host specific.host.com
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://specific.host.com/"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("https://specific.host.com/"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("https://www.host.com/"));
//tld my,dk
EXPECT_TRUE(urlBlockList.isUrlBlocked("https://specific.host.dk/"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("https://www.host.my/"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("https://www.host.com.my/"));
//host www.somesite.com /badpath/
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.somesite.com/badpath/"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.somesite.com/badpath/me.html"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.somesite.com/path/me.html"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://sub.somesite.com/badpath/"));
//regex itsybitsy.com ^https?://(www\.|nursery\.|)itsybitsy\.com/spider/.+
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.itsybitsy.com/spider/"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://itsybitsy.com/spider/waterspout.html"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.itsybitsy.com/spider/waterspout.html"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://nursery.itsybitsy.com/spider/waterspout.html"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://rhyme.itsybitsy.com/spider/waterspout.html"));
//domain allowrootdomainrootpages.com allow=, allowrootpages
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowrootdomainrootpages.com"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowrootdomainrootpages.com"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowrootdomainrootpages.com/abc.html"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowrootdomainrootpages.com/def.html?param1=value1"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowrootdomainrootpages.com/def.html"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowrootdomainrootpages.com/d1/abc.html"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowrootdomainrootpages.com/d1/def.html"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowrootdomainrootpages.com/d1/d2/xyz.html"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowrootdomainrootpages.com/d1/d2/jkl.html"));
//domain allowdomainrootpages.com allow=,www allowrootpages
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowdomainrootpages.com"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.allowdomainrootpages.com"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://sub.allowdomainrootpages.com"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowdomainrootpages.com/abc.html"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.allowdomainrootpages.com/def.html"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowdomainrootpages.com/abc.html?param1=value1"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.allowdomainrootpages.com/def.html?param1=value1"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowdomainrootpages.com/d1/abc.html"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowdomainrootpages.com/d1/def.html"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowdomainrootpages.com/d1/d2/xyz.html"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowdomainrootpages.com/d1/d2/jkl.html"));
//domain allowrootdomainindexpage.com allow=, allowindexpage
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowrootdomainindexpage.com"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowrootdomainindexpage.com"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://sub.allowrootdomainindexpage.com"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowrootdomainindexpage.com/?param=value"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowrootdomainindexpage.com/abc.html"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowrootdomainindexpage.com/def.html"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowrootdomainindexpage.com/d1/abc.html"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowrootdomainindexpage.com/d1/def.html"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowrootdomainindexpage.com/d1/d2/xyz.html"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowrootdomainindexpage.com/d1/d2/jkl.html"));
//domain allowdomainindexpage.com allow=,www allowindexpage
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowdomainindexpage.com"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.allowdomainindexpage.com"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://sub.allowdomainindexpage.com"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://allowdomainindexpage.com/?param=value"));
EXPECT_FALSE(urlBlockList.isUrlBlocked("http://www.allowdomainindexpage.com/?param=value"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowdomainindexpage.com/abc.html"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowdomainindexpage.com/def.html"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowdomainindexpage.com/d1/abc.html"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowdomainindexpage.com/d1/def.html"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://allowdomainindexpage.com/d1/d2/xyz.html"));
EXPECT_TRUE(urlBlockList.isUrlBlocked("http://www.allowdomainindexpage.com/d1/d2/jkl.html"));
}
TEST(UrlBlockListTest, Path) {

@ -8,4 +8,8 @@ domain example.com allow=,www
host specific.host.com
tld my,dk
host www.somesite.com /badpath/
regex itsybitsy.com ^https?://(www\.|nursery\.|)itsybitsy\.com/spider/.+
regex itsybitsy.com ^https?://(www\.|nursery\.|)itsybitsy\.com/spider/.+
domain allowrootdomainrootpages.com allow=, allowrootpages
domain allowdomainrootpages.com allow=,www allowrootpages
domain allowrootdomainindexpage.com allow=, allowindexpage
domain allowdomainindexpage.com allow=,www allowindexpage