Add logic for matchpartial, matchsuffix, matchprefix to host, domain, path

This commit is contained in:
Ai Lin Chia 2018-07-12 13:40:43 +02:00
parent 2fb8ca2337
commit 9bdafc231e
7 changed files with 139 additions and 99 deletions

@ -7,9 +7,23 @@
#include "UrlParser.h"
#include <algorithm>
urlmatchstr_t::urlmatchstr_t(urlmatchtype_t type, const std::string &str)
// Builds a string matcher from the textual match criteria token.
// "matchprefix", "matchsuffix" and "matchpartial" select the corresponding
// criteria; any other value (including an empty string) leaves the matcher
// at exact matching.
urlmatchstr_t::urlmatchstr_t(urlmatchtype_t type, const std::string &str, const std::string &match_criteria)
: m_type(type)
, m_str(str) {
, m_str(str)
// start from exact; only overridden below when a known token is given
, m_matchcriteria(matchcriteria_exact) {
if (match_criteria.compare("matchprefix") == 0) {
m_matchcriteria = matchcriteria_prefix;
} else if (match_criteria.compare("matchsuffix") == 0) {
m_matchcriteria = matchcriteria_suffix;
} else if (match_criteria.compare("matchpartial") == 0) {
m_matchcriteria = matchcriteria_partial;
}
}
urlmatchstr_t::urlmatchstr_t(urlmatchtype_t type, const std::string &str, matchcriteria_t matchcriteria)
: m_type(type)
, m_str(str)
, m_matchcriteria(matchcriteria) {
}
urlmatchset_t::urlmatchset_t(urlmatchtype_t type, const std::string &str)
@ -30,9 +44,9 @@ urlmatchpathcriteria_t::urlmatchpathcriteria_t(const std::string &str)
: m_str(str)
, m_pathcriteria(pathcriteria_all) {
if (str.compare("indexpage") == 0) {
m_pathcriteria = urlmatchpathcriteria_t::pathcriteria_index_only;
m_pathcriteria = pathcriteria_index_only;
} else if (str.compare("rootpages") == 0) {
m_pathcriteria = urlmatchpathcriteria_t::pathcriteria_rootpages_only;
m_pathcriteria = pathcriteria_rootpages_only;
}
}
@ -76,21 +90,25 @@ urlmatchtype_t UrlMatch::getType() const {
}
// Returns the domain this matcher is tied to, or an empty string when none
// can be derived. Only exact and suffix criteria yield a domain: a domain
// matcher returns its own string, a host matcher parses its string as a Url
// and returns that Url's domain part.
std::string UrlMatch::getDomain() const {
switch (m_type) {
case url_match_domain:
return m_str->m_str;
case url_match_host:
case url_match_hostsuffix: {
Url url;
url.set(m_str->m_str.c_str());
return std::string(url.getDomain(), url.getDomainLen());
// NOTE(review): m_str is dereferenced before m_type is inspected — presumably
// callers only invoke this on domain/host matchers (the visible caller in
// UrlMatchList::load filters on those types); confirm for other call sites.
if (m_str->m_matchcriteria == urlmatchstr_t::matchcriteria_exact || m_str->m_matchcriteria == urlmatchstr_t::matchcriteria_suffix) {
switch (m_type) {
case url_match_domain:
return m_str->m_str;
case url_match_host: {
Url url;
url.set(m_str->m_str.c_str());
return std::string(url.getDomain(), url.getDomainLen());
}
default:
// do nothing
break;
}
default:
return "";
}
// prefix/partial criteria, or a type that carries no domain
return "";
}
static bool matchString(const std::string &needle, const char *haystack, int32_t haystackLen) {
// Returns true only when haystack (haystackLen bytes) has the same length as
// needle and is byte-for-byte identical to it.
static bool matchStringExact(const std::string &needle, const char *haystack, int32_t haystackLen) {
	const size_t needleLen = needle.length();

	// different lengths can never be an exact match
	if (needleLen != static_cast<size_t>(haystackLen)) {
		return false;
	}

	return memcmp(needle.c_str(), haystack, needleLen) == 0;
}
@ -98,8 +116,42 @@ static bool matchStringPrefix(const std::string &needle, const char *haystack, i
return ((needle.length() <= static_cast<size_t>(haystackLen)) && memcmp(needle.c_str(), haystack, needle.length()) == 0);
}
static bool matchStringSuffix(const std::string &needle, const char *haystack, int32_t haystackLen) {
return ((needle.length() <= static_cast<size_t>(haystackLen)) && memcmp(needle.c_str(), haystack + haystackLen - needle.length(), needle.length()) == 0);
// Returns true when haystack (haystackLen bytes) ends with needle.
//
// When segmentSeparator is '\0' any byte-wise suffix is accepted. Otherwise
// the suffix must additionally start on a segment boundary, i.e. one of:
//   - needle covers the whole haystack,
//   - needle itself begins with the separator,
//   - the haystack byte right before the matched suffix is the separator.
//
// A negative haystackLen never matches (it would otherwise be converted to a
// huge unsigned value and lead to reading before the start of haystack).
static bool matchStringSuffix(const std::string &needle, const char *haystack, int32_t haystackLen, char segmentSeparator) {
	if (haystackLen < 0) {
		return false;
	}

	const size_t needleLen = needle.length();
	const size_t hayLen = static_cast<size_t>(haystackLen);
	if (needleLen > hayLen) {
		return false;
	}

	// position in haystack where the candidate suffix starts
	const size_t offset = hayLen - needleLen;
	if (memcmp(needle.c_str(), haystack + offset, needleLen) != 0) {
		return false;
	}

	if (segmentSeparator == '\0') {
		// plain suffix match, no segment boundary required
		return true;
	}

	// we need a full segment match
	if (offset == 0) {
		// needle is the whole haystack
		return true;
	}

	if (needle[0] == segmentSeparator) {
		// needle starts with the segment separator, so it is already anchored
		return true;
	}

	// needle doesn't start with the separator; require one right before it.
	// offset > 0 here, so offset - 1 is a valid index.
	return haystack[offset - 1] == segmentSeparator;
}
// Reports whether strncasestr() locates needle inside haystack, passing
// haystackLen as the helper's length argument.
// NOTE(review): presumably a case-insensitive substring search, judging by
// the helper's name — confirm against strncasestr's definition.
static bool matchStringPartial(const std::string &needle, const char *haystack, int32_t haystackLen) {
	const char *found = strncasestr(haystack, needle.c_str(), haystackLen);
	return found != nullptr;
}
// Matches haystack (haystackLen bytes) against matchstr->m_str using the
// strategy selected by matchstr->m_matchcriteria.
//
// segmentSeparator is only forwarded to suffix matching; the exact, prefix
// and partial matchers do not receive it. The default '\0' is passed through
// unchanged to matchStringSuffix.
//
// matchstr is taken by const reference: this function only reads through it,
// so there is no need to copy the shared_ptr (and bump its reference count)
// on every call. matchstr must not be null.
static bool matchString(const std::shared_ptr<urlmatchstr_t> &matchstr, const char *haystack, int32_t haystackLen, char segmentSeparator = '\0') {
	switch (matchstr->m_matchcriteria) {
		case urlmatchstr_t::matchcriteria_exact:
			return matchStringExact(matchstr->m_str, haystack, haystackLen);
		case urlmatchstr_t::matchcriteria_prefix:
			return matchStringPrefix(matchstr->m_str, haystack, haystackLen);
		case urlmatchstr_t::matchcriteria_suffix:
			return matchStringSuffix(matchstr->m_str, haystack, haystackLen, segmentSeparator);
		case urlmatchstr_t::matchcriteria_partial:
			return matchStringPartial(matchstr->m_str, haystack, haystackLen);
	}

	// only reachable if m_matchcriteria holds a value outside the enum
	return false;
}
bool UrlMatch::match(const Url &url, const UrlParser &urlParser) const {
@ -107,27 +159,15 @@ bool UrlMatch::match(const Url &url, const UrlParser &urlParser) const {
switch (m_type) {
case url_match_domain:
return m_invert ^ matchString(m_str->m_str, url.getDomain(), url.getDomainLen());
return m_invert ^ matchString(m_str, url.getDomain(), url.getDomainLen(), '.');
case url_match_extension:
return m_invert ^ matchString(m_str->m_str, url.getExtension(), url.getExtensionLen());
return m_invert ^ matchString(m_str, url.getExtension(), url.getExtensionLen());
case url_match_file:
return m_invert ^ matchString(m_str->m_str, url.getFilename(), url.getFilenameLen());
return m_invert ^ matchString(m_str, url.getFilename(), url.getFilenameLen());
case url_match_host:
return m_invert ^ matchString(m_str->m_str, url.getHost(), url.getHostLen());
case url_match_hostsuffix:
if (matchStringSuffix(m_str->m_str, url.getHost(), url.getHostLen())) {
// full match
if ((m_str->m_str.length() == static_cast<size_t>(url.getHostLen())) ||
// hostsuffix starts with a dot
(m_str->m_str[0] == '.') ||
// hostsuffix doesn't start with a dot, but we always want a full segment match
(url.getHost()[url.getHostLen() - m_str->m_str.length() - 1] == '.')) {
return m_invert ^ true;
}
}
return false;
return m_invert ^ matchString(m_str, url.getHost(), url.getHostLen(), '.');
case url_match_middomain:
return m_invert ^ matchString(m_str->m_str, url.getMidDomain(), url.getMidDomainLen());
return m_invert ^ matchString(m_str, url.getMidDomain(), url.getMidDomainLen(), '.');
case url_match_queryparam:
if (strncasestr(url.getQuery(), m_param->m_name.c_str(), url.getQueryLen()) != nullptr) {
// not the most efficient, but there is already parsing logic for query parameter in UrlParser
@ -137,14 +177,14 @@ bool UrlMatch::match(const Url &url, const UrlParser &urlParser) const {
}
for (auto &queryMatch : queryMatches) {
if (matchString(m_param->m_value, queryMatch->getValue(), queryMatch->getValueLen())) {
if (matchStringExact(m_param->m_value, queryMatch->getValue(), queryMatch->getValueLen())) {
return m_invert ^ true;
}
}
}
break;
case url_match_path:
return m_invert ^ matchStringPrefix(m_str->m_str, url.getPath(), url.getPathLenWithCgi());
return m_invert ^ matchString(m_str, url.getPath(), url.getPathLenWithCgi(), '/');
case url_match_pathcriteria:
// check for pathcriteria
switch (m_pathcriteria->m_pathcriteria) {
@ -165,22 +205,22 @@ bool UrlMatch::match(const Url &url, const UrlParser &urlParser) const {
}
for (auto &pathParamMatch : pathParamMatches) {
if (matchString(m_param->m_value, pathParamMatch->getValue(), pathParamMatch->getValueLen())) {
if (matchStringExact(m_param->m_value, pathParamMatch->getValue(), pathParamMatch->getValueLen())) {
return m_invert ^ true;
}
}
}
break;
case url_match_pathpartial:
return m_invert ^ (strncasestr(url.getPath(), m_str->m_str.c_str(), url.getPathLen()) != nullptr);
return m_invert ^ matchStringPartial(m_str->m_str, url.getPath(), url.getPathLen());
case url_match_port: {
std::string port = std::to_string(url.getPort());
return m_invert ^ matchString(m_str->m_str, port.c_str(), port.length());
return m_invert ^ matchStringExact(m_str->m_str, port.c_str(), port.length());
}
case url_match_regex:
return m_invert ^ m_regex->m_regex.match(url.getUrl());
case url_match_scheme:
return m_invert ^ matchString(m_str->m_str, url.getScheme(), url.getSchemeLen());
return m_invert ^ matchStringExact(m_str->m_str, url.getScheme(), url.getSchemeLen());
case url_match_subdomain: {
auto subDomainLen = (url.getDomain() == url.getHost()) ? 0 : url.getDomain() - url.getHost() - 1;
std::string subDomain(url.getHost(), subDomainLen);
@ -217,10 +257,6 @@ void UrlMatch::log(const Url &url, const char **type, const char **value) const
*type = "host";
*value = m_str->m_str.c_str();
break;
case url_match_hostsuffix:
*type = "hostsuffix";
*value = m_str->m_str.c_str();
break;
case url_match_middomain:
*type = "middomain";
*value = m_str->m_str.c_str();

@ -13,7 +13,6 @@ enum urlmatchtype_t {
url_match_domain,
url_match_file,
url_match_host,
url_match_hostsuffix,
url_match_middomain,
url_match_path,
url_match_pathcriteria,
@ -28,10 +27,19 @@ enum urlmatchtype_t {
};
// String-based matcher data: the match type, the string to compare against
// and how that string is compared with the URL component.
struct urlmatchstr_t {
urlmatchstr_t(urlmatchtype_t type, const std::string &str);
// How m_str is compared with the URL component being tested.
enum matchcriteria_t {
matchcriteria_exact,
matchcriteria_prefix,
matchcriteria_suffix,
matchcriteria_partial
};
// match_criteria is the textual token ("matchprefix", "matchsuffix",
// "matchpartial"); the constructor falls back to exact for anything else.
urlmatchstr_t(urlmatchtype_t type, const std::string &str, const std::string &match_criteria);
// Same, with the criteria already given as an enum value.
urlmatchstr_t(urlmatchtype_t type, const std::string &str, matchcriteria_t matchcriteria);
urlmatchtype_t m_type;
std::string m_str;
matchcriteria_t m_matchcriteria;
};
struct urlmatchset_t {

@ -115,6 +115,25 @@ static bool parseMatchSet(urlmatches_t *urlMatches, urlmatchtype_t type, const s
return true;
}
// Parses a "<type> <string> [<criteria>]" line into a string matcher and
// appends it to urlMatches.
//
// tokens[0] is the type keyword (already consumed by the caller), tokens[1]
// the string to match and the optional tokens[2] the textual match criteria.
// When the criteria token is absent, default_matchcriteria is used. When it
// is present it is handed to urlmatchstr_t as-is; that constructor treats any
// token other than "matchprefix"/"matchsuffix"/"matchpartial" as exact, so an
// unrecognised token is not reported as an error here.
//
// Returns false (and appends nothing) unless there are exactly 2 or 3 tokens.
static bool parseMatchStrWithCriteria(urlmatches_t *urlMatches, urlmatchtype_t type, const std::vector<std::string> &tokens,
                                      bool invert, urlmatchstr_t::matchcriteria_t default_matchcriteria = urlmatchstr_t::matchcriteria_exact) {
	// validate
	if (tokens.size() < 2 || tokens.size() > 3) {
		return false;
	}

	const std::string &str = tokens[1];

	if (tokens.size() == 2) {
		// no explicit criteria on the line
		urlMatches->emplace_back(std::make_shared<urlmatchstr_t>(type, str, default_matchcriteria), invert);
	} else {
		const std::string &match_criteria = tokens[2];
		urlMatches->emplace_back(std::make_shared<urlmatchstr_t>(type, str, match_criteria), invert);
	}

	return true;
}
static bool parseMatchStr(urlmatches_t *urlMatches, urlmatchtype_t type, const std::vector<std::string> &tokens, bool invert) {
// validate
if (tokens.size() != 2) {
@ -122,21 +141,8 @@ static bool parseMatchStr(urlmatches_t *urlMatches, urlmatchtype_t type, const s
}
const std::string &str = tokens[1];
urlMatches->emplace_back(std::shared_ptr<urlmatchstr_t>(new urlmatchstr_t(type, str)), invert);
return true;
}
static bool parseHostSuffix(urlmatches_t *urlMatches, const std::vector<std::string> &tokens, bool invert) {
// validate
if (tokens.size() != 2) {
return false;
}
const std::string &host = tokens[1];
auto matcher = std::shared_ptr<urlmatchstr_t>(new urlmatchstr_t(url_match_hostsuffix, host));
urlMatches->emplace_back(matcher, invert);
urlMatches->emplace_back(std::shared_ptr<urlmatchstr_t>(new urlmatchstr_t(type, str, "")), invert);
return true;
}
@ -253,7 +259,7 @@ bool UrlMatchList::load() {
case 'd':
// domain
if (type.compare("domain") == 0) {
if (!parseMatchStr(&urlMatches, url_match_domain, tokens, invert)) {
if (!parseMatchStrWithCriteria(&urlMatches, url_match_domain, tokens, invert)) {
logError("Invalid '%s' line found. Ignoring line='%s'", type.c_str(), line.c_str());
foundInvalid = true;
continue;
@ -295,13 +301,7 @@ bool UrlMatchList::load() {
case 'h':
// host
if (type.compare("host") == 0) {
if (!parseMatchStr(&urlMatches, url_match_host, tokens, invert)) {
logError("Invalid '%s' line found. Ignoring line='%s'", type.c_str(), line.c_str());
foundInvalid = true;
continue;
}
} else if (type.compare("hostsuffix") == 0) {
if (!parseHostSuffix(&urlMatches, tokens, invert)) {
if (!parseMatchStrWithCriteria(&urlMatches, url_match_host, tokens, invert)) {
logError("Invalid '%s' line found. Ignoring line='%s'", type.c_str(), line.c_str());
foundInvalid = true;
continue;
@ -336,7 +336,7 @@ bool UrlMatchList::load() {
}
} else if (type.compare("path") == 0) {
// path
if (!parseMatchStr(&urlMatches, url_match_path, tokens, invert)) {
if (!parseMatchStrWithCriteria(&urlMatches, url_match_path, tokens, invert, urlmatchstr_t::matchcriteria_prefix)) {
logError("Invalid '%s' line found. Ignoring line='%s'", type.c_str(), line.c_str());
foundInvalid = true;
continue;
@ -354,13 +354,6 @@ bool UrlMatchList::load() {
foundInvalid = true;
continue;
}
} else if (type.compare("pathpartial") == 0) {
// pathpartial
if (!parseMatchStr(&urlMatches, url_match_pathpartial, tokens, invert)) {
logError("Invalid '%s' line found. Ignoring line='%s'", type.c_str(), line.c_str());
foundInvalid = true;
continue;
}
} else if (type.compare("port") == 0) {
// port
if (!parseMatchStr(&urlMatches, url_match_port, tokens, invert)) {
@ -453,15 +446,17 @@ bool UrlMatchList::load() {
auto func = [](const UrlMatch &match) {
return match.getType() == url_match_domain ||
match.getType() == url_match_host ||
match.getType() == url_match_hostsuffix;
match.getType() == url_match_host;
};
auto it = std::find_if(urlMatches.begin(), urlMatches.end(), func);
if (it != urlMatches.end()) {
auto &list = tmpUrlMatchList->m_domainUrlMatchesList[it->getDomain()];
list.emplace_back(urlMatches);
continue;
std::string domain = it->getDomain();
if (!domain.empty()) {
auto &list = tmpUrlMatchList->m_domainUrlMatchesList[domain];
list.emplace_back(urlMatches);
continue;
}
}
tmpUrlMatchList->m_urlMatchesList.emplace_back(urlMatches);

@ -232,46 +232,46 @@ TEST(UrlMatchListTest, DomainTld) {
EXPECT_TRUE(urlMatchList.isUrlMatched("https://www.host.com.my/"));
}
TEST(UrlMatchListTest, DomainHostSuffix) {
TestUrlMatchList urlMatchList("blocklist/domain.txt");
TEST(UrlMatchListTest, HostHostSuffix) {
TestUrlMatchList urlMatchList("blocklist/host.txt");
urlMatchList.load();
//hostsuffix hostsuffix01.com
//host hostsuffix01.com matchsuffix
EXPECT_TRUE(urlMatchList.isUrlMatched("http://sub.hostsuffix01.com"));
EXPECT_TRUE(urlMatchList.isUrlMatched("http://sub1.sub.hostsuffix01.com"));
EXPECT_TRUE(urlMatchList.isUrlMatched("http://hostsuffix01.com"));
EXPECT_FALSE(urlMatchList.isUrlMatched("http://bhostsuffix01.com"));
EXPECT_FALSE(urlMatchList.isUrlMatched("http://jostsuffix01.com"));
//hostsuffix .hostsuffix02.com
//host .hostsuffix02.com matchsuffix
EXPECT_TRUE(urlMatchList.isUrlMatched("http://sub.hostsuffix02.com"));
EXPECT_TRUE(urlMatchList.isUrlMatched("http://sub1.sub.hostsuffix02.com"));
EXPECT_FALSE(urlMatchList.isUrlMatched("http://hostsuffix02.com"));
EXPECT_FALSE(urlMatchList.isUrlMatched("http://bhostsuffix02.com"));
EXPECT_FALSE(urlMatchList.isUrlMatched("http://jostsuffix02.com"));
//hostsuffix hostsuffix03.co.uk
//host hostsuffix03.co.uk matchsuffix
EXPECT_TRUE(urlMatchList.isUrlMatched("http://sub.hostsuffix03.co.uk"));
EXPECT_TRUE(urlMatchList.isUrlMatched("http://sub1.sub.hostsuffix03.co.uk"));
EXPECT_TRUE(urlMatchList.isUrlMatched("http://hostsuffix03.co.uk"));
EXPECT_FALSE(urlMatchList.isUrlMatched("http://bhostsuffix03.co.uk"));
EXPECT_FALSE(urlMatchList.isUrlMatched("http://jostsuffix03.co.uk"));
//hostsuffix .hostsuffix04.co.uk
//host .hostsuffix04.co.uk matchsuffix
EXPECT_TRUE(urlMatchList.isUrlMatched("http://sub.hostsuffix04.co.uk"));
EXPECT_TRUE(urlMatchList.isUrlMatched("http://sub1.sub.hostsuffix04.co.uk"));
EXPECT_FALSE(urlMatchList.isUrlMatched("http://hostsuffix04.co.uk"));
EXPECT_FALSE(urlMatchList.isUrlMatched("http://bhostsuffix04.co.uk"));
EXPECT_FALSE(urlMatchList.isUrlMatched("http://jostsuffix04.co.uk"));
//hostsuffix hostsuffix05.a.se
//host hostsuffix05.a.se matchsuffix
EXPECT_TRUE(urlMatchList.isUrlMatched("http://sub.hostsuffix05.a.se"));
EXPECT_TRUE(urlMatchList.isUrlMatched("http://sub1.sub.hostsuffix05.a.se"));
EXPECT_TRUE(urlMatchList.isUrlMatched("http://hostsuffix05.a.se"));
EXPECT_FALSE(urlMatchList.isUrlMatched("http://bhostsuffix05.a.se"));
EXPECT_FALSE(urlMatchList.isUrlMatched("http://jostsuffix05.a.se"));
//hostsuffix .hostsuffix06.a.se
//host .hostsuffix06.a.se matchsuffix
EXPECT_TRUE(urlMatchList.isUrlMatched("http://sub.hostsuffix06.a.se"));
EXPECT_TRUE(urlMatchList.isUrlMatched("http://sub1.sub.hostsuffix06.a.se"));
EXPECT_FALSE(urlMatchList.isUrlMatched("http://hostsuffix06.a.se"));
@ -329,11 +329,11 @@ TEST(UrlMatchListTest, PathFile) {
EXPECT_TRUE(urlMatchList.isUrlMatched("http://www.example.com/blog/wp-login.php?param=value&param2=value2"));
}
TEST(UrlMatchListTest, PathQueryParam) {
TEST(UrlMatchListTest, PathParam) {
TestUrlMatchList urlMatchList("blocklist/path.txt");
urlMatchList.load();
//queryparam url
//param url
EXPECT_TRUE(urlMatchList.isUrlMatched("https://www.example.com/bogus.html?URL=abc"));
EXPECT_TRUE(urlMatchList.isUrlMatched("https://www.example.com/bogus.html?url=abcde"));
EXPECT_TRUE(urlMatchList.isUrlMatched("https://www.example.com/bogus.html?uRl=abcde"));
@ -341,7 +341,7 @@ TEST(UrlMatchListTest, PathQueryParam) {
EXPECT_FALSE(urlMatchList.isUrlMatched("https://www.example.com/bogus.html?urlz=http://www.example.com"));
EXPECT_FALSE(urlMatchList.isUrlMatched("https://www.example.com/bogus.html?zurl=http://www.example.com"));
//queryparam action buy_now
//param action buy_now
EXPECT_TRUE(urlMatchList.isUrlMatched("https://www.example.com/cart.html?action=buy_now&product_id=123"));
EXPECT_TRUE(urlMatchList.isUrlMatched("https://www.example.com/cart.html?ACTION=buy_now&product_id=123"));
EXPECT_TRUE(urlMatchList.isUrlMatched("https://www.example.com/cart.html?product_id=123&action=buy_now"));

@ -21,9 +21,3 @@ domain allowrootdomainindexpage.com AND NOT pathcriteria indexpage
domain allowdomainindexpage.com AND NOT subdomain ,www
domain allowdomainindexpage.com AND NOT pathcriteria indexpage
hostsuffix hostsuffix01.com
hostsuffix .hostsuffix02.com
hostsuffix hostsuffix03.co.uk
hostsuffix .hostsuffix04.co.uk
hostsuffix hostsuffix05.a.se
hostsuffix .hostsuffix06.a.se

@ -4,3 +4,10 @@ host port.host.com AND port 3001
host ssl.host.com AND port 443
host www.somesite.com AND path /badpath/
host hostsuffix01.com matchsuffix
host .hostsuffix02.com matchsuffix
host hostsuffix03.co.uk matchsuffix
host .hostsuffix04.co.uk matchsuffix
host hostsuffix05.a.se matchsuffix
host .hostsuffix06.a.se matchsuffix

@ -1,5 +1,5 @@
path /wp-admin/
pathpartial /wishlist/index/add/
path /wishlist/index/add/ matchpartial
file wp-login.php
param url
param action buy_now