forked from Mirrors/privacore-open-source-search-engine
142 lines
4.0 KiB
C++
142 lines
4.0 KiB
C++
#include "UrlBlock.h"
|
|
#include "Url.h"
|
|
#include "hash.h"
|
|
#include "GbUtil.h"
|
|
#include "Log.h"
|
|
#include "Conf.h"
|
|
#include <algorithm>
|
|
|
|
urlblocktld_t::urlblocktld_t(const std::string &tlds)
|
|
: m_tldsStr(tlds)
|
|
, m_tlds(split(tlds, ',')) {
|
|
}
|
|
|
|
urlblockdomain_t::urlblockdomain_t(const std::string &domain, const std::string &allow)
|
|
: m_domain(domain)
|
|
, m_allow(split(allow, ',')) {
|
|
}
|
|
|
|
urlblockhost_t::urlblockhost_t(const std::string &host, const std::string &path)
|
|
: m_host(host)
|
|
, m_path(path) {
|
|
}
|
|
|
|
urlblockpath_t::urlblockpath_t(const std::string &path)
|
|
: m_path(path) {
|
|
}
|
|
|
|
urlblockregex_t::urlblockregex_t(const std::string ®exStr, const GbRegex ®ex, const std::string &domain)
|
|
: m_regex(regex)
|
|
, m_regexStr(regexStr)
|
|
, m_domain(domain) {
|
|
}
|
|
|
|
UrlBlock::UrlBlock(const std::shared_ptr<urlblocktld_t> &urlblocktld)
|
|
: m_type(url_block_tld)
|
|
, m_tld(urlblocktld) {
|
|
}
|
|
|
|
UrlBlock::UrlBlock(const std::shared_ptr<urlblockdomain_t> &urlblockdomain)
|
|
: m_type(url_block_domain)
|
|
, m_domain(urlblockdomain) {
|
|
}
|
|
|
|
UrlBlock::UrlBlock(const std::shared_ptr<urlblockhost_t> &urlblockhost)
|
|
: m_type(url_block_host)
|
|
, m_host(urlblockhost) {
|
|
}
|
|
|
|
UrlBlock::UrlBlock(const std::shared_ptr<urlblockpath_t> &urlblockpath)
|
|
: m_type(url_block_path)
|
|
, m_path(urlblockpath) {
|
|
|
|
}
|
|
|
|
UrlBlock::UrlBlock(const std::shared_ptr<urlblockregex_t> &urlblockregex)
|
|
: m_type(url_block_regex)
|
|
, m_regex(urlblockregex) {
|
|
}
|
|
|
|
bool UrlBlock::match(const Url &url) const {
|
|
switch (m_type) {
|
|
case url_block_domain:
|
|
if (m_domain->m_domain.length() == static_cast<size_t>(url.getDomainLen()) &&
|
|
memcmp(m_domain->m_domain.c_str(), url.getDomain(), url.getDomainLen()) == 0) {
|
|
// check subdomain
|
|
if (!m_domain->m_allow.empty()) {
|
|
auto subDomainLen = (url.getDomain() == url.getHost()) ? 0 : url.getDomain() - url.getHost() - 1;
|
|
std::string subDomain(url.getHost(), subDomainLen);
|
|
return (std::find(m_domain->m_allow.cbegin(), m_domain->m_allow.cend(), subDomain) == m_domain->m_allow.cend());
|
|
}
|
|
|
|
return true;
|
|
}
|
|
break;
|
|
case url_block_host:
|
|
if (m_host->m_host.length() == static_cast<size_t>(url.getHostLen()) &&
|
|
memcmp(m_host->m_host.c_str(), url.getHost(), url.getHostLen()) == 0) {
|
|
if (m_host->m_path.empty()) {
|
|
return true;
|
|
}
|
|
|
|
return (m_host->m_path.length() <= static_cast<size_t>(url.getPathLenWithCgi()) &&
|
|
memcmp(m_host->m_path.c_str(), url.getPath(), m_host->m_path.length()) == 0);
|
|
}
|
|
break;
|
|
case url_block_path:
|
|
return (m_path->m_path.length() <= static_cast<size_t>(url.getPathLenWithCgi()) &&
|
|
memcmp(m_path->m_path.c_str(), url.getPath(), m_path->m_path.length()) == 0);
|
|
case url_block_regex:
|
|
if (m_regex->m_domain.empty() || (!m_regex->m_domain.empty() &&
|
|
m_regex->m_domain.length() == static_cast<size_t>(url.getDomainLen()) &&
|
|
memcmp(m_regex->m_domain.c_str(), url.getDomain(), url.getDomainLen()) == 0)) {
|
|
return m_regex->m_regex.match(url.getUrl());
|
|
}
|
|
break;
|
|
case url_block_tld:
|
|
if (!m_tld->m_tlds.empty()) {
|
|
const char *tld = url.getTLD();
|
|
size_t tldLen = static_cast<size_t>(url.getTLDLen());
|
|
const char *dotPos = static_cast<const char *>(memchr(tld, '.', tldLen));
|
|
if (dotPos) {
|
|
tldLen -= (dotPos - tld + 1);
|
|
tld = dotPos + 1;
|
|
}
|
|
|
|
return (std::find(m_tld->m_tlds.cbegin(), m_tld->m_tlds.cend(), std::string(tld, tldLen)) != m_tld->m_tlds.cend());
|
|
}
|
|
break;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
void UrlBlock::logMatch(const Url &url) const {
|
|
const char *type = NULL;
|
|
const char *value = NULL;
|
|
|
|
switch (m_type) {
|
|
case url_block_domain:
|
|
type = "domain";
|
|
value = m_domain->m_domain.c_str();
|
|
break;
|
|
case url_block_host:
|
|
type = "host";
|
|
value = m_host->m_host.c_str();
|
|
break;
|
|
case url_block_path:
|
|
type = "path";
|
|
value = m_path->m_path.c_str();
|
|
break;
|
|
case url_block_regex:
|
|
type = "regex";
|
|
value = m_regex->m_regexStr.c_str();
|
|
break;
|
|
case url_block_tld:
|
|
type = "tld";
|
|
value = m_tld->m_tldsStr.c_str();
|
|
}
|
|
|
|
logTrace(g_conf.m_logTraceUrlBlockList, "Url block criteria %s='%s' matched url '%s'", type, value, url.getUrl());
|
|
}
|