Files
privacore-open-source-searc…/UrlMatchList.cpp

380 lines
11 KiB
C++

#include "UrlMatchList.h"
#include "Log.h"
#include "Conf.h"
#include "Loop.h"
#include "Url.h"
#include "GbUtil.h"
#include "Dir.h"
#include "Hostdb.h"
#include "third-party/sparsepp/sparsepp/spp.h"
#include "JobScheduler.h"
#include <fstream>
#include <sys/stat.h>
#include <atomic>
// Global match lists. The constructor argument is the file name (optionally
// prefixed by a directory) that load() later hands to Dir::getNextFilename().
UrlMatchList g_urlBlackList("urlblacklist*.txt");
UrlMatchList g_urlWhiteList("urlwhitelist.txt");

// A plain sequence of match criteria.
using urlmatchlist_t = std::vector<UrlMatch>;

// Match criteria grouped under a string key (the code in this file always
// uses a domain as the key).
using urlmatchlist_map_t = spp::sparse_hash_map<std::string, urlmatchlist_t>;
// One fully parsed generation of match criteria. load() builds a fresh
// instance and publishes it through swapUrlMatchList(); isUrlMatched() reads it.
struct UrlMatchListItem {
// domains that match unconditionally ("domain" lines without extra columns)
spp::sparse_hash_set<std::string> m_domainMatches;
// criteria that are only evaluated for urls whose domain equals the map key
urlmatchlist_map_t m_listMatches;
// criteria that are evaluated for every url, regardless of domain
urlmatchlist_t m_urlMatches;
};
/**
 * Create a match list backed by the given file.
 *
 * @param filename file name to load; when it contains a '/', everything before
 *                 the last '/' is stored as the directory and the remainder as
 *                 the file name.
 */
UrlMatchList::UrlMatchList(const char *filename)
	: m_filename(filename)
	, m_dirname()
	, m_loading(false)
	, m_urlMatchList(new UrlMatchListItem)
	, m_lastModifiedTimes() {
	const size_t lastSlash = m_filename.rfind('/');
	if (lastSlash == std::string::npos) {
		// no directory component; m_dirname stays empty
		return;
	}

	// split "dir/name" into its two halves, dropping the separator
	m_dirname.assign(m_filename, 0, lastSlash);
	m_filename.erase(0, lastSlash + 1);
}
/**
 * Register the periodic reload callback and perform the initial load.
 *
 * @return false when the sleep callback could not be registered, true
 *         otherwise (the result of the initial load() is not reported).
 */
bool UrlMatchList::init() {
	log(LOG_INFO, "Initializing UrlMatchList with %s", m_filename.c_str());

	if (g_loop.registerSleepCallback(60000, this, &reload, "UrlMatchList::reload", 0)) {
		// load right away rather than asking the sleep callback to fire immediately,
		// so the list is usable even when g_loop is not up and running yet
		load();
		return true;
	}

	log(LOG_WARN, "UrlMatchList: Failed to register callback.");
	return false;
}
/**
 * Sleep-callback entry point: hand the reload to the job scheduler, or run it
 * directly when the job cannot be submitted.
 *
 * @param state the UrlMatchList instance that was registered with the callback
 */
void UrlMatchList::reload(int /*fd*/, void *state) {
	if (!g_jobScheduler.submit(reload, nullptr, state, thread_type_config_load, 0)) {
		// unable to submit job (load on main thread)
		reload(state);
	}
}
void UrlMatchList::reload(void *state) {
UrlMatchList *urlMatchList = static_cast<UrlMatchList*>(state);
// don't load multiple times at the same time
if (urlMatchList->m_loading.exchange(true)) {
return;
}
urlMatchList->load();
urlMatchList->m_loading = false;
}
/**
 * Add a "domain" entry to the list being built.
 *
 * @param urlMatchList list under construction
 * @param col2 the domain; rejected unless the whole string parses as a domain
 * @param col3 optional "allow=..." column; the text after "allow=" is passed to the matcher
 * @param col4 optional path criteria ("allowindexpage" or "allowrootpages")
 * @return false when col2 is not a plain domain, true when an entry was added
 */
static bool parseDomain(urlmatchlistitem_ptr_t *urlMatchList, const std::string &col2, const std::string &col3, const std::string &col4) {
	// col2 must consist of a domain and nothing else
	Url parsed;
	parsed.set(col2.c_str());
	const size_t domainLen = static_cast<size_t>(parsed.getDomainLen());
	if (domainLen != col2.length()) {
		return false;
	}

	auto &item = **urlMatchList;

	// without extra columns the domain matches unconditionally
	const bool hasOptions = !col3.empty() || !col4.empty();
	if (!hasOptions) {
		item.m_domainMatches.insert(col2);
		return true;
	}

	// anything in col3 that does not start with "allow=" is ignored
	std::string allowStr;
	if (!col3.empty() && starts_with(col3.c_str(), "allow=")) {
		allowStr = col3.substr(6);
	}

	// unknown col4 values keep the default criteria
	urlmatchdomain_t::pathcriteria_t pathcriteria = urlmatchdomain_t::pathcriteria_allow_all;
	if (col4 == "allowindexpage") {
		pathcriteria = urlmatchdomain_t::pathcriteria_allow_index_only;
	} else if (col4 == "allowrootpages") {
		pathcriteria = urlmatchdomain_t::pathcriteria_allow_rootpages_only;
	}

	auto matcher = std::make_shared<urlmatchdomain_t>(col2, allowStr, pathcriteria);
	item.m_listMatches[matcher->m_domain].emplace_back(matcher);
	return true;
}
/**
 * Add a "host" entry, filed under the domain of the matcher's host.
 *
 * @param urlMatchList list under construction
 * @param col2 first argument for the host matcher
 * @param col3 second argument for the host matcher (may be empty)
 */
static void parseHost(urlmatchlistitem_ptr_t *urlMatchList, const std::string &col2, const std::string &col3) {
	auto matcher = std::make_shared<urlmatchhost_t>(col2, col3);

	// the map is keyed by domain, so derive the domain from the host
	Url hostUrl;
	hostUrl.set(matcher->m_host.c_str());
	const std::string domainKey(hostUrl.getDomain(), hostUrl.getDomainLen());

	(*urlMatchList)->m_listMatches[domainKey].emplace_back(matcher);
}
/**
 * Add a "regex" entry. A col2 of "*" means the regex is not tied to a domain
 * and is stored with the criteria evaluated for every url; otherwise it is
 * filed under the matcher's domain.
 *
 * @param urlMatchList list under construction
 * @param col2 domain, or "*" as wildcard
 * @param col3 the regular expression
 */
static void parseRegex(urlmatchlistitem_ptr_t *urlMatchList, const std::string &col2, const std::string &col3) {
	// check for wildcard domain
	const bool wildcard = (col2 == "*");
	const std::string domain = wildcard ? std::string() : col2;

	auto matcher = std::make_shared<urlmatchregex_t>(col3, GbRegex(col3.c_str(), PCRE_NO_AUTO_CAPTURE, PCRE_STUDY_JIT_COMPILE), domain);

	auto &item = **urlMatchList;
	if (!domain.empty()) {
		item.m_listMatches[matcher->m_domain].emplace_back(matcher);
		return;
	}

	item.m_urlMatches.emplace_back(matcher);
}
/**
 * Add a "hostsuffix" entry, filed under the domain parsed from the suffix.
 *
 * @param urlMatchList list under construction
 * @param col2 the host suffix
 */
static void parseHostSuffix(urlmatchlistitem_ptr_t *urlMatchList, const std::string &col2) {
	auto matcher = std::make_shared<urlmatchstr_t>(url_match_hostsuffix, col2);

	// the map is keyed by domain, so derive the domain from the suffix
	Url suffixUrl;
	suffixUrl.set(matcher->m_str.c_str());
	const std::string domainKey(suffixUrl.getDomain(), suffixUrl.getDomainLen());

	(*urlMatchList)->m_listMatches[domainKey].emplace_back(matcher);
}
bool UrlMatchList::load() {
std::string dirname(m_dirname);
if (dirname.empty()) {
dirname = g_hostdb.m_dir;
}
Dir dir;
if (!dir.set(dirname.c_str()) || !dir.open()) {
logError("Had error opening directory %s", g_hostdb.m_dir);
return false;
}
urlmatchlistitem_ptr_t tmpUrlMatchList(new UrlMatchListItem);
std::vector<std::string> filePaths;
bool anyFileModified = false;
while (const char *filename = dir.getNextFilename(m_filename.c_str())) {
std::string filePath(filename);
if (!m_dirname.empty()) {
filePath.insert(0, "/");
filePath.insert(0, m_dirname);
}
logTrace(g_conf.m_logTraceUrlMatchList, "Loading %s", filePath.c_str());
struct stat st;
if (stat(filePath.c_str(), &st) != 0) {
// probably not found
log(LOG_INFO, "UrlMatchList::load: Unable to stat %s", filePath.c_str());
continue;
}
filePaths.push_back(filePath);
time_t lastModifiedTime = m_lastModifiedTimes[filePath];
if (lastModifiedTime != 0 && lastModifiedTime == st.st_mtime) {
// not modified. assume successful
logTrace(g_conf.m_logTraceUrlMatchList, "Not modified");
continue;
}
anyFileModified = true;
m_lastModifiedTimes[filePath] = st.st_mtime;
}
if (!anyFileModified) {
return false;
}
int totalCount = 0;
bool loadedFile = false;
for (const auto &filePath : filePaths) {
log(LOG_INFO, "Loading '%s' for UrlMatchList", filePath.c_str());
int count = 0;
std::ifstream file(filePath.c_str());
std::string line;
while (std::getline(file, line)) {
// ignore comments & empty lines
if (line.length() == 0 || line[0] == '#') {
continue;
}
// look for first space or tab
auto firstColEnd = line.find_first_of(" \t");
size_t secondCol = line.find_first_not_of(" \t", firstColEnd);
if (firstColEnd == std::string::npos || secondCol == std::string::npos) {
// invalid format
continue;
}
size_t secondColEnd = line.find_first_of(" \t", secondCol);
size_t thirdCol = line.find_first_not_of(" \t", secondColEnd);
size_t thirdColEnd = line.find_first_of(" \t", thirdCol);
size_t fourthCol = line.find_first_not_of(" \t", thirdColEnd);
size_t fourthColEnd = line.find_first_of(" \t", fourthCol);
std::string col2(line, secondCol, secondColEnd - secondCol);
std::string col3;
if (thirdCol != std::string::npos) {
col3 = std::string(line, thirdCol, thirdColEnd - thirdCol);
}
std::string col4;
if (fourthCol != std::string::npos) {
col4 = std::string(line, fourthCol, fourthColEnd - fourthCol);
}
switch (line[0]) {
case 'd':
// domain
if (firstColEnd == 6 && memcmp(line.data(), "domain", 6) == 0) {
if (!parseDomain(&tmpUrlMatchList, col2, col3, col4)) {
logError("Invalid line found. Ignoring line='%s'", line.c_str());
continue;
}
} else {
logError("Invalid line found. Ignoring line='%s'", line.c_str());
continue;
}
break;
case 'f':
// file
if (firstColEnd == 4 && memcmp(line.data(), "file", 4) == 0) {
tmpUrlMatchList->m_urlMatches.emplace_back(std::shared_ptr<urlmatchstr_t>(new urlmatchstr_t(url_match_file, col2)));
} else {
logError("Invalid line found. Ignoring line='%s'", line.c_str());
continue;
}
break;
case 'h':
// host
if (firstColEnd == 4 && memcmp(line.data(), "host", 4) == 0) {
parseHost(&tmpUrlMatchList, col2, col3);
} else if (firstColEnd == 10 && memcmp(line.data(), "hostsuffix", 10) == 0) {
parseHostSuffix(&tmpUrlMatchList, col2);
} else {
logError("Invalid line found. Ignoring line='%s'", line.c_str());
continue;
}
break;
case 'p':
if (firstColEnd == 5 && memcmp(line.data(), "param", 5) == 0) {
// param
tmpUrlMatchList->m_urlMatches.emplace_back(std::shared_ptr<urlmatchparam_t>(new urlmatchparam_t(col2, col3)));
} else if (firstColEnd == 4 && memcmp(line.data(), "path", 4) == 0) {
// path
tmpUrlMatchList->m_urlMatches.emplace_back(std::shared_ptr<urlmatchstr_t>(new urlmatchstr_t(url_match_path, col2)));
} else {
logError("Invalid line found. Ignoring line='%s'", line.c_str());
continue;
}
break;
case 'r':
// regex
if (firstColEnd == 5 && memcmp(line.data(), "regex", 5) == 0 && !col3.empty()) {
parseRegex(&tmpUrlMatchList, col2, col3);
} else {
logError("Invalid line found. Ignoring line='%s'", line.c_str());
continue;
}
break;
case 't':
// tld
if (firstColEnd == 3 && memcmp(line.data(), "tld", 3) == 0) {
tmpUrlMatchList->m_urlMatches.emplace_back(std::shared_ptr<urlmatchtld_t>(new urlmatchtld_t(col2)));
} else {
logError("Invalid line found. Ignoring line='%s'", line.c_str());
continue;
}
break;
default:
logError("Invalid line found. Ignoring line='%s'", line.c_str());
continue;
}
logTrace(g_conf.m_logTraceUrlMatchList, "Adding criteria '%s' to list", line.c_str());
++count;
}
loadedFile = true;
log(LOG_INFO, "Loaded '%s' with %d entries for UrlMatchList", filePath.c_str(), count);
totalCount += count;
}
if (loadedFile) {
logTrace(g_conf.m_logTraceUrlMatchList, "Number of url-match entries in %s: %d", m_filename.c_str(), totalCount);
swapUrlMatchList(tmpUrlMatchList);
}
return loadedFile;
}
/**
 * Check a url against the criteria stored under one key of the map.
 *
 * @param matcher map from key to list of criteria
 * @param key key to look up (callers in this file pass the url's domain)
 * @param url url to test
 * @param urlParser parser already initialised with the same url
 * @return true as soon as one criterion matches; false when the key is absent
 *         or none of its criteria match
 */
static bool matchList(const urlmatchlist_map_t &matcher, const std::string &key, const Url &url, const UrlParser &urlParser) {
	const auto it = matcher.find(key);
	if (it == matcher.end()) {
		return false;
	}

	// iterate by const reference; copying every UrlMatch per lookup is needless work
	for (const auto &urlMatch : it->second) {
		if (urlMatch.match(url, urlParser)) {
			if (g_conf.m_logTraceUrlMatchList) {
				urlMatch.logMatch(url);
			}
			return true;
		}
	}

	return false;
}
bool UrlMatchList::isUrlMatched(const Url &url) {
UrlParser urlParser(url.getUrl(), url.getUrlLen(), TITLEREC_CURRENT_VERSION);
auto urlMatchList = getUrlMatchList();
std::string domain(url.getDomain(), url.getDomainLen());
// simple domain match
if (urlMatchList->m_domainMatches.count(domain) > 0) {
return true;
}
// check urlmatches using domain as key
if (matchList(urlMatchList->m_listMatches, domain, url, urlParser)) {
return true;
}
for (auto const &urlMatch : urlMatchList->m_urlMatches) {
if (urlMatch.match(url, urlParser)) {
if (g_conf.m_logTraceUrlMatchList) {
urlMatch.logMatch(url);
}
return true;
}
}
return false;
}
/**
 * Get a reference to the currently published list.
 *
 * swapUrlMatchList() replaces m_urlMatchList with std::atomic_store(), so the
 * read has to go through std::atomic_load(); a plain copy of the shared
 * pointer while it is being stored to is a data race.
 *
 * @return the list that was most recently published
 */
urlmatchlistitemconst_ptr_t UrlMatchList::getUrlMatchList() {
	return std::atomic_load(&m_urlMatchList);
}
// Publish a newly built list: atomically replaces m_urlMatchList with the
// given pointer. load() calls this once parsing has finished.
void UrlMatchList::swapUrlMatchList(urlmatchlistitemconst_ptr_t urlMatchList) {
std::atomic_store(&m_urlMatchList, urlMatchList);
}