Add g_contentRetryProxyList & rename BlockList to MatchList

This commit is contained in:
Ai Lin Chia 2018-05-31 12:44:20 +02:00
parent a990000fe4
commit dcb7aa46ee
16 changed files with 146 additions and 69 deletions

@ -233,7 +233,7 @@ Conf::Conf ( ) {
m_logDebugUrlAttempts = false;
m_logDebugVagus = false;
m_logTraceBigFile = false;
m_logTraceBlockList = false;
m_logTraceMatchList = false;
m_logTraceContentTypeBlockList = false;
m_logTraceDocProcess = false;
m_logTraceDns = false;

2
Conf.h

@ -381,7 +381,7 @@ class Conf {
bool m_logDebugVagus;
bool m_logTraceBigFile;
bool m_logTraceBlockList;
bool m_logTraceMatchList;
bool m_logTraceContentTypeBlockList;
bool m_logTraceDocProcess;
bool m_logTraceDns;

41
ContentMatchList.cpp Normal file

@ -0,0 +1,41 @@
//
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// License TL;DR: If you change this file, you must publish your changes.
//
#include "ContentMatchList.h"
#include "Log.h"
#include "Conf.h"
ContentMatchList g_contentRetryProxyList;
static const char s_filename[] = "contentretryproxylist.txt";
// Binds this list to its backing file (s_filename). Nothing is read here;
// the file is loaded by MatchList::init().
ContentMatchList::ContentMatchList() : MatchList(s_filename) {}
bool ContentMatchList::isContentMatched(const char *content, size_t contentLen) {
auto contentMatchList = getMatchList();
for (auto const &contentMatch : *contentMatchList) {
if (strncasestr(content, contentLen, contentMatch.c_str())) {
return true;
}
}
return false;
}

32
ContentMatchList.h Normal file

@ -0,0 +1,32 @@
//
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// License TL;DR: If you change this file, you must publish your changes.
//
#ifndef FX_CONTENTMATCHLIST_H
#define FX_CONTENTMATCHLIST_H
#include "MatchList.h"
// A MatchList of strings that is checked against document content.
// The single global instance, g_contentRetryProxyList, is consulted by retryProxy()
// on the response body to decide whether the download should be retried via proxy.
class ContentMatchList : public MatchList<std::string> {
public:
// Binds the list to its backing file; the file itself is read by init().
ContentMatchList();
// Returns true if any loaded entry is found in the first contentLen bytes of content.
// NOTE(review): matching goes through strncasestr, presumably case-insensitive - confirm.
bool isContentMatched(const char *content, size_t contentLen);
};
extern ContentMatchList g_contentRetryProxyList;
#endif // FX_CONTENTMATCHLIST_H

@ -29,7 +29,7 @@ static const char s_contenttype_filename[] = "contenttypeblocklist.txt";
static const char s_contenttype_allowed_filename[] = "contenttypeallowed.txt";
// Binds the block list to s_contenttype_filename and starts with an empty
// allowed-content-type list guarded by a statically initialised mutex.
ContentTypeBlockList::ContentTypeBlockList()
	: MatchList(s_contenttype_filename),
	  m_contenttype_allowed(),
	  m_contenttype_allowed_mtx(PTHREAD_MUTEX_INITIALIZER) {}
@ -43,7 +43,7 @@ bool ContentTypeBlockList::init() {
m_contenttype_allowed.push_back(line);
}
return BlockList::init();
return MatchList::init();
}
void ContentTypeBlockList::addContentTypeAllowed(const char *contentType, size_t contentTypeLen) {
@ -65,7 +65,7 @@ bool ContentTypeBlockList::isContentTypeBlocked(const char *contentType, size_t
return false;
}
auto contentTypeBlockList = getBlockList();
auto contentTypeBlockList = getMatchList();
for (auto const &contentTypeBlock : *contentTypeBlockList) {
if (contentTypeBlock.back() == '*') {

@ -20,11 +20,11 @@
#define FX_CONTENTTYPEBLOCKLIST_H
#include "BlockList.h"
#include "MatchList.h"
#include <pthread.h>
#include <vector>
class ContentTypeBlockList : public BlockList<std::string> {
class ContentTypeBlockList : public MatchList<std::string> {
public:
ContentTypeBlockList();

@ -25,11 +25,11 @@ DnsBlockList g_dnsBlockList;
static const char s_dns_filename[] = "dnsblocklist.txt";
// Binds the list to its backing file (s_dns_filename); loading happens in init().
DnsBlockList::DnsBlockList() : MatchList(s_dns_filename) {}
bool DnsBlockList::isDnsBlocked(const char *dns) {
auto dnsBlockList = getBlockList();
auto dnsBlockList = getMatchList();
for (auto const &dnsBlock : *dnsBlockList) {
if (dnsBlock.front() == '*') {

@ -19,9 +19,9 @@
#ifndef FX_DNSBLOCKLIST_H
#define FX_DNSBLOCKLIST_H
#include "BlockList.h"
#include "MatchList.h"
class DnsBlockList : public BlockList<std::string> {
class DnsBlockList : public MatchList<std::string> {
public:
DnsBlockList();
bool isDnsBlocked(const char *dns);

@ -26,11 +26,11 @@ IpBlockList g_ipBlockList;
static const char s_ip_filename[] = "ipblocklist.txt";
// Binds the list to its backing file (s_ip_filename); loading happens in init().
IpBlockList::IpBlockList() : MatchList(s_ip_filename) {}
bool IpBlockList::isIpBlocked(uint32_t ip) {
auto ipBlockList = getBlockList();
auto ipBlockList = getMatchList();
for (auto const &ipBlock : *ipBlockList) {
if (ipBlock == ip) {
@ -42,7 +42,7 @@ bool IpBlockList::isIpBlocked(uint32_t ip) {
return false;
}
void IpBlockList::addToBlockList(blocklist_ptr_t<uint32_t> &blockList, const std::string &line) {
void IpBlockList::addToBlockList(matchlist_ptr_t<uint32_t> &blockList, const std::string &line) {
in_addr addr;
if (inet_pton(AF_INET, line.c_str(), &addr) != 1) {

@ -19,15 +19,15 @@
#ifndef FX_IPBLOCKLIST_H
#define FX_IPBLOCKLIST_H
#include "BlockList.h"
#include "MatchList.h"
// Block list of IPv4 addresses, kept as uint32_t entries of a MatchList.
class IpBlockList : public MatchList<uint32_t> {
public:
	IpBlockList();

	// Returns true when ip equals one of the loaded entries.
	bool isIpBlocked(uint32_t ip);

protected:
	// MatchList<T>::load() feeds every line through the virtual addToMatchList().
	// Since the BlockList -> MatchList rename, addToBlockList() on its own no longer
	// overrides anything, so the generic MatchList<uint32_t>::addToMatchList() would
	// run instead and call gbshutdownLogicError(). Forward to the IP parser here so
	// the existing out-of-line definition keeps working unchanged.
	void addToMatchList(matchlist_ptr_t<uint32_t> &matchList, const std::string &line) override {
		addToBlockList(matchList, line);
	}

	// Parses line with inet_pton(AF_INET) and adds it to blockList; defined in IpBlockList.cpp.
	void addToBlockList(matchlist_ptr_t<uint32_t> &blockList, const std::string &line);
};

@ -59,8 +59,8 @@ OBJS_O2 = \
OBJS_O3 = \
BlockList.o \
ContentTypeBlockList.o \
MatchList.o \
ContentMatchList.o ContentTypeBlockList.o \
DocDelete.o DocProcess.o DocRebuild.o DocReindex.o DnsBlockList.o \
IPAddressChecks.o IpBlockList.o \
LanguageResultOverride.o Linkdb.o \

@ -16,7 +16,7 @@
//
// License TL;DR: If you change this file, you must publish your changes.
//
#include "BlockList.h"
#include "MatchList.h"
#include "Log.h"
#include "Conf.h"
#include "Loop.h"
@ -26,31 +26,31 @@
#include <atomic>
template <class T>
BlockList<T>::BlockList(const char *filename)
MatchList<T>::MatchList(const char *filename)
: m_filename(filename)
, m_loading(false)
, m_blockList(new blocklist_t<T>)
, m_matchList(new matchlist_t<T>)
, m_lastModifiedTime(0) {
}
// Registers the periodic reload callback and performs the first load.
//
// @return false if the sleep callback could not be registered, true otherwise
//         (the result of the initial load() is not propagated).
template <class T>
bool MatchList<T>::init() {
	log(LOG_INFO, "Initializing MatchList with %s", m_filename);

	// interval 60000 is presumably milliseconds (one minute) - confirm against Loop.h
	const bool registered = g_loop.registerSleepCallback(60000, this, &reload, "MatchList<T>::reload", 0);
	if (!registered) {
		log(LOG_WARN, "MatchList<T>:: Failed to register callback.");
		return false;
	}

	// we do a load here instead of using sleep callback with immediate set to true so
	// we don't rely on g_loop being up and running to use matchlist
	load();

	return true;
}
template <class T>
void BlockList<T>::reload(int /*fd*/, void *state) {
void MatchList<T>::reload(int /*fd*/, void *state) {
if (g_jobScheduler.submit(reload, nullptr, state, thread_type_config_load, 0)) {
return;
}
@ -60,36 +60,36 @@ void BlockList<T>::reload(int /*fd*/, void *state) {
}
// Job entry point: reloads the list behind state unless a load is already running.
//
// @param state  the MatchList instance, passed through as an opaque pointer
template <class T>
void MatchList<T>::reload(void *state) {
	auto *self = static_cast<MatchList*>(state);

	// don't load multiple times at the same time:
	// exchange() returns the previous flag value, true means someone else is loading
	const bool alreadyLoading = self->m_loading.exchange(true);
	if (alreadyLoading) {
		return;
	}

	self->load();
	self->m_loading = false;
}
template <class T>
bool BlockList<T>::load() {
logTrace(g_conf.m_logTraceBlockList, "Loading %s", m_filename);
bool MatchList<T>::load() {
logTrace(g_conf.m_logTraceMatchList, "Loading %s", m_filename);
struct stat st;
if (stat(m_filename, &st) != 0) {
// probably not found
log(LOG_INFO, "BlockList<T>::load: Unable to stat %s", m_filename);
log(LOG_INFO, "MatchList<T>::load: Unable to stat %s", m_filename);
return false;
}
if (m_lastModifiedTime != 0 && m_lastModifiedTime == st.st_mtime) {
// not modified. assume successful
logTrace(g_conf.m_logTraceBlockList, "%s not modified", m_filename);
logTrace(g_conf.m_logTraceMatchList, "%s not modified", m_filename);
return true;
}
blocklist_ptr_t<T> tmpBlockList(new blocklist_t<T>);
matchlist_ptr_t<T> tmpMatchList(new matchlist_t<T>);
std::ifstream file(m_filename);
std::string line;
@ -99,37 +99,37 @@ bool BlockList<T>::load() {
continue;
}
addToBlockList(tmpBlockList, line);
logTrace(g_conf.m_logTraceBlockList, "Adding criteria '%s' to list", line.c_str());
addToMatchList(tmpMatchList, line);
logTrace(g_conf.m_logTraceMatchList, "Adding criteria '%s' to list", line.c_str());
}
swapBlockList(tmpBlockList);
swapMatchList(tmpMatchList);
m_lastModifiedTime = st.st_mtime;
logTrace(g_conf.m_logTraceBlockList, "Loaded %s", m_filename);
logTrace(g_conf.m_logTraceMatchList, "Loaded %s", m_filename);
return true;
}
// Generic fallback: there is no line parser for an arbitrary T, so reaching this
// is treated as a logic error. Element types either get a specialization (see
// std::string) or their subclass overrides this virtual.
template <class T>
void MatchList<T>::addToMatchList(matchlist_ptr_t<T> & /*matchList*/, const std::string & /*line*/) {
	gbshutdownLogicError();
}
// String lists store each line verbatim.
template <>
void MatchList<std::string>::addToMatchList(matchlist_ptr_t<std::string> &matchList, const std::string &line) {
	matchList->push_back(line);
}
template <class T>
blocklistconst_ptr_t<T> BlockList<T>::getBlockList() {
return m_blockList;
matchlistconst_ptr_t<T> MatchList<T>::getMatchList() {
return m_matchList;
}
// Publishes a freshly loaded list by atomically replacing the shared_ptr held in m_matchList.
template <class T>
void MatchList<T>::swapMatchList(matchlistconst_ptr_t<T> matchList) {
	auto *const slot = &m_matchList;
	std::atomic_store(slot, matchList);
}
// explicit instantiations
template class BlockList<std::string>;
template class BlockList<uint32_t>;
template class MatchList<std::string>;
template class MatchList<uint32_t>;

@ -16,8 +16,8 @@
//
// License TL;DR: If you change this file, you must publish your changes.
//
#ifndef FX_BLOCKLIST_H
#define FX_BLOCKLIST_H
#ifndef FX_MATCHLIST_H
#define FX_MATCHLIST_H
#include <memory>
@ -25,14 +25,14 @@
#include <string>
#include <atomic>
template <typename T> using blocklist_t = std::vector<T>;
template <typename T> using blocklist_ptr_t = std::shared_ptr<std::vector<T>>;
template <typename T> using blocklistconst_ptr_t = std::shared_ptr<const std::vector<T>>;
template <typename T> using matchlist_t = std::vector<T>;
template <typename T> using matchlist_ptr_t = std::shared_ptr<std::vector<T>>;
template <typename T> using matchlistconst_ptr_t = std::shared_ptr<const std::vector<T>>;
template<class T> class BlockList {
template<class T> class MatchList {
public:
explicit BlockList(const char *filename);
virtual ~BlockList() = default;
explicit MatchList(const char *filename);
virtual ~MatchList() = default;
virtual bool init();
@ -42,18 +42,18 @@ public:
protected:
bool load();
virtual void addToBlockList(blocklist_ptr_t<T> &blockList, const std::string &line);
blocklistconst_ptr_t<T> getBlockList();
virtual void addToMatchList(matchlist_ptr_t<T> &matchList, const std::string &line);
matchlistconst_ptr_t<T> getMatchList();
const char *m_filename;
private:
void swapBlockList(blocklistconst_ptr_t<T> blockList);
void swapMatchList(matchlistconst_ptr_t<T> matchList);
std::atomic_bool m_loading;
blocklistconst_ptr_t<T> m_blockList;
matchlistconst_ptr_t<T> m_matchList;
time_t m_lastModifiedTime;
};
#endif //FX_BLOCKLIST_H
#endif //FX_MATCHLIST_H

@ -19,6 +19,7 @@
#include "Statistics.h"
#include "Sanity.h"
#include "UrlMatchList.h"
#include "ContentMatchList.h"
#include <string.h>
@ -1066,9 +1067,10 @@ static bool retryProxy(TcpSocket *ts, const char **msg, Msg13Request *r) {
return false;
}
// @todo ALC check content
return false;
size_t pre_size = mime.getMimeLen(); //size of http response line, mime headers and empty line separator
size_t haystack_size = ts->m_readOffset - pre_size;
const char *haystack = ts->m_readBuf + pre_size;
return g_contentRetryProxyList.isContentMatched(haystack, haystack_size);
}
static void appendCrawlBan(const char *group, const char *url, int urlLen) {

@ -9049,9 +9049,9 @@ void Parms::init ( ) {
m->m_page = PAGE_LOG;
m++;
m->m_title = "log trace info for BlockList";
m->m_title = "log trace info for MatchList";
m->m_cgi = "ltrc_bl";
simple_m_set(Conf,m_logTraceBlockList);
simple_m_set(Conf,m_logTraceMatchList);
m->m_def = "0";
m->m_page = PAGE_LOG;
m++;

@ -105,6 +105,7 @@
#include "IpBlockList.h"
#include "SpiderdbSqlite.h"
#include "QueryLanguage.h"
#include "ContentMatchList.h"
#include <sys/stat.h> //umask()
@ -1304,6 +1305,7 @@ int main2 ( int argc , char *argv[] ) {
g_dnsBlockList.init();
g_contentTypeBlockList.init();
g_ipBlockList.init();
g_contentRetryProxyList.init();
g_urlBlackList.init();
g_urlWhiteList.init();