Add ContentTypeBlockList to block by http content-type

This commit is contained in:
Ai Lin Chia 2017-11-15 12:36:00 +01:00
parent 09bf57e147
commit 9a3f87b56e
17 changed files with 404 additions and 125 deletions

115
BlockList.cpp Normal file
View File

@ -0,0 +1,115 @@
//
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// License TL;DR: If you change this file, you must publish your changes.
//
#include "BlockList.h"
#include "Log.h"
#include "Conf.h"
#include "Loop.h"
#include "JobScheduler.h"
#include <fstream>
#include <sys/stat.h>
#include <atomic>
/// Creates a block list backed by the file at `filename`.
///
/// Only the pointer is stored (no copy of the string is made), and nothing is
/// read from disk here: the object starts with an empty list and a last-modified
/// time of 0 until load() runs.
BlockList::BlockList(const char *filename)
    : m_filename(filename),
      m_loading(false),
      m_blockList(std::make_shared<blocklist_t>()),
      m_lastModifiedTime(0)
{
}
/// Registers the periodic reload callback with g_loop and performs the first load.
///
/// @return false when the sleep callback could not be registered (nothing is
///         loaded in that case), true otherwise. The result of the initial
///         load() is not reflected in the return value.
bool BlockList::init() {
    log(LOG_INFO, "Initializing BlockList with %s", m_filename);

    const bool registered = g_loop.registerSleepCallback(60000, this, &reload, "BlockList::reload", 0);
    if (registered) {
        // Load synchronously rather than asking the sleep callback to fire
        // immediately, so the list is usable even before g_loop is running.
        load();
        return true;
    }

    log(LOG_WARN, "BlockList:: Failed to register callback.");
    return false;
}
/// Sleep-callback entry point: hands the reload to the job scheduler.
///
/// @param state the BlockList instance that was passed at registration time.
void BlockList::reload(int /*fd*/, void *state) {
    const bool queued = g_jobScheduler.submit(reload, nullptr, state, thread_type_config_load, 0);
    if (!queued) {
        // The job could not be submitted, so fall back to reloading right here
        // on the calling (main) thread.
        reload(state);
    }
}
/// Job entry point: reloads the list unless a reload is already in progress.
///
/// @param state the BlockList instance to reload.
void BlockList::reload(void *state) {
    auto *self = static_cast<BlockList*>(state);

    // exchange() returns the previous flag value: true means another reload
    // currently owns the flag, so this call has nothing to do.
    const bool alreadyLoading = self->m_loading.exchange(true);
    if (!alreadyLoading) {
        self->load();
        self->m_loading = false;
    }
}
bool BlockList::load() {
logTrace(g_conf.m_logTraceBlockList, "Loading %s", m_filename);
struct stat st;
if (stat(m_filename, &st) != 0) {
// probably not found
log(LOG_INFO, "BlockList::load: Unable to stat %s", m_filename);
return false;
}
if (m_lastModifiedTime != 0 && m_lastModifiedTime == st.st_mtime) {
// not modified. assume successful
logTrace(g_conf.m_logTraceBlockList, "%s not modified", m_filename);
return true;
}
blocklist_ptr_t tmpBlockList(new blocklist_t);
std::ifstream file(m_filename);
std::string line;
while (std::getline(file, line)) {
// ignore comments & empty lines
if (line.length() == 0 || line[0] == '#') {
continue;
}
tmpBlockList->emplace_back(line);
logTrace(g_conf.m_logTraceBlockList, "Adding criteria '%s' to list", line.c_str());
}
swapBlockList(tmpBlockList);
m_lastModifiedTime = st.st_mtime;
logTrace(g_conf.m_logTraceBlockList, "Loaded %s", m_filename);
return true;
}
/// Returns a shared snapshot of the currently published list.
///
/// swapBlockList() replaces m_blockList with std::atomic_store, so the read
/// has to go through the matching std::atomic_load: a plain copy of the
/// shared_ptr while another thread stores into it is a data race.
blocklistconst_ptr_t BlockList::getBlockList() {
    return std::atomic_load(&m_blockList);
}
/// Publishes `blockList` as the current list by atomically replacing m_blockList.
///
/// @param blockList the fully built list that getBlockList() should hand out from now on.
void BlockList::swapBlockList(blocklistconst_ptr_t blockList) {
    // Sequentially consistent store, i.e. exactly what std::atomic_store does.
    std::atomic_store_explicit(&m_blockList, blockList, std::memory_order_seq_cst);
}

58
BlockList.h Normal file
View File

@ -0,0 +1,58 @@
//
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// License TL;DR: If you change this file, you must publish your changes.
//
#ifndef FX_BLOCKLIST_H
#define FX_BLOCKLIST_H
#include <memory>
#include <vector>
#include <string>
#include <atomic>
/// Criteria entries of a block list, one string per entry.
using blocklist_t = std::vector<std::string>;
/// Mutable handle, used while a new list is being built.
using blocklist_ptr_t = std::shared_ptr<blocklist_t>;
/// Read-only handle, used for the published list.
using blocklistconst_ptr_t = std::shared_ptr<const blocklist_t>;

/// Base class for file-backed block lists.
///
/// The list is read from a text file and re-read through a callback that
/// init() registers with the event loop. Derived classes obtain the current
/// entries through getBlockList() and implement their own matching rules.
class BlockList {
public:
    /// @param filename path of the list file. Only the pointer is stored, so
    ///                 the string must stay valid for as long as this object
    ///                 is used. `explicit` keeps a bare `const char*` from
    ///                 silently converting into a BlockList.
    explicit BlockList(const char *filename);

    /// Registers the reload callback and performs the initial load.
    /// @return false when the callback could not be registered.
    bool init();

    /// Sleep-callback entry point; `state` is the BlockList instance.
    static void reload(int /*fd*/, void *state);

    /// Job entry point; `state` is the BlockList instance.
    static void reload(void *state);

protected:
    /// Reads the file and publishes the new list.
    /// @return false when the file could not be loaded.
    bool load();

    /// Path of the list file (not owned).
    const char *m_filename;

    /// Returns the currently published list.
    blocklistconst_ptr_t getBlockList();

private:
    /// Replaces the published list with `blockList`.
    void swapBlockList(blocklistconst_ptr_t blockList);

    /// Set while a reload triggered through reload(void*) is running.
    std::atomic_bool m_loading;

    /// The published list.
    blocklistconst_ptr_t m_blockList;

    /// Modification time of the file at the last successful load, 0 before the first one.
    time_t m_lastModifiedTime;
};
#endif //FX_BLOCKLIST_H

View File

@ -233,6 +233,8 @@ Conf::Conf ( ) {
m_logDebugUrlAttempts = false;
m_logDebugVagus = false;
m_logTraceBigFile = false;
m_logTraceBlockList = false;
m_logTraceContentTypeBlockList = false;
m_logTraceDocDelete = false;
m_logTraceDns = false;
m_logTraceDnsBlockList = false;

2
Conf.h
View File

@ -383,6 +383,8 @@ class Conf {
bool m_logDebugVagus;
bool m_logTraceBigFile;
bool m_logTraceBlockList;
bool m_logTraceContentTypeBlockList;
bool m_logTraceDocDelete;
bool m_logTraceDns;
bool m_logTraceDnsBlockList;

79
ContentTypeBlockList.cpp Normal file
View File

@ -0,0 +1,79 @@
//
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// License TL;DR: If you change this file, you must publish your changes.
//
#include "ContentTypeBlockList.h"
#include "ScopedLock.h"
#include "Log.h"
#include "Conf.h"
#include <algorithm>
#include <fstream>
// Global instance; declared extern in ContentTypeBlockList.h.
ContentTypeBlockList g_contentTypeBlockList;
// File holding the block criteria; handed to the BlockList base class by the constructor.
// It is a relative path, so it is looked up in the process' working directory.
static const char s_contenttype_filename[] = "contenttypeblocklist.txt";
// File recording content types that were seen and not blocked: the constructor
// reads it back and addContentTypeAllowed() appends to it.
static const char s_contenttype_allowed_filename[] = "contenttypeallowed.txt";
/// Builds the list on top of the content type block file and pre-populates
/// the in-memory "allowed" list from the allowed file, one entry per line.
/// A missing or unreadable allowed file simply leaves that list empty.
ContentTypeBlockList::ContentTypeBlockList()
    : BlockList(s_contenttype_filename),
      m_contenttype_allowed(),
      m_contenttype_allowed_mtx(PTHREAD_MUTEX_INITIALIZER)
{
    std::ifstream file(s_contenttype_allowed_filename);

    ScopedLock sl(m_contenttype_allowed_mtx);
    for (std::string line; std::getline(file, line); ) {
        m_contenttype_allowed.push_back(line);
    }
}
/// Records `contentType` as allowed, both in memory and in the allowed file.
///
/// Entries already present (exact, case-sensitive comparison) are ignored, so
/// each distinct string is appended to the file at most once per list.
///
/// @param contentType the content type to remember.
void ContentTypeBlockList::addContentTypeAllowed(const std::string &contentType) {
    ScopedLock sl(m_contenttype_allowed_mtx);

    const auto last = m_contenttype_allowed.end();
    const bool alreadyKnown = (std::find(m_contenttype_allowed.begin(), last, contentType) != last);
    if (alreadyKnown) {
        return;
    }

    m_contenttype_allowed.push_back(contentType);

    // Append mode: earlier entries in the file are kept.
    std::ofstream file(s_contenttype_allowed_filename, std::ios::out | std::ios::app);
    file << contentType << std::endl;
}
/// Checks a content type against the block criteria.
///
/// A criteria ending in '*' is a prefix rule: it matches when the content type
/// starts with everything before the '*'. Any other criteria must match the
/// whole content type. Both comparisons ignore case.
///
/// A non-empty content type that matches nothing is passed to
/// addContentTypeAllowed() before returning.
///
/// @param contentType    start of the content type text (need not be NUL terminated).
/// @param contentTypeLen number of bytes of `contentType` to consider.
/// @return true when a criteria matched; false otherwise, including for an
///         empty content type.
bool ContentTypeBlockList::isContentTypeBlocked(const char *contentType, size_t contentTypeLen) {
    if (contentTypeLen == 0) {
        return false;
    }

    auto criteriaList = getBlockList();
    for (const std::string &criteria : *criteriaList) {
        const bool isPrefixRule = (criteria.back() == '*');

        // Number of leading bytes that have to agree: the text before '*' for
        // a prefix rule, the whole criteria otherwise.
        const size_t compareLen = isPrefixRule ? criteria.size() - 1 : criteria.size();
        const bool lengthFits = isPrefixRule ? (contentTypeLen >= compareLen)
                                             : (contentTypeLen == compareLen);

        if (!lengthFits || strncasecmp(criteria.c_str(), contentType, compareLen) != 0) {
            continue;
        }

        logTrace(g_conf.m_logTraceContentTypeBlockList, "Content type block criteria %s matched contenttype '%.*s'", criteria.c_str(), static_cast<int>(contentTypeLen), contentType);
        return true;
    }

    addContentTypeAllowed(std::string(contentType, contentTypeLen));
    return false;
}

42
ContentTypeBlockList.h Normal file
View File

@ -0,0 +1,42 @@
//
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// License TL;DR: If you change this file, you must publish your changes.
//
#ifndef FX_CONTENTTYPEBLOCKLIST_H
#define FX_CONTENTTYPEBLOCKLIST_H
#include "BlockList.h"
#include <pthread.h>
#include <vector>
// Block list keyed on HTTP content type. The block criteria come from the
// BlockList base class; on top of that this class keeps a list of content
// types that were checked and found not to be blocked.
class ContentTypeBlockList : public BlockList {
public:
// Sets up the base class with the content type block file and reads the
// previously recorded allowed content types.
ContentTypeBlockList();
// Returns true when the first contentTypeLen bytes of contentType match a
// block criteria. A non-empty content type that matches nothing is recorded
// through addContentTypeAllowed().
bool isContentTypeBlocked(const char *contentType, size_t contentTypeLen);
// Adds contentType to the allowed list (in memory and on disk) unless it is
// already present.
void addContentTypeAllowed(const std::string &contentType);
private:
// Content types recorded as allowed; guarded by m_contenttype_allowed_mtx.
std::vector<std::string> m_contenttype_allowed;
// Protects m_contenttype_allowed.
mutable pthread_mutex_t m_contenttype_allowed_mtx;
};
extern ContentTypeBlockList g_contentTypeBlockList;
#endif //FX_CONTENTTYPEBLOCKLIST_H

View File

@ -1,98 +1,35 @@
//
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// License TL;DR: If you change this file, you must publish your changes.
//
#include "DnsBlockList.h"
#include "Log.h"
#include "Conf.h"
#include "Loop.h"
#include "JobScheduler.h"
#include <fstream>
#include <sys/stat.h>
#include <atomic>
DnsBlockList g_dnsBlockList;
static const char s_dns_filename[] = "dnsblocklist.txt";
DnsBlockList::DnsBlockList()
: m_filename(s_dns_filename)
, m_loading(false)
, m_dnsBlockList(new dnsblocklist_t)
, m_lastModifiedTime(0) {
}
bool DnsBlockList::init() {
log(LOG_INFO, "Initializing DnsBlockList with %s", m_filename);
if (!g_loop.registerSleepCallback(60000, this, &reload, "DnsBlockList::reload", 0)) {
log(LOG_WARN, "DnsBlockList:: Failed to register callback.");
return false;
}
// we do a load here instead of using sleep callback with immediate set to true so
// we don't rely on g_loop being up and running to use dnsblocklist
load();
return true;
}
void DnsBlockList::reload(int /*fd*/, void *state) {
if (g_jobScheduler.submit(reload, nullptr, state, thread_type_config_load, 0)) {
return;
}
// unable to submit job (load on main thread)
reload(state);
}
void DnsBlockList::reload(void *state) {
DnsBlockList *dnsBlockList = static_cast<DnsBlockList*>(state);
// don't load multiple times at the same time
if (dnsBlockList->m_loading.exchange(true)) {
return;
}
dnsBlockList->load();
dnsBlockList->m_loading = false;
}
bool DnsBlockList::load() {
logTrace(g_conf.m_logTraceDnsBlockList, "Loading %s", m_filename);
struct stat st;
if (stat(m_filename, &st) != 0) {
// probably not found
log(LOG_INFO, "DnsBlockList::load: Unable to stat %s", m_filename);
return false;
}
if (m_lastModifiedTime != 0 && m_lastModifiedTime == st.st_mtime) {
// not modified. assume successful
logTrace(g_conf.m_logTraceDnsBlockList, "Not modified");
return true;
}
dnsblocklist_ptr_t tmpDnsBlockList(new dnsblocklist_t);
std::ifstream file(m_filename);
std::string line;
while (std::getline(file, line)) {
// ignore comments & empty lines
if (line.length() == 0 || line[0] == '#') {
continue;
}
tmpDnsBlockList->emplace_back(line);
logTrace(g_conf.m_logTraceDnsBlockList, "Adding criteria '%s' to list", line.c_str());
}
swapDnsBlockList(tmpDnsBlockList);
m_lastModifiedTime = st.st_mtime;
logTrace(g_conf.m_logTraceDnsBlockList, "Loaded %s", m_filename);
return true;
: BlockList(s_dns_filename) {
}
bool DnsBlockList::isDnsBlocked(const char *dns) {
auto dnsBlockList = getDnsBlockList();
auto dnsBlockList = getBlockList();
for (auto const &dnsBlock : *dnsBlockList) {
if (dnsBlock.front() == '*') {
@ -112,11 +49,3 @@ bool DnsBlockList::isDnsBlocked(const char *dns) {
return false;
}
dnsblocklistconst_ptr_t DnsBlockList::getDnsBlockList() {
return m_dnsBlockList;
}
void DnsBlockList::swapDnsBlockList(dnsblocklistconst_ptr_t dnsBlockList) {
std::atomic_store(&m_dnsBlockList, dnsBlockList);
}

View File

@ -1,41 +1,32 @@
#ifndef GB_DNSBLOCKLIST_H
#define GB_DNSBLOCKLIST_H
//
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// License TL;DR: If you change this file, you must publish your changes.
//
#ifndef FX_DNSBLOCKLIST_H
#define FX_DNSBLOCKLIST_H
#include <memory>
#include <vector>
#include <string>
#include <atomic>
#include "BlockList.h"
typedef std::vector<std::string> dnsblocklist_t;
typedef std::shared_ptr<dnsblocklist_t> dnsblocklist_ptr_t;
typedef std::shared_ptr<const dnsblocklist_t> dnsblocklistconst_ptr_t;
class DnsBlockList {
class DnsBlockList : public BlockList {
public:
DnsBlockList();
bool init();
bool isDnsBlocked(const char *dns);
static void reload(int /*fd*/, void *state);
static void reload(void *state);
protected:
bool load();
const char *m_filename;
private:
dnsblocklistconst_ptr_t getDnsBlockList();
void swapDnsBlockList(dnsblocklistconst_ptr_t dnsBlockList);
std::atomic_bool m_loading;
dnsblocklistconst_ptr_t m_dnsBlockList;
time_t m_lastModifiedTime;
};
extern DnsBlockList g_dnsBlockList;
#endif //GB_DNSBLOCKLIST_H
#endif //FX_DNSBLOCKLIST_H

View File

@ -74,6 +74,7 @@ void HttpMime::reset ( ) {
m_contentEncodingPos = NULL;
m_contentLengthPos = NULL;
m_contentTypePos = NULL;
m_contentTypeLen = 0;
m_cookies.clear();
}
@ -625,6 +626,7 @@ bool HttpMime::parseContentType(const char *field, size_t fieldLen) {
if (getValue(&value, &valueLen)) {
m_contentTypePos = value;
m_contentTypeLen = valueLen;
m_contentType = getContentTypePrivate(value, valueLen);
}

View File

@ -129,6 +129,7 @@ public:
// Accessors for the stored header positions. Each returns the corresponding
// m_*Pos member unchanged; reset() sets those members to NULL, so callers
// should expect NULL when the header has not been parsed.
const char *getContentEncodingPos() { return m_contentEncodingPos; }
const char *getContentLengthPos() { return m_contentLengthPos; }
const char *getContentTypePos() { return m_contentTypePos; }
int32_t getContentTypeLen() const { return m_contentTypeLen; }
// convert a file extension like "gif" to "images/gif"
static const char *getContentTypeFromExtension ( const char *ext ) ;
@ -237,7 +238,9 @@ private:
int32_t m_contentEncoding;
const char *m_contentEncodingPos;
const char *m_contentLengthPos;
const char *m_contentTypePos;
size_t m_contentTypeLen;
// Content-Type: text/html;charset=euc-jp // japanese (euc-jp)
// Content-Type: text/html;charset=gb2312 // chinese (gb2312)

View File

@ -58,6 +58,8 @@ OBJS_O2 = \
OBJS_O3 = \
BlockList.o \
ContentTypeBlockList.o \
DocDelete.o DnsBlockList.o \
IPAddressChecks.o \
LanguageResultOverride.o Linkdb.o \

View File

@ -8649,6 +8649,20 @@ void Parms::init ( ) {
m->m_page = PAGE_LOG;
m++;
m->m_title = "log trace info for BlockList";
m->m_cgi = "ltrc_bl";
simple_m_set(Conf,m_logTraceBlockList);
m->m_def = "0";
m->m_page = PAGE_LOG;
m++;
m->m_title = "log trace info for ContentTypeBlockList";
m->m_cgi = "ltrc_ctbl";
simple_m_set(Conf,m_logTraceContentTypeBlockList);
m->m_def = "0";
m->m_page = PAGE_LOG;
m++;
m->m_title = "log trace info for DocDelete";
m->m_cgi = "ltrc_docdel";
simple_m_set(Conf,m_logTraceDocDelete);

View File

@ -50,6 +50,7 @@
#include "GbDns.h"
#include "RobotsCheckList.h"
#include "UrlResultOverride.h"
#include "ContentTypeBlockList.h"
#include <iostream>
#include <fstream>
@ -2280,6 +2281,13 @@ int32_t *XmlDoc::getIndexCode ( ) {
return (int32_t *)mime;
}
if (g_contentTypeBlockList.isContentTypeBlocked(mime->getContentTypePos(), mime->getContentTypeLen())) {
m_indexCode = EDOCBADCONTENTTYPE;
m_indexCodeValid = true;
logTrace(g_conf.m_logTraceXmlDoc, "END, EDOCBADCONTENTTYPE");
return &m_indexCode;
}
// check redir url
Url **redirp = getRedirUrl();
if ( ! redirp || redirp == (void *)-1 ) {
@ -9240,8 +9248,6 @@ char **XmlDoc::getFilteredContent ( ) {
if ( *ct == CT_TEXT ) return &m_filteredContent;
if ( *ct == CT_XML ) return &m_filteredContent;
// javascript - sometimes has address information in it, so keep it!
if ( *ct == CT_JS ) return &m_filteredContent;
if ( m_contentLen == 0 ) return &m_filteredContent;
// we now support JSON for diffbot

View File

@ -81,6 +81,7 @@
#include "Dir.h"
#include "File.h"
#include "DnsBlockList.h"
#include "ContentTypeBlockList.h"
#include "UrlMatchList.h"
#include "UrlBlockCheck.h"
#include "DocDelete.h"
@ -1666,6 +1667,7 @@ int main2 ( int argc , char *argv[] ) {
// load block lists
g_dnsBlockList.init();
g_contentTypeBlockList.init();
g_urlBlackList.init();
g_urlWhiteList.init();

View File

@ -0,0 +1,29 @@
#include <gtest/gtest.h>
#include "ContentTypeBlockList.h"
/// Test double that points the list at an arbitrary file and exposes load().
class TestContentTypeBlockList : public ContentTypeBlockList {
public:
    /// @param filename criteria file to use instead of the production one.
    TestContentTypeBlockList(const char *filename)
        : ContentTypeBlockList()
    {
        // The base constructor has no filename parameter, so redirect the
        // protected member after construction.
        m_filename = filename;
    }

    // load() is protected in the base class; make it callable from tests.
    using ContentTypeBlockList::load;

    /// Convenience overload for NUL-terminated strings.
    bool isContentTypeBlocked(const char *str) {
        const size_t len = strlen(str);
        return ContentTypeBlockList::isContentTypeBlocked(str, len);
    }
};
TEST(ContentTypeBlockListTest, BlockList) {
    TestContentTypeBlockList contentTypeBlockList("blocklist/contenttype.txt");
    contentTypeBlockList.load();

    struct Expectation {
        const char *contentType;
        bool blocked;
    };

    const Expectation expectations[] = {
        // exact criteria: application/font-woff
        { "application/font-woff", true },
        { "application/font-woff-2", false },

        // prefix criteria: audio/*
        { "naudio/", false },
        { "audio/", true },
        { "audio/CN", true },
        { "audio/DAT12", true },
    };

    for (const Expectation &expectation : expectations) {
        EXPECT_EQ(expectation.blocked, contentTypeBlockList.isContentTypeBlocked(expectation.contentType))
            << "content type: " << expectation.contentType;
    }
}

View File

@ -6,6 +6,7 @@ BASE_DIR ?= ../..
TARGET = GigablastTest
OBJECTS = GigablastTest.o GigablastTestUtils.o \
BitOperationsTest.o BigFileTest.o \
ContentTypeBlockListTest.o \
DirTest.o DnsBlockListTest.o \
FctypesTest.o \
GbCacheTest.o \

View File

@ -0,0 +1,2 @@
application/font-woff
audio/*