wantedcheck shlib: check single content, example with cellery

This commit is contained in:
Ivan Skytte Jørgensen 2017-09-12 16:24:40 +02:00
parent d4313ac193
commit 7b6ba45c27
8 changed files with 104 additions and 3 deletions

@ -283,6 +283,8 @@ const char *mstrerror ( int errnum ) {
return "Doc blocked by shlib (url)";
case EBANNEDCRAWL:
return "Crawl banned by server";
case EDOCBLOCKEDSHLICONTENT:
return "Doc blocked by shlib (content)";
}
}
@ -468,6 +470,7 @@ static const char* s_errname[] {
STRINGIFY( EDOCBLOCKEDSHLIBDOMAIN ),
STRINGIFY( EDOCBLOCKEDSHLIBURL ),
STRINGIFY( EBANNEDCRAWL ),
STRINGIFY( EDOCBLOCKEDSHLICONTENT ),
};
#undef STRINGIFY

@ -196,6 +196,7 @@ enum {
EDOCBLOCKEDSHLIBDOMAIN,
EDOCBLOCKEDSHLIBURL,
EBANNEDCRAWL, // we are apparently banned/blacklisted by the Webserver/IDS/
EDOCBLOCKEDSHLICONTENT,
};
#endif // GB_ERRNO_H

@ -9,6 +9,7 @@
#include "SpiderProxy.h" // OP_GETPROXY OP_RETPROXY
#include "RdbCache.h"
#include "Collectiondb.h"
#include "WantedChecker.h"
#include "ip.h"
#include "GbUtil.h"
#include "zlib.h"
@ -1118,6 +1119,43 @@ static bool crawlWasBanned(TcpSocket *ts, const char **msg, Msg13Request *r) {
}
// Runs the body of a fetched HTTP reply through the WantedChecker shlib
// single-content check.
//
// ts  - socket holding the reply (m_readBuf / m_readOffset); may be NULL
// msg - out: set to "shlib-unwanted" when the content is rejected and to NULL
//       when the check ran without rejecting it. It is left untouched on the
//       two early exits (no socket, g_errno already set).
// r   - the request; only r->ptr_url is used here
//
// Returns true only when the reply has HTTP status 200 and the shlib reports
// the body as not wanted.
static bool contentIsUnwanted(TcpSocket *ts, const char **msg, Msg13Request *r) {
	log(LOG_INFO,"contentIsUnwanted: url=%s", r->ptr_url);
	//logTrace ..."contentIsUnwanted, %.*s", r->size_url, r->ptr_url);

	// no socket -> must be a bulk import job so obviously wanted
	if(!ts)
		return false;

	// we only do content checks if there weren't any other errors
	if(g_errno!=0)
		return false;

	//todo: if the server returned an empty response then we might be banned. But let's assume not for now.

	// parse the reply headers to learn the status and where the body starts
	HttpMime mime;
	mime.set ( ts->m_readBuf , ts->m_readOffset , NULL );

	int32_t httpStatus = mime.getHttpStatus();
	if(httpStatus == 200) { //ok
		//size of http response line, mime headers and empty line separator
		size_t pre_size = mime.getMimeLen();
		// Only hand the body to the shlib when the header length fits inside
		// what was actually read; otherwise the unsigned subtraction below
		// would wrap around and describe a huge, bogus buffer.
		if(ts->m_readOffset >= 0 && pre_size <= static_cast<size_t>(ts->m_readOffset)) {
			size_t haystack_size = static_cast<size_t>(ts->m_readOffset) - pre_size;
			const void *haystack = ts->m_readBuf + pre_size;
			if(!WantedChecker::check_single_content(r->ptr_url,haystack,haystack_size).wanted) {
				log(LOG_INFO,"Url %s is unwanted by shlib", r->ptr_url);
				*msg = "shlib-unwanted";
				return true;
			}
		}
	}

	//logTrace ..."Url content seems to be wanted");

	// otherwise assume the content is wanted.
	*msg = NULL;
	return false;
}
// come here after telling host #0 we are done using this proxy.
// host #0 will update the loadbucket for it, using m_lbId.
void gotHttpReply9 ( void *state , TcpSocket *ts ) {
@ -1280,6 +1318,14 @@ void gotHttpReply2 ( void *state ,
savedErr = g_errno = EBANNEDCRAWL;
}
if(contentIsUnwanted(ts,&banMsg,r)) {
log("msg13: url %.*s is unwanted (%s)"
, (int)r->size_url, r->ptr_url
, banMsg
);
savedErr = g_errno = EDOCBLOCKEDSHLICONTENT;
}
// . add to the table if not in there yet
// . store in our table of ips we should use proxies for
// . also start off with a crawldelay of like 1 sec for this
@ -1724,7 +1770,8 @@ void gotHttpReply2 ( void *state ,
err != EPIPE &&
// connection reset by peer
err != ECONNRESET &&
err != EBANNEDCRAWL)
err != EBANNEDCRAWL &&
err != EDOCBLOCKEDSHLICONTENT)
{
log("http: bad error from httpserver get doc: %s",
mstrerror(err));

@ -1,4 +1,5 @@
#include "WantedCheckerApi.h"
#include <string.h>
//Example library for checking if a domain/url/document is wanted or not.
@ -36,6 +37,17 @@ static WantedCheckApi::UrlCheckResult example_check_url(const std::string &url)
}
//Example single-content check: a document is rejected when its content
//contains the bytes "cellery", unless the url contains "destroy-all-cellery".
//NOTE: despite the noop_ prefix this function does filter; the name is kept
//because wanted_check_api_descriptor_block refers to it.
static WantedCheckApi::SingleContentCheckResult noop_check_single_content(const std::string &url, const void *content, size_t content_len) {
	static const char needle[] = "cellery";

	//raw byte search; the content is not required to be NUL-terminated
	const bool mentions_needle = memmem(content, content_len, needle, sizeof(needle)-1) != NULL;
	const bool is_good_site = url.find("destroy-all-cellery") != std::string::npos;

	WantedCheckApi::SingleContentCheckResult verdict;
	verdict.wanted = !mentions_needle || is_good_site;
	return verdict;
}
//No example for multi-content filtering
// static WantedCheckApi::ContentMultiCheckResult example_check_multi_content(const std::vector<WantedCheckApi::Content> &/*content*/) {
@ -49,5 +61,6 @@ static WantedCheckApi::UrlCheckResult example_check_url(const std::string &url)
//Entry points exported by this example library. The initializers are
//positional and follow the member order of WantedCheckApi::APIDescriptorBlock.
WantedCheckApi::APIDescriptorBlock wanted_check_api_descriptor_block = {
	/* check_domain_pfn         */ example_check_domain,
	/* check_url_pfn            */ example_check_url,
	/* check_single_content_pfn */ noop_check_single_content,
	/* check_multi_content_pfn  */ NULL //example_check_multi_content
};

@ -25,6 +25,12 @@ static WantedCheckApi::UrlCheckResult noop_check_url(const std::string &/*url*/)
return result;
}
//Built-in fallback for the single-content check: ignores its arguments and
//reports every document as wanted.
static WantedCheckApi::SingleContentCheckResult noop_check_single_content(const std::string & /*url*/, const void * /*content*/, size_t /*content_len*/) {
	WantedCheckApi::SingleContentCheckResult always_wanted = { true };
	return always_wanted;
}
static WantedCheckApi::MultiContentCheckResult noop_check_multi_content(const std::vector<WantedCheckApi::MultiContent> &/*content*/) {
WantedCheckApi::MultiContentCheckResult result;
result.result = result.wanted;
@ -41,7 +47,8 @@ static void *p_shlib = 0;
//The checks currently in effect. Starts out with the built-in no-op
//implementations; the initializers are positional and follow the member
//order of WantedCheckApi::APIDescriptorBlock (domain, url, single content,
//multi content), so exactly one entry per member is listed.
static WantedCheckApi::APIDescriptorBlock effective_descriptor_block = {
	noop_check_domain,
	noop_check_url,
	noop_check_single_content,
	noop_check_multi_content,
};
@ -69,6 +76,8 @@ bool WantedChecker::initialize() {
effective_descriptor_block.check_domain_pfn = desc->check_domain_pfn;
if(desc->check_url_pfn)
effective_descriptor_block.check_url_pfn = desc->check_url_pfn;
if(desc->check_single_content_pfn)
effective_descriptor_block.check_single_content_pfn = desc->check_single_content_pfn;
if(desc->check_multi_content_pfn)
effective_descriptor_block.check_multi_content_pfn = desc->check_multi_content_pfn;
@ -82,6 +91,7 @@ void WantedChecker::finalize() {
effective_descriptor_block.check_domain_pfn = noop_check_domain;
effective_descriptor_block.check_url_pfn = noop_check_url;
effective_descriptor_block.check_single_content_pfn = noop_check_single_content;
effective_descriptor_block.check_multi_content_pfn = noop_check_multi_content;
if(p_shlib) {
@ -101,3 +111,7 @@ WantedChecker::DomainCheckResult WantedChecker::check_domain(const std::string &
//Asks the currently effective url check about the given url and returns
//its verdict unchanged.
WantedChecker::UrlCheckResult WantedChecker::check_url(const std::string &url) {
	const auto url_checker = effective_descriptor_block.check_url_pfn;
	return url_checker(url);
}
//Asks the currently effective single-content check about the given
//url/content pair and returns its verdict unchanged.
//content points at content_len bytes; they are passed through as-is.
WantedChecker::SingleContentCheckResult WantedChecker::check_single_content(const std::string &url, const void *content, size_t content_len) {
	const auto content_checker = effective_descriptor_block.check_single_content_pfn;
	return content_checker(url, content, content_len);
}

@ -14,6 +14,10 @@ DomainCheckResult check_domain(const std::string &domain);
//Result type of check_url(); an alias of the API-level type so callers need not name WantedCheckApi.
typedef WantedCheckApi::UrlCheckResult UrlCheckResult;
//Checks a url with the url check that is currently in effect.
UrlCheckResult check_url(const std::string &url);
//Result type of check_single_content(); an alias of the API-level type.
typedef WantedCheckApi::SingleContentCheckResult SingleContentCheckResult;
//Checks one document with the single-content check that is currently in effect.
// url         - url the content was fetched from
// content     - pointer to the content bytes
// content_len - number of bytes at content
//The returned .wanted is false when the document should be rejected.
SingleContentCheckResult check_single_content(const std::string &url, const void *content, size_t content_len);
} //namespace
#endif

@ -38,7 +38,20 @@ struct UrlCheckResult {
typedef UrlCheckResult (*check_url_t)(const std::string &url);
// check_content
// check_single_content
//
//Called after content has been successfully fetched and transcoded into UTF-8
//NOTE(review): the contentIsUnwanted() caller passes the bytes that follow the
//HTTP headers in the socket read buffer; confirm they are already UTF-8 there.
//Possible outcomes:
// wanted
// unwanted
struct SingleContentCheckResult {
bool wanted; //true: document is wanted; false: the caller treats it as unwanted
};
//Signature of a single-content check: the document's url, a pointer to the
//content bytes and the number of bytes at that pointer.
typedef SingleContentCheckResult (*check_single_content_t)(const std::string &url, const void *content, size_t content_len);
// check_multi_content
//
//Called after content has been fetched and transcoded into UTF-8
//Possible outcomes:
@ -71,6 +84,7 @@ typedef MultiContentCheckResult (*check_multi_content_t)(const std::vector<Multi
//Table of check entry points exported by a checker library.
//Member order matters: descriptor blocks are filled in with positional
//initializers, so new members must be matched in every initializer.
//WantedChecker::initialize() only takes over entries that are non-null.
struct APIDescriptorBlock {
check_domain_t check_domain_pfn; //domain check
check_url_t check_url_pfn; //url check
check_single_content_t check_single_content_pfn; //check of one fetched document
check_multi_content_t check_multi_content_pfn; //check of several content items at once
};

@ -1742,6 +1742,11 @@ bool XmlDoc::indexDoc ( ) {
m_indexCodeValid = true;
}
if ( g_errno == EDOCBLOCKEDSHLICONTENT ) {
m_indexCode = g_errno;
m_indexCodeValid = true;
}
// default to internal error which will be retried forever otherwise
if ( ! m_indexCodeValid ) {
m_indexCode = EINTERNALERROR;//g_errno;