mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-06-05 21:19:33 -04:00
wantedcheck shlib: check single content, example with cellery
This commit is contained in:
parent
d4313ac193
commit
7b6ba45c27
@ -283,6 +283,8 @@ const char *mstrerror ( int errnum ) {
|
||||
return "Doc blocked by shlib (url)";
|
||||
case EBANNEDCRAWL:
|
||||
return "Crawl banned by server";
|
||||
case EDOCBLOCKEDSHLICONTENT:
|
||||
return "Doc blocked by shlib (content)";
|
||||
}
|
||||
}
|
||||
|
||||
@ -468,6 +470,7 @@ static const char* s_errname[] {
|
||||
STRINGIFY( EDOCBLOCKEDSHLIBDOMAIN ),
|
||||
STRINGIFY( EDOCBLOCKEDSHLIBURL ),
|
||||
STRINGIFY( EBANNEDCRAWL ),
|
||||
STRINGIFY( EDOCBLOCKEDSHLICONTENT ),
|
||||
};
|
||||
|
||||
#undef STRINGIFY
|
||||
|
1
Errno.h
1
Errno.h
@ -196,6 +196,7 @@ enum {
|
||||
EDOCBLOCKEDSHLIBDOMAIN,
|
||||
EDOCBLOCKEDSHLIBURL,
|
||||
EBANNEDCRAWL, // we are apparently banned/blacklisted by the Webserver/IDS/
|
||||
EDOCBLOCKEDSHLICONTENT,
|
||||
};
|
||||
|
||||
#endif // GB_ERRNO_H
|
||||
|
49
Msg13.cpp
49
Msg13.cpp
@ -9,6 +9,7 @@
|
||||
#include "SpiderProxy.h" // OP_GETPROXY OP_RETPROXY
|
||||
#include "RdbCache.h"
|
||||
#include "Collectiondb.h"
|
||||
#include "WantedChecker.h"
|
||||
#include "ip.h"
|
||||
#include "GbUtil.h"
|
||||
#include "zlib.h"
|
||||
@ -1118,6 +1119,43 @@ static bool crawlWasBanned(TcpSocket *ts, const char **msg, Msg13Request *r) {
|
||||
}
|
||||
|
||||
|
||||
// Decide whether a freshly fetched document must be rejected because the
// wanted-checker shlib does not want its content.
//
//   ts  - socket holding the raw HTTP reply (headers + body); may be NULL.
//   msg - out parameter. Set to "shlib-unwanted" when the content is
//         rejected and to NULL when the check ran without rejecting. It is
//         left untouched on the two early-outs below (no socket, or g_errno
//         already set).
//   r   - the fetch request; r->ptr_url is passed to the checker and logged.
//
// Returns true only when the reply has HTTP status 200 and
// WantedChecker::check_single_content() reports the body as not wanted.
static bool contentIsUnwanted(TcpSocket *ts, const char **msg, Msg13Request *r) {
	log(LOG_INFO,"contentIsUnwanted: url=%s", r->ptr_url);
	//logTrace ..."contentIsUnwanted, %.*s", r->size_url, r->ptr_url);

	// no socket -> must be a bulk import job so obviously wanted
	if(!ts)
		return false;

	// we only do the content check if there weren't any other errors
	if(g_errno!=0)
		return false;

	// parse the reply header to learn the status and where the body starts
	HttpMime mime;
	mime.set ( ts->m_readBuf , ts->m_readOffset , NULL );

	int32_t httpStatus = mime.getHttpStatus();
	if(httpStatus == 200) { //ok
		// size of http response line, mime headers and empty line separator
		const int64_t header_len = mime.getMimeLen();
		const int64_t bytes_read = ts->m_readOffset;

		// Only look at the body if the header length is sane. Without this
		// guard "bytes_read - header_len" would wrap around when computed
		// as size_t and the checker would be told to scan far past the end
		// of the read buffer.
		if(ts->m_readBuf && header_len >= 0 && header_len <= bytes_read) {
			const void *haystack = ts->m_readBuf + header_len;
			size_t haystack_size = static_cast<size_t>(bytes_read - header_len);

			if(!WantedChecker::check_single_content(r->ptr_url,haystack,haystack_size).wanted) {
				log(LOG_INFO,"Url %s is unwanted by shlib", r->ptr_url);
				*msg = "shlib-unwanted";
				return true;
			}
		}
	}

	// non-200 reply, unusable body, or the checker wants it: not blocked
	*msg = NULL;

	return false;
}
|
||||
|
||||
|
||||
// come here after telling host #0 we are done using this proxy.
|
||||
// host #0 will update the loadbucket for it, using m_lbId.
|
||||
void gotHttpReply9 ( void *state , TcpSocket *ts ) {
|
||||
@ -1280,6 +1318,14 @@ void gotHttpReply2 ( void *state ,
|
||||
savedErr = g_errno = EBANNEDCRAWL;
|
||||
}
|
||||
|
||||
if(contentIsUnwanted(ts,&banMsg,r)) {
|
||||
log("msg13: url %.*s is unwanted (%s)"
|
||||
, (int)r->size_url, r->ptr_url
|
||||
, banMsg
|
||||
);
|
||||
savedErr = g_errno = EDOCBLOCKEDSHLICONTENT;
|
||||
}
|
||||
|
||||
// . add to the table if not in there yet
|
||||
// . store in our table of ips we should use proxies for
|
||||
// . also start off with a crawldelay of like 1 sec for this
|
||||
@ -1724,7 +1770,8 @@ void gotHttpReply2 ( void *state ,
|
||||
err != EPIPE &&
|
||||
// connection reset by peer
|
||||
err != ECONNRESET &&
|
||||
err != EBANNEDCRAWL)
|
||||
err != EBANNEDCRAWL &&
|
||||
err != EDOCBLOCKEDSHLICONTENT)
|
||||
{
|
||||
log("http: bad error from httpserver get doc: %s",
|
||||
mstrerror(err));
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include "WantedCheckerApi.h"
|
||||
#include <string.h>
|
||||
|
||||
//Example library for checking if a domain/url/document is wanted or not.
|
||||
|
||||
@ -36,6 +37,17 @@ static WantedCheckApi::UrlCheckResult example_check_url(const std::string &url)
|
||||
}
|
||||
|
||||
|
||||
//Example single-content check.
//A document is reported as unwanted when its content contains the byte
//sequence "cellery", unless the url contains "destroy-all-cellery".
//Everything else is wanted.
//NOTE(review): despite the "noop_" prefix this function does filter; the
//name is kept because the descriptor table in this file refers to it.
static WantedCheckApi::SingleContentCheckResult noop_check_single_content(const std::string &url, const void *content, size_t content_len) {
	const bool mentions_cellery = memmem(content,content_len,"cellery",7)!=0;
	const bool is_good_site = url.find("destroy-all-cellery")!=std::string::npos;

	WantedCheckApi::SingleContentCheckResult verdict;
	verdict.wanted = !(mentions_cellery && !is_good_site);
	return verdict;
}
|
||||
|
||||
|
||||
//No example for multi-content filtering
|
||||
|
||||
// static WantedCheckApi::ContentMultiCheckResult example_check_multi_content(const std::vector<WantedCheckApi::Content> &/*content*/) {
|
||||
@ -49,5 +61,6 @@ static WantedCheckApi::UrlCheckResult example_check_url(const std::string &url)
|
||||
//Function table exported by this example library. The initializer is
//positional, so entries must stay in the order of the members of
//WantedCheckApi::APIDescriptorBlock.
WantedCheckApi::APIDescriptorBlock wanted_check_api_descriptor_block = {
	example_check_domain,        //domain check
	example_check_url,           //url check
	noop_check_single_content,   //single-content check
	NULL                         //example_check_multi_content
};
|
||||
|
@ -25,6 +25,12 @@ static WantedCheckApi::UrlCheckResult noop_check_url(const std::string &/*url*/)
|
||||
return result;
|
||||
}
|
||||
|
||||
//Default single-content check: ignores all of its arguments and reports
//every document as wanted.
static WantedCheckApi::SingleContentCheckResult noop_check_single_content(const std::string & /*url*/, const void * /*content*/, size_t /*content_len*/) {
	WantedCheckApi::SingleContentCheckResult always_wanted;
	always_wanted.wanted = true;
	return always_wanted;
}
|
||||
|
||||
static WantedCheckApi::MultiContentCheckResult noop_check_multi_content(const std::vector<WantedCheckApi::MultiContent> &/*content*/) {
|
||||
WantedCheckApi::MultiContentCheckResult result;
|
||||
result.result = result.wanted;
|
||||
@ -41,7 +47,8 @@ static void *p_shlib = 0;
|
||||
//Function table that the WantedChecker::check_*() wrappers dispatch through.
//It starts out with the no-op checks. WantedChecker::initialize() replaces
//the slots for which the loaded descriptor supplies a non-null function, and
//WantedChecker::finalize() puts the no-op checks back.
//The initializer is positional: one entry per APIDescriptorBlock member, in
//declaration order (domain, url, single content, multi content).
static WantedCheckApi::APIDescriptorBlock effective_descriptor_block = {
	noop_check_domain,
	noop_check_url,
	noop_check_single_content,
	noop_check_multi_content,
};
|
||||
|
||||
|
||||
@ -69,6 +76,8 @@ bool WantedChecker::initialize() {
|
||||
effective_descriptor_block.check_domain_pfn = desc->check_domain_pfn;
|
||||
if(desc->check_url_pfn)
|
||||
effective_descriptor_block.check_url_pfn = desc->check_url_pfn;
|
||||
if(desc->check_single_content_pfn)
|
||||
effective_descriptor_block.check_single_content_pfn = desc->check_single_content_pfn;
|
||||
if(desc->check_multi_content_pfn)
|
||||
effective_descriptor_block.check_multi_content_pfn = desc->check_multi_content_pfn;
|
||||
|
||||
@ -82,6 +91,7 @@ void WantedChecker::finalize() {
|
||||
|
||||
effective_descriptor_block.check_domain_pfn = noop_check_domain;
|
||||
effective_descriptor_block.check_url_pfn = noop_check_url;
|
||||
effective_descriptor_block.check_single_content_pfn = noop_check_single_content;
|
||||
effective_descriptor_block.check_multi_content_pfn = noop_check_multi_content;
|
||||
|
||||
if(p_shlib) {
|
||||
@ -101,3 +111,7 @@ WantedChecker::DomainCheckResult WantedChecker::check_domain(const std::string &
|
||||
//Run the currently installed url check on 'url' and return its verdict.
WantedChecker::UrlCheckResult WantedChecker::check_url(const std::string &url) {
	//dispatch through the effective function table
	return (*effective_descriptor_block.check_url_pfn)(url);
}
|
||||
|
||||
//Run the currently installed single-content check and return its verdict.
//  url         - url the content belongs to
//  content     - start of the content bytes
//  content_len - number of bytes at 'content'
WantedChecker::SingleContentCheckResult WantedChecker::check_single_content(const std::string &url, const void *content, size_t content_len) {
	//dispatch through the effective function table
	return (*effective_descriptor_block.check_single_content_pfn)(url, content, content_len);
}
|
||||
|
@ -14,6 +14,10 @@ DomainCheckResult check_domain(const std::string &domain);
|
||||
typedef WantedCheckApi::UrlCheckResult UrlCheckResult;
|
||||
UrlCheckResult check_url(const std::string &url);
|
||||
|
||||
typedef WantedCheckApi::SingleContentCheckResult SingleContentCheckResult;
|
||||
SingleContentCheckResult check_single_content(const std::string &url, const void *content, size_t content_len);
|
||||
|
||||
|
||||
} //namespace
|
||||
|
||||
#endif
|
||||
|
@ -38,7 +38,20 @@ struct UrlCheckResult {
|
||||
typedef UrlCheckResult (*check_url_t)(const std::string &url);
|
||||
|
||||
|
||||
// check_single_content
//Called after content has been successfully fetched and transcoded into UTF-8
//NOTE(review): the Msg13 caller passes the raw bytes that follow the HTTP
//header; confirm whether any transcoding has happened at that point.
//Possible outcomes:
//   wanted
//   unwanted

//Verdict of a single-content check.
struct SingleContentCheckResult {
	bool wanted;    //true: wanted, false: unwanted
};

//Signature of a single-content check: the url the content belongs to, a
//pointer to the content bytes and the number of bytes.
typedef SingleContentCheckResult (*check_single_content_t)(const std::string &url, const void *content, size_t content_len);
|
||||
|
||||
|
||||
// check_multi_content
|
||||
//
|
||||
//Called after content has been fetched and transcoded into UTF-8
|
||||
//Possible outcomes:
|
||||
@ -71,6 +84,7 @@ typedef MultiContentCheckResult (*check_multi_content_t)(const std::vector<Multi
|
||||
struct APIDescriptorBlock {
|
||||
check_domain_t check_domain_pfn;
|
||||
check_url_t check_url_pfn;
|
||||
check_single_content_t check_single_content_pfn;
|
||||
check_multi_content_t check_multi_content_pfn;
|
||||
|
||||
};
|
||||
|
@ -1742,6 +1742,11 @@ bool XmlDoc::indexDoc ( ) {
|
||||
m_indexCodeValid = true;
|
||||
}
|
||||
|
||||
if ( g_errno == EDOCBLOCKEDSHLICONTENT ) {
|
||||
m_indexCode = g_errno;
|
||||
m_indexCodeValid = true;
|
||||
}
|
||||
|
||||
// default to internal error which will be retried forever otherwise
|
||||
if ( ! m_indexCodeValid ) {
|
||||
m_indexCode = EINTERNALERROR;//g_errno;
|
||||
|
Loading…
x
Reference in New Issue
Block a user