mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-02-02 03:38:43 -05:00
67 lines
2.1 KiB
C++
67 lines
2.1 KiB
C++
#include "WantedCheckerApi.h"
|
|
#include <string.h>
|
|
|
|
//Example library for checking if a domain/url/document is wanted or not.
|
|
|
|
|
|
//Example domain check.
//Parameter:
//  domain  the domain name to judge, compared as-is (no case folding or other normalization is done here)
//Returns a result whose 'wanted' member is
//  false if the domain is one of the hard-coded spam domains,
//  false if the domain embeds ".com." anywhere other than as the start of a trailing ".com.br",
//  true  otherwise.
static WantedCheckApi::DomainCheckResult example_check_domain(const std::string &domain) {
	WantedCheckApi::DomainCheckResult result;
	result.wanted = true;

	//Filter out blatant spam
	if(domain=="spam.example.com" ||
	   domain=="phishing.example.com" ||
	   domain=="totally-not-a-scam-trust-me.example.com")
		result.wanted = false;

	//Filter out "statistics" sites or similar that embed common domains as sub-domains.
	//This is quite tricky but as an example we filter out www.blablabla.com.something (.com.br being the exception)
	//The exception must only apply when ".com.br" really is the end of the domain. A plain substring
	//test would also let through www.google.com.br.stats.example.net and www.google.com.stats.com.br,
	//which are exactly the kind of domains this filter is meant to catch. So the first ".com." found
	//has to be the one that begins the trailing ".com.br" (a trailing root dot is tolerated).
	const std::string::size_type first_com = domain.find(".com.");
	if(first_com!=std::string::npos) {
		const bool is_plain_com_br =
			domain.compare(first_com,std::string::npos,".com.br")==0 ||
			domain.compare(first_com,std::string::npos,".com.br.")==0;
		if(!is_plain_com_br)
			result.wanted = false;
	}

	return result;
}
|
|
|
|
|
|
//Example URL check.
//A URL is unwanted when it uses the fictitious scheme "spam://" or when the text
//"evil-penguin-on-hoverboard" occurs anywhere in it. Every other URL is wanted.
static WantedCheckApi::UrlCheckResult example_check_url(const std::string &url) {
	//Prefix test on the first 7 characters; a URL shorter than that can never match
	const bool has_spam_scheme = url.compare(0,7,"spam://")==0;
	const bool mentions_evil_penguin = url.find("evil-penguin-on-hoverboard")!=std::string::npos;

	WantedCheckApi::UrlCheckResult verdict;
	verdict.wanted = !(has_spam_scheme || mentions_evil_penguin);
	return verdict;
}
|
|
|
|
|
|
//Example single-document content check (despite the "noop_" prefix it does perform a check).
//Parameters:
//  url          URL the content belongs to
//  content      pointer to the raw content bytes
//  content_len  number of bytes at 'content'
//The document is unwanted when its content contains the byte sequence "cellery" and its URL
//does not contain "destroy-all-cellery". In every other case it is wanted.
static WantedCheckApi::SingleContentCheckResult noop_check_single_content(const std::string &url, const void *content, size_t content_len) {
	static const char needle[] = "cellery";

	WantedCheckApi::SingleContentCheckResult verdict;
	verdict.wanted = true;

	//Only look at the URL when the content actually mentions the word (sizeof-1 skips the NUL terminator)
	if(memmem(content,content_len,needle,sizeof(needle)-1)!=NULL) {
		const bool is_good_site = url.find("destroy-all-cellery")!=std::string::npos;
		if(!is_good_site)
			verdict.wanted = false;
	}

	return verdict;
}
|
|
|
|
|
|
//No example for multi-content filtering (the corresponding slot in the descriptor block below is NULL)
|
|
|
|
// static WantedCheckApi::ContentMultiCheckResult example_check_multi_content(const std::vector<WantedCheckApi::Content> &/*content*/) {
|
|
// WantedCheckApi::ContentMultiCheckResult result;
|
|
// result.result = result.wanted;
|
|
// return result;
|
|
// }
|
|
|
|
|
|
|
|
//Table of the check functions provided by this example library.
//The initializers are positional.
//NOTE(review): their order presumably has to match the member order of WantedCheckApi::APIDescriptorBlock - confirm against WantedCheckerApi.h
WantedCheckApi::APIDescriptorBlock wanted_check_api_descriptor_block = {
	example_check_domain,       //domain check
	example_check_url,          //url check
	noop_check_single_content,  //single-content check
	NULL                        //example_check_multi_content (no multi-content check provided)
};
|