privacore-open-source-searc.../WantedCheckerApi.h
2017-09-12 16:24:40 +02:00

100 lines
2.3 KiB
C++

#ifndef WANTEDCHECKERAPI_H
#define WANTEDCHECKERAPI_H
#include <string>
#include <vector>
#include <stddef.h>
namespace WantedCheckApi {
// check_domain()
//
//Decides whether a domain (eg. "www.example.com") is wanted.
//The callout is free to use any logic it likes to reach a verdict, including
//statistical analysis of the domain name itself. For example
//"fjsiurtiu.sjlqvnmsdf.ibuycarz.com" looks spammy.
//This callout is used before inserting into spider queue or doing DNS lookups.

//Verdict produced by a check_domain_t callout.
struct DomainCheckResult {
	bool wanted;	//true when the domain is wanted
};

//Function pointer type of the check_domain() callout.
//'domain' is the domain name to examine.
typedef DomainCheckResult (*check_domain_t)(const std::string &domain);
// check_url()
//
//Intended for checks on the path component of a URL that are hard to express
//as a regex. Examples:
//   proxy.example.com/www.spammy.spam.com
//   stat.example.com/www.legit_site.com
//   statistik.example.dk/statistik/www.legit_site.com?timespan=1y
//This callout is used before inserting into spider queue or doing DNS lookups.

//Verdict produced by a check_url_t callout.
struct UrlCheckResult {
	bool wanted;	//true when the URL is wanted
};

//Function pointer type of the check_url() callout.
//'url' is the URL to examine.
typedef UrlCheckResult (*check_url_t)(const std::string &url);
// check_single_content()
//
//Invoked once content has been successfully fetched and transcoded into UTF-8.
//Possible outcomes:
//   wanted
//   unwanted

//Verdict produced by a check_single_content_t callout.
struct SingleContentCheckResult {
	bool wanted;	//true = wanted, false = unwanted
};

//Function pointer type of the check_single_content() callout.
//'url' identifies the document, 'content' points at the fetched content and
//'content_len' is its length (presumably in bytes - confirm with the caller).
typedef SingleContentCheckResult (*check_single_content_t)(
	const std::string &url,
	const void *content,
	size_t content_len);
// check_multi_content()
//
//Invoked once content has been fetched and transcoded into UTF-8.
//Possible outcomes:
//   wanted
//   unwanted
//   dont_know_but_please_fetch_me_some_other_doc(casino.js)
//The first item in the 'content' array is the page we are asking about. The
//remaining items are documents the callout itself asked for.

//Verdict produced by a check_multi_content_t callout.
struct MultiContentCheckResult {
	enum {
		wanted,
		unwanted,
		fetch_other_page	//cannot decide yet; another document is requested
	} result;
	//Presumably the document to fetch when result is fetch_other_page.
	std::string other_url_to_fetch;
};

//One document handed to a check_multi_content_t callout.
struct MultiContent {
	const void *ptr;		//presumably the start of the document content
	size_t size;			//presumably the length of the data at 'ptr'
	std::string content_type;
	int http_result;		//-1 = could not fetch
};

//Function pointer type of the check_multi_content() callout.
typedef MultiContentCheckResult (*check_multi_content_t)(
	const std::vector<MultiContent> &content);
//Bundles the entry points of all four callouts, one function pointer each.
struct APIDescriptorBlock {
	check_domain_t         check_domain_pfn;          //check_domain() callout
	check_url_t            check_url_pfn;             //check_url() callout
	check_single_content_t check_single_content_pfn;  //check_single_content callout
	check_multi_content_t  check_multi_content_pfn;   //check_multi_content callout
};
} //namespace
//This is the symbol we will locate in the shlib (presumably the shared library
//that provides the callouts). It is only declared here; the definition is not
//part of this header.
extern WantedCheckApi::APIDescriptorBlock wanted_check_api_descriptor_block;
#endif