Merge branch 'master' into lemma

This commit is contained in:
Ivan Skytte Jørgensen
2018-06-22 13:05:59 +02:00
52 changed files with 1680 additions and 1463 deletions

@ -65,6 +65,16 @@ Conf::Conf ( ) {
m_maxOutstandingQueryLanguage = 0;
m_queryLanguageTimeout = 0;
m_siteMedianPageTemperatureServerName[0] = '\0';
m_siteMedianPageTemperatureServerPort = 0;
m_maxOutstandingSiteMedianPageTemperature = 0;
m_siteMedianPageTemperatureTimeout = 0;
m_siteNumInlinksServerName[0] = '\0';
m_siteNumInlinksServerPort = 0;
m_maxOutstandingSiteNumInlinks = 0;
m_siteNumInlinksTimeout = 0;
m_urlClassificationServerName[0] = '\0';
m_urlClassificationServerPort = 0;
m_maxOutstandingUrlClassifications = 0;
@ -233,8 +243,9 @@ Conf::Conf ( ) {
m_logDebugUrlAttempts = false;
m_logDebugVagus = false;
m_logTraceBigFile = false;
m_logTraceBlockList = false;
m_logTraceMatchList = false;
m_logTraceContentTypeBlockList = false;
m_logTraceDocid2FlagsAndSiteMap = false;
m_logTraceDocProcess = false;
m_logTraceDns = false;
m_logTraceDnsBlockList = false;
@ -270,6 +281,8 @@ Conf::Conf ( ) {
m_logTraceSpider = false;
m_logTraceSpiderUrlCache = false;
m_logTraceReindex = false;
m_logTraceSiteMedianPageTemperature = false;
m_logTraceSiteNumInlinks = false;
m_logTraceSpiderdbRdbSqliteBridge = false;
m_logTraceSummary = false;
m_logTraceTitledb = false;

15
Conf.h

@ -105,6 +105,16 @@ class Conf {
unsigned m_maxOutstandingQueryLanguage;
unsigned m_queryLanguageTimeout;
char m_siteMedianPageTemperatureServerName[64];
int32_t m_siteMedianPageTemperatureServerPort;
unsigned m_maxOutstandingSiteMedianPageTemperature;
unsigned m_siteMedianPageTemperatureTimeout;
char m_siteNumInlinksServerName[64];
int32_t m_siteNumInlinksServerPort;
unsigned m_maxOutstandingSiteNumInlinks;
unsigned m_siteNumInlinksTimeout;
char m_urlClassificationServerName[64];
int32_t m_urlClassificationServerPort;
unsigned m_maxOutstandingUrlClassifications;
@ -381,8 +391,9 @@ class Conf {
bool m_logDebugVagus;
bool m_logTraceBigFile;
bool m_logTraceBlockList;
bool m_logTraceMatchList;
bool m_logTraceContentTypeBlockList;
bool m_logTraceDocid2FlagsAndSiteMap;
bool m_logTraceDocProcess;
bool m_logTraceDns;
bool m_logTraceDnsBlockList;
@ -416,6 +427,8 @@ class Conf {
bool m_logTraceRepairs;
bool m_logTraceRobots;
bool m_logTraceRobotsCheckList;
bool m_logTraceSiteMedianPageTemperature;
bool m_logTraceSiteNumInlinks;
bool m_logTraceSpider;
bool m_logTraceSpiderUrlCache;
bool m_logTraceReindex;

41
ContentMatchList.cpp Normal file

@ -0,0 +1,41 @@
//
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// License TL;DR: If you change this file, you must publish your changes.
//
#include "ContentMatchList.h"
#include "Log.h"
#include "Conf.h"
ContentMatchList g_contentRetryProxyList;
static const char s_filename[] = "contentretryproxylist.txt";
// Constructs the list bound to its on-disk pattern file
// (contentretryproxylist.txt); actual loading/reloading is handled by the
// MatchList base class (init()/load()).
ContentMatchList::ContentMatchList()
: MatchList(s_filename) {
}
bool ContentMatchList::isContentMatched(const char *content, size_t contentLen) {
auto contentMatchList = getMatchList();
for (auto const &contentMatch : *contentMatchList) {
if (strncasestr(content, contentLen, contentMatch.c_str())) {
return true;
}
}
return false;
}

32
ContentMatchList.h Normal file

@ -0,0 +1,32 @@
//
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// License TL;DR: If you change this file, you must publish your changes.
//
#ifndef FX_CONTENTMATCHLIST_H
#define FX_CONTENTMATCHLIST_H
#include "MatchList.h"
// Substring match list applied to downloaded document *content* (not urls),
// backed by a periodically reloaded text file with one pattern per line.
// Used to decide when a fetch should be retried through a proxy.
class ContentMatchList : public MatchList<std::string> {
public:
ContentMatchList();
// True if any configured pattern occurs in [content, content+contentLen).
bool isContentMatched(const char *content, size_t contentLen);
};
extern ContentMatchList g_contentRetryProxyList;
#endif // FX_CONTENTMATCHLIST_H

@ -29,7 +29,7 @@ static const char s_contenttype_filename[] = "contenttypeblocklist.txt";
static const char s_contenttype_allowed_filename[] = "contenttypeallowed.txt";
ContentTypeBlockList::ContentTypeBlockList()
: BlockList(s_contenttype_filename)
: MatchList(s_contenttype_filename)
, m_contenttype_allowed()
, m_contenttype_allowed_mtx(PTHREAD_MUTEX_INITIALIZER) {
}
@ -43,7 +43,7 @@ bool ContentTypeBlockList::init() {
m_contenttype_allowed.push_back(line);
}
return BlockList::init();
return MatchList::init();
}
void ContentTypeBlockList::addContentTypeAllowed(const char *contentType, size_t contentTypeLen) {
@ -65,7 +65,7 @@ bool ContentTypeBlockList::isContentTypeBlocked(const char *contentType, size_t
return false;
}
auto contentTypeBlockList = getBlockList();
auto contentTypeBlockList = getMatchList();
for (auto const &contentTypeBlock : *contentTypeBlockList) {
if (contentTypeBlock.back() == '*') {

@ -20,11 +20,11 @@
#define FX_CONTENTTYPEBLOCKLIST_H
#include "BlockList.h"
#include "MatchList.h"
#include <pthread.h>
#include <vector>
class ContentTypeBlockList : public BlockList<std::string> {
class ContentTypeBlockList : public MatchList<std::string> {
public:
ContentTypeBlockList();

@ -25,11 +25,11 @@ DnsBlockList g_dnsBlockList;
static const char s_dns_filename[] = "dnsblocklist.txt";
DnsBlockList::DnsBlockList()
: BlockList(s_dns_filename) {
: MatchList(s_dns_filename) {
}
bool DnsBlockList::isDnsBlocked(const char *dns) {
auto dnsBlockList = getBlockList();
auto dnsBlockList = getMatchList();
for (auto const &dnsBlock : *dnsBlockList) {
if (dnsBlock.front() == '*') {

@ -19,9 +19,9 @@
#ifndef FX_DNSBLOCKLIST_H
#define FX_DNSBLOCKLIST_H
#include "BlockList.h"
#include "MatchList.h"
class DnsBlockList : public BlockList<std::string> {
class DnsBlockList : public MatchList<std::string> {
public:
DnsBlockList();
bool isDnsBlocked(const char *dns);

@ -95,7 +95,7 @@ void DocRebuild::processDocItem(DocProcessDocItem *docItem) {
return;
}
if (!xmlDoc->m_contentValid && !xmlDoc->set2(*oldTitleRec, -1, "main", nullptr, MAX_NICENESS)) {
if (!xmlDoc->m_contentValid && !xmlDoc->set2(*oldTitleRec, -1, "main", MAX_NICENESS)) {
xmlDoc->m_indexCode = ECORRUPTDATA;
xmlDoc->m_indexCodeValid = true;

@ -1,5 +1,6 @@
#include "Docid2Siteflags.h"
#include "Log.h"
#include "Conf.h"
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
@ -112,11 +113,13 @@ bool Docid2FlagsAndSiteMap::lookupSiteHash(uint64_t docid, uint32_t *sitehash32)
if(pos!=e.end()) {
if(pos->docid == docid) {
*sitehash32 = pos->sitehash32;
logTrace(g_conf.m_logTraceDocid2FlagsAndSiteMap, "Found record sitehash32=%u for docid=%lu", *sitehash32, docid);
return true;
} else
return false;
} else
return false;
}
}
logTrace(g_conf.m_logTraceDocid2FlagsAndSiteMap, "Record not found for docid=%lu", docid);
return false;
}
@ -129,9 +132,11 @@ bool Docid2FlagsAndSiteMap::lookupFlags(uint64_t docid, unsigned *flags) {
if(pos!=e.end()) {
if(pos->docid == docid) {
*flags = pos->flags;
logTrace(g_conf.m_logTraceDocid2FlagsAndSiteMap, "Found record flags=%u for docid=%lu", *flags, docid);
return true;
} else
return false;
} else
return false;
}
}
logTrace(g_conf.m_logTraceDocid2FlagsAndSiteMap, "Record not found for docid=%lu", docid);
return false;
}

@ -1921,6 +1921,11 @@ static bool isTLDForUrl(const char *tld, int32_t tldLen) {
// otherwise, if one period, check table to see if qualified
if( ! s_table.getNumSlots() ) {
log(LOG_ERROR,"%s:%d: Attempted to use uninitialized TLD table", __func__, __LINE__);
gbshutdownLogicError();
}
int64_t h = hash64Lower_a ( tld , tldLen ); // strlen(tld));
//return s_table.isInTable ( &h );//getScoreFromTermId ( h );
bool b = s_table.isInTable ( &h );//getScoreFromTermId ( h );
@ -1929,6 +1934,10 @@ static bool isTLDForUrl(const char *tld, int32_t tldLen) {
// Returns true if [tld, tld+tldLen) is a known top-level domain according to
// the static TLD table. Lookup is case-insensitive: hash64Lower_a lowercases
// before hashing. Aborts (gbshutdownLogicError) if called before the table
// has been initialized, instead of silently returning a wrong answer.
bool isTLD(const char *tld, int32_t tldLen) {
if( ! s_table.getNumSlots() ) {
log(LOG_ERROR,"%s:%d: Attempted to use uninitialized TLD table", __func__, __LINE__);
gbshutdownLogicError();
}
int64_t h = hash64Lower_a(tld, tldLen);
return s_table.isInTable(&h);
}

@ -424,7 +424,7 @@ void FxClient::reinitializeSettings(const char *hostname, int port, unsigned max
}
bool FxClient::sendRequest(fxclient_request_ptr_t request) {
if (m_outstanding_request_count >= m_max_outstanding) {
if (m_max_outstanding > 0 && m_outstanding_request_count >= m_max_outstanding) {
return false;
}
@ -433,7 +433,7 @@ bool FxClient::sendRequest(fxclient_request_ptr_t request) {
return false;
}
if (m_outstanding_request_count + m_queued_requests.size() >= m_max_outstanding) {
if (m_max_outstanding > 0 && m_outstanding_request_count + m_queued_requests.size() >= m_max_outstanding) {
return false;
}

@ -36,7 +36,6 @@
// a global class extern'd in .h file
HttpServer g_httpServer;
bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) ;
static bool sendPagePretty(TcpSocket *s, HttpRequest *r, const char *filename, const char *tabName);
// we get like 100k submissions a day!!!
@ -824,9 +823,6 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
if ( ! strncmp ( path ,"/api", pathLen ) )
return sendPageAPI ( s , r );
if ( ! strncmp ( path ,"/print", pathLen ) )
return sendPageAnalyze ( s , r );
// proxy should handle all regular file requests itself! that is
// generally faster i think, and, besides, sending pieces of a big
// file one at a time using our encapsulation method won't work! so

@ -26,11 +26,11 @@ IpBlockList g_ipBlockList;
static const char s_ip_filename[] = "ipblocklist.txt";
IpBlockList::IpBlockList()
: BlockList(s_ip_filename) {
: MatchList(s_ip_filename) {
}
bool IpBlockList::isIpBlocked(uint32_t ip) {
auto ipBlockList = getBlockList();
auto ipBlockList = getMatchList();
for (auto const &ipBlock : *ipBlockList) {
if (ipBlock == ip) {
@ -42,7 +42,7 @@ bool IpBlockList::isIpBlocked(uint32_t ip) {
return false;
}
void IpBlockList::addToBlockList(blocklist_ptr_t<uint32_t> &blockList, const std::string &line) {
void IpBlockList::addToMatchList(matchlist_ptr_t<uint32_t> &blockList, const std::string &line) {
in_addr addr;
if (inet_pton(AF_INET, line.c_str(), &addr) != 1) {

@ -19,15 +19,15 @@
#ifndef FX_IPBLOCKLIST_H
#define FX_IPBLOCKLIST_H
#include "BlockList.h"
#include "MatchList.h"
class IpBlockList : public BlockList<uint32_t> {
class IpBlockList : public MatchList<uint32_t> {
public:
IpBlockList();
bool isIpBlocked(uint32_t ip);
protected:
void addToBlockList(blocklist_ptr_t<uint32_t> &blockList, const std::string &line);
void addToMatchList(matchlist_ptr_t<uint32_t> &blockList, const std::string &line);
};

@ -60,7 +60,6 @@ bool getLinkInfo ( SafeBuf *reqBuf , // store msg25 request in here
int32_t niceness ,
bool doLinkSpamCheck ,
bool oneVotePerIpDom ,
bool canBeCancelled ,
int32_t lastUpdateTime ,
bool onlyNeedGoodInlinks ,
// if an inlinking document has an outlink
@ -262,10 +261,7 @@ class Xml;
class Inlink;
class LinkInfo {
public:
int32_t getStoredSize ( ) const { return m_lisize; }
public:
int32_t getSize ( ) const { return m_lisize; }
time_t getLastUpdated ( ) const { return (time_t)m_lastUpdated; }
@ -277,8 +273,6 @@ class LinkInfo {
return const_cast<LinkInfo*>(this)->getNextInlink(const_cast<Inlink*>(k));
}
bool getItemXml ( Xml *xml ) ;
bool hasLinkText() const;
// for PageTitledb

@ -19,7 +19,7 @@ OBJS_O0 = \
File.o \
FxTermCheckList.o FxCheckAdult.o FxCheckSpam.o \
GbMutex.o \
HashTable.o HighFrequencyTermShortcuts.o PageTemperatureRegistry.o SiteMedianPageTemperatureRegistry.o SiteDefaultPageTemperatureRemoteRegistry.o Docid2Siteflags.o HttpMime.o HttpRequest.o HttpServer.o Hostdb.o \
HashTable.o HighFrequencyTermShortcuts.o PageTemperatureRegistry.o SiteMedianPageTemperatureRegistry.o Docid2Siteflags.o HttpMime.o HttpRequest.o HttpServer.o Hostdb.o \
iana_charset.o Images.o ip.o \
JobScheduler.o Json.o \
Lang.o Log.o \
@ -59,8 +59,8 @@ OBJS_O2 = \
OBJS_O3 = \
BlockList.o \
ContentTypeBlockList.o \
MatchList.o \
ContentMatchList.o ContentTypeBlockList.o \
DocDelete.o DocProcess.o DocRebuild.o DocReindex.o DnsBlockList.o \
IPAddressChecks.o IpBlockList.o \
LanguageResultOverride.o Linkdb.o \
@ -94,8 +94,11 @@ OBJS_O3 = \
EGStack.o \
QueryLanguage.o \
FxClient.o \
SiteNumInlinks.o \
SiteMedianPageTemperature.o \
Lemma.o \
OBJS = $(OBJS_O0) $(OBJS_O1) $(OBJS_O2) $(OBJS_O3)

@ -16,7 +16,7 @@
//
// License TL;DR: If you change this file, you must publish your changes.
//
#include "BlockList.h"
#include "MatchList.h"
#include "Log.h"
#include "Conf.h"
#include "Loop.h"
@ -26,31 +26,31 @@
#include <atomic>
template <class T>
BlockList<T>::BlockList(const char *filename)
MatchList<T>::MatchList(const char *filename)
: m_filename(filename)
, m_loading(false)
, m_blockList(new blocklist_t<T>)
, m_matchList(new matchlist_t<T>)
, m_lastModifiedTime(0) {
}
template <class T>
bool BlockList<T>::init() {
log(LOG_INFO, "Initializing BlockList with %s", m_filename);
bool MatchList<T>::init() {
log(LOG_INFO, "Initializing MatchList with %s", m_filename);
if (!g_loop.registerSleepCallback(60000, this, &reload, "BlockList<T>::reload", 0)) {
log(LOG_WARN, "BlockList<T>:: Failed to register callback.");
if (!g_loop.registerSleepCallback(60000, this, &reload, "MatchList<T>::reload", 0)) {
log(LOG_WARN, "MatchList<T>:: Failed to register callback.");
return false;
}
// we do a load here instead of using sleep callback with immediate set to true so
// we don't rely on g_loop being up and running to use blocklist
// we don't rely on g_loop being up and running to use matchlist
load();
return true;
}
template <class T>
void BlockList<T>::reload(int /*fd*/, void *state) {
void MatchList<T>::reload(int /*fd*/, void *state) {
if (g_jobScheduler.submit(reload, nullptr, state, thread_type_config_load, 0)) {
return;
}
@ -60,36 +60,36 @@ void BlockList<T>::reload(int /*fd*/, void *state) {
}
template <class T>
void BlockList<T>::reload(void *state) {
BlockList *blockList = static_cast<BlockList*>(state);
void MatchList<T>::reload(void *state) {
MatchList *matchList = static_cast<MatchList*>(state);
// don't load multiple times at the same time
if (blockList->m_loading.exchange(true)) {
if (matchList->m_loading.exchange(true)) {
return;
}
blockList->load();
blockList->m_loading = false;
matchList->load();
matchList->m_loading = false;
}
template <class T>
bool BlockList<T>::load() {
logTrace(g_conf.m_logTraceBlockList, "Loading %s", m_filename);
bool MatchList<T>::load() {
logTrace(g_conf.m_logTraceMatchList, "Loading %s", m_filename);
struct stat st;
if (stat(m_filename, &st) != 0) {
// probably not found
log(LOG_INFO, "BlockList<T>::load: Unable to stat %s", m_filename);
log(LOG_INFO, "MatchList<T>::load: Unable to stat %s", m_filename);
return false;
}
if (m_lastModifiedTime != 0 && m_lastModifiedTime == st.st_mtime) {
// not modified. assume successful
logTrace(g_conf.m_logTraceBlockList, "%s not modified", m_filename);
logTrace(g_conf.m_logTraceMatchList, "%s not modified", m_filename);
return true;
}
blocklist_ptr_t<T> tmpBlockList(new blocklist_t<T>);
matchlist_ptr_t<T> tmpMatchList(new matchlist_t<T>);
std::ifstream file(m_filename);
std::string line;
@ -99,37 +99,37 @@ bool BlockList<T>::load() {
continue;
}
addToBlockList(tmpBlockList, line);
logTrace(g_conf.m_logTraceBlockList, "Adding criteria '%s' to list", line.c_str());
addToMatchList(tmpMatchList, line);
logTrace(g_conf.m_logTraceMatchList, "Adding criteria '%s' to list", line.c_str());
}
swapBlockList(tmpBlockList);
swapMatchList(tmpMatchList);
m_lastModifiedTime = st.st_mtime;
logTrace(g_conf.m_logTraceBlockList, "Loaded %s", m_filename);
logTrace(g_conf.m_logTraceMatchList, "Loaded %s", m_filename);
return true;
}
template <class T>
void BlockList<T>::addToBlockList(blocklist_ptr_t<T> &blockList, const std::string &line) {
void MatchList<T>::addToMatchList(matchlist_ptr_t<T> &matchList, const std::string &line) {
gbshutdownLogicError();
}
template <>
void BlockList<std::string>::addToBlockList(blocklist_ptr_t<std::string> &blockList, const std::string &line) {
blockList->emplace_back(line);
void MatchList<std::string>::addToMatchList(matchlist_ptr_t<std::string> &matchList, const std::string &line) {
matchList->emplace_back(line);
}
template <class T>
blocklistconst_ptr_t<T> BlockList<T>::getBlockList() {
return m_blockList;
matchlistconst_ptr_t<T> MatchList<T>::getMatchList() {
return m_matchList;
}
template <class T>
void BlockList<T>::swapBlockList(blocklistconst_ptr_t<T> blockList) {
std::atomic_store(&m_blockList, blockList);
void MatchList<T>::swapMatchList(matchlistconst_ptr_t<T> matchList) {
std::atomic_store(&m_matchList, matchList);
}
// explicit instantiations
template class BlockList<std::string>;
template class BlockList<uint32_t>;
template class MatchList<std::string>;
template class MatchList<uint32_t>;

@ -16,8 +16,8 @@
//
// License TL;DR: If you change this file, you must publish your changes.
//
#ifndef FX_BLOCKLIST_H
#define FX_BLOCKLIST_H
#ifndef FX_MATCHLIST_H
#define FX_MATCHLIST_H
#include <memory>
@ -25,14 +25,14 @@
#include <string>
#include <atomic>
template <typename T> using blocklist_t = std::vector<T>;
template <typename T> using blocklist_ptr_t = std::shared_ptr<std::vector<T>>;
template <typename T> using blocklistconst_ptr_t = std::shared_ptr<const std::vector<T>>;
template <typename T> using matchlist_t = std::vector<T>;
template <typename T> using matchlist_ptr_t = std::shared_ptr<std::vector<T>>;
template <typename T> using matchlistconst_ptr_t = std::shared_ptr<const std::vector<T>>;
template<class T> class BlockList {
template<class T> class MatchList {
public:
explicit BlockList(const char *filename);
virtual ~BlockList() = default;
explicit MatchList(const char *filename);
virtual ~MatchList() = default;
virtual bool init();
@ -42,18 +42,18 @@ public:
protected:
bool load();
virtual void addToBlockList(blocklist_ptr_t<T> &blockList, const std::string &line);
blocklistconst_ptr_t<T> getBlockList();
virtual void addToMatchList(matchlist_ptr_t<T> &matchList, const std::string &line);
matchlistconst_ptr_t<T> getMatchList();
const char *m_filename;
private:
void swapBlockList(blocklistconst_ptr_t<T> blockList);
void swapMatchList(matchlistconst_ptr_t<T> matchList);
std::atomic_bool m_loading;
blocklistconst_ptr_t<T> m_blockList;
matchlistconst_ptr_t<T> m_matchList;
time_t m_lastModifiedTime;
};
#endif //FX_BLOCKLIST_H
#endif //FX_MATCHLIST_H

121
Msg13.cpp

@ -18,6 +18,8 @@
#include "Pages.h"
#include "Statistics.h"
#include "Sanity.h"
#include "UrlMatchList.h"
#include "ContentMatchList.h"
#include <string.h>
@ -644,23 +646,28 @@ void downloadTheDocForReals2 ( Msg13Request *r ) {
bool useProxies = false;
// for diffbot turn ON if use robots is off
if ( r->m_forceUseFloaters ) useProxies = true;
CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );
// if you turned on automatically use proxies in spider controls...
if ( ! useProxies &&
cr &&
r->m_urlIp != 0 &&
r->m_urlIp != -1 &&
cr->m_automaticallyUseProxies &&
isIpInTwitchyTable( cr, r->m_urlIp ) )
useProxies = true;
// we gotta have some proxy ips that we can use
if ( ! g_conf.m_proxyIps.hasDigits() ) useProxies = false;
if (g_conf.m_proxyIps.hasDigits()) {
// for diffbot turn ON if use robots is off
if (r->m_forceUseFloaters) {
useProxies = true;
}
CollectionRec *cr = g_collectiondb.getRec(r->m_collnum);
// if you turned on automatically use proxies in spider controls...
if (!useProxies &&
cr && cr->m_automaticallyUseProxies &&
r->m_urlIp != 0 && r->m_urlIp != -1 && isIpInTwitchyTable(cr, r->m_urlIp)) {
useProxies = true;
}
Url url;
url.set(r->ptr_url, r->size_url);
if (g_urlProxyList.isUrlMatched(url)) {
useProxies = true;
}
}
// we did not need a spider proxy ip so send this request to a host
// to download the url
@ -1036,6 +1043,67 @@ static bool ipWasBanned(TcpSocket *ts, const char **msg, Msg13Request *r) {
return false;
}
// Appends "url|location" to retryproxy.txt in this host's working directory,
// recording urls whose fetch triggered a proxy retry. Best-effort logging:
// failure to open the file is silently ignored.
// @param url/urlLen         the original (non-nul-terminated) url
// @param location/locationLen optional redirect target; may be nullptr/0
static void appendRetryProxy(const char *url, int urlLen, const char *location = nullptr, int locationLen = 0) {
	char filename[1024];
	// snprintf instead of sprintf so an oversized host dir cannot overflow the buffer
	snprintf(filename, sizeof(filename), "%s/retryproxy.txt", g_hostdb.m_myHost->m_dir);
	FILE *fp = fopen(filename, "a");
	if (fp) {
		// never hand a null pointer to %.*s — UB even with zero precision
		fprintf(fp, "%.*s|%.*s\n", urlLen, url, locationLen, (location ? location : ""));
		fclose(fp);
	}
}
// Decides whether a just-downloaded document should be re-fetched through a
// proxy. Returns true (and sets *msg to a short reason string) when either:
//   - the reply is a redirect (301/302/307/308) whose Location url matches
//     the url retry-proxy list while the original url does not, or
//   - the reply body matches the content retry-proxy list.
// Also appends the url (and redirect target, if any) to retryproxy.txt.
// Only runs on a clean first attempt: skipped on error, missing socket,
// or when a proxy retry was already performed (m_proxyTries > 0).
static bool retryProxy(TcpSocket *ts, const char **msg, Msg13Request *r) {
if (!ts) {
return false;
}
//we only do proxy checks if there weren't any other error
if (g_errno != 0) {
return false;
}
// don't check for retries if it's already done
if (r->m_proxyTries > 0) {
return false;
}
Url url;
url.set(r->ptr_url, r->size_url);
// parse the http response headers out of the read buffer
HttpMime mime;
mime.set(ts->m_readBuf, ts->m_readOffset, &url);
int32_t httpStatus = mime.getHttpStatus();
if (httpStatus == 301 || httpStatus == 302 || httpStatus == 307 || httpStatus == 308) {
// we only retry when list matches redirected url & does not match original url
if (g_urlRetryProxyList.isUrlMatched(url)) {
return false;
}
const Url *location = mime.getLocationUrl();
if (g_urlRetryProxyList.isUrlMatched(*location)) {
*msg = "redir url proxy match list";
appendRetryProxy(url.getUrl(), url.getUrlLen(), location->getUrl(), location->getUrlLen());
return true;
}
// redirect, but neither url matched — no proxy retry
return false;
}
size_t pre_size = mime.getMimeLen(); //size of http response line, mime headers and empty line separator
// haystack = response body only (everything after the mime headers)
size_t haystack_size = ts->m_readOffset - pre_size;
const char *haystack = ts->m_readBuf + pre_size;
if (g_contentRetryProxyList.isContentMatched(haystack, haystack_size)) {
*msg = "content proxy match list";
appendRetryProxy(url.getUrl(), url.getUrlLen());
return true;
}
return false;
}
static void appendCrawlBan(const char *group, const char *url, int urlLen) {
char filename[1024];
@ -1332,6 +1400,13 @@ void gotHttpReply2 ( void *state ,
);
}
bool retry_proxy = false;
if (retryProxy(ts, &banMsg, r)) {
retry_proxy = true;
char ipbuf[16];
log("msg13: retry using proxy for url %s due to %s, for ip %s", r->ptr_url, banMsg, iptoa(r->m_urlIp, ipbuf));
}
if(crawlWasBanned(ts,&banMsg,r)) {
char ipbuf[16];
log("msg13: url %.*s detected as banned2 (%s), for ip %s"
@ -1369,8 +1444,7 @@ void gotHttpReply2 ( void *state ,
if ( banned &&
// retry iff we haven't already, but if we did stop the inf loop
! r->m_wasInTableBeforeStarting &&
cr &&
( cr->m_automaticallyBackOff || cr->m_automaticallyUseProxies ) &&
cr && ( cr->m_automaticallyBackOff || cr->m_automaticallyUseProxies ) &&
// but this is not for proxies... only native crawlbot backoff
! r->m_proxyIp ) {
// note this as well
@ -1388,6 +1462,19 @@ void gotHttpReply2 ( void *state ,
return;
}
if (retry_proxy) {
// note this as well
log("msg13: retrying spidered page with proxy for %s", r->ptr_url);
// reset error
g_errno = 0;
r->m_forceUseFloaters = 1;
downloadTheDocForReals2(r);
return;
}
// do not print this if we are already using proxies, it is for
// the auto crawldelay backoff logic only
if ( banned && r->m_wasInTableBeforeStarting && ! r->m_proxyIp )

779
Msg25.cpp

File diff suppressed because it is too large Load Diff

31
Msg25.h

@ -42,7 +42,6 @@ public:
int32_t m_niceness;
bool m_doLinkSpamCheck;
bool m_oneVotePerIpDom;
bool m_canBeCancelled;
int32_t m_lastUpdateTime;
bool m_onlyNeedGoodInlinks;
int32_t m_ourHostHash32;
@ -140,7 +139,6 @@ class Msg25 {
int32_t niceness,
bool doLinkSpamCheck,
bool oneVotePerIpDom,
bool canBeCancelled,
int32_t lastUpdateTime,
bool onlyNeedGoodInlinks,
// if an inlinking document has an outlink
@ -175,18 +173,9 @@ class Msg25 {
// private:
// these need to be public for wrappers to call:
bool gotTermFreq(bool msg42Called);
bool getRootTitleRec();
bool gotRootTitleRec();
bool gotDocId();
bool gotRootLinkText();
bool gotRootLinkText2();
bool getLinkingDocIds();
bool gotList();
bool gotClusterRecs();
bool sendRequests();
bool gotLinkText(class Msg20Request *req);
bool gotMsg25Reply();
bool doReadLoop();
// input vars
@ -200,8 +189,6 @@ class Msg25 {
uint64_t m_linkHash64;
key224_t m_nextKey;
bool m_retried;
bool m_prependWWW;
bool m_onlyNeedGoodInlinks;
int64_t m_docId;
collnum_t m_collnum;
@ -231,9 +218,6 @@ class Msg25 {
Inlink *m_k;
// for getting the root title rec so we can share its pwids
Msg22 m_msg22;
int32_t m_maxNumLinkers;
// should we free the m_replyPtrs on destruction? default=true
@ -258,10 +242,6 @@ class Msg25 {
int32_t m_minRecSizes;
// Msg20 is for getting the LinkInfo class from this same url's
// titleRec from another (usually much larger) gigablast cluster/network
Msg20 m_msg20;
// how many msg20s have we sent/recvd?
int32_t m_numRequests;
int32_t m_numReplies;
@ -278,20 +258,16 @@ class Msg25 {
bool m_oneVotePerIpDom;
bool m_doLinkSpamCheck;
bool m_isInjecting;
char m_canBeCancelled;
int32_t m_lastUpdateTime;
Multicast m_mcast;
int32_t m_good;
int32_t m_errors;
int32_t m_noText;
int32_t m_reciprocal;
bool m_spideringEnabled;
int32_t m_dupCount;
int32_t m_vectorDups;
int32_t m_spamLinks;
int32_t m_niceness;
int32_t m_numFromSameIp;
@ -299,12 +275,8 @@ class Msg25 {
// stats for allow some link spam inlinks to vote
int32_t m_spamCount;
int32_t m_spamWeight;
int32_t m_maxSpam;
char m_siteQuality;
int32_t m_siteNumFreshInlinks;
// this is used for the linkdb list
HashTableX m_ipTable;
HashTableX m_fullIpTable;
@ -321,9 +293,6 @@ class Msg25 {
int32_t m_linkSpamLinkdb;
int32_t m_ipDups;
uint32_t m_groupId;
int64_t m_probDocId;
LinkInfo *m_oldLinkInfo;
char m_buf[MAX_NOTE_BUF_LEN];

@ -29,17 +29,10 @@ public:
int32_t m_niceness;
XmlDoc m_xd;
lang_t m_langId;
//Msg8a m_msg8a;
//SiteRec m_sr;
//TagRec m_tagRec;
TcpSocket *m_socket;
HttpRequest m_r;
char m_coll[MAX_COLL_LEN+2];
//CollectionRec *m_cr;
bool m_isMasterAdmin;
//bool m_seq;
bool m_rtq;
//char m_q[MAX_QUERY_LEN+1];
SafeBuf m_qsb;
char m_qtmpBuf[128];
int32_t m_qlen;
@ -51,7 +44,6 @@ public:
int32_t m_strip;
bool m_cnsPage; // Are we in the click 'n' scroll page?
bool m_printDisclaimer;
bool m_netTestResults;
bool m_isBanned;
bool m_noArchive;
SafeBuf m_sb;
@ -65,7 +57,6 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
int32_t collLen = 0;
const char *coll = r->getString("c",&collLen);
if ( ! coll || ! coll[0] ) {
//coll = g_conf.m_defaultColl;
coll = g_conf.getDefaultColl( );
collLen = strlen(coll);
}
@ -103,13 +94,6 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno));
}
// . should we do a sequential lookup?
// . we need to match summary here so we need to know this
//bool seq = r->getLong ( "seq" , false );
// restrict to root file?
bool rtq = r->getLong ( "rtq" , 0) ? true : false;
// . get the titleRec
// . TODO: redirect client to a better http server to save bandwidth
State2 *st ;
@ -139,7 +123,6 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
}
strncpy ( st->m_coll , coll , MAX_COLL_LEN+1 );
// store query for query highlighting
st->m_netTestResults = r->getLong ("rnettest", 0) ? true : false;
st->m_qsb.setBuf ( st->m_qtmpBuf,128,0,false );
st->m_qsb.setLabel ( "qsbpg" );
@ -150,7 +133,6 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
st->m_qsb.safeStrcpy ( "" );
st->m_qlen = qlen;
st->m_rtq = rtq;
st->m_isBanned = false;
st->m_noArchive = false;
st->m_socket = s;
@ -166,14 +148,7 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
if ( st->m_strip ) {
st->m_printDisclaimer = false;
}
// should we cache it?
char useCache = r->getLong ( "usecache" , 1 );
char rcache = r->getLong ( "rcache" , 1 );
char wcache = r->getLong ( "wcache" , 1 );
int32_t cacheAge = r->getLong ( "cacheAge" , 60*60 ); // default one hour
if ( useCache == 0 ) { cacheAge = 0; wcache = 0; }
if ( rcache == 0 ) cacheAge = 0;
// . fetch the TitleRec
// . a max cache age of 0 means not to read from the cache
XmlDoc *xd = &st->m_xd;
@ -413,12 +388,11 @@ bool processLoop ( void *state ) {
// Moved over from PageResults.cpp
sb->safePrintf( "</span> - <a href=\""
"/get?"
"q=%s&amp;c=%s&amp;rtq=%" PRId32"&amp;"
"q=%s&amp;c=%s&amp;"
"d=%" PRId64"&amp;strip=1\""
" style=\"%s\">"
"[stripped]</a>",
q , st->m_coll ,
(int32_t)st->m_rtq,
q , st->m_coll ,
st->m_docId, styleLink );
// a link to alexa

@ -10,122 +10,49 @@
class State8 {
public:
//Msg16 m_msg16;
//Msg14 m_msg14;
//Msg15 m_msg15;
SafeBuf m_dbuf;
//XmlDoc m_doc;
//Url m_url;
//Url m_rootUrl;
const char *m_u;
int32_t m_ulen;
char m_rootQuality;
char m_coll[MAX_COLL_LEN];
int32_t m_collLen;
//int32_t m_sfn;
//int32_t m_urlLen;
TcpSocket *m_s;
char m_pwd[32];
HttpRequest m_r;
int32_t m_old;
// recyle the link info from the title rec?
int32_t m_recycle;
// recycle the link info that was imported from another coll?
int32_t m_recycle2;
bool m_render;
bool m_recompute;
int32_t m_oips;
char m_linkInfoColl[11];
// char m_buf[16384 * 1024];
//int32_t m_page;
// m_pbuf now points to m_sbuf if we are showing the parsing junk
SafeBuf m_xbuf;
SafeBuf m_wbuf;
bool m_donePrinting;
//SafeBuf m_sbuf;
// this is a buffer which cats m_sbuf into it
//SafeBuf m_sbuf2;
// new state vars for Msg3b.cpp
int64_t m_docId;
void *m_state ;
void (* m_callback) (void *state);
Query *m_q;
int64_t *m_termFreqs;
float *m_termFreqWeights;
float *m_affWeights;
//score_t m_total;
bool m_freeIt;
bool m_blocked;
// these are from rearranging the code
int32_t m_indexCode;
//uint64_t m_chksum1;
bool m_didRootDom;
bool m_didRootWWW;
bool m_wasRootDom;
// call Msg16 with a version of title rec to do
int32_t m_titleRecVersion;
//TitleRec m_tr;
//XmlDoc m_oldDoc;
XmlDoc m_xd;
};
// TODO: meta redirect tag to host if hostId not ours
static bool processLoop ( void *state ) ;
static bool gotXmlDoc ( void *state ) ;
static bool sendErrorReply ( void *state , int32_t err ) ;
static bool sendPageParser2 ( TcpSocket *s ,
HttpRequest *r ,
class State8 *st ,
int64_t docId ,
Query *q ,
int64_t *termFreqs ,
float *termFreqWeights ,
float *affWeights ,
void *state ,
void (* callback)(void *state) ) ;
static bool processLoop(void *state);
static bool sendErrorReply(void *state, int32_t err);
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . make a web page displaying the config of this host
// . call g_httpServer.sendDynamicPage() to send it
// . TODO: don't close this socket until httpserver returns!!
bool sendPageParser ( TcpSocket *s , HttpRequest *r ) {
return sendPageParser2 ( s , r , NULL , -1LL , NULL , NULL,
NULL , NULL, NULL , NULL );
}
// . a new interface so Msg3b can call this with "s" set to NULL
// . returns false if blocked, true otherwise
// . sets g_errno on error
static bool sendPageParser2 ( TcpSocket *s ,
HttpRequest *r ,
State8 *st ,
int64_t docId ,
Query *q ,
// in query term space, not imap space
int64_t *termFreqs ,
// in imap space
float *termFreqWeights ,
// in imap space
float *affWeights ,
void *state ,
void (* callback)(void *state) ) {
//log("parser: read sock=%" PRId32,s->m_sd);
// . TODO: don't close this socket until httpserver returns!!
bool sendPageParser(TcpSocket *s, HttpRequest *r) {
// might a simple request to addsomething to validated.*.txt file
// from XmlDoc::print() or XmlDoc::validateOutput()
//int64_t uh64 = r->getLongLong("uh64",0LL);
const char *uh64str = r->getString("uh64",NULL);
//char *divTag = r->getString("div",NULL);
if ( uh64str ) {
// make basic reply
const char *reply = "HTTP/1.0 200 OK\r\n"
@ -144,34 +71,18 @@ static bool sendPageParser2 ( TcpSocket *s ,
}
// make a state
if ( st ) st->m_freeIt = false;
if ( ! st ) {
try { st = new (State8); }
catch(std::bad_alloc&) {
g_errno = ENOMEM;
log("PageParser: new(%i): %s",
(int)sizeof(State8),mstrerror(g_errno));
return g_httpServer.sendErrorReply(s,500,
mstrerror(g_errno));}
mnew ( st , sizeof(State8) , "PageParser" );
st->m_freeIt = true;
}
// msg3b uses this to get a score from the query
st->m_state = state;
st->m_callback = callback;
st->m_q = q;
st->m_termFreqs = termFreqs;
st->m_termFreqWeights = termFreqWeights;
st->m_affWeights = affWeights;
//st->m_total = (score_t)-1;
State8 *st;
try { st = new (State8); }
catch(std::bad_alloc&) {
g_errno = ENOMEM;
log("PageParser: new(%i): %s",
(int)sizeof(State8),mstrerror(g_errno));
return g_httpServer.sendErrorReply(s,500,
mstrerror(g_errno));}
mnew ( st , sizeof(State8) , "PageParser" );
st->m_indexCode = 0;
st->m_blocked = false;
st->m_didRootDom = false;
st->m_didRootWWW = false;
st->m_wasRootDom = false;
st->m_u = NULL;
st->m_recompute = false;
//st->m_url.reset();
// password, too
int32_t pwdLen = 0;
@ -196,28 +107,14 @@ static bool sendPageParser2 ( TcpSocket *s ,
if ( st->m_titleRecVersion == -1 )
st->m_titleRecVersion = TITLEREC_CURRENT_VERSION;
//int32_t ulen = 0;
//char *u = r->getString ( "u" , &ulen , NULL /*default*/);
int32_t old = r->getLong ( "old", 0 );
// url will override docid if given
if ( ! st->m_u || ! st->m_u[0] )
st->m_docId = r->getLongLong ("docid",-1);
else
st->m_docId = -1;
// set url in state class (may have length 0)
//if ( u ) st->m_url.set ( u , ulen );
//st->m_urlLen = ulen;
st->m_u = st->m_r.getString("u",&st->m_ulen,NULL);
// should we recycle link info?
st->m_recycle = r->getLong("recycle",0);
st->m_recycle2 = r->getLong("recycleimp",0);
st->m_render = r->getLong("render" ,0) ? true : false;
// for quality computation... takes way longer cuz we have to
// lookup the IP address of every outlink, so we can get its root
// quality using Msg25 which needs to filter out voters from that IP
// range.
st->m_oips = r->getLong("oips" ,0);
int32_t linkInfoLen = 0;
// default is NULL
@ -227,15 +124,6 @@ static bool sendPageParser2 ( TcpSocket *s ,
// should we use the old title rec?
st->m_old = old;
//no more setting the default root quality to 30, instead if we do not
// know it setting it to -1
st->m_rootQuality=-1;
// header
SafeBuf *xbuf = &st->m_xbuf;
@ -299,21 +187,6 @@ static bool sendPageParser2 ( TcpSocket *s ,
"</td>"
"</tr>"
/*
"<tr class=poo>"
"<td>"
"Parser version to use: "
"</td>"
"<td>"
"<input type=text name=\"version\" size=\"4\" value=\"-1\"> "
"</td>"
"<td>"
"(-1 means to use latest title rec version)<br>"
"</td>"
"</tr>"
*/
"<tr class=poo>"
"<td>"
"<b>use cached</b>"
@ -328,20 +201,6 @@ static bool sendPageParser2 ( TcpSocket *s ,
"</td>"
"</tr>"
/*
"<tr class=poo>"
"<td>"
"Reparse root:"
"</td>"
"<td>"
"<input type=checkbox name=artr value=1%s> "
"</td>"
"<td>"
"Apply selected ruleset to root to update quality"
"</td>"
"</tr>"
*/
"<tr class=poo>"
"<td>"
"<b>recycle link info</b>"
@ -357,20 +216,6 @@ static bool sendPageParser2 ( TcpSocket *s ,
"</td>"
"</tr>"
/*
"<tr class=poo>"
"<td>"
"Recycle Link Info Imported:"
"</td>"
"<td>"
"<input type=checkbox name=recycleimp value=1%s> "
"</td>"
"<td>"
"Recycle the link info imported from other coll"
"</td>"
"</tr>"
*/
"<tr class=poo>"
"<td>"
"<b>render html</b>"
@ -385,33 +230,6 @@ static bool sendPageParser2 ( TcpSocket *s ,
"</td>"
"</tr>"
/*
"<tr class=poo>"
"<td>"
"Lookup outlinks' ruleset, ips, quality:"
"</td>"
"<td>"
"<input type=checkbox name=oips value=1%s> "
"</td>"
"<td>"
"To compute quality lookup IP addresses of roots "
"of outlinks."
"</td>"
"</tr>"
"<tr class=poo>"
"<td>"
"LinkInfo Coll:"
"</td>"
"<td>"
"<input type=text name=\"oli\" size=\"10\" value=\"\"> "
"</td>"
"<td>"
"Leave empty usually. Uses this coll to lookup link info."
"</td>"
"</tr>"
*/
"<tr class=poo>"
"<td>"
"<b>optional query</b>"
@ -443,7 +261,6 @@ static bool sendPageParser2 ( TcpSocket *s ,
"</td>"
"<td>"
//"<input type=checkbox name=xml value=1> "
"<select name=ctype>\n"
"<option value=%" PRId32" selected>HTML</option>\n"
"<option value=%" PRId32">XML</option>\n"
@ -477,12 +294,8 @@ static bool sendPageParser2 ( TcpSocket *s ,
"</center>"
"</form>"
"<br>",
//oips ,
contentParm );
xbuf->safePrintf(
"<center>"
"<input type=submit value=Submit>"
@ -526,18 +339,8 @@ static bool sendPageParser2 ( TcpSocket *s ,
uint8_t contentType = CT_HTML;
if ( r->getBool("xml",0) ) contentType = CT_XML;
contentType = r->getLong("ctype",contentType);//CT_HTML);
// if facebook, load xml content from title rec...
bool isFacebook = strstr(st->m_u,"http://www.facebook.com/") ? true : false;
if ( isFacebook && ! content ) {
int64_t docId = Titledb::getProbableDocId((char*)st->m_u);
sprintf(sreq.m_url ,"%" PRIu64 "", (uint64_t) docId);
sreq.m_isPageReindex = true;
}
// hack
if ( content ) {
st->m_dbuf.purge();
@ -588,17 +391,6 @@ bool processLoop ( void *state ) {
// print it out
xd->printDoc( &st->m_xbuf );
}
// print reason we can't analyze it (or index it)
//if ( st->m_indexCode != 0 ) {
// st->m_xbuf.safePrintf ("<br><br><b>indexCode: %s</b>\n<br>",
// mstrerror(st->m_indexCode));
//}
// print the final tail
//p += g_httpServer.printTail ( p , pend - p );
//log("parser: send sock=%" PRId32,st->m_s->m_sd);
// now encapsulate it in html head/tail and send it off
bool status = g_httpServer.sendDynamicPage( st->m_s ,
@ -611,10 +403,9 @@ bool processLoop ( void *state ) {
NULL,//cookie
"utf-8");
// delete the state now
if ( st->m_freeIt ) {
mdelete ( st , sizeof(State8) , "PageParser" );
delete (st);
}
mdelete ( st , sizeof(State8) , "PageParser" );
delete (st);
// return the status
return status;
}
@ -643,250 +434,3 @@ bool sendErrorReply ( void *state , int32_t err ) {
//return g_httpServer.sendDynamicPage ( s , tmp , strlen(tmp) );
return g_httpServer.sendErrorReply ( s, err, mstrerror(err) );
}
// for procog
// . "analyze" page used by procog: parses/re-indexes a single document and
//   prints diagnostic output for it
// . the doc is selected either by docid ("d"/"docid" cgi parm) or by url
//   ("u"/"url" cgi parm); content may also be supplied inline ("content")
// . returns false if the operation blocked (gotXmlDoc() will be called
//   later), true otherwise; sets g_errno on error
bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) {
	// make a state
	State8 *st;
	try { st = new (State8); }
	catch(std::bad_alloc&) {
		g_errno = ENOMEM;
		log("PageParser: new(%i): %s",
		    (int)sizeof(State8),mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,
		                                   mstrerror(g_errno));}
	mnew ( st , sizeof(State8) , "PageParser" );
	// gotXmlDoc() frees the state only when m_freeIt is set
	st->m_freeIt = true;
	st->m_state = NULL;
	//st->m_callback = callback;
	//st->m_q = q;
	//st->m_termFreqs = termFreqs;
	//st->m_termFreqWeights = termFreqWeights;
	//st->m_affWeights = affWeights;
	//st->m_total = (score_t)-1;
	st->m_indexCode = 0;
	st->m_blocked = false;
	st->m_didRootDom = false;
	st->m_didRootWWW = false;
	st->m_wasRootDom = false;
	st->m_u = NULL;
	// password, too (truncated to 31 chars to fit m_pwd)
	int32_t pwdLen = 0;
	const char *pwd = r->getString ( "pwd" , &pwdLen );
	if ( pwdLen > 31 ) pwdLen = 31;
	if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen );
	st->m_pwd[pwdLen]='\0';
	// save socket ptr
	st->m_s = s;
	// keep our own copy of the request; it outlives this call
	st->m_r.copy ( r );
	// get the collection; fall back to the configured default
	const char *coll = r->getString ( "c" , &st->m_collLen ,NULL /*default*/);
	if ( ! coll ) coll = g_conf.m_defaultColl;
	int32_t collLen = strlen(coll);
	if ( collLen > MAX_COLL_LEN ) return sendErrorReply ( st , ENOBUFS );
	strcpy ( st->m_coll , coll );
	// version to use, if -1 use latest
	st->m_titleRecVersion = r->getLong("version",-1);
	if ( st->m_titleRecVersion == -1 )
		st->m_titleRecVersion = TITLEREC_CURRENT_VERSION;
	int32_t old = r->getLong ( "old", 0 );
	// url will override docid if given; "docid" overrides "d"
	st->m_docId = r->getLongLong ("d",-1);
	st->m_docId = r->getLongLong ("docid",st->m_docId);
	int32_t ulen;
	const char *u = st->m_r.getString("u",&ulen,NULL);
	if ( ! u ) u = st->m_r.getString("url",&ulen,NULL);
	// need at least one of url/docid to identify the document
	if ( ! u && st->m_docId == -1LL )
		return sendErrorReply ( st , EBADREQUEST );
	// set url in state class (may have length 0)
	//if ( u ) st->m_url.set ( u , ulen );
	//st->m_urlLen = ulen;
	st->m_u = u;
	st->m_ulen = 0;
	if ( u ) st->m_ulen = strlen(u);
	// should we recycle link info? (defaults to 1 here, unlike the
	// parser page which defaults to 0)
	st->m_recycle = r->getLong("recycle",1);
	st->m_recycle2 = r->getLong("recycleimp",0);
	st->m_render = r->getLong("render" ,0) ? true : false;
	st->m_recompute = r->getLong("recompute" ,0) ? true : false;
	// for quality computation... takes way longer cuz we have to
	// lookup the IP address of every outlink, so we can get its root
	// quality using Msg25 which needs to filter out voters from that IP
	// range.
	st->m_oips = r->getLong("oips" ,0);
	//st->m_page = r->getLong("page",1);
	int32_t linkInfoLen = 0;
	// default is NULL
	const char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL );
	if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl );
	else st->m_linkInfoColl[0] = '\0';
	// should we use the old title rec?
	st->m_old = old;
	//no more setting the default root quality to 30, instead if we do not
	// know it setting it to -1
	st->m_rootQuality=-1;
	// header
	//st->m_xbuf.safePrintf("<meta http-equiv=\"Content-Type\" "
	//	     "content=\"text/html; charset=utf-8\">\n");
	XmlDoc *xd = &st->m_xd;
	int32_t isXml = r->getLong("xml",0);
	// if got docid, use that: load the doc from its stored title rec
	if ( st->m_docId != -1 ) {
		if ( ! xd->set3 ( st->m_docId,
				  st->m_coll,
				  0 ) ) // niceness
			// return error reply if g_errno is set
			return sendErrorReply ( st , g_errno );
		// make this our callback in case something blocks
		xd->setCallback ( st , gotXmlDoc );
		xd->m_pbuf = &st->m_wbuf;
		// reset this flag
		st->m_donePrinting = false;
		// . set xd from the old title rec if recycle is true
		// . can also use XmlDoc::m_loadFromOldTitleRec flag
		//if ( st->m_recycle ) xd->m_recycleContent = true;
		xd->m_recycleContent = true;
		// force this on
		//xd->m_useSiteLinkBuf = true;
		//xd->m_usePageLinkBuf = true;
		if ( isXml ) xd->m_printInXml = true;
		// now tell it to fetch the old title rec
		if ( ! xd->loadFromOldTitleRec () )
			// return false if this blocks
			return false;
		return gotXmlDoc ( st );
	}
	// no docid: build a fake spider request for the supplied url
	SpiderRequest sreq;
	if ( st->m_u ) strcpy(sreq.m_url,st->m_u);
	// fake a first ip from the url hash; must be non-zero/non-negative-one
	int32_t firstIp = hash32n(st->m_u);
	if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
	// parentdocid of 0
	sreq.setKey( firstIp, 0LL, false );
	sreq.m_isPageParser = 1;
	sreq.m_fakeFirstIp = 1;
	sreq.m_firstIp = firstIp;
	Url nu;
	nu.set(sreq.m_url);
	sreq.m_domHash32 = nu.getDomainHash32();
	sreq.m_siteHash32 = nu.getHostHash32();
	// . get provided content if any
	// . will be NULL if none provided
	// . "content" may contain a MIME
	int32_t contentLen = 0;
	const char *content = r->getString ( "content" , &contentLen , NULL );
	if ( ! content ) {
		content = r->getUnencodedContent ();
		contentLen = r->getUnencodedContentLen ();
	}
	// ensure null
	if ( contentLen == 0 ) content = NULL;
	int32_t ctype = r->getLong("ctype",CT_HTML);
	// . use the enormous power of our new XmlDoc class
	// . this returns false if blocked
	if ( ! xd->set4 ( &sreq ,
			  NULL ,
			  (char*)st->m_coll ,
			  // we need this so the term table is set!
			  &st->m_wbuf , // XmlDoc::m_pbuf
			  0, // niceness
			  (char*)content ,
			  false, // deletefromindex
			  0, // forced ip
			  ctype ))
		// return error reply if g_errno is set
		return sendErrorReply ( st , g_errno );
	// make this our callback in case something blocks
	xd->setCallback ( st , gotXmlDoc );
	// reset this flag
	st->m_donePrinting = false;
	// prevent a core here in the event we download the page content
	xd->m_crawlDelayValid = true;
	xd->m_crawlDelay = 0;
	// . set xd from the old title rec if recycle is true
	// . can also use XmlDoc::m_loadFromOldTitleRec flag
	//if ( st->m_recycle ) xd->m_recycleContent = true;
	// only recycle if docid is given!!
	if ( st->m_recycle ) xd->m_recycleContent = true;
	// force this on
	//xd->m_useSiteLinkBuf = true;
	//xd->m_usePageLinkBuf = true;
	if ( isXml ) xd->m_printInXml = true;
	return gotXmlDoc ( st );
}
// . callback invoked once the XmlDoc set up by sendPageAnalyze() is ready
//   (or immediately if nothing blocked)
// . prints the analysis output, sends the HTTP reply and frees the state
// . returns false if printDocForProCog() blocked (we will be re-called),
//   true otherwise
bool gotXmlDoc ( void *state ) {
	// cast it
	State8 *st = (State8 *)state;
	// get the xmldoc
	XmlDoc *xd = &st->m_xd;
	// if we loaded from old title rec, it should be there!
	// error?
	if ( g_errno ) return sendErrorReply ( st , g_errno );
	// only print when we actually have a doc (url or docid given) and
	// have not already printed on a previous (blocked) pass
	bool printIt = false;
	if ( st->m_u && st->m_u[0] ) printIt = true;
	if ( st->m_docId != -1LL ) printIt = true;
	if ( st->m_donePrinting ) printIt = false;
	// do not re-call this if printDocForProCog blocked... (check length())
	if ( printIt ) {
		// mark as done
		st->m_donePrinting = true;
		// always re-compute the page inlinks dynamically, do not
		// use the ptr_linkInfo1 stored in titlerec!!
		// NO! not if set from titlerec/docid
		if ( st->m_recompute )
			xd->m_linkInfo1Valid = false;
		// . print it out
		// . returns false if blocks, true otherwise
		// . sets g_errno on error
		if ( ! xd->printDocForProCog ( &st->m_xbuf, &st->m_r ) )
			return false;
		// error?
		if ( g_errno ) return sendErrorReply ( st , g_errno );
	}
	// reply content type: XML if the "xml" cgi parm was given
	int32_t isXml = st->m_r.getLong("xml",0);
	char ctype2 = CT_HTML;
	if ( isXml ) ctype2 = CT_XML;
	// now encapsulate it in html head/tail and send it off
	bool status = g_httpServer.sendDynamicPage( st->m_s ,
	                                            st->m_xbuf.getBufStart(),
	                                            st->m_xbuf.length() ,
	                                            -1, //cachtime
	                                            false ,//postreply?
	                                            &ctype2,
	                                            -1 , //httpstatus
	                                            NULL,//cookie
	                                            "utf-8");
	// delete the state now (only if we own it)
	if ( st->m_freeIt ) {
		mdelete ( st , sizeof(State8) , "PageParser" );
		delete (st);
	}
	// return the status
	return status;
}

@ -4,6 +4,4 @@
class TcpSocket;
class HttpRequest;
bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) ;
#endif // GB_PAGEPARSER_H

175
Parms.cpp

@ -38,6 +38,8 @@
#include "GbDns.h"
#include "SiteMedianPageTemperatureRegistry.h"
#include "QueryLanguage.h"
#include "SiteNumInlinks.h"
#include "SiteMedianPageTemperature.h"
#include <set>
#include <fstream>
@ -1550,11 +1552,11 @@ bool Parms::printParm( SafeBuf* sb,
// . make at least as big as a int64_t
if ( j >= jend ) s = "\0\0\0\0\0\0\0\0";
// delimit each cgi var if we need to
if ( m->m_cgi && strlen(m->m_cgi) > 45 ) {
char cgi[128];
if ( m->m_cgi && strlen(m->m_cgi)+10 >= sizeof(cgi) ) { //10 digits
log(LOG_LOGIC,"admin: Cgi variable is TOO big.");
g_process.shutdownAbort(true);
}
char cgi[64];
if ( m->m_cgi ) {
if ( j > 0 ) sprintf ( cgi , "%s%" PRId32 , m->m_cgi , j );
else sprintf ( cgi , "%s" , m->m_cgi );
@ -3679,6 +3681,15 @@ void Parms::init ( ) {
m->m_page = PAGE_RESULTS;
m++;
m->m_title = "adjective neuter<->common variants";
m->m_desc = "Extend to both grammatical genders";
simple_m_set(SearchInput,m_word_variations_config.m_word_variations_weights.adjective_grammatical_gender_simplification);
m->m_defOff= offsetof(CollectionRec,m_word_variations_config.m_word_variations_weights.adjective_grammatical_gender_simplification);
m->m_cgi = "lwv_adjective_grammatical_gender_simplification";
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m++;
// limit to this # of the top term pairs from inlink text whose
// score is accumulated
@ -5496,7 +5507,7 @@ void Parms::init ( ) {
m->m_off = offsetof(Conf,m_queryLanguageServerName);
m->m_type = TYPE_STRING;
m->m_def = "localhost";
m->m_size = sizeof(Conf::m_urlClassificationServerName);
m->m_size = sizeof(Conf::m_queryLanguageServerName);
m->m_obj = OBJ_CONF;
m->m_group = true;
m->m_page = PAGE_MASTER;
@ -5541,6 +5552,108 @@ void Parms::init ( ) {
m->m_flags = PF_REBUILDQUERYLANGSETTINGS;
m++;
m->m_title = "Site median page temperature server name";
m->m_desc = "";
m->m_cgi = "smpt_server_name";
m->m_off = offsetof(Conf,m_siteMedianPageTemperatureServerName);
m->m_type = TYPE_STRING;
m->m_def = "localhost";
m->m_size = sizeof(Conf::m_siteNumInlinksServerName);
m->m_obj = OBJ_CONF;
m->m_group = true;
m->m_page = PAGE_MASTER;
m->m_flags = PF_REBUILDSITEMEDIANPAGETEMPSETTINGS;
m++;
m->m_title = "Site median page temperature server port";
m->m_desc = "(0=disable; 8076=default server port)";
m->m_cgi = "smpt_server_port";
simple_m_set(Conf,m_siteMedianPageTemperatureServerPort);
m->m_def = "0";
m->m_smin = 0;
m->m_smax = 65535;
m->m_group = false;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_flags = PF_REBUILDSITEMEDIANPAGETEMPSETTINGS;
m++;
m->m_title = "Site median page temperature max outstanding requests";
m->m_desc = "(0=disable)";
m->m_cgi = "smpt_max_oustanding_requests";
simple_m_set(Conf,m_maxOutstandingSiteMedianPageTemperature);
m->m_def = "1000";
m->m_smin = 0;
m->m_group = false;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_flags = PF_REBUILDSITEMEDIANPAGETEMPSETTINGS;
m++;
m->m_title = "Site median page temperature timeout";
m->m_desc = "Per-request timeout.";
m->m_cgi = "smpt_timeout";
simple_m_set(Conf,m_siteMedianPageTemperatureTimeout);
m->m_def = "500";
m->m_units = "milliseconds";
m->m_smin = 0;
m->m_group = false;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_flags = PF_REBUILDSITEMEDIANPAGETEMPSETTINGS;
m++;
m->m_title = "Site num inlinks server name";
m->m_desc = "";
m->m_cgi = "sni_server_name";
m->m_off = offsetof(Conf,m_siteNumInlinksServerName);
m->m_type = TYPE_STRING;
m->m_def = "localhost";
m->m_size = sizeof(Conf::m_siteNumInlinksServerName);
m->m_obj = OBJ_CONF;
m->m_group = true;
m->m_page = PAGE_MASTER;
m->m_flags = PF_REBUILDSITENUMINLINKSSETTINGS;
m++;
m->m_title = "Site num inlinks server port";
m->m_desc = "(0=disable; 8077=default server port)";
m->m_cgi = "sni_server_port";
simple_m_set(Conf,m_siteNumInlinksServerPort);
m->m_def = "0";
m->m_smin = 0;
m->m_smax = 65535;
m->m_group = false;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_flags = PF_REBUILDSITENUMINLINKSSETTINGS;
m++;
m->m_title = "Site num inlinks max outstanding requests";
m->m_desc = "(0=disable)";
m->m_cgi = "sni_max_oustanding_requests";
simple_m_set(Conf,m_maxOutstandingSiteNumInlinks);
m->m_def = "1000";
m->m_smin = 0;
m->m_group = false;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_flags = PF_REBUILDSITENUMINLINKSSETTINGS;
m++;
m->m_title = "Site num inlinks timeout";
m->m_desc = "Per-request timeout.";
m->m_cgi = "sni_timeout";
simple_m_set(Conf,m_siteNumInlinksTimeout);
m->m_def = "500";
m->m_units = "milliseconds";
m->m_smin = 0;
m->m_group = false;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_flags = PF_REBUILDSITENUMINLINKSSETTINGS;
m++;
m->m_title = "URL realtime classification server name";
m->m_desc = "";
@ -7464,6 +7577,15 @@ void Parms::init ( ) {
m->m_page = PAGE_WORD_VARIATIONS;
m++;
m->m_title = "adjective neuter<->common variants";
m->m_desc = "Extend to both grammatical genders";
m->m_def = "0.95";
simple_m_set(CollectionRec,m_word_variations_config.m_word_variations_weights.adjective_grammatical_gender_simplification);
m->m_cgi = "lwv_adjective_grammatical_gender_simplification";
m->m_flags = PF_API;
m->m_page = PAGE_WORD_VARIATIONS;
m++;
///////////////////////////////////////////
@ -9086,9 +9208,9 @@ void Parms::init ( ) {
m->m_page = PAGE_LOG;
m++;
m->m_title = "log trace info for BlockList";
m->m_title = "log trace info for MatchList";
m->m_cgi = "ltrc_bl";
simple_m_set(Conf,m_logTraceBlockList);
simple_m_set(Conf,m_logTraceMatchList);
m->m_def = "0";
m->m_page = PAGE_LOG;
m++;
@ -9100,6 +9222,13 @@ void Parms::init ( ) {
m->m_page = PAGE_LOG;
m++;
m->m_title = "log trace info for Docid2FlagsAndSiteMap";
m->m_cgi = "ltrc_dtofsm";
simple_m_set(Conf,m_logTraceDocid2FlagsAndSiteMap);
m->m_def = "0";
m->m_page = PAGE_LOG;
m++;
m->m_title = "log trace info for DocProcess";
m->m_cgi = "ltrc_docpro";
simple_m_set(Conf,m_logTraceDocProcess);
@ -9325,6 +9454,20 @@ void Parms::init ( ) {
m->m_page = PAGE_LOG;
m++;
m->m_title = "log trace info for SiteMedianPageTemperature";
m->m_cgi = "ltrc_smpt";
simple_m_set(Conf,m_logTraceSiteMedianPageTemperature);
m->m_def = "0";
m->m_page = PAGE_LOG;
m++;
m->m_title = "log trace info for SiteNumInlinks";
m->m_cgi = "ltrc_sni";
simple_m_set(Conf,m_logTraceSiteNumInlinks);
m->m_def = "0";
m->m_page = PAGE_LOG;
m++;
m->m_title = "log trace info for Spider";
m->m_cgi = "ltrc_sp";
simple_m_set(Conf,m_logTraceSpider);
@ -10783,6 +10926,8 @@ void Parms::handleRequest3fLoop(void *weArg) {
bool rebuildDnsSettings = false;
bool rebuildSpiderSettings = false;
bool rebuildQueryLanguageSettings = false;
bool rebuildSiteNumInlinksSettings = false;
bool rebuildSiteMedianPageTemperatureSettings = false;
// process them
const char *p = we->m_parmPtr;
@ -10883,6 +11028,14 @@ void Parms::handleRequest3fLoop(void *weArg) {
if (parm->m_flags & PF_REBUILDQUERYLANGSETTINGS) {
rebuildQueryLanguageSettings = true;
}
if (parm->m_flags & PF_REBUILDSITENUMINLINKSSETTINGS) {
rebuildSiteNumInlinksSettings = true;
}
if (parm->m_flags & PF_REBUILDSITEMEDIANPAGETEMPSETTINGS) {
rebuildSiteMedianPageTemperatureSettings = true;
}
}
// do the next parm
@ -10946,10 +11099,20 @@ void Parms::handleRequest3fLoop(void *weArg) {
}
if (rebuildQueryLanguageSettings) {
log("parms: rebuild fxclient settings");
log("parms: rebuild querylanguage settings");
g_queryLanguage.reinitializeSettings();
}
if (rebuildSiteNumInlinksSettings) {
log("parms: rebuild sitenuminlinks settings");
g_siteNumInlinks.reinitializeSettings();
}
if (rebuildSiteMedianPageTemperatureSettings) {
log("parms: rebuild sitemedianpagetemperature settings");
g_siteMedianPageTemperature.reinitializeSettings();
}
// note it
if ( ! we->m_sentReply )
log("parms: sending parm update reply");

@ -54,7 +54,7 @@ enum parameter_type_t {
// bit flags for Parm::m_flags
#define PF_COOKIE 0x00000001 // store in cookie?
#define PF_REBUILDQUERYLANGSETTINGS 0x00000002
//#define PF_UNUSED 0x00000004
#define PF_REBUILDSITENUMINLINKSSETTINGS 0x00000004
#define PF_REBUILDSPIDERSETTINGS 0x00000008
#define PF_API 0x00000010
#define PF_REBUILDURLFILTERS 0x00000020
@ -78,7 +78,7 @@ enum parameter_type_t {
#define PF_REBUILDRANKINGSETTINGS 0x00200000 // ranking setting. Reinitialize any derived values
#define PF_TABLESPLIT 0x00400000 // split into separate table
#define PF_REBUILDSITEMEDIANPAGETEMPSETTINGS 0x00800000
class Parm {
public:

@ -3930,6 +3930,7 @@ void PosdbTable::intersectLists_real() {
if(g_pageTemperatureRegistry.query_page_temperature(m_docId, range_min, range_max, &page_temperature)) {
//excellent, we know the page's temperature
} else if(g_d2fasm.lookupSiteHash(m_docId,&sitehash32) && g_smptr.lookup(sitehash32,&raw_default_site_page_temperature)) {
// we'll only use site median page temperature when we have updated docid2siteflags file
//hmm, use the site-default page temperature
page_temperature = g_pageTemperatureRegistry.scale_temperature(range_min, range_max, raw_default_site_page_temperature);
} else {

@ -50,6 +50,8 @@
#include "DocRebuild.h"
#include "DocReindex.h"
#include "QueryLanguage.h"
#include "SiteNumInlinks.h"
#include "SiteMedianPageTemperature.h"
#include <sys/statvfs.h>
#include <pthread.h>
#include <fcntl.h>
@ -623,6 +625,8 @@ bool Process::shutdown2() {
g_urlRealtimeClassification.finalize();
g_queryLanguage.finalize();
g_siteNumInlinks.finalize();
g_siteMedianPageTemperature.finalize();
WantedChecker::finalize();

15
Rdb.cpp

@ -1830,14 +1830,19 @@ char getKeySizeFromRdbId(rdbid_t rdbId) {
case RDB_LINKDB:
case RDB2_LINKDB2:
return sizeof(key224_t); // 28
case RDB_NONE:
case RDB_END:
log(LOG_ERROR, "rdb: bad lookup rdbid of %i", (int)rdbId);
g_process.shutdownAbort(true);
case RDB_TITLEDB:
case RDB2_TITLEDB2:
case RDB_CLUSTERDB:
case RDB2_CLUSTERDB2:
case RDB_DOLEDB:
return sizeof(key96_t); // 12
case RDB_SITEDEFAULTPAGETEMPERATURE:
return 8; //fake
case RDB_NONE:
case RDB_END:
default:
return sizeof(key96_t); // 12
log(LOG_ERROR, "rdb: bad lookup rdbid of %i", (int)rdbId);
g_process.shutdownAbort(true);
}
}

@ -1178,9 +1178,9 @@ bool Repair::injectTitleRec ( ) {
m_stage = STAGE_TITLEDB_0; // 0
return true;
}
mnew ( xd , sizeof(XmlDoc),"xmldocpr");
mnew ( xd , sizeof(XmlDoc),"xmldocpr");
if ( ! xd->set2 ( titleRec,-1,m_cr->m_coll , NULL , MAX_NICENESS ) ) {
if (!xd->set2(titleRec, -1, m_cr->m_coll, MAX_NICENESS)) {
m_recsetErrors++;
m_stage = STAGE_TITLEDB_0; // 0
logTrace(g_conf.m_logTraceRepairs,"END, return true. XmlDoc->set2 failed");
@ -1290,6 +1290,9 @@ bool Repair::injectTitleRec ( ) {
xd->m_blockedDocValid = true;
xd->m_blockedDoc = false;
// don't check site median page temperature
xd->m_calledServiceSiteMedianPageTemperature = true;
// . get the meta list to add
// . sets m_usePosdb, m_useTitledb, etc.
logTrace(g_conf.m_logTraceRepairs,"Calling indexDoc");
@ -1323,7 +1326,7 @@ bool Repair::injectTitleRecSmall(char *titleRec, int32_t titleRecSize) {
//decompress+decode xmldoc
XmlDoc xd;
if(!xd.set2(titleRec,titleRecSize, m_cr->m_coll, NULL, MAX_NICENESS)) {
if (!xd.set2(titleRec, titleRecSize, m_cr->m_coll, MAX_NICENESS)) {
m_recsetErrors++;
m_stage = STAGE_TITLEDB_0;
logTrace(g_conf.m_logTraceRepairs,"END, return true. XmlDoc->set2 failed");

@ -1,15 +0,0 @@
#include "SiteDefaultPageTemperatureRemoteRegistry.h"
// Stub implementation: nothing to set up, always reports success.
bool SiteDefaultPageTemperatureRemoteRegistry::initialize() {
	return true;
}
// Stub implementation: nothing to tear down.
void SiteDefaultPageTemperatureRemoteRegistry::finalize() {
}
// Stub implementation: ignores all arguments and returns false, meaning no
// lookup was started and the callback will never be invoked.
bool SiteDefaultPageTemperatureRemoteRegistry::lookup(int32_t /*sitehash32*/, int64_t /*docId*/, void * /*ctx*/, callback_t /*callback*/) {
	return false;
}

@ -1,26 +0,0 @@
#ifndef SITEDEFAULTPAGETEMPERATUREREMOTEREGISTRY_H_
#define SITEDEFAULTPAGETEMPERATUREREMOTEREGISTRY_H_
#include <inttypes.h>
// Interface for looking up a site's default page temperature from a remote
// registry. NOTE(review): the current .cpp implementation is a stub whose
// lookup() always returns false.
namespace SiteDefaultPageTemperatureRemoteRegistry {
	// Set up the registry client; returns true on success.
	bool initialize();
	// Tear down the registry client.
	void finalize();
	//Look up the site-default page temperature.
	enum class lookup_result_t {
		error, //something went wrong, look for g_errno for details
		page_temperature_known, //page-specific temperature is known, use that
		site_temperature_known, //site-default temperature known, good
		site_unknown //site unknown, use global default temperature
	};
	// Completion callback delivering the temperature and the outcome.
	typedef void (*callback_t)(void *ctx, unsigned siteDefaultPageTemperature, lookup_result_t result);
	// Start an asynchronous lookup; returns false if the lookup could not
	// be initiated (in which case the callback is not invoked).
	bool lookup(int32_t sitehash32, int64_t docId, void *ctx, callback_t callback);
} //namespace
#endif

@ -0,0 +1,88 @@
//
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// License TL;DR: If you change this file, you must publish your changes.
//
#include "SiteMedianPageTemperature.h"
#include "Conf.h"
#include "GbUtil.h"
// The protocol is very simple.
// The server receives queries of the form:
// <query-id>:v1|sitehash<NL>
//
// The server responds with:
// <query-id>:site_median_page_temperature<NL>
SiteMedianPageTemperature g_siteMedianPageTemperature;
// Request object queued through FxClient: carries the caller's context, the
// completion callback and the site hash to look up.
struct SiteMedianPageTemperatureRequest : public FxClientRequest {
	SiteMedianPageTemperatureRequest(void *context, int timeout_ms, site_median_page_temperature_callback_t callback, unsigned sitehash)
		: FxClientRequest(context, timeout_ms)
		, m_callback(callback)
		, m_sitehash(sitehash) {
	}
	site_median_page_temperature_callback_t m_callback; //invoked with the result (0 on error, see errorCallback)
	unsigned m_sitehash; //32-bit hash identifying the site
};
// v1|sitehash
// Serializes a request as "<8-hex-seq>:v1|<8-hex-sitehash>\n" (21 bytes).
// Note each sprintf() also writes a trailing NUL one byte past the region
// that is push_back()'d; that byte stays inside the reserved area and is
// overwritten by the next field, so no overflow occurs.
void SiteMedianPageTemperature::convertRequestToWireFormat(IOBuffer *out_buffer, uint32_t seq, fxclient_request_ptr_t base_request) {
	std::shared_ptr<SiteMedianPageTemperatureRequest> request = std::dynamic_pointer_cast<SiteMedianPageTemperatureRequest>(base_request);

	out_buffer->reserve_extra(8 + 1 + 3 + 8 + 1); //seq + ':' + "v1|" + sitehash + '\n'
	sprintf(out_buffer->end(), "%08x", seq);
	out_buffer->push_back(8);
	out_buffer->end()[0] = ':';
	out_buffer->push_back(1);

	memcpy(out_buffer->end(), "v1|", 3);
	out_buffer->push_back(3);
	sprintf(out_buffer->end(), "%08x", request->m_sitehash);
	out_buffer->push_back(8);

	out_buffer->end()[0] = '\n';
	out_buffer->push_back(1);
}
// Parses the decimal temperature value from the server response and hands it
// to the caller's callback.
void SiteMedianPageTemperature::processResponse(fxclient_request_ptr_t base_request, char *response) {
	std::shared_ptr<SiteMedianPageTemperatureRequest> request = std::dynamic_pointer_cast<SiteMedianPageTemperatureRequest>(base_request);
	// fix: m_sitehash is unsigned, so log it with %u (was %d)
	logTrace(g_conf.m_logTraceSiteMedianPageTemperature, "Got result='%s' for sitehash=%u", response, request->m_sitehash);

	// fix: variable was misleadingly named "site_num_inlinks" — a copy-paste
	// leftover from SiteNumInlinks.cpp; this is a page temperature
	unsigned long site_median_page_temperature = strtoul(response, nullptr, 10);
	(request->m_callback)(request->m_context, site_median_page_temperature);
}
// Invoked by FxClient when the request failed or timed out; reports a zero
// result to the caller's callback so it can fall back.
void SiteMedianPageTemperature::errorCallback(fxclient_request_ptr_t base_request) {
	auto request = std::dynamic_pointer_cast<SiteMedianPageTemperatureRequest>(base_request);
	(request->m_callback)(request->m_context, 0);
}
// Connects the underlying FxClient to the site-median-page-temperature
// service using the server name/port, outstanding-request limit and trace
// flag from g_conf. Returns FxClient::initialize()'s result.
bool SiteMedianPageTemperature::initialize() {
	return FxClient::initialize("site temperature", "sitetemp", g_conf.m_siteMedianPageTemperatureServerName, g_conf.m_siteMedianPageTemperatureServerPort,
	                            g_conf.m_maxOutstandingSiteMedianPageTemperature, g_conf.m_logTraceSiteMedianPageTemperature);
}
// Re-reads the g_conf settings (server name/port, outstanding-request limit,
// trace flag), e.g. after an admin parameter change.
void SiteMedianPageTemperature::reinitializeSettings() {
	FxClient::reinitializeSettings(g_conf.m_siteMedianPageTemperatureServerName, g_conf.m_siteMedianPageTemperatureServerPort,
	                               g_conf.m_maxOutstandingSiteMedianPageTemperature, g_conf.m_logTraceSiteMedianPageTemperature);
}
// Queues an asynchronous lookup of the median page temperature for the site
// identified by "sitehash"; "callback" receives the result. Returns
// sendRequest()'s result (false if the request could not be queued).
bool SiteMedianPageTemperature::getSiteMedianPageTemperature(void *context, site_median_page_temperature_callback_t callback, unsigned sitehash) {
	auto request = std::make_shared<SiteMedianPageTemperatureRequest>(context, g_conf.m_siteMedianPageTemperatureTimeout, callback, sitehash);
	return sendRequest(std::static_pointer_cast<FxClientRequest>(request));
}

@ -0,0 +1,43 @@
//
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// License TL;DR: If you change this file, you must publish your changes.
//
#ifndef FX_SITEMEDIANPAGETEMPERATURE_H
#define FX_SITEMEDIANPAGETEMPERATURE_H
#include "FxClient.h"
typedef void (*site_median_page_temperature_callback_t)(void *context, long count);
// Asynchronous FxClient-based client for the external site-median-page-
// temperature service (global instance: g_siteMedianPageTemperature).
class SiteMedianPageTemperature : public FxClient {
public:
	// Reads server name/port and limits from g_conf and connects the client.
	bool initialize();
	// Re-reads the g_conf settings, e.g. after a parameter change.
	void reinitializeSettings();
	using FxClient::finalize;
	// FxClient protocol hooks (wire format: "<seq>:v1|<sitehash>\n").
	void convertRequestToWireFormat(IOBuffer *out_buffer, uint32_t seq, fxclient_request_ptr_t base_request) override;
	void processResponse(fxclient_request_ptr_t base_request, char *response) override;
	void errorCallback(fxclient_request_ptr_t base_request) override;
	// Queues an asynchronous lookup; "callback" is invoked with the result.
	bool getSiteMedianPageTemperature(void *context, site_median_page_temperature_callback_t callback, unsigned sitehash);
};
extern SiteMedianPageTemperature g_siteMedianPageTemperature;
#endif //FX_SITEMEDIANPAGETEMPERATURE_H

88
SiteNumInlinks.cpp Normal file

@ -0,0 +1,88 @@
//
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// License TL;DR: If you change this file, you must publish your changes.
//
#include "SiteNumInlinks.h"
#include "Conf.h"
#include "GbUtil.h"
// The protocol is very simple.
// The server receives queries of the form:
// <query-id>:v1|sitehash<NL>
//
// The server responds with:
// <query-id>:site_inlink_count<NL>
SiteNumInlinks g_siteNumInlinks;
// Per-request state for one site-num-inlinks lookup: the callback to invoke
// when the answer (or an error) arrives, plus the sitehash being queried.
// Context and timeout are carried by the FxClientRequest base.
struct SiteNumInlinksRequest : public FxClientRequest {
SiteNumInlinksRequest(void *context, int timeout_ms, site_inlinks_count_callback_t callback, unsigned sitehash)
: FxClientRequest(context, timeout_ms)
, m_callback(callback)
, m_sitehash(sitehash) {
}
site_inlinks_count_callback_t m_callback; // invoked with the count (see processResponse/errorCallback)
unsigned m_sitehash;                      // hash identifying the site being queried
};
// v1|sitehash
// Serialize one request as "<seq>:v1|<sitehash>\n", both fields fixed-width
// lowercase hex (8 chars each), and append the resulting 21 bytes to
// 'out_buffer'.
void SiteNumInlinks::convertRequestToWireFormat(IOBuffer *out_buffer, uint32_t seq, fxclient_request_ptr_t base_request) {
	std::shared_ptr<SiteNumInlinksRequest> request = std::dynamic_pointer_cast<SiteNumInlinksRequest>(base_request);
	// format the whole line locally, then append it to the buffer in one go
	char line[8 + 1 + 3 + 8 + 1 + 1]; // seq, ':', "v1|", sitehash, '\n', NUL
	int len = sprintf(line, "%08x:v1|%08x\n", seq, request->m_sitehash);
	out_buffer->reserve_extra(len);
	memcpy(out_buffer->end(), line, len);
	out_buffer->push_back(len);
}
// Parse one "<query-id>:site_inlink_count<NL>" server reply (we are handed
// the text after the query id) and hand the decimal count to the requester's
// callback.
void SiteNumInlinks::processResponse(fxclient_request_ptr_t base_request, char *response) {
	std::shared_ptr<SiteNumInlinksRequest> request = std::dynamic_pointer_cast<SiteNumInlinksRequest>(base_request);
	// %u: m_sitehash is unsigned, so the previous %d specifier was a
	// format/argument mismatch (undefined behavior per the printf contract)
	logTrace(g_conf.m_logTraceSiteNumInlinks, "Got result='%s' for sitehash=%u", response, request->m_sitehash);
	// non-numeric responses parse as 0; the value is narrowed to the
	// callback's 'long' parameter
	unsigned long site_num_inlinks = strtoul(response, nullptr, 10);
	(request->m_callback)(request->m_context, site_num_inlinks);
}
// Invoked by FxClient instead of processResponse() when the request failed
// (e.g. timeout or connection error).  The requester's callback still fires,
// with a count of 0.
// NOTE(review): the XmlDoc-side callbacks treat -1 as the "no result"
// sentinel, so an error currently looks like a valid count of 0 — confirm
// that is intended.
void SiteNumInlinks::errorCallback(fxclient_request_ptr_t base_request) {
	auto request = std::dynamic_pointer_cast<SiteNumInlinksRequest>(base_request);
	(request->m_callback)(request->m_context, 0);
}
// Hook this client up to the configured "site num inlinks" service ("sitenum"
// is the short tag used by FxClient).  Passes the g_conf server name/port,
// outstanding-request limit and trace flag straight through to
// FxClient::initialize() and returns its result.
bool SiteNumInlinks::initialize() {
return FxClient::initialize("site num inlinks", "sitenum", g_conf.m_siteNumInlinksServerName, g_conf.m_siteNumInlinksServerPort,
g_conf.m_maxOutstandingSiteNumInlinks, g_conf.m_logTraceSiteNumInlinks);
}
// Re-apply the (possibly changed) g_conf settings — server name/port,
// outstanding-request limit, trace flag — to the already-initialized client.
void SiteNumInlinks::reinitializeSettings() {
FxClient::reinitializeSettings(g_conf.m_siteNumInlinksServerName, g_conf.m_siteNumInlinksServerPort,
g_conf.m_maxOutstandingSiteNumInlinks, g_conf.m_logTraceSiteNumInlinks);
}
// Queue an asynchronous inlink-count lookup for 'sitehash'.  'callback' is
// invoked with the count once the server answers (or via errorCallback on
// failure); the g_conf timeout applies.  Returns sendRequest()'s verdict on
// whether the request was accepted.
bool SiteNumInlinks::getSiteNumInlinks(void *context, site_inlinks_count_callback_t callback, unsigned sitehash) {
	auto request = std::make_shared<SiteNumInlinksRequest>(context, g_conf.m_siteNumInlinksTimeout, callback, sitehash);
	return sendRequest(std::move(request));
}

43
SiteNumInlinks.h Normal file

@ -0,0 +1,43 @@
//
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// License TL;DR: If you change this file, you must publish your changes.
//
#ifndef FX_SITENUMINLINKS_H
#define FX_SITENUMINLINKS_H
#include "FxClient.h"
typedef void (*site_inlinks_count_callback_t)(void *context, long count);
// Client for the remote site-inlink-count service.  Lookups are started with
// getSiteNumInlinks(); the FxClient base class supplies the connection and
// queueing machinery and calls back into the overrides below (settings come
// from the g_conf m_siteNumInlinks* fields).
class SiteNumInlinks : public FxClient {
public:
bool initialize();            // hook up to the configured server; result is FxClient::initialize()'s
void reinitializeSettings();  // re-apply (possibly edited) g_conf settings to the running client
using FxClient::finalize;
// serialize one queued request into its "<seq>:v1|<sitehash>\n" wire form
void convertRequestToWireFormat(IOBuffer *out_buffer, uint32_t seq, fxclient_request_ptr_t base_request) override;
// parse one server response line and invoke the requester's callback
void processResponse(fxclient_request_ptr_t base_request, char *response) override;
// invoked instead of processResponse() when the request failed
void errorCallback(fxclient_request_ptr_t base_request) override;
// start an async lookup for 'sitehash'; 'callback' receives the count
bool getSiteNumInlinks(void *context, site_inlinks_count_callback_t callback, unsigned sitehash);
};
extern SiteNumInlinks g_siteNumInlinks;
#endif //FX_SITENUMINLINKS_H

@ -258,7 +258,7 @@ void filterTitledbList(RdbList *list) {
if (!KEYNEG(rec)) {
XmlDoc xd;
if (xd.set2(rec, recSize, "main", NULL, 0)) {
if (xd.set2(rec, recSize, "main", 0)) {
if (isUrlBlocked(*(xd.getFirstUrl()))) {
++filteredCount;
continue;

@ -15,6 +15,8 @@
UrlMatchList g_urlBlackList("urlblacklist*.txt");
UrlMatchList g_urlWhiteList("urlwhitelist.txt");
UrlMatchList g_urlProxyList("urlproxylist.txt");
UrlMatchList g_urlRetryProxyList("urlretryproxylist.txt");
typedef std::vector<UrlMatch> urlmatchlist_t;
typedef spp::sparse_hash_map<std::string, urlmatchlist_t> urlmatchlist_map_t;
@ -245,6 +247,8 @@ bool UrlMatchList::load() {
if (firstColEnd == 6 && memcmp(line.data(), "domain", 6) == 0) {
if (!parseDomain(&tmpUrlMatchList, col2, col3, col4)) {
logError("Invalid line found. Ignoring line='%s'", line.c_str());
// catch domain parsing errors here
gbshutdownLogicError();
continue;
}
} else {

@ -44,4 +44,7 @@ private:
extern UrlMatchList g_urlBlackList;
extern UrlMatchList g_urlWhiteList;
extern UrlMatchList g_urlProxyList;
extern UrlMatchList g_urlRetryProxyList;
#endif //GB_URLMATCHLIST_H_

@ -55,7 +55,8 @@
#include "IpBlockList.h"
#include "PageTemperatureRegistry.h"
#include "SiteMedianPageTemperatureRegistry.h"
#include "SiteDefaultPageTemperatureRemoteRegistry.h"
#include "SiteNumInlinks.h"
#include "SiteMedianPageTemperature.h"
#include <iostream>
#include <fstream>
#include <sysexits.h>
@ -181,7 +182,7 @@ void XmlDoc::reset ( ) {
m_checkedIpBlockList = false;
m_defaultSitePageTemperature = 0;
m_defaultSitePageTemperatureValid = false;
m_defaultSitePageTemperatureIsUnset = false;
m_calledServiceSiteMedianPageTemperature = false;
m_parsedRobotsMetaTag = false;
m_robotsNoIndex = false;
m_robotsNoFollow = false;
@ -336,8 +337,6 @@ void XmlDoc::reset ( ) {
m_wtsTable.reset();
m_wbuf.reset();
m_pageLinkBuf.reset();
m_siteLinkBuf.reset();
m_esbuf.reset();
m_tagRecBuf.reset();
@ -420,6 +419,8 @@ void XmlDoc::reset ( ) {
// do not cache the http reply in msg13 etc.
m_maxCacheAge = 0;
m_calledServiceSiteNumInlinks = false;
// reset these ptrs too!
void *px = &ptr_firstUrl;
void *pxend = &m_dummyEnd;
@ -555,7 +556,7 @@ bool XmlDoc::loadFromOldTitleRec() {
// use that. decompress it! this will also set
// m_setFromTitleRec to true
if (!set2(m_oldTitleRec, m_oldTitleRecSize, cr->m_coll, nullptr, m_niceness)) {
if (!set2(m_oldTitleRec, m_oldTitleRecSize, cr->m_coll, m_niceness)) {
// we are now loaded, do not re-call
m_loaded = true;
@ -705,9 +706,7 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
m_wasContentInjected = true;
m_contentType = contentType;
m_contentTypeValid = true;
// use this ip as well for now to avoid ip lookup
//m_ip = atoip("127.0.0.1");
//m_ipValid = true;
// do not need robots.txt then
m_isAllowed = true;
m_isAllowedValid = true;
@ -840,7 +839,6 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
bool XmlDoc::set2 ( char *titleRec ,
int32_t maxSize ,
const char *coll ,
SafeBuf *pbuf ,
int32_t niceness ,
SpiderRequest *sreq ) {
@ -849,37 +847,12 @@ bool XmlDoc::set2 ( char *titleRec ,
setStatus ( "setting xml doc from title rec");
// . it resets us, so save this
// . we only save these for set2() not the other sets()!
//void (*cb1)(void *state) = m_callback1;
//bool (*cb2)(void *state) = m_callback2;
//void *state = m_state;
// . clear it all out
// . no! this is clearing our msg20/msg22 reply...
// . ok, but repair.cpp needs it so do it there then
//reset();
// restore callbacks
//m_callback1 = cb1;
//m_callback2 = cb2;
//m_state = state;
// sanity check - since we do not reset
if ( m_contentValid ) { g_process.shutdownAbort(true); }
// this is true
m_setFromTitleRec = true;
// this is valid i guess. includes key, etc.
//m_titleRec = titleRec;
//m_titleRecSize = *(int32_t *)(titleRec+12) + sizeof(key96_t) + 4;
//m_titleRecValid = true;
// . should we free m_cbuf on our reset/destruction?
// . no because doCOnsistencyCheck calls XmlDoc::set2 with a titleRec
// that should not be freed, besides the alloc size is not known!
//m_freeTitleRec = false;
// it must be there!
if ( !titleRec ) { g_errno=ENOTFOUND; return false; }
@ -900,8 +873,6 @@ bool XmlDoc::set2 ( char *titleRec ,
}
m_titleRecBufValid = true;
//m_coll = coll;
m_pbuf = pbuf;
m_niceness = niceness;
// set our collection number
@ -1071,14 +1042,6 @@ bool XmlDoc::set2 ( char *titleRec ,
// set our easy stuff
gbmemcpy ( (void *)this , m_ubuf , headerSize );
// NOW set the XmlDoc::ptr_* and XmlDoc::size_* members
// like in Msg.cpp and Msg20Reply.cpp
if ( m_pbuf ) {
int32_t crc = hash32(m_ubuf,headerSize);
m_pbuf->safePrintf("crchdr=0x%" PRIx32" sizehdr=%" PRId32", ",
crc,headerSize);
}
// point to the string data
char *up = m_ubuf + headerSize;
@ -1128,12 +1091,6 @@ bool XmlDoc::set2 ( char *titleRec ,
// point to the data. could be 64-bit ptr.
*pd = up;//(int32_t)up;
// debug
if ( m_pbuf ) {
int32_t crc = hash32(up,*ps);
m_pbuf->safePrintf("crc%" PRId32"=0x%" PRIx32" size%" PRId32"=%" PRId32", ",
i,crc,i,*ps);
}
// skip over data
up += *ps;
@ -1489,7 +1446,7 @@ bool XmlDoc::injectDoc(const char *url,
m_indexCodeValid = true;
}
if (httpStatus != 200) {
if (httpStatus != 0 && httpStatus != 200) {
m_httpStatus = httpStatus;
m_httpStatusValid = true;
}
@ -2029,58 +1986,47 @@ bool* XmlDoc::checkBlockList() {
return &m_blockedDoc;
}
static void gotDefaultSitePageTemperature(void *context, long count) {
XmlDoc *xmlDoc = reinterpret_cast<XmlDoc*>(context);
if (count != -1) {
xmlDoc->m_defaultSitePageTemperature = count;
xmlDoc->m_defaultSitePageTemperatureValid = true;
}
xmlDoc->m_masterLoop(xmlDoc->m_masterState);
}
unsigned *XmlDoc::getDefaultSitePageTemperature() {
logTrace(g_conf.m_logTraceXmlDoc, "BEGIN");
if(m_defaultSitePageTemperatureIsUnset) {
//already tried to look up. Don't try it again
logTrace(g_conf.m_logTraceXmlDoc, "END, already tried, (unset)");
return NULL;
}
if(m_defaultSitePageTemperatureValid) {
logTrace(g_conf.m_logTraceXmlDoc, "END, already valid. m_defaultSitePageTemperature=%u" , m_defaultSitePageTemperature);
if (m_defaultSitePageTemperatureValid) {
logTrace(g_conf.m_logTraceXmlDoc, "END, already valid. m_defaultSitePageTemperature=%u", m_defaultSitePageTemperature);
return &m_defaultSitePageTemperature;
}
int64_t *docId = getDocId();
if(!docId || docId==(int64_t*)-1) {
if (!docId || docId == (int64_t *)-1) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getDocId() failed or blocked");
return (unsigned*)docId;
return (unsigned *)docId;
}
int32_t *sitehash32 = getSiteHash32();
if(sitehash32==NULL || sitehash32==(int32_t*)-1) {
if (sitehash32 == NULL || sitehash32 == (int32_t *)-1) {
logTrace(g_conf.m_logTraceXmlDoc, "END, getSiteHash32 failed/blocked");
return (unsigned*)sitehash32;
return (unsigned *)sitehash32;
}
if(g_smptr.lookup(*sitehash32, &m_defaultSitePageTemperature)) {
m_defaultSitePageTemperatureValid = true;
logTrace(g_conf.m_logTraceXmlDoc, "END, SiteMedianPageTemperatureRegistry hit");
return &m_defaultSitePageTemperature;
}
m_defaultSitePageTemperatureIsUnset = true; //make sure we try this only once
if(!SiteDefaultPageTemperatureRemoteRegistry::lookup(*sitehash32, m_docId, this, &XmlDoc::gotDefaultSitePageTemperature)) {
logTrace(g_conf.m_logTraceXmlDoc, "END, SiteDefaultPageTemperatureRemoteRegistry is disabled");
return NULL;
}
logTrace(g_conf.m_logTraceXmlDoc, "END, SiteDefaultPageTemperatureRemoteRegistry::lookup() blocked");
return (unsigned*)-1;
}
void XmlDoc::gotDefaultSitePageTemperature(void *ctx, unsigned siteDefaultPageTemperature, SiteDefaultPageTemperatureRemoteRegistry::lookup_result_t result) {
logTrace(g_conf.m_logTraceXmlDoc, "BEGIN, siteDefaultPageTemperature=%u, result=%d", siteDefaultPageTemperature,(int)result);
XmlDoc *that = reinterpret_cast<XmlDoc*>(ctx);
if(result==SiteDefaultPageTemperatureRemoteRegistry::lookup_result_t::site_temperature_known) {
that->m_defaultSitePageTemperature = siteDefaultPageTemperature;
that->m_defaultSitePageTemperatureValid = true;
} else
that->m_defaultSitePageTemperatureIsUnset = true;
indexDocWrapper(that);
}
if (!m_calledServiceSiteMedianPageTemperature &&
g_siteMedianPageTemperature.getSiteMedianPageTemperature(this, gotDefaultSitePageTemperature, *sitehash32)) {
m_calledServiceSiteMedianPageTemperature = true;
logTrace(g_conf.m_logTraceXmlDoc, "END, SiteMedianPageTemperature::getSiteMedianPageTemperature is blocked");
return (unsigned *)-1;
}
logTrace(g_conf.m_logTraceXmlDoc, "END, SiteMedianPageTemperature is disabled");
return nullptr;
}
// . returns false if blocked, true otherwise
// . sets g_errno on error and returns true
@ -4363,8 +4309,7 @@ Links *XmlDoc::getLinks ( bool doQuickSet ) {
// . apply link spam settings
// . set the "spam bits" in the Links class
setLinkSpam ( *ip ,
u , // linker url
setLinkSpam (u , // linker url
*sni ,
xml ,
&m_links ,
@ -6006,11 +5951,7 @@ XmlDoc **XmlDoc::getOldXmlDoc ( ) {
// ,m_firstUrl.getUrl());
// if title rec is corrupted data uncompress will fail and this
// will return false!
if ( ! m_oldDoc->set2 ( m_oldTitleRec ,
m_oldTitleRecSize , // maxSize
cr->m_coll ,
NULL , // pbuf
m_niceness ) ) {
if (!m_oldDoc->set2(m_oldTitleRec, m_oldTitleRecSize, cr->m_coll, m_niceness)) {
log("build: failed to set old doc for %s",m_firstUrl.getUrl());
if ( ! g_errno ) { g_process.shutdownAbort(true); }
//int32_t saved = g_errno;
@ -6286,11 +6227,7 @@ XmlDoc **XmlDoc::getRootXmlDoc ( int32_t maxCacheAge ) {
mnew ( m_rootDoc , sizeof(XmlDoc),"xmldoc3");
// if we had the title rec, set from that
if ( *rtr ) {
if ( ! m_rootDoc->set2 ( m_rootTitleRec ,
m_rootTitleRecSize , // maxSize ,
cr->m_coll ,
NULL , // pbuf
m_niceness ) ) {
if (!m_rootDoc->set2(m_rootTitleRec, m_rootTitleRecSize, cr->m_coll, m_niceness)) {
// it was corrupted... delete this
// possibly printed
// " uncompress uncompressed size=..." bad uncompress
@ -6799,6 +6736,16 @@ int32_t *XmlDoc::getFirstIp ( ) {
return &m_firstIp;
}
static void gotSiteNumInlinksWrapper(void *context, long count) {
XmlDoc *xmlDoc = reinterpret_cast<XmlDoc*>(context);
if (count != -1) {
xmlDoc->m_siteNumInlinks = count;
xmlDoc->m_siteNumInlinksValid = true;
}
xmlDoc->m_masterLoop(xmlDoc->m_masterState);
}
// this is the # of GOOD INLINKS to the site. so it is no more than
// 1 per c block, and it has to pass link spam detection. this is the
// highest-level count of inlinks to the site. use it a lot.
@ -6836,6 +6783,17 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
return &m_siteNumInlinks;
}
int32_t *sh32 = getSiteHash32();
if (!sh32 || sh32 == (void *)-1) {
return (int32_t *)sh32;
}
// make sure we only call site num inlink server once
if (!m_calledServiceSiteNumInlinks && g_siteNumInlinks.getSiteNumInlinks(this, gotSiteNumInlinksWrapper, *sh32)) {
m_calledServiceSiteNumInlinks = true;
return (int32_t*)-1;
}
setStatus ( "getting site num inlinks");
// get it from the tag rec if we can
@ -7043,12 +7001,6 @@ LinkInfo *XmlDoc::getSiteLinkInfo() {
return NULL;
}
// can we be cancelled?
bool canBeCancelled = true;
// not if pageparser though
if ( m_pbuf ) canBeCancelled = false;
// not if injecting
if ( ! m_sreqValid ) canBeCancelled = false;
// assume valid when it returns
m_siteLinkInfoValid = true;
@ -7080,7 +7032,6 @@ LinkInfo *XmlDoc::getSiteLinkInfo() {
m_niceness ,
cr->m_doLinkSpamCheck ,
cr->m_oneVotePerIpDom ,
canBeCancelled ,
lastUpdateTime ,
onlyNeedGoodInlinks ,
0,
@ -7863,14 +7814,6 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
// do not redo it
m_calledMsg25 = true;
// shortcut
//Msg25 *m = &m_msg25;
// can we be cancelled?
bool canBeCancelled = true;
// not if pageparser though
if ( m_pbuf ) canBeCancelled = false;
// not if injecting
if ( ! m_sreqValid ) canBeCancelled = false;
// we do not want to waste time computing the page title
// of bad inlinks if we only want the good inlinks, because
@ -7914,7 +7857,6 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
m_niceness ,
doLinkSpamCheck ,
oneVotePerIpDom ,
canBeCancelled ,
lastUpdateTime ,
onlyNeedGoodInlinks ,
0, // ourhosthash32 (special)
@ -13547,7 +13489,7 @@ char *XmlDoc::getMetaList(bool forDelete) {
g_process.shutdownAbort(true);
}
if(m_defaultSitePageTemperatureValid) {
if(!forDelete && m_defaultSitePageTemperatureValid) {
*m_p++ = RDB_SITEDEFAULTPAGETEMPERATURE;
uint64_t k = (m_docId<<1) | 0x01; //magic bit shuffling so msg4 can treat it as a normal rdb key with negative-bit etc.
memcpy(m_p, &k, 8);
@ -14038,9 +13980,11 @@ skipNewAdd2:
// store data
if (ds) {
// store data size
*(int32_t *)nptr = ds;
nptr += 4;
// only store data size if it's not fixed sized
if (getDataSizeFromRdbId(rdbId) == -1) {
*(int32_t *) nptr = ds;
nptr += 4;
}
gbmemcpy (nptr, data, ds);
nptr += ds;
@ -15437,7 +15381,7 @@ Msg20Reply *XmlDoc::getMsg20ReplyStepwise() {
if ( ! m_setTr ) {
// . this completely resets us
// . this returns false with g_errno set on error
bool status = set2( *otr, 0, cr->m_coll, NULL, m_niceness);
bool status = set2( *otr, 0, cr->m_coll, m_niceness);
// sanity check
if ( ! status && ! g_errno ) {
@ -15560,18 +15504,6 @@ Msg20Reply *XmlDoc::getMsg20ReplyStepwise() {
bool getThatTitle = true;
if ( m_req->m_titleMaxLen <= 0 ) getThatTitle = false;
if ( m_reply.ptr_tbuf ) getThatTitle = false;
// if steve's requesting the inlink summary we will want to get
// the title of each linker even if they are spammy!
// only get title here if NOT getting link text otherwise
// we only get it down below if not a spammy voter, because
// this sets the damn slow sections class
if ( m_req->m_getLinkText &&
! m_useSiteLinkBuf &&
! m_usePageLinkBuf &&
// m_pbuf is used by pageparser.cpp now, not the other two things
// above this.
! m_pbuf )
getThatTitle = false;
// if steve is getting the inlinks, bad and good, for displaying
// then get the title here now... otherwise, if we are just spidering
@ -15904,9 +15836,6 @@ Msg20Reply *XmlDoc::getMsg20ReplyStepwise() {
m_reply.size_rssItem = rssItemLen + 1;
}
if ( ! m_req->m_doLinkSpamCheck )
m_reply.m_isLinkSpam = 0;
if ( m_req->m_doLinkSpamCheck ) {
// reset to NULL to avoid strlen segfault
const char *note = NULL;
@ -15918,7 +15847,6 @@ Msg20Reply *XmlDoc::getMsg20ReplyStepwise() {
// get it. does not block.
m_reply.m_isLinkSpam = ::isLinkSpam ( linker ,
m_ip ,
m_siteNumInlinks,
&m_xml,
links,
@ -15938,12 +15866,15 @@ Msg20Reply *XmlDoc::getMsg20ReplyStepwise() {
m_reply.size_note = strlen(note)+1;
}
// log the reason why it is a log page
if ( m_reply.m_isLinkSpam )
log(LOG_DEBUG,"build: linker %s: %s.",
linker->getUrl(),note);
// sanity
if ( m_reply.m_isLinkSpam && ! note )
log("linkspam: missing note for d=%" PRId64"!",m_docId);
if (m_reply.m_isLinkSpam) {
log(LOG_DEBUG, "build: linker %s: %s.", linker->getUrl(), note);
if (!note) {
log(LOG_WARN, "linkspam: missing note for d=%" PRId64"!", m_docId);
}
}
} else {
m_reply.m_isLinkSpam = 0;
}
// sanity check
@ -16721,7 +16652,6 @@ char *XmlDoc::getIsLinkSpam ( ) {
// . doc length over 100,000 bytes consider it link spam
m_isLinkSpamValid = true;
m_isLinkSpam = ::isLinkSpam ( getFirstUrl(), // linker
*ip ,
*sni ,
xml,
links,
@ -17441,23 +17371,6 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
printRainbowSections ( sb , NULL );
//
// PRINT LINKINFO
//
char *p = m_pageLinkBuf.getBufStart();
int32_t plen = m_pageLinkBuf.length();
sb->safeMemcpy ( p , plen );
//
// PRINT SITE LINKINFO
//
p = m_siteLinkBuf.getBufStart();
plen = m_siteLinkBuf.length();
sb->safeMemcpy ( p , plen );
// note this
sb->safePrintf("<h2>NEW Meta List</h2>");
@ -17659,8 +17572,7 @@ bool XmlDoc::printDocForProCog ( SafeBuf *sb , HttpRequest *hr ) {
if ( page == 2 )
return printPageInlinks(sb,hr);
if ( page == 3 )
return printSiteInlinks(sb,hr);
// 3 used to be print site inlinks (nothing is printed)
if ( page == 4 )
return printRainbowSections(sb,hr);
@ -17668,8 +17580,7 @@ bool XmlDoc::printDocForProCog ( SafeBuf *sb , HttpRequest *hr ) {
if ( page == 5 )
return printTermList(sb,hr);
if ( page == 6 )
return printSpiderStats(sb,hr);
// 6 used to be print spider stats (coming soon page)
if ( page == 7 )
return printCachedPage(sb,hr);
@ -18113,44 +18024,6 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
return true;
}
bool XmlDoc::printSiteInlinks ( SafeBuf *sb , HttpRequest *hr ) {
// use msg25 to hit linkdb and give us a link info class i guess
// but we need paging functionality so we can page through like
// 100 links at a time. clustered by c-class ip.
// do we need to mention how many from each ip c-class then? because
// then we'd have to read the whole termlist, might be several
// separate disk reads.
// we need to re-get both if either is NULL
LinkInfo *sinfo = getSiteLinkInfo();
// block or error?
if ( ! sinfo ) return true;
if ( sinfo == (LinkInfo *)-1) return false;
int32_t isXml = hr->getLong("xml",0);
if ( ! isXml ) printMenu ( sb );
if ( isXml )
sb->safePrintf ("<?xml version=\"1.0\" "
"encoding=\"UTF-8\" ?>\n"
"<response>\n"
);
sb->safeMemcpy ( &m_siteLinkBuf );
if ( isXml )
sb->safePrintf ("</response>\n" );
// just print that
//sinfo->print ( sb , cr->m_coll );
return true;
}
bool XmlDoc::printPageInlinks ( SafeBuf *sb , HttpRequest *hr ) {
// we need to re-get both if either is NULL
@ -18177,8 +18050,6 @@ bool XmlDoc::printPageInlinks ( SafeBuf *sb , HttpRequest *hr ) {
// i guess we need this
if ( ! recompute ) // m_setFromTitleRec )
info1->print ( sb , cr->m_coll );
else
sb->safeMemcpy ( &m_pageLinkBuf );
if ( isXml )
sb->safePrintf ("</response>\n" );
@ -18675,17 +18546,6 @@ bool XmlDoc::printTermList ( SafeBuf *sb , HttpRequest *hr ) {
return true;
}
bool XmlDoc::printSpiderStats ( SafeBuf *sb , HttpRequest *hr ) {
int32_t isXml = hr->getLong("xml",0);
if ( ! isXml ) printMenu ( sb );
sb->safePrintf("<b>Coming Soon</b>");
return true;
}
bool XmlDoc::printCachedPage ( SafeBuf *sb , HttpRequest *hr ) {
char **c = getUtf8Content();

@ -51,7 +51,6 @@
#include "HttpMime.h" // ET_DEFLAT
#include "Json.h"
#include "Posdb.h"
#include "SiteDefaultPageTemperatureRemoteRegistry.h" //SiteDefaultPageTemperatureRemoteRegistry::lookup_result_t
// forward declaration
@ -282,7 +281,6 @@ public:
bool set2 ( char *titleRec,
int32_t maxSize,
const char *coll,
SafeBuf *p,
int32_t niceness ,
class SpiderRequest *sreq = NULL );
@ -338,7 +336,6 @@ public:
bool *checkBlockList();
unsigned *getDefaultSitePageTemperature();
static void gotDefaultSitePageTemperature(void *ctx, unsigned siteDefaultPageTemperature, SiteDefaultPageTemperatureRemoteRegistry::lookup_result_t result);
bool *parseRobotsMetaTag();
void parseRobotsMetaTagContent(const char *content, int32_t contentLen);
@ -582,14 +579,10 @@ public:
bool printDocForProCog ( class SafeBuf *sb , HttpRequest *hr ) ;
bool printGeneralInfo ( class SafeBuf *sb , HttpRequest *hr ) ;
bool printRainbowSections ( class SafeBuf *sb , HttpRequest *hr );
bool printSiteInlinks ( class SafeBuf *sb , HttpRequest *hr );
bool printPageInlinks ( class SafeBuf *sb , HttpRequest *hr );
bool printTermList ( class SafeBuf *sb , HttpRequest *hr );
bool printSpiderStats ( class SafeBuf *sb , HttpRequest *hr );
bool printCachedPage ( class SafeBuf *sb , HttpRequest *hr );
void printTermList() const;
char *getTitleBuf ( );
char *getRootTitleBuf ( );
char *getFilteredRootTitleBuf ( );
@ -1098,10 +1091,6 @@ public:
HashTableX m_wtsTable;
SafeBuf m_wbuf;
// Msg25.cpp stores its pageparser.cpp output into this one
SafeBuf m_pageLinkBuf;
SafeBuf m_siteLinkBuf;
// which set() function was called above to set us?
bool m_setFromTitleRec;
bool m_setFromSpiderRec;
@ -1139,7 +1128,8 @@ public:
bool m_checkedIpBlockList;
unsigned m_defaultSitePageTemperature;
bool m_defaultSitePageTemperatureIsUnset;
bool m_calledServiceSiteMedianPageTemperature;
bool m_parsedRobotsMetaTag;
bool m_robotsNoIndex;
bool m_robotsNoFollow;
@ -1188,6 +1178,8 @@ public:
void logQueryTimingEnd(const char* function, int64_t startTime);
void callCallback();
bool m_calledServiceSiteNumInlinks;
};
// . PageParser.cpp uses this class for printing hashed terms out by calling

@ -366,8 +366,7 @@ static bool isWebstatisticsPage(const Xml *xml) {
// . otherwise, each outlink in "links" is assigned a "note" to indicate if
// the outlink is a spam link or not
// . returns true on success, false on error
bool setLinkSpam ( int32_t ip ,
const Url *linker ,
bool setLinkSpam (const Url *linker ,
int32_t siteNumInlinks ,
Xml *xml ,
Links *links ,
@ -614,9 +613,7 @@ bool setLinkSpam ( int32_t ip ,
bool isLinkSpam ( const Url *linker,
int32_t ip ,
int32_t siteNumInlinks ,
//TitleRec *tr,
Xml *xml,
Links *links ,
int32_t maxDocLen ,
@ -631,11 +628,10 @@ bool isLinkSpam ( const Url *linker,
int32_t h1len = linkee->getHostLen();
const char *h2 = linker->getHost();
int32_t h2len = linker->getHostLen();
//if ( tr ) h2 = tr->getUrl()->getHost();
//if ( tr ) h2len = tr->getUrl()->getHostLen();
if ( h1len == h2len && strncmp ( h1 , h2 , h1len ) == 0 )
return false;
}
// do not allow .info or .biz to vote ever for now
const char *tld = linker->getTLD();
int32_t tldLen = linker->getTLDLen();
@ -671,9 +667,6 @@ bool isLinkSpam ( const Url *linker,
// do not allow any cgi url to vote
if ( linker->isCgi() ) { *note = "path is cgi"; return true; }
// if the page has just one rel=nofollow tag then we know they
// are not a guestbook
//if ( links->hasRelNoFollow() ) plen = 0;
if(isLinkfulPath(linker->getPath(),linker->getPathLen(),note))
return true;

@ -8,15 +8,13 @@
class Url;
bool setLinkSpam ( int32_t ip ,
const Url *linker ,
bool setLinkSpam (const Url *linker ,
int32_t siteNumInlinks ,
class Xml *xml ,
class Links *links ,
bool isContentTruncated );
bool isLinkSpam ( const Url *linker ,
int32_t ip ,
int32_t siteNumInlinks ,
class Xml *xml ,
class Links *links ,

@ -105,6 +105,9 @@
#include "IpBlockList.h"
#include "SpiderdbSqlite.h"
#include "QueryLanguage.h"
#include "SiteNumInlinks.h"
#include "ContentMatchList.h"
#include "SiteMedianPageTemperature.h"
#include "Lemma.h"
@ -438,6 +441,12 @@ int main2 ( int argc , char *argv[] ) {
//initialize IP address checks
initialize_ip_address_checks();
// Make sure TLD table is initializing before calling any URL handling function
if(!initializeDomains(g_hostdb.m_dir)) {
log( LOG_ERROR, "Domains initialization failed!" );
return 1;
}
// load up hosts.conf
// . it will determine our hostid based on the directory path of this
@ -1239,11 +1248,6 @@ int main2 ( int argc , char *argv[] ) {
log( LOG_ERROR, "Wiki initialization failed!" );
return 1;
}
if(!initializeDomains(g_hostdb.m_dir)) {
log( LOG_ERROR, "Domains initialization failed!" );
return 1;
}
// shout out if we're in read only mode
if ( g_conf.m_readOnlyMode )
@ -1310,9 +1314,12 @@ int main2 ( int argc , char *argv[] ) {
g_dnsBlockList.init();
g_contentTypeBlockList.init();
g_ipBlockList.init();
g_contentRetryProxyList.init();
g_urlBlackList.init();
g_urlWhiteList.init();
g_urlProxyList.init();
g_urlRetryProxyList.init();
g_robotsCheckList.init();
@ -1461,6 +1468,8 @@ int main2 ( int argc , char *argv[] ) {
// initialize clients
g_urlRealtimeClassification.initialize();
g_queryLanguage.initialize();
g_siteNumInlinks.initialize();
g_siteMedianPageTemperature.initialize();
if(!WantedChecker::initialize())
return 0;
@ -2448,7 +2457,7 @@ void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool
xd->reset();
// uncompress the title rec
//TitleRec tr;
if ( ! xd->set2 ( rec , recSize , coll ,NULL , 0 ) ) {
if (!xd->set2(rec, recSize, coll, 0)) {
//set2() may have logged something but not the docid
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
continue;
@ -3373,7 +3382,7 @@ static void dumpUnwantedTitledbRecs(const char *coll, int32_t startFileNum, int3
xd->reset();
// uncompress the title rec
if ( ! xd->set2 ( rec , recSize , coll ,NULL , 0 ) ) {
if (!xd->set2(rec, recSize, coll, 0)) {
//set2() may have logged something but not the docid
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
continue;
@ -3547,7 +3556,7 @@ static void dumpWantedTitledbRecs(const char *coll, int32_t startFileNum, int32_
xd->reset();
// uncompress the title rec
if ( ! xd->set2 ( rec , recSize , coll ,NULL , 0 ) ) {
if (!xd->set2(rec, recSize, coll, 0)) {
//set2() may have logged something but not the docid
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
continue;
@ -3688,7 +3697,7 @@ static void dumpAdultTitledbRecs(const char *coll, int32_t startFileNum, int32_t
xd->reset();
// uncompress the title rec
if ( ! xd->set2 ( rec , recSize , coll ,NULL , 0 ) ) {
if (!xd->set2(rec, recSize, coll, 0)) {
//set2() may have logged something but not the docid
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
continue;
@ -3868,7 +3877,7 @@ static void dumpSpamTitledbRecs(const char *coll, int32_t startFileNum, int32_t
xd->reset();
// uncompress the title rec
if ( ! xd->set2 ( rec , recSize , coll ,NULL , 0 ) ) {
if (!xd->set2(rec, recSize, coll, 0)) {
//set2() may have logged something but not the docid
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
continue;
@ -3974,7 +3983,7 @@ static bool parseTest(const char *coll, int64_t docId, const char *query) {
char *rec = tlist.getCurrentRec();
int32_t listSize = tlist.getListSize ();
XmlDoc xd;
if ( ! xd.set2 ( rec , listSize , coll , NULL , 0 ) ) {
if (!xd.set2(rec, listSize, coll, 0)) {
log(LOG_WARN, "build: speedtestxml: Error setting xml doc.");
return false;
}
@ -3999,7 +4008,7 @@ static bool parseTest(const char *coll, int64_t docId, const char *query) {
// speed test
int64_t t = gettimeofdayInMilliseconds();
for ( int32_t k = 0 ; k < 100 ; k++ )
xd.set2 (rec, listSize, coll , NULL , 0 );
xd.set2(rec, listSize, coll, 0);
int64_t e = gettimeofdayInMilliseconds();
logf(LOG_DEBUG,"build: Took %.3f ms to set title rec.",
(float)(e-t)/100.0);
@ -4226,7 +4235,7 @@ static bool summaryTest1(char *rec, int32_t listSize, const char *coll, int64_t
// loop parse
for ( int32_t i = 0 ; i < 100 ; i++ ) {
XmlDoc xd;
if( !xd.set2 (rec, listSize, coll,NULL,0) ) {
if (!xd.set2(rec, listSize, coll, 0)) {
log(LOG_ERROR,"%s:%s: XmlDoc.set2 failed", __FILE__, __func__);
return false;
}
@ -4915,7 +4924,7 @@ static void countdomains(const char* coll, int32_t numRecs, int32_t output) {
}
XmlDoc xd;
if ( ! xd.set2 (rec, recSize, coll,NULL,0) )
if (!xd.set2(rec, recSize, coll, 0))
continue;
struct ip_info *sipi ;

@ -14,9 +14,14 @@ fi
echo "===Making signature"
$bd/sto_convert.py signature --output_file="$2" || exit
for input_file in $1/STO_LMF_morphology_{adj,noun,pronoun,rest,verb}*.xml; do
echo "===Processing $input_file"
$bd/sto_convert.py convert --input_file=$input_file --output_file=$2 || exit
done
echo "===Done"
#is it the original STO files, or have they been split into lexical entries?
# ($1 = input directory, $2 = output file; the split layout is detected by
# the presence of noun/ and verb/ subdirectories)
if [ -d $1/noun -a -d $1/verb ]; then
# split layout: hand the whole directory tree to the converter in one pass
$bd/sto_convert.py convert --input_tree=$1 --output_file=$2 || exit
else
# original layout: convert each monolithic LMF morphology XML file in turn
for input_file in $1/STO_LMF_morphology_{adj,noun,pronoun,rest,verb}*.xml; do
echo "===Processing $input_file"
$bd/sto_convert.py convert --input_file=$input_file --output_file=$2 || exit
done
echo "===Done"
fi
exit 0

@ -1,9 +1,15 @@
#!/usr/bin/python3
#!/usr/bin/python2
from __future__ import print_function
import xml.etree.ElementTree
import struct
import argparse
import sys
import os
#hack to make utf-8 values work
import sys
reload(sys)
sys.setdefaultencoding("utf_8")
part_of_speech_map={
"adjective":1,
@ -81,104 +87,144 @@ word_form_attribute_map={
}
def do_convert(input_file_name, output_file):
total_entry_count = None
total_wordform_count = None
# Diagnostics collected while converting; both map lexical-entry id -> text.
warnings = {}
skips = {}

def emit_warning(id, what):
    """Remember a non-fatal problem seen while converting entry 'id'."""
    warnings[id] = what

def emit_skip(id, why):
    """Remember that entry 'id' was not converted, and the reason why."""
    skips[id] = why
# Convert one LMF <LexicalEntry> element into a binary record and append it
# to output_file.  Entries lacking a partOfSpeech or morphologicalUnitId,
# or containing a WordForm with more than 6 <feat> attributes, are skipped
# (recorded via emit_skip).  A WordForm with no <feat> attributes is still
# converted, but recorded via emit_warning.  Unknown attribute values abort
# the whole run (sys.exit(2)) so the mapping tables can be extended.
# NOTE(review): leading indentation was lost in this rendering; nesting is
# implied by the control-flow keywords.
def process_lexcial_entry(lexicalentry,output_file):
global total_entry_count, total_wordform_count
# scan the entry-level <feat> elements for the three attributes we need
part_of_speech=None
id=None
morphological_unit_id=None
for feat in lexicalentry.findall("feat"):
att=feat.attrib["att"]
val=feat.attrib["val"]
#print("lexicalentry.feat: att=%s val=%s"%(att,val))
if att=="partOfSpeech":
if val in part_of_speech_map:
part_of_speech = part_of_speech_map[val]
else:
# unknown part of speech is fatal: part_of_speech_map must be extended first
print("Unknown part_of_speech: ",val, file=sys.stderr)
sys.exit(2)
elif att=="id":
id=val
elif att=="morphologicalUnitId":
morphological_unit_id=val
#todo:decomposition
if part_of_speech==None:
emit_skip(id,"No partOfSpeech")
return
if morphological_unit_id==None:
emit_skip(id,"No morphologicalUnitId")
return
# build the packed wordform records: six attribute bytes, a length byte,
# then the encoded written form
raw_wordforms = b""
wordform_count = 0
for wordform in lexicalentry.findall("WordForm"):
attributes=[]
for feat in wordform.findall("feat"):
att=feat.attrib["att"]
val=feat.attrib["val"]
#print("wordform.feat: att=%s val=%s"%(att,val))
s=att+"_"+val
if s in word_form_attribute_map:
attributes.append(word_form_attribute_map[s])
else:
# unknown wordform attribute is fatal, like an unknown part of speech
print("Entry %s: Unknown wordform feat: %s"%(id,s),file=sys.stderr)
sys.exit(2)
if len(attributes)==0:
emit_warning(id,"No <feat> attributes")
#happens for a few entries such as "Chippendale". We convert it anyway because at least we know the part-of-speech
if len(attributes)>6:
emit_skip(id,"Too many <feat>")
return
# pad the attribute list to exactly six bytes
while len(attributes)<6:
attributes.append(0)
# each FormRepresentation contributes one packed wordform record
for formrepresentation in wordform.findall("FormRepresentation"):
writtenform=None
for feat in formrepresentation.findall("feat"):
att=feat.attrib["att"]
val=feat.attrib["val"]
if att=="writtenForm":
writtenform=val
# NOTE(review): a FormRepresentation without a writtenForm feat would
# raise AttributeError on the encode() below — confirm that cannot occur
raw_writtenform = writtenform.encode()
raw_wordform = struct.pack(">BBBBBB",attributes[0],attributes[1],attributes[2],attributes[3],attributes[4],attributes[5]) \
+ struct.pack(">B",len(raw_writtenform)) \
+ raw_writtenform
wordform_count += 1
raw_wordforms += raw_wordform
# entry header: part_of_speech, a constant 1, the id length and the
# wordform count, followed by the id bytes and the packed wordforms
raw_morphological_unit_id = morphological_unit_id.encode()
raw_entry = struct.pack(">BBBB",part_of_speech,1,len(raw_morphological_unit_id),wordform_count) + raw_morphological_unit_id + raw_wordforms
output_file.write(raw_entry)
total_entry_count += 1
total_wordform_count += wordform_count
def do_convert_lexicon_file(input_file_name, output_file):
print("Opening and parsing %s"%(input_file_name))
tree = xml.etree.ElementTree.parse(input_file_name)
root = tree.getroot()
lexicon=root.find("Lexicon")
global total_entry_count, total_wordform_count
total_entry_count=0
total_wordform_count=0
for lexicalentry in lexicon.findall("LexicalEntry"):
part_of_speech=None
id=None
morphological_unit_id=None
for feat in lexicalentry.findall("feat"):
att=feat.attrib["att"]
val=feat.attrib["val"]
#print("lexicalentry.feat: att=%s val=%s"%(att,val))
if att=="partOfSpeech":
if val in part_of_speech_map:
part_of_speech = part_of_speech_map[val]
else:
print("Unknown part_of_speech: ",val, file=sys.stderr)
sys.exit(2)
elif att=="id":
id=val
elif att=="morphologicalUnitId":
morphological_unit_id=val
#todo:decomposition
if part_of_speech==None:
print("Entry %s doesn't have partOfSpeech"%id, file=sys.stderr)
if morphological_unit_id==None:
print("Entry %s doesn't have morphologicalUnitId"%id, file=sys.stderr)
sys.exit(2)
raw_wordforms = b""
wordform_count = 0
for wordform in lexicalentry.findall("WordForm"):
attributes=[]
for feat in wordform.findall("feat"):
att=feat.attrib["att"]
val=feat.attrib["val"]
#print("wordform.feat: att=%s val=%s"%(att,val))
s=att+"_"+val
if s in word_form_attribute_map:
attributes.append(word_form_attribute_map[s])
else:
print("Entry %s: Unknown wordform feat: %s"%(id,s),file=sys.stderr)
sys.exit(2)
if len(attributes)==0:
print("Entry %s: No feat?"%(id),file=sys.stderr)
#happens for a few entries such as "Chippendale". We convert it anyway beucase at least we know the part-of-speech
#sys.exit(2)
if len(attributes)>6:
print("Entry %s: Too many feat (%d)"%(id,len(attributes)),file=sys.stderr)
sys.exit(2)
while len(attributes)<6:
attributes.append(0)
for formrepresentation in wordform.findall("FormRepresentation"):
writtenform=None
for feat in formrepresentation.findall("feat"):
att=feat.attrib["att"]
val=feat.attrib["val"]
if att=="writtenForm":
writtenform=val
raw_writtenform = writtenform.encode()
raw_wordform = struct.pack(">BBBBBB",attributes[0],attributes[1],attributes[2],attributes[3],attributes[4],attributes[5]) \
+ struct.pack(">B",len(raw_writtenform)) \
+ raw_writtenform
wordform_count += 1
raw_wordforms += raw_wordform
raw_morphological_unit_id = morphological_unit_id.encode()
raw_entry = struct.pack(">BBBB",part_of_speech,1,len(raw_morphological_unit_id),wordform_count) + raw_morphological_unit_id + raw_wordforms
output_file.write(raw_entry)
total_entry_count += 1
total_wordform_count += wordform_count
process_lexcial_entry(lexicalentry,output_file)
print("Done")
print("\tlexical entries: %d"%total_entry_count)
print("\twordforms: %d"%total_wordform_count)
def do_convert_lexcialentry_file(input_file_name,output_file):
    """Parse an XML file whose root element is a single lexical entry and convert it.

    Used for the split-STO layout where every lexical entry lives in its own file.
    """
    print("%s:"%input_file_name)
    tree = xml.etree.ElementTree.parse(input_file_name)
    root = tree.getroot()
    process_lexcial_entry(root,output_file)
def do_convert_tree(input_tree_name, output_file):
    """Recursively walk input_tree_name and convert every *.xml file found.

    Each XML file is expected to contain a single lexical entry (the split-STO layout).
    Resets and then prints the module-level entry/wordform counters.
    """
    global total_entry_count, total_wordform_count
    total_entry_count=0
    total_wordform_count=0
    for (dirpath,dirnames,filenames) in os.walk(input_tree_name):
        for filename in filenames:
            #endswith()/os.path.join() instead of manual slicing and "/" concatenation
            if filename.endswith(".xml"):
                full_file_name = os.path.join(dirpath,filename)
                do_convert_lexcialentry_file(full_file_name,output_file)
    print("Done")
    print("\tlexical entries: %d"%total_entry_count)
    print("\twordforms: %d"%total_wordform_count)
parser = argparse.ArgumentParser(description="STO converter")
parser.add_argument("-i","--input_file",type=str)
parser.add_argument("-i","--input_file",type=str,default=None)
parser.add_argument("-I","--input_tree",type=str,default=None)
parser.add_argument("-o","--output_file",type=str,required=True)
parser.add_argument("command",type=str,default="convert",nargs='?',choices=["convert","signature"])
args=parser.parse_args()
if args.command=="signature" and args.input_file:
print("input_file cannot be specified when generating signature", file=sys.stderr)
if args.command=="signature" and (args.input_file!=None or args.input_tree!=None):
print("input_file/input_tree cannot be specified when generating signature", file=sys.stderr)
sys.exit(1)
if args.command=="convert" and (not args.input_file):
print("input_file and output_file must be specified when generating converting", file=sys.stderr)
if args.command=="convert" and args.input_file==None and args.input_tree==None:
print("input_file/input_tree and output_file must be specified when generating converting", file=sys.stderr)
sys.exit(1)
@ -188,10 +234,23 @@ if args.command=="signature":
version_1_signature = ("parsed-sto-v2\n"+'\0'*80)[0:80]
output_file.write(version_1_signature.encode())
elif args.command=="convert":
do_convert(args.input_file,output_file)
if args.input_file:
do_convert_lexicon_file(args.input_file,output_file)
else:
do_convert_tree(args.input_tree,output_file)
else:
print("argh...", file=sys.stderr)
sys.exit(99)
output_file.close()
if len(warnings)>0:
print("===Warnings:", file=sys.stderr)
for (k,v) in warnings.iteritems():
print("%s: %s"%(k,v), file=sys.stderr)
if len(skips)>0:
print("===Skips:", file=sys.stderr)
for (k,v) in skips.iteritems():
print("%s: %s"%(k,v), file=sys.stderr)
sys.exit(0)

@ -230,6 +230,7 @@ static void remove_combining_marks_norwegian(TokenizerResult *tr);
static void remove_combining_marks_swedish(TokenizerResult *tr);
static void remove_combining_marks_german(TokenizerResult *tr);
static void remove_combining_marks_swiss_german(TokenizerResult *tr);
static void remove_combining_marks_italian(TokenizerResult *tr);
static void remove_some_combining_marks(TokenizerResult *tr, const UChar32 native_marked_letters[], size_t native_marked_letters_count);
@ -250,6 +251,9 @@ static void remove_combining_marks(TokenizerResult *tr, lang_t lang, const char
else
remove_combining_marks_swiss_german(tr);
return;
case langItalian:
remove_combining_marks_italian(tr);
break;
default:
break;
}
@ -333,6 +337,37 @@ static void remove_combining_marks_swiss_german(TokenizerResult *tr) {
}
//Combining marks in Italian:
// - grave àèìòù Mandatory for lowercase. Dedicated keys on keyboard
// - acute é Mandatory for lowercase. Dedicated keys on keyboard
// - cedilla ç Non-native. Dedicated key on keyboard - lowercase only
//Swiss-Italian keyboard has access to umlaut.
//Major problem is that none of the three Italian keyboard layouts have easy access to uppercase accented letters, so the accents are frequently
//omitted or typed as apostrophe. More discussion here: https://italian.stackexchange.com/questions/3878/how-do-italians-customarily-insert-uppercase-italian-vowels-with-diacritics-with
//So one way to deal with this is to just remove all diacritics in both document and query, but that would lose precision. But given that most documents have been run through word
//processing software the documents are mostly written correctly, and that when users type queries they rarely use uppercase so the accents are probably also typed correctly there.
//So we keep the native and easily accessible marks. Then on a later date we should detect the incorrect forms and fix them (requires a dictionary though).
static void remove_combining_marks_italian(TokenizerResult *tr) {
	//Diacritic-marked letters that are native to Italian (or reachable on the Italian keyboard
	//layouts) and must therefore be preserved; combining marks on all other letters are stripped.
	static const UChar32 native_marked_letters[] = {
		0x00C0, //À (grave)
		0x00C8, //È
		0x00CC, //Ì
		0x00D2, //Ò
		0x00D9, //Ù
		0x00E0, //à
		0x00E8, //è
		0x00EC, //ì
		0x00F2, //ò
		0x00F9, //ù
		0x00C9, //É (acute)
		0x00E9, //é
		0x00C7, //Ç (cedilla, non-native but present on the keyboard)
		0x00E7, //ç
	};
	static const size_t native_marked_letters_count = sizeof(native_marked_letters)/sizeof(*native_marked_letters);
	remove_some_combining_marks(tr, native_marked_letters, native_marked_letters_count);
}
//Remove combining marks form the codepoints except for the native marked letters
static void remove_some_combining_marks(TokenizerResult *tr, const UChar32 native_marked_letters[], size_t native_marked_letters_count) {
const size_t org_token_count = tr->size();

@ -609,6 +609,49 @@ int main(void) {
assert(t.str(6)=="Noel");
}
//italian diacritics
printf("Test line %d\n",__LINE__);
{
T2 t("aaa bbb",langItalian);
assert(t.token_count()==3);
}
printf("Test line %d\n",__LINE__);
{
T2 t("Ragù",langItalian);
assert(t.token_count()==1);
assert(t.str(0)=="Ragù");
}
printf("Test line %d\n",__LINE__);
{
T2 t("àèìòùéç",langItalian);
assert(t.token_count()==1);
assert(t.str(0)=="àèìòùéç");
}
printf("Test line %d\n",__LINE__);
{
T2 t("ÀÈÌÒÙÉÇ",langItalian);
assert(t.token_count()==1);
assert(t.str(0)=="ÀÈÌÒÙÉÇ");
}
printf("Test line %d\n",__LINE__);
{
T2 t("monaco münchen",langItalian);
assert(t.token_count()==4);
assert(t.str(3)=="munchen");
}
printf("Test line %d\n",__LINE__);
{
T2 t("Eskişehir",langItalian);
assert(t.token_count()==2);
assert(t.str(1)=="Eskisehir");
}
//diacritics hands-off
printf("Test line %d\n",__LINE__);
{

@ -98,6 +98,12 @@ param bad xyz
# allows: www.example.com/en/wp-admin
path /wp-admin
# block partial path
# blocks: www.example.com/badpath
# blocks: www.example.com/en/badpath
# blocks: www.example.com/badpath/subpath
pathpartial /badpath
# regex example
# =============
# blocks url by regex

@ -16,6 +16,7 @@ struct WordVariationWeights {
float verb_spelling_variants;
float verb_past_past_variants;
float simple_spelling_variants; //simple variants, eg "cyklen" vs. "cykelen"
float adjective_grammatical_gender_simplification;
//todo: more configurable weights in WordVariationWeights
WordVariationWeights()
: noun_indefinite_definite(1.0),
@ -25,7 +26,8 @@ struct WordVariationWeights {
proper_noun_spelling_variants(1.0),
verb_spelling_variants(1.0),
verb_past_past_variants(1.0),
simple_spelling_variants(1.0)
simple_spelling_variants(1.0),
adjective_grammatical_gender_simplification(1.0)
{}
};

@ -6,6 +6,9 @@
namespace {
enum noun_or_verb_t { noun, verb, whatever };
class WordVariationGenerator_danish : public STOWordVariationGenerator {
public:
WordVariationGenerator_danish()
@ -41,10 +44,40 @@ public:
const std::vector<std::string> &source_words,
const std::vector<std::string> &lower_source_words,
float weight);
void handle_adjective_grammatical_gender_simplification(std::vector<WordVariationGenerator::Variation> &variations,
const std::vector<std::string> &source_words,
const std::vector<std::string> &lower_source_words,
float weight);
};
static WordVariationGenerator_danish s_WordVariationGenerator_danish;
//class for handling unknown compound words
class LogicalMatches {
	std::vector<const sto::LexicalEntry *> actual_matches; //lexicon matches for the (suffix of the) word
	std::string prefix; //prefix if a compound word
	std::string suffix; //suffix of compound word, or whole word if non-compound
public:
	//Looks up source_word in the lexicon; on no match, falls back to the longest known suffix
	//(compound-word handling) and queries the lexicon for that suffix instead.
	LogicalMatches(const sto::Lexicon &lexicon, const std::string &source_word, noun_or_verb_t noun_or_verb);
	bool empty() const { return actual_matches.empty(); }
	size_t size() const { return actual_matches.size(); }
	const sto::LexicalEntry * operator[](size_t i) const { return actual_matches[i]; }
	std::vector<const sto::LexicalEntry *>::const_iterator begin() const { return actual_matches.begin(); }
	std::vector<const sto::LexicalEntry *>::const_iterator end() const { return actual_matches.end(); }
	//The word (or compound suffix) the matches actually refer to
	const std::string query_matched_word() const { return suffix; }
	//Reattach the compound prefix (empty for non-compounds) to a wordform's written form
	std::string query_logical_written_form(const sto::WordForm *wf) const {
		return prefix + std::string(wf->written_form,wf->written_form_length);
	}
private:
	std::string find_compound_word_longest_known_suffix(const sto::Lexicon &lexicon, const std::string &source_word, noun_or_verb_t noun_or_verb);
};
} //anonymous namespace
@ -55,6 +88,64 @@ bool initializeWordVariationGenerator_Danish() {
//Look the word up directly; when unknown, retry with its longest lexicon-known suffix so that
//unknown compounds (eg. "købepizza") still yield matches for their head word.
LogicalMatches::LogicalMatches(const sto::Lexicon &lexicon, const std::string &source_word, noun_or_verb_t noun_or_verb)
  : actual_matches(lexicon.query_matches(source_word)),
    prefix(""),
    suffix(source_word)
{
	if(!actual_matches.empty())
		return; //word is known as-is
	//unknown word: see if it looks like a compound with a known suffix
	const std::string known_suffix = find_compound_word_longest_known_suffix(lexicon,source_word,noun_or_verb);
	if(known_suffix.empty())
		return; //no usable suffix either; stay empty
	prefix = source_word.substr(0,source_word.length()-known_suffix.length());
	suffix = known_suffix;
	actual_matches = lexicon.query_matches(suffix);
}
//Find the longest suffix that has a match in STO.
//This is useful for identifying the (linguistic-)head in unknown compound words which in Danish usually is the last stem in the word
//Eg sneglemassakre, hvidvaske, købepizza
//Find the longest suffix of source_word that the lexicon knows with a compatible part-of-speech.
//Returns the suffix, or "" when the word is too short, contains non-letter ASCII, or no suffix matches.
std::string LogicalMatches::find_compound_word_longest_known_suffix(const sto::Lexicon &lexicon, const std::string &source_word, noun_or_verb_t noun_or_verb) {
	//we insist on at least 4 letters, although that would make us not find "udu"
	if(source_word.length()<4)
		return "";
	//We only do it on normal words without punctuation, special characters etc.
	//Properly checking codepoint-is-alphabetic would require linking in libunicode etc., adding to the
	//linking complexity, so just hack it here: every ASCII byte must be a plain letter; bytes >=0x80
	//are parts of UTF-8 sequences (eg. æøå) and are accepted as-is.
	//Bugfix: the old test 'c<(char)128' on a plain char was always false where char is signed, so the
	//letters-only filter never ran there; compare via unsigned char instead.
	for(size_t i=0; i<source_word.length(); i++) {
		unsigned char c = (unsigned char)source_word[i];
		if(c<128 &&
		   !((c>='A' && c<='Z') || (c>='a' && c<='z')))
			return "";
	}
	//(todo) extend minimum suffix length if the word ends with a common suffix that isn't an independent word, eg -skab, -inde, -isme. Complication is that the suffixes are also inflected/declined, eg. -skabernes
	size_t source_length = source_word.length();
	//Try suffixes from longest to shortest (minimum 2 chars); accept the first lexicon hit whose
	//part-of-speech matches what the caller asked for.
	for(size_t suffix_length = source_length-1; suffix_length>=2; suffix_length--) {
		std::string candidate_suffix(source_word, source_length-suffix_length);
		auto matches(lexicon.query_matches(candidate_suffix));
		if(!matches.empty()) {
			for(auto match : matches) {
				if(noun_or_verb==whatever)
					return candidate_suffix;
				if(noun_or_verb==noun && match->part_of_speech==sto::part_of_speech_t::commonNoun)
					return candidate_suffix;
				if(noun_or_verb==verb && (match->part_of_speech==sto::part_of_speech_t::deponentVerb || match->part_of_speech==sto::part_of_speech_t::mainVerb))
					return candidate_suffix;
			}
			//The longest known suffix has an incompatible part-of-speech; give up rather than
			//falling back to a shorter, likely spurious suffix.
			return "";
		}
	}
	return "";
}
std::vector<WordVariationGenerator::Variation> WordVariationGenerator_danish::query_variations(const std::vector<std::string> &source_words, const WordVariationWeights& weights, float threshold) {
std::vector<std::string> lower_source_words(lower_words(source_words));
std::vector<WordVariationGenerator::Variation> variations;
@ -91,6 +182,10 @@ std::vector<WordVariationGenerator::Variation> WordVariationGenerator_danish::qu
find_simple_attribute_match_wordforms(variations,lower_source_words,weights.simple_spelling_variants);
}
if(weights.adjective_grammatical_gender_simplification >= threshold) {
handle_adjective_grammatical_gender_simplification(variations,source_words,lower_source_words, weights.simple_spelling_variants);
}
//currently inactive because Query.cpp/PosdbTable.cpp cannot handle wordvariations spanning more than one word
//make_proper_noun_part_genetive(variations,source_words,lower_source_words,1.2);
@ -121,7 +216,7 @@ static uint64_t wordformattrs2bitmask(const sto::WordForm &wf) {
static bool same_wordform_as_source(const sto::WordForm &wf, const std::string source_word) {
return wf.written_form_length==source_word.length() &&
memcmp(wf.written_form,source_word.data(),source_word.length())==0;
memcmp(wf.written_form,source_word.data(),source_word.length())==0;
}
@ -132,11 +227,11 @@ void WordVariationGenerator_danish::find_simple_attribute_difference_wordforms(s
{
for(unsigned i=0; i<source_words.size(); i++) {
auto source_word(source_words[i]);
auto matches(lexicon.query_matches(source_word));
LogicalMatches matches(lexicon,source_word,noun);
for(auto match : matches) {
auto wordforms(match->query_all_explicit_word_forms());
for(auto wordform : wordforms) {
if(same_wordform_as_source(*wordform,source_word) &&
if(same_wordform_as_source(*wordform,matches.query_matched_word()) &&
wordform->has_attribute(from_attr))
{
uint64_t source_word_bitmask = wordformattrs2bitmask(*wordform);
@ -148,7 +243,7 @@ void WordVariationGenerator_danish::find_simple_attribute_difference_wordforms(s
//found the other form of the word.
//this may match multiple alternative spellings of the wordform, but the STO database cannot distinguish
Variation v;
v.word.assign(definite_wordform->written_form,definite_wordform->written_form_length);
v.word = matches.query_logical_written_form(definite_wordform);
v.weight = weight;
v.source_word_start = i;
v.source_word_end = i+1;
@ -174,11 +269,11 @@ void WordVariationGenerator_danish::find_simple_attribute_match_wordforms(std::v
{
for(unsigned i=0; i<source_words.size(); i++) {
auto source_word(source_words[i]);
auto matches(lexicon.query_matches(source_word));
LogicalMatches matches(lexicon,source_word,whatever);
for(auto match : matches) {
auto wordforms(match->query_all_explicit_word_forms());
for(auto wordform : wordforms) {
if(same_wordform_as_source(*wordform,source_word)) {
if(same_wordform_as_source(*wordform,matches.query_matched_word())) {
//found the word form match. Now look for other wordforms with exactly the same attributes. Those are alternate spellings.
//so first find all lexical entries with the same morphological unit id, and check all wordforms of those, looking for an attribute match
auto same_morph_entries = lexicon.query_lexical_entries_with_same_morphological_unit_id(match);
@ -188,7 +283,7 @@ void WordVariationGenerator_danish::find_simple_attribute_match_wordforms(std::v
if(wordform2!=wordform && has_same_attributes(wordform,wordform2)) {
//found an alternative spelling of the word
Variation v;
v.word.assign(wordform2->written_form,wordform2->written_form_length);
v.word = matches.query_logical_written_form(wordform2);
v.weight = weight;
v.source_word_start = i;
v.source_word_end = i+1;
@ -332,11 +427,11 @@ void WordVariationGenerator_danish::transliterate_verb_acute_accent(std::vector<
if(source_word.length()>4 && source_word.substr(source_word.length()-2)=="er") {
//possibly a verb in imperative
bool is_imperative = false;
auto matches(lexicon.query_matches(source_word));
LogicalMatches matches(lexicon,source_word,verb);
for(auto match : matches) {
auto wordforms(match->query_all_explicit_word_forms());
for(auto wordform : wordforms) {
if(same_wordform_as_source(*wordform,source_word) &&
if(same_wordform_as_source(*wordform,matches.query_matched_word()) &&
wordform->has_attribute(sto::word_form_attribute_t::verbFormMood_imperative))
{
is_imperative = true;
@ -379,7 +474,7 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
auto source_word(lower_source_words[i]);
if(source_word==" ")
continue;
auto matches(lexicon.query_matches(source_word));
LogicalMatches matches(lexicon,source_word,verb);
if(prev_was_er || prev_was_var || prev_was_har || prev_was_havde) {
//check if this word is the past participle
const sto::WordForm *wordform_past_participle = NULL;
@ -387,7 +482,7 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
for(auto match : matches) {
auto wordforms(match->query_all_explicit_word_forms());
for(auto wordform : wordforms) {
if(same_wordform_as_source(*wordform,source_word) &&
if(same_wordform_as_source(*wordform,matches.query_matched_word()) &&
wordform->has_attribute(sto::word_form_attribute_t::tense_past) &&
wordform->has_attribute(sto::word_form_attribute_t::verbFormMood_participle))
{
@ -405,7 +500,7 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
//generate preterite
if(wordform_preterite) {
WordVariationGenerator::Variation v0;
v0.word.assign(wordform_preterite->written_form,wordform_preterite->written_form_length);
v0.word = matches.query_logical_written_form(wordform_preterite);
v0.weight = weight;
v0.source_word_start = prev_word_idx;
v0.source_word_end = i+1;
@ -423,7 +518,7 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
//generate preterite
if(wordform_preterite) {
WordVariationGenerator::Variation v0;
v0.word.assign(wordform_preterite->written_form,wordform_preterite->written_form_length);
v0.word = matches.query_logical_written_form(wordform_preterite);
v0.weight = weight;
v0.source_word_start = prev_word_idx;
v0.source_word_end = i+1;
@ -441,7 +536,7 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
//generate preterite
if(wordform_preterite) {
WordVariationGenerator::Variation v0;
v0.word.assign(wordform_preterite->written_form,wordform_preterite->written_form_length);
v0.word = matches.query_logical_written_form(wordform_preterite);
v0.weight = weight;
v0.source_word_start = prev_word_idx;
v0.source_word_end = i+1;
@ -459,7 +554,7 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
//generate preterite
if(wordform_preterite) {
WordVariationGenerator::Variation v0;
v0.word.assign(wordform_preterite->written_form,wordform_preterite->written_form_length);
v0.word = matches.query_logical_written_form(wordform_preterite);
v0.weight = weight;
v0.source_word_start = prev_word_idx;
v0.source_word_end = i+1;
@ -481,7 +576,7 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
for(auto match : matches) {
auto wordforms(match->query_all_explicit_word_forms());
for(auto wordform : wordforms) {
if(same_wordform_as_source(*wordform,source_word) &&
if(same_wordform_as_source(*wordform,matches.query_matched_word()) &&
wordform->has_attribute(sto::word_form_attribute_t::tense_past) &&
wordform->has_attribute(sto::word_form_attribute_t::verbFormMood_indicative) &&
wordform->has_attribute(sto::word_form_attribute_t::voice_activeVoice)) //we'll ignore this complication for now
@ -504,26 +599,26 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
//generate perfect
if(source_word!="var") {
WordVariationGenerator::Variation v0_0;
v0_0.word = "har "+std::string(wordform_past_participle->written_form,wordform_past_participle->written_form_length);
v0_0.word = "har "+matches.query_logical_written_form(wordform_past_participle);
v0_0.weight = weight;
v0_0.source_word_start = i;
v0_0.source_word_end = i+1;
variations.push_back(v0_0);
WordVariationGenerator::Variation v0_1;
v0_1.word = "er "+std::string(wordform_past_participle->written_form,wordform_past_participle->written_form_length);
v0_1.word = "er "+matches.query_logical_written_form(wordform_past_participle);
v0_1.weight = weight;
v0_1.source_word_start = i;
v0_1.source_word_end = i+1;
variations.push_back(v0_1);
//generate pluperfect
WordVariationGenerator::Variation v1_0;
v1_0.word = "havde "+std::string(wordform_past_participle->written_form,wordform_past_participle->written_form_length);
v1_0.word = "havde "+matches.query_logical_written_form(wordform_past_participle);
v1_0.weight = weight;
v1_0.source_word_start = i;
v1_0.source_word_end = i+1;
variations.push_back(v1_0);
WordVariationGenerator::Variation v1_1;
v1_1.word = "var "+std::string(wordform_past_participle->written_form,wordform_past_participle->written_form_length);
v1_1.word = "var "+matches.query_logical_written_form(wordform_past_participle);
v1_1.weight = weight;
v1_1.source_word_start = i;
v1_1.source_word_end = i+1;
@ -531,13 +626,13 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
} else {
//"at være" takes the auxilliary verb "have"
WordVariationGenerator::Variation v0_0;
v0_0.word = "har "+std::string(wordform_past_participle->written_form,wordform_past_participle->written_form_length);
v0_0.word = "har "+matches.query_logical_written_form(wordform_past_participle);
v0_0.weight = weight;
v0_0.source_word_start = i;
v0_0.source_word_end = i+1;
variations.push_back(v0_0);
WordVariationGenerator::Variation v1_0;
v1_0.word = "havde "+std::string(wordform_past_participle->written_form,wordform_past_participle->written_form_length);
v1_0.word = "havde "+matches.query_logical_written_form(wordform_past_participle);
v1_0.weight = weight;
v1_0.source_word_start = i;
v1_0.source_word_end = i+1;
@ -577,13 +672,13 @@ void WordVariationGenerator_danish::make_proper_noun_part_genetive(std::vector<W
continue;
//find noun
auto matches(lexicon.query_matches(source_word0));
LogicalMatches matches(lexicon,source_word0,noun);
const sto::WordForm *wordform_noun = NULL;
for(auto match : matches) {
if(match->part_of_speech==sto::part_of_speech_t::commonNoun) {
auto wordforms(match->query_all_explicit_word_forms());
for(auto wordform : wordforms) {
if(same_wordform_as_source(*wordform,source_word0) &&
if(same_wordform_as_source(*wordform,matches.query_matched_word()) &&
wordform->has_attribute(sto::word_form_attribute_t::case_unspecified))
{
wordform_noun = wordform;
@ -614,10 +709,10 @@ void WordVariationGenerator_danish::make_proper_noun_part_genetive(std::vector<W
auto source_word4_capitalized(capitalize_word(source_word4));
//find proper-noun
matches = lexicon.query_matches(source_word4_capitalized);
auto matches2 = lexicon.query_matches(source_word4_capitalized);
const sto::WordForm *wordform_proper_noun = NULL;
const sto::WordForm *wordform_proper_noun_genitive = NULL;
for(auto match : matches) {
for(auto match : matches2) {
if(match->part_of_speech==sto::part_of_speech_t::properNoun) {
auto wordforms(match->query_all_explicit_word_forms());
for(auto wordform : wordforms) {
@ -640,10 +735,63 @@ void WordVariationGenerator_danish::make_proper_noun_part_genetive(std::vector<W
//transform that into propernoun-genetive noun
WordVariationGenerator::Variation v0_0;
v0_0.word = std::string(wordform_proper_noun_genitive->written_form,wordform_proper_noun_genitive->written_form_length) + " " + std::string(wordform_noun->written_form,wordform_noun->written_form_length);
v0_0.word = std::string(wordform_proper_noun_genitive->written_form,wordform_proper_noun_genitive->written_form_length) + " " + matches.query_logical_written_form(wordform_noun);
v0_0.weight = weight;
v0_0.source_word_start = i;
v0_0.source_word_end = i+5;
variations.push_back(v0_0);
}
}
//Generate the neuter singular-indefinite form of adjectives that appear in the common-gender form.
//Appends the generated forms to 'variations' with the given 'weight'.
void WordVariationGenerator_danish::handle_adjective_grammatical_gender_simplification(std::vector<WordVariationGenerator::Variation> &variations,
	const std::vector<std::string> &source_words,
	const std::vector<std::string> &lower_source_words,
	float weight)
{
	//In Danish there are officially two grammatical genders: common and neuter. Adjectives have to agree when in singular indefinite.
	//However, Western Jutland generally doesn't distinguish. And for objects of abstract nature or non-obvious grammatical gender people don't always follow the rule.
	//So a document may have "Et internationalt marked" but the user searches for "international marked".
	//The opposite can also happen but it is less common.
	//So locate adjectives with gender=common number=singular definiteness=indefinite, find the corresponding wordform for gender=neuter and generate that
	(void)source_words; //only the lowercased words are needed; parameter kept for signature consistency with the other handlers
	for(unsigned i=0; i<lower_source_words.size(); i++) {
		auto source_word0(lower_source_words[i]);
		if(source_word0==" ")
			continue;
		//find adjective
		bool is_common_singular_indefinite = false;
		const sto::WordForm *wordform_neuter_singular_indefinite = NULL;
		LogicalMatches matches(lexicon,source_word0,whatever);
		for(auto match : matches) {
			if(match->part_of_speech!=sto::part_of_speech_t::adjective)
				continue;
			auto wordforms(match->query_all_explicit_word_forms());
			for(auto wordform : wordforms) {
				//remember a neuter singular-indefinite form: the candidate variation to generate
				if(wordform->has_attribute(sto::word_form_attribute_t::grammaticalGender_neuter) &&
				   wordform->has_attribute(sto::word_form_attribute_t::grammaticalNumber_singular) &&
				   wordform->has_attribute(sto::word_form_attribute_t::definiteness_indefinite))
				{
					wordform_neuter_singular_indefinite = wordform;
				}
				//is the source word itself the common-gender singular-indefinite form?
				if(same_wordform_as_source(*wordform,matches.query_matched_word()) &&
				   wordform->has_attribute(sto::word_form_attribute_t::grammaticalGender_commonGender) &&
				   wordform->has_attribute(sto::word_form_attribute_t::grammaticalNumber_singular) &&
				   wordform->has_attribute(sto::word_form_attribute_t::definiteness_indefinite))
				{
					is_common_singular_indefinite = true; //was '= wordform' - an implicit pointer-to-bool conversion
				}
			}
		}
		if(!is_common_singular_indefinite || !wordform_neuter_singular_indefinite)
			continue;
		WordVariationGenerator::Variation v0_0;
		v0_0.word = matches.query_logical_written_form(wordform_neuter_singular_indefinite);
		v0_0.weight = weight;
		v0_0.source_word_start = i;
		v0_0.source_word_end = i+1;
		variations.push_back(v0_0);
	}
}