mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-12 02:26:07 -04:00
Merge branch 'master' into lemma
This commit is contained in:
Conf.cppConf.hContentMatchList.cppContentMatchList.hContentTypeBlockList.cppContentTypeBlockList.hDnsBlockList.cppDnsBlockList.hDocRebuild.cppDocid2Siteflags.cppDomains.cppFxClient.cppHttpServer.cppIpBlockList.cppIpBlockList.hLinkdb.hMakefileMatchList.cppMatchList.hMsg13.cppMsg25.cppMsg25.hPageGet.cppPageParser.cppPageParser.hParms.cppParms.hPosdbTable.cppProcess.cppRdb.cppRepair.cppSiteDefaultPageTemperatureRemoteRegistry.cppSiteDefaultPageTemperatureRemoteRegistry.hSiteMedianPageTemperature.cppSiteMedianPageTemperature.hSiteNumInlinks.cppSiteNumInlinks.hTitledb.cppUrlMatchList.cppUrlMatchList.hXmlDoc.cppXmlDoc.hlinkspam.cpplinkspam.hmain.cpp
sto
tokenizer
urlmatchlist.txt.exampleword_variations
15
Conf.cpp
15
Conf.cpp
@ -65,6 +65,16 @@ Conf::Conf ( ) {
|
||||
m_maxOutstandingQueryLanguage = 0;
|
||||
m_queryLanguageTimeout = 0;
|
||||
|
||||
m_siteMedianPageTemperatureServerName[0] = '\0';
|
||||
m_siteMedianPageTemperatureServerPort = 0;
|
||||
m_maxOutstandingSiteMedianPageTemperature = 0;
|
||||
m_siteMedianPageTemperatureTimeout = 0;
|
||||
|
||||
m_siteNumInlinksServerName[0] = '\0';
|
||||
m_siteNumInlinksServerPort = 0;
|
||||
m_maxOutstandingSiteNumInlinks = 0;
|
||||
m_siteNumInlinksTimeout = 0;
|
||||
|
||||
m_urlClassificationServerName[0] = '\0';
|
||||
m_urlClassificationServerPort = 0;
|
||||
m_maxOutstandingUrlClassifications = 0;
|
||||
@ -233,8 +243,9 @@ Conf::Conf ( ) {
|
||||
m_logDebugUrlAttempts = false;
|
||||
m_logDebugVagus = false;
|
||||
m_logTraceBigFile = false;
|
||||
m_logTraceBlockList = false;
|
||||
m_logTraceMatchList = false;
|
||||
m_logTraceContentTypeBlockList = false;
|
||||
m_logTraceDocid2FlagsAndSiteMap = false;
|
||||
m_logTraceDocProcess = false;
|
||||
m_logTraceDns = false;
|
||||
m_logTraceDnsBlockList = false;
|
||||
@ -270,6 +281,8 @@ Conf::Conf ( ) {
|
||||
m_logTraceSpider = false;
|
||||
m_logTraceSpiderUrlCache = false;
|
||||
m_logTraceReindex = false;
|
||||
m_logTraceSiteMedianPageTemperature = false;
|
||||
m_logTraceSiteNumInlinks = false;
|
||||
m_logTraceSpiderdbRdbSqliteBridge = false;
|
||||
m_logTraceSummary = false;
|
||||
m_logTraceTitledb = false;
|
||||
|
15
Conf.h
15
Conf.h
@ -105,6 +105,16 @@ class Conf {
|
||||
unsigned m_maxOutstandingQueryLanguage;
|
||||
unsigned m_queryLanguageTimeout;
|
||||
|
||||
char m_siteMedianPageTemperatureServerName[64];
|
||||
int32_t m_siteMedianPageTemperatureServerPort;
|
||||
unsigned m_maxOutstandingSiteMedianPageTemperature;
|
||||
unsigned m_siteMedianPageTemperatureTimeout;
|
||||
|
||||
char m_siteNumInlinksServerName[64];
|
||||
int32_t m_siteNumInlinksServerPort;
|
||||
unsigned m_maxOutstandingSiteNumInlinks;
|
||||
unsigned m_siteNumInlinksTimeout;
|
||||
|
||||
char m_urlClassificationServerName[64];
|
||||
int32_t m_urlClassificationServerPort;
|
||||
unsigned m_maxOutstandingUrlClassifications;
|
||||
@ -381,8 +391,9 @@ class Conf {
|
||||
bool m_logDebugVagus;
|
||||
|
||||
bool m_logTraceBigFile;
|
||||
bool m_logTraceBlockList;
|
||||
bool m_logTraceMatchList;
|
||||
bool m_logTraceContentTypeBlockList;
|
||||
bool m_logTraceDocid2FlagsAndSiteMap;
|
||||
bool m_logTraceDocProcess;
|
||||
bool m_logTraceDns;
|
||||
bool m_logTraceDnsBlockList;
|
||||
@ -416,6 +427,8 @@ class Conf {
|
||||
bool m_logTraceRepairs;
|
||||
bool m_logTraceRobots;
|
||||
bool m_logTraceRobotsCheckList;
|
||||
bool m_logTraceSiteMedianPageTemperature;
|
||||
bool m_logTraceSiteNumInlinks;
|
||||
bool m_logTraceSpider;
|
||||
bool m_logTraceSpiderUrlCache;
|
||||
bool m_logTraceReindex;
|
||||
|
41
ContentMatchList.cpp
Normal file
41
ContentMatchList.cpp
Normal file
@ -0,0 +1,41 @@
|
||||
//
|
||||
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as
|
||||
// published by the Free Software Foundation, either version 3 of the
|
||||
// License, or (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
//
|
||||
// License TL;DR: If you change this file, you must publish your changes.
|
||||
//
|
||||
#include "ContentMatchList.h"
|
||||
#include "Log.h"
|
||||
#include "Conf.h"
|
||||
|
||||
ContentMatchList g_contentRetryProxyList;
|
||||
|
||||
static const char s_filename[] = "contentretryproxylist.txt";
|
||||
|
||||
ContentMatchList::ContentMatchList()
|
||||
: MatchList(s_filename) {
|
||||
}
|
||||
|
||||
bool ContentMatchList::isContentMatched(const char *content, size_t contentLen) {
|
||||
auto contentMatchList = getMatchList();
|
||||
|
||||
for (auto const &contentMatch : *contentMatchList) {
|
||||
if (strncasestr(content, contentLen, contentMatch.c_str())) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
32
ContentMatchList.h
Normal file
32
ContentMatchList.h
Normal file
@ -0,0 +1,32 @@
|
||||
//
|
||||
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as
|
||||
// published by the Free Software Foundation, either version 3 of the
|
||||
// License, or (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
//
|
||||
// License TL;DR: If you change this file, you must publish your changes.
|
||||
//
|
||||
#ifndef FX_CONTENTMATCHLIST_H
|
||||
#define FX_CONTENTMATCHLIST_H
|
||||
|
||||
#include "MatchList.h"
|
||||
|
||||
class ContentMatchList : public MatchList<std::string> {
|
||||
public:
|
||||
ContentMatchList();
|
||||
bool isContentMatched(const char *content, size_t contentLen);
|
||||
};
|
||||
|
||||
extern ContentMatchList g_contentRetryProxyList;
|
||||
|
||||
#endif // FX_CONTENTMATCHLIST_H
|
@ -29,7 +29,7 @@ static const char s_contenttype_filename[] = "contenttypeblocklist.txt";
|
||||
static const char s_contenttype_allowed_filename[] = "contenttypeallowed.txt";
|
||||
|
||||
ContentTypeBlockList::ContentTypeBlockList()
|
||||
: BlockList(s_contenttype_filename)
|
||||
: MatchList(s_contenttype_filename)
|
||||
, m_contenttype_allowed()
|
||||
, m_contenttype_allowed_mtx(PTHREAD_MUTEX_INITIALIZER) {
|
||||
}
|
||||
@ -43,7 +43,7 @@ bool ContentTypeBlockList::init() {
|
||||
m_contenttype_allowed.push_back(line);
|
||||
}
|
||||
|
||||
return BlockList::init();
|
||||
return MatchList::init();
|
||||
}
|
||||
|
||||
void ContentTypeBlockList::addContentTypeAllowed(const char *contentType, size_t contentTypeLen) {
|
||||
@ -65,7 +65,7 @@ bool ContentTypeBlockList::isContentTypeBlocked(const char *contentType, size_t
|
||||
return false;
|
||||
}
|
||||
|
||||
auto contentTypeBlockList = getBlockList();
|
||||
auto contentTypeBlockList = getMatchList();
|
||||
|
||||
for (auto const &contentTypeBlock : *contentTypeBlockList) {
|
||||
if (contentTypeBlock.back() == '*') {
|
||||
|
@ -20,11 +20,11 @@
|
||||
#define FX_CONTENTTYPEBLOCKLIST_H
|
||||
|
||||
|
||||
#include "BlockList.h"
|
||||
#include "MatchList.h"
|
||||
#include <pthread.h>
|
||||
#include <vector>
|
||||
|
||||
class ContentTypeBlockList : public BlockList<std::string> {
|
||||
class ContentTypeBlockList : public MatchList<std::string> {
|
||||
public:
|
||||
ContentTypeBlockList();
|
||||
|
||||
|
@ -25,11 +25,11 @@ DnsBlockList g_dnsBlockList;
|
||||
static const char s_dns_filename[] = "dnsblocklist.txt";
|
||||
|
||||
DnsBlockList::DnsBlockList()
|
||||
: BlockList(s_dns_filename) {
|
||||
: MatchList(s_dns_filename) {
|
||||
}
|
||||
|
||||
bool DnsBlockList::isDnsBlocked(const char *dns) {
|
||||
auto dnsBlockList = getBlockList();
|
||||
auto dnsBlockList = getMatchList();
|
||||
|
||||
for (auto const &dnsBlock : *dnsBlockList) {
|
||||
if (dnsBlock.front() == '*') {
|
||||
|
@ -19,9 +19,9 @@
|
||||
#ifndef FX_DNSBLOCKLIST_H
|
||||
#define FX_DNSBLOCKLIST_H
|
||||
|
||||
#include "BlockList.h"
|
||||
#include "MatchList.h"
|
||||
|
||||
class DnsBlockList : public BlockList<std::string> {
|
||||
class DnsBlockList : public MatchList<std::string> {
|
||||
public:
|
||||
DnsBlockList();
|
||||
bool isDnsBlocked(const char *dns);
|
||||
|
@ -95,7 +95,7 @@ void DocRebuild::processDocItem(DocProcessDocItem *docItem) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!xmlDoc->m_contentValid && !xmlDoc->set2(*oldTitleRec, -1, "main", nullptr, MAX_NICENESS)) {
|
||||
if (!xmlDoc->m_contentValid && !xmlDoc->set2(*oldTitleRec, -1, "main", MAX_NICENESS)) {
|
||||
xmlDoc->m_indexCode = ECORRUPTDATA;
|
||||
xmlDoc->m_indexCodeValid = true;
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include "Docid2Siteflags.h"
|
||||
#include "Log.h"
|
||||
#include "Conf.h"
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
@ -112,11 +113,13 @@ bool Docid2FlagsAndSiteMap::lookupSiteHash(uint64_t docid, uint32_t *sitehash32)
|
||||
if(pos!=e.end()) {
|
||||
if(pos->docid == docid) {
|
||||
*sitehash32 = pos->sitehash32;
|
||||
logTrace(g_conf.m_logTraceDocid2FlagsAndSiteMap, "Found record sitehash32=%u for docid=%lu", *sitehash32, docid);
|
||||
return true;
|
||||
} else
|
||||
return false;
|
||||
} else
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
logTrace(g_conf.m_logTraceDocid2FlagsAndSiteMap, "Record not found for docid=%lu", docid);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@ -129,9 +132,11 @@ bool Docid2FlagsAndSiteMap::lookupFlags(uint64_t docid, unsigned *flags) {
|
||||
if(pos!=e.end()) {
|
||||
if(pos->docid == docid) {
|
||||
*flags = pos->flags;
|
||||
logTrace(g_conf.m_logTraceDocid2FlagsAndSiteMap, "Found record flags=%u for docid=%lu", *flags, docid);
|
||||
return true;
|
||||
} else
|
||||
return false;
|
||||
} else
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
logTrace(g_conf.m_logTraceDocid2FlagsAndSiteMap, "Record not found for docid=%lu", docid);
|
||||
return false;
|
||||
}
|
||||
|
@ -1921,6 +1921,11 @@ static bool isTLDForUrl(const char *tld, int32_t tldLen) {
|
||||
|
||||
// otherwise, if one period, check table to see if qualified
|
||||
|
||||
if( ! s_table.getNumSlots() ) {
|
||||
log(LOG_ERROR,"%s:%d: Attempted to use uninitialized TLD table", __func__, __LINE__);
|
||||
gbshutdownLogicError();
|
||||
}
|
||||
|
||||
int64_t h = hash64Lower_a ( tld , tldLen ); // strlen(tld));
|
||||
//return s_table.isInTable ( &h );//getScoreFromTermId ( h );
|
||||
bool b = s_table.isInTable ( &h );//getScoreFromTermId ( h );
|
||||
@ -1929,6 +1934,10 @@ static bool isTLDForUrl(const char *tld, int32_t tldLen) {
|
||||
|
||||
|
||||
bool isTLD(const char *tld, int32_t tldLen) {
|
||||
if( ! s_table.getNumSlots() ) {
|
||||
log(LOG_ERROR,"%s:%d: Attempted to use uninitialized TLD table", __func__, __LINE__);
|
||||
gbshutdownLogicError();
|
||||
}
|
||||
int64_t h = hash64Lower_a(tld, tldLen);
|
||||
return s_table.isInTable(&h);
|
||||
}
|
||||
|
@ -424,7 +424,7 @@ void FxClient::reinitializeSettings(const char *hostname, int port, unsigned max
|
||||
}
|
||||
|
||||
bool FxClient::sendRequest(fxclient_request_ptr_t request) {
|
||||
if (m_outstanding_request_count >= m_max_outstanding) {
|
||||
if (m_max_outstanding > 0 && m_outstanding_request_count >= m_max_outstanding) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -433,7 +433,7 @@ bool FxClient::sendRequest(fxclient_request_ptr_t request) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (m_outstanding_request_count + m_queued_requests.size() >= m_max_outstanding) {
|
||||
if (m_max_outstanding > 0 && m_outstanding_request_count + m_queued_requests.size() >= m_max_outstanding) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -36,7 +36,6 @@
|
||||
// a global class extern'd in .h file
|
||||
HttpServer g_httpServer;
|
||||
|
||||
bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) ;
|
||||
static bool sendPagePretty(TcpSocket *s, HttpRequest *r, const char *filename, const char *tabName);
|
||||
|
||||
// we get like 100k submissions a day!!!
|
||||
@ -824,9 +823,6 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
|
||||
if ( ! strncmp ( path ,"/api", pathLen ) )
|
||||
return sendPageAPI ( s , r );
|
||||
|
||||
if ( ! strncmp ( path ,"/print", pathLen ) )
|
||||
return sendPageAnalyze ( s , r );
|
||||
|
||||
// proxy should handle all regular file requests itself! that is
|
||||
// generally faster i think, and, besides, sending pieces of a big
|
||||
// file one at a time using our encapsulation method won't work! so
|
||||
|
@ -26,11 +26,11 @@ IpBlockList g_ipBlockList;
|
||||
static const char s_ip_filename[] = "ipblocklist.txt";
|
||||
|
||||
IpBlockList::IpBlockList()
|
||||
: BlockList(s_ip_filename) {
|
||||
: MatchList(s_ip_filename) {
|
||||
}
|
||||
|
||||
bool IpBlockList::isIpBlocked(uint32_t ip) {
|
||||
auto ipBlockList = getBlockList();
|
||||
auto ipBlockList = getMatchList();
|
||||
|
||||
for (auto const &ipBlock : *ipBlockList) {
|
||||
if (ipBlock == ip) {
|
||||
@ -42,7 +42,7 @@ bool IpBlockList::isIpBlocked(uint32_t ip) {
|
||||
return false;
|
||||
}
|
||||
|
||||
void IpBlockList::addToBlockList(blocklist_ptr_t<uint32_t> &blockList, const std::string &line) {
|
||||
void IpBlockList::addToMatchList(matchlist_ptr_t<uint32_t> &blockList, const std::string &line) {
|
||||
in_addr addr;
|
||||
|
||||
if (inet_pton(AF_INET, line.c_str(), &addr) != 1) {
|
||||
|
@ -19,15 +19,15 @@
|
||||
#ifndef FX_IPBLOCKLIST_H
|
||||
#define FX_IPBLOCKLIST_H
|
||||
|
||||
#include "BlockList.h"
|
||||
#include "MatchList.h"
|
||||
|
||||
class IpBlockList : public BlockList<uint32_t> {
|
||||
class IpBlockList : public MatchList<uint32_t> {
|
||||
public:
|
||||
IpBlockList();
|
||||
bool isIpBlocked(uint32_t ip);
|
||||
|
||||
protected:
|
||||
void addToBlockList(blocklist_ptr_t<uint32_t> &blockList, const std::string &line);
|
||||
void addToMatchList(matchlist_ptr_t<uint32_t> &blockList, const std::string &line);
|
||||
|
||||
};
|
||||
|
||||
|
8
Linkdb.h
8
Linkdb.h
@ -60,7 +60,6 @@ bool getLinkInfo ( SafeBuf *reqBuf , // store msg25 request in here
|
||||
int32_t niceness ,
|
||||
bool doLinkSpamCheck ,
|
||||
bool oneVotePerIpDom ,
|
||||
bool canBeCancelled ,
|
||||
int32_t lastUpdateTime ,
|
||||
bool onlyNeedGoodInlinks ,
|
||||
// if an inlinking document has an outlink
|
||||
@ -262,10 +261,7 @@ class Xml;
|
||||
class Inlink;
|
||||
|
||||
class LinkInfo {
|
||||
|
||||
public:
|
||||
|
||||
int32_t getStoredSize ( ) const { return m_lisize; }
|
||||
public:
|
||||
int32_t getSize ( ) const { return m_lisize; }
|
||||
time_t getLastUpdated ( ) const { return (time_t)m_lastUpdated; }
|
||||
|
||||
@ -277,8 +273,6 @@ class LinkInfo {
|
||||
return const_cast<LinkInfo*>(this)->getNextInlink(const_cast<Inlink*>(k));
|
||||
}
|
||||
|
||||
bool getItemXml ( Xml *xml ) ;
|
||||
|
||||
bool hasLinkText() const;
|
||||
|
||||
// for PageTitledb
|
||||
|
9
Makefile
9
Makefile
@ -19,7 +19,7 @@ OBJS_O0 = \
|
||||
File.o \
|
||||
FxTermCheckList.o FxCheckAdult.o FxCheckSpam.o \
|
||||
GbMutex.o \
|
||||
HashTable.o HighFrequencyTermShortcuts.o PageTemperatureRegistry.o SiteMedianPageTemperatureRegistry.o SiteDefaultPageTemperatureRemoteRegistry.o Docid2Siteflags.o HttpMime.o HttpRequest.o HttpServer.o Hostdb.o \
|
||||
HashTable.o HighFrequencyTermShortcuts.o PageTemperatureRegistry.o SiteMedianPageTemperatureRegistry.o Docid2Siteflags.o HttpMime.o HttpRequest.o HttpServer.o Hostdb.o \
|
||||
iana_charset.o Images.o ip.o \
|
||||
JobScheduler.o Json.o \
|
||||
Lang.o Log.o \
|
||||
@ -59,8 +59,8 @@ OBJS_O2 = \
|
||||
|
||||
|
||||
OBJS_O3 = \
|
||||
BlockList.o \
|
||||
ContentTypeBlockList.o \
|
||||
MatchList.o \
|
||||
ContentMatchList.o ContentTypeBlockList.o \
|
||||
DocDelete.o DocProcess.o DocRebuild.o DocReindex.o DnsBlockList.o \
|
||||
IPAddressChecks.o IpBlockList.o \
|
||||
LanguageResultOverride.o Linkdb.o \
|
||||
@ -94,8 +94,11 @@ OBJS_O3 = \
|
||||
EGStack.o \
|
||||
QueryLanguage.o \
|
||||
FxClient.o \
|
||||
SiteNumInlinks.o \
|
||||
SiteMedianPageTemperature.o \
|
||||
Lemma.o \
|
||||
|
||||
|
||||
OBJS = $(OBJS_O0) $(OBJS_O1) $(OBJS_O2) $(OBJS_O3)
|
||||
|
||||
|
||||
|
@ -16,7 +16,7 @@
|
||||
//
|
||||
// License TL;DR: If you change this file, you must publish your changes.
|
||||
//
|
||||
#include "BlockList.h"
|
||||
#include "MatchList.h"
|
||||
#include "Log.h"
|
||||
#include "Conf.h"
|
||||
#include "Loop.h"
|
||||
@ -26,31 +26,31 @@
|
||||
#include <atomic>
|
||||
|
||||
template <class T>
|
||||
BlockList<T>::BlockList(const char *filename)
|
||||
MatchList<T>::MatchList(const char *filename)
|
||||
: m_filename(filename)
|
||||
, m_loading(false)
|
||||
, m_blockList(new blocklist_t<T>)
|
||||
, m_matchList(new matchlist_t<T>)
|
||||
, m_lastModifiedTime(0) {
|
||||
}
|
||||
|
||||
template <class T>
|
||||
bool BlockList<T>::init() {
|
||||
log(LOG_INFO, "Initializing BlockList with %s", m_filename);
|
||||
bool MatchList<T>::init() {
|
||||
log(LOG_INFO, "Initializing MatchList with %s", m_filename);
|
||||
|
||||
if (!g_loop.registerSleepCallback(60000, this, &reload, "BlockList<T>::reload", 0)) {
|
||||
log(LOG_WARN, "BlockList<T>:: Failed to register callback.");
|
||||
if (!g_loop.registerSleepCallback(60000, this, &reload, "MatchList<T>::reload", 0)) {
|
||||
log(LOG_WARN, "MatchList<T>:: Failed to register callback.");
|
||||
return false;
|
||||
}
|
||||
|
||||
// we do a load here instead of using sleep callback with immediate set to true so
|
||||
// we don't rely on g_loop being up and running to use blocklist
|
||||
// we don't rely on g_loop being up and running to use matchlist
|
||||
load();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void BlockList<T>::reload(int /*fd*/, void *state) {
|
||||
void MatchList<T>::reload(int /*fd*/, void *state) {
|
||||
if (g_jobScheduler.submit(reload, nullptr, state, thread_type_config_load, 0)) {
|
||||
return;
|
||||
}
|
||||
@ -60,36 +60,36 @@ void BlockList<T>::reload(int /*fd*/, void *state) {
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void BlockList<T>::reload(void *state) {
|
||||
BlockList *blockList = static_cast<BlockList*>(state);
|
||||
void MatchList<T>::reload(void *state) {
|
||||
MatchList *matchList = static_cast<MatchList*>(state);
|
||||
|
||||
// don't load multiple times at the same time
|
||||
if (blockList->m_loading.exchange(true)) {
|
||||
if (matchList->m_loading.exchange(true)) {
|
||||
return;
|
||||
}
|
||||
|
||||
blockList->load();
|
||||
blockList->m_loading = false;
|
||||
matchList->load();
|
||||
matchList->m_loading = false;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
bool BlockList<T>::load() {
|
||||
logTrace(g_conf.m_logTraceBlockList, "Loading %s", m_filename);
|
||||
bool MatchList<T>::load() {
|
||||
logTrace(g_conf.m_logTraceMatchList, "Loading %s", m_filename);
|
||||
|
||||
struct stat st;
|
||||
if (stat(m_filename, &st) != 0) {
|
||||
// probably not found
|
||||
log(LOG_INFO, "BlockList<T>::load: Unable to stat %s", m_filename);
|
||||
log(LOG_INFO, "MatchList<T>::load: Unable to stat %s", m_filename);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (m_lastModifiedTime != 0 && m_lastModifiedTime == st.st_mtime) {
|
||||
// not modified. assume successful
|
||||
logTrace(g_conf.m_logTraceBlockList, "%s not modified", m_filename);
|
||||
logTrace(g_conf.m_logTraceMatchList, "%s not modified", m_filename);
|
||||
return true;
|
||||
}
|
||||
|
||||
blocklist_ptr_t<T> tmpBlockList(new blocklist_t<T>);
|
||||
matchlist_ptr_t<T> tmpMatchList(new matchlist_t<T>);
|
||||
|
||||
std::ifstream file(m_filename);
|
||||
std::string line;
|
||||
@ -99,37 +99,37 @@ bool BlockList<T>::load() {
|
||||
continue;
|
||||
}
|
||||
|
||||
addToBlockList(tmpBlockList, line);
|
||||
logTrace(g_conf.m_logTraceBlockList, "Adding criteria '%s' to list", line.c_str());
|
||||
addToMatchList(tmpMatchList, line);
|
||||
logTrace(g_conf.m_logTraceMatchList, "Adding criteria '%s' to list", line.c_str());
|
||||
}
|
||||
|
||||
swapBlockList(tmpBlockList);
|
||||
swapMatchList(tmpMatchList);
|
||||
m_lastModifiedTime = st.st_mtime;
|
||||
|
||||
logTrace(g_conf.m_logTraceBlockList, "Loaded %s", m_filename);
|
||||
logTrace(g_conf.m_logTraceMatchList, "Loaded %s", m_filename);
|
||||
return true;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void BlockList<T>::addToBlockList(blocklist_ptr_t<T> &blockList, const std::string &line) {
|
||||
void MatchList<T>::addToMatchList(matchlist_ptr_t<T> &matchList, const std::string &line) {
|
||||
gbshutdownLogicError();
|
||||
}
|
||||
|
||||
template <>
|
||||
void BlockList<std::string>::addToBlockList(blocklist_ptr_t<std::string> &blockList, const std::string &line) {
|
||||
blockList->emplace_back(line);
|
||||
void MatchList<std::string>::addToMatchList(matchlist_ptr_t<std::string> &matchList, const std::string &line) {
|
||||
matchList->emplace_back(line);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
blocklistconst_ptr_t<T> BlockList<T>::getBlockList() {
|
||||
return m_blockList;
|
||||
matchlistconst_ptr_t<T> MatchList<T>::getMatchList() {
|
||||
return m_matchList;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void BlockList<T>::swapBlockList(blocklistconst_ptr_t<T> blockList) {
|
||||
std::atomic_store(&m_blockList, blockList);
|
||||
void MatchList<T>::swapMatchList(matchlistconst_ptr_t<T> matchList) {
|
||||
std::atomic_store(&m_matchList, matchList);
|
||||
}
|
||||
|
||||
// explicit instantiations
|
||||
template class BlockList<std::string>;
|
||||
template class BlockList<uint32_t>;
|
||||
template class MatchList<std::string>;
|
||||
template class MatchList<uint32_t>;
|
@ -16,8 +16,8 @@
|
||||
//
|
||||
// License TL;DR: If you change this file, you must publish your changes.
|
||||
//
|
||||
#ifndef FX_BLOCKLIST_H
|
||||
#define FX_BLOCKLIST_H
|
||||
#ifndef FX_MATCHLIST_H
|
||||
#define FX_MATCHLIST_H
|
||||
|
||||
|
||||
#include <memory>
|
||||
@ -25,14 +25,14 @@
|
||||
#include <string>
|
||||
#include <atomic>
|
||||
|
||||
template <typename T> using blocklist_t = std::vector<T>;
|
||||
template <typename T> using blocklist_ptr_t = std::shared_ptr<std::vector<T>>;
|
||||
template <typename T> using blocklistconst_ptr_t = std::shared_ptr<const std::vector<T>>;
|
||||
template <typename T> using matchlist_t = std::vector<T>;
|
||||
template <typename T> using matchlist_ptr_t = std::shared_ptr<std::vector<T>>;
|
||||
template <typename T> using matchlistconst_ptr_t = std::shared_ptr<const std::vector<T>>;
|
||||
|
||||
template<class T> class BlockList {
|
||||
template<class T> class MatchList {
|
||||
public:
|
||||
explicit BlockList(const char *filename);
|
||||
virtual ~BlockList() = default;
|
||||
explicit MatchList(const char *filename);
|
||||
virtual ~MatchList() = default;
|
||||
|
||||
virtual bool init();
|
||||
|
||||
@ -42,18 +42,18 @@ public:
|
||||
protected:
|
||||
bool load();
|
||||
|
||||
virtual void addToBlockList(blocklist_ptr_t<T> &blockList, const std::string &line);
|
||||
blocklistconst_ptr_t<T> getBlockList();
|
||||
virtual void addToMatchList(matchlist_ptr_t<T> &matchList, const std::string &line);
|
||||
matchlistconst_ptr_t<T> getMatchList();
|
||||
|
||||
const char *m_filename;
|
||||
|
||||
private:
|
||||
void swapBlockList(blocklistconst_ptr_t<T> blockList);
|
||||
void swapMatchList(matchlistconst_ptr_t<T> matchList);
|
||||
|
||||
std::atomic_bool m_loading;
|
||||
blocklistconst_ptr_t<T> m_blockList;
|
||||
matchlistconst_ptr_t<T> m_matchList;
|
||||
|
||||
time_t m_lastModifiedTime;
|
||||
};
|
||||
|
||||
#endif //FX_BLOCKLIST_H
|
||||
#endif //FX_MATCHLIST_H
|
121
Msg13.cpp
121
Msg13.cpp
@ -18,6 +18,8 @@
|
||||
#include "Pages.h"
|
||||
#include "Statistics.h"
|
||||
#include "Sanity.h"
|
||||
#include "UrlMatchList.h"
|
||||
#include "ContentMatchList.h"
|
||||
#include <string.h>
|
||||
|
||||
|
||||
@ -644,23 +646,28 @@ void downloadTheDocForReals2 ( Msg13Request *r ) {
|
||||
|
||||
bool useProxies = false;
|
||||
|
||||
// for diffbot turn ON if use robots is off
|
||||
if ( r->m_forceUseFloaters ) useProxies = true;
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );
|
||||
|
||||
// if you turned on automatically use proxies in spider controls...
|
||||
if ( ! useProxies &&
|
||||
cr &&
|
||||
r->m_urlIp != 0 &&
|
||||
r->m_urlIp != -1 &&
|
||||
cr->m_automaticallyUseProxies &&
|
||||
isIpInTwitchyTable( cr, r->m_urlIp ) )
|
||||
useProxies = true;
|
||||
|
||||
// we gotta have some proxy ips that we can use
|
||||
if ( ! g_conf.m_proxyIps.hasDigits() ) useProxies = false;
|
||||
if (g_conf.m_proxyIps.hasDigits()) {
|
||||
// for diffbot turn ON if use robots is off
|
||||
if (r->m_forceUseFloaters) {
|
||||
useProxies = true;
|
||||
}
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec(r->m_collnum);
|
||||
|
||||
// if you turned on automatically use proxies in spider controls...
|
||||
if (!useProxies &&
|
||||
cr && cr->m_automaticallyUseProxies &&
|
||||
r->m_urlIp != 0 && r->m_urlIp != -1 && isIpInTwitchyTable(cr, r->m_urlIp)) {
|
||||
useProxies = true;
|
||||
}
|
||||
|
||||
Url url;
|
||||
url.set(r->ptr_url, r->size_url);
|
||||
if (g_urlProxyList.isUrlMatched(url)) {
|
||||
useProxies = true;
|
||||
}
|
||||
}
|
||||
|
||||
// we did not need a spider proxy ip so send this reuest to a host
|
||||
// to download the url
|
||||
@ -1036,6 +1043,67 @@ static bool ipWasBanned(TcpSocket *ts, const char **msg, Msg13Request *r) {
|
||||
return false;
|
||||
}
|
||||
|
||||
static void appendRetryProxy(const char *url, int urlLen, const char *location = nullptr, int locationLen = 0) {
|
||||
char filename[1024];
|
||||
sprintf(filename,"%s/retryproxy.txt", g_hostdb.m_myHost->m_dir);
|
||||
FILE *fp = fopen(filename,"a");
|
||||
if (fp) {
|
||||
fprintf(fp, "%.*s|%.*s\n", urlLen, url, locationLen, location);
|
||||
fclose(fp);
|
||||
}
|
||||
}
|
||||
|
||||
static bool retryProxy(TcpSocket *ts, const char **msg, Msg13Request *r) {
|
||||
if (!ts) {
|
||||
return false;
|
||||
}
|
||||
|
||||
//we only do proxy checks if there weren't any other error
|
||||
if (g_errno != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// don't check for retries if it's already done
|
||||
if (r->m_proxyTries > 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Url url;
|
||||
url.set(r->ptr_url, r->size_url);
|
||||
|
||||
HttpMime mime;
|
||||
mime.set(ts->m_readBuf, ts->m_readOffset, &url);
|
||||
|
||||
int32_t httpStatus = mime.getHttpStatus();
|
||||
if (httpStatus == 301 || httpStatus == 302 || httpStatus == 307 || httpStatus == 308) {
|
||||
// we only retry when list matches redirected url & does not match original url
|
||||
if (g_urlRetryProxyList.isUrlMatched(url)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const Url *location = mime.getLocationUrl();
|
||||
|
||||
if (g_urlRetryProxyList.isUrlMatched(*location)) {
|
||||
*msg = "redir url proxy match list";
|
||||
appendRetryProxy(url.getUrl(), url.getUrlLen(), location->getUrl(), location->getUrlLen());
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t pre_size = mime.getMimeLen(); //size of http response line, mime headers and empty line separator
|
||||
size_t haystack_size = ts->m_readOffset - pre_size;
|
||||
const char *haystack = ts->m_readBuf + pre_size;
|
||||
|
||||
if (g_contentRetryProxyList.isContentMatched(haystack, haystack_size)) {
|
||||
*msg = "content proxy match list";
|
||||
appendRetryProxy(url.getUrl(), url.getUrlLen());
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static void appendCrawlBan(const char *group, const char *url, int urlLen) {
|
||||
char filename[1024];
|
||||
@ -1332,6 +1400,13 @@ void gotHttpReply2 ( void *state ,
|
||||
);
|
||||
}
|
||||
|
||||
bool retry_proxy = false;
|
||||
if (retryProxy(ts, &banMsg, r)) {
|
||||
retry_proxy = true;
|
||||
char ipbuf[16];
|
||||
log("msg13: retry using proxy for url %s due to %s, for ip %s", r->ptr_url, banMsg, iptoa(r->m_urlIp, ipbuf));
|
||||
}
|
||||
|
||||
if(crawlWasBanned(ts,&banMsg,r)) {
|
||||
char ipbuf[16];
|
||||
log("msg13: url %.*s detected as banned2 (%s), for ip %s"
|
||||
@ -1369,8 +1444,7 @@ void gotHttpReply2 ( void *state ,
|
||||
if ( banned &&
|
||||
// retry iff we haven't already, but if we did stop the inf loop
|
||||
! r->m_wasInTableBeforeStarting &&
|
||||
cr &&
|
||||
( cr->m_automaticallyBackOff || cr->m_automaticallyUseProxies ) &&
|
||||
cr && ( cr->m_automaticallyBackOff || cr->m_automaticallyUseProxies ) &&
|
||||
// but this is not for proxies... only native crawlbot backoff
|
||||
! r->m_proxyIp ) {
|
||||
// note this as well
|
||||
@ -1388,6 +1462,19 @@ void gotHttpReply2 ( void *state ,
|
||||
return;
|
||||
}
|
||||
|
||||
if (retry_proxy) {
|
||||
// note this as well
|
||||
log("msg13: retrying spidered page with proxy for %s", r->ptr_url);
|
||||
|
||||
// reset error
|
||||
g_errno = 0;
|
||||
|
||||
r->m_forceUseFloaters = 1;
|
||||
|
||||
downloadTheDocForReals2(r);
|
||||
return;
|
||||
}
|
||||
|
||||
// do not print this if we are already using proxies, it is for
|
||||
// the auto crawldelay backoff logic only
|
||||
if ( banned && r->m_wasInTableBeforeStarting && ! r->m_proxyIp )
|
||||
|
31
Msg25.h
31
Msg25.h
@ -42,7 +42,6 @@ public:
|
||||
int32_t m_niceness;
|
||||
bool m_doLinkSpamCheck;
|
||||
bool m_oneVotePerIpDom;
|
||||
bool m_canBeCancelled;
|
||||
int32_t m_lastUpdateTime;
|
||||
bool m_onlyNeedGoodInlinks;
|
||||
int32_t m_ourHostHash32;
|
||||
@ -140,7 +139,6 @@ class Msg25 {
|
||||
int32_t niceness,
|
||||
bool doLinkSpamCheck,
|
||||
bool oneVotePerIpDom,
|
||||
bool canBeCancelled,
|
||||
int32_t lastUpdateTime,
|
||||
bool onlyNeedGoodInlinks,
|
||||
// if an inlinking document has an outlink
|
||||
@ -175,18 +173,9 @@ class Msg25 {
|
||||
|
||||
// private:
|
||||
// these need to be public for wrappers to call:
|
||||
bool gotTermFreq(bool msg42Called);
|
||||
bool getRootTitleRec();
|
||||
bool gotRootTitleRec();
|
||||
bool gotDocId();
|
||||
bool gotRootLinkText();
|
||||
bool gotRootLinkText2();
|
||||
bool getLinkingDocIds();
|
||||
bool gotList();
|
||||
bool gotClusterRecs();
|
||||
bool sendRequests();
|
||||
bool gotLinkText(class Msg20Request *req);
|
||||
bool gotMsg25Reply();
|
||||
bool doReadLoop();
|
||||
|
||||
// input vars
|
||||
@ -200,8 +189,6 @@ class Msg25 {
|
||||
uint64_t m_linkHash64;
|
||||
key224_t m_nextKey;
|
||||
|
||||
bool m_retried;
|
||||
bool m_prependWWW;
|
||||
bool m_onlyNeedGoodInlinks;
|
||||
int64_t m_docId;
|
||||
collnum_t m_collnum;
|
||||
@ -231,9 +218,6 @@ class Msg25 {
|
||||
|
||||
Inlink *m_k;
|
||||
|
||||
// for getting the root title rec so we can share its pwids
|
||||
Msg22 m_msg22;
|
||||
|
||||
int32_t m_maxNumLinkers;
|
||||
|
||||
// should we free the m_replyPtrs on destruction? default=true
|
||||
@ -258,10 +242,6 @@ class Msg25 {
|
||||
|
||||
int32_t m_minRecSizes;
|
||||
|
||||
// Msg20 is for getting the LinkInfo class from this same url's
|
||||
// titleRec from another (usually much larger) gigablast cluster/netwrk
|
||||
Msg20 m_msg20;
|
||||
|
||||
// how many msg20s have we sent/recvd?
|
||||
int32_t m_numRequests;
|
||||
int32_t m_numReplies;
|
||||
@ -278,20 +258,16 @@ class Msg25 {
|
||||
bool m_oneVotePerIpDom;
|
||||
bool m_doLinkSpamCheck;
|
||||
bool m_isInjecting;
|
||||
char m_canBeCancelled;
|
||||
int32_t m_lastUpdateTime;
|
||||
|
||||
Multicast m_mcast;
|
||||
|
||||
int32_t m_good;
|
||||
int32_t m_errors;
|
||||
int32_t m_noText;
|
||||
int32_t m_reciprocal;
|
||||
|
||||
bool m_spideringEnabled;
|
||||
|
||||
int32_t m_dupCount;
|
||||
int32_t m_vectorDups;
|
||||
int32_t m_spamLinks;
|
||||
int32_t m_niceness;
|
||||
int32_t m_numFromSameIp;
|
||||
@ -299,12 +275,8 @@ class Msg25 {
|
||||
|
||||
// stats for allow some link spam inlinks to vote
|
||||
int32_t m_spamCount;
|
||||
int32_t m_spamWeight;
|
||||
int32_t m_maxSpam;
|
||||
|
||||
char m_siteQuality;
|
||||
int32_t m_siteNumFreshInlinks;
|
||||
|
||||
// this is used for the linkdb list
|
||||
HashTableX m_ipTable;
|
||||
HashTableX m_fullIpTable;
|
||||
@ -321,9 +293,6 @@ class Msg25 {
|
||||
int32_t m_linkSpamLinkdb;
|
||||
int32_t m_ipDups;
|
||||
|
||||
uint32_t m_groupId;
|
||||
int64_t m_probDocId;
|
||||
|
||||
LinkInfo *m_oldLinkInfo;
|
||||
|
||||
char m_buf[MAX_NOTE_BUF_LEN];
|
||||
|
32
PageGet.cpp
32
PageGet.cpp
@ -29,17 +29,10 @@ public:
|
||||
int32_t m_niceness;
|
||||
XmlDoc m_xd;
|
||||
lang_t m_langId;
|
||||
//Msg8a m_msg8a;
|
||||
//SiteRec m_sr;
|
||||
//TagRec m_tagRec;
|
||||
TcpSocket *m_socket;
|
||||
HttpRequest m_r;
|
||||
char m_coll[MAX_COLL_LEN+2];
|
||||
//CollectionRec *m_cr;
|
||||
bool m_isMasterAdmin;
|
||||
//bool m_seq;
|
||||
bool m_rtq;
|
||||
//char m_q[MAX_QUERY_LEN+1];
|
||||
SafeBuf m_qsb;
|
||||
char m_qtmpBuf[128];
|
||||
int32_t m_qlen;
|
||||
@ -51,7 +44,6 @@ public:
|
||||
int32_t m_strip;
|
||||
bool m_cnsPage; // Are we in the click 'n' scroll page?
|
||||
bool m_printDisclaimer;
|
||||
bool m_netTestResults;
|
||||
bool m_isBanned;
|
||||
bool m_noArchive;
|
||||
SafeBuf m_sb;
|
||||
@ -65,7 +57,6 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
|
||||
int32_t collLen = 0;
|
||||
const char *coll = r->getString("c",&collLen);
|
||||
if ( ! coll || ! coll[0] ) {
|
||||
//coll = g_conf.m_defaultColl;
|
||||
coll = g_conf.getDefaultColl( );
|
||||
collLen = strlen(coll);
|
||||
}
|
||||
@ -103,13 +94,6 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
|
||||
return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno));
|
||||
}
|
||||
|
||||
|
||||
// . should we do a sequential lookup?
|
||||
// . we need to match summary here so we need to know this
|
||||
//bool seq = r->getLong ( "seq" , false );
|
||||
// restrict to root file?
|
||||
bool rtq = r->getLong ( "rtq" , 0) ? true : false;
|
||||
|
||||
// . get the titleRec
|
||||
// . TODO: redirect client to a better http server to save bandwidth
|
||||
State2 *st ;
|
||||
@ -139,7 +123,6 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
|
||||
}
|
||||
strncpy ( st->m_coll , coll , MAX_COLL_LEN+1 );
|
||||
// store query for query highlighting
|
||||
st->m_netTestResults = r->getLong ("rnettest", 0) ? true : false;
|
||||
st->m_qsb.setBuf ( st->m_qtmpBuf,128,0,false );
|
||||
st->m_qsb.setLabel ( "qsbpg" );
|
||||
|
||||
@ -150,7 +133,6 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
|
||||
st->m_qsb.safeStrcpy ( "" );
|
||||
|
||||
st->m_qlen = qlen;
|
||||
st->m_rtq = rtq;
|
||||
st->m_isBanned = false;
|
||||
st->m_noArchive = false;
|
||||
st->m_socket = s;
|
||||
@ -166,14 +148,7 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
|
||||
if ( st->m_strip ) {
|
||||
st->m_printDisclaimer = false;
|
||||
}
|
||||
|
||||
// should we cache it?
|
||||
char useCache = r->getLong ( "usecache" , 1 );
|
||||
char rcache = r->getLong ( "rcache" , 1 );
|
||||
char wcache = r->getLong ( "wcache" , 1 );
|
||||
int32_t cacheAge = r->getLong ( "cacheAge" , 60*60 ); // default one hour
|
||||
if ( useCache == 0 ) { cacheAge = 0; wcache = 0; }
|
||||
if ( rcache == 0 ) cacheAge = 0;
|
||||
|
||||
// . fetch the TitleRec
|
||||
// . a max cache age of 0 means not to read from the cache
|
||||
XmlDoc *xd = &st->m_xd;
|
||||
@ -413,12 +388,11 @@ bool processLoop ( void *state ) {
|
||||
// Moved over from PageResults.cpp
|
||||
sb->safePrintf( "</span> - <a href=\""
|
||||
"/get?"
|
||||
"q=%s&c=%s&rtq=%" PRId32"&"
|
||||
"q=%s&c=%s&"
|
||||
"d=%" PRId64"&strip=1\""
|
||||
" style=\"%s\">"
|
||||
"[stripped]</a>",
|
||||
q , st->m_coll ,
|
||||
(int32_t)st->m_rtq,
|
||||
q , st->m_coll ,
|
||||
st->m_docId, styleLink );
|
||||
|
||||
// a link to alexa
|
||||
|
494
PageParser.cpp
494
PageParser.cpp
@ -10,122 +10,49 @@
|
||||
|
||||
class State8 {
|
||||
public:
|
||||
//Msg16 m_msg16;
|
||||
//Msg14 m_msg14;
|
||||
//Msg15 m_msg15;
|
||||
SafeBuf m_dbuf;
|
||||
//XmlDoc m_doc;
|
||||
//Url m_url;
|
||||
//Url m_rootUrl;
|
||||
const char *m_u;
|
||||
int32_t m_ulen;
|
||||
char m_rootQuality;
|
||||
char m_coll[MAX_COLL_LEN];
|
||||
int32_t m_collLen;
|
||||
//int32_t m_sfn;
|
||||
//int32_t m_urlLen;
|
||||
|
||||
TcpSocket *m_s;
|
||||
char m_pwd[32];
|
||||
HttpRequest m_r;
|
||||
int32_t m_old;
|
||||
// recyle the link info from the title rec?
|
||||
int32_t m_recycle;
|
||||
// recycle the link info that was imported from another coll?
|
||||
int32_t m_recycle2;
|
||||
bool m_render;
|
||||
bool m_recompute;
|
||||
int32_t m_oips;
|
||||
char m_linkInfoColl[11];
|
||||
// char m_buf[16384 * 1024];
|
||||
|
||||
//int32_t m_page;
|
||||
// m_pbuf now points to m_sbuf if we are showing the parsing junk
|
||||
SafeBuf m_xbuf;
|
||||
SafeBuf m_wbuf;
|
||||
bool m_donePrinting;
|
||||
//SafeBuf m_sbuf;
|
||||
// this is a buffer which cats m_sbuf into it
|
||||
//SafeBuf m_sbuf2;
|
||||
|
||||
// new state vars for Msg3b.cpp
|
||||
int64_t m_docId;
|
||||
void *m_state ;
|
||||
void (* m_callback) (void *state);
|
||||
Query *m_q;
|
||||
int64_t *m_termFreqs;
|
||||
float *m_termFreqWeights;
|
||||
float *m_affWeights;
|
||||
//score_t m_total;
|
||||
bool m_freeIt;
|
||||
bool m_blocked;
|
||||
|
||||
// these are from rearranging the code
|
||||
int32_t m_indexCode;
|
||||
//uint64_t m_chksum1;
|
||||
|
||||
bool m_didRootDom;
|
||||
bool m_didRootWWW;
|
||||
bool m_wasRootDom;
|
||||
|
||||
// call Msg16 with a versino of title rec to do
|
||||
int32_t m_titleRecVersion;
|
||||
|
||||
//TitleRec m_tr;
|
||||
|
||||
//XmlDoc m_oldDoc;
|
||||
XmlDoc m_xd;
|
||||
};
|
||||
|
||||
// TODO: meta redirect tag to host if hostId not ours
|
||||
static bool processLoop ( void *state ) ;
|
||||
static bool gotXmlDoc ( void *state ) ;
|
||||
static bool sendErrorReply ( void *state , int32_t err ) ;
|
||||
static bool sendPageParser2 ( TcpSocket *s ,
|
||||
HttpRequest *r ,
|
||||
class State8 *st ,
|
||||
int64_t docId ,
|
||||
Query *q ,
|
||||
int64_t *termFreqs ,
|
||||
float *termFreqWeights ,
|
||||
float *affWeights ,
|
||||
void *state ,
|
||||
void (* callback)(void *state) ) ;
|
||||
static bool processLoop(void *state);
|
||||
static bool sendErrorReply(void *state, int32_t err);
|
||||
|
||||
|
||||
// . returns false if blocked, true otherwise
|
||||
// . sets g_errno on error
|
||||
// . make a web page displaying the config of this host
|
||||
// . call g_httpServer.sendDynamicPage() to send it
|
||||
// . TODO: don't close this socket until httpserver returns!!
|
||||
bool sendPageParser ( TcpSocket *s , HttpRequest *r ) {
|
||||
return sendPageParser2 ( s , r , NULL , -1LL , NULL , NULL,
|
||||
NULL , NULL, NULL , NULL );
|
||||
}
|
||||
|
||||
// . a new interface so Msg3b can call this with "s" set to NULL
|
||||
// . returns false if blocked, true otherwise
|
||||
// . sets g_errno on error
|
||||
static bool sendPageParser2 ( TcpSocket *s ,
|
||||
HttpRequest *r ,
|
||||
State8 *st ,
|
||||
int64_t docId ,
|
||||
Query *q ,
|
||||
// in query term space, not imap space
|
||||
int64_t *termFreqs ,
|
||||
// in imap space
|
||||
float *termFreqWeights ,
|
||||
// in imap space
|
||||
float *affWeights ,
|
||||
void *state ,
|
||||
void (* callback)(void *state) ) {
|
||||
|
||||
//log("parser: read sock=%" PRId32,s->m_sd);
|
||||
|
||||
// . TODO: don't close this socket until httpserver returns!!
|
||||
bool sendPageParser(TcpSocket *s, HttpRequest *r) {
|
||||
// might a simple request to addsomething to validated.*.txt file
|
||||
// from XmlDoc::print() or XmlDoc::validateOutput()
|
||||
//int64_t uh64 = r->getLongLong("uh64",0LL);
|
||||
const char *uh64str = r->getString("uh64",NULL);
|
||||
//char *divTag = r->getString("div",NULL);
|
||||
if ( uh64str ) {
|
||||
// make basic reply
|
||||
const char *reply = "HTTP/1.0 200 OK\r\n"
|
||||
@ -144,34 +71,18 @@ static bool sendPageParser2 ( TcpSocket *s ,
|
||||
}
|
||||
|
||||
// make a state
|
||||
if ( st ) st->m_freeIt = false;
|
||||
if ( ! st ) {
|
||||
try { st = new (State8); }
|
||||
catch(std::bad_alloc&) {
|
||||
g_errno = ENOMEM;
|
||||
log("PageParser: new(%i): %s",
|
||||
(int)sizeof(State8),mstrerror(g_errno));
|
||||
return g_httpServer.sendErrorReply(s,500,
|
||||
mstrerror(g_errno));}
|
||||
mnew ( st , sizeof(State8) , "PageParser" );
|
||||
st->m_freeIt = true;
|
||||
}
|
||||
// msg3b uses this to get a score from the query
|
||||
st->m_state = state;
|
||||
st->m_callback = callback;
|
||||
st->m_q = q;
|
||||
st->m_termFreqs = termFreqs;
|
||||
st->m_termFreqWeights = termFreqWeights;
|
||||
st->m_affWeights = affWeights;
|
||||
//st->m_total = (score_t)-1;
|
||||
State8 *st;
|
||||
try { st = new (State8); }
|
||||
catch(std::bad_alloc&) {
|
||||
g_errno = ENOMEM;
|
||||
log("PageParser: new(%i): %s",
|
||||
(int)sizeof(State8),mstrerror(g_errno));
|
||||
return g_httpServer.sendErrorReply(s,500,
|
||||
mstrerror(g_errno));}
|
||||
mnew ( st , sizeof(State8) , "PageParser" );
|
||||
|
||||
st->m_indexCode = 0;
|
||||
st->m_blocked = false;
|
||||
st->m_didRootDom = false;
|
||||
st->m_didRootWWW = false;
|
||||
st->m_wasRootDom = false;
|
||||
st->m_u = NULL;
|
||||
st->m_recompute = false;
|
||||
//st->m_url.reset();
|
||||
|
||||
// password, too
|
||||
int32_t pwdLen = 0;
|
||||
@ -196,28 +107,14 @@ static bool sendPageParser2 ( TcpSocket *s ,
|
||||
if ( st->m_titleRecVersion == -1 )
|
||||
st->m_titleRecVersion = TITLEREC_CURRENT_VERSION;
|
||||
|
||||
//int32_t ulen = 0;
|
||||
//char *u = r->getString ( "u" , &ulen , NULL /*default*/);
|
||||
int32_t old = r->getLong ( "old", 0 );
|
||||
|
||||
// url will override docid if given
|
||||
if ( ! st->m_u || ! st->m_u[0] )
|
||||
st->m_docId = r->getLongLong ("docid",-1);
|
||||
else
|
||||
st->m_docId = -1;
|
||||
// set url in state class (may have length 0)
|
||||
//if ( u ) st->m_url.set ( u , ulen );
|
||||
//st->m_urlLen = ulen;
|
||||
st->m_u = st->m_r.getString("u",&st->m_ulen,NULL);
|
||||
// should we recycle link info?
|
||||
st->m_recycle = r->getLong("recycle",0);
|
||||
st->m_recycle2 = r->getLong("recycleimp",0);
|
||||
|
||||
st->m_render = r->getLong("render" ,0) ? true : false;
|
||||
// for quality computation... takes way longer cuz we have to
|
||||
// lookup the IP address of every outlink, so we can get its root
|
||||
// quality using Msg25 which needs to filter out voters from that IP
|
||||
// range.
|
||||
st->m_oips = r->getLong("oips" ,0);
|
||||
|
||||
int32_t linkInfoLen = 0;
|
||||
// default is NULL
|
||||
@ -227,15 +124,6 @@ static bool sendPageParser2 ( TcpSocket *s ,
|
||||
|
||||
// should we use the old title rec?
|
||||
st->m_old = old;
|
||||
//no more setting the default root quality to 30, instead if we do not
|
||||
// know it setting it to -1
|
||||
st->m_rootQuality=-1;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// header
|
||||
SafeBuf *xbuf = &st->m_xbuf;
|
||||
@ -299,21 +187,6 @@ static bool sendPageParser2 ( TcpSocket *s ,
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
|
||||
/*
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"Parser version to use: "
|
||||
"</td>"
|
||||
"<td>"
|
||||
"<input type=text name=\"version\" size=\"4\" value=\"-1\"> "
|
||||
"</td>"
|
||||
"<td>"
|
||||
"(-1 means to use latest title rec version)<br>"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
*/
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"<b>use cached</b>"
|
||||
@ -328,20 +201,6 @@ static bool sendPageParser2 ( TcpSocket *s ,
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
/*
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"Reparse root:"
|
||||
"</td>"
|
||||
"<td>"
|
||||
"<input type=checkbox name=artr value=1%s> "
|
||||
"</td>"
|
||||
"<td>"
|
||||
"Apply selected ruleset to root to update quality"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
*/
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"<b>recycle link info</b>"
|
||||
@ -357,20 +216,6 @@ static bool sendPageParser2 ( TcpSocket *s ,
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
/*
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"Recycle Link Info Imported:"
|
||||
"</td>"
|
||||
"<td>"
|
||||
"<input type=checkbox name=recycleimp value=1%s> "
|
||||
"</td>"
|
||||
"<td>"
|
||||
"Recycle the link info imported from other coll"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
*/
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"<b>render html</b>"
|
||||
@ -385,33 +230,6 @@ static bool sendPageParser2 ( TcpSocket *s ,
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
/*
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"Lookup outlinks' ruleset, ips, quality:"
|
||||
"</td>"
|
||||
"<td>"
|
||||
"<input type=checkbox name=oips value=1%s> "
|
||||
"</td>"
|
||||
"<td>"
|
||||
"To compute quality lookup IP addresses of roots "
|
||||
"of outlinks."
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"LinkInfo Coll:"
|
||||
"</td>"
|
||||
"<td>"
|
||||
"<input type=text name=\"oli\" size=\"10\" value=\"\"> "
|
||||
"</td>"
|
||||
"<td>"
|
||||
"Leave empty usually. Uses this coll to lookup link info."
|
||||
"</td>"
|
||||
"</tr>"
|
||||
*/
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>"
|
||||
"<b>optional query</b>"
|
||||
@ -443,7 +261,6 @@ static bool sendPageParser2 ( TcpSocket *s ,
|
||||
"</td>"
|
||||
|
||||
"<td>"
|
||||
//"<input type=checkbox name=xml value=1> "
|
||||
"<select name=ctype>\n"
|
||||
"<option value=%" PRId32" selected>HTML</option>\n"
|
||||
"<option value=%" PRId32">XML</option>\n"
|
||||
@ -477,12 +294,8 @@ static bool sendPageParser2 ( TcpSocket *s ,
|
||||
"</center>"
|
||||
"</form>"
|
||||
"<br>",
|
||||
|
||||
//oips ,
|
||||
contentParm );
|
||||
|
||||
|
||||
|
||||
xbuf->safePrintf(
|
||||
"<center>"
|
||||
"<input type=submit value=Submit>"
|
||||
@ -526,18 +339,8 @@ static bool sendPageParser2 ( TcpSocket *s ,
|
||||
|
||||
uint8_t contentType = CT_HTML;
|
||||
if ( r->getBool("xml",0) ) contentType = CT_XML;
|
||||
|
||||
contentType = r->getLong("ctype",contentType);//CT_HTML);
|
||||
|
||||
|
||||
// if facebook, load xml content from title rec...
|
||||
bool isFacebook = strstr(st->m_u,"http://www.facebook.com/") ? true : false;
|
||||
if ( isFacebook && ! content ) {
|
||||
int64_t docId = Titledb::getProbableDocId((char*)st->m_u);
|
||||
sprintf(sreq.m_url ,"%" PRIu64 "", (uint64_t) docId);
|
||||
sreq.m_isPageReindex = true;
|
||||
}
|
||||
|
||||
// hack
|
||||
if ( content ) {
|
||||
st->m_dbuf.purge();
|
||||
@ -588,17 +391,6 @@ bool processLoop ( void *state ) {
|
||||
// print it out
|
||||
xd->printDoc( &st->m_xbuf );
|
||||
}
|
||||
|
||||
// print reason we can't analyze it (or index it)
|
||||
//if ( st->m_indexCode != 0 ) {
|
||||
// st->m_xbuf.safePrintf ("<br><br><b>indexCode: %s</b>\n<br>",
|
||||
// mstrerror(st->m_indexCode));
|
||||
//}
|
||||
|
||||
// print the final tail
|
||||
//p += g_httpServer.printTail ( p , pend - p );
|
||||
|
||||
//log("parser: send sock=%" PRId32,st->m_s->m_sd);
|
||||
|
||||
// now encapsulate it in html head/tail and send it off
|
||||
bool status = g_httpServer.sendDynamicPage( st->m_s ,
|
||||
@ -611,10 +403,9 @@ bool processLoop ( void *state ) {
|
||||
NULL,//cookie
|
||||
"utf-8");
|
||||
// delete the state now
|
||||
if ( st->m_freeIt ) {
|
||||
mdelete ( st , sizeof(State8) , "PageParser" );
|
||||
delete (st);
|
||||
}
|
||||
mdelete ( st , sizeof(State8) , "PageParser" );
|
||||
delete (st);
|
||||
|
||||
// return the status
|
||||
return status;
|
||||
}
|
||||
@ -643,250 +434,3 @@ bool sendErrorReply ( void *state , int32_t err ) {
|
||||
//return g_httpServer.sendDynamicPage ( s , tmp , strlen(tmp) );
|
||||
return g_httpServer.sendErrorReply ( s, err, mstrerror(err) );
|
||||
}
|
||||
|
||||
// for procog
|
||||
bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) {
|
||||
|
||||
// make a state
|
||||
State8 *st;
|
||||
try { st = new (State8); }
|
||||
catch(std::bad_alloc&) {
|
||||
g_errno = ENOMEM;
|
||||
log("PageParser: new(%i): %s",
|
||||
(int)sizeof(State8),mstrerror(g_errno));
|
||||
return g_httpServer.sendErrorReply(s,500,
|
||||
mstrerror(g_errno));}
|
||||
mnew ( st , sizeof(State8) , "PageParser" );
|
||||
st->m_freeIt = true;
|
||||
st->m_state = NULL;
|
||||
//st->m_callback = callback;
|
||||
//st->m_q = q;
|
||||
//st->m_termFreqs = termFreqs;
|
||||
//st->m_termFreqWeights = termFreqWeights;
|
||||
//st->m_affWeights = affWeights;
|
||||
//st->m_total = (score_t)-1;
|
||||
st->m_indexCode = 0;
|
||||
st->m_blocked = false;
|
||||
st->m_didRootDom = false;
|
||||
st->m_didRootWWW = false;
|
||||
st->m_wasRootDom = false;
|
||||
st->m_u = NULL;
|
||||
|
||||
// password, too
|
||||
int32_t pwdLen = 0;
|
||||
const char *pwd = r->getString ( "pwd" , &pwdLen );
|
||||
if ( pwdLen > 31 ) pwdLen = 31;
|
||||
if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen );
|
||||
st->m_pwd[pwdLen]='\0';
|
||||
|
||||
// save socket ptr
|
||||
st->m_s = s;
|
||||
st->m_r.copy ( r );
|
||||
|
||||
// get the collection
|
||||
const char *coll = r->getString ( "c" , &st->m_collLen ,NULL /*default*/);
|
||||
if ( ! coll ) coll = g_conf.m_defaultColl;
|
||||
int32_t collLen = strlen(coll);
|
||||
if ( collLen > MAX_COLL_LEN ) return sendErrorReply ( st , ENOBUFS );
|
||||
strcpy ( st->m_coll , coll );
|
||||
|
||||
// version to use, if -1 use latest
|
||||
st->m_titleRecVersion = r->getLong("version",-1);
|
||||
if ( st->m_titleRecVersion == -1 )
|
||||
st->m_titleRecVersion = TITLEREC_CURRENT_VERSION;
|
||||
|
||||
int32_t old = r->getLong ( "old", 0 );
|
||||
|
||||
// url will override docid if given
|
||||
st->m_docId = r->getLongLong ("d",-1);
|
||||
st->m_docId = r->getLongLong ("docid",st->m_docId);
|
||||
|
||||
int32_t ulen;
|
||||
const char *u = st->m_r.getString("u",&ulen,NULL);
|
||||
if ( ! u ) u = st->m_r.getString("url",&ulen,NULL);
|
||||
if ( ! u && st->m_docId == -1LL )
|
||||
return sendErrorReply ( st , EBADREQUEST );
|
||||
|
||||
// set url in state class (may have length 0)
|
||||
//if ( u ) st->m_url.set ( u , ulen );
|
||||
//st->m_urlLen = ulen;
|
||||
st->m_u = u;
|
||||
st->m_ulen = 0;
|
||||
if ( u ) st->m_ulen = strlen(u);
|
||||
// should we recycle link info?
|
||||
st->m_recycle = r->getLong("recycle",1);
|
||||
st->m_recycle2 = r->getLong("recycleimp",0);
|
||||
st->m_render = r->getLong("render" ,0) ? true : false;
|
||||
st->m_recompute = r->getLong("recompute" ,0) ? true : false;
|
||||
// for quality computation... takes way longer cuz we have to
|
||||
// lookup the IP address of every outlink, so we can get its root
|
||||
// quality using Msg25 which needs to filter out voters from that IP
|
||||
// range.
|
||||
st->m_oips = r->getLong("oips" ,0);
|
||||
//st->m_page = r->getLong("page",1);
|
||||
|
||||
int32_t linkInfoLen = 0;
|
||||
// default is NULL
|
||||
const char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL );
|
||||
if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl );
|
||||
else st->m_linkInfoColl[0] = '\0';
|
||||
|
||||
// should we use the old title rec?
|
||||
st->m_old = old;
|
||||
//no more setting the default root quality to 30, instead if we do not
|
||||
// know it setting it to -1
|
||||
st->m_rootQuality=-1;
|
||||
|
||||
// header
|
||||
//st->m_xbuf.safePrintf("<meta http-equiv=\"Content-Type\" "
|
||||
// "content=\"text/html; charset=utf-8\">\n");
|
||||
|
||||
XmlDoc *xd = &st->m_xd;
|
||||
|
||||
int32_t isXml = r->getLong("xml",0);
|
||||
|
||||
// if got docid, use that
|
||||
if ( st->m_docId != -1 ) {
|
||||
if ( ! xd->set3 ( st->m_docId,
|
||||
st->m_coll,
|
||||
0 ) ) // niceness
|
||||
// return error reply if g_errno is set
|
||||
return sendErrorReply ( st , g_errno );
|
||||
// make this our callback in case something blocks
|
||||
xd->setCallback ( st , gotXmlDoc );
|
||||
xd->m_pbuf = &st->m_wbuf;
|
||||
// reset this flag
|
||||
st->m_donePrinting = false;
|
||||
// . set xd from the old title rec if recycle is true
|
||||
// . can also use XmlDoc::m_loadFromOldTitleRec flag
|
||||
//if ( st->m_recycle ) xd->m_recycleContent = true;
|
||||
xd->m_recycleContent = true;
|
||||
// force this on
|
||||
//xd->m_useSiteLinkBuf = true;
|
||||
//xd->m_usePageLinkBuf = true;
|
||||
if ( isXml ) xd->m_printInXml = true;
|
||||
// now tell it to fetch the old title rec
|
||||
if ( ! xd->loadFromOldTitleRec () )
|
||||
// return false if this blocks
|
||||
return false;
|
||||
return gotXmlDoc ( st );
|
||||
}
|
||||
|
||||
// set this up
|
||||
SpiderRequest sreq;
|
||||
if ( st->m_u ) strcpy(sreq.m_url,st->m_u);
|
||||
int32_t firstIp = hash32n(st->m_u);
|
||||
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
|
||||
// parentdocid of 0
|
||||
sreq.setKey( firstIp, 0LL, false );
|
||||
sreq.m_isPageParser = 1;
|
||||
sreq.m_fakeFirstIp = 1;
|
||||
sreq.m_firstIp = firstIp;
|
||||
Url nu;
|
||||
nu.set(sreq.m_url);
|
||||
sreq.m_domHash32 = nu.getDomainHash32();
|
||||
sreq.m_siteHash32 = nu.getHostHash32();
|
||||
|
||||
// . get provided content if any
|
||||
// . will be NULL if none provided
|
||||
// . "content" may contain a MIME
|
||||
int32_t contentLen = 0;
|
||||
const char *content = r->getString ( "content" , &contentLen , NULL );
|
||||
if ( ! content ) {
|
||||
content = r->getUnencodedContent ();
|
||||
contentLen = r->getUnencodedContentLen ();
|
||||
}
|
||||
// ensure null
|
||||
if ( contentLen == 0 ) content = NULL;
|
||||
|
||||
int32_t ctype = r->getLong("ctype",CT_HTML);
|
||||
|
||||
// . use the enormous power of our new XmlDoc class
|
||||
// . this returns false if blocked
|
||||
if ( ! xd->set4 ( &sreq ,
|
||||
NULL ,
|
||||
(char*)st->m_coll ,
|
||||
// we need this so the term table is set!
|
||||
&st->m_wbuf , // XmlDoc::m_pbuf
|
||||
0, // niceness
|
||||
(char*)content ,
|
||||
false, // deletefromindex
|
||||
0, // forced ip
|
||||
ctype ))
|
||||
// return error reply if g_errno is set
|
||||
return sendErrorReply ( st , g_errno );
|
||||
// make this our callback in case something blocks
|
||||
xd->setCallback ( st , gotXmlDoc );
|
||||
// reset this flag
|
||||
st->m_donePrinting = false;
|
||||
// prevent a core here in the event we download the page content
|
||||
xd->m_crawlDelayValid = true;
|
||||
xd->m_crawlDelay = 0;
|
||||
// . set xd from the old title rec if recycle is true
|
||||
// . can also use XmlDoc::m_loadFromOldTitleRec flag
|
||||
//if ( st->m_recycle ) xd->m_recycleContent = true;
|
||||
// only recycle if docid is given!!
|
||||
if ( st->m_recycle ) xd->m_recycleContent = true;
|
||||
// force this on
|
||||
//xd->m_useSiteLinkBuf = true;
|
||||
//xd->m_usePageLinkBuf = true;
|
||||
if ( isXml ) xd->m_printInXml = true;
|
||||
|
||||
return gotXmlDoc ( st );
|
||||
}
|
||||
|
||||
bool gotXmlDoc ( void *state ) {
|
||||
// cast it
|
||||
State8 *st = (State8 *)state;
|
||||
// get the xmldoc
|
||||
XmlDoc *xd = &st->m_xd;
|
||||
|
||||
// if we loaded from old title rec, it should be there!
|
||||
|
||||
// error?
|
||||
if ( g_errno ) return sendErrorReply ( st , g_errno );
|
||||
|
||||
bool printIt = false;
|
||||
if ( st->m_u && st->m_u[0] ) printIt = true;
|
||||
if ( st->m_docId != -1LL ) printIt = true;
|
||||
if ( st->m_donePrinting ) printIt = false;
|
||||
|
||||
// do not re-call this if printDocForProCog blocked... (check length())
|
||||
if ( printIt ) {
|
||||
// mark as done
|
||||
st->m_donePrinting = true;
|
||||
// always re-compute the page inlinks dynamically, do not
|
||||
// use the ptr_linkInfo1 stored in titlerec!!
|
||||
// NO! not if set from titlerec/docid
|
||||
if ( st->m_recompute )
|
||||
xd->m_linkInfo1Valid = false;
|
||||
// . print it out
|
||||
// . returns false if blocks, true otherwise
|
||||
// . sets g_errno on error
|
||||
if ( ! xd->printDocForProCog ( &st->m_xbuf, &st->m_r ) )
|
||||
return false;
|
||||
// error?
|
||||
if ( g_errno ) return sendErrorReply ( st , g_errno );
|
||||
}
|
||||
|
||||
int32_t isXml = st->m_r.getLong("xml",0);
|
||||
char ctype2 = CT_HTML;
|
||||
if ( isXml ) ctype2 = CT_XML;
|
||||
// now encapsulate it in html head/tail and send it off
|
||||
bool status = g_httpServer.sendDynamicPage( st->m_s ,
|
||||
st->m_xbuf.getBufStart(),
|
||||
st->m_xbuf.length() ,
|
||||
-1, //cachtime
|
||||
false ,//postreply?
|
||||
&ctype2,
|
||||
-1 , //httpstatus
|
||||
NULL,//cookie
|
||||
"utf-8");
|
||||
// delete the state now
|
||||
if ( st->m_freeIt ) {
|
||||
mdelete ( st , sizeof(State8) , "PageParser" );
|
||||
delete (st);
|
||||
}
|
||||
// return the status
|
||||
return status;
|
||||
}
|
||||
|
@ -4,6 +4,4 @@
|
||||
class TcpSocket;
|
||||
class HttpRequest;
|
||||
|
||||
bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) ;
|
||||
|
||||
#endif // GB_PAGEPARSER_H
|
||||
|
175
Parms.cpp
175
Parms.cpp
@ -38,6 +38,8 @@
|
||||
#include "GbDns.h"
|
||||
#include "SiteMedianPageTemperatureRegistry.h"
|
||||
#include "QueryLanguage.h"
|
||||
#include "SiteNumInlinks.h"
|
||||
#include "SiteMedianPageTemperature.h"
|
||||
#include <set>
|
||||
#include <fstream>
|
||||
|
||||
@ -1550,11 +1552,11 @@ bool Parms::printParm( SafeBuf* sb,
|
||||
// . make at least as big as a int64_t
|
||||
if ( j >= jend ) s = "\0\0\0\0\0\0\0\0";
|
||||
// delimit each cgi var if we need to
|
||||
if ( m->m_cgi && strlen(m->m_cgi) > 45 ) {
|
||||
char cgi[128];
|
||||
if ( m->m_cgi && strlen(m->m_cgi)+10 >= sizeof(cgi) ) { //10 digits
|
||||
log(LOG_LOGIC,"admin: Cgi variable is TOO big.");
|
||||
g_process.shutdownAbort(true);
|
||||
}
|
||||
char cgi[64];
|
||||
if ( m->m_cgi ) {
|
||||
if ( j > 0 ) sprintf ( cgi , "%s%" PRId32 , m->m_cgi , j );
|
||||
else sprintf ( cgi , "%s" , m->m_cgi );
|
||||
@ -3679,6 +3681,15 @@ void Parms::init ( ) {
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m++;
|
||||
|
||||
m->m_title = "adjective neuter<->common variants";
|
||||
m->m_desc = "Extend to both grammatical genders";
|
||||
simple_m_set(SearchInput,m_word_variations_config.m_word_variations_weights.adjective_grammatical_gender_simplification);
|
||||
m->m_defOff= offsetof(CollectionRec,m_word_variations_config.m_word_variations_weights.adjective_grammatical_gender_simplification);
|
||||
m->m_cgi = "lwv_adjective_grammatical_gender_simplification";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_RESULTS;
|
||||
m++;
|
||||
|
||||
|
||||
// limit to this # of the top term pairs from inlink text whose
|
||||
// score is accumulated
|
||||
@ -5496,7 +5507,7 @@ void Parms::init ( ) {
|
||||
m->m_off = offsetof(Conf,m_queryLanguageServerName);
|
||||
m->m_type = TYPE_STRING;
|
||||
m->m_def = "localhost";
|
||||
m->m_size = sizeof(Conf::m_urlClassificationServerName);
|
||||
m->m_size = sizeof(Conf::m_queryLanguageServerName);
|
||||
m->m_obj = OBJ_CONF;
|
||||
m->m_group = true;
|
||||
m->m_page = PAGE_MASTER;
|
||||
@ -5541,6 +5552,108 @@ void Parms::init ( ) {
|
||||
m->m_flags = PF_REBUILDQUERYLANGSETTINGS;
|
||||
m++;
|
||||
|
||||
m->m_title = "Site median page temperature server name";
|
||||
m->m_desc = "";
|
||||
m->m_cgi = "smpt_server_name";
|
||||
m->m_off = offsetof(Conf,m_siteMedianPageTemperatureServerName);
|
||||
m->m_type = TYPE_STRING;
|
||||
m->m_def = "localhost";
|
||||
m->m_size = sizeof(Conf::m_siteNumInlinksServerName);
|
||||
m->m_obj = OBJ_CONF;
|
||||
m->m_group = true;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_flags = PF_REBUILDSITEMEDIANPAGETEMPSETTINGS;
|
||||
m++;
|
||||
|
||||
m->m_title = "Site median page temperature server port";
|
||||
m->m_desc = "(0=disable; 8076=default server port)";
|
||||
m->m_cgi = "smpt_server_port";
|
||||
simple_m_set(Conf,m_siteMedianPageTemperatureServerPort);
|
||||
m->m_def = "0";
|
||||
m->m_smin = 0;
|
||||
m->m_smax = 65535;
|
||||
m->m_group = false;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m->m_flags = PF_REBUILDSITEMEDIANPAGETEMPSETTINGS;
|
||||
m++;
|
||||
|
||||
m->m_title = "Site median page temperature max outstanding requests";
|
||||
m->m_desc = "(0=disable)";
|
||||
m->m_cgi = "smpt_max_oustanding_requests";
|
||||
simple_m_set(Conf,m_maxOutstandingSiteMedianPageTemperature);
|
||||
m->m_def = "1000";
|
||||
m->m_smin = 0;
|
||||
m->m_group = false;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m->m_flags = PF_REBUILDSITEMEDIANPAGETEMPSETTINGS;
|
||||
m++;
|
||||
|
||||
m->m_title = "Site median page temperature timeout";
|
||||
m->m_desc = "Per-request timeout.";
|
||||
m->m_cgi = "smpt_timeout";
|
||||
simple_m_set(Conf,m_siteMedianPageTemperatureTimeout);
|
||||
m->m_def = "500";
|
||||
m->m_units = "milliseconds";
|
||||
m->m_smin = 0;
|
||||
m->m_group = false;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m->m_flags = PF_REBUILDSITEMEDIANPAGETEMPSETTINGS;
|
||||
m++;
|
||||
|
||||
m->m_title = "Site num inlinks server name";
|
||||
m->m_desc = "";
|
||||
m->m_cgi = "sni_server_name";
|
||||
m->m_off = offsetof(Conf,m_siteNumInlinksServerName);
|
||||
m->m_type = TYPE_STRING;
|
||||
m->m_def = "localhost";
|
||||
m->m_size = sizeof(Conf::m_siteNumInlinksServerName);
|
||||
m->m_obj = OBJ_CONF;
|
||||
m->m_group = true;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_flags = PF_REBUILDSITENUMINLINKSSETTINGS;
|
||||
m++;
|
||||
|
||||
m->m_title = "Site num inlinks server port";
|
||||
m->m_desc = "(0=disable; 8077=default server port)";
|
||||
m->m_cgi = "sni_server_port";
|
||||
simple_m_set(Conf,m_siteNumInlinksServerPort);
|
||||
m->m_def = "0";
|
||||
m->m_smin = 0;
|
||||
m->m_smax = 65535;
|
||||
m->m_group = false;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m->m_flags = PF_REBUILDSITENUMINLINKSSETTINGS;
|
||||
m++;
|
||||
|
||||
m->m_title = "Site num inlinks max outstanding requests";
|
||||
m->m_desc = "(0=disable)";
|
||||
m->m_cgi = "sni_max_oustanding_requests";
|
||||
simple_m_set(Conf,m_maxOutstandingSiteNumInlinks);
|
||||
m->m_def = "1000";
|
||||
m->m_smin = 0;
|
||||
m->m_group = false;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m->m_flags = PF_REBUILDSITENUMINLINKSSETTINGS;
|
||||
m++;
|
||||
|
||||
m->m_title = "Site num inlinks timeout";
|
||||
m->m_desc = "Per-request timeout.";
|
||||
m->m_cgi = "sni_timeout";
|
||||
simple_m_set(Conf,m_siteNumInlinksTimeout);
|
||||
m->m_def = "500";
|
||||
m->m_units = "milliseconds";
|
||||
m->m_smin = 0;
|
||||
m->m_group = false;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m->m_obj = OBJ_CONF;
|
||||
m->m_flags = PF_REBUILDSITENUMINLINKSSETTINGS;
|
||||
m++;
|
||||
|
||||
|
||||
m->m_title = "URL realtime classification server name";
|
||||
m->m_desc = "";
|
||||
@ -7464,6 +7577,15 @@ void Parms::init ( ) {
|
||||
m->m_page = PAGE_WORD_VARIATIONS;
|
||||
m++;
|
||||
|
||||
m->m_title = "adjective neuter<->common variants";
|
||||
m->m_desc = "Extend to both grammatical genders";
|
||||
m->m_def = "0.95";
|
||||
simple_m_set(CollectionRec,m_word_variations_config.m_word_variations_weights.adjective_grammatical_gender_simplification);
|
||||
m->m_cgi = "lwv_adjective_grammatical_gender_simplification";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_WORD_VARIATIONS;
|
||||
m++;
|
||||
|
||||
|
||||
|
||||
///////////////////////////////////////////
|
||||
@ -9086,9 +9208,9 @@ void Parms::init ( ) {
|
||||
m->m_page = PAGE_LOG;
|
||||
m++;
|
||||
|
||||
m->m_title = "log trace info for BlockList";
|
||||
m->m_title = "log trace info for MatchList";
|
||||
m->m_cgi = "ltrc_bl";
|
||||
simple_m_set(Conf,m_logTraceBlockList);
|
||||
simple_m_set(Conf,m_logTraceMatchList);
|
||||
m->m_def = "0";
|
||||
m->m_page = PAGE_LOG;
|
||||
m++;
|
||||
@ -9100,6 +9222,13 @@ void Parms::init ( ) {
|
||||
m->m_page = PAGE_LOG;
|
||||
m++;
|
||||
|
||||
m->m_title = "log trace info for Docid2FlagsAndSiteMap";
|
||||
m->m_cgi = "ltrc_dtofsm";
|
||||
simple_m_set(Conf,m_logTraceDocid2FlagsAndSiteMap);
|
||||
m->m_def = "0";
|
||||
m->m_page = PAGE_LOG;
|
||||
m++;
|
||||
|
||||
m->m_title = "log trace info for DocProcess";
|
||||
m->m_cgi = "ltrc_docpro";
|
||||
simple_m_set(Conf,m_logTraceDocProcess);
|
||||
@ -9325,6 +9454,20 @@ void Parms::init ( ) {
|
||||
m->m_page = PAGE_LOG;
|
||||
m++;
|
||||
|
||||
m->m_title = "log trace info for SiteMedianPageTemperature";
|
||||
m->m_cgi = "ltrc_smpt";
|
||||
simple_m_set(Conf,m_logTraceSiteMedianPageTemperature);
|
||||
m->m_def = "0";
|
||||
m->m_page = PAGE_LOG;
|
||||
m++;
|
||||
|
||||
m->m_title = "log trace info for SiteNumInlinks";
|
||||
m->m_cgi = "ltrc_sni";
|
||||
simple_m_set(Conf,m_logTraceSiteNumInlinks);
|
||||
m->m_def = "0";
|
||||
m->m_page = PAGE_LOG;
|
||||
m++;
|
||||
|
||||
m->m_title = "log trace info for Spider";
|
||||
m->m_cgi = "ltrc_sp";
|
||||
simple_m_set(Conf,m_logTraceSpider);
|
||||
@ -10783,6 +10926,8 @@ void Parms::handleRequest3fLoop(void *weArg) {
|
||||
bool rebuildDnsSettings = false;
|
||||
bool rebuildSpiderSettings = false;
|
||||
bool rebuildQueryLanguageSettings = false;
|
||||
bool rebuildSiteNumInlinksSettings = false;
|
||||
bool rebuildSiteMedianPageTemperatureSettings = false;
|
||||
|
||||
// process them
|
||||
const char *p = we->m_parmPtr;
|
||||
@ -10883,6 +11028,14 @@ void Parms::handleRequest3fLoop(void *weArg) {
|
||||
if (parm->m_flags & PF_REBUILDQUERYLANGSETTINGS) {
|
||||
rebuildQueryLanguageSettings = true;
|
||||
}
|
||||
|
||||
if (parm->m_flags & PF_REBUILDSITENUMINLINKSSETTINGS) {
|
||||
rebuildSiteNumInlinksSettings = true;
|
||||
}
|
||||
|
||||
if (parm->m_flags & PF_REBUILDSITEMEDIANPAGETEMPSETTINGS) {
|
||||
rebuildSiteMedianPageTemperatureSettings = true;
|
||||
}
|
||||
}
|
||||
|
||||
// do the next parm
|
||||
@ -10946,10 +11099,20 @@ void Parms::handleRequest3fLoop(void *weArg) {
|
||||
}
|
||||
|
||||
if (rebuildQueryLanguageSettings) {
|
||||
log("parms: rebuild fxclient settings");
|
||||
log("parms: rebuild querylanguage settings");
|
||||
g_queryLanguage.reinitializeSettings();
|
||||
}
|
||||
|
||||
if (rebuildSiteNumInlinksSettings) {
|
||||
log("parms: rebuild sitenuminlinks settings");
|
||||
g_siteNumInlinks.reinitializeSettings();
|
||||
}
|
||||
|
||||
if (rebuildSiteMedianPageTemperatureSettings) {
|
||||
log("parms: rebuild sitemedianpagetemperature settings");
|
||||
g_siteMedianPageTemperature.reinitializeSettings();
|
||||
}
|
||||
|
||||
// note it
|
||||
if ( ! we->m_sentReply )
|
||||
log("parms: sending parm update reply");
|
||||
|
4
Parms.h
4
Parms.h
@ -54,7 +54,7 @@ enum parameter_type_t {
|
||||
// bit flags for Parm::m_flags
|
||||
#define PF_COOKIE 0x00000001 // store in cookie?
|
||||
#define PF_REBUILDQUERYLANGSETTINGS 0x00000002
|
||||
//#define PF_UNUSED 0x00000004
|
||||
#define PF_REBUILDSITENUMINLINKSSETTINGS 0x00000004
|
||||
#define PF_REBUILDSPIDERSETTINGS 0x00000008
|
||||
#define PF_API 0x00000010
|
||||
#define PF_REBUILDURLFILTERS 0x00000020
|
||||
@ -78,7 +78,7 @@ enum parameter_type_t {
|
||||
|
||||
#define PF_REBUILDRANKINGSETTINGS 0x00200000 // ranking setting. Reinitialize any derived values
|
||||
#define PF_TABLESPLIT 0x00400000 // split into separate table
|
||||
|
||||
#define PF_REBUILDSITEMEDIANPAGETEMPSETTINGS 0x00800000
|
||||
|
||||
class Parm {
|
||||
public:
|
||||
|
@ -3930,6 +3930,7 @@ void PosdbTable::intersectLists_real() {
|
||||
if(g_pageTemperatureRegistry.query_page_temperature(m_docId, range_min, range_max, &page_temperature)) {
|
||||
//excellent, we know the page's temperature
|
||||
} else if(g_d2fasm.lookupSiteHash(m_docId,&sitehash32) && g_smptr.lookup(sitehash32,&raw_default_site_page_temperature)) {
|
||||
// we'll only use site median page temperature when we have updated docid2siteflags file
|
||||
//hmm, use the site-default page temperature
|
||||
page_temperature = g_pageTemperatureRegistry.scale_temperature(range_min, range_max, raw_default_site_page_temperature);
|
||||
} else {
|
||||
|
@ -50,6 +50,8 @@
|
||||
#include "DocRebuild.h"
|
||||
#include "DocReindex.h"
|
||||
#include "QueryLanguage.h"
|
||||
#include "SiteNumInlinks.h"
|
||||
#include "SiteMedianPageTemperature.h"
|
||||
#include <sys/statvfs.h>
|
||||
#include <pthread.h>
|
||||
#include <fcntl.h>
|
||||
@ -623,6 +625,8 @@ bool Process::shutdown2() {
|
||||
|
||||
g_urlRealtimeClassification.finalize();
|
||||
g_queryLanguage.finalize();
|
||||
g_siteNumInlinks.finalize();
|
||||
g_siteMedianPageTemperature.finalize();
|
||||
|
||||
WantedChecker::finalize();
|
||||
|
||||
|
15
Rdb.cpp
15
Rdb.cpp
@ -1830,14 +1830,19 @@ char getKeySizeFromRdbId(rdbid_t rdbId) {
|
||||
case RDB_LINKDB:
|
||||
case RDB2_LINKDB2:
|
||||
return sizeof(key224_t); // 28
|
||||
case RDB_NONE:
|
||||
case RDB_END:
|
||||
log(LOG_ERROR, "rdb: bad lookup rdbid of %i", (int)rdbId);
|
||||
g_process.shutdownAbort(true);
|
||||
case RDB_TITLEDB:
|
||||
case RDB2_TITLEDB2:
|
||||
case RDB_CLUSTERDB:
|
||||
case RDB2_CLUSTERDB2:
|
||||
case RDB_DOLEDB:
|
||||
return sizeof(key96_t); // 12
|
||||
case RDB_SITEDEFAULTPAGETEMPERATURE:
|
||||
return 8; //fake
|
||||
case RDB_NONE:
|
||||
case RDB_END:
|
||||
default:
|
||||
return sizeof(key96_t); // 12
|
||||
log(LOG_ERROR, "rdb: bad lookup rdbid of %i", (int)rdbId);
|
||||
g_process.shutdownAbort(true);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1178,9 +1178,9 @@ bool Repair::injectTitleRec ( ) {
|
||||
m_stage = STAGE_TITLEDB_0; // 0
|
||||
return true;
|
||||
}
|
||||
mnew ( xd , sizeof(XmlDoc),"xmldocpr");
|
||||
mnew ( xd , sizeof(XmlDoc),"xmldocpr");
|
||||
|
||||
if ( ! xd->set2 ( titleRec,-1,m_cr->m_coll , NULL , MAX_NICENESS ) ) {
|
||||
if (!xd->set2(titleRec, -1, m_cr->m_coll, MAX_NICENESS)) {
|
||||
m_recsetErrors++;
|
||||
m_stage = STAGE_TITLEDB_0; // 0
|
||||
logTrace(g_conf.m_logTraceRepairs,"END, return true. XmlDoc->set2 failed");
|
||||
@ -1290,6 +1290,9 @@ bool Repair::injectTitleRec ( ) {
|
||||
xd->m_blockedDocValid = true;
|
||||
xd->m_blockedDoc = false;
|
||||
|
||||
// don't check site median page temperature
|
||||
xd->m_calledServiceSiteMedianPageTemperature = true;
|
||||
|
||||
// . get the meta list to add
|
||||
// . sets m_usePosdb, m_useTitledb, etc.
|
||||
logTrace(g_conf.m_logTraceRepairs,"Calling indexDoc");
|
||||
@ -1323,7 +1326,7 @@ bool Repair::injectTitleRecSmall(char *titleRec, int32_t titleRecSize) {
|
||||
|
||||
//decompress+decode xmldoc
|
||||
XmlDoc xd;
|
||||
if(!xd.set2(titleRec,titleRecSize, m_cr->m_coll, NULL, MAX_NICENESS)) {
|
||||
if (!xd.set2(titleRec, titleRecSize, m_cr->m_coll, MAX_NICENESS)) {
|
||||
m_recsetErrors++;
|
||||
m_stage = STAGE_TITLEDB_0;
|
||||
logTrace(g_conf.m_logTraceRepairs,"END, return true. XmlDoc->set2 failed");
|
||||
|
@ -1,15 +0,0 @@
|
||||
#include "SiteDefaultPageTemperatureRemoteRegistry.h"
|
||||
|
||||
|
||||
bool SiteDefaultPageTemperatureRemoteRegistry::initialize() {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
void SiteDefaultPageTemperatureRemoteRegistry::finalize() {
|
||||
}
|
||||
|
||||
|
||||
bool SiteDefaultPageTemperatureRemoteRegistry::lookup(int32_t /*sitehash32*/, int64_t /*docId*/, void * /*ctx*/, callback_t /*callback*/) {
|
||||
return false;
|
||||
}
|
@ -1,26 +0,0 @@
|
||||
#ifndef SITEDEFAULTPAGETEMPERATUREREMOTEREGISTRY_H_
|
||||
#define SITEDEFAULTPAGETEMPERATUREREMOTEREGISTRY_H_
|
||||
#include <inttypes.h>
|
||||
|
||||
|
||||
namespace SiteDefaultPageTemperatureRemoteRegistry {
|
||||
|
||||
bool initialize();
|
||||
void finalize();
|
||||
|
||||
|
||||
//Look up the site-default page temperature.
|
||||
enum class lookup_result_t {
|
||||
error, //something went wrong, look for g_errno for details
|
||||
page_temperature_known, //page-specific temperature is known, use that
|
||||
site_temperature_known, //site-default temperature known, good
|
||||
site_unknown //site unknown, use global default temperature
|
||||
};
|
||||
typedef void (*callback_t)(void *ctx, unsigned siteDefaultPageTemperature, lookup_result_t result);
|
||||
bool lookup(int32_t sitehash32, int64_t docId, void *ctx, callback_t callback);
|
||||
|
||||
|
||||
} //namespace
|
||||
|
||||
|
||||
#endif
|
88
SiteMedianPageTemperature.cpp
Normal file
88
SiteMedianPageTemperature.cpp
Normal file
@ -0,0 +1,88 @@
|
||||
//
|
||||
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as
|
||||
// published by the Free Software Foundation, either version 3 of the
|
||||
// License, or (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
//
|
||||
// License TL;DR: If you change this file, you must publish your changes.
|
||||
//
|
||||
#include "SiteMedianPageTemperature.h"
|
||||
#include "Conf.h"
|
||||
#include "GbUtil.h"
|
||||
|
||||
// The protocol is very simple.
|
||||
// The server Receives queries in the form
|
||||
// <query-id>:v1|sitehash<NL>
|
||||
//
|
||||
// The server responses:
|
||||
// <query-id>:site_median_page_temperature<NL>
|
||||
|
||||
SiteMedianPageTemperature g_siteMedianPageTemperature;
|
||||
|
||||
// One outstanding lookup towards the site-median-page-temperature server.
// Carries the requester's callback and the sitehash being queried; the base
// class (FxClientRequest) stores the opaque context pointer and the timeout.
struct SiteMedianPageTemperatureRequest : public FxClientRequest {
	SiteMedianPageTemperatureRequest(void *context, int timeout_ms, site_median_page_temperature_callback_t callback, unsigned sitehash)
	  : FxClientRequest(context, timeout_ms)
	  , m_callback(callback)
	  , m_sitehash(sitehash) {
	}

	site_median_page_temperature_callback_t m_callback; // invoked with the result (or on error)
	unsigned m_sitehash;                                // hash identifying the site to look up
};
|
||||
|
||||
// v1|sitehash
|
||||
// Serialize one request as "<seq-8-hex>:v1|<sitehash-8-hex>\n" into out_buffer.
// 'seq' is the query-id the server echoes back so the reply can be matched to
// this request (see protocol comment at the top of this file).
void SiteMedianPageTemperature::convertRequestToWireFormat(IOBuffer *out_buffer, uint32_t seq, fxclient_request_ptr_t base_request) {
	std::shared_ptr<SiteMedianPageTemperatureRequest> request = std::dynamic_pointer_cast<SiteMedianPageTemperatureRequest>(base_request);

	// 8 hex digits + ':' + "v1|" + 8 hex digits + '\n'
	out_buffer->reserve_extra(8 + 1 + 3 + 8 + 1);

	sprintf(out_buffer->end(), "%08x", seq);
	out_buffer->push_back(8);
	out_buffer->end()[0] = ':';  // overwrites sprintf's trailing NUL
	out_buffer->push_back(1);

	memcpy(out_buffer->end(), "v1|", 3);
	out_buffer->push_back(3);

	sprintf(out_buffer->end(), "%08x", request->m_sitehash);
	out_buffer->push_back(8);
	out_buffer->end()[0] = '\n'; // overwrites sprintf's trailing NUL
	out_buffer->push_back(1);
}
|
||||
|
||||
// Parse the server's plain-text reply (the decimal site median page
// temperature) and hand the value to the requester's callback.
void SiteMedianPageTemperature::processResponse(fxclient_request_ptr_t base_request, char *response) {
	std::shared_ptr<SiteMedianPageTemperatureRequest> request = std::dynamic_pointer_cast<SiteMedianPageTemperatureRequest>(base_request);
	// %u: m_sitehash is unsigned, so %d would be a format-specifier mismatch
	logTrace(g_conf.m_logTraceSiteMedianPageTemperature, "Got result='%s' for sitehash=%u", response, request->m_sitehash);

	// Renamed from the copy-pasted 'site_num_inlinks' (this class returns a
	// page temperature, not an inlink count).
	unsigned long site_median_page_temperature = strtoul(response, nullptr, 10);
	(request->m_callback)(request->m_context, site_median_page_temperature);
}
|
||||
|
||||
// FxClient error path (e.g. timeout or connection failure): still notify the
// requester, passing a value-initialized count ({} == 0) so the caller's
// state machine can continue. NOTE(review): callers that treat -1 as the
// error sentinel will see 0 here instead — confirm intended.
void SiteMedianPageTemperature::errorCallback(fxclient_request_ptr_t base_request) {
	std::shared_ptr<SiteMedianPageTemperatureRequest> request = std::dynamic_pointer_cast<SiteMedianPageTemperatureRequest>(base_request);
	request->m_callback(request->m_context, {});
}
|
||||
|
||||
// Initialize the underlying FxClient with the configured server
// name/port/limits. Returns false if the client could not be set up.
bool SiteMedianPageTemperature::initialize() {
	return FxClient::initialize("site temperature", "sitetemp", g_conf.m_siteMedianPageTemperatureServerName, g_conf.m_siteMedianPageTemperatureServerPort,
	                            g_conf.m_maxOutstandingSiteMedianPageTemperature, g_conf.m_logTraceSiteMedianPageTemperature);
}
|
||||
|
||||
// Re-read the (possibly changed) configuration; called from the parm-update
// path when a PF_REBUILDSITEMEDIANPAGETEMPSETTINGS parameter changes.
void SiteMedianPageTemperature::reinitializeSettings() {
	FxClient::reinitializeSettings(g_conf.m_siteMedianPageTemperatureServerName, g_conf.m_siteMedianPageTemperatureServerPort,
	                               g_conf.m_maxOutstandingSiteMedianPageTemperature, g_conf.m_logTraceSiteMedianPageTemperature);
}
|
||||
|
||||
// Queue an asynchronous lookup of the median page temperature for 'sitehash'.
// 'callback' is invoked later with the result; 'context' is passed through
// unchanged. Returns false if the request could not be queued.
bool SiteMedianPageTemperature::getSiteMedianPageTemperature(void *context, site_median_page_temperature_callback_t callback, unsigned sitehash) {
	return sendRequest(std::static_pointer_cast<FxClientRequest>(std::make_shared<SiteMedianPageTemperatureRequest>(context, g_conf.m_siteMedianPageTemperatureTimeout, callback, sitehash)));
}
|
43
SiteMedianPageTemperature.h
Normal file
43
SiteMedianPageTemperature.h
Normal file
@ -0,0 +1,43 @@
|
||||
//
|
||||
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as
|
||||
// published by the Free Software Foundation, either version 3 of the
|
||||
// License, or (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
//
|
||||
// License TL;DR: If you change this file, you must publish your changes.
|
||||
//
|
||||
#ifndef FX_SITEMEDIANPAGETEMPERATURE_H
|
||||
#define FX_SITEMEDIANPAGETEMPERATURE_H
|
||||
|
||||
#include "FxClient.h"
|
||||
|
||||
typedef void (*site_median_page_temperature_callback_t)(void *context, long count);
|
||||
|
||||
// Async client for the external site-median-page-temperature service.
// Wire protocol: sends "<query-id>:v1|<sitehash>\n", receives
// "<query-id>:site_median_page_temperature\n". A single global instance
// (g_siteMedianPageTemperature) is used process-wide.
class SiteMedianPageTemperature : public FxClient {
public:
	bool initialize();            // connect using g_conf settings
	void reinitializeSettings();  // pick up changed g_conf settings

	using FxClient::finalize;

	// FxClient protocol hooks
	void convertRequestToWireFormat(IOBuffer *out_buffer, uint32_t seq, fxclient_request_ptr_t base_request) override;
	void processResponse(fxclient_request_ptr_t base_request, char *response) override;
	void errorCallback(fxclient_request_ptr_t base_request) override;

	// Queue an async lookup; 'callback' receives the result later.
	bool getSiteMedianPageTemperature(void *context, site_median_page_temperature_callback_t callback, unsigned sitehash);
};
|
||||
|
||||
extern SiteMedianPageTemperature g_siteMedianPageTemperature;
|
||||
|
||||
|
||||
#endif //FX_SITEMEDIANPAGETEMPERATURE_H
|
88
SiteNumInlinks.cpp
Normal file
88
SiteNumInlinks.cpp
Normal file
@ -0,0 +1,88 @@
|
||||
//
|
||||
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as
|
||||
// published by the Free Software Foundation, either version 3 of the
|
||||
// License, or (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
//
|
||||
// License TL;DR: If you change this file, you must publish your changes.
|
||||
//
|
||||
#include "SiteNumInlinks.h"
|
||||
#include "Conf.h"
|
||||
#include "GbUtil.h"
|
||||
|
||||
// The protocol is very simple.
|
||||
// The server Receives queries in the form
|
||||
// <query-id>:v1|sitehash<NL>
|
||||
//
|
||||
// The server responses:
|
||||
// <query-id>:site_inlink_count<NL>
|
||||
|
||||
SiteNumInlinks g_siteNumInlinks;
|
||||
|
||||
// One outstanding lookup towards the site-inlink-count server. Carries the
// requester's callback and the sitehash being queried; the base class
// (FxClientRequest) stores the opaque context pointer and the timeout.
struct SiteNumInlinksRequest : public FxClientRequest {
	SiteNumInlinksRequest(void *context, int timeout_ms, site_inlinks_count_callback_t callback, unsigned sitehash)
	  : FxClientRequest(context, timeout_ms)
	  , m_callback(callback)
	  , m_sitehash(sitehash) {
	}

	site_inlinks_count_callback_t m_callback; // invoked with the result (or on error)
	unsigned m_sitehash;                      // hash identifying the site to look up
};
|
||||
|
||||
// v1|sitehash
|
||||
// Serialize one request as "<seq-8-hex>:v1|<sitehash-8-hex>\n" into out_buffer.
// 'seq' is the query-id the server echoes back so the reply can be matched to
// this request (see protocol comment at the top of this file).
void SiteNumInlinks::convertRequestToWireFormat(IOBuffer *out_buffer, uint32_t seq, fxclient_request_ptr_t base_request) {
	std::shared_ptr<SiteNumInlinksRequest> request = std::dynamic_pointer_cast<SiteNumInlinksRequest>(base_request);

	// 8 hex digits + ':' + "v1|" + 8 hex digits + '\n'
	out_buffer->reserve_extra(8 + 1 + 3 + 8 + 1);

	sprintf(out_buffer->end(), "%08x", seq);
	out_buffer->push_back(8);
	out_buffer->end()[0] = ':';  // overwrites sprintf's trailing NUL
	out_buffer->push_back(1);

	memcpy(out_buffer->end(), "v1|", 3);
	out_buffer->push_back(3);

	sprintf(out_buffer->end(), "%08x", request->m_sitehash);
	out_buffer->push_back(8);
	out_buffer->end()[0] = '\n'; // overwrites sprintf's trailing NUL
	out_buffer->push_back(1);
}
|
||||
|
||||
// Parse the server's plain-text reply (the decimal site inlink count) and
// hand the value to the requester's callback.
void SiteNumInlinks::processResponse(fxclient_request_ptr_t base_request, char *response) {
	std::shared_ptr<SiteNumInlinksRequest> request = std::dynamic_pointer_cast<SiteNumInlinksRequest>(base_request);
	// %u: m_sitehash is unsigned, so %d would be a format-specifier mismatch
	logTrace(g_conf.m_logTraceSiteNumInlinks, "Got result='%s' for sitehash=%u", response, request->m_sitehash);

	unsigned long site_num_inlinks = strtoul(response, nullptr, 10);
	(request->m_callback)(request->m_context, site_num_inlinks);
}
|
||||
|
||||
// FxClient error path (e.g. timeout or connection failure): still notify the
// requester, passing a value-initialized count ({} == 0) so the caller's
// state machine can continue. NOTE(review): callers that treat -1 as the
// error sentinel will see 0 here instead — confirm intended.
void SiteNumInlinks::errorCallback(fxclient_request_ptr_t base_request) {
	std::shared_ptr<SiteNumInlinksRequest> request = std::dynamic_pointer_cast<SiteNumInlinksRequest>(base_request);
	request->m_callback(request->m_context, {});
}
|
||||
|
||||
// Initialize the underlying FxClient with the configured server
// name/port/limits. Returns false if the client could not be set up.
bool SiteNumInlinks::initialize() {
	return FxClient::initialize("site num inlinks", "sitenum", g_conf.m_siteNumInlinksServerName, g_conf.m_siteNumInlinksServerPort,
	                            g_conf.m_maxOutstandingSiteNumInlinks, g_conf.m_logTraceSiteNumInlinks);
}
|
||||
|
||||
// Re-read the (possibly changed) configuration; called from the parm-update
// path when a PF_REBUILDSITENUMINLINKSSETTINGS parameter changes.
void SiteNumInlinks::reinitializeSettings() {
	FxClient::reinitializeSettings(g_conf.m_siteNumInlinksServerName, g_conf.m_siteNumInlinksServerPort,
	                               g_conf.m_maxOutstandingSiteNumInlinks, g_conf.m_logTraceSiteNumInlinks);
}
|
||||
|
||||
// Queue an asynchronous lookup of the good-inlink count for 'sitehash'.
// 'callback' is invoked later with the result; 'context' is passed through
// unchanged. Returns false if the request could not be queued.
bool SiteNumInlinks::getSiteNumInlinks(void *context, site_inlinks_count_callback_t callback, unsigned sitehash) {
	return sendRequest(std::static_pointer_cast<FxClientRequest>(std::make_shared<SiteNumInlinksRequest>(context, g_conf.m_siteNumInlinksTimeout, callback, sitehash)));
}
|
43
SiteNumInlinks.h
Normal file
43
SiteNumInlinks.h
Normal file
@ -0,0 +1,43 @@
|
||||
//
|
||||
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as
|
||||
// published by the Free Software Foundation, either version 3 of the
|
||||
// License, or (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
//
|
||||
// License TL;DR: If you change this file, you must publish your changes.
|
||||
//
|
||||
#ifndef FX_SITENUMINLINKS_H
|
||||
#define FX_SITENUMINLINKS_H
|
||||
|
||||
#include "FxClient.h"
|
||||
|
||||
typedef void (*site_inlinks_count_callback_t)(void *context, long count);
|
||||
|
||||
// Async client for the external site-inlink-count service. Wire protocol:
// sends "<query-id>:v1|<sitehash>\n", receives
// "<query-id>:site_inlink_count\n". A single global instance
// (g_siteNumInlinks) is used process-wide.
class SiteNumInlinks : public FxClient {
public:
	bool initialize();            // connect using g_conf settings
	void reinitializeSettings();  // pick up changed g_conf settings

	using FxClient::finalize;

	// FxClient protocol hooks
	void convertRequestToWireFormat(IOBuffer *out_buffer, uint32_t seq, fxclient_request_ptr_t base_request) override;
	void processResponse(fxclient_request_ptr_t base_request, char *response) override;
	void errorCallback(fxclient_request_ptr_t base_request) override;

	// Queue an async lookup; 'callback' receives the result later.
	bool getSiteNumInlinks(void *context, site_inlinks_count_callback_t callback, unsigned sitehash);
};
|
||||
|
||||
extern SiteNumInlinks g_siteNumInlinks;
|
||||
|
||||
|
||||
#endif //FX_SITENUMINLINKS_H
|
@ -258,7 +258,7 @@ void filterTitledbList(RdbList *list) {
|
||||
|
||||
if (!KEYNEG(rec)) {
|
||||
XmlDoc xd;
|
||||
if (xd.set2(rec, recSize, "main", NULL, 0)) {
|
||||
if (xd.set2(rec, recSize, "main", 0)) {
|
||||
if (isUrlBlocked(*(xd.getFirstUrl()))) {
|
||||
++filteredCount;
|
||||
continue;
|
||||
|
@ -15,6 +15,8 @@
|
||||
|
||||
UrlMatchList g_urlBlackList("urlblacklist*.txt");
|
||||
UrlMatchList g_urlWhiteList("urlwhitelist.txt");
|
||||
UrlMatchList g_urlProxyList("urlproxylist.txt");
|
||||
UrlMatchList g_urlRetryProxyList("urlretryproxylist.txt");
|
||||
|
||||
typedef std::vector<UrlMatch> urlmatchlist_t;
|
||||
typedef spp::sparse_hash_map<std::string, urlmatchlist_t> urlmatchlist_map_t;
|
||||
@ -245,6 +247,8 @@ bool UrlMatchList::load() {
|
||||
if (firstColEnd == 6 && memcmp(line.data(), "domain", 6) == 0) {
|
||||
if (!parseDomain(&tmpUrlMatchList, col2, col3, col4)) {
|
||||
logError("Invalid line found. Ignoring line='%s'", line.c_str());
|
||||
// catch domain parsing errors here
|
||||
gbshutdownLogicError();
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
|
@ -44,4 +44,7 @@ private:
|
||||
extern UrlMatchList g_urlBlackList;
|
||||
extern UrlMatchList g_urlWhiteList;
|
||||
|
||||
extern UrlMatchList g_urlProxyList;
|
||||
extern UrlMatchList g_urlRetryProxyList;
|
||||
|
||||
#endif //GB_URLMATCHLIST_H_
|
||||
|
296
XmlDoc.cpp
296
XmlDoc.cpp
@ -55,7 +55,8 @@
|
||||
#include "IpBlockList.h"
|
||||
#include "PageTemperatureRegistry.h"
|
||||
#include "SiteMedianPageTemperatureRegistry.h"
|
||||
#include "SiteDefaultPageTemperatureRemoteRegistry.h"
|
||||
#include "SiteNumInlinks.h"
|
||||
#include "SiteMedianPageTemperature.h"
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <sysexits.h>
|
||||
@ -181,7 +182,7 @@ void XmlDoc::reset ( ) {
|
||||
m_checkedIpBlockList = false;
|
||||
m_defaultSitePageTemperature = 0;
|
||||
m_defaultSitePageTemperatureValid = false;
|
||||
m_defaultSitePageTemperatureIsUnset = false;
|
||||
m_calledServiceSiteMedianPageTemperature = false;
|
||||
m_parsedRobotsMetaTag = false;
|
||||
m_robotsNoIndex = false;
|
||||
m_robotsNoFollow = false;
|
||||
@ -336,8 +337,6 @@ void XmlDoc::reset ( ) {
|
||||
|
||||
m_wtsTable.reset();
|
||||
m_wbuf.reset();
|
||||
m_pageLinkBuf.reset();
|
||||
m_siteLinkBuf.reset();
|
||||
m_esbuf.reset();
|
||||
m_tagRecBuf.reset();
|
||||
|
||||
@ -420,6 +419,8 @@ void XmlDoc::reset ( ) {
|
||||
// do not cache the http reply in msg13 etc.
|
||||
m_maxCacheAge = 0;
|
||||
|
||||
m_calledServiceSiteNumInlinks = false;
|
||||
|
||||
// reset these ptrs too!
|
||||
void *px = &ptr_firstUrl;
|
||||
void *pxend = &m_dummyEnd;
|
||||
@ -555,7 +556,7 @@ bool XmlDoc::loadFromOldTitleRec() {
|
||||
|
||||
// use that. decompress it! this will also set
|
||||
// m_setFromTitleRec to true
|
||||
if (!set2(m_oldTitleRec, m_oldTitleRecSize, cr->m_coll, nullptr, m_niceness)) {
|
||||
if (!set2(m_oldTitleRec, m_oldTitleRecSize, cr->m_coll, m_niceness)) {
|
||||
// we are now loaded, do not re-call
|
||||
m_loaded = true;
|
||||
|
||||
@ -705,9 +706,7 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
|
||||
m_wasContentInjected = true;
|
||||
m_contentType = contentType;
|
||||
m_contentTypeValid = true;
|
||||
// use this ip as well for now to avoid ip lookup
|
||||
//m_ip = atoip("127.0.0.1");
|
||||
//m_ipValid = true;
|
||||
|
||||
// do not need robots.txt then
|
||||
m_isAllowed = true;
|
||||
m_isAllowedValid = true;
|
||||
@ -840,7 +839,6 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
|
||||
bool XmlDoc::set2 ( char *titleRec ,
|
||||
int32_t maxSize ,
|
||||
const char *coll ,
|
||||
SafeBuf *pbuf ,
|
||||
int32_t niceness ,
|
||||
SpiderRequest *sreq ) {
|
||||
|
||||
@ -849,37 +847,12 @@ bool XmlDoc::set2 ( char *titleRec ,
|
||||
|
||||
setStatus ( "setting xml doc from title rec");
|
||||
|
||||
// . it resets us, so save this
|
||||
// . we only save these for set2() not the other sets()!
|
||||
//void (*cb1)(void *state) = m_callback1;
|
||||
//bool (*cb2)(void *state) = m_callback2;
|
||||
//void *state = m_state;
|
||||
|
||||
// . clear it all out
|
||||
// . no! this is clearing our msg20/msg22 reply...
|
||||
// . ok, but repair.cpp needs it so do it there then
|
||||
//reset();
|
||||
|
||||
// restore callbacks
|
||||
//m_callback1 = cb1;
|
||||
//m_callback2 = cb2;
|
||||
//m_state = state;
|
||||
|
||||
// sanity check - since we do not reset
|
||||
if ( m_contentValid ) { g_process.shutdownAbort(true); }
|
||||
|
||||
// this is true
|
||||
m_setFromTitleRec = true;
|
||||
|
||||
// this is valid i guess. includes key, etc.
|
||||
//m_titleRec = titleRec;
|
||||
//m_titleRecSize = *(int32_t *)(titleRec+12) + sizeof(key96_t) + 4;
|
||||
//m_titleRecValid = true;
|
||||
// . should we free m_cbuf on our reset/destruction?
|
||||
// . no because doCOnsistencyCheck calls XmlDoc::set2 with a titleRec
|
||||
// that should not be freed, besides the alloc size is not known!
|
||||
//m_freeTitleRec = false;
|
||||
|
||||
// it must be there!
|
||||
if ( !titleRec ) { g_errno=ENOTFOUND; return false; }
|
||||
|
||||
@ -900,8 +873,6 @@ bool XmlDoc::set2 ( char *titleRec ,
|
||||
}
|
||||
m_titleRecBufValid = true;
|
||||
|
||||
//m_coll = coll;
|
||||
m_pbuf = pbuf;
|
||||
m_niceness = niceness;
|
||||
|
||||
// set our collection number
|
||||
@ -1071,14 +1042,6 @@ bool XmlDoc::set2 ( char *titleRec ,
|
||||
// set our easy stuff
|
||||
gbmemcpy ( (void *)this , m_ubuf , headerSize );
|
||||
|
||||
// NOW set the XmlDoc::ptr_* and XmlDoc::size_* members
|
||||
// like in Msg.cpp and Msg20Reply.cpp
|
||||
if ( m_pbuf ) {
|
||||
int32_t crc = hash32(m_ubuf,headerSize);
|
||||
m_pbuf->safePrintf("crchdr=0x%" PRIx32" sizehdr=%" PRId32", ",
|
||||
crc,headerSize);
|
||||
}
|
||||
|
||||
|
||||
// point to the string data
|
||||
char *up = m_ubuf + headerSize;
|
||||
@ -1128,12 +1091,6 @@ bool XmlDoc::set2 ( char *titleRec ,
|
||||
// point to the data. could be 64-bit ptr.
|
||||
*pd = up;//(int32_t)up;
|
||||
|
||||
// debug
|
||||
if ( m_pbuf ) {
|
||||
int32_t crc = hash32(up,*ps);
|
||||
m_pbuf->safePrintf("crc%" PRId32"=0x%" PRIx32" size%" PRId32"=%" PRId32", ",
|
||||
i,crc,i,*ps);
|
||||
}
|
||||
// skip over data
|
||||
up += *ps;
|
||||
|
||||
@ -1489,7 +1446,7 @@ bool XmlDoc::injectDoc(const char *url,
|
||||
m_indexCodeValid = true;
|
||||
}
|
||||
|
||||
if (httpStatus != 200) {
|
||||
if (httpStatus != 0 && httpStatus != 200) {
|
||||
m_httpStatus = httpStatus;
|
||||
m_httpStatusValid = true;
|
||||
}
|
||||
@ -2029,58 +1986,47 @@ bool* XmlDoc::checkBlockList() {
|
||||
return &m_blockedDoc;
|
||||
}
|
||||
|
||||
static void gotDefaultSitePageTemperature(void *context, long count) {
|
||||
XmlDoc *xmlDoc = reinterpret_cast<XmlDoc*>(context);
|
||||
if (count != -1) {
|
||||
xmlDoc->m_defaultSitePageTemperature = count;
|
||||
xmlDoc->m_defaultSitePageTemperatureValid = true;
|
||||
}
|
||||
|
||||
xmlDoc->m_masterLoop(xmlDoc->m_masterState);
|
||||
}
|
||||
|
||||
unsigned *XmlDoc::getDefaultSitePageTemperature() {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "BEGIN");
|
||||
if(m_defaultSitePageTemperatureIsUnset) {
|
||||
//already tried to look up. Don't try it again
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, already tried, (unset)");
|
||||
return NULL;
|
||||
}
|
||||
if(m_defaultSitePageTemperatureValid) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, already valid. m_defaultSitePageTemperature=%u" , m_defaultSitePageTemperature);
|
||||
|
||||
if (m_defaultSitePageTemperatureValid) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, already valid. m_defaultSitePageTemperature=%u", m_defaultSitePageTemperature);
|
||||
return &m_defaultSitePageTemperature;
|
||||
}
|
||||
|
||||
|
||||
int64_t *docId = getDocId();
|
||||
if(!docId || docId==(int64_t*)-1) {
|
||||
if (!docId || docId == (int64_t *)-1) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, getDocId() failed or blocked");
|
||||
return (unsigned*)docId;
|
||||
return (unsigned *)docId;
|
||||
}
|
||||
|
||||
|
||||
int32_t *sitehash32 = getSiteHash32();
|
||||
if(sitehash32==NULL || sitehash32==(int32_t*)-1) {
|
||||
if (sitehash32 == NULL || sitehash32 == (int32_t *)-1) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, getSiteHash32 failed/blocked");
|
||||
return (unsigned*)sitehash32;
|
||||
return (unsigned *)sitehash32;
|
||||
}
|
||||
|
||||
if(g_smptr.lookup(*sitehash32, &m_defaultSitePageTemperature)) {
|
||||
m_defaultSitePageTemperatureValid = true;
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, SiteMedianPageTemperatureRegistry hit");
|
||||
return &m_defaultSitePageTemperature;
|
||||
}
|
||||
|
||||
m_defaultSitePageTemperatureIsUnset = true; //make sure we try this only once
|
||||
if(!SiteDefaultPageTemperatureRemoteRegistry::lookup(*sitehash32, m_docId, this, &XmlDoc::gotDefaultSitePageTemperature)) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, SiteDefaultPageTemperatureRemoteRegistry is disabled");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, SiteDefaultPageTemperatureRemoteRegistry::lookup() blocked");
|
||||
return (unsigned*)-1;
|
||||
}
|
||||
|
||||
void XmlDoc::gotDefaultSitePageTemperature(void *ctx, unsigned siteDefaultPageTemperature, SiteDefaultPageTemperatureRemoteRegistry::lookup_result_t result) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "BEGIN, siteDefaultPageTemperature=%u, result=%d", siteDefaultPageTemperature,(int)result);
|
||||
XmlDoc *that = reinterpret_cast<XmlDoc*>(ctx);
|
||||
if(result==SiteDefaultPageTemperatureRemoteRegistry::lookup_result_t::site_temperature_known) {
|
||||
that->m_defaultSitePageTemperature = siteDefaultPageTemperature;
|
||||
that->m_defaultSitePageTemperatureValid = true;
|
||||
} else
|
||||
that->m_defaultSitePageTemperatureIsUnset = true;
|
||||
indexDocWrapper(that);
|
||||
}
|
||||
|
||||
if (!m_calledServiceSiteMedianPageTemperature &&
|
||||
g_siteMedianPageTemperature.getSiteMedianPageTemperature(this, gotDefaultSitePageTemperature, *sitehash32)) {
|
||||
m_calledServiceSiteMedianPageTemperature = true;
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, SiteMedianPageTemperature::getSiteMedianPageTemperature is blocked");
|
||||
return (unsigned *)-1;
|
||||
}
|
||||
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, SiteMedianPageTemperature is disabled");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// . returns false if blocked, true otherwise
|
||||
// . sets g_errno on error and returns true
|
||||
@ -4363,8 +4309,7 @@ Links *XmlDoc::getLinks ( bool doQuickSet ) {
|
||||
|
||||
// . apply link spam settings
|
||||
// . set the "spam bits" in the Links class
|
||||
setLinkSpam ( *ip ,
|
||||
u , // linker url
|
||||
setLinkSpam (u , // linker url
|
||||
*sni ,
|
||||
xml ,
|
||||
&m_links ,
|
||||
@ -6006,11 +5951,7 @@ XmlDoc **XmlDoc::getOldXmlDoc ( ) {
|
||||
// ,m_firstUrl.getUrl());
|
||||
// if title rec is corrupted data uncompress will fail and this
|
||||
// will return false!
|
||||
if ( ! m_oldDoc->set2 ( m_oldTitleRec ,
|
||||
m_oldTitleRecSize , // maxSize
|
||||
cr->m_coll ,
|
||||
NULL , // pbuf
|
||||
m_niceness ) ) {
|
||||
if (!m_oldDoc->set2(m_oldTitleRec, m_oldTitleRecSize, cr->m_coll, m_niceness)) {
|
||||
log("build: failed to set old doc for %s",m_firstUrl.getUrl());
|
||||
if ( ! g_errno ) { g_process.shutdownAbort(true); }
|
||||
//int32_t saved = g_errno;
|
||||
@ -6286,11 +6227,7 @@ XmlDoc **XmlDoc::getRootXmlDoc ( int32_t maxCacheAge ) {
|
||||
mnew ( m_rootDoc , sizeof(XmlDoc),"xmldoc3");
|
||||
// if we had the title rec, set from that
|
||||
if ( *rtr ) {
|
||||
if ( ! m_rootDoc->set2 ( m_rootTitleRec ,
|
||||
m_rootTitleRecSize , // maxSize ,
|
||||
cr->m_coll ,
|
||||
NULL , // pbuf
|
||||
m_niceness ) ) {
|
||||
if (!m_rootDoc->set2(m_rootTitleRec, m_rootTitleRecSize, cr->m_coll, m_niceness)) {
|
||||
// it was corrupted... delete this
|
||||
// possibly printed
|
||||
// " uncompress uncompressed size=..." bad uncompress
|
||||
@ -6799,6 +6736,16 @@ int32_t *XmlDoc::getFirstIp ( ) {
|
||||
return &m_firstIp;
|
||||
}
|
||||
|
||||
static void gotSiteNumInlinksWrapper(void *context, long count) {
|
||||
XmlDoc *xmlDoc = reinterpret_cast<XmlDoc*>(context);
|
||||
if (count != -1) {
|
||||
xmlDoc->m_siteNumInlinks = count;
|
||||
xmlDoc->m_siteNumInlinksValid = true;
|
||||
}
|
||||
|
||||
xmlDoc->m_masterLoop(xmlDoc->m_masterState);
|
||||
}
|
||||
|
||||
// this is the # of GOOD INLINKS to the site. so it is no more than
|
||||
// 1 per c block, and it has to pass link spam detection. this is the
|
||||
// highest-level count of inlinks to the site. use it a lot.
|
||||
@ -6836,6 +6783,17 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
|
||||
return &m_siteNumInlinks;
|
||||
}
|
||||
|
||||
int32_t *sh32 = getSiteHash32();
|
||||
if (!sh32 || sh32 == (void *)-1) {
|
||||
return (int32_t *)sh32;
|
||||
}
|
||||
|
||||
// make sure we only call site num inlink server once
|
||||
if (!m_calledServiceSiteNumInlinks && g_siteNumInlinks.getSiteNumInlinks(this, gotSiteNumInlinksWrapper, *sh32)) {
|
||||
m_calledServiceSiteNumInlinks = true;
|
||||
return (int32_t*)-1;
|
||||
}
|
||||
|
||||
setStatus ( "getting site num inlinks");
|
||||
|
||||
// get it from the tag rec if we can
|
||||
@ -7043,12 +7001,6 @@ LinkInfo *XmlDoc::getSiteLinkInfo() {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// can we be cancelled?
|
||||
bool canBeCancelled = true;
|
||||
// not if pageparser though
|
||||
if ( m_pbuf ) canBeCancelled = false;
|
||||
// not if injecting
|
||||
if ( ! m_sreqValid ) canBeCancelled = false;
|
||||
// assume valid when it returns
|
||||
m_siteLinkInfoValid = true;
|
||||
|
||||
@ -7080,7 +7032,6 @@ LinkInfo *XmlDoc::getSiteLinkInfo() {
|
||||
m_niceness ,
|
||||
cr->m_doLinkSpamCheck ,
|
||||
cr->m_oneVotePerIpDom ,
|
||||
canBeCancelled ,
|
||||
lastUpdateTime ,
|
||||
onlyNeedGoodInlinks ,
|
||||
0,
|
||||
@ -7863,14 +7814,6 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
|
||||
|
||||
// do not redo it
|
||||
m_calledMsg25 = true;
|
||||
// shortcut
|
||||
//Msg25 *m = &m_msg25;
|
||||
// can we be cancelled?
|
||||
bool canBeCancelled = true;
|
||||
// not if pageparser though
|
||||
if ( m_pbuf ) canBeCancelled = false;
|
||||
// not if injecting
|
||||
if ( ! m_sreqValid ) canBeCancelled = false;
|
||||
|
||||
// we do not want to waste time computing the page title
|
||||
// of bad inlinks if we only want the good inlinks, because
|
||||
@ -7914,7 +7857,6 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
|
||||
m_niceness ,
|
||||
doLinkSpamCheck ,
|
||||
oneVotePerIpDom ,
|
||||
canBeCancelled ,
|
||||
lastUpdateTime ,
|
||||
onlyNeedGoodInlinks ,
|
||||
0, // ourhosthash32 (special)
|
||||
@ -13547,7 +13489,7 @@ char *XmlDoc::getMetaList(bool forDelete) {
|
||||
g_process.shutdownAbort(true);
|
||||
}
|
||||
|
||||
if(m_defaultSitePageTemperatureValid) {
|
||||
if(!forDelete && m_defaultSitePageTemperatureValid) {
|
||||
*m_p++ = RDB_SITEDEFAULTPAGETEMPERATURE;
|
||||
uint64_t k = (m_docId<<1) | 0x01; //magic bit shuffling so msg4 can treat it as a normal rdb key with negative-bit etc.
|
||||
memcpy(m_p, &k, 8);
|
||||
@ -14038,9 +13980,11 @@ skipNewAdd2:
|
||||
|
||||
// store data
|
||||
if (ds) {
|
||||
// store data size
|
||||
*(int32_t *)nptr = ds;
|
||||
nptr += 4;
|
||||
// only store data size if it's not fixed sized
|
||||
if (getDataSizeFromRdbId(rdbId) == -1) {
|
||||
*(int32_t *) nptr = ds;
|
||||
nptr += 4;
|
||||
}
|
||||
|
||||
gbmemcpy (nptr, data, ds);
|
||||
nptr += ds;
|
||||
@ -15437,7 +15381,7 @@ Msg20Reply *XmlDoc::getMsg20ReplyStepwise() {
|
||||
if ( ! m_setTr ) {
|
||||
// . this completely resets us
|
||||
// . this returns false with g_errno set on error
|
||||
bool status = set2( *otr, 0, cr->m_coll, NULL, m_niceness);
|
||||
bool status = set2( *otr, 0, cr->m_coll, m_niceness);
|
||||
|
||||
// sanity check
|
||||
if ( ! status && ! g_errno ) {
|
||||
@ -15560,18 +15504,6 @@ Msg20Reply *XmlDoc::getMsg20ReplyStepwise() {
|
||||
bool getThatTitle = true;
|
||||
if ( m_req->m_titleMaxLen <= 0 ) getThatTitle = false;
|
||||
if ( m_reply.ptr_tbuf ) getThatTitle = false;
|
||||
// if steve's requesting the inlink summary we will want to get
|
||||
// the title of each linker even if they are spammy!
|
||||
// only get title here if NOT getting link text otherwise
|
||||
// we only get it down below if not a spammy voter, because
|
||||
// this sets the damn slow sections class
|
||||
if ( m_req->m_getLinkText &&
|
||||
! m_useSiteLinkBuf &&
|
||||
! m_usePageLinkBuf &&
|
||||
// m_pbuf is used by pageparser.cpp now, not the other two things
|
||||
// above this.
|
||||
! m_pbuf )
|
||||
getThatTitle = false;
|
||||
|
||||
// if steve is getting the inlinks, bad and good, for displaying
|
||||
// then get the title here now... otherwise, if we are just spidering
|
||||
@ -15904,9 +15836,6 @@ Msg20Reply *XmlDoc::getMsg20ReplyStepwise() {
|
||||
m_reply.size_rssItem = rssItemLen + 1;
|
||||
}
|
||||
|
||||
if ( ! m_req->m_doLinkSpamCheck )
|
||||
m_reply.m_isLinkSpam = 0;
|
||||
|
||||
if ( m_req->m_doLinkSpamCheck ) {
|
||||
// reset to NULL to avoid strlen segfault
|
||||
const char *note = NULL;
|
||||
@ -15918,7 +15847,6 @@ Msg20Reply *XmlDoc::getMsg20ReplyStepwise() {
|
||||
|
||||
// get it. does not block.
|
||||
m_reply.m_isLinkSpam = ::isLinkSpam ( linker ,
|
||||
m_ip ,
|
||||
m_siteNumInlinks,
|
||||
&m_xml,
|
||||
links,
|
||||
@ -15938,12 +15866,15 @@ Msg20Reply *XmlDoc::getMsg20ReplyStepwise() {
|
||||
m_reply.size_note = strlen(note)+1;
|
||||
}
|
||||
// log the reason why it is a log page
|
||||
if ( m_reply.m_isLinkSpam )
|
||||
log(LOG_DEBUG,"build: linker %s: %s.",
|
||||
linker->getUrl(),note);
|
||||
// sanity
|
||||
if ( m_reply.m_isLinkSpam && ! note )
|
||||
log("linkspam: missing note for d=%" PRId64"!",m_docId);
|
||||
if (m_reply.m_isLinkSpam) {
|
||||
log(LOG_DEBUG, "build: linker %s: %s.", linker->getUrl(), note);
|
||||
|
||||
if (!note) {
|
||||
log(LOG_WARN, "linkspam: missing note for d=%" PRId64"!", m_docId);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
m_reply.m_isLinkSpam = 0;
|
||||
}
|
||||
|
||||
// sanity check
|
||||
@ -16721,7 +16652,6 @@ char *XmlDoc::getIsLinkSpam ( ) {
|
||||
// . doc length over 100,000 bytes consider it link spam
|
||||
m_isLinkSpamValid = true;
|
||||
m_isLinkSpam = ::isLinkSpam ( getFirstUrl(), // linker
|
||||
*ip ,
|
||||
*sni ,
|
||||
xml,
|
||||
links,
|
||||
@ -17441,23 +17371,6 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
|
||||
|
||||
printRainbowSections ( sb , NULL );
|
||||
|
||||
//
|
||||
// PRINT LINKINFO
|
||||
//
|
||||
|
||||
char *p = m_pageLinkBuf.getBufStart();
|
||||
int32_t plen = m_pageLinkBuf.length();
|
||||
sb->safeMemcpy ( p , plen );
|
||||
|
||||
|
||||
//
|
||||
// PRINT SITE LINKINFO
|
||||
//
|
||||
p = m_siteLinkBuf.getBufStart();
|
||||
plen = m_siteLinkBuf.length();
|
||||
sb->safeMemcpy ( p , plen );
|
||||
|
||||
|
||||
// note this
|
||||
sb->safePrintf("<h2>NEW Meta List</h2>");
|
||||
|
||||
@ -17659,8 +17572,7 @@ bool XmlDoc::printDocForProCog ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
if ( page == 2 )
|
||||
return printPageInlinks(sb,hr);
|
||||
|
||||
if ( page == 3 )
|
||||
return printSiteInlinks(sb,hr);
|
||||
// 3 used to be print site inlinks (nothing is printed)
|
||||
|
||||
if ( page == 4 )
|
||||
return printRainbowSections(sb,hr);
|
||||
@ -17668,8 +17580,7 @@ bool XmlDoc::printDocForProCog ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
if ( page == 5 )
|
||||
return printTermList(sb,hr);
|
||||
|
||||
if ( page == 6 )
|
||||
return printSpiderStats(sb,hr);
|
||||
// 6 used to be print spider stats (coming soon page)
|
||||
|
||||
if ( page == 7 )
|
||||
return printCachedPage(sb,hr);
|
||||
@ -18113,44 +18024,6 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool XmlDoc::printSiteInlinks ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
|
||||
// use msg25 to hit linkdb and give us a link info class i guess
|
||||
// but we need paging functionality so we can page through like
|
||||
// 100 links at a time. clustered by c-class ip.
|
||||
|
||||
// do we need to mention how many from each ip c-class then? because
|
||||
// then we'd have to read the whole termlist, might be several
|
||||
// separate disk reads.
|
||||
|
||||
// we need to re-get both if either is NULL
|
||||
LinkInfo *sinfo = getSiteLinkInfo();
|
||||
// block or error?
|
||||
if ( ! sinfo ) return true;
|
||||
if ( sinfo == (LinkInfo *)-1) return false;
|
||||
|
||||
int32_t isXml = hr->getLong("xml",0);
|
||||
|
||||
if ( ! isXml ) printMenu ( sb );
|
||||
|
||||
if ( isXml )
|
||||
sb->safePrintf ("<?xml version=\"1.0\" "
|
||||
"encoding=\"UTF-8\" ?>\n"
|
||||
"<response>\n"
|
||||
);
|
||||
|
||||
|
||||
sb->safeMemcpy ( &m_siteLinkBuf );
|
||||
|
||||
if ( isXml )
|
||||
sb->safePrintf ("</response>\n" );
|
||||
|
||||
// just print that
|
||||
//sinfo->print ( sb , cr->m_coll );
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool XmlDoc::printPageInlinks ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
|
||||
// we need to re-get both if either is NULL
|
||||
@ -18177,8 +18050,6 @@ bool XmlDoc::printPageInlinks ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
// i guess we need this
|
||||
if ( ! recompute ) // m_setFromTitleRec )
|
||||
info1->print ( sb , cr->m_coll );
|
||||
else
|
||||
sb->safeMemcpy ( &m_pageLinkBuf );
|
||||
|
||||
if ( isXml )
|
||||
sb->safePrintf ("</response>\n" );
|
||||
@ -18675,17 +18546,6 @@ bool XmlDoc::printTermList ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool XmlDoc::printSpiderStats ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
|
||||
int32_t isXml = hr->getLong("xml",0);
|
||||
|
||||
if ( ! isXml ) printMenu ( sb );
|
||||
|
||||
sb->safePrintf("<b>Coming Soon</b>");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool XmlDoc::printCachedPage ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
|
||||
char **c = getUtf8Content();
|
||||
|
16
XmlDoc.h
16
XmlDoc.h
@ -51,7 +51,6 @@
|
||||
#include "HttpMime.h" // ET_DEFLAT
|
||||
#include "Json.h"
|
||||
#include "Posdb.h"
|
||||
#include "SiteDefaultPageTemperatureRemoteRegistry.h" //SiteDefaultPageTemperatureRemoteRegistry::lookup_result_t
|
||||
|
||||
|
||||
// forward declaration
|
||||
@ -282,7 +281,6 @@ public:
|
||||
bool set2 ( char *titleRec,
|
||||
int32_t maxSize,
|
||||
const char *coll,
|
||||
SafeBuf *p,
|
||||
int32_t niceness ,
|
||||
class SpiderRequest *sreq = NULL );
|
||||
|
||||
@ -338,7 +336,6 @@ public:
|
||||
|
||||
bool *checkBlockList();
|
||||
unsigned *getDefaultSitePageTemperature();
|
||||
static void gotDefaultSitePageTemperature(void *ctx, unsigned siteDefaultPageTemperature, SiteDefaultPageTemperatureRemoteRegistry::lookup_result_t result);
|
||||
|
||||
bool *parseRobotsMetaTag();
|
||||
void parseRobotsMetaTagContent(const char *content, int32_t contentLen);
|
||||
@ -582,14 +579,10 @@ public:
|
||||
bool printDocForProCog ( class SafeBuf *sb , HttpRequest *hr ) ;
|
||||
bool printGeneralInfo ( class SafeBuf *sb , HttpRequest *hr ) ;
|
||||
bool printRainbowSections ( class SafeBuf *sb , HttpRequest *hr );
|
||||
bool printSiteInlinks ( class SafeBuf *sb , HttpRequest *hr );
|
||||
bool printPageInlinks ( class SafeBuf *sb , HttpRequest *hr );
|
||||
bool printTermList ( class SafeBuf *sb , HttpRequest *hr );
|
||||
bool printSpiderStats ( class SafeBuf *sb , HttpRequest *hr );
|
||||
bool printCachedPage ( class SafeBuf *sb , HttpRequest *hr );
|
||||
|
||||
void printTermList() const;
|
||||
|
||||
char *getTitleBuf ( );
|
||||
char *getRootTitleBuf ( );
|
||||
char *getFilteredRootTitleBuf ( );
|
||||
@ -1098,10 +1091,6 @@ public:
|
||||
HashTableX m_wtsTable;
|
||||
SafeBuf m_wbuf;
|
||||
|
||||
// Msg25.cpp stores its pageparser.cpp output into this one
|
||||
SafeBuf m_pageLinkBuf;
|
||||
SafeBuf m_siteLinkBuf;
|
||||
|
||||
// which set() function was called above to set us?
|
||||
bool m_setFromTitleRec;
|
||||
bool m_setFromSpiderRec;
|
||||
@ -1139,7 +1128,8 @@ public:
|
||||
bool m_checkedIpBlockList;
|
||||
|
||||
unsigned m_defaultSitePageTemperature;
|
||||
bool m_defaultSitePageTemperatureIsUnset;
|
||||
bool m_calledServiceSiteMedianPageTemperature;
|
||||
|
||||
bool m_parsedRobotsMetaTag;
|
||||
bool m_robotsNoIndex;
|
||||
bool m_robotsNoFollow;
|
||||
@ -1188,6 +1178,8 @@ public:
|
||||
void logQueryTimingEnd(const char* function, int64_t startTime);
|
||||
|
||||
void callCallback();
|
||||
|
||||
bool m_calledServiceSiteNumInlinks;
|
||||
};
|
||||
|
||||
// . PageParser.cpp uses this class for printing hashed terms out by calling
|
||||
|
11
linkspam.cpp
11
linkspam.cpp
@ -366,8 +366,7 @@ static bool isWebstatisticsPage(const Xml *xml) {
|
||||
// . otherwise, each outlink in "links" is assigned a "note" to indicate if
|
||||
// the outlink is a spam link or not
|
||||
// . returns true on success, false on error
|
||||
bool setLinkSpam ( int32_t ip ,
|
||||
const Url *linker ,
|
||||
bool setLinkSpam (const Url *linker ,
|
||||
int32_t siteNumInlinks ,
|
||||
Xml *xml ,
|
||||
Links *links ,
|
||||
@ -614,9 +613,7 @@ bool setLinkSpam ( int32_t ip ,
|
||||
|
||||
|
||||
bool isLinkSpam ( const Url *linker,
|
||||
int32_t ip ,
|
||||
int32_t siteNumInlinks ,
|
||||
//TitleRec *tr,
|
||||
Xml *xml,
|
||||
Links *links ,
|
||||
int32_t maxDocLen ,
|
||||
@ -631,11 +628,10 @@ bool isLinkSpam ( const Url *linker,
|
||||
int32_t h1len = linkee->getHostLen();
|
||||
const char *h2 = linker->getHost();
|
||||
int32_t h2len = linker->getHostLen();
|
||||
//if ( tr ) h2 = tr->getUrl()->getHost();
|
||||
//if ( tr ) h2len = tr->getUrl()->getHostLen();
|
||||
if ( h1len == h2len && strncmp ( h1 , h2 , h1len ) == 0 )
|
||||
return false;
|
||||
}
|
||||
|
||||
// do not allow .info or .biz to vote ever for now
|
||||
const char *tld = linker->getTLD();
|
||||
int32_t tldLen = linker->getTLDLen();
|
||||
@ -671,9 +667,6 @@ bool isLinkSpam ( const Url *linker,
|
||||
// do not allow any cgi url to vote
|
||||
if ( linker->isCgi() ) { *note = "path is cgi"; return true; }
|
||||
|
||||
// if the page has just one rel=nofollow tag then we know they
|
||||
// are not a guestbook
|
||||
//if ( links->hasRelNoFollow() ) plen = 0;
|
||||
if(isLinkfulPath(linker->getPath(),linker->getPathLen(),note))
|
||||
return true;
|
||||
|
||||
|
@ -8,15 +8,13 @@
|
||||
|
||||
class Url;
|
||||
|
||||
bool setLinkSpam ( int32_t ip ,
|
||||
const Url *linker ,
|
||||
bool setLinkSpam (const Url *linker ,
|
||||
int32_t siteNumInlinks ,
|
||||
class Xml *xml ,
|
||||
class Links *links ,
|
||||
bool isContentTruncated );
|
||||
|
||||
bool isLinkSpam ( const Url *linker ,
|
||||
int32_t ip ,
|
||||
int32_t siteNumInlinks ,
|
||||
class Xml *xml ,
|
||||
class Links *links ,
|
||||
|
37
main.cpp
37
main.cpp
@ -105,6 +105,9 @@
|
||||
#include "IpBlockList.h"
|
||||
#include "SpiderdbSqlite.h"
|
||||
#include "QueryLanguage.h"
|
||||
#include "SiteNumInlinks.h"
|
||||
#include "ContentMatchList.h"
|
||||
#include "SiteMedianPageTemperature.h"
|
||||
#include "Lemma.h"
|
||||
|
||||
|
||||
@ -438,6 +441,12 @@ int main2 ( int argc , char *argv[] ) {
|
||||
|
||||
//initialize IP address checks
|
||||
initialize_ip_address_checks();
|
||||
|
||||
// Make sure TLD table is initializing before calling any URL handling function
|
||||
if(!initializeDomains(g_hostdb.m_dir)) {
|
||||
log( LOG_ERROR, "Domains initialization failed!" );
|
||||
return 1;
|
||||
}
|
||||
|
||||
// load up hosts.conf
|
||||
// . it will determine our hostid based on the directory path of this
|
||||
@ -1239,11 +1248,6 @@ int main2 ( int argc , char *argv[] ) {
|
||||
log( LOG_ERROR, "Wiki initialization failed!" );
|
||||
return 1;
|
||||
}
|
||||
|
||||
if(!initializeDomains(g_hostdb.m_dir)) {
|
||||
log( LOG_ERROR, "Domains initialization failed!" );
|
||||
return 1;
|
||||
}
|
||||
|
||||
// shout out if we're in read only mode
|
||||
if ( g_conf.m_readOnlyMode )
|
||||
@ -1310,9 +1314,12 @@ int main2 ( int argc , char *argv[] ) {
|
||||
g_dnsBlockList.init();
|
||||
g_contentTypeBlockList.init();
|
||||
g_ipBlockList.init();
|
||||
g_contentRetryProxyList.init();
|
||||
|
||||
g_urlBlackList.init();
|
||||
g_urlWhiteList.init();
|
||||
g_urlProxyList.init();
|
||||
g_urlRetryProxyList.init();
|
||||
|
||||
g_robotsCheckList.init();
|
||||
|
||||
@ -1461,6 +1468,8 @@ int main2 ( int argc , char *argv[] ) {
|
||||
// initialize clients
|
||||
g_urlRealtimeClassification.initialize();
|
||||
g_queryLanguage.initialize();
|
||||
g_siteNumInlinks.initialize();
|
||||
g_siteMedianPageTemperature.initialize();
|
||||
|
||||
if(!WantedChecker::initialize())
|
||||
return 0;
|
||||
@ -2448,7 +2457,7 @@ void dumpTitledb (const char *coll, int32_t startFileNum, int32_t numFiles, bool
|
||||
xd->reset();
|
||||
// uncompress the title rec
|
||||
//TitleRec tr;
|
||||
if ( ! xd->set2 ( rec , recSize , coll ,NULL , 0 ) ) {
|
||||
if (!xd->set2(rec, recSize, coll, 0)) {
|
||||
//set2() may have logged something but not the docid
|
||||
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
|
||||
continue;
|
||||
@ -3373,7 +3382,7 @@ static void dumpUnwantedTitledbRecs(const char *coll, int32_t startFileNum, int3
|
||||
xd->reset();
|
||||
|
||||
// uncompress the title rec
|
||||
if ( ! xd->set2 ( rec , recSize , coll ,NULL , 0 ) ) {
|
||||
if (!xd->set2(rec, recSize, coll, 0)) {
|
||||
//set2() may have logged something but not the docid
|
||||
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
|
||||
continue;
|
||||
@ -3547,7 +3556,7 @@ static void dumpWantedTitledbRecs(const char *coll, int32_t startFileNum, int32_
|
||||
xd->reset();
|
||||
|
||||
// uncompress the title rec
|
||||
if ( ! xd->set2 ( rec , recSize , coll ,NULL , 0 ) ) {
|
||||
if (!xd->set2(rec, recSize, coll, 0)) {
|
||||
//set2() may have logged something but not the docid
|
||||
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
|
||||
continue;
|
||||
@ -3688,7 +3697,7 @@ static void dumpAdultTitledbRecs(const char *coll, int32_t startFileNum, int32_t
|
||||
xd->reset();
|
||||
|
||||
// uncompress the title rec
|
||||
if ( ! xd->set2 ( rec , recSize , coll ,NULL , 0 ) ) {
|
||||
if (!xd->set2(rec, recSize, coll, 0)) {
|
||||
//set2() may have logged something but not the docid
|
||||
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
|
||||
continue;
|
||||
@ -3868,7 +3877,7 @@ static void dumpSpamTitledbRecs(const char *coll, int32_t startFileNum, int32_t
|
||||
xd->reset();
|
||||
|
||||
// uncompress the title rec
|
||||
if ( ! xd->set2 ( rec , recSize , coll ,NULL , 0 ) ) {
|
||||
if (!xd->set2(rec, recSize, coll, 0)) {
|
||||
//set2() may have logged something but not the docid
|
||||
log(LOG_WARN, "dbdump: XmlDoc::set2() failed for docid %" PRId64, docId);
|
||||
continue;
|
||||
@ -3974,7 +3983,7 @@ static bool parseTest(const char *coll, int64_t docId, const char *query) {
|
||||
char *rec = tlist.getCurrentRec();
|
||||
int32_t listSize = tlist.getListSize ();
|
||||
XmlDoc xd;
|
||||
if ( ! xd.set2 ( rec , listSize , coll , NULL , 0 ) ) {
|
||||
if (!xd.set2(rec, listSize, coll, 0)) {
|
||||
log(LOG_WARN, "build: speedtestxml: Error setting xml doc.");
|
||||
return false;
|
||||
}
|
||||
@ -3999,7 +4008,7 @@ static bool parseTest(const char *coll, int64_t docId, const char *query) {
|
||||
// speed test
|
||||
int64_t t = gettimeofdayInMilliseconds();
|
||||
for ( int32_t k = 0 ; k < 100 ; k++ )
|
||||
xd.set2 (rec, listSize, coll , NULL , 0 );
|
||||
xd.set2(rec, listSize, coll, 0);
|
||||
int64_t e = gettimeofdayInMilliseconds();
|
||||
logf(LOG_DEBUG,"build: Took %.3f ms to set title rec.",
|
||||
(float)(e-t)/100.0);
|
||||
@ -4226,7 +4235,7 @@ static bool summaryTest1(char *rec, int32_t listSize, const char *coll, int64_t
|
||||
// loop parse
|
||||
for ( int32_t i = 0 ; i < 100 ; i++ ) {
|
||||
XmlDoc xd;
|
||||
if( !xd.set2 (rec, listSize, coll,NULL,0) ) {
|
||||
if (!xd.set2(rec, listSize, coll, 0)) {
|
||||
log(LOG_ERROR,"%s:%s: XmlDoc.set2 failed", __FILE__, __func__);
|
||||
return false;
|
||||
}
|
||||
@ -4915,7 +4924,7 @@ static void countdomains(const char* coll, int32_t numRecs, int32_t output) {
|
||||
}
|
||||
|
||||
XmlDoc xd;
|
||||
if ( ! xd.set2 (rec, recSize, coll,NULL,0) )
|
||||
if (!xd.set2(rec, recSize, coll, 0))
|
||||
continue;
|
||||
|
||||
struct ip_info *sipi ;
|
||||
|
@ -14,9 +14,14 @@ fi
|
||||
|
||||
echo "===Making signature"
|
||||
$bd/sto_convert.py signature --output_file="$2" || exit
|
||||
for input_file in $1/STO_LMF_morphology_{adj,noun,pronoun,rest,verb}*.xml; do
|
||||
echo "===Processing $input_file"
|
||||
$bd/sto_convert.py convert --input_file=$input_file --output_file=$2 || exit
|
||||
done
|
||||
echo "===Done"
|
||||
#is it the original STO files, or have they been split into lexical entries?
|
||||
if [ -d $1/noun -a -d $1/verb ]; then
|
||||
$bd/sto_convert.py convert --input_tree=$1 --output_file=$2 || exit
|
||||
else
|
||||
for input_file in $1/STO_LMF_morphology_{adj,noun,pronoun,rest,verb}*.xml; do
|
||||
echo "===Processing $input_file"
|
||||
$bd/sto_convert.py convert --input_file=$input_file --output_file=$2 || exit
|
||||
done
|
||||
echo "===Done"
|
||||
fi
|
||||
exit 0
|
||||
|
@ -1,9 +1,15 @@
|
||||
#!/usr/bin/python3
|
||||
#!/usr/bin/python2
|
||||
from __future__ import print_function
|
||||
import xml.etree.ElementTree
|
||||
import struct
|
||||
import argparse
|
||||
import sys
|
||||
import os
|
||||
|
||||
#hack to make utf-8 values work
|
||||
import sys
|
||||
reload(sys)
|
||||
sys.setdefaultencoding("utf_8")
|
||||
|
||||
part_of_speech_map={
|
||||
"adjective":1,
|
||||
@ -81,104 +87,144 @@ word_form_attribute_map={
|
||||
}
|
||||
|
||||
|
||||
def do_convert(input_file_name, output_file):
|
||||
|
||||
total_entry_count = None
|
||||
total_wordform_count = None
|
||||
|
||||
warnings = {}
|
||||
skips = {}
|
||||
|
||||
def emit_warning(id,what):
|
||||
global warnings
|
||||
warnings[id] = what
|
||||
def emit_skip(id,why):
|
||||
global skips
|
||||
skips[id] = why
|
||||
|
||||
|
||||
def process_lexcial_entry(lexicalentry,output_file):
|
||||
global total_entry_count, total_wordform_count
|
||||
|
||||
part_of_speech=None
|
||||
id=None
|
||||
morphological_unit_id=None
|
||||
for feat in lexicalentry.findall("feat"):
|
||||
att=feat.attrib["att"]
|
||||
val=feat.attrib["val"]
|
||||
#print("lexicalentry.feat: att=%s val=%s"%(att,val))
|
||||
if att=="partOfSpeech":
|
||||
if val in part_of_speech_map:
|
||||
part_of_speech = part_of_speech_map[val]
|
||||
else:
|
||||
print("Unknown part_of_speech: ",val, file=sys.stderr)
|
||||
sys.exit(2)
|
||||
elif att=="id":
|
||||
id=val
|
||||
elif att=="morphologicalUnitId":
|
||||
morphological_unit_id=val
|
||||
#todo:decomposition
|
||||
if part_of_speech==None:
|
||||
emit_skip(id,"No partOfSpeech")
|
||||
return
|
||||
if morphological_unit_id==None:
|
||||
emit_skip(id,"No morphologicalUnitId")
|
||||
return
|
||||
|
||||
raw_wordforms = b""
|
||||
wordform_count = 0
|
||||
|
||||
for wordform in lexicalentry.findall("WordForm"):
|
||||
attributes=[]
|
||||
for feat in wordform.findall("feat"):
|
||||
att=feat.attrib["att"]
|
||||
val=feat.attrib["val"]
|
||||
#print("wordform.feat: att=%s val=%s"%(att,val))
|
||||
s=att+"_"+val
|
||||
if s in word_form_attribute_map:
|
||||
attributes.append(word_form_attribute_map[s])
|
||||
else:
|
||||
print("Entry %s: Unknown wordform feat: %s"%(id,s),file=sys.stderr)
|
||||
sys.exit(2)
|
||||
if len(attributes)==0:
|
||||
emit_warning(id,"No <feat> attributes")
|
||||
#happens for a few entries such as "Chippendale". We convert it anyway because at least we know the part-of-speech
|
||||
if len(attributes)>6:
|
||||
emit_skip(id,"Too many <feat>")
|
||||
return
|
||||
while len(attributes)<6:
|
||||
attributes.append(0)
|
||||
for formrepresentation in wordform.findall("FormRepresentation"):
|
||||
writtenform=None
|
||||
for feat in formrepresentation.findall("feat"):
|
||||
att=feat.attrib["att"]
|
||||
val=feat.attrib["val"]
|
||||
if att=="writtenForm":
|
||||
writtenform=val
|
||||
|
||||
raw_writtenform = writtenform.encode()
|
||||
raw_wordform = struct.pack(">BBBBBB",attributes[0],attributes[1],attributes[2],attributes[3],attributes[4],attributes[5]) \
|
||||
+ struct.pack(">B",len(raw_writtenform)) \
|
||||
+ raw_writtenform
|
||||
wordform_count += 1
|
||||
raw_wordforms += raw_wordform
|
||||
|
||||
raw_morphological_unit_id = morphological_unit_id.encode()
|
||||
raw_entry = struct.pack(">BBBB",part_of_speech,1,len(raw_morphological_unit_id),wordform_count) + raw_morphological_unit_id + raw_wordforms
|
||||
output_file.write(raw_entry)
|
||||
|
||||
total_entry_count += 1
|
||||
total_wordform_count += wordform_count
|
||||
|
||||
|
||||
def do_convert_lexicon_file(input_file_name, output_file):
|
||||
print("Opening and parsing %s"%(input_file_name))
|
||||
tree = xml.etree.ElementTree.parse(input_file_name)
|
||||
root = tree.getroot()
|
||||
lexicon=root.find("Lexicon")
|
||||
global total_entry_count, total_wordform_count
|
||||
total_entry_count=0
|
||||
total_wordform_count=0
|
||||
for lexicalentry in lexicon.findall("LexicalEntry"):
|
||||
part_of_speech=None
|
||||
id=None
|
||||
morphological_unit_id=None
|
||||
for feat in lexicalentry.findall("feat"):
|
||||
att=feat.attrib["att"]
|
||||
val=feat.attrib["val"]
|
||||
#print("lexicalentry.feat: att=%s val=%s"%(att,val))
|
||||
if att=="partOfSpeech":
|
||||
if val in part_of_speech_map:
|
||||
part_of_speech = part_of_speech_map[val]
|
||||
else:
|
||||
print("Unknown part_of_speech: ",val, file=sys.stderr)
|
||||
sys.exit(2)
|
||||
elif att=="id":
|
||||
id=val
|
||||
elif att=="morphologicalUnitId":
|
||||
morphological_unit_id=val
|
||||
#todo:decomposition
|
||||
if part_of_speech==None:
|
||||
print("Entry %s doesn't have partOfSpeech"%id, file=sys.stderr)
|
||||
if morphological_unit_id==None:
|
||||
print("Entry %s doesn't have morphologicalUnitId"%id, file=sys.stderr)
|
||||
sys.exit(2)
|
||||
|
||||
raw_wordforms = b""
|
||||
wordform_count = 0
|
||||
|
||||
for wordform in lexicalentry.findall("WordForm"):
|
||||
attributes=[]
|
||||
for feat in wordform.findall("feat"):
|
||||
att=feat.attrib["att"]
|
||||
val=feat.attrib["val"]
|
||||
#print("wordform.feat: att=%s val=%s"%(att,val))
|
||||
s=att+"_"+val
|
||||
if s in word_form_attribute_map:
|
||||
attributes.append(word_form_attribute_map[s])
|
||||
else:
|
||||
print("Entry %s: Unknown wordform feat: %s"%(id,s),file=sys.stderr)
|
||||
sys.exit(2)
|
||||
if len(attributes)==0:
|
||||
print("Entry %s: No feat?"%(id),file=sys.stderr)
|
||||
#happens for a few entries such as "Chippendale". We convert it anyway beucase at least we know the part-of-speech
|
||||
#sys.exit(2)
|
||||
if len(attributes)>6:
|
||||
print("Entry %s: Too many feat (%d)"%(id,len(attributes)),file=sys.stderr)
|
||||
sys.exit(2)
|
||||
while len(attributes)<6:
|
||||
attributes.append(0)
|
||||
for formrepresentation in wordform.findall("FormRepresentation"):
|
||||
writtenform=None
|
||||
for feat in formrepresentation.findall("feat"):
|
||||
att=feat.attrib["att"]
|
||||
val=feat.attrib["val"]
|
||||
if att=="writtenForm":
|
||||
writtenform=val
|
||||
|
||||
raw_writtenform = writtenform.encode()
|
||||
raw_wordform = struct.pack(">BBBBBB",attributes[0],attributes[1],attributes[2],attributes[3],attributes[4],attributes[5]) \
|
||||
+ struct.pack(">B",len(raw_writtenform)) \
|
||||
+ raw_writtenform
|
||||
wordform_count += 1
|
||||
raw_wordforms += raw_wordform
|
||||
|
||||
raw_morphological_unit_id = morphological_unit_id.encode()
|
||||
raw_entry = struct.pack(">BBBB",part_of_speech,1,len(raw_morphological_unit_id),wordform_count) + raw_morphological_unit_id + raw_wordforms
|
||||
output_file.write(raw_entry)
|
||||
|
||||
total_entry_count += 1
|
||||
total_wordform_count += wordform_count
|
||||
process_lexcial_entry(lexicalentry,output_file)
|
||||
|
||||
print("Done")
|
||||
print("\tlexical entries: %d"%total_entry_count)
|
||||
print("\twordforms: %d"%total_wordform_count)
|
||||
|
||||
|
||||
|
||||
def do_convert_lexcialentry_file(input_file_name,output_file):
|
||||
print("%s:"%input_file_name);
|
||||
tree = xml.etree.ElementTree.parse(input_file_name)
|
||||
root = tree.getroot()
|
||||
process_lexcial_entry(root,output_file)
|
||||
|
||||
def do_convert_tree(input_tree_name, output_file):
|
||||
global total_entry_count, total_wordform_count
|
||||
total_entry_count=0
|
||||
total_wordform_count=0
|
||||
for (dirpath,dirnames,filenames) in os.walk(input_tree_name):
|
||||
for filename in filenames:
|
||||
if filename[-4:]==".xml":
|
||||
full_file_name = dirpath+"/"+filename
|
||||
do_convert_lexcialentry_file(full_file_name,output_file)
|
||||
print("Done")
|
||||
print("\tlexical entries: %d"%total_entry_count)
|
||||
print("\twordforms: %d"%total_wordform_count)
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser(description="STO converter")
|
||||
parser.add_argument("-i","--input_file",type=str)
|
||||
parser.add_argument("-i","--input_file",type=str,default=None)
|
||||
parser.add_argument("-I","--input_tree",type=str,default=None)
|
||||
parser.add_argument("-o","--output_file",type=str,required=True)
|
||||
parser.add_argument("command",type=str,default="convert",nargs='?',choices=["convert","signature"])
|
||||
|
||||
args=parser.parse_args()
|
||||
|
||||
if args.command=="signature" and args.input_file:
|
||||
print("input_file cannot be specified when generating signature", file=sys.stderr)
|
||||
if args.command=="signature" and (args.input_file!=None or args.input_tree!=None):
|
||||
print("input_file/input_tree cannot be specified when generating signature", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
if args.command=="convert" and (not args.input_file):
|
||||
print("input_file and output_file must be specified when generating converting", file=sys.stderr)
|
||||
if args.command=="convert" and args.input_file==None and args.input_tree==None:
|
||||
print("input_file/input_tree and output_file must be specified when generating converting", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@ -188,10 +234,23 @@ if args.command=="signature":
|
||||
version_1_signature = ("parsed-sto-v2\n"+'\0'*80)[0:80]
|
||||
output_file.write(version_1_signature.encode())
|
||||
elif args.command=="convert":
|
||||
do_convert(args.input_file,output_file)
|
||||
if args.input_file:
|
||||
do_convert_lexicon_file(args.input_file,output_file)
|
||||
else:
|
||||
do_convert_tree(args.input_tree,output_file)
|
||||
else:
|
||||
print("argh...", file=sys.stderr)
|
||||
sys.exit(99)
|
||||
|
||||
output_file.close()
|
||||
|
||||
if len(warnings)>0:
|
||||
print("===Warnings:", file=sys.stderr)
|
||||
for (k,v) in warnings.iteritems():
|
||||
print("%s: %s"%(k,v), file=sys.stderr)
|
||||
if len(skips)>0:
|
||||
print("===Skips:", file=sys.stderr)
|
||||
for (k,v) in skips.iteritems():
|
||||
print("%s: %s"%(k,v), file=sys.stderr)
|
||||
|
||||
sys.exit(0)
|
||||
|
@ -230,6 +230,7 @@ static void remove_combining_marks_norwegian(TokenizerResult *tr);
|
||||
static void remove_combining_marks_swedish(TokenizerResult *tr);
|
||||
static void remove_combining_marks_german(TokenizerResult *tr);
|
||||
static void remove_combining_marks_swiss_german(TokenizerResult *tr);
|
||||
static void remove_combining_marks_italian(TokenizerResult *tr);
|
||||
static void remove_some_combining_marks(TokenizerResult *tr, const UChar32 native_marked_letters[], size_t native_marked_letters_count);
|
||||
|
||||
|
||||
@ -250,6 +251,9 @@ static void remove_combining_marks(TokenizerResult *tr, lang_t lang, const char
|
||||
else
|
||||
remove_combining_marks_swiss_german(tr);
|
||||
return;
|
||||
case langItalian:
|
||||
remove_combining_marks_italian(tr);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@ -333,6 +337,37 @@ static void remove_combining_marks_swiss_german(TokenizerResult *tr) {
|
||||
}
|
||||
|
||||
|
||||
//Combining marks in Italian:
|
||||
// - grave àèìòù Mandatory for lowercase. Dedicated keys on keyboard
|
||||
// - acute é Mandatory for lowercase. Dedicated keys on keyboard
|
||||
// - cedilla ç Non-native. Dedicated key on keyboard - lowercase only
|
||||
//Swiss-Italian keyboard has access to umlaut.
|
||||
//Major problem is that none the the three Italian keyboard layouts have easy access to uppercase accented letters, so the accents are frequently
|
||||
//omitted or typed as apostrophe. More discussion here: https://italian.stackexchange.com/questions/3878/how-do-italians-customarily-insert-uppercase-italian-vowels-with-diacritics-with
|
||||
//So one way to deal with this is to just remove all diacritics in both diocument and query, but that would lose precision. But given that most documents has been run through word
|
||||
//processing software the documents are mostly written correctly, and that when users type queries they rarely use uppercase so the accents are probably also typed correctly there.
|
||||
//So we keep the native and easily accessible marks. Then on a later date we should detect the incorrect forms and fix them (requires a dictionary though).
|
||||
static void remove_combining_marks_italian(TokenizerResult *tr) {
|
||||
static const UChar32 native_marked_letters[] = {
|
||||
0x00C0, //À
|
||||
0x00C8, //È
|
||||
0x00CC, //Ì
|
||||
0x00D2, //Ò
|
||||
0x00D9, //Ù
|
||||
0x00E0, //à
|
||||
0x00E8, //è
|
||||
0x00EC, //ì
|
||||
0x00F2, //ò
|
||||
0x00F9, //ù
|
||||
0x00C9, //É
|
||||
0x00E9, //é
|
||||
0x00C7, //Ç
|
||||
0x00E7, //ç
|
||||
};
|
||||
remove_some_combining_marks(tr, native_marked_letters, sizeof(native_marked_letters)/sizeof(native_marked_letters[0]));
|
||||
}
|
||||
|
||||
|
||||
//Remove combining marks form the codepoints except for the native marked letters
|
||||
static void remove_some_combining_marks(TokenizerResult *tr, const UChar32 native_marked_letters[], size_t native_marked_letters_count) {
|
||||
const size_t org_token_count = tr->size();
|
||||
|
@ -609,6 +609,49 @@ int main(void) {
|
||||
assert(t.str(6)=="Noel");
|
||||
}
|
||||
|
||||
//italian diacritics
|
||||
printf("Test line %d\n",__LINE__);
|
||||
{
|
||||
T2 t("aaa bbb",langItalian);
|
||||
assert(t.token_count()==3);
|
||||
}
|
||||
|
||||
printf("Test line %d\n",__LINE__);
|
||||
{
|
||||
T2 t("Ragù",langItalian);
|
||||
assert(t.token_count()==1);
|
||||
assert(t.str(0)=="Ragù");
|
||||
}
|
||||
|
||||
printf("Test line %d\n",__LINE__);
|
||||
{
|
||||
T2 t("àèìòùéç",langItalian);
|
||||
assert(t.token_count()==1);
|
||||
assert(t.str(0)=="àèìòùéç");
|
||||
}
|
||||
|
||||
printf("Test line %d\n",__LINE__);
|
||||
{
|
||||
T2 t("ÀÈÌÒÙÉÇ",langItalian);
|
||||
assert(t.token_count()==1);
|
||||
assert(t.str(0)=="ÀÈÌÒÙÉÇ");
|
||||
}
|
||||
|
||||
printf("Test line %d\n",__LINE__);
|
||||
{
|
||||
T2 t("monaco münchen",langItalian);
|
||||
assert(t.token_count()==4);
|
||||
assert(t.str(3)=="munchen");
|
||||
}
|
||||
|
||||
printf("Test line %d\n",__LINE__);
|
||||
{
|
||||
T2 t("Eskişehir",langItalian);
|
||||
assert(t.token_count()==2);
|
||||
assert(t.str(1)=="Eskisehir");
|
||||
}
|
||||
|
||||
|
||||
//diacritics hands-off
|
||||
printf("Test line %d\n",__LINE__);
|
||||
{
|
||||
|
@ -98,6 +98,12 @@ param bad xyz
|
||||
# allows: www.example.com/en/wp-admin
|
||||
path /wp-admin
|
||||
|
||||
# block partial path
|
||||
# blocks: www.example.com/badpath
|
||||
# blocks: www.example.com/en/badpath
|
||||
# blocks: www.example.com/badpath/subpath
|
||||
pathpartial /badpath
|
||||
|
||||
# regex example
|
||||
# =============
|
||||
# blocks url by regex
|
@ -16,6 +16,7 @@ struct WordVariationWeights {
|
||||
float verb_spelling_variants;
|
||||
float verb_past_past_variants;
|
||||
float simple_spelling_variants; //simple variants, eg "cyklen" vs. "cykelen"
|
||||
float adjective_grammatical_gender_simplification;
|
||||
//todo: more configurable weights in WordVariationWeights
|
||||
WordVariationWeights()
|
||||
: noun_indefinite_definite(1.0),
|
||||
@ -25,7 +26,8 @@ struct WordVariationWeights {
|
||||
proper_noun_spelling_variants(1.0),
|
||||
verb_spelling_variants(1.0),
|
||||
verb_past_past_variants(1.0),
|
||||
simple_spelling_variants(1.0)
|
||||
simple_spelling_variants(1.0),
|
||||
adjective_grammatical_gender_simplification(1.0)
|
||||
{}
|
||||
};
|
||||
|
||||
|
@ -6,6 +6,9 @@
|
||||
|
||||
namespace {
|
||||
|
||||
enum noun_or_verb_t { noun, verb, whatever };
|
||||
|
||||
|
||||
class WordVariationGenerator_danish : public STOWordVariationGenerator {
|
||||
public:
|
||||
WordVariationGenerator_danish()
|
||||
@ -41,10 +44,40 @@ public:
|
||||
const std::vector<std::string> &source_words,
|
||||
const std::vector<std::string> &lower_source_words,
|
||||
float weight);
|
||||
void handle_adjective_grammatical_gender_simplification(std::vector<WordVariationGenerator::Variation> &variations,
|
||||
const std::vector<std::string> &source_words,
|
||||
const std::vector<std::string> &lower_source_words,
|
||||
float weight);
|
||||
};
|
||||
|
||||
static WordVariationGenerator_danish s_WordVariationGenerator_danish;
|
||||
|
||||
|
||||
//class for handling unknown compound words
|
||||
class LogicalMatches {
|
||||
std::vector<const sto::LexicalEntry *> actual_matches;
|
||||
std::string prefix; //prefix if a compound word
|
||||
std::string suffix; //suffix of compound word, or whole word if non-compound
|
||||
|
||||
public:
|
||||
LogicalMatches(const sto::Lexicon &lexicon, const std::string &source_word, noun_or_verb_t noun_or_verb);
|
||||
|
||||
bool empty() const { return actual_matches.empty(); }
|
||||
size_t size() const { return actual_matches.size(); }
|
||||
const sto::LexicalEntry * operator[](size_t i) const { return actual_matches[i]; }
|
||||
std::vector<const sto::LexicalEntry *>::const_iterator begin() const { return actual_matches.begin(); }
|
||||
std::vector<const sto::LexicalEntry *>::const_iterator end() const { return actual_matches.end(); }
|
||||
|
||||
const std::string query_matched_word() const { return suffix; }
|
||||
std::string query_logical_written_form(const sto::WordForm *wf) const {
|
||||
return prefix + std::string(wf->written_form,wf->written_form_length);
|
||||
}
|
||||
|
||||
private:
|
||||
std::string find_compound_word_longest_known_suffix(const sto::Lexicon &lexicon, const std::string &source_word, noun_or_verb_t noun_or_verb);
|
||||
};
|
||||
|
||||
|
||||
} //anonymous namespace
|
||||
|
||||
|
||||
@ -55,6 +88,64 @@ bool initializeWordVariationGenerator_Danish() {
|
||||
|
||||
|
||||
|
||||
LogicalMatches::LogicalMatches(const sto::Lexicon &lexicon, const std::string &source_word, noun_or_verb_t noun_or_verb)
|
||||
: actual_matches(lexicon.query_matches(source_word)),
|
||||
prefix(""),
|
||||
suffix(source_word)
|
||||
{
|
||||
if(actual_matches.empty()) {
|
||||
//unknown word
|
||||
std::string tmp = find_compound_word_longest_known_suffix(lexicon,source_word,noun_or_verb);
|
||||
if(tmp.length()>0) {
|
||||
//found a suffix match
|
||||
prefix = source_word.substr(0,source_word.length()-tmp.length());
|
||||
suffix = tmp;
|
||||
actual_matches = lexicon.query_matches(suffix);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//Find the longest suffix that has a match in STO.
|
||||
//This is useful for identifying the (linguistic-)head in unknown compound words which in Danish usually is the last stem in the word
|
||||
//Eg sneglemassakre, hvidvaske, købepizza
|
||||
std::string LogicalMatches::find_compound_word_longest_known_suffix(const sto::Lexicon &lexicon, const std::string &source_word, noun_or_verb_t noun_or_verb) {
|
||||
//we insist on at least 4 letters, although that would make us not find "udu"
|
||||
if(source_word.length()<4)
|
||||
return "";
|
||||
//we only do it on on normal words without punctuation, special characters etc.
|
||||
//properly checking codepoint-is-alphabetic would require linking in libunicode etc. adding to the linking complexicity so just hack it here.
|
||||
for(size_t i=0; i<source_word.length(); i++) {
|
||||
char c = source_word[i];
|
||||
if(c<(char)128) {
|
||||
if((c>='A' && c<='Z') || (c>='a' && c<='z'))
|
||||
;
|
||||
else
|
||||
return "";
|
||||
}
|
||||
}
|
||||
//(todo) extend minimum suffix length if the word ends with a common suffix that isn't an indepedent word, eg -skab, -inde, -isme. Complication is that the suffixes are also inflicted/declined, eg. -skabernes
|
||||
size_t source_length = source_word.length();
|
||||
for(size_t suffix_length = source_length-1; suffix_length>=2; suffix_length--) {
|
||||
std::string candidate_suffix(source_word, source_length-suffix_length);
|
||||
auto matches(lexicon.query_matches(candidate_suffix));
|
||||
if(!matches.empty()) {
|
||||
for(auto match : matches) {
|
||||
if(noun_or_verb==whatever)
|
||||
return candidate_suffix;
|
||||
if(noun_or_verb==noun && match->part_of_speech==sto::part_of_speech_t::commonNoun)
|
||||
return candidate_suffix;
|
||||
if(noun_or_verb==verb && (match->part_of_speech==sto::part_of_speech_t::deponentVerb || match->part_of_speech==sto::part_of_speech_t::mainVerb))
|
||||
return candidate_suffix;
|
||||
}
|
||||
return "";
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
|
||||
|
||||
std::vector<WordVariationGenerator::Variation> WordVariationGenerator_danish::query_variations(const std::vector<std::string> &source_words, const WordVariationWeights& weights, float threshold) {
|
||||
std::vector<std::string> lower_source_words(lower_words(source_words));
|
||||
std::vector<WordVariationGenerator::Variation> variations;
|
||||
@ -91,6 +182,10 @@ std::vector<WordVariationGenerator::Variation> WordVariationGenerator_danish::qu
|
||||
find_simple_attribute_match_wordforms(variations,lower_source_words,weights.simple_spelling_variants);
|
||||
}
|
||||
|
||||
if(weights.adjective_grammatical_gender_simplification >= threshold) {
|
||||
handle_adjective_grammatical_gender_simplification(variations,source_words,lower_source_words, weights.simple_spelling_variants);
|
||||
}
|
||||
|
||||
//currently inactive because Query.cpp/PosdbTable.cpp cannot handle wordvariations spanning more than one word
|
||||
//make_proper_noun_part_genetive(variations,source_words,lower_source_words,1.2);
|
||||
|
||||
@ -121,7 +216,7 @@ static uint64_t wordformattrs2bitmask(const sto::WordForm &wf) {
|
||||
|
||||
static bool same_wordform_as_source(const sto::WordForm &wf, const std::string source_word) {
|
||||
return wf.written_form_length==source_word.length() &&
|
||||
memcmp(wf.written_form,source_word.data(),source_word.length())==0;
|
||||
memcmp(wf.written_form,source_word.data(),source_word.length())==0;
|
||||
}
|
||||
|
||||
|
||||
@ -132,11 +227,11 @@ void WordVariationGenerator_danish::find_simple_attribute_difference_wordforms(s
|
||||
{
|
||||
for(unsigned i=0; i<source_words.size(); i++) {
|
||||
auto source_word(source_words[i]);
|
||||
auto matches(lexicon.query_matches(source_word));
|
||||
LogicalMatches matches(lexicon,source_word,noun);
|
||||
for(auto match : matches) {
|
||||
auto wordforms(match->query_all_explicit_word_forms());
|
||||
for(auto wordform : wordforms) {
|
||||
if(same_wordform_as_source(*wordform,source_word) &&
|
||||
if(same_wordform_as_source(*wordform,matches.query_matched_word()) &&
|
||||
wordform->has_attribute(from_attr))
|
||||
{
|
||||
uint64_t source_word_bitmask = wordformattrs2bitmask(*wordform);
|
||||
@ -148,7 +243,7 @@ void WordVariationGenerator_danish::find_simple_attribute_difference_wordforms(s
|
||||
//found the other form of the word.
|
||||
//this may match multiple alternative spellings of the wordform, but the STO database cannot distinguish
|
||||
Variation v;
|
||||
v.word.assign(definite_wordform->written_form,definite_wordform->written_form_length);
|
||||
v.word = matches.query_logical_written_form(definite_wordform);
|
||||
v.weight = weight;
|
||||
v.source_word_start = i;
|
||||
v.source_word_end = i+1;
|
||||
@ -174,11 +269,11 @@ void WordVariationGenerator_danish::find_simple_attribute_match_wordforms(std::v
|
||||
{
|
||||
for(unsigned i=0; i<source_words.size(); i++) {
|
||||
auto source_word(source_words[i]);
|
||||
auto matches(lexicon.query_matches(source_word));
|
||||
LogicalMatches matches(lexicon,source_word,whatever);
|
||||
for(auto match : matches) {
|
||||
auto wordforms(match->query_all_explicit_word_forms());
|
||||
for(auto wordform : wordforms) {
|
||||
if(same_wordform_as_source(*wordform,source_word)) {
|
||||
if(same_wordform_as_source(*wordform,matches.query_matched_word())) {
|
||||
//found the word form match. Now look for other wordforms with exactly the same attributes. Those are alternate spellings.
|
||||
//so first find all lexical entries with the same morphological unit id, and check all wordforms of those, looking for an attribute match
|
||||
auto same_morph_entries = lexicon.query_lexical_entries_with_same_morphological_unit_id(match);
|
||||
@ -188,7 +283,7 @@ void WordVariationGenerator_danish::find_simple_attribute_match_wordforms(std::v
|
||||
if(wordform2!=wordform && has_same_attributes(wordform,wordform2)) {
|
||||
//found an alternative spelling of the word
|
||||
Variation v;
|
||||
v.word.assign(wordform2->written_form,wordform2->written_form_length);
|
||||
v.word = matches.query_logical_written_form(wordform2);
|
||||
v.weight = weight;
|
||||
v.source_word_start = i;
|
||||
v.source_word_end = i+1;
|
||||
@ -332,11 +427,11 @@ void WordVariationGenerator_danish::transliterate_verb_acute_accent(std::vector<
|
||||
if(source_word.length()>4 && source_word.substr(source_word.length()-2)=="er") {
|
||||
//possibly a verb in imperative
|
||||
bool is_imperative = false;
|
||||
auto matches(lexicon.query_matches(source_word));
|
||||
LogicalMatches matches(lexicon,source_word,verb);
|
||||
for(auto match : matches) {
|
||||
auto wordforms(match->query_all_explicit_word_forms());
|
||||
for(auto wordform : wordforms) {
|
||||
if(same_wordform_as_source(*wordform,source_word) &&
|
||||
if(same_wordform_as_source(*wordform,matches.query_matched_word()) &&
|
||||
wordform->has_attribute(sto::word_form_attribute_t::verbFormMood_imperative))
|
||||
{
|
||||
is_imperative = true;
|
||||
@ -379,7 +474,7 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
|
||||
auto source_word(lower_source_words[i]);
|
||||
if(source_word==" ")
|
||||
continue;
|
||||
auto matches(lexicon.query_matches(source_word));
|
||||
LogicalMatches matches(lexicon,source_word,verb);
|
||||
if(prev_was_er || prev_was_var || prev_was_har || prev_was_havde) {
|
||||
//check if this word is the past participle
|
||||
const sto::WordForm *wordform_past_participle = NULL;
|
||||
@ -387,7 +482,7 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
|
||||
for(auto match : matches) {
|
||||
auto wordforms(match->query_all_explicit_word_forms());
|
||||
for(auto wordform : wordforms) {
|
||||
if(same_wordform_as_source(*wordform,source_word) &&
|
||||
if(same_wordform_as_source(*wordform,matches.query_matched_word()) &&
|
||||
wordform->has_attribute(sto::word_form_attribute_t::tense_past) &&
|
||||
wordform->has_attribute(sto::word_form_attribute_t::verbFormMood_participle))
|
||||
{
|
||||
@ -405,7 +500,7 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
|
||||
//generate preterite
|
||||
if(wordform_preterite) {
|
||||
WordVariationGenerator::Variation v0;
|
||||
v0.word.assign(wordform_preterite->written_form,wordform_preterite->written_form_length);
|
||||
v0.word = matches.query_logical_written_form(wordform_preterite);
|
||||
v0.weight = weight;
|
||||
v0.source_word_start = prev_word_idx;
|
||||
v0.source_word_end = i+1;
|
||||
@ -423,7 +518,7 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
|
||||
//generate preterite
|
||||
if(wordform_preterite) {
|
||||
WordVariationGenerator::Variation v0;
|
||||
v0.word.assign(wordform_preterite->written_form,wordform_preterite->written_form_length);
|
||||
v0.word = matches.query_logical_written_form(wordform_preterite);
|
||||
v0.weight = weight;
|
||||
v0.source_word_start = prev_word_idx;
|
||||
v0.source_word_end = i+1;
|
||||
@ -441,7 +536,7 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
|
||||
//generate preterite
|
||||
if(wordform_preterite) {
|
||||
WordVariationGenerator::Variation v0;
|
||||
v0.word.assign(wordform_preterite->written_form,wordform_preterite->written_form_length);
|
||||
v0.word = matches.query_logical_written_form(wordform_preterite);
|
||||
v0.weight = weight;
|
||||
v0.source_word_start = prev_word_idx;
|
||||
v0.source_word_end = i+1;
|
||||
@ -459,7 +554,7 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
|
||||
//generate preterite
|
||||
if(wordform_preterite) {
|
||||
WordVariationGenerator::Variation v0;
|
||||
v0.word.assign(wordform_preterite->written_form,wordform_preterite->written_form_length);
|
||||
v0.word = matches.query_logical_written_form(wordform_preterite);
|
||||
v0.weight = weight;
|
||||
v0.source_word_start = prev_word_idx;
|
||||
v0.source_word_end = i+1;
|
||||
@ -481,7 +576,7 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
|
||||
for(auto match : matches) {
|
||||
auto wordforms(match->query_all_explicit_word_forms());
|
||||
for(auto wordform : wordforms) {
|
||||
if(same_wordform_as_source(*wordform,source_word) &&
|
||||
if(same_wordform_as_source(*wordform,matches.query_matched_word()) &&
|
||||
wordform->has_attribute(sto::word_form_attribute_t::tense_past) &&
|
||||
wordform->has_attribute(sto::word_form_attribute_t::verbFormMood_indicative) &&
|
||||
wordform->has_attribute(sto::word_form_attribute_t::voice_activeVoice)) //we'll ignore this complication for now
|
||||
@ -504,26 +599,26 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
|
||||
//generate perfect
|
||||
if(source_word!="var") {
|
||||
WordVariationGenerator::Variation v0_0;
|
||||
v0_0.word = "har "+std::string(wordform_past_participle->written_form,wordform_past_participle->written_form_length);
|
||||
v0_0.word = "har "+matches.query_logical_written_form(wordform_past_participle);
|
||||
v0_0.weight = weight;
|
||||
v0_0.source_word_start = i;
|
||||
v0_0.source_word_end = i+1;
|
||||
variations.push_back(v0_0);
|
||||
WordVariationGenerator::Variation v0_1;
|
||||
v0_1.word = "er "+std::string(wordform_past_participle->written_form,wordform_past_participle->written_form_length);
|
||||
v0_1.word = "er "+matches.query_logical_written_form(wordform_past_participle);
|
||||
v0_1.weight = weight;
|
||||
v0_1.source_word_start = i;
|
||||
v0_1.source_word_end = i+1;
|
||||
variations.push_back(v0_1);
|
||||
//generate pluperfect
|
||||
WordVariationGenerator::Variation v1_0;
|
||||
v1_0.word = "havde "+std::string(wordform_past_participle->written_form,wordform_past_participle->written_form_length);
|
||||
v1_0.word = "havde "+matches.query_logical_written_form(wordform_past_participle);
|
||||
v1_0.weight = weight;
|
||||
v1_0.source_word_start = i;
|
||||
v1_0.source_word_end = i+1;
|
||||
variations.push_back(v1_0);
|
||||
WordVariationGenerator::Variation v1_1;
|
||||
v1_1.word = "var "+std::string(wordform_past_participle->written_form,wordform_past_participle->written_form_length);
|
||||
v1_1.word = "var "+matches.query_logical_written_form(wordform_past_participle);
|
||||
v1_1.weight = weight;
|
||||
v1_1.source_word_start = i;
|
||||
v1_1.source_word_end = i+1;
|
||||
@ -531,13 +626,13 @@ void WordVariationGenerator_danish::make_verb_past_past_variants(std::vector<Wor
|
||||
} else {
|
||||
//"at være" takes the auxilliary verb "have"
|
||||
WordVariationGenerator::Variation v0_0;
|
||||
v0_0.word = "har "+std::string(wordform_past_participle->written_form,wordform_past_participle->written_form_length);
|
||||
v0_0.word = "har "+matches.query_logical_written_form(wordform_past_participle);
|
||||
v0_0.weight = weight;
|
||||
v0_0.source_word_start = i;
|
||||
v0_0.source_word_end = i+1;
|
||||
variations.push_back(v0_0);
|
||||
WordVariationGenerator::Variation v1_0;
|
||||
v1_0.word = "havde "+std::string(wordform_past_participle->written_form,wordform_past_participle->written_form_length);
|
||||
v1_0.word = "havde "+matches.query_logical_written_form(wordform_past_participle);
|
||||
v1_0.weight = weight;
|
||||
v1_0.source_word_start = i;
|
||||
v1_0.source_word_end = i+1;
|
||||
@ -577,13 +672,13 @@ void WordVariationGenerator_danish::make_proper_noun_part_genetive(std::vector<W
|
||||
continue;
|
||||
|
||||
//find noun
|
||||
auto matches(lexicon.query_matches(source_word0));
|
||||
LogicalMatches matches(lexicon,source_word0,noun);
|
||||
const sto::WordForm *wordform_noun = NULL;
|
||||
for(auto match : matches) {
|
||||
if(match->part_of_speech==sto::part_of_speech_t::commonNoun) {
|
||||
auto wordforms(match->query_all_explicit_word_forms());
|
||||
for(auto wordform : wordforms) {
|
||||
if(same_wordform_as_source(*wordform,source_word0) &&
|
||||
if(same_wordform_as_source(*wordform,matches.query_matched_word()) &&
|
||||
wordform->has_attribute(sto::word_form_attribute_t::case_unspecified))
|
||||
{
|
||||
wordform_noun = wordform;
|
||||
@ -614,10 +709,10 @@ void WordVariationGenerator_danish::make_proper_noun_part_genetive(std::vector<W
|
||||
auto source_word4_capitalized(capitalize_word(source_word4));
|
||||
|
||||
//find proper-noun
|
||||
matches = lexicon.query_matches(source_word4_capitalized);
|
||||
auto matches2 = lexicon.query_matches(source_word4_capitalized);
|
||||
const sto::WordForm *wordform_proper_noun = NULL;
|
||||
const sto::WordForm *wordform_proper_noun_genitive = NULL;
|
||||
for(auto match : matches) {
|
||||
for(auto match : matches2) {
|
||||
if(match->part_of_speech==sto::part_of_speech_t::properNoun) {
|
||||
auto wordforms(match->query_all_explicit_word_forms());
|
||||
for(auto wordform : wordforms) {
|
||||
@ -640,10 +735,63 @@ void WordVariationGenerator_danish::make_proper_noun_part_genetive(std::vector<W
|
||||
//transform that into propernoun-genetive noun
|
||||
|
||||
WordVariationGenerator::Variation v0_0;
|
||||
v0_0.word = std::string(wordform_proper_noun_genitive->written_form,wordform_proper_noun_genitive->written_form_length) + " " + std::string(wordform_noun->written_form,wordform_noun->written_form_length);
|
||||
v0_0.word = std::string(wordform_proper_noun_genitive->written_form,wordform_proper_noun_genitive->written_form_length) + " " + matches.query_logical_written_form(wordform_noun);
|
||||
v0_0.weight = weight;
|
||||
v0_0.source_word_start = i;
|
||||
v0_0.source_word_end = i+5;
|
||||
variations.push_back(v0_0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void WordVariationGenerator_danish::handle_adjective_grammatical_gender_simplification(std::vector<WordVariationGenerator::Variation> &variations,
|
||||
const std::vector<std::string> &source_words,
|
||||
const std::vector<std::string> &lower_source_words,
|
||||
float weight)
|
||||
{
|
||||
//In Danish there are officially two grammatical genders: common and neuter. Adjectives have to agree when in singular indefinite.
|
||||
//However, Western Jutland generally doesn't distinguish. And for objects of abstract nature or non-obvious grammatical gender people don't always follow the rule.
|
||||
//So a document may have "Et internationalt marked" but the user searches for "international marked".
|
||||
//The opposite can also happen but it is less common.
|
||||
|
||||
//So locate adjectives with gender=common number=singular definitenes=indefinite, find the corresponding wordform for gender=neuter and generate that
|
||||
for(unsigned i=0; i<lower_source_words.size(); i++) {
|
||||
auto source_word0(lower_source_words[i]);
|
||||
if(source_word0==" ")
|
||||
continue;
|
||||
|
||||
//find adjective
|
||||
bool is_common_singular_indefinite = false;
|
||||
const sto::WordForm *wordform_neuter_singular_indefinite = NULL;
|
||||
LogicalMatches matches(lexicon,source_word0,whatever);
|
||||
for(auto match : matches) {
|
||||
if(match->part_of_speech==sto::part_of_speech_t::adjective) {
|
||||
auto wordforms(match->query_all_explicit_word_forms());
|
||||
for(auto wordform : wordforms) {
|
||||
if(wordform->has_attribute(sto::word_form_attribute_t::grammaticalGender_neuter) &&
|
||||
wordform->has_attribute(sto::word_form_attribute_t::grammaticalNumber_singular) &&
|
||||
wordform->has_attribute(sto::word_form_attribute_t::definiteness_indefinite))
|
||||
{
|
||||
wordform_neuter_singular_indefinite = wordform;
|
||||
}
|
||||
if(same_wordform_as_source(*wordform,matches.query_matched_word()) &&
|
||||
wordform->has_attribute(sto::word_form_attribute_t::grammaticalGender_commonGender) &&
|
||||
wordform->has_attribute(sto::word_form_attribute_t::grammaticalNumber_singular) &&
|
||||
wordform->has_attribute(sto::word_form_attribute_t::definiteness_indefinite))
|
||||
{
|
||||
is_common_singular_indefinite = wordform;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if(!is_common_singular_indefinite || !wordform_neuter_singular_indefinite)
|
||||
continue;
|
||||
|
||||
WordVariationGenerator::Variation v0_0;
|
||||
v0_0.word = matches.query_logical_written_form(wordform_neuter_singular_indefinite);
|
||||
v0_0.weight = weight;
|
||||
v0_0.source_word_start = i;
|
||||
v0_0.source_word_end = i+1;
|
||||
variations.push_back(v0_0);
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user