Merge branch 'master' into sto

This commit is contained in:
Ivan Skytte Jørgensen
2017-12-07 13:23:46 +01:00
30 changed files with 866 additions and 342 deletions

@ -172,6 +172,7 @@ Conf::Conf ( ) {
m_useShotgun = false;
m_testMem = false;
m_doConsistencyTesting = false;
m_titleRecVersion = TITLEREC_CURRENT_VERSION;
memset(m_spiderUserAgent, 0, sizeof(m_spiderUserAgent));
memset(m_spiderBotName, 0, sizeof(m_spiderBotName));
m_autoSaveFrequency = 0;

2
Conf.h

@ -299,6 +299,8 @@ class Conf {
bool m_testMem;
bool m_doConsistencyTesting;
int32_t m_titleRecVersion;
// defaults to "Gigabot/1.0"
char m_spiderUserAgent[USERAGENTMAXSIZE];

@ -205,7 +205,7 @@ void DocProcess::removePendingDoc(DocProcessDocItem *docItem) {
gbshutdownLogicError();
}
if (it == m_pendingDocItems.begin()) {
if (docItem->m_lastPos >= 0 && it == m_pendingDocItems.begin()) {
std::ofstream lastPosFile(docItem->m_docProcess->m_lastPosFilename, std::ofstream::out|std::ofstream::trunc);
lastPosFile << docItem->m_lastPos << "|" << docItem->m_key << std::endl;
}
@ -214,6 +214,38 @@ void DocProcess::removePendingDoc(DocProcessDocItem *docItem) {
pthread_cond_signal(&m_pendingDocItemsCond);
}
bool DocProcess::addKey(const std::string &key, int64_t currentFilePos) {
logTrace(g_conf.m_logTraceDocProcess, "Processing key='%s'", key.c_str());
DocProcessDocItem *docItem = createDocItem(this, key, currentFilePos);
if (m_isUrl) {
SpiderRequest sreq;
sreq.setFromAddUrl(key.c_str());
sreq.m_isAddUrl = 0;
logTrace(g_conf.m_logTraceDocProcess, "Adding url=%s", key.c_str());
docItem->m_xmlDoc->set4(&sreq, nullptr, "main", nullptr, 0);
} else {
int64_t docId = strtoll(key.c_str(), nullptr, 10);
if (docId == 0) {
// ignore invalid docId
return false;
}
logTrace(g_conf.m_logTraceDocProcess, "Adding docid=%" PRId64, docId);
docItem->m_xmlDoc->set3(docId, "main", 0);
}
updateXmldoc(docItem->m_xmlDoc);
docItem->m_xmlDoc->setCallback(docItem, processedDoc);
addPendingDoc(docItem);
s_docProcessDocThreadQueue.addItem(docItem);
return true;
}
void DocProcess::processFile(void *item) {
DocProcessFileItem *fileItem = static_cast<DocProcessFileItem*>(item);
@ -253,35 +285,9 @@ void DocProcess::processFile(void *item) {
std::string key = fileItem->m_docProcess->m_isUrl ? line : line.substr(0, line.find('|'));
if (foundLastPos) {
logTrace(g_conf.m_logTraceDocProcess, "Processing key='%s'", key.c_str());
DocProcessDocItem *docItem = fileItem->m_docProcess->createDocItem(fileItem->m_docProcess, key, currentFilePos);
if (fileItem->m_docProcess->m_isUrl) {
SpiderRequest sreq;
sreq.setFromAddUrl(key.c_str());
sreq.m_isAddUrl = 0;
logTrace(g_conf.m_logTraceDocProcess, "Adding url=%s", key.c_str());
docItem->m_xmlDoc->set4(&sreq, nullptr, "main", nullptr, 0);
} else {
int64_t docId = strtoll(line.c_str(), nullptr, 10);
if (docId == 0) {
// ignore invalid docId
continue;
}
logTrace(g_conf.m_logTraceDocProcess, "Adding docid=%" PRId64, docId);
docItem->m_xmlDoc->set3(docId, "main", 0);
if (fileItem->m_docProcess->addKey(key, currentFilePos)) {
fileItem->m_docProcess->waitPendingDocCount(10);
}
docItem->m_docProcess->updateXmldoc(docItem->m_xmlDoc);
docItem->m_xmlDoc->setCallback(docItem, processedDoc);
fileItem->m_docProcess->addPendingDoc(docItem);
s_docProcessDocThreadQueue.addItem(docItem);
fileItem->m_docProcess->waitPendingDocCount(10);
} else if (lastPosKey.compare(key) == 0) {
foundLastPos = true;
}

@ -49,18 +49,22 @@ public:
virtual void updateXmldoc(XmlDoc *xmlDoc) = 0;
virtual void processDocItem(DocProcessDocItem *docItem) = 0;
bool addKey(const std::string &key, int64_t currentFilePos = -1);
static void reload(int /*fd*/, void */*state*/);
static void processFile(void *item);
static void processDoc(void *item);
static void processedDoc(void *state);
void waitPendingDocCount(unsigned maxCount);
protected:
void removePendingDoc(DocProcessDocItem *docItem);
bool m_isUrl;
private:
void waitPendingDocCount(unsigned maxCount);
void addPendingDoc(DocProcessDocItem *docItem);
const char *m_filename;

@ -20,6 +20,7 @@
#include "XmlDoc.h"
#include "Msg0.h"
#include "RdbList.h"
#include "Conf.h"
DocRebuild g_docRebuild("docrebuild.txt", false);
DocRebuild g_docRebuildUrl("docrebuildurl.txt", true);
@ -49,15 +50,22 @@ DocProcessDocItem* DocRebuild::createDocItem(DocProcess *docProcess, const std::
// Flag the XmlDoc as a rebuild: it is marked as a doc-rebuild document and
// told to recycle its content.
void DocRebuild::updateXmldoc(XmlDoc *xmlDoc) {
	XmlDoc &doc = *xmlDoc;

	doc.m_recycleContent = true;
	doc.m_docRebuild     = true;
}
void DocRebuild::processDocItem(DocProcessDocItem *docItem) {
DocRebuildDocItem *rebuildDocItem = dynamic_cast<DocRebuildDocItem*>(docItem);
if (rebuildDocItem == nullptr) {
gbshutdownLogicError();
}
XmlDoc *xmlDoc = rebuildDocItem->m_xmlDoc;
// set callback
xmlDoc->m_masterLoop = processedDoc;
xmlDoc->m_masterState = rebuildDocItem;
if (xmlDoc->m_masterLoop == nullptr) {
xmlDoc->m_masterLoop = processedDoc;
xmlDoc->m_masterState = rebuildDocItem;
}
// prepare
char **oldTitleRec = xmlDoc->getOldTitleRec();
@ -80,11 +88,12 @@ void DocRebuild::processDocItem(DocProcessDocItem *docItem) {
return;
}
// reset callback
xmlDoc->m_masterLoop = nullptr;
xmlDoc->m_masterState = nullptr;
XmlDoc **oldXmlDoc = xmlDoc->getOldXmlDoc();
if (!oldXmlDoc || oldXmlDoc == (XmlDoc**)-1) {
return;
}
if (!xmlDoc->set2(*oldTitleRec, -1, "main", nullptr, MAX_NICENESS)) {
if (!xmlDoc->m_contentValid && !xmlDoc->set2(*oldTitleRec, -1, "main", nullptr, MAX_NICENESS)) {
xmlDoc->m_indexCode = ECORRUPTDATA;
xmlDoc->m_indexCodeValid = true;
@ -100,8 +109,8 @@ void DocRebuild::processDocItem(DocProcessDocItem *docItem) {
int32_t *firstIp = xmlDoc->getFirstIp();
if (!firstIp || firstIp == (int32_t*)-1) {
// we must not be blocked/invalid at this point
gbshutdownLogicError();
// blocked
return;
}
int32_t *siteNumInLinks = xmlDoc->getSiteNumInlinks();
@ -114,6 +123,47 @@ void DocRebuild::processDocItem(DocProcessDocItem *docItem) {
if (xmlDoc->m_masterLoop == processedDoc) {
xmlDoc->m_masterLoop = nullptr;
xmlDoc->m_masterState = nullptr;
// logic copied from Repair.cpp
// rebuild the title rec! otherwise we re-add the old one
xmlDoc->m_titleRecBufValid = false;
xmlDoc->m_titleRecBuf.purge();
// recompute site, no more domain sites allowed
xmlDoc->m_siteValid = false;
xmlDoc->ptr_site = nullptr;
xmlDoc->size_site = 0;
// recalculate the sitenuminlinks
xmlDoc->m_siteNumInlinksValid = false;
// recalculate the langid
xmlDoc->m_langIdValid = false;
// recalculate and store the link info
xmlDoc->m_linkInfo1Valid = false;
xmlDoc->ptr_linkInfo1 = nullptr;
xmlDoc->size_linkInfo1 = 0;
// re-get the tag rec from tagdb
xmlDoc->m_tagRecValid = false;
xmlDoc->m_tagRecDataValid = false;
xmlDoc->m_priority = -1;
xmlDoc->m_priorityValid = true;
xmlDoc->m_contentValid = true;
xmlDoc->m_content = xmlDoc->ptr_utf8Content;
xmlDoc->m_contentLen = xmlDoc->size_utf8Content - 1;
// update to latest version
#ifndef PRIVACORE_SAFE_VERSION
xmlDoc->m_version = g_conf.m_titleRecVersion;
#else
xmlDoc->m_version = TITLEREC_CURRENT_VERSION;
#endif
xmlDoc->m_versionValid = true;
}
// set spider request

@ -20,6 +20,8 @@
#include "XmlDoc.h"
#include "Msg0.h"
#include "RdbList.h"
#include "Conf.h"
#include "TitleRecVersion.h"
DocReindex g_docReindex("docreindex.txt", false);
DocReindex g_docReindexUrl("docreindexurl.txt", true);
@ -49,10 +51,22 @@ DocProcessDocItem* DocReindex::createDocItem(DocProcess *docProcess, const std::
// Prepare an XmlDoc for reindexing: invalidate its index code and stamp it
// with the title rec version to use.
void DocReindex::updateXmldoc(XmlDoc *xmlDoc) {
	xmlDoc->m_indexCodeValid = false;

	// safe builds always use the compiled-in version; other builds take the
	// version from g_conf.m_titleRecVersion
#ifdef PRIVACORE_SAFE_VERSION
	xmlDoc->m_version = TITLEREC_CURRENT_VERSION;
#else
	xmlDoc->m_version = g_conf.m_titleRecVersion;
#endif

	xmlDoc->m_versionValid = true;
}
void DocReindex::processDocItem(DocProcessDocItem *docItem) {
DocReindexDocItem *reindexDocItem = dynamic_cast<DocReindexDocItem*>(docItem);
if (reindexDocItem == nullptr) {
gbshutdownLogicError();
}
XmlDoc *xmlDoc = reindexDocItem->m_xmlDoc;
// set callback

@ -426,6 +426,7 @@ bool JobScheduler_impl::submit(thread_type_t thread_type, JobEntry &e)
case thread_type_unspecified_io: job_queue = &cpu_job_queue; break;
case thread_type_generate_thumbnail: job_queue = &external_job_queue; break;
case thread_type_config_load: job_queue = &cpu_job_queue; break;
case thread_type_page_process: job_queue = &cpu_job_queue; break;
default:
assert(false);

@ -46,6 +46,7 @@ enum thread_type_t {
thread_type_unspecified_io, //until we can be more specific
thread_type_generate_thumbnail,
thread_type_config_load,
thread_type_page_process,
};

@ -25,7 +25,7 @@ OBJS_O0 = \
Lang.o Log.o \
Mem.o Msg0.o Msg4In.o Msg4Out.o MsgC.o Msg13.o Msg20.o Msg22.o Msg39.o Msg3a.o Msg51.o Msge0.o Msge1.o Multicast.o \
Parms.o Pages.o PageAddColl.o PageAddUrl.o PageBasic.o PageCrawlBot.o PageGet.o PageHealthCheck.o PageHosts.o PageInject.o \
PageParser.o PagePerf.o PageReindex.o PageResults.o PageRoot.o PageSockets.o PageStats.o PageThreads.o PageTitledb.o PageSpiderdbLookup.o PageSpider.o PageDoledbIPTable.o \
PageParser.o PagePerf.o PageReindex.o PageResults.o PageRoot.o PageSockets.o PageStats.o PageThreads.o PageTitledb.o PageSpiderdbLookup.o PageSpider.o PageDoledbIPTable.o PageDocProcess.o \
Phrases.o HostFlags.o Process.o Proxy.o Punycode.o \
InstanceInfoExchange.o \
Query.o \

@ -2853,7 +2853,7 @@ static LinkInfo *makeLinkInfo(int32_t ip,
// get approx # of words in link text
int32_t nw = 0;
if ( txtLen > 0 )
nw = getNumWords(txt,txtLen,TITLEREC_CURRENT_VERSION);
nw = getNumWords(txt,txtLen);
// store it
r->m_linkTextNumWords = nw;

117
PageDocProcess.cpp Normal file

@ -0,0 +1,117 @@
//
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// License TL;DR: If you change this file, you must publish your changes.
//
#include "TcpSocket.h"
#include "HttpRequest.h"
#include "HttpServer.h"
#include "Pages.h"
#include "GbUtil.h"
#include "DocDelete.h"
#include "DocRebuild.h"
#include "DocReindex.h"
#include "JobScheduler.h"
// State carried from sendPageDocProcess() through the job callbacks:
// the client socket, a private copy of the http request and the DocProcess
// instance that was selected for the request.
struct PageDocProcessState {
	TcpSocket *m_s;            // socket the reply is sent on
	HttpRequest m_r;           // copy of the request (see constructor)
	DocProcess *m_docProcess;  // doc process the key was added to

	PageDocProcessState(TcpSocket *s, HttpRequest *r, DocProcess *docProcess)
		: m_s(s),
		  m_r(),
		  m_docProcess(docProcess)
	{
		// keep our own copy of the request rather than the caller's pointer
		m_r.copy(r);
	}
};
void waitPendingDocCountWrapper(void *state) {
PageDocProcessState *pageDocProcessState = static_cast<PageDocProcessState*>(state);
pageDocProcessState->m_docProcess->waitPendingDocCount(0);
}
void doneWaitPendingDocCountWrapper(void *state, job_exit_t exit_type) {
PageDocProcessState *pageDocProcessState = static_cast<PageDocProcessState*>(state);
if (exit_type != job_exit_normal) {
g_httpServer.sendErrorReply(pageDocProcessState->m_s, ECANCELED, "job canceled");
return;
}
g_httpServer.sendSuccessReply(pageDocProcessState->m_s, pageDocProcessState->m_r.getReplyFormat());
}
/**
 * Page handler for admin/docprocess.
 *
 * Request parameters:
 *   type - "docdelete", "docrebuild" or "docreindex" (case-insensitive)
 *   key  - a url (anything starting with "http") or a docid
 *
 * The key is added to the matching DocProcess instance and a job is submitted
 * that waits until that DocProcess has no pending documents; the reply is
 * then sent from doneWaitPendingDocCountWrapper().
 *
 * @return false when the job was submitted (reply is sent later), otherwise
 *         the result of g_httpServer.sendErrorReply()
 */
bool sendPageDocProcess(TcpSocket *s, HttpRequest *r) {
	int32_t keyLen = 0;
	const char *key = r->getString("key", &keyLen);

	int32_t typeLen = 0;
	const char *type = r->getString("type", &typeLen);

	if (typeLen == 0) {
		return g_httpServer.sendErrorReply(s, EMISSINGINPUT, "missing parameter type");
	}

	// resolve the type to the (docid, url) pair of DocProcess instances
	DocProcess *docIdProcess = nullptr;
	DocProcess *urlProcess = nullptr;

	switch (typeLen) {
		case 9:
			if (strncasecmp(type, "docdelete", 9) == 0) {
				docIdProcess = &g_docDelete;
				urlProcess = &g_docDeleteUrl;
			}
			break;
		case 10:
			if (strncasecmp(type, "docrebuild", 10) == 0) {
				docIdProcess = &g_docRebuild;
				urlProcess = &g_docRebuildUrl;
			} else if (strncasecmp(type, "docreindex", 10) == 0) {
				docIdProcess = &g_docReindex;
				urlProcess = &g_docReindexUrl;
			}
			// explicit break: the original fell through into 'default'
			break;
		default:
			break;
	}

	if (docIdProcess == nullptr) {
		return g_httpServer.sendErrorReply(s, EMISSINGINPUT, "invalid parameter type (docdelete, docrebuild, docreindex)");
	}

	// a missing/empty key can never be processed; reject it up front instead
	// of building a std::string from it
	if (key == nullptr || keyLen <= 0) {
		return g_httpServer.sendErrorReply(s, EMISSINGINPUT, "missing parameter key");
	}

	const std::string keyStr(key, keyLen);
	DocProcess *docProcess = starts_with(keyStr.c_str(), "http") ? urlProcess : docIdProcess;

	// addKey() returns false when nothing was queued (invalid docid); do not
	// report success for a key that was silently ignored
	if (!docProcess->addKey(keyStr)) {
		return g_httpServer.sendErrorReply(s, EMISSINGINPUT, "invalid parameter key");
	}

	PageDocProcessState *state = new PageDocProcessState(s, r, docProcess);
	if (!g_jobScheduler.submit(waitPendingDocCountWrapper, doneWaitPendingDocCountWrapper, state, thread_type_page_process, 0)) {
		// unable to submit job: the finish callback will not run for it, so
		// release the state here
		delete state;
		return g_httpServer.sendErrorReply(s, EBADENGINEER, "unable to submit job");
	}

	// reply is sent from doneWaitPendingDocCountWrapper()
	return false;
}

@ -66,18 +66,25 @@ static void generatePageJSON(std::vector<uint32_t> &doleips, const char *coll, S
}
static bool respondWithError(TcpSocket *s, HttpRequest *r, const char *msg) {
static bool respondWithError(TcpSocket *s, HttpRequest *r, int32_t error, const char *errmsg) {
SafeBuf sb;
const char *contentType = NULL;
switch(r->getReplyFormat()) {
case FORMAT_HTML:
g_pages.printAdminTop(&sb, s, r, NULL);
sb.safePrintf("<p>%s</p>", msg);
sb.safePrintf("<p>%s</p>", errmsg);
g_pages.printAdminBottom2(&sb);
contentType = "text/html";
break;
case FORMAT_JSON:
sb.safePrintf("{error_message:\"%s\"}",msg); //todo: safe encode
sb.safePrintf("{\"response\":{\n"
"\t\"statusCode\":%" PRId32",\n"
"\t\"statusMsg\":\"", error);
sb.jsonEncode(errmsg);
sb.safePrintf("\"\n"
"}\n"
"}\n");
contentType = "application/json";
contentType = "application/json";
break;
default:
@ -94,12 +101,12 @@ bool sendPageDoledbIPTable(TcpSocket *s, HttpRequest *r) {
const char *coll = r->getString("c", NULL, NULL);
CollectionRec *cr = g_collectiondb.getRec(coll);
if(!cr) {
return respondWithError(s, r, "No collection specified");
return respondWithError(s, r, ENOCOLLREC, "No collection specified");
}
SpiderColl *spiderColl = cr->m_spiderColl;
if(!spiderColl) {
return respondWithError(s, r, "No spider-collection (?)");
return respondWithError(s, r, EBADENGINEER, "No spider-collection (?)");
}
std::vector<uint32_t> doleips = spiderColl->getDoledbIpTable();

@ -214,16 +214,10 @@ bool sendErrorReply ( void *state , int32_t err ) {
// get the tcp socket from the state
TcpSocket *s = st->m_socket;
char tmp [ 1024*32 ] ;
sprintf ( tmp , "%s",
mstrerror(g_errno));
// nuke state2
mdelete ( st , sizeof(State2) , "PageGet1" );
delete (st);
// erase g_errno for sending
//g_errno = 0;
// . now encapsulate it in html head/tail and send it off
//return g_httpServer.sendDynamicPage ( s , tmp , strlen(tmp) );
return g_httpServer.sendErrorReply ( s, err, mstrerror(err) );
}

@ -193,7 +193,7 @@ static bool gotSpiderRecs2(State *st) {
}
static bool respondWithError(State *st, const char *msg) {
static bool respondWithError(State *st, int32_t error, const char *errmsg) {
// get the socket
TcpSocket *s = st->m_socket;
@ -202,12 +202,18 @@ static bool respondWithError(State *st, const char *msg) {
switch(st->m_r.getReplyFormat()) {
case FORMAT_HTML:
g_pages.printAdminTop(&sb, s, &st->m_r, NULL);
sb.safePrintf("<p>%s</p>", msg);
sb.safePrintf("<p>%s</p>", errmsg);
g_pages.printAdminBottom2(&sb);
contentType = "text/html";
break;
case FORMAT_JSON:
sb.safePrintf("{error_message:\"%s\"}", msg); //todo: safe encode
sb.safePrintf("{\"response\":{\n"
"\t\"statusCode\":%" PRId32",\n"
"\t\"statusMsg\":\"", error);
sb.jsonEncode(errmsg);
sb.safePrintf("\"\n"
"}\n"
"}\n");
contentType = "application/json";
break;
default:
@ -425,7 +431,7 @@ static bool sendResult(State *st) {
sb.reserve2x ( 32768 );
if(g_errno) {
return respondWithError(st, mstrerror(g_errno));
return respondWithError(st, g_errno, mstrerror(g_errno));
}
int32_t shardNum = -1;

@ -33,6 +33,7 @@ static const char *thread_type_name(thread_type_t tt) {
case thread_type_unspecified_io: return "unspecified IO";
case thread_type_generate_thumbnail: return "generate-thumbnail";
case thread_type_config_load: return "config-load";
case thread_type_page_process: return "page-process";
default: return "?";
}
}

@ -233,6 +233,11 @@ static WebPage s_pages[] = {
sendPageParser,
PG_NOAPI|PG_COLLADMIN|PG_ACTIVE},
{ PAGE_DOCPROCESS, "admin/docprocess", 0, "DocProcess", 0, page_method_t::page_method_get,
"Various doc process methods",
sendPageDocProcess,
PG_NOAPI|PG_MASTERADMIN|PG_ACTIVE},
{ PAGE_SITEDB , "admin/tagdb" , 0 , "Tagdb" , 0, page_method_t::page_method_post_url,
"add/remove/get tags for sites/urls",
sendPageTagdb,
@ -862,55 +867,6 @@ bool printGigabotAdvice(SafeBuf *sb,
return true;
}
// Print the opening <form> tag for the dynamic page addressed by the request.
// The form method/encoding is chosen from the page's m_page_method; the form
// action is the page's m_filename. Nothing is printed (an error is logged)
// when the request does not map to a page.
void Pages::printFormTop( SafeBuf *sb, HttpRequest *r ) {
	int32_t page = getDynamicPageNumber ( r );
	if( page < 0 ) {
		logError("getDynamicPageNumber returned negative index!");
		return;
	}

	// . the form
	// . we cannot use the GET method if there is more than a few k of
	//   parameters, like in the case of the Search Controls page. The
	//   browser simply will not send the request if it is that big.
	// . each case ends in 'break': previously the cases fell through, so a
	//   post_form page printed three <form> tags and a post_url page two
	switch(s_pages[page].m_page_method) {
		case page_method_t::page_method_post_form:
			sb->safePrintf ("<form name=\"SubmitInput\" method=\"post\" "
			                // we need this for <input type=file> tags
			                "ENCTYPE=\"multipart/form-data\" "
			                "action=\"/%s\">\n",
			                s_pages[page].m_filename);
			break;
		case page_method_t::page_method_post_url:
			sb->safePrintf ("<form name=\"SubmitInput\" method=\"post\" "
			                "action=\"/%s\">\n",
			                s_pages[page].m_filename);
			break;
		case page_method_t::page_method_get:
			sb->safePrintf ("<form name=\"SubmitInput\" method=\"get\" "
			                "action=\"/%s\">\n",
			                s_pages[page].m_filename);
			break;
	}
}
// Print the hidden form inputs shared by the admin forms: the collection name
// ("c") taken from the request and the page's "cast" value.
void Pages::printFormData( SafeBuf *sb, TcpSocket *s, HttpRequest *r ) {
	const int32_t pageNum = getDynamicPageNumber ( r );

	// collection name, empty string when the request has none
	const char *collName = r->getString ( "c" );
	if ( collName == nullptr ) {
		collName = "";
	}
	// NOTE(review): collName is printed as-is into the attribute value - confirm
	// whether it needs html-encoding
	sb->safePrintf ( "<input type=\"hidden\" name=\"c\" "
	                 "value=\"%s\" />\n", collName);

	// should any changes be broadcasted to all hosts?
	// (0 when the request does not map to a page)
	int32_t castValue = 0;
	if ( pageNum >= 0 ) {
		castValue = (int32_t)s_pages[pageNum].m_cast;
	}
	sb->safePrintf ("<input type=\"hidden\" name=\"cast\" value=\"%" PRId32"\" "
	                "/>\n",
	                castValue);
}
// Overload taking the request: the request is not used, the call is simply
// forwarded to printAdminBottom(SafeBuf*).
bool Pages::printAdminBottom ( SafeBuf *sb, HttpRequest *r ) {
	const bool printed = printAdminBottom ( sb );
	return printed;
}
bool Pages::printSubmit ( SafeBuf *sb ) {
// update button
return sb->safePrintf (
@ -1124,6 +1080,7 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
if ( i == PAGE_SEARCHBOX ) continue;
if ( i == PAGE_TITLEDB ) continue;
if ( i == PAGE_HEALTHCHECK ) continue;
if ( i == PAGE_DOCPROCESS ) continue;

@ -69,10 +69,9 @@ bool sendPageProfiler ( TcpSocket *s , HttpRequest *r );
bool sendPageThreads ( TcpSocket *s , HttpRequest *r );
bool sendPageAPI ( TcpSocket *s , HttpRequest *r );
bool sendPageHelp ( TcpSocket *s , HttpRequest *r );
bool sendPageGraph ( TcpSocket *s , HttpRequest *r );
bool sendPageHealthCheck ( TcpSocket *sock , HttpRequest *hr ) ;
bool sendPageDefaultCss(TcpSocket *s, HttpRequest *r);
bool sendPageDocProcess(TcpSocket *s, HttpRequest *r);
enum class page_method_t {
page_method_get = 1, //plain http get
@ -137,10 +136,6 @@ class Pages {
const char *qs = NULL,
const char* bodyJavascript = "" );
void printFormTop( SafeBuf *sb, HttpRequest *r );
void printFormData( SafeBuf *sb, TcpSocket *s, HttpRequest *r );
bool printAdminBottom ( SafeBuf *sb, HttpRequest *r );
bool printAdminBottom ( SafeBuf *sb);
bool printAdminBottom2 ( SafeBuf *sb);
bool printTail ( SafeBuf* sb, bool isLocal );
@ -222,6 +217,7 @@ enum {
PAGE_DOLEIPTABLE ,
PAGE_SEARCHBOX ,
PAGE_PARSER ,
PAGE_DOCPROCESS ,
PAGE_SITEDB ,
PAGE_HEALTHCHECK ,
PAGE_NONE };

@ -5390,6 +5390,18 @@ void Parms::init ( ) {
m->m_page = PAGE_MASTER;
m++;
#ifndef PRIVACORE_SAFE_VERSION
m->m_title = "TitleRec version number";
m->m_desc = "Override TitleRec version number (for testing only!)";
m->m_cgi = "trvn";
simple_m_set(Conf,m_titleRecVersion);
m->m_def = TITLEREC_CURRENT_VERSION_STR;
m->m_group = false;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m++;
#endif
m->m_title = "use shotgun";
m->m_desc = "If enabled, all servers must have two gigabit "
"ethernet ports hooked up and Gigablast will round robin "

@ -2659,6 +2659,14 @@ const struct QueryField g_fields[] = {
NULL,
QTF_DUP },
{"sitenoindex",
FIELD_SITE,
true,
"sitenoindex:example.com",
"Matches all documents on the example.com domain that in not indexed.",
NULL,
0 },
{"ip",
FIELD_IP,
true,

@ -346,6 +346,37 @@ bool Tag::printToBufAsXml(SafeBuf *sb) const {
return true;
}
// Append this tag to 'sb' as one JSON object with the members
// name, user, timestamp, ip and value. The object is terminated with "},\n"
// (note the trailing comma). Returns false when printDataToBuf() fails,
// true otherwise.
bool Tag::printToBufAsJson(SafeBuf *sb) const {
	// open the object and start the tagname
	sb->safePrintf("\t{\n"
	               "\t\t\"name\": \"");
	sb->jsonEncode(getTagStrFromType(m_type));

	// close the tagname, start the user
	sb->safePrintf("\",\n"
	               "\t\t\"user\": \"");
	sb->jsonEncode(getUser());

	// close the user, then the date when this tag was added
	sb->safePrintf("\",\n"
	               "\t\t\"timestamp\": %" PRId32",\n", m_timestamp);

	// the ip added from
	char ipText[16];
	sb->safePrintf("\t\t\"ip\": \"");
	sb->jsonEncode(iptoa(m_ip, ipText));

	// close the ip, start the value
	sb->safePrintf("\",\n"
	               "\t\t\"value\": \"");

	// the m_data
	if (!printDataToBuf(sb)) {
		return false;
	}

	// close the value and the object
	sb->safePrintf("\"\n"
	               "\t},\n");

	return true;
}
bool Tag::printToBufAsHtml(SafeBuf *sb, const char *prefix) const {
// print the tagname
const char *str = getTagStrFromType ( m_type );
@ -847,6 +878,20 @@ bool TagRec::printToBufAsXml ( SafeBuf *sb ) {
return true;
}
// Append all tags except those of type TT_DUP to 'sb' as a JSON array member
// named "tag". Always returns true.
bool TagRec::printToBufAsJson ( SafeBuf *sb ) {
	sb->safePrintf("\t\"tag\": [\n");

	Tag *cur = getFirstTag();
	while ( cur ) {
		if ( cur->m_type != TT_DUP ) {
			cur->printToBufAsJson ( sb );
		}
		cur = getNextTag ( cur );
	}

	// each printed tag ends in ",\n"; strip that from the last one
	sb->removeLastChar('\n');
	sb->removeLastChar(',');

	sb->safePrintf("\t]\n");
	return true;
}
bool TagRec::printToBufAsHtml ( SafeBuf *sb , const char *prefix ) {
Tag *tag = getFirstTag();
for ( ; tag ; tag = getNextTag ( tag ) )

@ -34,6 +34,7 @@ class Tag {
bool printToBuf (SafeBuf *sb) const;
bool printToBufAsAddRequest(SafeBuf *sb) const;
bool printToBufAsXml (SafeBuf *sb) const;
bool printToBufAsJson (SafeBuf *sb) const;
bool printToBufAsHtml (SafeBuf *sb, const char *prefix) const;
bool printToBufAsTagVector (SafeBuf *sb) const;
// just print the m_data...
@ -116,6 +117,7 @@ public:
bool printToBuf ( SafeBuf *sb );
bool printToBufAsAddRequest ( SafeBuf *sb );
bool printToBufAsXml ( SafeBuf *sb );
bool printToBufAsJson ( SafeBuf *sb );
bool printToBufAsHtml ( SafeBuf *sb , const char *prefix );
bool printToBufAsTagVector ( SafeBuf *sb );

@ -1,6 +1,11 @@
#ifndef GB_TITLERECVERSION_H
#define GB_TITLERECVERSION_H
#ifndef STRINGIFY
#define STRINGIFY(x) #x
#define TO_STRING(x) STRINGIFY(x)
#endif
// Starting version when Gigablast was open-sourced
//#define TITLEREC_CURRENT_VERSION 120
@ -22,6 +27,11 @@
//#define TITLEREC_CURRENT_VERSION 125
// new adult detection
#define TITLEREC_CURRENT_VERSION 126
//#define TITLEREC_CURRENT_VERSION 126
// handle robots meta with noindex, follow
#define TITLEREC_CURRENT_VERSION 127
#define TITLEREC_CURRENT_VERSION_STR TO_STRING(TITLEREC_CURRENT_VERSION)
#endif // GB_TITLERECVERSION_H

@ -6,8 +6,10 @@
#include "Process.h"
#include <string.h>
#ifndef STRINGIFY
#define STRINGIFY(x) #x
#define TO_STRING(x) STRINGIFY(x)
#endif
#ifndef GIT_COMMIT_ID
#define GIT_COMMIT_ID unknown
@ -65,6 +67,3 @@ void printVersion() {
fprintf(stdout,"Gigablast Git branch : %s\n", getBranch());
fprintf(stdout,"Gigablast Git commit : %s\n", getCommitId());
}
#undef STRINGIFY
#undef TO_STRING

@ -385,6 +385,7 @@ void XmlDoc::reset ( ) {
m_setTr = false;
m_recycleContent = false;
m_docRebuild = false;
m_callback1 = NULL;
m_callback2 = NULL;
m_state = NULL;
@ -748,7 +749,13 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
//m_coll = coll;
m_pbuf = pbuf;
m_niceness = niceness;
m_version = TITLEREC_CURRENT_VERSION;
#ifndef PRIVACORE_SAFE_VERSION
m_version = g_conf.m_titleRecVersion;
#else
m_version = TITLEREC_CURRENT_VERSION;
#endif
m_versionValid = true;
// this is used to removing the rec from doledb after we spider it
@ -2332,7 +2339,15 @@ int32_t *XmlDoc::getIndexCode ( ) {
return (int32_t *) ini;
}
if (*ini) {
// check meta nofollow
bool *inf = getIsNoFollow();
if (!inf || inf == (bool*) -1) {
logTrace(g_conf.m_logTraceXmlDoc, "END, could not getIsNoFollow");
return (int32_t *) inf;
}
// meta noindex & nofollow
if (*ini && *inf) {
if (m_firstUrl.isRoot()) {
m_indexCode = EDOCDISALLOWEDROOT;
} else {
@ -2562,7 +2577,6 @@ int32_t *XmlDoc::getIndexCode ( ) {
ptr_utf8Content = NULL;
size_utf8Content = 0;
m_utf8ContentValid = true;
logTrace(g_conf.m_logTraceXmlDoc, "END, EDOCNONCANONICAL");
return &m_indexCode;
@ -2921,8 +2935,12 @@ bool XmlDoc::setTitleRecBuf ( SafeBuf *tbuf, int64_t docId, int64_t uh48 ){
// assume could not make one because we were banned or something
tbuf->purge(); // m_titleRec = NULL;
#ifndef PRIVACORE_SAFE_VERSION
m_version = g_conf.m_titleRecVersion;
#else
// start setting members in THIS's header before compression
m_version = TITLEREC_CURRENT_VERSION;
m_version = TITLEREC_CURRENT_VERSION;
#endif
// set this
m_headerSize = (char *)&ptr_firstUrl - (char *)&m_headerSize;
@ -3125,7 +3143,6 @@ SafeBuf *XmlDoc::getTitleRecBuf ( ) {
ptr_utf8Content = NULL;
size_utf8Content = 0;
m_utf8ContentValid = true;
} else {
m_titleRecBufValid = true;
return &m_titleRecBuf;
@ -5723,7 +5740,6 @@ Url **XmlDoc::getRedirUrl() {
ptr_utf8Content = NULL;
size_utf8Content = 0;
m_utf8ContentValid = true;
// mdw: let this path through so contactXmlDoc gets a proper
// redirect that we can follow. for the base xml doc at
@ -6230,8 +6246,11 @@ SafeBuf *XmlDoc::getTimeAxisUrl ( ) {
// from scratch. this loads it from titledb.
// . NULL is a valid value (EDOCNOTFOUND) so return a char **
char **XmlDoc::getOldTitleRec() {
logTrace(g_conf.m_logTraceXmlDoc, "BEGIN");
// if valid return that
if ( m_oldTitleRecValid ) {
logTrace(g_conf.m_logTraceXmlDoc, "END, already valid");
return &m_oldTitleRec;
}
@ -6241,6 +6260,7 @@ char **XmlDoc::getOldTitleRec() {
if ( m_setFromTitleRec ) {
m_oldTitleRecValid = true;
m_oldTitleRec = NULL;//m_titleRec;
logTrace(g_conf.m_logTraceXmlDoc, "END, setFromTitleRec");
return &m_oldTitleRec;
}
// sanity check
@ -6259,6 +6279,7 @@ char **XmlDoc::getOldTitleRec() {
if ( m_isIndexedValid && ! m_isIndexed && m_docIdValid ) {
m_oldTitleRec = NULL;
m_oldTitleRecValid = true;
logTrace(g_conf.m_logTraceXmlDoc, "END, not indexed");
return &m_oldTitleRec;
}
// sanity check. if we have no url or docid ...
@ -6288,6 +6309,7 @@ char **XmlDoc::getOldTitleRec() {
CollectionRec *cr = getCollRec();
if ( ! cr ) {
logTrace(g_conf.m_logTraceXmlDoc, "END, no collection");
return NULL;
}
@ -6316,6 +6338,7 @@ char **XmlDoc::getOldTitleRec() {
m_niceness , // niceness
999999 )) {// timeout seconds
// return -1 if we blocked
logTrace(g_conf.m_logTraceXmlDoc, "END, blocked");
return (char **)-1;
}
@ -6326,9 +6349,12 @@ char **XmlDoc::getOldTitleRec() {
// error?
if ( g_errno ) {
logTrace(g_conf.m_logTraceXmlDoc, "END, error=%s", mstrerror(g_errno));
return NULL;
}
logTrace(g_conf.m_logTraceXmlDoc, "END");
// got it
return &m_oldTitleRec;
}
@ -6619,7 +6645,9 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
if ( m_siteNumInlinksValid ) return &m_siteNumInlinks;
// sanity check
if ( m_setFromTitleRec && ! m_useSecondaryRdbs) {g_process.shutdownAbort(true);}
if (m_setFromTitleRec && !m_useSecondaryRdbs && !m_docRebuild) {
g_process.shutdownAbort(true);
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
@ -12929,7 +12957,7 @@ char *XmlDoc::getMetaList(bool forDelete) {
// . so at least now set all the data members we will need to
// serialize into the title rec because we can't be blocking further
// down below after we set all the hashtables and XmlDoc::ptr_ stuff
if (!m_setFromTitleRec || m_useSecondaryRdbs) {
if (!m_setFromTitleRec || m_useSecondaryRdbs || m_docRebuild) {
// all member vars should already be valid if set from titlerec
char *ptg = prepareToMakeTitleRec();
@ -14445,7 +14473,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
// . only if had old one
// . we use this in url filters to set the respider wait time usually
if ( od ) {
if ( od && !m_recycleContent) {
int32_t spideredTime = getSpideredTime();
int32_t oldSpideredTime = od->getSpideredTime();
float numDays = spideredTime - oldSpideredTime;
@ -17634,9 +17662,10 @@ bool XmlDoc::printDocForProCog ( SafeBuf *sb , HttpRequest *hr ) {
// for some reason sections page blocks forever in browser
if ( page != 7 && ! m_printedMenu ) {
printFrontPageShell ( sb , "search" , cr , false );
if (hr->getReplyFormat() == FORMAT_HTML) {
printFrontPageShell(sb, "search", cr, false);
}
m_printedMenu = true;
//printMenu ( sb );
}
@ -17741,9 +17770,8 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
const char *es = mstrerror(m_indexCode);
if ( ! m_indexCode ) es = mstrerror(g_errno);
int32_t isXml = hr->getLong("xml",0);
if ( ! isXml ) printMenu ( sb );
char format = hr->getReplyFormat();
if ( format == FORMAT_HTML ) printMenu ( sb );
int32_t shardNum = getShardNumFromDocId ( m_docId );
Host *hosts = g_hostdb.getShard ( shardNum );
@ -17757,7 +17785,7 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
spiderHostId = g_hostdb.getHostIdWithSpideringEnabled(spiderShardNum, false);
}
if ( ! isXml )
if ( format == FORMAT_HTML )
sb->safePrintf (
"<table cellpadding=3 border=0>\n"
@ -17776,12 +17804,16 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
"<td>%" PRId32"</td>"
"</tr>\n"
"<tr>"
"<td width=\"25%%\">title rec version</td>"
"<td>%" PRIu16"</td>"
"</tr>\n"
"<tr>"
"<td>index error code</td>"
"<td>%s</td>"
"</tr>\n"
"<tr>"
"<td>robots.txt allows</td>"
"<td>%s</td>"
@ -17800,6 +17832,7 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
h->m_hostId,
spiderHostId,
m_version,
es,
allowed,
@ -17807,13 +17840,14 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
fu
);
else
else if (format == FORMAT_XML)
sb->safePrintf (
"<?xml version=\"1.0\" "
"encoding=\"UTF-8\" ?>\n"
"<response>\n"
"\t<coll><![CDATA[%s]]></coll>\n"
"\t<docId>%" PRId64"</docId>\n"
"\t<titleRecVersion>%" PRIu16"</titleRecVersion>\n"
"\t<indexError><![CDATA[%s]]></indexError>\n"
"\t<robotsTxtAllows>%" PRId32
"</robotsTxtAllows>\n"
@ -17821,30 +17855,75 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
,
cr->m_coll,
m_docId ,
m_version,
es,
allowedInt,//(int32_t)m_isAllowed,
fu
);
else if (format == FORMAT_JSON) {
sb->safePrintf("{\"response\":{\n");
sb->safePrintf("\t\"coll\": \"");
sb->jsonEncode(cr->m_coll);
sb->safePrintf("\",\n");
sb->safePrintf("\t\"docId\": %" PRIu64",\n", m_docId);
sb->safePrintf("\t\"titleRecVersion\": %" PRIu16",\n", m_version);
sb->safePrintf("\t\"indexError\": \"");
sb->jsonEncode(es);
sb->safePrintf("\",\n");
sb->safePrintf("\t\"robotsTxtAllows\": %" PRId32",\n", allowedInt);
sb->safePrintf("\t\"url\": \"");
sb->jsonEncode(fu);
sb->safePrintf("\",\n");
}
char *redir = ptr_redirUrl;
if ( redir && ! isXml ) {
sb->safePrintf(
"<tr>"
"<td>redir url</td>"
"<td><a href=\"%s\">%s</a></td>"
"</tr>\n"
,redir
,redir );
if (redir) {
switch (format) {
case FORMAT_HTML:
sb->safePrintf(
"<tr>"
"<td>redir url</td>"
"<td><a href=\"%s\">%s</a></td>"
"</tr>\n"
,redir
,redir );
break;
case FORMAT_XML:
sb->safePrintf("\t<redirectUrl><![CDATA[%s]]></redirectUrl>\n" ,redir );
break;
case FORMAT_JSON:
sb->safePrintf("\t\"redirectUrl\": \"");
sb->jsonEncode(redir);
sb->safePrintf("\",\n");
break;
default:
break;
}
}
else if ( redir ) {
sb->safePrintf("\t<redirectUrl><![CDATA[%s]]>"
"</redirectUrl>\n" ,redir );
}
if ( m_indexCode || g_errno ) {
if ( ! isXml ) sb->safePrintf("</table><br>\n");
else sb->safePrintf("</response>\n");
switch (format) {
case FORMAT_HTML:
sb->safePrintf("</table><br>\n");
break;
case FORMAT_XML:
sb->safePrintf("</response>\n");
break;
case FORMAT_JSON:
sb->removeLastChar('\n');
sb->removeLastChar(',');
sb->safePrintf("}\n");
sb->safePrintf("}\n");
break;
default:
break;
}
return true;
}
@ -17852,157 +17931,120 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
// must always start with http
if ( strncmp ( fu , "http" , 4 ) != 0 ) { g_process.shutdownAbort(true); }
struct tm tm_buf;
char buf[64];
time_t ts = (time_t)m_firstIndexedDate;
if ( ! isXml )
sb->safePrintf("<tr><td>first indexed date</td>"
"<td>%s UTC</td></tr>\n" ,
asctime_r(gmtime_r(&ts,&tm_buf),buf) );
else
sb->safePrintf("\t<firstIndexedDateUTC>%" PRIu32
"</firstIndexedDateUTC>\n",
(uint32_t)m_firstIndexedDate );
ts = m_spideredTime;
if ( ! isXml )
sb->safePrintf("<tr><td>last indexed date</td>"
"<td>%s UTC</td></tr>\n" ,
asctime_r(gmtime_r(&ts,&tm_buf),buf) );
else
sb->safePrintf("\t<lastIndexedDateUTC>%" PRIu32
"</lastIndexedDateUTC>\n",
(uint32_t)m_spideredTime );
ts = m_outlinksAddedDate;
if ( ! isXml )
sb->safePrintf("<tr><td>outlinks last added date</td>"
"<td>%s UTC</td></tr>\n" ,
asctime_r(gmtime_r(&ts,&tm_buf),buf) );
else
sb->safePrintf("\t<outlinksLastAddedUTC>%" PRIu32
"</outlinksLastAddedUTC>\n",
(uint32_t)m_outlinksAddedDate );
// hop count
if ( ! isXml )
sb->safePrintf("<tr><td>hop count</td><td>%" PRId32"</td>"
"</tr>\n",
(int32_t)m_hopCount);
else
sb->safePrintf("\t<hopCount>%" PRId32"</hopCount>\n",
(int32_t)m_hopCount);
char strLanguage[128];
languageToString(m_langId, strLanguage);
// print tags
//SafeBuf tb;
int32_t sni = m_siteNumInlinks;
char ipString[16];
iptoa(m_ip,ipString);
//int32_t sni = info1->getNumGoodInlinks();
switch (format) {
case FORMAT_HTML: {
struct tm tm_buf;
char buf[64];
time_t tlu = info1->getLastUpdated();
struct tm *timeStruct3 = gmtime_r(&tlu,&tm_buf);//info1->m_lastUpdated );
char tmp3[64];
strftime ( tmp3 , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct3 );
time_t ts = (time_t)m_firstIndexedDate;
sb->safePrintf("<tr><td>first indexed date</td><td>%s UTC</td></tr>\n",
asctime_r(gmtime_r(&ts, &tm_buf), buf));
ts = m_spideredTime;
sb->safePrintf("<tr><td>last indexed date</td><td>%s UTC</td></tr>\n",
asctime_r(gmtime_r(&ts, &tm_buf), buf));
if ( ! isXml )
sb->safePrintf (
"<tr><td>original charset</td><td>%s</td></tr>\n"
"<tr><td>adult bit</td><td>%" PRId32"</td></tr>\n"
//"<tr><td>is link spam?</td><td>%" PRId32" <b>%s</b></td></tr>\n"
"<tr><td>is permalink?</td><td>%" PRId32"</td></tr>\n"
"<tr><td>is RSS feed?</td><td>%" PRId32"</td></tr>\n"
"<tr><td>ip</td><td><a href=\"/search?q=ip%%3A%s&c=%s&n=100\">"
"%s</td></tr>\n"
"<tr><td>http status</td><td>%d</td></tr>"
"<tr><td>content len</td><td>%" PRId32" bytes</td></tr>\n"
"<tr><td>content truncated</td><td>%" PRId32"</td></tr>\n"
"<tr><td>content type</td><td>%s</td></tr>\n"
"<tr><td>language</td><td>%s</td></tr>\n"
"<tr><td>country</td><td>%s</td></tr>\n"
ts = m_outlinksAddedDate;
sb->safePrintf("<tr><td>outlinks last added date</td><td>%s UTC</td></tr>\n",
asctime_r(gmtime_r(&ts, &tm_buf), buf));
"<tr><td><b>good inlinks to site</b>"
"</td><td>%" PRId32"</td></tr>\n"
sb->safePrintf("<tr><td>hop count</td><td>%" PRId32"</td></tr>\n", (int32_t)m_hopCount);
"<tr><td><b>site rank</b></td><td>%" PRId32"</td></tr>\n"
sb->safePrintf("<tr><td>original charset</td><td>%s</td></tr>\n", get_charset_str(m_charset));
sb->safePrintf("<tr><td>adult bit</td><td>%" PRId32"</td></tr>\n", (int32_t)m_isAdult);
sb->safePrintf("<tr><td>is permalink?</td><td>%" PRId32"</td></tr>\n", (int32_t)m_isPermalink);
sb->safePrintf("<tr><td>is RSS feed?</td><td>%" PRId32"</td></tr>\n", (int32_t)m_isRSS);
sb->safePrintf("<tr><td>ip</td><td><a href=\"/search?q=ip%%3A%s&c=%s&n=100\">%s</td></tr>\n", ipString, cr->m_coll, ipString);
sb->safePrintf("<tr><td>http status</td><td>%d</td></tr>", m_httpStatus);
sb->safePrintf("<tr><td>content len</td><td>%" PRId32" bytes</td></tr>\n", size_utf8Content - 1);
sb->safePrintf("<tr><td>content truncated</td><td>%" PRId32"</td></tr>\n", (int32_t)m_isContentTruncated);
sb->safePrintf("<tr><td>content type</td><td>%s</td></tr>\n", g_contentTypeStrings[(int)m_contentType]);
sb->safePrintf("<tr><td>language</td><td>%s</td></tr>\n", strLanguage);
sb->safePrintf("<tr><td>country</td><td>%s</td></tr>\n", g_countryCode.getName(m_countryId));
sb->safePrintf("<tr><td><b>good inlinks to site</b></td><td>%" PRId32"</td></tr>\n", m_siteNumInlinks);
sb->safePrintf("<tr><td><b>site rank</b></td><td>%" PRId32"</td></tr>\n", ::getSiteRank(m_siteNumInlinks));
sb->safePrintf("<tr><td>good inlinks to page</td><td>%" PRId32"</td></tr>\n", info1->getNumGoodInlinks());
"<tr><td>good inlinks to page"
"</td><td>%" PRId32"</td></tr>\n"
time_t tlu = info1->getLastUpdated();
struct tm *timeStruct3 = gmtime_r(&tlu,&tm_buf);
char tmp3[64];
strftime ( tmp3 , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct3 );
sb->safePrintf("<tr><td><nobr>page inlinks last computed</nobr></td><td>%s</td></tr>\n", tmp3);
"<tr><td><nobr>page inlinks last computed</nobr></td>"
"<td>%s</td></tr>\n"
"</td></tr>\n",
get_charset_str(m_charset),
(int32_t)m_isAdult,
(int32_t)m_isPermalink,
(int32_t)m_isRSS,
ipString,
cr->m_coll,
ipString,
m_httpStatus,
size_utf8Content - 1,
(int32_t)m_isContentTruncated,
g_contentTypeStrings[(int)m_contentType] ,
strLanguage,
g_countryCode.getName(m_countryId) ,
sni,
::getSiteRank(sni),
info1->getNumGoodInlinks(),
sb->safePrintf("</td></tr>\n");
} break;
case FORMAT_XML:
sb->safePrintf("\t<firstIndexedDateUTC>%" PRIu32"</firstIndexedDateUTC>\n", (uint32_t)m_firstIndexedDate);
sb->safePrintf("\t<lastIndexedDateUTC>%" PRIu32"</lastIndexedDateUTC>\n", (uint32_t)m_spideredTime);
sb->safePrintf("\t<outlinksLastAddedUTC>%" PRIu32"</outlinksLastAddedUTC>\n", (uint32_t)m_outlinksAddedDate);
tmp3
);
else {
sb->safePrintf (
"\t<charset><![CDATA[%s]]></charset>\n"
"\t<isAdult>%" PRId32"</isAdult>\n"
"\t<isLinkSpam>%" PRId32"</isLinkSpam>\n"
"\t<siteRank>%" PRId32"</siteRank>\n"
sb->safePrintf("\t<hopCount>%" PRId32"</hopCount>\n", (int32_t)m_hopCount);
"\t<numGoodSiteInlinks>%" PRId32"</numGoodSiteInlinks>\n"
sb->safePrintf("\t<charset><![CDATA[%s]]></charset>\n", get_charset_str(m_charset));
sb->safePrintf("\t<isAdult>%" PRId32"</isAdult>\n", (int32_t)m_isAdult);
sb->safePrintf("\t<isLinkSpam>%" PRId32"</isLinkSpam>\n", (int32_t)m_isLinkSpam);
sb->safePrintf("\t<siteRank>%" PRId32"</siteRank>\n", ::getSiteRank(m_siteNumInlinks));
sb->safePrintf("\t<numGoodSiteInlinks>%" PRId32"</numGoodSiteInlinks>\n", m_siteNumInlinks);
sb->safePrintf("\t<numGoodPageInlinks>%" PRId32"</numGoodPageInlinks>\n", info1->getNumGoodInlinks());
sb->safePrintf("\t<pageInlinksLastComputed>%" PRId32"</pageInlinksLastComputed>\n", (int32_t)info1->m_lastUpdated);
sb->safePrintf("\t<isPermalink>%" PRId32"</isPermalink>\n", (int32_t)m_isPermalink);
sb->safePrintf("\t<isRSSFeed>%" PRId32"</isRSSFeed>\n", (int32_t)m_isRSS);
sb->safePrintf("\t<ipAddress><![CDATA[%s]]></ipAddress>\n", ipString);
sb->safePrintf("\t<httpStatus>%d</httpStatus>", m_httpStatus);
sb->safePrintf("\t<contentLenInBytes>%" PRId32"</contentLenInBytes>\n", size_utf8Content - 1);
sb->safePrintf("\t<isContentTruncated>%" PRId32"</isContentTruncated>\n", (int32_t)m_isContentTruncated);
sb->safePrintf("\t<contentType><![CDATA[%s]]></contentType>\n", g_contentTypeStrings[(int)m_contentType]);
sb->safePrintf("\t<language><![CDATA[%s]]></language>\n", strLanguage);
sb->safePrintf("\t<country><![CDATA[%s]]></country>\n", g_countryCode.getName(m_countryId));
break;
case FORMAT_JSON:
sb->safePrintf("\t\"firstIndexedDateUTC\": %" PRIu32",\n", m_firstIndexedDate);
sb->safePrintf("\t\"lastIndexedDateUTC\": %" PRIu32",\n", m_spideredTime);
sb->safePrintf("\t\"outlinksLastAddedUTC\": %" PRIu32",\n", m_outlinksAddedDate);
"\t<numGoodPageInlinks>%" PRId32"</numGoodPageInlinks>\n"
"\t<pageInlinksLastComputed>%" PRId32
"</pageInlinksLastComputed>\n"
sb->safePrintf("\t\"hopCount\": %" PRId8",\n", m_hopCount);
,get_charset_str(m_charset)
,(int32_t)m_isAdult
,(int32_t)m_isLinkSpam
,::getSiteRank(sni)
,sni
sb->safePrintf("\t\"charset\": \"");
sb->jsonEncode(get_charset_str(m_charset));
sb->safePrintf("\",\n");
,info1->getNumGoodInlinks()
,(int32_t)info1->m_lastUpdated
);
sb->safePrintf("\t<isPermalink>%" PRId32"</isPermalink>\n"
"\t<isRSSFeed>%" PRId32"</isRSSFeed>\n"
"\t<ipAddress><![CDATA[%s]]></ipAddress>\n"
"\t<httpStatus>%d</httpStatus>"
"\t<contentLenInBytes>%" PRId32
"</contentLenInBytes>\n"
"\t<isContentTruncated>%" PRId32
"</isContentTruncated>\n"
"\t<contentType><![CDATA[%s]]></contentType>\n"
"\t<language><![CDATA[%s]]></language>\n"
"\t<country><![CDATA[%s]]></country>\n",
(int32_t)m_isPermalink,
(int32_t)m_isRSS,
ipString,
m_httpStatus,
size_utf8Content - 1,
(int32_t)m_isContentTruncated,
g_contentTypeStrings[(int)m_contentType] ,
strLanguage,
g_countryCode.getName(m_countryId) );
sb->safePrintf("\t\"isAdult\": %s,\n", m_isAdult ? "true" : "false");
sb->safePrintf("\t\"isLinkSpam\": %s,\n", m_isLinkSpam ? "true" : "false");
sb->safePrintf("\t\"siteRank\": %" PRId32",\n", ::getSiteRank(m_siteNumInlinks));
sb->safePrintf("\t\"numGoodSiteInlinks\": %" PRId32",\n", m_siteNumInlinks);
sb->safePrintf("\t\"numGoodPageInlinks\": %" PRId32",\n", info1->getNumGoodInlinks());
sb->safePrintf("\t\"pageInlinksLastComputed\": %" PRId32",\n", info1->m_lastUpdated);
sb->safePrintf("\t\"isPermalink\": %s,\n", m_isPermalink ? "true" : "false");
sb->safePrintf("\t\"isRSSFeed\": %s,\n", m_isRSS ? "true" : "false");
sb->safePrintf("\t\"ipAddress\": \"");
sb->jsonEncode(ipString);
sb->safePrintf("\",\n");
sb->safePrintf("\t\"httpStatus\": %" PRId16",\n", m_httpStatus);
sb->safePrintf("\t\"contentLenInBytes\": %" PRId32",\n", size_utf8Content - 1);
sb->safePrintf("\t\"isContentTruncated\": %s,\n", m_isContentTruncated ? "true" : "false");
sb->safePrintf("\t\"contentType\": \"");
sb->jsonEncode(g_contentTypeStrings[(int)m_contentType]);
sb->safePrintf("\",\n");
sb->safePrintf("\t\"language\": \"");
sb->jsonEncode(strLanguage);
sb->safePrintf("\",\n");
sb->safePrintf("\t\"country\": \"");
sb->jsonEncode(g_countryCode.getName(m_countryId));
sb->safePrintf("\",\n");
break;
default:
break;
}
TagRec *ogr = NULL;
@ -18011,18 +18053,45 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
// sanity. should be set from titlerec, so no blocking!
if ( ! ogr || ogr == (void *)-1 ) { g_process.shutdownAbort(true); }
}
if ( ogr && ! isXml ) ogr->printToBufAsHtml ( sb , "tag" );
else if ( ogr ) ogr->printToBufAsXml ( sb );
if (ogr) {
switch (format) {
case FORMAT_HTML:
ogr->printToBufAsHtml(sb, "tag");
break;
case FORMAT_XML:
ogr->printToBufAsXml(sb);
break;
case FORMAT_JSON:
ogr->printToBufAsJson(sb);
break;
default:
break;
}
}
// show the good inlinks we used when indexing this
if ( ! isXml )
info1->print(sb,cr->m_coll);
if (format == FORMAT_HTML) {
info1->print(sb, cr->m_coll);
}
// close the table
if ( ! isXml )
sb->safePrintf ( "</table></center><br>\n" );
else
sb->safePrintf("</response>\n");
switch (format) {
case FORMAT_HTML:
sb->safePrintf("</table><br>\n");
break;
case FORMAT_XML:
sb->safePrintf("</response>\n");
break;
case FORMAT_JSON:
sb->removeLastChar('\n');
sb->removeLastChar(',');
sb->safePrintf("}\n");
sb->safePrintf("}\n");
break;
default:
break;
}
return true;
}

@ -1104,6 +1104,7 @@ public:
bool m_contentInjected;
bool m_recycleContent;
bool m_docRebuild;
char *m_rawUtf8Content;
int32_t m_rawUtf8ContentSize;

@ -372,6 +372,17 @@ char *XmlDoc::hashAll(HashTableX *table) {
return (char *)1;
}
bool *ini = getIsNoIndex();
if (ini == nullptr || ini == (bool*)-1) {
// must not be blocked
gbshutdownLogicError();
}
if (*ini && m_version > 126) {
logTrace(g_conf.m_logTraceXmlDoc, "END, noindex");
return (char *)1;
}
if ((size_utf8Content - 1) <= 0) {
logTrace(g_conf.m_logTraceXmlDoc, "END, contentLen == 0");
return (char *)1;
@ -916,50 +927,57 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
return false;
if( urlOnly )
{
if (urlOnly) {
return true;
}
bool *ini = getIsNoIndex();
if (ini == nullptr || ini == (bool*)-1) {
// must not be blocked
gbshutdownLogicError();
}
if ( getUseTimeAxis() ) { // g_conf.m_useTimeAxis ) {
if ( getUseTimeAxis() ) {
hi.m_prefix = "gbtimeurl";
SafeBuf *tau = getTimeAxisUrl();
hashSingleTerm ( tau->getBufStart(),tau->length(),&hi);
}
setStatus ( "hashing inurl colon" );
char *s = fu->getUrl();
int32_t slen = fu->getUrlLen();
//
// HASH inurl: terms
//
char *s = fu->getUrl ();
int32_t slen = fu->getUrlLen();
hi.m_prefix = "inurl";
if (!*ini || m_version <= 126) {
setStatus("hashing inurl colon");
//
// HASH inurl: terms
//
hi.m_prefix = "inurl";
// BR 20160114: Skip numbers in urls when doing "inurl:" queries
hi.m_hashNumbers = false;
hi.m_filterUrlIndexableWords = true;
if ( ! hashString ( s,slen, &hi ) ) return false;
// BR 20160114: Skip numbers in urls when doing "inurl:" queries
hi.m_hashNumbers = false;
hi.m_filterUrlIndexableWords = true;
if (!hashString(s, slen, &hi)) return false;
}
{
setStatus("hashing ip colon");
hi.m_hashNumbers = true;
hi.m_filterUrlIndexableWords = false;
setStatus ( "hashing ip colon" );
hi.m_hashNumbers = true;
hi.m_filterUrlIndexableWords = false;
//
// HASH ip:a.b.c.d
//
if (!m_ipValid) { g_process.shutdownAbort(true); }
// copy it to save it
char ipbuf[64];
int32_t iplen = strlen(iptoa(m_ip, ipbuf));
hi.m_prefix = "ip";
if (!hashSingleTerm(ipbuf, iplen, &hi)) return false;
//
// HASH ip:a.b.c.d
//
if ( ! m_ipValid ) { g_process.shutdownAbort(true); }
// copy it to save it
char ipbuf[64];
int32_t iplen = strlen(iptoa(m_ip,ipbuf));
hi.m_prefix = "ip";
if ( ! hashSingleTerm(ipbuf,iplen,&hi) ) return false;
// . sanity check
if ( ! m_siteNumInlinksValid ) { g_process.shutdownAbort(true); }
// . sanity check
if (!m_siteNumInlinksValid) { g_process.shutdownAbort(true); }
}
//
@ -1033,9 +1051,12 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
*p = '\0';
// update hash parms
hi.m_prefix = "site";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "site2";
if (m_version <= 126) {
hi.m_prefix = "site";
} else {
hi.m_prefix = *ini ? "sitenoindex" : "site";
}
hi.m_hashGroup = HASHGROUP_INURL;
@ -1105,24 +1126,26 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
}
}
const char *ext = fu->getExtension();
int32_t elen = fu->getExtensionLen();
if (!*ini || m_version <= 126) {
//
// HASH ext: term
//
// i.e. ext:gif ext:html ext:htm ext:pdf, etc.
setStatus("hashing ext colon");
// update hash parms
hi.m_prefix = "ext";
if (!hashSingleTerm(ext, elen, &hi)) return false;
}
//
// HASH ext: term
//
// i.e. ext:gif ext:html ext:htm ext:pdf, etc.
setStatus ( "hashing ext colon");
const char *ext = fu->getExtension();
int32_t elen = fu->getExtensionLen();
// update hash parms
hi.m_prefix = "ext";
if ( ! hashSingleTerm(ext,elen,&hi ) ) return false;
setStatus ( "hashing gbdocid" );
hi.m_prefix = "gbdocid";
char buf2[32];
sprintf(buf2,"%" PRIu64, (uint64_t)m_docId );
if ( ! hashSingleTerm(buf2,strlen(buf2),&hi) ) return false;
{
setStatus("hashing gbdocid");
hi.m_prefix = "gbdocid";
char buf2[32];
sprintf(buf2, "%" PRIu64, (uint64_t)m_docId);
if (!hashSingleTerm(buf2, strlen(buf2), &hi)) return false;
}
setStatus ( "hashing SiteGetter terms");
@ -1180,6 +1203,11 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
hi.m_prefix = "urlhash";
if ( ! hashString(buf,blen,&hi) ) return false;
// don't index mid domain or url path for noindex document
if (*ini && m_version > 126) {
return true;
}
if (size_utf8Content - 1 > 0 || m_indexCode == EDOCDISALLOWEDROOT) {
setStatus("hashing url mid domain");

@ -306,7 +306,7 @@ const char *strnstrn(const char *haystack, int32_t haystackLen, const char *need
}
// . get the # of words in this string
int32_t getNumWords ( char *s , int32_t len, int32_t titleVersion ) {
int32_t getNumWords ( char *s , int32_t len ) {
int32_t wordCount = 0;
bool inWord = false;

@ -66,7 +66,7 @@ int32_t to_lower_utf8 (char *dst , char *dstEnd, const char *src ) ;
int32_t to_lower_utf8 (char *dst , char *dstEnd, const char *src, const char *srcEnd) ;
// . get the # of words in this string
int32_t getNumWords ( char *s , int32_t len, int32_t titleVersion ) ;
int32_t getNumWords ( char *s , int32_t len ) ;
int32_t atol2 ( const char *s, int32_t len ) ;
int64_t atoll1 ( const char *s ) ;
int64_t atoll2 ( const char *s, int32_t len ) ;

@ -0,0 +1,195 @@
#include "XmlDoc.h"
#include "Collectiondb.h"
#include "SpiderCache.h"
#include "Titledb.h"
#include "Doledb.h"
#include "CountryCode.h"
#include "Log.h"
#include "Conf.h"
#include "Mem.h"
#include "UrlBlockCheck.h"
#include "UrlMatchList.h"
#include "WantedChecker.h"
#include <libgen.h>
#include <algorithm>
/// Write the command line help text to stdout.
/// @param argv0 program name as invoked; shown in the "Usage:" line.
static void print_usage(const char *argv0) {
	// only the first line needs formatting
	printf("Usage: %s [-h] PATH\n", argv0);

	// the remainder is constant text, emitted in one go
	fputs("Dump unwanted titlerec\n"
	      "\n"
	      " -h, --help display this help and exit\n",
	      stdout);
}
// Tear down the global state that main() set up before the program exits.
// Logging is switched off first, then the databases are reset in exactly the
// reverse of the order in which main() initialises them (linkdb ... posdb,
// then collectiondb and finally the event loop), and last the WantedChecker
// is finalized.
static void cleanup() {
// silence the logger for the duration of the teardown
g_log.m_disabled = true;
// reset rdbs, last initialised first
g_linkdb.reset();
g_clusterdb.reset();
g_spiderCache.reset();
g_doledb.reset();
g_spiderdb.reset();
g_tagdb.reset();
g_titledb.reset();
g_posdb.reset();
// collection records and the loop were set up before the rdbs, so they go last
g_collectiondb.reset();
g_loop.reset();
// counterpart of WantedChecker::initialize() called in main()
WantedChecker::finalize();
}
/**
 * Scan every title record of the "main" collection stored under PATH
 * (argv[1]) and print, one per line on stdout, the documents that carry a
 * meta noindex but are not nofollow:
 *
 *     <docId>|meta noindex follow|<url>
 *
 * Records are silently skipped when
 *   - the XmlDoc cannot be set from the title record (logged),
 *   - the first url or the redirect url is unwanted,
 *   - the content type is one of the image/js/css/json/archive types below,
 *   - WantedChecker rejects the document content.
 *
 * @return 0 when the scan ran to completion, 1 on usage error, when help was
 *         requested, or when initialization failed.
 */
int main(int argc, char **argv) {
	if (argc < 2) {
		print_usage(argv[0]);
		return 1;
	}

	if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) {
		print_usage(argv[0]);
		return 1;
	}

	// logging stays disabled until initialization is done
	g_log.m_disabled = true;

	// initialize library
	g_mem.init();
	hashinit();

	// current dir
	char path[PATH_MAX];
	if (realpath(argv[1], path) == nullptr) {
		// path is undefined on failure, so it must not be used
		perror(argv[1]);
		return 1;
	}

	// make sure the directory ends with exactly one '/'.
	// (look at the last character, not at the terminating NUL)
	size_t pathLen = strlen(path);
	if (pathLen == 0 || path[pathLen - 1] != '/') {
		// need room for the extra '/' and the terminating NUL
		if (pathLen + 2 > sizeof(path)) {
			fprintf(stderr, "Path too long: %s\n", path);
			return 1;
		}
		path[pathLen++] = '/';
		path[pathLen] = '\0';
	}

	g_hostdb.init(-1, false, false, true, path);
	g_conf.init(path);

	ucInit();

	// initialize rdbs
	g_loop.init();

	g_collectiondb.loadAllCollRecs();

	g_posdb.init();
	g_titledb.init();
	g_tagdb.init();
	g_spiderdb.init();
	g_doledb.init();
	g_spiderCache.init();
	g_clusterdb.init();
	g_linkdb.init();

	g_collectiondb.addRdbBaseToAllRdbsForEachCollRec();

	g_log.m_disabled = false;
	g_log.m_logPrefix = false;

	CollectionRec *cr = g_collectiondb.getRec("main");
	if (!cr) {
		logf(LOG_TRACE, "No main collection found");
		return 1;
	}

	// initialize shlib & blacklist
	if (!WantedChecker::initialize()) {
		fprintf(stderr, "Unable to initialize WantedChecker\n");
		return 1;
	}

	g_urlBlackList.init();
	g_urlWhiteList.init();

	Msg5 msg5;
	RdbList list;

	key96_t startKey;
	startKey.setMin();

	key96_t endKey;
	endKey.setMax();

	// read titledb in chunks, restarting after the last key of each chunk
	while (msg5.getList(RDB_TITLEDB, cr->m_collnum, &list, &startKey, &endKey, 10485760, true, 0, -1, NULL, NULL, 0, true, -1, false)) {
		if (list.isEmpty()) {
			break;
		}

		for (list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord()) {
			key96_t key = list.getCurrentKey();
			int64_t docId = Titledb::getDocIdFromKey(&key);

			XmlDoc xmlDoc;
			if (!xmlDoc.set2(list.getCurrentRec(), list.getCurrentRecSize(), "main", NULL, 0)) {
				logf(LOG_TRACE, "Unable to set XmlDoc for docId=%" PRId64, docId);
				continue;
			}

			// extract the url
			Url *url = xmlDoc.getFirstUrl();
			const char *reason = NULL;

			if (isUrlUnwanted(*url, &reason)) {
				continue;
			}

			Url **redirUrlPtr = xmlDoc.getRedirUrl();
			if (redirUrlPtr && *redirUrlPtr) {
				Url *redirUrl = *redirUrlPtr;
				if (isUrlUnwanted(*redirUrl, &reason)) {
					continue;
				}
			}

			// NOTE(review): assumes getContentType() follows the same
			// "nullptr on error / -1 when blocked" convention as
			// getIsNoIndex() - confirm
			uint8_t *contentType = xmlDoc.getContentType();
			if (contentType == nullptr || contentType == (uint8_t *)-1) {
				logf(LOG_TRACE, "Unable to get content type for docId=%" PRId64, docId);
				continue;
			}

			switch (*contentType) {
				case CT_GIF:
				case CT_JPG:
				case CT_PNG:
				case CT_TIFF:
				case CT_BMP:
				case CT_JS:
				case CT_CSS:
				case CT_JSON:
				case CT_IMAGE:
				case CT_GZ:
				case CT_ARC:
				case CT_WARC:
					continue;
				default:
					break;
			}

			// check content
			int32_t contentLen = xmlDoc.size_utf8Content > 0 ? (xmlDoc.size_utf8Content - 1) : 0;
			if (contentLen > 0) {
				if (!WantedChecker::check_single_content(url->getUrl(), xmlDoc.ptr_utf8Content, contentLen).wanted) {
					continue;
				}
			}

			// getIsNoIndex() may hand back nullptr or -1 instead of a
			// usable pointer; never dereference those
			bool *ini = xmlDoc.getIsNoIndex();
			if (ini == nullptr || ini == (bool *)-1) {
				logf(LOG_TRACE, "Unable to get noindex for docId=%" PRId64, docId);
				continue;
			}

			if (*ini) {
				// NOTE(review): assumes getIsNoFollow() uses the same
				// nullptr / -1 convention - confirm
				bool *inf = xmlDoc.getIsNoFollow();
				if (inf == nullptr || inf == (bool *)-1) {
					logf(LOG_TRACE, "Unable to get nofollow for docId=%" PRId64, docId);
					continue;
				}

				if (!*inf) {
					fprintf(stdout, "%" PRId64"|meta noindex follow|%s\n", docId, url->getUrl());
				}
			}
		}

		startKey = *(key96_t *)list.getLastKey();
		startKey++;

		// watch out for wrap around
		if ( startKey < *(key96_t *)list.getLastKey() ) {
			break;
		}
	}

	cleanup();

	return 0;
}

@ -178,10 +178,8 @@ int main(int argc, char **argv) {
bool *inf = xmlDoc.getIsNoFollow();
if (*inf) {
fprintf(stdout, "%" PRId64"|meta noindex nofollow|%s\n", docId, url->getUrl());
} else {
fprintf(stdout, "%" PRId64"|meta noindex follow|%s\n", docId, url->getUrl());
continue;
}
continue;
}
}