mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-06-27 00:16:07 -04:00
Merge branch 'master' into sto
This commit is contained in:
Conf.cppConf.hDocProcess.cppDocProcess.hDocRebuild.cppDocReindex.cppJobScheduler.cppJobScheduler.hMakefileMsg25.cppPageDocProcess.cppPageDoledbIPTable.cppPageGet.cppPageSpiderdbLookup.cppPageThreads.cppPages.cppPages.hParms.cppQuery.cppTagdb.cppTagdb.hTitleRecVersion.hVersion.cppXmlDoc.cppXmlDoc.hXmlDoc_Indexing.cppfctypes.cppfctypes.h
tools
1
Conf.cpp
1
Conf.cpp
@ -172,6 +172,7 @@ Conf::Conf ( ) {
|
||||
m_useShotgun = false;
|
||||
m_testMem = false;
|
||||
m_doConsistencyTesting = false;
|
||||
m_titleRecVersion = TITLEREC_CURRENT_VERSION;
|
||||
memset(m_spiderUserAgent, 0, sizeof(m_spiderUserAgent));
|
||||
memset(m_spiderBotName, 0, sizeof(m_spiderBotName));
|
||||
m_autoSaveFrequency = 0;
|
||||
|
2
Conf.h
2
Conf.h
@ -299,6 +299,8 @@ class Conf {
|
||||
bool m_testMem;
|
||||
bool m_doConsistencyTesting;
|
||||
|
||||
int32_t m_titleRecVersion;
|
||||
|
||||
// defaults to "Gigabot/1.0"
|
||||
char m_spiderUserAgent[USERAGENTMAXSIZE];
|
||||
|
||||
|
@ -205,7 +205,7 @@ void DocProcess::removePendingDoc(DocProcessDocItem *docItem) {
|
||||
gbshutdownLogicError();
|
||||
}
|
||||
|
||||
if (it == m_pendingDocItems.begin()) {
|
||||
if (docItem->m_lastPos >= 0 && it == m_pendingDocItems.begin()) {
|
||||
std::ofstream lastPosFile(docItem->m_docProcess->m_lastPosFilename, std::ofstream::out|std::ofstream::trunc);
|
||||
lastPosFile << docItem->m_lastPos << "|" << docItem->m_key << std::endl;
|
||||
}
|
||||
@ -214,6 +214,38 @@ void DocProcess::removePendingDoc(DocProcessDocItem *docItem) {
|
||||
pthread_cond_signal(&m_pendingDocItemsCond);
|
||||
}
|
||||
|
||||
bool DocProcess::addKey(const std::string &key, int64_t currentFilePos) {
|
||||
logTrace(g_conf.m_logTraceDocProcess, "Processing key='%s'", key.c_str());
|
||||
DocProcessDocItem *docItem = createDocItem(this, key, currentFilePos);
|
||||
|
||||
if (m_isUrl) {
|
||||
SpiderRequest sreq;
|
||||
sreq.setFromAddUrl(key.c_str());
|
||||
sreq.m_isAddUrl = 0;
|
||||
|
||||
logTrace(g_conf.m_logTraceDocProcess, "Adding url=%s", key.c_str());
|
||||
docItem->m_xmlDoc->set4(&sreq, nullptr, "main", nullptr, 0);
|
||||
} else {
|
||||
int64_t docId = strtoll(key.c_str(), nullptr, 10);
|
||||
|
||||
if (docId == 0) {
|
||||
// ignore invalid docId
|
||||
return false;
|
||||
}
|
||||
|
||||
logTrace(g_conf.m_logTraceDocProcess, "Adding docid=%" PRId64, docId);
|
||||
docItem->m_xmlDoc->set3(docId, "main", 0);
|
||||
}
|
||||
|
||||
updateXmldoc(docItem->m_xmlDoc);
|
||||
docItem->m_xmlDoc->setCallback(docItem, processedDoc);
|
||||
|
||||
addPendingDoc(docItem);
|
||||
s_docProcessDocThreadQueue.addItem(docItem);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void DocProcess::processFile(void *item) {
|
||||
DocProcessFileItem *fileItem = static_cast<DocProcessFileItem*>(item);
|
||||
|
||||
@ -253,35 +285,9 @@ void DocProcess::processFile(void *item) {
|
||||
std::string key = fileItem->m_docProcess->m_isUrl ? line : line.substr(0, line.find('|'));
|
||||
|
||||
if (foundLastPos) {
|
||||
logTrace(g_conf.m_logTraceDocProcess, "Processing key='%s'", key.c_str());
|
||||
DocProcessDocItem *docItem = fileItem->m_docProcess->createDocItem(fileItem->m_docProcess, key, currentFilePos);
|
||||
|
||||
if (fileItem->m_docProcess->m_isUrl) {
|
||||
SpiderRequest sreq;
|
||||
sreq.setFromAddUrl(key.c_str());
|
||||
sreq.m_isAddUrl = 0;
|
||||
|
||||
logTrace(g_conf.m_logTraceDocProcess, "Adding url=%s", key.c_str());
|
||||
docItem->m_xmlDoc->set4(&sreq, nullptr, "main", nullptr, 0);
|
||||
} else {
|
||||
int64_t docId = strtoll(line.c_str(), nullptr, 10);
|
||||
|
||||
if (docId == 0) {
|
||||
// ignore invalid docId
|
||||
continue;
|
||||
}
|
||||
|
||||
logTrace(g_conf.m_logTraceDocProcess, "Adding docid=%" PRId64, docId);
|
||||
docItem->m_xmlDoc->set3(docId, "main", 0);
|
||||
if (fileItem->m_docProcess->addKey(key, currentFilePos)) {
|
||||
fileItem->m_docProcess->waitPendingDocCount(10);
|
||||
}
|
||||
|
||||
docItem->m_docProcess->updateXmldoc(docItem->m_xmlDoc);
|
||||
docItem->m_xmlDoc->setCallback(docItem, processedDoc);
|
||||
|
||||
fileItem->m_docProcess->addPendingDoc(docItem);
|
||||
s_docProcessDocThreadQueue.addItem(docItem);
|
||||
|
||||
fileItem->m_docProcess->waitPendingDocCount(10);
|
||||
} else if (lastPosKey.compare(key) == 0) {
|
||||
foundLastPos = true;
|
||||
}
|
||||
|
@ -49,18 +49,22 @@ public:
|
||||
virtual void updateXmldoc(XmlDoc *xmlDoc) = 0;
|
||||
virtual void processDocItem(DocProcessDocItem *docItem) = 0;
|
||||
|
||||
bool addKey(const std::string &key, int64_t currentFilePos = -1);
|
||||
|
||||
static void reload(int /*fd*/, void */*state*/);
|
||||
|
||||
static void processFile(void *item);
|
||||
static void processDoc(void *item);
|
||||
static void processedDoc(void *state);
|
||||
|
||||
void waitPendingDocCount(unsigned maxCount);
|
||||
|
||||
protected:
|
||||
void removePendingDoc(DocProcessDocItem *docItem);
|
||||
|
||||
bool m_isUrl;
|
||||
|
||||
private:
|
||||
void waitPendingDocCount(unsigned maxCount);
|
||||
void addPendingDoc(DocProcessDocItem *docItem);
|
||||
|
||||
const char *m_filename;
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include "XmlDoc.h"
|
||||
#include "Msg0.h"
|
||||
#include "RdbList.h"
|
||||
#include "Conf.h"
|
||||
|
||||
DocRebuild g_docRebuild("docrebuild.txt", false);
|
||||
DocRebuild g_docRebuildUrl("docrebuildurl.txt", true);
|
||||
@ -49,15 +50,22 @@ DocProcessDocItem* DocRebuild::createDocItem(DocProcess *docProcess, const std::
|
||||
|
||||
/// Flag the XmlDoc as part of a rebuild run: sets both the doc-rebuild and
/// the recycle-content flags before the document is queued for processing.
void DocRebuild::updateXmldoc(XmlDoc *xmlDoc) {
	// the two flags are independent of each other
	xmlDoc->m_docRebuild = true;
	xmlDoc->m_recycleContent = true;
}
|
||||
|
||||
void DocRebuild::processDocItem(DocProcessDocItem *docItem) {
|
||||
DocRebuildDocItem *rebuildDocItem = dynamic_cast<DocRebuildDocItem*>(docItem);
|
||||
if (rebuildDocItem == nullptr) {
|
||||
gbshutdownLogicError();
|
||||
}
|
||||
|
||||
XmlDoc *xmlDoc = rebuildDocItem->m_xmlDoc;
|
||||
|
||||
// set callback
|
||||
xmlDoc->m_masterLoop = processedDoc;
|
||||
xmlDoc->m_masterState = rebuildDocItem;
|
||||
if (xmlDoc->m_masterLoop == nullptr) {
|
||||
xmlDoc->m_masterLoop = processedDoc;
|
||||
xmlDoc->m_masterState = rebuildDocItem;
|
||||
}
|
||||
|
||||
// prepare
|
||||
char **oldTitleRec = xmlDoc->getOldTitleRec();
|
||||
@ -80,11 +88,12 @@ void DocRebuild::processDocItem(DocProcessDocItem *docItem) {
|
||||
return;
|
||||
}
|
||||
|
||||
// reset callback
|
||||
xmlDoc->m_masterLoop = nullptr;
|
||||
xmlDoc->m_masterState = nullptr;
|
||||
XmlDoc **oldXmlDoc = xmlDoc->getOldXmlDoc();
|
||||
if (!oldXmlDoc || oldXmlDoc == (XmlDoc**)-1) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!xmlDoc->set2(*oldTitleRec, -1, "main", nullptr, MAX_NICENESS)) {
|
||||
if (!xmlDoc->m_contentValid && !xmlDoc->set2(*oldTitleRec, -1, "main", nullptr, MAX_NICENESS)) {
|
||||
xmlDoc->m_indexCode = ECORRUPTDATA;
|
||||
xmlDoc->m_indexCodeValid = true;
|
||||
|
||||
@ -100,8 +109,8 @@ void DocRebuild::processDocItem(DocProcessDocItem *docItem) {
|
||||
|
||||
int32_t *firstIp = xmlDoc->getFirstIp();
|
||||
if (!firstIp || firstIp == (int32_t*)-1) {
|
||||
// we must not be blocked/invalid at this point
|
||||
gbshutdownLogicError();
|
||||
// blocked
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t *siteNumInLinks = xmlDoc->getSiteNumInlinks();
|
||||
@ -114,6 +123,47 @@ void DocRebuild::processDocItem(DocProcessDocItem *docItem) {
|
||||
if (xmlDoc->m_masterLoop == processedDoc) {
|
||||
xmlDoc->m_masterLoop = nullptr;
|
||||
xmlDoc->m_masterState = nullptr;
|
||||
|
||||
// logic copied from Repair.cpp
|
||||
|
||||
// rebuild the title rec! otherwise we re-add the old one
|
||||
xmlDoc->m_titleRecBufValid = false;
|
||||
xmlDoc->m_titleRecBuf.purge();
|
||||
|
||||
// recompute site, no more domain sites allowed
|
||||
xmlDoc->m_siteValid = false;
|
||||
xmlDoc->ptr_site = nullptr;
|
||||
xmlDoc->size_site = 0;
|
||||
|
||||
// recalculate the sitenuminlinks
|
||||
xmlDoc->m_siteNumInlinksValid = false;
|
||||
|
||||
// recalculate the langid
|
||||
xmlDoc->m_langIdValid = false;
|
||||
|
||||
// recalculate and store the link info
|
||||
xmlDoc->m_linkInfo1Valid = false;
|
||||
xmlDoc->ptr_linkInfo1 = nullptr;
|
||||
xmlDoc->size_linkInfo1 = 0;
|
||||
|
||||
// re-get the tag rec from tagdb
|
||||
xmlDoc->m_tagRecValid = false;
|
||||
xmlDoc->m_tagRecDataValid = false;
|
||||
|
||||
xmlDoc->m_priority = -1;
|
||||
xmlDoc->m_priorityValid = true;
|
||||
|
||||
xmlDoc->m_contentValid = true;
|
||||
xmlDoc->m_content = xmlDoc->ptr_utf8Content;
|
||||
xmlDoc->m_contentLen = xmlDoc->size_utf8Content - 1;
|
||||
|
||||
// update to latest version
|
||||
#ifndef PRIVACORE_SAFE_VERSION
|
||||
xmlDoc->m_version = g_conf.m_titleRecVersion;
|
||||
#else
|
||||
xmlDoc->m_version = TITLEREC_CURRENT_VERSION;
|
||||
#endif
|
||||
xmlDoc->m_versionValid = true;
|
||||
}
|
||||
|
||||
// set spider request
|
||||
|
@ -20,6 +20,8 @@
|
||||
#include "XmlDoc.h"
|
||||
#include "Msg0.h"
|
||||
#include "RdbList.h"
|
||||
#include "Conf.h"
|
||||
#include "TitleRecVersion.h"
|
||||
|
||||
DocReindex g_docReindex("docreindex.txt", false);
|
||||
DocReindex g_docReindexUrl("docreindexurl.txt", true);
|
||||
@ -49,10 +51,22 @@ DocProcessDocItem* DocReindex::createDocItem(DocProcess *docProcess, const std::
|
||||
|
||||
/// Prepare an XmlDoc for reindexing: invalidates its index code and stamps it
/// with the title rec version to write.
void DocReindex::updateXmldoc(XmlDoc *xmlDoc) {
	xmlDoc->m_indexCodeValid = false;

	// Safe builds always use the compiled-in version; other builds take the
	// version from the configuration (g_conf.m_titleRecVersion).
#ifdef PRIVACORE_SAFE_VERSION
	xmlDoc->m_version = TITLEREC_CURRENT_VERSION;
#else
	xmlDoc->m_version = g_conf.m_titleRecVersion;
#endif
	xmlDoc->m_versionValid = true;
}
|
||||
|
||||
void DocReindex::processDocItem(DocProcessDocItem *docItem) {
|
||||
DocReindexDocItem *reindexDocItem = dynamic_cast<DocReindexDocItem*>(docItem);
|
||||
if (reindexDocItem == nullptr) {
|
||||
gbshutdownLogicError();
|
||||
}
|
||||
|
||||
XmlDoc *xmlDoc = reindexDocItem->m_xmlDoc;
|
||||
|
||||
// set callback
|
||||
|
@ -426,6 +426,7 @@ bool JobScheduler_impl::submit(thread_type_t thread_type, JobEntry &e)
|
||||
case thread_type_unspecified_io: job_queue = &cpu_job_queue; break;
|
||||
case thread_type_generate_thumbnail: job_queue = &external_job_queue; break;
|
||||
case thread_type_config_load: job_queue = &cpu_job_queue; break;
|
||||
case thread_type_page_process: job_queue = &cpu_job_queue; break;
|
||||
default:
|
||||
assert(false);
|
||||
|
||||
|
@ -46,6 +46,7 @@ enum thread_type_t {
|
||||
thread_type_unspecified_io, //until we can be more specific
|
||||
thread_type_generate_thumbnail,
|
||||
thread_type_config_load,
|
||||
thread_type_page_process,
|
||||
};
|
||||
|
||||
|
||||
|
2
Makefile
2
Makefile
@ -25,7 +25,7 @@ OBJS_O0 = \
|
||||
Lang.o Log.o \
|
||||
Mem.o Msg0.o Msg4In.o Msg4Out.o MsgC.o Msg13.o Msg20.o Msg22.o Msg39.o Msg3a.o Msg51.o Msge0.o Msge1.o Multicast.o \
|
||||
Parms.o Pages.o PageAddColl.o PageAddUrl.o PageBasic.o PageCrawlBot.o PageGet.o PageHealthCheck.o PageHosts.o PageInject.o \
|
||||
PageParser.o PagePerf.o PageReindex.o PageResults.o PageRoot.o PageSockets.o PageStats.o PageThreads.o PageTitledb.o PageSpiderdbLookup.o PageSpider.o PageDoledbIPTable.o \
|
||||
PageParser.o PagePerf.o PageReindex.o PageResults.o PageRoot.o PageSockets.o PageStats.o PageThreads.o PageTitledb.o PageSpiderdbLookup.o PageSpider.o PageDoledbIPTable.o PageDocProcess.o \
|
||||
Phrases.o HostFlags.o Process.o Proxy.o Punycode.o \
|
||||
InstanceInfoExchange.o \
|
||||
Query.o \
|
||||
|
@ -2853,7 +2853,7 @@ static LinkInfo *makeLinkInfo(int32_t ip,
|
||||
// get approx # of words in link text
|
||||
int32_t nw = 0;
|
||||
if ( txtLen > 0 )
|
||||
nw = getNumWords(txt,txtLen,TITLEREC_CURRENT_VERSION);
|
||||
nw = getNumWords(txt,txtLen);
|
||||
// store it
|
||||
r->m_linkTextNumWords = nw;
|
||||
|
||||
|
117
PageDocProcess.cpp
Normal file
117
PageDocProcess.cpp
Normal file
@ -0,0 +1,117 @@
|
||||
//
|
||||
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as
|
||||
// published by the Free Software Foundation, either version 3 of the
|
||||
// License, or (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
//
|
||||
// License TL;DR: If you change this file, you must publish your changes.
|
||||
//
|
||||
|
||||
#include "TcpSocket.h"
|
||||
#include "HttpRequest.h"
|
||||
#include "HttpServer.h"
|
||||
#include "Pages.h"
|
||||
#include "GbUtil.h"
|
||||
#include "DocDelete.h"
|
||||
#include "DocRebuild.h"
|
||||
#include "DocReindex.h"
|
||||
#include "JobScheduler.h"
|
||||
|
||||
struct PageDocProcessState {
|
||||
PageDocProcessState(TcpSocket *s, HttpRequest *r, DocProcess *docProcess)
|
||||
: m_s(s)
|
||||
, m_r()
|
||||
, m_docProcess(docProcess) {
|
||||
m_r.copy(r);
|
||||
}
|
||||
|
||||
TcpSocket *m_s;
|
||||
HttpRequest m_r;
|
||||
DocProcess *m_docProcess;
|
||||
};
|
||||
|
||||
void waitPendingDocCountWrapper(void *state) {
|
||||
PageDocProcessState *pageDocProcessState = static_cast<PageDocProcessState*>(state);
|
||||
pageDocProcessState->m_docProcess->waitPendingDocCount(0);
|
||||
}
|
||||
|
||||
void doneWaitPendingDocCountWrapper(void *state, job_exit_t exit_type) {
|
||||
PageDocProcessState *pageDocProcessState = static_cast<PageDocProcessState*>(state);
|
||||
|
||||
if (exit_type != job_exit_normal) {
|
||||
g_httpServer.sendErrorReply(pageDocProcessState->m_s, ECANCELED, "job canceled");
|
||||
return;
|
||||
}
|
||||
|
||||
g_httpServer.sendSuccessReply(pageDocProcessState->m_s, pageDocProcessState->m_r.getReplyFormat());
|
||||
}
|
||||
|
||||
/// HTTP handler for admin/docprocess.
///
/// Request parameters:
///   - "type": one of docdelete, docrebuild, docreindex (case-insensitive).
///   - "key" : a URL (anything starting with "http") or a docId.
///
/// The key is added to the matching DocProcess instance and a job is
/// submitted that waits for the pending documents; the reply is sent from
/// the job's completion callback.
///
/// @return false when the reply will be sent later by the completion
///         callback; otherwise the result of sending the error reply.
bool sendPageDocProcess(TcpSocket *s, HttpRequest *r) {
	int32_t keyLen = 0;
	const char *key = r->getString("key", &keyLen);

	// guard against a null pointer in case the parameter is absent; building a
	// std::string from a null pointer must be avoided
	std::string keyStr;
	if (key && keyLen > 0) {
		keyStr.assign(key, keyLen);
	}

	int32_t typeLen = 0;
	const char *type = r->getString("type", &typeLen);

	if (typeLen == 0) {
		return g_httpServer.sendErrorReply(s, EMISSINGINPUT, "missing parameter type");
	}

	DocProcess *docProcess = nullptr;

	// dispatch on the length first so only plausible names are compared
	switch (typeLen) {
		case 9:
			if (strncasecmp(type, "docdelete", 9) == 0) {
				// docdelete
				if (starts_with(keyStr.c_str(), "http")) {
					docProcess = &g_docDeleteUrl;
				} else {
					docProcess = &g_docDelete;
				}
			}
			break;
		case 10:
			if (strncasecmp(type, "docrebuild", 10) == 0) {
				// docrebuild
				if (starts_with(keyStr.c_str(), "http")) {
					docProcess = &g_docRebuildUrl;
				} else {
					docProcess = &g_docRebuild;
				}
			} else if (strncasecmp(type, "docreindex", 10) == 0) {
				// docreindex
				if (starts_with(keyStr.c_str(), "http")) {
					docProcess = &g_docReindexUrl;
				} else {
					docProcess = &g_docReindex;
				}
			}
			break; // was an unintended fall-through into default
		default:
			break;
	}

	if (!docProcess) {
		return g_httpServer.sendErrorReply(s, EMISSINGINPUT, "invalid parameter type (docdelete, docrebuild, docreindex)");
	}

	// addKey() returns false when it rejects the key (e.g. a docId that parses
	// to 0); report that instead of waiting and then claiming success
	if (!docProcess->addKey(keyStr)) {
		return g_httpServer.sendErrorReply(s, EMISSINGINPUT, "invalid parameter key");
	}

	PageDocProcessState *state = new PageDocProcessState(s, r, docProcess);
	if (!g_jobScheduler.submit(waitPendingDocCountWrapper, doneWaitPendingDocCountWrapper, state, thread_type_page_process, 0)) {
		// unable to submit page; the completion callback will not get the
		// state, so release it here instead of leaking it
		delete state;
		return g_httpServer.sendErrorReply(s, EBADENGINEER, "unable to submit job");
	}

	// reply is sent by doneWaitPendingDocCountWrapper
	return false;
}
|
@ -66,18 +66,25 @@ static void generatePageJSON(std::vector<uint32_t> &doleips, const char *coll, S
|
||||
}
|
||||
|
||||
|
||||
static bool respondWithError(TcpSocket *s, HttpRequest *r, const char *msg) {
|
||||
static bool respondWithError(TcpSocket *s, HttpRequest *r, int32_t error, const char *errmsg) {
|
||||
SafeBuf sb;
|
||||
const char *contentType = NULL;
|
||||
switch(r->getReplyFormat()) {
|
||||
case FORMAT_HTML:
|
||||
g_pages.printAdminTop(&sb, s, r, NULL);
|
||||
sb.safePrintf("<p>%s</p>", msg);
|
||||
sb.safePrintf("<p>%s</p>", errmsg);
|
||||
g_pages.printAdminBottom2(&sb);
|
||||
contentType = "text/html";
|
||||
break;
|
||||
case FORMAT_JSON:
|
||||
sb.safePrintf("{error_message:\"%s\"}",msg); //todo: safe encode
|
||||
sb.safePrintf("{\"response\":{\n"
|
||||
"\t\"statusCode\":%" PRId32",\n"
|
||||
"\t\"statusMsg\":\"", error);
|
||||
sb.jsonEncode(errmsg);
|
||||
sb.safePrintf("\"\n"
|
||||
"}\n"
|
||||
"}\n");
|
||||
contentType = "application/json";
|
||||
contentType = "application/json";
|
||||
break;
|
||||
default:
|
||||
@ -94,12 +101,12 @@ bool sendPageDoledbIPTable(TcpSocket *s, HttpRequest *r) {
|
||||
const char *coll = r->getString("c", NULL, NULL);
|
||||
CollectionRec *cr = g_collectiondb.getRec(coll);
|
||||
if(!cr) {
|
||||
return respondWithError(s, r, "No collection specified");
|
||||
return respondWithError(s, r, ENOCOLLREC, "No collection specified");
|
||||
}
|
||||
|
||||
SpiderColl *spiderColl = cr->m_spiderColl;
|
||||
if(!spiderColl) {
|
||||
return respondWithError(s, r, "No spider-collection (?)");
|
||||
return respondWithError(s, r, EBADENGINEER, "No spider-collection (?)");
|
||||
}
|
||||
|
||||
std::vector<uint32_t> doleips = spiderColl->getDoledbIpTable();
|
||||
|
@ -214,16 +214,10 @@ bool sendErrorReply ( void *state , int32_t err ) {
|
||||
// get the tcp socket from the state
|
||||
TcpSocket *s = st->m_socket;
|
||||
|
||||
char tmp [ 1024*32 ] ;
|
||||
sprintf ( tmp , "%s",
|
||||
mstrerror(g_errno));
|
||||
// nuke state2
|
||||
mdelete ( st , sizeof(State2) , "PageGet1" );
|
||||
delete (st);
|
||||
// erase g_errno for sending
|
||||
//g_errno = 0;
|
||||
// . now encapsulate it in html head/tail and send it off
|
||||
//return g_httpServer.sendDynamicPage ( s , tmp , strlen(tmp) );
|
||||
|
||||
return g_httpServer.sendErrorReply ( s, err, mstrerror(err) );
|
||||
}
|
||||
|
||||
|
@ -193,7 +193,7 @@ static bool gotSpiderRecs2(State *st) {
|
||||
}
|
||||
|
||||
|
||||
static bool respondWithError(State *st, const char *msg) {
|
||||
static bool respondWithError(State *st, int32_t error, const char *errmsg) {
|
||||
// get the socket
|
||||
TcpSocket *s = st->m_socket;
|
||||
|
||||
@ -202,12 +202,18 @@ static bool respondWithError(State *st, const char *msg) {
|
||||
switch(st->m_r.getReplyFormat()) {
|
||||
case FORMAT_HTML:
|
||||
g_pages.printAdminTop(&sb, s, &st->m_r, NULL);
|
||||
sb.safePrintf("<p>%s</p>", msg);
|
||||
sb.safePrintf("<p>%s</p>", errmsg);
|
||||
g_pages.printAdminBottom2(&sb);
|
||||
contentType = "text/html";
|
||||
break;
|
||||
case FORMAT_JSON:
|
||||
sb.safePrintf("{error_message:\"%s\"}", msg); //todo: safe encode
|
||||
sb.safePrintf("{\"response\":{\n"
|
||||
"\t\"statusCode\":%" PRId32",\n"
|
||||
"\t\"statusMsg\":\"", error);
|
||||
sb.jsonEncode(errmsg);
|
||||
sb.safePrintf("\"\n"
|
||||
"}\n"
|
||||
"}\n");
|
||||
contentType = "application/json";
|
||||
break;
|
||||
default:
|
||||
@ -425,7 +431,7 @@ static bool sendResult(State *st) {
|
||||
sb.reserve2x ( 32768 );
|
||||
|
||||
if(g_errno) {
|
||||
return respondWithError(st, mstrerror(g_errno));
|
||||
return respondWithError(st, g_errno, mstrerror(g_errno));
|
||||
}
|
||||
|
||||
int32_t shardNum = -1;
|
||||
|
@ -33,6 +33,7 @@ static const char *thread_type_name(thread_type_t tt) {
|
||||
case thread_type_unspecified_io: return "unspecified IO";
|
||||
case thread_type_generate_thumbnail: return "generate-thumbnail";
|
||||
case thread_type_config_load: return "config-load";
|
||||
case thread_type_page_process: return "page-process";
|
||||
default: return "?";
|
||||
}
|
||||
}
|
||||
|
55
Pages.cpp
55
Pages.cpp
@ -233,6 +233,11 @@ static WebPage s_pages[] = {
|
||||
sendPageParser,
|
||||
PG_NOAPI|PG_COLLADMIN|PG_ACTIVE},
|
||||
|
||||
{ PAGE_DOCPROCESS, "admin/docprocess", 0, "DocProcess", 0, page_method_t::page_method_get,
|
||||
"Various doc process methods",
|
||||
sendPageDocProcess,
|
||||
PG_NOAPI|PG_MASTERADMIN|PG_ACTIVE},
|
||||
|
||||
{ PAGE_SITEDB , "admin/tagdb" , 0 , "Tagdb" , 0, page_method_t::page_method_post_url,
|
||||
"add/remove/get tags for sites/urls",
|
||||
sendPageTagdb,
|
||||
@ -862,55 +867,6 @@ bool printGigabotAdvice(SafeBuf *sb,
|
||||
return true;
|
||||
}
|
||||
|
||||
void Pages::printFormTop( SafeBuf *sb, HttpRequest *r ) {
|
||||
int32_t page = getDynamicPageNumber ( r );
|
||||
|
||||
if( page < 0 ) {
|
||||
logError("getDynamicPageNumber returned negative index!");
|
||||
return;
|
||||
}
|
||||
|
||||
// . the form
|
||||
// . we cannot use the GET method if there is more than a few k of
|
||||
// parameters, like in the case of the Search Controls page. The
|
||||
// browser simply will not send the request if it is that big.
|
||||
switch(s_pages[page].m_page_method) {
|
||||
case page_method_t::page_method_post_form:
|
||||
sb->safePrintf ("<form name=\"SubmitInput\" method=\"post\" "
|
||||
// we need this for <input type=file> tags
|
||||
"ENCTYPE=\"multipart/form-data\" "
|
||||
"action=\"/%s\">\n",
|
||||
s_pages[page].m_filename);
|
||||
case page_method_t::page_method_post_url:
|
||||
sb->safePrintf ("<form name=\"SubmitInput\" method=\"post\" "
|
||||
"action=\"/%s\">\n",
|
||||
s_pages[page].m_filename);
|
||||
case page_method_t::page_method_get:
|
||||
sb->safePrintf ("<form name=\"SubmitInput\" method=\"get\" "
|
||||
"action=\"/%s\">\n",
|
||||
s_pages[page].m_filename);
|
||||
}
|
||||
}
|
||||
|
||||
void Pages::printFormData( SafeBuf *sb, TcpSocket *s, HttpRequest *r ) {
|
||||
|
||||
int32_t page = getDynamicPageNumber ( r );
|
||||
const char *coll = r->getString ( "c" );
|
||||
if ( ! coll ) coll = "";
|
||||
sb->safePrintf ( "<input type=\"hidden\" name=\"c\" "
|
||||
"value=\"%s\" />\n", coll);
|
||||
|
||||
// should any changes be broadcasted to all hosts?
|
||||
sb->safePrintf ("<input type=\"hidden\" name=\"cast\" value=\"%" PRId32"\" "
|
||||
"/>\n",
|
||||
page >= 0 ? (int32_t)s_pages[page].m_cast : 0);
|
||||
|
||||
}
|
||||
|
||||
bool Pages::printAdminBottom ( SafeBuf *sb, HttpRequest *r ) {
|
||||
return printAdminBottom ( sb );
|
||||
}
|
||||
|
||||
bool Pages::printSubmit ( SafeBuf *sb ) {
|
||||
// update button
|
||||
return sb->safePrintf (
|
||||
@ -1124,6 +1080,7 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
|
||||
if ( i == PAGE_SEARCHBOX ) continue;
|
||||
if ( i == PAGE_TITLEDB ) continue;
|
||||
if ( i == PAGE_HEALTHCHECK ) continue;
|
||||
if ( i == PAGE_DOCPROCESS ) continue;
|
||||
|
||||
|
||||
|
||||
|
8
Pages.h
8
Pages.h
@ -69,10 +69,9 @@ bool sendPageProfiler ( TcpSocket *s , HttpRequest *r );
|
||||
bool sendPageThreads ( TcpSocket *s , HttpRequest *r );
|
||||
bool sendPageAPI ( TcpSocket *s , HttpRequest *r );
|
||||
bool sendPageHelp ( TcpSocket *s , HttpRequest *r );
|
||||
bool sendPageGraph ( TcpSocket *s , HttpRequest *r );
|
||||
bool sendPageHealthCheck ( TcpSocket *sock , HttpRequest *hr ) ;
|
||||
bool sendPageDefaultCss(TcpSocket *s, HttpRequest *r);
|
||||
|
||||
bool sendPageDocProcess(TcpSocket *s, HttpRequest *r);
|
||||
|
||||
enum class page_method_t {
|
||||
page_method_get = 1, //plain http get
|
||||
@ -137,10 +136,6 @@ class Pages {
|
||||
const char *qs = NULL,
|
||||
const char* bodyJavascript = "" );
|
||||
|
||||
void printFormTop( SafeBuf *sb, HttpRequest *r );
|
||||
void printFormData( SafeBuf *sb, TcpSocket *s, HttpRequest *r );
|
||||
|
||||
bool printAdminBottom ( SafeBuf *sb, HttpRequest *r );
|
||||
bool printAdminBottom ( SafeBuf *sb);
|
||||
bool printAdminBottom2 ( SafeBuf *sb);
|
||||
bool printTail ( SafeBuf* sb, bool isLocal );
|
||||
@ -222,6 +217,7 @@ enum {
|
||||
PAGE_DOLEIPTABLE ,
|
||||
PAGE_SEARCHBOX ,
|
||||
PAGE_PARSER ,
|
||||
PAGE_DOCPROCESS ,
|
||||
PAGE_SITEDB ,
|
||||
PAGE_HEALTHCHECK ,
|
||||
PAGE_NONE };
|
||||
|
12
Parms.cpp
12
Parms.cpp
@ -5390,6 +5390,18 @@ void Parms::init ( ) {
|
||||
m->m_page = PAGE_MASTER;
|
||||
m++;
|
||||
|
||||
#ifndef PRIVACORE_SAFE_VERSION
|
||||
m->m_title = "TitleRec version number";
|
||||
m->m_desc = "Override TitleRec version number (for testing only!)";
|
||||
m->m_cgi = "trvn";
|
||||
simple_m_set(Conf,m_titleRecVersion);
|
||||
m->m_def = TITLEREC_CURRENT_VERSION_STR;
|
||||
m->m_group = false;
|
||||
m->m_flags = PF_HIDDEN | PF_NOSAVE;
|
||||
m->m_page = PAGE_MASTER;
|
||||
m++;
|
||||
#endif
|
||||
|
||||
m->m_title = "use shotgun";
|
||||
m->m_desc = "If enabled, all servers must have two gigabit "
|
||||
"ethernet ports hooked up and Gigablast will round robin "
|
||||
|
@ -2659,6 +2659,14 @@ const struct QueryField g_fields[] = {
|
||||
NULL,
|
||||
QTF_DUP },
|
||||
|
||||
{"sitenoindex",
|
||||
FIELD_SITE,
|
||||
true,
|
||||
"sitenoindex:example.com",
|
||||
"Matches all documents on the example.com domain that in not indexed.",
|
||||
NULL,
|
||||
0 },
|
||||
|
||||
{"ip",
|
||||
FIELD_IP,
|
||||
true,
|
||||
|
45
Tagdb.cpp
45
Tagdb.cpp
@ -346,6 +346,37 @@ bool Tag::printToBufAsXml(SafeBuf *sb) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Append this tag to sb as one JSON object (name, user, timestamp, ip,
/// value) followed by ",\n"; the caller is expected to strip the trailing
/// separator after the last tag.
///
/// @return false if printDataToBuf() fails (the object is then left
///         unterminated in sb), true otherwise.
bool Tag::printToBufAsJson(SafeBuf *sb) const {
	// opening brace and tag name
	sb->safePrintf("\t{\n"
	               "\t\t\"name\": \"");
	sb->jsonEncode(getTagStrFromType(m_type));
	sb->safePrintf("\",\n"
	               "\t\t\"user\": \"");

	// user that added the tag
	sb->jsonEncode(getUser());
	sb->safePrintf("\",\n");

	// date when this tag was added
	sb->safePrintf("\t\t\"timestamp\": %" PRId32",\n", m_timestamp);

	// ip the tag was added from
	char ipText[16];
	sb->safePrintf("\t\t\"ip\": \"");
	sb->jsonEncode(iptoa(m_ip, ipText));
	sb->safePrintf("\",\n"
	               "\t\t\"value\": \"");

	// the tag data itself (m_data)
	const bool dataPrinted = printDataToBuf(sb);
	if (!dataPrinted) {
		return false;
	}

	sb->safePrintf("\"\n"
	               "\t},\n");
	return true;
}
|
||||
|
||||
bool Tag::printToBufAsHtml(SafeBuf *sb, const char *prefix) const {
|
||||
// print the tagname
|
||||
const char *str = getTagStrFromType ( m_type );
|
||||
@ -847,6 +878,20 @@ bool TagRec::printToBufAsXml ( SafeBuf *sb ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Append all tags of this record to sb as a JSON array member named "tag".
/// Tags of type TT_DUP are skipped. Always returns true.
bool TagRec::printToBufAsJson ( SafeBuf *sb ) {
	sb->safePrintf("\t\"tag\": [\n");

	Tag *cur = getFirstTag();
	while ( cur ) {
		if ( cur->m_type != TT_DUP ) {
			cur->printToBufAsJson ( sb );
		}
		cur = getNextTag ( cur );
	}

	// strip the ",\n" separator left behind by the last printed tag
	sb->removeLastChar('\n');
	sb->removeLastChar(',');

	sb->safePrintf("\t]\n");
	return true;
}
|
||||
|
||||
bool TagRec::printToBufAsHtml ( SafeBuf *sb , const char *prefix ) {
|
||||
Tag *tag = getFirstTag();
|
||||
for ( ; tag ; tag = getNextTag ( tag ) )
|
||||
|
2
Tagdb.h
2
Tagdb.h
@ -34,6 +34,7 @@ class Tag {
|
||||
bool printToBuf (SafeBuf *sb) const;
|
||||
bool printToBufAsAddRequest(SafeBuf *sb) const;
|
||||
bool printToBufAsXml (SafeBuf *sb) const;
|
||||
bool printToBufAsJson (SafeBuf *sb) const;
|
||||
bool printToBufAsHtml (SafeBuf *sb, const char *prefix) const;
|
||||
bool printToBufAsTagVector (SafeBuf *sb) const;
|
||||
// just print the m_data...
|
||||
@ -116,6 +117,7 @@ public:
|
||||
bool printToBuf ( SafeBuf *sb );
|
||||
bool printToBufAsAddRequest ( SafeBuf *sb );
|
||||
bool printToBufAsXml ( SafeBuf *sb );
|
||||
bool printToBufAsJson ( SafeBuf *sb );
|
||||
bool printToBufAsHtml ( SafeBuf *sb , const char *prefix );
|
||||
bool printToBufAsTagVector ( SafeBuf *sb );
|
||||
|
||||
|
@ -1,6 +1,11 @@
|
||||
#ifndef GB_TITLERECVERSION_H
|
||||
#define GB_TITLERECVERSION_H
|
||||
|
||||
#ifndef STRINGIFY
|
||||
#define STRINGIFY(x) #x
|
||||
#define TO_STRING(x) STRINGIFY(x)
|
||||
#endif
|
||||
|
||||
// Starting version when Gigablast was open-sourced
|
||||
//#define TITLEREC_CURRENT_VERSION 120
|
||||
|
||||
@ -22,6 +27,11 @@
|
||||
//#define TITLEREC_CURRENT_VERSION 125
|
||||
|
||||
// new adult detection
|
||||
#define TITLEREC_CURRENT_VERSION 126
|
||||
//#define TITLEREC_CURRENT_VERSION 126
|
||||
|
||||
// handle robots meta with noindex, follow
|
||||
#define TITLEREC_CURRENT_VERSION 127
|
||||
|
||||
#define TITLEREC_CURRENT_VERSION_STR TO_STRING(TITLEREC_CURRENT_VERSION)
|
||||
|
||||
#endif // GB_TITLERECVERSION_H
|
||||
|
@ -6,8 +6,10 @@
|
||||
#include "Process.h"
|
||||
#include <string.h>
|
||||
|
||||
#ifndef STRINGIFY
|
||||
#define STRINGIFY(x) #x
|
||||
#define TO_STRING(x) STRINGIFY(x)
|
||||
#endif
|
||||
|
||||
#ifndef GIT_COMMIT_ID
|
||||
#define GIT_COMMIT_ID unknown
|
||||
@ -65,6 +67,3 @@ void printVersion() {
|
||||
fprintf(stdout,"Gigablast Git branch : %s\n", getBranch());
|
||||
fprintf(stdout,"Gigablast Git commit : %s\n", getCommitId());
|
||||
}
|
||||
|
||||
#undef STRINGIFY
|
||||
#undef TO_STRING
|
||||
|
415
XmlDoc.cpp
415
XmlDoc.cpp
@ -385,6 +385,7 @@ void XmlDoc::reset ( ) {
|
||||
m_setTr = false;
|
||||
|
||||
m_recycleContent = false;
|
||||
m_docRebuild = false;
|
||||
m_callback1 = NULL;
|
||||
m_callback2 = NULL;
|
||||
m_state = NULL;
|
||||
@ -748,7 +749,13 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
|
||||
//m_coll = coll;
|
||||
m_pbuf = pbuf;
|
||||
m_niceness = niceness;
|
||||
m_version = TITLEREC_CURRENT_VERSION;
|
||||
|
||||
#ifndef PRIVACORE_SAFE_VERSION
|
||||
m_version = g_conf.m_titleRecVersion;
|
||||
#else
|
||||
m_version = TITLEREC_CURRENT_VERSION;
|
||||
#endif
|
||||
|
||||
m_versionValid = true;
|
||||
|
||||
// this is used to removing the rec from doledb after we spider it
|
||||
@ -2332,7 +2339,15 @@ int32_t *XmlDoc::getIndexCode ( ) {
|
||||
return (int32_t *) ini;
|
||||
}
|
||||
|
||||
if (*ini) {
|
||||
// check meta nofollow
|
||||
bool *inf = getIsNoFollow();
|
||||
if (!inf || inf == (bool*) -1) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, could not getIsNoFollow");
|
||||
return (int32_t *) inf;
|
||||
}
|
||||
|
||||
// meta noindex & nofollow
|
||||
if (*ini && *inf) {
|
||||
if (m_firstUrl.isRoot()) {
|
||||
m_indexCode = EDOCDISALLOWEDROOT;
|
||||
} else {
|
||||
@ -2562,7 +2577,6 @@ int32_t *XmlDoc::getIndexCode ( ) {
|
||||
|
||||
ptr_utf8Content = NULL;
|
||||
size_utf8Content = 0;
|
||||
m_utf8ContentValid = true;
|
||||
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, EDOCNONCANONICAL");
|
||||
return &m_indexCode;
|
||||
@ -2921,8 +2935,12 @@ bool XmlDoc::setTitleRecBuf ( SafeBuf *tbuf, int64_t docId, int64_t uh48 ){
|
||||
// assume could not make one because we were banned or something
|
||||
tbuf->purge(); // m_titleRec = NULL;
|
||||
|
||||
#ifndef PRIVACORE_SAFE_VERSION
|
||||
m_version = g_conf.m_titleRecVersion;
|
||||
#else
|
||||
// start setting members in THIS's header before compression
|
||||
m_version = TITLEREC_CURRENT_VERSION;
|
||||
m_version = TITLEREC_CURRENT_VERSION;
|
||||
#endif
|
||||
|
||||
// set this
|
||||
m_headerSize = (char *)&ptr_firstUrl - (char *)&m_headerSize;
|
||||
@ -3125,7 +3143,6 @@ SafeBuf *XmlDoc::getTitleRecBuf ( ) {
|
||||
|
||||
ptr_utf8Content = NULL;
|
||||
size_utf8Content = 0;
|
||||
m_utf8ContentValid = true;
|
||||
} else {
|
||||
m_titleRecBufValid = true;
|
||||
return &m_titleRecBuf;
|
||||
@ -5723,7 +5740,6 @@ Url **XmlDoc::getRedirUrl() {
|
||||
|
||||
ptr_utf8Content = NULL;
|
||||
size_utf8Content = 0;
|
||||
m_utf8ContentValid = true;
|
||||
|
||||
// mdw: let this path through so contactXmlDoc gets a proper
|
||||
// redirect that we can follow. for the base xml doc at
|
||||
@ -6230,8 +6246,11 @@ SafeBuf *XmlDoc::getTimeAxisUrl ( ) {
|
||||
// from scratch. this loads it from titledb.
|
||||
// . NULL is a valid value (EDOCNOTFOUND) so return a char **
|
||||
char **XmlDoc::getOldTitleRec() {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "BEGIN");
|
||||
|
||||
// if valid return that
|
||||
if ( m_oldTitleRecValid ) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, already valid");
|
||||
return &m_oldTitleRec;
|
||||
}
|
||||
|
||||
@ -6241,6 +6260,7 @@ char **XmlDoc::getOldTitleRec() {
|
||||
if ( m_setFromTitleRec ) {
|
||||
m_oldTitleRecValid = true;
|
||||
m_oldTitleRec = NULL;//m_titleRec;
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, setFromTitleRec");
|
||||
return &m_oldTitleRec;
|
||||
}
|
||||
// sanity check
|
||||
@ -6259,6 +6279,7 @@ char **XmlDoc::getOldTitleRec() {
|
||||
if ( m_isIndexedValid && ! m_isIndexed && m_docIdValid ) {
|
||||
m_oldTitleRec = NULL;
|
||||
m_oldTitleRecValid = true;
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, not indexed");
|
||||
return &m_oldTitleRec;
|
||||
}
|
||||
// sanity check. if we have no url or docid ...
|
||||
@ -6288,6 +6309,7 @@ char **XmlDoc::getOldTitleRec() {
|
||||
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr ) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, no collection");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -6316,6 +6338,7 @@ char **XmlDoc::getOldTitleRec() {
|
||||
m_niceness , // niceness
|
||||
999999 )) {// timeout seconds
|
||||
// return -1 if we blocked
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, blocked");
|
||||
return (char **)-1;
|
||||
}
|
||||
|
||||
@ -6326,9 +6349,12 @@ char **XmlDoc::getOldTitleRec() {
|
||||
|
||||
// error?
|
||||
if ( g_errno ) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, error=%s", mstrerror(g_errno));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END");
|
||||
|
||||
// got it
|
||||
return &m_oldTitleRec;
|
||||
}
|
||||
@ -6619,7 +6645,9 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
|
||||
if ( m_siteNumInlinksValid ) return &m_siteNumInlinks;
|
||||
|
||||
// sanity check
|
||||
if ( m_setFromTitleRec && ! m_useSecondaryRdbs) {g_process.shutdownAbort(true);}
|
||||
if (m_setFromTitleRec && !m_useSecondaryRdbs && !m_docRebuild) {
|
||||
g_process.shutdownAbort(true);
|
||||
}
|
||||
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr ) return NULL;
|
||||
@ -12929,7 +12957,7 @@ char *XmlDoc::getMetaList(bool forDelete) {
|
||||
// . so at least now set all the data members we will need to
|
||||
// serialize into the title rec because we can't be blocking further
|
||||
// down below after we set all the hashtables and XmlDoc::ptr_ stuff
|
||||
if (!m_setFromTitleRec || m_useSecondaryRdbs) {
|
||||
if (!m_setFromTitleRec || m_useSecondaryRdbs || m_docRebuild) {
|
||||
// all member vars should already be valid if set from titlerec
|
||||
char *ptg = prepareToMakeTitleRec();
|
||||
|
||||
@ -14445,7 +14473,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
||||
|
||||
// . only if had old one
|
||||
// . we use this in url filters to set the respider wait time usually
|
||||
if ( od ) {
|
||||
if ( od && !m_recycleContent) {
|
||||
int32_t spideredTime = getSpideredTime();
|
||||
int32_t oldSpideredTime = od->getSpideredTime();
|
||||
float numDays = spideredTime - oldSpideredTime;
|
||||
@ -17634,9 +17662,10 @@ bool XmlDoc::printDocForProCog ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
|
||||
// for some reason sections page blocks forever in browser
|
||||
if ( page != 7 && ! m_printedMenu ) {
|
||||
printFrontPageShell ( sb , "search" , cr , false );
|
||||
if (hr->getReplyFormat() == FORMAT_HTML) {
|
||||
printFrontPageShell(sb, "search", cr, false);
|
||||
}
|
||||
m_printedMenu = true;
|
||||
//printMenu ( sb );
|
||||
}
|
||||
|
||||
|
||||
@ -17741,9 +17770,8 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
const char *es = mstrerror(m_indexCode);
|
||||
if ( ! m_indexCode ) es = mstrerror(g_errno);
|
||||
|
||||
int32_t isXml = hr->getLong("xml",0);
|
||||
|
||||
if ( ! isXml ) printMenu ( sb );
|
||||
char format = hr->getReplyFormat();
|
||||
if ( format == FORMAT_HTML ) printMenu ( sb );
|
||||
|
||||
int32_t shardNum = getShardNumFromDocId ( m_docId );
|
||||
Host *hosts = g_hostdb.getShard ( shardNum );
|
||||
@ -17757,7 +17785,7 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
spiderHostId = g_hostdb.getHostIdWithSpideringEnabled(spiderShardNum, false);
|
||||
}
|
||||
|
||||
if ( ! isXml )
|
||||
if ( format == FORMAT_HTML )
|
||||
sb->safePrintf (
|
||||
"<table cellpadding=3 border=0>\n"
|
||||
|
||||
@ -17776,12 +17804,16 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
"<td>%" PRId32"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr>"
|
||||
"<td width=\"25%%\">title rec version</td>"
|
||||
"<td>%" PRIu16"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr>"
|
||||
"<td>index error code</td>"
|
||||
"<td>%s</td>"
|
||||
"</tr>\n"
|
||||
|
||||
|
||||
"<tr>"
|
||||
"<td>robots.txt allows</td>"
|
||||
"<td>%s</td>"
|
||||
@ -17800,6 +17832,7 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
|
||||
h->m_hostId,
|
||||
spiderHostId,
|
||||
m_version,
|
||||
es,
|
||||
allowed,
|
||||
|
||||
@ -17807,13 +17840,14 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
fu
|
||||
|
||||
);
|
||||
else
|
||||
else if (format == FORMAT_XML)
|
||||
sb->safePrintf (
|
||||
"<?xml version=\"1.0\" "
|
||||
"encoding=\"UTF-8\" ?>\n"
|
||||
"<response>\n"
|
||||
"\t<coll><![CDATA[%s]]></coll>\n"
|
||||
"\t<docId>%" PRId64"</docId>\n"
|
||||
"\t<titleRecVersion>%" PRIu16"</titleRecVersion>\n"
|
||||
"\t<indexError><![CDATA[%s]]></indexError>\n"
|
||||
"\t<robotsTxtAllows>%" PRId32
|
||||
"</robotsTxtAllows>\n"
|
||||
@ -17821,30 +17855,75 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
,
|
||||
cr->m_coll,
|
||||
m_docId ,
|
||||
m_version,
|
||||
es,
|
||||
allowedInt,//(int32_t)m_isAllowed,
|
||||
fu
|
||||
);
|
||||
else if (format == FORMAT_JSON) {
|
||||
sb->safePrintf("{\"response\":{\n");
|
||||
|
||||
sb->safePrintf("\t\"coll\": \"");
|
||||
sb->jsonEncode(cr->m_coll);
|
||||
sb->safePrintf("\",\n");
|
||||
|
||||
sb->safePrintf("\t\"docId\": %" PRIu64",\n", m_docId);
|
||||
sb->safePrintf("\t\"titleRecVersion\": %" PRIu16",\n", m_version);
|
||||
|
||||
sb->safePrintf("\t\"indexError\": \"");
|
||||
sb->jsonEncode(es);
|
||||
sb->safePrintf("\",\n");
|
||||
|
||||
sb->safePrintf("\t\"robotsTxtAllows\": %" PRId32",\n", allowedInt);
|
||||
|
||||
sb->safePrintf("\t\"url\": \"");
|
||||
sb->jsonEncode(fu);
|
||||
sb->safePrintf("\",\n");
|
||||
}
|
||||
|
||||
char *redir = ptr_redirUrl;
|
||||
if ( redir && ! isXml ) {
|
||||
sb->safePrintf(
|
||||
"<tr>"
|
||||
"<td>redir url</td>"
|
||||
"<td><a href=\"%s\">%s</a></td>"
|
||||
"</tr>\n"
|
||||
,redir
|
||||
,redir );
|
||||
if (redir) {
|
||||
switch (format) {
|
||||
case FORMAT_HTML:
|
||||
sb->safePrintf(
|
||||
"<tr>"
|
||||
"<td>redir url</td>"
|
||||
"<td><a href=\"%s\">%s</a></td>"
|
||||
"</tr>\n"
|
||||
,redir
|
||||
,redir );
|
||||
break;
|
||||
case FORMAT_XML:
|
||||
sb->safePrintf("\t<redirectUrl><![CDATA[%s]]></redirectUrl>\n" ,redir );
|
||||
break;
|
||||
case FORMAT_JSON:
|
||||
sb->safePrintf("\t\"redirectUrl\": \"");
|
||||
sb->jsonEncode(redir);
|
||||
sb->safePrintf("\",\n");
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if ( redir ) {
|
||||
sb->safePrintf("\t<redirectUrl><![CDATA[%s]]>"
|
||||
"</redirectUrl>\n" ,redir );
|
||||
}
|
||||
|
||||
|
||||
if ( m_indexCode || g_errno ) {
|
||||
if ( ! isXml ) sb->safePrintf("</table><br>\n");
|
||||
else sb->safePrintf("</response>\n");
|
||||
switch (format) {
|
||||
case FORMAT_HTML:
|
||||
sb->safePrintf("</table><br>\n");
|
||||
break;
|
||||
case FORMAT_XML:
|
||||
sb->safePrintf("</response>\n");
|
||||
break;
|
||||
case FORMAT_JSON:
|
||||
sb->removeLastChar('\n');
|
||||
sb->removeLastChar(',');
|
||||
sb->safePrintf("}\n");
|
||||
sb->safePrintf("}\n");
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -17852,157 +17931,120 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
// must always start with http
|
||||
if ( strncmp ( fu , "http" , 4 ) != 0 ) { g_process.shutdownAbort(true); }
|
||||
|
||||
struct tm tm_buf;
|
||||
char buf[64];
|
||||
time_t ts = (time_t)m_firstIndexedDate;
|
||||
|
||||
if ( ! isXml )
|
||||
sb->safePrintf("<tr><td>first indexed date</td>"
|
||||
"<td>%s UTC</td></tr>\n" ,
|
||||
asctime_r(gmtime_r(&ts,&tm_buf),buf) );
|
||||
else
|
||||
sb->safePrintf("\t<firstIndexedDateUTC>%" PRIu32
|
||||
"</firstIndexedDateUTC>\n",
|
||||
(uint32_t)m_firstIndexedDate );
|
||||
|
||||
ts = m_spideredTime;
|
||||
|
||||
if ( ! isXml )
|
||||
sb->safePrintf("<tr><td>last indexed date</td>"
|
||||
"<td>%s UTC</td></tr>\n" ,
|
||||
asctime_r(gmtime_r(&ts,&tm_buf),buf) );
|
||||
else
|
||||
sb->safePrintf("\t<lastIndexedDateUTC>%" PRIu32
|
||||
"</lastIndexedDateUTC>\n",
|
||||
(uint32_t)m_spideredTime );
|
||||
|
||||
ts = m_outlinksAddedDate;
|
||||
|
||||
if ( ! isXml )
|
||||
sb->safePrintf("<tr><td>outlinks last added date</td>"
|
||||
"<td>%s UTC</td></tr>\n" ,
|
||||
asctime_r(gmtime_r(&ts,&tm_buf),buf) );
|
||||
else
|
||||
sb->safePrintf("\t<outlinksLastAddedUTC>%" PRIu32
|
||||
"</outlinksLastAddedUTC>\n",
|
||||
(uint32_t)m_outlinksAddedDate );
|
||||
|
||||
// hop count
|
||||
if ( ! isXml )
|
||||
sb->safePrintf("<tr><td>hop count</td><td>%" PRId32"</td>"
|
||||
"</tr>\n",
|
||||
(int32_t)m_hopCount);
|
||||
else
|
||||
sb->safePrintf("\t<hopCount>%" PRId32"</hopCount>\n",
|
||||
(int32_t)m_hopCount);
|
||||
|
||||
|
||||
char strLanguage[128];
|
||||
languageToString(m_langId, strLanguage);
|
||||
|
||||
// print tags
|
||||
//SafeBuf tb;
|
||||
int32_t sni = m_siteNumInlinks;
|
||||
|
||||
char ipString[16];
|
||||
iptoa(m_ip,ipString);
|
||||
|
||||
//int32_t sni = info1->getNumGoodInlinks();
|
||||
switch (format) {
|
||||
case FORMAT_HTML: {
|
||||
struct tm tm_buf;
|
||||
char buf[64];
|
||||
|
||||
time_t tlu = info1->getLastUpdated();
|
||||
struct tm *timeStruct3 = gmtime_r(&tlu,&tm_buf);//info1->m_lastUpdated );
|
||||
char tmp3[64];
|
||||
strftime ( tmp3 , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct3 );
|
||||
time_t ts = (time_t)m_firstIndexedDate;
|
||||
sb->safePrintf("<tr><td>first indexed date</td><td>%s UTC</td></tr>\n",
|
||||
asctime_r(gmtime_r(&ts, &tm_buf), buf));
|
||||
|
||||
ts = m_spideredTime;
|
||||
sb->safePrintf("<tr><td>last indexed date</td><td>%s UTC</td></tr>\n",
|
||||
asctime_r(gmtime_r(&ts, &tm_buf), buf));
|
||||
|
||||
if ( ! isXml )
|
||||
sb->safePrintf (
|
||||
"<tr><td>original charset</td><td>%s</td></tr>\n"
|
||||
"<tr><td>adult bit</td><td>%" PRId32"</td></tr>\n"
|
||||
//"<tr><td>is link spam?</td><td>%" PRId32" <b>%s</b></td></tr>\n"
|
||||
"<tr><td>is permalink?</td><td>%" PRId32"</td></tr>\n"
|
||||
"<tr><td>is RSS feed?</td><td>%" PRId32"</td></tr>\n"
|
||||
"<tr><td>ip</td><td><a href=\"/search?q=ip%%3A%s&c=%s&n=100\">"
|
||||
"%s</td></tr>\n"
|
||||
"<tr><td>http status</td><td>%d</td></tr>"
|
||||
"<tr><td>content len</td><td>%" PRId32" bytes</td></tr>\n"
|
||||
"<tr><td>content truncated</td><td>%" PRId32"</td></tr>\n"
|
||||
"<tr><td>content type</td><td>%s</td></tr>\n"
|
||||
"<tr><td>language</td><td>%s</td></tr>\n"
|
||||
"<tr><td>country</td><td>%s</td></tr>\n"
|
||||
ts = m_outlinksAddedDate;
|
||||
sb->safePrintf("<tr><td>outlinks last added date</td><td>%s UTC</td></tr>\n",
|
||||
asctime_r(gmtime_r(&ts, &tm_buf), buf));
|
||||
|
||||
"<tr><td><b>good inlinks to site</b>"
|
||||
"</td><td>%" PRId32"</td></tr>\n"
|
||||
sb->safePrintf("<tr><td>hop count</td><td>%" PRId32"</td></tr>\n", (int32_t)m_hopCount);
|
||||
|
||||
"<tr><td><b>site rank</b></td><td>%" PRId32"</td></tr>\n"
|
||||
sb->safePrintf("<tr><td>original charset</td><td>%s</td></tr>\n", get_charset_str(m_charset));
|
||||
sb->safePrintf("<tr><td>adult bit</td><td>%" PRId32"</td></tr>\n", (int32_t)m_isAdult);
|
||||
sb->safePrintf("<tr><td>is permalink?</td><td>%" PRId32"</td></tr>\n", (int32_t)m_isPermalink);
|
||||
sb->safePrintf("<tr><td>is RSS feed?</td><td>%" PRId32"</td></tr>\n", (int32_t)m_isRSS);
|
||||
sb->safePrintf("<tr><td>ip</td><td><a href=\"/search?q=ip%%3A%s&c=%s&n=100\">%s</td></tr>\n", ipString, cr->m_coll, ipString);
|
||||
sb->safePrintf("<tr><td>http status</td><td>%d</td></tr>", m_httpStatus);
|
||||
sb->safePrintf("<tr><td>content len</td><td>%" PRId32" bytes</td></tr>\n", size_utf8Content - 1);
|
||||
sb->safePrintf("<tr><td>content truncated</td><td>%" PRId32"</td></tr>\n", (int32_t)m_isContentTruncated);
|
||||
sb->safePrintf("<tr><td>content type</td><td>%s</td></tr>\n", g_contentTypeStrings[(int)m_contentType]);
|
||||
sb->safePrintf("<tr><td>language</td><td>%s</td></tr>\n", strLanguage);
|
||||
sb->safePrintf("<tr><td>country</td><td>%s</td></tr>\n", g_countryCode.getName(m_countryId));
|
||||
sb->safePrintf("<tr><td><b>good inlinks to site</b></td><td>%" PRId32"</td></tr>\n", m_siteNumInlinks);
|
||||
sb->safePrintf("<tr><td><b>site rank</b></td><td>%" PRId32"</td></tr>\n", ::getSiteRank(m_siteNumInlinks));
|
||||
sb->safePrintf("<tr><td>good inlinks to page</td><td>%" PRId32"</td></tr>\n", info1->getNumGoodInlinks());
|
||||
|
||||
"<tr><td>good inlinks to page"
|
||||
"</td><td>%" PRId32"</td></tr>\n"
|
||||
time_t tlu = info1->getLastUpdated();
|
||||
struct tm *timeStruct3 = gmtime_r(&tlu,&tm_buf);
|
||||
char tmp3[64];
|
||||
strftime ( tmp3 , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct3 );
|
||||
sb->safePrintf("<tr><td><nobr>page inlinks last computed</nobr></td><td>%s</td></tr>\n", tmp3);
|
||||
|
||||
"<tr><td><nobr>page inlinks last computed</nobr></td>"
|
||||
"<td>%s</td></tr>\n"
|
||||
"</td></tr>\n",
|
||||
get_charset_str(m_charset),
|
||||
(int32_t)m_isAdult,
|
||||
(int32_t)m_isPermalink,
|
||||
(int32_t)m_isRSS,
|
||||
ipString,
|
||||
cr->m_coll,
|
||||
ipString,
|
||||
m_httpStatus,
|
||||
size_utf8Content - 1,
|
||||
(int32_t)m_isContentTruncated,
|
||||
g_contentTypeStrings[(int)m_contentType] ,
|
||||
strLanguage,
|
||||
g_countryCode.getName(m_countryId) ,
|
||||
sni,
|
||||
::getSiteRank(sni),
|
||||
info1->getNumGoodInlinks(),
|
||||
sb->safePrintf("</td></tr>\n");
|
||||
} break;
|
||||
case FORMAT_XML:
|
||||
sb->safePrintf("\t<firstIndexedDateUTC>%" PRIu32"</firstIndexedDateUTC>\n", (uint32_t)m_firstIndexedDate);
|
||||
sb->safePrintf("\t<lastIndexedDateUTC>%" PRIu32"</lastIndexedDateUTC>\n", (uint32_t)m_spideredTime);
|
||||
sb->safePrintf("\t<outlinksLastAddedUTC>%" PRIu32"</outlinksLastAddedUTC>\n", (uint32_t)m_outlinksAddedDate);
|
||||
|
||||
tmp3
|
||||
);
|
||||
else {
|
||||
sb->safePrintf (
|
||||
"\t<charset><![CDATA[%s]]></charset>\n"
|
||||
"\t<isAdult>%" PRId32"</isAdult>\n"
|
||||
"\t<isLinkSpam>%" PRId32"</isLinkSpam>\n"
|
||||
"\t<siteRank>%" PRId32"</siteRank>\n"
|
||||
sb->safePrintf("\t<hopCount>%" PRId32"</hopCount>\n", (int32_t)m_hopCount);
|
||||
|
||||
"\t<numGoodSiteInlinks>%" PRId32"</numGoodSiteInlinks>\n"
|
||||
sb->safePrintf("\t<charset><![CDATA[%s]]></charset>\n", get_charset_str(m_charset));
|
||||
sb->safePrintf("\t<isAdult>%" PRId32"</isAdult>\n", (int32_t)m_isAdult);
|
||||
sb->safePrintf("\t<isLinkSpam>%" PRId32"</isLinkSpam>\n", (int32_t)m_isLinkSpam);
|
||||
sb->safePrintf("\t<siteRank>%" PRId32"</siteRank>\n", ::getSiteRank(m_siteNumInlinks));
|
||||
sb->safePrintf("\t<numGoodSiteInlinks>%" PRId32"</numGoodSiteInlinks>\n", m_siteNumInlinks);
|
||||
sb->safePrintf("\t<numGoodPageInlinks>%" PRId32"</numGoodPageInlinks>\n", info1->getNumGoodInlinks());
|
||||
sb->safePrintf("\t<pageInlinksLastComputed>%" PRId32"</pageInlinksLastComputed>\n", (int32_t)info1->m_lastUpdated);
|
||||
sb->safePrintf("\t<isPermalink>%" PRId32"</isPermalink>\n", (int32_t)m_isPermalink);
|
||||
sb->safePrintf("\t<isRSSFeed>%" PRId32"</isRSSFeed>\n", (int32_t)m_isRSS);
|
||||
sb->safePrintf("\t<ipAddress><![CDATA[%s]]></ipAddress>\n", ipString);
|
||||
sb->safePrintf("\t<httpStatus>%d</httpStatus>", m_httpStatus);
|
||||
sb->safePrintf("\t<contentLenInBytes>%" PRId32"</contentLenInBytes>\n", size_utf8Content - 1);
|
||||
sb->safePrintf("\t<isContentTruncated>%" PRId32"</isContentTruncated>\n", (int32_t)m_isContentTruncated);
|
||||
sb->safePrintf("\t<contentType><![CDATA[%s]]></contentType>\n", g_contentTypeStrings[(int)m_contentType]);
|
||||
sb->safePrintf("\t<language><![CDATA[%s]]></language>\n", strLanguage);
|
||||
sb->safePrintf("\t<country><![CDATA[%s]]></country>\n", g_countryCode.getName(m_countryId));
|
||||
break;
|
||||
case FORMAT_JSON:
|
||||
sb->safePrintf("\t\"firstIndexedDateUTC\": %" PRIu32",\n", m_firstIndexedDate);
|
||||
sb->safePrintf("\t\"lastIndexedDateUTC\": %" PRIu32",\n", m_spideredTime);
|
||||
sb->safePrintf("\t\"outlinksLastAddedUTC\": %" PRIu32",\n", m_outlinksAddedDate);
|
||||
|
||||
"\t<numGoodPageInlinks>%" PRId32"</numGoodPageInlinks>\n"
|
||||
"\t<pageInlinksLastComputed>%" PRId32
|
||||
"</pageInlinksLastComputed>\n"
|
||||
sb->safePrintf("\t\"hopCount\": %" PRId8",\n", m_hopCount);
|
||||
|
||||
,get_charset_str(m_charset)
|
||||
,(int32_t)m_isAdult
|
||||
,(int32_t)m_isLinkSpam
|
||||
,::getSiteRank(sni)
|
||||
,sni
|
||||
sb->safePrintf("\t\"charset\": \"");
|
||||
sb->jsonEncode(get_charset_str(m_charset));
|
||||
sb->safePrintf("\",\n");
|
||||
|
||||
,info1->getNumGoodInlinks()
|
||||
,(int32_t)info1->m_lastUpdated
|
||||
);
|
||||
sb->safePrintf("\t<isPermalink>%" PRId32"</isPermalink>\n"
|
||||
"\t<isRSSFeed>%" PRId32"</isRSSFeed>\n"
|
||||
"\t<ipAddress><![CDATA[%s]]></ipAddress>\n"
|
||||
"\t<httpStatus>%d</httpStatus>"
|
||||
"\t<contentLenInBytes>%" PRId32
|
||||
"</contentLenInBytes>\n"
|
||||
"\t<isContentTruncated>%" PRId32
|
||||
"</isContentTruncated>\n"
|
||||
"\t<contentType><![CDATA[%s]]></contentType>\n"
|
||||
"\t<language><![CDATA[%s]]></language>\n"
|
||||
"\t<country><![CDATA[%s]]></country>\n",
|
||||
(int32_t)m_isPermalink,
|
||||
(int32_t)m_isRSS,
|
||||
ipString,
|
||||
m_httpStatus,
|
||||
size_utf8Content - 1,
|
||||
(int32_t)m_isContentTruncated,
|
||||
g_contentTypeStrings[(int)m_contentType] ,
|
||||
strLanguage,
|
||||
g_countryCode.getName(m_countryId) );
|
||||
sb->safePrintf("\t\"isAdult\": %s,\n", m_isAdult ? "true" : "false");
|
||||
sb->safePrintf("\t\"isLinkSpam\": %s,\n", m_isLinkSpam ? "true" : "false");
|
||||
sb->safePrintf("\t\"siteRank\": %" PRId32",\n", ::getSiteRank(m_siteNumInlinks));
|
||||
sb->safePrintf("\t\"numGoodSiteInlinks\": %" PRId32",\n", m_siteNumInlinks);
|
||||
sb->safePrintf("\t\"numGoodPageInlinks\": %" PRId32",\n", info1->getNumGoodInlinks());
|
||||
sb->safePrintf("\t\"pageInlinksLastComputed\": %" PRId32",\n", info1->m_lastUpdated);
|
||||
sb->safePrintf("\t\"isPermalink\": %s,\n", m_isPermalink ? "true" : "false");
|
||||
sb->safePrintf("\t\"isRSSFeed\": %s,\n", m_isRSS ? "true" : "false");
|
||||
|
||||
sb->safePrintf("\t\"ipAddress\": \"");
|
||||
sb->jsonEncode(ipString);
|
||||
sb->safePrintf("\",\n");
|
||||
|
||||
sb->safePrintf("\t\"httpStatus\": %" PRId16",\n", m_httpStatus);
|
||||
sb->safePrintf("\t\"contentLenInBytes\": %" PRId32",\n", size_utf8Content - 1);
|
||||
sb->safePrintf("\t\"isContentTruncated\": %s,\n", m_isContentTruncated ? "true" : "false");
|
||||
|
||||
sb->safePrintf("\t\"contentType\": \"");
|
||||
sb->jsonEncode(g_contentTypeStrings[(int)m_contentType]);
|
||||
sb->safePrintf("\",\n");
|
||||
|
||||
sb->safePrintf("\t\"language\": \"");
|
||||
sb->jsonEncode(strLanguage);
|
||||
sb->safePrintf("\",\n");
|
||||
|
||||
sb->safePrintf("\t\"country\": \"");
|
||||
sb->jsonEncode(g_countryCode.getName(m_countryId));
|
||||
sb->safePrintf("\",\n");
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
TagRec *ogr = NULL;
|
||||
@ -18011,18 +18053,45 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
// sanity. should be set from titlerec, so no blocking!
|
||||
if ( ! ogr || ogr == (void *)-1 ) { g_process.shutdownAbort(true); }
|
||||
}
|
||||
if ( ogr && ! isXml ) ogr->printToBufAsHtml ( sb , "tag" );
|
||||
else if ( ogr ) ogr->printToBufAsXml ( sb );
|
||||
|
||||
if (ogr) {
|
||||
switch (format) {
|
||||
case FORMAT_HTML:
|
||||
ogr->printToBufAsHtml(sb, "tag");
|
||||
break;
|
||||
case FORMAT_XML:
|
||||
ogr->printToBufAsXml(sb);
|
||||
break;
|
||||
case FORMAT_JSON:
|
||||
ogr->printToBufAsJson(sb);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// show the good inlinks we used when indexing this
|
||||
if ( ! isXml )
|
||||
info1->print(sb,cr->m_coll);
|
||||
if (format == FORMAT_HTML) {
|
||||
info1->print(sb, cr->m_coll);
|
||||
}
|
||||
|
||||
// close the table
|
||||
if ( ! isXml )
|
||||
sb->safePrintf ( "</table></center><br>\n" );
|
||||
else
|
||||
sb->safePrintf("</response>\n");
|
||||
switch (format) {
|
||||
case FORMAT_HTML:
|
||||
sb->safePrintf("</table><br>\n");
|
||||
break;
|
||||
case FORMAT_XML:
|
||||
sb->safePrintf("</response>\n");
|
||||
break;
|
||||
case FORMAT_JSON:
|
||||
sb->removeLastChar('\n');
|
||||
sb->removeLastChar(',');
|
||||
sb->safePrintf("}\n");
|
||||
sb->safePrintf("}\n");
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
1
XmlDoc.h
1
XmlDoc.h
@ -1104,6 +1104,7 @@ public:
|
||||
bool m_contentInjected;
|
||||
|
||||
bool m_recycleContent;
|
||||
bool m_docRebuild;
|
||||
|
||||
char *m_rawUtf8Content;
|
||||
int32_t m_rawUtf8ContentSize;
|
||||
|
@ -372,6 +372,17 @@ char *XmlDoc::hashAll(HashTableX *table) {
|
||||
return (char *)1;
|
||||
}
|
||||
|
||||
bool *ini = getIsNoIndex();
|
||||
if (ini == nullptr || ini == (bool*)-1) {
|
||||
// must not be blocked
|
||||
gbshutdownLogicError();
|
||||
}
|
||||
|
||||
if (*ini && m_version > 126) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, noindex");
|
||||
return (char *)1;
|
||||
}
|
||||
|
||||
if ((size_utf8Content - 1) <= 0) {
|
||||
logTrace(g_conf.m_logTraceXmlDoc, "END, contentLen == 0");
|
||||
return (char *)1;
|
||||
@ -916,50 +927,57 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
|
||||
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
|
||||
return false;
|
||||
|
||||
if( urlOnly )
|
||||
{
|
||||
if (urlOnly) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool *ini = getIsNoIndex();
|
||||
if (ini == nullptr || ini == (bool*)-1) {
|
||||
// must not be blocked
|
||||
gbshutdownLogicError();
|
||||
}
|
||||
|
||||
if ( getUseTimeAxis() ) { // g_conf.m_useTimeAxis ) {
|
||||
if ( getUseTimeAxis() ) {
|
||||
hi.m_prefix = "gbtimeurl";
|
||||
SafeBuf *tau = getTimeAxisUrl();
|
||||
hashSingleTerm ( tau->getBufStart(),tau->length(),&hi);
|
||||
}
|
||||
|
||||
setStatus ( "hashing inurl colon" );
|
||||
char *s = fu->getUrl();
|
||||
int32_t slen = fu->getUrlLen();
|
||||
|
||||
//
|
||||
// HASH inurl: terms
|
||||
//
|
||||
char *s = fu->getUrl ();
|
||||
int32_t slen = fu->getUrlLen();
|
||||
hi.m_prefix = "inurl";
|
||||
if (!*ini || m_version <= 126) {
|
||||
setStatus("hashing inurl colon");
|
||||
|
||||
//
|
||||
// HASH inurl: terms
|
||||
//
|
||||
hi.m_prefix = "inurl";
|
||||
|
||||
// BR 20160114: Skip numbers in urls when doing "inurl:" queries
|
||||
hi.m_hashNumbers = false;
|
||||
hi.m_filterUrlIndexableWords = true;
|
||||
if ( ! hashString ( s,slen, &hi ) ) return false;
|
||||
// BR 20160114: Skip numbers in urls when doing "inurl:" queries
|
||||
hi.m_hashNumbers = false;
|
||||
hi.m_filterUrlIndexableWords = true;
|
||||
if (!hashString(s, slen, &hi)) return false;
|
||||
}
|
||||
|
||||
{
|
||||
setStatus("hashing ip colon");
|
||||
hi.m_hashNumbers = true;
|
||||
hi.m_filterUrlIndexableWords = false;
|
||||
|
||||
setStatus ( "hashing ip colon" );
|
||||
hi.m_hashNumbers = true;
|
||||
hi.m_filterUrlIndexableWords = false;
|
||||
//
|
||||
// HASH ip:a.b.c.d
|
||||
//
|
||||
if (!m_ipValid) { g_process.shutdownAbort(true); }
|
||||
// copy it to save it
|
||||
char ipbuf[64];
|
||||
int32_t iplen = strlen(iptoa(m_ip, ipbuf));
|
||||
hi.m_prefix = "ip";
|
||||
if (!hashSingleTerm(ipbuf, iplen, &hi)) return false;
|
||||
|
||||
//
|
||||
// HASH ip:a.b.c.d
|
||||
//
|
||||
if ( ! m_ipValid ) { g_process.shutdownAbort(true); }
|
||||
// copy it to save it
|
||||
char ipbuf[64];
|
||||
int32_t iplen = strlen(iptoa(m_ip,ipbuf));
|
||||
hi.m_prefix = "ip";
|
||||
if ( ! hashSingleTerm(ipbuf,iplen,&hi) ) return false;
|
||||
|
||||
// . sanity check
|
||||
if ( ! m_siteNumInlinksValid ) { g_process.shutdownAbort(true); }
|
||||
// . sanity check
|
||||
if (!m_siteNumInlinksValid) { g_process.shutdownAbort(true); }
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
@ -1033,9 +1051,12 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
|
||||
*p = '\0';
|
||||
|
||||
// update hash parms
|
||||
hi.m_prefix = "site";
|
||||
// no longer, we just index json now
|
||||
//if ( isStatusDoc ) hi.m_prefix = "site2";
|
||||
if (m_version <= 126) {
|
||||
hi.m_prefix = "site";
|
||||
} else {
|
||||
hi.m_prefix = *ini ? "sitenoindex" : "site";
|
||||
}
|
||||
|
||||
hi.m_hashGroup = HASHGROUP_INURL;
|
||||
|
||||
|
||||
@ -1105,24 +1126,26 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
|
||||
}
|
||||
}
|
||||
|
||||
const char *ext = fu->getExtension();
|
||||
int32_t elen = fu->getExtensionLen();
|
||||
if (!*ini || m_version <= 126) {
|
||||
//
|
||||
// HASH ext: term
|
||||
//
|
||||
// i.e. ext:gif ext:html ext:htm ext:pdf, etc.
|
||||
setStatus("hashing ext colon");
|
||||
// update hash parms
|
||||
hi.m_prefix = "ext";
|
||||
if (!hashSingleTerm(ext, elen, &hi)) return false;
|
||||
}
|
||||
|
||||
//
|
||||
// HASH ext: term
|
||||
//
|
||||
// i.e. ext:gif ext:html ext:htm ext:pdf, etc.
|
||||
setStatus ( "hashing ext colon");
|
||||
const char *ext = fu->getExtension();
|
||||
int32_t elen = fu->getExtensionLen();
|
||||
// update hash parms
|
||||
hi.m_prefix = "ext";
|
||||
if ( ! hashSingleTerm(ext,elen,&hi ) ) return false;
|
||||
|
||||
|
||||
setStatus ( "hashing gbdocid" );
|
||||
hi.m_prefix = "gbdocid";
|
||||
char buf2[32];
|
||||
sprintf(buf2,"%" PRIu64, (uint64_t)m_docId );
|
||||
if ( ! hashSingleTerm(buf2,strlen(buf2),&hi) ) return false;
|
||||
{
|
||||
setStatus("hashing gbdocid");
|
||||
hi.m_prefix = "gbdocid";
|
||||
char buf2[32];
|
||||
sprintf(buf2, "%" PRIu64, (uint64_t)m_docId);
|
||||
if (!hashSingleTerm(buf2, strlen(buf2), &hi)) return false;
|
||||
}
|
||||
|
||||
setStatus ( "hashing SiteGetter terms");
|
||||
|
||||
@ -1180,6 +1203,11 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
|
||||
hi.m_prefix = "urlhash";
|
||||
if ( ! hashString(buf,blen,&hi) ) return false;
|
||||
|
||||
// don't index mid domain or url path for noindex document
|
||||
if (*ini && m_version > 126) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (size_utf8Content - 1 > 0 || m_indexCode == EDOCDISALLOWEDROOT) {
|
||||
setStatus("hashing url mid domain");
|
||||
|
||||
|
@ -306,7 +306,7 @@ const char *strnstrn(const char *haystack, int32_t haystackLen, const char *need
|
||||
}
|
||||
|
||||
// . get the # of words in this string
|
||||
int32_t getNumWords ( char *s , int32_t len, int32_t titleVersion ) {
|
||||
int32_t getNumWords ( char *s , int32_t len ) {
|
||||
|
||||
int32_t wordCount = 0;
|
||||
bool inWord = false;
|
||||
|
@ -66,7 +66,7 @@ int32_t to_lower_utf8 (char *dst , char *dstEnd, const char *src ) ;
|
||||
int32_t to_lower_utf8 (char *dst , char *dstEnd, const char *src, const char *srcEnd) ;
|
||||
|
||||
// . get the # of words in this string
|
||||
int32_t getNumWords ( char *s , int32_t len, int32_t titleVersion ) ;
|
||||
int32_t getNumWords ( char *s , int32_t len ) ;
|
||||
int32_t atol2 ( const char *s, int32_t len ) ;
|
||||
int64_t atoll1 ( const char *s ) ;
|
||||
int64_t atoll2 ( const char *s, int32_t len ) ;
|
||||
|
195
tools/dump_rebuild_noindex.cpp
Normal file
195
tools/dump_rebuild_noindex.cpp
Normal file
@ -0,0 +1,195 @@
|
||||
#include "XmlDoc.h"
|
||||
#include "Collectiondb.h"
|
||||
#include "SpiderCache.h"
|
||||
#include "Titledb.h"
|
||||
#include "Doledb.h"
|
||||
#include "CountryCode.h"
|
||||
#include "Log.h"
|
||||
#include "Conf.h"
|
||||
#include "Mem.h"
|
||||
#include "UrlBlockCheck.h"
|
||||
#include "UrlMatchList.h"
|
||||
#include "WantedChecker.h"
|
||||
#include <libgen.h>
|
||||
#include <algorithm>
|
||||
|
||||
/// Prints the command-line help text to stdout.
///
/// @param argv0 program name as invoked; substituted into the "Usage:" line.
static void print_usage(const char *argv0) {
	// One call with adjacent string literals; the bytes written are the same
	// as printing the four lines separately.
	fprintf(stdout,
	        "Usage: %s [-h] PATH\n"
	        "Dump unwanted titlerec\n"
	        "\n"
	        " -h, --help display this help and exit\n",
	        argv0);
}
|
||||
|
||||
/// Releases the global state that main() set up before the process exits.
///
/// The rdb singletons are reset in the reverse of the order in which main()
/// initialized them, followed by the collection db, the event loop and the
/// WantedChecker library.
static void cleanup() {
	// keep teardown quiet
	g_log.m_disabled = true;

	// rdbs, in reverse order of initialization
	g_linkdb.reset();
	g_clusterdb.reset();
	g_spiderCache.reset();
	g_doledb.reset();
	g_spiderdb.reset();
	g_tagdb.reset();
	g_titledb.reset();
	g_posdb.reset();

	// collection records
	g_collectiondb.reset();

	// event loop
	g_loop.reset();

	// shlib & blacklist checker
	WantedChecker::finalize();
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
print_usage(argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0 ) {
|
||||
print_usage(argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
g_log.m_disabled = true;
|
||||
|
||||
// initialize library
|
||||
g_mem.init();
|
||||
hashinit();
|
||||
|
||||
// current dir
|
||||
char path[PATH_MAX];
|
||||
realpath(argv[1], path);
|
||||
size_t pathLen = strlen(path);
|
||||
if (path[pathLen] != '/') {
|
||||
strcat(path, "/");
|
||||
}
|
||||
|
||||
g_hostdb.init(-1, false, false, true, path);
|
||||
g_conf.init(path);
|
||||
|
||||
ucInit();
|
||||
|
||||
// initialize rdbs
|
||||
g_loop.init();
|
||||
|
||||
g_collectiondb.loadAllCollRecs();
|
||||
|
||||
g_posdb.init();
|
||||
g_titledb.init();
|
||||
g_tagdb.init();
|
||||
g_spiderdb.init();
|
||||
g_doledb.init();
|
||||
g_spiderCache.init();
|
||||
g_clusterdb.init();
|
||||
g_linkdb.init();
|
||||
|
||||
g_collectiondb.addRdbBaseToAllRdbsForEachCollRec();
|
||||
|
||||
g_log.m_disabled = false;
|
||||
g_log.m_logPrefix = false;
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec("main");
|
||||
if (!cr) {
|
||||
logf(LOG_TRACE, "No main collection found");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// initialize shlib & blacklist
|
||||
if (!WantedChecker::initialize()) {
|
||||
fprintf(stderr, "Unable to initialize WantedChecker");
|
||||
return 1;
|
||||
}
|
||||
|
||||
g_urlBlackList.init();
|
||||
g_urlWhiteList.init();
|
||||
|
||||
Msg5 msg5;
|
||||
RdbList list;
|
||||
|
||||
key96_t startKey;
|
||||
startKey.setMin();
|
||||
|
||||
key96_t endKey;
|
||||
endKey.setMax();
|
||||
|
||||
while (msg5.getList(RDB_TITLEDB, cr->m_collnum, &list, &startKey, &endKey, 10485760, true, 0, -1, NULL, NULL, 0, true, -1, false)) {
|
||||
if (list.isEmpty()) {
|
||||
break;
|
||||
}
|
||||
|
||||
for (list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord()) {
|
||||
key96_t key = list.getCurrentKey();
|
||||
int64_t docId = Titledb::getDocIdFromKey(&key);
|
||||
|
||||
XmlDoc xmlDoc;
|
||||
if (!xmlDoc.set2(list.getCurrentRec(), list.getCurrentRecSize(), "main", NULL, 0)) {
|
||||
logf(LOG_TRACE, "Unable to set XmlDoc for docId=%" PRIu64, docId);
|
||||
continue;
|
||||
}
|
||||
|
||||
// extract the url
|
||||
Url *url = xmlDoc.getFirstUrl();
|
||||
const char *reason = NULL;
|
||||
|
||||
if (isUrlUnwanted(*url, &reason)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Url **redirUrlPtr = xmlDoc.getRedirUrl();
|
||||
if (redirUrlPtr && *redirUrlPtr) {
|
||||
Url *redirUrl = *redirUrlPtr;
|
||||
if (isUrlUnwanted(*redirUrl, &reason)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t *contentType = xmlDoc.getContentType();
|
||||
switch (*contentType) {
|
||||
case CT_GIF:
|
||||
case CT_JPG:
|
||||
case CT_PNG:
|
||||
case CT_TIFF:
|
||||
case CT_BMP:
|
||||
case CT_JS:
|
||||
case CT_CSS:
|
||||
case CT_JSON:
|
||||
case CT_IMAGE:
|
||||
case CT_GZ:
|
||||
case CT_ARC:
|
||||
case CT_WARC:
|
||||
continue;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
// check content
|
||||
int32_t contentLen = xmlDoc.size_utf8Content > 0 ? (xmlDoc.size_utf8Content - 1) : 0;
|
||||
if (contentLen > 0) {
|
||||
if (!WantedChecker::check_single_content(url->getUrl(), xmlDoc.ptr_utf8Content, contentLen).wanted) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
bool *ini = xmlDoc.getIsNoIndex();
|
||||
if (*ini) {
|
||||
bool *inf = xmlDoc.getIsNoFollow();
|
||||
if (!*inf) {
|
||||
fprintf(stdout, "%" PRId64"|meta noindex follow|%s\n", docId, url->getUrl());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
startKey = *(key96_t *)list.getLastKey();
|
||||
startKey++;
|
||||
|
||||
// watch out for wrap around
|
||||
if ( startKey < *(key96_t *)list.getLastKey() ) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
cleanup();
|
||||
|
||||
return 0;
|
||||
}
|
@ -178,10 +178,8 @@ int main(int argc, char **argv) {
|
||||
bool *inf = xmlDoc.getIsNoFollow();
|
||||
if (*inf) {
|
||||
fprintf(stdout, "%" PRId64"|meta noindex nofollow|%s\n", docId, url->getUrl());
|
||||
} else {
|
||||
fprintf(stdout, "%" PRId64"|meta noindex follow|%s\n", docId, url->getUrl());
|
||||
continue;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user