Merge branch 'master' into sto

2025-06-27 00:16:07 -04:00 · 2017-12-07 13:23:46 +01:00
parent 331581ff0b 54111ae16c
commit d6e6740143
30 changed files with 866 additions and 342 deletions
--- a/Conf.cpp
+++ b/Conf.cpp
@ -172,6 +172,7 @@ Conf::Conf ( ) {
 	m_useShotgun = false;
 	m_testMem = false;
 	m_doConsistencyTesting = false;
+	m_titleRecVersion = TITLEREC_CURRENT_VERSION;
 	memset(m_spiderUserAgent, 0, sizeof(m_spiderUserAgent));
 	memset(m_spiderBotName, 0, sizeof(m_spiderBotName));
 	m_autoSaveFrequency = 0;
--- a/Conf.h
+++ b/Conf.h
@ -299,6 +299,8 @@ class Conf {
 	bool   m_testMem;
 	bool   m_doConsistencyTesting;

+	int32_t m_titleRecVersion;
+
 	// defaults to "Gigabot/1.0"
 	char m_spiderUserAgent[USERAGENTMAXSIZE];

--- a/DocProcess.cpp
+++ b/DocProcess.cpp
@ -205,7 +205,7 @@ void DocProcess::removePendingDoc(DocProcessDocItem *docItem) {
 		gbshutdownLogicError();
 	}

-	if (it == m_pendingDocItems.begin()) {
+	if (docItem->m_lastPos >= 0 && it == m_pendingDocItems.begin()) {
 		std::ofstream lastPosFile(docItem->m_docProcess->m_lastPosFilename, std::ofstream::out|std::ofstream::trunc);
 		lastPosFile << docItem->m_lastPos << "|" << docItem->m_key << std::endl;
 	}
@ -214,6 +214,38 @@ void DocProcess::removePendingDoc(DocProcessDocItem *docItem) {
 	pthread_cond_signal(&m_pendingDocItemsCond);
 }

+/**
+ * Queue a single key for processing.
+ *
+ * The key is treated as a url when m_isUrl is set, otherwise as a decimal docid.
+ *
+ * @param key            url or docid string
+ * @param currentFilePos position passed on to createDocItem()
+ * @return true if a doc item was created and queued, false if the key was
+ *         rejected (a docid that parses to 0)
+ */
+bool DocProcess::addKey(const std::string &key, int64_t currentFilePos) {
+	logTrace(g_conf.m_logTraceDocProcess, "Processing key='%s'", key.c_str());
+
+	// validate the docid before creating the doc item. doing it afterwards
+	// would leave behind an item that is never queued and never released
+	// when the key is rejected.
+	int64_t docId = 0;
+	if (!m_isUrl) {
+		docId = strtoll(key.c_str(), nullptr, 10);
+
+		if (docId == 0) {
+			// ignore invalid docId
+			return false;
+		}
+	}
+
+	DocProcessDocItem *docItem = createDocItem(this, key, currentFilePos);
+
+	if (m_isUrl) {
+		SpiderRequest sreq;
+		sreq.setFromAddUrl(key.c_str());
+		sreq.m_isAddUrl = 0;
+
+		logTrace(g_conf.m_logTraceDocProcess, "Adding url=%s", key.c_str());
+		docItem->m_xmlDoc->set4(&sreq, nullptr, "main", nullptr, 0);
+	} else {
+		logTrace(g_conf.m_logTraceDocProcess, "Adding docid=%" PRId64, docId);
+		docItem->m_xmlDoc->set3(docId, "main", 0);
+	}
+
+	// let the concrete process adjust the doc, then hook up the completion callback
+	updateXmldoc(docItem->m_xmlDoc);
+	docItem->m_xmlDoc->setCallback(docItem, processedDoc);
+
+	addPendingDoc(docItem);
+	s_docProcessDocThreadQueue.addItem(docItem);
+
+	return true;
+}
+
 void DocProcess::processFile(void *item) {
 	DocProcessFileItem *fileItem = static_cast<DocProcessFileItem*>(item);

@ -253,35 +285,9 @@ void DocProcess::processFile(void *item) {
 		std::string key = fileItem->m_docProcess->m_isUrl ? line : line.substr(0, line.find('|'));

 		if (foundLastPos) {
-			logTrace(g_conf.m_logTraceDocProcess, "Processing key='%s'", key.c_str());
-			DocProcessDocItem *docItem = fileItem->m_docProcess->createDocItem(fileItem->m_docProcess, key, currentFilePos);
-
-			if (fileItem->m_docProcess->m_isUrl) {
-				SpiderRequest sreq;
-				sreq.setFromAddUrl(key.c_str());
-				sreq.m_isAddUrl = 0;
-
-				logTrace(g_conf.m_logTraceDocProcess, "Adding url=%s", key.c_str());
-				docItem->m_xmlDoc->set4(&sreq, nullptr, "main", nullptr, 0);
-			} else {
-				int64_t docId = strtoll(line.c_str(), nullptr, 10);
-
-				if (docId == 0) {
-					// ignore invalid docId
-					continue;
-				}
-
-				logTrace(g_conf.m_logTraceDocProcess, "Adding docid=%" PRId64, docId);
-				docItem->m_xmlDoc->set3(docId, "main", 0);
+			if (fileItem->m_docProcess->addKey(key, currentFilePos)) {
+				fileItem->m_docProcess->waitPendingDocCount(10);
 			}
-
-			docItem->m_docProcess->updateXmldoc(docItem->m_xmlDoc);
-			docItem->m_xmlDoc->setCallback(docItem, processedDoc);
-
-			fileItem->m_docProcess->addPendingDoc(docItem);
-			s_docProcessDocThreadQueue.addItem(docItem);
-
-			fileItem->m_docProcess->waitPendingDocCount(10);
 		} else if (lastPosKey.compare(key) == 0) {
 			foundLastPos = true;
 		}
--- a/DocProcess.h
+++ b/DocProcess.h
@ -49,18 +49,22 @@ public:
 	virtual void updateXmldoc(XmlDoc *xmlDoc) = 0;
 	virtual void processDocItem(DocProcessDocItem *docItem) = 0;

+	bool addKey(const std::string &key, int64_t currentFilePos = -1);
+
 	static void reload(int /*fd*/, void */*state*/);
+
 	static void processFile(void *item);
 	static void processDoc(void *item);
 	static void processedDoc(void *state);

+	void waitPendingDocCount(unsigned maxCount);
+
 protected:
 	void removePendingDoc(DocProcessDocItem *docItem);

 	bool m_isUrl;

 private:
-	void waitPendingDocCount(unsigned maxCount);
 	void addPendingDoc(DocProcessDocItem *docItem);

 	const char *m_filename;
--- a/DocRebuild.cpp
+++ b/DocRebuild.cpp
@ -20,6 +20,7 @@
 #include "XmlDoc.h"
 #include "Msg0.h"
 #include "RdbList.h"
+#include "Conf.h"

 DocRebuild g_docRebuild("docrebuild.txt", false);
 DocRebuild g_docRebuildUrl("docrebuildurl.txt", true);
@ -49,15 +50,22 @@ DocProcessDocItem* DocRebuild::createDocItem(DocProcess *docProcess, const std::

 void DocRebuild::updateXmldoc(XmlDoc *xmlDoc) { // flags xmlDoc for a rebuild run
 	xmlDoc->m_recycleContent = true;
+	xmlDoc->m_docRebuild = true; // read by XmlDoc::getSiteNumInlinks() and XmlDoc::getMetaList()
 }

 void DocRebuild::processDocItem(DocProcessDocItem *docItem) {
 	DocRebuildDocItem *rebuildDocItem = dynamic_cast<DocRebuildDocItem*>(docItem);
+	if (rebuildDocItem == nullptr) {
+		gbshutdownLogicError();
+	}
+
 	XmlDoc *xmlDoc = rebuildDocItem->m_xmlDoc;

 	// set callback
-	xmlDoc->m_masterLoop = processedDoc;
-	xmlDoc->m_masterState = rebuildDocItem;
+	if (xmlDoc->m_masterLoop == nullptr) {
+		xmlDoc->m_masterLoop = processedDoc;
+		xmlDoc->m_masterState = rebuildDocItem;
+	}

 	// prepare
 	char **oldTitleRec = xmlDoc->getOldTitleRec();
@ -80,11 +88,12 @@ void DocRebuild::processDocItem(DocProcessDocItem *docItem) {
 		return;
 	}

-	// reset callback
-	xmlDoc->m_masterLoop = nullptr;
-	xmlDoc->m_masterState = nullptr;
+	XmlDoc **oldXmlDoc = xmlDoc->getOldXmlDoc();
+	if (!oldXmlDoc || oldXmlDoc == (XmlDoc**)-1) {
+		return;
+	}

-	if (!xmlDoc->set2(*oldTitleRec, -1, "main", nullptr, MAX_NICENESS)) {
+	if (!xmlDoc->m_contentValid && !xmlDoc->set2(*oldTitleRec, -1, "main", nullptr, MAX_NICENESS)) {
 		xmlDoc->m_indexCode = ECORRUPTDATA;
 		xmlDoc->m_indexCodeValid = true;

@ -100,8 +109,8 @@ void DocRebuild::processDocItem(DocProcessDocItem *docItem) {

 	int32_t *firstIp = xmlDoc->getFirstIp();
 	if (!firstIp || firstIp == (int32_t*)-1) {
-		// we must not be blocked/invalid at this point
-		gbshutdownLogicError();
+		// blocked
+		return;
 	}

 	int32_t *siteNumInLinks = xmlDoc->getSiteNumInlinks();
@ -114,6 +123,47 @@ void DocRebuild::processDocItem(DocProcessDocItem *docItem) {
 	if (xmlDoc->m_masterLoop == processedDoc) {
 		xmlDoc->m_masterLoop = nullptr;
 		xmlDoc->m_masterState = nullptr;
+
+		// logic copied from Repair.cpp
+
+		// rebuild the title rec! otherwise we re-add the old one
+		xmlDoc->m_titleRecBufValid = false;
+		xmlDoc->m_titleRecBuf.purge();
+
+		// recompute site, no more domain sites allowed
+		xmlDoc->m_siteValid = false;
+		xmlDoc->ptr_site = nullptr;
+		xmlDoc->size_site = 0;
+
+		// recalculate the sitenuminlinks
+		xmlDoc->m_siteNumInlinksValid = false;
+
+		// recalculate the langid
+		xmlDoc->m_langIdValid = false;
+
+		// recalculate and store the link info
+		xmlDoc->m_linkInfo1Valid = false;
+		xmlDoc->ptr_linkInfo1 = nullptr;
+		xmlDoc->size_linkInfo1 = 0;
+
+		// re-get the tag rec from tagdb
+		xmlDoc->m_tagRecValid = false;
+		xmlDoc->m_tagRecDataValid = false;
+
+		xmlDoc->m_priority = -1;
+		xmlDoc->m_priorityValid = true;
+
+		xmlDoc->m_contentValid = true;
+		xmlDoc->m_content = xmlDoc->ptr_utf8Content;
+		xmlDoc->m_contentLen = xmlDoc->size_utf8Content - 1;
+
+		// update to latest version
+#ifndef PRIVACORE_SAFE_VERSION
+		xmlDoc->m_version = g_conf.m_titleRecVersion;
+#else
+		xmlDoc->m_version = TITLEREC_CURRENT_VERSION;
+#endif
+		xmlDoc->m_versionValid = true;
 	}

 	// set spider request
--- a/DocReindex.cpp
+++ b/DocReindex.cpp
@ -20,6 +20,8 @@
 #include "XmlDoc.h"
 #include "Msg0.h"
 #include "RdbList.h"
+#include "Conf.h"
+#include "TitleRecVersion.h"

 DocReindex g_docReindex("docreindex.txt", false);
 DocReindex g_docReindexUrl("docreindexurl.txt", true);
@ -49,10 +51,22 @@ DocProcessDocItem* DocReindex::createDocItem(DocProcess *docProcess, const std::

 void DocReindex::updateXmldoc(XmlDoc *xmlDoc) { // prepares xmlDoc for a reindex run
 	xmlDoc->m_indexCodeValid = false; // mark the index code as not valid
+
+#ifndef PRIVACORE_SAFE_VERSION
+	xmlDoc->m_version = g_conf.m_titleRecVersion; // version comes from the conf setting
+#else
+	xmlDoc->m_version = TITLEREC_CURRENT_VERSION; // safe builds always use the compiled-in version
+#endif // PRIVACORE_SAFE_VERSION
+
+	xmlDoc->m_versionValid = true;
 }

 void DocReindex::processDocItem(DocProcessDocItem *docItem) {
 	DocReindexDocItem *reindexDocItem = dynamic_cast<DocReindexDocItem*>(docItem);
+	if (reindexDocItem == nullptr) {
+		gbshutdownLogicError();
+	}
+
 	XmlDoc *xmlDoc = reindexDocItem->m_xmlDoc;

 	// set callback
--- a/JobScheduler.cpp
+++ b/JobScheduler.cpp
@ -426,6 +426,7 @@ bool JobScheduler_impl::submit(thread_type_t thread_type, JobEntry &e)
 			case thread_type_unspecified_io:     job_queue = &cpu_job_queue;      break;
 			case thread_type_generate_thumbnail: job_queue = &external_job_queue; break;
 			case thread_type_config_load:        job_queue = &cpu_job_queue;      break;
+			case thread_type_page_process:       job_queue = &cpu_job_queue;      break;
 			default:
 				assert(false);

--- a/JobScheduler.h
+++ b/JobScheduler.h
@ -46,6 +46,7 @@ enum thread_type_t {
 	thread_type_unspecified_io,     //until we can be more specific
 	thread_type_generate_thumbnail,
 	thread_type_config_load,
+	thread_type_page_process,
 };


--- a/2
+++ b/2
@ -25,7 +25,7 @@ OBJS_O0 =  \
 	Lang.o Log.o \
 	Mem.o Msg0.o Msg4In.o Msg4Out.o MsgC.o Msg13.o Msg20.o Msg22.o Msg39.o Msg3a.o Msg51.o Msge0.o Msge1.o Multicast.o \
 	Parms.o Pages.o PageAddColl.o PageAddUrl.o PageBasic.o PageCrawlBot.o PageGet.o PageHealthCheck.o PageHosts.o PageInject.o \
-	PageParser.o PagePerf.o PageReindex.o PageResults.o PageRoot.o PageSockets.o PageStats.o PageThreads.o PageTitledb.o PageSpiderdbLookup.o PageSpider.o PageDoledbIPTable.o \
+	PageParser.o PagePerf.o PageReindex.o PageResults.o PageRoot.o PageSockets.o PageStats.o PageThreads.o PageTitledb.o PageSpiderdbLookup.o PageSpider.o PageDoledbIPTable.o PageDocProcess.o \
 	Phrases.o HostFlags.o Process.o Proxy.o Punycode.o \
 	InstanceInfoExchange.o \
 	Query.o \
--- a/Msg25.cpp
+++ b/Msg25.cpp
@ -2853,7 +2853,7 @@ static LinkInfo *makeLinkInfo(int32_t         ip,
 		// get approx # of words in link text
 		int32_t nw = 0;
 		if ( txtLen > 0 )
-			nw = getNumWords(txt,txtLen,TITLEREC_CURRENT_VERSION);
+			nw = getNumWords(txt,txtLen);
 		// store it
 		r->m_linkTextNumWords = nw;
 		
--- a/PageDocProcess.cpp
+++ b/PageDocProcess.cpp
@ -0,0 +1,117 @@
+//
+// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+//
+// License TL;DR: If you change this file, you must publish your changes.
+//
+
+#include "TcpSocket.h"
+#include "HttpRequest.h"
+#include "HttpServer.h"
+#include "Pages.h"
+#include "GbUtil.h"
+#include "DocDelete.h"
+#include "DocRebuild.h"
+#include "DocReindex.h"
+#include "JobScheduler.h"
+
+// Per-request state passed to the job scheduler: the client socket, our own
+// copy of the http request and the doc process the key was added to.
+struct PageDocProcessState {
+	PageDocProcessState(TcpSocket *sock, HttpRequest *req, DocProcess *process)
+		: m_s(sock), m_r(), m_docProcess(process)
+	{
+		// copy the request into our own member
+		m_r.copy(req);
+	}
+
+	TcpSocket *m_s;
+	HttpRequest m_r;
+	DocProcess *m_docProcess;
+};
+
+// Job entry point: calls waitPendingDocCount(0) on the doc process stored in
+// the PageDocProcessState passed as state.
+void waitPendingDocCountWrapper(void *state) {
+	DocProcess *docProcess = static_cast<PageDocProcessState*>(state)->m_docProcess;
+	docProcess->waitPendingDocCount(0);
+}
+
+// Job completion callback: sends the http reply for the request stored in the
+// PageDocProcessState passed as state, then releases that state.
+//   state     - the PageDocProcessState the job was submitted with
+//   exit_type - anything other than job_exit_normal is answered with ECANCELED
+void doneWaitPendingDocCountWrapper(void *state, job_exit_t exit_type) {
+	PageDocProcessState *pageDocProcessState = static_cast<PageDocProcessState*>(state);
+
+	if (exit_type != job_exit_normal) {
+		g_httpServer.sendErrorReply(pageDocProcessState->m_s, ECANCELED, "job canceled");
+	} else {
+		g_httpServer.sendSuccessReply(pageDocProcessState->m_s, pageDocProcessState->m_r.getReplyFormat());
+	}
+
+	// the state is allocated with new when the job is submitted and nothing
+	// else frees it, so release it here once the reply has been sent
+	delete pageDocProcessState;
+}
+
+/**
+ * Admin page handler: adds one key to the docdelete, docrebuild or docreindex
+ * process, then submits a job that calls waitPendingDocCount(0) on that
+ * process. The reply is sent from the job's completion callback.
+ *
+ * Request parameters:
+ *   type - "docdelete", "docrebuild" or "docreindex" (case insensitive)
+ *   key  - a url (anything starting with "http") or a docid
+ *
+ * @return false once the wait job has been submitted, otherwise the result of
+ *         g_httpServer.sendErrorReply()
+ */
+bool sendPageDocProcess(TcpSocket *s, HttpRequest *r) {
+	int32_t keyLen = 0;
+	const char *key = r->getString("key", &keyLen);
+
+	// never build a std::string from a null pointer
+	std::string keyStr;
+	if (key && keyLen > 0) {
+		keyStr.assign(key, keyLen);
+	}
+
+	int32_t typeLen = 0;
+	const char *type = r->getString("type", &typeLen);
+
+	if (!type || typeLen == 0) {
+		return g_httpServer.sendErrorReply(s, EMISSINGINPUT, "missing parameter type");
+	}
+
+	// keys starting with "http" go to the url based process, anything else
+	// to the docid based one
+	const bool isUrl = starts_with(keyStr.c_str(), "http");
+
+	DocProcess *docProcess = nullptr;
+
+	switch (typeLen) {
+		case 9:
+			if (strncasecmp(type, "docdelete", 9) == 0) {
+				// docdelete
+				docProcess = isUrl ? static_cast<DocProcess*>(&g_docDeleteUrl) : static_cast<DocProcess*>(&g_docDelete);
+			}
+			break;
+		case 10:
+			if (strncasecmp(type, "docrebuild", 10) == 0) {
+				// docrebuild
+				docProcess = isUrl ? static_cast<DocProcess*>(&g_docRebuildUrl) : static_cast<DocProcess*>(&g_docRebuild);
+			} else if (strncasecmp(type, "docreindex", 10) == 0) {
+				// docreindex
+				docProcess = isUrl ? static_cast<DocProcess*>(&g_docReindexUrl) : static_cast<DocProcess*>(&g_docReindex);
+			}
+			break;
+		default:
+			break;
+	}
+
+	if (!docProcess) {
+		return g_httpServer.sendErrorReply(s, EMISSINGINPUT, "invalid parameter type (docdelete, docrebuild, docreindex)");
+	}
+
+	// addKey() returns false when it rejects the key; report that instead of
+	// answering with success for a key that was never queued
+	if (!docProcess->addKey(keyStr)) {
+		return g_httpServer.sendErrorReply(s, EMISSINGINPUT, "invalid parameter key");
+	}
+
+	PageDocProcessState *state = new PageDocProcessState(s, r, docProcess);
+	if (!g_jobScheduler.submit(waitPendingDocCountWrapper, doneWaitPendingDocCountWrapper, state, thread_type_page_process, 0)) {
+		// unable to submit page. the state is released here on the assumption
+		// that the completion callback is not invoked for a job that was never
+		// accepted - TODO confirm against JobScheduler::submit
+		delete state;
+		return g_httpServer.sendErrorReply(s, EBADENGINEER, "unable to submit job");
+	}
+
+	// reply is sent when the job finishes
+	return false;
+}
--- a/PageDoledbIPTable.cpp
+++ b/PageDoledbIPTable.cpp
@ -66,18 +66,25 @@ static void generatePageJSON(std::vector<uint32_t> &doleips, const char *coll, S
 }


-static bool respondWithError(TcpSocket *s, HttpRequest *r, const char *msg) {
+static bool respondWithError(TcpSocket *s, HttpRequest *r, int32_t error, const char *errmsg) {
 	SafeBuf sb;
 	const char *contentType = NULL;
 	switch(r->getReplyFormat()) {
 		case FORMAT_HTML:
 			g_pages.printAdminTop(&sb, s, r, NULL);
-			sb.safePrintf("<p>%s</p>", msg);
+			sb.safePrintf("<p>%s</p>", errmsg);
 			g_pages.printAdminBottom2(&sb);
 			contentType = "text/html";
 			break;
 		case FORMAT_JSON:
-			sb.safePrintf("{error_message:\"%s\"}",msg); //todo: safe encode
+			sb.safePrintf("{\"response\":{\n"
+				              "\t\"statusCode\":%" PRId32",\n"
+				              "\t\"statusMsg\":\"", error);
+			sb.jsonEncode(errmsg);
+			sb.safePrintf("\"\n"
+				              "}\n"
+				              "}\n");
+			contentType = "application/json";
 			contentType = "application/json";
 			break;
 		default:
@ -94,12 +101,12 @@ bool sendPageDoledbIPTable(TcpSocket *s, HttpRequest *r) {
 	const char *coll = r->getString("c", NULL, NULL);
 	CollectionRec *cr = g_collectiondb.getRec(coll);
 	if(!cr) {
-		return respondWithError(s, r, "No collection specified");
+		return respondWithError(s, r, ENOCOLLREC, "No collection specified");
 	}
 	
 	SpiderColl *spiderColl = cr->m_spiderColl;
 	if(!spiderColl) {
-		return respondWithError(s, r, "No spider-collection (?)");
+		return respondWithError(s, r, EBADENGINEER, "No spider-collection (?)");
 	}
 	
 	std::vector<uint32_t> doleips = spiderColl->getDoledbIpTable();
--- a/PageGet.cpp
+++ b/PageGet.cpp
@ -214,16 +214,10 @@ bool sendErrorReply ( void *state , int32_t err ) {
 	// get the tcp socket from the state
 	TcpSocket *s = st->m_socket;

-	char tmp [ 1024*32 ] ;
-	sprintf ( tmp , "%s",
-		  mstrerror(g_errno));
 	// nuke state2
 	mdelete ( st , sizeof(State2) , "PageGet1" );
 	delete (st);
-	// erase g_errno for sending
-	//g_errno = 0;
-	// . now encapsulate it in html head/tail and send it off
-	//return g_httpServer.sendDynamicPage ( s , tmp , strlen(tmp) );
+
 	return g_httpServer.sendErrorReply ( s, err, mstrerror(err) );
 }

--- a/PageSpiderdbLookup.cpp
+++ b/PageSpiderdbLookup.cpp
@ -193,7 +193,7 @@ static bool gotSpiderRecs2(State *st) {
 }


-static bool respondWithError(State *st, const char *msg) {
+static bool respondWithError(State *st, int32_t error, const char *errmsg) {
 	// get the socket
 	TcpSocket *s = st->m_socket;

@ -202,12 +202,18 @@ static bool respondWithError(State *st, const char *msg) {
 	switch(st->m_r.getReplyFormat()) {
 		case FORMAT_HTML:
 			g_pages.printAdminTop(&sb, s, &st->m_r, NULL);
-			sb.safePrintf("<p>%s</p>", msg);
+			sb.safePrintf("<p>%s</p>", errmsg);
 			g_pages.printAdminBottom2(&sb);
 			contentType = "text/html";
 			break;
 		case FORMAT_JSON:
-			sb.safePrintf("{error_message:\"%s\"}", msg); //todo: safe encode
+			sb.safePrintf("{\"response\":{\n"
+			              "\t\"statusCode\":%" PRId32",\n"
+			              "\t\"statusMsg\":\"", error);
+			sb.jsonEncode(errmsg);
+			sb.safePrintf("\"\n"
+			              "}\n"
+			              "}\n");
 			contentType = "application/json";
 			break;
 		default:
@ -425,7 +431,7 @@ static bool sendResult(State *st) {
 	sb.reserve2x ( 32768 );

 	if(g_errno) {
-		return respondWithError(st, mstrerror(g_errno));
+		return respondWithError(st, g_errno, mstrerror(g_errno));
 	}

 	int32_t shardNum = -1;
--- a/PageThreads.cpp
+++ b/PageThreads.cpp
@ -33,6 +33,7 @@ static const char *thread_type_name(thread_type_t tt) {
 		case thread_type_unspecified_io:     return "unspecified IO";
 		case thread_type_generate_thumbnail: return "generate-thumbnail";
 		case thread_type_config_load:        return "config-load";
+		case thread_type_page_process:       return "page-process";
 		default: return "?";
 	}
 }
--- a/Pages.cpp
+++ b/Pages.cpp
@ -233,6 +233,11 @@ static WebPage s_pages[] = {
 	  sendPageParser,
 	  PG_NOAPI|PG_COLLADMIN|PG_ACTIVE},

+	{ PAGE_DOCPROCESS, "admin/docprocess", 0, "DocProcess", 0, page_method_t::page_method_get,
+	  "Various doc process methods",
+	  sendPageDocProcess,
+	  PG_NOAPI|PG_MASTERADMIN|PG_ACTIVE},
+
 	{ PAGE_SITEDB    , "admin/tagdb"  , 0 , "Tagdb"  ,  0, page_method_t::page_method_post_url,
 	  "add/remove/get tags for sites/urls",
 	  sendPageTagdb,
@ -862,55 +867,6 @@ bool printGigabotAdvice(SafeBuf *sb,
 	return true;
 }

-void Pages::printFormTop( SafeBuf *sb, HttpRequest *r ) {
-	int32_t  page   = getDynamicPageNumber ( r );
-
-	if( page < 0 ) {
-		logError("getDynamicPageNumber returned negative index!");
-		return;
-	}
-
-	// . the form
-	// . we cannot use the GET method if there is more than a few k of
-	//   parameters, like in the case of the Search Controls page. The
-	//   browser simply will not send the request if it is that big.
-	switch(s_pages[page].m_page_method) {
-		case page_method_t::page_method_post_form:
-			sb->safePrintf ("<form name=\"SubmitInput\" method=\"post\" "
-					// we need this for <input type=file> tags
-					"ENCTYPE=\"multipart/form-data\" "
-					"action=\"/%s\">\n",
-					s_pages[page].m_filename);
-		case page_method_t::page_method_post_url:
-			sb->safePrintf ("<form name=\"SubmitInput\" method=\"post\" "
-					"action=\"/%s\">\n",
-					s_pages[page].m_filename);
-		case page_method_t::page_method_get:
-			sb->safePrintf ("<form name=\"SubmitInput\" method=\"get\" "
-					"action=\"/%s\">\n",
-					s_pages[page].m_filename);
-	}
-}
-
-void Pages::printFormData( SafeBuf *sb, TcpSocket *s, HttpRequest *r ) {
-
-	int32_t  page   = getDynamicPageNumber ( r );
-	const char *coll   = r->getString ( "c"   );
-	if ( ! coll ) coll = "";
-	sb->safePrintf ( "<input type=\"hidden\" name=\"c\" "
-			 "value=\"%s\" />\n", coll);
-
-	// should any changes be broadcasted to all hosts?
-	sb->safePrintf ("<input type=\"hidden\" name=\"cast\" value=\"%" PRId32"\" "
-			"/>\n",
-			page >= 0 ? (int32_t)s_pages[page].m_cast : 0);
-
-}
-
-bool Pages::printAdminBottom ( SafeBuf *sb, HttpRequest *r ) {
-	return printAdminBottom ( sb );
-}
-
 bool Pages::printSubmit ( SafeBuf *sb ) {
 	// update button
 	return sb->safePrintf ( 
@ -1124,6 +1080,7 @@ bool  Pages::printAdminLinks ( SafeBuf *sb,
 		if ( i == PAGE_SEARCHBOX ) continue;
 		if ( i == PAGE_TITLEDB ) continue;
 		if ( i == PAGE_HEALTHCHECK ) continue;
+		if ( i == PAGE_DOCPROCESS ) continue;
 		


--- a/Pages.h
+++ b/Pages.h
@ -69,10 +69,9 @@ bool sendPageProfiler   ( TcpSocket *s , HttpRequest *r );
 bool sendPageThreads    ( TcpSocket *s , HttpRequest *r );
 bool sendPageAPI        ( TcpSocket *s , HttpRequest *r );
 bool sendPageHelp       ( TcpSocket *s , HttpRequest *r );
-bool sendPageGraph      ( TcpSocket *s , HttpRequest *r );
 bool sendPageHealthCheck ( TcpSocket *sock , HttpRequest *hr ) ;
 bool sendPageDefaultCss(TcpSocket *s, HttpRequest *r);
-
+bool sendPageDocProcess(TcpSocket *s, HttpRequest *r);

 enum class page_method_t {
 	page_method_get = 1,		//plain http get
@ -137,10 +136,6 @@ class Pages {
 					 const char  *qs = NULL,
 					 const char* bodyJavascript = "" );

-	void printFormTop(  SafeBuf *sb, HttpRequest *r );
-	void printFormData( SafeBuf *sb, TcpSocket *s, HttpRequest *r );
-
-	bool  printAdminBottom         ( SafeBuf *sb, HttpRequest *r );
 	bool  printAdminBottom         ( SafeBuf *sb);
 	bool  printAdminBottom2        ( SafeBuf *sb);
 	bool  printTail                ( SafeBuf* sb, bool isLocal );
@ -222,6 +217,7 @@ enum {
 	PAGE_DOLEIPTABLE ,
 	PAGE_SEARCHBOX   ,
 	PAGE_PARSER      ,
+	PAGE_DOCPROCESS  ,
 	PAGE_SITEDB      ,
 	PAGE_HEALTHCHECK ,
 	PAGE_NONE     	};
--- a/Parms.cpp
+++ b/Parms.cpp
@ -5390,6 +5390,18 @@ void Parms::init ( ) {
 	m->m_page  = PAGE_MASTER;
 	m++;

+#ifndef PRIVACORE_SAFE_VERSION
+	m->m_title = "TitleRec version number";
+	m->m_desc  = "Override TitleRec version number (for testing only!)";
+	m->m_cgi   = "trvn";
+	simple_m_set(Conf,m_titleRecVersion);
+	m->m_def   = TITLEREC_CURRENT_VERSION_STR;
+	m->m_group = false;
+	m->m_flags = PF_HIDDEN | PF_NOSAVE;
+	m->m_page  = PAGE_MASTER;
+	m++;
+#endif
+
 	m->m_title = "use shotgun";
 	m->m_desc  = "If enabled, all servers must have two gigabit "
 		"ethernet ports hooked up and Gigablast will round robin "
--- a/Query.cpp
+++ b/Query.cpp
@ -2659,6 +2659,14 @@ const struct QueryField g_fields[] = {
 	 NULL,
 	 QTF_DUP },

+	{"sitenoindex",
+	 FIELD_SITE,
+	 true,
+	 "sitenoindex:example.com",
+	 "Matches all documents on the example.com domain that in not indexed.",
+	 NULL,
+	 0 },
+
 	{"ip", 
 	 FIELD_IP, 
 	 true,
--- a/Tagdb.cpp
+++ b/Tagdb.cpp
@ -346,6 +346,37 @@ bool Tag::printToBufAsXml(SafeBuf *sb) const {
 	return true;
 }

+// Appends this tag to sb as one JSON object with the members name, user,
+// timestamp, ip and value. The object is followed by ",\n".
+// Returns false if printDataToBuf() fails, true otherwise.
+bool Tag::printToBufAsJson(SafeBuf *sb) const {
+	const char *tagName = getTagStrFromType(m_type);
+
+	char ipbuf[16];
+	const char *ipStr = iptoa(m_ip,ipbuf);
+
+	sb->safePrintf("\t{\n");
+
+	// tag name
+	sb->safePrintf("\t\t\"name\": \"");
+	sb->jsonEncode(tagName);
+	sb->safePrintf("\",\n");
+
+	// user stored with the tag
+	sb->safePrintf("\t\t\"user\": \"");
+	sb->jsonEncode(getUser());
+	sb->safePrintf("\",\n");
+
+	// timestamp stored with the tag
+	sb->safePrintf("\t\t\"timestamp\": %" PRId32",\n", m_timestamp);
+
+	// ip stored with the tag
+	sb->safePrintf("\t\t\"ip\": \"");
+	sb->jsonEncode(ipStr);
+	sb->safePrintf("\",\n");
+
+	// the tag data itself
+	sb->safePrintf("\t\t\"value\": \"");
+	const bool dataPrinted = printDataToBuf ( sb );
+	if ( ! dataPrinted ) {
+		return false;
+	}
+
+	sb->safePrintf("\"\n");
+	sb->safePrintf("\t},\n");
+	return true;
+}
+
 bool Tag::printToBufAsHtml(SafeBuf *sb, const char *prefix) const {
 	// print the tagname
 	const char *str = getTagStrFromType ( m_type );
@ -847,6 +878,20 @@ bool TagRec::printToBufAsXml ( SafeBuf *sb ) {
 	return true;
 }

+// Appends all tags of this record, except those of type TT_DUP, to sb as a
+// JSON array named "tag". Always returns true.
+bool TagRec::printToBufAsJson ( SafeBuf *sb ) {
+	sb->safePrintf("\t\"tag\": [\n");
+
+	Tag *cur = getFirstTag();
+	while ( cur ) {
+		if ( cur->m_type != TT_DUP ) {
+			cur->printToBufAsJson ( sb );
+		}
+		cur = getNextTag ( cur );
+	}
+
+	// each tag is printed with ",\n" after it; strip that from the last one
+	sb->removeLastChar('\n');
+	sb->removeLastChar(',');
+
+	sb->safePrintf("\t]\n");
+	return true;
+}
+
 bool TagRec::printToBufAsHtml ( SafeBuf *sb , const char *prefix ) {
 	Tag *tag = getFirstTag();
 	for ( ; tag ; tag = getNextTag ( tag ) ) 
--- a/Tagdb.h
+++ b/Tagdb.h
@ -34,6 +34,7 @@ class Tag {
 	bool printToBuf            (SafeBuf *sb) const;
 	bool printToBufAsAddRequest(SafeBuf *sb) const;
 	bool printToBufAsXml       (SafeBuf *sb) const;
+	bool printToBufAsJson      (SafeBuf *sb) const;
 	bool printToBufAsHtml      (SafeBuf *sb, const char *prefix) const;
 	bool printToBufAsTagVector (SafeBuf *sb) const;
 	// just print the m_data...
@ -116,6 +117,7 @@ public:
 	bool printToBuf             ( SafeBuf *sb );
 	bool printToBufAsAddRequest ( SafeBuf *sb );
 	bool printToBufAsXml        ( SafeBuf *sb );
+	bool printToBufAsJson       ( SafeBuf *sb );
 	bool printToBufAsHtml       ( SafeBuf *sb , const char *prefix );
 	bool printToBufAsTagVector  ( SafeBuf *sb );

--- a/TitleRecVersion.h
+++ b/TitleRecVersion.h
@ -1,6 +1,11 @@
 #ifndef GB_TITLERECVERSION_H
 #define GB_TITLERECVERSION_H

+#ifndef STRINGIFY
+#define STRINGIFY(x) #x
+#define TO_STRING(x) STRINGIFY(x)
+#endif
+
 // Starting version when Gigablast was open-sourced
 //#define TITLEREC_CURRENT_VERSION 120

@ -22,6 +27,11 @@
 //#define TITLEREC_CURRENT_VERSION    125

 // new adult detection
-#define TITLEREC_CURRENT_VERSION	126
+//#define TITLEREC_CURRENT_VERSION    126
+
+// handle robots meta with noindex, follow
+#define TITLEREC_CURRENT_VERSION    127
+
+#define TITLEREC_CURRENT_VERSION_STR    TO_STRING(TITLEREC_CURRENT_VERSION)

 #endif // GB_TITLERECVERSION_H
--- a/Version.cpp
+++ b/Version.cpp
@ -6,8 +6,10 @@
 #include "Process.h"
 #include <string.h>

+#ifndef STRINGIFY
 #define STRINGIFY(x) #x
 #define TO_STRING(x) STRINGIFY(x)
+#endif

 #ifndef GIT_COMMIT_ID
 #define GIT_COMMIT_ID unknown
@ -65,6 +67,3 @@ void printVersion() {
 	fprintf(stdout,"Gigablast Git branch   : %s\n", getBranch());
 	fprintf(stdout,"Gigablast Git commit   : %s\n", getCommitId());
 }
-
-#undef STRINGIFY
-#undef TO_STRING
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -385,6 +385,7 @@ void XmlDoc::reset ( ) {
 	m_setTr                    = false;

 	m_recycleContent           = false;
+	m_docRebuild               = false;
 	m_callback1                = NULL;
 	m_callback2                = NULL;
 	m_state                    = NULL;
@ -748,7 +749,13 @@ bool XmlDoc::set4 ( SpiderRequest *sreq      ,
 	//m_coll      = coll;
 	m_pbuf      = pbuf;
 	m_niceness  = niceness;
-	m_version   = TITLEREC_CURRENT_VERSION;
+
+#ifndef PRIVACORE_SAFE_VERSION
+	m_version = g_conf.m_titleRecVersion;
+#else
+	m_version = TITLEREC_CURRENT_VERSION;
+#endif
+
 	m_versionValid = true;

 	// this is used to removing the rec from doledb after we spider it
@ -2332,7 +2339,15 @@ int32_t *XmlDoc::getIndexCode ( ) {
 		return (int32_t *) ini;
 	}

-	if (*ini) {
+	// check meta nofollow
+	bool *inf = getIsNoFollow();
+	if (!inf || inf == (bool*) -1) {
+		logTrace(g_conf.m_logTraceXmlDoc, "END, could not getIsNoFollow");
+		return (int32_t *) inf;
+	}
+
+	// meta noindex & nofollow
+	if (*ini && *inf) {
 		if (m_firstUrl.isRoot()) {
 			m_indexCode = EDOCDISALLOWEDROOT;
 		} else {
@ -2562,7 +2577,6 @@ int32_t *XmlDoc::getIndexCode ( ) {

 			ptr_utf8Content    = NULL;
 			size_utf8Content   = 0;
-			m_utf8ContentValid = true;

 			logTrace(g_conf.m_logTraceXmlDoc, "END, EDOCNONCANONICAL");
 			return &m_indexCode;
@ -2921,8 +2935,12 @@ bool XmlDoc::setTitleRecBuf ( SafeBuf *tbuf, int64_t docId, int64_t uh48 ){
 	// assume could not make one because we were banned or something
 	tbuf->purge(); // m_titleRec = NULL;

+#ifndef PRIVACORE_SAFE_VERSION
+	m_version = g_conf.m_titleRecVersion;
+#else
 	// start seting members in THIS's header before compression
-	m_version           = TITLEREC_CURRENT_VERSION;
+	m_version = TITLEREC_CURRENT_VERSION;
+#endif

 	// set this
 	m_headerSize = (char *)&ptr_firstUrl - (char *)&m_headerSize;
@ -3125,7 +3143,6 @@ SafeBuf *XmlDoc::getTitleRecBuf ( ) {

 			ptr_utf8Content    = NULL;
 			size_utf8Content   = 0;
-			m_utf8ContentValid = true;
 		} else {
 			m_titleRecBufValid = true;
 			return &m_titleRecBuf;
@ -5723,7 +5740,6 @@ Url **XmlDoc::getRedirUrl() {

 		ptr_utf8Content    = NULL;
 		size_utf8Content   = 0;
-		m_utf8ContentValid = true;

 		// mdw: let this path through so contactXmlDoc gets a proper
 		// redirect that we can follow. for the base xml doc at
@ -6230,8 +6246,11 @@ SafeBuf *XmlDoc::getTimeAxisUrl ( ) {
 //   from scratch. this loads it from titledb.
 // . NULL is a valid value (EDOCNOTFOUND) so return a char **
 char **XmlDoc::getOldTitleRec() {
+	logTrace(g_conf.m_logTraceXmlDoc, "BEGIN");
+
 	// if valid return that
 	if ( m_oldTitleRecValid ) {
+		logTrace(g_conf.m_logTraceXmlDoc, "END, already valid");
 		return &m_oldTitleRec;
 	}

@ -6241,6 +6260,7 @@ char **XmlDoc::getOldTitleRec() {
 	if ( m_setFromTitleRec ) {
 		m_oldTitleRecValid = true;
 		m_oldTitleRec      = NULL;//m_titleRec;
+		logTrace(g_conf.m_logTraceXmlDoc, "END, setFromTitleRec");
 		return &m_oldTitleRec;
 	}
 	// sanity check
@ -6259,6 +6279,7 @@ char **XmlDoc::getOldTitleRec() {
 	if ( m_isIndexedValid && ! m_isIndexed && m_docIdValid ) {
 		m_oldTitleRec      = NULL;
 		m_oldTitleRecValid = true;
+		logTrace(g_conf.m_logTraceXmlDoc, "END, not indexed");
 		return &m_oldTitleRec;
 	}
 	// sanity check. if we have no url or docid ...
@ -6288,6 +6309,7 @@ char **XmlDoc::getOldTitleRec() {

 	CollectionRec *cr = getCollRec();
 	if ( ! cr ) {
+		logTrace(g_conf.m_logTraceXmlDoc, "END, no collection");
 		return NULL;
 	}

@ -6316,6 +6338,7 @@ char **XmlDoc::getOldTitleRec() {
 				      m_niceness           , // niceness
 				      999999               )) {// timeout seconds
 		// return -1 if we blocked
+		logTrace(g_conf.m_logTraceXmlDoc, "END, blocked");
 		return (char **)-1;
 	}

@ -6326,9 +6349,12 @@ char **XmlDoc::getOldTitleRec() {

 	// error?
 	if ( g_errno ) {
+		logTrace(g_conf.m_logTraceXmlDoc, "END, error=%s", mstrerror(g_errno));
 		return NULL;
 	}

+	logTrace(g_conf.m_logTraceXmlDoc, "END");
+
 	// got it
 	return &m_oldTitleRec;
 }
@ -6619,7 +6645,9 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
 	if ( m_siteNumInlinksValid ) return &m_siteNumInlinks;

 	// sanity check
-	if ( m_setFromTitleRec && ! m_useSecondaryRdbs) {g_process.shutdownAbort(true);}
+	if (m_setFromTitleRec && !m_useSecondaryRdbs && !m_docRebuild) {
+		g_process.shutdownAbort(true);
+	}

 	CollectionRec *cr = getCollRec();
 	if ( ! cr ) return NULL;
@ -12929,7 +12957,7 @@ char *XmlDoc::getMetaList(bool forDelete) {
 	// . so at least now set all the data members we will need to
 	//   seriazlize into the title rec because we can't be blocking further
 	//   down below after we set all the hashtables and XmlDoc::ptr_ stuff
-	if (!m_setFromTitleRec || m_useSecondaryRdbs) {
+	if (!m_setFromTitleRec || m_useSecondaryRdbs || m_docRebuild) {
 		// all member vars should already be valid if set from titlerec
 		char *ptg = prepareToMakeTitleRec();

@ -14445,7 +14473,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {

 	// . only if had old one
 	// . we use this in url filters to set the respider wait time usually
-	if ( od ) {
+	if ( od && !m_recycleContent) {
 		int32_t spideredTime = getSpideredTime();
 		int32_t oldSpideredTime = od->getSpideredTime();
 		float numDays = spideredTime - oldSpideredTime;
@ -17634,9 +17662,10 @@ bool XmlDoc::printDocForProCog ( SafeBuf *sb , HttpRequest *hr ) {

 	// for some reason sections page blocks forever in browser
 	if ( page != 7 && ! m_printedMenu ) {
-		printFrontPageShell ( sb , "search" , cr , false );
+		if (hr->getReplyFormat() == FORMAT_HTML) {
+			printFrontPageShell(sb, "search", cr, false);
+		}
 		m_printedMenu = true;
-		//printMenu ( sb );
 	}


@ -17741,9 +17770,8 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
 	const char *es = mstrerror(m_indexCode);
 	if ( ! m_indexCode ) es = mstrerror(g_errno);

-	int32_t isXml = hr->getLong("xml",0);
-
-	if ( ! isXml ) printMenu ( sb );
+	char format = hr->getReplyFormat();
+	if ( format == FORMAT_HTML ) printMenu ( sb );

 	int32_t shardNum = getShardNumFromDocId ( m_docId );
 	Host *hosts = g_hostdb.getShard ( shardNum );
@ -17757,7 +17785,7 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
 		spiderHostId = g_hostdb.getHostIdWithSpideringEnabled(spiderShardNum, false);
 	}

-	if ( ! isXml )
+	if ( format == FORMAT_HTML )
 		sb->safePrintf (
 				"<table cellpadding=3 border=0>\n"

@ -17776,12 +17804,16 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
 				"<td>%" PRId32"</td>"
 				"</tr>\n"

+				"<tr>"
+				"<td width=\"25%%\">title rec version</td>"
+				"<td>%" PRIu16"</td>"
+				"</tr>\n"
+
 				"<tr>"
 				"<td>index error code</td>"
 				"<td>%s</td>"
 				"</tr>\n"

-
 				"<tr>"
 				"<td>robots.txt allows</td>"
 				"<td>%s</td>"
@ -17800,6 +17832,7 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {

 				h->m_hostId,
 				spiderHostId,
+				m_version,
 				es,
 				allowed,

@ -17807,13 +17840,14 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
 				fu

 				);
-	else
+	else if (format == FORMAT_XML)
 		sb->safePrintf (
 				"<?xml version=\"1.0\" "
 				"encoding=\"UTF-8\" ?>\n"
 				"<response>\n"
 				"\t<coll><![CDATA[%s]]></coll>\n"
 				"\t<docId>%" PRId64"</docId>\n"
+				"\t<titleRecVersion>%" PRIu16"</titleRecVersion>\n"
 				"\t<indexError><![CDATA[%s]]></indexError>\n"
 				"\t<robotsTxtAllows>%" PRId32
 				"</robotsTxtAllows>\n"
@ -17821,30 +17855,75 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
 				,
 				cr->m_coll,
 				m_docId ,
+				m_version,
 				es,
 				allowedInt,//(int32_t)m_isAllowed,
 				fu
 				);
+	else if (format == FORMAT_JSON) {
+		sb->safePrintf("{\"response\":{\n");
+
+		sb->safePrintf("\t\"coll\": \"");
+		sb->jsonEncode(cr->m_coll);
+		sb->safePrintf("\",\n");
+
+		sb->safePrintf("\t\"docId\": %" PRIu64",\n", m_docId);
+		sb->safePrintf("\t\"titleRecVersion\": %" PRIu16",\n", m_version);
+
+		sb->safePrintf("\t\"indexError\": \"");
+		sb->jsonEncode(es);
+		sb->safePrintf("\",\n");
+
+		sb->safePrintf("\t\"robotsTxtAllows\": %" PRId32",\n", allowedInt);
+
+		sb->safePrintf("\t\"url\": \"");
+		sb->jsonEncode(fu);
+		sb->safePrintf("\",\n");
+	}

 	char *redir = ptr_redirUrl;
-	if ( redir && ! isXml ) {
-		sb->safePrintf(
-			       "<tr>"
-			       "<td>redir url</td>"
-			       "<td><a href=\"%s\">%s</a></td>"
-			       "</tr>\n"
-			       ,redir
-			       ,redir );
+	if (redir) {
+		switch (format) {
+			case FORMAT_HTML:
+				sb->safePrintf(
+					"<tr>"
+						"<td>redir url</td>"
+						"<td><a href=\"%s\">%s</a></td>"
+						"</tr>\n"
+					,redir
+					,redir );
+				break;
+			case FORMAT_XML:
+				sb->safePrintf("\t<redirectUrl><![CDATA[%s]]></redirectUrl>\n" ,redir );
+				break;
+			case FORMAT_JSON:
+				sb->safePrintf("\t\"redirectUrl\": \"");
+				sb->jsonEncode(redir);
+				sb->safePrintf("\",\n");
+				break;
+			default:
+				break;
+		}
 	}
-	else if ( redir ) {
-		sb->safePrintf("\t<redirectUrl><![CDATA[%s]]>"
-			       "</redirectUrl>\n" ,redir );
-	}
-

 	if ( m_indexCode || g_errno ) {
-		if ( ! isXml ) sb->safePrintf("</table><br>\n");
-		else           sb->safePrintf("</response>\n");
+		switch (format) {
+			case FORMAT_HTML:
+				sb->safePrintf("</table><br>\n");
+				break;
+			case FORMAT_XML:
+				sb->safePrintf("</response>\n");
+				break;
+			case FORMAT_JSON:
+				sb->removeLastChar('\n');
+				sb->removeLastChar(',');
+				sb->safePrintf("}\n");
+				sb->safePrintf("}\n");
+				break;
+			default:
+				break;
+		}
+
 		return true;
 	}

@ -17852,157 +17931,120 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
 	// must always start with http
 	if ( strncmp ( fu , "http" , 4 ) != 0 ) { g_process.shutdownAbort(true); }

-	struct tm tm_buf;
-	char buf[64];
-	time_t ts = (time_t)m_firstIndexedDate;
-
-	if ( ! isXml )
-		sb->safePrintf("<tr><td>first indexed date</td>"
-			       "<td>%s UTC</td></tr>\n" ,
-			       asctime_r(gmtime_r(&ts,&tm_buf),buf) );
-	else
-		sb->safePrintf("\t<firstIndexedDateUTC>%" PRIu32
-			       "</firstIndexedDateUTC>\n",
-			       (uint32_t)m_firstIndexedDate );
-
-	ts = m_spideredTime;
-
-	if ( ! isXml )
-		sb->safePrintf("<tr><td>last indexed date</td>"
-			       "<td>%s UTC</td></tr>\n" ,
-			       asctime_r(gmtime_r(&ts,&tm_buf),buf) );
-	else
-		sb->safePrintf("\t<lastIndexedDateUTC>%" PRIu32
-			       "</lastIndexedDateUTC>\n",
-			       (uint32_t)m_spideredTime );
-
-	ts = m_outlinksAddedDate;
-
-	if ( ! isXml )
-		sb->safePrintf("<tr><td>outlinks last added date</td>"
-			       "<td>%s UTC</td></tr>\n" ,
-			       asctime_r(gmtime_r(&ts,&tm_buf),buf) );
-	else
-		sb->safePrintf("\t<outlinksLastAddedUTC>%" PRIu32
-			       "</outlinksLastAddedUTC>\n",
-			       (uint32_t)m_outlinksAddedDate );
-
-	// hop count
-	if ( ! isXml )
-		sb->safePrintf("<tr><td>hop count</td><td>%" PRId32"</td>"
-			       "</tr>\n",
-			       (int32_t)m_hopCount);
-	else
-		sb->safePrintf("\t<hopCount>%" PRId32"</hopCount>\n",
-			       (int32_t)m_hopCount);
-
-
 	char strLanguage[128];
 	languageToString(m_langId, strLanguage);

-	// print tags
-	//SafeBuf tb;
-	int32_t sni  = m_siteNumInlinks;
-
 	char ipString[16];
 	iptoa(m_ip,ipString);

-	//int32_t sni = info1->getNumGoodInlinks();
+	switch (format) {
+		case FORMAT_HTML: {
+			struct tm tm_buf;
+			char buf[64];

-	time_t tlu = info1->getLastUpdated();
-	struct tm *timeStruct3 = gmtime_r(&tlu,&tm_buf);//info1->m_lastUpdated );
-	char tmp3[64];
-	strftime ( tmp3 , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct3 );
+			time_t ts = (time_t)m_firstIndexedDate;
+			sb->safePrintf("<tr><td>first indexed date</td><td>%s UTC</td></tr>\n",
+			               asctime_r(gmtime_r(&ts, &tm_buf), buf));

+			ts = m_spideredTime;
+			sb->safePrintf("<tr><td>last indexed date</td><td>%s UTC</td></tr>\n",
+			               asctime_r(gmtime_r(&ts, &tm_buf), buf));

-	if ( ! isXml )
-		sb->safePrintf (
-			"<tr><td>original charset</td><td>%s</td></tr>\n"
-			"<tr><td>adult bit</td><td>%" PRId32"</td></tr>\n"
-			//"<tr><td>is link spam?</td><td>%" PRId32" <b>%s</b></td></tr>\n"
-			"<tr><td>is permalink?</td><td>%" PRId32"</td></tr>\n"
-			"<tr><td>is RSS feed?</td><td>%" PRId32"</td></tr>\n"
-			"<tr><td>ip</td><td><a href=\"/search?q=ip%%3A%s&c=%s&n=100\">"
-			"%s</td></tr>\n"
-			"<tr><td>http status</td><td>%d</td></tr>"
-			"<tr><td>content len</td><td>%" PRId32" bytes</td></tr>\n"
-			"<tr><td>content truncated</td><td>%" PRId32"</td></tr>\n"
-			"<tr><td>content type</td><td>%s</td></tr>\n"
-			"<tr><td>language</td><td>%s</td></tr>\n"
-			"<tr><td>country</td><td>%s</td></tr>\n"
+			ts = m_outlinksAddedDate;
+			sb->safePrintf("<tr><td>outlinks last added date</td><td>%s UTC</td></tr>\n",
+			               asctime_r(gmtime_r(&ts, &tm_buf), buf));

-			"<tr><td><b>good inlinks to site</b>"
-			"</td><td>%" PRId32"</td></tr>\n"
+			sb->safePrintf("<tr><td>hop count</td><td>%" PRId32"</td></tr>\n", (int32_t)m_hopCount);

-			"<tr><td><b>site rank</b></td><td>%" PRId32"</td></tr>\n"
+			sb->safePrintf("<tr><td>original charset</td><td>%s</td></tr>\n", get_charset_str(m_charset));
+			sb->safePrintf("<tr><td>adult bit</td><td>%" PRId32"</td></tr>\n", (int32_t)m_isAdult);
+			sb->safePrintf("<tr><td>is permalink?</td><td>%" PRId32"</td></tr>\n", (int32_t)m_isPermalink);
+			sb->safePrintf("<tr><td>is RSS feed?</td><td>%" PRId32"</td></tr>\n", (int32_t)m_isRSS);
+			sb->safePrintf("<tr><td>ip</td><td><a href=\"/search?q=ip%%3A%s&c=%s&n=100\">%s</td></tr>\n", ipString, cr->m_coll, ipString);
+			sb->safePrintf("<tr><td>http status</td><td>%d</td></tr>", m_httpStatus);
+			sb->safePrintf("<tr><td>content len</td><td>%" PRId32" bytes</td></tr>\n", size_utf8Content - 1);
+			sb->safePrintf("<tr><td>content truncated</td><td>%" PRId32"</td></tr>\n", (int32_t)m_isContentTruncated);
+			sb->safePrintf("<tr><td>content type</td><td>%s</td></tr>\n", g_contentTypeStrings[(int)m_contentType]);
+			sb->safePrintf("<tr><td>language</td><td>%s</td></tr>\n", strLanguage);
+			sb->safePrintf("<tr><td>country</td><td>%s</td></tr>\n", g_countryCode.getName(m_countryId));
+			sb->safePrintf("<tr><td><b>good inlinks to site</b></td><td>%" PRId32"</td></tr>\n", m_siteNumInlinks);
+			sb->safePrintf("<tr><td><b>site rank</b></td><td>%" PRId32"</td></tr>\n", ::getSiteRank(m_siteNumInlinks));
+			sb->safePrintf("<tr><td>good inlinks to page</td><td>%" PRId32"</td></tr>\n", info1->getNumGoodInlinks());

-			"<tr><td>good inlinks to page"
-			"</td><td>%" PRId32"</td></tr>\n"
+			time_t tlu = info1->getLastUpdated();
+			struct tm *timeStruct3 = gmtime_r(&tlu,&tm_buf);
+			char tmp3[64];
+			strftime ( tmp3 , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct3 );
+			sb->safePrintf("<tr><td><nobr>page inlinks last computed</nobr></td><td>%s</td></tr>\n", tmp3);

-			"<tr><td><nobr>page inlinks last computed</nobr></td>"
-			"<td>%s</td></tr>\n"
-			"</td></tr>\n",
-			get_charset_str(m_charset),
-			(int32_t)m_isAdult,
-			(int32_t)m_isPermalink,
-			(int32_t)m_isRSS,
-			ipString,
-			cr->m_coll,
-			ipString,
-			m_httpStatus,
-			size_utf8Content - 1,
-			(int32_t)m_isContentTruncated,
-			g_contentTypeStrings[(int)m_contentType] ,
-			strLanguage,
-			g_countryCode.getName(m_countryId) ,
-			sni,
-			::getSiteRank(sni),
-			info1->getNumGoodInlinks(),
+			sb->safePrintf("</td></tr>\n");
+		} break;
+		case FORMAT_XML:
+			sb->safePrintf("\t<firstIndexedDateUTC>%" PRIu32"</firstIndexedDateUTC>\n", (uint32_t)m_firstIndexedDate);
+			sb->safePrintf("\t<lastIndexedDateUTC>%" PRIu32"</lastIndexedDateUTC>\n", (uint32_t)m_spideredTime);
+			sb->safePrintf("\t<outlinksLastAddedUTC>%" PRIu32"</outlinksLastAddedUTC>\n", (uint32_t)m_outlinksAddedDate);

-			tmp3
-			);
-	else {
-		sb->safePrintf (
-			"\t<charset><![CDATA[%s]]></charset>\n"
-			"\t<isAdult>%" PRId32"</isAdult>\n"
-			"\t<isLinkSpam>%" PRId32"</isLinkSpam>\n"
-			"\t<siteRank>%" PRId32"</siteRank>\n"
+			sb->safePrintf("\t<hopCount>%" PRId32"</hopCount>\n", (int32_t)m_hopCount);

-			"\t<numGoodSiteInlinks>%" PRId32"</numGoodSiteInlinks>\n"
+			sb->safePrintf("\t<charset><![CDATA[%s]]></charset>\n", get_charset_str(m_charset));
+			sb->safePrintf("\t<isAdult>%" PRId32"</isAdult>\n", (int32_t)m_isAdult);
+			sb->safePrintf("\t<isLinkSpam>%" PRId32"</isLinkSpam>\n", (int32_t)m_isLinkSpam);
+			sb->safePrintf("\t<siteRank>%" PRId32"</siteRank>\n", ::getSiteRank(m_siteNumInlinks));
+			sb->safePrintf("\t<numGoodSiteInlinks>%" PRId32"</numGoodSiteInlinks>\n", m_siteNumInlinks);
+			sb->safePrintf("\t<numGoodPageInlinks>%" PRId32"</numGoodPageInlinks>\n", info1->getNumGoodInlinks());
+			sb->safePrintf("\t<pageInlinksLastComputed>%" PRId32"</pageInlinksLastComputed>\n", (int32_t)info1->m_lastUpdated);
+			sb->safePrintf("\t<isPermalink>%" PRId32"</isPermalink>\n", (int32_t)m_isPermalink);
+			sb->safePrintf("\t<isRSSFeed>%" PRId32"</isRSSFeed>\n", (int32_t)m_isRSS);
+			sb->safePrintf("\t<ipAddress><![CDATA[%s]]></ipAddress>\n", ipString);
+			sb->safePrintf("\t<httpStatus>%d</httpStatus>", m_httpStatus);
+			sb->safePrintf("\t<contentLenInBytes>%" PRId32"</contentLenInBytes>\n", size_utf8Content - 1);
+			sb->safePrintf("\t<isContentTruncated>%" PRId32"</isContentTruncated>\n", (int32_t)m_isContentTruncated);
+			sb->safePrintf("\t<contentType><![CDATA[%s]]></contentType>\n", g_contentTypeStrings[(int)m_contentType]);
+			sb->safePrintf("\t<language><![CDATA[%s]]></language>\n", strLanguage);
+			sb->safePrintf("\t<country><![CDATA[%s]]></country>\n", g_countryCode.getName(m_countryId));
+			break;
+		case FORMAT_JSON:
+			sb->safePrintf("\t\"firstIndexedDateUTC\": %" PRIu32",\n", m_firstIndexedDate);
+			sb->safePrintf("\t\"lastIndexedDateUTC\": %" PRIu32",\n", m_spideredTime);
+			sb->safePrintf("\t\"outlinksLastAddedUTC\": %" PRIu32",\n", m_outlinksAddedDate);

-			"\t<numGoodPageInlinks>%" PRId32"</numGoodPageInlinks>\n"
-			"\t<pageInlinksLastComputed>%" PRId32
-			"</pageInlinksLastComputed>\n"
+			sb->safePrintf("\t\"hopCount\": %" PRId8",\n", m_hopCount);

-			,get_charset_str(m_charset)
-			,(int32_t)m_isAdult
-			,(int32_t)m_isLinkSpam
-			,::getSiteRank(sni)
-			,sni
+			sb->safePrintf("\t\"charset\": \"");
+			sb->jsonEncode(get_charset_str(m_charset));
+			sb->safePrintf("\",\n");

-			,info1->getNumGoodInlinks()
-			,(int32_t)info1->m_lastUpdated
-			);
-		sb->safePrintf("\t<isPermalink>%" PRId32"</isPermalink>\n"
-			       "\t<isRSSFeed>%" PRId32"</isRSSFeed>\n"
-			       "\t<ipAddress><![CDATA[%s]]></ipAddress>\n"
-			       "\t<httpStatus>%d</httpStatus>"
-			       "\t<contentLenInBytes>%" PRId32
-			       "</contentLenInBytes>\n"
-			       "\t<isContentTruncated>%" PRId32
-			       "</isContentTruncated>\n"
-			       "\t<contentType><![CDATA[%s]]></contentType>\n"
-			       "\t<language><![CDATA[%s]]></language>\n"
-			       "\t<country><![CDATA[%s]]></country>\n",
-			       (int32_t)m_isPermalink,
-			       (int32_t)m_isRSS,
-			       ipString,
-			       m_httpStatus,
-			       size_utf8Content - 1,
-			       (int32_t)m_isContentTruncated,
-			       g_contentTypeStrings[(int)m_contentType] ,
-			       strLanguage,
-			       g_countryCode.getName(m_countryId) );
+			sb->safePrintf("\t\"isAdult\": %s,\n", m_isAdult ? "true" : "false");
+			sb->safePrintf("\t\"isLinkSpam\": %s,\n", m_isLinkSpam ? "true" : "false");
+			sb->safePrintf("\t\"siteRank\": %" PRId32",\n", ::getSiteRank(m_siteNumInlinks));
+			sb->safePrintf("\t\"numGoodSiteInlinks\": %" PRId32",\n", m_siteNumInlinks);
+			sb->safePrintf("\t\"numGoodPageInlinks\": %" PRId32",\n", info1->getNumGoodInlinks());
+			sb->safePrintf("\t\"pageInlinksLastComputed\": %" PRId32",\n", info1->m_lastUpdated);
+			sb->safePrintf("\t\"isPermalink\": %s,\n", m_isPermalink ? "true" : "false");
+			sb->safePrintf("\t\"isRSSFeed\": %s,\n", m_isRSS ? "true" : "false");
+
+			sb->safePrintf("\t\"ipAddress\": \"");
+			sb->jsonEncode(ipString);
+			sb->safePrintf("\",\n");
+
+			sb->safePrintf("\t\"httpStatus\": %" PRId16",\n", m_httpStatus);
+			sb->safePrintf("\t\"contentLenInBytes\": %" PRId32",\n", size_utf8Content - 1);
+			sb->safePrintf("\t\"isContentTruncated\": %s,\n", m_isContentTruncated ? "true" : "false");
+
+			sb->safePrintf("\t\"contentType\": \"");
+			sb->jsonEncode(g_contentTypeStrings[(int)m_contentType]);
+			sb->safePrintf("\",\n");
+
+			sb->safePrintf("\t\"language\": \"");
+			sb->jsonEncode(strLanguage);
+			sb->safePrintf("\",\n");
+
+			sb->safePrintf("\t\"country\": \"");
+			sb->jsonEncode(g_countryCode.getName(m_countryId));
+			sb->safePrintf("\",\n");
+			break;
+		default:
+			break;
 	}

 	TagRec *ogr = NULL;
@ -18011,18 +18053,45 @@ bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
 		// sanity. should be set from titlerec, so no blocking!
 		if ( ! ogr || ogr == (void *)-1 ) { g_process.shutdownAbort(true); }
 	}
-	if ( ogr && ! isXml ) ogr->printToBufAsHtml ( sb , "tag" );
-	else if ( ogr )       ogr->printToBufAsXml  ( sb  );
+
+	if (ogr) {
+		switch (format) {
+			case FORMAT_HTML:
+				ogr->printToBufAsHtml(sb, "tag");
+				break;
+			case FORMAT_XML:
+				ogr->printToBufAsXml(sb);
+				break;
+			case FORMAT_JSON:
+				ogr->printToBufAsJson(sb);
+				break;
+			default:
+				break;
+		}
+	}

 	// show the good inlinks we used when indexing this
-	if ( ! isXml )
-		info1->print(sb,cr->m_coll);
+	if (format == FORMAT_HTML) {
+		info1->print(sb, cr->m_coll);
+	}

 	// close the table
-	if ( ! isXml )
-		sb->safePrintf ( "</table></center><br>\n" );
-	else
-		sb->safePrintf("</response>\n");
+	switch (format) {
+		case FORMAT_HTML:
+			sb->safePrintf("</table><br>\n");
+			break;
+		case FORMAT_XML:
+			sb->safePrintf("</response>\n");
+			break;
+		case FORMAT_JSON:
+			sb->removeLastChar('\n');
+			sb->removeLastChar(',');
+			sb->safePrintf("}\n");
+			sb->safePrintf("}\n");
+			break;
+		default:
+			break;
+	}

 	return true;
 }
--- a/XmlDoc.h
+++ b/XmlDoc.h
@ -1104,6 +1104,7 @@ public:
 	bool          m_contentInjected;

 	bool          m_recycleContent;
+	bool          m_docRebuild;

 	char *m_rawUtf8Content;
 	int32_t  m_rawUtf8ContentSize;
--- a/XmlDoc_Indexing.cpp
+++ b/XmlDoc_Indexing.cpp
@ -372,6 +372,17 @@ char *XmlDoc::hashAll(HashTableX *table) {
 		return (char *)1;
 	}

+	bool *ini = getIsNoIndex();
+	if (ini == nullptr || ini == (bool*)-1) {
+		// must not be blocked
+		gbshutdownLogicError();
+	}
+
+	if (*ini && m_version > 126) {
+		logTrace(g_conf.m_logTraceXmlDoc, "END, noindex");
+		return (char *)1;
+	}
+
 	if ((size_utf8Content - 1) <= 0) {
 		logTrace(g_conf.m_logTraceXmlDoc, "END, contentLen == 0");
 		return (char *)1;
@ -916,50 +927,57 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
 	if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
 		return false;

-	if( urlOnly )
-	{
+	if (urlOnly) {
 		return true;
 	}

+	bool *ini = getIsNoIndex();
+	if (ini == nullptr || ini == (bool*)-1) {
+		// must not be blocked
+		gbshutdownLogicError();
+	}

-	if ( getUseTimeAxis() ) { // g_conf.m_useTimeAxis ) {
+	if ( getUseTimeAxis() ) {
 		hi.m_prefix = "gbtimeurl";
 		SafeBuf *tau = getTimeAxisUrl();
 		hashSingleTerm ( tau->getBufStart(),tau->length(),&hi);
 	}

-	setStatus ( "hashing inurl colon" );
+	char *s = fu->getUrl();
+	int32_t slen = fu->getUrlLen();

-	//
-	// HASH inurl: terms
-	//
-	char *s    = fu->getUrl   ();
-	int32_t  slen = fu->getUrlLen();
-	hi.m_prefix = "inurl";
+	if (!*ini || m_version <= 126) {
+		setStatus("hashing inurl colon");

+		//
+		// HASH inurl: terms
+		//
+		hi.m_prefix = "inurl";

-	// BR 20160114: Skip numbers in urls when doing "inurl:" queries
-	hi.m_hashNumbers = false;
-	hi.m_filterUrlIndexableWords = true;
-	if ( ! hashString ( s,slen, &hi ) ) return false;
+		// BR 20160114: Skip numbers in urls when doing "inurl:" queries
+		hi.m_hashNumbers = false;
+		hi.m_filterUrlIndexableWords = true;
+		if (!hashString(s, slen, &hi)) return false;
+	}

+	{
+		setStatus("hashing ip colon");
+		hi.m_hashNumbers = true;
+		hi.m_filterUrlIndexableWords = false;

-	setStatus ( "hashing ip colon" );
-	hi.m_hashNumbers = true;
-	hi.m_filterUrlIndexableWords = false;
+		//
+		// HASH ip:a.b.c.d
+		//
+		if (!m_ipValid) { g_process.shutdownAbort(true); }
+		// copy it to save it
+		char ipbuf[64];
+		int32_t iplen = strlen(iptoa(m_ip, ipbuf));
+		hi.m_prefix = "ip";
+		if (!hashSingleTerm(ipbuf, iplen, &hi)) return false;

-	//
-	// HASH ip:a.b.c.d
-	//
-	if ( ! m_ipValid ) { g_process.shutdownAbort(true); }
-	// copy it to save it
-	char ipbuf[64];
-	int32_t iplen = strlen(iptoa(m_ip,ipbuf));
-	hi.m_prefix = "ip";
-	if ( ! hashSingleTerm(ipbuf,iplen,&hi) ) return false;
-
-	// . sanity check
-	if ( ! m_siteNumInlinksValid ) { g_process.shutdownAbort(true); }
+		// . sanity check
+		if (!m_siteNumInlinksValid) { g_process.shutdownAbort(true); }
+	}


 	//
@ -1033,9 +1051,12 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
 		*p = '\0';

 		// update hash parms
-		hi.m_prefix    = "site";
-		// no longer, we just index json now
-		//if ( isStatusDoc ) hi.m_prefix = "site2";
+		if (m_version <= 126) {
+			hi.m_prefix = "site";
+		} else {
+			hi.m_prefix = *ini ? "sitenoindex" : "site";
+		}
+
 		hi.m_hashGroup = HASHGROUP_INURL;
 		
 		
@ -1105,24 +1126,26 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
 		}
 	}

+	const char *ext = fu->getExtension();
+	int32_t elen = fu->getExtensionLen();
+	if (!*ini || m_version <= 126) {
+		//
+		// HASH ext: term
+		//
+		// i.e. ext:gif ext:html ext:htm ext:pdf, etc.
+		setStatus("hashing ext colon");
+		// update hash parms
+		hi.m_prefix = "ext";
+		if (!hashSingleTerm(ext, elen, &hi)) return false;
+	}

-	//
-	// HASH ext: term
-	//
-	// i.e. ext:gif ext:html ext:htm ext:pdf, etc.
-	setStatus ( "hashing ext colon");
-	const char *ext  = fu->getExtension();
-	int32_t  elen = fu->getExtensionLen();
-	// update hash parms
-	hi.m_prefix    = "ext";
-	if ( ! hashSingleTerm(ext,elen,&hi ) ) return false;
-
-
-	setStatus ( "hashing gbdocid" );
-	hi.m_prefix = "gbdocid";
-	char buf2[32];
-	sprintf(buf2,"%" PRIu64, (uint64_t)m_docId );
-	if ( ! hashSingleTerm(buf2,strlen(buf2),&hi) ) return false;
+	{
+		setStatus("hashing gbdocid");
+		hi.m_prefix = "gbdocid";
+		char buf2[32];
+		sprintf(buf2, "%" PRIu64, (uint64_t)m_docId);
+		if (!hashSingleTerm(buf2, strlen(buf2), &hi)) return false;
+	}

 	setStatus ( "hashing SiteGetter terms");

@ -1180,6 +1203,11 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
 	hi.m_prefix    = "urlhash";
 	if ( ! hashString(buf,blen,&hi) ) return false;

+	// don't index mid domain or url path for noindex document
+	if (*ini && m_version > 126) {
+		return true;
+	}
+
 	if (size_utf8Content - 1 > 0 || m_indexCode == EDOCDISALLOWEDROOT) {
 		setStatus("hashing url mid domain");

--- a/fctypes.cpp
+++ b/fctypes.cpp
@ -306,7 +306,7 @@ const char *strnstrn(const char *haystack, int32_t haystackLen, const char *need
 }

 // . get the # of words in this string
-int32_t getNumWords ( char *s , int32_t len, int32_t titleVersion ) {
+int32_t getNumWords ( char *s , int32_t len ) {

 	int32_t wordCount = 0;
 	bool inWord   = false;
--- a/fctypes.h
+++ b/fctypes.h
@ -66,7 +66,7 @@ int32_t to_lower_utf8        (char *dst , char *dstEnd, const char *src ) ;
 int32_t to_lower_utf8        (char *dst , char *dstEnd, const char *src, const char *srcEnd) ;

 // . get the # of words in this string
-int32_t      getNumWords ( char *s , int32_t len, int32_t titleVersion ) ;
+int32_t      getNumWords ( char *s , int32_t len ) ;
 int32_t      atol2       ( const char *s, int32_t len ) ;
 int64_t atoll1      ( const char *s ) ;
 int64_t atoll2      ( const char *s, int32_t len ) ;
--- a/tools/dump_rebuild_noindex.cpp
+++ b/tools/dump_rebuild_noindex.cpp
@ -0,0 +1,195 @@
+#include "XmlDoc.h"
+#include "Collectiondb.h"
+#include "SpiderCache.h"
+#include "Titledb.h"
+#include "Doledb.h"
+#include "CountryCode.h"
+#include "Log.h"
+#include "Conf.h"
+#include "Mem.h"
+#include "UrlBlockCheck.h"
+#include "UrlMatchList.h"
+#include "WantedChecker.h"
+#include <libgen.h>
+#include <algorithm>
+
+// Prints the command-line synopsis and option help for this tool to stdout.
+static void print_usage(const char *argv0) {
+	fprintf(stdout, "Usage: %s [-h] PATH\n", argv0);
+	fprintf(stdout, "Dump wanted titlerecs that are meta noindex but not nofollow\n\n");
+	fprintf(stdout, "  -h, --help     display this help and exit\n");
+}
+
+// Resets the globals that main() initializes and finalizes WantedChecker.
+static void cleanup() {
+	g_log.m_disabled = true;
+
+	// reset in the reverse of the order in which main() calls init() on them
+	g_linkdb.reset();
+	g_clusterdb.reset();
+	g_spiderCache.reset();
+	g_doledb.reset();
+	g_spiderdb.reset();
+	g_tagdb.reset();
+	g_titledb.reset();
+	g_posdb.reset();
+
+	g_collectiondb.reset();
+	g_loop.reset();
+	WantedChecker::finalize();
+}
+
+// Initializes the engine from directory PATH, scans every titledb record of the "main"
+// collection and prints "docId|meta noindex follow|url" for wanted noindex-but-follow docs.
+int main(int argc, char **argv) {
+	if (argc < 2 || strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) {
+		print_usage(argv[0]);
+		return 1;
+	}
+
+	g_log.m_disabled = true;
+
+	// initialize library
+	g_mem.init();
+	hashinit();
+
+	// resolve PATH to an absolute directory name that ends with '/'
+	char path[PATH_MAX + 1];
+	if (realpath(argv[1], path) == NULL) {
+		fprintf(stderr, "Unable to resolve path '%s'\n", argv[1]);
+		return 1;
+	}
+	size_t pathLen = strlen(path);
+	if (pathLen == 0 || path[pathLen - 1] != '/') {
+		strcat(path, "/");
+	}
+
+	g_hostdb.init(-1, false, false, true, path);
+	g_conf.init(path);
+
+	ucInit();
+
+	// initialize rdbs
+	g_loop.init();
+
+	g_collectiondb.loadAllCollRecs();
+
+	g_posdb.init();
+	g_titledb.init();
+	g_tagdb.init();
+	g_spiderdb.init();
+	g_doledb.init();
+	g_spiderCache.init();
+	g_clusterdb.init();
+	g_linkdb.init();
+
+	g_collectiondb.addRdbBaseToAllRdbsForEachCollRec();
+
+	g_log.m_disabled = false;
+	g_log.m_logPrefix = false;
+
+	CollectionRec *cr = g_collectiondb.getRec("main");
+	if (!cr) {
+		logf(LOG_TRACE, "No main collection found");
+		return 1;
+	}
+
+	// initialize shlib & blacklist
+	if (!WantedChecker::initialize()) {
+		fprintf(stderr, "Unable to initialize WantedChecker\n");
+		return 1;
+	}
+
+	g_urlBlackList.init();
+	g_urlWhiteList.init();
+
+	Msg5 msg5;
+	RdbList list;
+
+	key96_t startKey;
+	startKey.setMin();
+
+	key96_t endKey;
+	endKey.setMax();
+
+	while (msg5.getList(RDB_TITLEDB, cr->m_collnum, &list, &startKey, &endKey, 10485760, true, 0, -1, NULL, NULL, 0, true, -1, false)) {
+		if (list.isEmpty()) {
+			break;
+		}
+
+		for (list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord()) {
+			key96_t key = list.getCurrentKey();
+			int64_t docId = Titledb::getDocIdFromKey(&key);
+
+			XmlDoc xmlDoc;
+			if (!xmlDoc.set2(list.getCurrentRec(), list.getCurrentRecSize(), "main", NULL, 0)) {
+				logf(LOG_TRACE, "Unable to set XmlDoc for docId=%" PRId64, docId);
+				continue;
+			}
+
+			// skip documents whose first url or redirect url is unwanted
+			Url *url = xmlDoc.getFirstUrl();
+			const char *reason = NULL;
+
+			if (isUrlUnwanted(*url, &reason)) {
+				continue;
+			}
+
+			Url **redirUrlPtr = xmlDoc.getRedirUrl();
+			if (redirUrlPtr && *redirUrlPtr) {
+				Url *redirUrl = *redirUrlPtr;
+				if (isUrlUnwanted(*redirUrl, &reason)) {
+					continue;
+				}
+			}
+
+			uint8_t *contentType = xmlDoc.getContentType();
+			switch (*contentType) {
+				case CT_GIF:
+				case CT_JPG:
+				case CT_PNG:
+				case CT_TIFF:
+				case CT_BMP:
+				case CT_JS:
+				case CT_CSS:
+				case CT_JSON:
+				case CT_IMAGE:
+				case CT_GZ:
+				case CT_ARC:
+				case CT_WARC:
+					continue; // listed content types are never reported
+				default:
+					break;
+			}
+
+			// skip documents whose content WantedChecker reports as not wanted
+			int32_t contentLen = xmlDoc.size_utf8Content > 0 ? (xmlDoc.size_utf8Content - 1) : 0;
+			if (contentLen > 0) {
+				if (!WantedChecker::check_single_content(url->getUrl(), xmlDoc.ptr_utf8Content, contentLen).wanted) {
+					continue;
+				}
+			}
+
+			// report noindex documents only when they do not also carry nofollow
+			bool *ini = xmlDoc.getIsNoIndex();
+			if (*ini) {
+				bool *inf = xmlDoc.getIsNoFollow();
+				if (!*inf) {
+					fprintf(stdout, "%" PRId64"|meta noindex follow|%s\n", docId, url->getUrl());
+				}
+			}
+		}
+
+		startKey = *(key96_t *)list.getLastKey();
+		startKey++; // next read resumes just past the last key returned
+
+		// stop when incrementing the last key wrapped around
+		if ( startKey < *(key96_t *)list.getLastKey() ) {
+			break;
+		}
+	}
+
+	cleanup();
+
+	return 0;
+}
--- a/tools/dump_unwanted.cpp
+++ b/tools/dump_unwanted.cpp
@ -178,10 +178,8 @@ int main(int argc, char **argv) {
 				bool *inf = xmlDoc.getIsNoFollow();
 				if (*inf) {
 					fprintf(stdout, "%" PRId64"|meta noindex nofollow|%s\n", docId, url->getUrl());
-				} else {
-					fprintf(stdout, "%" PRId64"|meta noindex follow|%s\n", docId, url->getUrl());
+					continue;
 				}
-				continue;
 			}
 		}