215 lines
5.9 KiB
C++
215 lines
5.9 KiB
C++
//
|
|
// Copyright (C) 2017 Privacore ApS - https://www.privacore.com
|
|
//
|
|
// This program is free software: you can redistribute it and/or modify
|
|
// it under the terms of the GNU Affero General Public License as
|
|
// published by the Free Software Foundation, either version 3 of the
|
|
// License, or (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU Affero General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU Affero General Public License
|
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
//
|
|
// License TL;DR: If you change this file, you must publish your changes.
|
|
//
|
|
#include "DocRebuild.h"
|
|
#include "XmlDoc.h"
|
|
#include "Msg0.h"
|
|
#include "RdbList.h"
|
|
#include "Conf.h"
|
|
#include "Errno.h"
|
|
|
|
// Global rebuild processor constructed with filename "docrebuild.txt" and
// isUrl = false (see the DocRebuild constructor for how both are forwarded).
DocRebuild g_docRebuild{"docrebuild.txt", false};

// Global rebuild processor constructed with filename "docrebuildurl.txt" and
// isUrl = true.
DocRebuild g_docRebuildUrl{"docrebuildurl.txt", true};
|
|
|
|
struct DocRebuildDocItem : public DocProcessDocItem {
|
|
DocRebuildDocItem(DocProcess *docProcess, const std::string &key, uint32_t firstIp, int64_t lastPos)
|
|
: DocProcessDocItem(docProcess, key, firstIp, lastPos)
|
|
, m_msg0()
|
|
, m_spiderdbList()
|
|
, m_spiderdbListRequested(false)
|
|
, m_spiderdbListProcessed(false)
|
|
, m_clearedXmlDoc(false) {
|
|
}
|
|
|
|
Msg0 m_msg0;
|
|
RdbList m_spiderdbList;
|
|
bool m_spiderdbListRequested;
|
|
bool m_spiderdbListProcessed;
|
|
bool m_clearedXmlDoc;
|
|
};
|
|
|
|
/// Construct a rebuild processor.
///
/// @param filename  forwarded unchanged to the DocProcess base class
/// @param isUrl     forwarded unchanged to the DocProcess base class
///
/// The third DocProcess argument is always false here; its meaning is
/// defined by DocProcess, which is not visible in this file.
DocRebuild::DocRebuild(const char *filename, bool isUrl)
	: DocProcess(filename, isUrl, false)
{
}
|
|
|
|
/// Allocate the rebuild-specific item for one document.
///
/// All arguments are passed straight through to the DocRebuildDocItem
/// constructor. The item is heap-allocated and returned as a raw pointer;
/// processDocItem deletes it once the document is finished or has failed.
DocProcessDocItem* DocRebuild::createDocItem(DocProcess *docProcess, const std::string &key, uint32_t firstIp, int64_t lastPos) {
	DocRebuildDocItem *item = new DocRebuildDocItem(docProcess, key, firstIp, lastPos);
	return item;
}
|
|
|
|
/// Flag an XmlDoc as belonging to a doc rebuild.
///
/// Sets m_docRebuild and m_recycleContent on the given document.
/// @param xmlDoc  document to mark; dereferenced without a null check
void DocRebuild::updateXmldoc(XmlDoc *xmlDoc) {
	xmlDoc->m_docRebuild = true;
	xmlDoc->m_recycleContent = true;
}
|
|
|
|
/// Advance one pending document through the rebuild sequence.
///
/// The function returns early whenever an XmlDoc getter or the spiderdb
/// lookup cannot complete yet. processedDoc is registered as the callback
/// for those steps, presumably so that this function is entered again later
/// (confirm in DocProcess). The flags on DocRebuildDocItem keep the one-time
/// steps from running twice.
///
/// When the document is finished, or when it fails (no old title rec, or the
/// old title rec cannot be loaded), the item is removed from the pending set
/// and both the XmlDoc and the item are deleted.
///
/// @param docItem  must actually be a DocRebuildDocItem; otherwise
///                 gbshutdownLogicError() is called.
void DocRebuild::processDocItem(DocProcessDocItem *docItem) {
	DocRebuildDocItem *item = dynamic_cast<DocRebuildDocItem*>(docItem);
	if (!item) {
		gbshutdownLogicError();
	}

	XmlDoc *doc = item->m_xmlDoc;

	// Shared teardown: drop the item from the pending set, then free the
	// document and the item (in that order).
	auto releaseItem = [this, doc, item]() {
		removePendingDoc(item);

		delete doc;
		delete item;
	};

	// install our callback unless one is already registered
	if (doc->m_masterLoop == nullptr) {
		doc->m_masterLoop = processedDoc;
		doc->m_masterState = item;
	}

	// fetch the old title rec; a null or -1 result means we cannot continue now
	char **oldTitleRec = doc->getOldTitleRec();
	if (oldTitleRec == nullptr || oldTitleRec == reinterpret_cast<char **>(-1)) {
		return;
	}

	// a rebuild is impossible without an existing title rec
	if (*oldTitleRec == nullptr) {
		doc->m_indexCode = ENOTFOUND;
		doc->m_indexCodeValid = true;

		doc->logIt();

		releaseItem();
		return;
	}

	XmlDoc **oldXmlDoc = doc->getOldXmlDoc();
	if (oldXmlDoc == nullptr || oldXmlDoc == reinterpret_cast<XmlDoc **>(-1)) {
		return;
	}

	// load the document from the old title rec unless content is already valid
	if (!doc->m_contentValid) {
		if (!doc->set2(*oldTitleRec, -1, "main", MAX_NICENESS)) {
			doc->m_indexCode = ECORRUPTDATA;
			doc->m_indexCodeValid = true;

			doc->logIt();

			releaseItem();
			return;
		}
	}

	int32_t *firstIp = doc->getFirstIp();
	if (firstIp == nullptr || firstIp == reinterpret_cast<int32_t *>(-1)) {
		// blocked
		return;
	}

	if (!item->m_clearedXmlDoc) {
		// logic copied from Repair.cpp

		// force the title rec to be rebuilt, otherwise the old one is re-added
		doc->m_titleRecBufValid = false;
		doc->m_titleRecBuf.purge();

		// recompute site, no more domain sites allowed
		doc->m_siteValid = false;
		doc->ptr_site = nullptr;
		doc->size_site = 0;

		// recalculate the sitenuminlinks
		doc->m_siteNumInlinksValid = false;

		// recalculate the langid
		doc->m_langIdValid = false;

		// recalculate and store the link info
		doc->m_linkInfo1Valid = false;
		doc->ptr_linkInfo1 = nullptr;
		doc->size_linkInfo1 = 0;

		// re-get the tag rec from tagdb
		doc->m_tagRecValid = false;
		doc->m_tagRecDataValid = false;

		doc->m_priority = -1;
		doc->m_priorityValid = true;

		// reuse the stored utf8 content as the document content
		// (length is size minus one, presumably excluding a trailing NUL - confirm)
		doc->m_contentValid = true;
		doc->m_content = doc->ptr_utf8Content;
		doc->m_contentLen = doc->size_utf8Content - 1;

		// update to latest version
#ifdef PRIVACORE_SAFE_VERSION
		doc->m_version = TITLEREC_CURRENT_VERSION;
#else
		doc->m_version = g_conf.m_titleRecVersion;
#endif
		doc->m_versionValid = true;

		item->m_clearedXmlDoc = true;
	}

	// remove our callback again if it is the one installed
	if (doc->m_masterLoop == processedDoc) {
		doc->m_masterLoop = nullptr;
		doc->m_masterState = nullptr;
	}

	// request the spiderdb records for this url (only once per item)
	if (!item->m_spiderdbListRequested) {
		int64_t urlHash48 = doc->getFirstUrlHash48();
		key128_t startKey = Spiderdb::makeKey(*firstIp, urlHash48, true, 0, false);
		key128_t endKey = Spiderdb::makeKey(*firstIp, urlHash48, true, MAX_DOCID, false);

		// mark as requested before the call so a later pass does not ask again
		item->m_spiderdbListRequested = true;

		const bool listReady = item->m_msg0.getList(-1, RDB_SPIDERDB_DEPRECATED, doc->m_collnum, &item->m_spiderdbList,
		                                            reinterpret_cast<const char *>(&startKey),
		                                            reinterpret_cast<const char *>(&endKey),
		                                            1000000, item, processedDoc, 0, true, true, -1, 0, -1, 10000, false, false, -1);
		if (!listReady) {
			// blocked
			return;
		}
	}

	// choose the spider request (only once per item)
	if (!item->m_spiderdbListProcessed) {
		if (!item->m_spiderdbList.isEmpty()) {
			// copy the list's current record into the document
			SpiderRequest *sreq = reinterpret_cast<SpiderRequest *>(item->m_spiderdbList.getCurrentRec());
			const auto recSize = sreq->m_dataSize + sizeof(key128_t) + 4;
			memcpy(&doc->m_sreq, sreq, recSize);
		} else {
			// nothing stored: rebuild a request and flag it to be added
			doc->getRebuiltSpiderRequest(&doc->m_sreq);
			doc->m_addSpiderRequest = true;
		}

		doc->m_sreqValid = true;
		item->m_spiderdbListProcessed = true;
	}

	// index the document; keep the item alive while indexing is still outstanding
	if (!doc->m_indexedDoc && !doc->indexDoc()) {
		return;
	}

	// done
	releaseItem();
}
|
|
|
|
int64_t DocRebuild::getMaxPending() const {
|
|
return g_conf.m_docRebuildMaxPending;
|
|
}
|
|
|
|
int64_t DocRebuild::getDelayMs() const {
|
|
return g_conf.m_docRebuildDelayMs;
|
|
}
|