Merge branch 'master' into sqlite

Author: Ivan Skytte Jørgensen
Date:   2017-10-10 15:18:26 +02:00
Commit: 50b363b8fe
9 changed files with 115 additions and 67 deletions

@@ -106,7 +106,7 @@ void DocDelete::finalize() {
 	s_docDeleteDocThreadQueue.finalize();
 }
 
-void reloadDocDelete(bool isDocDeleteUrl) {
+static void reloadDocDelete(bool isDocDeleteUrl) {
 	if (!s_docDeleteFileThreadQueue.isEmpty()) {
 		// we're currently processing tmp file
 		return;
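
The only change in this hunk is giving reloadDocDelete() internal linkage, so the symbol is private to its translation unit and cannot collide with an identically named function elsewhere in the build. A minimal stand-alone sketch of that effect (the file and the main() driver are illustrative, not part of the commit):

    // linkage_sketch.cpp — illustrative only
    #include <cstdio>

    // internal linkage: this definition is private to this .cpp file, so
    // another translation unit may define its own reloadDocDelete without
    // a duplicate-symbol error at link time
    static void reloadDocDelete(bool isDocDeleteUrl) {
    	printf("reload, url-variant=%d\n", isDocDeleteUrl);
    }

    int main() {
    	reloadDocDelete(true);
    	return 0;
    }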

@@ -1825,41 +1825,41 @@ bool SpiderColl::evalIpLoop ( ) {
 		}
 	}
 
-loop:
-	// did our collection rec get deleted? since we were doing a read
-	// the SpiderColl will have been preserved in that case but its
-	// m_deleteMyself flag will have been set.
-	if ( tryToDeleteSpiderColl ( this, "5" ) ) {
-		// pretend to block since we got deleted!!!
-		logTrace( g_conf.m_logTraceSpider, "END, after tryToDeleteSpiderColl (5)" );
-		return false;
-	}
+	for(;;) {
+		// did our collection rec get deleted? since we were doing a read
+		// the SpiderColl will have been preserved in that case but its
+		// m_deleteMyself flag will have been set.
+		if ( tryToDeleteSpiderColl ( this, "5" ) ) {
+			// pretend to block since we got deleted!!!
+			logTrace( g_conf.m_logTraceSpider, "END, after tryToDeleteSpiderColl (5)" );
+			return false;
+		}
 
-	// . did reading the list from spiderdb have an error?
-	// . i guess we don't add to doledb then
-	if ( g_errno ) {
-		log("spider: Had error getting list of urls from spiderdb: %s.",mstrerror(g_errno));
+		// . did reading the list from spiderdb have an error?
+		// . i guess we don't add to doledb then
+		if ( g_errno ) {
+			log("spider: Had error getting list of urls from spiderdb: %s.",mstrerror(g_errno));
 
-		// save mem
-		m_list.freeList();
+			// save mem
+			m_list.freeList();
 
-		logTrace( g_conf.m_logTraceSpider, "END, g_errno %" PRId32, g_errno );
-		return true;
-	}
+			logTrace( g_conf.m_logTraceSpider, "END, g_errno %" PRId32, g_errno );
+			return true;
+		}
 
-	// if we started reading, then assume we got a fresh list here
-	logDebug( g_conf.m_logDebugSpider, "spider: back from msg5 spiderdb read2 of %" PRId32" bytes (cn=%" PRId32")",
-	          m_list.getListSize(), (int32_t)m_collnum );
+		// if we started reading, then assume we got a fresh list here
+		logDebug( g_conf.m_logDebugSpider, "spider: back from msg5 spiderdb read2 of %" PRId32" bytes (cn=%" PRId32")",
+		          m_list.getListSize(), (int32_t)m_collnum );
 
-	// . set the winning request for all lists we read so far
-	// . if m_countingPagesIndexed is true this will just fill in
-	//   quota info into m_localTable...
-	scanListForWinners();
+		// . set the winning request for all lists we read so far
+		// . if m_countingPagesIndexed is true this will just fill in
+		//   quota info into m_localTable...
+		scanListForWinners();
 
-	// if list not empty, keep reading!
-	if ( ! m_list.isEmpty() ) {
+		// if list not empty, keep reading!
+		if(m_list.isEmpty())
+			break;
 
 		// update m_nextKey for successive reads of spiderdb by
 		// calling readListFromSpiderdb()
 		key128_t lastKey = *(key128_t *)m_list.getLastKey();
@@ -1884,13 +1884,14 @@ bool SpiderColl::evalIpLoop ( ) {
 		// . normally i would go by this to indicate that we are
 		//   done reading, but there's some bugs... so we go
 		//   by whether our list is empty or not for now
-		if ( m_nextKey < lastKey ) m_nextKey = lastKey;
+		if(m_nextKey < lastKey)
+			m_nextKey = lastKey;
 
 		// reset list to save mem
 		m_list.reset();
 
 		// read more! return if it blocked
-		if ( ! readListFromSpiderdb() ) return false;
+		if(!readListFromSpiderdb())
+			return false;
 		// we got a list without blocking
-		goto loop;
 	}
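
These two hunks replace the backward `goto loop;` in evalIpLoop() with a structured `for(;;)` loop: the old early-exit test `if ( ! m_list.isEmpty() )` is inverted into a `break` on an empty list, and reaching the end of the loop body re-reads where the goto used to jump. A minimal stand-alone sketch of the same transformation, with the spiderdb read stubbed out:

    // loop_sketch.cpp — illustrative stand-in for the goto-to-loop rewrite
    #include <cstdio>

    static int s_reads = 0;

    // stand-in for the spiderdb read: pretend the third read returns empty
    static bool listIsEmpty() { return ++s_reads >= 3; }

    int main() {
    	// before:  loop: ... if ( ! m_list.isEmpty() ) { ... goto loop; }
    	// after :  same control flow, but structured
    	for (;;) {
    		if (listIsEmpty())
    			break;          // done reading, fall out of the loop
    		printf("processing read %d\n", s_reads);
    		// next iteration re-reads, replacing the old `goto loop;`
    	}
    	return 0;
    }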

@@ -10,10 +10,15 @@
 #include <sys/stat.h>
 #include <ctime>
 
-static const char *s_filename = "spiderdbhostdelete.txt";
-static const char *s_tmp_filename = "spiderdbhostdelete.txt.processing";
-static time_t s_lastModifiedTime = 0;
+static const char *s_spiderdbhost_filename = "spiderdbhostdelete.txt";
+static const char *s_spiderdbhost_tmp_filename = "spiderdbhostdelete.txt.processing";
+static time_t s_spiderdbhost_lastModifiedTime = 0;
+
+static const char *s_spiderdburl_filename = "spiderdburldelete.txt";
+static const char *s_spiderdburl_tmp_filename = "spiderdburldelete.txt.processing";
+static time_t s_spiderdburl_lastModifiedTime = 0;
 
 static GbMutex s_sleepMtx;
 static pthread_cond_t s_sleepCond = PTHREAD_COND_INITIALIZER;
@@ -65,67 +70,92 @@ void SpiderdbHostDelete::finalize() {
 }
 
 struct FileItem {
-	FileItem(bool resume)
-		: m_resume(resume) {
+	FileItem(const char *tmpFilename, bool matchHost, bool resume)
+		: m_tmpFilename(tmpFilename)
+		, m_matchHost(matchHost)
+		, m_resume(resume) {
 	}
 
+	const char *m_tmpFilename;
+	bool m_matchHost;
 	bool m_resume;
 };
 
-void SpiderdbHostDelete::reload(int /*fd*/, void */*state*/) {
+static void reloadSpiderdbHostDelete(bool matchHost) {
 	if (!s_fileThreadQueue.isEmpty()) {
 		// we're currently processing tmp file
 		return;
 	}
 
+	const char *filename = nullptr;
+	const char *tmpFilename = nullptr;
+	time_t *lastModifiedTime = nullptr;
+	if (matchHost) {
+		filename = s_spiderdbhost_filename;
+		tmpFilename = s_spiderdbhost_tmp_filename;
+		lastModifiedTime = &s_spiderdbhost_lastModifiedTime;
+	} else {
+		filename = s_spiderdburl_filename;
+		tmpFilename = s_spiderdburl_tmp_filename;
+		lastModifiedTime = &s_spiderdburl_lastModifiedTime;
+	}
+
 	bool resume = false;
 	struct stat st;
-	if (stat(s_tmp_filename, &st) == 0) {
+	if (stat(tmpFilename, &st) == 0) {
 		if (spiderdbHostDeleteDisabled()) {
-			log(LOG_INFO, "Processing of %s is disabled", s_tmp_filename);
+			log(LOG_INFO, "Processing of %s is disabled", tmpFilename);
 			return;
 		}
 
 		resume = true;
 	} else {
-		if (stat(s_filename, &st) != 0) {
+		if (stat(filename, &st) != 0) {
 			// probably not found
-			logTrace(g_conf.m_logTraceSpiderdbHostDelete, "SpiderdbHostDelete::load: Unable to stat %s", s_filename);
-			s_lastModifiedTime = 0;
+			logTrace(g_conf.m_logTraceSpiderdbHostDelete, "SpiderdbHostDelete::load: Unable to stat %s", filename);
+			*lastModifiedTime = 0;
 			return;
 		}
 
 		// we only process the file if we have 2 consecutive loads with the same m_time
-		if (s_lastModifiedTime == 0 || s_lastModifiedTime != st.st_mtime) {
-			s_lastModifiedTime = st.st_mtime;
+		if (*lastModifiedTime == 0 || *lastModifiedTime != st.st_mtime) {
+			*lastModifiedTime = st.st_mtime;
 			logTrace(g_conf.m_logTraceSpiderdbHostDelete, "SpiderdbHostDelete::load: Modified time changed between load");
 			return;
 		}
 
 		// only start processing if spidering is disabled
 		if (spiderdbHostDeleteDisabled()) {
-			log(LOG_INFO, "Processing of %s is disabled", s_filename);
+			log(LOG_INFO, "Processing of %s is disabled", filename);
 			return;
 		}
 
 		// make sure file is not changed while we're processing it
-		int rc = rename(s_filename, s_tmp_filename);
+		int rc = rename(filename, tmpFilename);
 		if (rc == -1) {
-			log(LOG_WARN, "Unable to rename '%s' to '%s' due to '%s'", s_filename, s_tmp_filename, mstrerror(errno));
+			log(LOG_WARN, "Unable to rename '%s' to '%s' due to '%s'", filename, tmpFilename, mstrerror(errno));
 			return;
 		}
 	}
 
-	s_fileThreadQueue.addItem(new FileItem(resume));
+	s_fileThreadQueue.addItem(new FileItem(tmpFilename, matchHost, resume));
 }
 
+void SpiderdbHostDelete::reload(int /*fd*/, void */*state*/) {
+	// spiderdburldelete.txt
+	reloadSpiderdbHostDelete(false);
+
+	// spiderdbhostdelete.txt
+	reloadSpiderdbHostDelete(true);
+}
+
 void SpiderdbHostDelete::processFile(void *item) {
 	FileItem *fileItem = static_cast<FileItem*>(item);
-	bool resume = fileItem->m_resume;
-	delete fileItem;
 
-	log(LOG_INFO, "Processing %s", s_tmp_filename);
+	log(LOG_INFO, "Processing %s", fileItem->m_tmpFilename);
 
-	g_urlHostBlackList.load(s_tmp_filename);
+	g_urlHostBlackList.load(fileItem->m_tmpFilename, fileItem->m_matchHost);
 
 	CollectionRec *collRec = g_collectiondb.getRec("main");
 	if (!collRec) {
@@ -134,7 +164,7 @@ void SpiderdbHostDelete::processFile(void *item) {
 	RdbBase *base = collRec->getBase(RDB_SPIDERDB);
 	Rdb *rdb = g_spiderdb.getRdb();
 
-	if (!resume) {
+	if (!fileItem->m_resume) {
 		// dump tree
 		rdb->submitRdbDumpJob(true);
@@ -149,15 +179,17 @@ void SpiderdbHostDelete::processFile(void *item) {
 			}
 
 			if (s_stop) {
+				delete fileItem;
 				return;
 			}
 		}
 	}
 
 	// tight merge (only force merge all when not resuming)
-	if (!base->attemptMerge(0, !resume)) {
+	if (!base->attemptMerge(0, !fileItem->m_resume)) {
 		// unable to start merge
 		g_urlHostBlackList.unload();
+		delete fileItem;
 		return;
 	}
@@ -172,14 +204,17 @@ void SpiderdbHostDelete::processFile(void *item) {
 		}
 
 		if (s_stop) {
+			delete fileItem;
 			return;
 		}
 	}
 
-	log(LOG_INFO, "Processed %s", s_tmp_filename);
+	log(LOG_INFO, "Processed %s", fileItem->m_tmpFilename);
 
 	g_urlHostBlackList.unload();
 
 	// delete files
-	unlink(s_tmp_filename);
+	unlink(fileItem->m_tmpFilename);
+
+	delete fileItem;
 }
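
The rewrite above generalizes one hard-coded input (spiderdbhostdelete.txt) into two, selected by a matchHost flag, and moves the tmp filename into FileItem so processFile() no longer reads file-scope statics. Note the ownership change that follows: fileItem can no longer be deleted up front, so every early return in processFile() must now `delete fileItem;` itself. A hypothetical alternative (not what the patch does) would be to hold the item in a std::unique_ptr so every exit path is leak-free automatically; a minimal sketch under that assumption:

    // raii_sketch.cpp — hypothetical variant, reduced to standard C++
    #include <cstdio>
    #include <memory>

    struct FileItem {
    	const char *m_tmpFilename;
    	bool m_matchHost;
    	bool m_resume;
    };

    static void processFile(void *item) {
    	// take ownership once; the item is freed on every return path
    	std::unique_ptr<FileItem> fileItem(static_cast<FileItem *>(item));
    	printf("Processing %s\n", fileItem->m_tmpFilename);
    	if (!fileItem->m_resume)
    		return;             // fileItem freed here automatically
    	// ... later stages would go here; early returns stay leak-free
    }

    int main() {
    	processFile(new FileItem{"spiderdbhostdelete.txt.processing", true, false});
    	return 0;
    }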

@@ -18,6 +18,7 @@
 //#define TITLEREC_CURRENT_VERSION 124
 
 // strip ascii tab & newline from url
+// store m_indexCode in TitleRec
 #define TITLEREC_CURRENT_VERSION 125
 
 #endif // GB_TITLERECVERSION_H

@@ -10,11 +10,13 @@ UrlMatchHostList g_urlHostBlackList;
 
 UrlMatchHostList::UrlMatchHostList()
 	: m_filename()
+	, m_matchHost(false)
 	, m_urlmatchhostlist(new urlmatchhostlist_t) {
 }
 
-bool UrlMatchHostList::load(const char *filename) {
+bool UrlMatchHostList::load(const char *filename, bool matchHost) {
 	m_filename = filename;
+	m_matchHost = matchHost;
 
 	log(LOG_INFO, "Loading %s", m_filename);
@@ -56,8 +58,8 @@ void UrlMatchHostList::unload() {
 bool UrlMatchHostList::isUrlMatched(const Url &url) {
 	auto urlmatchhostlist = getUrlMatchHostList();
 
-	std::string host(url.getHost(), url.getHostLen());
-	return (urlmatchhostlist->count(host) > 0);
+	std::string key = m_matchHost ? std::string(url.getHost(), url.getHostLen()) : std::string(url.getUrl(), url.getUrlLen());
+	return (urlmatchhostlist->count(key) > 0);
 }
 
 urlmatchhostlistconst_ptr_t UrlMatchHostList::getUrlMatchHostList() {

@@ -14,7 +14,7 @@ class UrlMatchHostList {
 public:
 	UrlMatchHostList();
 
-	bool load(const char *filename);
+	bool load(const char *filename, bool matchHost);
 	void unload();
 
 	bool isUrlMatched(const Url &url);
@@ -24,6 +24,7 @@ private:
 	void swapUrlMatchHostList(urlmatchhostlistconst_ptr_t urlMatchHostList);
 
 	const char *m_filename;
+	bool m_matchHost;
 	urlmatchhostlistconst_ptr_t m_urlmatchhostlist;
 };
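
With the new flag, one list class serves two matching modes: a list loaded with matchHost == true is keyed by url.getHost(), otherwise by the full URL string. The lookup idea behind isUrlMatched(), reduced to standard types (inputs here are invented examples):

    // key_sketch.cpp — illustrative reduction of the matchHost lookup
    #include <cstdio>
    #include <string>
    #include <unordered_set>

    static bool isMatched(const std::unordered_set<std::string> &list,
                          bool matchHost,
                          const std::string &host, const std::string &url) {
    	// matchHost decides which string keys the set: bare host or full URL
    	const std::string &key = matchHost ? host : url;
    	return list.count(key) > 0;
    }

    int main() {
    	std::unordered_set<std::string> hosts = { "example.com" };
    	std::unordered_set<std::string> urls  = { "http://example.com/page.html" };
    	printf("%d\n", isMatched(hosts, true,  "example.com", "http://example.com/x"));  // 1
    	printf("%d\n", isMatched(urls,  false, "example.com", "http://example.com/x"));  // 0
    	return 0;
    }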

@@ -1149,7 +1149,11 @@ bool XmlDoc::set2 ( char *titleRec ,
 	m_isSiteRootValid = true;
 
 	// there was no issue indexing it...
-	m_indexCode = 0;
+	if (m_version < 125) {
+		// we only start storing indexCode in version 125
+		m_indexCode = 0;
+	}
 	m_indexCodeValid = true;
 	m_redirError = 0;
 	m_redirErrorValid = true;

@@ -149,7 +149,9 @@ public:
 	uint32_t m_tagPairHash32;
 
 	int32_t m_siteNumInlinks;
 	int32_t m_reserved1;
+	// this is non-zero if we decided not to index the doc
+	int32_t m_indexCode;
 	int32_t m_reserved2;
 
 	uint32_t m_spideredTime; // time_t
 	uint32_t m_indexedTime; // slightly > m_spideredTime (time_t)
@@ -1041,9 +1043,6 @@ public:
 	bool (* m_callback2) ( void *state );
 	void *m_state;
 
-	// this is non-zero if we decided not to index the doc
-	int32_t m_indexCode;
-
 	// the spider priority
 	int32_t m_priority;
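
Together with the new "store m_indexCode in TitleRec" note under TITLEREC_CURRENT_VERSION 125, these hunks move m_indexCode from XmlDoc's transient state into the serialized TitleRec header. Records written before version 125 carry no stored indexCode, so set2() zeroes the field for them; newer records keep whatever was serialized. The usual pattern for such on-disk format gates, as a reduced sketch (the struct layout and values are illustrative, not the real TitleRec):

    // version_gate_sketch.cpp — illustrative record layout
    #include <cstdint>
    #include <cstdio>

    struct RecHeader {
    	uint8_t m_version;
    	int32_t m_indexCode;    // only meaningful when m_version >= 125
    };

    static int32_t readIndexCode(const RecHeader &rec) {
    	if (rec.m_version < 125) {
    		// old records never stored this field; treat as "indexed fine"
    		return 0;
    	}
    	return rec.m_indexCode;
    }

    int main() {
    	RecHeader oldRec{124, 0x7f7f7f7f};  // garbage in the unused slot
    	RecHeader newRec{125, 32880};       // an invented stored error code
    	printf("%d %d\n", readIndexCode(oldRec), readIndexCode(newRec));
    	return 0;
    }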

@@ -7,6 +7,8 @@
 #include "Log.h"
 #include "Conf.h"
 #include "Mem.h"
+#include "UrlBlockCheck.h"
+#include "UrlMatchList.h"
 #include <libgen.h>
 #include <algorithm>
@@ -97,6 +99,9 @@ int main(int argc, char **argv) {
 	key96_t endKey;
 	endKey.setMax();
 
+	g_urlBlackList.init();
+	g_urlWhiteList.init();
+
 	while (msg5.getList(RDB_TITLEDB, cr->m_collnum, &list, &startKey, &endKey, 10485760, true, 0, -1, NULL, NULL, 0, true, -1, false)) {
 		if (list.isEmpty()) {
@@ -145,7 +150,7 @@ int main(int argc, char **argv) {
 			Url url;
 			url.set(link.c_str());
 
-			if (isUrlUnwanted(url)) {
+			if (isUrlUnwanted(url) || (url.isRoot() && url.isValid())) {
 				continue;
 			}
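
These last hunks wire the stand-alone titledb tool up to the global URL block lists (hence the g_urlBlackList/g_urlWhiteList init() calls) and extend its link filter to also skip links that are valid root URLs. A compact sketch of the extended condition, with Url and isUrlUnwanted() replaced by simplified stand-ins:

    // filter_sketch.cpp — stand-ins only; not the real Url class
    #include <cstdio>
    #include <string>

    struct Url {
    	std::string u;
    	bool isValid() const { return u.rfind("http", 0) == 0; }
    	bool isRoot() const {
    		// root = nothing after "scheme://host/" (simplified stand-in)
    		size_t slash = u.find('/', u.find("//") + 2);
    		return slash == std::string::npos || slash + 1 == u.size();
    	}
    };

    static bool isUrlUnwanted(const Url &) { return false; }  // stub

    int main() {
    	Url root{"http://example.com/"};
    	Url page{"http://example.com/a.html"};
    	// mirrors: if (isUrlUnwanted(url) || (url.isRoot() && url.isValid())) continue;
    	printf("skip root: %d\n", isUrlUnwanted(root) || (root.isRoot() && root.isValid()));
    	printf("skip page: %d\n", isUrlUnwanted(page) || (page.isRoot() && page.isValid()));
    	return 0;
    }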