Mirror of https://github.com/privacore/open-source-search-engine.git
Synced 2025-06-05 21:19:33 -04:00

Commit 50b363b8fe: Merge branch 'master' into sqlite
@@ -106,7 +106,7 @@ void DocDelete::finalize() {
 	s_docDeleteDocThreadQueue.finalize();
 }
 
-void reloadDocDelete(bool isDocDeleteUrl) {
+static void reloadDocDelete(bool isDocDeleteUrl) {
 	if (!s_docDeleteFileThreadQueue.isEmpty()) {
 		// we're currently processing tmp file
 		return;
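For illustration, a minimal standalone sketch (not the project's code) of what the added static buys: the helper gets internal linkage, so its symbol stays private to its translation unit and cannot collide with same-named functions elsewhere.

// sketch: internal linkage keeps the helper out of the global symbol table
static void reloadDocDelete(bool /*isDocDeleteUrl*/) {
	// ... poll the delete file, enqueue work if idle ...
}

int main() {
	reloadDocDelete(false);  // only callable from within this file
	return 0;
}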
@@ -1825,41 +1825,41 @@ bool SpiderColl::evalIpLoop ( ) {
 		}
 	}
 
-loop:
-	// did our collection rec get deleted? since we were doing a read
-	// the SpiderColl will have been preserved in that case but its
-	// m_deleteMyself flag will have been set.
-	if ( tryToDeleteSpiderColl ( this, "5" ) ) {
-		// pretend to block since we got deleted!!!
-		logTrace( g_conf.m_logTraceSpider, "END, after tryToDeleteSpiderColl (5)" );
-		return false;
-	}
+	for(;;) {
+		// did our collection rec get deleted? since we were doing a read
+		// the SpiderColl will have been preserved in that case but its
+		// m_deleteMyself flag will have been set.
+		if ( tryToDeleteSpiderColl ( this, "5" ) ) {
+			// pretend to block since we got deleted!!!
+			logTrace( g_conf.m_logTraceSpider, "END, after tryToDeleteSpiderColl (5)" );
+			return false;
+		}
 
-	// . did reading the list from spiderdb have an error?
-	// . i guess we don't add to doledb then
-	if ( g_errno ) {
-		log("spider: Had error getting list of urls from spiderdb: %s.",mstrerror(g_errno));
+		// . did reading the list from spiderdb have an error?
+		// . i guess we don't add to doledb then
+		if ( g_errno ) {
+			log("spider: Had error getting list of urls from spiderdb: %s.",mstrerror(g_errno));
 
-		// save mem
-		m_list.freeList();
+			// save mem
+			m_list.freeList();
 
-		logTrace( g_conf.m_logTraceSpider, "END, g_errno %" PRId32, g_errno );
-		return true;
-	}
+			logTrace( g_conf.m_logTraceSpider, "END, g_errno %" PRId32, g_errno );
+			return true;
+		}
 
-	// if we started reading, then assume we got a fresh list here
-	logDebug( g_conf.m_logDebugSpider, "spider: back from msg5 spiderdb read2 of %" PRId32" bytes (cn=%" PRId32")",
-	          m_list.getListSize(), (int32_t)m_collnum );
+		// if we started reading, then assume we got a fresh list here
+		logDebug( g_conf.m_logDebugSpider, "spider: back from msg5 spiderdb read2 of %" PRId32" bytes (cn=%" PRId32")",
+		          m_list.getListSize(), (int32_t)m_collnum );
 
-	// . set the winning request for all lists we read so far
-	// . if m_countingPagesIndexed is true this will just fill in
-	// quota info into m_localTable...
-	scanListForWinners();
+		// . set the winning request for all lists we read so far
+		// . if m_countingPagesIndexed is true this will just fill in
+		// quota info into m_localTable...
+		scanListForWinners();
 
-	// if list not empty, keep reading!
-	if ( ! m_list.isEmpty() ) {
+		// if list not empty, keep reading!
+		if(m_list.isEmpty())
+			break;
+
 		// update m_nextKey for successive reads of spiderdb by
 		// calling readListFromSpiderdb()
 		key128_t lastKey = *(key128_t *)m_list.getLastKey();
@@ -1884,13 +1884,14 @@ bool SpiderColl::evalIpLoop ( ) {
 		// . normally i would go by this to indicate that we are
 		// done reading, but there's some bugs... so we go
 		// by whether our list is empty or not for now
-		if ( m_nextKey < lastKey ) m_nextKey = lastKey;
+		if(m_nextKey < lastKey)
+			m_nextKey = lastKey;
 		// reset list to save mem
 		m_list.reset();
 		// read more! return if it blocked
-		if ( ! readListFromSpiderdb() ) return false;
+		if(!readListFromSpiderdb())
+			return false;
 		// we got a list without blocking
-		goto loop;
 	}
 
 
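The two hunks above replace a goto-based retry loop with for(;;) plus break. A self-contained sketch of the same control-flow transformation, with stand-in names (readMore, evalLoop, budget) in place of the real readListFromSpiderdb()/evalIpLoop():

#include <vector>

// Stand-in for readListFromSpiderdb(): each "read" returns half as many
// items as the previous one, so the list eventually comes back empty.
static bool readMore(std::vector<int> &list, int &budget) {
	list.assign(budget, 0);
	budget /= 2;
	return true;  // true = completed without blocking
}

static bool evalLoop() {
	std::vector<int> list;
	int budget = 8;
	for (;;) {                        // was: a "loop:" label re-entered via goto
		if (!readMore(list, budget))
			return false;             // pretend to block; we get called back later
		if (list.empty())
			break;                    // was: the body of if ( ! m_list.isEmpty() )
		// ... process the list (scanListForWinners() in the real code) ...
		list.clear();                 // reset list to save mem
	}
	return true;
}

int main() { return evalLoop() ? 0 : 1; }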
@@ -10,10 +10,15 @@
 #include <sys/stat.h>
 #include <ctime>
 
-static const char *s_filename = "spiderdbhostdelete.txt";
-static const char *s_tmp_filename = "spiderdbhostdelete.txt.processing";
+static const char *s_spiderdbhost_filename = "spiderdbhostdelete.txt";
+static const char *s_spiderdbhost_tmp_filename = "spiderdbhostdelete.txt.processing";
 
-static time_t s_lastModifiedTime = 0;
+static time_t s_spiderdbhost_lastModifiedTime = 0;
+
+static const char *s_spiderdburl_filename = "spiderdburldelete.txt";
+static const char *s_spiderdburl_tmp_filename = "spiderdburldelete.txt.processing";
+
+static time_t s_spiderdburl_lastModifiedTime = 0;
 
 static GbMutex s_sleepMtx;
 static pthread_cond_t s_sleepCond = PTHREAD_COND_INITIALIZER;
@@ -65,67 +70,92 @@ void SpiderdbHostDelete::finalize() {
 }
 
 struct FileItem {
-	FileItem(bool resume)
-		: m_resume(resume) {
+	FileItem(const char *tmpFilename, bool matchHost, bool resume)
+		: m_tmpFilename(tmpFilename)
+		, m_matchHost(matchHost)
+		, m_resume(resume) {
 	}
 
+	const char *m_tmpFilename;
+	bool m_matchHost;
 	bool m_resume;
 };
 
-void SpiderdbHostDelete::reload(int /*fd*/, void */*state*/) {
+static void reloadSpiderdbHostDelete(bool matchHost) {
 	if (!s_fileThreadQueue.isEmpty()) {
 		// we're currently processing tmp file
 		return;
 	}
 
+	const char *filename = nullptr;
+	const char *tmpFilename = nullptr;
+	time_t *lastModifiedTime = nullptr;
+
+	if (matchHost) {
+		filename = s_spiderdbhost_filename;
+		tmpFilename = s_spiderdbhost_tmp_filename;
+		lastModifiedTime = &s_spiderdbhost_lastModifiedTime;
+	} else {
+		filename = s_spiderdburl_filename;
+		tmpFilename = s_spiderdburl_tmp_filename;
+		lastModifiedTime = &s_spiderdburl_lastModifiedTime;
+	}
+
 	bool resume = false;
 	struct stat st;
-	if (stat(s_tmp_filename, &st) == 0) {
+	if (stat(tmpFilename, &st) == 0) {
 		if (spiderdbHostDeleteDisabled()) {
-			log(LOG_INFO, "Processing of %s is disabled", s_tmp_filename);
+			log(LOG_INFO, "Processing of %s is disabled", tmpFilename);
 			return;
 		}
 
 		resume = true;
 	} else {
-		if (stat(s_filename, &st) != 0) {
+		if (stat(filename, &st) != 0) {
 			// probably not found
-			logTrace(g_conf.m_logTraceSpiderdbHostDelete, "SpiderdbHostDelete::load: Unable to stat %s", s_filename);
-			s_lastModifiedTime = 0;
+			logTrace(g_conf.m_logTraceSpiderdbHostDelete, "SpiderdbHostDelete::load: Unable to stat %s", filename);
+			*lastModifiedTime = 0;
 			return;
 		}
 
 		// we only process the file if we have 2 consecutive loads with the same m_time
-		if (s_lastModifiedTime == 0 || s_lastModifiedTime != st.st_mtime) {
-			s_lastModifiedTime = st.st_mtime;
+		if (*lastModifiedTime == 0 || *lastModifiedTime != st.st_mtime) {
+			*lastModifiedTime = st.st_mtime;
 			logTrace(g_conf.m_logTraceSpiderdbHostDelete, "SpiderdbHostDelete::load: Modified time changed between load");
 			return;
 		}
 
 		// only start processing if spidering is disabled
 		if (spiderdbHostDeleteDisabled()) {
-			log(LOG_INFO, "Processing of %s is disabled", s_filename);
+			log(LOG_INFO, "Processing of %s is disabled", filename);
 			return;
 		}
 
 		// make sure file is not changed while we're processing it
-		int rc = rename(s_filename, s_tmp_filename);
+		int rc = rename(filename, tmpFilename);
 		if (rc == -1) {
-			log(LOG_WARN, "Unable to rename '%s' to '%s' due to '%s'", s_filename, s_tmp_filename, mstrerror(errno));
+			log(LOG_WARN, "Unable to rename '%s' to '%s' due to '%s'", filename, tmpFilename, mstrerror(errno));
 			return;
 		}
 	}
 
-	s_fileThreadQueue.addItem(new FileItem(resume));
+	s_fileThreadQueue.addItem(new FileItem(tmpFilename, matchHost, resume));
+}
+
+void SpiderdbHostDelete::reload(int /*fd*/, void */*state*/) {
+	// spiderdburldelete.txt
+	reloadSpiderdbHostDelete(false);
+
+	// spiderdbhostdelete.txt
+	reloadSpiderdbHostDelete(true);
 }
 
 void SpiderdbHostDelete::processFile(void *item) {
 	FileItem *fileItem = static_cast<FileItem*>(item);
-	bool resume = fileItem->m_resume;
-	delete fileItem;
 
-	log(LOG_INFO, "Processing %s", s_tmp_filename);
+	log(LOG_INFO, "Processing %s", fileItem->m_tmpFilename);
 
-	g_urlHostBlackList.load(s_tmp_filename);
+	g_urlHostBlackList.load(fileItem->m_tmpFilename, fileItem->m_matchHost);
 
 	CollectionRec *collRec = g_collectiondb.getRec("main");
 	if (!collRec) {
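The reload path above is now parameterized: each queued FileItem carries the tmp filename, the match mode, and the resume flag, so one worker function can serve both delete lists. A self-contained sketch of that pattern (std::queue and the names enqueue/s_queue are stand-ins, not the project's thread-queue API):

#include <cstdio>
#include <queue>

struct FileItem {
	FileItem(const char *tmpFilename, bool matchHost, bool resume)
		: m_tmpFilename(tmpFilename), m_matchHost(matchHost), m_resume(resume) {}
	const char *m_tmpFilename;
	bool m_matchHost;
	bool m_resume;
};

static std::queue<FileItem*> s_queue;  // stand-in for s_fileThreadQueue

static void enqueue(bool matchHost) {
	// pick per-mode filenames, as reloadSpiderdbHostDelete() does
	const char *tmp = matchHost ? "spiderdbhostdelete.txt.processing"
	                            : "spiderdburldelete.txt.processing";
	s_queue.push(new FileItem(tmp, matchHost, /*resume=*/false));
}

int main() {
	enqueue(false);  // spiderdburldelete.txt
	enqueue(true);   // spiderdbhostdelete.txt
	while (!s_queue.empty()) {
		FileItem *item = s_queue.front(); s_queue.pop();
		std::printf("would process %s (matchHost=%d)\n",
		            item->m_tmpFilename, (int)item->m_matchHost);
		delete item;  // worker frees the item only after its last use
	}
	return 0;
}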
@@ -134,7 +164,7 @@ void SpiderdbHostDelete::processFile(void *item) {
 	RdbBase *base = collRec->getBase(RDB_SPIDERDB);
 	Rdb *rdb = g_spiderdb.getRdb();
 
-	if (!resume) {
+	if (!fileItem->m_resume) {
 		// dump tree
 		rdb->submitRdbDumpJob(true);
 
@@ -149,15 +179,17 @@ void SpiderdbHostDelete::processFile(void *item) {
 			}
 
 			if (s_stop) {
+				delete fileItem;
 				return;
 			}
 		}
 	}
 
 	// tight merge (only force merge all when not resuming)
-	if (!base->attemptMerge(0, !resume)) {
+	if (!base->attemptMerge(0, !fileItem->m_resume)) {
 		// unable to start merge
 		g_urlHostBlackList.unload();
+		delete fileItem;
 		return;
 	}
@@ -172,14 +204,17 @@ void SpiderdbHostDelete::processFile(void *item) {
 		}
 
 		if (s_stop) {
+			delete fileItem;
 			return;
 		}
 	}
 
-	log(LOG_INFO, "Processed %s", s_tmp_filename);
+	log(LOG_INFO, "Processed %s", fileItem->m_tmpFilename);
 
 	g_urlHostBlackList.unload();
 
 	// delete files
-	unlink(s_tmp_filename);
+	unlink(fileItem->m_tmpFilename);
 
+	delete fileItem;
 }
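Because processFile() now reads fileItem's fields until the very end, the commit moves delete fileItem onto every exit path. As an aside (not what the commit does), a std::unique_ptr taken at the top would release the item on all paths automatically; a sketch with illustrative types:

#include <memory>

struct FileItem {
	const char *m_tmpFilename;
	bool m_matchHost;
	bool m_resume;
};

static void processFile(void *item) {
	// owns the item for the whole function; freed on every return path
	std::unique_ptr<FileItem> fileItem(static_cast<FileItem*>(item));
	if (!fileItem->m_resume) {
		return;  // no explicit delete needed here
	}
	// ... use fileItem->m_tmpFilename until the very end ...
}

int main() {
	processFile(new FileItem{"spiderdbhostdelete.txt.processing", true, false});
	return 0;
}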
@@ -18,6 +18,7 @@
 //#define TITLEREC_CURRENT_VERSION 124
 
 // strip ascii tab & newline from url
+// store m_indexCode in TitleRec
 #define TITLEREC_CURRENT_VERSION 125
 
 #endif // GB_TITLERECVERSION_H
@@ -10,11 +10,13 @@ UrlMatchHostList g_urlHostBlackList;
 
 UrlMatchHostList::UrlMatchHostList()
 	: m_filename()
+	, m_matchHost(false)
 	, m_urlmatchhostlist(new urlmatchhostlist_t) {
 }
 
-bool UrlMatchHostList::load(const char *filename) {
+bool UrlMatchHostList::load(const char *filename, bool matchHost) {
 	m_filename = filename;
+	m_matchHost = matchHost;
 
 	log(LOG_INFO, "Loading %s", m_filename);
 
@@ -56,8 +58,8 @@ void UrlMatchHostList::unload() {
 bool UrlMatchHostList::isUrlMatched(const Url &url) {
 	auto urlmatchhostlist = getUrlMatchHostList();
 
-	std::string host(url.getHost(), url.getHostLen());
-	return (urlmatchhostlist->count(host) > 0);
+	std::string key = m_matchHost ? std::string(url.getHost(), url.getHostLen()) : std::string(url.getUrl(), url.getUrlLen());
+	return (urlmatchhostlist->count(key) > 0);
 }
 
 urlmatchhostlistconst_ptr_t UrlMatchHostList::getUrlMatchHostList() {
@@ -14,7 +14,7 @@ class UrlMatchHostList {
 public:
 	UrlMatchHostList();
 
-	bool load(const char *filename);
+	bool load(const char *filename, bool matchHost);
 	void unload();
 
 	bool isUrlMatched(const Url &url);
@@ -24,6 +24,7 @@ private:
 	void swapUrlMatchHostList(urlmatchhostlistconst_ptr_t urlMatchHostList);
 
 	const char *m_filename;
+	bool m_matchHost;
 
 	urlmatchhostlistconst_ptr_t m_urlmatchhostlist;
 };
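The net effect of the UrlMatchHostList changes: the same hash-set lookup now keys on either the host or the full URL, chosen at load time. A minimal self-contained sketch of that lookup (isMatched and its parameters are illustrative names, not the class's real API):

#include <string>
#include <unordered_set>

// matchHost=true keys the table by host (spiderdbhostdelete.txt);
// matchHost=false keys it by the full URL (spiderdburldelete.txt).
static bool isMatched(const std::unordered_set<std::string> &table,
                      bool matchHost,
                      const std::string &host, const std::string &url) {
	const std::string &key = matchHost ? host : url;
	return table.count(key) > 0;
}

int main() {
	std::unordered_set<std::string> entries = {"example.com"};
	bool hit = isMatched(entries, true, "example.com", "http://example.com/a.html");
	return hit ? 0 : 1;
}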
@@ -1149,7 +1149,11 @@ bool XmlDoc::set2 ( char *titleRec ,
 	m_isSiteRootValid = true;
 
 	// there was no issue indexing it...
-	m_indexCode = 0;
+	if (m_version < 125) {
+		// we only start storing indexCode in version 125
+		m_indexCode = 0;
+	}
+
 	m_indexCodeValid = true;
 	m_redirError = 0;
 	m_redirErrorValid = true;
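m_indexCode is only serialized from title-record version 125 on, so set2() must zero it when deserializing older records instead of trusting whatever the bytes contain. A standalone sketch of this version-gating pattern (RecHeader and fixupAfterLoad are illustrative names):

#include <cstdint>

struct RecHeader {
	int32_t version;    // like the record's stored TITLEREC version
	int32_t indexCode;  // only meaningful from version 125 on
};

static void fixupAfterLoad(RecHeader &h) {
	if (h.version < 125) {
		// older records never stored an indexCode, so whatever the
		// field contains must be discarded
		h.indexCode = 0;
	}
}

int main() {
	RecHeader oldRec{124, 12345};
	fixupAfterLoad(oldRec);  // oldRec.indexCode is now 0
	RecHeader newRec{125, 32880};
	fixupAfterLoad(newRec);  // newRec.indexCode keeps its stored value
	return oldRec.indexCode;
}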
XmlDoc.h (7 changes)
@@ -149,7 +149,9 @@ public:
 	uint32_t m_tagPairHash32;
 	int32_t m_siteNumInlinks;
 
 	int32_t m_reserved1;
+	// this is non-zero if we decided not to index the doc
+	int32_t m_indexCode;
 	int32_t m_reserved2;
 	uint32_t m_spideredTime; // time_t
 	uint32_t m_indexedTime; // slightly > m_spideredTime (time_t)
@@ -1041,9 +1043,6 @@ public:
 	bool (* m_callback2) ( void *state );
 	void *m_state;
 
-	// this is non-zero if we decided not to index the doc
-	int32_t m_indexCode;
-
 	// the spider priority
 	int32_t m_priority;
 
@@ -7,6 +7,8 @@
 #include "Log.h"
 #include "Conf.h"
 #include "Mem.h"
+#include "UrlBlockCheck.h"
+#include "UrlMatchList.h"
 #include <libgen.h>
 #include <algorithm>
 
@@ -97,6 +99,9 @@ int main(int argc, char **argv) {
 	key96_t endKey;
 	endKey.setMax();
 
+	g_urlBlackList.init();
+	g_urlWhiteList.init();
+
 	while (msg5.getList(RDB_TITLEDB, cr->m_collnum, &list, &startKey, &endKey, 10485760, true, 0, -1, NULL, NULL, 0, true, -1, false)) {
 
 		if (list.isEmpty()) {
@@ -145,7 +150,7 @@ int main(int argc, char **argv) {
 			Url url;
 			url.set(link.c_str());
 
-			if (isUrlUnwanted(url)) {
+			if (isUrlUnwanted(url) || (url.isRoot() && url.isValid())) {
 				continue;
 			}
 
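With the blacklist/whitelist initialized, the tool's link loop above now also skips links that are merely a site root. A self-contained sketch of the filter (the Link struct and its flags stand in for the real Url class and isUrlUnwanted()):

#include <string>
#include <vector>
#include <iostream>

struct Link { std::string url; bool unwanted; bool isRoot; bool valid; };

int main() {
	std::vector<Link> links = {
		{"http://example.com/", false, true, true},         // root page: skipped
		{"http://example.com/a/b.html", false, false, true} // deep link: kept
	};
	for (const Link &l : links) {
		// mirrors: isUrlUnwanted(url) || (url.isRoot() && url.isValid())
		if (l.unwanted || (l.isRoot && l.valid))
			continue;
		std::cout << l.url << "\n";
	}
	return 0;
}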