Merge branch 'master' into sqlite

Author: Ivan Skytte Jørgensen
Date:   2017-10-10 15:18:26 +02:00
Commit: 50b363b8fe
9 changed files with 115 additions and 67 deletions

@@ -106,7 +106,7 @@ void DocDelete::finalize() {
 	s_docDeleteDocThreadQueue.finalize();
 }
 
-void reloadDocDelete(bool isDocDeleteUrl) {
+static void reloadDocDelete(bool isDocDeleteUrl) {
 	if (!s_docDeleteFileThreadQueue.isEmpty()) {
 		// we're currently processing tmp file
 		return;
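
The only change in this hunk is giving reloadDocDelete() internal linkage, so the symbol is private to its translation unit and cannot collide with an identically named function elsewhere in the build. A minimal stand-alone sketch of that effect (the file and the main() driver are illustrative, not part of the commit):

    // linkage_sketch.cpp — illustrative only
    #include <cstdio>

    // internal linkage: this definition is private to this .cpp file, so
    // another translation unit may define its own reloadDocDelete without
    // a duplicate-symbol error at link time
    static void reloadDocDelete(bool isDocDeleteUrl) {
    	printf("reload, url-variant=%d\n", isDocDeleteUrl);
    }

    int main() {
    	reloadDocDelete(true);
    	return 0;
    }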

@@ -1825,41 +1825,41 @@ bool SpiderColl::evalIpLoop ( ) {
 		}
 	}
 
-loop:
-	// did our collection rec get deleted? since we were doing a read
-	// the SpiderColl will have been preserved in that case but its
-	// m_deleteMyself flag will have been set.
-	if ( tryToDeleteSpiderColl ( this, "5" ) ) {
-		// pretend to block since we got deleted!!!
-		logTrace( g_conf.m_logTraceSpider, "END, after tryToDeleteSpiderColl (5)" );
-		return false;
-	}
+	for(;;) {
+		// did our collection rec get deleted? since we were doing a read
+		// the SpiderColl will have been preserved in that case but its
+		// m_deleteMyself flag will have been set.
+		if ( tryToDeleteSpiderColl ( this, "5" ) ) {
+			// pretend to block since we got deleted!!!
+			logTrace( g_conf.m_logTraceSpider, "END, after tryToDeleteSpiderColl (5)" );
+			return false;
+		}
 
-	// . did reading the list from spiderdb have an error?
-	// . i guess we don't add to doledb then
-	if ( g_errno ) {
-		log("spider: Had error getting list of urls from spiderdb: %s.",mstrerror(g_errno));
+		// . did reading the list from spiderdb have an error?
+		// . i guess we don't add to doledb then
+		if ( g_errno ) {
+			log("spider: Had error getting list of urls from spiderdb: %s.",mstrerror(g_errno));
 
-		// save mem
-		m_list.freeList();
+			// save mem
+			m_list.freeList();
 
-		logTrace( g_conf.m_logTraceSpider, "END, g_errno %" PRId32, g_errno );
-		return true;
-	}
+			logTrace( g_conf.m_logTraceSpider, "END, g_errno %" PRId32, g_errno );
+			return true;
+		}
 
-	// if we started reading, then assume we got a fresh list here
-	logDebug( g_conf.m_logDebugSpider, "spider: back from msg5 spiderdb read2 of %" PRId32" bytes (cn=%" PRId32")",
-	          m_list.getListSize(), (int32_t)m_collnum );
+		// if we started reading, then assume we got a fresh list here
+		logDebug( g_conf.m_logDebugSpider, "spider: back from msg5 spiderdb read2 of %" PRId32" bytes (cn=%" PRId32")",
+		          m_list.getListSize(), (int32_t)m_collnum );
 
-	// . set the winning request for all lists we read so far
-	// . if m_countingPagesIndexed is true this will just fill in
-	//   quota info into m_localTable...
-	scanListForWinners();
+		// . set the winning request for all lists we read so far
+		// . if m_countingPagesIndexed is true this will just fill in
+		//   quota info into m_localTable...
+		scanListForWinners();
 
-	// if list not empty, keep reading!
-	if ( ! m_list.isEmpty() ) {
+		// if list not empty, keep reading!
+		if(m_list.isEmpty())
+			break;
 
 		// update m_nextKey for successive reads of spiderdb by
 		// calling readListFromSpiderdb()
 		key128_t lastKey = *(key128_t *)m_list.getLastKey();
@@ -1884,13 +1884,14 @@ bool SpiderColl::evalIpLoop ( ) {
 		// . normally i would go by this to indicate that we are
 		//   done reading, but there's some bugs... so we go
 		//   by whether our list is empty or not for now
-		if ( m_nextKey < lastKey ) m_nextKey = lastKey;
+		if(m_nextKey < lastKey)
+			m_nextKey = lastKey;
 
 		// reset list to save mem
 		m_list.reset();
 
 		// read more! return if it blocked
-		if ( ! readListFromSpiderdb() ) return false;
+		if(!readListFromSpiderdb())
+			return false;
 		// we got a list without blocking
-		goto loop;
 	}
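
These two hunks replace the backward `goto loop;` in evalIpLoop() with a structured `for(;;)` loop: the old early-exit test `if ( ! m_list.isEmpty() )` is inverted into a `break` on an empty list, and reaching the end of the loop body re-reads where the goto used to jump. A minimal stand-alone sketch of the same transformation, with the spiderdb read stubbed out:

    // loop_sketch.cpp — illustrative stand-in for the goto-to-loop rewrite
    #include <cstdio>

    static int s_reads = 0;

    // stand-in for the spiderdb read: pretend the third read returns empty
    static bool listIsEmpty() { return ++s_reads >= 3; }

    int main() {
    	// before:  loop: ... if ( ! m_list.isEmpty() ) { ... goto loop; }
    	// after :  same control flow, but structured
    	for (;;) {
    		if (listIsEmpty())
    			break;          // done reading, fall out of the loop
    		printf("processing read %d\n", s_reads);
    		// next iteration re-reads, replacing the old `goto loop;`
    	}
    	return 0;
    }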

@@ -10,10 +10,15 @@
 #include <sys/stat.h>
 #include <ctime>
 
-static const char *s_filename = "spiderdbhostdelete.txt";
-static const char *s_tmp_filename = "spiderdbhostdelete.txt.processing";
-static time_t s_lastModifiedTime = 0;
+static const char *s_spiderdbhost_filename = "spiderdbhostdelete.txt";
+static const char *s_spiderdbhost_tmp_filename = "spiderdbhostdelete.txt.processing";
+static time_t s_spiderdbhost_lastModifiedTime = 0;
+
+static const char *s_spiderdburl_filename = "spiderdburldelete.txt";
+static const char *s_spiderdburl_tmp_filename = "spiderdburldelete.txt.processing";
+static time_t s_spiderdburl_lastModifiedTime = 0;
 
 static GbMutex s_sleepMtx;
 static pthread_cond_t s_sleepCond = PTHREAD_COND_INITIALIZER;
@@ -65,67 +70,92 @@ void SpiderdbHostDelete::finalize() {
 }
 
 struct FileItem {
-	FileItem(bool resume)
-		: m_resume(resume) {
+	FileItem(const char *tmpFilename, bool matchHost, bool resume)
+		: m_tmpFilename(tmpFilename)
+		, m_matchHost(matchHost)
+		, m_resume(resume) {
 	}
 
+	const char *m_tmpFilename;
+	bool m_matchHost;
 	bool m_resume;
 };
 
-void SpiderdbHostDelete::reload(int /*fd*/, void */*state*/) {
+static void reloadSpiderdbHostDelete(bool matchHost) {
 	if (!s_fileThreadQueue.isEmpty()) {
 		// we're currently processing tmp file
 		return;
 	}
 
+	const char *filename = nullptr;
+	const char *tmpFilename = nullptr;
+	time_t *lastModifiedTime = nullptr;
+	if (matchHost) {
+		filename = s_spiderdbhost_filename;
+		tmpFilename = s_spiderdbhost_tmp_filename;
+		lastModifiedTime = &s_spiderdbhost_lastModifiedTime;
+	} else {
+		filename = s_spiderdburl_filename;
+		tmpFilename = s_spiderdburl_tmp_filename;
+		lastModifiedTime = &s_spiderdburl_lastModifiedTime;
+	}
+
 	bool resume = false;
 	struct stat st;
-	if (stat(s_tmp_filename, &st) == 0) {
+	if (stat(tmpFilename, &st) == 0) {
 		if (spiderdbHostDeleteDisabled()) {
-			log(LOG_INFO, "Processing of %s is disabled", s_tmp_filename);
+			log(LOG_INFO, "Processing of %s is disabled", tmpFilename);
 			return;
 		}
 
 		resume = true;
 	} else {
-		if (stat(s_filename, &st) != 0) {
+		if (stat(filename, &st) != 0) {
 			// probably not found
-			logTrace(g_conf.m_logTraceSpiderdbHostDelete, "SpiderdbHostDelete::load: Unable to stat %s", s_filename);
-			s_lastModifiedTime = 0;
+			logTrace(g_conf.m_logTraceSpiderdbHostDelete, "SpiderdbHostDelete::load: Unable to stat %s", filename);
+			*lastModifiedTime = 0;
 			return;
 		}
 
 		// we only process the file if we have 2 consecutive loads with the same m_time
-		if (s_lastModifiedTime == 0 || s_lastModifiedTime != st.st_mtime) {
-			s_lastModifiedTime = st.st_mtime;
+		if (*lastModifiedTime == 0 || *lastModifiedTime != st.st_mtime) {
+			*lastModifiedTime = st.st_mtime;
 			logTrace(g_conf.m_logTraceSpiderdbHostDelete, "SpiderdbHostDelete::load: Modified time changed between load");
 			return;
 		}
 
 		// only start processing if spidering is disabled
 		if (spiderdbHostDeleteDisabled()) {
-			log(LOG_INFO, "Processing of %s is disabled", s_filename);
+			log(LOG_INFO, "Processing of %s is disabled", filename);
 			return;
 		}
 
 		// make sure file is not changed while we're processing it
-		int rc = rename(s_filename, s_tmp_filename);
+		int rc = rename(filename, tmpFilename);
 		if (rc == -1) {
-			log(LOG_WARN, "Unable to rename '%s' to '%s' due to '%s'", s_filename, s_tmp_filename, mstrerror(errno));
+			log(LOG_WARN, "Unable to rename '%s' to '%s' due to '%s'", filename, tmpFilename, mstrerror(errno));
 			return;
 		}
 	}
 
-	s_fileThreadQueue.addItem(new FileItem(resume));
+	s_fileThreadQueue.addItem(new FileItem(tmpFilename, matchHost, resume));
 }
 
+void SpiderdbHostDelete::reload(int /*fd*/, void */*state*/) {
+	// spiderdburldelete.txt
+	reloadSpiderdbHostDelete(false);
+
+	// spiderdbhostdelete.txt
+	reloadSpiderdbHostDelete(true);
+}
+
 void SpiderdbHostDelete::processFile(void *item) {
 	FileItem *fileItem = static_cast<FileItem*>(item);
-	bool resume = fileItem->m_resume;
-	delete fileItem;
 
-	log(LOG_INFO, "Processing %s", s_tmp_filename);
+	log(LOG_INFO, "Processing %s", fileItem->m_tmpFilename);
 
-	g_urlHostBlackList.load(s_tmp_filename);
+	g_urlHostBlackList.load(fileItem->m_tmpFilename, fileItem->m_matchHost);
 
 	CollectionRec *collRec = g_collectiondb.getRec("main");
 	if (!collRec) {
@@ -134,7 +164,7 @@ void SpiderdbHostDelete::processFile(void *item) {
 	RdbBase *base = collRec->getBase(RDB_SPIDERDB);
 	Rdb *rdb = g_spiderdb.getRdb();
 
-	if (!resume) {
+	if (!fileItem->m_resume) {
 		// dump tree
 		rdb->submitRdbDumpJob(true);
@@ -149,15 +179,17 @@ void SpiderdbHostDelete::processFile(void *item) {
 			}
 
 			if (s_stop) {
+				delete fileItem;
 				return;
 			}
 		}
 	}
 
 	// tight merge (only force merge all when not resuming)
-	if (!base->attemptMerge(0, !resume)) {
+	if (!base->attemptMerge(0, !fileItem->m_resume)) {
 		// unable to start merge
 		g_urlHostBlackList.unload();
+		delete fileItem;
 		return;
 	}
@@ -172,14 +204,17 @@ void SpiderdbHostDelete::processFile(void *item) {
 		}
 
 		if (s_stop) {
+			delete fileItem;
 			return;
 		}
 	}
 
-	log(LOG_INFO, "Processed %s", s_tmp_filename);
+	log(LOG_INFO, "Processed %s", fileItem->m_tmpFilename);
 
 	g_urlHostBlackList.unload();
 
 	// delete files
-	unlink(s_tmp_filename);
+	unlink(fileItem->m_tmpFilename);
+
+	delete fileItem;
 }
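
The rewrite above generalizes one hard-coded input (spiderdbhostdelete.txt) into two, selected by a matchHost flag, and moves the tmp filename into FileItem so processFile() no longer reads file-scope statics. Note the ownership change that follows: fileItem can no longer be deleted up front, so every early return in processFile() must now `delete fileItem;` itself. A hypothetical alternative (not what the patch does) would be to hold the item in a std::unique_ptr so every exit path is leak-free automatically; a minimal sketch under that assumption:

    // raii_sketch.cpp — hypothetical variant, reduced to standard C++
    #include <cstdio>
    #include <memory>

    struct FileItem {
    	const char *m_tmpFilename;
    	bool m_matchHost;
    	bool m_resume;
    };

    static void processFile(void *item) {
    	// take ownership once; the item is freed on every return path
    	std::unique_ptr<FileItem> fileItem(static_cast<FileItem *>(item));
    	printf("Processing %s\n", fileItem->m_tmpFilename);
    	if (!fileItem->m_resume)
    		return;             // fileItem freed here automatically
    	// ... later stages would go here; early returns stay leak-free
    }

    int main() {
    	processFile(new FileItem{"spiderdbhostdelete.txt.processing", true, false});
    	return 0;
    }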

@@ -18,6 +18,7 @@
 //#define TITLEREC_CURRENT_VERSION 124
 
 // strip ascii tab & newline from url
+// store m_indexCode in TitleRec
 #define TITLEREC_CURRENT_VERSION 125
 
 #endif // GB_TITLERECVERSION_H

@@ -10,11 +10,13 @@ UrlMatchHostList g_urlHostBlackList;
 
 UrlMatchHostList::UrlMatchHostList()
 	: m_filename()
+	, m_matchHost(false)
 	, m_urlmatchhostlist(new urlmatchhostlist_t) {
 }
 
-bool UrlMatchHostList::load(const char *filename) {
+bool UrlMatchHostList::load(const char *filename, bool matchHost) {
 	m_filename = filename;
+	m_matchHost = matchHost;
 
 	log(LOG_INFO, "Loading %s", m_filename);
@@ -56,8 +58,8 @@ void UrlMatchHostList::unload() {
 bool UrlMatchHostList::isUrlMatched(const Url &url) {
 	auto urlmatchhostlist = getUrlMatchHostList();
 
-	std::string host(url.getHost(), url.getHostLen());
-	return (urlmatchhostlist->count(host) > 0);
+	std::string key = m_matchHost ? std::string(url.getHost(), url.getHostLen()) : std::string(url.getUrl(), url.getUrlLen());
+	return (urlmatchhostlist->count(key) > 0);
 }
 
 urlmatchhostlistconst_ptr_t UrlMatchHostList::getUrlMatchHostList() {

@@ -14,7 +14,7 @@ class UrlMatchHostList {
 public:
 	UrlMatchHostList();
 
-	bool load(const char *filename);
+	bool load(const char *filename, bool matchHost);
 	void unload();
 
 	bool isUrlMatched(const Url &url);
@@ -24,6 +24,7 @@ private:
 	void swapUrlMatchHostList(urlmatchhostlistconst_ptr_t urlMatchHostList);
 
 	const char *m_filename;
+	bool m_matchHost;
 	urlmatchhostlistconst_ptr_t m_urlmatchhostlist;
 };
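
With the new flag, one list class serves two matching modes: a list loaded with matchHost == true is keyed by url.getHost(), otherwise by the full URL string. The lookup idea behind isUrlMatched(), reduced to standard types (inputs here are invented examples):

    // key_sketch.cpp — illustrative reduction of the matchHost lookup
    #include <cstdio>
    #include <string>
    #include <unordered_set>

    static bool isMatched(const std::unordered_set<std::string> &list,
                          bool matchHost,
                          const std::string &host, const std::string &url) {
    	// matchHost decides which string keys the set: bare host or full URL
    	const std::string &key = matchHost ? host : url;
    	return list.count(key) > 0;
    }

    int main() {
    	std::unordered_set<std::string> hosts = { "example.com" };
    	std::unordered_set<std::string> urls  = { "http://example.com/page.html" };
    	printf("%d\n", isMatched(hosts, true,  "example.com", "http://example.com/x"));  // 1
    	printf("%d\n", isMatched(urls,  false, "example.com", "http://example.com/x"));  // 0
    	return 0;
    }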

@@ -1149,7 +1149,11 @@ bool XmlDoc::set2 ( char *titleRec ,
 	m_isSiteRootValid = true;
 
 	// there was no issue indexing it...
-	m_indexCode = 0;
+	if (m_version < 125) {
+		// we only start storing indexCode in version 125
+		m_indexCode = 0;
+	}
 	m_indexCodeValid = true;
 	m_redirError = 0;
 	m_redirErrorValid = true;

@@ -149,7 +149,9 @@ public:
 	uint32_t m_tagPairHash32;
 
 	int32_t m_siteNumInlinks;
 	int32_t m_reserved1;
+	// this is non-zero if we decided not to index the doc
+	int32_t m_indexCode;
 	int32_t m_reserved2;
 
 	uint32_t m_spideredTime; // time_t
 	uint32_t m_indexedTime; // slightly > m_spideredTime (time_t)
@@ -1041,9 +1043,6 @@ public:
 	bool (* m_callback2) ( void *state );
 	void *m_state;
 
-	// this is non-zero if we decided not to index the doc
-	int32_t m_indexCode;
-
 	// the spider priority
 	int32_t m_priority;
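
Together with the new "store m_indexCode in TitleRec" note under TITLEREC_CURRENT_VERSION 125, these hunks move m_indexCode from XmlDoc's transient state into the serialized TitleRec header. Records written before version 125 carry no stored indexCode, so set2() zeroes the field for them; newer records keep whatever was serialized. The usual pattern for such on-disk format gates, as a reduced sketch (the struct layout and values are illustrative, not the real TitleRec):

    // version_gate_sketch.cpp — illustrative record layout
    #include <cstdint>
    #include <cstdio>

    struct RecHeader {
    	uint8_t m_version;
    	int32_t m_indexCode;    // only meaningful when m_version >= 125
    };

    static int32_t readIndexCode(const RecHeader &rec) {
    	if (rec.m_version < 125) {
    		// old records never stored this field; treat as "indexed fine"
    		return 0;
    	}
    	return rec.m_indexCode;
    }

    int main() {
    	RecHeader oldRec{124, 0x7f7f7f7f};  // garbage in the unused slot
    	RecHeader newRec{125, 32880};       // an invented stored error code
    	printf("%d %d\n", readIndexCode(oldRec), readIndexCode(newRec));
    	return 0;
    }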

@@ -7,6 +7,8 @@
 #include "Log.h"
 #include "Conf.h"
 #include "Mem.h"
+#include "UrlBlockCheck.h"
+#include "UrlMatchList.h"
 #include <libgen.h>
 #include <algorithm>
@@ -97,6 +99,9 @@ int main(int argc, char **argv) {
 	key96_t endKey;
 	endKey.setMax();
 
+	g_urlBlackList.init();
+	g_urlWhiteList.init();
+
 	while (msg5.getList(RDB_TITLEDB, cr->m_collnum, &list, &startKey, &endKey, 10485760, true, 0, -1, NULL, NULL, 0, true, -1, false)) {
 		if (list.isEmpty()) {
@@ -145,7 +150,7 @@ int main(int argc, char **argv) {
 			Url url;
 			url.set(link.c_str());
 
-			if (isUrlUnwanted(url)) {
+			if (isUrlUnwanted(url) || (url.isRoot() && url.isValid())) {
 				continue;
 			}
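
These last hunks wire the stand-alone titledb tool up to the global URL block lists (hence the g_urlBlackList/g_urlWhiteList init() calls) and extend its link filter to also skip links that are valid root URLs. A compact sketch of the extended condition, with Url and isUrlUnwanted() replaced by simplified stand-ins:

    // filter_sketch.cpp — stand-ins only; not the real Url class
    #include <cstdio>
    #include <string>

    struct Url {
    	std::string u;
    	bool isValid() const { return u.rfind("http", 0) == 0; }
    	bool isRoot() const {
    		// root = nothing after "scheme://host/" (simplified stand-in)
    		size_t slash = u.find('/', u.find("//") + 2);
    		return slash == std::string::npos || slash + 1 == u.size();
    	}
    };

    static bool isUrlUnwanted(const Url &) { return false; }  // stub

    int main() {
    	Url root{"http://example.com/"};
    	Url page{"http://example.com/a.html"};
    	// mirrors: if (isUrlUnwanted(url) || (url.isRoot() && url.isValid())) continue;
    	printf("skip root: %d\n", isUrlUnwanted(root) || (root.isRoot() && root.isValid()));
    	printf("skip page: %d\n", isUrlUnwanted(page) || (page.isRoot() && page.isValid()));
    	return 0;
    }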