better removal of stored urls when doing a crawl start
@@ -356,6 +356,16 @@ public class Crawler_p {
                 crawlingMode = "url";
                 if ((fullDomain || subPath) && newcrawlingdepth > 0) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // to prevent that there is a restriction on the original urls
             }
+
+            // delete all error urls for that domain
+            // and all urls for that host from the crawl queue
+            Set<String> hosthashes = new HashSet<String>();
+            for (DigestURL u : rootURLs) {
+                sb.index.fulltext().remove(u.hash());
+                hosthashes.add(u.hosthash());
+            }
+            sb.crawlQueues.removeHosts(hosthashes);
+            sb.index.fulltext().commit(true);
 
             // compute mustmatch filter according to rootURLs
             if ((fullDomain || subPath) && newcrawlingdepth > 0) {
@@ -363,23 +373,17 @@ public class Crawler_p {
                 if (fullDomain) {
                     siteFilter = CrawlProfile.siteFilter(rootURLs);
                     if (deleteold) {
-                        Set<String> hosthashes = new HashSet<String>();
-                        for (DigestURL u: rootURLs) hosthashes.add(u.hosthash());
                         sb.index.fulltext().deleteStaleDomainHashes(hosthashes, deleteageDate);
-                        sb.crawlQueues.removeHosts(hosthashes);
                     }
                 } else if (subPath) {
                     siteFilter = CrawlProfile.subpathFilter(rootURLs);
                     if (deleteold) {
-                        Set<String> hosthashes = new HashSet<String>();
                         for (DigestURL u: rootURLs) {
-                            hosthashes.add(u.hosthash());
                             String basepath = u.toNormalform(true);
                             if (!basepath.endsWith("/")) {int p = basepath.lastIndexOf("/"); if (p > 0) basepath = basepath.substring(0, p + 1);}
                             int count = sb.index.fulltext().remove(basepath, deleteageDate);
                             if (count > 0) ConcurrentLog.info("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
                         }
-                        sb.crawlQueues.removeHosts(hosthashes);
                     }
                 }
                 if (CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch)) {
@@ -449,15 +453,6 @@ public class Crawler_p {
             try {
                 sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);
             } catch (final SpaceExceededException e1) { }
-
-            // delete all error urls for that domain
-            Set<String> hosthashes = new HashSet<String>();
-            for (DigestURL u : rootURLs) {
-                sb.index.fulltext().remove(u.hash());
-                hosthashes.add(u.hosthash());
-            }
-            sb.crawlQueues.errorURL.removeHosts(hosthashes);
-            sb.index.fulltext().commit(true);
         } else {
             profile = null;
             handle = null;
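For readers without the YaCy code base at hand, the pattern this commit consolidates can be sketched in isolation. The following is a minimal sketch, not YaCy code: Fulltext, CrawlQueues and RootUrl are hypothetical stand-ins for sb.index.fulltext(), sb.crawlQueues and DigestURL, modelling only the calls that appear in the diff.

    import java.util.HashSet;
    import java.util.Set;

    public class CrawlStartCleanupSketch {

        // Hypothetical stand-in for sb.index.fulltext(); only the calls visible in the diff are modelled.
        interface Fulltext {
            void remove(byte[] urlHash);     // delete one stored document (e.g. an old error entry) by URL hash
            void commit(boolean softCommit); // make the deletions visible
        }

        // Hypothetical stand-in for sb.crawlQueues.
        interface CrawlQueues {
            void removeHosts(Set<String> hostHashes); // drop queued URLs belonging to these hosts
        }

        // Hypothetical stand-in for DigestURL, reduced to the two accessors used in the diff.
        interface RootUrl {
            byte[] hash();      // URL hash
            String hosthash();  // host hash
        }

        // The clean-up step the commit moves to the crawl start: forget the stored documents
        // for the root URLs and flush the crawl queue for the affected hosts, so the new crawl
        // re-fetches them instead of being blocked by stale or error entries.
        static void cleanupBeforeCrawl(Fulltext fulltext, CrawlQueues queues, Iterable<RootUrl> rootURLs) {
            Set<String> hosthashes = new HashSet<String>();
            for (RootUrl u : rootURLs) {
                fulltext.remove(u.hash());     // remove the previously stored document
                hosthashes.add(u.hosthash());  // collect the host for queue clean-up
            }
            queues.removeHosts(hosthashes);    // clear pending crawl entries for those hosts
            fulltext.commit(true);             // apply deletions before the crawl begins
        }
    }

The diff follows the same line of reasoning: the host hashes are collected once, up front, so the fulltext deletion, the crawl-queue clean-up and the deleteold handling further down can share one set instead of rebuilding it in each branch.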