Mirror of https://github.com/yacy/yacy_search_server.git (synced 2025-02-09 07:38:42 -05:00)
A main problem when crawling is long waiting time caused by crawl-delay values in robots.txt entries. That attribute is not supported by Google and is interpreted by Yandex and Bing in different ways. In large crawls there is always one host that blocks the whole crawl with extremely large values. YaCy still obeys crawl-delay, but now limits it to 10 seconds. Additionally, the blocking logic used when loading a new robots.txt was analyzed and a deadlock was removed. Furthermore, the construction of new queue lists was redesigned to ensure that the loader is always provided with a large list of different hosts for host balancing.
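The 10-second cap itself is applied where the robots.txt crawl-delay value is read, not in the file shown below. As a minimal sketch of the idea (hypothetical helper, not part of the YaCy API):

    // Hypothetical sketch: obey robots.txt crawl-delay, but never wait longer
    // than 10 seconds, so a single host cannot stall the whole crawl.
    public final class CrawlDelayCap {
        private static final long MAX_CRAWL_DELAY_MILLIS = 10_000L;

        public static long cappedDelayMillis(final long robotsCrawlDelayMillis) {
            if (robotsCrawlDelayMillis <= 0) return 0L; // no delay requested
            return Math.min(robotsCrawlDelayMillis, MAX_CRAWL_DELAY_MILLIS);
        }

        public static void main(final String[] args) {
            // a host demanding a 24-hour crawl-delay is reduced to 10000 ms
            System.out.println(cappedDelayMillis(24L * 60 * 60 * 1000));
        }
    }

The mirrored file below, Latency.java, implements the per-host latency bookkeeping that the crawl-delay and host-balancing logic relies on.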
// Latency.java
// ------------
// SPDX-FileCopyrightText: 2009 Michael Peter Christen <mc@yacy.net>
// SPDX-License-Identifier: GPL-2.0-or-later
// first published 19.03.2009 on http://yacy.net
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.crawler.data;

import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;

public class Latency {

    // the map is a mapping from host hashes to host configurations
    private static final int mapMaxSize = 1000;
    private static final ConcurrentHashMap<String, Host> map = new ConcurrentHashMap<>();

    /**
     * update the latency entry after a host was selected for queueing into the loader
     * @param url
     * @param robotsCrawlDelay the crawl-delay given by the robots.txt; 0 if it does not exist
     */
    public static void updateAfterSelection(final DigestURL url, final long robotsCrawlDelay) {
        final String host = url.getHost();
        if (host == null) return;
        final String hosthash = url.hosthash();
        Host h = map.get(hosthash);
        if (h == null) {
            h = new Host(host, Switchboard.getSwitchboard().getConfigInt("crawler.defaultAverageLatency", 500), robotsCrawlDelay);
            if (map.size() > mapMaxSize || MemoryControl.shortStatus()) map.clear();
            map.put(hosthash, h);
        }
    }

    /**
     * update the latency entry before a host is accessed
     * @param url
     */
    public static void updateBeforeLoad(final DigestURL url) {
        final String host = url.getHost();
        if (host == null) return;
        final String hosthash = url.hosthash();
        Host h = map.get(hosthash);
        if (h == null) {
            h = new Host(host, 500, 0);
            if (map.size() > mapMaxSize || MemoryControl.shortStatus()) map.clear();
            map.put(hosthash, h);
        } else {
            h.update();
        }
    }

    /**
     * update the latency entry after a host was accessed to load a file
     * @param url
     * @param time the time to load the file in milliseconds
     */
    public static void updateAfterLoad(final DigestURL url, final long time) {
        final String host = url.getHost();
        if (host == null) return;
        final String hosthash = url.hosthash();
        Host h = map.get(hosthash);
        if (h == null) {
            h = new Host(host, time, 0);
            if (map.size() > mapMaxSize || MemoryControl.shortStatus()) map.clear();
            map.put(hosthash, h);
        } else {
            h.update(time);
        }
    }

    private static Host host(final DigestURL url) {
        final String host = url.getHost();
        if (host == null) return null;
        return map.get(url.hosthash());
    }

    public static Iterator<Map.Entry<String, Host>> iterator() {
        return map.entrySet().iterator();
    }

    /**
     * Return the waiting time demanded by the robots.txt file of the target host.
     * A special case: if the remote host assigns this crawler an explicit crawl-delay
     * of 0, then -1 is returned.
     * @param url
     * @param robots
     * @param agent
     * @return the waiting time in milliseconds; 0 if not known; -1 if the host grants us special rights
     */
    public static int waitingRobots(final MultiProtocolURL url, final RobotsTxt robots, final ClientIdentification.Agent agent) {
        final RobotsTxtEntry robotsEntry = robots.getEntry(url, agent);
        final int robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
        if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer
        return robotsDelay;
    }

    private static int waitingRobots(final String hostport, final RobotsTxt robots, final ClientIdentification.Agent agent, final boolean fetchOnlineIfNotAvailableOrNotFresh) {
        final RobotsTxtEntry robotsEntry = robots.getEntry(hostport, agent, fetchOnlineIfNotAvailableOrNotFresh);
        final int robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
        if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer
        return robotsDelay;
    }

    /**
     * guess a minimum waiting time
     * The estimate may be too low if the domain has not yet been checked against
     * the robots.txt crawl-delay value.
     * @param hostname
     * @param port
     * @param hosthash
     * @param robots
     * @param agent
     * @return the remaining waiting time in milliseconds. The return value may be negative,
     *         which expresses by how much the minimum waiting time has already been exceeded.
     */
    public static int waitingRemainingGuessed(final String hostname, final int port, final String hosthash, final RobotsTxt robots, final ClientIdentification.Agent agent) {

        // first check if the domain was _ever_ accessed before
        final Host host = map.get(hosthash);
        if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer.MIN_VALUE because there is a cast to int somewhere

        // find the minimum waiting time based on the network domain (local or global)
        int waiting = agent.minimumDelta;

        // if we have accessed the domain many times, get slower (the flux factor)
        waiting += host.flux(waiting);

        // use the access latency as a rule for how fast we can access the server
        // this applies also to localhost, but differently, because it is not necessary to
        // consider so many external accesses
        waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat(SwitchboardConstants.CRAWLER_LATENCY_FACTOR, 0.5f)));

        // if the number of occurrences of the url's host in the loading queue is greater than MaxSameHostInQueue, then increase waiting
        if (Switchboard.getSwitchboard().crawlQueues.hostcount(hostname) > Switchboard.getSwitchboard().getConfigInt(SwitchboardConstants.CRAWLER_MAX_SAME_HOST_IN_QUEUE, 20)) waiting += 3000;

        // the time since last access to the domain is the basis of the remaining calculation
        final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());

        // find the delay as given by robots.txt on the target site
        if (robots != null) {
            final int robotsDelay = waitingRobots(hostname + ":" + port, robots, agent, false);
            if (robotsDelay < 0) return -timeSinceLastAccess; // no limits if granted exclusively for this peer
            waiting = Math.max(waiting, robotsDelay);
        }

        return waiting - timeSinceLastAccess;
    }

    /**
     * calculates how long to wait until the domain can be accessed again
     * this follows from:
     * - given minimum access times
     * - the fact that a url is a CGI url or not
     * - the number of times that the domain was accessed (flux factor)
     * - the response latency of the domain
     * - and a given minimum access time as given in robots.txt
     * @param agent
     * @return the remaining waiting time in milliseconds. can be negative to express how far the next possible loading time has already passed
     */
    public static int waitingRemaining(final DigestURL url, final RobotsTxt robots, final ClientIdentification.Agent agent) {

        // first check if the domain was _ever_ accessed before
        final Host host = host(url);
        if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer.MIN_VALUE because there is a cast to int somewhere

        // find the minimum waiting time based on the network domain (local or global)
        final boolean local = url.isLocal();
        int waiting = agent.minimumDelta;

        // if we have accessed the domain many times, get slower (the flux factor)
        if (!local) waiting += host.flux(waiting);

        // use the access latency as a rule for how fast we can access the server
        waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat(SwitchboardConstants.CRAWLER_LATENCY_FACTOR, 0.5f)));

        // if the number of occurrences of the url's host in the loading queue is greater than MaxSameHostInQueue, then increase waiting
        if (Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost()) > Switchboard.getSwitchboard().getConfigInt(SwitchboardConstants.CRAWLER_MAX_SAME_HOST_IN_QUEUE, 20)) waiting += 3000;

        // the time since last access to the domain is the basis of the remaining calculation
        final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());

        // find the delay as given by robots.txt on the target site
        final int robotsDelay = waitingRobots(url, robots, agent);
        if (robotsDelay < 0) return -timeSinceLastAccess; // no limits if granted exclusively for this peer

        waiting = Math.max(waiting, robotsDelay);
        return waiting - timeSinceLastAccess;
    }

    public static String waitingRemainingExplain(final DigestURL url, final RobotsTxt robots, final ClientIdentification.Agent agent) {

        // first check if the domain was _ever_ accessed before
        final Host host = host(url);
        if (host == null) return "host " + url.getHost() + " never accessed before -> Integer.MIN_VALUE"; // no delay if host is new

        final boolean local = url.isLocal();
        final StringBuilder s = new StringBuilder(50);

        // find the minimum waiting time based on the network domain (local or global)
        int waiting = agent.minimumDelta;
        s.append("minimumDelta = ").append(waiting);

        // if we have accessed the domain many times, get slower (the flux factor)
        if (!local) {
            final int flux = host.flux(waiting);
            waiting += flux;
            s.append(", flux = ").append(flux);
        }

        // use the access latency as a rule for how fast we can access the server
        // this applies also to localhost, but differently, because it is not necessary to
        // consider so many external accesses
        s.append(", host.average = ").append(host.average());
        waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat(SwitchboardConstants.CRAWLER_LATENCY_FACTOR, 0.5f)));

        // if the number of occurrences of the url's host in the loading queue is greater than MaxSameHostInQueue, then increase waiting
        final int hostcount = Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost());
        if (hostcount > Switchboard.getSwitchboard().getConfigInt(SwitchboardConstants.CRAWLER_MAX_SAME_HOST_IN_QUEUE, 20)) {
            s.append(", hostcount = ").append(hostcount);
            waiting += 3000;
        }

        // find the delay as given by robots.txt on the target site
        final int robotsDelay = waitingRobots(url, robots, agent);
        if (robotsDelay < 0) return "no waiting for exclusively granted peer"; // no limits if granted exclusively for this peer

        waiting = Math.max(waiting, robotsDelay);
        s.append(", robots.delay = ").append(robotsDelay);

        // the time since last access to the domain is the basis of the remaining calculation
        final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
        s.append(", ((waiting = ").append(waiting);
        s.append(") - (timeSinceLastAccess = ").append(timeSinceLastAccess).append(")) = ");
        s.append(waiting - timeSinceLastAccess);
        return s.toString();
    }

    /**
     * Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access.
     * The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
     * @param robots
     * @param profileEntry
     * @param crawlURL
     * @return the sleep time in milliseconds; may be negative for no sleep time
     */
    public static long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) {
        if (profileEntry == null) return 0;
        final long sleeptime = (
            profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
            (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
        ) ? Integer.MIN_VALUE : waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause a loading of robots.txt from the server
        return sleeptime;
    }

    /**
     * load a robots.txt to get the robots time.
     * ATTENTION: this method may cause a robots.txt to be loaded from the web, which can delay execution considerably.
     * It shall therefore not be called in synchronized environments.
     * @param robots
     * @param crawlURL
     * @param agent
     * @return the robots.txt sleep time in milliseconds; 0 if no waiting is required
     */
    public static long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, final ClientIdentification.Agent agent) {
        final long sleeptime = waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause a loading of robots.txt from the server
        return sleeptime < 0 ? 0 : sleeptime;
    }

    public static final class Host {
        private final AtomicLong timeacc;
        private final AtomicLong lastacc;
        private final AtomicInteger count;
        private final String host;
        private final long robotsMinDelay;

        private Host(final String host, final long time, final long robotsMinDelay) {
            this.host = host;
            this.timeacc = new AtomicLong(time);
            this.count = new AtomicInteger(1);
            this.lastacc = new AtomicLong(System.currentTimeMillis());
            this.robotsMinDelay = robotsMinDelay;
        }

        private void update(final long time) {
            if (this.count.get() > 100) {
                synchronized (this) {
                    // faster adaptation to new values
                    this.timeacc.set(this.timeacc.get() / this.count.get());
                    this.count.set(1);
                }
            }
            this.lastacc.set(System.currentTimeMillis());
            this.timeacc.addAndGet(Math.min(30000, time));
            this.count.incrementAndGet();
        }

        private void update() {
            this.lastacc.set(System.currentTimeMillis());
        }

        public int count() {
            return this.count.get();
        }

        public int average() {
            return (int) (this.timeacc.get() / this.count.get());
        }

        public long lastacc() {
            return this.lastacc.get();
        }

        public String host() {
            return this.host;
        }

        public long robotsDelay() {
            return this.robotsMinDelay;
        }

        /**
         * Used by the crawler to calculate an additional access delay for frequently accessed hosts.
         * The delay grows linearly from 0 up to (range / 2); an access count >= 10000 returns half of
         * the range parameter (e.g. count = 5000 with range = 1000 yields 250 ms).
         * @param range the current delay time
         * @return the additional delay in ms (max: range / 2)
         */
        public int flux(final int range) {
            return this.count.get() >= 10000 ? range >> 1 : (range * this.count.get() / 10000) >> 1;
        }
    }

}
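
For orientation, a minimal sketch of how a loader might consult this class around a single fetch (illustrative only; throttledFetch is hypothetical, and url, robots, and agent are assumed to be provided by the surrounding crawler):

    // Hypothetical caller (not YaCy code): throttle one fetch via Latency.
    static void throttledFetch(final DigestURL url, final RobotsTxt robots,
            final ClientIdentification.Agent agent) throws InterruptedException {
        final int wait = Latency.waitingRemaining(url, robots, agent);
        if (wait > 0) Thread.sleep(wait); // negative or Integer.MIN_VALUE: no waiting
        Latency.updateBeforeLoad(url);
        final long start = System.currentTimeMillis();
        // ... perform the HTTP fetch of url here ...
        Latency.updateAfterLoad(url, System.currentTimeMillis() - start);
    }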