mirror of
synced 2025-02-02 06:38:42 -05:00
A main problem when crawling is the long waiting time caused by crawl-delay values from robots.txt entries. That attribute is not supported by Google and is interpreted by Yandex and Bing in different ways. In large crawls there is always one host which blocks the whole crawl with extremely large values. YaCy now still obeys crawl-delay but limits it to 10 seconds. Additionally, the blocking logic when loading a new robots.txt was analyzed and a deadlock was removed. Furthermore, the construction of new queue lists was redesigned, and it was ensured that a large list of different hosts is always provided to the loader for host-balancing.
289 lines
13 KiB
289 lines
13 KiB
part of YACY
SPDX-FileCopyrightText: 2005, 2006 Alexander Schier
SPDX-FileCopyrightText: 2005, 2006 Martin Thelian
SPDX-License-Identifier: GPL-2.0-or-later
last change: $LastChangedDate$LastChangedBy: orbiter $
Revision: $LastChangedRevision$
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
extended to return structured objects instead of an Object[] and
extended to return an Allow-List by Michael Christen, 21.07.2008
extended to allow multiple user agents given by definition and
returning the used user agent by Michael Christen 3.4.2011
package net.yacy.crawler.robots;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.regex.Pattern;
import net.yacy.cora.document.encoding.UTF8;
* A class for parsing robots.txt files.
* It evaluates the User-agent, Disallow, Allow, Sitemap and Crawl-delay lines.
* Robots RFC
* http://www.robotstxt.org/wc/norobots-rfc.html
* - On the request attempt resulted in temporary failure a robot
* should defer visits to the site until such time as the resource
* can be retrieved.
* - Extended Standard for Robot Exclusion
* See: http://www.conman.org/people/spc/robots2.html
* - Robot Exclusion Standard Revisited
* See: http://www.kollar.com/robots.html
public final class RobotsTxtParser {
// matches a single tab character; parse() uses it to turn tabs into spaces before a line is examined
private static final Pattern patternTab = Pattern.compile("\t");
// Directive keywords. They are stored upper-cased because parse() compares them against the
// upper-cased form of each input line, which makes the keyword match case-insensitive.
// NOTE(review): toUpperCase() is called without a Locale here and on the input line in parse();
// both therefore follow the JVM default locale - confirm this is intended.
private static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
private static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
private static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
// a line starting with this marker is skipped; the marker also ends the payload of a rule line
private static final String ROBOTS_COMMENT = "#";
private static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
private static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase();
// paths taken from Allow rules; filled once at the end of parse()
private final ArrayList<String> allowList;
// paths taken from Disallow rules; filled once at the end of parse()
private final ArrayList<String> denyList;
// list handed out by sitemap()
private final ArrayList<String> sitemaps;
// crawl delay in milliseconds; parse() resets it for every new user-agent block and caps it at 10000
private long crawlDelayMillis;
private final String[] myNames; // a list of own name lists
private String agentName; // the name of the agent that was used to return the result
/**
 * Creates a parser with empty result lists, a crawl delay of zero and no matched agent name.
 *
 * @param myNames the user agent names this crawler answers to. {@code null} is treated like
 *                an empty array, so that only rules for the wildcard agent can match instead
 *                of failing with a NullPointerException when the names are iterated later.
 */
protected RobotsTxtParser(final String[] myNames) {
    this.allowList = new ArrayList<>(0);
    this.denyList = new ArrayList<>(0);
    this.sitemaps = new ArrayList<>(0);
    this.crawlDelayMillis = 0;
    // defensive copy: later changes to the caller's array must not alter the agent matching
    this.myNames = (myNames == null) ? new String[0] : myNames.clone();
    this.agentName = null;
}
/**
 * Creates a parser and immediately evaluates the given robots.txt content.
 *
 * @param myNames   the user agent names this crawler answers to
 * @param robotsTxt the raw bytes of the robots.txt file; {@code null} or an empty array
 *                  leaves the parser with empty results
 */
protected RobotsTxtParser(final String[] myNames, final byte[] robotsTxt) {
    // the final fields are initialised by the primary constructor
    this(myNames);
    if (robotsTxt == null || robotsTxt.length == 0) {
        return;
    }

    // Skip a leading UTF-8 byte order mark; otherwise it would stick to the first
    // directive keyword and that line would not be recognised.
    int offset = 0;
    if (robotsTxt.length >= 3
            && (robotsTxt[0] & 0xFF) == 0xEF
            && (robotsTxt[1] & 0xFF) == 0xBB
            && (robotsTxt[2] & 0xFF) == 0xBF) {
        offset = 3;
    }

    final ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt, offset, robotsTxt.length - offset);
    // robots.txt is specified as UTF-8; decode explicitly instead of relying on the
    // platform default charset, which differs between installations
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(bin, java.nio.charset.StandardCharsets.UTF_8))) {
        parse(reader);
    } catch (final IOException ignored) {
        // parse() handles read errors itself; only close() of an in-memory stream
        // could end up here and there is nothing left to release in that case
    }
}
/**
 * Reads a robots.txt line by line and collects the allow paths, deny paths and the crawl
 * delay that apply either to one of the agent names in myNames or to the wildcard agent "*".
 * Once the input is exhausted, the lists collected for an own agent name are used if a block
 * for such a name was seen; otherwise the lists collected for the wildcard agent are used.
 *
 * @param reader source of the robots.txt text; it is read until readLine() returns null
 */
private void parse(final BufferedReader reader) {
// rules are collected separately for the wildcard agent and for the own agent names
final ArrayList<String> deny4AllAgents = new ArrayList<>();
final ArrayList<String> deny4ThisAgents = new ArrayList<>();
final ArrayList<String> allow4AllAgents = new ArrayList<>();
final ArrayList<String> allow4ThisAgents = new ArrayList<>();
int pos;
String line = null, lineUpper = null;
// isRule4AllAgents / isRule4ThisAgents: the current block addresses "*" / one of the own names
// rule4ThisAgentsFound: at least one block for an own name was seen so far
// inBlock: a rule line was seen since the last group of User-agent lines
boolean isRule4AllAgents = false,
isRule4ThisAgents = false,
rule4ThisAgentsFound = false,
inBlock = false;
try {
lineparser: while ((line = reader.readLine()) != null) {
// replacing all tabs with spaces
line = patternTab.matcher(line).replaceAll(" ").trim();
// upper-cased copy, used only for the case-insensitive keyword checks
lineUpper = line.toUpperCase();
// parse empty line
if (line.isEmpty()) {
// we have reached the end of the rule block
continue lineparser;
// parse comment
if (line.startsWith(ROBOTS_COMMENT)) {
// we can ignore this. Just a comment line
continue lineparser;
// parse sitemap; if there are several sitemaps then take the first url
// TODO: support for multiple sitemaps
// NOTE(review): as shown here nothing is stored inside this branch, so this.sitemaps
// is not filled by this method - confirm against the complete file
if (lineUpper.startsWith(ROBOTS_SITEMAP)) {
pos = line.indexOf(' ');
if (pos != -1) {
continue lineparser;
// parse user agent
if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
if (inBlock) {
// we have detected the start of a new block
inBlock = false;
isRule4AllAgents = false;
isRule4ThisAgents = false;
this.crawlDelayMillis = 0; // each block has a separate delay
// cutting off comments at the line end
pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim();
// getting out the robots name
pos = line.indexOf(' ');
if (pos != -1) {
final String userAgent = line.substring(pos).trim();
// several consecutive User-agent lines accumulate, hence the |=
isRule4AllAgents |= userAgent.equals("*");
// case-insensitive comparison against every own agent name
for (final String agent: this.myNames) {
if (userAgent.toLowerCase().equals(agent.toLowerCase())) {
this.agentName = agent;
isRule4ThisAgents = true;
if (isRule4ThisAgents) rule4ThisAgentsFound = true;
continue lineparser;
// Parse crawl delay:
// The crawl-delay directive is a non-standard value and not supported by google
// see: https://en.wikipedia.org/wiki/Robots_exclusion_standard#Crawl-delay_directive
// The interpretation of other crawlers vary:
// - Yandex interprets the value as the number of seconds to wait between subsequent visits.
// - Bing defines crawl-delay as the size of a time window (from 1 to 30 seconds) during which BingBot will access a web site only once.
// - Google provides an interface in its search console for webmasters, to control the Googlebot's subsequent visits.
if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
inBlock = true;
if (isRule4ThisAgents || isRule4AllAgents) {
pos = line.indexOf(' ');
if (pos != -1) {
try {
// the crawl delay can be a float number
this.crawlDelayMillis = (long) (1000.0 * Float.parseFloat(line.substring(pos).trim()));
// Because different crawlers apply different interpretations, we should do the same here for YaCy
// Many robots.txt entries have crawl-delay entries which makes it impossible to crawl the page at all,
// i.e. for values like "900" (would be 900 seconds which would be 15 minutes). To be able to operate,
// we must do a "good-for-everyone" interpretation. This is done already with the "flux"-compensation delay
// factor, which is applied additionally (its the 2-fold of the loading time from last visit), so we would be good
// even if we ignore the crawl-delay at all.
// NOTE(review): only the upper bound is enforced; a negative number in the file yields a negative delay
this.crawlDelayMillis = Math.min(10000, this.crawlDelayMillis);
// In this case we limit the delay to 10 seconds most. If a host wants not to be indexed at all, there is
// an option for this in the robots.txt as well.
} catch (final NumberFormatException e) {
// invalid crawling delay
continue lineparser;
// parse disallow
// (Allow lines take the same path; isDisallowRule tells the two apart)
if (lineUpper.startsWith(ROBOTS_DISALLOW) || lineUpper.startsWith(ROBOTS_ALLOW)) {
inBlock = true;
final boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);
if (isRule4ThisAgents || isRule4AllAgents) {
// cutting off comments at the line end
pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim();
// cut off trailing *
if (line.endsWith("*")) line = line.substring(0,line.length()-1);
// parse the path
pos = line.indexOf(' ');
if (pos >= 0) {
// getting the path
String path = line.substring(pos).trim();
// unencoding all special chars
try {
path = UTF8.decodeURL(path);
} catch (final Exception e) {
* url decoding failed. E.g. because of
* "Incomplete trailing escape (%) pattern"
// escaping all occurrences of ; because this char is used as special char in the Robots DB
path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B");
// adding it to the pathlist
if (isDisallowRule) {
if (isRule4AllAgents) deny4AllAgents.add(path);
if (isRule4ThisAgents) deny4ThisAgents.add(path);
} else {
if (isRule4AllAgents) allow4AllAgents.add(path);
if (isRule4ThisAgents) allow4ThisAgents.add(path);
continue lineparser;
// a read error is ignored; whatever was collected up to that point is still used below
} catch (final IOException e) {}
// rules for an own agent name replace the wildcard rules as soon as such a block was seen
this.allowList.addAll(rule4ThisAgentsFound ? allow4ThisAgents : allow4AllAgents);
this.denyList.addAll(rule4ThisAgentsFound ? deny4ThisAgents : deny4AllAgents);
* a crawl delay can be assigned to every agent or for all agents
* a special case is where the user agent of this yacy peer is given explicitely
* using the peer name and then if the crawl delay is given as '0' the crawler
* does not make any no-DOS-forced crawl pause.
* @return the crawl delay between two crawl access times in milliseconds
protected long crawlDelayMillis() {
return this.crawlDelayMillis;
* the user agent that was applied to get the crawl properties is recorded
* because it is possible that this robots.txt parser applies to several user agents
* which may be i.e. 'yacy', 'yacybot', <peer-name>'.yacy' or <peer-hash>'.yacyh'
* Effects: see also comment to crawlDelayMillis()
* @return the name of the user agent that was used for the result properties or null if no user agent name was used to identify the agent
protected String agentName() {
return this.agentName;
protected ArrayList<String> sitemap() {
return this.sitemaps;
protected ArrayList<String> allowList() {
return this.allowList;
protected ArrayList<String> denyList() {
return this.denyList;