Mirror of https://github.com/yacy/yacy_search_server.git, synced 2025-02-02 06:38:42 -05:00
e6a87e0426
A major problem when crawling is the long waiting time caused by crawl-delay values from robots.txt entries. That attribute is not supported by Google and is interpreted by Yandex and Bing in different ways. In large crawls there is always one host that blocks the whole crawl with extremely large values. YaCy still obeys crawl-delay but now limits it to 10 seconds. Additionally, the blocking logic used when loading new robots.txt files was analyzed and a deadlock was removed. Furthermore, the construction of new queue lists was redesigned to ensure that the loader is always provided with a large list of different hosts for host balancing.
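For illustration, the capping described above amounts to clamping the parsed Crawl-delay value to an upper bound of ten seconds. Here is a minimal standalone sketch of that arithmetic; the class and method names are illustrative and not part of the YaCy sources, the real logic lives in RobotsTxtParser.parse() in the file below:

// Illustrative sketch only: convert a robots.txt Crawl-delay value (seconds,
// possibly fractional) into milliseconds and cap it at 10 seconds, mirroring
// the Math.min(10000, ...) limit applied in RobotsTxtParser.parse() below.
public final class CrawlDelayClampExample {

    static long clampedCrawlDelayMillis(String delayValue) {
        try {
            long millis = (long) (1000.0 * Float.parseFloat(delayValue.trim()));
            return Math.min(10000L, millis); // hard upper bound of 10s
        } catch (NumberFormatException e) {
            return 0L; // invalid value: apply no extra delay
        }
    }

    public static void main(String[] args) {
        System.out.println(clampedCrawlDelayMillis("900")); // 10000 (capped from 15 minutes)
        System.out.println(clampedCrawlDelayMillis("2.5")); // 2500
    }
}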
289 lines
13 KiB
Java
/*
    robotsParser.java
    -------------------------------------
    part of YACY

    SPDX-FileCopyrightText: 2005, 2006 Alexander Schier
    SPDX-FileCopyrightText: 2005, 2006 Martin Thelian
    SPDX-License-Identifier: GPL-2.0-or-later

    last change: $LastChangedDate$LastChangedBy: orbiter $
    Revision: $LastChangedRevision$

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

    extended to return structured objects instead of an Object[] and
    extended to return an Allow-List by Michael Christen, 21.07.2008
    extended to allow multiple user agents given by definition and
    returning the used user agent by Michael Christen, 3.4.2011
*/

package net.yacy.crawler.robots;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.regex.Pattern;

import net.yacy.cora.document.encoding.UTF8;

/*
 * A class for parsing robots.txt files.
 * So far it only parses the Deny part.
 *
 * Robots RFC
 * http://www.robotstxt.org/wc/norobots-rfc.html
 *
 * TODO:
 * - If a request attempt results in a temporary failure, a robot
 *   should defer visits to the site until such time as the resource
 *   can be retrieved.
 *
 * - Extended Standard for Robot Exclusion
 *   See: http://www.conman.org/people/spc/robots2.html
 *
 * - Robot Exclusion Standard Revisited
 *   See: http://www.kollar.com/robots.html
 */

public final class RobotsTxtParser {

    private static final Pattern patternTab = Pattern.compile("\t");

    private static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
    private static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
    private static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
    private static final String ROBOTS_COMMENT = "#";
    private static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
    private static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase();

    private final ArrayList<String> allowList;
    private final ArrayList<String> denyList;
    private final ArrayList<String> sitemaps;
    private long crawlDelayMillis;
    private final String[] myNames; // a list of our own agent names
    private String agentName; // the name of the agent that was used to return the result

    protected RobotsTxtParser(final String[] myNames) {
        this.allowList = new ArrayList<>(0);
        this.denyList = new ArrayList<>(0);
        this.sitemaps = new ArrayList<>(0);
        this.crawlDelayMillis = 0;
        this.myNames = myNames;
        this.agentName = null;
    }

    protected RobotsTxtParser(final String[] myNames, final byte[] robotsTxt) {
        this(myNames);
        if (robotsTxt != null && robotsTxt.length != 0) {
            final ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt);
            final BufferedReader reader = new BufferedReader(new InputStreamReader(bin));
            this.parse(reader);
        }
    }

    private void parse(final BufferedReader reader) {
        final ArrayList<String> deny4AllAgents = new ArrayList<>();
        final ArrayList<String> deny4ThisAgents = new ArrayList<>();
        final ArrayList<String> allow4AllAgents = new ArrayList<>();
        final ArrayList<String> allow4ThisAgents = new ArrayList<>();

        int pos;
        String line = null, lineUpper = null;
        boolean isRule4AllAgents = false,
                isRule4ThisAgents = false,
                rule4ThisAgentsFound = false,
                inBlock = false;

        try {
            lineparser: while ((line = reader.readLine()) != null) {
                // replacing all tabs with spaces
                line = patternTab.matcher(line).replaceAll(" ").trim();
                lineUpper = line.toUpperCase();

                // parse empty line
                if (line.isEmpty()) {
                    // we have reached the end of the rule block
                    continue lineparser;
                }

                // parse comment
                if (line.startsWith(ROBOTS_COMMENT)) {
                    // we can ignore this. Just a comment line
                    continue lineparser;
                }

                // parse sitemap; if there are several sitemaps then take the first url
                // TODO: support for multiple sitemaps
                if (lineUpper.startsWith(ROBOTS_SITEMAP)) {
                    pos = line.indexOf(' ');
                    if (pos != -1) {
                        this.sitemaps.add(line.substring(pos).trim());
                    }
                    continue lineparser;
                }

                // parse user agent
                if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {

                    if (inBlock) {
                        // we have detected the start of a new block
                        inBlock = false;
                        isRule4AllAgents = false;
                        isRule4ThisAgents = false;
                        this.crawlDelayMillis = 0; // each block has a separate delay
                    }

                    // cutting off comments at the line end
                    pos = line.indexOf(ROBOTS_COMMENT);
                    if (pos != -1) line = line.substring(0, pos).trim();

                    // getting out the robot's name
                    pos = line.indexOf(' ');
                    if (pos != -1) {
                        final String userAgent = line.substring(pos).trim();
                        isRule4AllAgents |= userAgent.equals("*");
                        for (final String agent: this.myNames) {
                            if (userAgent.toLowerCase().equals(agent.toLowerCase())) {
                                this.agentName = agent;
                                isRule4ThisAgents = true;
                                break;
                            }
                        }
                        if (isRule4ThisAgents) rule4ThisAgentsFound = true;
                    }
                    continue lineparser;
                }

                // parse crawl delay:
                // The crawl-delay directive is a non-standard value and not supported by Google.
                // see: https://en.wikipedia.org/wiki/Robots_exclusion_standard#Crawl-delay_directive
                // The interpretation of other crawlers varies:
                // - Yandex interprets the value as the number of seconds to wait between subsequent visits.
                // - Bing defines crawl-delay as the size of a time window (from 1 to 30 seconds) during which BingBot will access a web site only once.
                // - Google provides an interface in its search console for webmasters to control the Googlebot's subsequent visits.
                if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
                    inBlock = true;
                    if (isRule4ThisAgents || isRule4AllAgents) {
                        pos = line.indexOf(' ');
                        if (pos != -1) {
                            try {
                                // the crawl delay can be a float number
                                this.crawlDelayMillis = (long) (1000.0 * Float.parseFloat(line.substring(pos).trim()));
                                // Because different crawlers apply different interpretations, we should do the same here for YaCy.
                                // Many robots.txt entries have crawl-delay values which make it impossible to crawl the site at all,
                                // i.e. values like "900" (900 seconds, which is 15 minutes). To be able to operate,
                                // we must do a "good-for-everyone" interpretation. This is already done with the "flux" compensation delay
                                // factor, which is applied additionally (it is twice the loading time of the last visit), so we would be fine
                                // even if we ignored the crawl-delay entirely.
                                this.crawlDelayMillis = Math.min(10000, this.crawlDelayMillis);
                                // Here we limit the delay to at most 10 seconds. If a host does not want to be indexed at all, there is
                                // an option for that in the robots.txt as well.
                            } catch (final NumberFormatException e) {
                                // invalid crawl delay
                            }
                        }
                    }
                    continue lineparser;
                }

                // parse disallow / allow
                if (lineUpper.startsWith(ROBOTS_DISALLOW) || lineUpper.startsWith(ROBOTS_ALLOW)) {
                    inBlock = true;
                    final boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);

                    if (isRule4ThisAgents || isRule4AllAgents) {
                        // cutting off comments at the line end
                        pos = line.indexOf(ROBOTS_COMMENT);
                        if (pos != -1) line = line.substring(0, pos).trim();

                        // cut off a trailing *
                        if (line.endsWith("*")) line = line.substring(0, line.length() - 1);

                        // parse the path
                        pos = line.indexOf(' ');
                        if (pos >= 0) {
                            // getting the path
                            String path = line.substring(pos).trim();

                            // decoding all special chars
                            try {
                                path = UTF8.decodeURL(path);
                            } catch (final Exception e) {
                                /*
                                 * url decoding failed, e.g. because of an
                                 * "Incomplete trailing escape (%) pattern"
                                 */
                            }

                            // escaping all occurrences of ';' because this char is used as a special char in the Robots DB
                            path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B");

                            // adding it to the path list
                            if (isDisallowRule) {
                                if (isRule4AllAgents) deny4AllAgents.add(path);
                                if (isRule4ThisAgents) deny4ThisAgents.add(path);
                            } else {
                                if (isRule4AllAgents) allow4AllAgents.add(path);
                                if (isRule4ThisAgents) allow4ThisAgents.add(path);
                            }
                        }
                    }
                    continue lineparser;
                }
            }
        } catch (final IOException e) {}

        this.allowList.addAll(rule4ThisAgentsFound ? allow4ThisAgents : allow4AllAgents);
        this.denyList.addAll(rule4ThisAgentsFound ? deny4ThisAgents : deny4AllAgents);
    }

    /**
     * A crawl delay can be assigned to every agent or to all agents.
     * A special case is when the user agent of this YaCy peer is given explicitly
     * using the peer name; if the crawl delay is then given as '0', the crawler
     * does not apply any forced anti-DoS crawl pause.
     * @return the crawl delay between two crawl access times in milliseconds
     */
    protected long crawlDelayMillis() {
        return this.crawlDelayMillis;
    }

    /**
     * The user agent that was applied to get the crawl properties is recorded,
     * because this robots.txt parser may apply to several user agents,
     * e.g. 'yacy', 'yacybot', <peer-name>'.yacy' or <peer-hash>'.yacyh'.
     * Effects: see also the comment on crawlDelayMillis().
     * @return the name of the user agent that was used for the result properties, or null if no user agent name was used to identify the agent
     */
    protected String agentName() {
        return this.agentName;
    }

    protected ArrayList<String> sitemap() {
        return this.sitemaps;
    }

    protected ArrayList<String> allowList() {
        return this.allowList;
    }

    protected ArrayList<String> denyList() {
        return this.denyList;
    }
}
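For reference, a hypothetical usage sketch of the parser above. The example class, the robots.txt body, and the agent name "yacybot" are made up for illustration; the code would have to live in the net.yacy.crawler.robots package (alongside the YaCy sources) because the constructor and accessors are protected:

package net.yacy.crawler.robots;

import java.nio.charset.StandardCharsets;

// Hypothetical usage sketch: parse a robots.txt body for the "yacybot" agent
// and inspect the resulting deny list and the capped crawl delay.
public class RobotsTxtParserExample {
    public static void main(String[] args) {
        final String robotsTxt =
                "User-agent: yacybot\n" +
                "Disallow: /private/\n" +
                "Crawl-delay: 900\n";
        final RobotsTxtParser parser = new RobotsTxtParser(
                new String[]{"yacybot"},
                robotsTxt.getBytes(StandardCharsets.UTF_8));
        System.out.println("agent:       " + parser.agentName());        // yacybot
        System.out.println("deny list:   " + parser.denyList());         // [/private/]
        System.out.println("crawl delay: " + parser.crawlDelayMillis()); // 10000 (capped)
    }
}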