
Redesign of crawler identification and robots steering. A non-p2p user
in intranets and the internet can now choose to appear as Googlebot.
This is essential for competing in the field of commercial search
appliances, since most web pages these days are optimized only for
Google and no other search platform. All commercial search engine
providers have a built-in fake Google user agent so that they can build
the same search index that Google can. Without the option to resist
strict robots.txt obedience in this case, no competition is possible
any more. YaCy will always obey robots.txt when it is used for crawling
the web in a peer-to-peer network, but to establish a search appliance
(like a Google Search Appliance, GSA) it is necessary to be able to
behave exactly like a Google crawler.
With this change, the user agent can be switched on a per-crawl-start
basis when portal or intranet mode is selected. Every crawl start can
have a different user agent.
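
For orientation, a minimal sketch (not part of the commit) of how a servlet resolves the selected
agent after this change; it uses only calls that appear in the diff below (serverObjects.get,
ClientIdentification.getAgent and the public Agent fields), while the class name is illustrative:

import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.server.serverObjects;

public class AgentSelectionSketch {

    /**
     * Resolve the crawler identity chosen in the crawl-start form.
     * "agentName" is the form parameter introduced by this commit; when it is
     * absent, the cautious YaCy internet crawler identity is used as default.
     */
    public static ClientIdentification.Agent resolveAgent(final serverObjects post) {
        final String agentName = post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName);
        final ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
        // The Agent bundles what used to be spread over static settings:
        // agent.userAgent     - the HTTP User-Agent string that is sent
        // agent.robotIDs      - the names matched against robots.txt records
        // agent.minimumDelta  - minimum delay (ms) between accesses to one host
        // agent.clientTimeout - connection/read timeout (ms)
        return agent;
    }
}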
This commit is contained in:
Michael Peter Christen
2013-08-22 14:23:47 +02:00
parent 0f3d8890db
commit 765943a4b7
82 changed files with 509 additions and 466 deletions
defaults
htroot
source/net/yacy

@ -266,15 +266,6 @@ dictionaries = DATA/DICTIONARIES
# storage place for new releases
releases = DATA/RELEASE
# time limits for the crawler:
# these times (milliseconds) are the shortest times for an access of the crawler to the same domain
# the crawler may read files faster than that, but never from the same domain faster than these time intervals
# a delta of 500 milliseconds means that no more than two files are taken from the same server
# there is a hard-coded limit which prevents that the used time is shorter that these default times
# the time-limits are distinguished for local and global crawls: there is no limit for an intranet-crawl.
minimumLocalDelta = 0
minimumGlobalDelta = 500
# the following mime-types are a blacklist for indexing:
# parser.mime.deny: specifies mime-types that shall not be indexed
parser.mime.deny=
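
The minimum-delta comments and the minimumLocalDelta/minimumGlobalDelta keys above are removed from
yacy.init by this commit; per the ClientIdentification changes further down, the delays become
per-agent constants instead of configurable settings. A hedged restatement of the new defaults
(values taken from the ClientIdentification hunk below):

// minimum time (ms) between two accesses to the same host, now bound to the agent
public static final int minimumLocalDeltaInit  = 10;   // local (intranet) hosts
public static final int minimumGlobalDeltaInit = 500;  // global (internet) hosts
// each Agent carries its own minimumDelta, e.g. the Googlebot agent uses minimumGlobalDeltaInit / 2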

@ -57,7 +57,6 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.peers.NewsPool;
import net.yacy.search.Switchboard;
import net.yacy.search.snippet.TextSnippet;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@ -183,6 +182,7 @@ public class Bookmarks {
prop.put("mode", "2");
prop.put("display", "1");
display = 1;
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
if (urlHash.isEmpty()) {
prop.put("mode_edit", "0"); // create mode
prop.putHTML("mode_title", post.get("title"));
@ -202,7 +202,7 @@ public class Bookmarks {
// try to get the bookmark from the LURL database
final URIMetadataNode urlentry = sb.index.fulltext().getMetadata(ASCII.getBytes(urlHash));
if (urlentry != null) try {
final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT));
final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, agent));
prop.put("mode_edit", "0"); // create mode
prop.put("mode_url", urlentry.url().toNormalform(false));
prop.putHTML("mode_title", urlentry.dc_title());

@ -102,7 +102,7 @@ public class ConfigAppearance_p {
final Iterator<String> it;
try {
final DigestURI u = new DigestURI(url);
it = FileUtils.strings(u.get(ClientIdentification.getUserAgent(), 10000));
it = FileUtils.strings(u.get(ClientIdentification.yacyInternetCrawlerAgent));
} catch (final IOException e) {
prop.put("status", "1");// unable to get URL
prop.put("status_url", url);

@ -101,7 +101,7 @@ public class ConfigLanguage_p {
Iterator<String> it;
try {
final DigestURI u = new DigestURI(url);
it = FileUtils.strings(u.get(ClientIdentification.getUserAgent(), 10000));
it = FileUtils.strings(u.get(ClientIdentification.yacyInternetCrawlerAgent));
} catch(final IOException e) {
prop.put("status", "1");//unable to get url
prop.put("status_url", url);

@ -72,6 +72,7 @@ public class CrawlCheck_p {
prop.put("table", 0);
} else {
prop.put("table", 1);
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
// make a string that is used to fill the starturls field again
// and analyze the urls to make the table rows
@ -84,22 +85,22 @@ public class CrawlCheck_p {
// try to load the robots
RobotsTxtEntry robotsEntry;
boolean robotsAllowed = true;
robotsEntry = sb.robots.getEntry(u, sb.peers.myBotIDs());
robotsEntry = sb.robots.getEntry(u, agent);
if (robotsEntry == null) {
prop.put("table_list_" + row + "_robots", "no robots");
prop.put("table_list_" + row + "_crawldelay", ClientIdentification.minLoadDelay() + " ms");
prop.put("table_list_" + row + "_crawldelay", agent.minimumDelta + " ms");
prop.put("table_list_" + row + "_sitemap", "");
} else {
robotsAllowed = !robotsEntry.isDisallowed(u);
prop.put("table_list_" + row + "_robots", "robots exist: " + (robotsAllowed ? "crawl allowed" : "url disallowed"));
prop.put("table_list_" + row + "_crawldelay", Math.max(ClientIdentification.minLoadDelay(), robotsEntry.getCrawlDelayMillis()) + " ms");
prop.put("table_list_" + row + "_crawldelay", Math.max(agent.minimumDelta, robotsEntry.getCrawlDelayMillis()) + " ms");
prop.put("table_list_" + row + "_sitemap", robotsEntry.getSitemap() == null ? "-" : robotsEntry.getSitemap().toNormalform(true));
}
// try to load the url
if (robotsAllowed) try {
Request request = sb.loader.request(u, true, false);
final Response response = sb.loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
final Response response = sb.loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, agent);
if (response == null) {
prop.put("table_list_" + row + "_access", "no response");
} else {

@ -263,6 +263,27 @@
</dd>
</dl>
</fieldset>
#(agentSelect)#<input type="hidden" name="agentName" id="agentName" value="#[defaultAgentName]#" />::
<fieldset>
<legend><label>Robot Behaviour</label></legend>
<dl>
<dt><label for="collection">Use Special User Agent and robot identification</label></dt>
<dd>
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
You are running YaCy in non-p2p mode and because YaCy can be used as replacement for commercial search appliances
(like the GSA) the user must be able to crawl all web pages that are granted to such commercial plattforms.
Not having this option would be a strong handicap for professional usage of this software. Therefore you are able to select
alternative user agents here which have different crawl timings and also identify itself with another user agent and obey the corresponding robots rule.
</span></span>
<select name="agentName" id="agentName">
#{list}#
<option value="#[name]#">#[name]#</option>
#{/list}#
</select>
</dd>
</dl>
</fieldset>
#(/agentSelect)#
<fieldset>
<legend><label>Index Administration</label></legend>
<dl>
@ -310,6 +331,7 @@
</span></span>
<input name="collection" id="collection" type="text" size="60" maxlength="100" value="#[collection]#" #(collectionEnabled)#disabled="disabled"::#(/collectionEnabled)# />
</dd>
</dl>
</fieldset>

@ -24,6 +24,10 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.util.ArrayList;
import java.util.List;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.search.Switchboard;
@ -82,8 +86,22 @@ public class CrawlStartExpert_p {
boolean collectionEnabled = sb.index.fulltext().getDefaultConfiguration().isEmpty() || sb.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.collection_sxt);
prop.put("collectionEnabled", collectionEnabled ? 1 : 0);
prop.put("collection", collectionEnabled ? "user" : "");
prop.put("collection", collectionEnabled ? "user" : "");
if (sb.isP2PMode()) {
prop.put("agentSelect", 0);
} else {
prop.put("agentSelect", 1);
List<String> agentNames = new ArrayList<String>();
if (sb.isIntranetMode()) agentNames.add(ClientIdentification.yacyIntranetCrawlerAgentName);
if (sb.isGlobalMode()) agentNames.add(ClientIdentification.yacyInternetCrawlerAgentName);
agentNames.add(ClientIdentification.googleAgentName);
if (sb.isAllIPMode()) agentNames.add(ClientIdentification.browserAgentName);
for (int i = 0; i < agentNames.size(); i++) {
prop.put("agentSelect_list_" + i + "_name", agentNames.get(i));
}
prop.put("agentSelect_list", agentNames.size());
}
prop.put("agentSelect_defaultAgentName", ClientIdentification.yacyInternetCrawlerAgentName);
// return rewrite properties
return prop;
}

@ -264,6 +264,9 @@ public class Crawler_p {
env.setConfig("indexMedia", indexMedia);
env.setConfig("storeHTCache", storeHTCache);
String agentName = post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName);
ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
CacheStrategy cachePolicy = CacheStrategy.parse(post.get("cachePolicy", "iffresh"));
if (cachePolicy == null) cachePolicy = CacheStrategy.IFFRESH;
@ -290,7 +293,7 @@ public class Crawler_p {
// download document
Document scraper;
try {
scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, agent);
// get links and generate filter
for (DigestURI u: scraper.getAnchors().keySet()) {
newRootURLs.add(u);
@ -375,7 +378,8 @@ public class Crawler_p {
storeHTCache,
crawlOrder,
cachePolicy,
collection);
collection,
agentName);
byte[] handle = ASCII.getBytes(profile.handle());
// before we fire up a new crawl, we make sure that another crawl with the same name is not running

@ -66,7 +66,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geon0Load")) {
// load from the net
try {
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file());
LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON0.file(), null, -1));
@ -108,7 +108,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geon1Load")) {
// load from the net
try {
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEON1.file());
LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON1.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON1.file(), null, -1));
@ -150,7 +150,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geon2Load")) {
// load from the net
try {
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEON2.file());
LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON2.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON2.file(), null, 100000));
@ -192,7 +192,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geo1Load")) {
// load from the net
try {
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file());
LibraryProvider.geoLoc.deactivateLocalization(LibraryProvider.Dictionary.GEODB1.nickname);
@ -235,7 +235,7 @@ public class DictionaryLoader_p {
if (post.containsKey("drw0Load")) {
// load from the net
try {
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.DRW0.file());
LibraryProvider.activateDeReWo();
@ -279,7 +279,7 @@ public class DictionaryLoader_p {
if (post.containsKey("pnd0Load")) {
// load from the net
try {
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.PND0.file());
LibraryProvider.activatePND();

@ -34,6 +34,7 @@ import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ByteBuffer;
@ -95,6 +96,7 @@ public class IndexControlRWIs_p {
Segment segment = sb.index;
if ( post != null ) {
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
final String keystring = post.get("keystring", "").trim();
byte[] keyhash = post.get("keyhash", "").trim().getBytes();
if (keystring.length() > 0 && !keystring.contains(errmsg)) {
@ -175,7 +177,7 @@ public class IndexControlRWIs_p {
index = null;
}
if ( delurlref ) {
segment.removeAllUrlReferences(urlb, sb.loader, CacheStrategy.IFEXIST);
segment.removeAllUrlReferences(urlb, sb.loader, agent, CacheStrategy.IFEXIST);
}
// delete the word first because that is much faster than the deletion of the urls from the url database
segment.termIndex().delete(keyhash);
@ -196,7 +198,7 @@ public class IndexControlRWIs_p {
if ( post.containsKey("keyhashdelete") ) {
try {
if ( delurlref ) {
segment.removeAllUrlReferences(urlb, sb.loader, CacheStrategy.IFEXIST);
segment.removeAllUrlReferences(urlb, sb.loader, agent, CacheStrategy.IFEXIST);
}
if ( delurl || delurlref ) {
for ( final byte[] b : urlb ) {

@ -36,6 +36,7 @@ import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.lod.JenaTripleStore;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.util.ConcurrentLog;
@ -177,7 +178,8 @@ public class IndexControlURLs_p {
}
if (post.containsKey("urlhashdeleteall")) {
int i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, CacheStrategy.IFEXIST);
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
int i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, agent, CacheStrategy.IFEXIST);
prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
}

@ -26,6 +26,7 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.importer.OAIListFriendsLoader;
import net.yacy.document.importer.OAIPMHImporter;
@ -45,7 +46,8 @@ public class IndexImportOAIPMHList_p {
prop.put("source", 0);
if (post != null && post.containsKey("source")) {
final Set<String> oaiRoots = OAIListFriendsLoader.getListFriends(sb.loader).keySet();
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
final Set<String> oaiRoots = OAIListFriendsLoader.getListFriends(sb.loader, agent).keySet();
boolean dark = false;
int count = 0;

@ -31,6 +31,7 @@ import java.util.Random;
import java.util.Set;
import java.util.TreeSet;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.WorkTables;
@ -61,7 +62,8 @@ public class IndexImportOAIPMH_p {
DigestURI url = null;
try {
url = new DigestURI(oaipmhurl);
final OAIPMHLoader r = new OAIPMHLoader(sb.loader, url, sb.surrogatesInPath);
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
final OAIPMHLoader r = new OAIPMHLoader(sb.loader, url, sb.surrogatesInPath, agent);
final ResumptionToken rt = r.getResumptionToken();
prop.put("import-one", 1);
prop.put("import-one_count", (rt == null) ? "not available" : Integer.toString(rt.getRecordCounter()));
@ -95,7 +97,8 @@ public class IndexImportOAIPMH_p {
DigestURI url = null;
try {
url = new DigestURI(oaipmhurl);
final OAIPMHImporter job = new OAIPMHImporter(sb.loader, url);
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
final OAIPMHImporter job = new OAIPMHImporter(sb.loader, agent, url);
job.start();
prop.put("status", 1);
prop.put("optiongetlist", 1);
@ -127,11 +130,12 @@ public class IndexImportOAIPMH_p {
// start jobs for the sources
DigestURI url = null;
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
while (!sourceList.isEmpty()) {
final String oaipmhurl = sourceList.remove(r.nextInt(sourceList.size()));
try {
url = new DigestURI(oaipmhurl);
final OAIPMHImporter job = new OAIPMHImporter(sb.loader, url);
final OAIPMHImporter job = new OAIPMHImporter(sb.loader, agent, url);
job.start();
} catch (final MalformedURLException e) {
ConcurrentLog.logException(e);

@ -154,6 +154,7 @@ public class Load_RSS_p {
}
if (post != null && post.containsKey("addSelectedFeedScheduler")) {
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
for (final Map.Entry<String, String> entry: post.entrySet()) {
if (entry.getValue().startsWith("mark_")) {
Row row;
@ -175,7 +176,7 @@ public class Load_RSS_p {
continue;
}
// load feeds concurrently to get better responsibility in web interface
new RSSLoader(sb, url, collections).start();
new RSSLoader(sb, url, collections, agent).start();
}
}
}
@ -262,11 +263,13 @@ public class Load_RSS_p {
ConcurrentLog.warn("Load_RSS_p", "url not well-formed: '" + post.get("url", "") + "'");
}
ClientIdentification.Agent agent = post == null ? ClientIdentification.yacyInternetCrawlerAgent : ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
// if we have an url then try to load the rss
RSSReader rss = null;
if (url != null) try {
prop.put("url", url.toNormalform(true));
final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, agent);
final byte[] resource = response == null ? null : response.getContent();
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
} catch (final IOException e) {

@ -382,7 +382,7 @@ public class Network {
prop.put(STR_TABLE_LIST + conCount + "_ssl", (seed.getFlagSSLAvailable()) ? 1 : 0);
userAgent = null;
if (seed.hash != null && seed.hash.equals(sb.peers.mySeed().hash)) {
userAgent = ClientIdentification.getUserAgent();
userAgent = ClientIdentification.yacyInternetCrawlerAgent.userAgent;
location = ClientIdentification.generateLocation();
} else {
userAgent = sb.peers.peerActions.getUserAgent(seed.getIP());

@ -123,34 +123,6 @@
</fieldset>
</form>
<form action="PerformanceQueues_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset><legend>Balancer Settings:</legend>
<p>
This is the time delta between accessing of the same domain during a crawl. The crawl balancer tries to avoid that domains are
accessed too often, but if the balancer fails (i.e. if there are only links left from the same domain), then these minimum
delta times are ensured.
</p>
<table border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
<td>Crawler Domain</td>
<td>Minimum Access Time Delta</td>
</tr>
<tr class="TableCellDark">
<td>local (intranet) crawls</td>
<td align="right"><input id="minimumLocalDelta" name="minimumLocalDelta" type="text" size="20" maxlength="100" value="#[minimumLocalDelta]#" /></td>
</tr>
<tr class="TableCellDark">
<td>global (internet) crawls</td>
<td align="right"><input id="minimumGlobalDelta" name="minimumGlobalDelta" type="text" size="20" maxlength="100" value="#[minimumGlobalDelta]#" /></td>
</tr>
<tr class="TableCellLight">
<td align="left" colspan="2"><input type="submit" name="minimumDeltaSubmit" value="Enter New Parameters" />
Changes take effect immediately</td>
</tr>
</table>
</fieldset>
</form>
<form action="PerformanceQueues_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset><legend>Thread Pool Settings:</legend>
<table border="0" cellpadding="2" cellspacing="1">

@ -290,18 +290,6 @@ public class PerformanceQueues_p {
sb.setConfig(SwitchboardConstants.REMOTESEARCH_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseRemotesearch", 30000)));
}
if ((post != null) && (post.containsKey("minimumDeltaSubmit"))) {
final int minimumLocalDelta = post.getInt("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta());
final int minimumGlobalDelta = post.getInt("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta());
sb.setConfig("minimumLocalDelta", minimumLocalDelta);
sb.setConfig("minimumGlobalDelta", minimumGlobalDelta);
sb.crawlQueues.noticeURL.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
}
// delta settings
prop.put("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta());
prop.put("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta());
// table cache settings
prop.putNum("wordCacheSize", indexSegment.RWIBufferCount());
prop.putNum("wordCacheSizeKBytes", rwi == null ? 0 : rwi.getBufferSizeBytes()/1024);

@ -34,6 +34,7 @@ import java.util.Date;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
@ -151,7 +152,8 @@ public class QuickCrawlLink_p {
indexText, indexMedia,
storeHTCache, remoteIndexing,
CacheStrategy.IFFRESH,
collection);
collection,
ClientIdentification.yacyIntranetCrawlerAgentName);
sb.crawler.putActive(pe.handle().getBytes(), pe);
} catch (final Exception e) {
// mist

@ -168,7 +168,8 @@ public class ViewFile {
Response response = null;
try {
response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null, agent);
} catch (final IOException e) {
prop.put("error", "4");
prop.put("error_errorText", "error loading resource: " + e.getMessage());

@ -104,7 +104,8 @@ public class ViewImage {
if (image == null) {
byte[] resourceb = null;
if (url != null) try {
resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH, agent);
} catch (final IOException e) {
ConcurrentLog.fine("ViewImage", "cannot load: " + e.getMessage());
}

@ -74,7 +74,9 @@ public class getpageinfo {
if (post != null && post.containsKey("url")) {
if (post.containsKey("actions"))
actions=post.get("actions");
String url=post.get("url");
String url = post.get("url");
String agentName = post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName);
ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
if (url.toLowerCase().startsWith("ftp://")) {
prop.put("robots-allowed", "1"); // ok to crawl
prop.put("robotsInfo", "ftp does not follow robots.txt");
@ -96,7 +98,7 @@ public class getpageinfo {
}
net.yacy.document.Document scraper = null;
if (u != null) try {
scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
} catch (final IOException e) {
ConcurrentLog.logException(e);
// bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
@ -148,7 +150,7 @@ public class getpageinfo {
final DigestURI theURL = new DigestURI(url);
// determine if crawling of the current URL is allowed
RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, agent);
prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());

@ -96,7 +96,8 @@ public class getpageinfo_p {
}
net.yacy.document.Document scraper = null;
if (u != null) try {
scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
} catch (final IOException e) {
ConcurrentLog.logException(e);
// bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
@ -148,8 +149,9 @@ public class getpageinfo_p {
final DigestURI theURL = new DigestURI(url);
// determine if crawling of the current URL is allowed
sb.robots.ensureExist(theURL, sb.peers.myBotIDs(), true);
RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
sb.robots.ensureExist(theURL, agent, true);
RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, agent);
prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());

@ -35,7 +35,7 @@ import net.yacy.server.serverSwitch;
public class latency_p {
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, @SuppressWarnings("unused") final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) {
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) {
final serverObjects prop = new serverObjects();
//final plasmaSwitchboard sb = (plasmaSwitchboard) env;
@ -43,6 +43,7 @@ public class latency_p {
Map.Entry<String, Host> e;
int c = 0;
Latency.Host host;
ClientIdentification.Agent agent = post == null ? ClientIdentification.yacyInternetCrawlerAgent : ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
while (i.hasNext()) {
e = i.next();
host = e.getValue();
@ -52,7 +53,7 @@ public class latency_p {
prop.put("domains_" + c + "_count", host.count());
prop.put("domains_" + c + "_average", host.average());
prop.put("domains_" + c + "_robots", host.robotsDelay());
prop.put("domains_" + c + "_flux", host.flux(ClientIdentification.minimumGlobalDeltaInit));
prop.put("domains_" + c + "_flux", host.flux(agent.minimumDelta));
c++;
}
prop.put("domains", c);

@ -97,7 +97,8 @@ public class webstructure {
prop.put("references", 1);
net.yacy.document.Document scraper = null;
if (url != null) try {
scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST, null, agent);
} catch (final IOException e) {
ConcurrentLog.logException(e);
}

@ -1,5 +1,6 @@
import java.io.IOException;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.UserDB;
@ -23,21 +24,22 @@ public class add_ymark {
final boolean isAdmin = (sb.verifyAuthentication(header));
final boolean isAuthUser = user!= null && user.hasRight(UserDB.AccessRight.BOOKMARK_RIGHT);
if(isAdmin || isAuthUser) {
if (isAdmin || isAuthUser) {
final String bmk_user = (isAuthUser ? user.getUserName() : YMarkTables.USER_ADMIN);
if(post.containsKey("redirect") && post.get("redirect").length() > 0) {
if (post.containsKey("redirect") && post.get("redirect").length() > 0) {
prop.put("redirect_url", post.get("redirect"));
prop.put("redirect", "1");
}
if(post.containsKey("urlHash")) {
if (post.containsKey("urlHash")) {
final String urlHash = post.get("urlHash",YMarkUtil.EMPTY_STRING);
final DigestURI url = sb.index.fulltext().getURL(urlHash.getBytes());
final String folders = post.get(YMarkEntry.BOOKMARK.FOLDERS.key(),YMarkEntry.BOOKMARK.FOLDERS.deflt());
final String tags = post.get(YMarkEntry.BOOKMARK.TAGS.key(),YMarkUtil.EMPTY_STRING);
try {
sb.tables.bookmarks.createBookmark(sb.loader, url, bmk_user, true, tags, folders);
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
sb.tables.bookmarks.createBookmark(sb.loader, url, agent, bmk_user, true, tags, folders);
prop.put("status", "1");
} catch (final IOException e) {
// TODO Auto-generated catch block
@ -47,7 +49,7 @@ public class add_ymark {
ConcurrentLog.logException(e);
}
} else if(post.containsKey(YMarkEntry.BOOKMARK.URL.key())) {
} else if (post.containsKey(YMarkEntry.BOOKMARK.URL.key())) {
String url = post.get(YMarkEntry.BOOKMARK.URL.key(),YMarkEntry.BOOKMARK.URL.deflt());
boolean hasProtocol = false;
for (final YMarkTables.PROTOCOLS p : YMarkTables.PROTOCOLS.values()) {

@ -4,6 +4,7 @@ import java.util.EnumMap;
import java.util.Iterator;
import java.util.regex.Pattern;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.UserDB;
@ -49,7 +50,8 @@ public class get_metadata {
try {
final YMarkMetadata meta = new YMarkMetadata(new DigestURI(url), sb.index);
final Document document = meta.loadDocument(sb.loader);
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
final Document document = meta.loadDocument(sb.loader, agent);
final EnumMap<YMarkMetadata.METADATA, String> metadata = meta.loadMetadata();
prop.putXML("title", metadata.get(YMarkMetadata.METADATA.TITLE));

@ -8,6 +8,7 @@ import java.util.regex.Pattern;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
@ -212,7 +213,8 @@ public class get_treeview {
} else if (isAutoTagger || isMetadata || isURLdb || isCrawlStart) {
try {
final YMarkMetadata meta = new YMarkMetadata(new DigestURI(post.get(ROOT).substring(2)), sb.index);
final Document document = meta.loadDocument(sb.loader);
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
final Document document = meta.loadDocument(sb.loader, agent);
final TreeMap<String, YMarkTag> tags = sb.tables.bookmarks.getTags(bmk_user);
if(isAutoTagger) {
prop.put("folders_"+count+"_foldername","<small><b>meta-"+YMarkMetadata.METADATA.KEYWORDS.name().toLowerCase()+":</b> " + meta.loadMetadata().get(YMarkMetadata.METADATA.KEYWORDS) + "</small>");

@ -13,6 +13,7 @@ import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.BookmarksDB;
@ -86,6 +87,7 @@ public class import_ymark {
if(post.containsKey("root") && post.get("root").length() > 0) {
root = post.get("root");
}
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
if(post.containsKey("bmkfile") && !post.get("bmkfile").isEmpty() && post.containsKey("importer")){
final byte[] bytes = UTF8.getBytes(post.get("bmkfile$file"));
stream = new ByteArrayInputStream(bytes);
@ -156,7 +158,7 @@ public class import_ymark {
row = APIcalls.next();
if(row.get(WorkTables.TABLE_API_COL_TYPE, "").equals("crawler")) {
final String url = row.get(WorkTables.TABLE_API_COL_COMMENT, "").substring(16);
sb.tables.bookmarks.createBookmark(sb.loader, url, bmk_user, autotag, "crawlStart", "/Crawl Start");
sb.tables.bookmarks.createBookmark(sb.loader, url, agent, bmk_user, autotag, "crawlStart", "/Crawl Start");
}
}
prop.put("status", "1");
@ -186,7 +188,7 @@ public class import_ymark {
bmk_entry.put(YMarkEntry.BOOKMARK.FOLDERS.key(), root+bookmark.getFoldersString().replaceAll(".*"+YMarkUtil.TAGS_SEPARATOR+YMarkUtil.FOLDERS_SEPARATOR, root+YMarkUtil.FOLDERS_SEPARATOR));
}
if(autotag) {
bmk_entry.put(YMarkEntry.BOOKMARK.TAGS.key(), YMarkAutoTagger.autoTag(bookmark.getUrl(), sb.loader, 3, sb.tables.bookmarks.getTags(bmk_user)));
bmk_entry.put(YMarkEntry.BOOKMARK.TAGS.key(), YMarkAutoTagger.autoTag(bookmark.getUrl(), sb.loader, agent, 3, sb.tables.bookmarks.getTags(bmk_user)));
}
sb.tables.bookmarks.addBookmark(bmk_user, bmk_entry, merge, true);
prop.put("status", "1");

@ -102,6 +102,7 @@ public class sharedBlacklist_p {
Iterator<String> otherBlacklist = null;
ListAccumulator otherBlacklists = null;
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
if (post.containsKey("hash")) {
/* ======================================================
@ -138,7 +139,7 @@ public class sharedBlacklist_p {
// get List
final DigestURI u = new DigestURI(downloadURLOld);
otherBlacklist = FileUtils.strings(u.get(ClientIdentification.getUserAgent(), 10000));
otherBlacklist = FileUtils.strings(u.get(agent));
} catch (final Exception e) {
prop.put("status", STATUS_PEER_UNKNOWN);
prop.putHTML("status_name", hash);
@ -155,7 +156,7 @@ public class sharedBlacklist_p {
try {
final DigestURI u = new DigestURI(downloadURL);
otherBlacklist = FileUtils.strings(u.get(ClientIdentification.getUserAgent(), 10000));
otherBlacklist = FileUtils.strings(u.get(agent));
} catch (final Exception e) {
prop.put("status", STATUS_URL_PROBLEM);
prop.putHTML("status_address",downloadURL);

@ -85,7 +85,6 @@ import net.yacy.search.query.SearchEvent;
import net.yacy.search.query.SearchEventCache;
import net.yacy.search.query.SearchEventType;
import net.yacy.search.ranking.RankingProfile;
import net.yacy.search.snippet.TextSnippet;
import net.yacy.server.serverCore;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@ -579,7 +578,7 @@ public class yacysearch {
sb.loader.loadDocuments(
sb.loader.request(urlentry.url(), true, false),
CacheStrategy.IFEXIST,
Integer.MAX_VALUE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
Integer.MAX_VALUE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
} catch (final IOException e ) {
} catch (final Parser.Failure e ) {
}
@ -613,6 +612,7 @@ public class yacysearch {
sb.tables.bookmarks.createBookmark(
sb.loader,
url,
ClientIdentification.yacyInternetCrawlerAgent,
YMarkTables.USER_ADMIN,
true,
"searchresult",

@ -192,7 +192,7 @@ public class yacysearchitem {
String resultFileName = resultURL.getFileName();
prop.putHTML("content_target", target);
if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent);
prop.putHTML("content_faviconCode", URLLicense.aquireLicense(faviconURL)); // acquire license for favicon url loading
prop.put("content_urlhash", resulthashString);
prop.put("content_ranking", result.ranking());
@ -286,7 +286,7 @@ public class yacysearchitem {
final String target = sb.getConfig(resultUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self");
final String license = URLLicense.aquireLicense(ms.url());
sb.loader.loadIfNotExistBackground(ms.url(), 1024 * 1024 * 10, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
sb.loader.loadIfNotExistBackground(ms.url(), 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent);
prop.putHTML("content_item_hrefCache", (auth) ? "/ViewImage.png?url=" + resultUrlstring : resultUrlstring);
prop.putHTML("content_item_href", resultUrlstring);
prop.putHTML("content_item_target", target);

@ -79,7 +79,7 @@ public class SMWListSyncThread {
+ "/limit%3D200000"
+ "/format%3Dystat");
String reply = UTF8.String(new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT).GETbytes(urlCount.toString()));
String reply = UTF8.String(new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent).GETbytes(urlCount.toString()));
String overallcount = reply.split(",")[0];
String lastsyncstring = reply.split(",")[1];
this.currentmax = Integer.parseInt(overallcount);

@ -50,6 +50,7 @@ import jcifs.smb.SmbFileInputStream;
import net.yacy.cora.document.Punycode.PunycodeException;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.TimeoutRequest;
import net.yacy.cora.protocol.ftp.FTPClient;
@ -2038,7 +2039,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
return null;
}
public InputStream getInputStream(final String userAgent, final int timeout) throws IOException {
public InputStream getInputStream(final ClientIdentification.Agent agent) throws IOException {
if (isFile()) return new BufferedInputStream(new FileInputStream(getFSFile()));
if (isSMB()) return new BufferedInputStream(new SmbFileInputStream(getSmbFile()));
if (isFTP()) {
@ -2049,7 +2050,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
return new ByteArrayInputStream(b);
}
if (isHTTP() || isHTTPS()) {
final HTTPClient client = new HTTPClient(userAgent, timeout);
final HTTPClient client = new HTTPClient(agent);
client.setHost(getHost());
return new ByteArrayInputStream(client.GETbytes(this));
}
@ -2057,7 +2058,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
return null;
}
public byte[] get(final String userAgent, final int timeout) throws IOException {
public byte[] get(final ClientIdentification.Agent agent) throws IOException {
if (isFile()) return read(new FileInputStream(getFSFile()));
if (isSMB()) return read(new SmbFileInputStream(getSmbFile()));
if (isFTP()) {
@ -2068,7 +2069,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
return b;
}
if (isHTTP() || isHTTPS()) {
final HTTPClient client = new HTTPClient(userAgent, timeout);
final HTTPClient client = new HTTPClient(agent);
client.setHost(getHost());
return client.GETbytes(this);
}

@ -53,7 +53,7 @@ public class SRURSSConnector extends Thread implements SearchAccumulator {
final CacheStrategy verify;
final boolean global;
final Map<RSSMessage, List<Integer>> result;
final String userAgent;
final ClientIdentification.Agent agent;
private final BlockingQueue<RSSMessage> results;
@ -65,7 +65,7 @@ public class SRURSSConnector extends Thread implements SearchAccumulator {
final int maximumRecordsInit,
final CacheStrategy verify,
final boolean global,
final String userAgent) {
final ClientIdentification.Agent agent) {
this.results = new LinkedBlockingQueue<RSSMessage>();
this.result = result;
this.query = query;
@ -74,12 +74,12 @@ public class SRURSSConnector extends Thread implements SearchAccumulator {
this.maximumRecordsInit = maximumRecordsInit;
this.verify = verify;
this.global = global;
this.userAgent = userAgent;
this.agent = agent;
}
@Override
public void run() {
searchSRURSS(this.results, this.urlBase, this.query, this.timeoutInit, this.maximumRecordsInit, this.verify, this.global, this.userAgent);
searchSRURSS(this.results, this.urlBase, this.query, this.timeoutInit, this.maximumRecordsInit, this.verify, this.global, this.agent);
int p = 1;
RSSMessage message;
try {
@ -103,7 +103,7 @@ public class SRURSSConnector extends Thread implements SearchAccumulator {
final int maximumRecordsInit,
final CacheStrategy verify,
final boolean global,
final String userAgent) {
final ClientIdentification.Agent agent) {
final Thread job = new Thread() {
@Override
public void run() {
@ -116,7 +116,7 @@ public class SRURSSConnector extends Thread implements SearchAccumulator {
final long st = System.currentTimeMillis();
RSSFeed feed;
try {
feed = loadSRURSS(urlBase, query, timeout, startRecord, recordsPerSession, verify, global, userAgent);
feed = loadSRURSS(urlBase, query, timeout, startRecord, recordsPerSession, verify, global, agent);
} catch (final IOException e1) {
//e1.printStackTrace();
break mainloop;
@ -162,7 +162,7 @@ public class SRURSSConnector extends Thread implements SearchAccumulator {
final int maximumRecords,
final CacheStrategy cacheStrategy,
final boolean global,
final String userAgent) throws IOException {
final ClientIdentification.Agent agent) throws IOException {
MultiProtocolURI uri = null;
try {
uri = new MultiProtocolURI(rssSearchServiceURL);
@ -181,7 +181,7 @@ public class SRURSSConnector extends Thread implements SearchAccumulator {
parts.put("resource", UTF8.StringBody(global ? "global" : "local"));
parts.put("nav", UTF8.StringBody("none"));
// result = HTTPConnector.getConnector(userAgent == null ? MultiProtocolURI.yacybotUserAgent : userAgent).post(new MultiProtocolURI(rssSearchServiceURL), (int) timeout, uri.getHost(), parts);
final HTTPClient httpClient = new HTTPClient(userAgent == null ? ClientIdentification.getUserAgent() : userAgent, (int) timeout);
final HTTPClient httpClient = new HTTPClient(agent);
result = httpClient.POSTbytes(new MultiProtocolURI(rssSearchServiceURL), uri.getHost(), parts, false);
final RSSReader reader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result);

@ -49,7 +49,7 @@ public class Network {
*/
public static Peers getNetwork(final String address) throws IOException {
Peers peers = new Peers();
final HTTPClient httpclient = new HTTPClient(ClientIdentification.getUserAgent(), 15000);
final HTTPClient httpclient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
final byte[] content = httpclient.GETbytes("http://" + address + "/Network.xml?page=1&maxCount=1000&ip=");
ByteArrayInputStream bais = new ByteArrayInputStream(content);
Document doc = null;

@ -25,13 +25,66 @@
package net.yacy.cora.protocol;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.ConcurrentHashMap;
public class ClientIdentification {
public static final long MIN_LOAD_DELAY = 500;
public static final int DEFAULT_TIMEOUT = 10000;
public static final int clientTimeoutInit = 10000;
public static final int minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
public static final int minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
public static class Agent {
public final String userAgent; // the name that is send in http request to identify the agent
public final String[] robotIDs; // the name that is used in robots.txt to identify the agent
public final int minimumDelta; // the minimum delay between two accesses
public final int clientTimeout;
public Agent(final String userAgent, final String[] robotIDs, final int minimumDelta, final int clientTimeout) {
this.userAgent = userAgent;
this.robotIDs = robotIDs;
this.minimumDelta = minimumDelta;
this.clientTimeout = clientTimeout;
}
}
private final static String[] browserAgents = new String[]{ // fake browser user agents are NOT AVAILABLE IN P2P OPERATION, only on special customer configurations (commercial users demanded this, I personally think this is inadvisable)
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36",
"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:22.0) Gecko/20100101 Firefox/22.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0",
"Mozilla/5.0 (Windows NT 5.1; rv:22.0) Gecko/20100101 Firefox/22.0",
"Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20100101 Firefox/22.0",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36",
"Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0"
};
private static final Random random = new Random(System.currentTimeMillis());
private static Map<String, Agent> agents = new ConcurrentHashMap<String, Agent>();
public final static String yacyInternetCrawlerAgentName = "YaCy Internet (cautious)";
public static Agent yacyInternetCrawlerAgent = null; // defined later in static
public final static String yacyIntranetCrawlerAgentName = "YaCy Intranet (greedy)";
public static Agent yacyIntranetCrawlerAgent = null; // defined later in static
public final static String googleAgentName = "Googlebot";
public final static Agent googleAgentAgent = new Agent("Googlebot/2.1 (+http://www.google.com/bot.html)", new String[]{"Googlebot", "Googlebot-Mobile"}, minimumGlobalDeltaInit / 2, clientTimeoutInit);
public final static String browserAgentName = "Random Browser";
public final static Agent browserAgent = new Agent(browserAgents[random.nextInt(browserAgents.length)], new String[]{"Mozilla"}, minimumLocalDeltaInit, clientTimeoutInit);
public final static String yacyProxyAgentName = "YaCyProxy";
public final static Agent yacyProxyAgent = new Agent("yacy - this is a proxy access through YaCy from a browser, not a robot (the yacy bot user agent is 'yacybot')", new String[]{"yacy"}, minimumGlobalDeltaInit, clientTimeoutInit);
static {
generateYaCyBot("new");
agents.put(googleAgentName, googleAgentAgent);
agents.put(browserAgentName, browserAgent);
agents.put(yacyProxyAgentName, yacyProxyAgent);
}
/**
* provide system information (this is part of YaCy protocol)
*/
@ -39,34 +92,27 @@ public class ClientIdentification {
System.getProperty("os.name", "no-os-name") + " " + System.getProperty("os.version", "no-os-version") +
"; " + "java " + System.getProperty("java.version", "no-java-version") + "; " + generateLocation();
/**
* the default user agent: YaCy
*/
private static String agent = generateYaCyBot("new");
/**
* produce a YaCy user agent string
* @param addinfo
* @return
*/
public static String generateYaCyBot(String addinfo) {
return "yacybot (" + addinfo + "; " + yacySystem + ") http://yacy.net/bot.html";
public static void generateYaCyBot(String addinfo) {
String agentString = "yacybot (" + addinfo + "; " + yacySystem + ") http://yacy.net/bot.html";
yacyInternetCrawlerAgent = new Agent(agentString, new String[]{"yacybot"}, minimumGlobalDeltaInit, clientTimeoutInit);
yacyIntranetCrawlerAgent = new Agent(agentString, new String[]{"yacybot"}, minimumLocalDeltaInit, clientTimeoutInit); // must have the same userAgent String as the web crawler because this is also used for snippets
agents.put(yacyInternetCrawlerAgentName, yacyInternetCrawlerAgent);
agents.put(yacyIntranetCrawlerAgentName, yacyIntranetCrawlerAgent);
}
/**
* set the user agent
* get the default agent
* @param newagent
*/
public static void setUserAgent(String newagent) {
agent = newagent;
}
/**
* produce a userAgent String for this cora client
* @return
*/
public static String getUserAgent() {
return agent;
public static Agent getAgent(String agentName) {
if (agentName == null || agentName.length() == 0) return yacyInternetCrawlerAgent;
Agent agent = agents.get(agentName);
return agent == null ? yacyInternetCrawlerAgent : agent;
}
/**
@ -119,8 +165,4 @@ public class ClientIdentification {
return location;
}
public static long minLoadDelay() {
return MIN_LOAD_DELAY;
}
}
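
A hedged usage sketch (not part of the commit) of constructing an HTTP client from an Agent, as
done elsewhere in this patch (SMWListSyncThread, Network); the HTTPClient package path is assumed
to be net.yacy.cora.protocol.http and the URL is only an example:

import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.http.HTTPClient;

public class HttpClientAgentSketch {
    public static void main(final String[] args) throws Exception {
        // The client now derives both the User-Agent header and the timeout
        // from the Agent object instead of a raw string plus an int.
        final HTTPClient client = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
        final byte[] content = client.GETbytes("http://example.org/robots.txt");
        System.out.println(UTF8.String(content));
    }
}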

@ -116,22 +116,24 @@ public class HTTPClient {
private HttpUriRequest currentRequest = null;
private long upbytes = 0L;
private int timeout = 10000;
private String userAgent = null;
private ClientIdentification.Agent agent = null;
private String host = null;
private boolean redirecting = true;
private String realm = null;
public HTTPClient(final String userAgent) {
super();
this.userAgent = userAgent;
HttpProtocolParams.setUserAgent(httpClient.getParams(), userAgent);
}
public HTTPClient(final String userAgent, final int timeout) {
public HTTPClient(final ClientIdentification.Agent agent) {
super();
this.userAgent = userAgent;
this.agent = agent;
this.timeout = agent.clientTimeout;
HttpProtocolParams.setUserAgent(httpClient.getParams(), agent.userAgent);
}
public HTTPClient(final ClientIdentification.Agent agent, final int timeout) {
super();
this.agent = agent;
this.timeout = timeout;
HttpProtocolParams.setUserAgent(httpClient.getParams(), userAgent);
HttpProtocolParams.setUserAgent(httpClient.getParams(), agent.userAgent);
}
public static void setDefaultUserAgent(final String defaultAgent) {
@ -165,7 +167,7 @@ public class HTTPClient {
*/
HttpProtocolParams.setVersion(httpParams, HttpVersion.HTTP_1_1);
// UserAgent
HttpProtocolParams.setUserAgent(httpParams, ClientIdentification.getUserAgent());
HttpProtocolParams.setUserAgent(httpParams, ClientIdentification.yacyInternetCrawlerAgent.userAgent);
HttpProtocolParams.setUseExpectContinue(httpParams, false); // IMPORTANT - if not set to 'false' then servers do not process the request until a time-out of 2 seconds
/**
* HTTP connection settings
@ -267,8 +269,8 @@ public class HTTPClient {
*
* @param userAgent
*/
public void setUserAgent(final String userAgent) {
this.userAgent = userAgent;
public void setUserAgent(final ClientIdentification.Agent agent) {
this.agent = agent;
}
/**
@ -671,8 +673,8 @@ public class HTTPClient {
HttpClientParams.setRedirecting(httpParams, this.redirecting);
HttpConnectionParams.setConnectionTimeout(httpParams, this.timeout);
HttpConnectionParams.setSoTimeout(httpParams, this.timeout);
if (this.userAgent != null)
HttpProtocolParams.setUserAgent(httpParams, this.userAgent);
if (this.agent != null)
HttpProtocolParams.setUserAgent(httpParams, this.agent.userAgent);
if (this.host != null)
httpParams.setParameter(HTTP.TARGET_HOST, this.host);
}
@ -778,8 +780,7 @@ public class HTTPClient {
} catch (final UnsupportedEncodingException e) {
System.out.println(e.getStackTrace());
}
final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT);
client.setUserAgent("foobar");
final HTTPClient client = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
client.setRedirecting(false);
// Get some
for (final String arg : args) {

@ -34,15 +34,17 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import org.openjena.atlas.logging.Log;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.storage.HandleSet;
@ -72,9 +74,6 @@ public class Balancer {
// class variables filled with external values
private final File cacheStacksPath;
private int minimumLocalDelta;
private int minimumGlobalDelta;
private final Set<String> myAgentIDs;
private BufferedObjectIndex urlFileIndex;
// class variables computed during operation
@ -97,16 +96,10 @@ public class Balancer {
public Balancer(
final File cachePath,
final String stackname,
final int minimumLocalDelta,
final int minimumGlobalDelta,
final Set<String> myAgentIDs,
final boolean useTailCache,
final boolean exceed134217727) {
this.cacheStacksPath = cachePath;
this.domainStacks = new ConcurrentHashMap<String, HostHandles>();
this.minimumLocalDelta = minimumLocalDelta;
this.minimumGlobalDelta = minimumGlobalDelta;
this.myAgentIDs = myAgentIDs;
this.domStackInitSize = Integer.MAX_VALUE;
this.double_push_check = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
this.zeroWaitingCandidates = new ArrayList<Map.Entry<String, byte[]>>();
@ -129,19 +122,6 @@ public class Balancer {
ConcurrentLog.info("Balancer", "opened balancer file with " + this.urlFileIndex.size() + " entries from " + f.toString());
}
public int getMinimumLocalDelta() {
return this.minimumLocalDelta;
}
public int getMinimumGlobalDelta() {
return this.minimumGlobalDelta;
}
public void setMinimumDelta(final int minimumLocalDelta, final int minimumGlobalDelta) {
this.minimumLocalDelta = minimumLocalDelta;
this.minimumGlobalDelta = minimumGlobalDelta;
}
public synchronized void close() {
if (this.urlFileIndex != null) {
this.urlFileIndex.close();
@ -293,7 +273,7 @@ public class Balancer {
// now disabled to prevent that a crawl 'freezes' to a specific domain which hosts a lot of pages; the queues are filled anyway
//if (!this.domainStacks.containsKey(entry.url().getHost())) pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
}
robots.ensureExist(entry.url(), Balancer.this.myAgentIDs, true); // concurrently load all robots.txt
robots.ensureExist(entry.url(), profile.getAgent(), true); // concurrently load all robots.txt
return null;
}
@ -307,7 +287,7 @@ public class Balancer {
final String hostname = entry.getKey();
final HostHandles hosthandles = entry.getValue();
int size = hosthandles.handleSet.size();
int delta = Latency.waitingRemainingGuessed(hostname, hosthandles.hosthash, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
int delta = Latency.waitingRemainingGuessed(hostname, hosthandles.hosthash, robots, ClientIdentification.yacyInternetCrawlerAgent);
map.put(hostname, new Integer[]{size, delta});
}
return map;
@ -326,7 +306,7 @@ public class Balancer {
long sleeptime = (
profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
(profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
) ? Integer.MIN_VALUE : Latency.waitingRemaining(crawlURL, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
) ? Integer.MIN_VALUE : Latency.waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime;
}
@ -339,8 +319,8 @@ public class Balancer {
* @param crawlURL
* @return
*/
private long getRobotsTime(final RobotsTxt robots, final DigestURI crawlURL) {
long sleeptime = Latency.waitingRobots(crawlURL, robots, this.myAgentIDs); // this uses the robots.txt database and may cause a loading of robots.txt from the server
private long getRobotsTime(final RobotsTxt robots, final DigestURI crawlURL, ClientIdentification.Agent agent) {
long sleeptime = Latency.waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime < 0 ? 0 : sleeptime;
}
@ -430,7 +410,7 @@ public class Balancer {
CrawlProfile profileEntry = null;
byte[] failhash = null;
while (!this.urlFileIndex.isEmpty()) {
byte[] nexthash = getbest(robots);
byte[] nexthash = getbest(robots, cs);
if (nexthash == null) return null;
synchronized (this) {
@ -464,15 +444,15 @@ public class Balancer {
}
}
if (crawlEntry == null) return null;
long robotsTime = getRobotsTime(robots, crawlEntry.url());
ClientIdentification.Agent agent = profileEntry == null ? ClientIdentification.yacyInternetCrawlerAgent : profileEntry.getAgent();
long robotsTime = getRobotsTime(robots, crawlEntry.url(), agent);
Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
if (delay && sleeptime > 0) {
// force a busy waiting here
// in the best case, this should never happen if the balancer works properly
// this is only a protection against the worst case, where the crawler could
// behave in a DoS-manner
ConcurrentLog.info("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta) + ", domainStacks.size() = " + this.domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
ConcurrentLog.info("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, agent) + ", domainStacks.size() = " + this.domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
long loops = sleeptime / 1000;
long rest = sleeptime % 1000;
if (loops < 3) {
@ -493,7 +473,7 @@ public class Balancer {
return crawlEntry;
}
private byte[] getbest(final RobotsTxt robots) {
private byte[] getbest(final RobotsTxt robots, final CrawlSwitchboard cs) {
synchronized (this.zeroWaitingCandidates) {
if (this.zeroWaitingCandidates.size() > 0) {
@ -535,11 +515,15 @@ public class Balancer {
rowEntry = this.urlFileIndex.get(urlhash, false);
if (rowEntry == null) continue; // may have been deleted there meanwhile
Request crawlEntry = new Request(rowEntry);
w = Latency.waitingRemaining(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
//System.out.println("*** waitingRemaining = " + w + ", guessed = " + Latency.waitingRemainingGuessed(hostname, this.minimumLocalDelta, this.minimumGlobalDelta));
//System.out.println("*** explained: " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta));
CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
if (profileEntry == null) {
ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
continue;
}
w = Latency.waitingRemaining(crawlEntry.url(), robots, profileEntry.getAgent());
} catch (final IOException e1) {
w = Latency.waitingRemainingGuessed(hostname, hosthandles.hosthash, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
Log.warn("Balancer", e1.getMessage(), e1);
continue;
}
if (w <= 0) {

@ -23,12 +23,12 @@ package net.yacy.crawler;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.Set;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
@ -55,25 +55,13 @@ public class CrawlQueue {
private BufferedObjectIndex urlFileIndex;
private final HandleSet double_push_check;
private final Set<String> myAgentIDs;
private final RobotsTxt robots;
private final int minimumLocalDelta;
private final int minimumGlobalDelta;
public CrawlQueue(
final File cachePath,
final String filename,
final int minimumLocalDelta,
final int minimumGlobalDelta,
final Set<String> myAgentIDs,
final RobotsTxt robots,
final boolean useTailCache,
final boolean exceed134217727) {
this.myAgentIDs = myAgentIDs;
this.robots = robots;
this.minimumLocalDelta = minimumLocalDelta;
this.minimumGlobalDelta = minimumGlobalDelta;
// create a stack for newly entered entries
if (!(cachePath.exists())) cachePath.mkdir(); // make the path
cachePath.mkdirs();
@ -184,7 +172,7 @@ public class CrawlQueue {
* @throws IOException
* @throws SpaceExceededException
*/
public String push(final Request entry, CrawlProfile profile) throws IOException, SpaceExceededException {
public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
assert entry != null;
final byte[] hash = entry.url().hash();
synchronized (this) {
@ -210,7 +198,7 @@ public class CrawlQueue {
// now disabled to prevent that a crawl 'freezes' to a specific domain which hosts a lot of pages; the queues are filled anyway
//if (!this.domainStacks.containsKey(entry.url().getHost())) pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
}
this.robots.ensureExist(entry.url(), CrawlQueue.this.myAgentIDs, true); // concurrently load all robots.txt
robots.ensureExist(entry.url(), profile.getAgent(), true); // concurrently load all robots.txt
return null;
}
@ -222,12 +210,12 @@ public class CrawlQueue {
* @param crawlURL
* @return the sleep time in milliseconds; may be negative for no sleep time
*/
private long getDomainSleepTime(final CrawlProfile profileEntry, final DigestURI crawlURL) {
private long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURI crawlURL) {
if (profileEntry == null) return 0;
long sleeptime = (
profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
(profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
) ? Integer.MIN_VALUE : Latency.waitingRemaining(crawlURL, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
) ? Integer.MIN_VALUE : Latency.waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime;
}
@ -240,8 +228,8 @@ public class CrawlQueue {
* @param crawlURL
* @return
*/
private long getRobotsTime(final RobotsTxt robots, final DigestURI crawlURL) {
long sleeptime = Latency.waitingRobots(crawlURL, robots, this.myAgentIDs); // this uses the robots.txt database and may cause a loading of robots.txt from the server
private long getRobotsTime(final RobotsTxt robots, final DigestURI crawlURL, ClientIdentification.Agent agent) {
long sleeptime = Latency.waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime < 0 ? 0 : sleeptime;
}
@ -291,17 +279,17 @@ public class CrawlQueue {
}
}
// depending on the caching policy we need sleep time to avoid DoS-like situations
sleeptime = getDomainSleepTime(profileEntry, crawlEntry.url());
sleeptime = getDomainSleepTime(robots, profileEntry, crawlEntry.url());
long robotsTime = getRobotsTime(robots, crawlEntry.url());
ClientIdentification.Agent agent = profileEntry == null ? ClientIdentification.yacyInternetCrawlerAgent : profileEntry.getAgent();
long robotsTime = getRobotsTime(robots, crawlEntry.url(), agent);
Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
if (delay && sleeptime > 0) {
// force a busy waiting here
// in the best case, this should never happen if the balancer works properly
// this is only a protection against the worst case, where the crawler could
// behave in a DoS-manner
ConcurrentLog.info("CrawlQueue", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta));
ConcurrentLog.info("CrawlQueue", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, agent));
long loops = sleeptime / 1000;
long rest = sleeptime % 1000;
if (loops < 3) {

@ -41,6 +41,7 @@ import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.CrawlProfile;
@ -273,7 +274,8 @@ public final class CrawlSwitchboard {
true,
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_REMOTE, false),
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY);
"robot_" + CRAWL_PROFILE_PROXY,
ClientIdentification.yacyProxyAgentName);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultProxyProfile.handle()),
this.defaultProxyProfile);
@ -301,7 +303,8 @@ public final class CrawlSwitchboard {
false,
false,
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE);
"robot_" + CRAWL_PROFILE_REMOTE,
ClientIdentification.yacyInternetCrawlerAgentName);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultRemoteProfile.handle()),
this.defaultRemoteProfile);
@ -329,7 +332,8 @@ public final class CrawlSwitchboard {
true,
false,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT);
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()),
this.defaultTextSnippetLocalProfile);
@ -357,7 +361,8 @@ public final class CrawlSwitchboard {
true,
false,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT);
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
this.defaultTextSnippetGlobalProfile);
@ -386,7 +391,8 @@ public final class CrawlSwitchboard {
true,
false,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT);
"robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
ClientIdentification.browserAgentName);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
this.defaultTextSnippetGlobalProfile);
@ -414,7 +420,8 @@ public final class CrawlSwitchboard {
true,
false,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA);
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()),
this.defaultMediaSnippetLocalProfile);
@ -442,7 +449,8 @@ public final class CrawlSwitchboard {
true,
false,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA);
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()),
this.defaultMediaSnippetGlobalProfile);
@ -470,7 +478,8 @@ public final class CrawlSwitchboard {
false,
false,
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE);
"robot_" + CRAWL_PROFILE_SURROGATE,
ClientIdentification.yacyIntranetCrawlerAgentName);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultSurrogateProfile.handle()),
this.defaultSurrogateProfile);

@ -38,6 +38,7 @@ import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.Digest;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlSwitchboard;
@ -54,7 +55,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final Pattern MATCH_NEVER_PATTERN = Pattern.compile(MATCH_NEVER_STRING);
// this is a simple record structure that hold all properties of a single crawl start
private static final String HANDLE = "handle";
private static final String HANDLE = "handle";
public static final String AGENT_NAME = "agentName";
public static final String NAME = "name";
public static final String DEPTH = "generalDepth";
public static final String DIRECT_DOC_BY_URL= "directDocByURL";
@ -135,7 +137,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final boolean storeHTCache,
final boolean remoteIndexing,
final CacheStrategy cacheStrategy,
final String collections) {
final String collections,
final String userAgentName) {
super(40);
if (name == null || name.isEmpty()) {
throw new NullPointerException("name must not be null or empty");
@ -145,6 +148,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength);
put(HANDLE, handle);
put(NAME, name);
put(AGENT_NAME, userAgentName);
put(CRAWLER_URL_MUSTMATCH, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
put(CRAWLER_URL_MUSTNOTMATCH, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
put(CRAWLER_IP_MUSTMATCH, (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch);
@ -209,6 +213,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return domname;
}
public ClientIdentification.Agent getAgent() {
String agentName = this.get(AGENT_NAME);
return ClientIdentification.getAgent(agentName);
}
public AtomicInteger getCount(final String domain) {
return this.doms.get(domain);
}
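The new AGENT_NAME property is what makes the user agent selectable per crawl start: each consumer resolves its own agent from the profile instead of using a global setting. A minimal sketch of that lookup, mirroring the fallback used in Balancer and CrawlQueue above (the CrawlProfile instance is assumed to exist already):

import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.crawler.data.CrawlProfile;

final class ProfileAgentSketch {
    /** Resolve the agent for a crawl entry; falls back to the YaCy internet crawler
     *  agent when no profile is available, as the balancer code above does. */
    static ClientIdentification.Agent agentFor(final CrawlProfile profile) {
        return profile == null
                ? ClientIdentification.yacyInternetCrawlerAgent
                : profile.getAgent();   // resolves the AGENT_NAME stored with the profile
    }
}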

@ -41,7 +41,6 @@ import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.ConnectionInfo;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.HarvestProcess;
@ -82,7 +81,7 @@ public class CrawlQueues {
// start crawling management
this.log.config("Starting Crawling Management");
this.noticeURL = new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
this.noticeURL = new NoticedURL(queuePath, sb.useTailCache, sb.exceed134217727);
FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME));
this.errorURL = new ZURL(sb.index.fulltext(), queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
this.delegatedURL = new ZURL(sb.index.fulltext(), queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
@ -94,7 +93,7 @@ public class CrawlQueues {
this.workers = new ConcurrentHashMap<Integer, Loader>();
this.remoteCrawlProviderHashes.clear();
this.noticeURL = new NoticedURL(newQueuePath, this.sb.peers.myBotIDs(), this.sb.useTailCache, this.sb.exceed134217727);
this.noticeURL = new NoticedURL(newQueuePath, this.sb.useTailCache, this.sb.exceed134217727);
FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME));
this.errorURL = new ZURL(this.sb.index.fulltext(), newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727);
this.delegatedURL = new ZURL(this.sb.index.fulltext(), newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727);
@ -634,7 +633,7 @@ public class CrawlQueues {
this.request.setStatus("worker-checkingrobots", WorkflowJob.STATUS_STARTED);
RobotsTxtEntry robotsEntry;
if ((this.request.url().getProtocol().equals("http") || this.request.url().getProtocol().equals("https")) &&
(robotsEntry = CrawlQueues.this.sb.robots.getEntry(this.request.url(), CrawlQueues.this.sb.peers.myBotIDs())) != null &&
(robotsEntry = CrawlQueues.this.sb.robots.getEntry(this.request.url(), this.profile.getAgent())) != null &&
robotsEntry.isDisallowed(this.request.url())) {
//if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt.");
CrawlQueues.this.errorURL.push(
@ -655,7 +654,7 @@ public class CrawlQueues {
// returns null if everything went fine, a fail reason string if a problem occurred
try {
this.request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
final Response response = CrawlQueues.this.sb.loader.load(this.request, profile == null ? CacheStrategy.IFEXIST : profile.cacheStrategy(), BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
final Response response = CrawlQueues.this.sb.loader.load(this.request, profile == null ? CacheStrategy.IFEXIST : profile.cacheStrategy(), BlacklistType.CRAWLER, this.profile.getAgent());
if (response == null) {
this.request.setStatus("error", WorkflowJob.STATUS_FINISHED);
if (CrawlQueues.this.log.isFine()) {

@ -25,13 +25,12 @@ package net.yacy.crawler.data;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.kelondro.data.meta.DigestURI;
@ -120,17 +119,17 @@ public class Latency {
* @param thisAgents
* @return the waiting time in milliseconds; 0 if not known; -1 if host gives us special rights
*/
public static int waitingRobots(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents) {
public static int waitingRobots(final MultiProtocolURI url, final RobotsTxt robots, final ClientIdentification.Agent agent) {
int robotsDelay = 0;
RobotsTxtEntry robotsEntry = robots.getEntry(url, thisAgents);
RobotsTxtEntry robotsEntry = robots.getEntry(url, agent);
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer
return robotsDelay;
}
private static int waitingRobots(final String hostport, final RobotsTxt robots, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) {
private static int waitingRobots(final String hostport, final RobotsTxt robots, final ClientIdentification.Agent agent, final boolean fetchOnlineIfNotAvailableOrNotFresh) {
int robotsDelay = 0;
RobotsTxtEntry robotsEntry = robots.getEntry(hostport, thisAgents, fetchOnlineIfNotAvailableOrNotFresh);
RobotsTxtEntry robotsEntry = robots.getEntry(hostport, agent, fetchOnlineIfNotAvailableOrNotFresh);
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer
return robotsDelay;
@ -143,20 +142,18 @@ public class Latency {
* @param hostname
* @param hosthash
* @param robots
* @param thisAgents
* @param minimumLocalDelta
* @param minimumGlobalDelta
* @param agent
* @return the remaining waiting time in milliseconds. The return value may be negative
* which expresses how long the time is over the minimum waiting time.
*/
public static int waitingRemainingGuessed(final String hostname, final String hosthash, final RobotsTxt robots, final Set<String> thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) {
public static int waitingRemainingGuessed(final String hostname, final String hosthash, final RobotsTxt robots, final ClientIdentification.Agent agent) {
// first check if the domain was _ever_ accessed before
final Host host = map.get(hosthash);
if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere
// find the minimum waiting time based on the network domain (local or global)
int waiting = (Domains.isLocal(hostname, null)) ? minimumLocalDelta : minimumGlobalDelta;
int waiting = agent.minimumDelta;
// if we have accessed the domain many times, get slower (the flux factor)
waiting += host.flux(waiting);
@ -171,7 +168,7 @@ public class Latency {
// find the delay as given by robots.txt on target site
if (robots != null) {
int robotsDelay = waitingRobots(hostname + ":80", robots, thisAgents, false);
int robotsDelay = waitingRobots(hostname + ":80", robots, agent, false);
if (robotsDelay < 0) return -timeSinceLastAccess; // no limits if granted exclusively for this peer
waiting = Math.max(waiting, robotsDelay);
}
@ -187,11 +184,10 @@ public class Latency {
* - the times that the domain was accessed (flux factor)
* - the response latency of the domain
* - and a given minimum access time as given in robots.txt
* @param minimumLocalDelta
* @param minimumGlobalDelta
* @param agent
* @return the remaining waiting time in milliseconds; can be negative to reflect the due-time after a possible next loading time
*/
public static int waitingRemaining(final DigestURI url, final RobotsTxt robots, final Set<String> thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) {
public static int waitingRemaining(final DigestURI url, final RobotsTxt robots, final ClientIdentification.Agent agent) {
// first check if the domain was _ever_ accessed before
final Host host = host(url);
@ -199,7 +195,7 @@ public class Latency {
// find the minimum waiting time based on the network domain (local or global)
boolean local = url.isLocal();
int waiting = (local) ? minimumLocalDelta : minimumGlobalDelta;
int waiting = agent.minimumDelta;
// for CGI accesses, we double the minimum time
// mostly there is a database access in the background
@ -216,14 +212,14 @@ public class Latency {
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
// find the delay as given by robots.txt on target site
int robotsDelay = waitingRobots(url, robots, thisAgents);
int robotsDelay = waitingRobots(url, robots, agent);
if (robotsDelay < 0) return -timeSinceLastAccess; // no limits if granted exclusively for this peer
waiting = Math.max(waiting, robotsDelay);
return Math.min(60000, waiting) - timeSinceLastAccess;
}
public static String waitingRemainingExplain(final DigestURI url, final RobotsTxt robots, final Set<String> thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) {
public static String waitingRemainingExplain(final DigestURI url, final RobotsTxt robots, final ClientIdentification.Agent agent) {
// first check if the domain was _ever_ accessed before
final Host host = host(url);
@ -232,7 +228,7 @@ public class Latency {
final StringBuilder s = new StringBuilder(50);
// find the minimum waiting time based on the network domain (local or global)
int waiting = (url.isLocal()) ? minimumLocalDelta : minimumGlobalDelta;
int waiting = agent.minimumDelta;
s.append("minimumDelta = ").append(waiting);
// for CGI accesses, we double the minimum time
@ -252,7 +248,7 @@ public class Latency {
waiting = Math.max(waiting, host.average() * 3 / 2);
// find the delay as given by robots.txt on target site
int robotsDelay = waitingRobots(url, robots, thisAgents);
int robotsDelay = waitingRobots(url, robots, agent);
if (robotsDelay < 0) return "no waiting for exclusive granted peer"; // no limits if granted exclusively for this peer
waiting = Math.max(waiting, robotsDelay);
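Condensed into one place, the wait computation that waitingRemaining and waitingRemainingExplain perform after this change looks roughly like the sketch below. The inputs are hypothetical; the CGI doubling and the 3/2 latency factor are taken from the hunks above, and the real methods additionally consult the per-host statistics object.

final class LatencySketch {
    /** Simplified model of the remaining-wait computation (sketch, not the real method). */
    static int waitingRemainingSketch(final int agentMinimumDelta, final boolean isCGI,
                                      final int hostAverageMillis, final int robotsDelayMillis,
                                      final int millisSinceLastAccess) {
        int waiting = agentMinimumDelta;                          // per-agent minimum replaces the local/global deltas
        if (isCGI) waiting *= 2;                                  // CGI accesses get twice the minimum time
        waiting = Math.max(waiting, hostAverageMillis * 3 / 2);   // adapt to the measured host latency
        if (robotsDelayMillis < 0) return -millisSinceLastAccess; // exclusive grant in robots.txt: no limits
        waiting = Math.max(waiting, robotsDelayMillis);           // obey the robots.txt Crawl-delay
        return Math.min(60000, waiting) - millisSinceLastAccess;  // capped at one minute
    }
}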

@ -33,13 +33,11 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.crawler.Balancer;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.retrieval.Request;
@ -59,30 +57,14 @@ public class NoticedURL {
protected NoticedURL(
final File cachePath,
final Set<String> myAgentIDs,
final boolean useTailCache,
final boolean exceed134217727) {
ConcurrentLog.info("NoticedURL", "CREATING STACKS at " + cachePath.toString());
this.coreStack = new Balancer(cachePath, "urlNoticeCoreStack", ClientIdentification.minimumLocalDeltaInit, ClientIdentification.minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
this.limitStack = new Balancer(cachePath, "urlNoticeLimitStack", ClientIdentification.minimumLocalDeltaInit, ClientIdentification.minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
this.coreStack = new Balancer(cachePath, "urlNoticeCoreStack", useTailCache, exceed134217727);
this.limitStack = new Balancer(cachePath, "urlNoticeLimitStack", useTailCache, exceed134217727);
//overhangStack = new plasmaCrawlBalancer(overhangStackFile);
this.remoteStack = new Balancer(cachePath, "urlNoticeRemoteStack", ClientIdentification.minimumLocalDeltaInit, ClientIdentification.minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
this.noloadStack = new Balancer(cachePath, "urlNoticeNoLoadStack", ClientIdentification.minimumLocalDeltaInit, ClientIdentification.minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
}
public int getMinimumLocalDelta() {
return this.coreStack.getMinimumLocalDelta();
}
public int getMinimumGlobalDelta() {
return this.coreStack.getMinimumGlobalDelta();
}
public void setMinimumDelta(final int minimumLocalDelta, final int minimumGlobalDelta) {
this.coreStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
this.limitStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
this.remoteStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
this.noloadStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
this.remoteStack = new Balancer(cachePath, "urlNoticeRemoteStack", useTailCache, exceed134217727);
this.noloadStack = new Balancer(cachePath, "urlNoticeNoLoadStack", useTailCache, exceed134217727);
}
public void clear() {

@ -34,6 +34,7 @@ import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
@ -134,7 +135,7 @@ public class FileLoader {
}
// load the resource
InputStream is = url.getInputStream(null, -1);
InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
byte[] b = FileUtils.read(is);
is.close();

@ -70,15 +70,15 @@ public final class HTTPLoader {
this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 30000);
}
public Response load(final Request entry, CrawlProfile profile, final int maxFileSize, final BlacklistType blacklistType, int timeout) throws IOException {
public Response load(final Request entry, CrawlProfile profile, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
Latency.updateBeforeLoad(entry.url());
final long start = System.currentTimeMillis();
final Response doc = load(entry, profile, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType, timeout);
final Response doc = load(entry, profile, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType, agent);
Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
return doc;
}
private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, int timeout) throws IOException {
private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
byte[] myHash = ASCII.getBytes(this.sb.peers.mySeed().hash);
@ -117,7 +117,7 @@ public final class HTTPLoader {
// create a request header
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent);
DigestURI refererURL = null;
if (request.referrerhash() != null) refererURL = this.sb.getURL(request.referrerhash());
if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true));
@ -127,7 +127,7 @@ public final class HTTPLoader {
requestHeader.put(HeaderFramework.ACCEPT_ENCODING, this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
// HTTP-Client
final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), timeout);
final HTTPClient client = new HTTPClient(agent);
client.setRedirecting(false); // we want to handle redirection ourselves, so we don't index pages twice
client.setTimout(this.socketTimeout);
client.setHeader(requestHeader.entrySet());
@ -178,7 +178,7 @@ public final class HTTPLoader {
// retry crawling with new url
request.redirectURL(redirectionUrl);
return load(request, profile, retryCount - 1, maxFileSize, blacklistType, timeout);
return load(request, profile, retryCount - 1, maxFileSize, blacklistType, agent);
}
// we don't want to follow redirects
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
@ -218,11 +218,11 @@ public final class HTTPLoader {
}
}
public static Response load(final Request request) throws IOException {
return load(request, 3);
public static Response load(final Request request, ClientIdentification.Agent agent) throws IOException {
return load(request, agent, 3);
}
private static Response load(final Request request, final int retryCount) throws IOException {
private static Response load(final Request request, ClientIdentification.Agent agent, final int retryCount) throws IOException {
if (retryCount < 0) {
throw new IOException("Redirection counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
@ -246,12 +246,12 @@ public final class HTTPLoader {
// create a request header
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent);
requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, DEFAULT_LANGUAGE);
requestHeader.put(HeaderFramework.ACCEPT_CHARSET, DEFAULT_CHARSET);
requestHeader.put(HeaderFramework.ACCEPT_ENCODING, DEFAULT_ENCODING);
final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT);
final HTTPClient client = new HTTPClient(agent);
client.setTimout(20000);
client.setHeader(requestHeader.entrySet());
final byte[] responseBody = client.GETbytes(request.url());
@ -300,7 +300,7 @@ public final class HTTPLoader {
// retry crawling with new url
request.redirectURL(redirectionUrl);
return load(request, retryCount - 1);
return load(request, agent, retryCount - 1);
}
} else {
// if the response has not the right response type then reject file

@ -56,21 +56,23 @@ public class RSSLoader extends Thread {
public static final ARC<byte[], Date> indexTriggered = new ComparableARC<byte[], Date>(1000, Base64Order.enhancedCoder);
DigestURI urlf;
Switchboard sb;
String[] collections;
private final DigestURI urlf;
private final Switchboard sb;
private final String[] collections;
private final ClientIdentification.Agent agent;
public RSSLoader(final Switchboard sb, final DigestURI urlf, final String[] collections) {
public RSSLoader(final Switchboard sb, final DigestURI urlf, final String[] collections, final ClientIdentification.Agent agent) {
this.sb = sb;
this.urlf = urlf;
this.collections = collections;
this.agent = agent;
}
@Override
public void run() {
RSSReader rss = null;
try {
final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, this.agent);
final byte[] resource = response == null ? null : response.getContent();
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
} catch (final MalformedURLException e) {

@ -42,6 +42,7 @@ import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
@ -152,7 +153,7 @@ public class SMBLoader {
}
// load the resource
InputStream is = url.getInputStream(null, -1);
InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
byte[] b = FileUtils.read(is);
is.close();

@ -57,7 +57,7 @@ public class SitemapImporter extends Thread {
public void run() {
try {
logger.info("Start parsing sitemap file " + this.siteMapURL);
sitemapParser.SitemapReader parser = sitemapParser.parse(this.siteMapURL);
sitemapParser.SitemapReader parser = sitemapParser.parse(this.siteMapURL, this.crawlingProfile.getAgent());
parser.start();
URLEntry item;
while ((item = parser.take()) != sitemapParser.POISON_URLEntry) {

@ -31,13 +31,13 @@ import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
@ -89,13 +89,13 @@ public class RobotsTxt {
return this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME).size();
}
public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents) {
public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final ClientIdentification.Agent agent) {
if (theURL == null) throw new IllegalArgumentException();
if (!theURL.getProtocol().startsWith("http")) return null;
return getEntry(getHostPort(theURL), thisAgents, true);
return getEntry(getHostPort(theURL), agent, true);
}
public RobotsTxtEntry getEntry(final String urlHostPort, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) {
public RobotsTxtEntry getEntry(final String urlHostPort, final ClientIdentification.Agent agent, final boolean fetchOnlineIfNotAvailableOrNotFresh) {
// this method will always return a non-null value
RobotsTxtEntry robotsTxt4Host = null;
Map<String, byte[]> record;
@ -164,7 +164,7 @@ public class RobotsTxt {
if (log.isFine()) log.fine("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
Request request = new Request(robotsURL, null);
try {
response = RobotsTxt.this.loader.load(request, CacheStrategy.NOCACHE, null, 0, 3000);
response = RobotsTxt.this.loader.load(request, CacheStrategy.NOCACHE, null, agent);
} catch (final Throwable e) {
log.info("Trying to download the robots.txt file from URL '" + robotsURL + "' failed - " + e.getMessage());
response = null;
@ -174,7 +174,7 @@ public class RobotsTxt {
if (response == null) {
processOldEntry(robotsTxt4Host, robotsURL, robotsTable);
} else {
processNewEntry(robotsURL, response, thisAgents);
processNewEntry(robotsURL, response, agent.robotIDs);
}
}
}
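Callers now pass the per-profile agent into every robots.txt lookup. A small sketch of the pattern used by the crawler worker shown earlier (the RobotsTxt instance, the DigestURI and the agent are assumed to come from the surrounding code):

import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.kelondro.data.meta.DigestURI;

final class RobotsCheckSketch {
    /** True when the given agent may fetch the URL according to robots.txt
     *  (sketch of the check done in CrawlQueues and Balancer after this change). */
    static boolean allowed(final RobotsTxt robots, final DigestURI url,
                           final ClientIdentification.Agent agent) {
        final RobotsTxtEntry entry = robots.getEntry(url, agent); // may trigger a robots.txt fetch
        return entry == null || !entry.isDisallowed(url);
    }
}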
@ -182,7 +182,7 @@ public class RobotsTxt {
return robotsTxt4Host;
}
public void ensureExist(final MultiProtocolURI theURL, final Set<String> thisAgents, boolean concurrent) {
public void ensureExist(final MultiProtocolURI theURL, final ClientIdentification.Agent agent, boolean concurrent) {
if (theURL.isLocal()) return;
final String urlHostPort = getHostPort(theURL);
if (urlHostPort == null) return;
@ -220,7 +220,7 @@ public class RobotsTxt {
if (log.isFine()) log.fine("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
Request request = new Request(robotsURL, null);
try {
response = RobotsTxt.this.loader.load(request, CacheStrategy.NOCACHE, null, 0, 3000);
response = RobotsTxt.this.loader.load(request, CacheStrategy.NOCACHE, null, agent);
} catch (final IOException e) {
response = null;
}
@ -229,7 +229,7 @@ public class RobotsTxt {
if (response == null) {
processOldEntry(null, robotsURL, robotsTable);
} else {
processNewEntry(robotsURL, response, thisAgents);
processNewEntry(robotsURL, response, agent.robotIDs);
}
}
}
@ -265,7 +265,7 @@ public class RobotsTxt {
}
}
private void processNewEntry(DigestURI robotsURL, Response response, final Set<String> thisAgents) {
private void processNewEntry(DigestURI robotsURL, Response response, final String[] thisAgents) {
final byte[] robotsTxt = response.getContent();
//Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
RobotsTxtParser parserResult;
@ -282,6 +282,8 @@ public class RobotsTxt {
// store the data into the robots DB
String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null;
boolean isBrowserAgent = thisAgents.length == 1 && thisAgents[0].equals("Mozilla");
if (isBrowserAgent) denyPath.clear();
final RobotsTxtEntry robotsTxt4Host = new RobotsTxtEntry(
robotsURL,
parserResult.allowList(),

@ -160,7 +160,16 @@ public class RobotsTxtEntry {
public String toString() {
final StringBuilder str = new StringBuilder(6000);
str.append((this.hostName == null) ? "null" : this.hostName).append(": ");
if (this.mem != null) str.append(this.mem.toString());
if (this.mem != null) {
str.append('{');
for (Map.Entry<String, byte[]> entry: this.mem.entrySet()) {
str.append(entry.getKey());
str.append('=');
str.append(UTF8.String(entry.getValue()));
str.append(',');
}
str.append('}');
}
return str.toString();
}

@ -36,7 +36,6 @@ import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Set;
import java.util.regex.Pattern;
import net.yacy.cora.document.UTF8;
@ -75,10 +74,10 @@ public final class RobotsTxtParser {
private final ArrayList<String> denyList;
private String sitemap;
private long crawlDelayMillis;
private final Set<String> myNames; // a list of own name lists
private final String[] myNames; // a list of own name lists
private String agentName; // the name of the agent that was used to return the result
protected RobotsTxtParser(final Set<String> myNames) {
protected RobotsTxtParser(final String[] myNames) {
this.allowList = new ArrayList<String>(0);
this.denyList = new ArrayList<String>(0);
this.sitemap = "";
@ -87,7 +86,7 @@ public final class RobotsTxtParser {
this.agentName = null;
}
protected RobotsTxtParser(final Set<String> myNames, final byte[] robotsTxt) {
protected RobotsTxtParser(final String[] myNames, final byte[] robotsTxt) {
this(myNames);
if (robotsTxt != null && robotsTxt.length != 0) {
final ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt);
@ -158,7 +157,7 @@ public final class RobotsTxtParser {
final String userAgent = line.substring(pos).trim();
isRule4AllAgents |= userAgent.equals("*");
for (final String agent: this.myNames) {
if (userAgent.toLowerCase().equals(agent)) {
if (userAgent.toLowerCase().equals(agent.toLowerCase())) {
this.agentName = agent;
isRule4ThisAgents = true;
break;

@ -220,7 +220,7 @@ public class WorkTables extends Tables {
*/
public Map<String, Integer> execAPICalls(String host, int port, String realm, Collection<String> pks) {
// now call the api URLs and store the result status
final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT);
final HTTPClient client = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
client.setRealm(realm);
client.setTimout(120000);
Tables.Row row;
@ -252,7 +252,7 @@ public class WorkTables extends Tables {
public static int execAPICall(String host, int port, String realm, String path, byte[] pk) {
// now call the api URLs and store the result status
final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT);
final HTTPClient client = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
client.setRealm(realm);
client.setTimout(120000);
String url = "http://" + host + ":" + port + path;

@ -24,7 +24,6 @@ import net.yacy.document.WordTokenizer;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.snippet.TextSnippet;
public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandler {
@ -60,7 +59,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
this.merge = true;
}
private static Document loadDocument(final String url, final LoaderDispatcher loader) throws IOException {
private static Document loadDocument(final String url, final LoaderDispatcher loader, ClientIdentification.Agent agent) throws IOException {
DigestURI uri;
Response response;
try {
@ -69,7 +68,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
ConcurrentLog.warn(YMarkTables.BOOKMARKS_LOG, "loadDocument failed due to malformed url: "+url);
return null;
}
response = loader.load(loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
response = loader.load(loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, agent);
try {
return Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
} catch (final Failure e) {
@ -210,11 +209,11 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
}
}
public static String autoTag(final String url, final LoaderDispatcher loader, final int max, final TreeMap<String, YMarkTag> tags) {
public static String autoTag(final String url, final LoaderDispatcher loader, ClientIdentification.Agent agent, final int max, final TreeMap<String, YMarkTag> tags) {
Document document = null;
String exception = "/IOExceptions";
try {
document = loadDocument(url, loader);
document = loadDocument(url, loader, agent);
} catch (final IOException e) {
exception = e.getMessage();
int start = exception.indexOf('\'')+9;
@ -247,7 +246,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
try {
final TreeMap<String, YMarkTag> tags = this.ymarks.getTags(this.bmk_user);
while((url = this.bmkQueue.take()) != POISON) {
tagString = autoTag(url, this.loader, 5, tags);
tagString = autoTag(url, this.loader, ClientIdentification.yacyInternetCrawlerAgent, 5, tags);
if (tagString.startsWith("/IOExceptions")) {
this.ymarks.addFolder(this.bmk_user, url, tagString);
tagString = "";

@ -34,6 +34,7 @@ import java.util.regex.Pattern;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Request;
@ -185,7 +186,8 @@ public class YMarkCrawlStart extends HashMap<String,String>{
crawlingQ,
true, true, true, true, true, false,
CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA); // TODO: make this a default profile in CrawlSwitchboard
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName); // TODO: make this a default profile in CrawlSwitchboard
sb.crawler.putActive(pe.handle().getBytes(), pe);
return sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),

@ -41,7 +41,6 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.index.Segment;
import net.yacy.search.snippet.TextSnippet;
public class YMarkMetadata {
private DigestURI uri;
@ -96,10 +95,10 @@ public class YMarkMetadata {
this.indexSegment = null;
}
public Document loadDocument(final LoaderDispatcher loader) throws IOException, Failure {
public Document loadDocument(final LoaderDispatcher loader, ClientIdentification.Agent agent) throws IOException, Failure {
if(this.document == null) {
Response response = null;
response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, agent);
this.document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
}
return this.document;

@ -43,6 +43,7 @@ import javax.swing.event.ChangeEvent;
import javax.swing.event.ChangeListener;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.data.WorkTables;
@ -362,15 +363,15 @@ public class YMarkTables {
addBookmark(bmk_user, bmk, true, true);
}
public void createBookmark(final LoaderDispatcher loader, final String url, final String bmk_user, final boolean autotag, final String tagsString, final String foldersString) throws IOException, Failure {
createBookmark(loader, new DigestURI(url), bmk_user, autotag, tagsString, foldersString);
public void createBookmark(final LoaderDispatcher loader, final String url, final ClientIdentification.Agent agent, final String bmk_user, final boolean autotag, final String tagsString, final String foldersString) throws IOException, Failure {
createBookmark(loader, new DigestURI(url), agent, bmk_user, autotag, tagsString, foldersString);
}
public void createBookmark(final LoaderDispatcher loader, final DigestURI url, final String bmk_user, final boolean autotag, final String tagsString, final String foldersString) throws IOException, Failure {
public void createBookmark(final LoaderDispatcher loader, final DigestURI url, final ClientIdentification.Agent agent, final String bmk_user, final boolean autotag, final String tagsString, final String foldersString) throws IOException, Failure {
final YMarkEntry bmk_entry = new YMarkEntry(false);
final YMarkMetadata meta = new YMarkMetadata(url);
final Document document = meta.loadDocument(loader);
final Document document = meta.loadDocument(loader, agent);
final EnumMap<YMarkMetadata.METADATA, String> metadata = meta.loadMetadata();
final String urls = url.toNormalform(true);
bmk_entry.put(YMarkEntry.BOOKMARK.URL.key(), urls);

@ -47,7 +47,6 @@ import net.yacy.crawler.retrieval.Response;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.snippet.TextSnippet;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
@ -60,11 +59,11 @@ public class OAIListFriendsLoader implements Serializable {
private static final HashMap<String, File> listFriends = new HashMap<String, File>();
public static void init(final LoaderDispatcher loader, final Map<String, File> moreFriends) {
public static void init(final LoaderDispatcher loader, final Map<String, File> moreFriends, final ClientIdentification.Agent agent) {
listFriends.putAll(moreFriends);
if (loader != null) for (final Map.Entry<String, File> oaiFriend: listFriends.entrySet()) {
try {
loader.loadIfNotExistBackground(new DigestURI(oaiFriend.getKey()), oaiFriend.getValue(), Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
loader.loadIfNotExistBackground(new DigestURI(oaiFriend.getKey()), oaiFriend.getValue(), Integer.MAX_VALUE, null, agent);
} catch (final MalformedURLException e) {
}
}
@ -84,12 +83,12 @@ public class OAIListFriendsLoader implements Serializable {
}
public static Map<String, String> getListFriends(final LoaderDispatcher loader) {
public static Map<String, String> getListFriends(final LoaderDispatcher loader, final ClientIdentification.Agent agent) {
final Map<String, String> map = new TreeMap<String, String>();
Map<String, String> m;
for (final Map.Entry<String, File> oaiFriend: listFriends.entrySet()) try {
if (!oaiFriend.getValue().exists()) {
final Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
final Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, agent);
if (response != null) FileUtils.copy(response.getContent(), oaiFriend.getValue());
}

@ -33,6 +33,7 @@ import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.LoaderDispatcher;
@ -59,8 +60,10 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
private final ResumptionToken resumptionToken;
private String message;
private final int serialNumber;
private final ClientIdentification.Agent agent;
public OAIPMHImporter(LoaderDispatcher loader, DigestURI source) {
public OAIPMHImporter(final LoaderDispatcher loader, final ClientIdentification.Agent agent, final DigestURI source) {
this.agent = agent;
this.serialNumber = importerCounter--;
this.loader = loader;
this.recordsCount = 0;
@ -134,7 +137,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
this.message = "loading first part of records";
while (true) {
try {
OAIPMHLoader loader = new OAIPMHLoader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath);
OAIPMHLoader loader = new OAIPMHLoader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath, this.agent);
this.completeListSize = Math.max(this.completeListSize, loader.getResumptionToken().getCompleteListSize());
this.chunkCount++;
this.recordsCount += loader.getResumptionToken().getRecordCounter();
@ -183,8 +186,9 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
LoaderDispatcher loader,
File surrogatesIn,
File surrogatesOut,
long staleLimit) {
Set<String> plainList = OAIListFriendsLoader.getListFriends(loader).keySet();
long staleLimit,
ClientIdentification.Agent agent) {
Set<String> plainList = OAIListFriendsLoader.getListFriends(loader, agent).keySet();
Map<String, Date> loaded = getLoadedOAIServer(surrogatesIn, surrogatesOut);
long limit = System.currentTimeMillis() - staleLimit;
for (Map.Entry<String, Date> a: loaded.entrySet()) {

@ -32,7 +32,6 @@ import net.yacy.crawler.retrieval.Response;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.snippet.TextSnippet;
// get one server with
@ -46,7 +45,7 @@ public class OAIPMHLoader {
private final DigestURI source;
private final ResumptionToken resumptionToken;
public OAIPMHLoader(final LoaderDispatcher loader, final DigestURI source, final File targetDir) throws IOException {
public OAIPMHLoader(final LoaderDispatcher loader, final DigestURI source, final File targetDir, final ClientIdentification.Agent agent) throws IOException {
this.source = source;
// load the file from the net
@ -56,7 +55,7 @@ public class OAIPMHLoader {
for (int i = 0; i < 5; i++) {
// make some retries if first attempt fails
try {
response = loader.load(loader.request(source, false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
response = loader.load(loader.request(source, false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, agent);
break;
} catch (final IOException e) {
ConcurrentLog.warn("OAIPMHLoader", "loading failed at attempt " + (i + 1) + ": " + source.toNormalform(true));

@ -299,7 +299,7 @@ public class htmlParser extends AbstractParser implements Parser {
DigestURI url;
try {
url = new DigestURI(args[0]);
final byte[] content = url.get(ClientIdentification.getUserAgent(), 3000);
final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent);
final Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content));
final String title = document[0].dc_title();
System.out.println(title);

@ -72,7 +72,7 @@ public class sitemapParser extends AbstractParser implements Parser {
final String charset, final InputStream source)
throws Failure, InterruptedException {
final List<Document> docs = new ArrayList<Document>();
SitemapReader sitemap = new SitemapReader(source);
SitemapReader sitemap = new SitemapReader(source, ClientIdentification.yacyInternetCrawlerAgent);
sitemap.start();
DigestURI uri;
Document doc;
@ -107,11 +107,11 @@ public class sitemapParser extends AbstractParser implements Parser {
return da;
}
public static SitemapReader parse(final DigestURI sitemapURL) throws IOException {
public static SitemapReader parse(final DigestURI sitemapURL, final ClientIdentification.Agent agent) throws IOException {
// download document
ConcurrentLog.info("SitemapReader", "loading sitemap from " + sitemapURL.toNormalform(true));
final RequestHeader requestHeader = new RequestHeader();
final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), 5000);
final HTTPClient client = new HTTPClient(agent);
client.setHeader(requestHeader.entrySet());
try {
client.GET(sitemapURL.toString());
@ -130,7 +130,7 @@ public class sitemapParser extends AbstractParser implements Parser {
contentStream = new GZIPInputStream(contentStream);
}
final ByteCountInputStream counterStream = new ByteCountInputStream(contentStream, null);
return new SitemapReader(counterStream);
return new SitemapReader(counterStream, agent);
} catch (final IOException e) {
throw e;
}
@ -144,9 +144,11 @@ public class sitemapParser extends AbstractParser implements Parser {
public static class SitemapReader extends Thread {
private final InputStream source;
private final BlockingQueue<URLEntry> queue;
public SitemapReader(final InputStream source) {
private final ClientIdentification.Agent agent;
public SitemapReader(final InputStream source, final ClientIdentification.Agent agent) {
this.source = source;
this.queue = new ArrayBlockingQueue<URLEntry>(10000);
this.agent = agent;
}
@Override
public void run() {
@ -157,7 +159,7 @@ public class sitemapParser extends AbstractParser implements Parser {
String url = new SitemapEntry((Element) sitemapNodes.item(i)).url();
if (url != null && url.length() > 0) {
try {
final SitemapReader r = parse(new DigestURI(url));
final SitemapReader r = parse(new DigestURI(url), agent);
r.start();
URLEntry item;
while ((item = r.take()) != POISON_URLEntry) {
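A minimal usage sketch of the agent-aware sitemap API from the hunks above; the sitemap URL is a placeholder and the checked exceptions (MalformedURLException, IOException) are left to the caller.

// sketch only: parse() now needs the agent that performs the HTTP fetch of the sitemap
DigestURI sitemapURL = new DigestURI("http://example.org/sitemap.xml"); // placeholder URL
sitemapParser.SitemapReader reader = sitemapParser.parse(sitemapURL, ClientIdentification.yacyInternetCrawlerAgent);
reader.start(); // entries are then consumed via take(), as in the loop above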

@ -98,6 +98,7 @@ public class opensearchdescriptionReader extends DefaultHandler {
private boolean parsingDescription, parsingTextValue;
private final HashMap<String, String> items; // Opensearchdescription Item map
private String rssurl, atomurl; // search url templates
private ClientIdentification.Agent agent;
public opensearchdescriptionReader() {
this.items = new HashMap<String, String>();
@ -106,6 +107,7 @@ public class opensearchdescriptionReader extends DefaultHandler {
this.parsingTextValue = false;
this.rssurl = null;
this.atomurl = null;
this.agent = ClientIdentification.yacyInternetCrawlerAgent;
}
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();
@ -142,10 +144,11 @@ public class opensearchdescriptionReader extends DefaultHandler {
}
}
public opensearchdescriptionReader(final String path, int timeout) {
public opensearchdescriptionReader(final String path, final ClientIdentification.Agent agent) {
this();
this.agent = agent;
try {
HTTPClient www = new HTTPClient(ClientIdentification.getUserAgent(), timeout);
HTTPClient www = new HTTPClient(agent);
www.GET(path);
final SAXParser saxParser = getParser();
saxParser.parse(www.getContentstream(), this);
@ -163,7 +166,7 @@ public class opensearchdescriptionReader extends DefaultHandler {
this.rssurl = null;
this.atomurl = null;
try {
HTTPClient www = new HTTPClient(ClientIdentification.getUserAgent(), 1000);
HTTPClient www = new HTTPClient(this.agent);
www.GET(path);
final SAXParser saxParser = getParser();
try {
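The explicit timeout argument of the reader constructor is gone; the timeout presumably comes from the agent itself now. A sketch with a placeholder descriptor URL:

// sketch only: the agent replaces the former (path, timeout) pair
opensearchdescriptionReader osd = new opensearchdescriptionReader(
        "http://example.org/opensearchdescription.xml", // placeholder URL
        ClientIdentification.yacyInternetCrawlerAgent);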

@ -20,8 +20,6 @@ import net.yacy.server.http.ServerSideIncludes;
import org.jsoup.Jsoup;
public class AugmentHtmlStream {
static RequestHeader globalrequestHeader;
@ -32,7 +30,7 @@ public class AugmentHtmlStream {
* @return the web page with integrated REFLECT elements
*/
private static String processExternal(String url, String fieldname, String data) throws IOException {
final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT);
final HTTPClient client = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
try {
StringBuilder postdata = new StringBuilder();
postdata.append(fieldname);

@ -187,7 +187,7 @@ public static String Tableentry(String url, String type, String comment, String
Seed host = sb.peers.lookupByName(sb.getConfig("interaction.contribution.accumulationpeer", ""));
return (UTF8.String(new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT).POSTbytes(
return (UTF8.String(new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent).POSTbytes(
"http://"+host.getPublicAddress()+"/interaction/Contribution.json"
+ "?url=" + url + "&comment=" + comment
+ "&from=" + from + "&peer=" + peer,

@ -150,7 +150,8 @@ public final class Protocol {
final String filename,
final Map<String, ContentBody> parts,
final int timeout) throws IOException {
final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), timeout);
final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
httpClient.setTimout(timeout);
return httpClient.POSTbytes(
new MultiProtocolURI("http://" + targetAddress + "/yacy/" + filename),
Seed.b64Hash2hexHash(targetPeerHash) + ".yacyh",
@ -192,7 +193,7 @@ public final class Protocol {
// send request
final long start = System.currentTimeMillis();
// final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/hello.html"), 30000, yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", parts);
final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), 30000);
final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, 30000);
content =
httpClient.POSTbytes(
new MultiProtocolURI("http://" + address + "/yacy/hello.html"),
@ -513,7 +514,7 @@ public final class Protocol {
parts.put("count", UTF8.StringBody(Integer.toString(maxCount)));
parts.put("time", UTF8.StringBody(Long.toString(maxTime)));
// final byte[] result = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + target.getClusterAddress() + "/yacy/urls.xml"), (int) maxTime, target.getHexHash() + ".yacyh", parts);
final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), (int) maxTime);
final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, (int) maxTime);
final byte[] result =
httpClient.POSTbytes(new MultiProtocolURI("http://"
+ target.getClusterAddress()
@ -935,7 +936,7 @@ public final class Protocol {
//resultMap = FileUtils.table(HTTPConnector.getConnector(MultiProtocolURI.crawlerUserAgent).post(new MultiProtocolURI("http://" + target.getClusterAddress() + "/yacy/search.html"), 60000, target.getHexHash() + ".yacyh", parts));
}
final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), 8000);
final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, 8000);
byte[] a = httpClient.POSTbytes(new MultiProtocolURI("http://" + hostaddress + "/yacy/search.html"), hostname, parts, false);
if (a != null && a.length > 200000) {
// there is something wrong. This is too large, maybe a hack on the other side?
@ -1286,7 +1287,7 @@ public final class Protocol {
UTF8.StringBody(((entry == null) ? "" : crypt.simpleEncode(entry.toString(), salt))));
// send request
// final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/crawlReceipt.html"), 10000, target.getHexHash() + ".yacyh", parts);
final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), 10000);
final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, 10000);
final byte[] content =
httpClient.POSTbytes(
new MultiProtocolURI("http://" + address + "/yacy/crawlReceipt.html"),
@ -1465,7 +1466,7 @@ public final class Protocol {
parts.put("entryc", UTF8.StringBody(Integer.toString(indexcount)));
parts.put("indexes", UTF8.StringBody(entrypost.toString()));
// final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/transferRWI.html"), timeout, targetSeed.getHexHash() + ".yacyh", parts, gzipBody);
final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), timeout);
final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, timeout);
final byte[] content =
httpClient.POSTbytes(
new MultiProtocolURI("http://" + address + "/yacy/transferRWI.html"),
@ -1523,7 +1524,7 @@ public final class Protocol {
try {
parts.put("urlc", UTF8.StringBody(Integer.toString(urlc)));
// final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/transferURL.html"), timeout, targetSeed.getHexHash() + ".yacyh", parts, gzipBody);
final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), timeout);
final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, timeout);
final byte[] content =
httpClient.POSTbytes(
new MultiProtocolURI("http://" + address + "/yacy/transferURL.html"),
@ -1556,7 +1557,7 @@ public final class Protocol {
final Map<String, ContentBody> parts =
basicRequestParts(Switchboard.getSwitchboard(), targetSeed.hash, salt);
// final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/profile.html"), 5000, targetSeed.getHexHash() + ".yacyh", parts);
final HTTPClient httpclient = new HTTPClient(ClientIdentification.getUserAgent(), 15000);
final HTTPClient httpclient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, 15000);
final byte[] content =
httpclient.POSTbytes(
new MultiProtocolURI("http://" + address + "/yacy/profile.html"),
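All peer-to-peer protocol calls in this file are switched to the fixed yacyInternetCrawlerAgent, so p2p traffic keeps the YaCy identity regardless of which agent a crawl start selects. The two HTTPClient constructor forms used above, as a sketch (the timeout value is illustrative):

// sketch only
HTTPClient a = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);       // agent only
HTTPClient b = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, 8000); // agent plus explicit timeout in ms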

@ -95,7 +95,6 @@ public final class SeedDB implements AlternativeDomainNames {
public Distribution scheme;
private Seed mySeed; // my own seed
private final Set<String> myBotIDs; // list of id's that this bot accepts as robots.txt identification
public SeedDB(
final File networkRoot,
@ -112,10 +111,6 @@ public final class SeedDB implements AlternativeDomainNames {
this.seedPotentialDBFile = new File(networkRoot, seedPotentialDBFileName);
this.mySeed = null; // my own seed
this.myOwnSeedFile = myOwnSeedFile;
this.myBotIDs = new HashSet<String>();
this.myBotIDs.add("yacy");
this.myBotIDs.add("yacybot");
this.myBotIDs.add("yacyproxy");
this.netRedundancy = redundancy;
this.scheme = new Distribution(partitionExponent);
@ -221,16 +216,10 @@ public final class SeedDB implements AlternativeDomainNames {
System.exit(-1);
}
}
this.myBotIDs.add(this.mySeed.getName() + ".yacy");
this.myBotIDs.add(this.mySeed.hash + ".yacyh");
this.mySeed.setIP(""); // we delete the old information to see what we have now
this.mySeed.put(Seed.PEERTYPE, Seed.PEERTYPE_VIRGIN); // markup startup condition
}
public Set<String> myBotIDs() {
return this.myBotIDs;
}
public int redundancy() {
if (this.mySeed.isJunior()) return 1;
return this.netRedundancy;
@ -253,9 +242,7 @@ public final class SeedDB implements AlternativeDomainNames {
}
public void setMyName(final String name) {
this.myBotIDs.remove(this.mySeed.getName() + ".yacy");
this.mySeed.setName(name);
this.myBotIDs.add(name + ".yacy");
}
@Override
@ -821,9 +808,9 @@ public final class SeedDB implements AlternativeDomainNames {
final RequestHeader reqHeader = new RequestHeader();
reqHeader.put(HeaderFramework.PRAGMA, "no-cache");
reqHeader.put(HeaderFramework.CACHE_CONTROL, "no-cache"); // httpc uses HTTP/1.0 is this necessary?
reqHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
reqHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.yacyInternetCrawlerAgent.userAgent);
final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT);
final HTTPClient client = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
client.setHeader(reqHeader.entrySet());
byte[] content = null;
try {

@ -41,7 +41,6 @@ import net.yacy.crawler.data.Cache;
import net.yacy.crawler.retrieval.Response;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
import net.yacy.search.snippet.TextSnippet;
import net.yacy.visualization.RasterPlotter;
public class OSMTile {
@ -114,7 +113,7 @@ public class OSMTile {
// download resource using the crawler and keep resource in memory if possible
Response entry = null;
try {
entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, ClientIdentification.yacyInternetCrawlerAgent);
} catch (final IOException e) {
ConcurrentLog.warn("OSMTile", "cannot load: " + e.getMessage());
return null;

@ -238,7 +238,7 @@ public final class yacyRelease extends yacyVersion {
try {
final DigestURI uri = location.getLocationURL();
Thread.currentThread().setName("allReleaseFrom - host " + uri.getHost()); // makes it more easy to see which release blocks process in thread dump
scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE, null, ClientIdentification.yacyInternetCrawlerAgent);
} catch (final IOException e) {
return null;
}
@ -288,7 +288,7 @@ public final class yacyRelease extends yacyVersion {
final String name = getUrl().getFileName();
byte[] signatureBytes = null;
final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT);
final HTTPClient client = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
// download signature first, if public key is available
try {

@ -132,9 +132,9 @@ public final class LoaderDispatcher {
0);
}
public void load(final DigestURI url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile, BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
public void load(final DigestURI url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile, BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, blacklistType, minDelay, timeout).getContent();
final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, blacklistType, agent).getContent();
if (b == null) throw new IOException("load == null");
final File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
@ -145,11 +145,11 @@ public final class LoaderDispatcher {
tmp.renameTo(targetFile);
}
public Response load(final Request request, final CacheStrategy cacheStrategy, final BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
return load(request, cacheStrategy, protocolMaxFileSize(request.url()), blacklistType, minDelay, timeout);
public Response load(final Request request, final CacheStrategy cacheStrategy, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
return load(request, cacheStrategy, protocolMaxFileSize(request.url()), blacklistType, agent);
}
public Response load(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
public Response load(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
Semaphore check = this.loaderSteering.get(request.url());
if (check != null) {
// a loading process may be going on for that url
@ -160,7 +160,7 @@ public final class LoaderDispatcher {
this.loaderSteering.put(request.url(), new Semaphore(0));
try {
final Response response = loadInternal(request, cacheStrategy, maxFileSize, blacklistType, minDelay, timeout);
final Response response = loadInternal(request, cacheStrategy, maxFileSize, blacklistType, agent);
check = this.loaderSteering.remove(request.url());
if (check != null) check.release(1000);
return response;
@ -180,7 +180,7 @@ public final class LoaderDispatcher {
* @return the loaded entity in a Response object
* @throws IOException
*/
private Response loadInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
private Response loadInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
// get the protocol of the next URL
final DigestURI url = request.url();
if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
@ -206,7 +206,7 @@ public final class LoaderDispatcher {
// create request header values and a response object because we need that
// in case that we want to return the cached content in the next step
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent);
DigestURI refererURL = null;
if (request.referrerhash() != null) refererURL = this.sb.getURL(request.referrerhash());
if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true));
@ -258,7 +258,7 @@ public final class LoaderDispatcher {
if (!url.isLocal()) {
final Long lastAccess = accessTime.get(host);
long wait = 0;
if (lastAccess != null) wait = Math.max(0, minDelay + lastAccess.longValue() - System.currentTimeMillis());
if (lastAccess != null) wait = Math.max(0, agent.minimumDelta + lastAccess.longValue() - System.currentTimeMillis());
if (wait > 0) {
// force a sleep here. Instead just sleep we clean up the accessTime map
final long untilTime = System.currentTimeMillis() + wait;
@ -280,7 +280,7 @@ public final class LoaderDispatcher {
// load resource from the internet
Response response = null;
if (protocol.equals("http") || protocol.equals("https")) {
response = this.httpLoader.load(request, crawlProfile, maxFileSize, blacklistType, timeout);
response = this.httpLoader.load(request, crawlProfile, maxFileSize, blacklistType, agent);
} else if (protocol.equals("ftp")) {
response = this.ftpLoader.load(request, true);
} else if (protocol.equals("smb")) {
@ -335,19 +335,19 @@ public final class LoaderDispatcher {
* @return the content as {@link byte[]}
* @throws IOException
*/
public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
// try to download the resource using the loader
final Response entry = load(request, cacheStrategy, blacklistType, minDelay, timeout);
final Response entry = load(request, cacheStrategy, blacklistType, agent);
if (entry == null) return null; // not found in web
// read resource body (if it is there)
return entry.getContent();
}
public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType, final long minDelay, int timeout) throws IOException, Parser.Failure {
public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException, Parser.Failure {
// load resource
final Response response = load(request, cacheStrategy, maxFileSize, blacklistType, minDelay, timeout);
final Response response = load(request, cacheStrategy, maxFileSize, blacklistType, agent);
final DigestURI url = request.url();
if (response == null) throw new IOException("no Response for url " + url);
@ -358,10 +358,10 @@ public final class LoaderDispatcher {
return response.parse();
}
public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
// load resource
Request request = request(location, true, false);
final Response response = this.load(request, cachePolicy, blacklistType, minDelay, timeout);
final Response response = this.load(request, cachePolicy, blacklistType, agent);
final DigestURI url = request.url();
if (response == null) throw new IOException("no Response for url " + url);
@ -384,8 +384,8 @@ public final class LoaderDispatcher {
* @return a map from URLs to the anchor texts of the urls
* @throws IOException
*/
public final Map<DigestURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, minDelay, timeout);
public final Map<DigestURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, agent);
if (response == null) throw new IOException("response == null");
final ResponseHeader responseHeader = response.getResponseHeader();
if (response.getContent() == null) throw new IOException("resource == null");
@ -414,12 +414,12 @@ public final class LoaderDispatcher {
}
}
public void loadIfNotExistBackground(final DigestURI url, final File cache, final int maxFileSize, BlacklistType blacklistType, final long minDelay, final int timeout) {
new Loader(url, cache, maxFileSize, CacheStrategy.IFEXIST, blacklistType, minDelay, timeout).start();
public void loadIfNotExistBackground(final DigestURI url, final File cache, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) {
new Loader(url, cache, maxFileSize, CacheStrategy.IFEXIST, blacklistType, agent).start();
}
public void loadIfNotExistBackground(final DigestURI url, final int maxFileSize, BlacklistType blacklistType, final long minDelay, int timeout) {
new Loader(url, null, maxFileSize, CacheStrategy.IFEXIST, blacklistType, minDelay, timeout).start();
public void loadIfNotExistBackground(final DigestURI url, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) {
new Loader(url, null, maxFileSize, CacheStrategy.IFEXIST, blacklistType, agent).start();
}
private class Loader extends Thread {
@ -429,17 +429,15 @@ public final class LoaderDispatcher {
private final int maxFileSize;
private final CacheStrategy cacheStrategy;
private final BlacklistType blacklistType;
private final long minDelay;
private final int timeout;
private final ClientIdentification.Agent agent;
public Loader(final DigestURI url, final File cache, final int maxFileSize, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay, final int timeout) {
public Loader(final DigestURI url, final File cache, final int maxFileSize, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) {
this.url = url;
this.cache = cache;
this.maxFileSize = maxFileSize;
this.cacheStrategy = cacheStrategy;
this.blacklistType = blacklistType;
this.minDelay = minDelay;
this.timeout = timeout;
this.agent = agent;
}
@Override
@ -447,7 +445,7 @@ public final class LoaderDispatcher {
if (this.cache != null && this.cache.exists()) return;
try {
// load from the net
final Response response = load(request(this.url, false, true), this.cacheStrategy, this.maxFileSize, this.blacklistType, this.minDelay, this.timeout);
final Response response = load(request(this.url, false, true), this.cacheStrategy, this.maxFileSize, this.blacklistType, this.agent);
final byte[] b = response.getContent();
if (this.cache != null) FileUtils.copy(b, this.cache);
} catch (final MalformedURLException e) {} catch (final IOException e) {}
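The loader entry points replace the former (minDelay, timeout) pair with a single ClientIdentification.Agent; the per-host politeness wait is now computed from agent.minimumDelta (see loadInternal above). A caller-side sketch, with sb and url as assumed context:

// sketch only: new call shape of the LoaderDispatcher API
ClientIdentification.Agent agent = ClientIdentification.yacyInternetCrawlerAgent;
Response response = sb.loader.load(sb.loader.request(url, true, false),
        CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
Document doc = sb.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, agent);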

@ -191,7 +191,6 @@ import net.yacy.search.ranking.RankingProfile;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.schema.WebgraphConfiguration;
import net.yacy.search.snippet.TextSnippet;
import net.yacy.server.serverCore;
import net.yacy.server.serverSwitch;
import net.yacy.server.http.RobotsTxtConfig;
@ -792,11 +791,8 @@ public final class Switchboard extends serverSwitch {
OAIListFriendsLoader.loadListFriendsSources(
new File("defaults/oaiListFriendsSource.xml"),
getDataPath());
OAIListFriendsLoader.init(this.loader, oaiFriends);
OAIListFriendsLoader.init(this.loader, oaiFriends, ClientIdentification.yacyInternetCrawlerAgent);
this.crawlQueues = new CrawlQueues(this, this.queuesRoot);
this.crawlQueues.noticeURL.setMinimumDelta(
getConfigInt("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()),
getConfigInt("minimumGlobalDelta", this.crawlQueues.noticeURL.getMinimumGlobalDelta()));
// on startup, resume all crawls
setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL + "_isPaused", "false");
@ -1233,16 +1229,9 @@ public final class Switchboard extends serverSwitch {
}
*/
// write the YaCy network identification inside the yacybot client user agent to distinguish networks
String newagent =
ClientIdentification.generateYaCyBot(getConfig(SwitchboardConstants.NETWORK_NAME, "")
ClientIdentification.generateYaCyBot(getConfig(SwitchboardConstants.NETWORK_NAME, "")
+ (isRobinsonMode() ? "-" : "/")
+ getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global"));
if ( !getConfigBool(SwitchboardConstants.DHT_ENABLED, false)
&& getConfig("network.unit.tenant.agent", "").length() > 0 ) {
newagent = getConfig("network.unit.tenant.agent", "").trim();
this.log.info("new user agent: '" + newagent + "'");
}
ClientIdentification.setUserAgent(newagent);
}
public void switchNetwork(final String networkDefinition) throws FileNotFoundException, IOException {
@ -1486,6 +1475,10 @@ public final class Switchboard extends serverSwitch {
return sb;
}
public boolean isP2PMode() {
return getConfig(SwitchboardConstants.NETWORK_BOOTSTRAP_SEEDLIST_STUB + "0", null) != null;
}
public boolean isIntranetMode() {
return "local.any".indexOf(getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global")) >= 0;
}
@ -2910,7 +2903,7 @@ public final class Switchboard extends serverSwitch {
// get a scraper to get the title
Document scraper;
try {
scraper = this.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
scraper = this.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, profile.getAgent());
} catch (final IOException e) {
return "scraper cannot load URL: " + e.getMessage();
}
@ -2964,7 +2957,7 @@ public final class Switchboard extends serverSwitch {
// do the same for ymarks
// TODO: could a non admin user add crawls?
try {
this.tables.bookmarks.createBookmark(this.loader, url, YMarkTables.USER_ADMIN, true, "crawlStart", "/Crawl Start");
this.tables.bookmarks.createBookmark(this.loader, url, profile.getAgent(), YMarkTables.USER_ADMIN, true, "crawlStart", "/Crawl Start");
} catch (final IOException e) {
ConcurrentLog.logException(e);
} catch (final Failure e) {
@ -3017,7 +3010,7 @@ public final class Switchboard extends serverSwitch {
String urlName = url.toNormalform(true);
Thread.currentThread().setName("Switchboard.addToIndex:" + urlName);
try {
final Response response = Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
final Response response = Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, ClientIdentification.yacyIntranetCrawlerAgent);
if (response == null) {
throw new IOException("response == null");
}
@ -3418,7 +3411,7 @@ public final class Switchboard extends serverSwitch {
final Map<DigestURI, String> links;
searchEvent.oneFeederStarted();
try {
links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay, 2000);
links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
if ( links != null ) {
final Iterator<DigestURI> i = links.keySet().iterator();
while ( i.hasNext() ) {
@ -3457,7 +3450,7 @@ public final class Switchboard extends serverSwitch {
final Map<DigestURI, String> links;
DigestURI url;
try {
links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay, 2000);
links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
if (links != null) {
if (links.size() < 1000) { // limit to 1000 to skip large index pages
final Iterator<DigestURI> i = links.keySet().iterator();
@ -3521,7 +3514,7 @@ public final class Switchboard extends serverSwitch {
searchEvent.oneFeederStarted();
try {
final Response response =
Switchboard.this.loader.load(Switchboard.this.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
Switchboard.this.loader.load(Switchboard.this.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
final byte[] resource = (response == null) ? null : response.getContent();
//System.out.println("BLEKKO: " + UTF8.String(resource));
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
@ -3629,7 +3622,7 @@ public final class Switchboard extends serverSwitch {
if ( Thread.currentThread().isInterrupted() ) {
break;
}
seedListFileURL = this.getConfig("network.unit.bootstrap.seedlist" + c, "");
seedListFileURL = this.getConfig(SwitchboardConstants.NETWORK_BOOTSTRAP_SEEDLIST_STUB + c, "");
if ( seedListFileURL.isEmpty() ) {
break;
}
@ -3653,7 +3646,7 @@ public final class Switchboard extends serverSwitch {
final RequestHeader reqHeader = new RequestHeader();
reqHeader.put(HeaderFramework.PRAGMA, "no-cache");
reqHeader.put(HeaderFramework.CACHE_CONTROL, "no-cache");
final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), timeout);
final HTTPClient client = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, timeout);
client.setHeader(reqHeader.entrySet());
client.HEADResponse(url.toString());
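In the crawl-start path the agent now comes from the crawl profile (profile.getAgent() above), so each crawl start carries its own identity, while internal helper fetches fall back to the fixed intranet or internet agents. A sketch of the profile-driven call; profile, url and sb are assumed context:

// sketch only: per-crawl-start agent, taken from the crawl profile
ClientIdentification.Agent agent = profile.getAgent();
Document scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, agent);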

@ -444,10 +444,11 @@ public final class SwitchboardConstants {
*
*/
public static final String NETWORK_NAME = "network.unit.name";
public static final String NETWORK_DOMAIN = "network.unit.domain";
public static final String NETWORK_DOMAIN = "network.unit.domain"; // can be filled with: global, local, any
public static final String NETWORK_DOMAIN_NOCHECK = "network.unit.domain.nocheck";
public static final String NETWORK_WHITELIST = "network.unit.access.whitelist";
public static final String NETWORK_BLACKLIST = "network.unit.access.blacklist";
public static final String NETWORK_BOOTSTRAP_SEEDLIST_STUB = "network.unit.bootstrap.seedlist";
public static final String NETWORK_SEARCHVERIFY = "network.unit.inspection.searchverify";

@ -34,6 +34,7 @@ import java.util.concurrent.LinkedBlockingQueue;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
@ -151,7 +152,7 @@ public class DocumentIndex extends Segment {
length = -1;
}
try {
documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1));
documents = TextParser.parseSource(url, null, null, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent));
} catch (final Exception e ) {
throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
}

@ -767,8 +767,8 @@ public class Segment {
return vector;
}
public void removeAllUrlReferences(final HandleSet urls, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) {
for (final byte[] urlhash: urls) removeAllUrlReferences(urlhash, loader, cacheStrategy);
public void removeAllUrlReferences(final HandleSet urls, final LoaderDispatcher loader, final ClientIdentification.Agent agent, final CacheStrategy cacheStrategy) {
for (final byte[] urlhash: urls) removeAllUrlReferences(urlhash, loader, agent, cacheStrategy);
}
/**
@ -779,7 +779,7 @@ public class Segment {
* @param cacheStrategy
* @return number of removed words
*/
public int removeAllUrlReferences(final byte[] urlhash, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) {
public int removeAllUrlReferences(final byte[] urlhash, final LoaderDispatcher loader, final ClientIdentification.Agent agent, final CacheStrategy cacheStrategy) {
if (urlhash == null) return 0;
// determine the url string
@ -788,7 +788,7 @@ public class Segment {
try {
// parse the resource
final Document document = Document.mergeDocuments(url, null, loader.loadDocuments(loader.request(url, true, false), cacheStrategy, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT));
final Document document = Document.mergeDocuments(url, null, loader.loadDocuments(loader.request(url, true, false), cacheStrategy, Integer.MAX_VALUE, null, agent));
if (document == null) {
// delete just the url entry
fulltext().remove(urlhash);
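removeAllUrlReferences re-loads and re-parses the document to find the words to delete, so it now needs an agent as well. Sketch, with segment, sb.loader and urlhash as assumed context:

// sketch only: reference removal with an explicit agent
int removed = segment.removeAllUrlReferences(urlhash, sb.loader,
        ClientIdentification.yacyInternetCrawlerAgent, CacheStrategy.IFEXIST);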

@ -143,7 +143,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
Document document;
try {
document = Document.mergeDocuments(url, null, Switchboard.getSwitchboard().loader.loadDocuments(Switchboard.getSwitchboard().loader.request(url, false, reindexing), cacheStrategy, Integer.MAX_VALUE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT));
document = Document.mergeDocuments(url, null, Switchboard.getSwitchboard().loader.loadDocuments(Switchboard.getSwitchboard().loader.request(url, false, reindexing), cacheStrategy, Integer.MAX_VALUE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent));
} catch (final IOException e) {
ConcurrentLog.fine("snippet fetch", "load error: " + e.getMessage());
return new ArrayList<MediaSnippet>();

@ -36,6 +36,7 @@ import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.storage.ARC;
import net.yacy.cora.storage.ConcurrentARC;
import net.yacy.cora.storage.HandleSet;
@ -60,7 +61,6 @@ import net.yacy.search.query.QueryGoal;
public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
public static final long snippetMinLoadDelay = 10;
private static final int MAX_CACHE = 1000;
@ -204,7 +204,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
final Request request = loader == null ? null : loader.request(url, true, reindexing);
Response response;
try {
response = loader == null || request == null ? null : loader.load(request, CacheStrategy.CACHEONLY, BlacklistType.SEARCH, snippetMinLoadDelay, 3000);
response = loader == null || request == null ? null : loader.load(request, CacheStrategy.CACHEONLY, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
} catch (final IOException e1) {
response = null;
}
@ -258,7 +258,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
// try to load the resource from the cache
Response response = null;
try {
response = loader == null ? null : loader.load(loader.request(url, true, reindexing), (url.isFile() || url.isSMB()) ? CacheStrategy.NOCACHE : (cacheStrategy == null ? CacheStrategy.CACHEONLY : cacheStrategy), BlacklistType.SEARCH, snippetMinLoadDelay, 3000);
response = loader == null ? null : loader.load(loader.request(url, true, reindexing), (url.isFile() || url.isSMB()) ? CacheStrategy.NOCACHE : (cacheStrategy == null ? CacheStrategy.CACHEONLY : cacheStrategy), BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
} catch (final IOException e) {
response = null;
}
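Snippet fetching drops the snippetMinLoadDelay constant and the hard-coded timeouts in favour of the fixed yacyIntranetCrawlerAgent. The resulting call shape, as a sketch with loader, url and reindexing as assumed context:

// sketch only: snippet loads now identify with the intranet crawler agent
Response response = loader.load(loader.request(url, true, reindexing),
        CacheStrategy.CACHEONLY, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);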

@ -97,6 +97,7 @@ import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.order.Digest;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
@ -1532,7 +1533,7 @@ public final class HTTPDFileHandler {
requestHeader.put("YACYACTION", action);
final ByteArrayOutputStream o = new ByteArrayOutputStream();
HTTPDProxyHandler.doGet(prop, requestHeader, o);
HTTPDProxyHandler.doGet(prop, requestHeader, o, ClientIdentification.yacyProxyAgent);
// reparse header to extract content-length and mimetype
final ResponseHeader outgoingHeader = new ResponseHeader(200);

@ -287,7 +287,7 @@ public final class HTTPDProxyHandler {
* @param respond the OutputStream to the client
* @see de.anomic.http.httpdHandler#doGet(java.util.Properties, net.yacy.cora.protocol.HeaderFramework, java.io.OutputStream)
*/
public static void doGet(final HashMap<String, Object> conProp, final RequestHeader requestHeader, final OutputStream respond) {
public static void doGet(final HashMap<String, Object> conProp, final RequestHeader requestHeader, final OutputStream respond, final ClientIdentification.Agent agent) {
ByteCountOutputStream countedRespond = null;
try {
final int reqID = requestHeader.hashCode();
@ -387,7 +387,7 @@ public final class HTTPDProxyHandler {
// case 1 and case 3
if (cachedResponseHeader == null) {
if (log.isFinest()) log.finest(reqID + " page not in cache: fulfill request from web");
fulfillRequestFromWeb(conProp, url, requestHeader, cachedResponseHeader, countedRespond);
fulfillRequestFromWeb(conProp, url, requestHeader, cachedResponseHeader, countedRespond, agent);
} else {
final Request request = new Request(
null,
@ -413,7 +413,7 @@ public final class HTTPDProxyHandler {
fulfillRequestFromCache(conProp, url, requestHeader, cachedResponseHeader, cacheContent, countedRespond);
} else {
if (log.isFinest()) log.finest(reqID + " fulfill request from web");
fulfillRequestFromWeb(conProp, url, requestHeader, cachedResponseHeader, countedRespond);
fulfillRequestFromWeb(conProp, url, requestHeader, cachedResponseHeader, countedRespond, agent);
}
}
@ -443,7 +443,7 @@ public final class HTTPDProxyHandler {
}
}
private static void fulfillRequestFromWeb(final HashMap<String, Object> conProp, final DigestURI url, final RequestHeader requestHeader, final ResponseHeader cachedResponseHeader, final OutputStream respond) {
private static void fulfillRequestFromWeb(final HashMap<String, Object> conProp, final DigestURI url, final RequestHeader requestHeader, final ResponseHeader cachedResponseHeader, final OutputStream respond, final ClientIdentification.Agent agent) {
try {
final boolean proxyAugmentation = sb.getConfigBool("proxyAugmentation", false);
final int reqID = requestHeader.hashCode();
@ -488,7 +488,7 @@ public final class HTTPDProxyHandler {
requestHeader.remove(HeaderFramework.HOST);
final HTTPClient client = setupHttpClient(requestHeader, connectHost);
final HTTPClient client = setupHttpClient(requestHeader, agent, connectHost);
// send request
try {
@ -761,7 +761,7 @@ public final class HTTPDProxyHandler {
return;
}
public static void doHead(final HashMap<String, Object> conProp, final RequestHeader requestHeader, OutputStream respond) {
public static void doHead(final HashMap<String, Object> conProp, final RequestHeader requestHeader, OutputStream respond, final ClientIdentification.Agent agent) {
// ResponseContainer res = null;
DigestURI url = null;
@ -832,7 +832,7 @@ public final class HTTPDProxyHandler {
final String getUrl = "http://"+ connectHost + remotePath;
if (log.isFinest()) log.finest(reqID +" using url: "+ getUrl);
final HTTPClient client = setupHttpClient(requestHeader, connectHost);
final HTTPClient client = setupHttpClient(requestHeader, agent, connectHost);
// send request
// try {
@ -877,7 +877,7 @@ public final class HTTPDProxyHandler {
}
}
public static void doPost(final HashMap<String, Object> conProp, final RequestHeader requestHeader, final OutputStream respond, final InputStream body) throws IOException {
public static void doPost(final HashMap<String, Object> conProp, final RequestHeader requestHeader, final OutputStream respond, final InputStream body, final ClientIdentification.Agent agent) throws IOException {
assert conProp != null : "precondition violated: conProp != null";
assert requestHeader != null : "precondition violated: requestHeader != null";
assert body != null : "precondition violated: body != null";
@ -942,7 +942,7 @@ public final class HTTPDProxyHandler {
final int contentLength = requestHeader.getContentLength();
requestHeader.remove(HeaderFramework.CONTENT_LENGTH);
final HTTPClient client = setupHttpClient(requestHeader, connectHost);
final HTTPClient client = setupHttpClient(requestHeader, agent, connectHost);
// check input
if(body == null) {
@ -1073,9 +1073,9 @@ public final class HTTPDProxyHandler {
* @param connectHost may be 'host:port' or 'host:port/path'
* @return
*/
private static HTTPClient setupHttpClient(final RequestHeader requestHeader, final String connectHost) {
private static HTTPClient setupHttpClient(final RequestHeader requestHeader, final ClientIdentification.Agent agent, final String connectHost) {
// setup HTTP-client
final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), timeout);
final HTTPClient client = new HTTPClient(agent, timeout);
client.setHeader(requestHeader.entrySet());
client.setRedirecting(false);
return client;
@ -1215,7 +1215,7 @@ public final class HTTPDProxyHandler {
}
}
public static void doConnect(final HashMap<String, Object> conProp, final RequestHeader requestHeader, final InputStream clientIn, final OutputStream clientOut) throws IOException {
public static void doConnect(final HashMap<String, Object> conProp, final RequestHeader requestHeader, final InputStream clientIn, final OutputStream clientOut, final ClientIdentification.Agent agent) throws IOException {
sb.proxyLastAccess = System.currentTimeMillis();
@ -1247,7 +1247,7 @@ public final class HTTPDProxyHandler {
// possibly branch into PROXY-PROXY connection
if (ProxySettings.useForHost(host, Protocol.HTTPS)) {
final HTTPClient remoteProxy = setupHttpClient(requestHeader, host);
final HTTPClient remoteProxy = setupHttpClient(requestHeader, agent, host);
try {
remoteProxy.HEADResponse("http://" + host + ":" + port);

@ -55,6 +55,7 @@ import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
@ -410,7 +411,7 @@ public final class HTTPDemon implements serverHandler, Cloneable {
// pass to proxy
if (((allowYaCyHop()) && (handleYaCyHopAuthentication(header, prop))) ||
((allowProxy(session)) && (handleProxyAuthentication(header, prop, session)))) {
HTTPDProxyHandler.doGet(prop, header, session.out);
HTTPDProxyHandler.doGet(prop, header, session.out, ClientIdentification.yacyProxyAgent);
} else {
// not authorized through firewall blocking (ip does not match filter)
session.out.write(UTF8.getBytes(httpVersion + " 403 refused (IP not granted, 2)" + serverCore.CRLF_STRING + serverCore.CRLF_STRING + "you are not allowed to connect to this proxy, because you are using a non-granted IP (" + session.userAddress.getHostAddress() + "). allowed are only connections that match with the following filter (2): " + switchboard.getConfig("proxyClient", "*") + serverCore.CRLF_STRING));
@ -478,7 +479,7 @@ public final class HTTPDemon implements serverHandler, Cloneable {
// pass to proxy
if (((allowYaCyHop()) && (handleYaCyHopAuthentication(header, prop))) ||
((allowProxy(session)) && (handleProxyAuthentication(header, prop, session)))) {
HTTPDProxyHandler.doHead(prop, header, session.out);
HTTPDProxyHandler.doHead(prop, header, session.out, ClientIdentification.yacyProxyAgent);
} else {
// not authorized through firewall blocking (ip does not match filter)
session.out.write(UTF8.getBytes(httpVersion + " 403 refused (IP not granted)" + serverCore.CRLF_STRING));
@ -544,7 +545,7 @@ public final class HTTPDemon implements serverHandler, Cloneable {
// pass to proxy
if (((allowYaCyHop()) && (handleYaCyHopAuthentication(header, prop))) ||
((allowProxy(session)) && (handleProxyAuthentication(header, prop, session)))) {
HTTPDProxyHandler.doPost(prop, header, session.out, sessionIn);
HTTPDProxyHandler.doPost(prop, header, session.out, sessionIn, ClientIdentification.yacyProxyAgent);
} else {
// not authorized through firewall blocking (ip does not match filter)
session.out.write(UTF8.getBytes(httpVersion + " 403 refused (IP not granted)" + serverCore.CRLF_STRING + serverCore.CRLF_STRING + "you are not allowed to connect to this proxy, because you are using the non-granted IP " + session.userAddress.getHostAddress() + ". allowed are only connections that match with the following filter (4): " + switchboard.getConfig("proxyClient", "*") + serverCore.CRLF_STRING));
@ -633,7 +634,7 @@ public final class HTTPDemon implements serverHandler, Cloneable {
// pass to proxy
if (((allowYaCyHop()) && (handleYaCyHopAuthentication(header, prop))) ||
((allowProxy(session)) && (handleProxyAuthentication(header, prop, session)))) {
HTTPDProxyHandler.doConnect(prop, header, session.in, session.out);
HTTPDProxyHandler.doConnect(prop, header, session.in, session.out, ClientIdentification.yacyProxyAgent);
} else {
// not authorized through firewall blocking (ip does not match filter)
session.out.write(UTF8.getBytes(httpVersion + " 403 refused (IP not granted)" + serverCore.CRLF_STRING + serverCore.CRLF_STRING + "you are not allowed to connect to this proxy, because you are using the non-granted IP " + session.userAddress.getHostAddress() + ". allowed are only connections that match with the following filter (6): " + switchboard.getConfig("proxyClient", "*") + serverCore.CRLF_STRING));
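The proxy entry points (doGet, doHead, doPost, doConnect above) now receive their agent from the caller, and HTTPDemon hands in the dedicated yacyProxyAgent everywhere, keeping proxied traffic identified separately from crawler traffic. The call shape, mirroring the lines above:

// sketch only: proxy requests run under the dedicated proxy agent
HTTPDProxyHandler.doGet(prop, header, session.out, ClientIdentification.yacyProxyAgent);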

@ -589,8 +589,8 @@ public class serverSwitch
netdef = netdef.trim();
try {
final RequestHeader reqHeader = new RequestHeader();
reqHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT);
reqHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.yacyInternetCrawlerAgent.userAgent);
final HTTPClient client = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
client.setHeader(reqHeader.entrySet());
byte[] data = client.GETbytes(uri);
if ( data == null || data.length == 0 ) {

@ -60,8 +60,8 @@ public class loaderThreads {
this.failed = 0;
}
public void newThread(final String name, final DigestURI url, final loaderProcess process) {
final Thread t = new loaderThread(url, process);
public void newThread(final String name, final DigestURI url, final loaderProcess process, final ClientIdentification.Agent agent) {
final Thread t = new loaderThread(url, process, agent);
this.threads.put(name, t);
t.start();
}
@ -108,19 +108,21 @@ public class loaderThreads {
private final loaderProcess process;
private byte[] page;
private boolean loaded;
final ClientIdentification.Agent agent;
public loaderThread(final DigestURI url, final loaderProcess process) {
public loaderThread(final DigestURI url, final loaderProcess process, final ClientIdentification.Agent agent) {
this.url = url;
this.process = process;
this.error = null;
this.page = null;
this.loaded = false;
this.agent = agent;
}
@Override
public void run() {
try {
this.page = this.url.get(ClientIdentification.getUserAgent(), loaderThreads.this.timeout);
this.page = this.url.get(this.agent);
this.loaded = true;
this.process.feed(this.page);
if (this.process.status() == loaderCore.STATUS_FAILED) {

@ -301,7 +301,7 @@ public final class yacy {
yacyRelease.deleteOldDownloads(sb.releasePath, deleteOldDownloadsAfterDays );
// set user-agent
HTTPClient.setDefaultUserAgent(ClientIdentification.getUserAgent());
HTTPClient.setDefaultUserAgent(ClientIdentification.yacyInternetCrawlerAgent.userAgent);
// initial fill of the triplestore
File triplestore = new File(sb.getConfig("triplestore", new File(dataHome, "DATA/TRIPLESTORE").getAbsolutePath()));
@ -549,7 +549,7 @@ public final class yacy {
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(RequestHeader.AUTHORIZATION, "realm=" + encodedPassword); // for http-authentify
// final Client con = new Client(10000, requestHeader);
final HTTPClient con = new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT);
final HTTPClient con = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
con.setHeader(requestHeader.entrySet());
// ResponseContainer res = null;
try {