mirror of
https://github.com/yacy/yacy_search_server.git
synced 2025-07-17 08:26:08 -04:00
protection against crawl balancer failure:
a minimum of 500 milliseconds distance between two accesses to the same domain is now ensured git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3354 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
source/de/anomic/plasma
@ -58,10 +58,12 @@ public class plasmaCrawlBalancer {
|
||||
|
||||
private kelondroStack stack;
|
||||
private HashMap domainStacks;
|
||||
private HashMap domainAccess;
|
||||
|
||||
public plasmaCrawlBalancer(File stackFile) {
|
||||
stack = kelondroStack.open(stackFile, new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength, kelondroBase64Order.enhancedCoder, 0));
|
||||
domainStacks = new HashMap();
|
||||
domainAccess = new HashMap();
|
||||
}
|
||||
|
||||
public void close() {
|
||||
@ -135,17 +137,36 @@ public class plasmaCrawlBalancer {
|
||||
}
|
||||
}
|
||||
|
||||
public byte[] get() throws IOException {
|
||||
public String get(long minimumDelta) throws IOException {
|
||||
// returns an url-hash from the stack
|
||||
synchronized (domainStacks) {
|
||||
String entry = null;
|
||||
if (stack.size() > 0) {
|
||||
return stack.pop().getColBytes(0);
|
||||
entry = new String(stack.pop().getColBytes(0));
|
||||
} else if (domainStacks.size() > 0) {
|
||||
flushOnce();
|
||||
return stack.pop().getColBytes(0);
|
||||
} else {
|
||||
return null;
|
||||
entry = new String(stack.pop().getColBytes(0));
|
||||
}
|
||||
if ((minimumDelta > 0) && (entry != null)) {
|
||||
// check if the time after retrieval of last hash from same
|
||||
// domain is not shorter than the minimumDelta
|
||||
String domhash = entry.substring(6);
|
||||
Long lastAccess = (Long) domainAccess.get(domhash);
|
||||
if (lastAccess != null) {
|
||||
// this is not the first access of the same domain
|
||||
long la = lastAccess.longValue();
|
||||
if (System.currentTimeMillis() - la > minimumDelta) {
|
||||
// force a busy waiting here
|
||||
// in best case, this should never happen if the balancer works propertly
|
||||
// this is only to protect against the worst case, where the crawler could
|
||||
// behave in a DoS-manner
|
||||
long sleeptime = System.currentTimeMillis() - la - minimumDelta;
|
||||
if (sleeptime > 0) try {this.wait(sleeptime);} catch (InterruptedException e) {}
|
||||
}
|
||||
}
|
||||
domainAccess.put(domhash, new Long(System.currentTimeMillis()));
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -76,6 +76,7 @@ public class plasmaCrawlNURL {
|
||||
public static final int STACK_TYPE_MOVIE = 12; // put on movie stack
|
||||
public static final int STACK_TYPE_MUSIC = 13; // put on music stack
|
||||
|
||||
private static final long minimumDelta = 500; // the minimum time difference between access of the same domain
|
||||
/**
|
||||
* column length definition for the {@link plasmaURL#urlIndexFile} DB
|
||||
*/
|
||||
@ -432,7 +433,7 @@ public class plasmaCrawlNURL {
|
||||
private Entry pop(plasmaCrawlBalancer balancer) throws IOException {
|
||||
// this is a filo - pop
|
||||
if (balancer.size() > 0) {
|
||||
String hash = new String(balancer.get());
|
||||
String hash = balancer.get(minimumDelta);
|
||||
if (hash == null) throw new IOException("hash is null");
|
||||
Entry e = new Entry(hash);
|
||||
stackIndex.remove(e.hash);
|
||||
|
Reference in New Issue
Block a user