mirror of
https://github.com/yacy/yacy_search_server.git
synced 2025-07-14 07:56:08 -04:00
Speed up remove operations in rowCollections.
- Array element shifting during a remove is now done only when it is necessary to keep the order of a row collection.
- This speeds up the most expensive operation, "common word shrinking", by a factor of 500-1000 (in the worst cases we shifted more than 60 GB of data during this operation).
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4158 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
source/de/anomic/kelondro
@ -599,7 +599,6 @@ public class kelondroCollectionIndex {
|
||||
int newPartitionNumber;
|
||||
while ((newPartitionNumber = arrayIndex(oldcollection.size())) > maxPartitions) {
|
||||
kelondroRowSet newcollection = shrinkCollection(key, oldcollection, arrayCapacity(maxPartitions));
|
||||
saveCommons(key, oldcollection);
|
||||
oldcollection = newcollection;
|
||||
}
|
||||
|
||||
@ -714,7 +713,6 @@ public class kelondroCollectionIndex {
|
||||
int newPartitionNumber;
|
||||
while ((newPartitionNumber = arrayIndex(oldcollection.size())) > maxPartitions) {
|
||||
kelondroRowSet newcollection = shrinkCollection(key, oldcollection, arrayCapacity(maxPartitions));
|
||||
saveCommons(key, oldcollection);
|
||||
oldcollection = newcollection;
|
||||
}
|
||||
|
||||
@ -747,6 +745,7 @@ public class kelondroCollectionIndex {
|
||||
}
|
||||
|
||||
private kelondroRowSet shrinkCollection(byte[] key, kelondroRowSet collection, int targetSize) {
|
||||
//TODO Remove timing before release
|
||||
// removes entries from collection
|
||||
// the removed entries are stored in a 'commons' dump file
|
||||
|
||||
@ -754,23 +753,32 @@ public class kelondroCollectionIndex {
|
||||
int oldsize = collection.size();
|
||||
kelondroRowSet survival = new kelondroRowSet(collection.rowdef, 0);
|
||||
if (oldsize <= targetSize) return survival;
|
||||
long sadd1 = 0, srem1 = 0, sadd2 = 0, srem2 = 0, tot1 = 0, tot2 = 0;
|
||||
long t1 = 0, t2 = 0;
|
||||
|
||||
// delete some entries, which are bad rated
|
||||
Iterator i = collection.rows();
|
||||
kelondroRow.Entry entry;
|
||||
byte[] ref;
|
||||
t1 = System.currentTimeMillis();
|
||||
while (i.hasNext()) {
|
||||
entry = (kelondroRow.Entry) i.next();
|
||||
ref = entry.getColBytes(0);
|
||||
if ((ref.length == 12) && (yacyURL.probablyRootURL(new String(ref)))) {
|
||||
t2 = System.currentTimeMillis();
|
||||
survival.addUnique(entry);
|
||||
sadd1 += System.currentTimeMillis() - t2;
|
||||
t2 = System.currentTimeMillis();
|
||||
i.remove();
|
||||
srem1 += System.currentTimeMillis() - t2;
|
||||
}
|
||||
}
|
||||
int firstSurvival = survival.size();
|
||||
tot1 = System.currentTimeMillis() - t1;
|
||||
|
||||
// check if we shrinked enough
|
||||
Random rand = new Random(System.currentTimeMillis());
|
||||
t1 = System.currentTimeMillis();
|
||||
while (survival.size() > targetSize) {
|
||||
// now delete randomly more entries from the survival collection
|
||||
i = survival.rows();
|
||||
@ -778,13 +786,22 @@ public class kelondroCollectionIndex {
|
||||
entry = (kelondroRow.Entry) i.next();
|
||||
ref = entry.getColBytes(0);
|
||||
if (rand.nextInt() % 4 != 0) {
|
||||
t2 = System.currentTimeMillis();
|
||||
collection.addUnique(entry);
|
||||
sadd2 += System.currentTimeMillis() - t2;
|
||||
t2 = System.currentTimeMillis();
|
||||
i.remove();
|
||||
srem2 += System.currentTimeMillis() - t2;
|
||||
}
|
||||
}
|
||||
}
|
||||
tot2 = System.currentTimeMillis() - t1;
|
||||
|
||||
serverLog.logFine("kelondroCollectionIndex", "tot= "+tot1+'/'+tot2+" # add/rem(1)= "+sadd1+'/'+srem1+" # add/rem(2)= "+sadd2+'/'+srem2);
|
||||
serverLog.logInfo("kelondroCollectionIndex", "shrinked common word " + new String(key) + "; old size = " + oldsize + ", new size = " + collection.size() + ", maximum size = " + targetSize + ", survival size = " + survival.size() + ", first survival = " + firstSurvival);
|
||||
|
||||
//finally dump the removed entries to a file
|
||||
saveCommons(key, collection);
|
||||
return survival;
|
||||
}
|
||||
|
||||
|
@ -363,26 +363,39 @@ public class kelondroRowCollection {
|
||||
chunkcount += c.size();
|
||||
}
|
||||
|
||||
protected synchronized final void removeRow(int p) {
|
||||
/**
|
||||
* This method removes the entry at position p ensuring the order of the remaining
|
||||
* entries if specified by keepOrder.
|
||||
* Note: Keeping the order is expensive. If you want to remove more than one element in
|
||||
* a batch with this method, it'd be better to do the removes without order keeping and do
|
||||
* the sort after all the removes are done.
|
||||
*
|
||||
* @param p element at this position will be removed
|
||||
* @param keepOrder keep the order of remaining entries
|
||||
*/
|
||||
protected synchronized final void removeRow(int p, boolean keepOrder) {
|
||||
assert p >= 0 : "p = " + p;
|
||||
assert p < chunkcount : "p = " + p + ", chunkcount = " + chunkcount;
|
||||
assert chunkcount > 0 : "chunkcount = " + chunkcount;
|
||||
assert sortBound <= chunkcount : "sortBound = " + sortBound + ", chunkcount = " + chunkcount;
|
||||
if (p < sortBound) {
|
||||
// remove by shift
|
||||
System.arraycopy(
|
||||
chunkcache, (p + 1) * this.rowdef.objectsize(),
|
||||
if (keepOrder && (p < sortBound)) {
|
||||
// remove by shift (quite expensive for big collections)
|
||||
System.arraycopy(
|
||||
chunkcache, (p + 1) * this.rowdef.objectsize(),
|
||||
chunkcache, p * this.rowdef.objectsize(),
|
||||
(chunkcount - p - 1) * this.rowdef.objectsize());
|
||||
sortBound--;
|
||||
} else {
|
||||
// remove by copying the top-element to the remove position
|
||||
if (p != chunkcount - 1) {
|
||||
System.arraycopy(
|
||||
chunkcache, (chunkcount - 1) * this.rowdef.objectsize(),
|
||||
chunkcache, p * this.rowdef.objectsize(),
|
||||
this.rowdef.objectsize());
|
||||
}
|
||||
// remove by copying the top-element to the remove position
|
||||
if (p != chunkcount - 1) {
|
||||
System.arraycopy(
|
||||
chunkcache, (chunkcount - 1) * this.rowdef.objectsize(),
|
||||
chunkcache, p * this.rowdef.objectsize(),
|
||||
this.rowdef.objectsize());
|
||||
}
|
||||
// we moved the last element to the remove position: (p+1)st element
|
||||
// only the first p elements keep their order
|
||||
if (sortBound > p) sortBound = p;
|
||||
}
|
||||
chunkcount--;
|
||||
this.lastTimeWrote = System.currentTimeMillis();
|
||||
@ -414,6 +427,12 @@ public class kelondroRowCollection {
|
||||
return new rowIterator();
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterator for kelondroRowCollection.
|
||||
* It supports remove() though it doesn't preserve the order of the underlying
|
||||
* collection during removes.
|
||||
*
|
||||
*/
|
||||
public class rowIterator implements Iterator {
|
||||
|
||||
private int p;
|
||||
@ -432,7 +451,7 @@ public class kelondroRowCollection {
|
||||
|
||||
public void remove() {
|
||||
p--;
|
||||
removeRow(p);
|
||||
removeRow(p, false);
|
||||
}
|
||||
}
|
||||
|
||||
@ -562,7 +581,7 @@ public class kelondroRowCollection {
|
||||
//System.out.println("ENTRY0: " + serverLog.arrayList(chunkcache, rowdef.objectsize*i, rowdef.objectsize));
|
||||
//System.out.println("ENTRY1: " + serverLog.arrayList(chunkcache, rowdef.objectsize*(i+1), rowdef.objectsize));
|
||||
if (compare(i, i + 1) == 0) {
|
||||
removeRow(i); // this decreases the chunkcount
|
||||
removeRow(i, true); // this decreases the chunkcount
|
||||
} else {
|
||||
i++;
|
||||
}
|
||||
|
@ -124,7 +124,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
|
||||
if (index < 0) return null;
|
||||
//System.out.println("remove: chunk found at index position (before remove) " + index + ", inset=" + serverLog.arrayList(super.chunkcache, super.rowdef.objectsize() * index, length + 10) + ", searchkey=" + serverLog.arrayList(a, start, length));
|
||||
kelondroRow.Entry entry = super.get(index);
|
||||
super.removeRow(index);
|
||||
super.removeRow(index, false);
|
||||
//System.out.println("remove: chunk found at index position (after remove) " + index + ", inset=" + serverLog.arrayList(super.chunkcache, super.rowdef.objectsize() * index, length) + ", searchkey=" + serverLog.arrayList(a, start, length));
|
||||
int findagainindex = find(a, start, length);
|
||||
//System.out.println("kelondroRowSet.remove");
|
||||
|
Reference in New Issue
Block a user