mirror of https://github.com/yacy/yacy_search_server.git (synced 2025-07-17 08:26:08 -04:00)
- removed some usage of indexEntity
- changed the index collection process: indexes are no longer flushed to indexEntity first, but collected directly from the ram cache and the assortments

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1489 6c8d7289-2bf4-0310-a012-ef5d649a1542
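The change in one picture: a word's index entries used to be collected by opening a file-backed plasmaWordIndexEntity (which could throw IOException and had to be closed again), and are now read as a plasmaWordIndexEntryContainer directly from the ram cache and the assortments. A self-contained toy sketch of the new access pattern follows; the class and the map are stand-ins invented for illustration, not YaCy code:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Toy model of the new collection path: a word's entries come straight out of
// an in-memory structure, so there is no file handle to open, close, or leak.
public class ContainerSketch {
    // stand-in for the ram cache: word hash -> url hashes (hypothetical data)
    static Map<String, List<String>> ramCache = new HashMap<String, List<String>>();

    static List<String> getContainer(String wordHash) {
        List<String> container = ramCache.get(wordHash);
        return (container == null) ? new ArrayList<String>() : container;
    }

    public static void main(String[] args) {
        ramCache.put("wordA", Arrays.asList("urlHash1", "urlHash2"));
        for (String urlHash : getContainer("wordA")) {
            System.out.println(urlHash); // iterate entries; nothing to close afterwards
        }
    }
}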
@@ -3,7 +3,7 @@ javacSource=1.4
 javacTarget=1.4
 
 # Release Configuration
-releaseVersion=0.423
+releaseVersion=0.424
 releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 #releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}
@@ -60,7 +60,6 @@ import de.anomic.http.httpHeader;
 import de.anomic.plasma.plasmaCrawlLURL;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.plasma.plasmaURL;
-import de.anomic.plasma.plasmaWordIndexEntity;
 import de.anomic.plasma.plasmaWordIndexEntry;
 import de.anomic.plasma.plasmaWordIndexEntryContainer;
 import de.anomic.server.serverObjects;
@@ -150,22 +149,15 @@ public class IndexControl_p {
         if (post.containsKey("keyhashdeleteall")) {
             if (delurl || delurlref) {
                 // generate an urlx array
-                plasmaWordIndexEntity index = null;
-                try {
-                    index = switchboard.wordIndex.getEntity(keyhash, true, -1);
-                    Iterator en = index.elements(true);
-                    int i = 0;
-                    urlx = new String[index.size()];
-                    while (en.hasNext()) {
-                        urlx[i++] = ((plasmaWordIndexEntry) en.next()).getUrlHash();
-                    }
-                    index.close();
-                    index = null;
-                } catch (IOException e) {
-                    urlx = new String[0];
-                } finally {
-                    if (index != null) try { index.close(); } catch (Exception e) {}
-                }
+                plasmaWordIndexEntryContainer index = null;
+                index = switchboard.wordIndex.getContainer(keyhash, true, -1);
+                Iterator en = index.entries();
+                int i = 0;
+                urlx = new String[index.size()];
+                while (en.hasNext()) {
+                    urlx[i++] = ((plasmaWordIndexEntry) en.next()).getUrlHash();
+                }
+                index = null;
             }
             if (delurlref) {
                 for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true);
@@ -256,12 +248,12 @@ public class IndexControl_p {
             }
             prop.put("urlstring", "");
             prop.put("urlhash", "");
-            plasmaWordIndexEntryContainer[] indexes = new plasmaWordIndexEntryContainer[1];
+            plasmaWordIndexEntryContainer index;
             String result;
             long starttime = System.currentTimeMillis();
-            indexes[0] = switchboard.wordIndex.getContainer(keyhash, true, -1);
+            index = switchboard.wordIndex.getContainer(keyhash, true, -1);
             // built urlCache
-            Iterator urlIter = indexes[0].entries();
+            Iterator urlIter = index.entries();
             HashMap knownURLs = new HashMap();
             HashSet unknownURLEntries = new HashSet();
             plasmaWordIndexEntry indexEntry;
@@ -271,8 +263,8 @@ public class IndexControl_p {
                 try {
                     lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), null);
                     if (lurl.toString() == null) {
                         switchboard.urlPool.loadedURL.remove(indexEntry.getUrlHash());
-                        unknownURLEntries.add(indexEntry.getUrlHash());
+                        urlIter.remove();
                     } else {
                         knownURLs.put(indexEntry.getUrlHash(), lurl);
                     }
@@ -280,23 +272,17 @@ public class IndexControl_p {
                     unknownURLEntries.add(indexEntry.getUrlHash());
                 }
             }
-            // now delete all entries that have no url entry
-            Iterator hashIter = unknownURLEntries.iterator();
-            while (hashIter.hasNext()) {
-                indexes[0].remove((String) hashIter.next());
-            }
             // use whats remaining
             String gzipBody = switchboard.getConfig("indexControl.gzipBody","false");
             int timeout = (int) switchboard.getConfigLong("indexControl.timeout",60000);
-            result = yacyClient.transferIndex (
+            result = yacyClient.transferIndex(
                     yacyCore.seedDB.getConnected(post.get("hostHash", "")),
-                    indexes,
+                    new plasmaWordIndexEntryContainer[]{index},
                     knownURLs,
                     "true".equalsIgnoreCase(gzipBody),
                     timeout);
-            prop.put("result", (result == null) ? ("Successfully transferred " + indexes[0].size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
-            indexes[0] = null;
-            indexes = null;
+            prop.put("result", (result == null) ? ("Successfully transferred " + index.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
+            index = null;
         }
 
         // generate list
@@ -47,7 +47,6 @@
 // if the shell's current path is HTROOT
 
 import java.util.Date;
-import java.io.IOException;
 import de.anomic.http.httpHeader;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.server.serverObjects;
@@ -86,16 +85,7 @@ public final class query {
         if (obj.equals("rwiurlcount")) {
             // the total number of different urls in the rwi is returned
-            de.anomic.plasma.plasmaWordIndexEntity entity = null;
-            try {
-                entity = sb.wordIndex.getEntity(env, true, -1);
-                prop.put("response", entity.size());
-                entity.close();
-            } catch (IOException e) {
-                prop.put("response", -1);
-            } finally {
-                if (entity != null) try { entity.close(); } catch (Exception e) {}
-            }
+            // <env> shall contain a word hash, the number of assigned lurls to this hash is returned
+            prop.put("response", sb.wordIndex.indexSize(env));
             return prop;
         }
 
@@ -225,22 +225,16 @@ public class plasmaDbImporter extends Thread {
             Iterator importWordHashIterator = this.importWordIndex.wordHashes(wordChunkStartHash, true, true);
             while (!isAborted() && importWordHashIterator.hasNext()) {
 
-                plasmaWordIndexEntity importWordIdxEntity = null;
+                plasmaWordIndexEntryContainer newContainer;
                 try {
                     wordCounter++;
                     wordHash = (String) importWordHashIterator.next();
-                    importWordIdxEntity = importWordIndex.getEntity(wordHash, true, -1);
-
-                    if (importWordIdxEntity.size() == 0) {
-                        importWordIdxEntity.deleteComplete();
-                        continue;
-                    }
-
-                    // creating a container used to hold the imported entries
-                    plasmaWordIndexEntryContainer newContainer = new plasmaWordIndexEntryContainer(wordHash,importWordIdxEntity.size());
+                    newContainer = importWordIndex.getContainer(wordHash, true, -1);
+                    if (newContainer.size() == 0) continue;
 
                     // the combined container will fit, read the container
-                    Iterator importWordIdxEntries = importWordIdxEntity.elements(true);
+                    Iterator importWordIdxEntries = newContainer.entries();
                     plasmaWordIndexEntry importWordIdxEntry;
                     while (importWordIdxEntries.hasNext()) {
@@ -262,9 +256,6 @@ public class plasmaDbImporter extends Thread {
                         }
                     } catch (IOException e) {}
 
-                    // adding word index entity to container
-                    newContainer.add(importWordIdxEntry,System.currentTimeMillis());
-
                     if (entryCounter % 500 == 0) {
                         this.log.logFine(entryCounter + " word entries and " + wordCounter + " word entities processed so far.");
                     }
@@ -277,7 +268,6 @@ public class plasmaDbImporter extends Thread {
                     homeWordIndex.addEntries(newContainer, true);
 
                     // delete complete index entity file
-                    importWordIdxEntity.close();
                     importWordIndex.deleteIndex(wordHash);
 
                     // print out some statistical information
@@ -300,7 +290,6 @@ public class plasmaDbImporter extends Thread {
                 } catch (Exception e) {
                     log.logSevere("Import of word entity '" + wordHash + "' failed.",e);
                 } finally {
-                    if (importWordIdxEntity != null) try { importWordIdxEntity.close(); } catch (Exception e) {}
                 }
             }
 
@@ -551,8 +551,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                 getConfig("allowDistributeIndex", "false").equalsIgnoreCase("true"),
                 getConfig("allowDistributeIndexWhileCrawling","false").equalsIgnoreCase("true"),
                 getConfig("indexDistribution.gzipBody","false").equalsIgnoreCase("true"),
-                (int)getConfigLong("indexDistribution.timeout",60000),
-                (int)getConfigLong("indexDistribution.maxOpenFiles",800)
+                (int)getConfigLong("indexDistribution.timeout",60000) /*,
+                (int)getConfigLong("indexDistribution.maxOpenFiles",800)*/
         );
         indexDistribution.setCounts(150, 1, 3, 10000);
         deployThread("20_dhtdistribution", "DHT Distribution", "selection, transfer and deletion of index entries that are not searched on your peer, but on others", null,
@@ -1353,7 +1353,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                     words = condenser.RESULT_SIMI_WORDS;
 
                     // transfering the index to the storage peer
-                    String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntryContainer[])tmpContainers.toArray(new plasmaWordIndexEntity[tmpContainers.size()]),urlCache,true,120000);
+                    String error = yacyClient.transferIndex(
+                            seed,
+                            (plasmaWordIndexEntryContainer[])tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]),
+                            urlCache,
+                            true,
+                            120000);
 
                     if (error != null) {
                         words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()));
@@ -178,10 +178,14 @@ public final class plasmaWordIndex {
         return condenser.RESULT_SIMI_WORDS;
     }
 
+    public int indexSize(String wordHash) {
+        return ramCache.indexSize(wordHash);
+    }
+
     public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
         return ramCache.getContainer(wordHash, deleteIfEmpty, maxTime);
     }
 
     public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) {
         return ramCache.getEntity(wordHash, deleteIfEmpty, maxTime);
     }
@@ -172,6 +172,23 @@ public final class plasmaWordIndexAssortment {
         }
         return row2container(wordHash, row);
     }
 
+    public boolean contains(String wordHash) {
+        // gets a word index from assortment database
+        // and returns the content record
+        byte[][] row = null;
+        try {
+            row = assortments.get(wordHash.getBytes());
+            return (row != null);
+        } catch (IOException e) {
+            return false;
+        } catch (kelondroException e) {
+            log.logSevere("removeAssortment/kelondro-error: " + e.getMessage()
+                    + " - reset assortment-DB " + assortments.file(), e);
+            resetDatabase();
+            return false;
+        }
+    }
+
     public plasmaWordIndexEntryContainer get(String wordHash) {
         // gets a word index from assortment database
@@ -226,6 +226,14 @@ public final class plasmaWordIndexAssortmentCluster {
         return record;
     }
 
+    public int indexSize(String wordHash) {
+        int size = 0;
+        for (int i = 0; i < clusterCount; i++) {
+            if (assortments[i].contains(wordHash)) size += i + 1;
+        }
+        return size;
+    }
+
     public Iterator hashConjunction(String startWordHash, boolean up, boolean rot) {
         HashSet iterators = new HashSet();
         //if (rot) System.out.println("WARNING: kelondroMergeIterator does not work correctly when individual iterators rotate on their own!");
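The size arithmetic in the new indexSize above apparently relies on a property of the assortment cluster: assortment i stores only containers holding exactly i + 1 entries, which is why a hit for the word hash in assortment i contributes i + 1 to the total. A tiny self-contained check of that sum, with hypothetical lookup results:

// Hypothetical membership of one word hash in a 3-assortment cluster:
// present in assortment 0 (1 entry) and assortment 2 (3 entries) -> 4 total.
public class AssortmentSizeSketch {
    public static void main(String[] args) {
        boolean[] containsHash = {true, false, true};
        int size = 0;
        for (int i = 0; i < containsHash.length; i++) {
            if (containsHash[i]) size += i + 1;
        }
        System.out.println("estimated entries: " + size); // prints 4
    }
}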
@@ -258,6 +258,21 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
         return java.lang.Math.max(assortmentCluster.sizeTotal(), java.lang.Math.max(backend.size(), cache.size()));
     }
 
+    public int indexSize(String wordHash) {
+        int size = 0;
+        try {
+            plasmaWordIndexEntity entity = backend.getEntity(wordHash, true, -1);
+            if (entity != null) {
+                size += entity.size();
+                entity.close();
+            }
+        } catch (IOException e) {}
+        size += assortmentCluster.indexSize(wordHash);
+        TreeMap cacheIndex = (TreeMap) cache.get(wordHash);
+        if (cacheIndex != null) size += cacheIndex.size();
+        return size;
+    }
+
     public Iterator wordHashes(String startWordHash, boolean up) {
         // Old convention implies rot = true
         //return new rotatingWordHashes(startWordHash, up);
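Note how this indexSize sums over the three storage tiers (the file-backed entity, the assortment cluster, and the in-memory cache), since one word's entries can be spread across all of them at once. A minimal illustration of the aggregation, with made-up tier counts:

// Made-up tier sizes for one word hash; the reported index size is their sum.
public class TierSumSketch {
    public static void main(String[] args) {
        int backendSize = 12;    // hypothetical entries in the file-backed entity
        int assortmentSize = 4;  // hypothetical entries across the assortments
        int cacheSize = 3;       // hypothetical entries in the ram cache
        System.out.println("indexSize = " + (backendSize + assortmentSize + cacheSize)); // 19
    }
}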
@@ -47,7 +47,6 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Enumeration;
 import java.util.Iterator;
-import java.util.HashSet;
 import java.util.HashMap;
 import de.anomic.yacy.yacyCore;
 import de.anomic.yacy.yacySeed;
@@ -77,8 +76,6 @@ public final class plasmaWordIndexDistribution {
     private boolean closed;
     private boolean gzipBody4Distribution;
     private int timeout4Distribution;
-    private int maxOpenFiles4Distribution;
-
     public transferIndexThread transferIdxThread = null;
 
     public plasmaWordIndexDistribution(
@@ -88,8 +85,7 @@ public final class plasmaWordIndexDistribution {
             boolean enable,
             boolean enabledWhileCrawling,
             boolean gzipBody,
-            int timeout,
-            int maxOpenFiles
+            int timeout
     ) {
         this.urlPool = urlPool;
         this.wordIndex = wordIndex;
@@ -100,7 +96,6 @@ public final class plasmaWordIndexDistribution {
         setCounts(100 /*indexCount*/, 1 /*juniorPeerCount*/, 3 /*seniorPeerCount*/, 8000);
         this.gzipBody4Distribution = gzipBody;
         this.timeout4Distribution = timeout;
-        this.maxOpenFiles4Distribution = maxOpenFiles;
     }
 
     public void enable() {
@@ -201,9 +196,8 @@ public final class plasmaWordIndexDistribution {
         // collect index
         String startPointHash = selectTransferStart();
         log.logFine("Selected hash " + startPointHash + " as start point for index distribution, distance = " + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, startPointHash));
-        Object[] selectResult = selectTransferContainers(startPointHash, indexCount, this.maxOpenFiles4Distribution);
+        Object[] selectResult = selectTransferContainers(startPointHash, indexCount);
         plasmaWordIndexEntryContainer[] indexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0];
-        //Integer openedFiles = (Integer) selectResult[2];
         HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry
         if ((indexContainers == null) || (indexContainers.length == 0)) {
             log.logFine("No index available for index transfer, hash start-point " + startPointHash);
@@ -267,7 +261,12 @@ public final class plasmaWordIndexDistribution {
                     return -1; // interrupted
                 }
                 start = System.currentTimeMillis();
-                error = yacyClient.transferIndex(seeds[i], indexContainers, urlCache, this.gzipBody4Distribution, this.timeout4Distribution);
+                error = yacyClient.transferIndex(
+                        seeds[i],
+                        indexContainers,
+                        urlCache,
+                        this.gzipBody4Distribution,
+                        this.timeout4Distribution);
                 if (error == null) {
                     log.logInfo("Index transfer of " + indexCount + " words [" + indexContainers[0].wordHash() + " .. " + indexContainers[indexContainers.length - 1].wordHash() + "] to peer " + seeds[i].getName() + ":" + seeds[i].hash + " in " + ((System.currentTimeMillis() - start) / 1000)
                             + " seconds successfull (" + (1000 * indexCount / (System.currentTimeMillis() - start + 1)) + " words/s)");
@@ -285,18 +284,9 @@ public final class plasmaWordIndexDistribution {
             if (hc1 >= peerCount) {
                 // success
                 if (delete) {
-                    try {
-                        if (deleteTransferIndexes(indexContainers)) {
-                            log.logFine("Deleted all " + indexContainers.length + " transferred whole-word indexes locally");
-                            return indexCount;
-                        } else {
-                            log.logSevere("Deleted not all transferred whole-word indexes");
-                            return -1;
-                        }
-                    } catch (IOException ee) {
-                        log.logSevere("Deletion of indexes not possible:" + ee.getMessage(), ee);
-                        return -1;
-                    }
+                    int deletedURLs = deleteTransferIndexes(indexContainers);
+                    log.logFine("Deleted from " + indexContainers.length + " transferred RWIs locally, removed " + deletedURLs + " URL references");
+                    return indexCount;
                 } else {
                     // simply close the indexEntities
                     closeTransferIndexes(indexContainers);
@@ -323,86 +313,67 @@ public final class plasmaWordIndexDistribution {
         }
     }
 
     Object[] /* of {plasmaWordIndexEntryContainer[], HashMap(String, plasmaCrawlLURL.Entry)}*/
-    selectTransferContainers(String hash, int count, int maxOpenFiles) {
+    selectTransferContainers(String hash, int count) {
         // the hash is a start hash from where the indexes are picked
         ArrayList tmpContainers = new ArrayList(count);
         String nexthash = "";
         try {
-            int currOpenFiles = 0;
             Iterator wordHashIterator = this.wordIndex.wordHashes(hash, true, true);
-            plasmaWordIndexEntity indexEntity;
             plasmaWordIndexEntryContainer indexContainer;
             Iterator urlIter;
-            Iterator hashIter;
             plasmaWordIndexEntry indexEntry;
             plasmaCrawlLURL.Entry lurl;
-            final HashSet unknownURLEntries = new HashSet();
+            int notBoundCounter = 0;
             final HashMap knownURLs = new HashMap();
             while (
-                  (count > 0) &&
-                  (currOpenFiles < maxOpenFiles) &&
+                  (count > 0) &&
                   (wordHashIterator.hasNext()) &&
                   ((nexthash = (String) wordHashIterator.next()) != null) &&
                   (nexthash.trim().length() > 0) &&
-                  ((currOpenFiles == 0) ||
-                   (yacyDHTAction.dhtDistance(nexthash, ((plasmaWordIndexEntity)tmpContainers.get(0)).wordHash()) < 0.2))
+                  ((tmpContainers.size() == 0) ||
+                   (yacyDHTAction.dhtDistance(nexthash, ((plasmaWordIndexEntryContainer)tmpContainers.get(0)).wordHash()) < 0.2))
             ) {
-                indexEntity = this.wordIndex.getEntity(nexthash, true, -1);
-                if (indexEntity.size() == 0) {
-                    indexEntity.deleteComplete();
-                } else {
-                    // make an on-the-fly entity and insert values
-                    indexContainer = new plasmaWordIndexEntryContainer(indexEntity.wordHash());
+                // make an on-the-fly entity and insert values
+                indexContainer = this.wordIndex.getContainer(nexthash, true, 10000);
                 try {
-                    urlIter = indexEntity.elements(true);
-                    unknownURLEntries.clear();
+                    urlIter = indexContainer.entries();
                     // iterate over indexes to fetch url entries and store them in the urlCache
                     while ((urlIter.hasNext()) && (count > 0)) {
                         indexEntry = (plasmaWordIndexEntry) urlIter.next();
                         try {
                             lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), indexEntry);
-                            if ((lurl == null) || (lurl.url()==null)) {
-                                unknownURLEntries.add(indexEntry.getUrlHash());
+                            if ((lurl == null) || (lurl.url() == null)) {
+                                notBoundCounter++;
+                                urlIter.remove();
+                                this.wordIndex.removeEntries(nexthash, new String[]{indexEntry.getUrlHash()}, true);
                             } else {
                                 knownURLs.put(indexEntry.getUrlHash(), lurl);
-                                indexContainer.add(indexEntry);
                                 count--;
                             }
                         } catch (IOException e) {
-                            unknownURLEntries.add(indexEntry.getUrlHash());
+                            notBoundCounter++;
+                            urlIter.remove();
+                            this.wordIndex.removeEntries(nexthash, new String[]{indexEntry.getUrlHash()}, true);
                         }
                     }
-                    // now delete all entries that have no url entry
-                    hashIter = unknownURLEntries.iterator();
-                    while (hashIter.hasNext()) {
-                        String nextUrlHash = (String) hashIter.next();
-                        indexEntity.removeEntry(nextUrlHash, true);
-                        this.urlPool.loadedURL.remove(nextUrlHash);
+                    // remove all remaining; we have enough
+                    while (urlIter.hasNext()) {
+                        indexEntry = (plasmaWordIndexEntry) urlIter.next();
+                        urlIter.remove();
                     }
-                    // deleting entity if there are no more entries left
-                    // This could occure if there are unknownURLs in the entity
-                    if (indexEntity.size() == 0) {
-                        indexEntity.deleteComplete();
-                    }
-                    // use whats remaining
-                    this.log.logFine("Selected partial index (" + indexContainer.size() + " from " + indexEntity.size() +" URLs, " + unknownURLEntries.size() + " not bound) for word " + indexContainer.wordHash());
+                    // use whats left
+                    this.log.logFine("Selected partial index (" + indexContainer.size() + " from " + this.wordIndex.indexSize(nexthash) +" URLs, " + notBoundCounter + " not bound) for word " + indexContainer.wordHash());
                     tmpContainers.add(indexContainer);
                 } catch (kelondroException e) {
-                    this.log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + indexEntity.wordHash(), e);
-                    indexEntity.deleteComplete();
+                    this.log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + nexthash, e);
+                    this.wordIndex.deleteIndex(nexthash);
                 }
-                indexEntity.close(); // important: is not closed elswhere and cannot be deleted afterwards
-                indexEntity = null;
-                }
             }
             // transfer to array
             plasmaWordIndexEntryContainer[] entryContainers = (plasmaWordIndexEntryContainer[]) tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]);
-            return new Object[]{entryContainers, knownURLs, new Integer(currOpenFiles)};
-        } catch (IOException e) {
-            this.log.logSevere("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage(), e);
-            return new Object[]{new plasmaWordIndexEntity[0], new HashMap(0)};
+            return new Object[]{entryContainers, knownURLs};
         } catch (kelondroException e) {
             this.log.logSevere("selectTransferIndexes database corrupted: " + e.getMessage(), e);
             return new Object[]{new plasmaWordIndexEntity[0], new HashMap(0)};
@@ -443,13 +414,11 @@ public final class plasmaWordIndexDistribution {
         }
     }
 
-    boolean deleteTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) throws IOException {
+    int deleteTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) {
         Iterator urlIter;
         plasmaWordIndexEntry indexEntry;
-        plasmaWordIndexEntity indexEntity;
         String[] urlHashes;
-        int sz;
-        boolean success = true;
+        int count = 0;
         for (int i = 0; i < indexContainers.length; i++) {
             // delete entries separately
             int c = 0;
@@ -459,15 +428,11 @@ public final class plasmaWordIndexDistribution {
                 indexEntry = (plasmaWordIndexEntry) urlIter.next();
                 urlHashes[c++] = indexEntry.getUrlHash();
             }
-            wordIndex.removeEntries(indexContainers[i].wordHash(), urlHashes, true);
-            indexEntity = wordIndex.getEntity(indexContainers[i].wordHash(), true, -1);
-            sz = indexEntity.size();
-            // indexEntity.close();
-            closeTransferIndex(indexEntity);
-            log.logFine("Deleted partial index (" + c + " URLs) for word " + indexContainers[i].wordHash() + "; " + sz + " entries left");
+            count += wordIndex.removeEntries(indexContainers[i].wordHash(), urlHashes, true);
+            log.logFine("Deleted partial index (" + c + " URLs) for word " + indexContainers[i].wordHash() + "; " + this.wordIndex.indexSize(indexContainers[i].wordHash()) + " entries left");
             indexContainers[i] = null;
         }
-        return success;
+        return count;
     }
 
     /*
@@ -706,7 +671,6 @@ public final class plasmaWordIndexDistribution {
                 }
             }
         }
-
     }
 
     public class transferIndexThread extends Thread {
@@ -715,7 +679,6 @@ public final class plasmaWordIndexDistribution {
         private boolean finished = false;
         private boolean gzipBody4Transfer = false;
         private int timeout4Transfer = 60000;
-        private int maxOpenFiles4Transfer = 800;
        private int transferedEntryCount = 0;
        private int transferedEntityCount = 0;
        private String status = "Running";
@@ -734,7 +697,7 @@ public final class plasmaWordIndexDistribution {
             this.initialWordsDBSize = sb.wordIndex.size();
             this.gzipBody4Transfer = "true".equalsIgnoreCase(sb.getConfig("indexTransfer.gzipBody","false"));
             this.timeout4Transfer = (int) sb.getConfigLong("indexTransfer.timeout",60000);
-            this.maxOpenFiles4Transfer = (int) sb.getConfigLong("indexTransfer.maxOpenFiles",800);
+            //this.maxOpenFiles4Transfer = (int) sb.getConfigLong("indexTransfer.maxOpenFiles",800);
         }
 
         public void run() {
@@ -821,7 +784,6 @@ public final class plasmaWordIndexDistribution {
             */
             long selectionStart = System.currentTimeMillis(), selectionEnd = 0, selectionTime = 0, iteration = 0;
 
-            Integer openedFiles = new Integer(0);
             while (!finished && !Thread.currentThread().isInterrupted()) {
                 iteration++;
                 int idxCount = 0;
@@ -830,10 +792,9 @@ public final class plasmaWordIndexDistribution {
 
                     // selecting 500 words to transfer
                     this.status = "Running: Selecting chunk " + iteration;
-                    Object[] selectResult = selectTransferContainers(this.startPointHash, this.chunkSize, this.maxOpenFiles4Transfer - openedFiles.intValue());
+                    Object[] selectResult = selectTransferContainers(this.startPointHash, this.chunkSize);
                     newIndexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0];
                     HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry
-                    openedFiles = (Integer) selectResult[2];
 
                     /* If we havn't selected a word chunk this could be because of
                      * a) no words are left in the index
@@ -909,17 +870,10 @@ public final class plasmaWordIndexDistribution {
                     // deleting transfered words from index
                     if (delete) {
                         this.status = "Running: Deleting chunk " + iteration;
-                        try {
-                            if (deleteTransferIndexes(oldIndexContainers)) {
-                                plasmaWordIndexDistribution.this.log.logFine("Deleted all " + oldIndexContainers.length + " transferred whole-word indexes locally");
-                                transferedEntryCount += idxCount;
-                                transferedEntityCount += oldIndexContainers.length;
-                            } else {
-                                plasmaWordIndexDistribution.this.log.logSevere("Deleted not all transferred whole-word indexes");
-                            }
-                        } catch (IOException ee) {
-                            plasmaWordIndexDistribution.this.log.logSevere("Deletion of indexes not possible:" + ee.getMessage(), ee);
-                        }
+                        int urlReferences = deleteTransferIndexes(oldIndexContainers);
+                        plasmaWordIndexDistribution.this.log.logFine("Deleted from " + oldIndexContainers.length + " transferred RWIs locally " + urlReferences + " URL references");
+                        transferedEntryCount += idxCount;
+                        transferedEntityCount += oldIndexContainers.length;
                     } else {
                         this.closeContainers(oldIndexContainers);
                         transferedEntryCount += idxCount;
@@ -112,6 +112,7 @@ public final class plasmaWordIndexEntity {
     }
 
     public int size() {
+        if (theIndex == null) return 0;
         int size = theIndex.size();
         if ((size == 0) && (delete)) {
             deleteComplete();
@@ -842,14 +842,23 @@ public final class yacyClient {
             return null;
         }
     }
-    /*
-    public static byte[] singleGET(String host, int port, String path, int timeout,
-            String user, String password,
-            httpHeader requestHeader) throws IOException {
-    */
 
     public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntryContainer[] indexes, HashMap urlCache, boolean gzipBody, int timeout) {
 
+        // check if we got all necessary urls in the urlCache (only for debugging)
+        Iterator eenum;
+        plasmaWordIndexEntry entry;
+        for (int i = 0; i < indexes.length; i++) {
+            eenum = indexes[i].entries();
+            while (eenum.hasNext()) {
+                entry = (plasmaWordIndexEntry) eenum.next();
+                if (urlCache.get(entry.getUrlHash()) == null) {
+                    System.out.println("DEBUG transferIndex: to-send url hash '" + entry.getUrlHash() + "' is not contained in urlCache");
+                }
+            }
+        }
+
+        // transfer the RWI without the URLs
         HashMap in = transferRWI(targetSeed, indexes, gzipBody, timeout);
         if (in == null) { return "no_connection_1"; }
         String result = (String) in.get("result");
@@ -868,7 +877,9 @@ public final class yacyClient {
         plasmaCrawlLURL.Entry[] urls = new plasmaCrawlLURL.Entry[uhs.length];
         for (int i = 0; i < uhs.length; i++) {
             urls[i] = (plasmaCrawlLURL.Entry) urlCache.get(uhs[i]);
-            if (urls[i] == null) System.out.println("DEBUG transferIndex: error with requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'");
+            if (urls[i] == null) {
+                System.out.println("DEBUG transferIndex: requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'");
+            }
         }
 
         in = transferURL(targetSeed, urls, gzipBody, timeout);
@@ -53,6 +53,7 @@ import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.PrintWriter;
 import java.net.URL;
+import java.util.ConcurrentModificationException;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -432,11 +433,13 @@ public final class yacy {
         run.addShutdownHook(new shutdownHookThread(Thread.currentThread(), sb));
 
         // save information about available memory after all initializations
-        sb.setConfig("memoryFreeAfterInitBGC", Runtime.getRuntime().freeMemory());
-        sb.setConfig("memoryTotalAfterInitBGC", Runtime.getRuntime().totalMemory());
-        System.gc();
-        sb.setConfig("memoryFreeAfterInitAGC", Runtime.getRuntime().freeMemory());
-        sb.setConfig("memoryTotalAfterInitAGC", Runtime.getRuntime().totalMemory());
+        try {
+            sb.setConfig("memoryFreeAfterInitBGC", Runtime.getRuntime().freeMemory());
+            sb.setConfig("memoryTotalAfterInitBGC", Runtime.getRuntime().totalMemory());
+            System.gc();
+            sb.setConfig("memoryFreeAfterInitAGC", Runtime.getRuntime().freeMemory());
+            sb.setConfig("memoryTotalAfterInitAGC", Runtime.getRuntime().totalMemory());
+        } catch (ConcurrentModificationException e) {}
 
         // wait for server shutdown
         try {
@@ -834,22 +837,16 @@ public final class yacy {
                 // testing if import process was aborted
                 if (Thread.interrupted()) break;
 
-                plasmaWordIndexEntity importWordIdxEntity = null;
+                plasmaWordIndexEntryContainer newContainer;
                 try {
                     wordCounter++;
                     wordHash = (String) importWordHashIterator.next();
-                    importWordIdxEntity = importWordIndex.getEntity(wordHash, true, -1);
-
-                    if (importWordIdxEntity.size() == 0) {
-                        importWordIdxEntity.deleteComplete();
-                        continue;
-                    }
-
-                    // creating a container used to hold the imported entries
-                    plasmaWordIndexEntryContainer newContainer = new plasmaWordIndexEntryContainer(wordHash,importWordIdxEntity.size());
+                    newContainer = importWordIndex.getContainer(wordHash, true, -1);
+                    if (newContainer.size() == 0) continue;
 
                     // the combined container will fit, read the container
-                    Iterator importWordIdxEntries = importWordIdxEntity.elements(true);
+                    Iterator importWordIdxEntries = newContainer.entries();
                     plasmaWordIndexEntry importWordIdxEntry;
                     while (importWordIdxEntries.hasNext()) {
@@ -871,9 +868,6 @@ public final class yacy {
                         }
                     } catch (IOException e) {}
 
-                    // adding word index entity to container
-                    newContainer.add(importWordIdxEntry,System.currentTimeMillis());
-
                     if (entryCounter % 500 == 0) {
                         log.logFine(entryCounter + " word entries and " + wordCounter + " word entries processed so far.");
                     }
@@ -886,7 +880,6 @@ public final class yacy {
                     homeWordIndex.addEntries(newContainer, true);
 
                     // delete complete index entity file
-                    importWordIdxEntity.close();
                     importWordIndex.deleteIndex(wordHash);
 
                     // print out some statistical information
@@ -912,7 +905,6 @@ public final class yacy {
                 } catch (Exception e) {
                     log.logSevere("Import of word entity '" + wordHash + "' failed.",e);
                 } finally {
-                    if (importWordIdxEntity != null) try { importWordIdxEntity.close(); } catch (Exception e) {}
                 }
             }