mirror of
https://github.com/yacy/yacy_search_server.git
synced 2025-07-17 08:26:08 -04:00
introduced assortment structure (generalization of singletons)
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@139 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
source/de/anomic/plasma
@ -240,10 +240,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
||||
log.logSystem("Wiki Cache memory = " + ppRamString(ramWiki));
|
||||
|
||||
// make crawl profiles database and default profiles
|
||||
log.logSystem("Initializing Crawl Profiles");
|
||||
profiles = new plasmaCrawlProfile(new File(plasmaPath, "crawlProfiles0.db"));
|
||||
initProfiles();
|
||||
|
||||
// start indexing management
|
||||
log.logSystem("Starting Indexing Management");
|
||||
loadedURL = new plasmaCrawlLURL(new File(plasmaPath, "urlHash.db"), ramLURL);
|
||||
noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL);
|
||||
errorURL = new plasmaCrawlEURL(new File(plasmaPath, "urlErr0.db"), ramEURL);
|
||||
@ -253,19 +255,24 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
||||
searchManager = new plasmaSearch(loadedURL, wordIndex);
|
||||
|
||||
// start a cache manager
|
||||
log.logSystem("Starting HT Cache Manager");
|
||||
this.cacheManager = new plasmaHTCache(this, ramHTTP);
|
||||
|
||||
// make parser
|
||||
log.logSystem("Starting Parser");
|
||||
this.parser = new plasmaParser();
|
||||
|
||||
// define an extension-blacklist
|
||||
log.logSystem("Parser: Initializing Media Extensions");
|
||||
plasmaParser.initMediaExt(getConfig("mediaExt",null));
|
||||
|
||||
// define a realtime parsable mimetype list
|
||||
log.logSystem("Parser: Initializing Mime Types");
|
||||
plasmaParser.initRealtimeParsableMimeTypes(getConfig("parseableRealtimeMimeTypes","application/xhtml+xml,text/html,text/plain"));
|
||||
plasmaParser.initParseableMimeTypes(getConfig("parseableMimeTypes",null));
|
||||
|
||||
// start a loader
|
||||
log.logSystem("Starting Crawl Loader");
|
||||
int remoteport;
|
||||
try { remoteport = Integer.parseInt(getConfig("remoteProxyPort","3128")); }
|
||||
catch (NumberFormatException e) { remoteport = 3128; }
|
||||
@ -277,18 +284,23 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
||||
remoteport);
|
||||
|
||||
// init boards
|
||||
log.logSystem("Starting Message Board");
|
||||
messageDB = new messageBoard(new File(getRootPath(), "DATA/SETTINGS/message.db"), ramMessage);
|
||||
wikiDB = new wikiBoard(new File(getRootPath(), "DATA/SETTINGS/wiki.db"),
|
||||
log.logSystem("Starting Wiki Board");
|
||||
wikiDB = new wikiBoard(new File(getRootPath(), "DATA/SETTINGS/wiki.db"),
|
||||
new File(getRootPath(), "DATA/SETTINGS/wiki-bkp.db"), ramWiki);
|
||||
|
||||
// init cookie-Monitor
|
||||
log.logSystem("Starting Cookie Monitor");
|
||||
outgoingCookies = new HashMap();
|
||||
incomingCookies = new HashMap();
|
||||
|
||||
// clean up profiles
|
||||
log.logSystem("Cleaning Profiles");
|
||||
cleanProfiles();
|
||||
|
||||
// init facility DB
|
||||
log.logSystem("Starting Facility Database");
|
||||
File facilityDBpath = new File(getRootPath(), "DATA/SETTINGS/");
|
||||
facilityDB = new kelondroTables(facilityDBpath);
|
||||
facilityDB.declareMaps("backlinks", 250, 500, new String[] {"date"}, null);
|
||||
@ -299,10 +311,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
||||
testresult = facilityDB.selectLong("statistik", (new serverDate()).toShortString(false).substring(0, 11));
|
||||
|
||||
// start yacy core
|
||||
log.logSystem("Starting YaCy Protocol Core");
|
||||
yacyCore yc = new yacyCore(this);
|
||||
serverInstantThread.oneTimeJob(yc, "loadSeeds", yc.log, 3000);
|
||||
|
||||
// deploy threads
|
||||
log.logSystem("Starting Threads");
|
||||
deployThread("90_cleanup", "Cleanup", "simple cleaning process for monitoring information" ,
|
||||
new serverInstantThread(this, "cleanupJob", "cleanupJobSize"), 10000); // all 5 Minutes
|
||||
deployThread("80_dequeue", "Indexing Dequeue", "thread that creates database entries from scraped web content and performes indexing" ,
|
||||
|
250
source/de/anomic/plasma/plasmaWordIndexAssortment.java
Normal file
250
source/de/anomic/plasma/plasmaWordIndexAssortment.java
Normal file
@ -0,0 +1,250 @@
|
||||
// plasmaWordIndexAssortment.java
|
||||
// ------------------------------
|
||||
// part of YACY
|
||||
// (C) by Michael Peter Christen; mc@anomic.de
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2005
|
||||
// last major change: 18.5.2005
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
//
|
||||
// Using this software in any meaning (reading, learning, copying, compiling,
|
||||
// running) means that you agree that the Author(s) is (are) not responsible
|
||||
// for cost, loss of data or any harm that may be caused directly or indirectly
|
||||
// by usage of this softare or this documentation. The usage of this software
|
||||
// is on your own risk. The installation and usage (starting/running) of this
|
||||
// software may allow other people or application to access your computer and
|
||||
// any attached devices and is highly dependent on the configuration of the
|
||||
// software which must be done by the user of the software; the author(s) is
|
||||
// (are) also not responsible for proper configuration and usage of the
|
||||
// software, even if provoked by documentation provided together with
|
||||
// the software.
|
||||
//
|
||||
// Any changes to this file according to the GPL as documented in the file
|
||||
// gpl.txt aside this file in the shipment you received can be done to the
|
||||
// lines that follows this copyright notice here, but changes must not be
|
||||
// done inside the copyright notive above. A re-distribution must contain
|
||||
// the intact and unchanged copyright notice.
|
||||
// Contributions and changes to the program code must be marked as such.
|
||||
|
||||
/*
|
||||
An assortment is a set of words that appear exactly on a specific
|
||||
number of different web pages. A special case is when the word
|
||||
appears only on a single web page: this is called a 'singleton'.
|
||||
YaCy maintains a word cache for words appearing on x web pages.
|
||||
For each 'x' there is an assortment database, where 1<=x<=max
|
||||
If a word appears on more than 'max' web pages, the corresponding url-list
|
||||
is stored to some kind of back-end database which we consider as the
|
||||
'slowest' option to save data.
|
||||
*/
|
||||
|
||||
package de.anomic.plasma;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
import java.lang.RuntimeException;
|
||||
import de.anomic.kelondro.*;
|
||||
import de.anomic.server.serverLog;
|
||||
|
||||
public final class plasmaWordIndexAssortment {
|
||||
|
||||
// environment constants
|
||||
private static final String assortmentFileName = "indexAssortment";
|
||||
public static final int[] bufferStructureBasis = new int[]{
|
||||
plasmaWordIndexEntry.wordHashLength, // a wordHash
|
||||
4, // occurrence counter
|
||||
8, // timestamp of last access
|
||||
plasmaWordIndexEntry.urlHashLength, // corresponding URL hash
|
||||
plasmaWordIndexEntry.attrSpaceLong // URL attributes
|
||||
};
|
||||
|
||||
// class variables
|
||||
private File assortmentFile;
|
||||
private int assortmentCapacity;
|
||||
private serverLog log;
|
||||
private kelondroTree assortments;
|
||||
private long bufferSize;
|
||||
private int bufferStructureLength;
|
||||
|
||||
private static String intx(int x) {
|
||||
String s = "" + x;
|
||||
while (s.length() < 3) s = "0" + s;
|
||||
return s;
|
||||
}
|
||||
|
||||
private static int[] bufferStructure(int assortmentCapacity) {
|
||||
int[] structure = new int[3 + 2 * assortmentCapacity];
|
||||
structure[0] = bufferStructureBasis[0];
|
||||
structure[1] = bufferStructureBasis[1];
|
||||
structure[2] = bufferStructureBasis[2];
|
||||
for (int i = 0; i < assortmentCapacity; i++) {
|
||||
structure[3 + 2 * i] = bufferStructureBasis[3];
|
||||
structure[4 + 2 * i] = bufferStructureBasis[4];
|
||||
}
|
||||
return structure;
|
||||
}
|
||||
|
||||
public plasmaWordIndexAssortment(File storagePath, int assortmentCapacity, int bufferkb, serverLog log) {
|
||||
if (!(storagePath.exists())) storagePath.mkdirs();
|
||||
this.assortmentFile = new File(storagePath, assortmentFileName + intx(assortmentCapacity) + ".db");
|
||||
this.assortmentCapacity = assortmentCapacity;
|
||||
this.bufferStructureLength = 3 + 2 * assortmentCapacity;
|
||||
this.bufferSize = bufferkb * 1024;
|
||||
this.log = log;
|
||||
if (assortmentFile.exists()) {
|
||||
// open existing singeton tree file
|
||||
try {
|
||||
assortments = new kelondroTree(assortmentFile, bufferSize);
|
||||
log.logSystem("Opened Assortment Database, " + assortments.size() + " entries.");
|
||||
} catch (IOException e){
|
||||
log.logError("unable to open assortment database: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
} else {
|
||||
// create new sigleton tree file
|
||||
try {
|
||||
assortments = new kelondroTree(assortmentFile, bufferSize, bufferStructure(assortmentCapacity));
|
||||
log.logSystem("Created new Assortment Database");
|
||||
} catch (IOException e){
|
||||
log.logError("unable to create assortment database: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public record newRecord(plasmaWordIndexEntry entry, long creationTime) {
|
||||
return new record(new plasmaWordIndexEntry[]{entry}, creationTime);
|
||||
}
|
||||
|
||||
public record newRecord(plasmaWordIndexEntry[] entries, long creationTime) {
|
||||
return new record(entries, creationTime);
|
||||
}
|
||||
|
||||
public class record {
|
||||
public plasmaWordIndexEntry[] entries;
|
||||
public long creationTime;
|
||||
public record(plasmaWordIndexEntry[] entries, long creationTime) {
|
||||
this.entries = entries;
|
||||
this.creationTime = creationTime;
|
||||
}
|
||||
}
|
||||
|
||||
public void store(String wordHash, record newRecord) {
|
||||
// stores a word index to assortment database
|
||||
// this throws an exception if the word hash already existed
|
||||
//log.logDebug("storeAssortment: wordHash=" + wordHash + ", urlHash=" + entry.getUrlHash() + ", time=" + creationTime);
|
||||
byte[][] row = new byte[this.bufferStructureLength][];
|
||||
row[0] = wordHash.getBytes();
|
||||
row[1] = kelondroRecords.long2bytes(1, 4);
|
||||
row[2] = kelondroRecords.long2bytes(newRecord.creationTime, 8);
|
||||
for (int i = 0; i < assortmentCapacity; i++) {
|
||||
row[3 + 2 * i] = newRecord.entries[i].getUrlHash().getBytes();
|
||||
row[4 + 2 * i] = newRecord.entries[i].toEncodedForm(true).getBytes();
|
||||
}
|
||||
byte[][] oldrow = null;
|
||||
try {
|
||||
oldrow = assortments.put(row);
|
||||
} catch (IOException e) {
|
||||
log.logFailure("storeAssortment/IO-error: " + e.getMessage() + " - reset assortment-DB");
|
||||
e.printStackTrace();
|
||||
resetDatabase();
|
||||
} catch (kelondroException e) {
|
||||
log.logFailure("storeAssortment/kelondro-error: " + e.getMessage() + " - reset assortment-DB");
|
||||
e.printStackTrace();
|
||||
resetDatabase();
|
||||
}
|
||||
if (oldrow != null) throw new RuntimeException("Store to assortment ambiguous");
|
||||
}
|
||||
|
||||
public record read(String wordHash) {
|
||||
// returns a single word index from assortment database; returns null if index does not exist
|
||||
//log.logDebug("readAssortment: wordHash=" + wordHash);
|
||||
byte[][] row = null;
|
||||
try {
|
||||
row = assortments.get(wordHash.getBytes());
|
||||
} catch (IOException e) {
|
||||
log.logFailure("readAssortment/IO-error: " + e.getMessage() + " - reset assortment-DB");
|
||||
e.printStackTrace();
|
||||
resetDatabase();
|
||||
} catch (kelondroException e) {
|
||||
log.logFailure("readAssortment/kelondro-error: " + e.getMessage() + " - reset assortment-DB");
|
||||
e.printStackTrace();
|
||||
resetDatabase();
|
||||
}
|
||||
if (row == null) return null;
|
||||
long creationTime = kelondroRecords.bytes2long(row[2]);
|
||||
plasmaWordIndexEntry[] wordEntries = new plasmaWordIndexEntry[this.bufferStructureLength];
|
||||
for (int i = 0; i < assortmentCapacity; i++) {
|
||||
wordEntries[i] = new plasmaWordIndexEntry(new String(row[3 + 2 * i]), new String(row[4 + 2 * i]));
|
||||
}
|
||||
return new record(wordEntries, creationTime);
|
||||
}
|
||||
|
||||
public void remove(String wordHash) {
|
||||
// deletes a word index from assortment database
|
||||
//log.logDebug("removeAssortment: wordHash=" + wordHash);
|
||||
byte[][] row = null;
|
||||
try {
|
||||
row = assortments.remove(wordHash.getBytes());
|
||||
} catch (IOException e) {
|
||||
log.logFailure("removeAssortment/IO-error: " + e.getMessage() + " - reset assortment-DB");
|
||||
e.printStackTrace();
|
||||
resetDatabase();
|
||||
} catch (kelondroException e) {
|
||||
log.logFailure("removeAssortment/kelondro-error: " + e.getMessage() + " - reset assortment-DB");
|
||||
e.printStackTrace();
|
||||
resetDatabase();
|
||||
}
|
||||
}
|
||||
|
||||
private void resetDatabase() {
|
||||
// deletes the assortment database and creates a new one
|
||||
try {
|
||||
assortments.close();
|
||||
} catch (IOException e) {}
|
||||
if (!(assortmentFile.delete())) throw new RuntimeException("cannot delete assortment database");
|
||||
try {
|
||||
assortments = new kelondroTree(assortmentFile, bufferSize, bufferStructure(assortmentCapacity));
|
||||
} catch (IOException e){
|
||||
log.logError("unable to re-create assortment database: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
public Iterator hashes(String startWordHash, boolean up, boolean rot) {
|
||||
try {
|
||||
return assortments.keys(up, rot, startWordHash.getBytes());
|
||||
} catch (IOException e) {
|
||||
log.logFailure("iterateAssortment/IO-error: " + e.getMessage() + " - reset assortment-DB");
|
||||
e.printStackTrace();
|
||||
resetDatabase();
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return assortments.size();
|
||||
}
|
||||
|
||||
public void close() {
|
||||
try {
|
||||
assortments.close();
|
||||
} catch (IOException e){
|
||||
log.logError("unable to close assortment database: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -53,14 +53,8 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
|
||||
|
||||
// environment constants
|
||||
private static final String indexDumpFileName = "indexDump0.stack";
|
||||
private static final String singletonFileName = "indexSingletons0.db";
|
||||
private static final int[] bufferStructure = new int[]{
|
||||
plasmaWordIndexEntry.wordHashLength, // a wordHash
|
||||
4, // occurrence counter
|
||||
8, // timestamp of last access
|
||||
plasmaWordIndexEntry.urlHashLength, // corresponding URL hash
|
||||
plasmaWordIndexEntry.attrSpaceLong // URL attributes
|
||||
};
|
||||
private static final String oldSingletonFileName = "indexSingletons0.db";
|
||||
private static final String newSingletonFileName = "indexAssortment001.db";
|
||||
|
||||
// class variables
|
||||
private File databaseRoot;
|
||||
@ -70,8 +64,8 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
|
||||
private HashMap hashDate;
|
||||
private int maxWords;
|
||||
private serverLog log;
|
||||
private kelondroTree singletons;
|
||||
private long singletonBufferSize;
|
||||
private plasmaWordIndexAssortment singletons;
|
||||
private int singletonBufferSize; //kb
|
||||
|
||||
// calculated constants
|
||||
private static String minKey, maxKey;
|
||||
@ -83,36 +77,23 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
|
||||
}
|
||||
|
||||
public plasmaWordIndexCache(File databaseRoot, plasmaWordIndexInterface backend, int singletonbufferkb, serverLog log) {
|
||||
// migrate
|
||||
File oldSingletonFile = new File(databaseRoot, oldSingletonFileName);
|
||||
File newSingletonFile = new File(databaseRoot, newSingletonFileName);
|
||||
if ((oldSingletonFile.exists()) && (!(newSingletonFile.exists()))) oldSingletonFile.renameTo(newSingletonFile);
|
||||
|
||||
// creates a new index cache
|
||||
// the cache has a back-end where indexes that do not fit in the cache are flushed
|
||||
this.databaseRoot = databaseRoot;
|
||||
this.singletonBufferSize = singletonbufferkb * 1024;
|
||||
this.singletonBufferSize = singletonbufferkb;
|
||||
this.cache = new TreeMap();
|
||||
this.hashScore = new kelondroMScoreCluster();
|
||||
this.hashDate = new HashMap();
|
||||
this.maxWords = 10000;
|
||||
this.backend = backend;
|
||||
this.log = log;
|
||||
File singletonFile = new File(databaseRoot, singletonFileName);
|
||||
if (singletonFile.exists()) {
|
||||
// open existing singeton tree file
|
||||
try {
|
||||
singletons = new kelondroTree(singletonFile, singletonBufferSize);
|
||||
log.logSystem("Opened Singleton Database, " + singletons.size() + " entries.");
|
||||
} catch (IOException e){
|
||||
log.logError("unable to open singleton database: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
} else {
|
||||
// create new sigleton tree file
|
||||
try {
|
||||
singletons = new kelondroTree(singletonFile, singletonBufferSize, bufferStructure);
|
||||
log.logSystem("Created new Singleton Database");
|
||||
} catch (IOException e){
|
||||
log.logError("unable to create singleton database: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
this.singletons = new plasmaWordIndexAssortment(databaseRoot, 1, singletonBufferSize, log);
|
||||
|
||||
// read in dump of last session
|
||||
try {
|
||||
restore();
|
||||
@ -126,7 +107,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
|
||||
log.logSystem("creating dump for index cache, " + cache.size() + " words (and much more urls)");
|
||||
File indexDumpFile = new File(databaseRoot, indexDumpFileName);
|
||||
if (indexDumpFile.exists()) indexDumpFile.delete();
|
||||
kelondroStack dumpStack = new kelondroStack(indexDumpFile, 0, bufferStructure);
|
||||
kelondroStack dumpStack = new kelondroStack(indexDumpFile, 1024, plasmaWordIndexAssortment.bufferStructureBasis);
|
||||
long startTime = System.currentTimeMillis();
|
||||
long messageTime = System.currentTimeMillis() + 5000;
|
||||
long wordsPerSecond = 0, wordcount = 0, urlcount = 0;
|
||||
@ -179,7 +160,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
|
||||
private long restore() throws IOException {
|
||||
File indexDumpFile = new File(databaseRoot, indexDumpFileName);
|
||||
if (!(indexDumpFile.exists())) return 0;
|
||||
kelondroStack dumpStack = new kelondroStack(indexDumpFile, 0);
|
||||
kelondroStack dumpStack = new kelondroStack(indexDumpFile, 1024);
|
||||
log.logSystem("restore dump of index cache, " + dumpStack.size() + " word/url relations");
|
||||
long startTime = System.currentTimeMillis();
|
||||
long messageTime = System.currentTimeMillis() + 5000;
|
||||
@ -217,97 +198,6 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
|
||||
return urlCount;
|
||||
}
|
||||
|
||||
// singleton access methods
|
||||
|
||||
private void storeSingleton(String wordHash, plasmaWordIndexEntry entry, long creationTime) {
|
||||
// stores a word index to singleton database
|
||||
// this throws an exception if the word hash already existed
|
||||
//log.logDebug("storeSingleton: wordHash=" + wordHash + ", urlHash=" + entry.getUrlHash() + ", time=" + creationTime);
|
||||
byte[][] row = new byte[5][];
|
||||
row[0] = wordHash.getBytes();
|
||||
row[1] = kelondroRecords.long2bytes(1, 4);
|
||||
row[2] = kelondroRecords.long2bytes(creationTime, 8);
|
||||
row[3] = entry.getUrlHash().getBytes();
|
||||
row[4] = entry.toEncodedForm(true).getBytes();
|
||||
byte[][] oldrow = null;
|
||||
try {
|
||||
oldrow = singletons.put(row);
|
||||
} catch (IOException e) {
|
||||
log.logFailure("storeSingleton/IO-error: " + e.getMessage() + " - reset singleton-DB");
|
||||
e.printStackTrace();
|
||||
resetSingletonDatabase();
|
||||
} catch (kelondroException e) {
|
||||
log.logFailure("storeSingleton/kelondro-error: " + e.getMessage() + " - reset singleton-DB");
|
||||
e.printStackTrace();
|
||||
resetSingletonDatabase();
|
||||
}
|
||||
if (oldrow != null) throw new RuntimeException("Store to singleton ambiguous");
|
||||
}
|
||||
|
||||
public Object[] /*{plasmaWordIndexEntry, Long(creationTime)}*/ readSingleton(String wordHash) {
|
||||
// returns a single word index from singleton database; returns null if index does not exist
|
||||
//log.logDebug("readSingleton: wordHash=" + wordHash);
|
||||
byte[][] row = null;
|
||||
try {
|
||||
row = singletons.get(wordHash.getBytes());
|
||||
} catch (IOException e) {
|
||||
log.logFailure("readSingleton/IO-error: " + e.getMessage() + " - reset singleton-DB");
|
||||
e.printStackTrace();
|
||||
resetSingletonDatabase();
|
||||
} catch (kelondroException e) {
|
||||
log.logFailure("readSingleton/kelondro-error: " + e.getMessage() + " - reset singleton-DB");
|
||||
e.printStackTrace();
|
||||
resetSingletonDatabase();
|
||||
}
|
||||
if (row == null) return null;
|
||||
long creationTime = kelondroRecords.bytes2long(row[2]);
|
||||
plasmaWordIndexEntry wordEntry = new plasmaWordIndexEntry(new String(row[3]), new String(row[4]));
|
||||
return new Object[]{wordEntry, new Long(creationTime)};
|
||||
}
|
||||
|
||||
private void removeSingleton(String wordHash) {
|
||||
// deletes a word index from singleton database
|
||||
//log.logDebug("removeSingleton: wordHash=" + wordHash);
|
||||
byte[][] row = null;
|
||||
try {
|
||||
row = singletons.remove(wordHash.getBytes());
|
||||
} catch (IOException e) {
|
||||
log.logFailure("removeSingleton/IO-error: " + e.getMessage() + " - reset singleton-DB");
|
||||
e.printStackTrace();
|
||||
resetSingletonDatabase();
|
||||
} catch (kelondroException e) {
|
||||
log.logFailure("removeSingleton/kelondro-error: " + e.getMessage() + " - reset singleton-DB");
|
||||
e.printStackTrace();
|
||||
resetSingletonDatabase();
|
||||
}
|
||||
}
|
||||
|
||||
private void resetSingletonDatabase() {
|
||||
// deletes the singleton database and creates a new one
|
||||
try {
|
||||
singletons.close();
|
||||
} catch (IOException e) {}
|
||||
File singletonFile = new File(databaseRoot, singletonFileName);
|
||||
if (!(singletonFile.delete())) throw new RuntimeException("cannot delete singleton database");
|
||||
try {
|
||||
singletons = new kelondroTree(singletonFile, singletonBufferSize, bufferStructure);
|
||||
} catch (IOException e){
|
||||
log.logError("unable to re-create singleton database: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
public Iterator singletonHashes(String startWordHash, boolean up, boolean rot) {
|
||||
try {
|
||||
return singletons.keys(up, rot, startWordHash.getBytes());
|
||||
} catch (IOException e) {
|
||||
log.logFailure("iterateSingleton/IO-error: " + e.getMessage() + " - reset singleton-DB");
|
||||
e.printStackTrace();
|
||||
resetSingletonDatabase();
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// cache settings
|
||||
|
||||
public int maxURLinWordCache() {
|
||||
@ -318,14 +208,14 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
|
||||
return cache.size();
|
||||
}
|
||||
|
||||
public int singletonsSize() {
|
||||
return singletons.size();
|
||||
}
|
||||
|
||||
public void setMaxWords(int maxWords) {
|
||||
this.maxWords = maxWords;
|
||||
}
|
||||
|
||||
public int singletonsSize() {
|
||||
return singletons.size();
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return java.lang.Math.max(singletons.size(), java.lang.Math.max(backend.size(), cache.size()));
|
||||
}
|
||||
@ -339,7 +229,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
|
||||
return new kelondroMergeIterator(
|
||||
new kelondroMergeIterator(
|
||||
cache.keySet().iterator(),
|
||||
singletonHashes(startWordHash, true, false),
|
||||
singletons.hashes(startWordHash, true, false),
|
||||
true),
|
||||
backend.wordHashes(startWordHash, true),
|
||||
true);
|
||||
@ -366,13 +256,14 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
|
||||
hashScore.deleteScore(key);
|
||||
hashDate.remove(key);
|
||||
}
|
||||
|
||||
// now decide where to flush that container
|
||||
Object[] singleton = readSingleton(key);
|
||||
plasmaWordIndexAssortment.record singleton = singletons.read(key);
|
||||
if (singleton == null) {
|
||||
// not found in singletons
|
||||
if (container.size() == 1) {
|
||||
// it is a singleton: store to singleton
|
||||
storeSingleton(key, container.getOne(), time);
|
||||
singletons.store(key, singletons.newRecord(container.getOne(), time));
|
||||
return 1;
|
||||
} else {
|
||||
// store to back-end; this should be a rare case
|
||||
@ -380,8 +271,8 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
|
||||
}
|
||||
} else {
|
||||
// we have a singleton and need to integrate this in the flush
|
||||
plasmaWordIndexEntry oldEntry = (plasmaWordIndexEntry) singleton[0];
|
||||
long oldTime = ((Long) singleton[1]).longValue();
|
||||
plasmaWordIndexEntry oldEntry = singleton.entries[0];
|
||||
long oldTime = singleton.creationTime;
|
||||
if (container.contains(oldEntry.getUrlHash())) {
|
||||
// we have an double-occurrence
|
||||
if (container.size() == 1) {
|
||||
@ -389,13 +280,13 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
|
||||
return 0;
|
||||
} else {
|
||||
// we flush to the backend, and the entry from the singletons
|
||||
removeSingleton(key);
|
||||
singletons.remove(key);
|
||||
return backend.addEntries(container, java.lang.Math.max(time, oldTime));
|
||||
}
|
||||
} else {
|
||||
// now we have more than one entry
|
||||
// we must remove the key from the singleton database
|
||||
removeSingleton(key);
|
||||
singletons.remove(key);
|
||||
// .. and put it to the container
|
||||
container.add(oldEntry);
|
||||
if (reintegrate) {
|
||||
@ -416,15 +307,15 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
|
||||
|
||||
private boolean flushFromSingleton(String key) {
|
||||
// this should only be called if the singleton shall be deleted or returned in an index entity
|
||||
Object[] singleton = readSingleton(key);
|
||||
plasmaWordIndexAssortment.record singleton = singletons.read(key);
|
||||
if (singleton == null) {
|
||||
return false;
|
||||
} else {
|
||||
// we have a singleton
|
||||
plasmaWordIndexEntry entry = (plasmaWordIndexEntry) singleton[0];
|
||||
long time = ((Long) singleton[1]).longValue();
|
||||
plasmaWordIndexEntry entry = (plasmaWordIndexEntry) singleton.entries[0];
|
||||
long time = singleton.creationTime;
|
||||
// remove it from the singleton database
|
||||
removeSingleton(key);
|
||||
singletons.remove(key);
|
||||
// integrate it to the backend
|
||||
return backend.addEntries(plasmaWordIndexEntryContainer.instantContainer(key, entry), time) > 0;
|
||||
}
|
||||
@ -518,7 +409,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
|
||||
hashScore.deleteScore(wordHash);
|
||||
hashDate.remove(wordHash);
|
||||
}
|
||||
removeSingleton(wordHash);
|
||||
singletons.remove(wordHash);
|
||||
backend.deleteIndex(wordHash);
|
||||
}
|
||||
|
||||
@ -561,12 +452,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
|
||||
}
|
||||
|
||||
public void close(int waitingSeconds) {
|
||||
try {
|
||||
singletons.close();
|
||||
} catch (IOException e){
|
||||
log.logError("unable to close singleton database: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
singletons.close();
|
||||
try {
|
||||
dump(waitingSeconds);
|
||||
} catch (IOException e){
|
||||
|
Reference in New Issue
Block a user