- more abstraction for the RWI index as preparation for Solr integration
- added options in the search index to switch parts of the index on or off
Changed files:
htroot/
    Bookmarks.java, CrawlResults.java, IndexControlRWIs_p.html, IndexControlRWIs_p.java,
    IndexControlURLs_p.java, ViewFile.java, Vocabulary_p.java
    api/
    yacy/
    yacysearch.java
source/
    de/anomic/
    net/yacy/
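
In short, the diff replaces URIMetadataRow with the URIMetadata interface at call sites, adds three switches (useCitationIndex, useRWI, useMetadata) to the Segment constructor, moves the url database wiring of MetadataRepository into connectUrlDb(...), and splits the old SolrConnector.get(...) into get(id) for a single document and query(querystring, offset, count) for result lists. The following minimal sketch is not part of the commit; it only illustrates how the changed signatures are called. The cache and file size values are placeholders, and the SolrConnector import path is an assumption.

    import java.io.File;
    import java.io.IOException;
    import org.apache.solr.common.SolrDocument;
    import org.apache.solr.common.SolrDocumentList;
    import net.yacy.cora.services.federated.solr.SolrConnector; // assumed package path
    import net.yacy.kelondro.logging.Log;
    import net.yacy.search.index.Segment;

    class IndexAbstractionSketch {

        // The trailing boolean flags are the options added in this commit;
        // the cache size and file size below are placeholder values.
        Segment openSegment(final File segmentPath) throws IOException {
            return new Segment(
                    new Log("SKETCH"), segmentPath,
                    10000,              // entityCacheMaxSize (placeholder)
                    64L * 1024 * 1024,  // maxFileSize (placeholder)
                    false,              // useTailCache
                    false,              // exceed134217727
                    true,               // connectLocalSolr
                    true,               // useCitationIndex
                    true,               // useRWI
                    true);              // useMetadata
        }

        // get(id) returns one document or null; query(...) returns a result list.
        boolean knownToSolr(final SolrConnector connector, final String id) throws IOException {
            final SolrDocument doc = connector.get(id);
            final SolrDocumentList all = connector.query("*:*", 0, 1);
            return doc != null && all.getNumFound() > 0;
        }
    }
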
@@ -46,7 +46,7 @@ import net.yacy.cora.services.federated.yacy.CacheStrategy;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.meta.URIMetadata;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.peers.NewsPool;
 import net.yacy.search.Switchboard;
@@ -194,7 +194,7 @@ public class Bookmarks {
 final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.getBookmark(urlHash);
 if (bookmark == null) {
 // try to get the bookmark from the LURL database
-final URIMetadataRow urlentry = sb.index.urlMetadata().load(ASCII.getBytes(urlHash));
+final URIMetadata urlentry = sb.index.urlMetadata().load(ASCII.getBytes(urlHash));
 if (urlentry != null) try {
 final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null));
 prop.put("mode_edit", "0"); // create mode
@@ -35,7 +35,7 @@ import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.meta.URIMetadata;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.peers.Seed;
 import net.yacy.search.Switchboard;
@@ -178,7 +178,7 @@ public class CrawlResults {
 boolean dark = true;
 String urlstr, urltxt;
 Seed initiatorSeed, executorSeed;
-URIMetadataRow urle;
+URIMetadata urle;
 
 int cnt = 0;
 final Iterator<Map.Entry<String, InitExecEntry>> i = ResultURLs.results(tabletype);
@@ -33,10 +33,10 @@
 <dt class="TableCellDark">Index Deletion</dt>
 <dd><input type="checkbox" name="deleteIndex" id="deleteIndex"
 onclick="x=document.getElementById('deleteIndex').checked;document.getElementById('deleteTriplestore').checked=x;document.getElementById('deleteRobots').checked=x;document.getElementById('deleteRobots').checked=x;document.getElementById('deleteCrawlQueues').checked=x;c='disabled';document.getElementById('deleteSearchFl').checked=x;if(x){c='';}document.getElementById('deleteTriplestore').disabled=c;document.getElementById('deletecomplete').disabled=c;document.getElementById('deleteCache').disabled=c;document.getElementById('deleteRobots').disabled=c;document.getElementById('deleteCrawlQueues').disabled=c;document.getElementById('deleteSearchFl').disabled=c;"
-/><label for="deleteIndex">Delete Search Index</label><br/>
-#(solr)#::<input type="checkbox" name="deleteSolr" id="deleteSolr"
-onclick="x=document.getElementById('deleteSolr').checked;document.getElementById('deleteRobots').checked=x;document.getElementById('deleteCrawlQueues').checked=x;c='disabled';document.getElementById('deleteSearchFl').checked=x;if(x){c='';}document.getElementById('deletecomplete').disabled=c;document.getElementById('deleteCache').disabled=c;document.getElementById('deleteRobots').disabled=c;document.getElementById('deleteCrawlQueues').disabled=c;document.getElementById('deleteSearchFl').disabled=c;"
-/><label for="deleteSolr">Delete Solr Index</label><br/>#(/solr)#
+/><label for="deleteIndex">Delete local search index (including local solr)</label><br/>
+#(solr)#::<input type="checkbox" name="deleteRemoteSolr" id="deleteRemoteSolr"
+onclick="x=document.getElementById('deleteRemoteSolr').checked;document.getElementById('deleteRobots').checked=x;document.getElementById('deleteCrawlQueues').checked=x;c='disabled';document.getElementById('deleteSearchFl').checked=x;if(x){c='';}document.getElementById('deletecomplete').disabled=c;document.getElementById('deleteCache').disabled=c;document.getElementById('deleteRobots').disabled=c;document.getElementById('deleteCrawlQueues').disabled=c;document.getElementById('deleteSearchFl').disabled=c;"
+/><label for="deleteRemoteSolr">Delete remote solr index</label><br/>#(/solr)#
 <input type="checkbox" name="deleteTriplestore" id="deleteTriplestore" disabled="disabled" /><label for="deleteTriplestore">Delete RDF Triplestore</label><br/>
 <input type="checkbox" name="deleteCache" id="deleteCache" disabled="disabled" /><label for="deleteCache">Delete HTTP & FTP Cache</label><br/>
 <input type="checkbox" name="deleteCrawlQueues" id="deleteCrawlQueues" disabled="disabled" /><label for="deleteCrawlQueues">Stop Crawler and delete Crawl Queues</label><br/>
@@ -42,6 +42,7 @@ import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
 import net.yacy.document.Condenser;
 import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.data.meta.URIMetadata;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.data.word.WordReference;
@@ -156,7 +157,7 @@ public class IndexControlRWIs_p {
 if ( post.get("deleteIndex", "").equals("on") ) {
 segment.clear();
 }
-if ( post.get("deleteSolr", "").equals("on") && sb.index.getRemoteSolr() != null) {
+if ( post.get("deleteRemoteSolr", "").equals("on") && sb.index.getRemoteSolr() != null) {
 try {
 sb.index.getRemoteSolr().clear();
 } catch ( final Exception e ) {
@@ -307,15 +308,15 @@ public class IndexControlRWIs_p {
 index = segment.termIndex().get(keyhash, null);
 // built urlCache
 final Iterator<WordReference> urlIter = index.entries();
-final TreeMap<byte[], URIMetadataRow> knownURLs =
-new TreeMap<byte[], URIMetadataRow>(Base64Order.enhancedCoder);
+final TreeMap<byte[], URIMetadata> knownURLs =
+new TreeMap<byte[], URIMetadata>(Base64Order.enhancedCoder);
 final HandleSet unknownURLEntries =
 new HandleSet(
 WordReferenceRow.urlEntryRow.primaryKeyLength,
 WordReferenceRow.urlEntryRow.objectOrder,
 index.size());
 Reference iEntry;
-URIMetadataRow lurl;
+URIMetadata lurl;
 while (urlIter.hasNext()) {
 iEntry = urlIter.next();
 lurl = segment.urlMetadata().load(iEntry.urlhash());
@@ -413,7 +414,7 @@ public class IndexControlRWIs_p {
 } catch ( final RowSpaceExceededException e ) {
 Log.logException(e);
 }
-final URIMetadataRow e = segment.urlMetadata().load(b);
+final URIMetadata e = segment.urlMetadata().load(b);
 segment.urlMetadata().remove(b);
 if ( e != null ) {
 url = e.url();
@@ -448,7 +449,7 @@ public class IndexControlRWIs_p {
 } catch ( final RowSpaceExceededException e ) {
 Log.logException(e);
 }
-final URIMetadataRow e = segment.urlMetadata().load(b);
+final URIMetadata e = segment.urlMetadata().load(b);
 segment.urlMetadata().remove(b);
 if ( e != null ) {
 url = e.url();
@@ -514,7 +515,7 @@ public class IndexControlRWIs_p {
 prop.put("genUrlList_lines", maxlines);
 int i = 0;
 DigestURI url;
-URIMetadataRow entry;
+URIMetadata entry;
 String us;
 long rn = -1;
 while ( !ranked.isEmpty() && (entry = ranked.takeURL(false, 1000)) != null ) {
@@ -35,7 +35,7 @@ import net.yacy.cora.document.ASCII;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
 import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.meta.URIMetadata;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.order.Base64Order;
@@ -132,7 +132,7 @@ public class IndexControlURLs_p {
 }
 
 if (post.containsKey("urlhashdelete")) {
-final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
+final URIMetadata entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
 if (entry == null) {
 prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
 } else {
@@ -166,7 +166,7 @@ public class IndexControlURLs_p {
 final DigestURI url = new DigestURI(urlstring);
 urlhash = ASCII.String(url.hash());
 prop.put("urlhash", urlhash);
-final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
+final URIMetadata entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
 if (entry == null) {
 prop.putHTML("result", "No Entry for URL " + url.toNormalform(true, true));
 prop.putHTML("urlstring", urlstring);
@@ -184,7 +184,7 @@ public class IndexControlURLs_p {
 }
 
 if (post.containsKey("urlhashsearch")) {
-final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
+final URIMetadata entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
 if (entry == null) {
 prop.putHTML("result", "No Entry for URL hash " + urlhash);
 } else {
@@ -199,9 +199,9 @@ public class IndexControlURLs_p {
 // generate list
 if (post.containsKey("urlhashsimilar")) {
 try {
-final Iterator<URIMetadataRow> entryIt = new RotateIterator<URIMetadataRow>(segment.urlMetadata().entries(true, urlhash), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), segment.termIndex().sizesMax());
+final Iterator<URIMetadata> entryIt = new RotateIterator<URIMetadata>(segment.urlMetadata().entries(true, urlhash), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), segment.termIndex().sizesMax());
 final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
-URIMetadataRow entry;
+URIMetadata entry;
 int i = 0, rows = 0, cols = 0;
 prop.put("urlhashsimilar", "1");
 while (entryIt.hasNext() && i < 256) {
@@ -303,14 +303,14 @@ public class IndexControlURLs_p {
 return prop;
 }
 
-private static serverObjects genUrlProfile(final Segment segment, final URIMetadataRow entry, final String urlhash) {
+private static serverObjects genUrlProfile(final Segment segment, final URIMetadata entry, final String urlhash) {
 final serverObjects prop = new serverObjects();
 if (entry == null) {
 prop.put("genUrlProfile", "1");
 prop.put("genUrlProfile_urlhash", urlhash);
 return prop;
 }
-final URIMetadataRow le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash());
+final URIMetadata le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash());
 if (entry.url() == null) {
 prop.put("genUrlProfile", "1");
 prop.put("genUrlProfile_urlhash", urlhash);
@@ -51,7 +51,7 @@ import net.yacy.document.WordTokenizer;
 import net.yacy.document.parser.html.CharacterCoding;
 import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.meta.URIMetadata;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segment;
 
@@ -112,7 +112,7 @@ public class ViewFile {
 
 // get the url hash from which the content should be loaded
 String urlHash = post.get("urlHash", "");
-URIMetadataRow urlEntry = null;
+URIMetadata urlEntry = null;
 // get the urlEntry that belongs to the url hash
 if (urlHash.length() > 0 && (urlEntry = indexSegment.urlMetadata().load(ASCII.getBytes(urlHash))) != null) {
 // get the url that belongs to the entry
@@ -35,7 +35,7 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.document.LibraryProvider;
 import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.meta.URIMetadata;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segment;
@@ -86,12 +86,12 @@ public class Vocabulary_p {
 if (p >= 0) t = t.substring(p + 1);
 }
 if (discoverFromTitle || discoverFromTitleSplitted) {
-URIMetadataRow m = segment.urlMetadata().load(u.hash());
+URIMetadata m = segment.urlMetadata().load(u.hash());
 if (m != null) t = m.dc_title();
 if (t.endsWith(".jpg") || t.endsWith(".gif")) continue;
 }
 if (discoverFromAuthor) {
-URIMetadataRow m = segment.urlMetadata().load(u.hash());
+URIMetadata m = segment.urlMetadata().load(u.hash());
 if (m != null) t = m.dc_creator();
 }
 t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim();
@@ -35,7 +35,7 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.protocol.RequestHeader.FileType;
 import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.meta.URIMetadata;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.search.Switchboard;
@@ -97,13 +97,13 @@ public class yacydoc {
 }
 if (urlhash == null || urlhash.isEmpty()) return prop;
 
-final URIMetadataRow entry = segment.urlMetadata().load(urlhash.getBytes());
+final URIMetadata entry = segment.urlMetadata().load(urlhash.getBytes());
 if (entry == null) return prop;
 
 if (entry.url() == null) {
 return prop;
 }
-final URIMetadataRow le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash());
+final URIMetadata le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash());
 
 prop.putXML("dc_title", entry.dc_title());
 prop.putXML("dc_creator", entry.dc_creator());
@@ -31,7 +31,7 @@ import net.yacy.cora.date.GenericFormatter;
 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.meta.URIMetadata;
 import net.yacy.peers.Protocol;
 import net.yacy.search.Switchboard;
 import de.anomic.crawler.NoticedURL;
@@ -110,7 +110,7 @@ public class urls {
 if (urlhashes.length() % 12 != 0) return prop;
 final int count = urlhashes.length() / 12;
 int c = 0;
-URIMetadataRow entry;
+URIMetadata entry;
 DigestURI referrer;
 for (int i = 0; i < count; i++) {
 entry = sb.index.urlMetadata().load(ASCII.getBytes(urlhashes.substring(12 * i, 12 * (i + 1))));
@@ -57,7 +57,7 @@ import net.yacy.document.LibraryProvider;
 import net.yacy.document.Parser;
 import net.yacy.document.geolocation.GeoLocation;
 import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.meta.URIMetadata;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.HandleSet;
 import net.yacy.kelondro.logging.Log;
@@ -660,7 +660,7 @@ public class yacysearch {
 return prop;
 }
 final String recommendHash = post.get("recommendref", ""); // urlhash
-final URIMetadataRow urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(recommendHash));
+final URIMetadata urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(recommendHash));
 if ( urlentry != null ) {
 Document[] documents = null;
 try {
@@ -696,7 +696,7 @@
 return prop;
 }
 final String bookmarkHash = post.get("bookmarkref", ""); // urlhash
-final URIMetadataRow urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(bookmarkHash));
+final URIMetadata urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(bookmarkHash));
 if ( urlentry != null ) {
 try {
 sb.tables.bookmarks.createBookmark(
@@ -45,7 +45,7 @@ import net.yacy.cora.document.UTF8;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.ftp.FTPClient;
 import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.meta.URIMetadata;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.order.Base64Order;
 import net.yacy.kelondro.workflow.WorkflowProcessor;
@@ -439,7 +439,7 @@ public final class CrawlStacker {
 
 // check if the url is double registered
 final String dbocc = this.nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists
-final URIMetadataRow oldEntry = this.indexSegment.urlMetadata().load(url.hash());
+final URIMetadata oldEntry = this.indexSegment.urlMetadata().load(url.hash());
 if (oldEntry == null) {
 if (dbocc != null) {
 // do double-check
@@ -32,7 +32,7 @@ import net.yacy.cora.document.ASCII;
 import net.yacy.document.parser.sitemapParser;
 import net.yacy.document.parser.sitemapParser.URLEntry;
 import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.meta.URIMetadata;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.search.Switchboard;
 import de.anomic.crawler.retrieval.Request;
@@ -84,7 +84,7 @@ public class SitemapImporter extends Thread {
 final String dbocc = this.sb.urlExists(nexturlhash);
 if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
 // the url was already loaded. we need to check the date
-final URIMetadataRow oldEntry = this.sb.index.urlMetadata().load(nexturlhash);
+final URIMetadata oldEntry = this.sb.index.urlMetadata().load(nexturlhash);
 if (oldEntry != null) {
 final Date modDate = oldEntry.moddate();
 // check if modDate is null
@@ -419,7 +419,8 @@ public class URLAnalysis {
 public static int diffurlcol(final String metadataPath, final String statisticFile, final String diffFile) throws IOException, RowSpaceExceededException {
 System.out.println("INDEX DIFF URL-COL startup");
 final HandleMap idx = new HandleMap(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 4, new File(statisticFile));
-final MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false);
+final MetadataRepository mr = new MetadataRepository(new File(metadataPath));
+mr.connectUrlDb(Segment.UrlDbName, false, false);
 final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1000000);
 System.out.println("INDEX DIFF URL-COL loaded dump, starting diff");
 final long start = System.currentTimeMillis();
@@ -447,7 +448,8 @@ public class URLAnalysis {
 public static void export(final String metadataPath, final int format, final String export, final String diffFile) throws IOException, RowSpaceExceededException {
 // format: 0=text, 1=html, 2=rss/xml
 System.out.println("URL EXPORT startup");
-final MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false);
+final MetadataRepository mr = new MetadataRepository(new File(metadataPath));
+mr.connectUrlDb(Segment.UrlDbName, false, false);
 final HandleSet hs = (diffFile == null) ? null : new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile));
 System.out.println("URL EXPORT loaded dump, starting export");
 final Export e = mr.export(new File(export), ".*", hs, format, false);
@@ -461,7 +463,8 @@ public class URLAnalysis {
 
 public static void delete(final String metadataPath, final String diffFile) throws IOException, RowSpaceExceededException {
 System.out.println("URL DELETE startup");
-final MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false);
+final MetadataRepository mr = new MetadataRepository(new File(metadataPath));
+mr.connectUrlDb(Segment.UrlDbName, false, false);
 final int mrSize = mr.size();
 final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile));
 System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize);
@@ -36,7 +36,7 @@ import net.yacy.cora.services.federated.yacy.CacheStrategy;
 import net.yacy.document.Document;
 import net.yacy.document.Parser.Failure;
 import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.meta.URIMetadata;
 import net.yacy.repository.LoaderDispatcher;
 import net.yacy.search.index.Segment;
 import de.anomic.crawler.retrieval.Response;
@@ -105,7 +105,7 @@ public class YMarkMetadata {
 
 public EnumMap<METADATA, String> getMetadata() {
 final EnumMap<METADATA, String> metadata = new EnumMap<METADATA, String>(METADATA.class);
-final URIMetadataRow urlEntry = this.indexSegment.urlMetadata().load(this.uri.hash());
+final URIMetadata urlEntry = this.indexSegment.urlMetadata().load(this.uri.hash());
 if (urlEntry != null) {
 metadata.put(METADATA.SIZE, String.valueOf(urlEntry.size()));
 metadata.put(METADATA.FRESHDATE, ISO8601Formatter.FORMATTER.format(urlEntry.freshdate()));
@@ -34,6 +34,7 @@ import org.apache.solr.client.solrj.SolrServer;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
+import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrDocumentList;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrInputDocument;
@@ -89,7 +90,7 @@ public class AbstractSolrConnector implements SolrConnector {
 @Override
 public long getSize() {
 try {
-final SolrDocumentList list = get("*:*", 0, 1);
+final SolrDocumentList list = query("*:*", 0, 1);
 return list.getNumFound();
 } catch (final Throwable e) {
 Log.logException(e);
@@ -132,8 +133,8 @@ public class AbstractSolrConnector implements SolrConnector {
 @Override
 public boolean exists(final String id) throws IOException {
 try {
-final SolrDocumentList list = get(SolrField.id.getSolrFieldName() + ":" + id, 0, 1);
-return list.getNumFound() > 0;
+final SolrDocument doc = get(id);
+return doc != null;
 } catch (final Throwable e) {
 Log.logException(e);
 return false;
@@ -186,7 +187,7 @@ public class AbstractSolrConnector implements SolrConnector {
 * @throws IOException
 */
 @Override
-public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException {
+public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException {
 // construct query
 final SolrQuery query = new SolrQuery();
 query.setQuery(querystring);
@@ -209,8 +210,33 @@ public class AbstractSolrConnector implements SolrConnector {
 } catch (final Throwable e) {
 throw new IOException(e);
 }
 }
 
 //return result;
+/**
+* get a document from solr by given id
+* @param id
+* @return one result or null if no result exists
+* @throws IOException
+*/
+@Override
+public SolrDocument get(final String id) throws IOException {
+// construct query
+StringBuffer sb = new StringBuffer(id.length() + 3);
+sb.append(SolrField.id.getSolrFieldName()).append(':').append(id);
+final SolrQuery query = new SolrQuery();
+query.setQuery(sb.toString());
+query.setRows(1);
+query.setStart(0);
+
+// query the server
+try {
+final QueryResponse rsp = this.server.query( query );
+final SolrDocumentList docs = rsp.getResults();
+if (docs.isEmpty()) return null;
+return docs.get(0);
+} catch (final Throwable e) {
+throw new IOException(e);
+}
+}
 
 }
@@ -5,6 +5,7 @@ import java.util.Collection;
 import java.util.List;
 import java.util.concurrent.ArrayBlockingQueue;
 
+import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrDocumentList;
 import org.apache.solr.common.SolrException;
 
@@ -111,6 +112,11 @@ public class MultipleSolrConnector implements SolrConnector {
 return this.solr.exists(id);
 }
 
+@Override
+public SolrDocument get(String id) throws IOException {
+return this.solr.get(id);
+}
+
 @Override
 public void add(final SolrDoc solrdoc) throws IOException, SolrException {
 try {
@@ -132,8 +138,8 @@ public class MultipleSolrConnector implements SolrConnector {
 }
 
 @Override
-public SolrDocumentList get(String querystring, int offset, int count) throws IOException {
-return this.solr.get(querystring, offset, count);
+public SolrDocumentList query(String querystring, int offset, int count) throws IOException {
+return this.solr.query(querystring, offset, count);
 }
 
 @Override
@@ -28,6 +28,7 @@ import java.io.IOException;
 import java.util.Collection;
 import java.util.List;
 
+import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrDocumentList;
 import org.apache.solr.common.SolrException;
 
@@ -120,6 +121,21 @@ public class RetrySolrConnector implements SolrConnector {
 return false;
 }
 
+@Override
+public SolrDocument get(String id) throws IOException {
+final long t = System.currentTimeMillis() + this.retryMaxTime;
+Throwable ee = null;
+while (System.currentTimeMillis() < t) try {
+return this.solrConnector.get(id);
+} catch (final Throwable e) {
+ee = e;
+try {Thread.sleep(10);} catch (final InterruptedException e1) {}
+continue;
+}
+if (ee != null) throw (ee instanceof IOException) ? (IOException) ee : new IOException(ee.getMessage());
+return null;
+}
+
 @Override
 public void add(final SolrDoc solrdoc) throws IOException, SolrException {
 final long t = System.currentTimeMillis() + this.retryMaxTime;
@@ -141,11 +157,11 @@ public class RetrySolrConnector implements SolrConnector {
 }
 
 @Override
-public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException {
+public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException {
 final long t = System.currentTimeMillis() + this.retryMaxTime;
 Throwable ee = null;
 while (System.currentTimeMillis() < t) try {
-return this.solrConnector.get(querystring, offset, count);
+return this.solrConnector.query(querystring, offset, count);
 } catch (final Throwable e) {
 ee = e;
 try {Thread.sleep(10);} catch (final InterruptedException e1) {}
@@ -116,6 +116,15 @@ public class ShardSolrConnector implements SolrConnector {
 }
 return false;
 }
 
+@Override
+public SolrDocument get(String id) throws IOException {
+for (final SolrConnector connector: this.connectors) {
+SolrDocument doc = connector.get(id);
+if (doc != null) return doc;
+}
+return null;
+}
+
 /**
 * add a Solr document
@@ -148,10 +157,10 @@ public class ShardSolrConnector implements SolrConnector {
 * @throws IOException
 */
 @Override
-public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException {
+public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException {
 final SolrDocumentList list = new SolrDocumentList();
 for (final SolrConnector connector: this.connectors) {
-final SolrDocumentList l = connector.get(querystring, offset, count);
+final SolrDocumentList l = connector.query(querystring, offset, count);
 for (final SolrDocument d: l) {
 list.add(d);
 }
@@ -163,7 +172,7 @@ public class ShardSolrConnector implements SolrConnector {
 final SolrDocumentList[] list = new SolrDocumentList[this.connectors.size()];
 int i = 0;
 for (final SolrConnector connector: this.connectors) {
-list[i++] = connector.get(querystring, offset, count);
+list[i++] = connector.query(querystring, offset, count);
 }
 return list;
 }
@@ -28,6 +28,7 @@ import java.io.IOException;
 import java.util.Collection;
 import java.util.List;
 
+import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrDocumentList;
 import org.apache.solr.common.SolrException;
 
@@ -87,13 +88,21 @@ public interface SolrConnector {
 public void add(final SolrDoc solrdoc) throws IOException, SolrException;
 public void add(final Collection<SolrDoc> solrdocs) throws IOException, SolrException;
 
+/**
+* get a document from solr by given id
+* @param id
+* @return one result or null if no result exists
+* @throws IOException
+*/
+public SolrDocument get(final String id) throws IOException;
+
 /**
 * get a query result from solr
 * to get all results set the query String to "*:*"
 * @param querystring
 * @throws IOException
 */
-public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException;
+public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException;
 
 /**
 * get the size of the index
@@ -24,8 +24,8 @@ package net.yacy.kelondro.data.meta;
 
 import java.util.Date;
 
+import net.yacy.kelondro.data.word.WordReference;
 import net.yacy.kelondro.order.Bitfield;
-import net.yacy.kelondro.rwi.Reference;
 
 
 public interface URIMetadata extends URIReference {
@@ -74,10 +74,12 @@ public interface URIMetadata extends URIReference {
 
 public String snippet();
 
-public Reference word();
+public WordReference word();
 
 public boolean isOlder(final URIMetadata other);
 
 public String toString(final String snippet);
 
+public byte[] referrerHash();
+
 }
@@ -35,6 +35,12 @@ public interface URIReference {
 */
 public byte[] hash();
 
+/**
+* the second half of a uri hash is the host hash
+* @return
+*/
+public String hosthash();
+
 /**
 * The modification date of the URIReference is given if
 * the record was created first and is defined with the
@@ -49,6 +49,14 @@ public class URIReferenceNode extends HashMap<String, byte[]> implements URIRefe
 return this.hash;
 }
 
+private String hostHash = null;
+@Override
+public String hosthash() {
+if (this.hostHash != null) return this.hostHash;
+this.hostHash = ASCII.String(this.hash, 6, 6);
+return this.hostHash;
+}
+
 @Override
 public Date moddate() {
 byte[] x = this.get(MetadataVocabulary.moddate.name());
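
The new hosthash() accessor above reads the second half of the 12-character URL hash as the host identifier. A tiny, hypothetical illustration of that slicing (the hash value is made up; only the (6, 6) offsets mirror ASCII.String(this.hash, 6, 6) in the code above):

    import java.nio.charset.StandardCharsets;

    class HostHashSketch {
        public static void main(String[] args) {
            // hypothetical 12-character URL hash
            byte[] urlHash = "abcdef123456".getBytes(StandardCharsets.US_ASCII);
            // take the last 6 characters, like ASCII.String(this.hash, 6, 6)
            String hostHash = new String(urlHash, 6, 6, StandardCharsets.US_ASCII);
            System.out.println(hostHash); // prints "123456"; URLs on the same host share this suffix
        }
    }
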
@@ -29,7 +29,6 @@ import java.util.concurrent.ConcurrentHashMap;
 
 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.RSSMessage;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.MapTools;
 import net.yacy.peers.operation.yacyVersion;
 
@@ -77,6 +77,7 @@ import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.cora.services.federated.opensearch.SRURSSConnector;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
+import net.yacy.kelondro.data.meta.URIMetadata;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.data.word.WordReference;
@@ -1155,7 +1156,7 @@ public final class Protocol
 public static String transferIndex(
 final Seed targetSeed,
 final ReferenceContainerCache<WordReference> indexes,
-final SortedMap<byte[], URIMetadataRow> urlCache,
+final SortedMap<byte[], URIMetadata> urlCache,
 final boolean gzipBody,
 final int timeout) {
 
@@ -1216,7 +1217,7 @@
 } // all url's known
 
 // extract the urlCache from the result
-final URIMetadataRow[] urls = new URIMetadataRow[uhs.length];
+final URIMetadata[] urls = new URIMetadataRow[uhs.length];
 for ( int i = 0; i < uhs.length; i++ ) {
 urls[i] = urlCache.get(ASCII.getBytes(uhs[i]));
 if ( urls[i] == null ) {
@@ -1324,7 +1325,7 @@
 
 private static Map<String, String> transferURL(
 final Seed targetSeed,
-final URIMetadataRow[] urls,
+final URIMetadata[] urls,
 boolean gzipBody,
 final int timeout) {
 // this post a message to the remote message board
@@ -1346,7 +1347,7 @@
 String resource;
 int urlc = 0;
 int urlPayloadSize = 0;
-for ( final URIMetadataRow url : urls ) {
+for ( final URIMetadata url : urls ) {
 if ( url != null ) {
 resource = url.toString();
 //System.out.println("*** DEBUG resource = " + resource);
@@ -32,7 +32,7 @@ import java.util.SortedMap;
 import java.util.TreeMap;
 
 import net.yacy.cora.document.ASCII;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.meta.URIMetadata;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.data.word.WordReference;
 import net.yacy.kelondro.data.word.WordReferenceRow;
@@ -90,7 +90,7 @@ public class Transmission {
 */
 private final byte[] primaryTarget;
 private final ReferenceContainerCache<WordReference> containers;
-private final SortedMap<byte[], URIMetadataRow> references;
+private final SortedMap<byte[], URIMetadata> references;
 private final HandleSet badReferences;
 private final List<Seed> targets;
 private int hit, miss;
@@ -106,7 +106,7 @@
 super();
 this.primaryTarget = primaryTarget;
 this.containers = new ReferenceContainerCache<WordReference>(Segment.wordReferenceFactory, Segment.wordOrder, Word.commonHashLength);
-this.references = new TreeMap<byte[], URIMetadataRow>(Base64Order.enhancedCoder);
+this.references = new TreeMap<byte[], URIMetadata>(Base64Order.enhancedCoder);
 this.badReferences = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0);
 this.targets = targets;
 this.hit = 0;
@@ -175,7 +175,7 @@
 notFoundx.add(e.urlhash());
 continue;
 }
-final URIMetadataRow r = Transmission.this.segment.urlMetadata().load(e.urlhash());
+final URIMetadata r = Transmission.this.segment.urlMetadata().load(e.urlhash());
 if (r == null) {
 notFoundx.add(e.urlhash());
 this.badReferences.put(e.urlhash());
@@ -45,6 +45,7 @@ import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
 
 import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.data.meta.URIMetadata;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.index.HandleSet;
 import net.yacy.kelondro.index.RowSpaceExceededException;
@@ -332,7 +333,7 @@ public class Blacklist {
 * @param entry Entry to be checked
 * @return Whether the given entry is blacklisted
 */
-public boolean isListed(final BlacklistType blacklistType, final URIMetadataRow entry) {
+public boolean isListed(final BlacklistType blacklistType, final URIMetadata entry) {
 // Call inner method
 return isListed(blacklistType, entry.url());
 }
@@ -111,6 +111,7 @@ import net.yacy.document.parser.html.Evaluation;
 import net.yacy.gui.Tray;
 import net.yacy.kelondro.blob.Tables;
 import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.data.meta.URIMetadata;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.HandleSet;
@@ -391,8 +392,12 @@ public final class Switchboard extends serverSwitch
 fileSizeMax,
 this.useTailCache,
 this.exceed134217727,
-solrLocal);
-
+solrLocal,
+true, // useCitationIndex
+true, // useRWI
+true // useMetadata
+);
+
 // prepare a solr index profile switch list
 final File solrBackupProfile = new File("defaults/solr.keys.list");
 final String schemename =
@@ -1197,7 +1202,11 @@ public final class Switchboard extends serverSwitch
 fileSizeMax,
 this.useTailCache,
 this.exceed134217727,
-solrLocal);
+solrLocal,
+true, // useCitationIndex
+true, // useRWI
+true // useMetadata
+);
 this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object
 
 // create a crawler
@@ -1447,7 +1456,7 @@ public final class Switchboard extends serverSwitch
 if ( urlhash.length == 0 ) {
 return null;
 }
-final URIMetadataRow le = this.index.urlMetadata().load(urlhash);
+final URIMetadata le = this.index.urlMetadata().load(urlhash);
 if ( le != null ) {
 return le.url();
 }
@@ -41,6 +41,7 @@ import net.yacy.document.Document;
 import net.yacy.document.LibraryProvider;
 import net.yacy.document.TextParser;
 import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.data.meta.URIMetadata;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.search.query.QueryParams;
@@ -74,7 +75,19 @@ public class DocumentIndex extends Segment
 
 public DocumentIndex(final File segmentPath, final CallbackListener callback, final int cachesize)
 throws IOException {
-super(new Log("DocumentIndex"), segmentPath, cachesize, targetFileSize * 4 - 1, false, false, true);
+super(
+new Log("DocumentIndex"),
+segmentPath,
+cachesize,
+targetFileSize * 4 - 1,
+false, // useTailCache
+false, // exceed134217727
+true, // connectLocalSolr
+true, // useCitationIndex
+true, // useRWI
+true // useMetadata
+);
+
 final int cores = Runtime.getRuntime().availableProcessors() + 1;
 this.callback = callback;
 this.queue = new LinkedBlockingQueue<DigestURI>(cores * 300);
@@ -227,7 +240,7 @@ public class DocumentIndex extends Segment
 rankedCache.start();
 
 // search is running; retrieve results
-URIMetadataRow row;
+URIMetadata row;
 final ArrayList<DigestURI> files = new ArrayList<DigestURI>();
 while ( (row = rankedCache.takeURL(false, 1000)) != null ) {
 files.add(row.url());
@ -49,6 +49,7 @@ import net.yacy.cora.sorting.ScoreMap;
|
||||
import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
|
||||
import net.yacy.document.parser.html.CharacterCoding;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.data.meta.URIMetadata;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
import net.yacy.kelondro.data.word.WordReferenceVars;
|
||||
import net.yacy.kelondro.index.Cache;
|
||||
@ -65,43 +66,38 @@ import net.yacy.search.Switchboard;
|
||||
import net.yacy.search.solr.EmbeddedSolrConnector;
|
||||
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import de.anomic.crawler.CrawlStacker;
|
||||
|
||||
public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]> {
|
||||
|
||||
// class objects
|
||||
protected Index urlIndexFile;
|
||||
private final File location;
|
||||
private Index urlIndexFile;
|
||||
private Export exportthread; // will have a export thread assigned if exporter is running
|
||||
private final File location;
|
||||
private final String tablename;
|
||||
private String tablename;
|
||||
private ArrayList<HostStat> statsDump;
|
||||
private SolrConnector localSolr, remoteSolr;
|
||||
|
||||
public MetadataRepository(
|
||||
final File path,
|
||||
final String tablename,
|
||||
final boolean useTailCache,
|
||||
final boolean exceed134217727) {
|
||||
public MetadataRepository(final File path) {
|
||||
this.location = path;
|
||||
this.tablename = tablename;
|
||||
Index backupIndex = null;
|
||||
backupIndex = new SplitTable(this.location, tablename, URIMetadataRow.rowdef, useTailCache, exceed134217727);
|
||||
this.urlIndexFile = backupIndex; //new Cache(backupIndex, 20000000, 20000000);
|
||||
this.tablename = null;
|
||||
this.urlIndexFile = null;
|
||||
this.exportthread = null; // will have a export thread assigned if exporter is running
|
||||
this.statsDump = null;
|
||||
this.remoteSolr = null;
|
||||
this.localSolr = null;
|
||||
}
|
||||
|
||||
public void connectRemoteSolr(final SolrConnector solr) {
|
||||
this.remoteSolr = solr;
|
||||
|
||||
public void connectUrlDb(final String tablename, final boolean useTailCache, final boolean exceed134217727) {
|
||||
if (this.urlIndexFile != null) return;
|
||||
this.tablename = tablename;
|
||||
this.urlIndexFile = new SplitTable(this.location, tablename, URIMetadataRow.rowdef, useTailCache, exceed134217727);
|
||||
}
|
||||
|
||||
public void disconnectRemoteSolr() {
|
||||
if (this.remoteSolr == null) return;
|
||||
this.remoteSolr.close();
|
||||
this.remoteSolr = null;
|
||||
public void disconnectUrlDb() {
|
||||
if (this.urlIndexFile == null) return;
|
||||
this.urlIndexFile.close();
|
||||
this.urlIndexFile = null;
|
||||
}
|
||||
|
||||
public void connectLocalSolr() throws IOException {
|
||||
@ -123,6 +119,16 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
|
||||
this.localSolr.close();
|
||||
this.localSolr = null;
|
||||
}
|
||||
|
||||
public void connectRemoteSolr(final SolrConnector solr) {
|
||||
this.remoteSolr = solr;
|
||||
}
|
||||
|
||||
public void disconnectRemoteSolr() {
|
||||
if (this.remoteSolr == null) return;
|
||||
this.remoteSolr.close();
|
||||
this.remoteSolr = null;
|
||||
}
|
||||
|
||||
public SolrConnector getLocalSolr() {
|
||||
return this.localSolr;
|
||||
@ -133,7 +139,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
|
||||
}
|
||||
|
||||
public void clearCache() {
|
||||
if (this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache();
|
||||
if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache();
|
||||
if (this.statsDump != null) this.statsDump.clear();
|
||||
this.statsDump = null;
|
||||
}
|
||||
@ -142,15 +148,22 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
|
||||
if (this.exportthread != null) this.exportthread.interrupt();
|
||||
if (this.urlIndexFile == null) {
|
||||
SplitTable.delete(this.location, this.tablename);
|
||||
this.urlIndexFile = new SplitTable(this.location, this.tablename, URIMetadataRow.rowdef, false, false);
|
||||
} else {
|
||||
this.urlIndexFile.clear();
|
||||
}
|
||||
if (this.localSolr != null) {
|
||||
this.localSolr.clear();
|
||||
}
|
||||
// the remote solr is not cleared here because that shall be done separately
|
||||
this.statsDump = null;
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return this.urlIndexFile == null ? 0 : this.urlIndexFile.size();
|
||||
int size = 0;
|
||||
size += this.urlIndexFile == null ? 0 : this.urlIndexFile.size();
|
||||
size += this.localSolr == null ? 0 : this.localSolr.getSize();
|
||||
size += this.remoteSolr == null ? 0 : this.remoteSolr.getSize();
|
||||
return size;
|
||||
}
|
||||
|
||||
public void close() {
|
||||
@ -170,8 +183,8 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
|
||||
}
|
||||
|
||||
public int writeCacheSize() {
|
||||
if (this.urlIndexFile instanceof SplitTable) return ((SplitTable) this.urlIndexFile).writeBufferSize();
|
||||
if (this.urlIndexFile instanceof Cache) return ((Cache) this.urlIndexFile).writeBufferSize();
|
||||
if (this.urlIndexFile != null && this.urlIndexFile instanceof SplitTable) return ((SplitTable) this.urlIndexFile).writeBufferSize();
|
||||
if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) return ((Cache) this.urlIndexFile).writeBufferSize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -181,59 +194,69 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
|
||||
* @param obrwi
|
||||
* @return
|
||||
*/
|
||||
public URIMetadataRow load(final WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi) {
|
||||
if (this.urlIndexFile == null) return null;
|
||||
public URIMetadata load(final WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi) {
|
||||
if (obrwi == null) return null; // all time was already wasted in takeRWI to get another element
|
||||
final byte[] urlHash = obrwi.getElement().urlhash();
|
||||
if (urlHash == null) return null;
|
||||
try {
|
||||
if (this.urlIndexFile != null) try {
|
||||
final Row.Entry entry = this.urlIndexFile.get(urlHash, false);
|
||||
if (entry == null) return null;
|
||||
return new URIMetadataRow(entry, obrwi.getElement(), obrwi.getWeight());
|
||||
} catch (final IOException e) {
|
||||
return null;
|
||||
Log.logException(e);
|
||||
}
|
||||
/*
|
||||
if (this.localSolr != null) {
|
||||
try {
|
||||
SolrDocument doc = this.localSolr.get(ASCII.String(urlHash));
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
}
|
||||
*/
|
||||
return null;
|
||||
}
|
||||
|
||||
public URIMetadataRow load(final byte[] urlHash) {
|
||||
if (this.urlIndexFile == null) return null;
|
||||
public URIMetadata load(final byte[] urlHash) {
|
||||
if (urlHash == null) return null;
|
||||
try {
|
||||
if (this.urlIndexFile != null) try {
|
||||
final Row.Entry entry = this.urlIndexFile.get(urlHash, false);
|
||||
if (entry == null) return null;
|
||||
return new URIMetadataRow(entry, null, 0);
|
||||
} catch (final IOException e) {
|
||||
return null;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public void store(final URIMetadataRow entry) throws IOException {
|
||||
public void store(final URIMetadata entry) throws IOException {
|
||||
// Check if there is a more recent Entry already in the DB
|
||||
URIMetadataRow oldEntry;
|
||||
if (this.urlIndexFile == null) return; // case may happen during shutdown or startup
|
||||
try {
|
||||
final Row.Entry oe = this.urlIndexFile.get(entry.hash(), false);
|
||||
oldEntry = (oe == null) ? null : new URIMetadataRow(oe, null, 0);
|
||||
} catch (final Exception e) {
|
||||
Log.logException(e);
|
||||
oldEntry = null;
|
||||
if (this.urlIndexFile != null && entry instanceof URIMetadataRow) {
|
||||
URIMetadata oldEntry = null;
|
||||
try {
|
||||
final Row.Entry oe = this.urlIndexFile.get(entry.hash(), false);
|
||||
oldEntry = (oe == null) ? null : new URIMetadataRow(oe, null, 0);
|
||||
} catch (final Exception e) {
|
||||
Log.logException(e);
|
||||
oldEntry = null;
|
||||
}
|
||||
if (oldEntry != null && entry.isOlder(oldEntry)) {
|
||||
// the fetched oldEntry is better, so return its properties instead of the new ones
|
||||
// this.urlHash = oldEntry.urlHash; // unnecessary, should be the same
|
||||
// this.url = oldEntry.url; // unnecessary, should be the same
|
||||
// doesn't make sense, since no return value:
|
||||
//entry = oldEntry;
|
||||
return; // this did not need to be stored, but is updated
|
||||
}
|
||||
|
||||
try {
|
||||
this.urlIndexFile.put(((URIMetadataRow) entry).toRowEntry());
|
||||
} catch (final RowSpaceExceededException e) {
|
||||
throw new IOException("RowSpaceExceededException in " + this.urlIndexFile.filename() + ": " + e.getMessage());
|
||||
}
|
||||
this.statsDump = null;
|
||||
if (MemoryControl.shortStatus()) clearCache();
|
||||
}
|
||||
if (oldEntry != null && entry.isOlder(oldEntry)) {
|
||||
// the fetched oldEntry is better, so return its properties instead of the new ones
|
||||
// this.urlHash = oldEntry.urlHash; // unnecessary, should be the same
|
||||
// this.url = oldEntry.url; // unnecessary, should be the same
|
||||
// doesn't make sense, since no return value:
|
||||
//entry = oldEntry;
|
||||
return; // this did not need to be stored, but is updated
|
||||
}
|
||||
|
||||
try {
|
||||
this.urlIndexFile.put(entry.toRowEntry());
|
||||
} catch (final RowSpaceExceededException e) {
|
||||
throw new IOException("RowSpaceExceededException in " + this.urlIndexFile.filename() + ": " + e.getMessage());
|
||||
}
|
||||
this.statsDump = null;
|
||||
if (MemoryControl.shortStatus()) clearCache() ;
|
||||
}
|
||||
|
||||
public boolean remove(final byte[] urlHash) {
|
||||
@ -251,13 +274,14 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
|
||||
Log.logException(e);
|
||||
}
|
||||
}
|
||||
try {
|
||||
if (this.urlIndexFile != null) try {
|
||||
final Row.Entry r = this.urlIndexFile.remove(urlHash);
|
||||
if (r != null) this.statsDump = null;
|
||||
return r != null;
|
||||
} catch (final IOException e) {
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean exists(final byte[] urlHash) {
|
||||
@ -297,17 +321,17 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
|
||||
return keys(true, null);
|
||||
}
|
||||
|
||||
public CloneableIterator<URIMetadataRow> entries() throws IOException {
|
||||
public CloneableIterator<URIMetadata> entries() throws IOException {
|
||||
// enumerates entry elements
|
||||
return new kiter();
|
||||
}
|
||||
|
||||
public CloneableIterator<URIMetadataRow> entries(final boolean up, final String firstHash) throws IOException {
|
||||
public CloneableIterator<URIMetadata> entries(final boolean up, final String firstHash) throws IOException {
|
||||
// enumerates entry elements
|
||||
return new kiter(up, firstHash);
|
||||
}
|
||||
|
||||
public class kiter implements CloneableIterator<URIMetadataRow> {
|
||||
public class kiter implements CloneableIterator<URIMetadata> {
|
||||
// enumerates entry elements
|
||||
private final CloneableIterator<Row.Entry> iter;
|
||||
private final boolean error;
|
||||
@ -342,7 +366,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
|
||||
}
|
||||
|
||||
@Override
|
||||
public final URIMetadataRow next() {
|
||||
public final URIMetadata next() {
|
||||
Row.Entry e = null;
|
||||
if (this.iter == null) { return null; }
|
||||
if (this.iter.hasNext()) { e = this.iter.next(); }
|
||||
@ -372,7 +396,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
|
||||
final Log log = new Log("URLDBCLEANUP");
|
||||
final HashSet<String> damagedURLS = new HashSet<String>();
|
||||
try {
|
||||
final Iterator<URIMetadataRow> eiter = entries(true, null);
|
||||
final Iterator<URIMetadata> eiter = entries(true, null);
|
||||
int iteratorCount = 0;
|
||||
while (eiter.hasNext()) try {
|
||||
eiter.next();
|
||||
@ -456,7 +480,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
|
||||
public void run() {
|
||||
try {
|
||||
Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread startet");
|
||||
final Iterator<URIMetadataRow> eiter = entries(true, null);
|
||||
final Iterator<URIMetadata> eiter = entries(true, null);
|
||||
while (eiter.hasNext() && this.run) {
|
||||
synchronized (this) {
|
||||
if (this.pause) {
|
||||
@ -469,7 +493,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
|
||||
}
|
||||
}
|
||||
}
|
||||
final URIMetadataRow entry = eiter.next();
|
||||
final URIMetadata entry = eiter.next();
|
||||
if (entry == null) {
|
||||
if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "entry == null");
|
||||
} else if (entry.hash() == null) {
|
||||
@ -605,8 +629,8 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
|
||||
this.count++;
|
||||
}
|
||||
} else {
|
||||
final Iterator<URIMetadataRow> i = entries(); // iterates indexURLEntry objects
|
||||
URIMetadataRow entry;
|
||||
final Iterator<URIMetadata> i = entries(); // iterates indexURLEntry objects
|
||||
URIMetadata entry;
|
||||
String url;
|
||||
while (i.hasNext()) {
|
||||
entry = i.next();
|
||||
@ -704,7 +728,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
|
||||
// collect hashes from all domains
|
||||
|
||||
// fetch urls from the database to determine the host in clear text
|
||||
URIMetadataRow urlref;
|
||||
URIMetadata urlref;
|
||||
if (count < 0 || count > domainSamples.size()) count = domainSamples.size();
|
||||
this.statsDump = new ArrayList<HostStat>();
|
||||
final TreeSet<String> set = new TreeSet<String>();
|
||||
@ -741,7 +765,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
|
||||
*/
|
||||
public Map<String, HostStat> domainHashResolver(final Map<String, URLHashCounter> domainSamples) {
|
||||
final HashMap<String, HostStat> hostMap = new HashMap<String, HostStat>();
|
||||
URIMetadataRow urlref;
|
||||
URIMetadata urlref;
|
||||
|
||||
final ScoreMap<String> hosthashScore = new ConcurrentScoreMap<String>();
|
||||
for (final Map.Entry<String, URLHashCounter> e: domainSamples.entrySet()) {
|
||||
@ -762,7 +786,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
|
||||
|
||||
// fetch urls from the database to determine the host in clear text
|
||||
final Iterator<String> j = domainScore.keys(false); // iterate urlhash-examples in reverse order (biggest first)
|
||||
URIMetadataRow urlref;
|
||||
URIMetadata urlref;
|
||||
String urlhash;
|
||||
count += 10; // make some more to prevent that we have to do this again after deletions too soon.
|
||||
if (count < 0 || domainScore.sizeSmaller(count)) count = domainScore.size();
|
||||
|
@ -47,6 +47,7 @@ import net.yacy.document.Parser;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.citation.CitationReferenceFactory;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
@ -88,7 +89,8 @@ public class Segment {
public static final int lowcachedivisor = 900;
public static final long targetFileSize = 64 * 1024 * 1024; // 256 MB
public static final int writeBufferSize = 4 * 1024 * 1024;

public static final String UrlDbName = "text.urlmd";

// the reference factory
public static final ReferenceFactory<WordReference> wordReferenceFactory = new WordReferenceFactory();
public static final ReferenceFactory<CitationReference> citationReferenceFactory = new CitationReferenceFactory();
@ -109,14 +111,17 @@ public class Segment {
final long maxFileSize,
final boolean useTailCache,
final boolean exceed134217727,
final boolean connectLocalSolr) throws IOException {
final boolean connectLocalSolr,
final boolean useCitationIndex,
final boolean useRWI,
final boolean useMetadata) throws IOException {

log.logInfo("Initializing Segment '" + segmentPath + ".");

this.log = log;
this.segmentPath = segmentPath;

this.termIndex = new IndexCell<WordReference>(
this.termIndex = useRWI ? new IndexCell<WordReference>(
segmentPath,
"text.index",
wordReferenceFactory,
@ -125,9 +130,9 @@ public class Segment {
entityCacheMaxSize,
targetFileSize,
maxFileSize,
writeBufferSize);
writeBufferSize) : null;

this.urlCitationIndex = new IndexCell<CitationReference>(
this.urlCitationIndex = useCitationIndex ? new IndexCell<CitationReference>(
segmentPath,
"citation.index",
citationReferenceFactory,
@ -136,10 +141,11 @@ public class Segment {
entityCacheMaxSize,
targetFileSize,
maxFileSize,
writeBufferSize);
writeBufferSize) : null;

// create LURL-db
this.urlMetadata = new MetadataRepository(segmentPath, "text.urlmd", useTailCache, exceed134217727);
this.urlMetadata = new MetadataRepository(segmentPath);
if (useMetadata) this.urlMetadata.connectUrlDb(UrlDbName, useTailCache, exceed134217727);
if (connectLocalSolr) this.connectLocalSolr();
}

@ -148,10 +154,12 @@ public class Segment {
}

public long RWICount() {
if (this.termIndex == null) return 0;
return this.termIndex.sizesMax();
}

public int RWIBufferCount() {
if (this.termIndex == null) return 0;
return this.termIndex.getBufferSize();
}

@ -235,7 +243,7 @@ public class Segment {
}
@Override
public DigestURI next() {
URIMetadataRow umr = Segment.this.urlMetadata.load(bi.next());
URIMetadata umr = Segment.this.urlMetadata.load(bi.next());
return umr.url();
}
@Override
@ -260,9 +268,9 @@ public class Segment {

public void clear() {
try {
this.termIndex.clear();
this.urlMetadata.clear();
this.urlCitationIndex.clear();
if (this.termIndex != null) this.termIndex.clear();
if (this.urlMetadata != null) this.urlMetadata.clear();
if (this.urlCitationIndex != null) this.urlCitationIndex.clear();
} catch (final IOException e) {
Log.logException(e);
}
@ -328,7 +336,7 @@ public class Segment {
assert (wprop.flags != null);
ientry.setWord(wprop);
wordhash = Word.word2hash(word);
try {
if (this.termIndex != null) try {
this.termIndex.add(wordhash, ientry);
} catch (final Exception e) {
Log.logException(e);
@ -354,7 +362,7 @@ public class Segment {

// assign the catchall word
ientry.setWord(wprop == null ? catchallWord : wprop); // we use one of the word properties as template to get the document characteristics
try {
if (this.termIndex != null) try {
this.termIndex.add(catchallHash, ientry);
} catch (final Exception e) {
Log.logException(e);
@ -385,9 +393,9 @@ public class Segment {
}

public synchronized void close() {
this.termIndex.close();
this.urlMetadata.close();
this.urlCitationIndex.close();
if (this.termIndex != null) this.termIndex.close();
if (this.urlMetadata != null) this.urlMetadata.close();
if (this.urlCitationIndex != null) this.urlCitationIndex.close();
}

public URIMetadataRow storeDocument(
@ -541,7 +549,7 @@ public class Segment {

if (urlhash == null) return 0;
// determine the url string
final URIMetadataRow entry = urlMetadata().load(urlhash);
final URIMetadata entry = urlMetadata().load(urlhash);
if (entry == null) return 0;
if (entry.url() == null) return 0;

@ -612,7 +620,7 @@ public class Segment {
entry = new WordReferenceVars(containerIterator.next());
// System.out.println("Wordhash: "+wordHash+" UrlHash:
// "+entry.getUrlHash());
final URIMetadataRow ue = Segment.this.urlMetadata.load(entry.urlhash());
final URIMetadata ue = Segment.this.urlMetadata.load(entry.urlhash());
if (ue == null) {
urlHashs.put(entry.urlhash());
} else {
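Note on the Segment hunks above: the constructor gains useCitationIndex, useRWI and useMetadata switches, the index cells become nullable, and all accesses are null-guarded. The following sketch is not part of the commit; it opens a metadata-only Segment, assuming the argument list inferred from the constructor fragment and the yacy.java call sites below, with placeholder values for the logger name, cache size and file size limit.

import java.io.File;
import java.io.IOException;

import net.yacy.kelondro.logging.Log;
import net.yacy.search.index.Segment;

public class MetadataOnlySegmentSketch {

    // Opens a Segment with the RWI and citation parts switched off, so that only the
    // URL metadata database is attached. Parameter order mirrors the yacy.java call
    // sites in this commit; the leading parameters are inferred, not quoted.
    public static Segment openMetadataOnly(final File segmentPath) throws IOException {
        final Log log = new Log("SEGMENT-SKETCH"); // hypothetical logger name
        final Segment segment = new Segment(
                log,
                segmentPath,
                10000,             // entityCacheMaxSize (placeholder)
                Integer.MAX_VALUE, // maxFileSize (placeholder)
                false,             // useTailCache
                false,             // exceed134217727
                false,             // connectLocalSolr
                false,             // useCitationIndex
                false,             // useRWI: no 'text.index' cell is created, termIndex stays null
                true);             // useMetadata: connectUrlDb(UrlDbName, ...) is called
        // the accessors are null-guarded in this commit, so this prints 0 instead of throwing:
        System.out.println("RWI entries: " + segment.RWICount());
        return segment;
    }
}
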
@ -55,6 +55,7 @@ import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.document.Condenser;
import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
@ -616,7 +617,7 @@ public final class RWIProcess extends Thread
* @param waitingtime the time this method may take for a result computation
* @return a metadata entry for a url
*/
public URIMetadataRow takeURL(final boolean skipDoubleDom, final long waitingtime) {
public URIMetadata takeURL(final boolean skipDoubleDom, final long waitingtime) {
// returns from the current RWI list the best URL entry and removes this entry from the list
final long timeout = System.currentTimeMillis() + Math.max(10, waitingtime);
int p = -1;
@ -627,7 +628,7 @@ public final class RWIProcess extends Thread
if ( obrwi == null ) {
return null; // all time was already wasted in takeRWI to get another element
}
final URIMetadataRow page = this.query.getSegment().urlMetadata().load(obrwi);
final URIMetadata page = this.query.getSegment().urlMetadata().load(obrwi);
if ( page == null ) {
try {
this.misses.putUnique(obrwi.getElement().urlhash());
@ -864,7 +865,7 @@ public final class RWIProcess extends Thread
}

final Iterator<String> domhashs = this.hostNavigator.keys(false);
URIMetadataRow row;
URIMetadata row;
byte[] urlhash;
String hosthash, hostname;
if ( this.hostResolver != null ) {

@ -41,7 +41,7 @@ import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.Element;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
@ -454,7 +454,7 @@ public class SnippetProcess {
public void run() {

// start fetching urls and snippets
URIMetadataRow page;
URIMetadata page;
ResultEntry resultEntry;
//final int fetchAhead = snippetMode == 0 ? 0 : 10;
final boolean nav_topics = SnippetProcess.this.query.navigators.equals("all") || SnippetProcess.this.query.navigators.indexOf("topics",0) >= 0;
@ -498,7 +498,7 @@ public class SnippetProcess {
String solrContent = null;
if (this.solr != null) {
SolrDocument sd = null;
final SolrDocumentList sdl = this.solr.get(SolrField.id.getSolrFieldName()+ ":" + ASCII.String(page.hash()), 0, 1);
final SolrDocumentList sdl = this.solr.query(SolrField.id.getSolrFieldName()+ ":" + ASCII.String(page.hash()), 0, 1);
if (!sdl.isEmpty()) {
sd = sdl.get(0);
}
@ -553,7 +553,7 @@ public class SnippetProcess {
}
}

protected ResultEntry fetchSnippet(final URIMetadataRow page, final String solrText, final CacheStrategy cacheStrategy) {
protected ResultEntry fetchSnippet(final URIMetadata page, final String solrText, final CacheStrategy cacheStrategy) {
// Snippet Fetching can has 3 modes:
// 0 - do not fetch snippets
// 1 - fetch snippets offline only

@ -34,7 +34,7 @@ import java.util.List;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.logging.Log;
@ -50,7 +50,7 @@ import net.yacy.search.index.Segment;
public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEntry> {

// payload objects
private final URIMetadataRow urlentry;
private final URIMetadata urlentry;
private String alternative_urlstring;
private String alternative_urlname;
private final TextSnippet textSnippet;
@ -60,7 +60,7 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
// statistic objects
public long dbRetrievalTime, snippetComputationTime, ranking;

public ResultEntry(final URIMetadataRow urlentry,
public ResultEntry(final URIMetadata urlentry,
final Segment indexSegment,
SeedDB peers,
final TextSnippet textSnippet,

@ -45,7 +45,7 @@ import net.yacy.document.SnippetExtractor;
import net.yacy.document.WordTokenizer;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.order.Base64Order;
@ -146,7 +146,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
public TextSnippet(
final LoaderDispatcher loader,
final String solrText,
final URIMetadataRow row,
final URIMetadata row,
final HandleSet queryhashes,
final CacheStrategy cacheStrategy,
final boolean pre,

@ -155,7 +155,7 @@ public class EmbeddedSolrConnector extends AbstractSolrConnector implements Solr
solrdoc.addSolr(SolrField.text_t, "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.");
solr.add(solrdoc);
SolrServlet.startServer("/solr", 8091, solr);
SolrDocumentList searchresult = solr.get(SolrField.text_t.name() + ":tempor", 0, 10);
SolrDocumentList searchresult = solr.query(SolrField.text_t.name() + ":tempor", 0, 10);
for (SolrDocument d : searchresult) {
System.out.println(d.toString());
}

@ -1,4 +1,3 @@
package net.yacy;
// yacy.java
// -----------------------
// (C) by Michael Peter Christen; mc@yacy.net
@ -23,8 +22,8 @@ package net.yacy;
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy;

//import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
@ -61,7 +60,7 @@ import net.yacy.cora.sorting.ScoreMap;
import net.yacy.gui.YaCyApp;
import net.yacy.gui.framework.Browser;
import net.yacy.kelondro.blob.MapDataMining;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.logging.Log;
@ -657,11 +656,13 @@ public final class yacy {
log.logInfo("STARTING URL CLEANUP");

// db containing all currently loades urls
final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexPrimaryRoot, networkName), "TEXT"), "text.urlmd", false, false);
final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexPrimaryRoot, networkName), "TEXT"));
currentUrlDB.connectUrlDb(Segment.UrlDbName, false, false);

// db used to hold all neede urls
final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT"), "text.urlmd", false, false);

final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT"));
minimizedUrlDB.connectUrlDb(Segment.UrlDbName, false, false);

final int cacheMem = (int)(MemoryControl.maxMemory() - MemoryControl.total());
if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");

@ -669,7 +670,14 @@ public final class yacy {
log,
new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"),
10000,
Integer.MAX_VALUE, false, false, false);
Integer.MAX_VALUE,
false, // useTailCache
false, // exceed134217727
false, // connectLocalSolr
false, // useCitationIndex
true, // useRWI
true // useMetadata
);
final Iterator<ReferenceContainer<WordReference>> indexContainerIterator = wordIndex.termIndex().referenceContainerIterator("AAAAAAAAAAAA".getBytes(), false, false);

long urlCounter = 0, wordCounter = 0;
@ -689,7 +697,7 @@ public final class yacy {
iEntry = wordIdxEntries.next();
final byte[] urlHash = iEntry.urlhash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
final URIMetadataRow urlEntry = currentUrlDB.load(urlHash);
final URIMetadata urlEntry = currentUrlDB.load(urlHash);
urlCounter++;
minimizedUrlDB.store(urlEntry);
if (urlCounter % 500 == 0) {
@ -829,7 +837,8 @@ public final class yacy {
final File root = dataHome;
final File indexroot = new File(root, "DATA/INDEX");
try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexroot, networkName), "TEXT"), "text.urlmd", false, false);
final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexroot, networkName), "TEXT"));
currentUrlDB.connectUrlDb(Segment.UrlDbName, false, false);
currentUrlDB.deadlinkCleaner();
currentUrlDB.close();
}
@ -849,7 +858,14 @@ public final class yacy {
log,
new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"),
10000,
Integer.MAX_VALUE, false, false, false);
Integer.MAX_VALUE,
false, // useTailCache
false, // exceed134217727
false, // connectLocalSolr
false, // useCitationIndex
true, // useRWI
true // useMetadata
);
indexContainerIterator = WordIndex.termIndex().referenceContainerIterator(wordChunkStartHash.getBytes(), false, false);
}
int counter = 0;
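Note on the yacy.java hunks above: MetadataRepository is now constructed on a path alone and the url table is attached afterwards with connectUrlDb(), which is what allows the metadata part of the index to be switched off. The following sketch is not part of the commit; it mirrors the deadlink cleanup calls above, with a placeholder path and the MetadataRepository package assumed.

import java.io.File;

import net.yacy.search.index.MetadataRepository; // package assumed, not shown in these hunks
import net.yacy.search.index.Segment;

public class UrlDbCleanupSketch {

    // Two-step setup introduced by this commit: construct the repository on the
    // index path only, then attach the url db; Segment.UrlDbName is "text.urlmd".
    public static void cleanDeadlinks(final File textIndexPath) {
        final MetadataRepository urlDB = new MetadataRepository(textIndexPath);
        urlDB.connectUrlDb(Segment.UrlDbName, false, false); // useTailCache = false, exceed134217727 = false
        urlDB.deadlinkCleaner();
        urlDB.close();
    }
}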