Mirror of https://github.com/yacy/yacy_search_server.git, synced 2025-05-13 22:09:33 -04:00
Migrated the index export methods from the old metadata database to Solr; exports are now done with Solr queries. Removed superfluous methods and servlets.

This commit is contained in:
parent 1768c82010
commit 0fe7b6fd3b
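The core pattern of this commit: operations that previously iterated the local metadata index URL by URL become single Solr requests. A minimal standalone sketch of the delete-by-query half of that pattern, using the stock SolrJ client rather than YaCy's internal SolrConnector; the server URL and core name are placeholders, and host_s is the YaCy schema field used throughout this diff:

    import java.io.IOException;

    import org.apache.solr.client.solrj.SolrClient;
    import org.apache.solr.client.solrj.SolrServerException;
    import org.apache.solr.client.solrj.impl.HttpSolrClient;

    public class DeleteDomainSketch {
        public static void main(String[] args) throws IOException, SolrServerException {
            // placeholder core URL; YaCy talks to an embedded Solr instead
            SolrClient solr = new HttpSolrClient.Builder("http://localhost:8983/solr/collection1").build();
            // one delete-by-query replaces collecting and removing url hashes one by one
            solr.deleteByQuery("host_s:\"example.com\"");
            solr.commit();
            solr.close();
        }
    }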
@@ -124,10 +124,9 @@ public class CrawlResults {
         if (post.containsKey("deletedomain")) {
             final String domain = post.get("domain", null);
-            final String hashpart = domain == null ? null : DigestURI.hosthash6(domain);
-            if (hashpart != null) {
-                sb.index.fulltext().deleteDomain(hashpart, null, false);
-                ResultURLs.deleteDomain(tabletype, domain, hashpart);
+            if (domain != null) {
+                sb.index.fulltext().deleteDomainHostname(domain, null, false);
+                ResultURLs.deleteDomain(tabletype, domain);
             }
         }

@@ -294,7 +294,7 @@ public class Crawler_p {
                 siteFilter = CrawlProfile.siteFilter(rootURLs);
                 if (deleteold) {
                     for (DigestURI u: rootURLs) {
-                        int count = sb.index.fulltext().deleteDomain(u.hosthash(), deleteageDate, rootURLs.size() > 1);
+                        int count = sb.index.fulltext().deleteDomainHashpart(u.hosthash(), deleteageDate, rootURLs.size() > 1);
                         if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
                     }
                 }

@@ -77,7 +77,6 @@ function updatepage(str) {
       <dt class="TableCellDark">Retrieve by URL-Hash:</dt>
       <dd><input type="text" name="urlhash" value="#[urlhash]#" size="40" maxlength="12" />
         <input type="submit" name="urlhashsearch" value="Show Details for URL-Hash" class="submitready" style="width:240px;"/>
-        <input type="submit" name="urlhashsimilar" value="Generate List" class="submitready" style="width:240px;"/>
       </dd>
     </dl>
   </fieldset>
@@ -132,7 +131,7 @@ function updatepage(str) {
       <td>
         <form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
           <div>
-            <input type="hidden" name="hashpart" value="#[hashpart]#" />
+            <input type="hidden" name="domain" value="#[domain]#" />
             <input type="hidden" name="lines" value="#[lines]#" />
             <input type="submit" name="deletedomain" value="delete all" class="submitready" style="width:240px;"/>
           </div>
@@ -206,13 +205,6 @@ function updatepage(str) {
 <div class="commit">Stored a solr dump to file #[dumpfile]#</div>::
 #(/indexdump)#

-#(urlhashsimilar)#::<p>Sequential List of URL-Hashes:<br />
-#{rows}#
-#{cols}#<a href="/IndexControlURLs_p.html?urlhash=#[urlHash]#&urlhashsearch=1" class="tt">#[urlHash]#</a> #{/cols}#<br />
-#{/rows}#
-</p>
-#(/urlhashsimilar)#
-
 #(genUrlProfile)#
 ::No entry found for URL-hash #[urlhash]#
 ::<iframe src="/api/yacydoc.html?urlhash=#[urlhash]#" width="100%" height="420" frameborder="0" scrolling="no"></iframe><br />
@@ -30,13 +30,15 @@ import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;

 import net.yacy.cora.date.GenericFormatter;
 import net.yacy.cora.document.ASCII;
+import net.yacy.cora.federate.solr.YaCySchema;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.lod.JenaTripleStore;
 import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.protocol.RequestHeader;
+import net.yacy.cora.sorting.ReversibleScoreMap;
 import net.yacy.crawler.data.Cache;
 import net.yacy.crawler.data.ResultURLs;
 import net.yacy.data.WorkTables;
@@ -44,7 +46,6 @@ import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.logging.Log;
-import net.yacy.kelondro.util.RotateIterator;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Fulltext;
 import net.yacy.search.index.Segment;
@@ -236,30 +237,6 @@ public class IndexControlURLs_p {
             }
         }

-        // generate list
-        if (post.containsKey("urlhashsimilar")) {
-            final Iterator<DigestURI> entryIt = new RotateIterator<DigestURI>(segment.fulltext().urls(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount());
-            final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
-            DigestURI entry;
-            int i = 0, rows = 0, cols = 0;
-            prop.put("urlhashsimilar", "1");
-            while (entryIt.hasNext() && i < 256) {
-                entry = entryIt.next();
-                if (entry == null) break;
-                prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", ASCII.String(entry.hash()));
-                cols++;
-                if (cols==8) {
-                    prop.put("urlhashsimilar_rows_"+rows+"_cols", cols);
-                    cols = 0;
-                    rows++;
-                }
-                i++;
-            }
-            prop.put("statistics", 0);
-            prop.put("urlhashsimilar_rows", rows);
-            prop.put("result", result.toString());
-        }
-
         if (post.containsKey("lurlexport")) {
             // parse format
             int format = 0;
@@ -279,7 +256,7 @@ public class IndexControlURLs_p {
             final File f = new File(s);
             f.getParentFile().mkdirs();
             final String filter = post.get("exportfilter", ".*");
-            final Fulltext.Export running = segment.fulltext().export(f, filter, null, format, dom);
+            final Fulltext.Export running = segment.fulltext().export(f, filter, format, dom);

             prop.put("lurlexport_exportfile", s);
             prop.put("lurlexport_urlcount", running.count());
@@ -301,29 +278,29 @@ public class IndexControlURLs_p {
         }

         if (post.containsKey("deletedomain")) {
-            final String hp = post.get("hashpart");
-            segment.fulltext().deleteDomain(hp, null, false);
+            final String domain = post.get("domain");
+            segment.fulltext().deleteDomainHostname(domain, null, false);
             // trigger the loading of the table
             post.put("statistics", "");
         }

         if (post.containsKey("statistics")) {
             final int count = post.getInt("lines", 100);
-            Iterator<Fulltext.HostStat> statsiter;
             prop.put("statistics_lines", count);
             int cnt = 0;
             try {
                 final Fulltext metadata = segment.fulltext();
-                statsiter = metadata.statistics(count, metadata.urlSampleScores(metadata.domainSampleCollector()));
+                Map<String, ReversibleScoreMap<String>> scores = metadata.getSolr().getFacets(YaCySchema.httpstatus_i.getSolrFieldName() + ":200", count, YaCySchema.host_s.getSolrFieldName());
+                ReversibleScoreMap<String> stats = scores.get(YaCySchema.host_s.getSolrFieldName());
+                Iterator<String> statsiter = stats.keys(false);
                 boolean dark = true;
-                Fulltext.HostStat hs;
+                String hostname;
                 prop.put("statisticslines_domains_" + cnt + "lines", count);
                 while (statsiter.hasNext() && cnt < count) {
-                    hs = statsiter.next();
+                    hostname = statsiter.next();
                     prop.put("statisticslines_domains_" + cnt + "_dark", (dark) ? "1" : "0");
-                    prop.put("statisticslines_domains_" + cnt + "_domain", hs.hostname + ((hs.port == 80) ? "" : ":" + hs.port));
-                    prop.put("statisticslines_domains_" + cnt + "_hashpart", hs.hosthash);
-                    prop.put("statisticslines_domains_" + cnt + "_count", hs.count);
+                    prop.put("statisticslines_domains_" + cnt + "lines", count);
+                    prop.put("statisticslines_domains_" + cnt + "_domain", hostname);
+                    prop.put("statisticslines_domains_" + cnt + "_count", stats.get(hostname));
                     dark = !dark;
                     cnt++;
                 }
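Note how the statistics table above is now fed by a Solr facet over host_s, restricted to successfully loaded documents, instead of sampling the metadata file. A hedged SolrJ equivalent of the getFacets call (server URL and facet limit are placeholders):

    import org.apache.solr.client.solrj.SolrClient;
    import org.apache.solr.client.solrj.SolrQuery;
    import org.apache.solr.client.solrj.impl.HttpSolrClient;
    import org.apache.solr.client.solrj.response.FacetField;

    public class HostStatisticsSketch {
        public static void main(String[] args) throws Exception {
            SolrClient solr = new HttpSolrClient.Builder("http://localhost:8983/solr/collection1").build();
            SolrQuery query = new SolrQuery("httpstatus_i:200"); // only successfully loaded documents
            query.setRows(0);              // counts only, no document bodies
            query.setFacet(true);
            query.addFacetField("host_s"); // documents per host
            query.setFacetLimit(100);      // plays the role of the "lines" parameter above
            FacetField hosts = solr.query(query).getFacetField("host_s");
            for (FacetField.Count c : hosts.getValues()) {
                System.out.println(c.getName() + ": " + c.getCount());
            }
            solr.close();
        }
    }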
@@ -13,13 +13,4 @@
 #(indexdump)#::
 <dumpfile>#[dumpfile]#</dumpfile>::
 #(/indexdump)#
-#(urlhashsimilar)#::
-<urls>
-#{rows}#
-#{cols}#
-<urlhash>#[urlHash]#</urlhash>
-#{/cols}#
-#{/rows}#
-</urls>
-#(/urlhashsimilar)#
 </data>
@@ -1,70 +0,0 @@
-import java.io.File;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-
-import net.yacy.cora.protocol.RequestHeader;
-import net.yacy.kelondro.logging.Log;
-import net.yacy.kelondro.rwi.ReferenceContainerCache;
-import net.yacy.kelondro.util.MemoryControl;
-import net.yacy.peers.graphics.WebStructureGraph.HostReference;
-import net.yacy.search.Switchboard;
-import net.yacy.search.index.Fulltext;
-import net.yacy.search.index.Fulltext.HostStat;
-import net.yacy.search.index.Segment;
-import net.yacy.search.ranking.BlockRank;
-import net.yacy.server.serverObjects;
-import net.yacy.server.serverSwitch;
-import net.yacy.server.servletProperties;
-
-public class YBRFetch_p
-{
-
-    public static servletProperties respond(
-        @SuppressWarnings("unused") final RequestHeader requestHeader,
-        final serverObjects post,
-        final serverSwitch env) {
-        final servletProperties prop = new servletProperties();
-        final Switchboard sb = (Switchboard) env;
-
-        if ( post == null || !post.containsKey("ghrt4") || MemoryControl.available() < 1024L * 1024L * 1024L ) {
-            return prop;
-        }
-        final File hostIndexFile = new File(sb.queuesRoot, "hostIndex.blob");
-
-        ReferenceContainerCache<HostReference> hostIndex; // this will get large, more than 0.5 million entries by now
-        if ( !hostIndexFile.exists() ) {
-            hostIndex = BlockRank.collect(sb.peers, sb.webStructure, Integer.MAX_VALUE);
-            BlockRank.saveHostIndex(hostIndex, hostIndexFile);
-        } else {
-            hostIndex = BlockRank.loadHostIndex(hostIndexFile);
-        }
-
-        // use an index segment to find hosts for given host hashes
-        final Segment segment = sb.index;
-        final Fulltext metadata = segment.fulltext();
-        Map<String, HostStat> hostHashResolver;
-        try {
-            hostHashResolver = metadata.domainHashResolver(metadata.domainSampleCollector());
-        } catch ( final IOException e ) {
-            hostHashResolver = new HashMap<String, HostStat>();
-        }
-
-        // recursively compute a new ranking table
-        Log.logInfo("BLOCK RANK", "computing new ranking tables...");
-        BlockRank.ybrTables = BlockRank.evaluate(hostIndex, hostHashResolver, null, 0);
-        hostIndex = null; // we don't need that here any more, so free the memory
-
-        // use the web structure and the hostHash resolver to analyse the ranking table
-        Log.logInfo("BLOCK RANK", "analysis of " + BlockRank.ybrTables.length + " tables...");
-        BlockRank.analyse(sb.webStructure, hostHashResolver);
-        // store the new table
-        Log.logInfo("BLOCK RANK", "storing fresh table...");
-        final File rankingPath = new File(sb.appPath, "ranking/YBR".replace('/', File.separatorChar));
-        BlockRank.storeBlockRankTable(rankingPath);
-        BlockRank.loadBlockRankTable(rankingPath, 16);
-
-        return prop;
-    }
-
-}
@@ -143,17 +143,8 @@ public final class ResultURLs {
         return getDomains(stack).keys(false);
     }

-    public static int deleteDomain(final EventOrigin stack, final String host, final String hosthash) {
+    public static int deleteDomain(final EventOrigin stack, final String host) {
         assert host != null : "host = null";
-        assert hosthash.length() == 6;
-        final Iterator<Map.Entry<String, InitExecEntry>> i = results(stack);
-        Map.Entry<String, InitExecEntry> w;
-        String urlhash;
-        while (i.hasNext()) {
-            w = i.next();
-            urlhash = w.getKey();
-            if (urlhash == null || urlhash.substring(6).equals(hosthash)) i.remove();
-        }
         assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
         return getDomains(stack).delete(host);
     }

@@ -34,9 +34,9 @@ import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
-import java.util.TreeSet;
+import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Pattern;

 import net.yacy.cora.date.GenericFormatter;
 import net.yacy.cora.date.ISO8601Formatter;
@@ -49,8 +49,8 @@ import net.yacy.cora.federate.solr.connector.MirrorSolrConnector;
 import net.yacy.cora.federate.solr.connector.SolrConnector;
 import net.yacy.cora.order.CloneableIterator;
-import net.yacy.cora.sorting.ConcurrentScoreMap;
+import net.yacy.cora.sorting.ReversibleScoreMap;
 import net.yacy.cora.sorting.ScoreMap;
 import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.storage.ZIPReader;
 import net.yacy.cora.storage.ZIPWriter;
 import net.yacy.document.parser.html.CharacterCoding;
@@ -64,15 +64,15 @@ import net.yacy.kelondro.index.Row;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.table.SplitTable;
 import net.yacy.kelondro.util.MemoryControl;
-import net.yacy.kelondro.util.MergeIterator;
 import net.yacy.search.Switchboard;

+import org.apache.commons.httpclient.util.DateUtil;
 import org.apache.lucene.util.Version;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrInputDocument;

-public final class Fulltext implements Iterable<byte[]> {
+public final class Fulltext {

     private static final String SOLR_PATH = "solr_40"; // the number should be identical to the number in the property luceneMatchVersion in solrconfig.xml
     private static final String SOLR_OLD_PATH[] = new String[]{"solr_36"};
@@ -359,7 +359,7 @@ public final class Fulltext implements Iterable<byte[]> {
      * @return number of deleted domains
      * @throws IOException
      */
-    public int deleteDomain(final String hosthash, Date freshdate, boolean concurrent) {
+    public int deleteDomainHashpart(final String hosthash, Date freshdate, boolean concurrent) {
         // first collect all url hashes that belong to the domain
         assert hosthash.length() == 6;
         final String q = YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" +
@@ -412,6 +412,38 @@ public final class Fulltext implements Iterable<byte[]> {
         return count.get();
     }

+    public int deleteDomainHostname(final String hostname, Date freshdate, boolean concurrent) {
+        // first collect all url hashes that belong to the domain
+        final String q = YaCySchema.host_s.getSolrFieldName() + ":\"" + hostname + "\"" +
+                ((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
+        final AtomicInteger count = new AtomicInteger(0);
+        Thread t = new Thread() {
+            public void run() {
+                // delete in solr
+                synchronized (Fulltext.this.solr) {
+                    try {
+                        count.addAndGet(Fulltext.this.solr.deleteByQuery(q));
+                        if (count.get() > 0) Fulltext.this.solr.commit(true);
+                    } catch (IOException e) {}
+                }
+                // finally remove the line with statistics
+                if (Fulltext.this.statsDump != null) {
+                    final Iterator<HostStat> hsi = Fulltext.this.statsDump.iterator();
+                    HostStat hs;
+                    while (hsi.hasNext()) {
+                        hs = hsi.next();
+                        if (hs.hostname.equals(hostname)) {
+                            hsi.remove();
+                            break;
+                        }
+                    }
+                }
+            }
+        };
+        if (concurrent) t.start(); else t.run();
+        return count.get();
+    }
+
     /**
      * remove a full subpath from the index
      * @param subpath the left path of the url; at least until the end of the host
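The freshdate parameter of the new deleteDomainHostname (and of deleteDomainHashpart) narrows the delete with a date-range clause on load_date_dt. A small sketch of how such a clause can be assembled, with a plain SimpleDateFormat standing in for YaCy's ISO8601Formatter:

    import java.text.SimpleDateFormat;
    import java.util.Date;
    import java.util.Locale;
    import java.util.TimeZone;

    public class FreshdateClauseSketch {
        public static void main(String[] args) {
            SimpleDateFormat iso = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.US);
            iso.setTimeZone(TimeZone.getTimeZone("UTC")); // Solr expects UTC timestamps
            Date freshdate = new Date(System.currentTimeMillis() - 24L * 60 * 60 * 1000); // e.g. one day ago
            String q = "host_s:\"example.com\""
                     + " AND load_date_dt:[* TO " + iso.format(freshdate) + "]"; // spares documents loaded after freshdate
            System.out.println(q); // would be passed to deleteByQuery(q)
        }
    }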
@@ -510,96 +542,6 @@ public final class Fulltext implements Iterable<byte[]> {
         if (reason == null) return null;
         return reason == null ? null : reason.length() == 0 ? null : reason;
     }

-    @Override
-    public Iterator<byte[]> iterator() {
-        CloneableIterator<byte[]> a = null;
-        if (this.urlIndexFile != null) try {a = this.urlIndexFile.keys(true, null);} catch (IOException e) {}
-        final Iterator<String> idi = this.solr.iterator();
-        CloneableIterator<byte[]> b = new CloneableIterator<byte[]>() {
-            @Override
-            public boolean hasNext() {
-                return idi.hasNext();
-            }
-            @Override
-            public byte[] next() {
-                String s = idi.next();
-                return s == null ? null : ASCII.getBytes(s);
-            }
-            @Override
-            public void remove() {
-                throw new UnsupportedOperationException();
-            }
-            @Override
-            public CloneableIterator<byte[]> clone(Object modifier) {
-                return this;
-            }
-            @Override
-            public void close() {
-            }
-        };
-        if (a == null) return b;
-        return new MergeIterator<byte[]>(a, b,
-                URIMetadataRow.rowdef.objectOrder,
-                MergeIterator.simpleMerge,
-                true);
-    }
-
-    public CloneableIterator<DigestURI> urls() {
-        // enumerates entry elements
-        final Iterator<byte[]> ids = iterator();
-        return new CloneableIterator<DigestURI>() {
-            @Override
-            public CloneableIterator<DigestURI> clone(final Object secondHash) {
-                return this;
-            }
-            @Override
-            public final boolean hasNext() {
-                return ids.hasNext();
-            }
-            @Override
-            public final DigestURI next() {
-                byte[] id = ids.next();
-                if (id == null) return null;
-                return getURL(id);
-            }
-            @Override
-            public final void remove() {
-                ids.remove();
-            }
-            @Override
-            public void close() {
-            }
-        };
-    }
-
-    public CloneableIterator<URIMetadataNode> entries() {
-        // enumerates entry elements
-        final Iterator<byte[]> ids = iterator();
-        return new CloneableIterator<URIMetadataNode>() {
-            @Override
-            public CloneableIterator<URIMetadataNode> clone(final Object secondHash) {
-                return this;
-            }
-            @Override
-            public final boolean hasNext() {
-                return ids.hasNext();
-            }
-            @Override
-            public final URIMetadataNode next() {
-                byte[] id = ids.next();
-                if (id == null) return null;
-                return getMetadata(id);
-            }
-            @Override
-            public final void remove() {
-                ids.remove();
-            }
-            @Override
-            public void close() {
-            }
-        };
-    }
-
     public List<File> dumpFiles() {
         EmbeddedSolrConnector esc = (EmbeddedSolrConnector) this.solr.getSolr0();
@@ -675,12 +617,12 @@ public final class Fulltext implements Iterable<byte[]> {
     }

     // export methods
-    public Export export(final File f, final String filter, final HandleSet set, final int format, final boolean dom) {
+    public Export export(final File f, final String filter, final int format, final boolean dom) {
         if ((this.exportthread != null) && (this.exportthread.isAlive())) {
             Log.logWarning("LURL-EXPORT", "cannot start another export thread, already one running");
             return this.exportthread;
         }
-        this.exportthread = new Export(f, filter, set, format, dom);
+        this.exportthread = new Export(f, filter, format, dom);
         this.exportthread.start();
         return this.exportthread;
     }
@@ -691,22 +633,20 @@ public final class Fulltext implements Iterable<byte[]> {

     public class Export extends Thread {
         private final File f;
-        private final String filter;
+        private final Pattern pattern;
         private int count;
         private String failure;
         private final int format;
         private final boolean dom;
-        private final HandleSet set;

-        private Export(final File f, final String filter, final HandleSet set, final int format, boolean dom) {
+        private Export(final File f, final String filter, final int format, boolean dom) {
             // format: 0=text, 1=html, 2=rss/xml
             this.f = f;
-            this.filter = filter;
+            this.pattern = filter == null ? null : Pattern.compile(filter);
             this.count = 0;
             this.failure = null;
             this.format = format;
             this.dom = dom;
-            this.set = set;
             if ((dom) && (format == 2)) dom = false;
         }

@@ -724,43 +664,54 @@ public final class Fulltext implements Iterable<byte[]> {
                     pw.println("<?xml-stylesheet type='text/xsl' href='/yacysearch.xsl' version='1.0'?>");
                     pw.println("<rss version=\"2.0\" xmlns:yacy=\"http://www.yacy.net/\" xmlns:opensearch=\"http://a9.com/-/spec/opensearch/1.1/\" xmlns:atom=\"http://www.w3.org/2005/Atom\">");
                     pw.println("<channel>");
-                    pw.println("<title>YaCy Peer-to-Peer - Web-Search LURL Export</title>");
+                    pw.println("<title>YaCy Peer-to-Peer - Web-Search URL Export</title>");
                     pw.println("<description></description>");
                     pw.println("<link>http://yacy.net</link>");
                 }

                 if (this.dom) {
-                    final TreeSet<String> set = domainNameCollector(-1, domainSampleCollector());
-                    for (final String host: set) {
-                        if (!host.matches(this.filter)) continue;
+                    Map<String, ReversibleScoreMap<String>> scores = Fulltext.this.getSolr().getFacets(YaCySchema.httpstatus_i.getSolrFieldName() + ":200", 100000, YaCySchema.host_s.getSolrFieldName());
+                    ReversibleScoreMap<String> stats = scores.get(YaCySchema.host_s.getSolrFieldName());
+                    for (final String host: stats) {
+                        if (this.pattern != null && !this.pattern.matcher(host).matches()) continue;
                         if (this.format == 0) pw.println(host);
                         if (this.format == 1) pw.println("<a href=\"http://" + host + "\">" + host + "</a><br>");
                         this.count++;
                     }
                 } else {
-                    final Iterator<URIMetadataNode> i = entries(); // iterates indexURLEntry objects
-                    URIMetadataNode entry;
-                    String url;
-                    while (i.hasNext()) {
-                        entry = i.next();
-                        if (this.set != null && !this.set.has(entry.hash())) continue;
-                        url = entry.url().toNormalform(true);
-                        if (!url.matches(this.filter)) continue;
+                    BlockingQueue<SolrDocument> docs = Fulltext.this.getSolr().concurrentQuery(YaCySchema.httpstatus_i.getSolrFieldName() + ":200", 0, 100000000, 10 * 60 * 60 * 1000, 100,
+                            YaCySchema.id.getSolrFieldName(), YaCySchema.sku.getSolrFieldName(), YaCySchema.title.getSolrFieldName(),
+                            YaCySchema.author.getSolrFieldName(), YaCySchema.description.getSolrFieldName(), YaCySchema.size_i.getSolrFieldName(), YaCySchema.last_modified.getSolrFieldName());
+                    SolrDocument doc;
+                    ArrayList<?> title;
+                    String url, author, description, hash;
+                    Integer size;
+                    Date date;
+                    while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
+                        hash = (String) doc.getFieldValue(YaCySchema.id.getSolrFieldName());
+                        url = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
+                        title = (ArrayList<?>) doc.getFieldValue(YaCySchema.title.getSolrFieldName());
+                        author = (String) doc.getFieldValue(YaCySchema.author.getSolrFieldName());
+                        description = (String) doc.getFieldValue(YaCySchema.description.getSolrFieldName());
+                        size = (Integer) doc.getFieldValue(YaCySchema.size_i.getSolrFieldName());
+                        date = (Date) doc.getFieldValue(YaCySchema.last_modified.getSolrFieldName());
+                        if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
                         if (this.format == 0) {
                             pw.println(url);
                         }
                         if (this.format == 1) {
-                            pw.println("<a href=\"" + url + "\">" + CharacterCoding.unicode2xml(entry.dc_title(), true) + "</a><br>");
+                            if (title != null) pw.println("<a href=\"" + MultiProtocolURI.escape(url) + "\">" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + "</a>");
                         }
                         if (this.format == 2) {
                             pw.println("<item>");
-                            pw.println("<title>" + CharacterCoding.unicode2xml(entry.dc_title(), true) + "</title>");
+                            if (title != null) pw.println("<title>" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + "</title>");
                             pw.println("<link>" + MultiProtocolURI.escape(url) + "</link>");
-                            if (!entry.dc_creator().isEmpty()) pw.println("<author>" + CharacterCoding.unicode2xml(entry.dc_creator(), true) + "</author>");
-                            if (!entry.dc_subject().isEmpty()) pw.println("<description>" + CharacterCoding.unicode2xml(entry.dc_subject(), true) + "</description>");
-                            pw.println("<pubDate>" + entry.moddate().toString() + "</pubDate>");
-                            pw.println("<yacy:size>" + entry.size() + "</yacy:size>");
-                            pw.println("<guid isPermaLink=\"false\">" + ASCII.String(entry.hash()) + "</guid>");
+                            if (author != null && !author.isEmpty()) pw.println("<author>" + CharacterCoding.unicode2xml(author, true) + "</author>");
+                            if (description != null && !description.isEmpty()) pw.println("<description>" + CharacterCoding.unicode2xml(description, true) + "</description>");
+                            if (date != null) pw.println("<pubDate>" + DateUtil.formatDate(date) + "</pubDate>");
+                            if (size != null) pw.println("<yacy:size>" + size.intValue() + "</yacy:size>");
+                            pw.println("<guid isPermaLink=\"false\">" + hash + "</guid>");
                             pw.println("</item>");
                         }
                         this.count++;
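Instead of iterating URIMetadataNode entries, the export above streams SolrDocuments from a long-running query (concurrentQuery with a poison-document sentinel). A rough standalone stand-in using plain SolrJ with simple start/rows paging in place of YaCy's producer thread; URL, page size, and field list are illustrative (sku is the schema field holding the URL):

    import org.apache.solr.client.solrj.SolrClient;
    import org.apache.solr.client.solrj.SolrQuery;
    import org.apache.solr.client.solrj.impl.HttpSolrClient;
    import org.apache.solr.common.SolrDocument;
    import org.apache.solr.common.SolrDocumentList;

    public class ExportSketch {
        public static void main(String[] args) throws Exception {
            SolrClient solr = new HttpSolrClient.Builder("http://localhost:8983/solr/collection1").build();
            SolrQuery query = new SolrQuery("httpstatus_i:200");
            query.setFields("id", "sku", "title");
            query.setRows(100); // page size; the diff uses a BlockingQueue fed by a producer thread instead
            int start = 0;
            while (true) {
                query.setStart(start);
                SolrDocumentList page = solr.query(query).getResults();
                for (SolrDocument doc : page) {
                    System.out.println(doc.getFieldValue("sku")); // format 0: plain URL list
                }
                start += page.size();
                if (page.isEmpty() || start >= page.getNumFound()) break;
            }
            solr.close();
        }
    }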
@@ -798,60 +749,6 @@ public final class Fulltext implements Iterable<byte[]> {

     }

-    /**
-     * collect domain samples: all url hashes from the metadata database is listed and the domain part
-     * of the url hashes is used to count how many of these domain hashes appear
-     * @return a map from domain hashes to hash statistics
-     * @throws IOException
-     */
-    public Map<String, URLHashCounter> domainSampleCollector() throws IOException {
-        final Map<String, URLHashCounter> map = new HashMap<String, URLHashCounter>();
-        // first collect all domains and calculate statistics about it
-        synchronized (this) {
-            final Iterator<byte[]> i = this.iterator();
-            String hosthash;
-            byte[] urlhashb;
-            URLHashCounter ds;
-            if (i != null) while (i.hasNext()) {
-                urlhashb = i.next();
-                hosthash = ASCII.String(urlhashb, 6, 6);
-                ds = map.get(hosthash);
-                if (ds == null) {
-                    ds = new URLHashCounter(urlhashb);
-                    map.put(hosthash, ds);
-                } else {
-                    ds.count++;
-                }
-            }
-        }
-        return map;
-    }
-
-    /**
-     * create a list of domain names in this database
-     * @param count number of entries or -1 for all
-     * @param domainSamples a map from domain hashes to hash statistics
-     * @return a set of domain names, ordered by name of the domains
-     */
-    private TreeSet<String> domainNameCollector(int count, final Map<String, URLHashCounter> domainSamples) {
-        // collect hashes from all domains
-
-        // fetch urls from the database to determine the host in clear text
-        DigestURI url;
-        if (count < 0 || count > domainSamples.size()) count = domainSamples.size();
-        this.statsDump = new ArrayList<HostStat>();
-        final TreeSet<String> set = new TreeSet<String>();
-        for (final URLHashCounter hs: domainSamples.values()) {
-            if (hs == null) continue;
-            url = this.getURL(hs.urlhashb);
-            if (url == null || url.getHost() == null) continue;
-            set.add(url.getHost());
-            count--;
-            if (count == 0) break;
-        }
-        return set;
-    }
-
     /**
      * calculate a score map for url hash samples: each sample is a single url hash
      * that stands for all entries for the corresponding domain. The map counts the number
@@ -246,7 +246,8 @@ public class QueryGoal {
         q.append(')');

         // add filter to prevent that results come from failed urls
-        q.append(" AND -").append(YaCySchema.failreason_t.getSolrFieldName()).append(":[* TO *]");
+        q.append(" AND ").append(YaCySchema.httpstatus_i.getSolrFieldName()).append(":200");
+        //q.append(" AND -").append(YaCySchema.failreason_t.getSolrFieldName()).append(":[* TO *]");

         return q;
     }
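The net effect on a generated search query is just the trailing filter clause; a minimal illustration (the goal clause text_t:"yacy" is hypothetical, only the appended filter mirrors this change):

    public class StatusFilterSketch {
        public static void main(String[] args) {
            StringBuilder q = new StringBuilder("(text_t:\"yacy\")"); // hypothetical query goal
            // before: exclude documents that carry a fail reason
            //   q.append(" AND -failreason_t:[* TO *]");
            // after: require a successful HTTP status instead
            q.append(" AND httpstatus_i:200");
            System.out.println(q); // (text_t:"yacy") AND httpstatus_i:200
        }
    }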