mirror of
https://github.com/yacy/yacy_search_server.git
synced 2025-07-22 09:14:38 -04:00
added domain list extraction and html export format
to URL administration menu http://localhost:8080/IndexControlURLs_p.html
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4228 6c8d7289-2bf4-0310-a012-ef5d649a1542
@@ -17,7 +17,7 @@
     <input type="submit" name="urlstringsearch" value="Show Details for URL" />
     </dd>
 
-    <dt class="TableCellDark">Rertieve by URL-Hash:</dt>
+    <dt class="TableCellDark">Retrieve by URL-Hash:</dt>
     <dd><input type="text" name="urlhash" value="#[urlhash]#" size="40" maxlength="12" />
     <input type="submit" name="urlhashsearch" value="Show Details for URL-Hash" />
     <input type="submit" name="urlhashsimilar" value="Generate List" />
@@ -51,8 +51,14 @@
     <dd><input type="text" name="exportfilter" value=".*.*" size="20" maxlength="250" />
     </dd>
     <dt class="TableCellDark">Export Format</dt>
-    <dd><input type="radio" name="format" value="rss" checked />XML (RSS)
-    <input type="radio" name="format" value="text" />Plain Text List (URLs only)
+    <dd>Only Domain:
+    <input type="radio" name="format" value="dom-text" />Plain Text List (domains only)
+    <input type="radio" name="format" value="dom-html" />HTML (domains as URLs, no title)<br>
+    Full URL List:
+    <input type="radio" name="format" value="url-text" />Plain Text List (URLs only)
+    <input type="radio" name="format" value="url-html" />HTML (URLs with title)
+    <input type="radio" name="format" value="url-rss" checked />XML (RSS)
+    </br>
     </dd>
     <dt class="TableCellLight"></dt>
     <dd><input type="submit" name="lurlexport" value="Export URLs" />
@@ -191,15 +191,25 @@ public class IndexControlURLs_p {
         }
 
         if (post.containsKey("lurlexport")) {
-            boolean rss = post.get("format", "text").equals("rss");
+            // parse format
+            int format = 0;
+            String fname = post.get("format", "url-text");
+            boolean dom = fname.startsWith("dom"); // if dom == false, complete urls are exported, otherwise only the domain
+            if (fname.endsWith("text")) format = 0;
+            if (fname.endsWith("html")) format = 1;
+            if (fname.endsWith("rss")) format = 2;
 
             // extend export file name
             String s = post.get("exportfile", "");
             if (s.indexOf('.') < 0) {
-                if (rss) s = s + ".xml"; else s = s + ".txt";
+                if (format == 0) s = s + ".txt";
+                if (format == 1) s = s + ".html";
+                if (format == 2) s = s + ".xml";
             }
             File f = new File(s);
             f.getParentFile().mkdirs();
             String filter = post.get("exportfilter", ".*");
-            boolean running = sb.wordIndex.loadedURL.export(f, filter, rss);
+            boolean running = sb.wordIndex.loadedURL.export(f, filter, format, dom);
 
             prop.put("lurlexport_exportfile", s);
             prop.put("lurlexport_urlcount", sb.wordIndex.loadedURL.export_count());
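The five radio values from the form decode along two axes: the name prefix selects the scope (dom = domains only, url = complete URLs) and the suffix selects the serialization. A minimal standalone sketch of the same decoding; the helper name and the println are illustrative only:

    // Hypothetical helper mirroring the parsing above:
    // "dom-text" -> format 0, domains only; "url-rss" -> format 2, complete URLs.
    static void decodeFormat(String fname) {
        boolean dom = fname.startsWith("dom");  // domains only vs. complete URLs
        int format = 0;                         // 0 = plain text
        if (fname.endsWith("html")) format = 1; // 1 = HTML
        if (fname.endsWith("rss")) format = 2;  // 2 = RSS/XML
        System.out.println(fname + " -> format=" + format + ", dom=" + dom);
    }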
@@ -267,7 +267,7 @@ public class yacysearch {
                 "",
                 20,
                 constraint,
-                false);
+                true);
         serverProfiling localTiming = new serverProfiling(4 * theQuery.maximumTime / 10, theQuery.displayResults());
 
         String client = (String) header.get("CLIENTIP"); // the search client who initiated the search
@@ -66,12 +66,14 @@ import de.anomic.http.httpc;
 import de.anomic.http.httpc.response;
 import de.anomic.index.indexRWIEntry;
 import de.anomic.index.indexURLEntry;
+import de.anomic.kelondro.kelondroBase64Order;
 import de.anomic.kelondro.kelondroCache;
 import de.anomic.kelondro.kelondroCloneableIterator;
 import de.anomic.kelondro.kelondroException;
 import de.anomic.kelondro.kelondroFlexSplitTable;
 import de.anomic.kelondro.kelondroIndex;
 import de.anomic.kelondro.kelondroRow;
+import de.anomic.kelondro.kelondroRowSet;
 import de.anomic.plasma.urlPattern.plasmaURLPattern;
 import de.anomic.server.serverCodings;
 import de.anomic.server.logging.serverLog;
@@ -534,12 +536,12 @@ public final class plasmaCrawlLURL {
 
     private exportc exportthread = null;
 
-    public boolean export(File f, String filter, boolean rss) {
+    public boolean export(File f, String filter, int format, boolean dom) {
         if ((exportthread != null) && (exportthread.isAlive())) {
             serverLog.logWarning("LURL-EXPORT", "cannot start another export thread, already one running");
             return false;
         }
-        this.exportthread = new exportc(f, filter, rss);
+        this.exportthread = new exportc(f, filter, format, dom);
         this.exportthread.start();
         return (this.exportthread.isAlive());
     }
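A sketch of a caller using the changed signature; the real caller is the IndexControlURLs_p servlet above, and the target path here is illustrative:

    // format: 0 = plain text, 1 = HTML, 2 = RSS/XML; dom = true exports domains only.
    File target = new File("DATA/EXPORT/urls.html");            // illustrative path
    boolean running = loadedURL.export(target, ".*", 1, false); // HTML, complete URL list
    if (!running) System.out.println("export refused: a previous export thread is still alive");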
@@ -569,21 +571,30 @@ public final class plasmaCrawlLURL {
         String filter;
         int count;
         String failure;
-        boolean rss;
+        int format;
+        boolean dom;
+        kelondroRowSet doms;
 
-        public exportc(File f, String filter, boolean rss) {
+        public exportc(File f, String filter, int format, boolean dom) {
+            // format: 0=text, 1=html, 2=rss/xml
             this.f = f;
             this.filter = filter;
             this.count = 0;
             this.failure = null;
-            this.rss = rss;
+            this.format = format;
+            this.dom = dom;
+            if ((dom) && (format == 2)) dom = false;
+            this.doms = new kelondroRowSet(new kelondroRow("String hash-6", kelondroBase64Order.enhancedCoder, 0), 0);
         }
 
         public void run() {
             try {
                 f.getParentFile().mkdirs();
                 PrintWriter pw = new PrintWriter(new BufferedOutputStream(new FileOutputStream(f)));
-                if (rss) {
+                if (format == 1) {
+                    pw.println("<html><head></head><body>");
+                }
+                if (format == 2) {
                     pw.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
                     pw.println("<?xml-stylesheet type='text/xsl' href='/yacysearch.xsl' version='1.0'?>");
                     pw.println("<rss version=\"2.0\">");
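The kelondroRowSet keyed as "String hash-6" stores the 6-byte domain part of each 12-byte URL hash, so every domain is exported at most once. The same de-duplication idea, sketched with plain collections (names hypothetical):

    // The last 6 characters of a YaCy URL hash identify the domain.
    java.util.HashSet<String> seenDomains = new java.util.HashSet<String>();

    boolean firstSighting(String urlHash) {
        // add() returns false if the domain hash was already present
        return seenDomains.add(urlHash.substring(6));
    }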
@@ -597,26 +608,45 @@ public final class plasmaCrawlLURL {
             indexURLEntry entry;
             indexURLEntry.Components comp;
             String url;
-            while (i.hasNext()) {
+            loop: while (i.hasNext()) {
                 entry = (indexURLEntry) i.next();
                 comp = entry.comp();
                 url = comp.url().toNormalform(true, false);
                 if (!url.matches(filter)) continue;
-                if (rss) {
-                    pw.println("<item>");
-                    pw.println("<title>" + yacyURL.escape(comp.title()) + "</title>");
-                    pw.println("<link>" + url + "</link>");
-                    if (comp.author().length() > 0) pw.println("<author>" + comp.author() + "</author>");
-                    if (comp.tags().length() > 0) pw.println("<description>" + comp.tags() + "</description>");
-                    pw.println("<pubDate>" + entry.moddate().toString() + "</pubDate>");
-                    pw.println("<guid isPermaLink=\"false\">" + entry.hash() + "</guid>");
-                    pw.println("</item>");
+                if (dom) {
+                    if (doms.has(entry.hash().substring(6).getBytes())) continue loop;
+                    doms.add(entry.hash().substring(6).getBytes());
+                    url = comp.url().getHost();
+                    if (format == 0) {
+                        pw.println(url);
+                    }
+                    if (format == 1) {
+                        pw.println("<a href=\"http://" + url + "\">" + url + "</a><br>");
+                    }
                 } else {
-                    pw.println(url);
+                    if (format == 0) {
+                        pw.println(url);
+                    }
+                    if (format == 1) {
+                        pw.println("<a href=\"" + url + "\">" + comp.title() + "</a><br>");
+                    }
+                    if (format == 2) {
+                        pw.println("<item>");
+                        pw.println("<title>" + comp.title() + "</title>");
+                        pw.println("<link>" + yacyURL.escape(url) + "</link>");
+                        if (comp.author().length() > 0) pw.println("<author>" + comp.author() + "</author>");
+                        if (comp.tags().length() > 0) pw.println("<description>" + comp.tags() + "</description>");
+                        pw.println("<pubDate>" + entry.moddate().toString() + "</pubDate>");
+                        pw.println("<guid isPermaLink=\"false\">" + entry.hash() + "</guid>");
+                        pw.println("</item>");
+                    }
                 }
                 count++;
             }
-            if (rss) {
+            if (format == 1) {
+                pw.println("</body></html>");
+            }
+            if (format == 2) {
                 pw.println("</channel>");
                 pw.println("</rss>");
             }
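For a hypothetical entry http://example.net/page with title "Example Page", the loop above would emit approximately:

    url-text:  http://example.net/page
    url-html:  <a href="http://example.net/page">Example Page</a><br>
    url-rss:   <item><title>Example Page</title><link>...escaped URL...</link>...</item>
    dom-text:  example.net
    dom-html:  <a href="http://example.net">example.net</a><br>

A domain RSS variant is not offered in the form.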
source/yacy.java
@@ -75,10 +75,7 @@ import de.anomic.kelondro.kelondroDyn;
 import de.anomic.kelondro.kelondroMScoreCluster;
 import de.anomic.kelondro.kelondroMapObjects;
 import de.anomic.plasma.plasmaCondenser;
-import de.anomic.plasma.plasmaCrawlEntry;
 import de.anomic.plasma.plasmaCrawlLURL;
-import de.anomic.plasma.plasmaCrawlNURL;
-import de.anomic.plasma.plasmaCrawlZURL;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.plasma.plasmaWordIndex;
 import de.anomic.server.serverCore;
@@ -766,129 +763,6 @@ public final class yacy {
             serverLog.logInfo("TRANSFER-CR", "could not read file " + crfile);
         }
     }
-    /**
-     * Generates a text file containing all domains in this peer's DB.
-     * This may be useful to calculate the YaCy-Blockrank.
-     *
-     * @param format String which determines the format of the file. Possible values: "html", "zip", "gzip" or "plain"
-     * @see urllist
-     */
-    private static void domlist(String homePath, String source, String format, String targetName) {
-
-        File root = new File(homePath);
-        try {
-            final plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf", false);
-            HashMap doms = new HashMap();
-            System.out.println("Started domain list extraction from " + sb.wordIndex.loadedURL.size() + " url entries.");
-            System.out.println("a dump will be written after double-check of all extracted domains.");
-            System.out.println("This process may fail in case of too less memory. To increase memory, start with");
-            System.out.println("java -Xmx<megabytes>m -classpath classes yacy -domlist [ -source { nurl | lurl | eurl } ] [ -format { text | zip | gzip | html } ] [ <path to DATA folder> ]");
-            int c = 0;
-            long start = System.currentTimeMillis();
-            if (source.equals("lurl")) {
-                Iterator eiter = sb.wordIndex.loadedURL.entries(true, null);
-                indexURLEntry entry;
-                while (eiter.hasNext()) {
-                    try {
-                        entry = (indexURLEntry) eiter.next();
-                        indexURLEntry.Components comp = entry.comp();
-                        if ((entry != null) && (comp.url() != null)) doms.put(comp.url().getHost(), null);
-                    } catch (Exception e) {
-                        // here a MalformedURLException may occur
-                        // just ignore
-                    }
-                    c++;
-                    if (c % 10000 == 0) System.out.println(
-                            c + " urls checked, " +
-                            doms.size() + " domains collected, " +
-                            ((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
-                            ((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
-                }
-            }
-            if (source.equals("eurl")) {
-                Iterator eiter = sb.crawlQueues.errorURL.entries(true, null);
-                plasmaCrawlZURL.Entry entry;
-                while (eiter.hasNext()) {
-                    try {
-                        entry = (plasmaCrawlZURL.Entry) eiter.next();
-                        if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), entry.anycause());
-                    } catch (Exception e) {
-                        // here a MalformedURLException may occur
-                        // just ignore
-                    }
-                    c++;
-                    if (c % 10000 == 0) System.out.println(
-                            c + " urls checked, " +
-                            doms.size() + " domains collected, " +
-                            ((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
-                            ((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
-                }
-            }
-            if (source.equals("nurl")) {
-                Iterator eiter = sb.crawlQueues.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
-                plasmaCrawlEntry entry;
-                while (eiter.hasNext()) {
-                    try {
-                        entry = (plasmaCrawlEntry) eiter.next();
-                        if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), "profile=" + entry.profileHandle() + ", depth=" + entry.depth());
-                    } catch (Exception e) {
-                        // here a MalformedURLException may occur
-                        // just ignore
-                    }
-                    c++;
-                    if (c % 10000 == 0) System.out.println(
-                            c + " urls checked, " +
-                            doms.size() + " domains collected, " +
-                            ((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
-                            ((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
-                }
-            }
-
-            if (format.equals("html")) {
-                // output file in HTML format
-                File file = new File(root, targetName + ".html");
-                BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
-                System.out.println("Started domain list dump to file " + file);
-                Iterator i = doms.entrySet().iterator();
-                Map.Entry entry;
-                String key;
-                bos.write(("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">").getBytes());
-                bos.write(serverCore.crlf);
-                bos.write(("<html><head><title>YaCy " + source + " domainlist</title></head><body>").getBytes());
-                bos.write(serverCore.crlf);
-                while (i.hasNext()) {
-                    entry = (Map.Entry) i.next();
-                    key = (String) entry.getKey();
-                    bos.write(("<a href=\"http://" + key + "\">" + key + "</a>" +
-                            ((entry.getValue() == null) ? "" : (" " + ((String) entry.getValue()))) + "<br>"
-                            ).getBytes());
-                    bos.write(serverCore.crlf);
-                }
-                bos.write(("</body></html>").getBytes());
-                bos.close();
-
-            } else if (format.equals("zip")) {
-                // output file in plain text but compressed with ZIP
-                File file = new File(root, targetName + ".zip");
-                System.out.println("Started domain list dump to file " + file);
-                serverFileUtils.saveSet(file, "zip", doms.keySet(), new String(serverCore.crlf));
-
-            } else if (format.equals("gzip")) {
-                // output file in plain text but compressed with GZIP
-                File file = new File(root, targetName + ".txt.gz");
-                System.out.println("Started domain list dump to file " + file);
-                serverFileUtils.saveSet(file, "gzip", doms.keySet(), new String(serverCore.crlf));
-            } else {
-                // plain text list
-                File file = new File(root, targetName + ".txt");
-                System.out.println("Started domain list dump to file " + file);
-                serverFileUtils.saveSet(file, "plain", doms.keySet(), new String(serverCore.crlf));
-            }
-            sb.close();
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-    }
 
     private static String[] shift(String[] args, int pos, int count) {
         String[] newargs = new String[args.length - count];
@@ -1082,27 +956,6 @@ public final class yacy {
             String targetaddress = args[1];
             String crfile = args[2];
             transferCR(targetaddress, crfile);
-        } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-domlist"))) {
-            // generate a url list and save it in a file
-            String source = "lurl";
-            if (args.length >= 3 && args[1].toLowerCase().equals("-source")) {
-                if ((args[2].equals("nurl")) ||
-                    (args[2].equals("lurl")) ||
-                    (args[2].equals("eurl")))
-                    source = args[2];
-                args = shift(args, 1, 2);
-            }
-            String format = "txt";
-            if (args.length >= 3 && args[1].toLowerCase().equals("-format")) {
-                if ((args[2].equals("html")) ||
-                    (args[2].equals("zip")) ||
-                    (args[2].equals("gzip")))
-                    format = args[2];
-                args = shift(args, 1, 2);
-            }
-            if (args.length == 2) applicationRoot= args[1];
-            String outfile = "domlist_" + source + "_" + System.currentTimeMillis();
-            domlist(applicationRoot, source, format, outfile);
         } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) {
             // generate a url list and save it in a file
             if (args.length == 2) applicationRoot= args[1];
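With the -domlist command line removed, domain extraction now runs through the export form added above; a request along these lines (parameter names taken from the form, values illustrative) should produce an equivalent list:

    http://localhost:8080/IndexControlURLs_p.html?lurlexport=submit&format=dom-text&exportfilter=.*&exportfile=DATA/EXPORT/domlist.txt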