mirror of
https://github.com/yacy/yacy_search_server.git
synced 2025-11-08 00:06:13 -05:00
161 lines
7.4 KiB
Java
161 lines
7.4 KiB
Java
// IndexExport_p.java
|
||
// -----------------------
|
||
// (C) 2004-2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||
// first published 2004 on http://yacy.net
|
||
//
|
||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||
//
|
||
// LICENSE
|
||
//
|
||
// This program is free software; you can redistribute it and/or modify
|
||
// it under the terms of the GNU General Public License as published by
|
||
// the Free Software Foundation; either version 2 of the License, or
|
||
// (at your option) any later version.
|
||
//
|
||
// This program is distributed in the hope that it will be useful,
|
||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
// GNU General Public License for more details.
|
||
//
|
||
// You should have received a copy of the GNU General Public License
|
||
// along with this program; if not, write to the Free Software
|
||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||
|
||
package net.yacy.htroot;
|
||
|
||
import java.io.File;
|
||
import java.io.IOException;
|
||
import java.util.Date;
|
||
|
||
import net.yacy.cora.date.GenericFormatter;
|
||
import net.yacy.cora.protocol.RequestHeader;
|
||
import net.yacy.data.WorkTables;
|
||
import net.yacy.search.Switchboard;
|
||
import net.yacy.search.SwitchboardConstants;
|
||
import net.yacy.search.index.Fulltext;
|
||
import net.yacy.search.index.Segment;
|
||
import net.yacy.search.schema.CollectionSchema;
|
||
import net.yacy.server.serverObjects;
|
||
import net.yacy.server.serverSwitch;
|
||
|
||
public class LLMSelection_p {
|
||
|
||
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
|
||
// return variable that accumulates replacements
|
||
final Switchboard sb = (Switchboard) env;
|
||
|
||
final serverObjects prop = new serverObjects();
|
||
|
||
final Segment segment = sb.index;
|
||
// we have two counts of document: total number and such that are exportable with status code 200
|
||
final long ucount = segment.fulltext().collectionSize();
|
||
long ucount200 = ucount;
|
||
try {
|
||
ucount200 = segment.fulltext().getDefaultConnector().getCountByQuery(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200");
|
||
} catch (final IOException e1) {}
|
||
|
||
// set default values
|
||
prop.put("reload", 0);
|
||
prop.put("lurlexport", 0);
|
||
prop.putNum("ucount", ucount);
|
||
prop.putNum("ucount200", ucount200);
|
||
|
||
// show Pack folder contents
|
||
int i = 0;
|
||
boolean dark = true;
|
||
for (final String file: sb.packsInHold()) {
|
||
prop.put("packs_" + i + "_file", file);
|
||
prop.put("packs_" + i + "_type", "hold");
|
||
prop.put("packs_" + i + "_size", new File(sb.packsHoldPath, file).length() / 1024);
|
||
prop.put("packs_" + i + "_dark", dark ? "1" : "0");
|
||
i++;
|
||
dark = !dark;
|
||
}
|
||
for (final String file: sb.packsInLoaded()) {
|
||
prop.put("packs_" + i + "_file", file);
|
||
prop.put("packs_" + i + "_type", "loaded");
|
||
prop.put("packs_" + i + "_size", new File(sb.packsLoadedPath, file).length() / 1024);
|
||
prop.put("packs_" + i + "_dark", dark ? "1" : "0");
|
||
i++;
|
||
dark = !dark;
|
||
}
|
||
for (final String file: sb.packsInLive()) {
|
||
prop.put("packs_" + i + "_file", file);
|
||
prop.put("packs_" + i + "_type", "live");
|
||
prop.put("packs_" + i + "_size", new File(sb.packsLivePath, file).length() / 1024);
|
||
prop.put("packs_" + i + "_dark", dark ? "1" : "0");
|
||
i++;
|
||
dark = !dark;
|
||
}
|
||
prop.put("packs", i);
|
||
|
||
if (post == null || env == null) {
|
||
return prop; // nothing to do
|
||
}
|
||
|
||
if (post.containsKey("lurlexport")) {
|
||
try {
|
||
// parse format
|
||
Fulltext.ExportFormat format = Fulltext.ExportFormat.elasticsearch;
|
||
final String fname = post.get("format", "full-elasticsearch");
|
||
final boolean dom = fname.startsWith("dom"); // if dom== false complete urls are exported, otherwise only the domain
|
||
final boolean text = fname.startsWith("text");
|
||
if (fname.endsWith("rss")) format = Fulltext.ExportFormat.rss;
|
||
if (fname.endsWith("solr")) format = Fulltext.ExportFormat.solr;
|
||
if (fname.endsWith("elasticsearch")) format = Fulltext.ExportFormat.elasticsearch;
|
||
|
||
final String filter = post.get("exportfilter", ".*");
|
||
String query = post.get("exportquery", "*:*");
|
||
final String collection = post.get("collection", "user");
|
||
query += " AND " + CollectionSchema.collection_sxt.getSolrFieldName() + ":\"" + collection + "\"";
|
||
|
||
// store this call as api call: we do this even if there is a chance that it fails because recurring calls may do not fail
|
||
sb.tables.recordAPICall(post, "IndexPackGenerator_p.html", WorkTables.TABLE_API_TYPE_DUMP, "PackGenerator, q=" + query);
|
||
|
||
// start the export
|
||
/*
|
||
Tier Tags:
|
||
| Tier | Size | Notes |
|
||
|-----------|-----------|--------------------|
|
||
| common | ≤ 1 GB | IndexPackGenerator |
|
||
| uncommon | 1–5 GB | large web crawls |
|
||
| rare | 5–50 GB | custom parser |
|
||
| epic | 50–200 GB | special infra |
|
||
| legendary | any | human curation |
|
||
*/
|
||
final long now = System.currentTimeMillis();
|
||
final long doccount = sb.index.fulltext().getDefaultConnector().getCountByQuery(query);
|
||
if (doccount == 0) throw new IOException("number of exported documents == 0");
|
||
final String category = post.get("category", "scroll"); // core, scroll, codex, gem, fiction, map, echo, spirit, vault
|
||
final String tier = "common"; // common, uncommon, rare, epic, legendary, legendary
|
||
final String origin = "web"; // web, synth,
|
||
String slug = post.get("slug", "export").trim().replaceAll(" ", "-");
|
||
if (slug.isEmpty()) slug = "export";
|
||
|
||
// if collection is not user, the slug is the collection name
|
||
if (!"user".equals(collection)) {
|
||
slug = collection.trim().replaceAll(" ", "-");
|
||
}
|
||
// we can not construct the file name
|
||
final String filename =
|
||
SwitchboardConstants.YACY_PACK_PREFIX +
|
||
category + "-" + tier + "-" + origin + "_" +
|
||
slug + "_" +
|
||
GenericFormatter.SHORT_DAY_FORMATTER.format(new Date(now));
|
||
// file name schema: YaCyPack_<category>-<tier>-<origin>_<slug>_<YYMMDD>.jsonlist
|
||
// possible storage paths are: hold, load, loaded, unload, live; we use hold here, loaded would also be correct
|
||
} catch (final IOException e) {
|
||
prop.put("lurlexporterror", 1);
|
||
prop.put("lurlexporterror_exportfile", "-no export-");
|
||
prop.put("lurlexporterror_exportfailmsg", e.getMessage());
|
||
return prop;
|
||
}
|
||
}
|
||
|
||
// insert constants
|
||
prop.putNum("ucount", ucount);
|
||
// return rewrite properties
|
||
return prop;
|
||
}
|
||
|
||
} |