mirror of
https://github.com/yacy/yacy_search_server.git
synced 2025-09-03 15:26:13 -04:00
139 lines
8.0 KiB
HTML
139 lines
8.0 KiB
HTML
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "DTD/xhtml1-transitional.dtd">
|
||
<!-- This page is only XHTML 1.0 Transitional because the target attribute is used on links -->
|
||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||
<head>
  <title>YaCy '#[clientname]#': Index Pack Generator</title>
  #%env/templates/metas.template%#
  <!-- Auto-refresh of this page every 5 seconds. The meta element is the second
       alternative of the "reload" template switch; the first alternative is empty.
       It lives inside <head> (and is self-closed) so the document stays valid XHTML. -->
  #(reload)#::<meta http-equiv="REFRESH" content="5; url=/IndexPackGenerator_p.html" />#(/reload)#
</head>
|
||
<body id="IndexControl">
|
||
#%env/templates/header.template%#
|
||
#%env/templates/submenuIndexImport.template%#
|
||
|
||
<h2>YaCy Pack Generator</h2>
|
||
<p>The local index currently contains #[ucount]# documents, only #[ucount200]# exportable with status code 200 - the remaining are error documents.</p>
|
||
|
||
#(lurlexport)#::
|
||
<form action="IndexPackGenerator_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
|
||
<fieldset><legend>Index Pack Generator</legend>
|
||
<dl>
|
||
<dt class="TableCellDark"><label for="category">Set a Category (this goes into the filename)</label></dt>
|
||
<dd>
|
||
<select name="category" id="category" class="form-control">
|
||
<option value="mix" selected="selected">mix - a mix of document types, for content from wide web crawls</option>
|
||
<option value="core">core - technical documentation, operating systems, computer hardware, open source and free software, manuals, protocol standards</option>
|
||
<option value="scroll">scroll - non-technical documents: knowledge, encyclopedia, linguistic corpora, dictionaries, translation memories, texts, non-fiction books, historical books</option>
|
||
<option value="codex">codex - non-technical standards: industry standards, laws, rules, compliance</option>
|
||
<option value="gem">gem - research, papers, university publications, science</option>
|
||
<option value="fiction">fiction - fictional documents: movies, stories, series, books (fiction, science-fiction)</option>
|
||
<option value="map">map - geological data, geolocation-data, earth/world information</option>
|
||
<option value="echo">echo – micro-content (tweets, toots, short headlines, SMS corpora), podcasts, radio archives, audio lectures, spoken-word datasets, logs, incidents, telemetry</option>
|
||
<option value="spirit">spirit – related to non-textual data (possibly only metadata): art, music, game assets, creative-commons media (non-text culture loot)</option>
|
||
<option value="vault">vault - sensitive data: secrets, leaks, non-public documents, security advisories</option>
|
||
</select>
|
||
</dd>
|
||
|
||
<dt class="TableCellDark"><label for="collection">Index Collection</label></dt>
|
||
<dd>
|
||
<select id="collection" name="collection" class="form-control">
|
||
#{collections}#
|
||
<option #(selected)#::selected="selected"#(/selected)#>#[collection]#</option>
|
||
#{/collections}#
|
||
</select> the collection name is used as part of the filename to describe the content. Exception: if the collection is "user", then you can name the content with a slug.
|
||
</dd>
|
||
|
||
<dt class="TableCellDark"><label for="slug">Slug - describe the content<br />(only if collection is "user")</label></dt>
|
||
<dd><input type="text" name="slug" id="slug" value="export" size="20" maxlength="40" class="form-control"/> This will become a part of the filename, spaces will be replaced by "-"; must not be empty; should end with a language description, e.g. "-en"
|
||
</dd>
|
||
|
||
|
||
<script>
|
||
const collectionSelect = document.getElementById('collection');
|
||
const slugInput = document.getElementById('slug');
|
||
|
||
function toggleSlug() {
|
||
const enable = collectionSelect.value === 'user';
|
||
slugInput.disabled = !enable;
|
||
slugInput.style.opacity = enable ? 1 : 0.5; // optional visual hint
|
||
}
|
||
|
||
// run once on load
|
||
toggleSlug();
|
||
|
||
// keep it in sync
|
||
collectionSelect.addEventListener('change', toggleSlug);
|
||
</script>
|
||
|
||
<dt class="TableCellDark">URL Filter</dt>
|
||
<dd><input type="text" name="exportfilter" value=".*.*" size="20" maxlength="250" class="form-control"/> .*.* (default) is a catch-all; format: java regex
|
||
</dd>
|
||
|
||
<dt class="TableCellDark">Search Query - </dt>
|
||
<dd><input type="text" name="exportquery" value="*:*" size="20" maxlength="250" class="form-control"/> *:* (default) is a catch-all; format: &lt;field-name&gt;:&lt;solr-pattern&gt;
|
||
</dd>
|
||
<dt class="TableCellDark">Export Format</dt>
|
||
<dd><span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
||
This JSON is an elasticsearch index dump format and can be bulk-imported to elasticsearch. Here is an example for opensearch, using docker:<br />
|
||
Start docker container of opensearch:<br />
|
||
<code>docker run --name opensearch -p 9200:9200 -d -e OPENSEARCH_JAVA_OPTS="-Xms2G -Xmx2G" -e discovery.type=single-node -e DISABLE_SECURITY_PLUGIN=true -v $(pwd)/opensearch_data:/usr/share/opensearch/data opensearchproject/opensearch:latest</code><br />
|
||
Unblock index creation:<br />
|
||
<code>curl -X PUT "http://localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d'
|
||
{
|
||
"persistent": {
|
||
"cluster.blocks.create_index": null
|
||
}
|
||
}'</code><br />
|
||
Create the search index:<br />
|
||
<code>curl -X PUT "http://localhost:9200/collection1/yacy"</code><br />
|
||
Bulk-upload the index file:<br />
|
||
<code>curl -XPOST "http://localhost:9200/collection1/yacy/_bulk?filter_path=took,errors" -H "Content-Type: application/x-ndjson" --data-binary @yacy_dump_XXX.flatjson</code><br />
|
||
Make a search, get 10 results, search in fields text_t, title, description with boosts:<br />
|
||
<code>curl -X POST "http://localhost:9200/collection1/yacy/_search" -H 'Content-Type: application/json' -d'
|
||
{"size": 10, "query": {"multi_match": {
|
||
"query": "one two three",
|
||
"fields": ["text_t", "title^10", "description^3"], "fuzziness": "AUTO"
|
||
}}}'</code>
|
||
</span></span>
|
||
<input type="radio" name="format" value="full-elasticsearch" checked="checked" />
|
||
JSON (Rich and full-text Elasticsearch data, one document per line in one flat JSON file)
|
||
<br />
|
||
|
||
<input type="radio" name="format" value="full-solr" />
|
||
XML (Rich and full-text Solr data, one document per line in one large xml file,
|
||
can be processed with shell tools, can be imported with DATA/PACKS/load/)
|
||
<br />
|
||
<input type="radio" name="format" value="full-rss" />
|
||
XML (RSS)
|
||
</dd>
|
||
|
||
<dt> </dt>
|
||
<dd><input type="submit" name="lurlexport" value="Generate Data Pack" class="btn btn-primary" style="width:240px;"/>
|
||
</dd>
|
||
</dl>
|
||
</fieldset>
|
||
</form>::
|
||
<div class="alert alert-info" style="text-decoration:blink">Export to file #[exportfile]# is running .. #[urlcount]# Documents so far</div>::
|
||
#(/lurlexport)#
|
||
|
||
#(lurlexportfinished)#::
|
||
<div class="alert alert-success">Finished export of #[urlcount]# Documents to file <a href="file://#[exportfile]#" target="_blank" rel="noopener noreferrer">#[exportfile]#</a><br/>
|
||
<em>Import this file by moving it to DATA/PACKS/load</em></div>::
|
||
#(/lurlexportfinished)#
|
||
|
||
#(lurlexporterror)#::
|
||
<div class="alert alert-warning">Export to file #[exportfile]# failed: #[exportfailmsg]#</div>::
|
||
#(/lurlexporterror)#
|
||
|
||
<fieldset><legend>Pack List</legend>
|
||
<table border="0" summary="Pack List Archive">
|
||
<tr class="TableHeader"><th scope="col">Pack</th><th scope="col">Process</th><th scope="col">Size (KB)</th></tr>
|
||
#{packs}#
|
||
<tr class="TableCell#(dark)#Light::Dark#(/dark)#"><td>#[file]#</td><td>#[type]#</td><td>#[size]#</td></tr>
|
||
#{/packs}#
|
||
</table>
|
||
</fieldset>
|
||
|
||
#%env/templates/footer.template%#
|
||
</body>
|
||
</html>
|