1
0
mirror of https://github.com/yacy/yacy_search_server.git synced 2025-07-23 09:24:39 -04:00

Merge pull request from yacy/master

Fork update
This commit is contained in:
Andreas
2017-10-07 12:29:55 +02:00
committed by GitHub
50 changed files with 504 additions and 116 deletions

@ -44,20 +44,20 @@
<classpathentry kind="lib" path="lib/commons-io-2.5.jar"/>
<classpathentry kind="lib" path="lib/slf4j-api-1.7.24.jar"/>
<classpathentry kind="lib" path="lib/chardet.jar"/>
<classpathentry kind="lib" path="lib/jetty-client-9.4.6.v20170531.jar"/>
<classpathentry kind="lib" path="lib/jetty-continuation-9.4.6.v20170531.jar"/>
<classpathentry kind="lib" path="lib/jetty-deploy-9.4.6.v20170531.jar"/>
<classpathentry kind="lib" path="lib/jetty-http-9.4.6.v20170531.jar"/>
<classpathentry kind="lib" path="lib/jetty-io-9.4.6.v20170531.jar"/>
<classpathentry kind="lib" path="lib/jetty-jmx-9.4.6.v20170531.jar"/>
<classpathentry kind="lib" path="lib/jetty-proxy-9.4.6.v20170531.jar"/>
<classpathentry kind="lib" path="lib/jetty-security-9.4.6.v20170531.jar"/>
<classpathentry kind="lib" path="lib/jetty-server-9.4.6.v20170531.jar"/>
<classpathentry kind="lib" path="lib/jetty-servlet-9.4.6.v20170531.jar"/>
<classpathentry kind="lib" path="lib/jetty-servlets-9.4.6.v20170531.jar"/>
<classpathentry kind="lib" path="lib/jetty-util-9.4.6.v20170531.jar"/>
<classpathentry kind="lib" path="lib/jetty-webapp-9.4.6.v20170531.jar"/>
<classpathentry kind="lib" path="lib/jetty-xml-9.4.6.v20170531.jar"/>
<classpathentry kind="lib" path="lib/jetty-client-9.4.7.v20170914.jar"/>
<classpathentry kind="lib" path="lib/jetty-continuation-9.4.7.v20170914.jar"/>
<classpathentry kind="lib" path="lib/jetty-deploy-9.4.7.v20170914.jar"/>
<classpathentry kind="lib" path="lib/jetty-http-9.4.7.v20170914.jar"/>
<classpathentry kind="lib" path="lib/jetty-io-9.4.7.v20170914.jar"/>
<classpathentry kind="lib" path="lib/jetty-jmx-9.4.7.v20170914.jar"/>
<classpathentry kind="lib" path="lib/jetty-proxy-9.4.7.v20170914.jar"/>
<classpathentry kind="lib" path="lib/jetty-security-9.4.7.v20170914.jar"/>
<classpathentry kind="lib" path="lib/jetty-server-9.4.7.v20170914.jar"/>
<classpathentry kind="lib" path="lib/jetty-servlet-9.4.7.v20170914.jar"/>
<classpathentry kind="lib" path="lib/jetty-servlets-9.4.7.v20170914.jar"/>
<classpathentry kind="lib" path="lib/jetty-util-9.4.7.v20170914.jar"/>
<classpathentry kind="lib" path="lib/jetty-webapp-9.4.7.v20170914.jar"/>
<classpathentry kind="lib" path="lib/jetty-xml-9.4.7.v20170914.jar"/>
<classpathentry kind="lib" path="lib/httpclient-4.5.3.jar"/>
<classpathentry kind="lib" path="lib/httpmime-4.5.3.jar"/>
<classpathentry kind="lib" path="lib/noggit-0.6.jar"/>

@ -5,7 +5,24 @@ cache:
- $HOME/.m2
jdk:
- oraclejdk7
- oraclejdk8
# for running tests on Travis CI container infrastructure for faster builds
sudo: false
# `sudo: false` would allow running tests on the faster Travis CI container infrastructure, but `sudo: true` is required here to test the installation of the .deb package
sudo: true
# Installing ghostscript (optional imagemagick dependency) is required for Html2ImageTest to run
# dpkg-dev,debhelper,m4 and fakeroot packages are required for the 'ant deb' task
before_install:
- sudo apt-get -qq update
- sudo apt-get install -y ghostscript dpkg-dev debhelper m4 fakeroot
install:
- cd libbuild && MAVEN_OPTS="-Xmx6g -Xms2g" mvn clean install && cd ..
script:
- MAVEN_OPTS="-Xmx6g -Xms2g" mvn clean install
# test build instructions
- ant
- ant dist
- ant deb
- sudo dpkg -i ../yacy_*_all.deb

@ -202,20 +202,20 @@
<pathelement location="${lib}/javax.servlet-api-3.1.0.jar" />
<pathelement location="${lib}/jcifs-1.3.17.jar" />
<pathelement location="${lib}/jcl-over-slf4j-1.7.24.jar" />
<pathelement location="${lib}/jetty-client-9.4.6.v20170531.jar" />
<pathelement location="${lib}/jetty-continuation-9.4.6.v20170531.jar" />
<pathelement location="${lib}/jetty-deploy-9.4.6.v20170531.jar" />
<pathelement location="${lib}/jetty-http-9.4.6.v20170531.jar" />
<pathelement location="${lib}/jetty-io-9.4.6.v20170531.jar" />
<pathelement location="${lib}/jetty-jmx-9.4.6.v20170531.jar" />
<pathelement location="${lib}/jetty-proxy-9.4.6.v20170531.jar" />
<pathelement location="${lib}/jetty-security-9.4.6.v20170531.jar" />
<pathelement location="${lib}/jetty-server-9.4.6.v20170531.jar" />
<pathelement location="${lib}/jetty-servlet-9.4.6.v20170531.jar" />
<pathelement location="${lib}/jetty-servlets-9.4.6.v20170531.jar" />
<pathelement location="${lib}/jetty-util-9.4.6.v20170531.jar" />
<pathelement location="${lib}/jetty-webapp-9.4.6.v20170531.jar" />
<pathelement location="${lib}/jetty-xml-9.4.6.v20170531.jar" />
<pathelement location="${lib}/jetty-client-9.4.7.v20170914.jar" />
<pathelement location="${lib}/jetty-continuation-9.4.7.v20170914.jar" />
<pathelement location="${lib}/jetty-deploy-9.4.7.v20170914.jar" />
<pathelement location="${lib}/jetty-http-9.4.7.v20170914.jar" />
<pathelement location="${lib}/jetty-io-9.4.7.v20170914.jar" />
<pathelement location="${lib}/jetty-jmx-9.4.7.v20170914.jar" />
<pathelement location="${lib}/jetty-proxy-9.4.7.v20170914.jar" />
<pathelement location="${lib}/jetty-security-9.4.7.v20170914.jar" />
<pathelement location="${lib}/jetty-server-9.4.7.v20170914.jar" />
<pathelement location="${lib}/jetty-servlet-9.4.7.v20170914.jar" />
<pathelement location="${lib}/jetty-servlets-9.4.7.v20170914.jar" />
<pathelement location="${lib}/jetty-util-9.4.7.v20170914.jar" />
<pathelement location="${lib}/jetty-webapp-9.4.7.v20170914.jar" />
<pathelement location="${lib}/jetty-xml-9.4.7.v20170914.jar" />
<pathelement location="${lib}/jsch-0.1.54.jar" />
<pathelement location="${lib}/json-simple-1.1.1.jar" />
<pathelement location="${lib}/jsonic-1.3.10.jar" />
@ -327,7 +327,8 @@
<!-- copy build libs -->
<copy todir="${release_main}/libbuild">
<fileset dir="${libbuild}"
includes="**/*"/>
includes="**/*"
excludes="**/target/**"/>
</copy>
<!-- copy configuration files -->
@ -787,7 +788,7 @@
<delete dir="${release_mac}" failonerror="false" verbose="false" />
</target>
<!-- to use the deb command the following debian packages must be installed: dpkg-dev debhelper m4 -->
<!-- to use the deb command the following debian packages must be installed: dpkg-dev debhelper m4 fakeroot -->
<target name="deb" depends="init" description="Creates a debian package">
<!-- replacing the old with the new revision number -->
<replaceregexp file="debian/changelog"

2
debian/control vendored

@ -8,7 +8,7 @@ Standards-Version: 3.7.2
Package: yacy
Architecture: all
Depends: java8-runtime-headless, sudo, debconf
Suggests: curl | wget
Suggests: curl | wget, wkhtmltopdf, imagemagick, xvfb, ghostscript
Description: Peer-to-Peer Web Search Engine
YaCy is a Java-based peer-to-peer search engine.
It provides a personal web search engine, which is

@ -33,15 +33,15 @@
<li>
<img src="env/grafics/ok.png" height="16" width="16" alt="ok" />&nbsp;Select a language for the interface:<br />
<fieldset>
<input type="radio" name="language" value="browser" id="lang_browser" onchange="this.form.submit()" #(lang_browser)#::checked="checked"#(/lang_browser)# /><label for="lang_browser">Browser</label>&nbsp;
<input type="radio" name="language" value="default" id="lang_en" onchange="this.form.submit()" #(lang_en)#::checked="checked"#(/lang_en)# /><label for="lang_en" class="#[active_en]#">English</label>&nbsp;
<input type="radio" name="language" value="de" id="lang_de" onchange="this.form.submit()" #(lang_de)#::checked="checked"#(/lang_de)# /><label class="#[active_de]#" for="lang_de">Deutsch</label>&nbsp;
<input type="radio" name="language" value="fr" id="lang_fr" onchange="this.form.submit()" #(lang_fr)#::checked="checked"#(/lang_fr)# /><label class="#[active_fr]#" for="lang_fr">Fran&ccedil;ais</label>&nbsp;
<input type="radio" name="language" value="cn" id="lang_cn" onchange="this.form.submit()" #(lang_cn)#::checked="checked"#(/lang_cn)# /><label class="#[active_cn]#" for="lang_cn">&#27721;&#35821;/&#28450;&#35486</label>
<input type="radio" name="language" value="ru" id="lang_ru" onchange="this.form.submit()" #(lang_ru)#::checked="checked"#(/lang_ru)# /><label class="#[active_ru]#" for="lang_ru">&#1056;&#1091;&#1089;&#1089;&#1082;&#1080;&#1081;</label>
<input type="radio" name="language" value="uk" id="lang_uk" onchange="this.form.submit()" #(lang_uk)#::checked="checked"#(/lang_uk)# /><label class="#[active_uk]#" for="lang_uk">&#1059;&#1082;&#1088;&#1072;&#1111;&#1085;&#1089;&#1100;&#1082;&#1072;</label>
<input type="radio" name="language" value="hi" id="lang_hi" onchange="this.form.submit()" #(lang_hi)#::checked="checked"#(/lang_hi)# /><label class="#[active_hi]#" for="lang_hi">&#2361;&#2367;&#2344;&#2381;&#2342;&#2368;</label>
<input type="radio" name="language" value="ja" id="lang_ja" onchange="this.form.submit()" #(lang_ja)#::checked="checked"#(/lang_ja)# /><label class="#[active_ja]#" for="lang_ja">&#26085;&#26412;&#35486;</label>
<input type="radio" name="language" value="browser" id="lang_browser" onchange="this.form.submit()" #(lang_browser)#::checked="checked"#(/lang_browser)# /><label for="lang_browser" title="Use the browser preferred language if available">Browser</label>&nbsp;
<input type="radio" name="language" value="default" id="lang_en" onchange="this.form.submit()" #(lang_en)#::checked="checked"#(/lang_en)# /><label for="lang_en" #(active_en)#::title="Click to generate translated pages"::class="label-success" title="Active : translated pages are available"#(/active_en)#>English</label>&nbsp;
<input type="radio" name="language" value="de" id="lang_de" onchange="this.form.submit()" #(lang_de)#::checked="checked"#(/lang_de)# /><label for="lang_de" #(active_de)#::title="Click to generate translated pages"::class="label-success" title="Active : translated pages are available"#(/active_de)#>Deutsch</label>&nbsp;
<input type="radio" name="language" value="fr" id="lang_fr" onchange="this.form.submit()" #(lang_fr)#::checked="checked"#(/lang_fr)# /><label for="lang_fr" #(active_fr)#::title="Click to generate translated pages"::class="label-success" title="Active : translated pages are available"#(/active_fr)#>Fran&ccedil;ais</label>&nbsp;
<input type="radio" name="language" value="cn" id="lang_cn" onchange="this.form.submit()" #(lang_cn)#::checked="checked"#(/lang_cn)# /><label for="lang_cn" #(active_cn)#::title="Click to generate translated pages"::class="label-success" title="Active : translated pages are available"#(/active_cn)#>&#27721;&#35821;/&#28450;&#35486</label>
<input type="radio" name="language" value="ru" id="lang_ru" onchange="this.form.submit()" #(lang_ru)#::checked="checked"#(/lang_ru)# /><label for="lang_ru" #(active_ru)#::title="Click to generate translated pages"::class="label-success" title="Active : translated pages are available"#(/active_ru)#>&#1056;&#1091;&#1089;&#1089;&#1082;&#1080;&#1081;</label>
<input type="radio" name="language" value="uk" id="lang_uk" onchange="this.form.submit()" #(lang_uk)#::checked="checked"#(/lang_uk)# /><label for="lang_uk" #(active_uk)#::title="Click to generate translated pages"::class="label-success" title="Active : translated pages are available"#(/active_uk)#>&#1059;&#1082;&#1088;&#1072;&#1111;&#1085;&#1089;&#1100;&#1082;&#1072;</label>
<input type="radio" name="language" value="hi" id="lang_hi" onchange="this.form.submit()" #(lang_hi)#::checked="checked"#(/lang_hi)# /><label for="lang_hi" #(active_hi)#::title="Click to generate translated pages"::class="label-success" title="Active : translated pages are available"#(/active_hi)#>&#2361;&#2367;&#2344;&#2381;&#2342;&#2368;</label>
<input type="radio" name="language" value="ja" id="lang_ja" onchange="this.form.submit()" #(lang_ja)#::checked="checked"#(/lang_ja)# /><label for="lang_ja" #(active_ja)#::title="Click to generate translated pages"::class="label-success" title="Active : translated pages are available"#(/active_ja)#>&#26085;&#26412;&#35486;</label>
</fieldset>
</li>
<!-- take care that no other items are changed, but also change the former if no js is enabled -->

@ -284,24 +284,24 @@ public class ConfigBasic {
// set label class (green background) for active translation
if (lang.equals("browser")) {
List<String> l = Translator.activeTranslations();
prop.put("active_cn", l.contains("cn") ? "label-success" : "");
prop.put("active_de", l.contains("de") ? "label-success" : "");
prop.put("active_fr", l.contains("fr") ? "label-success" : "");
prop.put("active_hi", l.contains("hi") ? "label-success" : "");
prop.put("active_ja", l.contains("ja") ? "label-success" : "");
prop.put("active_ru", l.contains("ru") ? "label-success" : "");
prop.put("active_uk", l.contains("uk") ? "label-success" : "");
prop.put("active_en", "label-success");
prop.put("active_cn", l.contains("cn") ? "2" : "1");
prop.put("active_de", l.contains("de") ? "2" : "1");
prop.put("active_fr", l.contains("fr") ? "2" : "1");
prop.put("active_hi", l.contains("hi") ? "2" : "1");
prop.put("active_ja", l.contains("ja") ? "2" : "1");
prop.put("active_ru", l.contains("ru") ? "2" : "1");
prop.put("active_uk", l.contains("uk") ? "2" : "1");
prop.put("active_en", "2");
} else {
prop.put("active_de", "");
prop.put("active_fr", "");
prop.put("active_hi", "");
prop.put("active_cn", "");
prop.put("active_ru", "");
prop.put("active_uk", "");
prop.put("active_en", "");
prop.put("active_ja", "");
prop.put("active_de", "0");
prop.put("active_fr", "0");
prop.put("active_hi", "0");
prop.put("active_cn", "0");
prop.put("active_ru", "0");
prop.put("active_uk", "0");
prop.put("active_en", "0");
prop.put("active_ja", "0");
}
return prop;
}

@ -236,8 +236,8 @@ var solr= $.getJSON("solr/collection1/select?q=*:*&defType=edismax&start=0&rows=
<p class="url">
<a href="yacysearch.html" id="urlhash" target="LayouTest">http://url-of-the-search-result.net</a>
</p>
<p>
<input type="checkbox" name="search.result.show.keywords" aria-labelledby="tagsTitle" value="true" #(search.result.show.keywords)#::checked="checked" #(/search.result.show.keywords)# /> <span id="tagsTitle">Tags</span>: keyword subject keyword2 keyword3
<p class="tags">
<input type="checkbox" name="search.result.show.keywords" aria-labelledby="tagsTitle" value="true" #(search.result.show.keywords)#::checked="checked" #(/search.result.show.keywords)# /> <span id="tagsTitle">Tags</span>: <span class="tag label label-default">keyword</span> <span class="tag label label-default">subject</span> <span class="tag label label-default">keyword2</span> <span class="tag label label-default">keyword3</span>
</p>
<div class="urlinfo">
<table style="border-width:0">

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

@ -36,7 +36,7 @@
<!-- the Solr version used in dependency section for all related dependencies -->
<solr.version>6.6.1</solr.version>
<!-- the Jetty version used in dependency section for all related dependencies -->
<jetty.version>9.4.6.v20170531</jetty.version>
<jetty.version>9.4.7.v20170914</jetty.version>
<!-- properties used for filtering yacyBuildProperties.java -->
<REPL_DATE>${DSTAMP}</REPL_DATE>

@ -82,7 +82,7 @@ public class SchemaConfiguration extends Configuration implements Serializable {
SolrInputDocument sid = new SolrInputDocument();
for (String name: doc.getFieldNames()) {
if (this.contains(name) && (omitFields == null || !omitFields.contains(name))) { // check each field if enabled in local Solr schema
sid.addField(name, doc.getFieldValue(name), 1.0f);
sid.addField(name, doc.getFieldValue(name));
}
}
return sid;
@ -165,10 +165,4 @@ public class SchemaConfiguration extends Configuration implements Serializable {
if (isEmpty() || contains(key)) key.add(doc, value);
}
/**
 * Read a date field from a Solr input document, never returning null nor a date in the future.
 * @param doc the document to read the field value from
 * @param key the schema declaration whose Solr field name selects the field
 * @return the epoch date (new Date(0)) when the field has no value, the current
 *         date when the stored date is after now, otherwise the stored date
 */
public static Date getDate(SolrInputDocument doc, final SchemaDeclaration key) {
    final Date stored = (Date) doc.getFieldValue(key.getSolrFieldName());
    final Date current = new Date();
    if (stored == null) {
        return new Date(0);
    }
    if (stored.after(current)) {
        // clamp dates lying in the future to the current time
        return current;
    }
    return stored;
}
}

@ -248,7 +248,7 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
}
if (Math.min(images_protocol_obj.size(), images_stub.size()) > 0) {
List<String> images_protocol = CollectionConfiguration.indexedList2protocolList(images_protocol_obj, images_protocol_obj.size());
List<String> images_protocol = CollectionConfiguration.indexedList2protocolList(images_protocol_obj, images_stub.size());
String imageurl = images_protocol.get(0) + "://" + images_stub.get(0);
writer.write("<media:content medium=\"image\" url=\"");
XML.escapeCharData(imageurl, writer); writer.write("\"/>\n");

@ -216,7 +216,7 @@ public class YJsonResponseWriter implements QueryResponseWriter {
}
if (Math.min(images_protocol_obj.size(), images_stub.size()) > 0) {
List<String> images_protocol = CollectionConfiguration.indexedList2protocolList(images_protocol_obj, images_protocol_obj.size());
List<String> images_protocol = CollectionConfiguration.indexedList2protocolList(images_protocol_obj, images_stub.size());
String imageurl = images_protocol.get(0) + "://" + images_stub.get(0);
solitaireTag(writer, "image", imageurl);
} else {

@ -65,7 +65,7 @@ public class Html2Image {
private final static File convertMac2 = new File("/opt/ImageMagick/bin/convert");
// debian
// to install: apt-get install wkhtmltopdf imagemagick xvfb
// to install: apt-get install wkhtmltopdf imagemagick xvfb ghostscript
private final static File wkhtmltopdfDebian = new File("/usr/bin/wkhtmltopdf"); // there is no wkhtmltoimage, use convert to create images
private final static File convertDebian = new File("/usr/bin/convert");

@ -326,16 +326,23 @@ public class Translator {
* @return list of language-codes of available/active translations
*/
public static List<String> activeTranslations() {
Switchboard sb = Switchboard.getSwitchboard();
File localePath;
if (sb != null)
final Switchboard sb = Switchboard.getSwitchboard();
final File localePath;
if (sb != null) {
localePath = sb.getDataPath("locale.translated_html", "DATA/LOCALE/htroot");
else
} else {
localePath = new File ("DATA/LOCALE/htroot");
List<String> dirlist = new ArrayList<String>(); // get list of language subdirectories
File[] list = localePath.listFiles();
for (File f : list) {
if (f.isDirectory()) dirlist.add(f.getName()); // filter directories to add to result
}
final List<String> dirlist = new ArrayList<String>(); // get list of language subdirectories
if(localePath.isDirectory()) {
final File[] list = localePath.listFiles();
if(list != null) { // the list may be null on IO error
for (final File f : list) {
if (f.isDirectory()) {
dirlist.add(f.getName()); // filter directories to add to result
}
}
}
}
return dirlist;
}

@ -31,8 +31,12 @@ import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Date;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2Utils;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser;
@ -42,15 +46,12 @@ import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2Utils;
/**
* Parses a bz2 archive.
* Unzips and parses the content and adds it to the created main document
*/
public class bzipParser extends AbstractParser implements Parser {
public bzipParser() {
super("Bzip 2 UNIX Compressed File Parser");
this.SUPPORTED_EXTENSIONS.add("bz2");
@ -117,27 +118,8 @@ public class bzipParser extends AbstractParser implements Parser {
}
}
try {
final String filename = location.getFileName();
// create maindoc for this bzip container, register with supplied url & mime
maindoc = new Document(
location,
mimeType,
charset,
this,
null,
null,
AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
null,
null,
null,
null,
0.0d, 0.0d,
(Object) null,
null,
null,
null,
false,
new Date());
maindoc = createMainDocument(location, mimeType, charset, this);
// creating a new parser class to parse the unzipped content
final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
@ -153,4 +135,112 @@ public class bzipParser extends AbstractParser implements Parser {
}
return maindoc == null ? null : new Document[]{maindoc};
}
/**
 * @return true : this parser declares that it supports parsing with limits
 */
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
/**
 * Create the main resulting parsed document for a bzip archive.
 * The document title is the unescaped file name of the location, or the
 * location tokens when the file name is empty.
 * @param location the parsed resource URL
 * @param mimeType the media type of the resource
 * @param charset the charset name if known
 * @param parser an instance of bzipParser that is registered as the parser origin of the document
 * @return a Document instance
 */
public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final bzipParser parser) {
    final String archiveName = location.getFileName();
    return new Document(
            location,
            mimeType,
            charset,
            parser,
            null,
            null,
            AbstractParser.singleList(archiveName.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(archiveName)), // title
            null,
            null,
            null,
            null,
            0.0d, 0.0d,
            (Object) null,
            null,
            null,
            null,
            false,
            new Date());
}
/**
 * Parse content in an open stream uncompressing on the fly a bzipped resource.
 * The content is handed to TextParser.parseWithLimits under a location whose
 * file name is the uncompressed file name.
 * @param location the URL of the bzipped resource
 * @param charset the charset name if known
 * @param timezoneOffset the local time zone offset
 * @param depth passed unchanged to TextParser.parseWithLimits (presumably the
 *            current nesting depth of parsed containers - to be confirmed
 *            against TextParser)
 * @param compressedInStream an open stream uncompressing on the fly the compressed content
 * @param maxLinks
 *            the maximum total number of links to parse and add to the
 *            result documents
 * @param maxBytes
 *            the maximum number of content bytes to process
 * @return a list of documents that result from parsing the source, with
 *         empty or null text.
 * @throws Parser.Failure
 *             when the parser processing failed
 */
public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth,
        final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
    // derive the file name and media type of the uncompressed content from the archive file name
    final String compressedFileName = location.getFileName();
    final String contentfilename = BZip2Utils.getUncompressedFilename(compressedFileName);
    final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
    try {
        /* Use the uncompressed file name for sub parsers to not unnecessarily use again the bzipParser */
        final String locationPath = location.getPath();
        final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename;
        final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath);

        /* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */
        return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes);
    } catch (MalformedURLException e) {
        // this is the bzip parser : report the failure with the right archive format name
        throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location);
    }
}
/**
 * Parse a bzipped resource within the given limits : the source stream is
 * uncompressed on the fly, its content is parsed by
 * parseCompressedInputStream() (called with a null charset and a depth of 999)
 * and the resulting documents are added as sub documents of a main document
 * created for the archive itself. The scraper parameter is not used by this
 * implementation.
 * @return an array holding the main document, flagged as partially parsed
 *         when the first sub document is
 * @throws Parser.Failure
 *             when the stream can not be opened as bzip2 content or when the
 *             content parsing failed
 */
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper,
        final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes)
        throws Parser.Failure {
    final BZip2CompressorInputStream uncompressing;
    try {
        // BZip2CompressorInputStream checks filecontent (magic start-bytes "BZh") and throws ioexception if no match
        uncompressing = new BZip2CompressorInputStream(source);
    } catch (Exception e) {
        throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location);
    }

    Document maindoc = null;
    try {
        // the container document, registered with the supplied url & mime
        maindoc = createMainDocument(location, mimeType, charset, this);

        // delegate the uncompressed content to the parser matching its file name
        final Document[] subDocs = parseCompressedInputStream(location, null, timezoneOffset, 999, uncompressing, maxLinks, maxBytes);
        if (subDocs != null) {
            maindoc.addSubDocuments(subDocs);
            final boolean truncated = subDocs.length > 0 && subDocs[0].isPartiallyParsed();
            if (truncated) {
                maindoc.setPartiallyParsed(true);
            }
        }
    } catch (final Parser.Failure e) {
        // parser failures are propagated as they are
        throw e;
    } catch (final Exception e) {
        throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(),location);
    }

    if (maindoc == null) {
        return null;
    }
    return new Document[]{maindoc};
}
}

@ -98,7 +98,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
private static final int col_posintext = 15; // t 2 first appearance of word in text
private static final int col_posinphrase = 16; // r 1 position of word in its phrase
private static final int col_posofphrase = 17; // o 1 number of the phrase where word appears
private static final int col_reserve1 = 18; // i 1 reserve1
private static final int col_worddistance = 18; // i avg distance of search query words
private static final int col_reserve2 = 19; // k 1 reserve2
// appearance flags, used in RWI entry
@ -130,6 +130,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
final char doctype, // type of document
final int outlinksSame, // outlinks to same domain
final int outlinksOther, // outlinks to other domain
final int wordDistance, // average distance of multi search query words
final Bitfield flags // attributes to the url and to the word according the url
) {
@ -155,7 +156,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
this.entry.setCol(col_posintext, posintext);
this.entry.setCol(col_posinphrase, posinphrase);
this.entry.setCol(col_posofphrase, posofphrase);
this.entry.setCol(col_reserve1, 0);
this.entry.setCol(col_worddistance, wordDistance);
this.entry.setCol(col_reserve2, 0);
}
@ -194,7 +195,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
this.entry.setCol(col_lother, outlinksOther);
this.entry.setCol(col_urlLength, urlLength);
this.entry.setCol(col_urlComps, urlComps);
this.entry.setCol(col_reserve1, 0);
this.entry.setCol(col_worddistance, 0);
this.entry.setCol(col_reserve2, 0);
}
@ -271,6 +272,12 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
int pos = (int) this.entry.getColLong(col_posintext);
return pos;
}
/**
 * @return the value stored in the word distance column of this entry, narrowed to an int
 */
@Override
public int distance() {
    return (int) this.entry.getColLong(col_worddistance);
}
/**
* positions() is used to remember word positions for each query word of an

@ -31,6 +31,7 @@ import java.util.Comparator;
import java.util.Queue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.date.MicroDate;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
@ -66,6 +67,9 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
posinphrase, posofphrase,
urlcomps, urllength,
wordsintext, wordsintitle;
/** Stored average words distance, when it can not be processed from positions because created from a WordReferenceRow instance */
private int distance;
private int virtualAge;
private Queue<Integer> positions; // word positons of joined references
private double termFrequency;
@ -109,6 +113,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
} else {
this.positions = null;
}
this.distance = 0; // stored distance value is set to zero here because it has to be calculated from positions
this.posinphrase = posinphrase;
this.posintext = posintext;
this.posofphrase = posofphrase;
@ -139,6 +144,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
} else {
this.positions = null;
}
this.distance = e.distance();
this.posinphrase = e.posinphrase();
this.posintext = e.posintext();
this.posofphrase = e.posofphrase();
@ -165,6 +171,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.lother = 0;
this.phrasesintext = 0;
this.positions = null;
this.distance = 0;
this.posinphrase = 0;
this.posintext = 0;
this.posofphrase = 0;
@ -275,6 +282,16 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
public Collection<Integer> positions() {
return this.positions;
}
/**
 * @return the distance processed by the parent class, or the stored distance
 *         value when the parent class result is zero
 */
@Override
public int distance() {
    final int computed = super.distance();
    /* Calculation from positions returned 0 : fall back to the stored value */
    return computed == 0 ? this.distance : computed;
}
@Override
public int posofphrase() {
@ -299,6 +316,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.type, // type of document
this.llocal, // outlinks to same domain
this.lother, // outlinks to other domain
this.distance(), // // average distance of multi search query words
this.flags // attributes to the url and to the word according the url
);
}
@ -376,7 +394,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
if (this.posintext > (v = other.posintext)) this.posintext = v;
// calculate and remember min distance
if (this.positions != null || other.positions != null) {
if (this.distance() > 0 || other.distance() > 0) {
int odist = other.distance();
int dist = this.distance();
if (odist > 0 && odist < dist) {
@ -413,7 +431,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
if (this.posintext < (v = other.posintext)) this.posintext = v;
// calculate and remember max distance
if (this.positions != null || other.positions != null) {
if (this.distance() > 0 || other.distance() > 0) {
int odist = other.distance();
int dist = this.distance();
if (odist > 0 && odist > dist) {

@ -331,7 +331,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
SolrInputDocument sid = new SolrInputDocument();
for (String name: doc.getFieldNames()) {
if (this.contains(name)) { // check each field if enabled in local Solr schema
sid.addField(name, doc.getFieldValue(name), 1.0f);
sid.addField(name, doc.getFieldValue(name));
}
}
return sid;

@ -0,0 +1,252 @@
// bzipParserTest.java
// ---------------------------
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import org.junit.Test;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
import net.yacy.document.VocabularyScraper;
/**
 * Unit tests for the {@link bzipParser} class.
 * <p>
 * All tests read bzip2 compressed sample files from the {@code test/parsertest}
 * folder (path relative to the working directory) and check the parsed text,
 * the detected anchors and the "partially parsed" flag of the first document
 * returned by the parser.
 * </p>
 *
 * @author luccioman
 *
 */
public class bzipParserTest {

    /** Folder containing test files */
    private static final File TEST_FOLDER = new File("test" + File.separator + "parsertest" + File.separator);

    /** MIME type handed to the parser for every test file */
    private static final String BZIP2_MIME_TYPE = "application/x-bzip2";

    /** Word with umlaut characters that every test expects in the parsed text */
    private static final String UMLAUT_TEST_WORD = "Maßkrügen";

    /** Name of the bzip2 compressed HTML test file */
    private static final String HTML_FILE_NAME = "umlaute_html_utf8.html.bz2";

    /** Name of the bzip2 compressed plain text test file */
    private static final String TEXT_FILE_NAME = "umlaute_linux.txt.bz2";

    /** Names of the small, single-document bzip2 test files */
    private static final String[] SMALL_FILE_NAMES = { HTML_FILE_NAME, TEXT_FILE_NAME };

    /**
     * Checks that the parser returned at least one document and gives back the
     * first one.
     *
     * @param documents
     *            the parser result
     * @param fileName
     *            the name of the parsed test file, used in failure messages
     * @return the first parsed document
     */
    private static Document firstDocument(final Document[] documents, final String fileName) {
        assertNotNull("Parser result must not be null for file " + fileName, documents);
        /* Fail with a readable message instead of an ArrayIndexOutOfBoundsException */
        assertTrue("Parser result must contain at least one document for file " + fileName, documents.length > 0);
        return documents[0];
    }

    /**
     * Checks that the document text is available and contains the umlaut test
     * word.
     *
     * @param document
     *            the parsed document
     * @param fileName
     *            the name of the parsed test file, used in failure messages
     * @return the parsed text, so that callers can run further checks on it
     */
    private static String assertContainsUmlautWord(final Document document, final String fileName) {
        final String parsedText = document.getTextString();
        assertNotNull("Parsed text must not be null for file " + fileName, parsedText);
        assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
                parsedText.contains(UMLAUT_TEST_WORD));
        return parsedText;
    }

    /**
     * Checks that exactly one anchor was detected and that it starts with
     * {@code http://localhost/umlaute_}.
     *
     * @param document
     *            the parsed document
     * @param fileName
     *            the name of the parsed test file, used in failure messages
     */
    private static void assertSingleUmlautAnchor(final Document document, final String fileName) {
        final Collection<AnchorURL> anchors = document.getAnchors();
        assertNotNull("Detected URLs must not be null for file " + fileName, anchors);
        assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
        assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_"));
    }

    /**
     * Checks the expected outcome of a parsing stopped by a limit : no anchor
     * detected and the document marked as partially parsed.
     *
     * @param document
     *            the parsed document
     * @param fileName
     *            the name of the parsed test file, used in failure messages
     */
    private static void assertPartiallyParsedWithoutAnchors(final Document document, final String fileName) {
        final Collection<AnchorURL> anchors = document.getAnchors();
        assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
        assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
                document.isPartiallyParsed());
    }

    /**
     * Parses one test file with the given bytes limit and no practical links
     * limit, then checks that the test word was still read but that the parsing
     * stopped before any URL was detected.
     *
     * @param parser
     *            the parser under test
     * @param fileName
     *            the name of the test file to parse
     * @param maxBytes
     *            the bytes limit handed to parseWithLimits()
     * @throws Failure
     *             when the file could not be parsed
     * @throws InterruptedException
     *             when the test was interrupted before its termination
     * @throws IOException
     *             when a read/write error occurred
     */
    private static void assertBytesLimitExceeded(final bzipParser parser, final String fileName, final long maxBytes)
            throws Failure, InterruptedException, IOException {
        final DigestURL location = new DigestURL("http://localhost/" + fileName);
        try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
            final Document[] documents = parser.parseWithLimits(location, BZIP2_MIME_TYPE,
                    StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE,
                    maxBytes);
            final Document document = firstDocument(documents, fileName);
            assertContainsUmlautWord(document, fileName);
            assertPartiallyParsedWithoutAnchors(document, fileName);
        }
    }

    /**
     * Unit test for the bzipParser.parse() function with some small bzip2 test files.
     *
     * @throws Failure
     *             when a file could not be parsed
     * @throws InterruptedException
     *             when the test was interrupted before its termination
     * @throws IOException
     *             when a read/write error occurred
     */
    @Test
    public void testParse() throws Failure, InterruptedException, IOException {
        final bzipParser parser = new bzipParser();

        for (final String fileName : SMALL_FILE_NAMES) {
            final DigestURL location = new DigestURL("http://localhost/" + fileName);
            try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
                final Document[] documents = parser.parse(location, BZIP2_MIME_TYPE, StandardCharsets.UTF_8.name(),
                        new VocabularyScraper(), 0, inStream);
                final Document document = firstDocument(documents, fileName);
                assertContainsUmlautWord(document, fileName);
                assertSingleUmlautAnchor(document, fileName);
            }
        }
    }

    /**
     * Testing parse integration with the tar parser on a test tbz2 archive.
     *
     * @throws Failure
     *             when a file could not be parsed
     * @throws InterruptedException
     *             when the test was interrupted before its termination
     * @throws IOException
     *             when a read/write error occurred
     */
    @Test
    public void testParseTbz() throws Failure, InterruptedException, IOException {
        final String fileName = "umlaute_html_xml_txt_gnu.tbz2";
        final bzipParser parser = new bzipParser();
        final DigestURL location = new DigestURL("http://localhost/" + fileName);

        try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
            final Document[] documents = parser.parse(location, BZIP2_MIME_TYPE, StandardCharsets.UTF_8.name(),
                    new VocabularyScraper(), 0, inStream);
            final Document document = firstDocument(documents, fileName);

            /* Text of each file included in the archive must be found in the parsed text */
            final String parsedText = assertContainsUmlautWord(document, fileName);
            assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
            assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
            assertTrue(parsedText.contains("URL reference in raw text file"));
            assertTrue(parsedText.contains("UTF-8 encoded XML test file"));

            final Collection<AnchorURL> detectedAnchors = document.getAnchors();
            assertNotNull(detectedAnchors);
            assertEquals("Parsed URLs must contain all URLs from each test file included in the archive", 5,
                    detectedAnchors.size());
            assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
            assertTrue(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
            assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
            assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
            assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
        }
    }

    /**
     * Unit test for the bzipParser.parseWithLimits() function with some small bz2
     * test files which content is within limits.
     *
     * @throws Failure
     *             when a file could not be parsed
     * @throws InterruptedException
     *             when the test was interrupted before its termination
     * @throws IOException
     *             when a read/write error occurred
     */
    @Test
    public void testParseWithLimits() throws Failure, InterruptedException, IOException {
        final bzipParser parser = new bzipParser();

        for (final String fileName : SMALL_FILE_NAMES) {
            final DigestURL location = new DigestURL("http://localhost/" + fileName);
            try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
                final Document[] documents = parser.parseWithLimits(location, BZIP2_MIME_TYPE,
                        StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 10000, 10000);
                final Document document = firstDocument(documents, fileName);
                assertContainsUmlautWord(document, fileName);
                assertSingleUmlautAnchor(document, fileName);
                assertFalse("Parse document must not be marked as partially parsed for file " + fileName,
                        document.isPartiallyParsed());
            }
        }
    }

    /**
     * Unit test for the bzipParser.parseWithLimits() when maxLinks limit is exceeded
     *
     * @throws Failure
     *             when a file could not be parsed
     * @throws InterruptedException
     *             when the test was interrupted before its termination
     * @throws IOException
     *             when a read/write error occurred
     */
    @Test
    public void testParseWithLimitsLinksExceeded() throws Failure, InterruptedException, IOException {
        final bzipParser parser = new bzipParser();

        /* maxLinks limit exceeded : zero links allowed, no practical bytes limit */
        for (final String fileName : SMALL_FILE_NAMES) {
            final DigestURL location = new DigestURL("http://localhost/" + fileName);
            try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
                final Document[] documents = parser.parseWithLimits(location, BZIP2_MIME_TYPE,
                        StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 0, Long.MAX_VALUE);
                final Document document = firstDocument(documents, fileName);
                assertContainsUmlautWord(document, fileName);
                assertPartiallyParsedWithoutAnchors(document, fileName);
            }
        }
    }

    /**
     * Unit test for the bzipParser.parseWithLimits() when maxBytes limit is exceeded
     *
     * @throws Failure
     *             when a file could not be parsed
     * @throws InterruptedException
     *             when the test was interrupted before its termination
     * @throws IOException
     *             when a read/write error occurred
     */
    @Test
    public void testParseWithLimitsBytesExceeded() throws Failure, InterruptedException, IOException {
        final bzipParser parser = new bzipParser();

        /* The bytes limit is set to let parsing the beginning text part, but stop before reaching the <a> tag */
        assertBytesLimitExceeded(parser, HTML_FILE_NAME, 258);

        /* The bytes limit is set to let parsing the beginning of the text, but stop before reaching the URL */
        assertBytesLimitExceeded(parser, TEXT_FILE_NAME, 65);
    }
}

@ -19,8 +19,15 @@
*/
package net.yacy.kelondro.rwi;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.util.Queue;
import java.util.concurrent.LinkedBlockingQueue;
import org.junit.Test;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.crawler.retrieval.Response;
@ -29,10 +36,6 @@ import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.util.Bitfield;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import org.junit.Test;
/**
* Unit tests for ReferenceContainer class.
@ -50,7 +53,7 @@ public class ReferenceContainerTest {
ReferenceFactory<WordReference> wordReferenceFactory = new WordReferenceFactory();
byte[] termHash = Word.word2hash("test");
ReferenceContainer<WordReference> rc = new ReferenceContainer(wordReferenceFactory, termHash);
ReferenceContainer<WordReference> rc = new ReferenceContainer<WordReference>(wordReferenceFactory, termHash);
// prepare a WordReference to be added to the container
DigestURL url = new DigestURL("http://test.org/test.html");
@ -89,7 +92,6 @@ public class ReferenceContainerTest {
assertNotNull("getReference failed", wc);
// TODO: ReferenceContainer used for rwi results. As distance doesn't persist after adding ref to container making the distance ranking obsolete -> remove or fix
System.out.println("-----------------------------------------------------------");
System.out.println("WordReference (word distance) before add to container: " + wentry.distance());
System.out.println("WordReference (word distance) after get from container: " + wc.distance());

Binary file not shown.

Binary file not shown.

Binary file not shown.