mirror of
https://github.com/yacy/yacy_search_server.git
synced 2025-04-29 19:49:33 -04:00
Merge branch 'master' of github.com:yacy/yacy_search_server
This commit is contained in:
commit
f1c70dce33
@ -6,12 +6,14 @@
|
||||
# run with
|
||||
# docker run -d --name yacy -p 8090:8090 -p 8443:8443 -v yacy_data:/opt/yacy_search_server/DATA --log-opt max-size=200m --log-opt max-file=2 yacy/yacy_search_server:latest
|
||||
|
||||
|
||||
## build base
|
||||
FROM eclipse-temurin:11-jdk-jammy AS base
|
||||
RUN apt-get update && apt-get install -yq wkhtmltopdf imagemagick xvfb ghostscript && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
## build app
|
||||
FROM eclipse-temurin:11-jdk-jammy AS appbuilder
|
||||
FROM base AS appbuilder
|
||||
|
||||
RUN apt-get update && apt-get install -yq ant git curl wkhtmltopdf imagemagick xvfb ghostscript && rm -rf /var/lib/apt/lists/*
|
||||
RUN apt-get update && apt-get install -yq ant git curl && rm -rf /var/lib/apt/lists/*
|
||||
RUN java -version
|
||||
|
||||
WORKDIR /opt
|
||||
@ -20,8 +22,12 @@ COPY . /opt/yacy_search_server/
|
||||
RUN ant compile -f /opt/yacy_search_server/build.xml && \
|
||||
apt-get purge -yq --auto-remove ant && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
rm -rf /opt/yacy_search_server/.git
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /opt/yacy_search_server/
|
||||
RUN git rev-parse HEAD > .git/shallow && \
|
||||
git tag -l | xargs git tag -d && \
|
||||
git gc --prune=now
|
||||
|
||||
# Set initial admin password: "yacy" (encoded with custom yacy md5 function net.yacy.cora.order.Digest.encodeMD5Hex())
|
||||
RUN sed -i "/adminAccountBase64MD5=/c\adminAccountBase64MD5=MD5:8cffbc0d66567a0987a4aba1ec46d63c" /opt/yacy_search_server/defaults/yacy.init && \
|
||||
@ -31,16 +37,13 @@ RUN sed -i "/adminAccountBase64MD5=/c\adminAccountBase64MD5=MD5:8cffbc0d66567a09
|
||||
|
||||
|
||||
## build dist
|
||||
FROM eclipse-temurin:11-jre-jammy
|
||||
FROM base
|
||||
LABEL maintainer="Michael Peter Christen <mc@yacy.net>"
|
||||
|
||||
RUN apt-get update && apt-get install -yq wkhtmltopdf imagemagick xvfb ghostscript && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN adduser --system --group --no-create-home --disabled-password yacy
|
||||
WORKDIR /opt
|
||||
COPY . /opt/yacy_search_server/
|
||||
COPY --from=appbuilder /opt/yacy_search_server /opt/yacy_search_server
|
||||
COPY --chown=yacy:yacy --from=appbuilder /opt/yacy_search_server /opt/yacy_search_server
|
||||
|
||||
RUN adduser --system --group --no-create-home --disabled-password yacy && chown yacy:yacy -R /opt/yacy_search_server
|
||||
EXPOSE 8090 8443
|
||||
VOLUME ["/opt/yacy_search_server/DATA"]
|
||||
USER yacy
|
||||
|
@ -5,7 +5,7 @@ FROM arm64v8/openjdk:17-buster
|
||||
|
||||
# Install needed packages not in base image
|
||||
# (curl for sh scripts in /bin, and wkhtmltopdf,imagemagick,xvfb and ghostscript to enable PDF and image snapshot generation)
|
||||
RUN apt-get update && apt-get install -yq curl wkhtmltopdf imagemagick xvfb ghostscript && \
|
||||
RUN apt-get update && apt-get install -yq curl wkhtmltopdf imagemagick xvfb ghostscript ca-certificates-java && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# trace java version
|
||||
|
28
ivy.xml
28
ivy.xml
@ -72,22 +72,22 @@
|
||||
<dependency org="org.codehaus.woodstox" name="woodstox-core-asl" rev="4.4.1">
|
||||
<exclude module="stax-api" />
|
||||
</dependency>
|
||||
<dependency org="org.eclipse.jetty" name="jetty-client" rev="9.4.52.v20230823" />
|
||||
<dependency org="org.eclipse.jetty" name="jetty-deploy" rev="9.4.52.v20230823" conf="compile->master" />
|
||||
<dependency org="org.eclipse.jetty" name="jetty-jmx" rev="9.4.52.v20230823" conf="compile->master"/>
|
||||
<dependency org="org.eclipse.jetty" name="jetty-http" rev="9.4.52.v20230823"/>
|
||||
<dependency org="org.eclipse.jetty" name="jetty-proxy" rev="9.4.52.v20230823"/>
|
||||
<dependency org="org.eclipse.jetty" name="jetty-security" rev="9.4.52.v20230823"/>
|
||||
<dependency org="org.eclipse.jetty" name="jetty-server" rev="9.4.52.v20230823"/>
|
||||
<dependency org="org.eclipse.jetty" name="jetty-servlets" rev="9.4.52.v20230823"/>
|
||||
<dependency org="org.eclipse.jetty" name="jetty-servlet" rev="9.4.52.v20230823">
|
||||
<dependency org="org.eclipse.jetty" name="jetty-client" rev="9.4.54.v20240208" />
|
||||
<dependency org="org.eclipse.jetty" name="jetty-deploy" rev="9.4.54.v20240208" conf="compile->master" />
|
||||
<dependency org="org.eclipse.jetty" name="jetty-jmx" rev="9.4.54.v20240208" conf="compile->master"/>
|
||||
<dependency org="org.eclipse.jetty" name="jetty-http" rev="9.4.54.v20240208"/>
|
||||
<dependency org="org.eclipse.jetty" name="jetty-proxy" rev="9.4.54.v20240208"/>
|
||||
<dependency org="org.eclipse.jetty" name="jetty-security" rev="9.4.54.v20240208"/>
|
||||
<dependency org="org.eclipse.jetty" name="jetty-server" rev="9.4.54.v20240208"/>
|
||||
<dependency org="org.eclipse.jetty" name="jetty-servlets" rev="9.4.54.v20240208"/>
|
||||
<dependency org="org.eclipse.jetty" name="jetty-servlet" rev="9.4.54.v20240208">
|
||||
<exclude module="jetty-util-ajax" />
|
||||
</dependency>
|
||||
<dependency org="org.eclipse.jetty" name="jetty-util" rev="9.4.52.v20230823" />
|
||||
<dependency org="org.eclipse.jetty" name="jetty-webapp" rev="9.4.52.v20230823" />
|
||||
<dependency org="org.eclipse.jetty.http2" name="http2-client" rev="9.4.52.v20230823" conf="compile->master"/>
|
||||
<dependency org="org.eclipse.jetty.http2" name="http2-common" rev="9.4.52.v20230823" conf="compile->master"/>
|
||||
<dependency org="org.eclipse.jetty.http2" name="http2-http-client-transport" rev="9.4.52.v20230823" conf="compile->master"/>
|
||||
<dependency org="org.eclipse.jetty" name="jetty-util" rev="9.4.54.v20240208" />
|
||||
<dependency org="org.eclipse.jetty" name="jetty-webapp" rev="9.4.54.v20240208" />
|
||||
<dependency org="org.eclipse.jetty.http2" name="http2-client" rev="9.4.54.v20240208" conf="compile->master"/>
|
||||
<dependency org="org.eclipse.jetty.http2" name="http2-common" rev="9.4.54.v20240208" conf="compile->master"/>
|
||||
<dependency org="org.eclipse.jetty.http2" name="http2-http-client-transport" rev="9.4.54.v20240208" conf="compile->master"/>
|
||||
<dependency org="org.jsoup" name="jsoup" rev="1.15.3" />
|
||||
<dependency org="org.jwat" name="jwat-warc" rev="1.1.3" />
|
||||
<dependency org="org.locationtech.spatial4j" name="spatial4j" rev="0.8"/>
|
||||
|
@ -148,6 +148,11 @@ public class JsonListImporter extends Thread implements Importer {
|
||||
}
|
||||
if ((json.opt("index") != null && json.length() == 1) || json.length() == 0) continue;
|
||||
final SolrInputDocument surrogate = new SolrInputDocument();
|
||||
|
||||
// set default values which act as constraints for a proper search
|
||||
CollectionSchema.httpstatus_i.add(surrogate, 200);
|
||||
|
||||
// get fields for json object
|
||||
jsonreader: for (final String key: json.keySet()) {
|
||||
final Object o = json.opt(key);
|
||||
if (o == null) continue;
|
||||
@ -212,10 +217,19 @@ public class JsonListImporter extends Thread implements Importer {
|
||||
final String id = ASCII.String(durl.hash());
|
||||
surrogate.setField(CollectionSchema.sku.getSolrFieldName(), durl.toNormalform(true));
|
||||
surrogate.setField(CollectionSchema.id.getSolrFieldName(), id);
|
||||
surrogate.setField(CollectionSchema.host_s.getSolrFieldName(), durl.getHost());
|
||||
surrogate.setField(CollectionSchema.host_id_s.getSolrFieldName(), id.substring(6));
|
||||
continue jsonreader;
|
||||
}
|
||||
if (key.equals("description")) {
|
||||
// in YaCy descriptions are full-text indexed and also multi-value fields
|
||||
final List<Object> descriptions = new ArrayList<>();
|
||||
descriptions.add(o.toString());
|
||||
CollectionSchema.description_txt.add(surrogate, descriptions);
|
||||
continue jsonreader;
|
||||
}
|
||||
if (key.equals("referrer_url_s")) {
|
||||
// same patch as for urls which require re-calculation of id's; in this case we store the id only!
|
||||
final DigestURL durl = new DigestURL(o.toString());
|
||||
final String id = ASCII.String(durl.hash());
|
||||
surrogate.setField(CollectionSchema.referrer_id_s.getSolrFieldName(), id);
|
||||
@ -236,6 +250,12 @@ public class JsonListImporter extends Thread implements Importer {
|
||||
continue jsonreader;
|
||||
}
|
||||
|
||||
// check if required fields are still missing and compute them
|
||||
if (!surrogate.containsKey(CollectionSchema.host_s.getSolrFieldName())) {
|
||||
final DigestURL durl = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
|
||||
surrogate.setField(CollectionSchema.host_s.getSolrFieldName(), durl.getHost());
|
||||
}
|
||||
|
||||
// regular situation, just read content of field
|
||||
surrogate.setField(key, o.toString());
|
||||
}
|
||||
|
@ -2325,7 +2325,10 @@ public final class Switchboard extends serverSwitch {
|
||||
|| s.endsWith(".xml.zip")
|
||||
|| s.endsWith(".warc")
|
||||
|| s.endsWith(".warc.gz")
|
||||
|| s.endsWith(".jsonl")
|
||||
|| s.endsWith(".jsonl.gz")
|
||||
|| s.endsWith(".jsonlist")
|
||||
|| s.endsWith(".jsonlist.gz")
|
||||
|| s.endsWith(".flatjson") ) {
|
||||
count++;
|
||||
}
|
||||
@ -3167,9 +3170,9 @@ public final class Switchboard extends serverSwitch {
|
||||
}
|
||||
|
||||
// check mustmatch pattern
|
||||
Pattern mustmatchurl = profile.indexUrlMustMatchPattern();
|
||||
final Pattern mustmatchurl = profile.indexUrlMustMatchPattern();
|
||||
if (mustmatchurl != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchurl.matcher(urls).matches()) {
|
||||
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern();
|
||||
final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern();
|
||||
if (this.log.isInfo()) this.log.info(info);
|
||||
// create a new errorURL DB entry
|
||||
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
|
||||
@ -3177,9 +3180,9 @@ public final class Switchboard extends serverSwitch {
|
||||
}
|
||||
|
||||
// check mustnotmatch
|
||||
Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern();
|
||||
final Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern();
|
||||
if (mustnotmatchurl != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchurl.matcher(urls).matches()) {
|
||||
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl;
|
||||
final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl;
|
||||
if (this.log.isInfo()) this.log.info(info);
|
||||
// create a new errorURL DB entry
|
||||
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
|
||||
@ -3192,13 +3195,13 @@ public final class Switchboard extends serverSwitch {
|
||||
|
||||
// check canonical
|
||||
if (profile.noindexWhenCanonicalUnequalURL()) {
|
||||
AnchorURL canonical = document.getCanonical();
|
||||
DigestURL source = document.dc_source();
|
||||
final AnchorURL canonical = document.getCanonical();
|
||||
final DigestURL source = document.dc_source();
|
||||
if (canonical != null && source != null) {
|
||||
String canonical_norm = canonical.toNormalform(true);
|
||||
String source_norm = source.toNormalform(true);
|
||||
final String canonical_norm = canonical.toNormalform(true);
|
||||
final String source_norm = source.toNormalform(true);
|
||||
if (!canonical_norm.equals(source_norm)) {
|
||||
String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " +canonical_norm + "; source = " + source_norm;
|
||||
final String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " +canonical_norm + "; source = " + source_norm;
|
||||
if (this.log.isInfo()) this.log.info(info);
|
||||
// create a new errorURL DB entry
|
||||
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
|
||||
@ -3216,9 +3219,9 @@ public final class Switchboard extends serverSwitch {
|
||||
}
|
||||
|
||||
// check content pattern must-match
|
||||
Pattern mustmatchcontent = profile.indexContentMustMatchPattern();
|
||||
final Pattern mustmatchcontent = profile.indexContentMustMatchPattern();
|
||||
if (mustmatchcontent != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchcontent.matcher(document.getTextString()).matches()) {
|
||||
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern() ;
|
||||
final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern() ;
|
||||
if (this.log.isInfo()) this.log.info(info);
|
||||
// create a new errorURL DB entry
|
||||
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
|
||||
@ -3226,9 +3229,9 @@ public final class Switchboard extends serverSwitch {
|
||||
}
|
||||
|
||||
// check content pattern must-not-match
|
||||
Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern();
|
||||
final Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern();
|
||||
if (mustnotmatchcontent != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchcontent.matcher(document.getTextString()).matches()) {
|
||||
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern();
|
||||
final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern();
|
||||
if (this.log.isInfo()) this.log.info(info);
|
||||
// create a new errorURL DB entry
|
||||
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
|
||||
|
Loading…
x
Reference in New Issue
Block a user