mirror of
https://github.com/yacy/yacy_search_server.git
synced 2025-07-22 09:14:38 -04:00
Added url_file_name_s to the default collection schema; it holds the file name
without the file extension. The file name is no longer part of the multi-valued field url_paths_sxt, so the last element of that path list is now the last folder instead of the file name. The same applies to the new fields source_file_name_s and target_file_name_s in the webgraph schema.
This commit is contained in:
defaults
htroot
source/net/yacy
cora
document
federate
solr
crawler
data
ymark
document
search
@ -334,12 +334,15 @@ underline_txt
|
||||
## the protocol of the url
|
||||
url_protocol_s
|
||||
|
||||
## all path elements in the url
|
||||
url_paths_sxt
|
||||
## the file name (which is the string after the last '/' and before the query part from '?' on) without the file extension
|
||||
url_file_name_s
|
||||
|
||||
## the file name extension
|
||||
url_file_ext_s
|
||||
|
||||
## all path elements in the url hpath (see: http://www.ietf.org/rfc/rfc1738.txt) without the file name
|
||||
url_paths_sxt
|
||||
|
||||
## number of key-value pairs in search part of the url
|
||||
#url_parameter_i
|
||||
|
||||
|
@ -41,6 +41,9 @@ source_id_s
|
||||
## the url without the protocol (source)
|
||||
#source_urlstub_s
|
||||
|
||||
## the file name without the extension (source)
|
||||
#source_file_name_s
|
||||
|
||||
## the file name extension (source)
|
||||
#source_file_ext_s
|
||||
|
||||
@ -53,7 +56,7 @@ source_id_s
|
||||
## count of all path elements in the url (source)
|
||||
#source_path_folders_count_i
|
||||
|
||||
## all path elements in the url (source)
|
||||
## all path elements in the url without the file name (source)
|
||||
#source_path_folders_sxt
|
||||
|
||||
## number of key-value pairs in search part of the url (source)
|
||||
@ -132,6 +135,9 @@ target_protocol_s
|
||||
## the url without the protocol (target)
|
||||
target_urlstub_s
|
||||
|
||||
## the file name without the extension (target)
|
||||
target_file_name_s
|
||||
|
||||
## the file name extension (target)
|
||||
target_file_ext_s
|
||||
|
||||
@ -144,7 +150,7 @@ target_file_ext_s
|
||||
## count of all path elements in the url (target)
|
||||
#target_path_folders_count_i
|
||||
|
||||
## all path elements in the url (target)
|
||||
## all path elements in the url without the file name (target)
|
||||
target_path_folders_sxt
|
||||
|
||||
## number of key-value pairs in search part of the url (target)
|
||||
|
@ -185,7 +185,7 @@ public class ViewFile {
|
||||
}
|
||||
|
||||
final String[] wordArray = wordArray(post.get("words", null));
|
||||
|
||||
final String ext = MultiProtocolURI.getFileExtension(url.getFileName());
|
||||
if (viewMode.equals("plain")) {
|
||||
|
||||
// TODO: how to handle very large files here ?
|
||||
@ -209,7 +209,6 @@ public class ViewFile {
|
||||
|
||||
} else if (viewMode.equals("iframeCache")) {
|
||||
prop.put("viewMode", VIEW_MODE_AS_IFRAME_FROM_CACHE);
|
||||
final String ext = url.getFileExtension();
|
||||
prop.put("viewMode_png", 0);
|
||||
prop.put("viewMode_html", 0);
|
||||
if (ext.length() > 0 && "jpg.jpeg.png.gif".indexOf(ext) >= 0) {
|
||||
@ -389,7 +388,7 @@ public class ViewFile {
|
||||
prop.put("error_md5", urlEntry.md5());
|
||||
prop.put("error_lat", urlEntry.lat());
|
||||
prop.put("error_lon", urlEntry.lon());
|
||||
prop.put("error_doctype", Response.doctype2mime(url.getFileExtension(), urlEntry.doctype()));
|
||||
prop.put("error_doctype", Response.doctype2mime(ext, urlEntry.doctype()));
|
||||
prop.put("error_language", urlEntry.language());
|
||||
prop.put("error_flags", urlEntry.flags().toString());
|
||||
prop.put("error_wordCount", urlEntry.wordCount());
|
||||
|
@ -29,6 +29,7 @@ import java.util.List;
|
||||
|
||||
import net.yacy.cora.date.GenericFormatter;
|
||||
import net.yacy.cora.document.ASCII;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.document.RSSMessage;
|
||||
import net.yacy.cora.document.analysis.Classification;
|
||||
import net.yacy.cora.document.analysis.Classification.ContentDomain;
|
||||
@ -189,6 +190,7 @@ public class yacysearchitem {
|
||||
// prop.putHTML("content_value", Interaction.TripleGet(result.urlstring(), "http://virtual.x/hasvalue", "anonymous"));
|
||||
// END interaction
|
||||
|
||||
String resultFileName = resultURL.getFileName();
|
||||
prop.putHTML("content_target", target);
|
||||
if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
|
||||
prop.putHTML("content_faviconCode", URLLicense.aquireLicense(faviconURL)); // acquire license for favicon url loading
|
||||
@ -210,7 +212,7 @@ public class yacysearchitem {
|
||||
prop.putHTML("content_sizename", RSSMessage.sizename(result.filesize()));
|
||||
prop.putHTML("content_showSize_sizename", RSSMessage.sizename(result.filesize()));
|
||||
prop.putHTML("content_host", resultURL.getHost() == null ? "" : resultURL.getHost());
|
||||
prop.putHTML("content_file", resultURL.getFileName());
|
||||
prop.putHTML("content_file", resultFileName);
|
||||
prop.putHTML("content_path", resultURL.getPath());
|
||||
prop.put("content_nl", (item == theSearch.query.offset) ? 0 : 1);
|
||||
prop.putHTML("content_publisher", result.publisher());
|
||||
@ -243,7 +245,7 @@ public class yacysearchitem {
|
||||
prop.put("content_heuristic_name", heuristic.heuristicName);
|
||||
}
|
||||
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(theSearch.query.id(true), SearchEventType.FINALIZATION, "" + item, 0, 0), false);
|
||||
final String ext = resultURL.getFileExtension().toLowerCase();
|
||||
final String ext = MultiProtocolURI.getFileExtension(resultFileName).toLowerCase();
|
||||
if (ext.equals("png") || ext.equals("jpg") || ext.equals("gif")) {
|
||||
final String license = URLLicense.aquireLicense(resultURL);
|
||||
prop.put("content_code", license);
|
||||
|
@ -269,7 +269,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
|
||||
|
||||
public final ContentDomain getContentDomain() {
|
||||
if (this.contentDomain == null) {
|
||||
this.contentDomain = Classification.getContentDomain(this.getFileExtension());
|
||||
this.contentDomain = Classification.getContentDomain(getFileExtension(this.getFileName()));
|
||||
}
|
||||
return this.contentDomain;
|
||||
}
|
||||
@ -711,14 +711,10 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
|
||||
return this.path.substring(p + 1); // the 'real' file name
|
||||
}
|
||||
|
||||
public String getFileExtension() {
|
||||
return getFileExtension(getFileName());
|
||||
}
|
||||
|
||||
public static String getFileExtension(final String fileName) {
|
||||
final int p = fileName.lastIndexOf('.');
|
||||
if (p < 0) return "";
|
||||
return fileName.substring(p + 1);
|
||||
return fileName.substring(p + 1).toLowerCase();
|
||||
}
|
||||
|
||||
public String getPath() {
|
||||
@ -726,7 +722,12 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
|
||||
}
|
||||
|
||||
public String[] getPaths() {
|
||||
return this.path == null ? null : this.path.charAt(0) == '/' ? CommonPattern.SLASH.split(this.path.substring(1)) : CommonPattern.SLASH.split(this.path);
|
||||
String s = this.path == null ? "" : this.path.charAt(0) == '/' ? this.path.substring(1) : this.path;
|
||||
int p = s.lastIndexOf('/');
|
||||
if (p < 0) return new String[0];
|
||||
s = s.substring(0, p); // the paths do not contain the last part, which is considered as the getFileName() part.
|
||||
String[] paths = CommonPattern.SLASH.split(s);
|
||||
return paths;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -973,15 +974,12 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
|
||||
return (this.searchpart != null) && (this.searchpart.length() > 0);
|
||||
}
|
||||
|
||||
public final boolean isCGI() {
|
||||
final String ls = unescape(this.path.toLowerCase());
|
||||
return ls.indexOf(".cgi",0) >= 0 ||
|
||||
ls.indexOf(".exe",0) >= 0;
|
||||
public static final boolean isCGI(final String extension) {
|
||||
return "cgi.exe.jpg.jpeg".indexOf(extension.toLowerCase()) >= 0;
|
||||
}
|
||||
|
||||
public final boolean isImage() {
|
||||
final String ext = getFileExtension().toLowerCase();
|
||||
return "png.gif.jpg.jpeg".indexOf(ext) >= 0;
|
||||
public static final boolean isImage(final String extension) {
|
||||
return "png.gif.jpg.jpeg".indexOf(extension.toLowerCase()) >= 0;
|
||||
}
|
||||
|
||||
public final boolean isIndividual() {
|
||||
|
@ -201,10 +201,10 @@ public class Classification {
|
||||
}
|
||||
|
||||
public static String url2mime(final MultiProtocolURI url, final String dfltMime) {
|
||||
return url == null ? "application/octet-stream" : ext2mime(url.getFileExtension(), dfltMime);
|
||||
return url == null ? "application/octet-stream" : ext2mime(MultiProtocolURI.getFileExtension(url.getFileName()), dfltMime);
|
||||
}
|
||||
|
||||
public static String url2mime(final MultiProtocolURI url) {
|
||||
return url == null ? "application/octet-stream" : ext2mime(url.getFileExtension());
|
||||
return url == null ? "application/octet-stream" : ext2mime(MultiProtocolURI.getFileExtension(url.getFileName()));
|
||||
}
|
||||
}
|
||||
|
@ -146,43 +146,43 @@ public class SchemaConfiguration extends Configuration implements Serializable {
|
||||
}
|
||||
|
||||
public void add(final SolrInputDocument doc, final SchemaDeclaration key, final String value) {
|
||||
assert !key.isMultiValued();
|
||||
assert !key.isMultiValued() : "key = " + key.getSolrFieldName();
|
||||
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) key.add(doc, value);
|
||||
}
|
||||
|
||||
public void add(final SolrInputDocument doc, final SchemaDeclaration key, final Date value) {
|
||||
assert !key.isMultiValued();
|
||||
assert !key.isMultiValued() : "key = " + key.getSolrFieldName();
|
||||
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.getTime() > 0))) key.add(doc, value);
|
||||
}
|
||||
|
||||
public void add(final SolrInputDocument doc, final SchemaDeclaration key, final String[] value) {
|
||||
assert key.isMultiValued();
|
||||
assert key.isMultiValued() : "key = " + key.getSolrFieldName();
|
||||
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length > 0))) key.add(doc, value);
|
||||
}
|
||||
|
||||
public void add(final SolrInputDocument doc, final SchemaDeclaration key, final Integer[] value) {
|
||||
assert key.isMultiValued();
|
||||
assert key.isMultiValued() : "key = " + key.getSolrFieldName();
|
||||
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length > 0))) key.add(doc, value);
|
||||
}
|
||||
|
||||
public void add(final SolrInputDocument doc, final SchemaDeclaration key, final List<?> values) {
|
||||
assert key.isMultiValued();
|
||||
assert key.isMultiValued() : "key = " + key.getSolrFieldName();
|
||||
if ((isEmpty() || contains(key)) && (!this.lazy || (values != null && !values.isEmpty()))) key.add(doc, values);
|
||||
}
|
||||
|
||||
public void add(final SolrInputDocument doc, final SchemaDeclaration key, final int value) {
|
||||
assert !key.isMultiValued();
|
||||
assert !key.isMultiValued() : "key = " + key.getSolrFieldName();
|
||||
if ((isEmpty() || contains(key)) && (!this.lazy || value != 0)) key.add(doc, value);
|
||||
}
|
||||
|
||||
public void add(final SolrInputDocument doc, final SchemaDeclaration key, final long value) {
|
||||
assert !key.isMultiValued();
|
||||
assert !key.isMultiValued() : "key = " + key.getSolrFieldName();
|
||||
if ((isEmpty() || contains(key)) && (!this.lazy || value != 0)) key.add(doc, value);
|
||||
}
|
||||
|
||||
public void add(final SolrInputDocument doc, final SchemaDeclaration key, final boolean value) {
|
||||
assert !key.isMultiValued();
|
||||
if (isEmpty() || contains(key)) key.add(doc, value);
|
||||
assert !key.isMultiValued() : "key = " + key.getSolrFieldName();
|
||||
if ((isEmpty() || contains(key)) && (!this.lazy || value)) key.add(doc, value);
|
||||
}
|
||||
|
||||
public static Date getDate(SolrInputDocument doc, final SchemaDeclaration key) {
|
||||
|
@ -148,14 +148,16 @@ public class JsonResponseWriter implements QueryResponseWriter {
|
||||
solitaireTag(writer, stag, value.stringValue());
|
||||
continue;
|
||||
}
|
||||
|
||||
// some special handling here
|
||||
if (CollectionSchema.sku.getSolrFieldName().equals(fieldName)) {
|
||||
String u = value.stringValue();
|
||||
try {
|
||||
url = new MultiProtocolURI(u);
|
||||
String filename = url.getFileName();
|
||||
solitaireTag(writer, "link", u);
|
||||
solitaireTag(writer, "file", url.getFileName());
|
||||
solitaireTag(writer, "file", filename);
|
||||
// get image license
|
||||
if (MultiProtocolURI.isImage(filename)) URLLicense.aquireLicense(urlhash, url.toNormalform(true));
|
||||
} catch (MalformedURLException e) {}
|
||||
continue;
|
||||
}
|
||||
@ -206,9 +208,6 @@ public class JsonResponseWriter implements QueryResponseWriter {
|
||||
//missing: "code","faviconCode"
|
||||
}
|
||||
|
||||
// get image license
|
||||
if (url.isImage()) URLLicense.aquireLicense(urlhash, url.toNormalform(true));
|
||||
|
||||
// compute snippet from texts
|
||||
solitaireTag(writer, "path", path.toString());
|
||||
solitaireTag(writer, "title", title.length() == 0 ? (texts.size() == 0 ? path.toString() : texts.get(0)) : title);
|
||||
|
@ -204,7 +204,7 @@ public class Latency {
|
||||
// for CGI accesses, we double the minimum time
|
||||
// mostly there is a database access in the background
|
||||
// which creates a lot of unwanted IO on target site
|
||||
if (url.isCGI()) waiting = waiting * 2;
|
||||
if (MultiProtocolURI.isCGI(url.getFileName())) waiting = waiting * 2;
|
||||
|
||||
// if we have accessed the domain many times, get slower (the flux factor)
|
||||
if (!local) waiting += host.flux(waiting);
|
||||
@ -238,7 +238,7 @@ public class Latency {
|
||||
// for CGI accesses, we double the minimum time
|
||||
// mostly there is a database access in the background
|
||||
// which creates a lot of unwanted IO on target site
|
||||
if (url.isCGI()) { waiting = waiting * 2; s.append(", isCGI = true -> double"); }
|
||||
if (MultiProtocolURI.isCGI(url.getFileName())) { waiting = waiting * 2; s.append(", isCGI = true -> double"); }
|
||||
|
||||
// if we have accessed the domain many times, get slower (the flux factor)
|
||||
int flux = host.flux(waiting);
|
||||
|
@ -74,7 +74,7 @@ public class ResultImages {
|
||||
image.height() > 100 &&
|
||||
image.width() < 1200 &&
|
||||
image.height() < 1000 &&
|
||||
!"gif".equals(image.url().getFileExtension())) {
|
||||
!"gif".equals(MultiProtocolURI.getFileExtension(image.url().getFileName()))) {
|
||||
// && ((urlString.lastIndexOf(".jpg") != -1)) ||
|
||||
// ((urlString.lastIndexOf(".png") != -1)){
|
||||
|
||||
|
@ -31,6 +31,7 @@ import java.util.Date;
|
||||
import java.util.List;
|
||||
|
||||
import net.yacy.cora.document.ASCII;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.document.UTF8;
|
||||
import net.yacy.cora.document.analysis.Classification;
|
||||
import net.yacy.cora.protocol.HeaderFramework;
|
||||
@ -94,7 +95,7 @@ public class FileLoader {
|
||||
}
|
||||
|
||||
// create response header
|
||||
String mime = Classification.ext2mime(url.getFileExtension());
|
||||
String mime = Classification.ext2mime(MultiProtocolURI.getFileExtension(url.getFileName()));
|
||||
ResponseHeader responseHeader = new ResponseHeader(200);
|
||||
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
|
||||
responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
|
||||
|
@ -70,7 +70,7 @@ public class Response {
|
||||
|
||||
// doctype calculation
|
||||
public static char docType(final MultiProtocolURI url) {
|
||||
String ext = url.getFileExtension();
|
||||
String ext = MultiProtocolURI.getFileExtension(url.getFileName());
|
||||
if (ext == null) return DT_UNKNOWN;
|
||||
if (ext.equals(".gif")) return DT_IMAGE;
|
||||
if (ext.equals(".ico")) return DT_IMAGE;
|
||||
@ -169,7 +169,7 @@ public class Response {
|
||||
// request and response headers may be zero in case that we process surrogates
|
||||
this.requestHeader = new RequestHeader();
|
||||
this.responseHeader = new ResponseHeader(200);
|
||||
this.responseHeader.put(HeaderFramework.CONTENT_TYPE, Classification.ext2mime(request.url().getFileExtension(), "text/plain")); // tell parser how to handle the content
|
||||
this.responseHeader.put(HeaderFramework.CONTENT_TYPE, Classification.ext2mime(MultiProtocolURI.getFileExtension(request.url().getFileName()), "text/plain")); // tell parser how to handle the content
|
||||
if (!request.isEmpty()) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size()));
|
||||
this.profile = profile;
|
||||
this.status = QUEUE_STATE_FRESH;
|
||||
@ -291,7 +291,7 @@ public class Response {
|
||||
return "dynamic_post";
|
||||
}
|
||||
|
||||
if (url().isCGI()) {
|
||||
if (MultiProtocolURI.isCGI(MultiProtocolURI.getFileExtension(url().getFileName()))) {
|
||||
return "dynamic_cgi";
|
||||
}
|
||||
|
||||
@ -390,7 +390,7 @@ public class Response {
|
||||
if (url().isPOST()) {
|
||||
return false;
|
||||
}
|
||||
if (url().isCGI()) {
|
||||
if (MultiProtocolURI.isCGI(MultiProtocolURI.getFileExtension(url().getFileName()))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -541,7 +541,7 @@ public class Response {
|
||||
if (url().isPOST()) {
|
||||
return "Dynamic_(POST)";
|
||||
}
|
||||
if (url().isCGI()) {
|
||||
if (MultiProtocolURI.isCGI(MultiProtocolURI.getFileExtension(url().getFileName()))) {
|
||||
return "Dynamic_(CGI)";
|
||||
}
|
||||
}
|
||||
@ -684,7 +684,7 @@ public class Response {
|
||||
// CGI access makes the page very individual, and therefore not usable in caches
|
||||
if (!profile().crawlingQ()) {
|
||||
if (url().isPOST()) { return "Dynamic_(POST)"; }
|
||||
if (url().isCGI()) { return "Dynamic_(CGI)"; }
|
||||
if (MultiProtocolURI.isCGI(MultiProtocolURI.getFileExtension(url().getFileName()))) { return "Dynamic_(CGI)"; }
|
||||
}
|
||||
|
||||
// -authorization cases in request
|
||||
|
@ -113,7 +113,7 @@ public class SMBLoader {
|
||||
}
|
||||
|
||||
// create response header
|
||||
String mime = Classification.ext2mime(url.getFileExtension());
|
||||
String mime = Classification.ext2mime(MultiProtocolURI.getFileExtension(url.getFileName()));
|
||||
ResponseHeader responseHeader = new ResponseHeader(200);
|
||||
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
|
||||
responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
|
||||
|
@ -10,6 +10,7 @@ import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.federate.yacy.CacheStrategy;
|
||||
import net.yacy.cora.protocol.ClientIdentification;
|
||||
import net.yacy.crawler.retrieval.Response;
|
||||
@ -161,7 +162,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
|
||||
}
|
||||
final String clean = YMarkUtil.cleanTagsString(buffer.toString());
|
||||
if(clean.equals(YMarkEntry.BOOKMARK.TAGS.deflt())) {
|
||||
return document.getFileExtension();
|
||||
return MultiProtocolURI.getFileExtension(document.dc_source().getFileName());
|
||||
}
|
||||
return clean;
|
||||
} finally {
|
||||
|
@ -153,8 +153,8 @@ public class Document {
|
||||
return this.languages;
|
||||
}
|
||||
|
||||
public String getFileExtension() {
|
||||
return this.source.getFileExtension();
|
||||
public String getFileName() {
|
||||
return this.source.getFileName();
|
||||
}
|
||||
|
||||
public Map<String, Set<String>> getGenericFacets() {
|
||||
|
@ -90,7 +90,7 @@ public class LibraryProvider {
|
||||
|
||||
private Dictionary(final String nickname, final String url) {
|
||||
try {
|
||||
this.filename = new MultiProtocolURI(url).getFileName();
|
||||
this.filename = (new MultiProtocolURI(url)).getFileName();
|
||||
} catch ( final MalformedURLException e ) {
|
||||
assert false;
|
||||
}
|
||||
|
@ -194,7 +194,7 @@ public final class TextParser {
|
||||
try {
|
||||
idioms = parsers(location, mimeType);
|
||||
} catch (final Parser.Failure e) {
|
||||
final String errorMsg = "Parser Failure for extension '" + location.getFileExtension() + "' or mimetype '" + mimeType + "': " + e.getMessage();
|
||||
final String errorMsg = "Parser Failure for extension '" + MultiProtocolURI.getFileExtension(location.getFileName()) + "' or mimetype '" + mimeType + "': " + e.getMessage();
|
||||
AbstractParser.log.logWarning(errorMsg);
|
||||
throw new Parser.Failure(errorMsg, location);
|
||||
}
|
||||
@ -218,7 +218,7 @@ public final class TextParser {
|
||||
try {
|
||||
idioms = parsers(location, mimeType);
|
||||
} catch (final Parser.Failure e) {
|
||||
final String errorMsg = "Parser Failure for extension '" + location.getFileExtension() + "' or mimetype '" + mimeType + "': " + e.getMessage();
|
||||
final String errorMsg = "Parser Failure for extension '" + MultiProtocolURI.getFileExtension(location.getFileName()) + "' or mimetype '" + mimeType + "': " + e.getMessage();
|
||||
AbstractParser.log.logWarning(errorMsg);
|
||||
throw new Parser.Failure(errorMsg, location);
|
||||
}
|
||||
@ -252,7 +252,7 @@ public final class TextParser {
|
||||
final InputStream sourceStream
|
||||
) throws Parser.Failure {
|
||||
if (AbstractParser.log.isFine()) AbstractParser.log.logFine("Parsing '" + location + "' from stream");
|
||||
final String fileExt = location.getFileExtension();
|
||||
final String fileExt = MultiProtocolURI.getFileExtension(location.getFileName());
|
||||
final String documentCharset = htmlParser.patchCharsetEncoding(charset);
|
||||
assert parser != null;
|
||||
|
||||
@ -272,7 +272,7 @@ public final class TextParser {
|
||||
final String charset,
|
||||
final byte[] sourceArray
|
||||
) throws Parser.Failure {
|
||||
final String fileExt = location.getFileExtension();
|
||||
final String fileExt = MultiProtocolURI.getFileExtension(location.getFileName());
|
||||
if (AbstractParser.log.isFine()) AbstractParser.log.logFine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "' from byte[]");
|
||||
final String documentCharset = htmlParser.patchCharsetEncoding(charset);
|
||||
assert !parsers.isEmpty();
|
||||
@ -312,7 +312,7 @@ public final class TextParser {
|
||||
|
||||
if (docs == null) {
|
||||
if (failedParser.isEmpty()) {
|
||||
final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed.";
|
||||
final String errorMsg = "Parsing content with file extension '" + fileExt + "' and mimetype '" + mimeType + "' failed.";
|
||||
//log.logWarning("Unable to parse '" + location + "'. " + errorMsg);
|
||||
throw new Parser.Failure(errorMsg, location);
|
||||
}
|
||||
@ -362,7 +362,7 @@ public final class TextParser {
|
||||
final Set<Parser> idioms = new HashSet<Parser>(2);
|
||||
|
||||
// check extension
|
||||
String ext = url.getFileExtension();
|
||||
String ext = MultiProtocolURI.getFileExtension(url.getFileName());
|
||||
Set<Parser> idiom;
|
||||
if (ext != null && ext.length() > 0) {
|
||||
ext = ext.toLowerCase();
|
||||
@ -428,11 +428,11 @@ public final class TextParser {
|
||||
* @return an error if the extension is not supported, null otherwise
|
||||
*/
|
||||
public static String supportsExtension(final MultiProtocolURI url) {
|
||||
return supportsExtension(url.getFileExtension().toLowerCase());
|
||||
return supportsExtension(MultiProtocolURI.getFileExtension(url.getFileName()).toLowerCase());
|
||||
}
|
||||
|
||||
public static String mimeOf(final MultiProtocolURI url) {
|
||||
return mimeOf(url.getFileExtension());
|
||||
return mimeOf(MultiProtocolURI.getFileExtension(url.getFileName()));
|
||||
}
|
||||
|
||||
public static String mimeOf(final String ext) {
|
||||
|
@ -72,8 +72,9 @@ public class audioTagParser extends AbstractParser implements Parser {
|
||||
final String charset, final InputStream source)
|
||||
throws Parser.Failure, InterruptedException {
|
||||
|
||||
final String filename = location.getFileName().isEmpty() ? location.toTokens() : MultiProtocolURI.unescape(location.getFileName());
|
||||
final String fileext = '.'+location.getFileExtension();
|
||||
String filename = location.getFileName();
|
||||
final String fileext = '.' + MultiProtocolURI.getFileExtension(filename);
|
||||
filename = filename.isEmpty() ? location.toTokens() : MultiProtocolURI.unescape(filename);
|
||||
String mime = mimeType;
|
||||
|
||||
// fix mimeType
|
||||
@ -190,7 +191,7 @@ public class audioTagParser extends AbstractParser implements Parser {
|
||||
this,
|
||||
null,
|
||||
null,
|
||||
singleList(location.getFileName().isEmpty() ? location.toTokens() : MultiProtocolURI.unescape(location.getFileName())), // title
|
||||
singleList(filename), // title
|
||||
"", // author
|
||||
location.getHost(),
|
||||
null,
|
||||
|
@ -47,7 +47,7 @@ public class genericParser extends AbstractParser implements Parser {
|
||||
public Document[] parse(final DigestURI location, final String mimeType,
|
||||
final String charset, final InputStream source1)
|
||||
throws Parser.Failure, InterruptedException {
|
||||
|
||||
String filename = location.getFileName();
|
||||
final Document[] docs = new Document[]{new Document(
|
||||
location,
|
||||
mimeType,
|
||||
@ -55,7 +55,7 @@ public class genericParser extends AbstractParser implements Parser {
|
||||
this,
|
||||
null,
|
||||
null,
|
||||
singleList(location.getFileName().isEmpty() ? location.toTokens() : MultiProtocolURI.unescape(location.getFileName())), // title
|
||||
singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURI.unescape(filename)), // title
|
||||
"", // author
|
||||
location.getHost(),
|
||||
null,
|
||||
|
@ -473,10 +473,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
||||
final String href = tagopts.getProperty("href", EMPTY_STRING);
|
||||
DigestURI url;
|
||||
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
|
||||
final String f = url.getFileName();
|
||||
final int p = f.lastIndexOf('.');
|
||||
final String type = (p < 0) ? EMPTY_STRING : f.substring(p + 1);
|
||||
if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg") || type.equals("tiff") || type.equals("tif")) {
|
||||
final String ext = MultiProtocolURI.getFileExtension(url.getFileName());
|
||||
if (ext.equals("png") || ext.equals("gif") || ext.equals("jpg") || ext.equals("jpeg") || ext.equals("tiff") || ext.equals("tif")) {
|
||||
// special handling of such urls: put them to the image urls
|
||||
final ImageEntry ie = new ImageEntry(url, recursiveParse(text), -1, -1, -1);
|
||||
addImage(this.images, ie);
|
||||
@ -656,7 +654,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
||||
String ext;
|
||||
ArrayList<DigestURI> f = new ArrayList<DigestURI>();
|
||||
for (final DigestURI url: this.anchors.keySet()) {
|
||||
ext = url.getFileExtension();
|
||||
ext = MultiProtocolURI.getFileExtension(url.getFileName());
|
||||
if (ext == null) continue;
|
||||
if (ext.equals("swf")) f.add(url);
|
||||
}
|
||||
@ -666,7 +664,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
||||
public boolean containsFlash() {
|
||||
String ext;
|
||||
for (final MultiProtocolURI url: this.anchors.keySet()) {
|
||||
ext = url.getFileExtension();
|
||||
ext = MultiProtocolURI.getFileExtension(url.getFileName());
|
||||
if (ext == null) continue;
|
||||
if (ext.equals("swf")) return true;
|
||||
}
|
||||
|
@ -99,8 +99,9 @@ public class genericImageParser extends AbstractParser implements Parser {
|
||||
String author = null;
|
||||
String keywords = null;
|
||||
String description = null;
|
||||
if (mimeType.equals("image/bmp") ||
|
||||
location.getFileExtension().equalsIgnoreCase("bmp")) {
|
||||
String filename = location.getFileName();
|
||||
String ext = MultiProtocolURI.getFileExtension(filename);
|
||||
if (mimeType.equals("image/bmp") || ext.equalsIgnoreCase("bmp")) {
|
||||
byte[] b;
|
||||
try {
|
||||
b = FileUtils.read(sourceStream);
|
||||
@ -110,10 +111,7 @@ public class genericImageParser extends AbstractParser implements Parser {
|
||||
}
|
||||
final IMAGEMAP imap = bmpParser.parse(b);
|
||||
ii = parseJavaImage(location, imap.getImage());
|
||||
} else if (mimeType.equals("image/jpeg") ||
|
||||
location.getFileExtension().equalsIgnoreCase("jpg") ||
|
||||
location.getFileExtension().equalsIgnoreCase("jpeg") ||
|
||||
location.getFileExtension().equalsIgnoreCase("jpe")) {
|
||||
} else if (mimeType.equals("image/jpeg") || ext.equalsIgnoreCase("jpg") || ext.equalsIgnoreCase("jpeg") || ext.equalsIgnoreCase("jpe")) {
|
||||
// use the exif parser from
|
||||
// http://www.drewnoakes.com/drewnoakes.com/code/exif/
|
||||
// javadoc is at: http://www.drewnoakes.com/drewnoakes.com/code/exif/javadoc/
|
||||
@ -190,7 +188,7 @@ public class genericImageParser extends AbstractParser implements Parser {
|
||||
final String infoString = ii.info.toString();
|
||||
images.put(ii.location, new ImageEntry(location, "", ii.width, ii.height, -1));
|
||||
|
||||
if (title == null || title.isEmpty()) title = MultiProtocolURI.unescape(location.getFileName());
|
||||
if (title == null || title.isEmpty()) title = MultiProtocolURI.unescape(filename);
|
||||
|
||||
return new Document[]{new Document(
|
||||
location,
|
||||
@ -297,7 +295,7 @@ public class genericImageParser extends AbstractParser implements Parser {
|
||||
DigestURI uri;
|
||||
try {
|
||||
uri = new DigestURI("http://localhost/" + image.getName());
|
||||
final Document[] document = parser.parse(uri, "image/" + uri.getFileExtension(), "UTF-8", new FileInputStream(image));
|
||||
final Document[] document = parser.parse(uri, "image/" + MultiProtocolURI.getFileExtension(uri.getFileName()), "UTF-8", new FileInputStream(image));
|
||||
System.out.println(document[0].toString());
|
||||
} catch (final MalformedURLException e) {
|
||||
e.printStackTrace();
|
||||
|
@ -33,6 +33,7 @@ import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.document.UTF8;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
@ -64,7 +65,7 @@ public class tarParser extends AbstractParser implements Parser {
|
||||
|
||||
final List<Document> docacc = new ArrayList<Document>();
|
||||
Document[] subDocs = null;
|
||||
final String ext = url.getFileExtension().toLowerCase();
|
||||
final String ext = MultiProtocolURI.getFileExtension(url.getFileName()).toLowerCase();
|
||||
if (ext.equals("gz") || ext.equals("tgz")) {
|
||||
try {
|
||||
source = new GZIPInputStream(source);
|
||||
|
@ -35,10 +35,10 @@ import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
@ -79,6 +79,7 @@ import net.yacy.kelondro.util.Bitfield;
|
||||
import net.yacy.search.index.Segment;
|
||||
import net.yacy.search.index.Segment.ReferenceReport;
|
||||
import net.yacy.search.index.Segment.ReferenceReportCache;
|
||||
import net.yacy.search.schema.WebgraphConfiguration.Subgraph;
|
||||
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
@ -256,8 +257,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
||||
add(doc, CollectionSchema.description_words_val, cv);
|
||||
}
|
||||
|
||||
String filename = digestURI.getFileName();
|
||||
String extension = MultiProtocolURI.getFileExtension(filename);
|
||||
if (allAttr || contains(CollectionSchema.author)) add(doc, CollectionSchema.author, md.dc_creator());
|
||||
if (allAttr || contains(CollectionSchema.content_type)) add(doc, CollectionSchema.content_type, Response.doctype2mime(digestURI.getFileExtension(), md.doctype()));
|
||||
if (allAttr || contains(CollectionSchema.content_type)) add(doc, CollectionSchema.content_type, Response.doctype2mime(extension, md.doctype()));
|
||||
if (allAttr || contains(CollectionSchema.last_modified)) add(doc, CollectionSchema.last_modified, md.moddate());
|
||||
if (allAttr || contains(CollectionSchema.wordcount_i)) add(doc, CollectionSchema.wordcount_i, md.wordCount());
|
||||
|
||||
@ -274,7 +277,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
||||
|
||||
// path elements of link
|
||||
if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURI.getPaths());
|
||||
if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, digestURI.getFileExtension());
|
||||
if (allAttr || contains(CollectionSchema.url_file_name_s)) add(doc, CollectionSchema.url_file_name_s, filename.toLowerCase().endsWith("." + extension) ? filename.substring(0, filename.length() - extension.length() - 1) : filename);
|
||||
if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, extension);
|
||||
|
||||
if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, md.limage());
|
||||
if (allAttr || contains(CollectionSchema.inboundlinkscount_i)) add(doc, CollectionSchema.inboundlinkscount_i, md.llocal());
|
||||
@ -474,8 +478,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
||||
add(doc, CollectionSchema.fuzzy_signature_unique_b, true); // this must be corrected afterwards!
|
||||
|
||||
// path elements of link
|
||||
String filename = digestURI.getFileName();
|
||||
String extension = MultiProtocolURI.getFileExtension(filename);
|
||||
if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURI.getPaths());
|
||||
if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, digestURI.getFileExtension());
|
||||
if (allAttr || contains(CollectionSchema.url_file_name_s)) add(doc, CollectionSchema.url_file_name_s, filename.toLowerCase().endsWith("." + extension) ? filename.substring(0, filename.length() - extension.length() - 1) : filename);
|
||||
if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, extension);
|
||||
|
||||
// get list of all links; they will be shrunk by urls that appear in other fields of the solr schema
|
||||
Set<DigestURI> inboundLinks = document.inboundLinks();
|
||||
@ -695,8 +702,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
||||
outboundLinks.remove(canonical);
|
||||
add(doc, CollectionSchema.canonical_s, canonical.toNormalform(false));
|
||||
// set a flag if this is equal to sku
|
||||
if (contains(CollectionSchema.canonical_equal_sku_b) && canonical.equals(docurl)) {
|
||||
add(doc, CollectionSchema.canonical_equal_sku_b, true);
|
||||
if (contains(CollectionSchema.canonical_equal_sku_b)) {
|
||||
add(doc, CollectionSchema.canonical_equal_sku_b, canonical.equals(docurl));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -784,9 +791,16 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
||||
if (allAttr || contains(CollectionSchema.inboundlinksnofollowcount_i)) add(doc, CollectionSchema.inboundlinksnofollowcount_i, document.inboundLinkNofollowCount());
|
||||
if (allAttr || contains(CollectionSchema.outboundlinkscount_i)) add(doc, CollectionSchema.outboundlinkscount_i, outboundLinks.size());
|
||||
if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount());
|
||||
Map<DigestURI, Properties> alllinks = document.getAnchors();
|
||||
|
||||
// create a subgraph
|
||||
Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
|
||||
//if () {
|
||||
webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, inboundLinks, citations);
|
||||
webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, false, outboundLinks, citations);
|
||||
//}
|
||||
|
||||
// list all links
|
||||
WebgraphConfiguration.Subgraph subgraph = webgraph.edges(digestURI, responseHeader, collections, clickdepth, document.getAnchors(), images, inboundLinks, outboundLinks, citations);
|
||||
doc.webgraphDocuments.addAll(subgraph.edges);
|
||||
if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0]));
|
||||
if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_txt)) add(doc, CollectionSchema.inboundlinks_urlstub_txt, subgraph.urlStubs[0]);
|
||||
@ -1164,8 +1178,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
||||
if (contains(CollectionSchema.load_date_dt)) add(solrdoc, CollectionSchema.load_date_dt, new Date());
|
||||
|
||||
// path elements of link
|
||||
String filename = digestURI.getFileName();
|
||||
String extension = MultiProtocolURI.getFileExtension(filename);
|
||||
if (contains(CollectionSchema.url_paths_sxt)) add(solrdoc, CollectionSchema.url_paths_sxt, digestURI.getPaths());
|
||||
if (contains(CollectionSchema.url_file_ext_s)) add(solrdoc, CollectionSchema.url_file_ext_s, digestURI.getFileExtension());
|
||||
if (contains(CollectionSchema.url_file_name_s)) add(solrdoc, CollectionSchema.url_file_name_s, filename.toLowerCase().endsWith("." + extension) ? filename.substring(0, filename.length() - extension.length() - 1) : filename);
|
||||
if (contains(CollectionSchema.url_file_ext_s)) add(solrdoc, CollectionSchema.url_file_ext_s, extension);
|
||||
|
||||
// fail reason and status
|
||||
if (contains(CollectionSchema.failreason_s)) add(solrdoc, CollectionSchema.failreason_s, failReason);
|
||||
|
@ -152,8 +152,9 @@ public enum CollectionSchema implements SchemaDeclaration {
|
||||
publisher_url_s(SolrType.string, true, true, false, false, false, "publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de"),
|
||||
|
||||
url_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url"),
|
||||
url_paths_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url"),
|
||||
url_file_name_s(SolrType.string, true, true, false, false, false, "the file name (which is the string after the last '/' and before the query part from '?' on) without the file extension"),
|
||||
url_file_ext_s(SolrType.string, true, true, false, false, false, "the file name extension"),
|
||||
url_paths_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url hpath (see: http://www.ietf.org/rfc/rfc1738.txt) without the file name"),
|
||||
url_parameter_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url"),
|
||||
url_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url"),
|
||||
url_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url"),
|
||||
|
@ -42,6 +42,7 @@ import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
|
||||
import net.yacy.cora.document.ASCII;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.federate.solr.ProcessType;
|
||||
import net.yacy.cora.federate.solr.SchemaConfiguration;
|
||||
import net.yacy.cora.federate.solr.SchemaDeclaration;
|
||||
@ -111,31 +112,13 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
||||
}
|
||||
}
|
||||
|
||||
public Subgraph edges(
|
||||
final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth_source,
|
||||
final Map<DigestURI, Properties> alllinks,
|
||||
final Map<DigestURI, ImageEntry> images,
|
||||
final Set<DigestURI> inboundLinks,
|
||||
final Set<DigestURI> outboundLinks,
|
||||
IndexCell<CitationReference> citations
|
||||
) {
|
||||
boolean allAttr = this.isEmpty();
|
||||
Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
|
||||
addEdges(
|
||||
subgraph, source, responseHeader, collections, clickdepth_source,
|
||||
allAttr, alllinks, images, true, inboundLinks, citations);
|
||||
addEdges(
|
||||
subgraph, source, responseHeader, collections, clickdepth_source,
|
||||
allAttr, alllinks, images, false, outboundLinks, citations);
|
||||
return subgraph;
|
||||
}
|
||||
|
||||
private void addEdges(
|
||||
public void addEdges(
|
||||
final Subgraph subgraph,
|
||||
final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth_source,
|
||||
final boolean allAttr, final Map<DigestURI, Properties> alllinks, final Map<DigestURI, ImageEntry> images,
|
||||
final Map<DigestURI, Properties> alllinks, final Map<DigestURI, ImageEntry> images,
|
||||
final boolean inbound, final Set<DigestURI> links,
|
||||
final IndexCell<CitationReference> citations) {
|
||||
boolean allAttr = this.isEmpty();
|
||||
for (final DigestURI target_url: links) {
|
||||
|
||||
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
|
||||
@ -194,7 +177,12 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
||||
if (allAttr || contains(WebgraphSchema.source_host_organizationdnc_s)) add(edge, WebgraphSchema.source_host_organizationdnc_s, orga + '.' + dnc);
|
||||
if (allAttr || contains(WebgraphSchema.source_host_subdomain_s)) add(edge, WebgraphSchema.source_host_subdomain_s, subdom);
|
||||
}
|
||||
if (allAttr || contains(WebgraphSchema.source_file_ext_s)) add(edge, WebgraphSchema.source_file_ext_s, source.getFileExtension());
|
||||
if (allAttr || contains(WebgraphSchema.source_file_ext_s) || contains(WebgraphSchema.source_file_name_s)) {
|
||||
String source_file_name = source.getFileName();
|
||||
String source_file_ext = MultiProtocolURI.getFileExtension(source_file_name);
|
||||
add(edge, WebgraphSchema.source_file_name_s, source_file_name.toLowerCase().endsWith("." + source_file_ext) ? source_file_name.substring(0, source_file_name.length() - source_file_ext.length() - 1) : source_file_name);
|
||||
add(edge, WebgraphSchema.source_file_ext_s, source_file_ext);
|
||||
}
|
||||
if (allAttr || contains(WebgraphSchema.source_path_s)) add(edge, WebgraphSchema.source_path_s, source.getPath());
|
||||
if (allAttr || contains(WebgraphSchema.source_path_folders_count_i) || contains(WebgraphSchema.source_path_folders_sxt)) {
|
||||
String[] paths = source.getPaths();
|
||||
@ -251,7 +239,12 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
||||
if (allAttr || contains(WebgraphSchema.target_host_organizationdnc_s)) add(edge, WebgraphSchema.target_host_organizationdnc_s, orga + '.' + dnc);
|
||||
if (allAttr || contains(WebgraphSchema.target_host_subdomain_s)) add(edge, WebgraphSchema.target_host_subdomain_s, subdom);
|
||||
}
|
||||
if (allAttr || contains(WebgraphSchema.target_file_ext_s)) add(edge, WebgraphSchema.target_file_ext_s, target_url.getFileExtension());
|
||||
if (allAttr || contains(WebgraphSchema.target_file_ext_s) || contains(WebgraphSchema.target_file_name_s)) {
|
||||
String target_file_name = target_url.getFileName();
|
||||
String target_file_ext = MultiProtocolURI.getFileExtension(target_file_name);
|
||||
add(edge, WebgraphSchema.target_file_name_s, target_file_name.toLowerCase().endsWith("." + target_file_ext) ? target_file_name.substring(0, target_file_name.length() - target_file_ext.length() - 1) : target_file_name);
|
||||
add(edge, WebgraphSchema.target_file_ext_s, target_file_ext);
|
||||
}
|
||||
if (allAttr || contains(WebgraphSchema.target_path_s)) add(edge, WebgraphSchema.target_path_s, target_url.getPath());
|
||||
if (allAttr || contains(WebgraphSchema.target_path_folders_count_i) || contains(WebgraphSchema.target_path_folders_sxt)) {
|
||||
String[] paths = target_url.getPaths();
|
||||
|
@ -41,11 +41,12 @@ public enum WebgraphSchema implements SchemaDeclaration {
|
||||
source_id_s(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash (source)"),
|
||||
source_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url (source)"),
|
||||
source_urlstub_s(SolrType.string, true, true, false, false, false, "the url without the protocol (source)"),
|
||||
source_file_name_s(SolrType.string, true, true, false, false, false, "the file name without the extension (source)"),
|
||||
source_file_ext_s(SolrType.string, true, true, false, false, false, "the file name extension (source)"),
|
||||
source_chars_i(SolrType.num_integer, true, true, false, false, false, "number of all characters in the url (source)"),
|
||||
source_path_s(SolrType.string, true, true, false, false, false, "path of the url (source)"),
|
||||
source_path_folders_count_i(SolrType.num_integer, true, true, false, false, false, "count of all path elements in the url (source)"),
|
||||
source_path_folders_sxt(SolrType.string, true, true, true, false, false, "all path elements in the url (source)"),
|
||||
source_path_folders_sxt(SolrType.string, true, true, true, false, false, "all path elements in the url without the file name (source)"),
|
||||
source_parameter_count_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url (source)"),
|
||||
source_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (source)"),
|
||||
source_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url (source)"),
|
||||
@ -73,11 +74,12 @@ public enum WebgraphSchema implements SchemaDeclaration {
|
||||
target_id_s(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash (target)"),
|
||||
target_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url (target)"),
|
||||
target_urlstub_s(SolrType.string, true, true, false, false, false, "the url without the protocol (target)"),
|
||||
target_file_name_s(SolrType.string, true, true, false, false, false, "the file name without the extension (target)"),
|
||||
target_file_ext_s(SolrType.string, true, true, false, false, true, "the file name extension (target)"),
|
||||
target_chars_i(SolrType.num_integer, true, true, false, false, false, "number of all characters in the url (target)"),
|
||||
target_path_s(SolrType.string, true, true, false, false, false, "path of the url (target)"),
|
||||
target_path_folders_count_i(SolrType.num_integer, true, true, false, false, false, "count of all path elements in the url (target)"),
|
||||
target_path_folders_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url (target)"),
|
||||
target_path_folders_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url without the file name (target)"),
|
||||
target_parameter_count_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url (target)"),
|
||||
target_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (target)"),
|
||||
target_parameter_value_sxt(SolrType.string, true, true, true, false, true, "the values from key-value pairs in the search part of the url (target)"),
|
||||
|
Reference in New Issue
Block a user