mirror of
https://github.com/yacy/yacy_search_server.git
synced 2025-07-22 09:14:38 -04:00
Check the MIME type before the file extension when deciding whether to apply image metadata handling
This commit is contained in:
source/net/yacy
@ -1178,6 +1178,10 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
|
||||
return extension != null && extension.length() > 0 && "cgi.exe".indexOf(extension.toLowerCase()) >= 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated use a mimetype considering methode (e.g. Document.getContentDomain() == ContentDomain.IMAGE)
|
||||
*/
|
||||
@Deprecated
|
||||
public static final boolean isImage(final String extension) {
|
||||
return extension != null && extension.length() > 0 && Response.docTypeExt(extension.toLowerCase()) == Response.DT_IMAGE;
|
||||
}
|
||||
|
@ -48,6 +48,7 @@ import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import net.yacy.cora.document.analysis.Classification;
|
||||
import net.yacy.cora.document.analysis.Classification.ContentDomain;
|
||||
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
|
||||
import net.yacy.cora.document.encoding.ASCII;
|
||||
import net.yacy.cora.document.id.AnchorURL;
|
||||
@ -537,14 +538,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
||||
LinkedHashMap<DigestURL,String> outboundLinks = document.outboundLinks();
|
||||
|
||||
Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
|
||||
List<ImageEntry> images = new ArrayList<ImageEntry>();
|
||||
int c = 0;
|
||||
final Object parser = document.getParserObject();
|
||||
boolean containsCanonical = false;
|
||||
DigestURL canonical = null;
|
||||
if (parser instanceof ContentScraper) {
|
||||
final ContentScraper html = (ContentScraper) parser;
|
||||
images = html.getImages();
|
||||
List<ImageEntry> images = html.getImages();
|
||||
|
||||
// header tags
|
||||
int h = 0;
|
||||
@ -912,12 +912,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
||||
!content.endsWith(" " + r)) content += " " + r;
|
||||
}
|
||||
}
|
||||
|
||||
if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(digestURL.getFileName()))) {
|
||||
|
||||
// handle image source meta data
|
||||
if ((allAttr || contains(CollectionSchema.images_text_t)) && (document.getContentDomain() == ContentDomain.IMAGE)) {
|
||||
add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser
|
||||
content = digestURL.toTokens(); // remove all other entry but the url tokens
|
||||
}
|
||||
|
||||
|
||||
// content (must be written after special parser data, since this can influence the content)
|
||||
if (allAttr || contains(CollectionSchema.text_t)) add(doc, CollectionSchema.text_t, content);
|
||||
if (allAttr || contains(CollectionSchema.wordcount_i)) {
|
||||
|
Reference in New Issue
Block a user