mirror of
https://github.com/yacy/yacy_search_server.git
synced 2025-06-01 00:59:34 -04:00
New indexing strategy: ALL links that appear anywhere are indexed, not
only links whose content can be parsed. All non-parseable links are placed into the noload queue. The search process must therefore be able to filter out non-text search results. - This fixes the problem that image search results appeared in the text search. - The interactive search can now retrieve ALL types of links. - The p2p interface is now extended to retrieve only certain types of links (text, image, video, apps). - The search process has an extension to filter for the right document type according to the search query.
This commit is contained in:
parent
14f67f217c
commit
f8cd57c92f
htroot
source
de/anomic/crawler
net/yacy
@ -36,7 +36,7 @@ function search(search, count, offset) {
|
||||
} else if (window.ActiveXObject) { // IE
|
||||
self.xmlHttpReq = new ActiveXObject("Microsoft.XMLHTTP");
|
||||
}
|
||||
self.xmlHttpReq.open('GET', "yacysearch.json?verify=false&resource=local&nav=all&maximumRecords=" + maximumRecords + "&startRecord=" + startRecord + "&query=" + query, true);
|
||||
self.xmlHttpReq.open('GET', "yacysearch.json?verify=false&resource=local&nav=all&contentdom=all&maximumRecords=" + maximumRecords + "&startRecord=" + startRecord + "&query=" + query, true);
|
||||
self.xmlHttpReq.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded');
|
||||
self.xmlHttpReq.onreadystatechange = function() {
|
||||
if (self.xmlHttpReq.readyState == 4) {
|
||||
|
@ -119,7 +119,7 @@ public final class search {
|
||||
final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
|
||||
final String prefer = post.get("prefer", "");
|
||||
final String modifier = post.get("modifier", "").trim();
|
||||
final String contentdom = post.get("contentdom", "text");
|
||||
final String contentdom = post.get("contentdom", "all");
|
||||
final String filter = post.get("filter", ".*"); // a filter on the url
|
||||
final Pattern snippetPattern = Pattern.compile(post.get("snippet", ".*")); // a filter on the snippet
|
||||
String sitehash = post.get("sitehash", ""); if (sitehash.length() == 0) sitehash = null;
|
||||
|
@ -56,6 +56,9 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
|
||||
<fieldset class="yacys">
|
||||
<input type="hidden" name="maximumRecords" value="#[maximumRecords]#" />
|
||||
<input type="hidden" name="startRecord" value="#[startRecord]#" />
|
||||
<input type="hidden" name="verify" value="false" />
|
||||
<input type="hidden" name="resource" value="local" />
|
||||
<input type="hidden" name="contentdom" value="all" />
|
||||
<input id="search" class="searchinput" name="query" type="text" value="#[query]#" size="40" maxlength="80" onFocus="this.select()" />
|
||||
#(allowrealtime)#
|
||||
<input id="Enter" type="submit" name="Enter" value="Search" />::
|
||||
|
@ -272,7 +272,7 @@ public class yacysearch {
|
||||
|
||||
// find search domain
|
||||
final Classification.ContentDomain contentdom =
|
||||
ContentDomain.contentdomParser(post == null ? "text" : post.get("contentdom", "text"));
|
||||
ContentDomain.contentdomParser(post == null ? "all" : post.get("contentdom", "all"));
|
||||
|
||||
// patch until better search profiles are available
|
||||
if ( contentdom == ContentDomain.TEXT ) {
|
||||
|
@ -105,7 +105,7 @@ public class yacysearchitem {
|
||||
prop.put("navurlBase", QueryParams.navurlBase("html", theQuery, null, theQuery.urlMask.toString(), theQuery.navigators).toString());
|
||||
final String target_special_pattern = sb.getConfig(SwitchboardConstants.SEARCH_TARGET_SPECIAL_PATTERN, "");
|
||||
|
||||
if (theQuery.contentdom == Classification.ContentDomain.TEXT) {
|
||||
if (theQuery.contentdom == Classification.ContentDomain.TEXT || theQuery.contentdom == Classification.ContentDomain.ALL) {
|
||||
// text search
|
||||
|
||||
// generate result object
|
||||
|
@ -39,6 +39,7 @@ import java.util.Properties;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
||||
import net.yacy.cora.document.ASCII;
|
||||
import net.yacy.cora.document.Classification.ContentDomain;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.document.UTF8;
|
||||
import net.yacy.cora.protocol.Domains;
|
||||
@ -353,9 +354,11 @@ public final class CrawlStacker {
|
||||
|
||||
// check availability of parser and maxfilesize
|
||||
String warning = null;
|
||||
if (entry.size() > maxFileSize /*||
|
||||
(entry.url().getFileExtension().length() > 0 && TextParser.supports(entry.url(), null) != null)
|
||||
*/) {
|
||||
if (entry.size() > maxFileSize ||
|
||||
entry.url().getContentDomain() == ContentDomain.APP ||
|
||||
entry.url().getContentDomain() == ContentDomain.IMAGE ||
|
||||
entry.url().getContentDomain() == ContentDomain.AUDIO ||
|
||||
entry.url().getContentDomain() == ContentDomain.VIDEO ) {
|
||||
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry);
|
||||
//if (warning != null) this.log.logWarning("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
|
||||
return null;
|
||||
|
@ -24,11 +24,13 @@ import java.io.BufferedInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Properties;
|
||||
import java.util.Set;
|
||||
|
||||
public class Classification {
|
||||
|
||||
private static final Set<String> textExtSet = new HashSet<String>();
|
||||
private static final Set<String> mediaExtSet = new HashSet<String>();
|
||||
private static final Set<String> imageExtSet = new HashSet<String>();
|
||||
private static final Set<String> audioExtSet = new HashSet<String>();
|
||||
@ -78,11 +80,13 @@ public class Classification {
|
||||
|
||||
static {
|
||||
|
||||
final String apps = "7z,ace,arc,arj,apk,asf,asx,bat,bin,bkf,bz2,cab,com,css,dcm,deb,dll,dmg,exe,gho,ghs,gz,hqx,img,iso,jar,lha,rar,sh,sit,sitx,tar,tbz,tgz,tib,torrent,vbs,war,zip";
|
||||
final String text = "htm,html,phtml,shtml,xhtml,php,php3,php4,php5,cfm,asp,aspx,tex,txt,jsp,mf,asp,aspx,csv,gpx,vcf,xsl,xml,pdf,doc,docx,xls,xlsx,ppt,pptx";
|
||||
final String apps = "7z,ace,arc,arj,apk,asf,asx,bat,bin,bkf,bz2,cab,com,css,dcm,deb,dll,dmg,exe,java,gho,ghs,gz,hqx,img,iso,jar,lha,rar,sh,sit,sitx,tar,tbz,tgz,tib,torrent,vbs,war,zip";
|
||||
final String audio = "aac,aif,aiff,flac,m4a,m4p,mid,mp2,mp3,oga,ogg,ram,sid,wav,wma";
|
||||
final String video = "3g2,3gp,3gp2,3gpp,3gpp2,3ivx,asf,asx,avi,div,divx,dv,dvx,env,f4v,flv,hdmov,m1v,m4v,m-jpeg,moov,mov,movie,mp2v,mp4,mpe,mpeg,mpg,mpg4,mv4,ogm,ogv,qt,rm,rv,vid,swf,wmv";
|
||||
final String image = "ai,bmp,cdr,cmx,emf,eps,gif,img,jpeg,jpg,mng,pct,pdd,pdn,pict,png,psb,psd,psp,tif,tiff,wmf";
|
||||
|
||||
addSet(textExtSet, text); // text formats
|
||||
addSet(imageExtSet, image); // image formats
|
||||
addSet(audioExtSet, audio); // audio formats
|
||||
addSet(videoExtSet, video); // video formats
|
||||
@ -95,6 +99,11 @@ public class Classification {
|
||||
for (String s: extString.split(",")) set.add(s.toLowerCase().trim());
|
||||
}
|
||||
|
||||
/**
 * Checks whether a file extension is registered as a text format.
 *
 * @param textExt the file extension to test; may be null
 * @return false if textExt is null, otherwise whether the trimmed,
 *         lower-cased extension is contained in textExtSet
 */
public static boolean isTextExtension(String textExt) {
|
||||
if (textExt == null) return false;
|
||||
return textExtSet.contains(textExt.trim().toLowerCase());
|
||||
}
|
||||
|
||||
public static boolean isMediaExtension(String mediaExt) {
|
||||
if (mediaExt == null) return false;
|
||||
return mediaExtSet.contains(mediaExt.trim().toLowerCase());
|
||||
@ -120,12 +129,20 @@ public class Classification {
|
||||
return appsExtSet.contains(appsExt.trim().toLowerCase());
|
||||
}
|
||||
|
||||
/**
 * Maps a file extension to a content domain.
 * The extension is tested against the text, image, audio, video and
 * application checks in exactly that order, and the first check that
 * succeeds decides the result.
 *
 * @param ext the file extension to classify
 * @return the ContentDomain of the first matching check, or
 *         ContentDomain.ALL if none of the checks succeeds
 */
public static ContentDomain getContentDomain(String ext) {
|
||||
if (isTextExtension(ext)) return ContentDomain.TEXT;
|
||||
if (isImageExtension(ext)) return ContentDomain.IMAGE;
|
||||
if (isAudioExtension(ext)) return ContentDomain.AUDIO;
|
||||
if (isVideoExtension(ext)) return ContentDomain.VIDEO;
|
||||
if (isApplicationExtension(ext)) return ContentDomain.APP;
|
||||
return ContentDomain.ALL;
|
||||
}
|
||||
|
||||
public static boolean isPictureMime(final String mimeType) {
|
||||
if (mimeType == null) return false;
|
||||
return mimeType.toUpperCase().startsWith("IMAGE");
|
||||
}
|
||||
|
||||
|
||||
private static final Properties mimeTable = new Properties();
|
||||
|
||||
public static void init(final File mimeFile) {
|
||||
@ -140,6 +157,14 @@ public class Classification {
|
||||
if (mimeTableInputStream != null) try { mimeTableInputStream.close(); } catch (final Exception e1) {}
|
||||
}
|
||||
}
|
||||
for (Entry<Object, Object> entry: mimeTable.entrySet()) {
|
||||
String ext = (String) entry.getKey();
|
||||
String mime = (String) entry.getValue();
|
||||
if (mime.startsWith("text/")) textExtSet.add(ext.toLowerCase());
|
||||
if (mime.startsWith("audio/")) audioExtSet.add(ext.toLowerCase());
|
||||
if (mime.startsWith("video/")) videoExtSet.add(ext.toLowerCase());
|
||||
if (mime.startsWith("application/")) appsExtSet.add(ext.toLowerCase());
|
||||
}
|
||||
}
|
||||
|
||||
public static int countMimes() {
|
||||
|
@ -45,6 +45,7 @@ import java.util.regex.Pattern;
|
||||
import jcifs.smb.SmbException;
|
||||
import jcifs.smb.SmbFile;
|
||||
import jcifs.smb.SmbFileInputStream;
|
||||
import net.yacy.cora.document.Classification.ContentDomain;
|
||||
import net.yacy.cora.document.Punycode.PunycodeException;
|
||||
import net.yacy.cora.protocol.Domains;
|
||||
import net.yacy.cora.protocol.TimeoutRequest;
|
||||
@ -89,6 +90,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
|
||||
protected String host, path, quest, ref;
|
||||
protected int port;
|
||||
protected InetAddress hostAddress;
|
||||
protected ContentDomain contentDomain;
|
||||
|
||||
/**
|
||||
* initialization of a MultiProtocolURI to produce poison pills for concurrent blocking queues
|
||||
@ -101,6 +103,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
|
||||
this.path = null;
|
||||
this.quest = null;
|
||||
this.ref = null;
|
||||
this.contentDomain = null;
|
||||
this.port = -1;
|
||||
}
|
||||
|
||||
@ -116,6 +119,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
|
||||
this.path = url.path;
|
||||
this.quest = url.quest;
|
||||
this.ref = url.ref;
|
||||
this.contentDomain = null;
|
||||
this.port = url.port;
|
||||
}
|
||||
|
||||
@ -123,6 +127,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
|
||||
if (url == null) throw new MalformedURLException("url string is null");
|
||||
|
||||
this.hostAddress = null;
|
||||
this.contentDomain = null;
|
||||
|
||||
// identify protocol
|
||||
assert (url != null);
|
||||
@ -258,6 +263,13 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
|
||||
public final boolean isFile() { return this.protocol.equals("file"); }
|
||||
public final boolean isSMB() { return this.protocol.equals("smb"); }
|
||||
|
||||
/**
 * Returns the content domain of this URL.
 * While the contentDomain field is still null, the value is obtained from
 * Classification.getContentDomain using this URL's file extension and
 * stored in the field; later calls return the stored value.
 *
 * @return the content domain held in this.contentDomain
 */
public final ContentDomain getContentDomain() {
|
||||
if (this.contentDomain == null) {
|
||||
this.contentDomain = Classification.getContentDomain(this.getFileExtension());
|
||||
}
|
||||
return this.contentDomain;
|
||||
}
|
||||
|
||||
public static MultiProtocolURI newURL(final String baseURL, final String relPath) throws MalformedURLException {
|
||||
if ((baseURL == null) ||
|
||||
isHTTP(relPath) ||
|
||||
|
@ -38,6 +38,7 @@ import java.util.SortedSet;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import net.yacy.cora.document.ASCII;
|
||||
import net.yacy.cora.document.Classification.ContentDomain;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.document.UTF8;
|
||||
import net.yacy.document.language.Identificator;
|
||||
@ -112,10 +113,10 @@ public final class Condenser {
|
||||
this.RESULT_FLAGS = new Bitfield(4);
|
||||
|
||||
// construct flag set for document
|
||||
if (!document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true);
|
||||
if (!document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true);
|
||||
if (!document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true);
|
||||
if (!document.getApplinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasapp, true);
|
||||
if (document.dc_source().getContentDomain() == ContentDomain.IMAGE || !document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true);
|
||||
if (document.dc_source().getContentDomain() == ContentDomain.AUDIO || !document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true);
|
||||
if (document.dc_source().getContentDomain() == ContentDomain.VIDEO || !document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true);
|
||||
if (document.dc_source().getContentDomain() == ContentDomain.APP || !document.getApplinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasapp, true);
|
||||
if (document.lat() != 0.0f && document.lon() != 0.0f) this.RESULT_FLAGS.set(flag_cat_haslocation, true);
|
||||
|
||||
this.languageIdentificator = new Identificator();
|
||||
|
@ -833,5 +833,22 @@ dc_rights
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Merges the audio links of several documents into one map.
 *
 * @param documents the documents whose audiolinks maps are combined
 * @return a new HashMap holding the entries of every document's audiolinks
 *         map; if a key occurs in more than one document, the entry of the
 *         document processed last replaces the earlier ones
 */
public static Map<MultiProtocolURI, String> getAudiolinks(final Document[] documents) {
|
||||
final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
|
||||
for (final Document d: documents) result.putAll(d.audiolinks);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Merges the video links of several documents into one map.
 *
 * @param documents the documents whose videolinks maps are combined
 * @return a new HashMap holding the entries of every document's videolinks
 *         map; if a key occurs in more than one document, the entry of the
 *         document processed last replaces the earlier ones
 */
public static Map<MultiProtocolURI, String> getVideolinks(final Document[] documents) {
|
||||
final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
|
||||
for (final Document d: documents) result.putAll(d.videolinks);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Merges the application links of several documents into one map.
 *
 * @param documents the documents whose applinks maps are combined
 * @return a new HashMap holding the entries of every document's applinks
 *         map; if a key occurs in more than one document, the entry of the
 *         document processed last replaces the earlier ones
 */
public static Map<MultiProtocolURI, String> getApplinks(final Document[] documents) {
|
||||
final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
|
||||
for (final Document d: documents) result.putAll(d.applinks);
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -589,6 +589,7 @@ public final class Protocol
|
||||
final String language,
|
||||
final String sitehash,
|
||||
final String authorhash,
|
||||
final String contentdom,
|
||||
final int count,
|
||||
final long time,
|
||||
final int maxDistance,
|
||||
@ -634,6 +635,7 @@ public final class Protocol
|
||||
language,
|
||||
sitehash,
|
||||
authorhash,
|
||||
contentdom,
|
||||
count,
|
||||
time,
|
||||
maxDistance,
|
||||
@ -893,6 +895,7 @@ public final class Protocol
|
||||
final String language,
|
||||
final String sitehash,
|
||||
final String authorhash,
|
||||
final String contentdom,
|
||||
final int count,
|
||||
final long time,
|
||||
final int maxDistance,
|
||||
@ -945,6 +948,7 @@ public final class Protocol
|
||||
parts.put("language", UTF8.StringBody(language));
|
||||
parts.put("sitehash", UTF8.StringBody(sitehash));
|
||||
parts.put("authorhash", UTF8.StringBody(authorhash));
|
||||
parts.put("contentdom", UTF8.StringBody(contentdom));
|
||||
parts.put("ttl", UTF8.StringBody("0"));
|
||||
parts.put("maxdist", UTF8.StringBody(Integer.toString(maxDistance)));
|
||||
parts.put("profile", UTF8.StringBody(crypt.simpleEncode(rankingProfile.toExternalString())));
|
||||
@ -1516,6 +1520,7 @@ public final class Protocol
|
||||
"", // language,
|
||||
"", // sitehash,
|
||||
"", // authorhash,
|
||||
"all", // contentdom,
|
||||
10, // count,
|
||||
3000, // time,
|
||||
1000, // maxDistance,
|
||||
|
@ -46,7 +46,7 @@ public class RemoteSearch extends Thread {
|
||||
|
||||
private static final ThreadGroup ysThreadGroup = new ThreadGroup("yacySearchThreadGroup");
|
||||
|
||||
final private String wordhashes, excludehashes, urlhashes, sitehash, authorhash;
|
||||
final private String wordhashes, excludehashes, urlhashes, sitehash, authorhash, contentdom;
|
||||
final private boolean global;
|
||||
final private int partitions;
|
||||
final private Segment indexSegment;
|
||||
@ -72,7 +72,7 @@ public class RemoteSearch extends Thread {
|
||||
final Pattern snippet,
|
||||
final QueryParams.Modifier modifier,
|
||||
final String language,
|
||||
final String sitehash, final String authorhash,
|
||||
final String sitehash, final String authorhash, final String contentdom,
|
||||
final int count, final long time, final int maxDistance,
|
||||
final boolean global, final int partitions,
|
||||
final Seed targetPeer,
|
||||
@ -96,6 +96,7 @@ public class RemoteSearch extends Thread {
|
||||
this.language = language;
|
||||
this.sitehash = sitehash;
|
||||
this.authorhash = authorhash;
|
||||
this.contentdom = contentdom;
|
||||
this.global = global;
|
||||
this.partitions = partitions;
|
||||
this.indexSegment = indexSegment;
|
||||
@ -120,7 +121,7 @@ public class RemoteSearch extends Thread {
|
||||
this.peers.mySeed(),
|
||||
this.wordhashes, this.excludehashes, this.urlhashes,
|
||||
this.prefer, this.filter, this.snippet, this.modifier.getModifier(),
|
||||
this.language, this.sitehash, this.authorhash,
|
||||
this.language, this.sitehash, this.authorhash, this.contentdom,
|
||||
this.count, this.time, this.maxDistance, this.global, this.partitions,
|
||||
this.targetPeer, this.indexSegment, this.containerCache, this.secondarySearchSuperviser,
|
||||
this.blacklist, this.rankingProfile, this.constraint);
|
||||
@ -166,6 +167,7 @@ public class RemoteSearch extends Thread {
|
||||
final String language,
|
||||
final String sitehash,
|
||||
final String authorhash,
|
||||
final String contentdom,
|
||||
final int count, final long time, final int maxDist,
|
||||
final Segment indexSegment,
|
||||
final SeedDB peers,
|
||||
@ -200,7 +202,7 @@ public class RemoteSearch extends Thread {
|
||||
try {
|
||||
RemoteSearch rs = new RemoteSearch(
|
||||
wordhashes, excludehashes, "", prefer, filter, snippet, modifier,
|
||||
language, sitehash, authorhash,
|
||||
language, sitehash, authorhash, contentdom,
|
||||
count, time, maxDist, true, targets, targetPeers[i],
|
||||
indexSegment, peers, containerCache, secondarySearchSuperviser, blacklist, rankingProfile, constraint);
|
||||
rs.start();
|
||||
@ -233,7 +235,7 @@ public class RemoteSearch extends Thread {
|
||||
if (targetPeer == null || targetPeer.hash == null) return null;
|
||||
if (clusterselection != null) targetPeer.setAlternativeAddress(clusterselection.get(ASCII.getBytes(targetPeer.hash)));
|
||||
final RemoteSearch searchThread = new RemoteSearch(
|
||||
wordhashes, "", urlhashes, QueryParams.matchnothing_pattern, QueryParams.catchall_pattern, QueryParams.catchall_pattern, new QueryParams.Modifier(""), "", "", "", 20, time, 9999, true, 0, targetPeer,
|
||||
wordhashes, "", urlhashes, QueryParams.matchnothing_pattern, QueryParams.catchall_pattern, QueryParams.catchall_pattern, new QueryParams.Modifier(""), "", "", "", "all", 20, time, 9999, true, 0, targetPeer,
|
||||
indexSegment, peers, containerCache, null, blacklist, rankingProfile, constraint);
|
||||
searchThread.start();
|
||||
return searchThread;
|
||||
|
@ -2355,8 +2355,11 @@ public final class Switchboard extends serverSwitch
|
||||
// get the hyperlinks
|
||||
final Map<MultiProtocolURI, String> hl = Document.getHyperlinks(documents);
|
||||
|
||||
// add all images also to the crawl stack
|
||||
// add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
|
||||
hl.putAll(Document.getImagelinks(documents));
|
||||
hl.putAll(Document.getApplinks(documents));
|
||||
hl.putAll(Document.getVideolinks(documents));
|
||||
hl.putAll(Document.getAudiolinks(documents));
|
||||
|
||||
// insert those hyperlinks to the crawler
|
||||
MultiProtocolURI nextUrl;
|
||||
|
@ -289,7 +289,7 @@ public final class RWIProcess extends Thread
|
||||
}
|
||||
|
||||
// check document domain
|
||||
if ( this.query.contentdom != Classification.ContentDomain.TEXT ) {
|
||||
if ( this.query.contentdom != Classification.ContentDomain.ALL ) {
|
||||
if ( (this.query.contentdom == ContentDomain.AUDIO)
|
||||
&& (!(iEntry.flags().get(Condenser.flag_cat_hasaudio))) ) {
|
||||
continue pollloop;
|
||||
@ -588,6 +588,12 @@ public final class RWIProcess extends Thread
|
||||
continue; // rare case where the url is corrupted
|
||||
}
|
||||
|
||||
// check content domain
|
||||
if (this.query.contentdom != Classification.ContentDomain.ALL && page.url().getContentDomain() != this.query.contentdom) {
|
||||
this.sortout++;
|
||||
continue;
|
||||
}
|
||||
|
||||
final String pageurl = page.url().toNormalform(true, true);
|
||||
final String pageauthor = page.dc_creator();
|
||||
final String pagetitle = page.dc_title().toLowerCase();
|
||||
@ -620,15 +626,6 @@ public final class RWIProcess extends Thread
|
||||
continue;
|
||||
}
|
||||
|
||||
// check content domain
|
||||
if ( (this.query.contentdom == ContentDomain.AUDIO && page.laudio() == 0)
|
||||
|| (this.query.contentdom == ContentDomain.VIDEO && page.lvideo() == 0)
|
||||
|| (this.query.contentdom == ContentDomain.IMAGE && page.limage() == 0)
|
||||
|| (this.query.contentdom == ContentDomain.APP && page.lapp() == 0) ) {
|
||||
this.sortout++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// check vocabulary constraint
|
||||
final String tags = page.dc_subject();
|
||||
final String[] taglist = tags == null || tags.length() == 0 ? new String[0] : SPACE_PATTERN.split(page.dc_subject());
|
||||
|
@ -176,6 +176,7 @@ public final class SearchEvent
|
||||
SearchEvent.this.query.targetlang == null ? "" : SearchEvent.this.query.targetlang,
|
||||
SearchEvent.this.query.sitehash == null ? "" : SearchEvent.this.query.sitehash,
|
||||
SearchEvent.this.query.authorhash == null ? "" : SearchEvent.this.query.authorhash,
|
||||
SearchEvent.this.query.contentdom == null ? "all" : SearchEvent.this.query.contentdom.toString(),
|
||||
remote_maxcount,
|
||||
remote_maxtime,
|
||||
SearchEvent.this.query.maxDistance,
|
||||
|
@ -580,7 +580,7 @@ public class SnippetProcess {
|
||||
}
|
||||
|
||||
// load snippet
|
||||
if (this.query.contentdom == Classification.ContentDomain.TEXT) {
|
||||
if (page.url().getContentDomain() == Classification.ContentDomain.TEXT) {
|
||||
// attach text snippet
|
||||
startTime = System.currentTimeMillis();
|
||||
final TextSnippet snippet = new TextSnippet(
|
||||
@ -612,7 +612,7 @@ public class SnippetProcess {
|
||||
Log.logInfo("SEARCH", "sorted out url " + page.url().toNormalform(true, false) + " during search: " + reason);
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
} else if (page.url().getContentDomain() == Classification.ContentDomain.IMAGE) {
|
||||
// attach media information
|
||||
startTime = System.currentTimeMillis();
|
||||
final List<MediaSnippet> mediaSnippets = MediaSnippet.retrieveMediaSnippets(page.url(), this.snippetFetchWordHashes, this.query.contentdom, cacheStrategy, 6000, !this.query.isLocal());
|
||||
@ -633,6 +633,8 @@ public class SnippetProcess {
|
||||
Log.logInfo("SEARCH", "sorted out url " + page.url().toNormalform(true, false) + " during search: " + reason);
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
return new ResultEntry(page, this.query.getSegment(), this.peers, null, null, dbRetrievalTime, 0); // result without snippet
|
||||
}
|
||||
// finished, no more actions possible here
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user