mirror of
https://github.com/yacy/yacy_search_server.git
synced 2025-07-18 08:36:07 -04:00
Added an option to exclude image search results from text search results.
The option (config key `search.excludeintext.image`) is enabled by default.
This commit is contained in:
htroot
source/net/yacy
@ -112,7 +112,7 @@ public class searchresult {
|
||||
|
||||
// get a solr query string
|
||||
QueryGoal qg = new QueryGoal(originalQuery, originalQuery);
|
||||
StringBuilder solrQ = qg.collectionTextQueryString(sb.index.fulltext().getDefaultConfiguration(), 0);
|
||||
StringBuilder solrQ = qg.collectionTextQueryString(sb.index.fulltext().getDefaultConfiguration(), 0, false);
|
||||
post.put("defType", "edismax");
|
||||
post.put(CommonParams.Q, solrQ.toString());
|
||||
post.put(CommonParams.ROWS, post.remove("num"));
|
||||
|
@ -168,7 +168,7 @@ public class select {
|
||||
querystring = modifier.parse(querystring);
|
||||
modifier.apply(post);
|
||||
QueryGoal qg = new QueryGoal(querystring, querystring);
|
||||
StringBuilder solrQ = qg.collectionTextQueryString(sb.index.fulltext().getDefaultConfiguration(), profileNr);
|
||||
StringBuilder solrQ = qg.collectionTextQueryString(sb.index.fulltext().getDefaultConfiguration(), profileNr, false);
|
||||
post.put(CommonParams.Q, solrQ.toString()); // sru patch
|
||||
}
|
||||
String q = post.get(CommonParams.Q, "");
|
||||
|
@ -172,7 +172,7 @@ public class RemoteSearch extends Thread {
|
||||
nodePeers.add(event.peers.mySeed());
|
||||
}
|
||||
if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_REMOTE_SOLR_OFF, false)) {
|
||||
final SolrQuery solrQuery = event.query.solrQuery(event.getQuery().contentdom, start == 0);
|
||||
final SolrQuery solrQuery = event.query.solrQuery(event.getQuery().contentdom, start == 0, event.excludeintext_image);
|
||||
for (Seed s: nodePeers) {
|
||||
Thread t = solrRemoteSearch(event, solrQuery, start, count, s, blacklist);
|
||||
event.nodeSearchThreads.add(t);
|
||||
|
@ -207,11 +207,12 @@ public class QueryGoal {
|
||||
for (final byte[] b: blues) this.include_hashes.remove(b);
|
||||
}
|
||||
|
||||
public StringBuilder collectionTextQueryString(CollectionConfiguration configuration, int rankingProfile) {
|
||||
public StringBuilder collectionTextQueryString(CollectionConfiguration configuration, int rankingProfile, boolean noimages) {
|
||||
final StringBuilder q = new StringBuilder(80);
|
||||
|
||||
// add filter to prevent that results come from failed urls
|
||||
q.append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200");
|
||||
if (noimages) q.append(" AND -").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":(jpg OR png OR gif)");
|
||||
|
||||
// parse special requests
|
||||
if (isCatchall()) return q;
|
||||
|
@ -376,12 +376,12 @@ public final class QueryParams {
|
||||
return SetTools.anymatch(wordhashes, keyhashes);
|
||||
}
|
||||
|
||||
public SolrQuery solrQuery(ContentDomain cd, boolean getFacets) {
|
||||
public SolrQuery solrQuery(final ContentDomain cd, final boolean getFacets, final boolean excludeintext_image) {
|
||||
if (cd == ContentDomain.IMAGE) return solrImageQuery(getFacets);
|
||||
return solrTextQuery(getFacets);
|
||||
return solrTextQuery(getFacets, excludeintext_image);
|
||||
}
|
||||
|
||||
private SolrQuery solrTextQuery(boolean getFacets) {
|
||||
private SolrQuery solrTextQuery(final boolean getFacets, final boolean excludeintext_image) {
|
||||
if (this.cachedQuery != null) {
|
||||
this.cachedQuery.setStart(this.offset);
|
||||
return this.cachedQuery;
|
||||
@ -391,7 +391,7 @@ public final class QueryParams {
|
||||
// construct query
|
||||
final SolrQuery params = getBasicParams(getFacets);
|
||||
int rankingProfile = this.ranking.coeff_date == RankingProfile.COEFF_MAX ? 1 : (this.modifier.sitehash != null || this.modifier.sitehost != null) ? 2 : 0;
|
||||
params.setQuery(this.queryGoal.collectionTextQueryString(this.indexSegment.fulltext().getDefaultConfiguration(), rankingProfile).toString());
|
||||
params.setQuery(this.queryGoal.collectionTextQueryString(this.indexSegment.fulltext().getDefaultConfiguration(), rankingProfile, excludeintext_image).toString());
|
||||
Ranking ranking = indexSegment.fulltext().getDefaultConfiguration().getRanking(rankingProfile); // for a by-date ranking select different ranking profile
|
||||
|
||||
String bq = ranking.getBoostQuery();
|
||||
@ -399,36 +399,6 @@ public final class QueryParams {
|
||||
if (bq.length() > 0) params.setParam("bq", bq);
|
||||
if (bf.length() > 0) params.setParam("boost", bf); // a boost function extension, see http://wiki.apache.org/solr/ExtendedDisMax#bf_.28Boost_Function.2C_additive.29
|
||||
|
||||
/*
|
||||
if (this.contentdom == ContentDomain.IMAGE) {
|
||||
fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"jpg\"");
|
||||
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"tif\"");
|
||||
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"tiff\"");
|
||||
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"png\")");
|
||||
}
|
||||
|
||||
if (this.contentdom == ContentDomain.AUDIO) {
|
||||
fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"aif\"");
|
||||
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"aiff\"");
|
||||
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mp3\"");
|
||||
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"ogg\")");
|
||||
}
|
||||
|
||||
if (this.contentdom == ContentDomain.VIDEO) {
|
||||
fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mpg\"");
|
||||
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"avi\"");
|
||||
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mp4\"");
|
||||
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mkv\")");
|
||||
}
|
||||
|
||||
if (this.contentdom == ContentDomain.APP) {
|
||||
fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"apk\"");
|
||||
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"exe\"");
|
||||
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"dmg\"");
|
||||
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"gz\")");
|
||||
}
|
||||
*/
|
||||
|
||||
// prepare result
|
||||
ConcurrentLog.info("Protocol", "SOLR QUERY: " + params.toString());
|
||||
this.cachedQuery = params;
|
||||
|
@ -164,7 +164,8 @@ public final class SearchEvent {
|
||||
private final WeakPriorityBlockingQueue<URIMetadataNode> nodeStack; // thats the bag where the solr results are written to
|
||||
private final WeakPriorityBlockingQueue<ResultEntry> resultList; // thats the result list where the actual search result is waiting to be displayed
|
||||
private final boolean pollImmediately; // if this is true, then every entry in result List is polled immediately to prevent a re-ranking in the resultList. This is usefull if there is only one index source.
|
||||
|
||||
public final boolean excludeintext_image;
|
||||
|
||||
// the following values are filled during the search process as statistics for the search
|
||||
public final AtomicInteger local_rwi_available; // the number of hits generated/ranked by the local search in rwi index
|
||||
public final AtomicInteger local_rwi_stored; // the number of existing hits by the local search in rwi index
|
||||
@ -220,6 +221,7 @@ public final class SearchEvent {
|
||||
this.nodeStack = new WeakPriorityBlockingQueue<URIMetadataNode>(100, false);
|
||||
this.maxExpectedRemoteReferences = new AtomicInteger(0);
|
||||
this.expectedRemoteReferences = new AtomicInteger(0);
|
||||
this.excludeintext_image = Switchboard.getSwitchboard().getConfigBool("search.excludeintext.image", true);
|
||||
// prepare configured search navigation
|
||||
final String navcfg = Switchboard.getSwitchboard().getConfig("search.navigation", "");
|
||||
this.authorNavigator = navcfg.contains("authors") ? new ConcurrentScoreMap<String>() : null;
|
||||
@ -282,7 +284,7 @@ public final class SearchEvent {
|
||||
|
||||
// start a local solr search
|
||||
if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) {
|
||||
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, true), 0, this.query.itemsPerPage, null /*this peer*/, Switchboard.urlBlacklist);
|
||||
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, true, this.excludeintext_image), 0, this.query.itemsPerPage, null /*this peer*/, Switchboard.urlBlacklist);
|
||||
}
|
||||
this.localsolroffset = this.query.itemsPerPage;
|
||||
|
||||
@ -837,6 +839,13 @@ public final class SearchEvent {
|
||||
if (log.isFine()) log.fine("dropped Node: content domain does not match");
|
||||
continue pollloop;
|
||||
}
|
||||
|
||||
// filter out media links in text search, if wanted
|
||||
String ext = MultiProtocolURI.getFileExtension(iEntry.url().getFileName());
|
||||
if (this.query.contentdom == ContentDomain.TEXT && Classification.isImageExtension(ext) && this.excludeintext_image) {
|
||||
if (log.isFine()) log.fine("dropped Node: file name domain does not match");
|
||||
continue pollloop;
|
||||
}
|
||||
|
||||
// check site constraints
|
||||
final String hosthash = iEntry.hosthash();
|
||||
@ -1014,7 +1023,7 @@ public final class SearchEvent {
|
||||
}
|
||||
|
||||
// check content domain
|
||||
if (((this.query.contentdom == Classification.ContentDomain.TEXT && page.url().getContentDomain() == Classification.ContentDomain.IMAGE) ||
|
||||
if (this.query.contentdom.getCode() > 0 && (
|
||||
(this.query.contentdom == Classification.ContentDomain.IMAGE && page.url().getContentDomain() != Classification.ContentDomain.IMAGE) ||
|
||||
(this.query.contentdom == Classification.ContentDomain.AUDIO && page.url().getContentDomain() != Classification.ContentDomain.AUDIO) ||
|
||||
(this.query.contentdom == Classification.ContentDomain.VIDEO && page.url().getContentDomain() != Classification.ContentDomain.VIDEO) ||
|
||||
@ -1024,6 +1033,13 @@ public final class SearchEvent {
|
||||
continue;
|
||||
}
|
||||
|
||||
// filter out media links in text search, if wanted
|
||||
String ext = MultiProtocolURI.getFileExtension(page.url().getFileName());
|
||||
if (this.query.contentdom == ContentDomain.TEXT && Classification.isImageExtension(ext) && this.excludeintext_image) {
|
||||
if (log.isFine()) log.fine("dropped RWI: file name domain does not match");
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check for blacklist
|
||||
if (Switchboard.urlBlacklist.isListed(BlacklistType.SEARCH, page)) {
|
||||
if (log.isFine()) log.fine("dropped RWI: url is blacklisted in url blacklist");
|
||||
@ -1340,7 +1356,7 @@ public final class SearchEvent {
|
||||
int nextitems = item - this.localsolroffset + this.query.itemsPerPage; // example: suddenly switch to item 60, just 10 had been shown, 20 loaded.
|
||||
if (this.localsolrsearch != null && this.localsolrsearch.isAlive()) {try {this.localsolrsearch.join();} catch (final InterruptedException e) {}}
|
||||
if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) {
|
||||
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, this.localsolroffset == 0), this.localsolroffset, nextitems, null /*this peer*/, Switchboard.urlBlacklist);
|
||||
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, this.localsolroffset == 0, this.excludeintext_image), this.localsolroffset, nextitems, null /*this peer*/, Switchboard.urlBlacklist);
|
||||
}
|
||||
this.localsolroffset += nextitems;
|
||||
}
|
||||
@ -1361,7 +1377,7 @@ public final class SearchEvent {
|
||||
if (this.localsolrsearch == null || !this.localsolrsearch.isAlive() && this.local_solr_stored.get() > this.localsolroffset && (item + 1) % this.query.itemsPerPage == 0) {
|
||||
// at the end of a list, trigger a next solr search
|
||||
if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) {
|
||||
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, this.localsolroffset == 0), this.localsolroffset, this.query.itemsPerPage, null /*this peer*/, Switchboard.urlBlacklist);
|
||||
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, this.localsolroffset == 0, this.excludeintext_image), this.localsolroffset, this.query.itemsPerPage, null /*this peer*/, Switchboard.urlBlacklist);
|
||||
}
|
||||
this.localsolroffset += this.query.itemsPerPage;
|
||||
}
|
||||
|
Reference in New Issue
Block a user