mirror of
https://github.com/yacy/yacy_search_server.git
synced 2025-07-22 09:14:38 -04:00
- fixed some bugs with domain filter
- added new ranking filter "prefermask": URLs that match the filter are ranked higher

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2022 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
@ -3,7 +3,7 @@ javacSource=1.4
|
||||
javacTarget=1.4
|
||||
|
||||
# Release Configuration
|
||||
releaseVersion=0.443
|
||||
releaseVersion=0.444
|
||||
releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
|
||||
#releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
|
||||
releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}
|
||||
|
@ -132,7 +132,7 @@ public class DetailedSearch {
|
||||
}
|
||||
|
||||
// do the search
|
||||
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, wdist, count, searchtime, urlmask,
|
||||
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, wdist, "", count, searchtime, urlmask,
|
||||
((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL,
|
||||
"", 20);
|
||||
plasmaSearchRankingProfile localRanking = new plasmaSearchRankingProfile("local", post.toString());
|
||||
|
@ -81,8 +81,6 @@ public class IndexCreate_p {
|
||||
prop.put("error", 0);
|
||||
prop.put("info", 0);
|
||||
prop.put("refreshbutton", 0);
|
||||
|
||||
switchboard.cleanProfiles();
|
||||
|
||||
if (post != null) {
|
||||
if (post.containsKey("crawlingstart")) {
|
||||
@ -395,6 +393,7 @@ public class IndexCreate_p {
|
||||
|
||||
// sed crawl profiles
|
||||
int count = 0;
|
||||
int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160);
|
||||
//try{
|
||||
Iterator it = switchboard.profiles.profiles(true);
|
||||
plasmaCrawlProfile.entry profile;
|
||||
@ -410,7 +409,7 @@ public class IndexCreate_p {
|
||||
prop.put("crawlProfiles_"+count+"_filter", profile.generalFilter());
|
||||
prop.put("crawlProfiles_"+count+"_crawlingIfOlder", (profile.recrawlIfOlder() == Long.MAX_VALUE) ? "no re-crawl" : ""+profile.recrawlIfOlder());
|
||||
prop.put("crawlProfiles_"+count+"_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : ""+profile.domFilterDepth());
|
||||
prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", profile.domNames(true, 160));
|
||||
prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", profile.domNames(true, domlistlength));
|
||||
prop.put("crawlProfiles_"+count+"_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : ""+profile.domMaxPages());
|
||||
prop.put("crawlProfiles_"+count+"_withQuery", ((profile.crawlingQ()) ? 1 : 0));
|
||||
prop.put("crawlProfiles_"+count+"_storeCache", ((profile.storeHTCache()) ? 1 : 0));
|
||||
|
@ -34,6 +34,8 @@
|
||||
<input type="hidden" name="order" value="Date-YBR-Quality">
|
||||
<input type="hidden" name="resource" value="global">
|
||||
<input type="hidden" name="time" value="6">
|
||||
<input type="hidden" name="urlmaskfilter" value=".*">
|
||||
<input type="hidden" name="prefermaskfilter" value="">
|
||||
<tr align="left">
|
||||
<td></td>
|
||||
<td><a href="/index.html?searchoptions=1&display=#[display]#">more options...</a></td>
|
||||
@ -106,6 +108,19 @@
|
||||
#(/urlmaskoptions)#
|
||||
</td>
|
||||
</tr>
|
||||
<tr align="left">
|
||||
<td>
|
||||
Prefer mask:
|
||||
</td>
|
||||
<td>
|
||||
#(prefermaskoptions)#
|
||||
<input name="prefermaskfilter" type="text" size="12" maxlength="80" value="#[prefermaskfilter]#">
|
||||
::
|
||||
<input type="radio" name="prefermask" value="yes" checked> restrict on <input name="prefermaskfilter" type="text" size="12" maxlength="80" value="#[prefermaskfilter]#">
|
||||
<input type="radio" name="prefermask" value="no"> show all
|
||||
#(/prefermaskoptions)#
|
||||
</td>
|
||||
</tr>
|
||||
#(/searchoptions)#
|
||||
</table>
|
||||
</center>
|
||||
|
@ -102,8 +102,8 @@ public class index {
|
||||
prop.put("combine", 0);
|
||||
prop.put("resultbottomline", 0);
|
||||
prop.put("searchoptions", searchoptions);
|
||||
prop.put("searchoptions_count-10", 1);
|
||||
prop.put("searchoptions_count-50", 0);
|
||||
prop.put("searchoptions_count-10", 0);
|
||||
prop.put("searchoptions_count-50", 1);
|
||||
prop.put("searchoptions_count-100", 0);
|
||||
prop.put("searchoptions_count-1000", 0);
|
||||
prop.put("searchoptions_order-ybr-date-quality", plasmaSearchPreOrder.canUseYBR() ? 1 : 0);
|
||||
@ -122,6 +122,8 @@ public class index {
|
||||
prop.put("searchoptions_time-60", 0);
|
||||
prop.put("searchoptions_urlmaskoptions", 0);
|
||||
prop.put("searchoptions_urlmaskoptions_urlmaskfilter", ".*");
|
||||
prop.put("searchoptions_prefermaskoptions", 0);
|
||||
prop.put("searchoptions_prefermaskoptions_prefermaskfilter", "");
|
||||
prop.put("results", "");
|
||||
prop.put("cat", "href");
|
||||
prop.put("type", "0");
|
||||
|
@ -83,6 +83,7 @@ public final class search {
|
||||
final long duetime= post.getLong("duetime", 3000);
|
||||
final int count = post.getInt("count", 10); // maximum number of wanted results
|
||||
final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
|
||||
final String prefer = post.get("prefer", "");
|
||||
// final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
|
||||
// Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time
|
||||
|
||||
@ -104,7 +105,7 @@ public final class search {
|
||||
}
|
||||
final long timestamp = System.currentTimeMillis();
|
||||
|
||||
plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, maxdist, count, duetime, ".*");
|
||||
plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, maxdist, prefer, count, duetime, ".*");
|
||||
squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
|
||||
|
||||
serverObjects prop = new serverObjects();
|
||||
|
@ -55,6 +55,7 @@ picPlus.src = "/env/grafics/plus.gif";
|
||||
<input type="hidden" name="resource" value="#[resource]#">
|
||||
<input type="hidden" name="time" value="#[time]#">
|
||||
<input type="hidden" name="urlmaskfilter" value="#[urlmaskfilter]#">
|
||||
<input type="hidden" name="prefermaskfilter" value="#[prefermaskfilter]#">
|
||||
<input type="hidden" name="depth" value="#[depth]#">
|
||||
<input type="hidden" name="cat" value="#[cat]#">
|
||||
<input type="hidden" name="type" value="#[type]#">
|
||||
|
@ -109,6 +109,7 @@ public class yacysearch {
|
||||
prop.put("resource", "global");
|
||||
prop.put("time", 6);
|
||||
prop.put("urlmaskfilter", ".*");
|
||||
prop.put("prefermaskfilter", "");
|
||||
prop.put("cat", "href");
|
||||
prop.put("depth", "0");
|
||||
prop.put("type", 0);
|
||||
@ -144,7 +145,8 @@ public class yacysearch {
|
||||
} else {
|
||||
urlmask = (post.containsKey("urlmaskfilter")) ? (String) post.get("urlmaskfilter") : ".*";
|
||||
}
|
||||
String prefer = post.get("prefer", ".*");
|
||||
String prefermask = post.get("prefermaskfilter", "");
|
||||
if ((prefermask.length() > 0) && (prefermask.indexOf(".*") < 0)) prefermask = ".*" + prefermask + ".*";
|
||||
|
||||
serverObjects prop = new serverObjects();
|
||||
|
||||
@ -189,6 +191,7 @@ public class yacysearch {
|
||||
plasmaSearchQuery thisSearch = new plasmaSearchQuery(
|
||||
query,
|
||||
maxDistance,
|
||||
prefermask,
|
||||
count,
|
||||
searchtime,
|
||||
urlmask,
|
||||
@ -351,7 +354,7 @@ public class yacysearch {
|
||||
prop.put("resource", (global) ? "global" : "local");
|
||||
prop.put("time", searchtime / 1000);
|
||||
prop.put("urlmaskfilter", urlmask);
|
||||
prop.put("prefer", prefer);
|
||||
prop.put("prefermaskfilter", prefermask);
|
||||
prop.put("display", display);
|
||||
|
||||
// return rewrite properties
|
||||
|
@ -456,7 +456,7 @@ public class plasmaCrawlProfile {
|
||||
while (domnamesi.hasNext()) {
|
||||
ey = (Map.Entry) domnamesi.next();
|
||||
dp = (DomProfile) ey.getValue();
|
||||
domnames += ((String) ey.getKey()) + ((attr) ? ("/d=" + dp.depth + ",c=" + dp.count + " ") : " ");
|
||||
domnames += ((String) ey.getKey()) + ((attr) ? ("/r=" + dp.referrer + ", d=" + dp.depth + ", c=" + dp.count + " ") : " ") + "<br>";
|
||||
if ((maxlength > 0) && (domnames.length() >= maxlength)) {
|
||||
domnames = domnames.substring(0, maxlength-3) + "...";
|
||||
break;
|
||||
|
@ -318,7 +318,7 @@ public final class plasmaCrawlStacker {
|
||||
}
|
||||
|
||||
// add domain to profile domain list
|
||||
if (currentdepth <= profile.domFilterDepth()) {
|
||||
if ((profile.domFilterDepth() != Integer.MAX_VALUE) || (profile.domMaxPages() != Integer.MAX_VALUE)) {
|
||||
profile.domInc(nexturl.getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), currentdepth);
|
||||
}
|
||||
|
||||
|
@ -61,6 +61,7 @@ public final class plasmaSearchQuery {
|
||||
public Set queryWords;
|
||||
public Set queryHashes;
|
||||
public int wantedResults;
|
||||
public String prefer;
|
||||
public long maximumTime;
|
||||
public String urlMask;
|
||||
public int domType;
|
||||
@ -68,11 +69,12 @@ public final class plasmaSearchQuery {
|
||||
public int domMaxTargets;
|
||||
public int maxDistance;
|
||||
|
||||
public plasmaSearchQuery(Set queryWords, int maxDistance,
|
||||
public plasmaSearchQuery(Set queryWords, int maxDistance, String prefer,
|
||||
int wantedResults, long maximumTime, String urlMask,
|
||||
int domType, String domGroupName, int domMaxTargets) {
|
||||
this.queryWords = queryWords;
|
||||
this.maxDistance = maxDistance;
|
||||
this.prefer = prefer;
|
||||
this.queryHashes = words2hashes(queryWords);
|
||||
this.wantedResults = wantedResults;
|
||||
this.maximumTime = maximumTime;
|
||||
@ -82,10 +84,11 @@ public final class plasmaSearchQuery {
|
||||
this.domMaxTargets = domMaxTargets;
|
||||
}
|
||||
|
||||
public plasmaSearchQuery(Set queryHashes, int maxDistance,
|
||||
public plasmaSearchQuery(Set queryHashes, int maxDistance, String prefer,
|
||||
int wantedResults, long maximumTime, String urlMask) {
|
||||
this.queryWords = null;
|
||||
this.maxDistance = maxDistance;
|
||||
this.prefer = prefer;
|
||||
this.queryHashes = queryHashes;
|
||||
this.wantedResults = wantedResults;
|
||||
this.maximumTime = maximumTime;
|
||||
|
@ -73,6 +73,7 @@ public class plasmaSearchRankingProfile {
|
||||
public static final String QUERYINDESCR = "queryindescr";
|
||||
public static final String URLCOMPINTOPLIST = "urlcompintoplist";
|
||||
public static final String DESCRCOMPINTOPLIST = "descrcompintoplist";
|
||||
public static final String PREFER = "prefer";
|
||||
|
||||
public String[] order;
|
||||
private HashMap coeff;
|
||||
@ -96,6 +97,7 @@ public class plasmaSearchRankingProfile {
|
||||
coeff.put(QUERYINDESCR, new Integer(8));
|
||||
coeff.put(URLCOMPINTOPLIST, new Integer(3));
|
||||
coeff.put(DESCRCOMPINTOPLIST, new Integer(2));
|
||||
coeff.put(PREFER, new Integer(15));
|
||||
}
|
||||
|
||||
public plasmaSearchRankingProfile(String prefix, String profile) {
|
||||
@ -183,6 +185,10 @@ public class plasmaSearchRankingProfile {
|
||||
// apply pre-calculated order attributes
|
||||
long ranking = this.preRanking(normalizedEntry);
|
||||
|
||||
// prefer hit with 'prefer' pattern
|
||||
if (page.url().toString().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
|
||||
if (page.descr().toString().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
|
||||
|
||||
// apply 'common-sense' heuristic using references
|
||||
for (int j = 0; j < urlcomps.length; j++) {
|
||||
if (topwords.contains(urlcomps[j])) ranking += 256 << ((Integer) coeff.get(URLCOMPINTOPLIST)).intValue();
|
||||
@ -210,6 +216,7 @@ public class plasmaSearchRankingProfile {
|
||||
ranking += (255 * page.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue();
|
||||
ranking += (255 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue();
|
||||
|
||||
|
||||
return ranking;
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user