Mirror of https://github.com/yacy/yacy_search_server.git (synced 2025-05-03 20:29:33 -04:00)
added a 'transition feature' that shall lower the barrier to move from g**gle to YaCy (yes!):

This introduces a new concept called 'search heuristics'. In IT, a heuristic is a kind of shortcut to good results; here it is a shortcut to good search results. In this case it is used to get a very transparent comparison between what YaCy can produce as a search result and what g**gle produces. Here is what you can do now:

- add the phrase 'heuristic:scroogle' to your search query, like 'oil spill heuristic:scroogle'; a call to Scroogle is then made to get anonymous search results from g**gle.
- these results are _not_ taken as meta-search results, but are used to instantly feed a crawling and indexing process. This happens very fast: here 20 results from Scroogle are taken and loaded simultaneously, parsed and indexed immediately, and the parsed content feeds the search result alongside the normal p2p search.
- when new results from that heuristic (more to come) become part of the search results, it is verified whether they are redundant to existing results (they would have been part of the normal YaCy search result anyway) or completely new to YaCy.
- in the search results, new results from heuristics are marked with 'H ++'; results from heuristics that had already been found by YaCy are marked with 'H ='.

That means:

- you can now see YaCy and Scroogle search results on one result page, but you also see that you would not have 'missed' the g**gle results if you had used only YaCy.
- in short: YaCy now subsumes g**gle results. If you use only YaCy, you miss nothing.

To come: a configuration page that lets you configure the usage of heuristics and enables this feature by default.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6944 6c8d7289-2bf4-0310-a012-ef5d649a1542
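For illustration only, a minimal Java sketch of the mechanism described above (class and method names here are hypothetical simplifications, not the committed code): the 'heuristic:scroogle' flag is detected and stripped from the query, and each URL delivered by a heuristic is recorded as either redundant (already known to YaCy, shown as 'H =') or new (shown as 'H ++').

    // Illustrative sketch only; all names in this class are hypothetical simplifications.
    import java.util.HashMap;
    import java.util.Map;

    public class HeuristicSketch {

        // heuristic results keyed by URL hash: true = redundant (already indexed), false = new
        private final Map<String, Boolean> heuristicResults = new HashMap<String, Boolean>();

        // detect the 'heuristic:scroogle' flag in the raw query string
        public static boolean hasScroogleFlag(final String query) {
            return query.indexOf("heuristic:scroogle") >= 0;
        }

        // remove the flag so that only the real search terms remain
        public static String stripScroogleFlag(final String query) {
            return query.replace("heuristic:scroogle", "").trim();
        }

        // record a result delivered by a heuristic; 'redundant' means the local
        // index already contained the URL, so YaCy would have found it anyway
        public void addHeuristicResult(final String urlHash, final boolean redundant) {
            this.heuristicResults.put(urlHash, Boolean.valueOf(redundant));
        }

        // marker for the result page: "H ++" = new link, "H =" = redundant, "" = not from a heuristic
        public String marker(final String urlHash) {
            final Boolean redundant = this.heuristicResults.get(urlHash);
            if (redundant == null) return "";
            return redundant.booleanValue() ? "H =" : "H ++";
        }
    }

In the actual commit below, the flag handling lives in yacysearch.java, the bookkeeping in SearchEvent.addHeuristicResult/getHeuristic, and the loading and indexing in Switchboard.heuristicScroogle, heuristicSite and addToIndex.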
This commit is contained in: parent d5d48b8dc7, commit dcd01698b4
Changed paths: htroot, source/de/anomic (crawler, http/server, search, yacy)
@@ -376,7 +376,6 @@ public class Crawler_p {
    final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
    if (e.getKey() == null) continue;
    nexturl = new DigestURI(e.getKey());
    if (nexturl == null) continue;

    // enqueuing the url for crawling
    sb.crawlStacker.enqueueEntry(new Request(
htroot/env/base.css vendored (6 changed lines)
@@ -490,9 +490,9 @@ a:hover.MenuItemLink {

div.urlactions a {
    display:block;
    width: 12px;
    height: 12px;
    margin: 2px 0px;
    width: 11px;
    height: 11px;
    margin: 0px 0px 0px 3px;
}

a.bookmarklink:hover, div.searchresults:hover a.bookmarklink, div.searchresults.hover a.bookmarklink {
htroot/env/grafics/heuristic_new.gif vendored (new binary file, 127 B, not shown)
htroot/env/grafics/heuristic_redundant.gif vendored (new binary file, 126 B, not shown)
@@ -332,6 +332,12 @@ public class yacysearch {
    while (sitehost.endsWith(".")) sitehost = sitehost.substring(0, sitehost.length() - 1);
    sitehash = DigestURI.domhash(sitehost);
}

int heuristic = querystring.indexOf("heuristic:scroogle");
if (heuristic >= 0) {
    querystring = querystring.replace("heuristic:scroogle", "");
}

int authori = querystring.indexOf("author:");
String authorhash = null;
if (authori >= 0) {
@@ -503,7 +509,9 @@ public class yacysearch {
final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader);
try {Thread.sleep(global ? 100 : 10);} catch (InterruptedException e1) {} // wait a little time to get first results in the search

if (sitehost != null && authenticated) sb.quickFillSite(sitehost, theSearch);
if (sitehost != null && authenticated) sb.heuristicSite(theSearch, sitehost);
if (heuristic >= 0 && authenticated) sb.heuristicScroogle(theSearch);

// generate result object
//serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + (System.currentTimeMillis() - timestamp) + " ms");
//serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER RESULT PREPARATION: " + (System.currentTimeMillis() - timestamp) + " ms");
@@ -1,20 +1,24 @@
#(content)#::
<div class="searchresults">
  <h4 class="linktitle">
    <img src="ViewImage.png?width=16&height=16&code=#[faviconCode]#" id="f#[urlhash]#" class="favicon" style="width:16px; height:16px;" alt="" />
    <img width="16" height="16" src="ViewImage.png?width=16&height=16&code=#[faviconCode]#" id="f#[urlhash]#" class="favicon" style="width:16px; height:16px;" alt="" />
    <a href="#[link]#">#[title]#</a></h4>
  #(authorized)#::
  <div class="urlactions">
    <a href="/Bookmarks.html?edit=#[urlhash]#" class="bookmarklink" title="bookmark"><img src="/env/grafics/empty.gif" title="bookmark" alt="bookmark" class="bookmarkIcon" /></a>
  <div class="urlactions">
    #(heuristic)#::
    <img width="16" height="9" src="/env/grafics/heuristic_redundant.gif" title="heuristic:#[name]# (redundant)" style="width:16px; height:9px;" alt="heuristic#[name]# (redundant)"/>::
    <img width="16" height="9" src="/env/grafics/heuristic_new.gif" title="heuristic:#[name]# (new link)" style="width:16px; height:9px;" alt="heuristic#[name]# (new link)"/>
    #(/heuristic)#
    #(authorized)#::
    <a href="/Bookmarks.html?edit=#[urlhash]#" class="bookmarklink" title="bookmark"><img width="11" height="11" src="/env/grafics/empty.gif" title="bookmark" alt="bookmark" class="recommendIcon" /></a>
    #(recommend)#
    <img src="/env/grafics/empty.gif" title="" alt="recommend" class="recommendIcon" />
    <img src="/env/grafics/empty.gif" title="" alt="delete" class="deleteIcon" />
    <img width="11" height="11" src="/env/grafics/empty.gif" title="" alt="recommend" class="recommendIcon" />
    <img width="11" height="11" src="/env/grafics/empty.gif" title="" alt="delete" class="deleteIcon" />
    ::
    <a href="#[recommendlink]#" class="recommendlink" title="recommend"><img src="/env/grafics/empty.gif" title="recommend" alt="recommend" class="recommendIcon" /></a>
    <a href="#[deletelink]#" title="delete" class="deletelink" ><img src="/env/grafics/empty.gif" title="delete" alt="delete" class="deleteIcon" /></a>
    <a href="#[recommendlink]#" class="recommendlink" title="recommend"><img width="11" height="11" src="/env/grafics/empty.gif" title="recommend" alt="recommend" class="recommendIcon" /></a>
    <a href="#[deletelink]#" title="delete" class="deletelink" ><img width="11" height="11" src="/env/grafics/empty.gif" title="delete" alt="delete" class="deleteIcon" /></a>
    #(/recommend)#
  </div>
  #(/authorized)#
  #(/authorized)#
  </div>
  <p class="snippet"><span class="snippetLoaded" id="h#[urlhash]#">#[description]#</span></p>
  <p class="url"><a href="#[link]#" id="url#[urlhash]#">#[urlname]#</a></p>
  <p class="urlinfo">#[date]# | #[sizename]# | <a href="api/yacydoc.html?urlhash=#[urlhash]#" onclick="return hs.htmlExpand(this, { objectType: 'ajax'} )">Metadata</a> | <a href="ViewFile.html?urlHash=#[urlhash]#&words=#[words]#&display=#[display]#">Parser</a> | <a href="yacysearch.html?cat=image&url=#[link]#&query=#[former]#&display=#[display]#">Pictures</a></p>
@@ -117,6 +117,17 @@ public class yacysearchitem {
prop.putHTML("content_authorized_recommend_deletelink", "/yacysearch.html?query=" + theQuery.queryString.replace(' ', '+') + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + new String(result.hash()) + "&urlmaskfilter=.*");
prop.putHTML("content_authorized_recommend_recommendlink", "/yacysearch.html?query=" + theQuery.queryString.replace(' ', '+') + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + new String(result.hash()) + "&urlmaskfilter=.*");
prop.put("content_authorized_urlhash", new String(result.hash()));
SearchEvent.HeuristicResult heuristic = theSearch.getHeuristic(result.hash());
if (heuristic == null) {
    prop.put("content_heuristic", 0);
} else {
    if (heuristic.redundant) {
        prop.put("content_heuristic", 1);
    } else {
        prop.put("content_heuristic", 2);
    }
    prop.put("content_heuristic_name", heuristic.heuristicName);
}
String resulthashString = new String(result.hash());
prop.putHTML("content_title", result.title());
prop.putXML("content_title-xml", result.title());
@@ -49,13 +49,13 @@ public final class CrawlStacker {
private final Log log = new Log("STACKCRAWL");

private final WorkflowProcessor<Request> fastQueue, slowQueue;
//private long dnsHit;
private long dnsMiss;
private final CrawlQueues nextQueue;
private final CrawlSwitchboard crawler;
private final Segment indexSegment;
private final yacySeedDB peers;
private final boolean acceptLocalURLs, acceptGlobalURLs;
//private long dnsHit;
private long dnsMiss;
private final CrawlQueues nextQueue;
private final CrawlSwitchboard crawler;
private final Segment indexSegment;
private final yacySeedDB peers;
private final boolean acceptLocalURLs, acceptGlobalURLs;

// this is the process that checks url for double-occurrences and for allowance/disallowance by robots.txt
@@ -178,66 +178,18 @@ public final class CrawlStacker {
// stacks a crawl item. The position can also be remote
// returns null if successful, a reason string if not successful
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");

final long startTime = System.currentTimeMillis();

// check if the protocol is supported
final String urlProtocol = entry.url().getProtocol();
if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
    this.log.logSevere("Unsupported protocol in URL '" + entry.url().toString() + "'. " +
            "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
    return "unsupported protocol";
}

// check if ip is local ip address
final String urlRejectReason = urlInAcceptedDomain(entry.url());
if (urlRejectReason != null) {
    if (this.log.isFine()) this.log.logFine("denied_(" + urlRejectReason + ") Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
    return "denied_(" + urlRejectReason + ")";
}

// check blacklist
if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, entry.url())) {
    if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is in blacklist. " +
            "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
    return "url in blacklist";
}

final CrawlProfile.entry profile = crawler.profilesActiveCrawls.getEntry(entry.profileHandle());
String error;
if (profile == null) {
    final String errorMsg = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
    log.logWarning(errorMsg);
    return errorMsg;
    error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
    log.logWarning(error);
    return error;
}

// filter with must-match
if ((entry.depth() > 0) && !profile.mustMatchPattern().matcher(entry.url().toString()).matches()) {
    if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'. " +
            "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
    return "url does not match must-match filter";
}

// filter with must-not-match
if ((entry.depth() > 0) && profile.mustNotMatchPattern().matcher(entry.url().toString()).matches()) {
    if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does matches do-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'. " +
            "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
    return "url matches must-not-match filter";
}

// deny cgi
if (entry.url().isIndividual()) {
    if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is CGI URL. " +
            "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
    return "cgi url not allowed";
}

// deny post properties
if (entry.url().isPOST() && !(profile.crawlingQ())) {
    if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is post URL. " +
            "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
    return "post url not allowed";
}

error = checkAcceptance(entry.url(), profile, entry.depth());
if (error != null) return error;

final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : nextQueue.getURL(entry.referrerhash());

// add domain to profile domain list
@@ -245,55 +197,6 @@ public final class CrawlStacker {
    profile.domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth());
}

// deny urls that do not match with the profile domain list
if (!(profile.grantedDomAppearance(entry.url().getHost()))) {
    if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is not listed in granted domains. " +
            "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
    return "url does not match domain filter";
}

// deny urls that exceed allowed number of occurrences
if (!(profile.grantedDomCount(entry.url().getHost()))) {
    if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. " +
            "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
    return "domain counter exceeded";
}

// check if the url is double registered
final String dbocc = nextQueue.urlExists(entry.url().hash()); // returns the name of the queue if entry exists
URIMetadataRow oldEntry = indexSegment.urlMetadata().load(entry.url().hash(), null, 0);
if (oldEntry == null) {
    if (dbocc != null) {
        // do double-check
        if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
        if (dbocc.equals("errors")) {
            ZURL.Entry errorEntry = nextQueue.errorURL.get(entry.url().hash());
            return "double in: errors (" + errorEntry.anycause() + ")";
        } else {
            return "double in: " + dbocc;
        }
    }
} else {
    final boolean recrawl = profile.recrawlIfOlder() > oldEntry.loaddate().getTime();
    if (recrawl) {
        if (this.log.isFine())
            this.log.logFine("RE-CRAWL of URL '" + entry.url().toString() + "': this url was crawled " +
                    ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
    } else {
        if (dbocc == null) {
            return "double in: LURL-DB";
        } else {
            if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
            if (dbocc.equals("errors")) {
                ZURL.Entry errorEntry = nextQueue.errorURL.get(entry.url().hash());
                return "double in: errors (" + errorEntry.anycause() + ")";
            } else {
                return "double in: " + dbocc;
            }
        }
    }
}

// store information
final boolean local = Base64Order.enhancedCoder.equal(entry.initiator(), peers.mySeed().hash.getBytes());
final boolean proxy = (entry.initiator() == null || entry.initiator().length == 0 || new String(entry.initiator()).equals("------------")) && profile.handle().equals(crawler.defaultProxyProfile.handle());
@@ -308,7 +211,7 @@ public final class CrawlStacker {
) /* qualified */;

if (!local && !global && !remote && !proxy) {
    String error = "URL '" + entry.url().toString() + "' cannot be crawled. initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle();
    error = "URL '" + entry.url().toString() + "' cannot be crawled. initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle();
    this.log.logSevere(error);
    return error;
}
@@ -344,6 +247,103 @@ public final class CrawlStacker {
    return null;
}

public String checkAcceptance(final DigestURI url, final CrawlProfile.entry profile, int depth) {

    // check if the protocol is supported
    final String urlProtocol = url.getProtocol();
    if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
        this.log.logSevere("Unsupported protocol in URL '" + url.toString() + "'.");
        return "unsupported protocol";
    }

    // check if ip is local ip address
    final String urlRejectReason = urlInAcceptedDomain(url);
    if (urlRejectReason != null) {
        if (this.log.isFine()) this.log.logFine("denied_(" + urlRejectReason + ")");
        return "denied_(" + urlRejectReason + ")";
    }

    // check blacklist
    if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, url)) {
        if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is in blacklist.");
        return "url in blacklist";
    }

    // filter with must-match
    if ((depth > 0) && !profile.mustMatchPattern().matcher(url.toString()).matches()) {
        if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'.");
        return "url does not match must-match filter";
    }

    // filter with must-not-match
    if ((depth > 0) && profile.mustNotMatchPattern().matcher(url.toString()).matches()) {
        if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' does matches do-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'.");
        return "url matches must-not-match filter";
    }

    // deny cgi
    if (url.isIndividual()) {
        if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is CGI URL.");
        return "cgi url not allowed";
    }

    // deny post properties
    if (url.isPOST() && !(profile.crawlingQ())) {
        if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is post URL.");
        return "post url not allowed";
    }

    // deny urls that do not match with the profile domain list
    if (!(profile.grantedDomAppearance(url.getHost()))) {
        if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is not listed in granted domains.");
        return "url does not match domain filter";
    }

    // deny urls that exceed allowed number of occurrences
    if (!(profile.grantedDomCount(url.getHost()))) {
        if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed.");
        return "domain counter exceeded";
    }

    // check if the url is double registered
    final String dbocc = nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists
    URIMetadataRow oldEntry = indexSegment.urlMetadata().load(url.hash(), null, 0);
    if (oldEntry == null) {
        if (dbocc != null) {
            // do double-check
            if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is double registered in '" + dbocc + "'.");
            if (dbocc.equals("errors")) {
                ZURL.Entry errorEntry = nextQueue.errorURL.get(url.hash());
                return "double in: errors (" + errorEntry.anycause() + ")";
            } else {
                return "double in: " + dbocc;
            }
        }
    } else {
        final boolean recrawl = profile.recrawlIfOlder() > oldEntry.loaddate().getTime();
        if (recrawl) {
            if (this.log.isFine())
                this.log.logFine("RE-CRAWL of URL '" + url.toString() + "': this url was crawled " +
                        ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
        } else {
            if (dbocc == null) {
                return "double in: LURL-DB";
            } else {
                if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time:");
                if (dbocc.equals("errors")) {
                    ZURL.Entry errorEntry = nextQueue.errorURL.get(url.hash());
                    return "double in: errors (" + errorEntry.anycause() + ")";
                } else {
                    return "double in: " + dbocc;
                }
            }
        }
    }

    return null;
}


/**
 * Test a url if it can be used for crawling/indexing
 * This mainly checks if the url is in the declared domain (local/global)
@@ -973,7 +973,6 @@ public final class HTTPDFileHandler {
    if ((ranges.length == 1)&&(ranges[0].endsWith("-"))) {
        rangeStartOffset = Integer.parseInt(ranges[0].substring(0,ranges[0].length()-1));
        statusCode = 206;
        if (header == null) header = new ResponseHeader();
        header.put(HeaderFramework.CONTENT_RANGE, "bytes " + rangeStartOffset + "-" + (targetFile.length()-1) + "/" + targetFile.length());
    }
}
@@ -80,12 +80,13 @@ public final class SearchEvent {
private final TreeMap<byte[], String> preselectedPeerHashes;
private final ResultURLs crawlResults;
private final Thread localSearchThread;
private final TreeMap<byte[], String> IAResults;
private final TreeMap<byte[], Integer> IACount;
private final TreeMap<byte[], String> IAResults;
private final TreeMap<byte[], HeuristicResult> heuristics;
private byte[] IAmaxcounthash, IAneardhthash;
private final ReferenceOrder order;

@SuppressWarnings("unchecked") SearchEvent(final QueryParams query,
public SearchEvent(final QueryParams query,
        final yacySeedDB peers,
        final ResultURLs crawlResults,
        final TreeMap<byte[], String> preselectedPeerHashes,
@@ -102,6 +103,7 @@ public final class SearchEvent {
this.preselectedPeerHashes = preselectedPeerHashes;
this.IAResults = new TreeMap<byte[], String>(Base64Order.enhancedCoder);
this.IACount = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
this.heuristics = new TreeMap<byte[], HeuristicResult>(Base64Order.enhancedCoder);
this.IAmaxcounthash = null;
this.IAneardhthash = null;
this.localSearchThread = null;
@@ -169,7 +171,7 @@ public final class SearchEvent {
assert this.rankedCache.searchContainerMap() != null;
for (Map.Entry<byte[], ReferenceContainer<WordReference>> entry : this.rankedCache.searchContainerMap().entrySet()) {
    wordhash = entry.getKey();
    final ReferenceContainer container = entry.getValue();
    final ReferenceContainer<WordReference> container = entry.getValue();
    assert (Base64Order.enhancedCoder.equal(container.getTermHash(), wordhash)) : "container.getTermHash() = " + new String(container.getTermHash()) + ", wordhash = " + new String(wordhash);
    if (container.size() > maxcount) {
        IAmaxcounthash = wordhash;
@@ -317,6 +319,18 @@ public final class SearchEvent {
    return this.rankedCache.getAuthorNavigator(maxentries);
}

public void addHeuristicResult(byte[] urlhash, String heuristicName, boolean redundant) {
    synchronized (this.heuristics) {
        this.heuristics.put(urlhash, new HeuristicResult(urlhash, heuristicName, redundant));
    }
}

public HeuristicResult getHeuristic(byte[] urlhash) {
    synchronized (this.heuristics) {
        return this.heuristics.get(urlhash);
    }
}

public ResultEntry oneResult(final int item) {
    if ((query.domType == QueryParams.SEARCHDOM_GLOBALDHT) ||
        (query.domType == QueryParams.SEARCHDOM_CLUSTERALL)) {
@@ -333,6 +347,22 @@ public final class SearchEvent {

boolean secondarySearchStartet = false;

public static class HeuristicResult /*implements Comparable<HeuristicResult>*/ {
    public final byte[] urlhash; public final String heuristicName; public final boolean redundant;
    public HeuristicResult(byte[] urlhash, String heuristicName, boolean redundant) {
        this.urlhash = urlhash; this.heuristicName = heuristicName; this.redundant = redundant;
    }/*
    public int compareTo(HeuristicResult o) {
        return Base64Order.enhancedCoder.compare(this.urlhash, o.urlhash);
    }
    public int hashCode() {
        return (int) Base64Order.enhancedCoder.cardinal(this.urlhash);
    }
    public boolean equals(Object o) {
        return Base64Order.enhancedCoder.equal(this.urlhash, ((HeuristicResult) o).urlhash);
    }*/
}

public class SecondarySearchSuperviser extends Thread {

    // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation
@@ -1894,6 +1894,31 @@ public final class Switchboard extends serverSwitch {
    }
}

public final void addAllToIndex(final DigestURI url, final Map<MultiProtocolURI, String> links, final SearchEvent searchEvent, final String heuristicName) {

    // add the landing page to the index. should not load that again since it should be in the cache
    if (url != null) try {
        this.addToIndex(url, searchEvent, heuristicName);
    } catch (IOException e) {} catch (ParserException e) {}

    // check if some of the links match with the query
    Map<MultiProtocolURI, String> matcher = searchEvent.getQuery().separateMatches(links);

    // take the matcher and load them all
    for (Map.Entry<MultiProtocolURI, String> entry: matcher.entrySet()) {
        try {
            this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName);
        } catch (IOException e) {} catch (ParserException e) {}
    }

    // take then the no-matcher and load them also
    for (Map.Entry<MultiProtocolURI, String> entry: links.entrySet()) {
        try {
            this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName);
        } catch (IOException e) {} catch (ParserException e) {}
    }
}

/**
 * load the content of a URL, parse the content and add the content to the index
 * This process is started concurrently. The method returns immediately after the call.
@@ -1902,12 +1927,21 @@ public final class Switchboard extends serverSwitch {
 * @throws IOException
 * @throws ParserException
 */
public void addToIndex(final DigestURI url, final SearchEvent searchEvent) throws IOException, ParserException {
public void addToIndex(final DigestURI url, final SearchEvent searchEvent, final String heuristicName) throws IOException, ParserException {
    final Segments.Process process = Segments.Process.LOCALCRAWLING;
    if (indexSegments.segment(process).urlMetadata.exists(url.hash())) {
        searchEvent.addHeuristicResult(url.hash(), heuristicName, true);
        return; // don't do double-work
    }
    final Request request = loader.request(url, true, true);
    String acceptedError = this.crawlStacker.checkAcceptance(url, this.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), 0);
    if (acceptedError != null) {
        log.logInfo("Heuristic: cannot load " + url.toNormalform(false, false) + ": " + acceptedError);
        return;
    }
    new Thread() {public void run() {
        try {
            Segments.Process process = Segments.Process.LOCALCRAWLING;
            if (indexSegments.segment(process).urlMetadata.exists(url.hash())) return; // don't do double-work
            Request request = loader.request(url, true, true);
            searchEvent.addHeuristicResult(url.hash(), heuristicName, false);
            Response response = loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE);
            if (response == null) throw new IOException("response == null");
            if (response.getContent() == null) throw new IOException("content == null");
@@ -1918,42 +1952,15 @@ public final class Switchboard extends serverSwitch {
            ResultImages.registerImages(document, true);
            webStructure.generateCitationReference(document, condenser, response.lastModified());
            storeDocumentIndex(process, response, document, condenser, searchEvent);
            log.logInfo("QuickFill of url " + url.toNormalform(true, true) + " finished");
            log.logInfo("heuristic fill of url " + url.toNormalform(true, true) + " finished");
        } catch (IOException e) {
            Log.logException(e);
            //Log.logException(e);
        } catch (ParserException e) {
            Log.logException(e);
            //Log.logException(e);
        }
    }}.start();
}

public final void addAllToIndex(final DigestURI url, final Map<MultiProtocolURI, String> links, final SearchEvent searchEvent) {

    // add the landing page to the index. should not load that again since it should be in the cache
    try {
        this.addToIndex(url, searchEvent);
    } catch (IOException e) {} catch (ParserException e) {}

    // check if some of the links match with the query
    Map<MultiProtocolURI, String> matcher = searchEvent.getQuery().separateMatches(links);

    // take the matcher and load them all
    for (Map.Entry<MultiProtocolURI, String> entry: matcher.entrySet()) {
        try {
            this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent);
        } catch (IOException e) {} catch (ParserException e) {}
    }

    // take then the no-matcher and load them also
    for (Map.Entry<MultiProtocolURI, String> entry: links.entrySet()) {
        try {
            this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent);
        } catch (IOException e) {} catch (ParserException e) {}
    }
}


public class receiptSending implements Runnable {
    yacySeed initiatorPeer;
    URIMetadataRow reference;
@@ -2165,7 +2172,7 @@ public final class Switchboard extends serverSwitch {
    crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failreason);
}

public final void quickFillSite(final String host, final SearchEvent searchEvent) {
public final void heuristicSite(final SearchEvent searchEvent, final String host) {
    new Thread() {public void run() {
        String r = host;
        if (r.indexOf("//") < 0) r = "http://" + r;
@@ -2194,7 +2201,42 @@ public final class Switchboard extends serverSwitch {
        }

        // add all pages to the index
        addAllToIndex(url, links, searchEvent);
        addAllToIndex(url, links, searchEvent, "site");
    }}.start();
}

public final void heuristicScroogle(final SearchEvent searchEvent) {
    new Thread() {public void run() {
        String query = searchEvent.getQuery().queryString(true);
        int meta = query.indexOf("heuristic:");
        if (meta >= 0) {
            int q = query.indexOf(' ', meta);
            if (q >= 0) query = query.substring(0, meta) + query.substring(q + 1); else query = query.substring(0, meta);
        }
        final String urlString = "http://www.scroogle.org/cgi-bin/nbbw.cgi?Gw=" + query.trim().replaceAll(" ", "+") + "&n=2";
        DigestURI url;
        try {
            url = new DigestURI(MultiProtocolURI.unescape(urlString), null);
        } catch (MalformedURLException e1) {
            return;
        }

        Map<MultiProtocolURI, String> links = null;
        try {
            links = loader.loadLinks(url, CrawlProfile.CacheStrategy.NOCACHE);
        } catch (IOException e) {
            Log.logException(e);
            return;
        }
        Iterator<MultiProtocolURI> i = links.keySet().iterator();
        MultiProtocolURI u;
        while (i.hasNext()) {
            u = i.next();
            if (u.toNormalform(false, false).indexOf("scroogle") >= 0) i.remove();
        }
        log.logInfo("Heuristic: adding " + links.size() + " links from scroogle");
        // add all pages to the index
        addAllToIndex(null, links, searchEvent, "scroogle");
    }}.start();
}
@@ -33,7 +33,6 @@ import java.util.concurrent.ConcurrentHashMap;

import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
@@ -239,15 +238,15 @@ public class Dispatcher {

// check all entries and split them to the partitions
ReferenceContainer<WordReference>[] partitionBuffer = new ReferenceContainer[partitionCount];
WordReferenceRow re;
for (ReferenceContainer container: containers) {
WordReference re;
for (ReferenceContainer<WordReference> container: containers) {
    // init the new partitions
    for (int j = 0; j < partitionBuffer.length; j++) {
        partitionBuffer[j] = new ReferenceContainer(Segment.wordReferenceFactory, container.getTermHash(), container.size() / partitionCount);
        partitionBuffer[j] = new ReferenceContainer<WordReference>(Segment.wordReferenceFactory, container.getTermHash(), container.size() / partitionCount);
    }

    // split the container
    Iterator<WordReferenceRow> i = container.entries();
    Iterator<WordReference> i = container.entries();
    while (i.hasNext()) {
        re = i.next();
        if (re == null) continue;
@@ -91,7 +91,6 @@ public class OSMTile {
        return null;
    }
    tileb = entry.getContent();
    if (entry == null) return null;
}
try {
    ImageIO.setUseCache(false); // do not write a cache to disc; keep in RAM