mirror of https://github.com/yacy/yacy_search_server.git (synced 2025-06-20 03:56:07 -04:00)
Limit the number of initially previewed links in crawl start pages.

This prevents rendering a large and inconvenient scrollbar on resources containing many links. If really needed, a preview of all links is still available via a "Show all links" button. This does not affect the number of links used once the crawl is effectively started, as the list is then loaded again server-side.
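The resulting request pattern, sketched minimally below with the sndReq() helper the crawl start pages already use (url stands for the value of the crawlingURL field; the maxLinks value mirrors the one introduced further down in this commit):

    // initial preview: ask the getpageinfo_p servlet to cap the returned links
    sndReq('api/getpageinfo_p.xml?actions=title,robots&maxLinks=50&url=' + url);

    // "Show all links" button: omit maxLinks, so the servlet falls back to
    // Integer.MAX_VALUE and returns the complete list
    sndReq('api/getpageinfo_p.xml?actions=title,robots&url=' + url);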
@@ -246,6 +246,9 @@
             <dd>
                 <input type="radio" name="crawlingMode" id="sitelist" value="sitelist" #(has_url)#disabled="disabled"::#(/has_url)# #(crawlingMode_sitelist)#::checked="checked"#(/crawlingMode_sitelist)#/><br />
                 <div id="sitelistURLs"></div>
+                <button id="expandSiteListBtn" style="visibility:hidden" type="button" onclick="this.disabled = true;loadInfos(true);" class="btn btn-default btn-xs" title="Show all links">
+                    <span class="glyphicon glyphicon-option-horizontal"/>
+                </button>
             </dd>
             <dt>From Sitemap</dt>
             <dd>
@@ -47,7 +47,12 @@
             </td>
         </tr><tr>
             <td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled" />Link-List of URL</td>
-            <td><div id="sitelistURLs"></div></td>
+            <td>
+                <div id="sitelistURLs"></div>
+                <button id="expandSiteListBtn" style="visibility:hidden" type="button" onclick="this.disabled = true;loadInfos(true);" class="btn btn-default btn-xs" title="Show all links">
+                    <span class="glyphicon glyphicon-option-horizontal"/>
+                </button>
+            </td>
         </tr><tr>
             <td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"
             onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;"/>Sitemap URL</td>
@@ -28,6 +28,7 @@ import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
 
@@ -85,6 +86,7 @@ public class getpageinfo_p {
  * </ul>
  * </li>
  * <li>agentName (optional) : the string identifying the agent used to fetch the resource. Example : "YaCy Internet (cautious)"</li>
+ * <li>maxLinks (optional) : the maximum number of links, sitemap URLs or icons to return</li>
  * </ul>
  * @param env
  *            server environment
@@ -110,6 +112,7 @@ public class getpageinfo_p {
         String actions = "title,robots";
 
         if (post != null && post.containsKey("url")) {
+            final int maxLinks = post.getInt("maxLinks", Integer.MAX_VALUE);
             if (post.containsKey("actions"))
                 actions=post.get("actions");
             String url=post.get("url");
@@ -135,7 +138,7 @@ public class getpageinfo_p {
         net.yacy.document.Document scraper = null;
         if (u != null) try {
             ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
-            scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
+            scraper = sb.loader.loadDocumentAsStream(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
         } catch (final IOException e) {
             ConcurrentLog.logException(e);
             // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
@@ -145,20 +148,25 @@
             // put the document title
             prop.putXML("title", scraper.dc_title());
 
-            // put the icons that belongs to the document
+            // put the icons that belong to the document
             Set<DigestURL> iconURLs = scraper.getIcons().keySet();
-            int i = 0;
+            int count = 0;
             for (DigestURL iconURL : iconURLs) {
-                prop.putXML("icons_" + i + "_icon", iconURL.toNormalform(false));
-                prop.put("icons_" + i + "_eol", 1);
-                i++;
+                if(count >= maxLinks) {
+                    break;
+                }
+                prop.putXML("icons_" + count + "_icon", iconURL.toNormalform(false));
+                prop.put("icons_" + count + "_eol", 1);
+                count++;
             }
-            prop.put("icons_" + (i - 1) + "_eol", 0);
-            prop.put("icons", iconURLs.size());
+            if(count > 0) {
+                prop.put("icons_" + (count - 1) + "_eol", 0);
+            }
+            prop.put("icons", count);
 
             // put keywords
             final Set<String> list = scraper.dc_subject();
-            int count = 0;
+            count = 0;
             for (final String element: list) {
                 if (!element.equals("")) {
                     prop.putXML("tags_"+count+"_tag", element);
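The new if(count > 0) guard also fixes a corner case: for a document without any icons, the old code would have written an icons_-1_eol property. Each icon entry is first written with eol = 1, and the final one is then flipped to 0 to mark the end of the list for the template engine.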
@@ -177,14 +185,20 @@
             final StringBuilder links = new StringBuilder(uris.size() * 80);
             final StringBuilder filter = new StringBuilder(uris.size() * 40);
             count = 0;
-            for (final DigestURL uri: uris) {
+            final Iterator<AnchorURL> urisIt = uris.iterator();
+            while (urisIt.hasNext()) {
+                AnchorURL uri = urisIt.next();
                 if (uri == null) continue;
+                if(count >= maxLinks) {
+                    break;
+                }
                 links.append(';').append(uri.toNormalform(true));
                 filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
                 prop.putXML("links_" + count + "_link", uri.toNormalform(true));
                 count++;
             }
             prop.put("links", count);
+            prop.put("hasMoreLinks", (count >= maxLinks && urisIt.hasNext()) ? "1" : "0");
             prop.putXML("sitelist", links.length() > 0 ? links.substring(1) : "");
             prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
         }
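Replacing the enhanced for loop with an explicit Iterator is what makes hasMoreLinks cheap to compute: after the loop exits, urisIt.hasNext() tells the servlet whether the link list was actually truncated at maxLinks or simply ended there.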
@@ -200,12 +214,17 @@
             prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
             prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());
 
-            // get the sitemap URL of the domain
+            // get the sitemap URL(s) of the domain
             final List<String> sitemaps = robotsEntry == null ? new ArrayList<String>(0) : robotsEntry.getSitemaps();
-            for (int i = 0; i < sitemaps.size(); i++) {
-                prop.putXML("sitemaps_" + i + "_sitemap", sitemaps.get(i));
+            int count = 0;
+            for (String sitemap : sitemaps) {
+                if(count >= maxLinks) {
+                    break;
+                }
+                prop.putXML("sitemaps_" + count + "_sitemap", sitemap);
+                count++;
             }
-            prop.put("sitemaps", sitemaps.size());
+            prop.put("sitemaps", count);
         } catch (final MalformedURLException e) {
             ConcurrentLog.logException(e);
         }
@@ -25,5 +25,6 @@
     <link name="#[link]#" />
   #{/links}#
   </links>
+  <hasMoreLinks>#(hasMoreLinks)#false::true#(/hasMoreLinks)#</hasMoreLinks>
   <oai>#[oai]#</oai>
 </pageinfo>
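In YaCy's servlet template syntax, #(hasMoreLinks)#false::true#(/hasMoreLinks)# renders as false when the servlet put "0" into the property and as true when it put "1". A truncated response therefore ends along these lines (the link URLs here are purely illustrative):

        <link name="https://example.com/a" />
        <link name="https://example.com/b" />
      </links>
      <hasMoreLinks>true</hasMoreLinks>
      <oai></oai>
    </pageinfo>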
@@ -87,7 +87,29 @@ function handleResponse(){
         sitelist=response.getElementsByTagName("sitelist")[0].firstChild.nodeValue;
     }
     document.getElementById("sitelistURLs").innerHTML = sitelist;
-    if (sitelist) document.getElementById("sitelist").disabled=false;
+    var expandButton = document.getElementById("expandSiteListBtn");
+    var siteListRadio = document.getElementById("sitelist");
+    if (sitelist) {
+        siteListRadio.disabled = false;
+        var hasMoreLinksElement = response.getElementsByTagName("hasMoreLinks");
+        if(hasMoreLinksElement != null && hasMoreLinksElement.length > 0
+                && hasMoreLinksElement[0].firstChild != null && hasMoreLinksElement[0].firstChild.nodeValue == "true") {
+            expandButton.style.visibility = "visible";
+            expandButton.disabled = false;
+        } else {
+            expandButton.style.visibility = "hidden";
+        }
+    } else {
+        siteListRadio.disabled = true;
+        siteListRadio.checked = false;
+        var urlModeRadio = document.getElementById("url");
+        if(urlModeRadio != null) {
+            urlModeRadio.checked = true;
+        }
+        if(expandButton != null) {
+            expandButton.style.visibility = "hidden";
+        }
+    }
 
     // clear the ajax image
     document.getElementById("ajax").setAttribute("src", AJAX_OFF);
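Net effect: the expand button becomes visible only when the response reports hasMoreLinks as true, and when no site list could be extracted at all, the handler now also unchecks the sitelist radio and falls back to the plain url crawl mode instead of leaving a stale selection. Clicking the button disables it (see the onclick handlers in the templates above) and re-runs loadInfos(true) to fetch the unbounded list.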
@@ -96,15 +118,18 @@ function handleResponse(){
 
 function changed() {
     window.clearTimeout(timeout);
-    timeout=window.setTimeout("loadInfos()", 1500);
+    timeout=window.setTimeout(loadInfos, 1500);
 }
 
-function loadInfos() {
+/**
+ * @param loadAll {Boolean} when true, load all links, else limit to the 50 first
+ */
+function loadInfos(loadAll) {
     // displaying ajax image
     document.getElementById("ajax").setAttribute("src",AJAX_ON);
 
     var url=document.getElementById("crawlingURL").value;
     if (url.indexOf("ftp") == 0 || url.indexOf("smb") == 0) document.getElementById("crawlingQ").checked = true; // since the pdf parser update for page separation, we need to set this
-    sndReq('api/getpageinfo_p.xml?actions=title,robots&url='+url);
+    sndReq('api/getpageinfo_p.xml?actions=title,robots' + (loadAll ? '' : '&maxLinks=50') + '&url='+url);
     document.getElementById("api").innerHTML = "<a href='api/getpageinfo_p.xml?actions=title,robots&url=" + url + "' id='apilink'><img src='env/grafics/api.png' width='60' height='40' alt='API'/></a><span>See the page info about the start url.</span>";
 }