From 0f80c978d67acb6e3ab92e13d3ed0d21d0d8d4ab Mon Sep 17 00:00:00 2001
From: luccioman <luccioman@users.noreply.github.com>
Date: Sat, 17 Jun 2017 09:33:14 +0200
Subject: [PATCH] Limit the number of initially previewed links in crawl start pages.

This prevents rendering a big and inconvenient scrollbar on resources
containing many links. If really needed, a preview of all links is still
available with a "Show all links" button.

This does not affect the number of links used once the crawl is effectively
started, as the list is then loaded again server-side.
---
 htroot/CrawlStartExpert.html  |  3 +++
 htroot/CrawlStartSite.html    |  7 +++++-
 htroot/api/getpageinfo_p.java | 47 ++++++++++++++++++++++++-----------
 htroot/api/getpageinfo_p.xml  |  1 +
 htroot/js/IndexCreate.js      | 33 +++++++++++++++++++++---
 5 files changed, 72 insertions(+), 19 deletions(-)

diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html
index a65b89bd8..d8d77634c 100644
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@@ -246,6 +246,9 @@
           <dd>
             <input type="radio" name="crawlingMode" id="sitelist" value="sitelist" #(has_url)#disabled="disabled"::#(/has_url)# #(crawlingMode_sitelist)#::checked="checked"#(/crawlingMode_sitelist)#/><br />
             <div id="sitelistURLs"></div>
+            <button id="expandSiteListBtn" style="visibility:hidden" type="button" onclick="this.disabled = true;loadInfos(true);" class="btn btn-default btn-xs" title="Show all links">
+              <span class="glyphicon glyphicon-option-horizontal"/>
+            </button>
           </dd>
           <dt>From Sitemap</dt>
           <dd>
diff --git a/htroot/CrawlStartSite.html b/htroot/CrawlStartSite.html
index f3b79b415..d8191be6f 100644
--- a/htroot/CrawlStartSite.html
+++ b/htroot/CrawlStartSite.html
@@ -47,7 +47,12 @@
           </td>
         </tr><tr>
           <td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled" />Link-List of URL</td>
-          <td><div id="sitelistURLs"></div></td>
+          <td>
+            <div id="sitelistURLs"></div>
+            <button id="expandSiteListBtn" style="visibility:hidden" type="button" onclick="this.disabled = true;loadInfos(true);" class="btn btn-default btn-xs" title="Show all links">
+              <span class="glyphicon glyphicon-option-horizontal"/>
+            </button>
+          </td>
         </tr><tr>
           <td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled" onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;"/>Sitemap URL</td>
diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java
index dd4e42013..51b6f2bb8 100644
--- a/htroot/api/getpageinfo_p.java
+++ b/htroot/api/getpageinfo_p.java
@@ -28,6 +28,7 @@
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
@@ -85,6 +86,7 @@ public class getpageinfo_p {
      * </ul>
      * </li>
      * <li>agentName (optional) : the string identifying the agent used to fetch the resource.
      *            Example : "YaCy Internet (cautious)"</li>
+     * <li>maxLinks (optional) : the maximum number of links, sitemap URLs or icons to return</li>
      * </ul>
      * @param env
      *            server environment
@@ -110,6 +112,7 @@ public class getpageinfo_p {
         String actions = "title,robots";
 
         if (post != null && post.containsKey("url")) {
+            final int maxLinks = post.getInt("maxLinks", Integer.MAX_VALUE);
             if (post.containsKey("actions"))
                 actions=post.get("actions");
             String url=post.get("url");
@@ -135,7 +138,7 @@ public class getpageinfo_p {
         net.yacy.document.Document scraper = null;
         if (u != null) try {
             ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
-            scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
+            scraper = sb.loader.loadDocumentAsStream(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
         } catch (final IOException e) {
             ConcurrentLog.logException(e);
             // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
@@ -145,20 +148,25 @@ public class getpageinfo_p {
             // put the document title
             prop.putXML("title", scraper.dc_title());
 
-            // put the icons that belongs to the document
+            // put the icons that belong to the document
             Set<DigestURL> iconURLs = scraper.getIcons().keySet();
-            int i = 0;
+            int count = 0;
             for (DigestURL iconURL : iconURLs) {
-                prop.putXML("icons_" + i + "_icon", iconURL.toNormalform(false));
-                prop.put("icons_" + i + "_eol", 1);
-                i++;
+                if(count >= maxLinks) {
+                    break;
+                }
+                prop.putXML("icons_" + count + "_icon", iconURL.toNormalform(false));
+                prop.put("icons_" + count + "_eol", 1);
+                count++;
             }
-            prop.put("icons_" + (i - 1) + "_eol", 0);
-            prop.put("icons", iconURLs.size());
+            if(count > 0) {
+                prop.put("icons_" + (count - 1) + "_eol", 0);
+            }
+            prop.put("icons", count);
 
             // put keywords
             final Set<String> list = scraper.dc_subject();
-            int count = 0;
+            count = 0;
             for (final String element: list) {
                 if (!element.equals("")) {
                     prop.putXML("tags_"+count+"_tag", element);
@@ -177,14 +185,20 @@ public class getpageinfo_p {
             final StringBuilder links = new StringBuilder(uris.size() * 80);
             final StringBuilder filter = new StringBuilder(uris.size() * 40);
             count = 0;
-            for (final DigestURL uri: uris) {
+            final Iterator<AnchorURL> urisIt = uris.iterator();
+            while (urisIt.hasNext()) {
+                AnchorURL uri = urisIt.next();
                 if (uri == null) continue;
+                if(count >= maxLinks) {
+                    break;
+                }
                 links.append(';').append(uri.toNormalform(true));
                 filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
                 prop.putXML("links_" + count + "_link", uri.toNormalform(true));
                 count++;
             }
             prop.put("links", count);
+            prop.put("hasMoreLinks", (count >= maxLinks && urisIt.hasNext()) ? "1" : "0");
             prop.putXML("sitelist", links.length() > 0 ? links.substring(1) : "");
             prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
         }
@@ -200,12 +214,17 @@ public class getpageinfo_p {
             prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
             prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());
-            // get the sitemap URL of the domain
+            // get the sitemap URL(s) of the domain
             final List<String> sitemaps = robotsEntry == null ?
                     new ArrayList<String>(0) : robotsEntry.getSitemaps();
-            for (int i = 0; i < sitemaps.size(); i++) {
-                prop.putXML("sitemaps_" + i + "_sitemap", sitemaps.get(i));
+            int count = 0;
+            for (String sitemap : sitemaps) {
+                if(count >= maxLinks) {
+                    break;
+                }
+                prop.putXML("sitemaps_" + count + "_sitemap", sitemap);
+                count++;
             }
-            prop.put("sitemaps", sitemaps.size());
+            prop.put("sitemaps", count);
         } catch (final MalformedURLException e) {
             ConcurrentLog.logException(e);
         }
diff --git a/htroot/api/getpageinfo_p.xml b/htroot/api/getpageinfo_p.xml
index 9ccd8e3d9..ab8cde09a 100644
--- a/htroot/api/getpageinfo_p.xml
+++ b/htroot/api/getpageinfo_p.xml
@@ -25,5 +25,6 @@
     <link name="#[link]#" />
   #{/links}#
   </links>
+  <hasMoreLinks>#(hasMoreLinks)#false::true#(/hasMoreLinks)#</hasMoreLinks>
   <oai>#[oai]#</oai>
 </pageinfo>
diff --git a/htroot/js/IndexCreate.js b/htroot/js/IndexCreate.js
index 5c827d22e..5b61d558b 100644
--- a/htroot/js/IndexCreate.js
+++ b/htroot/js/IndexCreate.js
@@ -87,7 +87,29 @@ function handleResponse(){
     sitelist=response.getElementsByTagName("sitelist")[0].firstChild.nodeValue;
   }
   document.getElementById("sitelistURLs").innerHTML = sitelist;
-  if (sitelist) document.getElementById("sitelist").disabled=false;
+  var expandButton = document.getElementById("expandSiteListBtn");
+  var siteListRadio = document.getElementById("sitelist");
+  if (sitelist) {
+    siteListRadio.disabled = false;
+    var hasMoreLinksElement = response.getElementsByTagName("hasMoreLinks");
+    if(hasMoreLinksElement != null && hasMoreLinksElement.length > 0
+        && hasMoreLinksElement[0].firstChild != null && hasMoreLinksElement[0].firstChild.nodeValue == "true") {
+      expandButton.style.visibility = "visible";
+      expandButton.disabled = false;
+    } else {
+      expandButton.style.visibility = "hidden";
+    }
+  } else {
+    siteListRadio.disabled = true;
+    siteListRadio.checked = false;
+    var urlModeRadio = document.getElementById("url");
+    if(urlModeRadio != null) {
+      urlModeRadio.checked = true;
+    }
+    if(expandButton != null) {
+      expandButton.style.visibility = "hidden";
+    }
+  }
 
   // clear the ajax image
   document.getElementById("ajax").setAttribute("src", AJAX_OFF);
@@ -96,15 +118,18 @@ function handleResponse(){
 
 function changed() {
   window.clearTimeout(timeout);
-  timeout=window.setTimeout("loadInfos()", 1500);
+  timeout=window.setTimeout(loadInfos, 1500);
 }
 
-function loadInfos() {
+/**
+ * @param loadAll {Boolean} when true, load all links, else limit to the first 50
+ */
+function loadInfos(loadAll) {
   // displaying ajax image
   document.getElementById("ajax").setAttribute("src",AJAX_ON);
 
   var url=document.getElementById("crawlingURL").value;
   if (url.indexOf("ftp") == 0 || url.indexOf("smb") == 0) document.getElementById("crawlingQ").checked = true; // since the pdf parser update for page separation, we need to set this
-  sndReq('api/getpageinfo_p.xml?actions=title,robots&url='+url);
+  sndReq('api/getpageinfo_p.xml?actions=title,robots' + (loadAll ? '' : '&maxLinks=50') + '&url='+url);
   document.getElementById("api").innerHTML = "<a href='api/getpageinfo_p.xml?actions=title,robots&url=" + url + "' id='apilink'><img src='env/grafics/api.png' width='60' height='40' alt='API'/></a><span>See the page info about the start url.</span>";
 }
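
For illustration, the crawl start pages now fetch the link preview with a capped
link count, roughly as in the following sketch (example.net and the returned
values are placeholders, not taken from a real response):

    GET api/getpageinfo_p.xml?actions=title,robots&maxLinks=50&url=http://example.net/

    ...
    <links>
      <link name="http://example.net/some-page.html" />
      ...
    </links>
    <hasMoreLinks>true</hasMoreLinks>
    ...

When hasMoreLinks is "true", handleResponse() in IndexCreate.js makes the
"Show all links" button visible; clicking it calls loadInfos(true), which repeats
the request without the maxLinks parameter, so the server falls back to its
Integer.MAX_VALUE default and the full link list is previewed.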