mirror of https://github.com/yacy/yacy_search_server.git (synced 2025-06-20 03:56:07 -04:00)
Limit the number of initially previewed links in crawl start pages.

This prevents rendering a large and inconvenient scrollbar on resources containing many links. If really needed, a preview of all links is still available via a "Show all links" button. This does not affect the number of links used once the crawl is effectively started, as the list is then loaded again server-side.
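The resulting request pattern, sketched minimally below with the sndReq() helper the crawl start pages already use (url stands for the value of the crawlingURL field; the maxLinks value mirrors the one introduced further down in this commit):

    // initial preview: ask the getpageinfo_p servlet to cap the returned links
    sndReq('api/getpageinfo_p.xml?actions=title,robots&maxLinks=50&url=' + url);

    // "Show all links" button: omit maxLinks, so the servlet falls back to
    // Integer.MAX_VALUE and returns the complete list
    sndReq('api/getpageinfo_p.xml?actions=title,robots&url=' + url);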
@@ -246,6 +246,9 @@
             <dd>
                 <input type="radio" name="crawlingMode" id="sitelist" value="sitelist" #(has_url)#disabled="disabled"::#(/has_url)# #(crawlingMode_sitelist)#::checked="checked"#(/crawlingMode_sitelist)#/><br />
                 <div id="sitelistURLs"></div>
+                <button id="expandSiteListBtn" style="visibility:hidden" type="button" onclick="this.disabled = true;loadInfos(true);" class="btn btn-default btn-xs" title="Show all links">
+                    <span class="glyphicon glyphicon-option-horizontal"/>
+                </button>
             </dd>
             <dt>From Sitemap</dt>
             <dd>
@@ -47,7 +47,12 @@
             </td>
         </tr><tr>
             <td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled" />Link-List of URL</td>
-            <td><div id="sitelistURLs"></div></td>
+            <td>
+                <div id="sitelistURLs"></div>
+                <button id="expandSiteListBtn" style="visibility:hidden" type="button" onclick="this.disabled = true;loadInfos(true);" class="btn btn-default btn-xs" title="Show all links">
+                    <span class="glyphicon glyphicon-option-horizontal"/>
+                </button>
+            </td>
         </tr><tr>
             <td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"
             onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;"/>Sitemap URL</td>
@@ -28,6 +28,7 @@ import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
 
@@ -85,6 +86,7 @@ public class getpageinfo_p {
  * </ul>
  * </li>
  * <li>agentName (optional) : the string identifying the agent used to fetch the resource. Example : "YaCy Internet (cautious)"</li>
+ * <li>maxLinks (optional) : the maximum number of links, sitemap URLs or icons to return</li>
  * </ul>
  * @param env
  *            server environment
@@ -110,6 +112,7 @@ public class getpageinfo_p {
         String actions = "title,robots";
 
         if (post != null && post.containsKey("url")) {
+            final int maxLinks = post.getInt("maxLinks", Integer.MAX_VALUE);
             if (post.containsKey("actions"))
                 actions=post.get("actions");
             String url=post.get("url");
@@ -135,7 +138,7 @@ public class getpageinfo_p {
         net.yacy.document.Document scraper = null;
         if (u != null) try {
             ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
-            scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
+            scraper = sb.loader.loadDocumentAsStream(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
         } catch (final IOException e) {
             ConcurrentLog.logException(e);
             // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
@@ -145,20 +148,25 @@
             // put the document title
             prop.putXML("title", scraper.dc_title());
 
-            // put the icons that belongs to the document
+            // put the icons that belong to the document
             Set<DigestURL> iconURLs = scraper.getIcons().keySet();
-            int i = 0;
+            int count = 0;
             for (DigestURL iconURL : iconURLs) {
-                prop.putXML("icons_" + i + "_icon", iconURL.toNormalform(false));
-                prop.put("icons_" + i + "_eol", 1);
-                i++;
+                if(count >= maxLinks) {
+                    break;
+                }
+                prop.putXML("icons_" + count + "_icon", iconURL.toNormalform(false));
+                prop.put("icons_" + count + "_eol", 1);
+                count++;
             }
-            prop.put("icons_" + (i - 1) + "_eol", 0);
-            prop.put("icons", iconURLs.size());
+            if(count > 0) {
+                prop.put("icons_" + (count - 1) + "_eol", 0);
+            }
+            prop.put("icons", count);
 
             // put keywords
             final Set<String> list = scraper.dc_subject();
-            int count = 0;
+            count = 0;
             for (final String element: list) {
                 if (!element.equals("")) {
                     prop.putXML("tags_"+count+"_tag", element);
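The new if(count > 0) guard also fixes a corner case: for a document without any icons, the old code would have written an icons_-1_eol property. Each icon entry is first written with eol = 1, and the final one is then flipped to 0 to mark the end of the list for the template engine.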
@@ -177,14 +185,20 @@
             final StringBuilder links = new StringBuilder(uris.size() * 80);
             final StringBuilder filter = new StringBuilder(uris.size() * 40);
             count = 0;
-            for (final DigestURL uri: uris) {
+            final Iterator<AnchorURL> urisIt = uris.iterator();
+            while (urisIt.hasNext()) {
+                AnchorURL uri = urisIt.next();
                 if (uri == null) continue;
+                if(count >= maxLinks) {
+                    break;
+                }
                 links.append(';').append(uri.toNormalform(true));
                 filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
                 prop.putXML("links_" + count + "_link", uri.toNormalform(true));
                 count++;
             }
             prop.put("links", count);
+            prop.put("hasMoreLinks", (count >= maxLinks && urisIt.hasNext()) ? "1" : "0");
             prop.putXML("sitelist", links.length() > 0 ? links.substring(1) : "");
             prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
         }
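Replacing the enhanced for loop with an explicit Iterator is what makes hasMoreLinks cheap to compute: after the loop exits, urisIt.hasNext() tells the servlet whether the link list was actually truncated at maxLinks or simply ended there.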
@@ -200,12 +214,17 @@
             prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
             prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());
 
-            // get the sitemap URL of the domain
+            // get the sitemap URL(s) of the domain
             final List<String> sitemaps = robotsEntry == null ? new ArrayList<String>(0) : robotsEntry.getSitemaps();
-            for (int i = 0; i < sitemaps.size(); i++) {
-                prop.putXML("sitemaps_" + i + "_sitemap", sitemaps.get(i));
+            int count = 0;
+            for (String sitemap : sitemaps) {
+                if(count >= maxLinks) {
+                    break;
+                }
+                prop.putXML("sitemaps_" + count + "_sitemap", sitemap);
+                count++;
             }
-            prop.put("sitemaps", sitemaps.size());
+            prop.put("sitemaps", count);
         } catch (final MalformedURLException e) {
             ConcurrentLog.logException(e);
         }
@@ -25,5 +25,6 @@
     <link name="#[link]#" />
   #{/links}#
   </links>
+  <hasMoreLinks>#(hasMoreLinks)#false::true#(/hasMoreLinks)#</hasMoreLinks>
   <oai>#[oai]#</oai>
 </pageinfo>
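In YaCy's servlet template syntax, #(hasMoreLinks)#false::true#(/hasMoreLinks)# renders as false when the servlet put "0" into the property and as true when it put "1". A truncated response therefore ends along these lines (the link URLs here are purely illustrative):

        <link name="https://example.com/a" />
        <link name="https://example.com/b" />
      </links>
      <hasMoreLinks>true</hasMoreLinks>
      <oai></oai>
    </pageinfo>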
@@ -87,7 +87,29 @@ function handleResponse(){
         sitelist=response.getElementsByTagName("sitelist")[0].firstChild.nodeValue;
     }
     document.getElementById("sitelistURLs").innerHTML = sitelist;
-    if (sitelist) document.getElementById("sitelist").disabled=false;
+    var expandButton = document.getElementById("expandSiteListBtn");
+    var siteListRadio = document.getElementById("sitelist");
+    if (sitelist) {
+        siteListRadio.disabled = false;
+        var hasMoreLinksElement = response.getElementsByTagName("hasMoreLinks");
+        if(hasMoreLinksElement != null && hasMoreLinksElement.length > 0
+                && hasMoreLinksElement[0].firstChild != null && hasMoreLinksElement[0].firstChild.nodeValue == "true") {
+            expandButton.style.visibility = "visible";
+            expandButton.disabled = false;
+        } else {
+            expandButton.style.visibility = "hidden";
+        }
+    } else {
+        siteListRadio.disabled = true;
+        siteListRadio.checked = false;
+        var urlModeRadio = document.getElementById("url");
+        if(urlModeRadio != null) {
+            urlModeRadio.checked = true;
+        }
+        if(expandButton != null) {
+            expandButton.style.visibility = "hidden";
+        }
+    }
 
     // clear the ajax image
     document.getElementById("ajax").setAttribute("src", AJAX_OFF);
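Net effect: the expand button becomes visible only when the response reports hasMoreLinks as true, and when no site list could be extracted at all, the handler now also unchecks the sitelist radio and falls back to the plain url crawl mode instead of leaving a stale selection. Clicking the button disables it (see the onclick handlers in the templates above) and re-runs loadInfos(true) to fetch the unbounded list.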
@@ -96,15 +118,18 @@ function handleResponse(){
 
 function changed() {
     window.clearTimeout(timeout);
-    timeout=window.setTimeout("loadInfos()", 1500);
+    timeout=window.setTimeout(loadInfos, 1500);
 }
 
-function loadInfos() {
+/**
+ * @param loadAll {Boolean} when true, load all links, else limit to the 50 first
+ */
+function loadInfos(loadAll) {
     // displaying ajax image
     document.getElementById("ajax").setAttribute("src",AJAX_ON);
 
     var url=document.getElementById("crawlingURL").value;
     if (url.indexOf("ftp") == 0 || url.indexOf("smb") == 0) document.getElementById("crawlingQ").checked = true; // since the pdf parser update for page separation, we need to set this
-    sndReq('api/getpageinfo_p.xml?actions=title,robots&url='+url);
+    sndReq('api/getpageinfo_p.xml?actions=title,robots' + (loadAll ? '' : '&maxLinks=50') + '&url='+url);
     document.getElementById("api").innerHTML = "<a href='api/getpageinfo_p.xml?actions=title,robots&url=" + url + "' id='apilink'><img src='env/grafics/api.png' width='60' height='40' alt='API'/></a><span>See the page info about the start url.</span>";
 }