From 0f80c978d67acb6e3ab92e13d3ed0d21d0d8d4ab Mon Sep 17 00:00:00 2001
From: luccioman <luccioman@users.noreply.github.com>
Date: Sat, 17 Jun 2017 09:33:14 +0200
Subject: [PATCH] Limit the number of initially previewed links in crawl start pages.

This prevents rendering a big and inconvenient scrollbar on resources
containing many links. If really needed, a preview of all links is still
available with a "Show all links" button.

This does not affect the number of links used once the crawl is effectively
started, as the list is then loaded again server-side.
---
 htroot/CrawlStartExpert.html  |  3 +++
 htroot/CrawlStartSite.html    |  7 +++++-
 htroot/api/getpageinfo_p.java | 47 ++++++++++++++++++++++++-----------
 htroot/api/getpageinfo_p.xml  |  1 +
 htroot/js/IndexCreate.js      | 33 +++++++++++++++++++++---
 5 files changed, 72 insertions(+), 19 deletions(-)

diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html
index a65b89bd8..d8d77634c 100644
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@@ -246,6 +246,9 @@
           <dd>
             <input type="radio" name="crawlingMode" id="sitelist" value="sitelist" #(has_url)#disabled="disabled"::#(/has_url)# #(crawlingMode_sitelist)#::checked="checked"#(/crawlingMode_sitelist)#/><br />
             <div id="sitelistURLs"></div>
+            <button id="expandSiteListBtn" style="visibility:hidden" type="button" onclick="this.disabled = true;loadInfos(true);" class="btn btn-default btn-xs" title="Show all links">
+              <span class="glyphicon glyphicon-option-horizontal"/>
+            </button>
           </dd>
           <dt>From Sitemap</dt>
           <dd>
diff --git a/htroot/CrawlStartSite.html b/htroot/CrawlStartSite.html
index f3b79b415..d8191be6f 100644
--- a/htroot/CrawlStartSite.html
+++ b/htroot/CrawlStartSite.html
@@ -47,7 +47,12 @@
           </td>
         </tr><tr>
           <td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled" />Link-List of URL</td>
-          <td><div id="sitelistURLs"></div></td>
+          <td>
+            <div id="sitelistURLs"></div>
+            <button id="expandSiteListBtn" style="visibility:hidden" type="button" onclick="this.disabled = true;loadInfos(true);" class="btn btn-default btn-xs" title="Show all links">
+              <span class="glyphicon glyphicon-option-horizontal"/>
+            </button>
+          </td>
         </tr><tr>
           <td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled" onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;"/>Sitemap URL</td>
diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java
index dd4e42013..51b6f2bb8 100644
--- a/htroot/api/getpageinfo_p.java
+++ b/htroot/api/getpageinfo_p.java
@@ -28,6 +28,7 @@
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
@@ -85,6 +86,7 @@ public class getpageinfo_p {
      * </ul>
      * </li>
      * <li>agentName (optional) : the string identifying the agent used to fetch the resource.
      *            Example : "YaCy Internet (cautious)"</li>
+     * <li>maxLinks (optional) : the maximum number of links, sitemap URLs or icons to return</li>
      * </ul>
      * @param env
      *            server environment
@@ -110,6 +112,7 @@ public class getpageinfo_p {
         String actions = "title,robots";
 
         if (post != null && post.containsKey("url")) {
+            final int maxLinks = post.getInt("maxLinks", Integer.MAX_VALUE);
             if (post.containsKey("actions"))
                 actions=post.get("actions");
             String url=post.get("url");
@@ -135,7 +138,7 @@ public class getpageinfo_p {
         net.yacy.document.Document scraper = null;
         if (u != null) try {
             ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
-            scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
+            scraper = sb.loader.loadDocumentAsStream(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
         } catch (final IOException e) {
             ConcurrentLog.logException(e);
             // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
@@ -145,20 +148,25 @@ public class getpageinfo_p {
             // put the document title
             prop.putXML("title", scraper.dc_title());
 
-            // put the icons that belongs to the document
+            // put the icons that belong to the document
             Set<DigestURL> iconURLs = scraper.getIcons().keySet();
-            int i = 0;
+            int count = 0;
             for (DigestURL iconURL : iconURLs) {
-                prop.putXML("icons_" + i + "_icon", iconURL.toNormalform(false));
-                prop.put("icons_" + i + "_eol", 1);
-                i++;
+                if(count >= maxLinks) {
+                    break;
+                }
+                prop.putXML("icons_" + count + "_icon", iconURL.toNormalform(false));
+                prop.put("icons_" + count + "_eol", 1);
+                count++;
             }
-            prop.put("icons_" + (i - 1) + "_eol", 0);
-            prop.put("icons", iconURLs.size());
+            if(count > 0) {
+                prop.put("icons_" + (count - 1) + "_eol", 0);
+            }
+            prop.put("icons", count);
 
             // put keywords
             final Set<String> list = scraper.dc_subject();
-            int count = 0;
+            count = 0;
             for (final String element: list) {
                 if (!element.equals("")) {
                     prop.putXML("tags_"+count+"_tag", element);
@@ -177,14 +185,20 @@ public class getpageinfo_p {
             final StringBuilder links = new StringBuilder(uris.size() * 80);
             final StringBuilder filter = new StringBuilder(uris.size() * 40);
             count = 0;
-            for (final DigestURL uri: uris) {
+            final Iterator<AnchorURL> urisIt = uris.iterator();
+            while (urisIt.hasNext()) {
+                AnchorURL uri = urisIt.next();
                 if (uri == null) continue;
+                if(count >= maxLinks) {
+                    break;
+                }
                 links.append(';').append(uri.toNormalform(true));
                 filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
                 prop.putXML("links_" + count + "_link", uri.toNormalform(true));
                 count++;
             }
             prop.put("links", count);
+            prop.put("hasMoreLinks", (count >= maxLinks && urisIt.hasNext()) ? "1" : "0");
             prop.putXML("sitelist", links.length() > 0 ? links.substring(1) : "");
             prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
         }
@@ -200,12 +214,17 @@ public class getpageinfo_p {
             prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
             prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());
-            // get the sitemap URL of the domain
+            // get the sitemap URL(s) of the domain
             final List<String> sitemaps = robotsEntry == null ?
                     new ArrayList<String>(0) : robotsEntry.getSitemaps();
-            for (int i = 0; i < sitemaps.size(); i++) {
-                prop.putXML("sitemaps_" + i + "_sitemap", sitemaps.get(i));
+            int count = 0;
+            for (String sitemap : sitemaps) {
+                if(count >= maxLinks) {
+                    break;
+                }
+                prop.putXML("sitemaps_" + count + "_sitemap", sitemap);
+                count++;
             }
-            prop.put("sitemaps", sitemaps.size());
+            prop.put("sitemaps", count);
         } catch (final MalformedURLException e) {
             ConcurrentLog.logException(e);
         }
diff --git a/htroot/api/getpageinfo_p.xml b/htroot/api/getpageinfo_p.xml
index 9ccd8e3d9..ab8cde09a 100644
--- a/htroot/api/getpageinfo_p.xml
+++ b/htroot/api/getpageinfo_p.xml
@@ -25,5 +25,6 @@
     <link name="#[link]#" />
   #{/links}#
   </links>
+  <hasMoreLinks>#(hasMoreLinks)#false::true#(/hasMoreLinks)#</hasMoreLinks>
   <oai>#[oai]#</oai>
 </pageinfo>
diff --git a/htroot/js/IndexCreate.js b/htroot/js/IndexCreate.js
index 5c827d22e..5b61d558b 100644
--- a/htroot/js/IndexCreate.js
+++ b/htroot/js/IndexCreate.js
@@ -87,7 +87,29 @@ function handleResponse(){
     sitelist=response.getElementsByTagName("sitelist")[0].firstChild.nodeValue;
   }
   document.getElementById("sitelistURLs").innerHTML = sitelist;
-  if (sitelist) document.getElementById("sitelist").disabled=false;
+  var expandButton = document.getElementById("expandSiteListBtn");
+  var siteListRadio = document.getElementById("sitelist");
+  if (sitelist) {
+    siteListRadio.disabled = false;
+    var hasMoreLinksElement = response.getElementsByTagName("hasMoreLinks");
+    if(hasMoreLinksElement != null && hasMoreLinksElement.length > 0
+        && hasMoreLinksElement[0].firstChild != null && hasMoreLinksElement[0].firstChild.nodeValue == "true") {
+      expandButton.style.visibility = "visible";
+      expandButton.disabled = false;
+    } else {
+      expandButton.style.visibility = "hidden";
+    }
+  } else {
+    siteListRadio.disabled = true;
+    siteListRadio.checked = false;
+    var urlModeRadio = document.getElementById("url");
+    if(urlModeRadio != null) {
+      urlModeRadio.checked = true;
+    }
+    if(expandButton != null) {
+      expandButton.style.visibility = "hidden";
+    }
+  }
 
   // clear the ajax image
   document.getElementById("ajax").setAttribute("src", AJAX_OFF);
@@ -96,15 +118,18 @@ function handleResponse(){
 
 function changed() {
   window.clearTimeout(timeout);
-  timeout=window.setTimeout("loadInfos()", 1500);
+  timeout=window.setTimeout(loadInfos, 1500);
 }
 
-function loadInfos() {
+/**
+ * @param loadAll {Boolean} when true, load all links, else limit to the first 50
+ */
+function loadInfos(loadAll) {
   // displaying ajax image
   document.getElementById("ajax").setAttribute("src",AJAX_ON);
 
   var url=document.getElementById("crawlingURL").value;
   if (url.indexOf("ftp") == 0 || url.indexOf("smb") == 0) document.getElementById("crawlingQ").checked = true; // since the pdf parser update for page separation, we need to set this
-  sndReq('api/getpageinfo_p.xml?actions=title,robots&url='+url);
+  sndReq('api/getpageinfo_p.xml?actions=title,robots' + (loadAll ? '' : '&maxLinks=50') + '&url='+url);
   document.getElementById("api").innerHTML = "<a href='api/getpageinfo_p.xml?actions=title,robots&url=" + url + "' id='apilink'><img src='env/grafics/api.png' width='60' height='40' alt='API'/></a><span>See the page info about the start url.</span>";
 }
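
For illustration, the crawl start pages now fetch the link preview with a capped
link count, roughly as in the following sketch (example.net and the returned
values are placeholders, not taken from a real response):

    GET api/getpageinfo_p.xml?actions=title,robots&maxLinks=50&url=http://example.net/

    ...
    <links>
      <link name="http://example.net/some-page.html" />
      ...
    </links>
    <hasMoreLinks>true</hasMoreLinks>
    ...

When hasMoreLinks is "true", handleResponse() in IndexCreate.js makes the
"Show all links" button visible; clicking it calls loadInfos(true), which repeats
the request without the maxLinks parameter, so the server falls back to its
Integer.MAX_VALUE default and the full link list is previewed.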