mirror of
https://github.com/yacy/yacy_search_server.git
synced 2025-07-17 08:26:08 -04:00
Merge remote-tracking branch 'jensbees/crawlexpert-post'
This commit is contained in:
@ -6,9 +6,185 @@
|
||||
<script type="text/javascript" src="/js/ajax.js"></script>
|
||||
<script type="text/javascript" src="/js/IndexCreate.js"></script>
|
||||
<script type="text/javascript">
|
||||
function check(key){
|
||||
document.getElementById(key).checked = 'checked';
|
||||
//<![CDATA[
|
||||
/**
 * Synchronise the enabled / checked state of dependent form controls with
 * the checkboxes and radio buttons that govern them.
 * @param {String} [cId] id of the element whose state has just changed;
 *     left undefined for the initial pass after the page has loaded
 */
function setStates(cId) {
    var initialPass = (typeof cId === 'undefined');
    var startModes;
    var idx;
    var mode;
    var rangeEvent;

    /** Checked state of the control matched by selector. */
    function isOn(selector) {
        return $(selector).isChecked();
    }

    /** Enable the matched controls if `on` is truthy, disable them otherwise. */
    function setEnabled(selector, on) {
        if (on) {
            $(selector).enable();
        } else {
            $(selector).disable();
        }
    }

    /** Focus the matched control, but only if `triggerId` caused this call. */
    function focusWhenTriggered(triggerId, selector) {
        if (cId === triggerId) {
            $(selector).focus();
        }
    }

    // The order of the sections below matters!

    // Crawl start points: only the input that belongs to the selected mode
    // stays editable. The first checked radio in this list wins.
    startModes = [
        { radio: 'url', own: '#crawlingURL', rest: '#sitemapURL, #crawlingFile' },
        { radio: 'sitemap', own: '#sitemapURL', rest: '#crawlingURL, #crawlingFile' },
        { radio: 'file', own: '#crawlingFile', rest: '#crawlingURL, #sitemapURL' }
    ];
    for (idx = 0; idx < startModes.length; idx += 1) {
        mode = startModes[idx];
        if (isOn('#' + mode.radio)) {
            $(mode.own).enable();
            $(mode.rest).disable();
            focusWhenTriggered(mode.radio, mode.own);
            break;
        }
    }

    // Load filters: evaluated on the initial pass and whenever one of the
    // three range radios changed.
    rangeEvent = initialPass || cId === 'rangeDomain' ||
            cId === 'rangeSubpath' || cId === 'rangeWide';
    if (rangeEvent) {
        if (isOn('#rangeDomain') || isOn('#rangeSubpath')) {
            // restricted to domain / sub-path: the must-match filter is unused
            $('#mustmatch').disable();
            // the delete-old radios are left alone on the initial pass
            if (!initialPass) {
                $('#deleteoldoff, #deleteoldage').uncheck();
                $('#deleteoldon').check();
            }
        } else if (isOn('#rangeWide')) {
            // "Use filter": the must-match field becomes editable
            $('#mustmatch').enable();
            // the delete-old radios are left alone on the initial pass
            if (!initialPass) {
                $('#deleteoldon, #deleteoldage').uncheck();
                $('#deleteoldoff').check();
                focusWhenTriggered('rangeWide', '#mustmatch');
            }
        }
    }

    // Crawl start from a link list ("sitelist") selects the
    // "Restrict to start domain(s)" range.
    if (isOn('#sitelist')) {
        $('#rangeDomain').check();
    }

    // "Delete only old": the age selectors are usable only with that option
    setEnabled('#deleteIfOlderNumber, #deleteIfOlderUnit', isOn('#deleteoldage'));

    // "Re-load": the age selectors are usable only with that option
    setEnabled('#reloadIfOlderNumber, #reloadIfOlderUnit', isOn('#reloadoldage'));

    // Country codes: the must-match list is editable unless
    // "no country code restriction" is selected
    if (isOn('#noCountryMustMatchSwitch')) {
        $('#countryMustMatchList').disable();
    } else {
        $('#countryMustMatchList').enable();
        focusWhenTriggered('countryMustMatchSwitch', '#countryMustMatchList');
    }

    // Maximum pages per domain
    if (isOn('#crawlingDomMaxCheck')) {
        $('#crawlingDomMaxPages').enable();
        focusWhenTriggered('crawlingDomMaxCheck', '#crawlingDomMaxPages');
    } else {
        $('#crawlingDomMaxPages').disable();
    }

    // Remote crawl: the intention text is editable only if remote crawling is on
    if (isOn('#crawlOrder')) {
        $('#intention').enable();
        focusWhenTriggered('crawlOrder', '#intention');
    } else {
        $('#intention').disable();
    }
}
|
||||
|
||||
/**
 * Disable a form element when its current value is exactly `val`.
 * @param {String} id element id (without the leading '#')
 * @param {String} val value to compare the element's value to
 */
function disableIf(id, val) {
    var field = $('#' + id);
    if (field.val() !== val) {
        return;
    }
    field.disable();
}
|
||||
|
||||
$(document).ready(function() {
    // Small jQuery plugin helpers used by setStates() and disableIf().
    (function($) {
        /** Disable every matched form element. */
        $.fn.disable = function() {
            return this.each(function() {
                $(this).prop('disabled', true);
            });
        };

        /** Enable every matched form element. */
        $.fn.enable = function() {
            return this.each(function() {
                $(this).prop('disabled', false);
            });
        };

        /**
         * Report the live checked state of the first matched element.
         * Only the `checked` property is consulted: the `checked` attribute
         * reflects the initial markup and does not follow user clicks, so a
         * radio button rendered with checked="checked" would otherwise keep
         * reporting "checked" after another option of its group was chosen.
         * @returns {Boolean} true if the element is currently checked
         */
        $.fn.isChecked = function() {
            return this.prop("checked") === true;
        };

        /** Set the checked state of checkboxes / radio buttons. */
        $.fn.check = function() {
            return this.each(function() {
                $(this).attr("checked", "checked").prop("checked", true);
            });
        };

        /** Clear the checked state of checkboxes / radio buttons. */
        $.fn.uncheck = function() {
            return this.each(function() {
                $(this).removeAttr("checked").prop("checked", false);
            });
        };
    })(jQuery);

    /**
     * On form submission, disable text fields that still hold their default
     * value so they are left out of the request; YaCy sets missing values
     * to those defaults itself.
     * @param {eventObject} ev submit event (unused)
     */
    $('#Crawler').on('submit', function(ev){
        // placeholders are filled in by the server-side template
        var defaultMatchAll = "#[matchAllStr]#";
        var defaultMatchNone = "#[matchNoneStr]#";

        // remove empty text fields
        disableIf('crawlingDepthExtension', '');
        disableIf('intention', '');

        // remove if MATCH_NEVER_STRING
        disableIf('mustnotmatch', defaultMatchNone);
        disableIf('ipMustnotmatch', defaultMatchNone);
        disableIf('indexmustnotmatch', defaultMatchNone);
        disableIf('indexcontentmustnotmatch', defaultMatchNone);

        // remove if MATCH_ALL_STRING
        disableIf('mustmatch', defaultMatchAll);
        disableIf('ipMustmatch', defaultMatchAll);
        disableIf('indexmustmatch', defaultMatchAll);
        disableIf('indexcontentmustmatch', defaultMatchAll);

        // remove default collection name
        disableIf('collection', '#[defaultCollection]#');
    });

    // re-evaluate the form state whenever a checkbox or radio button changes
    $(document).on('change', 'input:checkbox,input:radio', function() {
        setStates($(this).attr("id"));
    });

    // set initial states
    if ($('#crawlingURL').val() !== '') { changed(); }
    setStates();
});
|
||||
//]]>
|
||||
</script>
|
||||
<style type="text/css">
|
||||
.nobr {
|
||||
@ -50,8 +226,8 @@
|
||||
Each of these URLs are the root for a crawl start, existing start URLs are always re-loaded.
|
||||
Other already visited URLs are sorted out as "double", if they are not allowed using the re-crawl option.
|
||||
</span></span>
|
||||
<input type="radio" align="top" name="crawlingMode" id="url" value="url" checked="checked" />
|
||||
<textarea name="crawlingURL" id="crawlingURL" cols="64" rows="3" size="41" onkeypress="changed()" onfocus="check('url')" >#[starturl]#</textarea>
|
||||
<input type="radio" align="top" name="crawlingMode" id="url" value="url" #(crawlingMode_url)#::checked="checked"#(/crawlingMode_url)# />
|
||||
<textarea name="crawlingURL" id="crawlingURL" cols="64" rows="3" size="41" onkeypress="changed()">#[starturl]#</textarea>
|
||||
|
||||
<span id="robotsOK"></span>
|
||||
<span id="title"><br/></span>
|
||||
@ -59,20 +235,20 @@
|
||||
</dd>
|
||||
<dt></dt>
|
||||
<dd>
|
||||
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="46" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/>
|
||||
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="46" maxlength="256" value="#[bookmarkTitle]#" readonly="readonly" style="background:transparent; border:0px"/>
|
||||
</dd>
|
||||
<dt>From Link-List of URL</dt>
|
||||
<dd>
|
||||
<input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled" onclick="document.getElementById('Crawler').rangeDomain.checked = true;"/><br />
|
||||
<input type="radio" name="crawlingMode" id="sitelist" value="sitelist" #(has_url)#disabled="disabled"::#(/has_url)# #(crawlingMode_sitelist)#::checked="checked"#(/crawlingMode_sitelist)#/><br />
|
||||
<div id="sitelistURLs"></div>
|
||||
</dd>
|
||||
<dt>From Sitemap</dt>
|
||||
<dd>
|
||||
<input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"/><input name="sitemapURL" type="text" size="71" maxlength="256" value="" readonly="readonly"/>
|
||||
<input type="radio" name="crawlingMode" id="sitemap" value="sitemap" #(crawlingMode_sitemap)#::checked="checked"#(/crawlingMode_sitemap)# #(has_sitemapURL)#disabled="disabled"::#(/has_sitemapURL)#/><input name="sitemapURL" id="sitemapURL" type="text" size="71" maxlength="256" value="#[sitemapURL]#"/>
|
||||
</dd>
|
||||
<dt>From File (enter a path<br/>within your local file system)</dt>
|
||||
<dd>
|
||||
<input type="radio" name="crawlingMode" id="file" value="file" onclick="document.getElementById('Crawler').rangeDomain.checked = true;"/><input type="text" name="crawlingFile" size="71" maxlength="256" onfocus="check('file')"/><!--<input type="file" name="crawlingFile" size="18" onfocus="check('file')"/>-->
|
||||
<input type="radio" name="crawlingMode" id="file" value="file" #(crawlingMode_file)#::checked="checked"#(/crawlingMode_file)#/><input type="text" name="crawlingFile" id="crawlingFile" value="#[crawlingFile]#" size="71" maxlength="256"/>
|
||||
</dd>
|
||||
</dl>
|
||||
</fieldset>
|
||||
@ -129,10 +305,10 @@
|
||||
</span></span>
|
||||
<table border="0">
|
||||
<tr><td width="110"><img src="/env/grafics/plus.gif"> must-match</td><td></td></tr>
|
||||
<tr><td colspan="2"><input type="radio" name="range" id="rangeDomain" value="domain" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;"/>Restrict to start domain(s)</td></tr>
|
||||
<tr><td colspan="2"><input type="radio" name="range" id="rangeSubpath" value="subpath" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;" />Restrict to sub-path(s)</td></tr>
|
||||
<tr><td><input type="radio" name="range" id="rangeWide" value="wide" checked="checked" onclick="document.getElementById('mustmatch').disabled=false;document.getElementById('deleteoldoff').checked=true;document.getElementById('deleteoldon').disabled=true;document.getElementById('deleteoldage').disabled=true;"/>Use filter</td>
|
||||
<td valign="bottom"><input name="mustmatch" id="mustmatch" type="text" size="55" maxlength="100000" value="#[mustmatch]#" onclick="document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false"/></td></tr>
|
||||
<tr><td colspan="2"><input type="radio" name="range" id="rangeDomain" value="domain" #(range_domain)#::checked="checked"#(/range_domain)#/>Restrict to start domain(s)</td></tr>
|
||||
<tr><td colspan="2"><input type="radio" name="range" id="rangeSubpath" value="subpath" #(range_subpath)#::checked="checked"#(/range_subpath)#/>Restrict to sub-path(s)</td></tr>
|
||||
<tr><td><input type="radio" name="range" id="rangeWide" value="wide" #(range_wide)#::checked="checked"#(/range_wide)#/>Use filter</td>
|
||||
<td valign="bottom"><input name="mustmatch" id="mustmatch" type="text" size="55" maxlength="100000" value="#[mustmatch]#"/></td></tr>
|
||||
<tr><td><img src="/env/grafics/minus.gif"> must-not-match</td><td><input name="mustnotmatch" id="mustnotmatch" type="text" size="55" maxlength="100000" value="#[mustnotmatch]#" /></td></tr>
|
||||
</table>
|
||||
</dd>
|
||||
@ -149,8 +325,8 @@
|
||||
Crawls can be restricted to specific countries. This uses the country code that can be computed from
|
||||
the IP of the server that hosts the page. The filter is not a regular expressions but a list of country codes, separated by comma.
|
||||
</span></span>
|
||||
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="false" checked="checked" />no country code restriction<br />
|
||||
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="true" />Use filter
|
||||
<input type="radio" name="countryMustMatchSwitch" id="noCountryMustMatchSwitch" value="0" #(countryMustMatchSwitchChecked)#checked="checked"::#(/countryMustMatchSwitchChecked)# />no country code restriction<br />
|
||||
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="1" #(countryMustMatchSwitchChecked)#::checked="checked"#(/countryMustMatchSwitchChecked)# />Use filter
|
||||
<input name="countryMustMatchList" id="countryMustMatchList" type="text" size="60" maxlength="256" value="#[countryMustMatch]#" />
|
||||
</dd>
|
||||
</dl>
|
||||
@ -187,25 +363,21 @@
|
||||
After a crawl was done in the past, document may become stale and eventually they are also deleted on the target host.
|
||||
To remove old files from the search index it is not sufficient to just consider them for re-load but it may be necessary
|
||||
to delete them because they simply do not exist any more. Use this in combination with re-crawl while this time should be longer.
|
||||
</span></span><input type="radio" name="deleteold" id="deleteoldoff" value="off" checked="checked"/>Do not delete any document before the crawl is started.</dd>
|
||||
</span></span><input type="radio" name="deleteold" id="deleteoldoff" value="off" #(deleteold_off)#::checked="checked"#(/deleteold_off)#/>Do not delete any document before the crawl is started.</dd>
|
||||
<dt>Delete sub-path</dt>
|
||||
<dd><input type="radio" name="deleteold" id="deleteoldon" value="on" disabled="true"/>For each host in the start url list, delete all documents (in the given subpath) from that host.</dd>
|
||||
<dd><input type="radio" name="deleteold" id="deleteoldon" value="on" #(deleteold_on)#::checked="checked"#(/deleteold_on)#/>For each host in the start url list, delete all documents (in the given subpath) from that host.</dd>
|
||||
<dt>Delete only old</dt>
|
||||
<dd><input type="radio" name="deleteold" id="deleteoldage" value="age" disabled="true"/>Treat documents that are loaded
|
||||
<dd><input type="radio" name="deleteold" id="deleteoldage" value="age" #(deleteold_age)#::checked="checked"#(/deleteold_age)#/>Treat documents that are loaded
|
||||
<select name="deleteIfOlderNumber" id="deleteIfOlderNumber">
|
||||
<option value="1">1</option><option value="2">2</option><option value="3">3</option>
|
||||
<option value="4">4</option><option value="5">5</option><option value="6">6</option>
|
||||
<option value="7">7</option>
|
||||
<option value="8">8</option><option value="9">9</option><option value="10">10</option>
|
||||
<option value="12">12</option><option value="14" selected="selected">14</option><option value="21">21</option>
|
||||
<option value="28">28</option><option value="30">30</option>
|
||||
</select>
|
||||
<select name="deleteIfOlderUnit" id="deleteIfOlderUnit">
|
||||
<option value="year">years</option>
|
||||
<option value="month">months</option>
|
||||
<option value="day" selected="selected">days</option>
|
||||
<option value="hour">hours</option>
|
||||
</select> ago as stale and delete them before the crawl is started.
|
||||
#(deleteIfOlderSelect)#::
|
||||
#{list}#<option value="#[name]#" #(default)#::selected="selected"#(/default)#>#[name]#</option>#{/list}#
|
||||
#(/deleteIfOlderSelect)#
|
||||
</select>
|
||||
<select name="deleteIfOlderUnit" id="deleteIfOlderUnit">
|
||||
#(deleteIfOlderUnitSelect)#::
|
||||
#{list}#<option value="#[value]#" #(default)#::selected="selected"#(/default)#>#[name]#</option>#{/list}#
|
||||
#(/deleteIfOlderUnitSelect)#
|
||||
</select> ago as stale and delete them before the crawl is started.
|
||||
</dd>
|
||||
</dl>
|
||||
</fieldset>
|
||||
@ -217,23 +389,19 @@
|
||||
A web crawl performs a double-check on all links found in the internet against the internal database. If the same url is found again,
|
||||
then the url is treated as double when you check the 'no doubles' option. A url may be loaded again when it has reached a specific age,
|
||||
to use that check the 're-load' option.
|
||||
</span></span><input type="radio" name="recrawl" value="nodoubles" checked="checked"/>Never load any page that is already known. Only the start-url may be loaded again.</dd>
|
||||
</span></span><input type="radio" name="recrawl" id="reloadoldoff" value="nodoubles" #(recrawl_nodoubles)#::checked="checked"#(/recrawl_nodoubles)#/>Never load any page that is already known. Only the start-url may be loaded again.</dd>
|
||||
<dt>Re-load</dt>
|
||||
<dd><input type="radio" name="recrawl" value="reload"/>Treat documents that are loaded
|
||||
<select name="reloadIfOlderNumber" id="reloadIfOlderNumber">
|
||||
<option value="1">1</option><option value="2">2</option><option value="3">3</option>
|
||||
<option value="4">4</option><option value="5">5</option><option value="6">6</option>
|
||||
<option value="7" selected="selected">7</option>
|
||||
<option value="8">8</option><option value="9">9</option><option value="10">10</option>
|
||||
<option value="12">12</option><option value="14">14</option><option value="21">21</option>
|
||||
<option value="28">28</option><option value="30">30</option>
|
||||
</select>
|
||||
<select name="reloadIfOlderUnit" id="reloadIfOlderUnit">
|
||||
<option value="year">years</option>
|
||||
<option value="month">months</option>
|
||||
<option value="day" selected="selected">days</option>
|
||||
<option value="hour">hours</option>
|
||||
</select> ago as stale and load them again. If they are younger, they are ignored.
|
||||
<dd><input type="radio" name="recrawl" id="reloadoldage" value="reload" #(recrawl_reload)#::checked="checked"#(/recrawl_reload)#/>Treat documents that are loaded
|
||||
<select name="reloadIfOlderNumber" id="reloadIfOlderNumber">
|
||||
#(reloadIfOlderSelect)#::
|
||||
#{list}#<option value="#[name]#" #(default)#::selected="selected"#(/default)#>#[name]#</option>#{/list}#
|
||||
#(/reloadIfOlderSelect)#
|
||||
</select>
|
||||
<select name="reloadIfOlderUnit" id="reloadIfOlderUnit">
|
||||
#(reloadIfOlderUnitSelect)#::
|
||||
#{list}#<option value="#[value]#" #(default)#::selected="selected"#(/default)#>#[name]#</option>#{/list}#
|
||||
#(/reloadIfOlderUnitSelect)#
|
||||
</select> ago as stale and load them again. If they are younger, they are ignored.
|
||||
</dd>
|
||||
</dl>
|
||||
</fieldset>
|
||||
@ -256,10 +424,10 @@
|
||||
<b>if exist</b>: use the cache if the cache exist. Do no check freshness. Otherwise use online source;
|
||||
<b>cache only</b>: never go online, use all content from cache. If no cache exist, treat content as unavailable
|
||||
</span></span>
|
||||
<input type="radio" name="cachePolicy" value="nocache" />no cache
|
||||
<input type="radio" name="cachePolicy" value="iffresh" checked="checked" />if fresh
|
||||
<input type="radio" name="cachePolicy" value="ifexist" />if exist
|
||||
<input type="radio" name="cachePolicy" value="cacheonly" />cache only
|
||||
<input type="radio" name="cachePolicy" value="nocache" #(cachePolicy_nocache)#::checked="checked"#(/cachePolicy_nocache)#/>no cache
|
||||
<input type="radio" name="cachePolicy" value="iffresh" #(cachePolicy_iffresh)#::checked="checked"#(/cachePolicy_iffresh)# />if fresh
|
||||
<input type="radio" name="cachePolicy" value="ifexist" #(cachePolicy_ifexist)#::checked="checked"#(/cachePolicy_ifexist)#/>if exist
|
||||
<input type="radio" name="cachePolicy" value="cacheonly" #(cachePolicy_cacheonly)#::checked="checked"#(/cachePolicy_cacheonly)#/>cache only
|
||||
</dd>
|
||||
</dl>
|
||||
</fieldset>
|
||||
@ -290,7 +458,7 @@
|
||||
<dt>Do Local Indexing</dt>
|
||||
<dd>
|
||||
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
||||
This enables indexing of the wepages the crawler will download. This should be switched on by default, unless you want to crawl only to fill the
|
||||
This enables indexing of the webpages the crawler will download. This should be switched on by default, unless you want to crawl only to fill the
|
||||
Document Cache without indexing.
|
||||
</span></span>
|
||||
<label for="indexText">index text</label>:
|
||||
@ -315,7 +483,7 @@
|
||||
</td>
|
||||
<td>
|
||||
<label for="intention">Describe your intention to start this global crawl (optional)</label>:<br />
|
||||
<input name="intention" id="intention" type="text" size="40" maxlength="100" value="" /><br />
|
||||
<input name="intention" id="intention" type="text" size="40" maxlength="100" value="#[intention]#" /><br />
|
||||
This message will appear in the 'Other Peer Crawl Start' table of other peers.
|
||||
</td>
|
||||
</tr>
|
||||
@ -335,7 +503,7 @@
|
||||
</dl>
|
||||
</fieldset>
|
||||
|
||||
<dt><input type="submit" name="crawlingstart" value="Start New Crawl Job" class="submitready"/></dt><dd></dd>
|
||||
<dt><input type="hidden" name="crawlingstart" value="1"/><input type="submit" value="Start New Crawl Job" class="submitready"/></dt><dd></dd>
|
||||
</dl>
|
||||
</fieldset>
|
||||
</form>
|
||||
|
@ -42,36 +42,516 @@ public class CrawlStartExpert_p {
|
||||
// return variable that accumulates replacements
|
||||
final Switchboard sb = (Switchboard) env;
|
||||
final serverObjects prop = new serverObjects();
|
||||
final String defaultCollection = "user";
|
||||
|
||||
// define visible variables
|
||||
prop.put("starturl", /*(intranet) ? repository :*/ "");
|
||||
// javascript values
|
||||
prop.put("matchAllStr", CrawlProfile.MATCH_ALL_STRING);
|
||||
prop.put("matchNoneStr", CrawlProfile.MATCH_NEVER_STRING);
|
||||
prop.put("defaultCollection", defaultCollection);
|
||||
|
||||
// ---------- Start point
|
||||
// crawl start URL
|
||||
if (post != null && post.containsKey("crawlingURL")) {
|
||||
final String crawlingURL = post.get("crawlingURL", "");
|
||||
prop.put("starturl", crawlingURL);
|
||||
// simple check for content since it may be empty
|
||||
if (!crawlingURL.trim().isEmpty()) {
|
||||
prop.put("has_url", 1);
|
||||
}
|
||||
} else {
|
||||
prop.put("starturl", "");
|
||||
}
|
||||
|
||||
// sitemap URL
|
||||
if (post != null && post.containsKey("sitemapURL")) {
|
||||
final String sitemapURL = post.get("sitemapURL", "");
|
||||
prop.put("sitemapURL", sitemapURL);
|
||||
// simple check for content since it may be empty
|
||||
if (!sitemapURL.trim().isEmpty()) {
|
||||
prop.put("has_sitemapURL", 1);
|
||||
}
|
||||
} else {
|
||||
prop.put("sitemapURL", "");
|
||||
}
|
||||
|
||||
// crawling file
|
||||
if (post != null && post.containsKey("crawlingFile")) {
|
||||
final String crawlingFile = post.get("crawlingFile", "");
|
||||
prop.put("crawlingFile", crawlingFile);
|
||||
// simple check for content since it may be empty
|
||||
if (!crawlingFile.trim().isEmpty()) {
|
||||
prop.put("has_crawlingFile", 1);
|
||||
}
|
||||
} else {
|
||||
prop.put("crawlingFile", "");
|
||||
}
|
||||
|
||||
// Crawling mode
|
||||
if (post != null && post.containsKey("crawlingMode")) {
|
||||
final String crawlingMode = post.get("crawlingMode", "");
|
||||
boolean hasMode = false;
|
||||
if (crawlingMode.equalsIgnoreCase("sitelist")
|
||||
&& prop.getBoolean("has_url")) {
|
||||
// sitelist needs "crawlingURL" parameter, checked already
|
||||
prop.put("crawlingMode_sitelist", 1);
|
||||
hasMode = true;
|
||||
} else if (crawlingMode.equalsIgnoreCase("sitemap")
|
||||
&& prop.getBoolean("has_sitemapURL")) {
|
||||
// sitemap needs "sitemapURL" parameter, checked already
|
||||
prop.put("crawlingMode_sitemap", 1);
|
||||
hasMode = true;
|
||||
} else if (crawlingMode.equalsIgnoreCase("file")
|
||||
&& prop.getBoolean("has_crawlingFile")) {
|
||||
// file needs "crawlingFile" parameter, checked already
|
||||
prop.put("crawlingMode_file", 1);
|
||||
hasMode = true;
|
||||
} else if (crawlingMode.equalsIgnoreCase("url")
|
||||
&& prop.getBoolean("has_crawlingURL")) {
|
||||
prop.put("crawlingMode_url", 1);
|
||||
hasMode = true;
|
||||
}
|
||||
// try to guess mode
|
||||
if (!hasMode) {
|
||||
if (prop.getBoolean("has_url")) {
|
||||
prop.put("crawlingMode_url", 1);
|
||||
} else if (prop.getBoolean("has_sitemapURL")) {
|
||||
prop.put("crawlingMode_sitemap", 1);
|
||||
} else if (prop.getBoolean("has_crawlingFile")) {
|
||||
prop.put("crawlingMode_file", 1);
|
||||
} else {
|
||||
prop.put("crawlingMode_url", 1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// default to URL
|
||||
prop.put("crawlingMode_url", 1);
|
||||
}
|
||||
|
||||
|
||||
// Bookmark title (set by script)
|
||||
if (post != null && post.containsKey("bookmarkTitle")) {
|
||||
prop.put("bookmarkTitle", post.get("bookmarkTitle", ""));
|
||||
} else {
|
||||
prop.put("bookmarkTitle", "");
|
||||
}
|
||||
|
||||
|
||||
// ---------- Crawling filter
|
||||
final int crawlingDomMaxPages = env.getConfigInt(
|
||||
"crawlingDomMaxPages", -1);
|
||||
|
||||
// crawling depth
|
||||
if (post != null && post.containsKey("crawlingDepth")) {
|
||||
final Integer depth = post.getInt("crawlingDepth", -1);
|
||||
// depth is limited to two digits, zero allowed
|
||||
if (depth >= 0 && depth < 100) {
|
||||
prop.put("crawlingDepth", depth);
|
||||
}
|
||||
}
|
||||
if (!prop.containsKey("crawlingDepth")) {
|
||||
prop.put("crawlingDepth", Math.min(3,
|
||||
env.getConfigLong("crawlingDepth", 0)));
|
||||
}
|
||||
|
||||
// linked non-parseable documents?
|
||||
if (post == null) {
|
||||
prop.put("directDocByURLChecked",
|
||||
sb.getConfigBool("crawlingDirectDocByURL", true) ? 1 : 0);
|
||||
} else {
|
||||
prop.put("directDocByURLChecked",
|
||||
post.getBoolean("directDocByURL") ? 1 : 0);
|
||||
}
|
||||
|
||||
// Unlimited crawl depth for URLs matching with
|
||||
if (post != null && post.containsKey("crawlingDepthExtension")) {
|
||||
prop.put("crawlingDepthExtension",
|
||||
post.get("crawlingDepthExtension", ""));
|
||||
} else {
|
||||
prop.put("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING);
|
||||
}
|
||||
|
||||
// Limit by maximum Pages per Domain?
|
||||
if (post == null) {
|
||||
prop.put("crawlingDomMaxCheck",
|
||||
(crawlingDomMaxPages == -1) ? 0 : 1);
|
||||
} else {
|
||||
prop.put("crawlingDomMaxCheck",
|
||||
post.getBoolean("crawlingDomMaxCheck") ? 1 : 0);
|
||||
}
|
||||
|
||||
// Maximum Pages per Domain
|
||||
if (post != null && post.containsKey("crawlingDomMaxPages")) {
|
||||
final Integer maxPages = post.getInt("crawlingDomMaxPages", -1);
|
||||
// page count is limited to six digits, zero not allowed
|
||||
if (maxPages > 0 && maxPages < 1000000) {
|
||||
prop.put("crawlingDomMaxPages", maxPages);
|
||||
}
|
||||
}
|
||||
if (!prop.containsKey("crawlingDomMaxPages")) {
|
||||
prop.put("crawlingDomMaxPages",
|
||||
(crawlingDomMaxPages == -1) ? 10000 : crawlingDomMaxPages);
|
||||
}
|
||||
|
||||
// Accept URLs with query-part?
|
||||
// Obey html-robots-noindex?
|
||||
if (post == null) {
|
||||
prop.put("crawlingQChecked",
|
||||
env.getConfigBool("crawlingQ", true) ? 1 : 0);
|
||||
prop.put("obeyHtmlRobotsNoindexChecked",
|
||||
env.getConfigBool("obeyHtmlRobotsNoindex", true) ? 1 : 0);
|
||||
} else {
|
||||
prop.put("crawlingQChecked", post.getBoolean("crawlingQ") ? 1 : 0);
|
||||
prop.put("obeyHtmlRobotsNoindexChecked",
|
||||
post.getBoolean("obeyHtmlRobotsNoindex") ? 1 : 0);
|
||||
}
|
||||
|
||||
// Load Filter on URLs (range)
|
||||
if (post != null && post.containsKey("range")) {
|
||||
final String range = post.get("range", "");
|
||||
if (range.equalsIgnoreCase("domain")) {
|
||||
prop.put("range_domain", 1);
|
||||
} else if (range.equalsIgnoreCase("subpath")) {
|
||||
prop.put("range_subpath", 1);
|
||||
} else if (range.equalsIgnoreCase("wide")) {
|
||||
prop.put("range_wide", 1);
|
||||
}
|
||||
} else {
|
||||
prop.put("range_wide", 1);
|
||||
}
|
||||
|
||||
// Load Filter on URLs: must match
|
||||
if (post != null && post.containsKey("mustmatch")) {
|
||||
prop.put("mustmatch", post.get("mustmatch", ""));
|
||||
} else {
|
||||
prop.put("mustmatch", CrawlProfile.MATCH_ALL_STRING);
|
||||
}
|
||||
|
||||
// Load Filter on URLs: must-not-match
|
||||
if (post != null && post.containsKey("mustnotmatch")) {
|
||||
prop.put("mustnotmatch", post.get("mustnotmatch", ""));
|
||||
} else {
|
||||
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
|
||||
}
|
||||
|
||||
// Load Filter on IPs: must match
|
||||
if (post != null && post.containsKey("ipMustmatch")) {
|
||||
prop.put("ipMustmatch", post.get("ipMustmatch", ""));
|
||||
} else {
|
||||
prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch",
|
||||
CrawlProfile.MATCH_ALL_STRING));
|
||||
}
|
||||
|
||||
// Load Filter on IPs: must-not-match
|
||||
if (post != null && post.containsKey("ipMustnotmatch")) {
|
||||
prop.put("ipMustnotmatch", post.get("ipMustnotmatch", ""));
|
||||
} else {
|
||||
prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch",
|
||||
CrawlProfile.MATCH_NEVER_STRING));
|
||||
}
|
||||
|
||||
// Use Country Codes Match-List?
|
||||
if (post == null) {
|
||||
// use the default that was set in the original template
|
||||
prop.put("countryMustMatchSwitchChecked", 0);
|
||||
} else {
|
||||
prop.put("countryMustMatchSwitchChecked",
|
||||
post.getBoolean("countryMustMatchSwitch") ? 1 : 0);
|
||||
}
|
||||
|
||||
// Must-Match List for Country Codes
|
||||
if (post != null && post.containsKey("countryMustMatchList")) {
|
||||
prop.put("countryMustMatch", post.get("countryMustMatchList", ""));
|
||||
} else {
|
||||
prop.put("countryMustMatch",
|
||||
sb.getConfig("crawlingCountryMustMatch", ""));
|
||||
}
|
||||
|
||||
|
||||
// ---------- Document filter
|
||||
// Indexer filter on URLs: must match
|
||||
if (post != null && post.containsKey("indexmustmatch")) {
|
||||
prop.put("indexmustmatch", post.get("indexmustmatch", ""));
|
||||
} else {
|
||||
prop.put("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
|
||||
}
|
||||
|
||||
// Indexer filter on URLs: must-no-match
|
||||
if (post != null && post.containsKey("indexmustnotmatch")) {
|
||||
prop.put("indexmustnotmatch", post.get("indexmustnotmatch", ""));
|
||||
} else {
|
||||
prop.put("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
|
||||
}
|
||||
|
||||
// Filter on Content of Document: must match
|
||||
if (post != null && post.containsKey("indexcontentmustmatch")) {
|
||||
prop.put("indexcontentmustmatch",
|
||||
post.get("indexcontentmustmatch", ""));
|
||||
} else {
|
||||
prop.put("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING);
|
||||
}
|
||||
|
||||
// Filter on Content of Document: must-not-match
|
||||
if (post != null && post.containsKey("indexcontentmustnotmatch")) {
|
||||
prop.put("indexcontentmustnotmatch",
|
||||
post.get("indexcontentmustnotmatch", ""));
|
||||
} else {
|
||||
prop.put("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
|
||||
}
|
||||
|
||||
|
||||
// ---------- Clean-Up before Crawl Start
|
||||
// delete if older settings: number value
|
||||
prop.put("deleteIfOlderSelect", 1);
|
||||
for (int i=0; i<13; i++) {
|
||||
prop.put("deleteIfOlderSelect_list_"+i+"_name", Integer.toString(i));
|
||||
}
|
||||
prop.put("deleteIfOlderSelect_list_13_name", "14");
|
||||
prop.put("deleteIfOlderSelect_list_14_name", "21");
|
||||
prop.put("deleteIfOlderSelect_list_15_name", "28");
|
||||
prop.put("deleteIfOlderSelect_list_16_name", "30");
|
||||
prop.put("deleteIfOlderSelect_list", 17);
|
||||
|
||||
if (post != null && post.containsKey("deleteIfOlderNumber")) {
|
||||
final Integer olderNumber = post.getInt("deleteIfOlderNumber", -1);
|
||||
if (olderNumber >0 && olderNumber <= 12) {
|
||||
prop.put("deleteIfOlderSelect_list_" + olderNumber +
|
||||
"_default", 1);
|
||||
} else {
|
||||
switch (olderNumber) {
|
||||
case 21:
|
||||
prop.put("deleteIfOlderSelect_list_14_default", 1);
|
||||
break;
|
||||
case 28:
|
||||
prop.put("deleteIfOlderSelect_list_15_default", 1);
|
||||
break;
|
||||
case 30:
|
||||
prop.put("deleteIfOlderSelect_list_16_default", 1);
|
||||
break;
|
||||
default:
|
||||
prop.put("deleteIfOlderSelect_list_13_default", 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
prop.put("deleteIfOlderSelect_list_13_default", 1);
|
||||
}
|
||||
|
||||
// delete if older settings: number unit
|
||||
prop.put("deleteIfOlderUnitSelect", 1);
|
||||
prop.put("deleteIfOlderUnitSelect_list_0_name", "years");
|
||||
prop.put("deleteIfOlderUnitSelect_list_0_value", "year");
|
||||
prop.put("deleteIfOlderUnitSelect_list_1_name", "months");
|
||||
prop.put("deleteIfOlderUnitSelect_list_1_value", "month");
|
||||
prop.put("deleteIfOlderUnitSelect_list_2_name", "days");
|
||||
prop.put("deleteIfOlderUnitSelect_list_2_value", "day");
|
||||
prop.put("deleteIfOlderUnitSelect_list_3_name", "hours");
|
||||
prop.put("deleteIfOlderUnitSelect_list_3_value", "hour");
|
||||
prop.put("deleteIfOlderUnitSelect_list", 4);
|
||||
|
||||
if (post != null && post.containsKey("deleteIfOlderUnit")) {
|
||||
final String olderUnit = post.get("deleteIfOlderUnit", "");
|
||||
if (olderUnit.equalsIgnoreCase("year")) {
|
||||
prop.put("deleteIfOlderUnitSelect_list_0_default", 1);
|
||||
} else if (olderUnit.equalsIgnoreCase("month")) {
|
||||
prop.put("deleteIfOlderUnitSelect_list_1_default", 1);
|
||||
} else if (olderUnit.equalsIgnoreCase("hour")) {
|
||||
prop.put("deleteIfOlderUnitSelect_list_3_default", 1);
|
||||
} else {
|
||||
prop.put("deleteIfOlderUnitSelect_list_2_default", 1);
|
||||
}
|
||||
} else {
|
||||
prop.put("deleteIfOlderUnitSelect_list_2_default", 1);
|
||||
}
|
||||
|
||||
// delete any document before the crawl is started?
|
||||
if (post != null && post.containsKey("deleteold")) {
|
||||
final String deleteold = post.get("deleteold", "");
|
||||
if (deleteold.equalsIgnoreCase("on")){
|
||||
prop.put("deleteold_on", 1);
|
||||
} else if (deleteold.equalsIgnoreCase("age")) {
|
||||
prop.put("deleteold_age", 1);
|
||||
} else {
|
||||
prop.put("deleteold_off", 1);
|
||||
}
|
||||
} else {
|
||||
prop.put("deleteold_off", 1);
|
||||
}
|
||||
|
||||
|
||||
// ---------- Double-Check Rules
|
||||
// reload settings: number value
|
||||
prop.put("reloadIfOlderSelect", 1);
|
||||
for (int i=0; i<13; i++) {
|
||||
prop.put("reloadIfOlderSelect_list_"+i+"_name", Integer.toString(i));
|
||||
}
|
||||
prop.put("reloadIfOlderSelect_list_13_name", "14");
|
||||
prop.put("reloadIfOlderSelect_list_14_name", "21");
|
||||
prop.put("reloadIfOlderSelect_list_15_name", "28");
|
||||
prop.put("reloadIfOlderSelect_list_16_name", "30");
|
||||
prop.put("reloadIfOlderSelect_list", 17);
|
||||
|
||||
if (post != null && post.containsKey("reloadIfOlderNumber")) {
|
||||
final Integer olderNumber = post.getInt("reloadIfOlderNumber", -1);
|
||||
if (olderNumber >0 && olderNumber <= 12) {
|
||||
prop.put("reloadIfOlderSelect_list_" + olderNumber +
|
||||
"_default", 1);
|
||||
} else {
|
||||
switch (olderNumber) {
|
||||
case 21:
|
||||
prop.put("reloadIfOlderSelect_list_14_default", 1);
|
||||
break;
|
||||
case 28:
|
||||
prop.put("reloadIfOlderSelect_list_15_default", 1);
|
||||
break;
|
||||
case 30:
|
||||
prop.put("reloadIfOlderSelect_list_16_default", 1);
|
||||
break;
|
||||
default:
|
||||
prop.put("reloadIfOlderSelect_list_13_default", 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
prop.put("reloadIfOlderSelect_list_13_default", 1);
|
||||
}
|
||||
|
||||
// reload settings: number unit
|
||||
prop.put("reloadIfOlderUnitSelect", 1);
|
||||
prop.put("reloadIfOlderUnitSelect_list_0_name", "years");
|
||||
prop.put("reloadIfOlderUnitSelect_list_0_value", "year");
|
||||
prop.put("reloadIfOlderUnitSelect_list_1_name", "months");
|
||||
prop.put("reloadIfOlderUnitSelect_list_1_value", "month");
|
||||
prop.put("reloadIfOlderUnitSelect_list_2_name", "days");
|
||||
prop.put("reloadIfOlderUnitSelect_list_2_value", "day");
|
||||
prop.put("reloadIfOlderUnitSelect_list_3_name", "hours");
|
||||
prop.put("reloadIfOlderUnitSelect_list_3_value", "hour");
|
||||
prop.put("reloadIfOlderUnitSelect_list", 4);
|
||||
|
||||
if (post != null && post.containsKey("reloadIfOlderUnit")) {
|
||||
final String olderUnit = post.get("reloadIfOlderUnit", "");
|
||||
if (olderUnit.equalsIgnoreCase("year")) {
|
||||
prop.put("reloadIfOlderUnitSelect_list_0_default", 1);
|
||||
} else if (olderUnit.equalsIgnoreCase("month")) {
|
||||
prop.put("reloadIfOlderUnitSelect_list_1_default", 1);
|
||||
} else if (olderUnit.equalsIgnoreCase("hour")) {
|
||||
prop.put("reloadIfOlderUnitSelect_list_3_default", 1);
|
||||
} else {
|
||||
prop.put("reloadIfOlderUnitSelect_list_2_default", 1);
|
||||
}
|
||||
} else {
|
||||
prop.put("reloadIfOlderUnitSelect_list_2_default", 1);
|
||||
}
|
||||
|
||||
if (post != null && post.containsKey("recrawl")) {
|
||||
final String recrawl = post.get("recrawl", "");
|
||||
if (recrawl.equalsIgnoreCase("reload")) {
|
||||
prop.put("recrawl_reload", 1);
|
||||
} else {
|
||||
prop.put("recrawl_nodoubles", 1);
|
||||
}
|
||||
} else {
|
||||
prop.put("recrawl_nodoubles", 1);
|
||||
}
|
||||
|
||||
|
||||
// ---------- Document Cache
|
||||
// Store to Web Cache?
|
||||
if (post == null) {
|
||||
prop.put("storeHTCacheChecked",
|
||||
env.getConfigBool("storeHTCache", true) ? 1 : 0);
|
||||
} else {
|
||||
prop.put("storeHTCacheChecked",
|
||||
post.getBoolean("storeHTCache") ? 1 : 0);
|
||||
}
|
||||
|
||||
// Policy for usage of Web Cache
|
||||
if (post != null && post.containsKey("cachePolicy")) {
|
||||
final String cachePolicy = post.get("cachePolicy", "");
|
||||
if (cachePolicy.equalsIgnoreCase("nocache")) {
|
||||
prop.put("cachePolicy_nocache", 1);
|
||||
} else if (cachePolicy.equalsIgnoreCase("ifexist")) {
|
||||
prop.put("cachePolicy_ifexist", 1);
|
||||
} else if (cachePolicy.equalsIgnoreCase("cacheonly")) {
|
||||
prop.put("cachePolicy_cacheonly", 1);
|
||||
} else {
|
||||
prop.put("cachePolicy_iffresh", 1);
|
||||
}
|
||||
} else {
|
||||
prop.put("cachePolicy_iffresh", 1);
|
||||
}
|
||||
|
||||
|
||||
// ---------- Agent name (untested & untouched)
|
||||
if (sb.isP2PMode()) {
|
||||
prop.put("agentSelect", 0);
|
||||
} else {
|
||||
prop.put("agentSelect", 1);
|
||||
List<String> agentNames = new ArrayList<String>();
|
||||
if (sb.isIntranetMode()) {
|
||||
agentNames.add(ClientIdentification.yacyIntranetCrawlerAgentName);
|
||||
}
|
||||
if (sb.isGlobalMode()) {
|
||||
agentNames.add(ClientIdentification.yacyInternetCrawlerAgentName);
|
||||
}
|
||||
agentNames.add(ClientIdentification.googleAgentName);
|
||||
if (sb.isAllIPMode()) {
|
||||
agentNames.add(ClientIdentification.browserAgentName);
|
||||
}
|
||||
for (int i = 0; i < agentNames.size(); i++) {
|
||||
prop.put("agentSelect_list_" + i + "_name", agentNames.get(i));
|
||||
}
|
||||
prop.put("agentSelect_list", agentNames.size());
|
||||
}
|
||||
prop.put("agentSelect_defaultAgentName",
|
||||
ClientIdentification.yacyInternetCrawlerAgentName);
|
||||
|
||||
|
||||
// ---------- Index Administration
|
||||
// Do Local Indexing
|
||||
if (post == null) {
|
||||
// Local index text?
|
||||
prop.put("indexingTextChecked",
|
||||
env.getConfigBool("indexText", true) ? 1 : 0);
|
||||
// Local index media?
|
||||
prop.put("indexingMediaChecked",
|
||||
env.getConfigBool("indexMedia", true) ? 1 : 0);
|
||||
// Do Remote Indexing?
|
||||
prop.put("crawlOrderChecked",
|
||||
env.getConfigBool("crawlOrder", true) ? 1 : 0);
|
||||
// Remote crawl intention
|
||||
prop.put("intention", "");
|
||||
} else {
|
||||
prop.put("indexingTextChecked",
|
||||
post.getBoolean("indexText") ? 1 : 0);
|
||||
prop.put("indexingMediaChecked",
|
||||
post.getBoolean("indexMedia") ? 1 : 0);
|
||||
prop.put("crawlOrderChecked",
|
||||
post.getBoolean("crawlOrder") ? 1 : 0);
|
||||
prop.put("intention", post.get("intention", ""));
|
||||
}
|
||||
|
||||
// Target collection
|
||||
boolean collectionEnabled =
|
||||
sb.index.fulltext().getDefaultConfiguration().isEmpty() ||
|
||||
sb.index.fulltext().getDefaultConfiguration().contains(
|
||||
CollectionSchema.collection_sxt);
|
||||
prop.put("collectionEnabled", collectionEnabled ? 1 : 0);
|
||||
if (collectionEnabled) {
|
||||
if (post != null && post.containsKey("collection")) {
|
||||
prop.put("collection", post.get("collection", ""));
|
||||
} else {
|
||||
prop.put("collection", collectionEnabled ? defaultCollection : "");
|
||||
}
|
||||
}
|
||||
|
||||
/* probably unused (no corresponding entry in template)
|
||||
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
|
||||
prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
|
||||
prop.put("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING);
|
||||
prop.put("directDocByURLChecked", sb.getConfigBool("crawlingDirectDocByURL", true) ? "1" : "0");
|
||||
prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL_STRING);
|
||||
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
|
||||
prop.put("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
|
||||
prop.put("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
|
||||
prop.put("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING);
|
||||
prop.put("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
|
||||
prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL_STRING));
|
||||
prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER_STRING));
|
||||
prop.put("countryMustMatch", sb.getConfig("crawlingCountryMustMatch", ""));
|
||||
|
||||
|
||||
final int crawlingDomFilterDepth = env.getConfigInt("crawlingDomFilterDepth", -1);
|
||||
prop.put("crawlingDomFilterCheck", (crawlingDomFilterDepth == -1) ? "0" : "1");
|
||||
prop.put("crawlingDomFilterDepth", (crawlingDomFilterDepth == -1) ? 1 : crawlingDomFilterDepth);
|
||||
final int crawlingDomMaxPages = env.getConfigInt("crawlingDomMaxPages", -1);
|
||||
prop.put("crawlingDomMaxCheck", (crawlingDomMaxPages == -1) ? "0" : "1");
|
||||
prop.put("crawlingDomMaxPages", (crawlingDomMaxPages == -1) ? 10000 : crawlingDomMaxPages);
|
||||
prop.put("crawlingQChecked", env.getConfigBool("crawlingQ", true) ? "1" : "0");
|
||||
|
||||
prop.put("followFramesChecked", env.getConfigBool("followFrames", true) ? "1" : "0");
|
||||
prop.put("obeyHtmlRobotsNoindexChecked", env.getConfigBool("obeyHtmlRobotsNoindex", true) ? "1" : "0");
|
||||
prop.put("storeHTCacheChecked", env.getConfigBool("storeHTCache", true) ? "1" : "0");
|
||||
prop.put("indexingTextChecked", env.getConfigBool("indexText", true) ? "1" : "0");
|
||||
prop.put("indexingMediaChecked", env.getConfigBool("indexMedia", true) ? "1" : "0");
|
||||
prop.put("crawlOrderChecked", env.getConfigBool("crawlOrder", true) ? "1" : "0");
|
||||
|
||||
final long LCbusySleep = env.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 100L);
|
||||
final int LCppm = (LCbusySleep == 0) ? 1000 : (int) (60000L / LCbusySleep);
|
||||
@ -83,25 +563,8 @@ public class CrawlStartExpert_p {
|
||||
prop.put("xsstopwChecked", env.getConfigBool("xsstopw", true) ? "1" : "0");
|
||||
prop.put("xdstopwChecked", env.getConfigBool("xdstopw", true) ? "1" : "0");
|
||||
prop.put("xpstopwChecked", env.getConfigBool("xpstopw", true) ? "1" : "0");
|
||||
*/
|
||||
|
||||
boolean collectionEnabled = sb.index.fulltext().getDefaultConfiguration().isEmpty() || sb.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.collection_sxt);
|
||||
prop.put("collectionEnabled", collectionEnabled ? 1 : 0);
|
||||
prop.put("collection", collectionEnabled ? "user" : "");
|
||||
if (sb.isP2PMode()) {
|
||||
prop.put("agentSelect", 0);
|
||||
} else {
|
||||
prop.put("agentSelect", 1);
|
||||
List<String> agentNames = new ArrayList<String>();
|
||||
if (sb.isIntranetMode()) agentNames.add(ClientIdentification.yacyIntranetCrawlerAgentName);
|
||||
if (sb.isGlobalMode()) agentNames.add(ClientIdentification.yacyInternetCrawlerAgentName);
|
||||
agentNames.add(ClientIdentification.googleAgentName);
|
||||
if (sb.isAllIPMode()) agentNames.add(ClientIdentification.browserAgentName);
|
||||
for (int i = 0; i < agentNames.size(); i++) {
|
||||
prop.put("agentSelect_list_" + i + "_name", agentNames.get(i));
|
||||
}
|
||||
prop.put("agentSelect_list", agentNames.size());
|
||||
}
|
||||
prop.put("agentSelect_defaultAgentName", ClientIdentification.yacyInternetCrawlerAgentName);
|
||||
// return rewrite properties
|
||||
return prop;
|
||||
}
|
||||
|
@ -161,7 +161,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
|
||||
</table>
|
||||
#(/scheduler)#
|
||||
</td>
|
||||
#(inline)#<td>#[url]#</td>::#(/inline)#
|
||||
#(inline)#<td>#(isCrawlerStart)#::<a href="#[url]#">Edit</a> #(/isCrawlerStart)##[url]#</td>::#(/inline)#
|
||||
</tr>
|
||||
#{/list}#
|
||||
</table>
|
||||
|
@ -287,6 +287,13 @@ public class Table_API_p {
|
||||
prop.put("showtable_list_" + count + "_dateNextExec", date_next_exec == null ? "-" : DateFormat.getDateTimeInstance().format(date_next_exec));
|
||||
prop.put("showtable_list_" + count + "_type", row.get(WorkTables.TABLE_API_COL_TYPE));
|
||||
prop.put("showtable_list_" + count + "_comment", row.get(WorkTables.TABLE_API_COL_COMMENT));
|
||||
// check type & action to link crawl start URLs back to CrawlStartExpert_p.html
|
||||
if (prop.get("showtable_list_" + count + "_type", "").equals(WorkTables.TABLE_API_TYPE_CRAWLER)
|
||||
&& prop.get("showtable_list_" + count + "_comment", "").startsWith("crawl start for")) {
|
||||
prop.put("showtable_list_" + count + "_inline_isCrawlerStart", 1);
|
||||
final String editUrl = UTF8.String(row.get(WorkTables.TABLE_API_COL_URL)).replace("Crawler_p", "CrawlStartExpert_p");
|
||||
prop.put("showtable_list_" + count + "_inline_isCrawlerStart_url", editUrl);
|
||||
}
|
||||
prop.putHTML("showtable_list_" + count + "_inline_url", "http://" + sb.myPublicIP() + ":" + sb.getConfig("port", "8090") + UTF8.String(row.get(WorkTables.TABLE_API_COL_URL)));
|
||||
prop.put("showtable_list_" + count + "_scheduler_inline", inline ? "true" : "false");
|
||||
prop.put("showtable_list_" + count + "_scheduler_filter", typefilter.pattern());
|
||||
|
Reference in New Issue
Block a user