1
0
mirror of https://github.com/yacy/yacy_search_server.git synced 2025-07-21 09:04:37 -04:00

Added the setting postprocessing.partialUpdate, which can be used to switch
partial updates during postprocessing on or off. Both options should produce
the same result. The default is on (true).
This commit is contained in:
Michael Peter Christen
2014-10-17 14:17:49 +02:00
parent 11074d8d24
commit 2e5214eb21
3 changed files with 25 additions and 10 deletions
defaults
source/net/yacy/search

@ -1243,6 +1243,7 @@ greedylearning.active = true
# postprocessing steering
postprocessing.maximum_load = 2.5
postprocessing.minimum_ram = 536870912
postprocessing.partialUpdate = true
# Custom user agents for 'allip' networks:
# This user agent is only available if the network is set to 'allip' (which is a non-limited domain 'network'

@ -989,7 +989,7 @@ public final class Switchboard extends serverSwitch {
SwitchboardConstants.CLEANUP_METHOD_START,
SwitchboardConstants.CLEANUP_METHOD_JOBCOUNT,
SwitchboardConstants.CLEANUP_METHOD_FREEMEM,
60000,
30000,
Long.MAX_VALUE,
10000,
Long.MAX_VALUE),
@ -2312,7 +2312,7 @@ public final class Switchboard extends serverSwitch {
if (postprocessing) {
// run postprocessing on all profiles
ReferenceReportCache rrCache = index.getReferenceReportCache();
proccount += collection1Configuration.postprocessing(index, rrCache, null);
proccount += collection1Configuration.postprocessing(index, rrCache, null, getConfigBool("postprocessing.partialUpdate", true));
this.index.fulltext().commit(true); // without a commit the success is not visible in the monitoring
}
this.crawler.cleanProfiles(this.crawler.getActiveProfiles());
@ -2325,7 +2325,7 @@ public final class Switchboard extends serverSwitch {
if (postprocessing) {
// run postprocessing on these profiles
ReferenceReportCache rrCache = index.getReferenceReportCache();
for (String profileHash: deletionCandidates) proccount += collection1Configuration.postprocessing(index, rrCache, profileHash);
for (String profileHash: deletionCandidates) proccount += collection1Configuration.postprocessing(index, rrCache, profileHash, getConfigBool("postprocessing.partialUpdate", true));
this.index.fulltext().commit(true); // without a commit the success is not visible in the monitoring
}
this.crawler.cleanProfiles(deletionCandidates);

@ -994,7 +994,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
* @param urlCitation
* @return
*/
public int postprocessing(final Segment segment, final ReferenceReportCache rrCache, final String harvestkey) {
public int postprocessing(final Segment segment, final ReferenceReportCache rrCache, final String harvestkey, final boolean byPartialUpdate) {
if (!this.contains(CollectionSchema.process_sxt)) return 0;
if (!segment.connectedCitation() && !segment.fulltext().useWebgraph()) return 0;
final SolrConnector collectionConnector = segment.fulltext().getDefaultConnector();
@ -1262,6 +1262,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
CollectionSchema.url_protocol_s.getSolrFieldName() + " asc" // sort on protocol to get http before https; that gives an opportunity to set http_unique_b flag to false
: null, // null sort is faster!
0, 100000000, Long.MAX_VALUE, concurrency + 1, concurrency, true,
byPartialUpdate ?
new String[]{
CollectionSchema.id.getSolrFieldName(),
CollectionSchema.sku.getSolrFieldName(),
CollectionSchema.harvestkey_s.getSolrFieldName(),
@ -1279,7 +1281,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
CollectionSchema.url_protocol_s.getSolrFieldName(),
CollectionSchema.httpstatus_i.getSolrFieldName(),
CollectionSchema.inboundlinkscount_i.getSolrFieldName(),
CollectionSchema.robots_i.getSolrFieldName());
CollectionSchema.robots_i.getSolrFieldName()} :
new String[0]);
final AtomicInteger proccount = new AtomicInteger();
final AtomicInteger proccount_referencechange = new AtomicInteger();
final AtomicInteger proccount_citationchange = new AtomicInteger();
@ -1305,7 +1308,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
try {
DigestURL url = new DigestURL(u, ASCII.getBytes(i));
byte[] id = url.hash();
SolrInputDocument sid = new SolrInputDocument(); //collection.toSolrInputDocument(doc, omitFields);
SolrInputDocument sid = byPartialUpdate ? new SolrInputDocument() : collection.toSolrInputDocument(doc, omitFields);
sid.setField(CollectionSchema.id.getSolrFieldName(), i);
for (Object tag: proctags) try {
@ -1346,13 +1349,24 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
// all processing steps checked, remove the processing and harvesting key
sid.setField(CollectionSchema.process_sxt.getSolrFieldName(), null); // setting this to null will cause a removal when doing a partial update
sid.setField(CollectionSchema.harvestkey_s.getSolrFieldName(), null);
if (byPartialUpdate) {
sid.setField(CollectionSchema.process_sxt.getSolrFieldName(), null); // setting this to null will cause a removal when doing a partial update
sid.setField(CollectionSchema.harvestkey_s.getSolrFieldName(), null);
} else {
sid.removeField(CollectionSchema.process_sxt.getSolrFieldName());
sid.removeField(CollectionSchema.harvestkey_s.getSolrFieldName());
}
// with standard solr fields selected, the sid now contains the fields
// id, http_unique_b, www_unique_b, references_i, references_internal_i, references_external_i, references_exthosts_i, host_extent_i
// and the value for host_extent_i is by default 2147483647
// send back to index
//collectionConnector.deleteById(i);
collectionConnector.update(sid);
if (byPartialUpdate) {
collectionConnector.update(sid);
} else {
collectionConnector.add(sid);
}
long thiscount = proccount.incrementAndGet(); allcount.incrementAndGet();
if (thiscount % 100 == 0) {
postprocessingActivity = "postprocessed " + thiscount + " from " + count + " collection documents; " +