mirror of
https://github.com/yacy/yacy_search_server.git
synced 2025-07-21 09:04:37 -04:00
Added the setting postprocessing.partialUpdate, which can be used
to switch partial updates on or off. Both options should produce the same result. The default is on.
This commit is contained in:
@ -1243,6 +1243,7 @@ greedylearning.active = true
|
||||
# postprocessing steering
|
||||
postprocessing.maximum_load = 2.5
|
||||
postprocessing.minimum_ram = 536870912
|
||||
postprocessing.partialUpdate = true
|
||||
|
||||
# Custom user agents for 'allip' networks:
|
||||
# This user agent is only available if the network is set to 'allip' (which is a non-limited domain 'network'
|
||||
|
@ -989,7 +989,7 @@ public final class Switchboard extends serverSwitch {
|
||||
SwitchboardConstants.CLEANUP_METHOD_START,
|
||||
SwitchboardConstants.CLEANUP_METHOD_JOBCOUNT,
|
||||
SwitchboardConstants.CLEANUP_METHOD_FREEMEM,
|
||||
60000,
|
||||
30000,
|
||||
Long.MAX_VALUE,
|
||||
10000,
|
||||
Long.MAX_VALUE),
|
||||
@ -2312,7 +2312,7 @@ public final class Switchboard extends serverSwitch {
|
||||
if (postprocessing) {
|
||||
// run postprocessing on all profiles
|
||||
ReferenceReportCache rrCache = index.getReferenceReportCache();
|
||||
proccount += collection1Configuration.postprocessing(index, rrCache, null);
|
||||
proccount += collection1Configuration.postprocessing(index, rrCache, null, getConfigBool("postprocessing.partialUpdate", true));
|
||||
this.index.fulltext().commit(true); // without a commit the success is not visible in the monitoring
|
||||
}
|
||||
this.crawler.cleanProfiles(this.crawler.getActiveProfiles());
|
||||
@ -2325,7 +2325,7 @@ public final class Switchboard extends serverSwitch {
|
||||
if (postprocessing) {
|
||||
// run postprocessing on these profiles
|
||||
ReferenceReportCache rrCache = index.getReferenceReportCache();
|
||||
for (String profileHash: deletionCandidates) proccount += collection1Configuration.postprocessing(index, rrCache, profileHash);
|
||||
for (String profileHash: deletionCandidates) proccount += collection1Configuration.postprocessing(index, rrCache, profileHash, getConfigBool("postprocessing.partialUpdate", true));
|
||||
this.index.fulltext().commit(true); // without a commit the success is not visible in the monitoring
|
||||
}
|
||||
this.crawler.cleanProfiles(deletionCandidates);
|
||||
|
@ -994,7 +994,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
||||
* @param urlCitation
|
||||
* @return
|
||||
*/
|
||||
public int postprocessing(final Segment segment, final ReferenceReportCache rrCache, final String harvestkey) {
|
||||
public int postprocessing(final Segment segment, final ReferenceReportCache rrCache, final String harvestkey, final boolean byPartialUpdate) {
|
||||
if (!this.contains(CollectionSchema.process_sxt)) return 0;
|
||||
if (!segment.connectedCitation() && !segment.fulltext().useWebgraph()) return 0;
|
||||
final SolrConnector collectionConnector = segment.fulltext().getDefaultConnector();
|
||||
@ -1262,6 +1262,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
||||
CollectionSchema.url_protocol_s.getSolrFieldName() + " asc" // sort on protocol to get http before https; that gives an opportunity to set http_unique_b flag to false
|
||||
: null, // null sort is faster!
|
||||
0, 100000000, Long.MAX_VALUE, concurrency + 1, concurrency, true,
|
||||
byPartialUpdate ?
|
||||
new String[]{
|
||||
CollectionSchema.id.getSolrFieldName(),
|
||||
CollectionSchema.sku.getSolrFieldName(),
|
||||
CollectionSchema.harvestkey_s.getSolrFieldName(),
|
||||
@ -1279,7 +1281,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
||||
CollectionSchema.url_protocol_s.getSolrFieldName(),
|
||||
CollectionSchema.httpstatus_i.getSolrFieldName(),
|
||||
CollectionSchema.inboundlinkscount_i.getSolrFieldName(),
|
||||
CollectionSchema.robots_i.getSolrFieldName());
|
||||
CollectionSchema.robots_i.getSolrFieldName()} :
|
||||
new String[0]);
|
||||
final AtomicInteger proccount = new AtomicInteger();
|
||||
final AtomicInteger proccount_referencechange = new AtomicInteger();
|
||||
final AtomicInteger proccount_citationchange = new AtomicInteger();
|
||||
@ -1305,7 +1308,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
||||
try {
|
||||
DigestURL url = new DigestURL(u, ASCII.getBytes(i));
|
||||
byte[] id = url.hash();
|
||||
SolrInputDocument sid = new SolrInputDocument(); //collection.toSolrInputDocument(doc, omitFields);
|
||||
SolrInputDocument sid = byPartialUpdate ? new SolrInputDocument() : collection.toSolrInputDocument(doc, omitFields);
|
||||
sid.setField(CollectionSchema.id.getSolrFieldName(), i);
|
||||
for (Object tag: proctags) try {
|
||||
|
||||
@ -1346,13 +1349,24 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
||||
}
|
||||
|
||||
// all processing steps checked, remove the processing and harvesting key
|
||||
sid.setField(CollectionSchema.process_sxt.getSolrFieldName(), null); // setting this to null will cause a removal when doing a partial update
|
||||
sid.setField(CollectionSchema.harvestkey_s.getSolrFieldName(), null);
|
||||
if (byPartialUpdate) {
|
||||
sid.setField(CollectionSchema.process_sxt.getSolrFieldName(), null); // setting this to null will cause a removal when doing a partial update
|
||||
sid.setField(CollectionSchema.harvestkey_s.getSolrFieldName(), null);
|
||||
} else {
|
||||
sid.removeField(CollectionSchema.process_sxt.getSolrFieldName());
|
||||
sid.removeField(CollectionSchema.harvestkey_s.getSolrFieldName());
|
||||
}
|
||||
// with standard solr fields selected, the sid now contains the fields
|
||||
// id, http_unique_b, www_unique_b, references_i, references_internal_i, references_external_i, references_exthosts_i, host_extent_i
|
||||
// and the value for host_extent_i is by default 2147483647
|
||||
|
||||
// send back to index
|
||||
//collectionConnector.deleteById(i);
|
||||
collectionConnector.update(sid);
|
||||
|
||||
if (byPartialUpdate) {
|
||||
collectionConnector.update(sid);
|
||||
} else {
|
||||
collectionConnector.add(sid);
|
||||
}
|
||||
long thiscount = proccount.incrementAndGet(); allcount.incrementAndGet();
|
||||
if (thiscount % 100 == 0) {
|
||||
postprocessingActivity = "postprocessed " + thiscount + " from " + count + " collection documents; " +
|
||||
|
Reference in New Issue
Block a user