/**
 * RecrawlBusyThread.java
 * SPDX-FileCopyrightText: 2015 by Burkhard Buelte
 * SPDX-License-Identifier: GPL-2.0-or-later
 * First released 15.05.2015 at https://yacy.net
 *
 * This is a part of YaCy, a peer-to-peer based web search engine
 *
 * LICENSE
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2.1 of the License, or (at your option)
 * any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program in the file lgpl21.txt. If not, see
 * <http://www.gnu.org/licenses/>.
 */

package net.yacy.crawler;

import java.io.IOException;
import java.net.MalformedURLException;
import java.time.LocalDateTime;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;

import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;

import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.Request;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema;

/**
 * Selects documents by a query from the local index
 * and feeds the found urls to the crawler to recrawl the documents.
 * This is intended to keep the index up-to-date.
 * Currently the documents are selected by an expired fresh_date_dt field
 * and added to the crawler in smaller chunks (see chunksize) as long as no other crawl is running.
 */
public class RecrawlBusyThread extends AbstractBusyThread {
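
    /*
     * Usage sketch (for illustration only; assumes a fully initialized
     * Switchboard instance named "sb"):
     *
     *   final RecrawlBusyThread recrawlJob = new RecrawlBusyThread(
     *           sb,
     *           RecrawlBusyThread.DEFAULT_QUERY,
     *           RecrawlBusyThread.DEFAULT_INCLUDE_FAILED,
     *           RecrawlBusyThread.DEFAULT_DELETE_ON_RECRAWL);
     *   recrawlJob.start(); // runs job() periodically until the job terminates itself
     */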

    /** The thread name */
    public final static String THREAD_NAME = "recrawlindex";

    /** The default selection query */
    public static final String DEFAULT_QUERY = CollectionSchema.fresh_date_dt.getSolrFieldName() + ":[* TO NOW/DAY-1DAY]";
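
    // Note on the default query: Solr date math in NOW/DAY-1DAY rounds "now" down to the start
    // of the current day and subtracts one day, so the range [* TO NOW/DAY-1DAY] matches all
    // documents whose fresh_date_dt lies at or before the start of yesterday, i.e. expired documents.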

    /** Default value for inclusion or not of documents with an HTTP status different from 200 (success) */
    public static final boolean DEFAULT_INCLUDE_FAILED = false;

    /** The default value whether to delete on Recrawl */
    public static final boolean DEFAULT_DELETE_ON_RECRAWL = false;

    /** The current query selecting documents to recrawl */
    private String currentQuery;

    /** flag if docs with httpstatus_i <> 200 shall be recrawled */
    private boolean includefailed;

    /** flag whether to delete on Recrawl */
    private boolean deleteOnRecrawl;

    /** Offset (start index) of the next chunk within the Solr selection results */
    private int chunkstart = 0;

    /** Maximum number of URLs fetched from the index per query step */
    private final int chunksize = 100;

    /** The Switchboard instance holding the server environment */
    private final Switchboard sb;

    /** buffer of urls to recrawl */
    private final Set<DigestURL> urlstack;

    /** The total number of candidate URLs found for recrawl */
    private long urlsToRecrawl = 0;

    /** Total number of URLs added to the crawler queue for recrawl */
    private long recrawledUrlsCount = 0;

    /** Total number of URLs rejected for some reason by the crawl stacker or the crawler queue */
    private long rejectedUrlsCount = 0;

    /** Total number of malformed URLs found */
    private long malformedUrlsCount = 0;

    /** Total number of malformed URLs deleted from index */
    private long malformedUrlsDeletedCount = 0;

    /** Sort clause for the Solr selection query */
    private final String solrSortBy;

    /** Set to true when more URLs are still to be processed */
    private boolean moreToRecrawl = true;

    /** True when the job terminated early because an error occurred when requesting the Solr index, or the Solr index was closed */
    private boolean terminatedBySolrFailure = false;

    /** The recrawl job start time */
    private LocalDateTime startTime;

    /** The recrawl job end time */
    private LocalDateTime endTime;

    /**
     * @param xsb
     *            the Switchboard instance holding the server environment
     * @param query
     *            the Solr selection query
     * @param includeFailed
     *            set to true when documents with an HTTP status different from
     *            200 (success) must be included
     * @param deleteOnRecrawl
     *            set to true to delete each selected document from the local
     *            index once it has been pushed to the crawler
     */
    public RecrawlBusyThread(final Switchboard xsb, final String query, final boolean includeFailed, final boolean deleteOnRecrawl) {
        super(3000, 1000); // set lower limits of cycle delay
        this.setName(THREAD_NAME);
        this.setIdleSleep(10 * 60000); // set actual cycle delays
        this.setBusySleep(2 * 60000);
        this.setPriority(Thread.MIN_PRIORITY);
        this.setLoadPreReqisite(1);
        this.sb = xsb;
        this.currentQuery = query;
        this.includefailed = includeFailed;
        this.deleteOnRecrawl = deleteOnRecrawl;
        this.urlstack = new HashSet<>();
        // workaround to prevent solr exception on existing index (not fully reindexed) since intro of schema with docvalues
        // org.apache.solr.core.SolrCore java.lang.IllegalStateException: unexpected docvalues type NONE for field 'load_date_dt' (expected=NUMERIC). Use UninvertingReader or index with docvalues.
        this.solrSortBy = CollectionSchema.load_date_dt.getSolrFieldName() + " asc";

        final SolrConnector solrConnector = this.sb.index.fulltext().getDefaultConnector();
        if (solrConnector != null && !solrConnector.isClosed()) {
            /* Ensure indexed data is up-to-date before running the main job */
            solrConnector.commit(true);
        }
    }

    /**
     * Sets the query used to select documents to recrawl
     * and resets the chunk offset to start a fresh query loop.
     * @param q the Solr selection query
     * @param includefailedurls true = documents with any HTTP status are recrawled, false = only documents with HTTP status 200 are recrawled
     * @param deleteOnRecrawl true = delete documents from the local index once they have been stacked for recrawl
     */
    public void setQuery(String q, boolean includefailedurls, final boolean deleteOnRecrawl) {
        this.currentQuery = q;
        this.includefailed = includefailedurls;
        this.deleteOnRecrawl = deleteOnRecrawl;
        this.chunkstart = 0;
    }

    public String getQuery() {
        return this.currentQuery;
    }

    /**
     * @param queryBase
     *            the base query
     * @param includeFailed
     *            set to true when documents with an HTTP status different from
     *            200 (success) must be included
     * @return the Solr selection query for candidate URLs to recrawl
     */
    public static final String buildSelectionQuery(final String queryBase, final boolean includeFailed) {
        return includeFailed ? queryBase : queryBase + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)";
    }
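
    /*
     * For illustration: with the defaults above,
     *     buildSelectionQuery(DEFAULT_QUERY, false)
     * yields the Solr query
     *     fresh_date_dt:[* TO NOW/DAY-1DAY] AND (httpstatus_i:200)
     * so only successfully loaded documents whose freshness date has expired are selected.
     */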

    /**
     * Flag to include failed urls (httpstatus_i <> 200):
     * if true -> currentQuery is used as is,
     * if false -> the term " AND (httpstatus_i:200)" is appended to currentQuery
     * @param includefailedurls
     */
    public void setIncludeFailed(boolean includefailedurls) {
        this.includefailed = includefailedurls;
    }

    public boolean getIncludeFailed() {
        return this.includefailed;
    }

    public void setDeleteOnRecrawl(final boolean deleteOnRecrawl) {
        this.deleteOnRecrawl = deleteOnRecrawl;
    }

    public boolean getDeleteOnRecrawl() {
        return this.deleteOnRecrawl;
    }

    /**
     * Feeds urls to the local crawler
     * (Switchboard.addToCrawler() is not used here, as existing urls are always skipped there)
     *
     * @return true if urls were added/accepted to the crawler
     */
    private boolean feedToCrawler() {
        int added = 0;

        if (!this.urlstack.isEmpty()) {
            final CrawlProfile profile = this.sb.crawler.defaultRecrawlJobProfile;

            for (final DigestURL url : this.urlstack) {
                final Request request = new Request(ASCII.getBytes(this.sb.peers.mySeed().hash), url, null, "",
                        new Date(), profile.handle(), 0, profile.timezoneOffset());
                String acceptedError = this.sb.crawlStacker.checkAcceptanceChangeable(url, profile, 0);
                if (!this.includefailed && acceptedError == null) { // skip check if failed docs to be included
                    acceptedError = this.sb.crawlStacker.checkAcceptanceInitially(url, profile);
                }
                if (acceptedError != null) {
                    this.rejectedUrlsCount++;
                    ConcurrentLog.info(THREAD_NAME, "addToCrawler: cannot load " + url.toNormalform(true) + " : " + acceptedError);
                    continue;
                }
                final String s = this.sb.crawlQueues.noticeURL.push(NoticedURL.StackType.LOCAL, request, profile, this.sb.robots);

                if (s != null) {
                    this.rejectedUrlsCount++;
                    ConcurrentLog.info(THREAD_NAME, "addToCrawler: failed to add " + url.toNormalform(true) + " : " + s);
                } else {
                    added++;
                    this.recrawledUrlsCount++;
                }
            }
            this.urlstack.clear();
        }
        return (added > 0);
    }

    /**
     * Process query and hand over urls to the crawler
     *
     * @return true if something was processed
     */
    @Override
    public boolean job() {

        // more than chunksize crawls are running, do nothing
        if (this.sb.crawlQueues.coreCrawlJobSize() > this.chunksize) {
            return false;
        }

        boolean didSomething = false;
        if (this.urlstack.isEmpty()) {
            if (!this.moreToRecrawl) {
                /* We do not remove the thread from the Switchboard worker threads using serverSwitch.terminateThread(String, boolean),
                 * because we want to be able to provide a report after its termination */
                this.terminate(false);
            } else {
                this.moreToRecrawl = this.processSingleQuery();
                /* Even if no more URLs are to recrawl, the job has done something by searching the Solr index */
                didSomething = true;
            }
        } else {
            didSomething = this.feedToCrawler();
        }
        return didSomething;
    }

    @Override
    public synchronized void start() {
        this.startTime = LocalDateTime.now();
        super.start();
    }

    @Override
    public void terminate(boolean waitFor) {
        super.terminate(waitFor);
        this.endTime = LocalDateTime.now();
    }

    /**
     * Selects the next chunk of documents to recrawl and fills the url stack with their urls
     * @return true if the query has more results
     */
    private boolean processSingleQuery() {
        if (!this.urlstack.isEmpty()) {
            return true;
        }
        SolrDocumentList docList = null;
        final SolrConnector solrConnector = this.sb.index.fulltext().getDefaultConnector();
        if (solrConnector == null || solrConnector.isClosed()) {
            this.urlsToRecrawl = 0;
            this.terminatedBySolrFailure = true;
            return false;
        }

        try {
            // query all or only httpstatus=200 depending on includefailed flag
            docList = solrConnector.getDocumentListByQuery(RecrawlBusyThread.buildSelectionQuery(this.currentQuery, this.includefailed),
                    this.solrSortBy, this.chunkstart, this.chunksize, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName());
            this.urlsToRecrawl = docList.getNumFound();
        } catch (final Throwable e) {
            this.urlsToRecrawl = 0;
            this.terminatedBySolrFailure = true;
        }

        if (docList != null) {
            final Set<String> tobedeletedIDs = new HashSet<>();
            for (final SolrDocument doc : docList) {
                try {
                    this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())));
                    if (this.deleteOnRecrawl) tobedeletedIDs.add((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
                } catch (final MalformedURLException ex) {
                    this.malformedUrlsCount++;
                    // if index entry hasn't a valid url (useless), delete it
                    tobedeletedIDs.add((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
                    this.malformedUrlsDeletedCount++;
                    ConcurrentLog.severe(THREAD_NAME, "deleted index document with invalid url " + (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
                }
            }

            if (!tobedeletedIDs.isEmpty()) try {
                solrConnector.deleteByIds(tobedeletedIDs);
                solrConnector.commit(false);
            } catch (final IOException e) {
                ConcurrentLog.severe(THREAD_NAME, "error deleting IDs ", e);
            }
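
            // Advance the selection window: when deleteOnRecrawl is set, the processed documents
            // have just been removed from the index and the remaining results shift down, so the
            // next chunk starts again at offset 0; otherwise the offset moves forward by one chunk.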
            this.chunkstart = this.deleteOnRecrawl ? 0 : this.chunkstart + this.chunksize;
        }

        if (docList == null || docList.size() < this.chunksize) {
            return false;
        }
        return true;
    }
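
    /*
     * Paging example: with chunksize == 100 and deleteOnRecrawl == false, successive calls to
     * processSingleQuery() fetch result offsets 0, 100, 200, ... of the load_date_dt-ordered
     * selection; paging ends as soon as a chunk comes back with fewer than 100 documents.
     */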

    /**
     * @return a new default CrawlProfile instance to be used for recrawl jobs.
     */
    public static CrawlProfile buildDefaultCrawlProfile() {
        final CrawlProfile profile = new CrawlProfile(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB, CrawlProfile.MATCH_ALL_STRING, // crawlerUrlMustMatch
                CrawlProfile.MATCH_NEVER_STRING, // crawlerUrlMustNotMatch
                CrawlProfile.MATCH_ALL_STRING, // crawlerIpMustMatch
                CrawlProfile.MATCH_NEVER_STRING, // crawlerIpMustNotMatch
                CrawlProfile.MATCH_NEVER_STRING, // crawlerCountryMustMatch
                CrawlProfile.MATCH_NEVER_STRING, // crawlerNoDepthLimitMatch
                CrawlProfile.MATCH_ALL_STRING, // indexUrlMustMatch
                CrawlProfile.MATCH_NEVER_STRING, // indexUrlMustNotMatch
                CrawlProfile.MATCH_ALL_STRING, // indexContentMustMatch
                CrawlProfile.MATCH_NEVER_STRING, // indexContentMustNotMatch
                true, // noindexWhenCanonicalUnequalURL
                0, false, CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE), -1,
                true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH,
                "robot_" + CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB,
                ClientIdentification.yacyInternetCrawlerAgentName,
                TagValency.EVAL, null, null, 0);
        return profile;
    }

    @Override
    public int getJobCount() {
        return this.urlstack.size();
    }

    /**
     * @return The total number of candidate URLs found for recrawl
     */
    public long getUrlsToRecrawl() {
        return this.urlsToRecrawl;
    }

    /**
     * @return The total number of URLs added to the crawler queue for recrawl
     */
    public long getRecrawledUrlsCount() {
        return this.recrawledUrlsCount;
    }

    /**
     * @return The total number of URLs rejected for some reason by the crawl
     *         stacker or the crawler queue
     */
    public long getRejectedUrlsCount() {
        return this.rejectedUrlsCount;
    }

    /**
     * @return The total number of malformed URLs found
     */
    public long getMalformedUrlsCount() {
        return this.malformedUrlsCount;
    }

    /**
     * @return The total number of malformed URLs deleted from index
     */
    public long getMalformedUrlsDeletedCount() {
        return this.malformedUrlsDeletedCount;
    }

    /**
     * @return true when the job terminated early because an error occurred when
     *         requesting the Solr index, or the Solr index was closed
     */
    public boolean isTerminatedBySolrFailure() {
        return this.terminatedBySolrFailure;
    }

    /** @return The recrawl job start time */
    public LocalDateTime getStartTime() {
        return this.startTime;
    }

    /** @return The recrawl job end time */
    public LocalDateTime getEndTime() {
        return this.endTime;
    }

    @Override
    public void freemem() {
        this.urlstack.clear();
    }
}