// CrawlStacker.java
// -----------------------
// part of YaCy
// SPDX-FileCopyrightText: 2005 Michael Peter Christen <mc@yacy.net>
// SPDX-License-Identifier: GPL-2.0-or-later
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
//
// This file was contributed by Martin Thelian
// ([MC] removed all multithreading and thread pools, this is not necessary here; complete renovation 2007)
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.crawler;

import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;

import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.document.TextParser;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.kelondro.workflow.WorkflowTask;
import net.yacy.peers.SeedDB;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.repository.FilterEngine;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment;

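/**
 * The CrawlStacker checks every new crawl request before it is enqueued into the
 * crawler balancer: double-occurrence check, domain and protocol acceptance, crawl
 * profile filters and robots.txt handling. Requests that are rejected for a reason
 * other than a double occurrence are recorded in the error URL database together
 * with the rejection reason.
 */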
public final class CrawlStacker implements WorkflowTask<Request> {

    public static String ERROR_NO_MATCH_MUST_MATCH_FILTER = "url does not match must-match filter ";
    public static String ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER = "url matches must-not-match filter ";

    /** Crawl reject reason prefix having specific processing */
    public static final String CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX = "double in";

    private final static ConcurrentLog log = new ConcurrentLog("STACKCRAWL");

    private final RobotsTxt robots;
    private final WorkflowProcessor<Request> requestQueue;
    public final CrawlQueues nextQueue;
    private final CrawlSwitchboard crawler;
    private final Segment indexSegment;
    private final SeedDB peers;
    private final boolean acceptLocalURLs, acceptGlobalURLs;
    private final FilterEngine domainList;

    // this is the process that checks urls for double occurrences and for allowance/disallowance by robots.txt

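    /**
     * Creates a new crawl stacker and its workflow queue.
     * @param robots robots.txt handler passed along when entries are pushed to the crawl queues
     * @param cq the crawl queues that receive accepted entries
     * @param cs the crawl switchboard holding the crawl profiles
     * @param indexSegment the local index segment, used for re-crawl and replace checks
     * @param peers the seed database of known peers
     * @param acceptLocalURLs true when URLs from the local (intranet) domain are accepted
     * @param acceptGlobalURLs true when URLs from the global (internet) domain are accepted
     * @param domainList optional domain filter from the network definition, may be null
     */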
    public CrawlStacker(
            final RobotsTxt robots,
            final CrawlQueues cq,
            final CrawlSwitchboard cs,
            final Segment indexSegment,
            final SeedDB peers,
            final boolean acceptLocalURLs,
            final boolean acceptGlobalURLs,
            final FilterEngine domainList) {
        this.robots = robots;
        this.nextQueue = cq;
        this.crawler = cs;
        this.indexSegment = indexSegment;
        this.peers = peers;
        this.acceptLocalURLs = acceptLocalURLs;
        this.acceptGlobalURLs = acceptGlobalURLs;
        this.domainList = domainList;
        this.requestQueue = new WorkflowProcessor<>("CrawlStacker", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, 10000, null, WorkflowProcessor.availableCPU);
        CrawlStacker.log.info("STACKCRAWL thread initialized.");
    }

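    /**
     * @return the number of entries currently waiting in the stacker queue
     */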
    public int size() {
        return this.requestQueue.getQueueSize();
    }

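    /**
     * @return true if the stacker queue contains no waiting entries
     */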
    public boolean isEmpty() {
        return this.requestQueue.queueIsEmpty();
    }

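    /**
     * Remove all entries from the stacker queue.
     */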
    public void clear() {
        this.requestQueue.clear();
    }

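    /**
     * Announce the shutdown: the workflow processor stops after the remaining entries are flushed.
     */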
    public void announceClose() {
        CrawlStacker.log.info("Flushing remaining " + this.size() + " crawl stacker job entries.");
        this.requestQueue.shutdown();
    }

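    /**
     * Shut down the request queue and wait up to ten seconds for it to drain before clearing it.
     */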
    public synchronized void close() {
        CrawlStacker.log.info("Shutdown. waiting for remaining " + this.size() + " crawl stacker job entries. please wait.");
        this.requestQueue.shutdown();

        // busy waiting for the queue to empty
        for (int i = 0; i < 10; i++) {
            if (this.size() <= 0) break;
            try {Thread.sleep(1000);} catch (InterruptedException e) {}
        }

        CrawlStacker.log.info("Shutdown. Closing stackCrawl queue.");

        this.clear();
    }

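    /**
     * Workflow callback: check and stack one crawl request. If the request is rejected
     * for a reason other than a double occurrence, it is pushed to the error URL database.
     * @param entry the crawl request to stack, may be null
     * @return always null
     */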
    @Override
    public Request process(final Request entry) {
        // this is the method that is called by the busy thread from outside
        if (entry == null) return null;

        try {
            final String rejectReason = this.stackCrawl(entry);

            // if the url was rejected we store it into the error URL db
            if (rejectReason != null && !rejectReason.startsWith(CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX)) {
                final CrawlProfile profile = this.crawler.get(UTF8.getBytes(entry.profileHandle()));
                this.nextQueue.errorURL.push(entry.url(), entry.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
            }
        } catch (final Exception e) {
            CrawlStacker.log.warn("Error while processing stackCrawl entry.\n" + "Entry: " + entry.toString() + "Error: " + e.toString(), e);
            return null;
        }
        return null;
    }

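    /**
     * Put one crawl request on the stacker queue for asynchronous processing.
     * @param entry the crawl request to enqueue
     */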
    public void enqueueEntry(final Request entry) {

        // DEBUG
        if (CrawlStacker.log.isFinest()) CrawlStacker.log.finest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + ((entry.initiator() == null) ? "" : ASCII.String(entry.initiator())) + ", name=" + entry.name() + ", appdate=" + entry.appdate() + ", depth=" + entry.depth());
        this.requestQueue.enQueue(entry);
    }

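    /**
     * Enqueue crawl start entries in a separate thread, replacing already indexed entries.
     * @param initiator Hash of the peer initiating the crawl
     * @param profileHandle name of the active crawl profile
     * @param hyperlinks crawl starting points links to stack
     * @param timezoneOffset local time-zone offset
     */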
    public void enqueueEntriesAsynchronous(
            final byte[] initiator,
            final String profileHandle,
            final List<AnchorURL> hyperlinks,
            final int timezoneOffset) {
        new Thread("enqueueEntriesAsynchronous") {
            @Override
            public void run() {
                CrawlStacker.this.enqueueEntries(initiator, profileHandle, hyperlinks, true, timezoneOffset);
            }
        }.start();
    }

    /**
     * Enqueue crawl start entries
     * @param initiator Hash of the peer initiating the crawl
     * @param profileHandle name of the active crawl profile
     * @param hyperlinks crawl starting points links to stack
     * @param replace Specify whether old indexed entries should be replaced
     * @param timezoneOffset local time-zone offset
     * @throws IllegalCrawlProfileException when the crawl profile is not active
     */
    public void enqueueEntries(
            final byte[] initiator,
            final String profileHandle,
            final List<AnchorURL> hyperlinks,
            final boolean replace,
            final int timezoneOffset) {
        /* Let's check if the profile is still active before removing any existing entry */
        final byte[] handle = UTF8.getBytes(profileHandle);
        final CrawlProfile profile = this.crawler.get(handle);
        if (profile == null) {
            String error;
            if (hyperlinks.size() == 1) {
                error = "Rejected URL : " + hyperlinks.get(0).toNormalform(false) + ". Reason : LOST STACKER PROFILE HANDLE '" + profileHandle + "'";
            } else {
                error = "Rejected " + hyperlinks.size() + " crawl entries. Reason : LOST STACKER PROFILE HANDLE '" + profileHandle + "'";
            }
            CrawlStacker.log.info(error); // this is NOT an error but a normal behavior when terminating a crawl queue
            /* Throw an exception to signal caller it can stop stacking URLs using this crawl profile */
            throw new IllegalCrawlProfileException("Profile " + profileHandle + " is no longer active");
        }
        if (replace) {
            // delete old entries, if they exist, to force a re-load of the url (that's wanted here)
            final Set<String> hosthashes = new HashSet<>();
            for (final AnchorURL url : hyperlinks) {
                if (url == null) continue;
                hosthashes.add(url.hosthash());
            }
            this.nextQueue.errorURL.removeHosts(hosthashes);
        }
        for (final AnchorURL url : hyperlinks) {
            if (url == null) continue;

            // delete old entry, if it exists, to force a re-load of the url (that's wanted here)
            final byte[] urlhash = url.hash();
            if (replace) {
                this.indexSegment.fulltext().remove(urlhash);
                String u = url.toNormalform(true);
                if (u.endsWith("/")) {
                    u = u + "index.html";
                } else if (!u.contains(".")) {
                    u = u + "/index.html";
                }
                try {
                    final byte[] uh = new DigestURL(u).hash();
                    this.indexSegment.fulltext().remove(uh);
                    this.nextQueue.noticeURL.removeByURLHash(uh);
                } catch (final MalformedURLException e1) {}
            }

            if (url.getProtocol().equals("ftp")) {
                /* put ftp site entries on the crawl stack,
                 * using the crawl profile depth to control how many child folders of the url are stacked */
                this.enqueueEntriesFTP(initiator, profile, url, replace, timezoneOffset);
            } else {
                // put entry on crawl stack
                this.enqueueEntry(new Request(
                        initiator,
                        url,
                        null,
                        url.getNameProperty(),
                        new Date(),
                        profileHandle,
                        0,
                        timezoneOffset
                        ));
            }
        }
    }

    /**
     * Asynchronously enqueue crawl start entries for a ftp url.
     * @param initiator Hash of the peer initiating the crawl
     * @param profile the active crawl profile
     * @param ftpURL crawl start point URL: protocol must be ftp
     * @param replace Specify whether old indexed entries should be replaced
     * @param timezoneOffset local time-zone offset
     */
    public void enqueueEntriesFTP(
            final byte[] initiator,
            final CrawlProfile profile,
            final DigestURL ftpURL,
            final boolean replace,
            final int timezoneOffset) {
        final CrawlQueues cq = this.nextQueue;
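        // extract optional "user:password" credentials from the URL; without credentials the
        // connection uses anonymous FTP with the placeholder password "anomic"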
        final String userInfo = ftpURL.getUserInfo();
        final int p = userInfo == null ? -1 : userInfo.indexOf(':');
        final String user = userInfo == null ? FTPClient.ANONYMOUS : (p == -1 ? userInfo : userInfo.substring(0, p));
        final String pw = userInfo == null || p == -1 ? "anomic" : userInfo.substring(p + 1);
        final String host = ftpURL.getHost();
        final int port = ftpURL.getPort();
        final int pathParts = ftpURL.getPaths().length;
        new Thread("enqueueEntriesFTP") {
            @Override
            public void run() {
                BlockingQueue<FTPClient.entryInfo> queue;
                try {
                    queue = FTPClient.sitelist(host, port, user, pw, ftpURL.getPath(), profile.depth());
                    FTPClient.entryInfo entry;
                    while ((entry = queue.take()) != FTPClient.POISON_entryInfo) {

                        // delete old entry, if it exists, to force a re-load of the url (that's wanted here)
                        DigestURL url = null;
                        try {
                            url = new DigestURL("ftp://" + user + ":" + pw + "@" + host + (port == 21 ? "" : ":" + port) + MultiProtocolURL.escape(entry.name));
                        } catch (final MalformedURLException e) {
                            continue;
                        }
                        final byte[] urlhash = url.hash();
                        if (replace) {
                            CrawlStacker.this.indexSegment.fulltext().remove(urlhash);
                            cq.noticeURL.removeByURLHash(urlhash);
                        }

                        /* Each entry is a child resource of the starting ftp URL:
                         * take into account the sub folder depth in the crawl depth control */
                        final int nextDepth = Math.max(0, url.getPaths().length - pathParts);

                        // put entry on crawl stack
                        CrawlStacker.this.enqueueEntry(new Request(
                                initiator,
                                url,
                                null,
                                MultiProtocolURL.unescape(entry.name),
                                entry.date,
                                profile.handle(),
                                nextDepth,
                                timezoneOffset));
                    }
                } catch (final IOException e1) {
                    ConcurrentLog.logException(e1);
                } catch (final InterruptedException e) {
                }
            }
        }.start();
    }

    /**
     * simple method to add one url as crawljob
     * @param url
     * @return null if successful, a reason string if not successful
     */
    public String stackSimpleCrawl(final DigestURL url) {
        final CrawlProfile pe = this.crawler.defaultSurrogateProfile;
        return this.stackCrawl(new Request(
                this.peers.mySeed().hash.getBytes(),
                url,
                null,
                "CRAWLING-ROOT",
                new Date(),
                pe.handle(),
                0, 0));
    }

    /**
     * stacks a crawl item. The position can also be remote
     * @param entry
     * @return null if successful, a reason string if not successful
     */
    public String stackCrawl(final Request entry) {
        //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");

        final byte[] handle = UTF8.getBytes(entry.profileHandle());
        final CrawlProfile profile = this.crawler.get(handle);
        String error;
        if (profile == null) {
            error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url().toNormalform(true);
            CrawlStacker.log.info(error); // this is NOT an error but a normal effect when terminating a crawl queue
            return error;
        }

        error = this.checkAcceptanceChangeable(entry.url(), profile, entry.depth());
        if (error != null) return error;
        error = this.checkAcceptanceInitially(entry.url(), profile);
        if (error != null) return error;

        // store information
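        // classify the request: local (initiated by this peer), proxy (collected by the proxy profile),
        // remote (received from another peer) or global (eligible for delegation to remote peers)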
        final boolean local = Base64Order.enhancedCoder.equal(entry.initiator(), UTF8.getBytes(this.peers.mySeed().hash));
        final boolean proxy = (entry.initiator() == null || entry.initiator().length == 0 || ASCII.String(entry.initiator()).equals("------------")) && profile.handle().equals(this.crawler.defaultProxyProfile.handle());
        final boolean remote = profile.handle().equals(this.crawler.defaultRemoteProfile.handle());
        final boolean global =
                (profile.remoteIndexing()) /* granted */ &&
                (entry.depth() == profile.depth()) /* leaf node */ &&
                //(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ &&
                (
                        (this.peers.mySeed().isSenior()) ||
                        (this.peers.mySeed().isPrincipal())
                ) /* qualified */;

        if (!local && !global && !remote && !proxy) {
            error = "URL '" + entry.url().toString() + "' cannot be crawled. initiator = " + ((entry.initiator() == null) ? "" : ASCII.String(entry.initiator())) + ", profile.handle = " + profile.handle();
            CrawlStacker.log.severe(error);
            return error;
        }

        String warning = null;
        if (!profile.isCrawlerAlwaysCheckMediaType() && TextParser.supportsExtension(entry.url()) != null) {
            if (profile.isIndexNonParseableUrls()) {
                /* Unsupported file extension and no cross-checking of Media Type : add immediately to the noload stack to index only URL metadata */
                warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots);
                if (warning != null && CrawlStacker.log.isFine()) {
                    CrawlStacker.log.fine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true) + " - not pushed to " + NoticedURL.StackType.NOLOAD + " stack : " + warning);
                }
                return null;
            }

            error = "URL '" + entry.url().toString() + "' file extension is not supported and indexing of linked non-parsable documents is disabled.";
            CrawlStacker.log.info(error);
            return error;
        }

        if (global) {
            // it may be possible that global == true and local == true, so do not check an error case against it
            if (proxy) CrawlStacker.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
            if (remote) CrawlStacker.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.GLOBAL, entry, profile, this.robots);
        } else if (local) {
            if (proxy) CrawlStacker.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
            if (remote) CrawlStacker.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, profile, this.robots);
        } else if (proxy) {
            if (remote) CrawlStacker.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, profile, this.robots);
        } else if (remote) {
            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry, profile, this.robots);
        }
        if (warning != null && CrawlStacker.log.isFine()) CrawlStacker.log.fine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true) + " - not pushed: " + warning);

        return null;
    }

    /**
     * Test if a url shall be accepted for crawl using attributes that are consistent for the whole crawl.
     * These tests are incomplete and must be followed by a checkAcceptanceChangeable test.
     * @param url
     * @param profile
     * @return null if the url is accepted, otherwise an error string describing why it is not accepted
     */
    public String checkAcceptanceInitially(final DigestURL url, final CrawlProfile profile) {

        // check if the url is double registered
        final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists
        if (dbocc != null) {
            return CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX + ": " + dbocc.name();
        }
        final String urls = url.toNormalform(false);
        final long oldDate = this.indexSegment.getLoadTime(url.hash());

        // deny urls that exceed allowed number of occurrences
        final int maxAllowedPagesPerDomain = profile.domMaxPages();
        if (maxAllowedPagesPerDomain < Integer.MAX_VALUE && maxAllowedPagesPerDomain > 0) {
            final AtomicInteger dp = profile.getCount(url.getHost());
            if (dp != null && dp.get() >= maxAllowedPagesPerDomain) {
                if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + urls + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
                return "crawl stack domain counter exceeded (test by profile)";
            }

            /*
            if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= maxAllowedPagesPerDomain) {
                if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' appeared too often in result stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
                return "result stack domain counter exceeded (test by domainCount)";
            }
            */
        }

        //final Long oldDate = oldEntry == null ? null : oldEntry.date;
        if (oldDate < 0) {
            return null; // no evidence that we know that url
        }
        final boolean recrawl = profile.recrawlIfOlder() > oldDate;
        final String urlstring = url.toNormalform(false);
        if (recrawl) {
            if (CrawlStacker.log.isFine())
                CrawlStacker.log.fine("RE-CRAWL of URL '" + urlstring + "': this url was crawled " +
                        ((System.currentTimeMillis() - oldDate) / 60000 / 60 / 24) + " days ago.");
        } else {
            return CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX + ": local index, recrawl rejected. Document date = "
                    + ISO8601Formatter.FORMATTER.format(new Date(oldDate)) + " is not older than crawl profile recrawl minimum date = "
                    + ISO8601Formatter.FORMATTER.format(new Date(profile.recrawlIfOlder()));
        }

        return null;
    }

    /**
     * Test if a url shall be accepted using attributes that are defined by a crawl start but can be changed during a crawl.
     * @param url
     * @param profile
     * @param depth
     * @return null if the url is accepted, otherwise an error string describing why it is not accepted
     */
    public String checkAcceptanceChangeable(final DigestURL url, final CrawlProfile profile, final int depth) {
        // check if the protocol is supported
        final String urlProtocol = url.getProtocol();
        final String urlstring = url.toNormalform(true);
        if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
            CrawlStacker.log.severe("Unsupported protocol in URL '" + urlstring + "'.");
            return "unsupported protocol";
        }

        // check if ip is local ip address
        final String urlRejectReason = this.urlInAcceptedDomain(url);
        if (urlRejectReason != null) {
            if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL not in accepted Domain (" + urlRejectReason + ")");
            return "denied_(" + urlRejectReason + ")";
        }

        // check blacklist
        if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, url)) {
            CrawlStacker.log.fine("URL '" + urlstring + "' is in blacklist.");
            return "url in blacklist";
        }

        // filter with must-match for URLs
        if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
            final String patternStr = profile.formattedUrlMustMatchPattern();
            if (CrawlStacker.log.isFine()) {
                CrawlStacker.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + patternStr + "'.");
            }
            return ERROR_NO_MATCH_MUST_MATCH_FILTER + patternStr;
        }

        // filter with must-not-match for URLs
        if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
            if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
            return ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER + profile.urlMustNotMatchPattern().toString();
        }

        // deny cgi
        if (url.isIndividual() && !profile.crawlingQ()) { // TODO: make special property for crawlingIndividual
            if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + urlstring + "' is CGI URL.");
            return "individual url (sessionid etc) not wanted";
        }

        // deny post properties
        if (url.isPOST() && !profile.crawlingQ()) {
            if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + urlstring + "' is post URL.");
            return "post url not allowed";
        }

        // the following filters use a DNS lookup to check if the url matches with IP filter
        // this is expensive and those filters are checked at the end of all other tests
        // filter with must-match for IPs
        if ((depth > 0) && profile.ipMustMatchPattern() != CrawlProfile.MATCH_ALL_PATTERN && url.getHost() != null && !profile.ipMustMatchPattern().matcher(url.getInetAddress().getHostAddress()).matches()) {
            if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("IP " + url.getInetAddress().getHostAddress() + " of URL '" + urlstring + "' does not match must-match crawling filter '" + profile.ipMustMatchPattern().toString() + "'.");
            return "ip " + url.getInetAddress().getHostAddress() + " of url does not match must-match filter";
        }

        // filter with must-not-match for IPs
        if ((depth > 0) && profile.ipMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && url.getHost() != null && profile.ipMustNotMatchPattern().matcher(url.getInetAddress().getHostAddress()).matches()) {
            if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("IP " + url.getInetAddress().getHostAddress() + " of URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.ipMustNotMatchPattern().toString() + "'.");
            return "ip " + url.getInetAddress().getHostAddress() + " of url matches must-not-match filter";
        }

        // filter with must-match for countries
        final String[] countryMatchList = profile.countryMustMatchList();
        if (depth > 0 && countryMatchList != null && countryMatchList.length > 0) {
            final Locale locale = url.getLocale();
            if (locale != null) {
                final String c0 = locale.getCountry();
                boolean granted = false;
                matchloop: for (final String c : countryMatchList) {
                    if (c0.equals(c)) {
                        granted = true;
                        break matchloop;
                    }
                }
                if (!granted) {
                    if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("country " + c0 + " of URL '" + urlstring + "' does not match the country must-match crawling filter.");
                    return "country " + c0 + " of url does not match must-match filter for countries";
                }
            }
        }

        return null;
    }

    /**
     * Test a url if it can be used for crawling/indexing.
     * This mainly checks if the url is in the declared domain (local/global).
     * @param url
     * @return null if the url can be accepted, a string containing a rejection reason if the url cannot be accepted
     */
    public String urlInAcceptedDomain(final DigestURL url) {
        // returns null if the url can be accepted according to network.unit.domain
        if (url == null) return "url is null";
        // check domainList from network-definition
        if (this.domainList != null) {
            if (!this.domainList.isListed(url, null)) {
                return "the url '" + url + "' is not in domainList of this network";
            }
        }

        final boolean local = url.isLocal();
        if (this.acceptLocalURLs && local) return null;
        if (this.acceptGlobalURLs && !local) return null;
        final String host = url.getHost();
        if (host == null) return "url.host is null (you must switch to intranet mode to crawl these sources)";
        // check if this is a local address and we are allowed to index local pages:
        //boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress();
        //assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above!
        final InetAddress ia = Domains.dnsResolve(host);
        return (local) ?
                ("the host '" + host + "' is local, but local addresses are not accepted: " + ((ia == null) ? "DNS lookup resulted in null (unknown host name)" : ia.getHostAddress())) :
                ("the host '" + host + "' is global, but global addresses are not accepted: " + ((ia == null) ? "null" : ia.getHostAddress()));
    }

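    /**
     * Test a url hash if it can be used for crawling/indexing, based only on its local/global domain flag.
     * @param urlhash the hash of the url to test
     * @return null if the url hash can be accepted, a string containing a rejection reason otherwise
     */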
    public String urlInAcceptedDomainHash(final byte[] urlhash) {
        // returns null if the url can be accepted according to network.unit.domain
        if (urlhash == null) return "url is null";
        // check if this is a local address and we are allowed to index local pages:
        @SuppressWarnings("deprecation")
        final boolean local = DigestURL.isLocal(urlhash);
        if (this.acceptLocalURLs && local) return null;
        if (this.acceptGlobalURLs && !local) return null;
        return (local) ?
                ("the urlhash '" + ASCII.String(urlhash) + "' is local, but local addresses are not accepted") :
                ("the urlhash '" + ASCII.String(urlhash) + "' is global, but global addresses are not accepted");
    }

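    /**
     * @return true when this stacker accepts URLs from the local (intranet) domain
     */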
    public boolean acceptLocalURLs() {
        return this.acceptLocalURLs;
    }

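    /**
     * @return true when this stacker accepts URLs from the global (internet) domain
     */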
    public boolean acceptGlobalURLs() {
        return this.acceptGlobalURLs;
    }
}