Do not attempt to spider a site if fetching robots.txt returns an error other than 404.

Brian Rasmusson
2015-12-15 14:12:06 +01:00
parent bdaaaacd23
commit a38d8da0f0
2 changed files with 7 additions and 17 deletions
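
For context, the policy the diff below enforces can be sketched as a small standalone C++ snippet. This is an illustration only, not Gigablast code: maySpider() is a hypothetical helper, while the real decision is made in XmlDoc::getIsAllowed() from the HttpMime status of the robots.txt fetch.

#include <cstdio>

// Illustration of the robots.txt policy this commit enforces.
// maySpider() is a hypothetical helper, not part of the Gigablast API.
static bool maySpider ( int robotsTxtHttpStatus ) {
	// 200: robots.txt was fetched; the crawler may proceed, but must still
	// parse the file and honour its rules.
	if ( robotsTxtHttpStatus == 200 ) return true;
	// 404: no robots.txt exists, so by convention everything is allowed.
	if ( robotsTxtHttpStatus == 404 ) return true;
	// Any other status (403, 500, ...): the site's rules could not be read,
	// so do not spider the site at all.
	return false;
}

int main ( ) {
	int statuses[] = { 200, 404, 403, 500, 503 };
	for ( int s : statuses )
		printf ( "robots.txt status %d -> %s\n", s, maySpider(s) ? "spider" : "skip" );
	return 0;
}

Judging from the pre-existing comment "if not success, assume no robots.txt", a failed fetch used to be treated as permission to spider; after this change only a 404 keeps that behaviour, and every other error blocks spidering of the site.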

@@ -598,8 +598,6 @@ void Url::set ( char *t , int32_t tlen , bool addWWW , bool stripSessionId ,
	}
	else
	{
		log(LOG_DEBUG, "build: Url:set no scheme found, defaulting to http://");
		gbmemcpy ( m_url,"http://" , 7 );
		m_scheme = m_url;
		m_slen = 4;

@@ -13666,22 +13666,16 @@ bool *XmlDoc::getIsAllowed ( ) {
	m_isAllowed = true;
	m_isAllowedValid = true;
	// put in a crawldelay test for diffbot
	/*
	SafeBuf tmp;
	if ( strstr(m_firstUrl.getUrl(),"diffbot.com") ) {
		tmp.safePrintf("User-Agent: *\n"
			       "Crawl-Delay: 10.1\n"
			       );
		content = tmp.getBufStart();
		contentLen = tmp.getLength();
	}
	// if not success, assume no robots.txt
	else*/
	if ( mime->getHttpStatus() != 200 )
	{
		// BR 20151215: Do not allow spidering if we cannot read robots.txt EXCEPT
		// if the error code is 404 (Not Found).
		if( mime->getHttpStatus() != 404 )
		{
			m_isAllowed = false;
		}
		if( g_conf.m_logDebugDetailed ) log("%s:%s: END. httpStatus != 200. Return %s", __FILE__,__FUNCTION__, (m_isAllowed?"true":"false"));
		// nuke it to save mem
@@ -13689,8 +13683,6 @@ bool *XmlDoc::getIsAllowed ( ) {
		return &m_isAllowed;
	}
	// get the url we lookup
	//Url *cu = getCurrentUrl();
	// this is set to true if our userAgent was found explicitly
	bool uaFound;