privacore-open-source-searc.../test/unit/RobotsTest.cpp

1837 lines
61 KiB
C++

#include <gtest/gtest.h>
#include "Robots.h"
#include "Url.h"
#include "Log.h"
#include <fstream>
#include <sstream>
#include <string>
#define TEST_DOMAIN "http://example.com"
//
// test class
//
class TestRobots : public Robots {
public:
TestRobots( const char *robotsTxt, int32_t robotsTxtLen, const char *userAgent = "testbot" )
: Robots (robotsTxt, robotsTxtLen, userAgent ) {
}
using Robots::getNextLine;
using Robots::getField;
using Robots::getValue;
using Robots::getCurrentLine;
using Robots::getCurrentLineLen;
using Robots::isUserAgentFound;
using Robots::isDefaultUserAgentFound;
using Robots::isRulesEmpty;
using Robots::isDefaultRulesEmpty;
bool isAllowed( const char *path ) {
char urlStr[1024];
snprintf( urlStr, 1024, TEST_DOMAIN "%s", path );
Url url;
url.set( urlStr );
return Robots::isAllowed( &url );
}
};
static void expectRobotsNoNextLine( TestRobots *robots ) {
EXPECT_FALSE( robots->getNextLine() );
}
static void expectRobots( TestRobots *robots, const char *expectedLine, const char *expectedField = "", const char *expectedValue = "" ) {
{
std::stringstream ss;
ss << __func__ << ":"
<< " expectedLine='" << expectedLine << "'"
<< " currentLine='" << robots->getCurrentLineLen() << "'";
SCOPED_TRACE(ss.str());
EXPECT_TRUE( robots->getNextLine() );
EXPECT_EQ( strlen( expectedLine ), robots->getCurrentLineLen() );
EXPECT_EQ( 0, memcmp( expectedLine, robots->getCurrentLine(), robots->getCurrentLineLen() ) );
}
if ( expectedField != "" ) {
const char *field = NULL;
int32_t fieldLen = 0;
EXPECT_TRUE( robots->getField( &field, &fieldLen ) );
std::stringstream ss;
ss << __func__ << ":"
<< " expectedField='" << expectedField << "'"
<< " currentField='" << std::string( field, fieldLen ) << "'";
SCOPED_TRACE(ss.str());
EXPECT_EQ( strlen( expectedField ), fieldLen );
EXPECT_EQ( 0, memcmp( expectedField, field, fieldLen ) );
if ( expectedValue != "" ) {
const char *value = NULL;
int32_t valueLen = 0;
EXPECT_TRUE( robots->getValue( &value, &valueLen ) );
std::stringstream ss;
ss << __func__ << ":"
<< " expectedValue='" << expectedValue << "'"
<< " currentValue='" << std::string( value, valueLen ) << "'";
SCOPED_TRACE(ss.str());
EXPECT_EQ( strlen( expectedValue ), valueLen );
EXPECT_EQ( 0, memcmp( expectedValue, value, valueLen ) );
}
}
}
TEST( RobotsTest, RobotsGetNextLineLineEndings ) {
const char *robotsTxt = "line 1\n"
"line 2\r"
"line 3\r\n"
"line 4\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
expectRobots( &robots, "line 1" );
expectRobots( &robots, "line 2" );
expectRobots( &robots, "line 3" );
expectRobots( &robots, "line 4" );
expectRobotsNoNextLine( &robots);
}
TEST( RobotsTest, RobotsGetNextLineWhitespaces ) {
const char *robotsTxt = " line 1 \n"
" line 2 \r"
" \n"
"\tline 3\t\r\n"
"\t\tline 4 \n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
expectRobots( &robots, "line 1" );
expectRobots( &robots, "line 2" );
expectRobots( &robots, "line 3" );
expectRobots( &robots, "line 4" );
expectRobotsNoNextLine( &robots);
}
TEST( RobotsTest, RobotsGetNextLineComments ) {
const char *robotsTxt = " line 1 # comment \n"
" line 2#comment \r"
" # line 2a \n"
"\tline 3\t#\tcomment\r\n"
"\t\t#line 4\t\t\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
expectRobots( &robots, "line 1" );
expectRobots( &robots, "line 2" );
expectRobots( &robots, "line 3" );
expectRobotsNoNextLine( &robots);
}
TEST( RobotsTest, RobotsGetFieldValue ) {
const char *robotsTxt = " field1: value1 # comment \n"
" field2 : value2#comment \r"
" # line 2a \n"
"\tfield3\t\t:\tvalue3\t#\tcomment\r\n"
"\t\t#line 4\t\t\n"
"\tfield4\t\t:\tvalue four#comment\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
expectRobots( &robots, "field1: value1", "field1", "value1" );
expectRobots( &robots, "field2 : value2", "field2", "value2" );
expectRobots( &robots, "field3\t\t:\tvalue3", "field3", "value3" );
expectRobots( &robots, "field4\t\t:\tvalue four", "field4", "value four" );
expectRobotsNoNextLine( &robots);
}
//
// helper method
//
static void generateRobotsTxt ( char *robotsTxt, size_t robotsTxtSize, int32_t *pos, const char *userAgent = "testbot", const char *allow = "", const char *disallow = "", bool reversed = false ) {
if ( *pos != 0 ) {
*pos += snprintf ( robotsTxt + *pos, robotsTxtSize - *pos, "\n" );
}
*pos += snprintf ( robotsTxt + *pos, robotsTxtSize - *pos, "user-agent: %s\n", userAgent );
if ( reversed && disallow != "" ) {
*pos += snprintf (robotsTxt + *pos, robotsTxtSize - *pos, "disallow: %s\n", disallow );
}
if ( allow != "" ) {
*pos += snprintf ( robotsTxt + *pos, robotsTxtSize - *pos, "allow: %s\n", allow );
}
if ( !reversed && disallow != "" ) {
*pos += snprintf (robotsTxt + *pos, robotsTxtSize - *pos, "disallow: %s\n", disallow );
}
}
static void generateTestRobotsTxt ( char *robotsTxt, size_t robotsTxtSize, const char *allow = "", const char *disallow = "" ) {
int32_t pos = 0;
generateRobotsTxt( robotsTxt, robotsTxtSize, &pos, "testbot", allow, disallow);
}
static void generateTestReversedRobotsTxt ( char *robotsTxt, size_t robotsTxtSize, const char *allow = "", const char *disallow = "" ) {
int32_t pos = 0;
generateRobotsTxt( robotsTxt, robotsTxtSize, &pos, "testbot", allow, disallow, true);
}
////////////////////////////////////////////////////////////////////////////////
// //
// Test user-agent //
// //
////////////////////////////////////////////////////////////////////////////////
TEST( RobotsTest, UserAgentSingleUANoMatch ) {
char robotsTxt[1024] = "user-agent: abcbot\n"
"crawl-delay: 1\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( -1, robots.getCrawlDelay() );
}
TEST( RobotsTest, UserAgentSingleUAPrefixMatch ) {
char robotsTxt[1024] = "user-agent: testbotabc\n"
"crawl-delay: 1\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 1000, robots.getCrawlDelay() );
}
TEST( RobotsTest, UserAgentSingleUAPrefixVersionMatch ) {
char robotsTxt[1024] = "user-agent: testbot/1.0\n"
"crawl-delay: 1\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 1000, robots.getCrawlDelay() );
}
TEST( RobotsTest, UserAgentSingleUAIgnoreCase ) {
char robotsTxt[1024] = "user-agent: TestBot/1.0\n"
"crawl-delay: 1\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 1000, robots.getCrawlDelay() );
}
TEST( RobotsTest, UserAgentSingleUAMatch ) {
char robotsTxt[1024] = "user-agent: testbot\n"
"crawl-delay: 1\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 1000, robots.getCrawlDelay() );
}
TEST( RobotsTest, UserAgentSeparateUANone ) {
char robotsTxt[1024] = "user-agent: atestbot\n"
"crawl-delay: 1\n"
"user-agent: abcbot\n"
"crawl-delay: 2\n"
"user-agent: defbot\n"
"crawl-delay: 3\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( -1, robots.getCrawlDelay() );
}
TEST( RobotsTest, UserAgentSeparateUAFirst ) {
char robotsTxt[1024] = "user-agent: testbot\n"
"crawl-delay: 1\n"
"user-agent: abcbot\n"
"crawl-delay: 2\n"
"user-agent: defbot\n"
"crawl-delay: 3\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 1000, robots.getCrawlDelay() );
}
TEST( RobotsTest, UserAgentSeparateUASecond ) {
char robotsTxt[1024] = "user-agent: abcbot\n"
"crawl-delay: 1\n"
"user-agent: testbot\n"
"crawl-delay: 2\n"
"user-agent: defbot\n"
"crawl-delay: 3\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 2000, robots.getCrawlDelay() );
}
TEST( RobotsTest, UserAgentSeparateUALast ) {
char robotsTxt[1024] = "user-agent: abcbot\n"
"crawl-delay: 1\n"
"user-agent: defbot\n"
"crawl-delay: 2\n"
"user-agent: testbot\n"
"crawl-delay: 3\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 3000, robots.getCrawlDelay() );
}
TEST( RobotsTest, UserAgentMultiUANone ) {
char robotsTxt[1024] = "user-agent: abcbot\n"
"crawl-delay: 1\n"
"user-agent: atestbot\n"
"crawl-delay: 2\n"
"user-agent: defbot\n"
"crawl-delay: 3\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( -1, robots.getCrawlDelay() );
}
TEST( RobotsTest, UserAgentMultiUAFirst ) {
char robotsTxt[1024] = "user-agent: testbot\n"
"user-agent: abcbot\n"
"user-agent: defbot\n"
"crawl-delay: 1\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 1000, robots.getCrawlDelay() );
}
TEST( RobotsTest, UserAgentMultiUASecond ) {
char robotsTxt[1024] = "user-agent: abcbot\n"
"user-agent: testbot\n"
"user-agent: defbot\n"
"crawl-delay: 1\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 1000, robots.getCrawlDelay() );
}
TEST( RobotsTest, UserAgentMultiUALast ) {
char robotsTxt[1024] = "user-agent: abcbot\n"
"user-agent: defbot\n"
"user-agent: testbot\n"
"crawl-delay: 1\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 1000, robots.getCrawlDelay() );
}
TEST( RobotsTest, UserAgentDefaultMultiUAFirst ) {
char robotsTxt[1024] = "user-agent: *\n"
"crawl-delay: 1\n"
"user-agent: testbot\n"
"user-agent: abcbot\n"
"user-agent: defbot\n"
"crawl-delay: 2\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_TRUE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 2000, robots.getCrawlDelay() );
}
TEST( RobotsTest, UserAgentDefaultMultiUASecond ) {
char robotsTxt[1024] = "user-agent: *\n"
"crawl-delay: 1\n"
"user-agent: abcbot\n"
"user-agent: testbot\n"
"user-agent: defbot\n"
"crawl-delay: 2\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_TRUE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 2000, robots.getCrawlDelay() );
}
TEST( RobotsTest, UserAgentDefaultMultiUALast ) {
char robotsTxt[1024] = "user-agent: *\n"
"crawl-delay: 1\n"
"user-agent: abcbot\n"
"user-agent: defbot\n"
"user-agent: testbot\n"
"crawl-delay: 2\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_TRUE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 2000, robots.getCrawlDelay() );
}
TEST( RobotsTest, UserAgentMultiDefaultUAFirst ) {
char robotsTxt[1024] = "user-agent: *\n"
"user-agent: abcbot\n"
"user-agent: defbot\n"
"crawl-delay: 1\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_TRUE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 1000, robots.getCrawlDelay() );
}
TEST( RobotsTest, UserAgentMultiDefaultUASecond ) {
char robotsTxt[1024] = "user-agent: abcbot\n"
"user-agent: *\n"
"user-agent: defbot\n"
"crawl-delay: 1\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_TRUE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 1000, robots.getCrawlDelay() );
}
TEST( RobotsTest, UserAgentMultiDefaultUALast ) {
char robotsTxt[1024] = "user-agent: abcbot\n"
"user-agent: defbot\n"
"user-agent: *\n"
"crawl-delay: 1\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_TRUE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 1000, robots.getCrawlDelay() );
}
TEST( RobotsTest, UserAgentFieldCaseInsensitive ) {
char robotsTxt[1024] = "User-Agent: testbot\n"
"crawl-delay: 1\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 1000, robots.getCrawlDelay() );
}
////////////////////////////////////////////////////////////////////////////////
// //
// Test comments //
// //
////////////////////////////////////////////////////////////////////////////////
TEST( RobotsTest, CommentsFullLine ) {
char robotsTxt[1024] = "user-agent: *\n"
"#user-agent: testbot\n"
"user-agent: defbot\n"
"crawl-delay: 1\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_TRUE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 1000, robots.getCrawlDelay() );
}
TEST( RobotsTest, CommentsAfterWithSpace ) {
int32_t pos = 0;
char robotsTxt[1024];
generateRobotsTxt( robotsTxt, 1024, &pos, "testbot #user-agent", "/test #allow", "/ #disallow");
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isAllowed( "/" ) );
EXPECT_FALSE( robots.isAllowed( "/index.html" ) );
EXPECT_TRUE( robots.isAllowed( "/test.html" ) );
}
TEST( RobotsTest, CommentsAfterNoSpace ) {
int32_t pos = 0;
char robotsTxt[1024];
generateRobotsTxt( robotsTxt, 1024, &pos, "testbot#user-agent", "/test#allow", "/#disallow");
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isAllowed( "/" ) );
EXPECT_FALSE( robots.isAllowed( "/index.html" ) );
EXPECT_TRUE( robots.isAllowed( "/test.html" ) );
}
////////////////////////////////////////////////////////////////////////////////
// //
// Test whitespace //
// //
////////////////////////////////////////////////////////////////////////////////
TEST( RobotsTest, WhitespaceSpaceDirectiveBefore ) {
char robotsTxt[1024] = " user-agent:testbot\n"
" disallow:/test\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/" ) );
EXPECT_TRUE( robots.isAllowed( "/index.html" ) );
EXPECT_FALSE( robots.isAllowed( "/test.html" ) );
}
TEST( RobotsTest, WhitespaceSpaceDirectiveAfter ) {
char robotsTxt[1024] = "user-agent: testbot\n"
"disallow: /test\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/" ) );
EXPECT_TRUE( robots.isAllowed( "/index.html" ) );
EXPECT_FALSE( robots.isAllowed( "/test.html" ) );
}
TEST( RobotsTest, WhitespaceSpaceDirectiveBoth ) {
char robotsTxt[1024] = " user-agent: testbot\n"
" disallow: /test\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/" ) );
EXPECT_TRUE( robots.isAllowed( "/index.html" ) );
EXPECT_FALSE( robots.isAllowed( "/test.html" ) );
}
TEST( RobotsTest, WhitespaceTabsDirectiveBefore ) {
char robotsTxt[1024] = "\tuser-agent:testbot\n"
"\t\tdisallow:/test\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/" ) );
EXPECT_TRUE( robots.isAllowed( "/index.html" ) );
EXPECT_FALSE( robots.isAllowed( "/test.html" ) );
}
TEST( RobotsTest, WhitespaceTabsDirectiveAfter ) {
char robotsTxt[1024] = "user-agent:\ttestbot\n"
"disallow:\t/test\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/" ) );
EXPECT_TRUE( robots.isAllowed( "/index.html" ) );
EXPECT_FALSE( robots.isAllowed( "/test.html" ) );
}
TEST( RobotsTest, WhitespaceTabsDirectiveBoth ) {
char robotsTxt[1024] = "\tuser-agent:\ttestbot\n"
"\t\tdisallow:\t/test\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/" ) );
EXPECT_TRUE( robots.isAllowed( "/index.html" ) );
EXPECT_FALSE( robots.isAllowed( "/test.html" ) );
}
////////////////////////////////////////////////////////////////////////////////
// //
// Test allow/disallow //
// //
////////////////////////////////////////////////////////////////////////////////
TEST( RobotsTest, AllowAll ) {
static const char *allow = "";
static const char *disallow = " ";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/" ) );
EXPECT_TRUE( robots.isAllowed( "/index.html" ) );
}
TEST( RobotsTest, DisallowAll ) {
static const char *allow = "";
static const char *disallow = "/";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isAllowed( "/" ) );
EXPECT_FALSE( robots.isAllowed( "/index.html" ) );
}
// /123 matches /123 and /123/ and /1234 and /123/456
TEST( RobotsTest, PathMatch ) {
static const char *allow = "";
static const char *disallow = "/123";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/" ) );
EXPECT_TRUE( robots.isAllowed( "/index.html" ) );
EXPECT_TRUE( robots.isAllowed( "/12" ) );
EXPECT_FALSE( robots.isAllowed( "/123" ) );
EXPECT_FALSE( robots.isAllowed( "/123/" ) );
EXPECT_FALSE( robots.isAllowed( "/1234" ) );
EXPECT_FALSE( robots.isAllowed( "/123/456" ) );
}
// treat /123* as /123
TEST( RobotsTest, PathMatchWildcardEnd ) {
static const char *allow = "";
static const char *disallow = "/123*";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/" ) );
EXPECT_TRUE( robots.isAllowed( "/index.html" ) );
EXPECT_TRUE( robots.isAllowed( "/12" ) );
EXPECT_FALSE( robots.isAllowed( "/123" ) );
EXPECT_FALSE( robots.isAllowed( "/123/" ) );
EXPECT_FALSE( robots.isAllowed( "/1234" ) );
EXPECT_FALSE( robots.isAllowed( "/123/456" ) );
}
// treat /123*** as /123
TEST( RobotsTest, PathMatchMultipleWildcardEnd ) {
static const char *allow = "";
static const char *disallow = "/123***";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/" ) );
EXPECT_TRUE( robots.isAllowed( "/index.html" ) );
EXPECT_TRUE( robots.isAllowed( "/12" ) );
EXPECT_FALSE( robots.isAllowed( "/123" ) );
EXPECT_FALSE( robots.isAllowed( "/123/" ) );
EXPECT_FALSE( robots.isAllowed( "/1234" ) );
EXPECT_FALSE( robots.isAllowed( "/123/456" ) );
}
// /123/ matches /123/ and /123/456
TEST( RobotsTest, PathMatchDir ) {
static const char *allow = "";
static const char *disallow = "/123/";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/" ) );
EXPECT_TRUE( robots.isAllowed( "/index.html" ) );
EXPECT_TRUE( robots.isAllowed( "/123" ) );
EXPECT_TRUE( robots.isAllowed( "/1234" ) );
EXPECT_FALSE( robots.isAllowed( "/123/" ) );
EXPECT_FALSE( robots.isAllowed( "/123/456" ) );
EXPECT_FALSE( robots.isAllowed( "/123/456/" ) );
}
// treat /123/* as /123/
TEST( RobotsTest, PathMatchDirWildcardEnd ) {
static const char *allow = "";
static const char *disallow = "/123/*";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/" ) );
EXPECT_TRUE( robots.isAllowed( "/index.html" ) );
EXPECT_TRUE( robots.isAllowed( "/123" ) );
EXPECT_TRUE( robots.isAllowed( "/1234" ) );
EXPECT_FALSE( robots.isAllowed( "/123/" ) );
EXPECT_FALSE( robots.isAllowed( "/123/456" ) );
EXPECT_FALSE( robots.isAllowed( "/123/456/" ) );
}
// /*abc matches /123abc and /123/abc and /123abc456 and /123/abc/456
TEST( RobotsTest, PathMatchWildcardStart ) {
static const char *allow = "";
static const char *disallow = "/*abc";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/123" ) );
EXPECT_TRUE( robots.isAllowed( "/123ab" ) );
EXPECT_FALSE( robots.isAllowed( "/123abc" ) );
EXPECT_FALSE( robots.isAllowed( "/123/abc" ) );
EXPECT_FALSE( robots.isAllowed( "/123abc456" ) );
EXPECT_FALSE( robots.isAllowed( "/123/abc/456" ) );
}
// /123*xyz matches /123qwertyxyz and /123/qwerty/xyz/789
TEST( RobotsTest, PathMatchWildcardMid ) {
static const char *allow = "";
static const char *disallow = "/123*xyz";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/123/qwerty/xy" ) );
EXPECT_FALSE( robots.isAllowed( "/123qwertyxyz" ) );
EXPECT_FALSE( robots.isAllowed( "/123qwertyxyz/" ) );
EXPECT_FALSE( robots.isAllowed( "/123/qwerty/xyz/789" ) );
}
// /123$ matches ONLY /123
TEST( RobotsTest, PathMatchEndAnchor ) {
static const char *allow = "";
static const char *disallow = "/123$";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/123/" ) );
EXPECT_FALSE( robots.isAllowed( "/123" ) );
}
// /*abc$ matches /123abc and /123/abc but NOT /123/abc/x etc.
TEST( RobotsTest, PathMatchWildcardEndAnchor ) {
static const char *allow = "";
static const char *disallow = "/*abc$";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/123/abc/x" ) );
EXPECT_FALSE( robots.isAllowed( "/123abc" ) );
EXPECT_FALSE( robots.isAllowed( "/123/abc" ) );
}
/// @todo ALC test multiple wildcard
/// @todo ALC test multiple wildcard end (line anchor)
/// @todo ALC test _escaped_fragment_
////////////////////////////////////////////////////////////////////////////////
// //
// Test crawl delay //
// //
////////////////////////////////////////////////////////////////////////////////
TEST( RobotsTest, CrawlDelayValueNone ) {
char robotsTxt[1024] = "user-agent: testbot\n"
"crawl-delay:";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( -1, robots.getCrawlDelay() );
}
TEST( RobotsTest, CrawlDelayValueInvalid ) {
char robotsTxt[1024] = "user-agent: testbot\n"
"crawl-delay: abc";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( -1, robots.getCrawlDelay() );
}
TEST( RobotsTest, CrawlDelayNoMatch ) {
char robotsTxt[1024] = "user-agent: abcbot\n"
"crawl-delay: 1";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( -1, robots.getCrawlDelay() );
}
TEST( RobotsTest, CrawlDelayMissing ) {
char robotsTxt[1024] = "user-agent: testbot\n"
"disallow: /";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_FALSE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( -1, robots.getCrawlDelay() );
}
TEST( RobotsTest, CrawlDelayValueFractionPartial ) {
char robotsTxt[1024] = "user-agent: testbot\n"
"crawl-delay: .5";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 500, robots.getCrawlDelay() );
}
TEST( RobotsTest, CrawlDelayValueFractionFull ) {
char robotsTxt[1024] = "user-agent: testbot\n"
"crawl-delay: 1.5\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 1500, robots.getCrawlDelay() );
}
TEST( RobotsTest, CrawlDelayValueIntegerValid ) {
char robotsTxt[1024] = "user-agent: testbot\n"
"crawl-delay: 30 \n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 30000, robots.getCrawlDelay() );
}
TEST( RobotsTest, CrawlDelayValueIntegerInvalid ) {
char robotsTxt[1024] = "user-agent: testbot\n"
"crawl-delay: 60abc \n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( -1, robots.getCrawlDelay() );
}
TEST( RobotsTest, CrawlDelayValueComment ) {
char robotsTxt[1024] = "user-agent: testbot\n"
"crawl-delay: 60#abc \n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 60000, robots.getCrawlDelay() );
}
TEST( RobotsTest, CrawlDelayDefaultFirstNoMatch ) {
char robotsTxt[1024] = "user-agent: *\n"
"crawl-delay: 1 \n"
"user-agent: testbot\n"
"crawl-delay: 2 \n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_TRUE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 2000, robots.getCrawlDelay() );
}
TEST( RobotsTest, CrawlDelayDefaultLastNoMatch ) {
char robotsTxt[1024] = "user-agent: testbot\n"
"crawl-delay: 1 \n"
"user-agent: * \n"
"crawl-delay: 2 \n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_TRUE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 1000, robots.getCrawlDelay() );
}
TEST( RobotsTest, CrawlDelayDefaultFirstMatch ) {
char robotsTxt[1024] = "user-agent: *\n"
"crawl-delay: 1 \n"
"user-agent: abcbot\n"
"crawl-delay: 2 \n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_TRUE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 1000, robots.getCrawlDelay() );
}
TEST( RobotsTest, CrawlDelayDefaultLastMatch ) {
char robotsTxt[1024] = "user-agent: abcbot\n"
"crawl-delay: 1 \n"
"user-agent: *\n"
"crawl-delay: 2 \n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_TRUE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 2000, robots.getCrawlDelay() );
}
TEST( RobotsTest, CrawlDelayFieldCaseInsensitive ) {
char robotsTxt[1024] = "user-agent: testbot\n"
"Crawl-Delay: 1\n";
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_EQ( 1000, robots.getCrawlDelay() );
}
////////////////////////////////////////////////////////////////////////////////
// //
// Test site map //
// //
////////////////////////////////////////////////////////////////////////////////
/// @todo ALC test site map
////////////////////////////////////////////////////////////////////////////////
// //
// Test line endings //
// //
////////////////////////////////////////////////////////////////////////////////
/// @todo ALC test line endings
////////////////////////////////////////////////////////////////////////////////
// //
// Test utf-8 encoding (non-ascii) //
// //
////////////////////////////////////////////////////////////////////////////////
/// @todo ALC test utf-8 encoding
////////////////////////////////////////////////////////////////////////////////
// //
// Test url encoded path //
// //
////////////////////////////////////////////////////////////////////////////////
/// @todo ALC test url encoded path
//////////////////////////////////////////////////////////////////////////////////////////
// //
// Test cases based on google's robots.txt specification //
// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt //
// #example-path-matches //
// //
//////////////////////////////////////////////////////////////////////////////////////////
// [path] [match] [no match] [comments]
// / any valid url Matches the root and any lower level URL
// /* equivalent to / equivalent to / Equivalent to "/" -- the trailing wildcard is ignored.
TEST( RobotsTest, GPathMatchDisallowAll ) {
static const char *allow = "";
static const char *disallow = "/";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isAllowed( "/" ) );
EXPECT_FALSE( robots.isAllowed( "/index.html" ) );
}
TEST( RobotsTest, GPathMatchDisallowAllWildcard ) {
static const char *allow = "";
static const char *disallow = "/*";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isAllowed( "/" ) );
EXPECT_FALSE( robots.isAllowed( "/index.html" ) );
}
// [path] [match] [no match] [comments]
// /fish /fish /Fish.asp Note the case-sensitive matching.
// /fish.html /catfish
// /fish/salmon.html /?id=fish
// /fishheads
// /fishheads/yummy.html
// /fish.php?id=anything
TEST( RobotsTest, GPathMatchPrefixDisallow ) {
static const char *allow = "";
static const char *disallow = "/fish";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isAllowed( "/fish" ) );
EXPECT_FALSE( robots.isAllowed( "/fish.html" ) );
EXPECT_FALSE( robots.isAllowed( "/fish/salmon.html" ) );
EXPECT_FALSE( robots.isAllowed( "/fishheads" ) );
EXPECT_FALSE( robots.isAllowed( "/fishheads/yummy.html" ) );
EXPECT_FALSE( robots.isAllowed( "/fish.php?id=anything" ) );
EXPECT_TRUE( robots.isAllowed( "/Fish.asp" ) );
EXPECT_TRUE( robots.isAllowed( "/catfish" ) );
EXPECT_TRUE( robots.isAllowed( "/?id=fish" ) );
}
TEST( RobotsTest, GPathMatchPrefixAllow ) {
static const char *allow = "/fish";
static const char *disallow = "/";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/fish" ) );
EXPECT_TRUE( robots.isAllowed( "/fish.html" ) );
EXPECT_TRUE( robots.isAllowed( "/fish/salmon.html" ) );
EXPECT_TRUE( robots.isAllowed( "/fishheads" ) );
EXPECT_TRUE( robots.isAllowed( "/fishheads/yummy.html" ) );
EXPECT_TRUE( robots.isAllowed( "/fish.php?id=anything" ) );
EXPECT_FALSE( robots.isAllowed( "/Fish.asp" ) );
EXPECT_FALSE( robots.isAllowed( "/catfish" ) );
EXPECT_FALSE( robots.isAllowed( "/?id=fish" ) );
}
// [path] [match] [no match] [comments]
// /fish* /fish /Fish.asp Equivalent to "/fish" -- the trailing wildcard is ignored.
// /fish.html /catfish
// /fish/salmon.html /?id=fish
// /fishheads
// /fishheads/yummy.html
// /fish.php?id=anything
TEST( RobotsTest, GPathMatchPrefixWildcardDisallow ) {
static const char *allow = "";
static const char *disallow = "/fish*";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isAllowed( "/fish" ) );
EXPECT_FALSE( robots.isAllowed( "/fish.html" ) );
EXPECT_FALSE( robots.isAllowed( "/fish/salmon.html" ) );
EXPECT_FALSE( robots.isAllowed( "/fishheads" ) );
EXPECT_FALSE( robots.isAllowed( "/fishheads/yummy.html" ) );
EXPECT_FALSE( robots.isAllowed( "/fish.php?id=anything" ) );
EXPECT_TRUE( robots.isAllowed( "/Fish.asp" ) );
EXPECT_TRUE( robots.isAllowed( "/catfish" ) );
EXPECT_TRUE( robots.isAllowed( "/?id=fish" ) );
}
TEST( RobotsTest, GPathMatchPrefixWildcardAllow ) {
static const char *allow = "/fish*";
static const char *disallow = "/";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/fish" ) );
EXPECT_TRUE( robots.isAllowed( "/fish.html" ) );
EXPECT_TRUE( robots.isAllowed( "/fish/salmon.html" ) );
EXPECT_TRUE( robots.isAllowed( "/fishheads" ) );
EXPECT_TRUE( robots.isAllowed( "/fishheads/yummy.html" ) );
EXPECT_TRUE( robots.isAllowed( "/fish.php?id=anything" ) );
EXPECT_FALSE( robots.isAllowed( "/Fish.asp" ) );
EXPECT_FALSE( robots.isAllowed( "/catfish" ) );
EXPECT_FALSE( robots.isAllowed( "/?id=fish" ) );
}
// [path] [match] [no match] [comments]
// /fish/ /fish/ /fish The trailing slash means this matches anything in this folder.
// /fish/?id=anything /fish.html
// /fish/salmon.htm /Fish/Salmon.php
TEST( RobotsTest, GPathMatchPrefixDirDisallow ) {
static const char *allow = "";
static const char *disallow = "/fish/";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isAllowed( "/fish/" ) );
EXPECT_FALSE( robots.isAllowed( "/fish/?id=anything" ) );
EXPECT_FALSE( robots.isAllowed( "/fish/salmon.htm" ) );
EXPECT_TRUE( robots.isAllowed( "/fish" ) );
EXPECT_TRUE( robots.isAllowed( "/fish.html" ) );
EXPECT_TRUE( robots.isAllowed( "/Fish/Salmon.php" ) );
}
TEST( RobotsTest, GPathMatchPrefixDirAllow ) {
static const char *allow = "/fish/";
static const char *disallow = "/";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/fish/" ) );
EXPECT_TRUE( robots.isAllowed( "/fish/?id=anything" ) );
EXPECT_TRUE( robots.isAllowed( "/fish/salmon.htm" ) );
EXPECT_FALSE( robots.isAllowed( "/fish" ) );
EXPECT_FALSE( robots.isAllowed( "/fish.html" ) );
EXPECT_FALSE( robots.isAllowed( "/Fish/Salmon.php" ) );
}
// [path] [match] [no match] [comments]
// *.php /filename.php / (even if it maps to /index.php)
// /folder/filename.php /windows.PHP
// /folder/filename.php?parameters
// /folder/any.php.file.html
// /filename.php/
TEST( RobotsTest, GPathMatchWildcardExtDisallow ) {
static const char *allow = "";
static const char *disallow = "*.php";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isAllowed( "/filename.php" ) );
EXPECT_FALSE( robots.isAllowed( "/folter/filename.php" ) );
EXPECT_FALSE( robots.isAllowed( "/folder/filename.php?parameters" ) );
EXPECT_FALSE( robots.isAllowed( "/folder/any.php.file.html" ) );
EXPECT_FALSE( robots.isAllowed( "/filename.php/" ) );
EXPECT_TRUE( robots.isAllowed( "/" ) );
EXPECT_TRUE( robots.isAllowed( "/windows.PHP" ) );
}
TEST( RobotsTest, GPathMatchWildcardExtAllow ) {
static const char *allow = "/*.php";
static const char *disallow = "/";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/filename.php" ) );
EXPECT_TRUE( robots.isAllowed( "/folter/filename.php" ) );
EXPECT_TRUE( robots.isAllowed( "/folder/filename.php?parameters" ) );
EXPECT_TRUE( robots.isAllowed( "/folder/any.php.file.html" ) );
EXPECT_TRUE( robots.isAllowed( "/filename.php/" ) );
EXPECT_FALSE( robots.isAllowed( "/" ) );
EXPECT_FALSE( robots.isAllowed( "/windows.PHP" ) );
}
// [path] [match] [no match] [comments]
// /*.php$ /filename.php /filename.php?parameters
// /folder/filename.php /filename.php/
// /filename.php5
// /windows.PHP
TEST( RobotsTest, GPathMatchWildcardExtEndDisallow ) {
static const char *allow = "";
static const char *disallow = "/*.php$";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isAllowed( "/filename.php" ) );
EXPECT_FALSE( robots.isAllowed( "/folder/filename.php" ) );
EXPECT_TRUE( robots.isAllowed( "/filename.php?parameters" ) );
EXPECT_TRUE( robots.isAllowed( "/filename.php/" ) );
EXPECT_TRUE( robots.isAllowed( "/filename.php5" ) );
EXPECT_TRUE( robots.isAllowed( "/windows.PHP" ) );
}
TEST( RobotsTest, GPathMatchWildcardExtEndAllow ) {
static const char *allow = "/*.php$";
static const char *disallow = "/";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/filename.php" ) );
EXPECT_TRUE( robots.isAllowed( "/folder/filename.php" ) );
EXPECT_FALSE( robots.isAllowed( "/filename.php?parameters" ) );
EXPECT_FALSE( robots.isAllowed( "/filename.php/" ) );
EXPECT_FALSE( robots.isAllowed( "/filename.php5" ) );
EXPECT_FALSE( robots.isAllowed( "/windows.PHP" ) );
}
// [path] [match] [no match] [comments]
// /fish*.php /fish.php /Fish.PHP
// /fishheads/catfish.php?parameters
TEST( RobotsTest, GPathMatchPrefixWildcardExtDisallow ) {
static const char *allow = "";
static const char *disallow = "/fish*.php";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isAllowed( "/fish.php" ) );
EXPECT_FALSE( robots.isAllowed( "/fishheads/catfish.php?parameters" ) );
EXPECT_TRUE( robots.isAllowed( "/Fish.PHP" ) );
}
TEST( RobotsTest, GPathMatchPrefixWildcardExtAllow ) {
static const char *allow = "/fish*.php";
static const char *disallow = "/";
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, allow, disallow );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/fish.php" ) );
EXPECT_TRUE( robots.isAllowed( "/fishheads/catfish.php?parameters" ) );
EXPECT_FALSE( robots.isAllowed( "/Fish.PHP" ) );
}
//////////////////////////////////////////////////////////////////////////////////////////
// //
// Test cases based on google's robots.txt specification //
// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt //
// #order-of-precedence-for-group-member-records //
// //
//////////////////////////////////////////////////////////////////////////////////////////
// [url] [allow] [disallow] [verdict]
// http://example.com/page /p / allow
// http://example.com/folder/page /folder/ /folder allow
// http://example.com/page.htm /page /*.htm undefined
// http://example.com/ /$ / allow
// http://example.com/page.htm /$ / disallow
TEST( RobotsTest, GPrecedenceAllowDisallow ) {
char robotsTxt[1024];
{
generateTestRobotsTxt( robotsTxt, 1024, "/p", "/" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/page" ));
}
{
generateTestRobotsTxt( robotsTxt, 1024, "/folder/", "/folder" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/folder/page" ));
}
{
generateTestRobotsTxt( robotsTxt, 1024, "/page.", "/*.htm" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed ( "/page.htm" ) );
}
{
generateTestRobotsTxt( robotsTxt, 1024, "/$", "/" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/" ));
EXPECT_FALSE( robots.isAllowed( "/page.htm" ));
}
}
TEST( RobotsTest, GPrecedenceDisallowAllow ) {
char robotsTxt[1024];
{
generateTestReversedRobotsTxt( robotsTxt, 1024, "/p", "/" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/page" ));
}
{
generateTestReversedRobotsTxt( robotsTxt, 1024, "/folder/", "/folder" );
TestRobots robots( robotsTxt, strlen( robotsTxt ));
EXPECT_TRUE( robots.isAllowed( "/folder/page" ));
}
{
generateTestReversedRobotsTxt( robotsTxt, 1024, "/page.", "/*.htm" );
TestRobots robots( robotsTxt, strlen( robotsTxt ));
EXPECT_FALSE( robots.isAllowed ( "/page.htm" ) );
}
{
generateTestReversedRobotsTxt( robotsTxt, 1024, "/$", "/" );
TestRobots robots( robotsTxt, strlen( robotsTxt ));
EXPECT_TRUE( robots.isAllowed( "/" ));
EXPECT_FALSE( robots.isAllowed( "/page.htm" ));
}
}
////////////////////////////////////////////////////////////////////////////////
// //
// Test cases based on RFC //
// http://www.robotstxt.org/norobots-rfc.txt //
// //
////////////////////////////////////////////////////////////////////////////////
TEST( RobotsTest, RFCPathMatchPrefixDisallow ) {
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, "/", "/tmp" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isAllowed( "/tmp" ));
EXPECT_FALSE( robots.isAllowed( "/tmp.html" ));
EXPECT_FALSE( robots.isAllowed( "/tmp/a.html" ));
}
TEST( RobotsTest, RFCPathMatchPrefixAllow ) {
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, "/tmp", "/" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/tmp" ));
EXPECT_TRUE( robots.isAllowed( "/tmp.html" ));
EXPECT_TRUE( robots.isAllowed( "/tmp/a.html" ));
}
TEST( RobotsTest, RFCPathMatchPrefixDirDisallow ) {
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, "/", "/tmp/" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/tmp" ));
EXPECT_FALSE( robots.isAllowed( "/tmp/" ));
EXPECT_FALSE( robots.isAllowed( "/tmp/a.html" ));
}
TEST( RobotsTest, RFCPathMatchPrefixDirAllow ) {
char robotsTxt[1024];
generateTestRobotsTxt( robotsTxt, 1024, "/tmp/", "/" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isAllowed( "/tmp" ));
EXPECT_TRUE( robots.isAllowed( "/tmp/" ));
EXPECT_TRUE( robots.isAllowed( "/tmp/a.html" ));
}
TEST( RobotsTest, RFCPathMatchUrlEncodeDisallow ) {
char robotsTxt[1024];
{
generateTestRobotsTxt( robotsTxt, 1024, "/", "/a%3cd.html" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isAllowed( "/a%3cd.html" ));
EXPECT_FALSE( robots.isAllowed( "/a%3Cd.html" ));
}
{
generateTestRobotsTxt( robotsTxt, 1024, "/", "/a%3Cd.html" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isAllowed( "/a%3cd.html" ));
EXPECT_FALSE( robots.isAllowed( "/a%3Cd.html" ));
}
{
generateTestRobotsTxt( robotsTxt, 1024, "/", "/a%2fb.html" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isAllowed( "/a%2fb.html" ));
EXPECT_TRUE( robots.isAllowed( "/a/b.html" ));
}
{
generateTestRobotsTxt( robotsTxt, 1024, "/", "/a/b.html" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/a%2fb.html" ));
EXPECT_FALSE( robots.isAllowed( "/a/b.html" ));
}
{
generateTestRobotsTxt( robotsTxt, 1024, "/", "/%7ejoe/index.html" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isAllowed( "/~joe/index.html" ));
}
{
generateTestRobotsTxt( robotsTxt, 1024, "/", "/~joe/index.html" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isAllowed( "/%7ejoe/index.html" ));
}
}
TEST( RobotsTest, RFCPathMatchUrlEncodeAllow ) {
char robotsTxt[1024];
{
generateTestRobotsTxt( robotsTxt, 1024, "/a%3cd.html", "/" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/a%3cd.html" ));
EXPECT_TRUE( robots.isAllowed( "/a%3Cd.html" ));
}
{
generateTestRobotsTxt( robotsTxt, 1024, "/a%3Cd.html", "/" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/a%3cd.html" ));
EXPECT_TRUE( robots.isAllowed( "/a%3Cd.html" ));
}
{
generateTestRobotsTxt( robotsTxt, 1024, "/a%2fb.html", "/" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/a%2fb.html" ));
EXPECT_FALSE( robots.isAllowed( "/a/b.html" ));
}
{
generateTestRobotsTxt( robotsTxt, 1024, "/a/b.html", "/" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_FALSE( robots.isAllowed( "/a%2fb.html" ));
EXPECT_TRUE( robots.isAllowed( "/a/b.html" ));
}
{
generateTestRobotsTxt( robotsTxt, 1024, "/%7ejoe/index.html", "/" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/~joe/index.html" ));
}
{
generateTestRobotsTxt( robotsTxt, 1024, "/~joe/index.html", "/" );
TestRobots robots( robotsTxt, strlen(robotsTxt) );
EXPECT_TRUE( robots.isAllowed( "/%7ejoe/index.html" ));
}
}
TEST( RobotsTest, RFCExample ) {
char robotsTxt[1024] = "# /robots.txt for http://www.fict.org/\n"
"# comments to webmaster@fict.org\n"
"\n"
"User-agent: unhipbot\n"
"Disallow: /\n"
"\n"
"User-agent: webcrawler\n"
"User-agent: excite\n"
"Disallow: \n"
"\n"
"User-agent: *\n"
"Disallow: /org/plans.html\n"
"Allow: /org/\n"
"Allow: /serv\n"
"Allow: /~mak\n"
"Disallow: /";
{
// unhipbot
TestRobots robots( robotsTxt, strlen(robotsTxt), "unhipbot" );
EXPECT_FALSE( robots.isAllowed( "/" ));
EXPECT_FALSE( robots.isAllowed( "/index.html" ));
//EXPECT_FALSE( robots.isAllowed( "/robots.txt" ));
EXPECT_FALSE( robots.isAllowed( "/server.html" ));
EXPECT_FALSE( robots.isAllowed( "/services/fast.html" ));
EXPECT_FALSE( robots.isAllowed( "/services/slow.html" ));
EXPECT_FALSE( robots.isAllowed( "/orgo.gif" ));
EXPECT_FALSE( robots.isAllowed( "/org/about.html" ));
EXPECT_FALSE( robots.isAllowed( "/org/plans.html" ));
EXPECT_FALSE( robots.isAllowed( "/%7Ejim/jim.html" ));
EXPECT_FALSE( robots.isAllowed( "/%7Emak/mak.html" ));
}
{
// webcrawler
TestRobots robots( robotsTxt, strlen(robotsTxt), "webcrawler" );
EXPECT_TRUE( robots.isAllowed( "/" ));
EXPECT_TRUE( robots.isAllowed( "/index.html" ));
//EXPECT_TRUE( robots.isAllowed( "/robots.txt" ));
EXPECT_TRUE( robots.isAllowed( "/server.html" ));
EXPECT_TRUE( robots.isAllowed( "/services/fast.html" ));
EXPECT_TRUE( robots.isAllowed( "/services/slow.html" ));
EXPECT_TRUE( robots.isAllowed( "/orgo.gif" ));
EXPECT_TRUE( robots.isAllowed( "/org/about.html" ));
EXPECT_TRUE( robots.isAllowed( "/org/plans.html" ));
EXPECT_TRUE( robots.isAllowed( "/%7Ejim/jim.html" ));
EXPECT_TRUE( robots.isAllowed( "/%7Emak/mak.html" ));
}
{
// excite
TestRobots robots( robotsTxt, strlen(robotsTxt), "excite" );
EXPECT_TRUE( robots.isAllowed( "/" ));
EXPECT_TRUE( robots.isAllowed( "/index.html" ));
//EXPECT_TRUE( robots.isAllowed( "/robots.txt" ));
EXPECT_TRUE( robots.isAllowed( "/server.html" ));
EXPECT_TRUE( robots.isAllowed( "/services/fast.html" ));
EXPECT_TRUE( robots.isAllowed( "/services/slow.html" ));
EXPECT_TRUE( robots.isAllowed( "/orgo.gif" ));
EXPECT_TRUE( robots.isAllowed( "/org/about.html" ));
EXPECT_TRUE( robots.isAllowed( "/org/plans.html" ));
EXPECT_TRUE( robots.isAllowed( "/%7Ejim/jim.html" ));
EXPECT_TRUE( robots.isAllowed( "/%7Emak/mak.html" ));
}
{
// other
TestRobots robots( robotsTxt, strlen(robotsTxt), "testbot" );
EXPECT_FALSE( robots.isAllowed( "/" ));
EXPECT_FALSE( robots.isAllowed( "/index.html" ));
//EXPECT_TRUE( robots.isAllowed( "/robots.txt" ));
EXPECT_TRUE( robots.isAllowed( "/server.html" ));
EXPECT_TRUE( robots.isAllowed( "/services/fast.html" ));
EXPECT_TRUE( robots.isAllowed( "/services/slow.html" ));
EXPECT_FALSE( robots.isAllowed( "/orgo.gif" ));
EXPECT_TRUE( robots.isAllowed( "/org/about.html" ));
EXPECT_FALSE( robots.isAllowed( "/org/plans.html" ));
EXPECT_FALSE( robots.isAllowed( "/%7Ejim/jim.html" ));
EXPECT_TRUE( robots.isAllowed( "/%7Emak/mak.html" ));
}
}
////////////////////////////////////////////////////////////////////////////////
// //
// Test real robots.txt //
// //
////////////////////////////////////////////////////////////////////////////////
static std::string loadRobotsFile( const char *fileName ) {
std::ifstream file( fileName );
if ( file.is_open() ) {
std::stringstream contents;
contents << file.rdbuf();
file.close();
return contents.str();
}
return "";
}
// empty file
TEST( RobotsTest, RRobotsSpeedtestNet ) {
std::string robotsTxt = loadRobotsFile( "robots/speedtest.net" );
TestRobots robots( robotsTxt.c_str(), robotsTxt.length() );
EXPECT_FALSE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_TRUE( robots.isAllowed( "/" ) );
EXPECT_TRUE( robots.isAllowed( "/index.html" ) );
}
// comments only
TEST( RobotsTest, RRobotsThekitchnCom ) {
std::string robotsTxt = loadRobotsFile( "robots/thekitchn.com" );
TestRobots robots( robotsTxt.c_str(), robotsTxt.length() );
EXPECT_FALSE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
EXPECT_TRUE( robots.isAllowed( "/" ) );
EXPECT_TRUE( robots.isAllowed( "/index.html" ) );
}
// wildcard use
TEST( RobotsTest, RRobotsRedditCom ) {
std::string robotsTxt = loadRobotsFile( "robots/reddit.com" );
TestRobots robots( robotsTxt.c_str(), robotsTxt.length() );
EXPECT_FALSE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_TRUE( robots.isDefaultUserAgentFound() );
EXPECT_FALSE( robots.isDefaultRulesEmpty() );
EXPECT_FALSE( robots.isAllowed( "/r/GameDeals/comments/4csg7b/steam_baldurs_gate_enhanced_edition_75_off_499/?sort=top") );
EXPECT_FALSE( robots.isAllowed( "/r/GameDeals/search?q=humble+bundle&restrict_sr=on") );
EXPECT_TRUE( robots.isAllowed( "/r/GameDeals/hot/") );
EXPECT_FALSE( robots.isAllowed( "/r/GameDeals.json" ) );
}
// many user agents
TEST( RobotsTest, RRobotsNeedromCom ) {
std::string robotsTxt = loadRobotsFile( "robots/needrom.com" );
TestRobots robots( robotsTxt.c_str(), robotsTxt.length() );
EXPECT_FALSE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_TRUE( robots.isDefaultUserAgentFound() );
EXPECT_FALSE( robots.isDefaultRulesEmpty() );
EXPECT_FALSE( robots.isAllowed( "/wp-admin/" ) );
EXPECT_TRUE( robots.isAllowed( "/download/galaxy-ace-duos-s6802/" ) );
}
// many disallow (no wildcard)
TEST( RobotsTest, RRobotsStateGov ) {
std::string robotsTxt = loadRobotsFile( "robots/state.gov" );
TestRobots robots( robotsTxt.c_str(), robotsTxt.length() );
EXPECT_FALSE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_TRUE( robots.isDefaultUserAgentFound() );
EXPECT_FALSE( robots.isDefaultRulesEmpty() );
EXPECT_TRUE( robots.isAllowed( "/documents/organization/81807.pdf" ) );
EXPECT_FALSE( robots.isAllowed( "/g/abc") );
}
// many disallow (with wildcard)
TEST( RobotsTest, RRobotsBoeEs ) {
std::string robotsTxt = loadRobotsFile( "robots/boe.es" );
TestRobots robots( robotsTxt.c_str(), robotsTxt.length() );
EXPECT_FALSE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_TRUE( robots.isDefaultUserAgentFound() );
EXPECT_FALSE( robots.isDefaultRulesEmpty() );
EXPECT_TRUE( robots.isAllowed( "/buscar/" ) );
EXPECT_FALSE( robots.isAllowed( "/buscar/doc.php?id=BOE-B-2015-14008") );
}
// url encoded / utf-8
TEST( RobotsTest, RRobotsWikipediaOrg ) {
std::string robotsTxt = loadRobotsFile( "robots/wikipedia.org" );
TestRobots robots( robotsTxt.c_str(), robotsTxt.length() );
EXPECT_FALSE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_TRUE( robots.isDefaultUserAgentFound() );
EXPECT_FALSE( robots.isDefaultRulesEmpty() );
/// @todo add some test cases
}
// no line endings (last line)
TEST( RobotsTest, RRobotsUpfEdu ) {
std::string robotsTxt = loadRobotsFile( "robots/upf.edu" );
TestRobots robots( robotsTxt.c_str(), robotsTxt.length() );
EXPECT_FALSE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_TRUE( robots.isDefaultUserAgentFound() );
EXPECT_FALSE( robots.isDefaultRulesEmpty() );
}
// no line endings with starting whitespaces (last line)
TEST( RobotsTest, RRobotsCoriolisIo ) {
std::string robotsTxt = loadRobotsFile( "robots/coriolis.io" );
TestRobots robots( robotsTxt.c_str(), robotsTxt.length() );
EXPECT_FALSE( robots.isUserAgentFound() );
EXPECT_TRUE( robots.isRulesEmpty() );
EXPECT_FALSE( robots.isDefaultUserAgentFound() );
EXPECT_TRUE( robots.isDefaultRulesEmpty() );
}