Merge branch 'master' of github.com:privacore/open-source-search-engine

This commit is contained in:
Ivan Skytte Jørgensen
2016-05-25 23:13:56 +02:00
4 changed files with 59 additions and 53 deletions

73
Url.cpp

@ -633,46 +633,27 @@ static void stripParameters( UrlParser *urlParser ) {
UrlComponent *cUrlComponent = ( cQueryMatches.size() == 1 ) ? cQueryMatches[0] : NULL;
UrlComponent *oUrlComponent = ( oQueryMatches.size() == 1 ) ? oQueryMatches[0] : NULL;
bool deleteC = false;
bool deleteO = false;
if ( cUrlComponent ) {
if ( cUrlComponent->getValueLen() == 0 ) {
deleteC = true;
urlParser->deleteComponent( cUrlComponent );
} else if ( cUrlComponent->getValueLen() == 1 ) {
char c = *( cUrlComponent->getValue() );
if ( c == 'N' || c == 'M' || c == 'S' || c == 'D' ) {
deleteC = true;
urlParser->deleteComponent( cUrlComponent );
}
}
}
if ( oUrlComponent ) {
if ( oUrlComponent->getValueLen() == 0 ) {
deleteO = true;
urlParser->deleteComponent( oUrlComponent );
} else if ( oUrlComponent->getValueLen() == 1 ) {
char o = *( oUrlComponent->getValue() );
if ( o == 'A' || o == 'D' ) {
deleteO = true;
urlParser->deleteComponent( oUrlComponent );
}
}
}
if ( urlParser->getQueryParamCount() == 2 ) {
if ( deleteC && deleteO ) {
urlParser->deleteComponent( cUrlComponent );
urlParser->deleteComponent( oUrlComponent );
}
} else {
if ( deleteC ) {
urlParser->deleteComponent( cUrlComponent );
}
if ( deleteO ) {
oUrlComponent->setDeleted();
urlParser->deleteComponent( oUrlComponent );
}
}
}
/// @todo ALC token?
@ -774,31 +755,35 @@ static void stripParameters( UrlParser *urlParser ) {
/// @todo ALC cater for more affiliate links here
if ( strncmp( urlParser->getDomain(), "amazon.", 7 ) == 0 ) {
// amazon
// https://www.reddit.com/r/GameDeals/wiki/affiliate
// only check domain specific logic when we have a domain
if ( urlParser->getDomain() ) {
if ( strncmp( urlParser->getDomain(), "amazon.", 7 ) == 0 ) {
// amazon
// https://www.reddit.com/r/GameDeals/wiki/affiliate
// affiliate
urlParser->removeQueryParam( "tag" );
// affiliate
urlParser->removeQueryParam( "tag" );
// wishlist
urlParser->removeQueryParam( "coliid" );
urlParser->removeQueryParam( "colid" );
// wishlist
urlParser->removeQueryParam( "coliid" );
urlParser->removeQueryParam( "colid" );
// reference
urlParser->removeQueryParam( "ref" );
urlParser->removePathParam( UrlComponent::Matcher( "ref" ), UrlComponent::Validator( 0, 0, false, ALLOW_ALL, MANDATORY_PUNCTUATION ) );
} else if ( strncmp( urlParser->getDomain(), "ebay.", 5 ) == 0 ) {
// ebay
// http://www.ebaypartnernetworkblog.com/en/2009/05/new-link-generator-tool-additional-information/
// reference
urlParser->removeQueryParam( "ref" );
urlParser->removePathParam( UrlComponent::Matcher( "ref" ),
UrlComponent::Validator( 0, 0, false, ALLOW_ALL, MANDATORY_PUNCTUATION ) );
} else if ( strncmp( urlParser->getDomain(), "ebay.", 5 ) == 0 ) {
// ebay
// http://www.ebaypartnernetworkblog.com/en/2009/05/new-link-generator-tool-additional-information/
urlParser->removeQueryParam( "icep_ff3" );
urlParser->removeQueryParam( "pub" );
urlParser->removeQueryParam( "toolid" );
urlParser->removeQueryParam( "campid" );
urlParser->removeQueryParam( "customid" );
urlParser->removeQueryParam( "afepn" );
urlParser->removeQueryParam( "pid" );
urlParser->removeQueryParam( "icep_ff3" );
urlParser->removeQueryParam( "pub" );
urlParser->removeQueryParam( "toolid" );
urlParser->removeQueryParam( "campid" );
urlParser->removeQueryParam( "customid" );
urlParser->removeQueryParam( "afepn" );
urlParser->removeQueryParam( "pid" );
}
}
}

@ -97,7 +97,7 @@ void UrlParser::parse() {
const char *userInfoPos = static_cast<const char *>( memchr( m_authority, '@', m_authorityLen ) );
if ( userInfoPos != NULL ) {
m_host = userInfoPos + 1;
m_hostLen = m_authorityLen - ( userInfoPos - m_authority );
m_hostLen = m_authorityLen - ( userInfoPos - m_authority ) - 1;
} else {
m_host = m_authority;
m_hostLen = m_authorityLen;
@ -119,14 +119,11 @@ void UrlParser::parse() {
if ( m_domain ) {
m_domain += 1;
m_domainLen = m_hostLen - ( m_domain - m_host );
} else {
m_domain = m_host;
m_domainLen = m_hostLen;
}
}
// defaults to host
if ( !m_domain ) {
m_domain = m_host;
m_domainLen = m_hostLen;
}
}
const char *queryPos = static_cast<const char*>( memchr( currentPos, '?', urlEnd - currentPos ) );

@ -30,6 +30,14 @@ TEST( UrlParserTest, ParseSchemeNone ) {
}
// URL carrying a "username:password@" userinfo prefix: the test expects the
// authority to include the userinfo and the domain to be just "example.com".
TEST( UrlParserTest, ParseUserInfo ) {
	const std::string input = "http://username:password@www.example.com/param1=abc-123";
	UrlParser parser( input.c_str(), input.size() );

	// authority is compared against the full "userinfo@host" text
	checkResult( "username:password@www.example.com", parser.getAuthority(), parser.getAuthorityLen() );

	// domain is compared against the host minus userinfo and the leading "www." label
	checkResult( "example.com", parser.getDomain(), parser.getDomainLen() );
}
TEST( UrlParserTest, ParseUserInfoPort ) {
std::string url( "http://username:password@www.example.com:8080/param1=abc-123" );
UrlParser urlParser( url.c_str(), url.size() );
@ -53,6 +61,22 @@ TEST( UrlParserTest, ParsePortSchemeNone ) {
checkResult( "example.com", urlParser.getDomain(), urlParser.getDomainLen() );
}
// URL whose host is a dotted-quad address: the test expects the authority to
// be the address itself and the domain to be empty.
TEST( UrlParserTest, ParseIP ) {
	const std::string input = "http://127.0.0.1/param1=abc-123";
	UrlParser parser( input.c_str(), input.size() );

	checkResult( "127.0.0.1", parser.getAuthority(), parser.getAuthorityLen() );

	// no domain is expected for a numeric host
	checkResult( "", parser.getDomain(), parser.getDomainLen() );
}
// Same as the dotted-quad case but with an explicit port: the test expects the
// authority to keep ":8080" and the domain to be empty.
TEST( UrlParserTest, ParseIPPort ) {
	const std::string input = "http://127.0.0.1:8080/param1=abc-123";
	UrlParser parser( input.c_str(), input.size() );

	// the port is part of the expected authority text
	checkResult( "127.0.0.1:8080", parser.getAuthority(), parser.getAuthorityLen() );

	// no domain is expected for a numeric host, with or without a port
	checkResult( "", parser.getDomain(), parser.getDomainLen() );
}
TEST( UrlParserTest, ParseSubdomainNone ) {
std::string url( "http://example.com/param1=abc-123" );
UrlParser urlParser( url.c_str(), url.size() );
@ -83,7 +107,7 @@ TEST( UrlParserTest, ParseTLDNone ) {
UrlParser urlParser( url.c_str(), url.size() );
checkResult( "ok", urlParser.getAuthority(), urlParser.getAuthorityLen() );
checkResult( "ok", urlParser.getDomain(), urlParser.getDomainLen() );
checkResult( "", urlParser.getDomain(), urlParser.getDomainLen() );
}
TEST( UrlParserTest, ParseSLD ) {

@ -521,7 +521,7 @@ TEST( UrlTest, StripApacheDirSort ) {
std::make_tuple( "http://www.3ddx.com/blog/wp-includes/SimplePie/Decode/HTML/?C=N;O=D",
"http://www.3ddx.com/blog/wp-includes/SimplePie/Decode/HTML/" ),
std::make_tuple( "http://macports.mirror.ac.za/release/ports/www/midori/?C=M&O=A",
"http://macports.mirror.ac.za/release/ports/www/midori/" )
"http://macports.mirror.ac.za/release/ports/www/midori/" )
};
strip_param_tests( test_cases, 123 );