forked from Mirrors/privacore-open-source-search-engine
411 lines
12 KiB
C++
411 lines
12 KiB
C++
#include "UrlParser.h"
|
|
#include "Log.h"
|
|
#include "fctypes.h"
|
|
#include "Domains.h"
|
|
#include "ip.h"
|
|
#include <string.h>
|
|
#include <iterator>
|
|
|
|
/// Length-bounded variant of strpbrk().
///
/// Examines at most @p len bytes of @p str1 (stopping early at a NUL byte) and
/// returns the first byte that also occurs in the NUL-terminated set @p str2.
///
/// @param str1 start of the buffer to scan
/// @param len  maximum number of bytes of @p str1 to examine
/// @param str2 NUL-terminated set of bytes to search for
/// @return pointer to the first matching byte inside @p str1, or NULL if there is none
static const char* strnpbrk( const char *str1, size_t len, const char *str2 ) {
	const char * const limit = str1 + len;

	for ( const char *cur = str1; cur < limit && *cur != '\0'; ++cur ) {
		// compare the current byte against every byte of the accept set
		for ( const char *accept = str2; *accept != '\0'; ++accept ) {
			if ( *accept == *cur ) {
				return cur;
			}
		}
	}

	return NULL;
}
|
|
|
|
/// @todo ALC we should see if we need to do relative path resolution here
|
|
/// @todo ALC we should cater for scheme relative address ( pass in parent scheme )
|
|
/// https://tools.ietf.org/html/rfc3986#section-5.2
|
|
UrlParser::UrlParser( const char *url, size_t urlLen )
|
|
: m_url( url )
|
|
, m_urlLen( urlLen )
|
|
, m_scheme( NULL )
|
|
, m_schemeLen( 0 )
|
|
, m_authority( NULL )
|
|
, m_authorityLen( 0 )
|
|
, m_domain( NULL )
|
|
, m_domainLen( 0 )
|
|
, m_paths()
|
|
, m_pathEndChar('\0')
|
|
, m_pathsDeleteCount( 0 )
|
|
, m_queries()
|
|
, m_queriesMap()
|
|
, m_queriesDeleteCount( 0 )
|
|
, m_urlParsed() {
|
|
m_urlParsed.reserve( m_urlLen );
|
|
parse();
|
|
}
|
|
|
|
/// Dump the parsed url to the log at LOG_DEBUG level: the raw url, scheme,
/// authority, host and domain, followed by one line per path component and
/// one line per query component (each prefixed with its index).
void UrlParser::print() const {
	logf( LOG_DEBUG, "UrlParser::url       : %.*s", static_cast<uint32_t>( m_urlLen ), m_url );
	logf( LOG_DEBUG, "UrlParser::scheme    : %.*s", static_cast<uint32_t>( m_schemeLen ), m_scheme );
	logf( LOG_DEBUG, "UrlParser::authority : %.*s", static_cast<uint32_t>( m_authorityLen ), m_authority );
	logf( LOG_DEBUG, "UrlParser::host      : %.*s", static_cast<uint32_t>( m_hostLen ), m_host );
	logf( LOG_DEBUG, "UrlParser::domain    : %.*s", static_cast<uint32_t>( m_domainLen ), m_domain );

	// path components, numbered by their position in m_paths
	const auto pathsBegin = m_paths.begin();
	for ( auto pathIt = pathsBegin; pathIt != m_paths.end(); ++pathIt ) {
		const auto pathIndex = std::distance( pathsBegin, pathIt );
		logf( LOG_DEBUG, "UrlParser::path[%02zi]  : %s", pathIndex, pathIt->getString().c_str() );
	}

	// query components, numbered by their position in m_queries
	const auto queriesBegin = m_queries.begin();
	for ( auto queryIt = queriesBegin; queryIt != m_queries.end(); ++queryIt ) {
		const auto queryIndex = std::distance( queriesBegin, queryIt );
		logf( LOG_DEBUG, "UrlParser::query[%02zi] : %s", queryIndex, queryIt->getString().c_str() );
	}
}
|
|
|
|
void UrlParser::parse() {
|
|
// URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
|
|
|
|
const char *urlEnd = m_url + m_urlLen;
|
|
const char *currentPos = m_url;
|
|
|
|
// hier-part = "//" authority path-abempty
|
|
// / path-absolute
|
|
// / path-rootless
|
|
// / path-empty
|
|
|
|
const char *authorityPos = static_cast<const char*>( memmem( currentPos, urlEnd - currentPos, "//", 2 ) );
|
|
if ( authorityPos != NULL ) {
|
|
if ( authorityPos != currentPos ) {
|
|
m_scheme = currentPos;
|
|
m_schemeLen = authorityPos - currentPos - 1;
|
|
}
|
|
|
|
m_authority = authorityPos + 2;
|
|
currentPos = m_authority;
|
|
} else {
|
|
m_authority = currentPos;
|
|
}
|
|
|
|
const char *pathPos = static_cast<const char*>( memchr( currentPos, '/', urlEnd - currentPos ) );
|
|
if ( pathPos != NULL ) {
|
|
m_authorityLen = pathPos - m_authority;
|
|
currentPos = pathPos + 1;
|
|
} else {
|
|
m_authorityLen = urlEnd - m_authority;
|
|
|
|
// nothing else to process
|
|
return;
|
|
}
|
|
|
|
// @todo similar logic in Url.cpp ( merge this )
|
|
|
|
// authority = [ userinfo "@" ] host [ ":" port ]
|
|
const char *userInfoPos = static_cast<const char *>( memchr( m_authority, '@', m_authorityLen ) );
|
|
if ( userInfoPos != NULL ) {
|
|
m_host = userInfoPos + 1;
|
|
m_hostLen = m_authorityLen - ( userInfoPos - m_authority ) - 1;
|
|
} else {
|
|
m_host = m_authority;
|
|
m_hostLen = m_authorityLen;
|
|
}
|
|
|
|
const char *portPos = static_cast<const char *>( memrchr( m_host, ':', m_hostLen ) );
|
|
if ( portPos != NULL ) {
|
|
m_hostLen -= ( m_hostLen - ( portPos - m_host ) );
|
|
}
|
|
|
|
// host = IP-literal / IPv4address / reg-name
|
|
|
|
/// @todo ALC we should remove the const cast once we fix all the const issue
|
|
int32_t ip = atoip( m_host, m_hostLen );
|
|
if ( ip ) {
|
|
int32_t domainLen = 0;
|
|
m_domain = getDomainOfIp ( const_cast<char *>( m_host ), m_hostLen , &domainLen );
|
|
m_domainLen = domainLen;
|
|
} else {
|
|
const char *tldPos = ::getTLD( const_cast<char *>( m_host ), m_hostLen );
|
|
if ( tldPos ) {
|
|
size_t tldLen = m_host + m_hostLen - tldPos;
|
|
if ( tldLen < m_hostLen ) {
|
|
m_domain = static_cast<const char *>( memrchr( m_host, '.', m_hostLen - tldLen - 1 ) );
|
|
if ( m_domain ) {
|
|
m_domain += 1;
|
|
m_domainLen = m_hostLen - ( m_domain - m_host );
|
|
} else {
|
|
m_domain = m_host;
|
|
m_domainLen = m_hostLen;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
const char *queryPos = static_cast<const char*>( memchr( currentPos, '?', urlEnd - currentPos ) );
|
|
if ( queryPos != NULL ) {
|
|
currentPos = queryPos + 1;
|
|
}
|
|
|
|
const char *anchorPos = static_cast<const char*>( memrchr( currentPos, '#', urlEnd - currentPos ) );
|
|
// if ( anchorPos != NULL ) {
|
|
// currentPos = anchorPos + 1;
|
|
// }
|
|
|
|
const char *pathEnd = queryPos ?: anchorPos ?: urlEnd;
|
|
m_pathEndChar = *( pathEnd - 1 );
|
|
|
|
const char *queryEnd = anchorPos ?: urlEnd;
|
|
|
|
// path
|
|
const char *prevPos = pathPos + 1;
|
|
while ( prevPos && ( prevPos <= pathEnd ) ) {
|
|
size_t len = pathEnd - prevPos;
|
|
currentPos = strnpbrk( prevPos, len, "/;&" );
|
|
if ( currentPos ) {
|
|
len = currentPos - prevPos;
|
|
}
|
|
|
|
UrlComponent urlPart = UrlComponent( UrlComponent::TYPE_PATH, prevPos, len, *( prevPos - 1 ) );
|
|
|
|
m_paths.push_back( urlPart );
|
|
|
|
prevPos = currentPos ? currentPos + 1 : NULL;
|
|
}
|
|
|
|
// query
|
|
if ( queryPos ) {
|
|
prevPos = queryPos + 1;
|
|
|
|
bool isPrevAmpersand = false;
|
|
while ( prevPos && ( prevPos < queryEnd ) ) {
|
|
size_t len = queryEnd - prevPos;
|
|
currentPos = strnpbrk( prevPos, len, "&;" );
|
|
if ( currentPos ) {
|
|
len = currentPos - prevPos;
|
|
}
|
|
|
|
UrlComponent urlPart = UrlComponent( UrlComponent::TYPE_QUERY, prevPos, len, *( prevPos - 1 ) );
|
|
std::string key = urlPart.getKey();
|
|
|
|
// check previous urlPart
|
|
if ( isPrevAmpersand ) {
|
|
urlPart.setSeparator( '&' );
|
|
}
|
|
|
|
bool isAmpersand = ( !urlPart.hasValue() && urlPart.getKey() == "amp" );
|
|
if ( !key.empty() && !isAmpersand ) {
|
|
// we don't cater for case sensitive query parameter (eg: parm, Parm, PARM is assumed to be the same)
|
|
auto it = m_queriesMap.find( key );
|
|
if (it == m_queriesMap.end()) {
|
|
m_queries.push_back( urlPart );
|
|
m_queriesMap[key] = m_queries.size() - 1;
|
|
} else {
|
|
m_queries[it->second] = urlPart;
|
|
}
|
|
}
|
|
|
|
prevPos = currentPos ? currentPos + 1 : NULL;
|
|
isPrevAmpersand = isAmpersand;
|
|
}
|
|
}
|
|
}
|
|
|
|
/// @todo ALC a better way of doing this will be to check if the url has changed,
|
|
/// and call unparse automatically when getUrlParsed/getUrlParsedLen is called
|
|
void UrlParser::unparse() {
|
|
m_urlParsed.clear();
|
|
|
|
// domain
|
|
m_urlParsed.append( m_url, ( m_authority - m_url ) + m_authorityLen );
|
|
|
|
bool isFirst = true;
|
|
for ( auto it = m_paths.begin(); it != m_paths.end(); ++it ) {
|
|
if ( !it->isDeleted() ) {
|
|
if ( isFirst ) {
|
|
isFirst = false;
|
|
if ( it->getSeparator() != '/' ) {
|
|
m_urlParsed.append( "/" );
|
|
}
|
|
}
|
|
|
|
m_urlParsed += it->getSeparator();
|
|
m_urlParsed.append( it->getString() );
|
|
}
|
|
}
|
|
|
|
if ( m_urlParsed[ m_urlParsed.size() - 1 ] != '/' && m_pathEndChar == '/' ) {
|
|
m_urlParsed += m_pathEndChar;
|
|
}
|
|
|
|
isFirst = true;
|
|
for ( auto it = m_queries.begin(); it != m_queries.end(); ++it ) {
|
|
if ( !it->isDeleted() ) {
|
|
if ( isFirst ) {
|
|
isFirst = false;
|
|
m_urlParsed.append( "?" );
|
|
} else {
|
|
m_urlParsed += ( it->getSeparator() == '?' ) ? '&' : it->getSeparator();
|
|
}
|
|
|
|
m_urlParsed.append( it->getString() );
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Mark @p urlComponent as deleted and update the matching delete counter.
///
/// For a query component its key is also removed from m_queriesMap.
/// A NULL pointer or a component that is already deleted is ignored, so the
/// delete counters are incremented at most once per component (they are
/// compared against the container sizes by the match*() functions).
///
/// @param urlComponent component to delete; may be NULL
void UrlParser::deleteComponent( UrlComponent *urlComponent ) {
	if ( urlComponent == NULL || urlComponent->isDeleted() ) {
		return;
	}

	urlComponent->setDeleted();

	switch ( urlComponent->getType() ) {
		case UrlComponent::TYPE_PATH:
			++m_pathsDeleteCount;
			break;
		case UrlComponent::TYPE_QUERY:
			++m_queriesDeleteCount;

			// also remove from map
			m_queriesMap.erase( urlComponent->getKey() );
			break;
	}
}
|
|
|
|
/// Delete every not-yet-deleted component in @p urlComponents that the
/// validator accepts.
///
/// A component that has a value is deleted when validator.isValid() returns
/// true for it; a component without a value is deleted when
/// validator.allowEmptyValue() returns true.
///
/// @param urlComponents candidate components (entries are dereferenced without a NULL check)
/// @param validator     decides which candidates get deleted
/// @return true if at least one component was deleted
bool UrlParser::removeComponent( const std::vector<UrlComponent*> &urlComponents, const UrlComponent::Validator &validator ) {
	bool removedAny = false;

	for ( UrlComponent *component : urlComponents ) {
		if ( component->isDeleted() ) {
			continue;
		}

		const bool shouldRemove = component->hasValue() ? validator.isValid( *component )
		                                                : validator.allowEmptyValue();
		if ( shouldRemove ) {
			removedAny = true;
			deleteComponent( component );
		}
	}

	return removedAny;
}
|
|
|
|
/// Find path components that act as a key followed by its value in the next
/// path component (e.g. ".../key/value/...").
///
/// A component matches when it is not deleted, has no value of its own and
/// matcher.isMatching() accepts it.
///
/// @param matcher selects the key components
/// @return one pair per match: first is the matching component, second is the
///         component stored right after it in m_paths, or NULL when the match
///         is the last component
std::vector<std::pair<UrlComponent*, UrlComponent*> > UrlParser::matchPath( const UrlComponent::Matcher &matcher ) {
	std::vector<std::pair<UrlComponent*, UrlComponent*> > matches;

	// don't need to loop if it's all deleted
	if ( m_pathsDeleteCount == m_paths.size() ) {
		return matches;
	}

	for ( auto keyIt = m_paths.begin(); keyIt != m_paths.end(); ++keyIt ) {
		if ( keyIt->isDeleted() || keyIt->hasValue() || !matcher.isMatching( *keyIt ) ) {
			continue;
		}

		const auto valueIt = std::next( keyIt );
		UrlComponent *value = ( valueIt == m_paths.end() ) ? NULL : &( *valueIt );

		matches.push_back( std::make_pair( &( *keyIt ), value ) );
	}

	return matches;
}
|
|
|
|
bool UrlParser::removePath( const std::vector<std::pair<UrlComponent*, UrlComponent*> > &urlComponents,
|
|
const UrlComponent::Validator &validator ) {
|
|
bool hasRemoval = false;
|
|
for ( auto it = urlComponents.begin(); it != urlComponents.end(); ++it ) {
|
|
if ( it->second == NULL ) {
|
|
if ( validator.allowEmptyValue() ) {
|
|
hasRemoval = true;
|
|
deleteComponent( it->first );
|
|
}
|
|
} else {
|
|
if ( validator.isValid( *( it->second ) ) ) {
|
|
hasRemoval = true;
|
|
deleteComponent( it->first );
|
|
deleteComponent( it->second );
|
|
}
|
|
}
|
|
}
|
|
|
|
return hasRemoval;
|
|
}
|
|
|
|
/// Convenience overload: collect the key/value path pairs selected by
/// @p matcher with matchPath() and delete those accepted by @p validator.
///
/// @return true if at least one pair was deleted
bool UrlParser::removePath( const UrlComponent::Matcher &matcher, const UrlComponent::Validator &validator ) {
	return removePath( matchPath( matcher ), validator );
}
|
|
|
|
/// Find path components that carry their own value and are accepted by
/// @p matcher.
///
/// Deleted components are skipped.
///
/// @param matcher selects the components
/// @return pointers to the matching elements of m_paths
std::vector<UrlComponent*> UrlParser::matchPathParam( const UrlComponent::Matcher &matcher ) {
	std::vector<UrlComponent*> matches;

	// don't need to loop if it's all deleted
	if ( m_pathsDeleteCount == m_paths.size() ) {
		return matches;
	}

	for ( auto &pathPart : m_paths ) {
		if ( pathPart.isDeleted() || !pathPart.hasValue() || !matcher.isMatching( pathPart ) ) {
			continue;
		}

		matches.push_back( &pathPart );
	}

	return matches;
}
|
|
|
|
/// Delete the given path parameters that @p validator accepts; forwards to
/// removeComponent().
///
/// @return true if at least one component was deleted
bool UrlParser::removePathParam( const std::vector<UrlComponent*> &urlComponents, const UrlComponent::Validator &validator ) {
	const bool removedAny = removeComponent( urlComponents, validator );
	return removedAny;
}
|
|
|
|
/// Convenience overload: collect the path parameters selected by @p matcher
/// with matchPathParam() and delete those accepted by @p validator.
///
/// @return true if at least one component was deleted
bool UrlParser::removePathParam( const UrlComponent::Matcher &matcher, const UrlComponent::Validator &validator ) {
	return removeComponent( matchPathParam( matcher ), validator );
}
|
|
|
|
/// Find the query components selected by @p matcher.
///
/// With MATCH_DEFAULT criteria the parameter is looked up directly in
/// m_queriesMap by matcher.getParam(), giving at most one result. With any
/// other criteria every non-deleted query component is tested with
/// matcher.isMatching().
///
/// @param matcher selects the components
/// @return pointers to the matching elements of m_queries
std::vector<UrlComponent*> UrlParser::matchQueryParam( const UrlComponent::Matcher &matcher ) {
	std::vector<UrlComponent*> matches;

	// don't need to loop if it's all deleted
	if ( m_queriesDeleteCount == m_queries.size() ) {
		return matches;
	}

	if ( matcher.getMatchCriteria() == MATCH_DEFAULT ) {
		// direct lookup by key
		const auto found = m_queriesMap.find( matcher.getParam() );
		if ( found != m_queriesMap.end() ) {
			matches.push_back( &( m_queries[ found->second ] ) );
		}

		return matches;
	}

	// any other criteria: test each remaining component
	for ( auto &queryPart : m_queries ) {
		if ( !queryPart.isDeleted() && matcher.isMatching( queryPart ) ) {
			matches.push_back( &queryPart );
		}
	}

	return matches;
}
|
|
|
|
bool UrlParser::removeQueryParam( const char *param ) {
|
|
static const UrlComponent::Validator s_validator( 0, 0, true, ALLOW_ALL, MANDATORY_NONE );
|
|
|
|
return removeQueryParam( UrlComponent::Matcher( param ), s_validator );
|
|
}
|
|
|
|
/// Delete the given query parameters that @p validator accepts; forwards to
/// removeComponent().
///
/// @return true if at least one component was deleted
bool UrlParser::removeQueryParam( const std::vector<UrlComponent*> &urlComponents, const UrlComponent::Validator &validator ) {
	const bool removedAny = removeComponent( urlComponents, validator );
	return removedAny;
}
|
|
|
|
/// Convenience overload: collect the query parameters selected by @p matcher
/// with matchQueryParam() and delete those accepted by @p validator.
///
/// @return true if at least one component was deleted
bool UrlParser::removeQueryParam( const UrlComponent::Matcher &matcher, const UrlComponent::Validator &validator ) {
	const std::vector<UrlComponent*> matches = matchQueryParam( matcher );

	return removeComponent( matches, validator );
}
|