Files
privacore-open-source-searc…/UrlParser.cpp
2016-12-22 12:42:56 +01:00

484 lines
13 KiB
C++

#include "UrlParser.h"
#include "Log.h"
#include "fctypes.h"
#include "Domains.h"
#include "ip.h"
#include <string.h>
#include <iterator>
/// Length-bounded variant of strpbrk.
///
/// Looks at no more than @p len characters of @p str1 (stopping early if a
/// NUL terminator is reached) and reports the first character that is also
/// present in the NUL-terminated character set @p str2.
///
/// @param str1 buffer to scan
/// @param len  maximum number of characters of @p str1 to examine
/// @param str2 NUL-terminated set of characters to look for
/// @return pointer into @p str1 at the first matching character, or NULL
///         when none of the examined characters is in @p str2
static const char *strnpbrk(const char *str1, size_t len, const char *str2) {
	const char *const limit = str1 + len;

	for (const char *cursor = str1; cursor < limit && *cursor != '\0'; ++cursor) {
		for (const char *accept = str2; *accept != '\0'; ++accept) {
			if (*accept == *cursor) {
				return cursor;
			}
		}
	}

	return NULL;
}
/// @todo ALC we should see if we need to do relative path resolution here
/// @todo ALC we should cater for scheme relative address (pass in parent scheme)
/// https://tools.ietf.org/html/rfc3986#section-5.2
/// Build a parser for the given url and parse it right away.
///
/// Only the pointer and the length are stored; no copy of the url buffer is
/// made here. All component members start out empty and are filled in by
/// parse(), which is invoked before the constructor returns.
///
/// @param url    start of the url text
/// @param urlLen number of characters in @p url
UrlParser::UrlParser(const char *url, size_t urlLen)
	: m_url(url),
	  m_urlLen(urlLen),
	  m_scheme(nullptr),
	  m_schemeLen(0),
	  m_authority(nullptr),
	  m_authorityLen(0),
	  m_host(nullptr),
	  m_hostLen(0),
	  m_port(nullptr),
	  m_portLen(0),
	  m_domain(nullptr),
	  m_domainLen(0),
	  m_paths(),
	  m_pathEndChar('\0'),
	  m_pathsDeleteCount(0),
	  m_queries(),
	  m_queriesMap(),
	  m_queriesDeleteCount(0),
	  m_urlParsed() {
	// pre-size the rebuild buffer to the length of the input url
	m_urlParsed.reserve(m_urlLen);

	parse();
}
/// Write every parsed component to the debug log, one logf() call per item.
///
/// The scalar components are (pointer, length) pairs, so they are printed
/// with an explicit "%.*s" precision. Path and query components are listed
/// with their position in the container and get a " (deleted)" suffix when
/// they are flagged as deleted.
void UrlParser::print() const {
	logf(LOG_DEBUG, "UrlParser::url : '%.*s'",
	     static_cast<uint32_t>(m_urlLen), m_url);
	logf(LOG_DEBUG, "UrlParser::scheme : '%.*s'",
	     static_cast<uint32_t>(m_schemeLen), m_scheme);
	logf(LOG_DEBUG, "UrlParser::authority : '%.*s'",
	     static_cast<uint32_t>(m_authorityLen), m_authority);
	logf(LOG_DEBUG, "UrlParser::host : '%.*s'",
	     static_cast<uint32_t>(m_hostLen), m_host);
	logf(LOG_DEBUG, "UrlParser::domain : '%.*s'",
	     static_cast<uint32_t>(m_domainLen), m_domain);
	logf(LOG_DEBUG, "UrlParser::port : '%.*s'",
	     static_cast<uint32_t>(m_portLen), m_port);

	// path components
	const auto pathsBegin = m_paths.begin();
	const auto pathsEnd = m_paths.end();
	for (auto pathIt = pathsBegin; pathIt != pathsEnd; ++pathIt) {
		const char *deletedTag = pathIt->isDeleted() ? " (deleted)" : "";
		logf(LOG_DEBUG, "UrlParser::path[%02zi] : '%s'%s",
		     std::distance(pathsBegin, pathIt), pathIt->getString().c_str(), deletedTag);
	}

	// query components
	const auto queriesBegin = m_queries.begin();
	const auto queriesEnd = m_queries.end();
	for (auto queryIt = queriesBegin; queryIt != queriesEnd; ++queryIt) {
		const char *deletedTag = queryIt->isDeleted() ? " (deleted)" : "";
		logf(LOG_DEBUG, "UrlParser::query[%02zi] : '%s'%s",
		     std::distance(queriesBegin, queryIt), queryIt->getString().c_str(), deletedTag);
	}
}
/// Split the raw url (m_url, m_urlLen) into its components.
///
/// Fills in the scheme, authority, host, port and domain slices (all of them
/// point into the m_url buffer), then breaks the path into m_paths and the
/// query string into m_queries / m_queriesMap. The fragment is only located
/// so that it can be left out of the path and the query.
///
/// Path segments "." and ".." are stored flagged as deleted; ".." also
/// deletes the preceding live segment(s), walking backwards until a segment
/// introduced by '/' has been deleted. A query parameter whose key was
/// already seen overwrites the earlier entry; parameters with an empty key
/// and bare "amp" tokens are not stored.
void UrlParser::parse() {
	// URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
	const char *urlEnd = m_url + m_urlLen;
	const char *currentPos = m_url;

	// hier-part = "//" authority path-abempty
	//           / path-absolute
	//           / path-rootless
	//           / path-empty
	// NOTE(review): the first "//" anywhere in the url is taken as the
	// authority marker, so a scheme-less url with "//" in its path would be
	// split there — confirm whether callers can pass such urls.
	const char *authorityPos = static_cast<const char *>(memmem(currentPos, urlEnd - currentPos, "//", 2));
	if (authorityPos != NULL) {
		if (authorityPos != currentPos) {
			// text in front of "//" is the scheme; the -1 drops the character
			// directly before "//" (the ':' of "scheme://")
			m_scheme = currentPos;
			m_schemeLen = authorityPos - currentPos - 1;
		}

		m_authority = authorityPos + 2;
		currentPos = m_authority;
	} else {
		// no "//" at all: the url starts directly with the authority
		m_authority = currentPos;
	}

	// the authority runs up to the first '/' (or to the end of the url)
	const char *pathPos = static_cast<const char *>(memchr(currentPos, '/', urlEnd - currentPos));
	if (pathPos != NULL) {
		m_authorityLen = pathPos - m_authority;
		currentPos = pathPos + 1;
	} else {
		m_authorityLen = urlEnd - m_authority;
	}

	// @todo similar logic in Url.cpp (merge this)
	// authority = [ userinfo "@" ] host [ ":" port ]
	const char *userInfoPos = static_cast<const char *>(memchr(m_authority, '@', m_authorityLen));
	if (userInfoPos != NULL) {
		m_host = userInfoPos + 1;
		m_hostLen = m_authorityLen - (userInfoPos - m_authority) - 1;
	} else {
		m_host = m_authority;
		m_hostLen = m_authorityLen;
	}

	// the last ':' inside the host part separates host and port
	const char *portPos = static_cast<const char *>(memrchr(m_host, ':', m_hostLen));
	if (portPos != NULL) {
		m_port = portPos + 1;
		m_portLen = m_authorityLen - (portPos - m_authority) - 1;

		// shrink the host so that it ends right before the ':'
		m_hostLen -= (m_hostLen - (portPos - m_host));
	}

	// host = IP-literal / IPv4address / reg-name
	/// @todo ALC we should remove the const cast once we fix all the const issue
	int32_t ip = atoip(m_host, m_hostLen);
	if (ip) {
		int32_t domainLen = 0;
		m_domain = getDomainOfIp(const_cast<char *>(m_host), m_hostLen, &domainLen);
		m_domainLen = domainLen;
	} else {
		const char *tldPos = ::getTLD(const_cast<char *>(m_host), m_hostLen);
		if (tldPos) {
			size_t tldLen = m_host + m_hostLen - tldPos;
			if (tldLen < m_hostLen) {
				// domain = the label directly in front of the tld, plus the tld;
				// search for the '.' that precedes that label
				m_domain = static_cast<const char *>(memrchr(m_host, '.', m_hostLen - tldLen - 1));
				if (m_domain) {
					m_domain += 1;
					m_domainLen = m_hostLen - (m_domain - m_host);
				} else {
					// no further '.': the whole host is the domain
					m_domain = m_host;
					m_domainLen = m_hostLen;
				}
			}
		}
	}

	if (pathPos == NULL) {
		// nothing else to process
		return;
	}

	const char *queryPos = static_cast<const char *>(memchr(currentPos, '?', urlEnd - currentPos));
	if (queryPos != NULL) {
		currentPos = queryPos + 1;
	}

	/// @note url fragment is stripped and not part of the rebuild url
	const char *fragmentPos = static_cast<const char *>(memrchr(currentPos, '#', urlEnd - currentPos));
	if (fragmentPos != NULL) {
		// https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
		// don't treat '#!' as anchor
		// memrchr only returns positions inside the buffer, so the character
		// after '#' may be read only when it is still in front of urlEnd
		if (fragmentPos + 1 < urlEnd && *(fragmentPos + 1) == '!') {
			fragmentPos = NULL;
		}
	}

	const char *pathEnd = queryPos ? queryPos : (fragmentPos ? fragmentPos : urlEnd);
	// pathEnd is at least pathPos + 1, so pathEnd - 1 is inside the url
	m_pathEndChar = *(pathEnd - 1);

	const char *queryEnd = fragmentPos ? fragmentPos : urlEnd;

	// path
	bool updatePathEndChar = false;
	const char *prevPos = pathPos + 1;
	while (prevPos && (prevPos <= pathEnd)) {
		size_t len = pathEnd - prevPos;
		currentPos = strnpbrk(prevPos, len, "/;&");
		if (currentPos) {
			len = currentPos - prevPos;
		}

		// the character in front of the component is recorded as its separator
		UrlComponent urlPart = UrlComponent(UrlComponent::TYPE_PATH, prevPos, len, *(prevPos - 1));

		// check for special cases before adding to m_paths
		if (len == 1 && memcmp(prevPos, ".", 1) == 0) {
			// "." is flagged directly, m_pathsDeleteCount is not touched for it
			urlPart.setDeleted();
			updatePathEndChar = true;
		} else if (len == 2 && memcmp(prevPos, "..", 2) == 0) {
			deleteComponent(&urlPart);
			updatePathEndChar = true;

			// also delete the preceding live component(s), stopping once a
			// component introduced by '/' has been deleted
			for (auto it = m_paths.rbegin(); it != m_paths.rend(); ++it) {
				if (it->isDeleted()) {
					continue;
				}

				deleteComponent(&(*it));
				if (it->getSeparator() == '/') {
					break;
				}
			}
		}

		m_paths.push_back(urlPart);

		prevPos = currentPos ? currentPos + 1 : NULL;
	}

	// set pathEndChar to component after last non-deleted component (if exist)
	if (updatePathEndChar) {
		for (auto it = m_paths.rbegin(); it != m_paths.rend(); ++it) {
			if (it->isDeleted()) {
				continue;
			}

			// std::prev on a reverse iterator is the next component in url order
			if (it != m_paths.rbegin()) {
				m_pathEndChar = std::prev(it)->getSeparator();
			}
			break;
		}
	}

	// query
	if (queryPos) {
		prevPos = queryPos + 1;

		bool isPrevAmpersand = false;
		while (prevPos && (prevPos < queryEnd)) {
			size_t len = queryEnd - prevPos;
			currentPos = strnpbrk(prevPos, len, "&;");
			if (currentPos) {
				len = currentPos - prevPos;
			}

			UrlComponent urlPart = UrlComponent(UrlComponent::TYPE_QUERY, prevPos, len, *(prevPos - 1));
			std::string key = urlPart.getKey();

			// check previous urlPart
			// the component that follows a bare "amp" token gets '&' as its separator
			if (isPrevAmpersand) {
				urlPart.setSeparator('&');
			}

			bool isAmpersand = (!urlPart.hasValue() && urlPart.getKey() == "amp");
			if (!key.empty() && !isAmpersand) {
				// we don't cater for case sensitive query parameter (eg: parm, Parm, PARM is assumed to be the same)
				auto it = m_queriesMap.find(key);
				if (it == m_queriesMap.end()) {
					m_queries.push_back(urlPart);
					m_queriesMap[key] = m_queries.size() - 1;
				} else {
					// repeated key: overwrite the earlier occurrence in place
					m_queries[it->second] = urlPart;
				}
			}

			prevPos = currentPos ? currentPos + 1 : NULL;
			isPrevAmpersand = isAmpersand;
		}
	}
}
/// @todo ALC a better way of doing this will be to check if the url has changed,
/// and call unparse automatically when getUrlParsed/getUrlParsedLen is called
/// Rebuild the url from the parsed components into m_urlParsed.
///
/// The result is assembled as: lower-cased scheme ("http" when the url had
/// none), "://", the userinfo part verbatim, the lower-cased host, the port
/// (when one was parsed), every path component that is not deleted, and
/// every query component that is not deleted. The fragment is not part of
/// the rebuilt url.
void UrlParser::unparse() {
	m_urlParsed.clear();

	// scheme
	if (m_scheme == NULL || m_schemeLen == 0) {
		m_urlParsed.append("http");
	} else {
		for (size_t i = 0; i < m_schemeLen; ++i) {
			// tolower() requires a value representable as unsigned char; a
			// plain char may be negative for bytes >= 0x80
			m_urlParsed.push_back(static_cast<char>(tolower(static_cast<unsigned char>(m_scheme[i]))));
		}
	}
	m_urlParsed.append("://");

	// userinfo '@' (the part of the authority in front of the host)
	m_urlParsed.append(m_authority, m_host - m_authority);

	// host
	for (size_t i = 0; i < m_hostLen; ++i) {
		// same unsigned char conversion as for the scheme
		m_urlParsed.push_back(static_cast<char>(tolower(static_cast<unsigned char>(m_host[i]))));
	}

	// port
	if (m_port) {
		m_urlParsed.push_back(':');
		m_urlParsed.append(m_port, m_portLen);
	}

	// path: each live component is written as separator + text
	bool isFirst = true;
	for (auto it = m_paths.begin(); it != m_paths.end(); ++it) {
		if (!it->isDeleted()) {
			if (isFirst) {
				isFirst = false;
				// make sure the path starts with '/' even when the first live
				// component was introduced by another separator
				if (it->getSeparator() != '/') {
					m_urlParsed.append("/");
				}
			}

			m_urlParsed += it->getSeparator();
			m_urlParsed.append(it->getString());
		}
	}

	// restore a trailing '/' if the path ended with one
	// (m_urlParsed is never empty here, "://" has already been appended)
	if (m_urlParsed[m_urlParsed.size() - 1] != '/' && m_pathEndChar == '/') {
		m_urlParsed += m_pathEndChar;
	}

	// query: the first live component is introduced by '?', later ones by
	// their own separator ('?' is replaced by '&')
	isFirst = true;
	for (auto it = m_queries.begin(); it != m_queries.end(); ++it) {
		if (!it->isDeleted()) {
			if (isFirst) {
				isFirst = false;
				m_urlParsed.append("?");
			} else {
				m_urlParsed += (it->getSeparator() == '?') ? '&' : it->getSeparator();
			}

			m_urlParsed.append(it->getString());
		}
	}
}
/// Flag a component as deleted and keep the bookkeeping in sync.
///
/// For a path component the path delete counter is incremented; for a query
/// component the query delete counter is incremented and its key is removed
/// from m_queriesMap.
///
/// The call is a no-op for a NULL pointer and for a component that is already
/// flagged as deleted. The second guard keeps the delete counters from being
/// incremented more than once for the same component (e.g. when the same
/// component is handed in twice by a caller); the counters are compared
/// against the container sizes to decide whether any live component is left.
///
/// @param urlComponent component to delete (may be NULL)
void UrlParser::deleteComponent(UrlComponent *urlComponent) {
	if (urlComponent == NULL || urlComponent->isDeleted()) {
		return;
	}

	urlComponent->setDeleted();

	switch (urlComponent->getType()) {
		case UrlComponent::TYPE_PATH:
			++m_pathsDeleteCount;
			break;
		case UrlComponent::TYPE_QUERY:
			++m_queriesDeleteCount;

			// also remove from map
			m_queriesMap.erase(urlComponent->getKey());
			break;
	}
}
/// Delete every listed component that the validator approves for removal.
///
/// Components already flagged as deleted are skipped. A component with a
/// value is deleted when validator.isValid() accepts it; a component without
/// a value is deleted when validator.allowEmptyValue() is true.
///
/// @param urlComponents candidate components
/// @param validator     decides which candidates are deleted
/// @return true when at least one component was deleted
bool UrlParser::removeComponent(const std::vector<UrlComponent *> &urlComponents, const UrlComponent::Validator &validator) {
	bool removedAny = false;

	for (UrlComponent *component : urlComponents) {
		if (component->isDeleted()) {
			continue;
		}

		const bool shouldRemove = component->hasValue() ? validator.isValid(*component)
		                                                : validator.allowEmptyValue();
		if (shouldRemove) {
			deleteComponent(component);
			removedAny = true;
		}
	}

	return removedAny;
}
/// Collect the path components that match as a key, each paired with the
/// component that follows it.
///
/// Only components that are not deleted and have no value of their own are
/// tested against the matcher. The second element of a pair is the next
/// entry of m_paths, or NULL when the match is the last entry.
///
/// @param matcher selects the key components
/// @return (key, following component) pairs pointing into m_paths
std::vector<std::pair<UrlComponent *, UrlComponent *> > UrlParser::matchPath(const UrlComponent::Matcher &matcher) {
	std::vector<std::pair<UrlComponent *, UrlComponent *> > matches;

	// nothing to scan when every path component has been deleted
	if (m_pathsDeleteCount != m_paths.size()) {
		const auto pathsEnd = m_paths.end();
		for (auto keyIt = m_paths.begin(); keyIt != pathsEnd; ++keyIt) {
			if (keyIt->isDeleted() || keyIt->hasValue() || !matcher.isMatching(*keyIt)) {
				continue;
			}

			const auto followingIt = std::next(keyIt, 1);
			UrlComponent *keyComponent = &(*keyIt);
			UrlComponent *valueComponent = NULL;
			if (followingIt != pathsEnd) {
				valueComponent = &(*followingIt);
			}

			matches.push_back(std::make_pair(keyComponent, valueComponent));
		}
	}

	return matches;
}
/// Delete matched path keys together with their following component.
///
/// For a pair without a following component the key is deleted when
/// validator.allowEmptyValue() is true. For a pair with a following component
/// both are deleted when validator.isValid() accepts the following component.
///
/// @param urlComponents (key, following component) pairs, e.g. from matchPath()
/// @param validator     decides which pairs are deleted
/// @return true when at least one pair led to a deletion
bool UrlParser::removePath(const std::vector<std::pair<UrlComponent *, UrlComponent *> > &urlComponents,
                           const UrlComponent::Validator &validator) {
	bool removedAny = false;

	for (const auto &match : urlComponents) {
		UrlComponent *keyComponent = match.first;
		UrlComponent *valueComponent = match.second;

		if (valueComponent != NULL) {
			if (!validator.isValid(*valueComponent)) {
				continue;
			}

			removedAny = true;
			deleteComponent(keyComponent);
			deleteComponent(valueComponent);
		} else {
			if (!validator.allowEmptyValue()) {
				continue;
			}

			removedAny = true;
			deleteComponent(keyComponent);
		}
	}

	return removedAny;
}
/// Convenience overload: run matchPath() with @p matcher and delete the
/// resulting pairs according to @p validator.
///
/// @return true when at least one pair led to a deletion
bool UrlParser::removePath(const UrlComponent::Matcher &matcher, const UrlComponent::Validator &validator) {
	return removePath(matchPath(matcher), validator);
}
/// Collect the path components that carry a value and satisfy the matcher.
///
/// Deleted components are ignored.
///
/// @param matcher selects the components
/// @return pointers into m_paths for every match
std::vector<UrlComponent *> UrlParser::matchPathParam(const UrlComponent::Matcher &matcher) {
	std::vector<UrlComponent *> matches;

	// nothing to scan when every path component has been deleted
	const bool allDeleted = (m_pathsDeleteCount == m_paths.size());
	if (!allDeleted) {
		for (auto &path : m_paths) {
			if (!path.isDeleted() && path.hasValue() && matcher.isMatching(path)) {
				matches.push_back(&path);
			}
		}
	}

	return matches;
}
/// Delete the given path parameters according to @p validator.
///
/// Forwards to removeComponent().
///
/// @return true when at least one component was deleted
bool UrlParser::removePathParam(const std::vector<UrlComponent *> &urlComponents, const UrlComponent::Validator &validator) {
	const bool removedAny = removeComponent(urlComponents, validator);
	return removedAny;
}
/// Convenience overload: run matchPathParam() with @p matcher and delete the
/// matches according to @p validator.
///
/// @return true when at least one component was deleted
bool UrlParser::removePathParam(const UrlComponent::Matcher &matcher, const UrlComponent::Validator &validator) {
	return removeComponent(matchPathParam(matcher), validator);
}
/// Collect the query components selected by the matcher.
///
/// With the MATCH_DEFAULT criteria the component is looked up directly in
/// m_queriesMap using matcher.getParam(); for any other criteria every query
/// component that is not deleted is tested with matcher.isMatching().
///
/// @param matcher selects the components
/// @return pointers into m_queries for every match
std::vector<UrlComponent *> UrlParser::matchQueryParam(const UrlComponent::Matcher &matcher) {
	std::vector<UrlComponent *> matches;

	// nothing to look at when every query component has been deleted
	if (m_queriesDeleteCount == m_queries.size()) {
		return matches;
	}

	if (matcher.getMatchCriteria() != MATCH_DEFAULT) {
		// test every live query component
		for (auto &query : m_queries) {
			if (!query.isDeleted() && matcher.isMatching(query)) {
				matches.push_back(&query);
			}
		}
		return matches;
	}

	// default criteria: direct lookup by key
	const auto entry = m_queriesMap.find(matcher.getParam());
	if (entry != m_queriesMap.end()) {
		matches.push_back(&(m_queries[entry->second]));
	}

	return matches;
}
bool UrlParser::removeQueryParam(const char *param) {
static const UrlComponent::Validator s_validator(0, 0, true, ALLOW_ALL, MANDATORY_NONE);
return removeQueryParam(UrlComponent::Matcher(param), s_validator);
}
/// Delete the given query parameters according to @p validator.
///
/// Forwards to removeComponent().
///
/// @return true when at least one component was deleted
bool UrlParser::removeQueryParam(const std::vector<UrlComponent *> &urlComponents, const UrlComponent::Validator &validator) {
	const bool removedAny = removeComponent(urlComponents, validator);
	return removedAny;
}
/// Convenience overload: run matchQueryParam() with @p matcher and delete the
/// matches according to @p validator.
///
/// @return true when at least one component was deleted
bool UrlParser::removeQueryParam(const UrlComponent::Matcher &matcher, const UrlComponent::Validator &validator) {
	const std::vector<UrlComponent *> matches = matchQueryParam(matcher);
	return removeComponent(matches, validator);
}