// privacore-open-source-searc.../UrlParser.cpp
//
// 555 lines
// 15 KiB
// C++

#include "UrlParser.h"
#include "Log.h"
#include "fctypes.h"
#include "Domains.h"
#include "ip.h"
#include <string.h>
#include <iterator>
#include <algorithm>
/// Length-bounded variant of strpbrk().
///
/// Scans at most @p len characters of @p str1 (stopping early at a NUL byte)
/// and returns a pointer to the first character that also occurs in the
/// NUL-terminated set @p str2.
///
/// @param str1 buffer to scan
/// @param len  maximum number of characters of @p str1 to examine
/// @param str2 NUL-terminated set of characters to look for
/// @return pointer into @p str1 at the first match, or NULL when none is found
static const char *strnpbrk(const char *str1, size_t len, const char *str2) {
	for (size_t offset = 0; offset < len && str1[offset] != '\0'; ++offset) {
		// str1[offset] is never NUL here, so strchr cannot match str2's terminator
		if (strchr(str2, str1[offset]) != NULL) {
			return str1 + offset;
		}
	}

	return NULL;
}
/// @todo ALC we should see if we need to do relative path resolution here
/// @todo ALC we should cater for scheme relative address (pass in parent scheme)
/// https://tools.ietf.org/html/rfc3986#section-5.2

/// Builds a parser for the given url and parses it immediately.
///
/// m_url is constructed from (url, urlLen); every component pointer starts out
/// null and every length/counter starts at zero before parse() fills them in
/// from m_url's buffer.
///
/// @param url            start of the url text
/// @param urlLen         number of characters of @p url to use
/// @param titledbVersion version used to select version-dependent parsing/rebuilding rules
UrlParser::UrlParser(const char *url, size_t urlLen, int32_t titledbVersion)
	: m_titledbVersion(titledbVersion),
	  m_url(url, urlLen),
	  m_scheme(nullptr),
	  m_schemeLen(0),
	  m_authority(nullptr),
	  m_authorityLen(0),
	  m_host(nullptr),
	  m_hostLen(0),
	  m_port(nullptr),
	  m_portLen(0),
	  m_domain(nullptr),
	  m_domainLen(0),
	  m_paths(),
	  m_pathEndChar('\0'),
	  m_pathsDeleteCount(0),
	  m_queries(),
	  m_queriesDeleteCount(0),
	  m_urlParsed() {
	// make room for a rebuilt url as long as the input
	m_urlParsed.reserve(m_url.length());

	parse();
}
void UrlParser::print() const {
logf(LOG_DEBUG, "UrlParser::url : '%s'", m_url.c_str());
logf(LOG_DEBUG, "UrlParser::scheme : '%.*s'", static_cast<uint32_t>(m_schemeLen), m_scheme);
logf(LOG_DEBUG, "UrlParser::authority : '%.*s'", static_cast<uint32_t>(m_authorityLen), m_authority);
logf(LOG_DEBUG, "UrlParser::host : '%.*s'", static_cast<uint32_t>(m_hostLen), m_host);
logf(LOG_DEBUG, "UrlParser::domain : '%.*s'", static_cast<uint32_t>(m_domainLen), m_domain);
logf(LOG_DEBUG, "UrlParser::port : '%.*s'", static_cast<uint32_t>(m_portLen), m_port);
for (auto it = m_paths.begin(); it != m_paths.end(); ++it) {
logf(LOG_DEBUG, "UrlParser::path[%02zi] : '%s'%s", std::distance(m_paths.begin(), it), it->getString().c_str(), it->isDeleted() ? " (deleted)" : "");
}
for (auto it = m_queries.begin(); it != m_queries.end(); ++it) {
logf(LOG_DEBUG, "UrlParser::query[%02zi] : '%s'%s", std::distance(m_queries.begin(), it), it->getString().c_str(), it->isDeleted() ? " (deleted)" : "");
}
}
/// Splits m_url into scheme, authority, host, port, domain, path components
/// and query components.
///
/// All component pointers set here point into m_url's buffer. Path components
/// are appended to m_paths and query components to m_queries; components that
/// should not appear in the rebuilt url are kept in the containers but marked
/// deleted through deleteComponent(). The fragment (text after '#') is located
/// only so that it can be excluded from the path/query ranges.
void UrlParser::parse() {
// URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
const char *urlEnd = m_url.c_str() + m_url.length();
const char *currentPos = m_url.c_str();
// hier-part = "//" authority path-abempty
// / path-absolute
// / path-rootless
// / path-empty
// NOTE(review): the "//" search covers the whole url, so a scheme-less url that
// contains "//" later on (e.g. inside its path or query) has everything before
// that "//" treated as the scheme -- confirm this is intended.
const char *authorityPos = static_cast<const char *>(memmem(currentPos, urlEnd - currentPos, "//", 2));
if (authorityPos != NULL) {
if (authorityPos != currentPos) {
// text before "//" is the scheme; the -1 drops the character right before "//" (presumably the ':')
m_scheme = currentPos;
m_schemeLen = authorityPos - currentPos - 1;
}
m_authority = authorityPos + 2;
currentPos = m_authority;
} else {
// no "//": the authority starts at the beginning of the url
m_authority = currentPos;
}
// the authority runs up to the first '/' after its start (or to the end of the url)
const char *pathPos = static_cast<const char *>(memchr(currentPos, '/', urlEnd - currentPos));
if (pathPos != NULL) {
m_authorityLen = pathPos - m_authority;
currentPos = pathPos + 1;
} else {
m_authorityLen = urlEnd - m_authority;
}
// @todo similar logic in Url.cpp (merge this)
// authority = [ userinfo "@" ] host [ ":" port ]
// host starts after the first '@' in the authority, if there is one
const char *userInfoPos = static_cast<const char *>(memchr(m_authority, '@', m_authorityLen));
if (userInfoPos != NULL) {
m_host = userInfoPos + 1;
m_hostLen = m_authorityLen - (userInfoPos - m_authority) - 1;
} else {
m_host = m_authority;
m_hostLen = m_authorityLen;
}
// the last ':' inside the host range separates host and port
const char *portPos = static_cast<const char *>(memrchr(m_host, ':', m_hostLen));
if (portPos != NULL) {
m_port = portPos + 1;
// port runs from after ':' to the end of the authority
m_portLen = m_authorityLen - (portPos - m_authority) - 1;
// equivalent to m_hostLen = portPos - m_host (host now ends before ':')
m_hostLen -= (m_hostLen - (portPos - m_host));
}
// host = IP-literal / IPv4address / reg-name
/// @todo ALC we should remove the const cast once we fix all the const issue
int32_t ip = atoip(m_host, m_hostLen);
if (ip) {
// host converted to a non-zero ip: let getDomainOfIp() pick the domain part
int32_t domainLen = 0;
m_domain = getDomainOfIp(m_host, m_hostLen, &domainLen);
m_domainLen = domainLen;
} else {
const char *tldPos = ::getTLD(m_host, m_hostLen);
if (tldPos) {
size_t tldLen = m_host + m_hostLen - tldPos;
// when the host consists of the TLD only, the domain is left unset
if (tldLen < m_hostLen) {
// domain = label directly in front of the TLD plus the TLD itself;
// look for the '.' that precedes that label (skipping the '.' in front of the TLD)
m_domain = static_cast<const char *>(memrchr(m_host, '.', m_hostLen - tldLen - 1));
if (m_domain) {
m_domain += 1;
m_domainLen = m_hostLen - (m_domain - m_host);
} else {
// no further '.': the whole host is the domain
m_domain = m_host;
m_domainLen = m_hostLen;
}
}
}
}
if (pathPos == NULL) {
// nothing else to process
return;
}
// the query starts after the first '?' found after the authority
const char *queryPos = static_cast<const char *>(memchr(currentPos, '?', urlEnd - currentPos));
if (queryPos != NULL) {
currentPos = queryPos + 1;
}
/// @note url fragment is stripped and not part of the rebuild url
// the last '#' at or after currentPos (i.e. after '?' when a query exists) starts the fragment
const char *fragmentPos = static_cast<const char *>(memrchr(currentPos, '#', urlEnd - currentPos));
if (fragmentPos != NULL) {
// https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
// don't treat "#!" as anchor
if (fragmentPos != urlEnd && *(fragmentPos + 1) == '!') {
fragmentPos = NULL;
}
}
// the path ends at the query if there is one, otherwise at the fragment, otherwise at the end of the url
const char *pathEnd = queryPos ? queryPos : (fragmentPos ? fragmentPos : urlEnd);
// remember the last character of the path (unparse() uses it to restore a trailing '/')
m_pathEndChar = *(pathEnd - 1);
const char *queryEnd = fragmentPos ? fragmentPos : urlEnd;
// path
bool isFirstComponent = true;
bool updatePathEncChar = false;
const char *prevPos = pathPos + 1;
// split the path on '/', ';' and '&'; each component records the character in front of it as its separator
while (prevPos && (prevPos <= pathEnd)) {
size_t len = pathEnd - prevPos;
currentPos = strnpbrk(prevPos, len, "/;&");
if (currentPos) {
len = currentPos - prevPos;
}
UrlComponent urlPart = UrlComponent(UrlComponent::TYPE_PATH, prevPos, len, *(prevPos - 1), isFirstComponent);
isFirstComponent = false;
// check for special cases before adding to m_paths
if (len == 1 && memcmp(prevPos, ".", 1) == 0) {
// "." segment: drop it
deleteComponent(&urlPart);
updatePathEncChar = true;
} else if (len == 2 && memcmp(prevPos, "..", 2) == 0) {
// ".." segment: drop it and also drop the preceding segment, walking backwards
// over already-deleted components and stopping after deleting one whose separator is '/'
deleteComponent(&urlPart);
updatePathEncChar = true;
for (auto it = m_paths.rbegin(); it != m_paths.rend(); ++it) {
if (it->isDeleted()) {
continue;
}
deleteComponent(&(*it));
if (it->getSeparator() == '/') {
break;
}
}
}
// deleted components are stored as well; they are skipped when the url is rebuilt
m_paths.push_back(urlPart);
prevPos = currentPos ? currentPos + 1 : NULL;
}
// set pathEndChar to component after last non-deleted component (if exist)
if (updatePathEncChar) {
for (auto it = m_paths.rbegin(); it != m_paths.rend(); ++it) {
if (it->isDeleted()) {
continue;
}
// std::prev on a reverse iterator yields the component that follows 'it' in forward order
if (it != m_paths.rbegin()) {
m_pathEndChar = std::prev(it)->getSeparator();
}
break;
}
}
// query
if (queryPos) {
prevPos = queryPos + 1;
bool isFirstComponent = true;
bool isPrevAmpersand = false;
while (prevPos && (prevPos < queryEnd)) {
// '?' is an additional separator inside the query for titledb versions above 128
const char *querySeparator = m_titledbVersion <= 128 ? "&;" : "&;?";
size_t len = queryEnd - prevPos;
currentPos = strnpbrk(prevPos, len, querySeparator);
if (currentPos) {
len = currentPos - prevPos;
}
UrlComponent urlPart = UrlComponent(UrlComponent::TYPE_QUERY, prevPos, len, *(prevPos - 1), isFirstComponent);
isFirstComponent = false;
std::string key = urlPart.getKey();
// check previous urlPart
// the previous part was a bare "amp" (dropped below), so this part is joined with '&'
// (presumably this undoes an "&amp;" html entity -- confirm)
if (isPrevAmpersand) {
urlPart.setSeparator('&');
}
bool isAmpersand = (!urlPart.hasValue() && urlPart.getKey() == "amp");
// parts with an empty key and bare "amp" parts are not stored
if (!key.empty() && !isAmpersand) {
// we don't cater for case sensitive query parameter (eg: parm, Parm, PARM is assumed to be the same)
// NOTE(review): the comparison below is an exact match on getKey(); the comment above
// only holds if getKey() normalizes case -- confirm in UrlComponent
auto it = std::find_if(m_queries.begin(), m_queries.end(), [&key](const UrlComponent& u) { return key == u.getKey(); });
if (it == m_queries.end()) {
m_queries.push_back(urlPart);
} else {
// duplicate key: the later occurrence replaces the earlier one at the earlier one's position
*it = urlPart;
}
}
prevPos = currentPos ? currentPos + 1 : NULL;
isPrevAmpersand = isAmpersand;
}
}
if (m_titledbVersion >= 124) {
// remove empty query parameters
for (auto &query : m_queries) {
if (query.getValueLen() == 0) {
deleteComponent(&query);
}
}
}
}
/// @todo ALC a better way of doing this will be to check if the url has changed,
/// and call unparse automatically when getUrlParsed/getUrlParsedLen is called
void UrlParser::unparse() {
m_urlParsed.clear();
if (m_scheme == NULL || m_schemeLen == 0) {
m_urlParsed.append("http");
} else {
for (size_t i = 0; i < m_schemeLen; ++i) {
m_urlParsed.push_back(tolower(m_scheme[i]));
}
}
m_urlParsed.append("://");
// userinfo '@'
m_urlParsed.append(m_authority, m_host - m_authority);
// host
for (size_t i = 0; i < m_hostLen; ++i) {
m_urlParsed.push_back(tolower(m_host[i]));
}
// port
if (m_port) {
m_urlParsed.push_back(':');
m_urlParsed.append(m_port, m_portLen);
}
if (m_pathsDeleteCount != m_paths.size()) {
bool isFirst = true;
for (auto &path : m_paths) {
if (!path.isDeleted()) {
if (isFirst) {
isFirst = false;
if (path.getSeparator() != '/') {
m_urlParsed.append("/");
}
}
m_urlParsed += path.getSeparator();
m_urlParsed.append(path.getString());
}
}
if (m_urlParsed[m_urlParsed.size() - 1] != '/' && m_pathEndChar == '/') {
m_urlParsed += m_pathEndChar;
}
} else {
if (m_titledbVersion >= 124) {
m_urlParsed += '/';
}
}
if (m_queriesDeleteCount != m_queries.size()) {
bool isFirst = true;
for (auto &query : m_queries) {
if (!query.isDeleted()) {
if (isFirst) {
isFirst = false;
m_urlParsed.append("?");
} else {
// we should preserve '?' that is not the first separator
// because '?' should not have any special meaning after query parameter starts
m_urlParsed += (query.isFirst() && query.getSeparator() == '?') ? '&' : query.getSeparator();
}
m_urlParsed.append(query.getString());
}
}
}
}
/// Marks a single component as deleted and bumps the matching delete counter.
///
/// Does nothing for a null pointer or a component that is already deleted, so
/// each component is counted at most once.
///
/// @param urlComponent component to delete (may be null)
void UrlParser::deleteComponent(UrlComponent *urlComponent) {
	const bool needsDeletion = (urlComponent != nullptr) && !urlComponent->isDeleted();
	if (!needsDeletion) {
		return;
	}

	urlComponent->setDeleted();

	// keep the per-type counters in sync
	const auto componentType = urlComponent->getType();
	if (componentType == UrlComponent::TYPE_PATH) {
		++m_pathsDeleteCount;
	} else if (componentType == UrlComponent::TYPE_QUERY) {
		++m_queriesDeleteCount;
	}
}
/// Deletes every component in the list that is not deleted yet.
///
/// @param urlComponents components to delete; entries are dereferenced without a null check
void UrlParser::deleteComponents(std::vector<UrlComponent*> &urlComponents) {
	for (auto it = urlComponents.begin(); it != urlComponents.end(); ++it) {
		UrlComponent *component = *it;
		if (!component->isDeleted()) {
			deleteComponent(component);
		}
	}
}
/// Deletes the listed components that the validator accepts.
///
/// A component with a value is deleted when validator.isValid() accepts it; a
/// component without a value is deleted when validator.allowEmptyValue() is
/// true. Components that are already deleted are skipped.
///
/// @param urlComponents candidate components
/// @param validator     decides which candidates are deleted
/// @return true when at least one component was deleted by this call
bool UrlParser::removeComponent(const std::vector<UrlComponent *> &urlComponents, const UrlComponent::Validator &validator) {
	bool removedAny = false;

	for (UrlComponent *component : urlComponents) {
		if (component->isDeleted()) {
			continue;
		}

		const bool accepted = component->hasValue() ? validator.isValid(*component)
		                                            : validator.allowEmptyValue();
		if (accepted) {
			removedAny = true;
			deleteComponent(component);
		}
	}

	return removedAny;
}
/// Finds path components that act as a key for the component following them.
///
/// Every non-deleted path component that has no value and is accepted by the
/// matcher is returned together with the component stored right after it in
/// m_paths (null when the match is the last component). The following
/// component is not checked for deletion.
///
/// @param matcher selects the key components
/// @return (key component, following component or null) pairs
std::vector<std::pair<UrlComponent *, UrlComponent *> > UrlParser::matchPath(const UrlComponent::Matcher &matcher) {
	std::vector<std::pair<UrlComponent *, UrlComponent *> > matches;

	// don't need to loop if it's all deleted
	if (m_pathsDeleteCount != m_paths.size()) {
		for (auto keyIt = m_paths.begin(), pathsEnd = m_paths.end(); keyIt != pathsEnd; ++keyIt) {
			if (keyIt->isDeleted() || keyIt->hasValue() || !matcher.isMatching(*keyIt)) {
				continue;
			}

			auto followingIt = std::next(keyIt, 1);
			UrlComponent *following = (followingIt != pathsEnd) ? &(*followingIt) : nullptr;
			matches.emplace_back(&(*keyIt), following);
		}
	}

	return matches;
}
/// Deletes (key, value) path component pairs whose value the validator accepts.
///
/// When a pair has no value component (or, for titledb version <= 123, the
/// value component's value length is zero) only the key is deleted, and only
/// if validator.allowEmptyValue() is true. Otherwise the value text is handed
/// to validator.isValid(): getValue()/getValueLen() for titledb version <= 123,
/// the whole getString() for newer versions. On acceptance both key and value
/// components are deleted.
///
/// @param urlComponents (key, value-or-null) pairs
/// @param validator     decides which pairs are deleted
/// @return true when at least one pair was accepted for deletion
bool UrlParser::removePath(const std::vector<std::pair<UrlComponent *, UrlComponent *> > &urlComponents,
                           const UrlComponent::Validator &validator) {
	const bool legacyValueHandling = (m_titledbVersion <= 123);
	bool removedAny = false;

	for (auto pairIt = urlComponents.begin(); pairIt != urlComponents.end(); ++pairIt) {
		UrlComponent *keyPart = pairIt->first;
		UrlComponent *valuePart = pairIt->second;

		const bool valueMissing = (valuePart == NULL) || (legacyValueHandling && valuePart->getValueLen() == 0);
		if (valueMissing) {
			if (validator.allowEmptyValue()) {
				removedAny = true;
				deleteComponent(keyPart);
			}
			continue;
		}

		bool accepted = false;
		if (legacyValueHandling) {
			const char *value = valuePart->getValue();
			size_t valueLen = valuePart->getValueLen();
			accepted = validator.isValid(value, valueLen);
		} else {
			// bound to a reference so the text stays alive for the isValid() call
			// whether getString() returns by value or by reference
			const auto &componentText = valuePart->getString();
			const char *value = componentText.c_str();
			size_t valueLen = componentText.size();
			accepted = validator.isValid(value, valueLen);
		}

		if (accepted) {
			removedAny = true;
			deleteComponent(keyPart);
			deleteComponent(valuePart);
		}
	}

	return removedAny;
}
/// Convenience overload: collects the pairs selected by the matcher via
/// matchPath() and passes them to the pair-list overload of removePath().
///
/// @param matcher   selects the key components
/// @param validator decides which matched pairs are deleted
/// @return result of the pair-list removePath() overload
bool UrlParser::removePath(const UrlComponent::Matcher &matcher, const UrlComponent::Validator &validator) {
	return removePath(matchPath(matcher), validator);
}
/// Collects the non-deleted path components that carry a value and are
/// accepted by the matcher.
///
/// @param matcher selects the components
/// @return pointers to the matching entries of m_paths (empty when none match)
std::vector<UrlComponent *> UrlParser::matchPathParam(const UrlComponent::Matcher &matcher) {
	std::vector<UrlComponent *> matches;

	// don't need to loop if it's all deleted
	if (m_pathsDeleteCount != m_paths.size()) {
		for (auto it = m_paths.begin(); it != m_paths.end(); ++it) {
			if (!it->isDeleted() && it->hasValue() && matcher.isMatching(*it)) {
				matches.push_back(&(*it));
			}
		}
	}

	return matches;
}
/// Const overload: collects the non-deleted path components that carry a value
/// and are accepted by the matcher, as pointers to const.
///
/// @param matcher selects the components
/// @return pointers to the matching entries of m_paths (empty when none match)
std::vector<const UrlComponent *> UrlParser::matchPathParam(const UrlComponent::Matcher &matcher) const {
	std::vector<const UrlComponent *> matches;

	// don't need to loop if it's all deleted
	if (m_pathsDeleteCount != m_paths.size()) {
		for (auto it = m_paths.begin(); it != m_paths.end(); ++it) {
			if (!it->isDeleted() && it->hasValue() && matcher.isMatching(*it)) {
				matches.push_back(&(*it));
			}
		}
	}

	return matches;
}
/// Deletes the given path parameter components that the validator accepts;
/// forwards to removeComponent().
///
/// @param urlComponents candidate components
/// @param validator     decides which candidates are deleted
/// @return true when at least one component was deleted
bool UrlParser::removePathParam(const std::vector<UrlComponent *> &urlComponents, const UrlComponent::Validator &validator) {
	const bool removedAny = removeComponent(urlComponents, validator);
	return removedAny;
}
/// Deletes the path parameter components selected by the matcher (via
/// matchPathParam()) that the validator accepts.
///
/// @param matcher   selects the candidate components
/// @param validator decides which candidates are deleted
/// @return true when at least one component was deleted
bool UrlParser::removePathParam(const UrlComponent::Matcher &matcher, const UrlComponent::Validator &validator) {
	return removeComponent(matchPathParam(matcher), validator);
}
/// Collects the non-deleted query components accepted by the matcher.
///
/// @param matcher selects the components
/// @return pointers to the matching entries of m_queries (empty when none match)
const std::vector<UrlComponent *> UrlParser::matchQueryParam(const UrlComponent::Matcher &matcher) {
	std::vector<UrlComponent *> matches;

	// don't need to loop if it's all deleted
	if (m_queriesDeleteCount != m_queries.size()) {
		for (auto it = m_queries.begin(); it != m_queries.end(); ++it) {
			if (!it->isDeleted() && matcher.isMatching(*it)) {
				matches.push_back(&(*it));
			}
		}
	}

	return matches;
}
/// Const overload: collects the non-deleted query components accepted by the
/// matcher, as pointers to const.
///
/// @param matcher selects the components
/// @return pointers to the matching entries of m_queries (empty when none match)
const std::vector<const UrlComponent *> UrlParser::matchQueryParam(const UrlComponent::Matcher &matcher) const {
	std::vector<const UrlComponent *> matches;

	// don't need to loop if it's all deleted
	if (m_queriesDeleteCount != m_queries.size()) {
		for (auto it = m_queries.begin(); it != m_queries.end(); ++it) {
			if (!it->isDeleted() && matcher.isMatching(*it)) {
				matches.push_back(&(*it));
			}
		}
	}

	return matches;
}
bool UrlParser::removeQueryParam(const char *param) {
static const UrlComponent::Validator s_validator(0, 0, true, ALLOW_ALL, MANDATORY_NONE);
return removeQueryParam(UrlComponent::Matcher(param), s_validator);
}
/// Deletes the given query components that the validator accepts; forwards to
/// removeComponent().
///
/// @param urlComponents candidate components
/// @param validator     decides which candidates are deleted
/// @return true when at least one component was deleted
bool UrlParser::removeQueryParam(const std::vector<UrlComponent *> &urlComponents, const UrlComponent::Validator &validator) {
	const bool removedAny = removeComponent(urlComponents, validator);
	return removedAny;
}
/// Deletes the query components selected by the matcher (via
/// matchQueryParam()) that the validator accepts.
///
/// @param matcher   selects the candidate components
/// @param validator decides which candidates are deleted
/// @return true when at least one component was deleted
bool UrlParser::removeQueryParam(const UrlComponent::Matcher &matcher, const UrlComponent::Validator &validator) {
	const std::vector<UrlComponent *> matches = matchQueryParam(matcher);
	return removeComponent(matches, validator);
}