mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-01-22 02:18:42 -05:00
555 lines
15 KiB
C++
555 lines
15 KiB
C++
#include "UrlParser.h"
|
|
#include "Log.h"
|
|
#include "fctypes.h"
|
|
#include "Domains.h"
|
|
#include "ip.h"
|
|
#include <string.h>
|
|
#include <iterator>
|
|
#include <algorithm>
|
|
|
|
/// Length-bounded variant of strpbrk().
///
/// Scans at most @p len bytes of @p str1 (stopping early at a NUL byte) and returns a pointer
/// to the first byte that also occurs in the NUL-terminated set @p str2.
///
/// @param str1 buffer to scan (does not have to be NUL-terminated within @p len bytes)
/// @param len  maximum number of bytes of @p str1 to look at
/// @param str2 NUL-terminated list of bytes to look for
/// @return pointer into @p str1 at the first match, or a null pointer when nothing matches
static const char *strnpbrk(const char *str1, size_t len, const char *str2) {
	for (size_t i = 0; i < len && str1[i] != '\0'; ++i) {
		const char current = str1[i];

		for (const char *accept = str2; *accept != '\0'; ++accept) {
			if (*accept == current) {
				return str1 + i;
			}
		}
	}

	return nullptr;
}
|
|
|
|
/// @todo ALC we should see if we need to do relative path resolution here
|
|
/// @todo ALC we should cater for scheme relative address (pass in parent scheme)
|
|
/// https://tools.ietf.org/html/rfc3986#section-5.2
|
|
UrlParser::UrlParser(const char *url, size_t urlLen, int32_t titledbVersion)
|
|
: m_titledbVersion(titledbVersion)
|
|
, m_url(url, urlLen)
|
|
, m_scheme(NULL)
|
|
, m_schemeLen(0)
|
|
, m_authority(NULL)
|
|
, m_authorityLen(0)
|
|
, m_host(NULL)
|
|
, m_hostLen(0)
|
|
, m_port(NULL)
|
|
, m_portLen(0)
|
|
, m_domain(NULL)
|
|
, m_domainLen(0)
|
|
, m_paths()
|
|
, m_pathEndChar('\0')
|
|
, m_pathsDeleteCount(0)
|
|
, m_queries()
|
|
, m_queriesDeleteCount(0)
|
|
, m_urlParsed() {
|
|
m_urlParsed.reserve(m_url.length());
|
|
parse();
|
|
}
|
|
|
|
/// Dump the parsed url to the debug log: the raw url, each authority part, and every path and
/// query component (deleted components are flagged with " (deleted)").
void UrlParser::print() const {
	logf(LOG_DEBUG, "UrlParser::url : '%s'", m_url.c_str());

	// the '*' precision of "%.*s" consumes an int argument, so the lengths are cast to int
	logf(LOG_DEBUG, "UrlParser::scheme : '%.*s'", static_cast<int>(m_schemeLen), m_scheme);
	logf(LOG_DEBUG, "UrlParser::authority : '%.*s'", static_cast<int>(m_authorityLen), m_authority);
	logf(LOG_DEBUG, "UrlParser::host : '%.*s'", static_cast<int>(m_hostLen), m_host);
	logf(LOG_DEBUG, "UrlParser::domain : '%.*s'", static_cast<int>(m_domainLen), m_domain);
	logf(LOG_DEBUG, "UrlParser::port : '%.*s'", static_cast<int>(m_portLen), m_port);

	// iterators (not range-for) because the position of each component is logged as well
	for (auto it = m_paths.begin(); it != m_paths.end(); ++it) {
		logf(LOG_DEBUG, "UrlParser::path[%02zi] : '%s'%s", std::distance(m_paths.begin(), it), it->getString().c_str(), it->isDeleted() ? " (deleted)" : "");
	}

	for (auto it = m_queries.begin(); it != m_queries.end(); ++it) {
		logf(LOG_DEBUG, "UrlParser::query[%02zi] : '%s'%s", std::distance(m_queries.begin(), it), it->getString().c_str(), it->isDeleted() ? " (deleted)" : "");
	}
}
|
|
|
|
void UrlParser::parse() {
|
|
// URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
|
|
|
|
const char *urlEnd = m_url.c_str() + m_url.length();
|
|
const char *currentPos = m_url.c_str();
|
|
|
|
// hier-part = "//" authority path-abempty
|
|
// / path-absolute
|
|
// / path-rootless
|
|
// / path-empty
|
|
|
|
const char *authorityPos = static_cast<const char *>(memmem(currentPos, urlEnd - currentPos, "//", 2));
|
|
if (authorityPos != NULL) {
|
|
if (authorityPos != currentPos) {
|
|
m_scheme = currentPos;
|
|
m_schemeLen = authorityPos - currentPos - 1;
|
|
}
|
|
|
|
m_authority = authorityPos + 2;
|
|
currentPos = m_authority;
|
|
} else {
|
|
m_authority = currentPos;
|
|
}
|
|
|
|
const char *pathPos = static_cast<const char *>(memchr(currentPos, '/', urlEnd - currentPos));
|
|
if (pathPos != NULL) {
|
|
m_authorityLen = pathPos - m_authority;
|
|
currentPos = pathPos + 1;
|
|
} else {
|
|
m_authorityLen = urlEnd - m_authority;
|
|
}
|
|
|
|
// @todo similar logic in Url.cpp (merge this)
|
|
|
|
// authority = [ userinfo "@" ] host [ ":" port ]
|
|
const char *userInfoPos = static_cast<const char *>(memchr(m_authority, '@', m_authorityLen));
|
|
if (userInfoPos != NULL) {
|
|
m_host = userInfoPos + 1;
|
|
m_hostLen = m_authorityLen - (userInfoPos - m_authority) - 1;
|
|
} else {
|
|
m_host = m_authority;
|
|
m_hostLen = m_authorityLen;
|
|
}
|
|
|
|
const char *portPos = static_cast<const char *>(memrchr(m_host, ':', m_hostLen));
|
|
if (portPos != NULL) {
|
|
m_port = portPos + 1;
|
|
m_portLen = m_authorityLen - (portPos - m_authority) - 1;
|
|
|
|
m_hostLen -= (m_hostLen - (portPos - m_host));
|
|
}
|
|
|
|
// host = IP-literal / IPv4address / reg-name
|
|
|
|
/// @todo ALC we should remove the const cast once we fix all the const issue
|
|
int32_t ip = atoip(m_host, m_hostLen);
|
|
if (ip) {
|
|
int32_t domainLen = 0;
|
|
m_domain = getDomainOfIp(m_host, m_hostLen, &domainLen);
|
|
m_domainLen = domainLen;
|
|
} else {
|
|
const char *tldPos = ::getTLD(m_host, m_hostLen);
|
|
if (tldPos) {
|
|
size_t tldLen = m_host + m_hostLen - tldPos;
|
|
if (tldLen < m_hostLen) {
|
|
m_domain = static_cast<const char *>(memrchr(m_host, '.', m_hostLen - tldLen - 1));
|
|
if (m_domain) {
|
|
m_domain += 1;
|
|
m_domainLen = m_hostLen - (m_domain - m_host);
|
|
} else {
|
|
m_domain = m_host;
|
|
m_domainLen = m_hostLen;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (pathPos == NULL) {
|
|
// nothing else to process
|
|
return;
|
|
}
|
|
|
|
const char *queryPos = static_cast<const char *>(memchr(currentPos, '?', urlEnd - currentPos));
|
|
if (queryPos != NULL) {
|
|
currentPos = queryPos + 1;
|
|
}
|
|
|
|
/// @note url fragment is stripped and not part of the rebuild url
|
|
const char *fragmentPos = static_cast<const char *>(memrchr(currentPos, '#', urlEnd - currentPos));
|
|
if (fragmentPos != NULL) {
|
|
// https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
|
|
// don't treat '#!" as anchor
|
|
if (fragmentPos != urlEnd && *(fragmentPos + 1) == '!') {
|
|
fragmentPos = NULL;
|
|
}
|
|
}
|
|
|
|
const char *pathEnd = queryPos ? queryPos : (fragmentPos ? fragmentPos : urlEnd);
|
|
m_pathEndChar = *(pathEnd - 1);
|
|
|
|
const char *queryEnd = fragmentPos ? fragmentPos : urlEnd;
|
|
|
|
// path
|
|
bool isFirstComponent = true;
|
|
bool updatePathEncChar = false;
|
|
const char *prevPos = pathPos + 1;
|
|
while (prevPos && (prevPos <= pathEnd)) {
|
|
size_t len = pathEnd - prevPos;
|
|
currentPos = strnpbrk(prevPos, len, "/;&");
|
|
if (currentPos) {
|
|
len = currentPos - prevPos;
|
|
}
|
|
|
|
UrlComponent urlPart = UrlComponent(UrlComponent::TYPE_PATH, prevPos, len, *(prevPos - 1), isFirstComponent);
|
|
isFirstComponent = false;
|
|
|
|
// check for special cases before adding to m_paths
|
|
if (len == 1 && memcmp(prevPos, ".", 1) == 0) {
|
|
deleteComponent(&urlPart);
|
|
updatePathEncChar = true;
|
|
} else if (len == 2 && memcmp(prevPos, "..", 2) == 0) {
|
|
deleteComponent(&urlPart);
|
|
updatePathEncChar = true;
|
|
|
|
for (auto it = m_paths.rbegin(); it != m_paths.rend(); ++it) {
|
|
if (it->isDeleted()) {
|
|
continue;
|
|
}
|
|
|
|
deleteComponent(&(*it));
|
|
|
|
if (it->getSeparator() == '/') {
|
|
break;
|
|
}
|
|
}
|
|
|
|
}
|
|
m_paths.push_back(urlPart);
|
|
|
|
prevPos = currentPos ? currentPos + 1 : NULL;
|
|
}
|
|
|
|
// set pathEndChar to component after last non-deleted component (if exist)
|
|
if (updatePathEncChar) {
|
|
for (auto it = m_paths.rbegin(); it != m_paths.rend(); ++it) {
|
|
if (it->isDeleted()) {
|
|
continue;
|
|
}
|
|
|
|
if (it != m_paths.rbegin()) {
|
|
m_pathEndChar = std::prev(it)->getSeparator();
|
|
}
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
// query
|
|
if (queryPos) {
|
|
prevPos = queryPos + 1;
|
|
|
|
bool isFirstComponent = true;
|
|
bool isPrevAmpersand = false;
|
|
while (prevPos && (prevPos < queryEnd)) {
|
|
const char *querySeparator = m_titledbVersion <= 128 ? "&;" : "&;?";
|
|
size_t len = queryEnd - prevPos;
|
|
currentPos = strnpbrk(prevPos, len, querySeparator);
|
|
if (currentPos) {
|
|
len = currentPos - prevPos;
|
|
}
|
|
|
|
UrlComponent urlPart = UrlComponent(UrlComponent::TYPE_QUERY, prevPos, len, *(prevPos - 1), isFirstComponent);
|
|
isFirstComponent = false;
|
|
|
|
std::string key = urlPart.getKey();
|
|
|
|
// check previous urlPart
|
|
if (isPrevAmpersand) {
|
|
urlPart.setSeparator('&');
|
|
}
|
|
|
|
bool isAmpersand = (!urlPart.hasValue() && urlPart.getKey() == "amp");
|
|
if (!key.empty() && !isAmpersand) {
|
|
// we don't cater for case sensitive query parameter (eg: parm, Parm, PARM is assumed to be the same)
|
|
auto it = std::find_if(m_queries.begin(), m_queries.end(), [&key](const UrlComponent& u) { return key == u.getKey(); });
|
|
if (it == m_queries.end()) {
|
|
m_queries.push_back(urlPart);
|
|
} else {
|
|
*it = urlPart;
|
|
}
|
|
}
|
|
|
|
prevPos = currentPos ? currentPos + 1 : NULL;
|
|
isPrevAmpersand = isAmpersand;
|
|
}
|
|
}
|
|
|
|
if (m_titledbVersion >= 124) {
|
|
// remove empty query parameters
|
|
for (auto &query : m_queries) {
|
|
if (query.getValueLen() == 0) {
|
|
deleteComponent(&query);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// @todo ALC a better way of doing this will be to check if the url has changed,
|
|
/// and call unparse automatically when getUrlParsed/getUrlParsedLen is called
|
|
void UrlParser::unparse() {
|
|
m_urlParsed.clear();
|
|
|
|
if (m_scheme == NULL || m_schemeLen == 0) {
|
|
m_urlParsed.append("http");
|
|
} else {
|
|
for (size_t i = 0; i < m_schemeLen; ++i) {
|
|
m_urlParsed.push_back(tolower(m_scheme[i]));
|
|
}
|
|
}
|
|
|
|
m_urlParsed.append("://");
|
|
|
|
// userinfo '@'
|
|
m_urlParsed.append(m_authority, m_host - m_authority);
|
|
|
|
// host
|
|
for (size_t i = 0; i < m_hostLen; ++i) {
|
|
m_urlParsed.push_back(tolower(m_host[i]));
|
|
}
|
|
|
|
// port
|
|
if (m_port) {
|
|
m_urlParsed.push_back(':');
|
|
m_urlParsed.append(m_port, m_portLen);
|
|
}
|
|
|
|
if (m_pathsDeleteCount != m_paths.size()) {
|
|
bool isFirst = true;
|
|
|
|
for (auto &path : m_paths) {
|
|
if (!path.isDeleted()) {
|
|
if (isFirst) {
|
|
isFirst = false;
|
|
if (path.getSeparator() != '/') {
|
|
m_urlParsed.append("/");
|
|
}
|
|
}
|
|
|
|
m_urlParsed += path.getSeparator();
|
|
m_urlParsed.append(path.getString());
|
|
}
|
|
}
|
|
|
|
if (m_urlParsed[m_urlParsed.size() - 1] != '/' && m_pathEndChar == '/') {
|
|
m_urlParsed += m_pathEndChar;
|
|
}
|
|
} else {
|
|
if (m_titledbVersion >= 124) {
|
|
m_urlParsed += '/';
|
|
}
|
|
}
|
|
|
|
if (m_queriesDeleteCount != m_queries.size()) {
|
|
bool isFirst = true;
|
|
for (auto &query : m_queries) {
|
|
if (!query.isDeleted()) {
|
|
if (isFirst) {
|
|
isFirst = false;
|
|
m_urlParsed.append("?");
|
|
} else {
|
|
// we should preserve '?' that is not the first separator
|
|
// because '?' should not have any special meaning after query parameter starts
|
|
m_urlParsed += (query.isFirst() && query.getSeparator() == '?') ? '&' : query.getSeparator();
|
|
}
|
|
|
|
m_urlParsed.append(query.getString());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Mark a single component as deleted and bump the matching delete counter.
/// Null pointers and components that are already deleted are ignored.
///
/// @param urlComponent component to delete (may be null)
void UrlParser::deleteComponent(UrlComponent *urlComponent) {
	const bool isLive = (urlComponent != nullptr) && !urlComponent->isDeleted();
	if (!isLive) {
		return;
	}

	urlComponent->setDeleted();

	// keep the per-type counters in sync
	const auto componentType = urlComponent->getType();
	if (componentType == UrlComponent::TYPE_PATH) {
		++m_pathsDeleteCount;
	} else if (componentType == UrlComponent::TYPE_QUERY) {
		++m_queriesDeleteCount;
	}
}
|
|
|
|
/// Delete every component in the list that is not deleted yet.
///
/// @param urlComponents components to delete
void UrlParser::deleteComponents(std::vector<UrlComponent*> &urlComponents) {
	for (auto it = urlComponents.begin(); it != urlComponents.end(); ++it) {
		UrlComponent *component = *it;
		if (!component->isDeleted()) {
			deleteComponent(component);
		}
	}
}
|
|
|
|
/// Delete the components accepted by the validator.
///
/// A component with a value is deleted when validator.isValid() accepts it; a component without
/// a value is deleted when validator.allowEmptyValue() is true. Already deleted components are
/// skipped.
///
/// @param urlComponents candidate components
/// @param validator     decides which candidates get deleted
/// @return true when at least one component was deleted
bool UrlParser::removeComponent(const std::vector<UrlComponent *> &urlComponents, const UrlComponent::Validator &validator) {
	bool removedAny = false;

	for (UrlComponent *candidate : urlComponents) {
		if (!candidate->isDeleted()) {
			const bool accepted = candidate->hasValue() ? validator.isValid(*candidate)
			                                            : validator.allowEmptyValue();
			if (accepted) {
				deleteComponent(candidate);
				removedAny = true;
			}
		}
	}

	return removedAny;
}
|
|
|
|
/// Find path components that match and pair each with the component following it.
///
/// Only components that are not deleted and have no value are tested against the matcher.
///
/// @param matcher selects the path components
/// @return (matching component, next component in m_paths) pairs; the second pointer is null
///         when the matching component is the last one
std::vector<std::pair<UrlComponent *, UrlComponent *> > UrlParser::matchPath(const UrlComponent::Matcher &matcher) {
	std::vector<std::pair<UrlComponent *, UrlComponent *> > result;

	// nothing to look at when every path component is already deleted
	if (m_pathsDeleteCount != m_paths.size()) {
		const auto pathsEnd = m_paths.end();

		for (auto keyIt = m_paths.begin(); keyIt != pathsEnd; ++keyIt) {
			const bool isCandidate = !keyIt->isDeleted() && !keyIt->hasValue() && matcher.isMatching(*keyIt);
			if (!isCandidate) {
				continue;
			}

			const auto nextIt = std::next(keyIt);
			UrlComponent *following = (nextIt == pathsEnd) ? nullptr : &(*nextIt);
			result.emplace_back(&(*keyIt), following);
		}
	}

	return result;
}
|
|
|
|
bool UrlParser::removePath(const std::vector<std::pair<UrlComponent *, UrlComponent *> > &urlComponents,
|
|
const UrlComponent::Validator &validator) {
|
|
bool hasRemoval = false;
|
|
for (const auto &urlComponent : urlComponents) {
|
|
if (urlComponent.second == NULL || (m_titledbVersion <= 123 && urlComponent.second->getValueLen() == 0)) {
|
|
if (validator.allowEmptyValue()) {
|
|
hasRemoval = true;
|
|
deleteComponent(urlComponent.first);
|
|
}
|
|
} else {
|
|
const char *value = (m_titledbVersion <= 123) ? urlComponent.second->getValue() : urlComponent.second->getString().c_str();
|
|
size_t valueLen = (m_titledbVersion <= 123) ? urlComponent.second->getValueLen() : urlComponent.second->getString().size();
|
|
if (validator.isValid(value, valueLen)) {
|
|
hasRemoval = true;
|
|
deleteComponent(urlComponent.first);
|
|
deleteComponent(urlComponent.second);
|
|
}
|
|
}
|
|
}
|
|
|
|
return hasRemoval;
|
|
}
|
|
|
|
/// Convenience overload: collect the matching path pairs with matchPath() and hand them to the
/// pair-based removePath().
///
/// @param matcher   selects the path components
/// @param validator decides which matches get deleted
/// @return result of the pair-based removePath()
bool UrlParser::removePath(const UrlComponent::Matcher &matcher, const UrlComponent::Validator &validator) {
	return removePath(matchPath(matcher), validator);
}
|
|
|
|
/// Collect the path components that have a value, are not deleted and match.
///
/// @param matcher selects the path components
/// @return pointers to the matching entries of m_paths
std::vector<UrlComponent *> UrlParser::matchPathParam(const UrlComponent::Matcher &matcher) {
	std::vector<UrlComponent *> matches;

	// skip the scan entirely when every path component is already deleted
	if (m_pathsDeleteCount != m_paths.size()) {
		for (auto it = m_paths.begin(); it != m_paths.end(); ++it) {
			if (!it->isDeleted() && it->hasValue() && matcher.isMatching(*it)) {
				matches.push_back(&(*it));
			}
		}
	}

	return matches;
}
|
|
|
|
/// Const flavour of matchPathParam(): collect the path components that have a value, are not
/// deleted and match.
///
/// @param matcher selects the path components
/// @return read-only pointers to the matching entries of m_paths
std::vector<const UrlComponent *> UrlParser::matchPathParam(const UrlComponent::Matcher &matcher) const {
	std::vector<const UrlComponent *> matches;

	// skip the scan entirely when every path component is already deleted
	if (m_pathsDeleteCount != m_paths.size()) {
		for (auto it = m_paths.begin(); it != m_paths.end(); ++it) {
			if (!it->isDeleted() && it->hasValue() && matcher.isMatching(*it)) {
				matches.push_back(&(*it));
			}
		}
	}

	return matches;
}
|
|
|
|
/// Delete the given path parameters that the validator accepts (forwards to removeComponent()).
///
/// @param urlComponents candidate path parameters
/// @param validator     decides which candidates get deleted
/// @return true when at least one component was deleted
bool UrlParser::removePathParam(const std::vector<UrlComponent *> &urlComponents, const UrlComponent::Validator &validator) {
	const bool removedAny = removeComponent(urlComponents, validator);
	return removedAny;
}
|
|
|
|
/// Delete the path parameters selected by the matcher and accepted by the validator.
///
/// @param matcher   selects the path parameters (see matchPathParam())
/// @param validator decides which matches get deleted
/// @return true when at least one component was deleted
bool UrlParser::removePathParam(const UrlComponent::Matcher &matcher, const UrlComponent::Validator &validator) {
	return removeComponent(matchPathParam(matcher), validator);
}
|
|
|
|
/// Collect the query components that are not deleted and match.
///
/// @param matcher selects the query components
/// @return pointers to the matching entries of m_queries
const std::vector<UrlComponent *> UrlParser::matchQueryParam(const UrlComponent::Matcher &matcher) {
	std::vector<UrlComponent *> matches;

	// skip the scan entirely when every query component is already deleted
	if (m_queriesDeleteCount != m_queries.size()) {
		for (auto it = m_queries.begin(); it != m_queries.end(); ++it) {
			if (!it->isDeleted() && matcher.isMatching(*it)) {
				matches.push_back(&(*it));
			}
		}
	}

	return matches;
}
|
|
|
|
/// Const flavour of matchQueryParam(): collect the query components that are not deleted and match.
///
/// @param matcher selects the query components
/// @return read-only pointers to the matching entries of m_queries
const std::vector<const UrlComponent *> UrlParser::matchQueryParam(const UrlComponent::Matcher &matcher) const {
	std::vector<const UrlComponent *> matches;

	// skip the scan entirely when every query component is already deleted
	if (m_queriesDeleteCount != m_queries.size()) {
		for (auto it = m_queries.begin(); it != m_queries.end(); ++it) {
			if (!it->isDeleted() && matcher.isMatching(*it)) {
				matches.push_back(&(*it));
			}
		}
	}

	return matches;
}
|
|
|
|
bool UrlParser::removeQueryParam(const char *param) {
|
|
static const UrlComponent::Validator s_validator(0, 0, true, ALLOW_ALL, MANDATORY_NONE);
|
|
|
|
return removeQueryParam(UrlComponent::Matcher(param), s_validator);
|
|
}
|
|
|
|
/// Delete the given query parameters that the validator accepts (forwards to removeComponent()).
///
/// @param urlComponents candidate query parameters
/// @param validator     decides which candidates get deleted
/// @return true when at least one component was deleted
bool UrlParser::removeQueryParam(const std::vector<UrlComponent *> &urlComponents, const UrlComponent::Validator &validator) {
	const bool removedAny = removeComponent(urlComponents, validator);
	return removedAny;
}
|
|
|
|
/// Delete the query parameters selected by the matcher and accepted by the validator.
///
/// @param matcher   selects the query parameters (see matchQueryParam())
/// @param validator decides which matches get deleted
/// @return true when at least one component was deleted
bool UrlParser::removeQueryParam(const UrlComponent::Matcher &matcher, const UrlComponent::Validator &validator) {
	const std::vector<UrlComponent *> matches = matchQueryParam(matcher);
	return removeComponent(matches, validator);
}
|