privacore-open-source-searc.../Url.cpp
Ivan Skytte Jørgensen db43287908 Removed unused Url::getIp() method
It was called in one place but the result was ignored. Member m_ip was only used internally in Url::set(), so it was changed to a local variable.
2018-08-27 14:35:30 +02:00

2705 lines
82 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "Url.h"
#include "UrlParser.h"
#include "Domains.h"
#include "HashTable.h"
#include "FxCheckAdult.h"
#include "ip.h" // atoip ( s,len)
#include "Punycode.h"
#include "SafeBuf.h"
#include "Sanity.h"
#include "GbMutex.h"
#include "ScopedLock.h"
#include "GbUtil.h"
#include "gbmemcpy.h"
#include <vector>
#include <algorithm>
#ifdef _VALGRIND_
#include <valgrind/memcheck.h>
#endif
/// Default constructor: leaves the object in the cleared state that
/// reset() produces.
Url::Url() {
    this->reset();
}
/// Clears every parsed component of the url: the url buffer becomes the
/// empty string, all component pointers become null and all lengths and
/// port fields become zero.
void Url::reset() {
    // the url text itself
    m_url[0] = '\0';
    m_ulen   = 0;

    // pointers to the individual components
    m_scheme    = nullptr;
    m_host      = nullptr;
    m_path      = nullptr;
    m_filename  = nullptr;
    m_extension = nullptr;
    m_query     = nullptr;
    m_domain    = nullptr;
    m_tld       = nullptr;
    m_portPtr   = nullptr;

    // component lengths
    m_slen  = 0;
    m_hlen  = 0;
    m_dlen  = 0;
    m_mdlen = 0;
    m_qlen  = 0;
    m_elen  = 0;

    // these were added to the reset list for Coverity
    m_plen   = 0;
    m_flen   = 0;
    m_tldLen = 0;

    // port information
    m_port       = 0;
    m_defPort    = 0;
    m_portLen    = 0;
    m_portPtrLen = 0;
}
void Url::set( const Url *baseUrl, const char *s, int32_t len, bool addWWW, bool stripParams,
bool stripCommonFile, int32_t titledbVersion ) {
reset();
if ( ! baseUrl ) {
set( s, len, addWWW, false, false, titledbVersion );
return;
}
char *base = (char *) baseUrl->m_url;
int32_t blen = baseUrl->m_ulen;
// don't include cgi crap
if ( baseUrl->m_query ) {
blen -= ( baseUrl->m_qlen + 1 );
}
// . adjust length of the base url.
// . if base url does not end in / then it must have a m_filename at
// the end, therefore we should strip the m_filename
if ( blen > 0 && base[blen - 1] != '/' ) {
while ( blen > 0 && base[blen - 1] != '/' ) {
blen--;
}
}
if ( blen == 0 && len == 0 ) {
return;
}
// if empty string / an url fragment, use baseUrl
if (len == 0 || s[0] == '#') {
set(baseUrl);
return;
}
// . fix baseurl = "http://xyz.com/poo/all" and s = "?page=3"
// . if "s" starts with ? then keep the filename in the base url
if (s[0] == '?') {
for ( ; base[blen] && base[blen] != '?'; blen++ ) {
;
}
}
// skip s over spaces
const char *send = s + len;
while ( s < send && is_wspace_a( *s ) ) {
s++;
len--;
}
// . is s a relative url? search for ://, but break at first /
// . but break at any non-alnum or non-hyphen
bool isAbsolute = false;
int32_t i;
for ( i = 0; i < len && ( is_alnum_a( s[i] ) || s[i] == '-' ); i++ ) {
;
}
if ( !isAbsolute ) {
isAbsolute = (i + 2 < len && s[i + 0] == ':' && s[i + 1] == '/'); // some are missing both /'s!
}
if ( !isAbsolute ) {
isAbsolute = (i + 2 < len && s[i + 0] == ':' && s[i + 1] == '\\');
}
// or if s starts with // then it's also considered absolute!
if ( !isAbsolute && len > 1 && s[0] == '/' && s[1] == '/' ) {
isAbsolute = true;
}
// watch out for idiots
if ( !isAbsolute && len > 1 && s[0] == '\\' && s[1] == '\\' ) {
isAbsolute = true;
}
// don't use base if s is not relative
if ( blen==0 || isAbsolute ) {
set( s, len, addWWW, stripParams, false, titledbVersion );
return;
}
// . if s starts with / then hack of base's m_path
// . careful not to hack of the port, if any
// . blen = baseUrl->m_slen + 3 + baseUrl->m_hlen;
if ( len > 0 && s[0]=='/' )
blen = baseUrl->m_path - baseUrl->m_url ;
char temp[MAX_URL_LEN * 2 + 1];
strncpy( temp, base, blen );
if ( len > MAX_URL_LEN ) {
len = MAX_URL_LEN - 2;
}
// if s does NOT start with a '/' then add one here in case baseUrl
// does NOT end in one.
// fix baseurl = "http://xyz.com/poo/all" and s = "?page=3"
if ( len > 0 && s[0] != '/' && s[0] != '?' && temp[blen - 1] != '/' ) {
temp[blen++] = '/';
}
strncpy( temp + blen, s, len );
temp[blen+len] = '\0';
set( temp, blen + len, addWWW, stripParams, stripCommonFile, titledbVersion );
}
/// Decides whether the NUL-terminated string "hh" looks like a session id.
///
/// Only the characters '0'-'9' and lowercase 'a'-'f' are accepted; any other
/// character (including uppercase hex letters) rejects the string outright.
/// The whole string is examined rather than just its first 12 characters,
/// because the hex letters may only show up later on.
///
/// @return true when the string is at least 12 characters long and holds at
///         least one 'a'-'f' letter. Purely decimal strings are rejected
///         because lots of product ids and dates use only decimal digits.
static bool isSessionId ( const char *hh ) {
    int32_t length       = 0;
    bool    sawHexLetter = false;

    for ( const char *c = hh; *c != '\0'; ++c ) {
        const bool isDecimal   = ( *c >= '0' && *c <= '9' );
        const bool isHexLetter = ( *c >= 'a' && *c <= 'f' );

        // anything outside [0-9a-f] cannot be part of a session id
        if ( !isDecimal && !isHexLetter ) {
            return false;
        }

        if ( isHexLetter ) {
            sawHexLetter = true;
        }
        ++length;
    }

    return sawHexLetter && length >= 12;
}
/// Removes ONE session-id / tracking parameter from the url in "s", editing
/// the buffer in place.
///
/// The part of the url starting at the first '?' or ';' is probed for a fixed
/// list of parameter names, in the order written below; the first name that
/// hits decides which parameter is removed (later names are not tried). The
/// match is dropped again if it is directly preceded by an alphanumeric
/// character (so "DAVESID=" is not mistaken for "SID=") or if the parameter
/// is too short.
///
/// Examples of urls carrying such parameters:
///   http://www.b.com/p.jhtml;jsessionid=J4QMFWBG1SPRVWCKUUXCJ0W?pp=1
///   http://www.b.com/generic.html;$sessionid$QVBMODQAAAGNA?pid=7
///   http://www.b.com/?PHPSESSID=737aec14eb7b360983d4fe39395&p=1
///   http://www.b.com/cat.cgi/process?mv_session_id=xrf2EY3q&p=1
///   http://www.b.com/default?SID=f320a739cdecb4c3edef67e&p=1
///
/// @param s   NUL-terminated url buffer; modified in place
/// @param len in: length of "s" in bytes; out: length after the removal.
///            Left untouched when nothing is removed.
static void stripParametersv122( char *s, int32_t *len ) {
    // find where the parameters start: the first '?' or ';'
    char *p = s;
    while ( *p && *p != '?' && *p != ';' ) p++;

    // bail if there is neither a '?' nor a ';'
    if ( ! *p ) {
        return;
    }

    // tt = start of the matched parameter name, x = length of that name
    // (including the trailing '=' or '/' where the pattern has one)
    char *tt = NULL;
    int32_t x = 0;

    if ( ! tt ) { tt = gb_strcasestr ( p, "PHPSESSID=" ); x = 10;}
    if ( ! tt ) { tt = strstr ( p , "SID=" ); x = 4;}

    // . osCsid and XTCsid are new session ids
    // . keep this up here so "sid=" doesn't override it
    if ( ! tt ) {
        tt = strstr ( p , "osCsid=" );
        x = 7;
        if ( ! tt ) tt = strstr ( p , "XTCsid=" );
        // the value must be accepted by isSessionId()
        if ( tt && ! isSessionId ( tt + x ) )
            tt = NULL;
    }
    if ( ! tt ) {
        tt = strstr ( p , "osCsid/" );
        x = 7;
        // the value must be accepted by isSessionId()
        if ( tt && ! isSessionId ( tt + x ) )
            tt = NULL;
    }

    // this is a new session id thing
    if ( ! tt ) {
        tt = strstr ( p , "sid=" ); x = 4;
        // the value must be accepted by isSessionId()
        if ( tt && ! isSessionId ( tt + x ) )
            tt = NULL;
    }

    // fixes for bug of matching plain &sessionid= first and
    // then realizing char before is an alnum...
    if ( ! tt ) { tt = gb_strcasestr ( p, "jsessionid="); x = 11; }
    if ( ! tt ) { tt = gb_strcasestr ( p, "vbsessid=" ); x = 9;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "asesessid=" ); x = 10; }
    if ( ! tt ) { tt = gb_strcasestr ( p, "nlsessid=" ); x = 9; }
    if ( ! tt ) { tt = gb_strcasestr ( p, "psession=" ); x = 9; }
    if ( ! tt ) { tt = gb_strcasestr ( p, "session_id="); x = 11;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "sessionid=" ); x = 10;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "sessid=" ); x = 7;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "session=" ); x = 8;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "session/" ); x = 8; }
    if ( ! tt ) { tt = gb_strcasestr ( p, "POSTNUKESID=");x = 12;}

    // some new session ids as of Feb 2005
    if ( ! tt ) { tt = gb_strcasestr ( p, "auth_sess=" ); x = 10; }
    if ( ! tt ) { tt = gb_strcasestr ( p, "mysid=" ); x = 6; }
    if ( ! tt ) { tt = gb_strcasestr ( p, "oscsid=" ); x = 7; }
    if ( ! tt ) { tt = gb_strcasestr ( p, "cg_sess=" ); x = 8; }
    if ( ! tt ) { tt = gb_strcasestr ( p, "galileoSession");x=14; }

    // new as of Jan 2006. is hurting news5 collection on gb6
    if ( ! tt ) { tt = gb_strcasestr ( p, "sess=" ); x = 5; }

    // .php?s=8af9d6d0d59e8a3108f3bf3f64166f5a&
    // .php?s=eae5808588c0708d428784a483083734&
    // .php?s=6256dbb2912e517e5952caccdbc534f3&
    // The ".php" sits in front of the '?', so the search has to begin up to
    // 4 bytes before p -- but never before the start of the buffer.
    char *phpSearchStart = ( p - s >= 4 ) ? ( p - 4 ) : s;
    if ( ! tt && (tt = strstr ( phpSearchStart , ".php?s=" )) ) {
        // point to the value of the s=
        char *pp = tt + 7;
        int32_t i = 0;
        // count the leading lowercase-hexadecimal chars of the value
        while ( pp[i] &&
                ( is_digit(pp[i]) ||
                  ( pp[i]>='a' && pp[i]<='f' ) ) ) i++;
        // fewer than 32 of them: do not consider it a session id
        if ( i < 32 ) tt = NULL;
        // otherwise point to the "s=" for removal
        else { tt += 5; x = 2; }
    }

    // BR 20160117
    // http://br4622.customervoice360.com/about_us.php?SES=652ee78702fe135cd96ae925aa9ec556&frmnd=registration
    if ( ! tt ) { tt = strstr ( p , "SES=" ); x = 4;}

    // BR 20160117: Skip most common tracking parameters
    // Oracle Eloqua
    // http://app.reg.techweb.com/e/er?s=2150&lid=25554&elq=00000000000000000000000000000000&elqaid=2294&elqat=2&elqTrackId=3de2badc5d7c4a748bc30253468225fd
    if ( ! tt ) { tt = gb_strcasestr ( p, "elq="); x = 4;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "elqat="); x = 6;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "elqaid="); x = 7;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "elq_mid="); x = 8;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "elqTrackId="); x = 11;}

    // Google Analytics
    // http://kikolani.com/blog-post-promotion-ultimate-guide?utm_source=kikolani&utm_medium=320banner&utm_campaign=bpp
    if ( ! tt ) { tt = gb_strcasestr ( p, "utm_term="); x = 9;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "utm_hp_ref="); x = 11;} // Lots on huffingtonpost.com
    if ( ! tt ) { tt = gb_strcasestr ( p, "utm_source="); x = 11;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "utm_medium="); x = 11;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "utm_content="); x = 12;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "utm_campaign="); x = 13;}

    // Piwik
    if ( ! tt ) { tt = gb_strcasestr ( p, "pk_kwd="); x = 7;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "pk_source="); x = 10;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "pk_medium="); x = 10;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "pk_campaign="); x = 12;}

    // Misc
    if ( ! tt ) { tt = gb_strcasestr ( p, "trk="); x = 4;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "promoid="); x = 8;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "promCode="); x = 9;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "promoCode="); x = 10;}
    if ( ! tt ) { tt = gb_strcasestr ( p, "partnerref="); x = 11;}

    // bail if none were found
    if ( ! tt ) {
        return;
    }

    // . must not have an alnum char before it!
    // . prevent "DAVESID=" from being labeled as session id
    if ( is_alnum_a ( *(tt-1) ) ) {
        return;
    }

    // a = offset where the removal starts
    int32_t a = tt - s;
    // b = offset where the removal ends (starts out just past the name)
    int32_t b = a + x;

    // back up until we hit a ? or & or / or ;
    while ( a > 0 && s[a-1] != '?' && s[a-1] != '&' &&
            s[a-1] != '/' && s[a-1] != ';' ) a--;

    // keep the '?'
    if ( s[a]=='?' ) a++;

    // back up over any semicolon
    if ( s[a-1] == ';' ) a--;

    // advance b until we hit & or end or ? or a ';'
    while ( s[b] && s[b] != '&' && s[b] != '?' && s[b] != ';') b++;

    // skip it unless at least 5 chars lie between offset a+x and the end
    if ( b - (a + x) < 5 ) {
        return;
    }

    // go over a & or a ;
    if ( s[b] == '&' || s[b] == ';' ) b++;

    // remove the parameter by moving the tail of the url over it
    memmove ( &s[a] , &s[b] , *len - b );

    // reduce length
    *len -= (b-a);

    // if s ends in ? or & or ;, backup
    while ( *len > 0 && (s[*len-1]=='?'||s[*len-1]=='&'||s[*len-1]==';'))
        (*len)--;

    // NUL terminate
    s[*len] = '\0';
}
static void stripParameters(UrlParser *urlParser) {
/// @todo ALC reorder parameter?
/// if we have ?abc=123&def=456
/// wouldn't it be the same as ?def=456&abc=123
/// @todo ALC login pages?
/// should we even spider them?
static const UrlComponent::Validator s_defaultParamValidator(0, 0, true, ALLOW_ALL, MANDATORY_NONE);
// 3 different component that we can remove from
// - path (we have a much more restrictive criteria on path to avoid removing valid path)
// eg: http://www.example.com/search/keywords/chardonnay/osCAdminID/45de8edd68f8bc05e9fde0d2c528a619/sort/3d/
//
// - path param
// eg: http://www.example.com/search/keywords,chardonnay/osCAdminID,45de8edd68f8bc05e9fde0d2c528a619/sort,3d/
// eg: http://www.example.com/search/;keywords=chardonnay;osCAdminID=45de8edd68f8bc05e9fde0d2c528a619;sort=3d/
//
// - query string
// eg: http://www.example.com/search/?keywords=chardonnay&osCAdminID=45de8edd68f8bc05e9fde0d2c528a619&sort=3d
// osCommerce (osCsid)
// eg:
// be1566df2284664244ce73ea6bed81fa09d4
// b8d15fefe8648f7f77c6e47f7bc0b881
// ddtvpkt3rpqdprsagsi52tj5o4
{
auto pathMatches = urlParser->matchPath(UrlComponent::Matcher("osCsid"));
if (!pathMatches.empty()) {
urlParser->removePath(pathMatches, UrlComponent::Validator(32, 32, true, ALLOW_HEX, MANDATORY_NONE));
urlParser->removePath(pathMatches, UrlComponent::Validator(26, 26, true, (ALLOW_DIGIT | ALLOW_ALPHA), MANDATORY_NONE));
}
urlParser->removeQueryParam(UrlComponent::Matcher("osCsid"), s_defaultParamValidator);
}
// osCommerce (osCAdminID)
// eg:
// 20d2f836fd203140dc6391b7ba3cdd82
// c40fe2ad32efad2e9cc2748a3f1f90cc
{
auto pathMatches = urlParser->matchPath(UrlComponent::Matcher("osCAdminID"));
if (!pathMatches.empty()) {
urlParser->removePath(pathMatches, UrlComponent::Validator(32, 32, true, ALLOW_HEX, MANDATORY_NONE));
urlParser->removePath(pathMatches, UrlComponent::Validator(26, 26, true, (ALLOW_DIGIT | ALLOW_ALPHA), MANDATORY_NONE));
}
urlParser->removeQueryParam(UrlComponent::Matcher("osCAdminID"), s_defaultParamValidator);
}
// XT-commerce
// eg:
// ha6n43ndtnlm53tpqgnclbv7ukkroue9k7m1e2o7t7rr5nb366a1
// 7ib1soln64vslra70ep2qcvde4s8dsm1
// big3ika24atc4j19mlaha6d906
urlParser->removePath(UrlComponent::Matcher("XTCsid", MATCH_CASE), UrlComponent::Validator(26, 52, true, (ALLOW_DIGIT | ALLOW_ALPHA), MANDATORY_NONE));
urlParser->removeQueryParam(UrlComponent::Matcher("XTCsid", MATCH_CASE), s_defaultParamValidator);
// ColdFusion
// http://help.adobe.com/en_US/ColdFusion/9.0/Developing/WSc3ff6d0ea77859461172e0811cbec0c35c-7fef.html#WSc3ff6d0ea77859461172e0811cbec22c24-7cbf
// ColdFusion (CTOKEN)
// eg:
// e718cd6cc29050df-8051DC1E-C29B-554E-6DFF6B5D2704A9A5
// 92566684.html
// 94175176
// 322257
{
auto pathMatches = urlParser->matchPath(UrlComponent::Matcher("CFTOKEN"));
if (!pathMatches.empty()) {
urlParser->removePath(pathMatches, UrlComponent::Validator(52, 52, true, ALLOW_ALL, MANDATORY_NONE));
urlParser->removePath(pathMatches, UrlComponent::Validator(10, 14, true, ALLOW_ALL, MANDATORY_PUNCTUATION));
urlParser->removePath(pathMatches, UrlComponent::Validator(6, 0, true, ALLOW_DIGIT, MANDATORY_NONE));
}
urlParser->removePathParam(UrlComponent::Matcher("CFTOKEN"), s_defaultParamValidator);
urlParser->removeQueryParam(UrlComponent::Matcher("CFTOKEN"), s_defaultParamValidator);
}
// ColdFusion (CFID)
urlParser->removePath(UrlComponent::Matcher("CFID"), UrlComponent::Validator(0, 0, true, ALLOW_DIGIT, MANDATORY_NONE));
urlParser->removePathParam(UrlComponent::Matcher("CFID"), s_defaultParamValidator);
urlParser->removeQueryParam(UrlComponent::Matcher("CFID"), s_defaultParamValidator);
urlParser->removeQueryParam(UrlComponent::Matcher("cftokenPass"), s_defaultParamValidator);
/// SAP load balancer
// https://help.sap.com/saphelp_nw70/helpdata/de/f2/d7914b8deb48f090c0343ef1d907f0/content.htm
urlParser->removePathParam(UrlComponent::Matcher("saplb_*"), s_defaultParamValidator);
// Atlassian
// https://developer.atlassian.com/confdev/confluence-plugin-guide/writing-confluence-plugins/form-token-handling
// 3 different format
// eg:
// AFP6-ISR2-ZLJY-KBY3|926a76e0017be6a18e889d2ddffb0aaab21865c1|lout
// 56c1bb338d5ad3ac262dd4e97bda482efc151f30
// 15BWJdAr0U
{
auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("atl_token"));
if (!queryMatches.empty()) {
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(65, 0, true, ALLOW_ALL, MANDATORY_NONE));
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(40, 40, true, ALLOW_HEX, MANDATORY_NONE));
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(10, 10, true, (ALLOW_ALPHA | ALLOW_DIGIT), MANDATORY_NONE));
}
}
// psession
// eg:
// 491022863920110420135759
// 7d01p6qvcl2e72j8ivmppk12k0
// XUjuplcPFGlJD2ZF5O26ApqAj5ZNEZwZrUKX5kkA
urlParser->removeQueryParam(UrlComponent::Matcher("psession"), UrlComponent::Validator(24, 0, true, (ALLOW_ALPHA | ALLOW_DIGIT), MANDATORY_NONE));
// Galileo
// eg:
// 65971783A4.z17ZHFAI
// 63105032A6BFxgQFfV8
urlParser->removeQueryParam(UrlComponent::Matcher("GalileoSession"), UrlComponent::Validator(19, 19, true, ALLOW_ALL, MANDATORY_NONE));
// postnuke
// normally it would be hex string length of 32. but shorter length exist (looks to be chopped off somehow)
// eg:
// 549178d5035b622229a39cd5baf75d2a
// 4ed3b0a832d4687020b05ce70
urlParser->removeQueryParam(UrlComponent::Matcher("POSTNUKESID"), UrlComponent::Validator(16, 32, true, ALLOW_HEX, MANDATORY_NONE));
// jsessionid
// eg:
// C14778D1240A6CFEE5417030DDB37D41
urlParser->removePath(UrlComponent::Matcher("jsessionid"), UrlComponent::Validator(32, 32, false, ALLOW_HEX, MANDATORY_NONE));
urlParser->removePathParam(UrlComponent::Matcher("jsessionid", MATCH_PARTIAL), UrlComponent::Validator(20, 0, true, ALLOW_ALL, MANDATORY_NONE));
urlParser->removeQueryParam(UrlComponent::Matcher("jsessionid", MATCH_PARTIAL), UrlComponent::Validator(20, 0, true, ALLOW_ALL, MANDATORY_NONE));
// phpsessid
// eg:
// 7711
// 4g8v6ndp6gnnc4tagn8coam0n7
// 414c6917961d5b4998973d1613b7926f
// qfou95mlih5jjans36kevj2pti7p847v6bl79f03nrvtaadif6u0
urlParser->removePath(UrlComponent::Matcher("PHPSESSID"), UrlComponent::Validator(26, 32, false, (ALLOW_ALPHA | ALLOW_DIGIT), MANDATORY_NONE));
urlParser->removeQueryParam(UrlComponent::Matcher("PHPSESSID", MATCH_PARTIAL), s_defaultParamValidator);
// auth_sess
// mostly job sites (same group?)
// eg:
// 7ofc7ep3i8g6i2foinq6uks7e0
// 6ce228460946fc4b3ed154abea1530b8
urlParser->removeQueryParam(UrlComponent::Matcher("auth_sess"), UrlComponent::Validator(26, 32, true, (ALLOW_DIGIT | ALLOW_ALPHA), MANDATORY_NONE));
// ps_sess_id
// eg:
// 0056c53b03ee56c8b791a5cf061a910d
urlParser->removeQueryParam(UrlComponent::Matcher("ps_sess_id"), UrlComponent::Validator(32, 32, true, ALLOW_HEX, MANDATORY_NONE));
// mysid
// eg:
// c357e16d973188ad99cc3e32a059e805
// 11GeUYNB4fCVXeySSumKM3
// hNrnd87gxn9LU0X-N-4TS2
// glwcjvci
{
auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("mysid"));
if (!queryMatches.empty()) {
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(32, 32, ALLOW_HEX, MANDATORY_NONE));
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(22, 22, ALLOW_ALL, MANDATORY_ALPHA));
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(8, 8, ALLOW_ALPHA, MANDATORY_NONE));
}
}
// sid
// eg:
// 3565de85-0bf0-47d3-8fb3-80120d6b60a6
// 8E67BB91-5056-9000-2C8C1473A967F273
// 0b721aa1c34b75fcf41e17304537d965
// 3KnGJS3ga7ae891-33115175851.04
// v0uqho4nv0mnghv4ap3ieeqp94
// K6FYyt
{
auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("sid"));
if (!queryMatches.empty()) {
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(30, 0, ALLOW_ALL, MANDATORY_NONE));
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(26, 26, (ALLOW_ALPHA | ALLOW_DIGIT), MANDATORY_NONE));
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(6, 6, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA_LOWER | MANDATORY_ALPHA_UPPER)));
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(6, 6, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA_LOWER | MANDATORY_DIGIT)));
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(6, 6, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA_UPPER | MANDATORY_DIGIT)));
}
}
// SES
// eg:
// 74339eda735516fd51ed1c5eb6bc76ceav
// 39a11261f58150fd4327a80da6daafa0
// 99cj5cbf6g8irau20h1hkvr8o6
{
auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("ses"));
if (!queryMatches.empty()) {
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(34, 34, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA | MANDATORY_DIGIT)));
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(32, 32, ALLOW_HEX, MANDATORY_NONE));
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(26, 26, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA | MANDATORY_DIGIT)));
}
}
// s
// eg:
// 4d9ae8a969305848227e5d6d7d0fb9672bd38d96
// 81cfba6ed9b66a8ad0df43c2f3d259bd
{
auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("s"));
if (!queryMatches.empty()) {
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(40, 40, ALLOW_HEX, MANDATORY_ALPHA_HEX));
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(32, 32, ALLOW_HEX, MANDATORY_ALPHA_HEX));
}
}
// session_id
// eg:
// NiHhUceSP6At57u0
// ospnr7npc97urgoi1p9i9kd1e4
urlParser->removeQueryParam(UrlComponent::Matcher("session_id"), UrlComponent::Validator(16, 0, ALLOW_ALL, MANDATORY_ALPHA));
// sessionid
// eg:
// 094104BqFHWLmUCiZAMvgboVyVFiIKDqRPJCxIUMZIPNkMVJVK
// 1a0d43d9a6753940649bbaeb56f01176
// ej3fa4fe7eikfb8ej1fd6
// ObUlshp63oxfnZzvCzwe
// mN3XmQ{hXgsK8jY7VUm8
urlParser->removeQueryParam(UrlComponent::Matcher("sessionid"), UrlComponent::Validator(20, 0, ALLOW_ALL, MANDATORY_ALPHA));
// other session id variations
// sessid (vbSESSID, asesessid, nlsessid, GLBSESSID, sessid, etc ...)
// eg:
// 91hpb1p3b69bu0vqruar2fpltf3b509bsdeqh1qtj1p8ugb8rpc0
// a12cb492ec7bcc9677916f02913587064d4279ed
// 50d96959db895a0adbfebd325a4a65e0
// f4db3ec33001c9759d095c6432651e39
// 82d0pbm7f6aa55no7p0rqb37r6
{
auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("sessid", MATCH_PARTIAL));
if (!queryMatches.empty()) {
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(52, 52, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA | MANDATORY_DIGIT)));
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(40, 40, ALLOW_HEX, MANDATORY_ALPHA_HEX));
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(32, 32, ALLOW_HEX, MANDATORY_ALPHA_HEX));
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(26, 26, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA | MANDATORY_DIGIT)));
}
}
// session
// eg:
// eRbInbLDoNaEr4gkIju0
// vfdplav2ske1blvadpv9du54k3
// ARC-1454710019-541634862-12401
// ARC-1454807400-18472177182-25788
// A5C45BC6DC3B436899C43B9D904FC8DE
// 7b478486-e52c-46aa-aca8-8cd446fcb79e
// 39663_1455055828_84298238456ba63d42992a
// 14185_1455099610_106560567456bb0eda9d317
// NlG8XCo5MgpctBTMRut4Gq6J5Z5de9foAe4rh3ikLQYWQmFzLR4zSHuieO8
// DMQBEXa5Z-aJ7r67ylAJ_y9H8_S2HTUaIjoafUtOjYuGcxwRefR0Q3xXzyS
// bGJL_GuP2eDGwJJzoXM9T3_LRgjAsalqaREGEBDoEERJOIMIL8Wh7Q3K3FcgHtYc9hM6CuJmVKlmmCxjmSYEhwVlOdUEX5RnUXycKSHKO5iAz2_ulWoJOZ1d7QCD2Afn9WPkXkvaJaSgjo7hcfYbBnUOXhedzMolha6kfV7hvf4mRAF700MhB350--QV0wQAur9Rz47QiX8SiRXp_vQDdwInUSfO3PqOwXfBu72w4e-JySzUf7Aj9Ks9ouOUPAn1W_GtORLLT4Gho7-Tb_IwyGVYPKF97f3VMXsTfoFqUvs
//
urlParser->removeQueryParam(urlParser->matchQueryParam(UrlComponent::Matcher("session")),
UrlComponent::Validator(20, 0, ALLOW_ALL, (MANDATORY_ALPHA | MANDATORY_DIGIT)));
// sess
// eg:
// 4be234480736093ba237bc397fb6e32d
urlParser->removeQueryParam(UrlComponent::Matcher("sess"),
UrlComponent::Validator(20, 0, (ALLOW_ALPHA | ALLOW_DIGIT), MANDATORY_NONE));
if (urlParser->getTitledbVersion() >= 124) {
// zenid
// eg:
// f-PkGiBLKfCX6tEkFg9IX0
// oolgqmle6imrmn7fank6dt35j0
// 7338c37c2d3a1a23d43b70cc07202861
// hugsngcjfn6chl4crs21mgkchff1tape
auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("zenid"));
if (!queryMatches.empty()) {
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(22, 22, ALLOW_ALL, MANDATORY_NONE));
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(26, 26, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA | MANDATORY_DIGIT)));
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(32, 32, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA | MANDATORY_DIGIT)));
}
}
if (urlParser->getTitledbVersion() >= 124) {
// sid
// eg:
// A22CTDDQDAWWOUD2AMDTDOS2B
// iss5teou1s7jn25gnr57ta50g4
// 6Ld1DQiSaTtLoxlRZV2Q4ZWa1ME6QT
// c6dc67a613ac0d459ea256e30a5c5f22
// 57D64937B89D9203F0032B416DBEEF78
// plangbfbficubs1hun1rodemqmravs9b
// 00573df4e5c7851f9e60c5e30c7529454d4279f4
// WsNYRPZQpDIqXVIYwYOrbACqDZqscupKvuWoynYsGA3Z3R8Bck
// 011gipkpcil1ce7ono5vqmtb7h7uvlcbpbd4jm2mo4bmoje94f30
auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("sid", MATCH_SUFFIX));
std::vector<UrlComponent*> queryMatchesNotWhitelisted;
for (auto &queryMatch : queryMatches) {
// whitelist
if (queryMatch->getKey().compare("newsid") == 0 || queryMatch->getKey().compare("smpagesid") == 0) {
continue;
}
queryMatchesNotWhitelisted.push_back(queryMatch);
}
if (!queryMatchesNotWhitelisted.empty()) {
urlParser->removeQueryParam(queryMatchesNotWhitelisted, UrlComponent::Validator(25, 0, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA | MANDATORY_DIGIT)));
}
}
if (urlParser->getTitledbVersion() >= 124) {
// cart_id
// eg:
// 680114.127.22580
// 4988233_28056
// 84
urlParser->removeQueryParam("cart_id");
}
// ts
// eg:
// 1422344216175
// 1425080080316
urlParser->removeQueryParam(UrlComponent::Matcher("ts"), UrlComponent::Validator(13, 13, ALLOW_DIGIT, MANDATORY_NONE));
// apache dir sort
// C={N,M,S,D} O={A,D}
// eg:
// ?C=N;O=A
if (urlParser->getQueryParamCount() <= 2) {
auto cQueryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("C", MATCH_CASE));
auto oQueryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("O", MATCH_CASE));
UrlComponent *cUrlComponent = (cQueryMatches.size() == 1) ? cQueryMatches[0] : NULL;
UrlComponent *oUrlComponent = (oQueryMatches.size() == 1) ? oQueryMatches[0] : NULL;
if (cUrlComponent) {
if (cUrlComponent->getValueLen() == 0) {
urlParser->deleteComponent(cUrlComponent);
} else if (cUrlComponent->getValueLen() == 1) {
char c = *(cUrlComponent->getValue());
if (c == 'N' || c == 'M' || c == 'S' || c == 'D') {
urlParser->deleteComponent(cUrlComponent);
}
}
}
if (oUrlComponent) {
if (oUrlComponent->getValueLen() == 0) {
urlParser->deleteComponent(oUrlComponent);
} else if (oUrlComponent->getValueLen() == 1) {
char o = *(oUrlComponent->getValue());
if (o == 'A' || o == 'D') {
urlParser->deleteComponent(oUrlComponent);
}
}
}
}
if (urlParser->getTitledbVersion() >= 124) {
// token
// eg:
// 02170932a082516cf758dbaa0a5ebab1
auto tokenMatches = urlParser->matchQueryParam(UrlComponent::Matcher("token"));
if (!tokenMatches.empty()) {
// only delete token parameter when:
// - no path; or
// - when id_product exist
if ((urlParser->getPaths()->size() == 1 && urlParser->getPaths()->front().getString().size() == 0) ||
!urlParser->matchQueryParam(UrlComponent::Matcher("id_product")).empty()) {
urlParser->removeQueryParam(tokenMatches, UrlComponent::Validator(32, 32, ALLOW_HEX, MANDATORY_NONE));
}
}
}
if (urlParser->getTitledbVersion() >= 124) {
// random
urlParser->removeQueryParam("random");
// _random
urlParser->removeQueryParam("_random");
// rand
urlParser->removeQueryParam("rand");
// _rand
urlParser->removeQueryParam("_rand");
}
// Skip most common tracking parameters
// Oracle Eloqua
// http://docs.oracle.com/cloud/latest/marketingcs_gs/OMCAA/index.html#Help/General/EloquaTrackingParameters.htm
urlParser->removeQueryParam("elqTrackId");
urlParser->removeQueryParam("elq");
urlParser->removeQueryParam("elqCampaignId");
urlParser->removeQueryParam("elqaid");
urlParser->removeQueryParam("elqat");
urlParser->removeQueryParam("elq_mid");
urlParser->removeQueryParam("elq_cid");
urlParser->removeQueryParam("elq2"); // others
// Google Analytics
// https://support.google.com/analytics/answer/1033867
urlParser->removeQueryParam("utm_source");
urlParser->removeQueryParam("utm_medium");
urlParser->removeQueryParam("utm_term");
urlParser->removeQueryParam("utm_content");
urlParser->removeQueryParam("utm_campaign");
urlParser->removeQueryParam("utm_hp_ref"); // Lots on huffingtonpost.com
urlParser->removeQueryParam("utm_rid"); // others
// https://support.google.com/analytics/answer/1033981?hl=en
// https://support.google.com/ds/answer/6292795?hl=en
urlParser->removeQueryParam("gclid");
urlParser->removeQueryParam("gclsrc");
// Piwik
// http://piwik.org/docs/tracking-campaigns/
// https://plugins.piwik.org/AdvancedCampaignReporting
urlParser->removeQueryParam("pk_campaign");
urlParser->removeQueryParam("pk_kwd");
urlParser->removeQueryParam("pk_source");
urlParser->removeQueryParam("pk_medium");
urlParser->removeQueryParam("pk_keyword");
urlParser->removeQueryParam("pk_content");
urlParser->removeQueryParam("pk_cid");
// Open Web Analytics
// https://github.com/padams/Open-Web-Analytics/wiki/Campaign-Tracking
urlParser->removeQueryParam("owa_medium");
urlParser->removeQueryParam("owa_source");
urlParser->removeQueryParam("owa_campaign");
urlParser->removeQueryParam("owa_ad");
urlParser->removeQueryParam("owa_ad_type");
// Webtrends
// http://help.webtrends.com/en/queryparameters/index.html
urlParser->removeQueryParam("wt.mc_id");
// Mailchimp
// https://apidocs.mailchimp.com/api/how-to/ecommerce.php
urlParser->removeQueryParam("mc_cid");
urlParser->removeQueryParam("mc_eid");
// Marketo
// http://developers.marketo.com/documentation/websites/lead-tracking-munchkin-js/
urlParser->removeQueryParam("mkt_tok");
// trk
// eg:
// ppro_cprof
// prc-basic
urlParser->removeQueryParam(UrlComponent::Matcher("trk"),
UrlComponent::Validator(0, 0, ALLOW_ALL, (MANDATORY_ALPHA | MANDATORY_PUNCTUATION)));
// who
// eg:
// r,Usyg2mo/krON58h7Cqp0HHHvPhsMdK5lNmP76/O/gxQb/ObopGwS3yJwoT241Hf8EMrMDicKKYtMqLKqmtywdZFGbvS6J6jbKbUd5HzTkv_FxyTEsYw1rLJr9LHquA3O
// r,yrl2BJY6LMkbtXa9k/lflCwQqzDqf/AF7zFIQoBAhI_t6U_gJztkZ/8ABugLiijm2NRXjt_LYh56mwmTv5cCNuIkgnB2cLFEfL62Gaoyddeh89cXgi9UqjWLP/Y1lD/4watUuyy2WINYipnkSygRLQ--
// r,nEBHD2D/_wnDIxXmNMZRjB1wQZikW7uTA8ZXGmCH3a1IvIXSpSv0QicLoCGpTnsBe2QR7xzvq2i2JeKu2AbpgLJaexxw5VON6yG8DP2t5oFhOdoM/kuVnhIt4PEVt1UwqKBNApZk56tTem_r5wqaF4ko65Bo5i7J67PUNHOZs3U-
// r,0/asSWWd2MeHwFRbMqZP42yZoh0UlWB2zyP9nAoa3ejKyLPsBjxivhuAY2RH6r94BV2DcmQQYxk6MYZD4Uo6cb30qgNTwVY/_rl_BjRSWosgbpRtPuMytbSX0OmxKuNedtcT27C3fJG/oia/88wI_Ec5PIerpxyPLAgXEsi78vAyuZAXymqhujGGTf6ACryR
// r,rW75z4HBqJegN3eAao88RaQcHsIgPXhAP/K1KCbI3x6dMrYllBZLlVfpuL_C0IQed0WspcLWMeT79fzDoAnb0qioGuFSnCHaZXYoH5_GZsWESFdk4CznUlTZuyeTFKsu9xblmYa56ShIKUyILXaFAI8HbNh7dpaXr7q66jIOuo_0r2_GFlbGaSScvbnAWWjH/dMPW8UZsTetZ2a9tqYaHQ--
{
auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("who"));
for (auto it = queryMatches.begin(); it != queryMatches.end(); ++it) {
if ((*it)->getValueLen() <= 130 && memcmp((*it)->getValue(), "r,", 2) == 0) {
urlParser->deleteComponent(*it);
}
}
urlParser->removeQueryParam(UrlComponent::Matcher("who"), UrlComponent::Validator(130, 0, ALLOW_ALL, MANDATORY_NONE));
}
// Misc
urlParser->removeQueryParam("partnerref");
// gallery project
if (urlParser->getTitledbVersion() >= 124) {
// g1_return / g2_return / g2_returnUrl
// eg:
// http%3A%2F%2Fgaleria.waw.net.pl%2Fwesele%3Fpage%3D1
// %2Fgallery%2FNovember-2015-trip%2FIMG_2364
urlParser->removeQueryParam("g1_return");
urlParser->removeQueryParam("g2_return");
urlParser->removeQueryParam("g2_returnUrl");
// g2_returnName
// eg:
// album
// Photo
// Mount+with+WebDAV
urlParser->removeQueryParam("g2_returnName");
// g2_authToken
// eg:
// b7cf40525e11
// 1c5a53515a3a
urlParser->removeQueryParam(UrlComponent::Matcher("g2_authToken"), UrlComponent::Validator(12, 12, ALLOW_HEX, MANDATORY_NONE));
}
if (urlParser->getTitledbVersion() >= 124) {
urlParser->removeQueryParam("redirect");
urlParser->removeQueryParam("redirect_to");
urlParser->removeQueryParam("redirectto");
urlParser->removePath(UrlComponent::Matcher("redirectto"), UrlComponent::Validator(0, 0, false, ALLOW_ALL, MANDATORY_NONE));
urlParser->removeQueryParam("redirect_uri");
urlParser->removeQueryParam("redirect_url");
urlParser->removeQueryParam("redirecturl");
urlParser->removeQueryParam("return");
urlParser->removeQueryParam("return_page");
urlParser->removeQueryParam("returnpage");
urlParser->removeQueryParam("return_to");
urlParser->removeQueryParam("returnto");
urlParser->removePath(UrlComponent::Matcher("returnto"), UrlComponent::Validator(0, 0, false, ALLOW_ALL, MANDATORY_NONE));
urlParser->removeQueryParam("returntopage");
urlParser->removeQueryParam("returntoquery");
urlParser->removeQueryParam("returntosearch");
urlParser->removeQueryParam("returntotitle");
urlParser->removeQueryParam("returntourl");
urlParser->removeQueryParam("return_uri");
urlParser->removeQueryParam("return_url");
urlParser->removePath(UrlComponent::Matcher("return_url"), UrlComponent::Validator(0, 0, false, ALLOW_ALL, MANDATORY_NONE));
urlParser->removeQueryParam("returnurl");
urlParser->removePath(UrlComponent::Matcher("returnurl"), UrlComponent::Validator(0, 0, false, ALLOW_ALL, MANDATORY_NONE));
// referer
// eg:
// aHR0cDovL3d3dy5zbGF0ZXJnYXJ0cmVsbHNwb3J0cy5jb20uYXUvaW5kZXgucGhwL2NhdGFsb2cvcHJvZHVjdC92aWV3L2lkLzE5NTgvY2F0ZWdvcnkvMjI0Lz9fX19TSUQ9VQ,,
// aHR0cDovL3d3dy5zZGpzcG9ydHMuY28udWsvc2FsZS5odG1s
// displayimage.php%3Fpid%3D185
// hoh
urlParser->removePath(UrlComponent::Matcher("referer"), UrlComponent::Validator(0, 0, false, ALLOW_ALL, MANDATORY_NONE));
urlParser->removePathParam(UrlComponent::Matcher("referer"), UrlComponent::Validator(0, 0, false, ALLOW_ALL, MANDATORY_NONE));
urlParser->removeQueryParam("referer");
urlParser->removeQueryParam("referrer");
}
if (urlParser->getTitledbVersion() >= 124) {
urlParser->removeQueryParam("afid");
urlParser->removeQueryParam("affid");
urlParser->removeQueryParam("affiliateid");
urlParser->removeQueryParam("affiliate_id");
urlParser->removeQueryParam("affiliate");
urlParser->removeQueryParam(UrlComponent::Matcher("psid"), UrlComponent::Validator(0, 0, false, (ALLOW_ALPHA | ALLOW_DIGIT), MANDATORY_ALPHA));
}
if (urlParser->getTitledbVersion() >= 124) {
// sort
urlParser->removeQueryParam("sort");
// order
urlParser->removeQueryParam("order");
}
/// @todo ALC cater for more affiliate links here
// only check domain specific logic when we have a domain
if (urlParser->getDomain()) {
if (strncmp(urlParser->getDomain(), "amazon.", 7) == 0) {
// amazon
// https://www.reddit.com/r/GameDeals/wiki/affiliate
// affiliate
urlParser->removeQueryParam("tag");
// wishlist
urlParser->removeQueryParam("coliid");
urlParser->removeQueryParam("colid");
// reference
urlParser->removeQueryParam("ref");
urlParser->removePathParam(UrlComponent::Matcher("ref"),
UrlComponent::Validator(0, 0, false, ALLOW_ALL, MANDATORY_PUNCTUATION));
} else if (strncmp(urlParser->getDomain(), "ebay.", 5) == 0) {
// ebay
// http://www.ebaypartnernetworkblog.com/en/2009/05/new-link-generator-tool-additional-information/
urlParser->removeQueryParam("icep_ff3");
urlParser->removeQueryParam("pub");
urlParser->removeQueryParam("toolid");
urlParser->removeQueryParam("campid");
urlParser->removeQueryParam("customid");
urlParser->removeQueryParam("afepn");
urlParser->removeQueryParam("pid");
}
}
}
// . url rfc = http://www.blooberry.com/indexdot/html/topics/urlencoding.htm
// . "...Only alphanumerics [0-9a-zA-Z], the special characters "$-_.+!*'(),"
// [not including the quotes - ed], and reserved characters used for their
// reserved purposes may be used unencoded within a URL."
// . i know sun.com has urls like "http://sun.com/;$sessionid=123ABC$"
// . url should be ENCODED PROPERLY for this to work properly
// Parses and normalizes the url text in t[0..tlen) into m_url and points the
// component members (m_scheme, m_host, m_domain, m_tld, m_path, m_query,
// m_filename, m_extension) into that buffer.
// . t/tlen: raw url text; a NULL or empty input leaves the object reset()
// . addWWW: prepend "www." when the host equals its domain and contains a '.'
// . stripParams: run stripParametersv122() (titledbVersion <= 122) or
//   stripParameters() on a UrlParser (newer versions)
// . stripCommonFile: drop a trailing default/index/main filename
// . titledbVersion: selects the version specific normalization rules below
void Url::set(const char *t, int32_t tlen, bool addWWW, bool stripParams, bool stripCommonFile, int32_t titledbVersion) {
#ifdef _VALGRIND_
VALGRIND_CHECK_MEM_IS_DEFINED(t,tlen);
#endif
reset();
if (!t || tlen == 0) {
return;
}
// we may add a "www." a trailing backslash and \0, ...
if (tlen > MAX_URL_LEN - 10) {
log( LOG_LIMIT, "db: Encountered url of length %" PRId32 ". Truncating to %i", tlen, MAX_URL_LEN - 10 );
tlen = MAX_URL_LEN - 10;
}
// scratch copy used by the version >= 125 whitespace cleanup; "t" is
// re-pointed at it below, so it must live until the end of the function
char stripped[MAX_URL_LEN];
if (titledbVersion >= 125) {
// skip starting spaces
while (tlen > 0 && is_wspace_a(*t)) {
++t;
--tlen;
}
// remove tab/cr/lf
std::string url(t, tlen);
url.erase(std::remove_if(url.begin(), url.end(), [](char c) { return c == 0x09 || c == 0x0A || c == 0x0D; }), url.end());
memcpy(stripped, url.c_str(), url.size());
stripped[url.size()] = '\0';
t = stripped;
tlen = url.size();
// skip ending spaces
while (tlen > 0 && is_wspace_a(t[tlen - 1])) {
--tlen;
}
}
// . skip over non-alnum chars (except - or /) in the beginning
// . if url begins with // then it's just missing the http: (slashdot)
// . watch out for hostname like: -dark-.deviantart.com(yes, it's real)
// . so all protocols are hostnames MUST start with alnum OR hyphen
while (tlen > 0 && !is_alnum_a(*t) && *t != '-' && *t != '/') {
t++;
tlen--;
}
// . stop t at first space or binary char
// . url should be in encoded form!
// . "i" is left at the first unacceptable char (or tlen) and is used to
//   truncate tlen further down
int32_t i;
int32_t nonAsciiPos = -1;
for ( i = 0 ; i < tlen ; i++ ) {
if (titledbVersion < 125 && is_wspace_a(t[i])) {
break;
}
if (!is_ascii(t[i])) {
// Sometimes the length with the null is passed in,
// so ignore nulls FIXME?
if (t[i]) {
nonAsciiPos = i;
}
break; // no non-ascii chars allowed
}
}
// a non-ascii char was found: build an ascii-only copy in "encoded"
// (punycoded host labels, url-encoded remainder) and re-enter set() with it
if ( nonAsciiPos != -1 ) {
// Try turning utf8 and latin1 encodings into punycode.
// All labels(between dots) in the domain are encoded
// separately. We don't support encoded tlds, but they are
// not widespread yet.
// If it is a non ascii domain it needs to take the form
// xn--<punycoded label>.xn--<punycoded label>.../
log(LOG_DEBUG, "build: attempting to decode unicode url %*.*s pos at %" PRId32, (int)tlen, (int)tlen, t, nonAsciiPos);
char encoded [ MAX_URL_LEN ];
size_t encodedLen = MAX_URL_LEN;
char *encodedDomStart = encoded;
const char *p = t;
const char *pend = t+tlen;
// Find the start of the domain
if ( tlen > 7 && strncmp( p, "http://", 7 ) == 0 ) {
p += 7;
} else if ( tlen > 8 && strncmp( p, "https://", 8 ) == 0 ) {
p += 8;
}
// copy the scheme prefix (if any) unchanged
gbmemcpy(encodedDomStart, t, p-t);
encodedDomStart += p-t;
// one iteration per host label
while (p < pend && *p != '/' && *p != ':') {
const char *labelStart = p;
uint32_t tmpBuf[MAX_URL_LEN];
int32_t tmpLen = 0;
while (p < pend && *p != '.' && *p != '/' &&
(titledbVersion < 125 || (titledbVersion >= 125 && *p != ':'))) {
p++;
}
int32_t labelLen = p - labelStart;
bool tryLatin1 = false;
// For utf8 urls
p = labelStart;
bool labelIsAscii = true;
// Convert the domain to code points and copy it to tmpbuf to be punycoded
for ( ; p - labelStart < labelLen; p += utf8Size( tmpBuf[tmpLen] ), tmpLen++ ) {
labelIsAscii = labelIsAscii && is_ascii( *p );
tmpBuf[tmpLen] = utf8Decode( p );
if ( !tmpBuf[tmpLen] ) { // invalid char?
tryLatin1 = true;
break;
}
}
// pure ascii labels are copied through as they are (with their trailing '.')
if ( labelIsAscii ) {
if ( labelStart[labelLen] == '.' ) {
labelLen++;
p++;
}
gbmemcpy( encodedDomStart, labelStart, labelLen );
encodedDomStart += labelLen;
continue;
}
if ( tryLatin1 ) {
// For latin1 urls
tmpLen = 0;
for ( ; tmpLen < labelLen; tmpLen++ ) {
tmpBuf[tmpLen] = labelStart[tmpLen];
}
}
gbmemcpy( encodedDomStart, "xn--", 4 );
encodedDomStart += 4;
// in: room left in "encoded"; out: number of bytes punycode_encode() wrote
encodedLen = MAX_URL_LEN - (encodedDomStart - encoded);
punycode_status status = punycode_encode( tmpLen, tmpBuf, NULL, &encodedLen, encodedDomStart );
if ( status != 0 ) {
// Give up? try again?
log("build: Bad Engineer, failed to "
"punycode international url %s (%" PRId32 ")",
t, (int32_t)status);
return;
}
// We should check if what we encoded were valid url characters, no spaces, etc
// FIXME: should we exclude just the bad chars? I've seen plenty of urls with
// a newline in the middle. Just discard the whole chunk for now
bool badUrlChars = false;
for ( uint32_t i = 0; i < encodedLen; i++ ) {
if ( is_wspace_a( encodedDomStart[i] ) ) {
badUrlChars = true;
break;
}
}
if ( encodedLen == 0 || badUrlChars ) {
encodedDomStart -= 4; // don't need the xn--
p++;
} else {
encodedDomStart += encodedLen;
*encodedDomStart++ = *p++; // Copy in the . or the /
}
}
// p now points to the end of the domain
// encodedDomStart now points to the first free space in encoded string
// Now copy the rest of the url in. Watch out for non-ascii chars
// truncate the url, and keep it under max url length
uint32_t newUrlLen = encodedDomStart - encoded;
while (p < pend) {
if ( ! *p ) {
break; // null?
}
if (!is_ascii(*p)) {
// url encode utf8 characters now
char cs = getUtf8CharSize(p);
// bad utf8 char?
if ( !isValidUtf8Char(p) ) {
break;
}
int maxDestLen = (cs * 3) + 1; // %XX + \0
// too long?
if ( newUrlLen + maxDestLen >= MAX_URL_LEN ) {
break;
}
char stored = urlEncode(&encoded[newUrlLen], maxDestLen, p, cs);
p += cs;
newUrlLen += stored;
continue;
}
if (is_wspace_a(*p)) {
break;
}
if (newUrlLen + 1 >= MAX_URL_LEN) {
break;
}
encoded[newUrlLen++] = *p++;
}
encoded[newUrlLen] = '\0';
// parse the ascii-only copy instead of the original text
return this->set( encoded, newUrlLen, addWWW, stripParams, stripCommonFile, titledbVersion );
}
// truncate length to the first occurrence of an unacceptable char
tlen = i;
// . jump over http:// if it starts with http://http://
// . a common mistake...
while ( tlen > 14 && ! strncasecmp ( t , "http://http://" , 14 ) ) {
t += 7;
tlen -= 7;
}
// only strip anchor for version <= 122 (we're stripping anchor in UrlParser)
if (titledbVersion <= 122) {
// strip the "#anchor" from http://www.xyz.com/somepage.html#anchor"
for (int32_t i = 0; i < tlen; i++) {
if (t[i] == '#') {
// ignore anchor if a ! follows it. 'google hash bang hack'
// which breaks the web and is now deprecated, but, there it is
if (i + 1 < tlen && t[i + 1] == '!') {
continue;
}
tlen = i;
break;
}
}
}
// copy to "s" so we can NULL terminate it
char s[MAX_URL_LEN];
int32_t len = tlen;
if (titledbVersion <= 122) {
// store filtered url into s
memcpy(s, t, tlen);
s[len] = '\0';
if (stripParams) {
stripParametersv122(s, &len);
}
} else {
UrlParser urlParser(t, tlen, titledbVersion);
if (stripParams) {
stripParameters(&urlParser);
}
// rebuild url
urlParser.unparse();
len = urlParser.getUrlParsedLen();
// keep the same headroom as the input truncation above
if (len > MAX_URL_LEN - 10) {
len = MAX_URL_LEN - 10;
}
strncpy(s, urlParser.getUrlParsed(), len);
s[len] = '\0';
}
// remove common filenames like index.html
// (each branch shortens len by one less than the matched suffix, so the
// leading '/' of the filename is kept)
if ( stripCommonFile ) {
if ( len - 14 > 0 &&
strncasecmp(&s[len-14], "/default.xhtml", 14) == 0 )
len -= 13;
else if ( len - 13 > 0 &&
( strncasecmp(&s[len-13], "/default.html", 13) == 0 ||
strncasecmp(&s[len-13], "/default.ascx", 13) == 0 ||
strncasecmp(&s[len-13], "/default.ashx", 13) == 0 ||
strncasecmp(&s[len-13], "/default.asmx", 13) == 0 ||
strncasecmp(&s[len-13], "/default.xhtm", 13) == 0 ||
strncasecmp(&s[len-13], "/default.aspx", 13) == 0 ) )
len -= 12;
else if ( len - 12 > 0 &&
( strncasecmp(&s[len-12], "/default.htm", 12) == 0 ||
strncasecmp(&s[len-12], "/default.php", 12) == 0 ||
strncasecmp(&s[len-12], "/default.asp", 12) == 0 ||
strncasecmp(&s[len-12], "/index.xhtml", 12) == 0 ) )
len -= 11;
else if ( len - 11 > 0 &&
( strncasecmp(&s[len-11], "/index.html", 11) == 0 ||
strncasecmp(&s[len-11], "/index.aspx", 11) == 0 ||
strncasecmp(&s[len-11], "/index.xhtm", 11) == 0 ||
strncasecmp(&s[len-11], "/default.pl", 11) == 0 ||
strncasecmp(&s[len-11], "/default.cs", 11) == 0 ) )
len -= 10;
else if ( len - 10 > 0 &&
( strncasecmp(&s[len-10], "/index.htm", 10) == 0 ||
strncasecmp(&s[len-10], "/index.php", 10) == 0 ||
strncasecmp(&s[len-10], "/index.asp", 10) == 0 ||
strncasecmp(&s[len-10], "/main.html", 10) == 0 ||
strncasecmp(&s[len-10], "/main.aspx", 10) == 0 ) )
len -= 9;
else if ( len - 9 > 0 &&
( strncasecmp(&s[len-9], "/index.pl", 9) == 0 ||
strncasecmp(&s[len-9], "/main.htm", 9) == 0 ||
strncasecmp(&s[len-9], "/main.php", 9) == 0 ) )
len -= 8;
else if ( len - 8 > 0 &&
( strncasecmp(&s[len-8], "/main.pl", 8) == 0 ) )
len -= 7;
s[len] = '\0';
}
// replace the "\" with "/" -- a common mistake
int32_t j;
for ( j = 0 ; s[j] ; j++)
{
if (s[j]=='\\')
{
s[j]='/';
}
}
// . dig out the protocol/scheme for this s (check for ://)
// . protocol may only have alnums and hyphens in it
for ( i = 0 ; s[i] && (is_alnum_a(s[i]) || s[i]=='-') ; i++ );
// if we have a legal protocol, then set "m_scheme", "slen" and "sch"
// and advance i to the m_host
if ( i + 2 < len && s[i]==':' && s[i+1]=='/' && s[i+2]=='/')
{
// copy lowercase protocol to "m_url"
to_lower3_a ( s , i + 3 , m_url );
m_scheme = m_url;
m_slen = i;
m_ulen = i + 3;
i += 3;
}
else
// scheme followed by a single slash ("http:/host"): repair it to "://"
if (i + 2 < len && s[i]==':' && s[i+1]=='/'&& is_alnum_a(s[i+2]))
{
// copy lowercase protocol to "m_url"
to_lower3_a ( s , i + 2 , m_url );
// add in needed /
m_url[i+2]='/';
m_scheme = m_url;
m_slen = i;
m_ulen = i + 3;
i += 2;
}
else
// no scheme at all: assume http
{
gbmemcpy ( m_url,"http://" , 7 );
m_scheme = m_url;
m_slen = 4;
m_ulen = 7;
i = 0;
// if s started with // then skip that (slashdot)
if ( s[0]=='/' && s[1]=='/' ) i = 2;
}
// . now &s[i] should point to the m_host name
// . chars allowed in hostname = period,alnum,hyphen,underscore
// . stops at '/' or ':' or any other disallowed character
j = i;
while (s[j] && (is_alnum_a(s[j]) || s[j]=='.' || s[j]=='-'||s[j]=='_'))
j++;
// copy m_host into "s" (make it lower case, too)
to_lower3_a ( s + i, j - i, m_url + m_ulen );
m_host = m_url + m_ulen;
m_hlen = j - i;
// common mistake: if hostname ends in a . then back up
while ( m_hlen > 0 && m_host[m_hlen-1]=='.' ) m_hlen--;
// NULL terminate for strchr()
m_host [ m_hlen ] = '\0';
// advance m_ulen to end of hostname
m_ulen += m_hlen;
// . Test if hostname is in a.b.c.d format
// . this returns 0 if not a valid ip string
int32_t ip = atoip ( m_host , m_hlen );
// advance i to the : for the port, if it exists
i = j;
// NULL terminate m_host for getTLD(), getDomain() and strchr() below
m_host [ m_hlen ] = '\0';
// use ip as domain if we're just an ip address like 192.0.2.1
if ( ip ) {
// ip address has no tld, or mid domain
m_tld = NULL;
m_tldLen = 0;
// but it does have a domain (1.2.3)
m_domain = getDomainOfIp ( m_host , m_hlen , &m_dlen );
// just use the domain as the mid domain for ip-based urls
m_mdlen = m_dlen;
}
// . otherwise, get the tld
// . uses thorough list of tlds in Domains.cpp
else if ( ( m_tld = ::getTLD ( m_host, m_hlen ) ) && m_tld > m_host ) {
// set m_domain if we had a tld that's not equal to our host
m_tldLen = strlen ( m_tld );
m_domain = ::getDomain ( m_host , m_hlen , m_tld , &m_dlen );
// set the mid domain length (-1 for the '.')
m_mdlen = m_dlen - m_tldLen - 1;
}
// otherwise, we're no ip and we have no valid domain
else {
m_domain = NULL;
m_dlen = 0;
m_tldLen = 0;
m_mdlen = 0;
}
// . if domain same as host then we might insert a "www." server name
// . however, must have a period in domain name
// . otherwise a domain name of "xxx" would become "www.xxx" and if
// Url::set() is called on that it would be "www.www.xxx" (bad bad)
// . let's only add "www." if there's only 1 period, ok?
if ( ! ip && addWWW && m_host == m_domain && strchr(m_host,'.') ) {
// shift the host right by 4 and keep the pointers into it in sync
memmove ( m_host + 4 , m_host , m_hlen );
gbmemcpy ( m_host , "www." , 4 );
if ( m_domain ) m_domain += 4;
if ( m_tld ) m_tld += 4;
m_ulen += 4;
m_hlen += 4;
}
// set the default port based on the protocol
m_defPort = 80;
if ( m_slen==5 && strncmp(m_scheme, "https",5)==0 ) m_defPort = 443;
// assume we're using the default port for this scheme/protocol
m_port = m_defPort;
// see if a port was provided in the hostname after a colon
if ( s[i] == ':' ) {
// remember the ptr so far
int32_t savedLen = m_ulen;
// add a colon to our m_url
m_url [ m_ulen++ ] = ':';
// scan for a '/'
j = i + 1;
while ( s[j] && s[j]!='/') m_url[m_ulen++] = s[j++];
// NOTE(review): m_portPtr points into the local buffer "s", so it
// dangles once set() returns; only m_portPtrLen stays meaningful
m_portPtr = s + i + 1;
m_portPtrLen = j - (i + 1);
// now read our port
m_port = atol2(m_portPtr, m_portPtrLen);
// if it's the default port, then remove what we copied
if ( m_port == m_defPort ) m_ulen = savedLen;
// make i point to the root / in the m_path, if any
i = j;
}
// how many chars is taken up by a specified port?
m_portLen = 0;
if ( m_port != m_defPort ) {
m_portLen += 2; // :3
if ( m_port >= 10 ) m_portLen += 1;
if ( m_port >= 100 ) m_portLen += 1;
if ( m_port >= 1000 ) m_portLen += 1;
if ( m_port >= 10000 ) m_portLen += 1;
}
// append a '/' to m_url then bail if there is no m_path after the port
if ( s[i] != '/') {
m_path = m_url + m_ulen;
m_path[0] = '/';
m_plen = 1;
m_url[ ++m_ulen ]='\0';
return;
}
// . get the m_path and m_path length
// . j,i should point to start of path slash '/'
// . scan so it points to end or a ? or #
j = i;
// now we include # as part of the path if it is a hash bang '#!'
// which was the web-breaking google hack that is now deprecated
while ( s[j] && s[j]!='?' ) {
if ( s[j] == '#' && s[j+1] != '!' )
break;
j++;
}
// point the path inside m_url even though we haven't written it yet
m_path = m_url + m_ulen;
// m_plen temporarily holds the offset where the path starts; it is turned
// into the real path length after the copy loop
m_plen = m_ulen;
// . deal with weird things in the path
// . i points to start of path (should be /)
for (; i < j ; i++ ) {
// dedup double slashes
// ensure m_ulen >= m_plen so we don't hurt "http:///" ...
// but people sometimes put http:// in the *path*
if ( s[i] == '/' && m_url[m_ulen-1] == '/' &&
m_ulen-1 >= m_plen &&
m_ulen >= 2 && m_url[m_ulen-2] != ':' ) continue;
// handled by UrlParser for version 123 and above
if (titledbVersion <= 122) {
// deal with current directories in the m_path
if ( s[i] == '.' && m_url[m_ulen-1] == '/' &&
(i+1 == j || s[i+1]=='/')) continue;
// . deal with damned ..'s in the m_path
// . if next 2 chars are .'s and last char we wrote was '/'
if ( s[i] == '.' && s[i+1]=='.' && (s[i+2] == '/' || s[i+2] == '\0') && m_url[m_ulen-1] == '/' ) {
// dont back up over first / in path
if ( m_url + m_ulen - 1 > m_path ) m_ulen--;
while ( m_url[m_ulen-1] != '/' ) m_ulen--;
// skip i to next / after these 2 dots
while ( s[i] && s[i]!='/' ) i++;
continue;
}
}
// don't allow ; before the ?...probably because of stripped
// sessionId...
// I was going to add other possible dup separators, but now
// it seems as though it might cause problems
if (s[i] == ';' && s[i+1] == '?') continue;
// store char and advance to next
m_url[m_ulen++] = s[i];
}
// reset the path length in case we had to remove some weird stuff
m_plen = m_ulen - m_plen;
// . get the m_query
// . the query is anything after the path that starts with ?
// . NOTE: we ignore strings beginning with '#' (page relative anchors)
if ( i < len && s[i] != '#' ) {
//remove back to back &'s in the cgi query
//http://www.nyasatimes.com/national/politics/160.html?print&&&
char *kstart = s + i;
char *kend = s + i + (len - i);
char *dst = m_url + m_ulen;
for ( char *k = kstart ; k < kend ; k++ ) {
// skip & if we just did one
if ( *k == '&' && k > kstart && *(k-1)=='&' ) continue;
// copy over one char at a time
*dst++ = *k;
}
// point after the '?' i guess
m_query = m_url + m_ulen + 1;
m_qlen = dst - m_query;
m_ulen += m_qlen + 1;
}
// get the m_filename from the m_path (m_flen might be 0)
m_flen = 0;
while (m_path[m_plen-1-m_flen]!='/' && m_flen<m_plen) m_flen++;
m_filename = m_path + m_plen - m_flen;
// get the m_extension from the m_path
m_elen = 0;
while (is_alnum_a(m_path[m_plen-1-m_elen]) && m_elen < m_plen)m_elen++;
if ( m_path[ m_plen-1-m_elen] != '.' ) m_elen = 0; // no m_extension
m_extension = m_path + m_plen - m_elen;
// null terminate our s
m_url[ m_ulen ]='\0';
}
// Returns true when the url is nothing but a bare "/" path with no query.
// The hostname is deliberately not inspected, so hosts such as espn.go.com
// or *.deviantart.com count as roots too.
bool Url::isRoot() const {
	const bool barePath = ( m_plen == 1 && m_path && m_path[0] == '/' );
	return barePath && ! m_query;
}
// Returns true when the host is exactly the domain, or is four characters
// longer than the domain and starts with "www.".
bool Url::isSimpleSubdomain ( ) const {
	// host and domain are one and the same
	const bool hostIsDomain = ( m_host == m_domain && m_hlen == m_dlen );
	if ( hostIsDomain ) return true;
	// otherwise only a "www." prefix of the right length qualifies
	return m_hlen == m_dlen + 4 && strncmp ( m_host , "www." , 4 ) == 0;
}
// . get length of sub-url #j
// . basically like adding j /.. to the end of the url
// . sub-url #0 is the full url
// . includes /~ as its own path
// Returns the length of the j-th sub-url: the url without its query, cut back
// to the j-th enclosing directory boundary counted from the end. j == 0 gives
// the whole query-less url; 0 is returned when no such sub-url exists.
int32_t Url::getSubUrlLen ( int32_t j ) const {
	// drop the "?query" tail, if there is one
	int32_t end = m_query ? m_ulen - ( m_qlen + 1 ) : m_ulen;
	if ( j == 0 ) return end;

	// never shrink into "scheme://host[:port]/"
	const int32_t rootEnd = m_slen + 3 + m_hlen + 1 + m_portLen;

	for ( ; end > rootEnd ; --end ) {
		const char tail = m_url [ end - 1 ];
		// a trailing '/' or a trailing "/~" each close one level
		if ( tail == '/' || ( tail == '~' && m_url [ end - 2 ] == '/' ) ) --j;
		// the boundary char itself stays part of the sub-url
		if ( j == 0 ) return end;
	}
	// ran out of path before reaching level j
	return 0;
}
// . similar to getSubUrlLen() above but only works on the path
// . if j is 0 that's the whole url path!
int32_t Url::getSubPathLen ( int32_t j ) const {
int32_t subUrlLen = getSubUrlLen ( j );
if ( subUrlLen <= 0 ) return 0;
// . the subPath length includes the root backslash
// . portLen includes the whole :8080 thing (for non default ports)
return subUrlLen - m_slen - 3 - m_hlen - m_portLen;
}
// Dumps every parsed component of this url, plus its hashes, to the trace log.
// Component pointers may be NULL (after reset(), or when the url has no
// query/domain/tld); they are replaced by a literal before reaching logf()
// because handing NULL to %s or %.*s is undefined behaviour.
void Url::print() const {
	auto orEmpty = []( const char *p ) { return p ? p : ""; };

	logf(LOG_TRACE, "Url info");
	logf(LOG_TRACE, "\turl : %s", m_url);
	logf(LOG_TRACE, "\turlhash32 : %" PRIx32, getUrlHash32());
	logf(LOG_TRACE, "\turlhash48 : %" PRIx64, getUrlHash48());
	logf(LOG_TRACE, "\turlhash64 : %" PRIx64, getUrlHash64());
	logf(LOG_TRACE, "\thost : %.*s", m_hlen, orEmpty(m_host));
	logf(LOG_TRACE, "\thosthash32 : %" PRIx32, getHostHash32());
	// label used to read "hosthash48" although the 64-bit hash is printed
	logf(LOG_TRACE, "\thosthash64 : %" PRIx64, getHostHash64());
	logf(LOG_TRACE, "\tscheme : %.*s", m_slen, orEmpty(m_scheme));
	logf(LOG_TRACE, "\tpath : %.*s", m_plen, orEmpty(m_path));
	// keep the "(null)" text glibc used to print for a missing query
	logf(LOG_TRACE, "\tquery : %s", m_query ? m_query : "(null)");
	logf(LOG_TRACE, "\tport : %" PRId32, m_port);
	logf(LOG_TRACE, "\tdomain : %.*s", m_dlen, orEmpty(m_domain));
	logf(LOG_TRACE, "\tdomainhash32 : %" PRIx32, getDomainHash32());
	logf(LOG_TRACE, "\tdomainhash64 : %" PRIx64, getDomainHash64());
	logf(LOG_TRACE, "\ttld : %.*s", m_tldLen, orEmpty(m_tld));
	// the mid domain is the first m_mdlen chars of the domain
	logf(LOG_TRACE, "\tmid domain : %.*s", m_mdlen, orEmpty(m_domain));
	logf(LOG_TRACE, "\tis root : %i", isRoot());
}
// Counts the '/' characters that follow the leading slash of the path, up to
// the end of the url (so slashes inside a query are counted as well).
// . countFilename: when true and the url does not end in '/', the trailing
//   component is counted as one more level
// . returns 0 for a url that has not been set (no path)
int32_t Url::getPathDepth ( bool countFilename ) const {
	// an unset url has m_path == NULL and m_ulen == 0; without this guard
	// both "m_path + 1" and "*(send-1)" below would read invalid memory
	if ( ! m_path ) return 0;
	const char *s = m_path + 1;
	const char *send = m_url + m_ulen;
	int32_t count = 0;
	while ( s < send ) if ( *s++ == '/' ) count++;
	// if we're counting the filename as a path component...
	if ( countFilename && *(send-1) != '/' ) count++;
	return count;
}
// Returns true when the host begins with the literal prefix "www.".
bool Url::isHostWWW ( ) const {
	// too short to carry the prefix at all
	if ( m_hlen < 4 ) return false;
	return strncmp ( m_host , "www." , 4 ) == 0;
}
// . is the url a porn/adult url?
// . i use /usr/share/dict/words to check for legit words
// . if it's long and has 4+ hyphens, consider it spam
// . if you add a word here, add it to PageResults.cpp:isQueryDirty()
// Returns true when isAdultTLD() flags this url's TLD, false otherwise.
// The hostname based heuristics further down (hyphen counting and per-label
// isAdultUrl() checks) are compiled out by the "#if 0" and have no effect.
bool Url::isAdult() const {
//certain TLDs are clearly adult-oriented
if( isAdultTLD(m_tld, m_tldLen))
return true;
// --- disabled legacy hostname checks start here ---
#if 0
//@@@
if(m_hlen<=0)
return false; // Invalid URL (no hostname)
if(m_tldLen<=0)
return false; // no TLD
// store the hostname in a buf since we strtok it
char s [ MAX_URL_LEN ];
// don't store the .com or .org while searching for isSpam
int32_t slen = m_hlen - m_tldLen - 1;
gbmemcpy ( s , m_host , slen );
if ( ! m_domain ) return false;
if ( ! m_dlen ) return false;
//int32_t len = m_dlen;
//gbmemcpy ( s , m_domain , len );
// if tld is gov or edu or org, not porn
if ( m_tldLen >= 3 && strncmp ( m_tld , "edu" , 3 )==0 ) return false;
if ( m_tldLen >= 3 && strncmp ( m_tld , "gov" , 3 )==0 ) return false;
// NULL terminate for strstr
s[slen]='\0';
// . if there is 4 or more hyphens, and hostLen > 30 consider it spam
// . actually there seems to be a lot of legit sites with many hyphens
if ( slen > 30 ) {
int32_t count = 0;
char *p = s;
while ( *p ) if ( *p++ == '-' ) count++;
if ( count >= 4 ) return true;
}
//
// TODO: use getMatch()!!!! +pts -pts system
//
// check each thing separated by periods for porn
const char *send = s + slen;
const char *p = s;
while(p<send) {
// find the next period or hyphen
const char *pend = p;
while ( pend < send && *pend != '.' && *pend !='-' ) pend++;
// check that
if ( isAdultUrl ( p , pend - p ) ) return true;
// point to next
p = pend + 1;
}
#endif
// TLD was not flagged
return false;
}
// . remove any session id
// . i'm sick of these things causing dup problems
// . types:
// http://www.b.com/?PHPSESSID=737aec14eb7b360983d4fe39395
// http://www.b.com/cat.cgi/process?mv_session_id=xrf2EY3q&
// http://www.b.com/default?SID=f320a739cdecb4c3edef67e
// http://www.b.com/generic.html;$sessionid$QVBMODQAAAGNA?pid=7
// http://www.b.com/p.jhtml;jsessionid=J4QMFWBG1SPRVWCKUUXCJ0W?stuff=1
// look for ';'
// look for PHPSESSID, session_id, SID, jsessionid
// followed by string of at least 4 letters/numbers
//List of extensions NOT to parse
// Legacy list of file extensions whose urls are treated as non-indexable.
// Url::hasNonIndexableExtension() loads every entry into s_badExtTable with
// the value 50 on first use; entries that are commented out here are left
// indexable on purpose. Newer extensions are not in this array, they are
// added directly inside Url::hasNonIndexableExtension().
static const char * const s_badExtensions[] = {
"ai",
"aif",
"aifc",
"aiff",
"asc",
"au",
"avi",
"bcpio",
"bin",
"bmp",
"bz2",
//"c",
//"cc",// c source code, allow
"ccad",
"cdf",
//"class",// text source code file usually, allow
"cpio",
"cpt",
//"csh",
"css",
"dcr",
"dir",
"dms",
//"doc",
"drw",
"dvi",
"dwg",
"dxf",
"dxr",
"eps",
"etx",
"exe",
"ez",
//"f", // ambigous
"f90",
"fli",
"gif",
"gtar",
"gz",
//"h",
"hdf",
"hh",
"hqx",
//"htm",
//"html",
"ice",
"ief",
"iges",
"igs",
"ips",
"ipx",
"jpe",
"jpeg",
"jpg",
//"js",
"kar",
"latex",
"lha",
"lsp",
"lzh",
//"m", // ambiguous
"man",
"me",
"mesh",
"mid",
"midi",
"mif",
"mime",
"mov",
"movie",
"mp2",
"mp3",
"mpe",
"mpeg",
"mpg",
"mpga",
"ms",
"msh",
"nc",
"oda",
"pbm",
"pdb",
//"pdf",
"pgm",
"pgn",
"png",
"pnm",
"pot",
"ppm",
"pps",
// "ppt",
"ppz",
"pre",
"prt",
// "ps",
"qt",
"ra",
"ram",
"ras",
"rgb",
"rm",
"roff",
"rpm",
"deb", // debian/ubuntu package file
"rtf",
"rtx",
"scm",
"set",
"sgm",
"sgml",
//"sh", // shells are text files
"shar",
"silo",
"sit",
"skd",
"skm",
"skp",
"skt",
"smi",
"smil",
"snd",
"sol",
"spl",
"src",
"step",
"stl",
"stp",
"sv4cpio",
"sv4crc",
"swf",
//"t", // ambiguous ... Mr.T.
"tar",
"tcl",
"tex",
"texi",
"texinfo",
"tif",
"tiff",
"tr",
"tsi",
"tsp",
"tsv",
//"txt",
"unv",
"ustar",
"vcd",
"vda",
"viv",
"vivo",
"vrml",
"wav",
"wrl",
"xbm",
"xlc",
"xll",
"xlm",
//"xls",
"xlw",
//"xml",
"xpm",
"xwd",
"xyz",
"zip",//
};//look below, I added 3 more types for TR version 73
// number of entries in s_badExtensions
static const size_t s_badExtensionsCount = sizeof(s_badExtensions)/sizeof(s_badExtensions[0]);
// maps hash64Lower_a() of an extension to the version number from which
// Url::hasNonIndexableExtension() reports it as non-indexable
static HashTable s_badExtTable;
// set once s_badExtTable has been completely filled
static bool s_badExtInitialized;
// held by Url::hasNonIndexableExtension() while it fills and reads the table
static GbMutex s_badExtTableMutex;
//returns True if the extension is listed as bad
// Returns true when this url's extension is registered as non-indexable for
// the given version, i.e. the version stored for it in s_badExtTable is
// non-zero and not greater than "version".
// The table is filled on the first call, under s_badExtTableMutex. If filling
// fails, an error is logged, false is returned and the next call retries.
bool Url::hasNonIndexableExtension( int32_t version ) const {
	if ( ! m_extension || m_elen == 0 ) return false;

	ScopedLock sl(s_badExtTableMutex);

	if ( ! s_badExtInitialized ) {
		// version 72 and before: the static legacy list
		for ( size_t n = 0 ; n < s_badExtensionsCount ; n++ ) {
			const char *legacyExt = s_badExtensions[n];
			int legacyLen = strlen(legacyExt);
			int64_t legacyHash = hash64Lower_a(legacyExt,legacyLen);
			if ( s_badExtTable.addKey(legacyHash,(int32_t)50) ) continue;
			log(LOG_ERROR,"hasNonIndexableExtension: Could not add hash %" PRId64" to badExtTable.", legacyHash);
			return false;
		}

		// version 73 and after
		const char * const since73[] = { "wmv", "wma", "ogg" };
		for ( const char *ext : since73 ) {
			const int extLen = strlen(ext);
			if ( s_badExtTable.addKey(hash64Lower_a(ext, extLen),(int32_t)73) ) continue;
			log(LOG_ERROR,"hasNonIndexableExtension: Could not add hash to badExtTable (2).");
			return false;
		}

		// more unwanted extensions, version 122 and after
		// (xls, pptx and xlsx are left out: a converter should handle them)
		const char * const since122[] = {
			"7z", "lz", "xz",
			"apk", "com", "dic", "dll", "dmg", "flv", "gpx", "ico",
			"iso", "kmz", "mp4", "rar", "svg", "vcf",
			"lzma", "thmx", "zipx",
			"zsync", "torrent", "manifest"
		};
		for ( const char *ext : since122 ) {
			const int extLen = strlen(ext);
			if ( s_badExtTable.addKey(hash64Lower_a(ext, extLen),(int32_t)122) ) continue;
			log(LOG_ERROR,"hasNonIndexableExtension: Could not add hash to badExtTable (3).");
			return false;
		}

		s_badExtInitialized = true;
	}

	int lookupKey = hash64Lower_a(m_extension,m_elen);
	int32_t sinceVersion = s_badExtTable.getValue(lookupKey);
	// 0 means "not in the table"; otherwise bad only from sinceVersion onwards
	return sinceVersion != 0 && sinceVersion <= version;
}
// Returns true when the url's extension is "xml", compared case-insensitively.
bool Url::hasXmlExtension ( ) const {
	// only a three character extension can match
	if ( ! m_extension || m_elen != 3 ) return false;

	char lowered[4];
	for ( int k = 0 ; k < 3 ; k++ ) {
		lowered[k] = to_lower_a(m_extension[k]);
	}
	lowered[3] = '\0';

	return memcmp(lowered, "xml", 3) == 0;
}
// Returns true when the url's extension is "json", compared case-insensitively.
// The guard used to reject m_elen >= 4, which made the four character "json"
// comparison unreachable, so the function returned false for every url.
bool Url::hasJsonExtension ( ) const {
	// only a four character extension can match
	if ( ! m_extension || m_elen != 4 ) return false;

	// 4 chars + terminator
	char ext[5];
	for ( int i = 0 ; i < 4 ; i++ ) {
		ext[i] = to_lower_a(m_extension[i]);
	}
	ext[4] = '\0';

	return memcmp(ext, "json", 4) == 0;
}
// Returns true when the url's extension is "js", compared case-insensitively.
bool Url::hasScriptExtension ( ) const {
	// only a two character extension can match
	if ( ! m_extension || m_elen != 2 ) return false;

	char lowered[3];
	lowered[0] = to_lower_a(m_extension[0]);
	lowered[1] = to_lower_a(m_extension[1]);
	lowered[2] = '\0';

	return memcmp(lowered, "js", 2) == 0;
}
// see Url.h for a description of this.
bool Url::isLinkLoop ( ) const {
const char *s = m_path ;
const char *send = m_url + m_ulen;
int32_t count = 0;
int32_t components = 0;
bool prevWasDouble = false;
const char *last = NULL;
if (!s) return false;
// use this hash table to hash each path component in the url
char buf [ 5000 ];
HashTable t; t.set ( 100 , buf , 5000 );
// grab each path component
for ( ; s < send ; s++ ) {
if ( *s != '/' ) continue;
// ok, add this guy to the hash table, if we had one
if ( ! last ) { last = s; continue; }
// give up after 50 components
if ( components++ >= 50 ) return false;
// hash him
uint32_t h = hash32 ( last , s - last );
// is he in there?
int32_t slot = t.getSlot ( h );
// get his val (count)
int32_t val = 0;
if ( slot >= 0 ) val = t.getValueFromSlot ( slot );
// if not in there put him in a slot
if ( slot < 0 ) {
last = s;
t.addKey ( h , 1 );
continue;
}
// increment it
val++;
// does it occur 3 or more times? if so, we have a link loop
if ( val >= 3 ) return true;
// is it 2 or more?
if ( val == 2 ) count++;
// if we have two such components, then we are a link loop.
// BUT, we must be a pair!
if ( count >= 2 && prevWasDouble ) return true;
// set this so in case next guy is a double
if ( val == 2 ) prevWasDouble = true;
else prevWasDouble = false;
// add it back after incrementing
t.setValue ( slot , val );
// update "last"
last = s;
}
return false;
}
//
// here are some examples of link loops in urls:
//
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/letters/send/archive/bish/archive/bish/letters/bish/archive/lette\rs/send/archive/letters/send/bish/letters/archive/bish/letters/
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/bish/letters/archive/bish/archive/letters/send/archive/letters/send/archive/le\tters/send/archive/letters/send/bish/
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/bish/letters/send/archive/letters/send/archive/bish/archive/bish/\archive/bish/letters/send/archive/letters/archive/letters/send/archive/bish/let\ters/
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/letters/send/archive/letters/archive/bish/archive/bish/archive/bi\sh/letters/send/archive/bish/archive/letters/send/bish/archive/bish/letters/sen\d/archive/
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/bish/letters/send/archive/bish/letters/bish/letters/send/archive/\bish/archive/letters/bish/letters/send/archive/bish/letters/send/bish/archive/l\etters/bish/letters/archive/letters/send/
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/bish/letters/send/archive/bish/letters/send/bish/archive/letters/\send/bish/archive/letters/send/archive/letters/bish/archive/bish/archive/letter\s/
// . basic validity check on the parsed url; only the port is inspected
// . valid means: m_port lies in 1..65535 AND m_portPtrLen is at most 5
//   (m_portPtrLen is presumably the length of the port text in the url)
bool Url::isValid() const {
	const bool portInRange  = ( m_port > 0 && m_port <= 65535 );
	const bool portTextFits = ( m_portPtrLen <= 5 );
	return portInRange && portTextFits;
}
// . does the hostname look like a numeric ip address?
// . false if there is no host or if the host does not start with a digit
// . otherwise the result of atoip() on the host is converted to bool, so
//   any non-zero value counts as "is an ip"
bool Url::isIp() const {
	if ( ! m_host || ! is_digit(*m_host) ) {
		return false;
	}
	return atoip ( m_host , m_hlen );
}
int32_t Url::getHash32WithWWW ( ) const {
uint32_t hh = hash32n ( "www." );
int32_t conti = 4;
hh = hash32_cont ( m_domain , m_dlen , hh , &conti );
return hh;
}
// 32-bit hash of the hostname bytes (m_host, m_hlen)
int32_t Url::getHostHash32 ( ) const {
	const int32_t hostHash = hash32 ( m_host , m_hlen );
	return hostHash;
}
// 64-bit hash of the hostname bytes (m_host, m_hlen)
int64_t Url::getHostHash64 ( ) const {
	const int64_t hostHash = hash64 ( m_host , m_hlen );
	return hostHash;
}
// 32-bit hash of the domain bytes (m_domain, m_dlen)
int32_t Url::getDomainHash32 ( ) const {
	const int32_t domainHash = hash32 ( m_domain , m_dlen );
	return domainHash;
}
// 64-bit hash of the domain bytes (m_domain, m_dlen)
int64_t Url::getDomainHash64 ( ) const {
	const int64_t domainHash = hash64 ( m_domain , m_dlen );
	return domainHash;
}
// 32-bit hash of the complete url text (m_url, m_ulen)
int32_t Url::getUrlHash32 ( ) const {
	const int32_t urlHash = hash32 ( m_url , m_ulen );
	return urlHash;
}
// 64-bit hash of the complete url text (m_url, m_ulen)
int64_t Url::getUrlHash64 ( ) const {
	const int64_t urlHash = hash64 ( m_url , m_ulen );
	return urlHash;
}
// . fast, non-validating extraction of the hostname from a url
// . "url" is expected to look like <scheme>://<host>[:<port>][/...]
// . returns a pointer INTO "url" at the first character of the hostname;
//   the hostname is NOT NUL-terminated, use *hostLen for its length
// . if hostLen is non-NULL, *hostLen receives the hostname length, which
//   never includes a trailing ":<port>"
// . if port is non-NULL AND the url carries an explicit ":<port>", *port is
//   set to atoi() of the text following the ':'. Without an explicit port
//   *port is left untouched, so callers should preset their default.
// . if "url" contains no ':' or ends before the "://" is complete, the
//   returned pointer is the terminating NUL and *hostLen is 0. (This used to
//   read past the end of the string.)
const char *getHostFast ( const char *url , int32_t *hostLen , int32_t *port ) {
	const char *pp = url;
	// skip the scheme, i.e. everything up to the first ':'
	while ( *pp && *pp != ':' ) pp++;
	// skip the 3 chars of "://", but never step over the terminating NUL
	for ( int32_t i = 0 ; i < 3 && *pp ; i++ ) pp++;
	// the hostname starts here
	const char *uhost = pp;
	// the hostname ends at '/', at ":<port>" or at the end of the string
	while ( *pp && *pp !='/' && *pp !=':' ) pp++;
	// hand out the explicit port, if the caller wants it
	if ( *pp == ':' && port ) *port = atoi(pp+1);
	// the port is deliberately NOT counted as part of the hostname
	if ( hostLen ) *hostLen = pp - uhost;
	return uhost;
}
// . fast, non-validating lookup of where the path starts in a url
// . "url" is expected to look like <scheme>://<host>[:<port>][/...]
// . returns a pointer INTO "url": either at the '/' that starts the path or,
//   if the url has no path, at the terminating NUL
// . if "url" contains no ':' or ends before the "://" is complete, the
//   terminating NUL is returned. (This used to read past the end of the
//   string.)
char *getPathFast ( char *url ) {
	char *pp = url;
	// skip the scheme, i.e. everything up to the first ':'
	while ( *pp && *pp != ':' ) pp++;
	// skip the 3 chars of "://", but never step over the terminating NUL
	for ( int32_t i = 0 ; i < 3 && *pp ; i++ ) pp++;
	// skip the hostname: it ends at '/', at ":<port>" or at the end
	while ( *pp && *pp !='/' && *pp !=':' ) pp++;
	// skip an explicit port, if any
	if ( *pp == ':' ) {
		while ( *pp && *pp != '/' ) pp++;
	}
	return pp;
}
// . fast, non-validating extraction of the TLD from a url
// . if hasHttp is true "url" is expected to start with "<scheme>://",
//   otherwise it is expected to start right at the hostname
// . returns a pointer INTO "url" at the start of the TLD as reported by
//   ::getTLD(), or NULL if the hostname contains no alphabetic character
//   (ip address or empty host) or ::getTLD() finds no TLD
// . if tldLen is non-NULL and a TLD was found, *tldLen receives the number
//   of chars from the TLD start to the end of the hostname
// . a url that ends before the "://" is complete yields an empty hostname
//   and thus NULL. (This used to read past the end of the string.)
const char *getTLDFast(const char *url, int32_t *tldLen, bool hasHttp) {
	const char *pp = url;
	if ( hasHttp ) {
		// skip the scheme, i.e. everything up to the first ':'
		while ( *pp && *pp != ':' ) pp++;
		// skip the 3 chars of "://", but never step over the NUL
		for ( int32_t i = 0 ; i < 3 && *pp ; i++ ) pp++;
	}
	// the hostname starts here ...
	const char *uhost = pp;
	// ... and ends at '/', at ":<port>" or at the end of the string
	while ( *pp && *pp !='/' && *pp !=':' ) pp++;
	int32_t uhostLen = pp - uhost;
	// . a hostname without a single alphabetic char is treated as an ip
	// . ip addresses have no tld
	bool isIp = true;
	for ( const char *ss = uhost ; ss < pp ; ss++ ) {
		if ( is_alpha_a( *ss ) ) {
			isIp = false;
			break;
		}
	}
	if ( isIp ) {
		return NULL;
	}
	// get the tld
	const char *tld = ::getTLD ( uhost , uhostLen );
	if ( ! tld ) {
		return NULL;
	}
	// the tld runs up to the end of the hostname
	if ( tldLen ) {
		*tldLen = pp - tld;
	}
	return tld;
}
// . does the hostname of "url" have something in front of its domain?
// . a leading "http://" or "https://" is optional and skipped if present
// . returns true if the hostname contains no alphabetic character at all
//   (ip address; note that an empty hostname lands here as well)
// . returns false if ::getTLD() finds no TLD or no domain can be located
// . otherwise returns true only if the domain does not start at the very
//   first character of the hostname
bool hasSubdomain(const char *url) {
	const char *cursor = url;
	// step over the scheme if there is one
	if ( strncmp( cursor, "http://", 7 ) == 0 ) {
		cursor += 7;
	}
	else if ( strncmp( cursor, "https://", 8 ) == 0 ) {
		cursor += 8;
	}
	// the hostname runs up to '/', ":<port>" or the end of the string
	const char *hostStart = cursor;
	while ( *cursor && *cursor != '/' && *cursor != ':' ) cursor++;
	const char *hostEnd = cursor;
	const int32_t hostLen = hostEnd - hostStart;
	// look for the first alphabetic char in the hostname
	const char *scan = hostStart;
	while ( scan < hostEnd && ! is_alpha_a(*scan) ) scan++;
	// none found: treated like an ip address, which counts as "yes"
	if ( scan == hostEnd ) return true;
	// without a tld there is no domain
	const char *tld = ::getTLD ( hostStart , hostLen );
	if ( ! tld ) return false;
	// . the domain is the label right in front of the tld
	// . start 2 chars before the tld (i.e. before the '.' separator) and
	//   walk back to the previous '.' or to the start of the hostname
	const char *dom = tld - 2;
	while ( dom > hostStart && *dom != '.' ) dom--;
	// fix http://ok/
	if ( dom < hostStart || *dom == '/' ) return false;
	// step over the '.' we stopped on
	if ( *dom == '.' ) dom++;
	// a domain that starts after the hostname start means a subdomain
	return dom != hostStart;
}
// . fast, non-validating extraction of the domain from a url
// . if hasHttp is true "url" is expected to start with "<scheme>://",
//   otherwise it is expected to start right at the hostname
// . returns a pointer INTO "url" at the start of the domain and, if domLen
//   is non-NULL, stores the domain length in *domLen
// . for a hostname without any alphabetic character (ip address) the
//   "domain" is the hostname minus its last ".X" component, to be consistent
//   with how Url::m_domain/m_dlen is set
// . returns NULL (and leaves *domLen untouched) if url was in bad format and
//   we could not get the domain. this was happening when a host gave us a
//   bad redir url and xmldoc tried to set extra doc's robot.txt url to it
//   "http://2010/robots.txt" where the host said "Location: 2010 ...".
// . never reads outside of "url": neither past the terminating NUL when the
//   "://" is incomplete, nor in front of the hostname when scanning an
//   all-digit hostname backwards
const char *getDomFast ( const char *url , int32_t *domLen , bool hasHttp ) {
	const char *pp = url;
	if ( hasHttp ) {
		// skip the scheme, i.e. everything up to the first ':'
		while ( *pp && *pp != ':' ) pp++;
		// skip the 3 chars of "://", but never step over the NUL
		for ( int32_t i = 0 ; i < 3 && *pp ; i++ ) pp++;
	}
	// the hostname starts here ...
	const char *uhost = pp;
	// ... and ends at '/', at ":<port>" or at the end of the string
	while ( *pp && *pp !='/' && *pp !=':' ) pp++;
	const char *hostEnd = pp;
	int32_t uhostLen = hostEnd - uhost;
	// . is the hostname just an IP address?
	// . we take it for one if it has no alphabetic character
	const char *ss = uhost;
	while ( ss < hostEnd && !is_alpha_a(*ss) ) ss++;
	if ( ss == hostEnd ) {
		// . might just be empty! like "\0"
		// . fixes core dump from
		//   http://www.marcom1.unimelb.edu.au/public/contact.html
		//   parsing host email address
		if ( uhostLen == 0 ) return NULL;
		// . remove the last .X from the ip address
		// . skip back over the trailing digits, but stay inside the
		//   hostname so an all-digit host like "2010" cannot make us
		//   read in front of the buffer
		for ( hostEnd-- ; hostEnd > uhost && is_digit(*hostEnd) ; hostEnd-- );
		// must be a period
		if ( *hostEnd != '.' ) {
			log("url: getDomFast() could not find period for "
			    "hostname in url");
			return NULL;
		}
		if ( domLen ) *domLen = hostEnd - uhost;
		return uhost;
	}
	// get the tld; no tld, then no domain
	const char *utld = ::getTLD ( uhost , uhostLen );
	if ( ! utld ) return NULL;
	// . the domain is the label right in front of the tld
	// . start 2 chars before the tld (i.e. before the '.' separator) and
	//   back up until we hit a '.' or the beginning of the hostname
	const char *udom = utld - 2;
	while ( udom > uhost && *udom != '.' ) udom--;
	// fix http://ok/
	if ( udom < uhost || *udom =='/' ) return NULL;
	// if we hit '.' advance 1
	if ( *udom == '.' ) udom++;
	// the domain runs up to the end of the hostname
	if ( domLen ) *domLen = hostEnd - udom;
	return udom;
}
// "s" point to the start of a normalized url (includes http://, etc.)
// . returns a pointer INTO "s" at the first character after the "://" and
//   stores in *hostLen the number of chars up to (not including) the next
//   '/' or the end of the string
// . only '/' ends the host here, so an explicit ":<port>" IS counted in
//   *hostLen
// . the first character after the "://" is always taken as part of the
//   host, even if it is a '/'
// . hostLen must not be NULL
// . if "s" contains no ':' or ends before the "://" is complete, the
//   returned pointer is the terminating NUL and *hostLen is 0. (This used to
//   scan past the end of the string.)
const char *getHost(const char *s, int32_t *hostLen) {
	// skip proto, stopping at the end of the string if there is none
	while ( *s && *s != ':' ) s++;
	// skip the 3 chars of "://", but never step over the terminating NUL
	for ( int32_t i = 0 ; i < 3 && *s ; i++ ) s++;
	// that is the host
	const char *host = s;
	// the first host char is consumed unconditionally (if there is one)
	if ( *s ) s++;
	// the rest of the host runs up to the next '/' or the end
	while ( *s && *s != '/' ) s++;
	*hostLen = s - host;
	return host;
}
// "s" point to the start of a normalized url (includes http://, etc.)
// . locates the scheme, i.e. everything in front of the first "://" in "s"
// . on success returns "s" itself and stores the scheme length in
//   *schemeLen; the scheme is NOT NUL-terminated
// . if "s" contains no "://" returns an empty string literal and stores 0
// . schemeLen must not be NULL
const char *getScheme ( const char *s , int32_t *schemeLen )
{
	const char *separator = strstr(s, "://");
	if( separator )
	{
		*schemeLen = separator - s;
		return s;
	}
	// no scheme separator at all
	*schemeLen = 0;
	return "";
}
// . return ptrs to the end
// . the character it points to SHOULD NOT BE part of the site
// . "s" must be a url of the form <scheme>://<host>/... ; a url without a
//   '/' after the host is a logic error and ends in gbshutdownLogicError()
// . after the '/' that follows the host, the path is scanned until
//   (desiredDepth+1) further '/' have been seen; the returned pointer is
//   just past the last of those, or at the terminating NUL if the path has
//   fewer '/'
// . '?' and '#' get no special treatment here
// . a url without "://" now reliably ends in gbshutdownLogicError() instead
//   of scanning past the end of the string
const char *getPathEnd(const char *s, int32_t desiredDepth) {
	// skip proto, stopping at the end of the string if there is none
	while ( *s && *s != ':' ) s++;
	// skip the 3 chars of "://", but never step over the terminating NUL
	for ( int32_t i = 0 ; i < 3 && *s ; i++ ) s++;
	// the first host char is consumed unconditionally (if there is one)
	if ( *s ) s++;
	// skip the rest of the hostname
	while ( *s && *s != '/' ) s++;
	// should always have a /
	if ( *s != '/' ) gbshutdownLogicError();
	// skip that
	s++;
	// count the '/' until we are deeper than asked for
	int32_t depth = 0;
	for ( ; depth <= desiredDepth && *s ; s++ ) {
		if ( *s == '/' ) depth++;
	}
	// return the end
	return s;
}
// . pathDepth==0 for "www.xyz.com"
// . pathDepth==0 for "www.xyz.com/"
// . pathDepth==0 for "www.xyz.com/foo"
// . pathDepth==1 for "www.xyz.com/foo/"
// . pathDepth==1 for "www.xyz.com/foo/x"
// . pathDepth==2 for "www.xyz.com/foo/x/"
// . pathDepth==2 for "www.xyz.com/foo/x/y"
// . i.e. the number of '/' in the path, not counting the '/' right after
//   the hostname and not counting anything from the first '?' or '#' on
// . if hasHttp is true "s" must start with "<scheme>://"; a string without
//   any ':' is a logic error and ends in gbshutdownLogicError(). (That check
//   used to be unreachable because the scan did not stop at the end of the
//   string.)
// . if hasHttp is false "s" must start right at the hostname
// . an empty string yields 0
int32_t getPathDepth(const char *s, bool hasHttp) {
	// skip http:// if we got it
	if ( hasHttp ) {
		// skip proto, stopping at the end of the string
		while ( *s && *s != ':' ) s++;
		// must have it!
		if ( ! *s ) gbshutdownLogicError();
		// skip the 3 chars of "://", but never step over the NUL
		for ( int32_t i = 0 ; i < 3 && *s ; i++ ) s++;
	}
	// the first host char is consumed unconditionally (if there is one)
	if ( *s ) s++;
	// skip over the rest of the hostname
	while ( *s && *s != '/' ) s++;
	// no path at all, might be a site like "xyz.com"
	if ( ! *s ) return 0;
	// we are on the '/' that follows the hostname, skip it
	s++;
	// count the '/' up to the end, the query or the fragment
	int32_t depth = 0;
	for ( ; *s && *s != '?' && *s != '#' ; s++ ) {
		if ( *s == '/' ) depth++;
	}
	return depth;
}
// . is it considered safe to show the punycode-decoded form of this url's
//   host, judging by its TLD / domain?
// . TODO: use a configuration file for this or some more clever logic
// . some ccTLDs are safe because they only allow punycode for non-ascii
//   letters that are used by the country's language(s)
// . Firefox/mozilla used to use a TLD whitelist, but then switched to a
//   "no mixed scripts" rule, which mostly works but fails for hosts like
//   www.<"ca" written in cyrillic letters>.com
bool Url::isPunycodeSafeTld() const {
	// ccTLDs with strict and sensible policies.
	// List inspired by firefox's old/unused whitelist.
	static const char safeCcTlds[][3] = {
		"ac", "ar", "at", "br", "ca", "ch", "cl",
		"de", "dk", "ee", "es", "fi", "fr", "gr",
		"gt", "hu", "il", "is", "jp", "li", "lt",
		"lu", "lv", "no", "nz", "pl", "se", "ua"
	};
	// only 2-letter TLDs can be on the ccTLD list
	if ( m_tldLen == 2 ) {
		for ( const auto &ccTld : safeCcTlds ) {
			if ( memcmp( m_tld, ccTld, 2 ) == 0 ) {
				return true;
			}
		}
	}
	// example.com and example.net are safe (reserved for documentation
	// purposes and we use them internally for testing). Both names are
	// 11 chars long.
	static const char * const safeDomains[] = {
		"example.com",
		"example.net"
	};
	if ( m_dlen == 11 ) {
		for ( const char *safeDomain : safeDomains ) {
			if ( memcmp( m_domain, safeDomain, 11 ) == 0 ) {
				return true;
			}
		}
	}
	return false;
}
// . does any label of the hostname start with the punycode prefix "xn--"?
// . a match only counts if it sits at the very start of the host or right
//   after a '.', i.e. at the start of a label
// . every occurrence of "xn--" is examined. Looking at the first one only
//   would miss hosts like "foo-xn--bar.xn--80ak6aa92e.com" where an earlier
//   occurrence sits in the middle of a label.
bool Url::hasPunycode() const {
	// no host, no punycode
	if ( ! m_host ) {
		return false;
	}
	const char *end = m_host + m_hlen;
	const char *cur = m_host;
	while ( cur < end ) {
		const char *s = (const char*)memmem(cur,end-cur,"xn--",4);
		// no (further) occurrence
		if ( ! s ) {
			return false;
		}
		// at the start of a label?
		if ( s == m_host || s[-1] == '.' ) {
			return true;
		}
		// mid-label occurrence, keep looking behind it
		cur = s + 1;
	}
	return false;
}
// . appends the hostname to "sb" with every punycode label ("xn--...")
//   replaced by its decoded utf8 form; all other labels are copied verbatim
// . labels are separated by '.', the separators are copied as they are
// . returns false if a label fails to decode; whatever was appended to "sb"
//   for the preceding labels stays in there
// . returns true otherwise
bool Url::getPunycodeDecodedHost(SafeBuf *sb) const {
	const char * const hostEnd = m_host+m_hlen;
	for ( const char *label = m_host ; label < hostEnd ; ) {
		// the label runs up to the next '.' or to the end of the host
		const char *labelEnd = (const char*)memchr(label,'.',hostEnd-label);
		if ( !labelEnd ) {
			labelEnd = hostEnd;
		}
		const bool isPunycode = ( labelEnd-label >= 4 && memcmp(label,"xn--",4) == 0 );
		if ( isPunycode ) {
			// 64 should be enough according to DNS spec, but let's be a bit safer
			uint32_t codepoints[256];
			size_t codepointCount = 256;
			// decode what follows the "xn--" prefix
			punycode_status status = punycode_decode(labelEnd-label-4, label+4, &codepointCount, codepoints, NULL);
			if ( status != punycode_success ) {
				log(LOG_WARN, "build: Could not decode punycode '%.*s' component in host '%.*s'", (int)(labelEnd-label),label, m_hlen, m_host);
				return false;
			}
			// re-encode the decoded code points as utf8; the buffer
			// provides 4 bytes per code point
			char utf8buf[256*4];
			size_t utf8len = 0;
			for ( size_t i = 0 ; i < codepointCount ; i++ ) {
				utf8len += utf8Encode(codepoints[i],utf8buf+utf8len);
			}
			sb->safeMemcpy(utf8buf,utf8len);
		} else {
			// plain label, copy as it is
			sb->safeMemcpy(label,labelEnd-label);
		}
		// copy the separator unless this was the last label
		if ( labelEnd < hostEnd ) {
			sb->safeMemcpy(".",1);
		}
		// continue behind the separator
		label = labelEnd+1;
	}
	return true;
}
// . appends the mid domain (the first m_mdlen bytes of m_domain) to "sb"
// . if it starts with the punycode prefix "xn--" it is appended in its
//   decoded utf8 form, otherwise it is copied verbatim
// . the mid domain is handled as one single label; no '.' is ever appended
// . returns false (with nothing appended) if the punycode fails to decode,
//   true otherwise
bool Url::getPunycodeDecodedMidDomain(SafeBuf *sb) const {
	const char *s = m_domain;
	const char *end = m_domain+m_mdlen;
	// not punycode? then copy as it is
	if(end-s<4 || memcmp(s,"xn--",4)!=0) {
		sb->safeMemcpy(s,end-s);
		return true;
	}
	uint32_t decoded[256]; //64 should be enough according to DNS spec, but let's be a bit safer
	size_t decoded_count = 256;
	// decode what follows the "xn--" prefix
	punycode_status status = punycode_decode(end-s-4, s+4, &decoded_count, decoded, NULL);
	if(status!=punycode_success) {
		// note: the second string logged here is the whole host
		log(LOG_WARN, "build: Could not decode punycode '%.*s' component in middomain '%.*s'", (int)(end-s),s, m_hlen, m_host);
		return false;
	}
	// re-encode the decoded code points as utf8; the buffer provides
	// 4 bytes per code point
	char utf8buf[256*4];
	size_t utf8len = 0;
	for(size_t i=0; i<decoded_count; i++)
		utf8len += utf8Encode(decoded[i],utf8buf+utf8len);
	sb->safeMemcpy(utf8buf,utf8len);
	return true;
}
char* Url::getDisplayUrl( const char* url, SafeBuf* sb ) {
const char *urlEnd = url + strlen(url);
const char *p = url;
if ( strncmp( p, "http://", 7 ) == 0 )
p += 7;
else if ( strncmp(p, "https://", 8 ) == 0 )
p += 8;
const char *domEnd = static_cast<const char*>( memchr( p, '/', urlEnd - p ) );
if (domEnd == NULL) {
domEnd = urlEnd;
}
bool firstRun = true;
const char *found = NULL;
const char *labelCursor = url;
while( ( found = strstr( labelCursor, "xn--" ) ) && ( found < domEnd ) ) {
if ( firstRun ) {
sb->safeMemcpy( url, found - url );
firstRun = false;
}
const char* encodedStart = found + 4;
uint32_t decoded [ MAX_URL_LEN];
size_t decodedLen = MAX_URL_LEN - 1 ;
const char* labelEnd = encodedStart;
while( labelEnd < domEnd && *labelEnd != '/' && *labelEnd != '.' ) {
labelEnd++;
}
punycode_status status = punycode_decode(labelEnd - encodedStart, encodedStart, &decodedLen, decoded, NULL);
if ( status != 0 ) {
log( "build: Bad Engineer, failed to depunycode international url %s", url );
sb->safePrintf("%s", labelCursor);
sb->nullTerm();
return sb->getBufStart();
}
sb->utf32Encode( decoded, decodedLen );
if ( *labelEnd == '.' ) {
sb->pushChar( *labelEnd++ );
}
labelCursor = labelEnd;
}
// Copy in the rest
sb->safePrintf("%s", labelCursor);
sb->nullTerm();
return sb->getBufStart();
}