db43287908
It was called in one place but the result was ignored. Member m_ip was only used internally in Url::set(), so it was changed to a local variable.
2705 lines
82 KiB
C++
2705 lines
82 KiB
C++
#include "Url.h"
|
||
#include "UrlParser.h"
|
||
#include "Domains.h"
|
||
#include "HashTable.h"
|
||
#include "FxCheckAdult.h"
|
||
#include "ip.h" // atoip ( s,len)
|
||
#include "Punycode.h"
|
||
#include "SafeBuf.h"
|
||
#include "Sanity.h"
|
||
#include "GbMutex.h"
|
||
#include "ScopedLock.h"
|
||
#include "GbUtil.h"
|
||
#include "gbmemcpy.h"
|
||
#include <vector>
|
||
#include <algorithm>
|
||
|
||
#ifdef _VALGRIND_
|
||
#include <valgrind/memcheck.h>
|
||
#endif
|
||
|
||
// Default constructor: a freshly built Url starts out in the cleared
// state established by reset().
Url::Url()
{
	this->reset();
}
|
||
|
||
|
||
void Url::reset() {
|
||
m_scheme = NULL;
|
||
m_host = NULL;
|
||
m_path = NULL;
|
||
m_filename = NULL;
|
||
m_extension = NULL;
|
||
m_query = NULL;
|
||
m_domain = NULL;
|
||
m_tld = NULL;
|
||
|
||
m_url[0] = '\0';
|
||
m_ulen = 0;
|
||
m_dlen = 0;
|
||
m_slen = 0;
|
||
m_qlen = 0;
|
||
m_hlen = 0;
|
||
m_elen = 0;
|
||
m_mdlen = 0;
|
||
|
||
// Coverity
|
||
m_plen = 0;
|
||
m_flen = 0;
|
||
m_tldLen = 0;
|
||
|
||
m_port = 0;
|
||
m_defPort = 0;
|
||
m_portLen = 0;
|
||
|
||
m_portPtr = nullptr;
|
||
m_portPtrLen = 0;
|
||
}
|
||
|
||
void Url::set( const Url *baseUrl, const char *s, int32_t len, bool addWWW, bool stripParams,
|
||
bool stripCommonFile, int32_t titledbVersion ) {
|
||
reset();
|
||
|
||
if ( ! baseUrl ) {
|
||
set( s, len, addWWW, false, false, titledbVersion );
|
||
return;
|
||
}
|
||
|
||
char *base = (char *) baseUrl->m_url;
|
||
int32_t blen = baseUrl->m_ulen;
|
||
|
||
// don't include cgi crap
|
||
if ( baseUrl->m_query ) {
|
||
blen -= ( baseUrl->m_qlen + 1 );
|
||
}
|
||
|
||
// . adjust length of the base url.
|
||
// . if base url does not end in / then it must have a m_filename at
|
||
// the end, therefore we should strip the m_filename
|
||
if ( blen > 0 && base[blen - 1] != '/' ) {
|
||
while ( blen > 0 && base[blen - 1] != '/' ) {
|
||
blen--;
|
||
}
|
||
}
|
||
|
||
if ( blen == 0 && len == 0 ) {
|
||
return;
|
||
}
|
||
|
||
// if empty string / an url fragment, use baseUrl
|
||
if (len == 0 || s[0] == '#') {
|
||
set(baseUrl);
|
||
return;
|
||
}
|
||
|
||
// . fix baseurl = "http://xyz.com/poo/all" and s = "?page=3"
|
||
// . if "s" starts with ? then keep the filename in the base url
|
||
if (s[0] == '?') {
|
||
for ( ; base[blen] && base[blen] != '?'; blen++ ) {
|
||
;
|
||
}
|
||
}
|
||
|
||
// skip s over spaces
|
||
const char *send = s + len;
|
||
while ( s < send && is_wspace_a( *s ) ) {
|
||
s++;
|
||
len--;
|
||
}
|
||
|
||
// . is s a relative url? search for ://, but break at first /
|
||
// . but break at any non-alnum or non-hyphen
|
||
bool isAbsolute = false;
|
||
int32_t i;
|
||
for ( i = 0; i < len && ( is_alnum_a( s[i] ) || s[i] == '-' ); i++ ) {
|
||
;
|
||
}
|
||
|
||
if ( !isAbsolute ) {
|
||
isAbsolute = (i + 2 < len && s[i + 0] == ':' && s[i + 1] == '/'); // some are missing both /'s!
|
||
}
|
||
|
||
if ( !isAbsolute ) {
|
||
isAbsolute = (i + 2 < len && s[i + 0] == ':' && s[i + 1] == '\\');
|
||
}
|
||
|
||
// or if s starts with // then it's also considered absolute!
|
||
if ( !isAbsolute && len > 1 && s[0] == '/' && s[1] == '/' ) {
|
||
isAbsolute = true;
|
||
}
|
||
|
||
// watch out for idiots
|
||
if ( !isAbsolute && len > 1 && s[0] == '\\' && s[1] == '\\' ) {
|
||
isAbsolute = true;
|
||
}
|
||
|
||
// don't use base if s is not relative
|
||
if ( blen==0 || isAbsolute ) {
|
||
set( s, len, addWWW, stripParams, false, titledbVersion );
|
||
return;
|
||
}
|
||
|
||
// . if s starts with / then hack of base's m_path
|
||
// . careful not to hack of the port, if any
|
||
// . blen = baseUrl->m_slen + 3 + baseUrl->m_hlen;
|
||
if ( len > 0 && s[0]=='/' )
|
||
blen = baseUrl->m_path - baseUrl->m_url ;
|
||
|
||
char temp[MAX_URL_LEN * 2 + 1];
|
||
strncpy( temp, base, blen );
|
||
|
||
if ( len > MAX_URL_LEN ) {
|
||
len = MAX_URL_LEN - 2;
|
||
}
|
||
|
||
// if s does NOT start with a '/' then add one here in case baseUrl
|
||
// does NOT end in one.
|
||
// fix baseurl = "http://xyz.com/poo/all" and s = "?page=3"
|
||
if ( len > 0 && s[0] != '/' && s[0] != '?' && temp[blen - 1] != '/' ) {
|
||
temp[blen++] = '/';
|
||
}
|
||
strncpy( temp + blen, s, len );
|
||
temp[blen+len] = '\0';
|
||
|
||
set( temp, blen + len, addWWW, stripParams, stripCommonFile, titledbVersion );
|
||
}
|
||
|
||
|
||
// Returns true if the NUL-terminated string "hh" looks like a hexadecimal
// session id:
// . every character up to the terminating NUL must be 0-9 or lowercase a-f
//   (any other character, including '&' or uppercase hex, yields false)
// . it must be at least 12 characters long
// . it must contain at least one a-f letter, since product ids and dates
//   are commonly made of decimal digits only
// The whole string is scanned rather than just the first 12 characters
// because the hex letters may only show up after the 12th character.
static bool isSessionId ( const char *hh ) {
	int32_t total      = 0;
	int32_t hexLetters = 0;

	while ( *hh != '\0' ) {
		const char c = *hh;
		const bool isDecimal = ( c >= '0' && c <= '9' );

		if ( !isDecimal ) {
			// not a legal session id character at all?
			if ( c < 'a' || c > 'f' ) {
				return false;
			}
			++hexLetters;
		}

		++total;
		++hh;
	}

	return total >= 12 && hexLetters > 0;
}
|
||
|
||
static void stripParametersv122( char *s, int32_t *len ) {
|
||
// . remove session ids from s
|
||
// . ';' most likely preceeds a session id
|
||
// . http://www.b.com/p.jhtml;jsessionid=J4QMFWBG1SPRVWCKUUXCJ0W?pp=1
|
||
// . http://www.b.com/generic.html;$sessionid$QVBMODQAAAGNA?pid=7
|
||
// . http://www.b.com/?PHPSESSID=737aec14eb7b360983d4fe39395&p=1
|
||
// . http://www.b.com/cat.cgi/process?mv_session_id=xrf2EY3q&p=1
|
||
// . http://www.b.com/default?SID=f320a739cdecb4c3edef67e&p=1
|
||
|
||
// CHECK FOR A SESSION ID USING QUERY STRINGS
|
||
char *p = s;
|
||
while ( *p && *p != '?' && *p != ';' ) p++;
|
||
|
||
// bail if no ?
|
||
if ( ! *p ) {
|
||
return;
|
||
}
|
||
|
||
// now search for severl strings in the cgi query string
|
||
char *tt = NULL;
|
||
int32_t x = 0;
|
||
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "PHPSESSID=" ); x = 10;}
|
||
if ( ! tt ) { tt = strstr ( p , "SID=" ); x = 4;}
|
||
// . osCsid and XTCsid are new session ids
|
||
// . keep this up here so "sid=" doesn't override it
|
||
if ( ! tt ) {
|
||
tt = strstr ( p , "osCsid=" );
|
||
x = 7;
|
||
if ( ! tt ) tt = strstr ( p , "XTCsid=" );
|
||
// a hex sequence of at least 10 digits must follow
|
||
if ( tt && ! isSessionId ( tt + x ) )
|
||
tt = NULL;
|
||
}
|
||
if ( ! tt ) {
|
||
tt = strstr ( p , "osCsid/" );
|
||
x = 7;
|
||
// a hex sequence of at least 10 digits must follow
|
||
if ( tt && ! isSessionId ( tt + x ) )
|
||
tt = NULL;
|
||
}
|
||
// this is a new session id thing
|
||
if ( ! tt ) {
|
||
tt = strstr ( p , "sid=" ); x = 4;
|
||
// a hex sequence of at least 10 digits must follow
|
||
if ( tt && ! isSessionId ( tt + x ) )
|
||
tt = NULL;
|
||
}
|
||
// osCsid and XTCsid are new session ids
|
||
if ( ! tt ) {
|
||
tt = strstr ( p , "osCsid=" );
|
||
x = 7;
|
||
if ( ! tt ) tt = strstr ( p , "XTCsid=" );
|
||
// a hex sequence of at least 10 digits must follow
|
||
if ( tt && ! isSessionId ( tt + x ) )
|
||
tt = NULL;
|
||
}
|
||
|
||
// fixes for bug of matching plain &sessionid= first and
|
||
// then realizing char before is an alnum...
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "jsessionid="); x = 11; }
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "vbsessid=" ); x = 9;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "asesessid=" ); x = 10; }
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "nlsessid=" ); x = 9; }
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "psession=" ); x = 9; }
|
||
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "session_id="); x = 11;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "sessionid=" ); x = 10;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "sessid=" ); x = 7;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "session=" ); x = 8;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "session/" ); x = 8; }
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "POSTNUKESID=");x = 12;}
|
||
// some new session ids as of Feb 2005
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "auth_sess=" ); x = 10; }
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "mysid=" ); x = 6; }
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "oscsid=" ); x = 7; }
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "cg_sess=" ); x = 8; }
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "galileoSession");x=14; }
|
||
// new as of Jan 2006. is hurting news5 collection on gb6
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "sess=" ); x = 5; }
|
||
|
||
// .php?s=8af9d6d0d59e8a3108f3bf3f64166f5a&
|
||
// .php?s=eae5808588c0708d428784a483083734&
|
||
// .php?s=6256dbb2912e517e5952caccdbc534f3&
|
||
if ( ! tt && (tt = strstr ( p-4 , ".php?s=" )) ) {
|
||
// point to the value of the s=
|
||
char *pp = tt + 7;
|
||
int32_t i = 0;
|
||
// ensure we got 32 hexadecimal chars
|
||
while ( pp[i] &&
|
||
( is_digit(pp[i]) ||
|
||
( pp[i]>='a' && pp[i]<='f' ) ) ) i++;
|
||
// if not, do not consider it a session id
|
||
if ( i < 32 ) tt = NULL;
|
||
// point to s= for removal
|
||
else { tt += 5; x = 2; }
|
||
}
|
||
|
||
// BR 20160117
|
||
// http://br4622.customervoice360.com/about_us.php?SES=652ee78702fe135cd96ae925aa9ec556&frmnd=registration
|
||
if ( ! tt ) { tt = strstr ( p , "SES=" ); x = 4;}
|
||
|
||
// BR 20160117: Skip most common tracking parameters
|
||
// Oracle Eloqua
|
||
// http://app.reg.techweb.com/e/er?s=2150&lid=25554&elq=00000000000000000000000000000000&elqaid=2294&elqat=2&elqTrackId=3de2badc5d7c4a748bc30253468225fd
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "elq="); x = 4;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "elqat="); x = 6;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "elqaid="); x = 7;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "elq_mid="); x = 8;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "elqTrackId="); x = 11;}
|
||
|
||
// Google Analytics
|
||
// http://kikolani.com/blog-post-promotion-ultimate-guide?utm_source=kikolani&utm_medium=320banner&utm_campaign=bpp
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "utm_term="); x = 9;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "utm_hp_ref="); x = 11;} // Lots on huffingtonpost.com
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "utm_source="); x = 11;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "utm_medium="); x = 11;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "utm_content="); x = 12;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "utm_campaign="); x = 13;}
|
||
|
||
// Piwik
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "pk_kwd="); x = 7;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "pk_source="); x = 10;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "pk_medium="); x = 10;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "pk_campaign="); x = 12;}
|
||
|
||
// Misc
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "trk="); x = 4;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "promoid="); x = 8;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "promCode="); x = 9;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "promoCode="); x = 10;}
|
||
if ( ! tt ) { tt = gb_strcasestr ( p, "partnerref="); x = 11;}
|
||
|
||
// bail if none were found
|
||
if ( ! tt ) {
|
||
return;
|
||
}
|
||
|
||
// . must not have an alpha char before it!
|
||
// . prevent "DAVESID=" from being labeled as session id
|
||
if ( is_alnum_a ( *(tt-1) ) ) {
|
||
return;
|
||
}
|
||
|
||
// start of the shit
|
||
int32_t a = tt - s;
|
||
|
||
// get the end of the shit
|
||
int32_t b = a + x;
|
||
|
||
// back up until we hit a ? or & or / or ;
|
||
while ( a > 0 && s[a-1] != '?' && s[a-1] != '&' &&
|
||
s[a-1] != '/' && s[a-1] != ';' ) a--;
|
||
|
||
// keep the '?'
|
||
if ( s[a]=='?' ) a++;
|
||
|
||
// back up over any semicolon
|
||
if ( s[a-1] == ';' ) a--;
|
||
|
||
// advance b until we hit & or end or ? or a ';'
|
||
while ( s[b] && s[b] != '&' && s[b] != '?' && s[b] != ';') b++;
|
||
|
||
// if we don't have 5+ chars in session id itself, skip it
|
||
if ( b - (a + x) < 5 ) {
|
||
return;
|
||
}
|
||
|
||
// go over a & or a ;
|
||
if ( s[b] == '&' || s[b] == ';' ) b++;
|
||
|
||
// remove the session id by covering it up
|
||
memmove ( &s[a] , &s[b] , *len - b );
|
||
|
||
// reduce length
|
||
*len -= (b-a);
|
||
|
||
// if s ends in ? or & or ;, backup
|
||
while ( *len > 0 && (s[*len-1]=='?'||s[*len-1]=='&'||s[*len-1]==';'))
|
||
(*len)--;
|
||
|
||
// NULL terminate
|
||
s[*len] = '\0';
|
||
}
|
||
|
||
static void stripParameters(UrlParser *urlParser) {
|
||
/// @todo ALC reorder parameter?
|
||
/// if we have ?abc=123&def=456
|
||
/// wouldn't it be the same as ?def=456&abc=123
|
||
|
||
/// @todo ALC login pages?
|
||
/// should we even spider them?
|
||
|
||
static const UrlComponent::Validator s_defaultParamValidator(0, 0, true, ALLOW_ALL, MANDATORY_NONE);
|
||
|
||
// 3 different component that we can remove from
|
||
// - path (we have a much more restrictive criteria on path to avoid removing valid path)
|
||
// eg: http://www.example.com/search/keywords/chardonnay/osCAdminID/45de8edd68f8bc05e9fde0d2c528a619/sort/3d/
|
||
//
|
||
// - path param
|
||
// eg: http://www.example.com/search/keywords,chardonnay/osCAdminID,45de8edd68f8bc05e9fde0d2c528a619/sort,3d/
|
||
// eg: http://www.example.com/search/;keywords=chardonnay;osCAdminID=45de8edd68f8bc05e9fde0d2c528a619;sort=3d/
|
||
//
|
||
// - query string
|
||
// eg: http://www.example.com/search/?keywords=chardonnay&osCAdminID=45de8edd68f8bc05e9fde0d2c528a619&sort=3d
|
||
|
||
// osCommerce (osCsid)
|
||
// eg:
|
||
// be1566df2284664244ce73ea6bed81fa09d4
|
||
// b8d15fefe8648f7f77c6e47f7bc0b881
|
||
// ddtvpkt3rpqdprsagsi52tj5o4
|
||
{
|
||
auto pathMatches = urlParser->matchPath(UrlComponent::Matcher("osCsid"));
|
||
if (!pathMatches.empty()) {
|
||
urlParser->removePath(pathMatches, UrlComponent::Validator(32, 32, true, ALLOW_HEX, MANDATORY_NONE));
|
||
urlParser->removePath(pathMatches, UrlComponent::Validator(26, 26, true, (ALLOW_DIGIT | ALLOW_ALPHA), MANDATORY_NONE));
|
||
}
|
||
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("osCsid"), s_defaultParamValidator);
|
||
}
|
||
|
||
// osCommerce (osCAdminID)
|
||
// eg:
|
||
// 20d2f836fd203140dc6391b7ba3cdd82
|
||
// c40fe2ad32efad2e9cc2748a3f1f90cc
|
||
{
|
||
auto pathMatches = urlParser->matchPath(UrlComponent::Matcher("osCAdminID"));
|
||
if (!pathMatches.empty()) {
|
||
urlParser->removePath(pathMatches, UrlComponent::Validator(32, 32, true, ALLOW_HEX, MANDATORY_NONE));
|
||
urlParser->removePath(pathMatches, UrlComponent::Validator(26, 26, true, (ALLOW_DIGIT | ALLOW_ALPHA), MANDATORY_NONE));
|
||
}
|
||
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("osCAdminID"), s_defaultParamValidator);
|
||
}
|
||
|
||
// XT-commerce
|
||
// eg:
|
||
// ha6n43ndtnlm53tpqgnclbv7ukkroue9k7m1e2o7t7rr5nb366a1
|
||
// 7ib1soln64vslra70ep2qcvde4s8dsm1
|
||
// big3ika24atc4j19mlaha6d906
|
||
urlParser->removePath(UrlComponent::Matcher("XTCsid", MATCH_CASE), UrlComponent::Validator(26, 52, true, (ALLOW_DIGIT | ALLOW_ALPHA), MANDATORY_NONE));
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("XTCsid", MATCH_CASE), s_defaultParamValidator);
|
||
|
||
// ColdFusion
|
||
// http://help.adobe.com/en_US/ColdFusion/9.0/Developing/WSc3ff6d0ea77859461172e0811cbec0c35c-7fef.html#WSc3ff6d0ea77859461172e0811cbec22c24-7cbf
|
||
|
||
// ColdFusion (CTOKEN)
|
||
// eg:
|
||
// e718cd6cc29050df-8051DC1E-C29B-554E-6DFF6B5D2704A9A5
|
||
// 92566684.html
|
||
// 94175176
|
||
// 322257
|
||
{
|
||
auto pathMatches = urlParser->matchPath(UrlComponent::Matcher("CFTOKEN"));
|
||
if (!pathMatches.empty()) {
|
||
urlParser->removePath(pathMatches, UrlComponent::Validator(52, 52, true, ALLOW_ALL, MANDATORY_NONE));
|
||
urlParser->removePath(pathMatches, UrlComponent::Validator(10, 14, true, ALLOW_ALL, MANDATORY_PUNCTUATION));
|
||
urlParser->removePath(pathMatches, UrlComponent::Validator(6, 0, true, ALLOW_DIGIT, MANDATORY_NONE));
|
||
}
|
||
|
||
urlParser->removePathParam(UrlComponent::Matcher("CFTOKEN"), s_defaultParamValidator);
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("CFTOKEN"), s_defaultParamValidator);
|
||
}
|
||
|
||
// ColdFusion (CFID)
|
||
urlParser->removePath(UrlComponent::Matcher("CFID"), UrlComponent::Validator(0, 0, true, ALLOW_DIGIT, MANDATORY_NONE));
|
||
urlParser->removePathParam(UrlComponent::Matcher("CFID"), s_defaultParamValidator);
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("CFID"), s_defaultParamValidator);
|
||
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("cftokenPass"), s_defaultParamValidator);
|
||
|
||
/// SAP load balancer
|
||
// https://help.sap.com/saphelp_nw70/helpdata/de/f2/d7914b8deb48f090c0343ef1d907f0/content.htm
|
||
urlParser->removePathParam(UrlComponent::Matcher("saplb_*"), s_defaultParamValidator);
|
||
|
||
// Atlassian
|
||
// https://developer.atlassian.com/confdev/confluence-plugin-guide/writing-confluence-plugins/form-token-handling
|
||
// 3 different format
|
||
// eg:
|
||
// AFP6-ISR2-ZLJY-KBY3|926a76e0017be6a18e889d2ddffb0aaab21865c1|lout
|
||
// 56c1bb338d5ad3ac262dd4e97bda482efc151f30
|
||
// 15BWJdAr0U
|
||
{
|
||
auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("atl_token"));
|
||
if (!queryMatches.empty()) {
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(65, 0, true, ALLOW_ALL, MANDATORY_NONE));
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(40, 40, true, ALLOW_HEX, MANDATORY_NONE));
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(10, 10, true, (ALLOW_ALPHA | ALLOW_DIGIT), MANDATORY_NONE));
|
||
}
|
||
}
|
||
|
||
// psession
|
||
// eg:
|
||
// 491022863920110420135759
|
||
// 7d01p6qvcl2e72j8ivmppk12k0
|
||
// XUjuplcPFGlJD2ZF5O26ApqAj5ZNEZwZrUKX5kkA
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("psession"), UrlComponent::Validator(24, 0, true, (ALLOW_ALPHA | ALLOW_DIGIT), MANDATORY_NONE));
|
||
|
||
// Galileo
|
||
// eg:
|
||
// 65971783A4.z17ZHFAI
|
||
// 63105032A6BFxgQFfV8
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("GalileoSession"), UrlComponent::Validator(19, 19, true, ALLOW_ALL, MANDATORY_NONE));
|
||
|
||
// postnuke
|
||
// normally it would be hex string length of 32. but shorter length exist (looks to be chopped off somehow)
|
||
// eg:
|
||
// 549178d5035b622229a39cd5baf75d2a
|
||
// 4ed3b0a832d4687020b05ce70
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("POSTNUKESID"), UrlComponent::Validator(16, 32, true, ALLOW_HEX, MANDATORY_NONE));
|
||
|
||
// jsessionid
|
||
// eg:
|
||
// C14778D1240A6CFEE5417030DDB37D41
|
||
urlParser->removePath(UrlComponent::Matcher("jsessionid"), UrlComponent::Validator(32, 32, false, ALLOW_HEX, MANDATORY_NONE));
|
||
urlParser->removePathParam(UrlComponent::Matcher("jsessionid", MATCH_PARTIAL), UrlComponent::Validator(20, 0, true, ALLOW_ALL, MANDATORY_NONE));
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("jsessionid", MATCH_PARTIAL), UrlComponent::Validator(20, 0, true, ALLOW_ALL, MANDATORY_NONE));
|
||
|
||
// phpsessid
|
||
// eg:
|
||
// 7711
|
||
// 4g8v6ndp6gnnc4tagn8coam0n7
|
||
// 414c6917961d5b4998973d1613b7926f
|
||
// qfou95mlih5jjans36kevj2pti7p847v6bl79f03nrvtaadif6u0
|
||
urlParser->removePath(UrlComponent::Matcher("PHPSESSID"), UrlComponent::Validator(26, 32, false, (ALLOW_ALPHA | ALLOW_DIGIT), MANDATORY_NONE));
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("PHPSESSID", MATCH_PARTIAL), s_defaultParamValidator);
|
||
|
||
// auth_sess
|
||
// mostly job sites (same group?)
|
||
// eg:
|
||
// 7ofc7ep3i8g6i2foinq6uks7e0
|
||
// 6ce228460946fc4b3ed154abea1530b8
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("auth_sess"), UrlComponent::Validator(26, 32, true, (ALLOW_DIGIT | ALLOW_ALPHA), MANDATORY_NONE));
|
||
|
||
// ps_sess_id
|
||
// eg:
|
||
// 0056c53b03ee56c8b791a5cf061a910d
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("ps_sess_id"), UrlComponent::Validator(32, 32, true, ALLOW_HEX, MANDATORY_NONE));
|
||
|
||
// mysid
|
||
// eg:
|
||
// c357e16d973188ad99cc3e32a059e805
|
||
// 11GeUYNB4fCVXeySSumKM3
|
||
// hNrnd87gxn9LU0X-N-4TS2
|
||
// glwcjvci
|
||
{
|
||
auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("mysid"));
|
||
if (!queryMatches.empty()) {
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(32, 32, ALLOW_HEX, MANDATORY_NONE));
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(22, 22, ALLOW_ALL, MANDATORY_ALPHA));
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(8, 8, ALLOW_ALPHA, MANDATORY_NONE));
|
||
}
|
||
}
|
||
|
||
// sid
|
||
// eg:
|
||
// 3565de85-0bf0-47d3-8fb3-80120d6b60a6
|
||
// 8E67BB91-5056-9000-2C8C1473A967F273
|
||
// 0b721aa1c34b75fcf41e17304537d965
|
||
// 3KnGJS3ga7ae891-33115175851.04
|
||
// v0uqho4nv0mnghv4ap3ieeqp94
|
||
// K6FYyt
|
||
{
|
||
auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("sid"));
|
||
if (!queryMatches.empty()) {
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(30, 0, ALLOW_ALL, MANDATORY_NONE));
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(26, 26, (ALLOW_ALPHA | ALLOW_DIGIT), MANDATORY_NONE));
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(6, 6, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA_LOWER | MANDATORY_ALPHA_UPPER)));
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(6, 6, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA_LOWER | MANDATORY_DIGIT)));
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(6, 6, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA_UPPER | MANDATORY_DIGIT)));
|
||
}
|
||
}
|
||
|
||
// SES
|
||
// eg:
|
||
// 74339eda735516fd51ed1c5eb6bc76ceav
|
||
// 39a11261f58150fd4327a80da6daafa0
|
||
// 99cj5cbf6g8irau20h1hkvr8o6
|
||
{
|
||
auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("ses"));
|
||
if (!queryMatches.empty()) {
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(34, 34, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA | MANDATORY_DIGIT)));
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(32, 32, ALLOW_HEX, MANDATORY_NONE));
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(26, 26, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA | MANDATORY_DIGIT)));
|
||
}
|
||
}
|
||
|
||
// s
|
||
// eg:
|
||
// 4d9ae8a969305848227e5d6d7d0fb9672bd38d96
|
||
// 81cfba6ed9b66a8ad0df43c2f3d259bd
|
||
{
|
||
auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("s"));
|
||
if (!queryMatches.empty()) {
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(40, 40, ALLOW_HEX, MANDATORY_ALPHA_HEX));
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(32, 32, ALLOW_HEX, MANDATORY_ALPHA_HEX));
|
||
}
|
||
}
|
||
|
||
// session_id
|
||
// eg:
|
||
// NiHhUceSP6At57u0
|
||
// ospnr7npc97urgoi1p9i9kd1e4
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("session_id"), UrlComponent::Validator(16, 0, ALLOW_ALL, MANDATORY_ALPHA));
|
||
|
||
// sessionid
|
||
// eg:
|
||
// 094104BqFHWLmUCiZAMvgboVyVFiIKDqRPJCxIUMZIPNkMVJVK
|
||
// 1a0d43d9a6753940649bbaeb56f01176
|
||
// ej3fa4fe7eikfb8ej1fd6
|
||
// ObUlshp63oxfnZzvCzwe
|
||
// mN3XmQ{hXgsK8jY7VUm8
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("sessionid"), UrlComponent::Validator(20, 0, ALLOW_ALL, MANDATORY_ALPHA));
|
||
|
||
// other session id variations
|
||
|
||
// sessid (vbSESSID, asesessid, nlsessid, GLBSESSID, sessid, etc ...)
|
||
// eg:
|
||
// 91hpb1p3b69bu0vqruar2fpltf3b509bsdeqh1qtj1p8ugb8rpc0
|
||
// a12cb492ec7bcc9677916f02913587064d4279ed
|
||
// 50d96959db895a0adbfebd325a4a65e0
|
||
// f4db3ec33001c9759d095c6432651e39
|
||
// 82d0pbm7f6aa55no7p0rqb37r6
|
||
{
|
||
auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("sessid", MATCH_PARTIAL));
|
||
if (!queryMatches.empty()) {
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(52, 52, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA | MANDATORY_DIGIT)));
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(40, 40, ALLOW_HEX, MANDATORY_ALPHA_HEX));
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(32, 32, ALLOW_HEX, MANDATORY_ALPHA_HEX));
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(26, 26, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA | MANDATORY_DIGIT)));
|
||
}
|
||
}
|
||
|
||
// session
|
||
// eg:
|
||
// eRbInbLDoNaEr4gkIju0
|
||
// vfdplav2ske1blvadpv9du54k3
|
||
// ARC-1454710019-541634862-12401
|
||
// ARC-1454807400-18472177182-25788
|
||
// A5C45BC6DC3B436899C43B9D904FC8DE
|
||
// 7b478486-e52c-46aa-aca8-8cd446fcb79e
|
||
// 39663_1455055828_84298238456ba63d42992a
|
||
// 14185_1455099610_106560567456bb0eda9d317
|
||
// NlG8XCo5MgpctBTMRut4Gq6J5Z5de9foAe4rh3ikLQYWQmFzLR4zSHuieO8
|
||
// DMQBEXa5Z-aJ7r67ylAJ_y9H8_S2HTUaIjoafUtOjYuGcxwRefR0Q3xXzyS
|
||
// bGJL_GuP2eDGwJJzoXM9T3_LRgjAsalqaREGEBDoEERJOIMIL8Wh7Q3K3FcgHtYc9hM6CuJmVKlmmCxjmSYEhwVlOdUEX5RnUXycKSHKO5iAz2_ulWoJOZ1d7QCD2Afn9WPkXkvaJaSgjo7hcfYbBnUOXhedzMolha6kfV7hvf4mRAF700MhB350--QV0wQAur9Rz47QiX8SiRXp_vQDdwInUSfO3PqOwXfBu72w4e-JySzUf7Aj9Ks9ouOUPAn1W_GtORLLT4Gho7-Tb_IwyGVYPKF97f3VMXsTfoFqUvs
|
||
//
|
||
urlParser->removeQueryParam(urlParser->matchQueryParam(UrlComponent::Matcher("session")),
|
||
UrlComponent::Validator(20, 0, ALLOW_ALL, (MANDATORY_ALPHA | MANDATORY_DIGIT)));
|
||
|
||
// sess
|
||
// eg:
|
||
// 4be234480736093ba237bc397fb6e32d
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("sess"),
|
||
UrlComponent::Validator(20, 0, (ALLOW_ALPHA | ALLOW_DIGIT), MANDATORY_NONE));
|
||
|
||
if (urlParser->getTitledbVersion() >= 124) {
|
||
// zenid
|
||
// eg:
|
||
// f-PkGiBLKfCX6tEkFg9IX0
|
||
// oolgqmle6imrmn7fank6dt35j0
|
||
// 7338c37c2d3a1a23d43b70cc07202861
|
||
// hugsngcjfn6chl4crs21mgkchff1tape
|
||
auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("zenid"));
|
||
if (!queryMatches.empty()) {
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(22, 22, ALLOW_ALL, MANDATORY_NONE));
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(26, 26, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA | MANDATORY_DIGIT)));
|
||
urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(32, 32, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA | MANDATORY_DIGIT)));
|
||
}
|
||
}
|
||
|
||
if (urlParser->getTitledbVersion() >= 124) {
|
||
// sid
|
||
// eg:
|
||
// A22CTDDQDAWWOUD2AMDTDOS2B
|
||
// iss5teou1s7jn25gnr57ta50g4
|
||
// 6Ld1DQiSaTtLoxlRZV2Q4ZWa1ME6QT
|
||
// c6dc67a613ac0d459ea256e30a5c5f22
|
||
// 57D64937B89D9203F0032B416DBEEF78
|
||
// plangbfbficubs1hun1rodemqmravs9b
|
||
// 00573df4e5c7851f9e60c5e30c7529454d4279f4
|
||
// WsNYRPZQpDIqXVIYwYOrbACqDZqscupKvuWoynYsGA3Z3R8Bck
|
||
// 011gipkpcil1ce7ono5vqmtb7h7uvlcbpbd4jm2mo4bmoje94f30
|
||
auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("sid", MATCH_SUFFIX));
|
||
std::vector<UrlComponent*> queryMatchesNotWhitelisted;
|
||
for (auto &queryMatch : queryMatches) {
|
||
// whitelist
|
||
if (queryMatch->getKey().compare("newsid") == 0 || queryMatch->getKey().compare("smpagesid") == 0) {
|
||
continue;
|
||
}
|
||
|
||
queryMatchesNotWhitelisted.push_back(queryMatch);
|
||
}
|
||
|
||
if (!queryMatchesNotWhitelisted.empty()) {
|
||
urlParser->removeQueryParam(queryMatchesNotWhitelisted, UrlComponent::Validator(25, 0, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA | MANDATORY_DIGIT)));
|
||
}
|
||
}
|
||
|
||
if (urlParser->getTitledbVersion() >= 124) {
|
||
// cart_id
|
||
// eg:
|
||
// 680114.127.22580
|
||
// 4988233_28056
|
||
// 84
|
||
urlParser->removeQueryParam("cart_id");
|
||
}
|
||
|
||
// ts
|
||
// eg:
|
||
// 1422344216175
|
||
// 1425080080316
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("ts"), UrlComponent::Validator(13, 13, ALLOW_DIGIT, MANDATORY_NONE));
|
||
|
||
// apache dir sort
|
||
// C={N,M,S,D} O={A,D}
|
||
// eg:
|
||
// ?C=N;O=A
|
||
if (urlParser->getQueryParamCount() <= 2) {
|
||
auto cQueryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("C", MATCH_CASE));
|
||
auto oQueryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("O", MATCH_CASE));
|
||
|
||
UrlComponent *cUrlComponent = (cQueryMatches.size() == 1) ? cQueryMatches[0] : NULL;
|
||
UrlComponent *oUrlComponent = (oQueryMatches.size() == 1) ? oQueryMatches[0] : NULL;
|
||
|
||
if (cUrlComponent) {
|
||
if (cUrlComponent->getValueLen() == 0) {
|
||
urlParser->deleteComponent(cUrlComponent);
|
||
} else if (cUrlComponent->getValueLen() == 1) {
|
||
char c = *(cUrlComponent->getValue());
|
||
if (c == 'N' || c == 'M' || c == 'S' || c == 'D') {
|
||
urlParser->deleteComponent(cUrlComponent);
|
||
}
|
||
}
|
||
}
|
||
|
||
if (oUrlComponent) {
|
||
if (oUrlComponent->getValueLen() == 0) {
|
||
urlParser->deleteComponent(oUrlComponent);
|
||
} else if (oUrlComponent->getValueLen() == 1) {
|
||
char o = *(oUrlComponent->getValue());
|
||
if (o == 'A' || o == 'D') {
|
||
urlParser->deleteComponent(oUrlComponent);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
if (urlParser->getTitledbVersion() >= 124) {
|
||
// token
|
||
// eg:
|
||
// 02170932a082516cf758dbaa0a5ebab1
|
||
auto tokenMatches = urlParser->matchQueryParam(UrlComponent::Matcher("token"));
|
||
if (!tokenMatches.empty()) {
|
||
// only delete token parameter when:
|
||
// - no path; or
|
||
// - when id_product exist
|
||
if ((urlParser->getPaths()->size() == 1 && urlParser->getPaths()->front().getString().size() == 0) ||
|
||
!urlParser->matchQueryParam(UrlComponent::Matcher("id_product")).empty()) {
|
||
urlParser->removeQueryParam(tokenMatches, UrlComponent::Validator(32, 32, ALLOW_HEX, MANDATORY_NONE));
|
||
}
|
||
}
|
||
}
|
||
|
||
if (urlParser->getTitledbVersion() >= 124) {
|
||
// random
|
||
urlParser->removeQueryParam("random");
|
||
|
||
// _random
|
||
urlParser->removeQueryParam("_random");
|
||
|
||
// rand
|
||
urlParser->removeQueryParam("rand");
|
||
|
||
// _rand
|
||
urlParser->removeQueryParam("_rand");
|
||
}
|
||
|
||
// Skip most common tracking parameters
|
||
|
||
// Oracle Eloqua
|
||
// http://docs.oracle.com/cloud/latest/marketingcs_gs/OMCAA/index.html#Help/General/EloquaTrackingParameters.htm
|
||
urlParser->removeQueryParam("elqTrackId");
|
||
urlParser->removeQueryParam("elq");
|
||
urlParser->removeQueryParam("elqCampaignId");
|
||
urlParser->removeQueryParam("elqaid");
|
||
urlParser->removeQueryParam("elqat");
|
||
urlParser->removeQueryParam("elq_mid");
|
||
urlParser->removeQueryParam("elq_cid");
|
||
urlParser->removeQueryParam("elq2"); // others
|
||
|
||
// Google Analytics
|
||
// https://support.google.com/analytics/answer/1033867
|
||
urlParser->removeQueryParam("utm_source");
|
||
urlParser->removeQueryParam("utm_medium");
|
||
urlParser->removeQueryParam("utm_term");
|
||
urlParser->removeQueryParam("utm_content");
|
||
urlParser->removeQueryParam("utm_campaign");
|
||
urlParser->removeQueryParam("utm_hp_ref"); // Lots on huffingtonpost.com
|
||
urlParser->removeQueryParam("utm_rid"); // others
|
||
|
||
// https://support.google.com/analytics/answer/1033981?hl=en
|
||
// https://support.google.com/ds/answer/6292795?hl=en
|
||
urlParser->removeQueryParam("gclid");
|
||
urlParser->removeQueryParam("gclsrc");
|
||
|
||
// Piwik
|
||
// http://piwik.org/docs/tracking-campaigns/
|
||
// https://plugins.piwik.org/AdvancedCampaignReporting
|
||
urlParser->removeQueryParam("pk_campaign");
|
||
urlParser->removeQueryParam("pk_kwd");
|
||
urlParser->removeQueryParam("pk_source");
|
||
urlParser->removeQueryParam("pk_medium");
|
||
urlParser->removeQueryParam("pk_keyword");
|
||
urlParser->removeQueryParam("pk_content");
|
||
urlParser->removeQueryParam("pk_cid");
|
||
|
||
// Open Web Analytics
|
||
// https://github.com/padams/Open-Web-Analytics/wiki/Campaign-Tracking
|
||
urlParser->removeQueryParam("owa_medium");
|
||
urlParser->removeQueryParam("owa_source");
|
||
urlParser->removeQueryParam("owa_campaign");
|
||
urlParser->removeQueryParam("owa_ad");
|
||
urlParser->removeQueryParam("owa_ad_type");
|
||
|
||
// Webtrends
|
||
// http://help.webtrends.com/en/queryparameters/index.html
|
||
urlParser->removeQueryParam("wt.mc_id");
|
||
|
||
// Mailchimp
|
||
// https://apidocs.mailchimp.com/api/how-to/ecommerce.php
|
||
urlParser->removeQueryParam("mc_cid");
|
||
urlParser->removeQueryParam("mc_eid");
|
||
|
||
// Marketo
|
||
// http://developers.marketo.com/documentation/websites/lead-tracking-munchkin-js/
|
||
urlParser->removeQueryParam("mkt_tok");
|
||
|
||
// trk
|
||
// eg:
|
||
// ppro_cprof
|
||
// prc-basic
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("trk"),
|
||
UrlComponent::Validator(0, 0, ALLOW_ALL, (MANDATORY_ALPHA | MANDATORY_PUNCTUATION)));
|
||
|
||
// who
|
||
// eg:
|
||
// r,Usyg2mo/krON58h7Cqp0HHHvPhsMdK5lNmP76/O/gxQb/ObopGwS3yJwoT241Hf8EMrMDicKKYtMqLKqmtywdZFGbvS6J6jbKbUd5HzTkv_FxyTEsYw1rLJr9LHquA3O
|
||
// r,yrl2BJY6LMkbtXa9k/lflCwQqzDqf/AF7zFIQoBAhI_t6U_gJztkZ/8ABugLiijm2NRXjt_LYh56mwmTv5cCNuIkgnB2cLFEfL62Gaoyddeh89cXgi9UqjWLP/Y1lD/4watUuyy2WINYipnkSygRLQ--
|
||
// r,nEBHD2D/_wnDIxXmNMZRjB1wQZikW7uTA8ZXGmCH3a1IvIXSpSv0QicLoCGpTnsBe2QR7xzvq2i2JeKu2AbpgLJaexxw5VON6yG8DP2t5oFhOdoM/kuVnhIt4PEVt1UwqKBNApZk56tTem_r5wqaF4ko65Bo5i7J67PUNHOZs3U-
|
||
// r,0/asSWWd2MeHwFRbMqZP42yZoh0UlWB2zyP9nAoa3ejKyLPsBjxivhuAY2RH6r94BV2DcmQQYxk6MYZD4Uo6cb30qgNTwVY/_rl_BjRSWosgbpRtPuMytbSX0OmxKuNedtcT27C3fJG/oia/88wI_Ec5PIerpxyPLAgXEsi78vAyuZAXymqhujGGTf6ACryR
|
||
// r,rW75z4HBqJegN3eAao88RaQcHsIgPXhAP/K1KCbI3x6dMrYllBZLlVfpuL_C0IQed0WspcLWMeT79fzDoAnb0qioGuFSnCHaZXYoH5_GZsWESFdk4CznUlTZuyeTFKsu9xblmYa56ShIKUyILXaFAI8HbNh7dpaXr7q66jIOuo_0r2_GFlbGaSScvbnAWWjH/dMPW8UZsTetZ2a9tqYaHQ--
|
||
{
|
||
auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("who"));
|
||
for (auto it = queryMatches.begin(); it != queryMatches.end(); ++it) {
|
||
if ((*it)->getValueLen() <= 130 && memcmp((*it)->getValue(), "r,", 2) == 0) {
|
||
urlParser->deleteComponent(*it);
|
||
}
|
||
}
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("who"), UrlComponent::Validator(130, 0, ALLOW_ALL, MANDATORY_NONE));
|
||
}
|
||
|
||
|
||
// Misc
|
||
urlParser->removeQueryParam("partnerref");
|
||
|
||
// gallery project
|
||
if (urlParser->getTitledbVersion() >= 124) {
|
||
// g1_return / g2_return / g2_returnUrl
|
||
// eg:
|
||
// http%3A%2F%2Fgaleria.waw.net.pl%2Fwesele%3Fpage%3D1
|
||
// %2Fgallery%2FNovember-2015-trip%2FIMG_2364
|
||
urlParser->removeQueryParam("g1_return");
|
||
urlParser->removeQueryParam("g2_return");
|
||
urlParser->removeQueryParam("g2_returnUrl");
|
||
|
||
// g2_returnName
|
||
// eg:
|
||
// album
|
||
// Photo
|
||
// Mount+with+WebDAV
|
||
urlParser->removeQueryParam("g2_returnName");
|
||
|
||
// g2_authToken
|
||
// eg:
|
||
// b7cf40525e11
|
||
// 1c5a53515a3a
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("g2_authToken"), UrlComponent::Validator(12, 12, ALLOW_HEX, MANDATORY_NONE));
|
||
}
|
||
|
||
if (urlParser->getTitledbVersion() >= 124) {
|
||
urlParser->removeQueryParam("redirect");
|
||
|
||
urlParser->removeQueryParam("redirect_to");
|
||
|
||
urlParser->removeQueryParam("redirectto");
|
||
urlParser->removePath(UrlComponent::Matcher("redirectto"), UrlComponent::Validator(0, 0, false, ALLOW_ALL, MANDATORY_NONE));
|
||
|
||
urlParser->removeQueryParam("redirect_uri");
|
||
|
||
urlParser->removeQueryParam("redirect_url");
|
||
urlParser->removeQueryParam("redirecturl");
|
||
|
||
urlParser->removeQueryParam("return");
|
||
|
||
urlParser->removeQueryParam("return_page");
|
||
urlParser->removeQueryParam("returnpage");
|
||
|
||
urlParser->removeQueryParam("return_to");
|
||
urlParser->removeQueryParam("returnto");
|
||
urlParser->removePath(UrlComponent::Matcher("returnto"), UrlComponent::Validator(0, 0, false, ALLOW_ALL, MANDATORY_NONE));
|
||
|
||
urlParser->removeQueryParam("returntopage");
|
||
urlParser->removeQueryParam("returntoquery");
|
||
urlParser->removeQueryParam("returntosearch");
|
||
urlParser->removeQueryParam("returntotitle");
|
||
urlParser->removeQueryParam("returntourl");
|
||
|
||
urlParser->removeQueryParam("return_uri");
|
||
|
||
urlParser->removeQueryParam("return_url");
|
||
urlParser->removePath(UrlComponent::Matcher("return_url"), UrlComponent::Validator(0, 0, false, ALLOW_ALL, MANDATORY_NONE));
|
||
|
||
urlParser->removeQueryParam("returnurl");
|
||
urlParser->removePath(UrlComponent::Matcher("returnurl"), UrlComponent::Validator(0, 0, false, ALLOW_ALL, MANDATORY_NONE));
|
||
|
||
// referer
|
||
// eg:
|
||
// aHR0cDovL3d3dy5zbGF0ZXJnYXJ0cmVsbHNwb3J0cy5jb20uYXUvaW5kZXgucGhwL2NhdGFsb2cvcHJvZHVjdC92aWV3L2lkLzE5NTgvY2F0ZWdvcnkvMjI0Lz9fX19TSUQ9VQ,,
|
||
// aHR0cDovL3d3dy5zZGpzcG9ydHMuY28udWsvc2FsZS5odG1s
|
||
// displayimage.php%3Fpid%3D185
|
||
// hoh
|
||
urlParser->removePath(UrlComponent::Matcher("referer"), UrlComponent::Validator(0, 0, false, ALLOW_ALL, MANDATORY_NONE));
|
||
urlParser->removePathParam(UrlComponent::Matcher("referer"), UrlComponent::Validator(0, 0, false, ALLOW_ALL, MANDATORY_NONE));
|
||
urlParser->removeQueryParam("referer");
|
||
|
||
urlParser->removeQueryParam("referrer");
|
||
}
|
||
|
||
if (urlParser->getTitledbVersion() >= 124) {
|
||
urlParser->removeQueryParam("afid");
|
||
urlParser->removeQueryParam("affid");
|
||
urlParser->removeQueryParam("affiliateid");
|
||
urlParser->removeQueryParam("affiliate_id");
|
||
urlParser->removeQueryParam("affiliate");
|
||
|
||
urlParser->removeQueryParam(UrlComponent::Matcher("psid"), UrlComponent::Validator(0, 0, false, (ALLOW_ALPHA | ALLOW_DIGIT), MANDATORY_ALPHA));
|
||
}
|
||
|
||
if (urlParser->getTitledbVersion() >= 124) {
|
||
// sort
|
||
urlParser->removeQueryParam("sort");
|
||
|
||
// order
|
||
urlParser->removeQueryParam("order");
|
||
}
|
||
|
||
/// @todo ALC cater for more affiliate links here
|
||
|
||
// only check domain specific logic when we have a domain
|
||
if (urlParser->getDomain()) {
|
||
if (strncmp(urlParser->getDomain(), "amazon.", 7) == 0) {
|
||
// amazon
|
||
// https://www.reddit.com/r/GameDeals/wiki/affiliate
|
||
|
||
// affiliate
|
||
urlParser->removeQueryParam("tag");
|
||
|
||
// wishlist
|
||
urlParser->removeQueryParam("coliid");
|
||
urlParser->removeQueryParam("colid");
|
||
|
||
// reference
|
||
urlParser->removeQueryParam("ref");
|
||
urlParser->removePathParam(UrlComponent::Matcher("ref"),
|
||
UrlComponent::Validator(0, 0, false, ALLOW_ALL, MANDATORY_PUNCTUATION));
|
||
} else if (strncmp(urlParser->getDomain(), "ebay.", 5) == 0) {
|
||
// ebay
|
||
// http://www.ebaypartnernetworkblog.com/en/2009/05/new-link-generator-tool-additional-information/
|
||
|
||
urlParser->removeQueryParam("icep_ff3");
|
||
urlParser->removeQueryParam("pub");
|
||
urlParser->removeQueryParam("toolid");
|
||
urlParser->removeQueryParam("campid");
|
||
urlParser->removeQueryParam("customid");
|
||
urlParser->removeQueryParam("afepn");
|
||
urlParser->removeQueryParam("pid");
|
||
}
|
||
}
|
||
}
|
||
|
||
// . url rfc = http://www.blooberry.com/indexdot/html/topics/urlencoding.htm
|
||
// . "...Only alphanumerics [0-9a-zA-Z], the special characters "$-_.+!*'(),"
|
||
// [not including the quotes - ed], and reserved characters used for their
|
||
// reserved purposes may be used unencoded within a URL."
|
||
// . i know sun.com has urls like "http://sun.com/;$sessionid=123ABC$"
|
||
// . url should be ENCODED PROPERLY for this to work properly
|
||
void Url::set(const char *t, int32_t tlen, bool addWWW, bool stripParams, bool stripCommonFile, int32_t titledbVersion) {
|
||
#ifdef _VALGRIND_
|
||
VALGRIND_CHECK_MEM_IS_DEFINED(t,tlen);
|
||
#endif
|
||
reset();
|
||
|
||
if (!t || tlen == 0) {
|
||
return;
|
||
}
|
||
|
||
// we may add a "www." a trailing backslash and \0, ...
|
||
if (tlen > MAX_URL_LEN - 10) {
|
||
log( LOG_LIMIT, "db: Encountered url of length %" PRId32 ". Truncating to %i", tlen, MAX_URL_LEN - 10 );
|
||
tlen = MAX_URL_LEN - 10;
|
||
}
|
||
|
||
char stripped[MAX_URL_LEN];
|
||
|
||
if (titledbVersion >= 125) {
|
||
// skip starting spaces
|
||
while (tlen > 0 && is_wspace_a(*t)) {
|
||
++t;
|
||
--tlen;
|
||
}
|
||
|
||
// remove tab/cr/lf
|
||
std::string url(t, tlen);
|
||
url.erase(std::remove_if(url.begin(), url.end(), [](char c) { return c == 0x09 || c == 0x0A || c == 0x0D; }), url.end());
|
||
memcpy(stripped, url.c_str(), url.size());
|
||
stripped[url.size()] = '\0';
|
||
t = stripped;
|
||
tlen = url.size();
|
||
|
||
// skip ending spaces
|
||
while (tlen > 0 && is_wspace_a(t[tlen - 1])) {
|
||
--tlen;
|
||
}
|
||
}
|
||
|
||
// . skip over non-alnum chars (except - or /) in the beginning
|
||
// . if url begins with // then it's just missing the http: (slashdot)
|
||
// . watch out for hostname like: -dark-.deviantart.com(yes, it's real)
|
||
// . so all protocols are hostnames MUST start with alnum OR hyphen
|
||
while (tlen > 0 && !is_alnum_a(*t) && *t != '-' && *t != '/') {
|
||
t++;
|
||
tlen--;
|
||
}
|
||
|
||
// . stop t at first space or binary char
|
||
// . url should be in encoded form!
|
||
int32_t i;
|
||
int32_t nonAsciiPos = -1;
|
||
for ( i = 0 ; i < tlen ; i++ ) {
|
||
if (titledbVersion < 125 && is_wspace_a(t[i])) {
|
||
break;
|
||
}
|
||
|
||
if (!is_ascii(t[i])) {
|
||
// Sometimes the length with the null is passed in,
|
||
// so ignore nulls FIXME?
|
||
if (t[i]) {
|
||
nonAsciiPos = i;
|
||
}
|
||
|
||
break; // no non-ascii chars allowed
|
||
}
|
||
}
|
||
|
||
if ( nonAsciiPos != -1 ) {
|
||
// Try turning utf8 and latin1 encodings into punycode.
|
||
// All labels(between dots) in the domain are encoded
|
||
// separately. We don't support encoded tlds, but they are
|
||
// not widespread yet.
|
||
// If it is a non ascii domain it needs to take the form
|
||
// xn--<punycoded label>.xn--<punycoded label>.../
|
||
|
||
log(LOG_DEBUG, "build: attempting to decode unicode url %*.*s pos at %" PRId32, (int)tlen, (int)tlen, t, nonAsciiPos);
|
||
|
||
char encoded [ MAX_URL_LEN ];
|
||
size_t encodedLen = MAX_URL_LEN;
|
||
char *encodedDomStart = encoded;
|
||
const char *p = t;
|
||
const char *pend = t+tlen;
|
||
|
||
// Find the start of the domain
|
||
if ( tlen > 7 && strncmp( p, "http://", 7 ) == 0 ) {
|
||
p += 7;
|
||
} else if ( tlen > 8 && strncmp( p, "https://", 8 ) == 0 ) {
|
||
p += 8;
|
||
}
|
||
|
||
gbmemcpy(encodedDomStart, t, p-t);
|
||
encodedDomStart += p-t;
|
||
|
||
while (p < pend && *p != '/' && *p != ':') {
|
||
const char *labelStart = p;
|
||
uint32_t tmpBuf[MAX_URL_LEN];
|
||
int32_t tmpLen = 0;
|
||
|
||
while (p < pend && *p != '.' && *p != '/' &&
|
||
(titledbVersion < 125 || (titledbVersion >= 125 && *p != ':'))) {
|
||
p++;
|
||
}
|
||
|
||
int32_t labelLen = p - labelStart;
|
||
|
||
bool tryLatin1 = false;
|
||
// For utf8 urls
|
||
p = labelStart;
|
||
bool labelIsAscii = true;
|
||
|
||
// Convert the domain to code points and copy it to tmpbuf to be punycoded
|
||
for ( ; p - labelStart < labelLen; p += utf8Size( tmpBuf[tmpLen] ), tmpLen++ ) {
|
||
labelIsAscii = labelIsAscii && is_ascii( *p );
|
||
tmpBuf[tmpLen] = utf8Decode( p );
|
||
if ( !tmpBuf[tmpLen] ) { // invalid char?
|
||
tryLatin1 = true;
|
||
break;
|
||
}
|
||
}
|
||
|
||
if ( labelIsAscii ) {
|
||
if ( labelStart[labelLen] == '.' ) {
|
||
labelLen++;
|
||
p++;
|
||
}
|
||
gbmemcpy( encodedDomStart, labelStart, labelLen );
|
||
encodedDomStart += labelLen;
|
||
continue;
|
||
}
|
||
|
||
if ( tryLatin1 ) {
|
||
// For latin1 urls
|
||
tmpLen = 0;
|
||
for ( ; tmpLen < labelLen; tmpLen++ ) {
|
||
tmpBuf[tmpLen] = labelStart[tmpLen];
|
||
}
|
||
}
|
||
|
||
gbmemcpy( encodedDomStart, "xn--", 4 );
|
||
encodedDomStart += 4;
|
||
|
||
encodedLen = MAX_URL_LEN - (encodedDomStart - encoded);
|
||
punycode_status status = punycode_encode( tmpLen, tmpBuf, NULL, &encodedLen, encodedDomStart );
|
||
|
||
if ( status != 0 ) {
|
||
// Give up? try again?
|
||
log("build: Bad Engineer, failed to "
|
||
"punycode international url %s (%" PRId32 ")",
|
||
t, (int32_t)status);
|
||
return;
|
||
}
|
||
|
||
// We should check if what we encoded were valid url characters, no spaces, etc
|
||
// FIXME: should we exclude just the bad chars? I've seen plenty of urls with
|
||
// a newline in the middle. Just discard the whole chunk for now
|
||
bool badUrlChars = false;
|
||
for ( uint32_t i = 0; i < encodedLen; i++ ) {
|
||
if ( is_wspace_a( encodedDomStart[i] ) ) {
|
||
badUrlChars = true;
|
||
break;
|
||
}
|
||
}
|
||
|
||
if ( encodedLen == 0 || badUrlChars ) {
|
||
encodedDomStart -= 4; // don't need the xn--
|
||
p++;
|
||
} else {
|
||
encodedDomStart += encodedLen;
|
||
*encodedDomStart++ = *p++; // Copy in the . or the /
|
||
}
|
||
}
|
||
|
||
// p now points to the end of the domain
|
||
// encodedDomStart now points to the first free space in encoded string
|
||
|
||
// Now copy the rest of the url in. Watch out for non-ascii chars
|
||
// truncate the url, and keep it under max url length
|
||
uint32_t newUrlLen = encodedDomStart - encoded;
|
||
|
||
while (p < pend) {
|
||
if ( ! *p ) {
|
||
break; // null?
|
||
}
|
||
|
||
if (!is_ascii(*p)) {
|
||
// url encode utf8 characters now
|
||
char cs = getUtf8CharSize(p);
|
||
|
||
// bad utf8 char?
|
||
if ( !isValidUtf8Char(p) ) {
|
||
break;
|
||
}
|
||
|
||
int maxDestLen = (cs * 3) + 1; // %XX + \0
|
||
|
||
// too long?
|
||
if ( newUrlLen + maxDestLen >= MAX_URL_LEN ) {
|
||
break;
|
||
}
|
||
|
||
char stored = urlEncode(&encoded[newUrlLen], maxDestLen, p, cs);
|
||
p += cs;
|
||
newUrlLen += stored;
|
||
|
||
continue;
|
||
}
|
||
|
||
if (is_wspace_a(*p)) {
|
||
break;
|
||
}
|
||
|
||
if (newUrlLen + 1 >= MAX_URL_LEN) {
|
||
break;
|
||
}
|
||
|
||
encoded[newUrlLen++] = *p++;
|
||
}
|
||
|
||
encoded[newUrlLen] = '\0';
|
||
return this->set( encoded, newUrlLen, addWWW, stripParams, stripCommonFile, titledbVersion );
|
||
}
|
||
|
||
// truncate length to the first occurence of an unacceptable char
|
||
tlen = i;
|
||
|
||
// . jump over http:// if it starts with http://http://
|
||
// . a common mistake...
|
||
while ( tlen > 14 && ! strncasecmp ( t , "http://http://" , 14 ) ) {
|
||
t += 7;
|
||
tlen -= 7;
|
||
}
|
||
|
||
// only strip anchor for version <= 122 (we're stripping anchor in UrlParser)
|
||
if (titledbVersion <= 122) {
|
||
// strip the "#anchor" from http://www.xyz.com/somepage.html#anchor"
|
||
for (int32_t i = 0; i < tlen; i++) {
|
||
if (t[i] == '#') {
|
||
// ignore anchor if a ! follows it. 'google hash bang hack'
|
||
// which breaks the web and is now deprecated, but, there it is
|
||
if (i + 1 < tlen && t[i + 1] == '!') {
|
||
continue;
|
||
}
|
||
|
||
tlen = i;
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
|
||
// copy to "s" so we can NULL terminate it
|
||
char s[MAX_URL_LEN];
|
||
int32_t len = tlen;
|
||
|
||
if (titledbVersion <= 122) {
|
||
// store filtered url into s
|
||
memcpy(s, t, tlen);
|
||
s[len] = '\0';
|
||
|
||
if (stripParams) {
|
||
stripParametersv122(s, &len);
|
||
}
|
||
} else {
|
||
UrlParser urlParser(t, tlen, titledbVersion);
|
||
|
||
if (stripParams) {
|
||
stripParameters(&urlParser);
|
||
}
|
||
|
||
// rebuild url
|
||
urlParser.unparse();
|
||
|
||
len = urlParser.getUrlParsedLen();
|
||
|
||
if (len > MAX_URL_LEN - 10) {
|
||
len = MAX_URL_LEN - 10;
|
||
}
|
||
strncpy(s, urlParser.getUrlParsed(), len);
|
||
s[len] = '\0';
|
||
}
|
||
|
||
// remove common filenames like index.html
|
||
if ( stripCommonFile ) {
|
||
if ( len - 14 > 0 &&
|
||
strncasecmp(&s[len-14], "/default.xhtml", 14) == 0 )
|
||
len -= 13;
|
||
else if ( len - 13 > 0 &&
|
||
( strncasecmp(&s[len-13], "/default.html", 13) == 0 ||
|
||
strncasecmp(&s[len-13], "/default.ascx", 13) == 0 ||
|
||
strncasecmp(&s[len-13], "/default.ashx", 13) == 0 ||
|
||
strncasecmp(&s[len-13], "/default.asmx", 13) == 0 ||
|
||
strncasecmp(&s[len-13], "/default.xhtm", 13) == 0 ||
|
||
strncasecmp(&s[len-13], "/default.aspx", 13) == 0 ) )
|
||
len -= 12;
|
||
else if ( len - 12 > 0 &&
|
||
( strncasecmp(&s[len-12], "/default.htm", 12) == 0 ||
|
||
strncasecmp(&s[len-12], "/default.php", 12) == 0 ||
|
||
strncasecmp(&s[len-12], "/default.asp", 12) == 0 ||
|
||
strncasecmp(&s[len-12], "/index.xhtml", 12) == 0 ) )
|
||
len -= 11;
|
||
else if ( len - 11 > 0 &&
|
||
( strncasecmp(&s[len-11], "/index.html", 11) == 0 ||
|
||
strncasecmp(&s[len-11], "/index.aspx", 11) == 0 ||
|
||
strncasecmp(&s[len-11], "/index.xhtm", 11) == 0 ||
|
||
strncasecmp(&s[len-11], "/default.pl", 11) == 0 ||
|
||
strncasecmp(&s[len-11], "/default.cs", 11) == 0 ) )
|
||
len -= 10;
|
||
else if ( len - 10 > 0 &&
|
||
( strncasecmp(&s[len-10], "/index.htm", 10) == 0 ||
|
||
strncasecmp(&s[len-10], "/index.php", 10) == 0 ||
|
||
strncasecmp(&s[len-10], "/index.asp", 10) == 0 ||
|
||
strncasecmp(&s[len-10], "/main.html", 10) == 0 ||
|
||
strncasecmp(&s[len-10], "/main.aspx", 10) == 0 ) )
|
||
len -= 9;
|
||
else if ( len - 9 > 0 &&
|
||
( strncasecmp(&s[len-9], "/index.pl", 9) == 0 ||
|
||
strncasecmp(&s[len-9], "/main.htm", 9) == 0 ||
|
||
strncasecmp(&s[len-9], "/main.php", 9) == 0 ) )
|
||
len -= 8;
|
||
else if ( len - 8 > 0 &&
|
||
( strncasecmp(&s[len-8], "/main.pl", 8) == 0 ) )
|
||
len -= 7;
|
||
s[len] = '\0';
|
||
}
|
||
|
||
|
||
// replace the "\" with "/" -- a common mistake
|
||
int32_t j;
|
||
for ( j = 0 ; s[j] ; j++)
|
||
{
|
||
if (s[j]=='\\')
|
||
{
|
||
s[j]='/';
|
||
}
|
||
}
|
||
|
||
// . dig out the protocol/scheme for this s (check for ://)
|
||
// . protocol may only have alnums and hyphens in it
|
||
for ( i = 0 ; s[i] && (is_alnum_a(s[i]) || s[i]=='-') ; i++ );
|
||
|
||
// if we have a legal protocol, then set "m_scheme", "slen" and "sch"
|
||
// and advance i to the m_host
|
||
if ( i + 2 < len && s[i]==':' && s[i+1]=='/' && s[i+2]=='/')
|
||
{
|
||
// copy lowercase protocol to "m_url"
|
||
to_lower3_a ( s , i + 3 , m_url );
|
||
m_scheme = m_url;
|
||
m_slen = i;
|
||
m_ulen = i + 3;
|
||
i += 3;
|
||
}
|
||
else
|
||
if (i + 2 < len && s[i]==':' && s[i+1]=='/'&& is_alnum_a(s[i+2]))
|
||
{
|
||
// copy lowercase protocol to "m_url"
|
||
to_lower3_a ( s , i + 2 , m_url );
|
||
// add in needed /
|
||
m_url[i+2]='/';
|
||
m_scheme = m_url;
|
||
m_slen = i;
|
||
m_ulen = i + 3;
|
||
i += 2;
|
||
}
|
||
else
|
||
{
|
||
gbmemcpy ( m_url,"http://" , 7 );
|
||
m_scheme = m_url;
|
||
m_slen = 4;
|
||
m_ulen = 7;
|
||
i = 0;
|
||
// if s started with // then skip that (slashdot)
|
||
if ( s[0]=='/' && s[1]=='/' ) i = 2;
|
||
}
|
||
// . now &s[i] should point to the m_host name
|
||
// . chars allowed in hostname = period,alnum,hyphen,underscore
|
||
// . stops at '/' or ':' or any other disallowed character
|
||
j = i;
|
||
while (s[j] && (is_alnum_a(s[j]) || s[j]=='.' || s[j]=='-'||s[j]=='_'))
|
||
j++;
|
||
// copy m_host into "s" (make it lower case, too)
|
||
to_lower3_a ( s + i, j - i, m_url + m_ulen );
|
||
m_host = m_url + m_ulen;
|
||
m_hlen = j - i;
|
||
// common mistake: if hostname ends in a . then back up
|
||
while ( m_hlen > 0 && m_host[m_hlen-1]=='.' ) m_hlen--;
|
||
// NULL terminate for strchr()
|
||
m_host [ m_hlen ] = '\0';
|
||
|
||
// advance m_ulen to end of hostname
|
||
m_ulen += m_hlen;
|
||
|
||
// . Test if hostname is in a.b.c.d format
|
||
// . this returns 0 if not a valid ip string
|
||
int32_t ip = atoip ( m_host , m_hlen );
|
||
|
||
// advance i to the : for the port, if it exists
|
||
i = j;
|
||
|
||
// NULL terminate m_host for getTLD(), getDomain() and strchr() below
|
||
m_host [ m_hlen ] = '\0';
|
||
|
||
// use ip as domain if we're just an ip address like 192.0.2.1
|
||
if ( ip ) {
|
||
// ip address has no tld, or mid domain
|
||
m_tld = NULL;
|
||
m_tldLen = 0;
|
||
// but it does have a domain (1.2.3)
|
||
m_domain = getDomainOfIp ( m_host , m_hlen , &m_dlen );
|
||
// just use the domain as the mid domain for ip-based urls
|
||
m_mdlen = m_dlen;
|
||
}
|
||
// . otherwise, get the tld
|
||
// . uses thorough list of tlds in Domains.cpp
|
||
else if ( ( m_tld = ::getTLD ( m_host, m_hlen ) ) && m_tld > m_host ) {
|
||
// set m_domain if we had a tld that's not equal to our host
|
||
m_tldLen = strlen ( m_tld );
|
||
m_domain = ::getDomain ( m_host , m_hlen , m_tld , &m_dlen );
|
||
// set the mid domain length (-1 for the '.')
|
||
m_mdlen = m_dlen - m_tldLen - 1;
|
||
}
|
||
// otherwise, we're no ip and we have no valid domain
|
||
else {
|
||
m_domain = NULL;
|
||
m_dlen = 0;
|
||
m_tldLen = 0;
|
||
m_mdlen = 0;
|
||
}
|
||
|
||
// . if domain same as host then we might insert a "www." server name
|
||
// . however, must have a period in domain name
|
||
// . otherwise a domain name of "xxx" would become "www.xxx" and if
|
||
// Url::set() is called on that it would be "www.www.xxx" (bad bad)
|
||
// . let's only add "www." if there's only 1 period, ok?
|
||
if ( ! ip && addWWW && m_host == m_domain && strchr(m_host,'.') ) {
|
||
memmove ( m_host + 4 , m_host , m_hlen );
|
||
gbmemcpy ( m_host , "www." , 4 );
|
||
if ( m_domain ) m_domain += 4;
|
||
if ( m_tld ) m_tld += 4;
|
||
m_ulen += 4;
|
||
m_hlen += 4;
|
||
}
|
||
// set the default port based on the protocol
|
||
m_defPort = 80;
|
||
if ( m_slen==5 && strncmp(m_scheme, "https",5)==0 ) m_defPort = 443;
|
||
// assume we're using the default port for this scheme/protocol
|
||
m_port = m_defPort;
|
||
// see if a port was provided in the hostname after a colon
|
||
if ( s[i] == ':' ) {
|
||
// remember the ptr so far
|
||
int32_t savedLen = m_ulen;
|
||
// add a colon to our m_url
|
||
m_url [ m_ulen++ ] = ':';
|
||
// scan for a '/'
|
||
j = i + 1;
|
||
while ( s[j] && s[j]!='/') m_url[m_ulen++] = s[j++];
|
||
|
||
m_portPtr = s + i + 1;
|
||
m_portPtrLen = j - (i + 1);
|
||
|
||
// now read our port
|
||
m_port = atol2(m_portPtr, m_portPtrLen);
|
||
|
||
// if it's the default port, then remove what we copied
|
||
if ( m_port == m_defPort ) m_ulen = savedLen;
|
||
// make i point to the root / in the m_path, if any
|
||
i = j;
|
||
}
|
||
// how many chars is taken up by a specified port?
|
||
m_portLen = 0;
|
||
if ( m_port != m_defPort ) {
|
||
m_portLen += 2; // :3
|
||
if ( m_port >= 10 ) m_portLen += 1;
|
||
if ( m_port >= 100 ) m_portLen += 1;
|
||
if ( m_port >= 1000 ) m_portLen += 1;
|
||
if ( m_port >= 10000 ) m_portLen += 1;
|
||
}
|
||
|
||
// append a '/' to m_url then bail if there is no m_path after the port
|
||
if ( s[i] != '/') {
|
||
m_path = m_url + m_ulen;
|
||
m_path[0] = '/';
|
||
m_plen = 1;
|
||
m_url[ ++m_ulen ]='\0';
|
||
return;
|
||
}
|
||
|
||
// . get the m_path and m_path length
|
||
// . j,i should point to start of path slash '/'
|
||
// . scan so it points to end or a ? or #
|
||
j = i;
|
||
|
||
// now we include # as part of the path if it is a hash bang '#!'
|
||
// which was the web-breaking google hack that is now deprecated
|
||
while ( s[j] && s[j]!='?' ) {
|
||
if ( s[j] == '#' && s[j+1] != '!' )
|
||
break;
|
||
j++;
|
||
}
|
||
|
||
// point the path inside m_url even though we haven't written it yet
|
||
m_path = m_url + m_ulen;
|
||
m_plen = m_ulen;
|
||
// . deal with wierd things in the path
|
||
// . i points to start of path (should be /)
|
||
for (; i < j ; i++ ) {
|
||
// dedup double backslashes
|
||
// ensure m_ulen >= m_plen so we don't hurt "http:///" ...
|
||
// but people sometimes put http:// in the *path*
|
||
if ( s[i] == '/' && m_url[m_ulen-1] == '/' &&
|
||
m_ulen-1 >= m_plen &&
|
||
m_ulen >= 2 && m_url[m_ulen-2] != ':' ) continue;
|
||
|
||
// handled by UrlParser for version 123 and above
|
||
if (titledbVersion <= 122) {
|
||
// deal with current directories in the m_path
|
||
if ( s[i] == '.' && m_url[m_ulen-1] == '/' &&
|
||
(i+1 == j || s[i+1]=='/')) continue;
|
||
// . deal with damned ..'s in the m_path
|
||
// . if next 2 chars are .'s and last char we wrote was '/'
|
||
if ( s[i] == '.' && s[i+1]=='.' && (s[i+2] == '/' || s[i+2] == '\0') && m_url[m_ulen-1] == '/' ) {
|
||
// dont back up over first / in path
|
||
if ( m_url + m_ulen - 1 > m_path ) m_ulen--;
|
||
while ( m_url[m_ulen-1] != '/' ) m_ulen--;
|
||
// skip i to next / after these 2 dots
|
||
while ( s[i] && s[i]!='/' ) i++;
|
||
continue;
|
||
}
|
||
}
|
||
|
||
// don't allow ; before the ?...probably because of stripped
|
||
// sessionId...
|
||
// I was going to add other possible dup separators, but now
|
||
// it seems as though it might cause problems
|
||
if (s[i] == ';' && s[i+1] == '?') continue;
|
||
|
||
// store char and advance to next
|
||
m_url[m_ulen++] = s[i];
|
||
}
|
||
// reset the path length in case we had to remove some wierd stuff
|
||
m_plen = m_ulen - m_plen;
|
||
|
||
// . get the m_query
|
||
// . the query is anything after the path that starts with ?
|
||
// . NOTE: we ignore strings beginning with '#' (page relative anchors)
|
||
if ( i < len && s[i] != '#' ) {
|
||
//remove back to back &'s in the cgi query
|
||
//http://www.nyasatimes.com/national/politics/160.html?print&&&
|
||
char *kstart = s + i;
|
||
char *kend = s + i + (len - i);
|
||
char *dst = m_url + m_ulen;
|
||
for ( char *k = kstart ; k < kend ; k++ ) {
|
||
// skip & if we just did one
|
||
if ( *k == '&' && k > kstart && *(k-1)=='&' ) continue;
|
||
// copy over one char at a time
|
||
*dst++ = *k;
|
||
}
|
||
// point after the '?' i guess
|
||
m_query = m_url + m_ulen + 1;
|
||
m_qlen = dst - m_query;
|
||
m_ulen += m_qlen + 1;
|
||
}
|
||
// get the m_filename from the m_path (m_flen might be 0)
|
||
m_flen = 0;
|
||
while (m_path[m_plen-1-m_flen]!='/' && m_flen<m_plen) m_flen++;
|
||
m_filename = m_path + m_plen - m_flen;
|
||
|
||
// get the m_extension from the m_path
|
||
m_elen = 0;
|
||
while (is_alnum_a(m_path[m_plen-1-m_elen]) && m_elen < m_plen)m_elen++;
|
||
if ( m_path[ m_plen-1-m_elen] != '.' ) m_elen = 0; // no m_extension
|
||
m_extension = m_path + m_plen - m_elen;
|
||
|
||
// null terminate our s
|
||
m_url[ m_ulen ]='\0';
|
||
}
|
||
|
||
// . a root url has the path "/" and no query
// . NOTE: the hostname is NOT checked here (it used to have to be www or NULL)
|
||
bool Url::isRoot() const {
|
||
if ( m_plen != 1 ) return false;
|
||
if ( !m_path || m_path[0] != '/' ) return false;
|
||
if ( m_query ) return false;
|
||
// for now we'll let all thos *.deviantart.com names clog us up
|
||
// because i don't want to dis' stuff like espn.go.com
|
||
return true;
|
||
}
|
||
|
||
bool Url::isSimpleSubdomain ( ) const {
|
||
// if hostname is same as domain, it's passes
|
||
if ( m_host == m_domain && m_hlen == m_dlen ) return true;
|
||
// if host is not "www." followed by domain, it's NOT
|
||
if ( m_hlen != m_dlen + 4 ) return false;
|
||
if ( strncmp ( m_host , "www." , 4 ) == 0 ) return true;
|
||
return false;
|
||
}
|
||
|
||
// . get length of sub-url #j
|
||
// . basically like adding j /.. to the end of the url
|
||
// . sub-url #0 is the full url
|
||
// . includes /~ as its own path
|
||
int32_t Url::getSubUrlLen ( int32_t j ) const {
|
||
|
||
// assume it's the whole url
|
||
int32_t len = m_ulen;
|
||
|
||
// subtract the m_query (cgi) part at the end of the url
|
||
if ( m_query ) len -= m_qlen + 1; //and the ?
|
||
|
||
// return the full url (without m_query) if j is 0
|
||
if ( j == 0 ) return len;
|
||
|
||
// . start right past the http://m_host.domain.com/
|
||
int32_t start = m_slen + 3 + m_hlen + 1 + m_portLen ;
|
||
while ( len > start ) {
|
||
if ( m_url [ len - 1 ] == '/' ) j--;
|
||
if ( m_url [ len - 2 ] == '/' && m_url [ len - 1 ] == '~') j--;
|
||
// include this backslash (or ~) in the sub-url
|
||
if ( j == 0 ) return len;
|
||
// shrink by one character
|
||
len--;
|
||
}
|
||
|
||
// return 0 if jth sub-url does not exist
|
||
return 0;
|
||
}
|
||
|
||
// . similar to getSubUrlLen() above but only works on the path
|
||
// . if j is 0 that's the whole url path!
|
||
int32_t Url::getSubPathLen ( int32_t j ) const {
|
||
int32_t subUrlLen = getSubUrlLen ( j );
|
||
if ( subUrlLen <= 0 ) return 0;
|
||
// . the subPath length includes the root backslash
|
||
// . portLen includes the whole :8080 thing (for non default ports)
|
||
return subUrlLen - m_slen - 3 - m_hlen - m_portLen;
|
||
}
|
||
|
||
void Url::print() const {
|
||
logf(LOG_TRACE, "Url info");
|
||
logf(LOG_TRACE, "\turl : %s", m_url);
|
||
logf(LOG_TRACE, "\turlhash32 : %" PRIx32, getUrlHash32());
|
||
logf(LOG_TRACE, "\turlhash48 : %" PRIx64, getUrlHash48());
|
||
logf(LOG_TRACE, "\turlhash64 : %" PRIx64, getUrlHash64());
|
||
|
||
logf(LOG_TRACE, "\thost : %.*s", m_hlen, m_host);
|
||
logf(LOG_TRACE, "\thosthash32 : %" PRIx32, getHostHash32());
|
||
logf(LOG_TRACE, "\thosthash48 : %" PRIx64, getHostHash64());
|
||
|
||
logf(LOG_TRACE, "\tscheme : %.*s", m_slen, m_scheme);
|
||
logf(LOG_TRACE, "\tpath : %.*s", m_plen, m_path);
|
||
logf(LOG_TRACE, "\tquery : %s", m_query);
|
||
logf(LOG_TRACE, "\tport : %" PRId32, m_port);
|
||
|
||
logf(LOG_TRACE, "\tdomain : %.*s", m_dlen, m_domain);
|
||
logf(LOG_TRACE, "\tdomainhash32 : %" PRIx32, getDomainHash32());
|
||
logf(LOG_TRACE, "\tdomainhash64 : %" PRIx64, getDomainHash64());
|
||
|
||
logf(LOG_TRACE, "\ttld : %.*s", m_tldLen, m_tld);
|
||
logf(LOG_TRACE, "\tmid domain : %.*s", m_mdlen, m_domain);
|
||
logf(LOG_TRACE, "\tis root : %i", isRoot());
|
||
}
|
||
|
||
int32_t Url::getPathDepth ( bool countFilename ) const {
|
||
const char *s = m_path + 1;
|
||
const char *send = m_url + m_ulen;
|
||
int32_t count = 0;
|
||
while ( s < send ) if ( *s++ == '/' ) count++;
|
||
// if we're counting the filename as a path component...
|
||
if ( countFilename && *(send-1) != '/' ) count++;
|
||
return count;
|
||
}
|
||
|
||
bool Url::isHostWWW ( ) const {
|
||
if ( m_hlen < 4 ) return false;
|
||
if ( m_host[0] != 'w' ||
|
||
m_host[1] != 'w' ||
|
||
m_host[2] != 'w' ||
|
||
m_host[3] != '.' ) return false;
|
||
return true;
|
||
}
|
||
|
||
// . is the url a porn/adult url?
|
||
// . i use /usr/share/dict/words to check for legit words
|
||
// . if it's long and has 4+ hyphens, consider it spam
|
||
// . if you add a word here, add it to PageResults.cpp:isQueryDirty()
|
||
bool Url::isAdult() const {
|
||
//certain TLDs are clearly adult-oriented
|
||
if( isAdultTLD(m_tld, m_tldLen))
|
||
return true;
|
||
|
||
|
||
#if 0
|
||
//@@@
|
||
|
||
if(m_hlen<=0)
|
||
return false; // Invalid URL (no hostname)
|
||
if(m_tldLen<=0)
|
||
return false; // no TLD
|
||
// store the hostname in a buf since we strtok it
|
||
char s [ MAX_URL_LEN ];
|
||
// don't store the .com or .org while searching for isSpam
|
||
int32_t slen = m_hlen - m_tldLen - 1;
|
||
gbmemcpy ( s , m_host , slen );
|
||
if ( ! m_domain ) return false;
|
||
if ( ! m_dlen ) return false;
|
||
//int32_t len = m_dlen;
|
||
//gbmemcpy ( s , m_domain , len );
|
||
// if tld is gov or edu or org, not porn
|
||
if ( m_tldLen >= 3 && strncmp ( m_tld , "edu" , 3 )==0 ) return false;
|
||
if ( m_tldLen >= 3 && strncmp ( m_tld , "gov" , 3 )==0 ) return false;
|
||
// NULL terminate for strstr
|
||
s[slen]='\0';
|
||
// . if there is 4 or more hyphens, and hostLen > 30 consider it spam
|
||
// . actually there seems to be a lot of legit sites with many hyphens
|
||
if ( slen > 30 ) {
|
||
int32_t count = 0;
|
||
char *p = s;
|
||
while ( *p ) if ( *p++ == '-' ) count++;
|
||
if ( count >= 4 ) return true;
|
||
}
|
||
|
||
//
|
||
// TODO: use getMatch()!!!! +pts -pts system
|
||
//
|
||
|
||
// check each thing separated by periods for porn
|
||
const char *send = s + slen;
|
||
const char *p = s;
|
||
|
||
while(p<send) {
|
||
// find the next period or hyphen
|
||
const char *pend = p;
|
||
while ( pend < send && *pend != '.' && *pend !='-' ) pend++;
|
||
// check that
|
||
if ( isAdultUrl ( p , pend - p ) ) return true;
|
||
// point to next
|
||
p = pend + 1;
|
||
}
|
||
#endif
|
||
return false;
|
||
}
|
||
|
||
// . remove any session id
|
||
// . I'm sick of these things causing dup problems
|
||
// . types:
|
||
// http://www.b.com/?PHPSESSID=737aec14eb7b360983d4fe39395
|
||
// http://www.b.com/cat.cgi/process?mv_session_id=xrf2EY3q&
|
||
// http://www.b.com/default?SID=f320a739cdecb4c3edef67e
|
||
|
||
// http://www.b.com/generic.html;$sessionid$QVBMODQAAAGNA?pid=7
|
||
// http://www.b.com/p.jhtml;jsessionid=J4QMFWBG1SPRVWCKUUXCJ0W?stuff=1
|
||
// look for ';'
|
||
// look for PHPSESSID, session_id, SID, jsessionid
|
||
// followed by string of at least 4 letters/numbers
|
||
|
||
//List of extensions NOT to parse
|
||
static const char * const s_badExtensions[] = {
|
||
"ai",
|
||
"aif",
|
||
"aifc",
|
||
"aiff",
|
||
"asc",
|
||
"au",
|
||
"avi",
|
||
"bcpio",
|
||
"bin",
|
||
"bmp",
|
||
"bz2",
|
||
//"c",
|
||
//"cc",// c source code, allow
|
||
"ccad",
|
||
"cdf",
|
||
//"class",// text source code file usually, allow
|
||
"cpio",
|
||
"cpt",
|
||
//"csh",
|
||
"css",
|
||
"dcr",
|
||
"dir",
|
||
"dms",
|
||
//"doc",
|
||
"drw",
|
||
"dvi",
|
||
"dwg",
|
||
"dxf",
|
||
"dxr",
|
||
"eps",
|
||
"etx",
|
||
"exe",
|
||
"ez",
|
||
//"f", // ambigous
|
||
"f90",
|
||
"fli",
|
||
"gif",
|
||
"gtar",
|
||
"gz",
|
||
//"h",
|
||
"hdf",
|
||
"hh",
|
||
"hqx",
|
||
//"htm",
|
||
//"html",
|
||
"ice",
|
||
"ief",
|
||
"iges",
|
||
"igs",
|
||
"ips",
|
||
"ipx",
|
||
"jpe",
|
||
"jpeg",
|
||
"jpg",
|
||
//"js",
|
||
"kar",
|
||
"latex",
|
||
"lha",
|
||
"lsp",
|
||
"lzh",
|
||
//"m", // ambiguous
|
||
"man",
|
||
"me",
|
||
"mesh",
|
||
"mid",
|
||
"midi",
|
||
"mif",
|
||
"mime",
|
||
"mov",
|
||
"movie",
|
||
"mp2",
|
||
"mp3",
|
||
"mpe",
|
||
"mpeg",
|
||
"mpg",
|
||
"mpga",
|
||
"ms",
|
||
"msh",
|
||
"nc",
|
||
"oda",
|
||
"pbm",
|
||
"pdb",
|
||
//"pdf",
|
||
"pgm",
|
||
"pgn",
|
||
"png",
|
||
"pnm",
|
||
"pot",
|
||
"ppm",
|
||
"pps",
|
||
// "ppt",
|
||
"ppz",
|
||
"pre",
|
||
"prt",
|
||
// "ps",
|
||
"qt",
|
||
"ra",
|
||
"ram",
|
||
"ras",
|
||
"rgb",
|
||
"rm",
|
||
"roff",
|
||
"rpm",
|
||
"deb", // debian/ubuntu package file
|
||
"rtf",
|
||
"rtx",
|
||
"scm",
|
||
"set",
|
||
"sgm",
|
||
"sgml",
|
||
//"sh", // shells are text files
|
||
"shar",
|
||
"silo",
|
||
"sit",
|
||
"skd",
|
||
"skm",
|
||
"skp",
|
||
"skt",
|
||
"smi",
|
||
"smil",
|
||
"snd",
|
||
"sol",
|
||
"spl",
|
||
"src",
|
||
"step",
|
||
"stl",
|
||
"stp",
|
||
"sv4cpio",
|
||
"sv4crc",
|
||
"swf",
|
||
//"t", // ambiguous ... Mr.T.
|
||
"tar",
|
||
"tcl",
|
||
"tex",
|
||
"texi",
|
||
"texinfo",
|
||
"tif",
|
||
"tiff",
|
||
"tr",
|
||
"tsi",
|
||
"tsp",
|
||
"tsv",
|
||
//"txt",
|
||
"unv",
|
||
"ustar",
|
||
"vcd",
|
||
"vda",
|
||
"viv",
|
||
"vivo",
|
||
"vrml",
|
||
"wav",
|
||
"wrl",
|
||
"xbm",
|
||
"xlc",
|
||
"xll",
|
||
"xlm",
|
||
//"xls",
|
||
"xlw",
|
||
//"xml",
|
||
"xpm",
|
||
"xwd",
|
||
"xyz",
|
||
"zip",//
|
||
};//look below, I added 3 more types for TR version 73
|
||
static const size_t s_badExtensionsCount = sizeof(s_badExtensions)/sizeof(s_badExtensions[0]);
|
||
|
||
|
||
|
||
static HashTable s_badExtTable;
|
||
static bool s_badExtInitialized;
|
||
static GbMutex s_badExtTableMutex;
|
||
|
||
//returns True if the extension is listed as bad
|
||
bool Url::hasNonIndexableExtension( int32_t version ) const {
|
||
if ( ! m_extension || m_elen == 0 ) return false;
|
||
ScopedLock sl(s_badExtTableMutex);
|
||
if(!s_badExtInitialized) { //if hash has not been created-create one
|
||
//version 72 and before.
|
||
for(size_t i=0; i<s_badExtensionsCount; i++) {
|
||
int tlen = strlen(s_badExtensions[i]);
|
||
int64_t swh = hash64Lower_a(s_badExtensions[i],tlen);
|
||
if(!s_badExtTable.addKey(swh,(int32_t)50))
|
||
{
|
||
log(LOG_ERROR,"hasNonIndexableExtension: Could not add hash %" PRId64" to badExtTable.", swh);
|
||
return false;
|
||
}
|
||
}
|
||
|
||
|
||
//version 73 and after.
|
||
if(!s_badExtTable.addKey(hash64Lower_a("wmv", 3),(int32_t)73) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("wma", 3),(int32_t)73) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("ogg", 3),(int32_t)73))
|
||
{
|
||
log(LOG_ERROR,"hasNonIndexableExtension: Could not add hash to badExtTable (2).");
|
||
return false;
|
||
}
|
||
|
||
// More unwanted extensions
|
||
if(
|
||
!s_badExtTable.addKey(hash64Lower_a("7z", 2),(int32_t)122) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("lz", 2),(int32_t)122) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("xz", 2),(int32_t)122) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("apk", 3),(int32_t)122) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("com", 3),(int32_t)122) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("dic", 3),(int32_t)122) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("dll", 3),(int32_t)122) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("dmg", 3),(int32_t)122) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("flv", 3),(int32_t)122) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("gpx", 3),(int32_t)122) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("ico", 3),(int32_t)122) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("iso", 3),(int32_t)122) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("kmz", 3),(int32_t)122) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("mp4", 3),(int32_t)122) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("rar", 3),(int32_t)122) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("svg", 3),(int32_t)122) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("vcf", 3),(int32_t)122) ||
|
||
// !s_badExtTable.addKey(hash64Lower_a("xls", 3),(int32_t)122) || // Should be handled by converter (AbiWord)
|
||
!s_badExtTable.addKey(hash64Lower_a("lzma", 4),(int32_t)122) ||
|
||
// !s_badExtTable.addKey(hash64Lower_a("pptx", 4),(int32_t)122) || // Should be handled by converter (AbiWord)
|
||
!s_badExtTable.addKey(hash64Lower_a("thmx", 4),(int32_t)122) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("zipx", 4),(int32_t)122) ||
|
||
// !s_badExtTable.addKey(hash64Lower_a("xlsx", 4),(int32_t)122) || // Should be handled by converter (AbiWord)
|
||
!s_badExtTable.addKey(hash64Lower_a("zsync", 5),(int32_t)122) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("torrent", 7),(int32_t)122) ||
|
||
!s_badExtTable.addKey(hash64Lower_a("manifest", 8),(int32_t)122)
|
||
)
|
||
{
|
||
log(LOG_ERROR,"hasNonIndexableExtension: Could not add hash to badExtTable (3).");
|
||
return false;
|
||
}
|
||
|
||
s_badExtInitialized = true;
|
||
}
|
||
|
||
|
||
int myKey = hash64Lower_a(m_extension,m_elen);
|
||
int32_t badVersion = s_badExtTable.getValue(myKey);
|
||
|
||
if( badVersion == 0 || badVersion > version )
|
||
{
|
||
return false;
|
||
}
|
||
|
||
return true;
|
||
}
|
||
|
||
bool Url::hasXmlExtension ( ) const {
|
||
|
||
if ( ! m_extension || ! m_elen || m_elen > 3 ) return false;
|
||
|
||
char ext[5];
|
||
int i;
|
||
for(i=0; i < m_elen; i++)
|
||
{
|
||
ext[i] = to_lower_a(m_extension[i]);
|
||
}
|
||
ext[i] = '\0';
|
||
|
||
switch( m_elen )
|
||
{
|
||
case 3:
|
||
if( memcmp(ext, "xml", 3) == 0 )
|
||
{
|
||
return true;
|
||
}
|
||
break;
|
||
default:
|
||
break;
|
||
}
|
||
|
||
return false;
|
||
}
|
||
|
||
|
||
bool Url::hasJsonExtension ( ) const {
|
||
|
||
if ( ! m_extension || ! m_elen || m_elen >= 4 ) return false;
|
||
|
||
char ext[5];
|
||
int i;
|
||
for(i=0; i < m_elen; i++)
|
||
{
|
||
ext[i] = to_lower_a(m_extension[i]);
|
||
}
|
||
ext[i] = '\0';
|
||
|
||
switch( m_elen )
|
||
{
|
||
case 4:
|
||
if( memcmp(ext, "json", 4) == 0 )
|
||
{
|
||
return true;
|
||
}
|
||
break;
|
||
default:
|
||
break;
|
||
}
|
||
|
||
return false;
|
||
}
|
||
|
||
|
||
bool Url::hasScriptExtension ( ) const {
|
||
|
||
if ( ! m_extension || ! m_elen || m_elen > 4 ) return false;
|
||
|
||
char ext[5];
|
||
int i;
|
||
for(i=0; i < m_elen; i++)
|
||
{
|
||
ext[i] = to_lower_a(m_extension[i]);
|
||
}
|
||
ext[i] = '\0';
|
||
|
||
switch( m_elen )
|
||
{
|
||
case 2:
|
||
if( memcmp(ext, "js", 2) == 0 )
|
||
{
|
||
return true;
|
||
}
|
||
break;
|
||
default:
|
||
break;
|
||
}
|
||
|
||
return false;
|
||
}
|
||
|
||
|
||
|
||
// see Url.h for a description of this.
|
||
bool Url::isLinkLoop ( ) const {
|
||
const char *s = m_path ;
|
||
const char *send = m_url + m_ulen;
|
||
int32_t count = 0;
|
||
int32_t components = 0;
|
||
bool prevWasDouble = false;
|
||
const char *last = NULL;
|
||
if (!s) return false;
|
||
// use this hash table to hash each path component in the url
|
||
char buf [ 5000 ];
|
||
HashTable t; t.set ( 100 , buf , 5000 );
|
||
// grab each path component
|
||
for ( ; s < send ; s++ ) {
|
||
if ( *s != '/' ) continue;
|
||
// ok, add this guy to the hash table, if we had one
|
||
if ( ! last ) { last = s; continue; }
|
||
// give up after 50 components
|
||
if ( components++ >= 50 ) return false;
|
||
// hash him
|
||
uint32_t h = hash32 ( last , s - last );
|
||
// is he in there?
|
||
int32_t slot = t.getSlot ( h );
|
||
// get his val (count)
|
||
int32_t val = 0;
|
||
if ( slot >= 0 ) val = t.getValueFromSlot ( slot );
|
||
// if not in there put him in a slot
|
||
if ( slot < 0 ) {
|
||
last = s;
|
||
t.addKey ( h , 1 );
|
||
continue;
|
||
}
|
||
// increment it
|
||
val++;
|
||
// does it occur 3 or more times? if so, we have a link loop
|
||
if ( val >= 3 ) return true;
|
||
// is it 2 or more?
|
||
if ( val == 2 ) count++;
|
||
// if we have two such components, then we are a link loop.
|
||
// BUT, we must be a pair!
|
||
if ( count >= 2 && prevWasDouble ) return true;
|
||
// set this so in case next guy is a double
|
||
if ( val == 2 ) prevWasDouble = true;
|
||
else prevWasDouble = false;
|
||
// add it back after incrementing
|
||
t.setValue ( slot , val );
|
||
// update "last"
|
||
last = s;
|
||
}
|
||
return false;
|
||
}
|
||
|
||
//
|
||
// here are some examples of link loops in urls:
|
||
//
|
||
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/letters/send/archive/bish/archive/bish/letters/bish/archive/lette\rs/send/archive/letters/send/bish/letters/archive/bish/letters/
|
||
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/bish/letters/archive/bish/archive/letters/send/archive/letters/send/archive/le\tters/send/archive/letters/send/bish/
|
||
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/bish/letters/send/archive/letters/send/archive/bish/archive/bish/\archive/bish/letters/send/archive/letters/archive/letters/send/archive/bish/let\ters/
|
||
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/letters/send/archive/letters/archive/bish/archive/bish/archive/bi\sh/letters/send/archive/bish/archive/letters/send/bish/archive/bish/letters/sen\d/archive/
|
||
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/bish/letters/send/archive/bish/letters/bish/letters/send/archive/\bish/archive/letters/bish/letters/send/archive/bish/letters/send/bish/archive/l\etters/bish/letters/archive/letters/send/
|
||
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/bish/letters/send/archive/bish/letters/send/bish/archive/letters/\send/bish/archive/letters/send/archive/letters/bish/archive/bish/archive/letter\s/
|
||
|
||
bool Url::isValid() const {
|
||
// validate port
|
||
if (m_port <= 0 || m_port > 65535 || m_portPtrLen > 5) {
|
||
return false;
|
||
}
|
||
|
||
return true;
|
||
}
|
||
|
||
bool Url::isIp() const {
|
||
if(!m_host) return false;
|
||
if(!is_digit(*m_host)) return false;
|
||
return atoip ( m_host , m_hlen );
|
||
}
|
||
|
||
int32_t Url::getHash32WithWWW ( ) const {
|
||
uint32_t hh = hash32n ( "www." );
|
||
int32_t conti = 4;
|
||
hh = hash32_cont ( m_domain , m_dlen , hh , &conti );
|
||
return hh;
|
||
}
|
||
|
||
int32_t Url::getHostHash32 ( ) const {
|
||
return hash32 ( m_host , m_hlen );
|
||
}
|
||
|
||
int64_t Url::getHostHash64 ( ) const {
|
||
return hash64 ( m_host , m_hlen );
|
||
}
|
||
|
||
int32_t Url::getDomainHash32 ( ) const {
|
||
return hash32 ( m_domain , m_dlen );
|
||
}
|
||
|
||
int64_t Url::getDomainHash64 ( ) const {
|
||
return hash64 ( m_domain , m_dlen );
|
||
}
|
||
|
||
int32_t Url::getUrlHash32 ( ) const {
|
||
return hash32(m_url,m_ulen);
|
||
}
|
||
|
||
int64_t Url::getUrlHash64 ( ) const {
|
||
return hash64(m_url,m_ulen);
|
||
}
|
||
|
||
const char *getHostFast ( const char *url , int32_t *hostLen , int32_t *port ) {
|
||
// point to the url
|
||
const char *pp = url;
|
||
// skip http(s):// or ftp:// (always there?)
|
||
while ( *pp && *pp != ':' ) pp++;
|
||
// skip ://
|
||
pp += 3;
|
||
// point "uhost" to hostname right away
|
||
const char *uhost = pp;
|
||
// advance "pp" till we hit a / or :<port>
|
||
while ( *pp && *pp !='/' && *pp !=':' ) pp++;
|
||
// advance "pe" over the port
|
||
const char *pe = pp;
|
||
if ( *pp == ':' ) {
|
||
// if port ptr given, do not treat port as part of hostname
|
||
if ( port ) *port = atoi(pp+1);
|
||
// i think this was including :1234 as part of hostname
|
||
// if port was NULL!
|
||
//else while ( *pe && *pe != '/' ) pe++;
|
||
}
|
||
// set length
|
||
if ( hostLen ) *hostLen = pe - uhost;
|
||
return uhost;
|
||
}
|
||
|
||
char *getPathFast ( char *url ) {
|
||
// point to the url
|
||
char *pp = url;
|
||
// skip http(s):// or ftp:// (always there?)
|
||
while ( *pp && *pp != ':' ) pp++;
|
||
// skip ://
|
||
pp += 3;
|
||
// point "uhost" to hostname right away
|
||
//char *uhost = pp;
|
||
// advance "pp" till we hit a / or :<port>
|
||
while ( *pp && *pp !='/' && *pp !=':' ) pp++;
|
||
// advance "pe" over the port
|
||
char *pe = pp;
|
||
if ( *pp == ':' )
|
||
while ( *pe && *pe != '/' ) pe++;
|
||
// but not if something follows the '/'
|
||
return pe;
|
||
}
|
||
|
||
const char *getTLDFast(const char *url, int32_t *tldLen, bool hasHttp) {
|
||
// point to the url
|
||
const char *pp = url;
|
||
// only do this for some
|
||
if ( hasHttp ) {
|
||
// skip http(s):// or ftp:// (always there?)
|
||
while ( *pp && *pp != ':' ) pp++;
|
||
// skip ://
|
||
pp += 3;
|
||
}
|
||
// point "uhost" to hostname right away
|
||
const char *uhost = pp;
|
||
|
||
// advance "pp" till we hit a / or :<port> or \0
|
||
while ( *pp && *pp !='/' && *pp !=':' ) pp++;
|
||
|
||
// advance "pe" over the port
|
||
const char *pe = pp;
|
||
if ( *pp == ':' ) {
|
||
while ( *pe && *pe != '/' ) {
|
||
pe++;
|
||
}
|
||
}
|
||
|
||
// set length of host
|
||
int32_t uhostLen = pp - uhost;
|
||
// . is the hostname just an IP address?
|
||
// . if it is an ip based url make domain the hostname
|
||
const char *ss = uhost;
|
||
bool isIp = true;
|
||
for ( ; *ss && ss<pp ; ss++ ) {
|
||
if ( is_alpha_a( *ss ) ) {
|
||
isIp = false;
|
||
break;
|
||
}
|
||
}
|
||
|
||
// if ip, no tld
|
||
if ( isIp ) {
|
||
return NULL;
|
||
}
|
||
|
||
// get the tld
|
||
const char *tld = ::getTLD ( uhost , uhostLen );
|
||
|
||
// if none, done
|
||
if ( ! tld ) {
|
||
return NULL;
|
||
}
|
||
|
||
// set length
|
||
if ( tldLen ) {
|
||
*tldLen = pp - tld;
|
||
}
|
||
|
||
// return it
|
||
return tld;
|
||
}
|
||
|
||
bool hasSubdomain(const char *url) {
|
||
// point to the url
|
||
const char *pp = url;
|
||
// skip http if there
|
||
if ( pp[0] == 'h' &&
|
||
pp[1] == 't' &&
|
||
pp[2] == 't' &&
|
||
pp[3] == 'p' &&
|
||
pp[4] == ':' &&
|
||
pp[5] == '/' &&
|
||
pp[6] == '/' )
|
||
pp += 7;
|
||
else if ( pp[0] == 'h' &&
|
||
pp[1] == 't' &&
|
||
pp[2] == 't' &&
|
||
pp[3] == 'p' &&
|
||
pp[4] == 's' &&
|
||
pp[5] == ':' &&
|
||
pp[6] == '/' &&
|
||
pp[7] == '/' )
|
||
pp += 8;
|
||
// point "uhost" to hostname right away
|
||
const char *uhost = pp;
|
||
// advance "pp" till we hit a / or :<port>
|
||
while ( *pp && *pp !='/' && *pp !=':' ) pp++;
|
||
// are we a root? assume so.
|
||
//char isRoot = true;
|
||
// advance "pe" over the port
|
||
const char *pe = pp;
|
||
if ( *pp == ':' )
|
||
while ( *pe && *pe != '/' ) pe++;
|
||
// but not if something follows the '/'
|
||
//if ( *pe == '/' && *(pe+1) ) isRoot = false;
|
||
// set length
|
||
int32_t uhostLen = pp - uhost;
|
||
// get end
|
||
//char *hostEnd = uhost + uhostLen;
|
||
// . is the hostname just an IP address?
|
||
// . if it is an ip based url make domain the hostname
|
||
const char *ss = uhost;
|
||
while ( *ss && !is_alpha_a(*ss) && ss<pp ) ss++;
|
||
// if we are an ip, say yes
|
||
if ( ss == pp ) return true;
|
||
// get the tld
|
||
const char *utld = ::getTLD ( uhost , uhostLen );
|
||
// no tld, then no domain
|
||
if ( ! utld ) return false;
|
||
// the domain, can only be gotten once we know the TLD
|
||
// back up a couple chars
|
||
const char *udom = utld - 2;
|
||
// backup until we hit a '.' or hit the beginning
|
||
while ( udom > uhost && *udom != '.' ) udom--;
|
||
// fix http://ok/
|
||
if ( udom < uhost || *udom =='/' ) return false;
|
||
// if we hit '.' advance 1
|
||
if ( *udom == '.' ) udom++;
|
||
// eqal to host? if not, we do have a subdomain
|
||
if ( udom != uhost ) return true;
|
||
// otherwise the hostname equals the domain name
|
||
return false;
|
||
}
|
||
|
||
// returns NULL if url was in bad format and could not get domain. this
|
||
// was happening when a host gave us a bad redir url and xmldoc tried
|
||
// to set extra doc's robot.txt url to it "http://2010/robots.txt" where
|
||
// the host said "Location: 2010 ...".
|
||
const char *getDomFast ( const char *url , int32_t *domLen , bool hasHttp ) {
|
||
// point to the url
|
||
const char *pp = url;
|
||
// skip http if there
|
||
if ( hasHttp ) {
|
||
// skip http(s):// or ftp:// (always there?)
|
||
while ( *pp && *pp != ':' ) pp++;
|
||
// skip ://
|
||
pp += 3;
|
||
}
|
||
// point "uhost" to hostname right away
|
||
const char *uhost = pp;
|
||
// advance "pp" till we hit a / or :<port>
|
||
while ( *pp && *pp !='/' && *pp !=':' ) pp++;
|
||
|
||
// advance "pe" over the port
|
||
const char *pe = pp;
|
||
if ( *pp == ':' )
|
||
while ( *pe && *pe != '/' ) pe++;
|
||
|
||
// set length
|
||
int32_t uhostLen = pp - uhost;
|
||
// get end
|
||
const char *hostEnd = uhost + uhostLen;
|
||
// . is the hostname just an IP address?
|
||
// . if it is an ip based url make domain the hostname
|
||
const char *ss = uhost;
|
||
while ( *ss && !is_alpha_a(*ss) && ss<pp ) ss++;
|
||
//bool isIp = false;
|
||
//if ( ss == pp ) isIp = true;
|
||
// if we are an ip, treat special
|
||
if ( ss == pp ) {
|
||
// . might just be empty! like "\0"
|
||
// . fixes core dump from
|
||
// http://www.marcom1.unimelb.edu.au/public/contact.html
|
||
// parsing host email address
|
||
if ( uhostLen == 0 ) return NULL;
|
||
// to be consistent with how Url::m_domain/m_dlen is set we
|
||
// need to remove the last .X from the ip address
|
||
// skip back over digits
|
||
for ( hostEnd-- ; is_digit(*hostEnd); hostEnd-- );
|
||
// must be a period
|
||
if ( *hostEnd != '.' ) {
|
||
log("url: getDomFast() could not find period for "
|
||
"hostname in url");
|
||
return NULL;
|
||
}
|
||
// set length
|
||
*domLen = hostEnd - uhost;
|
||
// that's it
|
||
return uhost;
|
||
}
|
||
// get the tld
|
||
const char *utld = ::getTLD ( uhost , uhostLen );
|
||
// no tld, then no domain
|
||
if ( ! utld ) return NULL;
|
||
// the domain, can only be gotten once we know the TLD
|
||
// set utldLen
|
||
//int32_t utldLen = hostEnd - utld;
|
||
// back up a couple chars
|
||
const char *udom = utld - 2;
|
||
// backup until we hit a '.' or hit the beginning
|
||
while ( udom > uhost && *udom != '.' ) udom--;
|
||
// fix http://ok/
|
||
if ( udom < uhost || *udom =='/' ) return NULL;
|
||
// if we hit '.' advance 1
|
||
if ( *udom == '.' ) udom++;
|
||
// set domain length
|
||
*domLen = hostEnd - udom;
|
||
return udom;
|
||
}
|
||
|
||
|
||
// "s" point to the start of a normalized url (includes http://, etc.)
|
||
const char *getHost(const char *s, int32_t *hostLen) {
|
||
// skip proto
|
||
while ( *s != ':' ) s++;
|
||
// skip ://
|
||
s += 3;
|
||
// that is the host
|
||
const char *host = s;
|
||
// get length of hostname
|
||
for ( s++; *s && *s != '/' ; s++ );
|
||
// that is it
|
||
*hostLen = s - host;
|
||
// return it
|
||
return host;
|
||
}
|
||
|
||
// "s" point to the start of a normalized url (includes http://, etc.)
|
||
const char *getScheme ( const char *s , int32_t *schemeLen )
|
||
{
|
||
const char *div = strstr(s, "://");
|
||
|
||
if( !div )
|
||
{
|
||
*schemeLen=0;
|
||
return "";
|
||
}
|
||
|
||
*schemeLen = div - s;
|
||
return s;
|
||
}
|
||
|
||
// . return ptrs to the end
|
||
// . the character it points to SHOULD NOT BE part of the site
|
||
const char *getPathEnd(const char *s, int32_t desiredDepth) {
|
||
// skip proto
|
||
while ( *s != ':' ) s++;
|
||
// skip ://
|
||
s += 3;
|
||
// get length of hostname
|
||
for ( s++; *s && *s != '/' ; s++ );
|
||
// should always have a /
|
||
if ( *s != '/' ) gbshutdownLogicError();
|
||
// skip that
|
||
s++;
|
||
// init depth
|
||
int32_t depth = 0;
|
||
// do a character loop
|
||
for ( ; depth <= desiredDepth && *s ; s++ )
|
||
// count the '/'
|
||
if ( *s == '/' ) depth++;
|
||
// return the end
|
||
return s;
|
||
/*
|
||
// save for below
|
||
int32_t saved = depth;
|
||
// keep going
|
||
while ( depth-- > 0 ) {
|
||
for ( s++; *s && *s != '/' && *s != '?' ; s++ );
|
||
// if not enough path components (or cgi), return NULL
|
||
if ( *s != '/' ) return NULL;
|
||
}
|
||
// include the last '/' if we have path components
|
||
if ( saved > 0 ) s++;
|
||
// . we got it
|
||
// . if depth==0 just use "www.xyz.com" as site
|
||
// . if depth==1 just use "www.xyz.com/foo/" as site
|
||
return s;
|
||
*/
|
||
}
|
||
|
||
// . pathDepth==0 for "www.xyz.com"
|
||
// . pathDepth==0 for "www.xyz.com/"
|
||
// . pathDepth==0 for "www.xyz.com/foo"
|
||
// . pathDepth==1 for "www.xyz.com/foo/"
|
||
// . pathDepth==1 for "www.xyz.com/foo/x"
|
||
// . pathDepth==2 for "www.xyz.com/foo/x/"
|
||
// . pathDepth==2 for "www.xyz.com/foo/x/y"
|
||
int32_t getPathDepth(const char *s, bool hasHttp) {
|
||
// skip http:// if we got it
|
||
if ( hasHttp ) {
|
||
// skip proto
|
||
while ( *s != ':' ) s++;
|
||
// must have it!
|
||
if ( ! *s ) gbshutdownLogicError();
|
||
// skip ://
|
||
s += 3;
|
||
}
|
||
// skip over hostname
|
||
for ( s++; *s && *s != '/' ; s++ );
|
||
// no, might be a site like "xyz.com"
|
||
if ( ! *s ) return 0;
|
||
// should always have a /
|
||
if ( *s != '/' ) gbshutdownLogicError();
|
||
// skip that
|
||
s++;
|
||
// init depth
|
||
int32_t depth = 0;
|
||
// do a character loop
|
||
for ( ; *s ; s++ ) {
|
||
// stop if we hit ? or #
|
||
if ( *s == '?' ) break;
|
||
if ( *s == '#' ) break;
|
||
// count the '/'
|
||
if ( *s == '/' ) depth++;
|
||
}
|
||
return depth;
|
||
}
|
||
|
||
|
||
bool Url::isPunycodeSafeTld() const {
|
||
//TODO: use a configuration file for this or some more clever logic
|
||
//Some ccTLDs are safe because they only allow punycode for non-ascii letters that used by the country's language(s).
|
||
//Firefox/mozilla used to use a TLD whitelist, but then switched to a "no mixed scripts" rule, which mostly works but
|
||
//fails for www.са.com (note: the "ca" in that url is cyrillic letters)
|
||
if(m_tldLen==2) {
|
||
//Some ccTLDs have strict and sensible policies
|
||
//List inspired by firefox's old/unused whitelist
|
||
static const char *safe_cctld[] = {
|
||
"ac",
|
||
"ar",
|
||
"at",
|
||
"br",
|
||
"ca",
|
||
"ch",
|
||
"cl",
|
||
"de",
|
||
"dk",
|
||
"ee",
|
||
"es",
|
||
"fi",
|
||
"fr",
|
||
"gr",
|
||
"gt",
|
||
"hu",
|
||
"il",
|
||
"is",
|
||
"jp",
|
||
"li",
|
||
"lt",
|
||
"lu",
|
||
"lv",
|
||
"no",
|
||
"nz",
|
||
"pl",
|
||
"se",
|
||
"ua",
|
||
nullptr
|
||
};
|
||
for(size_t i=0; safe_cctld[i]; i++)
|
||
if(memcmp(m_tld,safe_cctld[i],2)==0)
|
||
return true;
|
||
}
|
||
//example.com/are safe (reserved for documentation purposes and we use them internally for testing)
|
||
if(m_dlen==11 && memcmp(m_domain,"example.com",11)==0)
|
||
return true;
|
||
if(m_dlen==11 && memcmp(m_domain,"example.net",11)==0)
|
||
return true;
|
||
return false;
|
||
}
|
||
|
||
bool Url::hasPunycode() const {
|
||
const char *s = (const char*)memmem(m_host,m_hlen,"xn--",4);
|
||
if(!s)
|
||
return false;
|
||
if(s==m_host || s[-1]=='.')
|
||
return true;
|
||
else
|
||
return false;
|
||
}
|
||
|
||
bool Url::getPunycodeDecodedHost(SafeBuf *sb) const {
|
||
const char *s = m_host;
|
||
const char *end = m_host+m_hlen;
|
||
while(s<end) {
|
||
const char *d = (const char*)memchr(s,'.',end-s);
|
||
if(!d)
|
||
d = end;
|
||
if(d-s<4 || memcmp(s,"xn--",4)!=0) {
|
||
sb->safeMemcpy(s,d-s);
|
||
} else {
|
||
uint32_t decoded[256]; //64 should be enough according to DNS spec, but let's be a bit safer
|
||
size_t decoded_count = 256;
|
||
punycode_status status = punycode_decode(d-s-4, s+4, &decoded_count, decoded, NULL);
|
||
if(status!=punycode_success) {
|
||
log(LOG_WARN, "build: Could not decode punycode '%.*s' component in host '%.*s'", (int)(d-s),s, m_hlen, m_host);
|
||
return false;
|
||
}
|
||
char utf8buf[256*4];
|
||
size_t utf8len = 0;
|
||
for(size_t i=0; i<decoded_count; i++)
|
||
utf8len += utf8Encode(decoded[i],utf8buf+utf8len);
|
||
sb->safeMemcpy(utf8buf,utf8len);
|
||
}
|
||
if(d<end)
|
||
sb->safeMemcpy(".",1);
|
||
s = d+1;
|
||
}
|
||
return true;
|
||
}
|
||
|
||
|
||
bool Url::getPunycodeDecodedMidDomain(SafeBuf *sb) const {
|
||
const char *s = m_domain;
|
||
const char *end = m_domain+m_mdlen;
|
||
const char *d = end;
|
||
if(!d)
|
||
d = end;
|
||
if(d-s<4 || memcmp(s,"xn--",4)!=0) {
|
||
sb->safeMemcpy(s,d-s);
|
||
} else {
|
||
uint32_t decoded[256]; //64 should be enough according to DNS spec, but let's be a bit safer
|
||
size_t decoded_count = 256;
|
||
punycode_status status = punycode_decode(d-s-4, s+4, &decoded_count, decoded, NULL);
|
||
if(status!=punycode_success) {
|
||
log(LOG_WARN, "build: Could not decode punycode '%.*s' component in middomain '%.*s'", (int)(d-s),s, m_hlen, m_host);
|
||
return false;
|
||
}
|
||
char utf8buf[256*4];
|
||
size_t utf8len = 0;
|
||
for(size_t i=0; i<decoded_count; i++)
|
||
utf8len += utf8Encode(decoded[i],utf8buf+utf8len);
|
||
sb->safeMemcpy(utf8buf,utf8len);
|
||
}
|
||
if(d<end)
|
||
sb->safeMemcpy(".",1);
|
||
return true;
|
||
}
|
||
|
||
|
||
|
||
char* Url::getDisplayUrl( const char* url, SafeBuf* sb ) {
|
||
const char *urlEnd = url + strlen(url);
|
||
const char *p = url;
|
||
if ( strncmp( p, "http://", 7 ) == 0 )
|
||
p += 7;
|
||
else if ( strncmp(p, "https://", 8 ) == 0 )
|
||
p += 8;
|
||
|
||
const char *domEnd = static_cast<const char*>( memchr( p, '/', urlEnd - p ) );
|
||
if (domEnd == NULL) {
|
||
domEnd = urlEnd;
|
||
}
|
||
|
||
bool firstRun = true;
|
||
const char *found = NULL;
|
||
const char *labelCursor = url;
|
||
|
||
while( ( found = strstr( labelCursor, "xn--" ) ) && ( found < domEnd ) ) {
|
||
if ( firstRun ) {
|
||
sb->safeMemcpy( url, found - url );
|
||
firstRun = false;
|
||
}
|
||
|
||
const char* encodedStart = found + 4;
|
||
uint32_t decoded [ MAX_URL_LEN];
|
||
size_t decodedLen = MAX_URL_LEN - 1 ;
|
||
const char* labelEnd = encodedStart;
|
||
while( labelEnd < domEnd && *labelEnd != '/' && *labelEnd != '.' ) {
|
||
labelEnd++;
|
||
}
|
||
|
||
punycode_status status = punycode_decode(labelEnd - encodedStart, encodedStart, &decodedLen, decoded, NULL);
|
||
if ( status != 0 ) {
|
||
log( "build: Bad Engineer, failed to depunycode international url %s", url );
|
||
sb->safePrintf("%s", labelCursor);
|
||
sb->nullTerm();
|
||
return sb->getBufStart();
|
||
}
|
||
|
||
sb->utf32Encode( decoded, decodedLen );
|
||
|
||
if ( *labelEnd == '.' ) {
|
||
sb->pushChar( *labelEnd++ );
|
||
}
|
||
|
||
labelCursor = labelEnd;
|
||
}
|
||
|
||
// Copy in the rest
|
||
sb->safePrintf("%s", labelCursor);
|
||
sb->nullTerm();
|
||
return sb->getBufStart();
|
||
}
|