#include "Url.h"
#include "UrlParser.h"
#include "Domains.h"
#include "HashTable.h"
#include "AdultCheck.h"
#include "ip.h"      // atoip ( s,len)
#include "Punycode.h"
#include "Unicode.h"
#include "SafeBuf.h"
#include "Sanity.h"
#include "GbMutex.h"
#include "ScopedLock.h"
#include "GbUtil.h"
#include <vector>
#include <algorithm>

#ifdef _VALGRIND_
#include <valgrind/memcheck.h>
#endif

/// Default constructor: leaves the object in the empty state established by reset().
Url::Url() {
	this->reset();
}


/// Return this object to its empty state: the url buffer becomes the empty
/// string, every component pointer is cleared and every length, the port
/// fields and the ip are zeroed.
void Url::reset() {
	// the url text itself
	m_url[0]    = '\0';
	m_ulen      = 0;

	// pointers to the individual components
	m_scheme    = NULL;
	m_host      = NULL;
	m_domain    = NULL;
	m_tld       = NULL;
	m_path      = NULL;
	m_filename  = NULL;
	m_extension = NULL;
	m_query     = NULL;

	// component lengths
	m_slen      = 0;
	m_hlen      = 0;
	m_dlen      = 0;
	m_mdlen     = 0;
	m_tldLen    = 0;
	m_plen      = 0;
	m_flen      = 0;
	m_elen      = 0;
	m_qlen      = 0;

	// port related stuff
	m_port      = 0;
	m_defPort   = 0;
	m_portLen   = 0;

	// ip related stuff
	m_ip        = 0;
}

void Url::set( const Url *baseUrl, const char *s, int32_t len, bool addWWW, bool stripParams,
               bool stripCommonFile, int32_t titledbVersion ) {
	reset();

	if ( ! baseUrl ) {
		set( s, len, addWWW, false, false, titledbVersion );
		return;
	}

	char *base = (char *) baseUrl->m_url;
	int32_t  blen = baseUrl->m_ulen;

	// don't include cgi crap
	if ( baseUrl->m_query ) {
		blen -= ( baseUrl->m_qlen + 1 );
	}

	// . adjust length of the base url.
	// . if base url does not end in / then it must have a m_filename at
	//   the end, therefore we should strip the m_filename
	if ( blen > 0 && base[blen - 1] != '/' ) {
		while ( blen > 0 && base[blen - 1] != '/' ) {
			blen--;
		}
	}

	if ( blen == 0 && len == 0 ) {
		return;
	}

	// if empty string / an url fragment, use baseUrl
	if (len == 0 || s[0] == '#') {
		set(baseUrl);
		return;
	}

	// . fix baseurl = "http://xyz.com/poo/all" and s = "?page=3"
	// . if "s" starts with ? then keep the filename in the base url
	if (s[0] == '?') {
		for ( ; base[blen] && base[blen] != '?'; blen++ ) {
			;
		}
	}

	// skip s over spaces
	const char *send = s + len;
	while ( s < send && is_wspace_a( *s ) ) {
		s++;
		len--;
	}

	// . is s a relative url? search for ://, but break at first /
	// . but break at any non-alnum or non-hyphen
	bool isAbsolute = false;
	int32_t i;
	for ( i = 0; i < len && ( is_alnum_a( s[i] ) || s[i] == '-' ); i++ ) {
		;
	}

	if ( !isAbsolute ) {
		isAbsolute = (i + 2 < len && s[i + 0] == ':' && s[i + 1] == '/'); // some are missing both /'s!
	}

	if ( !isAbsolute ) {
		isAbsolute = (i + 2 < len && s[i + 0] == ':' && s[i + 1] == '\\');
	}

	// or if s starts with // then it's also considered absolute!
	if ( !isAbsolute && len > 1 && s[0] == '/' && s[1] == '/' ) {
		isAbsolute = true;
	}

	// watch out for idiots
	if ( !isAbsolute && len > 1 && s[0] == '\\' && s[1] == '\\' ) {
		isAbsolute = true;
	}

	// don't use base if s is not relative
	if ( blen==0 || isAbsolute ) {
		set( s, len, addWWW, stripParams, false, titledbVersion );
		return;
	}

	// . if s starts with / then hack of base's m_path
	// . careful not to hack of the port, if any
	// . blen = baseUrl->m_slen + 3 + baseUrl->m_hlen;
	if ( len > 0 && s[0]=='/' )
		blen = baseUrl->m_path - baseUrl->m_url ;

	char temp[MAX_URL_LEN * 2 + 1];
	strncpy( temp, base, blen );

	if ( len > MAX_URL_LEN ) {
		len = MAX_URL_LEN - 2;
	}

	// if s does NOT start with a '/' then add one here in case baseUrl
	// does NOT end in one.
	// fix baseurl = "http://xyz.com/poo/all" and s = "?page=3"
	if ( len > 0 && s[0] != '/' && s[0] != '?' && temp[blen - 1] != '/' ) {
		temp[blen++] = '/';
	}
	strncpy( temp + blen, s, len );
	temp[blen+len] = '\0';
	set( temp, blen + len, addWWW, stripParams, stripCommonFile, titledbVersion );
}


/// Decide whether the NUL terminated string @p hh looks like a session id.
///
/// It does when ALL of the following hold:
///  - every character up to the terminating NUL is a decimal digit or a
///    lower case hex letter (a-f); anything else (upper case, '&', ...) disqualifies it
///  - it is at least 12 characters long
///  - it contains at least one hex letter, because lots of product ids and
///    dates consist of decimal digits only
///
/// The whole string is examined (not just the first 12 characters) since the
/// hex letters may only show up further in.
static bool isSessionId ( const char *hh ) {
	int32_t totalChars = 0;
	int32_t hexLetters = 0;

	for ( const char *c = hh ; *c != '\0' ; ++c ) {
		const bool isDecimal   = ( *c >= '0' && *c <= '9' );
		const bool isHexLetter = ( *c >= 'a' && *c <= 'f' );

		// illegal session id character
		if ( !isDecimal && !isHexLetter ) {
			return false;
		}

		if ( isHexLetter ) {
			++hexLetters;
		}
		++totalChars;
	}

	return ( totalChars >= 12 && hexLetters > 0 );
}

/// Older, substring-based session id / tracking parameter stripper.
///
/// Looks at everything from the first '?' or ';' in @p s onwards, picks the
/// FIRST pattern from the priority list below that occurs there and cuts that
/// one parameter (name and value) out of the string in place. At most one
/// parameter is removed per call.
///
/// NOTE(review): the "v122" suffix suggests this is kept for urls normalized
/// under an older titledb version - confirm against the caller before changing
/// which patterns match or their order.
///
/// @param s    NUL terminated url, modified in place
/// @param len  in: length of @p s (the memmove below relies on it being the
///             string length); out: length after stripping
static void stripParametersv122( char *s, int32_t *len ) {
	// . remove session ids from s
	// . ';' most likely precedes a session id
	// . http://www.b.com/p.jhtml;jsessionid=J4QMFWBG1SPRVWCKUUXCJ0W?pp=1
	// . http://www.b.com/generic.html;$sessionid$QVBMODQAAAGNA?pid=7
	// . http://www.b.com/?PHPSESSID=737aec14eb7b360983d4fe39395&p=1
	// . http://www.b.com/cat.cgi/process?mv_session_id=xrf2EY3q&p=1
	// . http://www.b.com/default?SID=f320a739cdecb4c3edef67e&p=1

	// find the start of the query string / path parameters
	char *p = s;
	while ( *p && *p != '?' && *p != ';' ) p++;

	// bail if there is neither a '?' nor a ';'
	if ( ! *p ) {
		return;
	}

	// . now search for several strings in the cgi query string
	// . tt = start of the matched parameter name, x = length of the matched pattern
	// . x is overwritten by every probe, so it is only meaningful for the
	//   probe that finally set tt
	char *tt = NULL;
	int32_t x = 0;

	if ( ! tt ) { tt = gb_strcasestr ( p, "PHPSESSID=" ); x = 10;}
	if ( ! tt ) { tt = strstr        ( p , "SID="       ); x =  4;}
	// . osCsid and XTCsid are new session ids
	// . keep this up here so "sid=" doesn't override it
	if ( ! tt  ) {
		tt = strstr ( p , "osCsid=" );
		x =  7;
		if ( ! tt ) tt = strstr ( p , "XTCsid=" );
		// the rest of the string must be a hex session id (see isSessionId)
		if ( tt && ! isSessionId ( tt + x ) )
			tt = NULL;
	}
	if ( ! tt ) {
		tt = strstr ( p , "osCsid/" );
		x =  7;
		// the rest of the string must be a hex session id (see isSessionId)
		if ( tt && ! isSessionId ( tt + x ) )
			tt = NULL;
	}
	// this is a new session id thing
	if ( ! tt ) {
		tt = strstr ( p , "sid=" ); x =  4;
		// the rest of the string must be a hex session id (see isSessionId)
		if ( tt && ! isSessionId ( tt + x ) )
			tt = NULL;
	}
	// . osCsid and XTCsid are new session ids
	// . this repeats the probe made further up on the same string, so it can
	//   not find anything new; left in place to keep this routine as it was
	if ( ! tt ) {
		tt = strstr ( p , "osCsid=" );
		x =  7;
		if ( ! tt ) tt = strstr ( p , "XTCsid=" );
		// the rest of the string must be a hex session id (see isSessionId)
		if ( tt && ! isSessionId ( tt + x ) )
			tt = NULL;
	}

	// . the longer "...sessionid=" style names are probed before the plain ones
	// . fixes the bug of matching plain &sessionid= first and then
	//   realizing the char before it is an alnum
	if ( ! tt ) { tt = gb_strcasestr ( p, "jsessionid="); x = 11; }
	if ( ! tt ) { tt = gb_strcasestr ( p, "vbsessid="  ); x =  9;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "asesessid=" ); x = 10; }
	if ( ! tt ) { tt = gb_strcasestr ( p, "nlsessid="  ); x =  9; }
	if ( ! tt ) { tt = gb_strcasestr ( p, "psession="  ); x =  9; }

	if ( ! tt ) { tt = gb_strcasestr ( p, "session_id="); x = 11;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "sessionid=" ); x = 10;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "sessid="    ); x =  7;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "session="   ); x =  8;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "session/"   ); x =  8; }
	if ( ! tt ) { tt = gb_strcasestr ( p, "POSTNUKESID=");x = 12;}
	// some new session ids as of Feb 2005
	if ( ! tt ) { tt = gb_strcasestr ( p, "auth_sess=" ); x = 10; }
	if ( ! tt ) { tt = gb_strcasestr ( p, "mysid="     ); x =  6; }
	if ( ! tt ) { tt = gb_strcasestr ( p, "oscsid="    ); x =  7; }
	if ( ! tt ) { tt = gb_strcasestr ( p, "cg_sess="   ); x =  8; }
	if ( ! tt ) { tt = gb_strcasestr ( p, "galileoSession");x=14; }
	// new as of Jan 2006. is hurting news5 collection on gb6
	if ( ! tt ) { tt = gb_strcasestr ( p, "sess="      ); x =  5; }

	// .php?s=8af9d6d0d59e8a3108f3bf3f64166f5a&
	// .php?s=eae5808588c0708d428784a483083734&
	// .php?s=6256dbb2912e517e5952caccdbc534f3&
	// . the ".php" sits right in front of the '?' that p points to, so the
	//   scan has to start 4 chars before p
	// . never start before the beginning of s though: a match needs its '?'
	//   at or after p, so it can not begin earlier than p-4 anyway
	char *phpScanStart = ( p - s >= 4 ) ? p - 4 : s;
	if ( ! tt && (tt = strstr ( phpScanStart , ".php?s=" )) ) {
		// point to the value of the s=
		char *pp = tt + 7;
		int32_t i = 0;
		// ensure we got 32 hexadecimal chars
		while ( pp[i] &&
			( is_digit(pp[i]) ||
			  ( pp[i]>='a' && pp[i]<='f' ) ) ) i++;
		// if not, do not consider it a session id
		if ( i < 32 ) tt = NULL;
		// point to s= for removal
		else { tt += 5; x = 2; }
	}

	// BR 20160117
	// http://br4622.customervoice360.com/about_us.php?SES=652ee78702fe135cd96ae925aa9ec556&frmnd=registration
	if ( ! tt ) { tt = strstr        ( p , "SES="       ); x =  4;}

	// BR 20160117: Skip most common tracking parameters
	// Oracle Eloqua
	// http://app.reg.techweb.com/e/er?s=2150&lid=25554&elq=00000000000000000000000000000000&elqaid=2294&elqat=2&elqTrackId=3de2badc5d7c4a748bc30253468225fd
	if ( ! tt ) { tt = gb_strcasestr ( p, "elq="); x = 4;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "elqat="); x = 6;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "elqaid="); x = 7;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "elq_mid="); x = 8;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "elqTrackId="); x = 11;}

	// Google Analytics
	// http://kikolani.com/blog-post-promotion-ultimate-guide?utm_source=kikolani&utm_medium=320banner&utm_campaign=bpp
	if ( ! tt ) { tt = gb_strcasestr ( p, "utm_term="); x = 9;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "utm_hp_ref="); x = 11;}	// Lots on huffingtonpost.com
	if ( ! tt ) { tt = gb_strcasestr ( p, "utm_source="); x = 11;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "utm_medium="); x = 11;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "utm_content="); x = 12;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "utm_campaign="); x = 13;}

	// Piwik
	if ( ! tt ) { tt = gb_strcasestr ( p, "pk_kwd="); x = 7;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "pk_source="); x = 10;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "pk_medium="); x = 10;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "pk_campaign="); x = 12;}

	// Misc
	if ( ! tt ) { tt = gb_strcasestr ( p, "trk="); x = 4;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "promoid="); x = 8;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "promCode="); x = 9;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "promoCode="); x = 10;}
	if ( ! tt ) { tt = gb_strcasestr ( p, "partnerref="); x = 11;}

	// bail if none were found
	if ( ! tt ) {
		return;
	}

	// . must not have an alnum char before it!
	// . prevent "DAVESID=" from being labeled as session id
	// . tt-1 is inside s: no pattern starts with '?' or ';', so a match
	//   always begins after p
	if ( is_alnum_a ( *(tt-1) ) ) {
		return;
	}

	// offset of the start of the parameter to remove
	int32_t a = tt - s;

	// offset just past the matched pattern; becomes the end of the parameter
	int32_t b = a + x;

	// back up until we hit a ? or & or / or ;
	while ( a > 0 && s[a-1] != '?' && s[a-1] != '&' &&
		s[a-1] != '/' && s[a-1] != ';' ) a--;

	// keep the '?'
	if ( s[a]=='?' ) a++;

	// back up over any semicolon
	if ( s[a-1] == ';' ) a--;

	// advance b until we hit & or end or ? or a ';'
	while ( s[b] && s[b] != '&' && s[b] != '?' && s[b] != ';') b++;

	// if we don't have 5+ chars in session id itself, skip it
	if ( b - (a + x) < 5 ) {
		return;
	}

	// go over a & or a ;
	if ( s[b] == '&' || s[b] == ';' ) b++;

	// remove the session id by covering it up with the rest of the url
	memmove ( &s[a] , &s[b] , *len - b );

	// reduce length
	*len -= (b-a);

	// if s ends in ? or & or ;, backup
	while ( *len > 0 && (s[*len-1]=='?'||s[*len-1]=='&'||s[*len-1]==';'))
		(*len)--;

	// NULL terminate
	s[*len] = '\0';
}

/// Strip session ids, tracking parameters and affiliate parameters from the
/// url held by @p urlParser.
///
/// Every rule is a Matcher (which component name to look for) plus a
/// Validator (what the value has to look like before it is removed).
///
/// NOTE(review): judging by the calls below, the Validator arguments are
/// (min length, max length, allow-empty flag, allowed characters, mandatory
/// characters), with a max length of 0 apparently meaning "no upper limit" -
/// confirm against UrlComponent::Validator.
///
/// @param urlParser parser holding the url to clean; modified in place
static void stripParameters( UrlParser *urlParser ) {
	/// @todo ALC reorder parameter?
	/// if we have ?abc=123&def=456
	/// wouldn't it be the same as ?def=456&abc=123

	/// @todo ALC login pages?
	/// should we even spider them?

	// validator used for parameters that are removed whatever their value looks like
	static const UrlComponent::Validator s_defaultParamValidator( 0, 0, true, ALLOW_ALL, MANDATORY_NONE );

	// 3 different components that we can remove from
	// - path (we have much more restrictive criteria on path to avoid removing valid path)
	//   eg: http://www.example.com/search/keywords/chardonnay/osCAdminID/45de8edd68f8bc05e9fde0d2c528a619/sort/3d/
	//
	// - path param
	//   eg: http://www.example.com/search/keywords,chardonnay/osCAdminID,45de8edd68f8bc05e9fde0d2c528a619/sort,3d/
	//   eg: http://www.example.com/search/;keywords=chardonnay;osCAdminID=45de8edd68f8bc05e9fde0d2c528a619;sort=3d/
	//
	// - query string
	//   eg: http://www.example.com/search/?keywords=chardonnay&osCAdminID=45de8edd68f8bc05e9fde0d2c528a619&sort=3d

	// osCommerce (osCsid)
	// eg:
	//   be1566df2284664244ce73ea6bed81fa09d4
	//   b8d15fefe8648f7f77c6e47f7bc0b881
	//   ddtvpkt3rpqdprsagsi52tj5o4
	{
		auto pathMatches = urlParser->matchPath(UrlComponent::Matcher("osCsid"));
		if (!pathMatches.empty()) {
			urlParser->removePath(pathMatches, UrlComponent::Validator(32, 32, true, ALLOW_HEX));
			urlParser->removePath(pathMatches, UrlComponent::Validator(26, 26, true, (ALLOW_DIGIT | ALLOW_ALPHA)));
		}

		urlParser->removeQueryParam(UrlComponent::Matcher("osCsid"), s_defaultParamValidator);
	}

	// osCommerce (osCAdminID)
	// eg:
	//   20d2f836fd203140dc6391b7ba3cdd82
	//   c40fe2ad32efad2e9cc2748a3f1f90cc
	{
		auto pathMatches = urlParser->matchPath(UrlComponent::Matcher("osCAdminID"));
		if (!pathMatches.empty()) {
			urlParser->removePath(pathMatches, UrlComponent::Validator(32, 32, true, ALLOW_HEX));
			urlParser->removePath(pathMatches, UrlComponent::Validator(26, 26, true, (ALLOW_DIGIT | ALLOW_ALPHA)));
		}

		urlParser->removeQueryParam(UrlComponent::Matcher("osCAdminID"), s_defaultParamValidator);
	}

	// XT-commerce
	// eg:
	//   ha6n43ndtnlm53tpqgnclbv7ukkroue9k7m1e2o7t7rr5nb366a1
	//   7ib1soln64vslra70ep2qcvde4s8dsm1
	//   big3ika24atc4j19mlaha6d906
	urlParser->removePath(UrlComponent::Matcher("XTCsid", MATCH_CASE), UrlComponent::Validator(26, 52, true, (ALLOW_DIGIT | ALLOW_ALPHA)));
	urlParser->removeQueryParam(UrlComponent::Matcher("XTCsid", MATCH_CASE), s_defaultParamValidator);

	// ColdFusion
	// http://help.adobe.com/en_US/ColdFusion/9.0/Developing/WSc3ff6d0ea77859461172e0811cbec0c35c-7fef.html#WSc3ff6d0ea77859461172e0811cbec22c24-7cbf

	// ColdFusion (CFTOKEN)
	// eg:
	//   e718cd6cc29050df-8051DC1E-C29B-554E-6DFF6B5D2704A9A5
	//   92566684.html
	//   94175176
	//   322257
	{
		auto pathMatches = urlParser->matchPath(UrlComponent::Matcher("CFTOKEN"));
		if (!pathMatches.empty()) {
			urlParser->removePath(pathMatches, UrlComponent::Validator(52, 52, true, ALLOW_ALL));
			urlParser->removePath(pathMatches, UrlComponent::Validator(10, 14, true, ALLOW_ALL, MANDATORY_PUNCTUATION));
			urlParser->removePath(pathMatches, UrlComponent::Validator(6, 0, true, ALLOW_DIGIT));
		}

		urlParser->removePathParam(UrlComponent::Matcher("CFTOKEN"), s_defaultParamValidator);
		urlParser->removeQueryParam(UrlComponent::Matcher("CFTOKEN"), s_defaultParamValidator);
	}

	// ColdFusion (CFID)
	urlParser->removePath(UrlComponent::Matcher("CFID"), UrlComponent::Validator(0, 0, true, ALLOW_DIGIT));
	urlParser->removePathParam(UrlComponent::Matcher("CFID"), s_defaultParamValidator);
	urlParser->removeQueryParam(UrlComponent::Matcher("CFID"), s_defaultParamValidator);

	urlParser->removeQueryParam(UrlComponent::Matcher("cftokenPass"), s_defaultParamValidator);

	/// SAP load balancer
	// https://help.sap.com/saphelp_nw70/helpdata/de/f2/d7914b8deb48f090c0343ef1d907f0/content.htm
	urlParser->removePathParam(UrlComponent::Matcher("saplb_*"), s_defaultParamValidator);

	// Atlassian
	//   https://developer.atlassian.com/confdev/confluence-plugin-guide/writing-confluence-plugins/form-token-handling
	//   3 different formats
	//   eg:
	//     AFP6-ISR2-ZLJY-KBY3|926a76e0017be6a18e889d2ddffb0aaab21865c1|lout
	//     56c1bb338d5ad3ac262dd4e97bda482efc151f30
	//     15BWJdAr0U
	{
		auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("atl_token"));
		if (!queryMatches.empty()) {
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(65, 0, true, ALLOW_ALL));
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(40, 40, true, ALLOW_HEX));
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(10, 10, true, (ALLOW_ALPHA | ALLOW_DIGIT)));
		}
	}

	// psession
	// eg:
	//   491022863920110420135759
	//   7d01p6qvcl2e72j8ivmppk12k0
	//   XUjuplcPFGlJD2ZF5O26ApqAj5ZNEZwZrUKX5kkA
	// NOTE(review): every other Validator call in this function passes a bool as the
	// third argument; here a character mask sits in that position. If the third
	// parameter really is a bool, the mask is converted to it and the intended
	// character restriction is not applied - confirm against UrlComponent::Validator.
	urlParser->removeQueryParam(UrlComponent::Matcher("psession"), UrlComponent::Validator(24, 0, (ALLOW_ALPHA | ALLOW_DIGIT)));

	// Galileo
	// eg:
	//   65971783A4.z17ZHFAI
	//   63105032A6BFxgQFfV8
	// NOTE(review): third argument is a character mask where the other calls pass a
	// bool - confirm against UrlComponent::Validator.
	urlParser->removeQueryParam(UrlComponent::Matcher("GalileoSession"), UrlComponent::Validator(19, 19, ALLOW_ALL));

	// postnuke
	//   normally it would be hex string length of 32. but shorter length exist (looks to be chopped off somehow)
	//   eg:
	//     549178d5035b622229a39cd5baf75d2a
	//     4ed3b0a832d4687020b05ce70
	// NOTE(review): third argument is a character mask where the other calls pass a
	// bool - confirm against UrlComponent::Validator.
	urlParser->removeQueryParam(UrlComponent::Matcher("POSTNUKESID"), UrlComponent::Validator(16, 32, (ALLOW_HEX)));

	// jsessionid
	// eg:
	//   C14778D1240A6CFEE5417030DDB37D41
	urlParser->removePath(UrlComponent::Matcher("jsessionid"), UrlComponent::Validator(32, 32, false, ALLOW_HEX));
	urlParser->removePathParam(UrlComponent::Matcher("jsessionid", MATCH_PARTIAL), UrlComponent::Validator(20, 0, true));
	urlParser->removeQueryParam(UrlComponent::Matcher("jsessionid", MATCH_PARTIAL), UrlComponent::Validator(20, 0, true));

	// phpsessid
	// eg:
	//   7711
	//   4g8v6ndp6gnnc4tagn8coam0n7
	//   414c6917961d5b4998973d1613b7926f
	//   qfou95mlih5jjans36kevj2pti7p847v6bl79f03nrvtaadif6u0
	urlParser->removePath(UrlComponent::Matcher("PHPSESSID"), UrlComponent::Validator(26, 32, false, (ALLOW_ALPHA | ALLOW_DIGIT)));
	urlParser->removeQueryParam(UrlComponent::Matcher("PHPSESSID", MATCH_PARTIAL), s_defaultParamValidator);

	// auth_sess
	//   mostly job sites (same group?)
	//   eg:
	//     7ofc7ep3i8g6i2foinq6uks7e0
	//     6ce228460946fc4b3ed154abea1530b8
	urlParser->removeQueryParam(UrlComponent::Matcher("auth_sess"), UrlComponent::Validator(26, 32, true, (ALLOW_DIGIT | ALLOW_ALPHA)));

	// ps_sess_id
	// eg:
	//   0056c53b03ee56c8b791a5cf061a910d
	urlParser->removeQueryParam(UrlComponent::Matcher("ps_sess_id"), UrlComponent::Validator(32, 32, true, ALLOW_HEX));

	// mysid
	// eg:
	//   c357e16d973188ad99cc3e32a059e805
	//   11GeUYNB4fCVXeySSumKM3
	//   hNrnd87gxn9LU0X-N-4TS2
	//   glwcjvci
	{
		auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("mysid"));
		if (!queryMatches.empty()) {
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(32, 32, false, ALLOW_HEX));
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(22, 22, false, ALLOW_ALL, MANDATORY_ALPHA));
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(8, 8, false, ALLOW_ALPHA));
		}
	}

	// sid
	// eg:
	//   3565de85-0bf0-47d3-8fb3-80120d6b60a6
	//   8E67BB91-5056-9000-2C8C1473A967F273
	//   0b721aa1c34b75fcf41e17304537d965
	//   3KnGJS3ga7ae891-33115175851.04
	//   v0uqho4nv0mnghv4ap3ieeqp94
	//   K6FYyt
	{
		auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("sid"));
		if (!queryMatches.empty()) {
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(30, 0, false, ALLOW_ALL));
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(26, 26, false, (ALLOW_ALPHA | ALLOW_DIGIT)));
			// 6 char values are only removed when they mix at least two of: lower case, upper case, digit
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(6, 6, false, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA_LOWER | MANDATORY_ALPHA_UPPER)));
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(6, 6, false, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA_LOWER | MANDATORY_DIGIT)));
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(6, 6, false, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA_UPPER | MANDATORY_DIGIT)));
		}
	}

	// SES
	// eg:
	//   74339eda735516fd51ed1c5eb6bc76ceav
	//   39a11261f58150fd4327a80da6daafa0
	//   99cj5cbf6g8irau20h1hkvr8o6
	{
		auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("ses"));
		if (!queryMatches.empty()) {
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(34, 34, false, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA | MANDATORY_DIGIT)));
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(32, 32, false, ALLOW_HEX));
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(26, 26, false, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA | MANDATORY_DIGIT)));
		}
	}

	// s
	// eg:
	//   4d9ae8a969305848227e5d6d7d0fb9672bd38d96
	//   81cfba6ed9b66a8ad0df43c2f3d259bd
	{
		auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("s"));
		if (!queryMatches.empty()) {
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(40, 40, false, ALLOW_HEX, MANDATORY_ALPHA_HEX));
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(32, 32, false, ALLOW_HEX, MANDATORY_ALPHA_HEX));
		}
	}

	// session_id
	// eg:
	//   NiHhUceSP6At57u0
	//   ospnr7npc97urgoi1p9i9kd1e4
	urlParser->removeQueryParam(UrlComponent::Matcher("session_id"), UrlComponent::Validator(16, 0, false, ALLOW_ALL, MANDATORY_ALPHA));

	// sessionid
	// eg:
	//   094104BqFHWLmUCiZAMvgboVyVFiIKDqRPJCxIUMZIPNkMVJVK
	//   1a0d43d9a6753940649bbaeb56f01176
	//   ej3fa4fe7eikfb8ej1fd6
	//   ObUlshp63oxfnZzvCzwe
	//   mN3XmQ{hXgsK8jY7VUm8
	urlParser->removeQueryParam(UrlComponent::Matcher("sessionid"), UrlComponent::Validator(20, 0, false, ALLOW_ALL, MANDATORY_ALPHA));

	// other session id variations

	// sessid (vbSESSID, asesessid, nlsessid, GLBSESSID, sessid, etc ...)
	// eg:
	//   91hpb1p3b69bu0vqruar2fpltf3b509bsdeqh1qtj1p8ugb8rpc0
	//   a12cb492ec7bcc9677916f02913587064d4279ed
	//   50d96959db895a0adbfebd325a4a65e0
	//   f4db3ec33001c9759d095c6432651e39
	//   82d0pbm7f6aa55no7p0rqb37r6
	{
		auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("sessid", MATCH_PARTIAL));
		if (!queryMatches.empty()) {
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(52, 52, false, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA | MANDATORY_DIGIT)));
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(40, 40, false, ALLOW_HEX, MANDATORY_ALPHA_HEX));
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(32, 32, false, ALLOW_HEX, MANDATORY_ALPHA_HEX));
			urlParser->removeQueryParam(queryMatches, UrlComponent::Validator(26, 26, false, (ALLOW_ALPHA | ALLOW_DIGIT), (MANDATORY_ALPHA | MANDATORY_DIGIT)));
		}
	}

	// session
	// eg:
	//   eRbInbLDoNaEr4gkIju0
	//   vfdplav2ske1blvadpv9du54k3
	//   ARC-1454710019-541634862-12401
	//   ARC-1454807400-18472177182-25788
	//   A5C45BC6DC3B436899C43B9D904FC8DE
	//   7b478486-e52c-46aa-aca8-8cd446fcb79e
	//   39663_1455055828_84298238456ba63d42992a
	//   14185_1455099610_106560567456bb0eda9d317
	//   NlG8XCo5MgpctBTMRut4Gq6J5Z5de9foAe4rh3ikLQYWQmFzLR4zSHuieO8
	//   DMQBEXa5Z-aJ7r67ylAJ_y9H8_S2HTUaIjoafUtOjYuGcxwRefR0Q3xXzyS
	//   bGJL_GuP2eDGwJJzoXM9T3_LRgjAsalqaREGEBDoEERJOIMIL8Wh7Q3K3FcgHtYc9hM6CuJmVKlmmCxjmSYEhwVlOdUEX5RnUXycKSHKO5iAz2_ulWoJOZ1d7QCD2Afn9WPkXkvaJaSgjo7hcfYbBnUOXhedzMolha6kfV7hvf4mRAF700MhB350--QV0wQAur9Rz47QiX8SiRXp_vQDdwInUSfO3PqOwXfBu72w4e-JySzUf7Aj9Ks9ouOUPAn1W_GtORLLT4Gho7-Tb_IwyGVYPKF97f3VMXsTfoFqUvs
	//
	urlParser->removeQueryParam(urlParser->matchQueryParam(UrlComponent::Matcher("session")),
	                            UrlComponent::Validator(20, 0, false, ALLOW_ALL, (MANDATORY_ALPHA | MANDATORY_DIGIT)));

	// sess
	//   eg: 4be234480736093ba237bc397fb6e32d
	urlParser->removeQueryParam(UrlComponent::Matcher("sess"),
	                            UrlComponent::Validator(20, 0, false, (ALLOW_ALPHA | ALLOW_DIGIT)));

	// ts
	// eg:
	//   1422344216175
	//   1425080080316
	urlParser->removeQueryParam(UrlComponent::Matcher("ts"), UrlComponent::Validator(13, 13, false, ALLOW_DIGIT));

	// apache dir sort
	//   C={N,M,S,D} O={A,D}
	// eg:
	//   ?C=N;O=A
	// only considered when the query string has at most two parameters
	if (urlParser->getQueryParamCount() <= 2) {
		auto cQueryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("C", MATCH_CASE));
		auto oQueryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("O", MATCH_CASE));

		// ignore a parameter that occurs more than once
		UrlComponent *cUrlComponent = (cQueryMatches.size() == 1) ? cQueryMatches[0] : NULL;
		UrlComponent *oUrlComponent = (oQueryMatches.size() == 1) ? oQueryMatches[0] : NULL;

		if (cUrlComponent) {
			if (cUrlComponent->getValueLen() == 0) {
				urlParser->deleteComponent(cUrlComponent);
			} else if (cUrlComponent->getValueLen() == 1) {
				char c = *(cUrlComponent->getValue());
				if (c == 'N' || c == 'M' || c == 'S' || c == 'D') {
					urlParser->deleteComponent(cUrlComponent);
				}
			}
		}

		if (oUrlComponent) {
			if (oUrlComponent->getValueLen() == 0) {
				urlParser->deleteComponent(oUrlComponent);
			} else if (oUrlComponent->getValueLen() == 1) {
				char o = *(oUrlComponent->getValue());
				if (o == 'A' || o == 'D') {
					urlParser->deleteComponent(oUrlComponent);
				}
			}
		}
	}

	/// @todo ALC token?

	// Skip most common tracking parameters

	// Oracle Eloqua
	// http://docs.oracle.com/cloud/latest/marketingcs_gs/OMCAA/index.html#Help/General/EloquaTrackingParameters.htm
	urlParser->removeQueryParam("elqTrackId");
	urlParser->removeQueryParam("elq");
	urlParser->removeQueryParam("elqCampaignId");
	urlParser->removeQueryParam("elqaid");
	urlParser->removeQueryParam("elqat");
	urlParser->removeQueryParam("elq_mid");
	urlParser->removeQueryParam("elq_cid");
	urlParser->removeQueryParam("elq2"); // others

	// Google Analytics
	// https://support.google.com/analytics/answer/1033867
	urlParser->removeQueryParam("utm_source");
	urlParser->removeQueryParam("utm_medium");
	urlParser->removeQueryParam("utm_term");
	urlParser->removeQueryParam("utm_content");
	urlParser->removeQueryParam("utm_campaign");
	urlParser->removeQueryParam("utm_hp_ref"); // Lots on huffingtonpost.com
	urlParser->removeQueryParam("utm_rid"); // others

	// https://support.google.com/analytics/answer/1033981?hl=en
	// https://support.google.com/ds/answer/6292795?hl=en
	urlParser->removeQueryParam("gclid");
	urlParser->removeQueryParam("gclsrc");

	// Piwik
	// http://piwik.org/docs/tracking-campaigns/
	// https://plugins.piwik.org/AdvancedCampaignReporting
	urlParser->removeQueryParam("pk_campaign");
	urlParser->removeQueryParam("pk_kwd");
	urlParser->removeQueryParam("pk_source");
	urlParser->removeQueryParam("pk_medium");
	urlParser->removeQueryParam("pk_keyword");
	urlParser->removeQueryParam("pk_content");
	urlParser->removeQueryParam("pk_cid");

	// Open Web Analytics
	// https://github.com/padams/Open-Web-Analytics/wiki/Campaign-Tracking
	urlParser->removeQueryParam("owa_medium");
	urlParser->removeQueryParam("owa_source");
	urlParser->removeQueryParam("owa_campaign");
	urlParser->removeQueryParam("owa_ad");
	urlParser->removeQueryParam("owa_ad_type");

	// Webtrends
	// http://help.webtrends.com/en/queryparameters/index.html
	urlParser->removeQueryParam("wt.mc_id");

	// Mailchimp
	// https://apidocs.mailchimp.com/api/how-to/ecommerce.php
	urlParser->removeQueryParam("mc_cid");
	urlParser->removeQueryParam("mc_eid");

	// Marketo
	// http://developers.marketo.com/documentation/websites/lead-tracking-munchkin-js/
	urlParser->removeQueryParam("mkt_tok");

	// trk
	// eg:
	//   ppro_cprof
	//   prc-basic
	urlParser->removeQueryParam(UrlComponent::Matcher("trk"),
	                            UrlComponent::Validator(0, 0, false, ALLOW_ALL, (MANDATORY_ALPHA | MANDATORY_PUNCTUATION)));

	// who
	// eg:
	//   r,Usyg2mo/krON58h7Cqp0HHHvPhsMdK5lNmP76/O/gxQb/ObopGwS3yJwoT241Hf8EMrMDicKKYtMqLKqmtywdZFGbvS6J6jbKbUd5HzTkv_FxyTEsYw1rLJr9LHquA3O
	//   r,yrl2BJY6LMkbtXa9k/lflCwQqzDqf/AF7zFIQoBAhI_t6U_gJztkZ/8ABugLiijm2NRXjt_LYh56mwmTv5cCNuIkgnB2cLFEfL62Gaoyddeh89cXgi9UqjWLP/Y1lD/4watUuyy2WINYipnkSygRLQ--
	//   r,nEBHD2D/_wnDIxXmNMZRjB1wQZikW7uTA8ZXGmCH3a1IvIXSpSv0QicLoCGpTnsBe2QR7xzvq2i2JeKu2AbpgLJaexxw5VON6yG8DP2t5oFhOdoM/kuVnhIt4PEVt1UwqKBNApZk56tTem_r5wqaF4ko65Bo5i7J67PUNHOZs3U-
	//   r,0/asSWWd2MeHwFRbMqZP42yZoh0UlWB2zyP9nAoa3ejKyLPsBjxivhuAY2RH6r94BV2DcmQQYxk6MYZD4Uo6cb30qgNTwVY/_rl_BjRSWosgbpRtPuMytbSX0OmxKuNedtcT27C3fJG/oia/88wI_Ec5PIerpxyPLAgXEsi78vAyuZAXymqhujGGTf6ACryR
	//   r,rW75z4HBqJegN3eAao88RaQcHsIgPXhAP/K1KCbI3x6dMrYllBZLlVfpuL_C0IQed0WspcLWMeT79fzDoAnb0qioGuFSnCHaZXYoH5_GZsWESFdk4CznUlTZuyeTFKsu9xblmYa56ShIKUyILXaFAI8HbNh7dpaXr7q66jIOuo_0r2_GFlbGaSScvbnAWWjH/dMPW8UZsTetZ2a9tqYaHQ--
	{
		auto queryMatches = urlParser->matchQueryParam(UrlComponent::Matcher("who"));
		for (auto it = queryMatches.begin(); it != queryMatches.end(); ++it) {
			// . values of up to 130 chars are only removed when they start with "r,"
			// . only compare the 2 prefix bytes when the value actually has them,
			//   otherwise memcmp would read past the end of a shorter value
			if ((*it)->getValueLen() >= 2 && (*it)->getValueLen() <= 130 &&
			    memcmp((*it)->getValue(), "r,", 2) == 0) {
				urlParser->deleteComponent(*it);
			}
		}
		// longer values are removed whatever they contain
		urlParser->removeQueryParam(UrlComponent::Matcher("who"), UrlComponent::Validator(130, 0, false, ALLOW_ALL));
	}


	// Misc
	urlParser->removeQueryParam( "partnerref" );

	/// @todo ALC redirect ??
	/// redirect_to, redirect, redirect_url

	/// @todo ALC referer??
	/// /referer/, referer=


	/// @todo ALC cater for more affiliate links here

	// only check domain specific logic when we have a domain
	if (urlParser->getDomain()) {
		if (strncmp(urlParser->getDomain(), "amazon.", 7) == 0) {
			// amazon
			// https://www.reddit.com/r/GameDeals/wiki/affiliate

			// affiliate
			urlParser->removeQueryParam("tag");

			// wishlist
			urlParser->removeQueryParam("coliid");
			urlParser->removeQueryParam("colid");

			// reference
			urlParser->removeQueryParam("ref");
			urlParser->removePathParam(UrlComponent::Matcher("ref"),
			                           UrlComponent::Validator(0, 0, false, ALLOW_ALL, MANDATORY_PUNCTUATION));
		} else if (strncmp(urlParser->getDomain(), "ebay.", 5) == 0) {
			// ebay
			// http://www.ebaypartnernetworkblog.com/en/2009/05/new-link-generator-tool-additional-information/

			urlParser->removeQueryParam("icep_ff3");
			urlParser->removeQueryParam("pub");
			urlParser->removeQueryParam("toolid");
			urlParser->removeQueryParam("campid");
			urlParser->removeQueryParam("customid");
			urlParser->removeQueryParam("afepn");
			urlParser->removeQueryParam("pid");
		}
	}
}

// . url rfc = http://www.blooberry.com/indexdot/html/topics/urlencoding.htm
// . "...Only alphanumerics [0-9a-zA-Z], the special characters "$-_.+!*'()," 
//    [not including the quotes - ed], and reserved characters used for their 
//    reserved purposes may be used unencoded within a URL."
// . i know sun.com has urls like "http://sun.com/;$sessionid=123ABC$"
// . url should be ENCODED PROPERLY for this to work properly
//
// Parse the "tlen" bytes at "t" as a url, write a normalized copy into m_url
// and point the component members (m_scheme, m_host, m_domain, m_tld, m_path,
// m_query, m_filename, m_extension) plus their lengths into that copy.
//
// . t/tlen          : input url; need not be NUL terminated. A NULL or empty
//                     input leaves the object in its reset() state.
// . addWWW          : prepend "www." to the host when the host is exactly the
//                     domain, contains a '.', and is not an ip address
// . stripParams     : run stripParametersv122() (titledbVersion <= 122) or
//                     stripParameters() on a UrlParser (newer versions)
// . stripCommonFile : drop a trailing default/index/main filename
//                     (e.g. "/index.html") but keep the '/'
// . titledbVersion  : <= 122 selects the legacy in-place handling (anchor
//                     stripping, "." and ".." path resolution); anything newer
//                     goes through UrlParser for that work
//
// Input containing a non-ascii byte is re-encoded (host labels to punycode,
// the remainder url-encoded) and this function then calls itself on the
// result. If punycode encoding fails the object is left in its reset() state.
void Url::set( const char *t, int32_t tlen, bool addWWW, bool stripParams, bool stripCommonFile,
               int32_t titledbVersion ) {
#ifdef _VALGRIND_
	VALGRIND_CHECK_MEM_IS_DEFINED(t,tlen);
#endif
	reset();

	if ( ! t || tlen == 0 ) {
		return;
	}

	// we may add a "www." a trailing slash and \0, ...
	if ( tlen > MAX_URL_LEN - 10 ) {
		log( LOG_LIMIT, "db: Encountered url of length %" PRId32 ". Truncating to %i", tlen, MAX_URL_LEN - 10 );
		tlen = MAX_URL_LEN - 10;
	}

	// . skip over non-alnum chars (except - or /) in the beginning
	// . if url begins with // then it's just missing the http: (slashdot)
	// . watch out for hostname like: -dark-.deviantart.com(yes, it's real)
	// . so all protocols are hostnames MUST start with alnum OR hyphen
	while ( tlen > 0 && !is_alnum_a( *t ) && *t != '-' && *t != '/' ) {
		t++;
		tlen--;
	}

	// . stop t at first space or binary char
	// . url should be in encoded form!
	// . after this loop "i" is the index of the first unacceptable char
	//   (or tlen), and nonAsciiPos is set if a non-NUL, non-ascii byte
	//   is what stopped the scan
	int32_t i;
	int32_t nonAsciiPos = -1;
	for ( i = 0 ; i < tlen ; i++ )	{
		if ( is_wspace_a(t[i]) ) {
			break; // no spaces allowed
		}

		if ( ! is_ascii(t[i]) ) {
			// Sometimes the length with the null is passed in, 
			// so ignore nulls FIXME?
			if ( t[i] ) {
				nonAsciiPos = i;
			}

			break; // no non-ascii chars allowed
		}
	}

	if ( nonAsciiPos != -1 ) {
		// Try turning utf8 and latin1 encodings into punycode.
		// All labels(between dots) in the domain are encoded 
		// separately.  We don't support encoded tlds, but they are 
		// not widespread yet.
		// If it is a non ascii domain it needs to take the form 
		// xn--<punycoded label>.xn--<punycoded label>.../

		log(LOG_DEBUG, "build: attempting to decode unicode url %*.*s pos at %" PRId32, (int)tlen, (int)tlen, t, nonAsciiPos);

		// NOTE(review): the gbmemcpy() writes into encoded[] in the
		// label loop below carry no explicit bounds check; presumably
		// the MAX_URL_LEN - 10 cap on tlen is what is relied on.
		// Confirm the added "xn--" prefixes cannot push past the end.
		char encoded [ MAX_URL_LEN ];
		size_t encodedLen = MAX_URL_LEN;
		char *encodedDomStart = encoded;
		const char *p = t;
		const char *pend = t+tlen;

		// Find the start of the domain
		if ( tlen > 7 && strncmp( p, "http://", 7 ) == 0 ) {
			p += 7;
		} else if ( tlen > 8 && strncmp( p, "https://", 8 ) == 0 ) {
			p += 8;
		}

		// copy the scheme prefix (if any) through unchanged
		gbmemcpy(encodedDomStart, t, p-t);
		encodedDomStart += p-t;

		// one iteration per hostname label; stops at the first '/'
		while ( p < pend && *p != '/' ) {
			const char *labelStart = p;
			uint32_t tmpBuf[MAX_URL_LEN];
			int32_t tmpLen = 0;

			// find the end of this label
			while ( p < pend && *p != '.' && *p != '/' )
				p++;
			int32_t labelLen = p - labelStart;

			bool tryLatin1 = false;
			// For utf8 urls
			p = labelStart;
			bool labelIsAscii = true;

			// Convert the domain to code points and copy it to tmpbuf to be punycoded
			// (p advances by the utf8 size of the code point just stored)
			for ( ; p - labelStart < labelLen; p += utf8Size( tmpBuf[tmpLen] ), tmpLen++ ) {
				labelIsAscii = labelIsAscii && is_ascii( *p );
				tmpBuf[tmpLen] = utf8Decode( p );
				if ( !tmpBuf[tmpLen] ) { // invalid char?
					tryLatin1 = true;
					break;
				}
			}

			// pure ascii labels are copied verbatim, together with
			// the '.' that follows them
			// NOTE(review): labelStart[labelLen] can be the byte at
			// pend when the label runs to the end of the input.
			if ( labelIsAscii ) {
				if ( labelStart[labelLen] == '.' ) {
					labelLen++;
					p++;
				}
				gbmemcpy( encodedDomStart, labelStart, labelLen );
				encodedDomStart += labelLen;
				continue;
			}

			if ( tryLatin1 ) {
				// For latin1 urls
				// (each byte of the label is taken as one code point)
				tmpLen = 0;
				for ( ; tmpLen < labelLen; tmpLen++ ) {
					tmpBuf[tmpLen] = labelStart[tmpLen];
				}
			}

			gbmemcpy( encodedDomStart, "xn--", 4 );
			encodedDomStart += 4;

			// in: space left in encoded[]; out: bytes punycode_encode produced
			encodedLen = MAX_URL_LEN - (encodedDomStart - encoded);
			punycode_status status = punycode_encode( tmpLen, tmpBuf, NULL, &encodedLen, encodedDomStart );

			if ( status != 0 ) {
				// Give up? try again?
				// (we return with the object still in its reset() state)
				log("build: Bad Engineer, failed to "
				    "punycode international url %s (%" PRId32 ")",
				    t, (int32_t)status);
				return;
			}

			// We should check if what we encoded were valid url characters, no spaces, etc
			// FIXME: should we exclude just the bad chars? I've seen plenty of urls with
			// a newline in the middle.  Just discard the whole chunk for now
			bool badUrlChars = false;
			for ( uint32_t i = 0; i < encodedLen; i++ ) {
				if ( is_wspace_a( encodedDomStart[i] ) ) {
					badUrlChars = true;
					break;
				}
			}

			if ( encodedLen == 0 || badUrlChars ) {
				// drop this label entirely and step over one input char
				encodedDomStart -= 4; // don't need the xn--
				p++;
			} else {
				encodedDomStart += encodedLen;
				*encodedDomStart++ = *p++; // Copy in the . or the /
			}
		}

		// p now points to the end of the domain
		// encodedDomStart now points to the first free space in encoded string

		// Now copy the rest of the url in.  Watch out for non-ascii chars 
		// truncate the url, and keep it under max url length
		uint32_t newUrlLen = encodedDomStart - encoded;

		while (p < pend) {
			if ( ! *p ) {
				break; // null?
			}

			if (!is_ascii(*p)) {
				// url encode utf8 characters now
				char cs = getUtf8CharSize(p);

				// bad utf8 char?
				if ( !isValidUtf8Char(p) ) {
					break;
				}

				// too long?
				// (12 is also the output room handed to urlEncode below)
				if ( newUrlLen + 12 >= MAX_URL_LEN ) {
					break;
				}

				char stored = urlEncode ( &encoded[newUrlLen], 12 , p , cs );
				p += cs;
				newUrlLen += stored;

				continue;
			}

			if (is_wspace_a(*p)) {
				break;
			}

			if (newUrlLen + 1 >= MAX_URL_LEN) {
				break;
			}

			encoded[newUrlLen++] = *p++;
		}

		encoded[newUrlLen] = '\0';
		// parse the re-encoded copy instead of the original input
		return this->set( encoded, newUrlLen, addWWW, stripParams, stripCommonFile, titledbVersion );
	}

	// truncate length to the first occurrence of an unacceptable char
	tlen = i;

	// . jump over http:// if it starts with http://http://
	// . a common mistake...
	while ( tlen > 14 && ! strncasecmp ( t , "http://http://" , 14 ) ) {
		t += 7;
		tlen -= 7;
	}

	// only strip anchor for version <= 122 (we're stripping anchor in UrlParser)
	if (titledbVersion <= 122) {
		// strip the "#anchor" from http://www.xyz.com/somepage.html#anchor"
		for (int32_t i = 0; i < tlen; i++) {
			if (t[i] == '#') {
				// ignore anchor if a ! follows it. 'google hash bang hack'
				// which breaks the web and is now deprecated, but, there it is
				if (i + 1 < tlen && t[i + 1] == '!') {
					continue;
				}

				tlen = i;
				break;
			}
		}
	}

	// copy to "s" so we can NULL terminate it
	char s[MAX_URL_LEN];
	int32_t len = tlen;

	if (titledbVersion <= 122) {
		// store filtered url into s
		gbmemcpy (s, t, tlen);
		s[len] = '\0';

		if (stripParams) {
			stripParametersv122(s, &len);
		}
	} else {
		UrlParser urlParser(t, tlen);

		if (stripParams) {
			stripParameters(&urlParser);
		}

		// rebuild url
		urlParser.unparse();

		len = urlParser.getUrlParsedLen();

		// keep the same headroom in s[] as the input cap above
		if (len > MAX_URL_LEN - 10) {
			len = MAX_URL_LEN - 10;
		}
		strncpy(s, urlParser.getUrlParsed(), len);
		s[len] = '\0';
	}

	// remove common filenames like index.html
	// (each branch shortens len by the name length minus one, so the
	//  leading '/' of the matched name is kept)
	if ( stripCommonFile ) {
		if ( len - 14 > 0 &&
		     strncasecmp(&s[len-14], "/default.xhtml", 14) == 0 )
			len -= 13;
		else if ( len - 13 > 0 &&
			( strncasecmp(&s[len-13], "/default.html", 13) == 0 ||
		          strncasecmp(&s[len-13], "/default.ascx", 13) == 0 ||
		          strncasecmp(&s[len-13], "/default.ashx", 13) == 0 ||
		          strncasecmp(&s[len-13], "/default.asmx", 13) == 0 ||
		          strncasecmp(&s[len-13], "/default.xhtm", 13) == 0 ||
		          strncasecmp(&s[len-13], "/default.aspx", 13) == 0 ) )
			len -= 12;
		else if ( len - 12 > 0 &&
		        ( strncasecmp(&s[len-12], "/default.htm", 12) == 0 ||
		          strncasecmp(&s[len-12], "/default.php", 12) == 0 ||
		          strncasecmp(&s[len-12], "/default.asp", 12) == 0 ||
		          strncasecmp(&s[len-12], "/index.xhtml", 12) == 0 ) )
			len -= 11;
		else if ( len - 11 > 0 &&
		        ( strncasecmp(&s[len-11], "/index.html", 11) == 0 ||
		          strncasecmp(&s[len-11], "/index.aspx", 11) == 0 ||
		          strncasecmp(&s[len-11], "/index.xhtm", 11) == 0 ||
		          strncasecmp(&s[len-11], "/default.pl", 11) == 0 ||
		          strncasecmp(&s[len-11], "/default.cs", 11) == 0 ) )
			len -= 10;
		else if ( len - 10 > 0 &&
			( strncasecmp(&s[len-10], "/index.htm", 10) == 0 ||
			  strncasecmp(&s[len-10], "/index.php", 10) == 0 ||
			  strncasecmp(&s[len-10], "/index.asp", 10) == 0 ||
			  strncasecmp(&s[len-10], "/main.html", 10) == 0 ||
			  strncasecmp(&s[len-10], "/main.aspx", 10) == 0 ) )
			len -= 9;
		else if ( len - 9 > 0 &&
			( strncasecmp(&s[len-9], "/index.pl", 9) == 0 ||
			  strncasecmp(&s[len-9], "/main.htm", 9) == 0 ||
			  strncasecmp(&s[len-9], "/main.php", 9) == 0 ) )
			len -= 8;
		else if ( len - 8 > 0 &&
			( strncasecmp(&s[len-8], "/main.pl", 8) == 0 ) )
			len -= 7;
		s[len] = '\0';
	}
	

	// replace the "\" with "/" -- a common mistake
	int32_t j;
	for ( j = 0 ; s[j] ; j++) 
	{
		if (s[j]=='\\') 
		{
			s[j]='/';
		}
	}
		
	// . dig out the protocol/scheme for this s (check for ://)
	// . protocol may only have alnums and hyphens in it
	// . (empty loop body: it only advances i past the scheme candidate)
	for ( i = 0 ; s[i] && (is_alnum_a(s[i]) || s[i]=='-') ; i++ );
	
	// if we have a legal protocol, then set "m_scheme", "slen" and "sch"
	// and advance i to the m_host
	if ( i + 2 < len && s[i]==':' && s[i+1]=='/' && s[i+2]=='/') 
	{
		// copy lowercase protocol to "m_url"
		to_lower3_a ( s , i + 3 , m_url ); 
		m_scheme = m_url;
		m_slen   = i;
		m_ulen   = i + 3;
		i += 3;
	}
	else 
	// scheme followed by ":/" and an alnum, i.e. one slash is missing
	if (i + 2 < len && s[i]==':' && s[i+1]=='/'&& is_alnum_a(s[i+2]))
	{
		// copy lowercase protocol to "m_url"
		to_lower3_a ( s , i + 2 , m_url ); 
		// add in needed /
		m_url[i+2]='/';
		m_scheme = m_url;
		m_slen   = i;
		m_ulen   = i + 3;
		i += 2;
	}
	else 
	{
		// no scheme found: default to http
		gbmemcpy ( m_url,"http://" , 7 );
		m_scheme = m_url;
		m_slen   = 4;
		m_ulen   = 7;
		i        = 0;
		// if s started with // then skip that (slashdot)
		if ( s[0]=='/' && s[1]=='/' ) i = 2;
	}
	// . now &s[i] should point to the m_host name
	// . chars allowed in hostname = period,alnum,hyphen,underscore
	// . stops at '/' or ':' or any other disallowed character
	j = i;
	while (s[j] && (is_alnum_a(s[j]) || s[j]=='.' || s[j]=='-'||s[j]=='_'))
		j++;
	// copy m_host into "s" (make it lower case, too)
	to_lower3_a ( s + i, j - i, m_url + m_ulen );
	m_host    = m_url + m_ulen;
	m_hlen    = j - i;
	// common mistake: if hostname ends in a . then back up
	while ( m_hlen > 0 && m_host[m_hlen-1]=='.' ) m_hlen--;
	// NULL terminate for strchr()
	m_host [ m_hlen ] = '\0';

	// advance m_ulen to end of hostname
	m_ulen += m_hlen;

	// . set our m_ip if hostname is in a.b.c.d format
	// . this returns 0 if not a valid ip string
	m_ip = atoip ( m_host , m_hlen );

	// advance i to the : for the port, if it exists
	i = j;

	// NULL terminate m_host for getTLD(), getDomain() and strchr() below
	m_host [ m_hlen ] = '\0';

	// use ip as domain if we're just an ip address like 192.0.2.1
	if ( m_ip ) {
		// ip address has no tld, or mid domain
		m_tld    = NULL;
		m_tldLen = 0;
		// but it does have a domain (1.2.3)
		m_domain = getDomainOfIp ( m_host , m_hlen , &m_dlen );
		// just use the domain as the mid domain for ip-based urls
		m_mdlen  = m_dlen;
	}
	// . otherwise, get the tld
	// . uses thorough list of tlds in Domains.cpp
	// . note m_tld stays assigned even when this condition fails because
	//   the tld is the whole host; the else branch then zeroes m_tldLen
	else if ( ( m_tld = ::getTLD ( m_host, m_hlen ) ) && m_tld > m_host ) {
		// set m_domain if we had a tld that's not equal to our host
		m_tldLen = strlen ( m_tld  );
		m_domain = ::getDomain ( m_host , m_hlen , m_tld , &m_dlen );
		// set the mid domain length (-1 for the '.')
		m_mdlen  = m_dlen - m_tldLen - 1;
	}
	// otherwise, we're no ip and we have no valid domain
	else {
		m_domain = NULL;
		m_dlen   = 0;
		m_tldLen = 0;
		m_mdlen  = 0;
	}

	// . if domain same as host then we might insert a "www." server name
	// . however, must have a period in domain name
	// . otherwise a domain name of "xxx" would become "www.xxx" and if
	//   Url::set() is called on that it would be "www.www.xxx" (bad bad)
	// . let's only add "www." if there's only 1 period, ok?
	if ( ! m_ip && addWWW && m_host == m_domain  && strchr(m_host,'.') ) {
		// shift the host right by 4 and shift the pointers into it too
		memmove ( m_host + 4 , m_host , m_hlen );
		gbmemcpy ( m_host , "www." , 4 );
		if ( m_domain ) m_domain += 4;
		if ( m_tld    ) m_tld    += 4;
		m_ulen += 4;
		m_hlen += 4;
	}
	// set the default port based on the protocol
	m_defPort = 80;
	if ( m_slen==5 && strncmp(m_scheme, "https",5)==0 ) m_defPort = 443;
	// assume we're using the default port for this scheme/protocol
	m_port = m_defPort;
	// see if a port was provided in the hostname after a colon
	if ( s[i] == ':' ) { 
		// remember the ptr so far
		int32_t savedLen = m_ulen;
		// add a colon to our m_url
		m_url [ m_ulen++ ] = ':';
		// scan for a '/' 
		j = i + 1;
		while ( s[j] && s[j]!='/') m_url[m_ulen++] = s[j++];
		// now read our port
		m_port = atol2 ( s + (i + 1) , j - (i + 1) );
		// if it's the default port, then remove what we copied
		if ( m_port == m_defPort ) m_ulen = savedLen;
		// make i point to the root / in the m_path, if any
		i = j; 
	}
	// how many chars is taken up by a specified port?
	// (the ':' plus one char per decimal digit, up to five digits)
	m_portLen = 0;
	if ( m_port != m_defPort ) {
		m_portLen += 2; // :3
		if ( m_port >= 10    ) m_portLen += 1;
		if ( m_port >= 100   ) m_portLen += 1;
		if ( m_port >= 1000  ) m_portLen += 1;
		if ( m_port >= 10000 ) m_portLen += 1;
	}

	// append a '/' to m_url then bail if there is no m_path after the port
	// (query, filename and extension keep their reset() values in this case)
	if ( s[i] != '/') {
		m_path    = m_url + m_ulen;
		m_path[0] = '/';
		m_plen    = 1;
		m_url[ ++m_ulen ]='\0';
		return;
	}

	// . get the m_path and m_path length
	// . j,i should point to start of path slash '/'
	// . scan so it points to end or a ? or # 
	j = i;
	
	// now we include # as part of the path if it is a hash bang '#!'
	// which was the web-breaking google hack that is now deprecated
	while ( s[j] && s[j]!='?' ) {
		if ( s[j] == '#' && s[j+1] != '!' )
			break;
		j++;
	}

	// point the path inside m_url even though we haven't written it yet
	m_path = m_url + m_ulen;
	// m_plen temporarily holds the url length at the start of the path;
	// it is converted into the real path length after the loop below
	m_plen = m_ulen; 
	// . deal with weird things in the path
	// . i points to start of path (should be /)
	for (; i < j ; i++ ) {
		// dedup double slashes
		// ensure m_ulen >= m_plen so we don't hurt "http:///" ...
		// but people sometimes put http:// in the *path*
		if ( s[i] == '/'  &&  m_url[m_ulen-1] == '/' &&
		     m_ulen-1 >= m_plen && 
		     m_ulen >= 2 && m_url[m_ulen-2] != ':' ) continue;

		// handled by UrlParser for version 123 and above
		if (titledbVersion <= 122) {
			// deal with current directories in the m_path
			if ( s[i] == '.'  &&  m_url[m_ulen-1] == '/' &&
			     (i+1 == j || s[i+1]=='/'))	continue;
			// . deal with damned ..'s in the m_path
			// . if next 2 chars are .'s and last char we wrote was '/'
			if ( s[i] == '.' && s[i+1]=='.' && (s[i+2] == '/' || s[i+2] == '\0') && m_url[m_ulen-1] == '/' ) {
				// dont back up over first / in path
				if ( m_url + m_ulen - 1 > m_path ) m_ulen--;
				// rewind output to just after the previous '/'
				while ( m_url[m_ulen-1] != '/'   ) m_ulen--;
				// skip i to next / after these 2 dots
				while ( s[i] && s[i]!='/' ) i++;
				continue;
			}
		}

		// don't allow ; before the ?...probably because of stripped 
		// sessionId...
		// I was going to add other possible dup separators, but now
		// it seems as though it might cause problems
		if (s[i] == ';' && s[i+1] == '?') continue;

		// store char and advance to next
		m_url[m_ulen++] = s[i];
	}
	// reset the path length in case we had to remove some weird stuff
	m_plen = m_ulen - m_plen;

	// . get the m_query
	// . the query is anything after the path that starts with ?
	// . NOTE: we ignore strings beginning with '#' (page relative anchors)
	// . s[i] is expected to be the char that ended the path scan above
	if ( i < len && s[i] != '#' ) {
		//remove back to back &'s in the cgi query
		//http://www.nyasatimes.com/national/politics/160.html?print&&&
		char *kstart = s + i;
		char *kend   = s + i + (len - i);
		char *dst    = m_url + m_ulen;
		for ( char *k = kstart ; k < kend ;  k++ ) {
			// skip & if we just did one
			if ( *k == '&' && k > kstart && *(k-1)=='&' ) continue;
			// copy over one char at a time
			*dst++ = *k;
		}
		// point after the '?' i guess
		// (m_qlen therefore excludes the leading char; m_ulen includes it)
		m_query   = m_url + m_ulen + 1;
		m_qlen    = dst - m_query;
		m_ulen += m_qlen + 1;
	}
	// get the m_filename from the m_path (m_flen might be 0)
	// (walks backwards from the end of the path to the last '/')
	m_flen = 0;
	while (m_path[m_plen-1-m_flen]!='/' && m_flen<m_plen) m_flen++;
	m_filename = m_path + m_plen - m_flen;

	// get the m_extension from the m_path
	// (trailing run of alnums, only kept if a '.' sits right before it)
	m_elen = 0;
	while (is_alnum_a(m_path[m_plen-1-m_elen]) && m_elen < m_plen)m_elen++;
	if ( m_path[ m_plen-1-m_elen] != '.' ) m_elen = 0; // no m_extension
	m_extension = m_path + m_plen - m_elen;

	// null terminate our s
	m_url[ m_ulen ]='\0';
}

// . a url is a "root" url when its path is exactly "/" and it has no query
// . the hostname is NOT examined here: for now we let all those
//   *.deviantart.com names clog us up because we do not want to
//   dis' stuff like espn.go.com
bool Url::isRoot() const {
	// the path has to be the single character '/'
	const bool pathIsBareSlash = ( m_plen == 1 ) && m_path && ( m_path[0] == '/' );
	// ... and there must not be any cgi part
	return pathIsBareSlash && ! m_query;
}

// . true if the host is just the domain, or the domain with a "www." prefix
bool Url::isSimpleSubdomain ( ) const {
	// host identical to the domain: passes right away
	const bool hostIsDomain = ( m_host == m_domain ) && ( m_hlen == m_dlen );
	if ( hostIsDomain ) return true;
	// otherwise the host must be exactly 4 chars longer than the domain
	// and those leading chars must be "www."
	const bool fourCharsLonger = ( m_hlen == m_dlen + 4 );
	return fourCharsLonger && strncmp ( m_host , "www." , 4 ) == 0;
}

// . returns the length of the j-th "parent" of this url
// . conceptually the same as appending "/.." j times
// . j == 0 yields the full url minus its query
// . a "/~" counts as a path boundary of its own
// . returns 0 when there is no j-th sub-url
int32_t Url::getSubUrlLen ( int32_t j ) const {
	// whole url, without the "?query" tail if there is one
	int32_t end = m_query ? m_ulen - ( m_qlen + 1 ) : m_ulen;

	if ( j == 0 ) return end;

	// never shrink into "scheme://host[:port]/"
	const int32_t minLen = m_slen + 3 + m_hlen + 1 + m_portLen;

	// walk backwards one char at a time, counting boundaries
	for ( ; end > minLen ; end-- ) {
		const char tail = m_url [ end - 1 ];
		if ( tail == '/' ) j--;
		if ( tail == '~' && m_url [ end - 2 ] == '/' ) j--;
		// the slash (or ~) itself is part of the sub-url
		if ( j == 0 ) return end;
	}

	return 0;
}

// . path-only flavour of getSubUrlLen()
// . j == 0 means the whole url path
// . returns 0 when getSubUrlLen(j) yields nothing
int32_t Url::getSubPathLen ( int32_t j ) const {
	const int32_t urlLen = getSubUrlLen ( j );
	if ( urlLen <= 0 ) return 0;
	// everything in front of the root slash: "scheme" + "://" + host,
	// plus the ":port" chars when a non-default port is present
	const int32_t prefixLen = m_slen + 3 + m_hlen + m_portLen;
	// what remains starts at (and includes) the root slash
	return urlLen - prefixLen;
}

// . dump every parsed component of this url to the trace log
// . pointer members may be NULL (e.g. right after reset()); the length
//   limited "%.*s" fields are printed with their stored lengths, and the
//   query, which is printed with a plain "%s", is guarded explicitly
void Url::print() const {
	logf(LOG_TRACE, "Url info");
	logf(LOG_TRACE, "\turl          : %s", m_url);
	logf(LOG_TRACE, "\turlhash32    : %" PRIx32, getUrlHash32());
	logf(LOG_TRACE, "\turlhash48    : %" PRIx64, getUrlHash48());
	logf(LOG_TRACE, "\turlhash64    : %" PRIx64, getUrlHash64());

	logf(LOG_TRACE, "\thost         : %.*s", m_hlen, m_host);
	logf(LOG_TRACE, "\thosthash32   : %" PRIx32, getHostHash32());
	// label matches the function actually called (was "hosthash48")
	logf(LOG_TRACE, "\thosthash64   : %" PRIx64, getHostHash64());

	logf(LOG_TRACE, "\tip           : %" PRId32, m_ip);
	logf(LOG_TRACE, "\tscheme       : %.*s", m_slen, m_scheme);
	logf(LOG_TRACE, "\tpath         : %.*s", m_plen, m_path);
	// m_query is NULL for urls without a cgi part; logf presumably forwards
	// to a printf-style formatter, where a null pointer for "%s" is
	// undefined, so substitute an explicit placeholder
	logf(LOG_TRACE, "\tquery        : %s", m_query ? m_query : "(null)");
	logf(LOG_TRACE, "\tport         : %" PRId32, m_port);

	logf(LOG_TRACE, "\tdomain       : %.*s", m_dlen, m_domain);
	logf(LOG_TRACE, "\tdomainhash32 : %" PRIx32, getDomainHash32());
	logf(LOG_TRACE, "\tdomainhash64 : %" PRIx64, getDomainHash64());

	logf(LOG_TRACE, "\ttld          : %.*s", m_tldLen, m_tld);
	logf(LOG_TRACE, "\tmid domain   : %.*s", m_mdlen, m_domain);
	logf(LOG_TRACE, "\tis root      : %i", isRoot());
}

// . count the '/' separators that follow the root slash of the path
// . the scan runs to the end of m_url, so any slashes located after the
//   path (e.g. inside the query) are counted as well
// . if countFilename is true and the url does not end in '/', the trailing
//   component counts as one more level
// . returns 0 for a url that has not been set (no path)
int32_t  Url::getPathDepth ( bool countFilename ) const {
	// an unset url has m_path == NULL and m_ulen == 0; without this guard
	// we would scan from a bogus address and read m_url[-1] below
	if ( ! m_path || m_ulen <= 0 ) return 0;
	const char *s     = m_path + 1;
	const char *send  = m_url + m_ulen;
	int32_t  count = 0;
	while ( s < send ) if ( *s++ == '/' ) count++;
	// if we're counting the filename as a path component...
	if ( countFilename && *(send-1) != '/' ) count++;
	return count;
}

// . does the hostname begin with "www." ?
bool Url::isHostWWW ( ) const {
	// too short to even hold the prefix
	if ( m_hlen < 4 ) return false;
	return strncmp ( m_host , "www." , 4 ) == 0;
}

// . is the url a porn/adult url?
// . i use /usr/share/dict/words to check for legit words
// . if it's long and has 4+ hyphens, consider it spam
// . if you add a word here, add it to PageResults.cpp:isQueryDirty()
bool Url::isAdult() const {
	//certain TLDs are clearly adult-oriented
	if(isAdultTLD(m_tld,m_tldLen))
		return true;
	if(m_hlen<=0)
		return false; // Invalid URL (no hostname)
	if(m_tldLen<=0)
		return false; // no TLD
	// store the hostname in a buf since we strtok it
	char s [ MAX_URL_LEN ];
	// don't store the .com or .org while searching for isSpam
	int32_t  slen = m_hlen - m_tldLen - 1;
	gbmemcpy ( s , m_host , slen );
	if ( ! m_domain ) return false;
	if ( ! m_dlen   ) return false;
	//int32_t  len = m_dlen;
	//gbmemcpy ( s , m_domain , len );
	// if tld is gov or edu or org, not porn
	if ( m_tldLen >= 3 && strncmp ( m_tld , "edu" , 3 )==0 ) return false;
	if ( m_tldLen >= 3 && strncmp ( m_tld , "gov" , 3 )==0 ) return false;
	// NULL terminate for strstr
	s[slen]='\0';
	// . if there is 4 or more hyphens, and hostLen > 30 consider it spam
	// . actually there seems to be a lot of legit sites with many hyphens
	if ( slen > 30 ) {
		int32_t count = 0;
		char *p = s;
		while ( *p ) if ( *p++ == '-' ) count++;
		if ( count >= 4 ) return true;
	}

	//
	// TODO: use getMatch()!!!! +pts -pts system
	// 

	// check each thing separated by periods for porn
	const char *send = s + slen;
	const char *p    = s;

	while(p<send) {
		// find the next period or hyphen
		const char *pend = p;
		while ( pend < send && *pend != '.' && *pend !='-' ) pend++;
		// check that
		if ( isAdultUrl ( p , pend - p ) ) return true;
		// point to next
		p = pend + 1;
	}
	return false;
}


// . remove any session id
// . i'm sick of these things causing dup problems
// . types:
// http://www.b.com/?PHPSESSID=737aec14eb7b360983d4fe39395
// http://www.b.com/cat.cgi/process?mv_session_id=xrf2EY3q&
// http://www.b.com/default?SID=f320a739cdecb4c3edef67e

// http://www.b.com/generic.html;$sessionid$QVBMODQAAAGNA?pid=7
// http://www.b.com/p.jhtml;jsessionid=J4QMFWBG1SPRVWCKUUXCJ0W?stuff=1
// look for ';'
// look for PHPSESSID, session_id, SID, jsessionid
// followed by string of at least 4 letters/numbers
		
// . file extensions we do NOT want to parse
// . entries are lowercase; the order below is the historical order
// . keep "zip" as the very last entry: code walking this table may treat
//   it as the terminator
// . extensions that are deliberately allowed are noted in comments at the
//   spot where they used to be listed
// . newer extensions are registered in hasNonIndexableExtension()
static const char * const s_badExtensions[] = {
	"ai", "aif", "aifc", "aiff", "asc", "au", "avi",
	"bcpio", "bin", "bmp", "bz2",
	// "c", "cc"  -- c source code, allow
	"ccad", "cdf",
	// "class"    -- text source code file usually, allow
	"cpio", "cpt",
	// "csh"
	"css",
	"dcr", "dir", "dms",
	// "doc"
	"drw", "dvi", "dwg", "dxf", "dxr",
	"eps", "etx", "exe", "ez",
	// "f"        -- ambiguous
	"f90", "fli",
	"gif", "gtar", "gz",
	// "h"
	"hdf", "hh", "hqx",
	// "htm", "html"
	"ice", "ief", "iges", "igs", "ips", "ipx",
	"jpe", "jpeg", "jpg",
	// "js"
	"kar",
	"latex", "lha", "lsp", "lzh",
	// "m"        -- ambiguous
	"man", "me", "mesh", "mid", "midi", "mif", "mime", "mov", "movie",
	"mp2", "mp3", "mpe", "mpeg", "mpg", "mpga", "ms", "msh",
	"nc",
	"oda",
	"pbm", "pdb",
	// "pdf"
	"pgm", "pgn", "png", "pnm", "pot", "ppm", "pps",
	// "ppt"
	"ppz", "pre", "prt",
	// "ps"
	"qt",
	"ra", "ram", "ras", "rgb", "rm", "roff", "rpm",
	"deb", // debian/ubuntu package file
	"rtf", "rtx",
	"scm", "set", "sgm", "sgml",
	// "sh"       -- shells are text files
	"shar", "silo", "sit", "skd", "skm", "skp", "skt", "smi", "smil",
	"snd", "sol", "spl", "src", "step", "stl", "stp",
	"sv4cpio", "sv4crc", "swf",
	// "t"        -- ambiguous ... Mr.T.
	"tar", "tcl", "tex", "texi", "texinfo", "tif", "tiff",
	"tr", "tsi", "tsp", "tsv",
	// "txt"
	"unv", "ustar",
	"vcd", "vda", "viv", "vivo", "vrml",
	"wav", "wrl",
	"xbm", "xlc", "xll", "xlm",
	// "xls"
	"xlw",
	// "xml"
	"xpm", "xwd", "xyz",
	"zip",
};



// lookup table: hash of a bad extension -> first version that rejects it
static HashTable s_badExtTable;
// set once s_badExtTable has been completely filled
static bool s_badExtInitialized;
// held for the whole of hasNonIndexableExtension() (table fill and lookup)
static GbMutex s_badExtTableMutex;

// . returns true if this url's extension is listed as bad (not indexable)
//   for the given "version"
// . every table entry stores the first version in which the extension is
//   considered bad; an extension only counts as bad when that stored
//   version is <= "version"
// . returns false if there is no extension, or if the table could not be
//   built (the failure is logged and the build is retried on the next call)
bool Url::hasNonIndexableExtension( int32_t version ) const {
	if ( ! m_extension || m_elen == 0 ) return false;
	ScopedLock sl(s_badExtTableMutex);
	if(!s_badExtInitialized) { //if hash has not been created-create one
		//version 72 and before.
		// walk the WHOLE list. the previous loop used the final "zip"
		// entry as its terminator and stopped before adding it, so
		// .zip was never registered.
		const size_t numBadExt = sizeof(s_badExtensions) / sizeof(s_badExtensions[0]);
		for ( size_t i = 0 ; i < numBadExt ; i++ ) {
			int tlen = strlen(s_badExtensions[i]);
			int64_t swh = hash64Lower_a(s_badExtensions[i],tlen);
			if(!s_badExtTable.addKey(swh,(int32_t)50))
			{
				log(LOG_ERROR,"hasNonIndexableExtension: Could not add hash %" PRId64" to badExtTable.", swh);
				return false;
			}
		}


		//version 73 and after.
		if(!s_badExtTable.addKey(hash64Lower_a("wmv", 3),(int32_t)73) ||
		   !s_badExtTable.addKey(hash64Lower_a("wma", 3),(int32_t)73) ||    
		   !s_badExtTable.addKey(hash64Lower_a("ogg", 3),(int32_t)73))
		{
			log(LOG_ERROR,"hasNonIndexableExtension: Could not add hash to badExtTable (2).");
			return false;
		}
		
		// BR 20160125: More unwanted extensions
		if(
			!s_badExtTable.addKey(hash64Lower_a("7z", 2),(int32_t)122) ||
			!s_badExtTable.addKey(hash64Lower_a("lz", 2),(int32_t)122) ||
			!s_badExtTable.addKey(hash64Lower_a("xz", 2),(int32_t)122) ||
			!s_badExtTable.addKey(hash64Lower_a("apk", 3),(int32_t)122) ||
			!s_badExtTable.addKey(hash64Lower_a("com", 3),(int32_t)122) ||
			!s_badExtTable.addKey(hash64Lower_a("dll", 3),(int32_t)122) ||
			!s_badExtTable.addKey(hash64Lower_a("dmg", 3),(int32_t)122) ||
			!s_badExtTable.addKey(hash64Lower_a("flv", 3),(int32_t)122) ||
			!s_badExtTable.addKey(hash64Lower_a("gpx", 3),(int32_t)122) ||
			!s_badExtTable.addKey(hash64Lower_a("ico", 3),(int32_t)122) ||
			!s_badExtTable.addKey(hash64Lower_a("iso", 3),(int32_t)122) ||
			!s_badExtTable.addKey(hash64Lower_a("kmz", 3),(int32_t)122) ||
			!s_badExtTable.addKey(hash64Lower_a("mp4", 3),(int32_t)122) ||
			!s_badExtTable.addKey(hash64Lower_a("rar", 3),(int32_t)122) ||
			!s_badExtTable.addKey(hash64Lower_a("svg", 3),(int32_t)122) ||
			!s_badExtTable.addKey(hash64Lower_a("vcf", 3),(int32_t)122) ||
//			!s_badExtTable.addKey(hash64Lower_a("xls", 3),(int32_t)122) ||		// Should be handled by converter (AbiWord)
		   	!s_badExtTable.addKey(hash64Lower_a("lzma", 4),(int32_t)122) ||    
//			!s_badExtTable.addKey(hash64Lower_a("pptx", 4),(int32_t)122) ||		// Should be handled by converter (AbiWord)
			!s_badExtTable.addKey(hash64Lower_a("thmx", 4),(int32_t)122) ||
		   	!s_badExtTable.addKey(hash64Lower_a("zipx", 4),(int32_t)122) ||
//			!s_badExtTable.addKey(hash64Lower_a("xlsx", 4),(int32_t)122) ||		// Should be handled by converter (AbiWord)
		   	!s_badExtTable.addKey(hash64Lower_a("zsync", 5),(int32_t)122) ||    
		   	!s_badExtTable.addKey(hash64Lower_a("torrent", 7),(int32_t)122) ||
		   	!s_badExtTable.addKey(hash64Lower_a("manifest", 8),(int32_t)122)
		   	)
		{
			log(LOG_ERROR,"hasNonIndexableExtension: Could not add hash to badExtTable (3).");
			return false;
		}
		
		s_badExtInitialized = true;
	}


	// NOTE(review): the 64-bit hash is narrowed to int here while the
	// inserts above hand the 64-bit value to addKey(); presumably
	// HashTable keys are 32-bit so both sides narrow alike -- confirm.
	int myKey = hash64Lower_a(m_extension,m_elen);
	int32_t badVersion = s_badExtTable.getValue(myKey);

	// 0 is treated as "not in the table"; a larger stored version means
	// the extension was not yet considered bad in "version"
	if( badVersion == 0 || badVersion > version ) 
	{
		return false;
	}
	
	return true;
}

// . true if the extension is "xml" (letter case is ignored)
bool Url::hasXmlExtension ( ) const {
	// only a 3 character extension can possibly match
	if ( ! m_extension || m_elen != 3 ) return false;

	// compare the lowercased chars one by one
	static const char wanted[] = "xml";
	for ( int k = 0 ; k < 3 ; k++ ) {
		if ( to_lower_a ( m_extension[k] ) != wanted[k] ) return false;
	}
	return true;
}


// . true if the extension is "json" (letter case is ignored)
// . the previous length guard rejected every extension of 4 or more chars,
//   so the 4-char "json" comparison could never be reached and the function
//   always returned false
bool Url::hasJsonExtension ( ) const {

	// only a 4 character extension can possibly match
	if ( ! m_extension || m_elen != 4 ) return false;

	// lowercase copy; 4 chars plus the terminating NUL fit in ext[5]
	char ext[5];
	int i;
	for(i=0; i < m_elen; i++)
	{
		ext[i] = to_lower_a(m_extension[i]);
	}
	ext[i] = '\0';

	return memcmp(ext, "json", 4) == 0;
}


// . true if the extension is "js" (letter case is ignored)
bool Url::hasScriptExtension ( ) const {
	// only a 2 character extension can possibly match
	if ( ! m_extension || m_elen != 2 ) return false;

	// compare the lowercased chars one by one
	static const char wanted[] = "js";
	for ( int k = 0 ; k < 2 ; k++ ) {
		if ( to_lower_a ( m_extension[k] ) != wanted[k] ) return false;
	}
	return true;
}



// see Url.h for a description of this.
bool Url::isLinkLoop ( ) const {
	const char *s          = m_path ;
	const char *send       = m_url + m_ulen;
	int32_t  count         = 0;
	int32_t  components    = 0;
	bool  prevWasDouble = false;
	const char *last     = NULL;
	if (!s) return false;
	// use this hash table to hash each path component in the url
	char  buf [ 5000 ];
	HashTable t; t.set ( 100 , buf , 5000 );
	// grab each path component
	for ( ; s < send ; s++ ) {
		if ( *s != '/' ) continue;
		// ok, add this guy to the hash table, if we had one
		if ( ! last ) { last = s; continue; }
		// give up after 50 components
		if ( components++ >= 50 ) return false;
		// hash him
		uint32_t h = hash32 ( last , s - last );
		// is he in there?
		int32_t slot = t.getSlot ( h );
		// get his val (count)
		int32_t val = 0;
		if ( slot >= 0 ) val = t.getValueFromSlot ( slot );
		// if not in there put him in a slot
		if ( slot < 0 ) {
			last = s;
			t.addKey ( h , 1 );
			continue;
		}
		// increment it
		val++;
		// does it occur 3 or more times? if so, we have a link loop
		if ( val >= 3 ) return true;
		// is it 2 or more? 
		if ( val == 2 ) count++;
		// if we have two such components, then we are a link loop.
		// BUT, we must be a pair!
		if ( count >= 2 && prevWasDouble ) return true;
		// set this so in case next guy is a double
		if ( val == 2 ) prevWasDouble = true;
		else            prevWasDouble = false;
		// add it back after incrementing
		t.setValue ( slot , val );
		// update "last"
		last = s;
	}
	return false;
}		

//
// here are some examples of link loops in urls:
//
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/letters/send/archive/bish/archive/bish/letters/bish/archive/lette\rs/send/archive/letters/send/bish/letters/archive/bish/letters/
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/bish/letters/archive/bish/archive/letters/send/archive/letters/send/archive/le\tters/send/archive/letters/send/bish/
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/bish/letters/send/archive/letters/send/archive/bish/archive/bish/\archive/bish/letters/send/archive/letters/archive/letters/send/archive/bish/let\ters/
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/letters/send/archive/letters/archive/bish/archive/bish/archive/bi\sh/letters/send/archive/bish/archive/letters/send/bish/archive/bish/letters/sen\d/archive/
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/bish/letters/send/archive/bish/letters/bish/letters/send/archive/\bish/archive/letters/bish/letters/send/archive/bish/letters/send/bish/archive/l\etters/bish/letters/archive/letters/send/
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/bish/letters/send/archive/bish/letters/send/bish/archive/letters/\send/bish/archive/letters/send/archive/letters/bish/archive/bish/archive/letter\s/


// . returns true if the hostname is accepted by atoip()
// . a missing host, or one whose first char is not a digit, is never an ip
bool Url::isIp() const {
	if ( ! m_host || ! is_digit ( *m_host ) ) {
		return false;
	}
	// any non-zero result from atoip() counts as "is an ip"
	return atoip ( m_host , m_hlen ) != 0;
}

// . 32-bit hash of "www." followed by the domain (m_domain/m_dlen)
// . the hash of the 4-char "www." prefix is computed first and then
//   continued over the domain bytes via hash32_cont()
int32_t Url::getHash32WithWWW ( ) const {
	// number of bytes already hashed (the length of "www."); passed by
	// pointer to hash32_cont()
	int32_t hashedSoFar = 4;
	const uint32_t prefixHash = hash32n ( "www." );
	const uint32_t fullHash   = hash32_cont ( m_domain , m_dlen , prefixHash , &hashedSoFar );
	return fullHash;
}

// hash32() over the m_hlen bytes of the hostname (m_host)
int32_t Url::getHostHash32 ( ) const {
	return static_cast<int32_t>( hash32 ( m_host , m_hlen ) );
}

// hash64() over the m_hlen bytes of the hostname (m_host)
int64_t Url::getHostHash64 ( ) const {
	return static_cast<int64_t>( hash64 ( m_host , m_hlen ) );
}

// hash32() over the m_dlen bytes of the domain (m_domain)
int32_t Url::getDomainHash32 ( ) const {
	return static_cast<int32_t>( hash32 ( m_domain , m_dlen ) );
}

// hash64() over the m_dlen bytes of the domain (m_domain)
int64_t Url::getDomainHash64 ( ) const {
	return static_cast<int64_t>( hash64 ( m_domain , m_dlen ) );
}

// hash32() over the m_ulen bytes of the full url (m_url)
int32_t Url::getUrlHash32 ( ) const {
	return static_cast<int32_t>( hash32 ( m_url , m_ulen ) );
}

// hash64() over the m_ulen bytes of the full url (m_url)
int64_t Url::getUrlHash64 ( ) const {
	return static_cast<int64_t>( hash64 ( m_url , m_ulen ) );
}

// . quick hostname extraction from an url of the form "scheme://host[:port]/..."
// . returns a ptr to the first char of the hostname INSIDE "url"; the
//   hostname is not NUL-terminated, use *hostLen
// . "hostLen" (optional) receives the hostname length, never including ":<port>"
// . "port" (optional) is set with atoi() ONLY if a ":<port>" follows the
//   hostname; otherwise it is left untouched, so pre-initialize it
// . if "url" has no ':' or ends right after it, the returned ptr points at
//   the terminating NUL and *hostLen is 0
const char *getHostFast ( const char *url , int32_t *hostLen , int32_t *port ) {
	const char *pp = url;
	// skip the scheme name (http, https, ftp, ...)
	while ( *pp && *pp != ':' ) pp++;
	// skip the "://" but never step over the terminating NUL, which a
	// blind "pp += 3" would do for a short or scheme-less string
	for ( int32_t i = 0 ; i < 3 && *pp ; i++ ) pp++;
	// the hostname starts here
	const char *uhost = pp;
	// it ends at the first '/' or ":<port>" or at the end of the string
	while ( *pp && *pp != '/' && *pp != ':' ) pp++;
	// the port is reported separately, it is not part of the hostname
	if ( *pp == ':' && port ) *port = atoi ( pp + 1 );
	// set length
	if ( hostLen ) *hostLen = static_cast<int32_t>( pp - uhost );
	return uhost;
}

// . quick path lookup in an url of the form "scheme://host[:port]/path..."
// . returns a ptr INSIDE "url" to the '/' that starts the path
// . if there is no path the returned ptr points at the terminating NUL
char *getPathFast ( char *url ) {
	char *pp = url;
	// skip the scheme name (http, https, ftp, ...)
	while ( *pp && *pp != ':' ) pp++;
	// skip the "://" but never step over the terminating NUL, which a
	// blind "pp += 3" would do for a short or scheme-less string
	for ( int32_t i = 0 ; i < 3 && *pp ; i++ ) pp++;
	// the hostname and the optional ":<port>" both end at the first '/'
	while ( *pp && *pp != '/' ) pp++;
	return pp;
}

// . quick TLD lookup for an url
// . "hasHttp" says whether "url" starts with a "scheme://" that must be
//   skipped first; if false the hostname is expected right at "url"
// . returns whatever ::getTLD() returns for the hostname, or NULL if
//   the hostname contains no alphabetic char (treated as an ip; this
//   also covers an empty hostname) or ::getTLD() finds nothing
// . "tldLen" (optional) receives the number of chars between the returned
//   ptr and the end of the hostname (the port is never included)
const char *getTLDFast(const char *url, int32_t *tldLen, bool hasHttp) {
	const char *pp = url;
	if ( hasHttp ) {
		// skip the scheme name (http, https, ftp, ...)
		while ( *pp && *pp != ':' ) pp++;
		// skip the "://" but never step over the terminating NUL,
		// which a blind "pp += 3" would do for a malformed url
		for ( int32_t i = 0 ; i < 3 && *pp ; i++ ) pp++;
	}
	// the hostname starts here
	const char *uhost = pp;

	// and ends at the first '/' or ":<port>" or at the end of the string
	while ( *pp && *pp != '/' && *pp != ':' ) pp++;
	int32_t uhostLen = pp - uhost;

	// . is the hostname just an IP address?
	// . a hostname without a single alphabetic char is treated as one
	bool isIp = true;
	for ( const char *ss = uhost ; ss < pp ; ss++ ) {
		if ( is_alpha_a( *ss ) ) {
			isIp = false;
			break;
		}
	}

	// if ip, no tld
	if ( isIp ) {
		return NULL;
	}

	// get the tld
	const char *tld = ::getTLD ( uhost , uhostLen );

	// if none, done
	if ( ! tld ) {
		return NULL;
	}

	// the tld runs up to the end of the hostname
	if ( tldLen ) {
		*tldLen = pp - tld;
	}

	return tld;
}

// . returns true if the hostname of "url" is longer than its domain, i.e.
//   something precedes the domain in the hostname
// . a leading "http://" or "https://" is optional and skipped if present
// . a hostname without a single alphabetic char (treated as an ip, this
//   includes an empty hostname) yields true
// . returns false if ::getTLD() finds no tld for the hostname
bool hasSubdomain(const char *url) {
	const char *cursor = url;
	// step over the scheme if there is one
	if ( strncmp ( cursor , "http://" , 7 ) == 0 ) {
		cursor += 7;
	}
	else if ( strncmp ( cursor , "https://" , 8 ) == 0 ) {
		cursor += 8;
	}

	// the hostname runs up to the first '/' or ":<port>" or the end
	const char *hostStart = cursor;
	for ( ; *cursor ; cursor++ ) {
		if ( *cursor == '/' || *cursor == ':' ) break;
	}
	const char *hostEnd = cursor;
	const int32_t hostLen = hostEnd - hostStart;

	// look for the first alphabetic char in the hostname; if there is
	// none we treat the hostname as an ip and say yes
	const char *scan = hostStart;
	while ( scan < hostEnd && ! is_alpha_a ( *scan ) ) {
		scan++;
	}
	if ( scan == hostEnd ) {
		return true;
	}

	// the domain can only be located once the tld is known
	const char *tld = ::getTLD ( hostStart , hostLen );
	if ( ! tld ) {
		return false;
	}

	// start two chars in front of the tld (skipping the '.' before it)
	// and walk back to the previous '.' or the start of the hostname
	const char *dom = tld - 2;
	while ( dom > hostStart && *dom != '.' ) {
		dom--;
	}
	// fix http://ok/
	if ( dom < hostStart || *dom == '/' ) {
		return false;
	}
	// the domain begins right after the '.'
	if ( *dom == '.' ) {
		dom++;
	}
	// if the domain does not start where the hostname starts then
	// there is a subdomain in front of it
	return dom != hostStart;
}

// . quick domain lookup for an url
// . returns NULL if url was in bad format and could not get domain. this
//   was happening when a host gave us a bad redir url and xmldoc tried
//   to set extra doc's robot.txt url to it "http://2010/robots.txt" where
//   the host said "Location: 2010 ...".
// . "hasHttp" says whether "url" starts with a "scheme://" that must be
//   skipped first; if false the hostname is expected right at "url"
// . returns a ptr INSIDE "url" to the start of the domain, which is not
//   NUL-terminated; "domLen" (optional) receives its length
// . for a hostname without any alphabetic char (treated as an ip) the
//   "domain" is the hostname minus its last ".<digits>" group
const char *getDomFast ( const char *url , int32_t *domLen , bool hasHttp ) {
	const char *pp = url;
	if ( hasHttp ) {
		// skip the scheme name (http, https, ftp, ...)
		while ( *pp && *pp != ':' ) pp++;
		// skip the "://" but never step over the terminating NUL,
		// which a blind "pp += 3" would do for a malformed url
		for ( int32_t i = 0 ; i < 3 && *pp ; i++ ) pp++;
	}
	// the hostname starts here
	const char *uhost = pp;
	// and ends at the first '/' or ":<port>" or at the end of the string
	while ( *pp && *pp != '/' && *pp != ':' ) pp++;
	const char *hostEnd = pp;
	int32_t uhostLen = hostEnd - uhost;

	// . is the hostname just an IP address?
	// . a hostname without a single alphabetic char is treated as one
	const char *ss = uhost;
	while ( ss < hostEnd && ! is_alpha_a(*ss) ) ss++;
	if ( ss == hostEnd ) {
		// . might just be empty! like "\0"
		// . fixes core dump from
		//   http://www.marcom1.unimelb.edu.au/public/contact.html
		//   parsing host email address
		if ( uhostLen == 0 ) return NULL;
		// to be consistent with how Url::m_domain/m_dlen is set we
		// need to remove the last .X from the ip address.
		// skip back over the trailing digits, but stay inside the
		// hostname: an all-digit host like "2010" must not make us
		// read the bytes in front of it
		const char *lastDot = hostEnd - 1;
		while ( lastDot > uhost && is_digit(*lastDot) ) lastDot--;
		// must be a period
		if ( *lastDot != '.' ) {
			log("url: getDomFast() could not find period for "
			    "hostname in url");
			return NULL;
		}
		if ( domLen ) *domLen = lastDot - uhost;
		return uhost;
	}

	// the domain can only be located once the tld is known
	const char *utld = ::getTLD ( uhost , uhostLen );
	// no tld, then no domain
	if ( ! utld ) return NULL;
	// start two chars in front of the tld (skipping the '.' before it)
	const char *udom = utld - 2;
	// backup until we hit a '.' or hit the beginning
	while ( udom > uhost && *udom != '.' ) udom--;
	// fix http://ok/
	if ( udom < uhost || *udom == '/' ) return NULL;
	// if we hit '.' advance 1
	if ( *udom == '.' ) udom++;
	// the domain runs up to the end of the hostname
	if ( domLen ) *domLen = hostEnd - udom;
	return udom;
}

// Is it a ping server? Such a server might respond with huge documents
// holding thousands of links, which would normally be detected as link spam.
// There is no hardcoded list of ping server URLs in this source file, so the
// answer is always "no". The function is kept around until there is a better
// way of handling it.
bool Url::isPingServer ( ) const {
	const bool isPing = false;
	return isPing;
}


// "s" point to the start of a normalized url (includes http://, etc.)
// . returns a ptr INSIDE "s" to the first char after the "scheme://"
// . *hostLen receives the number of chars up to the next '/' or the end
//   of the string, so a ":<port>" is counted as part of the host here
// . "hostLen" is required and must not be NULL
// . if "s" has no ':' or ends right after the "://" the returned ptr
//   points at the terminating NUL and *hostLen is 0
const char *getHost(const char *s, int32_t *hostLen) {
	// skip proto, but stop at the end of a string that has no ':'
	while ( *s && *s != ':' ) s++;
	// skip the "://" without stepping over the terminating NUL
	for ( int32_t i = 0 ; i < 3 && *s ; i++ ) s++;
	// that is the host
	const char *host = s;
	// the first host char is taken as is (it is never treated as the
	// terminating '/'), but we must not step over the NUL of an empty host
	if ( *s ) s++;
	// the rest of the host runs up to the next '/'
	while ( *s && *s != '/' ) s++;
	// that is it
	*hostLen = static_cast<int32_t>( s - host );
	return host;
}

// "s" point to the start of a normalized url (includes http://, etc.)
// . returns "s" itself with *schemeLen set to the number of chars in
//   front of the first "://"
// . if "s" contains no "://" an empty string literal is returned and
//   *schemeLen is set to 0
const char *getScheme ( const char *s , int32_t *schemeLen )
{
	const char *separator = strstr(s, "://");

	if( separator )
	{
		// the scheme is everything in front of the separator
		*schemeLen = separator - s;
		return s;
	}

	*schemeLen = 0;
	return "";
}

// . return ptrs to the end
// . the character it points to SHOULD NOT BE part of the site
// . "s" must be a normalized url: "scheme://host/..."; if no '/' follows
//   the host (this includes a string without "scheme://") we call
//   gbshutdownLogicError()
// . the returned ptr is INSIDE "s", right after the (desiredDepth+1)-th
//   '/' of the path (the '/' that ends the host is not counted), or at
//   the terminating NUL if the path holds fewer '/'
// . '?' and '#' are not treated specially here
const char *getPathEnd(const char *s, int32_t desiredDepth) {
	// skip proto, but stop at the end of a string that has no ':'
	while ( *s && *s != ':' ) s++;
	// skip the "://" without stepping over the terminating NUL
	for ( int32_t i = 0 ; i < 3 && *s ; i++ ) s++;
	// skip over the hostname. its first char is taken as is, but we
	// must not step over the NUL of an empty host
	if ( *s ) s++;
	while ( *s && *s != '/' ) s++;
	// should always have a /
	if ( *s != '/' ) gbshutdownLogicError();
	// skip that
	s++;
	// walk the path counting '/' until we have seen one more than
	// "desiredDepth"; the loop increment leaves "s" right after it
	int32_t depth = 0;
	for ( ; depth <= desiredDepth && *s ; s++ ) {
		if ( *s == '/' ) depth++;
	}
	// return the end
	return s;
}

// . returns the number of '/' in the path of "s", not counting the '/'
//   that ends the hostname and not looking past a '?' or '#'
// . pathDepth==0 for "www.xyz.com"
// . pathDepth==0 for "www.xyz.com/"
// . pathDepth==0 for "www.xyz.com/foo"
// . pathDepth==1 for "www.xyz.com/foo/"
// . pathDepth==1 for "www.xyz.com/foo/x"
// . pathDepth==2 for "www.xyz.com/foo/x/"
// . pathDepth==2 for "www.xyz.com/foo/x/y"
// . "hasHttp" says whether "s" starts with a "scheme://" that must be
//   skipped first; if it is set but "s" holds no ':' we call
//   gbshutdownLogicError()
int32_t getPathDepth(const char *s, bool hasHttp) {
	// skip http:// if we got it
	if ( hasHttp ) {
		// skip proto. stop at the end of the string too, otherwise
		// the sanity check below could never fire and we would scan
		// past the terminating NUL
		while ( *s && *s != ':' ) s++;
		// must have it!
		if ( ! *s ) gbshutdownLogicError();
		// skip the "://" without stepping over the terminating NUL
		for ( int32_t i = 0 ; i < 3 && *s ; i++ ) s++;
	}
	// skip over hostname. its first char is taken as is, but we must
	// not step over the NUL of an empty string
	if ( *s ) s++;
	while ( *s && *s != '/' ) s++;
	// no path at all, might be a site like "xyz.com"
	if ( ! *s ) return 0;
	// we are on the '/' that ends the hostname, skip it
	s++;
	// init depth
	int32_t depth = 0;
	// do a character loop
	for ( ; *s ; s++ ) {
		// stop if we hit ? or #
		if ( *s == '?' || *s == '#' ) break;
		// count the '/'
		if ( *s == '/' ) depth++;
	}
	return depth;
}

// . builds a human readable version of "url" in "sb": every punycode
//   ("xn--") label found in front of the first '/' that follows the
//   optional "http://" or "https://" is decoded with punycode_decode()
//   and appended via SafeBuf::utf32Encode(); everything else is copied
//   verbatim
// . the output is appended to "sb", NUL-terminated with nullTerm(), and
//   sb->getBufStart() is returned
// . if a label can not be decoded we log it and copy the rest of the url,
//   starting at that label, undecoded
char* Url::getDisplayUrl( const char* url, SafeBuf* sb ) {
	const char *urlEnd = url + strlen(url);

	// the host part starts after the scheme, if there is one
	const char *p = url;
	if ( strncmp( p, "http://", 7 ) == 0 ) {
		p += 7;
	}
	else if ( strncmp( p, "https://", 8 ) == 0 ) {
		p += 8;
	}

	// and ends at the first '/' or at the end of the url
	const char *domEnd = static_cast<const char*>( memchr( p, '/', urlEnd - p ) );
	if ( domEnd == NULL ) {
		domEnd = urlEnd;
	}

	const char *found = NULL;
	// everything in front of "labelCursor" has already been written to sb
	const char *labelCursor = url;

	while ( ( found = strstr( labelCursor, "xn--" ) ) && ( found < domEnd ) ) {
		// copy the text between the previous label and this one as
		// is. this must be done for EVERY label, not just the first,
		// otherwise plain labels sitting between two punycode labels
		// are lost
		sb->safeMemcpy( labelCursor, found - labelCursor );

		// the encoded label runs up to the next '.', '/' or ":<port>"
		const char *encodedStart = found + 4;
		const char *labelEnd = encodedStart;
		while ( labelEnd < domEnd &&
			*labelEnd != '/' &&
			*labelEnd != '.' &&
			*labelEnd != ':' ) {
			labelEnd++;
		}

		uint32_t decoded [ MAX_URL_LEN];
		size_t decodedLen = MAX_URL_LEN - 1 ;
		punycode_status status = punycode_decode(labelEnd - encodedStart, encodedStart, &decodedLen, decoded, NULL);
		if ( status != 0 ) {
			log( "build: Bad Engineer, failed to depunycode international url %s", url );
			// everything in front of "found" is already in sb, so
			// continue from the label itself to not duplicate it
			sb->safePrintf("%s", found);
			sb->nullTerm();
			return sb->getBufStart();
		}

		sb->utf32Encode( decoded, decodedLen );

		// keep the '.' that separates this label from the next one
		if ( *labelEnd == '.' ) {
			sb->pushChar( *labelEnd++ );
		}

		labelCursor = labelEnd;
	}

	// Copy in the rest
	sb->safePrintf("%s", labelCursor);
	sb->nullTerm();
	return sb->getBufStart();
}