db43287908
It was called in one place but the result was ignored. Member m_ip was only used internally in Url::set(), so it was changed to a local variable.
262 lines
7.4 KiB
C++
262 lines
7.4 KiB
C++
// Matt Wells, copyright Mar 2001
|
|
|
|
// . a class for parsing urls
|
|
// . used by many other classes
|
|
|
|
#ifndef GB_URL_H
|
|
#define GB_URL_H
|
|
|
|
#include "max_url_len.h"
|
|
|
|
#include "TitleRecVersion.h"
|
|
#include <cstddef>
|
|
#include <string.h>
|
|
#include <stdint.h>
|
|
|
|
class SafeBuf;
|
|
|
|
// Free-standing helpers that work directly on raw url strings.
// Only declared here; the definitions live outside this header.
// The "Fast" variants take a "hasHttp" flag (default true).
// NOTE(review): "hasHttp" presumably says whether "url" still carries its
// scheme prefix -- confirm against the definitions.
char *getPathFast ( char *url );
const char *getTLDFast(const char *url, int32_t *tldLen, bool hasHttp = true);
const char *getDomFast(const char *url, int32_t *domLen, bool hasHttp = true);
bool hasSubdomain(const char *url);
// . "port" is optional; it defaults to NULL
const char *getHostFast ( const char *url , int32_t *hostLen , int32_t *port = NULL ) ;

// . returns the host of a normalized url pointed to by "s"
// . i.e. "s" must start with the protocol (i.e. http:// or https:// etc.)
// . used by Links.cpp for fast parsing and SiteGetter.cpp too
const char *getHost(const char *s, int32_t *hostLen);

// . returns the scheme of a normalized url pointed to by "s"
// . i.e. "s" must start with the protocol (i.e. http:// or https:// etc.)
// . used by SiteGetter.cpp too
// . the length out-parameter used to be called "hostLen", which looked like a
//   copy/paste from getHost(); presumably it receives the scheme length
const char *getScheme( const char *s , int32_t *schemeLen );

// . get the path end of a normalized url
// . used by SiteGetter.cpp
// . if num==0 just use "www.xyz.com" as site (the hostname)
// . if num==1 just use "www.xyz.com/foo/" as site
const char *getPathEnd(const char *s, int32_t num);

int32_t getPathDepth(const char *s, bool hasHttp);
|
|
|
|
class Url {
|
|
public:
|
|
Url();
|
|
|
|
// set from another Url, does a copy
|
|
void set( const Url *url ) {
|
|
if ( ! url ) {
|
|
reset();
|
|
return;
|
|
}
|
|
|
|
set( url->getUrl(), url->getUrlLen(), false, false );
|
|
}
|
|
|
|
void set( const char *s ) {
|
|
set( s, strlen( s ), false, false );
|
|
}
|
|
|
|
void set( const char *s, int32_t len ) {
|
|
set( s, len, false, false );
|
|
}
|
|
|
|
void set( const char *s, int32_t len, bool addWWW, bool stripParams, int32_t titledbVersion = TITLEREC_CURRENT_VERSION ) {
|
|
set( s, len, addWWW, stripParams, false, titledbVersion );
|
|
}
|
|
|
|
void set( const Url *baseUrl, const char *s, int32_t len ) {
|
|
set( baseUrl, s, len, false, false, false );
|
|
}
|
|
|
|
// . "s" must be an ENCODED url
|
|
void set( const Url *baseUrl, const char *s, int32_t len, bool addWWW, bool stripParams,
|
|
bool stripCommonFile, int32_t titledbVersion = TITLEREC_CURRENT_VERSION );
|
|
|
|
void print() const;
|
|
void reset ();
|
|
|
|
// compare another url to us
|
|
bool equals ( const Url *u ) const {
|
|
return ( m_ulen == u->m_ulen && strcmp( m_url, u->m_url ) == 0 );
|
|
}
|
|
|
|
// is the url's hostname actually in ip in disguise ("a.b.c.d")
|
|
bool isIp() const;
|
|
|
|
bool isRoot() const;
|
|
bool isCgi() const { return m_query; }
|
|
|
|
//returns True if the extension is in the list of
|
|
//badExtensions - extensions not to be parsed
|
|
bool hasNonIndexableExtension(int32_t xxx) const;
|
|
|
|
int32_t getSubUrlLen(int32_t i) const;
|
|
int32_t getSubPathLen(int32_t i) const;
|
|
|
|
int32_t getPort() const { return m_port; }
|
|
int32_t getPortLen() const { return m_portLen; }
|
|
|
|
int32_t getDefaultPort() const { return m_defPort; }
|
|
|
|
const char *getUrl() const { return m_url; }
|
|
int32_t getUrlLen() const { return m_ulen; }
|
|
|
|
const char *getScheme() const { return m_scheme; }
|
|
int32_t getSchemeLen() const { return m_slen; }
|
|
|
|
const char *getHost() const { return m_host; }
|
|
int32_t getHostLen() const { return m_hlen; }
|
|
|
|
const char *getDomain() const { return m_domain; }
|
|
int32_t getDomainLen() const { return m_dlen; }
|
|
|
|
const char *getTLD() const { return m_tld; }
|
|
int32_t getTLDLen() const { return m_tldLen; }
|
|
|
|
// w/o the tld
|
|
const char *getMidDomain() const { return m_domain; }
|
|
int32_t getMidDomainLen() const { return m_mdlen; }
|
|
|
|
const char *getPath() const { return m_path; }
|
|
int32_t getPathLen() const { return m_plen; }
|
|
const char *getPathEnd() const { return m_path + m_plen; }
|
|
|
|
const char *getFilename() const { return m_filename; }
|
|
int32_t getFilenameLen() const { return m_flen; }
|
|
|
|
const char *getExtension() const { return m_extension; }
|
|
int32_t getExtensionLen() const { return m_elen; }
|
|
|
|
const char *getQuery() const { return m_query; }
|
|
int32_t getQueryLen() const { return m_qlen; }
|
|
|
|
int32_t getPathLenWithCgi () const {
|
|
return m_query ? (m_plen + 1 + m_qlen) : m_plen;
|
|
}
|
|
|
|
bool isHttp() const { return ( m_defPort == 80 ); }
|
|
bool isHttps() const { return ( m_defPort == 443 ); }
|
|
|
|
// used by buzz i guess
|
|
int32_t getUrlHash32 ( ) const;
|
|
int32_t getHostHash32 ( ) const;
|
|
int32_t getDomainHash32 ( ) const;
|
|
|
|
// if url is xyz.com then get hash of www.xyz.com
|
|
int32_t getHash32WithWWW ( ) const;
|
|
|
|
int64_t getUrlHash64 ( ) const;
|
|
int64_t getHostHash64 ( ) const;
|
|
int64_t getDomainHash64 ( ) const;
|
|
|
|
int64_t getUrlHash48 ( ) const {
|
|
return getUrlHash64() & 0x0000ffffffffffffLL;
|
|
}
|
|
|
|
bool hasScriptExtension() const;
|
|
bool hasXmlExtension() const;
|
|
bool hasJsonExtension() const;
|
|
|
|
// count the path components (root url as 0 path components)
|
|
int32_t getPathDepth ( bool countFilename ) const;
|
|
|
|
// is our hostname "www" ?
|
|
bool isHostWWW ( ) const;
|
|
|
|
bool hasSubdomain() const { return (m_dlen != m_hlen); }
|
|
|
|
// is it xxx.com/* or www.xxx.com/* (CAUTION: www.xxx.yyy.com)
|
|
bool isSimpleSubdomain() const;
|
|
|
|
// is the url a porn/adult url?
|
|
bool isAdult() const;
|
|
|
|
// . detects crazy repetetive urls like this:
|
|
// http://www.pittsburghlive.com:8000/x/tribune-review/opinion/
|
|
// steigerwald/letters/send/archive/letters/send/archive/bish/
|
|
// archive/bish/letters/bish/archive/lettes/send/archive/letters/...
|
|
// . The problem is they use a relative href link on the page when they
|
|
// should us an absolute and the microsoft web server will still
|
|
// give the content they meant to give!
|
|
// . this is called by Msg14.cpp to not even spider such urls, and we
|
|
// also have some even better detection logic in Links.cpp which
|
|
// is probably more accurate than this function.
|
|
bool isLinkLoop() const;
|
|
|
|
bool isValid() const;
|
|
|
|
static char* getDisplayUrl( const char* url, SafeBuf* sb );
|
|
|
|
//is the registrar(s) of the TLD enforcing sane rules on punycode?
|
|
bool isPunycodeSafeTld() const;
|
|
bool hasPunycode() const;
|
|
bool getPunycodeDecodedHost(SafeBuf *sb) const;
|
|
bool getPunycodeDecodedMidDomain(SafeBuf *sb) const;
|
|
|
|
private:
|
|
void set( const char *s, int32_t len, bool addWWW, bool stripParams, bool stripCommonFile,
|
|
int32_t titledbVersion );
|
|
|
|
//What is what?
|
|
//If the given URL is http://www.foo.example.co.uk:8080/dir1/dir2/file.html?referrrrerrrr=Lrrr-from-Omicron-Persei-8
|
|
//then:
|
|
// scheme: http
|
|
// host: www.foo.example.co.uk
|
|
// domain: example.co.uk
|
|
// tld: co.uk
|
|
// middomain example
|
|
// path: /dir1/dir2/file.html
|
|
// port: 8080
|
|
// query: referrrrerrrr=Lrrr-from-Omicron-Persei-8
|
|
// filename: file.html
|
|
|
|
// the normalized url
|
|
char m_url[MAX_URL_LEN];
|
|
int32_t m_ulen;
|
|
|
|
// points into "url" (http, ftp, mailto, ...)(all lowercase)
|
|
char *m_scheme;
|
|
int32_t m_slen;
|
|
|
|
// points into "url" (a.com, www.yahoo.com, 192.0.2.1, ...)(allLowercase)
|
|
char *m_host;
|
|
int32_t m_hlen;
|
|
|
|
// points into "url" (/ /~mwells/ /a/b/ ...) (always ends in /)
|
|
char *m_path;
|
|
int32_t m_plen;
|
|
|
|
// points into "url" (a=hi+there, ...)
|
|
char *m_query;
|
|
int32_t m_qlen;
|
|
|
|
// points into "url" (html, mpg, wav, doc, ...)
|
|
char *m_extension;
|
|
int32_t m_elen;
|
|
|
|
// (a.html NULL index.html) (can be NULL)
|
|
char *m_filename;
|
|
int32_t m_flen;
|
|
|
|
const char *m_domain;
|
|
int32_t m_dlen;
|
|
|
|
const char *m_tld;
|
|
int32_t m_tldLen;
|
|
|
|
// char *m_midDomain equals m_domain
|
|
int32_t m_mdlen;
|
|
|
|
// (80, 8080, 8000, ...)
|
|
int32_t m_port;
|
|
int32_t m_defPort;
|
|
int32_t m_portLen;
|
|
|
|
const char *m_portPtr;
|
|
int32_t m_portPtrLen;
|
|
};
|
|
|
|
#endif // GB_URL_H
|