// open-source-search-engine/Url.h

// Matt Wells, copyright Mar 2001

// . a class for parsing urls
// . used by many other classes

#ifndef  _URL_H_
#define  _URL_H_

// size of the Url::m_url buffer declared further down in this header
// NOTE(review): whether this count includes the terminating \0 cannot be
// told from this header alone -- confirm against Url.cpp
#define MAX_URL_LEN 2083

// where should i put this #define? for now i'll keep it here
// NOTE(review): presumably the maximum length of a collection name; it is
// not referenced anywhere in the visible part of this header
#define MAX_COLL_LEN  64

#include "ip.h"      // atoip ( s,len)

// . helpers that operate on a raw url string, no Url object needed
// . NOTE(review): the pointer out-parameters (*Len, *port) are presumably
//   filled in by the callee -- confirm against Url.cpp
char *getPathFast     ( char *url );
char *getFilenameFast ( char *url , int32_t *filenameLen );
char *getTLDFast      ( char *url , int32_t *tldLen  , bool hasHttp = true );
char *getDomFast      ( char *url , int32_t *domLen  , bool hasHttp = true );
char *getHostFast     ( char *url , int32_t *hostLen , int32_t *port = NULL );

// yes/no predicates on a raw url string
bool hasSubdomain     ( char *url );
bool isPermalinky     ( char *url );
bool isHijackerFormat ( char *url );
bool isPingServer     ( char *s );

// . get the host of the normalized url "s"
// . "s" has to begin with its protocol (http://, https://, ...)
// . Links.cpp uses this for fast parsing, SiteGetter.cpp uses it as well
char *getHost ( char *s , int32_t *hostLen );

// . get the end of the path of a normalized url
// . SiteGetter.cpp uses this
// . num==0 means use just the hostname, "www.xyz.com", as the site
// . num==1 means use "www.xyz.com/foo/" as the site
char *getPathEnd ( char *s , int32_t num );

int32_t getPathDepth ( char *s , bool hasHttp );

// only a pointer to SafeBuf is used below (getDisplayUrl), so a forward
// declaration is enough and this header does not have to rely on some
// other header having declared SafeBuf before it was included
class SafeBuf;

// . holds one normalized url in a fixed-size buffer (m_url) together with
//   pointer/length pairs describing its parts: scheme, host, domain, tld,
//   port, path, filename, extension, query and anchor
// . all members are public ("private:" is commented out below)
class Url {

public:

	void print  ();
	void reset  ();

	// set from another Url, does a copy
	void set ( Url *url , bool addWWW );

	// . set from a NUL-terminated string, length taken with strlen()
	// . a NULL "s" writes through a NULL pointer on purpose so that the
	//   process faults right here instead of continuing with bad input
	void set    ( char *s ) {
		if ( ! s ) { char *xx=NULL;*xx=0; }
		set ( s , strlen(s) );
	}

	// . same as above but with a base url
	// . NOTE(review): "baseUrl" is presumably what a relative "s" gets
	//   resolved against -- confirm in Url.cpp
	void set ( Url *baseUrl , char *s ) {
		if ( ! s ) { char *xx=NULL;*xx=0; }
		set ( baseUrl , s , strlen(s) );
	}

	// . "s" must be an ENCODED url
	void set    ( char *s , int32_t len , bool addWWW = false,
		      bool stripSessionIds = false , bool stripPound = false ,
		      bool stripCommonFile = false ,
		      int32_t titleRecVersion = 0x7fffffff );
	void set    ( Url *baseUrl , char *s , int32_t len , bool addWWW = false,
		      bool stripSessionIds = false , bool stripPound = false ,
		      bool stripCommonFile = false ,
		      int32_t titleRecVersion = 0x7fffffff );
	void setIp  ( int32_t ip ) { m_ip = ip; }

	char isSessionId ( char *hh, int32_t titleRecVersion ) ;

	// . compare another url to us
	// . equal means same length and same bytes in m_url
	bool equals ( Url *u ) {
		if ( m_ulen != u->m_ulen ) return false;
		return strcmp ( m_url , u->m_url ) == 0;
	}

	// is the url's hostname actually an ip in disguise ("a.b.c.d")
	bool isIp   ();

	// is the hostname an ip #?
	bool hasIp               () { return m_ip != 0; } // ip of 0 means none
	bool isRoot              ();
	// a super root url is a root url where the hostname is NULL or "www"
	bool isSuperRoot         ();
	// true if a query part was found (m_query is non-NULL)
	bool isCgi               () { return m_query != NULL; }
	bool isExtensionIndexable(); // html, htm, cgi, asp, shtml, ...

	//returns True if the extension is in the list of
	//badExtensions - extensions not to be parsed
	bool isBadExtension(int32_t xxx);
	bool isSet()            { return m_ulen != 0; }

	// is this url a warc or arc url? i.e. ends in .warc or .arc or
	// .warc.gz or .arc.gz?
	bool isWarc ( );
	bool isArc ( );

	// does it end in .xml, .rdb or .rss, etc. kinda thing
	//bool isRSSFormat ( ) ;

	// is it http://rpc.weblogs.com/shortChanges.xml, etc.?
	bool isPingServer ( ) ;

	void setPort             (uint16_t port ) { m_port = port; }

	int32_t getSubUrlLen        (int32_t i);
	int32_t getSubPathLen       (int32_t i);

	int32_t getPort             () { return m_port; }
	int32_t getIp               () { return m_ip; }
	int32_t getIpDomain         () { return ipdom(m_ip); }

	// . plain accessors for the component pointers
	// . the strings they return are NOT individually NUL-terminated as far
	//   as this header shows; pair each with its *Len() accessor
	char *getUrl         () { return m_url; }
	char *getUrlEnd      () { return m_url + m_ulen; }
	char *getScheme      () { return m_scheme; }
	char *getHost        () { return m_host; }
	char *getDomain      () { return m_domain; }
	char *getTLD         () { return m_tld; }
	// the mid domain starts where the domain starts, it just has a
	// shorter length (m_mdlen)
	char *getMidDomain   () { return m_domain; } // w/o the tld
	char *getPath        () { return m_path; }
	char *getFilename    () { return m_filename; }
	char *getExtension   () { return m_extension; }
	char *getQuery       () { return m_query; }
	char *getIpString    () { return iptoa ( m_ip ); }
	char *getAnchor      () { return m_anchor; }
	//char *getSite         () {return m_site;};
	char *getPortStr     () { return m_portStr; }

	// plain accessors for the component lengths
	int32_t  getUrlLen         () { return m_ulen; }
	int32_t  getSchemeLen      () { return m_slen; }
	int32_t  getHostLen        () { return m_hlen; }
	int32_t  getDomainLen      () { return m_dlen; }
	int32_t  getPathLen        () { return m_plen; }
	char *getPathEnd        () { return m_path + m_plen; }
	int32_t  getFilenameLen    () { return m_flen; }
	int32_t  getExtensionLen   () { return m_elen; }
	int32_t  getQueryLen       () { return m_qlen; }
	int32_t  getTLDLen         () { return m_tldLen; }
	int32_t  getMidDomainLen   () { return m_mdlen; }
	int32_t  getPortLen        () { return m_portLen; }
	int32_t  getAnchorLen      () { return m_anchorLen; }
	int32_t  getDefaultPort    () { return m_defPort; }
	//int32_t  getSiteLen         () {return m_siteLen;};

	// . path length, plus the query if there is one
	// . the extra 1 is presumably the '?' between path and query
	int32_t  getPathLenWithCgi () {
		if ( ! m_query ) return m_plen;
		return m_plen + 1 + m_qlen;
	}

	// is the scheme exactly "http"?
	bool  isHttp            () {
		if ( m_ulen < 4 || m_slen != 4 ) return false;
		return m_scheme[0] == 'h' && m_scheme[1] == 't' &&
		       m_scheme[2] == 't' && m_scheme[3] == 'p';
	}

	// is the scheme exactly "https"?
	bool  isHttps           () {
		if ( m_ulen < 5 || m_slen != 5 ) return false;
		return m_scheme[0] == 'h' && m_scheme[1] == 't' &&
		       m_scheme[2] == 't' && m_scheme[3] == 'p' &&
		       m_scheme[4] == 's';
	}


	// . are we a site root?
	// . i.e. does this url == hometown.com/users/fred/ , etc.
	// . does not take into account whether we have a subdomain or domain
	//bool isSiteRoot(char *coll,
	//		class TagRec *tagRec = NULL ,
	//		char **retSite=NULL,
	//		int32_t *retSiteLen=NULL);

	// . returns the site and sets *siteLen
	// . returns NULL and sets g_errno on error
	// . returns NULL without g_errno set if our domain is invalid
	// . sets "*isDefault" to true if we just returned the default site,
	//   otherwise false
	//char *getSite ( int32_t *siteLen , char *coll ,
	//		bool defaultToHostname ,
	//		class TagRec *tagRec = NULL ,
	//		bool *isDefault = NULL );

	// used by buzz i guess
	//int32_t  getSiteHash32   ( char *coll );
	int32_t      getUrlHash32    ( ) ;
	int32_t      getHostHash32   ( ) ;
	int32_t      getDomainHash32 ( ) ;

	// if url is xyz.com then get hash of www.xyz.com
	int32_t getHash32WithWWW ( );

	int64_t getUrlHash64    ( ) ;
	int64_t getHostHash64   ( ) ;
	int64_t getDomainHash64   ( ) ;

	// low 48 bits of getUrlHash64()
	int64_t getUrlHash48    ( ) {
		return getUrlHash64() & 0x0000ffffffffffffLL; }

	bool hasMediaExtension ( ) ;

	// . store url w/o http://
	// . without trailing / if path is just "/"
	// . without "www." if in hostname and "rmWWW" is true
	// . returns length
	// . if "buf" is NULL just returns the shorthand-form length
	// . NOTE(review): the two lines above look stale -- the signature
	//   has no "buf" parameter, returns a char * and takes the length as
	//   an out-parameter; confirm against Url.cpp
	char *getShorthandUrl    ( bool rmWWW , int32_t *len );

	// count the path components (root url as 0 path components)
	int32_t  getPathDepth ( bool countFilename ); // = false );

	// get path component #num. starts at 0.
	char *getPathComponent ( int32_t num , int32_t *clen );
	//char *getPathEnd       ( int32_t num );

	// is our hostname "www" ?
	bool isHostWWW ( ) ;

	// true when the host is longer/shorter than the bare domain
	bool hasSubdomain() { return (m_dlen != m_hlen); }

	// is it xxx.com/* or www.xxx.com/* (CAUTION: www.xxx.yyy.com)
	bool isSimpleSubdomain();

	// spam means dirty/porn
	bool isDirty () { return isSpam(); }

	// is the url a porn/spam url?
	bool isSpam();

	// this is private
	bool isSpam ( char *s , int32_t slen ) ;


	// . detects crazy repetitive urls like this:
	//   http://www.pittsburghlive.com:8000/x/tribune-review/opinion/
	//   steigerwald/letters/send/archive/letters/send/archive/bish/
	//   archive/bish/letters/bish/archive/lettes/send/archive/letters/...
	// . The problem is they use a relative href link on the page when they
	//   should use an absolute and the microsoft web server will still
	//   give the content they meant to give!
	// . this is called by Msg14.cpp to not even spider such urls, and we
	//   also have some even better detection logic in Links.cpp which
	//   is probably more accurate than this function.
	bool isLinkLoop();

	static uint32_t unitTests();
	static char* getDisplayUrl(char* url, SafeBuf* sb);

	// private:

	char    m_url[MAX_URL_LEN]; // the normalized url
	int32_t    m_ulen;

	// points into "url" (http, ftp, mailto, ...)(all lowercase)
	char   *m_scheme;
	int32_t    m_slen;

	// points into "url" (a.com, www.yahoo.com, 1.2.3.4, ...)(allLowercase)
	char   *m_host;
	int32_t    m_hlen;

	// it's 0 if we don't have one
	int32_t    m_ip;

	// points into "url" (/  /~mwells/  /a/b/ ...) (always ends in /)
	char   *m_path;
	int32_t    m_plen;

	// points into "url" (a=hi+there, ...)
	char   *m_query;
	int32_t    m_qlen;

	// points into "url" (html, mpg, wav, doc, ...)
	char   *m_extension;
	int32_t    m_elen;

	// (a.html NULL index.html) (can be NULL)
	char   *m_filename;
	int32_t    m_flen;

	char   *m_domain;
	int32_t    m_dlen;

	char   *m_tld;
	int32_t    m_tldLen;

	// char *m_midDomain equals m_domain
	int32_t    m_mdlen;

	// (80, 8080, 8000, ...)
	int32_t    m_port;
	int32_t    m_defPort;
	int32_t    m_portLen;
	char   *m_portStr;

	// anchor
	char   *m_anchor;
	int32_t    m_anchorLen;

	// Base site url
	//char *m_site;
	//int32_t m_siteLen;
};

#endif