open-source-search-engine/HttpMime.cpp

// -*- c-basic-offset:8 tab-width:8 -*-

#include "gb-include.h"

#include "HttpMime.h"
#include "HashTable.h"

// . short printable names for the content type codes
// . indexed by content type code, so the position of each string matters
// . these must be 1-1 with the #define's in HttpMime.h
char *g_contentTypeStrings [] = {
	""     , // 0
	"html" , // 1
	"text" , // 2
	"xml"  , // 3
	"pdf"  , // 4
	"doc"  , // 5
	"xls"  , // 6
        "ppt"  , // 7
        "ps"   , // 8
	"gif"  , // 9
	"jpg"  , // 10
	"png"  , // 11
	"tiff" , // 12
	"bmp"  , // 13
	"javascript" , // 14
	"css"  , // 15
	"json" ,  // 16
	"image", // 17
	"spiderstatus" // 18
};

// a freshly constructed mime starts out with every parsed field cleared
HttpMime::HttpMime () {
	reset();
}

void HttpMime::reset ( ) {
	m_mimeStartPtr     = NULL;
	m_firstCookie      = NULL;
	m_status           = -1;
	m_contentLen       = -1;
	m_lastModifiedDate =  0;
	m_contentType      =  CT_HTML;
	m_lastModifiedDate =  0;
	m_charset          =  NULL;
	m_charsetLen       =  0;
	m_cookie           =  NULL;
	m_cookieLen        =  0;
	m_locationField    = NULL;
	m_locationFieldLen = 0;
	m_contentEncodingPos = NULL;
	m_contentLengthPos = NULL;
	m_contentTypePos   = NULL;
}

// . returns false if could not get a valid mime
// . we need the url in case there's a Location: mime that's base-relative
bool HttpMime::set ( char *buf , int32_t bufLen , Url *url ) {
	// reset some stuff
	m_mimeStartPtr     = NULL;
	m_firstCookie      = NULL;
	m_contentLen       = -1;
	m_content          = NULL;
	m_bufLen           =  0;
	m_contentType      =  CT_HTML;
	m_contentEncoding  =  ET_IDENTITY;
	m_lastModifiedDate =  0;
	m_charset          =  NULL;
	m_charsetLen       =  0;
	// at the very least we should have a "HTTP/x.x 404\[nc]"
	if ( bufLen < 13 ) { m_boundaryLen = 0; return false; }
	// . get the length of the Mime, must end in \r\n\r\n , ...
	// . m_bufLen is used as the mime length
	m_mimeStartPtr = buf;
	m_bufLen = getMimeLen ( buf , bufLen , &m_boundaryLen );
	// . return false if we had no mime boundary
	// . but set m_bufLen to 0 so getMimeLen() will return 0 instead of -1
	//   thus avoiding a potential buffer overflow
	if ( m_bufLen < 0 ) {
		m_bufLen = 0;
		m_boundaryLen = 0;
		log("mime: no rnrn boundary detected");
		return false;
	}
	// set this
	m_content = buf + m_bufLen;
	// . parse out m_status, m_contentLen, m_lastModifiedData, contentType
	// . returns false on bad mime
	return parse ( buf , m_bufLen , url );
}

// . find the blank line that ends the mime and return the number of bytes
//   up to and including it
// . recognized terminators: \r\r , \n\n , \r\n\r\n and \n\r\n\r
// . *bsize is set to half the terminator length (1 or 2) on a match
// . returns -1 if no boundary found
int32_t HttpMime::getMimeLen ( char *buf , int32_t bufLen , int32_t *bsize ) {
	// no boundary seen yet
	*bsize = 0;
	int32_t pos = 0;
	for ( ; pos < bufLen ; pos++ ) {
		const char first = buf[pos];
		// only a line break can start a boundary
		if ( first != '\r' && first != '\n' ) continue;
		// need one more byte for the short forms
		if ( pos + 1 >= bufLen ) continue;
		*bsize = 1;
		// \r\r or \n\n : the same line break twice
		if ( buf[pos+1] == first ) break;
		// need three more bytes for the long forms
		if ( pos + 3 >= bufLen ) continue;
		*bsize = 2;
		// \r\n\r\n or \n\r\n\r : the two line breaks alternating
		const char second = ( first == '\r' ) ? '\n' : '\r';
		if ( buf[pos+1] == second &&
		     buf[pos+2] == first  &&
		     buf[pos+3] == second ) break;
	}
	// ran off the end without finding the end of the MIME
	if ( pos == bufLen ) return -1;
	// the terminator is twice *bsize bytes long
	return pos + *bsize * 2;
}

// . parse the status line and the header fields we care about out of
//   mime[0..mimeLen)
// . each line is NUL-terminated in place while it is examined and the
//   original character is put back afterwards, so "mime" must be writable
// . pointer members (m_cookie, m_locationField, m_charset, ...) are left
//   pointing into "mime"; the lengths stored with them are measured while
//   the line is still terminated
// . "url" is only used to resolve a Location: field; may be NULL
// . returns false on bad mime
bool HttpMime::parse ( char *mime , int32_t mimeLen , Url *url ) {
	// reset locUrl to 0
	m_locUrl.reset();
	// return if we have no valid complete mime
	if ( mimeLen == 0 ) return false;
	// status is on first line
	m_status = -1;
	// skip HTTP/x.x till we hit a space
	char *p = mime;
	char *pend = mime + mimeLen;
	while ( p < pend && !is_wspace_a(*p) ) p++;
	// then skip over spaces
	while ( p < pend &&  is_wspace_a(*p) ) p++;
	// return false on a problem
	if ( p == pend ) return false;
	// then read in the http status
	m_status = atol2 ( p , pend - p );
	// if no Content-Type: mime field was provided, assume html
	m_contentType = CT_HTML;
	// assume default charset
	m_charset    = NULL;
	m_charsetLen = 0;
	// set contentLen, lastModifiedDate, m_cookie
	p = mime;
	while ( p < pend ) {
		// compute the length of the string starting at p and ending
		// at a \n or \r
		int32_t len = 0;
		while ( &p[len] < pend && p[len]!='\n' && p[len]!='\r' ) len++;
		// . if we could not find a \n or \r there was an error
		// . MIMEs must always end in \n or \r
		if ( &p[len] >= pend ) return false;
		// . stick a NULL at the end of the line
		// . overwrites \n or \r TEMPORARILY
		char c = p [ len ];
		p [ len ] = '\0';
		// parse out some meaningful data
		if      ( strncasecmp ( p , "Content-Length:" ,15) == 0 ) {
			m_contentLengthPos = p + 15;
			m_contentLen = atol( m_contentLengthPos);
		}
		else if ( strncasecmp ( p , "Last-Modified:"  ,14) == 0 ) {
			m_lastModifiedDate=atotime(p+14);
			// do not let them exceed current time for purposes
			// of sorting by date using datedb (see Msg16.cpp)
			time_t now = time(NULL);
			if (m_lastModifiedDate > now) m_lastModifiedDate = now;
		}
		else if ( strncasecmp ( p , "Content-Type:"   ,13) == 0 ) {
			m_contentType = getContentTypePrivate ( p + 13 );
			// remember where the value starts, past any blanks
			char *s = p + 13;
			while ( *s == ' ' || *s == '\t' ) s++;
			m_contentTypePos = s;
		}
		// . compare all 11 chars, colon included
		// . we skip 11 chars below, so matching only "Set-Cookie"
		//   would step past the terminator of a line that is
		//   exactly that long and would also swallow "Set-Cookie2:"
		else if ( strncasecmp ( p , "Set-Cookie:"   ,11) == 0 ) {
			// addCookiesIntoBuffer() starts from the first one
			if ( ! m_firstCookie ) m_firstCookie = p;
			// m_cookie ends up describing the last one seen
			m_cookie = p + 11;
			if ( m_cookie[0] == ' ' ) m_cookie++;
			m_cookieLen = gbstrlen ( m_cookie );
		}
		else if ( strncasecmp ( p , "Location:"       , 9) == 0 ) {
			// point to it
			char *tt = p + 9;
			// skip up to two leading spaces
			if ( *tt == ' ' ) tt++;
			if ( *tt == ' ' ) tt++;
			// at least set this for Msg13.cpp to use
			m_locationField    = tt;
			m_locationFieldLen = gbstrlen(tt);
			// . we don't add the "www." because of slashdot.com
			// . we skip initial spaces in this Url::set() routine
			if(url)
				m_locUrl.set ( url, p + 9, len - 9,
					       false/*addWWW?*/);
		}
		else if ( strncasecmp ( p , "Content-Encoding:", 17) == 0 ) {
			//only support gzip now, it doesn't seem like servers
			//implement the other types much
			m_contentEncodingPos = p+17;
			if(strstr(m_contentEncodingPos, "gzip")) {
				m_contentEncoding = ET_GZIP;
			}
			else if(strstr(m_contentEncodingPos, "deflate")) {
				//zlib's compression
				m_contentEncoding = ET_DEFLATE;
			}
		}
		// re-insert the character that we replaced with a '\0'
		p [ len ] = c;
		// go to next line
		p += len;
		// skip over the cruft at the end of this line
		while ( p < pend && ( *p=='\r' || *p=='\n' ) ) p++;
	}
	return true;
}

// . convert a textual date into a time_t by dispatching on its format
// . s must be null terminated
// . http://wgc.chem.pu.ru/educate/rfc/rfc1945/part4.htm#3.3 has date formats
// . #1: Sun, 06 Nov 1994 08:49:37 GMT  ;RFC 822, updated by RFC 1123
// . #2: Sunday, 06-Nov-94 08:49:37 GMT ;RFC 850,obsoleted by RFC1036
// . #3: Sun Nov  6 08:49:37 1994       ;ANSI C's asctime() format
// . #4: 06 Nov 1994 08:49:37 GMT  ... my own
// . #5: 2007-12-31
// . #6: 2008-04-30T20:48:25Z (ISO8601)

time_t atotime ( char *s ) {

	// move up to the first letter or digit
	for ( ; *s && ! isalnum (*s) ; s++ );

	// a leading number is either a year (#5/#6) or a day of month (#4)
	if ( is_digit(*s) ) {
		if ( atol(s) > 1900 ) return atotime5 ( s );
		return atotime4 ( s );
	}

	// . otherwise it starts with a weekday name: #1, #2 or #3
	// . a hyphen anywhere means type #2
	if ( strchr ( s , '-' ) ) return atotime2 ( s );

	// a comma anywhere means type #1
	if ( strchr ( s , ',' ) ) return atotime1 ( s );

	// all that is left is type #3
	return atotime3 ( s );
}

#include "Dates.h" // for getTimeZone()

// . #1: Sun, 06 Nov 1994 08:49:37 GMT  ;RFC 822, updated by RFC 1123
// . the broken-down fields are run through timegm() and the value that
//   getTimeZone() gives for the trailing zone name is then added, unless
//   that value is BADTIMEZONE
time_t atotime1 ( char *s ) {
	// filled field by field, then handed to timegm()
	struct tm parts;
	// weekday name comes first
	parts.tm_wday = getWeekday ( s );
	// day of month: the first digits after it
	for ( ; *s && ! isdigit(*s) ; s++ );
	parts.tm_mday = atol ( s );
	// month name: the first letters after that
	for ( ; *s && ! isalpha (*s) ; s++ );
	parts.tm_mon = getMonth ( s );
	// four digit year, stored as years since 1900
	for ( ; *s && ! isdigit (*s) ; s++ );
	parts.tm_year = atol ( s ) - 1900 ;
	// step over the year and the blanks behind it
	for ( ; isdigit (*s) ; s++ );
	for ( ; isspace (*s) ; s++ );
	// HH:MM:SS
	getTime ( s , &parts.tm_sec , &parts.tm_min , &parts.tm_hour );
	// unknown if we're in daylight savings time
	parts.tm_isdst = -1;

	time_t when = timegm ( &parts );

	// step over HH:MM:SS
	for ( ; *s && ! isspace (*s) ; s++ );

	// nothing follows the time, so there is no timezone to look up
	if ( *s == '\0' ) return when;

	// step over the blanks in front of the timezone
	for ( ; isspace (*s) ; s++ );
	// apply whatever timezone "s" points to, usually gmt or utc
	int32_t zoneOffset = getTimeZone ( s ) ;
	if ( zoneOffset == BADTIMEZONE ) return when;
	return when + zoneOffset;
}

// . #2: Sunday, 06-Nov-94 08:49:37 GMT ;RFC 850,obsoleted by RFC1036
// . the year may be given with two or four digits
// . the value that getTimeZone() gives for the trailing zone name is
//   added to the timegm() result, unless it is BADTIMEZONE
time_t atotime2 ( char *s ) {
	// this time structure, once filled, will help yield a time_t
	struct tm t;
	// DAY OF WEEK
	t.tm_wday = getWeekday ( s ); // need getLongWeekday()?
	while ( *s && ! isdigit ( *s ) ) s++;
	// DAY OF MONTH
	t.tm_mday = atol ( s );
	while ( *s && ! isalpha (*s) ) s++;
	// MONTH
	t.tm_mon = getMonth ( s );
	while ( *s && ! isdigit (*s) ) s++;
	// YEAR
	// . tm_year wants the # of years since 1900
	// . a four digit year must have the 1900 taken off
	// . a two digit year uses the same pivot as POSIX %y:
	//   69-99 mean 1969-1999 and 00-68 mean 2000-2068
	int32_t year = atol ( s ) ;
	if      ( year >= 1900 ) year -= 1900;
	else if ( year <  69   ) year += 100;
	t.tm_year = year;
	while ( isdigit (*s) ) s++;
	while ( isspace (*s) ) s++;
	// TIME
	getTime ( s , &t.tm_sec , &t.tm_min , &t.tm_hour );
	// unknown if we're in  daylight savings time
	t.tm_isdst = -1;
	// translate using timegm
	time_t global = timegm ( &t );

	// skip HH:MM:SS, but never run past the end of the string
	while ( *s && ! isspace (*s) ) s++;
	// no timezone following, same handling as atotime1()
	if ( ! *s ) return global;
	// skip spaces
	while ( isspace (*s) ) s++;
	// apply whatever timezone "s" points to, which is usually gmt or utc
	int32_t tzoff = getTimeZone ( s ) ;
	if ( tzoff != BADTIMEZONE ) global += tzoff;
	return global;
}

// . #3: Sun Nov  6 08:49:37 1994       ;ANSI C's asctime() format
// . the fields come in the order weekday, month, day, time, year
// . no timezone is present in this format; the fields go to timegm() as is
time_t atotime3 ( char *s ) {
	// this time structure, once filled, will help yield a time_t
	struct tm t;
	// DAY OF WEEK
	t.tm_wday = getWeekday ( s ); // need getLongWeekday()?
	while ( isalpha(*s) ) s++;
	while ( isspace(*s) ) s++;
	// MONTH
	t.tm_mon = getMonth ( s );
	while ( *s && ! isdigit (*s) ) s++;
	// DAY OF MONTH
	t.tm_mday = atol ( s );
	// . step over just the day digits and the blanks behind them
	// . skipping to the next letter here would jump over the time and
	//   the year as well, since neither contains a letter
	while ( isdigit (*s) ) s++;
	while ( isspace (*s) ) s++;
	// TIME
	getTime ( s , &t.tm_sec , &t.tm_min , &t.tm_hour );
	// skip HH:MM:SS and the blanks behind it
	while ( *s && ! isspace (*s) ) s++;
	while ( isspace (*s) ) s++;
	// YEAR
	t.tm_year = atol ( s ) - 1900 ; // # of years since 1900
	// unknown if we're in  daylight savings time
	t.tm_isdst = -1;
	// translate using timegm
	return timegm ( &t );
}

// . #4: 06 Nov 1994 08:49:37 GMT  ;RFC 822, updated by RFC 1123
// . like atotime1() but without the leading weekday name
// . tm_wday is left unset here
time_t atotime4 ( char *s ) {
	// this time structure, once filled, will help yield a time_t
	struct tm t;
	// DAY OF MONTH
	t.tm_mday = atol ( s );
	while ( *s && ! isalpha (*s) ) s++;
	// MONTH
	t.tm_mon = getMonth ( s );
	while ( *s && ! isdigit (*s) ) s++;
	// YEAR
	t.tm_year = atol ( s ) - 1900 ; // # of years since 1900
	while ( isdigit (*s) ) s++;
	while ( isspace (*s) ) s++;
	// TIME
	getTime ( s , &t.tm_sec , &t.tm_min , &t.tm_hour );
	// unknown if we're in  daylight savings time
	t.tm_isdst = -1;
	// translate using timegm
	time_t global = timegm ( &t );

	// skip HH:MM:SS, but never run past the end of the string
	while ( *s && ! isspace (*s) ) s++;
	// no timezone following, same handling as atotime1()
	if ( ! *s ) return global;
	// skip spaces
	while ( isspace (*s) ) s++;
	// apply whatever timezone "s" points to, which is usually gmt or utc
	int32_t tzoff = getTimeZone ( s ) ;
	if ( tzoff != BADTIMEZONE ) global += tzoff;
	return global;
}

// . #5: 2007-12-31
// . #6: 2008-04-30T20:48:25Z (ISO8601)
// . year, month and day may be separated by '-', '/' or a single space
// . returns -1 if the year is outside 1900..2100 or the date part is
//   malformed
// . anything after the seconds (like the 'Z') is not looked at
time_t atotime5 ( char *s ) {
	// filled field by field, then handed to timegm()
	struct tm parts;

	// YEAR, sanity checked
	int32_t year = atol ( s ) ;
	if ( year < 1900 || year > 2100 ) return -1;
	parts.tm_year = year - 1900 ; // # of years since 1900
	for ( ; *s && isdigit (*s) ; s++ );
	// one separator, then the month must start with a digit
	bool sepOk = ( *s == '-' || *s == '/' || *s == ' ' );
	if ( ! sepOk ) return -1;
	s++;
	if ( ! is_digit(*s) ) return -1;

	// MONTH, tm_mon counts from 0
	parts.tm_mon = atol(s) - 1;
	for ( ; *s && isdigit (*s) ; s++ );
	// one separator, then the day must start with a digit
	sepOk = ( *s == '-' || *s == '/' || *s == ' ' );
	if ( ! sepOk ) return -1;
	s++;
	if ( ! is_digit(*s) ) return -1;

	// DAY OF MONTH
	parts.tm_mday = atol ( s );
	for ( ; isdigit (*s) ; s++ );
	for ( ; isspace (*s) ; s++ );
	// ISO8601 puts a 'T' between the date and the time
	if ( *s == 'T' ) s++;

	// TIME, all zeroes if nothing follows the date
	getTime ( s , &parts.tm_sec , &parts.tm_min , &parts.tm_hour );
	// unknown if we're in daylight savings time
	parts.tm_isdst = -1;
	return timegm ( &parts );
}


// . map a weekday name to its number using only the first two letters
// . sunday=0, monday=1, tuesday=2, wednesday=3, thursday=4, friday=5,
//   saturday=6
// . sun=0, mon=1, tue=2, wed=3, thu=4, fri=5, sat=6
// . case insensitive
// . returns 0 (sunday) for NULL, an empty string or an unknown name
int32_t getWeekday ( char *s ) {

	// nothing to look at; this also keeps us from reading s[1] beyond
	// the terminator of an empty string
	if ( ! s || ! s[0] ) return 0;

	// go through unsigned char so bytes >= 0x80 are safe for tolower()
	char a = tolower((unsigned char)s[0]);
	char b = tolower((unsigned char)s[1]);

	switch ( a ) {
	case 's':
		if ( b=='u' ) return 0; // sun
		return 6;                  // sat
	case 'm':
		return 1;                  // mon
	case 't':
		if ( b=='u' ) return 2; // tue
		return 4;                  // thu
	case 'w':
		return 3;                  // wed
	case 'f':
		return 5;                  // fri
	}
	//  bad week day, return sunday
	return 0;
}

// . map a month name to its tm_mon number, january=0 ... december=11
// . only the first three letters are looked at, case insensitive
// . returns 0 (january) for NULL, an empty string or an unknown name
int32_t getMonth ( char *s ) {

	// nothing to look at
	if ( ! s || ! s[0] ) return 0;

	// . go through unsigned char so bytes >= 0x80 are safe for tolower()
	// . never read past the terminator of a one or two letter string
	char a = tolower((unsigned char)s[0]);
	char b = tolower((unsigned char)s[1]);
	char c = b ? tolower((unsigned char)s[2]) : '\0';

	// every case ends in a return or a break so that an unknown name
	// reaches the default below instead of sliding into the next case
	switch ( a ) {
	case 'j':
		if ( b == 'a' ) return 0; // january
		if ( c == 'n' ) return 5; // june
		if ( c == 'l' ) return 6; // july
		break;
	case 'm':
		if ( c == 'r' ) return 2; // march
		if ( c == 'y' ) return 4; // may
		break;
	case 'a':
		if ( b == 'p' ) return 3; // april
		if ( b == 'u' ) return 7; // august
		break;
	case 'f': return  1; // february
	case 's': return  8; // september
	case 'o': return  9; // october
	case 'n': return 10; // november
	case 'd': return 11; // december
	}
	// default
	return 0;
}

// . s = "xx:xx:xx"
// . reads hour, minute and second in that order
// . a field that is missing comes out as whatever atol() makes of the
//   text left at that point (0 at the end of the string)
void getTime ( char *s , int *sec , int *min , int *hour ) {
	// destinations in the order the fields appear in the string
	int *field[3] = { hour , min , sec };
	for ( int k = 0 ; k < 3 ; k++ ) {
		*field[k] = atol ( s );
		// nothing to step over after the last field
		if ( k == 2 ) break;
		// step over the digits just read and one ':' behind them
		while ( isdigit ( *s ) ) s++;
		if ( *s == ':' ) s++;
	}
}

int32_t getContentTypeFromStr ( char *s ) {

	int32_t slen = gbstrlen(s);

	// trim off spaces at the end
	char tmp[64];
	if ( s[slen-1] == ' ' ) {
		strncpy(tmp,s,63);
		tmp[63] = '\0';
		int32_t newLen = gbstrlen(tmp);
		s = tmp;
		char *send = tmp + newLen;
		for ( ; send>s && send[-1] == ' '; send-- );
		*send = '\0';
	}


	// -1 means unknown
	//int32_t ct = -1;
	int32_t ct = CT_UNKNOWN;
	// html
	if      (!strcasecmp(s,"text/html"               ) ) ct = CT_HTML;
	else if (!strcasecmp(s,"text/plain"              ) ) ct = CT_TEXT;
	else if (!strcasecmp(s,"text/xml"                ) ) ct = CT_XML;
	else if (!strcasecmp(s,"text/txt"                ) ) ct = CT_TEXT;
	else if (!strcasecmp(s,"text"                    ) ) ct = CT_TEXT;
	else if (!strcasecmp(s,"text"                    ) ) ct = CT_TEXT;
	else if (!strcasecmp(s,"txt"                     ) ) ct = CT_TEXT;
	else if (!strcasecmp(s,"application/xml"         ) ) ct = CT_XML;
	// we were not able to spider links on an xhtml doc because
	// this was set to CT_XML, so try CT_HTML
	else if (!strcasecmp(s,"application/xhtml+xml"   ) ) ct = CT_HTML;
	else if (!strcasecmp(s,"application/rss+xml"     ) ) ct = CT_XML;
	else if (!strcasecmp(s,"rss"                     ) ) ct = CT_XML;
	else if (!strcasecmp(s,"application/rdf+xml"     ) ) ct = CT_XML;
	else if (!strcasecmp(s,"application/atom+xml"    ) ) ct = CT_XML;
	else if (!strcasecmp(s,"atom+xml"                ) ) ct = CT_XML;
	else if (!strcasecmp(s,"application/pdf"         ) ) ct = CT_PDF;
	else if (!strcasecmp(s,"application/msword"      ) ) ct = CT_DOC;
	else if (!strcasecmp(s,"application/vnd.ms-excel") ) ct = CT_XLS;
	else if (!strcasecmp(s,"application/vnd.ms-powerpoint")) ct = CT_PPT;
	else if (!strcasecmp(s,"application/mspowerpoint") ) ct = CT_PPT;
	else if (!strcasecmp(s,"application/postscript"  ) ) ct = CT_PS;
	else if (!strcasecmp(s,"application/warc"        ) ) ct = CT_WARC;
	else if (!strcasecmp(s,"application/arc"         ) ) ct = CT_ARC;
        else if (!strcasecmp(s,"image/gif"               ) ) ct = CT_GIF;
        else if (!strcasecmp(s,"image/jpeg"              ) ) ct = CT_JPG;
        else if (!strcasecmp(s,"image/png"               ) ) ct = CT_PNG;
        else if (!strcasecmp(s,"image/tiff"              ) ) ct = CT_TIFF;
        else if (!strncasecmp(s,"image/",6               ) ) ct = CT_IMAGE;
	else if (!strcasecmp(s,"application/javascript"  ) ) ct = CT_JS;
	else if (!strcasecmp(s,"application/x-javascript") ) ct = CT_JS;
	else if (!strcasecmp(s,"application/x-gzip"      ) ) ct = CT_GZ;
	else if (!strcasecmp(s,"text/javascript"         ) ) ct = CT_JS;
	else if (!strcasecmp(s,"text/x-js"               ) ) ct = CT_JS;
	else if (!strcasecmp(s,"text/js"                 ) ) ct = CT_JS;
	else if (!strcasecmp(s,"text/css"                ) ) ct = CT_CSS;
	else if (!strcasecmp(s,"application/json"        ) ) ct = CT_JSON;
	// facebook.com:
	else if (!strcasecmp(s,"application/vnd.wap.xhtml+xml") ) ct =CT_HTML;
	else if (!strcasecmp(s,"binary/octet-stream") ) ct = CT_UNKNOWN;
	else if (!strcasecmp(s,"application/octet-stream") ) ct = CT_UNKNOWN;
	else if (!strcasecmp(s,"application/binary" ) ) ct = CT_UNKNOWN;
	else if (!strcasecmp(s,"application/x-tar" ) ) ct = CT_UNKNOWN;
	else if ( !strncmp ( s , "audio/",6)  ) ct = CT_UNKNOWN;
	// . semicolon separated list of info, sometimes an element is html
	// . these might have an address in them...
	else if (!strcasecmp(s,"text/x-vcard" )  ) ct = CT_HTML;

	return ct;
}

// . s is a NUL terminated string like "text/html" or
//   "text/plain;charset=UTF-8", leading blanks allowed
// . returns the CT_* code for the media type, CT_UNKNOWN (after logging
//   it) if getContentTypeFromStr() does not know it
// . if a "charset" parameter directly follows the first semicolon,
//   m_charset/m_charsetLen are pointed at its value inside s
// . s is modified temporarily but restored before returning
int32_t HttpMime::getContentTypePrivate ( char *s ) {
	// skip leading blanks
	while ( *s==' ' || *s=='\t' ) s++;

	// the media type runs up to a ';', a line break or the end
	char *typeEnd = s;
	while ( *typeEnd          &&
		*typeEnd != ';'   &&
		*typeEnd != '\r'  &&
		*typeEnd != '\n'   ) typeEnd++;

	// a charset designation may follow the semicolon
	if ( *typeEnd == ';' ) {
		char *q = typeEnd + 1;
		while ( *q==' ' || *q=='\t' ) q++;
		// only "charset=euc-jp" style parameters are of interest
		if ( strncasecmp ( q , "charset" , 7 ) == 0 ) {
			q += 7;
			// blanks, an optional equal sign, more blanks
			while ( *q==' ' || *q=='\t' ) q++;
			if    ( *q=='='             ) q++;
			while ( *q==' ' || *q=='\t' ) q++;
			// the value runs to the next blank or line break
			m_charset = q;
			while ( *q        &&
				*q!='\r'  &&
				*q!='\n'  &&
				*q!=' '   &&
				*q!='\t'   ) q++;
			m_charsetLen = q - m_charset;
		}
	}

	// cut the string right after the media type for the lookup
	char saved = *typeEnd;
	*typeEnd = '\0';

	// returns CT_UNKNOWN if unknown
	int32_t ct = getContentTypeFromStr  ( s );

	// log it for reference
	if ( ct == CT_UNKNOWN )
		log("http: unrecognized content type \"%s\"",s);

	// put back what we cut
	*typeEnd = saved;
	return ct;
}

// . the table that maps a file extension to a content type
// . keyed by the 32 bit hash of the extension; each value is a pointer
//   to the content type string inside s_ext[]
// . filled by HttpMime::init()
static HashTableX s_mimeTable;
// set by HttpMime::init() so that the table is only built once
bool s_init = false;

// . empty the extension to content type table
// . s_init is left alone
void resetHttpMime ( )
{
	s_mimeTable.reset ( );
}

// . look up the content type string for the extension ext[0..elen)
// . unlike HttpMime::getContentTypeFromExtension() this does NOT fall
//   back to "text/html"; it returns NULL when no extension is given or
//   when the extension is not in the table
const char *extensionToContentTypeStr2 ( char *ext , int32_t elen ) {
	// need a non-empty extension
	bool haveExt = ( ext && ext[0] && elen > 0 );
	if ( ! haveExt ) return NULL;
	// the table is keyed by the hash of the extension
	int32_t key = hash32 ( ext , elen );
	char **slot = (char **)s_mimeTable.getValue ( &key );
	return slot ? *slot : NULL;
}

// . look up the content type string for the extension ext[0..elen)
// . falls back to "text/html" when no extension is given or when the
//   extension is not in the table
const char *HttpMime::getContentTypeFromExtension ( char *ext , int32_t elen) {
	const char *fallback = "text/html";
	// need a non-empty extension
	bool haveExt = ( ext && ext[0] && elen > 0 );
	if ( ! haveExt ) return fallback;
	// the table is keyed by the hash of the extension
	int32_t key = hash32 ( ext , elen );
	char **slot = (char **)s_mimeTable.getValue ( &key );
	return slot ? *slot : fallback;
}


// . same lookup for a NUL terminated extension
// . list of types is on: http://www.duke.edu/websrv/file-extensions.html
// . i copied it to the bottom of this file though
// . falls back to "text/html" when no extension is given or when the
//   extension is not in the table
const char *HttpMime::getContentTypeFromExtension ( char *ext ) {
	const char *fallback = "text/html";
	// need a non-empty extension
	if ( ext == NULL || ext[0] == '\0' ) return fallback;
	// the table is keyed by the hash of the extension
	int32_t key = hash32n ( ext );
	char **slot = (char **)s_mimeTable.getValue ( &key );
	return slot ? *slot : fallback;
}

// . content encoding implied by a file extension, compared ignoring case
// . "bz2" gives "x-bzip2" and "gz" gives "x-gzip"
// . returns NULL for a NULL or any other extension
const char *HttpMime::getContentEncodingFromExtension ( char *ext ) {
	if ( ext == NULL ) return NULL;
	if ( ! strcasecmp ( ext , "bz2" ) ) return "x-bzip2";
	if ( ! strcasecmp ( ext , "gz"  ) ) return "x-gzip";
	// everything else is sent as is
	return NULL;
}

// . build a "302" redirect mime in m_buf that points at redir
// . at most the first 600 bytes of redir are used
// . m_bufLen is set to the length of the mime, which is NUL terminated
void HttpMime::makeRedirMime ( char *redir , int32_t redirLen ) {
	// status line plus the start of the Location field
	static const char head[] = "HTTP/1.0 302 RD\r\nLocation: ";
	const int32_t headLen = sizeof(head) - 1;
	char *dst = m_buf;
	gbmemcpy ( dst , head , headLen );
	dst += headLen;
	// do not let an overlong url through
	if ( redirLen > 600 ) redirLen = 600;
	gbmemcpy ( dst , redir , redirLen );
	dst += redirLen;
	// end the Location line, then the mime itself
	gbmemcpy ( dst , "\r\n\r\n" , 4 );
	dst += 4;
	*dst = '\0';
	// set the mime's length
	m_bufLen = dst - m_buf;
	// deliberately crash if the mime got longer than expected
	if ( m_bufLen > 1023 ) { char *xx=NULL;*xx=0; }
}

// . build a reply mime in m_buf and set m_bufLen to its length
// . totalContentLen: value for Content-Length; a negative value leaves
//   the field out of a normal reply
// . cacheTime: -2 sends Cache-Control/Pragma no-cache and "Expires: -1",
//   -1 sends Pragma no-cache and "Expires: -1" (browser should not cache
//   at all), 0 sends nothing about caching, anything else sends an
//   Expires date that many seconds from now
// . lastModified: 0 means use the current time
// . offset/bytesToSend: a partial content reply is made if offset > 0 or
//   bytesToSend != -1
// . ext: file extension; picks the content type when contentType is
//   NULL, and the Content-Encoding for "gz" and "bz2"
// . POSTReply: make the short reply form used to answer a POST
// . charset: only printed in a normal reply; nothing is printed if NULL
// . httpStatus: -1 means use the default for the kind of reply
// . cookie: optional; either a bare "x=y" or one or more complete
//   "Set-Cookie: x=y\r\n" lines
// . NOTE(review): everything is printed into m_buf with sprintf() and no
//   length check; contentType, charset and cookie must be short enough
//   for whatever size m_buf has
void HttpMime::makeMime  ( int32_t    totalContentLen    ,
			   int32_t    cacheTime          ,
			   time_t  lastModified       ,
			   int32_t    offset             ,
			   int32_t    bytesToSend        ,
			   char   *ext                ,
			   bool    POSTReply          ,
			   char   *contentType        ,
			   char   *charset            ,
			   int32_t    httpStatus         ,
			   char   *cookie             ) {
	// . make the content type line
	// . falls back on the extension if the caller gave no type
	if ( ! contentType )
		contentType = (char *)getContentTypeFromExtension ( ext );

	// do not cache plug ins
	if ( contentType && strcmp(contentType,"application/x-xpinstall")==0)
		cacheTime = -2;

	// . we used to assume UTF-8 here when the content type was text
	// . No No No!!!
	// . This prevents charset specification in html files
	// . -partap

	// this is used for bz2 and gz files (mp3?)
	const char *contentEncoding = getContentEncodingFromExtension ( ext );
	// the Content-Encoding line, or an empty string
	char enc[128];
	if ( contentEncoding )
		sprintf ( enc , "Content-Encoding: %s\r\n", contentEncoding );
	else
		enc[0] = '\0';
	// get the time now
	time_t now;
	if ( isClockInSync() ) now = getTimeGlobal();
	else                   now = getTimeLocal();
	// get the greenwich mean time (GMT)
	char ns[128];
	struct tm *timeStruct = gmtime ( &now );
	// Wed, 20 Mar 2002 16:47:30 GMT
	strftime ( ns , 126 , "%a, %d %b %Y %T GMT" , timeStruct );
	// if lastModified is 0 use now
	if ( lastModified == 0 ) lastModified = now;
	// convert lastModified to greenwich mean time (GMT)
	char lms[128];
	timeStruct = gmtime ( &lastModified );
	// Wed, 20 Mar 2002 16:47:30 GMT
	strftime ( lms , 126 , "%a, %d %b %Y %T GMT" , timeStruct );
	// . the pragma no cache string (used just for proxy servers?)
	// . also use cache-control: for the browser itself (HTTP1.1, though)
	char tmp[128];
	char *pns ;
	// with cache-control on, when you hit the back button, it reloads
	// the page, this is bad for most things... so we only avoid the
	// cache for index.html and PageAddUrl.cpp (the main and addurl page)
	if      ( cacheTime == -2 ) pns =  "Cache-Control: no-cache\r\n"
					   "Pragma: no-cache\r\n"
					   "Expires: -1\r\n";
	// so when we click on a control link, it responds correctly.
	// like turning spiders on.
	else if  ( cacheTime == -1 ) pns = "Pragma: no-cache\r\n"
					   "Expires: -1\r\n";
	// don't specify cache times if it's 0 (let browser regulate it)
	else if ( cacheTime == 0 ) pns = "";
	// otherwise, expire tag: "Expires: Wed, 23 Dec 2001 10:23:01 GMT"
	else {
		time_t  expDate = now + cacheTime;
		timeStruct = gmtime ( &expDate );
		strftime ( tmp , 100 , "Expires: %a, %d %b %Y %T GMT\r\n",
			   timeStruct );
		pns = tmp;
	}
	// . set httpStatus
	// . a reply to a POST (not a GET or HEAD) defaults to 200 here
	char *p = m_buf;
	char *smsg = "";
	if ( POSTReply ) {
		if ( httpStatus == -1 ) httpStatus = 200;
		if ( httpStatus == 200 ) smsg = " OK";
		if ( ! charset ) charset = "utf-8";
		// no caching or Last-Modified lines in this form
		p += sprintf ( p,
			  "HTTP/1.0 %" INT32 "%s\r\n"
			  "Date: %s\r\n"
			  "Access-Control-Allow-Origin: *\r\n"
			  "Server: Gigablast/1.0\r\n"
			  "Content-Length: %" INT32 "\r\n"
			  "Connection: Close\r\n"
			  "%s"
			  "Content-Type: %s\r\n",
			  httpStatus , smsg ,
			  ns , totalContentLen , enc , contentType  );
	}
	// . is it partial content?
	// . if bytesToSend is < 0 it means "totalContentLen"
	else if ( offset > 0 || bytesToSend != -1 ) {
		if ( httpStatus == -1 ) httpStatus = 206;
		if ( ! charset ) charset = "utf-8";
		// NOTE(review): the Content-Range line is printed as
		// "<offset>-<offset+bytesToSend>(<total>)" which is not the
		// "bytes first-last/total" form of the HTTP spec; left as is
		// since clients of this server may depend on it
		p += sprintf( p,
			      "HTTP/1.0 %" INT32 " Partial content\r\n"
			      "%s"
			      "Content-Length: %" INT32 "\r\n"
			      "Content-Range: %" INT32 "-%" INT32 "(%" INT32 ")\r\n"// added "bytes"
			      "Connection: Close\r\n"
			      // for ajax support
			      "Access-Control-Allow-Origin: *\r\n"
			      "Server: Gigablast/1.0\r\n"
			      "%s"
			      "Date: %s\r\n"
			      "Last-Modified: %s\r\n"
			      "Content-Type: %s\r\n",
			      httpStatus ,
			      enc ,bytesToSend ,
			      offset , offset + bytesToSend ,
			      totalContentLen ,
			      pns ,
			      ns ,
			      lms , contentType );
	}
	// otherwise, do a normal mime
	else {
		// . the charset is printed straight into m_buf further down
		// . there used to be a 256 byte scratch copy of
		//   "; charset=..." made here that nothing read; it is gone
		//   because a long charset would have overflowed it
		if ( httpStatus == -1 ) httpStatus = 200;
		if ( httpStatus == 200 ) smsg = " OK";
		p += sprintf( p,
			      "HTTP/1.0 %" INT32 "%s\r\n"
			      , httpStatus , smsg );
		// if content length is not known, as in diffbot.cpp, then
		// do not print it into the mime
		if ( totalContentLen >= 0 )
			p += sprintf ( p ,
				       // make it at least 4 spaces so we can
				       // change the length of the content
				       // should we insert a login bar in
				       // Proxy::storeLoginBar()
				       "Content-Length: %04" INT32 "\r\n"
				       , totalContentLen );
		p += sprintf ( p ,
			      "%s"
			      "Content-Type: %s",
			       enc , contentType );
		if ( charset ) p += sprintf ( p , "; charset=%s", charset );
		p += sprintf ( p , "\r\n");
		p += sprintf ( p ,
			       "Connection: Close\r\n"
			       "Access-Control-Allow-Origin: *\r\n"
			       "Server: Gigablast/1.0\r\n"
			       "%s"
			       "Date: %s\r\n"
			       "Last-Modified: %s\r\n" ,
			       pns ,
			       ns ,
			       lms );
	}
	// write the cookie if we have one
	if (cookie) {
		// . now it is a list of Set-Cookie: x=y\r\n lines
		// . add the field name ourselves if it is a bare cookie
		if ( strncmp(cookie,"Set-Cookie",10 ) )
			p += sprintf(p,"Set-Cookie: ");
		p += sprintf ( p, "%s", cookie);
		// . end the line if the cookie did not do so itself
		// . NOTE(review): nothing is added when either the last
		//   char is \n or the one before it is \r, not only when
		//   both hold
		if ( p[-1] != '\n' && p[-2] != '\r' ) {
			*p++ = '\r';
			*p++ = '\n';
		}
	}

	// write another line to end the mime
	p += sprintf(p, "\r\n");
	// set the mime's length
	m_bufLen = p - m_buf;
}


//FILE EXTENSIONS to MIME CONTENT-TYPE
//------------------------------------

// . flat list of pairs: a file extension followed by its content type
// . HttpMime::init() hashes every extension into s_mimeTable and stores
//   a pointer to the string that follows it, so the number of strings
//   must stay even
// . extensions are lower case
static char *s_ext[] = {
      "ai" , "application/postscript",
     "aif" , "audio/x-aiff",
    "aifc" , "audio/x-aiff",
    "aiff" , "audio/x-aiff",
     "asc" , "text/plain",
      "au" , "audio/basic",
     "avi" , "video/x-msvideo",
   "bcpio" , "application/x-bcpio",
     "bin" , "application/octet-stream",
     // this used to say image/gif, but a bitmap is not a gif
     "bmp" , "image/bmp",
      "bz2", "application/x-bzip2",
       "c" , "text/plain",
      "cc" , "text/plain",
    "ccad" , "application/clariscad",
     "cdf" , "application/x-netcdf",
   "class" , "application/octet-stream",
    "cpio" , "application/x-cpio",
     "cpt" , "application/mac-compactpro",
     "csh" , "application/x-csh",
     "css" , "text/css",
     "dcr" , "application/x-director",
     "dir" , "application/x-director",
     "dms" , "application/octet-stream",
     "doc" , "application/msword",
     "drw" , "application/drafting",
     "dvi" , "application/x-dvi",
     "dwg" , "application/acad",
     "dxf" , "application/dxf",
     "dxr" , "application/x-director",
     "eps" , "application/postscript",
     "etx" , "text/x-setext",
     "exe" , "application/octet-stream",
      "ez" , "application/andrew-inset",
       "f" , "text/plain",
     "f90" , "text/plain",
     "fli" , "video/x-fli",
     "gif" , "image/gif",
    "gtar" , "application/x-gtar",
      "gz" , "application/x-gzip",
       "h" , "text/plain",
     "hdf" , "application/x-hdf",
      "hh" , "text/plain",
     "hqx" , "application/mac-binhex40",
     "htm" , "text/html",
    "html" , "text/html",
     "ice" , "x-conference/x-cooltalk",
     "ief" , "image/ief",
    "iges" , "model/iges",
     "igs" , "model/iges",
     "ips" , "application/x-ipscript",
     "ipx" , "application/x-ipix",
     "jpe" , "image/jpeg",
    "jpeg" , "image/jpeg",
     "jpg" , "image/jpeg",
      "js" , "application/x-javascript",
     "kar" , "audio/midi",
   "latex" , "application/x-latex",
     "lha" , "application/octet-stream",
     "lsp" , "application/x-lisp",
     "lzh" , "application/octet-stream",
       "m" , "text/plain",
     "man" , "application/x-troff-man",
      "me" , "application/x-troff-me",
    "mesh" , "model/mesh",
     "mid" , "audio/midi",
    "midi" , "audio/midi",
     "mif" , "application/vnd.mif",
    "mime" , "www/mime",
     "mov" , "video/quicktime",
   "movie" , "video/x-sgi-movie",
     "mp2" , "audio/mpeg",
     "mp3" , "audio/mpeg",
     "mpe" , "video/mpeg",
    "mpeg" , "video/mpeg",
     "mpg" , "video/mpeg",
    "mpga" , "audio/mpeg",
      "ms" , "application/x-troff-ms",
     "msh" , "model/mesh",
      "nc" , "application/x-netcdf",
     "oda" , "application/oda",
     "pbm" , "image/x-portable-bitmap",
     "pdb" , "chemical/x-pdb",
     "pdf" , "application/pdf",
     "pgm" , "image/x-portable-graymap",
     "pgn" , "application/x-chess-pgn",
     "png" , "image/png",
     "ico" , "image/x-icon",
     "pnm" , "image/x-portable-anymap",
     "pot" , "application/mspowerpoint",
     "ppm" , "image/x-portable-pixmap",
     "pps" , "application/mspowerpoint",
     "ppt" , "application/mspowerpoint",
     "ppz" , "application/mspowerpoint",
     "pre" , "application/x-freelance",
     "prt" , "application/pro_eng",
      "ps" , "application/postscript",
      "qt" , "video/quicktime",
      "ra" , "audio/x-realaudio",
     "ram" , "audio/x-pn-realaudio",
     "ras" , "image/cmu-raster",
     "rgb" , "image/x-rgb",
      "rm" , "audio/x-pn-realaudio",
    "roff" , "application/x-troff",
     "rpm" , "audio/x-pn-realaudio-plugin",
     "rtf" , "text/rtf",
     "rtx" , "text/richtext",
     "scm" , "application/x-lotusscreencam",
     "set" , "application/set",
     "sgm" , "text/sgml",
    "sgml" , "text/sgml",
      "sh" , "application/x-sh",
    "shar" , "application/x-shar",
    "silo" , "model/mesh",
     "sit" , "application/x-stuffit",
     "skd" , "application/x-koan",
     "skm" , "application/x-koan",
     "skp" , "application/x-koan",
     "skt" , "application/x-koan",
     "smi" , "application/smil",
    "smil" , "application/smil",
     "snd" , "audio/basic",
     "sol" , "application/solids",
     "spl" , "application/x-futuresplash",
     "src" , "application/x-wais-source",
    "step" , "application/STEP",
     "stl" , "application/SLA",
     "stp" , "application/STEP",
 "sv4cpio" , "application/x-sv4cpio",
  "sv4crc" , "application/x-sv4crc",
     "swf" , "application/x-shockwave-flash",
       "t" , "application/x-troff",
     "tar" , "application/x-tar",
     "tcl" , "application/x-tcl",
     "tex" , "application/x-tex",
    "texi" , "application/x-texinfo",
  "texinfo", "application/x-texinfo",
     "tif" , "image/tiff",
    "tiff" , "image/tiff",
      "tr" , "application/x-troff",
     "tsi" , "audio/TSP-audio",
     "tsp" , "application/dsptype",
     "tsv" , "text/tab-separated-values",
     "txt" , "text/plain",
     "unv" , "application/i-deas",
   "ustar" , "application/x-ustar",
     "vcd" , "application/x-cdlink",
     "vda" , "application/vda",
     "viv" , "video/vnd.vivo",
    "vivo" , "video/vnd.vivo",
    "vrml" , "model/vrml",
     "wav" , "audio/x-wav",
     "wrl" , "model/vrml",
     "xbm" , "image/x-xbitmap",
     "xlc" , "application/vnd.ms-excel",
     "xll" , "application/vnd.ms-excel",
     "xlm" , "application/vnd.ms-excel",
     "xls" , "application/vnd.ms-excel",
     "xlw" , "application/vnd.ms-excel",
     "xml" , "text/xml",
     "xpm" , "image/x-xpixmap",
     "xwd" , "image/x-xwindowdump",
     "xyz" , "chemical/x-pdb",
      "zip" , "application/zip" ,
      "xpi", "application/x-xpinstall",
      // newstuff
      "warc", "application/warc",
      "arc", "application/arc"
};

// . init s_mimeTable in this call
// . called from HttpServer::init
// . returns false and sets g_errno on error
// . s_init is raised before the table is built and lowered again on every
//   failure path, so a failed attempt is not reported as success by a
//   later call
bool HttpMime::init ( ) {
	// only need to call once
	if ( s_init ) return true;
	// make sure only called once (cleared again below if we fail)
	s_init = true;
	// keys are 4-byte hashes of the extension, values are char pointers
	// to the matching content type string in s_ext
	if ( ! s_mimeTable.set(4,sizeof(char *),256,NULL,0,false,1,"mimetbl")) {
		s_init = false;
		return false;
	}
	// s_ext is a flat list of (extension, content type) string pairs
	const uint32_t numStrings = sizeof(s_ext)/sizeof(char *);
	// set table from internal list
	for ( uint32_t i = 0 ; i < numStrings ; i+=2 ) {
		int32_t key = hash32n ( s_ext[i] );
		if ( s_mimeTable.addKey ( &key , &s_ext[i+1] ) ) continue;
		// NOTE(review): g_errno is presumably set by addKey() here,
		// confirm against HashTableX
		s_init = false;
		return log("HttpMime::init: failed to set table.");
	}
	// quick test
	const char *tt = getContentTypeFromExtension ( "zip" );
	if ( strcmp(tt,"application/zip") != 0 ) {
		g_errno = EBADENGINEER;
		s_init = false;
		return log("http: Failed to init mime table correctly.");
	}
	// a more thorough test: every extension must map back to its own
	// content type
	for ( uint32_t i = 0 ; i < numStrings ; i+=2) {
		tt = getContentTypeFromExtension ( s_ext[i] );
		if ( strcmp(tt,s_ext[i+1]) == 0 ) continue;
		g_errno = EBADENGINEER;
		s_init = false;
		// cast so the argument matches the signed 32-bit format
		return log("http: Failed to do mime table correctly. i=%" INT32 "",
			   (int32_t)i);
	}

	// TODO: set it from a user supplied file here
	return true;
}

// . appends the value of every "Set-Cookie:" line found between
//   m_firstCookie and the end of the mime (m_mimeStartPtr + m_bufLen)
//   to *sb
// . only the part of each value before the first ';' is copied, and a ';'
//   is appended after it; *sb is null terminated after each cookie
// . returns true if there is nothing to scan or the scan completed
// . returns false if a line is not terminated by \r or \n before the end
//   of the mime
// . the mime buffer is only read, never modified
bool HttpMime::addCookiesIntoBuffer ( SafeBuf *sb ) {
	// nothing to do without a mime that has at least one cookie
	if ( m_bufLen <= 0 ) return true;
	if ( ! m_mimeStartPtr ) return true;
	if ( ! m_firstCookie  ) return true;
	// length of "Set-Cookie:" INCLUDING the colon. comparing only the
	// first 10 chars would also match fields like "Set-Cookie2:" and
	// bare "Set-Cookie" lines that have no value at all.
	const int32_t fieldLen = 11;
	char *p = m_firstCookie;
	char *pend = m_mimeStartPtr + m_bufLen;
	while ( p < pend ) {
		// compute the length of the string starting at p and ending
		// at a \n or \r
		int32_t len = 0;
		while ( &p[len] < pend && p[len]!='\n' && p[len]!='\r' ) len++;
		// . if we could not find a \n or \r there was an error
		// . MIMEs must always end in \n or \r
		if ( &p[len] >= pend ) return false;
		// one past the last char of this line, points to the \r or \n
		char *lineEnd = p + len;
		// parse out some meaningful data
		if ( len >= fieldLen &&
		     strncasecmp ( p , "Set-Cookie:" , fieldLen ) == 0 ) {
			char *cookie = p + fieldLen;
			// skip a single space after the colon
			if ( cookie < lineEnd && cookie[0] == ' ' ) cookie++;
			// the cookie ends at the first ';' (attributes such
			// as path or expires follow it), at a NUL, or at the
			// end of the line, whichever comes first
			char *cookieEnd = cookie;
			while ( cookieEnd < lineEnd &&
				*cookieEnd &&
				*cookieEnd != ';' )
				cookieEnd++;
			int32_t cookieLen = cookieEnd - cookie;
			// accumulate into buffer
			sb->safeMemcpy ( cookie , cookieLen );
			sb->pushChar(';');
			sb->nullTerm();
		}
		// go to next line
		p = lineEnd;
		// skip over the cruft at the end of this line
		while ( p < pend && ( *p=='\r' || *p=='\n' ) ) p++;
	}
	return true;
}