forked from Mirrors/privacore-open-source-search-engine
proxy fixes
HttpServer.cpp

@@ -156,6 +156,14 @@ bool HttpServer::getDoc ( char *url ,
         defPort = 443;
     }
 
+    // if we are using gigablast as a squid proxy then the
+    // "fullRequest" and the url will be like "CONNECT foo.com:443 HTT..."
+    // and it is an https url, because we only use the CONNECT cmd for
+    // downloading https urls over a proxy i think
+    char *p = fullRequest;
+    if ( p && strncmp(p,"CONNECT ",8)==0 )
+        urlIsHttps = true;
+
     // if going through a proxy do not use the ssl server, it will
     // handle the encryption from itself to the host website. unfortunately
     // then the http requests/responses are unencrypted from the
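The CONNECT check added above is easy to exercise on its own. A minimal standalone sketch, where the isConnectRequest() helper and the sample requests are illustrative rather than part of the codebase:

```cpp
#include <cstdio>
#include <cstring>

// Mirrors the new check: a squid-style "CONNECT host:443 HTTP/..." request
// line implies an https target, since gb only issues CONNECT when tunneling
// https downloads through a proxy. (Helper name is hypothetical.)
static bool isConnectRequest(const char *fullRequest) {
    return fullRequest && strncmp(fullRequest, "CONNECT ", 8) == 0;
}

int main() {
    printf("%d\n", isConnectRequest("CONNECT foo.com:443 HTTP/1.1\r\n\r\n")); // 1
    printf("%d\n", isConnectRequest("GET http://foo.com/ HTTP/1.1\r\n\r\n"));  // 0
    return 0;
}
```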
@@ -3582,7 +3590,10 @@ void gotSquidProxiedUrlIp ( void *state , int32_t ip ) {
 
     // include terminating \0. well it is already i think. see
     // Msg13Request::getSize(), so no need to add +1
-    r->size_url = sqs->m_sock->m_readOffset;
+    r->size_url = sqs->m_sock->m_readOffset + 1;
+
+    // sanity
+    if ( r->ptr_url && r->ptr_url[r->size_url-1] ) { char *xx=NULL;*xx=0;}
 
     // use urlip for this, it determines what host downloads it
     r->m_firstIp = r->m_urlIp;
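The +1 above is the usual terminator accounting: m_readOffset counts only the bytes read, so size_url must also cover the trailing \0, which is exactly the invariant the new sanity check enforces (deliberate crash if the last counted byte is non-zero). A tiny sketch of the same invariant, with illustrative names:

```cpp
#include <cassert>
#include <cstring>

int main() {
    char buf[64];
    strcpy(buf, "CONNECT foo.com:443");
    int readOffset = (int)strlen(buf); // bytes read, excluding the '\0'

    // mirror of the fix: the stored size must include the terminator
    int size = readOffset + 1;

    // mirror of the sanity check: the last counted byte must be '\0'
    assert(buf[size - 1] == '\0');
    return 0;
}
```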
Mem.cpp
@@ -1178,9 +1178,9 @@ int Mem::printBreech ( int32_t i , char core ) {
     int32_t size = s_sizes[i];
     for ( int32_t j = 0 ; j < OVERPAD ; j++ ) {
         if ( (unsigned char)mem[size+j] == MAGICCHAR ) continue;
-        log(LOG_LOGIC,"mem: overrun at %"PTRFMT" "
+        log(LOG_LOGIC,"mem: overrun at 0x%"PTRFMT" (size=%"INT32")"
             "roff=%"INT32" note=%s",
-            (PTRTYPE)mem,j,&s_labels[i*16]);
+            (PTRTYPE)mem,size,j,&s_labels[i*16]);
 
         // mark it for freed mem re-use check below
         if ( ! bp ) bp = &mem[size+j];
@@ -1205,8 +1205,10 @@ int Mem::printBreech ( int32_t i , char core ) {
         }
         // now report it
         if ( mink == -1 ) continue;
-        log("mem: possible breeching buffer=%s dist=%"PTRFMT"",
+        log("mem: possible breeching buffer=%s at 0x%"PTRFMT" "
+            "breaching at offset of %"PTRFMT" bytes",
             &s_labels[mink*16],
+            (PTRTYPE)s_mptrs[mink],
             (PTRTYPE)s_mptrs[mink]-((PTRTYPE)mem+s_sizes[i]));
         flag = 1;
     }
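Both Mem.cpp hunks only enrich the log output, but the machinery around them is a classic canary scheme: every allocation carries OVERPAD trailing bytes set to MAGICCHAR, and printBreech() scans that padding for clobbered bytes. A minimal self-contained sketch of the idea, with illustrative constant values and helper names:

```cpp
#include <cstdio>
#include <cstdlib>
#include <cstring>

static const unsigned char MAGICCHAR = 0xfa; // canary byte (value illustrative)
static const int OVERPAD = 8;                // canary bytes past the user block

// allocate size bytes plus a canary-filled tail
static char *padMalloc(int size) {
    char *mem = (char *)malloc(size + OVERPAD);
    memset(mem + size, MAGICCHAR, OVERPAD);
    return mem;
}

// scan the tail and report the first clobbered canary, like printBreech()
static bool checkBreach(char *mem, int size) {
    for (int j = 0; j < OVERPAD; j++) {
        if ((unsigned char)mem[size + j] == MAGICCHAR) continue;
        printf("mem: overrun at %p (size=%d) roff=%d\n", (void *)mem, size, j);
        return true;
    }
    return false;
}

int main() {
    char *mem = padMalloc(16);
    mem[16] = 0;          // deliberate one-byte overrun into the canary
    checkBreach(mem, 16); // reports roff=0
    free(mem);
    return 0;
}
```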
Msg13.cpp
@@ -61,6 +61,7 @@ int32_t convertIntoLinks ( char *reply, int32_t replySize , Xml *xml ,
 
 static bool setProxiedUrlFromSquidProxiedRequest ( Msg13Request *r );
 static void stripProxyAuthorization ( char *squidProxiedReqBuf ) ;
+static bool addNewProxyAuthorization ( SafeBuf *req , Msg13Request *r );
 static void fixGETorPOST ( char *squidProxiedReqBuf ) ;
 static int64_t computeProxiedCacheKey64 ( Msg13Request *r ) ;
 
@@ -1030,6 +1031,17 @@ void downloadTheDocForReals3b ( Msg13Request *r ) {
     if ( r->m_isSquidProxiedUrl && ! r->m_proxyIp )
         fixGETorPOST ( exactRequest );
 
+    // ALSO ADD authorization to the NEW proxy we are sending to
+    // r->m_proxyIp/r->m_proxyPort that has a username:password
+    char tmpBuf[1024];
+    SafeBuf newReq (tmpBuf,1024);
+    if ( r->m_isSquidProxiedUrl && r->m_proxyIp ) {
+        newReq.safeStrcpy ( exactRequest );
+        addNewProxyAuthorization ( &newReq , r );
+        newReq.nullTerm();
+        exactRequest = newReq.getBufStart();
+    }
+
     // indicate start of download so we can overwrite the 0 we stored
     // into the hammercache
     r->m_downloadStartTimeMS = nowms;
@@ -3031,6 +3043,29 @@ void scanHammerQueue ( int fd , void *state ) {
     }
 }
 
+bool addNewProxyAuthorization ( SafeBuf *req , Msg13Request *r ) {
+
+    if ( ! r->m_proxyIp ) return true;
+    if ( ! r->m_proxyPort ) return true;
+
+    // get proxy from list to get username/password
+    SpiderProxy *sp = getSpiderProxyByIpPort (r->m_proxyIp,r->m_proxyPort);
+
+    // if none required, all done
+    if ( ! sp->m_usernamePwd ) return true;
+    // strange?
+    if ( req->length() < 8 ) return false;
+    // back up over final \r\n
+    req->m_length -= 2 ;
+    // insert it
+    req->safePrintf("Proxy-Authorization: Basic ");
+    req->base64Encode ( sp->m_usernamePwd );
+    req->safePrintf("\r\n");
+    req->safePrintf("\r\n");
+    req->nullTerm();
+    return true;
+}
+
 // When the Msg13Request::m_isSquidProxiedUrl bit then request we got is
 // using us like a proxy, so Msg13Request::m_url is in reality a complete
 // HTTP request mime. so in that case we have to call this code to
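addNewProxyAuthorization() backs up over the request's final \r\n, appends a Proxy-Authorization: Basic header carrying base64(username:password), then restores the blank line that terminates the mime. A self-contained sketch of the same header surgery, where the inline base64 encoder is an illustrative stand-in for SafeBuf::base64Encode():

```cpp
#include <cstdio>
#include <string>

// minimal base64, standing in for SafeBuf::base64Encode()
static std::string base64(const std::string &in) {
    static const char *tbl =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    std::string out;
    size_t i = 0;
    for ( ; i + 2 < in.size() ; i += 3 ) { // whole 3-byte groups
        unsigned v = ((unsigned char)in[i]   << 16) |
                     ((unsigned char)in[i+1] <<  8) |
                      (unsigned char)in[i+2];
        out += tbl[(v>>18)&63]; out += tbl[(v>>12)&63];
        out += tbl[(v>> 6)&63]; out += tbl[ v     &63];
    }
    if (in.size() - i == 1) {              // 1 leftover byte -> "=="
        unsigned v = (unsigned char)in[i] << 16;
        out += tbl[(v>>18)&63]; out += tbl[(v>>12)&63]; out += "==";
    } else if (in.size() - i == 2) {       // 2 leftover bytes -> "="
        unsigned v = ((unsigned char)in[i] << 16) |
                     ((unsigned char)in[i+1] << 8);
        out += tbl[(v>>18)&63]; out += tbl[(v>>12)&63];
        out += tbl[(v>> 6)&63]; out += '=';
    }
    return out;
}

int main() {
    std::string req = "GET http://foo.com/ HTTP/1.1\r\nHost: foo.com\r\n\r\n";
    std::string userPwd = "user:pwd"; // would come from SpiderProxy::m_usernamePwd

    // same order as above: back up over the final \r\n, insert the
    // header, then re-terminate the mime with a blank line
    req.resize(req.size() - 2);
    req += "Proxy-Authorization: Basic " + base64(userPwd) + "\r\n\r\n";
    printf("%s", req.c_str());
    return 0;
}
```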
SafeBuf.cpp

@@ -3525,6 +3525,9 @@ bool SafeBuf::base64Encode ( char *sx , int32_t len , int32_t niceness ) {
     return true;
 }
 
+bool SafeBuf::base64Encode( char *s ) {
+    return base64Encode(s,gbstrlen(s));
+}
 
 bool SafeBuf::base64Decode ( char *src , int32_t srcLen , int32_t niceness ) {
 
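The new overload just defaults the length to gbstrlen(s), so C-string callers like the m_usernamePwd site above need not measure the input themselves. A toy sketch of the same convenience-overload pattern (Buf is illustrative, not SafeBuf):

```cpp
#include <cstdio>
#include <cstring>

struct Buf {
    bool encode(const char *s, int len) {
        printf("encoding %d bytes\n", len);
        return true;
    }
    // convenience overload, like SafeBuf::base64Encode(char *s):
    // measure the NUL-terminated input and delegate
    bool encode(const char *s) { return encode(s, (int)strlen(s)); }
};

int main() {
    Buf b;
    b.encode("user:pwd"); // no explicit length needed
    return 0;
}
```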
SafeBuf.h

@@ -122,6 +122,8 @@ public:
     bool base64Encode ( char *s , int32_t len , int32_t niceness = 0 );
     bool base64Decode ( char *src , int32_t srcLen , int32_t niceness = 0 ) ;
+
+    bool base64Encode( char *s ) ;
 
     //bool pushLong ( int32_t val ) { return safeMemcpy((char *)&val,4); }
     bool cat(SafeBuf& c);
     // . only cat the sections/tag that start with "tagFilter"
SpiderProxy.cpp

@@ -24,47 +24,6 @@
 // . TODO: to prevent host #0 from getting too slammed we can also recruit
 // other hosts to act just like host #0.
 
-// host #0 breaks Conf::m_spiderIps safebuf into an array of
-// SpiderProxy classes and saves to disk as spoderproxies.dat to ensure
-// persistence
-class SpiderProxy {
- public:
-    // ip/port of the spider proxy
-    int32_t m_ip;
-    uint16_t m_port;
-    // last time we attempted to download the test url through this proxy
-    int64_t m_lastDownloadTestAttemptMS;
-    // use -1 to indicate timed out when downloading test url
-    int32_t m_lastDownloadTookMS;
-    // 0 means none... use mstrerror()
-    int32_t m_lastDownloadError;
-    // use -1 to indicate never
-    int64_t m_lastSuccessfulTestMS;
-
-    // how many times have we told a requesting host to use this proxy
-    // to download their url with.
-    int32_t m_numDownloadRequests;
-
-    // how many are outstanding? everytime a host requests a proxyip
-    // it also tells us its outstanding counts for each proxy ip
-    // so we can ensure this is accurate even though a host may die
-    // and come back up.
-    int32_t m_numOutstandingDownloads;
-
-    // waiting on test url to be downloaded
-    bool m_isWaiting;
-
-    int64_t m_timesUsed;
-
-    int32_t m_lastBytesDownloaded;
-
-    // special things used by LoadBucket algo to determine which
-    // SpiderProxy to use to download from a particular IP
-    int32_t m_countForThisIp;
-    int64_t m_lastTimeUsedForThisIp;
-
-    char m_usernamePwd[MAXUSERNAMEPWD];
-};
-
 // hashtable that maps an ip:port key (64-bits) to a SpiderProxy
 static HashTableX s_iptab;
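The surviving context notes that s_iptab maps a 64-bit ip:port key to a SpiderProxy. One plausible way to pack such a key, though the actual key construction in gigablast may differ:

```cpp
#include <cstdint>
#include <cstdio>

// illustrative packing: ip in the high bits, port in the low 16
static uint64_t ipPortKey(int32_t ip, uint16_t port) {
    return ((uint64_t)(uint32_t)ip << 16) | (uint64_t)port;
}

int main() {
    // 127.0.0.1 (0x7f000001) : 3128 -> 0x7f0000010c38
    printf("key=0x%llx\n", (unsigned long long)ipPortKey(0x7f000001, 3128));
    return 0;
}
```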
@@ -1081,3 +1040,14 @@ bool initSpiderProxyStuff() {
 
 }
 
+SpiderProxy *getSpiderProxyByIpPort ( int32_t ip , uint16_t port ) {
+    for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
+        // skip empty slots
+        if ( ! s_iptab.m_flags[i] ) continue;
+        SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
+        if ( sp->m_ip != ip ) continue;
+        if ( sp->m_port != port ) continue;
+        return sp;
+    }
+    return NULL;
+}
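One design note on getSpiderProxyByIpPort(): although s_iptab is a hashtable, the new lookup simply walks every slot and compares m_ip/m_port directly instead of hashing an ip:port key, which keeps the code trivial and costs little given the small number of proxies a typical config lists.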
SpiderProxy.h

@@ -19,6 +19,51 @@ bool resetProxyStats ( ) ;
 // save stats on the spider proxies if any
 bool saveSpiderProxyStats();
 
+#define MAXUSERNAMEPWD 64
+
+// host #0 breaks Conf::m_spiderIps safebuf into an array of
+// SpiderProxy classes and saves to disk as spoderproxies.dat to ensure
+// persistence
+class SpiderProxy {
+ public:
+    // ip/port of the spider proxy
+    int32_t m_ip;
+    uint16_t m_port;
+    // last time we attempted to download the test url through this proxy
+    int64_t m_lastDownloadTestAttemptMS;
+    // use -1 to indicate timed out when downloading test url
+    int32_t m_lastDownloadTookMS;
+    // 0 means none... use mstrerror()
+    int32_t m_lastDownloadError;
+    // use -1 to indicate never
+    int64_t m_lastSuccessfulTestMS;
+
+    // how many times have we told a requesting host to use this proxy
+    // to download their url with.
+    int32_t m_numDownloadRequests;
+
+    // how many are outstanding? everytime a host requests a proxyip
+    // it also tells us its outstanding counts for each proxy ip
+    // so we can ensure this is accurate even though a host may die
+    // and come back up.
+    int32_t m_numOutstandingDownloads;
+
+    // waiting on test url to be downloaded
+    bool m_isWaiting;
+
+    int64_t m_timesUsed;
+
+    int32_t m_lastBytesDownloaded;
+
+    // special things used by LoadBucket algo to determine which
+    // SpiderProxy to use to download from a particular IP
+    int32_t m_countForThisIp;
+    int64_t m_lastTimeUsedForThisIp;
+
+    char m_usernamePwd[MAXUSERNAMEPWD];
+};
+
+class SpiderProxy *getSpiderProxyByIpPort ( int32_t ip , uint16_t port ) ;
+
 // value for m_opCode. get a proxy to use from host #0:
 #define OP_GETPROXY 1
@@ -38,8 +83,6 @@ bool saveSpiderProxyStats();
 //    char m_opCode;
 //};
 
-#define MAXUSERNAMEPWD 64
-
 // host #0 gives us a proxy to use:
 class ProxyReply {
  public:
XmlDoc.cpp

@@ -15272,6 +15272,14 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
         m_diffbotUrl.urlEncode(p1);
         *p2 = c;
     }
+
+    // now so it works just give it a proxy directly, so it doesn't
+    // have to go through gb.
+    // if ( useProxies ) {
+    //     // msg13 typically uses this to get an unbanned proxy
+    //     getProxiesToUse();
+    // }
+
     // if we use proxies then increase the timeout since proxies
     // increase the crawl delay in hopes of backing off to discover
     // the website's policy so we don't hit it too hard and get banned.