Merge branch 'testing'
commit 4723d9eefa
HttpServer.cpp
@@ -156,6 +156,14 @@ bool HttpServer::getDoc ( char *url ,
  defPort = 443;
  }
+
+ // if we are using gigablast as a squid proxy then the
+ // "fullRequest" and the url will be like "CONNECT foo.com:443 HTT..."
+ // and it is an https url, because we only use the CONNECT cmd for
+ // downloading https urls over a proxy i think
+ char *p = fullRequest;
+ if ( p && strncmp(p,"CONNECT ",8)==0 )
+ urlIsHttps = true;
 
  // if going through a proxy do not use the ssl server, it will
  // handle the encryption from itself to the host website. unfortunately
  // then the http requests/responses are unencrypted from the
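For reference, a squid-style client opens an https tunnel through a proxy with a request of roughly this shape (values illustrative; only the leading "CONNECT " token is what the check above matches):

    CONNECT foo.com:443 HTTP/1.1
    Host: foo.com:443

so strncmp(p,"CONNECT ",8) is enough to flag the download as an https url.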
@@ -3582,7 +3590,10 @@ void gotSquidProxiedUrlIp ( void *state , int32_t ip ) {
 
+ // include terminating \0. well it is already i think. see
+ // Msg13Request::getSize(), so no need to add +1
- r->size_url = sqs->m_sock->m_readOffset;
+ r->size_url = sqs->m_sock->m_readOffset + 1;
 
  // sanity
  if ( r->ptr_url && r->ptr_url[r->size_url-1] ) { char *xx=NULL;*xx=0;}
 
+ // use urlip for this, it determines what host downloads it
  r->m_firstIp = r->m_urlIp;
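The +1 makes size_url cover the request's terminating NUL, which is exactly the invariant the sanity line then asserts. A minimal standalone sketch of that invariant (illustrative names, not the Msg13 code itself; it assumes the caller appended a NUL after the readOffset bytes read off the socket):

    #include <cassert>

    // "readOffset" bytes were read off the socket and a NUL was appended,
    // so a size that includes the NUL is readOffset + 1.
    static void setUrl ( const char *buf , int readOffset ,
                         const char *&ptr_url , int &size_url ) {
        ptr_url  = buf;
        size_url = readOffset + 1;               // include terminating \0
        assert ( ptr_url[size_url-1] == '\0' );  // same check as the sanity line
    }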
 Mem.cpp | 8
@@ -1178,9 +1178,9 @@ int Mem::printBreech ( int32_t i , char core ) {
  int32_t size = s_sizes[i];
  for ( int32_t j = 0 ; j < OVERPAD ; j++ ) {
  if ( (unsigned char)mem[size+j] == MAGICCHAR ) continue;
- log(LOG_LOGIC,"mem: overrun at %"PTRFMT" "
+ log(LOG_LOGIC,"mem: overrun at 0x%"PTRFMT" (size=%"INT32")"
  "roff=%"INT32" note=%s",
- (PTRTYPE)mem,j,&s_labels[i*16]);
+ (PTRTYPE)mem,size,j,&s_labels[i*16]);
 
  // mark it for freed mem re-use check below
  if ( ! bp ) bp = &mem[size+j];
@@ -1205,8 +1205,10 @@ int Mem::printBreech ( int32_t i , char core ) {
  }
  // now report it
  if ( mink == -1 ) continue;
- log("mem: possible breeching buffer=%s dist=%"PTRFMT"",
+ log("mem: possible breeching buffer=%s at 0x%"PTRFMT" "
+ "breaching at offset of %"PTRFMT" bytes",
  &s_labels[mink*16],
+ (PTRTYPE)s_mptrs[mink],
  (PTRTYPE)s_mptrs[mink]-((PTRTYPE)mem+s_sizes[i]));
  flag = 1;
  }
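The overrun scan above relies on guard bytes: each allocation carries OVERPAD extra bytes set to MAGICCHAR, and any pad byte that no longer holds that value implicates a buffer overrun. A minimal standalone sketch of the same idea (the OVERPAD/MAGICCHAR values here are illustrative, not taken from Mem.cpp):

    #include <cstdlib>
    #include <cstring>

    #define OVERPAD   8      // illustrative pad size
    #define MAGICCHAR 0xBD   // illustrative guard value

    // allocate "size" usable bytes plus a guard region after them
    static char *guardedAlloc ( int size ) {
        char *mem = (char *)malloc ( size + OVERPAD );
        if ( ! mem ) return NULL;
        memset ( mem + size , MAGICCHAR , OVERPAD );
        return mem;
    }

    // return the offset of the first corrupted guard byte, or -1 if intact
    static int checkGuard ( char *mem , int size ) {
        for ( int j = 0 ; j < OVERPAD ; j++ ) {
            if ( (unsigned char)mem[size+j] == MAGICCHAR ) continue;
            return j; // an overrun reached this pad byte
        }
        return -1;
    }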
 Msg13.cpp | 35
@@ -61,6 +61,7 @@ int32_t convertIntoLinks ( char *reply, int32_t replySize , Xml *xml ,
 
  static bool setProxiedUrlFromSquidProxiedRequest ( Msg13Request *r );
  static void stripProxyAuthorization ( char *squidProxiedReqBuf ) ;
+ static bool addNewProxyAuthorization ( SafeBuf *req , Msg13Request *r );
  static void fixGETorPOST ( char *squidProxiedReqBuf ) ;
  static int64_t computeProxiedCacheKey64 ( Msg13Request *r ) ;
 
@@ -1030,6 +1031,17 @@ void downloadTheDocForReals3b ( Msg13Request *r ) {
  if ( r->m_isSquidProxiedUrl && ! r->m_proxyIp )
  fixGETorPOST ( exactRequest );
 
+ // ALSO ADD authorization to the NEW proxy we are sending to
+ // r->m_proxyIp/r->m_proxyPort that has a username:password
+ char tmpBuf[1024];
+ SafeBuf newReq (tmpBuf,1024);
+ if ( r->m_isSquidProxiedUrl && r->m_proxyIp ) {
+ newReq.safeStrcpy ( exactRequest );
+ addNewProxyAuthorization ( &newReq , r );
+ newReq.nullTerm();
+ exactRequest = newReq.getBufStart();
+ }
+
  // indicate start of download so we can overwrite the 0 we stored
  // into the hammercache
  r->m_downloadStartTimeMS = nowms;
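The tmpBuf/SafeBuf pairing above appears to be the stack-first pattern used elsewhere in this codebase: the SafeBuf writes into the caller's 1KB stack buffer and only moves to heap memory if the rewritten request outgrows it, which is why exactRequest is re-pointed at getBufStart() afterwards. A hedged restatement with that assumption spelled out in comments (semantics inferred from the usage here, not checked against SafeBuf.h):

    char tmpBuf[1024];
    SafeBuf newReq ( tmpBuf , 1024 );          // assumed: uses tmpBuf until it outgrows it
    newReq.safeStrcpy ( exactRequest );        // copy the original proxied request
    addNewProxyAuthorization ( &newReq , r );  // splice in a Proxy-Authorization header
    newReq.nullTerm();
    exactRequest = newReq.getBufStart();       // may point at tmpBuf or at heap memory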
@@ -3031,6 +3043,29 @@ void scanHammerQueue ( int fd , void *state ) {
  }
  }
 
+ bool addNewProxyAuthorization ( SafeBuf *req , Msg13Request *r ) {
+
+ if ( ! r->m_proxyIp ) return true;
+ if ( ! r->m_proxyPort ) return true;
+
+ // get proxy from list to get username/password
+ SpiderProxy *sp = getSpiderProxyByIpPort (r->m_proxyIp,r->m_proxyPort);
+
+ // if none required, all done
+ if ( ! sp->m_usernamePwd ) return true;
+ // strange?
+ if ( req->length() < 8 ) return false;
+ // back up over final \r\n
+ req->m_length -= 2 ;
+ // insert it
+ req->safePrintf("Proxy-Authorization: Basic ");
+ req->base64Encode ( sp->m_usernamePwd );
+ req->safePrintf("\r\n");
+ req->safePrintf("\r\n");
+ req->nullTerm();
+ return true;
+ }
+
  // When the Msg13Request::m_isSquidProxiedUrl bit then request we got is
  // using us like a proxy, so Msg13Request::m_url is in reality a complete
  // HTTP request mime. so in that case we have to call this code to
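The header spliced in here is standard HTTP basic proxy authentication: the stored "username:password" string is base64-encoded after "Proxy-Authorization: Basic ". For example, if m_usernamePwd held "user:pass" (an illustrative value), the rewritten request would end with:

    Proxy-Authorization: Basic dXNlcjpwYXNz

followed by a blank line, since the code backs up over the request's final \r\n, appends the new header line, and then re-appends the two \r\n sequences that terminate the mime.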
SafeBuf.cpp
@@ -3525,6 +3525,9 @@ bool SafeBuf::base64Encode ( char *sx , int32_t len , int32_t niceness ) {
  return true;
  }
 
+ bool SafeBuf::base64Encode( char *s ) {
+ return base64Encode(s,gbstrlen(s));
+ }
 
  bool SafeBuf::base64Decode ( char *src , int32_t srcLen , int32_t niceness ) {
 
SafeBuf.h
@@ -122,6 +122,8 @@ public:
  bool base64Encode ( char *s , int32_t len , int32_t niceness = 0 );
  bool base64Decode ( char *src , int32_t srcLen , int32_t niceness = 0 ) ;
 
+ bool base64Encode( char *s ) ;
+
  //bool pushLong ( int32_t val ) { return safeMemcpy((char *)&val,4); }
  bool cat(SafeBuf& c);
  // . only cat the sections/tag that start with "tagFilter"
SpiderProxy.cpp
@@ -24,47 +24,6 @@
  // . TODO: to prevent host #0 from getting too slammed we can also recruit
  // other hosts to act just like host #0.
 
- // host #0 breaks Conf::m_spiderIps safebuf into an array of
- // SpiderProxy classes and saves to disk as spoderproxies.dat to ensure
- // persistence
- class SpiderProxy {
- public:
- // ip/port of the spider proxy
- int32_t m_ip;
- uint16_t m_port;
- // last time we attempted to download the test url through this proxy
- int64_t m_lastDownloadTestAttemptMS;
- // use -1 to indicate timed out when downloading test url
- int32_t m_lastDownloadTookMS;
- // 0 means none... use mstrerror()
- int32_t m_lastDownloadError;
- // use -1 to indicate never
- int64_t m_lastSuccessfulTestMS;
-
- // how many times have we told a requesting host to use this proxy
- // to download their url with.
- int32_t m_numDownloadRequests;
-
- // how many are outstanding? everytime a host requests a proxyip
- // it also tells us its outstanding counts for each proxy ip
- // so we can ensure this is accurate even though a host may die
- // and come back up.
- int32_t m_numOutstandingDownloads;
-
- // waiting on test url to be downloaded
- bool m_isWaiting;
-
- int64_t m_timesUsed;
-
- int32_t m_lastBytesDownloaded;
-
- // special things used by LoadBucket algo to determine which
- // SpiderProxy to use to download from a particular IP
- int32_t m_countForThisIp;
- int64_t m_lastTimeUsedForThisIp;
-
- char m_usernamePwd[MAXUSERNAMEPWD];
- };
 
  // hashtable that maps an ip:port key (64-bits) to a SpiderProxy
  static HashTableX s_iptab;
@@ -1081,3 +1040,14 @@ bool initSpiderProxyStuff() {
 
  }
 
+ SpiderProxy *getSpiderProxyByIpPort ( int32_t ip , uint16_t port ) {
+ for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
+ // skip empty slots
+ if ( ! s_iptab.m_flags[i] ) continue;
+ SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
+ if ( sp->m_ip != ip ) continue;
+ if ( sp->m_port != port ) continue;
+ return sp;
+ }
+ return NULL;
+ }
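A hedged caller-side sketch of the new lookup, as it is used from Msg13.cpp: the function walks every slot of s_iptab and returns NULL when the ip/port pair is no longer configured, so callers generally want a NULL check before touching the proxy (the guard lines below are illustrative, not part of this commit):

    SpiderProxy *sp = getSpiderProxyByIpPort ( r->m_proxyIp , r->m_proxyPort );
    if ( ! sp ) return true;                   // proxy no longer in s_iptab
    if ( ! sp->m_usernamePwd[0] ) return true; // no credentials configured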
SpiderProxy.h
@@ -19,6 +19,51 @@ bool resetProxyStats ( ) ;
  // save stats on the spider proxies if any
  bool saveSpiderProxyStats();
 
+ #define MAXUSERNAMEPWD 64
+
+ // host #0 breaks Conf::m_spiderIps safebuf into an array of
+ // SpiderProxy classes and saves to disk as spoderproxies.dat to ensure
+ // persistence
+ class SpiderProxy {
+ public:
+ // ip/port of the spider proxy
+ int32_t m_ip;
+ uint16_t m_port;
+ // last time we attempted to download the test url through this proxy
+ int64_t m_lastDownloadTestAttemptMS;
+ // use -1 to indicate timed out when downloading test url
+ int32_t m_lastDownloadTookMS;
+ // 0 means none... use mstrerror()
+ int32_t m_lastDownloadError;
+ // use -1 to indicate never
+ int64_t m_lastSuccessfulTestMS;
+
+ // how many times have we told a requesting host to use this proxy
+ // to download their url with.
+ int32_t m_numDownloadRequests;
+
+ // how many are outstanding? everytime a host requests a proxyip
+ // it also tells us its outstanding counts for each proxy ip
+ // so we can ensure this is accurate even though a host may die
+ // and come back up.
+ int32_t m_numOutstandingDownloads;
+
+ // waiting on test url to be downloaded
+ bool m_isWaiting;
+
+ int64_t m_timesUsed;
+
+ int32_t m_lastBytesDownloaded;
+
+ // special things used by LoadBucket algo to determine which
+ // SpiderProxy to use to download from a particular IP
+ int32_t m_countForThisIp;
+ int64_t m_lastTimeUsedForThisIp;
+
+ char m_usernamePwd[MAXUSERNAMEPWD];
+ };
+
+ class SpiderProxy *getSpiderProxyByIpPort ( int32_t ip , uint16_t port ) ;
 
  // value for m_opCode. get a proxy to use from host #0:
  #define OP_GETPROXY 1
@@ -38,8 +83,6 @@ bool saveSpiderProxyStats();
  // char m_opCode;
  //};
 
- #define MAXUSERNAMEPWD 64
-
  // host #0 gives us a proxy to use:
  class ProxyReply {
  public:
 XmlDoc.cpp | 15
@@ -15237,6 +15237,13 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
  if ( cr->m_forceUseFloaters ) useProxies = true;
  // we gotta have some proxy ips that we can use
  if ( ! g_conf.m_proxyIps.hasDigits() ) useProxies = false;
+
+ // until we fix https CONNECT support for https urls diffbot can't
+ // go through gb. we should fix that by downloading the whole page
+ // ourselves and sending it back, and tell diffbot's phantomjs not
+ // to do the certificate check.
+ useProxies = false;
+
  // if we used a proxy to download the doc, then diffbot should too
  // BUT tell diffbot to go through host #0 so we can send it to the
  // correct proxy using our load balancing & backoff algos.
@@ -15272,6 +15279,14 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
  m_diffbotUrl.urlEncode(p1);
  *p2 = c;
  }
+
+ // now so it works just give it a proxy directly, so it doesn't
+ // have to go through gb.
+ // if ( useProxies ) {
+ // // msg13 typically uses this to get an unbanned proxy
+ // getProxiesToUse();
+ // }
+
  // if we use proxies then increase the timeout since proxies
  // increase the crawl delay in hopes of backing off to discover
  // the website's policy so we don't hit it too hard and get banned.
 main.cpp | 41
@@ -165,8 +165,16 @@ static int32_t dumpSpiderdb ( char *coll,int32_t sfn,int32_t numFiles,bool inclu
  char printStats , int32_t firstIp );
  static void dumpSectiondb( char *coll,int32_t sfn,int32_t numFiles,bool includeTree);
  static void dumpRevdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree);
- static void dumpTagdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
- int32_t c, char rec=0, int32_t rdbId = RDB_TAGDB );
+
+ static void dumpTagdb ( char *coll,
+ int32_t sfn,
+ int32_t numFiles,
+ bool includeTree,
+ int32_t c,
+ char rec=0,
+ int32_t rdbId = RDB_TAGDB ,
+ char *site = NULL );
+
  static void dumpIndexdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
  int64_t termId ) ;
  void dumpPosdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
@@ -2858,11 +2866,24 @@ int main2 ( int argc , char *argv[] ) {
  dumpSectiondb(coll,startFileNum,numFiles,includeTree);
  else if ( argv[cmdarg+1][0] == 'V' )
  dumpRevdb(coll,startFileNum,numFiles,includeTree);
- else if ( argv[cmdarg+1][0] == 'S' )
- dumpTagdb (coll,startFileNum,numFiles,includeTree,0);
- else if ( argv[cmdarg+1][0] == 'z' )
+ else if ( argv[cmdarg+1][0] == 'S' ) {
+ char *site = NULL;
+ if ( cmdarg+6 < argc ) site = argv[cmdarg+6];
+ dumpTagdb(coll,
+ startFileNum,
+ numFiles,
+ includeTree,
+ 0,
+ 0,
+ RDB_TAGDB,
+ site);
+ }
+ else if ( argv[cmdarg+1][0] == 'z' ) {
+ char *site = NULL;
+ if ( cmdarg+6 < argc ) site = argv[cmdarg+6];
  dumpTagdb (coll,startFileNum,numFiles,includeTree,0,
- 'z');
+ 'z',RDB_TAGDB,site);
+ }
  else if ( argv[cmdarg+1][0] == 'A' )
  dumpTagdb (coll,startFileNum,numFiles,includeTree,0,
  'A');
@@ -11956,7 +11977,8 @@ void dumpRevdb(char *coll,int32_t startFileNum,int32_t numFiles, bool includeTre
 
  void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
  bool includeTree,
- int32_t c , char req, int32_t rdbId ) {
+ int32_t c , char req, int32_t rdbId ,
+ char *siteArg ) {
  //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
  g_tagdb.init ();
  //g_collectiondb.init(true);
@@ -11966,6 +11988,11 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
  key128_t endKey ;
  startKey.setMin();
  endKey.setMax();
+ if ( siteArg ) {
+ startKey = g_tagdb.makeStartKey ( siteArg );
+ endKey = g_tagdb.makeEndKey ( siteArg );
+ log("gb: using site %s for start key",siteArg );
+ }
  // turn off threads
  g_threads.disableThreads();
  // get a meg at a time
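With this change the tagdb dump can be restricted to one site: when a site is passed as argv[cmdarg+6], startKey/endKey are narrowed with g_tagdb.makeStartKey()/makeEndKey() instead of spanning the whole key range. Assuming the existing positional order (collection, start file, number of files, include-tree flag in cmdarg+2 through cmdarg+5), an invocation would look roughly like:

    ./gb dump S main 0 -1 1 example.com

where "example.com" is an illustrative site; without a trailing site the dump scans the full key range as before.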
 sitelinks.txt | 1418
File diff suppressed because it is too large