forked from Mirrors/privacore-open-source-search-engine
proxy fixes
HttpServer.cpp

@@ -156,6 +156,14 @@ bool HttpServer::getDoc ( char *url ,
         defPort = 443;
     }
 
+    // if we are using gigablast as a squid proxy then the
+    // "fullRequest" and the url will be like "CONNECT foo.com:443 HTT..."
+    // and it is an https url, because we only use the CONNECT cmd for
+    // downloading https urls over a proxy i think
+    char *p = fullRequest;
+    if ( p && strncmp(p,"CONNECT ",8)==0 )
+        urlIsHttps = true;
+
     // if going through a proxy do not use the ssl server, it will
     // handle the encryption from itself to the host website. unfortunately
     // then the http requests/responses are unencrypted from the
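The CONNECT check added above is easy to exercise on its own. A minimal standalone sketch, where the isConnectRequest() helper and the sample requests are illustrative rather than part of the codebase:

```cpp
#include <cstdio>
#include <cstring>

// Mirrors the new check: a squid-style "CONNECT host:443 HTTP/..." request
// line implies an https target, since gb only issues CONNECT when tunneling
// https downloads through a proxy. (Helper name is hypothetical.)
static bool isConnectRequest(const char *fullRequest) {
    return fullRequest && strncmp(fullRequest, "CONNECT ", 8) == 0;
}

int main() {
    printf("%d\n", isConnectRequest("CONNECT foo.com:443 HTTP/1.1\r\n\r\n")); // 1
    printf("%d\n", isConnectRequest("GET http://foo.com/ HTTP/1.1\r\n\r\n"));  // 0
    return 0;
}
```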
@@ -3582,7 +3590,10 @@ void gotSquidProxiedUrlIp ( void *state , int32_t ip ) {
 
     // include terminating \0. well it is already i think. see
     // Msg13Request::getSize(), so no need to add +1
-    r->size_url = sqs->m_sock->m_readOffset;
+    r->size_url = sqs->m_sock->m_readOffset + 1;
+
+    // sanity
+    if ( r->ptr_url && r->ptr_url[r->size_url-1] ) { char *xx=NULL;*xx=0;}
 
     // use urlip for this, it determines what host downloads it
     r->m_firstIp = r->m_urlIp;
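The +1 above is the usual terminator accounting: m_readOffset counts only the bytes read, so size_url must also cover the trailing \0, which is exactly the invariant the new sanity check enforces (deliberate crash if the last counted byte is non-zero). A tiny sketch of the same invariant, with illustrative names:

```cpp
#include <cassert>
#include <cstring>

int main() {
    char buf[64];
    strcpy(buf, "CONNECT foo.com:443");
    int readOffset = (int)strlen(buf); // bytes read, excluding the '\0'

    // mirror of the fix: the stored size must include the terminator
    int size = readOffset + 1;

    // mirror of the sanity check: the last counted byte must be '\0'
    assert(buf[size - 1] == '\0');
    return 0;
}
```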
Mem.cpp
@@ -1178,9 +1178,9 @@ int Mem::printBreech ( int32_t i , char core ) {
     int32_t size = s_sizes[i];
     for ( int32_t j = 0 ; j < OVERPAD ; j++ ) {
         if ( (unsigned char)mem[size+j] == MAGICCHAR ) continue;
-        log(LOG_LOGIC,"mem: overrun at %"PTRFMT" "
+        log(LOG_LOGIC,"mem: overrun at 0x%"PTRFMT" (size=%"INT32")"
             "roff=%"INT32" note=%s",
-            (PTRTYPE)mem,j,&s_labels[i*16]);
+            (PTRTYPE)mem,size,j,&s_labels[i*16]);
 
         // mark it for freed mem re-use check below
         if ( ! bp ) bp = &mem[size+j];
@@ -1205,8 +1205,10 @@ int Mem::printBreech ( int32_t i , char core ) {
         }
         // now report it
         if ( mink == -1 ) continue;
-        log("mem: possible breeching buffer=%s dist=%"PTRFMT"",
+        log("mem: possible breeching buffer=%s at 0x%"PTRFMT" "
+            "breaching at offset of %"PTRFMT" bytes",
             &s_labels[mink*16],
+            (PTRTYPE)s_mptrs[mink],
             (PTRTYPE)s_mptrs[mink]-((PTRTYPE)mem+s_sizes[i]));
         flag = 1;
     }
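Both Mem.cpp hunks only enrich the log output, but the machinery around them is a classic canary scheme: every allocation carries OVERPAD trailing bytes set to MAGICCHAR, and printBreech() scans that padding for clobbered bytes. A minimal self-contained sketch of the idea, with illustrative constant values and helper names:

```cpp
#include <cstdio>
#include <cstdlib>
#include <cstring>

static const unsigned char MAGICCHAR = 0xfa; // canary byte (value illustrative)
static const int OVERPAD = 8;                // canary bytes past the user block

// allocate size bytes plus a canary-filled tail
static char *padMalloc(int size) {
    char *mem = (char *)malloc(size + OVERPAD);
    memset(mem + size, MAGICCHAR, OVERPAD);
    return mem;
}

// scan the tail and report the first clobbered canary, like printBreech()
static bool checkBreach(char *mem, int size) {
    for (int j = 0; j < OVERPAD; j++) {
        if ((unsigned char)mem[size + j] == MAGICCHAR) continue;
        printf("mem: overrun at %p (size=%d) roff=%d\n", (void *)mem, size, j);
        return true;
    }
    return false;
}

int main() {
    char *mem = padMalloc(16);
    mem[16] = 0;          // deliberate one-byte overrun into the canary
    checkBreach(mem, 16); // reports roff=0
    free(mem);
    return 0;
}
```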
Msg13.cpp
@@ -61,6 +61,7 @@ int32_t convertIntoLinks ( char *reply, int32_t replySize , Xml *xml ,
 
 static bool setProxiedUrlFromSquidProxiedRequest ( Msg13Request *r );
 static void stripProxyAuthorization ( char *squidProxiedReqBuf ) ;
+static bool addNewProxyAuthorization ( SafeBuf *req , Msg13Request *r );
 static void fixGETorPOST ( char *squidProxiedReqBuf ) ;
 static int64_t computeProxiedCacheKey64 ( Msg13Request *r ) ;
 
@@ -1030,6 +1031,17 @@ void downloadTheDocForReals3b ( Msg13Request *r ) {
     if ( r->m_isSquidProxiedUrl && ! r->m_proxyIp )
         fixGETorPOST ( exactRequest );
 
+    // ALSO ADD authorization to the NEW proxy we are sending to
+    // r->m_proxyIp/r->m_proxyPort that has a username:password
+    char tmpBuf[1024];
+    SafeBuf newReq (tmpBuf,1024);
+    if ( r->m_isSquidProxiedUrl && r->m_proxyIp ) {
+        newReq.safeStrcpy ( exactRequest );
+        addNewProxyAuthorization ( &newReq , r );
+        newReq.nullTerm();
+        exactRequest = newReq.getBufStart();
+    }
+
     // indicate start of download so we can overwrite the 0 we stored
     // into the hammercache
     r->m_downloadStartTimeMS = nowms;
@@ -3031,6 +3043,29 @@ void scanHammerQueue ( int fd , void *state ) {
     }
 }
 
+bool addNewProxyAuthorization ( SafeBuf *req , Msg13Request *r ) {
+
+    if ( ! r->m_proxyIp ) return true;
+    if ( ! r->m_proxyPort ) return true;
+
+    // get proxy from list to get username/password
+    SpiderProxy *sp = getSpiderProxyByIpPort (r->m_proxyIp,r->m_proxyPort);
+
+    // if none required, all done
+    if ( ! sp->m_usernamePwd ) return true;
+    // strange?
+    if ( req->length() < 8 ) return false;
+    // back up over final \r\n
+    req->m_length -= 2 ;
+    // insert it
+    req->safePrintf("Proxy-Authorization: Basic ");
+    req->base64Encode ( sp->m_usernamePwd );
+    req->safePrintf("\r\n");
+    req->safePrintf("\r\n");
+    req->nullTerm();
+    return true;
+}
+
 // When the Msg13Request::m_isSquidProxiedUrl bit then request we got is
 // using us like a proxy, so Msg13Request::m_url is in reality a complete
 // HTTP request mime. so in that case we have to call this code to
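addNewProxyAuthorization() backs up over the request's final \r\n, appends a Proxy-Authorization: Basic header carrying base64(username:password), then restores the blank line that terminates the mime. A self-contained sketch of the same header surgery, where the inline base64 encoder is an illustrative stand-in for SafeBuf::base64Encode():

```cpp
#include <cstdio>
#include <string>

// minimal base64, standing in for SafeBuf::base64Encode()
static std::string base64(const std::string &in) {
    static const char *tbl =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    std::string out;
    size_t i = 0;
    for ( ; i + 2 < in.size() ; i += 3 ) { // whole 3-byte groups
        unsigned v = ((unsigned char)in[i]   << 16) |
                     ((unsigned char)in[i+1] <<  8) |
                      (unsigned char)in[i+2];
        out += tbl[(v>>18)&63]; out += tbl[(v>>12)&63];
        out += tbl[(v>> 6)&63]; out += tbl[ v     &63];
    }
    if (in.size() - i == 1) {              // 1 leftover byte -> "=="
        unsigned v = (unsigned char)in[i] << 16;
        out += tbl[(v>>18)&63]; out += tbl[(v>>12)&63]; out += "==";
    } else if (in.size() - i == 2) {       // 2 leftover bytes -> "="
        unsigned v = ((unsigned char)in[i] << 16) |
                     ((unsigned char)in[i+1] << 8);
        out += tbl[(v>>18)&63]; out += tbl[(v>>12)&63];
        out += tbl[(v>> 6)&63]; out += '=';
    }
    return out;
}

int main() {
    std::string req = "GET http://foo.com/ HTTP/1.1\r\nHost: foo.com\r\n\r\n";
    std::string userPwd = "user:pwd"; // would come from SpiderProxy::m_usernamePwd

    // same order as above: back up over the final \r\n, insert the
    // header, then re-terminate the mime with a blank line
    req.resize(req.size() - 2);
    req += "Proxy-Authorization: Basic " + base64(userPwd) + "\r\n\r\n";
    printf("%s", req.c_str());
    return 0;
}
```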
SafeBuf.cpp

@@ -3525,6 +3525,9 @@ bool SafeBuf::base64Encode ( char *sx , int32_t len , int32_t niceness ) {
     return true;
 }
 
+bool SafeBuf::base64Encode( char *s ) {
+    return base64Encode(s,gbstrlen(s));
+}
 
 bool SafeBuf::base64Decode ( char *src , int32_t srcLen , int32_t niceness ) {
 
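The new overload just defaults the length to gbstrlen(s), so C-string callers like the m_usernamePwd site above need not measure the input themselves. A toy sketch of the same convenience-overload pattern (Buf is illustrative, not SafeBuf):

```cpp
#include <cstdio>
#include <cstring>

struct Buf {
    bool encode(const char *s, int len) {
        printf("encoding %d bytes\n", len);
        return true;
    }
    // convenience overload, like SafeBuf::base64Encode(char *s):
    // measure the NUL-terminated input and delegate
    bool encode(const char *s) { return encode(s, (int)strlen(s)); }
};

int main() {
    Buf b;
    b.encode("user:pwd"); // no explicit length needed
    return 0;
}
```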
SafeBuf.h

@@ -122,6 +122,8 @@ public:
     bool base64Encode ( char *s , int32_t len , int32_t niceness = 0 );
     bool base64Decode ( char *src , int32_t srcLen , int32_t niceness = 0 ) ;
+
+    bool base64Encode( char *s ) ;
 
     //bool pushLong ( int32_t val ) { return safeMemcpy((char *)&val,4); }
     bool cat(SafeBuf& c);
     // . only cat the sections/tag that start with "tagFilter"
SpiderProxy.cpp

@@ -24,47 +24,6 @@
 // . TODO: to prevent host #0 from getting too slammed we can also recruit
 // other hosts to act just like host #0.
 
-// host #0 breaks Conf::m_spiderIps safebuf into an array of
-// SpiderProxy classes and saves to disk as spoderproxies.dat to ensure
-// persistence
-class SpiderProxy {
- public:
-    // ip/port of the spider proxy
-    int32_t m_ip;
-    uint16_t m_port;
-    // last time we attempted to download the test url through this proxy
-    int64_t m_lastDownloadTestAttemptMS;
-    // use -1 to indicate timed out when downloading test url
-    int32_t m_lastDownloadTookMS;
-    // 0 means none... use mstrerror()
-    int32_t m_lastDownloadError;
-    // use -1 to indicate never
-    int64_t m_lastSuccessfulTestMS;
-
-    // how many times have we told a requesting host to use this proxy
-    // to download their url with.
-    int32_t m_numDownloadRequests;
-
-    // how many are outstanding? everytime a host requests a proxyip
-    // it also tells us its outstanding counts for each proxy ip
-    // so we can ensure this is accurate even though a host may die
-    // and come back up.
-    int32_t m_numOutstandingDownloads;
-
-    // waiting on test url to be downloaded
-    bool m_isWaiting;
-
-    int64_t m_timesUsed;
-
-    int32_t m_lastBytesDownloaded;
-
-    // special things used by LoadBucket algo to determine which
-    // SpiderProxy to use to download from a particular IP
-    int32_t m_countForThisIp;
-    int64_t m_lastTimeUsedForThisIp;
-
-    char m_usernamePwd[MAXUSERNAMEPWD];
-};
-
 // hashtable that maps an ip:port key (64-bits) to a SpiderProxy
 static HashTableX s_iptab;
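The surviving context notes that s_iptab maps a 64-bit ip:port key to a SpiderProxy. One plausible way to pack such a key, though the actual key construction in gigablast may differ:

```cpp
#include <cstdint>
#include <cstdio>

// illustrative packing: ip in the high bits, port in the low 16
static uint64_t ipPortKey(int32_t ip, uint16_t port) {
    return ((uint64_t)(uint32_t)ip << 16) | (uint64_t)port;
}

int main() {
    // 127.0.0.1 (0x7f000001) : 3128 -> 0x7f0000010c38
    printf("key=0x%llx\n", (unsigned long long)ipPortKey(0x7f000001, 3128));
    return 0;
}
```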
@@ -1081,3 +1040,14 @@ bool initSpiderProxyStuff() {
 
 }
 
+SpiderProxy *getSpiderProxyByIpPort ( int32_t ip , uint16_t port ) {
+    for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
+        // skip empty slots
+        if ( ! s_iptab.m_flags[i] ) continue;
+        SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
+        if ( sp->m_ip != ip ) continue;
+        if ( sp->m_port != port ) continue;
+        return sp;
+    }
+    return NULL;
+}
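One design note on getSpiderProxyByIpPort(): although s_iptab is a hashtable, the new lookup simply walks every slot and compares m_ip/m_port directly instead of hashing an ip:port key, which keeps the code trivial and costs little given the small number of proxies a typical config lists.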
SpiderProxy.h

@@ -19,6 +19,51 @@ bool resetProxyStats ( ) ;
 // save stats on the spider proxies if any
 bool saveSpiderProxyStats();
 
+#define MAXUSERNAMEPWD 64
+
+// host #0 breaks Conf::m_spiderIps safebuf into an array of
+// SpiderProxy classes and saves to disk as spoderproxies.dat to ensure
+// persistence
+class SpiderProxy {
+ public:
+    // ip/port of the spider proxy
+    int32_t m_ip;
+    uint16_t m_port;
+    // last time we attempted to download the test url through this proxy
+    int64_t m_lastDownloadTestAttemptMS;
+    // use -1 to indicate timed out when downloading test url
+    int32_t m_lastDownloadTookMS;
+    // 0 means none... use mstrerror()
+    int32_t m_lastDownloadError;
+    // use -1 to indicate never
+    int64_t m_lastSuccessfulTestMS;
+
+    // how many times have we told a requesting host to use this proxy
+    // to download their url with.
+    int32_t m_numDownloadRequests;
+
+    // how many are outstanding? everytime a host requests a proxyip
+    // it also tells us its outstanding counts for each proxy ip
+    // so we can ensure this is accurate even though a host may die
+    // and come back up.
+    int32_t m_numOutstandingDownloads;
+
+    // waiting on test url to be downloaded
+    bool m_isWaiting;
+
+    int64_t m_timesUsed;
+
+    int32_t m_lastBytesDownloaded;
+
+    // special things used by LoadBucket algo to determine which
+    // SpiderProxy to use to download from a particular IP
+    int32_t m_countForThisIp;
+    int64_t m_lastTimeUsedForThisIp;
+
+    char m_usernamePwd[MAXUSERNAMEPWD];
+};
+
+class SpiderProxy *getSpiderProxyByIpPort ( int32_t ip , uint16_t port ) ;
+
 // value for m_opCode. get a proxy to use from host #0:
 #define OP_GETPROXY 1
@@ -38,8 +83,6 @@ bool saveSpiderProxyStats();
 //    char m_opCode;
 //};
 
-#define MAXUSERNAMEPWD 64
-
 // host #0 gives us a proxy to use:
 class ProxyReply {
  public:
XmlDoc.cpp

@@ -15272,6 +15272,14 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
         m_diffbotUrl.urlEncode(p1);
         *p2 = c;
     }
+
+    // now so it works just give it a proxy directly, so it doesn't
+    // have to go through gb.
+    // if ( useProxies ) {
+    //     // msg13 typically uses this to get an unbanned proxy
+    //     getProxiesToUse();
+    // }
+
     // if we use proxies then increase the timeout since proxies
     // increase the crawl delay in hopes of backing off to discover
     // the website's policy so we don't hit it too hard and get banned.