Merge branch 'testing'

Matt Wells 2015-03-05 20:34:32 -08:00
commit 4723d9eefa
10 changed files with 1565 additions and 69 deletions

@@ -156,6 +156,14 @@ bool HttpServer::getDoc ( char *url ,
defPort = 443;
}
// if we are using gigablast as a squid proxy then the
// "fullRequest" and the url will be like "CONNECT foo.com:443 HTT..."
// and it is an https url, because we only use the CONNECT cmd for
// downloading https urls over a proxy i think
char *p = fullRequest;
if ( p && strncmp(p,"CONNECT ",8)==0 )
urlIsHttps = true;
// if going through a proxy do not use the ssl server, it will
// handle the encryption from itself to the host website. unfortunately
// then the http requests/responses are unencrypted from the
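
For reference on the CONNECT handling above: a CONNECT request names the tunnel target as "host:port" right after the verb, and clients only issue it to open a TLS tunnel, which is why seeing "CONNECT " at the start of fullRequest is enough to flag the url as https here. The helper below is a hypothetical stand-alone sketch (not part of HttpServer.cpp) of pulling the host and port out of such a request line.

#include <string.h>
#include <stdlib.h>

// hypothetical helper: extract "foo.com" and 443 from a request like
// "CONNECT foo.com:443 HTTP/1.1\r\n..."; returns false if the request
// is not a CONNECT request
static bool parseConnectTarget ( char *req , char *host , int hostSize ,
                                 int *port ) {
	if ( ! req || strncmp ( req , "CONNECT " , 8 ) != 0 ) return false;
	char *p = req + 8;
	char *colon = strchr ( p , ':' );   // host runs up to the ':'
	if ( ! colon ) return false;
	int hlen = (int)(colon - p);
	if ( hlen <= 0 || hlen >= hostSize ) return false;
	memcpy ( host , p , hlen );
	host[hlen] = '\0';
	*port = atoi ( colon + 1 );         // 443 in the example above
	return true;
}
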
@@ -3582,7 +3590,10 @@ void gotSquidProxiedUrlIp ( void *state , int32_t ip ) {
// include terminating \0. well it is already i think. see
// Msg13Request::getSize(), so no need to add +1
- r->size_url = sqs->m_sock->m_readOffset;
+ r->size_url = sqs->m_sock->m_readOffset + 1;
// sanity
if ( r->ptr_url && r->ptr_url[r->size_url-1] ) { char *xx=NULL;*xx=0;}
// use urlip for this, it determines what host downloads it
r->m_firstIp = r->m_urlIp;
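
The size_url change above fixes an off-by-one in the sized-string convention: size_url must count the terminating NUL, and the sanity check forces a core dump (the char *xx=NULL;*xx=0; idiom) if the last counted byte is not zero. A tiny stand-alone illustration of the same invariant, using a made-up request string:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main ( ) {
	// illustrative proxied request mime, not taken from the code
	char url[] = "GET http://example.com/ HTTP/1.0\r\n\r\n";
	// an m_readOffset-style byte count would be strlen(url); size_url
	// additionally counts the terminating \0, hence the +1 above
	int32_t size_url = (int32_t)strlen ( url ) + 1;
	// same invariant the sanity check enforces: the last counted
	// byte is the NUL terminator
	assert ( url [ size_url - 1 ] == '\0' );
	return 0;
}
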

@@ -1178,9 +1178,9 @@ int Mem::printBreech ( int32_t i , char core ) {
int32_t size = s_sizes[i];
for ( int32_t j = 0 ; j < OVERPAD ; j++ ) {
if ( (unsigned char)mem[size+j] == MAGICCHAR ) continue;
- log(LOG_LOGIC,"mem: overrun at %"PTRFMT" "
+ log(LOG_LOGIC,"mem: overrun at 0x%"PTRFMT" (size=%"INT32")"
"roff=%"INT32" note=%s",
- (PTRTYPE)mem,j,&s_labels[i*16]);
+ (PTRTYPE)mem,size,j,&s_labels[i*16]);
// mark it for freed mem re-use check below
if ( ! bp ) bp = &mem[size+j];
@@ -1205,8 +1205,10 @@ int Mem::printBreech ( int32_t i , char core ) {
}
// now report it
if ( mink == -1 ) continue;
log("mem: possible breeching buffer=%s dist=%"PTRFMT"",
log("mem: possible breeching buffer=%s at 0x%"PTRFMT" "
"breaching at offset of %"PTRFMT" bytes",
&s_labels[mink*16],
(PTRTYPE)s_mptrs[mink],
(PTRTYPE)s_mptrs[mink]-((PTRTYPE)mem+s_sizes[i]));
flag = 1;
}
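
For context, these log lines belong to Mem.cpp's canary-based overrun detection: each tracked allocation gets OVERPAD trailing bytes set to MAGICCHAR, and printBreech() later scans that tail; any byte that no longer matches means something wrote past the end of the buffer. A minimal stand-alone sketch of the technique, with made-up constants (Mem.cpp's real bookkeeping via s_mptrs/s_sizes/s_labels is more involved):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#define OVERPAD   8        // illustrative value, not Mem.cpp's
#define MAGICCHAR 0xda     // illustrative value, not Mem.cpp's

// allocate size bytes plus a tail of OVERPAD canary bytes
static char *padAlloc ( int32_t size ) {
	char *mem = (char *)malloc ( size + OVERPAD );
	if ( ! mem ) return NULL;
	memset ( mem + size , MAGICCHAR , OVERPAD );
	return mem;
}

// return the offset of the first clobbered canary byte, or -1 if intact
static int32_t checkPad ( char *mem , int32_t size ) {
	for ( int32_t j = 0 ; j < OVERPAD ; j++ ) {
		if ( (unsigned char)mem[size+j] == MAGICCHAR ) continue;
		fprintf ( stderr , "mem: overrun at %p (size=%d) roff=%d\n" ,
		          (void *)mem , (int)size , (int)j );
		return j;
	}
	return -1;
}

The second hunk's "possible breeching buffer" message is the follow-up step: once a clobbered canary is found, printBreech() walks the other live allocations (tracked via mink) and reports the nearest one as the likely culprit, along with how far past its own end it would have had to write.
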

@@ -61,6 +61,7 @@ int32_t convertIntoLinks ( char *reply, int32_t replySize , Xml *xml ,
static bool setProxiedUrlFromSquidProxiedRequest ( Msg13Request *r );
static void stripProxyAuthorization ( char *squidProxiedReqBuf ) ;
static bool addNewProxyAuthorization ( SafeBuf *req , Msg13Request *r );
static void fixGETorPOST ( char *squidProxiedReqBuf ) ;
static int64_t computeProxiedCacheKey64 ( Msg13Request *r ) ;
@@ -1030,6 +1031,17 @@ void downloadTheDocForReals3b ( Msg13Request *r ) {
if ( r->m_isSquidProxiedUrl && ! r->m_proxyIp )
fixGETorPOST ( exactRequest );
// ALSO ADD authorization to the NEW proxy we are sending to
// r->m_proxyIp/r->m_proxyPort that has a username:password
char tmpBuf[1024];
SafeBuf newReq (tmpBuf,1024);
if ( r->m_isSquidProxiedUrl && r->m_proxyIp ) {
newReq.safeStrcpy ( exactRequest );
addNewProxyAuthorization ( &newReq , r );
newReq.nullTerm();
exactRequest = newReq.getBufStart();
}
// indicate start of download so we can overwrite the 0 we stored
// into the hammercache
r->m_downloadStartTimeMS = nowms;
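
The request-rebuilding pattern a few lines up — seed a SafeBuf with a 1 KB stack array so the common case needs no heap allocation, splice in an extra header, then point exactRequest at the rebuilt mime — is worth spelling out on its own. A rough sketch using only the SafeBuf calls that already appear in this diff, mirroring how addNewProxyAuthorization() handles the trailing "\r\n" (the X-Example header is illustrative, not a real one):

char tmpBuf[1024];
SafeBuf newReq ( tmpBuf , 1024 );
newReq.safeStrcpy ( exactRequest );        // copy the original request mime
newReq.m_length -= 2;                      // back up over the final "\r\n"
                                           // (assumes the mime ends "\r\n\r\n")
newReq.safePrintf ( "X-Example: 1\r\n" );  // new header, illustrative only
newReq.safePrintf ( "\r\n" );              // restore the blank line ending the mime
newReq.nullTerm();
exactRequest = newReq.getBufStart();

Because newReq owns (or borrows) the bytes, exactRequest stays valid only while newReq is in scope, which is how the code above uses it before kicking off the download.
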
@@ -3031,6 +3043,29 @@ void scanHammerQueue ( int fd , void *state ) {
}
}
bool addNewProxyAuthorization ( SafeBuf *req , Msg13Request *r ) {
if ( ! r->m_proxyIp ) return true;
if ( ! r->m_proxyPort ) return true;
// get proxy from list to get username/password
SpiderProxy *sp = getSpiderProxyByIpPort (r->m_proxyIp,r->m_proxyPort);
// if none required, all done
if ( ! sp->m_usernamePwd ) return true;
// strange?
if ( req->length() < 8 ) return false;
// back up over final \r\n
req->m_length -= 2 ;
// insert it
req->safePrintf("Proxy-Authorization: Basic ");
req->base64Encode ( sp->m_usernamePwd );
req->safePrintf("\r\n");
req->safePrintf("\r\n");
req->nullTerm();
return true;
}
// When the Msg13Request::m_isSquidProxiedUrl bit then request we got is
// using us like a proxy, so Msg13Request::m_url is in reality a complete
// HTTP request mime. so in that case we have to call this code to
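
To make the mime handling in this hunk concrete: a client using gb as a proxy sends the full url in the request line (absolute-form) and may include its own Proxy-Authorization header. stripProxyAuthorization() removes the client's credentials, fixGETorPOST() is called when gb downloads directly (its body is not shown here, but the call site suggests it rewrites the absolute-form target into a plain GET/POST), and addNewProxyAuthorization() re-adds credentials when the request is relayed through a second proxy. The host names and credentials below are made up; the base64 values are simply base64("user:pwd") and base64("proxy:secret").

// request as received from the client that is using gb as its proxy
const char *proxiedReq =
	"GET http://example.com/page.html HTTP/1.1\r\n"
	"Host: example.com\r\n"
	"Proxy-Authorization: Basic dXNlcjpwd2Q=\r\n"      // base64("user:pwd")
	"\r\n";

// what gb would relay to an upstream proxy that needs its own login:
// the client's Proxy-Authorization is stripped and a new one is
// appended just before the blank line, as addNewProxyAuthorization does
const char *relayedReq =
	"GET http://example.com/page.html HTTP/1.1\r\n"
	"Host: example.com\r\n"
	"Proxy-Authorization: Basic cHJveHk6c2VjcmV0\r\n"  // base64("proxy:secret")
	"\r\n";
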

@@ -3525,6 +3525,9 @@ bool SafeBuf::base64Encode ( char *sx , int32_t len , int32_t niceness ) {
return true;
}
bool SafeBuf::base64Encode( char *s ) {
return base64Encode(s,gbstrlen(s));
}
bool SafeBuf::base64Decode ( char *src , int32_t srcLen , int32_t niceness ) {
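
The new overload just defers to the two-argument form with gbstrlen(s), so encoding a NUL-terminated credential string becomes a one-liner. A short usage sketch with a made-up credential (base64("user:pwd") is "dXNlcjpwd2Q=" in standard encoding):

char creds[] = "user:pwd";                       // illustrative only
SafeBuf sb;
sb.safePrintf ( "Proxy-Authorization: Basic " );
sb.base64Encode ( creds );                       // appends "dXNlcjpwd2Q="
sb.safePrintf ( "\r\n" );
sb.nullTerm();
// sb now holds: Proxy-Authorization: Basic dXNlcjpwd2Q=\r\n
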

@@ -122,6 +122,8 @@ public:
bool base64Encode ( char *s , int32_t len , int32_t niceness = 0 );
bool base64Decode ( char *src , int32_t srcLen , int32_t niceness = 0 ) ;
bool base64Encode( char *s ) ;
//bool pushLong ( int32_t val ) { return safeMemcpy((char *)&val,4); }
bool cat(SafeBuf& c);
// . only cat the sections/tag that start with "tagFilter"

@@ -24,47 +24,6 @@
// . TODO: to prevent host #0 from getting too slammed we can also recruit
// other hosts to act just like host #0.
// host #0 breaks Conf::m_spiderIps safebuf into an array of
// SpiderProxy classes and saves to disk as spoderproxies.dat to ensure
// persistence
class SpiderProxy {
public:
// ip/port of the spider proxy
int32_t m_ip;
uint16_t m_port;
// last time we attempted to download the test url through this proxy
int64_t m_lastDownloadTestAttemptMS;
// use -1 to indicate timed out when downloading test url
int32_t m_lastDownloadTookMS;
// 0 means none... use mstrerror()
int32_t m_lastDownloadError;
// use -1 to indicate never
int64_t m_lastSuccessfulTestMS;
// how many times have we told a requesting host to use this proxy
// to download their url with.
int32_t m_numDownloadRequests;
// how many are outstanding? everytime a host requests a proxyip
// it also tells us its outstanding counts for each proxy ip
// so we can ensure this is accurate even though a host may die
// and come back up.
int32_t m_numOutstandingDownloads;
// waiting on test url to be downloaded
bool m_isWaiting;
int64_t m_timesUsed;
int32_t m_lastBytesDownloaded;
// special things used by LoadBucket algo to determine which
// SpiderProxy to use to download from a particular IP
int32_t m_countForThisIp;
int64_t m_lastTimeUsedForThisIp;
char m_usernamePwd[MAXUSERNAMEPWD];
};
// hashtable that maps an ip:port key (64-bits) to a SpiderProxy
static HashTableX s_iptab;
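
s_iptab keys each proxy by a single 64-bit ip:port value. The exact key layout is not shown in this hunk, so the composition below is only an assumed sketch of the idea: pack the 32-bit ip and the 16-bit port into one integer so a proxy can be addressed by one hash key.

#include <stdint.h>

// assumed layout (not taken from SpiderProxy.cpp): low 32 bits = ip,
// next 16 bits = port
static uint64_t makeProxyKey ( int32_t ip , uint16_t port ) {
	return ( (uint64_t)port << 32 ) | (uint32_t)ip;
}
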
@@ -1081,3 +1040,14 @@ bool initSpiderProxyStuff() {
}
SpiderProxy *getSpiderProxyByIpPort ( int32_t ip , uint16_t port ) {
for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
// skip empty slots
if ( ! s_iptab.m_flags[i] ) continue;
SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
if ( sp->m_ip != ip ) continue;
if ( sp->m_port != port ) continue;
return sp;
}
return NULL;
}
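
Callers such as addNewProxyAuthorization() use this lookup to recover the username:password configured for an upstream proxy; the scan over every slot is linear, which is fine for the handful of proxies typically configured. Since the function returns NULL when no slot matches, a guarded caller can look like the hypothetical wrapper below (the m_usernamePwd[0] test assumes an unset credential is stored as an empty string):

// hypothetical wrapper, not in the source
static bool appendProxyAuth ( SafeBuf *req , int32_t ip , uint16_t port ) {
	SpiderProxy *sp = getSpiderProxyByIpPort ( ip , port );
	// the proxy may have been removed from the config since we chose it
	if ( ! sp ) return false;
	// no credentials configured for this proxy, nothing to add
	if ( ! sp->m_usernamePwd[0] ) return true;
	req->safePrintf ( "Proxy-Authorization: Basic " );
	req->base64Encode ( sp->m_usernamePwd );
	req->safePrintf ( "\r\n" );
	return true;
}
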

@@ -19,6 +19,51 @@ bool resetProxyStats ( ) ;
// save stats on the spider proxies if any
bool saveSpiderProxyStats();
#define MAXUSERNAMEPWD 64
// host #0 breaks Conf::m_spiderIps safebuf into an array of
// SpiderProxy classes and saves to disk as spoderproxies.dat to ensure
// persistence
class SpiderProxy {
public:
// ip/port of the spider proxy
int32_t m_ip;
uint16_t m_port;
// last time we attempted to download the test url through this proxy
int64_t m_lastDownloadTestAttemptMS;
// use -1 to indicate timed out when downloading test url
int32_t m_lastDownloadTookMS;
// 0 means none... use mstrerror()
int32_t m_lastDownloadError;
// use -1 to indicate never
int64_t m_lastSuccessfulTestMS;
// how many times have we told a requesting host to use this proxy
// to download their url with.
int32_t m_numDownloadRequests;
// how many are outstanding? everytime a host requests a proxyip
// it also tells us its outstanding counts for each proxy ip
// so we can ensure this is accurate even though a host may die
// and come back up.
int32_t m_numOutstandingDownloads;
// waiting on test url to be downloaded
bool m_isWaiting;
int64_t m_timesUsed;
int32_t m_lastBytesDownloaded;
// special things used by LoadBucket algo to determine which
// SpiderProxy to use to download from a particular IP
int32_t m_countForThisIp;
int64_t m_lastTimeUsedForThisIp;
char m_usernamePwd[MAXUSERNAMEPWD];
};
class SpiderProxy *getSpiderProxyByIpPort ( int32_t ip , uint16_t port ) ;
// value for m_opCode. get a proxy to use from host #0:
#define OP_GETPROXY 1
@@ -38,8 +83,6 @@ bool saveSpiderProxyStats();
// char m_opCode;
//};
#define MAXUSERNAMEPWD 64
// host #0 gives us a proxy to use:
class ProxyReply {
public:
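
The SpiderProxy timing fields above follow the conventions spelled out in their comments: -1 in m_lastDownloadTookMS means the test url timed out, -1 in m_lastSuccessfulTestMS means the proxy has never passed a test, and m_lastDownloadError holds 0 or an error code for mstrerror(). A small sketch of recording one test-download result under those conventions (the helper name is made up and it assumes SpiderProxy.h is included):

// hypothetical helper: record the outcome of one test-url download
static void recordTestResult ( SpiderProxy *sp ,
                               int64_t startMS ,
                               int64_t nowMS ,
                               bool timedOut ,
                               int32_t err ) {
	sp->m_lastDownloadTestAttemptMS = startMS;
	sp->m_lastDownloadError = err;                 // 0 means none
	sp->m_lastDownloadTookMS =
		timedOut ? -1 : (int32_t)(nowMS - startMS);
	if ( ! timedOut && ! err ) sp->m_lastSuccessfulTestMS = nowMS;
	sp->m_isWaiting = false;                       // test no longer pending
}
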

@@ -15237,6 +15237,13 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
if ( cr->m_forceUseFloaters ) useProxies = true;
// we gotta have some proxy ips that we can use
if ( ! g_conf.m_proxyIps.hasDigits() ) useProxies = false;
// until we fix https CONNECT support for https urls diffbot can't
// go through gb. we should fix that by downloading the whole page
// ourselves and sending it back, and tell diffbot's phantomjs not
// to do the certificate check.
useProxies = false;
// if we used a proxy to download the doc, then diffbot should too
// BUT tell diffbot to go through host #0 so we can send it to the
// correct proxy using our load balancing & backoff algos.
@@ -15272,6 +15279,14 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
m_diffbotUrl.urlEncode(p1);
*p2 = c;
}
// now so it works just give it a proxy directly, so it doesn't
// have to go through gb.
// if ( useProxies ) {
// // msg13 typically uses this to get an unbanned proxy
// getProxiesToUse();
// }
// if we use proxies then increase the timeout since proxies
// increase the crawl delay in hopes of backing off to discover
// the website's policy so we don't hit it too hard and get banned.
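
The last comment refers to the download timeout being stretched whenever proxies are in the path, since the back-off logic deliberately slows crawling through them. The numbers below are purely illustrative; the base timeout and multiplier actually used by getDiffbotReply() are not shown in this hunk.

int64_t timeoutMS = 60 * 1000;      // assumed base timeout
if ( useProxies ) timeoutMS *= 3;   // assumed multiplier for proxy crawls
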

@@ -165,8 +165,16 @@ static int32_t dumpSpiderdb ( char *coll,int32_t sfn,int32_t numFiles,bool inclu
char printStats , int32_t firstIp );
static void dumpSectiondb( char *coll,int32_t sfn,int32_t numFiles,bool includeTree);
static void dumpRevdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree);
- static void dumpTagdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
- int32_t c, char rec=0, int32_t rdbId = RDB_TAGDB );
+ static void dumpTagdb ( char *coll,
+ int32_t sfn,
+ int32_t numFiles,
+ bool includeTree,
+ int32_t c,
+ char rec=0,
+ int32_t rdbId = RDB_TAGDB ,
+ char *site = NULL );
static void dumpIndexdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
int64_t termId ) ;
void dumpPosdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
@@ -2858,11 +2866,24 @@ int main2 ( int argc , char *argv[] ) {
dumpSectiondb(coll,startFileNum,numFiles,includeTree);
else if ( argv[cmdarg+1][0] == 'V' )
dumpRevdb(coll,startFileNum,numFiles,includeTree);
- else if ( argv[cmdarg+1][0] == 'S' )
- dumpTagdb (coll,startFileNum,numFiles,includeTree,0);
- else if ( argv[cmdarg+1][0] == 'z' )
+ else if ( argv[cmdarg+1][0] == 'S' ) {
+ char *site = NULL;
+ if ( cmdarg+6 < argc ) site = argv[cmdarg+6];
+ dumpTagdb(coll,
+ startFileNum,
+ numFiles,
+ includeTree,
+ 0,
+ 0,
+ RDB_TAGDB,
+ site);
+ }
+ else if ( argv[cmdarg+1][0] == 'z' ) {
+ char *site = NULL;
+ if ( cmdarg+6 < argc ) site = argv[cmdarg+6];
dumpTagdb (coll,startFileNum,numFiles,includeTree,0,
- 'z');
+ 'z',RDB_TAGDB,site);
+ }
else if ( argv[cmdarg+1][0] == 'A' )
dumpTagdb (coll,startFileNum,numFiles,includeTree,0,
'A');
@@ -11956,7 +11977,8 @@ void dumpRevdb(char *coll,int32_t startFileNum,int32_t numFiles, bool includeTre
void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
bool includeTree,
- int32_t c , char req, int32_t rdbId ) {
+ int32_t c , char req, int32_t rdbId ,
+ char *siteArg ) {
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
g_tagdb.init ();
//g_collectiondb.init(true);
@@ -11966,6 +11988,11 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
key128_t endKey ;
startKey.setMin();
endKey.setMax();
if ( siteArg ) {
startKey = g_tagdb.makeStartKey ( siteArg );
endKey = g_tagdb.makeEndKey ( siteArg );
log("gb: using site %s for start key",siteArg );
}
// turn off threads
g_threads.disableThreads();
// get a meg at a time
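
The new site argument narrows the dump to a single site's tag records by clamping the scan's key range instead of using setMin()/setMax(): tagdb keys are derived from the site, so all of a site's tags occupy one contiguous range and makeStartKey()/makeEndKey() produce its two endpoints. The sketch below shows the general shape of such endpoints for a 128-bit keyspace; the real Tagdb key layout and site hashing are not shown in this hunk, so treat the bit layout as an assumption.

#include <stdint.h>

struct Key128 { uint64_t n1; uint64_t n0; };   // stand-in for key128_t

// assumed layout: the high half holds a hash of the site, the low half
// spans everything else, so [start,end] covers exactly that site's records
static Key128 makeSiteStartKey ( uint64_t siteHash ) {
	Key128 k; k.n1 = siteHash; k.n0 = 0; return k;
}
static Key128 makeSiteEndKey ( uint64_t siteHash ) {
	Key128 k; k.n1 = siteHash; k.n0 = (uint64_t)-1; return k;
}
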
