Merge branch 'testing'
commit 4723d9eefa
HttpServer.cpp
@@ -156,6 +156,14 @@ bool HttpServer::getDoc ( char *url ,
  defPort = 443;
  }
+
+ // if we are using gigablast as a squid proxy then the
+ // "fullRequest" and the url will be like "CONNECT foo.com:443 HTT..."
+ // and it is an https url, because we only use the CONNECT cmd for
+ // downloading https urls over a proxy i think
+ char *p = fullRequest;
+ if ( p && strncmp(p,"CONNECT ",8)==0 )
+ urlIsHttps = true;
 
  // if going through a proxy do not use the ssl server, it will
  // handle the encryption from itself to the host website. unfortunately
  // then the http requests/responses are unencrypted from the
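For reference, a squid-style client opens an https tunnel through a proxy with a request of roughly this shape (values illustrative; only the leading "CONNECT " token is what the check above matches):

    CONNECT foo.com:443 HTTP/1.1
    Host: foo.com:443

so strncmp(p,"CONNECT ",8) is enough to flag the download as an https url.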
@@ -3582,7 +3590,10 @@ void gotSquidProxiedUrlIp ( void *state , int32_t ip ) {
 
+ // include terminating \0. well it is already i think. see
+ // Msg13Request::getSize(), so no need to add +1
- r->size_url = sqs->m_sock->m_readOffset;
+ r->size_url = sqs->m_sock->m_readOffset + 1;
 
  // sanity
  if ( r->ptr_url && r->ptr_url[r->size_url-1] ) { char *xx=NULL;*xx=0;}
 
+ // use urlip for this, it determines what host downloads it
  r->m_firstIp = r->m_urlIp;
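The +1 makes size_url cover the request's terminating NUL, which is exactly the invariant the sanity line then asserts. A minimal standalone sketch of that invariant (illustrative names, not the Msg13 code itself; it assumes the caller appended a NUL after the readOffset bytes read off the socket):

    #include <cassert>

    // "readOffset" bytes were read off the socket and a NUL was appended,
    // so a size that includes the NUL is readOffset + 1.
    static void setUrl ( const char *buf , int readOffset ,
                         const char *&ptr_url , int &size_url ) {
        ptr_url  = buf;
        size_url = readOffset + 1;               // include terminating \0
        assert ( ptr_url[size_url-1] == '\0' );  // same check as the sanity line
    }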
 Mem.cpp | 8
@@ -1178,9 +1178,9 @@ int Mem::printBreech ( int32_t i , char core ) {
  int32_t size = s_sizes[i];
  for ( int32_t j = 0 ; j < OVERPAD ; j++ ) {
  if ( (unsigned char)mem[size+j] == MAGICCHAR ) continue;
- log(LOG_LOGIC,"mem: overrun at %"PTRFMT" "
+ log(LOG_LOGIC,"mem: overrun at 0x%"PTRFMT" (size=%"INT32")"
  "roff=%"INT32" note=%s",
- (PTRTYPE)mem,j,&s_labels[i*16]);
+ (PTRTYPE)mem,size,j,&s_labels[i*16]);
 
  // mark it for freed mem re-use check below
  if ( ! bp ) bp = &mem[size+j];
@@ -1205,8 +1205,10 @@ int Mem::printBreech ( int32_t i , char core ) {
  }
  // now report it
  if ( mink == -1 ) continue;
- log("mem: possible breeching buffer=%s dist=%"PTRFMT"",
+ log("mem: possible breeching buffer=%s at 0x%"PTRFMT" "
+ "breaching at offset of %"PTRFMT" bytes",
  &s_labels[mink*16],
+ (PTRTYPE)s_mptrs[mink],
  (PTRTYPE)s_mptrs[mink]-((PTRTYPE)mem+s_sizes[i]));
  flag = 1;
  }
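The overrun scan above relies on guard bytes: each allocation carries OVERPAD extra bytes set to MAGICCHAR, and any pad byte that no longer holds that value implicates a buffer overrun. A minimal standalone sketch of the same idea (the OVERPAD/MAGICCHAR values here are illustrative, not taken from Mem.cpp):

    #include <cstdlib>
    #include <cstring>

    #define OVERPAD   8      // illustrative pad size
    #define MAGICCHAR 0xBD   // illustrative guard value

    // allocate "size" usable bytes plus a guard region after them
    static char *guardedAlloc ( int size ) {
        char *mem = (char *)malloc ( size + OVERPAD );
        if ( ! mem ) return NULL;
        memset ( mem + size , MAGICCHAR , OVERPAD );
        return mem;
    }

    // return the offset of the first corrupted guard byte, or -1 if intact
    static int checkGuard ( char *mem , int size ) {
        for ( int j = 0 ; j < OVERPAD ; j++ ) {
            if ( (unsigned char)mem[size+j] == MAGICCHAR ) continue;
            return j; // an overrun reached this pad byte
        }
        return -1;
    }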
 Msg13.cpp | 35
@@ -61,6 +61,7 @@ int32_t convertIntoLinks ( char *reply, int32_t replySize , Xml *xml ,
 
  static bool setProxiedUrlFromSquidProxiedRequest ( Msg13Request *r );
  static void stripProxyAuthorization ( char *squidProxiedReqBuf ) ;
+ static bool addNewProxyAuthorization ( SafeBuf *req , Msg13Request *r );
  static void fixGETorPOST ( char *squidProxiedReqBuf ) ;
  static int64_t computeProxiedCacheKey64 ( Msg13Request *r ) ;
 
@@ -1030,6 +1031,17 @@ void downloadTheDocForReals3b ( Msg13Request *r ) {
  if ( r->m_isSquidProxiedUrl && ! r->m_proxyIp )
  fixGETorPOST ( exactRequest );
 
+ // ALSO ADD authorization to the NEW proxy we are sending to
+ // r->m_proxyIp/r->m_proxyPort that has a username:password
+ char tmpBuf[1024];
+ SafeBuf newReq (tmpBuf,1024);
+ if ( r->m_isSquidProxiedUrl && r->m_proxyIp ) {
+ newReq.safeStrcpy ( exactRequest );
+ addNewProxyAuthorization ( &newReq , r );
+ newReq.nullTerm();
+ exactRequest = newReq.getBufStart();
+ }
+
  // indicate start of download so we can overwrite the 0 we stored
  // into the hammercache
  r->m_downloadStartTimeMS = nowms;
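The tmpBuf/SafeBuf pairing above appears to be the stack-first pattern used elsewhere in this codebase: the SafeBuf writes into the caller's 1KB stack buffer and only moves to heap memory if the rewritten request outgrows it, which is why exactRequest is re-pointed at getBufStart() afterwards. A hedged restatement with that assumption spelled out in comments (semantics inferred from the usage here, not checked against SafeBuf.h):

    char tmpBuf[1024];
    SafeBuf newReq ( tmpBuf , 1024 );          // assumed: uses tmpBuf until it outgrows it
    newReq.safeStrcpy ( exactRequest );        // copy the original proxied request
    addNewProxyAuthorization ( &newReq , r );  // splice in a Proxy-Authorization header
    newReq.nullTerm();
    exactRequest = newReq.getBufStart();       // may point at tmpBuf or at heap memory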
@@ -3031,6 +3043,29 @@ void scanHammerQueue ( int fd , void *state ) {
  }
  }
 
+ bool addNewProxyAuthorization ( SafeBuf *req , Msg13Request *r ) {
+
+ if ( ! r->m_proxyIp ) return true;
+ if ( ! r->m_proxyPort ) return true;
+
+ // get proxy from list to get username/password
+ SpiderProxy *sp = getSpiderProxyByIpPort (r->m_proxyIp,r->m_proxyPort);
+
+ // if none required, all done
+ if ( ! sp->m_usernamePwd ) return true;
+ // strange?
+ if ( req->length() < 8 ) return false;
+ // back up over final \r\n
+ req->m_length -= 2 ;
+ // insert it
+ req->safePrintf("Proxy-Authorization: Basic ");
+ req->base64Encode ( sp->m_usernamePwd );
+ req->safePrintf("\r\n");
+ req->safePrintf("\r\n");
+ req->nullTerm();
+ return true;
+ }
+
  // When the Msg13Request::m_isSquidProxiedUrl bit then request we got is
  // using us like a proxy, so Msg13Request::m_url is in reality a complete
  // HTTP request mime. so in that case we have to call this code to
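The header spliced in here is standard HTTP basic proxy authentication: the stored "username:password" string is base64-encoded after "Proxy-Authorization: Basic ". For example, if m_usernamePwd held "user:pass" (an illustrative value), the rewritten request would end with:

    Proxy-Authorization: Basic dXNlcjpwYXNz

followed by a blank line, since the code backs up over the request's final \r\n, appends the new header line, and then re-appends the two \r\n sequences that terminate the mime.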
SafeBuf.cpp
@@ -3525,6 +3525,9 @@ bool SafeBuf::base64Encode ( char *sx , int32_t len , int32_t niceness ) {
  return true;
  }
 
+ bool SafeBuf::base64Encode( char *s ) {
+ return base64Encode(s,gbstrlen(s));
+ }
 
  bool SafeBuf::base64Decode ( char *src , int32_t srcLen , int32_t niceness ) {
 
SafeBuf.h
@@ -122,6 +122,8 @@ public:
  bool base64Encode ( char *s , int32_t len , int32_t niceness = 0 );
  bool base64Decode ( char *src , int32_t srcLen , int32_t niceness = 0 ) ;
 
+ bool base64Encode( char *s ) ;
+
  //bool pushLong ( int32_t val ) { return safeMemcpy((char *)&val,4); }
  bool cat(SafeBuf& c);
  // . only cat the sections/tag that start with "tagFilter"
SpiderProxy.cpp
@@ -24,47 +24,6 @@
  // . TODO: to prevent host #0 from getting too slammed we can also recruit
  // other hosts to act just like host #0.
 
- // host #0 breaks Conf::m_spiderIps safebuf into an array of
- // SpiderProxy classes and saves to disk as spoderproxies.dat to ensure
- // persistence
- class SpiderProxy {
- public:
- // ip/port of the spider proxy
- int32_t m_ip;
- uint16_t m_port;
- // last time we attempted to download the test url through this proxy
- int64_t m_lastDownloadTestAttemptMS;
- // use -1 to indicate timed out when downloading test url
- int32_t m_lastDownloadTookMS;
- // 0 means none... use mstrerror()
- int32_t m_lastDownloadError;
- // use -1 to indicate never
- int64_t m_lastSuccessfulTestMS;
-
- // how many times have we told a requesting host to use this proxy
- // to download their url with.
- int32_t m_numDownloadRequests;
-
- // how many are outstanding? everytime a host requests a proxyip
- // it also tells us its outstanding counts for each proxy ip
- // so we can ensure this is accurate even though a host may die
- // and come back up.
- int32_t m_numOutstandingDownloads;
-
- // waiting on test url to be downloaded
- bool m_isWaiting;
-
- int64_t m_timesUsed;
-
- int32_t m_lastBytesDownloaded;
-
- // special things used by LoadBucket algo to determine which
- // SpiderProxy to use to download from a particular IP
- int32_t m_countForThisIp;
- int64_t m_lastTimeUsedForThisIp;
-
- char m_usernamePwd[MAXUSERNAMEPWD];
- };
 
  // hashtable that maps an ip:port key (64-bits) to a SpiderProxy
  static HashTableX s_iptab;
@@ -1081,3 +1040,14 @@ bool initSpiderProxyStuff() {
 
  }
 
+ SpiderProxy *getSpiderProxyByIpPort ( int32_t ip , uint16_t port ) {
+ for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
+ // skip empty slots
+ if ( ! s_iptab.m_flags[i] ) continue;
+ SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
+ if ( sp->m_ip != ip ) continue;
+ if ( sp->m_port != port ) continue;
+ return sp;
+ }
+ return NULL;
+ }
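A hedged caller-side sketch of the new lookup, as it is used from Msg13.cpp: the function walks every slot of s_iptab and returns NULL when the ip/port pair is no longer configured, so callers generally want a NULL check before touching the proxy (the guard lines below are illustrative, not part of this commit):

    SpiderProxy *sp = getSpiderProxyByIpPort ( r->m_proxyIp , r->m_proxyPort );
    if ( ! sp ) return true;                   // proxy no longer in s_iptab
    if ( ! sp->m_usernamePwd[0] ) return true; // no credentials configured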
SpiderProxy.h
@@ -19,6 +19,51 @@ bool resetProxyStats ( ) ;
  // save stats on the spider proxies if any
  bool saveSpiderProxyStats();
 
+ #define MAXUSERNAMEPWD 64
+
+ // host #0 breaks Conf::m_spiderIps safebuf into an array of
+ // SpiderProxy classes and saves to disk as spoderproxies.dat to ensure
+ // persistence
+ class SpiderProxy {
+ public:
+ // ip/port of the spider proxy
+ int32_t m_ip;
+ uint16_t m_port;
+ // last time we attempted to download the test url through this proxy
+ int64_t m_lastDownloadTestAttemptMS;
+ // use -1 to indicate timed out when downloading test url
+ int32_t m_lastDownloadTookMS;
+ // 0 means none... use mstrerror()
+ int32_t m_lastDownloadError;
+ // use -1 to indicate never
+ int64_t m_lastSuccessfulTestMS;
+
+ // how many times have we told a requesting host to use this proxy
+ // to download their url with.
+ int32_t m_numDownloadRequests;
+
+ // how many are outstanding? everytime a host requests a proxyip
+ // it also tells us its outstanding counts for each proxy ip
+ // so we can ensure this is accurate even though a host may die
+ // and come back up.
+ int32_t m_numOutstandingDownloads;
+
+ // waiting on test url to be downloaded
+ bool m_isWaiting;
+
+ int64_t m_timesUsed;
+
+ int32_t m_lastBytesDownloaded;
+
+ // special things used by LoadBucket algo to determine which
+ // SpiderProxy to use to download from a particular IP
+ int32_t m_countForThisIp;
+ int64_t m_lastTimeUsedForThisIp;
+
+ char m_usernamePwd[MAXUSERNAMEPWD];
+ };
+
+ class SpiderProxy *getSpiderProxyByIpPort ( int32_t ip , uint16_t port ) ;
 
  // value for m_opCode. get a proxy to use from host #0:
  #define OP_GETPROXY 1
@@ -38,8 +83,6 @@ bool saveSpiderProxyStats();
  // char m_opCode;
  //};
 
- #define MAXUSERNAMEPWD 64
-
  // host #0 gives us a proxy to use:
  class ProxyReply {
  public:
 XmlDoc.cpp | 15
@@ -15237,6 +15237,13 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
  if ( cr->m_forceUseFloaters ) useProxies = true;
  // we gotta have some proxy ips that we can use
  if ( ! g_conf.m_proxyIps.hasDigits() ) useProxies = false;
+
+ // until we fix https CONNECT support for https urls diffbot can't
+ // go through gb. we should fix that by downloading the whole page
+ // ourselves and sending it back, and tell diffbot's phantomjs not
+ // to do the certificate check.
+ useProxies = false;
+
  // if we used a proxy to download the doc, then diffbot should too
  // BUT tell diffbot to go through host #0 so we can send it to the
  // correct proxy using our load balancing & backoff algos.
@@ -15272,6 +15279,14 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
  m_diffbotUrl.urlEncode(p1);
  *p2 = c;
  }
+
+ // now so it works just give it a proxy directly, so it doesn't
+ // have to go through gb.
+ // if ( useProxies ) {
+ // // msg13 typically uses this to get an unbanned proxy
+ // getProxiesToUse();
+ // }
+
  // if we use proxies then increase the timeout since proxies
  // increase the crawl delay in hopes of backing off to discover
  // the website's policy so we don't hit it too hard and get banned.
 main.cpp | 41
@@ -165,8 +165,16 @@ static int32_t dumpSpiderdb ( char *coll,int32_t sfn,int32_t numFiles,bool inclu
  char printStats , int32_t firstIp );
  static void dumpSectiondb( char *coll,int32_t sfn,int32_t numFiles,bool includeTree);
  static void dumpRevdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree);
- static void dumpTagdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
- int32_t c, char rec=0, int32_t rdbId = RDB_TAGDB );
+
+ static void dumpTagdb ( char *coll,
+ int32_t sfn,
+ int32_t numFiles,
+ bool includeTree,
+ int32_t c,
+ char rec=0,
+ int32_t rdbId = RDB_TAGDB ,
+ char *site = NULL );
+
  static void dumpIndexdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
  int64_t termId ) ;
  void dumpPosdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
@@ -2858,11 +2866,24 @@ int main2 ( int argc , char *argv[] ) {
  dumpSectiondb(coll,startFileNum,numFiles,includeTree);
  else if ( argv[cmdarg+1][0] == 'V' )
  dumpRevdb(coll,startFileNum,numFiles,includeTree);
- else if ( argv[cmdarg+1][0] == 'S' )
- dumpTagdb (coll,startFileNum,numFiles,includeTree,0);
- else if ( argv[cmdarg+1][0] == 'z' )
+ else if ( argv[cmdarg+1][0] == 'S' ) {
+ char *site = NULL;
+ if ( cmdarg+6 < argc ) site = argv[cmdarg+6];
+ dumpTagdb(coll,
+ startFileNum,
+ numFiles,
+ includeTree,
+ 0,
+ 0,
+ RDB_TAGDB,
+ site);
+ }
+ else if ( argv[cmdarg+1][0] == 'z' ) {
+ char *site = NULL;
+ if ( cmdarg+6 < argc ) site = argv[cmdarg+6];
  dumpTagdb (coll,startFileNum,numFiles,includeTree,0,
- 'z');
+ 'z',RDB_TAGDB,site);
+ }
  else if ( argv[cmdarg+1][0] == 'A' )
  dumpTagdb (coll,startFileNum,numFiles,includeTree,0,
  'A');
@@ -11956,7 +11977,8 @@ void dumpRevdb(char *coll,int32_t startFileNum,int32_t numFiles, bool includeTre
 
  void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
  bool includeTree,
- int32_t c , char req, int32_t rdbId ) {
+ int32_t c , char req, int32_t rdbId ,
+ char *siteArg ) {
  //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
  g_tagdb.init ();
  //g_collectiondb.init(true);
@@ -11966,6 +11988,11 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
  key128_t endKey ;
  startKey.setMin();
  endKey.setMax();
+ if ( siteArg ) {
+ startKey = g_tagdb.makeStartKey ( siteArg );
+ endKey = g_tagdb.makeEndKey ( siteArg );
+ log("gb: using site %s for start key",siteArg );
+ }
  // turn off threads
  g_threads.disableThreads();
  // get a meg at a time
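With this change the tagdb dump can be restricted to one site: when a site is passed as argv[cmdarg+6], startKey/endKey are narrowed with g_tagdb.makeStartKey()/makeEndKey() instead of spanning the whole key range. Assuming the existing positional order (collection, start file, number of files, include-tree flag in cmdarg+2 through cmdarg+5), an invocation would look roughly like:

    ./gb dump S main 0 -1 1 example.com

where "example.com" is an illustrative site; without a trailing site the dump scans the full key range as before.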
 sitelinks.txt | 1418
File diff suppressed because it is too large