tested auto proxy use and auto spider (non-proxy) backoff to a 3 second crawldelay successfully on the stamps site.
commit 0970975a57
parent e1a1fd001a
Msg13.cpp (55 changed lines)
@@ -1147,6 +1147,15 @@ void doneReportingStatsWrapper ( void *state, UdpSlot *slot ) {
 
 bool ipWasBanned ( TcpSocket *ts , const char **msg ) {
 
+	bool banCheck = false;
+	if ( ! g_errno ) banCheck = true;
+	// g_errno is 104 for 'connection reset by peer'
+	if ( g_errno == ECONNRESET ) banCheck = true;
+	// on other errors do not do the ban check. it might be a
+	// tcp time out or something so we have no reply. but connection resets
+	// are a popular way of saying, hey, don't hit me so hard.
+	if ( ! banCheck ) return true;
+
 	// if they closed the socket on us we read 0 bytes, assumed
 	// we were banned...
 	if ( ts->m_readOffset == 0 ) {
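Net effect of the hunk above: ipWasBanned() now gates its own heuristics, running them only on a clean reply (g_errno == 0) or on a connection reset, since a reset is a popular soft "slow down" signal, while other errors such as a TCP timeout leave nothing to inspect. A minimal standalone sketch of that gate, with a hypothetical helper name:

#include <cerrno>

// Hypothetical distillation of the gate added to ipWasBanned() above:
// err == 0 means we got a reply worth inspecting, and ECONNRESET (104,
// 'connection reset by peer') is itself treated as possible ban
// evidence. Anything else, e.g. a TCP timeout, tells us nothing either
// way, so the heuristics are skipped.
static bool shouldRunBanCheck ( int err ) {
	if ( err == 0          ) return true;
	if ( err == ECONNRESET ) return true;
	return false;
}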
@@ -1197,9 +1206,10 @@ void gotHttpReply9 ( void *state , TcpSocket *ts ) {
 
 	// if we got a 403 Forbidden or an empty reply
 	// then assume the proxy ip got banned so try another.
 	const char *banMsg = NULL;
-	bool banned = false;
-	if ( ! g_errno )
-		banned = ipWasBanned ( ts , &banMsg );
+	//bool banned = false;
+	//if ( ! g_errno )
+	bool banned = ipWasBanned ( ts , &banMsg );
+
 	if ( g_errno )
 		log("msg13: got error from proxy: %s",mstrerror(g_errno));
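The call-site change mirrors that: the !g_errno guard moves out of gotHttpReply9() and into ipWasBanned() itself, so the ban check now runs even when an error such as a connection reset is pending. A self-contained sketch of that refactor direction, with every name here hypothetical:

#include <cerrno>

static int g_err = 0;                 // stand-in for g_errno

// the callee now owns the "is this error classifiable?" decision,
// so callers can invoke it unconditionally
static bool looksBanned ( int bytesRead ) {
	// no usable evidence on other errors; treat as not banned here
	if ( g_err != 0 && g_err != ECONNRESET ) return false;
	return bytesRead == 0;            // empty reply == likely ban
}

int main ( ) {
	g_err = ECONNRESET;               // a reset now counts as evidence
	return looksBanned ( 0 ) ? 0 : 1; // banned -> exit status 0
}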
@@ -1425,6 +1435,11 @@ void gotHttpReply2 ( void *state ,
 
 	CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );
 
+	// 'connection reset' debug stuff
+	//log("spider: httpreplysize=%i",(int)replySize);
+	//if ( replySize == 0 )
+	//	log("hey");
+
 	// error?
 	if ( g_errno && g_conf.m_logDebugSpider )
 		log("spider: http reply (msg13) had error = %s "
@@ -1432,8 +1447,7 @@ void gotHttpReply2 ( void *state ,
 		    mstrerror(g_errno),r->ptr_url,iptoa(r->m_urlIp));
 
 	const char *banMsg = NULL;
-	if ( ! g_errno &&
-	     // must have a collrec to hold the ips
+	if ( // must have a collrec to hold the ips
 	     cr &&
 	     r->m_urlIp != 0 &&
 	     r->m_urlIp != -1 &&
@@ -2951,19 +2965,15 @@ bool addToHammerQueue ( Msg13Request *r ) {
 	     r->m_urlIp != 0 &&
 	     r->m_urlIp != -1 &&
 	     // and it is in the twitchy table
-	     isIpInTwitchyTable ( cr , r->m_urlIp ) &&
-	     // and no proxies are available to use
-	     ! canUseProxies ) {
+	     isIpInTwitchyTable ( cr , r->m_urlIp ) ) {
+		// and no proxies are available to use
+		//! canUseProxies ) {
 		// then just back off when a crawldelay of 3 seconds
-		if ( crawlDelayMS < 3000 ) crawlDelayMS = 3000;
+		if ( ! canUseProxies && crawlDelayMS < 3000 )
+			crawlDelayMS = 3000;
 	}
 
-	if ( g_conf.m_logDebugSpider )
-		log(LOG_DEBUG,"spider: got timestamp of %"INT64" from "
-		    "hammercache (waited=%"INT64") for %s",last,waited,
-		    iptoa(r->m_firstIp));
-
 	// . if we got a proxybackoff base it on # of banned proxies for urlIp
 	// . try to be more sensitive for more sensitive website policies
 	// . we don't know why this proxy was banned, or if we were
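This is the backoff the commit message describes: an IP already in the twitchy table now always enters the block, and the 3-second crawl-delay floor is applied only when no proxies are available to absorb the load. A hedged standalone sketch of the clamp, assuming crawlDelayMS is in milliseconds:

#include <algorithm>

// Hypothetical distillation of the rule above: if the target IP is
// known to be twitchy (quick to throttle or ban spiders) and we cannot
// route through proxies, enforce a crawl delay of at least 3000 ms.
static int applyTwitchyBackoff ( bool ipIsTwitchy ,
                                 bool canUseProxies ,
                                 int  crawlDelayMS ) {
	if ( ipIsTwitchy && ! canUseProxies )
		crawlDelayMS = std::max ( crawlDelayMS , 3000 );
	return crawlDelayMS;
}

Folding the canUseProxies test into the body rather than the if-condition keeps the twitchy-IP branch as the single place where per-IP politeness is decided.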
@@ -2976,6 +2986,15 @@ bool addToHammerQueue ( Msg13Request *r ) {
 		crawlDelayMS = MAX_PROXYCRAWLDELAYMS;
 	}
 
+	if ( g_conf.m_logDebugSpider )
+		log(LOG_DEBUG,"spider: got timestamp of %"INT64" from "
+		    "hammercache (waited=%"INT64" crawlDelayMS=%"INT32") "
+		    "for %s"
+		    ,last
+		    ,waited
+		    ,crawlDelayMS
+		    ,iptoa(r->m_firstIp));
+
 	bool queueIt = false;
 	if ( last > 0 && waited < crawlDelayMS ) queueIt = true;
 	// a "last" of 0 means currently downloading
@@ -2993,11 +3012,15 @@ bool addToHammerQueue ( Msg13Request *r ) {
 	if ( queueIt ) {
 		// debug
 		log(LOG_INFO,
-		    "spider: adding %s to crawldelayqueue cd=%"INT32"ms",
-		    r->ptr_url,crawlDelayMS);
+		    "spider: adding %s to crawldelayqueue cd=%"INT32"ms "
+		    "ip=%s",
+		    r->ptr_url,crawlDelayMS,iptoa(r->m_urlIp));
 		// save this
 		//r->m_udpSlot = slot; // this is already saved!
 		r->m_nextLink = NULL;
+		// we gotta update the crawldelay here in case we modified
+		// it in the above logic.
+		r->m_crawlDelayMS = crawlDelayMS;
 		// add it to queue
 		if ( ! s_hammerQueueHead ) {
 			s_hammerQueueHead = r;
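The tail of the hunk also rewrites r->m_crawlDelayMS before queueing, so the floor applied above survives into dequeue time. The queue itself is a singly linked list hanging off s_hammerQueueHead; a self-contained mini-model of the append follows (the real code continues past the end of the hunk, so the tail handling here is guessed):

#include <cstddef>

// Hypothetical mini-model of the hammer-queue append: each request
// carries its own next pointer and an empty queue is a NULL head.
struct Req {
	int  m_crawlDelayMS;
	Req *m_nextLink;
};

static Req *s_head = NULL;

static void enqueue ( Req *r , int crawlDelayMS ) {
	// refresh the delay in case the backoff logic above raised it
	r->m_crawlDelayMS = crawlDelayMS;
	r->m_nextLink = NULL;
	if ( ! s_head ) { s_head = r; return; }
	// sketch only: walk to the end; the real code likely keeps a
	// tail pointer instead of scanning the whole list
	Req *t = s_head;
	while ( t->m_nextLink ) t = t->m_nextLink;
	t->m_nextLink = r;
}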
Parms.cpp (19 changed lines)
@@ -8787,13 +8787,15 @@ void Parms::init ( ) {
 	//
 	///////////////////////////////////////////
 
-	m->m_title = "use spider proxies";
-	m->m_desc = "Use the spider proxies listed below. If none are "
+	m->m_title = "always use spider proxies for all collections";
+	m->m_desc = "ALWAYS Use the spider proxies listed below for "
+		"spidering. If none are "
 		"listed then gb will not use any. Applies to all collections. "
 		"If you want to regulate this on a per collection basis then "
-		"set this to <b>NO</b> here and adjust the controls on the "
+		"set this to <b>NO</b> here and adjust the "
+		"proxy controls on the "
 		"<b>spider controls</b> page. If the list of proxy IPs below "
-		"is empty, then of course, no proxies can be used.";
+		"is empty, then of course, no proxies will be used.";
 	m->m_cgi = "useproxyips";
 	m->m_xml = "useSpiderProxies";
 	m->m_off = (char *)&g_conf.m_useProxyIps - g;
@@ -16444,7 +16446,7 @@ void Parms::init ( ) {
 	m++;
 
 
-	m->m_title = "use proxies for spidering";
+	m->m_title = "always use spider proxies";
 	m->m_desc = "If this is true Gigablast will ALWAYS use the proxies "
 		"listed on the <a href=/admin/proxies>proxies</a> page for "
 		"spidering for "
@@ -16464,12 +16466,13 @@ void Parms::init ( ) {
 		"if we detect that "
 		"a webserver is throttling the spiders. This way we can "
 		"learn the webserver's spidering policy so that our spiders "
-		"can be more polite. If not proxies are listed on the "
-		"proxies page then this parameter will have no affect.";
+		"can be more polite. If no proxies are listed on the "
+		"proxies page then this parameter will have no effect.";
 	m->m_cgi = "automaticallyuseproxies";
 	m->m_off = (char *)&cr.m_automaticallyUseProxies - x;
 	m->m_type = TYPE_BOOL;
-	m->m_def = "1";
+	m->m_def = "0";
 	m->m_group = 0;
 	m->m_page = PAGE_SPIDER;
 	m->m_obj = OBJ_COLL;
+	m->m_flags = PF_CLONE;
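Both Parms.cpp hunks follow the registration idiom used throughout Parms::init(): fill in a record's title, description, CGI variable name, byte offset of the backing field, type, and default, then advance m to the next slot. Flipping m_def from "1" to "0" makes automatic proxy use opt-in per collection. A simplified, hypothetical skeleton of that idiom:

// All types and names below are stand-ins for illustration only.
enum ParmType { TYPE_BOOL };

struct Parm {
	const char *m_title;
	const char *m_cgi;
	long        m_off;   // byte offset of the field inside the object
	ParmType    m_type;
	const char *m_def;   // default value, parsed according to m_type
};

struct CollRec { bool m_automaticallyUseProxies; };

static Parm s_parms[64];

static void initParmsSketch ( ) {
	CollRec cr;
	char *x = (char *)&cr;
	Parm *m = s_parms;
	m->m_title = "always use spider proxies";
	m->m_cgi   = "automaticallyuseproxies";
	m->m_off   = (char *)&cr.m_automaticallyUseProxies - x;
	m->m_type  = TYPE_BOOL;
	m->m_def   = "0";    // the commit flips this default from "1"
	m++;                 // the next control would be filled in here
}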
@@ -3074,8 +3074,17 @@ bool gotMxIp ( EmailInfo *ei ) {
 
 
 static void gotMandrillReplyWrapper ( void *state , TcpSocket *s ) {
-	// log the mandril reply
-	log("email: got mandrill reply: %s",s->m_readBuf);
+	// why core here with s NULL
+	if ( ! s ) {
+		// crap seems like we do not retry so they will not get
+		// the notification... how to fix better?
+		log("email: failed to lookup mandrill ip. sock is null.");
+		g_errno = EBADIP;
+	}
+	else {
+		// log the mandril reply
+		log("email: got mandrill reply: %s",s->m_readBuf);
+	}
 	EmailInfo *ei = (EmailInfo *)state;
 	if ( ei->m_callback ) ei->m_callback ( ei->m_state );
 }