If a root/seed URL has no outlinks, assume our IP was banned.
This commit is contained in:
24
Msg13.cpp
24
Msg13.cpp
@ -1159,12 +1159,16 @@ void doneReportingStatsWrapper ( void *state, UdpSlot *slot ) {
|
||||
s_55Out--;
|
||||
}
|
||||
|
||||
bool ipWasBanned ( TcpSocket *ts , const char **msg ) {
|
||||
bool ipWasBanned ( TcpSocket *ts , const char **msg , Msg13Request *r ) {
|
||||
|
||||
// ts will be null if we got a fake reply from a bulk job
|
||||
if ( ! ts )
|
||||
return false;
|
||||
|
||||
// do not do this on robots.txt files
|
||||
if ( r->m_isRobotsTxt )
|
||||
return false;
|
||||
|
||||
// g_errno is 104 for 'connection reset by peer'
|
||||
if ( g_errno == ECONNRESET ) {
|
||||
*msg = "connection reset";
|
||||
@ -1206,11 +1210,25 @@ bool ipWasBanned ( TcpSocket *ts , const char **msg ) {
|
||||
|
||||
// if it has link to "google.com/recaptcha"
|
||||
// TODO: use own gbstrstr so we can do QUICKPOLL(niceness)
|
||||
// TODO: ensure NOT in an invisible div
|
||||
if ( strstr ( ts->m_readBuf , "google.com/recaptcha/api/challenge") ) {
|
||||
*msg = "recaptcha link";
|
||||
return true;
|
||||
}
|
||||
|
||||
//CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );
|
||||
|
||||
// if it is a seed url and there are no links, then perhaps we
|
||||
// are in a blacklist somewhere already from triggering a spider trap
|
||||
if ( //isInSeedBuf ( cr , r->ptr_url ) &&
|
||||
// this is set in XmlDoc.cpp based on hopcount really
|
||||
r->m_isRootSeedUrl &&
|
||||
! strstr ( ts->m_readBuf, "<a href" ) ) {
|
||||
*msg = "root/seed url with no outlinks";
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// TODO: compare a simple checksum of the page content to what
|
||||
// we have downloaded previously from this domain or ip. if it
|
||||
// seems to be the same no matter what the url, then perhaps we
|
||||
@ -1236,7 +1254,7 @@ void gotHttpReply9 ( void *state , TcpSocket *ts ) {
|
||||
//bool banned = false;
|
||||
|
||||
//if ( ! g_errno )
|
||||
bool banned = ipWasBanned ( ts , &banMsg );
|
||||
bool banned = ipWasBanned ( ts , &banMsg , r );
|
||||
|
||||
if ( g_errno )
|
||||
log("msg13: got error from proxy: %s",mstrerror(g_errno));
|
||||
@ -1489,7 +1507,7 @@ void gotHttpReply2 ( void *state ,
|
||||
const char *banMsg = NULL;
|
||||
bool banned = false;
|
||||
if ( checkIfBanned )
|
||||
banned = ipWasBanned ( ts , &banMsg );
|
||||
banned = ipWasBanned ( ts , &banMsg , r );
|
||||
if ( banned )
|
||||
// should we turn proxies on for this IP address only?
|
||||
log("msg13: url %s detected as banned (%s), "
|
||||
|
1
Msg13.h
1
Msg13.h
@ -111,6 +111,7 @@ public:
|
||||
int32_t m_forceUseFloaters:1;
|
||||
|
||||
int32_t m_wasInTableBeforeStarting:1;
|
||||
int32_t m_isRootSeedUrl:1;
|
||||
|
||||
//int32_t m_testParserEnabled:1;
|
||||
//int32_t m_testSpiderEnabled:1;
|
||||
|
10
XmlDoc.cpp
10
XmlDoc.cpp
@ -15834,6 +15834,16 @@ char **XmlDoc::getHttpReply2 ( ) {
|
||||
//strcpy ( r->m_url , cu->getUrl() );
|
||||
r->ptr_url = cu->getUrl();
|
||||
r->size_url = cu->getUrlLen()+1;
|
||||
|
||||
// caution: m_sreq.m_hopCountValid is false sometimes for page parser
|
||||
// this is used for Msg13.cpp's ipWasBanned()
|
||||
// we use hopcount now instead of isInSeedBuf(cr,r->ptr_url)
|
||||
bool isInjecting = getIsInjecting();
|
||||
if ( ! isInjecting && m_sreqValid && m_sreq.m_hopCount == 0 )
|
||||
r->m_isRootSeedUrl = 1;
|
||||
if ( ! isInjecting && m_hopCountValid && m_hopCount == 0 )
|
||||
r->m_isRootSeedUrl = 1;
|
||||
|
||||
// sanity check
|
||||
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
|
||||
// max to download in bytes. currently 1MB.
|
||||
|
Reference in New Issue
Block a user