If a root/seed url has no outlinks, assume our IP was banned.

This commit is contained in:
Matt
2015-05-04 14:23:28 -07:00
parent 79b81ede00
commit 86800a0656
3 changed files with 32 additions and 3 deletions

@ -1159,12 +1159,16 @@ void doneReportingStatsWrapper ( void *state, UdpSlot *slot ) {
s_55Out--;
}
bool ipWasBanned ( TcpSocket *ts , const char **msg ) {
bool ipWasBanned ( TcpSocket *ts , const char **msg , Msg13Request *r ) {
// ts will be null if we got a fake reply from a bulk job
if ( ! ts )
return false;
// do not do this on robots.txt files
if ( r->m_isRobotsTxt )
return false;
// g_errno is 104 for 'connection reset by peer'
if ( g_errno == ECONNRESET ) {
*msg = "connection reset";
@ -1206,11 +1210,25 @@ bool ipWasBanned ( TcpSocket *ts , const char **msg ) {
// if it has link to "google.com/recaptcha"
// TODO: use own gbstrstr so we can do QUICKPOLL(niceness)
// TODO: ensure NOT in an invisible div
if ( strstr ( ts->m_readBuf , "google.com/recaptcha/api/challenge") ) {
*msg = "recaptcha link";
return true;
}
//CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );
// if it is a seed url and there are no links, then perhaps we
// are in a blacklist somewhere already from triggering a spider trap
if ( //isInSeedBuf ( cr , r->ptr_url ) &&
// this is set in XmlDoc.cpp based on hopcount really
r->m_isRootSeedUrl &&
! strstr ( ts->m_readBuf, "<a href" ) ) {
*msg = "root/seed url with no outlinks";
return true;
}
// TODO: compare a simple checksum of the page content to what
// we have downloaded previously from this domain or ip. if it
// seems to be the same no matter what the url, then perhaps we
@ -1236,7 +1254,7 @@ void gotHttpReply9 ( void *state , TcpSocket *ts ) {
//bool banned = false;
//if ( ! g_errno )
bool banned = ipWasBanned ( ts , &banMsg );
bool banned = ipWasBanned ( ts , &banMsg , r );
if ( g_errno )
log("msg13: got error from proxy: %s",mstrerror(g_errno));
@ -1489,7 +1507,7 @@ void gotHttpReply2 ( void *state ,
const char *banMsg = NULL;
bool banned = false;
if ( checkIfBanned )
banned = ipWasBanned ( ts , &banMsg );
banned = ipWasBanned ( ts , &banMsg , r );
if ( banned )
// should we turn proxies on for this IP address only?
log("msg13: url %s detected as banned (%s), "

@ -111,6 +111,7 @@ public:
int32_t m_forceUseFloaters:1;
int32_t m_wasInTableBeforeStarting:1;
int32_t m_isRootSeedUrl:1;
//int32_t m_testParserEnabled:1;
//int32_t m_testSpiderEnabled:1;

@ -15834,6 +15834,16 @@ char **XmlDoc::getHttpReply2 ( ) {
//strcpy ( r->m_url , cu->getUrl() );
r->ptr_url = cu->getUrl();
r->size_url = cu->getUrlLen()+1;
// caution: m_sreq.m_hopCountValid is false sometimes for page parser
// this is used for Msg13.cpp's ipWasBanned()
// we use hopcount now instead of isInSeedBuf(cr,r->ptr_url)
bool isInjecting = getIsInjecting();
if ( ! isInjecting && m_sreqValid && m_sreq.m_hopCount == 0 )
r->m_isRootSeedUrl = 1;
if ( ! isInjecting && m_hopCountValid && m_hopCount == 0 )
r->m_isRootSeedUrl = 1;
// sanity check
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
// max to download in bytes. currently 1MB.