Merge branch 'diffbot-testing' into ia

Matt 2015-05-02 14:46:29 -07:00
commit c54b1e429c
8 changed files with 185 additions and 70 deletions

@@ -1942,8 +1942,10 @@ bool CollectionRec::load ( char *coll , int32_t i ) {
sb.reset();
sb.safePrintf("%scoll.%s.%"INT32"/",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
m_twitchyTable.m_allocName = "twittbl";
m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
////////////

Conf.h

@@ -680,6 +680,7 @@ class Conf {
bool m_logDebugStats ;
bool m_logDebugSummary ;
bool m_logDebugSpider ;
bool m_logDebugMsg13 ;
bool m_logDebugUrlAttempts ;
bool m_logDebugTcp ;
bool m_logDebugThread ;

@@ -606,7 +606,8 @@ void handleRequest13 ( UdpSlot *slot , int32_t niceness ) {
}
// log it so we can see if we are hammering
if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads )
if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads ||
g_conf.m_logDebugMsg13 )
logf(LOG_DEBUG,"spider: DOWNLOADING %s firstIp=%s",
r->ptr_url,iptoa(r->m_firstIp));
@@ -669,7 +670,7 @@ void handleRequest13 ( UdpSlot *slot , int32_t niceness ) {
int32_t key = ((uint32_t)r->m_firstIp >> 8);
// send to host "h"
Host *h = g_hostdb.getBestSpiderCompressionProxy(&key);
if ( g_conf.m_logDebugSpider )
if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 )
log(LOG_DEBUG,"spider: sending to compression proxy "
"%s:%"UINT32"",iptoa(h->m_ip),(uint32_t)h->m_port);
// . otherwise, send the request to the key host
@@ -736,6 +737,7 @@ void downloadTheDocForReals ( Msg13Request *r ) {
bool firstInLine = s_rt.isEmpty ( &r->m_cacheKey );
// wait in line cuz someone else downloading it now
if ( ! s_rt.addKey ( &r->m_cacheKey , &r ) ) {
log("spider: error adding to waiting table %s",r->ptr_url);
g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
return;
}
@@ -830,7 +832,8 @@ void downloadTheDocForReals2 ( Msg13Request *r ) {
// sanity check
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// report it
log("spider: msg54 request: %s",mstrerror(g_errno));
log("spider: msg54 request1: %s %s",
mstrerror(g_errno),r->ptr_url);
// crap we gotta send back a reply i guess
g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
// g_errno must be set!
@@ -862,8 +865,8 @@ void gotProxyHostReplyWrapper ( void *state , UdpSlot *slot ) {
//int32_t replyAllocSize = slot->m_readBufMaxSize;
// bad reply? ip/port/LBid
if ( replySize != sizeof(ProxyReply) ) {
log("sproxy: bad 54 reply size of %"INT32" != %"INT32"",
replySize,(int32_t)sizeof(ProxyReply));
log("sproxy: bad 54 reply size of %"INT32" != %"INT32" %s",
replySize,(int32_t)sizeof(ProxyReply),r->ptr_url);
g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
return;
}
@@ -982,7 +985,7 @@ void downloadTheDocForReals3b ( Msg13Request *r ) {
// flag this
//if ( g_conf.m_qaBuildMode ) r->m_addToTestCache = true;
// note it here
if ( g_conf.m_logDebugSpider )
if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 )
log("spider: downloading %s (%s) (skiphammercheck=%"INT32")",
r->ptr_url,iptoa(r->m_urlIp) ,
(int32_t)r->m_skipHammerCheck);
@@ -1312,7 +1315,8 @@ void gotHttpReply9 ( void *state , TcpSocket *ts ) {
// sanity check
//if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// report it
if ( g_errno ) log("spider: msg54 request: %s",mstrerror(g_errno));
if ( g_errno ) log("spider: msg54 request2: %s %s",
mstrerror(g_errno),r->ptr_url);
// it failed i guess proceed
gotHttpReply( state , ts );
}
@@ -1461,17 +1465,18 @@ void gotHttpReply2 ( void *state ,
// log("hey");
// error?
if ( g_errno && g_conf.m_logDebugSpider )
if ( g_errno && ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 ) )
log("spider: http reply (msg13) had error = %s "
"for %s at ip %s",
mstrerror(g_errno),r->ptr_url,iptoa(r->m_urlIp));
mstrerror(savedErr),r->ptr_url,iptoa(r->m_urlIp));
bool inTable = false;
// must have a collrec to hold the ips
if ( cr && r->m_urlIp != 0 && r->m_urlIp != -1 )
inTable = isIpInTwitchyTable ( cr , r->m_urlIp );
// check if our ip seems banned
// check if our ip seems banned. if g_errno was ECONNRESET that
// is an indicator it was throttled/banned.
const char *banMsg = NULL;
bool banned = ipWasBanned ( ts , &banMsg );
if ( banned )
@@ -1501,6 +1506,8 @@ void gotHttpReply2 ( void *state ,
r->ptr_url);
// reset this so we don't endless loop it
r->m_wasInTableBeforeStarting = true;
// reset error
g_errno = 0;
// . and retry. it should use the proxy... or at least
// use a crawldelay of 3 seconds since we added it to the
// twitchy table.
@@ -1509,7 +1516,9 @@ void gotHttpReply2 ( void *state ,
return;
}
if ( banned && r->m_wasInTableBeforeStarting )
// do not print this if we are already using proxies, it is for
// the auto crawldelay backoff logic only
if ( banned && r->m_wasInTableBeforeStarting && ! r->m_proxyIp )
log("msg13: can not retry banned download of %s "
"because we knew ip was banned at start",r->ptr_url);
@@ -1535,9 +1544,10 @@ void gotHttpReply2 ( void *state ,
timeToAdd,iptoa(r->m_firstIp),r->ptr_url);
if ( g_conf.m_logDebugSpider )
log(LOG_DEBUG,"spider: got http reply for firstip=%s url=%s",
iptoa(r->m_firstIp),r->ptr_url);
if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 )
log(LOG_DEBUG,"spider: got http reply for firstip=%s url=%s "
"err=%s",
iptoa(r->m_firstIp),r->ptr_url,mstrerror(savedErr));
// sanity. this was happening from iframe download
@@ -1563,8 +1573,10 @@ void gotHttpReply2 ( void *state ,
savedErr , r );
// note it
if ( r->m_useTestCache && g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: got reply for %s firstIp=%s uh48=%"UINT64"",
if ( r->m_useTestCache &&
( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 ) )
logf(LOG_DEBUG,"spider: got reply for %s "
"firstIp=%s uh48=%"UINT64"",
r->ptr_url,iptoa(r->m_firstIp),r->m_urlHash48);
int32_t niceness = r->m_niceness;
@@ -1791,8 +1803,13 @@ void gotHttpReply2 ( void *state ,
// . returns false if blocks
// . returns true if did not block, sets g_errno on error
// . if it blocked it will recall THIS function
if ( ! getIframeExpandedContent ( r , ts ) )
if ( ! getIframeExpandedContent ( r , ts ) ) {
if ( g_conf.m_logDebugMsg13 ||
g_conf.m_logDebugSpider )
log("msg13: iframe expansion blocked %s",
r->ptr_url);
return;
}
// ok, did we have an error?
if ( g_errno )
log("scproxy: xml set for %s had error: %s",
@@ -1946,6 +1963,7 @@ void gotHttpReply2 ( void *state ,
char *compressedBuf = (char*)mmalloc(need, "Msg13Zip");
if ( ! compressedBuf ) {
g_errno = ENOMEM;
log("msg13: compression failed1 %s",r->ptr_url);
g_udpServer.sendErrorReply(slot,g_errno);
return;
}
@@ -1966,6 +1984,7 @@ void gotHttpReply2 ( void *state ,
zError(zipErr),(int32_t)zipErr,r->ptr_url);
mfree (compressedBuf, need, "Msg13ZipError");
g_errno = ECORRUPTDATA;
log("msg13: compression failed2 %s",r->ptr_url);
g_udpServer.sendErrorReply(slot,g_errno);
return;
}
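
These two hunks only add URL logging to the existing compression error paths;
the compression itself happens between them. A self-contained sketch of the
same allocate / compress / treat-errors-as-corrupt pattern using plain zlib
and malloc instead of gb's mmalloc and wrappers:

#include <zlib.h>
#include <cstdio>
#include <cstdlib>

// compress a reply buffer; returns NULL on allocation or zlib failure,
// otherwise a malloc'd buffer of *compressedSize bytes the caller frees
static char *compressReply ( const char *reply , uLong replySize ,
                             uLongf *compressedSize ) {
	uLongf need = compressBound ( replySize );    // worst-case output size
	char *buf = (char *)malloc ( need );
	if ( ! buf ) return NULL;                     // the ENOMEM path above
	int zipErr = compress ( (Bytef *)buf , &need ,
	                        (const Bytef *)reply , replySize );
	if ( zipErr != Z_OK ) {                       // the ECORRUPTDATA path above
		std::fprintf ( stderr , "compress failed: %s (%d)\n" ,
		               zError ( zipErr ) , zipErr );
		free ( buf );
		return NULL;
	}
	*compressedSize = need;
	return buf;
}
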
@@ -2083,7 +2102,8 @@ void gotHttpReply2 ( void *state ,
s_rt.removeSlot ( tableSlot );
// send back error? maybe...
if ( err ) {
if ( g_conf.m_logDebugSpider )
if ( g_conf.m_logDebugSpider ||
g_conf.m_logDebugMsg13 )
log("proxy: msg13: sending back error: %s "
"for url %s with ip %s",
mstrerror(err),
@@ -2092,6 +2112,9 @@ void gotHttpReply2 ( void *state ,
g_udpServer.sendErrorReply ( slot , err );
continue;
}
// for debug for now
if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 )
log("msg13: sending reply for %s",r->ptr_url);
// send reply
us->sendReply_ass ( copy,replySize,copy,copyAllocSize, slot );
// now final udp slot will free the reply, so tcp server
@@ -2112,6 +2135,9 @@ void gotHttpReply2 ( void *state ,
// we free it - if it was never sent over a udp slot
if ( savedErr && compressed )
mfree ( reply , replyAllocSize , "msg13ubuf" );
if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 )
log("msg13: handled reply ok %s",r->ptr_url);
}

@@ -16519,11 +16519,14 @@ void Parms::init ( ) {
"that an IP is throttling or banning gigabot from crawling "
"it. The crawl delay just applies to that IP. "
"Such throttling will be logged.";
m->m_cgi = "autobackoff";
m->m_cgi = "automaticallybackoff";
m->m_xml = "automaticallyBackOff";
m->m_off = (char *)&cr.m_automaticallyBackOff - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
// a lot of pages have recaptcha links but they have valid content
// so leave this off for now... they have it in a hidden div which
// pops up to email the article link or whatever to someone.
m->m_def = "0";
m->m_group = 0;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
@@ -19552,6 +19555,16 @@ void Parms::init ( ) {
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug msg13 messages";
m->m_cgi = "ldspmth";
m->m_off = (char *)&g_conf.m_logDebugMsg13 - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug spider proxies";
m->m_cgi = "ldspr";
m->m_off = (char *)&g_conf.m_logDebugProxies - g;
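
Both new Parms entries bind a cgi/xml name to a member of g_conf by storing
the member's byte offset (m_off = (char *)&g_conf.m_logDebugMsg13 - g). A
self-contained sketch of that offset-based binding, with simplified stand-in
types rather than the real Conf/Parm classes:

#include <cstdio>

struct MiniConf { bool m_logDebugSpider; bool m_logDebugMsg13; };
struct MiniParm { const char *m_cgi; long m_off; };

int main ( ) {
	MiniConf conf = { false , false };
	char *g = (char *)&conf;

	// registration time: remember where the member lives
	MiniParm p;
	p.m_cgi = "ldspmth";
	p.m_off = (char *)&conf.m_logDebugMsg13 - g;

	// later, when a cgi parm or the xml config sets it, generic code
	// that only has the MiniParm record can still reach the member
	*(bool *)(g + p.m_off) = true;

	std::printf ( "%s -> %d\n" , p.m_cgi , (int)conf.m_logDebugMsg13 );
	return 0;
}
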

@@ -797,6 +797,8 @@ bool Spiderdb::verify ( char *coll ) {
key128_t Spiderdb::makeKey ( int32_t firstIp ,
int64_t urlHash48 ,
bool isRequest ,
// MDW: now we use timestamp instead of parentdocid
// for spider replies. so they do not dedup...
int64_t parentDocId ,
bool isDel ) {
key128_t k;
@@ -814,6 +816,9 @@ key128_t Spiderdb::makeKey ( int32_t firstIp ,
if ( isRequest ) k.n0 |= 0x01;
// parent docid
k.n0 <<= 38;
// if we are making a spider reply key just leave the parentdocid as 0
// so we only store one reply per url. the last reply we got.
// if ( isRequest ) k.n0 |= parentDocId & DOCID_MASK;
k.n0 |= parentDocId & DOCID_MASK;
// reserved (padding)
k.n0 <<= 8;
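
The new comments explain why only one spider reply survives per url: the
reply key leaves parentDocId at 0, so every reply for the same url maps to
the same key and the newest write wins. A sketch of the bit-packing pattern
visible in this hunk (the 38-bit docid width is an assumption based on
DOCID_MASK; the remaining fields of the real 128-bit key are not shown here):

#include <cstdint>

static const uint64_t DOCID_MASK_SKETCH = (1ULL << 38) - 1;  // assumed 38-bit docid field

// pack the low-64-bit fields shown above: request/reply bit, parent docid,
// reserved padding byte
static uint64_t packLow64 ( bool isRequest , uint64_t parentDocId ) {
	uint64_t n0 = 0;
	if ( isRequest ) n0 |= 0x01;            // request vs reply discriminator
	n0 <<= 38;                              // make room for the parent docid
	n0 |= parentDocId & DOCID_MASK_SKETCH;  // 0 for replies -> one key per url
	n0 <<= 8;                               // reserved / padding
	return n0;
}
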
@@ -1802,8 +1807,13 @@ void SpiderColl::clearLocks ( ) {
void SpiderColl::reset ( ) {
m_numSuccessReplies = 0;
m_numFailedReplies = 0;
// these don't work because we only store one reply
// which overwrites any older reply. that's how the
// key is. we can change the key to use the timestamp
// and not parent docid in makeKey() for spider
// replies later.
// m_numSuccessReplies = 0;
// m_numFailedReplies = 0;
// reset these for SpiderLoop;
m_nextDoledbKey.setMin();
@@ -3980,15 +3990,65 @@ bool SpiderColl::scanListForWinners ( ) {
// see if this is the most recent one
SpiderReply *tmp = (SpiderReply *)rec;
// reset reply stats if beginning a new url
if ( srepUh48 != tmp->getUrlHash48() ) {
m_numSuccessReplies = 0;
m_numFailedReplies = 0;
// . MDW: we have to detect corrupt replies up here so
// they do not become the winning reply because
// their date is in the future!!
// . this is -1 on corruption
// . i've seen -31757, 21... etc for bad http replies
// in the qatest123 doc cache... so turn off for that
if ( tmp->m_httpStatus >= 1000 ) {
if ( m_cr->m_spiderCorruptCount == 0 ) {
log("spider: got corrupt 3 "
"spiderReply in "
"scan "
"uh48=%"INT64" "
"httpstatus=%"INT32" "
"(cn=%"INT32")",
tmp->getUrlHash48(),
(int32_t)tmp->m_httpStatus,
(int32_t)m_collnum);
}
m_cr->m_spiderCorruptCount++;
// don't nuke it just for that...
//srep = NULL;
continue;
}
// bad langid?
if ( ! getLanguageAbbr (tmp->m_langId) ) {
log("spider: got corrupt 4 spiderReply in "
"scan uh48=%"INT64" "
"langid=%"INT32" (cn=%"INT32")",
tmp->getUrlHash48(),
(int32_t)tmp->m_langId,
(int32_t)m_collnum);
m_cr->m_spiderCorruptCount++;
//srep = NULL;
// if ( tmp->getUrlHash48() ==
// 271713196158770LL )
// log("hey");
continue;
}
// reset reply stats if beginning a new url
// these don't work because we only store one reply
// which overwrites any older reply. that's how the
// key is. we can change the key to use the timestamp
// and not parent docid in makeKey() for spider
// replies later.
// if ( srepUh48 != tmp->getUrlHash48() ) {
// m_numSuccessReplies = 0;
// m_numFailedReplies = 0;
// }
// inc stats
if ( tmp->m_errCode == 0 ) m_numSuccessReplies++;
else m_numFailedReplies ++;
// these don't work because we only store one reply
// which overwrites any older reply. that's how the
// key is. we can change the key to use the timestamp
// and not parent docid in makeKey() for spider
// replies later.
// if ( tmp->m_errCode == 0 ) m_numSuccessReplies++;
// else m_numFailedReplies ++;
// if we have a more recent reply already, skip this
if ( srep &&
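
The point of moving these checks up, per the MDW comment, is that a corrupt
reply can carry a bogus (even future) timestamp and has to be rejected before
the "is this the most recent reply?" comparison, or it wins forever. A
self-contained sketch of that validate-before-compare pattern, with stand-in
types rather than the real SpiderReply:

#include <cstddef>
#include <cstdint>
#include <vector>

struct ReplySketch { int32_t httpStatus; int32_t langId; int64_t spideredTime; };

static bool looksValid ( const ReplySketch &r ) {
	if ( r.httpStatus >= 1000 ) return false;            // garbage statuses seen on corruption
	if ( r.langId < 0 || r.langId > 100 ) return false;  // stand-in for getLanguageAbbr() == NULL
	return true;
}

static const ReplySketch *pickMostRecent ( const std::vector<ReplySketch> &replies ) {
	const ReplySketch *best = NULL;
	for ( size_t i = 0 ; i < replies.size() ; i++ ) {
		if ( ! looksValid ( replies[i] ) ) continue;  // corrupt replies never win
		if ( ! best || replies[i].spideredTime > best->spideredTime )
			best = &replies[i];
	}
	return best;
}
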
@@ -4010,10 +4070,14 @@ bool SpiderColl::scanListForWinners ( ) {
int64_t uh48 = sreq->getUrlHash48();
// reset reply stats if beginning a new url
if ( ! srep ) {
m_numSuccessReplies = 0;
m_numFailedReplies = 0;
}
// these don't work because we only store one reply
// which overwrites any older reply. that's how the key is.
// we can change the key to use the timestamp and not
// parent docid in makeKey() for spider replies later.
// if ( ! srep ) {
// m_numSuccessReplies = 0;
// m_numFailedReplies = 0;
// }
// . skip if our twin should add it to doledb
// . waiting tree only has firstIps assigned to us so
@@ -4100,8 +4164,13 @@ bool SpiderColl::scanListForWinners ( ) {
// put these in the spiderequest in doledb so we can
// show in the json spider status docs in
// XmlDoc::getSpiderStatusDocMetaList2()
sreq->m_reservedc1 = m_numSuccessReplies;
sreq->m_reservedc2 = m_numFailedReplies;
// these don't work because we only store one reply
// which overwrites any older reply. that's how the
// key is. we can change the key to use the timestamp
// and not parent docid in makeKey() for spider
// replies later.
// sreq->m_reservedc1 = m_numSuccessReplies;
// sreq->m_reservedc2 = m_numFailedReplies;
m_lastSreqUh48 = uh48;
m_lastCBlockIp = cblock;
@@ -4256,28 +4325,6 @@ bool SpiderColl::scanListForWinners ( ) {
// if we tried it before
sreq->m_hadReply = true;
}
// . this is -1 on corruption
// . i've seen -31757, 21... etc for bad http replies
// in the qatest123 doc cache... so turn off for that
if ( srep && srep->m_httpStatus >= 1000 ) {
if ( m_cr->m_spiderCorruptCount == 0 ) {
log("spider: got corrupt 3 spiderReply in "
"scan httpstatus=%"INT32" (cn=%"INT32")",
(int32_t)srep->m_httpStatus,
(int32_t)m_collnum);
}
m_cr->m_spiderCorruptCount++;
// don't nuke it just for that...
//srep = NULL;
}
// bad langid?
if ( srep && ! getLanguageAbbr (srep->m_langId) ) {
log("spider: got corrupt 4 spiderReply in scan "
"langid=%"INT32" (cn=%"INT32")",
(int32_t)srep->m_langId,
(int32_t)m_collnum);
srep = NULL;
}
// . get the url filter we match
// . if this is slow see the TODO below in dedupSpiderdbList()
@@ -4310,7 +4357,8 @@ bool SpiderColl::scanListForWinners ( ) {
if ( priority >= MAX_SPIDER_PRIORITIES) {char *xx=NULL;*xx=0;}
if ( g_conf.m_logDebugSpider )
log("spider: got ufn=%"INT32" for %s",ufn,sreq->m_url);
log("spider: got ufn=%"INT32" for %s (%"INT64"",
ufn,sreq->m_url,sreq->getUrlHash48());
if ( g_conf.m_logDebugSpider && srep )
log("spider: lastspidered=%"UINT32"",

@@ -1155,8 +1155,13 @@ class SpiderColl {
int32_t m_tailHopCount;
int64_t m_minFutureTimeMS;
int32_t m_numSuccessReplies;
int32_t m_numFailedReplies;
// these don't work because we only store one reply
// which overwrites any older reply. that's how the
// key is. we can change the key to use the timestamp
// and not parent docid in makeKey() for spider
// replies later.
// int32_t m_numSuccessReplies;
// int32_t m_numFailedReplies;
// . do not re-send CrawlInfoLocal for a coll if not update
// . we store the flags in here as true if we should send our

@@ -2861,6 +2861,10 @@ int TcpServer::sslHandshake ( TcpSocket *s ) {
(int32_t)sslError,r,iptoa(s->m_ip),sslMsg);
g_errno = ESSLERROR;
// note in log
log("tcp: ssl: try running "
"'openssl s_client -connect www.hostnamehere.com:443 "
"-debug' to debug the webserver on the other side.");
// make sure read callback is registered
// g_loop.registerReadCallback (s->m_sd,this,readSocketWrapper,
// s->m_niceness);

@@ -2426,8 +2426,8 @@ bool XmlDoc::indexDoc ( ) {
"error reply.",
m_firstUrl.m_url,mstrerror(g_errno));
else if ( g_errno )
log("build: docid=%"INT64" had internal error = %s. adding spider "
"error reply.",
log("build: docid=%"INT64" had internal error = %s. "
"adding spider error reply.",
m_docId,mstrerror(g_errno));
// seems like this was causing a core somehow...
@@ -2450,6 +2450,16 @@ bool XmlDoc::indexDoc ( ) {
m_indexCodeValid = true;
}
// this should not be retried either. i am seeing it excessively
// retried from a
// "TitleRec::set: uncompress uncompressed size=-2119348471"
// error condition. it also said
// "Error spidering for doc http://www.... : Bad cached document"
if ( g_errno == EBADTITLEREC ) {
m_indexCode = g_errno;
m_indexCodeValid = true;
}
// i've seen Multicast got error in reply from hostId 19 (msgType=0x22
// transId=496026 nice=1 net=default): Buf too small.
// so fix that with this
@@ -2468,7 +2478,7 @@ bool XmlDoc::indexDoc ( ) {
m_indexCodeValid = true;
}
// default to internal error which will be retried forever otherwise
if ( ! m_indexCodeValid ) {
m_indexCode = EINTERNALERROR;//g_errno;
m_indexCodeValid = true;
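
These indexDoc() hunks feed one mechanism: errors that retrying cannot fix,
such as the corrupt cached titlerec added here, are promoted to the
document's permanent index code so a spider error reply gets recorded, while
anything unclassified falls through to EINTERNALERROR and keeps being
retried. A sketch of that classification with stand-in error codes, not gb's:

// stand-in error codes for the sketch only
enum SketchErr { SK_OK = 0 , SK_EBADTITLEREC , SK_EINTERNALERROR };

// returns the index code to record for a spidering error
static int classifyIndexError ( int err ) {
	if ( err == SK_EBADTITLEREC )   // re-reading the same corrupt titlerec
		return err;             // cannot help, so record the failure
	return SK_EINTERNALERROR;       // unclassified: retried until it works
}
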
@@ -25494,6 +25504,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
// store it
m_srep.m_firstIp = firstIp;
// assume no error
// MDW: not right...
m_srep.m_errCount = 0;
// otherwise, inherit from oldsr to be safe
//if ( m_sreqValid )
@@ -28153,14 +28164,19 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
m_docIdWeAreADupOf);
// how many spiderings were successful vs. failed
if ( m_sreqValid ) {
jd.safePrintf("\"gbssPrevTotalNumIndexAttempts\":%"INT32",\n",
m_sreq.m_reservedc1 + m_sreq.m_reservedc2 );
jd.safePrintf("\"gbssPrevTotalNumIndexSuccesses\":%"INT32",\n",
m_sreq.m_reservedc1);
jd.safePrintf("\"gbssPrevTotalNumIndexFailures\":%"INT32",\n",
m_sreq.m_reservedc2);
}
// these don't work because we only store one reply
// which overwrites any older reply. that's how the
// key is. we can change the key to use the timestamp
// and not parent docid in makeKey() for spider
// replies later.
// if ( m_sreqValid ) {
// jd.safePrintf("\"gbssPrevTotalNumIndexAttempts\":%"INT32",\n",
// m_sreq.m_reservedc1 + m_sreq.m_reservedc2 );
// jd.safePrintf("\"gbssPrevTotalNumIndexSuccesses\":%"INT32",\n",
// m_sreq.m_reservedc1);
// jd.safePrintf("\"gbssPrevTotalNumIndexFailures\":%"INT32",\n",
// m_sreq.m_reservedc2);
// }
if ( m_spideredTimeValid )
jd.safePrintf("\"gbssSpiderTime\":%"INT32",\n",