Merge branch 'diffbot-testing' into ia

commit c54b1e429c
Collectiondb.cpp

@@ -1942,8 +1942,10 @@ bool CollectionRec::load ( char *coll , int32_t i ) {
        sb.reset();
        sb.safePrintf("%scoll.%s.%"INT32"/",
                      g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
+       m_twitchyTable.m_allocName = "twittbl";
+       m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );




        ////////////
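Note: a minimal, self-contained illustration of where the table loaded
above lives on disk. The directory, coll name and collnum here are
assumptions, not from this commit, and gb's "%"INT32"" format macro is
replaced by the standard PRId32:

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>

    int main ( ) {
            char dir[1024];
            // same format string the hunk above uses to build the coll dir
            snprintf ( dir , sizeof(dir) , "%scoll.%s.%" PRId32 "/" ,
                       "/var/gb/" , "main" , (int32_t)0 );
            // prints: /var/gb/coll.main.0/ipstouseproxiesfor.dat
            printf ( "%s%s\n" , dir , "ipstouseproxiesfor.dat" );
            return 0;
    }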

Conf.h (1 changed line)
@@ -680,6 +680,7 @@ class Conf {
        bool m_logDebugStats ;
        bool m_logDebugSummary ;
        bool m_logDebugSpider ;
+       bool m_logDebugMsg13 ;
        bool m_logDebugUrlAttempts ;
        bool m_logDebugTcp ;
        bool m_logDebugThread ;

Msg13.cpp (62 changed lines)
@@ -606,7 +606,8 @@ void handleRequest13 ( UdpSlot *slot , int32_t niceness ) {
        }

        // log it so we can see if we are hammering
-       if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads )
+       if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads ||
+            g_conf.m_logDebugMsg13 )
                logf(LOG_DEBUG,"spider: DOWNLOADING %s firstIp=%s",
                     r->ptr_url,iptoa(r->m_firstIp));

@@ -669,7 +670,7 @@ void handleRequest13 ( UdpSlot *slot , int32_t niceness ) {
        int32_t key = ((uint32_t)r->m_firstIp >> 8);
        // send to host "h"
        Host *h = g_hostdb.getBestSpiderCompressionProxy(&key);
-       if ( g_conf.m_logDebugSpider )
+       if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 )
                log(LOG_DEBUG,"spider: sending to compression proxy "
                    "%s:%"UINT32"",iptoa(h->m_ip),(uint32_t)h->m_port);
        // . otherwise, send the request to the key host
@@ -736,6 +737,7 @@ void downloadTheDocForReals ( Msg13Request *r ) {
        bool firstInLine = s_rt.isEmpty ( &r->m_cacheKey );
        // wait in line cuz someone else downloading it now
        if ( ! s_rt.addKey ( &r->m_cacheKey , &r ) ) {
+               log("spider: error adding to waiting table %s",r->ptr_url);
                g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
                return;
        }
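Note: the new log line covers the failure path of the wait-in-line pattern
used here. A condensed sketch of that pattern with simplified types (the
real code keys gb's HashTableX s_rt on r->m_cacheKey, and addKey() can fail
on allocation, which is what the new log reports):

    #include <map>

    struct Msg13Request;

    // one download per cache key: later requesters wait on the first
    static std::multimap<long long, Msg13Request *> s_waiting;

    // returns true if the caller should start the download itself
    bool queueOrDownload ( long long cacheKey , Msg13Request *r ) {
            bool firstInLine = ( s_waiting.count ( cacheKey ) == 0 );
            // everyone parks in the table, including the downloader;
            // when the reply arrives all waiters are answered at once
            s_waiting.insert ( std::make_pair ( cacheKey , r ) );
            return firstInLine;
    }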
@@ -830,7 +832,8 @@ void downloadTheDocForReals2 ( Msg13Request *r ) {
        // sanity check
        if ( ! g_errno ) { char *xx=NULL;*xx=0; }
        // report it
-       log("spider: msg54 request: %s",mstrerror(g_errno));
+       log("spider: msg54 request1: %s %s",
+           mstrerror(g_errno),r->ptr_url);
        // crap we gotta send back a reply i guess
        g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
        // g_errno must be set!
@@ -862,8 +865,8 @@ void gotProxyHostReplyWrapper ( void *state , UdpSlot *slot ) {
        //int32_t replyAllocSize = slot->m_readBufMaxSize;
        // bad reply? ip/port/LBid
        if ( replySize != sizeof(ProxyReply) ) {
-               log("sproxy: bad 54 reply size of %"INT32" != %"INT32"",
-                   replySize,(int32_t)sizeof(ProxyReply));
+               log("sproxy: bad 54 reply size of %"INT32" != %"INT32" %s",
+                   replySize,(int32_t)sizeof(ProxyReply),r->ptr_url);
                g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
                return;
        }
@@ -982,7 +985,7 @@ void downloadTheDocForReals3b ( Msg13Request *r ) {
        // flag this
        //if ( g_conf.m_qaBuildMode ) r->m_addToTestCache = true;
        // note it here
-       if ( g_conf.m_logDebugSpider )
+       if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 )
                log("spider: downloading %s (%s) (skiphammercheck=%"INT32")",
                    r->ptr_url,iptoa(r->m_urlIp) ,
                    (int32_t)r->m_skipHammerCheck);
@@ -1312,7 +1315,8 @@ void gotHttpReply9 ( void *state , TcpSocket *ts ) {
        // sanity check
        //if ( ! g_errno ) { char *xx=NULL;*xx=0; }
        // report it
-       if ( g_errno ) log("spider: msg54 request: %s",mstrerror(g_errno));
+       if ( g_errno ) log("spider: msg54 request2: %s %s",
+                          mstrerror(g_errno),r->ptr_url);
        // it failed i guess proceed
        gotHttpReply( state , ts );
 }
@@ -1461,17 +1465,18 @@ void gotHttpReply2 ( void *state ,
        // log("hey");

        // error?
-       if ( g_errno && g_conf.m_logDebugSpider )
+       if ( g_errno && ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 ) )
                log("spider: http reply (msg13) had error = %s "
                    "for %s at ip %s",
-                   mstrerror(g_errno),r->ptr_url,iptoa(r->m_urlIp));
+                   mstrerror(savedErr),r->ptr_url,iptoa(r->m_urlIp));

        bool inTable = false;
        // must have a collrec to hold the ips
        if ( cr && r->m_urlIp != 0 && r->m_urlIp != -1 )
                inTable = isIpInTwitchyTable ( cr , r->m_urlIp );

-       // check if our ip seems banned
+       // check if our ip seems banned. if g_errno was ECONNRESET that
+       // is an indicator it was throttled/banned.
        const char *banMsg = NULL;
        bool banned = ipWasBanned ( ts , &banMsg );
        if ( banned )
@@ -1501,6 +1506,8 @@ void gotHttpReply2 ( void *state ,
                    r->ptr_url);
                // reset this so we don't endless loop it
                r->m_wasInTableBeforeStarting = true;
                // reset error
                g_errno = 0;
                /// and retry. it should use the proxy... or at least
+               // use a crawldelay of 3 seconds since we added it to the
+               // twitchy table.
@@ -1509,7 +1516,9 @@ void gotHttpReply2 ( void *state ,
                return;
        }

-       if ( banned && r->m_wasInTableBeforeStarting )
+       // do not print this if we are already using proxies, it is for
+       // the auto crawldelay backoff logic only
+       if ( banned && r->m_wasInTableBeforeStarting && ! r->m_proxyIp )
                log("msg13: can not retry banned download of %s "
                    "because we knew ip was banned at start",r->ptr_url);

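Note: the three hunks above are the core of the auto-backoff change. A
condensed sketch of the intended control flow; ipWasBanned(),
isIpInTwitchyTable() and downloadTheDocForReals2() appear in this diff,
while addIpToTwitchyTable() is an assumed helper name for illustration:

    // sketch: a banned-looking reply (e.g. ECONNRESET) puts the ip in
    // the per-collection twitchy table and retries once with a crawl
    // delay; if the ip was already twitchy at start and no proxy is in
    // use, give up instead of looping forever
    void handleBanSignal ( Msg13Request *r , TcpSocket *ts ,
                           CollectionRec *cr ) {
            const char *banMsg = NULL;
            bool banned = ipWasBanned ( ts , &banMsg );
            if ( banned && ! r->m_wasInTableBeforeStarting ) {
                    addIpToTwitchyTable ( cr , r->m_urlIp ); // assumed
                    r->m_wasInTableBeforeStarting = true;    // no loops
                    g_errno = 0;                             // reset error
                    downloadTheDocForReals2 ( r );  // retry w/ crawldelay
                    return;
            }
            if ( banned && r->m_wasInTableBeforeStarting && ! r->m_proxyIp )
                    log("msg13: can not retry banned download of %s "
                        "because we knew ip was banned at start",
                        r->ptr_url);
    }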
@@ -1535,9 +1544,10 @@ void gotHttpReply2 ( void *state ,
                    timeToAdd,iptoa(r->m_firstIp),r->ptr_url);


-       if ( g_conf.m_logDebugSpider )
-               log(LOG_DEBUG,"spider: got http reply for firstip=%s url=%s",
-                   iptoa(r->m_firstIp),r->ptr_url);
+       if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 )
+               log(LOG_DEBUG,"spider: got http reply for firstip=%s url=%s "
+                   "err=%s",
+                   iptoa(r->m_firstIp),r->ptr_url,mstrerror(savedErr));


        // sanity. this was happening from iframe download
@@ -1563,8 +1573,10 @@ void gotHttpReply2 ( void *state ,
                               savedErr , r );

        // note it
-       if ( r->m_useTestCache && g_conf.m_logDebugSpider )
-               logf(LOG_DEBUG,"spider: got reply for %s firstIp=%s uh48=%"UINT64"",
+       if ( r->m_useTestCache &&
+            ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 ) )
+               logf(LOG_DEBUG,"spider: got reply for %s "
+                    "firstIp=%s uh48=%"UINT64"",
                     r->ptr_url,iptoa(r->m_firstIp),r->m_urlHash48);

        int32_t niceness = r->m_niceness;
@@ -1791,8 +1803,13 @@ void gotHttpReply2 ( void *state ,
        // . returns false if blocks
        // . returns true if did not block, sets g_errno on error
        // . if it blocked it will recall THIS function
-       if ( ! getIframeExpandedContent ( r , ts ) )
+       if ( ! getIframeExpandedContent ( r , ts ) ) {
+               if ( g_conf.m_logDebugMsg13 ||
+                    g_conf.m_logDebugSpider )
+                       log("msg13: iframe expansion blocked %s",
+                           r->ptr_url);
                return;
+       }
        // ok, did we have an error?
        if ( g_errno )
                log("scproxy: xml set for %s had error: %s",
@@ -1946,6 +1963,7 @@ void gotHttpReply2 ( void *state ,
        char *compressedBuf = (char*)mmalloc(need, "Msg13Zip");
        if ( ! compressedBuf ) {
                g_errno = ENOMEM;
+               log("msg13: compression failed1 %s",r->ptr_url);
                g_udpServer.sendErrorReply(slot,g_errno);
                return;
        }
@@ -1966,6 +1984,7 @@ void gotHttpReply2 ( void *state ,
                    zError(zipErr),(int32_t)zipErr,r->ptr_url);
                mfree (compressedBuf, need, "Msg13ZipError");
                g_errno = ECORRUPTDATA;
+               log("msg13: compression failed2 %s",r->ptr_url);
                g_udpServer.sendErrorReply(slot,g_errno);
                return;
        }
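Note: both "compression failed" logs sit on the reply-compression path. An
equivalent, self-contained sketch using plain zlib (the real code uses gb's
mmalloc()/mfree() wrappers and its ENOMEM/ECORRUPTDATA error codes):

    #include <cstdio>
    #include <cstdlib>
    #include <zlib.h>

    // compress a reply buffer; returns NULL (and logs) on either failure
    unsigned char *compressReply ( const unsigned char *doc ,
                                   uLong docSize , uLongf *outSize ) {
            uLong need = compressBound ( docSize );
            unsigned char *buf = (unsigned char *)malloc ( need );
            if ( ! buf ) {
                    // "compression failed1": allocation failed (ENOMEM)
                    fprintf ( stderr , "msg13: compression failed1\n" );
                    return NULL;
            }
            *outSize = need;
            int zipErr = compress2 ( buf , outSize , doc , docSize ,
                                     Z_DEFAULT_COMPRESSION );
            if ( zipErr != Z_OK ) {
                    // "compression failed2": deflate error (ECORRUPTDATA)
                    fprintf ( stderr , "msg13: compression failed2 (%s)\n" ,
                              zError ( zipErr ) );
                    free ( buf );
                    return NULL;
            }
            return buf;
    }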
@@ -2083,7 +2102,8 @@ void gotHttpReply2 ( void *state ,
                s_rt.removeSlot ( tableSlot );
                // send back error? maybe...
                if ( err ) {
-                       if ( g_conf.m_logDebugSpider )
+                       if ( g_conf.m_logDebugSpider ||
+                            g_conf.m_logDebugMsg13 )
                                log("proxy: msg13: sending back error: %s "
                                    "for url %s with ip %s",
                                    mstrerror(err),
@@ -2092,6 +2112,9 @@ void gotHttpReply2 ( void *state ,
                        g_udpServer.sendErrorReply ( slot , err );
                        continue;
                }
+               // for debug for now
+               if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 )
+                       log("msg13: sending reply for %s",r->ptr_url);
                // send reply
                us->sendReply_ass ( copy,replySize,copy,copyAllocSize, slot );
                // now final udp slot will free the reply, so tcp server
@@ -2112,6 +2135,9 @@ void gotHttpReply2 ( void *state ,
        // we free it - if it was never sent over a udp slot
        if ( savedErr && compressed )
                mfree ( reply , replyAllocSize , "msg13ubuf" );
+
+       if ( g_conf.m_logDebugSpider || g_conf.m_logDebugMsg13 )
+               log("msg13: handled reply ok %s",r->ptr_url);
 }


Parms.cpp (17 changed lines)
@@ -16519,11 +16519,14 @@ void Parms::init ( ) {
                "that an IP is throttling or banning gigabot from crawling "
                "it. The crawl delay just applies to that IP. "
                "Such throttling will be logged.";
-       m->m_cgi = "autobackoff";
+       m->m_cgi = "automaticallybackoff";
        m->m_xml = "automaticallyBackOff";
        m->m_off = (char *)&cr.m_automaticallyBackOff - x;
        m->m_type = TYPE_BOOL;
-       m->m_def = "1";
+       // a lot of pages have recaptcha links but they have valid content
+       // so leave this off for now... they have it in a hidden div which
+       // popups to email the article link or whatever to someone.
+       m->m_def = "0";
        m->m_group = 0;
        m->m_page = PAGE_SPIDER;
        m->m_obj = OBJ_COLL;
@@ -19552,6 +19555,16 @@ void Parms::init ( ) {
        m->m_obj = OBJ_CONF;
        m++;

+       m->m_title = "log debug msg13 messages";
+       m->m_cgi = "ldspmth";
+       m->m_off = (char *)&g_conf.m_logDebugMsg13 - g;
+       m->m_type = TYPE_BOOL;
+       m->m_def = "0";
+       m->m_priv = 1;
+       m->m_page = PAGE_LOG;
+       m->m_obj = OBJ_CONF;
+       m++;
+
        m->m_title = "log debug spider proxies";
        m->m_cgi = "ldspr";
        m->m_off = (char *)&g_conf.m_logDebugProxies - g;

Spider.cpp (122 changed lines)
@@ -797,6 +797,8 @@ bool Spiderdb::verify ( char *coll ) {
 key128_t Spiderdb::makeKey ( int32_t firstIp ,
                             int64_t urlHash48 ,
                             bool isRequest ,
+                            // MDW: now we use timestamp instead of parentdocid
+                            // for spider replies. so they do not dedup...
                             int64_t parentDocId ,
                             bool isDel ) {
        key128_t k;
@@ -814,6 +816,9 @@ key128_t Spiderdb::makeKey ( int32_t firstIp ,
        if ( isRequest ) k.n0 |= 0x01;
        // parent docid
        k.n0 <<= 38;
+       // if we are making a spider reply key just leave the parentdocid as 0
+       // so we only store one reply per url. the last reply we got.
+       // if ( isRequest ) k.n0 |= parentDocId & DOCID_MASK;
        k.n0 |= parentDocId & DOCID_MASK;
        // reserved (padding)
        k.n0 <<= 8;
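Note: the new comments describe the key property that several later hunks
work around: all replies for a url collide on one key unless the 38-bit
parentDocId field differs, so callers now pass a timestamp there for
replies. A simplified sketch of the low 64 bits of the key (bit widths
taken from the code above; not the full production layout):

    #include <cstdint>

    static const int64_t DOCID_MASK38 = ( 1LL << 38 ) - 1;

    uint64_t makeKeyLow ( int64_t urlHash48 , bool isRequest ,
                          int64_t parentDocId ) {
            // last 16 bits of the 48-bit url hash (the rest lives in
            // the high half of the key)
            uint64_t n0 = (uint64_t)( urlHash48 & 0xffff );
            // request or reply?
            n0 <<= 1;
            if ( isRequest ) n0 |= 0x01;
            // 38 bits of parent docid. for replies: pass 0 and every
            // reply for a url gets the same key, so rdb keeps only the
            // last one; pass a timestamp and each reply survives
            n0 <<= 38;
            n0 |= parentDocId & DOCID_MASK38;
            // reserved (padding)
            n0 <<= 8;
            return n0;
    }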
@@ -1802,8 +1807,13 @@ void SpiderColl::clearLocks ( ) {

 void SpiderColl::reset ( ) {

-       m_numSuccessReplies = 0;
-       m_numFailedReplies = 0;
+       // these don't work because we only store one reply
+       // which overwrites any older reply. that's how the
+       // key is. we can change the key to use the timestamp
+       // and not parent docid in makeKey() for spider
+       // replies later.
+       // m_numSuccessReplies = 0;
+       // m_numFailedReplies = 0;

        // reset these for SpiderLoop;
        m_nextDoledbKey.setMin();
@@ -3980,15 +3990,65 @@ bool SpiderColl::scanListForWinners ( ) {
                // see if this is the most recent one
                SpiderReply *tmp = (SpiderReply *)rec;

-               // reset reply stats if beginning a new url
-               if ( srepUh48 != tmp->getUrlHash48() ) {
-                       m_numSuccessReplies = 0;
-                       m_numFailedReplies = 0;
-               }
+               // . MDW: we have to detect corrupt replies up here so
+               //   they do not become the winning reply because
+               //   their date is in the future!!
+
+               // . this is -1 on corruption
+               // . i've seen -31757, 21... etc for bad http replies
+               //   in the qatest123 doc cache... so turn off for that
+               if ( tmp->m_httpStatus >= 1000 ) {
+                       if ( m_cr->m_spiderCorruptCount == 0 ) {
+                               log("spider: got corrupt 3 "
+                                   "spiderReply in "
+                                   "scan "
+                                   "uh48=%"INT64" "
+                                   "httpstatus=%"INT32" "
+                                   "(cn=%"INT32")",
+                                   tmp->getUrlHash48(),
+                                   (int32_t)tmp->m_httpStatus,
+                                   (int32_t)m_collnum);
+                       }
+                       m_cr->m_spiderCorruptCount++;
+                       // don't nuke it just for that...
+                       //srep = NULL;
+                       continue;
+               }
+               // bad langid?
+               if ( ! getLanguageAbbr (tmp->m_langId) ) {
+                       log("spider: got corrupt 4 spiderReply in "
+                           "scan uh48=%"INT64" "
+                           "langid=%"INT32" (cn=%"INT32")",
+                           tmp->getUrlHash48(),
+                           (int32_t)tmp->m_langId,
+                           (int32_t)m_collnum);
+                       m_cr->m_spiderCorruptCount++;
+                       //srep = NULL;
+                       // if ( tmp->getUrlHash48() ==
+                       //      271713196158770LL )
+                       //      log("hey");
+                       continue;
+               }
+
+               // reset reply stats if beginning a new url
+               // these don't work because we only store one reply
+               // which overwrites any older reply. that's how the
+               // key is. we can change the key to use the timestamp
+               // and not parent docid in makeKey() for spider
+               // replies later.
+               // if ( srepUh48 != tmp->getUrlHash48() ) {
+               //      m_numSuccessReplies = 0;
+               //      m_numFailedReplies = 0;
+               // }

                // inc stats
-               if ( tmp->m_errCode == 0 ) m_numSuccessReplies++;
-               else m_numFailedReplies ++;
+               // these don't work because we only store one reply
+               // which overwrites any older reply. that's how the
+               // key is. we can change the key to use the timestamp
+               // and not parent docid in makeKey() for spider
+               // replies later.
+               // if ( tmp->m_errCode == 0 ) m_numSuccessReplies++;
+               // else m_numFailedReplies ++;

                // if we have a more recent reply already, skip this
                if ( srep &&
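Note: the corruption tests moved to the top of the scan reduce to a small
predicate; a hedged distillation (field and helper names as in the hunk):

    // a reply is treated as corrupt if its http status parsed to
    // garbage (the code flags values >= 1000; the comments note -1,
    // -31757, 21 etc. from bad cached replies) or if its language id
    // has no known abbreviation
    bool replyLooksCorrupt ( const SpiderReply *tmp ) {
            if ( tmp->m_httpStatus >= 1000 )
                    return true;                          // "corrupt 3"
            if ( ! getLanguageAbbr ( tmp->m_langId ) )
                    return true;                          // "corrupt 4"
            return false;
    }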
@@ -4010,10 +4070,14 @@ bool SpiderColl::scanListForWinners ( ) {
                int64_t uh48 = sreq->getUrlHash48();

                // reset reply stats if beginning a new url
-               if ( ! srep ) {
-                       m_numSuccessReplies = 0;
-                       m_numFailedReplies = 0;
-               }
+               // these don't work because we only store one reply
+               // which overwrites any older reply. that's how the key is.
+               // we can change the key to use the timestamp and not
+               // parent docid in makeKey() for spider replies later.
+               // if ( ! srep ) {
+               //      m_numSuccessReplies = 0;
+               //      m_numFailedReplies = 0;
+               // }

                // . skip if our twin should add it to doledb
                // . waiting tree only has firstIps assigned to us so
@@ -4100,8 +4164,13 @@ bool SpiderColl::scanListForWinners ( ) {
                // put these in the spiderequest in doledb so we can
                // show in the json spider status docs in
                // XmlDoc::getSpiderStatusDocMetaList2()
-               sreq->m_reservedc1 = m_numSuccessReplies;
-               sreq->m_reservedc2 = m_numFailedReplies;
+               // these don't work because we only store one reply
+               // which overwrites any older reply. that's how the
+               // key is. we can change the key to use the timestamp
+               // and not parent docid in makeKey() for spider
+               // replies later.
+               // sreq->m_reservedc1 = m_numSuccessReplies;
+               // sreq->m_reservedc2 = m_numFailedReplies;

                m_lastSreqUh48 = uh48;
                m_lastCBlockIp = cblock;
@@ -4256,28 +4325,6 @@ bool SpiderColl::scanListForWinners ( ) {
                        // if we tried it before
                        sreq->m_hadReply = true;
                }
-               // . this is -1 on corruption
-               // . i've seen -31757, 21... etc for bad http replies
-               //   in the qatest123 doc cache... so turn off for that
-               if ( srep && srep->m_httpStatus >= 1000 ) {
-                       if ( m_cr->m_spiderCorruptCount == 0 ) {
-                               log("spider: got corrupt 3 spiderReply in "
-                                   "scan httpstatus=%"INT32" (cn=%"INT32")",
-                                   (int32_t)srep->m_httpStatus,
-                                   (int32_t)m_collnum);
-                       }
-                       m_cr->m_spiderCorruptCount++;
-                       // don't nuke it just for that...
-                       //srep = NULL;
-               }
-               // bad langid?
-               if ( srep && ! getLanguageAbbr (srep->m_langId) ) {
-                       log("spider: got corrupt 4 spiderReply in scan "
-                           "langid=%"INT32" (cn=%"INT32")",
-                           (int32_t)srep->m_langId,
-                           (int32_t)m_collnum);
-                       srep = NULL;
-               }

                // . get the url filter we match
                // . if this is slow see the TODO below in dedupSpiderdbList()
@@ -4310,7 +4357,8 @@ bool SpiderColl::scanListForWinners ( ) {
                if ( priority >= MAX_SPIDER_PRIORITIES) {char *xx=NULL;*xx=0;}

                if ( g_conf.m_logDebugSpider )
-                       log("spider: got ufn=%"INT32" for %s",ufn,sreq->m_url);
+                       log("spider: got ufn=%"INT32" for %s (%"INT64"",
+                           ufn,sreq->m_url,sreq->getUrlHash48());

                if ( g_conf.m_logDebugSpider && srep )
                        log("spider: lastspidered=%"UINT32"",

Spider.h (9 changed lines)
@@ -1155,8 +1155,13 @@ class SpiderColl {
        int32_t m_tailHopCount;
        int64_t m_minFutureTimeMS;

-       int32_t m_numSuccessReplies;
-       int32_t m_numFailedReplies;
+       // these don't work because we only store one reply
+       // which overwrites any older reply. that's how the
+       // key is. we can change the key to use the timestamp
+       // and not parent docid in makeKey() for spider
+       // replies later.
+       // int32_t m_numSuccessReplies;
+       // int32_t m_numFailedReplies;

        // . do not re-send CrawlInfoLocal for a coll if not update
        // . we store the flags in here as true if we should send our
TcpServer.cpp

@@ -2861,6 +2861,10 @@ int TcpServer::sslHandshake ( TcpSocket *s ) {
            (int32_t)sslError,r,iptoa(s->m_ip),sslMsg);

        g_errno = ESSLERROR;
+       // note in log
+       log("tcp: ssl: try running "
+           "'openssl s_client -connect www.hostnamehere.com:443 "
+           "-debug' to debug the webserver on the other side.");
        // make sure read callback is registered
        // g_loop.registerReadCallback (s->m_sd,this,readSocketWrapper,
        //                              s->m_niceness);

XmlDoc.cpp (38 changed lines)
@@ -2426,8 +2426,8 @@ bool XmlDoc::indexDoc ( ) {
                    "error reply.",
                    m_firstUrl.m_url,mstrerror(g_errno));
        else if ( g_errno )
-               log("build: docid=%"INT64" had internal error = %s. adding spider "
-                   "error reply.",
+               log("build: docid=%"INT64" had internal error = %s. "
+                   "adding spider error reply.",
                    m_docId,mstrerror(g_errno));

        // seems like this was causing a core somehow...
@@ -2450,6 +2450,16 @@ bool XmlDoc::indexDoc ( ) {
                m_indexCodeValid = true;
        }

+       // this should not be retried either. i am seeing it excessively
+       // retried from a
+       // "TitleRec::set: uncompress uncompressed size=-2119348471"
+       // error condition. it also said
+       // "Error spidering for doc http://www.... : Bad cached document"
+       if ( g_errno == EBADTITLEREC ) {
+               m_indexCode = g_errno;
+               m_indexCodeValid = true;
+       }
+
        // i've seen Multicast got error in reply from hostId 19 (msgType=0x22
        // transId=496026 nice=1 net=default): Buf too small.
        // so fix that with this
@@ -2468,7 +2478,7 @@ bool XmlDoc::indexDoc ( ) {
                m_indexCodeValid = true;
        }

        // default to internal error which will be retried forever otherwise
        if ( ! m_indexCodeValid ) {
                m_indexCode = EINTERNALERROR;//g_errno;
                m_indexCodeValid = true;
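Note: the indexDoc() hunks above share one idea; a hedged distillation
(names from the diff, not the full function):

    // freeze specific errors as permanent index codes so the spider
    // reply records them instead of retrying the url forever
    void classifyIndexError ( int32_t err , int32_t *indexCode ,
                              bool *indexCodeValid ) {
            if ( *indexCodeValid ) return;
            // corrupt cached title rec ("Bad cached document"): permanent
            if ( err == EBADTITLEREC ) {
                    *indexCode      = err;
                    *indexCodeValid = true;
                    return;
            }
            // default to internal error, retried forever otherwise
            *indexCode      = EINTERNALERROR;
            *indexCodeValid = true;
    }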
@@ -25494,6 +25504,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
        // store it
        m_srep.m_firstIp = firstIp;
        // assume no error
+       // MDW: not right...
        m_srep.m_errCount = 0;
        // otherwise, inherit from oldsr to be safe
        //if ( m_sreqValid )
@@ -28153,14 +28164,19 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
                              m_docIdWeAreADupOf);

        // how many spiderings were successful vs. failed
-       if ( m_sreqValid ) {
-               jd.safePrintf("\"gbssPrevTotalNumIndexAttempts\":%"INT32",\n",
-                             m_sreq.m_reservedc1 + m_sreq.m_reservedc2 );
-               jd.safePrintf("\"gbssPrevTotalNumIndexSuccesses\":%"INT32",\n",
-                             m_sreq.m_reservedc1);
-               jd.safePrintf("\"gbssPrevTotalNumIndexFailures\":%"INT32",\n",
-                             m_sreq.m_reservedc2);
-       }
+       // these don't work because we only store one reply
+       // which overwrites any older reply. that's how the
+       // key is. we can change the key to use the timestamp
+       // and not parent docid in makeKey() for spider
+       // replies later.
+       // if ( m_sreqValid ) {
+       //      jd.safePrintf("\"gbssPrevTotalNumIndexAttempts\":%"INT32",\n",
+       //                    m_sreq.m_reservedc1 + m_sreq.m_reservedc2 );
+       //      jd.safePrintf("\"gbssPrevTotalNumIndexSuccesses\":%"INT32",\n",
+       //                    m_sreq.m_reservedc1);
+       //      jd.safePrintf("\"gbssPrevTotalNumIndexFailures\":%"INT32",\n",
+       //                    m_sreq.m_reservedc2);
+       // }

        if ( m_spideredTimeValid )
                jd.safePrintf("\"gbssSpiderTime\":%"INT32",\n",