mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-13 02:36:06 -04:00
merged spidering related changes from staging
This commit is contained in:
23
Parms.cpp
23
Parms.cpp
@ -10917,22 +10917,6 @@ static bool printUrlExpressionExamples ( SafeBuf *sb ) {
|
||||
//"<td>Matches if document is NOT a new outlink."
|
||||
//"</td></tr>"
|
||||
|
||||
"<tr class=poo><td>age</td>"
|
||||
"<td>"
|
||||
"How old is the doucment <b>in seconds</b>. "
|
||||
"The age is based on the publication date of "
|
||||
"the document, which could also be the "
|
||||
"time that the document was last significantly "
|
||||
"modified. If this date is unknown then the age "
|
||||
"will be -1 and only match the expression "
|
||||
"<i>age==-1</i>. "
|
||||
"When harvesting links, we guess the publication "
|
||||
"date of the oulink by detecting dates contained "
|
||||
"in the url itself, which is popular among some "
|
||||
"forms of permalinks. This allows us to put "
|
||||
"older permalinks into a slower spider queue."
|
||||
"</td></tr>"
|
||||
|
||||
"<tr class=poo><td>spiderwaited < 3600</td>"
|
||||
"<td>"
|
||||
"<i>spiderwaited</i> is how many seconds have elapsed "
|
||||
@ -11032,6 +11016,13 @@ static bool printUrlExpressionExamples ( SafeBuf *sb ) {
|
||||
"\"temporary\" errors like DNS timeouts."
|
||||
"</td></tr>"
|
||||
|
||||
"<tr class=poo><td>sameerrorcount==1</td>"
|
||||
"<td>"
|
||||
"The number of times the url has failed to "
|
||||
"be indexed with the same error. Reset to 0 "
|
||||
"every time the error code changes."
|
||||
"</td></tr>"
|
||||
|
||||
"<tr class=poo><td>errorcode==32880</td>"
|
||||
"<td>"
|
||||
"If the last time it was spidered it had this "
|
||||
|
103
Spider.cpp
103
Spider.cpp
@ -78,6 +78,8 @@ int32_t SpiderRequest::print ( SafeBuf *sbarg ) {
|
||||
|
||||
// indicate it's a request not a reply
|
||||
sb->safePrintf("REQ ");
|
||||
sb->safePrintf("ver=%d ", (int)m_version);
|
||||
|
||||
sb->safePrintf("uh48=%" PRIx64" ",getUrlHash48());
|
||||
// if negative bail early now
|
||||
if ( (m_key.n0 & 0x01) == 0x00 ) {
|
||||
@ -105,11 +107,8 @@ int32_t SpiderRequest::print ( SafeBuf *sbarg ) {
|
||||
timeStruct = gmtime_r(&ts,&tm_buf);
|
||||
strftime ( time , 256 , "%b %e %T %Y UTC", timeStruct );
|
||||
sb->safePrintf("addedTime=%s(%" PRIu32") ",time,(uint32_t)m_addedTime );
|
||||
|
||||
sb->safePrintf("pageNumInlinks=%i ",(int)m_pageNumInlinks);
|
||||
|
||||
sb->safePrintf("hopCount=%" PRId32" ",(int32_t)m_hopCount );
|
||||
|
||||
sb->safePrintf("ufn=%" PRId32" ", (int32_t)m_ufn);
|
||||
// why was this unsigned?
|
||||
sb->safePrintf("priority=%" PRId32" ", (int32_t)m_priority);
|
||||
@ -158,12 +157,12 @@ int32_t SpiderReply::print ( SafeBuf *sbarg ) {
|
||||
|
||||
// indicate it's a reply
|
||||
sb->safePrintf("REP ");
|
||||
sb->safePrintf("ver=%d ", (int)m_version);
|
||||
|
||||
sb->safePrintf("uh48=%" PRIx64" ",getUrlHash48());
|
||||
sb->safePrintf("parentDocId=%" PRIu64" ",getParentDocId());
|
||||
|
||||
|
||||
// if negtaive bail early now
|
||||
// if negative bail early now
|
||||
if ( (m_key.n0 & 0x01) == 0x00 ) {
|
||||
sb->safePrintf("[DELETE]");
|
||||
if ( ! sbarg ) printf("%s",sb->getBufStart() );
|
||||
@ -187,13 +186,6 @@ int32_t SpiderReply::print ( SafeBuf *sbarg ) {
|
||||
|
||||
sb->safePrintf("siteNumInlinks=%" PRId32" ",m_siteNumInlinks );
|
||||
|
||||
time_t ts2 = (time_t)m_pubDate;
|
||||
timeStruct = gmtime_r(&ts2,&tm_buf);
|
||||
time[0] = 0;
|
||||
if ( m_pubDate != 0 && m_pubDate != -1 )
|
||||
strftime (time,256,"%b %e %T %Y UTC",timeStruct);
|
||||
sb->safePrintf("pubDate=%s(%" PRId32") ",time,m_pubDate );
|
||||
|
||||
sb->safePrintf("ch32=%" PRIu32" ",(uint32_t)m_contentHash32);
|
||||
|
||||
sb->safePrintf("crawldelayms=%" PRId32"ms ",m_crawlDelayMS );
|
||||
@ -204,6 +196,9 @@ int32_t SpiderReply::print ( SafeBuf *sbarg ) {
|
||||
if ( m_errCount )
|
||||
sb->safePrintf("errCount=%" PRId32" ",(int32_t)m_errCount);
|
||||
|
||||
if ( m_sameErrCount )
|
||||
sb->safePrintf("sameErrCount=%" PRId32" ",(int32_t)m_sameErrCount);
|
||||
|
||||
sb->safePrintf("errCode=%s(%" PRIu32") ",mstrerror(m_errCode),
|
||||
(uint32_t)m_errCode );
|
||||
|
||||
@ -257,16 +252,12 @@ int32_t SpiderRequest::printToJSON(SafeBuf *sb, const char *status, XmlDoc *xd,
|
||||
|
||||
char ipbuf[16];
|
||||
sb->safePrintf("\t\t\t\"firstIp\": \"%s\",\n", iptoa(m_firstIp,ipbuf));
|
||||
|
||||
sb->safePrintf("\t\t\t\"errCount\": %hhd,\n", m_errCount);
|
||||
|
||||
sb->safePrintf("\t\t\t\"sameErrCount\": %hhd,\n", m_sameErrCount);
|
||||
sb->safePrintf("\t\t\t\"urlHash48\": %" PRId64",\n", getUrlHash48());
|
||||
|
||||
sb->safePrintf("\t\t\t\"siteInLinks\": %" PRId32",\n", m_siteNumInlinks);
|
||||
sb->safePrintf("\t\t\t\"hops\": %" PRId16",\n", m_hopCount);
|
||||
|
||||
sb->safePrintf("\t\t\t\"addedTime\": %" PRIu32",\n", m_addedTime);
|
||||
|
||||
sb->safePrintf("\t\t\t\"pageNumInLinks\": %" PRIu8",\n", m_pageNumInlinks);
|
||||
sb->safePrintf("\t\t\t\"parentDocId\": %" PRId64"\n", getParentDocId());
|
||||
|
||||
@ -314,9 +305,8 @@ int32_t SpiderRequest::printToTable(SafeBuf *sb, const char *status, XmlDoc *xd,
|
||||
char ipbuf[16];
|
||||
sb->safePrintf(" <td>%s</td>\n",iptoa(m_firstIp,ipbuf) );
|
||||
sb->safePrintf(" <td>%" PRId32"</td>\n",(int32_t)m_errCount );
|
||||
|
||||
sb->safePrintf(" <td>%" PRId32"</td>\n",(int32_t)m_sameErrCount );
|
||||
sb->safePrintf(" <td>%" PRIu64"</td>\n",getUrlHash48());
|
||||
|
||||
sb->safePrintf(" <td>%" PRId32"</td>\n",m_siteNumInlinks );
|
||||
sb->safePrintf(" <td>%" PRId32"</td>\n",(int32_t)m_hopCount );
|
||||
|
||||
@ -374,6 +364,7 @@ int32_t SpiderRequest::printTableHeader ( SafeBuf *sb , bool currentlySpidering)
|
||||
|
||||
sb->safePrintf(" <td><b>firstIp</b></td>\n");
|
||||
sb->safePrintf(" <td><b>errCount</b></td>\n");
|
||||
sb->safePrintf(" <td><b>sameErrCount</b></td>\n");
|
||||
sb->safePrintf(" <td><b>urlHash48</b></td>\n");
|
||||
sb->safePrintf(" <td><b>siteInlinks</b></td>\n");
|
||||
sb->safePrintf(" <td><b>hops</b></td>\n");
|
||||
@ -1327,8 +1318,10 @@ checkNextRule:
|
||||
if ( isForMsg20 ) continue;
|
||||
// reply based
|
||||
if ( ! srep ) continue;
|
||||
|
||||
// get our error code
|
||||
int32_t errCode = srep->m_errCode;
|
||||
|
||||
// . make it zero if not tmp error
|
||||
// . now have EDOCUNCHANGED and EDOCNOGOODDATE from
|
||||
// Msg13.cpp, so don't count those here...
|
||||
@ -1346,6 +1339,7 @@ checkNextRule:
|
||||
errCode = 0;
|
||||
// if no match continue
|
||||
if ( (bool)errCode == val ) continue;
|
||||
|
||||
// skip
|
||||
p += 11;
|
||||
// skip to next constraint
|
||||
@ -2140,6 +2134,41 @@ checkNextRule:
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
if ( *p=='s' && strncmp(p,"sameerrorcount",14) == 0 ) {
|
||||
// if we do not have enough info for outlink, all done
|
||||
if ( isOutlink ) {
|
||||
logTrace( g_conf.m_logTraceSpider, "END, returning -1" );
|
||||
return -1;
|
||||
}
|
||||
// skip for msg20
|
||||
if ( isForMsg20 ) continue;
|
||||
// reply based
|
||||
if ( ! srep ) continue;
|
||||
// shortcut
|
||||
int32_t a = srep->m_sameErrCount;
|
||||
// make it point to the retry count
|
||||
int32_t b = atoi(s);
|
||||
// compare
|
||||
if ( sign == SIGN_EQ && a != b ) continue;
|
||||
if ( sign == SIGN_NE && a == b ) continue;
|
||||
if ( sign == SIGN_GT && a <= b ) continue;
|
||||
if ( sign == SIGN_LT && a >= b ) continue;
|
||||
if ( sign == SIGN_GE && a < b ) continue;
|
||||
if ( sign == SIGN_LE && a > b ) continue;
|
||||
// skip fast
|
||||
//p += 14;
|
||||
p = strstr(s, "&&");
|
||||
// if nothing else, then it is a match
|
||||
if ( ! p ) {
|
||||
logTrace( g_conf.m_logTraceSpider, "END, returning i (%" PRId32")", i );
|
||||
return i;
|
||||
}
|
||||
//skip the '&&' and go to next rule
|
||||
p += 2;
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
|
||||
// EBADURL malformed url is ... 32880
|
||||
if ( *p=='e' && strncmp(p,"errorcode",9) == 0 ) {
|
||||
// if we do not have enough info for outlink, all done
|
||||
@ -2351,42 +2380,6 @@ checkNextRule:
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
// how old is the doc in seconds? age is the pubDate age
|
||||
if ( *p =='a' && strncmp(p, "age", 3) == 0){
|
||||
// if we do not have enough info for outlink, all done
|
||||
if ( isOutlink ) {
|
||||
logTrace( g_conf.m_logTraceSpider, "END, returning -1" );
|
||||
return -1;
|
||||
}
|
||||
// must have a reply
|
||||
if ( ! srep ) continue;
|
||||
// shortcut
|
||||
int32_t age;
|
||||
if ( srep->m_pubDate <= 0 ) age = -1;
|
||||
else age = nowGlobal - srep->m_pubDate;
|
||||
// we can not match if invalid
|
||||
if ( age <= 0 ) continue;
|
||||
// make it point to the priority
|
||||
int32_t b = atoi(s);
|
||||
// compare
|
||||
if ( sign == SIGN_EQ && age != b ) continue;
|
||||
if ( sign == SIGN_NE && age == b ) continue;
|
||||
if ( sign == SIGN_GT && age <= b ) continue;
|
||||
if ( sign == SIGN_LT && age >= b ) continue;
|
||||
if ( sign == SIGN_GE && age < b ) continue;
|
||||
if ( sign == SIGN_LE && age > b ) continue;
|
||||
p = strstr(s, "&&");
|
||||
//if nothing, else then it is a match
|
||||
if ( ! p )
|
||||
{
|
||||
logTrace( g_conf.m_logTraceSpider, "END, returning i (%" PRId32")", i );
|
||||
return i;
|
||||
}
|
||||
//skip the '&&' and go to next rule
|
||||
p += 2;
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
// our own regex thing (match front of url)
|
||||
if ( *p=='^' ) {
|
||||
// advance over caret
|
||||
|
40
Spider.h
40
Spider.h
@ -14,6 +14,11 @@ class RdbList;
|
||||
class HashTableX;
|
||||
class SpiderColl;
|
||||
|
||||
|
||||
#define SPIDERREQ_CURRENT_VERSION 1
|
||||
#define SPIDERREP_CURRENT_VERSION 1
|
||||
|
||||
|
||||
// lower from 1300 to 300
|
||||
#define MAXUDPSLOTS 300
|
||||
|
||||
@ -459,8 +464,15 @@ public:
|
||||
// # of spider requests from different c-blocks. capped at 255.
|
||||
// taken from the # of SpiderRequests.
|
||||
uint8_t m_pageNumInlinks;
|
||||
uint8_t m_reservedb2;
|
||||
uint8_t m_reservedb3;
|
||||
|
||||
// . this is copied from the most recent SpiderReply into here
|
||||
// . it's so XmlDoc.cpp can increment it and add it to the new
|
||||
// SpiderReply it adds in case there is another download error ,
|
||||
// like ETCPTIMEDOUT or EDNSTIMEDOUT
|
||||
uint8_t m_sameErrCount;
|
||||
|
||||
|
||||
uint8_t m_version;
|
||||
uint8_t m_reservedb4;
|
||||
|
||||
// info on the page we were harvested from
|
||||
@ -474,7 +486,9 @@ public:
|
||||
// when we scan all of the SpiderRequests it has.
|
||||
int32_t m_discoveryTime;
|
||||
|
||||
int32_t m_reservedc2;
|
||||
// Used to compare previous errcode with current errcode, for counting
|
||||
// the m_sameErrCount value.
|
||||
int32_t m_prevErrCode; // m_reservedc2;
|
||||
|
||||
// . replace this with something we need for smart compression
|
||||
// . this is zero if none or invalid
|
||||
@ -627,6 +641,7 @@ public:
|
||||
m_ufn = -1;
|
||||
// this too
|
||||
m_priority = -1;
|
||||
m_version = SPIDERREQ_CURRENT_VERSION;
|
||||
}
|
||||
|
||||
static int32_t getNeededSize ( int32_t urlLen ) {
|
||||
@ -688,6 +703,10 @@ public:
|
||||
bool setFromInject(const char *url);
|
||||
|
||||
bool isCorrupt() const;
|
||||
|
||||
SpiderRequest() {
|
||||
reset();
|
||||
}
|
||||
} __attribute__((packed, aligned(4)));
|
||||
|
||||
// . XmlDoc adds this record to spiderdb after attempting to spider a url
|
||||
@ -734,8 +753,10 @@ public:
|
||||
// SpiderRequest's m_siteNumLinks
|
||||
int32_t m_siteNumInlinks;
|
||||
|
||||
// the actual pub date we extracted (0 means none, -1 unknown)
|
||||
int32_t m_pubDate;
|
||||
uint8_t m_sameErrCount;
|
||||
uint8_t m_version;
|
||||
uint8_t m_reserved_u8b;
|
||||
uint8_t m_reserved_u8c;
|
||||
|
||||
// . this is zero if none or invalid
|
||||
int32_t m_contentHash32;
|
||||
@ -827,7 +848,10 @@ public:
|
||||
int32_t getRecSize () const { return m_dataSize + 4 + sizeof(key128_t); }
|
||||
|
||||
// clear all
|
||||
void reset() { memset ( this , 0 , sizeof(SpiderReply) ); }
|
||||
void reset() {
|
||||
memset(this, 0, sizeof(SpiderReply));
|
||||
m_version = SPIDERREP_CURRENT_VERSION;
|
||||
}
|
||||
|
||||
void setKey ( int32_t firstIp, int64_t parentDocId, int64_t uh48, bool isDel ) ;
|
||||
|
||||
@ -840,6 +864,10 @@ public:
|
||||
int64_t getParentDocId() const {
|
||||
return Spiderdb::getParentDocId(&m_key);
|
||||
}
|
||||
|
||||
SpiderReply() {
|
||||
reset();
|
||||
}
|
||||
} __attribute__((packed, aligned(4)));
|
||||
|
||||
// was 1000 but breached, now equals SR_READ_SIZE/sizeof(SpiderReply)
|
||||
|
@ -2407,6 +2407,12 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
if ( srep ) {
|
||||
sreq->m_errCount = srep->m_errCount;
|
||||
|
||||
// Save error code of last reply in the request so we
|
||||
// can compare with error code after next spider attempt.
|
||||
sreq->m_prevErrCode = srep->m_errCode;
|
||||
sreq->m_sameErrCount = srep->m_sameErrCount;
|
||||
|
||||
|
||||
// . assign this too from latest reply - smart compress
|
||||
// . this WAS SpiderReply::m_pubdate so it might be
|
||||
// set to a non-zero value that is wrong now... but
|
||||
@ -2415,6 +2421,11 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
// if we tried it before
|
||||
sreq->m_hadReply = true;
|
||||
}
|
||||
else {
|
||||
sreq->m_errCount = 0;
|
||||
sreq->m_sameErrCount = 0;
|
||||
sreq->m_prevErrCode = 0;
|
||||
}
|
||||
|
||||
// . get the url filter we match
|
||||
// . if this is slow see the TODO below in dedupSpiderdbList()
|
||||
|
52
XmlDoc.cpp
52
XmlDoc.cpp
@ -14303,9 +14303,11 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
||||
}
|
||||
// store it
|
||||
m_srep.m_firstIp = firstIp;
|
||||
// assume no error
|
||||
// MDW: not right...
|
||||
|
||||
// Default to no error. Will be set below.
|
||||
m_srep.m_errCount = 0;
|
||||
m_srep.m_sameErrCount = 0;
|
||||
|
||||
// otherwise, inherit from oldsr to be safe
|
||||
//if ( m_sreqValid )
|
||||
// m_srep.m_firstIp = m_sreq.m_firstIp;
|
||||
@ -14411,22 +14413,25 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
||||
m_srep.m_crawlDelayMS = m_crawlDelay;
|
||||
else
|
||||
m_srep.m_crawlDelayMS = -1;
|
||||
//if ( m_pubDateValid ) m_srep.m_pubDate = m_pubDate;
|
||||
m_srep.m_pubDate = 0;
|
||||
|
||||
if ( m_langIdValid ) m_srep.m_langId = m_langId;
|
||||
if ( m_isRSSValid ) m_srep.m_isRSS = m_isRSS;
|
||||
if ( m_isPermalinkValid ) m_srep.m_isPermalink =m_isPermalink;
|
||||
if ( m_httpStatusValid ) m_srep.m_httpStatus = m_httpStatus;
|
||||
|
||||
// stuff that is automatically valid
|
||||
m_srep.m_isPingServer = 0;
|
||||
if ( fu ) m_srep.m_isPingServer = (bool)fu->isPingServer();
|
||||
|
||||
// this was replaced by m_contentHash32
|
||||
//m_srep.m_newRequests = 0;
|
||||
m_srep.m_errCode = m_indexCode;
|
||||
|
||||
if ( m_downloadEndTimeValid )
|
||||
m_srep.m_downloadEndTime = m_downloadEndTime;
|
||||
else
|
||||
m_srep.m_downloadEndTime = 0;
|
||||
|
||||
// is the original spider request valid?
|
||||
if ( m_sreqValid ) {
|
||||
// preserve the content hash in case m_indexCode is
|
||||
@ -14442,16 +14447,43 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
||||
n->m_hasAuthorityInlink = o->m_hasAuthorityInlink;
|
||||
n->m_isPingServer = o->m_isPingServer;
|
||||
// the validator flags
|
||||
n->m_hasAuthorityInlinkValid =
|
||||
o->m_hasAuthorityInlinkValid;
|
||||
n->m_hasAuthorityInlinkValid = o->m_hasAuthorityInlinkValid;
|
||||
|
||||
// get error count from original spider request
|
||||
int32_t newc = m_sreq.m_errCount;
|
||||
// inc for us, since we had an error
|
||||
newc++;
|
||||
// contain to one byte
|
||||
if ( newc > 255 ) newc = 255;
|
||||
if ( newc > 255 ) {
|
||||
newc = 255;
|
||||
}
|
||||
// store in our spiderreply
|
||||
m_srep.m_errCount = newc;
|
||||
|
||||
|
||||
// Number of times we have seen the same error code in a row
|
||||
if( m_sreq.m_prevErrCode == m_srep.m_errCode ) {
|
||||
int32_t newc = m_sreq.m_sameErrCount;
|
||||
|
||||
// Sanity. Must not be same or larger here.
|
||||
if( newc >= m_srep.m_errCount ) {
|
||||
log(LOG_WARN,"Correcting sameErrCount. Count=%" PRId32 ", sameErrCount=%" PRId32 ", prev_errCode=%" PRId32 ", curr_errCode=%" PRId32 ", url=%s, uh48=%" PRIx64 ", err=%s", m_srep.m_errCount, m_srep.m_sameErrCount, m_sreq.m_prevErrCode, m_srep.m_errCode, m_sreq.m_url, uh48, mstrerror( m_srep.m_errCode ));
|
||||
newc = 0;
|
||||
}
|
||||
|
||||
// inc for us, since we had an error
|
||||
newc++;
|
||||
|
||||
// contain to one byte
|
||||
if ( newc > 255 ) {
|
||||
newc = 255;
|
||||
}
|
||||
// store in our spiderreply
|
||||
m_srep.m_sameErrCount = newc;
|
||||
}
|
||||
else {
|
||||
m_srep.m_sameErrCount = 0;
|
||||
}
|
||||
}
|
||||
// . and do not really consider this an error
|
||||
// . i don't want the url filters treating it as an error reply
|
||||
@ -14466,6 +14498,8 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
||||
m_srep.m_errCode = 0;
|
||||
// and no error count, it wasn't an error per se
|
||||
m_srep.m_errCount = 0;
|
||||
m_srep.m_sameErrCount = 0;
|
||||
|
||||
// call it 200
|
||||
m_srep.m_httpStatus = 200;
|
||||
}
|
||||
@ -14473,8 +14507,6 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
||||
if ( m_indexCode == EDOCUNCHANGED &&
|
||||
m_oldDocValid &&
|
||||
m_oldDoc ) {
|
||||
//m_srep.m_pubDate = m_oldDoc->m_pubDate;
|
||||
m_srep.m_pubDate = 0;
|
||||
m_srep.m_langId = m_oldDoc->m_langId;
|
||||
m_srep.m_isRSS = m_oldDoc->m_isRSS;
|
||||
m_srep.m_isPermalink = m_oldDoc->m_isPermalink;
|
||||
@ -14579,8 +14611,6 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
||||
// . EUDPTIMEDOUT, EDNSTIMEDOUT, ETCPTIMEDOUT, EDNSDEAD, EBADIP,
|
||||
// ENETUNREACH,EBADMIME,ECONNREFUSED,EHOSTUNREACH
|
||||
m_srep.m_siteNumInlinks = m_siteNumInlinks;
|
||||
//m_srep.m_pubDate = *pubDate;
|
||||
m_srep.m_pubDate = 0;
|
||||
// this was replaced by m_contentHash32
|
||||
//m_srep.m_newRequests = 0;
|
||||
m_srep.m_langId = *langId;
|
||||
|
10
main.cpp
10
main.cpp
@ -2843,6 +2843,8 @@ int32_t dumpSpiderdb ( const char *coll, int32_t startFileNum, int32_t numFiles,
|
||||
static int64_t s_lastRepUh48 = 0LL;
|
||||
static int32_t s_lastErrCode = 0;
|
||||
static int32_t s_lastErrCount = 0;
|
||||
static int32_t s_sameErrCount = 0;
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec(coll);
|
||||
|
||||
loop:
|
||||
@ -2905,6 +2907,7 @@ int32_t dumpSpiderdb ( const char *coll, int32_t startFileNum, int32_t numFiles,
|
||||
s_lastRepUh48 = srep->getUrlHash48();
|
||||
s_lastErrCode = srep->m_errCode;
|
||||
s_lastErrCount = srep->m_errCount;
|
||||
s_sameErrCount = srep->m_sameErrCount;
|
||||
|
||||
// get firstip
|
||||
if ( printStats == 1 ) {
|
||||
@ -2935,6 +2938,7 @@ int32_t dumpSpiderdb ( const char *coll, int32_t startFileNum, int32_t numFiles,
|
||||
printf(" hadReply=%" PRId32,(int32_t)hadReply);
|
||||
|
||||
printf(" errcount=%" PRId32,(int32_t)s_lastErrCount);
|
||||
printf(" sameerrcount=%" PRId32,(int32_t)s_sameErrCount);
|
||||
|
||||
if ( s_lastErrCode ) {
|
||||
printf( " errcode=%" PRId32"(%s)", ( int32_t ) s_lastErrCode, mstrerror( s_lastErrCode ) );
|
||||
@ -2942,6 +2946,12 @@ int32_t dumpSpiderdb ( const char *coll, int32_t startFileNum, int32_t numFiles,
|
||||
printf( " errcode=%" PRId32, ( int32_t ) s_lastErrCode );
|
||||
}
|
||||
|
||||
if ( sreq->m_prevErrCode ) {
|
||||
printf( " preverrcode=%" PRId32"(%s)", ( int32_t ) sreq->m_prevErrCode, mstrerror( sreq->m_prevErrCode ) );
|
||||
} else {
|
||||
printf( " preverrcode=%" PRId32, ( int32_t ) sreq->m_prevErrCode );
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user