merged spidering-related changes from staging

This commit is contained in:
Brian Rasmusson
2017-07-04 16:12:01 +02:00
6 changed files with 151 additions and 88 deletions

@@ -10917,22 +10917,6 @@ static bool printUrlExpressionExamples ( SafeBuf *sb ) {
//"<td>Matches if document is NOT a new outlink."
//"</td></tr>"
"<tr class=poo><td>age</td>"
"<td>"
"How old is the doucment <b>in seconds</b>. "
"The age is based on the publication date of "
"the document, which could also be the "
"time that the document was last significantly "
"modified. If this date is unknown then the age "
"will be -1 and only match the expression "
"<i>age==-1</i>. "
"When harvesting links, we guess the publication "
"date of the oulink by detecting dates contained "
"in the url itself, which is popular among some "
"forms of permalinks. This allows us to put "
"older permalinks into a slower spider queue."
"</td></tr>"
"<tr class=poo><td>spiderwaited &lt; 3600</td>"
"<td>"
"<i>spiderwaited</i> is how many seconds have elapsed "
@@ -11032,6 +11016,13 @@ static bool printUrlExpressionExamples ( SafeBuf *sb ) {
"\"temporary\" errors like DNS timeouts."
"</td></tr>"
"<tr class=poo><td>sameerrorcount==1</td>"
"<td>"
"The number of times the url has failed to "
"be indexed with the same error. Reset to 0 "
"every time the error code changes."
"</td></tr>"
"<tr class=poo><td>errorcode==32880</td>"
"<td>"
"If the last time it was spidered it had this "

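A note on the filter terms documented above: sameerrorcount, errorcode, age and the rest are numeric terms evaluated against the latest SpiderReply with one of six comparison signs. The sketch below is illustrative only; the SIGN_* names follow the constants visible later in this diff, while compareSigned and the values in main are stand-ins, not the actual Gigablast parser.

#include <cstdint>
#include <cstdio>

enum Sign { SIGN_EQ, SIGN_NE, SIGN_GT, SIGN_LT, SIGN_GE, SIGN_LE };

// Evaluate "a <sign> b", the comparison at the heart of every numeric
// url-filter term (age, errorcode, sameerrorcount, spiderwaited, ...).
static bool compareSigned(int32_t a, Sign sign, int32_t b) {
    switch (sign) {
        case SIGN_EQ: return a == b;
        case SIGN_NE: return a != b;
        case SIGN_GT: return a >  b;
        case SIGN_LT: return a <  b;
        case SIGN_GE: return a >= b;
        case SIGN_LE: return a <= b;
    }
    return false;
}

int main() {
    // "sameerrorcount==1": the url has failed exactly once in a row with
    // its current error code (the count resets when the code changes).
    int32_t sameErrCount = 1; // would come from the latest SpiderReply
    printf("matches: %d\n", compareSigned(sameErrCount, SIGN_EQ, 1));
    return 0;
}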
@@ -78,6 +78,8 @@ int32_t SpiderRequest::print ( SafeBuf *sbarg ) {
// indicate it's a request not a reply
sb->safePrintf("REQ ");
sb->safePrintf("ver=%d ", (int)m_version);
sb->safePrintf("uh48=%" PRIx64" ",getUrlHash48());
// if negative bail early now
if ( (m_key.n0 & 0x01) == 0x00 ) {
@@ -105,11 +107,8 @@ int32_t SpiderRequest::print ( SafeBuf *sbarg ) {
timeStruct = gmtime_r(&ts,&tm_buf);
strftime ( time , 256 , "%b %e %T %Y UTC", timeStruct );
sb->safePrintf("addedTime=%s(%" PRIu32") ",time,(uint32_t)m_addedTime );
sb->safePrintf("pageNumInlinks=%i ",(int)m_pageNumInlinks);
sb->safePrintf("hopCount=%" PRId32" ",(int32_t)m_hopCount );
sb->safePrintf("ufn=%" PRId32" ", (int32_t)m_ufn);
// why was this unsigned?
sb->safePrintf("priority=%" PRId32" ", (int32_t)m_priority);
@@ -158,12 +157,12 @@ int32_t SpiderReply::print ( SafeBuf *sbarg ) {
// indicate it's a reply
sb->safePrintf("REP ");
sb->safePrintf("ver=%d ", (int)m_version);
sb->safePrintf("uh48=%" PRIx64" ",getUrlHash48());
sb->safePrintf("parentDocId=%" PRIu64" ",getParentDocId());
// if negtaive bail early now
// if negative bail early now
if ( (m_key.n0 & 0x01) == 0x00 ) {
sb->safePrintf("[DELETE]");
if ( ! sbarg ) printf("%s",sb->getBufStart() );
@@ -187,13 +186,6 @@ int32_t SpiderReply::print ( SafeBuf *sbarg ) {
sb->safePrintf("siteNumInlinks=%" PRId32" ",m_siteNumInlinks );
time_t ts2 = (time_t)m_pubDate;
timeStruct = gmtime_r(&ts2,&tm_buf);
time[0] = 0;
if ( m_pubDate != 0 && m_pubDate != -1 )
strftime (time,256,"%b %e %T %Y UTC",timeStruct);
sb->safePrintf("pubDate=%s(%" PRId32") ",time,m_pubDate );
sb->safePrintf("ch32=%" PRIu32" ",(uint32_t)m_contentHash32);
sb->safePrintf("crawldelayms=%" PRId32"ms ",m_crawlDelayMS );
@@ -204,6 +196,9 @@ int32_t SpiderReply::print ( SafeBuf *sbarg ) {
if ( m_errCount )
sb->safePrintf("errCount=%" PRId32" ",(int32_t)m_errCount);
if ( m_sameErrCount )
sb->safePrintf("sameErrCount=%" PRId32" ",(int32_t)m_sameErrCount);
sb->safePrintf("errCode=%s(%" PRIu32") ",mstrerror(m_errCode),
(uint32_t)m_errCode );
@@ -257,16 +252,12 @@ int32_t SpiderRequest::printToJSON(SafeBuf *sb, const char *status, XmlDoc *xd,
char ipbuf[16];
sb->safePrintf("\t\t\t\"firstIp\": \"%s\",\n", iptoa(m_firstIp,ipbuf));
sb->safePrintf("\t\t\t\"errCount\": %hhd,\n", m_errCount);
sb->safePrintf("\t\t\t\"sameErrCount\": %hhd,\n", m_sameErrCount);
sb->safePrintf("\t\t\t\"urlHash48\": %" PRId64",\n", getUrlHash48());
sb->safePrintf("\t\t\t\"siteInLinks\": %" PRId32",\n", m_siteNumInlinks);
sb->safePrintf("\t\t\t\"hops\": %" PRId16",\n", m_hopCount);
sb->safePrintf("\t\t\t\"addedTime\": %" PRIu32",\n", m_addedTime);
sb->safePrintf("\t\t\t\"pageNumInLinks\": %" PRIu8",\n", m_pageNumInlinks);
sb->safePrintf("\t\t\t\"parentDocId\": %" PRId64"\n", getParentDocId());
@@ -314,9 +305,8 @@ int32_t SpiderRequest::printToTable(SafeBuf *sb, const char *status, XmlDoc *xd,
char ipbuf[16];
sb->safePrintf(" <td>%s</td>\n",iptoa(m_firstIp,ipbuf) );
sb->safePrintf(" <td>%" PRId32"</td>\n",(int32_t)m_errCount );
sb->safePrintf(" <td>%" PRId32"</td>\n",(int32_t)m_sameErrCount );
sb->safePrintf(" <td>%" PRIu64"</td>\n",getUrlHash48());
sb->safePrintf(" <td>%" PRId32"</td>\n",m_siteNumInlinks );
sb->safePrintf(" <td>%" PRId32"</td>\n",(int32_t)m_hopCount );
@@ -374,6 +364,7 @@ int32_t SpiderRequest::printTableHeader ( SafeBuf *sb , bool currentlySpidering)
sb->safePrintf(" <td><b>firstIp</b></td>\n");
sb->safePrintf(" <td><b>errCount</b></td>\n");
sb->safePrintf(" <td><b>sameErrCount</b></td>\n");
sb->safePrintf(" <td><b>urlHash48</b></td>\n");
sb->safePrintf(" <td><b>siteInlinks</b></td>\n");
sb->safePrintf(" <td><b>hops</b></td>\n");
@@ -1327,8 +1318,10 @@ checkNextRule:
if ( isForMsg20 ) continue;
// reply based
if ( ! srep ) continue;
// get our error code
int32_t errCode = srep->m_errCode;
// . make it zero if not tmp error
// . now have EDOCUNCHANGED and EDOCNOGOODDATE from
// Msg13.cpp, so don't count those here...
@@ -1346,6 +1339,7 @@ checkNextRule:
errCode = 0;
// if no match continue
if ( (bool)errCode == val ) continue;
// skip
p += 11;
// skip to next constraint
@@ -2140,6 +2134,41 @@ checkNextRule:
goto checkNextRule;
}
if ( *p=='s' && strncmp(p,"sameerrorcount",14) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) {
logTrace( g_conf.m_logTraceSpider, "END, returning -1" );
return -1;
}
// skip for msg20
if ( isForMsg20 ) continue;
// reply based
if ( ! srep ) continue;
// shortcut
int32_t a = srep->m_sameErrCount;
// parse the count to compare against
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
// skip fast
//p += 14;
p = strstr(s, "&&");
// if no more constraints then it is a match
if ( ! p ) {
logTrace( g_conf.m_logTraceSpider, "END, returning i (%" PRId32")", i );
return i;
}
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// EBADURL malformed url is ... 32880
if ( *p=='e' && strncmp(p,"errorcode",9) == 0 ) {
// if we do not have enough info for outlink, all done
@@ -2351,42 +2380,6 @@ checkNextRule:
goto checkNextRule;
}
// how old is the doc in seconds? age is the pubDate age
if ( *p =='a' && strncmp(p, "age", 3) == 0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) {
logTrace( g_conf.m_logTraceSpider, "END, returning -1" );
return -1;
}
// must have a reply
if ( ! srep ) continue;
// shortcut
int32_t age;
if ( srep->m_pubDate <= 0 ) age = -1;
else age = nowGlobal - srep->m_pubDate;
// we can not match if invalid
if ( age <= 0 ) continue;
// parse the age to compare against
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && age != b ) continue;
if ( sign == SIGN_NE && age == b ) continue;
if ( sign == SIGN_GT && age <= b ) continue;
if ( sign == SIGN_LT && age >= b ) continue;
if ( sign == SIGN_GE && age < b ) continue;
if ( sign == SIGN_LE && age > b ) continue;
p = strstr(s, "&&");
// if no more constraints then it is a match
if ( ! p )
{
logTrace( g_conf.m_logTraceSpider, "END, returning i (%" PRId32")", i );
return i;
}
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// our own regex thing (match front of url)
if ( *p=='^' ) {
// advance over caret

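Both rule blocks above, the added sameerrorcount block and the removed age block, follow the same control-flow idiom: a failed comparison continues to the next rule, a passing one scans forward for "&&" and either returns the matching rule index i or jumps to checkNextRule for the next joined term. Here is a stand-alone sketch of that walk with illustrative names; termPasses stands in for the real per-term dispatch.

#include <cstring>
#include <cstdio>

// Illustrative stand-in for the per-term dispatch in Spider.cpp.
static bool termPasses(const char *term) {
    return strncmp(term, "pass", 4) == 0;
}

// Walk one rule: every "&&"-joined term must pass for the rule to match.
static bool ruleMatches(const char *rule) {
    const char *p = rule;
    for (;;) {
        if (!termPasses(p))
            return false;            // term failed: caller tries next rule
        const char *amp = strstr(p, "&&");
        if (!amp)
            return true;             // no more constraints: it is a match
        p = amp + 2;                 // skip the "&&" and go to next term
    }
}

int main() {
    printf("%d\n", ruleMatches("pass&&pass")); // 1
    printf("%d\n", ruleMatches("pass&&fail")); // 0
    return 0;
}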
@@ -14,6 +14,11 @@ class RdbList;
class HashTableX;
class SpiderColl;
#define SPIDERREQ_CURRENT_VERSION 1
#define SPIDERREP_CURRENT_VERSION 1
// lower from 1300 to 300
#define MAXUDPSLOTS 300
@@ -459,8 +464,15 @@ public:
// # of spider requests from different c-blocks. capped at 255.
// taken from the # of SpiderRequests.
uint8_t m_pageNumInlinks;
uint8_t m_reservedb2;
uint8_t m_reservedb3;
// . this is copied from the most recent SpiderReply into here
// . it's so XmlDoc.cpp can increment it and add it to the new
// SpiderReply it adds in case there is another download error,
// like ETCPTIMEDOUT or EDNSTIMEDOUT
uint8_t m_sameErrCount;
uint8_t m_version;
uint8_t m_reservedb4;
// info on the page we were harvested from
@@ -474,7 +486,9 @@ public:
// when we scan all of the SpiderRequests it has.
int32_t m_discoveryTime;
int32_t m_reservedc2;
// Used to compare the previous errCode with the current errCode, for
// maintaining the sameErrCount value.
int32_t m_prevErrCode; // was m_reservedc2
// . replace this with something we need for smart compression
// . this is zero if none or invalid
@@ -627,6 +641,7 @@ public:
m_ufn = -1;
// this too
m_priority = -1;
m_version = SPIDERREQ_CURRENT_VERSION;
}
static int32_t getNeededSize ( int32_t urlLen ) {
@@ -688,6 +703,10 @@ public:
bool setFromInject(const char *url);
bool isCorrupt() const;
SpiderRequest() {
reset();
}
} __attribute__((packed, aligned(4)));
// . XmlDoc adds this record to spiderdb after attempting to spider a url
@@ -734,8 +753,10 @@ public:
// SpiderRequest's m_siteNumLinks
int32_t m_siteNumInlinks;
// the actual pub date we extracted (0 means none, -1 unknown)
int32_t m_pubDate;
uint8_t m_sameErrCount;
uint8_t m_version;
uint8_t m_reserved_u8b;
uint8_t m_reserved_u8c;
// . this is zero if none or invalid
int32_t m_contentHash32;
@@ -827,7 +848,10 @@ public:
int32_t getRecSize () const { return m_dataSize + 4 + sizeof(key128_t); }
// clear all
void reset() { memset ( this , 0 , sizeof(SpiderReply) ); }
void reset() {
memset(this, 0, sizeof(SpiderReply));
m_version = SPIDERREP_CURRENT_VERSION;
}
void setKey ( int32_t firstIp, int64_t parentDocId, int64_t uh48, bool isDel ) ;
@@ -840,6 +864,10 @@ public:
int64_t getParentDocId() const {
return Spiderdb::getParentDocId(&m_key);
}
SpiderReply() {
reset();
}
} __attribute__((packed, aligned(4)));
// was 1000 but breached, now equals SR_READ_SIZE/sizeof(SpiderReply)

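The Spider.h hunks above show the compatibility pattern this commit depends on: SpiderRequest and SpiderReply are packed on-disk records, so the new fields (m_sameErrCount, m_version, m_prevErrCode) take over previously reserved bytes instead of growing the structs, and the new constructors call reset(), which stamps the current version so records written before this change (whose reserved bytes were presumably zero) can be told apart. A compressed sketch of the idea, with illustrative names:

#include <cstdint>
#include <cstring>
#include <cstdio>

#define DEMOREC_CURRENT_VERSION 1  // illustrative, not the real constant

struct DemoReply {
    int32_t m_errCode;
    uint8_t m_sameErrCount;  // was a reserved byte in version 0
    uint8_t m_version;       // reads back 0 for records written before the change
    uint8_t m_reserved_u8b;
    uint8_t m_reserved_u8c;

    void reset() {
        memset(this, 0, sizeof(DemoReply));
        m_version = DEMOREC_CURRENT_VERSION;
    }
    DemoReply() { reset(); }
} __attribute__((packed, aligned(4)));

int main() {
    DemoReply r;
    // size is unchanged by the new fields; version is stamped on reset()
    printf("size=%zu version=%d\n", sizeof(DemoReply), (int)r.m_version);
    return 0;
}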
@@ -2407,6 +2407,12 @@ bool SpiderColl::scanListForWinners ( ) {
if ( srep ) {
sreq->m_errCount = srep->m_errCount;
// Save error code of last reply in the request so we
// can compare with error code after next spider attempt.
sreq->m_prevErrCode = srep->m_errCode;
sreq->m_sameErrCount = srep->m_sameErrCount;
// . assign this too from latest reply - smart compress
// . this WAS SpiderReply::m_pubdate so it might be
// set to a non-zero value that is wrong now... but
@@ -2415,6 +2421,11 @@ bool SpiderColl::scanListForWinners ( ) {
// if we tried it before
sreq->m_hadReply = true;
}
else {
sreq->m_errCount = 0;
sreq->m_sameErrCount = 0;
sreq->m_prevErrCode = 0;
}
// . get the url filter we match
// . if this is slow see the TODO below in dedupSpiderdbList()

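In the SpiderColl.cpp hunk above, the winning SpiderRequest inherits the latest SpiderReply's error bookkeeping, and is explicitly zeroed when no reply exists so that stale bytes in old records cannot masquerade as counts. A trimmed sketch of that carry-over, with struct shapes reduced to the fields touched here:

#include <cstdint>
#include <cstdio>

// Trimmed stand-ins for the spiderdb record types.
struct Reply   { int32_t m_errCode; uint8_t m_errCount; uint8_t m_sameErrCount; };
struct Request {
    int32_t m_prevErrCode;
    uint8_t m_errCount;
    uint8_t m_sameErrCount;
    bool    m_hadReply;
};

// Mirrors SpiderColl::scanListForWinners(): copy the last reply's error
// state into the request so the next attempt can compare error codes.
static void carryOverErrorState(Request *sreq, const Reply *srep) {
    if (srep) {
        sreq->m_errCount     = srep->m_errCount;
        sreq->m_prevErrCode  = srep->m_errCode; // compared after next attempt
        sreq->m_sameErrCount = srep->m_sameErrCount;
        sreq->m_hadReply     = true;            // we tried it before
    } else {
        sreq->m_errCount     = 0;
        sreq->m_sameErrCount = 0;
        sreq->m_prevErrCode  = 0;
    }
}

int main() {
    Reply   rep = { 32880, 3, 2 };
    Request req = {};
    carryOverErrorState(&req, &rep);
    printf("prevErrCode=%d sameErrCount=%d\n",
           req.m_prevErrCode, (int)req.m_sameErrCount);
    return 0;
}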
@@ -14303,9 +14303,11 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
}
// store it
m_srep.m_firstIp = firstIp;
// assume no error
// MDW: not right...
// Default to no error. Will be set below.
m_srep.m_errCount = 0;
m_srep.m_sameErrCount = 0;
// otherwise, inherit from oldsr to be safe
//if ( m_sreqValid )
// m_srep.m_firstIp = m_sreq.m_firstIp;
@@ -14411,22 +14413,25 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
m_srep.m_crawlDelayMS = m_crawlDelay;
else
m_srep.m_crawlDelayMS = -1;
//if ( m_pubDateValid ) m_srep.m_pubDate = m_pubDate;
m_srep.m_pubDate = 0;
if ( m_langIdValid ) m_srep.m_langId = m_langId;
if ( m_isRSSValid ) m_srep.m_isRSS = m_isRSS;
if ( m_isPermalinkValid ) m_srep.m_isPermalink =m_isPermalink;
if ( m_httpStatusValid ) m_srep.m_httpStatus = m_httpStatus;
// stuff that is automatically valid
m_srep.m_isPingServer = 0;
if ( fu ) m_srep.m_isPingServer = (bool)fu->isPingServer();
// this was replaced by m_contentHash32
//m_srep.m_newRequests = 0;
m_srep.m_errCode = m_indexCode;
if ( m_downloadEndTimeValid )
m_srep.m_downloadEndTime = m_downloadEndTime;
else
m_srep.m_downloadEndTime = 0;
// is the original spider request valid?
if ( m_sreqValid ) {
// preserve the content hash in case m_indexCode is
@@ -14442,16 +14447,43 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
n->m_hasAuthorityInlink = o->m_hasAuthorityInlink;
n->m_isPingServer = o->m_isPingServer;
// the validator flags
n->m_hasAuthorityInlinkValid =
o->m_hasAuthorityInlinkValid;
n->m_hasAuthorityInlinkValid = o->m_hasAuthorityInlinkValid;
// get error count from original spider request
int32_t newc = m_sreq.m_errCount;
// inc for us, since we had an error
newc++;
// contain to one byte
if ( newc > 255 ) newc = 255;
if ( newc > 255 ) {
newc = 255;
}
// store in our spiderreply
m_srep.m_errCount = newc;
// Number of times we have seen the same error code in a row
if( m_sreq.m_prevErrCode == m_srep.m_errCode ) {
int32_t newc = m_sreq.m_sameErrCount;
// Sanity check: sameErrCount must be smaller than errCount here.
if( newc >= m_srep.m_errCount ) {
log(LOG_WARN,"Correcting sameErrCount. Count=%" PRId32 ", sameErrCount=%" PRId32 ", prev_errCode=%" PRId32 ", curr_errCode=%" PRId32 ", url=%s, uh48=%" PRIx64 ", err=%s", m_srep.m_errCount, m_srep.m_sameErrCount, m_sreq.m_prevErrCode, m_srep.m_errCode, m_sreq.m_url, uh48, mstrerror( m_srep.m_errCode ));
newc = 0;
}
// inc for us, since we had an error
newc++;
// contain to one byte
if ( newc > 255 ) {
newc = 255;
}
// store in our spiderreply
m_srep.m_sameErrCount = newc;
}
else {
m_srep.m_sameErrCount = 0;
}
}
// . and do not really consider this an error
// . i don't want the url filters treating it as an error reply
@@ -14466,6 +14498,8 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
m_srep.m_errCode = 0;
// and no error count, it wasn't an error per se
m_srep.m_errCount = 0;
m_srep.m_sameErrCount = 0;
// call it 200
m_srep.m_httpStatus = 200;
}
@@ -14473,8 +14507,6 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
if ( m_indexCode == EDOCUNCHANGED &&
m_oldDocValid &&
m_oldDoc ) {
//m_srep.m_pubDate = m_oldDoc->m_pubDate;
m_srep.m_pubDate = 0;
m_srep.m_langId = m_oldDoc->m_langId;
m_srep.m_isRSS = m_oldDoc->m_isRSS;
m_srep.m_isPermalink = m_oldDoc->m_isPermalink;
@@ -14579,8 +14611,6 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
// . EUDPTIMEDOUT, EDNSTIMEDOUT, ETCPTIMEDOUT, EDNSDEAD, EBADIP,
// ENETUNREACH,EBADMIME,ECONNREFUSED,ECHOSTUNREACH
m_srep.m_siteNumInlinks = m_siteNumInlinks;
//m_srep.m_pubDate = *pubDate;
m_srep.m_pubDate = 0;
// this was replaced by m_contentHash32
//m_srep.m_newRequests = 0;
m_srep.m_langId = *langId;

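The core of the XmlDoc.cpp change is the sameErrCount update rule: the counter advances only while the new error code equals the one saved in m_prevErrCode, is clamped to one byte, resets to zero when the error changes, and is corrected (the real code logs a warning) if a stale value would equal or exceed the total error count, which counts all errors and must therefore stay strictly larger. Factored out as a stand-alone sketch; the error-code values in main are illustrative (32880 is EBADURL per the comment earlier in this diff):

#include <cstdint>
#include <cstdio>

// newErrCount is the already-incremented total error count for this url.
static uint8_t nextSameErrCount(int32_t prevErrCode, int32_t newErrCode,
                                int32_t prevSameErrCount, int32_t newErrCount) {
    if (prevErrCode != newErrCode)
        return 0;                   // different error: reset the streak
    int32_t c = prevSameErrCount;
    if (c >= newErrCount)           // stale/corrupt value: correct it
        c = 0;                      // (the real code logs a warning here)
    c++;                            // one more failure with the same error
    if (c > 255)
        c = 255;                    // contain to one byte
    return (uint8_t)c;
}

int main() {
    printf("%d\n", nextSameErrCount(32880, 32880, 1, 3)); // 2: same error again
    printf("%d\n", nextSameErrCount(32880, 12345, 2, 4)); // 0: error changed
    return 0;
}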
@@ -2843,6 +2843,8 @@ int32_t dumpSpiderdb ( const char *coll, int32_t startFileNum, int32_t numFiles,
static int64_t s_lastRepUh48 = 0LL;
static int32_t s_lastErrCode = 0;
static int32_t s_lastErrCount = 0;
static int32_t s_sameErrCount = 0;
CollectionRec *cr = g_collectiondb.getRec(coll);
loop:
@@ -2905,6 +2907,7 @@ int32_t dumpSpiderdb ( const char *coll, int32_t startFileNum, int32_t numFiles,
s_lastRepUh48 = srep->getUrlHash48();
s_lastErrCode = srep->m_errCode;
s_lastErrCount = srep->m_errCount;
s_sameErrCount = srep->m_sameErrCount;
// get firstip
if ( printStats == 1 ) {
@@ -2935,6 +2938,7 @@ int32_t dumpSpiderdb ( const char *coll, int32_t startFileNum, int32_t numFiles,
printf(" hadReply=%" PRId32,(int32_t)hadReply);
printf(" errcount=%" PRId32,(int32_t)s_lastErrCount);
printf(" sameerrcount=%" PRId32,(int32_t)s_sameErrCount);
if ( s_lastErrCode ) {
printf( " errcode=%" PRId32"(%s)", ( int32_t ) s_lastErrCode, mstrerror( s_lastErrCode ) );
@@ -2942,6 +2946,12 @@ int32_t dumpSpiderdb ( const char *coll, int32_t startFileNum, int32_t numFiles,
printf( " errcode=%" PRId32, ( int32_t ) s_lastErrCode );
}
if ( sreq->m_prevErrCode ) {
printf( " preverrcode=%" PRId32"(%s)", ( int32_t ) sreq->m_prevErrCode, mstrerror( sreq->m_prevErrCode ) );
} else {
printf( " preverrcode=%" PRId32, ( int32_t ) sreq->m_prevErrCode );
}
printf("\n");
}
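The dumpSpiderdb() hunks extend an existing trick: the dump apparently sees a url's SpiderReply before the SpiderRequest records it wants to annotate, so the last reply's error state is cached in function-local statics and printed when a request with the same uh48 comes by. A trimmed sketch of that pattern; record shapes and the hadReply derivation are simplified, while the static names follow the diff.

#include <cstdint>
#include <cstdio>

struct Rep { int64_t uh48; int32_t errCode, errCount, sameErrCount; };
struct Req { int64_t uh48; int32_t prevErrCode; };

// Called once per spiderdb record while scanning; exactly one pointer is set.
static void dumpRecord(const Rep *srep, const Req *sreq) {
    static int64_t s_lastRepUh48  = 0LL;
    static int32_t s_lastErrCode  = 0;
    static int32_t s_lastErrCount = 0;
    static int32_t s_sameErrCount = 0;
    if (srep) {
        s_lastRepUh48  = srep->uh48;   // remember the latest reply's state
        s_lastErrCode  = srep->errCode;
        s_lastErrCount = srep->errCount;
        s_sameErrCount = srep->sameErrCount;
        return;
    }
    // print the cached reply state next to the matching request
    bool hadReply = (sreq->uh48 == s_lastRepUh48);
    printf(" hadReply=%d errcount=%d sameerrcount=%d errcode=%d preverrcode=%d\n",
           (int)hadReply, s_lastErrCount, s_sameErrCount,
           s_lastErrCode, sreq->prevErrCode);
}

int main() {
    Rep rep = { 0xabcLL, 32880, 3, 2 };
    Req req = { 0xabcLL, 32880 };
    dumpRecord(&rep, nullptr);
    dumpRecord(nullptr, &req);
    return 0;
}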