merged spidering-related changes from staging

This commit is contained in:
Brian Rasmusson
2017-07-04 16:12:01 +02:00
6 changed files with 151 additions and 88 deletions

@@ -10917,22 +10917,6 @@ static bool printUrlExpressionExamples ( SafeBuf *sb ) {
//"<td>Matches if document is NOT a new outlink."
//"</td></tr>"
"<tr class=poo><td>age</td>"
"<td>"
"How old is the doucment <b>in seconds</b>. "
"The age is based on the publication date of "
"the document, which could also be the "
"time that the document was last significantly "
"modified. If this date is unknown then the age "
"will be -1 and only match the expression "
"<i>age==-1</i>. "
"When harvesting links, we guess the publication "
"date of the oulink by detecting dates contained "
"in the url itself, which is popular among some "
"forms of permalinks. This allows us to put "
"older permalinks into a slower spider queue."
"</td></tr>"
"<tr class=poo><td>spiderwaited &lt; 3600</td>"
"<td>"
"<i>spiderwaited</i> is how many seconds have elapsed "
@@ -11032,6 +11016,13 @@ static bool printUrlExpressionExamples ( SafeBuf *sb ) {
"\"temporary\" errors like DNS timeouts."
"</td></tr>"
"<tr class=poo><td>sameerrorcount==1</td>"
"<td>"
"The number of times the url has failed to "
"be indexed with the same error. Reset to 0 "
"every time the error code changes."
"</td></tr>"
"<tr class=poo><td>errorcode==32880</td>"
"<td>"
"If the last time it was spidered it had this "

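A note on the filter terms documented above: sameerrorcount, errorcode, age and the rest are numeric terms evaluated against the latest SpiderReply with one of six comparison signs. The sketch below is illustrative only; the SIGN_* names follow the constants visible later in this diff, while compareSigned and the values in main are stand-ins, not the actual Gigablast parser.

#include <cstdint>
#include <cstdio>

enum Sign { SIGN_EQ, SIGN_NE, SIGN_GT, SIGN_LT, SIGN_GE, SIGN_LE };

// Evaluate "a <sign> b", the comparison at the heart of every numeric
// url-filter term (age, errorcode, sameerrorcount, spiderwaited, ...).
static bool compareSigned(int32_t a, Sign sign, int32_t b) {
    switch (sign) {
        case SIGN_EQ: return a == b;
        case SIGN_NE: return a != b;
        case SIGN_GT: return a >  b;
        case SIGN_LT: return a <  b;
        case SIGN_GE: return a >= b;
        case SIGN_LE: return a <= b;
    }
    return false;
}

int main() {
    // "sameerrorcount==1": the url has failed exactly once in a row with
    // its current error code (the count resets when the code changes).
    int32_t sameErrCount = 1; // would come from the latest SpiderReply
    printf("matches: %d\n", compareSigned(sameErrCount, SIGN_EQ, 1));
    return 0;
}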
@@ -78,6 +78,8 @@ int32_t SpiderRequest::print ( SafeBuf *sbarg ) {
// indicate it's a request not a reply
sb->safePrintf("REQ ");
sb->safePrintf("ver=%d ", (int)m_version);
sb->safePrintf("uh48=%" PRIx64" ",getUrlHash48());
// if negative bail early now
if ( (m_key.n0 & 0x01) == 0x00 ) {
@@ -105,11 +107,8 @@ int32_t SpiderRequest::print ( SafeBuf *sbarg ) {
timeStruct = gmtime_r(&ts,&tm_buf);
strftime ( time , 256 , "%b %e %T %Y UTC", timeStruct );
sb->safePrintf("addedTime=%s(%" PRIu32") ",time,(uint32_t)m_addedTime );
sb->safePrintf("pageNumInlinks=%i ",(int)m_pageNumInlinks);
sb->safePrintf("hopCount=%" PRId32" ",(int32_t)m_hopCount );
sb->safePrintf("ufn=%" PRId32" ", (int32_t)m_ufn);
// why was this unsigned?
sb->safePrintf("priority=%" PRId32" ", (int32_t)m_priority);
@@ -158,12 +157,12 @@ int32_t SpiderReply::print ( SafeBuf *sbarg ) {
// indicate it's a reply
sb->safePrintf("REP ");
sb->safePrintf("ver=%d ", (int)m_version);
sb->safePrintf("uh48=%" PRIx64" ",getUrlHash48());
sb->safePrintf("parentDocId=%" PRIu64" ",getParentDocId());
// if negtaive bail early now
// if negative bail early now
if ( (m_key.n0 & 0x01) == 0x00 ) {
sb->safePrintf("[DELETE]");
if ( ! sbarg ) printf("%s",sb->getBufStart() );
@@ -187,13 +186,6 @@ int32_t SpiderReply::print ( SafeBuf *sbarg ) {
sb->safePrintf("siteNumInlinks=%" PRId32" ",m_siteNumInlinks );
time_t ts2 = (time_t)m_pubDate;
timeStruct = gmtime_r(&ts2,&tm_buf);
time[0] = 0;
if ( m_pubDate != 0 && m_pubDate != -1 )
strftime (time,256,"%b %e %T %Y UTC",timeStruct);
sb->safePrintf("pubDate=%s(%" PRId32") ",time,m_pubDate );
sb->safePrintf("ch32=%" PRIu32" ",(uint32_t)m_contentHash32);
sb->safePrintf("crawldelayms=%" PRId32"ms ",m_crawlDelayMS );
@@ -204,6 +196,9 @@ int32_t SpiderReply::print ( SafeBuf *sbarg ) {
if ( m_errCount )
sb->safePrintf("errCount=%" PRId32" ",(int32_t)m_errCount);
if ( m_sameErrCount )
sb->safePrintf("sameErrCount=%" PRId32" ",(int32_t)m_sameErrCount);
sb->safePrintf("errCode=%s(%" PRIu32") ",mstrerror(m_errCode),
(uint32_t)m_errCode );
@@ -257,16 +252,12 @@ int32_t SpiderRequest::printToJSON(SafeBuf *sb, const char *status, XmlDoc *xd,
char ipbuf[16];
sb->safePrintf("\t\t\t\"firstIp\": \"%s\",\n", iptoa(m_firstIp,ipbuf));
sb->safePrintf("\t\t\t\"errCount\": %hhd,\n", m_errCount);
sb->safePrintf("\t\t\t\"sameErrCount\": %hhd,\n", m_sameErrCount);
sb->safePrintf("\t\t\t\"urlHash48\": %" PRId64",\n", getUrlHash48());
sb->safePrintf("\t\t\t\"siteInLinks\": %" PRId32",\n", m_siteNumInlinks);
sb->safePrintf("\t\t\t\"hops\": %" PRId16",\n", m_hopCount);
sb->safePrintf("\t\t\t\"addedTime\": %" PRIu32",\n", m_addedTime);
sb->safePrintf("\t\t\t\"pageNumInLinks\": %" PRIu8",\n", m_pageNumInlinks);
sb->safePrintf("\t\t\t\"parentDocId\": %" PRId64"\n", getParentDocId());
@@ -314,9 +305,8 @@ int32_t SpiderRequest::printToTable(SafeBuf *sb, const char *status, XmlDoc *xd,
char ipbuf[16];
sb->safePrintf(" <td>%s</td>\n",iptoa(m_firstIp,ipbuf) );
sb->safePrintf(" <td>%" PRId32"</td>\n",(int32_t)m_errCount );
sb->safePrintf(" <td>%" PRId32"</td>\n",(int32_t)m_sameErrCount );
sb->safePrintf(" <td>%" PRIu64"</td>\n",getUrlHash48());
sb->safePrintf(" <td>%" PRId32"</td>\n",m_siteNumInlinks );
sb->safePrintf(" <td>%" PRId32"</td>\n",(int32_t)m_hopCount );
@@ -374,6 +364,7 @@ int32_t SpiderRequest::printTableHeader ( SafeBuf *sb , bool currentlySpidering)
sb->safePrintf(" <td><b>firstIp</b></td>\n");
sb->safePrintf(" <td><b>errCount</b></td>\n");
sb->safePrintf(" <td><b>sameErrCount</b></td>\n");
sb->safePrintf(" <td><b>urlHash48</b></td>\n");
sb->safePrintf(" <td><b>siteInlinks</b></td>\n");
sb->safePrintf(" <td><b>hops</b></td>\n");
@@ -1327,8 +1318,10 @@ checkNextRule:
if ( isForMsg20 ) continue;
// reply based
if ( ! srep ) continue;
// get our error code
int32_t errCode = srep->m_errCode;
// . make it zero if not tmp error
// . now have EDOCUNCHANGED and EDOCNOGOODDATE from
// Msg13.cpp, so don't count those here...
@@ -1346,6 +1339,7 @@ checkNextRule:
errCode = 0;
// if no match continue
if ( (bool)errCode == val ) continue;
// skip
p += 11;
// skip to next constraint
@@ -2140,6 +2134,41 @@ checkNextRule:
goto checkNextRule;
}
if ( *p=='s' && strncmp(p,"sameerrorcount",14) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) {
logTrace( g_conf.m_logTraceSpider, "END, returning -1" );
return -1;
}
// skip for msg20
if ( isForMsg20 ) continue;
// reply based
if ( ! srep ) continue;
// shortcut
int32_t a = srep->m_sameErrCount;
// parse the count to compare against
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
// skip fast
//p += 14;
p = strstr(s, "&&");
// if no more constraints then it is a match
if ( ! p ) {
logTrace( g_conf.m_logTraceSpider, "END, returning i (%" PRId32")", i );
return i;
}
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// EBADURL malformed url is ... 32880
if ( *p=='e' && strncmp(p,"errorcode",9) == 0 ) {
// if we do not have enough info for outlink, all done
@@ -2351,42 +2380,6 @@ checkNextRule:
goto checkNextRule;
}
// how old is the doc in seconds? age is the pubDate age
if ( *p =='a' && strncmp(p, "age", 3) == 0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) {
logTrace( g_conf.m_logTraceSpider, "END, returning -1" );
return -1;
}
// must have a reply
if ( ! srep ) continue;
// shortcut
int32_t age;
if ( srep->m_pubDate <= 0 ) age = -1;
else age = nowGlobal - srep->m_pubDate;
// we can not match if invalid
if ( age <= 0 ) continue;
// parse the age to compare against
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && age != b ) continue;
if ( sign == SIGN_NE && age == b ) continue;
if ( sign == SIGN_GT && age <= b ) continue;
if ( sign == SIGN_LT && age >= b ) continue;
if ( sign == SIGN_GE && age < b ) continue;
if ( sign == SIGN_LE && age > b ) continue;
p = strstr(s, "&&");
// if no more constraints then it is a match
if ( ! p )
{
logTrace( g_conf.m_logTraceSpider, "END, returning i (%" PRId32")", i );
return i;
}
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// our own regex thing (match front of url)
if ( *p=='^' ) {
// advance over caret

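Both rule blocks above, the added sameerrorcount block and the removed age block, follow the same control-flow idiom: a failed comparison continues to the next rule, a passing one scans forward for "&&" and either returns the matching rule index i or jumps to checkNextRule for the next joined term. Here is a stand-alone sketch of that walk with illustrative names; termPasses stands in for the real per-term dispatch.

#include <cstring>
#include <cstdio>

// Illustrative stand-in for the per-term dispatch in Spider.cpp.
static bool termPasses(const char *term) {
    return strncmp(term, "pass", 4) == 0;
}

// Walk one rule: every "&&"-joined term must pass for the rule to match.
static bool ruleMatches(const char *rule) {
    const char *p = rule;
    for (;;) {
        if (!termPasses(p))
            return false;            // term failed: caller tries next rule
        const char *amp = strstr(p, "&&");
        if (!amp)
            return true;             // no more constraints: it is a match
        p = amp + 2;                 // skip the "&&" and go to next term
    }
}

int main() {
    printf("%d\n", ruleMatches("pass&&pass")); // 1
    printf("%d\n", ruleMatches("pass&&fail")); // 0
    return 0;
}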
@@ -14,6 +14,11 @@ class RdbList;
class HashTableX;
class SpiderColl;
#define SPIDERREQ_CURRENT_VERSION 1
#define SPIDERREP_CURRENT_VERSION 1
// lower from 1300 to 300
#define MAXUDPSLOTS 300
@@ -459,8 +464,15 @@ public:
// # of spider requests from different c-blocks. capped at 255.
// taken from the # of SpiderRequests.
uint8_t m_pageNumInlinks;
uint8_t m_reservedb2;
uint8_t m_reservedb3;
// . this is copied from the most recent SpiderReply into here
// . it's so XmlDoc.cpp can increment it and add it to the new
// SpiderReply it adds in case there is another download error,
// like ETCPTIMEDOUT or EDNSTIMEDOUT
uint8_t m_sameErrCount;
uint8_t m_version;
uint8_t m_reservedb4;
// info on the page we were harvested from
@@ -474,7 +486,9 @@ public:
// when we scan all of the SpiderRequests it has.
int32_t m_discoveryTime;
int32_t m_reservedc2;
// Used to compare the previous errCode with the current errCode, for
// maintaining the sameErrCount value.
int32_t m_prevErrCode; // was m_reservedc2
// . replace this with something we need for smart compression
// . this is zero if none or invalid
@@ -627,6 +641,7 @@ public:
m_ufn = -1;
// this too
m_priority = -1;
m_version = SPIDERREQ_CURRENT_VERSION;
}
static int32_t getNeededSize ( int32_t urlLen ) {
@@ -688,6 +703,10 @@ public:
bool setFromInject(const char *url);
bool isCorrupt() const;
SpiderRequest() {
reset();
}
} __attribute__((packed, aligned(4)));
// . XmlDoc adds this record to spiderdb after attempting to spider a url
@@ -734,8 +753,10 @@ public:
// SpiderRequest's m_siteNumLinks
int32_t m_siteNumInlinks;
// the actual pub date we extracted (0 means none, -1 unknown)
int32_t m_pubDate;
uint8_t m_sameErrCount;
uint8_t m_version;
uint8_t m_reserved_u8b;
uint8_t m_reserved_u8c;
// . this is zero if none or invalid
int32_t m_contentHash32;
@@ -827,7 +848,10 @@ public:
int32_t getRecSize () const { return m_dataSize + 4 + sizeof(key128_t); }
// clear all
void reset() { memset ( this , 0 , sizeof(SpiderReply) ); }
void reset() {
memset(this, 0, sizeof(SpiderReply));
m_version = SPIDERREP_CURRENT_VERSION;
}
void setKey ( int32_t firstIp, int64_t parentDocId, int64_t uh48, bool isDel ) ;
@@ -840,6 +864,10 @@ public:
int64_t getParentDocId() const {
return Spiderdb::getParentDocId(&m_key);
}
SpiderReply() {
reset();
}
} __attribute__((packed, aligned(4)));
// was 1000 but breached, now equals SR_READ_SIZE/sizeof(SpiderReply)

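The Spider.h hunks above show the compatibility pattern this commit depends on: SpiderRequest and SpiderReply are packed on-disk records, so the new fields (m_sameErrCount, m_version, m_prevErrCode) take over previously reserved bytes instead of growing the structs, and the new constructors call reset(), which stamps the current version so records written before this change (whose reserved bytes were presumably zero) can be told apart. A compressed sketch of the idea, with illustrative names:

#include <cstdint>
#include <cstring>
#include <cstdio>

#define DEMOREC_CURRENT_VERSION 1  // illustrative, not the real constant

struct DemoReply {
    int32_t m_errCode;
    uint8_t m_sameErrCount;  // was a reserved byte in version 0
    uint8_t m_version;       // reads back 0 for records written before the change
    uint8_t m_reserved_u8b;
    uint8_t m_reserved_u8c;

    void reset() {
        memset(this, 0, sizeof(DemoReply));
        m_version = DEMOREC_CURRENT_VERSION;
    }
    DemoReply() { reset(); }
} __attribute__((packed, aligned(4)));

int main() {
    DemoReply r;
    // size is unchanged by the new fields; version is stamped on reset()
    printf("size=%zu version=%d\n", sizeof(DemoReply), (int)r.m_version);
    return 0;
}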
@@ -2407,6 +2407,12 @@ bool SpiderColl::scanListForWinners ( ) {
if ( srep ) {
sreq->m_errCount = srep->m_errCount;
// Save error code of last reply in the request so we
// can compare with error code after next spider attempt.
sreq->m_prevErrCode = srep->m_errCode;
sreq->m_sameErrCount = srep->m_sameErrCount;
// . assign this too from latest reply - smart compress
// . this WAS SpiderReply::m_pubdate so it might be
// set to a non-zero value that is wrong now... but
@@ -2415,6 +2421,11 @@ bool SpiderColl::scanListForWinners ( ) {
// if we tried it before
sreq->m_hadReply = true;
}
else {
sreq->m_errCount = 0;
sreq->m_sameErrCount = 0;
sreq->m_prevErrCode = 0;
}
// . get the url filter we match
// . if this is slow see the TODO below in dedupSpiderdbList()

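In the SpiderColl.cpp hunk above, the winning SpiderRequest inherits the latest SpiderReply's error bookkeeping, and is explicitly zeroed when no reply exists so that stale bytes in old records cannot masquerade as counts. A trimmed sketch of that carry-over, with struct shapes reduced to the fields touched here:

#include <cstdint>
#include <cstdio>

// Trimmed stand-ins for the spiderdb record types.
struct Reply   { int32_t m_errCode; uint8_t m_errCount; uint8_t m_sameErrCount; };
struct Request {
    int32_t m_prevErrCode;
    uint8_t m_errCount;
    uint8_t m_sameErrCount;
    bool    m_hadReply;
};

// Mirrors SpiderColl::scanListForWinners(): copy the last reply's error
// state into the request so the next attempt can compare error codes.
static void carryOverErrorState(Request *sreq, const Reply *srep) {
    if (srep) {
        sreq->m_errCount     = srep->m_errCount;
        sreq->m_prevErrCode  = srep->m_errCode; // compared after next attempt
        sreq->m_sameErrCount = srep->m_sameErrCount;
        sreq->m_hadReply     = true;            // we tried it before
    } else {
        sreq->m_errCount     = 0;
        sreq->m_sameErrCount = 0;
        sreq->m_prevErrCode  = 0;
    }
}

int main() {
    Reply   rep = { 32880, 3, 2 };
    Request req = {};
    carryOverErrorState(&req, &rep);
    printf("prevErrCode=%d sameErrCount=%d\n",
           req.m_prevErrCode, (int)req.m_sameErrCount);
    return 0;
}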
@@ -14303,9 +14303,11 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
}
// store it
m_srep.m_firstIp = firstIp;
// assume no error
// MDW: not right...
// Default to no error. Will be set below.
m_srep.m_errCount = 0;
m_srep.m_sameErrCount = 0;
// otherwise, inherit from oldsr to be safe
//if ( m_sreqValid )
// m_srep.m_firstIp = m_sreq.m_firstIp;
@@ -14411,22 +14413,25 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
m_srep.m_crawlDelayMS = m_crawlDelay;
else
m_srep.m_crawlDelayMS = -1;
//if ( m_pubDateValid ) m_srep.m_pubDate = m_pubDate;
m_srep.m_pubDate = 0;
if ( m_langIdValid ) m_srep.m_langId = m_langId;
if ( m_isRSSValid ) m_srep.m_isRSS = m_isRSS;
if ( m_isPermalinkValid ) m_srep.m_isPermalink =m_isPermalink;
if ( m_httpStatusValid ) m_srep.m_httpStatus = m_httpStatus;
// stuff that is automatically valid
m_srep.m_isPingServer = 0;
if ( fu ) m_srep.m_isPingServer = (bool)fu->isPingServer();
// this was replaced by m_contentHash32
//m_srep.m_newRequests = 0;
m_srep.m_errCode = m_indexCode;
if ( m_downloadEndTimeValid )
m_srep.m_downloadEndTime = m_downloadEndTime;
else
m_srep.m_downloadEndTime = 0;
// is the original spider request valid?
if ( m_sreqValid ) {
// preserve the content hash in case m_indexCode is
@@ -14442,16 +14447,43 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
n->m_hasAuthorityInlink = o->m_hasAuthorityInlink;
n->m_isPingServer = o->m_isPingServer;
// the validator flags
n->m_hasAuthorityInlinkValid =
o->m_hasAuthorityInlinkValid;
n->m_hasAuthorityInlinkValid = o->m_hasAuthorityInlinkValid;
// get error count from original spider request
int32_t newc = m_sreq.m_errCount;
// inc for us, since we had an error
newc++;
// contain to one byte
if ( newc > 255 ) newc = 255;
if ( newc > 255 ) {
newc = 255;
}
// store in our spiderreply
m_srep.m_errCount = newc;
// Number of times we have seen the same error code in a row
if( m_sreq.m_prevErrCode == m_srep.m_errCode ) {
int32_t newc = m_sreq.m_sameErrCount;
// Sanity check: sameErrCount must be smaller than errCount here.
if( newc >= m_srep.m_errCount ) {
log(LOG_WARN,"Correcting sameErrCount. Count=%" PRId32 ", sameErrCount=%" PRId32 ", prev_errCode=%" PRId32 ", curr_errCode=%" PRId32 ", url=%s, uh48=%" PRIx64 ", err=%s", m_srep.m_errCount, m_srep.m_sameErrCount, m_sreq.m_prevErrCode, m_srep.m_errCode, m_sreq.m_url, uh48, mstrerror( m_srep.m_errCode ));
newc = 0;
}
// inc for us, since we had an error
newc++;
// contain to one byte
if ( newc > 255 ) {
newc = 255;
}
// store in our spiderreply
m_srep.m_sameErrCount = newc;
}
else {
m_srep.m_sameErrCount = 0;
}
}
// . and do not really consider this an error
// . i don't want the url filters treating it as an error reply
@@ -14466,6 +14498,8 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
m_srep.m_errCode = 0;
// and no error count, it wasn't an error per se
m_srep.m_errCount = 0;
m_srep.m_sameErrCount = 0;
// call it 200
m_srep.m_httpStatus = 200;
}
@@ -14473,8 +14507,6 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
if ( m_indexCode == EDOCUNCHANGED &&
m_oldDocValid &&
m_oldDoc ) {
//m_srep.m_pubDate = m_oldDoc->m_pubDate;
m_srep.m_pubDate = 0;
m_srep.m_langId = m_oldDoc->m_langId;
m_srep.m_isRSS = m_oldDoc->m_isRSS;
m_srep.m_isPermalink = m_oldDoc->m_isPermalink;
@@ -14579,8 +14611,6 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
// . EUDPTIMEDOUT, EDNSTIMEDOUT, ETCPTIMEDOUT, EDNSDEAD, EBADIP,
// ENETUNREACH,EBADMIME,ECONNREFUSED,ECHOSTUNREACH
m_srep.m_siteNumInlinks = m_siteNumInlinks;
//m_srep.m_pubDate = *pubDate;
m_srep.m_pubDate = 0;
// this was replaced by m_contentHash32
//m_srep.m_newRequests = 0;
m_srep.m_langId = *langId;

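The core of the XmlDoc.cpp change is the sameErrCount update rule: the counter advances only while the new error code equals the one saved in m_prevErrCode, is clamped to one byte, resets to zero when the error changes, and is corrected (the real code logs a warning) if a stale value would equal or exceed the total error count, which counts all errors and must therefore stay strictly larger. Factored out as a stand-alone sketch; the error-code values in main are illustrative (32880 is EBADURL per the comment earlier in this diff):

#include <cstdint>
#include <cstdio>

// newErrCount is the already-incremented total error count for this url.
static uint8_t nextSameErrCount(int32_t prevErrCode, int32_t newErrCode,
                                int32_t prevSameErrCount, int32_t newErrCount) {
    if (prevErrCode != newErrCode)
        return 0;                   // different error: reset the streak
    int32_t c = prevSameErrCount;
    if (c >= newErrCount)           // stale/corrupt value: correct it
        c = 0;                      // (the real code logs a warning here)
    c++;                            // one more failure with the same error
    if (c > 255)
        c = 255;                    // contain to one byte
    return (uint8_t)c;
}

int main() {
    printf("%d\n", nextSameErrCount(32880, 32880, 1, 3)); // 2: same error again
    printf("%d\n", nextSameErrCount(32880, 12345, 2, 4)); // 0: error changed
    return 0;
}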
@@ -2843,6 +2843,8 @@ int32_t dumpSpiderdb ( const char *coll, int32_t startFileNum, int32_t numFiles,
static int64_t s_lastRepUh48 = 0LL;
static int32_t s_lastErrCode = 0;
static int32_t s_lastErrCount = 0;
static int32_t s_sameErrCount = 0;
CollectionRec *cr = g_collectiondb.getRec(coll);
loop:
@@ -2905,6 +2907,7 @@ int32_t dumpSpiderdb ( const char *coll, int32_t startFileNum, int32_t numFiles,
s_lastRepUh48 = srep->getUrlHash48();
s_lastErrCode = srep->m_errCode;
s_lastErrCount = srep->m_errCount;
s_sameErrCount = srep->m_sameErrCount;
// get firstip
if ( printStats == 1 ) {
@@ -2935,6 +2938,7 @@ int32_t dumpSpiderdb ( const char *coll, int32_t startFileNum, int32_t numFiles,
printf(" hadReply=%" PRId32,(int32_t)hadReply);
printf(" errcount=%" PRId32,(int32_t)s_lastErrCount);
printf(" sameerrcount=%" PRId32,(int32_t)s_sameErrCount);
if ( s_lastErrCode ) {
printf( " errcode=%" PRId32"(%s)", ( int32_t ) s_lastErrCode, mstrerror( s_lastErrCode ) );
@@ -2942,6 +2946,12 @@ int32_t dumpSpiderdb ( const char *coll, int32_t startFileNum, int32_t numFiles,
printf( " errcode=%" PRId32, ( int32_t ) s_lastErrCode );
}
if ( sreq->m_prevErrCode ) {
printf( " preverrcode=%" PRId32"(%s)", ( int32_t ) sreq->m_prevErrCode, mstrerror( sreq->m_prevErrCode ) );
} else {
printf( " preverrcode=%" PRId32, ( int32_t ) sreq->m_prevErrCode );
}
printf("\n");
}
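The dumpSpiderdb() hunks extend an existing trick: the dump apparently sees a url's SpiderReply before the SpiderRequest records it wants to annotate, so the last reply's error state is cached in function-local statics and printed when a request with the same uh48 comes by. A trimmed sketch of that pattern; record shapes and the hadReply derivation are simplified, while the static names follow the diff.

#include <cstdint>
#include <cstdio>

struct Rep { int64_t uh48; int32_t errCode, errCount, sameErrCount; };
struct Req { int64_t uh48; int32_t prevErrCode; };

// Called once per spiderdb record while scanning; exactly one pointer is set.
static void dumpRecord(const Rep *srep, const Req *sreq) {
    static int64_t s_lastRepUh48  = 0LL;
    static int32_t s_lastErrCode  = 0;
    static int32_t s_lastErrCount = 0;
    static int32_t s_sameErrCount = 0;
    if (srep) {
        s_lastRepUh48  = srep->uh48;   // remember the latest reply's state
        s_lastErrCode  = srep->errCode;
        s_lastErrCount = srep->errCount;
        s_sameErrCount = srep->sameErrCount;
        return;
    }
    // print the cached reply state next to the matching request
    bool hadReply = (sreq->uh48 == s_lastRepUh48);
    printf(" hadReply=%d errcount=%d sameerrcount=%d errcode=%d preverrcode=%d\n",
           (int)hadReply, s_lastErrCount, s_sameErrCount,
           s_lastErrCode, sreq->prevErrCode);
}

int main() {
    Rep rep = { 0xabcLL, 32880, 3, 2 };
    Req req = { 0xabcLL, 32880 };
    dumpRecord(&rep, nullptr);
    dumpRecord(nullptr, &req);
    return 0;
}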