Revert "added SpiderRequest::m_lastSuccessfulSpideredTime"
This reverts commit 29824085f1.
Parms.cpp (14 lines changed)
@@ -22563,25 +22563,15 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
         "to adjust how often you want things respidered."
         "</td></tr>"
 
-        "<tr class=poo><td>indexage</td>"
-        "<td>"
-        "How long has it been since the url was last "
-        "successfully indexed? In seconds. "
-        "Can use <, >, <=, >=, ==, != comparison operators."
-        "</td></tr>"
-
         "<tr class=poo><td>urlage</td>"
         "<td>"
-        "This uses the time, in seconds, since a url was "
-        "first added to spiderdb to be spidered, aka "
+        "This is the time, in seconds, since a url was first "
+        "added to spiderdb to be spidered. This is "
         "its discovery date. "
         "Can use <, >, <=, >=, ==, != comparison operators."
         "</td></tr>"
 
-
-
-
         //"<tr class=poo><td>!newoutlink</td>"
         //"<td>Matches if document is NOT a new outlink."
         //"</td></tr>"
 
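The rows above render the help table for url-filter expressions. For illustration only (hypothetical values, not taken from this commit, and exact whitespace handling depends on the parser), expressions using the documented operators might look like:

    urlage>604800
    indexage<=86400 && urlage>604800

The second line chains two constraints with &&, which getUrlFilterNum2() in Spider.cpp walks rule by rule; note that indexage only existed on the pre-revert side of this diff.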
Spider.cpp (57 lines changed)
@@ -207,21 +207,9 @@ int32_t SpiderRequest::print ( SafeBuf *sbarg ) {
 }
 
 void SpiderReply::setKey (int32_t firstIp,
-                          // no need for parentdocid in this any more.
-                          //int64_t parentDocId,
+                          int64_t parentDocId,
                           int64_t uh48,
                           bool isDel) {
-        // now we use a 1 parentdocid for replies that were successful
-        int64_t parentDocId = 1;
-        // or 0 if had error. this way we only keep at most 2 SpiderReplies
-        // for each url in spiderdb. we need to keep the last successful
-        // spiderreply in spiderdb so
-        // SpiderRequest::m_lastSuccessfulSpideredTime will be valid.
-        // this way the reply that was successful will occur after the
-        // one that had an error, so we can just check the last spider reply
-        // when doing our scan in scanListForWinners().
-        if ( m_errCode ) parentDocId = 0;
-
         m_key = g_spiderdb.makeKey ( firstIp,uh48,false,parentDocId , isDel );
         // set dataSize too!
         m_dataSize = sizeof(SpiderReply) - sizeof(key128_t) - 4;
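The removed comment block describes a keying trick: collapsing parentDocId to 1 for successful replies and 0 for errored ones bounds spiderdb at two SpiderReply records per url, and per the comment the successful one sorts after the errored one. A minimal standalone sketch of that ordering, with hypothetical types standing in for the real spiderdb key:

    #include <cstdint>
    #include <map>

    struct ReplyKey {
            int64_t uh48;        // 48-bit url hash
            int64_t parentDocId; // collapsed to 1 (success) or 0 (error)
            bool operator<(const ReplyKey &o) const {
                    if (uh48 != o.uh48) return uh48 < o.uh48;
                    return parentDocId < o.parentDocId;
            }
    };

    int main() {
            std::map<ReplyKey, const char *> spiderdb;
            spiderdb[{12345, 0}] = "reply with m_errCode set";
            spiderdb[{12345, 1}] = "successful reply";
            // a forward scan over this url's key range ends on the
            // successful reply, which scanListForWinners() relied on
            return 0;
    }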
@@ -4577,13 +4565,6 @@ bool SpiderColl::scanListForWinners ( ) {
                 // assume our added time is the first time this url was added
                 sreq->m_discoveryTime = sreq->m_addedTime;
 
-                // record the last time we successfully indexed this doc, ifany
-                if ( srep && ! srep->m_errCode )
-                        sreq->m_lastSuccessfulSpideredTime =
-                                srep->m_spideredTime;
-                else
-                        sreq->m_lastSuccessfulSpideredTime = 0;
-
                 // if ( uh48 == 110582802025376LL )
                 //      log("hey");
 
@@ -4613,12 +4594,10 @@ bool SpiderColl::scanListForWinners ( ) {
                         // and the min added time as well!
                         // get the oldest timestamp so
                         // gbssDiscoveryTime will be accurate.
-                        if ( sreq->m_discoveryTime <
-                             wsreq->m_discoveryTime )
+                        if ( sreq->m_discoveryTime < wsreq->m_discoveryTime )
                                 wsreq->m_discoveryTime =
                                         sreq->m_discoveryTime;
-                        if ( wsreq->m_discoveryTime <
-                             sreq->m_discoveryTime )
+                        if ( wsreq->m_discoveryTime < sreq->m_discoveryTime )
                                 sreq->m_discoveryTime =
                                         wsreq->m_discoveryTime;
                 }
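This hunk is whitespace-only: each two-line condition is joined onto a single line. Functionally, the pair of ifs propagates the earliest discovery time in both directions; a minimal equivalent sketch (with a hypothetical Req type in place of SpiderRequest):

    #include <algorithm>
    #include <cstdint>

    struct Req { int32_t m_discoveryTime; };

    static void shareOldestDiscoveryTime(Req &a, Req &b) {
            // both copies end up reporting the earliest time the url was
            // seen, matching the symmetric comparisons above
            int32_t oldest = std::min(a.m_discoveryTime, b.m_discoveryTime);
            a.m_discoveryTime = oldest;
            b.m_discoveryTime = oldest;
    }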
@@ -11334,7 +11313,6 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
 
                 if ( *p != 'i' ) goto skipi;
 
-
                 if ( strncmp(p,"isinjected",10) == 0 ) {
                         // skip for msg20
                         if ( isForMsg20 ) continue;
@@ -11942,7 +11920,6 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
                         goto checkNextRule;
                 }
 
-
                 // non-boolen junk
         skipi:
 
@@ -12427,32 +12404,6 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
                         goto checkNextRule;
                 }
 
-                // constraint for last time url was successfully indexed
-                if ( *p=='i' && strncmp(p,"indexage",8) == 0 ) {
-                        // skip for msg20
-                        if ( isForMsg20 ) continue;
-                        // if never successfully indexed, skip this one
-                        if ( sreq->m_lastSuccessfulSpideredTime == 0) continue;
-                        int32_t age;
-                        age = nowGlobal - sreq->m_lastSuccessfulSpideredTime;
-                        // the argument entered by user
-                        int32_t uage = atoi(s) ;
-                        if ( sign == SIGN_EQ && age != uage ) continue;
-                        if ( sign == SIGN_NE && age == uage ) continue;
-                        if ( sign == SIGN_GT && age <= uage ) continue;
-                        if ( sign == SIGN_LT && age >= uage ) continue;
-                        if ( sign == SIGN_GE && age < uage ) continue;
-                        if ( sign == SIGN_LE && age > uage ) continue;
-                        // skip over 'indexage'
-                        p += 8;
-                        p = strstr(s, "&&");
-                        //if nothing, else then it is a match
-                        if ( ! p ) return i;
-                        //skip the '&&' and go to next rule
-                        p += 2;
-                        goto checkNextRule;
-                }
-
                 // selector using the first time it was added to the Spiderdb
                 // added by Sam, May 5th 2015
                 if ( *p=='u' && strncmp(p,"urlage",6) == 0 ) {
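The removed indexage rule and the surviving urlage rule share the same continue-on-mismatch pattern: any failed comparison abandons the current filter rule. The six sign checks amount to a single dispatch, sketched here standalone with the SIGN_* names mirroring the source (this is not the actual enum definition from the codebase):

    #include <cstdint>

    enum Sign { SIGN_EQ, SIGN_NE, SIGN_GT, SIGN_LT, SIGN_GE, SIGN_LE };

    // true when 'age' satisfies the user-supplied constraint; a false
    // result corresponds to the 'continue' statements in the rules above
    static bool signMatches(Sign sign, int32_t age, int32_t uage) {
            switch (sign) {
            case SIGN_EQ: return age == uage;
            case SIGN_NE: return age != uage;
            case SIGN_GT: return age >  uage;
            case SIGN_LT: return age <  uage;
            case SIGN_GE: return age >= uage;
            case SIGN_LE: return age <= uage;
            }
            return false;
    }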
@@ -12476,8 +12427,6 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
                         if ( sign == SIGN_LT && sreq_age >= argument_age ) continue;
                         if ( sign == SIGN_GE && sreq_age < argument_age ) continue;
                         if ( sign == SIGN_LE && sreq_age > argument_age ) continue;
-                        // skip over 'urlage'
-                        p += 6;
                         p = strstr(s, "&&");
                         //if nothing, else then it is a match
                         if ( ! p ) return i;
Spider.h (8 lines changed)
@@ -532,11 +532,7 @@ class SpiderRequest {
         // then we increment the last 8 bits or so. see Msg22.cpp.
         //int64_t   m_probDocId;
         //int32_t   m_reservedc1;
-        //int32_t   m_reservedc2;
-
-        // if there is a 'successful' SpiderReply for this url then this is
-        // the SpiderReply::m_spideredTime of the most recent one.
-        int32_t    m_lastSuccessfulSpideredTime;
+        int32_t    m_reservedc2;
 
         //int32_t   m_parentPubDate;
 
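Restoring m_reservedc2 rather than simply deleting the field keeps the SpiderRequest record layout intact, since both fields are int32_t and these records are persisted in spiderdb. A minimal illustration with hypothetical stand-in structs (not the real classes):

    #include <cstdint>

    struct BeforeRevert { int32_t m_lastSuccessfulSpideredTime; };
    struct AfterRevert  { int32_t m_reservedc2; };

    // records already written to disk stay the same size either way
    static_assert(sizeof(BeforeRevert) == sizeof(AfterRevert),
                  "field swap must not change the record layout");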
@@ -959,7 +955,7 @@ class SpiderReply {
         void reset() { memset ( this , 0 , sizeof(SpiderReply) ); };
 
         void setKey ( int32_t firstIp,
-                      //int64_t parentDocId ,
+                      int64_t parentDocId ,
                       int64_t uh48 ,
                       bool isDel ) ;
 
XmlDoc.cpp

@@ -22028,7 +22028,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
         srep.m_domHash32 = m_sreq.m_domHash32;
         srep.m_spideredTime = getTimeGlobal();
         int64_t uh48 = m_sreq.getUrlHash48();
-        //int64_t parentDocId = 0LL;
+        int64_t parentDocId = 0LL;
         srep.m_contentHash32 = 0;
         // were we already in titledb before we started spidering?
         // yes otherwise we would have called "goto skip9" above
@@ -22038,7 +22038,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
         srep.m_isIndexedINValid = false;
         srep.m_errCode = EREINDEXREDIR; // indexCode
         srep.m_downloadEndTime = 0;
-        srep.setKey ( srep.m_firstIp, /*parentDocId ,*/uh48 , false );
+        srep.setKey ( srep.m_firstIp, parentDocId , uh48 , false );
         // lock of request needs to match that of reply so the
         // reply, when recevied by Rdb.cpp which calls addSpiderReply()
         // can unlock this url so it can be spidered again.
@@ -24922,7 +24922,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
         log("xmldoc: uh48=%"UINT64" parentdocid=%"UINT64"",uh48,parentDocId);
 
         // set the key, m_srep.m_key
-        m_srep.setKey ( firstIp, /*parentDocId ,*/ uh48 , false );
+        m_srep.setKey ( firstIp, parentDocId , uh48 , false );
 
         // . did we download a page? even if indexcode is set we might have
         // . if this is non-zero that means its valid