Revert "added SpiderRequest::m_lastSuccessfulSpideredTime"

This reverts commit 29824085f1.
Author: Matt
Date:   2015-05-10 09:51:10 -06:00
parent 29824085f1
commit 7c22d4770a
4 changed files with 10 additions and 75 deletions

@@ -22563,25 +22563,15 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
 "to adjust how often you want things respidered."
 "</td></tr>"
-"<tr class=poo><td>indexage</td>"
-"<td>"
-"How long has it been since the url was last "
-"successfully indexed? In seconds. "
-"Can use <, >, <=, >=, ==, != comparison operators."
-"</td></tr>"
 "<tr class=poo><td>urlage</td>"
 "<td>"
-"This uses the time, in seconds, since a url was "
-"first added to spiderdb to be spidered, aka "
+"This is the time, in seconds, since a url was first "
+"added to spiderdb to be spidered. This is "
 "its discovery date. "
 "Can use <, >, <=, >=, ==, != comparison operators."
 "</td></tr>"
 //"<tr class=poo><td>!newoutlink</td>"
 //"<td>Matches if document is NOT a new outlink."
 //"</td></tr>"

@@ -207,21 +207,9 @@ int32_t SpiderRequest::print ( SafeBuf *sbarg ) {
 }
 void SpiderReply::setKey (int32_t firstIp,
-// no need for parentdocid in this any more.
-//int64_t parentDocId,
+int64_t parentDocId,
 int64_t uh48,
 bool isDel) {
-// now we use a 1 parentdocid for replies that were successful
-int64_t parentDocId = 1;
-// or 0 if had error. this way we only keep at most 2 SpiderReplies
-// for each url in spiderdb. we need to keep the last successful
-// spiderreply in spiderdb so
-// SpiderRequest::m_lastSuccessfulSpideredTime will be valid.
-// this way the reply that was successful will occur after the
-// one that had an error, so we can just check the last spider reply
-// when doing our scan in scanListForWinners().
-if ( m_errCode ) parentDocId = 0;
 m_key = g_spiderdb.makeKey ( firstIp,uh48,false,parentDocId , isDel );
 // set dataSize too!
 m_dataSize = sizeof(SpiderReply) - sizeof(key128_t) - 4;
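The deleted comment block records the reverted design: a successful reply was keyed with parentDocId 1 and a failed one with 0, so spiderdb kept at most two SpiderReplies per url, with the successful one sorting after the failed one. A minimal sketch of that invariant, assuming only that a larger parentDocId component yields a later-sorting key (this is not the real Spiderdb::makeKey API):

    #include <cstdint>

    // Sketch of the reverted keying scheme, not the actual Spiderdb code.
    // Collapsing parentDocId to {0,1} means each url keeps at most two
    // reply records, and the successful one (1) sorts after the failed
    // one (0), so a forward scan can trust the last reply it sees.
    int64_t replyParentDocId ( int32_t errCode ) {
        return errCode ? 0LL : 1LL;
    }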
@@ -4577,13 +4565,6 @@ bool SpiderColl::scanListForWinners ( ) {
 // assume our added time is the first time this url was added
 sreq->m_discoveryTime = sreq->m_addedTime;
-// record the last time we successfully indexed this doc, if any
-if ( srep && ! srep->m_errCode )
-sreq->m_lastSuccessfulSpideredTime =
-srep->m_spideredTime;
-else
-sreq->m_lastSuccessfulSpideredTime = 0;
 // if ( uh48 == 110582802025376LL )
 // log("hey");
@@ -4613,12 +4594,10 @@ bool SpiderColl::scanListForWinners ( ) {
 // and the min added time as well!
 // get the oldest timestamp so
 // gbssDiscoveryTime will be accurate.
-if ( sreq->m_discoveryTime <
-wsreq->m_discoveryTime )
+if ( sreq->m_discoveryTime < wsreq->m_discoveryTime )
 wsreq->m_discoveryTime =
 sreq->m_discoveryTime;
-if ( wsreq->m_discoveryTime <
-sreq->m_discoveryTime )
+if ( wsreq->m_discoveryTime < sreq->m_discoveryTime )
 sreq->m_discoveryTime =
 wsreq->m_discoveryTime;
 }
@@ -11334,7 +11313,6 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
-if ( *p != 'i' ) goto skipi;
 if ( strncmp(p,"isinjected",10) == 0 ) {
 // skip for msg20
 if ( isForMsg20 ) continue;
@@ -11942,7 +11920,6 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
 goto checkNextRule;
 }
 // non-boolean junk
-skipi:
@@ -12427,32 +12404,6 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
 goto checkNextRule;
 }
-// constraint for last time url was successfully indexed
-if ( *p=='i' && strncmp(p,"indexage",8) == 0 ) {
-// skip for msg20
-if ( isForMsg20 ) continue;
-// if never successfully indexed, skip this one
-if ( sreq->m_lastSuccessfulSpideredTime == 0) continue;
-int32_t age;
-age = nowGlobal - sreq->m_lastSuccessfulSpideredTime;
-// the argument entered by user
-int32_t uage = atoi(s) ;
-if ( sign == SIGN_EQ && age != uage ) continue;
-if ( sign == SIGN_NE && age == uage ) continue;
-if ( sign == SIGN_GT && age <= uage ) continue;
-if ( sign == SIGN_LT && age >= uage ) continue;
-if ( sign == SIGN_GE && age < uage ) continue;
-if ( sign == SIGN_LE && age > uage ) continue;
-// skip over 'indexage'
-p += 8;
-p = strstr(s, "&&");
-// if nothing else, then it is a match
-if ( ! p ) return i;
-// skip the '&&' and go to next rule
-p += 2;
-goto checkNextRule;
-}
 // selector using the first time it was added to the Spiderdb
 // added by Sam, May 5th 2015
 if ( *p=='u' && strncmp(p,"urlage",6) == 0 ) {
@@ -12476,8 +12427,6 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
 if ( sign == SIGN_LT && sreq_age >= argument_age ) continue;
 if ( sign == SIGN_GE && sreq_age < argument_age ) continue;
 if ( sign == SIGN_LE && sreq_age > argument_age ) continue;
-// skip over 'urlage'
-p += 6;
 p = strstr(s, "&&");
 // if nothing else, then it is a match
 if ( ! p ) return i;

@@ -532,11 +532,7 @@ class SpiderRequest {
 // then we increment the last 8 bits or so. see Msg22.cpp.
 //int64_t m_probDocId;
 //int32_t m_reservedc1;
-//int32_t m_reservedc2;
-// if there is a 'successful' SpiderReply for this url then this is
-// the SpiderReply::m_spideredTime of the most recent one.
-int32_t m_lastSuccessfulSpideredTime;
+int32_t m_reservedc2;
 //int32_t m_parentPubDate;
@@ -959,7 +955,7 @@ class SpiderReply {
 void reset() { memset ( this , 0 , sizeof(SpiderReply) ); };
 void setKey ( int32_t firstIp,
-//int64_t parentDocId ,
+int64_t parentDocId ,
 int64_t uh48 ,
 bool isDel ) ;
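Note that the revert restores int32_t m_reservedc2 rather than simply deleting the field: SpiderRequest is a fixed-layout record stored in spiderdb, so the slot must keep its offset and width for existing records to stay readable. A generic sketch of that reserved-slot convention (names are hypothetical, not Gigablast's):

    #include <cstdint>

    struct RecOld {
        int32_t m_addedTime;
        int32_t m_reserved;      // placeholder, written as 0
    };
    struct RecNew {
        int32_t m_addedTime;
        int32_t m_lastIndexTime; // repurposes the slot at the same offset
    };
    // Repurposing (or restoring) a reserved slot must not move anything:
    static_assert ( sizeof(RecOld) == sizeof(RecNew),
                    "record layout must stay stable on disk" );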

@@ -22028,7 +22028,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 srep.m_domHash32 = m_sreq.m_domHash32;
 srep.m_spideredTime = getTimeGlobal();
 int64_t uh48 = m_sreq.getUrlHash48();
-//int64_t parentDocId = 0LL;
+int64_t parentDocId = 0LL;
 srep.m_contentHash32 = 0;
 // were we already in titledb before we started spidering?
 // yes otherwise we would have called "goto skip9" above
@@ -22038,7 +22038,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 srep.m_isIndexedINValid = false;
 srep.m_errCode = EREINDEXREDIR; // indexCode
 srep.m_downloadEndTime = 0;
-srep.setKey ( srep.m_firstIp, /*parentDocId ,*/uh48 , false );
+srep.setKey ( srep.m_firstIp, parentDocId , uh48 , false );
 // lock of request needs to match that of reply so the
 // reply, when received by Rdb.cpp which calls addSpiderReply()
 // can unlock this url so it can be spidered again.
@@ -24922,7 +24922,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
 log("xmldoc: uh48=%"UINT64" parentdocid=%"UINT64"",uh48,parentDocId);
 // set the key, m_srep.m_key
-m_srep.setKey ( firstIp, /*parentDocId ,*/ uh48 , false );
+m_srep.setKey ( firstIp, parentDocId , uh48 , false );
 // . did we download a page? even if indexcode is set we might have
 // . if this is non-zero that means its valid