Revert "added SpiderRequest::m_lastSuccessfulSpideredTime"
This reverts commit 29824085f1.
Parms.cpp (14 lines changed)
@@ -22563,25 +22563,15 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
         "to adjust how often you want things respidered."
         "</td></tr>"
 
-        "<tr class=poo><td>indexage</td>"
-        "<td>"
-        "How long has it been since the url was last "
-        "successfully indexed? In seconds. "
-        "Can use <, >, <=, >=, ==, != comparison operators."
-        "</td></tr>"
-
         "<tr class=poo><td>urlage</td>"
         "<td>"
-        "This uses the time, in seconds, since a url was "
-        "first added to spiderdb to be spidered, aka "
+        "This is the time, in seconds, since a url was first "
+        "added to spiderdb to be spidered. This is "
         "its discovery date. "
         "Can use <, >, <=, >=, ==, != comparison operators."
         "</td></tr>"
 
-
-
-
         //"<tr class=poo><td>!newoutlink</td>"
         //"<td>Matches if document is NOT a new outlink."
         //"</td></tr>"
 
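The rows above render the help table for url-filter expressions. For illustration only (hypothetical values, not taken from this commit, and exact whitespace handling depends on the parser), expressions using the documented operators might look like:

    urlage>604800
    indexage<=86400 && urlage>604800

The second line chains two constraints with &&, which getUrlFilterNum2() in Spider.cpp walks rule by rule; note that indexage only existed on the pre-revert side of this diff.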
Spider.cpp (57 lines changed)
@@ -207,21 +207,9 @@ int32_t SpiderRequest::print ( SafeBuf *sbarg ) {
 }
 
 void SpiderReply::setKey (int32_t firstIp,
-                          // no need for parentdocid in this any more.
-                          //int64_t parentDocId,
+                          int64_t parentDocId,
                           int64_t uh48,
                           bool isDel) {
-        // now we use a 1 parentdocid for replies that were successful
-        int64_t parentDocId = 1;
-        // or 0 if had error. this way we only keep at most 2 SpiderReplies
-        // for each url in spiderdb. we need to keep the last successful
-        // spiderreply in spiderdb so
-        // SpiderRequest::m_lastSuccessfulSpideredTime will be valid.
-        // this way the reply that was successful will occur after the
-        // one that had an error, so we can just check the last spider reply
-        // when doing our scan in scanListForWinners().
-        if ( m_errCode ) parentDocId = 0;
-
         m_key = g_spiderdb.makeKey ( firstIp,uh48,false,parentDocId , isDel );
         // set dataSize too!
         m_dataSize = sizeof(SpiderReply) - sizeof(key128_t) - 4;
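The removed comment block describes a keying trick: collapsing parentDocId to 1 for successful replies and 0 for errored ones bounds spiderdb at two SpiderReply records per url, and per the comment the successful one sorts after the errored one. A minimal standalone sketch of that ordering, with hypothetical types standing in for the real spiderdb key:

    #include <cstdint>
    #include <map>

    struct ReplyKey {
            int64_t uh48;        // 48-bit url hash
            int64_t parentDocId; // collapsed to 1 (success) or 0 (error)
            bool operator<(const ReplyKey &o) const {
                    if (uh48 != o.uh48) return uh48 < o.uh48;
                    return parentDocId < o.parentDocId;
            }
    };

    int main() {
            std::map<ReplyKey, const char *> spiderdb;
            spiderdb[{12345, 0}] = "reply with m_errCode set";
            spiderdb[{12345, 1}] = "successful reply";
            // a forward scan over this url's key range ends on the
            // successful reply, which scanListForWinners() relied on
            return 0;
    }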
@@ -4577,13 +4565,6 @@ bool SpiderColl::scanListForWinners ( ) {
                 // assume our added time is the first time this url was added
                 sreq->m_discoveryTime = sreq->m_addedTime;
 
-                // record the last time we successfully indexed this doc, ifany
-                if ( srep && ! srep->m_errCode )
-                        sreq->m_lastSuccessfulSpideredTime =
-                                srep->m_spideredTime;
-                else
-                        sreq->m_lastSuccessfulSpideredTime = 0;
-
                 // if ( uh48 == 110582802025376LL )
                 //      log("hey");
 
@@ -4613,12 +4594,10 @@ bool SpiderColl::scanListForWinners ( ) {
                         // and the min added time as well!
                         // get the oldest timestamp so
                         // gbssDiscoveryTime will be accurate.
-                        if ( sreq->m_discoveryTime <
-                             wsreq->m_discoveryTime )
+                        if ( sreq->m_discoveryTime < wsreq->m_discoveryTime )
                                 wsreq->m_discoveryTime =
                                         sreq->m_discoveryTime;
-                        if ( wsreq->m_discoveryTime <
-                             sreq->m_discoveryTime )
+                        if ( wsreq->m_discoveryTime < sreq->m_discoveryTime )
                                 sreq->m_discoveryTime =
                                         wsreq->m_discoveryTime;
                 }
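This hunk is whitespace-only: each two-line condition is joined onto a single line. Functionally, the pair of ifs propagates the earliest discovery time in both directions; a minimal equivalent sketch (with a hypothetical Req type in place of SpiderRequest):

    #include <algorithm>
    #include <cstdint>

    struct Req { int32_t m_discoveryTime; };

    static void shareOldestDiscoveryTime(Req &a, Req &b) {
            // both copies end up reporting the earliest time the url was
            // seen, matching the symmetric comparisons above
            int32_t oldest = std::min(a.m_discoveryTime, b.m_discoveryTime);
            a.m_discoveryTime = oldest;
            b.m_discoveryTime = oldest;
    }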
@@ -11334,7 +11313,6 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
 
                 if ( *p != 'i' ) goto skipi;
 
-
                 if ( strncmp(p,"isinjected",10) == 0 ) {
                         // skip for msg20
                         if ( isForMsg20 ) continue;
@@ -11942,7 +11920,6 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
                         goto checkNextRule;
                 }
 
-
                 // non-boolen junk
         skipi:
 
@@ -12427,32 +12404,6 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
                         goto checkNextRule;
                 }
 
-                // constraint for last time url was successfully indexed
-                if ( *p=='i' && strncmp(p,"indexage",8) == 0 ) {
-                        // skip for msg20
-                        if ( isForMsg20 ) continue;
-                        // if never successfully indexed, skip this one
-                        if ( sreq->m_lastSuccessfulSpideredTime == 0) continue;
-                        int32_t age;
-                        age = nowGlobal - sreq->m_lastSuccessfulSpideredTime;
-                        // the argument entered by user
-                        int32_t uage = atoi(s) ;
-                        if ( sign == SIGN_EQ && age != uage ) continue;
-                        if ( sign == SIGN_NE && age == uage ) continue;
-                        if ( sign == SIGN_GT && age <= uage ) continue;
-                        if ( sign == SIGN_LT && age >= uage ) continue;
-                        if ( sign == SIGN_GE && age < uage ) continue;
-                        if ( sign == SIGN_LE && age > uage ) continue;
-                        // skip over 'indexage'
-                        p += 8;
-                        p = strstr(s, "&&");
-                        //if nothing, else then it is a match
-                        if ( ! p ) return i;
-                        //skip the '&&' and go to next rule
-                        p += 2;
-                        goto checkNextRule;
-                }
-
                 // selector using the first time it was added to the Spiderdb
                 // added by Sam, May 5th 2015
                 if ( *p=='u' && strncmp(p,"urlage",6) == 0 ) {
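The removed indexage rule and the surviving urlage rule share the same continue-on-mismatch pattern: any failed comparison abandons the current filter rule. The six sign checks amount to a single dispatch, sketched here standalone with the SIGN_* names mirroring the source (this is not the actual enum definition from the codebase):

    #include <cstdint>

    enum Sign { SIGN_EQ, SIGN_NE, SIGN_GT, SIGN_LT, SIGN_GE, SIGN_LE };

    // true when 'age' satisfies the user-supplied constraint; a false
    // result corresponds to the 'continue' statements in the rules above
    static bool signMatches(Sign sign, int32_t age, int32_t uage) {
            switch (sign) {
            case SIGN_EQ: return age == uage;
            case SIGN_NE: return age != uage;
            case SIGN_GT: return age >  uage;
            case SIGN_LT: return age <  uage;
            case SIGN_GE: return age >= uage;
            case SIGN_LE: return age <= uage;
            }
            return false;
    }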
@@ -12476,8 +12427,6 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
                         if ( sign == SIGN_LT && sreq_age >= argument_age ) continue;
                         if ( sign == SIGN_GE && sreq_age < argument_age ) continue;
                         if ( sign == SIGN_LE && sreq_age > argument_age ) continue;
-                        // skip over 'urlage'
-                        p += 6;
                         p = strstr(s, "&&");
                         //if nothing, else then it is a match
                         if ( ! p ) return i;
Spider.h (8 lines changed)
@@ -532,11 +532,7 @@ class SpiderRequest {
         // then we increment the last 8 bits or so. see Msg22.cpp.
         //int64_t   m_probDocId;
         //int32_t   m_reservedc1;
-        //int32_t   m_reservedc2;
-
-        // if there is a 'successful' SpiderReply for this url then this is
-        // the SpiderReply::m_spideredTime of the most recent one.
-        int32_t    m_lastSuccessfulSpideredTime;
+        int32_t    m_reservedc2;
 
         //int32_t   m_parentPubDate;
 
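Restoring m_reservedc2 rather than simply deleting the field keeps the SpiderRequest record layout intact, since both fields are int32_t and these records are persisted in spiderdb. A minimal illustration with hypothetical stand-in structs (not the real classes):

    #include <cstdint>

    struct BeforeRevert { int32_t m_lastSuccessfulSpideredTime; };
    struct AfterRevert  { int32_t m_reservedc2; };

    // records already written to disk stay the same size either way
    static_assert(sizeof(BeforeRevert) == sizeof(AfterRevert),
                  "field swap must not change the record layout");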
@@ -959,7 +955,7 @@ class SpiderReply {
         void reset() { memset ( this , 0 , sizeof(SpiderReply) ); };
 
         void setKey ( int32_t firstIp,
-                      //int64_t parentDocId ,
+                      int64_t parentDocId ,
                       int64_t uh48 ,
                       bool isDel ) ;
 
XmlDoc.cpp

@@ -22028,7 +22028,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
         srep.m_domHash32 = m_sreq.m_domHash32;
         srep.m_spideredTime = getTimeGlobal();
         int64_t uh48 = m_sreq.getUrlHash48();
-        //int64_t parentDocId = 0LL;
+        int64_t parentDocId = 0LL;
         srep.m_contentHash32 = 0;
         // were we already in titledb before we started spidering?
         // yes otherwise we would have called "goto skip9" above
@@ -22038,7 +22038,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
         srep.m_isIndexedINValid = false;
         srep.m_errCode = EREINDEXREDIR; // indexCode
         srep.m_downloadEndTime = 0;
-        srep.setKey ( srep.m_firstIp, /*parentDocId ,*/uh48 , false );
+        srep.setKey ( srep.m_firstIp, parentDocId , uh48 , false );
         // lock of request needs to match that of reply so the
         // reply, when recevied by Rdb.cpp which calls addSpiderReply()
         // can unlock this url so it can be spidered again.
@@ -24922,7 +24922,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
         log("xmldoc: uh48=%"UINT64" parentdocid=%"UINT64"",uh48,parentDocId);
 
         // set the key, m_srep.m_key
-        m_srep.setKey ( firstIp, /*parentDocId ,*/ uh48 , false );
+        m_srep.setKey ( firstIp, parentDocId , uh48 , false );
 
         // . did we download a page? even if indexcode is set we might have
         // . if this is non-zero that means its valid