Fix critical spider bug that was deleting pages

Pages were being deleted because of bogus SpiderReply::m_langId values!
This commit is contained in:
Matt 2015-03-05 08:49:39 -08:00
parent 7ae5518c1d
commit dfd6d8b2cf
4 changed files with 41 additions and 19 deletions

@ -829,7 +829,8 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
MAX_NICENESS,
cr,
false, // isoutlink?
NULL);
NULL,
-1); // langIdArg
char *expression = NULL;
int32_t priority = -4;
// sanity check

@ -2278,7 +2278,8 @@ bool SpiderColl::addSpiderRequest ( SpiderRequest *sreq ,
// HACK: set isOutlink to true here since we don't know if we have sre
ufn = ::getUrlFilterNum(sreq,NULL,nowGlobalMS,false,MAX_NICENESS,m_cr,
true,//isoutlink? HACK!
NULL); // quota table quotatable
NULL,// quota table quotatable
-1 ); // langid not valid
// sanity check
//if ( ufn < 0 ) {
// log("spider: failed to add spider request for %s because "
@ -4148,7 +4149,8 @@ bool SpiderColl::scanListForWinners ( ) {
m_cr,
false, // isOutlink?
// provide the page quota table
&m_localTable);
&m_localTable,
-1);
// sanity check
if ( ufn == -1 ) {
log("spider: failed to match url filter for "
@ -10725,14 +10727,18 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
int32_t niceness ,
CollectionRec *cr ,
bool isOutlink ,
HashTableX *quotaTable ) {
HashTableX *quotaTable ,
int32_t langIdArg ) {
int32_t langId = langIdArg;
if ( srep ) langId = srep->m_langId;
// convert lang to string
char *lang = NULL;
int32_t langLen = 0;
if ( srep ) {
if ( langId >= 0 ) { // if ( srep ) {
// this is NULL on corruption
lang = getLanguageAbbr ( srep->m_langId );
lang = getLanguageAbbr ( langId );//srep->m_langId );
langLen = gbstrlen(lang);
}
@ -11884,7 +11890,7 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
if ( ! srep ) continue;
if ( langId == -1 ) continue;
// skip if unknown? no, we support "xx" as unknown now
//if ( srep->m_langId == 0 ) continue;
// set these up
@ -12446,7 +12452,8 @@ int32_t getUrlFilterNum ( SpiderRequest *sreq ,
int32_t niceness ,
CollectionRec *cr ,
bool isOutlink ,
HashTableX *quotaTable ) {
HashTableX *quotaTable ,
int32_t langId ) {
/*
turn this off for now to save memory on the g0 cluster.
@ -12479,7 +12486,8 @@ int32_t getUrlFilterNum ( SpiderRequest *sreq ,
niceness,
cr,
isOutlink,
quotaTable );
quotaTable ,
langId );
/*
// is table full? clear it if so

@ -1596,6 +1596,7 @@ int32_t getUrlFilterNum ( class SpiderRequest *sreq ,
int32_t niceness ,
class CollectionRec *cr ,
bool isOutlink , // = false ,
HashTableX *quotaTable );//= NULL ) ;
HashTableX *quotaTable ,//= NULL ) ;
int32_t langIdArg );
#endif

@ -19160,10 +19160,15 @@ int32_t *XmlDoc::getUrlFilterNum ( ) {
// make a fake one for now
SpiderReply fakeReply;
// just language for now, so we can FILTER by language
if ( m_langIdValid ) fakeReply.m_langId = m_langId;
// SpiderReply fakeReply;
// // fix errors
// fakeReply.reset();
// fakeReply.m_isIndexedINValid = true;
// // just language for now, so we can FILTER by language
// if ( m_langIdValid ) fakeReply.m_langId = m_langId;
int32_t langIdArg = -1;
if ( m_langIdValid ) langIdArg = m_langId;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
@ -19192,10 +19197,13 @@ int32_t *XmlDoc::getUrlFilterNum ( ) {
// . look it up
// . use the old spidered date for "nowGlobal" so we can be consistent
// for injecting into the "qatest123" coll
int32_t ufn = ::getUrlFilterNum ( oldsr,&fakeReply,spideredTime,false,
m_niceness,cr,
false, // isOutlink?
NULL);
int32_t ufn = ::getUrlFilterNum ( oldsr,
NULL,//&fakeReply,
spideredTime,false,
m_niceness,cr,
false, // isOutlink?
NULL,
langIdArg);
// put it back
//newsr->m_spideredTime = saved;
@ -24956,7 +24964,8 @@ void XmlDoc::setSpiderReqForMsg20 ( SpiderRequest *sreq ,
// validate the stuff so getUrlFilterNum() acks it
sreq->m_hopCountValid = 1;
srep->reset();
srep->m_spideredTime = getSpideredTime();//m_spideredTime;
//srep->m_isSpam = isSpam; // real-time update this!!!
@ -30057,12 +30066,15 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
SpiderReply srep;
setSpiderReqForMsg20 ( &sreq , &srep );//, *isSpam );
int32_t spideredTime = getSpideredTime();
int32_t langIdArg = -1;
if ( m_langIdValid ) langIdArg = m_langId;
// get it
int32_t ufn;
ufn=::getUrlFilterNum(&sreq,&srep,spideredTime,true,
m_niceness,cr,
false, // isOutlink?
NULL );
NULL ,
langIdArg);
// sanity check
if ( ufn < 0 ) {
log("msg20: bad url filter for url %s", sreq.m_url);