forked from Mirrors/privacore-open-source-search-engine
Merge branch 'master' of github.com:privacore/open-source-search-engine
@@ -3056,19 +3056,35 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
i++;
}

// try to fix bug of EBADURL when it wasn't really
// EBADURL is 32880
// this is a HACK!
m_regExs[i].set("errorcount==1 && errorcode==32880");
m_spiderPriorities [i] = 15;
m_spiderFreqs [i] = 0.1;
m_maxSpidersPerRule [i] = 1;
i++;

// 3rd rule for respidering
if ( respiderFreq > 0.0 ) {
m_regExs[i].set("lastspidertime>={roundstart}");
// do not "remove" from index
m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
//m_spidersEnabled [i] = 0;
m_maxSpidersPerRule[i] = 0;
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
// which has been obsoleted, but we are running old code now!
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
// if doing a one-shot crawl limit error retries to 3 times or
// if no urls currently available to spider, whichever comes first.
else {
m_regExs[i].set("errorcount>=3");
m_spiderPriorities [i] = 11;
m_spiderFreqs [i] = 0.0416;
m_maxSpidersPerRule [i] = 0; // turn off spiders
i++;
}

m_regExs[i].set("errorcount>=1 && !hastmperror");
m_spiderPriorities [i] = 14;
m_spiderFreqs [i] = 0.0;
m_maxSpidersPerRule [i] = 0; // turn off spiders if not tmp error
m_spiderFreqs [i] = 0.0416; // every hour
//m_maxSpidersPerRule [i] = 0; // turn off spiders if not tmp error
i++;

// and for docs that have errors respider once every 5 hours

@@ -3091,23 +3107,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
if ( m_isCustomCrawl == 2 ) m_maxSpidersPerRule [i] = 0;
i++;

// 3rd rule for respidering
if ( respiderFreq > 0.0 ) {
m_regExs[i].set("lastspidertime>={roundstart}");
// do not "remove" from index
m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
//m_spidersEnabled [i] = 0;
m_maxSpidersPerRule[i] = 0;
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
// which has been obsoleted, but we are running old code now!
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
// if collectiverespiderfreq is 0 or less then do not RE-spider
// documents already indexed.
else {
if ( respiderFreq <= 0.0 ) { // else {
// this does NOT work! error docs continuosly respider
// because they are never indexed!!! like EDOCSIMPLIFIEDREDIR
//m_regExs[i].set("isindexed");

@@ -3129,13 +3131,20 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
i++;

// crawl everything else, but don't harvest links,
// we have to see if the page content matches the "ppp"
// to determine whether the page should be processed or not.
m_regExs[i].set("default");
m_spiderPriorities [i] = 52;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
m_harvestLinks [i] = false;
i++;
goto done;

@@ -3146,6 +3155,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesucp && matchesupp");
m_spiderPriorities [i] = 55;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;

//m_spiderDiffbotApiUrl[i].set ( api );
i++;

@@ -3153,12 +3165,18 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
i++;
// just process, do not spider links if does not match ucp
m_regExs[i].set("matchesupp");
m_spiderPriorities [i] = 54;
m_harvestLinks [i] = false;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
// do not crawl anything else

@@ -3180,6 +3198,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
// process everything since upp is empty
//m_spiderDiffbotApiUrl[i].set ( api );
i++;

@@ -3202,6 +3223,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesupp");
m_spiderPriorities [i] = 54;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
//m_harvestLinks [i] = false;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
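The hunks above keep appending rows to parallel arrays (m_regExs, m_spiderPriorities, m_spiderFreqs, m_maxSpidersPerRule, m_harvestLinks) indexed by the same counter i, and the rows are matched top-down, which is why the error-handling rows are inserted before the catch-all "default" row. The following standalone C++ sketch is illustrative only, not the Gigablast API (UrlFilterTable and addRule are made-up names); it shows the same "fill the arrays at index i, then i++" pattern in a compact form:

    // Illustrative sketch only: one addRule() call per rule block above.
    #include <string>
    #include <vector>

    struct UrlFilterTable {
        std::vector<std::string> regExs;       // mirrors m_regExs[i].set(...)
        std::vector<int>         priorities;   // mirrors m_spiderPriorities[i]
        std::vector<double>      freqsInDays;  // mirrors m_spiderFreqs[i]
        std::vector<int>         maxSpiders;   // mirrors m_maxSpidersPerRule[i]
        std::vector<bool>        harvestLinks; // mirrors m_harvestLinks[i]

        void addRule(const std::string &expr, int prio, double freqDays,
                     int maxSpidersPerRule, bool harvest = true) {
            regExs.push_back(expr);
            priorities.push_back(prio);
            freqsInDays.push_back(freqDays);
            maxSpiders.push_back(maxSpidersPerRule);
            harvestLinks.push_back(harvest);
        }
    };

    int main() {
        UrlFilterTable t;
        // same ordering idea: error-handling rows first, catch-all last
        t.addRule("errorcount==1 && errorcode==32880", 15, 0.1, 1);
        t.addRule("errorcount>=1 && !hastmperror",     14, 0.0416, 0);
        t.addRule("default",                           52, 0.0, 99, false);
        return 0;
    }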
Conf.h (5 changed lines)
@@ -310,6 +310,11 @@ class Conf {
// lookup requests to a host to maxmize tfndb page cache hits?
//bool m_useBiasedTfndb;

// just ensure lists being written are valid rdb records (titlerecs)
// trying to isolate titlerec corruption
bool m_verifyDumpedLists;

// calls fsync(fd) if true after each write
bool m_flushWrites ;
bool m_verifyWrites;
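The two new Conf flags are plain booleans consulted elsewhere: m_verifyDumpedLists gates an extra validation pass over lists before they are dumped (see the RdbDump.cpp hunk further down), and m_flushWrites is documented as calling fsync(fd) after each write. A minimal sketch of how a flag like m_flushWrites is typically honored (standalone POSIX code, not Gigablast's actual write path; ConfSketch and writeMaybeFlush are hypothetical names):

    // Optionally fsync() after each successful write(), per the comment above.
    #include <unistd.h>
    #include <cstdio>

    struct ConfSketch { bool flushWrites = false; }; // stand-in for Conf::m_flushWrites

    ssize_t writeMaybeFlush(const ConfSketch &conf, int fd,
                            const void *buf, size_t len) {
        ssize_t n = write(fd, buf, len);
        if (n < 0) return n;                 // propagate the write error
        if (conf.flushWrites && fsync(fd) != 0)
            std::perror("fsync");            // surface fsync failures
        return n;
    }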
@@ -4120,7 +4120,7 @@ bool Links::addLink ( const char *link , int32_t linkLen , int32_t nodeNum ,

// stop http://0x0017.0000000000000000000000000000000000000024521276/
// which somehow make it through without this!!
if ( url.getTLDLen() <= 0 ) return true;
if ( ! url.isIp() && url.getTLDLen() <= 0 ) return true;

// Allocate more link buffer space?
int32_t bufSpace ;
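The one-line change above rejects links whose host has no TLD unless the host is a raw IP, which stops junk URLs like the overlong numeric example in the comment. A rough standalone approximation of that predicate (looksLikeIp and hasTld are simplistic stand-ins for Url::isIp() and Url::getTLDLen(), illustrative only):

    #include <cctype>
    #include <string>

    static bool looksLikeIp(const std::string &host) {
        for (char c : host)
            if (!std::isdigit(static_cast<unsigned char>(c)) && c != '.')
                return false;
        return !host.empty();
    }

    static bool hasTld(const std::string &host) {
        auto dot = host.rfind('.');
        return dot != std::string::npos && dot + 1 < host.size() &&
               std::isalpha(static_cast<unsigned char>(host[dot + 1]));
    }

    // mirrors: if ( ! url.isIp() && url.getTLDLen() <= 0 ) return true;
    static bool shouldSkipLink(const std::string &host) {
        return !looksLikeIp(host) && !hasTld(host);
    }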
Parms.cpp (15 changed lines)
@@ -7262,6 +7262,21 @@ void Parms::init ( ) {
m->m_group = false;
m++;

m->m_title = "verify written lists";
m->m_desc = "Ensure lists being written to disk are not corrupt. "
"That title recs appear valid, etc. Helps isolate sources "
"of corruption. Used for debugging.";
m->m_cgi = "vwl";
m->m_off = offsetof(Conf,m_verifyDumpedLists);
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;

m->m_title = "verify disk writes";
m->m_desc = "Read what was written in a verification step. Decreases "
"performance, but may help fight disk corruption mostly on "
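The new "verify written lists" parm is wired like the other master controls: the table entry stores offsetof(Conf,m_verifyDumpedLists) plus a type tag, and the generic parm engine uses that byte offset to read or write the bool inside the live Conf object when the vwl CGI variable arrives. A small sketch of that offset-based access pattern (ConfSketch and setBoolParm are hypothetical, not the real Parms machinery):

    #include <cstddef>
    #include <cstdio>

    struct ConfSketch {            // stand-in for the real Conf class
        int  unrelated = 0;
        bool verifyDumpedLists = false;
    };

    // generic "apply a bool parm" routine driven only by a byte offset
    static void setBoolParm(void *obj, size_t off, bool val) {
        *reinterpret_cast<bool *>(static_cast<char *>(obj) + off) = val;
    }

    int main() {
        ConfSketch conf;
        // analogous to m->m_off = offsetof(Conf,m_verifyDumpedLists)
        // being applied when a request like &vwl=1 comes in
        setBoolParm(&conf, offsetof(ConfSketch, verifyDumpedLists), true);
        std::printf("verifyDumpedLists=%d\n", conf.verifyDumpedLists);
        return 0;
    }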
Rdb.cpp (31 changed lines)
@@ -1676,6 +1676,11 @@ bool Rdb::addList ( collnum_t collnum , RdbList *list,
g_errno = ETRYAGAIN;
return false;
}
// if ( m_inDumpLoop ) {
// g_errno = ETRYAGAIN;
// return false;
// }

// if we are well into repair mode, level 2, do not add anything
// to spiderdb or titledb... that can mess up our titledb scan.
// we always rebuild tfndb, clusterdb and spiderdb

@@ -2068,6 +2073,28 @@ bool Rdb::addRecord ( collnum_t collnum,
return false;
}

// do not add if range being dumped at all because when the
// dump completes it calls deleteList() and removes the nodes from
// the tree, so if you were overriding a node currently being dumped
// we would lose it.
if ( m_dump.isDumping() &&
//oppKey >= m_dump.getFirstKeyInQueue() &&
// ensure the dump is dumping the collnum of this key
m_dump.m_collnum == collnum &&
m_dump.m_lastKeyInQueue &&
// the dump should not split positive/negative keys so
// if our positive/negative twin should be in the dump with us
// or not in the dump with us, so any positive/negative
// annihilation below should be ok and we should be save
// to call deleteNode() below
KEYCMP(key,m_dump.getFirstKeyInQueue(),m_ks)>=0 &&
//oppKey <= m_dump.getLastKeyInQueue () ) goto addIt;
KEYCMP(key,m_dump.getLastKeyInQueue (),m_ks)<=0 ) {
// tell caller to wait and try again later
g_errno = ETRYAGAIN;
return false;
}

// save orig
char *orig = NULL;

@@ -2229,6 +2256,7 @@ bool Rdb::addRecord ( collnum_t collnum,
// CAUTION: we should not annihilate with oppKey if oppKey may
// be in the process of being dumped to disk! This would
// render our annihilation useless and make undeletable data
/*
if ( m_dump.isDumping() &&
//oppKey >= m_dump.getFirstKeyInQueue() &&
m_dump.m_lastKeyInQueue &&

@@ -2236,6 +2264,7 @@ bool Rdb::addRecord ( collnum_t collnum,
//oppKey <= m_dump.getLastKeyInQueue () ) goto addIt;
KEYCMP(oppKey,m_dump.getLastKeyInQueue (),m_ks)<=0 )
goto addIt;
*/
// BEFORE we delete it, save it. this is a special hack
// so we can UNDO this deleteNode() should the titledb rec
// add fail.

@@ -2309,7 +2338,7 @@ bool Rdb::addRecord ( collnum_t collnum,
// if we did not find an oppKey and are tfndb, flag this
//if ( n<0 && m_rdbId == RDB_TFNDB ) s_tfndbHadOppKey = false;

addIt:
//addIt:
// mark as changed
//if ( ! m_needsSave ) {
// m_needsSave = true;
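Both Rdb hunks take the same escape hatch when a record's key range is currently being dumped: set g_errno to ETRYAGAIN and return false, leaving it to the caller to retry once the dump has moved on. A hypothetical caller-side sketch of that contract (ETRYAGAIN is a Gigablast-specific errno value; EAGAIN and the helper names below are stand-ins, and the real call path is event driven rather than sleep based):

    #include <cerrno>
    #include <chrono>
    #include <thread>

    static int g_errno_sketch = 0;            // stand-in for g_errno

    // stand-in for Rdb::addRecord(): refuses while a dump covers the key range
    static bool addRecordSketch(bool &dumping) {
        if (dumping) { g_errno_sketch = EAGAIN; dumping = false; return false; }
        g_errno_sketch = 0;
        return true;
    }

    static bool addWithRetry(bool dumping, int maxTries) {
        for (int i = 0; i < maxTries; i++) {
            if (addRecordSketch(dumping)) return true;   // stored in the tree
            if (g_errno_sketch != EAGAIN) return false;  // real error: give up
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
        }
        return false;
    }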
@@ -424,10 +424,12 @@ bool RdbDump::dumpTree ( bool recall ) {
// . check the list we got from the tree for problems
// . ensures keys are ordered from lowest to highest as well
//#ifdef GBSANITYCHECK
if ( g_conf.m_verifyWrites ) {
if ( g_conf.m_verifyWrites || g_conf.m_verifyDumpedLists ) {
char *s = "none";
if ( m_rdb ) s = getDbnameFromId(m_rdb->m_rdbId);
log("dump: verifying list before dumping (rdb=%s)",s);
log("dump: verifying list before dumping (rdb=%s "
"collnum=%i)",s,(int)m_collnum);

m_list->checkList_r ( false , // removeNegRecs?
false , // sleep on problem?
m_rdb->m_rdbId );
@@ -772,7 +772,7 @@ bool RdbList::checkList_r ( bool removeNegRecs , bool sleepOnProblem ,
if ( rdbId == RDB_TITLEDB && ! KEYNEG(k) ) {
char *rec = getCurrentRec();
int32_t usize = *(int32_t *)(rec+12+4);
if ( usize <= 0 ) {
if ( usize <= 0 || usize>100000000 ) {
log("db: bad titlerec uncompress size");
char *xx=NULL;*xx=0;
}
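The widened check above reads the title record's stored uncompressed size from offset 12+4 (a 12-byte titledb key followed by a 4-byte dataSize, by the layout the existing code implies) and now rejects sizes above 100000000 bytes as well as non-positive ones. A standalone restatement of that bounds check under the same layout assumption (titleRecSizeLooksSane is a hypothetical helper):

    #include <cstdint>
    #include <cstring>

    static const int32_t kMaxUncompressedTitleRec = 100000000;  // cap from the diff

    static bool titleRecSizeLooksSane(const char *rec) {
        int32_t usize;
        std::memcpy(&usize, rec + 12 + 4, sizeof(usize));  // same offset as above
        return usize > 0 && usize <= kMaxUncompressedTitleRec;
    }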
RdbMem.cpp (25 changed lines)
@@ -89,15 +89,22 @@ void *RdbMem::dupData ( char *key , char *data , int32_t dataSize ,
void *RdbMem::allocData ( char *key , int32_t dataSize , collnum_t collnum ) {
// if we're dumping and key has been dumped, use the secondary mem
//if ( m_dump->isDumping() && key < m_dump->getLastKeyInQueue() ) {
if ( m_rdb->m_inDumpLoop && // m_dump->isDumping() &&
( collnum < m_rdb->m_dumpCollnum ||
(collnum == m_rdb->m_dumpCollnum &&
// if dump fails to alloc mem in RdbDump::dumpTree it does
// a sleep wrapper and keeps retrying, and
// RdbDump::m_lastKeyInQueue can remain NULL because we've
// never dumped out a list from the tree yet
m_rdb->m_dump.m_lastKeyInQueue &&
KEYCMP(key,m_rdb->m_dump.getLastKeyInQueue(),m_ks)<0)) ){
if ( m_rdb->m_inDumpLoop ) {
/////
// MDW: 3/15/2016
// if we're dumping then ALWAYS use secondary mem, wtf...
// primary is being dumped out and when the dump completes
// the ptr gets reset so we'll end up point to garbage.
///////

// ( collnum < m_rdb->m_dumpCollnum ||
// (collnum == m_rdb->m_dumpCollnum &&
// // if dump fails to alloc mem in RdbDump::dumpTree it does
// // a sleep wrapper and keeps retrying, and
// // RdbDump::m_lastKeyInQueue can remain NULL because we've
// // never dumped out a list from the tree yet
// m_rdb->m_dump.m_lastKeyInQueue &&
// KEYCMP(key,m_rdb->m_dump.getLastKeyInQueue(),m_ks)<0)) ){
// if secondary mem is growing down...
if ( m_ptr2 > m_ptr1 ) {
// return NULL if it would breech,
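The RdbMem change simplifies the arena choice: while a dump is in progress, allocations always go to secondary memory, because primary memory is being written out and its pointers are reset when the dump completes. A toy sketch of that primary/secondary decision (fixed-size buffers and the names are illustrative; the real RdbMem grows the two regions toward each other inside one buffer):

    #include <cstddef>

    struct RdbMemSketch {
        static constexpr size_t kCap = 4096;
        char   primary[kCap];
        char   secondary[kCap];
        size_t used1 = 0, used2 = 0;
        bool   inDumpLoop = false;            // mirrors m_rdb->m_inDumpLoop

        void *allocData(size_t n) {
            // while dumping, new data must go to secondary memory
            char   *arena = inDumpLoop ? secondary : primary;
            size_t &used  = inDumpLoop ? used2 : used1;
            if (used + n > kCap) return nullptr;   // this arena is full
            void *p = arena + used;
            used += n;
            return p;
        }
    };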
RdbTree.cpp (67 changed lines)
@@ -8,6 +8,7 @@
#include "Loop.h"
#include "Threads.h"
#include "Linkdb.h"
#include "Spider.h"

RdbTree::RdbTree () {
//m_countsInitialized = false;

@@ -1128,6 +1129,12 @@ bool RdbTree::fixTree ( ) {
//CollectionRec *recs = g_collectiondb.m_recs;
int32_t max = g_collectiondb.m_numRecs;
log("db: Valid collection numbers range from 0 to %"INT32".",max);

bool isTitledb = false;
if ( !strcmp(m_dbname,"titledb" ) ) isTitledb = true;
bool isSpiderdb = false;
if ( !strcmp(m_dbname,"spiderdb" ) ) isSpiderdb = true;

// now re-add the old nods to the tree, they should not be overwritten
// by addNode()
for ( int32_t i = 0 ; i < n ; i++ ) {

@@ -1136,6 +1143,34 @@ bool RdbTree::fixTree ( ) {
log("db: Fixing node #%"INT32" of %"INT32".",i,n);
// skip if empty
if ( m_parents[i] <= -2 ) continue;

if ( isTitledb && m_data[i] ) {
char *data = m_data[i];
int32_t ucompSize = *(int32_t *)data;
if ( ucompSize < 0 || ucompSize > 100000000 ) {
log("db: removing titlerec with uncompressed "
"size of %i from tree",(int)ucompSize);
continue;
}
}

char *key = &m_keys[i*m_ks];
if ( isSpiderdb && m_data[i] &&
g_spiderdb.isSpiderRequest ( (SPIDERDBKEY *)key ) ) {
char *data = m_data[i];
data -= sizeof(SPIDERDBKEY);
data -= 4;
SpiderRequest *sreq ;
sreq =(SpiderRequest *)data;
if ( strncmp(sreq->m_url,"http",4) ) {
log("db: removing spiderrequest bad url "
"%s from tree",sreq->m_url);
//return false;
continue;
}
}

collnum_t cn = m_collnums[i];
// verify collnum
if ( cn < 0 ) continue;

@@ -1191,6 +1226,12 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
if ( !strcmp(m_dbname,"datedb" ) ) useHalfKeys = true;
if ( !strcmp(m_dbname,"tfndb" ) ) useHalfKeys = true;
if ( !strcmp(m_dbname,"linkdb" ) ) useHalfKeys = true;

bool isTitledb = false;
if ( !strcmp(m_dbname,"titledb" ) ) isTitledb = true;
bool isSpiderdb = false;
if ( !strcmp(m_dbname,"spiderdb" ) ) isSpiderdb = true;

// now check parent kid correlations
for ( int32_t i = 0 ; i < m_minUnusedNode ; i++ ) {
// this thing blocks for 1.5 secs for indexdb

@@ -1208,6 +1249,32 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
if ( m_ks == 18 &&(m_keys[i*m_ks] & 0x06) ) {
char *xx=NULL;*xx=0; }

if ( isTitledb && m_data[i] ) {
char *data = m_data[i];
int32_t ucompSize = *(int32_t *)data;
if ( ucompSize < 0 || ucompSize > 100000000 ) {
log("db: found titlerec with uncompressed "
"size of %i from tree",(int)ucompSize);
return false;
}
}

char *key = &m_keys[i*m_ks];
if ( isSpiderdb && m_data[i] &&
g_spiderdb.isSpiderRequest ( (SPIDERDBKEY *)key ) ) {
char *data = m_data[i];
data -= sizeof(SPIDERDBKEY);
data -= 4;
SpiderRequest *sreq ;
sreq =(SpiderRequest *)data;
if ( strncmp(sreq->m_url,"http",4) ) {
log("db: spiderrequest bad url "
"%s",sreq->m_url);
return false;
}
}

// bad collnum?
if ( doCollRecCheck ) {
collnum_t cn = m_collnums[i];
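fixTree() and checkTree2() now run the same two sanity checks, but in different modes: the repair pass logs and skips a bad node, while the verify pass returns false on the first bad node it finds. A compact sketch of that split, with nodeLooksValid() standing in for the titlerec-size and spider-request-URL checks from the diff:

    #include <cstdio>
    #include <vector>

    struct NodeSketch { bool valid; };                 // stand-in for a tree node

    static bool nodeLooksValid(const NodeSketch &n) { return n.valid; }

    // repair pass: drop corrupt nodes and keep going, like fixTree()
    static size_t repairPass(const std::vector<NodeSketch> &nodes) {
        size_t kept = 0;
        for (const NodeSketch &n : nodes) {
            if (!nodeLooksValid(n)) { std::puts("db: dropping bad node"); continue; }
            kept++;                                    // would be re-added to the tree
        }
        return kept;
    }

    // verify pass: report corruption without modifying anything, like checkTree2()
    static bool verifyPass(const std::vector<NodeSketch> &nodes) {
        for (const NodeSketch &n : nodes)
            if (!nodeLooksValid(n)) return false;
        return true;
    }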
@@ -2245,6 +2245,14 @@ bool SpiderColl::evalIpLoop ( ) {
bool inCache = false;
bool useCache = true;
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );

// did our collection rec get deleted? since we were doing a read
// the SpiderColl will have been preserved in that case but its
// m_deleteMyself flag will have been set.
if ( tryToDeleteSpiderColl ( this ,"6") ) return false;

// if doing site or page quotes for the sitepages or domainpages
// url filter expressions, we can't muck with the cache because
// we end up skipping the counting part.

@@ -4658,6 +4666,7 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
int64_t waitInSecs = (uint64_t)(m_cr->m_spiderFreqs[ufn]*3600*24.0);
// do not spider more than once per 15 seconds ever!
// no! might be a query reindex!!
/*
if ( waitInSecs < 15 && ! sreq->m_isPageReindex ) { //urlIsDocId ) {
static bool s_printed = false;
if ( ! s_printed ) {

@@ -4667,6 +4676,7 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
}
waitInSecs = 15;//900; this was 15 minutes
}
*/
// in fact, force docid based guys to be zero!
//if ( sreq->m_urlIsDocId ) waitInSecs = 0;
if ( sreq->m_isPageReindex ) waitInSecs = 0;
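getSpiderTimeMS() derives the wait from the matching url filter's respider frequency, which is stored in days and converted with *3600*24, and the new line forces the wait to zero for page reindex requests. A small arithmetic sketch of that conversion (spiderWaitSecs is a hypothetical helper, not the real function):

    #include <cstdint>

    static int64_t spiderWaitSecs(double freqInDays, bool isPageReindex) {
        int64_t waitInSecs = static_cast<int64_t>(freqInDays * 3600.0 * 24.0);
        if (isPageReindex) waitInSecs = 0;  // page/query reindex must not be delayed
        return waitInSecs;
    }
    // e.g. freqInDays = 0.0416 (the "every hour" value from the url filters)
    // gives roughly 3594 seconds, i.e. about an hour.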
XmlDoc.cpp (38 changed lines)
@@ -5897,6 +5897,16 @@ char *XmlDoc::getIsDup ( ) {
return &m_isDup;
}

// do not dedup seeds
bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
if ( cr->m_isCustomCrawl && isSeed ) {
m_isDupValid = true;
m_isDup = false;
return &m_isDup;
}

setStatus ( "checking for dups" );

// BUT if we are already indexed and a a crawlbot/bulk diffbot job

@@ -10328,6 +10338,28 @@ Url **XmlDoc::getMetaRedirUrl ( ) {
for ( ; p < pend ; p++ ) {
// breathe
QUICKPOLL ( m_niceness );

// fix <!--[if lte IE 6]>
// <meta http-equiv="refresh" content="0; url=/error-ie6/" />
if ( *p == '!' &&
p[-1]=='<' &&
p[1] == '-' &&
p[2] == '-' ) {
// find end of comment
for ( ; p < pend ; p++ ) {
QUICKPOLL(m_niceness);
if (p[0] == '-' &&
p[1] == '-' &&
p[2] == '>' )
break;
}
// if found no end of comment, then stop
if ( p >= pend )
break;
// resume looking for meta redirect tags
continue;
}

// base everything off the equal sign
if ( *p != '=' ) continue;
// did we match "http-equiv="?
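The new block in getMetaRedirUrl() skips over <!-- ... --> spans so that a meta refresh buried in an IE conditional comment is not mistaken for a real redirect, and it bails out if a comment never closes. A standalone sketch of the same scan over a std::string (hasMetaRefreshOutsideComments is illustrative; the real code keeps matching http-equiv, content and url pieces rather than this crude marker test):

    #include <string>

    static bool hasMetaRefreshOutsideComments(const std::string &html) {
        for (size_t p = 0; p < html.size(); p++) {
            if (html.compare(p, 4, "<!--") == 0) {          // enter an HTML comment
                size_t end = html.find("-->", p + 4);
                if (end == std::string::npos) return false; // unterminated: give up
                p = end + 2;                                 // resume after "-->"
                continue;
            }
            if (html.compare(p, 10, "http-equiv") == 0)      // crude refresh marker
                return true;
        }
        return false;
    }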
@@ -20249,6 +20281,11 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
"<td>%s</td>"
"</tr>\n"

"<tr>"
"<td>http status</td>"
"<td>%i</td>"
"</tr>\n"

"<tr>"
"<td>url filter num</td>"
"<td>%"INT32"</td>"

@@ -20284,6 +20321,7 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
getFirstUrlHash64(), // uh48

mstrerror(m_indexCode),
m_httpStatus,
ufn,
mstrerror(g_errno),
allowed,