Merge branch 'master' of github.com:privacore/open-source-search-engine

Author: Ivan Skytte Jørgensen
Date:   2016-03-17 21:54:47 +01:00
11 changed files with 236 additions and 39 deletions

@@ -3056,19 +3056,35 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
i++;
}
// try to fix bug of EBADURL when it wasn't really
// EBADURL is 32880
// this is a HACK!
m_regExs[i].set("errorcount==1 && errorcode==32880");
m_spiderPriorities [i] = 15;
m_spiderFreqs [i] = 0.1;
m_maxSpidersPerRule [i] = 1;
i++;
// 3rd rule for respidering
if ( respiderFreq > 0.0 ) {
m_regExs[i].set("lastspidertime>={roundstart}");
// do not "remove" from index
m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
//m_spidersEnabled [i] = 0;
m_maxSpidersPerRule[i] = 0;
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
// which has been obsoleted, but we are running old code now!
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
// if doing a one-shot crawl limit error retries to 3 times or
// if no urls currently available to spider, whichever comes first.
else {
m_regExs[i].set("errorcount>=3");
m_spiderPriorities [i] = 11;
m_spiderFreqs [i] = 0.0416;
m_maxSpidersPerRule [i] = 0; // turn off spiders
i++;
}
m_regExs[i].set("errorcount>=1 && !hastmperror");
m_spiderPriorities [i] = 14;
m_spiderFreqs [i] = 0.0;
m_maxSpidersPerRule [i] = 0; // turn off spiders if not tmp error
m_spiderFreqs [i] = 0.0416; // every hour
//m_maxSpidersPerRule [i] = 0; // turn off spiders if not tmp error
i++;
// and for docs that have errors respider once every 5 hours
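
The block above appends url-filter rules with a simple parallel-array pattern: each slot i is described by the same index across m_regExs, m_spiderPriorities, m_spiderFreqs (in days) and m_maxSpidersPerRule. A minimal sketch of that pattern, using simplified standalone arrays and an illustrative helper rather than the real CollectionRec members:

// Sketch only: standalone stand-ins for the CollectionRec arrays; the
// struct and helper names are illustrative, not part of the codebase.
#include <cstdint>
#include <string>

struct UrlFilterTable {
	std::string regExs[96];      // filter expression, e.g. "errorcount>=3"
	int32_t     priorities[96];  // spider priority for matching urls
	float       freqs[96];       // respider frequency in days
	int32_t     maxSpiders[96];  // 0 turns spidering off for the rule
};

// append one rule and advance the shared index, mirroring the repeated
// "m_regExs[i].set(...); ...; i++;" blocks above
static int32_t addRule ( UrlFilterTable &t , int32_t i , const char *expr ,
                         int32_t prio , float freqDays , int32_t maxSpiders ) {
	t.regExs[i]     = expr;
	t.priorities[i] = prio;
	t.freqs[i]      = freqDays;
	t.maxSpiders[i] = maxSpiders;
	return i + 1;
}

// usage: i = addRule ( t , i , "errorcount>=3" , 11 , 0.0416f , 0 );
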
@@ -3091,23 +3107,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
if ( m_isCustomCrawl == 2 ) m_maxSpidersPerRule [i] = 0;
i++;
// 3rd rule for respidering
if ( respiderFreq > 0.0 ) {
m_regExs[i].set("lastspidertime>={roundstart}");
// do not "remove" from index
m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
//m_spidersEnabled [i] = 0;
m_maxSpidersPerRule[i] = 0;
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
// which has been obsoleted, but we are running old code now!
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
// if collectiverespiderfreq is 0 or less then do not RE-spider
// documents already indexed.
else {
if ( respiderFreq <= 0.0 ) { // else {
// this does NOT work! error docs continuously respider
// because they are never indexed!!! like EDOCSIMPLIFIEDREDIR
//m_regExs[i].set("isindexed");
@@ -3129,13 +3131,20 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
i++;
// crawl everything else, but don't harvest links,
// we have to see if the page content matches the "ppp"
// to determine whether the page should be processed or not.
m_regExs[i].set("default");
m_spiderPriorities [i] = 52;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
m_harvestLinks [i] = false;
i++;
goto done;
@@ -3146,6 +3155,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesucp && matchesupp");
m_spiderPriorities [i] = 55;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
@@ -3153,12 +3165,18 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
i++;
// just process, do not spider links if it does not match ucp
m_regExs[i].set("matchesupp");
m_spiderPriorities [i] = 54;
m_harvestLinks [i] = false;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
// do not crawl anything else
@@ -3180,6 +3198,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
// process everything since upp is empty
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
@@ -3202,6 +3223,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesupp");
m_spiderPriorities [i] = 54;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
//m_harvestLinks [i] = false;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;

Conf.h (5 changes)

@@ -310,6 +310,11 @@ class Conf {
// lookup requests to a host to maximize tfndb page cache hits?
//bool m_useBiasedTfndb;
// just ensure lists being written are valid rdb records (titlerecs)
// trying to isolate titlerec corruption
bool m_verifyDumpedLists;
// calls fsync(fd) if true after each write
bool m_flushWrites ;
bool m_verifyWrites;

@@ -4120,7 +4120,7 @@ bool Links::addLink ( const char *link , int32_t linkLen , int32_t nodeNum ,
// stop http://0x0017.0000000000000000000000000000000000000024521276/
// which somehow makes it through without this!!
if ( url.getTLDLen() <= 0 ) return true;
if ( ! url.isIp() && url.getTLDLen() <= 0 ) return true;
// Allocate more link buffer space?
int32_t bufSpace ;

@@ -7262,6 +7262,21 @@ void Parms::init ( ) {
m->m_group = false;
m++;
m->m_title = "verify written lists";
m->m_desc = "Ensure lists being written to disk are not corrupt "
"and that title recs appear valid, etc. Helps isolate sources "
"of corruption. Used for debugging.";
m->m_cgi = "vwl";
m->m_off = offsetof(Conf,m_verifyDumpedLists);
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
m->m_title = "verify disk writes";
m->m_desc = "Read what was written in a verification step. Decreases "
"performance, but may help fight disk corruption mostly on "

Rdb.cpp (31 changes)

@@ -1676,6 +1676,11 @@ bool Rdb::addList ( collnum_t collnum , RdbList *list,
g_errno = ETRYAGAIN;
return false;
}
// if ( m_inDumpLoop ) {
// g_errno = ETRYAGAIN;
// return false;
// }
// if we are well into repair mode, level 2, do not add anything
// to spiderdb or titledb... that can mess up our titledb scan.
// we always rebuild tfndb, clusterdb and spiderdb
@@ -2068,6 +2073,28 @@ bool Rdb::addRecord ( collnum_t collnum,
return false;
}
// do not add if the key is in the range being dumped, because when
// the dump completes it calls deleteList() and removes the nodes from
// the tree, so if we were overwriting a node currently being dumped
// we would lose it.
if ( m_dump.isDumping() &&
//oppKey >= m_dump.getFirstKeyInQueue() &&
// ensure the dump is dumping the collnum of this key
m_dump.m_collnum == collnum &&
m_dump.m_lastKeyInQueue &&
// the dump should not split positive/negative keys, so our
// positive/negative twin is either in the dump with us
// or not in the dump with us; any positive/negative
// annihilation below should be ok and we should be safe
// to call deleteNode() below
KEYCMP(key,m_dump.getFirstKeyInQueue(),m_ks)>=0 &&
//oppKey <= m_dump.getLastKeyInQueue () ) goto addIt;
KEYCMP(key,m_dump.getLastKeyInQueue (),m_ks)<=0 ) {
// tell caller to wait and try again later
g_errno = ETRYAGAIN;
return false;
}
// save orig
char *orig = NULL;
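
The guard added above makes Rdb::addRecord() return false with g_errno set to ETRYAGAIN whenever the key falls inside the range currently being dumped, so callers are expected to treat that as a transient condition and re-submit after the dump completes. A minimal sketch of that caller-side contract; the signature is abbreviated and retryLater() is a hypothetical stand-in for the real sleep/callback machinery:

// Sketch only: illustrates the ETRYAGAIN contract of the guard above,
// not an actual call site. Signature abbreviated; retryLater() is
// hypothetical.
bool tryAdd ( Rdb *rdb , collnum_t collnum ,
              char *key , char *data , int32_t dataSize ) {
	g_errno = 0;
	if ( rdb->addRecord ( collnum , key , data , dataSize ) ) return true;
	if ( g_errno == ETRYAGAIN ) {
		// the key overlaps the range being dumped; nothing is lost,
		// the caller just re-submits once the dump has finished
		retryLater();
		return false;
	}
	// any other error is permanent for this attempt
	return false;
}
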
@@ -2229,6 +2256,7 @@ bool Rdb::addRecord ( collnum_t collnum,
// CAUTION: we should not annihilate with oppKey if oppKey may
// be in the process of being dumped to disk! This would
// render our annihilation useless and make undeletable data
/*
if ( m_dump.isDumping() &&
//oppKey >= m_dump.getFirstKeyInQueue() &&
m_dump.m_lastKeyInQueue &&
@@ -2236,6 +2264,7 @@ bool Rdb::addRecord ( collnum_t collnum,
//oppKey <= m_dump.getLastKeyInQueue () ) goto addIt;
KEYCMP(oppKey,m_dump.getLastKeyInQueue (),m_ks)<=0 )
goto addIt;
*/
// BEFORE we delete it, save it. this is a special hack
// so we can UNDO this deleteNode() should the titledb rec
// add fail.
@@ -2309,7 +2338,7 @@ bool Rdb::addRecord ( collnum_t collnum,
// if we did not find an oppKey and are tfndb, flag this
//if ( n<0 && m_rdbId == RDB_TFNDB ) s_tfndbHadOppKey = false;
addIt:
//addIt:
// mark as changed
//if ( ! m_needsSave ) {
// m_needsSave = true;

@@ -424,10 +424,12 @@ bool RdbDump::dumpTree ( bool recall ) {
// . check the list we got from the tree for problems
// . ensures keys are ordered from lowest to highest as well
//#ifdef GBSANITYCHECK
if ( g_conf.m_verifyWrites ) {
if ( g_conf.m_verifyWrites || g_conf.m_verifyDumpedLists ) {
char *s = "none";
if ( m_rdb ) s = getDbnameFromId(m_rdb->m_rdbId);
log("dump: verifying list before dumping (rdb=%s)",s);
log("dump: verifying list before dumping (rdb=%s "
"collnum=%i)",s,(int)m_collnum);
m_list->checkList_r ( false , // removeNegRecs?
false , // sleep on problem?
m_rdb->m_rdbId );

@@ -772,7 +772,7 @@ bool RdbList::checkList_r ( bool removeNegRecs , bool sleepOnProblem ,
if ( rdbId == RDB_TITLEDB && ! KEYNEG(k) ) {
char *rec = getCurrentRec();
int32_t usize = *(int32_t *)(rec+12+4);
if ( usize <= 0 ) {
if ( usize <= 0 || usize>100000000 ) {
log("db: bad titlerec uncompress size");
char *xx=NULL;*xx=0;
}
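
The rec+12+4 offset reads the uncompressed size out of the assumed titlerec layout: a 12-byte titledb key, a 4-byte compressed data size, then the data itself, whose first 4 bytes hold the uncompressed length that the new upper bound checks. A hedged sketch of that layout (field widths are assumptions taken from this expression, not verified against Titledb):

// Assumed layout behind "*(int32_t *)(rec+12+4)" -- illustrative only:
// [ 12-byte titledb key ][ 4-byte dataSize ][ 4-byte uncompressed size ][ compressed data ... ]
static int32_t getTitleRecUncompressedSize ( char *rec ) {
	const int32_t keyWidth      = 12; // titledb key width assumed here
	const int32_t dataSizeWidth = 4;  // length field stored before the data
	return *(int32_t *)(rec + keyWidth + dataSizeWidth);
}
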

@@ -89,15 +89,22 @@ void *RdbMem::dupData ( char *key , char *data , int32_t dataSize ,
void *RdbMem::allocData ( char *key , int32_t dataSize , collnum_t collnum ) {
// if we're dumping and key has been dumped, use the secondary mem
//if ( m_dump->isDumping() && key < m_dump->getLastKeyInQueue() ) {
if ( m_rdb->m_inDumpLoop && // m_dump->isDumping() &&
( collnum < m_rdb->m_dumpCollnum ||
(collnum == m_rdb->m_dumpCollnum &&
// if dump fails to alloc mem in RdbDump::dumpTree it does
// a sleep wrapper and keeps retrying, and
// RdbDump::m_lastKeyInQueue can remain NULL because we've
// never dumped out a list from the tree yet
m_rdb->m_dump.m_lastKeyInQueue &&
KEYCMP(key,m_rdb->m_dump.getLastKeyInQueue(),m_ks)<0)) ){
if ( m_rdb->m_inDumpLoop ) {
/////
// MDW: 3/15/2016
// if we're dumping then ALWAYS use secondary mem, wtf...
// primary is being dumped out and when the dump completes
// the ptr gets reset so we'll end up pointing to garbage.
///////
// ( collnum < m_rdb->m_dumpCollnum ||
// (collnum == m_rdb->m_dumpCollnum &&
// // if dump fails to alloc mem in RdbDump::dumpTree it does
// // a sleep wrapper and keeps retrying, and
// // RdbDump::m_lastKeyInQueue can remain NULL because we've
// // never dumped out a list from the tree yet
// m_rdb->m_dump.m_lastKeyInQueue &&
// KEYCMP(key,m_rdb->m_dump.getLastKeyInQueue(),m_ks)<0)) ){
// if secondary mem is growing down...
if ( m_ptr2 > m_ptr1 ) {
// return NULL if it would breach,
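
The change above collapses the old per-collnum/per-key test into a single rule: while the tree is being dumped, every new allocation goes to the secondary arena, because the primary arena is reset once the dump completes. A minimal sketch of that rule with simplified member names (the real logic lives in RdbMem::allocData and uses m_ptr1/m_ptr2):

// Sketch only: simplified stand-in for RdbMem's two-arena scheme.
struct TwoArenaMem {
	char *primary;    // being flushed to disk while a dump runs
	char *secondary;  // survives the dump, promoted afterwards
	bool  inDumpLoop; // mirrors m_rdb->m_inDumpLoop

	char *pickArena ( ) {
		// if we're dumping, ALWAYS use secondary mem: anything placed
		// in the primary arena would be orphaned when its pointer is
		// reset at the end of the dump
		return inDumpLoop ? secondary : primary;
	}
};
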

@@ -8,6 +8,7 @@
#include "Loop.h"
#include "Threads.h"
#include "Linkdb.h"
#include "Spider.h"
RdbTree::RdbTree () {
//m_countsInitialized = false;
@@ -1128,6 +1129,12 @@ bool RdbTree::fixTree ( ) {
//CollectionRec *recs = g_collectiondb.m_recs;
int32_t max = g_collectiondb.m_numRecs;
log("db: Valid collection numbers range from 0 to %"INT32".",max);
bool isTitledb = false;
if ( !strcmp(m_dbname,"titledb" ) ) isTitledb = true;
bool isSpiderdb = false;
if ( !strcmp(m_dbname,"spiderdb" ) ) isSpiderdb = true;
// now re-add the old nodes to the tree; they should not be overwritten
// by addNode()
for ( int32_t i = 0 ; i < n ; i++ ) {
@@ -1136,6 +1143,34 @@ bool RdbTree::fixTree ( ) {
log("db: Fixing node #%"INT32" of %"INT32".",i,n);
// skip if empty
if ( m_parents[i] <= -2 ) continue;
if ( isTitledb && m_data[i] ) {
char *data = m_data[i];
int32_t ucompSize = *(int32_t *)data;
if ( ucompSize < 0 || ucompSize > 100000000 ) {
log("db: removing titlerec with uncompressed "
"size of %i from tree",(int)ucompSize);
continue;
}
}
char *key = &m_keys[i*m_ks];
if ( isSpiderdb && m_data[i] &&
g_spiderdb.isSpiderRequest ( (SPIDERDBKEY *)key ) ) {
char *data = m_data[i];
data -= sizeof(SPIDERDBKEY);
data -= 4;
SpiderRequest *sreq ;
sreq =(SpiderRequest *)data;
if ( strncmp(sreq->m_url,"http",4) ) {
log("db: removing spiderrequest bad url "
"%s from tree",sreq->m_url);
//return false;
continue;
}
}
collnum_t cn = m_collnums[i];
// verify collnum
if ( cn < 0 ) continue;
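
fixTree() above and checkTree2() below now apply the same two record-level sanity checks, differing only in whether a bad node is skipped or treated as a hard failure. A sketch of the shared check factored into one helper; the 100000000 bound, the data layout and the names SPIDERDBKEY, SpiderRequest and g_spiderdb are taken from the diff, while the helper itself is illustrative:

// Sketch only: the two checks factored out; the real code keeps them
// inline in fixTree() and checkTree2(). Assumes Spider.h (included
// above) for SpiderRequest, SPIDERDBKEY and g_spiderdb.
#include <cstring>  // strncmp

static bool nodeLooksSane ( bool isTitledb , bool isSpiderdb ,
                            char *key , char *data ) {
	if ( ! data ) return true;
	if ( isTitledb ) {
		int32_t ucompSize = *(int32_t *)data;
		if ( ucompSize < 0 || ucompSize > 100000000 ) return false;
	}
	if ( isSpiderdb && g_spiderdb.isSpiderRequest ( (SPIDERDBKEY *)key ) ) {
		// the stored data sits sizeof(SPIDERDBKEY)+4 bytes past the
		// start of the SpiderRequest header, as in the diff above
		SpiderRequest *sreq =
			(SpiderRequest *)(data - sizeof(SPIDERDBKEY) - 4);
		if ( strncmp ( sreq->m_url , "http" , 4 ) ) return false;
	}
	return true;
}
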
@@ -1191,6 +1226,12 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
if ( !strcmp(m_dbname,"datedb" ) ) useHalfKeys = true;
if ( !strcmp(m_dbname,"tfndb" ) ) useHalfKeys = true;
if ( !strcmp(m_dbname,"linkdb" ) ) useHalfKeys = true;
bool isTitledb = false;
if ( !strcmp(m_dbname,"titledb" ) ) isTitledb = true;
bool isSpiderdb = false;
if ( !strcmp(m_dbname,"spiderdb" ) ) isSpiderdb = true;
// now check parent kid correlations
for ( int32_t i = 0 ; i < m_minUnusedNode ; i++ ) {
// this thing blocks for 1.5 secs for indexdb
@@ -1208,6 +1249,32 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
if ( m_ks == 18 &&(m_keys[i*m_ks] & 0x06) ) {
char *xx=NULL;*xx=0; }
if ( isTitledb && m_data[i] ) {
char *data = m_data[i];
int32_t ucompSize = *(int32_t *)data;
if ( ucompSize < 0 || ucompSize > 100000000 ) {
log("db: found titlerec with uncompressed "
"size of %i from tree",(int)ucompSize);
return false;
}
}
char *key = &m_keys[i*m_ks];
if ( isSpiderdb && m_data[i] &&
g_spiderdb.isSpiderRequest ( (SPIDERDBKEY *)key ) ) {
char *data = m_data[i];
data -= sizeof(SPIDERDBKEY);
data -= 4;
SpiderRequest *sreq ;
sreq =(SpiderRequest *)data;
if ( strncmp(sreq->m_url,"http",4) ) {
log("db: spiderrequest bad url "
"%s",sreq->m_url);
return false;
}
}
// bad collnum?
if ( doCollRecCheck ) {
collnum_t cn = m_collnums[i];

@@ -2245,6 +2245,14 @@ bool SpiderColl::evalIpLoop ( ) {
bool inCache = false;
bool useCache = true;
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
// did our collection rec get deleted? since we were doing a read
// the SpiderColl will have been preserved in that case but its
// m_deleteMyself flag will have been set.
if ( tryToDeleteSpiderColl ( this ,"6") ) return false;
// if doing site or page quotes for the sitepages or domainpages
// url filter expressions, we can't muck with the cache because
// we end up skipping the counting part.
@@ -4658,6 +4666,7 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
int64_t waitInSecs = (uint64_t)(m_cr->m_spiderFreqs[ufn]*3600*24.0);
// do not spider more than once per 15 seconds ever!
// no! might be a query reindex!!
/*
if ( waitInSecs < 15 && ! sreq->m_isPageReindex ) { //urlIsDocId ) {
static bool s_printed = false;
if ( ! s_printed ) {
@@ -4667,6 +4676,7 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
}
waitInSecs = 15;//900; this was 15 minutes
}
*/
// in fact, force docid based guys to be zero!
//if ( sreq->m_urlIsDocId ) waitInSecs = 0;
if ( sreq->m_isPageReindex ) waitInSecs = 0;
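
For reference on the frequencies used throughout this commit: m_spiderFreqs is expressed in days, and the line above converts it to seconds with *3600*24. The 0.0416 used in the url-filter rules is therefore 0.0416 × 86400 ≈ 3594 seconds, i.e. roughly one respider per hour (hence the "// every hour" comment), and 0.1 days is about 2.4 hours.

// illustrative arithmetic only, mirroring the conversion above
int64_t waitInSecs = (int64_t)(0.0416 * 3600 * 24.0); // 3594 s ~= 1 hour
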

@@ -5897,6 +5897,16 @@ char *XmlDoc::getIsDup ( ) {
return &m_isDup;
}
// do not dedup seeds
bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
if ( cr->m_isCustomCrawl && isSeed ) {
m_isDupValid = true;
m_isDup = false;
return &m_isDup;
}
setStatus ( "checking for dups" );
// BUT if we are already indexed and a crawlbot/bulk diffbot job
@@ -10328,6 +10338,28 @@ Url **XmlDoc::getMetaRedirUrl ( ) {
for ( ; p < pend ; p++ ) {
// breathe
QUICKPOLL ( m_niceness );
// fix <!--[if lte IE 6]>
// <meta http-equiv="refresh" content="0; url=/error-ie6/" />
if ( *p == '!' &&
p[-1]=='<' &&
p[1] == '-' &&
p[2] == '-' ) {
// find end of comment
for ( ; p < pend ; p++ ) {
QUICKPOLL(m_niceness);
if (p[0] == '-' &&
p[1] == '-' &&
p[2] == '>' )
break;
}
// if found no end of comment, then stop
if ( p >= pend )
break;
// resume looking for meta redirect tags
continue;
}
// base everything off the equal sign
if ( *p != '=' ) continue;
// did we match "http-equiv="?
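
The loop added above skips everything between <!-- and --> so that a meta refresh hidden inside an IE conditional comment is not treated as a real redirect. A standalone sketch of the same scan, stripped of QUICKPOLL and the surrounding XmlDoc state (the helper name is illustrative):

// Sketch only: skip an HTML comment while scanning a buffer; mirrors the
// loop added above. Assumes p points at the '!' of "<!--".
static const char *skipHtmlComment ( const char *p , const char *pend ) {
	for ( ; p + 2 < pend ; p++ ) {
		if ( p[0] == '-' && p[1] == '-' && p[2] == '>' )
			return p + 3;   // first char after the comment
	}
	return pend;                    // unterminated comment: stop scanning
}
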
@@ -20249,6 +20281,11 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
"<td>%s</td>"
"</tr>\n"
"<tr>"
"<td>http status</td>"
"<td>%i</td>"
"</tr>\n"
"<tr>"
"<td>url filter num</td>"
"<td>%"INT32"</td>"
@@ -20284,6 +20321,7 @@ bool XmlDoc::printDoc ( SafeBuf *sb ) {
getFirstUrlHash64(), // uh48
mstrerror(m_indexCode),
m_httpStatus,
ufn,
mstrerror(g_errno),
allowed,