A bunch of bug fixes, mostly spider-related.

Also some for pagereindex.
Matt Wells
2013-12-07 21:56:37 -07:00
parent 5e4b5a112c
commit 06edfddf31
10 changed files with 216 additions and 52 deletions

@@ -15718,7 +15718,8 @@ pd=(PlaceDesc *)g_cities.getValueFromSlot(pd->getSlot());
g_cityBuf = tbuf;
g_cityBufSize = tbufSize;
// do not let "sb" free it
sb.m_buf = NULL;
//sb.m_buf = NULL;
sb.detachBuf();
//if ( ! g_indicators.save ( g_hostdb.m_dir, "indicators.dat" ) )
// return log("places: failed to save indicators.dat");
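
The change above replaces the direct sb.m_buf = NULL poke with sb.detachBuf(), so the SafeBuf gives up ownership of the buffer that g_cityBuf now keeps. A minimal sketch of that hand-off using a stand-in Buf class; the detach() here is an assumption about what SafeBuf::detachBuf() does (forget the pointer so the destructor will not free it):

    #include <cstdlib>
    #include <cstdio>
    #include <cstring>

    // stand-in for SafeBuf: owns a heap buffer until it is detached
    class Buf {
    public:
        Buf() : m_buf(NULL), m_len(0) {}
        ~Buf() { if (m_buf) free(m_buf); }      // frees only if still owned
        bool pushStr(const char *s) {
            size_t n = strlen(s) + 1;
            char *nb = (char *)realloc(m_buf, m_len + n);
            if (!nb) return false;
            memcpy(nb + m_len, s, n);
            m_buf = nb;
            m_len += n;
            return true;
        }
        // like SafeBuf::detachBuf() presumably does: hand the buffer to the
        // caller and forget it so ~Buf() will not free it
        char *detach() { char *b = m_buf; m_buf = NULL; m_len = 0; return b; }
    private:
        char  *m_buf;
        size_t m_len;
    };

    int main() {
        char *g_cityBuf = NULL;            // long-lived "global", as in the hunk
        {
            Buf sb;
            sb.pushStr("Albuquerque");
            g_cityBuf = sb.detach();       // "do not let sb free it"
        }                                  // sb destructs without freeing
        printf("%s\n", g_cityBuf);
        free(g_cityBuf);
        return 0;
    }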

@@ -1234,6 +1234,11 @@ bool readwrite_r ( FileState *fstate , ThreadEntry *t ) {
bool BigFile::unlink ( ) {
return unlinkRename ( NULL , -1 , false, NULL, NULL );
}
bool BigFile::move ( char *newDir ) {
return rename ( m_baseFilename , newDir );
}
bool BigFile::rename ( char *newBaseFilename , char *newBaseFilenameDir ) {
return unlinkRename ( newBaseFilename, -1, false, NULL, NULL ,
newBaseFilenameDir );
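
BigFile::move() above is just rename() into a different directory with the same base filename; later in this commit Rdb::deleteAllRecs() uses it to push doledb files into the trash subdir. A rough standalone equivalent built on rename(2); the paths and trash layout are illustrative, not the real BigFile/unlinkRename internals:

    #include <cstdio>
    #include <string>

    // sketch: move a file into another directory, keeping its base name,
    // roughly what BigFile::move(newDir) delegates to unlinkRename() for
    static bool moveToDir(const std::string &path, const std::string &newDir) {
        // strip the directory part to get the base filename
        size_t slash = path.find_last_of('/');
        std::string base = (slash == std::string::npos) ? path : path.substr(slash + 1);
        std::string dst = newDir + "/" + base;
        // rename(2) only works within one filesystem; the trash dir presumably
        // lives under the same root as the collection data
        return ::rename(path.c_str(), dst.c_str()) == 0;
    }

    int main() {
        // hypothetical paths, mirroring the "%strash/" pattern in Rdb::deleteAllRecs
        if (!moveToDir("./coll.main.0/doledb0001.dat", "./trash"))
            perror("moveToDir");
        return 0;
    }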

@@ -8,6 +8,7 @@
#include "Datedb.h"
#include "Timedb.h"
#include "Spider.h"
#include "Process.h"
static CollectionRec g_default;
@@ -85,6 +86,7 @@ CollectionRec::CollectionRec() {
CollectionRec::~CollectionRec() {
//invalidateRegEx ();
reset();
}
// new collection recs get this called on them
@@ -109,6 +111,12 @@ void CollectionRec::reset() {
m_globalCrawlInfo.reset();
//m_requests = 0;
//m_replies = 0;
// free all RdbBases in each rdb
for ( long i = 0 ; i < g_process.m_numRdbs ; i++ ) {
Rdb *rdb = g_process.m_rdbs[i];
rdb->resetBase ( m_collnum );
}
}
CollectionRec *g_cr = NULL;

@@ -891,13 +891,13 @@ bool Collectiondb::resetColl ( char *coll , WaitEntry *we , bool purgeSeeds) {
// . updates RdbBase::m_collnum
// . so for the tree it just needs to mark the old collnum recs
// with a collnum -1 in case it is saving...
g_posdb.getRdb()->resetColl ( oldCollnum , newCollnum );
g_titledb.getRdb()->resetColl ( oldCollnum , newCollnum );
g_tagdb.getRdb()->resetColl ( oldCollnum , newCollnum );
g_spiderdb.getRdb()->resetColl ( oldCollnum , newCollnum );
g_doledb.getRdb()->resetColl ( oldCollnum , newCollnum );
g_clusterdb.getRdb()->resetColl ( oldCollnum , newCollnum );
g_linkdb.getRdb()->resetColl ( oldCollnum , newCollnum );
g_posdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_titledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_tagdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_spiderdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_doledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_clusterdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_linkdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
// reset crawl status too!
cr->m_spiderStatus = SP_INITIALIZING;

@@ -834,7 +834,8 @@ void sigalrmHandler ( int x , siginfo_t *info , void *y ) {
// if we missed too many, then dump core
if ( g_niceness == 1 && g_missedQuickPolls >= 4 ) {
//g_inSigHandler = true;
log("loop: missed quickpoll");
// NOT SAFE! can block forever waiting for a printf lock!
//log("loop: missed quickpoll");
//g_inSigHandler = false;
// seems to core a lot in gbcompress() we need to
// put a quickpoll into zlib deflate() or
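
The hunk above stops calling log() from the SIGALRM handler, since log()/printf() can block forever if the signal lands while another thread holds the print lock. If the handler has to record anything at all, the usual pattern is to keep it async-signal-safe: set a sig_atomic_t flag or write(2) a fixed string, and do the real logging later from normal code. A minimal sketch of that idea; the names are illustrative, not Gigablast's:

    #include <csignal>
    #include <cstdio>
    #include <unistd.h>

    // async-signal-safe bookkeeping: the handler only touches this flag
    // and calls write(2); printf()/log() stay out of signal context
    static volatile std::sig_atomic_t s_missedQuickPoll = 0;

    static void sigalrmHandler(int) {
        s_missedQuickPoll = 1;                        // safe: plain store
        const char msg[] = "loop: missed quickpoll\n";
        ssize_t rc = write(STDERR_FILENO, msg, sizeof(msg) - 1);  // write(2) is signal-safe
        (void)rc;
    }

    int main() {
        struct sigaction sa = {};
        sa.sa_handler = sigalrmHandler;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGALRM, &sa, NULL);
        raise(SIGALRM);
        // report from normal context, where log()/printf() are fine
        if (s_missedQuickPoll)
            printf("missed a quickpoll (logged outside the handler)\n");
        return 0;
    }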

@@ -2767,8 +2767,11 @@ bool Parms::setFromRequest ( HttpRequest *r ,
if ( changedUrlFilters && THIS != (char *)&g_conf ) {
// cast it
CollectionRec *cr = (CollectionRec *)THIS;
// to prevent us having to rebuild doledb/waitingtree at startup
// we need to make the spidercoll here so it is not null
SpiderColl *sc = g_spiderCache.getSpiderColl(cr->m_collnum);
// get it
SpiderColl *sc = cr->m_spiderColl;
//SpiderColl *sc = cr->m_spiderColl;
// this will rebuild the waiting tree
if ( sc ) sc->urlFiltersChanged();
}
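
The fix above asks g_spiderCache for the SpiderColl instead of reading the possibly-NULL cr->m_spiderColl, so the SpiderColl gets created right when the url filters change and the doledb/waitingtree rebuild is not deferred to the next startup. A toy version of that get-or-create lookup; the cache and types below are stand-ins, not the real g_spiderCache API:

    #include <cstdio>
    #include <map>
    #include <memory>

    // stand-in for SpiderColl
    struct SpiderColl {
        int m_collnum;
        void urlFiltersChanged() { printf("rebuilding waiting tree for collnum %d\n", m_collnum); }
    };

    // stand-in for g_spiderCache: get-or-create, so callers never see NULL for a
    // valid collnum, which is the property the Parms.cpp change relies on
    class SpiderCache {
    public:
        SpiderColl *getSpiderColl(int collnum) {
            std::unique_ptr<SpiderColl> &slot = m_map[collnum];
            if (!slot) slot.reset(new SpiderColl{collnum});
            return slot.get();
        }
    private:
        std::map<int, std::unique_ptr<SpiderColl> > m_map;
    };

    int main() {
        SpiderCache cache;
        // old shape: SpiderColl *sc = cr->m_spiderColl;  // could be NULL, rebuild skipped
        // new shape: materialize it so urlFiltersChanged() always runs
        SpiderColl *sc = cache.getSpiderColl(7);
        if (sc) sc->urlFiltersChanged();
        return 0;
    }
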
@@ -2890,6 +2893,7 @@ void Parms::setParm ( char *THIS , Parm *m , long mm , long j , char *s ,
// array whose "count" was not incremented like it should have been.
// HACK: make new line at bottom always have spidering enabled
// checkbox set and make it impossible to unset.
/*
if ( m->m_max > 1 && m->m_rowid >= 0 && mm > 0 &&
m_parms[mm-1].m_rowid == m->m_rowid ) {
char *pos = (char *)THIS + m_parms[mm-1].m_off - 4 ;
@@ -2902,6 +2906,7 @@ void Parms::setParm ( char *THIS , Parm *m , long mm , long j , char *s ,
return;
}
}
*/
// ensure array count at least j+1
if ( m->m_max > 1 ) {

Rdb.cpp (54 lines changed)

@@ -555,7 +555,47 @@ bool Rdb::addColl2 ( collnum_t collnum ) {
return true;
}
bool Rdb::resetColl ( collnum_t collnum , collnum_t newCollnum ) {
bool Rdb::resetBase ( collnum_t collnum ) {
CollectionRec *cr = g_collectiondb.getRec(collnum);
if ( ! cr ) return true;
RdbBase *base = cr->m_bases[(unsigned char)m_rdbId];
if ( ! base ) return true;
base->reset();
return true;
}
bool Rdb::deleteAllRecs ( collnum_t collnum ) {
// remove from tree
if(m_useTree) m_tree.delColl ( collnum );
else m_buckets.delColl ( collnum );
// only for doledb for now: because we unlink, we do not move the files
// into the trash subdir, and doledb is easily regenerated. i don't
// want to take the risk with other files.
if ( m_rdbId != RDB_DOLEDB ) { char *xx=NULL;*xx=0; }
CollectionRec *cr = g_collectiondb.getRec ( collnum );
RdbBase *base = cr->m_bases[(unsigned char)m_rdbId];
if ( ! base ) return true;
// scan files in there
for ( long i = 0 ; i < base->m_numFiles ; i++ ) {
BigFile *f = base->m_files[i];
// move to trash
char newdir[1024];
sprintf(newdir, "%strash/",g_hostdb.m_dir);
f->move ( newdir );
}
// nuke all the files
base->reset();
return true;
}
bool Rdb::deleteColl ( collnum_t collnum , collnum_t newCollnum ) {
//char *coll = g_collectiondb.m_recs[collnum]->m_coll;
@@ -645,7 +685,7 @@ bool Rdb::delColl ( char *coll ) {
}
// move all files to trash and clear the tree/buckets
resetColl ( collnum , collnum );
deleteColl ( collnum , collnum );
// remove these collnums from tree
//if(m_useTree) m_tree.delColl ( collnum );
@@ -2389,8 +2429,16 @@ bool Rdb::addRecord ( collnum_t collnum,
// don't actually add it if "fake". i.e. if it
// was an internal error of some sort... this will
// make it try over and over again i guess...
// no, because we need some kinda reply so that gb knows the
// pagereindex docid-based spider requests are done. we were not
// adding spider replies as the page reindexes completed, so when
// i tried to rerun it the title recs were not found since they
// had already been deleted. so we gotta add the replies now, at
// least for internal errors.
long indexCode = rr->m_errCode;
if ( indexCode == EINTERNALERROR ||
if ( //indexCode == EINTERNALERROR ||
indexCode == EABANDONED ||
indexCode == EHITCRAWLLIMIT ||
indexCode == EHITPROCESSLIMIT ) {

@@ -1235,25 +1235,83 @@ char *SpiderColl::getCollName() {
return cr->m_coll;
}
// . call this when changing the url filters
// . will make all entries in waiting tree have zero time basically
void SpiderColl::urlFiltersChanged ( ) {
// log it
log("spider: rebuilding waiting tree for coll=%s",getCollName());
m_lastUrlFiltersUpdate = getTimeGlobal();
//
// remove all recs from doledb for the given collection
//
void doDoledbNuke ( int fd , void *state ) {
WaitEntry *we = (WaitEntry *)state;
if ( we->m_registered )
g_loop.unregisterSleepCallback ( we , doDoledbNuke );
// . nuke doledb for this collnum
// . it will unlink the files and maps for doledb for this collnum
// . it will remove all recs of this collnum from its tree too
if ( g_doledb.getRdb()->isSavingTree () ) {
g_loop.registerSleepCallback ( 100 , we , doDoledbNuke );
we->m_registered = true;
return;
}
// . ok, tree is not saving, it should complete entirely from this call
// . crap this is moving the whole directory!!!
// . say "false" to not move whole coll dir
g_doledb.getRdb()->deleteAllRecs ( we->m_cr->m_collnum );
// re-add it back so the RdbBase is new'd
//g_doledb.getRdb()->addColl2 ( we->m_collnum );
// shortcut
SpiderColl *sc = we->m_cr->m_spiderColl;
sc->m_lastUrlFiltersUpdate = getTimeGlobal();
// need to recompute this!
m_ufnMapValid = false;
sc->m_ufnMapValid = false;
// reset this cache
clearUfnTable();
// activate a scan if not already activated
m_waitingTreeNeedsRebuild = true;
sc->m_waitingTreeNeedsRebuild = true;
// if a scan is ongoing, this will re-set it
m_nextKey2.setMin();
sc->m_nextKey2.setMin();
// clear it?
m_waitingTree.clear();
m_waitingTable.clear();
// kick off the spiderdb scan
populateWaitingTreeFromSpiderdb(false);
sc->m_waitingTree.clear();
sc->m_waitingTable.clear();
// kick off the spiderdb scan to repopulate waiting tree and doledb
sc->populateWaitingTreeFromSpiderdb(false);
// nuke this state
mfree ( we , sizeof(WaitEntry) , "waitet" );
// note it
log("spider: finished clearing out doledb/waitingtree for %s",sc->m_coll);
}
// . call this when changing the url filters
// . will make all entries in waiting tree have zero time basically
// . and makes us repopulate doledb from these waiting tree entries
void SpiderColl::urlFiltersChanged ( ) {
// log it
log("spider: rebuilding doledb/waitingtree for coll=%s",getCollName());
WaitEntry *we = (WaitEntry *)mmalloc ( sizeof(WaitEntry) , "waite2" );
if ( ! we ) {
log("spider: wait entry alloc: %s",mstrerror(g_errno));
g_errno = 0;
return;
}
// prepare our state in case the purge operation blocks
we->m_registered = false;
we->m_cr = m_cr;
we->m_collnum = m_cr->m_collnum;
//we->m_callback = doDoledbNuke2;
//we->m_state = NULL;
// remove all recs from doledb for the given collection
doDoledbNuke ( 0 , we );
}
// this one has to scan all of spiderdb
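
urlFiltersChanged() now just allocates a WaitEntry and hands it to doDoledbNuke(), which keeps re-registering itself on a 100ms sleep callback until the doledb tree is done saving, then nukes doledb and kicks off the spiderdb scan. Here is the shape of that defer-and-retry loop reduced to a standalone sketch; Loop, isSavingTree() and the tick()-driven timer only imitate the g_loop calls in the hunk:

    #include <cstdio>
    #include <cstdlib>

    // fake stand-ins for g_loop and the tree-saving check, just to show the shape
    typedef void (*Callback)(int fd, void *state);
    struct Loop {
        Callback m_cb;
        void    *m_state;
        Loop() : m_cb(NULL), m_state(NULL) {}
        void registerSleepCallback(int /*ms*/, void *state, Callback cb) { m_cb = cb; m_state = state; }
        void unregisterSleepCallback(void *, Callback) { m_cb = NULL; }
        void tick() { if (m_cb) m_cb(0, m_state); }   // pretend the 100ms timer fired
    };
    static Loop g_loop;
    static int  g_savesLeft = 2;                      // pretend the tree is saving for two ticks
    static bool isSavingTree() { return g_savesLeft-- > 0; }

    struct WaitEntry { bool m_registered; };

    static void doNuke(int fd, void *state) {
        (void)fd;
        WaitEntry *we = (WaitEntry *)state;
        if (we->m_registered) g_loop.unregisterSleepCallback(we, doNuke);
        if (isSavingTree()) {                         // busy: try again in 100ms
            g_loop.registerSleepCallback(100, we, doNuke);
            we->m_registered = true;
            return;
        }
        printf("tree idle: nuking doledb and repopulating the waiting tree\n");
        free(we);                                     // done with our state, like mfree(we,...)
    }

    int main() {
        WaitEntry *we = (WaitEntry *)malloc(sizeof(WaitEntry));
        we->m_registered = false;
        doNuke(0, we);       // first attempt; defers because the tree is "saving"
        g_loop.tick();       // timer fires: still saving, defers again
        g_loop.tick();       // timer fires: idle now, nuke completes
        return 0;
    }
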
@@ -1611,8 +1669,10 @@ bool SpiderColl::addSpiderReply ( SpiderReply *srep ) {
// . skip the rest if injecting
// . otherwise it triggers a lookup for this firstip in spiderdb to
// get a new spider request to add to doledb
if ( srep->m_fromInjectionRequest )
return true;
// . no, because there might be more on disk from the same firstip
// so comment this out again
//if ( srep->m_fromInjectionRequest )
// return true;
// clear error for this
g_errno = 0;
@@ -1625,11 +1685,17 @@ bool SpiderColl::addSpiderReply ( SpiderReply *srep ) {
// and the webmaster did not have one. then we can
// crawl more vigorously...
//if ( srep->m_crawlDelayMS >= 0 ) {
bool update = false;
// use the domain hash for this guy! since its from robots.txt
long *cdp = (long *)m_cdTable.getValue32(srep->m_domHash32);
// update it only if better or empty
bool update = false;
if ( ! cdp ) update = true;
// no update if injecting or from pagereindex (docid based spider request)
if ( srep->m_fromInjectionRequest )
update = false;
//else if (((*cdp)&0xffffffff)<(uint32_t)srep->m_spideredTime)
// update = true;
// update m_sniTable if we should
@@ -1668,19 +1734,26 @@ bool SpiderColl::addSpiderReply ( SpiderReply *srep ) {
// . TODO: consult crawldelay table here too! use that value if is
// less than our sameIpWait
// . make m_lastDownloadTable an rdbcache ...
// . this is 0 for pagereindex docid-based replies
if ( srep->m_downloadEndTime )
m_lastDownloadCache.addLongLong ( m_collnum,
srep->m_firstIp ,
srep->m_downloadEndTime );
// log this for now
if ( g_conf.m_logDebugSpider )
log("spider: adding last download end time %lli for "
"ip=%s uh48=%llu indexcode=\"%s\" coll=%li "
"to SpiderColl::m_lastDownloadCache",
log("spider: adding spider reply, download end time %lli for "
"ip=%s(%lu) uh48=%llu indexcode=\"%s\" coll=%li "
"k.n1=%llu k.n0=%llu",
//"to SpiderColl::m_lastDownloadCache",
srep->m_downloadEndTime,
iptoa(srep->m_firstIp),srep->getUrlHash48(),
iptoa(srep->m_firstIp),
srep->m_firstIp,
srep->getUrlHash48(),
mstrerror(srep->m_errCode),
(long)m_collnum);
(long)m_collnum,
srep->m_key.n1,
srep->m_key.n0);
// ignore errors from that, it's just a cache
g_errno = 0;
// sanity check - test cache
@@ -2046,7 +2119,7 @@ bool SpiderColl::addToWaitingTree ( uint64_t spiderTimeMS , long firstIp ,
// only if we are the responsible host in the shard
if ( ! isAssignedToUs ( firstIp ) )
return true;
return false;
// . do not add to waiting tree if already in doledb
// . an ip should not exist in both doledb and waiting tree.
@@ -3879,10 +3952,10 @@ bool SpiderColl::addToDoleTable ( SpiderRequest *sreq ) {
long long pdocid = sreq->getParentDocId();
long ss = 1;
if ( score ) ss = *score + 1;
log("spider: added to doletbl uh48=%llu parentdocid=%llu "
"ipdolecount=%li ufn=%li priority=%li firstip=%s",
uh48,pdocid,ss,(long)sreq->m_ufn,(long)sreq->m_priority,
iptoa(sreq->m_firstIp));
//log("spider: added to doletbl uh48=%llu parentdocid=%llu "
// "ipdolecount=%li ufn=%li priority=%li firstip=%s",
// uh48,pdocid,ss,(long)sreq->m_ufn,(long)sreq->m_priority,
// iptoa(sreq->m_firstIp));
}
// we had a score there already, so inc it
if ( score ) {
@@ -5542,8 +5615,15 @@ bool SpiderLoop::spiderUrl2 ( ) {
//}
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: spidering uh48=%llu pdocid=%llu",
m_sreq->getUrlHash48(),m_sreq->getParentDocId() );
logf(LOG_DEBUG,"spider: spidering firstip9=%s(%lu) "
"uh48=%llu prntdocid=%llu k.n1=%llu k.n0=%llu",
iptoa(m_sreq->m_firstIp),
m_sreq->m_firstIp,
m_sreq->getUrlHash48(),
m_sreq->getParentDocId() ,
m_sreq->m_key.n1,
m_sreq->m_key.n0);
// this returns false and sets g_errno on error
if ( ! xd->set4 ( m_sreq ,
@@ -6495,7 +6575,9 @@ void handleRequest12 ( UdpSlot *udpSlot , long niceness ) {
// this will just return true if we are not the
// responsible host for this firstip
// DO NOT populate from this!!! say "false" here...
! sc->addToWaitingTree ( 0 , cq->m_firstIp, false ) ) {
! sc->addToWaitingTree ( 0 , cq->m_firstIp, false ) &&
// must be an error...
g_errno ) {
msg = "FAILED TO ADD TO WAITING TREE";
log("spider: %s %s",msg,mstrerror(g_errno));
us->sendErrorReply ( udpSlot , g_errno );
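
Because addToWaitingTree() (see the earlier hunk) now returns false for the benign not-assigned-to-us case as well as for real failures, handleRequest12 only treats false as an error when g_errno is also set. A tiny sketch of that return-false-plus-g_errno convention; the g_errno global and helper here are stand-ins, not the real Spider.cpp code:

    #include <cstdio>
    #include <cstring>
    #include <cerrno>

    static int g_errno = 0;   // stand-in for Gigablast's global error code

    // returns false either for a real error (g_errno set) or for the benign
    // "this shard is not responsible for that firstip" case (g_errno left 0)
    static bool addToWaitingTree(long firstIp, bool assignedToUs, bool outOfMem) {
        if (outOfMem) { g_errno = ENOMEM; return false; }
        if (!assignedToUs) return false;               // benign: some other host owns this ip
        printf("added firstip=%ld to waiting tree\n", firstIp);
        return true;
    }

    int main() {
        g_errno = 0;
        // caller pattern from handleRequest12: false alone is not a failure
        if (!addToWaitingTree(0x0a000001, /*assignedToUs=*/false, /*outOfMem=*/false) &&
            g_errno)
            printf("FAILED TO ADD TO WAITING TREE: %s\n", strerror(g_errno));
        else
            printf("ok: either added, or simply not ours to dole\n");
        return 0;
    }
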
@@ -6658,7 +6740,7 @@ void removeExpiredLocks ( long hostId ) {
// when we last cleaned them out
static time_t s_lastTime = 0;
long nowGlobal = getTimeGlobal();
long nowGlobal = getTimeGlobalNoCore();
long niceness = MAX_NICENESS;
// only do this once per second at the most

@@ -1900,8 +1900,15 @@ bool XmlDoc::indexDoc ( ) {
// to spiderdb to release the lock.
///
log("build: %s had internal error = %s. adding spider error reply.",
m_firstUrl.m_url,mstrerror(g_errno));
if ( m_firstUrlValid )
log("build: %s had internal error = %s. adding spider "
"error reply.",
m_firstUrl.m_url,mstrerror(g_errno));
else
log("build: docid=%lli had internal error = %s. adding spider "
"error reply.",
m_docId,mstrerror(g_errno));
if ( ! m_indexCodeValid ) {
m_indexCode = EINTERNALERROR;//g_errno;
@@ -1945,21 +1952,27 @@ bool XmlDoc::indexDoc ( ) {
// url spider lock in SpiderLoop::m_lockTable.
SpiderReply *nsr = getNewSpiderReply ();
if ( nsr == (void *)-1) { char *xx=NULL;*xx=0; }
if ( nsr->getRecSize() <= 1) { char *xx=NULL;*xx=0; }
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
SafeBuf metaList;
metaList.pushChar(RDB_SPIDERDB);
metaList.safeMemcpy ( (char *)nsr , nsr->getRecSize() );
//SafeBuf metaList;
m_metaList2.pushChar(RDB_SPIDERDB);
m_metaList2.safeMemcpy ( (char *)nsr , nsr->getRecSize() );
m_msg4Launched = true;
// log this for debug now
SafeBuf tmp;
nsr->print(&tmp);
log("xmldoc: added reply %s",tmp.getBufStart());
// clear g_errno
g_errno = 0;
if ( ! m_msg4.addMetaList ( metaList.getBufStart() ,
metaList.length() ,
if ( ! m_msg4.addMetaList ( m_metaList2.getBufStart() ,
m_metaList2.length() ,
cr->m_coll ,
m_masterState , // state
m_masterLoop ,
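
The spider-reply metalist is now built in the member m_metaList2 rather than a stack-local SafeBuf, presumably because msg4.addMetaList() takes m_masterState/m_masterLoop and can finish after indexDoc() returns, when a local buffer would already be gone. A stripped-down illustration of that lifetime issue and the fix; AsyncWriter is invented for this sketch and is not Gigablast's Msg4:

    #include <cstdio>
    #include <string>
    #include <functional>

    // invented stand-in for Msg4: it keeps a pointer to the metalist and
    // consumes it later, which is what makes a stack-local buffer dangerous
    struct AsyncWriter {
        const char *m_buf = nullptr;
        size_t      m_len = 0;
        std::function<void()> m_done;
        bool addMetaList(const char *buf, size_t len, std::function<void()> done) {
            m_buf = buf; m_len = len; m_done = done;   // no copy: caller must keep buf alive
            return false;                              // "blocked", will complete later
        }
        void complete() { fwrite(m_buf, 1, m_len, stdout); if (m_done) m_done(); }
    };

    struct Doc {
        std::string m_metaList2;   // member buffer: outlives indexDoc(), like m_metaList2
        AsyncWriter m_msg4;
        bool indexDoc() {
            // old shape: a local 'SafeBuf metaList' handed out a pointer into memory
            // that dies when this function returns, before m_msg4 consumes it
            m_metaList2 = "spiderdb reply record\n";
            return m_msg4.addMetaList(m_metaList2.data(), m_metaList2.size(),
                                      [] { printf("callback: reply added\n"); });
        }
    };

    int main() {
        Doc d;
        if (!d.indexDoc())        // blocked: data is consumed later, buffer still valid
            d.m_msg4.complete();
        return 0;
    }
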
@@ -15793,7 +15806,7 @@ char **XmlDoc::getExpandedUtf8Content ( ) {
// null term it
m_esbuf.pushChar('\0');
// and point to that buffer
m_expandedUtf8Content = m_esbuf.m_buf;
m_expandedUtf8Content = m_esbuf.getBufStart();//m_buf;
// include the \0 as part of the size
m_expandedUtf8ContentSize = m_esbuf.m_length; // + 1;
}
@@ -16012,7 +16025,7 @@ char **XmlDoc::getUtf8Content ( ) {
// final \0
*dst = '\0';
// re-assign these
m_expandedUtf8Content = m_xbuf.m_buf;
m_expandedUtf8Content = m_xbuf.getBufStart();//m_buf;
m_expandedUtf8ContentSize = m_xbuf.m_length + 1;
// free esbuf if we were referencing that to save mem
m_esbuf.purge();

@@ -6150,6 +6150,7 @@ long dumpSpiderdb ( char *coll,
if ( ! g_spiderdb.isSpiderRequest((key128_t *)srec) ) {
// print it
if ( ! printStats ) {
printf( "offset=%lli ",curOff);
g_spiderdb.print ( srec );
printf("\n");
}