A bunch of bug fixes, mostly spider-related.

Also some for pagereindex.
Matt Wells
2013-12-07 21:56:37 -07:00
parent 5e4b5a112c
commit 06edfddf31
10 changed files with 216 additions and 52 deletions

@@ -15718,7 +15718,8 @@ pd=(PlaceDesc *)g_cities.getValueFromSlot(pd->getSlot());
g_cityBuf = tbuf;
g_cityBufSize = tbufSize;
// do not let "sb" free it
sb.m_buf = NULL;
//sb.m_buf = NULL;
sb.detachBuf();
//if ( ! g_indicators.save ( g_hostdb.m_dir, "indicators.dat" ) )
// return log("places: failed to save indicators.dat");
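
The change above replaces the direct sb.m_buf = NULL poke with sb.detachBuf(), so the SafeBuf gives up ownership of the buffer that g_cityBuf now keeps. A minimal sketch of that hand-off using a stand-in Buf class; the detach() here is an assumption about what SafeBuf::detachBuf() does (forget the pointer so the destructor will not free it):

    #include <cstdlib>
    #include <cstdio>
    #include <cstring>

    // stand-in for SafeBuf: owns a heap buffer until it is detached
    class Buf {
    public:
        Buf() : m_buf(NULL), m_len(0) {}
        ~Buf() { if (m_buf) free(m_buf); }      // frees only if still owned
        bool pushStr(const char *s) {
            size_t n = strlen(s) + 1;
            char *nb = (char *)realloc(m_buf, m_len + n);
            if (!nb) return false;
            memcpy(nb + m_len, s, n);
            m_buf = nb;
            m_len += n;
            return true;
        }
        // like SafeBuf::detachBuf() presumably does: hand the buffer to the
        // caller and forget it so ~Buf() will not free it
        char *detach() { char *b = m_buf; m_buf = NULL; m_len = 0; return b; }
    private:
        char  *m_buf;
        size_t m_len;
    };

    int main() {
        char *g_cityBuf = NULL;            // long-lived "global", as in the hunk
        {
            Buf sb;
            sb.pushStr("Albuquerque");
            g_cityBuf = sb.detach();       // "do not let sb free it"
        }                                  // sb destructs without freeing
        printf("%s\n", g_cityBuf);
        free(g_cityBuf);
        return 0;
    }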

@@ -1234,6 +1234,11 @@ bool readwrite_r ( FileState *fstate , ThreadEntry *t ) {
bool BigFile::unlink ( ) {
return unlinkRename ( NULL , -1 , false, NULL, NULL );
}
bool BigFile::move ( char *newDir ) {
return rename ( m_baseFilename , newDir );
}
bool BigFile::rename ( char *newBaseFilename , char *newBaseFilenameDir ) {
return unlinkRename ( newBaseFilename, -1, false, NULL, NULL ,
newBaseFilenameDir );
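
BigFile::move() above is just rename() into a different directory with the same base filename; later in this commit Rdb::deleteAllRecs() uses it to push doledb files into the trash subdir. A rough standalone equivalent built on rename(2); the paths and trash layout are illustrative, not the real BigFile/unlinkRename internals:

    #include <cstdio>
    #include <string>

    // sketch: move a file into another directory, keeping its base name,
    // roughly what BigFile::move(newDir) delegates to unlinkRename() for
    static bool moveToDir(const std::string &path, const std::string &newDir) {
        // strip the directory part to get the base filename
        size_t slash = path.find_last_of('/');
        std::string base = (slash == std::string::npos) ? path : path.substr(slash + 1);
        std::string dst = newDir + "/" + base;
        // rename(2) only works within one filesystem; the trash dir presumably
        // lives under the same root as the collection data
        return ::rename(path.c_str(), dst.c_str()) == 0;
    }

    int main() {
        // hypothetical paths, mirroring the "%strash/" pattern in Rdb::deleteAllRecs
        if (!moveToDir("./coll.main.0/doledb0001.dat", "./trash"))
            perror("moveToDir");
        return 0;
    }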

@@ -8,6 +8,7 @@
#include "Datedb.h"
#include "Timedb.h"
#include "Spider.h"
#include "Process.h"
static CollectionRec g_default;
@@ -85,6 +86,7 @@ CollectionRec::CollectionRec() {
CollectionRec::~CollectionRec() {
//invalidateRegEx ();
reset();
}
// new collection recs get this called on them
@@ -109,6 +111,12 @@ void CollectionRec::reset() {
m_globalCrawlInfo.reset();
//m_requests = 0;
//m_replies = 0;
// free all RdbBases in each rdb
for ( long i = 0 ; i < g_process.m_numRdbs ; i++ ) {
Rdb *rdb = g_process.m_rdbs[i];
rdb->resetBase ( m_collnum );
}
}
CollectionRec *g_cr = NULL;

@@ -891,13 +891,13 @@ bool Collectiondb::resetColl ( char *coll , WaitEntry *we , bool purgeSeeds) {
// . updates RdbBase::m_collnum
// . so for the tree it just needs to mark the old collnum recs
// with a collnum -1 in case it is saving...
g_posdb.getRdb()->resetColl ( oldCollnum , newCollnum );
g_titledb.getRdb()->resetColl ( oldCollnum , newCollnum );
g_tagdb.getRdb()->resetColl ( oldCollnum , newCollnum );
g_spiderdb.getRdb()->resetColl ( oldCollnum , newCollnum );
g_doledb.getRdb()->resetColl ( oldCollnum , newCollnum );
g_clusterdb.getRdb()->resetColl ( oldCollnum , newCollnum );
g_linkdb.getRdb()->resetColl ( oldCollnum , newCollnum );
g_posdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_titledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_tagdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_spiderdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_doledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_clusterdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_linkdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
// reset crawl status too!
cr->m_spiderStatus = SP_INITIALIZING;

@@ -834,7 +834,8 @@ void sigalrmHandler ( int x , siginfo_t *info , void *y ) {
// if we missed too many, then dump core
if ( g_niceness == 1 && g_missedQuickPolls >= 4 ) {
//g_inSigHandler = true;
log("loop: missed quickpoll");
// NOT SAFE! can block forever waiting for a printf lock!
//log("loop: missed quickpoll");
//g_inSigHandler = false;
// seems to core a lot in gbcompress() we need to
// put a quickpoll into zlib deflate() or
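
The hunk above stops calling log() from the SIGALRM handler, since log()/printf() can block forever if the signal lands while another thread holds the print lock. If the handler has to record anything at all, the usual pattern is to keep it async-signal-safe: set a sig_atomic_t flag or write(2) a fixed string, and do the real logging later from normal code. A minimal sketch of that idea; the names are illustrative, not Gigablast's:

    #include <csignal>
    #include <cstdio>
    #include <unistd.h>

    // async-signal-safe bookkeeping: the handler only touches this flag
    // and calls write(2); printf()/log() stay out of signal context
    static volatile std::sig_atomic_t s_missedQuickPoll = 0;

    static void sigalrmHandler(int) {
        s_missedQuickPoll = 1;                        // safe: plain store
        const char msg[] = "loop: missed quickpoll\n";
        ssize_t rc = write(STDERR_FILENO, msg, sizeof(msg) - 1);  // write(2) is signal-safe
        (void)rc;
    }

    int main() {
        struct sigaction sa = {};
        sa.sa_handler = sigalrmHandler;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGALRM, &sa, NULL);
        raise(SIGALRM);
        // report from normal context, where log()/printf() are fine
        if (s_missedQuickPoll)
            printf("missed a quickpoll (logged outside the handler)\n");
        return 0;
    }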

@@ -2767,8 +2767,11 @@ bool Parms::setFromRequest ( HttpRequest *r ,
if ( changedUrlFilters && THIS != (char *)&g_conf ) {
// cast it
CollectionRec *cr = (CollectionRec *)THIS;
// to prevent us having to rebuild doledb/waitingtree at startup
// we need to make the spidercoll here so it is not null
SpiderColl *sc = g_spiderCache.getSpiderColl(cr->m_collnum);
// get it
SpiderColl *sc = cr->m_spiderColl;
//SpiderColl *sc = cr->m_spiderColl;
// this will rebuild the waiting tree
if ( sc ) sc->urlFiltersChanged();
}
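
The fix above asks g_spiderCache for the SpiderColl instead of reading the possibly-NULL cr->m_spiderColl, so the SpiderColl gets created right when the url filters change and the doledb/waitingtree rebuild is not deferred to the next startup. A toy version of that get-or-create lookup; the cache and types below are stand-ins, not the real g_spiderCache API:

    #include <cstdio>
    #include <map>
    #include <memory>

    // stand-in for SpiderColl
    struct SpiderColl {
        int m_collnum;
        void urlFiltersChanged() { printf("rebuilding waiting tree for collnum %d\n", m_collnum); }
    };

    // stand-in for g_spiderCache: get-or-create, so callers never see NULL for a
    // valid collnum, which is the property the Parms.cpp change relies on
    class SpiderCache {
    public:
        SpiderColl *getSpiderColl(int collnum) {
            std::unique_ptr<SpiderColl> &slot = m_map[collnum];
            if (!slot) slot.reset(new SpiderColl{collnum});
            return slot.get();
        }
    private:
        std::map<int, std::unique_ptr<SpiderColl> > m_map;
    };

    int main() {
        SpiderCache cache;
        // old shape: SpiderColl *sc = cr->m_spiderColl;  // could be NULL, rebuild skipped
        // new shape: materialize it so urlFiltersChanged() always runs
        SpiderColl *sc = cache.getSpiderColl(7);
        if (sc) sc->urlFiltersChanged();
        return 0;
    }
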
@@ -2890,6 +2893,7 @@ void Parms::setParm ( char *THIS , Parm *m , long mm , long j , char *s ,
// array whose "count" was not incremented like it should have been.
// HACK: make new line at bottom always have spidering enabled
// checkbox set and make it impossible to unset.
/*
if ( m->m_max > 1 && m->m_rowid >= 0 && mm > 0 &&
m_parms[mm-1].m_rowid == m->m_rowid ) {
char *pos = (char *)THIS + m_parms[mm-1].m_off - 4 ;
@@ -2902,6 +2906,7 @@ void Parms::setParm ( char *THIS , Parm *m , long mm , long j , char *s ,
return;
}
}
*/
// ensure array count at least j+1
if ( m->m_max > 1 ) {

Rdb.cpp (54 lines changed)

@@ -555,7 +555,47 @@ bool Rdb::addColl2 ( collnum_t collnum ) {
return true;
}
bool Rdb::resetColl ( collnum_t collnum , collnum_t newCollnum ) {
bool Rdb::resetBase ( collnum_t collnum ) {
CollectionRec *cr = g_collectiondb.getRec(collnum);
if ( ! cr ) return true;
RdbBase *base = cr->m_bases[(unsigned char)m_rdbId];
if ( ! base ) return true;
base->reset();
return true;
}
bool Rdb::deleteAllRecs ( collnum_t collnum ) {
// remove from tree
if(m_useTree) m_tree.delColl ( collnum );
else m_buckets.delColl ( collnum );
// only for doledb for now: because we unlink, we do not move the files
// into the trash subdir, and doledb is easily regenerated. i don't
// want to take the risk with other files.
if ( m_rdbId != RDB_DOLEDB ) { char *xx=NULL;*xx=0; }
CollectionRec *cr = g_collectiondb.getRec ( collnum );
RdbBase *base = cr->m_bases[(unsigned char)m_rdbId];
if ( ! base ) return true;
// scan files in there
for ( long i = 0 ; i < base->m_numFiles ; i++ ) {
BigFile *f = base->m_files[i];
// move to trash
char newdir[1024];
sprintf(newdir, "%strash/",g_hostdb.m_dir);
f->move ( newdir );
}
// nuke all the files
base->reset();
return true;
}
bool Rdb::deleteColl ( collnum_t collnum , collnum_t newCollnum ) {
//char *coll = g_collectiondb.m_recs[collnum]->m_coll;
@@ -645,7 +685,7 @@ bool Rdb::delColl ( char *coll ) {
}
// move all files to trash and clear the tree/buckets
resetColl ( collnum , collnum );
deleteColl ( collnum , collnum );
// remove these collnums from tree
//if(m_useTree) m_tree.delColl ( collnum );
@@ -2389,8 +2429,16 @@ bool Rdb::addRecord ( collnum_t collnum,
// don't actually add it if "fake". i.e. if it
// was an internal error of some sort... this will
// make it try over and over again i guess...
// no, because we need some kinda reply so that gb knows the
// pagereindex docid-based spider requests are done. we were not
// adding spider replies as the page reindexes completed, so when
// i tried to rerun it the title recs were not found since they
// had already been deleted. so we gotta add the replies now, at
// least for internal errors.
long indexCode = rr->m_errCode;
if ( indexCode == EINTERNALERROR ||
if ( //indexCode == EINTERNALERROR ||
indexCode == EABANDONED ||
indexCode == EHITCRAWLLIMIT ||
indexCode == EHITPROCESSLIMIT ) {

@@ -1235,25 +1235,83 @@ char *SpiderColl::getCollName() {
return cr->m_coll;
}
// . call this when changing the url filters
// . will make all entries in waiting tree have zero time basically
void SpiderColl::urlFiltersChanged ( ) {
// log it
log("spider: rebuilding waiting tree for coll=%s",getCollName());
m_lastUrlFiltersUpdate = getTimeGlobal();
//
// remove all recs from doledb for the given collection
//
void doDoledbNuke ( int fd , void *state ) {
WaitEntry *we = (WaitEntry *)state;
if ( we->m_registered )
g_loop.unregisterSleepCallback ( we , doDoledbNuke );
// . nuke doledb for this collnum
// . it will unlink the files and maps for doledb for this collnum
// . it will remove all recs of this collnum from its tree too
if ( g_doledb.getRdb()->isSavingTree () ) {
g_loop.registerSleepCallback ( 100 , we , doDoledbNuke );
we->m_registered = true;
return;
}
// . ok, tree is not saving, it should complete entirely from this call
// . crap this is moving the whole directory!!!
// . say "false" to not move whole coll dir
g_doledb.getRdb()->deleteAllRecs ( we->m_cr->m_collnum );
// re-add it back so the RdbBase is new'd
//g_doledb.getRdb()->addColl2 ( we->m_collnum );
// shortcut
SpiderColl *sc = we->m_cr->m_spiderColl;
sc->m_lastUrlFiltersUpdate = getTimeGlobal();
// need to recompute this!
m_ufnMapValid = false;
sc->m_ufnMapValid = false;
// reset this cache
clearUfnTable();
// activate a scan if not already activated
m_waitingTreeNeedsRebuild = true;
sc->m_waitingTreeNeedsRebuild = true;
// if a scan is ongoing, this will re-set it
m_nextKey2.setMin();
sc->m_nextKey2.setMin();
// clear it?
m_waitingTree.clear();
m_waitingTable.clear();
// kick off the spiderdb scan
populateWaitingTreeFromSpiderdb(false);
sc->m_waitingTree.clear();
sc->m_waitingTable.clear();
// kick off the spiderdb scan to repopulate waiting tree and doledb
sc->populateWaitingTreeFromSpiderdb(false);
// nuke this state
mfree ( we , sizeof(WaitEntry) , "waitet" );
// note it
log("spider: finished clearing out doledb/waitingtree for %s",sc->m_coll);
}
// . call this when changing the url filters
// . will make all entries in waiting tree have zero time basically
// . and makes us repopulate doledb from these waiting tree entries
void SpiderColl::urlFiltersChanged ( ) {
// log it
log("spider: rebuilding doledb/waitingtree for coll=%s",getCollName());
WaitEntry *we = (WaitEntry *)mmalloc ( sizeof(WaitEntry) , "waite2" );
if ( ! we ) {
log("spider: wait entry alloc: %s",mstrerror(g_errno));
g_errno = 0;
return;
}
// prepare our state in case the purge operation blocks
we->m_registered = false;
we->m_cr = m_cr;
we->m_collnum = m_cr->m_collnum;
//we->m_callback = doDoledbNuke2;
//we->m_state = NULL;
// remove all recs from doledb for the given collection
doDoledbNuke ( 0 , we );
}
// this one has to scan all of spiderdb
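
urlFiltersChanged() now just allocates a WaitEntry and hands it to doDoledbNuke(), which keeps re-registering itself on a 100ms sleep callback until the doledb tree is done saving, then nukes doledb and kicks off the spiderdb scan. Here is the shape of that defer-and-retry loop reduced to a standalone sketch; Loop, isSavingTree() and the tick()-driven timer only imitate the g_loop calls in the hunk:

    #include <cstdio>
    #include <cstdlib>

    // fake stand-ins for g_loop and the tree-saving check, just to show the shape
    typedef void (*Callback)(int fd, void *state);
    struct Loop {
        Callback m_cb;
        void    *m_state;
        Loop() : m_cb(NULL), m_state(NULL) {}
        void registerSleepCallback(int /*ms*/, void *state, Callback cb) { m_cb = cb; m_state = state; }
        void unregisterSleepCallback(void *, Callback) { m_cb = NULL; }
        void tick() { if (m_cb) m_cb(0, m_state); }   // pretend the 100ms timer fired
    };
    static Loop g_loop;
    static int  g_savesLeft = 2;                      // pretend the tree is saving for two ticks
    static bool isSavingTree() { return g_savesLeft-- > 0; }

    struct WaitEntry { bool m_registered; };

    static void doNuke(int fd, void *state) {
        (void)fd;
        WaitEntry *we = (WaitEntry *)state;
        if (we->m_registered) g_loop.unregisterSleepCallback(we, doNuke);
        if (isSavingTree()) {                         // busy: try again in 100ms
            g_loop.registerSleepCallback(100, we, doNuke);
            we->m_registered = true;
            return;
        }
        printf("tree idle: nuking doledb and repopulating the waiting tree\n");
        free(we);                                     // done with our state, like mfree(we,...)
    }

    int main() {
        WaitEntry *we = (WaitEntry *)malloc(sizeof(WaitEntry));
        we->m_registered = false;
        doNuke(0, we);       // first attempt; defers because the tree is "saving"
        g_loop.tick();       // timer fires: still saving, defers again
        g_loop.tick();       // timer fires: idle now, nuke completes
        return 0;
    }
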
@@ -1611,8 +1669,10 @@ bool SpiderColl::addSpiderReply ( SpiderReply *srep ) {
// . skip the rest if injecting
// . otherwise it triggers a lookup for this firstip in spiderdb to
// get a new spider request to add to doledb
if ( srep->m_fromInjectionRequest )
return true;
// . no, because there might be more on disk from the same firstip
// so comment this out again
//if ( srep->m_fromInjectionRequest )
// return true;
// clear error for this
g_errno = 0;
@@ -1625,11 +1685,17 @@ bool SpiderColl::addSpiderReply ( SpiderReply *srep ) {
// and the webmaster did not have one. then we can
// crawl more vigorously...
//if ( srep->m_crawlDelayMS >= 0 ) {
bool update = false;
// use the domain hash for this guy! since its from robots.txt
long *cdp = (long *)m_cdTable.getValue32(srep->m_domHash32);
// update it only if better or empty
bool update = false;
if ( ! cdp ) update = true;
// no update if injecting or from pagereindex (docid based spider request)
if ( srep->m_fromInjectionRequest )
update = false;
//else if (((*cdp)&0xffffffff)<(uint32_t)srep->m_spideredTime)
// update = true;
// update m_sniTable if we should
@@ -1668,19 +1734,26 @@ bool SpiderColl::addSpiderReply ( SpiderReply *srep ) {
// . TODO: consult crawldelay table here too! use that value if is
// less than our sameIpWait
// . make m_lastDownloadTable an rdbcache ...
// . this is 0 for pagereindex docid-based replies
if ( srep->m_downloadEndTime )
m_lastDownloadCache.addLongLong ( m_collnum,
srep->m_firstIp ,
srep->m_downloadEndTime );
// log this for now
if ( g_conf.m_logDebugSpider )
log("spider: adding last download end time %lli for "
"ip=%s uh48=%llu indexcode=\"%s\" coll=%li "
"to SpiderColl::m_lastDownloadCache",
log("spider: adding spider reply, download end time %lli for "
"ip=%s(%lu) uh48=%llu indexcode=\"%s\" coll=%li "
"k.n1=%llu k.n0=%llu",
//"to SpiderColl::m_lastDownloadCache",
srep->m_downloadEndTime,
iptoa(srep->m_firstIp),srep->getUrlHash48(),
iptoa(srep->m_firstIp),
srep->m_firstIp,
srep->getUrlHash48(),
mstrerror(srep->m_errCode),
(long)m_collnum);
(long)m_collnum,
srep->m_key.n1,
srep->m_key.n0);
// ignore errors from that, it's just a cache
g_errno = 0;
// sanity check - test cache
@@ -2046,7 +2119,7 @@ bool SpiderColl::addToWaitingTree ( uint64_t spiderTimeMS , long firstIp ,
// only if we are the responsible host in the shard
if ( ! isAssignedToUs ( firstIp ) )
return true;
return false;
// . do not add to waiting tree if already in doledb
// . an ip should not exist in both doledb and waiting tree.
@@ -3879,10 +3952,10 @@ bool SpiderColl::addToDoleTable ( SpiderRequest *sreq ) {
long long pdocid = sreq->getParentDocId();
long ss = 1;
if ( score ) ss = *score + 1;
log("spider: added to doletbl uh48=%llu parentdocid=%llu "
"ipdolecount=%li ufn=%li priority=%li firstip=%s",
uh48,pdocid,ss,(long)sreq->m_ufn,(long)sreq->m_priority,
iptoa(sreq->m_firstIp));
//log("spider: added to doletbl uh48=%llu parentdocid=%llu "
// "ipdolecount=%li ufn=%li priority=%li firstip=%s",
// uh48,pdocid,ss,(long)sreq->m_ufn,(long)sreq->m_priority,
// iptoa(sreq->m_firstIp));
}
// we had a score there already, so inc it
if ( score ) {
@@ -5542,8 +5615,15 @@ bool SpiderLoop::spiderUrl2 ( ) {
//}
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: spidering uh48=%llu pdocid=%llu",
m_sreq->getUrlHash48(),m_sreq->getParentDocId() );
logf(LOG_DEBUG,"spider: spidering firstip9=%s(%lu) "
"uh48=%llu prntdocid=%llu k.n1=%llu k.n0=%llu",
iptoa(m_sreq->m_firstIp),
m_sreq->m_firstIp,
m_sreq->getUrlHash48(),
m_sreq->getParentDocId() ,
m_sreq->m_key.n1,
m_sreq->m_key.n0);
// this returns false and sets g_errno on error
if ( ! xd->set4 ( m_sreq ,
@@ -6495,7 +6575,9 @@ void handleRequest12 ( UdpSlot *udpSlot , long niceness ) {
// this will just return true if we are not the
// responsible host for this firstip
// DO NOT populate from this!!! say "false" here...
! sc->addToWaitingTree ( 0 , cq->m_firstIp, false ) ) {
! sc->addToWaitingTree ( 0 , cq->m_firstIp, false ) &&
// must be an error...
g_errno ) {
msg = "FAILED TO ADD TO WAITING TREE";
log("spider: %s %s",msg,mstrerror(g_errno));
us->sendErrorReply ( udpSlot , g_errno );
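
Because addToWaitingTree() (see the earlier hunk) now returns false for the benign not-assigned-to-us case as well as for real failures, handleRequest12 only treats false as an error when g_errno is also set. A tiny sketch of that return-false-plus-g_errno convention; the g_errno global and helper here are stand-ins, not the real Spider.cpp code:

    #include <cstdio>
    #include <cstring>
    #include <cerrno>

    static int g_errno = 0;   // stand-in for Gigablast's global error code

    // returns false either for a real error (g_errno set) or for the benign
    // "this shard is not responsible for that firstip" case (g_errno left 0)
    static bool addToWaitingTree(long firstIp, bool assignedToUs, bool outOfMem) {
        if (outOfMem) { g_errno = ENOMEM; return false; }
        if (!assignedToUs) return false;               // benign: some other host owns this ip
        printf("added firstip=%ld to waiting tree\n", firstIp);
        return true;
    }

    int main() {
        g_errno = 0;
        // caller pattern from handleRequest12: false alone is not a failure
        if (!addToWaitingTree(0x0a000001, /*assignedToUs=*/false, /*outOfMem=*/false) &&
            g_errno)
            printf("FAILED TO ADD TO WAITING TREE: %s\n", strerror(g_errno));
        else
            printf("ok: either added, or simply not ours to dole\n");
        return 0;
    }
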
@@ -6658,7 +6740,7 @@ void removeExpiredLocks ( long hostId ) {
// when we last cleaned them out
static time_t s_lastTime = 0;
long nowGlobal = getTimeGlobal();
long nowGlobal = getTimeGlobalNoCore();
long niceness = MAX_NICENESS;
// only do this once per second at the most

@@ -1900,8 +1900,15 @@ bool XmlDoc::indexDoc ( ) {
// to spiderdb to release the lock.
///
log("build: %s had internal error = %s. adding spider error reply.",
m_firstUrl.m_url,mstrerror(g_errno));
if ( m_firstUrlValid )
log("build: %s had internal error = %s. adding spider "
"error reply.",
m_firstUrl.m_url,mstrerror(g_errno));
else
log("build: docid=%lli had internal error = %s. adding spider "
"error reply.",
m_docId,mstrerror(g_errno));
if ( ! m_indexCodeValid ) {
m_indexCode = EINTERNALERROR;//g_errno;
@@ -1945,21 +1952,27 @@ bool XmlDoc::indexDoc ( ) {
// url spider lock in SpiderLoop::m_lockTable.
SpiderReply *nsr = getNewSpiderReply ();
if ( nsr == (void *)-1) { char *xx=NULL;*xx=0; }
if ( nsr->getRecSize() <= 1) { char *xx=NULL;*xx=0; }
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
SafeBuf metaList;
metaList.pushChar(RDB_SPIDERDB);
metaList.safeMemcpy ( (char *)nsr , nsr->getRecSize() );
//SafeBuf metaList;
m_metaList2.pushChar(RDB_SPIDERDB);
m_metaList2.safeMemcpy ( (char *)nsr , nsr->getRecSize() );
m_msg4Launched = true;
// log this for debug now
SafeBuf tmp;
nsr->print(&tmp);
log("xmldoc: added reply %s",tmp.getBufStart());
// clear g_errno
g_errno = 0;
if ( ! m_msg4.addMetaList ( metaList.getBufStart() ,
metaList.length() ,
if ( ! m_msg4.addMetaList ( m_metaList2.getBufStart() ,
m_metaList2.length() ,
cr->m_coll ,
m_masterState , // state
m_masterLoop ,
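
The spider-reply metalist is now built in the member m_metaList2 rather than a stack-local SafeBuf, presumably because msg4.addMetaList() takes m_masterState/m_masterLoop and can finish after indexDoc() returns, when a local buffer would already be gone. A stripped-down illustration of that lifetime issue and the fix; AsyncWriter is invented for this sketch and is not Gigablast's Msg4:

    #include <cstdio>
    #include <string>
    #include <functional>

    // invented stand-in for Msg4: it keeps a pointer to the metalist and
    // consumes it later, which is what makes a stack-local buffer dangerous
    struct AsyncWriter {
        const char *m_buf = nullptr;
        size_t      m_len = 0;
        std::function<void()> m_done;
        bool addMetaList(const char *buf, size_t len, std::function<void()> done) {
            m_buf = buf; m_len = len; m_done = done;   // no copy: caller must keep buf alive
            return false;                              // "blocked", will complete later
        }
        void complete() { fwrite(m_buf, 1, m_len, stdout); if (m_done) m_done(); }
    };

    struct Doc {
        std::string m_metaList2;   // member buffer: outlives indexDoc(), like m_metaList2
        AsyncWriter m_msg4;
        bool indexDoc() {
            // old shape: a local 'SafeBuf metaList' handed out a pointer into memory
            // that dies when this function returns, before m_msg4 consumes it
            m_metaList2 = "spiderdb reply record\n";
            return m_msg4.addMetaList(m_metaList2.data(), m_metaList2.size(),
                                      [] { printf("callback: reply added\n"); });
        }
    };

    int main() {
        Doc d;
        if (!d.indexDoc())        // blocked: data is consumed later, buffer still valid
            d.m_msg4.complete();
        return 0;
    }
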
@@ -15793,7 +15806,7 @@ char **XmlDoc::getExpandedUtf8Content ( ) {
// null term it
m_esbuf.pushChar('\0');
// and point to that buffer
m_expandedUtf8Content = m_esbuf.m_buf;
m_expandedUtf8Content = m_esbuf.getBufStart();//m_buf;
// include the \0 as part of the size
m_expandedUtf8ContentSize = m_esbuf.m_length; // + 1;
}
@@ -16012,7 +16025,7 @@ char **XmlDoc::getUtf8Content ( ) {
// final \0
*dst = '\0';
// re-assign these
m_expandedUtf8Content = m_xbuf.m_buf;
m_expandedUtf8Content = m_xbuf.getBufStart();//m_buf;
m_expandedUtf8ContentSize = m_xbuf.m_length + 1;
// free esbuf if we were referencing that to save mem
m_esbuf.purge();

@@ -6150,6 +6150,7 @@ long dumpSpiderdb ( char *coll,
if ( ! g_spiderdb.isSpiderRequest((key128_t *)srec) ) {
// print it
if ( ! printStats ) {
printf( "offset=%lli ",curOff);
g_spiderdb.print ( srec );
printf("\n");
}