fix spiders getting lost when updating crawl info.

fix maxSpidersPerIp limit not being obeyed.
removed fakedb.
only add waiting tree keys with a time of "0" to the waiting tree.
only scanSpiderdb() will change their times to
a future time or add them to doledb directly.
confirmLockAcquisition() will not add to the waiting tree
if the max spiders per ip limit would be exceeded.
an incoming spider reply will trigger the add to
the waiting tree with a time of "0".
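roughly, the new flow is sketched below. the names follow Spider.cpp, but
the signatures and helper lookups are simplified assumptions for
illustration, not the literal code in this commit:

    // sketch: the reply side re-arms the waiting tree with a "0" entry
    void onSpiderReplySketch ( SpiderColl *sc , SpiderReply *srep , long firstIp ) {
            long long uh48 = srep->getUrlHash48();
            UrlLock *lock = (UrlLock *)g_spiderLoop.m_lockTable.getValue ( &uh48 );
            if ( lock ) {
                    lock->m_expires           = getTimeGlobal() + 5; // 5-sec grace period
                    lock->m_spiderOutstanding = false;               // spider is done
            }
            // always a time of "0". only scanSpiderdb() picks a real future time.
            sc->addToWaitingTree ( 0 , firstIp , true );
    }

    // sketch: lock confirmation respects the per-ip quota
    void onLockConfirmedSketch ( SpiderColl *sc , long firstIp , long maxSpidersPerIp ) {
            // at quota? add nothing. the next spider reply re-adds the "0" entry.
            if ( g_spiderLoop.getNumSpidersOutPerIp ( firstIp ) >= maxSpidersPerIp ) return;
            sc->addToWaitingTree ( 0 , firstIp , true );
    }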
mwells
2013-09-28 13:12:33 -06:00
parent 00910e36d7
commit 9730e5f3ef
10 changed files with 249 additions and 65 deletions

@ -62,6 +62,10 @@ CollectionRec::CollectionRec() {
//}
m_numRegExs = 0;
m_requests = 0;
m_replies = 0;
m_doingCallbacks = false;
// for diffbot caching the global spider stats
reset();
@ -81,6 +85,11 @@ void CollectionRec::setToDefaults ( ) {
}
void CollectionRec::reset() {
// make sure we do not leave spiders "hanging" waiting for their
// callback to be called... and it never gets called
if ( m_callbackQueue.length() > 0 ) { char *xx=NULL;*xx=0; }
if ( m_doingCallbacks ) { char *xx=NULL;*xx=0; }
if ( m_replies != m_requests ) { char *xx=NULL;*xx=0; }
m_localCrawlInfo.reset();
m_globalCrawlInfo.reset();
m_requests = 0;

@ -423,6 +423,7 @@ class CollectionRec {
// for counting replies
long m_replies;
long m_requests;
bool m_doingCallbacks;
// for storing callbacks waiting in line for freshest crawl info
SafeBuf m_callbackQueue;

@ -2324,10 +2324,10 @@ uint32_t Hostdb::getGroupId ( char rdbId,void *k,bool split ) {
unsigned long long d = g_revdb.getDocId( (key_t *)k );
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
}
else if ( rdbId == RDB_FAKEDB ) {
// HACK:!!!!!! this is a trick!!! it is us!!!
return g_hostdb.m_myHost->m_groupId;
}
//else if ( rdbId == RDB_FAKEDB ) {
// // HACK:!!!!!! this is a trick!!! it is us!!!
// return g_hostdb.m_myHost->m_groupId;
//}
//else if ( rdbId == RDB_CATDB || rdbId == RDB2_CATDB2 ) {
// return m_map [(*(uint16_t *)((char *)k + 10))>>3];

@ -50,7 +50,7 @@ static void sleepCallback4 ( int bogusfd , void *state ) ;
static bool sendBuffer ( long hostId , long niceness ) ;
static Multicast *getMulticast ( ) ;
static void returnMulticast ( Multicast *mcast ) ;
static void processSpecialSignal ( collnum_t collnum , char *p ) ;
//static void processSpecialSignal ( collnum_t collnum , char *p ) ;
//static bool storeList2 ( RdbList *list , char rdbId , collnum_t collnum,
// bool forceLocal, bool splitList , long niceness );
static bool storeRec ( collnum_t collnum ,
@ -1122,16 +1122,16 @@ bool addMetaList ( char *p , UdpSlot *slot ) {
if ( rdbId != lastRdbId ) {
rdb = getRdbFromId ( (char) rdbId );
// skip RDBFAKEDB
if ( rdbId == RDB_FAKEDB ) {
// do special handler process
processSpecialSignal ( collnum , p );
// skip the fakedb record
p += recSize;
// drop it for now!!
if ( p < pend ) goto loop;
// all done
return true;
}
//if ( rdbId == RDB_FAKEDB ) {
// // do special handler process
// processSpecialSignal ( collnum , p );
// // skip the fakedb record
// p += recSize;
// // drop it for now!!
// if ( p < pend ) goto loop;
// // all done
// return true;
//}
// an uninitialized secondary rdb? it will have a keysize
// of 0 if it's never been initialized from the repair page
if ( rdb && rdb->m_ks <= 0 ) {
@ -1596,6 +1596,7 @@ bool loadAddsInProgress ( char *prefix ) {
// right now the FAKEDB record is a signal to remove the spider lock
// from the lock table because we are done spidering it.
//
/*
void processSpecialSignal ( collnum_t collnum , char *p ) {
key_t *fake = (key_t *)p;
@ -1649,4 +1650,5 @@ void processSpecialSignal ( collnum_t collnum , char *p ) {
// do not actually add this fake key to titledb!
//return true;
}
*/

@ -3,16 +3,16 @@
#define CRAWLBOT_H
// values for the diffbot dropdown
#define DBA_NONE 1
#define DBA_ALL 2
#define DBA_ARTICLE_FORCE 3
#define DBA_ARTICLE_AUTO 4
#define DBA_PRODUCT_FORCE 5
#define DBA_PRODUCT_AUTO 6
#define DBA_IMAGE_FORCE 7
#define DBA_IMAGE_AUTO 8
#define DBA_FRONTPAGE_FORCE 9
#define DBA_FRONTPAGE_AUTO 10
#define DBA_NONE 0
#define DBA_ALL 1
#define DBA_ARTICLE_FORCE 2
#define DBA_ARTICLE_AUTO 3
#define DBA_PRODUCT_FORCE 4
#define DBA_PRODUCT_AUTO 5
#define DBA_IMAGE_FORCE 6
#define DBA_IMAGE_AUTO 7
#define DBA_FRONTPAGE_FORCE 8
#define DBA_FRONTPAGE_AUTO 9
// add new fields to END of list since i think we store the
// field we use as a number in the coll.conf, starting at 0

@ -2416,7 +2416,7 @@ Rdb *getRdbFromId ( uint8_t rdbId ) {
s_table9 [ RDB_MONITORDB ] = g_monitordb.getRdb();
s_table9 [ RDB_STATSDB ] = g_statsdb.getRdb();
s_table9 [ RDB_REVDB ] = g_revdb.getRdb();
s_table9 [ RDB_FAKEDB ] = NULL;
//s_table9 [ RDB_FAKEDB ] = NULL;
s_table9 [ RDB2_INDEXDB2 ] = g_indexdb2.getRdb();
s_table9 [ RDB2_POSDB2 ] = g_posdb2.getRdb();
@ -2562,7 +2562,7 @@ long getDataSizeFromRdbId ( uint8_t rdbId ) {
i == RDB_TFNDB ||
i == RDB_CLUSTERDB ||
i == RDB_DATEDB ||
i == RDB_FAKEDB ||
//i == RDB_FAKEDB ||
i == RDB_LINKDB )
ds = 0;
else if ( i == RDB_TITLEDB ||

Rdb.h

@ -37,7 +37,7 @@ enum {
RDB_CACHEDB, // 17
RDB_SERPDB, // 18
RDB_MONITORDB, // 19
RDB_FAKEDB, // used by spider.cpp to fake things out
//RDB_FAKEDB, // used by spider.cpp to fake things out
// . secondary rdbs for rebuilding done in PageRepair.cpp
// . we add new recs into these guys and then make the original rdbs
// point to them when we are done.

@ -1516,7 +1516,47 @@ bool SpiderColl::addSpiderReply ( SpiderReply *srep ) {
// if ( last != srep->m_downloadEndTime ) { char *xx=NULL;*xx=0;}
//}
/////////
//
// remove the lock here
//
//////
long long uh48 = srep->getUrlHash48() ;
// shortcut
HashTableX *ht = &g_spiderLoop.m_lockTable;
UrlLock *lock = (UrlLock *)ht->getValue ( &uh48 );
time_t nowGlobal = getTimeGlobal();
if ( g_conf.m_logDebugSpiderFlow )
logf(LOG_DEBUG,"spflow: scheduled lock removal in 5 secs for "
"uh48=%llu", uh48 );
// test it
//if ( m_nowGlobal == 0 && lock )
// m_nowGlobal = getTimeGlobal();
// we do it this way rather than remove it ourselves
// because a lock request for this guy
// might be currently outstanding, and it will end up
// being granted the lock even though we have by now removed
// it from doledb, because it read doledb before we removed
// it! so wait 5 seconds for the doledb negative key to
// be absorbed to prevent a url we just spidered from being
// re-spidered right away because of this sync issue.
if ( lock ) lock->m_expires = nowGlobal + 5;
/////
//
// but do note that its spider has returned for populating the
// waiting tree. addToWaitingTree should not add an entry if
// a spiderReply is still pending according to the lock table,
// UNLESS, maxSpidersPerIP is more than what the lock table says
// is currently being spidered.
//
/////
if ( lock ) lock->m_spiderOutstanding = false;
// bitch if not in there
if ( !lock ) // &&g_conf.m_logDebugSpider)//ht->isInTable(&lockKey))
logf(LOG_DEBUG,"spider: rdb: uh48=%llu "
"was not in lock table",uh48);
// skip:
// . add to wait tree and let it populate doledb on its batch run
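for context, the lock-granting side (handleRequest12()) is not part of this
hunk. a minimal sketch of how the 5-second grace period is meant to deny a
late lock request, with an assumed helper name and simplified signature:

    // sketch: a lock whose m_expires is 0 is still being spidered; one set
    // to now+5 by the reply handling above keeps blocking re-locks until
    // the doledb negative key has been absorbed.
    bool canGrantLockSketch ( long long lockKey ) {
            UrlLock *lock = (UrlLock *)g_spiderLoop.m_lockTable.getValue ( &lockKey );
            if ( ! lock ) return true;                   // nobody holds it
            if ( lock->m_expires == 0 ) return false;    // spider still outstanding
            return getTimeGlobal() >= lock->m_expires;   // honor the grace period
    }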
@ -1666,15 +1706,16 @@ bool SpiderColl::addSpiderRequest ( SpiderRequest *sreq ,
sreq->m_priority = priority;
// get spider time -- i.e. earliest time when we can spider it
uint64_t spiderTimeMS = getSpiderTimeMS (sreq,ufn,NULL,nowGlobalMS );
//uint64_t spiderTimeMS = getSpiderTimeMS (sreq,ufn,NULL,nowGlobalMS );
// sanity
if ( (long long)spiderTimeMS < 0 ) { char *xx=NULL;*xx=0; }
//if ( (long long)spiderTimeMS < 0 ) { char *xx=NULL;*xx=0; }
// once in waiting tree, we will scan waiting tree and then lookup
// each firstIp in waiting tree in spiderdb to get the best
// SpiderRequest for that firstIp, then we can add it to doledb
// as long as it can be spidered now
bool status = addToWaitingTree ( spiderTimeMS , sreq->m_firstIp, true);
//bool status = addToWaitingTree ( spiderTimeMS,sreq->m_firstIp,true);
addToWaitingTree ( 0 , sreq->m_firstIp , true );
// if already doled and we beat the priority/spidertime of what
// was doled then we should probably delete the old doledb key
@ -1700,7 +1741,7 @@ bool SpiderColl::addSpiderRequest ( SpiderRequest *sreq ,
}
if ( ! g_conf.m_logDebugSpider ) return status;
if ( ! g_conf.m_logDebugSpider ) return true;//status;
// log it
logf(LOG_DEBUG,
@ -1714,7 +1755,8 @@ bool SpiderColl::addSpiderRequest ( SpiderRequest *sreq ,
"ufn=%li "
"priority=%li "
"addedtime=%lu "
"spidertime=%llu",
//"spidertime=%llu",
,
sreq->m_url,
sreq->getUrlHash48(),
iptoa(sreq->m_firstIp),
@ -1724,10 +1766,11 @@ bool SpiderColl::addSpiderRequest ( SpiderRequest *sreq ,
(long)(bool)sreq->m_isPageReindex,
(long)sreq->m_ufn,
(long)sreq->m_priority,
sreq->m_addedTime,
spiderTimeMS);
sreq->m_addedTime
//spiderTimeMS);
);
return status;
return true;//status;
}
bool SpiderColl::printWaitingTree ( ) {
@ -1746,6 +1789,29 @@ bool SpiderColl::printWaitingTree ( ) {
return true;
}
//////
//
// . 1. called by addSpiderReply(). it should have the sameIpWait available
// or at least that will be in the crawldelay cache table.
// SpiderReply::m_crawlDelayMS. Unfortunately, no maxSpidersPerIP!!!
// we just add a "0" in the waiting tree which means scanSpiderdb() will
// be called and can get the maxSpidersPerIP from the winning candidate
// and add to the waiting tree based on that.
// . 2. called by addSpiderRequest(). It SHOULD maybe just add a "0" as well
// to offload the logic. try that.
// . 3. called by populateWaitingTreeFromSpiderdb(). it just adds "0" as well,
// if not doled
// . 4. UPDATED in scanSpiderdb() if the best SpiderRequest for a firstIp is
// in the future, this is the only time we will add a waiting tree key
// whose spider time is non-zero. that is where we also take
// sameIpWait and maxSpidersPerIP into consideration. scanSpiderdb()
// will actually REMOVE the entry from the waiting tree if that IP
// already has the max spiders outstanding per IP. when a spiderReply
// is received it will populate the waiting tree again with a "0" entry
// and scanSpiderdb() will re-do its check.
//
//////
// . if one of these add fails consider increasing mem used by tree/table
// . if we lose an ip that sux because it won't be gotten again unless
// we somehow add another request/reply to spiderdb in the future
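the scanSpiderdb() side of that protocol, as a minimal sketch.
getSpiderTimeMS() and removeFromWaitingTree() are the real calls;
evaluateFirstIpSketch(), getBestRequestForIp(), updateWaitingTreeKey() and
addToDoledb() are assumed stand-ins for what the real scan does inline:

    // sketch: what scanSpiderdb() does with a "0" waiting tree entry
    void evaluateFirstIpSketch ( SpiderColl *sc , long firstIp , uint64_t nowMS ) {
            SpiderRequest *best = getBestRequestForIp ( firstIp ); // winner of the scan
            if ( ! best ) { sc->removeFromWaitingTree ( firstIp ); return; }
            uint64_t t = sc->getSpiderTimeMS ( best , best->m_ufn , NULL , nowMS );
            if ( t > nowMS ) {
                    // the only place a non-zero spider time enters the waiting tree
                    updateWaitingTreeKey ( sc , firstIp , t );
                    return;
            }
            // due now: dole it out. the "0" entry comes back when the reply arrives.
            addToDoledb ( sc , best );
    }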
@ -1755,6 +1821,12 @@ bool SpiderColl::addToWaitingTree ( uint64_t spiderTimeMS , long firstIp ,
// a sooner spiderTimeMS
//if ( m_waitingTable.isInTable ( &firstIp ) ) return true;
// . this can now be only 0
// . only scanSpiderdb will add a waiting tree key with a non-zero
// value after it figures out the EARLIEST time that a
// SpiderRequest from this firstIp can be spidered.
if ( spiderTimeMS != 0 ) { char *xx=NULL;*xx=0; }
// waiting tree might be saving!!!
if ( ! m_waitingTree.m_isWritable ) {
log("spider: addtowaitingtree: failed. is not writable. "
@ -2126,6 +2198,15 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
// skip if ip already represented in doledb i guess otherwise
// the populatedoledb scan will nuke it!!
if ( m_doleIpTable.isInTable ( &firstIp ) ) continue;
// not currently spidering either. when they got their
// lock they called confirmLockAcquisition() which will
// have added an entry to the waiting table. sometimes the
// lock still exists but the spider is done, because the
// lock persists for 5 seconds afterwards in case a lock
// request for that url was in progress, so that request
// will be denied.
if ( g_spiderLoop.getNumSpidersOutPerIp ( firstIp ) > 0 )
continue;
// otherwise, we want to add it with 0 time so the doledb
// scan will evaluate it properly
// this will return false if we are saving the tree i guess
@ -2441,6 +2522,23 @@ bool SpiderColl::scanSpiderdb ( bool needList ) {
if ( needList && m_nextKey == m_firstKey && m_bestRequestValid ) {
char *xx=NULL; *xx=0 ; }
// . if the scanning ip has too many outstanding spiders
// . looks at UrlLock::m_firstIp and UrlLock::m_spiderOutstanding
// since the lock lives for 5 seconds after the spider reply
// comes back.
// . when the spiderReply comes back that will re-add a "0" entry
// to the waiting tree.
// . PROBLEM: some spiders don't seem to add a spiderReply!! wtf???
// they end up having their locks timeout after like 3 hrs?
// . maybe just do not add to waiting tree in confirmLockAcquisition()
// handler in such cases? YEAH.. try that
//long numOutPerIp = getOustandingSpidersPerIp ( firstIp );
//if ( numOutPerIp > maxSpidersPerIp ) {
// // remove from the tree and table
// removeFromWaitingTree ( firstIp );
// return true;
//}
readLoop:
// if we re-entered from the read wrapper, jump down
@ -2833,7 +2931,8 @@ bool SpiderColl::scanSpiderdb ( bool needList ) {
// that scanSpiderdb() repopulates doledb again with that
// "firstIp". this way we can spider multiple urls from the
// same ip at the same time.
if ( g_spiderLoop.isInLockTable(sreq->m_probDocId) )
long long uh48 = sreq->getUrlHash48();
if ( g_spiderLoop.m_lockTable.isInTable ( &uh48 ) )
continue;
// ok, we got a new winner
@ -2993,6 +3092,15 @@ bool SpiderColl::scanSpiderdb ( bool needList ) {
if ( m_bestRequest->m_ufn < 0 ) { char *xx=NULL;*xx=0; }
if ( m_bestRequest->m_priority == -1 ) { char *xx=NULL;*xx=0; }
////////////////////
//
// UPDATE WAITING TREE ENTRY
//
// Normally the "spidertime" is 0 for a firstIp. This will make it
// a future time if it is not yet due for spidering.
//
////////////////////
// if best request has a future spiderTime, at least update
// the wait tree with that since we will not be doling this request
// right now.
@ -3047,7 +3155,8 @@ bool SpiderColl::scanSpiderdb ( bool needList ) {
// somehow started spidering since our last spider read, so i would
// say we should bail on this spider scan! really i'm not exactly
// sure what happened...
if ( g_spiderLoop.isInLockTable ( m_bestRequest->m_probDocId ) ) {
long long uh48 = m_bestRequest->getUrlHash48();
if ( g_spiderLoop.m_lockTable.isInTable ( &uh48 ) ) {
log("spider: best request got doled out from under us");
return true;
char *xx=NULL;*xx=0;
@ -3093,11 +3202,11 @@ bool SpiderColl::scanSpiderdb ( bool needList ) {
//
// delete the winner from ufntree as well
//
long long uh48 = m_bestRequest->getUrlHash48();
long long buh48 = m_bestRequest->getUrlHash48();
key128_t bkey = makeUfnTreeKey ( m_bestRequest->m_firstIp ,
m_bestRequest->m_priority ,
m_bestSpiderTimeMS ,
uh48 );
buh48 );
// must be in tree!
long node = s_ufnTree.getNextNode ( 0, (char *)&bkey );
// if this firstip had too few requests to make it into the
@ -3186,6 +3295,13 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
// if not found in cache
if ( lastMS == (uint64_t)-1 ) minSpiderTimeMS = 0LL;
/////////////////////////////////////////////////
/////////////////////////////////////////////////
// TODO: put crawldelay table check in here!!!!
/////////////////////////////////////////////////
/////////////////////////////////////////////////
// wait 5 seconds for all outlinks in order for them to have a
// chance to get any link info that might have been added
// from the page that supplied this outlink
@ -4745,12 +4861,6 @@ void gotLockReplyWrapper ( void *state , UdpSlot *slot ) {
else g_spiderLoop.spiderDoledUrls();
}
bool SpiderLoop::isInLockTable ( long long probDocId ) {
unsigned long long lockKey=g_titledb.getFirstProbableDocId(probDocId);
HashTableX *ht = &g_spiderLoop.m_lockTable;
return ht->isInTable ( &lockKey );
}
// . returns false if blocked, true otherwise.
// . returns true and sets g_errno on error
// . before we can spider for a SpiderRequest we must be granted the lock
@ -5189,6 +5299,10 @@ long SpiderLoop::getNumSpidersOutPerIp ( long firstIp ) {
if ( ! ht->m_flags[i] ) continue;
// cast lock
UrlLock *lock = (UrlLock *)ht->getValueFromSlot(i);
// skip if not outstanding -- the lock is just lingering for its
// 5-second grace period after the spiderReply returned, so that a lock
// request for the same url that was in progress will be denied.
if ( ! lock->m_spiderOutstanding ) continue;
// skip if not yet expired
if ( lock->m_firstIp == firstIp ) count++;
}
@ -5381,6 +5495,16 @@ void handleRequest12 ( UdpSlot *udpSlot , long niceness ) {
tmp.m_timestamp = nowGlobal;
tmp.m_expires = 0;
tmp.m_firstIp = lr->m_firstIp;
// when the spider returns we do not remove its lock on reception of the
// spiderReply; addSpiderReply() just sets the m_expires time to 5 seconds
// into the future in case a request to get a lock for that url is in
// progress, and marks the spider as completed by setting
// m_spiderOutstanding to false. that way getNumSpidersOutPerIp() will
// not count it towards the "max spiders per IP" quota when deciding
// whether to add a new waiting tree entry for this IP. here, at lock
// grant time, the spider is outstanding, so start it off as true.
tmp.m_spiderOutstanding = true;
// put it into the table
if ( ! ht->addKey ( &lr->m_lockKey , &tmp ) ) {
// return error if that failed!
@ -8746,6 +8870,10 @@ bool updateCrawlInfo ( CollectionRec *cr ,
void (* callback)(void *state) ,
bool useCache ) {
// prevent condition described in gotCrawlInfoReply() below
if ( cr->m_doingCallbacks ) return true;
long now = getTimeLocal();
// keep it fresh within 1 second
long thresh = 1;
@ -8762,13 +8890,18 @@ bool updateCrawlInfo ( CollectionRec *cr ,
CallbackEntry2 ce2;
ce2.m_state = state;
ce2.m_callback = callback;
if ( ! cr->m_callbackQueue.safeMemcpy ( &ce2, sizeof(CallbackEntry2)) )
if ( ! cr->m_callbackQueue.safeMemcpy ( &ce2, sizeof(CallbackEntry2))){
log("spider: failed to queue update crawl info request");
return true;
}
// if we were not the first, we do not initiate it, we just wait
// for all the replies to come back
if ( cr->m_replies < cr->m_requests ) return false;
// sanity test
if ( cr->m_replies > cr->m_requests ) { char *xx=NULL;*xx=0; }
cr->m_globalCrawlInfo.reset();
cr->m_replies = 0;
@ -8782,7 +8915,11 @@ bool updateCrawlInfo ( CollectionRec *cr ,
for ( long i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
Host *h = g_hostdb.getHost(i);
// skip if dead
if ( g_hostdb.isDead(i) ) continue;
if ( g_hostdb.isDead(i) ) {
log("spider: skipping dead host #%li when getting "
"crawl info",i);
continue;
}
// count it as launched
cr->m_requests++;
if ( ! g_udpServer.sendRequest ( request,
@ -8844,6 +8981,9 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// return if still waiting on more to come in
if ( cr->m_replies < cr->m_requests ) return;
// sanity check
if ( cr->m_replies > cr->m_requests ) { char *xx=NULL;*xx=0; }
// update cache time
cr->m_globalCrawlInfo.m_lastUpdateTime = getTime();
@ -8852,16 +8992,36 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// call all callbacks
long nc = cr->m_callbackQueue.length() / sizeof(CallbackEntry2);
// wtf? need to be at least one in here
if ( nc <= 0 ) { char *xx=NULL;*xx=0; }
// see note below. if the m_callback we call ends up calling
// updateCrawlInfo() its callback in callbackqueue will get flushed
// in the purge() below and will be a forever lost spider. so prevent
// that.
cr->m_doingCallbacks = true;
// call each callback
char *p = cr->m_callbackQueue.getBufStart();
for ( long i = 0 ; i < nc ; i++ ) {
CallbackEntry2 *ce2 = (CallbackEntry2 *)p;
p += sizeof(CallbackEntry2);
// clear g_errno just in case
g_errno = 0;
// call that callback waiting in the queue
// debug note
XmlDoc *xd = (XmlDoc *)(ce2->m_state);
log("spider: calling crawlupdate callback for %s",
xd->m_firstUrl.m_url);
// . call that callback waiting in the queue
// . crap! if this callback ends up calling updateCrawlInfo()
// itself, then the purge below flushes its callback out!
// SHIT!
ce2->m_callback ( ce2->m_state );
}
cr->m_doingCallbacks = false;
// save the mem!
cr->m_callbackQueue.purge();
}
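the "lost spiders" race that m_doingCallbacks closes, as a self-contained
sketch. the types and names here are simplified stand-ins for
CollectionRec, CallbackEntry2 and SafeBuf:

    #include <cstddef>
    #include <vector>

    struct CallbackEntrySketch { void (*m_callback)(void *); void *m_state; };

    struct CollSketch {
            std::vector<CallbackEntrySketch> m_callbackQueue;
            bool m_doingCallbacks = false;
    };

    // caller wants fresh global crawl info; returns true if it can proceed now
    bool updateCrawlInfoSketch ( CollSketch *cr , void *state , void (*cb)(void *) ) {
            // re-entrant call from a callback we are draining right now: queueing
            // here would be purged below and the spider would never be called back
            if ( cr->m_doingCallbacks ) return true;
            cr->m_callbackQueue.push_back ( { cb , state } );
            // the first caller would launch the per-host requests here; the rest wait
            return false;
    }

    void gotCrawlInfoReplySketch ( CollSketch *cr ) {
            cr->m_doingCallbacks = true;
            for ( size_t i = 0 ; i < cr->m_callbackQueue.size() ; i++ )
                    cr->m_callbackQueue[i].m_callback ( cr->m_callbackQueue[i].m_state );
            cr->m_doingCallbacks = false;
            cr->m_callbackQueue.clear(); // the purge() that used to drop re-queued entries
    }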

@ -1155,6 +1155,7 @@ public:
long m_timestamp;
long m_expires;
long m_firstIp;
char m_spiderOutstanding;
};
class Msg12 {

@ -1721,7 +1721,7 @@ void XmlDoc::setStatus ( char *s ) {
s_last = s;
if ( ! g_conf.m_logDebugBuild ) return ;
//if ( ! g_conf.m_logDebugBuild ) return ;
//return;
if ( m_firstUrlValid )
logf(LOG_DEBUG,"build: status = %s for %s (this=0x%lx)",
@ -15057,6 +15057,8 @@ char *XmlDoc::getIsVisible ( ) {
long *XmlDoc::getUrlFilterNum ( ) {
// return it if already set
if ( m_urlFilterNumValid ) return &m_urlFilterNum;
// note that
setStatus ( "getting url filter row num");
// make the partial new spider rec
SpiderReply *newsr = getNewSpiderReply ( );
// note it
@ -17275,7 +17277,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
//if ( m_doledbKey.n0 || m_doledbKey.n1 )
// needx += 1 + sizeof(key_t); // + 4;
// the FAKEDB unlock key for msg12 in spider.cpp
needx += 1 + sizeof(key_t); // FAKEDB
//needx += 1 + sizeof(key_t); // FAKEDB
// make the buffer
m_metaList = (char *)mmalloc ( needx , "metalist");
if ( ! m_metaList ) return NULL;
@ -17322,13 +17324,15 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// . make a fake titledb key
// . remove the spider lock (Msg12 in Spider.cpp)
*m_p++ = RDB_FAKEDB;
// . now Spider.cpp uses SpiderReply reception to remove lock
// - mdw 9/28/13
//*m_p++ = RDB_FAKEDB;
//*(key_t *)m_p = g_titledb.makeKey ( m_docId , 0LL , true );
key_t fakeKey;
fakeKey.n1 = 0;
fakeKey.n0 = m_docId;
memcpy ( m_p , &fakeKey , sizeof(key_t) );
m_p += sizeof(key_t);
//key_t fakeKey;
//fakeKey.n1 = 0;
//fakeKey.n0 = m_docId;
//memcpy ( m_p , &fakeKey , sizeof(key_t) );
//m_p += sizeof(key_t);
// now add the new rescheduled time
setStatus ( "adding SpiderReply to spiderdb" );
// rdbid first
@ -17718,6 +17722,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
if ( dbr->length() > 3 ) {
// make sure diffbot reply is valid for sure
if ( ! m_diffbotReplyValid ) { char *xx=NULL;*xx=0; }
// set status for this
setStatus ( "indexing diffbot json doc");
// new guy here
if ( ! m_dx ) {
try { m_dx = new ( XmlDoc ); }
@ -18711,7 +18717,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
*/
// note it
setStatus ( "removing spider lock");
//setStatus ( "removing spider lock");
// . make a fake titledb key
// . remove the spider lock (Msg12 in Spider.cpp)
// . no need to do this if called from Repair.cpp
@ -18719,13 +18725,15 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// . i added "&& m_useSpiderdb" here because it was messing up
// the cacheTermLists() function which ONLY wants posdb keys and
// any other keys in the metalist messes it up. MDW 1/26/13
if ( ! m_useSecondaryRdbs && ! forDelete && m_useSpiderdb ) {
*m_p++ = RDB_FAKEDB;
((key_t *)m_p)->n1 = 0;
((key_t *)m_p)->n0 = m_docId;
//= g_titledb.makeKey ( m_docId , 0LL , true );
m_p += sizeof(key_t);
}
// . now Spider.cpp uses SpiderReply reception to remove lock
// - mdw 9/28/13
//if ( ! m_useSecondaryRdbs && ! forDelete && m_useSpiderdb ) {
// *m_p++ = RDB_FAKEDB;
// ((key_t *)m_p)->n1 = 0;
// ((key_t *)m_p)->n0 = m_docId;
// //= g_titledb.makeKey ( m_docId , 0LL , true );
// m_p += sizeof(key_t);
//}
bool addReply = true;
@ -29722,6 +29730,9 @@ SafeBuf *XmlDoc::getNewTagBuf ( ) {
char **rtbufp = getRootTitleBuf();
if ( ! rtbufp || rtbufp == (void *)-1) return (SafeBuf *)rtbufp;
// overwrite "getting root title buf" status
setStatus ("computing new tags");
if ( g_conf.m_logDebugLinkInfo )
log("xmldoc: adding tags for mysite=%s",mysite);