Better crawl status reporting.

Allow '_' (underscore) in collection names.
Matt Wells 2013-10-30 10:00:46 -07:00
parent a1ac5a5348
commit adf4d258ae
8 changed files with 213 additions and 112 deletions

@@ -7,6 +7,7 @@
#include "Threads.h"
#include "Datedb.h"
#include "Timedb.h"
#include "Spider.h"
static CollectionRec g_default;
@@ -29,8 +30,8 @@ CollectionRec::CollectionRec() {
m_overflow = 0x12345678;
m_overflow2 = 0x12345678;
// the spiders are currently uninhibited i guess
m_spiderStatus = 0;
m_spiderStatusMsg = NULL;
m_spiderStatus = SP_INITIALIZING; // this is 0
//m_spiderStatusMsg = NULL;
// for Url::getSite()
m_updateSiteRulesTable = 1;
m_lastUpdateTime = 0LL;

@@ -85,7 +85,7 @@ class CrawlInfo {
long long m_objectsDeleted; // 1
long long m_objectsAdded; // 2
long long m_urlsConsidered; // 3
long long m_urlsConsideredNOTUSED; // 3
long long m_pageDownloadAttempts; // 4
long long m_pageDownloadSuccesses; // 5
long long m_pageProcessAttempts; // 6
@@ -304,7 +304,7 @@ class CollectionRec {
long m_maxQueryTerms;
char m_spiderStatus;
char *m_spiderStatusMsg;
//char *m_spiderStatusMsg;
// Language stuff
float m_languageUnknownWeight;

@@ -185,6 +185,7 @@ bool Collectiondb::addRec ( char *coll , char *cpc , long cpclen , bool isNew ,
for ( ; *p ; p++ ) {
if ( is_alnum_a(*p) ) continue;
if ( *p == '-' ) continue;
if ( *p == '_' ) continue; // underscore now allowed
break;
}
if ( *p ) {
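For reference, a minimal standalone sketch of the loosened name check (hypothetical helper, not in the diff; assumes is_alnum_a() is the codebase's ASCII alnum test):

    // returns true if every character is alphanumeric, '-' or '_'
    static bool isValidCollName ( char *coll ) {
            for ( char *p = coll ; *p ; p++ ) {
                    if ( is_alnum_a(*p) ) continue; // a-z A-Z 0-9
                    if ( *p == '-' ) continue;      // dash allowed
                    if ( *p == '_' ) continue;      // underscore now allowed
                    return false;                   // anything else is illegal
            }
            return true;
    }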
@@ -774,8 +775,8 @@ bool Collectiondb::resetColl ( char *coll , bool resetTurkdb ) {
cr->m_spiderRoundNum = 0;
cr->m_spiderRoundStartTime = 0;
cr->m_spiderStatus = 0;
cr->m_spiderStatusMsg = NULL;
cr->m_spiderStatus = SP_INITIALIZING; // this is 0
//cr->m_spiderStatusMsg = NULL;
// reset seed buf
cr->m_diffbotSeeds.purge();

@@ -791,6 +791,41 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
return true;
}
//long pathLen = hr->getPathLen();
char rdbId = RDB_NONE;
bool downloadJSON = false;
long fmt;
char *xx;
if ( ( xx = strstr ( path , "_data.json" ) ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_JSON;
downloadJSON = true;
}
else if ( ( xx = strstr ( path , "_data.xml" ) ) ) {
rdbId = RDB_TITLEDB;
downloadJSON = true;
fmt = FMT_XML;
}
else if ( ( xx = strstr ( path , "_urls.csv" ) ) ) {
rdbId = RDB_SPIDERDB;
fmt = FMT_CSV;
}
else if ( ( xx = strstr ( path , "_pages.txt" ) ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_TXT;
}
// sanity, must be one of 3 download calls
if ( rdbId == RDB_NONE ) {
char *msg ;
msg = "usage: downloadurls, downloadpages, downloaddata";
log("crawlbot: %s",msg);
g_httpServer.sendErrorReply(sock,500,msg);
return true;
}
char *coll = str + 10;
if ( coll >= pathEnd ) {
char *msg = "bad download request2";
@@ -799,14 +834,8 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
return true;
}
char *collEnd = strstr ( coll , "_");
if ( ! collEnd ) {
char *msg = "bad download request3";
log("crawlbot: %s",msg);
g_httpServer.sendErrorReply(sock,500,msg);
return true;
}
// get coll
char *collEnd = xx;
//CollectionRec *cr = getCollRecFromHttpRequest ( hr );
CollectionRec *cr = g_collectiondb.getRec ( coll , collEnd - coll );
@@ -817,29 +846,6 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
return true;
}
//long pathLen = hr->getPathLen();
char rdbId = RDB_NONE;
bool downloadJSON = false;
long fmt;
if ( strstr ( path , "_data.json" ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_JSON;
downloadJSON = true;
}
if ( strstr ( path , "_data.xml" ) ) {
rdbId = RDB_TITLEDB;
downloadJSON = true;
fmt = FMT_XML;
}
else if ( strstr ( path , "_urls.csv" ) ) {
rdbId = RDB_SPIDERDB;
fmt = FMT_CSV;
}
else if ( strstr ( path , "_pages.txt" ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_TXT;
}
//if ( strncmp ( path ,"/crawlbot/downloadurls",22 ) == 0 )
@@ -851,14 +857,6 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
// rdbId = RDB_TITLEDB;
//}
// sanity, must be one of 3 download calls
if ( rdbId == RDB_NONE ) {
char *msg ;
msg = "usage: downloadurls, downloadpages, downloaddata";
log("crawlbot: %s",msg);
g_httpServer.sendErrorReply(sock,500,msg);
return true;
}
StateCD *st;
try { st = new (StateCD); }
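Moving the suffix dispatch above the collection-name parsing lets the matched suffix pointer (xx) double as the name terminator, which is what makes underscores inside collection names safe here: the removed strstr ( coll , "_" ) above truncated the name at its first underscore. Illustrative request paths (hypothetical collection my_coll; the /crawlbot/download/ prefix is assumed from the str + 10 offset):

    /crawlbot/download/my_coll_data.json  -> RDB_TITLEDB  , FMT_JSON
    /crawlbot/download/my_coll_data.xml   -> RDB_TITLEDB  , FMT_XML
    /crawlbot/download/my_coll_urls.csv   -> RDB_SPIDERDB , FMT_CSV
    /crawlbot/download/my_coll_pages.txt  -> RDB_TITLEDB  , FMT_TXT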
@@ -1268,7 +1266,8 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
long nowGlobalMS = gettimeofdayInMillisecondsGlobal();
CollectionRec *cr = g_collectiondb.getRec(m_collnum);
long lastSpidered = 0;
// parse through it
for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) {
// this record is either a SpiderRequest or SpiderReply
@@ -1279,7 +1278,12 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
// spiderrequests for the same url
if ( g_spiderdb.isSpiderReply ( (key128_t *)rec ) ) {
srep = (SpiderReply *)rec;
if ( sreq ) lastSpidered = 0;
sreq = NULL;
if ( lastSpidered == 0 )
lastSpidered = srep->m_spideredTime;
else if ( srep->m_spideredTime > lastSpidered )
lastSpidered = srep->m_spideredTime;
prevReplyUh48 = srep->getUrlHash48();
// 0 means indexed successfully. not sure if
// this includes http status codes like 404 etc.
@@ -1307,6 +1311,12 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
if ( ! printIt ) continue;
lastUh48 = uh48;
// make sure spiderreply is for the same url!
if ( srep && srep->getUrlHash48() != sreq->getUrlHash48() )
srep = NULL;
if ( ! srep )
lastSpidered = 0;
// debug point
//if ( strstr(sreq->m_url,"chief") )
// log("hey");
@@ -1382,12 +1392,14 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
);
// but default to csv
else {
sb->safePrintf("\"%s\",%lu,\"%s\",\"%s\",\""
sb->safePrintf("\"%s\",%lu,%lu,\"%s\",\"%s\",\""
//",%s"
//"\n"
, sreq->m_url
// when was it first added to spiderdb?
, sreq->m_addedTime
// last time spidered, 0 if none
, lastSpidered
//, status
, msg
// the url filter expression it matches
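An illustrative CSV row under the new format (all values hypothetical): url, time first added to spiderdb, time last spidered (0 if never), status message, then the matching url filter expression; the row continues with further quoted fields:

    "http://example.com/page.html",1383150000,1383153600,"Done","default",...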
@@ -2326,8 +2338,11 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
// collectionrec must be non-null at this point. i.e. we added it
if ( ! cr ) {
char *msg = "Crawl name was not found.";
if ( name && name[0] )
msg = "Failed to add crawl. Crawl name is illegal.";
//log("crawlbot: no collection found. need to add a crawl");
return sendErrorReply2(socket,fmt,"no crawls found. add one.");
return sendErrorReply2(socket,fmt, msg);
}
//char *spots = hr->getString("spots",NULL,NULL);
@@ -2727,36 +2742,9 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//if ( cx->m_collectionNameAlias.length() > 0 )
// alias=cx->m_collectionNameAlias.getBufStart();
//long paused = 1;
char *ss = "Crawl in progress.";
if ( cx->m_spiderStatusMsg )
ss = cx->m_spiderStatusMsg;
// 0 means not to RE-crawl
char tmp[256];
// indicate if we are WAITING for next round...
if ( cx->m_collectiveRespiderFrequency > 0.0 &&
getTimeGlobal() < cx->m_spiderRoundStartTime ) {
long now = getTimeGlobal();
sprintf(tmp,"Next crawl round to start in %li "
"seconds.",
cx->m_spiderRoundStartTime - now
);
ss = tmp;
}
// if we sent an email simply because no urls
// were left and we are not recrawling!
if ( cx->m_collectiveRespiderFrequency == 0.0 &&
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
ss = "Crawl has completed and no "
"repeatCrawl is scheduled.";
}
if ( ! cx->m_spideringEnabled )
ss = "Crawl paused.";
// if spiderdb is empty for this coll, then no url
// has been added to spiderdb yet.. either seed or spot
CrawlInfo *cg = &cx->m_globalCrawlInfo;
if ( cg->m_pageDownloadAttempts == 0 )
ss = "Crawl is initializing.";
SafeBuf tmp;
long crawlStatus = -1;
getSpiderStatusMsg ( cx , &tmp , &crawlStatus );
CrawlInfo *ci = &cx->m_localCrawlInfo;
long sentAlert = (long)ci->m_sentCrawlDoneAlert;
@@ -2766,7 +2754,9 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"\"name\":\"%s\",\n"
//"\"alias\":\"%s\",\n"
//"\"crawlingEnabled\":%li,\n"
"\"crawlStatus\":\"%s\",\n"
"\"crawlStatus\":{"
"\"status\":%li,"
"\"message\":\"%s\"},\n"
"\"sentCrawlDoneNotification\":%li,\n"
//"\"crawlingPaused\":%li,\n"
"\"objectsFound\":%lli,\n"
@@ -2789,7 +2779,8 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
, cx->m_diffbotCrawlName.getBufStart()
//, alias
//, (long)cx->m_spideringEnabled
, ss
, crawlStatus
, tmp.getBufStart()
, sentAlert
//, (long)paused
, cx->m_globalCrawlInfo.m_objectsAdded -
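With this change the JSON status becomes a structured object instead of a flat string; an illustrative fragment of the resulting response (values hypothetical; status 7 is SP_INPROGRESS per the codes added in Spider.h below):

    "crawlStatus":{"status":7,"message":"Crawl is in progress."},
    "sentCrawlDoneNotification":0,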

@@ -3070,10 +3070,10 @@ bool sendNotification ( EmailInfo *ei ) {
if ( email && email[0] ) {
log("build: sending email notification to %s for "
"crawl \"%s\" : %s",
email,crawl,ei->m_spiderStatusMsg);
email,crawl,ei->m_spiderStatusMsg.getBufStart());
SafeBuf msg;
msg.safePrintf("Your crawl \"%s\" has a new status: %s"
, ei->m_spiderStatusMsg
, ei->m_spiderStatusMsg.getBufStart()
, crawl );
// reset m_length otherwise it builds up
@@ -3110,7 +3110,7 @@ bool sendNotification ( EmailInfo *ei ) {
"X-Crawl-Status: %s"// \r\n" // hdrs
, cr->m_diffbotCrawlName.getBufStart()
, ei->m_spiderStatusMsg
, ei->m_spiderStatusMsg.getBufStart()
);
// GET request
if ( ! g_httpServer.getDoc ( url ,

@@ -16,7 +16,8 @@ public:
SafeBuf m_fromAddress;
SafeBuf m_subject;
SafeBuf m_body;
char *m_spiderStatusMsg;
//char *m_spiderStatusMsg;
SafeBuf m_spiderStatusMsg;
//CollectionRec *m_cr;
collnum_t m_collnum;
char *m_dom; // ref into m_toAddress of the domain in email addr

@@ -2412,7 +2412,9 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
// log it
if ( m_numAdded )
log("spider: added %li recs to waiting tree from "
"scan of %lli bytes",m_numAdded,m_numBytesScanned);
"scan of %lli bytes coll=%s",
m_numAdded,m_numBytesScanned,
m_cr->m_coll);
// reset the count for next scan
m_numAdded = 0 ;
m_numBytesScanned = 0;
@@ -3962,11 +3964,6 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
g_spiderLoop.spiderDoledUrls( );
}
#define SP_MAXROUNDS 1
#define SP_MAXTOCRAWL 2
#define SP_MAXTOPROCESS 3
#define SP_ROUNDDONE 4
void doneSendingNotification ( void *state ) {
EmailInfo *ei = (EmailInfo *)state;
collnum_t collnum = ei->m_collnum;
@@ -3982,6 +3979,9 @@
// pingserver.cpp sets this
//ei->m_inUse = false;
log("spider: setting current spider status to %li",
(long)cr->m_spiderStatus);
// mark it as sent. anytime a new url is spidered will mark this
// as false again! use LOCAL crawlInfo, since global is reset often.
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = cr->m_spiderStatus;//1;
@@ -4064,6 +4064,14 @@ bool sendNotificationForCollRec ( CollectionRec *cr ) {
if ( g_hostdb.m_myHost->m_hostId != 0 )
return true;
// . if already sent email for this, skip
// . localCrawlInfo stores this value on disk so it is persistent
// . we do it this way so SP_ROUNDDONE can be emailed and then
// we'd email SP_MAXROUNDS to indicate we've hit the maximum
// round count.
if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert == cr->m_spiderStatus )
return true;
// do not send email for maxrounds hit, it will send a round done
// email for that. otherwise we end up calling doneSendingEmail()
// twice and increment the round twice
@@ -4073,17 +4081,15 @@
return true;
}
// . if already sent email for this, skip
// . localCrawlInfo stores this value on disk so it is persistent
// . we do it this way so SP_ROUNDDONE can be emailed and then
// we'd email SP_MAXROUNDS to indicate we've hit the maximum
// round count.
if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert == cr->m_spiderStatus )
return true;
// wtf? caller must set this
if ( ! cr->m_spiderStatus ) { char *xx=NULL; *xx=0; }
log("spider: trying to send notification for new crawl status %li. "
"current status is %li",
(long)cr->m_spiderStatus,
//cr->m_spiderStatusMsg,
(long)cr->m_localCrawlInfo.m_sentCrawlDoneAlert);
// if we already sent it return now. we set this to false everytime
// we spider a url, which resets it. use local crawlinfo for this
// since we reset global.
@@ -4103,7 +4109,9 @@
ei->m_finalState = ei;
ei->m_collnum = cr->m_collnum;
ei->m_spiderStatusMsg = cr->m_spiderStatusMsg;
SafeBuf *buf = &ei->m_spiderStatusMsg;
long status = -1;
getSpiderStatusMsg ( cr , buf , &status );
// if no email address or webhook provided this will not block!
if ( ! sendNotification ( ei ) ) return false;
@@ -4112,6 +4120,11 @@
return true;
}
// we need to update crawl info for collections that
// have urls ready to spider
SpiderColl *getNextSpiderColl ( long *cri ) ;
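A minimal sketch of the resulting notification flow (assembled from the hunks above; the EmailInfo allocation is elided): the message is now rendered at send time into the EmailInfo's own SafeBuf rather than pointing at a string owned elsewhere, so it cannot dangle while the asynchronous send is in flight:

    ei->m_collnum = cr->m_collnum;
    long status = -1;
    // render e.g. "Crawl has reached maxToCrawl limit." into ei's buffer
    getSpiderStatusMsg ( cr , &ei->m_spiderStatusMsg , &status );
    // returns false if the email/webhook send blocks; callback fires later
    if ( ! sendNotification ( ei ) ) return false;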
@@ -4204,8 +4217,6 @@ void SpiderLoop::spiderDoledUrls ( ) {
if ( cr->m_maxCrawlRounds > 0 &&
cr->m_spiderRoundNum >= cr->m_maxCrawlRounds ) {
cr->m_spiderStatus = SP_MAXROUNDS;
cr->m_spiderStatusMsg = "Crawl has reached "
"maxCrawlRounds limit.";
// it'll send a SP_ROUNDDONE email first
// so no need to repeat it, but we do want to
// update the status msg
@@ -4217,8 +4228,6 @@
if ( cr->m_globalCrawlInfo.m_pageDownloadSuccesses >=
cr->m_maxToCrawl ) {
cr->m_spiderStatus = SP_MAXTOCRAWL;
cr->m_spiderStatusMsg = "Crawl has reached maxToCrawl "
"limit.";
sendNotificationForCollRec ( cr );
continue;
}
@@ -4227,8 +4236,6 @@
if ( cr->m_globalCrawlInfo.m_pageProcessSuccesses >=
cr->m_maxToProcess ) {
cr->m_spiderStatus = SP_MAXTOPROCESS;
cr->m_spiderStatusMsg = "Crawl has reached "
"maxToProcess limit.";
sendNotificationForCollRec ( cr );
continue;
}
@@ -4947,8 +4954,8 @@ bool SpiderLoop::gotDoledbList2 ( ) {
ci->m_hasUrlsReadyToSpider = true;
// reset reason why crawl is not running, because we basically are now
cr->m_spiderStatus = 0;
cr->m_spiderStatusMsg = NULL;
cr->m_spiderStatus = SP_INPROGRESS; // 0;
//cr->m_spiderStatusMsg = NULL;
// be sure to save state so we do not re-send emails
cr->m_needsSave = 1;
@@ -10000,10 +10007,17 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
//if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert == SP_ROUNDDONE )
// if we have urls ready to be spidered then prepare to send another
// email/webhook notification
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
// . if we have urls ready to be spidered then prepare to send another
// email/webhook notification.
// . do not reset this flag if SP_MAXTOCRAWL etc otherwise we end up
// sending multiple notifications, so this logic here is only
// for when we are done spidering a round, which happens when
// hasUrlsReadyToSpider goes false for all shards.
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider &&
cr->m_localCrawlInfo.m_sentCrawlDoneAlert == SP_ROUNDDONE ) {
log("spider: resetting sent crawl done alert to 0");
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
}
// update cache time
@@ -10060,7 +10074,8 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// and we've examined at least one url. to prevent us from
// sending a notification if we haven't spidered anything
// because no seed urls have been added/injected.
if ( cr->m_globalCrawlInfo.m_urlsConsidered == 0 ) return;
//if ( cr->m_globalCrawlInfo.m_urlsConsidered == 0 ) return;
if ( cr->m_globalCrawlInfo.m_pageDownloadAttempts == 0 ) return;
// if urls were considered and roundstarttime is still 0 then
// set it to the current time...
@@ -10074,7 +10089,6 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// update status
cr->m_spiderStatus = SP_ROUNDDONE;
cr->m_spiderStatusMsg = "Crawl round completed.";
// do email and web hook...
sendNotificationForCollRec ( cr );
@@ -10161,3 +10175,81 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
slot );
}
bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , long *status ) {
//char *ss = "Crawl in progress.";
//if ( cx->m_spiderStatusMsg )
// ss = cx->m_spiderStatusMsg;
if ( cx->m_spiderStatus == SP_MAXTOCRAWL ) {
*status = SP_MAXTOCRAWL;
return msg->safePrintf ( "Crawl has reached maxToCrawl "
"limit." );
}
if ( cx->m_spiderStatus == SP_MAXTOPROCESS ) {
*status = SP_MAXTOPROCESS;
return msg->safePrintf ( "Crawl has reached maxToProcess "
"limit." );
}
if ( cx->m_spiderStatus == SP_MAXROUNDS ) {
*status = SP_MAXROUNDS;
return msg->safePrintf ( "Crawl has reached maxCrawlRounds "
"limit." );
}
long now = getTimeGlobal();
// . 0 means not to RE-crawl
// . indicate if we are WAITING for next round...
if ( cx->m_collectiveRespiderFrequency > 0.0 &&
now < cx->m_spiderRoundStartTime ) {
*status = SP_ROUNDDONE;
return msg->safePrintf("Next crawl round to start "
"in %li seconds.",
cx->m_spiderRoundStartTime-now );
}
// if we sent an email simply because no urls
// were left and we are not recrawling!
if ( cx->m_collectiveRespiderFrequency <= 0.0 &&
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
*status = SP_COMPLETED;
return msg->safePrintf("Crawl has completed and no "
"repeatCrawl is scheduled.");
}
if ( cx->m_spiderStatus == SP_ROUNDDONE ) {
*status = SP_ROUNDDONE;
return msg->safePrintf ( "Crawl round completed.");
}
if ( ! cx->m_spideringEnabled ) {
*status = SP_PAUSED;
return msg->safePrintf("Crawl paused.");
}
if ( ! g_conf.m_spideringEnabled ) {
*status = SP_ADMIN_PAUSED;
return msg->safePrintf("All crawling temporarily paused "
"by root administrator for "
"maintenance.");
}
// if spiderdb is empty for this coll, then no url
// has been added to spiderdb yet.. either seed or spot
CrawlInfo *cg = &cx->m_globalCrawlInfo;
if ( cg->m_pageDownloadAttempts == 0 ) {
*status = SP_NOURLS;
return msg->safePrintf("Crawl is waiting for urls.");
}
if ( cx->m_spiderStatus == SP_INITIALIZING ) {
*status = SP_INITIALIZING;
return msg->safePrintf("Crawl is initializing.");
}
// otherwise in progress?
*status = SP_INPROGRESS;
return msg->safePrintf("Crawl is in progress.");
}
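For illustration, a hypothetical caller of the new helper; since each branch returns the result of SafeBuf::safePrintf(), the function returns true when the message was written successfully:

    SafeBuf msg;
    long status = -1;
    getSpiderStatusMsg ( cr , &msg , &status );
    // e.g. status = 6 (SP_PAUSED), msg = "Crawl paused."
    log("crawlbot: coll=%s status=%li msg=%s",
        cr->m_coll , status , msg.getBufStart() );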

@@ -32,7 +32,22 @@ bool updateCrawlInfo ( CollectionRec *cr ,
void (* callback)(void *state) ,
bool useCache = true ) ;
// . values for CollectionRec::m_spiderStatus
// . reasons why crawl is not happening
#define SP_INITIALIZING 0
#define SP_MAXROUNDS 1 // hit max rounds limit
#define SP_MAXTOCRAWL 2 // hit max to crawl limit
#define SP_MAXTOPROCESS 3 // hit max to process limit
#define SP_ROUNDDONE 4 // spider round is done
#define SP_NOURLS 5 // initializing
#define SP_PAUSED 6 // user paused spider
#define SP_INPROGRESS 7 // it is going on!
#define SP_ADMIN_PAUSED 8 // g_conf.m_spideringEnabled = false
#define SP_COMPLETED 9 // crawl is done, and no repeatCrawl is scheduled
bool getSpiderStatusMsg ( class CollectionRec *cx ,
class SafeBuf *msg ,
long *status ) ;
// Overview of Spider
//