Mirror of https://github.com/privacore/open-source-search-engine.git
notification bug fixes. use new "crawlDelay" parm. output that too.
@@ -418,7 +418,7 @@ class CollectionRec {
 	SafeBuf m_notifyUrl;
 	// the default respider frequency for all rows in url filters
 	float m_collectiveRespiderFrequency;
-	long m_collectiveSpiderWait;
+	float m_collectiveCrawlDelay;//SpiderWait;
 	// an alternate name for the collection. we tend to create
 	// collection names as a random sequence of hex digits. this
 	// will allow a user to give them an alternate name.
@@ -1974,8 +1974,8 @@ static class HelpItem s_his[] = {
 	{"repeat","Specify number of days as floating point to "
 	 "recrawl the pages. Set to 0.0 to NOT repeat the crawl."},
 
-	{"wait","Wait this many milliseconds between crawling urls from the "
-	 "same IP address."},
+	{"crawlDelay","Wait this many seconds between crawling urls from the "
+	 "same IP address. Can be a floating point number."},
 
 	{"deleteCrawl","Same as delete."},
 	{"resetCrawl","Same as delete."},
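Aside (not part of the commit): both help entries describe the same per-IP politeness interval in different units. A minimal standalone check, values illustrative:

	#include <cassert>

	int main ( ) {
		long  waitMs         = 250;   // legacy: wait=250 (milliseconds)
		float crawlDelaySecs = 0.25f; // new: crawlDelay=0.25 (seconds)
		// crawlDelay=0.25 means the same interval as wait=250
		assert ( (long)(crawlDelaySecs * 1000.0f) == waitMs );
		return 0;
	}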
@@ -2773,7 +2773,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 	     "\"maxCrawlRounds\":%li,\n"
 	     "\"obeyRobots\":%li,\n"
 	     "\"repeatCrawl\":%f,\n"
-	     "\"crawlWaitMS\":%li,\n"
+	     "\"crawlDelay\":%f,\n"
 	     "\"onlyProcessIfNew\":%li,\n"
 	     //,cx->m_coll
 	     , cx->m_diffbotCrawlName.getBufStart()
@@ -2795,7 +2795,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 	     , (long)cx->m_maxCrawlRounds
 	     , (long)cx->m_useRobotsTxt
 	     , cx->m_collectiveRespiderFrequency
-	     , cx->m_collectiveSpiderWait
+	     , cx->m_collectiveCrawlDelay
 	     , (long)cx->m_diffbotOnlyProcessIfNew
 	     );
 	sb.safePrintf("\"seeds\":\"");
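For illustration only (values hypothetical), the JSON fragment this safePrintf emits after the change looks roughly like this; %f prints six decimal places:

	"maxCrawlRounds":1,
	"obeyRobots":1,
	"repeatCrawl":7.000000,
	"crawlDelay":0.250000,
	"onlyProcessIfNew":0,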
@@ -3301,10 +3301,10 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 	     "</tr>"
 
 	     "<tr>"
-	     "<td><b>Crawl Wait (ms):</b> "
+	     "<td><b>Crawl Delay (seconds):</b> "
 	     "</td><td>"
-	     "<input type=text name=wait "
-	     "size=9 value=%li> "
+	     "<input type=text name=crawlDelay "
+	     "size=9 value=%f> "
 	     "<input type=submit name=submit value=OK>"
 	     "</td>"
 	     "</tr>"
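Substituting a hypothetical 0.25 for the %f, the reworked form row renders roughly as:

	<tr><td><b>Crawl Delay (seconds):</b> </td><td><input type=text name=crawlDelay size=9 value=0.250000> <input type=submit name=submit value=OK></td></tr>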
@@ -3394,7 +3394,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 	     , isNewYes
 	     , isNewNo
 
-	     , cr->m_collectiveSpiderWait
+	     , cr->m_collectiveCrawlDelay
 
 
 	     , cr->m_maxToCrawl
@@ -4177,6 +4177,11 @@ bool resetUrlFilters ( CollectionRec *cr ) {
 		sc->m_waitingTreeNeedsRebuild = true;
 	}
 
+	// convert from seconds to milliseconds. default is 250ms?
+	long wait = (long)(cr->m_collectiveCrawlDelay * 1000.0);
+	// default to 250ms i guess. -1 means unset i think.
+	if ( cr->m_collectiveCrawlDelay < 0.0 ) wait = 250;
+
 	// make the gigablast regex table just "default" so it does not
 	// filtering, but accepts all urls. we will add code to pass the urls
 	// through m_diffbotUrlCrawlPattern alternatively. if that itself
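A standalone sketch of the conversion this hunk adds (the helper name is hypothetical; the logic mirrors the diff): the parm is a float in seconds, the spider's per-IP wait tables take long milliseconds, and a negative delay means unset and falls back to 250ms.

	#include <cstdio>

	// crawlDelayToWaitMs is a hypothetical wrapper around the two
	// lines added in resetUrlFilters() above.
	static long crawlDelayToWaitMs ( float crawlDelaySecs ) {
		// default to 250ms when unset (negative means unset)
		if ( crawlDelaySecs < 0.0 ) return 250;
		// convert from seconds to milliseconds
		return (long)(crawlDelaySecs * 1000.0);
	}

	int main ( ) {
		printf("%ld\n", crawlDelayToWaitMs( 0.25f)); // 250
		printf("%ld\n", crawlDelayToWaitMs(-1.0f )); // 250 (unset)
		printf("%ld\n", crawlDelayToWaitMs( 2.5f )); // 2500
		return 0;
	}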
@@ -4185,7 +4190,7 @@ bool resetUrlFilters ( CollectionRec *cr ) {
 		cr->m_regExs[i].purge();
 		cr->m_spiderPriorities[i] = 0;
 		cr->m_maxSpidersPerRule [i] = 10;
-		cr->m_spiderIpWaits [i] = cr->m_collectiveSpiderWait;//250
+		cr->m_spiderIpWaits [i] = wait;
 		cr->m_spiderIpMaxSpiders[i] = 7; // keep it respectful
 		cr->m_spidersEnabled [i] = 1;
 		cr->m_spiderFreqs [i] =cr->m_collectiveRespiderFrequency;
@@ -4360,10 +4365,10 @@ bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,
 		cr->m_needsSave = 1;
 	}
 
-	long crawlWait = hr->getLong("wait",-1);
-	if ( crawlWait >= 0 ) {
-		cr->m_collectiveSpiderWait = crawlWait;
-	}
+	float delay = hr->getFloat("crawlDelay",-1.0);
+	//long crawlWait = hr->getLong("wait",-1);
+	if ( delay >= 0.0 )
+		cr->m_collectiveCrawlDelay = delay;
 
 	long onlyProcessNew = hr->getLong("onlyProcessNew",-1);
 	if ( onlyProcessNew != -1 ) {
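A standalone sketch of the sentinel-default pattern used above. The real code calls hr->getFloat("crawlDelay",-1.0) on an HttpRequest; this getFloat stand-in is hypothetical. Passing -1.0 as the default lets an absent parm be told apart from every legal (non-negative) delay, so only an explicitly supplied value overwrites the stored one.

	#include <cstdio>
	#include <cstdlib>

	// hypothetical stand-in: NULL means the parm was absent
	static float getFloat ( const char *val , float def ) {
		return val ? (float)atof(val) : def;
	}

	int main ( ) {
		float stored = 0.25f;                     // current crawl delay
		float delay  = getFloat ( NULL , -1.0f ); // parm absent
		if ( delay >= 0.0f ) stored = delay;      // not taken: stays 0.25
		printf("crawlDelay=%f\n", stored);
		return 0;
	}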
Parms.cpp (14 changed lines)
@@ -8257,14 +8257,14 @@ void Parms::init ( ) {
 	m->m_units = "days";
 	m++;
 
-	m->m_title = "collective spider wait (ms)";
-	m->m_cgi = "csw";
-	m->m_xml = "collectiveSpiderWait";
-	m->m_off = (char *)&cr.m_collectiveSpiderWait - x;
-	m->m_type = TYPE_LONG;
-	m->m_def = "250"; // 250 ms
+	m->m_title = "collective crawl delay (seconds)";
+	m->m_cgi = "ccd";
+	m->m_xml = "collectiveCrawlDelay";
+	m->m_off = (char *)&cr.m_collectiveCrawlDelay - x;
+	m->m_type = TYPE_FLOAT;
+	m->m_def = ".250"; // 250 ms
 	m->m_page = PAGE_NONE;
-	m->m_units = "milliseconds";
+	m->m_units = "seconds";
 	m++;
 
 	m->m_cgi = "dbppp";
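For context, a minimal sketch of the offset-based parm table Parms::init() is populating here; Rec and Parm below are illustrative stand-ins for the real CollectionRec and Parm classes:

	#include <cstdio>

	// Each parm stores the byte offset of its member inside the
	// collection record, so generic framework code can read or write
	// any parm given only the record pointer and the table entry.
	struct Rec  { float m_collectiveCrawlDelay; };
	struct Parm { const char *m_xml; long m_off; };

	int main ( ) {
		Rec  cr;
		char *x = (char *)&cr; // same trick as "&cr.m_... - x" above
		Parm p;
		p.m_xml = "collectiveCrawlDelay";
		p.m_off = (char *)&cr.m_collectiveCrawlDelay - x;
		// generic write through the offset, as the framework would do
		// when applying the ".250" (seconds) default:
		*(float *)((char *)&cr + p.m_off) = 0.250f;
		printf("%s = %f\n", p.m_xml, cr.m_collectiveCrawlDelay);
		return 0;
	}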
Spider.cpp (19 changed lines)
@@ -2599,7 +2599,11 @@ static void doledWrapper ( void *state ) {
 
 	long long now = gettimeofdayInMilliseconds();
 	long long diff = now - THIS->m_msg4Start;
-	log("spider: adding to doledb took %llims",diff);
+	// we add recs to doledb using msg1 to keep things fast because
+	// msg4 has a delay of 500ms in it. but even then, msg1 can take
+	// 6ms or more just because of load issues.
+	if ( diff > 10 )
+		log("spider: adding to doledb took %llims",diff);
 
 	// . we added a rec to doledb for the firstIp in m_waitingTreeKey, so
 	// now go to the next node in the wait tree.
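A standalone sketch of the new conditional logging (nowMs is a hypothetical stand-in for gettimeofdayInMilliseconds): time the add and log only when it crosses the 10ms threshold, rather than on every call.

	#include <cstdio>
	#include <sys/time.h>

	static long long nowMs ( ) {
		struct timeval tv;
		gettimeofday ( &tv , NULL );
		return (long long)tv.tv_sec * 1000 + tv.tv_usec / 1000;
	}

	int main ( ) {
		long long start = nowMs();
		// ... the timed work: adding recs to doledb via msg1 ...
		long long diff = nowMs() - start;
		// quiet in the common fast case, loud when load makes it slow
		if ( diff > 10 )
			printf("spider: adding to doledb took %llims\n", diff);
		return 0;
	}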
@@ -3960,8 +3964,9 @@ void doneSendingNotification ( void *state ) {
 	// sanity
 	if ( cr->m_spiderStatus == 0 ) { char *xx=NULL;*xx=0; }
 
+	// i guess each host advances its own round... so take this out
 	// sanity check
-	if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
+	//if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
 
 	// advance round if that round has completed, or there are no
 	// more urls to spider. if we hit maxToProcess/maxToCrawl then
@@ -4027,6 +4032,10 @@ void doneSendingNotification ( void *state ) {
 
 bool sendNotificationForCollRec ( CollectionRec *cr ) {
 
+	// only host #0 sends emails
+	if ( g_hostdb.m_myHost->m_hostId != 0 )
+		return true;
+
 	// do not send email for maxrounds hit, it will send a round done
 	// email for that. otherwise we end up calling doneSendingEmail()
 	// twice and increment the round twice
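A standalone sketch of the guard being added (the Host struct is an illustrative stand-in for g_hostdb's host records): in a multi-host cluster exactly one host, hostId 0, performs cluster-wide side effects like sending notification emails; every other host returns early.

	#include <cstdio>

	struct Host { long m_hostId; };

	static bool sendNotification ( const Host *myHost ) {
		// only host #0 sends emails
		if ( myHost->m_hostId != 0 )
			return true; // nothing to do on this host
		printf("host 0: sending crawl notification\n");
		return true;
	}

	int main ( ) {
		Host h0 = { 0 };
		Host h3 = { 3 };
		sendNotification ( &h3 ); // no-op on host 3
		sendNotification ( &h0 ); // host 0 actually sends
		return 0;
	}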
@@ -6144,8 +6153,8 @@ void handleRequest12 ( UdpSlot *udpSlot , long niceness ) {
 			    0 , //dataSize
 			    1 )){ // niceness
 		// tree is dumping or something, probably ETRYAGAIN
-		msg = "error adding neg rec to doledb";
-		log("spider: %s %s",msg,mstrerror(g_errno));
+		if ( g_errno != ETRYAGAIN ) {msg = "error adding neg rec to doledb"; log("spider: %s %s",msg,mstrerror(g_errno));
+		}
 		//char *xx=NULL;*xx=0;
 		us->sendErrorReply ( udpSlot , g_errno );
 		return;
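A standalone sketch of the reworked logging (EAGAIN stands in for the codebase's ETRYAGAIN, and this g_errno is local to the sketch): the transient tree-is-dumping case stays quiet; only unexpected errors get logged before the error reply goes out.

	#include <cstdio>
	#include <cstring>
	#include <cerrno>

	static int g_errno = EAGAIN; // pretend the tree is dumping

	int main ( ) {
		// expected transient error: say nothing
		if ( g_errno != EAGAIN ) {
			const char *msg = "error adding neg rec to doledb";
			printf("spider: %s %s\n", msg, strerror(g_errno));
		}
		// the error reply to the udp slot is still sent either way
		return 0;
	}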
@@ -10015,7 +10024,7 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
 	// crawl is done.
 
 	// only host #0 sends alaerts
-	if ( g_hostdb.getMyHost()->m_hostId != 0 ) return;
+	//if ( g_hostdb.getMyHost()->m_hostId != 0 ) return;
 
 	// and we've examined at least one url. to prevent us from
 	// sending a notification if we haven't spidered anything