notification bug fixes. use new "crawlDelay" parm. output that too.

Matt Wells
2013-10-28 21:20:44 -07:00
parent 54d3375a00
commit 7bc5c30b16
4 changed files with 40 additions and 26 deletions
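
In short: the old integer "wait" parameter (milliseconds between crawl attempts on the same IP) is replaced by a floating-point "crawlDelay" parameter given in seconds, which is now also echoed in the crawlbot JSON output, and resetUrlFilters() converts it back to a per-IP wait in milliseconds, falling back to 250 ms when it is unset (negative). A minimal sketch of that conversion; the helper name below is made up for illustration and is not part of the commit:

    // sketch only: mirrors the seconds-to-milliseconds conversion done in
    // resetUrlFilters(). a negative delay means "unset" and falls back to
    // the 250 ms default.
    static long crawlDelayToWaitMs ( float crawlDelaySecs ) {
            if ( crawlDelaySecs < 0.0 ) return 250;
            return (long)(crawlDelaySecs * 1000.0);
    }
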

@@ -418,7 +418,7 @@ class CollectionRec {
SafeBuf m_notifyUrl;
// the default respider frequency for all rows in url filters
float m_collectiveRespiderFrequency;
long m_collectiveSpiderWait;
float m_collectiveCrawlDelay;//SpiderWait;
// an alternate name for the collection. we tend to create
// collection names as a random sequence of hex digits. this
// will allow a user to give them an alternate name.

@@ -1974,8 +1974,8 @@ static class HelpItem s_his[] = {
{"repeat","Specify number of days as floating point to "
"recrawl the pages. Set to 0.0 to NOT repeat the crawl."},
{"wait","Wait this many milliseconds between crawling urls from the "
"same IP address."},
{"crawlDelay","Wait this many seconds between crawling urls from the "
"same IP address. Can be a floating point number."},
{"deleteCrawl","Same as delete."},
{"resetCrawl","Same as delete."},
@@ -2773,7 +2773,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"\"maxCrawlRounds\":%li,\n"
"\"obeyRobots\":%li,\n"
"\"repeatCrawl\":%f,\n"
"\"crawlWaitMS\":%li,\n"
"\"crawlDelay\":%f,\n"
"\"onlyProcessIfNew\":%li,\n"
//,cx->m_coll
, cx->m_diffbotCrawlName.getBufStart()
@@ -2795,7 +2795,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
, (long)cx->m_maxCrawlRounds
, (long)cx->m_useRobotsTxt
, cx->m_collectiveRespiderFrequency
, cx->m_collectiveSpiderWait
, cx->m_collectiveCrawlDelay
, (long)cx->m_diffbotOnlyProcessIfNew
);
sb.safePrintf("\"seeds\":\"");
@@ -3301,10 +3301,10 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"</tr>"
"<tr>"
"<td><b>Crawl Wait (ms):</b> "
"<td><b>Crawl Delay (seconds):</b> "
"</td><td>"
"<input type=text name=wait "
"size=9 value=%li> "
"<input type=text name=crawlDelay "
"size=9 value=%f> "
"<input type=submit name=submit value=OK>"
"</td>"
"</tr>"
@@ -3394,7 +3394,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
, isNewYes
, isNewNo
, cr->m_collectiveSpiderWait
, cr->m_collectiveCrawlDelay
, cr->m_maxToCrawl
@@ -4177,6 +4177,11 @@ bool resetUrlFilters ( CollectionRec *cr ) {
sc->m_waitingTreeNeedsRebuild = true;
}
// convert from seconds to milliseconds. default is 250ms?
long wait = (long)(cr->m_collectiveCrawlDelay * 1000.0);
// default to 250ms i guess. -1 means unset i think.
if ( cr->m_collectiveCrawlDelay < 0.0 ) wait = 250;
// make the gigablast regex table just "default" so it does not
// filtering, but accepts all urls. we will add code to pass the urls
// through m_diffbotUrlCrawlPattern alternatively. if that itself
@@ -4185,7 +4190,7 @@ bool resetUrlFilters ( CollectionRec *cr ) {
cr->m_regExs[i].purge();
cr->m_spiderPriorities[i] = 0;
cr->m_maxSpidersPerRule [i] = 10;
cr->m_spiderIpWaits [i] = cr->m_collectiveSpiderWait;//250
cr->m_spiderIpWaits [i] = wait;
cr->m_spiderIpMaxSpiders[i] = 7; // keep it respectful
cr->m_spidersEnabled [i] = 1;
cr->m_spiderFreqs [i] =cr->m_collectiveRespiderFrequency;
@@ -4360,10 +4365,10 @@ bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,
cr->m_needsSave = 1;
}
long crawlWait = hr->getLong("wait",-1);
if ( crawlWait >= 0 ) {
cr->m_collectiveSpiderWait = crawlWait;
}
float delay = hr->getFloat("crawlDelay",-1.0);
//long crawlWait = hr->getLong("wait",-1);
if ( delay >= 0.0 )
cr->m_collectiveCrawlDelay = delay;
long onlyProcessNew = hr->getLong("onlyProcessNew",-1);
if ( onlyProcessNew != -1 ) {

@@ -8257,14 +8257,14 @@ void Parms::init ( ) {
m->m_units = "days";
m++;
m->m_title = "collective spider wait (ms)";
m->m_cgi = "csw";
m->m_xml = "collectiveSpiderWait";
m->m_off = (char *)&cr.m_collectiveSpiderWait - x;
m->m_type = TYPE_LONG;
m->m_def = "250"; // 250 ms
m->m_title = "collective crawl delay (seconds)";
m->m_cgi = "ccd";
m->m_xml = "collectiveCrawlDelay";
m->m_off = (char *)&cr.m_collectiveCrawlDelay - x;
m->m_type = TYPE_FLOAT;
m->m_def = ".250"; // 250 ms
m->m_page = PAGE_NONE;
m->m_units = "milliseconds";
m->m_units = "seconds";
m++;
m->m_cgi = "dbppp";

@@ -2599,7 +2599,11 @@ static void doledWrapper ( void *state ) {
long long now = gettimeofdayInMilliseconds();
long long diff = now - THIS->m_msg4Start;
log("spider: adding to doledb took %llims",diff);
// we add recs to doledb using msg1 to keep things fast because
// msg4 has a delay of 500ms in it. but even then, msg1 can take
// 6ms or more just because of load issues.
if ( diff > 10 )
log("spider: adding to doledb took %llims",diff);
// . we added a rec to doledb for the firstIp in m_waitingTreeKey, so
// now go to the next node in the wait tree.
@@ -3960,8 +3964,9 @@ void doneSendingNotification ( void *state ) {
// sanity
if ( cr->m_spiderStatus == 0 ) { char *xx=NULL;*xx=0; }
// i guess each host advances its own round... so take this out
// sanity check
if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
//if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
// advance round if that round has completed, or there are no
// more urls to spider. if we hit maxToProcess/maxToCrawl then
@@ -4027,6 +4032,10 @@ bool doneSendingNotification ( void *state ) {
bool sendNotificationForCollRec ( CollectionRec *cr ) {
// only host #0 sends emails
if ( g_hostdb.m_myHost->m_hostId != 0 )
return true;
// do not send email for maxrounds hit, it will send a round done
// email for that. otherwise we end up calling doneSendingEmail()
// twice and increment the round twice
@@ -6144,8 +6153,8 @@ void handleRequest12 ( UdpSlot *udpSlot , long niceness ) {
0 , //dataSize
1 )){ // niceness
// tree is dumping or something, probably ETRYAGAIN
msg = "error adding neg rec to doledb";
log("spider: %s %s",msg,mstrerror(g_errno));
if ( g_errno != ETRYAGAIN ) {
msg = "error adding neg rec to doledb";
log("spider: %s %s",msg,mstrerror(g_errno));
}
//char *xx=NULL;*xx=0;
us->sendErrorReply ( udpSlot , g_errno );
return;
@@ -10015,7 +10024,7 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// crawl is done.
// only host #0 sends alerts
if ( g_hostdb.getMyHost()->m_hostId != 0 ) return;
//if ( g_hostdb.getMyHost()->m_hostId != 0 ) return;
// and we've examined at least one url. to prevent us from
// sending a notification if we haven't spidered anything