Mirror of https://github.com/privacore/open-source-search-engine.git
notification bug fixes. use new "crawlDelay" parm. output that too.
@@ -418,7 +418,7 @@ class CollectionRec {
 	SafeBuf m_notifyUrl;
 	// the default respider frequency for all rows in url filters
 	float m_collectiveRespiderFrequency;
-	long m_collectiveSpiderWait;
+	float m_collectiveCrawlDelay;//SpiderWait;
 	// an alternate name for the collection. we tend to create
 	// collection names as a random sequence of hex digits. this
 	// will allow a user to give them an alternate name.
@@ -1974,8 +1974,8 @@ static class HelpItem s_his[] = {
 	{"repeat","Specify number of days as floating point to "
 	 "recrawl the pages. Set to 0.0 to NOT repeat the crawl."},
 
-	{"wait","Wait this many milliseconds between crawling urls from the "
-	 "same IP address."},
+	{"crawlDelay","Wait this many seconds between crawling urls from the "
+	 "same IP address. Can be a floating point number."},
 
 	{"deleteCrawl","Same as delete."},
 	{"resetCrawl","Same as delete."},
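Aside (not part of the commit): both help entries describe the same per-IP politeness interval in different units. A minimal standalone check, values illustrative:

	#include <cassert>

	int main ( ) {
		long  waitMs         = 250;   // legacy: wait=250 (milliseconds)
		float crawlDelaySecs = 0.25f; // new: crawlDelay=0.25 (seconds)
		// crawlDelay=0.25 means the same interval as wait=250
		assert ( (long)(crawlDelaySecs * 1000.0f) == waitMs );
		return 0;
	}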
@@ -2773,7 +2773,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 	     "\"maxCrawlRounds\":%li,\n"
 	     "\"obeyRobots\":%li,\n"
 	     "\"repeatCrawl\":%f,\n"
-	     "\"crawlWaitMS\":%li,\n"
+	     "\"crawlDelay\":%f,\n"
 	     "\"onlyProcessIfNew\":%li,\n"
 	     //,cx->m_coll
 	     , cx->m_diffbotCrawlName.getBufStart()
@@ -2795,7 +2795,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 	     , (long)cx->m_maxCrawlRounds
 	     , (long)cx->m_useRobotsTxt
 	     , cx->m_collectiveRespiderFrequency
-	     , cx->m_collectiveSpiderWait
+	     , cx->m_collectiveCrawlDelay
 	     , (long)cx->m_diffbotOnlyProcessIfNew
 	     );
 	sb.safePrintf("\"seeds\":\"");
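For illustration only (values hypothetical), the JSON fragment this safePrintf emits after the change looks roughly like this; %f prints six decimal places:

	"maxCrawlRounds":1,
	"obeyRobots":1,
	"repeatCrawl":7.000000,
	"crawlDelay":0.250000,
	"onlyProcessIfNew":0,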
@@ -3301,10 +3301,10 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 	     "</tr>"
 
 	     "<tr>"
-	     "<td><b>Crawl Wait (ms):</b> "
+	     "<td><b>Crawl Delay (seconds):</b> "
 	     "</td><td>"
-	     "<input type=text name=wait "
-	     "size=9 value=%li> "
+	     "<input type=text name=crawlDelay "
+	     "size=9 value=%f> "
 	     "<input type=submit name=submit value=OK>"
 	     "</td>"
 	     "</tr>"
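Substituting a hypothetical 0.25 for the %f, the reworked form row renders roughly as:

	<tr><td><b>Crawl Delay (seconds):</b> </td><td><input type=text name=crawlDelay size=9 value=0.250000> <input type=submit name=submit value=OK></td></tr>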
@@ -3394,7 +3394,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 	     , isNewYes
 	     , isNewNo
 
-	     , cr->m_collectiveSpiderWait
+	     , cr->m_collectiveCrawlDelay
 
 
 	     , cr->m_maxToCrawl
@@ -4177,6 +4177,11 @@ bool resetUrlFilters ( CollectionRec *cr ) {
 		sc->m_waitingTreeNeedsRebuild = true;
 	}
 
+	// convert from seconds to milliseconds. default is 250ms?
+	long wait = (long)(cr->m_collectiveCrawlDelay * 1000.0);
+	// default to 250ms i guess. -1 means unset i think.
+	if ( cr->m_collectiveCrawlDelay < 0.0 ) wait = 250;
+
 	// make the gigablast regex table just "default" so it does not
 	// filtering, but accepts all urls. we will add code to pass the urls
 	// through m_diffbotUrlCrawlPattern alternatively. if that itself
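A standalone sketch of the conversion this hunk adds (the helper name is hypothetical; the logic mirrors the diff): the parm is a float in seconds, the spider's per-IP wait tables take long milliseconds, and a negative delay means unset and falls back to 250ms.

	#include <cstdio>

	// crawlDelayToWaitMs is a hypothetical wrapper around the two
	// lines added in resetUrlFilters() above.
	static long crawlDelayToWaitMs ( float crawlDelaySecs ) {
		// default to 250ms when unset (negative means unset)
		if ( crawlDelaySecs < 0.0 ) return 250;
		// convert from seconds to milliseconds
		return (long)(crawlDelaySecs * 1000.0);
	}

	int main ( ) {
		printf("%ld\n", crawlDelayToWaitMs( 0.25f)); // 250
		printf("%ld\n", crawlDelayToWaitMs(-1.0f )); // 250 (unset)
		printf("%ld\n", crawlDelayToWaitMs( 2.5f )); // 2500
		return 0;
	}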
@@ -4185,7 +4190,7 @@ bool resetUrlFilters ( CollectionRec *cr ) {
 		cr->m_regExs[i].purge();
 		cr->m_spiderPriorities[i] = 0;
 		cr->m_maxSpidersPerRule [i] = 10;
-		cr->m_spiderIpWaits [i] = cr->m_collectiveSpiderWait;//250
+		cr->m_spiderIpWaits [i] = wait;
 		cr->m_spiderIpMaxSpiders[i] = 7; // keep it respectful
 		cr->m_spidersEnabled [i] = 1;
 		cr->m_spiderFreqs [i] =cr->m_collectiveRespiderFrequency;
@@ -4360,10 +4365,10 @@ bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,
 		cr->m_needsSave = 1;
 	}
 
-	long crawlWait = hr->getLong("wait",-1);
-	if ( crawlWait >= 0 ) {
-		cr->m_collectiveSpiderWait = crawlWait;
-	}
+	float delay = hr->getFloat("crawlDelay",-1.0);
+	//long crawlWait = hr->getLong("wait",-1);
+	if ( delay >= 0.0 )
+		cr->m_collectiveCrawlDelay = delay;
 
 	long onlyProcessNew = hr->getLong("onlyProcessNew",-1);
 	if ( onlyProcessNew != -1 ) {
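A standalone sketch of the sentinel-default pattern used above. The real code calls hr->getFloat("crawlDelay",-1.0) on an HttpRequest; this getFloat stand-in is hypothetical. Passing -1.0 as the default lets an absent parm be told apart from every legal (non-negative) delay, so only an explicitly supplied value overwrites the stored one.

	#include <cstdio>
	#include <cstdlib>

	// hypothetical stand-in: NULL means the parm was absent
	static float getFloat ( const char *val , float def ) {
		return val ? (float)atof(val) : def;
	}

	int main ( ) {
		float stored = 0.25f;                     // current crawl delay
		float delay  = getFloat ( NULL , -1.0f ); // parm absent
		if ( delay >= 0.0f ) stored = delay;      // not taken: stays 0.25
		printf("crawlDelay=%f\n", stored);
		return 0;
	}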
Parms.cpp (14 changed lines)
@@ -8257,14 +8257,14 @@ void Parms::init ( ) {
 	m->m_units = "days";
 	m++;
 
-	m->m_title = "collective spider wait (ms)";
-	m->m_cgi = "csw";
-	m->m_xml = "collectiveSpiderWait";
-	m->m_off = (char *)&cr.m_collectiveSpiderWait - x;
-	m->m_type = TYPE_LONG;
-	m->m_def = "250"; // 250 ms
+	m->m_title = "collective crawl delay (seconds)";
+	m->m_cgi = "ccd";
+	m->m_xml = "collectiveCrawlDelay";
+	m->m_off = (char *)&cr.m_collectiveCrawlDelay - x;
+	m->m_type = TYPE_FLOAT;
+	m->m_def = ".250"; // 250 ms
 	m->m_page = PAGE_NONE;
-	m->m_units = "milliseconds";
+	m->m_units = "seconds";
 	m++;
 
 	m->m_cgi = "dbppp";
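For context, a minimal sketch of the offset-based parm table Parms::init() is populating here; Rec and Parm below are illustrative stand-ins for the real CollectionRec and Parm classes:

	#include <cstdio>

	// Each parm stores the byte offset of its member inside the
	// collection record, so generic framework code can read or write
	// any parm given only the record pointer and the table entry.
	struct Rec  { float m_collectiveCrawlDelay; };
	struct Parm { const char *m_xml; long m_off; };

	int main ( ) {
		Rec  cr;
		char *x = (char *)&cr; // same trick as "&cr.m_... - x" above
		Parm p;
		p.m_xml = "collectiveCrawlDelay";
		p.m_off = (char *)&cr.m_collectiveCrawlDelay - x;
		// generic write through the offset, as the framework would do
		// when applying the ".250" (seconds) default:
		*(float *)((char *)&cr + p.m_off) = 0.250f;
		printf("%s = %f\n", p.m_xml, cr.m_collectiveCrawlDelay);
		return 0;
	}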
Spider.cpp (19 changed lines)
@@ -2599,7 +2599,11 @@ static void doledWrapper ( void *state ) {
 
 	long long now = gettimeofdayInMilliseconds();
 	long long diff = now - THIS->m_msg4Start;
-	log("spider: adding to doledb took %llims",diff);
+	// we add recs to doledb using msg1 to keep things fast because
+	// msg4 has a delay of 500ms in it. but even then, msg1 can take
+	// 6ms or more just because of load issues.
+	if ( diff > 10 )
+		log("spider: adding to doledb took %llims",diff);
 
 	// . we added a rec to doledb for the firstIp in m_waitingTreeKey, so
 	// now go to the next node in the wait tree.
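A standalone sketch of the new conditional logging (nowMs is a hypothetical stand-in for gettimeofdayInMilliseconds): time the add and log only when it crosses the 10ms threshold, rather than on every call.

	#include <cstdio>
	#include <sys/time.h>

	static long long nowMs ( ) {
		struct timeval tv;
		gettimeofday ( &tv , NULL );
		return (long long)tv.tv_sec * 1000 + tv.tv_usec / 1000;
	}

	int main ( ) {
		long long start = nowMs();
		// ... the timed work: adding recs to doledb via msg1 ...
		long long diff = nowMs() - start;
		// quiet in the common fast case, loud when load makes it slow
		if ( diff > 10 )
			printf("spider: adding to doledb took %llims\n", diff);
		return 0;
	}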
@@ -3960,8 +3964,9 @@ void doneSendingNotification ( void *state ) {
 	// sanity
 	if ( cr->m_spiderStatus == 0 ) { char *xx=NULL;*xx=0; }
 
+	// i guess each host advances its own round... so take this out
 	// sanity check
-	if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
+	//if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
 
 	// advance round if that round has completed, or there are no
 	// more urls to spider. if we hit maxToProcess/maxToCrawl then
@@ -4027,6 +4032,10 @@ void doneSendingNotification ( void *state ) {
 
 bool sendNotificationForCollRec ( CollectionRec *cr ) {
 
+	// only host #0 sends emails
+	if ( g_hostdb.m_myHost->m_hostId != 0 )
+		return true;
+
 	// do not send email for maxrounds hit, it will send a round done
 	// email for that. otherwise we end up calling doneSendingEmail()
 	// twice and increment the round twice
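A standalone sketch of the guard being added (the Host struct is an illustrative stand-in for g_hostdb's host records): in a multi-host cluster exactly one host, hostId 0, performs cluster-wide side effects like sending notification emails; every other host returns early.

	#include <cstdio>

	struct Host { long m_hostId; };

	static bool sendNotification ( const Host *myHost ) {
		// only host #0 sends emails
		if ( myHost->m_hostId != 0 )
			return true; // nothing to do on this host
		printf("host 0: sending crawl notification\n");
		return true;
	}

	int main ( ) {
		Host h0 = { 0 };
		Host h3 = { 3 };
		sendNotification ( &h3 ); // no-op on host 3
		sendNotification ( &h0 ); // host 0 actually sends
		return 0;
	}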
@@ -6144,8 +6153,8 @@ void handleRequest12 ( UdpSlot *udpSlot , long niceness ) {
 			    0 , //dataSize
 			    1 )){ // niceness
 		// tree is dumping or something, probably ETRYAGAIN
-		msg = "error adding neg rec to doledb";
-		log("spider: %s %s",msg,mstrerror(g_errno));
+		if ( g_errno != ETRYAGAIN ) {msg = "error adding neg rec to doledb"; log("spider: %s %s",msg,mstrerror(g_errno));
+		}
 		//char *xx=NULL;*xx=0;
 		us->sendErrorReply ( udpSlot , g_errno );
 		return;
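A standalone sketch of the reworked logging (EAGAIN stands in for the codebase's ETRYAGAIN, and this g_errno is local to the sketch): the transient tree-is-dumping case stays quiet; only unexpected errors get logged before the error reply goes out.

	#include <cstdio>
	#include <cstring>
	#include <cerrno>

	static int g_errno = EAGAIN; // pretend the tree is dumping

	int main ( ) {
		// expected transient error: say nothing
		if ( g_errno != EAGAIN ) {
			const char *msg = "error adding neg rec to doledb";
			printf("spider: %s %s\n", msg, strerror(g_errno));
		}
		// the error reply to the udp slot is still sent either way
		return 0;
	}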
@@ -10015,7 +10024,7 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
 	// crawl is done.
 
 	// only host #0 sends alaerts
-	if ( g_hostdb.getMyHost()->m_hostId != 0 ) return;
+	//if ( g_hostdb.getMyHost()->m_hostId != 0 ) return;
 
 	// and we've examined at least one url. to prevent us from
 	// sending a notification if we haven't spidered anything