fix notification system.

This commit is contained in:
mwells
2013-10-01 17:30:06 -06:00
parent 3fecb3eb1f
commit 45941e4b2f
5 changed files with 151 additions and 24 deletions

@ -2809,6 +2809,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"<input type=submit name=submit value=OK>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Max Page Process Successes:</b>"
"</td><td>"
@ -2818,6 +2819,24 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"</td>"
"</tr>"
"<tr>"
"<td><b>Notification Email:</b>"
"</td><td>"
"<input type=text name=notifyemail "
"size=20 value=\"%s\"> "
"<input type=submit name=submit value=OK>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Notification URL:</b>"
"</td><td>"
"<input type=text name=notifyurl "
"size=20 value=\"%s\"> "
"<input type=submit name=submit value=OK>"
"</td>"
"</tr>"
"<tr><td>"
"Use Robots.txt when crawling? "
"</td><td>"
@ -2862,6 +2881,9 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
, cr->m_diffbotMaxToCrawl
, cr->m_diffbotMaxToProcess
, cr->m_notifyEmail.getBufStart()
, cr->m_notifyUrl.getBufStart()
, urtYes
, urtNo
);

@ -8204,6 +8204,7 @@ void Parms::init ( ) {
m++;
m->m_cgi = "notifyemail";
m->m_title = "notify email";
m->m_xml = "notifyEmail";
m->m_off = (char *)&cr.m_notifyEmail - x;
m->m_type = TYPE_SAFEBUF;
@ -8214,6 +8215,7 @@ void Parms::init ( ) {
m->m_cgi = "notifyurl";
m->m_xml = "notifyUrl";
m->m_title = "notify url";
m->m_off = (char *)&cr.m_notifyUrl - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
@ -8322,6 +8324,7 @@ void Parms::init ( ) {
m++;
m->m_cgi = "dbmaxtocrawl";
m->m_title = "diffbot max to crawl";
m->m_xml = "diffbotMaxToCrawl";
m->m_off = (char *)&cr.m_diffbotMaxToCrawl - x;
m->m_type = TYPE_LONG_LONG;
@ -8330,6 +8333,7 @@ void Parms::init ( ) {
m++;
m->m_cgi = "dbmaxtoprocess";
m->m_title = "diffbot max to process";
m->m_xml = "diffbotMaxToProcess";
m->m_off = (char *)&cr.m_diffbotMaxToProcess - x;
m->m_type = TYPE_LONG_LONG;

@ -2743,6 +2743,9 @@ bool gotMxIp ( EmailInfo *ei ) ;
void gotMxIpWrapper ( void *state , long ip ) {
EmailInfo *ei = (EmailInfo *)state;
// i guess set it
ei->m_mxIp = ip;
// handle it
if ( ! gotMxIp ( ei ) ) return;
// did not block, call callback
ei->m_callback ( ei->m_state );
@ -2778,6 +2781,11 @@ bool sendEmail ( class EmailInfo *ei ) {
}
// TcpServer completion callback fired when the SMTP send finishes (or fails).
// "state" is the EmailInfo* passed to the tcp send; "sock" is the connection
// socket, whose read buffer holds the SMTP server's reply text, if any.
// Logs any error / reply, then hands control back to the original caller.
void doneSendingEmailWrapper ( void *state , TcpSocket *sock ) {
if ( g_errno )
log("crawlbot: error sending email = %s",mstrerror(g_errno));
// log the reply
// NOTE(review): m_readBuf is logged as %s — assumes it is NUL-terminated;
// confirm TcpServer guarantees that for read buffers.
if ( sock && sock->m_readBuf )
log("crawlbot: got socket reply=%s", sock->m_readBuf);
EmailInfo *ei = (EmailInfo *)state;
// resume the waiting caller; g_errno is intentionally left set so the
// callback can observe the send status.
ei->m_callback ( ei->m_state );
}
@ -2786,14 +2794,30 @@ void doneSendingEmailWrapper ( void *state , TcpSocket *sock ) {
// returns false if blocked, true otherwise
bool gotMxIp ( EmailInfo *ei ) {
// error?
if ( g_errno ) {
log("crawlbot: error getting MX IP to send email alert for "
"%s = %s",
ei->m_mxDomain.getBufStart(),
mstrerror(g_errno));
return true;
}
// wtf?
if ( ei->m_mxIp == 0 ) {
log("crawlbot: got bad MX ip of 0 for %s",
ei->m_mxDomain.getBufStart());
return true;
}
// label alloc'd mem with gotmxip in case of mem leak
SafeBuf sb;//("gotmxip");
// helo line
sb.safePrintf("HELO %s\r\n",ei->m_dom);
// mail line
sb.safePrintf( "MAIL from:<%s>\r\n", ei->m_fromAddress.getBufStart());
sb.safePrintf( "MAIL FROM:<%s>\r\n", ei->m_fromAddress.getBufStart());
// to line
sb.safePrintf( "RCPT to:<%s>\r\n", ei->m_toAddress.getBufStart());
sb.safePrintf( "RCPT TO:<%s>\r\n", ei->m_toAddress.getBufStart());
// data
sb.safePrintf( "DATA\r\n");
// body
@ -2803,12 +2827,13 @@ bool gotMxIp ( EmailInfo *ei ) {
sb.safePrintf( "\r\n");
sb.safePrintf( "%s", ei->m_body.getBufStart() );
// quit
sb.safePrintf( "\r\n.\r\nQUIT\r\n");
sb.safePrintf( "\r\n.\r\nQUIT\r\n\r\n");
// send the message
TcpServer *ts = g_httpServer.getTcp();
log ( LOG_WARN, "crawlbot: Sending email to %s:\n %s",
log ( LOG_WARN, "crawlbot: Sending email to %s (MX IP=%s):\n %s",
ei->m_toAddress.getBufStart(),
ei->m_body.getBufStart() );
iptoa(ei->m_mxIp),
sb.getBufStart() );
// make a temp string
SafeBuf mxIpStr;
mxIpStr.safePrintf("%s",iptoa(ei->m_mxIp) );
@ -2819,7 +2844,7 @@ bool gotMxIp ( EmailInfo *ei ) {
sb.getCapacity(),
sb.getLength(),
sb.getLength(),
NULL,//h,
ei,//NULL,//h,
doneSendingEmailWrapper,
60*1000,
100*1024,

Rdb.cpp (18 changes)

@ -2249,10 +2249,26 @@ bool Rdb::addRecord ( collnum_t collnum,
SpiderReply *rr = (SpiderReply *)sreq;
// log that. why isn't this undoling always
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: rdb: got spider reply"
logf(LOG_DEBUG,"rdb: rdb: got spider reply"
" for uh48=%llu",rr->getUrlHash48());
// add the reply
sc->addSpiderReply(rr);
// don't actually add it if "fake". i.e. if it
// was an internal error of some sort... this will
// make it try over and over again i guess...
long indexCode = rr->m_errCode;
if ( indexCode == EINTERNALERROR ||
indexCode == EABANDONED ||
indexCode == EHITCRAWLLIMIT ||
indexCode == EHITPROCESSLIMIT ) {
log("rdb: not adding spiderreply to rdb "
"because "
"it was an internal error for uh48=%llu "
"errCode = %s",
rr->getUrlHash48(),
mstrerror(indexCode));
m_tree.deleteNode(tn,false);
}
}
// clear errors from adding to SpiderCache
g_errno = 0;

@ -1821,10 +1821,16 @@ bool XmlDoc::indexDoc ( ) {
log("build: %s had internal error = %s. adding spider error reply.",
m_firstUrl.m_url,mstrerror(g_errno));
m_indexCode = EINTERNALERROR;//g_errno;
m_indexCodeValid = true;
if ( ! m_indexCodeValid ) {
m_indexCode = EINTERNALERROR;//g_errno;
m_indexCodeValid = true;
}
// if this is EABANDONED or EHITCRAWLLIMIT or EHITPROCESSLIMIT
// then this should make a "fake" reply to release the url spider
// lock in SpiderLoop::m_lockTable.
SpiderReply *nsr = getNewSpiderReply();
if ( nsr == (void *)-1) { char *xx=NULL;*xx=0; }
SafeBuf metaList;
metaList.pushChar(RDB_SPIDERDB);
@ -1902,37 +1908,46 @@ bool XmlDoc::indexDoc2 ( ) {
// this is just for this collection, from all hosts in network
m_cr->m_globalCrawlInfo.m_pageDownloadSuccesses >= //Attempts >=
m_cr->m_diffbotMaxToCrawl ) {
m_cr->m_spideringEnabled = false;
// set the code to badness
m_indexCode = EHITCRAWLLIMIT;//EABANDONED;
m_indexCodeValid = true;
log("diffbot: abandoning url because we hit crawl limit "
"of %lli. downloaded %lli. Disabling spiders."
,m_cr->m_diffbotMaxToCrawl
,m_cr->m_globalCrawlInfo.m_pageDownloadSuccesses
);
m_indexCode = EHITCRAWLLIMIT;//EABANDONED;
m_indexCodeValid = true;
g_errno = m_indexCode;
// if spiders already off..
if ( ! m_cr->m_spideringEnabled ) return true;
// do not repeat call sendNotification()
m_cr->m_spideringEnabled = false;
// this returns false if it would block, so we ret fals
if ( ! sendNotification() ) return false;
// it didn't block
g_errno = m_indexCode;
return true;
}
// likewise if we hit the max processing limit...
if ( ! m_isDiffbotJSONObject &&
m_cr->m_globalCrawlInfo.m_pageProcessSuccesses >= // Attempts >=
m_cr->m_diffbotMaxToProcess ) {
// if spiders are enabled send a notification then turn
// them off
if ( m_cr->m_spideringEnabled ){
// do not repeat call sendNotification()
m_cr->m_spideringEnabled = false;
// this returns false if it would block, so we ret fals
if ( ! sendNotification() ) return false;
}
// set the code to badness
m_indexCode = EHITPROCESSLIMIT;//EABANDONED;
m_indexCodeValid = true;
log("diffbot: abandoning url because we hit process limit "
"of %lli. processed %lli. Disabling spiders."
, m_cr->m_diffbotMaxToProcess
, m_cr->m_globalCrawlInfo.m_pageProcessSuccesses
);
m_indexCode = EHITPROCESSLIMIT;//EABANDONED;
m_indexCodeValid = true;
g_errno = m_indexCode;
// if spiders already off...
if ( ! m_cr->m_spideringEnabled ) return true;
// turn them off and send notification (email or url)
m_cr->m_spideringEnabled = false;
// this returns false if it would block, so we ret fals
if ( ! sendNotification() ) return false;
// it didn't block
g_errno = m_indexCode;
return true;
}
@ -19367,6 +19382,39 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
if (! indexCode || indexCode == (void *)-1)
return (SpiderReply *)indexCode;
// if it has been abandoned early, i.e. cut-off, then we should
// add a "fake" spider reply to release the lock in
// SpiderLoop::m_lockTable at least. see Spider.cpp's addSpiderReply()
// to see what parts of this are relevant.
if ( *indexCode == EABANDONED ||
*indexCode == EHITCRAWLLIMIT ||
*indexCode == EHITPROCESSLIMIT ) {
// clear everything
m_newsr.reset();
// get from spider request, if there
long firstIp = 0;
if ( m_oldsrValid ) firstIp = m_oldsr.m_firstIp;
// otherwise, wtf?
if ( ! firstIp )
log("build: no first ip to make fake spiderReply. "
"injected?");
// we at least need this
m_newsr.m_firstIp = firstIp;
Url *fu = getFirstUrl();
// this is the lock key
long long uh48 = hash64b(fu->m_url) & 0x0000ffffffffffffLL;
m_newsr.setKey ( firstIp, 0 , uh48 , false );
// tell it we are fake and not to really add us to
// spiderdb, but just to release the lock
m_newsr.m_errCode = *indexCode;
m_newsrValid = true;
return &m_newsr;
}
TagRec *gr = getTagRec();
if ( ! gr || gr == (TagRec *)-1 ) return (SpiderReply *)gr;
@ -41838,6 +41886,10 @@ char *XmlDoc::hashJSON ( HashTableX *table ) {
void doneSendingNotifyEmailWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->m_notifyBlocked--;
// error?
log("build: email notification status: %s",mstrerror(g_errno));
// ignore it for rest
g_errno = 0;
// wait for post url to get done
if ( THIS->m_notifyBlocked > 0 ) return;
// all done
@ -41847,6 +41899,8 @@ void doneSendingNotifyEmailWrapper ( void *state ) {
void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->m_notifyBlocked--;
// error?
log("build: url notification status: %s",mstrerror(g_errno));
// wait for post url to get done
if ( THIS->m_notifyBlocked > 0 ) return;
// all done
@ -41860,6 +41914,8 @@ void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
// or maxToProcess limitation.
bool XmlDoc::sendNotification ( ) {
setStatus("sending notification");
char *email = m_cr->m_notifyEmail.getBufStart();
char *url = m_cr->m_notifyUrl.getBufStart();
@ -41867,6 +41923,8 @@ bool XmlDoc::sendNotification ( ) {
if ( m_notifyBlocked != 0 ) { char *xx=NULL;*xx=0; }
if ( email && email[0] ) {
log("build: sending email notification to %s for coll \"%s\"",
email,m_cr->m_coll);
SafeBuf msg;
msg.safePrintf("Your crawl \"%s\" "
"has hit a limitation and has "
@ -41876,7 +41934,7 @@ bool XmlDoc::sendNotification ( ) {
EmailInfo *ei = &m_emailInfo;
ei->m_toAddress.safeStrcpy ( email );
ei->m_toAddress.nullTerm();
ei->m_fromAddress.safePrintf("crawlbot");
ei->m_fromAddress.safePrintf("support@diffbot.com");
ei->m_subject.safePrintf("crawl paused");
ei->m_body.safePrintf("Your crawl for collection \"%s\" "
"has been paused because it hit "
@ -41891,6 +41949,8 @@ bool XmlDoc::sendNotification ( ) {
}
if ( url && url[0] ) {
log("build: sending url notification to %s for coll \"%s\"",
url,m_cr->m_coll);
// GET request
if ( ! g_httpServer.getDoc ( url ,
0 , // ip