fix notification system.
This commit is contained in:
@ -2809,6 +2809,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
"<input type=submit name=submit value=OK>"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Max Page Process Successes:</b>"
|
||||
"</td><td>"
|
||||
@ -2818,6 +2819,24 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Notification Email:</b>"
|
||||
"</td><td>"
|
||||
"<input type=text name=notifyemail "
|
||||
"size=20 value=\"%s\"> "
|
||||
"<input type=submit name=submit value=OK>"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Notification URL:</b>"
|
||||
"</td><td>"
|
||||
"<input type=text name=notifyurl "
|
||||
"size=20 value=\"%s\"> "
|
||||
"<input type=submit name=submit value=OK>"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr><td>"
|
||||
"Use Robots.txt when crawling? "
|
||||
"</td><td>"
|
||||
@ -2862,6 +2881,9 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
, cr->m_diffbotMaxToCrawl
|
||||
, cr->m_diffbotMaxToProcess
|
||||
|
||||
, cr->m_notifyEmail.getBufStart()
|
||||
, cr->m_notifyUrl.getBufStart()
|
||||
|
||||
, urtYes
|
||||
, urtNo
|
||||
);
|
||||
|
@ -8204,6 +8204,7 @@ void Parms::init ( ) {
|
||||
m++;
|
||||
|
||||
m->m_cgi = "notifyemail";
|
||||
m->m_title = "notify email";
|
||||
m->m_xml = "notifyEmail";
|
||||
m->m_off = (char *)&cr.m_notifyEmail - x;
|
||||
m->m_type = TYPE_SAFEBUF;
|
||||
@ -8214,6 +8215,7 @@ void Parms::init ( ) {
|
||||
|
||||
m->m_cgi = "notifyurl";
|
||||
m->m_xml = "notifyUrl";
|
||||
m->m_title = "notify url";
|
||||
m->m_off = (char *)&cr.m_notifyUrl - x;
|
||||
m->m_type = TYPE_SAFEBUF;
|
||||
m->m_page = PAGE_NONE;
|
||||
@ -8322,6 +8324,7 @@ void Parms::init ( ) {
|
||||
m++;
|
||||
|
||||
m->m_cgi = "dbmaxtocrawl";
|
||||
m->m_title = "diffbot max to crawl";
|
||||
m->m_xml = "diffbotMaxToCrawl";
|
||||
m->m_off = (char *)&cr.m_diffbotMaxToCrawl - x;
|
||||
m->m_type = TYPE_LONG_LONG;
|
||||
@ -8330,6 +8333,7 @@ void Parms::init ( ) {
|
||||
m++;
|
||||
|
||||
m->m_cgi = "dbmaxtoprocess";
|
||||
m->m_title = "diffbot max to process";
|
||||
m->m_xml = "diffbotMaxToProcess";
|
||||
m->m_off = (char *)&cr.m_diffbotMaxToProcess - x;
|
||||
m->m_type = TYPE_LONG_LONG;
|
||||
|
@ -2743,6 +2743,9 @@ bool gotMxIp ( EmailInfo *ei ) ;
|
||||
|
||||
void gotMxIpWrapper ( void *state , long ip ) {
|
||||
EmailInfo *ei = (EmailInfo *)state;
|
||||
// i guess set it
|
||||
ei->m_mxIp = ip;
|
||||
// handle it
|
||||
if ( ! gotMxIp ( ei ) ) return;
|
||||
// did not block, call callback
|
||||
ei->m_callback ( ei->m_state );
|
||||
@ -2778,6 +2781,11 @@ bool sendEmail ( class EmailInfo *ei ) {
|
||||
}
|
||||
|
||||
// Callback run when the email send attempt is done. gotMxIp() passes this
// function, together with the EmailInfo as "state", to its send call.
//
// state : the EmailInfo describing the email (cast back below)
// sock  : the socket used for the send; may be NULL, which is why it is
//         checked before its read buffer is logged
//
// Logs any error found in g_errno, logs whatever reply was read on the
// socket, then always invokes the EmailInfo's own callback, whether or not
// an error occurred. g_errno is not cleared here, so that callback can
// still inspect it.
void doneSendingEmailWrapper ( void *state , TcpSocket *sock ) {
|
||||
// report a failed send, but keep going so the callback below still runs
if ( g_errno )
|
||||
log("crawlbot: error sending email = %s",mstrerror(g_errno));
|
||||
// log the reply
// NOTE(review): m_readBuf is printed with %s, which assumes it is
// NUL-terminated -- confirm against TcpSocket
|
||||
if ( sock && sock->m_readBuf )
|
||||
log("crawlbot: got socket reply=%s", sock->m_readBuf);
|
||||
// hand control back to whoever requested the email
// NOTE(review): neither ei nor ei->m_callback is checked for NULL here
EmailInfo *ei = (EmailInfo *)state;
|
||||
ei->m_callback ( ei->m_state );
|
||||
}
|
||||
@ -2786,14 +2794,30 @@ void doneSendingEmailWrapper ( void *state , TcpSocket *sock ) {
|
||||
// returns false if blocked, true otherwise
|
||||
bool gotMxIp ( EmailInfo *ei ) {
|
||||
|
||||
// error?
|
||||
if ( g_errno ) {
|
||||
log("crawlbot: error getting MX IP to send email alert for "
|
||||
"%s = %s",
|
||||
ei->m_mxDomain.getBufStart(),
|
||||
mstrerror(g_errno));
|
||||
return true;
|
||||
}
|
||||
|
||||
// wtf?
|
||||
if ( ei->m_mxIp == 0 ) {
|
||||
log("crawlbot: got bad MX ip of 0 for %s",
|
||||
ei->m_mxDomain.getBufStart());
|
||||
return true;
|
||||
}
|
||||
|
||||
// label alloc'd mem with gotmxip in case of mem leak
|
||||
SafeBuf sb;//("gotmxip");
|
||||
// helo line
|
||||
sb.safePrintf("HELO %s\r\n",ei->m_dom);
|
||||
// mail line
|
||||
sb.safePrintf( "MAIL from:<%s>\r\n", ei->m_fromAddress.getBufStart());
|
||||
sb.safePrintf( "MAIL FROM:<%s>\r\n", ei->m_fromAddress.getBufStart());
|
||||
// to line
|
||||
sb.safePrintf( "RCPT to:<%s>\r\n", ei->m_toAddress.getBufStart());
|
||||
sb.safePrintf( "RCPT TO:<%s>\r\n", ei->m_toAddress.getBufStart());
|
||||
// data
|
||||
sb.safePrintf( "DATA\r\n");
|
||||
// body
|
||||
@ -2803,12 +2827,13 @@ bool gotMxIp ( EmailInfo *ei ) {
|
||||
sb.safePrintf( "\r\n");
|
||||
sb.safePrintf( "%s", ei->m_body.getBufStart() );
|
||||
// quit
|
||||
sb.safePrintf( "\r\n.\r\nQUIT\r\n");
|
||||
sb.safePrintf( "\r\n.\r\nQUIT\r\n\r\n");
|
||||
// send the message
|
||||
TcpServer *ts = g_httpServer.getTcp();
|
||||
log ( LOG_WARN, "crawlbot: Sending email to %s:\n %s",
|
||||
log ( LOG_WARN, "crawlbot: Sending email to %s (MX IP=%s):\n %s",
|
||||
ei->m_toAddress.getBufStart(),
|
||||
ei->m_body.getBufStart() );
|
||||
iptoa(ei->m_mxIp),
|
||||
sb.getBufStart() );
|
||||
// make a temp string
|
||||
SafeBuf mxIpStr;
|
||||
mxIpStr.safePrintf("%s",iptoa(ei->m_mxIp) );
|
||||
@ -2819,7 +2844,7 @@ bool gotMxIp ( EmailInfo *ei ) {
|
||||
sb.getCapacity(),
|
||||
sb.getLength(),
|
||||
sb.getLength(),
|
||||
NULL,//h,
|
||||
ei,//NULL,//h,
|
||||
doneSendingEmailWrapper,
|
||||
60*1000,
|
||||
100*1024,
|
||||
|
18
Rdb.cpp
18
Rdb.cpp
@ -2249,10 +2249,26 @@ bool Rdb::addRecord ( collnum_t collnum,
|
||||
SpiderReply *rr = (SpiderReply *)sreq;
|
||||
// log that. why isn't this undoling always
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
logf(LOG_DEBUG,"spider: rdb: got spider reply"
|
||||
logf(LOG_DEBUG,"rdb: rdb: got spider reply"
|
||||
" for uh48=%llu",rr->getUrlHash48());
|
||||
// add the reply
|
||||
sc->addSpiderReply(rr);
|
||||
// don't actually add it if "fake". i.e. if it
|
||||
// was an internal error of some sort... this will
|
||||
// make it try over and over again i guess...
|
||||
long indexCode = rr->m_errCode;
|
||||
if ( indexCode == EINTERNALERROR ||
|
||||
indexCode == EABANDONED ||
|
||||
indexCode == EHITCRAWLLIMIT ||
|
||||
indexCode == EHITPROCESSLIMIT ) {
|
||||
log("rdb: not adding spiderreply to rdb "
|
||||
"because "
|
||||
"it was an internal error for uh48=%llu "
|
||||
"errCode = %s",
|
||||
rr->getUrlHash48(),
|
||||
mstrerror(indexCode));
|
||||
m_tree.deleteNode(tn,false);
|
||||
}
|
||||
}
|
||||
// clear errors from adding to SpiderCache
|
||||
g_errno = 0;
|
||||
|
94
XmlDoc.cpp
94
XmlDoc.cpp
@ -1821,10 +1821,16 @@ bool XmlDoc::indexDoc ( ) {
|
||||
log("build: %s had internal error = %s. adding spider error reply.",
|
||||
m_firstUrl.m_url,mstrerror(g_errno));
|
||||
|
||||
m_indexCode = EINTERNALERROR;//g_errno;
|
||||
m_indexCodeValid = true;
|
||||
if ( ! m_indexCodeValid ) {
|
||||
m_indexCode = EINTERNALERROR;//g_errno;
|
||||
m_indexCodeValid = true;
|
||||
}
|
||||
|
||||
// if this is EABANDONED or EHITCRAWLLIMIT or EHITPROCESSLIMIT
|
||||
// then this should make a "fake" reply to release the url spider
|
||||
// lock in SpiderLoop::m_lockTable.
|
||||
SpiderReply *nsr = getNewSpiderReply();
|
||||
if ( nsr == (void *)-1) { char *xx=NULL;*xx=0; }
|
||||
|
||||
SafeBuf metaList;
|
||||
metaList.pushChar(RDB_SPIDERDB);
|
||||
@ -1902,37 +1908,46 @@ bool XmlDoc::indexDoc2 ( ) {
|
||||
// this is just for this collection, from all hosts in network
|
||||
m_cr->m_globalCrawlInfo.m_pageDownloadSuccesses >= //Attempts >=
|
||||
m_cr->m_diffbotMaxToCrawl ) {
|
||||
m_cr->m_spideringEnabled = false;
|
||||
// set the code to badness
|
||||
m_indexCode = EHITCRAWLLIMIT;//EABANDONED;
|
||||
m_indexCodeValid = true;
|
||||
log("diffbot: abandoning url because we hit crawl limit "
|
||||
"of %lli. downloaded %lli. Disabling spiders."
|
||||
,m_cr->m_diffbotMaxToCrawl
|
||||
,m_cr->m_globalCrawlInfo.m_pageDownloadSuccesses
|
||||
);
|
||||
m_indexCode = EHITCRAWLLIMIT;//EABANDONED;
|
||||
m_indexCodeValid = true;
|
||||
g_errno = m_indexCode;
|
||||
// if spiders already off..
|
||||
if ( ! m_cr->m_spideringEnabled ) return true;
|
||||
// do not repeat call sendNotification()
|
||||
m_cr->m_spideringEnabled = false;
|
||||
// this returns false if it would block, so we return false
|
||||
if ( ! sendNotification() ) return false;
|
||||
// it didn't block
|
||||
g_errno = m_indexCode;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// likewise if we hit the max processing limit...
|
||||
if ( ! m_isDiffbotJSONObject &&
|
||||
m_cr->m_globalCrawlInfo.m_pageProcessSuccesses >= // Attempts >=
|
||||
m_cr->m_diffbotMaxToProcess ) {
|
||||
// if spiders are enabled send a notification then turn
|
||||
// them off
|
||||
if ( m_cr->m_spideringEnabled ){
|
||||
// do not repeat call sendNotification()
|
||||
m_cr->m_spideringEnabled = false;
|
||||
// this returns false if it would block, so we return false
|
||||
if ( ! sendNotification() ) return false;
|
||||
}
|
||||
// set the code to badness
|
||||
m_indexCode = EHITPROCESSLIMIT;//EABANDONED;
|
||||
m_indexCodeValid = true;
|
||||
log("diffbot: abandoning url because we hit process limit "
|
||||
"of %lli. processed %lli. Disabling spiders."
|
||||
, m_cr->m_diffbotMaxToProcess
|
||||
, m_cr->m_globalCrawlInfo.m_pageProcessSuccesses
|
||||
);
|
||||
m_indexCode = EHITPROCESSLIMIT;//EABANDONED;
|
||||
m_indexCodeValid = true;
|
||||
g_errno = m_indexCode;
|
||||
// if spiders already off...
|
||||
if ( ! m_cr->m_spideringEnabled ) return true;
|
||||
// turn them off and send notification (email or url)
|
||||
m_cr->m_spideringEnabled = false;
|
||||
// this returns false if it would block, so we return false
|
||||
if ( ! sendNotification() ) return false;
|
||||
// it didn't block
|
||||
g_errno = m_indexCode;
|
||||
return true;
|
||||
}
|
||||
@ -19367,6 +19382,39 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
||||
if (! indexCode || indexCode == (void *)-1)
|
||||
return (SpiderReply *)indexCode;
|
||||
|
||||
|
||||
// if it has been abandoned early, i.e. cut-off, then we should
|
||||
// add a "fake" spider reply to release the lock in
|
||||
// SpiderLoop::m_lockTable at least. see Spider.cpp's addSpiderReply()
|
||||
// to see what parts of this are relevant.
|
||||
if ( *indexCode == EABANDONED ||
|
||||
*indexCode == EHITCRAWLLIMIT ||
|
||||
*indexCode == EHITPROCESSLIMIT ) {
|
||||
// clear everything
|
||||
m_newsr.reset();
|
||||
// get from spider request, if there
|
||||
long firstIp = 0;
|
||||
if ( m_oldsrValid ) firstIp = m_oldsr.m_firstIp;
|
||||
// otherwise, wtf?
|
||||
if ( ! firstIp )
|
||||
log("build: no first ip to make fake spiderReply. "
|
||||
"injected?");
|
||||
// we at least need this
|
||||
m_newsr.m_firstIp = firstIp;
|
||||
Url *fu = getFirstUrl();
|
||||
// this is the lock key
|
||||
long long uh48 = hash64b(fu->m_url) & 0x0000ffffffffffffLL;
|
||||
m_newsr.setKey ( firstIp, 0 , uh48 , false );
|
||||
// tell it we are fake and not to really add us to
|
||||
// spiderdb, but just to release the lock
|
||||
m_newsr.m_errCode = *indexCode;
|
||||
m_newsrValid = true;
|
||||
return &m_newsr;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
TagRec *gr = getTagRec();
|
||||
if ( ! gr || gr == (TagRec *)-1 ) return (SpiderReply *)gr;
|
||||
|
||||
@ -41838,6 +41886,10 @@ char *XmlDoc::hashJSON ( HashTableX *table ) {
|
||||
void doneSendingNotifyEmailWrapper ( void *state ) {
|
||||
XmlDoc *THIS = (XmlDoc *)state;
|
||||
THIS->m_notifyBlocked--;
|
||||
// error?
|
||||
log("build: email notification status: %s",mstrerror(g_errno));
|
||||
// ignore it for rest
|
||||
g_errno = 0;
|
||||
// wait for post url to get done
|
||||
if ( THIS->m_notifyBlocked > 0 ) return;
|
||||
// all done
|
||||
@ -41847,6 +41899,8 @@ void doneSendingNotifyEmailWrapper ( void *state ) {
|
||||
void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
|
||||
XmlDoc *THIS = (XmlDoc *)state;
|
||||
THIS->m_notifyBlocked--;
|
||||
// error?
|
||||
log("build: url notification status: %s",mstrerror(g_errno));
|
||||
// wait for post url to get done
|
||||
if ( THIS->m_notifyBlocked > 0 ) return;
|
||||
// all done
|
||||
@ -41860,6 +41914,8 @@ void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
|
||||
// or maxToProcess limitation.
|
||||
bool XmlDoc::sendNotification ( ) {
|
||||
|
||||
setStatus("sending notification");
|
||||
|
||||
char *email = m_cr->m_notifyEmail.getBufStart();
|
||||
char *url = m_cr->m_notifyUrl.getBufStart();
|
||||
|
||||
@ -41867,6 +41923,8 @@ bool XmlDoc::sendNotification ( ) {
|
||||
if ( m_notifyBlocked != 0 ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
if ( email && email[0] ) {
|
||||
log("build: sending email notification to %s for coll \"%s\"",
|
||||
email,m_cr->m_coll);
|
||||
SafeBuf msg;
|
||||
msg.safePrintf("Your crawl \"%s\" "
|
||||
"has hit a limitation and has "
|
||||
@ -41876,7 +41934,7 @@ bool XmlDoc::sendNotification ( ) {
|
||||
EmailInfo *ei = &m_emailInfo;
|
||||
ei->m_toAddress.safeStrcpy ( email );
|
||||
ei->m_toAddress.nullTerm();
|
||||
ei->m_fromAddress.safePrintf("crawlbot");
|
||||
ei->m_fromAddress.safePrintf("support@diffbot.com");
|
||||
ei->m_subject.safePrintf("crawl paused");
|
||||
ei->m_body.safePrintf("Your crawl for collection \"%s\" "
|
||||
"has been paused because it hit "
|
||||
@ -41891,6 +41949,8 @@ bool XmlDoc::sendNotification ( ) {
|
||||
}
|
||||
|
||||
if ( url && url[0] ) {
|
||||
log("build: sending url notification to %s for coll \"%s\"",
|
||||
url,m_cr->m_coll);
|
||||
// GET request
|
||||
if ( ! g_httpServer.getDoc ( url ,
|
||||
0 , // ip
|
||||
|
Reference in New Issue
Block a user