Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot

commit 24e3b8cf52
Author: mwells
Date: 2013-10-09 13:07:22 -06:00
14 changed files with 474 additions and 144 deletions

@ -68,7 +68,7 @@
#include "HashTableX.h"
#include "RdbList.h"
#include "Rdb.h" // for RdbBase
#include "PingServer.h" // EmailInfo
// how many counts are in CrawlInfo below????
#define NUMCRAWLSTATS 8
@ -94,6 +94,13 @@ class CrawlInfo {
long m_lastUpdateTime;
// this is non-zero if urls are available to be spidered right now.
long m_hasUrlsReadyToSpider;
// have we sent out email/webhook notifications saying the crawl has no
// urls currently in the ready queue (doledb) to spider?
char m_sentCrawlDoneAlert;
void reset() { memset ( this , 0 , sizeof(CrawlInfo) ); };
//bool print (class SafeBuf *sb ) ;
//bool setFromSafeBuf (class SafeBuf *sb ) ;
@ -400,7 +407,7 @@ class CollectionRec {
//SafeBuf m_diffbotApiList;//QueryString;
//SafeBuf m_diffbotUrlCrawlPattern;
//SafeBuf m_diffbotUrlProcessPattern;
//SafeBuf m_diffbotPageProcessPattern;
SafeBuf m_diffbotPageProcessPattern;
//SafeBuf m_diffbotClassify;
//char m_diffbotClassify;
//char m_useDiffbot;
@ -424,6 +431,7 @@ class CollectionRec {
CrawlInfo m_globalCrawlInfo;
// last time we computed global crawl info
//time_t m_globalCrawlInfoUpdateTime;
EmailInfo m_emailInfo;
// for counting replies
long m_replies;
long m_requests;
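
The new m_sentCrawlDoneAlert flag is driven entirely from the Spider.cpp hunks below; in outline (all names from this commit, condensed for illustration):

// on host #0, after gotCrawlInfoReply() has aggregated every host's stats:
if ( ! cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider &&
     ! cr->m_localCrawlInfo.m_sentCrawlDoneAlert )
	sendNotification ( &cr->m_emailInfo );
// doneSendingNotification() then sets m_sentCrawlDoneAlert = 1, and any
// host that reports urls ready to spider clears it back to 0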

@ -129,7 +129,8 @@ bool HttpServer::getDoc ( char *url ,
char *proto ,
bool doPost ,
char *cookie ,
char *additionalHeader ) {
char *additionalHeader ,
char *fullRequest ) {
// sanity
if ( ip == -1 )
log("http: you probably didn't mean to set ip=-1 did you? "
@ -152,24 +153,37 @@ bool HttpServer::getDoc ( char *url ,
tcp = &m_ssltcp;
defPort = 443;
}
// this returns false and sets g_errno on error
if ( ! r.set ( url , offset , size , ifModifiedSince ,
userAgent , proto , doPost , cookie ,
additionalHeader ) ) return true;
if ( g_conf.m_logDebugSpider )
log("spider: httprequest = %s", r.getRequest());
char *req = NULL;
long reqSize;
// this returns false and sets g_errno on error
if ( ! fullRequest ) {
if ( ! r.set ( url , offset , size , ifModifiedSince ,
userAgent , proto , doPost , cookie ,
additionalHeader ) ) return true;
reqSize = r.getRequestLen();
req = (char *) mdup ( r.getRequest() , reqSize,"HttpServer");
}
else {
// reqSize excludes the trailing \0
reqSize = gbstrlen(fullRequest);
req = (char *) mdup ( fullRequest , reqSize,"HttpServer");
}
// . get the request from the static buffer and dup it
// . return true and set g_errno on error
if ( ! req ) return true;
long hostLen ;
long port = defPort;
char *host = getHostFast ( url , &hostLen , &port );
// . get the request from the static buffer and dup it
// . return true and set g_errno on error
long reqSize = r.getRequestLen();
char *req = (char *) mdup ( r.getRequest() , reqSize,"HttpServer");
if ( ! req ) return true;
if ( g_conf.m_logDebugSpider )
log("spider: httprequest = %s", req );
// do we have an ip to send to? assume not
if ( proxyIp ) { ip = proxyIp ; port = proxyPort; }
// special NULL case
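The new fullRequest path is exercised by the Mandrill emailer later in this commit. A minimal caller sketch, with a hypothetical URL and callback; the argument order follows the header hunk below, and the url is still used by getHostFast() to pick the host and port:

static void gotPageWrapper ( void *state , TcpSocket *s ) { /* resume here */ }

static bool pingExample ( ) {
	SafeBuf req;
	req.safePrintf ( "GET /ping HTTP/1.0\r\n"
	                 "Host: example.com\r\n"
	                 "Connection: close\r\n\r\n" );
	char *uu = "http://example.com/ping";
	// pass the pre-built mime (and any post data) as fullRequest;
	// getDoc() mdup()'s it verbatim and skips HttpRequest::set()
	if ( ! g_httpServer.getDoc ( uu ,
	                             0 ,        // ip
	                             0 ,        // offset
	                             -1 ,       // size
	                             false ,    // ifModifiedSince
	                             NULL ,     // state
	                             gotPageWrapper ,
	                             60*1000 ,  // timeout
	                             0 ,        // proxyIp
	                             0 ,        // proxyPort
	                             100*1024 , // maxTextDocLen
	                             100*1024 , // maxOtherDocLen
	                             NULL ,     // userAgent
	                             "HTTP/1.0" , // proto
	                             false ,    // doPost
	                             NULL ,     // cookie
	                             NULL ,     // additionalHeader
	                             req.getBufStart() ) ) // fullRequest
		return false; // blocked; gotPageWrapper fires on completion
	// getDoc() returned true, so g_errno is set
	return true;
}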

@ -96,7 +96,9 @@ class HttpServer {
char *proto = "HTTP/1.0" ,
bool doPost = false ,
char *cookie = NULL ,
char *additionalHeader = NULL ); // does not include \r\n
char *additionalHeader = NULL , // does not include \r\n
// specify your own mime and post data here...
char *fullRequest = NULL );
bool getDoc ( long ip,
long port,

@ -323,7 +323,7 @@ void * operator new [] (size_t size) throw (std::bad_alloc) {
if ( g_mem.m_used + size >= g_mem.m_maxMem &&
g_mem.m_maxMem > 1000000 ) {
log("mem: new(%i): Out of memory.", size );
throw bad_alloc();
throw std::bad_alloc();
//throw 1;
}
#ifdef _EFENCE_

@ -1875,6 +1875,9 @@ static class HelpItem s_his[] = {
{"notifyurl","Fetch this URL when crawl hits "
"the maxtocrawl or maxtoprocess limit."},
{"urt","Use robots.txt?"},
{"pageprocesspattern","List of || separated strings. If the page "
"contains any of these then we send it to diffbot for processing. "
"If this is empty we send all pages to diffbot for processing."},
//{"dbapilist","Special list of diffbot API urls. The URL Filters "
// "will display these options in a drop down menu. "
// "Example (unencoded): "
@ -2056,16 +2059,30 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
cr->m_notifyEmail.set(email);
cr->m_notifyEmail.nullTerm();
}
else {
cr->m_notifyEmail.purge();
}
char *url = hr->getString("notifyurl",NULL,NULL);
if ( url ) {
cr->m_notifyUrl.set(url);
cr->m_notifyUrl.nullTerm();
}
else {
cr->m_notifyUrl.purge();
}
long pause = hr->getLong("pause",-1);
if ( pause == 0 ) cr->m_spideringEnabled = 1;
if ( pause == 1 ) cr->m_spideringEnabled = 0;
long urt = hr->getLong("urt",-1);
if ( urt != -1 ) cr->m_useRobotsTxt = urt;
char *ppp = hr->getString("pageprocesspattern",NULL);
if ( ppp ) {
cr->m_diffbotPageProcessPattern.set(ppp);
cr->m_diffbotPageProcessPattern.nullTerm();
}
else {
cr->m_diffbotPageProcessPattern.purge();
}
// this is a cast, so just return simple response
return g_httpServer.sendDynamicPage (socket,"OK",2);
}
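With the new purge() branches, a crawlbot config request now fully specifies these fields: whatever it omits is cleared. A hypothetical example (collection name, parameter spelling of the coll field, and values are invented for illustration):

GET /crawlbot?c=mycoll&pause=0&urt=1&notifyemail=dev@example.com&pageprocesspattern=price+%7C%7C+cart

A later request that omits pageprocesspattern (or notifyemail/notifyurl) purges the stored value, so clients should resend every field they want to keep.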
@ -2804,6 +2821,15 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
//
"<tr>"
"<td><b>Page Process Pattern:</b> "
"</td><td>"
"<input type=text name=pageprocesspattern "
"size=20 value=\"%s\"> "
"<input type=submit name=submit value=OK>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Max Page Download Successes:</b> "
"</td><td>"
@ -2881,6 +2907,8 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
, cr->m_coll
, cr->m_coll
, cr->m_diffbotPageProcessPattern.getBufStart()
, cr->m_diffbotMaxToCrawl
, cr->m_diffbotMaxToProcess
@ -3291,7 +3319,6 @@ CollectionRec *addNewDiffbotColl ( char *addColl , HttpRequest *hr ) {
cr->m_diffbotApiQueryString.set ( apiQueryString );
cr->m_diffbotUrlCrawlPattern.set ( urlCrawlPattern );
cr->m_diffbotUrlProcessPattern.set ( urlProcessPattern );
cr->m_diffbotPageProcessPattern.set ( pageProcessPattern );
cr->m_diffbotClassify = classify;
// let's make these all NULL terminated strings
@ -3303,7 +3330,9 @@ CollectionRec *addNewDiffbotColl ( char *addColl , HttpRequest *hr ) {
cr->m_diffbotPageProcessPattern.nullTerm();
*/
// bring this back
cr->m_diffbotPageProcessPattern.set ( "" );
cr->m_diffbotPageProcessPattern.nullTerm();
// do not spider more than this many urls total. -1 means no max.
cr->m_diffbotMaxToCrawl = 100000;

@ -29,6 +29,7 @@
#include "Placedb.h"
#include "Sections.h"
//#include "Msg0.h" // g_termlistCache
#include "Msg13.h"
bool printNumAbbr ( SafeBuf &p, long long vvv ) {
float val = (float)vvv;

@ -2879,3 +2879,216 @@ bool gotMxIp ( EmailInfo *ei ) {
return true;
}
static void gotMandrillReplyWrapper ( void *state , TcpSocket *s ) {
EmailInfo *ei = (EmailInfo *)state;
ei->m_callback ( ei->m_state );
}
// mailchimp http mail api
bool sendEmailThroughMandrill ( class EmailInfo *ei ) {
// this is often set from XmlDoc.cpp::indexDoc()
g_errno = 0;
SafeBuf sb;
// then the message to send
sb.safePrintf(
"POST /api/1.0/messages/send-template.json"
" HTTP/1.0\r\n"
"Accept: image/gif, image/x-xbitmap, image/jpeg, "
"image/pjpeg, application/x-shockwave-flash, "
"application/msword, */*\r\n"
"Accept-Language: en-us\r\n"
"Content-Type: application/x-www-form-urlencoded\r\n"
"Accept-Encoding: gzip, deflate\r\n"
"User-Agent: Mozilla/4.0 "
"(compatible; MSIE 6.0; Windows 98; Win 9x 4.90)\r\n"
"Host: mandrillapp.com\r\n" // www.t-mobile.com
"Content-Length: xxx\r\n"
//"Connection: Keep-Alive\r\n"
"Connection: close\r\n"
"Cookie: \r\n"
"Cache-Control: no-cache\r\n\r\n"
);
//
// post data
//
char *to = ei->m_toAddress.getBufStart();
char *from = ei->m_fromAddress.getBufStart();
sb.safePrintf( "{\"key\":\"GhWT0UpcVBl7kmumrt9dqg\","
"\"template_name\":\"crawl-finished\","
"\"template_content\": [],"
"\"message\": {"
"\"to\": ["
"{"
"\"email\":\"%s\""
"}"
"],"
"\"from_email\":\"%s\","
"\"headers\": {"
"\"Reply-To\":\"%s\""
"},"
"\"bcc_address\":\"%s\","
"\"global_merge_vars\":["
"{"
"\"name\":\"CRAWLNAME\","
"\"content\":\"%s\""
"}"
"]"
"}"
"}"
, to
, from
, from
, from
, ei->m_cr->m_coll
);
// make sure ends in \0
sb.nullTerm();
// send the full request to mandrill over https
char *uu = "https://mandrillapp.com/";
if ( ! g_httpServer.getDoc ( uu,
0, // ip
0 , // offset
-1 , // size
false , // m_ifModifiedSince
ei , // state
gotMandrillReplyWrapper , //
60*1000 , // timeout
0 , // m_proxyIp
0 , // m_proxyPort
100*1024 , // m_maxTextDocLen
100*1024 , // m_maxOtherDocLen
NULL, // user agent
"HTTP/1.0" , //proto
true, // post?
NULL, // cookie
NULL, // additional header
sb.getBufStart() ) ) // full request
return false;
// must have been an error
log("net: Got error getting page from mandrill: %s.",
mstrerror(g_errno));
// ignore it
g_errno = 0;
// always call this at the end
return true;
}
/////////////////////////////
//
// send two notifications, email and webhook
//
/////////////////////////////
void doneSendingNotifyEmailWrapper ( void *state ) {
EmailInfo *ei = (EmailInfo *)state;
ei->m_notifyBlocked--;
// error?
log("build: email notification status: %s",mstrerror(g_errno));
// ignore it for rest
g_errno = 0;
// wait for post url to get done
if ( ei->m_notifyBlocked > 0 ) return;
// unmark it
ei->m_inUse = false;
// all done
ei->m_finalCallback ( ei->m_finalState );
}
void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
EmailInfo *ei = (EmailInfo *)state;
ei->m_notifyBlocked--;
// error?
log("build: url notification status: %s",mstrerror(g_errno));
// wait for email to get done
if ( ei->m_notifyBlocked > 0 ) return;
// unmark it
ei->m_inUse = false;
// all done
ei->m_finalCallback ( ei->m_finalState );
}
// . return false if would block, true otherwise
// . used to send email and get a url when a crawl hits a maxToCrawl
// or maxToProcess limitation.
bool sendNotification ( EmailInfo *ei ) {
if ( ei->m_inUse ) { char *xx=NULL;*xx=0; }
// caller must set this, as well as m_finalCallback/m_finalState
CollectionRec *cr = ei->m_cr;
char *email = cr->m_notifyEmail.getBufStart();
char *url = cr->m_notifyUrl.getBufStart();
// sanity check, can only call once
if ( ei->m_notifyBlocked != 0 ) { char *xx=NULL;*xx=0; }
ei->m_inUse = true;
if ( email && email[0] ) {
log("build: sending email notification to %s for coll \"%s\"",
email,cr->m_coll);
SafeBuf msg;
msg.safePrintf("Your crawl \"%s\" "
"has hit a limitation and has "
"been paused."
, cr->m_coll);
// use this
ei->m_toAddress.safeStrcpy ( email );
ei->m_toAddress.nullTerm();
ei->m_fromAddress.safePrintf("support@diffbot.com");
/*
ei->m_subject.safePrintf("crawl paused");
ei->m_body.safePrintf("Your crawl for collection \"%s\" "
"has been paused because it hit "
"a maxPagesToCrawl or maxPagesToProcess "
"limitation."
, cr->m_coll);
*/
ei->m_state = ei;//this;
ei->m_callback = doneSendingNotifyEmailWrapper;
// this will usually block, unless error maybe
if ( ! sendEmailThroughMandrill ( ei ) )
ei->m_notifyBlocked++;
}
if ( url && url[0] ) {
log("build: sending url notification to %s for coll \"%s\"",
url,cr->m_coll);
// GET request
if ( ! g_httpServer.getDoc ( url ,
0 , // ip
0 , // offset
-1 , // size
false, // ifmodsince
ei,//this ,
doneGettingNotifyUrlWrapper ,
60*1000 , // timeout
0, // proxyip
0 , // proxyport
10000, // maxTextDocLen
10000 // maxOtherDocLen
) )
ei->m_notifyBlocked++;
}
if ( ei->m_notifyBlocked == 0 ) {
ei->m_inUse = false;
return true;
}
// we blocked, wait
return false;
}
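Callers follow the usual would-block convention. The XmlDoc hunks later in this commit use exactly this shape, with m_masterLoop/m_masterState as the resume hooks:

m_emailInfo.reset();
m_emailInfo.m_finalCallback = m_masterLoop;
m_emailInfo.m_finalState    = m_masterState;
m_emailInfo.m_cr            = m_cr;
// false means it blocked; m_masterLoop is called after both the email
// and the webhook GET complete
if ( ! sendNotification ( &m_emailInfo ) ) return false;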

@ -5,7 +5,37 @@
#include "gb-include.h"
#include "Hostdb.h"
#include "Repair.h"
//#include "Repair.h"
extern char g_repairMode;
class EmailInfo {
public:
SafeBuf m_toAddress;
SafeBuf m_fromAddress;
SafeBuf m_subject;
SafeBuf m_body;
CollectionRec *m_cr;
char *m_dom; // ref into m_toAddress of the domain in email addr
SafeBuf m_mxDomain; // just the domain with a "gbmxrec-" prepended
void *m_state;
void (* m_callback ) (void *state);
void *m_finalState;
void (* m_finalCallback ) (void *state);
// ip address of MX record for this domain
long m_mxIp;
long m_notifyBlocked;
bool m_inUse;
EmailInfo() {
memset ( this,0,sizeof(EmailInfo) );
};
void reset() {
if ( m_inUse ) { char *xx=NULL;*xx=0; }
if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
memset ( this,0,sizeof(EmailInfo) );
};
};
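Since reset() hard-asserts while a send is in flight, users of an embedded EmailInfo (CollectionRec::m_emailInfo, XmlDoc::m_emailInfo) should check m_inUse first; a sketch using names from this commit:

EmailInfo *ei = &cr->m_emailInfo;
if ( ei->m_inUse ) return;   // previous alert still sending
ei->m_finalCallback = doneSendingNotification;
ei->m_finalState    = ei;
ei->m_cr            = cr;
sendNotification ( ei );     // holds m_inUse until both callbacks finish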
class PingServer {
@ -135,5 +165,11 @@ extern class PingServer g_pingServer;
// . use this for sending generic emails
bool sendEmail ( class EmailInfo *ei ) ;
// use mailchimp's mandrill email http api
bool sendEmailThroughMandrill ( class EmailInfo *ei ) ;
// send email and webhook notification
bool sendNotification ( class EmailInfo *ei );
#endif

@ -2,6 +2,8 @@
#include "Proxy.h"
#include "Statsdb.h"
#include "Msg13.h"
#include "XmlDoc.h"
//#include "seo.h" // g_secret_tran_key and api_key

@ -576,7 +576,7 @@ bool RdbDump::dumpList ( RdbList *list , long niceness , bool recall ) {
//m_bytesWritten = 0;
// sanity check
log("dump: writing %li bytes at offset %lli",m_bytesToWrite,offset);
//log("dump: writing %li bytes at offset %lli",m_bytesToWrite,offset);
// . if we're called by RdbMerge directly use m_callback/m_state
// . otherwise, use doneWritingWrapper() which will call dumpTree()

@ -3880,9 +3880,15 @@ void SpiderLoop::spiderDoledUrls ( ) {
if ( m_cri >= g_collectiondb.m_numRecs ) { char *xx=NULL;*xx=0; }
// grab this
collnum_t collnum = m_cri;
//collnum_t collnum = m_cri;
//CollectionRec *cr = g_collectiondb.m_recs[collnum];
// update the crawlinfo for this collection if it has been a while.
// should never block since callback is NULL.
if ( ! updateCrawlInfo(cr,NULL,NULL,true) ) { char *xx=NULL;*xx=0; }
// get this
char *coll = g_collectiondb.m_recs[collnum]->m_coll;
char *coll = cr->m_coll;
// need this for msg5 call
key_t endKey; endKey.setMax();
@ -9188,8 +9194,16 @@ bool updateCrawlInfo ( CollectionRec *cr ,
long now = getTimeLocal();
// keep it fresh within 1 second
long thresh = 1;
// if being called from spiderloop, we just want to keep
// CrawlInfo::m_nextSpiderTime fresh
if ( ! callback ) thresh = 60;
// unless cluster is big
if ( g_hostdb.m_numHosts > 32 ) thresh = 30;
if ( g_hostdb.m_numHosts > 32 ) {
// update every 30 seconds
thresh = 30;
// if doing a passive refresh though...
if ( ! callback ) thresh = 120;
}
if ( useCache && now - cr->m_globalCrawlInfo.m_lastUpdateTime <thresh)
return true;
@ -9208,7 +9222,13 @@ bool updateCrawlInfo ( CollectionRec *cr ,
// if we were not the first, we do not initiate it, we just wait
// for all the replies to come back
if ( cr->m_replies < cr->m_requests ) return false;
if ( cr->m_replies < cr->m_requests ) {
// unless we had no callback! we do that in SpiderLoop above
// to keep the crawl info fresh.
if ( ! callback ) return true;
// otherwise, block and we'll call your callback when done
return false;
}
// sanity test
if ( cr->m_replies > cr->m_requests ) { char *xx=NULL;*xx=0; }
@ -9259,6 +9279,15 @@ bool updateCrawlInfo ( CollectionRec *cr ,
return true;
}
void doneSendingNotification ( void *state ) {
EmailInfo *ei = (EmailInfo *)state;
log("spider: done sending notifications for coll=%s",
ei->m_cr->m_coll);
// mark it as sent. any time a new url is spidered this gets set back
// to false! use LOCAL crawlInfo, since global is reset often.
ei->m_cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 1;
}
void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// reply is error?
if ( ! slot->m_readBuf || g_errno ) {
@ -9288,6 +9317,11 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
gs++;
ss++;
}
if ( stats->m_hasUrlsReadyToSpider ) {
cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider++;
// unflag the sent flag if we had sent an alert
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
}
}
// return if still waiting on more to come in
if ( cr->m_replies < cr->m_requests ) return;
@ -9320,6 +9354,9 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
p += sizeof(CallbackEntry2);
// clear g_errno just in case
g_errno = 0;
// this is NULL when called from SpiderLoop::spiderDoledUrls()
// because that is just updating it for maintenance
if ( ! ce2->m_callback ) continue;
// debug note
//XmlDoc *xd = (XmlDoc *)(ce2->m_state);
//log("spider: calling crawlupdate callback for %s",
@ -9335,6 +9372,34 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// save the mem!
cr->m_callbackQueue.purge();
// now if it's the first time in a while that a crawl has no rec to
// spider, we want to send an alert to the user so they know their
// crawl is done.
// only host #0 sends alerts
if ( g_hostdb.getMyHost()->m_hostId != 0 ) return;
// but of course if it has urls ready to spider, do not send alert
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) return;
// if we already sent it return now. we set this to false every time
// we spider a url, which resets it. use local crawlinfo for this
// since we reset global.
if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ) return;
// ok, send it
EmailInfo *ei = &cr->m_emailInfo;
// in use already?
if ( ei->m_inUse ) return;
// set it up
ei->m_finalCallback = doneSendingNotification;
ei->m_finalState = ei;
ei->m_cr = cr;
sendNotification ( ei );
}
void handleRequestc1 ( UdpSlot *slot , long niceness ) {
@ -9343,6 +9408,16 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
if ( slot->m_readBufSize != sizeof(collnum_t) ) { char *xx=NULL;*xx=0;}
collnum_t collnum = *(collnum_t *)request;
CollectionRec *cr = g_collectiondb.getRec(collnum);
// while we are here, update CrawlInfo::m_nextSpiderTime to the time
// of the next spider request that is due to be spidered.
// if doledb is empty and the next rec in the waiting tree
// does not have a time of zero, but rather, in the future, then
// return that future time. so if a crawl is enabled we should
// actively call updateCrawlInfo on a collection every minute or
// so.
char *reply = slot->m_tmpBuf;
if ( TMPBUFSIZE < sizeof(CrawlInfo) ) { char *xx=NULL;*xx=0; }
memcpy ( reply , &cr->m_localCrawlInfo , sizeof(CrawlInfo) );

@ -19,6 +19,7 @@
#include "Process.h"
#include "Placedb.h"
#include "Threads.h"
#include "Msge1.h"
//static void testWrapper ( int fd , void *state ) ;
static void injectedWrapper ( void *state ) ;

@ -44,7 +44,8 @@
#include "Highlight.h"
#include "Wiktionary.h"
#include "seo.h" // Msg99Request etc.
#include <regex.h>
//#include <regex.h>
#include "PingServer.h"
#define MAXDOCLEN (1024*1024)
@ -162,7 +163,7 @@ XmlDoc::XmlDoc() {
m_numMsg4fRequests = 0;
m_numMsg4fReplies = 0;
m_sentMsg4fRequests = false;
m_notifyBlocked = 0;
//m_notifyBlocked = 0;
//m_mcasts = NULL;
//for ( long i = 0 ; i < g_hostdb.m_numHosts ; i++ )
// m_currentBinPtrs[i] = NULL;
@ -180,7 +181,7 @@ static long long s_lastTimeStart = 0LL;
void XmlDoc::reset ( ) {
// notifications pending?
if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
//if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
m_loaded = false;
@ -1956,8 +1957,15 @@ bool XmlDoc::indexDoc2 ( ) {
if ( ! m_cr->m_spideringEnabled ) return true;
// do not repeatedly call sendNotification()
m_cr->m_spideringEnabled = false;
// set this
m_emailInfo.reset();
m_emailInfo.m_finalCallback = m_masterLoop;
m_emailInfo.m_finalState = m_masterState;
m_emailInfo.m_cr = m_cr;
// note it
setStatus("sending notification");
// this returns false if it would block, so we return false
if ( ! sendNotification() ) return false;
if ( ! sendNotification ( &m_emailInfo ) ) return false;
// it didn't block
g_errno = m_indexCode;
return true;
@ -1980,8 +1988,16 @@ bool XmlDoc::indexDoc2 ( ) {
if ( ! m_cr->m_spideringEnabled ) return true;
// turn them off and send notification (email or url)
m_cr->m_spideringEnabled = false;
// this returns false if it would block, so we return false
if ( ! sendNotification() ) return false;
// set this
m_emailInfo.reset();
m_emailInfo.m_finalCallback = m_masterLoop;
m_emailInfo.m_finalState = m_masterState;
m_emailInfo.m_cr = m_cr;
// note it
setStatus("sending notification");
// . this returns false if it would block, so we return false
// . this is now in PingServer.cpp
if ( ! sendNotification( &m_emailInfo ) ) return false;
// it didn't block
g_errno = m_indexCode;
return true;
@ -12015,12 +12031,6 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
// return &m_diffbotReply;
//}
// or if original page content matches the page regex dont hit diffbot
//if( m_useDiffbot && ! doesPageContentMatchDiffbotProcessPattern() ) {
// m_diffbotReplyValid = true;
// return &m_diffbotReply;
//}
// empty content, do not send to diffbot then
char **u8 = getUtf8Content();
if ( ! u8 || u8 == (char **)-1 ) return (SafeBuf *)u8;
@ -12039,6 +12049,12 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
}
// if the page content does not match the process pattern, don't hit diffbot
if ( ! doesPageContentMatchDiffbotProcessPattern() ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
setStatus("getting diffbot reply");
//char *path = "api";
@ -17108,6 +17124,45 @@ bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() {
}
*/
bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() {
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
char *p = m_cr->m_diffbotPageProcessPattern.getBufStart();
// how many did we have?
long count = 0;
// scan the " || " separated substrings
for ( ; *p ; ) {
// get beginning of this string
char *start = p;
// skip white space
while ( *start && is_wspace_a(*start) ) start++;
// done?
if ( ! *start ) break;
// find end of it
char *end = start;
while ( *end && end[0] != '|' && ! is_wspace_a(end[0]) )
end++;
// advance p for next guy
p = end;
while ( *p && (*p=='|' || is_wspace_a(*p) ) ) p++;
// temp null this
char c = *end;
*end = '\0';
// count it as an attempt
count++;
// . is this substring anywhere in the document
// . check the rawest content before converting to utf8 i guess
char *foundPtr = strstr ( m_content , start ) ;
// revert \0
*end = c;
// did we find it?
if ( foundPtr ) return true;
}
// if we had no attempts, it is ok
if ( count == 0 ) return true;
// if we had an unfound substring...
return false;
}
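To make the matching rule concrete, here is a standalone restatement (a sketch, not the shipped code: the real function reads m_content and the collection's SafeBuf pattern, and uses is_wspace_a rather than isspace):

#include <cctype>
#include <cstring>
#include <string>

// true if content contains any of the "||"-separated substrings in
// pattern, or if pattern contains no substrings at all (an empty
// pattern means "process every page"). tokens also end at whitespace,
// matching the loop above, so patterns with embedded spaces won't work.
static bool matchesProcessPattern ( const char *content ,
                                    const char *pattern ) {
	long count = 0;
	const char *p = pattern;
	while ( *p ) {
		// skip separators and whitespace between tokens
		while ( *p && ( *p == '|' || isspace((unsigned char)*p) ) )
			p++;
		if ( ! *p ) break;
		const char *end = p;
		while ( *end && *end != '|' &&
		        ! isspace((unsigned char)*end) ) end++;
		count++;
		// copy the token so we can strstr() without the temp-\0 trick
		std::string tok ( p , end - p );
		if ( strstr ( content , tok.c_str() ) ) return true;
		p = end;
	}
	return count == 0;
}

// e.g. matchesProcessPattern ( html , "price || cart" ) is true for any
// page containing "price" or "cart"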
// . returns ptr to status
// . diffbot uses this to remove the indexed json pages associated with
// a url. each json object is basically its own url. a json object
@ -41951,94 +42006,3 @@ char *XmlDoc::hashJSON ( HashTableX *table ) {
return (char *)0x01;
}
void doneSendingNotifyEmailWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->m_notifyBlocked--;
// error?
log("build: email notification status: %s",mstrerror(g_errno));
// ignore it for rest
g_errno = 0;
// wait for post url to get done
if ( THIS->m_notifyBlocked > 0 ) return;
// all done
THIS->m_masterLoop ( THIS->m_masterState );
}
void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->m_notifyBlocked--;
// error?
log("build: url notification status: %s",mstrerror(g_errno));
// wait for post url to get done
if ( THIS->m_notifyBlocked > 0 ) return;
// all done
THIS->m_masterLoop ( THIS->m_masterState );
}
#include "PingServer.h" // sendEmail() function
// . return false if would block, true otherwise
// . used to send email and get a url when a crawl hits a maxToCrawl
// or maxToProcess limitation.
bool XmlDoc::sendNotification ( ) {
setStatus("sending notification");
char *email = m_cr->m_notifyEmail.getBufStart();
char *url = m_cr->m_notifyUrl.getBufStart();
// sanity check, can only call once
if ( m_notifyBlocked != 0 ) { char *xx=NULL;*xx=0; }
if ( email && email[0] ) {
log("build: sending email notification to %s for coll \"%s\"",
email,m_cr->m_coll);
SafeBuf msg;
msg.safePrintf("Your crawl \"%s\" "
"has hit a limitation and has "
"been paused."
, m_cr->m_coll);
// use this
EmailInfo *ei = &m_emailInfo;
ei->m_toAddress.safeStrcpy ( email );
ei->m_toAddress.nullTerm();
ei->m_fromAddress.safePrintf("support@diffbot.com");
ei->m_subject.safePrintf("crawl paused");
ei->m_body.safePrintf("Your crawl for collection \"%s\" "
"has been paused because it hit "
"a maxPagesToCrawl or maxPagesToProcess "
"limitation."
, m_cr->m_coll);
ei->m_state = this;
ei->m_callback = doneSendingNotifyEmailWrapper;
// this will usually block, unless error maybe
if ( ! sendEmail ( ei ) )
m_notifyBlocked++;
}
if ( url && url[0] ) {
log("build: sending url notification to %s for coll \"%s\"",
url,m_cr->m_coll);
// GET request
if ( ! g_httpServer.getDoc ( url ,
0 , // ip
0 , // offset
-1 , // size
false, // ifmodsince
this ,
doneGettingNotifyUrlWrapper ,
60*1000 , // timeout
0, // proxyip
0 , // proxyport
10000, // maxTextDocLen
10000 // maxOtherDocLen
) )
m_notifyBlocked++;
}
if ( m_notifyBlocked == 0 ) return true;
// we blocked, wait
return false;
}

@ -64,6 +64,7 @@
#include "Spider.h" // SpiderRequest/SpiderReply definitions
#include "HttpMime.h" // ET_DEFLAT
#include "Msg1.h"
#include "PingServer.h"
//#include "PageCrawlBot.h" // DBA_NONE
//#define XMLDOC_MAX_AD_IDS 4
@ -85,20 +86,6 @@
#define XD_MAX_AD_IDS 5
class EmailInfo {
public:
SafeBuf m_toAddress;
SafeBuf m_fromAddress;
SafeBuf m_subject;
SafeBuf m_body;
char *m_dom; // ref into m_toAddress of the domain in email addr
SafeBuf m_mxDomain; // just the domain with a "gbmxrec-" prepended
void *m_state;
void (* m_callback ) (void *state);
// ip address of MX record for this domain
long m_mxIp;
};
double getTrafficPercent ( long rank ) ;
bool setLangVec ( class Words *words ,
@ -1534,14 +1521,12 @@ class XmlDoc {
SafeBuf *getDiffbotReply ( ) ;
//bool doesUrlMatchDiffbotCrawlPattern() ;
//bool doesUrlMatchDiffbotProcessPattern() ;
//bool doesPageContentMatchDiffbotProcessPattern() ;
bool doesPageContentMatchDiffbotProcessPattern() ;
char *hashJSON ( HashTableX *table );
long *nukeJSONObjects ( ) ;
long m_joc;
bool sendNotification ( );
EmailInfo m_emailInfo;
long m_notifyBlocked;
//
// functions and vars for the seo query matching tool