Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot

commit 24e3b8cf52
Author: mwells
Date: 2013-10-09 13:07:22 -06:00
14 changed files with 474 additions and 144 deletions

@ -68,7 +68,7 @@
#include "HashTableX.h"
#include "RdbList.h"
#include "Rdb.h" // for RdbBase
#include "PingServer.h" // EmailInfo
// how many counts are in CrawlInfo below????
#define NUMCRAWLSTATS 8
@ -94,6 +94,13 @@ class CrawlInfo {
long m_lastUpdateTime;
// this is non-zero if urls are available to be spidered right now.
long m_hasUrlsReadyToSpider;
// have we sent out email/webhook notifications saying the crawl has no
// urls currently in the ready queue (doledb) to spider?
char m_sentCrawlDoneAlert;
void reset() { memset ( this , 0 , sizeof(CrawlInfo) ); };
//bool print (class SafeBuf *sb ) ;
//bool setFromSafeBuf (class SafeBuf *sb ) ;
@ -400,7 +407,7 @@ class CollectionRec {
//SafeBuf m_diffbotApiList;//QueryString;
//SafeBuf m_diffbotUrlCrawlPattern;
//SafeBuf m_diffbotUrlProcessPattern;
//SafeBuf m_diffbotPageProcessPattern;
SafeBuf m_diffbotPageProcessPattern;
//SafeBuf m_diffbotClassify;
//char m_diffbotClassify;
//char m_useDiffbot;
@ -424,6 +431,7 @@ class CollectionRec {
CrawlInfo m_globalCrawlInfo;
// last time we computed global crawl info
//time_t m_globalCrawlInfoUpdateTime;
EmailInfo m_emailInfo;
// for counting replies
long m_replies;
long m_requests;
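
The new m_sentCrawlDoneAlert flag is driven entirely from the Spider.cpp hunks below; in outline (all names from this commit, condensed for illustration):

// on host #0, after gotCrawlInfoReply() has aggregated every host's stats:
if ( ! cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider &&
     ! cr->m_localCrawlInfo.m_sentCrawlDoneAlert )
	sendNotification ( &cr->m_emailInfo );
// doneSendingNotification() then sets m_sentCrawlDoneAlert = 1, and any
// host that reports urls ready to spider clears it back to 0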

@ -129,7 +129,8 @@ bool HttpServer::getDoc ( char *url ,
char *proto ,
bool doPost ,
char *cookie ,
char *additionalHeader ) {
char *additionalHeader ,
char *fullRequest ) {
// sanity
if ( ip == -1 )
log("http: you probably didn't mean to set ip=-1 did you? "
@ -152,24 +153,37 @@ bool HttpServer::getDoc ( char *url ,
tcp = &m_ssltcp;
defPort = 443;
}
// this returns false and sets g_errno on error
if ( ! r.set ( url , offset , size , ifModifiedSince ,
userAgent , proto , doPost , cookie ,
additionalHeader ) ) return true;
if ( g_conf.m_logDebugSpider )
log("spider: httprequest = %s", r.getRequest());
char *req = NULL;
long reqSize;
// this returns false and sets g_errno on error
if ( ! fullRequest ) {
if ( ! r.set ( url , offset , size , ifModifiedSince ,
userAgent , proto , doPost , cookie ,
additionalHeader ) ) return true;
reqSize = r.getRequestLen();
req = (char *) mdup ( r.getRequest() , reqSize,"HttpServer");
}
else {
// reqSize excludes the trailing \0
reqSize = gbstrlen(fullRequest);
req = (char *) mdup ( fullRequest , reqSize,"HttpServer");
}
// . get the request from the static buffer and dup it
// . return true and set g_errno on error
if ( ! req ) return true;
long hostLen ;
long port = defPort;
char *host = getHostFast ( url , &hostLen , &port );
// . get the request from the static buffer and dup it
// . return true and set g_errno on error
long reqSize = r.getRequestLen();
char *req = (char *) mdup ( r.getRequest() , reqSize,"HttpServer");
if ( ! req ) return true;
if ( g_conf.m_logDebugSpider )
log("spider: httprequest = %s", req );
// do we have an ip to send to? assume not
if ( proxyIp ) { ip = proxyIp ; port = proxyPort; }
// special NULL case
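The new fullRequest path is exercised by the Mandrill emailer later in this commit. A minimal caller sketch, with a hypothetical URL and callback; the argument order follows the header hunk below, and the url is still used by getHostFast() to pick the host and port:

static void gotPageWrapper ( void *state , TcpSocket *s ) { /* resume here */ }

static bool pingExample ( ) {
	SafeBuf req;
	req.safePrintf ( "GET /ping HTTP/1.0\r\n"
	                 "Host: example.com\r\n"
	                 "Connection: close\r\n\r\n" );
	char *uu = "http://example.com/ping";
	// pass the pre-built mime (and any post data) as fullRequest;
	// getDoc() mdup()'s it verbatim and skips HttpRequest::set()
	if ( ! g_httpServer.getDoc ( uu ,
	                             0 ,        // ip
	                             0 ,        // offset
	                             -1 ,       // size
	                             false ,    // ifModifiedSince
	                             NULL ,     // state
	                             gotPageWrapper ,
	                             60*1000 ,  // timeout
	                             0 ,        // proxyIp
	                             0 ,        // proxyPort
	                             100*1024 , // maxTextDocLen
	                             100*1024 , // maxOtherDocLen
	                             NULL ,     // userAgent
	                             "HTTP/1.0" , // proto
	                             false ,    // doPost
	                             NULL ,     // cookie
	                             NULL ,     // additionalHeader
	                             req.getBufStart() ) ) // fullRequest
		return false; // blocked; gotPageWrapper fires on completion
	// getDoc() returned true, so g_errno is set
	return true;
}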

@ -96,7 +96,9 @@ class HttpServer {
char *proto = "HTTP/1.0" ,
bool doPost = false ,
char *cookie = NULL ,
char *additionalHeader = NULL ); // does not include \r\n
char *additionalHeader = NULL , // does not include \r\n
// specify your own mime and post data here...
char *fullRequest = NULL );
bool getDoc ( long ip,
long port,

@ -323,7 +323,7 @@ void * operator new [] (size_t size) throw (std::bad_alloc) {
if ( g_mem.m_used + size >= g_mem.m_maxMem &&
g_mem.m_maxMem > 1000000 ) {
log("mem: new(%i): Out of memory.", size );
throw bad_alloc();
throw std::bad_alloc();
//throw 1;
}
#ifdef _EFENCE_

@ -1875,6 +1875,9 @@ static class HelpItem s_his[] = {
{"notifyurl","Fetch this URL when crawl hits "
"the maxtocrawl or maxtoprocess limit."},
{"urt","Use robots.txt?"},
{"pageprocesspattern","List of || separated strings. If the page "
"contains any of these then we send it to diffbot for processing. "
"If this is empty we send all pages to diffbot for processing."},
//{"dbapilist","Special list of diffbot API urls. The URL Filters "
// "will display these options in a drop down menu. "
// "Example (unencoded): "
@ -2056,16 +2059,30 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
cr->m_notifyEmail.set(email);
cr->m_notifyEmail.nullTerm();
}
else {
cr->m_notifyEmail.purge();
}
char *url = hr->getString("notifyurl",NULL,NULL);
if ( url ) {
cr->m_notifyUrl.set(url);
cr->m_notifyUrl.nullTerm();
}
else {
cr->m_notifyUrl.purge();
}
long pause = hr->getLong("pause",-1);
if ( pause == 0 ) cr->m_spideringEnabled = 1;
if ( pause == 1 ) cr->m_spideringEnabled = 0;
long urt = hr->getLong("urt",-1);
if ( urt != -1 ) cr->m_useRobotsTxt = urt;
char *ppp = hr->getString("pageprocesspattern",NULL);
if ( ppp ) {
cr->m_diffbotPageProcessPattern.set(ppp);
cr->m_diffbotPageProcessPattern.nullTerm();
}
else {
cr->m_diffbotPageProcessPattern.purge();
}
// this is a cast, so just return simple response
return g_httpServer.sendDynamicPage (socket,"OK",2);
}
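With the new purge() branches, a crawlbot config request now fully specifies these fields: whatever it omits is cleared. A hypothetical example (collection name, parameter spelling of the coll field, and values are invented for illustration):

GET /crawlbot?c=mycoll&pause=0&urt=1&notifyemail=dev@example.com&pageprocesspattern=price+%7C%7C+cart

A later request that omits pageprocesspattern (or notifyemail/notifyurl) purges the stored value, so clients should resend every field they want to keep.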
@ -2804,6 +2821,15 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
//
"<tr>"
"<td><b>Page Process Pattern:</b> "
"</td><td>"
"<input type=text name=pageprocesspattern "
"size=20 value=\"%s\"> "
"<input type=submit name=submit value=OK>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Max Page Download Successes:</b> "
"</td><td>"
@ -2881,6 +2907,8 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
, cr->m_coll
, cr->m_coll
, cr->m_diffbotPageProcessPattern.getBufStart()
, cr->m_diffbotMaxToCrawl
, cr->m_diffbotMaxToProcess
@ -3291,7 +3319,6 @@ CollectionRec *addNewDiffbotColl ( char *addColl , HttpRequest *hr ) {
cr->m_diffbotApiQueryString.set ( apiQueryString );
cr->m_diffbotUrlCrawlPattern.set ( urlCrawlPattern );
cr->m_diffbotUrlProcessPattern.set ( urlProcessPattern );
cr->m_diffbotPageProcessPattern.set ( pageProcessPattern );
cr->m_diffbotClassify = classify;
// let's make these all NULL terminated strings
@ -3303,7 +3330,9 @@ CollectionRec *addNewDiffbotColl ( char *addColl , HttpRequest *hr ) {
cr->m_diffbotPageProcessPattern.nullTerm();
*/
// bring this back
cr->m_diffbotPageProcessPattern.set ( "" );
cr->m_diffbotPageProcessPattern.nullTerm();
// do not spider more than this many urls total. -1 means no max.
cr->m_diffbotMaxToCrawl = 100000;

@ -29,6 +29,7 @@
#include "Placedb.h"
#include "Sections.h"
//#include "Msg0.h" // g_termlistCache
#include "Msg13.h"
bool printNumAbbr ( SafeBuf &p, long long vvv ) {
float val = (float)vvv;

@ -2879,3 +2879,216 @@ bool gotMxIp ( EmailInfo *ei ) {
return true;
}
static void gotMandrillReplyWrapper ( void *state , TcpSocket *s ) {
EmailInfo *ei = (EmailInfo *)state;
ei->m_callback ( ei->m_state );
}
// mailchimp http mail api
bool sendEmailThroughMandrill ( class EmailInfo *ei ) {
// this is often set from XmlDoc.cpp::indexDoc()
g_errno = 0;
SafeBuf sb;
// then the message to send
sb.safePrintf(
"POST /api/1.0/messages/send-template.json"
" HTTP/1.0\r\n"
"Accept: image/gif, image/x-xbitmap, image/jpeg, "
"image/pjpeg, application/x-shockwave-flash, "
"application/msword, */*\r\n"
"Accept-Language: en-us\r\n"
"Content-Type: application/x-www-form-urlencoded\r\n"
"Accept-Encoding: gzip, deflate\r\n"
"User-Agent: Mozilla/4.0 "
"(compatible; MSIE 6.0; Windows 98; Win 9x 4.90)\r\n"
"Host: mandrillapp.com\r\n" // www.t-mobile.com
"Content-Length: xxx\r\n"
//"Connection: Keep-Alive\r\n"
"Connection: close\r\n"
"Cookie: \r\n"
"Cache-Control: no-cache\r\n\r\n"
);
//
// post data
//
char *to = ei->m_toAddress.getBufStart();
char *from = ei->m_fromAddress.getBufStart();
sb.safePrintf( "{\"key\":\"GhWT0UpcVBl7kmumrt9dqg\","
"\"template_name\":\"crawl-finished\","
"\"template_content\": [],"
"\"message\": {"
"\"to\": ["
"{"
"\"email\":\"%s\""
"}"
"],"
"\"from_email\":\"%s\","
"\"headers\": {"
"\"Reply-To\":\"%s\""
"},"
"\"bcc_address\":\"%s\","
"\"global_merge_vars\":["
"{"
"\"name\":\"CRAWLNAME\","
"\"content\":\"%s\""
"}"
"]"
"}"
"}"
, to
, from
, from
, from
, ei->m_cr->m_coll
);
// make sure ends in \0
sb.nullTerm();
// send the full request to mandrill over https
char *uu = "https://mandrillapp.com/";
if ( ! g_httpServer.getDoc ( uu,
0, // ip
0 , // offset
-1 , // size
false , // m_ifModifiedSince
ei , // state
gotMandrillReplyWrapper , //
60*1000 , // timeout
0 , // m_proxyIp
0 , // m_proxyPort
100*1024 , // m_maxTextDocLen
100*1024 , // m_maxOtherDocLen
NULL, // user agent
"HTTP/1.0" , //proto
true, // post?
NULL, // cookie
NULL, // additional header
sb.getBufStart() ) ) // full request
return false;
// must have been an error
log("net: Got error getting page from mandrill: %s.",
mstrerror(g_errno));
// ignore it
g_errno = 0;
// always call this at the end
return true;
}
/////////////////////////////
//
// send two notifications, email and webhook
//
/////////////////////////////
void doneSendingNotifyEmailWrapper ( void *state ) {
EmailInfo *ei = (EmailInfo *)state;
ei->m_notifyBlocked--;
// error?
log("build: email notification status: %s",mstrerror(g_errno));
// ignore it for rest
g_errno = 0;
// wait for post url to get done
if ( ei->m_notifyBlocked > 0 ) return;
// unmark it
ei->m_inUse = false;
// all done
ei->m_finalCallback ( ei->m_finalState );
}
void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
EmailInfo *ei = (EmailInfo *)state;
ei->m_notifyBlocked--;
// error?
log("build: url notification status: %s",mstrerror(g_errno));
// wait for email to get done
if ( ei->m_notifyBlocked > 0 ) return;
// unmark it
ei->m_inUse = false;
// all done
ei->m_finalCallback ( ei->m_finalState );
}
// . return false if would block, true otherwise
// . used to send email and get a url when a crawl hits a maxToCrawl
// or maxToProcess limitation.
bool sendNotification ( EmailInfo *ei ) {
if ( ei->m_inUse ) { char *xx=NULL;*xx=0; }
// caller must set this, as well as m_finalCallback/m_finalState
CollectionRec *cr = ei->m_cr;
char *email = cr->m_notifyEmail.getBufStart();
char *url = cr->m_notifyUrl.getBufStart();
// sanity check, can only call once
if ( ei->m_notifyBlocked != 0 ) { char *xx=NULL;*xx=0; }
ei->m_inUse = true;
if ( email && email[0] ) {
log("build: sending email notification to %s for coll \"%s\"",
email,cr->m_coll);
SafeBuf msg;
msg.safePrintf("Your crawl \"%s\" "
"has hit a limitation and has "
"been paused."
, cr->m_coll);
// use this
ei->m_toAddress.safeStrcpy ( email );
ei->m_toAddress.nullTerm();
ei->m_fromAddress.safePrintf("support@diffbot.com");
/*
ei->m_subject.safePrintf("crawl paused");
ei->m_body.safePrintf("Your crawl for collection \"%s\" "
"has been paused because it hit "
"a maxPagesToCrawl or maxPagesToProcess "
"limitation."
, cr->m_coll);
*/
ei->m_state = ei;//this;
ei->m_callback = doneSendingNotifyEmailWrapper;
// this will usually block, unless error maybe
if ( ! sendEmailThroughMandrill ( ei ) )
ei->m_notifyBlocked++;
}
if ( url && url[0] ) {
log("build: sending url notification to %s for coll \"%s\"",
url,cr->m_coll);
// GET request
if ( ! g_httpServer.getDoc ( url ,
0 , // ip
0 , // offset
-1 , // size
false, // ifmodsince
ei,//this ,
doneGettingNotifyUrlWrapper ,
60*1000 , // timeout
0, // proxyip
0 , // proxyport
10000, // maxTextDocLen
10000 // maxOtherDocLen
) )
ei->m_notifyBlocked++;
}
if ( ei->m_notifyBlocked == 0 ) {
ei->m_inUse = false;
return true;
}
// we blocked, wait
return false;
}
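Callers follow the usual would-block convention. The XmlDoc hunks later in this commit use exactly this shape, with m_masterLoop/m_masterState as the resume hooks:

m_emailInfo.reset();
m_emailInfo.m_finalCallback = m_masterLoop;
m_emailInfo.m_finalState    = m_masterState;
m_emailInfo.m_cr            = m_cr;
// false means it blocked; m_masterLoop is called after both the email
// and the webhook GET complete
if ( ! sendNotification ( &m_emailInfo ) ) return false;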

@ -5,7 +5,37 @@
#include "gb-include.h"
#include "Hostdb.h"
#include "Repair.h"
//#include "Repair.h"
extern char g_repairMode;
class EmailInfo {
public:
SafeBuf m_toAddress;
SafeBuf m_fromAddress;
SafeBuf m_subject;
SafeBuf m_body;
CollectionRec *m_cr;
char *m_dom; // ref into m_toAddress of the domain in email addr
SafeBuf m_mxDomain; // just the domain with a "gbmxrec-" prepended
void *m_state;
void (* m_callback ) (void *state);
void *m_finalState;
void (* m_finalCallback ) (void *state);
// ip address of MX record for this domain
long m_mxIp;
long m_notifyBlocked;
bool m_inUse;
EmailInfo() {
memset ( this,0,sizeof(EmailInfo) );
};
void reset() {
if ( m_inUse ) { char *xx=NULL;*xx=0; }
if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
memset ( this,0,sizeof(EmailInfo) );
};
};
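Since reset() hard-asserts while a send is in flight, users of an embedded EmailInfo (CollectionRec::m_emailInfo, XmlDoc::m_emailInfo) should check m_inUse first; a sketch using names from this commit:

EmailInfo *ei = &cr->m_emailInfo;
if ( ei->m_inUse ) return;   // previous alert still sending
ei->m_finalCallback = doneSendingNotification;
ei->m_finalState    = ei;
ei->m_cr            = cr;
sendNotification ( ei );     // holds m_inUse until both callbacks finish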
class PingServer {
@ -135,5 +165,11 @@ extern class PingServer g_pingServer;
// . use this for sending generic emails
bool sendEmail ( class EmailInfo *ei ) ;
// use mailchimp's mandrill email http api
bool sendEmailThroughMandrill ( class EmailInfo *ei ) ;
// send email and webhook notification
bool sendNotification ( class EmailInfo *ei );
#endif

@ -2,6 +2,8 @@
#include "Proxy.h"
#include "Statsdb.h"
#include "Msg13.h"
#include "XmlDoc.h"
//#include "seo.h" // g_secret_tran_key and api_key

@ -576,7 +576,7 @@ bool RdbDump::dumpList ( RdbList *list , long niceness , bool recall ) {
//m_bytesWritten = 0;
// sanity check
log("dump: writing %li bytes at offset %lli",m_bytesToWrite,offset);
//log("dump: writing %li bytes at offset %lli",m_bytesToWrite,offset);
// . if we're called by RdbMerge directly use m_callback/m_state
// . otherwise, use doneWritingWrapper() which will call dumpTree()

@ -3880,9 +3880,15 @@ void SpiderLoop::spiderDoledUrls ( ) {
if ( m_cri >= g_collectiondb.m_numRecs ) { char *xx=NULL;*xx=0; }
// grab this
collnum_t collnum = m_cri;
//collnum_t collnum = m_cri;
//CollectionRec *cr = g_collectiondb.m_recs[collnum];
// update the crawlinfo for this collection if it has been a while.
// should never block since callback is NULL.
if ( ! updateCrawlInfo(cr,NULL,NULL,true) ) { char *xx=NULL;*xx=0; }
// get this
char *coll = g_collectiondb.m_recs[collnum]->m_coll;
char *coll = cr->m_coll;
// need this for msg5 call
key_t endKey; endKey.setMax();
@ -9188,8 +9194,16 @@ bool updateCrawlInfo ( CollectionRec *cr ,
long now = getTimeLocal();
// keep it fresh within 1 second
long thresh = 1;
// if being called from spiderloop, we just want to keep
// CrawlInfo::m_nextSpiderTime fresh
if ( ! callback ) thresh = 60;
// unless cluster is big
if ( g_hostdb.m_numHosts > 32 ) thresh = 30;
if ( g_hostdb.m_numHosts > 32 ) {
// update every 30 seconds
thresh = 30;
// if doing a passive refresh though...
if ( ! callback ) thresh = 120;
}
if ( useCache && now - cr->m_globalCrawlInfo.m_lastUpdateTime <thresh)
return true;
@ -9208,7 +9222,13 @@ bool updateCrawlInfo ( CollectionRec *cr ,
// if we were not the first, we do not initiate it, we just wait
// for all the replies to come back
if ( cr->m_replies < cr->m_requests ) return false;
if ( cr->m_replies < cr->m_requests ) {
// unless we had no callback! we do that in SpiderLoop above
// to keep the crawl info fresh.
if ( ! callback ) return true;
// otherwise, block and we'll call your callback when done
return false;
}
// sanity test
if ( cr->m_replies > cr->m_requests ) { char *xx=NULL;*xx=0; }
@ -9259,6 +9279,15 @@ bool updateCrawlInfo ( CollectionRec *cr ,
return true;
}
void doneSendingNotification ( void *state ) {
EmailInfo *ei = (EmailInfo *)state;
log("spider: done sending notifications for coll=%s",
ei->m_cr->m_coll);
// mark it as sent. any time a new url is spidered this gets set back
// to false! use LOCAL crawlInfo, since global is reset often.
ei->m_cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 1;
}
void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// reply is error?
if ( ! slot->m_readBuf || g_errno ) {
@ -9288,6 +9317,11 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
gs++;
ss++;
}
if ( stats->m_hasUrlsReadyToSpider ) {
cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider++;
// unflag the sent flag if we had sent an alert
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
}
}
// return if still waiting on more to come in
if ( cr->m_replies < cr->m_requests ) return;
@ -9320,6 +9354,9 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
p += sizeof(CallbackEntry2);
// clear g_errno just in case
g_errno = 0;
// this is NULL when called from SpiderLoop::spiderDoledUrls()
// because that is just updating it for maintenance
if ( ! ce2->m_callback ) continue;
// debug note
//XmlDoc *xd = (XmlDoc *)(ce2->m_state);
//log("spider: calling crawlupdate callback for %s",
@ -9335,6 +9372,34 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// save the mem!
cr->m_callbackQueue.purge();
// now if it's the first time in a while that a crawl has no rec to
// spider, we want to send an alert to the user so they know their
// crawl is done.
// only host #0 sends alerts
if ( g_hostdb.getMyHost()->m_hostId != 0 ) return;
// but of course if it has urls ready to spider, do not send alert
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) return;
// if we already sent it return now. we set this to false every time
// we spider a url, which resets it. use local crawlinfo for this
// since we reset global.
if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ) return;
// ok, send it
EmailInfo *ei = &cr->m_emailInfo;
// in use already?
if ( ei->m_inUse ) return;
// set it up
ei->m_finalCallback = doneSendingNotification;
ei->m_finalState = ei;
ei->m_cr = cr;
sendNotification ( ei );
}
void handleRequestc1 ( UdpSlot *slot , long niceness ) {
@ -9343,6 +9408,16 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
if ( slot->m_readBufSize != sizeof(collnum_t) ) { char *xx=NULL;*xx=0;}
collnum_t collnum = *(collnum_t *)request;
CollectionRec *cr = g_collectiondb.getRec(collnum);
// while we are here, update CrawlInfo::m_nextSpiderTime to the time
// of the next spider request that is due to be spidered.
// if doledb is empty and the next rec in the waiting tree
// does not have a time of zero, but rather, in the future, then
// return that future time. so if a crawl is enabled we should
// actively call updateCrawlInfo on a collection every minute or
// so.
char *reply = slot->m_tmpBuf;
if ( TMPBUFSIZE < sizeof(CrawlInfo) ) { char *xx=NULL;*xx=0; }
memcpy ( reply , &cr->m_localCrawlInfo , sizeof(CrawlInfo) );

@ -19,6 +19,7 @@
#include "Process.h"
#include "Placedb.h"
#include "Threads.h"
#include "Msge1.h"
//static void testWrapper ( int fd , void *state ) ;
static void injectedWrapper ( void *state ) ;

@ -44,7 +44,8 @@
#include "Highlight.h"
#include "Wiktionary.h"
#include "seo.h" // Msg99Request etc.
#include <regex.h>
//#include <regex.h>
#include "PingServer.h"
#define MAXDOCLEN (1024*1024)
@ -162,7 +163,7 @@ XmlDoc::XmlDoc() {
m_numMsg4fRequests = 0;
m_numMsg4fReplies = 0;
m_sentMsg4fRequests = false;
m_notifyBlocked = 0;
//m_notifyBlocked = 0;
//m_mcasts = NULL;
//for ( long i = 0 ; i < g_hostdb.m_numHosts ; i++ )
// m_currentBinPtrs[i] = NULL;
@ -180,7 +181,7 @@ static long long s_lastTimeStart = 0LL;
void XmlDoc::reset ( ) {
// notifications pending?
if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
//if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
m_loaded = false;
@ -1956,8 +1957,15 @@ bool XmlDoc::indexDoc2 ( ) {
if ( ! m_cr->m_spideringEnabled ) return true;
// do not repeatedly call sendNotification()
m_cr->m_spideringEnabled = false;
// set this
m_emailInfo.reset();
m_emailInfo.m_finalCallback = m_masterLoop;
m_emailInfo.m_finalState = m_masterState;
m_emailInfo.m_cr = m_cr;
// note it
setStatus("sending notification");
// this returns false if it would block, so we return false
if ( ! sendNotification() ) return false;
if ( ! sendNotification ( &m_emailInfo ) ) return false;
// it didn't block
g_errno = m_indexCode;
return true;
@ -1980,8 +1988,16 @@ bool XmlDoc::indexDoc2 ( ) {
if ( ! m_cr->m_spideringEnabled ) return true;
// turn them off and send notification (email or url)
m_cr->m_spideringEnabled = false;
// this returns false if it would block, so we return false
if ( ! sendNotification() ) return false;
// set this
m_emailInfo.reset();
m_emailInfo.m_finalCallback = m_masterLoop;
m_emailInfo.m_finalState = m_masterState;
m_emailInfo.m_cr = m_cr;
// note it
setStatus("sending notification");
// . this returns false if it would block, so we return false
// . this is now in PingServer.cpp
if ( ! sendNotification( &m_emailInfo ) ) return false;
// it didn't block
g_errno = m_indexCode;
return true;
@ -12015,12 +12031,6 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
// return &m_diffbotReply;
//}
// or if original page content matches the page regex dont hit diffbot
//if( m_useDiffbot && ! doesPageContentMatchDiffbotProcessPattern() ) {
// m_diffbotReplyValid = true;
// return &m_diffbotReply;
//}
// empty content, do not send to diffbot then
char **u8 = getUtf8Content();
if ( ! u8 || u8 == (char **)-1 ) return (SafeBuf *)u8;
@ -12039,6 +12049,12 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
}
// if the page content does not match the process pattern, don't hit diffbot
if ( ! doesPageContentMatchDiffbotProcessPattern() ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
setStatus("getting diffbot reply");
//char *path = "api";
@ -17108,6 +17124,45 @@ bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() {
}
*/
bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() {
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
char *p = m_cr->m_diffbotPageProcessPattern.getBufStart();
// how many did we have?
long count = 0;
// scan the " || " separated substrings
for ( ; *p ; ) {
// get beginning of this string
char *start = p;
// skip white space
while ( *start && is_wspace_a(*start) ) start++;
// done?
if ( ! *start ) break;
// find end of it
char *end = start;
while ( *end && end[0] != '|' && ! is_wspace_a(end[0]) )
end++;
// advance p for next guy
p = end;
while ( *p && (*p=='|' || is_wspace_a(*p) ) ) p++;
// temp null this
char c = *end;
*end = '\0';
// count it as an attempt
count++;
// . is this substring anywhere in the document
// . check the rawest content before converting to utf8 i guess
char *foundPtr = strstr ( m_content , start ) ;
// revert \0
*end = c;
// did we find it?
if ( foundPtr ) return true;
}
// if we had no attempts, it is ok
if ( count == 0 ) return true;
// if we had an unfound substring...
return false;
}
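To make the matching rule concrete, here is a standalone restatement (a sketch, not the shipped code: the real function reads m_content and the collection's SafeBuf pattern, and uses is_wspace_a rather than isspace):

#include <cctype>
#include <cstring>
#include <string>

// true if content contains any of the "||"-separated substrings in
// pattern, or if pattern contains no substrings at all (an empty
// pattern means "process every page"). tokens also end at whitespace,
// matching the loop above, so patterns with embedded spaces won't work.
static bool matchesProcessPattern ( const char *content ,
                                    const char *pattern ) {
	long count = 0;
	const char *p = pattern;
	while ( *p ) {
		// skip separators and whitespace between tokens
		while ( *p && ( *p == '|' || isspace((unsigned char)*p) ) )
			p++;
		if ( ! *p ) break;
		const char *end = p;
		while ( *end && *end != '|' &&
		        ! isspace((unsigned char)*end) ) end++;
		count++;
		// copy the token so we can strstr() without the temp-\0 trick
		std::string tok ( p , end - p );
		if ( strstr ( content , tok.c_str() ) ) return true;
		p = end;
	}
	return count == 0;
}

// e.g. matchesProcessPattern ( html , "price || cart" ) is true for any
// page containing "price" or "cart"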
// . returns ptr to status
// . diffbot uses this to remove the indexed json pages associated with
// a url. each json object is basically its own url. a json object
@ -41951,94 +42006,3 @@ char *XmlDoc::hashJSON ( HashTableX *table ) {
return (char *)0x01;
}
void doneSendingNotifyEmailWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->m_notifyBlocked--;
// error?
log("build: email notification status: %s",mstrerror(g_errno));
// ignore it for rest
g_errno = 0;
// wait for post url to get done
if ( THIS->m_notifyBlocked > 0 ) return;
// all done
THIS->m_masterLoop ( THIS->m_masterState );
}
void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->m_notifyBlocked--;
// error?
log("build: url notification status: %s",mstrerror(g_errno));
// wait for post url to get done
if ( THIS->m_notifyBlocked > 0 ) return;
// all done
THIS->m_masterLoop ( THIS->m_masterState );
}
#include "PingServer.h" // sendEmail() function
// . return false if would block, true otherwise
// . used to send email and get a url when a crawl hits a maxToCrawl
// or maxToProcess limitation.
bool XmlDoc::sendNotification ( ) {
setStatus("sending notification");
char *email = m_cr->m_notifyEmail.getBufStart();
char *url = m_cr->m_notifyUrl.getBufStart();
// sanity check, can only call once
if ( m_notifyBlocked != 0 ) { char *xx=NULL;*xx=0; }
if ( email && email[0] ) {
log("build: sending email notification to %s for coll \"%s\"",
email,m_cr->m_coll);
SafeBuf msg;
msg.safePrintf("Your crawl \"%s\" "
"has hit a limitation and has "
"been paused."
, m_cr->m_coll);
// use this
EmailInfo *ei = &m_emailInfo;
ei->m_toAddress.safeStrcpy ( email );
ei->m_toAddress.nullTerm();
ei->m_fromAddress.safePrintf("support@diffbot.com");
ei->m_subject.safePrintf("crawl paused");
ei->m_body.safePrintf("Your crawl for collection \"%s\" "
"has been paused because it hit "
"a maxPagesToCrawl or maxPagesToProcess "
"limitation."
, m_cr->m_coll);
ei->m_state = this;
ei->m_callback = doneSendingNotifyEmailWrapper;
// this will usually block, unless error maybe
if ( ! sendEmail ( ei ) )
m_notifyBlocked++;
}
if ( url && url[0] ) {
log("build: sending url notification to %s for coll \"%s\"",
url,m_cr->m_coll);
// GET request
if ( ! g_httpServer.getDoc ( url ,
0 , // ip
0 , // offset
-1 , // size
false, // ifmodsince
this ,
doneGettingNotifyUrlWrapper ,
60*1000 , // timeout
0, // proxyip
0 , // proxyport
10000, // maxTextDocLen
10000 // maxOtherDocLen
) )
m_notifyBlocked++;
}
if ( m_notifyBlocked == 0 ) return true;
// we blocked, wait
return false;
}

@ -64,6 +64,7 @@
#include "Spider.h" // SpiderRequest/SpiderReply definitions
#include "HttpMime.h" // ET_DEFLAT
#include "Msg1.h"
#include "PingServer.h"
//#include "PageCrawlBot.h" // DBA_NONE
//#define XMLDOC_MAX_AD_IDS 4
@ -85,20 +86,6 @@
#define XD_MAX_AD_IDS 5
class EmailInfo {
public:
SafeBuf m_toAddress;
SafeBuf m_fromAddress;
SafeBuf m_subject;
SafeBuf m_body;
char *m_dom; // ref into m_toAddress of the domain in email addr
SafeBuf m_mxDomain; // just the domain with a "gbmxrec-" prepended
void *m_state;
void (* m_callback ) (void *state);
// ip address of MX record for this domain
long m_mxIp;
};
double getTrafficPercent ( long rank ) ;
bool setLangVec ( class Words *words ,
@ -1534,14 +1521,12 @@ class XmlDoc {
SafeBuf *getDiffbotReply ( ) ;
//bool doesUrlMatchDiffbotCrawlPattern() ;
//bool doesUrlMatchDiffbotProcessPattern() ;
//bool doesPageContentMatchDiffbotProcessPattern() ;
bool doesPageContentMatchDiffbotProcessPattern() ;
char *hashJSON ( HashTableX *table );
long *nukeJSONObjects ( ) ;
long m_joc;
bool sendNotification ( );
EmailInfo m_emailInfo;
long m_notifyBlocked;
//
// functions and vars for the seo query matching tool