Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot
This commit is contained in:
commit
24e3b8cf52
@ -68,7 +68,7 @@
|
||||
#include "HashTableX.h"
|
||||
#include "RdbList.h"
|
||||
#include "Rdb.h" // for RdbBase
|
||||
|
||||
#include "PingServer.h" // EmailInfo
|
||||
|
||||
// how many counts are in CrawlInfo below????
|
||||
#define NUMCRAWLSTATS 8
|
||||
@ -94,6 +94,13 @@ class CrawlInfo {
|
||||
|
||||
long m_lastUpdateTime;
|
||||
|
||||
// this is non-zero if urls are available to be spidered right now.
|
||||
long m_hasUrlsReadyToSpider;
|
||||
|
||||
// have we sent out email/webhook notifications crawl has no urls
|
||||
// currently in the ready queue (doledb) to spider?
|
||||
char m_sentCrawlDoneAlert;
|
||||
|
||||
void reset() { memset ( this , 0 , sizeof(CrawlInfo) ); };
|
||||
//bool print (class SafeBuf *sb ) ;
|
||||
//bool setFromSafeBuf (class SafeBuf *sb ) ;
|
||||
@ -400,7 +407,7 @@ class CollectionRec {
|
||||
//SafeBuf m_diffbotApiList;//QueryString;
|
||||
//SafeBuf m_diffbotUrlCrawlPattern;
|
||||
//SafeBuf m_diffbotUrlProcessPattern;
|
||||
//SafeBuf m_diffbotPageProcessPattern;
|
||||
SafeBuf m_diffbotPageProcessPattern;
|
||||
//SafeBuf m_diffbotClassify;
|
||||
//char m_diffbotClassify;
|
||||
//char m_useDiffbot;
|
||||
@ -424,6 +431,7 @@ class CollectionRec {
|
||||
CrawlInfo m_globalCrawlInfo;
|
||||
// last time we computed global crawl info
|
||||
//time_t m_globalCrawlInfoUpdateTime;
|
||||
EmailInfo m_emailInfo;
|
||||
// for counting replies
|
||||
long m_replies;
|
||||
long m_requests;
|
||||
|
@ -129,7 +129,8 @@ bool HttpServer::getDoc ( char *url ,
|
||||
char *proto ,
|
||||
bool doPost ,
|
||||
char *cookie ,
|
||||
char *additionalHeader ) {
|
||||
char *additionalHeader ,
|
||||
char *fullRequest ) {
|
||||
// sanity
|
||||
if ( ip == -1 )
|
||||
log("http: you probably didn't mean to set ip=-1 did you? "
|
||||
@ -152,24 +153,37 @@ bool HttpServer::getDoc ( char *url ,
|
||||
tcp = &m_ssltcp;
|
||||
defPort = 443;
|
||||
}
|
||||
// this returns false and sets g_errno on error
|
||||
if ( ! r.set ( url , offset , size , ifModifiedSince ,
|
||||
userAgent , proto , doPost , cookie ,
|
||||
additionalHeader ) ) return true;
|
||||
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: httprequest = %s", r.getRequest());
|
||||
char *req = NULL;
|
||||
long reqSize;
|
||||
|
||||
// this returns false and sets g_errno on error
|
||||
if ( ! fullRequest ) {
|
||||
if ( ! r.set ( url , offset , size , ifModifiedSince ,
|
||||
userAgent , proto , doPost , cookie ,
|
||||
additionalHeader ) ) return true;
|
||||
reqSize = r.getRequestLen();
|
||||
req = (char *) mdup ( r.getRequest() , reqSize,"HttpServer");
|
||||
}
|
||||
else {
|
||||
// does not contain \0 i guess
|
||||
reqSize = gbstrlen(fullRequest);
|
||||
req = (char *) mdup ( fullRequest , reqSize,"HttpServer");
|
||||
}
|
||||
|
||||
// . get the request from the static buffer and dup it
|
||||
// . return true and set g_errno on error
|
||||
if ( ! req ) return true;
|
||||
|
||||
long hostLen ;
|
||||
long port = defPort;
|
||||
char *host = getHostFast ( url , &hostLen , &port );
|
||||
|
||||
|
||||
// . get the request from the static buffer and dup it
|
||||
// . return true and set g_errno on error
|
||||
long reqSize = r.getRequestLen();
|
||||
char *req = (char *) mdup ( r.getRequest() , reqSize,"HttpServer");
|
||||
if ( ! req ) return true;
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: httprequest = %s", req );
|
||||
|
||||
|
||||
// do we have an ip to send to? assume not
|
||||
if ( proxyIp ) { ip = proxyIp ; port = proxyPort; }
|
||||
// special NULL case
|
||||
|
@ -96,7 +96,9 @@ class HttpServer {
|
||||
char *proto = "HTTP/1.0" ,
|
||||
bool doPost = false ,
|
||||
char *cookie = NULL ,
|
||||
char *additionalHeader = NULL ); // does not include \r\n
|
||||
char *additionalHeader = NULL , // does not include \r\n
|
||||
// specify your own mime and post data here...
|
||||
char *fullRequest = NULL );
|
||||
|
||||
bool getDoc ( long ip,
|
||||
long port,
|
||||
|
2
Mem.cpp
2
Mem.cpp
@ -323,7 +323,7 @@ void * operator new [] (size_t size) throw (std::bad_alloc) {
|
||||
if ( g_mem.m_used + size >= g_mem.m_maxMem &&
|
||||
g_mem.m_maxMem > 1000000 ) {
|
||||
log("mem: new(%i): Out of memory.", size );
|
||||
throw bad_alloc();
|
||||
throw std::bad_alloc();
|
||||
//throw 1;
|
||||
}
|
||||
#ifdef _EFENCE_
|
||||
|
@ -1875,6 +1875,9 @@ static class HelpItem s_his[] = {
|
||||
{"notifyurl","Fetch this URL when crawl hits "
|
||||
"the maxtocrawl or maxtoprocess limit."},
|
||||
{"urt","Use robots.txt?"},
|
||||
{"pageprocesspattern","List of || separated strings. If the page "
|
||||
"contains any of these then we send it to diffbot for processing. "
|
||||
"If this is empty we send all pages to diffbot for processing."},
|
||||
//{"dbapilist","Special list of diffbot API urls. The URL Filters "
|
||||
// "will display these options in a drop down menu. "
|
||||
// "Example (unencoded): "
|
||||
@ -2056,16 +2059,30 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
cr->m_notifyEmail.set(email);
|
||||
cr->m_notifyEmail.nullTerm();
|
||||
}
|
||||
else {
|
||||
cr->m_notifyEmail.purge();
|
||||
}
|
||||
char *url = hr->getString("notifyurl",NULL,NULL);
|
||||
if ( url ) {
|
||||
cr->m_notifyUrl.set(url);
|
||||
cr->m_notifyUrl.nullTerm();
|
||||
}
|
||||
else {
|
||||
cr->m_notifyUrl.purge();
|
||||
}
|
||||
long pause = hr->getLong("pause",-1);
|
||||
if ( pause == 0 ) cr->m_spideringEnabled = 1;
|
||||
if ( pause == 1 ) cr->m_spideringEnabled = 0;
|
||||
long urt = hr->getLong("urt",-1);
|
||||
if ( urt != -1 ) cr->m_useRobotsTxt = urt;
|
||||
char *ppp = hr->getString("pageprocesspattern",NULL);
|
||||
if ( ppp ) {
|
||||
cr->m_diffbotPageProcessPattern.set(ppp);
|
||||
cr->m_diffbotPageProcessPattern.nullTerm();
|
||||
}
|
||||
else {
|
||||
cr->m_diffbotPageProcessPattern.purge();
|
||||
}
|
||||
// this is a cast, so just return simple response
|
||||
return g_httpServer.sendDynamicPage (socket,"OK",2);
|
||||
}
|
||||
@ -2804,6 +2821,15 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
|
||||
//
|
||||
//
|
||||
"<tr>"
|
||||
"<td><b>Page Process Pattern:</b> "
|
||||
"</td><td>"
|
||||
"<input type=text name=pageprocesspattern "
|
||||
"size=20 value=\"%s\"> "
|
||||
"<input type=submit name=submit value=OK>"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Max Page Download Successes:</b> "
|
||||
"</td><td>"
|
||||
@ -2881,6 +2907,8 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
, cr->m_coll
|
||||
, cr->m_coll
|
||||
|
||||
, cr->m_diffbotPageProcessPattern.getBufStart()
|
||||
|
||||
, cr->m_diffbotMaxToCrawl
|
||||
, cr->m_diffbotMaxToProcess
|
||||
|
||||
@ -3291,7 +3319,6 @@ CollectionRec *addNewDiffbotColl ( char *addColl , HttpRequest *hr ) {
|
||||
cr->m_diffbotApiQueryString.set ( apiQueryString );
|
||||
cr->m_diffbotUrlCrawlPattern.set ( urlCrawlPattern );
|
||||
cr->m_diffbotUrlProcessPattern.set ( urlProcessPattern );
|
||||
cr->m_diffbotPageProcessPattern.set ( pageProcessPattern );
|
||||
cr->m_diffbotClassify = classify;
|
||||
|
||||
// let's make these all NULL terminated strings
|
||||
@ -3303,7 +3330,9 @@ CollectionRec *addNewDiffbotColl ( char *addColl , HttpRequest *hr ) {
|
||||
cr->m_diffbotPageProcessPattern.nullTerm();
|
||||
*/
|
||||
|
||||
|
||||
// bring this back
|
||||
cr->m_diffbotPageProcessPattern.set ( "" );
|
||||
cr->m_diffbotPageProcessPattern.nullTerm();
|
||||
|
||||
// do not spider more than this many urls total. -1 means no max.
|
||||
cr->m_diffbotMaxToCrawl = 100000;
|
||||
|
@ -29,6 +29,7 @@
|
||||
#include "Placedb.h"
|
||||
#include "Sections.h"
|
||||
//#include "Msg0.h" // g_termlistCache
|
||||
#include "Msg13.h"
|
||||
|
||||
bool printNumAbbr ( SafeBuf &p, long long vvv ) {
|
||||
float val = (float)vvv;
|
||||
|
213
PingServer.cpp
213
PingServer.cpp
@ -2879,3 +2879,216 @@ bool gotMxIp ( EmailInfo *ei ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
static void gotMandrillReplyWrapper ( void *state , TcpSocket *s ) {
|
||||
EmailInfo *ei = (EmailInfo *)state;
|
||||
ei->m_callback ( ei->m_state );
|
||||
}
|
||||
|
||||
|
||||
// mailchimp http mail api
|
||||
bool sendEmailThroughMandrill ( class EmailInfo *ei ) {
|
||||
|
||||
// this is often set from XmlDoc.cpp::indexDoc()
|
||||
g_errno = 0;
|
||||
|
||||
SafeBuf sb;
|
||||
|
||||
// then the message to send
|
||||
sb.safePrintf(
|
||||
"POST /api/1.0/messages/send-template.json"
|
||||
" HTTP/1.0\r\n"
|
||||
"Accept: image/gif, image/x-xbitmap, image/jpeg, "
|
||||
"image/pjpeg, application/x-shockwave-flash, "
|
||||
"application/msword, */*\r\n"
|
||||
"Accept-Language: en-us\r\n"
|
||||
"Content-Type: application/x-www-form-urlencoded\r\n"
|
||||
"Accept-Encoding: gzip, deflate\r\n"
|
||||
"User-Agent: Mozilla/4.0 "
|
||||
"(compatible; MSIE 6.0; Windows 98; Win 9x 4.90)\r\n"
|
||||
"Host: mandrillapp.com\r\n" // www.t-mobile.com
|
||||
"Content-Length: xxx\r\n"
|
||||
//"Connection: Keep-Alive\r\n"
|
||||
"Connection: close\r\n"
|
||||
"Cookie: \r\n"
|
||||
"Cache-Control: no-cache\r\n\r\n"
|
||||
);
|
||||
//
|
||||
// post data
|
||||
//
|
||||
char *to = ei->m_toAddress.getBufStart();
|
||||
char *from = ei->m_fromAddress.getBufStart();
|
||||
|
||||
SafeBuf ub;
|
||||
sb.safePrintf( "{\"key\":\"GhWT0UpcVBl7kmumrt9dqg\","
|
||||
"\"template_name\":\"crawl-finished\","
|
||||
"\"template_content\": [],"
|
||||
"\"message\": {"
|
||||
"\"to\": ["
|
||||
"{"
|
||||
"\"email\":\"%s\""
|
||||
"}"
|
||||
"],"
|
||||
|
||||
"\"from_email\":\"%s\","
|
||||
"\"headers\": {"
|
||||
"\"Reply-To\":\"%s\""
|
||||
"},"
|
||||
"\"bcc_address\":\"%s\","
|
||||
"\"global_merge_vars\":["
|
||||
"{"
|
||||
"\"name\":\"CRAWLNAME\","
|
||||
"\"content\":\"%s\""
|
||||
"}"
|
||||
"]"
|
||||
"}"
|
||||
"}"
|
||||
, to
|
||||
, from
|
||||
, from
|
||||
, from
|
||||
, ei->m_cr->m_coll
|
||||
);
|
||||
ub.urlEncode();
|
||||
// append the post data to the full request
|
||||
sb.safeMemcpy ( &ub );
|
||||
// make sure ends in \0
|
||||
sb.nullTerm();
|
||||
|
||||
// gotta get the cookie
|
||||
char *uu = "https://mandrillapp.com/";
|
||||
if ( ! g_httpServer.getDoc ( uu,
|
||||
0, // ip
|
||||
0 , // offset
|
||||
-1 , // size
|
||||
false , // m_ifModifiedSince
|
||||
ei , // state
|
||||
gotMandrillReplyWrapper , //
|
||||
60*1000 , // timeout
|
||||
0 , // m_proxyIp
|
||||
0 , // m_proxyPort
|
||||
100*1024 , // m_maxTextDocLen
|
||||
100*1024 , // m_maxOtherDocLen
|
||||
NULL, // user agent
|
||||
"HTTP/1.0" , //proto
|
||||
true, // post?
|
||||
NULL, // cookie
|
||||
NULL, // additional header
|
||||
sb.getBufStart() ) ) // full requesst
|
||||
return false;
|
||||
// must have been an error
|
||||
log("net: Got error getting page from mandrill: %s.",
|
||||
mstrerror(g_errno));
|
||||
// ignore it
|
||||
g_errno = 0;
|
||||
// always call this at the end
|
||||
return true;
|
||||
}
|
||||
|
||||
/////////////////////////////
|
||||
//
|
||||
// send two notifications, email and webhook
|
||||
//
|
||||
/////////////////////////////
|
||||
|
||||
void doneSendingNotifyEmailWrapper ( void *state ) {
|
||||
EmailInfo *ei = (EmailInfo *)state;
|
||||
ei->m_notifyBlocked--;
|
||||
// error?
|
||||
log("build: email notification status: %s",mstrerror(g_errno));
|
||||
// ignore it for rest
|
||||
g_errno = 0;
|
||||
// wait for post url to get done
|
||||
if ( ei->m_notifyBlocked > 0 ) return;
|
||||
// unmark it
|
||||
ei->m_inUse = false;
|
||||
// all done
|
||||
ei->m_finalCallback ( ei->m_finalState );
|
||||
}
|
||||
|
||||
// Callback run when the webhook (notify url) fetch has finished.
// "state" is the EmailInfo driving the notification; the socket is not
// inspected. Once no notification requests remain outstanding, the
// EmailInfo is released and the caller's final callback is invoked.
void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
	EmailInfo *ei = (EmailInfo *)state;
	ei->m_notifyBlocked--;
	// error?
	log("build: url notification status: %s",mstrerror(g_errno));
	// ignore it for rest, just like doneSendingNotifyEmailWrapper()
	// does, so a failed webhook is not seen as an error by the final
	// callback
	g_errno = 0;
	// wait for email to get done
	if ( ei->m_notifyBlocked > 0 ) return;
	// unmark it
	ei->m_inUse = false;
	// all done
	ei->m_finalCallback ( ei->m_finalState );
}
|
||||
|
||||
// . return false if would block, true otherwise
|
||||
// . used to send email and get a url when a crawl hits a maxToCrawl
|
||||
// or maxToProcess limitation.
|
||||
bool sendNotification ( EmailInfo *ei ) {
|
||||
|
||||
if ( ei->m_inUse ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// caller must set this, as well as m_finalCallback/m_finalState
|
||||
CollectionRec *cr = ei->m_cr;
|
||||
|
||||
char *email = cr->m_notifyEmail.getBufStart();
|
||||
char *url = cr->m_notifyUrl.getBufStart();
|
||||
|
||||
// sanity check, can only call once
|
||||
if ( ei->m_notifyBlocked != 0 ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
ei->m_inUse = true;
|
||||
|
||||
if ( email && email[0] ) {
|
||||
log("build: sending email notification to %s for coll \"%s\"",
|
||||
email,cr->m_coll);
|
||||
SafeBuf msg;
|
||||
msg.safePrintf("Your crawl \"%s\" "
|
||||
"has hit a limitation and has "
|
||||
"been paused."
|
||||
, cr->m_coll);
|
||||
// use this
|
||||
ei->m_toAddress.safeStrcpy ( email );
|
||||
ei->m_toAddress.nullTerm();
|
||||
ei->m_fromAddress.safePrintf("support@diffbot.com");
|
||||
/*
|
||||
ei->m_subject.safePrintf("crawl paused");
|
||||
ei->m_body.safePrintf("Your crawl for collection \"%s\" "
|
||||
"has been paused because it hit "
|
||||
"a maxPagesToCrawl or maxPagesToProcess "
|
||||
"limitation."
|
||||
, cr->m_coll);
|
||||
*/
|
||||
ei->m_state = ei;//this;
|
||||
ei->m_callback = doneSendingNotifyEmailWrapper;
|
||||
// this will usually block, unless error maybe
|
||||
if ( ! sendEmailThroughMandrill ( ei ) )
|
||||
ei->m_notifyBlocked++;
|
||||
}
|
||||
|
||||
if ( url && url[0] ) {
|
||||
log("build: sending url notification to %s for coll \"%s\"",
|
||||
url,cr->m_coll);
|
||||
// GET request
|
||||
if ( ! g_httpServer.getDoc ( url ,
|
||||
0 , // ip
|
||||
0 , // offset
|
||||
-1 , // size
|
||||
false, // ifmodsince
|
||||
ei,//this ,
|
||||
doneGettingNotifyUrlWrapper ,
|
||||
60*1000 , // timeout
|
||||
0, // proxyip
|
||||
0 , // proxyport
|
||||
10000, // maxTextDocLen
|
||||
10000 // maxOtherDocLen
|
||||
) )
|
||||
ei->m_notifyBlocked++;
|
||||
}
|
||||
|
||||
if ( ei->m_notifyBlocked == 0 ) {
|
||||
ei->m_inUse = false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// we blocked, wait
|
||||
return false;
|
||||
}
|
||||
|
38
PingServer.h
38
PingServer.h
@ -5,7 +5,37 @@
|
||||
|
||||
#include "gb-include.h"
|
||||
#include "Hostdb.h"
|
||||
#include "Repair.h"
|
||||
//#include "Repair.h"
|
||||
|
||||
extern char g_repairMode;
|
||||
|
||||
|
||||
class EmailInfo {
|
||||
public:
|
||||
SafeBuf m_toAddress;
|
||||
SafeBuf m_fromAddress;
|
||||
SafeBuf m_subject;
|
||||
SafeBuf m_body;
|
||||
CollectionRec *m_cr;
|
||||
char *m_dom; // ref into m_toAddress of the domain in email addr
|
||||
SafeBuf m_mxDomain; // just the domain with a "gbmxrec-" prepended
|
||||
void *m_state;
|
||||
void (* m_callback ) (void *state);
|
||||
void *m_finalState;
|
||||
void (* m_finalCallback ) (void *state);
|
||||
// ip address of MX record for this domain
|
||||
long m_mxIp;
|
||||
long m_notifyBlocked;
|
||||
bool m_inUse;
|
||||
EmailInfo() {
|
||||
memset ( this,0,sizeof(EmailInfo) );
|
||||
};
|
||||
void reset() {
|
||||
if ( m_inUse ) { char *xx=NULL;*xx=0; }
|
||||
if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
|
||||
memset ( this,0,sizeof(EmailInfo) );
|
||||
};
|
||||
};
|
||||
|
||||
class PingServer {
|
||||
|
||||
@ -135,5 +165,11 @@ extern class PingServer g_pingServer;
|
||||
// . use this for sending generic emails
|
||||
bool sendEmail ( class EmailInfo *ei ) ;
|
||||
|
||||
// use mailchimp's mandrill email http api
|
||||
bool sendEmailThroughMandrill ( class EmailInfo *ei ) ;
|
||||
|
||||
// send email and webhook notification
|
||||
bool sendNotification ( class EmailInfo *ei );
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -2,6 +2,8 @@
|
||||
|
||||
#include "Proxy.h"
|
||||
#include "Statsdb.h"
|
||||
#include "Msg13.h"
|
||||
#include "XmlDoc.h"
|
||||
//#include "seo.h" // g_secret_tran_key and api_key
|
||||
|
||||
|
||||
|
@ -576,7 +576,7 @@ bool RdbDump::dumpList ( RdbList *list , long niceness , bool recall ) {
|
||||
//m_bytesWritten = 0;
|
||||
|
||||
// sanity check
|
||||
log("dump: writing %li bytes at offset %lli",m_bytesToWrite,offset);
|
||||
//log("dump: writing %li bytes at offset %lli",m_bytesToWrite,offset);
|
||||
|
||||
// . if we're called by RdbMerge directly use m_callback/m_state
|
||||
// . otherwise, use doneWritingWrapper() which will call dumpTree()
|
||||
|
83
Spider.cpp
83
Spider.cpp
@ -3880,9 +3880,15 @@ void SpiderLoop::spiderDoledUrls ( ) {
|
||||
if ( m_cri >= g_collectiondb.m_numRecs ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// grab this
|
||||
collnum_t collnum = m_cri;
|
||||
//collnum_t collnum = m_cri;
|
||||
//CollectionRec *cr = g_collectiondb.m_recs[collnum];
|
||||
|
||||
// update the crawlinfo for this collection if it has been a while.
|
||||
// should never block since callback is NULL.
|
||||
if ( ! updateCrawlInfo(cr,NULL,NULL,true) ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// get this
|
||||
char *coll = g_collectiondb.m_recs[collnum]->m_coll;
|
||||
char *coll = cr->m_coll;
|
||||
|
||||
// need this for msg5 call
|
||||
key_t endKey; endKey.setMax();
|
||||
@ -9188,8 +9194,16 @@ bool updateCrawlInfo ( CollectionRec *cr ,
|
||||
long now = getTimeLocal();
|
||||
// keep it fresh within 1 second
|
||||
long thresh = 1;
|
||||
// if being called from spiderloop, we just want to keep
|
||||
// CrawlInfo::m_nextSpiderTime fresh
|
||||
if ( ! callback ) thresh = 60;
|
||||
// unless cluster is big
|
||||
if ( g_hostdb.m_numHosts > 32 ) thresh = 30;
|
||||
if ( g_hostdb.m_numHosts > 32 ) {
|
||||
// update every 30 seconds
|
||||
thresh = 30;
|
||||
// if doing a passive refresh though...
|
||||
if ( ! callback ) thresh = 120;
|
||||
}
|
||||
|
||||
if ( useCache && now - cr->m_globalCrawlInfo.m_lastUpdateTime <thresh)
|
||||
return true;
|
||||
@ -9208,7 +9222,13 @@ bool updateCrawlInfo ( CollectionRec *cr ,
|
||||
|
||||
// if we were not the first, we do not initiate it, we just wait
|
||||
// for all the replies to come back
|
||||
if ( cr->m_replies < cr->m_requests ) return false;
|
||||
if ( cr->m_replies < cr->m_requests ) {
|
||||
// unless we had no callback! we do that in SpiderLoop above
|
||||
// to keep the crawl info fresh.
|
||||
if ( ! callback ) return true;
|
||||
// otherwise, block and we'll call your callback when done
|
||||
return false;
|
||||
}
|
||||
|
||||
// sanity test
|
||||
if ( cr->m_replies > cr->m_requests ) { char *xx=NULL;*xx=0; }
|
||||
@ -9259,6 +9279,15 @@ bool updateCrawlInfo ( CollectionRec *cr ,
|
||||
return true;
|
||||
}
|
||||
|
||||
void doneSendingNotification ( void *state ) {
|
||||
EmailInfo *ei = (EmailInfo *)state;
|
||||
log("spider: done sending notifications for coll=%s",
|
||||
ei->m_cr->m_coll);
|
||||
// mark it as sent. anytime a new url is spidered will mark this
|
||||
// as false again! use LOCAL crawlInfo, since global is reset often.
|
||||
ei->m_cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 1;
|
||||
}
|
||||
|
||||
void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
|
||||
// reply is error?
|
||||
if ( ! slot->m_readBuf || g_errno ) {
|
||||
@ -9288,6 +9317,11 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
|
||||
gs++;
|
||||
ss++;
|
||||
}
|
||||
if ( stats->m_hasUrlsReadyToSpider ) {
|
||||
cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider++;
|
||||
// unflag the sent flag if we had sent an alert
|
||||
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
|
||||
}
|
||||
}
|
||||
// return if still waiting on more to come in
|
||||
if ( cr->m_replies < cr->m_requests ) return;
|
||||
@ -9320,6 +9354,9 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
|
||||
p += sizeof(CallbackEntry2);
|
||||
// clear g_errno just in case
|
||||
g_errno = 0;
|
||||
// this is NULL when called from SpiderLoop::spiderDoledUrls()
|
||||
// because that is just updating it for maintenance
|
||||
if ( ! ce2->m_callback ) continue;
|
||||
// debug note
|
||||
//XmlDoc *xd = (XmlDoc *)(ce2->m_state);
|
||||
//log("spider: calling crawlupdate callback for %s",
|
||||
@ -9335,6 +9372,34 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
|
||||
|
||||
// save the mem!
|
||||
cr->m_callbackQueue.purge();
|
||||
|
||||
// now if its the first time a crawl has no rec to spider for
|
||||
// a while, we want to send an alert to the user so they know their
|
||||
// crawl is done.
|
||||
|
||||
// only host #0 sends alaerts
|
||||
if ( g_hostdb.getMyHost()->m_hostId != 0 ) return;
|
||||
|
||||
// but of course if it has urls ready to spider, do not send alert
|
||||
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) return;
|
||||
|
||||
// if we already sent it return now. we set this to false everytime
|
||||
// we spider a url, which resets it. use local crawlinfo for this
|
||||
// since we reset global.
|
||||
if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ) return;
|
||||
|
||||
// ok, send it
|
||||
EmailInfo *ei = &cr->m_emailInfo;
|
||||
|
||||
// in use already?
|
||||
if ( ei->m_inUse ) return;
|
||||
|
||||
// set it up
|
||||
ei->m_finalCallback = doneSendingNotification;
|
||||
ei->m_finalState = ei;
|
||||
ei->m_cr = cr;
|
||||
|
||||
sendNotification ( ei );
|
||||
}
|
||||
|
||||
void handleRequestc1 ( UdpSlot *slot , long niceness ) {
|
||||
@ -9343,6 +9408,16 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
|
||||
if ( slot->m_readBufSize != sizeof(collnum_t) ) { char *xx=NULL;*xx=0;}
|
||||
collnum_t collnum = *(collnum_t *)request;
|
||||
CollectionRec *cr = g_collectiondb.getRec(collnum);
|
||||
|
||||
// while we are here update CrawlInfo::m_nextSpiderTime
|
||||
// to the time of the next spider request to spider.
|
||||
// if doledb is empty and the next rec in the waiting tree
|
||||
// does not have a time of zero, but rather, in the future, then
|
||||
// return that future time. so if a crawl is enabled we should
|
||||
// actively call updateCrawlInfo a collection every minute or
|
||||
// so.
|
||||
|
||||
|
||||
char *reply = slot->m_tmpBuf;
|
||||
if ( TMPBUFSIZE < sizeof(CrawlInfo) ) { char *xx=NULL;*xx=0; }
|
||||
memcpy ( reply , &cr->m_localCrawlInfo , sizeof(CrawlInfo) );
|
||||
|
1
Test.cpp
1
Test.cpp
@ -19,6 +19,7 @@
|
||||
#include "Process.h"
|
||||
#include "Placedb.h"
|
||||
#include "Threads.h"
|
||||
#include "Msge1.h"
|
||||
|
||||
//static void testWrapper ( int fd , void *state ) ;
|
||||
static void injectedWrapper ( void *state ) ;
|
||||
|
170
XmlDoc.cpp
170
XmlDoc.cpp
@ -44,7 +44,8 @@
|
||||
#include "Highlight.h"
|
||||
#include "Wiktionary.h"
|
||||
#include "seo.h" // Msg99Request etc.
|
||||
#include <regex.h>
|
||||
//#include <regex.h>
|
||||
#include "PingServer.h"
|
||||
|
||||
#define MAXDOCLEN (1024*1024)
|
||||
|
||||
@ -162,7 +163,7 @@ XmlDoc::XmlDoc() {
|
||||
m_numMsg4fRequests = 0;
|
||||
m_numMsg4fReplies = 0;
|
||||
m_sentMsg4fRequests = false;
|
||||
m_notifyBlocked = 0;
|
||||
//m_notifyBlocked = 0;
|
||||
//m_mcasts = NULL;
|
||||
//for ( long i = 0 ; i < g_hostdb.m_numHosts ; i++ )
|
||||
// m_currentBinPtrs[i] = NULL;
|
||||
@ -180,7 +181,7 @@ static long long s_lastTimeStart = 0LL;
|
||||
void XmlDoc::reset ( ) {
|
||||
|
||||
// notifications pending?
|
||||
if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
|
||||
//if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
m_loaded = false;
|
||||
|
||||
@ -1956,8 +1957,15 @@ bool XmlDoc::indexDoc2 ( ) {
|
||||
if ( ! m_cr->m_spideringEnabled ) return true;
|
||||
// do not repeat call sendNotification()
|
||||
m_cr->m_spideringEnabled = false;
|
||||
// set this
|
||||
m_emailInfo.reset();
|
||||
m_emailInfo.m_finalCallback = m_masterLoop;
|
||||
m_emailInfo.m_finalState = m_masterState;
|
||||
m_emailInfo.m_cr = m_cr;
|
||||
// note it
|
||||
setStatus("sending notification");
|
||||
// this returns false if it would block, so we ret fals
|
||||
if ( ! sendNotification() ) return false;
|
||||
if ( ! sendNotification ( &m_emailInfo ) ) return false;
|
||||
// it didn't block
|
||||
g_errno = m_indexCode;
|
||||
return true;
|
||||
@ -1980,8 +1988,16 @@ bool XmlDoc::indexDoc2 ( ) {
|
||||
if ( ! m_cr->m_spideringEnabled ) return true;
|
||||
// turn them off and send notification (email or url)
|
||||
m_cr->m_spideringEnabled = false;
|
||||
// this returns false if it would block, so we ret fals
|
||||
if ( ! sendNotification() ) return false;
|
||||
// set this
|
||||
m_emailInfo.reset();
|
||||
m_emailInfo.m_finalCallback = m_masterLoop;
|
||||
m_emailInfo.m_finalState = m_masterState;
|
||||
m_emailInfo.m_cr = m_cr;
|
||||
// note it
|
||||
setStatus("sending notification");
|
||||
// . this returns false if it would block, so we ret fals
|
||||
// . this is now in PingServer.cpp
|
||||
if ( ! sendNotification( &m_emailInfo ) ) return false;
|
||||
// it didn't block
|
||||
g_errno = m_indexCode;
|
||||
return true;
|
||||
@ -12015,12 +12031,6 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
|
||||
// return &m_diffbotReply;
|
||||
//}
|
||||
|
||||
// or if original page content matches the page regex dont hit diffbot
|
||||
//if( m_useDiffbot && ! doesPageContentMatchDiffbotProcessPattern() ) {
|
||||
// m_diffbotReplyValid = true;
|
||||
// return &m_diffbotReply;
|
||||
//}
|
||||
|
||||
// empty content, do not send to diffbot then
|
||||
char **u8 = getUtf8Content();
|
||||
if ( ! u8 || u8 == (char **)-1 ) return (SafeBuf *)u8;
|
||||
@ -12039,6 +12049,12 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
|
||||
}
|
||||
|
||||
|
||||
// or if original page content matches the page regex dont hit diffbot
|
||||
if ( ! doesPageContentMatchDiffbotProcessPattern() ) {
|
||||
m_diffbotReplyValid = true;
|
||||
return &m_diffbotReply;
|
||||
}
|
||||
|
||||
setStatus("getting diffbot reply");
|
||||
|
||||
//char *path = "api";
|
||||
@ -17108,6 +17124,45 @@ bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() {
|
||||
}
|
||||
*/
|
||||
|
||||
bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() {
|
||||
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
|
||||
char *p = m_cr->m_diffbotPageProcessPattern.getBufStart();
|
||||
// how many did we have?
|
||||
long count = 0;
|
||||
// scan the " || " separated substrings
|
||||
for ( ; *p ; ) {
|
||||
// get beginning of this string
|
||||
char *start = p;
|
||||
// skip white space
|
||||
while ( *start && is_wspace_a(*start) ) start++;
|
||||
// done?
|
||||
if ( ! *start ) break;
|
||||
// find end of it
|
||||
char *end = start;
|
||||
while ( *end && end[0] != '|' && ! is_wspace_a(end[0]) )
|
||||
end++;
|
||||
// advance p for next guy
|
||||
p = end;
|
||||
while ( *p && (*p=='|' || is_wspace_a(*p) ) ) p++;
|
||||
// temp null this
|
||||
char c = *end;
|
||||
*end = '\0';
|
||||
// count it as an attempt
|
||||
count++;
|
||||
// . is this substring anywhere in the document
|
||||
// . check the rawest content before converting to utf8 i guess
|
||||
char *foundPtr = strstr ( m_content , start ) ;
|
||||
// revert \0
|
||||
*end = c;
|
||||
// did we find it?
|
||||
if ( foundPtr ) return true;
|
||||
}
|
||||
// if we had no attempts, it is ok
|
||||
if ( count == 0 ) return true;
|
||||
// if we had an unfound substring...
|
||||
return false;
|
||||
}
|
||||
|
||||
// . returns ptr to status
|
||||
// . diffbot uses this to remove the indexed json pages associated with
|
||||
// a url. each json object is basically its own url. a json object
|
||||
@ -41951,94 +42006,3 @@ char *XmlDoc::hashJSON ( HashTableX *table ) {
|
||||
|
||||
return (char *)0x01;
|
||||
}
|
||||
|
||||
void doneSendingNotifyEmailWrapper ( void *state ) {
|
||||
XmlDoc *THIS = (XmlDoc *)state;
|
||||
THIS->m_notifyBlocked--;
|
||||
// error?
|
||||
log("build: email notification status: %s",mstrerror(g_errno));
|
||||
// ignore it for rest
|
||||
g_errno = 0;
|
||||
// wait for post url to get done
|
||||
if ( THIS->m_notifyBlocked > 0 ) return;
|
||||
// all done
|
||||
THIS->m_masterLoop ( THIS->m_masterState );
|
||||
}
|
||||
|
||||
void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
|
||||
XmlDoc *THIS = (XmlDoc *)state;
|
||||
THIS->m_notifyBlocked--;
|
||||
// error?
|
||||
log("build: url notification status: %s",mstrerror(g_errno));
|
||||
// wait for post url to get done
|
||||
if ( THIS->m_notifyBlocked > 0 ) return;
|
||||
// all done
|
||||
THIS->m_masterLoop ( THIS->m_masterState );
|
||||
}
|
||||
|
||||
#include "PingServer.h" // sendEmail() function
|
||||
|
||||
// . return false if would block, true otherwise
|
||||
// . used to send email and get a url when a crawl hits a maxToCrawl
|
||||
// or maxToProcess limitation.
|
||||
bool XmlDoc::sendNotification ( ) {
|
||||
|
||||
setStatus("sending notification");
|
||||
|
||||
char *email = m_cr->m_notifyEmail.getBufStart();
|
||||
char *url = m_cr->m_notifyUrl.getBufStart();
|
||||
|
||||
// sanity check, can only call once
|
||||
if ( m_notifyBlocked != 0 ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
if ( email && email[0] ) {
|
||||
log("build: sending email notification to %s for coll \"%s\"",
|
||||
email,m_cr->m_coll);
|
||||
SafeBuf msg;
|
||||
msg.safePrintf("Your crawl \"%s\" "
|
||||
"has hit a limitation and has "
|
||||
"been paused."
|
||||
, m_cr->m_coll);
|
||||
// use this
|
||||
EmailInfo *ei = &m_emailInfo;
|
||||
ei->m_toAddress.safeStrcpy ( email );
|
||||
ei->m_toAddress.nullTerm();
|
||||
ei->m_fromAddress.safePrintf("support@diffbot.com");
|
||||
ei->m_subject.safePrintf("crawl paused");
|
||||
ei->m_body.safePrintf("Your crawl for collection \"%s\" "
|
||||
"has been paused because it hit "
|
||||
"a maxPagesToCrawl or maxPagesToProcess "
|
||||
"limitation."
|
||||
, m_cr->m_coll);
|
||||
ei->m_state = this;
|
||||
ei->m_callback = doneSendingNotifyEmailWrapper;
|
||||
// this will usually block, unless error maybe
|
||||
if ( ! sendEmail ( ei ) )
|
||||
m_notifyBlocked++;
|
||||
}
|
||||
|
||||
if ( url && url[0] ) {
|
||||
log("build: sending url notification to %s for coll \"%s\"",
|
||||
url,m_cr->m_coll);
|
||||
// GET request
|
||||
if ( ! g_httpServer.getDoc ( url ,
|
||||
0 , // ip
|
||||
0 , // offset
|
||||
-1 , // size
|
||||
false, // ifmodsince
|
||||
this ,
|
||||
doneGettingNotifyUrlWrapper ,
|
||||
60*1000 , // timeout
|
||||
0, // proxyip
|
||||
0 , // proxyport
|
||||
10000, // maxTextDocLen
|
||||
10000 // maxOtherDocLen
|
||||
) )
|
||||
m_notifyBlocked++;
|
||||
}
|
||||
|
||||
if ( m_notifyBlocked == 0 ) return true;
|
||||
|
||||
// we blocked, wait
|
||||
return false;
|
||||
}
|
||||
|
19
XmlDoc.h
19
XmlDoc.h
@ -64,6 +64,7 @@
|
||||
#include "Spider.h" // SpiderRequest/SpiderReply definitions
|
||||
#include "HttpMime.h" // ET_DEFLAT
|
||||
#include "Msg1.h"
|
||||
#include "PingServer.h"
|
||||
//#include "PageCrawlBot.h" // DBA_NONE
|
||||
|
||||
//#define XMLDOC_MAX_AD_IDS 4
|
||||
@ -85,20 +86,6 @@
|
||||
|
||||
#define XD_MAX_AD_IDS 5
|
||||
|
||||
class EmailInfo {
|
||||
public:
|
||||
SafeBuf m_toAddress;
|
||||
SafeBuf m_fromAddress;
|
||||
SafeBuf m_subject;
|
||||
SafeBuf m_body;
|
||||
char *m_dom; // ref into m_toAddress of the domain in email addr
|
||||
SafeBuf m_mxDomain; // just the domain with a "gbmxrec-" prepended
|
||||
void *m_state;
|
||||
void (* m_callback ) (void *state);
|
||||
// ip address of MX record for this domain
|
||||
long m_mxIp;
|
||||
};
|
||||
|
||||
double getTrafficPercent ( long rank ) ;
|
||||
|
||||
bool setLangVec ( class Words *words ,
|
||||
@ -1534,14 +1521,12 @@ class XmlDoc {
|
||||
SafeBuf *getDiffbotReply ( ) ;
|
||||
//bool doesUrlMatchDiffbotCrawlPattern() ;
|
||||
//bool doesUrlMatchDiffbotProcessPattern() ;
|
||||
//bool doesPageContentMatchDiffbotProcessPattern() ;
|
||||
bool doesPageContentMatchDiffbotProcessPattern() ;
|
||||
char *hashJSON ( HashTableX *table );
|
||||
long *nukeJSONObjects ( ) ;
|
||||
long m_joc;
|
||||
|
||||
bool sendNotification ( );
|
||||
EmailInfo m_emailInfo;
|
||||
long m_notifyBlocked;
|
||||
|
||||
//
|
||||
// functions and vars for the seo query matching tool
|
||||
|
Loading…
x
Reference in New Issue
Block a user