Better crawl status reporting.

Allow '_' (underscore) in collection names.
Matt Wells 2013-10-30 10:00:46 -07:00
parent a1ac5a5348
commit adf4d258ae
8 changed files with 213 additions and 112 deletions

@@ -7,6 +7,7 @@
#include "Threads.h"
#include "Datedb.h"
#include "Timedb.h"
#include "Spider.h"
static CollectionRec g_default;
@@ -29,8 +30,8 @@ CollectionRec::CollectionRec() {
m_overflow = 0x12345678;
m_overflow2 = 0x12345678;
// the spiders are currently uninhibited i guess
m_spiderStatus = 0;
m_spiderStatusMsg = NULL;
m_spiderStatus = SP_INITIALIZING; // this is 0
//m_spiderStatusMsg = NULL;
// for Url::getSite()
m_updateSiteRulesTable = 1;
m_lastUpdateTime = 0LL;

@@ -85,7 +85,7 @@ class CrawlInfo {
long long m_objectsDeleted; // 1
long long m_objectsAdded; // 2
long long m_urlsConsidered; // 3
long long m_urlsConsideredNOTUSED; // 3
long long m_pageDownloadAttempts; // 4
long long m_pageDownloadSuccesses; // 5
long long m_pageProcessAttempts; // 6
@@ -304,7 +304,7 @@ class CollectionRec {
long m_maxQueryTerms;
char m_spiderStatus;
char *m_spiderStatusMsg;
//char *m_spiderStatusMsg;
// Language stuff
float m_languageUnknownWeight;

@@ -185,6 +185,7 @@ bool Collectiondb::addRec ( char *coll , char *cpc , long cpclen , bool isNew ,
for ( ; *p ; p++ ) {
if ( is_alnum_a(*p) ) continue;
if ( *p == '-' ) continue;
if ( *p == '_' ) continue; // underscore now allowed
break;
}
if ( *p ) {
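For reference, a minimal standalone sketch of the loosened name check (hypothetical helper, not in the diff; assumes is_alnum_a() is the codebase's ASCII alnum test):

    // returns true if every character is alphanumeric, '-' or '_'
    static bool isValidCollName ( char *coll ) {
            for ( char *p = coll ; *p ; p++ ) {
                    if ( is_alnum_a(*p) ) continue; // a-z A-Z 0-9
                    if ( *p == '-' ) continue;      // dash allowed
                    if ( *p == '_' ) continue;      // underscore now allowed
                    return false;                   // anything else is illegal
            }
            return true;
    }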
@@ -774,8 +775,8 @@ bool Collectiondb::resetColl ( char *coll , bool resetTurkdb ) {
cr->m_spiderRoundNum = 0;
cr->m_spiderRoundStartTime = 0;
cr->m_spiderStatus = 0;
cr->m_spiderStatusMsg = NULL;
cr->m_spiderStatus = SP_INITIALIZING; // this is 0
//cr->m_spiderStatusMsg = NULL;
// reset seed buf
cr->m_diffbotSeeds.purge();

@@ -791,6 +791,41 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
return true;
}
//long pathLen = hr->getPathLen();
char rdbId = RDB_NONE;
bool downloadJSON = false;
long fmt;
char *xx;
if ( ( xx = strstr ( path , "_data.json" ) ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_JSON;
downloadJSON = true;
}
else if ( ( xx = strstr ( path , "_data.xml" ) ) ) {
rdbId = RDB_TITLEDB;
downloadJSON = true;
fmt = FMT_XML;
}
else if ( ( xx = strstr ( path , "_urls.csv" ) ) ) {
rdbId = RDB_SPIDERDB;
fmt = FMT_CSV;
}
else if ( ( xx = strstr ( path , "_pages.txt" ) ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_TXT;
}
// sanity, must be one of 3 download calls
if ( rdbId == RDB_NONE ) {
char *msg ;
msg = "usage: downloadurls, downloadpages, downloaddata";
log("crawlbot: %s",msg);
g_httpServer.sendErrorReply(sock,500,msg);
return true;
}
char *coll = str + 10;
if ( coll >= pathEnd ) {
char *msg = "bad download request2";
@@ -799,14 +834,8 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
return true;
}
char *collEnd = strstr ( coll , "_");
if ( ! collEnd ) {
char *msg = "bad download request3";
log("crawlbot: %s",msg);
g_httpServer.sendErrorReply(sock,500,msg);
return true;
}
// get coll
char *collEnd = xx;
//CollectionRec *cr = getCollRecFromHttpRequest ( hr );
CollectionRec *cr = g_collectiondb.getRec ( coll , collEnd - coll );
@@ -817,29 +846,6 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
return true;
}
//long pathLen = hr->getPathLen();
char rdbId = RDB_NONE;
bool downloadJSON = false;
long fmt;
if ( strstr ( path , "_data.json" ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_JSON;
downloadJSON = true;
}
if ( strstr ( path , "_data.xml" ) ) {
rdbId = RDB_TITLEDB;
downloadJSON = true;
fmt = FMT_XML;
}
else if ( strstr ( path , "_urls.csv" ) ) {
rdbId = RDB_SPIDERDB;
fmt = FMT_CSV;
}
else if ( strstr ( path , "_pages.txt" ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_TXT;
}
//if ( strncmp ( path ,"/crawlbot/downloadurls",22 ) == 0 )
@@ -851,14 +857,6 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
// rdbId = RDB_TITLEDB;
//}
// sanity, must be one of 3 download calls
if ( rdbId == RDB_NONE ) {
char *msg ;
msg = "usage: downloadurls, downloadpages, downloaddata";
log("crawlbot: %s",msg);
g_httpServer.sendErrorReply(sock,500,msg);
return true;
}
StateCD *st;
try { st = new (StateCD); }
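Moving the suffix dispatch above the collection-name parsing lets the matched suffix pointer (xx) double as the name terminator, which is what makes underscores inside collection names safe here: the removed strstr ( coll , "_" ) above truncated the name at its first underscore. Illustrative request paths (hypothetical collection my_coll; the /crawlbot/download/ prefix is assumed from the str + 10 offset):

    /crawlbot/download/my_coll_data.json  -> RDB_TITLEDB  , FMT_JSON
    /crawlbot/download/my_coll_data.xml   -> RDB_TITLEDB  , FMT_XML
    /crawlbot/download/my_coll_urls.csv   -> RDB_SPIDERDB , FMT_CSV
    /crawlbot/download/my_coll_pages.txt  -> RDB_TITLEDB  , FMT_TXT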
@@ -1268,7 +1266,8 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
long nowGlobalMS = gettimeofdayInMillisecondsGlobal();
CollectionRec *cr = g_collectiondb.getRec(m_collnum);
long lastSpidered = 0;
// parse through it
for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) {
// this record is either a SpiderRequest or SpiderReply
@@ -1279,7 +1278,12 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
// spiderrequests for the same url
if ( g_spiderdb.isSpiderReply ( (key128_t *)rec ) ) {
srep = (SpiderReply *)rec;
if ( sreq ) lastSpidered = 0;
sreq = NULL;
if ( lastSpidered == 0 )
lastSpidered = srep->m_spideredTime;
else if ( srep->m_spideredTime > lastSpidered )
lastSpidered = srep->m_spideredTime;
prevReplyUh48 = srep->getUrlHash48();
// 0 means indexed successfully. not sure if
// this includes http status codes like 404 etc.
@@ -1307,6 +1311,12 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
if ( ! printIt ) continue;
lastUh48 = uh48;
// make sure spiderreply is for the same url!
if ( srep && srep->getUrlHash48() != sreq->getUrlHash48() )
srep = NULL;
if ( ! srep )
lastSpidered = 0;
// debug point
//if ( strstr(sreq->m_url,"chief") )
// log("hey");
@@ -1382,12 +1392,14 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
);
// but default to csv
else {
sb->safePrintf("\"%s\",%lu,\"%s\",\"%s\",\""
sb->safePrintf("\"%s\",%lu,%lu,\"%s\",\"%s\",\""
//",%s"
//"\n"
, sreq->m_url
// when was it first added to spiderdb?
, sreq->m_addedTime
// last time spidered, 0 if none
, lastSpidered
//, status
, msg
// the url filter expression it matches
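An illustrative CSV row under the new format (all values hypothetical): url, time first added to spiderdb, time last spidered (0 if never), status message, then the matching url filter expression; the row continues with further quoted fields:

    "http://example.com/page.html",1383150000,1383153600,"Done","default",...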
@@ -2326,8 +2338,11 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
// collectionrec must be non-null at this point. i.e. we added it
if ( ! cr ) {
char *msg = "Crawl name was not found.";
if ( name && name[0] )
msg = "Failed to add crawl. Crawl name is illegal.";
//log("crawlbot: no collection found. need to add a crawl");
return sendErrorReply2(socket,fmt,"no crawls found. add one.");
return sendErrorReply2(socket,fmt, msg);
}
//char *spots = hr->getString("spots",NULL,NULL);
@@ -2727,36 +2742,9 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//if ( cx->m_collectionNameAlias.length() > 0 )
// alias=cx->m_collectionNameAlias.getBufStart();
//long paused = 1;
char *ss = "Crawl in progress.";
if ( cx->m_spiderStatusMsg )
ss = cx->m_spiderStatusMsg;
// 0 means not to RE-crawl
char tmp[256];
// indicate if we are WAITING for next round...
if ( cx->m_collectiveRespiderFrequency > 0.0 &&
getTimeGlobal() < cx->m_spiderRoundStartTime ) {
long now = getTimeGlobal();
sprintf(tmp,"Next crawl round to start in %li "
"seconds.",
cx->m_spiderRoundStartTime - now
);
ss = tmp;
}
// if we sent an email simply because no urls
// were left and we are not recrawling!
if ( cx->m_collectiveRespiderFrequency == 0.0 &&
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
ss = "Crawl has completed and no "
"repeatCrawl is scheduled.";
}
if ( ! cx->m_spideringEnabled )
ss = "Crawl paused.";
// if spiderdb is empty for this coll, then no url
// has been added to spiderdb yet.. either seed or spot
CrawlInfo *cg = &cx->m_globalCrawlInfo;
if ( cg->m_pageDownloadAttempts == 0 )
ss = "Crawl is initializing.";
SafeBuf tmp;
long crawlStatus = -1;
getSpiderStatusMsg ( cx , &tmp , &crawlStatus );
CrawlInfo *ci = &cx->m_localCrawlInfo;
long sentAlert = (long)ci->m_sentCrawlDoneAlert;
@@ -2766,7 +2754,9 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"\"name\":\"%s\",\n"
//"\"alias\":\"%s\",\n"
//"\"crawlingEnabled\":%li,\n"
"\"crawlStatus\":\"%s\",\n"
"\"crawlStatus\":{"
"\"status\":%li,"
"\"message\":\"%s\"},\n"
"\"sentCrawlDoneNotification\":%li,\n"
//"\"crawlingPaused\":%li,\n"
"\"objectsFound\":%lli,\n"
@@ -2789,7 +2779,8 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
, cx->m_diffbotCrawlName.getBufStart()
//, alias
//, (long)cx->m_spideringEnabled
, ss
, crawlStatus
, tmp.getBufStart()
, sentAlert
//, (long)paused
, cx->m_globalCrawlInfo.m_objectsAdded -
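With this change the JSON status becomes a structured object instead of a flat string; an illustrative fragment of the resulting response (values hypothetical; status 7 is SP_INPROGRESS per the codes added in Spider.h below):

    "crawlStatus":{"status":7,"message":"Crawl is in progress."},
    "sentCrawlDoneNotification":0,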

@@ -3070,10 +3070,10 @@ bool sendNotification ( EmailInfo *ei ) {
if ( email && email[0] ) {
log("build: sending email notification to %s for "
"crawl \"%s\" : %s",
email,crawl,ei->m_spiderStatusMsg);
email,crawl,ei->m_spiderStatusMsg.getBufStart());
SafeBuf msg;
msg.safePrintf("Your crawl \"%s\" has a new status: %s"
, ei->m_spiderStatusMsg
, ei->m_spiderStatusMsg.getBufStart()
, crawl );
// reset m_length otherwise it builds up
@@ -3110,7 +3110,7 @@ bool sendNotification ( EmailInfo *ei ) {
"X-Crawl-Status: %s"// \r\n" // hdrs
, cr->m_diffbotCrawlName.getBufStart()
, ei->m_spiderStatusMsg
, ei->m_spiderStatusMsg.getBufStart()
);
// GET request
if ( ! g_httpServer.getDoc ( url ,

@@ -16,7 +16,8 @@ public:
SafeBuf m_fromAddress;
SafeBuf m_subject;
SafeBuf m_body;
char *m_spiderStatusMsg;
//char *m_spiderStatusMsg;
SafeBuf m_spiderStatusMsg;
//CollectionRec *m_cr;
collnum_t m_collnum;
char *m_dom; // ref into m_toAddress of the domain in email addr

@@ -2412,7 +2412,9 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
// log it
if ( m_numAdded )
log("spider: added %li recs to waiting tree from "
"scan of %lli bytes",m_numAdded,m_numBytesScanned);
"scan of %lli bytes coll=%s",
m_numAdded,m_numBytesScanned,
m_cr->m_coll);
// reset the count for next scan
m_numAdded = 0 ;
m_numBytesScanned = 0;
@@ -3962,11 +3964,6 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
g_spiderLoop.spiderDoledUrls( );
}
#define SP_MAXROUNDS 1
#define SP_MAXTOCRAWL 2
#define SP_MAXTOPROCESS 3
#define SP_ROUNDDONE 4
void doneSendingNotification ( void *state ) {
EmailInfo *ei = (EmailInfo *)state;
collnum_t collnum = ei->m_collnum;
@@ -3982,6 +3979,9 @@
// pingserver.cpp sets this
//ei->m_inUse = false;
log("spider: setting current spider status to %li",
(long)cr->m_spiderStatus);
// mark it as sent. anytime a new url is spidered will mark this
// as false again! use LOCAL crawlInfo, since global is reset often.
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = cr->m_spiderStatus;//1;
@@ -4064,6 +4064,14 @@ bool sendNotificationForCollRec ( CollectionRec *cr ) {
if ( g_hostdb.m_myHost->m_hostId != 0 )
return true;
// . if already sent email for this, skip
// . localCrawlInfo stores this value on disk so it is persistent
// . we do it this way so SP_ROUNDDONE can be emailed and then
// we'd email SP_MAXROUNDS to indicate we've hit the maximum
// round count.
if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert == cr->m_spiderStatus )
return true;
// do not send email for maxrounds hit, it will send a round done
// email for that. otherwise we end up calling doneSendingEmail()
// twice and increment the round twice
@@ -4073,17 +4081,15 @@
return true;
}
// . if already sent email for this, skip
// . localCrawlInfo stores this value on disk so it is persistent
// . we do it this way so SP_ROUNDDONE can be emailed and then
// we'd email SP_MAXROUNDS to indicate we've hit the maximum
// round count.
if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert == cr->m_spiderStatus )
return true;
// wtf? caller must set this
if ( ! cr->m_spiderStatus ) { char *xx=NULL; *xx=0; }
log("spider: trying to send notification for new crawl status %li. "
"current status is %li",
(long)cr->m_spiderStatus,
//cr->m_spiderStatusMsg,
(long)cr->m_localCrawlInfo.m_sentCrawlDoneAlert);
// if we already sent it return now. we set this to false everytime
// we spider a url, which resets it. use local crawlinfo for this
// since we reset global.
@@ -4103,7 +4109,9 @@
ei->m_finalState = ei;
ei->m_collnum = cr->m_collnum;
ei->m_spiderStatusMsg = cr->m_spiderStatusMsg;
SafeBuf *buf = &ei->m_spiderStatusMsg;
long status = -1;
getSpiderStatusMsg ( cr , buf , &status );
// if no email address or webhook provided this will not block!
if ( ! sendNotification ( ei ) ) return false;
@@ -4112,6 +4120,11 @@
return true;
}
// we need to update crawl info for collections that
// have urls ready to spider
SpiderColl *getNextSpiderColl ( long *cri ) ;
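A minimal sketch of the resulting notification flow (assembled from the hunks above; the EmailInfo allocation is elided): the message is now rendered at send time into the EmailInfo's own SafeBuf rather than pointing at a string owned elsewhere, so it cannot dangle while the asynchronous send is in flight:

    ei->m_collnum = cr->m_collnum;
    long status = -1;
    // render e.g. "Crawl has reached maxToCrawl limit." into ei's buffer
    getSpiderStatusMsg ( cr , &ei->m_spiderStatusMsg , &status );
    // returns false if the email/webhook send blocks; callback fires later
    if ( ! sendNotification ( ei ) ) return false;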
@@ -4204,8 +4217,6 @@ void SpiderLoop::spiderDoledUrls ( ) {
if ( cr->m_maxCrawlRounds > 0 &&
cr->m_spiderRoundNum >= cr->m_maxCrawlRounds ) {
cr->m_spiderStatus = SP_MAXROUNDS;
cr->m_spiderStatusMsg = "Crawl has reached "
"maxCrawlRounds limit.";
// it'll send a SP_ROUNDDONE email first
// so no need to repeat it, but we do want to
// update the status msg
@@ -4217,8 +4228,6 @@
if ( cr->m_globalCrawlInfo.m_pageDownloadSuccesses >=
cr->m_maxToCrawl ) {
cr->m_spiderStatus = SP_MAXTOCRAWL;
cr->m_spiderStatusMsg = "Crawl has reached maxToCrawl "
"limit.";
sendNotificationForCollRec ( cr );
continue;
}
@@ -4227,8 +4236,6 @@
if ( cr->m_globalCrawlInfo.m_pageProcessSuccesses >=
cr->m_maxToProcess ) {
cr->m_spiderStatus = SP_MAXTOPROCESS;
cr->m_spiderStatusMsg = "Crawl has reached "
"maxToProcess limit.";
sendNotificationForCollRec ( cr );
continue;
}
@@ -4947,8 +4954,8 @@ bool SpiderLoop::gotDoledbList2 ( ) {
ci->m_hasUrlsReadyToSpider = true;
// reset reason why crawl is not running, because we basically are now
cr->m_spiderStatus = 0;
cr->m_spiderStatusMsg = NULL;
cr->m_spiderStatus = SP_INPROGRESS; // 0;
//cr->m_spiderStatusMsg = NULL;
// be sure to save state so we do not re-send emails
cr->m_needsSave = 1;
@@ -10000,10 +10007,17 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
//if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert == SP_ROUNDDONE )
// if we have urls ready to be spidered then prepare to send another
// email/webhook notification
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
// . if we have urls ready to be spidered then prepare to send another
// email/webhook notification.
// . do not reset this flag if SP_MAXTOCRAWL etc otherwise we end up
// sending multiple notifications, so this logic here is only
// for when we are done spidering a round, which happens when
// hasUrlsReadyToSpider goes false for all shards.
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider &&
cr->m_localCrawlInfo.m_sentCrawlDoneAlert == SP_ROUNDDONE ) {
log("spider: resetting sent crawl done alert to 0");
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
}
// update cache time
@@ -10060,7 +10074,8 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// and we've examined at least one url. to prevent us from
// sending a notification if we haven't spidered anything
// because no seed urls have been added/injected.
if ( cr->m_globalCrawlInfo.m_urlsConsidered == 0 ) return;
//if ( cr->m_globalCrawlInfo.m_urlsConsidered == 0 ) return;
if ( cr->m_globalCrawlInfo.m_pageDownloadAttempts == 0 ) return;
// if urls were considered and roundstarttime is still 0 then
// set it to the current time...
@@ -10074,7 +10089,6 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// update status
cr->m_spiderStatus = SP_ROUNDDONE;
cr->m_spiderStatusMsg = "Crawl round completed.";
// do email and web hook...
sendNotificationForCollRec ( cr );
@@ -10161,3 +10175,81 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
slot );
}
bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , long *status ) {
//char *ss = "Crawl in progress.";
//if ( cx->m_spiderStatusMsg )
// ss = cx->m_spiderStatusMsg;
if ( cx->m_spiderStatus == SP_MAXTOCRAWL ) {
*status = SP_MAXTOCRAWL;
return msg->safePrintf ( "Crawl has reached maxToCrawl "
"limit." );
}
if ( cx->m_spiderStatus == SP_MAXTOPROCESS ) {
*status = SP_MAXTOPROCESS;
return msg->safePrintf ( "Crawl has reached maxToProcess "
"limit." );
}
if ( cx->m_spiderStatus == SP_MAXROUNDS ) {
*status = SP_MAXROUNDS;
return msg->safePrintf ( "Crawl has reached maxCrawlRounds "
"limit." );
}
long now = getTimeGlobal();
// . 0 means not to RE-crawl
// . indicate if we are WAITING for next round...
if ( cx->m_collectiveRespiderFrequency > 0.0 &&
now < cx->m_spiderRoundStartTime ) {
*status = SP_ROUNDDONE;
return msg->safePrintf("Next crawl round to start "
"in %li seconds.",
cx->m_spiderRoundStartTime-now );
}
// if we sent an email simply because no urls
// were left and we are not recrawling!
if ( cx->m_collectiveRespiderFrequency <= 0.0 &&
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
*status = SP_COMPLETED;
return msg->safePrintf("Crawl has completed and no "
"repeatCrawl is scheduled.");
}
if ( cx->m_spiderStatus == SP_ROUNDDONE ) {
*status = SP_ROUNDDONE;
return msg->safePrintf ( "Crawl round completed.");
}
if ( ! cx->m_spideringEnabled ) {
*status = SP_PAUSED;
return msg->safePrintf("Crawl paused.");
}
if ( ! g_conf.m_spideringEnabled ) {
*status = SP_ADMIN_PAUSED;
return msg->safePrintf("All crawling temporarily paused "
"by root administrator for "
"maintenance.");
}
// if spiderdb is empty for this coll, then no url
// has been added to spiderdb yet.. either seed or spot
CrawlInfo *cg = &cx->m_globalCrawlInfo;
if ( cg->m_pageDownloadAttempts == 0 ) {
*status = SP_NOURLS;
return msg->safePrintf("Crawl is waiting for urls.");
}
if ( cx->m_spiderStatus == SP_INITIALIZING ) {
*status = SP_INITIALIZING;
return msg->safePrintf("Crawl is initializing.");
}
// otherwise in progress?
*status = SP_INPROGRESS;
return msg->safePrintf("Crawl is in progress.");
}
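For illustration, a hypothetical caller of the new helper; since each branch returns the result of SafeBuf::safePrintf(), the function returns true when the message was written successfully:

    SafeBuf msg;
    long status = -1;
    getSpiderStatusMsg ( cr , &msg , &status );
    // e.g. status = 6 (SP_PAUSED), msg = "Crawl paused."
    log("crawlbot: coll=%s status=%li msg=%s",
        cr->m_coll , status , msg.getBufStart() );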

@@ -32,7 +32,22 @@ bool updateCrawlInfo ( CollectionRec *cr ,
void (* callback)(void *state) ,
bool useCache = true ) ;
// . values for CollectionRec::m_spiderStatus
// . reasons why crawl is not happening
#define SP_INITIALIZING 0
#define SP_MAXROUNDS 1 // hit max rounds limit
#define SP_MAXTOCRAWL 2 // hit max to crawl limit
#define SP_MAXTOPROCESS 3 // hit max to process limit
#define SP_ROUNDDONE 4 // spider round is done
#define SP_NOURLS 5 // initializing
#define SP_PAUSED 6 // user paused spider
#define SP_INPROGRESS 7 // it is going on!
#define SP_ADMIN_PAUSED 8 // g_conf.m_spideringEnabled = false
#define SP_COMPLETED 9 // crawl is done, and no repeatCrawl is scheduled
bool getSpiderStatusMsg ( class CollectionRec *cx ,
class SafeBuf *msg ,
long *status ) ;
// Overview of Spider
//