better crawl status reporting.
allow for _ in coll names.
This commit is contained in:
parent
a1ac5a5348
commit
adf4d258ae
@@ -7,6 +7,7 @@
#include "Threads.h"
#include "Datedb.h"
#include "Timedb.h"
#include "Spider.h"

static CollectionRec g_default;

@@ -29,8 +30,8 @@ CollectionRec::CollectionRec() {
m_overflow = 0x12345678;
m_overflow2 = 0x12345678;
// the spiders are currently uninhibited i guess
m_spiderStatus = 0;
m_spiderStatusMsg = NULL;
m_spiderStatus = SP_INITIALIZING; // this is 0
//m_spiderStatusMsg = NULL;
// for Url::getSite()
m_updateSiteRulesTable = 1;
m_lastUpdateTime = 0LL;
@@ -85,7 +85,7 @@ class CrawlInfo {

long long m_objectsDeleted; // 1
long long m_objectsAdded; // 2
long long m_urlsConsidered; // 3
long long m_urlsConsideredNOTUSED; // 3
long long m_pageDownloadAttempts; // 4
long long m_pageDownloadSuccesses; // 5
long long m_pageProcessAttempts; // 6
@@ -304,7 +304,7 @@ class CollectionRec {
long m_maxQueryTerms;

char m_spiderStatus;
char *m_spiderStatusMsg;
//char *m_spiderStatusMsg;

// Language stuff
float m_languageUnknownWeight;
@@ -185,6 +185,7 @@ bool Collectiondb::addRec ( char *coll , char *cpc , long cpclen , bool isNew ,
for ( ; *p ; p++ ) {
if ( is_alnum_a(*p) ) continue;
if ( *p == '-' ) continue;
if ( *p == '_' ) continue; // underscore now allowed
break;
}
if ( *p ) {
@@ -774,8 +775,8 @@ bool Collectiondb::resetColl ( char *coll , bool resetTurkdb ) {
cr->m_spiderRoundNum = 0;
cr->m_spiderRoundStartTime = 0;

cr->m_spiderStatus = 0;
cr->m_spiderStatusMsg = NULL;
cr->m_spiderStatus = SP_INITIALIZING; // this is 0
//cr->m_spiderStatusMsg = NULL;

// reset seed buf
cr->m_diffbotSeeds.purge();
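For reference, the collection-name check in the Collectiondb::addRec() hunk above is a plain character whitelist; a minimal standalone sketch of the same rule (the helper name and the use of the standard isalnum() are assumptions, the real code uses is_alnum_a()):

    #include <cctype>

    // Hypothetical helper mirroring the loop in Collectiondb::addRec():
    // a collection name may contain only alphanumerics, '-' and, with
    // this commit, '_' as well.
    static bool isValidCollName ( const char *p ) {
            for ( ; *p ; p++ ) {
                    if ( isalnum ( (unsigned char)*p ) ) continue;
                    if ( *p == '-' ) continue;
                    if ( *p == '_' ) continue; // underscore now allowed
                    return false;
            }
            return true;
    }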
PageCrawlBot.cpp (139 changed lines)
@@ -791,6 +791,41 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
return true;
}

//long pathLen = hr->getPathLen();
char rdbId = RDB_NONE;
bool downloadJSON = false;
long fmt;
char *xx;

if ( ( xx = strstr ( path , "_data.json" ) ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_JSON;
downloadJSON = true;
}
else if ( ( xx = strstr ( path , "_data.xml" ) ) ) {
rdbId = RDB_TITLEDB;
downloadJSON = true;
fmt = FMT_XML;
}
else if ( ( xx = strstr ( path , "_urls.csv" ) ) ) {
rdbId = RDB_SPIDERDB;
fmt = FMT_CSV;
}
else if ( ( xx = strstr ( path , "_pages.txt" ) ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_TXT;
}

// sanity, must be one of 3 download calls
if ( rdbId == RDB_NONE ) {
char *msg ;
msg = "usage: downloadurls, downloadpages, downloaddata";
log("crawlbot: %s",msg);
g_httpServer.sendErrorReply(sock,500,msg);
return true;
}


char *coll = str + 10;
if ( coll >= pathEnd ) {
char *msg = "bad download request2";
@@ -799,14 +834,8 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
return true;
}

char *collEnd = strstr ( coll , "_");
if ( ! collEnd ) {
char *msg = "bad download request3";
log("crawlbot: %s",msg);
g_httpServer.sendErrorReply(sock,500,msg);
return true;
}

// get coll
char *collEnd = xx;

//CollectionRec *cr = getCollRecFromHttpRequest ( hr );
CollectionRec *cr = g_collectiondb.getRec ( coll , collEnd - coll );
@@ -817,29 +846,6 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
return true;
}

//long pathLen = hr->getPathLen();
char rdbId = RDB_NONE;
bool downloadJSON = false;
long fmt;

if ( strstr ( path , "_data.json" ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_JSON;
downloadJSON = true;
}
if ( strstr ( path , "_data.xml" ) ) {
rdbId = RDB_TITLEDB;
downloadJSON = true;
fmt = FMT_XML;
}
else if ( strstr ( path , "_urls.csv" ) ) {
rdbId = RDB_SPIDERDB;
fmt = FMT_CSV;
}
else if ( strstr ( path , "_pages.txt" ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_TXT;
}


//if ( strncmp ( path ,"/crawlbot/downloadurls",22 ) == 0 )
@@ -851,14 +857,6 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
// rdbId = RDB_TITLEDB;
//}

// sanity, must be one of 3 download calls
if ( rdbId == RDB_NONE ) {
char *msg ;
msg = "usage: downloadurls, downloadpages, downloaddata";
log("crawlbot: %s",msg);
g_httpServer.sendErrorReply(sock,500,msg);
return true;
}

StateCD *st;
try { st = new (StateCD); }
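The download handler now resolves the collection name from the matched suffix: the old code found the end of the name with strstr ( coll , "_"), which would truncate names once underscores are legal, so the rewrite keeps the strstr() match position (xx) and uses it as collEnd. A self-contained sketch of that idea (the path layout and the way the name start is located here are assumptions):

    #include <cstdio>
    #include <cstring>

    int main ( ) {
            // hypothetical download path; the real one is built by the crawlbot UI
            const char *path = "/crawlbot/download/mycoll_urls.csv";
            const char *xx   = strstr ( path , "_urls.csv" );
            if ( ! xx ) return 1;
            const char *coll    = strrchr ( path , '/' ) + 1; // hypothetical name start
            const char *collEnd = xx;  // name ends where the suffix match begins
            printf ( "coll=%.*s rdb=spiderdb fmt=csv\n", (int)(collEnd - coll), coll );
            return 0;
    }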
@@ -1268,7 +1266,8 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){

long nowGlobalMS = gettimeofdayInMillisecondsGlobal();
CollectionRec *cr = g_collectiondb.getRec(m_collnum);

long lastSpidered = 0;

// parse through it
for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) {
// this record is either a SpiderRequest or SpiderReply
@@ -1279,7 +1278,12 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
// spiderrequests for the same url
if ( g_spiderdb.isSpiderReply ( (key128_t *)rec ) ) {
srep = (SpiderReply *)rec;
if ( sreq ) lastSpidered = 0;
sreq = NULL;
if ( lastSpidered == 0 )
lastSpidered = srep->m_spideredTime;
else if ( srep->m_spideredTime > lastSpidered )
lastSpidered = srep->m_spideredTime;
prevReplyUh48 = srep->getUrlHash48();
// 0 means indexed successfully. not sure if
// this includes http status codes like 404 etc.
@@ -1307,6 +1311,12 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
if ( ! printIt ) continue;
lastUh48 = uh48;

// make sure spiderreply is for the same url!
if ( srep && srep->getUrlHash48() != sreq->getUrlHash48() )
srep = NULL;
if ( ! srep )
lastSpidered = 0;

// debug point
//if ( strstr(sreq->m_url,"chief") )
// log("hey");
@@ -1382,12 +1392,14 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
);
// but default to csv
else {
sb->safePrintf("\"%s\",%lu,\"%s\",\"%s\",\""
sb->safePrintf("\"%s\",%lu,%lu,\"%s\",\"%s\",\""
//",%s"
//"\n"
, sreq->m_url
// when was it first added to spiderdb?
, sreq->m_addedTime
// last time spidered, 0 if none
, lastSpidered
//, status
, msg
// the url filter expression it matches
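The CSV dump of spiderdb gains a column: the last-spidered timestamp (0 if the url was never spidered) now sits between the time the url was added and its status message. An illustrative row under the new format, with hypothetical values and the trailing fields truncated just as the hunk above is:

    "http://example.com/page.html",1385000000,1385003600,"<status message>","...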
@@ -2326,8 +2338,11 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {

// collectionrec must be non-null at this point. i.e. we added it
if ( ! cr ) {
char *msg = "Crawl name was not found.";
if ( name && name[0] )
msg = "Failed to add crawl. Crawl name is illegal.";
//log("crawlbot: no collection found. need to add a crawl");
return sendErrorReply2(socket,fmt,"no crawls found. add one.");
return sendErrorReply2(socket,fmt, msg);
}

//char *spots = hr->getString("spots",NULL,NULL);
@@ -2727,36 +2742,9 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//if ( cx->m_collectionNameAlias.length() > 0 )
// alias=cx->m_collectionNameAlias.getBufStart();
//long paused = 1;
char *ss = "Crawl in progress.";
if ( cx->m_spiderStatusMsg )
ss = cx->m_spiderStatusMsg;
// 0 means not to RE-crawl
char tmp[256];
// indicate if we are WAITING for next round...
if ( cx->m_collectiveRespiderFrequency > 0.0 &&
getTimeGlobal() < cx->m_spiderRoundStartTime ) {
long now = getTimeGlobal();
sprintf(tmp,"Next crawl round to start in %li "
"seconds.",
cx->m_spiderRoundStartTime - now
);
ss = tmp;
}
// if we sent an email simply because no urls
// were left and we are not recrawling!
if ( cx->m_collectiveRespiderFrequency == 0.0 &&
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
ss = "Crawl has completed and no "
"repeatCrawl is scheduled.";
}
if ( ! cx->m_spideringEnabled )
ss = "Crawl paused.";

// if spiderdb is empty for this coll, then no url
// has been added to spiderdb yet.. either seed or spot
CrawlInfo *cg = &cx->m_globalCrawlInfo;
if ( cg->m_pageDownloadAttempts == 0 )
ss = "Crawl is initializing.";
SafeBuf tmp;
long crawlStatus = -1;
getSpiderStatusMsg ( cx , &tmp , &crawlStatus );

CrawlInfo *ci = &cx->m_localCrawlInfo;
long sentAlert = (long)ci->m_sentCrawlDoneAlert;
@@ -2766,7 +2754,9 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"\"name\":\"%s\",\n"
//"\"alias\":\"%s\",\n"
//"\"crawlingEnabled\":%li,\n"
"\"crawlStatus\":\"%s\",\n"
"\"crawlStatus\":{"
"\"status\":%li,"
"\"message\":\"%s\"},\n"
"\"sentCrawlDoneNotification\":%li,\n"
//"\"crawlingPaused\":%li,\n"
"\"objectsFound\":%lli,\n"
@@ -2789,7 +2779,8 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
, cx->m_diffbotCrawlName.getBufStart()
//, alias
//, (long)cx->m_spideringEnabled
, ss
, crawlStatus
, tmp.getBufStart()
, sentAlert
//, (long)paused
, cx->m_globalCrawlInfo.m_objectsAdded -
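With this change the crawlStatus field in the JSON reply goes from a bare string to an object carrying both the numeric code and the human-readable message. An illustrative fragment of the new output (values hypothetical; 7 is SP_INPROGRESS per the Spider.h hunk further down):

    "crawlStatus":{"status":7,"message":"Crawl is in progress."},
    "sentCrawlDoneNotification":0,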
@@ -3070,10 +3070,10 @@ bool sendNotification ( EmailInfo *ei ) {
if ( email && email[0] ) {
log("build: sending email notification to %s for "
"crawl \"%s\" : %s",
email,crawl,ei->m_spiderStatusMsg);
email,crawl,ei->m_spiderStatusMsg.getBufStart());
SafeBuf msg;
msg.safePrintf("Your crawl \"%s\" has a new status: %s"
, ei->m_spiderStatusMsg
, ei->m_spiderStatusMsg.getBufStart()
, crawl );

// reset m_length otherwise it builds up
@@ -3110,7 +3110,7 @@ bool sendNotification ( EmailInfo *ei ) {
"X-Crawl-Status: %s"// \r\n" // hdrs

, cr->m_diffbotCrawlName.getBufStart()
, ei->m_spiderStatusMsg
, ei->m_spiderStatusMsg.getBufStart()
);
// GET request
if ( ! g_httpServer.getDoc ( url ,
@@ -16,7 +16,8 @@ public:
SafeBuf m_fromAddress;
SafeBuf m_subject;
SafeBuf m_body;
char *m_spiderStatusMsg;
//char *m_spiderStatusMsg;
SafeBuf m_spiderStatusMsg;
//CollectionRec *m_cr;
collnum_t m_collnum;
char *m_dom; // ref into m_toAddress of the domain in email addr
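Because EmailInfo::m_spiderStatusMsg is now a SafeBuf instead of a raw char pointer, call sites format into it and read the text back with getBufStart(), as the hunks above show. A minimal sketch of that pattern, assuming only the SafeBuf calls already used in this commit (the local variable stands in for ei->m_spiderStatusMsg):

    #include <cstdio>
    #include "SafeBuf.h"   // Gigablast's growable string buffer

    static void statusMsgExample ( ) {
            SafeBuf statusMsg;    // stands in for ei->m_spiderStatusMsg
            statusMsg.safePrintf ( "Crawl has reached maxToCrawl limit." );
            // any "%s" consumer now takes the C string via getBufStart()
            printf ( "crawl status: %s\n" , statusMsg.getBufStart() );
    }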
Spider.cpp (148 changed lines)
@@ -2412,7 +2412,9 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
// log it
if ( m_numAdded )
log("spider: added %li recs to waiting tree from "
"scan of %lli bytes",m_numAdded,m_numBytesScanned);
"scan of %lli bytes coll=%s",
m_numAdded,m_numBytesScanned,
m_cr->m_coll);
// reset the count for next scan
m_numAdded = 0 ;
m_numBytesScanned = 0;
@@ -3962,11 +3964,6 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
g_spiderLoop.spiderDoledUrls( );
}

#define SP_MAXROUNDS 1
#define SP_MAXTOCRAWL 2
#define SP_MAXTOPROCESS 3
#define SP_ROUNDDONE 4

void doneSendingNotification ( void *state ) {
EmailInfo *ei = (EmailInfo *)state;
collnum_t collnum = ei->m_collnum;
@@ -3982,6 +3979,9 @@ void doneSendingNotification ( void *state ) {
// pingserver.cpp sets this
//ei->m_inUse = false;

log("spider: setting current spider status to %li",
(long)cr->m_spiderStatus);

// mark it as sent. anytime a new url is spidered will mark this
// as false again! use LOCAL crawlInfo, since global is reset often.
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = cr->m_spiderStatus;//1;
@@ -4064,6 +4064,14 @@ bool sendNotificationForCollRec ( CollectionRec *cr ) {
if ( g_hostdb.m_myHost->m_hostId != 0 )
return true;

// . if already sent email for this, skip
// . localCrawlInfo stores this value on disk so it is persistent
// . we do it this way so SP_ROUNDDONE can be emailed and then
// we'd email SP_MAXROUNDS to indicate we've hit the maximum
// round count.
if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert == cr->m_spiderStatus )
return true;

// do not send email for maxrounds hit, it will send a round done
// email for that. otherwise we end up calling doneSendingEmail()
// twice and increment the round twice
@@ -4073,17 +4081,15 @@ bool sendNotificationForCollRec ( CollectionRec *cr ) {
return true;
}

// . if already sent email for this, skip
// . localCrawlInfo stores this value on disk so it is persistent
// . we do it this way so SP_ROUNDDONE can be emailed and then
// we'd email SP_MAXROUNDS to indicate we've hit the maximum
// round count.
if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert == cr->m_spiderStatus )
return true;

// wtf? caller must set this
if ( ! cr->m_spiderStatus ) { char *xx=NULL; *xx=0; }

log("spider: trying to send notification for new crawl status %li. "
"current status is %li",
(long)cr->m_spiderStatus,
//cr->m_spiderStatusMsg,
(long)cr->m_localCrawlInfo.m_sentCrawlDoneAlert);

// if we already sent it return now. we set this to false everytime
// we spider a url, which resets it. use local crawlinfo for this
// since we reset global.
@@ -4103,7 +4109,9 @@ bool sendNotificationForCollRec ( CollectionRec *cr ) {
ei->m_finalState = ei;
ei->m_collnum = cr->m_collnum;

ei->m_spiderStatusMsg = cr->m_spiderStatusMsg;
SafeBuf *buf = &ei->m_spiderStatusMsg;
long status = -1;
getSpiderStatusMsg ( cr , buf , &status );

// if no email address or webhook provided this will not block!
if ( ! sendNotification ( ei ) ) return false;
@@ -4112,6 +4120,11 @@ bool sendNotificationForCollRec ( CollectionRec *cr ) {
return true;
}

// we need to update crawl info for collections that
// have urls ready to spider



SpiderColl *getNextSpiderColl ( long *cri ) ;


@@ -4204,8 +4217,6 @@ void SpiderLoop::spiderDoledUrls ( ) {
if ( cr->m_maxCrawlRounds > 0 &&
cr->m_spiderRoundNum >= cr->m_maxCrawlRounds ) {
cr->m_spiderStatus = SP_MAXROUNDS;
cr->m_spiderStatusMsg = "Crawl has reached "
"maxCrawlRounds limit.";
// it'll send a SP_ROUNDDONE email first
// so no need to repeat it, but we do want to
// update the status msg
@@ -4217,8 +4228,6 @@ void SpiderLoop::spiderDoledUrls ( ) {
if ( cr->m_globalCrawlInfo.m_pageDownloadSuccesses >=
cr->m_maxToCrawl ) {
cr->m_spiderStatus = SP_MAXTOCRAWL;
cr->m_spiderStatusMsg = "Crawl has reached maxToCrawl "
"limit.";
sendNotificationForCollRec ( cr );
continue;
}
@@ -4227,8 +4236,6 @@ void SpiderLoop::spiderDoledUrls ( ) {
if ( cr->m_globalCrawlInfo.m_pageProcessSuccesses >=
cr->m_maxToProcess ) {
cr->m_spiderStatus = SP_MAXTOPROCESS;
cr->m_spiderStatusMsg = "Crawl has reached "
"maxToProcess limit.";
sendNotificationForCollRec ( cr );
continue;
}
@@ -4947,8 +4954,8 @@ bool SpiderLoop::gotDoledbList2 ( ) {
ci->m_hasUrlsReadyToSpider = true;

// reset reason why crawl is not running, because we basically are now
cr->m_spiderStatus = 0;
cr->m_spiderStatusMsg = NULL;
cr->m_spiderStatus = SP_INPROGRESS; // 0;
//cr->m_spiderStatusMsg = NULL;

// be sure to save state so we do not re-send emails
cr->m_needsSave = 1;
@@ -10000,10 +10007,17 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {

//if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert == SP_ROUNDDONE )

// if we have urls ready to be spidered then prepare to send another
// email/webhook notification
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
// . if we have urls ready to be spidered then prepare to send another
// email/webhook notification.
// . do not reset this flag if SP_MAXTOCRAWL etc otherwise we end up
// sending multiple notifications, so this logic here is only
// for when we are done spidering a round, which happens when
// hasUrlsReadyToSpider goes false for all shards.
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider &&
cr->m_localCrawlInfo.m_sentCrawlDoneAlert == SP_ROUNDDONE ) {
log("spider: resetting sent crawl done alert to 0");
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
}


// update cache time
@@ -10060,7 +10074,8 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// and we've examined at least one url. to prevent us from
// sending a notification if we haven't spidered anything
// because no seed urls have been added/injected.
if ( cr->m_globalCrawlInfo.m_urlsConsidered == 0 ) return;
//if ( cr->m_globalCrawlInfo.m_urlsConsidered == 0 ) return;
if ( cr->m_globalCrawlInfo.m_pageDownloadAttempts == 0 ) return;

// if urls were considered and roundstarttime is still 0 then
// set it to the current time...
@@ -10074,7 +10089,6 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {

// update status
cr->m_spiderStatus = SP_ROUNDDONE;
cr->m_spiderStatusMsg = "Crawl round completed.";

// do email and web hook...
sendNotificationForCollRec ( cr );
@@ -10161,3 +10175,81 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
slot );
}

bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , long *status ) {

//char *ss = "Crawl in progress.";
//if ( cx->m_spiderStatusMsg )
// ss = cx->m_spiderStatusMsg;

if ( cx->m_spiderStatus == SP_MAXTOCRAWL ) {
*status = SP_MAXTOCRAWL;
return msg->safePrintf ( "Crawl has reached maxToCrawl "
"limit." );
}

if ( cx->m_spiderStatus == SP_MAXTOPROCESS ) {
*status = SP_MAXTOPROCESS;
return msg->safePrintf ( "Crawl has reached maxToProcess "
"limit." );
}

if ( cx->m_spiderStatus == SP_MAXROUNDS ) {
*status = SP_MAXROUNDS;
return msg->safePrintf ( "Crawl has reached maxCrawlRounds "
"limit." );
}

long now = getTimeGlobal();
// . 0 means not to RE-crawl
// . indicate if we are WAITING for next round...
if ( cx->m_collectiveRespiderFrequency > 0.0 &&
now < cx->m_spiderRoundStartTime ) {
*status = SP_ROUNDDONE;
return msg->safePrintf("Next crawl round to start "
"in %li seconds.",
cx->m_spiderRoundStartTime-now );
}

// if we sent an email simply because no urls
// were left and we are not recrawling!
if ( cx->m_collectiveRespiderFrequency <= 0.0 &&
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
*status = SP_COMPLETED;
return msg->safePrintf("Crawl has completed and no "
"repeatCrawl is scheduled.");
}

if ( cx->m_spiderStatus == SP_ROUNDDONE ) {
*status = SP_ROUNDDONE;
return msg->safePrintf ( "Crawl round completed.");
}

if ( ! cx->m_spideringEnabled ) {
*status = SP_PAUSED;
return msg->safePrintf("Crawl paused.");
}

if ( ! g_conf.m_spideringEnabled ) {
*status = SP_ADMIN_PAUSED;
return msg->safePrintf("All crawling temporarily paused "
"by root administrator for "
"maintenance.");
}

// if spiderdb is empty for this coll, then no url
// has been added to spiderdb yet.. either seed or spot
CrawlInfo *cg = &cx->m_globalCrawlInfo;
if ( cg->m_pageDownloadAttempts == 0 ) {
*status = SP_NOURLS;
return msg->safePrintf("Crawl is waiting for urls.");
}

if ( cx->m_spiderStatus == SP_INITIALIZING ) {
*status = SP_INITIALIZING;
return msg->safePrintf("Crawl is initializing.");
}

// otherwise in progress?
*status = SP_INPROGRESS;
return msg->safePrintf("Crawl is in progress.");
}
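Both call sites added in this commit (printCrawlBotPage2() and sendNotificationForCollRec()) drive the new helper the same way; shown here in isolation as a sketch, with cx standing for whichever CollectionRec is being reported on:

    SafeBuf tmp;
    long crawlStatus = -1;
    getSpiderStatusMsg ( cx , &tmp , &crawlStatus );
    // crawlStatus now holds one of the SP_* codes from Spider.h and
    // tmp.getBufStart() the matching human-readable message for the
    // JSON reply or the notification email.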
Spider.h (15 changed lines)
@@ -32,7 +32,22 @@ bool updateCrawlInfo ( CollectionRec *cr ,
void (* callback)(void *state) ,
bool useCache = true ) ;

// . values for CollectionRec::m_spiderStatus
// . reasons why crawl is not happening
#define SP_INITIALIZING 0
#define SP_MAXROUNDS 1 // hit max rounds limit
#define SP_MAXTOCRAWL 2 // hit max to crawl limit
#define SP_MAXTOPROCESS 3 // hit max to process limit
#define SP_ROUNDDONE 4 // spider round is done
#define SP_NOURLS 5 // initializing
#define SP_PAUSED 6 // user paused spider
#define SP_INPROGRESS 7 // it is going on!
#define SP_ADMIN_PAUSED 8 // g_conf.m_spideringEnabled = false
#define SP_COMPLETED 9 // crawl is done, and no repeatCrawl is scheduled

bool getSpiderStatusMsg ( class CollectionRec *cx ,
class SafeBuf *msg ,
long *status ) ;

// Overview of Spider
//