Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot
@@ -101,6 +101,8 @@ void CollectionRec::reset() {
m_replies = 0;
}

CollectionRec *g_cr = NULL;

// . load this data from a conf file
// . values we do not explicitly have will be taken from "default",
// collection config file. if it does not have them then we use

@@ -97,6 +97,11 @@ class CrawlInfo {
// this is non-zero if urls are available to be spidered right now.
long m_hasUrlsReadyToSpider;

// last time we launched a spider. 0 on startup.
time_t m_lastSpiderAttempt;
// time we had or might have had a url available for spidering
time_t m_lastSpiderCouldLaunch;

// have we sent out email/webhook notifications crawl has no urls
// currently in the ready queue (doledb) to spider?
char m_sentCrawlDoneAlert;

@@ -774,9 +774,16 @@ bool Collectiondb::resetColl ( char *coll , bool resetTurkdb ) {
cr->m_spiderRoundNum = 0;
cr->m_spiderRoundStartTime = 0;

cr->m_spiderStatus = 0;
cr->m_spiderStatusMsg = NULL;

// reset seed buf
cr->m_diffbotSeeds.purge();

// reset seed dedup table
HashTableX *ht = &cr->m_seedHashTable;
ht->reset();

// so XmlDoc.cpp can detect if the collection was reset since it
// launched its spider:
cr->m_lastResetCount++;
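The resetColl() hunk above clears cr->m_seedHashTable, which presumably exists so the same seed url is not added to a collection twice; resetting it lets a reset collection accept its seeds again. A minimal sketch of that dedup pattern, with a hypothetical SeedDedup type standing in for the real HashTableX-based table:

#include <string>
#include <unordered_set>

// Hypothetical stand-in for CollectionRec::m_seedHashTable (a HashTableX in
// the real code). Seeds already seen are skipped; reset() forgets them all,
// which is what resetColl() effectively does so the collection will take
// its seed urls again after a reset.
struct SeedDedup {
    std::unordered_set<std::string> seen;
    // returns true if the url was new and should go into the seed buffer
    bool addSeed(const std::string &url) {
        return seen.insert(url).second;
    }
    void reset() { seen.clear(); }
};

int main() {
    SeedDedup dedup;
    dedup.addSeed("http://example.com/");                 // added
    bool dup = !dedup.addSeed("http://example.com/");     // duplicate, skipped
    dedup.reset();                                        // forget everything
    bool readded = dedup.addSeed("http://example.com/");  // accepted again
    return (dup && readded) ? 0 : 1;
}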
Hostdb.cpp (71 changed lines)
@@ -194,6 +194,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
// skip known directives
if ( ! strncmp(p,"port-offset:",12) ||
! strncmp(p,"index-splits:",13) ||
! strncmp(p,"num-mirrors:",12) ||
! strncmp(p,"working-dir:",12) )
p = p;
// check if this is a spare host

@@ -243,13 +244,14 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
if ( ! m_hosts ) return log(
"conf: Memory allocation failed.");

unsigned long maxShard = 0;
//unsigned long maxShard = 0;
long numGrunts = 0;

// now fill up m_hosts
p = m_buf;
i = 0;
long line = 1;
unsigned long lastShard = 0;
//unsigned long lastShard = 0;
long proxyNum = 0;

// assume defaults

@@ -257,6 +259,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
long indexSplits = 0;
char *wdir2 = NULL;
long wdirlen2 = 0;
long numMirrors = -1;

for ( ; *p ; p++ , line++ ) {
if ( is_wspace_a (*p) ) continue;

@@ -273,6 +276,15 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
continue;
}

if ( ! strncmp(p,"num-mirrors:",12) ) {
p += 12;
// skip spaces after the colon
while ( is_wspace_a(*p) ) p++;
numMirrors = atol(p);
while ( *p && *p != '\n' ) p++;
continue;
}

// does the line say "working-dir: xxxx" ?
if ( ! strncmp(p,"working-dir:",12) ) {
p += 12;

@@ -351,13 +363,6 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
// skip numeric hostid or "proxy" keyword
while ( ! is_wspace_a(*p) ) p++;

if ( indexSplits == 0 ) {
g_errno = EBADENGINEER;
log("admin: need index-splits: xxx directive "
"in hosts.conf");
return false;
}

// read in switch id
//h->m_switchId = atoi(p);

@@ -590,7 +595,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
// our group is based on our split!
//h->m_group = i % g_hostdb.m_indexSplits; // # grps
//h->m_group = i % indexSplits; // # grps
h->m_shardNum = i % indexSplits;
//h->m_shardNum = i % indexSplits;
// i guess proxy and spares don't count
if ( h->m_type != HT_GRUNT ) h->m_shardNum = 0;

@@ -665,9 +670,12 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
h->m_externalHttpsPort = h->m_httpsPort;

// get max group number
if ( h->m_shardNum > maxShard && h->m_type==HT_GRUNT )
maxShard = h->m_shardNum;
//if ( h->m_shardNum > maxShard && h->m_type==HT_GRUNT )
// maxShard = h->m_shardNum;
if ( h->m_type == HT_GRUNT )
numGrunts++;

/*
if ( h->m_shardNum <= lastShard && h->m_shardNum != 0
&& !(h->m_type&(HT_ALL_PROXIES)) ) {
g_errno = EBADENGINEER;

@@ -678,6 +686,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
filename,line);
}
lastShard = h->m_shardNum;
*/

// skip line now
while ( *p && *p != '\n' )

@@ -742,10 +751,46 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
//m_numHosts = i;
m_numTotalHosts = i;
// how many shards are we configure for?
m_numShards = maxShard + 1; // g_conf.m_numGroups;
//m_numShards = maxShard + 1; // g_conf.m_numGroups;

// # of mirrors is zero if no mirrors,
// if it is 1 then each host has ONE MIRROR host
if ( numMirrors == 0 )
indexSplits = numGrunts;
if ( numMirrors > 0 )
indexSplits = numGrunts / (numMirrors+1);

if ( indexSplits == 0 ) {
g_errno = EBADENGINEER;
log("admin: need num-mirrors: xxx or "
"index-splits: xxx directive "
"in hosts.conf");
return false;
}

numMirrors = (numGrunts / indexSplits) - 1 ;

if ( numMirrors < 0 ) {
g_errno = EBADENGINEER;
log("admin: need num-mirrors: xxx or "
"index-splits: xxx directive "
"in hosts.conf (2)");
return false;
}

m_indexSplits = indexSplits;

m_numShards = numGrunts / (numMirrors+1);

//
// set Host::m_shardNum
//
for ( long i = 0 ; i < numGrunts ; i++ ) {
Host *h = &m_hosts[i];
h->m_shardNum = i % indexSplits;
}

// assign spare hosts
if ( m_numSpareHosts > MAX_SPARES ) {
log ( "conf: Number of spares (%li) exceeds max of %i, "
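The new block above replaces the old maxShard bookkeeping with arithmetic driven by the num-mirrors directive: every index-serving host ("grunt") gets shard i % indexSplits, where indexSplits = numGrunts / (numMirrors + 1). A small standalone sketch of that layout (hypothetical helper with example numbers, not the real Hostdb API), which also shows why the first indexSplits hosts carry one full copy of the index and the remaining hosts mirror them, as the hosts.conf comment at the end of this diff describes:

#include <cstdio>

// Sketch of the shard layout Hostdb::init() derives from hosts.conf.
// numGrunts  = number of index-serving hosts listed in hosts.conf
// numMirrors = value of the "num-mirrors:" directive
int main() {
    long numGrunts  = 8;   // example values, not from the commit
    long numMirrors = 1;

    long indexSplits = numGrunts / (numMirrors + 1);   // 4 shards
    long numShards   = numGrunts / (numMirrors + 1);   // same quantity

    for (long i = 0; i < numGrunts; i++) {
        long shardNum = i % indexSplits;   // h->m_shardNum in Hostdb.cpp
        printf("host %ld -> shard %ld\n", i, shardNum);
    }
    // hosts 0-3 carry shards 0-3; hosts 4-7 repeat the sequence,
    // i.e. each shard gets exactly one mirror when num-mirrors is 1.
    printf("indexSplits=%ld numShards=%ld\n", indexSplits, numShards);
    return 0;
}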
Mem.cpp (2 changed lines)
@@ -12,7 +12,7 @@
//#include "Stats.h"

// put me back
#define _EFENCE_
//#define _EFENCE_

// uncomment this for _EFENCE_ to do underflow checks instead of the
// default overflow checks
Msg4.cpp (26 changed lines)
@@ -16,6 +16,21 @@
#include "Multicast.h"
#include "Syncdb.h"

//////////////
//
// Send out our records to add every X ms here:
//
// Batching up the add requests saves udp traffic
// on large networks (100+ hosts).
//
// . currently: send out adds once every 500ms
// . when this was 5000ms (5s) it would wait like
// 5s to spider a url after adding it.
//
//////////////
#define MSG4_WAIT 500

// we have up to this many outstanding Multicasts to send add requests to hosts
#define MAX_MCASTS 128
Multicast s_mcasts[MAX_MCASTS];

@@ -98,8 +113,11 @@ bool registerHandler4 ( ) {
}

// . register sleep handler every 5 seconds = 5000 ms
// . right now MSG4_WAIT is 500ms... i lowered it from 5s
// to speed up spidering so it would harvest outlinks
// faster and be able to spider them right away.
// . returns false on failure
return g_loop.registerSleepCallback ( 5000 , NULL , sleepCallback4 );
return g_loop.registerSleepCallback(MSG4_WAIT,NULL,sleepCallback4 );
}

static void flushLocal ( ) ;

@@ -475,7 +493,8 @@ bool Msg4::addMetaList ( char *metaList ,
if ( metaListSize == 0 ) return true;

// sanity
if ( collnum < 0 || collnum > 1000 ) { char *xx=NULL;*xx=0; }
//if ( collnum < 0 || collnum > 1000 ) { char *xx=NULL;*xx=0; }
if ( collnum < 0 ) { char *xx=NULL;*xx=0; }

// if first time set this
m_currentPtr = metaList;

@@ -547,7 +566,8 @@ bool Msg4::addMetaList2 ( ) {

char *pend = m_metaList + m_metaListSize;

if ( m_collnum < 0 || m_collnum > 1000 ) { char *xx=NULL;*xx=0; }
//if ( m_collnum < 0 || m_collnum > 1000 ) { char *xx=NULL;*xx=0; }
if ( m_collnum < 0 ) { char *xx=NULL;*xx=0; }

// store each record in the list into the send buffers
for ( ; p < pend ; ) {
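The comment block above documents the batching scheme: instead of multicasting each add request as it arrives, Msg4 queues records and a sleep callback flushes them every MSG4_WAIT milliseconds, now 500 ms instead of 5000 ms so freshly harvested outlinks get spidered sooner. A minimal sketch of that timer-driven batching pattern using standard threads in place of g_loop.registerSleepCallback (the names here are illustrative, not the real Msg4 internals):

#include <chrono>
#include <cstdio>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

// Queue records, flush them in one batch every MSG4_WAIT milliseconds.
// Batching trades a little latency (at most one tick) for far fewer sends.
static const int MSG4_WAIT_MS = 500;

static std::mutex s_mtx;
static std::vector<std::string> s_pending;   // stand-in for Msg4's send buffers

void addRecord(const std::string &rec) {
    std::lock_guard<std::mutex> lk(s_mtx);
    s_pending.push_back(rec);                // cheap; no network traffic yet
}

void flushOnce() {                           // what the sleep callback would trigger
    std::vector<std::string> batch;
    {
        std::lock_guard<std::mutex> lk(s_mtx);
        batch.swap(s_pending);
    }
    if (!batch.empty())
        printf("sending %zu records in one batch\n", batch.size());
}

int main() {
    std::thread ticker([] {
        for (int i = 0; i < 4; i++) {        // a few ticks for the demo
            std::this_thread::sleep_for(std::chrono::milliseconds(MSG4_WAIT_MS));
            flushOnce();
        }
    });
    addRecord("titledb rec");
    addRecord("spiderdb rec");
    ticker.join();
    return 0;
}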
PageCrawlBot.cpp (124 changed lines)
@@ -1256,6 +1256,10 @@ void StateCD::printSpiderdbList ( RdbList *list , SafeBuf *sb ) {
long prevReplyError = 0;
time_t prevReplyDownloadTime = 0LL;
long badCount = 0;

long nowGlobalMS = gettimeofdayInMillisecondsGlobal();
CollectionRec *cr = g_collectiondb.getRec(m_collnum);

// parse through it
for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) {
// this record is either a SpiderRequest or SpiderReply

@@ -1316,8 +1320,36 @@ void StateCD::printSpiderdbList ( RdbList *list , SafeBuf *sb ) {
if ( status == 0 ) msg = "Unexamined";
if ( status == -1 ) msg = mstrerror(prevReplyError);

// matching url filter, print out the expression
long ufn ;
ufn = ::getUrlFilterNum(sreq,
srep,
nowGlobalMS,
false,
MAX_NICENESS,
cr);
char *expression = NULL;
long priority = -4;
// sanity check
if ( ufn >= 0 ) {
expression = cr->m_regExs[ufn].getBufStart();
priority = cr->m_spiderPriorities[ufn];
}

if ( ! expression ) {
expression = "error. matches no expression!";
priority = -4;
}

// when spidering rounds we use the
// lastspidertime>={roundstart} --> spiders disabled rule
// so that we do not spider a url twice in the same round
if ( ufn >= 0 && ! cr->m_spidersEnabled[ufn] ) {
priority = -5;
}

// "csv" is default if json not specified
if ( m_fmt == FMT_JSON )
if ( m_fmt == FMT_JSON )
sb->safePrintf("[{"
"{\"url\":"
"\"%s\"},"

@@ -1338,18 +1370,35 @@ void StateCD::printSpiderdbList ( RdbList *list , SafeBuf *sb ) {
, msg
);
// but default to csv
else
sb->safePrintf("%s,%lu,%li,\"%s\""
else {
sb->safePrintf("\"%s\",%lu,\"%s\",\"%s\",\""
//",%s"
"\n"
//"\n"
, sreq->m_url
// when was it first added to spiderdb?
, sreq->m_addedTime
, status
//, status
, msg
// the url filter expression it matches
, expression
// the priority
//, priorityMsg
//, iptoa(sreq->m_firstIp)
);

// print priority
if ( priority == SPIDER_PRIORITY_FILTERED )
sb->safePrintf("url ignored");
else if ( priority == SPIDER_PRIORITY_BANNED )
sb->safePrintf("url banned");
else if ( priority == -4 )
sb->safePrintf("error");
else if ( priority == -5 )
sb->safePrintf("will spider next round");
else
sb->safePrintf("%li",priority);
sb->safePrintf("\""
"\n");
}
}

if ( ! badCount ) return;
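In the CSV branch above, the last column is a human-readable label derived from the priority of the matching url filter. A condensed restatement of that mapping as a standalone helper (hypothetical function; the SPIDER_PRIORITY_* values below are placeholders, and the real code inlines these branches into safePrintf calls):

#include <cstdio>

// Placeholder values: the real SPIDER_PRIORITY_* constants live in the
// gigablast headers; these are illustrative only.
static const long SPIDER_PRIORITY_FILTERED = -3;
static const long SPIDER_PRIORITY_BANNED   = -2;

// Same label scheme the CSV branch prints as the row's last column.
const char *priorityLabel(long priority, char *numBuf, int numBufLen) {
    if (priority == SPIDER_PRIORITY_FILTERED) return "url ignored";
    if (priority == SPIDER_PRIORITY_BANNED)   return "url banned";
    if (priority == -4)                       return "error";
    if (priority == -5)                       return "will spider next round";
    snprintf(numBuf, numBufLen, "%ld", priority);   // plain numeric priority
    return numBuf;
}

int main() {
    char buf[32];
    printf("%s\n", priorityLabel(-5, buf, sizeof(buf)));  // will spider next round
    printf("%s\n", priorityLabel(10, buf, sizeof(buf)));  // 10
    return 0;
}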
@@ -2649,15 +2698,40 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//if ( cx->m_collectionNameAlias.length() > 0 )
// alias=cx->m_collectionNameAlias.getBufStart();
//long paused = 1;
char *ss = "Normal";
char *ss = "In progress.";
if ( cx->m_spiderStatusMsg )
ss = cx->m_spiderStatusMsg;
// 0 means not to RE-crawl
char tmp[256];
// indicate if we are WAITING for next round...
if ( cx->m_collectiveRespiderFrequency > 0.0 &&
getTimeGlobal() < cx->m_spiderRoundStartTime ) {
long now = getTimeGlobal();
sprintf(tmp,"Spidering next round in %li "
"seconds.",
cx->m_spiderRoundStartTime - now
);
ss = tmp;
}
// if we sent an email simply because no urls
// were left and we are not recrawling!
if ( cx->m_collectiveRespiderFrequency == 0.0 &&
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
ss = "Crawl has exhausted all urls and "
"repeatCrawl is set to 0.0.";
}
if ( ! cx->m_spideringEnabled )
ss = "Crawl paused.";
CrawlInfo *ci = &cx->m_localCrawlInfo;
long sentAlert = (long)ci->m_sentCrawlDoneAlert;
if ( sentAlert ) sentAlert = 1;
//if ( cx->m_spideringEnabled ) paused = 0;
sb.safePrintf("\n\n{"
"\"name\":\"%s\",\n"
//"\"alias\":\"%s\",\n"
"\"crawlingEnabled\":%li,\n"
"\"crawlingStatus\":\"%s\",\n"
//"\"crawlingEnabled\":%li,\n"
"\"crawlStatus\":\"%s\",\n"
"\"sentCrawlDoneNotification\":%li,\n"
//"\"crawlingPaused\":%li,\n"
"\"objectsFound\":%lli,\n"
"\"urlsHarvested\":%lli,\n"

@@ -2676,8 +2750,9 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//,cx->m_coll
, cx->m_diffbotCrawlName.getBufStart()
//, alias
, (long)cx->m_spideringEnabled
//, (long)cx->m_spideringEnabled
, ss
, sentAlert
//, (long)paused
, cx->m_globalCrawlInfo.m_objectsAdded -
cx->m_globalCrawlInfo.m_objectsDeleted
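The printCrawlBotPage2() hunk above assembles the per-crawl JSON status object returned by the crawlbot API: it picks a human-readable status string (waiting for the next round, all urls exhausted, paused, or the collection's own m_spiderStatusMsg) and reports whether a crawl-done notification went out. The hunk mixes the old "crawlingStatus" key with the new "crawlStatus" one; the sketch below just shows roughly what one entry looks like, with made-up values and only the renamed key:

#include <cstdio>

// Illustrative only: mirrors the fields assembled by the safePrintf() calls
// above, with example values rather than real crawl data.
int main() {
    const char *name = "mycrawl";
    long crawlingEnabled = 1;
    const char *status = "Spidering next round in 42 seconds.";
    long sentCrawlDoneNotification = 0;
    long long objectsFound  = 1234;
    long long urlsHarvested = 56789;

    printf("{\n"
           "\"name\":\"%s\",\n"
           "\"crawlingEnabled\":%li,\n"
           "\"crawlStatus\":\"%s\",\n"
           "\"sentCrawlDoneNotification\":%li,\n"
           "\"objectsFound\":%lli,\n"
           "\"urlsHarvested\":%lli\n"
           "}\n",
           name, crawlingEnabled, status,
           sentCrawlDoneNotification, objectsFound, urlsHarvested);
    return 0;
}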
@@ -4085,7 +4160,10 @@ bool resetUrlFilters ( CollectionRec *cr ) {
// if collectiverespiderfreq is 0 or less then do not RE-spider
// documents already indexed.
else {
cr->m_regExs[i].set("isindexed");
// this does NOT work! error docs continuosly respider
// because they are never indexed!!! like EDOCSIMPLIFIEDREDIR
//cr->m_regExs[i].set("isindexed");
cr->m_regExs[i].set("hasreply");
cr->m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!

@@ -4376,3 +4454,27 @@ bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,

return true;
}

///////////
//
// SUPPORT for getting the last 100 spidered urls
//
// . sends request to each node
// . each node returns top 100 after scanning spiderdb (cache for speed)
// . master node gets top 100 of the top 100s
// . sends pretty html or json back to socket
// . then user can see why their crawl isn't working
// . also since we are scanning spiderdb indicate how many urls are
// ignored because they match "ismedia" or "!isonsamedomain" etc. so
// show each url filter expression then show how many urls matched that.
// when doing this make the spiderReply null, b/c the purpose is to see
// what urls
// . BUT url may never be attempted because it matches "ismedia" so that kind
// of thing might have to be indicated on the spiderdb dump above, not here.
//
//////////

//bool sendPageLast100Urls ( TcpSocket *socket , HttpRequest *hr ) {
@@ -697,12 +697,10 @@ bool Parms::sendPageGeneric ( TcpSocket *s , HttpRequest *r , long page ,
"it from."
"</td></tr>"

"<tr><td>isnew | !isnew</td>"
"<tr><td>hasreply | !hasreply</td>"
"<td>"
"This is true if we have never tried to spider "
"this url. If we have tried to spider it and "
"received an error, like a timeout or something, "
"then it will no longer match <i>isnew</i>."
"This is true if we have tried to spider "
"this url, even if we got an error while trying."
"</td></tr>"
Spider.cpp (163 changed lines)
@@ -1000,8 +1000,8 @@ SpiderColl::SpiderColl () {
m_numAdded = 0;
m_numBytesScanned = 0;
m_lastPrintCount = 0;
m_lastSpiderAttempt = 0;
m_lastSpiderCouldLaunch = 0;
//m_lastSpiderAttempt = 0;
//m_lastSpiderCouldLaunch = 0;
//m_numRoundsDone = 0;
//m_lastDoledbReadEmpty = false; // over all priorities in this coll
// re-set this to min and set m_needsWaitingTreeRebuild to true

@@ -3954,14 +3954,24 @@ void doneSendingNotification ( void *state ) {
// as false again! use LOCAL crawlInfo, since global is reset often.
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = cr->m_spiderStatus;//1;

// be sure to save state so we do not re-send emails
cr->m_needsSave = 1;

// sanity
if ( cr->m_spiderStatus == 0 ) { char *xx=NULL;*xx=0; }

// sanity check
if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }

// if not round done we are done
if ( cr->m_spiderStatus != SP_ROUNDDONE ) return;
// advance round if that round has completed, or there are no
// more urls to spider. if we hit maxToProcess/maxToCrawl then
// do not increment the round #. otherwise we should increment it.
if ( cr->m_spiderStatus == SP_MAXTOCRAWL ) return;
if ( cr->m_spiderStatus == SP_MAXTOPROCESS ) return;

// this should have been set below
if ( cr->m_spiderRoundStartTime == 0 ) { char *xx=NULL;*xx=0; }
//if ( cr->m_spiderRoundStartTime == 0 ) { char *xx=NULL;*xx=0; }

// how is this possible
//if ( getTimeGlobal()

@@ -3980,7 +3990,10 @@ void doneSendingNotification ( void *state ) {
break;
}

if ( respiderFreq == -1.0 ) return;
// if not REcrawling, set this to 0 so we at least update our
// round # and round start time...
if ( respiderFreq == -1.0 )
respiderFreq = 0.0;

if ( respiderFreq < 0.0 ) {
log("spider: bad respiderFreq of %f. making 0.",

@@ -3989,6 +4002,9 @@ void doneSendingNotification ( void *state ) {
}

long seconds = respiderFreq * 24*3600;
// add 1 for lastspidertime round off errors so we can be assured
// all spiders have a lastspidertime LESS than the new
// m_spiderRoundStartTime we set below.
if ( seconds <= 0 ) seconds = 1;

// now update this round start time. all the other hosts should
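The doneSendingNotification() hunks above change when a crawl round advances: after a round-done notification, the collective respider frequency (in days) is converted to seconds and clamped to at least 1, and the round start time is presumably pushed that far into the future (the assignment itself is cut off in the hunk shown). A respiderFreq of -1.0, meaning no recrawl, is now treated as 0 so the round number and start time still advance. A minimal sketch of that arithmetic with hypothetical helper names:

#include <cstdio>
#include <ctime>

// Given the collective respider frequency in days, compute when the next
// spider round may start. Mirrors the conversion above: days -> seconds,
// never less than 1 second so every lastspidertime stays strictly below
// the new round start time.
time_t nextRoundStart(double respiderFreqDays, time_t nowGlobal) {
    if (respiderFreqDays == -1.0) respiderFreqDays = 0.0;  // "no recrawl": still advance
    if (respiderFreqDays <  0.0)  respiderFreqDays = 0.0;  // bad value, treat as 0
    long seconds = (long)(respiderFreqDays * 24 * 3600);
    if (seconds <= 0) seconds = 1;
    return nowGlobal + seconds;
}

int main() {
    time_t now = time(NULL);
    printf("recrawl every 0.5 days -> next round at now+%ld s\n",
           (long)(nextRoundStart(0.5, now) - now));   // 43200
    printf("no recrawl (-1)        -> next round at now+%ld s\n",
           (long)(nextRoundStart(-1.0, now) - now));  // 1
    return 0;
}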
@@ -4011,6 +4027,15 @@ void doneSendingNotification ( void *state ) {

bool sendNotificationForCollRec ( CollectionRec *cr ) {

// do not send email for maxrounds hit, it will send a round done
// email for that. otherwise we end up calling doneSendingEmail()
// twice and increment the round twice
if ( cr->m_spiderStatus == SP_MAXROUNDS ) {
log("spider: not sending email for max rounds limit "
"since already sent for round done.");
return true;
}

// . if already sent email for this, skip
// . localCrawlInfo stores this value on disk so it is persistent
// . we do it this way so SP_ROUNDDONE can be emailed and then

@@ -4139,11 +4164,14 @@ void SpiderLoop::spiderDoledUrls ( ) {
if ( ! cr->m_spideringEnabled ) continue;

// hit crawl round max?
if ( //cr->m_maxCrawlRounds > 0 &&
if ( cr->m_maxCrawlRounds > 0 &&
cr->m_spiderRoundNum >= cr->m_maxCrawlRounds ) {
cr->m_spiderStatus = SP_MAXROUNDS;
cr->m_spiderStatusMsg = "Hit maxCrawlRounds limit.";
sendNotificationForCollRec ( cr );
// it'll send a SP_ROUNDDONE email first
// so no need to repeat it, but we do want to
// update the status msg
//sendNotificationForCollRec ( cr );
continue;
}

@@ -4175,6 +4203,24 @@ void SpiderLoop::spiderDoledUrls ( ) {
// set current time, synced with host #0
nowGlobal = getTimeGlobal();

// shortcut
CrawlInfo *ci = &cr->m_localCrawlInfo;

// the last time we attempted to spider a url for this coll
//m_sc->m_lastSpiderAttempt = nowGlobal;
// now we save this so when we restart these two times
// are from where we left off so we do not end up setting
// hasUrlsReadyToSpider to true which in turn sets
// the sentEmailAlert flag to false, which makes us
// send ANOTHER email alert!!
ci->m_lastSpiderAttempt = nowGlobal;

// update this for the first time in case it is never updated.
// then after 60 seconds we assume the crawl is done and
// we send out notifications. see below.
if ( ci->m_lastSpiderCouldLaunch == 0 )
ci->m_lastSpiderCouldLaunch = nowGlobal;

//
// . if doing respider with roundstarttime....
// . roundstarttime is > 0 if m_collectiveRespiderFrequency

@@ -4184,19 +4230,13 @@ void SpiderLoop::spiderDoledUrls ( ) {
//
if ( nowGlobal < cr->m_spiderRoundStartTime ) continue;

// the last time we attempted to spider a url for this coll
m_sc->m_lastSpiderAttempt = nowGlobal;
// update this for the first time in case it is never updated.
// then after 60 seconds we assume the crawl is done and
// we send out notifications. see below.
if ( m_sc->m_lastSpiderCouldLaunch == 0 )
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
// if populating this collection's waitingtree assume
// we would have found something to launch as well. it might
// mean the waitingtree-saved.dat file was deleted from disk
// so we need to rebuild it at startup.
if ( m_sc->m_waitingTreeNeedsRebuild )
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
ci->m_lastSpiderCouldLaunch = nowGlobal;

// get max spiders
long maxSpiders = cr->m_maxNumSpiders;
if ( m_sc->m_isTestColl ) {

@@ -4215,7 +4255,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
// obey max spiders per collection too
if ( m_sc->m_spidersOut >= maxSpiders ) {
// assume we would have launched a spider
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
ci->m_lastSpiderCouldLaunch = nowGlobal;
// try next collection
continue;
}

@@ -4279,10 +4319,13 @@ void SpiderLoop::spiderDoledUrls ( ) {

loop:

// shortcut
CrawlInfo *ci = &cr->m_localCrawlInfo;

// bail if waiting for lock reply, no point in reading more
if ( m_msg12.m_gettingLocks ) {
// assume we would have launched a spider for this coll
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
ci->m_lastSpiderCouldLaunch = nowGlobal;
// wait for sleep callback to re-call us in 10ms
return;
}

@@ -4344,7 +4387,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
// skip?
if ( out >= max ) {
// assume we could have launched a spider
if ( max > 0 ) m_sc->m_lastSpiderCouldLaunch = nowGlobal;
if ( max > 0 ) ci->m_lastSpiderCouldLaunch = nowGlobal;
// count as non-empty then!
//m_sc->m_encounteredDoledbRecs = true;
// try the priority below us

@@ -4464,6 +4507,10 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// unlock
m_gettingDoledbList = false;

// shortcuts
CollectionRec *cr = m_sc->m_cr;
CrawlInfo *ci = &cr->m_localCrawlInfo;

// update m_msg5StartKey for next read
if ( m_list.getListSize() > 0 ) {
m_list.getLastKey((char *)&m_sc->m_msg5StartKey);

@@ -4495,7 +4542,7 @@ bool SpiderLoop::gotDoledbList2 ( ) {

if ( bail ) {
// assume we could have launched a spider
m_sc->m_lastSpiderCouldLaunch = getTimeGlobal();
ci->m_lastSpiderCouldLaunch = getTimeGlobal();
// return false to indicate to try another
return false;
}

@@ -4623,7 +4670,6 @@ bool SpiderLoop::gotDoledbList2 ( ) {
if ( pri < 0 || pri >= MAX_SPIDER_PRIORITIES ) { char *xx=NULL;*xx=0; }
// skip the priority if we already have enough spiders on it
long out = m_sc->m_outstandingSpiders[pri];
CollectionRec *cr = m_sc->m_cr;
// get the first ufn that uses this priority
//long max = getMaxAllowableSpidersOut ( pri );
// how many spiders can we have out?

@@ -4661,7 +4707,7 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// skip? and re-get another doledb list from next priority...
if ( out >= max ) {
// assume we could have launched a spider
if ( max > 0 ) m_sc->m_lastSpiderCouldLaunch = nowGlobal;
if ( max > 0 ) ci->m_lastSpiderCouldLaunch = nowGlobal;
// this priority is maxed out, try next
m_sc->devancePriority();
// assume not an empty read

@@ -4850,12 +4896,22 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// assume we launch the spider below. really this timestamp indicates
// the last time we COULD HAVE LAUNCHED *OR* did actually launch
// a spider
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
ci->m_lastSpiderCouldLaunch = nowGlobal;

// set crawl done email sent flag so another email can be sent again
// in case the user upped the maxToCrawl limit, for instance,
// so that the crawl could continue.
m_sc->m_cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
//ci->m_sentCrawlDoneAlert = 0;

// there are urls ready to spider
ci->m_hasUrlsReadyToSpider = true;

// reset reason why crawl is not running, because we basically are now
cr->m_spiderStatus = 0;
cr->m_spiderStatusMsg = NULL;

// be sure to save state so we do not re-send emails
cr->m_needsSave = 1;

// assume not an empty read
//m_sc->m_encounteredDoledbRecs = true;

@@ -8322,6 +8378,24 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}

if ( *p=='h' && strncmp(p,"hasreply",8) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// if we got a reply, we are not new!!
if ( (bool)srep == (bool)val ) continue;
// skip it for speed
p += 8;
// check for &&
p = strstr(p, "&&");
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}

// hastmperror, if while spidering, the last reply was
// like EDNSTIMEDOUT or ETCPTIMEDOUT or some kind of
// usually temporary condition that warrants a retry
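The new hasreply term above lets a url filter expression match urls that already have a SpiderReply in spiderdb, with !hasreply for the negation and further terms chained by &&. The sketch below walks a simplified expression the same general way (the real getUrlFilterNum2() threads a val flag set by a leading '!' through the comparison against srep, and handles outlinks, msg20 lookups and many other terms; this only shows the control-flow idea):

#include <cstdio>
#include <cstring>

// Evaluate one url-filter expression made of terms joined by "&&".
// Supported terms here: "hasreply" / "!hasreply" only; anything else fails.
// hasReply plays the role of "a SpiderReply exists for this url".
bool exprMatches(const char *expr, bool hasReply) {
    const char *p = expr;
    for (;;) {
        while (*p == ' ') p++;
        bool wanted = true;                 // flipped by a leading '!'
        if (*p == '!') { wanted = false; p++; }
        if (strncmp(p, "hasreply", 8) != 0) return false;  // unknown term
        if (hasReply != wanted) return false;              // term not satisfied
        p += 8;
        p = strstr(p, "&&");                // more terms chained with && ?
        if (!p) return true;                // nothing else: the rule matches
        p += 2;                             // skip "&&", check next term
    }
}

int main() {
    printf("%d\n", exprMatches("hasreply", true));               // 1
    printf("%d\n", exprMatches("!hasreply", true));              // 0
    printf("%d\n", exprMatches("!hasreply && !hasreply", false)); // 1
    return 0;
}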
@@ -8802,7 +8876,6 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
p += 2;
goto checkNextRule;
}

// iswww, means url is like www.xyz.com/...
if ( strncmp(p,"iswww", 5) == 0 ) {
// now this is a bit

@@ -9863,9 +9936,13 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// but only if it was a crawl round done alert,
// not a maxToCrawl or maxToProcess or maxRounds
// alert.
if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ==
SP_ROUNDDONE )
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
// we can't do this because on startup we end up
// setting hasUrlsReadyToSpider to true and we
// may have already sent an email, and it gets RESET
// here when it shouldn't be
//if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ==
// SP_ROUNDDONE )
// cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
}
}
// return if still waiting on more to come in

@@ -9874,6 +9951,15 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// sanity check
if ( cr->m_replies > cr->m_requests ) { char *xx=NULL;*xx=0; }

//if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert == SP_ROUNDDONE )

// if we have urls ready to be spidered then prepare to send another
// email/webhook notification
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;

// update cache time
cr->m_globalCrawlInfo.m_lastUpdateTime = getTime();

@@ -9932,9 +10018,9 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {

// if urls were considered and roundstarttime is still 0 then
// set it to the current time...
if ( cr->m_spiderRoundStartTime == 0 )
// all hosts in the network should sync with host #0 on this
cr->m_spiderRoundStartTime = getTimeGlobal();
//if ( cr->m_spiderRoundStartTime == 0 )
// // all hosts in the network should sync with host #0 on this
// cr->m_spiderRoundStartTime = getTimeGlobal();

// but of course if it has urls ready to spider, do not send alert...
// or if this is -1, indicating "unknown".

@@ -9987,20 +10073,23 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {

//long now = getTimeGlobal();

SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
//SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);

// shortcut
CrawlInfo *ci = &cr->m_localCrawlInfo;

// assume it does
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 1;
//ci->m_hasUrlsReadyToSpider = 1;

// if we haven't spidered anything in 1 min assume the
// queue is basically empty...
if ( sc->m_lastSpiderAttempt &&
sc->m_lastSpiderCouldLaunch &&
if ( ci->m_lastSpiderAttempt &&
ci->m_lastSpiderCouldLaunch &&
//cr->m_spideringEnabled &&
//g_conf.m_spideringEnabled &&
sc->m_lastSpiderAttempt - sc->m_lastSpiderCouldLaunch > 60 )
ci->m_lastSpiderAttempt - ci->m_lastSpiderCouldLaunch > 60 )
// assume our crawl on this host is completed i guess
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 0;
ci->m_hasUrlsReadyToSpider = 0;
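Together with the Spider.h hunk below, which deletes the same two timestamps from SpiderColl, this moves m_lastSpiderAttempt and m_lastSpiderCouldLaunch into CrawlInfo so they are saved with the collection and survive a restart. handleRequestc1() then uses them as a heuristic: if spiders keep being attempted but nothing has been launchable for more than 60 seconds, the host reports that it has no urls ready. A minimal sketch of that check with shortened, hypothetical field names:

#include <cstdio>
#include <ctime>

// Per-collection crawl bookkeeping, persisted with the collection so a
// restart does not make the crawl look "ready" again and re-trigger emails.
struct CrawlInfoSketch {
    time_t lastSpiderAttempt;      // last time we tried to launch a spider
    time_t lastSpiderCouldLaunch;  // last time a url was (or may have been) available
    long   hasUrlsReadyToSpider;
};

// Heuristic from handleRequestc1(): if we keep attempting but nothing has
// been launchable for over 60 seconds, assume the queue is empty on this host.
void updateReadiness(CrawlInfoSketch &ci) {
    ci.hasUrlsReadyToSpider = 1;   // assume ready by default
    if (ci.lastSpiderAttempt && ci.lastSpiderCouldLaunch &&
        ci.lastSpiderAttempt - ci.lastSpiderCouldLaunch > 60)
        ci.hasUrlsReadyToSpider = 0;
}

int main() {
    time_t now = time(NULL);
    CrawlInfoSketch ci = { now, now - 120, 1 };
    updateReadiness(ci);
    printf("hasUrlsReadyToSpider=%ld\n", ci.hasUrlsReadyToSpider);  // 0: looks done
    return 0;
}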
Spider.h (5 changed lines)
@@ -980,11 +980,6 @@ class SpiderColl {

bool m_useTree;

// last time we launched a spider. 0 on startup.
time_t m_lastSpiderAttempt;
// time we had or might have had a url available for spidering
time_t m_lastSpiderCouldLaunch;

//bool m_lastDoledbReadEmpty;
//bool m_encounteredDoledbRecs;
//long long m_numRoundsDone;
XmlDoc.cpp (43 changed lines)
@@ -12898,8 +12898,14 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;

// add a '?' if none
if ( ! strchr ( apiUrl.getUrl() , '?' ) )
diffbotUrl.pushChar('?');
else
diffbotUrl.pushChar('&');

//diffbotUrl.safePrintf("http://54.212.86.74/api/%s?token=%s&u="
diffbotUrl.safePrintf("&token=%s",cr->m_diffbotToken.getBufStart());
diffbotUrl.safePrintf("token=%s",cr->m_diffbotToken.getBufStart());
diffbotUrl.safePrintf("&url=");
// give diffbot the url to process
diffbotUrl.urlEncode ( m_firstUrl.getUrl() );
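The getDiffbotReply() hunk above fixes how the Diffbot request url is assembled: a '?' is appended only when the configured API url has no query string yet (otherwise '&'), and the token parameter no longer begins with a stray '&'. A small sketch of that construction with std::string; the endpoint is only an example and urlEncode() is a trivial stand-in for the SafeBuf method:

#include <cctype>
#include <cstdio>
#include <string>

// Very small url-encoder stand-in: only enough for the demo.
static std::string urlEncode(const std::string &s) {
    static const char *hex = "0123456789ABCDEF";
    std::string out;
    for (unsigned char c : s) {
        if (isalnum(c) || c == '-' || c == '.' || c == '_' || c == '~')
            out += (char)c;
        else { out += '%'; out += hex[c >> 4]; out += hex[c & 15]; }
    }
    return out;
}

// Append token and target url to the API url, choosing '?' or '&'
// depending on whether the api url already carries a query string.
std::string buildDiffbotUrl(const std::string &apiUrl,
                            const std::string &token,
                            const std::string &pageUrl) {
    std::string u = apiUrl;
    u += (apiUrl.find('?') == std::string::npos) ? '?' : '&';
    u += "token=" + token;
    u += "&url=" + urlEncode(pageUrl);
    return u;
}

int main() {
    // example endpoint, not taken from the commit
    printf("%s\n", buildDiffbotUrl("http://api.diffbot.com/v2/article",
                                   "TOKEN", "http://example.com/a?b=1").c_str());
    // -> http://api.diffbot.com/v2/article?token=TOKEN&url=http%3A%2F%2Fexample.com%2Fa%3Fb%3D1
    return 0;
}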
@@ -21492,20 +21498,20 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
// so if your first X filters all map to a "FILTERED"
// priority and this url matches one of them we can
// confidently toss this guy out.
long ufn = ::getUrlFilterNum ( &ksr , NULL, m_spideredTime ,
false, m_niceness, cr);
//long ufn = ::getUrlFilterNum ( &ksr , NULL, m_spideredTime ,
// false, m_niceness, cr);

// bad?
if ( ufn < 0 ) {
log("build: link %s had bad url filter."
, ksr.m_url );
g_errno = EBADENGINEER;
return NULL;
}
//if ( ufn < 0 ) {
// log("build: link %s had bad url filter."
// , ksr.m_url );
// g_errno = EBADENGINEER;
// return NULL;
//}

long priority = -1;
if ( ufn >= 0 )
priority = cr->m_spiderPriorities[ufn];
//long priority = -1;
//if ( ufn >= 0 )
// priority = cr->m_spiderPriorities[ufn];

// debug
if ( g_conf.m_logDebugUrlAttempts || isScraping ) {

@@ -21526,10 +21532,15 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
sb2.getBufStart());
}
// do not add if bad priority, SPIDER_PRIORITY_FILTERED, ...
if ( priority == SPIDER_PRIORITY_FILTERED ) {
linksFiltered++; continue; }
if ( priority == SPIDER_PRIORITY_BANNED ) {
linksBanned++; continue; }
// . mdw: oct 24, 2013. now i add so the urls show up in
// the pagecrawlbot.cpp spiderdb dump, so you can examine
// exactly why a url was crawled or not. plus if you change
// your mind about banning/filtering then it'd be nice to
// have these urls readily available.
//if ( priority == SPIDER_PRIORITY_FILTERED ) {
// linksFiltered++; continue; }
//if ( priority == SPIDER_PRIORITY_BANNED ) {
// linksBanned++; continue; }

// serialize into the buffer
long need = ksr.getRecSize();
hosts.conf (12 changed lines)
@@ -2,12 +2,14 @@
# Tells us what hosts are participating in the distributed search engine.

# This is how many pieces you want the index split into.
# So if you have 64 machines, and you want a unique piece of index on
# each machine, then make this 64. But if you have 64 machines and you
# want one level of redundancy then make this 32.
# How many mirrors do you want? If this is 0 then your data
# will NOT be replicated. If it is 1 then each host listed
# below will have one host that mirrors it, thereby decreasing
# total index capacity, but increasing redundancy. If this is
# 1 then the first half of hosts will be replicated by the
# second half of the hosts listed below.

index-splits: 1
num-mirrors: 0