Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot

mwells
2013-10-24 17:56:10 -07:00
12 changed files with 372 additions and 96 deletions

@ -101,6 +101,8 @@ void CollectionRec::reset() {
m_replies = 0;
}
CollectionRec *g_cr = NULL;
// . load this data from a conf file
// . values we do not explicitly have will be taken from "default",
// collection config file. if it does not have them then we use

@ -97,6 +97,11 @@ class CrawlInfo {
// this is non-zero if urls are available to be spidered right now.
long m_hasUrlsReadyToSpider;
// last time we launched a spider. 0 on startup.
time_t m_lastSpiderAttempt;
// time we had or might have had a url available for spidering
time_t m_lastSpiderCouldLaunch;
// have we sent out email/webhook notifications that the crawl has no urls
// currently in the ready queue (doledb) to spider?
char m_sentCrawlDoneAlert;

@ -774,9 +774,16 @@ bool Collectiondb::resetColl ( char *coll , bool resetTurkdb ) {
cr->m_spiderRoundNum = 0;
cr->m_spiderRoundStartTime = 0;
cr->m_spiderStatus = 0;
cr->m_spiderStatusMsg = NULL;
// reset seed buf
cr->m_diffbotSeeds.purge();
// reset seed dedup table
HashTableX *ht = &cr->m_seedHashTable;
ht->reset();
// so XmlDoc.cpp can detect if the collection was reset since it
// launched its spider:
cr->m_lastResetCount++;

@ -194,6 +194,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
// skip known directives
if ( ! strncmp(p,"port-offset:",12) ||
! strncmp(p,"index-splits:",13) ||
! strncmp(p,"num-mirrors:",12) ||
! strncmp(p,"working-dir:",12) )
p = p;
// check if this is a spare host
@ -243,13 +244,14 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
if ( ! m_hosts ) return log(
"conf: Memory allocation failed.");
unsigned long maxShard = 0;
//unsigned long maxShard = 0;
long numGrunts = 0;
// now fill up m_hosts
p = m_buf;
i = 0;
long line = 1;
unsigned long lastShard = 0;
//unsigned long lastShard = 0;
long proxyNum = 0;
// assume defaults
@ -257,6 +259,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
long indexSplits = 0;
char *wdir2 = NULL;
long wdirlen2 = 0;
long numMirrors = -1;
for ( ; *p ; p++ , line++ ) {
if ( is_wspace_a (*p) ) continue;
@ -273,6 +276,15 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
continue;
}
if ( ! strncmp(p,"num-mirrors:",12) ) {
p += 12;
// skip spaces after the colon
while ( is_wspace_a(*p) ) p++;
numMirrors = atol(p);
while ( *p && *p != '\n' ) p++;
continue;
}
// does the line say "working-dir: xxxx" ?
if ( ! strncmp(p,"working-dir:",12) ) {
p += 12;
@ -351,13 +363,6 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
// skip numeric hostid or "proxy" keyword
while ( ! is_wspace_a(*p) ) p++;
if ( indexSplits == 0 ) {
g_errno = EBADENGINEER;
log("admin: need index-splits: xxx directive "
"in hosts.conf");
return false;
}
// read in switch id
//h->m_switchId = atoi(p);
@ -590,7 +595,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
// our group is based on our split!
//h->m_group = i % g_hostdb.m_indexSplits; // # grps
//h->m_group = i % indexSplits; // # grps
h->m_shardNum = i % indexSplits;
//h->m_shardNum = i % indexSplits;
// i guess proxy and spares don't count
if ( h->m_type != HT_GRUNT ) h->m_shardNum = 0;
@ -665,9 +670,12 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
h->m_externalHttpsPort = h->m_httpsPort;
// get max group number
if ( h->m_shardNum > maxShard && h->m_type==HT_GRUNT )
maxShard = h->m_shardNum;
//if ( h->m_shardNum > maxShard && h->m_type==HT_GRUNT )
// maxShard = h->m_shardNum;
if ( h->m_type == HT_GRUNT )
numGrunts++;
/*
if ( h->m_shardNum <= lastShard && h->m_shardNum != 0
&& !(h->m_type&(HT_ALL_PROXIES)) ) {
g_errno = EBADENGINEER;
@ -678,6 +686,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
filename,line);
}
lastShard = h->m_shardNum;
*/
// skip line now
while ( *p && *p != '\n' )
@ -742,10 +751,46 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
//m_numHosts = i;
m_numTotalHosts = i;
// how many shards are we configured for?
m_numShards = maxShard + 1; // g_conf.m_numGroups;
//m_numShards = maxShard + 1; // g_conf.m_numGroups;
// # of mirrors is zero if no mirrors,
// if it is 1 then each host has ONE MIRROR host
if ( numMirrors == 0 )
indexSplits = numGrunts;
if ( numMirrors > 0 )
indexSplits = numGrunts / (numMirrors+1);
if ( indexSplits == 0 ) {
g_errno = EBADENGINEER;
log("admin: need num-mirrors: xxx or "
"index-splits: xxx directive "
"in hosts.conf");
return false;
}
numMirrors = (numGrunts / indexSplits) - 1 ;
if ( numMirrors < 0 ) {
g_errno = EBADENGINEER;
log("admin: need num-mirrors: xxx or "
"index-splits: xxx directive "
"in hosts.conf (2)");
return false;
}
m_indexSplits = indexSplits;
m_numShards = numGrunts / (numMirrors+1);
//
// set Host::m_shardNum
//
for ( long i = 0 ; i < numGrunts ; i++ ) {
Host *h = &m_hosts[i];
h->m_shardNum = i % indexSplits;
}
// assign spare hosts
if ( m_numSpareHosts > MAX_SPARES ) {
log ( "conf: Number of spares (%li) exceeds max of %i, "

@ -12,7 +12,7 @@
//#include "Stats.h"
// put me back
#define _EFENCE_
//#define _EFENCE_
// uncomment this for _EFENCE_ to do underflow checks instead of the
// default overflow checks

@ -16,6 +16,21 @@
#include "Multicast.h"
#include "Syncdb.h"
//////////////
//
// Send out our records to add every X ms here:
//
// Batching up the add requests saves udp traffic
// on large networks (100+ hosts).
//
// . currently: send out adds once every 500ms
// . when this was 5000ms (5s) it would wait like
// 5s to spider a url after adding it.
//
//////////////
#define MSG4_WAIT 500
// we have up to this many outstanding Multicasts to send add requests to hosts
#define MAX_MCASTS 128
Multicast s_mcasts[MAX_MCASTS];
@ -98,8 +113,11 @@ bool registerHandler4 ( ) {
}
// . register sleep handler every MSG4_WAIT ms
// . right now MSG4_WAIT is 500ms... i lowered it from 5s
// to speed up spidering so it would harvest outlinks
// faster and be able to spider them right away.
// . returns false on failure
return g_loop.registerSleepCallback ( 5000 , NULL , sleepCallback4 );
return g_loop.registerSleepCallback(MSG4_WAIT,NULL,sleepCallback4 );
}
static void flushLocal ( ) ;
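The batching idea in the comment block above can be summarized with a rough, hypothetical sketch (not the actual Msg4 buffering code): records queue up between timer ticks and are flushed together, so the worst-case extra latency for a newly added record is one MSG4_WAIT interval.

// illustrative batcher only; the names and types here are made up
#include <string>
#include <vector>

static std::vector<std::string> s_pending;      // records queued since last tick

void queueAdd ( const std::string &rec ) { s_pending.push_back ( rec ); }

// imagine this being invoked by a sleep callback every MSG4_WAIT (500) ms,
// the way sleepCallback4 is registered above
void flushPending ( ) {
	if ( s_pending.empty() ) return;
	// a real implementation would multicast the whole batch here
	s_pending.clear();
}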
@ -475,7 +493,8 @@ bool Msg4::addMetaList ( char *metaList ,
if ( metaListSize == 0 ) return true;
// sanity
if ( collnum < 0 || collnum > 1000 ) { char *xx=NULL;*xx=0; }
//if ( collnum < 0 || collnum > 1000 ) { char *xx=NULL;*xx=0; }
if ( collnum < 0 ) { char *xx=NULL;*xx=0; }
// if first time set this
m_currentPtr = metaList;
@ -547,7 +566,8 @@ bool Msg4::addMetaList2 ( ) {
char *pend = m_metaList + m_metaListSize;
if ( m_collnum < 0 || m_collnum > 1000 ) { char *xx=NULL;*xx=0; }
//if ( m_collnum < 0 || m_collnum > 1000 ) { char *xx=NULL;*xx=0; }
if ( m_collnum < 0 ) { char *xx=NULL;*xx=0; }
// store each record in the list into the send buffers
for ( ; p < pend ; ) {

@ -1256,6 +1256,10 @@ void StateCD::printSpiderdbList ( RdbList *list , SafeBuf *sb ) {
long prevReplyError = 0;
time_t prevReplyDownloadTime = 0LL;
long badCount = 0;
long nowGlobalMS = gettimeofdayInMillisecondsGlobal();
CollectionRec *cr = g_collectiondb.getRec(m_collnum);
// parse through it
for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) {
// this record is either a SpiderRequest or SpiderReply
@ -1316,8 +1320,36 @@ void StateCD::printSpiderdbList ( RdbList *list , SafeBuf *sb ) {
if ( status == 0 ) msg = "Unexamined";
if ( status == -1 ) msg = mstrerror(prevReplyError);
// matching url filter, print out the expression
long ufn ;
ufn = ::getUrlFilterNum(sreq,
srep,
nowGlobalMS,
false,
MAX_NICENESS,
cr);
char *expression = NULL;
long priority = -4;
// sanity check
if ( ufn >= 0 ) {
expression = cr->m_regExs[ufn].getBufStart();
priority = cr->m_spiderPriorities[ufn];
}
if ( ! expression ) {
expression = "error. matches no expression!";
priority = -4;
}
// when spidering rounds we use the
// lastspidertime>={roundstart} --> spiders disabled rule
// so that we do not spider a url twice in the same round
if ( ufn >= 0 && ! cr->m_spidersEnabled[ufn] ) {
priority = -5;
}
// "csv" is default if json not specified
if ( m_fmt == FMT_JSON )
if ( m_fmt == FMT_JSON )
sb->safePrintf("[{"
"{\"url\":"
"\"%s\"},"
@ -1338,18 +1370,35 @@ void StateCD::printSpiderdbList ( RdbList *list , SafeBuf *sb ) {
, msg
);
// but default to csv
else
sb->safePrintf("%s,%lu,%li,\"%s\""
else {
sb->safePrintf("\"%s\",%lu,\"%s\",\"%s\",\""
//",%s"
"\n"
//"\n"
, sreq->m_url
// when was it first added to spiderdb?
, sreq->m_addedTime
, status
//, status
, msg
// the url filter expression it matches
, expression
// the priority
//, priorityMsg
//, iptoa(sreq->m_firstIp)
);
// print priority
if ( priority == SPIDER_PRIORITY_FILTERED )
sb->safePrintf("url ignored");
else if ( priority == SPIDER_PRIORITY_BANNED )
sb->safePrintf("url banned");
else if ( priority == -4 )
sb->safePrintf("error");
else if ( priority == -5 )
sb->safePrintf("will spider next round");
else
sb->safePrintf("%li",priority);
sb->safePrintf("\""
"\n");
}
}
if ( ! badCount ) return;
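For reference, a hypothetical line of the csv output built by the else branch above (quoted url, unquoted add time, status message, matching filter expression, then the quoted priority column) could look like:

"http://example.com/page.html",1382659200,"Unexamined","!hasreply","50"

with the last column replaced by "url ignored", "url banned", "error", or "will spider next round" for the special priorities handled above.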
@ -2649,15 +2698,40 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//if ( cx->m_collectionNameAlias.length() > 0 )
// alias=cx->m_collectionNameAlias.getBufStart();
//long paused = 1;
char *ss = "Normal";
char *ss = "In progress.";
if ( cx->m_spiderStatusMsg )
ss = cx->m_spiderStatusMsg;
// 0 means not to RE-crawl
char tmp[256];
// indicate if we are WAITING for next round...
if ( cx->m_collectiveRespiderFrequency > 0.0 &&
getTimeGlobal() < cx->m_spiderRoundStartTime ) {
long now = getTimeGlobal();
sprintf(tmp,"Spidering next round in %li "
"seconds.",
cx->m_spiderRoundStartTime - now
);
ss = tmp;
}
// if we sent an email simply because no urls
// were left and we are not recrawling!
if ( cx->m_collectiveRespiderFrequency == 0.0 &&
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
ss = "Crawl has exhausted all urls and "
"repeatCrawl is set to 0.0.";
}
if ( ! cx->m_spideringEnabled )
ss = "Crawl paused.";
CrawlInfo *ci = &cx->m_localCrawlInfo;
long sentAlert = (long)ci->m_sentCrawlDoneAlert;
if ( sentAlert ) sentAlert = 1;
//if ( cx->m_spideringEnabled ) paused = 0;
sb.safePrintf("\n\n{"
"\"name\":\"%s\",\n"
//"\"alias\":\"%s\",\n"
"\"crawlingEnabled\":%li,\n"
"\"crawlingStatus\":\"%s\",\n"
//"\"crawlingEnabled\":%li,\n"
"\"crawlStatus\":\"%s\",\n"
"\"sentCrawlDoneNotification\":%li,\n"
//"\"crawlingPaused\":%li,\n"
"\"objectsFound\":%lli,\n"
"\"urlsHarvested\":%lli,\n"
@ -2676,8 +2750,9 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//,cx->m_coll
, cx->m_diffbotCrawlName.getBufStart()
//, alias
, (long)cx->m_spideringEnabled
//, (long)cx->m_spideringEnabled
, ss
, sentAlert
//, (long)paused
, cx->m_globalCrawlInfo.m_objectsAdded -
cx->m_globalCrawlInfo.m_objectsDeleted
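Pieced together from the format strings above (hypothetical values; the fields from the continuation of this safePrintf are elided), one crawl's status object would start out roughly like:

{"name":"myDiffbotCrawl",
"crawlStatus":"Spidering next round in 120 seconds.",
"sentCrawlDoneNotification":0,
"objectsFound":1204,
"urlsHarvested":5311,
...}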
@ -4085,7 +4160,10 @@ bool resetUrlFilters ( CollectionRec *cr ) {
// if collectiverespiderfreq is 0 or less then do not RE-spider
// documents already indexed.
else {
cr->m_regExs[i].set("isindexed");
// this does NOT work! error docs continuously respider
// because they are never indexed!!! like EDOCSIMPLIFIEDREDIR
//cr->m_regExs[i].set("isindexed");
cr->m_regExs[i].set("hasreply");
cr->m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
@ -4376,3 +4454,27 @@ bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,
return true;
}
///////////
//
// SUPPORT for getting the last 100 spidered urls
//
// . sends request to each node
// . each node returns top 100 after scanning spiderdb (cache for speed)
// . master node gets top 100 of the top 100s
// . sends pretty html or json back to socket
// . then user can see why their crawl isn't working
// . also since we are scanning spiderdb indicate how many urls are
// ignored because they match "ismedia" or "!isonsamedomain" etc. so
// show each url filter expression then show how many urls matched that.
// when doing this make the spiderReply null, b/c the purpose is to see
// what urls would match which expression.
// . BUT url may never be attempted because it matches "ismedia" so that kind
// of thing might have to be indicated on the spiderdb dump above, not here.
//
//////////
//bool sendPageLast100Urls ( TcpSocket *socket , HttpRequest *hr ) {

@ -697,12 +697,10 @@ bool Parms::sendPageGeneric ( TcpSocket *s , HttpRequest *r , long page ,
"it from."
"</td></tr>"
"<tr><td>isnew | !isnew</td>"
"<tr><td>hasreply | !hasreply</td>"
"<td>"
"This is true if we have never tried to spider "
"this url. If we have tried to spider it and "
"received an error, like a timeout or something, "
"then it will no longer match <i>isnew</i>."
"This is true if we have tried to spider "
"this url, even if we got an error while trying."
"</td></tr>"

@ -1000,8 +1000,8 @@ SpiderColl::SpiderColl () {
m_numAdded = 0;
m_numBytesScanned = 0;
m_lastPrintCount = 0;
m_lastSpiderAttempt = 0;
m_lastSpiderCouldLaunch = 0;
//m_lastSpiderAttempt = 0;
//m_lastSpiderCouldLaunch = 0;
//m_numRoundsDone = 0;
//m_lastDoledbReadEmpty = false; // over all priorities in this coll
// re-set this to min and set m_needsWaitingTreeRebuild to true
@ -3954,14 +3954,24 @@ void doneSendingNotification ( void *state ) {
// as false again! use LOCAL crawlInfo, since global is reset often.
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = cr->m_spiderStatus;//1;
// be sure to save state so we do not re-send emails
cr->m_needsSave = 1;
// sanity
if ( cr->m_spiderStatus == 0 ) { char *xx=NULL;*xx=0; }
// sanity check
if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
// if not round done we are done
if ( cr->m_spiderStatus != SP_ROUNDDONE ) return;
// advance round if that round has completed, or there are no
// more urls to spider. if we hit maxToProcess/maxToCrawl then
// do not increment the round #. otherwise we should increment it.
if ( cr->m_spiderStatus == SP_MAXTOCRAWL ) return;
if ( cr->m_spiderStatus == SP_MAXTOPROCESS ) return;
// this should have been set below
if ( cr->m_spiderRoundStartTime == 0 ) { char *xx=NULL;*xx=0; }
//if ( cr->m_spiderRoundStartTime == 0 ) { char *xx=NULL;*xx=0; }
// how is this possible
//if ( getTimeGlobal()
@ -3980,7 +3990,10 @@ void doneSendingNotification ( void *state ) {
break;
}
if ( respiderFreq == -1.0 ) return;
// if not REcrawling, set this to 0 so we at least update our
// round # and round start time...
if ( respiderFreq == -1.0 )
respiderFreq = 0.0;
if ( respiderFreq < 0.0 ) {
log("spider: bad respiderFreq of %f. making 0.",
@ -3989,6 +4002,9 @@ void doneSendingNotification ( void *state ) {
}
long seconds = respiderFreq * 24*3600;
// add 1 for lastspidertime round off errors so we can be assured
// all spiders have a lastspidertime LESS than the new
// m_spiderRoundStartTime we set below.
if ( seconds <= 0 ) seconds = 1;
// now update this round start time. all the other hosts should
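A quick worked example of the conversion above: respiderFreq appears to be expressed in days given the 24*3600 factor, so a hypothetical value of 0.5 yields seconds = 0.5 * 24 * 3600 = 43200, while the 0.0 forced above for the no-recrawl case yields 0, which the check just above bumps to 1 so the new m_spiderRoundStartTime ends up strictly greater than every existing lastspidertime.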
@ -4011,6 +4027,15 @@ void doneSendingNotification ( void *state ) {
bool sendNotificationForCollRec ( CollectionRec *cr ) {
// do not send email for maxrounds hit, it will send a round done
// email for that. otherwise we end up calling doneSendingEmail()
// twice and incrementing the round twice
if ( cr->m_spiderStatus == SP_MAXROUNDS ) {
log("spider: not sending email for max rounds limit "
"since already sent for round done.");
return true;
}
// . if already sent email for this, skip
// . localCrawlInfo stores this value on disk so it is persistent
// . we do it this way so SP_ROUNDDONE can be emailed and then
@ -4139,11 +4164,14 @@ void SpiderLoop::spiderDoledUrls ( ) {
if ( ! cr->m_spideringEnabled ) continue;
// hit crawl round max?
if ( //cr->m_maxCrawlRounds > 0 &&
if ( cr->m_maxCrawlRounds > 0 &&
cr->m_spiderRoundNum >= cr->m_maxCrawlRounds ) {
cr->m_spiderStatus = SP_MAXROUNDS;
cr->m_spiderStatusMsg = "Hit maxCrawlRounds limit.";
sendNotificationForCollRec ( cr );
// it'll send a SP_ROUNDDONE email first
// so no need to repeat it, but we do want to
// update the status msg
//sendNotificationForCollRec ( cr );
continue;
}
@ -4175,6 +4203,24 @@ void SpiderLoop::spiderDoledUrls ( ) {
// set current time, synced with host #0
nowGlobal = getTimeGlobal();
// shortcut
CrawlInfo *ci = &cr->m_localCrawlInfo;
// the last time we attempted to spider a url for this coll
//m_sc->m_lastSpiderAttempt = nowGlobal;
// now we save this so when we restart these two times
// are from where we left off so we do not end up setting
// hasUrlsReadyToSpider to true which in turn sets
// the sentEmailAlert flag to false, which makes us
// send ANOTHER email alert!!
ci->m_lastSpiderAttempt = nowGlobal;
// update this for the first time in case it is never updated.
// then after 60 seconds we assume the crawl is done and
// we send out notifications. see below.
if ( ci->m_lastSpiderCouldLaunch == 0 )
ci->m_lastSpiderCouldLaunch = nowGlobal;
//
// . if doing respider with roundstarttime....
// . roundstarttime is > 0 if m_collectiveRespiderFrequency
@ -4184,19 +4230,13 @@ void SpiderLoop::spiderDoledUrls ( ) {
//
if ( nowGlobal < cr->m_spiderRoundStartTime ) continue;
// the last time we attempted to spider a url for this coll
m_sc->m_lastSpiderAttempt = nowGlobal;
// update this for the first time in case it is never updated.
// then after 60 seconds we assume the crawl is done and
// we send out notifications. see below.
if ( m_sc->m_lastSpiderCouldLaunch == 0 )
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
// if populating this collection's waitingtree assume
// we would have found something to launch as well. it might
// mean the waitingtree-saved.dat file was deleted from disk
// so we need to rebuild it at startup.
if ( m_sc->m_waitingTreeNeedsRebuild )
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
ci->m_lastSpiderCouldLaunch = nowGlobal;
// get max spiders
long maxSpiders = cr->m_maxNumSpiders;
if ( m_sc->m_isTestColl ) {
@ -4215,7 +4255,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
// obey max spiders per collection too
if ( m_sc->m_spidersOut >= maxSpiders ) {
// assume we would have launched a spider
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
ci->m_lastSpiderCouldLaunch = nowGlobal;
// try next collection
continue;
}
@ -4279,10 +4319,13 @@ void SpiderLoop::spiderDoledUrls ( ) {
loop:
// shortcut
CrawlInfo *ci = &cr->m_localCrawlInfo;
// bail if waiting for lock reply, no point in reading more
if ( m_msg12.m_gettingLocks ) {
// assume we would have launched a spider for this coll
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
ci->m_lastSpiderCouldLaunch = nowGlobal;
// wait for sleep callback to re-call us in 10ms
return;
}
@ -4344,7 +4387,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
// skip?
if ( out >= max ) {
// assume we could have launched a spider
if ( max > 0 ) m_sc->m_lastSpiderCouldLaunch = nowGlobal;
if ( max > 0 ) ci->m_lastSpiderCouldLaunch = nowGlobal;
// count as non-empty then!
//m_sc->m_encounteredDoledbRecs = true;
// try the priority below us
@ -4464,6 +4507,10 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// unlock
m_gettingDoledbList = false;
// shortcuts
CollectionRec *cr = m_sc->m_cr;
CrawlInfo *ci = &cr->m_localCrawlInfo;
// update m_msg5StartKey for next read
if ( m_list.getListSize() > 0 ) {
m_list.getLastKey((char *)&m_sc->m_msg5StartKey);
@ -4495,7 +4542,7 @@ bool SpiderLoop::gotDoledbList2 ( ) {
if ( bail ) {
// assume we could have launched a spider
m_sc->m_lastSpiderCouldLaunch = getTimeGlobal();
ci->m_lastSpiderCouldLaunch = getTimeGlobal();
// return false to indicate to try another
return false;
}
@ -4623,7 +4670,6 @@ bool SpiderLoop::gotDoledbList2 ( ) {
if ( pri < 0 || pri >= MAX_SPIDER_PRIORITIES ) { char *xx=NULL;*xx=0; }
// skip the priority if we already have enough spiders on it
long out = m_sc->m_outstandingSpiders[pri];
CollectionRec *cr = m_sc->m_cr;
// get the first ufn that uses this priority
//long max = getMaxAllowableSpidersOut ( pri );
// how many spiders can we have out?
@ -4661,7 +4707,7 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// skip? and re-get another doledb list from next priority...
if ( out >= max ) {
// assume we could have launched a spider
if ( max > 0 ) m_sc->m_lastSpiderCouldLaunch = nowGlobal;
if ( max > 0 ) ci->m_lastSpiderCouldLaunch = nowGlobal;
// this priority is maxed out, try next
m_sc->devancePriority();
// assume not an empty read
@ -4850,12 +4896,22 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// assume we launch the spider below. really this timestamp indicates
// the last time we COULD HAVE LAUNCHED *OR* did actually launch
// a spider
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
ci->m_lastSpiderCouldLaunch = nowGlobal;
// set crawl done email sent flag so another email can be sent again
// in case the user upped the maxToCrawl limit, for instance,
// so that the crawl could continue.
m_sc->m_cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
//ci->m_sentCrawlDoneAlert = 0;
// there are urls ready to spider
ci->m_hasUrlsReadyToSpider = true;
// reset reason why crawl is not running, because we basically are now
cr->m_spiderStatus = 0;
cr->m_spiderStatusMsg = NULL;
// be sure to save state so we do not re-send emails
cr->m_needsSave = 1;
// assume not an empty read
//m_sc->m_encounteredDoledbRecs = true;
@ -8322,6 +8378,24 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}
if ( *p=='h' && strncmp(p,"hasreply",8) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// if we got a reply, we are not new!!
if ( (bool)srep == (bool)val ) continue;
// skip it for speed
p += 8;
// check for &&
p = strstr(p, "&&");
// if nothing else, then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// hastmperror, if while spidering, the last reply was
// like EDNSTIMEDOUT or ETCPTIMEDOUT or some kind of
// usually temporary condition that warrants a retry
@ -8802,7 +8876,6 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
p += 2;
goto checkNextRule;
}
// iswww, means url is like www.xyz.com/...
if ( strncmp(p,"iswww", 5) == 0 ) {
// now this is a bit
@ -9863,9 +9936,13 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// but only if it was a crawl round done alert,
// not a maxToCrawl or maxToProcess or maxRounds
// alert.
if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ==
SP_ROUNDDONE )
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
// we can't do this because on startup we end up
// setting hasUrlsReadyToSpider to true and we
// may have already sent an email, and it gets RESET
// here when it shouldn't be
//if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ==
// SP_ROUNDDONE )
// cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
}
}
// return if still waiting on more to come in
@ -9874,6 +9951,15 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// sanity check
if ( cr->m_replies > cr->m_requests ) { char *xx=NULL;*xx=0; }
//if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert == SP_ROUNDDONE )
// if we have urls ready to be spidered then prepare to send another
// email/webhook notification
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
// update cache time
cr->m_globalCrawlInfo.m_lastUpdateTime = getTime();
@ -9932,9 +10018,9 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// if urls were considered and roundstarttime is still 0 then
// set it to the current time...
if ( cr->m_spiderRoundStartTime == 0 )
// all hosts in the network should sync with host #0 on this
cr->m_spiderRoundStartTime = getTimeGlobal();
//if ( cr->m_spiderRoundStartTime == 0 )
// // all hosts in the network should sync with host #0 on this
// cr->m_spiderRoundStartTime = getTimeGlobal();
// but of course if it has urls ready to spider, do not send alert...
// or if this is -1, indicating "unknown".
@ -9987,20 +10073,23 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
//long now = getTimeGlobal();
SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
//SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
// shortcut
CrawlInfo *ci = &cr->m_localCrawlInfo;
// assume it does
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 1;
//ci->m_hasUrlsReadyToSpider = 1;
// if we haven't spidered anything in 1 min assume the
// queue is basically empty...
if ( sc->m_lastSpiderAttempt &&
sc->m_lastSpiderCouldLaunch &&
if ( ci->m_lastSpiderAttempt &&
ci->m_lastSpiderCouldLaunch &&
//cr->m_spideringEnabled &&
//g_conf.m_spideringEnabled &&
sc->m_lastSpiderAttempt - sc->m_lastSpiderCouldLaunch > 60 )
ci->m_lastSpiderAttempt - ci->m_lastSpiderCouldLaunch > 60 )
// assume our crawl on this host is completed i guess
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 0;
ci->m_hasUrlsReadyToSpider = 0;
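The idle test used above can be condensed into a small hedged sketch (the stand-in struct and helper below are hypothetical, not part of the commit): the crawl on this host is treated as out of urls once the last spider attempt happened more than 60 seconds after the last moment a spider could have launched.

// hypothetical condensation of the check in handleRequestc1 above
#include <ctime>

struct CrawlTimes {
	time_t lastSpiderAttempt;      // last time we tried to launch a spider
	time_t lastSpiderCouldLaunch;  // last time a url was (or may have been) available
};

static bool crawlLooksIdle ( const CrawlTimes &t ) {
	// both timestamps must have been set at least once
	if ( ! t.lastSpiderAttempt || ! t.lastSpiderCouldLaunch ) return false;
	// idle if attempts continued for 60+ seconds past the last launchable moment
	return t.lastSpiderAttempt - t.lastSpiderCouldLaunch > 60;
}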

@ -980,11 +980,6 @@ class SpiderColl {
bool m_useTree;
// last time we launched a spider. 0 on startup.
time_t m_lastSpiderAttempt;
// time we had or might have had a url available for spidering
time_t m_lastSpiderCouldLaunch;
//bool m_lastDoledbReadEmpty;
//bool m_encounteredDoledbRecs;
//long long m_numRoundsDone;

@ -12898,8 +12898,14 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// add a '?' if none
if ( ! strchr ( apiUrl.getUrl() , '?' ) )
diffbotUrl.pushChar('?');
else
diffbotUrl.pushChar('&');
//diffbotUrl.safePrintf("http://54.212.86.74/api/%s?token=%s&u="
diffbotUrl.safePrintf("&token=%s",cr->m_diffbotToken.getBufStart());
diffbotUrl.safePrintf("token=%s",cr->m_diffbotToken.getBufStart());
diffbotUrl.safePrintf("&url=");
// give diffbot the url to process
diffbotUrl.urlEncode ( m_firstUrl.getUrl() );
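Assuming a hypothetical apiUrl of http://api.diffbot.com/v2/article (no '?' yet) and a first url of http://example.com/a, the string building above would produce roughly:

http://api.diffbot.com/v2/article?token=<token>&url=http%3A%2F%2Fexample.com%2Fa

with <token> standing in for the collection's diffbot token.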
@ -21492,20 +21498,20 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
// so if your first X filters all map to a "FILTERED"
// priority and this url matches one of them we can
// confidently toss this guy out.
long ufn = ::getUrlFilterNum ( &ksr , NULL, m_spideredTime ,
false, m_niceness, cr);
//long ufn = ::getUrlFilterNum ( &ksr , NULL, m_spideredTime ,
// false, m_niceness, cr);
// bad?
if ( ufn < 0 ) {
log("build: link %s had bad url filter."
, ksr.m_url );
g_errno = EBADENGINEER;
return NULL;
}
//if ( ufn < 0 ) {
// log("build: link %s had bad url filter."
// , ksr.m_url );
// g_errno = EBADENGINEER;
// return NULL;
//}
long priority = -1;
if ( ufn >= 0 )
priority = cr->m_spiderPriorities[ufn];
//long priority = -1;
//if ( ufn >= 0 )
// priority = cr->m_spiderPriorities[ufn];
// debug
if ( g_conf.m_logDebugUrlAttempts || isScraping ) {
@ -21526,10 +21532,15 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
sb2.getBufStart());
}
// do not add if bad priority, SPIDER_PRIORITY_FILTERED, ...
if ( priority == SPIDER_PRIORITY_FILTERED ) {
linksFiltered++; continue; }
if ( priority == SPIDER_PRIORITY_BANNED ) {
linksBanned++; continue; }
// . mdw: oct 24, 2013. now i add them so the urls show up in
// the pagecrawlbot.cpp spiderdb dump, so you can examine
// exactly why a url was crawled or not. plus if you change
// your mind about banning/filtering then it'd be nice to
// have these urls readily available.
//if ( priority == SPIDER_PRIORITY_FILTERED ) {
// linksFiltered++; continue; }
//if ( priority == SPIDER_PRIORITY_BANNED ) {
// linksBanned++; continue; }
// serialize into the buffer
long need = ksr.getRecSize();

@ -2,12 +2,14 @@
# Tells us what hosts are participating in the distributed search engine.
# This is how many pieces you want the index split into.
# So if you have 64 machines, and you want a unique piece of index on
# each machine, then make this 64. But if you have 64 machines and you
# want one level of redundancy then make this 32.
# How many mirrors do you want? If this is 0 then your data
# will NOT be replicated. If it is 1 then each host listed
# below will have one host that mirrors it, thereby decreasing
# total index capacity, but increasing redundancy. If this is
# 1 then the first half of hosts will be replicated by the
# second half of the hosts listed below.
index-splits: 1
num-mirrors: 0
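A hedged worked example of the new directive, with a hypothetical host count, tying it back to the arithmetic in Hostdb::init above:

# e.g. with 8 grunt hosts listed below and num-mirrors: 1, Hostdb::init
# computes indexSplits = 8 / (1+1) = 4, so the index has 4 shards;
# host i gets shard i % 4, meaning hosts 0-3 hold shards 0-3 and
# hosts 4-7 mirror them. With num-mirrors: 0 every host gets its own
# unique shard and nothing is replicated.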