forked from Mirrors/privacore-open-source-search-engine
try to fix a few cores caused by deleting collections.
try to spider urls again if the user changes certain crawling parms, like regexes, patterns, etc.
@@ -1866,7 +1866,9 @@ bool CollectionRec::save ( ) {
 	snprintf ( tmp , 1023, "%scoll.%s.%li/localcrawlinfo.dat",
 		   g_hostdb.m_dir , m_coll , (long)m_collnum );
 	//log("coll: saving %s",tmp);
-	SafeBuf sb;
+	// in case emergency save from malloc core, do not alloc
+	char stack[1024];
+	SafeBuf sb(stack,1024);
 	//m_localCrawlInfo.print ( &sb );
 	// binary now
 	sb.safeMemcpy ( &m_localCrawlInfo , sizeof(CrawlInfo) );
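A minimal sketch of the stack-seeded buffer idea above, assuming a simplified StackBuf in place of gigablast's SafeBuf (the real class does far more). The point of the change: when save() runs from an emergency handler that fired inside malloc, calling malloc again can deadlock or corrupt the heap, so the buffer starts on the caller's stack and only touches the heap if the payload outgrows it.

	#include <cstdlib>
	#include <cstring>

	class StackBuf {                  // hypothetical stand-in for SafeBuf
	public:
		StackBuf ( char *stack , int cap )
			: m_buf(stack), m_cap(cap), m_len(0), m_owned(false) {}
		~StackBuf ( ) { if ( m_owned ) free ( m_buf ); }
		bool safeMemcpy ( const void *src , int n ) {
			if ( m_len + n > m_cap && ! grow ( m_len + n ) )
				return false;
			memcpy ( m_buf + m_len , src , n );
			m_len += n;
			return true;
		}
	private:
		bool grow ( int need ) {
			// heap fallback; never reached when the payload fits
			// the stack buffer, which is the whole trick
			char *p = (char *)malloc ( need );
			if ( ! p ) return false;
			memcpy ( p , m_buf , m_len );
			if ( m_owned ) free ( m_buf );
			m_buf = p; m_cap = need; m_owned = true;
			return true;
		}
		char *m_buf; int m_cap; int m_len; bool m_owned;
	};

	struct CrawlInfo { long m_urlsCrawled; };  // stand-in struct

	void saveLocalCrawlInfo ( const CrawlInfo &ci ) {
		char stack[1024];             // no malloc on the common path
		StackBuf sb ( stack , sizeof(stack) );
		sb.safeMemcpy ( &ci , sizeof(CrawlInfo) );
		// ... then write sb's bytes to localcrawlinfo.dat ...
	}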
@@ -363,6 +363,13 @@ bool readAndSendLoop ( StateCD *st , bool readFirst ) {
 	// over the network. return if that blocked
 	if ( readFirst && ! st->readDataFromRdb ( ) ) return false;
 
+	// did user delete their collection midstream on us?
+	if ( g_errno ) {
+		log("crawlbot: read shard data had error: %s",
+		    mstrerror(g_errno));
+		goto subloop;
+	}
+
 	// send it to the browser socket. returns false if blocks.
 	if ( ! st->sendList() ) return false;
 
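A sketch of the return-false-means-blocked convention this loop relies on; the names below are stand-ins, not the real StateCD API. In this codebase a handler returns false when it went asynchronous (a callback re-enters the loop later) and true when it finished inline, so a true return says nothing about success. That is why the hunk checks g_errno immediately after readDataFromRdb() returns: a read that "completed" may have completed with an error, for instance because the collection vanished mid-request.

	#include <cstdio>

	static int g_err = 0;             // stand-in for g_errno

	struct Reader {
		int m_calls = 0;
		// false = blocked (async); true = done, but g_err may be set
		bool readData ( ) {
			if ( m_calls++ == 0 ) return false;  // pretend async
			return true;
		}
		bool sendList ( ) { printf("sending list\n"); return true; }
	};

	bool readAndSendLoop ( Reader *st , bool readFirst ) {
		// read the data; return if that blocked
		if ( readFirst && ! st->readData ( ) ) return false;
		// did the read complete but fail?
		if ( g_err ) {
			fprintf ( stderr , "read error: %d\n" , g_err );
			return true;      // the real code jumps to its subloop
		}
		// forward to the client; false again means "blocked"
		if ( ! st->sendList ( ) ) return false;
		return true;
	}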
@@ -416,6 +423,14 @@ bool StateCD::readDataFromRdb ( ) {
 	key128_t ek; KEYMAX((char *)&ek,sizeof(key128_t));
 
+	CollectionRec *cr = g_collectiondb.getRec(m_collnum);
+	// collection got nuked?
+	if ( ! cr ) {
+		log("crawlbot: readdatafromrdb: coll %li got nuked",
+		    (long)m_collnum);
+		g_errno = ENOCOLLREC;
+		return true;
+	}
 
 	// top:
 	// launch one request to each shard
 	for ( long i = 0 ; i < g_hostdb.m_numShards ; i++ ) {
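A sketch of the lookup-by-number pattern this hunk adds; the registry and names below are hypothetical stand-ins for g_collectiondb. Because a collection can be deleted while a request is parked waiting on disk or network, the state object stores the small integer collnum rather than a CollectionRec pointer and re-resolves it on every re-entry; a null result means the collection was nuked, so the code sets ENOCOLLREC and returns "done" instead of dereferencing freed memory, which is exactly the kind of core this commit is chasing.

	#include <cstdio>
	#include <map>

	typedef long collnum_t;
	struct CollectionRec { const char *m_coll; };

	static std::map<collnum_t,CollectionRec> s_colls;  // stand-in registry
	static int g_err = 0;
	enum { ENOCOLLREC = 1 };          // stand-in error code

	CollectionRec *getRec ( collnum_t n ) {
		std::map<collnum_t,CollectionRec>::iterator it = s_colls.find(n);
		return it == s_colls.end() ? NULL : &it->second;
	}

	bool readDataFromRdb ( collnum_t collnum ) {
		// re-fetch by number; never cache the pointer across blocks
		CollectionRec *cr = getRec ( collnum );
		if ( ! cr ) {
			fprintf(stderr,"coll %li got nuked\n",collnum);
			g_err = ENOCOLLREC;
			return true;      // done (with error), not blocked
		}
		// ... launch one request to each shard using cr ...
		return true;
	}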
Parms.cpp (11 changed lines)
@@ -18371,8 +18371,15 @@ void handleRequest3fLoop ( void *weArg ) {
 	// . no, because if they change the filters and there are
 	// still no urls to spider i don't want to get another
 	// email alert!!
-	//cr->m_localCrawlInfo .m_hasUrlsReadyToSpider = true;
-	//cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = true;
+	// . no, we need to because it might have added one more
+	// url to be spidered then it'll be done! otherwise
+	// we can change our url filters to try to spider more urls
+	// but nothing will happen if the job has already completed
+	// unless we set these things to true
+	log("parms: reviving collection %s (%li) for parm change",
+	    cx->m_coll,(long)cx->m_collnum);
+	cx->m_localCrawlInfo .m_hasUrlsReadyToSpider = true;
+	cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider = true;
 	// . reconstruct the url filters if we were a custom crawl
 	// . this is used to abstract away the complexity of url
 	// filters in favor of simple regular expressions and
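A sketch of the revival logic, using hypothetical stand-ins for the real CollectionRec fields. A crawl that runs out of eligible urls clears m_hasUrlsReadyToSpider and goes idle; editing the url filters can make previously rejected urls eligible again, but nothing would ever re-evaluate them if the job already reported itself complete. Force-setting the flag on both the local (this host) and global (cluster-wide) crawl info wakes the spider loop back up.

	struct CrawlInfo { bool m_hasUrlsReadyToSpider; };

	struct CollRec {
		const char *m_coll;
		CrawlInfo   m_localCrawlInfo;   // this host's view of the crawl
		CrawlInfo   m_globalCrawlInfo;  // aggregated over all hosts
	};

	void onUrlFilterParmChange ( CollRec *cx ) {
		// revive the collection so the new filters actually apply,
		// even if the crawl previously declared itself finished
		cx->m_localCrawlInfo .m_hasUrlsReadyToSpider = true;
		cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider = true;
	}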
Spider.cpp (29 changed lines)
@@ -1004,10 +1004,32 @@ SpiderColl *SpiderCache::getSpiderCollIffNonNull ( collnum_t collnum ) {
 }
 
 bool tryToDeleteSpiderColl ( SpiderColl *sc ) {
 	// if not being deleted return false
 	if ( ! sc->m_deleteMyself ) return false;
-	if ( sc->m_msg5b.m_waitingForList ) return false;
-	if ( sc->m_msg1.m_mcast.m_inUse ) return false;
-	if ( sc->m_isLoading ) return false;
 	// otherwise always return true
+	if ( sc->m_msg5b.m_waitingForList ) {
+		log("spider: deleting sc=0x%lx for collnum=%li waiting1",
+		    (long)sc,(long)sc->m_collnum);
+		return true;
+	}
+	if ( sc->m_msg1.m_mcast.m_inUse ) {
+		log("spider: deleting sc=0x%lx for collnum=%li waiting2",
+		    (long)sc,(long)sc->m_collnum);
+		return true;
+	}
+	if ( sc->m_isLoading ) {
+		log("spider: deleting sc=0x%lx for collnum=%li waiting3",
+		    (long)sc,(long)sc->m_collnum);
+		return true;
+	}
+	// there's still a core of someone trying to write to someting
+	// in "sc" so we have to try to fix that. somewhere in xmldoc.cpp
+	// or spider.cpp. everyone should get sc from cr everytime i'd think
+	log("spider: deleting sc=0x%lx for collnum=%li",
+	    (long)sc,(long)sc->m_collnum);
+	CollectionRec *cr = sc->m_cr;
+	// make sure nobody has it
+	cr->m_spiderColl = NULL;
+	mdelete ( sc , sizeof(SpiderColl),"postdel1");
+	delete ( sc );
 	return true;
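A sketch of the deferred-delete pattern the new tryToDeleteSpiderColl implements; the flags below are stand-ins for m_msg5b.m_waitingForList, m_msg1.m_mcast.m_inUse, and m_isLoading. Freeing a SpiderColl while an outstanding disk read or multicast still holds a pointer to it is the source of the cores this commit fixes. The rewrite marks the object condemned, answers "deletion is handled" while anything is in flight, and only frees it once every in-use flag has cleared; completion callbacks are expected to call the function again.

	struct SpiderColl {
		bool m_deleteMyself = false;   // condemned?
		bool m_readInUse    = false;   // disk read outstanding
		bool m_castInUse    = false;   // multicast outstanding
		bool m_isLoading    = false;   // still loading from disk
		SpiderColl **m_ownerSlot = 0;  // e.g. &cr->m_spiderColl
	};

	// false: not condemned, caller may keep using sc.
	// true : condemned; freed now, or freed later by whoever clears
	//        the last in-use flag and calls us again. either way the
	//        caller must never touch sc after a true return.
	bool tryToDelete ( SpiderColl *sc ) {
		if ( ! sc->m_deleteMyself ) return false;
		if ( sc->m_readInUse || sc->m_castInUse || sc->m_isLoading )
			return true;           // still referenced: defer the free
		// make sure nobody has it before freeing
		if ( sc->m_ownerSlot ) *sc->m_ownerSlot = 0;
		delete sc;
		return true;
	}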
@@ -3278,7 +3300,6 @@ bool SpiderColl::evalIpLoop ( ) {
 	// pretend to block since we got deleted!!!
 	return false;
 
-
 	// . did reading the list from spiderdb have an error?
 	// . i guess we don't add to doledb then
 	if ( g_errno ) {
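A sketch of the "pretend to block" idiom visible in this last hunk, with stand-in names. Since false conventionally means "I went async, a callback will resume you," a routine that discovers its own collection was deleted can return false even though no callback will ever fire: the caller unwinds immediately, and the condemned object is simply never re-entered.

	struct IpLoopCtx { bool m_gotDeleted = false; };

	bool evalIpLoop ( IpLoopCtx *ctx ) {
		if ( ctx->m_gotDeleted )
			return false;  // pretend to block since we got deleted
		// ... otherwise scan spiderdb for this ip and add winners
		//     to doledb, checking g_errno after every read ...
		return true;
	}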