mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-15 02:36:08 -04:00
Fix siteListIsEmpty bug that caused the spider to
crawl the whole internet when it shouldn't
This commit is contained in:
@@ -153,6 +153,8 @@ bool updateSiteListBuf ( collnum_t collnum ,
|
||||
//sc->m_siteListAsteriskLine = NULL;
|
||||
sc->m_siteListHasNegatives = false;
|
||||
sc->m_siteListIsEmpty = true;
|
||||
|
||||
+ sc->m_siteListIsEmptyValid = true;
|
||||
|
||||
// use this so it will be free automatically when msg4 completes!
|
||||
SafeBuf *spiderReqBuf = &sc->m_msg4x.m_tmpBuf;
|
||||
@@ -386,7 +388,7 @@ char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
|
||||
// return sc->m_siteListAsteriskLine;
|
||||
|
||||
// if it is just a bunch of comments or blank lines, it is empty
|
||||
- if ( sc->m_siteListIsEmpty )
|
||||
+ if ( sc->m_siteListIsEmpty && sc->m_siteListIsEmptyValid )
|
||||
return NULL;
|
||||
|
||||
// if we had a list of contains: or regex: directives in the sitelist
|
||||
|
@@ -1132,6 +1132,7 @@ SpiderColl::SpiderColl () {
|
||||
m_numAdded = 0;
|
||||
m_numBytesScanned = 0;
|
||||
m_lastPrintCount = 0;
|
||||
+ m_siteListIsEmptyValid = false;
|
||||
//m_lastSpiderAttempt = 0;
|
||||
//m_lastSpiderCouldLaunch = 0;
|
||||
//m_numRoundsDone = 0;
|
||||
@@ -10308,7 +10309,8 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
// if there is no domain or url explicitly listed
|
||||
// then assume user is spidering the whole internet
|
||||
// and we basically ignore "insitelist"
|
||||
- if ( sc->m_siteListIsEmpty ) {
|
||||
+ if ( sc->m_siteListIsEmpty &&
|
||||
+ sc->m_siteListIsEmptyValid ) {
|
||||
// use a dummy row match
|
||||
row = (char *)1;
|
||||
}
|
||||
|
1
Spider.h
1
Spider.h
@@ -1116,6 +1116,7 @@ class SpiderColl {
|
||||
//char *m_siteListAsteriskLine;
|
||||
bool m_siteListHasNegatives;
|
||||
bool m_siteListIsEmpty;
|
||||
+ bool m_siteListIsEmptyValid;
|
||||
// data buckets in this table are of type
|
||||
HashTableX m_siteListDomTable;
|
||||
// substring matches like "contains:goodstuff" or
|
||||
|
Reference in New Issue
Block a user