Fix siteListIsEmpty bug that caused the spider to spider the whole internet when it shouldn't.
This commit is contained in:
mwells
2014-06-03 11:37:31 -07:00
parent 51bb653bb3
commit ba2329808b
3 changed files with 7 additions and 2 deletions

@ -153,6 +153,8 @@ bool updateSiteListBuf ( collnum_t collnum ,
//sc->m_siteListAsteriskLine = NULL;
sc->m_siteListHasNegatives = false;
sc->m_siteListIsEmpty = true;
sc->m_siteListIsEmptyValid = true;
// use this so it will be free automatically when msg4 completes!
SafeBuf *spiderReqBuf = &sc->m_msg4x.m_tmpBuf;
@ -386,7 +388,7 @@ char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
// return sc->m_siteListAsteriskLine;
// if it is just a bunch of comments or blank lines, it is empty
if ( sc->m_siteListIsEmpty )
if ( sc->m_siteListIsEmpty && sc->m_siteListIsEmptyValid )
return NULL;
// if we had a list of contains: or regex: directives in the sitelist

@ -1132,6 +1132,7 @@ SpiderColl::SpiderColl () {
m_numAdded = 0;
m_numBytesScanned = 0;
m_lastPrintCount = 0;
m_siteListIsEmptyValid = false;
//m_lastSpiderAttempt = 0;
//m_lastSpiderCouldLaunch = 0;
//m_numRoundsDone = 0;
@ -10308,7 +10309,8 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
// if there is no domain or url explicitly listed
// then assume user is spidering the whole internet
// and we basically ignore "insitelist"
if ( sc->m_siteListIsEmpty ) {
if ( sc->m_siteListIsEmpty &&
sc->m_siteListIsEmptyValid ) {
// use a dummy row match
row = (char *)1;
}

@ -1116,6 +1116,7 @@ class SpiderColl {
//char *m_siteListAsteriskLine;
bool m_siteListHasNegatives;
bool m_siteListIsEmpty;
bool m_siteListIsEmptyValid;
// data buckets in this table are of type
HashTableX m_siteListDomTable;
// substring matches like "contains:goodstuff" or