Fix url filters formulation.

Fixed an extra ',' in the JSON output.
Fixed the upp and ucp patterns when all
substrings are negative.
Matt Wells 2013-11-26 09:17:38 -08:00
parent ca544ddb90
commit 040bdb8039
2 changed files with 43 additions and 20 deletions

@@ -2938,7 +2938,9 @@ bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
sb.safePrintf("\"notifyWebhook\":\"");
sb.safeUtf8ToJSON ( cx->m_notifyUrl.getBufStart() );
sb.safePrintf("\",\n");
sb.safePrintf("\"\n");
//sb.safePrintf("\",\n");
/////
//
// show url filters table. kinda hacky!!
@@ -2956,7 +2958,7 @@ bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
*/
//printUrlFilters ( sb , cx , FMT_JSON );
// end that collection rec
sb.safePrintf("\n}\n");
sb.safePrintf("}\n");
return true;
}
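
The two hunks above are the "extra ',' in the JSON output" fix: strict JSON forbids a comma after the last member of an object, and notifyWebhook is apparently the last field emitted before the closing brace. A minimal sketch of the before/after output, in plain C++ rather than Gigablast's SafeBuf, with an invented webhook URL:

#include <cstdio>

int main() {
    const char *url = "http://example.com/hook"; // invented for illustration
    // before the fix: the last member ended with "\",\n" and the object
    // closed right after, yielding {"notifyWebhook":"...",\n} -- the
    // trailing comma makes strict JSON parsers reject it
    printf("{\"notifyWebhook\":\"%s\",\n}\n", url);
    // after the fix: "\"\n" drops the comma, so the object closes as
    // {"notifyWebhook":"..."\n} -- valid JSON
    printf("{\"notifyWebhook\":\"%s\"\n}\n", url);
    return 0;
}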
@@ -4835,46 +4837,56 @@ bool resetUrlFilters ( CollectionRec *cr ) {
cr->m_spiderPriorities [i] = 55;
cr->m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
// harvest links if we should crawl it
if ( ucp ) {
// if just matches ucp, just crawl it, do not process
cr->m_regExs[i].set("matchesucp");
cr->m_spiderPriorities [i] = 54;
i++;
}
// just process
if ( upp ) {
// just process, do not spider links if does not match ucp
cr->m_regExs[i].set("matchesupp");
cr->m_spiderPriorities [i] = 53;
cr->m_harvestLinks [i] = false;
cr->m_spiderDiffbotApiUrl[i].set ( api );
i++;
// do not crawl anything else
cr->m_regExs[i].set("default");
cr->m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
i++;
}
if ( ucp && upp ) {
// harvest links if we should crawl it
if ( ucp && ! upp ) {
cr->m_regExs[i].set("matchesucp");
cr->m_spiderPriorities [i] = 54;
// process everything since upp is empty
cr->m_spiderDiffbotApiUrl[i].set ( api );
i++;
// do not crawl anything else
cr->m_regExs[i].set("default");
cr->m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
i++;
}
else if ( ucp ) {
cr->m_regExs[i].set("default");
cr->m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
// just process
if ( upp && ! ucp ) {
cr->m_regExs[i].set("matchesupp");
cr->m_spiderPriorities [i] = 53;
cr->m_harvestLinks [i] = false;
cr->m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
else if ( upp ) {
// crawl everything by default, no processing
cr->m_regExs[i].set("default");
cr->m_spiderPriorities [i] = 50;
i++;
}
else {
// no restraints
if ( ! upp && ! ucp ) {
// crawl everything by default, no processing
cr->m_regExs[i].set("default");
cr->m_spiderPriorities [i] = 50;
cr->m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
cr->m_numRegExs = i;
cr->m_numRegExs2 = i;
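
The hunk above is the "url filters formulation" fix: judging by the conditions visible in the diff, the ucp and upp cases (the matchesucp/matchesupp url crawl and process patterns) are spelled out as four mutually exclusive combinations: ucp && upp, ucp && ! upp, upp && ! ucp, and ! upp && ! ucp. A rough standalone sketch of that decision table, using a hypothetical FilterRow struct in place of CollectionRec's parallel arrays; the priorities are the ones visible in the diff, and -3 merely stands in for SPIDER_PRIORITY_FILTERED, whose real value is not shown:

#include <string>
#include <vector>

// hypothetical row type standing in for CollectionRec's parallel arrays
// (m_regExs, m_spiderPriorities, m_harvestLinks, m_spiderDiffbotApiUrl)
struct FilterRow {
    std::string expr;     // url filter expression
    int         priority; // spider priority
    bool        harvest;  // harvest links from matching pages?
    bool        process;  // send matching pages to the Diffbot api url?
};

const int FILTERED = -3; // stand-in for SPIDER_PRIORITY_FILTERED

std::vector<FilterRow> buildFilters(bool ucp, bool upp) {
    std::vector<FilterRow> rows;
    if (ucp && upp) {
        rows.push_back({"matchesucp", 54, true,  false}); // just crawl it, do not process
        rows.push_back({"matchesupp", 53, false, true});  // just process, do not spider links
        rows.push_back({"default", FILTERED, false, false}); // do not crawl anything else
    } else if (ucp) {
        rows.push_back({"matchesucp", 54, true, true});   // process everything since upp is empty
        rows.push_back({"default", FILTERED, false, false});
    } else if (upp) {
        rows.push_back({"matchesupp", 53, false, true});
        rows.push_back({"default", 50, true, false});     // crawl everything, no processing
    } else {
        rows.push_back({"default", 50, true, true});      // no restraints
    }
    return rows;
}

Rows are presumably evaluated top-down, so the matchesucp/matchesupp rows take precedence over the catch-all "default" row that follows them.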

@@ -10565,6 +10565,7 @@ bool doesStringContainPattern ( char *content , char *pattern ) {
char *p = pattern;
long matchedOne = 0;
bool hadPositive = false;
long count = 0;
// scan the " || " separated substrings
@@ -10595,6 +10596,8 @@ bool doesStringContainPattern ( char *content , char *pattern ) {
start++;
negative = true;
}
else
hadPositive = true;
// . is this substring anywhere in the document
// . check the rawest content before converting to utf8 i guess
char *foundPtr = strstr ( content , start ) ;
@@ -10604,8 +10607,14 @@ bool doesStringContainPattern ( char *content , char *pattern ) {
// m_firstUrl.m_url,start);
// revert \0
*end = c;
// negative is bad
if ( foundPtr && negative ) return false;
// negative means we should NOT match it
if ( negative ) {
// so if it's matched, that is bad
if ( foundPtr ) return false;
continue;
}
// skip if not found
if ( ! foundPtr ) continue;
// did we find it?
@@ -10618,6 +10627,8 @@ bool doesStringContainPattern ( char *content , char *pattern ) {
if ( count == 0 ) return true;
// must have matched one at least
if ( matchedOne ) return true;
// if all negative? i.e. !category||!author
if ( ! hadPositive ) return true;
// if we had an unfound substring...
return false;
}
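
The hadPositive flag above is the "all substrings are negative" fix: a pattern such as !category||!author should match any document containing none of the negated substrings, instead of falling through to the final return false. A hedged re-implementation of the patched loop, simplified to std::string (the real code NUL-terminates each " || "-separated substring in place and also counts them to handle the empty pattern):

#include <cstring>
#include <string>
#include <vector>

// subs are the " || "-separated substrings of the pattern, each
// optionally prefixed with '!' to negate it
bool containsPattern(const char *content, const std::vector<std::string> &subs) {
    bool hadPositive = false;
    bool matchedOne  = false;
    for (const std::string &s : subs) {
        bool negative = !s.empty() && s[0] == '!';
        const char *needle = negative ? s.c_str() + 1 : s.c_str();
        bool found = strstr(content, needle) != nullptr;
        if (negative) {
            if (found) return false; // a negated substring matched: bad
            continue;
        }
        hadPositive = true;
        if (found) matchedOne = true;
    }
    if (matchedOne) return true;   // at least one positive substring hit
    if (!hadPositive) return true; // all-negative pattern and nothing hit
    return false;                  // positives existed but none matched
}

With this logic, containsPattern(doc, {"!category", "!author"}) is true exactly when doc contains neither substring, while containsPattern(doc, {"sports", "!category"}) still requires "sports" to be present and "category" to be absent.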