Fix URL filters formulation.
Fixed extra comma in JSON output. Fixed upp and ucp pattern matching when all substrings are negative.
This commit is contained in:
parent
ca544ddb90
commit
040bdb8039
@ -2938,7 +2938,9 @@ bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
|
||||
|
||||
sb.safePrintf("\"notifyWebhook\":\"");
|
||||
sb.safeUtf8ToJSON ( cx->m_notifyUrl.getBufStart() );
|
||||
sb.safePrintf("\",\n");
|
||||
sb.safePrintf("\"\n");
|
||||
//sb.safePrintf("\",\n");
|
||||
|
||||
/////
|
||||
//
|
||||
// show url filters table. kinda hacky!!
|
||||
@ -2956,7 +2958,7 @@ bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
|
||||
*/
|
||||
//printUrlFilters ( sb , cx , FMT_JSON );
|
||||
// end that collection rec
|
||||
sb.safePrintf("\n}\n");
|
||||
sb.safePrintf("}\n");
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -4835,46 +4837,56 @@ bool resetUrlFilters ( CollectionRec *cr ) {
|
||||
cr->m_spiderPriorities [i] = 55;
|
||||
cr->m_spiderDiffbotApiUrl[i].set ( api );
|
||||
i++;
|
||||
}
|
||||
|
||||
// harvest links if we should crawl it
|
||||
if ( ucp ) {
|
||||
// if just matches ucp, just crawl it, do not process
|
||||
cr->m_regExs[i].set("matchesucp");
|
||||
cr->m_spiderPriorities [i] = 54;
|
||||
i++;
|
||||
}
|
||||
|
||||
// just process
|
||||
if ( upp ) {
|
||||
// just process, do not spider links if does not match ucp
|
||||
cr->m_regExs[i].set("matchesupp");
|
||||
cr->m_spiderPriorities [i] = 53;
|
||||
cr->m_harvestLinks [i] = false;
|
||||
cr->m_spiderDiffbotApiUrl[i].set ( api );
|
||||
i++;
|
||||
// do not crawl anything else
|
||||
cr->m_regExs[i].set("default");
|
||||
cr->m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
i++;
|
||||
}
|
||||
|
||||
if ( ucp && upp ) {
|
||||
// harvest links if we should crawl it
|
||||
if ( ucp && ! upp ) {
|
||||
cr->m_regExs[i].set("matchesucp");
|
||||
cr->m_spiderPriorities [i] = 54;
|
||||
// process everything since upp is empty
|
||||
cr->m_spiderDiffbotApiUrl[i].set ( api );
|
||||
i++;
|
||||
// do not crawl anything else
|
||||
cr->m_regExs[i].set("default");
|
||||
cr->m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
i++;
|
||||
}
|
||||
else if ( ucp ) {
|
||||
cr->m_regExs[i].set("default");
|
||||
cr->m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
|
||||
// just process
|
||||
if ( upp && ! ucp ) {
|
||||
cr->m_regExs[i].set("matchesupp");
|
||||
cr->m_spiderPriorities [i] = 53;
|
||||
cr->m_harvestLinks [i] = false;
|
||||
cr->m_spiderDiffbotApiUrl[i].set ( api );
|
||||
i++;
|
||||
}
|
||||
else if ( upp ) {
|
||||
// crawl everything by default, no processing
|
||||
cr->m_regExs[i].set("default");
|
||||
cr->m_spiderPriorities [i] = 50;
|
||||
i++;
|
||||
}
|
||||
else {
|
||||
|
||||
// no restraints
|
||||
if ( ! upp && ! ucp ) {
|
||||
// crawl everything by default, no processing
|
||||
cr->m_regExs[i].set("default");
|
||||
cr->m_spiderPriorities [i] = 50;
|
||||
cr->m_spiderDiffbotApiUrl[i].set ( api );
|
||||
i++;
|
||||
}
|
||||
|
||||
|
||||
cr->m_numRegExs = i;
|
||||
cr->m_numRegExs2 = i;
|
||||
|
15
Spider.cpp
15
Spider.cpp
@ -10565,6 +10565,7 @@ bool doesStringContainPattern ( char *content , char *pattern ) {
|
||||
char *p = pattern;
|
||||
|
||||
long matchedOne = 0;
|
||||
bool hadPositive = false;
|
||||
|
||||
long count = 0;
|
||||
// scan the " || " separated substrings
|
||||
@ -10595,6 +10596,8 @@ bool doesStringContainPattern ( char *content , char *pattern ) {
|
||||
start++;
|
||||
negative = true;
|
||||
}
|
||||
else
|
||||
hadPositive = true;
|
||||
// . is this substring anywhere in the document
|
||||
// . check the rawest content before converting to utf8 i guess
|
||||
char *foundPtr = strstr ( content , start ) ;
|
||||
@ -10604,8 +10607,14 @@ bool doesStringContainPattern ( char *content , char *pattern ) {
|
||||
// m_firstUrl.m_url,start);
|
||||
// revert \0
|
||||
*end = c;
|
||||
// negative is pad
|
||||
if ( foundPtr && negative ) return false;
|
||||
|
||||
// negative mean we should NOT match it
|
||||
if ( negative ) {
|
||||
// so if its matched, that is bad
|
||||
if ( foundPtr ) return false;
|
||||
continue;
|
||||
}
|
||||
|
||||
// skip if not found
|
||||
if ( ! foundPtr ) continue;
|
||||
// did we find it?
|
||||
@ -10618,6 +10627,8 @@ bool doesStringContainPattern ( char *content , char *pattern ) {
|
||||
if ( count == 0 ) return true;
|
||||
// must have matched one at least
|
||||
if ( matchedOne ) return true;
|
||||
// if all negative? i.e. !category||!author
|
||||
if ( ! hadPositive ) return true;
|
||||
// if we had an unfound substring...
|
||||
return false;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user