Fix url filters formulation.

Fixed an extra ',' in the JSON output.
Fixed the upp and ucp patterns when all
substrings are negative.
Matt Wells 2013-11-26 09:17:38 -08:00
parent ca544ddb90
commit 040bdb8039
2 changed files with 43 additions and 20 deletions

@@ -2938,7 +2938,9 @@ bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
sb.safePrintf("\"notifyWebhook\":\"");
sb.safeUtf8ToJSON ( cx->m_notifyUrl.getBufStart() );
sb.safePrintf("\",\n");
sb.safePrintf("\"\n");
//sb.safePrintf("\",\n");
/////
//
// show url filters table. kinda hacky!!
@@ -2956,7 +2958,7 @@ bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
*/
//printUrlFilters ( sb , cx , FMT_JSON );
// end that collection rec
sb.safePrintf("\n}\n");
sb.safePrintf("}\n");
return true;
}
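
The two hunks above are the "extra ',' in the JSON output" fix: strict JSON forbids a comma after the last member of an object, and notifyWebhook is apparently the last field emitted before the closing brace. A minimal sketch of the before/after output, in plain C++ rather than Gigablast's SafeBuf, with an invented webhook URL:

#include <cstdio>

int main() {
    const char *url = "http://example.com/hook"; // invented for illustration
    // before the fix: the last member ended with "\",\n" and the object
    // closed right after, yielding {"notifyWebhook":"...",\n} -- the
    // trailing comma makes strict JSON parsers reject it
    printf("{\"notifyWebhook\":\"%s\",\n}\n", url);
    // after the fix: "\"\n" drops the comma, so the object closes as
    // {"notifyWebhook":"..."\n} -- valid JSON
    printf("{\"notifyWebhook\":\"%s\"\n}\n", url);
    return 0;
}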
@@ -4835,46 +4837,56 @@ bool resetUrlFilters ( CollectionRec *cr ) {
cr->m_spiderPriorities [i] = 55;
cr->m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
// harvest links if we should crawl it
if ( ucp ) {
// if just matches ucp, just crawl it, do not process
cr->m_regExs[i].set("matchesucp");
cr->m_spiderPriorities [i] = 54;
i++;
}
// just process
if ( upp ) {
// just process, do not spider links if does not match ucp
cr->m_regExs[i].set("matchesupp");
cr->m_spiderPriorities [i] = 53;
cr->m_harvestLinks [i] = false;
cr->m_spiderDiffbotApiUrl[i].set ( api );
i++;
// do not crawl anything else
cr->m_regExs[i].set("default");
cr->m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
i++;
}
if ( ucp && upp ) {
// harvest links if we should crawl it
if ( ucp && ! upp ) {
cr->m_regExs[i].set("matchesucp");
cr->m_spiderPriorities [i] = 54;
// process everything since upp is empty
cr->m_spiderDiffbotApiUrl[i].set ( api );
i++;
// do not crawl anything else
cr->m_regExs[i].set("default");
cr->m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
i++;
}
else if ( ucp ) {
cr->m_regExs[i].set("default");
cr->m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
// just process
if ( upp && ! ucp ) {
cr->m_regExs[i].set("matchesupp");
cr->m_spiderPriorities [i] = 53;
cr->m_harvestLinks [i] = false;
cr->m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
else if ( upp ) {
// crawl everything by default, no processing
cr->m_regExs[i].set("default");
cr->m_spiderPriorities [i] = 50;
i++;
}
else {
// no restraints
if ( ! upp && ! ucp ) {
// crawl everything by default, no processing
cr->m_regExs[i].set("default");
cr->m_spiderPriorities [i] = 50;
cr->m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
cr->m_numRegExs = i;
cr->m_numRegExs2 = i;
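
The hunk above is the "url filters formulation" fix: judging by the conditions visible in the diff, the ucp and upp cases (the matchesucp/matchesupp url crawl and process patterns) are spelled out as four mutually exclusive combinations: ucp && upp, ucp && ! upp, upp && ! ucp, and ! upp && ! ucp. A rough standalone sketch of that decision table, using a hypothetical FilterRow struct in place of CollectionRec's parallel arrays; the priorities are the ones visible in the diff, and -3 merely stands in for SPIDER_PRIORITY_FILTERED, whose real value is not shown:

#include <string>
#include <vector>

// hypothetical row type standing in for CollectionRec's parallel arrays
// (m_regExs, m_spiderPriorities, m_harvestLinks, m_spiderDiffbotApiUrl)
struct FilterRow {
    std::string expr;     // url filter expression
    int         priority; // spider priority
    bool        harvest;  // harvest links from matching pages?
    bool        process;  // send matching pages to the Diffbot api url?
};

const int FILTERED = -3; // stand-in for SPIDER_PRIORITY_FILTERED

std::vector<FilterRow> buildFilters(bool ucp, bool upp) {
    std::vector<FilterRow> rows;
    if (ucp && upp) {
        rows.push_back({"matchesucp", 54, true,  false}); // just crawl it, do not process
        rows.push_back({"matchesupp", 53, false, true});  // just process, do not spider links
        rows.push_back({"default", FILTERED, false, false}); // do not crawl anything else
    } else if (ucp) {
        rows.push_back({"matchesucp", 54, true, true});   // process everything since upp is empty
        rows.push_back({"default", FILTERED, false, false});
    } else if (upp) {
        rows.push_back({"matchesupp", 53, false, true});
        rows.push_back({"default", 50, true, false});     // crawl everything, no processing
    } else {
        rows.push_back({"default", 50, true, true});      // no restraints
    }
    return rows;
}

Rows are presumably evaluated top-down, so the matchesucp/matchesupp rows take precedence over the catch-all "default" row that follows them.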

@@ -10565,6 +10565,7 @@ bool doesStringContainPattern ( char *content , char *pattern ) {
char *p = pattern;
long matchedOne = 0;
bool hadPositive = false;
long count = 0;
// scan the " || " separated substrings
@@ -10595,6 +10596,8 @@ bool doesStringContainPattern ( char *content , char *pattern ) {
start++;
negative = true;
}
else
hadPositive = true;
// . is this substring anywhere in the document
// . check the rawest content before converting to utf8 i guess
char *foundPtr = strstr ( content , start ) ;
@@ -10604,8 +10607,14 @@ bool doesStringContainPattern ( char *content , char *pattern ) {
// m_firstUrl.m_url,start);
// revert \0
*end = c;
// negative is bad
if ( foundPtr && negative ) return false;
// negative means we should NOT match it
if ( negative ) {
// so if it's matched, that is bad
if ( foundPtr ) return false;
continue;
}
// skip if not found
if ( ! foundPtr ) continue;
// did we find it?
@@ -10618,6 +10627,8 @@ bool doesStringContainPattern ( char *content , char *pattern ) {
if ( count == 0 ) return true;
// must have matched one at least
if ( matchedOne ) return true;
// if all negative? i.e. !category||!author
if ( ! hadPositive ) return true;
// if we had an unfound substring...
return false;
}
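
The hadPositive flag above is the "all substrings are negative" fix: a pattern such as !category||!author should match any document containing none of the negated substrings, instead of falling through to the final return false. A hedged re-implementation of the patched loop, simplified to std::string (the real code NUL-terminates each " || "-separated substring in place and also counts them to handle the empty pattern):

#include <cstring>
#include <string>
#include <vector>

// subs are the " || "-separated substrings of the pattern, each
// optionally prefixed with '!' to negate it
bool containsPattern(const char *content, const std::vector<std::string> &subs) {
    bool hadPositive = false;
    bool matchedOne  = false;
    for (const std::string &s : subs) {
        bool negative = !s.empty() && s[0] == '!';
        const char *needle = negative ? s.c_str() + 1 : s.c_str();
        bool found = strstr(content, needle) != nullptr;
        if (negative) {
            if (found) return false; // a negated substring matched: bad
            continue;
        }
        hadPositive = true;
        if (found) matchedOne = true;
    }
    if (matchedOne) return true;   // at least one positive substring hit
    if (!hadPositive) return true; // all-negative pattern and nothing hit
    return false;                  // positives existed but none matched
}

With this logic, containsPattern(doc, {"!category", "!author"}) is true exactly when doc contains neither substring, while containsPattern(doc, {"sports", "!category"}) still requires "sports" to be present and "category" to be absent.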