Added URL process pattern and regex matching to XmlDoc.cpp.
2014-01-17 11:08:23 -08:00
parent 01a3282020
commit 0844dbf72a
2 changed files with 54 additions and 15 deletions
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@@ -13048,6 +13048,7 @@ skip:
 	THIS->m_masterLoop ( THIS->m_masterState );
 }

+/*
 SafeBuf *XmlDoc::getDiffbotApiUrl ( ) {

 	if ( m_diffbotApiUrlValid )
@@ -13085,6 +13086,7 @@ SafeBuf *XmlDoc::getDiffbotApiUrl ( ) {
 	//m_diffbotApiNumValid = true;
 	return &m_diffbotApiUrl;
 }
+*/

 // if only processing NEW is enabled, then do not
 bool *XmlDoc::getRecycleDiffbotReply ( ) {
@@ -13330,16 +13332,6 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
 	if ( m_indexCodeValid && m_indexCode )
 		return &m_diffbotReply;

-	// if already processed and onlyprocessifnew is enabled then
-	// we recycle and do not bother with this, we also do not nuke
-	// the diffbot json objects we have already indexed by calling
-	// nukeJSONObjects()
-	bool *recycle = getRecycleDiffbotReply();
-	if ( ! recycle || recycle == (void *)-1) return (SafeBuf *)recycle;
-	if ( *recycle ) {
-		m_diffbotReplyValid = true;
-		return &m_diffbotReply;
-	}

 	if ( m_isDiffbotJSONObject ) {
 		m_diffbotReplyValid = true;
@@ -13354,6 +13346,52 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
 		return &m_diffbotReply;
 	}

+	CollectionRec *cr = getCollRec();
+	if ( ! cr ) return NULL;
+
+	// get list of substring patterns
+	char *ucp = cr->m_diffbotUrlCrawlPattern.getBufStart();
+	char *upp = cr->m_diffbotUrlProcessPattern.getBufStart();
+	if ( upp && ! upp[0] ) upp = NULL;
+	if ( ucp && ! ucp[0] ) ucp = NULL;
+	// do we match the url process pattern or regex?
+	// get the compiled regular expressions
+	regex_t *ucr = &cr->m_ucr;
+	regex_t *upr = &cr->m_upr;
+	if ( ! cr->m_hasucr ) ucr = NULL;
+	if ( ! cr->m_hasupr ) upr = NULL;
+	// get the url
+	Url *f = getFirstUrl();
+	char *url = f->getUrl();
+	// . "upp" is a ||-separated list of substrings
+	// . "upr" is a regex
+	// . regexec returns 0 for a match
+	if ( upr && regexec(upr,url,0,NULL,0) ) {
+		// return empty reply
+		m_diffbotReplyValid = true;
+		return &m_diffbotReply;
+	}
+	if ( upp && !upr &&!doesStringContainPattern(url,upp)) {
+		// return empty reply
+		m_diffbotReplyValid = true;
+		return &m_diffbotReply;
+	}
+
+
+	
+
+	// if already processed and onlyprocessifnew is enabled then
+	// we recycle and do not bother with this, we also do not nuke
+	// the diffbot json objects we have already indexed by calling
+	// nukeJSONObjects()
+	bool *recycle = getRecycleDiffbotReply();
+	if ( ! recycle || recycle == (void *)-1) return (SafeBuf *)recycle;
+	if ( *recycle ) {
+		m_diffbotReplyValid = true;
+		return &m_diffbotReply;
+	}
+
+
 	// if set from title rec, do not do it. we are possibly an "old doc"
 	// and we should only call diffbot.com with new docs
 	if ( m_setFromTitleRec ) {
@@ -13363,8 +13401,9 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {

 	// . check the url filters table to see if diffbot api is specified
 	// . just return "\0" if none, but NULL means error i guess
-	SafeBuf *au = getDiffbotApiUrl();
-	if ( ! au || au == (void *)-1 ) return (SafeBuf *)au;
+	//SafeBuf *au = getDiffbotApiUrl();
+	//if ( ! au || au == (void *)-1 ) return (SafeBuf *)au;
+	SafeBuf *au = &cr->m_diffbotApiUrl;

 	// if no url, assume do not access diffbot
 	if ( au->length() <= 0 ) {
@@ -13588,8 +13627,8 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
 	}
 	*/

-	CollectionRec *cr = getCollRec();
-	if ( ! cr ) return NULL;
+	//CollectionRec *cr = getCollRec();
+	//if ( ! cr ) return NULL;

 	// add a '?' if none
 	if ( ! strchr ( apiUrl.getUrl() , '?' ) )
--- a/XmlDoc.h
+++ b/XmlDoc.h
@@ -797,7 +797,7 @@ class XmlDoc {
 	char *getIsNoArchive ( ) ;
 	long *getUrlFilterNum();
 	//long *getDiffbotApiNum();
-	SafeBuf *getDiffbotApiUrl();
+	//SafeBuf *getDiffbotApiUrl();
 	long long **getAdVector ( ) ;
 	char *getIsLinkSpam ( ) ;
 	char *getIsHijacked();