Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing

Conflicts:
	Collectiondb.cpp
Matt Wells
2015-05-06 09:58:51 -07:00
8 changed files with 248 additions and 53 deletions

@@ -3383,24 +3383,93 @@ bool CollectionRec::hasSearchPermission ( TcpSocket *s , int32_t encapIp ) {
bool expandRegExShortcuts ( SafeBuf *sb ) ;
void nukeDoledb ( collnum_t collnum );
bool CollectionRec::rebuildUrlFiltersDiffbot() {
//logf(LOG_DEBUG,"db: rebuilding url filters");
// rebuild the regexes related to diffbot, such as the one for the URL pattern
bool CollectionRec::rebuildDiffbotRegexes() {
//logf(LOG_DEBUG,"db: rebuilding url filters");
char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
// get the regexes
if ( ! ucp ) ucp = m_diffbotUrlCrawlRegEx.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
char *upp = m_diffbotUrlProcessPattern.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
char *ppp = m_diffbotPageProcessPattern.getBufStart();
if ( ppp && ! ppp[0] ) ppp = NULL;
// recompile the regexes now
if ( m_hasucr ) {
regfree ( &m_ucr );
m_hasucr = false;
}
if ( m_hasupr ) {
regfree ( &m_upr );
m_hasupr = false;
}
// copy into tmpbuf
SafeBuf tmp;
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasucr = true;
}
if ( rx && regcomp ( &m_ucr , tmp.getBufStart() ,
REG_EXTENDED| //REG_ICASE|
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_ucr );
m_hasucr = false;
}
rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasupr = true;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasupr = true;
}
if ( rx && regcomp ( &m_upr , tmp.getBufStart() ,
REG_EXTENDED| // REG_ICASE|
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_upr );
m_hasupr = false;
}
return true;
}
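
A minimal sketch (not part of this commit) of how the regexes compiled above would typically be consulted for a candidate url. The helper names urlMatchesCrawlPattern() and urlMatchesProcessPattern() are hypothetical; m_hasucr/m_ucr and m_hasupr/m_upr are the members rebuildDiffbotRegexes() sets.

#include <regex.h>

// returns true if the url passes the compiled url crawl regex,
// or if no crawl regex was compiled at all
bool urlMatchesCrawlPattern ( CollectionRec *cr , const char *url ) {
	// no compiled regex means no restriction
	if ( ! cr->m_hasucr ) return true;
	// regexec() returns 0 when the pattern matches
	return regexec ( &cr->m_ucr , url , 0 , NULL , 0 ) == 0;
}

// same idea for the url process regex
bool urlMatchesProcessPattern ( CollectionRec *cr , const char *url ) {
	if ( ! cr->m_hasupr ) return true;
	return regexec ( &cr->m_upr , url , 0 , NULL , 0 ) == 0;
}
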
bool CollectionRec::rebuildUrlFiltersDiffbot() {
//logf(LOG_DEBUG,"db: rebuilding url filters");
char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
// if we had a regex, that works for this purpose as well
if ( ! ucp ) ucp = m_diffbotUrlCrawlRegEx.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
char *upp = m_diffbotUrlProcessPattern.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
// if we had a regex, that works for this purpose as well
if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
char *ppp = m_diffbotPageProcessPattern.getBufStart();
if ( ppp && ! ppp[0] ) ppp = NULL;
@@ -3469,7 +3538,6 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_hasupr = false;
}
// what diffbot url to use for processing
char *api = m_diffbotApiUrl.getBufStart();
if ( api && ! api[0] ) api = NULL;
@@ -3869,9 +3937,15 @@ bool CollectionRec::rebuildUrlFilters ( ) {
}
// the code below is only for diffbot custom crawls
if ( ! m_isCustomCrawl ) return true; //!= 1 && // crawl api
// If the crawl is not generated by crawlbot, then we will just update
// the regexes concerning the urls to process
rebuildDiffbotRegexes();
if ( ! m_isCustomCrawl ){
return true;
}
// on the other hand, if it is a crawlbot job, then by convention the url filters are all set
// to some default ones.
return rebuildUrlFiltersDiffbot();
}

@@ -394,6 +394,9 @@ class CollectionRec {
// for diffbot crawl or bulk jobs
bool rebuildUrlFiltersDiffbot();
// rebuild the regexes related to diffbot, such as the one for the URL pattern
bool rebuildDiffbotRegexes();
bool rebuildLangRules( char *lang , char *tld );
bool rebuildShallowRules();

@@ -6352,6 +6352,41 @@ bool Msg40::printFacetTables ( SafeBuf *sb ) {
int32_t saved = sb->length();
// If json, print beginning of json array
if ( format == FORMAT_JSON ) {
if ( m_si->m_streamResults ) {
// if we are streaming results in json, we may have hacked off
// the last ,\n so we need a comma to put it back
bool needComma = true;
// check if the last non-whitespace char in the
// buffer is a comma
for (int32_t i= sb->m_length-1; i >= 0; i--) {
char c = sb->getBufStart()[i];
if (c == '\n' || c == ' ') {
// ignore whitespace chars
continue;
}
// If the loop reaches this point, we have a
// non-whitespace char, so we break the loop
// either way
if (c == ',') {
// last non-whitespace char is a comma,
// so we don't need to add an extra one
needComma = false;
}
break;
}
if ( needComma ) {
sb->safeStrcpy(",\n\n");
}
}
sb->safePrintf("\"facets\":[");
}
int numTablesPrinted = 0;
for ( int32_t i = 0 ; i < m_si->m_q.getNumTerms() ; i++ ) {
// only for html for now i guess
//if ( m_si->m_format != FORMAT_HTML ) break;
@@ -6363,10 +6398,25 @@ bool Msg40::printFacetTables ( SafeBuf *sb ) {
continue;
// if had facet ranges, print them out
printFacetsForTable ( sb , qt );;
if ( printFacetsForTable ( sb , qt ) > 0 )
numTablesPrinted++;
}
// If json, print end of json array
if ( format == FORMAT_JSON ) {
if ( numTablesPrinted > 0 ) {
sb->m_length -= 2; // hack off trailing comma
sb->safePrintf("],\n"); // close off json array
}
// if no facets then do not print "facets":[]\n,
else {
// revert string buf to original length
sb->m_length = saved;
// and cap the string buf just in case
sb->nullTerm();
}
}
// if json, remove ending ,\n and make it just \n
if ( format == FORMAT_JSON && sb->length() != saved ) {
// remove ,\n
@@ -6387,7 +6437,7 @@ bool Msg40::printFacetTables ( SafeBuf *sb ) {
return true;
}
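
A standalone sketch (not from this commit) of the comma-repair and array-wrapping logic used above when streaming JSON results, written with std::string in place of SafeBuf for brevity; appendFacetsArray() is a hypothetical helper.

#include <string>

// append a "facets" array to a json results buffer; when streaming, the
// buffer may or may not already end in a comma, and if there are no facet
// objects at all we print nothing (mirroring the revert-to-saved-length path)
static void appendFacetsArray ( std::string &out , const std::string &body ) {
	// no facet tables printed: leave the buffer untouched
	if ( body.empty() ) return;
	// find the last non-whitespace char already in the buffer
	size_t i = out.find_last_not_of ( " \n" );
	// streamed results may have had their trailing ",\n" hacked off,
	// so put a comma back only if one is not already there
	if ( i != std::string::npos && out[i] != ',' ) out += ",\n\n";
	out += "\"facets\":[";
	out += body;   // the individual facet objects, comma-separated
	out += "],\n";
}
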
bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
int32_t Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
//QueryWord *qw = qt->m_qword;
//if ( qw->m_numFacetRanges > 0 )
@@ -6397,9 +6447,14 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
int32_t *ptrs = (int32_t *)qt->m_facetIndexBuf.getBufStart();
int32_t numPtrs = qt->m_facetIndexBuf.length() / sizeof(int32_t);
if ( numPtrs == 0 )
return 0;
int32_t numPrinted = 0;
// now scan the slots and print out
HttpRequest *hr = &m_si->m_hr;
bool firstTime = true;
bool isString = false;
bool isFloat = false;
bool isInt = false;
@@ -6409,6 +6464,7 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
char format = m_si->m_format;
// a new table for each facet query term
bool needTable = true;
// print out the dumps
for ( int32_t x= 0 ; x < numPtrs ; x++ ) {
// skip empty slots
@@ -6516,7 +6572,9 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
text = m_facetTextBuf.getBufStart() + *offset;
}
if ( format == FORMAT_XML ) {
numPrinted++;
sb->safePrintf("\t<facet>\n"
"\t\t<field>%s</field>\n"
, term );
@@ -6573,17 +6631,6 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
continue;
}
if ( format == FORMAT_JSON && firstTime ) {
firstTime = false;
// if streaming results we may have hacked off
// the last ,\n so put it back
if ( m_si->m_streamResults ) {
//sb->m_length -= 1;
sb->safeStrcpy(",\n\n");
}
//sb->safePrintf("\"facets\":[\n");
}
// print that out
if ( needTable && format == FORMAT_HTML ) {
needTable = false;
@@ -6619,13 +6666,8 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
}
if ( needTable && format == FORMAT_JSON ) {
needTable = false;
sb->safePrintf("\"facets\":[");
}
if ( format == FORMAT_JSON ) {
numPrinted++;
sb->safePrintf("{\n"
"\t\"field\":\"%s\",\n"
, term
@@ -6779,6 +6821,8 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
SafeBuf newUrl;
replaceParm ( newStuff.getBufStart(), &newUrl , hr );
numPrinted++;
// print the facet in its numeric form
// we will have to lookup based on its docid
// and get it from the cached page later
@@ -6799,13 +6843,8 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
,count); // count for printing
}
if ( ! needTable && format == FORMAT_JSON ) {
sb->m_length -= 2; // hack off trailing comma
sb->safePrintf("],\n"); // close off json array
}
if ( ! needTable && format == FORMAT_HTML )
sb->safePrintf("</table></div><br>\n");
return true;
return numPrinted;
}

@@ -227,7 +227,7 @@ class Msg40 {
int32_t m_omitCount;
bool printFacetTables ( class SafeBuf *sb ) ;
bool printFacetsForTable ( SafeBuf *sb , QueryTerm *qt );
int32_t printFacetsForTable ( SafeBuf *sb , QueryTerm *qt );
bool lookupFacets ( ) ;
void lookupFacets2 ( ) ;
void gotFacetText ( class Msg20 *msg20 ) ;

@@ -22562,6 +22562,14 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
"You have to use the respider frequency as well "
"to adjust how often you want things respidered."
"</td></tr>"
"<tr class=poo><td>urlage</td>"
"<td>"
"This is the time, in seconds, since a url was first "
"added to spiderdb to be spidered. This is "
"its discovery date. "
"Can use <, >, <=, >=, ==, != comparison operators."
"</td></tr>"
//"<tr class=poo><td>!newoutlink</td>"
@@ -22584,6 +22592,20 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
"older permalinks into a slower spider queue."
"</td></tr>"
"<tr class=poo><td>spiderwaited &lt; 3600</td>"
"<td>"
"<i>spiderwaited</i> is how many seconds have elapsed "
"since the last time "
"we tried to spider/download the url. "
"The constaint containing <i>spiderwaited</i> will "
"fail to be matched if the url has never been "
"attempted to be spidered/downloaded before. Therefore, "
"it will only ever match urls that have a spider reply "
"of some sort, so there is no need to add an additional "
"<i>hasreply</i>-based constraint."
"</td></tr>"
"<tr class=poo><td>"
"<a name=insitelist>"
"insitelist | !insitelist"
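
Hypothetical example expressions (not part of this commit) using the two selectors documented above, written in the same syntax as the existing url filter examples; expressions can be combined with && as the filter parser below shows:

urlage > 86400
spiderwaited < 3600
urlage > 86400 && spiderwaited < 3600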

@@ -4562,6 +4562,9 @@ bool SpiderColl::scanListForWinners ( ) {
spiderTimeMS ,
uh48 );
// assume our added time is the first time this url was added
sreq->m_discoveryTime = sreq->m_addedTime;
// if ( uh48 == 110582802025376LL )
// log("hey");
@@ -4591,10 +4594,12 @@ bool SpiderColl::scanListForWinners ( ) {
// and the min added time as well!
// get the oldest timestamp so
// gbssDiscoveryTime will be accurate.
if ( sreq->m_addedTime < wsreq->m_addedTime )
wsreq->m_addedTime = sreq->m_addedTime;
if ( wsreq->m_addedTime < sreq->m_addedTime )
sreq->m_addedTime = wsreq->m_addedTime;
if ( sreq->m_discoveryTime < wsreq->m_discoveryTime )
wsreq->m_discoveryTime =
sreq->m_discoveryTime;
if ( wsreq->m_discoveryTime < sreq->m_discoveryTime )
sreq->m_discoveryTime =
wsreq->m_discoveryTime;
}
@@ -12399,6 +12404,37 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}
// selector using the first time it was added to the Spiderdb
// added by Sam, May 5th 2015
if ( *p=='u' && strncmp(p,"urlage",6) == 0 ) {
// skip for msg20
if ( isForMsg20 ) {
//log("was for message 20");
continue;
}
// get the age of the spider request.
// (subtraction of uint with int, hope
// everything goes well there)
int32_t sreq_age = 0;
if ( sreq ) sreq_age = nowGlobal-sreq->m_discoveryTime;
//log("spiderage=%d",sreq_age);
// the argument entered by the user
int32_t argument_age=atoi(s) ;
if ( sign == SIGN_EQ && sreq_age != argument_age ) continue;
if ( sign == SIGN_NE && sreq_age == argument_age ) continue;
if ( sign == SIGN_GT && sreq_age <= argument_age ) continue;
if ( sign == SIGN_LT && sreq_age >= argument_age ) continue;
if ( sign == SIGN_GE && sreq_age < argument_age ) continue;
if ( sign == SIGN_LE && sreq_age > argument_age ) continue;
p = strstr(s, "&&");
// if there is no '&&', then it is a match
if ( ! p ) return i;
// skip the '&&' and go to the next rule
p += 2;
goto checkNextRule;
}
if ( *p=='e' && strncmp(p,"errorcount",10) == 0 ) {
// if we do not have enough info for outlink, all done
@@ -12521,16 +12557,16 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
// skip for msg20
if ( isForMsg20 ) continue;
// do not match rule if never attempted
if ( srep->m_spideredTime == 0 ) {
char*xx=NULL;*xx=0;}
if ( srep->m_spideredTime == (uint32_t)-1){
char*xx=NULL;*xx=0;}
// int16_tcut
float af = (srep->m_spideredTime - nowGlobal);
// if ( srep->m_spideredTime == 0 ) {
// char*xx=NULL;*xx=0;}
// if ( srep->m_spideredTime == (uint32_t)-1){
// char*xx=NULL;*xx=0;}
// shortcut
int32_t a = nowGlobal - srep->m_spideredTime;
// make into days
af /= (3600.0*24.0);
//af /= (3600.0*24.0);
// back to a int32_t, round it
int32_t a = (int32_t)(af + 0.5);
//int32_t a = (int32_t)(af + 0.5);
// make it point to the priority
int32_t b = atoi(s);
// compare
@@ -13062,6 +13098,7 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
// . if the same check who has the most recent added time
// . if we are not the most recent, just do not add us
// . no, now i want the oldest so we can do gbssDiscoveryTime
// and set sreq->m_discoveryTime accurately, above
if ( sreq->m_addedTime >= oldReq->m_addedTime ) continue;
// otherwise, erase over him
dst = restorePoint;

@@ -522,10 +522,16 @@ class SpiderRequest {
int32_t m_parentDomHash32;
int32_t m_parentSiteHash32;
// if there are several spiderrequests for a url, this should be
// the earliest m_addedTime, basically, the url discovery time. this is
// NOT valid in spiderdb, but only set upon selecting the url to spider
// when we scan all of the SpiderRequests it has.
int32_t m_discoveryTime;
// the PROBABLE DOCID. if there is a collision with another docid
// then we increment the last 8 bits or so. see Msg22.cpp.
//int64_t m_probDocId;
int32_t m_reservedc1;
//int32_t m_reservedc1;
int32_t m_reservedc2;
//int32_t m_parentPubDate;
@@ -829,6 +835,7 @@ class SpiderReply {
// a SpiderRec outright
key128_t m_key;
// this can be used for something else really. all SpiderReplies are fixed sz
int32_t m_dataSize;
// for calling getHostIdToDole()

@@ -5362,7 +5362,8 @@ Dates *XmlDoc::getDates ( ) {
m_sreq.m_parentPrevSpiderTime ) {
// pub date is somewhere between these two times
minPubDate = m_sreq.m_parentPrevSpiderTime;
maxPubDate = m_sreq.m_addedTime;
//maxPubDate = m_sreq.m_addedTime;
maxPubDate = m_sreq.m_discoveryTime;
}
// now set part2 , returns false and sets g_errno on error
@@ -20150,6 +20151,16 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
tmp,(uint32_t)m_sreq.m_addedTime);
}
// discovery date, first time spiderrequest was added to spiderdb
if ( m_sreqValid && m_sreq.m_discoveryTime ) {
time_t ts = m_sreq.m_discoveryTime;
struct tm *timeStruct = gmtime ( &ts );
char tmp[64];
strftime ( tmp , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
sb->safePrintf("discoverydate=%s(%"UINT32") ",
tmp,(uint32_t)m_sreq.m_discoveryTime);
}
// print first indexed time
if ( m_firstIndexedDateValid ) {
time_t ts = m_firstIndexedDate;
@@ -27456,13 +27467,15 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
cr->m_spiderRoundNum);
// for -diffbotxyz fake docs addedtime is 0
if ( m_sreqValid && m_sreq.m_addedTime != 0 ) {
if ( m_sreqValid && m_sreq.m_discoveryTime != 0 ) {
// in Spider.cpp we try to set m_sreq's m_addedTime to the
// min of all the spider requests, and we try to ensure
// that in the case of deduping we preserve the one with
// the oldest time.
// the oldest time. no, now we actually use
// m_discoveryTime since we were using m_addedTime in
// the url filters as it was originally intended.
jd.safePrintf("\"gbssDiscoveredTime\":%"INT32",\n",
m_sreq.m_addedTime);
m_sreq.m_discoveryTime);
}
if ( m_isDupValid && m_isDup )