integrate diffbot from svn back into git.

This commit is contained in:
Matt Wells 2013-09-13 09:23:18 -07:00
parent 9696c7936a
commit 5dc7bd2ab4
36 changed files with 1909 additions and 96 deletions

@ -59,6 +59,10 @@ CollectionRec::CollectionRec() {
// *(m_regExs[i]) = '\0';
//}
m_numRegExs = 0;
// for diffbot caching the global spider stats
reset();
// add default reg ex if we do not have one
fixRec();
}
@ -74,12 +78,22 @@ void CollectionRec::setToDefaults ( ) {
fixRec ();
}
// clear all of the diffbot crawl-stat state kept on this collection
void CollectionRec::reset() {
	// request/reply accounting used while gathering global stats
	// from the other hosts in the network
	m_requests = 0;
	m_replies  = 0;
	// counters for this host only
	m_localCrawlInfo.reset();
	// counters summed over every host in the network
	m_globalCrawlInfo.reset();
	// when the global counters were last aggregated (0 = never)
	m_globalCrawlInfoUpdateTime = 0;
}
// . load this data from a conf file
// . values we do not explicitly have will be taken from "default",
// collection config file. if it does not have them then we use
// the value we received from call to setToDefaults()
// . returns false and sets g_errno on load error
bool CollectionRec::load ( char *coll , long i ) {
// also reset some counts not included in parms list
reset();
// before we load, set to defaults in case some are not in xml file
g_parms.setToDefault ( (char *)this );
// get the filename with that id
@ -111,6 +125,47 @@ bool CollectionRec::load ( char *coll , long i ) {
// add default reg ex
fixRec ();
//
// LOAD the crawlinfo class in the collectionrec for diffbot
//
if ( g_conf.m_useDiffbot ) {
// LOAD LOCAL
sprintf ( tmp1 , "%scoll.%s.%li/localcrawlinfo.txt",
g_hostdb.m_dir , m_coll , (long)m_collnum );
log("coll: loading %s",tmp1);
SafeBuf sb;
// fillfromfile returns 0 if does not exist, -1 on read error
if ( sb.fillFromFile ( tmp1 ) > 0 )
sscanf ( sb.getBufStart() ,
"indexAttempts:%lli\n"
"processAttempts:%lli\n"
"downloadAttempts:%lli\n"
, &m_localCrawlInfo.m_pageIndexAttempts
, &m_localCrawlInfo.m_pageProcessAttempts
, &m_localCrawlInfo.m_pageDownloadAttempts
);
// LOAD GLOBAL
sprintf ( tmp1 , "%scoll.%s.%li/globalcrawlinfo.txt",
g_hostdb.m_dir , m_coll , (long)m_collnum );
log("coll: loading %s",tmp1);
sb.reset();
if ( sb.fillFromFile ( tmp1 ) > 0 )
sscanf ( sb.getBufStart() ,
"indexAttempts:%lli\n"
"processAttempts:%lli\n"
"downloadAttempts:%lli\n"
"lastupdate:%lu\n"
, &m_globalCrawlInfo.m_pageIndexAttempts
, &m_globalCrawlInfo.m_pageProcessAttempts
, &m_globalCrawlInfo.m_pageDownloadAttempts
, &m_globalCrawlInfoUpdateTime
);
// ignore errors i guess
g_errno = 0;
}
// always turn on distributed spider locking because otherwise
// we end up calling Msg50 which calls Msg25 for the same root url
// at the same time, thereby wasting massive resources. it is also
@ -242,6 +297,7 @@ void CollectionRec::fixRec ( ) {
//strcpy(m_regExs [n],"default");
m_regExs[n].set("default");
m_regExs[n].nullTerm();
m_numRegExs++;
m_spiderFreqs [n] = 30; // 30 days default
@ -281,6 +337,50 @@ bool CollectionRec::save ( ) {
if ( ! g_parms.saveToXml ( (char *)this , tmp ) ) return false;
// log msg
log (LOG_INFO,"db: Saved %s.",tmp);//f.getFilename());
//
// save the crawlinfo class in the collectionrec for diffbot
//
if ( g_conf.m_useDiffbot ) {
// SAVE LOCAL
sprintf ( tmp , "%scoll.%s.%li/localcrawlinfo.txt",
g_hostdb.m_dir , m_coll , (long)m_collnum );
log("coll: saving %s",tmp);
SafeBuf sb;
sb.safePrintf("indexAttempts:%lli\n"
"processAttempts:%lli\n"
"downloadAttempts:%lli\n"
, m_localCrawlInfo.m_pageIndexAttempts
, m_localCrawlInfo.m_pageProcessAttempts
, m_localCrawlInfo.m_pageDownloadAttempts
);
if ( sb.dumpToFile ( tmp ) == -1 ) {
log("coll: failed to save file %s : %s",
tmp,mstrerror(g_errno));
g_errno = 0;
}
// SAVE GLOBAL
sprintf ( tmp , "%scoll.%s.%li/globalcrawlinfo.txt",
g_hostdb.m_dir , m_coll , (long)m_collnum );
log("coll: saving %s",tmp);
sb.reset();
sb.safePrintf("indexAttempts:%lli\n"
"processAttempts:%lli\n"
"downloadAttempts:%lli\n"
"lastupdate:%lu\n"
, m_globalCrawlInfo.m_pageIndexAttempts
, m_globalCrawlInfo.m_pageProcessAttempts
, m_globalCrawlInfo.m_pageDownloadAttempts
, m_globalCrawlInfoUpdateTime
);
if ( sb.dumpToFile ( tmp ) == -1 ) {
log("coll: failed to save file %s : %s",
tmp,mstrerror(g_errno));
g_errno = 0;
}
}
// do not need a save now
m_needsSave = false;
return true;

@ -69,6 +69,15 @@
#include "RdbList.h"
#include "Rdb.h" // for RdbBase
// used by diffbot to control spidering per collection
// . holds the per-collection crawl counters that CollectionRec
//   saves/loads to localcrawlinfo.txt / globalcrawlinfo.txt
class CrawlInfo {
 public:
	// pages we attempted to index into the collection
	long long m_pageIndexAttempts;
	// pages we attempted to send through a diffbot api for processing
	long long m_pageProcessAttempts;
	// pages we attempted to download
	long long m_pageDownloadAttempts;
	// . zero every counter
	// . assign members explicitly rather than memset(this,...) so the
	//   class stays correct even if a non-trivial member (SafeBuf etc.)
	//   is ever added later
	void reset() {
		m_pageIndexAttempts    = 0;
		m_pageProcessAttempts  = 0;
		m_pageDownloadAttempts = 0;
	};
};
class CollectionRec {
@ -136,6 +145,7 @@ class CollectionRec {
bool m_needsSave;
bool load ( char *coll , long collNum ) ;
void reset();
void fixRec ( );
@ -355,6 +365,40 @@ class CollectionRec {
// priority of urls being retried, usually higher than normal
char m_retryPriority;
// new diffbot parms
SafeBuf m_diffbotToken;
SafeBuf m_diffbotSeed;
SafeBuf m_diffbotApi;
SafeBuf m_diffbotApiQueryString;
SafeBuf m_diffbotUrlCrawlPattern;
SafeBuf m_diffbotUrlProcessPattern;
SafeBuf m_diffbotPageProcessPattern;
SafeBuf m_diffbotClassify;
// format of output. "csv" or "xml" or "json" or null
SafeBuf m_diffbotFormat;
// what fields to return in the json output: (api dependent)
SafeBuf m_diffbotFields;
long long m_diffbotMaxToCrawl;
long long m_diffbotMaxToProcess;
long long m_diffbotCrawlStartTime;
long long m_diffbotCrawlEndTime;
// for testing their regexes etc...
char m_isDiffbotTestCrawl;
// our local crawling stats
CrawlInfo m_localCrawlInfo;
// total crawling stats summed up from all hosts in network
CrawlInfo m_globalCrawlInfo;
// last time we computed global crawl info
time_t m_globalCrawlInfoUpdateTime;
// for counting replies
long m_replies;
long m_requests;
// for storing callbacks waiting in line for freshest crawl info
SafeBuf m_callbackQueue;
// . now the url regular expressions
// . we chain down the regular expressions
// . if a url matches we use that tagdb rec #

@ -396,28 +396,30 @@ bool Collectiondb::addRec ( char *coll , char *cpc , long cpclen , bool isNew ,
// if we are doing a dump from the command line, skip this stuff
if ( isDump ) return true;
if(isNew) verify = false;
// tell rdbs to add one, too
//if ( ! g_indexdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_posdb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_datedb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_titledb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_revdb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_sectiondb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_tagdb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_catdb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_checksumdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_spiderdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_doledb.addColl ( coll, verify ) ) goto hadError;
//if ( ! g_tfndb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_clusterdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_linkdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_spiderdb.addColl ( coll, verify ) ) goto hadError;
if ( ! g_doledb.addColl ( coll, verify ) ) goto hadError;
// if first time adding a collrec, initialize the collectionless
// rdbs so they call Rdb::addColl() which makes a new RdbBase for them
// and stores ptr to that base in CollectionRec::m_bases[]
if ( m_numRecsUsed <= 1 ) {
if ( m_numRecsUsed == 1 ) {
g_statsdb.addColl ( NULL );
g_cachedb.addColl ( NULL );
g_serpdb.addColl ( NULL );
@ -505,12 +507,12 @@ bool Collectiondb::deleteRec ( char *coll , bool deleteTurkdb ) {
deleteTurkdb = true;
// no spiders can be out. they may be referencing the CollectionRec
// in XmlDoc.cpp... quite likely.
if ( g_conf.m_spideringEnabled ||
g_spiderLoop.m_numSpidersOut > 0 ) {
log("admin: Can not delete collection while "
"spiders are enabled or active.");
return false;
}
//if ( g_conf.m_spideringEnabled ||
// g_spiderLoop.m_numSpidersOut > 0 ) {
// log("admin: Can not delete collection while "
// "spiders are enabled or active.");
// return false;
//}
// do not allow this if in repair mode
if ( g_repairMode > 0 ) {
log("admin: Can not delete collection while in repair mode.");
@ -531,6 +533,16 @@ bool Collectiondb::deleteRec ( char *coll , bool deleteTurkdb ) {
}
CollectionRec *cr = m_recs [ collnum ];
if ( ! cr ) return log("admin: Collection id problem. Delete failed.");
// spiders off
if ( cr->m_spiderColl &&
cr->m_spiderColl->getTotalOutstandingSpiders() > 0 ) {
log("admin: Can not delete collection while "
"spiders are oustanding for collection. Turn off "
"spiders and wait for them to exit.");
return false;
}
// note it
log("coll: deleting coll %s",cr->m_coll);
// we need a save
m_needsSave = true;
// nuke it on disk

@ -156,8 +156,12 @@ bool Conf::init ( char *dir ) { // , long hostId ) {
if ( g_conf.m_isLive ) g_conf.m_doConsistencyTesting = false;
// and this on
g_conf.m_indexDeletes = true;
// leave it turned off for diffbot since it always needs to be crawling
#ifndef DIFFBOT
// these off
g_conf.m_spideringEnabled = false;
#endif
// this off
g_conf.m_repairingEnabled = false;
// make this 1 day for now (in seconds)
@ -203,7 +207,15 @@ bool Conf::init ( char *dir ) { // , long hostId ) {
// and always keep a decent site quality cache of at least 3M
if ( g_conf.m_siteQualityMaxCacheMem < 3000000 )
g_conf.m_siteQualityMaxCacheMem = 3000000;
m_useDiffbot = false;
#ifdef DIFFBOT
// make sure all collections index into a single unified collection
m_useDiffbot = true;
#endif
// HACK: set this now
setRootIps();

4
Conf.h

@ -183,6 +183,10 @@ class Conf {
long m_clusterdbMinFilesToMerge;
bool m_clusterdbSaveCache;
// if this is true, all collections index into the "main" collection
// but keep their own spiderdb in their collection.
bool m_useDiffbot;
//bool m_indexEventsOnly;
// linkdb for storing linking relations

@ -156,6 +156,9 @@ case EDOCIDCOLLISION : return "DocId collision in titledb";
case ESSLERROR : return "SSL error of some kind";
case EPERMDENIED : return "Permission denied";
case ENOFUNDS : return "Not enough funds in account";
case EDIFFBOTINTERNALERROR: return "Diffbot internal error";
case EDIFFBOTMIMEERROR: return "Diffbot mime error";
case EDIFFBOTBADHTTPSTATUS: return "Diffbot reply bad http status";
}
// if the remote error bit is clear it must be a regular errno
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );

@ -159,6 +159,9 @@ enum {
EDOCIDCOLLISION ,
ESSLERROR ,
EPERMDENIED ,
ENOFUNDS
ENOFUNDS ,
EDIFFBOTINTERNALERROR,
EDIFFBOTMIMEERROR,
EDIFFBOTBADHTTPSTATUS
};
#endif

@ -1434,6 +1434,22 @@ unsigned long Hostdb::makeGroupMask ( long numGroups ) {
return makeGroupId ( numGroups - 1 , numGroups );
}
// return first alive host in a group/shard
// . scans the hosts in the group identified by "groupId" and returns
//   the first one not marked dead
// . falls back to the group's first host if every host is dead
// . removed the unused "live" local from the original
Host *Hostdb::getLiveHostInGroup ( long groupId ) {
	Host *group = getGroup ( groupId );
	for ( long i = 0 ; i < m_numHostsPerGroup ; i++ ) {
		// get it
		Host *h = &group[i];
		// skip if dead
		if ( isDead(h->m_hostId) ) continue;
		// return it if alive
		return h;
	}
	// return first one if all dead
	return &group[0];
}
// . get the Hosts in group with "groupId"
Host *Hostdb::getGroup ( unsigned long groupId , long *numHosts ) {
// set hosts per group

@ -354,6 +354,8 @@ class Hostdb {
long long getNumGlobalEvents ( );
Host *getLiveHostInGroup ( long groupId );
// . returns false if blocks and will call your callback later
// . returns true if doesn't block
// . sets errno on error

@ -746,14 +746,21 @@ void HttpMime::makeMime ( long totalContentLen ,
//sprintf ( m_buf ,
p += sprintf( p,
"HTTP/1.0 %li%s\r\n"
// make it at least 4 spaces so we can change
// the length of the content should we insert
// a login bar in Proxy::storeLoginBar()
"Content-Length: %04li\r\n"
, httpStatus , smsg );
// if content length is not known, as in diffbot.cpp, then
// do not print it into the mime
if ( totalContentLen >= 0 )
p += sprintf ( p ,
// make it at least 4 spaces so we can
// change the length of the content
// should we insert a login bar in
// Proxy::storeLoginBar()
"Content-Length: %04li\r\n"
, totalContentLen );
p += sprintf ( p ,
"%s"
"Content-Type: %s",
httpStatus , smsg ,
totalContentLen , enc , contentType );
enc , contentType );
if ( charset ) p += sprintf ( p , "; charset=%s", charset );
p += sprintf ( p , "\r\n");
p += sprintf ( p ,

@ -614,6 +614,11 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
// procog's ip
// if ( sock && strncmp(iptoa(sock->m_ip),"216.168.36.21",13) == 0)
// m_isLocal = true;
#ifdef DIFFBOT
// diffbot comcast
if ( sock && strncmp(iptoa(sock->m_ip),"50.168.3.61",11) == 0)
m_isLocal = true;
#endif
// roadrunner ip
// if ( sock && strncmp(iptoa(sock->m_ip),"66.162.42.131",13) == 0)
@ -1022,9 +1027,9 @@ long HttpRequest::getLong ( char *field , long defaultLong ) {
if ( i >= len || !is_digit(value[i]) ) return defaultLong;
}
return res;
}
}
long long HttpRequest::getLongLong ( char *field ,
long long HttpRequest::getLongLong ( char *field ,
long long defaultLongLong ) {
long len;
char *value = getValue ( field, &len, NULL );
@ -1043,7 +1048,7 @@ long HttpRequest::getLong ( char *field , long defaultLong ) {
if ( i >= len || !is_digit(value[i]) ) return defaultLongLong;
}
return res;
}
}
float HttpRequest::getFloat ( char *field , double defaultFloat ) {
long len;
@ -1091,6 +1096,22 @@ double HttpRequest::getDouble ( char *field , double defaultDouble ) {
return res;
}
bool HttpRequest::hasField ( char *field ) {
// how long is it?
long fieldLen = gbstrlen ( field );
// scan the field table directly
long i = 0;
for ( ; i < m_numFields ; i++ ) {
if ( fieldLen != m_fieldLens[i] ) continue;
if ( strncmp ( field, m_fields[i], fieldLen ) != 0 ) continue;
// got a match return the true
return true;
}
return false;
}
char *HttpRequest::getValue ( char *field , long *len, long *next ) {
// how long is it?
long fieldLen = gbstrlen ( field );
@ -1146,8 +1167,21 @@ void HttpRequest::parseFields ( char *s , long slen ) {
m_fields [ n ] = s;
// point to = sign
char *equal = strchr ( s , '=' );
// try next field if none here
if ( ! equal ) { s += gbstrlen ( s ) + 1; continue; }
// if no equal sign, maybe it is one of diffbot's valueless
// fields, so support that now
if ( ! equal ) {
// just set value to NULL
char *end = strchr(s,'&');
long len = end - s;
if ( ! end ) len = gbstrlen(s);
m_fieldLens[n] = len;
s[len] = '\0';
m_fieldValues[n] = NULL;
n++;
// skip over the '&' too
s += len + 1;
continue;
}
// set field len
m_fieldLens [ n ] = equal - s;
// set = to \0 so getField() returns NULL terminated field name

@ -118,6 +118,9 @@ class HttpRequest {
char *defaultString = NULL ,
long *next=NULL);
bool hasField ( char *field );
// are we a redir? if so return non-NULL
char *getRedir ( ) { return m_redir; };
long getRedirLen ( ) { return m_redirLen; };

@ -9,6 +9,7 @@
#include "XmlDoc.h" // gbzip
#include "UdpServer.h"
#include "Proxy.h"
#include "Diffbot.h"
// a global class extern'd in .h file
HttpServer g_httpServer;
@ -128,6 +129,11 @@ bool HttpServer::getDoc ( char *url ,
char *proto ,
bool doPost ,
char *cookie ) {
// sanity
if ( ip == -1 )
log("http: you probably didn't mean to set ip=-1 did you? "
"try setting to 0.");
//log(LOG_WARN, "http: get doc %s", url->getUrl());
// use the HttpRequest class
HttpRequest r;
@ -886,6 +892,22 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
return sendErrorReply(s,404,"bad request");
// . if we get a request for this then allow Diffbot.cpp to
// handle it and send back the right stuff
if ( strcmp ( path , "/dev/crawl" ) == 0 ||
strcmp ( path , "/dev/crawl/" ) == 0 )
// this will call g_httpServer.sendDynamicPage() to send
// back the reply when it is done generating the reply.
// this function is in Diffbot.cpp.
return printCrawlBotPage ( s , r );
// . is it a diffbot api request, like "GET /api/*"
// . ie "/api/startcrawl" or "/api/stopcrawl" etc.?
if ( strncmp ( path , "/api/" , 5 ) == 0 )
// this will call g_httpServer.sendDynamicPage() to send
// back the reply when it is done generating the reply.
// this function is in Diffbot.cpp.
return handleDiffbotRequest ( s , r );
// for adding to browser list of search engines

@ -60,7 +60,7 @@ OBJS = Tfndb.o UdpSlot.o \
Users.o Images.o Wiki.o Wiktionary.o Scraper.o \
Dates.o Sections.o SiteGetter.o Syncdb.o \
Placedb.o Address.o Test.o GeoIP.o GeoIPCity.o Synonyms.o \
Cachedb.o Monitordb.o dlstubs.o
Cachedb.o Monitordb.o dlstubs.o Diffbot.o
CHECKFORMATSTRING = -D_CHECK_FORMAT_STRING_
@ -71,6 +71,7 @@ HOST=$(shell hostname)
#print_vars:
# $(HOST)
# force 32-bit mode using -m32 (apt-get install gcc-multilib to ensure works)
# and -m32 should use /usr/lib32/ as the library path.
# for old kernel 2.4 we don't use pthreads, just clone. so if compiling
@ -88,6 +89,13 @@ LIBS= -L. ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libstdc++.a
endif
# special diffbot compiling case to default g_conf.m_useDiffbot to true
ifeq ("neo","$(HOST)")
CPPFLAGS = -m32 -g -Wall -pipe -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static -D_PTHREADS_ -Wno-unused-but-set-variable -DDIFFBOT
LIBS= -L. ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libstdc++.a -lpthread
endif
# let's keep the libraries in the repo for easier bug reporting and debugging
# in general if we can. the includes are still in /usr/include/ however...
# which is kinda strange but seems to work so far.
@ -285,8 +293,8 @@ RdbBuckets.o:
Linkdb.o:
$(CC) $(DEFS) $(CPPFLAGS) -O3 -c $*.cpp
XmlDoc.o:
$(CC) $(DEFS) $(CPPFLAGS) -O3 -c $*.cpp
#XmlDoc.o:
# $(CC) $(DEFS) $(CPPFLAGS) -O3 -c $*.cpp
seo.o:
$(CC) $(DEFS) $(CPPFLAGS) -O3 -c $*.cpp

@ -1459,8 +1459,8 @@ void Mem::gbfree ( void *ptr , int size , const char *note ) {
if ( slot < 0 ) {
log(LOG_LOGIC,"mem: could not find slot (note=%s)",note);
// return for now so procog does not core all the time!
return;
//char *xx = NULL; *xx = 0;
//return;
char *xx = NULL; *xx = 0;
}
#ifdef _EFENCE_

@ -198,8 +198,12 @@ bool Msg0::getList ( long long hostId , // host to ask (-1 if none)
// . groupMask must turn on higher bits first (count downwards kinda)
// . titledb and spiderdb use special masks to get groupId
// if diffbot.cpp is reading spiderdb from each shard we have to
// get groupid from hostid here lest we core in getGroupId() below
if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
m_groupId = 0;
// did they force it? core until i figure out what this is
if ( forceParitySplit >= 0 )
else if ( forceParitySplit >= 0 )
m_groupId = g_hostdb.getGroupId ( forceParitySplit );
else
m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit );

@ -285,6 +285,10 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
// show gigabits?
long gb = hr->getLong("gigabits",0);
if ( gb >= 1 ) sb.safePrintf("&gigabits=%li",gb);
// propagate collection
long clen;
char *coll = hr->getString("c",&clen,"",NULL);
if ( coll ) sb.safePrintf("&c=%s",coll);
// provide hash of the query so clients can't just pass in
// a bogus id to get search results from us
unsigned long h32 = hash32n(qstr);
@ -390,8 +394,10 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
);
// contents of search box
sb.htmlEncode ( qstr , qlen , false );
sb.safePrintf ("\">"
"<input type=submit value=\"Search\" border=0>"
sb.safePrintf ("\">");
// propagate collection on subsequent searches
sb.safePrintf("<input name=c type=hidden value=\"%s\">",coll);
sb.safePrintf("<input type=submit value=\"Search\" border=0>"
"<br>"
"<br>"
"Try your search (not secure) on: &nbsp;&nbsp; "
@ -1186,7 +1192,7 @@ bool gotResults ( void *state ) {
// print the word
char *t = qw->m_word;
long tlen = qw->m_wordLen;
sb.utf8Encode ( t , tlen );
sb.utf8Encode2 ( t , tlen );
sb.safePrintf (" ");
}
// print tail if we had ignored terms
@ -1246,7 +1252,7 @@ bool gotResults ( void *state ) {
qe2 );
// close it up
sb.safePrintf ("\"><i><b>");
sb.utf8Encode(st->m_spell, len);
sb.utf8Encode2(st->m_spell, len);
// then finish it off
sb.safePrintf ("</b></i></a></font>\n<br><br>\n");
}
@ -1830,13 +1836,13 @@ static int printResult ( SafeBuf &sb,
backTag,
0,
0 ); // niceness
//if (!sb.utf8Encode(tt, hlen)) return false;
//if (!sb.utf8Encode2(tt, hlen)) return false;
if ( ! sb.brify ( tt,hlen,0,cols) ) return false;
}
else if ( str && strLen ) {
// determine if TiTle wraps, if it does add a <br> count for
// each wrap
//if (!sb.utf8Encode(str , strLen )) return false;
//if (!sb.utf8Encode2(str , strLen )) return false;
if ( ! sb.brify ( str,strLen,0,cols) ) return false;
}
// . use "UNTITLED" if no title

@ -1624,6 +1624,81 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
*/
}
sb->safePrintf("</center><br/>" );
if ( top ) return status;
//
// if diffbot give the crawlbot api here mostly for testing
//
char *hyphen = NULL;
if ( g_conf.m_useDiffbot )
hyphen = strchr ( coll , '-');
if ( g_conf.m_useDiffbot ) {
sb->safePrintf("<br>"
"<center>"
"Diffbot API: &nbsp; " );
// /api/startcrawl
sb->safePrintf(" <a href=/dev/crawl>startcrawl</a>");
}
if ( hyphen ) {
// /api/stopcrawl
sb->safePrintf("&nbsp; <a href=/api/stopcrawl?token=");
sb->safeMemcpy ( coll, hyphen - coll );
sb->safePrintf("&id=%s>stopcrawl</a>"
,hyphen+1);
// /api/resumecrawl
sb->safePrintf("&nbsp; <a href=/api/resumecrawl?token=");
sb->safeMemcpy ( coll, hyphen - coll );
sb->safePrintf("&id=%s>resumecrawl</a>"
,hyphen+1);
// crawls
sb->safePrintf(" &nbsp; <a href=/api/crawls?token=");
sb->safeMemcpy ( coll, hyphen - coll );
sb->safePrintf(" title=\"show all crawl collections\">"
"crawls</a>");
// activecrawls
sb->safePrintf(" &nbsp; <a href=/api/activecrawls?id=%s ",
hyphen+1);
sb->safePrintf(" title=\"show stats on one crawl\">"
"activecrawls</a>");
// downloadurls
sb->safePrintf(" &nbsp; <a href=/api/downloadurls?id=%s ",
hyphen+1);
sb->safePrintf(" title=\"download urls in a crawl's "
"spiderdb\">downloadurls</a>");
// download crawl urls
sb->safePrintf(" &nbsp; <a href=/api/downloadcrawl?id=%s ",
hyphen+1);
sb->safePrintf(" title=\"download urls from crawl\">"
"downloadcrawl (urls)</a>");
// download json objects
sb->safePrintf(" &nbsp; <a href=/api/downloadcrawl?"
"id=%s&format=json ",
hyphen+1);
sb->safePrintf(" title=\"download urls from crawl\">"
"downloadcrawl (json)</a>");
}
if ( g_conf.m_useDiffbot ) {
sb->safePrintf("</center>\n");
sb->safePrintf("<br>");
}
//sprintf(p,"</font>\n" );
//p += gbstrlen(p);
return status;

184
Parms.cpp

@ -1204,7 +1204,8 @@ bool Parms::printParms ( SafeBuf* sb , long page , char *username,//long user,
status &=printParm ( sb, username,&m_parms[i],i,
j, jend, (char *)THIS,
coll,NULL,
bg,nc,pd);
bg,nc,pd,
false);
continue;
}
// if not first in a row, skip it, we printed it already
@ -1222,7 +1223,7 @@ bool Parms::printParms ( SafeBuf* sb , long page , char *username,//long user,
k++ )
status &=printParm(sb,username,&m_parms[k],k,
newj,jend,(char *)THIS,coll,NULL,bg,
nc,pd);
nc,pd, j==size-1);
}
// end array table
//if ( m->m_max > 1 ) {
@ -1656,7 +1657,8 @@ bool Parms::printParm ( SafeBuf* sb,
char *pwd ,
char *bg ,
long nc ,
long pd ) {
long pd ,
bool lastRow ) {
bool status = true;
// do not print if no permissions
if ( m->m_perms != 0 && !g_users.hasPermission(username,m->m_perms) )
@ -1864,8 +1866,14 @@ bool Parms::printParm ( SafeBuf* sb,
else if ( t == TYPE_CHECKBOX ) {
char *ddd = "";
if ( *s ) ddd = " checked";
sb->safePrintf("<center>"
"<input type=checkbox ");
// this is part of the "HACK" fix below. you have to
// specify the cgi parm in the POST request, and unchecked
// checkboxes are not included in the POST request.
if ( lastRow && m->m_page == PAGE_FILTERS )
sb->safePrintf("<center><input type=hidden ");
else
sb->safePrintf("<center>"
"<input type=checkbox ");
if ( m->m_page == PAGE_FILTERS)
sb->safePrintf("id=id_%s ",cgi);
@ -1933,6 +1941,22 @@ bool Parms::printParm ( SafeBuf* sb,
sb->dequote ( s , gbstrlen(s) );
sb->safePrintf ("\">");
}
else if ( t == TYPE_SAFEBUF ) {
long size = m->m_size;
// give regular expression box on url filters page more room
if ( m->m_page == PAGE_FILTERS ) {
if ( size > REGEX_TXT_MAX ) size = REGEX_TXT_MAX;
}
else {
if ( size > 20 ) size = 20;
}
sb->safePrintf ("<input type=text name=%s size=%li value=\"",
cgi,size);
//sb->dequote ( s , gbstrlen(s) );
SafeBuf *sx = (SafeBuf *)s;
sb->dequote ( sx->getBufStart() , sx->length() );
sb->safePrintf ("\">");
}
else if ( t == TYPE_STRINGBOX ) {
sb->safePrintf("<textarea rows=10 cols=64 name=%s>",cgi);
//p += urlEncode ( p , pend - p , s , gbstrlen(s) );
@ -2505,12 +2529,21 @@ void Parms::setParm ( char *THIS , Parm *m , long mm , long j , char *s ,
}
// if we are setting a guy in an array AND he is NOT the first
// in his row, ensure the guy before has a count of j+1 or more
// in his row, ensure the guy before has a count of j+1 or more.
//
// crap, on the url filters page if you do not check "spidering
// enabled" checkbox when adding a new rule at the bottom of the
// table, , then the spidering enabled parameter does not transmit so
// the "respider frequency" ends up checking the "spider enabled"
// array whose "count" was not incremented like it should have been.
// HACK: make new line at bottom always have spidering enabled
// checkbox set and make it impossible to unset.
if ( m->m_max > 1 && m->m_rowid >= 0 && mm > 0 &&
m_parms[mm-1].m_rowid == m->m_rowid ) {
char *pos = (char *)THIS + m_parms[mm-1].m_off - 4 ;
long maxcount = *(long *)pos;
if ( j >= maxcount ) {
log("admin: parm before \"m\" is limiting us");
//log("admin: try nuking the url filters or whatever "
// "and re-adding");
return;
@ -2609,15 +2642,19 @@ void Parms::setParm ( char *THIS , Parm *m , long mm , long j , char *s ,
! isHtmlEncoded && oldLen == len &&
memcmp ( sb->getBufStart() , s , len ) == 0 )
return;
// nuke it
sb->purge();
// this means that we can not use string POINTERS as parms!!
if ( ! isHtmlEncoded ) sb->safeMemcpy ( s , len );
else len = sb->htmlDecode (s,len,false,0);
// ensure null terminated
sb->nullTerm();
// null term it all
//dst[len] = '\0';
sb->reserve ( 1 );
//sb->reserve ( 1 );
// null terminate but do not include as m_length so the
// memcmp() above still works right
sb->m_buf[sb->m_length] = '\0';
//sb->m_buf[sb->m_length] = '\0';
// . might have to set length
// . used for CollectionRec::m_htmlHeadLen and m_htmlTailLen
//if ( m->m_plen >= 0 )
@ -2891,6 +2928,7 @@ bool Parms::setFromFile ( void *THIS ,
// now, extricate from the <![CDATA[ ... ]]> tag if we need to
if ( m->m_type == TYPE_STRING ||
m->m_type == TYPE_STRINGBOX ||
m->m_type == TYPE_SAFEBUF ||
m->m_type == TYPE_STRINGNONEMPTY ) {
char *oldv = v;
long oldvlen = vlen;
@ -3210,6 +3248,10 @@ skip2:
}
*/
// debug point
//if ( m->m_type == TYPE_SAFEBUF )
// log("hey");
// loop over all in this potential array
for ( j = 0 ; j < count ; j++ ) {
// the xml
@ -3219,6 +3261,7 @@ skip2:
// print CDATA if string
if ( m->m_type == TYPE_STRING ||
m->m_type == TYPE_STRINGBOX ||
m->m_type == TYPE_SAFEBUF ||
m->m_type == TYPE_STRINGNONEMPTY ) {
sprintf ( p , "<![CDATA[" );
p += gbstrlen ( p );
@ -3233,6 +3276,7 @@ skip2:
// print CDATA if string
if ( m->m_type == TYPE_STRING ||
m->m_type == TYPE_STRINGBOX ||
m->m_type == TYPE_SAFEBUF ||
m->m_type == TYPE_STRINGNONEMPTY ) {
sprintf ( p , "]]>" );
p += gbstrlen ( p );
@ -3343,6 +3387,14 @@ char *Parms::getParmHtmlEncoded ( char *p , char *pend , Parm *m , char *s ) {
sprintf (p,"%li",*(long *)s);
else if ( t == TYPE_LONG_LONG )
sprintf (p,"%lli",*(long long *)s);
else if ( t == TYPE_SAFEBUF ) {
SafeBuf *sb = (SafeBuf *)s;
p = htmlEncode ( p ,
pend ,
sb->getBufStart(),
sb->getBufStart() + sb->length(),
true ); // #?*
}
else if ( t == TYPE_STRING ||
t == TYPE_STRINGBOX ||
t == TYPE_STRINGNONEMPTY ||
@ -3434,6 +3486,7 @@ bool Parms::serialize( char *buf, long *bufSize ) {
if ( m->m_type == TYPE_STRING ) size = m->m_size;
if ( m->m_type == TYPE_STRINGBOX ) size = m->m_size;
if ( m->m_type == TYPE_STRINGNONEMPTY ) size = m->m_size;
if ( m->m_type == TYPE_SAFEBUF ) size = m->m_size;
if ( m->m_type == TYPE_SITERULE ) size = 4;
// . set size to the total size of array
@ -3573,6 +3626,7 @@ bool Parms::serializeConfParm( Parm *m, long i, char **p, char *end,
return false;
}
// TODO: add TYPE_SAFEBUF support
bool Parms::serializeCollParm( CollectionRec *cr,
Parm *m, long i, char **p, char *end,
long size, long cnt,
@ -5102,7 +5156,7 @@ void Parms::init ( ) {
m->m_cgi = "seatonep";
m->m_off = (char *)&g_conf.m_sendParmChangeAlertsToEmail1 - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_def = "0";
m->m_priv = 2;
m->m_group = 0;
m++;
@ -5156,7 +5210,7 @@ void Parms::init ( ) {
m->m_cgi = "seattwop";
m->m_off = (char *)&g_conf.m_sendParmChangeAlertsToEmail2 - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_def = "0";
m->m_priv = 2;
m->m_group = 0;
m++;
@ -5210,7 +5264,7 @@ void Parms::init ( ) {
m->m_cgi = "seatthreep";
m->m_off = (char *)&g_conf.m_sendParmChangeAlertsToEmail3 - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_def = "0";
m->m_priv = 2;
m->m_group = 0;
m++;
@ -5265,7 +5319,7 @@ void Parms::init ( ) {
m->m_cgi = "seatfourp";
m->m_off = (char *)&g_conf.m_sendParmChangeAlertsToEmail4 - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_def = "0";
m->m_priv = 2;
m->m_group = 0;
m++;
@ -7659,6 +7713,105 @@ void Parms::init ( ) {
m++;
*/
/////////////////////
//
// DIFFBOT CRAWLBOT PARMS
//
//////////////////////
m->m_cgi = "dbseed";
m->m_xml = "diffbotSeed";
m->m_off = (char *)&cr.m_diffbotSeed - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_obj = OBJ_COLL;
m++;
m->m_cgi = "dbtoken";
m->m_xml = "diffbotToken";
m->m_off = (char *)&cr.m_diffbotToken - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbapi";
m->m_xml = "diffbotApi";
m->m_off = (char *)&cr.m_diffbotApi - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbapiqs";
m->m_xml = "diffbotApiQueryString";
m->m_off = (char *)&cr.m_diffbotApiQueryString - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbucp";
m->m_xml = "diffbotUrlCrawlPattern";
m->m_off = (char *)&cr.m_diffbotUrlCrawlPattern - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbupp";
m->m_xml = "diffbotUrlProcessPattern";
m->m_off = (char *)&cr.m_diffbotUrlProcessPattern - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbppp";
m->m_xml = "diffbotPageProcessPattern";
m->m_off = (char *)&cr.m_diffbotPageProcessPattern - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbclassify";
m->m_xml = "diffbotClassify";
m->m_off = (char *)&cr.m_diffbotClassify - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbmaxtocrawl";
m->m_xml = "diffbotMaxToCrawl";
m->m_off = (char *)&cr.m_diffbotMaxToCrawl - x;
m->m_type = TYPE_LONG_LONG;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbmaxtoprocess";
m->m_xml = "diffbotMaxToProcess";
m->m_off = (char *)&cr.m_diffbotMaxToProcess - x;
m->m_type = TYPE_LONG_LONG;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbcrawlstarttime";
m->m_xml = "diffbotCrawlStartTime";
m->m_off = (char *)&cr.m_diffbotCrawlStartTime - x;
m->m_type = TYPE_LONG_LONG;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "dbcrawlendtime";
m->m_xml = "diffbotCrawlEndTime";
m->m_off = (char *)&cr.m_diffbotCrawlEndTime - x;
m->m_type = TYPE_LONG_LONG;
m->m_page = PAGE_NONE;
m++;
m->m_cgi = "isdbtestcrawl";
m->m_xml = "isDiffbotTestCrawl";
m->m_off = (char *)&cr.m_isDiffbotTestCrawl - x;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_NONE;
m++;
///////////////////////////////////////////
// SPIDER CONTROLS
///////////////////////////////////////////
@ -7678,7 +7831,7 @@ void Parms::init ( ) {
m->m_cgi = "cse";
m->m_off = (char *)&cr.m_spideringEnabled - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_def = "1";
m++;
/*
@ -12040,7 +12193,8 @@ void Parms::init ( ) {
m->m_off = (char *)cr.m_regExs - x;
// this is a safebuf, dynamically allocated string really
m->m_type = TYPE_SAFEBUF;//STRINGNONEMPTY
m->m_size = MAX_REGEX_LEN+1;
// the size of each element in the array:
m->m_size = sizeof(SafeBuf);//MAX_REGEX_LEN+1;
m->m_page = PAGE_FILTERS;
m->m_rowid = 1; // if we START a new row
m->m_def = "";
@ -15096,6 +15250,8 @@ void Parms::overlapTest ( char step ) {
m_parms[i].m_desc);
}
log("conf: try including \"m->m_obj = OBJ_COLL;\" or "
"\"m->m_obj = OBJ_CONF;\" in your parm definitions");
log("conf: failed overlap test. exiting.");
exit(-1);

24
Parms.h

@ -31,7 +31,7 @@ enum {
TYPE_FLOAT ,
TYPE_IP ,
TYPE_LONG ,
TYPE_LONG_LONG ,
TYPE_LONG_LONG , // 10
TYPE_NONE ,
TYPE_PRIORITY ,
TYPE_PRIORITY2 ,
@ -41,7 +41,7 @@ enum {
TYPE_STRINGBOX ,
TYPE_STRINGNONEMPTY ,
TYPE_TIME ,
TYPE_DATE2 ,
TYPE_DATE2 , // 20
TYPE_DATE ,
TYPE_RULESET ,
TYPE_FILTER ,
@ -50,7 +50,7 @@ enum {
TYPE_MONOD2 ,
TYPE_MONOM2 ,
TYPE_LONG_CONST ,
TYPE_SITERULE ,
TYPE_SITERULE , // 29
TYPE_SAFEBUF
};
@ -147,7 +147,7 @@ class Parm {
char * getValueAsString ( class SearchInput *si ) ;
};
#define MAX_PARMS 840
#define MAX_PARMS 940
#define MAX_XML_CONF (200*1024)
@ -171,15 +171,16 @@ class Parms {
long page , char *coll , char *pwd ) ;
char *printParms (char *p, char *pend, TcpSocket *s , HttpRequest *r );
//char *printParms (char *p, char *pend, TcpSocket *s, HttpRequest *r);
bool printParms (SafeBuf* sb, TcpSocket *s , HttpRequest *r );
char *printParms (char *p,char *pend,long page,char *username,
void *THIS, char *coll , char *pwd ,
long nc , long pd ) ;
//char *printParms (char *p,char *pend,long page,char *username,
// void *THIS, char *coll , char *pwd ,
// long nc , long pd ) ;
bool printParms (SafeBuf* sb, long page,char *username,void *THIS,
char *coll , char *pwd , long nc , long pd ) ;
char *coll , char *pwd , long nc , long pd );
/*
char *printParm ( char *p ,
char *pend ,
//long user ,
@ -194,6 +195,8 @@ class Parms {
char *bg ,
long nc ,
long pd ) ;
*/
bool printParm ( SafeBuf* sb,
//long user ,
char *username,
@ -206,7 +209,8 @@ class Parms {
char *pwd ,
char *bg ,
long nc ,
long pd ) ;
long pd ,
bool lastRow ) ;
char *getTHIS ( HttpRequest *r , long page ) ;

@ -1396,7 +1396,7 @@ void Process::disableTreeWrites ( ) {
}
// disable all spider trees and tables
for ( long i = 0 ; i < g_collectiondb.getNumRecs() ; i++ ) {
SpiderColl *sc = g_spiderCache.getSpiderColl(i);
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(i);
if ( ! sc ) continue;
sc->m_waitingTree .disableWrites();
sc->m_waitingTable.disableWrites();
@ -1413,7 +1413,7 @@ void Process::enableTreeWrites ( ) {
}
// enable all waiting trees
for ( long i = 0 ; i < g_collectiondb.getNumRecs() ; i++ ) {
SpiderColl *sc = g_spiderCache.getSpiderColl(i);
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(i);
if ( ! sc ) continue;
sc->m_waitingTree .enableWrites();
sc->m_waitingTable.enableWrites();

13
Rdb.cpp

@ -229,7 +229,9 @@ bool Rdb::init ( char *dir ,
m_dbname ,
m_ks ,
// make useProtection true for debugging
false ) ) // use protection?
false , // use protection?
false , // alowdups?
m_rdbId ) )
return false;
}
else {
@ -244,7 +246,9 @@ bool Rdb::init ( char *dir ,
m_dbname ,
m_ks ,
// make useProtection true for debugging
false ); // use protection?
false , // use protection?
false , // alowdups?
m_rdbId );
}
// set this then
sprintf(m_treeName,"buckets-%s",m_dbname);
@ -846,7 +850,8 @@ bool Rdb::loadTree ( ) {
//log (0,"Rdb::loadTree: loading %s",filename);
// set a BigFile to this filename
BigFile file;
file.set ( getDir() , filename , NULL ); // getStripeDir() );
char *dir = getDir();
file.set ( dir , filename , NULL ); // getStripeDir() );
bool treeExists = file.doesExist() > 0;
bool status = false ;
if ( treeExists ) {
@ -2163,7 +2168,7 @@ bool Rdb::addRecord ( collnum_t collnum,
}
else if ( (tn=m_tree.addNode ( collnum, key , data , dataSize ))>=0) {
// if adding to spiderdb, add to cache, too
if ( m_rdbId != RDB_SPIDERDB || m_rdbId != RDB_DOLEDB )
if ( m_rdbId != RDB_SPIDERDB && m_rdbId != RDB_DOLEDB )
return true;
// or if negative key
if ( KEYNEG(key) ) return true;

@ -92,7 +92,8 @@ bool RdbTree::set ( long fixedDataSize ,
char *dbname ,
char keySize ,
bool useProtection ,
bool allowDups ) {
bool allowDups ,
char rdbId ) {
reset();
m_fixedDataSize = fixedDataSize;
m_doBalancing = doBalancing;
@ -120,9 +121,9 @@ bool RdbTree::set ( long fixedDataSize ,
if ( dbname ) strncpy ( p , dbname , 8 ); p += 8;
*p++ = '\0';
// set rdbid
m_rdbId = -1;
m_rdbId = rdbId; // -1;
// if its doledb, set it
if ( dbname && strcmp(dbname,"doledb") == 0 ) m_rdbId = RDB_DOLEDB;
//if ( dbname && strcmp(dbname,"doledb") == 0 ) m_rdbId = RDB_DOLEDB;
// adjust m_maxMem to virtual infinity if it was -1
if ( m_maxMem < 0 ) m_maxMem = 0x7fffffff;
// . compute each node's memory overhead
@ -2994,11 +2995,11 @@ void RdbTree::cleanTree ( ) { // char **bases ) {
long RdbTree::getNumNegativeKeys ( collnum_t collnum ) {
return g_collectiondb.m_recs[collnum]->
m_numNegKeysInTree[m_rdbId];
m_numNegKeysInTree[(unsigned char)m_rdbId];
}
long RdbTree::getNumPositiveKeys ( collnum_t collnum ) {
return g_collectiondb.m_recs[collnum]->
m_numPosKeysInTree[m_rdbId];
m_numPosKeysInTree[(unsigned char)m_rdbId];
}

@ -91,7 +91,8 @@ class RdbTree {
bool dataInPtrs = false ,
char *dbname = NULL , char keySize = 12 ,
bool useProtection = false ,
bool allowDups = false );
bool allowDups = false ,
char rdbId = -1 );
// . frees the used memory, etc.
// . override so derivatives can free up extra header arrays

@ -680,7 +680,7 @@ bool SafeBuf::setEncoding(short cs) {
return true;
}
bool SafeBuf::utf8Encode(char *s, long len, bool encodeHTML,long niceness) {
bool SafeBuf::utf8Encode2(char *s, long len, bool encodeHTML,long niceness) {
long tmp = m_length;
if ( m_encoding == csUTF8 ) {
if (! safeMemcpy(s,len)) return false;
@ -1786,6 +1786,7 @@ bool SafeBuf::htmlEncodeXmlTags ( char *s , long slen , long niceness ) {
}
bool SafeBuf::safeStrcpy ( char *s ) {
if ( ! s ) return true;
long slen = gbstrlen(s);
return safeMemcpy(s,slen);
}
@ -2491,6 +2492,182 @@ bool SafeBuf::decodeJSON ( long niceness ) {
return true;
}
// . decode JSON backslash escapes in this buffer: \n \r \t \b \f \\ \/
//   and \uXXXX (BMP code points, encoded to utf8)
// . replaces our buffer with the decoded copy
// . returns false (g_errno set by reserve) on allocation failure
bool SafeBuf::decodeJSONToUtf8 ( long niceness ) {
	// count how many \u's we got so we can reserve enough room.
	// a "\uXXXX" is 6 source bytes and at most 3 bytes of utf8, but
	// the quote re-escape below can emit 2 bytes for 1, so pad by 2
	// per escape to be safe.
	long need = 0;
	char *p = m_buf;
	for ( ; *p ; p++ )
		if ( *p == '\\' && p[1] == 'u' ) need += 2;
	// decode into a fresh buffer, then steal its memory
	SafeBuf dbuf;
	// was unchecked before; a failed reserve would make us write
	// through a NULL m_buf below
	if ( ! dbuf.reserve ( need + m_length + 1 ) ) return false;
	char *src = m_buf;
	char *dst = dbuf.m_buf;
	for ( ; *src ; ) {
		QUICKPOLL(niceness);
		if ( *src == '\\' ) {
			// \n? (from json.org homepage)
			if ( src[1] == 'n' ) {
				*dst++ = '\n';
				src += 2;
				continue;
			}
			if ( src[1] == 'r' ) {
				*dst++ = '\r';
				src += 2;
				continue;
			}
			if ( src[1] == 't' ) {
				*dst++ = '\t';
				src += 2;
				continue;
			}
			if ( src[1] == 'b' ) {
				*dst++ = '\b';
				src += 2;
				continue;
			}
			if ( src[1] == 'f' ) {
				*dst++ = '\f';
				src += 2;
				continue;
			}
			// a "\\" is an encoded backslash
			if ( src[1] == '\\' ) {
				*dst++ = '\\';
				src += 2;
				continue;
			}
			// a "\/" is an encoded forward slash
			if ( src[1] == '/' ) {
				*dst++ = '/';
				src += 2;
				continue;
			}
			// utf8? if not, just skip the slash
			if ( src[1] != 'u' ) { src++; continue; }
			// "\u" must be followed by exactly 4 hex digits.
			// if not, skip the backslash like any other unknown
			// escape. (the old code did a bare "continue" here
			// without advancing src -- an infinite loop)
			char *hex = src + 2;
			if ( ! is_hex(hex[0]) ||
			     ! is_hex(hex[1]) ||
			     ! is_hex(hex[2]) ||
			     ! is_hex(hex[3]) ) { src++; continue; }
			// TODO: support surrogate pairs in utf16?
			UChar32 uc = 0;
			// store the 16-bit number in lower 16 bits of uc...
			hexToBin ( hex   , 2 , ((char *)&uc)+1 );
			hexToBin ( hex+2 , 2 , ((char *)&uc)+0 );
			long size = ::utf8Encode ( (UChar32)uc , (char *)dst );
			// a quote??? not allowed in json! keep it escaped
			if ( size == 1 && dst[0] == '\"' ) {
				size = 2;
				dst[0] = '\\';
				dst[1] = '\"';
			}
			dst += size;
			// skip over the "\u" and the 4 hex digits
			src += 6;
			continue;
		}
		*dst++ = *src++;
	}
	*dst = '\0';
	dbuf.m_length = dst - dbuf.m_buf;
	// purge ourselves
	purge();
	// and steal dbuf's m_buf
	m_buf       = dbuf.m_buf;
	m_length    = dbuf.m_length;
	m_capacity  = dbuf.m_capacity;
	m_usingStack = dbuf.m_usingStack;
	// detach from dbuf so he does not free it
	dbuf.detachBuf();
	return true;
}
// . REALLY just a print vanity function. makes json output prettier
//
// . after converting JSON to utf8 above we sometimes want to go back.
// . just print that out. encode \n's and \r's back to \\n \\r
// and backslash to a \\ ... etc.
// . but if they originally had a \u<backslash> encoding and we decoded
// it to a backslash, here it will be re-encoded as (double backslash)
// . like wise if that originally had a \u<quote> encoding we should
// have decoded it as a \"!
// . this does not need to be super fast because it will be used for
// showing cached pages or dumping out the json objects from a crawl for
// diffbot
// . really we could leave the newlines decoded etc, but it is prettier
// for printing
// . REALLY just a print vanity function. makes json output prettier.
// . re-encodes the control characters and backslashes that were decoded
//   from json back into their two-byte escaped forms for display
// . appends to whatever content is already in this buffer
// . returns false on allocation failure, true otherwise
bool SafeBuf::safeStrcpyPrettyJSON ( char *decodedJson ) {
	// worst case every input byte doubles, plus a null terminator
	long need = gbstrlen(decodedJson) * 2 + 1;
	if ( ! reserve ( need ) ) return false;
	// concatenate after any bytes already stored
	char *out = m_buf + m_length;
	for ( char *in = decodedJson ; *in ; in++ ) {
		char esc;
		switch ( *in ) {
		case '\t': esc = 't' ; break;
		case '\n': esc = 'n' ; break;
		case '\r': esc = 'r' ; break;
		case '\f': esc = 'f' ; break;
		case '\\': esc = '\\'; break;
		default:
			// plain byte, copy through untouched
			*out++ = *in;
			continue;
		}
		// emit the two-byte escape sequence
		*out++ = '\\';
		*out++ = esc;
	}
	// null term
	*out = '\0';
	m_length = out - m_buf;
	return true;
}
bool SafeBuf::linkify ( long niceness , long startPos ) {

@ -57,6 +57,7 @@ struct SafeBuf {
bool truncateLongWords ( char *src, long srcLen , long minmax );
bool safeTruncateEllipsis ( char *src , long maxLen );
bool convertJSONtoXML ( long niceness , long startConvertPos );
bool decodeJSONToUtf8 ( long niceness );
bool decodeJSON ( long niceness );
bool linkify ( long niceness , long startPos );
@ -70,6 +71,13 @@ struct SafeBuf {
return safeStrcpy ( str );
};
// . chop "lastChar" off the end of the buffer, if that is what it
//   ends with, and re-null-terminate
// . no-op when the buffer is empty or ends in a different char
void removeLastChar ( char lastChar ) {
if ( m_length <= 0 ) return;
if ( m_buf[m_length-1] != lastChar ) return;
m_length--;
m_buf[m_length] = '\0';
};
//MUTATORS
#ifdef _CHECK_FORMAT_STRING_
bool safePrintf(char *formatString, ...)
@ -83,6 +91,7 @@ struct SafeBuf {
bool safeMemcpy(SafeBuf *c){return safeMemcpy(c->m_buf,c->m_length);};
bool safeMemcpy ( class Words *w , long a , long b ) ;
bool safeStrcpy ( char *s ) ;
bool safeStrcpyPrettyJSON ( char *decodedJson ) ;
//bool pushLong ( long val ) { return safeMemcpy((char *)&val,4); }
bool cat(SafeBuf& c);
// . only cat the sections/tag that start with "tagFilter"
@ -96,7 +105,11 @@ struct SafeBuf {
bool reserve(long i, char *label=NULL);
bool reserve2x(long i);
bool inlineStyleTags();
void incrementLength(long i) { m_length += i; }
// . grow (or shrink, with negative "i") the logical length after the
//   caller wrote bytes into the buffer directly
// . does NOT grow capacity; caller must have reserved the space
void incrementLength(long i) {
m_length += i;
// watch out for negative i's
if ( m_length < 0 ) m_length = 0;
};
void setLength(long i) { m_length = i; };
char *getNextLine ( char *p ) ;
long catFile(char *filename) ;
@ -172,9 +185,9 @@ struct SafeBuf {
//insert strings in their native encoding
bool encode ( char *s , long len , long niceness=0) {
return utf8Encode(s,len,false,niceness); };
return utf8Encode2(s,len,false,niceness); };
// htmlEncode default = false
bool utf8Encode(char *s, long len, bool htmlEncode=false,
bool utf8Encode2(char *s, long len, bool htmlEncode=false,
long niceness=0);
bool latin1Encode(char *s, long len, bool htmlEncode=false,
long niceness=0);
@ -230,6 +243,16 @@ struct SafeBuf {
bool cdataEncode ( char *s ) ;
// . append a \0 but do not inc m_length
// . for null terminating strings
// . append a \0 but do not inc m_length, so the buffer can be used
//   as a C string without counting the terminator
// . returns false on allocation failure
// . NOTE(review): reserve(m_capacity + 1) looks like it requests one
//   byte more than the CURRENT capacity rather than one byte past
//   m_length -- confirm reserve()'s argument semantics
bool nullTerm ( ) {
if(m_length >= m_capacity && !reserve(m_capacity + 1) )
return false;
m_buf[m_length] = '\0';
return true;
};
bool safeCdataMemcpy(char *s, long len);
bool pushChar (char i) {
if(m_length >= m_capacity)

@ -196,6 +196,9 @@ class SearchInput *g_si = NULL;
bool SearchInput::set ( TcpSocket *sock , HttpRequest *r , Query *q ) {
// save it now
m_socket = sock;
// get coll rec
long collLen;
char *coll = r->getString ( "c" , &collLen );

@ -401,6 +401,8 @@ class SearchInput {
// make a cookie from parms with m_flags of PF_COOKIE set
SafeBuf m_cookieBuf;
TcpSocket *m_socket;
//char m_urlParms [ MAX_URLPARMS_LEN ];
//char m_postParms [ MAX_URLPARMS_LEN ];

@ -747,7 +747,7 @@ void SpiderCache::save ( bool useThread ) {
//m_isSaving = true;
// loop over all SpiderColls and get the best
for ( long i = 0 ; i < g_collectiondb.getNumRecs() ; i++ ) {
SpiderColl *sc = getSpiderColl(i);//m_spiderColls[i];
SpiderColl *sc = getSpiderCollIffNonNull(i);//m_spiderColls[i];
if ( ! sc ) continue;
RdbTree *tree = &sc->m_waitingTree;
char *filename = "waitingtree";
@ -797,7 +797,7 @@ void SpiderCache::save ( bool useThread ) {
bool SpiderCache::needsSave ( ) {
for ( long i = 0 ; i < g_collectiondb.getNumRecs() ; i++ ) {
SpiderColl *sc = getSpiderColl(i);//m_spiderColls[i];
SpiderColl *sc = getSpiderCollIffNonNull(i);//m_spiderColls[i];
if ( ! sc ) continue;
if ( sc->m_waitingTree.m_needsSave ) return true;
// also the doleIpTable
@ -809,7 +809,7 @@ bool SpiderCache::needsSave ( ) {
void SpiderCache::reset ( ) {
// loop over all SpiderColls and get the best
for ( long i = 0 ; i < g_collectiondb.getNumRecs() ; i++ ) {
SpiderColl *sc = getSpiderColl(i);
SpiderColl *sc = getSpiderCollIffNonNull(i);
if ( ! sc ) continue;
sc->reset();
mdelete ( sc , sizeof(SpiderColl) , "SpiderCache" );
@ -821,6 +821,13 @@ void SpiderCache::reset ( ) {
//m_numSpiderColls = 0;
}
// . like getSpiderColl() but NEVER allocates a new SpiderColl
// . returns NULL if no SpiderColl has been created for this collection
// . callers (save/needsSave/reset, Process tree-write toggles) iterate
//   every rec slot and skip NULL returns, so also return NULL when the
//   CollectionRec slot itself is NULL (e.g. a deleted collection)
//   instead of crashing on the dereference
SpiderColl *SpiderCache::getSpiderCollIffNonNull ( collnum_t collnum ) {
	// shortcut
	CollectionRec *cr = g_collectiondb.m_recs[collnum];
	// empty rec slot? then certainly no spider coll either
	if ( ! cr ) return NULL;
	// return it if non-NULL
	return cr->m_spiderColl;
}
// get SpiderColl for a collection
SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
// return it if non-NULL
@ -867,6 +874,8 @@ SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
sc->m_cr = cr;
// sanity check
if ( ! cr ) { char *xx=NULL;*xx=0; }
// note it!
log("spider: adding new spider collection for %s",cr->m_coll);
// that was it
return sc;
}
@ -894,6 +903,13 @@ SpiderColl::SpiderColl () {
memset ( m_outstandingSpiders , 0 , 4 * MAX_SPIDER_PRIORITIES );
}
// sum the outstanding spider counts across every priority level
long SpiderColl::getTotalOutstandingSpiders ( ) {
	long total = 0;
	for ( long pri = MAX_SPIDER_PRIORITIES - 1 ; pri >= 0 ; pri-- )
		total += m_outstandingSpiders[pri];
	return total;
}
// load the tables that we set when m_doInitialScan is true
bool SpiderColl::load ( ) {
// error?
@ -937,6 +953,8 @@ bool SpiderColl::load ( ) {
// . try going to 20M now since we hit it again...
if (!m_waitingTree.set(0,-1,true,20000000,true,"waittree2",
false,"waitingtree",sizeof(key_t)))return false;
// prevent core with this
m_waitingTree.m_rdbId = RDB_NONE;
// make dir
char dir[500];
@ -2326,7 +2344,8 @@ bool SpiderColl::scanSpiderdb ( bool needList ) {
if ( sreq->m_url[0] != 'h' &&
// might be a docid from a pagereindex.cpp
! is_digit(sreq->m_url[0]) ) {
log("spider: got corrupt 1 spiderRequest in scan");
log("spider: got corrupt 1 spiderRequest in scan "
"because url is %s",sreq->m_url);
continue;
}
@ -7814,3 +7833,144 @@ void dedupSpiderdbList ( RdbList *list , long niceness , bool removeNegRecs ) {
//mfree ( oldbuf , oldSize, "oldspbuf");
}
///////
//
// diffbot uses these for limiting crawls in a collection
//
///////
void gotCrawlInfoReply ( void *state , UdpSlot *slot);
// an entry in CollectionRec::m_callbackQueue: who to notify once the
// collection's global crawl stats have been (re)aggregated
class CallbackEntry2 {
public:
// opaque state handed back to m_callback
void *m_state;
// invoked by gotCrawlInfoReply() when all host replies are in
void (* m_callback ) ( void *state );
};
// . get total # of pages crawled in this collection over whole network
// . returns false if blocked
// . returns true and sets g_errno on error
bool updateCrawlInfo ( CollectionRec *cr ,
void *state ,
void (* callback)(void *state) ,
bool useCache ) {
// serve the cached global stats if aggregated within the last minute
long now = getTimeLocal();
if ( useCache && now - cr->m_globalCrawlInfoUpdateTime < 60 )
return true;
// wait in line if reply is pending
//if ( cr->m_replies < cr->m_requests || ) {
// . returns false and sets g_errno on error
// . this will store state/callback into a safebuf queue
CallbackEntry2 ce2;
ce2.m_state = state;
ce2.m_callback = callback;
// queue our callback; on alloc failure return true (did not block)
if ( ! cr->m_callbackQueue.safeMemcpy ( &ce2, sizeof(CallbackEntry2)) )
return true;
// if we were not the first, we do not initiate it, we just wait
// for all the replies to come back
if ( cr->m_replies < cr->m_requests ) return false;
// we initiate: zero the accumulators before fanning out
cr->m_globalCrawlInfo.reset();
cr->m_replies = 0;
cr->m_requests = 0;
// request is just the collnum
char *request = (char *)&cr->m_collnum;
long requestSize = sizeof(collnum_t);
// send out the msg request
for ( long i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
Host *h = g_hostdb.getHost(i);
// skip if dead
if ( g_hostdb.isDead(i) ) continue;
// count it as launched
cr->m_requests++;
// msgtype 0xc1 is answered by handleRequestc1() on each host;
// each reply triggers gotCrawlInfoReply() with cr as state
if ( ! g_udpServer.sendRequest ( request,
requestSize,
0xc1 , // msgtype
h->m_ip ,
h->m_port ,
h->m_hostId ,
NULL, // retslot
cr , // state
gotCrawlInfoReply ) ) {
// send failed: count the reply now so we do not wait forever
log("spider: error sending c1 request: %s",
mstrerror(g_errno));
cr->m_replies++;
}
}
// return false if we blocked awaiting replies
if ( cr->m_replies < cr->m_requests ) return false;
// somehow we did not block (all hosts dead or all sends failed):
// finish up synchronously with a NULL slot
gotCrawlInfoReply( cr , NULL );
// we did not block...
return true;
}
// . called once per host with its 0xc1 crawl-stats reply
// . "state" is the CollectionRec we are aggregating global stats into
// . "slot" may be NULL: updateCrawlInfo() calls us directly with NULL
//   when no request actually went out, so everything touching the slot
//   must live inside the if(slot) guard. the old code set
//   slot->m_sendBufAlloc before that check and crashed on NULL.
void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
	// cast it
	CollectionRec *cr = (CollectionRec *)state;
	// count this reply as received
	cr->m_replies++;
	// add it in to the stats
	if ( slot ) {
		// the sendbuf should never be freed! it points into collrec
		slot->m_sendBufAlloc = NULL;
		// the reply is the remote host's local CrawlInfo verbatim
		CrawlInfo *stats = (CrawlInfo *)(slot->m_readBuf);
		cr->m_globalCrawlInfo.m_pageIndexAttempts +=
			stats->m_pageIndexAttempts;
		cr->m_globalCrawlInfo.m_pageProcessAttempts +=
			stats->m_pageProcessAttempts;
		cr->m_globalCrawlInfo.m_pageDownloadAttempts +=
			stats->m_pageDownloadAttempts;
	}
	// return if still waiting on more to come in
	if ( cr->m_replies < cr->m_requests ) return;
	// update cache time so updateCrawlInfo() can serve from cache
	cr->m_globalCrawlInfoUpdateTime = getTime();
	// make it save to disk i guess
	cr->m_needsSave = true;
	// call everyone queued up in updateCrawlInfo()
	long nc = cr->m_callbackQueue.length() / sizeof(CallbackEntry2);
	char *p = cr->m_callbackQueue.getBufStart();
	for ( long i = 0 ; i < nc ; i++ ) {
		CallbackEntry2 *ce2 = (CallbackEntry2 *)p;
		p += sizeof(CallbackEntry2);
		// clear g_errno just in case
		g_errno = 0;
		// call that callback waiting in the queue
		ce2->m_callback ( ce2->m_state );
	}
	// save the mem!
	cr->m_callbackQueue.purge();
}
// . handles msgtype 0xc1: another host asking for our LOCAL crawl
//   stats for one collection (sent by updateCrawlInfo())
// . request payload is just a collnum_t
// . reply payload is our CollectionRec::m_localCrawlInfo verbatim,
//   built in the slot's tmp buf so no allocation is needed
void handleRequestc1 ( UdpSlot *slot , long niceness ) {
char *request = slot->m_readBuf;
// just a single collnum
if ( slot->m_readBufSize != sizeof(collnum_t) ) { char *xx=NULL;*xx=0;}
collnum_t collnum = *(collnum_t *)request;
// NOTE(review): cr is not NULL-checked; a request for a collection
// deleted on this host would crash in the memcpy below -- confirm
CollectionRec *cr = g_collectiondb.getRec(collnum);
char *reply = slot->m_tmpBuf;
// sanity: the fixed-size tmp buf must fit a CrawlInfo
if ( TMPBUFSIZE < sizeof(CrawlInfo) ) { char *xx=NULL;*xx=0; }
memcpy ( reply , &cr->m_localCrawlInfo , sizeof(CrawlInfo) );
g_udpServer.sendReply_ass ( reply ,
sizeof(CrawlInfo) ,
reply , // alloc
sizeof(CrawlInfo) , //alloc size
slot );
}

@ -24,6 +24,13 @@
#include "Msg4.h"
#include "hash.h"
// for diffbot, this is for xmldoc.cpp to update CollectionRec::m_crawlInfo
// which has m_pagesCrawled and m_pagesProcessed.
bool updateCrawlInfo ( CollectionRec *cr ,
void *state ,
void (* callback)(void *state) ,
bool useCache = true ) ;
///////////////////////////////////////
//
// QUICK OVERVIEW
@ -828,6 +835,8 @@ class SpiderColl {
bool load();
long getTotalOutstandingSpiders ( ) ;
key128_t m_firstKey;
// spiderdb is now 128bit keys
key128_t m_nextKey;
@ -966,6 +975,8 @@ class SpiderCache {
// what SpiderColl does a SpiderRec with this key belong?
SpiderColl *getSpiderColl ( collnum_t collNum ) ;
SpiderColl *getSpiderCollIffNonNull ( collnum_t collNum ) ;
// called by main.cpp on exit to free memory
void reset();
@ -1024,7 +1035,7 @@ class Msg12 {
};
void handleRequest12 ( UdpSlot *udpSlot , long niceness ) ;
void handleRequestc1 ( UdpSlot *slot , long niceness ) ;
// . the spider loop
// . it gets urls to spider from the SpiderCache global class, g_spiderCache

@ -1425,6 +1425,9 @@ void writeSocketWrapper ( int sd , void *state ) {
}
// if socket has nothing to send yet cuz we're waiting, wait...
if ( s->m_sendBufUsed == 0 ) return;
sendAgain:
// . writeSocket returns false if blocked, true otherwise
// . it also sets g_errno on errro
// . don't call it if we have g_errno set, however
@ -1435,8 +1438,16 @@ void writeSocketWrapper ( int sd , void *state ) {
if ( status == 1 && ! s->m_readBuf ) return;
// good?
g_errno = 0;
// otherwise, call callback on done reading or error
// otherwise, call callback on done writing or error
THIS->makeCallback ( s );
// if callback changed socket status to ST_SEND_AGAIN
// then let's send the new buffer that it has. Diffbot.cpp uses this.
if ( s->m_sockState == ST_SEND_AGAIN ) {
s->m_sockState = ST_WRITING;
goto sendAgain;
}
// . destroy the socket on error, recycle on transaction completion
// . this will also unregister all our callbacks for the socket
if ( status == -1 ) THIS->destroySocket ( s );

@ -26,6 +26,10 @@
#define ST_CLOSE_CALLED 7
#define ST_SSL_ACCEPT 8
#define ST_SSL_SHUTDOWN 9
// hack to repopulate the socket's send buf when its done sending
// it's current sendbuf in order to transmit large amounts of data that
// can't all fit in memory at the same time:
#define ST_SEND_AGAIN 10
#define TCP_READ_BUF_SIZE 1024

@ -103,6 +103,35 @@ bool Title::setTitle ( XmlDoc *xd ,
long long startTime = gettimeofdayInMilliseconds();
// . reset so matches.cpp using this does not core
// . assume no title tag
m_titleTagStart = -1;
m_titleTagEnd = -1;
// if we are a json object
if ( ! xd->m_contentTypeValid ) { char *xx=NULL;*xx=0; }
char *val = NULL;
long vlen;
// look for the "title:" field in json then use that
if ( xd->m_contentType == CT_JSON )
val = getJSONFieldValue ( xd->ptr_utf8Content,"title",&vlen);
// if we had a title: field in the json...
if ( val ) {
char *dst = NULL;
m_titleBytes = vlen;
if ( m_titleBytes+1 < TITLE_LOCAL_SIZE )
dst = m_localBuf;
else {
dst = (char *)mmalloc ( m_titleBytes+1,"titdst" );
if ( ! dst ) return false;
}
m_title = dst;
memcpy ( dst , val , m_titleBytes );
dst[m_titleBytes] = '\0';
return true;
}
bool status = setTitle4 ( xd ,
xml ,
words ,

File diff suppressed because it is too large Load Diff

@ -91,8 +91,16 @@ bool setLangVec ( class Words *words ,
class Sections *sections ,
long niceness ) ;
char *getJSONFieldValue ( char *json, char *field , long *valueLen ) ;
bool logQueryLogs ( );
bool checkRegex ( SafeBuf *regex ,
char *target ,
bool *boolVal ,
bool *boolValValid ,
long *compileError = NULL ) ;
// Address.cpp calls this to make a vector from the "place name" for comparing
// to other places in placedb using the computeSimilarity() function. if
// we got a >75% similarity we set the AF_VERIFIED_PLACE_NAME bit in the
@ -283,7 +291,13 @@ class XmlDoc {
char m_reserved3b;
uint16_t m_reserved4;//externalLinkTextWeight;
uint16_t m_reserved5;//internalLinkTextWeight;
uint16_t m_reserved6;//conceptWeight;
// a new parm from reserved6. need to know the count so we can
// delete the json objects derived from this page if we want to
// delete this page. or if this page is respidered then we get the
// json objects for it, REject the old json object urls, and inject
// the new ones i guess.
uint16_t m_diffbotJSONCount;
// these do not include header/footer (dup) addresses
//int16_t m_numAddresses;
@ -311,7 +325,24 @@ class XmlDoc {
uint16_t m_hasSiteVenue:1;
uint16_t m_hasContactInfo:1;
uint16_t m_isSiteRoot:1;
uint16_t m_reserved8;
uint16_t m_isDiffbotJSONObject:1;
uint16_t m_reserved802:1;
uint16_t m_reserved803:1;
uint16_t m_reserved804:1;
uint16_t m_reserved805:1;
uint16_t m_reserved806:1;
uint16_t m_reserved807:1;
uint16_t m_reserved808:1;
uint16_t m_reserved809:1;
uint16_t m_reserved810:1;
uint16_t m_reserved811:1;
uint16_t m_reserved812:1;
uint16_t m_reserved813:1;
uint16_t m_reserved814:1;
uint16_t m_reserved815:1;
uint16_t m_reserved816:1;
char *ptr_firstUrl;
char *ptr_redirUrl;
@ -1205,6 +1236,11 @@ class XmlDoc {
bool m_numOutlinksAddedValid;
bool m_baseUrlValid;
bool m_replyValid;
bool m_diffbotReplyValid;
bool m_diffbotUrlCrawlPatternMatchValid;
bool m_diffbotUrlProcessPatternMatchValid;
bool m_diffbotPageProcessPatternMatchValid;
bool m_crawlInfoValid;
bool m_isPageParserValid;
bool m_imageUrlValid;
bool m_matchOffsetsValid;
@ -1416,6 +1452,7 @@ class XmlDoc {
long m_siteHash32;
char *m_httpReply;
char m_downloadAttempted;
char m_incrementedAttemptsCount;
char m_redirectFlag;
//char m_isScraping;
//char m_throttleDownload;
@ -1447,6 +1484,25 @@ class XmlDoc {
//long *m_outlinkIpVector;
Msge1 m_msge1;
//
// diffbot parms for indexing diffbot's json output
//
XmlDoc *m_dx;
char *m_diffbotObj;
char *m_diffbotObjEnd;
char m_diffbotSavedChar;
SafeBuf m_diffbotReply;
long m_diffbotReplyError;
bool m_diffbotUrlCrawlPatternMatch;
bool m_diffbotUrlProcessPatternMatch;
bool m_diffbotPageProcessPatternMatch;
SafeBuf *getDiffbotReply ( ) ;
bool doesUrlMatchDiffbotCrawlPattern() ;
bool doesUrlMatchDiffbotProcessPattern() ;
bool doesPageContentMatchDiffbotProcessPattern() ;
char *hashJSON ( HashTableX *table );
//
// functions and vars for the seo query matching tool

@ -4828,6 +4828,7 @@ bool registerMsgHandlers2(){
//if ( ! MsgF ::registerHandler() ) return false;
//if(! g_udpServer.registerHandler(0x10,handleRequest10)) return false;
if ( ! g_udpServer.registerHandler(0xc1,handleRequestc1)) return false;
if ( ! g_udpServer.registerHandler(0x39,handleRequest39)) return false;
if ( ! g_udpServer.registerHandler(0x2c,handleRequest2c)) return false;
if ( ! g_udpServer.registerHandler(0x12,handleRequest12)) return false;