Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing

Conflicts:
	Errno.cpp
	Errno.h
Matt
2016-01-11 15:30:53 -08:00
91 changed files with 7643 additions and 827 deletions

@ -33,7 +33,7 @@ BigFile::~BigFile () {
//#define O_DIRECT 040000
BigFile::BigFile () {
m_permissions = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ;
//m_permissions = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ;
m_flags = O_RDWR ; // | O_DIRECT;
m_usePartFiles = true;
// NULLify all ptrs to files
@ -289,7 +289,7 @@ bool BigFile::open ( int flags ,
m_flags = flags;
//m_pc = pc;
m_permissions = permissions;
//m_permissions = permissions;
m_isClosing = false;
// this is true except when parsing big warc files
m_usePartFiles = true;//usePartFiles;
@ -363,7 +363,7 @@ int BigFile::getfd ( int32_t n , bool forReading ) { // , int64_t *vfd ) {
}
// open it if not opened
if ( ! f->calledOpen() ) {
if ( ! f->open ( m_flags , m_permissions ) ) {
if ( ! f->open ( m_flags , getFileCreationFlags() ) ) {
log("disk: Failed to open file part #%"INT32".",n);
return -1;
}
@ -1481,6 +1481,15 @@ bool BigFile::chopHead ( int32_t part ,
return unlinkRename ( NULL, part, true, callback, state );
}
class UnlinkRenameState {
public:
char m_oldFilename [ 1024 ];
char m_newFilename [ 1024 ];
int m_fd;
File *m_file;
collnum_t m_collnum;
// set by the thread when it closes the fd (used below)
bool m_closedIt;
};
static void *renameWrapper_r ( void *state , ThreadEntry *t ) ;
static void *unlinkWrapper_r ( void *state , ThreadEntry *t ) ;
static void doneRenameWrapper ( void *state , ThreadEntry *t ) ;
@ -1604,6 +1613,38 @@ bool BigFile::unlinkRename ( // non-NULL for renames, NULL for unlinks
// save callback for when all parts are unlinked or renamed
m_callback = callback;
m_state = state;
#ifdef FIXBUG
// now use a special state in case RdbBase gets nuked
// because the collection gets deleted in the middle of this
UnlinkRenameState stackUr;
char *st =(char *)mmalloc( sizeof(UnlinkRenameState),"ulrnst");
UnlinkRenameState *urs = (UnlinkRenameState *)st;
if ( ! urs ) {
log("disk: failed to alloc unlinkrename state. "
"skipping thread.");
urs = &stackUr;
}
urs->m_fd = m_fd;
urs->m_collnum = collnum; // can we supply this now?
urs->m_file = this;
urs->m_closedIt = false;
makeFilename_r ( m_baseFilename.getBufStart() ,
NULL ,
i ,
urs->m_oldFilename ,
1024 );
// rename also takes the new name
if ( ! m_isUnlink )
makeFilename_r ( m_newBaseFilename.getBufStart() ,
m_newBaseFilenameDir.getBufStart(),
i ,
urs->m_newFilename ,
1024 );
if ( urs == &stackUr )
goto skipThread;
#endif
// . we spawn the thread here now
// . returns true on successful spawning
// . we can't make a disk thread cuz Threads.cpp checks its
@ -1668,6 +1709,30 @@ bool BigFile::unlinkRename ( // non-NULL for renames, NULL for unlinks
}
void *renameWrapper_r ( void *state , ThreadEntry *t ) {
#ifdef FIXBUG
UnlinkRenameState *urs = (UnlinkRenameState *)state;
if ( ::rename ( urs->m_oldFilename , urs->m_newFilename ) ) {
// reset errno and return true if file does not exist
if ( errno == ENOENT ) {
log("disk: file %s does not exist.",oldFilename);
errno = 0;
}
// otherwise, it's a more serious error i guess
else log("disk: rename %s to %s: %s",
urs->m_oldFilename,urs->m_newFilename,mstrerror(errno));
return NULL;
}
// we must close the file descriptor in the thread otherwise the
// file will not actually be renamed in this thread
//f->close1_r();
// we can't call f->close1_r() because f might have been deleted
// because the collection was deleted.
if ( close1ByFd_r( urs->m_fd) )
urs->m_closedIt = true;
return NULL;
#endif
// extract our class
File *f = (File *)state;
// . by getting the inode in the cache space the call to f->close()
@ -1721,6 +1786,16 @@ void *renameWrapper_r ( void *state , ThreadEntry *t ) {
}
void *unlinkWrapper_r ( void *state , ThreadEntry *t ) {
#ifdef FIXBUG
UnlinkRenameState *urs = (UnlinkRenameState *)state;
::unlink ( urs->m_oldFilename );
// we can't call f->close1_r() because f might have been deleted
// because the collection was deleted.
if ( close1ByFd_r( urs->m_fd) )
urs->m_closedIt = true;
return NULL;
#endif
// get ourselves
File *f = (File *)state;
// . by getting the inode in the cache space the call to delete(f)
@ -1742,6 +1817,25 @@ void *unlinkWrapper_r ( void *state , ThreadEntry *t ) {
}
void doneRenameWrapper ( void *state , ThreadEntry *t ) {
#ifdef FIXBUG
// if collection got nuked, then file will be invalid
// so when we nuke a collection we scan all threads for unlink/rename
// operations that reference files from the collection being nuked and
// set their m_collectionGotNuked flag to true
UnlinkRenameState *urs = (UnlinkRenameState *)state;
File *f = urs->m_file;
collnum_t cn = urs->m_collnum;
RdbBase *base = getRdbBase ( cn );
mfree ( urs , sizeof(UnlinkRenameState), "ulrnst" );
if ( ! base ) { // urs->m_collectionGotNuked ) {
log("bigfile: captured rename on nuked collection %i",(int)cn);
g_unlinkRenameThreads--;
return;
}
#endif
#ifndef FIXBUG
// extract our class
File *f = (File *)state;
#endif
// . finish the close
@ -1795,6 +1889,24 @@ void doneRenameWrapper ( void *state , ThreadEntry *t ) {
}
void doneUnlinkWrapper ( void *state , ThreadEntry *t ) {
#ifdef FIXBUG
// if collection got nuked, then file will be invalid
// so when we nuke a collection we scan all threads for unlink/rename
// operations that reference files from the collection being nuked and
// set their m_collectionGotNuked flag to true
UnlinkRenameState *urs = (UnlinkRenameState *)state;
File *f = urs->m_file;
collnum_t cn = urs->m_collnum;
RdbBase *base = getRdbBase ( cn );
mfree ( urs , sizeof(UnlinkRenameState), "ulrnst" );
if ( ! base ) { // urs->m_collectionGotNuked ) {
log("bigfile: captured unlink on nuked collection %i",(int)cn);
g_unlinkRenameThreads--;
return;
}
#endif
#ifndef FIXBUG
// extract our class
File *f = (File *)state;
#endif
// finish the close

@ -353,7 +353,7 @@ class BigFile {
SafeBuf m_newBaseFilenameDir ;//[256];
int32_t m_permissions;
//int32_t m_permissions;
int32_t m_flags;
// determined in open() override

@ -333,6 +333,9 @@ bool Collectiondb::addExistingColl ( char *coll, collnum_t collnum ) {
if ( cr->m_isCustomCrawl ) {
cr->m_getLinkInfo = false;
cr->m_computeSiteNumInlinks = false;
// limit each shard to 5 spiders per collection to prevent
// people from spidering the whole web and hogging resources
cr->m_maxNumSpiders = 5;
}
// we need to compile the regular expressions or update the url
@ -633,10 +636,11 @@ bool Collectiondb::addNewColl ( char *coll ,
// MDW: create the new directory
retry22:
if ( ::mkdir ( dname ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) ) {
if ( ::mkdir ( dname ,
getDirCreationFlags() ) ) {
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) ) {
// valgrind?
if ( errno == EINTR ) goto retry22;
g_errno = errno;
@ -1401,10 +1405,11 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
log("admin: Trying to create collection %s but "
"directory %s already exists on disk.",cr->m_coll,dname);
}
if ( ::mkdir ( dname ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) ) {
if ( ::mkdir ( dname ,
getDirCreationFlags() ) ) {
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) ) {
// valgrind?
//if ( errno == EINTR ) goto retry22;
//g_errno = errno;
@ -1971,6 +1976,29 @@ bool CollectionRec::load ( char *coll , int32_t i ) {
// it is binary now
gbmemcpy ( &m_localCrawlInfo , sb.getBufStart(),sb.length() );
// if it had corrupted data from saving corrupted mem zero it out
CrawlInfo *stats = &m_localCrawlInfo;
// point to the stats for that host
int64_t *ss = (int64_t *)stats;
// are stats crazy?
bool crazy = false;
for ( int32_t j = 0 ; j < NUMCRAWLSTATS ; j++ ) {
// crazy stat?
if ( *ss > 1000000000LL ||
*ss < -1000000000LL ) {
crazy = true;
break;
}
ss++;
}
if ( m_localCrawlInfo.m_collnum != m_collnum )
crazy = true;
if ( crazy ) {
log("coll: had crazy spider stats for coll %s. zeroing out.",
m_coll);
m_localCrawlInfo.reset();
}
if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
log("coll: Loaded %s (%"INT32") local hasurlsready=%"INT32"",
@ -3787,12 +3815,30 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
i++;
}
// don't bother re-spidering old pages if hopcount == maxhopcount
// and "only process new urls" is true, because then we don't need
// to harvest outlinks from them.
if ( m_diffbotOnlyProcessIfNewUrl && m_diffbotMaxHops > 0 &&
// only crawls, not bulk jobs
m_isCustomCrawl == 1 ) {
m_regExs[i].purge();
m_regExs[i].safePrintf("isindexed && hopcount==%"INT32,
m_diffbotMaxHops );
m_spiderPriorities [i] = 14;
m_spiderFreqs [i] = 0.0;
m_maxSpidersPerRule [i] = 0; // turn off spiders
m_harvestLinks [i] = false;
i++;
}
// diffbot needs to retry even on 500 or 404 errors since sometimes
// a seed url gets a 500 error mistakenly and it halts the crawl.
// so take out "!hastmperror".
m_regExs[i].set("errorcount>=1 && !hastmperror");
m_spiderPriorities [i] = 15;
m_spiderFreqs [i] = 0.0;
m_maxSpidersPerRule [i] = 0; // turn off spiders if not tmp error
m_spiderPriorities [i] = 14;
m_spiderFreqs [i] = 0.0416; // every hour
//m_maxSpidersPerRule [i] = 0; // turn off spiders if not tmp error
i++;
// and for docs that have errors respider once every 5 hours

@ -494,6 +494,7 @@ class CollectionRec {
char m_useSimplifiedRedirects ;
char m_useIfModifiedSince ;
char m_useTimeAxis ;
char m_indexWarcs;
char m_buildVecFromCont ;
int32_t m_maxPercentSimilarPublishDate;
char m_useSimilarityPublishDate;

@ -9,6 +9,25 @@
Conf g_conf;
static bool s_setUmask = false;
mode_t getFileCreationFlags() {
if ( ! s_setUmask ) {
s_setUmask = true;
umask ( 0 );
}
return S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ;
}
mode_t getDirCreationFlags() {
if ( ! s_setUmask ) {
s_setUmask = true;
umask ( 0 );
}
return S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH |
S_IXUSR | S_IXGRP;
}
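A note on the two helpers above: because the umask is cleared to 0 on first use, the mode bits they return are applied to newly created files and directories exactly as given, i.e. rw-rw-r-- for files and rwxrwxr-- for directories. A minimal sketch of the calling pattern this commit adopts everywhere (the filename here is hypothetical):

    // hypothetical example of the open() pattern used throughout this commit
    int fd = ::open ( "example.dat" ,
                      O_RDWR | O_CREAT | O_TRUNC ,
                      getFileCreationFlags() ); // created as rw-rw-r--
    if ( fd < 0 )
        log("db: open example.dat: %s.",mstrerror(errno));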
Conf::Conf ( ) {
m_save = true;
m_doingCommandLine = false;

Conf.h

@ -43,6 +43,9 @@
#define MAX_GEOCODERS 4
mode_t getFileCreationFlags();
mode_t getDirCreationFlags ();
class Conf {
public:
@ -180,7 +183,9 @@ class Conf {
//bool m_tagdbUseSeals;
//int32_t m_tagdbMinFilesToMerge;
//bool m_tagdbSaveCache;
//bool m_makeAllFilesGroupWritable;
// catdb parameters
int32_t m_catdbMaxTreeMem;
//int32_t m_catdbMaxDiskPageCacheMem;

@ -2470,7 +2470,8 @@ Host *Dns::getResponsibleHost ( key_t key ) {
// get the hostNum that should handle this
int32_t hostId = key.n1 % hostdb->getNumHosts();
// return it if it is alive
if ( ! hostdb->isDead ( hostId ) ) return hostdb->getHost ( hostId );
Host* h = hostdb->getHost ( hostId );
if ( h->m_spiderEnabled && ! hostdb->isDead ( hostId ) ) return h;
// how many are up?
int32_t numAlive = hostdb->getNumHostsAlive();
// NULL if none
@ -2482,6 +2483,7 @@ Host *Dns::getResponsibleHost ( key_t key ) {
for ( int32_t i = 0 ; i < hostdb->m_numHosts ; i++ ) {
// get the ith host
Host *host = &hostdb->m_hosts[i];
if ( !host->m_spiderEnabled ) continue;
// skip him if he is dead
if ( hostdb->isDead ( host ) ) continue;
// count it if alive, continue if not our number

@ -196,6 +196,7 @@ case EDNSERROR : return "DNS lookup error";
case ETHREADSDISABLED:return "Threads Disabled";
case EMALFORMEDQUERY: return "Malformed query";
case ESHARDDOWN: return "One or more shards are down";
case EDOCWARC: return "Doc is WARC or ARC and support is disabled";
case EDIFFBOTREQUESTTIMEDOUTTHIRDPARTY: return "Diffbot request of third-party content timed out";
}
// if the remote error bit is clear it must be a regular errno

@ -201,6 +201,8 @@ enum {
ETHREADSDISABLED,
EMALFORMEDQUERY,
ESHARDDOWN,
EDOCWARC,
EWRONGSHARD,
EDIFFBOTREQUESTTIMEDOUTTHIRDPARTY
};
#endif

@ -238,7 +238,10 @@ bool File::open ( int flags , int permissions ) {
}
// save these in case we need to reopen in getfd()
m_flags = flags;
m_permissions = permissions;
//m_permissions = permissions;
// just override and use system settings so we can get the group
// writable/readable/executable bits if set that way in g_conf
//m_permissions = getFileCreationFlags();
m_calledOpen = true;
// sanity check
//int32_t ss = 0;
@ -668,7 +671,7 @@ int File::getfd () {
if ( fd == -1 ) {
t1 = gettimeofdayInMilliseconds();
retry7:
fd = ::open ( getFilename() , m_flags , m_permissions );
fd = ::open ( getFilename() , m_flags,getFileCreationFlags());
// valgrind
if ( fd == -1 && errno == EINTR ) goto retry7;
// 0 means stdout, right? why am i seeing it get assigned???
@ -676,7 +679,7 @@ int File::getfd () {
log("disk: Got fd of 0 when opening %s.",
getFilename());
if ( fd == 0 )
fd = ::open ( getFilename(), m_flags , m_permissions );
fd=::open(getFilename(),m_flags,getFileCreationFlags());
if ( fd == 0 )
log("disk: Got fd of 0 when opening2 %s.",
getFilename());

File.h

@ -193,7 +193,7 @@ class File {
// save the permission and flag sets in case of re-opening
int m_flags;
int m_permissions;
//int m_permissions;
char m_calledOpen;
char m_calledSet;

@ -623,8 +623,10 @@ bool HashTableX::save ( char *dir ,
char s[1024];
sprintf ( s , "%s/%s", dir , filename );
int fd = ::open ( s ,
O_RDWR | O_CREAT | O_TRUNC , S_IRUSR | S_IWUSR |
S_IRGRP | S_IWGRP | S_IROTH);
O_RDWR | O_CREAT | O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR | S_IWUSR |
// S_IRGRP | S_IWGRP | S_IROTH);
if ( fd < 0 ) {
//m_saveErrno = errno;
return log("db: Could not open %s for writing: %s.",

@ -691,16 +691,26 @@ bool Hostdb::init ( int32_t hostIdArg , char *netName ,
//skip:
h->m_queryEnabled = true;
h->m_spiderEnabled = true;
// check for something after the working dir
h->m_note[0] = '\0';
if ( *p != '\n' ) {
// save the note
char *n = p;
while ( *n && *n != '\n' && n < pend ) n++;
int32_t noteSize = n - p;
if ( noteSize > 127 ) noteSize = 127;
gbmemcpy(h->m_note, p, noteSize);
h->m_note[noteSize] = '\0'; // NULL terminate for the strstr calls below
*p++ = '\0'; // NULL terminate for atoip
if(strstr(h->m_note, "noquery")) {
h->m_queryEnabled = false;
}
if(strstr(h->m_note, "nospider")) {
h->m_spiderEnabled = false;
}
}
else
*p = '\0';
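For reference, the note is free-form text trailing the working directory on a host's hosts.conf line, matched only by the strstr() calls above, so a line ending like this (other columns elided; values hypothetical) leaves the host answering queries but excluded from spidering:

    5 ... 192.168.1.5 192.168.1.5 /home/gb/ nospider

A note containing both "noquery" and "nospider" disables the host for both roles.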
@ -1642,6 +1652,56 @@ Host *Hostdb::getLiveHostInShard ( int32_t shardNum ) {
return &shard[0];
}
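// scan a shard round-robin for a host that has spidering enabled;
// if every host in the shard is marked nospider, log and crash,
// since the caller has nowhere to route the spider request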
int32_t Hostdb::getHostIdWithSpideringEnabled ( uint32_t shardNum ) {
Host *hosts = g_hostdb.getShard ( shardNum);
int32_t numHosts = g_hostdb.getNumHostsPerShard();
int32_t hostNum = 0;
int32_t numTried = 0;
while( !hosts [ hostNum ].m_spiderEnabled && numTried < numHosts ) {
hostNum = (hostNum+1) % numHosts;
numTried++;
}
if( !hosts [ hostNum ].m_spiderEnabled) {
log("build: cannot spider when entire shard has nospider enabled");
char *xx = NULL; *xx = 0;
}
return hosts [ hostNum ].m_hostId ;
}
// if niceness 0 can't pick noquery host.
// if niceness 1 can't pick nospider host.
Host *Hostdb::getLeastLoadedInShard ( uint32_t shardNum , char niceness ) {
int32_t minOutstandingRequests = 0x7fffffff;
int32_t minOutstandingRequestsIndex = -1;
Host *shard = getShard ( shardNum );
Host *bestDead = NULL;
for(int32_t i = 0; i < m_numHostsPerShard; i++) {
Host *hh = &shard[i];
// don't pick a 'no spider' host if niceness is 1
if ( niceness > 0 && ! hh->m_spiderEnabled ) continue;
// don't pick a 'no query' host if niceness is 0
if ( niceness == 0 && ! hh->m_queryEnabled ) continue;
if ( ! bestDead ) bestDead = hh;
if(isDead(hh)) continue;
// log("host %"INT32 " numOutstanding is %"INT32, hh->m_hostId,
// hh->m_pingInfo.m_udpSlotsInUseIncoming);
if ( hh->m_pingInfo.m_udpSlotsInUseIncoming >
minOutstandingRequests )
continue;
minOutstandingRequests =hh->m_pingInfo.m_udpSlotsInUseIncoming;
minOutstandingRequestsIndex = i;
}
// we should never return a nospider/noquery host depending on
// the niceness, so return bestDead
if(minOutstandingRequestsIndex == -1) return bestDead;//shard;
return &shard[minOutstandingRequestsIndex];
}
// if all are dead just return host #0
Host *Hostdb::getFirstAliveHost ( ) {
for ( int32_t i = 0 ; i < m_numHosts ; i++ )
@ -1990,8 +2050,9 @@ bool Hostdb::saveHostsConf ( ) {
sprintf ( filename, "%shosts.conf", m_dir );
log ( LOG_INFO, "conf: Writing hosts.conf file to: %s",
filename );
int32_t fd = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH );
int32_t fd = open ( filename, O_CREAT|O_WRONLY|O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH );
if ( fd < 0 ) {
log ( "conf: Failed to open %s for writing.", filename );
return false;

@ -211,6 +211,7 @@ class Host {
int64_t m_lastPing;
char m_tmpBuf[4];
int16_t m_tmpCount;
// . first time we sent an unanswered ping request to this host
// . used so we can determine when to send an email alert
@ -337,6 +338,10 @@ class Host {
int32_t m_lastTryError;
int32_t m_lastTryTime;
bool m_spiderEnabled;
bool m_queryEnabled;
//char m_requestBuf[MAX_PING_SIZE];
PingInfo m_pingInfo;//RequestBuf;
};
@ -445,6 +450,8 @@ class Hostdb {
//Host *getLiveHostInGroup ( int32_t groupId );
Host *getLiveHostInShard ( int32_t shardNum );
Host *getLeastLoadedInShard ( uint32_t shardNum , char niceness );
int32_t getHostIdWithSpideringEnabled ( uint32_t shardNum );
// in the entire cluster. return host #0 if its alive, otherwise
// host #1, etc.
@ -464,6 +471,7 @@ class Hostdb {
return &m_hosts[shardNum * m_numHostsPerShard];
};
//Host *getGroupFromGroupId ( uint32_t gid ) {
// return getGroup ( gid );
//};

@ -1778,8 +1778,8 @@ bool HttpServer::sendSuccessReply ( TcpSocket *s , char format, char *addMsg) {
else now = getTimeLocal();
// . buffer for the MIME request and brief html err msg
// . NOTE: ctime appends a \n to the time, so we don't need to
char msg[1024];
SafeBuf sb(msg,1024,0,false);
char msg[1524];
SafeBuf sb(msg,1524,0,false);
char *tt = asctime(gmtime ( &now ));
tt [ gbstrlen(tt) - 1 ] = '\0';
@ -1838,7 +1838,7 @@ bool HttpServer::sendSuccessReply ( TcpSocket *s , char format, char *addMsg) {
// use this new function that will compress the reply now if the
// request was a ZET instead of a GET
return sendReply2 ( msg , sb.length() , NULL , 0 , s );
return sendReply2 ( sb.getBufStart(), sb.length() , NULL , 0 , s );
}
bool HttpServer::sendErrorReply ( GigablastRequest *gr ) {
@ -1851,8 +1851,8 @@ bool HttpServer::sendErrorReply ( GigablastRequest *gr ) {
else now = getTimeLocal();
int32_t format = gr->m_hr.getReplyFormat();
char msg[1024];
SafeBuf sb(msg,1024,0,false);
char msg[1524];
SafeBuf sb(msg,1524,0,false);
char *tt = asctime(gmtime ( &now ));
tt [ gbstrlen(tt) - 1 ] = '\0';
@ -1904,7 +1904,7 @@ bool HttpServer::sendErrorReply ( GigablastRequest *gr ) {
// use this new function that will compress the reply now if the
// request was a ZET instead of a GET
return sendReply2 ( msg , sb.length() , NULL , 0 , gr->m_socket );
return sendReply2 ( sb.getBufStart(),sb.length(),NULL,0,gr->m_socket );
}
// . send an error reply, like "HTTP/1.1 404 Not Found"
@ -1931,8 +1931,8 @@ bool HttpServer::sendErrorReply ( TcpSocket *s , int32_t error , char *errmsg ,
// . buffer for the MIME request and brief html err msg
// . NOTE: ctime appends a \n to the time, so we don't need to
char msg[1024];
SafeBuf sb(msg,1024,0,false);
char msg[1524];
SafeBuf sb(msg,1524,0,false);
// if it's a 404, redirect to home page
/*
if ( error == 404 )
@ -2000,8 +2000,8 @@ bool HttpServer::sendErrorReply ( TcpSocket *s , int32_t error , char *errmsg ,
// record it
if ( bytesSent ) *bytesSent = sb.length();//sendBufSize;
// use this new function that will compress the reply now if the
// request was a ZET instead of a GET
return sendReply2 ( msg , sb.length() , NULL , 0 , s );
// request was a ZET instead of a GET mdw
return sendReply2 ( sb.getBufStart() , sb.length() , NULL , 0 , s );
/*
// . this returns false if blocked, true otherwise

@ -1007,7 +1007,10 @@ void Images::thumbStart_r ( bool amThread ) {
// Open/Create temporary file to store image to
int fhndl;
if( (fhndl = open( in, O_RDWR+O_CREAT, S_IWUSR+S_IRUSR )) < 0 ) {
if( (fhndl = open( in, O_RDWR+O_CREAT ,
getFileCreationFlags()
// S_IWUSR+S_IRUSR
)) < 0 ) {
log( "image: Could not open file, %s, for writing: %s - %d.",
in, mstrerror( m_errno ), fhndl );
m_imgDataSize = 0;

@ -512,3 +512,66 @@ bool endsInCurly ( char *s , int32_t slen ) {
if ( e >= m && *e == '}' ) return true;
return false;
}
// Accepts a json string which has a top level object and a "key":val pair
// return false unless jsonStr has the new key:val
bool Json::prependKey(SafeBuf& jsonStr, char* keyVal) {
int32_t ndx = jsonStr.indexOf('{');
// no object? try array? fail for now
if( ndx == -1 || ndx == jsonStr.length() - 1 ) return false;
ndx++; //the insert pos
if(ndx == jsonStr.length()) return false;
// find if the object had any other keys
int32_t jsonStrLen = jsonStr.length();
int32_t i = ndx;
while(i < jsonStrLen && isspace(jsonStr[i])) i++;
if( i == jsonStrLen ) return false;
if (jsonStr[i] != '}') {
jsonStr.insert(",\n", i);
} //else we are the only item, no comma
return jsonStr.insert(keyVal, ndx);
}
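A quick usage sketch for prependKey(); the SafeBuf must already hold a top-level object, and the key/value literal below is illustrative only:

    SafeBuf js;
    js.safePrintf("{\"title\":\"x\"}");
    bool ok = Json::prependKey ( js , "\"gbssUrl\":\"http://a.com/\"" );
    // on success js now holds: {"gbssUrl":"http://a.com/",
    //                           "title":"x"}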
// bool Json::printToString(SafeBuf& out, JsonItem* ji = NULL) {
// if(!ji) ji = getFirstItem();
// for ( ; ji ; ji = ji->m_next ) {
// switch (ji->m_type) {
// case JT_NULL:
// out.safeMemcpy("null", 4);
// break;
// case JT_NUMBER:
// int32_t vl;
// char* v = ji->getValueAsString(&vl);
// out.safeMemcpy(v, vl);
// break;
// case JT_STRING:
// int32_t vl;
// char* v = ji->getValueAsString(&vl);
// out.pushChar('"');
// out.safeMemcpy(v, vl);
// out.pushChar('"');
// break;
// case JT_ARRAY:
// // wha? really? I would've thought this would contain
// // jsonitems and not a string
// safeMemcpy(ji->m_valueArray, ji->m_valueArray);
// break;
// case JT_OBJECT:
// out.pushChar('{');
// out.safeMemcpy(v, vl);
// out.pushChar("\"");
// break;
// }
// }
// out->
// }

Json.h

@ -24,6 +24,7 @@ class JsonItem {
class JsonItem *m_next,*m_prev;
class JsonItem *m_parent;//child;
// the JT_* values above
int m_type;
@ -43,7 +44,6 @@ class JsonItem {
char *m_valueArray;
// for JT_String
int32_t getValueLen() { return m_valueLen; };
@ -78,6 +78,8 @@ class Json {
JsonItem *parseJsonStringIntoJsonItems ( char *json , int32_t niceness );
bool printToString(SafeBuf& out);
JsonItem *getFirstItem ( ) ;
JsonItem *getItem ( char *name );
@ -86,6 +88,9 @@ class Json {
Json() { m_stackPtr = 0; m_prev = NULL; };
static bool prependKey(SafeBuf& jsonString, char* newKey);
SafeBuf m_sb;
JsonItem *m_stack[MAXJSONPARENTS];
int32_t m_stackPtr;

@ -145,7 +145,7 @@ bool Language::convertLatin1DictToUTF8( char *infile ){
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
return log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));
@ -2763,7 +2763,7 @@ bool Language::makeWordFiles ( int32_t numWordsToDump , int32_t numWordsPerPhras
// then open a new one for appending
fds[i] = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fds[i] < 0 )
return log("lang: Could not open %s for writing: "
"%s.",ff, strerror(errno));
@ -3146,7 +3146,7 @@ bool Language::makePopFiles ( int32_t numWordsToDump , int32_t numWordsPerPhrase
// then open a new one for appending
fds[i] = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fds[i] < 0 )
return log("lang: Could not open %s for writing: "
"%s.",ff, strerror(errno));
@ -3683,7 +3683,7 @@ bool Language::makeQueryFiles ( ) {
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
return log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));
@ -3874,7 +3874,7 @@ bool Language::makeWikiFiles( ) {
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));
@ -4250,7 +4250,7 @@ bool Language::gotTermFreqs( StateDict *st ){
// then open a new one for appending
fd = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fd < 0 ){
log("lang: Could not open %s for writing: "
"%s.",ff, strerror(errno));
@ -4338,7 +4338,7 @@ bool StateAff::openAffinityFile( ){
unlink ( ff );
// then open a new one for appending
m_fdw = open ( ff , O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( m_fdw < 0 ){
log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));
@ -4537,7 +4537,7 @@ bool Language::cleanDictFile ( ) {
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
return log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));
@ -4590,7 +4590,7 @@ bool Language::makePhonet( char *infile){
// then open a new one for appending
fdw = open ( outfile ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 )
return log("lang: Could not open %s for writing: "
"%s.", outfile, strerror(errno));
@ -4711,7 +4711,7 @@ bool Language::genTopPopFile ( char *infile ){
// then open a new one for appending
fdw = open ( outfile ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 )
return log("lang: Could not open %s for writing: "
"%s.", outfile, strerror(errno));
@ -4761,7 +4761,8 @@ bool Language::genDistributedPopFile ( char *infile, uint32_t myHash ){
// then open a new one for appending
fdw = open ( outfile ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 )
return log("lang: Could not open %s for writing: "
"%s.", outfile, strerror(errno));
@ -4848,7 +4849,8 @@ int32_t Language::spellcheckDict(){
// then open a new one for appending
fdw = open ( outfile ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 )
return log("lang: Could not open %s for writing: "
"%s.", outfile, strerror(errno));

@ -961,7 +961,7 @@ static bool s_isLangTag(char *str) {
static uint8_t s_getCountryFromSpec(char *str) {
char code[6];
memset(code, 6, 0);
memset(code, 0, 6);
gbmemcpy(code, str, s_wordLen(str));
for(int x = 0; x < 6; x++)
if(code[x] > 'A' && code[x] < 'Z') code[x] -= ('A' - 'a');

@ -603,6 +603,10 @@ bool getLinkInfo ( SafeBuf *reqBuf ,
Host *hosts = g_hostdb.getShard ( shardNum); // Group ( groupId );
if ( hostNum >= numHosts ) { char *xx = NULL; *xx = 0; }
int32_t hostId = hosts [ hostNum ].m_hostId ;
if( !hosts [ hostNum ].m_spiderEnabled) {
hostId = g_hostdb.getHostIdWithSpideringEnabled ( shardNum );
}
// . serialize the string buffers
// . use Msg25Request::m_buf[MAX_NEEDED]
@ -665,7 +669,16 @@ static void sendReplyWrapper ( void *state ) {
// sanity
if ( req->m_udpSlot != slot2 ) { char *xx=NULL;*xx=0;}
// if in table, nuke it
g_lineTable.removeKey ( &req->m_siteHash64 );
// but only if it was in SITE mode, not PAGE. we've lost our
// table entry like this before.
// TODO: if this still doesn't work then ensure the stored 'req'
// is the same!
if ( req->m_mode == MODE_SITELINKINFO ) {
g_lineTable.removeKey ( &req->m_siteHash64 );
if ( g_conf.m_logDebugLinkInfo )
log("linkdb: removing sitehash64=%"INT64"",
req->m_siteHash64);
}
nextLink:
@ -746,6 +759,7 @@ void handleRequest25 ( UdpSlot *slot , int32_t netnice ) {
if ( head->m_next )
req->m_next = head->m_next;
head->m_next = req;
req->m_waitingInLine = 1;
// note it for debugging
log("build: msg25 request waiting in line for %s "
"udpslot=0x%"PTRFMT"",
@ -755,6 +769,8 @@ void handleRequest25 ( UdpSlot *slot , int32_t netnice ) {
return;
}
req->m_waitingInLine = 0;
// make a new Msg25
Msg25 *m25;
try { m25 = new ( Msg25 ); }

@ -76,6 +76,15 @@ public:
int32_t m_ourHostHash32 ;
int32_t m_ourDomHash32 ;
uint8_t m_waitingInLine:1;
uint8_t m_reserved1:1;
uint8_t m_reserved2:1;
uint8_t m_reserved3:1;
uint8_t m_reserved4:1;
uint8_t m_reserved5:1;
uint8_t m_reserved6:1;
uint8_t m_reserved7:1;
// new stuff
int32_t m_siteHash32;
int64_t m_siteHash64;

Log.cpp

@ -132,8 +132,9 @@ bool Log::init ( char *filename ) {
// open it for appending.
// create with -rw-rw-r-- permissions if it's not there.
m_fd = open ( m_filename ,
O_APPEND | O_CREAT | O_RDWR ,
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
O_APPEND | O_CREAT | O_RDWR ,
getFileCreationFlags() );
// S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
if ( m_fd >= 0 ) return true;
// bitch to stderr and return false on error
fprintf(stderr,"could not open log file %s for appending\n",
@ -422,8 +423,9 @@ bool Log::makeNewLogFile ( ) {
// open it for appending.
// create with -rw-rw-r-- permissions if it's not there.
m_fd = open ( m_filename ,
O_APPEND | O_CREAT | O_RDWR ,
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
O_APPEND | O_CREAT | O_RDWR ,
getFileCreationFlags() );
// S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
if ( m_fd >= 0 ) return true;
// bitch to stderr and return false on error
fprintf(stderr,"could not open new log file %s for appending\n",

@ -1014,7 +1014,7 @@ void printStackTrace ( int signum , siginfo_t *info , void *ptr ) {
// right now only works for 32 bit
//if ( arch != 32 ) return;
logf(LOG_DEBUG,"gb: seg fault. printing stack trace. use "
logf(LOG_DEBUG,"gb: Printing stack trace. use "
"'addr2line -e gb' to decode the hex below.");
if ( g_inMemFunction ) {
@ -1035,6 +1035,16 @@ void printStackTrace ( int signum , siginfo_t *info , void *ptr ) {
//,ba
//,g_profiler.getFnName(ba,0));
);
#ifdef INLINEDECODE
char cmd[256];
sprintf(cmd,"addr2line -e gb 0x%"XINT64" > ./tmpout"
,(uint64_t)s_bt[i]);
gbsystem ( cmd );
char obuf[1024];
SafeBuf fb (obuf,1024);
fb.load("./tmpout");
log("stack: %s",fb.getBufStart());
#endif
}
}
@ -1171,7 +1181,8 @@ void sigvtalrmHandler ( int x , siginfo_t *info , void *y ) {
//g_inSigHandler = true;
// NOT SAFE for pthreads cuz we're in sig handler
#ifndef PTHREADS
log("loop: missed quickpoll");
log("loop: missed quickpoll. Dumping stack.");
printStackTrace( x , info , y );
#endif
//g_inSigHandler = false;
// seems to core a lot in gbcompress() we need to
@ -1183,15 +1194,19 @@ void sigvtalrmHandler ( int x , siginfo_t *info , void *y ) {
}
// if it has been a while since heartbeat (> 10000ms) dump core so
// we can see where the process was... that is a missed quick poll?
// we can see where the process was... we are in a long niceness 0
// function or a niceness 1 function without a quickpoll, so that
// heartbeatWrapper() function never gets called.
if ( g_process.m_lastHeartbeatApprox == 0 ) return;
if ( g_conf.m_maxHeartbeatDelay <= 0 ) return;
if ( g_nowApprox - g_process.m_lastHeartbeatApprox >
g_conf.m_maxHeartbeatDelay ) {
#ifndef PTHREADS
logf(LOG_DEBUG,"gb: CPU seems blocked. Forcing core.");
logf(LOG_DEBUG,"gb: CPU seems blocked. Dumping stack.");
printStackTrace( x , info , y );
#endif
//char *xx=NULL; *xx=0;
}
//logf(LOG_DEBUG, "xxx now: %"INT64"! approx: %"INT64"", g_now, g_nowApprox);
@ -2708,6 +2723,32 @@ void Loop::enableTimer() {
}
FILE* gbpopen(char* cmd) {
// Block everything from interrupting this system call because
// if there is an alarm or a child thread crashes (pdftohtml)
// then this will hang forever.
// We should actually write our own popen so that we do
// fork, close all fds in the child, then exec.
// These child processes can hold open the http server and
// prevent a new gb from running even after it has died.
g_loop.disableTimer();
sigset_t oldSigs;
sigset_t sigs;
sigfillset ( &sigs );
if ( sigprocmask ( SIG_BLOCK , &sigs, &oldSigs ) < 0 ) {
log("build: had error blocking signals for popen");
}
FILE* fh = popen(cmd, "r");
if ( sigprocmask ( SIG_SETMASK , &oldSigs, NULL ) < 0 ) {
log("build: had error unblocking signals for popen");
}
g_loop.enableTimer();
return fh;
}
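A usage sketch for gbpopen(); the command line is hypothetical (pdftohtml is named above only as an example of a child that can crash), and the pipe is closed with pclose() just as with plain popen():

    char cmd[256];
    snprintf ( cmd , 256 , "pdftohtml -i -stdout %s" , "/tmp/doc.pdf" );
    FILE *fh = gbpopen ( cmd );
    if ( fh ) {
        char line[1024];
        while ( fgets ( line , 1024 , fh ) ) {
            // consume the converter's output line by line
        }
        pclose ( fh );
    }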
//calling with a 0 niceness will turn off the timer interrupt

Loop.h

@ -18,7 +18,9 @@
#define QUERYPRIORITYWEIGHT 16
#define QUICKPOLL_INTERVAL 10
int gbsystem(char *cmd ) ;
int gbsystem(char *cmd);
FILE* gbpopen(char* cmd);
#define sleep(a) { char *xx=NULL;*xx=0; }
//#define sleep(a) logf(LOG_INFO,"sleep: sleep");

@ -67,7 +67,7 @@ OBJS = UdpSlot.o Rebalance.o \
Dates.o Sections.o SiteGetter.o Syncdb.o qa.o \
Placedb.o Address.o Test.o GeoIP.o GeoIPCity.o Synonyms.o \
Cachedb.o Monitordb.o dlstubs.o PageCrawlBot.o Json.o PageBasic.o \
Version.o
Punycode.o Version.o
CHECKFORMATSTRING = -D_CHECK_FORMAT_STRING_
@ -407,7 +407,7 @@ Linkdb.o:
# final gigabit generation in here:
Msg40.o:
$(CC) $(DEFS) $(CPPFLAGS) -O3 -c $*.cpp
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
seo.o:
$(CC) $(DEFS) $(CPPFLAGS) -O3 -c $*.cpp
@ -788,14 +788,5 @@ install-pkgs-local:
warcinjector:
-rm -r /home/zak/.pex/build/inject-*
-rm -r /home/zak/.pex/install/inject-*
cd script && pex -v . requests pyopenssl ndg-httpsclient pyasn1 multiprocessing flask -e inject -o warc-inject --inherit-path --no-wheel
cd script && pex -v . gevent gevent-socketio requests pyopenssl ndg-httpsclient pyasn1 multiprocessing -e inject -o warc-inject --inherit-path --no-wheel
#pex -v inject requests pyopenssl ndg-httpsclient pyasn1 multiprocessing flask -e inject:main -o script/warc-inject -f '/home/zak/repos/open-source-search-engine/script' --inherit-path --no-wheel
#pex -v inject requests pyopenssl ndg-httpsclient pyasn1 multiprocessing flask -e inject:main -o script/warc-inject -f '/home/zak/repos/open-source-search-engine/script' --inherit-path --no-wheel
# pex -r requests -r pyopenssl -r ndg-httpsclient -r pyasn1 -r multiprocessing -e inject.inject:main -o script/warc-inject -f '/home/zak/repos/open-source-search-engine/script/' --inherit-path --no-wheel

@ -1736,6 +1736,75 @@ bool Matches::negTermsFound ( ) {
}
*/
bool Matches::docHasQueryTerms(int32_t totalInlinks) {
// Loop through all matches keeping a count of query term matches
// from link text.
// If a match is not from a link text max it out.
// Tally up the matched terms vs number of matches
// if only one or two link text matches out of > 10 then
// return false indicating that the doc does not
// have the term
if(m_numMatches == 0) {
// if there is no query and no matches then short circuit
return true;
}
int32_t qterms = 1024;
int32_t tmpBuf[qterms];
int32_t *numMatches = tmpBuf;
if(qterms < m_q->m_numTerms) {
qterms = m_q->m_numTerms;
numMatches = (int32_t *)mmalloc(qterms * sizeof(int32_t),
"matchesAnomaly");
}
memset(numMatches, 0, qterms * sizeof(int32_t));
for ( int32_t i = 0 ; i < m_numMatches ; i++ ) {
// get the match
Match *m = &m_matches[i];
if(m->m_flags & MF_LINK) {
numMatches[m->m_qwordNum]++;
continue;
}
numMatches[m->m_qwordNum] = m_numMatches;
// log("match flag %x wordnum %"INT32 " totalinlinks:%"INT32,
// m->m_flags, m->m_wordNum, totalInlinks);
}
// Assume the best, since we're really only after anomalous link text
// at this point.
bool hasTerms = true;
int32_t nqt = m_q->m_numTerms;
for ( int32_t i = 0 ; i < nqt ; i++ ) {
QueryTerm *qt = &m_q->m_qterms[i];
// skip if ignored *in certain ways only*
if ( ! isMatchableTerm ( qt ) ) {
continue;
}
// get the word it is from
QueryWord *qw = qt->m_qword;
// It is a match if it matched something other than link text
// or it matched at least 1 link text and there aren't many link texts
// or it matched more than 2 link texts and there are many link texts
hasTerms &= ((numMatches[qw->m_wordNum] >= m_numMatches) ||
(numMatches[qw->m_wordNum] > 0 && totalInlinks < 10) ||
(numMatches[qw->m_wordNum] > 2 && totalInlinks > 10));
}
if (numMatches != tmpBuf) {
mfree(numMatches, qterms * sizeof(int32_t), "matchesAnomaly");
}
return hasTerms;
}
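To make those three clauses concrete, a worked case with hypothetical numbers: say m_numMatches = 15 and a query term matched exactly once, in link text, on a page with totalInlinks = 15. Then numMatches[w] is 1 and every clause fails (1 < 15, the inlink count is not under 10, and 1 is not greater than 2), so hasTerms goes false and the lone link-text match is treated as anomalous.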
MatchOffsets::MatchOffsets() {
reset();
}
@ -1804,6 +1873,7 @@ bool MatchOffsets::set(Xml * xml, Words *words, Matches *matches,
return true;
}
int32_t MatchOffsets::getStoredSize() {
return m_numMatches * 5
+ 4 //numMatches

@ -148,6 +148,7 @@ class Matches {
//int32_t getTermsFound ( bool *hadPhrases , bool *hadWords );
uint32_t getTermsFound2(bool *hadPhrases, bool *hadWords);
//bool negTermsFound ( );
bool docHasQueryTerms(int32_t totalInlinks);
// used internally and by PageGet.cpp
bool isMatchableTerm ( class QueryTerm *qt );//, int32_t i );

@ -1718,7 +1718,7 @@ void Mem::gbfree ( void *ptr , int size , const char *note ) {
int32_t slot = g_mem.getMemSlot ( ptr );
if ( slot < 0 ) {
log(LOG_LOGIC,"mem: could not find slot (note=%s)",note);
log(LOG_LOGIC,"mem: FIXME!!!");
//log(LOG_LOGIC,"mem: FIXME!!!");
// return for now so procog does not core all the time!
return;
//char *xx = NULL; *xx = 0;

@ -282,6 +282,12 @@ bool Msg13::forwardRequest ( ) {
int32_t nh = g_hostdb.m_numHosts;
int32_t hostId = hash32h(((uint32_t)r->m_firstIp >> 8), 0) % nh;
if((uint32_t)r->m_firstIp >> 8 == 0) {
// If the first IP is not set for the request then we don't
// want to hammer the first host with spidering enabled.
hostId = hash32n ( r->ptr_url ) % nh;
}
// avoid host #0 for diffbot hack which is dropping some requests
// because of the streaming bug methinks
if ( hostId == 0 && nh >= 2 && g_conf.m_diffbotMsg13Hack )
@ -295,12 +301,22 @@ bool Msg13::forwardRequest ( ) {
// get that host
//h = g_hostdb.getProxy ( hostId );;
h = g_hostdb.getHost ( hostId );
// stop if he is alive
if ( ! g_hostdb.isDead ( h ) ) break;
// Get the other one in shard instead of getting the first
// one we find sequentially because that makes the load
// imbalanced to the lowest host with spidering enabled.
if(!h->m_spiderEnabled) {
h = g_hostdb.getHost(g_hostdb.getHostIdWithSpideringEnabled(
h->m_hostId));
}
// stop if he is alive and able to spider
if ( h->m_spiderEnabled && ! g_hostdb.isDead ( h ) ) break;
// get the next otherwise
if ( ++hostId >= nh ) hostId = 0;
}
hostId = 0; // HACK!!
// forward it to self if we are the spider proxy!!!
@ -2364,7 +2380,7 @@ bool getTestSpideredDate ( Url *u , int32_t *origSpideredDate , char *testDir )
bool addTestSpideredDate ( Url *u , int32_t spideredTime , char *testDir ) {
// ensure dir exists
::mkdir(testDir,S_IRWXU);
::mkdir(testDir,getDirCreationFlags());
// set this
int64_t uh64 = hash64(u->getUrl(),u->getUrlLen());

@ -57,8 +57,9 @@ void handleRequest ( UdpSlot *slot , int32_t netnice ) {
return;
}
int32_t fd = open ( filename , O_RDONLY,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
int32_t fd = open ( filename , O_RDONLY ,
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
if ( fd < 0 ) {
log(LOG_DEBUG, "logviewer: Failed to open %s for reading: ",
filename);

@ -177,6 +177,12 @@ bool Msg20::getSummary ( Msg20Request *req ) {
int32_t timeout = 9999999; // 10 million seconds, basically inf.
if ( req->m_niceness == 0 ) timeout = 20;
// for diffbot make timeout super long so we aren't tripped up
// by dead hosts that aren't really dead.
// CollectionRec *cr = g_collectiondb.getRec ( req->m_collnum );
// if ( cr && cr->m_isCustomCrawl && req->m_niceness == 0 )
// timeout = 300;
// get our group
int32_t allNumHosts = hostdb->getNumHostsPerShard();
Host *allHosts = hostdb->getShard ( shardNum );//getGroup(groupId );
@ -189,13 +195,29 @@ bool Msg20::getSummary ( Msg20Request *req ) {
Host *hh = &allHosts[i];
// skip if dead
if ( g_hostdb.isDead(hh) ) continue;
// Respect no-spider, no-query directives from hosts.conf
if ( !req->m_getLinkInfo && ! hh->m_queryEnabled ) continue;
if ( req->m_getLinkInfo && ! hh->m_spiderEnabled ) continue;
// add it if alive
cand[nc++] = hh;
}
// if none alive, make them all candidates then
bool allDead = (nc == 0);
for ( int32_t i = 0 ; allDead && i < allNumHosts ; i++ )
for ( int32_t i = 0 ; allDead && i < allNumHosts ; i++ ) {
// NEVER add a noquery host to the candidate list, even
// if the query host is dead
if ( ! allHosts[i].m_queryEnabled ) continue;
cand[nc++] = &allHosts[i];
}
if ( nc == 0 ) {
log("msg20: error sending mcast: no queryable hosts "
"availble to handle summary generation");
g_errno = EBADENGINEER;
m_gotReply = true;
return true;
}
// route based on docid region, not parity, because we want to hit
// the urldb page cache as much as possible

@ -157,46 +157,13 @@ bool Msg22::getTitleRec ( Msg22Request *r ,
if ( hostNum >= numHosts ) { char *xx = NULL; *xx = 0; }
firstHostId = hosts [ hostNum ].m_hostId ;
*/
Host *firstHost ;
// if niceness 0 can't pick noquery host.
// if niceness 1 can't pick nospider host.
firstHost = g_hostdb.getLeastLoadedInShard ( shardNum, r->m_niceness );
int32_t firstHostId = firstHost->m_hostId;
// get our group
int32_t allNumHosts = g_hostdb.getNumHostsPerShard();
Host *allHosts = g_hostdb.getShard ( shardNum );//Group ( groupId );
// put all alive hosts in this array
Host *cand[32];
int64_t nc = 0;
for ( int32_t i = 0 ; i < allNumHosts ; i++ ) {
// get that host
Host *hh = &allHosts[i];
// skip if dead
if ( g_hostdb.isDead(hh) ) continue;
// add it if alive
cand[nc++] = hh;
}
// if none alive, make them all candidates then
bool allDead = (nc == 0);
for ( int32_t i = 0 ; allDead && i < allNumHosts ; i++ )
cand[nc++] = &allHosts[i];
// route based on docid region, not parity, because we want to hit
// the urldb page cache as much as possible
int64_t sectionWidth =((128LL*1024*1024)/nc)+1;//(DOCID_MASK/nc)+1LL;
// we mod by 1MB since tied scores resort to sorting by docid
// so we don't want to overload the host responsible for the lowest
// range of docids. CAUTION: do this for msg22 too!
// in this way we should still ensure a pretty good biased urldb
// cache...
// . TODO: fix the urldb cache preload logic
int32_t hostNum = (docId % (128LL*1024*1024)) / sectionWidth;
if ( hostNum < 0 ) hostNum = 0; // watch out for negative docids
if ( hostNum >= nc ) { char *xx = NULL; *xx = 0; }
int32_t firstHostId = cand [ hostNum ]->m_hostId ;
// while this prevents tfndb seeks, it also causes bottlenecks
// if one host is particularly slow, because load balancing is
// bypassed.
//if ( ! g_conf.m_useBiasedTfndb ) firstHostId = -1;
// flag it
m_outstanding = true;
r->m_inUse = 1;

@ -1115,6 +1115,8 @@ bool Msg3::doneScanning ( ) {
m_lists[i].getListSize() ,
0 ); // timestamp. 0 = now
QUICKPOLL(m_niceness);
// if from our 'page' cache, no need to constrain
if ( ! m_lists[i].constrain ( m_startKey ,
m_constrainKey , // m_endKey

@ -470,6 +470,12 @@ bool Msg3a::gotCacheReply ( ) {
for ( int32_t i = 0; i < m_numHosts ; i++ ) { // m_indexdbSplit; i++ ) {
// get that host
Host *h = g_hostdb.getHost(i);
if(!h->m_queryEnabled) {
m_numReplies++;
continue;
}
// if not a full split, just round robin the group, i am not
// going to sweat over performance on non-fully split indexes
// because they suck really bad anyway compared to full
@ -701,10 +707,12 @@ bool Msg3a::gotAllShardReplies ( ) {
// bad reply?
if ( ! mr || replySize < 29 ) {
m_skippedShards++;
log(LOG_LOGIC,"query: msg3a: Bad reply (size=%i) from "
"host #%"INT32". Dead? Timeout? OOM?"
,(int)replySize
,i);
if(g_hostdb.getHost(i)->m_queryEnabled) {
log(LOG_LOGIC,"query: msg3a: Bad reply (size=%i) from "
"host #%"INT32". Dead? Timeout? OOM?"
,(int)replySize
,i);
}
m_reply [i] = NULL;
m_replyMaxSize[i] = 0;
// it might have been timd out, just ignore it!!

@ -1434,8 +1434,9 @@ bool saveAddsInProgress ( char *prefix ) {
sprintf ( filename , "%s%saddsinprogress.saving",
g_hostdb.m_dir , prefix );
int32_t fd = open ( filename, O_RDWR | O_CREAT | O_TRUNC ,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH );
int32_t fd = open ( filename, O_RDWR | O_CREAT | O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH );
if ( fd < 0 ) {
log ("build: Failed to open %s for writing: %s",
filename,strerror(errno));
@ -1460,6 +1461,12 @@ bool saveAddsInProgress ( char *prefix ) {
// 4 bytes is how much of the total buffer is used, including
// those 4 bytes.
if ( used == 4 ) continue;
// test it
if ( used <= 4 || used > 300000000 ) { // > 300MB????
log("msg4: saving addsinprogress. bad bucket "
"used size of %"INT32,used);
continue;
}
// the buf itself
write ( fd , s_hostBufs[i] , used );
}
@ -1473,6 +1480,20 @@ bool saveAddsInProgress ( char *prefix ) {
if ( ! slot->m_callback ) continue;
// skip if got reply
if ( slot->m_readBuf ) continue;
// if not sending something, skip
if ( ! slot->m_sendBuf ) continue;
// test it
int32_t used = *(int32_t *)slot->m_sendBuf;
if ( used <= 4 || used > 300000000 ) { // > 300MB????
log("msg4: saving addsinprogress. bad slot "
"used size of %"INT32,used);
continue;
}
if ( used != slot->m_sendBufSize ) {
log("msg4: saving addsinprogress. bad used size of "
"%"INT32" != %"INT32,used,slot->m_sendBufSize);
continue;
}
// write hostid sent to
write ( fd , &slot->m_hostId , 4 );
// write that
@ -1510,6 +1531,9 @@ bool saveAddsInProgress ( char *prefix ) {
g_hostdb.m_dir , prefix );
::rename ( filename , newFilename );
log(LOG_INFO,"build: Renamed %s to %s",filename,newFilename);
return true;
}
@ -1577,12 +1601,12 @@ bool loadAddsInProgress ( char *prefix ) {
p += 4;
if ( numHostBufs != s_numHostBufs ) {
g_errno = EBADENGINEER;
return log("build: addsinprogress.dat has wrong number of "
"host bufs.");
log("build: addsinprogress.dat has wrong number of "
"host bufs.");
}
// deserialize each hostbuf
for ( int32_t i = 0 ; i < s_numHostBufs ; i++ ) {
for ( int32_t i = 0 ; i < numHostBufs ; i++ ) {
// break if nothing left to read
if ( p >= pend ) break;
// USED size of the buf
@ -1595,6 +1619,8 @@ bool loadAddsInProgress ( char *prefix ) {
s_hostBufSizes[i] = 0;
continue;
}
if ( used < 4 || used > 300000000 )
return log("msg4: bad used bytes in bucket 1");
// malloc the min buf size
int32_t allocSize = MAXHOSTBUFSIZE;
if ( allocSize < used ) allocSize = used;
@ -1620,6 +1646,12 @@ bool loadAddsInProgress ( char *prefix ) {
log("build: file %s is bad.",filename);
char *xx = NULL; *xx = 0;
}
if ( i >= s_numHostBufs ) {
mfree ( buf , allocSize ,"hostbuf");
log("build: skipping host buf #%"INT32,i);
continue;
}
// set the array
s_hostBufs [i] = buf;
s_hostBufSizes [i] = allocSize;
@ -1635,15 +1667,12 @@ bool loadAddsInProgress ( char *prefix ) {
p += 4;
// get host
Host *h = g_hostdb.getHost(hostId);
// must be there
if ( ! h ) {
close (fd);
return log("build: bad msg4 hostid %"INT32"",hostId);
}
// host many bytes
int32_t numBytes;
read ( fd , (char *)&numBytes , 4 );
p += 4;
if ( numBytes < 4 || numBytes > 300000000 )
return log("msg4: bad used bytes in slot 1");
// allocate buffer
char *buf = (char *)mmalloc ( numBytes , "msg4loadbuf");
if ( ! buf ) {
@ -1657,6 +1686,14 @@ bool loadAddsInProgress ( char *prefix ) {
return log("build: bad msg4 buf read");
}
p += numBytes;
// must be there
if ( ! h ) {
//close (fd);
log("build: bad msg4 hostid %"INT32" nb=%"INT32,
hostId,nb);
mfree ( buf , numBytes,"hostbuf");
continue;
}
// send it!
if ( ! g_udpServer.sendRequest ( buf ,
numBytes ,

@ -83,7 +83,7 @@ static bool gotSummaryWrapper ( void *state );
bool isSubDom(char *s , int32_t len);
Msg40::Msg40() {
m_firstTime = true;
m_calledFacets = false;
m_doneWithLookup = false;
m_socketHadError = 0;
m_buf = NULL;
@ -109,6 +109,8 @@ Msg40::Msg40() {
m_printCount = 0;
//m_numGigabitInfos = 0;
m_numCollsToSearch = 0;
m_numMsg20sIn = 0;
m_numMsg20sOut = 0;
}
#define MAX2 50
@ -1427,8 +1429,12 @@ bool Msg40::launchMsg20s ( bool recalled ) {
// hard limit
if ( m_numRequests-m_numReplies >= maxOut ) break;
// do not launch another until m_printi comes back because
// all summaries are bottlenecked on printing him out now
// all summaries are bottlenecked on printing him out now.
if ( m_si->m_streamResults &&
// must have at least one outstanding summary guy
// otherwise we can return true below and cause
// the stream to truncate results in gotSummary()
//m_numReplies < m_numRequests &&
i >= m_printi + MAX_OUTSTANDING_MSG20S - 1 )
break;
@ -1499,8 +1505,21 @@ bool Msg40::launchMsg20s ( bool recalled ) {
// if to a dead host, skip it
int64_t docId = m_msg3a.m_docIds[i];
uint32_t shardNum = g_hostdb.getShardNumFromDocId ( docId );
if ( g_hostdb.isShardDead ( shardNum ) ) {
log("msg40: skipping summary lookup #%"INT32" of "
// get the collection rec
CollectionRec *cr = g_collectiondb.getRec(m_firstCollnum);
// if shard is dead then do not send to it if not crawlbot
if ( g_hostdb.isShardDead ( shardNum ) &&
cr &&
// diffbot urls.csv downloads often encounter dead
// hosts that are not really dead, so wait for it
! cr->m_isCustomCrawl &&
// this is causing us to truncate streamed results
// too early when we have false positives that a
// host is dead because the server is locking up
// periodically
! m_si->m_streamResults ) {
log("msg40: skipping summary "
"lookup #%"INT32" of "
"docid %"INT64" for dead shard #%"INT32""
, i
, docId
@ -1547,8 +1566,6 @@ bool Msg40::launchMsg20s ( bool recalled ) {
// keep for-loops shorter with this
//if ( i > m_maxiLaunched ) m_maxiLaunched = i;
// get the collection rec
CollectionRec *cr =g_collectiondb.getRec(m_firstCollnum);
//getRec(m_si->m_coll2,m_si->m_collLen2);
if ( ! cr ) {
log("msg40: missing coll");
@ -1737,7 +1754,7 @@ Msg20 *Msg40::getAvailMsg20 ( ) {
if ( m_msg20[i]->m_launched ) continue;
return m_msg20[i];
}
// how can this happen???
// how can this happen??? THIS HAPPENED
char *xx=NULL;*xx=0;
return NULL;
}
@ -1762,7 +1779,7 @@ bool gotSummaryWrapper ( void *state ) {
THIS->m_numReplies,
THIS->m_msg3a.m_numDocIds);
// it returns false if we're still awaiting replies
if ( ! THIS->gotSummary ( ) ) return false;
if ( ! THIS->m_calledFacets && ! THIS->gotSummary ( ) ) return false;
// lookup facets
if ( THIS->m_si &&
! THIS->m_si->m_streamResults &&
@ -2215,12 +2232,11 @@ bool Msg40::gotSummary ( ) {
complete:
// . ok, now i wait for everybody.
// . ok, now i wait for all msg20s (getsummary) to come back in.
// . TODO: evaluate if this hurts us
if ( m_numReplies < m_numRequests )
return false;
// if streaming results, we are done
if ( m_si && m_si->m_streamResults ) {
// unless waiting for last transmit to complete
@ -2444,6 +2460,9 @@ bool Msg40::gotSummary ( ) {
for ( int32_t i = 0 ; dedupPercent && i < m_numReplies ; i++ ) {
// skip if already invisible
if ( m_msg3a.m_clusterLevels[i] != CR_OK ) continue;
// Skip if invalid
if ( m_msg20[i]->m_errno ) continue;
// start with the first docid we have not yet checked!
//int32_t m = oldNumContiguous;
// get it
@ -2462,6 +2481,8 @@ bool Msg40::gotSummary ( ) {
// skip if already invisible
if ( *level != CR_OK ) continue;
// get it
if ( m_msg20[m]->m_errno ) continue;
Msg20Reply *mrm = m_msg20[m]->m_r;
// do not dedup CT_STATUS results, those are
// spider reply "documents" that indicate the last
@ -6280,8 +6301,8 @@ bool Msg40::lookupFacets ( ) {
if ( m_doneWithLookup ) return true;
if ( m_firstTime ) {
m_firstTime = false;
if ( !m_calledFacets ) {
m_calledFacets = true;
m_numMsg20sOut = 0;
m_numMsg20sIn = 0;
m_j = 0;

@ -223,7 +223,7 @@ class Msg40 {
bool m_doneWithLookup;
HashTableX m_facetTextTable;
SafeBuf m_facetTextBuf;
bool m_firstTime;
bool m_calledFacets;
int32_t m_omitCount;
bool printFacetTables ( class SafeBuf *sb ) ;

@ -88,7 +88,7 @@ bool MsgC::getIp(char *hostname , int32_t hostnameLen ,
if ( g_dns.isInCache ( key , ip ) ) {
if ( *ip == 3 ) { char *xx=NULL;*xx=0; }
// debug msg
//log("dns::getIp: %s (key=%"UINT64") has ip=%s in cache!!!",
//log(LOG_DEBUG, "dns::getIp: %s (key=%"UINT64") has ip=%s in cache!!!",
// tmp,key.n0,iptoa(*ip));
return true;
}

@ -607,6 +607,11 @@ loop:
// debug msg
//log("Multicast:: no hosts left to send to");
g_errno = ENOHOSTS; return false; }
// log("build: msg %x sent to host %"INT32 " first hostId is %"INT32
// " oustanding msgs %"INT32,
// m_msgType, i, firstHostId, m_hostPtrs[i]->m_numOutstandingRequests);
// . send to this guy, if we haven't yet
// . returns false and sets g_errno on error
// . if it returns true, we sent ok, so we should return true

@ -3570,6 +3570,10 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
" <a href=/v3/crawl/download/%s_urls.csv>"
"new csv format</a>"
" <a href=/search?q=gbsortby"
"int%%3AgbssSpiderTime&n=50&c=%s>"
"last 50 download attempts</a>"
"</td>"
"</tr>"
@ -3645,6 +3649,10 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// urls.csv new format v3
, cr->m_coll
// last 50 downloaded urls
, cr->m_coll
// latest objects in html
, cr->m_coll
, rand64

@ -483,7 +483,7 @@ bool processLoop ( void *state ) {
"><td>"
//"<font face=times,sans-serif color=black size=-1>"
"<span style=\"%s\">"
"This is Gigablast's cached page of </span>"
"This is <a href=/>Gigablast<a>'s cached page of </span>"
"<a href=\"%s\" style=\"%s\">%s</a>"
"" , styleTitle, f->getUrl(), styleLink,
f->getUrl() );

@ -200,7 +200,7 @@ skipReplaceHost:
//"<td><b>resends sent</td>"
//"<td><b>errors recvd</td>"
"<td><b>try agains recvd</b></td>"
"<td><b>try agains sent</b></td>"
"<td><a href=\"/admin/hosts?c=%s&sort=3\">"
"<b>dgrams resent</b></a></td>"
@ -630,6 +630,15 @@ skipReplaceHost:
if ( !(flags & PFLAG_MERGEMODE0) )
fb.safePrintf ( "y");
if ( format == FORMAT_HTML && !h->m_spiderEnabled) {
fb.safePrintf("<span title=\"Spider Disabled\" style=\"text-decoration:line-through;\">S</span>");
}
if ( format == FORMAT_HTML && !h->m_queryEnabled) {
fb.safePrintf("<span title=\"Query Disabled\" style=\"text-decoration:line-through;\">Q</span>");
}
// clear it if it is us, this is invalid
if ( ! h->m_gotPingReply ) {
fb.reset();
@ -758,6 +767,13 @@ skipReplaceHost:
sb.safePrintf("\t\t<note>%s</note>\n",
h->m_note );
sb.safePrintf("\t\t<spider>%"INT32"</spider>\n",
(int32_t)h->m_spiderEnabled );
sb.safePrintf("\t\t<query>%"INT32"</query>\n",
(int32_t)h->m_queryEnabled );
sb.safePrintf("\t</host>\n");
continue;
@ -859,6 +875,14 @@ skipReplaceHost:
sb.safePrintf("\t\t\"note\":\"%s\"\n",
h->m_note );
sb.safePrintf("\t\t\"spider\":\"%"INT32"\"\n",
(int32_t)h->m_spiderEnabled );
sb.safePrintf("\t\t\"query\":\"%"INT32"\"\n",
(int32_t)h->m_queryEnabled );
sb.safePrintf("\t},\n");
continue;
@ -1313,12 +1337,14 @@ skipReplaceHost:
*/
"<tr class=poo>"
"<td>try agains recvd</td>"
"<td>try agains sent</td>"
"<td>How many ETRYAGAIN errors "
"were received in response to a "
"has this host sent out? they are sent out some times "
"in response to a "
"request to add data. Usually because the host's memory "
"is full and it is dumping its data to disk. This number "
"can be high if the host if failing to dump the data "
"can be relatively high if the host if failing to dump "
"the data "
"to disk because of some malfunction, and it can therefore "
"bottleneck the entire cluster."
"</td>"

@ -131,6 +131,53 @@ Host *getHostToHandleInjection ( char *url ) {
Host *group = g_hostdb.getShard ( shardNum );
int32_t hostNum = docId % g_hostdb.m_numHostsPerShard;
Host *host = &group[hostNum];
bool isWarcInjection = false;
int32_t ulen = gbstrlen(url);
if ( ulen > 10 && strcmp(url+ulen-8,".warc.gz") == 0 )
isWarcInjection = true;
if ( ulen > 10 && strcmp(url+ulen-5,".warc") == 0 )
isWarcInjection = true;
if ( ! isWarcInjection ) return host;
// warc files end up calling XmlDoc::indexWarcOrArc() which spawns
// a msg7 injection request for each doc in the warc/arc file
// so let's do load balancing differently for them so one host
// doesn't end up doing a bunch of wget/gunzips on warc files
// thereby bottlenecking the cluster. pick the first host that has
// no msg7 injection request from us still outstanding
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
Host *h = g_hostdb.getHost(i);
h->m_tmpCount = 0;
}
for ( UdpSlot *slot = g_udpServer.m_head2 ;
slot ;
slot = slot->m_next2 ) {
// skip if not injection request
if ( slot->m_msgType != 0x07 ) continue;
//if ( ! slot->m_weInitiated ) continue;
// if we did not initiate the injection request, i.e. if
// it is to us, skip it
if ( ! slot->m_callback ) continue;
// who is it from?
int32_t hostId = slot->m_hostId;
if ( hostId < 0 ) continue;
Host *h = g_hostdb.getHost ( hostId );
if ( ! h ) continue;
h->m_tmpCount++;
}
int32_t min = 999999;
Host *minh = NULL;
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
Host *h = g_hostdb.getHost(i);
if ( h->m_tmpCount == 0 ) return h;
if ( h->m_tmpCount >= min ) continue;
min = h->m_tmpCount;
minh = h;
}
if ( minh ) return minh;
// how can this happen?
return host;
}
@ -182,6 +229,9 @@ bool Msg7::sendInjectionRequestToHost ( InjectionRequest *ir ,
return log("inject: url too big.");
}
// hack fix core
if ( ir->size_metadata == 0 ) ir->ptr_metadata = NULL;
int32_t sirSize = 0;
char *sir = serializeMsg2 ( ir ,
sizeof(InjectionRequest),
@ -615,7 +665,7 @@ void sendUdpReply7 ( void *state ) {
uint32_t statColor = 0xccffcc;
if(xd->m_indexCode) {
statColor = 0x4e99e9;
statColor = 0xaaddaa;//0x4e99e9;
}
g_stats.addStat_r ( xd->m_rawUtf8ContentSize,
xd->m_injectStartTime,
@ -652,11 +702,29 @@ void sendUdpReply7 ( void *state ) {
void handleRequest7 ( UdpSlot *slot , int32_t netnice ) {
InjectionRequest *ir = (InjectionRequest *)slot->m_readBuf;
// now just supply the first guy's char ** and size ptr
deserializeMsg2 ( &ir->ptr_url, &ir->size_url );
if ( ! deserializeMsg2 ( &ir->ptr_url, &ir->size_url ) ) {
log("inject: error deserializing inject request from "
"host ip %s port %i",iptoa(slot->m_ip),(int)slot->m_port);
g_errno = EBADREQUEST;
g_udpServer.sendErrorReply(slot,g_errno);
//g_corruptCount++;
return;
}
// the url can be like xyz.com. so need to do another corruption
// test for ia
if ( ! ir->ptr_url ) { // || strncmp(ir->ptr_url,"http",4) != 0 ) {
//log("inject: trying to inject NULL or non http url.");
log("inject: trying to inject NULL url.");
g_errno = EBADURL;
//g_corruptCount++;
g_udpServer.sendErrorReply(slot,g_errno);
return;
}
CollectionRec *cr = g_collectiondb.getRec ( ir->m_collnum );
if ( ! cr ) {
@ -692,6 +760,10 @@ void handleRequest7 ( UdpSlot *slot , int32_t netnice ) {
s_injectHead = xd;
s_injectTail = xd;
}
if(ir->ptr_content && ir->ptr_content[ir->size_content - 1]) {
// XmlDoc expects this buffer to be null terminated.
char *xx=NULL;*xx=0;
}
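// (illustrative note) senders are expected to ship that trailing
// null byte, mirroring how sendPageAddUrl() sets
// size_url = gbstrlen(url)+1 so the null travels with the request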
if ( ! xd->injectDoc ( ir->ptr_url , // m_injectUrlBuf.getBufStart() ,
cr ,
@ -722,7 +794,8 @@ void handleRequest7 ( UdpSlot *slot , int32_t netnice ) {
ir->m_injectDocIp ,
ir->ptr_contentDelim,
ir->ptr_metadata,
ir->size_metadata
ir->size_metadata,
ir->size_content - 1 // there should be a null in that last byte
) )
// we blocked...
return;

@ -240,7 +240,7 @@ bool showLine ( SafeBuf *sb , char *s , int32_t len ) {
return sb->brify ( s , len ,
0 , // niceness
80 , // cols
8000 , // cols
"<br>",
false ); // isHtml?
}

@ -600,6 +600,14 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
// ! cr->m_isCustomCrawl )
// si->m_docsWanted = maxpp;
// BUT if it is a custom diffbot crawl with no &stream=1 option,
// then to prevent a results page of 1.6GB, limit it here
if ( si->m_docsWanted > 1000 && ! si->m_streamResults ) {
si->m_docsWanted = 1000;
log("query: limiting query %s without &stream=1 option to "
"%"INT32" results.",st->m_si.m_displayQuery,1000);
}
st->m_numDocIds = si->m_docsWanted;
// watch out for cowboys
@ -5008,26 +5016,31 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
// print the URL
//
////////////
StackBuf(tmpBuf);
char* displayUrl = Url::getDisplayUrl(url, &tmpBuf);
uint32_t displayUrlLen = tmpBuf.length();
// hack off the http:// if any for displaying it on screen
if ( urlLen > 8 && strncmp ( url , "http://" , 7 )==0 ) {
url += 7; urlLen -= 7; }
if ( displayUrlLen > 8 && strncmp ( displayUrl , "http://" , 7 )==0 ) {
displayUrl += 7; displayUrlLen -= 7; }
// . remove trailing /
// . only remove from root urls in case user cuts and
// pastes it for link: search
if ( url [ urlLen - 1 ] == '/' ) {
if ( displayUrl [ displayUrlLen - 1 ] == '/' ) {
// see if any other slash before us
int32_t j;
for ( j = urlLen - 2 ; j >= 0 ; j-- )
if ( url[j] == '/' ) break;
for ( j = displayUrlLen - 2 ; j >= 0 ; j-- )
if ( displayUrl[j] == '/' ) break;
// if there wasn't, we must have been a root url
// so hack off the last slash
if ( j < 0 ) urlLen--;
if ( j < 0 ) displayUrlLen--;
}
if ( si->m_format == FORMAT_HTML ) {
sb->safePrintf ("<font color=gray>" );
//sb->htmlEncode ( url , gbstrlen(url) , false );
// 20 for the date after it
sb->safeTruncateEllipsis ( url , 50 ); // cols - 30 );
sb->safeTruncateEllipsis ( displayUrl , 50 ); // cols - 30 );
// turn off the color
sb->safePrintf ( "</font>\n" );
}
@ -5058,12 +5071,12 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
if ( si->m_format == FORMAT_XML ) {
sb->safePrintf("\t\t<url><![CDATA[");
sb->safeMemcpy ( url , urlLen );
sb->safeMemcpy ( displayUrl , displayUrlLen );
sb->safePrintf("]]></url>\n");
}
if ( si->m_format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"url\":\"");
sb->jsonEncode ( url , urlLen );
sb->jsonEncode ( displayUrl , displayUrlLen );
sb->safePrintf("\",\n");
}
@ -5717,10 +5730,12 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
*/
if ( mr->size_metadataBuf && si->m_format == FORMAT_JSON) {
sb->safePrintf("\t\t\"metadata\":");
sb->safeMemcpy(mr->ptr_metadataBuf, mr->size_metadataBuf);
sb->pushChar(',');
sb->safePrintf("\t\t\"metadata\":[");
//sb->safeMemcpy(mr->ptr_metadataBuf, mr->size_metadataBuf);
sb->safeStrcpy(mr->ptr_metadataBuf);
// without this \n we seem to lose our ] i guess it gets
// backed up over
sb->safePrintf("],\n");
}

@ -2523,7 +2523,17 @@ bool sendPageAddUrl ( TcpSocket *sock , HttpRequest *hr ) {
ir->ptr_url = hr->getString("u",NULL);
if ( ! ir->ptr_url ) ir->ptr_url = hr->getString("url",NULL);
if ( ! ir->ptr_url ) ir->ptr_url = hr->getString("urls",NULL);
if ( ! ir->ptr_url ) {
g_errno = EBADURL;
doneInjectingWrapper3 ( st1 );
return true;
}
// include \0 in size
ir->size_url = gbstrlen(ir->ptr_url)+1;
// get back a short reply so we can show the status code easily
ir->m_shortReply = 1;

@ -7,6 +7,7 @@
#include "Dns.h"
#include "SafeBuf.h"
#include "Msg13.h"
#include "Linkdb.h" // Msg25Request
static void printTcpTable (SafeBuf *p,char *title,TcpServer *server);
static void printUdpTable (SafeBuf *p,char *title,UdpServer *server,
@ -554,6 +555,62 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
if ( msgType == 0x50 ) desc = "get root quality";
if ( msgType == 0x25 ) desc = "get link info";
if ( msgType == 0xfd ) desc = "proxy forward";
char *req = NULL;
int32_t reqSize = 0;
if ( s->m_callback ) {
req = s->m_sendBuf;
reqSize = s->m_sendBufSize;
}
// are we receiving the request?
else {
req = s->m_readBuf;
reqSize = s->m_readBufSize;
// if not completely read in yet...
if ( s->hasDgramsToRead ())
req = NULL;
}
SafeBuf tmp;
char *altText = "";
// MSG25
if ( req && msgType == 0x25 ) {
Msg25Request *mr = (Msg25Request *)req;
// it doesn't hurt if we call Msg25Request::deserialize
// again if it has already been called
mr->deserialize();
if ( mr->m_mode == 2 ) { // MODE_SITELINKINFO ) {
tmp.safePrintf(" title=\""
"getting site link info for "
"%s "
"in collnum %i.\n"
"sitehash64=%"UINT64" "
"waitinginline=%i"
"\""
,mr->ptr_site
,(int)mr->m_collnum
,mr->m_siteHash64
,(int)mr->m_waitingInLine
);
desc = "getting site link info";
}
else {
tmp.safePrintf(" title=\""
"getting page link info for "
"%s "
"in collnum %i."
"\""
,mr->ptr_url
,(int)mr->m_collnum
);
desc = "getting page link info";
}
}
if ( tmp.getLength() )
altText = tmp.getBufStart();
p->safePrintf ( "<tr bgcolor=#%s>"
"<td>%s</td>" // age
@ -609,12 +666,14 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
if ( ! s->m_callback ) toFrom = "from";
//"<td><a href=http://%s:%hu/cgi/15.cgi>%"INT32"</a></td>"
p->safePrintf ( "<td>0x%hhx</td>" // msgtype
"<td><nobr>%s</nobr></td>" // desc
"<td%s><nobr>"
"%s</nobr></td>" // desc
"<td><nobr>%s <a href=http://%s:%hu/"
"admin/sockets?"
"c=%s>%s</a></nobr></td>"
"<td>%s%"INT32"%s</td>" , // niceness
s->m_msgType ,
altText,
desc,
//iptoa(s->m_ip) ,
//s->m_port ,

@ -49,9 +49,18 @@ class StateStatsdb {
static time_t genDate( char *date, int32_t dateLen ) ;
static void sendReply ( void *st ) ;
static bool s_graphInUse = false;
// . returns false if blocked, otherwise true
// . sets g_errno on error
bool sendPageGraph ( TcpSocket *s, HttpRequest *r ) {
if ( s_graphInUse ) {
char *msg = "stats graph calculating for another user. "
"Try again later.";
g_httpServer.sendErrorReply(s,500,msg);
return true;
}
char *cgi;
int32_t cgiLen;
@ -121,7 +130,6 @@ bool sendPageGraph ( TcpSocket *s, HttpRequest *r ) {
st->m_endDate = st->m_endDateR;
}
g_statsdb.addDocsIndexed();
//
// this is no longer a gif, but an html graph in g_statsdb.m_sb
//
@ -130,8 +138,10 @@ bool sendPageGraph ( TcpSocket *s, HttpRequest *r ) {
st->m_samples ,
&st->m_sb2 ,
st ,
sendReply ) )
sendReply ) ) {
s_graphInUse = true;
return false;
}
// if we didn't block call it ourselves directly
sendReply ( st );
@ -139,6 +149,15 @@ bool sendPageGraph ( TcpSocket *s, HttpRequest *r ) {
return true;
}
void genStatsDataset(SafeBuf *buf, StateStatsdb *st) {
if ( ! g_conf.m_useStatsdb ) {
buf->safePrintf("{\"error\":\"statsdb disabled\"}\n" );
return;
}
}
static void writeControls ( SafeBuf *buf, StateStatsdb *st ) ;
void genStatsGraphTable(SafeBuf *buf, StateStatsdb *st) {
if ( ! g_conf.m_useStatsdb )
@ -186,6 +205,8 @@ void genStatsGraphTable(SafeBuf *buf, StateStatsdb *st) {
void sendReply ( void *state ) {
s_graphInUse = false;
StateStatsdb *st = (StateStatsdb *)state;
if ( g_errno ) {
@ -196,6 +217,10 @@ void sendReply ( void *state ) {
TcpSocket *s = st->m_socket;
if(st->m_request.getLong("json", 0)) {
//xxxxxxxxxxxxxxxxxxxxxxxxx
}
if(st->m_request.getLong("justgraph", 0)) {
SafeBuf buf( 1024*32 , "tmpbuf0" );
genStatsGraphTable(&buf, st);

@ -6800,7 +6800,7 @@ void Parms::init ( ) {
m->m_off = (char *)&cr.m_maxSearchResultsPerQuery - x;
m->m_type = TYPE_LONG;
m->m_def = "100";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_flags = 0;
m->m_page = PAGE_SEARCH;
m->m_obj = OBJ_COLL;
m++;
@ -10548,7 +10548,7 @@ void Parms::init ( ) {
m->m_off = (char *)&g_conf.m_maxHeartbeatDelay - g;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_flags = PF_CLONE; // PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
@ -12401,12 +12401,31 @@ void Parms::init ( ) {
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_flags = PF_API;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
/*
m->m_title = "files group writable";
m->m_desc = "Make all created files group writable? If you have "
"multiple user accounts starting Gigablast processes you "
"will want the files to be group writable. You will "
"need to make sure you run gigablast under the "
"primary group you want to use for gigablast administration.";
m->m_cgi = "afgw";
m->m_off = (char *)&g_conf.m_makeAllFilesGroupWritable - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_API;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
*/
m->m_title = "verify disk writes";
m->m_desc = "Read what was written in a verification step. Decreases "
"performance, but may help fight disk corruption mostly on "
@ -16655,6 +16674,21 @@ void Parms::init ( ) {
m->m_flags = PF_CLONE;
m++;
m->m_title = "index warc or arc files";
m->m_desc = "If this is true Gigablast will index .warc and .arc "
"files by injecting the pages contained in them as if they "
"were spidered with the content in the .warc or .arc file. "
"The spidered time will be taken from the archive file "
"as well.";
m->m_cgi = "indexwarcs";
m->m_off = (char *)&cr.m_indexWarcs - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
/*
m->m_title = "add url enabled";
m->m_desc = "If this is enabled others can add "
@ -21338,9 +21372,23 @@ void tryToCallCallbacks ( ) {
if ( pn->m_calledCallback ) continue;
// should we call the callback?
bool callIt = false;
// 8 seconds is enough to wait for all replies to come in
if ( now - pn->m_startTime > 8 ) callIt = true;
if ( pn->m_numReplies >= pn->m_numRequests ) callIt = true;
// sometimes we don't launch any requests to update parms
// because we are jammed up. same logic as we use for
// freeing the pn below.
if ( pn->m_numGoodReplies < pn->m_numHostsTotal )
callIt = false;
// 8 seconds is enough to wait for all replies to come in.
// a host might be dead, so we need this here lest the
// underlying page handler (i.e. sendPageCrawlbot()) never
// get called if a host is dead. if you are updating some
// parms you want the page to return.
if ( now - pn->m_startTime > 8 &&
! callIt &&
g_hostdb.hasDeadHost() )
callIt = true;
if ( ! callIt ) continue;
// callback is NULL for updating parms like spiderRoundNum
// in Spider.cpp
@ -21475,6 +21523,8 @@ bool Parms::doParmSendingLoop ( ) {
if ( ! s_headNode ) return true;
if ( g_isDumpingRdbFromMain ) return true;
if ( s_inLoop ) return true;
s_inLoop = true;
@ -21551,8 +21601,8 @@ bool Parms::doParmSendingLoop ( ) {
}
// debug log
log(LOG_INFO,"parms: sending parm request "
"to hostid %"INT32"",h->m_hostId);
log(LOG_INFO,"parms: sending parm request id %i "
"to hostid %"INT32"",(int)pn->m_parmId,h->m_hostId);
// count it
pn->m_numRequests++;
@ -22946,6 +22996,14 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
"\"temporary\" errors like DNS timeouts."
"</td></tr>"
"<tr class=poo><td>errorcode==32880</td>"
"<td>"
"If the last time it was spidered it had this "
"numeric error code. See the error codes in "
"Errno.cpp. In this particular example 32880 is "
"for EBADURL."
"</td></tr>"
"<tr class=poo><td>hastmperror</td>"
"<td>"
"This is true if the last spider attempt resulted "

@ -6019,7 +6019,6 @@ void PosdbTable::intersectLists10_r ( ) {
#define RINGBUFSIZE 4096
//#define RINGBUFSIZE 1024
unsigned char ringBuf[RINGBUFSIZE+10];
unsigned char *ringBufEnd = ringBuf + RINGBUFSIZE;
// for overflow conditions in loops below
ringBuf[RINGBUFSIZE+0] = 0xff;
ringBuf[RINGBUFSIZE+1] = 0xff;
@ -6363,18 +6362,7 @@ void PosdbTable::intersectLists10_r ( ) {
// for 'search engine'. it might save time!
// reset ring buf. make all slots 0xff. should be 1000 cycles or so.
for ( int32_t *rb = (int32_t *)ringBuf ; ; ) {
rb[0] = 0xffffffff;
rb[1] = 0xffffffff;
rb[2] = 0xffffffff;
rb[3] = 0xffffffff;
rb[4] = 0xffffffff;
rb[5] = 0xffffffff;
rb[6] = 0xffffffff;
rb[7] = 0xffffffff;
rb += 8;
if ( rb >= (int32_t *)ringBufEnd ) break;
}
memset ( ringBuf, 0xff, RINGBUFSIZE );
// now to speed up 'time enough for love' query which does not
// have many super high scoring guys on top we need a more restrictive

@ -885,6 +885,9 @@ void hdtempWrapper ( int fd , void *state ) {
// or if we haven't waited long enough
if ( now < s_nextTime ) return;
// see if this fixes the missed heartbeats
//return;
// set it
g_process.m_threadOut = true;
// . call thread to call popen
@ -968,7 +971,11 @@ float getDiskUsage ( int64_t *diskAvail ) {
char cmd[10048];
char out[1024];
sprintf(out,"%sdiskusage",g_hostdb.m_dir);
snprintf(cmd,10000,"df -ka %s | tail -1 | "
snprintf(cmd,10000,
// "ulimit -v 25000 ; "
// "ulimit -t 30 ; "
// "ulimit -a; "
"df -ka %s | tail -1 | "
"awk '{print $4\" \"$5}' > %s",
g_hostdb.m_dir,
out);
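// illustrative note (not part of this commit): with typical df -ka
// output such as
//   /dev/sda1 961302540 371019480 541430124 41% /
// the awk pipeline above leaves "541430124 41%" in the diskusage
// file, i.e. KB available followed by percent used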
@ -982,7 +989,9 @@ float getDiskUsage ( int64_t *diskAvail ) {
return -1.0; // unknown
}
// this will happen if you don't upgrade glibc to 2.2.4-32 or above
if ( err != 0 ) {
// for some reason it returns no mem but the file is ok.
// something to do with being in a thread?
if ( err != 0 && errno != ENOMEM ) {
log("build: Call to system(\"%s\") had error: %s",
cmd,mstrerror(errno));
return -1.0; // unknown
@ -1175,8 +1184,12 @@ void heartbeatWrapper ( int fd , void *state ) {
// check the "cat /proc/<pid>/status | grep SigQ" output
// to see if its overflowed. hopefully i will fix this by
// queue the signals myself in Loop.cpp.
log("db: missed heartbeat by %"INT64" ms. Num elapsed alarms = "
"%"INT32"", elapsed-100,(int32_t)(g_numAlarms - s_lastNumAlarms));
log("db: missed calling niceness 0 heartbeatWrapper "
"function by %"INT64" ms. Either you need a quickpoll "
"somewhere or a niceness 0 function is taking too long. "
"Num elapsed alarms = "
"%"INT32"", elapsed-100,(int32_t)(g_numAlarms -
s_lastNumAlarms));
s_last = now;
s_lastNumAlarms = g_numAlarms;
@ -1524,21 +1537,32 @@ bool Process::shutdown2 ( ) {
static bool s_printed = false;
// wait for all threads to return
//int32_t n = g_threads.getNumThreadsOutOrQueued() ;
int32_t n = g_threads.getNumWriteThreadsOut();
waitLoop:
// wait for all 'write' threads to be done. they can be done
// and just waiting for a join, in which case we won't count them.
int32_t n = g_threads.getNumActiveWriteUnlinkRenameThreadsOut();
// we can't wait for the write thread if we had a seg fault, but
// do print a msg in the log
if ( n != 0 && m_urgent ) {
log(LOG_INFO,"gb: Has %"INT32" write/unlink/rename "
"threads active. Waiting.",n);
sleep(1);
goto waitLoop;
}
if ( n != 0 && ! m_urgent ) {
log(LOG_INFO,"gb: Has %"INT32" write threads out. Waiting for "
log(LOG_INFO,"gb: Has %"INT32" write/unlink/rename "
"threads out. Waiting for "
"them to finish.",n);
return false;
}
else if ( ! s_printed && ! m_urgent ) {
s_printed = true;
log(LOG_INFO,"gb: No write threads out.");
log(LOG_INFO,"gb: No write/unlink/rename threads active.");
}
// disable all spidering
// we can exit while spiders are in the queue because
// if they are in the middle of being added they will be
@ -1650,11 +1674,18 @@ bool Process::shutdown2 ( ) {
// urgent means we need to dump core, SEGV or something
if ( m_urgent ) {
// log it
log("gb: Dumping core after saving.");
// at least destroy the page caches that have shared memory
// because they seem to not clean it up
resetPageCaches();
if ( g_threads.amThread() ) {
uint64_t tid = (uint64_t)getpidtid();
log("gb: calling abort from thread with tid of "
"%"UINT64" (thread)",tid);
}
else {
pid_t pid = getpid();
log("gb: calling abort from main process "
"with pid of %"UINT64" (main process)",
(uint64_t)pid);
}
// let's ensure our core file can dump
struct rlimit lim;
@ -1662,9 +1693,48 @@ bool Process::shutdown2 ( ) {
if ( setrlimit(RLIMIT_CORE,&lim) )
log("gb: setrlimit: %s.", mstrerror(errno) );
// if we are in this code then we are the main process
// and not a thread.
// see if this makes it so we always dump core again.
// joins with all threads, too.
log("gb: Joining with all threads");
g_threads.killAllThreads();
// log it
log("gb: Dumping core after saving.");
// at least destroy the page caches that have shared memory
// because they seem to not clean it up
//resetPageCaches();
// use the default segmentation fault handler which should
// dump core rather than call abort() which doesn't always
// work, perhaps because of threads doing something
int signum = SIGSEGV;
signal(signum, SIG_DFL);
kill(getpid(), signum);
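// the pattern in isolation (illustrative): restoring SIG_DFL then
// re-raising lets the kernel's default action write the core file,
// which a plain abort() did not reliably do with threads running:
//   signal(SIGSEGV, SIG_DFL);
//   kill(getpid(), SIGSEGV);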
// this is the trick: it will trigger the core dump by
// calling the original SIGSEGV handler.
//int signum = SIGSEGV;
//signal(signum, SIG_DFL);
//kill(getpid(), signum);
// try resetting the SEGV sig handle to default. when
// we return it should call the default handler.
// struct sigaction sa;
// sigemptyset (&sa.sa_mask);
// sa.sa_flags = SA_RESETHAND;
// sa.sa_sigaction = NULL;
// sigaction ( SIGSEGV, &sa, 0 ) ;
// return true;
// . force an abnormal termination which will cause a core dump
// . do not dump core on SIGHUP signals any more though
abort();
//abort();
// return from this signal handler so we can execute
// original SIGSEGV handler right afterwards
// default handler should be called after we return now
// keep compiler happy
return true;
}
@ -1674,6 +1744,12 @@ bool Process::shutdown2 ( ) {
// cleanup threads, this also launches them too
g_threads.timedCleanUp(0x7fffffff,MAX_NICENESS);
// there's no write/unlink/rename threads active,
// so just kill the remaining threads and join
// with them so we can try to get a proper exit status code
log("gb: Joining with all threads");
g_threads.killAllThreads();
// wait for all threads to complete...
//int32_t n = g_threads.getNumThreadsOutOrQueued() ;
//if ( n > 0 )

@ -1866,7 +1866,7 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
ff.safePrintf("%strash/profile.txt",g_hostdb.m_dir);
char *filename = ff.getBufStart();
unlink ( filename );
int fd = open ( filename , O_RDWR | O_CREAT , S_IRWXU );
int fd = open ( filename , O_RDWR | O_CREAT , getFileCreationFlags() );
if ( fd < 0 ) {
sb->safePrintf("FAILED TO OPEN %s for writing: %s"
,ff.getBufStart(),mstrerror(errno));
@ -2094,7 +2094,7 @@ Profiler::printRealTimeInfo(SafeBuf *sb,
ff.reset();
ff.safePrintf("%strash/qp.txt",g_hostdb.m_dir);
filename = ff.getBufStart();
fd = open ( filename , O_RDWR | O_CREAT , S_IRWXU );
fd = open ( filename , O_RDWR | O_CREAT , getFileCreationFlags() );
//fd = open ( filename , O_RDWR | O_CREAT , S_IRWXU );
if ( fd < 0 ) {
sb->safePrintf("FAILED TO OPEN %s for writing: %s"
,ff.getBufStart(),strerror(errno));

268
Punycode.cpp Normal file

@ -0,0 +1,268 @@
#include "Punycode.h"
#include <string.h>
/* #include "punycode.h" */
/*** Bootstring parameters for Punycode ***/
enum { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
initial_bias = 72, initial_n = 0x80, delimiter = 0x2D };
/* basic(cp) tests whether cp is a basic code point: */
#define basic(cp) ((punycode_uint)(cp) < 0x80)
/* delim(cp) tests whether cp is a delimiter: */
#define delim(cp) ((cp) == delimiter)
/* decode_digit(cp) returns the numeric value of a basic code */
/* point (for use in representing integers) in the range 0 to */
/* base-1, or base if cp does not represent a value. */
static punycode_uint decode_digit(punycode_uint cp)
{
return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 :
cp - 97 < 26 ? cp - 97 : base;
}
/* encode_digit(d,flag) returns the basic code point whose value */
/* (when used for representing integers) is d, which needs to be in */
/* the range 0 to base-1. The lowercase form is used unless flag is */
/* nonzero, in which case the uppercase form is used. The behavior */
/* is undefined if flag is nonzero and digit d has no uppercase form. */
static char encode_digit(punycode_uint d, int flag)
{
return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
/* 0..25 map to ASCII a..z or A..Z */
/* 26..35 map to ASCII 0..9 */
}
/* flagged(bcp) tests whether a basic code point is flagged */
/* (uppercase). The behavior is undefined if bcp is not a */
/* basic code point. */
#define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
/* encode_basic(bcp,flag) forces a basic code point to lowercase */
/* if flag is zero, uppercase if flag is nonzero, and returns */
/* the resulting code point. The code point is unchanged if it */
/* is caseless. The behavior is undefined if bcp is not a basic */
/* code point. */
static char encode_basic(punycode_uint bcp, int flag)
{
bcp -= (bcp - 97 < 26) << 5;
return bcp + ((!flag && (bcp - 65 < 26)) << 5);
}
/*** Platform-specific constants ***/
/* maxint is the maximum value of a punycode_uint variable: */
static const punycode_uint maxint = -1;
/* Because maxint is unsigned, -1 becomes the maximum value. */
/*** Bias adaptation function ***/
static punycode_uint adapt(
punycode_uint delta, punycode_uint numpoints, int firsttime )
{
punycode_uint k;
delta = firsttime ? delta / damp : delta >> 1;
/* delta >> 1 is a faster way of doing delta / 2 */
delta += delta / numpoints;
for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base) {
delta /= base - tmin;
}
return k + (base - tmin + 1) * delta / (delta + skew);
}
/*** Main encode function ***/
enum punycode_status punycode_encode(
size_t input_length_orig,
const punycode_uint input[],
const unsigned char case_flags[],
size_t *output_length,
char output[] )
{
punycode_uint input_length, n, delta, h, b, bias, j, m, q, k, t;
size_t out, max_out;
/* The Punycode spec assumes that the input length is the same type */
/* of integer as a code point, so we need to convert the size_t to */
/* a punycode_uint, which could overflow. */
if (input_length_orig > maxint) return punycode_overflow;
input_length = (punycode_uint) input_length_orig;
/* Initialize the state: */
n = initial_n;
delta = 0;
out = 0;
max_out = *output_length;
bias = initial_bias;
/* Handle the basic code points: */
for (j = 0; j < input_length; ++j) {
if (basic(input[j])) {
if (max_out - out < 2) return punycode_big_output;
output[out++] = case_flags ?
encode_basic(input[j], case_flags[j]) : (char) input[j];
}
/* else if (input[j] < n) return punycode_bad_input; */
/* (not needed for Punycode with unsigned code points) */
}
h = b = (punycode_uint) out;
/* cannot overflow because out <= input_length <= maxint */
/* h is the number of code points that have been handled, b is the */
/* number of basic code points, and out is the number of ASCII code */
/* points that have been output. */
if (b > 0) output[out++] = delimiter;
/* Main encoding loop: */
while (h < input_length) {
/* All non-basic code points < n have been */
/* handled already. Find the next larger one: */
for (m = maxint, j = 0; j < input_length; ++j) {
/* if (basic(input[j])) continue; */
/* (not needed for Punycode) */
if (input[j] >= n && input[j] < m) m = input[j];
}
/* Increase delta enough to advance the decoder's */
/* <n,i> state to <m,0>, but guard against overflow: */
if (m - n > (maxint - delta) / (h + 1)) return punycode_overflow;
delta += (m - n) * (h + 1);
n = m;
for (j = 0; j < input_length; ++j) {
/* Punycode does not need to check whether input[j] is basic: */
if (input[j] < n /* || basic(input[j]) */ ) {
if (++delta == 0) return punycode_overflow;
}
if (input[j] == n) {
/* Represent delta as a generalized variable-length integer: */
for (q = delta, k = base; ; k += base) {
if (out >= max_out) return punycode_big_output;
t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
k >= bias + tmax ? tmax : k - bias;
if (q < t) break;
output[out++] = encode_digit(t + (q - t) % (base - t), 0);
q = (q - t) / (base - t);
}
output[out++] = encode_digit(q, case_flags && case_flags[j]);
bias = adapt(delta, h + 1, h == b);
delta = 0;
++h;
}
}
++delta, ++n;
}
*output_length = out;
return punycode_success;
}
/*** Main decode function ***/
enum punycode_status punycode_decode(
size_t input_length,
const char input[],
size_t *output_length,
punycode_uint output[],
unsigned char case_flags[] )
{
punycode_uint n, out, i, max_out, bias, oldi, w, k, digit, t;
size_t b, j, in;
/* Initialize the state: */
n = initial_n;
out = i = 0;
max_out = *output_length > maxint ? maxint
: (punycode_uint) *output_length;
bias = initial_bias;
/* Handle the basic code points: Let b be the number of input code */
/* points before the last delimiter, or 0 if there is none, then */
/* copy the first b code points to the output. */
for (b = j = 0; j < input_length; ++j) if (delim(input[j])) b = j;
if (b > max_out) return punycode_big_output;
for (j = 0; j < b; ++j) {
if (case_flags) case_flags[out] = flagged(input[j]);
if (!basic(input[j])) return punycode_bad_input;
output[out++] = input[j];
}
/* Main decoding loop: Start just after the last delimiter if any */
/* basic code points were copied; start at the beginning otherwise. */
for (in = b > 0 ? b + 1 : 0; in < input_length; ++out) {
/* in is the index of the next ASCII code point to be consumed, */
/* and out is the number of code points in the output array. */
/* Decode a generalized variable-length integer into delta, */
/* which gets added to i. The overflow checking is easier */
/* if we increase i as we go, then subtract off its starting */
/* value at the end to obtain delta. */
for (oldi = i, w = 1, k = base; ; k += base) {
if (in >= input_length) return punycode_bad_input;
digit = decode_digit(input[in++]);
if (digit >= base) return punycode_bad_input;
if (digit > (maxint - i) / w) return punycode_overflow;
i += digit * w;
t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
k >= bias + tmax ? tmax : k - bias;
if (digit < t) break;
if (w > maxint / (base - t)) return punycode_overflow;
w *= (base - t);
}
bias = adapt(i - oldi, out + 1, oldi == 0);
/* i was supposed to wrap around from out+1 to 0, */
/* incrementing n each time, so we'll fix that now: */
if (i / (out + 1) > maxint - n) return punycode_overflow;
n += i / (out + 1);
i %= (out + 1);
/* Insert n at position i of the output: */
/* not needed for Punycode: */
/* if (basic(n)) return punycode_bad_input; */
if (out >= max_out) return punycode_big_output;
if (case_flags) {
memmove(case_flags + i + 1, case_flags + i, out - i);
/* Case of last ASCII code point determines case flag: */
case_flags[i] = flagged(input[in - 1]);
}
memmove(output + i + 1, output + i, (out - i) * sizeof *output);
output[i++] = n;
}
*output_length = (size_t) out;
/* cannot overflow because out <= old value of *output_length */
return punycode_success;
}

154
Punycode.h Normal file

@ -0,0 +1,154 @@
/*
punycode-sample.c 2.0.0 (2004-Mar-21-Sun)
http://www.nicemice.net/idn/
Adam M. Costello
http://www.nicemice.net/amc/
This is ANSI C code (C89) implementing Punycode 1.0.x.
*/
#include <limits.h>
#include <stddef.h>
enum punycode_status {
punycode_success = 0,
punycode_bad_input = 1, /* Input is invalid. */
punycode_big_output = 2, /* Output would exceed the space provided. */
punycode_overflow = 3 /* Wider integers needed to process input. */
};
/* punycode_uint needs to be unsigned and needs to be */
/* at least 26 bits wide. The particular type can be */
/* specified by defining PUNYCODE_UINT, otherwise a */
/* suitable type will be chosen automatically. */
#ifdef PUNYCODE_UINT
typedef PUNYCODE_UINT punycode_uint;
#elif UINT_MAX >= (1 << 26) - 1
typedef unsigned int punycode_uint;
#else
typedef unsigned long punycode_uint;
#endif
enum punycode_status punycode_encode(
size_t, /* input_length */
const punycode_uint [], /* input */
const unsigned char [], /* case_flags */
size_t *, /* output_length */
char [] /* output */
);
/*
punycode_encode() converts a sequence of code points (presumed to be
Unicode code points) to Punycode.
Input arguments (to be supplied by the caller):
input_length
The number of code points in the input array and the number
of flags in the case_flags array.
input
An array of code points. They are presumed to be Unicode
code points, but that is not strictly necessary. The
array contains code points, not code units. UTF-16 uses
code units D800 through DFFF to refer to code points
10000..10FFFF. The code points D800..DFFF do not occur in
any valid Unicode string. The code points that can occur in
Unicode strings (0..D7FF and E000..10FFFF) are also called
Unicode scalar values.
case_flags
A null pointer or an array of boolean values parallel to
the input array. Nonzero (true, flagged) suggests that the
corresponding Unicode character be forced to uppercase after
being decoded (if possible), and zero (false, unflagged)
suggests that it be forced to lowercase (if possible).
ASCII code points (0..7F) are encoded literally, except that
ASCII letters are forced to uppercase or lowercase according
to the corresponding case flags. If case_flags is a null
pointer then ASCII letters are left as they are, and other
code points are treated as unflagged.
Output arguments (to be filled in by the function):
output
An array of ASCII code points. It is *not* null-terminated;
it will contain zeros if and only if the input contains
zeros. (Of course the caller can leave room for a
terminator and add one if needed.)
Input/output arguments (to be supplied by the caller and overwritten
by the function):
output_length
The caller passes in the maximum number of ASCII code points
that it can receive. On successful return it will contain
the number of ASCII code points actually output.
Return value:
Can be any of the punycode_status values defined above except
punycode_bad_input. If not punycode_success, then output_size
and output might contain garbage.
*/
enum punycode_status punycode_decode(
size_t, /* input_length */
const char [], /* input */
size_t *, /* output_length */
punycode_uint [], /* output */
unsigned char [] /* case_flags */
);
/*
punycode_decode() converts Punycode to a sequence of code points
(presumed to be Unicode code points).
Input arguments (to be supplied by the caller):
input_length
The number of ASCII code points in the input array.
input
An array of ASCII code points (0..7F).
Output arguments (to be filled in by the function):
output
An array of code points like the input argument of
punycode_encode() (see above).
case_flags
A null pointer (if the flags are not needed by the caller)
or an array of boolean values parallel to the output array.
Nonzero (true, flagged) suggests that the corresponding
Unicode character be forced to uppercase by the caller (if
possible), and zero (false, unflagged) suggests that it
be forced to lowercase (if possible). ASCII code points
(0..7F) are output already in the proper case, but their
flags will be set appropriately so that applying the flags
would be harmless.
Input/output arguments (to be supplied by the caller and overwritten
by the function):
output_length
The caller passes in the maximum number of code points
that it can receive into the output array (which is also
the maximum number of flags that it can receive into the
case_flags array, if case_flags is not a null pointer). On
successful return it will contain the number of code points
actually output (which is also the number of flags actually
output, if case_flags is not a null pointer). The decoder
will never need to output more code points than the number
of ASCII code points in the input, because of the way the
encoding is defined. The number of code points output
cannot exceed the maximum possible value of a punycode_uint,
even if the supplied output_length is greater than that.
Return value:
Can be any of the punycode_status values defined above. If not
punycode_success, then output_length, output, and case_flags
might contain garbage.
*/
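A quick self-test of the API documented above (an illustrative sketch, not
part of this commit; "bücher" is the classic RFC 3492 / IDNA example whose
encoded tail is bcher-kva, as in xn--bcher-kva):
#include <stdio.h>
#include "Punycode.h"
int main ( ) {
/* the six code points of "bücher"; 0xFC is u with umlaut */
punycode_uint in[6] = { 'b', 0xFC, 'c', 'h', 'e', 'r' };
char out[64];
size_t outlen = sizeof(out) - 1; /* leave room for our own terminator */
if ( punycode_encode ( 6, in, NULL, &outlen, out ) != punycode_success )
return 1;
out[outlen] = '\0'; /* the API does not null-terminate */
printf ( "%s\n", out ); /* expected: bcher-kva */
return 0;
}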

@ -3212,7 +3212,7 @@ bool Query::setQWords ( char boolFlag ,
// no punct, alnum only
if ( words.isPunct(i) ) continue;
// skip if not a stop word
if ( ! bits.m_bits[i] & D_IS_STOPWORD ) continue;
if ( ! (bits.m_bits[i] & D_IS_STOPWORD) ) continue;
// continue if you can still pair across prev punct word
if ( bits.m_bits[i-1] & D_CAN_PAIR_ACROSS ) continue;
// otherwise, we can now start a phrase

70
Rdb.cpp

@ -374,16 +374,16 @@ bool Rdb::updateToRebuildFiles ( Rdb *rdb2 , char *coll ) {
char dstDir[256];
// make the trash dir if not there
sprintf ( dstDir , "%s/trash/" , g_hostdb.m_dir );
int32_t status = ::mkdir ( dstDir ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) ;
int32_t status = ::mkdir ( dstDir , getDirCreationFlags() );
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) ;
// we have to create it
sprintf ( dstDir , "%s/trash/rebuilt%"UINT32"/" , g_hostdb.m_dir , t );
status = ::mkdir ( dstDir ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) ;
status = ::mkdir ( dstDir , getDirCreationFlags() );
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) ;
if ( status && errno != EEXIST ) {
g_errno = errno;
return log("repair: Could not mkdir(%s): %s",dstDir,
@ -643,10 +643,10 @@ bool Rdb::deleteAllRecs ( collnum_t collnum ) {
bool makeTrashDir() {
char trash[1024];
sprintf(trash, "%strash/",g_hostdb.m_dir);
if ( ::mkdir ( trash,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 ) {
if ( ::mkdir ( trash , getDirCreationFlags() ) ) {
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) == -1 ) {
if ( errno != EEXIST ) {
log("dir: mkdir %s had error: %s",
trash,mstrerror(errno));
@ -1424,10 +1424,12 @@ bool Rdb::gotTokenForDump ( ) {
RdbBucket *b = m_buckets.m_buckets[i];
collnum_t cn = b->getCollnum();
int32_t nk = b->getNumKeys();
for ( int32_t j = 0 ; j < nk; j++ ) {
cr = g_collectiondb.m_recs[cn];
if ( cr ) cr->m_treeCount++;
}
// for ( int32_t j = 0 ; j < nk; j++ ) {
// cr = g_collectiondb.m_recs[cn];
// if ( cr ) cr->m_treeCount++;
// }
cr = g_collectiondb.m_recs[cn];
if ( cr ) cr->m_treeCount += nk;
}
}
@ -1542,6 +1544,20 @@ bool Rdb::dumpCollLoop ( ) {
"available secondary id for titledb: %s." ,
mstrerror(g_errno) );
}
// if we add too many files then we can not merge, because the merge op
// needs to add a file and it calls addNewFile() too
static int32_t s_flag = 0;
if ( base->m_numFiles + 1 >= MAX_RDB_FILES ) {
if ( s_flag < 10 )
log("db: could not dump tree to disk for cn="
"%i %s because it has %"INT32" files on disk. "
"Need to wait for merge operation.",
(int)m_dumpCollnum,m_dbname,base->m_numFiles);
s_flag++;
goto loop;
}
// this file must not exist already, we are dumping the tree into it
m_fn = base->addNewFile ( id2 ) ;
if ( m_fn < 0 ) return log(LOG_LOGIC,"db: rdb: Failed to add new file "
@ -1797,6 +1813,8 @@ void attemptMergeAll2 ( ) {
tryLoop:
QUICKPOLL(niceness);
// if a collection got deleted, reset this to 0
if ( s_lastCollnum >= g_collectiondb.m_numRecs ) {
s_lastCollnum = 0;
@ -1836,6 +1854,26 @@ void attemptMergeAll2 ( ) {
if ( base && base->attemptMerge(niceness,force,true) )
return;
// also try to merge on rdbs being rebuilt
base = cr->getBasePtr(RDB2_POSDB2);
if ( base && base->attemptMerge(niceness,force,true) )
return;
base = cr->getBasePtr(RDB2_TITLEDB2);
if ( base && base->attemptMerge(niceness,force,true) )
return;
base = cr->getBasePtr(RDB2_TAGDB2);
if ( base && base->attemptMerge(niceness,force,true) )
return;
base = cr->getBasePtr(RDB2_LINKDB2);
if ( base && base->attemptMerge(niceness,force,true) )
return;
base = cr->getBasePtr(RDB2_SPIDERDB2);
if ( base && base->attemptMerge(niceness,force,true) )
return;
base = cr->getBasePtr(RDB2_CLUSTERDB2);
if ( base && base->attemptMerge(niceness,force,true) )
return;
// try next collection
s_lastCollnum++;

@ -165,10 +165,10 @@ bool RdbBase::init ( char *dir ,
}
// make a special "cat" dir for it if we need to
sprintf ( tmp , "%s%s" , dir , dbname );
int32_t status = ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH );
int32_t status = ::mkdir ( tmp , getDirCreationFlags() );
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH );
if ( status == -1 && errno != EEXIST && errno )
return log("db: Failed to make directory %s: %s.",
tmp,mstrerror(errno));
@ -186,9 +186,9 @@ bool RdbBase::init ( char *dir ,
// make a special "cat" dir for it if we need to
sprintf ( tmp , "%scat" , dir );
if ( ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
getDirCreationFlags() ) == -1 && errno != EEXIST )
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
return log("db: Failed to make directory %s: %s.",
tmp,mstrerror(errno));
}
@ -202,9 +202,9 @@ bool RdbBase::init ( char *dir ,
// make a special "stats" dir for it if necessary
sprintf ( tmp , "%sstats" , dir );
if ( ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
getDirCreationFlags() ) == -1 && errno != EEXIST )
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
return log( "db: Failed to make directory %s: %s.",
tmp, mstrerror( errno ) );
}
@ -218,9 +218,9 @@ bool RdbBase::init ( char *dir ,
// make a special "stats" dir for it if necessary
sprintf ( tmp , "%saccess" , dir );
if ( ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
getDirCreationFlags() ) == -1 && errno != EEXIST )
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
return log( "db: Failed to make directory %s: %s.",
tmp, mstrerror( errno ) );
}
@ -234,9 +234,9 @@ bool RdbBase::init ( char *dir ,
// make a special "stats" dir for it if necessary
sprintf ( tmp , "%ssyncdb" , dir );
if ( ::mkdir ( tmp ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
getDirCreationFlags() ) == -1 && errno != EEXIST )
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) == -1 && errno != EEXIST )
return log( "db: Failed to make directory %s: %s.",
tmp, mstrerror( errno ) );
}
@ -1607,8 +1607,8 @@ bool RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
if ( ! m_mergeUrgent && numFiles - 14 >= m_minToMerge ) {
m_mergeUrgent = true;
if ( doLog )
log(LOG_INFO,"merge: Entering urgent merge mode for %s.",
m_dbname);
log(LOG_INFO,"merge: Entering urgent merge mode for %s "
"coll=%s.", m_dbname,m_coll);
g_numUrgentMerges++;
}
@ -1811,7 +1811,8 @@ void RdbBase::gotTokenForMerge ( ) {
m_mergeUrgent = true;
if ( m_doLog )
log(LOG_INFO,
"merge: Entering urgent merge mode for %s.", m_dbname);
"merge: Entering urgent merge mode (2) for %s coll=%s.",
m_dbname,m_coll);
g_numUrgentMerges++;
}
// tfndb has his own merge class since titledb merges write tfndb recs
@ -1892,8 +1893,13 @@ void RdbBase::gotTokenForMerge ( ) {
// sanity check
if ( n <= 1 ) {
log(LOG_LOGIC,"merge: attemptMerge: Resuming. bad "
"engineer");
"engineer for %s coll=%s",m_dbname,m_coll);
//g_msg35.releaseToken();
if ( m_mergeUrgent ) {
log("merge: leaving urgent merge mode");
g_numUrgentMerges--;
m_mergeUrgent = false;
}
return false;
}
// make a log note

@ -338,10 +338,10 @@ class RdbBase {
// . older files are listed first (lower fileIds)
// . filenames should include the directory (full filenames)
// . TODO: RdbMgr should control what rdb gets merged?
BigFile *m_files [ MAX_RDB_FILES ];
int32_t m_fileIds [ MAX_RDB_FILES ];
int32_t m_fileIds2 [ MAX_RDB_FILES ]; // for titledb/tfndb linking
RdbMap *m_maps [ MAX_RDB_FILES ];
BigFile *m_files [ MAX_RDB_FILES+1 ];
int32_t m_fileIds [ MAX_RDB_FILES+1 ];
int32_t m_fileIds2 [ MAX_RDB_FILES+1 ]; // for titledb/tfndb linking
RdbMap *m_maps [ MAX_RDB_FILES+1 ];
int32_t m_numFiles;
// this class contains a ptr to us

@ -2060,8 +2060,10 @@ bool RdbBuckets::fastSave_r() {
char s[1024];
sprintf ( s , "%s/%s-buckets-saving.dat", m_dir , m_dbname );
int fd = ::open ( s ,
O_RDWR | O_CREAT | O_TRUNC , S_IRUSR | S_IWUSR |
S_IRGRP | S_IWGRP | S_IROTH);
O_RDWR | O_CREAT | O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR | S_IWUSR |
// S_IRGRP | S_IWGRP | S_IROTH);
if ( fd < 0 ) {
m_saveErrno = errno;
return log("db: Could not open %s for writing: %s.",

@ -1484,7 +1484,7 @@ bool RdbCache::save_r ( ) {
//f.set ( g_hostdb.m_dir , filename );
// open the file
//if ( ! f.open ( O_RDWR | O_CREAT ) )
int fd = open ( filename , O_RDWR | O_CREAT , S_IRWXU );
int fd = open ( filename , O_RDWR | O_CREAT , getFileCreationFlags() );
if ( fd < 0 )
return log("db: Had opening file to save cache to: %s.",
mstrerror(errno));

@ -1340,6 +1340,7 @@ bool RdbList::constrain ( char *startKey ,
// ensure we our first key is 12 bytes if m_useHalfKeys is true
if ( m_useHalfKeys && isHalfBitOn ( m_list ) ) {
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: First key is 6 bytes. Corrupt data "
"file.");
}
@ -1347,12 +1348,14 @@ bool RdbList::constrain ( char *startKey ,
// sanity. hint key should be full key
if ( m_ks == 18 && hintKey && (hintKey[0]&0x06)){
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: Hint key is corrupt.");
//char *xx=NULL;*xx=0;}
}
if ( hintOffset > m_listSize ) { //char *xx=NULL;*xx=0; }
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: Hint offset %"INT32" > %"INT32" is corrupt."
,hintOffset,
m_listSize);
@ -1418,6 +1421,7 @@ bool RdbList::constrain ( char *startKey ,
m_listPtrHi = savelistPtrHi ;
m_listPtrLo = savelistPtrLo ;
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: Got record size of %"INT32" < 0. "
"Corrupt data file.",recSize);
}
@ -1525,13 +1529,16 @@ bool RdbList::constrain ( char *startKey ,
if ( minRecSizes < 0 ) maxPtr = m_listEnd;
// size of last rec we read in the list
int32_t size = -1 ;
// char *savedp = p;
// if ( savedp == (char *)0x001 ) { char *xx=NULL;*xx=0;}
// advance until endKey or minRecSizes kicks us out
//while ( p < m_listEnd && getKey(p) <= endKey && p < maxPtr ) {
while ( p < m_listEnd ) {
QUICKPOLL(niceness);
getKey(p,k);
if ( KEYCMP(k,endKey,m_ks)>0 ) break;
if ( p >= maxPtr ) break;
// only break out if we've set the size AND are >= maxPtr
if ( p >= maxPtr && size > 0 ) break;
size = getRecSize ( p );
// watch out for corruption, let Msg5 fix it
if ( size < 0 ) {
@ -1540,6 +1547,7 @@ bool RdbList::constrain ( char *startKey ,
m_listPtrLo = savelistPtrLo;
m_listPtr = savelist;
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: Corrupt record size of %"INT32" "
"bytes in %s.",size,filename);
}
@ -1559,6 +1567,7 @@ bool RdbList::constrain ( char *startKey ,
m_listPtrLo = savelistPtrLo;
m_listPtr = savelist;
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: Corrupt record size of %"INT32" "
"bytes in %s.",size,filename);
}
@ -1580,6 +1589,7 @@ bool RdbList::constrain ( char *startKey ,
m_listPtrLo = savelistPtrLo;
m_listPtr = savelist;
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return log("db: Corrupt record size of %"INT32" "
"bytes in %s.",size,filename);
}
@ -1587,17 +1597,23 @@ bool RdbList::constrain ( char *startKey ,
//endKey = getKey ( p - size );
getKey(p-size,endKey);
}
// bitch if size is -1 still
if ( size == -1 ) {
log("db: Corruption. Encountered bad endkey in %s.",filename);
char *xx=NULL;*xx=0;
m_list = savelist;
m_listPtrHi = savelistPtrHi;
m_listPtrLo = savelistPtrLo;
m_listPtr = savelist;
g_errno = ECORRUPTDATA;
g_numCorrupt++;
return false;
}
// cut the tail
m_listEnd = p;
m_listSize = m_listEnd - m_list;
// bitch if size is -1 still
if ( size == -1 ) {
log("db: Encountered bad endkey in %s. listSize=%"INT32"",
filename,m_listSize);
char *xx=NULL;*xx=0;
}
// otherwise store the last key if size is not -1
else if ( m_listSize > 0 ) {
if ( m_listSize > 0 ) {
//m_lastKey = getKey ( p - size );
getKey(p-size,m_lastKey);
m_lastKeyIsValid = true;

@ -2488,8 +2488,10 @@ bool RdbTree::fastSave_r() {
char s[1024];
sprintf ( s , "%s/%s-saving.dat", m_dir , m_dbname );
int fd = ::open ( s ,
O_RDWR | O_CREAT | O_TRUNC , S_IRUSR | S_IWUSR |
S_IRGRP | S_IWGRP | S_IROTH);
O_RDWR | O_CREAT | O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR | S_IWUSR |
// S_IRGRP | S_IWGRP | S_IROTH);
if ( fd < 0 ) {
m_saveErrno = errno;
return log("db: Could not open %s for writing: %s.",

@ -198,6 +198,15 @@ bool SafeBuf::safeMemcpy ( Words *w , int32_t a , int32_t b ) {
return safeMemcpy ( p , pend - p );
}
char* SafeBuf::pushStr (char* str, uint32_t len) {
int32_t initLen = m_length;
bool status = safeMemcpy ( str , len );
status &= nullTerm();
m_length++; //count the null so it isn't overwritten
if(!status) return NULL;
return m_buf + initLen;
}
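A minimal usage sketch (illustrative, not part of this commit):
static void pushStrExample ( ) {
SafeBuf sb;
// copies "hello" plus a terminating null; the null is counted
// in m_length so a later push will not overwrite it
char *p = sb.pushStr ( (char *)"hello" , 5 );
if ( p ) log("test: pushed str = %s",p);
}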
bool SafeBuf::pushPtr ( void *ptr ) {
if ( m_length + (int32_t)sizeof(char *) > m_capacity )
if(!reserve(sizeof(char *)))//2*m_capacity + 1))
@ -431,7 +440,7 @@ bool SafeBuf::reserve(int32_t i , char *label, bool clearIt ) {
//buffer size.
bool SafeBuf::reserve2x(int32_t i, char *label) {
//watch out for overflow!
if((m_capacity << 1) + i < 0) return false;
if((m_capacity << 1) + i < m_capacity) return false;
if(i + m_length >= m_capacity)
return reserve(m_capacity + i,label);
else return true;
@ -449,8 +458,9 @@ int32_t SafeBuf::save ( char *fullFilename ) {
int32_t SafeBuf::dumpToFile(char *filename ) {
retry22:
int32_t fd = open ( filename , O_CREAT | O_WRONLY | O_TRUNC,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
int32_t fd = open ( filename , O_CREAT | O_WRONLY | O_TRUNC ,
getFileCreationFlags() );
//S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry22;
@ -484,8 +494,9 @@ int32_t SafeBuf::safeSave (char *filename ) {
fn.safePrintf( "%s.saving",filename );
int32_t fd = open ( fn.getBufStart() ,
O_CREAT | O_WRONLY | O_TRUNC,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
O_CREAT | O_WRONLY | O_TRUNC ,
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry22;
@ -571,8 +582,8 @@ int32_t SafeBuf::fillFromFile(char *filename) {
reserve(results.st_size+1);
retry:
int32_t fd = open ( filename , O_RDONLY,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
int32_t fd = open ( filename , O_RDONLY , getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry;
@ -862,6 +873,22 @@ bool SafeBuf::utf8Encode2(char *s, int32_t len, bool encodeHTML,int32_t nicenes
return htmlEncode(m_length-tmp,niceness);
}
bool SafeBuf::utf32Encode(UChar32* codePoints, int32_t cpLen) {
if(m_encoding != csUTF8) return safePrintf("FIXME %s:%i", __FILE__, __LINE__);
int32_t need = 0;
for(int32_t i = 0; i < cpLen;i++) need += utf8Size(codePoints[i]);
if(!reserve(need)) return false;
for(int32_t i = 0; i < cpLen;i++) {
m_length += ::utf8Encode(codePoints[i], m_buf + m_length);
}
return true;
}
/*
bool SafeBuf::utf32Encode(UChar32 c) {
if(!reserve2x(8)) return false;
@ -3666,3 +3693,12 @@ bool SafeBuf::hasDigits() {
if ( is_digit(m_buf[i]) ) return true;
return false;
}
int32_t SafeBuf::indexOf(char c) {
char* p = m_buf;
char* pend = m_buf + m_length;
while (p < pend && *p != c) p++;
if (p == pend) return -1;
return p - m_buf;
}
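A minimal usage sketch (illustrative, not part of this commit):
static void indexOfExample ( ) {
SafeBuf sb;
sb.safePrintf("key=value");
int32_t pos = sb.indexOf('='); // 3 here; returns -1 when absent
log("test: found '=' at offset %"INT32"",pos);
}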

@ -259,6 +259,7 @@ public:
int32_t niceness=0);
bool latin1Encode(char *s, int32_t len, bool htmlEncode=false,
int32_t niceness=0);
bool utf32Encode(UChar32* codePoints, int32_t cpLen);
//bool utf16Encode(UChar *s, int32_t len, bool htmlEncode=false);
//bool utf16Encode(char *s, int32_t len, bool htmlEncode=false) {
// return utf16Encode((UChar*)s, len>>1, htmlEncode); };
@ -327,6 +328,7 @@ public:
return true;
};
int32_t indexOf(char c);
bool safeCdataMemcpy(char *s, int32_t len);
bool pushChar (char i) {
@ -346,6 +348,7 @@ public:
// hack off trailing 0's
bool printFloatPretty ( float f ) ;
char* pushStr (char* str, uint32_t len);
bool pushPtr ( void *ptr );
bool pushLong (int32_t i);
bool pushLongLong (int64_t i);

@ -1805,7 +1805,8 @@ bool Speller::createUnifiedDict (){
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
getFileCreationFlags());
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
return log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));

@ -2759,6 +2759,7 @@ int32_t SpiderColl::getNextIpFromWaitingTree ( ) {
// remove all his keys just because we restarted and think he
// is alive even though we have gotten no ping from him.
//if ( hp->m_numPingRequests > 0 )
removeFromTree:
// these operations should fail if writes have been disabled
// and because the trees/tables for spidercache are saving
// in Process.cpp's g_spiderCache::save() call
@ -2793,7 +2794,15 @@ int32_t SpiderColl::getNextIpFromWaitingTree ( ) {
m_waitingTreeKeyValid = true;
m_scanningIp = firstIp;
// sanity
if ( firstIp == 0 || firstIp == -1 ) { char *xx=NULL;*xx=0; }
if ( firstIp == 0 || firstIp == -1 ) {
//char *xx=NULL;*xx=0; }
log("spider: removing corrupt spiderreq firstip of %"INT32
" from waiting tree collnum=%i",
firstIp,(int)m_collnum);
goto removeFromTree;
}
// avoid corruption
// we set this to true when done
//m_isReadDone = false;
// compute the best request from spiderdb list, not valid yet
@ -2877,6 +2886,7 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
if ( m_deleteMyself ) { char *xx=NULL;*xx=0; }
// skip if spiders off
if ( ! m_cr->m_spideringEnabled ) return;
if ( ! g_hostdb.getMyHost( )->m_spiderEnabled ) return;
// skip if udp table is full
if ( g_udpServer.getNumUsedSlotsIncoming() >= MAXUDPSLOTS ) return;
// if entering for the first time, we need to read list from spiderdb
@ -3160,6 +3170,8 @@ void SpiderColl::populateDoledbFromWaitingTree ( ) { // bool reentry ) {
// since addSpiderRequest() calls addToWaitingTree() which then calls
// this.
if ( ! g_conf.m_spideringEnabled ) return;
if ( ! g_hostdb.getMyHost( )->m_spiderEnabled ) return;
// skip if udp table is full
if ( g_udpServer.getNumUsedSlotsIncoming() >= MAXUDPSLOTS ) return;
@ -4106,6 +4118,20 @@ bool SpiderColl::scanListForWinners ( ) {
//srep = NULL;
continue;
}
// ignore these to fix diffbot's malformed url bug
if ( tmp->m_errCode == 32880 &&
// and is before about dec 18th 2015
tmp->m_spideredTime < 1450488447 )
continue;
// ignore these to fix diffbot's ebadtitlerec error
// 'bad cached document'.
// ignore them so we can respider the urls and
// the new logic in xmldoc.cpp can ignore them.
// i fixed xmldoc.cpp to index these status docs.
if ( tmp->m_errCode == 32792 &&
// and is before about dec 22nd 2015
tmp->m_spideredTime < 1450897197 )
continue;
// bad langid?
if ( ! getLanguageAbbr (tmp->m_langId) ) {
log("spider: got corrupt 4 spiderReply in "
@ -4268,7 +4294,18 @@ bool SpiderColl::scanListForWinners ( ) {
m_lastCBlockIp = cblock;
// only add firstip if manually added and not fake
// if ( uh48 == 272628060426254 )
// log("spider: got special seed");
// #undef sleep
// if ( uh48 == 272628060426254 ) {
// log("spider: got special seed");
// bool flag = true;
// sleepLoop:
// sleep(1);
// if ( flag ) goto sleepLoop;
// }
// #define sleep(a) { char *xx=NULL;*xx=0; }
//
// just calculating page counts? if the url filters are based
@ -5889,6 +5926,8 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
if ( ! srep && sreq->m_isInjecting ) return spiderTimeMS;
if ( ! srep && sreq->m_isPageReindex ) return spiderTimeMS;
//log("spider: getting spider time %"INT64, spiderTimeMS);
// to avoid hammering an ip, get last time we spidered it...
int64_t lastMS ;
lastMS = m_lastDownloadCache.getLongLong ( m_collnum ,
@ -6073,6 +6112,8 @@ bool isAssignedToUs ( int32_t firstIp ) {
// . ignore lower 8 bits of ip since one guy often owns a whole block!
//int32_t hostId=(((uint32_t)firstIp) >> 8) % g_hostdb.getNumHosts();
if( !g_hostdb.getMyHost()->m_spiderEnabled ) return false;
// get our group
//Host *group = g_hostdb.getMyGroup();
Host *shard = g_hostdb.getMyShard();
@ -6097,22 +6138,30 @@ bool isAssignedToUs ( int32_t firstIp ) {
int32_t i = ((uint32_t)h64) % hpg;
Host *h = &shard[i];
// return that if alive
if ( ! g_hostdb.isDead(h) ) return (h->m_hostId == g_hostdb.m_hostId);
if ( ! g_hostdb.isDead(h) && h->m_spiderEnabled) {
return (h->m_hostId == g_hostdb.m_hostId);
}
// . select another otherwise
// . put all alive in an array now
Host *alive[64];
int32_t upc = 0;
for ( int32_t j = 0 ; j < hpg ; j++ ) {
Host *h = &shard[i];
Host *h = &shard[j];
if ( g_hostdb.isDead(h) ) continue;
if( ! h->m_spiderEnabled ) continue;
alive[upc++] = h;
}
// if none, that is bad! return the first one that we wanted to
if ( upc == 0 ) return (h->m_hostId == g_hostdb.m_hostId);
if ( upc == 0 ) {
log("spider: no hosts can handle spider request for ip=%s", iptoa(firstIp));
return false;
//return (h->m_hostId == g_hostdb.m_hostId);
}
// select from the good ones now
i = ((uint32_t)firstIp) % hpg;
i = ((uint32_t)firstIp) % upc;
// get that
h = &shard[i];
h = alive[i]; //&shard[i];
// guaranteed to be alive... kinda
return (h->m_hostId == g_hostdb.m_hostId);
}
@ -6217,7 +6266,11 @@ void SpiderLoop::startLoop ( ) {
// in case host when dead.
// now that we only send the info on startup and if changed,
// let's move back down to 1 second
if ( !g_loop.registerSleepCallback(3000,
// . make it 20 seconds because handlerequestc1 is always on
// profiler when we have thousands of collections
// . let's try 10 seconds so as not to think a job is done when
// it is not
if ( !g_loop.registerSleepCallback(10000,
this,
updateAllCrawlInfosSleepWrapper))
log("build: failed to register updatecrawlinfowrapper");
@ -6232,6 +6285,8 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
// if spidering disabled then do not do this crap
if ( ! g_conf.m_spideringEnabled ) return;
if ( ! g_hostdb.getMyHost( )->m_spiderEnabled ) return;
//if ( ! g_conf.m_webSpideringEnabled ) return;
// or if trying to exit
if ( g_process.m_mode == EXIT_MODE ) return;
@ -6250,6 +6305,8 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
return;
}
//if ( g_hostdb.hasDeadHost() ) return;
static int32_t s_count = -1;
// count these calls
s_count++;
@ -6299,6 +6356,7 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
// if ( ! cr ) continue;
// skip if not enabled
if ( ! crp->m_spideringEnabled ) continue;
// get it
//SpiderColl *sc = cr->m_spiderColl;
SpiderColl *sc = g_spiderCache.getSpiderColl(crp->m_collnum);
@ -6694,6 +6752,8 @@ void SpiderLoop::spiderDoledUrls ( ) {
// must be spidering to dole out
if ( ! g_conf.m_spideringEnabled ) return;
if ( ! g_hostdb.getMyHost( )->m_spiderEnabled ) return;
// or if trying to exit
if ( g_process.m_mode == EXIT_MODE ) return;
// if we don't have all the url counts from all hosts, then wait.
@ -7543,7 +7603,7 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// note it
if ( (g_corruptCount % 1000) == 0 )
log("spider: got corrupt doledb record. ignoring. "
"pls fix!!!");
"pls fix!!! cn=%i",(int)m_collnum);
g_corruptCount++;
// skip for now....!! what is causing this???
m_list.skipCurrentRecord();
@ -8278,7 +8338,13 @@ bool SpiderLoop::spiderUrl2 ( ) {
// count it as a hit
//g_stats.m_spiderUrlsHit++;
// sanity check
if (m_sreq->m_priority <= -1 ) { char *xx=NULL;*xx=0; }
if (m_sreq->m_priority <= -1 ) {
log("spider: fixing bogus spider req priority of %i for "
"url %s",
(int)m_sreq->m_priority,m_sreq->m_url);
m_sreq->m_priority = 0;
//char *xx=NULL;*xx=0;
}
//if(m_sreq->m_priority >= MAX_SPIDER_PRIORITIES){char *xx=NULL;*xx=0;}
// update this
m_sc->m_outstandingSpiders[(unsigned char)m_sreq->m_priority]++;
@ -9588,7 +9654,10 @@ bool printList ( State11 *st ) {
if ( list->getCurrentRecSize() <= 16 ) { char *xx=NULL;*xx=0;}
// sanity check. requests ONLY in doledb
if ( ! g_spiderdb.isSpiderRequest ( (key128_t *)rec )) {
char*xx=NULL;*xx=0;}
log("spider: not printing spiderreply");
continue;
//char*xx=NULL;*xx=0;
}
// get the spider rec, encapsed in the data of the doledb rec
SpiderRequest *sreq = (SpiderRequest *)rec;
// print it into sbTable
@ -11428,7 +11497,7 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
if ( langId >= 0 ) { // if ( srep ) {
// this is NULL on corruption
lang = getLanguageAbbr ( langId );//srep->m_langId );
langLen = gbstrlen(lang);
if (lang) langLen = gbstrlen(lang);
}
// . get parent language in the request
@ -12919,6 +12988,37 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
// skip fast
p += 10;
p = strstr(s, "&&");
// if no "&&" follows then this rule is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// EBADURL malformed url is ... 32880
if ( *p=='e' && strncmp(p,"errorcode",9) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// reply based
if ( ! srep ) continue;
// shortcut
int32_t a = srep->m_errCode;
// get the error code value from the rule
int32_t b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
// skip fast
p += 9;
p = strstr(s, "&&");
// if no "&&" follows then this rule is a match
if ( ! p ) return i;
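Each rule term repeats the same compare-and-continue pattern: parse the right-hand value with atoi(), test it against the reply field under the parsed sign, and on a match either return the rule number or skip past the next "&&". The comparison dispatch, as a standalone sketch (helper name illustrative; SIGN_* are the constants used above):

// Sketch: the numeric comparison used by url-filter rule terms.
static bool ruleTermMatches ( int32_t sign , int32_t a , int32_t b ) {
	switch ( sign ) {
	case SIGN_EQ: return a == b;
	case SIGN_NE: return a != b;
	case SIGN_GT: return a >  b;
	case SIGN_LT: return a <  b;
	case SIGN_GE: return a >= b;
	case SIGN_LE: return a <= b;
	default:      return false; // unknown sign: no match
	}
}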
@ -13810,6 +13910,8 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// . TODO: do not update on error???
for ( ; ptr < end ; ptr++ ) {
QUICKPOLL ( slot->m_niceness );
// get collnum
collnum_t collnum = (collnum_t)(ptr->m_collnum);
@ -13875,6 +13977,12 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// loop over
for ( int32_t x = 0 ; x < g_collectiondb.m_numRecs ; x++ ) {
QUICKPOLL ( slot->m_niceness );
// a niceness 0 routine could have nuked it?
if ( x >= g_collectiondb.m_numRecs )
break;
CollectionRec *cr = g_collectiondb.m_recs[x];
if ( ! cr ) continue;
@ -13897,20 +14005,35 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
if ( ! cia ) continue;
for ( int32_t k = 0 ; k < g_hostdb.m_numHosts; k++ ) {
QUICKPOLL ( slot->m_niceness );
// get the CrawlInfo for the ith host
CrawlInfo *stats = &cia[k];
// point to the stats for that host
int64_t *ss = (int64_t *)stats;
int64_t *gs = (int64_t *)gi;
// add each hosts counts into the global accumulators
// are stats crazy?
bool crazy = false;
for ( int32_t j = 0 ; j < NUMCRAWLSTATS ; j++ ) {
*gs = *gs + *ss;
// crazy stat?
if ( *ss > 1000000000LL ||
*ss < -1000000000LL )
*ss < -1000000000LL ) {
log("spider: crazy stats %"INT64" "
"from host #%"INT32" coll=%s",
"from host #%"INT32" coll=%s. "
"ignoring.",
*ss,k,cr->m_coll);
crazy = true;
break;
}
ss++;
}
// reset ptr to accumulate
ss = (int64_t *)stats;
for ( int32_t j = 0 ; j < NUMCRAWLSTATS ; j++ ) {
// do not accumulate if corrupted.
// probably mem got corrupted and it saved
// to disk.
if ( crazy ) break;
*gs = *gs + *ss;
gs++;
ss++;
}
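The loop above is a two-pass guard: scan a host's counters first, and only fold them into the global totals when none are out of range, so a single corrupt save cannot poison the cluster-wide stats. The same pattern as a minimal sketch (helper name illustrative; the one-billion bound mirrors the code above):

// Sketch: validate-then-accumulate to keep corrupt stats out of totals.
static bool accumulateStats ( int64_t *global , const int64_t *host ,
			      int32_t n ) {
	for ( int32_t j = 0 ; j < n ; j++ )
		if ( host[j] > 1000000000LL || host[j] < -1000000000LL )
			return false; // crazy stat: skip this host entirely
	for ( int32_t j = 0 ; j < n ; j++ )
		global[j] += host[j];
	return true;
}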
@ -14177,7 +14300,7 @@ void handleRequestc1 ( UdpSlot *slot , int32_t niceness ) {
for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
QUICKPOLL(MAX_NICENESS);
QUICKPOLL(slot->m_niceness);
CollectionRec *cr = g_collectiondb.m_recs[i];
if ( ! cr ) continue;
@ -14370,10 +14493,27 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
uint32_t now = (uint32_t)getTimeGlobal();
// hit crawl round max? this could be SP_ROUNDDONE and it doesn't
// get converted to SP_MAXROUNDS until we call spiderDoledUrls()
// so fix the crawlbot nightly smoke test by setting this here
// to SP_MAXROUNDS.
// smoketest msg = FAIL: testCrawlRounds (__main__.TestRepeatCrawl)
// self.assertEqual(j['jobs'][0]['jobStatus']['status'],1,msg=self.name
// AssertionError: 4 != 1 : 1227151934RepeatCrawlself.
// assertEqual(j['jobs'][0]['jobStatus']['status'],1,msg=self.name)
int32_t spiderStatus = cx->m_spiderStatus;
if ( spiderStatus == SP_ROUNDDONE &&
cx->m_maxCrawlRounds > 0 &&
cx->m_isCustomCrawl &&
cx->m_spiderRoundNum >= cx->m_maxCrawlRounds )
spiderStatus = SP_MAXROUNDS;
// try to fix crawlbot nightly test complaining about job status
// for TestRepeatCrawlWithMaxToCrawl
if ( (cx->m_spiderStatus == SP_MAXTOCRAWL ||
cx->m_spiderStatus == SP_MAXTOPROCESS ) &&
if ( (spiderStatus == SP_MAXTOCRAWL ||
spiderStatus == SP_MAXTOPROCESS ) &&
cx->m_collectiveRespiderFrequency > 0.0 &&
now < cx->m_spiderRoundStartTime &&
cx->m_spiderRoundNum >= cx->m_maxCrawlRounds ) {
@ -14384,7 +14524,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
// . 0 means not to RE-crawl
// . indicate if we are WAITING for next round...
if ( cx->m_spiderStatus == SP_MAXTOCRAWL &&
if ( spiderStatus == SP_MAXTOCRAWL &&
cx->m_collectiveRespiderFrequency > 0.0 &&
now < cx->m_spiderRoundStartTime ) {
*status = SP_ROUNDDONE;
@ -14395,7 +14535,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
now));
}
if ( cx->m_spiderStatus == SP_MAXTOPROCESS &&
if ( spiderStatus == SP_MAXTOPROCESS &&
cx->m_collectiveRespiderFrequency > 0.0 &&
now < cx->m_spiderRoundStartTime ) {
*status = SP_ROUNDDONE;
@ -14407,19 +14547,19 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
}
if ( cx->m_spiderStatus == SP_MAXTOCRAWL ) {
if ( spiderStatus == SP_MAXTOCRAWL ) {
*status = SP_MAXTOCRAWL;
return msg->safePrintf ( "Job has reached maxToCrawl "
"limit." );
}
if ( cx->m_spiderStatus == SP_MAXTOPROCESS ) {
if ( spiderStatus == SP_MAXTOPROCESS ) {
*status = SP_MAXTOPROCESS;
return msg->safePrintf ( "Job has reached maxToProcess "
"limit." );
}
if ( cx->m_spiderStatus == SP_MAXROUNDS ) {
if ( spiderStatus == SP_MAXROUNDS ) {
*status = SP_MAXROUNDS;
return msg->safePrintf ( "Job has reached maxRounds "
"limit." );
@ -14453,7 +14593,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
// return msg->safePrintf("Crawl is waiting for urls.");
//}
if ( cx->m_spiderStatus == SP_INITIALIZING ) {
if ( spiderStatus == SP_INITIALIZING ) {
*status = SP_INITIALIZING;
return msg->safePrintf("Job is initializing.");
}
@ -14479,7 +14619,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
"repeat is scheduled.");
}
if ( cx->m_spiderStatus == SP_ROUNDDONE && ! cx->m_isCustomCrawl ) {
if ( spiderStatus == SP_ROUNDDONE && ! cx->m_isCustomCrawl ) {
*status = SP_ROUNDDONE;
return msg->safePrintf ( "Nothing currently "
"available to spider. "
@ -14502,7 +14642,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
}
if ( cx->m_spiderStatus == SP_ROUNDDONE ) {
if ( spiderStatus == SP_ROUNDDONE ) {
*status = SP_ROUNDDONE;
return msg->safePrintf ( "Job round completed.");
}
@ -14755,12 +14895,21 @@ bool SpiderRequest::isCorrupt ( ) {
}
// sanity check. check for http(s)://
if ( m_url[0] != 'h' &&
// might be a docid from a pagereindex.cpp
! is_digit(m_url[0]) ) {
if ( m_url[0] == 'h' )
return false;
// might be a docid from a pagereindex.cpp
if ( ! is_digit(m_url[0]) ) {
log("spider: got corrupt 1 spiderRequest");
return true;
}
// if it is a digit\0 it is ok, not corrupt
if ( ! m_url[1] )
return false;
// if it is not a digit after the first digit, that is bad
if ( ! is_digit(m_url[1]) ) {
log("spider: got corrupt 2 spiderRequest");
return true;
}
return false;
}
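With the relaxed check only the first two bytes matter: a leading 'h' passes (http/https), and a leading digit must be followed by another digit or the terminating NUL. Illustrative inputs (hypothetical, not from the source):

//   "https://example.com/"  -> not corrupt (starts with 'h')
//   "123456789"             -> not corrupt (docid from pagereindex)
//   "7"                     -> not corrupt (digit then NUL)
//   "7x89"                  -> corrupt    (digit then non-digit)
//   "ftp://example.com/"    -> corrupt    (neither 'h' nor a digit)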

@ -143,7 +143,10 @@ bool buildProxyTable ( ) {
*s = '\0';
log("buf: %s for %s",msg,p);
*s = c;
return false;
//return false;
// advance p
p = s;
continue;
}
// convert it
@ -706,6 +709,7 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
int32_t hslot = s_loadTable.getSlot ( &urlIp );
// scan all proxies that have this urlip outstanding
for ( int32_t i = hslot ; i >= 0 ; i = s_loadTable.getNextSlot(i,&urlIp)){
QUICKPOLL(niceness);
// get the bucket
LoadBucket *lb;
lb = (LoadBucket *)s_loadTable.getValueFromSlot(i);
@ -736,6 +740,7 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
// get the min of the counts
int32_t minCount = 999999;
for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
QUICKPOLL(niceness);
// skip empty slots
if ( ! s_iptab.m_flags[i] ) continue;
// get the spider proxy
@ -824,6 +829,7 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
int32_t slotCount = s_iptab.getNumSlots();
// . now find the best proxy with the minCount
for ( int32_t i = start ; ; i++ ) {
QUICKPOLL(niceness);
// scan all slots in hash table, then stop
if ( slotCount-- <= 0 ) break;
// wrap around to zero if we hit the end
@ -896,8 +902,8 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
static int32_t s_lbid = 0;
// add it now, iff not for passing to diffbot backend
if ( preq->m_opCode != OP_GETPROXYFORDIFFBOT ) {
s_loadTable.addKey ( &urlIp , &bb );
bb.m_id = s_lbid++;
s_loadTable.addKey ( &urlIp , &bb );
// winner count update
winnersp->m_timesUsed++;
}
@ -931,12 +937,29 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
// and the loadbucket id
//*(int32_t *)p = bb.m_id; p += 4;
//int32_t sanityCount = 0;//s_loadTable.getNumSlots();
// top:
// with dup keys we end up with long chains of crap and this
// takes forever. so just flush the whole thing every 2 minutes AND
// when 10000+ entries are in there
static time_t s_lastTime = 0;
time_t now = nowms / 1000;
if ( s_lastTime == 0 ) s_lastTime = now;
time_t elapsed = now - s_lastTime;
if ( elapsed > 120 && s_loadTable.getNumSlots() > 10000 ) {
log("sproxy: flushing %i entries from proxy loadtable that "
"have accumulated since %i seconds ago",
(int)s_loadTable.m_numSlotsUsed,(int)elapsed);
s_loadTable.clear();
// only do this once every couple of minutes
s_lastTime = now;
}
int32_t sanityCount = 0;//s_loadTable.getNumSlots();
// top:
// now remove old entries from the load table. entries that
// have completed and have a download end time more than 10 mins ago
for ( int32_t i = 0 ; i < s_loadTable.getNumSlots() ; i++ ) {
// have completed and have a download end time more than 10 mins ago.
for ( int32_t i = s_loadTable.getNumSlots() - 1 ; i >= 0 ; i-- ) {
QUICKPOLL(niceness);
// skip if empty
if ( ! s_loadTable.m_flags[i] ) continue;
// get the bucket
@ -948,8 +971,8 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
// < 10 mins? now it's < 15 seconds to prevent clogging.
if ( took < LOADPOINT_EXPIRE_MS ) continue;
// 100 at a time
//if ( sanityCount++ > 100 ) break;
// 100 at a time so we don't slam cpu
if ( sanityCount++ > 100 ) break;
// ok, its too old, nuke it to save memory
s_loadTable.removeSlot(i);
@ -957,7 +980,7 @@ void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {
// miss out on analyzing any keys if we just keep looping here
// should we? TODO: figure it out. if we miss a few it's not
// a big deal.
i--;
//i--;
//goto top;
}
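Scanning downward is what lets the old i-- re-check go away: removeSlot() can only reshuffle entries within a probe chain, and re-visiting a shifted entry is harmless here because the expiry test is idempotent. The general shape, as a sketch (isExpired() stands in for the LOADPOINT_EXPIRE_MS test above):

// Sketch: delete-while-iterating safely by walking indices downward.
for ( int32_t i = tab.getNumSlots() - 1 ; i >= 0 ; i-- ) {
	if ( ! tab.m_flags[i] ) continue;          // empty slot
	if ( ! isExpired ( &tab , i ) ) continue;  // still live
	// worst case a chained entry shifts into a lower slot and
	// gets tested twice, which is harmless
	tab.removeSlot ( i );
}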

@ -65,21 +65,19 @@ static Label s_labels[] = {
// . 300MB/s is max read rate regardless to stop graph shrinkage
// . use 1KB as the min resolution per pixel
// . stored in Bps so use 1/1000 as scalar to get into KBps
{ GRAPH_QUANTITY,200,"disk_read",1,"%.0f MBps",1.0/(1000.0*1000.0),0x000000,
"disk read"},
{ GRAPH_QUANTITY,200,"disk_read",1,"%.0f MBps",1.0/(1000.0*1000.0),0x000000,"disk read"},
// . 300MB/s is max write rate regardless to stop graph shrinkage
// . use 1KB as the min resolution per pixel
// . stored in Bps so use 1/1000 as scalar to get into KBps
{GRAPH_QUANTITY,200,"disk_write",1,"%.0f Mbps",1.0/(1000.0*1000.0), 0xff0000,
"disk write"},
{GRAPH_QUANTITY,200,"disk_write",1,"%.0f MBps",1.0/(1000.0*1000.0), 0xff0000, "disk write"},
// . 20 is the max dps regardless to stop graph shrinkage
// . use .03 qps as the min resolution per pixel
{GRAPH_OPS,20,"parse_doc", .005,"%.1f dps" , 1.0 , 0x00fea915,"parsed doc" },
{GRAPH_QUANTITY_PER_OP,1000,"docs_per_second", .005,"%.1f docs" , .001 , 0x1F2F5C,"docs per second" },
{GRAPH_QUANTITY_PER_OP,-1,"docs_per_second", .1,"%.1f docs per second" , -1 , 0x1F2F5C,"*successfully* indexed docs per second" },
// . use .1 * 1000 docs as the min resolution per pixel
// . max = -1, means dynamic size the ymax!
@ -88,7 +86,7 @@ static Label s_labels[] = {
// . make it 2M now not 50M. seems like it is per pixel and theres
// like 1000 pixels vertically. but we need to autoscale it
// eventually
{GRAPH_QUANTITY,2000000.0,"docs_indexed", .1,"%.0fK docs" , .001 , 0x00cc0099,"docs indexed" }
{GRAPH_QUANTITY,-1,"docs_indexed", .1,"%.0f docs" , -1, 0x00cc0099,"docs indexed" }
//{ "termlist_intersect",0x0000ff00},
@ -122,6 +120,7 @@ Label *Statsdb::getLabel ( int32_t labelHash ) {
return *label;
}
Statsdb::Statsdb ( ) {
m_init = false;
m_disabled = true;
@ -246,6 +245,8 @@ void flushStatsWrapper ( int fd , void *state ) {
void Statsdb::addDocsIndexed ( ) {
if ( ! isClockInSync() ) return;
if ( g_hostdb.hasDeadHost() ) return;
// only host #0 needs this
if ( g_hostdb.m_hostId != 0 ) return;
@ -270,18 +271,23 @@ void Statsdb::addDocsIndexed ( ) {
// divide by # of groups
total /= g_hostdb.getNumHostsPerShard();
// skip if no change
if ( total == s_lastTotal ) return;
int32_t docsIndexedInInterval = total - s_lastTotal;
float docsPerSecond = docsIndexedInInterval / (float)interval;
s_lastTotal = total;
log("build: total docs indexed: %f. docs per second %f %i %i", (float)total, docsPerSecond, docsIndexedInInterval, interval);
// add it if changed though
int64_t nowms = gettimeofdayInMillisecondsGlobal();
addStat ( MAX_NICENESS,"docs_indexed", nowms, nowms, (float)total );
addStat ( MAX_NICENESS,"docs_per_second", nowms, nowms, docsPerSecond );
// Prevent a datapoint which adds all of the docs indexed to date.
if( s_lastTotal != 0 ) {
addStat ( MAX_NICENESS,"docs_per_second", nowms, nowms, docsPerSecond );
}
s_lastTotal = total;
}
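The rate itself is just the delta between successive replicated totals divided by the sampling interval. Worked numbers (illustrative):

// e.g. two samples 30 seconds apart:
//   s_lastTotal = 120,000   total = 120,450   interval = 30
//   docsIndexedInInterval = 450
//   docsPerSecond         = 450 / 30 = 15.0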
// . m_key bitmap in statsdb:
@ -896,12 +902,13 @@ char *Statsdb::plotGraph ( char *pstart ,
bool needMax = true;
float ymin = 0.0;
float ymax = 0.0;
float yscalar = label->m_yscalar;
char *p = pstart;
for ( ; p < pend ; p += 12 ) {
// breathe
QUICKPOLL ( m_niceness );
if ( m_gw.getLength() > 10000000 ) break;
// get the y
float y2 = *(float *)(p+4);
// get color of this point
@ -909,7 +916,8 @@ char *Statsdb::plotGraph ( char *pstart ,
// stop if not us
if ( gh != graphHash ) continue;
// put into scaled space right away
y2 = y2 * label->m_yscalar;
if (label->m_yscalar >= 0)
y2 = y2 * label->m_yscalar;
// . limit y to absolute max
// . these units should be scaled as well!
if ( y2 > label->m_absYMax && label->m_absYMax > 0.0 )
@ -922,13 +930,21 @@ char *Statsdb::plotGraph ( char *pstart ,
}
// force to zero for now
ymin = 0.0;
//ymin = 0.0;
// . and force to ymax for now as well
// . -1 indicates dynamic though!
if ( label->m_absYMax > 0.0 ) ymax = label->m_absYMax;
// add a 20% ceiling
else ymax *= 1.20;
// else ymax *= 1.20;
if( label->m_yscalar <= 0 ) {
if(ymax == ymin) {
yscalar = 0;
} else {
yscalar = (float)DY2 / (ymax - ymin);
}
}
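// With m_yscalar at -1 the axis now autoscales: the effective scalar is
// DY2/(ymax-ymin), mapping the observed data range onto the fixed pixel
// height. Worked numbers (illustrative):
//   DY2 = 600 px, observed range 0..150 docs/sec
//   yscalar = 600 / (150 - 0) = 4 px per unit
//   a point at y = 75 lands at (75 - 0) * 4 = 300 px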
// return that!
char *retp = p;
@ -951,7 +967,7 @@ char *Statsdb::plotGraph ( char *pstart ,
// . pad y range if total range is small
// . only do this for certain types of stats, like qps and disk i/o
if ( ourDiff < minDiff ) {
if ( label->m_yscalar >=0 && ourDiff < minDiff ) {
float pad = (minDiff - ourDiff) / 2;
// pad it out
ymin -= pad ;
@ -981,16 +997,23 @@ char *Statsdb::plotGraph ( char *pstart ,
for ( ; p < pend ; ) {
// breathe
QUICKPOLL ( m_niceness );
if ( m_gw.getLength() > 10000000 ) break;
// first is x pixel pos
int32_t x2 = *(int32_t *)p; p += 4;
// then y pos
float y2 = *(float *)p; p += 4;
// scale it right away
y2 *= label->m_yscalar;
if(label->m_yscalar < 0) {
y2 = (y2 - ymin) * yscalar;
}
else {
y2 *= yscalar;
}
// adjust
if ( y2 > ymax ) y2 = ymax;
if ( y2 < 0 ) y2 = 0;
// then graphHash
int32_t gh = *(int32_t *)p; p += 4;
@ -1003,8 +1026,10 @@ char *Statsdb::plotGraph ( char *pstart ,
float y1 = lasty;
// normalize y into pixel space
y2 = ((float)DY2 * (y2 - ymin)) / (ymax-ymin);
if(label->m_yscalar >= 0 && ymax != ymin) {
y2 = ((float)DY2 * (y2 - ymin)) / (ymax-ymin);
}
// set lasts for next iteration of this loop
lastx = x2;
lasty = y2;
@ -1073,13 +1098,20 @@ char *Statsdb::plotGraph ( char *pstart ,
}
float lastZ = -1;
for ( float z = ymin ; z < ymax ; z += deltaz ) {
// breathe
QUICKPOLL ( m_niceness );
// draw it
drawHR ( z , ymin , ymax , m_gw , label , zoff , color );
if(z == lastZ) break;
lastZ = z;
//if ( m_gw.getLength() > 10000000 ) break;
}
if ( m_gw.getLength() > 10000000 )
log("statsdb: graph too big");
return retp;
//#endif
@ -1158,7 +1190,7 @@ void Statsdb::drawHR ( float z ,
"font-size:14px;"
"min-height:20px;"
"min-width:3px;\""
" class=\"color-%"XINT32"\";"
" class=\"color-%"XINT32"\""
">%s</div>\n"
, (int32_t)(m_bx)
, (int32_t)z2 +m_by
@ -1194,6 +1226,13 @@ bool Statsdb::processList ( ) {
m_done = true;
}
// HACK: the user can request all of the events and the result can
// become quite large, so limit it to 100MB for now.
if( m_sb3.length() > 100000000) {
log("statsdb: truncating statsdb results.");
m_done = true;
}
//
// all these points are accumulated into 1-second buckets
@ -1590,7 +1629,7 @@ void Statsdb::drawLine3 ( SafeBuf &sb ,
"z-index:-5;"
"min-height:%"INT32"px;"
"min-width:%"INT32"px;\""
"class=\"color-%"XINT32"\"></div>\n"
" class=\"color-%"XINT32"\"></div>\n"
, x1 + m_bx
, (fy1 - width/2) + m_by
, color
@ -1599,3 +1638,5 @@ void Statsdb::drawLine3 ( SafeBuf &sb ,
, color
);
}

@ -2803,24 +2803,15 @@ bool Msg8a::launchGetRequests ( ) {
//uint32_t gid = g_hostdb.getGroupId ( m_rdbId , &startKey , true );
//Host *group = g_hostdb.getGroup ( gid );
int32_t shardNum = getShardNum ( m_rdbId , &startKey );//, true );
Host *group = g_hostdb.getShard ( shardNum );
//int32_t numTwins = g_hostdb.getNumHostsPerShard();
// use top byte!
uint8_t *sks = (uint8_t *)&startKey;
uint8_t top = sks[sizeof(TAGDB_KEY)-1];
//int32_t hostNum = 0;
//if ( numTwins == 2 && (top & 0x80) ) hostNum = 1;
// TODO: fix this!
//if ( numTwins >= 3 ) { char *xx=NULL;*xx=0; }
// support more than 2 stripes now...
int32_t hostNum = top % g_hostdb.getNumHostsPerShard();
int32_t hostId = group[hostNum].m_hostId;
Host *firstHost ;
// if niceness 0 can't pick noquery host.
// if niceness 1 can't pick nospider host.
firstHost = g_hostdb.getLeastLoadedInShard ( shardNum , m_niceness );
int32_t firstHostId = firstHost->m_hostId;
// . launch this request, even if to ourselves
// . TODO: just use msg0!!
bool status = m->getList ( hostId , // hostId
bool status = m->getList ( firstHostId , // hostId
0 , // ip
0 , // port
0 , // maxCacheAge
@ -2837,7 +2828,7 @@ bool Msg8a::launchGetRequests ( ) {
true , // error correction?
true , // include tree?
true , // doMerge?
-1 , // firstHostId
firstHostId , // firstHostId
0 , // startFileNum
-1 , // numFiles
3600*24*365 );// timeout

@ -2918,6 +2918,67 @@ int TcpServer::sslHandshake ( TcpSocket *s ) {
SSL_set_connect_state(s->m_ssl);
}
// . set hostname for SNI (Server Name Indication)
// . can test with page parser on the test page: https://sni.velox.ch/
// . we can parse the mime reliably here because we are the ones
// that created the request, so we know it should be standardish.
if ( s->m_sendBuf && ! s->m_readBuf ) {
// grab hostname from the mime
// skip first line
char *p = s->m_sendBuf;
char *pend = p + s->m_sendBufSize;
if ( p+10 >= pend )
goto skipSNI;
bool gotIt = false;
if ( p[0] == 'G' && p[1] == 'E' && p[2] == 'T' && p[3]==' ' )
gotIt = true;
if ( p[0] == 'P' && p[1] == 'O' && p[2] == 'S' && p[3]=='T' &&
p[4] == ' ' )
gotIt = true;
// need to start with "GET " or "POST "
if ( ! gotIt )
goto skipSNI;
scanMimeSomeMore:
// skip to the first \r, indicating end of line
for ( ; p < pend && *p != '\r' ; p++ );
// if we couldn't find it, then there's no Host: directive
if ( p == pend )
goto skipSNI;
// skip \r\n
if ( *p == '\r' )
p++;
if ( p == pend )
goto skipSNI;
if ( *p == '\n' )
p++;
if ( p == pend )
goto skipSNI;
// end of mime (\r\n\r\n)
if ( p+2<pend && p[0] == '\r' && p[1] == '\n' )
goto skipSNI;
// is it host:?
if ( p+6 >= pend )
goto skipSNI;
if ( strncasecmp(p,"Host:",5) )
goto scanMimeSomeMore;
p += 5;
if ( p<pend && *p == ' ' ) p++;
if ( p<pend && *p == ' ' ) p++;
char *hostname = p;
// find end of line
for ( ; p<pend && *p != '\r' ; p++ );
if ( p == pend )
goto skipSNI;
// temp null
char c = *p;
*p = '\0';
/// @todo what if we can't set TLS servername extension?
SSL_set_tlsext_host_name(s->m_ssl, hostname );
// replace the \0 with original char
*p = c;
}
skipSNI:
// SSL_connect() calls malloc()
g_inMemFunction = true;
int r = SSL_connect(s->m_ssl);

@ -320,7 +320,7 @@ bool Threads::init ( ) {
// i raised since global specs new servers have 2 (hyperthreaded?) cpus
int32_t max = g_conf.m_maxCpuThreads;
if ( max < 1 ) max = 1;
if ( ! g_threads.registerType ( INTERSECT_THREAD,max,200) )
if ( ! g_threads.registerType ( INTERSECT_THREAD,max,10) )
return log("thread: Failed to register thread type." );
// filter thread spawned to call popen() to filter an http reply
if ( ! g_threads.registerType ( FILTER_THREAD, 2/*maxThreads*/,300) )
@ -334,10 +334,10 @@ bool Threads::init ( ) {
// it was taking forever to go one at a time through the unlink
// thread queue. seemed like a 1 second space between unlinks.
// 1/23/2014
if ( ! g_threads.registerType ( UNLINK_THREAD,30/*maxThreads*/,3000) )
if ( ! g_threads.registerType ( UNLINK_THREAD,5/*maxThreads*/,3000) )
return log("thread: Failed to register thread type." );
// generic multipurpose
if ( ! g_threads.registerType (GENERIC_THREAD,100/*maxThreads*/,100) )
if ( ! g_threads.registerType (GENERIC_THREAD,20/*maxThreads*/,100) )
return log("thread: Failed to register thread type." );
// for call SSL_accept() which blocks for 10ms even when socket
// is non-blocking...
@ -435,6 +435,13 @@ int32_t Threads::getNumWriteThreadsOut() {
return m_threadQueues[DISK_THREAD].getNumWriteThreadsOut();
}
int32_t Threads::getNumActiveWriteUnlinkRenameThreadsOut() {
// this does not count threads that are done and just awaiting a join
int32_t n = m_threadQueues[DISK_THREAD].getNumWriteThreadsOut();
n += m_threadQueues[UNLINK_THREAD].getNumActiveThreadsOut();
return n;
}
// . returns false (and may set errno) if failed to launch a thread
// . returns true if thread added to queue successfully
// . may be launched instantly or later depending on # of threads in the queue
@ -853,6 +860,19 @@ bool ThreadQueue::init ( char threadType, int32_t maxThreads, int32_t maxEntries
return true;
}
int32_t ThreadQueue::getNumActiveThreadsOut() {
int32_t n = 0;
for ( int32_t i = 0 ; i < m_maxEntries ; i++ ) {
ThreadEntry *e = &m_entries[i];
if ( ! e->m_isOccupied ) continue;
if ( ! e->m_isLaunched ) continue;
// if it is done and just waiting for a join, do not count
if ( e->m_isDone ) continue;
n++;
}
return n;
}
int32_t ThreadQueue::getNumThreadsOutOrQueued() {
// MDW: we also need to count threads that are returned but need their
// callback called so, in the case of RdbDump, the rdblist that was written
@ -1108,6 +1128,7 @@ int32_t Threads::timedCleanUp (int32_t maxTime, int32_t niceness) {
return 0;
if ( ! m_needsCleanup ) return 0;
//if ( g_inSigHandler ) return 0;
int64_t startTime = gettimeofdayInMillisecondsLocal();
int64_t took = 0;
@ -1299,7 +1320,15 @@ bool ThreadQueue::timedCleanUp ( int32_t maxNiceness ) {
// . join up with that thread
// . damn, sometimes he can block forever on his
// call to sigqueue(),
int64_t startTime = gettimeofdayInMillisecondsLocal();
int64_t took;
int32_t status = pthread_join ( t->m_joinTid , NULL );
took = gettimeofdayInMillisecondsLocal() - startTime;
if ( took > 50 ) {
log("threads: pthread_join took %i ms",
(int)took);
}
if ( status != 0 ) {
log("threads: pthread_join %"INT64" = %s (%"INT32")",
(int64_t)t->m_joinTid,mstrerror(status),
@ -2088,7 +2117,8 @@ bool ThreadQueue::launchThread2 ( ) {
if ( m_threadType != DISK_THREAD ) {
// if one thread of this type is already out, forget it
if ( m_launchedHead ) return false;
// then we can't have 100 GENERIC THREADS!!! with this...
//if ( m_launchedHead ) return false;
// first try niceness 0 queue
ThreadEntry **bestHeadPtr = &m_waitHead0;
ThreadEntry **bestTailPtr = &m_waitTail0;
@ -3315,3 +3345,23 @@ void Threads::printState() {
}
}
}
void ThreadQueue::killAllThreads ( ) {
for ( int32_t i = 0 ; i < m_maxEntries ; i++ ) {
ThreadEntry *e = &m_entries[i];
if ( ! e->m_isOccupied ) continue;
if ( ! e->m_isLaunched ) continue;
log("threads: killling thread id %i",(int)e->m_joinTid);
pthread_kill ( e->m_joinTid , SIGKILL );
log("threads: joining with thread id %i",(int)e->m_joinTid);
pthread_join ( e->m_joinTid , NULL );
}
}
void Threads::killAllThreads ( ) {
log("threads: killing all threads");
for ( int32_t j = 0 ; j < m_numQueues ; j++ ) {
ThreadQueue *tq = &m_threadQueues[j];
tq->killAllThreads();
}
}
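One caveat worth flagging on killAllThreads(): signal dispositions are process-wide and SIGKILL cannot be caught, so pthread_kill(tid, SIGKILL) will take down the entire process rather than just the one thread. That is presumably acceptable since this only runs on the way down, but the subsequent pthread_join() calls will likely never execute.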

@ -161,6 +161,7 @@ class ThreadQueue {
int32_t getNumThreadsOutOrQueued();
int32_t getNumWriteThreadsOut() ;
int32_t getNumActiveThreadsOut() ;
// . for adding an entry
@ -196,6 +197,8 @@ class ThreadQueue {
void suspendLowPriorityThreads();
void resumeLowPriorityThreads();
void killAllThreads();
// this is true if low priority threads are temporarily suspended
bool m_isLowPrioritySuspended ;
@ -246,6 +249,8 @@ class Threads {
bool areThreadsDisabled() { return m_disabled; };
bool areThreadsEnabled () { return ! m_disabled; };
void killAllThreads();
// . returns false and sets errno if thread launch failed
// . returns true on success
// . when thread is done a signal will be put on the g_loop's
@ -301,6 +306,8 @@ class Threads {
int32_t getNumThreadsOutOrQueued();
int32_t getNumWriteThreadsOut() ;
int32_t getNumActiveWriteUnlinkRenameThreadsOut() ;
// counts the high/low priority (niceness <= 0) threads
//int64_t m_hiLaunched;
//int64_t m_hiReturned;

@ -286,6 +286,7 @@ bool UdpServer::init ( uint16_t port, UdpProtocol *proto, int32_t niceness,
// no requests waiting yet
m_requestsInWaiting = 0;
// special count
m_msg07sInWaiting = 0;
m_msg10sInWaiting = 0;
m_msgc1sInWaiting = 0;
//m_msgDsInWaiting = 0;
@ -1005,7 +1006,7 @@ UdpSlot *UdpServer::getBestSlotToSend ( int64_t now ) {
UdpSlot *maxi = NULL;
int32_t score;
//UdpSlot *slot;
// . we send dgrams with the lowest "score" first
// . we send dgrams with the lowest "score" first
// . the "score" is just number of ACKs you're waiting for
// . that way transmissions that are the most caught up to their ACKs
// are considered faster so we send to them first
@ -1482,6 +1483,9 @@ int32_t UdpServer::readSock_ass ( UdpSlot **slotPtr , int64_t now ) {
// rate, these are pretty lightweight. msg 0x10 reply gen times
// are VERY low. MDW
bool getSlot = true;
if ( msgType == 0x07 && m_msg07sInWaiting >= 100 )
getSlot = false;
if ( msgType == 0x10 && m_msg10sInWaiting >= 50 )
getSlot = false;
// crawl update info from Spider.cpp
@ -1671,6 +1675,7 @@ int32_t UdpServer::readSock_ass ( UdpSlot **slotPtr , int64_t now ) {
// if we connected to a request slot, count it
m_requestsInWaiting++;
// special count
if ( msgType == 0x07 ) m_msg07sInWaiting++;
if ( msgType == 0x10 ) m_msg10sInWaiting++;
if ( msgType == 0xc1 ) m_msgc1sInWaiting++;
//if ( msgType == 0xd ) m_msgDsInWaiting++;
@ -3122,6 +3127,7 @@ void UdpServer::destroySlot ( UdpSlot *slot ) {
// one less request in waiting
m_requestsInWaiting--;
// special count
if ( slot->m_msgType == 0x07 ) m_msg07sInWaiting--;
if ( slot->m_msgType == 0x10 ) m_msg10sInWaiting--;
if ( slot->m_msgType == 0xc1 ) m_msgc1sInWaiting--;
//if ( slot->m_msgType == 0xd ) m_msgDsInWaiting--;

@ -390,6 +390,7 @@ class UdpServer {
int32_t m_requestsInWaiting;
// like m_requestsInWaiting but requests which spawn other requests
int32_t m_msg07sInWaiting;
int32_t m_msg10sInWaiting;
int32_t m_msgc1sInWaiting;
//int32_t m_msgDsInWaiting;

@ -1280,6 +1280,8 @@ bool UdpSlot::readDatagramOrAck ( int sock ,
}
// handle acks
if ( m_proto->isAck ( peek , peekSize ) ) {
// if ack for msg4 core to test its save stuff
//if ( m_msgType == 0x04 ) { char *xx=NULL;*xx=0; }
readAck ( sock, dgramNum , now );
// keep stats
if ( m_host ) m_host->m_dgramsFrom++;

@ -10,6 +10,12 @@
#include "UdpProtocol.h"
#include "Hostdb.h"
// i'm seeing some networks not liking big dgrams, so
// lets go super small. we won't be able to send back
// huge msgs unfortunately, so we'll have to fix that
// a different way later.
#define SMALLDGRAMS
// . we want to avoid the overhead of IP level fragmentation
// . so for an MTU of 1500 we got 28 bytes overhead (IP and UDP headers)
// . later we can try large DGRAM_SIZE values to see if faster
@ -19,9 +25,9 @@
//#define DGRAM_SIZE 7500
//#define DGRAM_SIZE ((1500-28)*5)
// this was the most stable size, but now, 4/8/04, i'm trying bigger...
#ifdef _SMALLDGRAMS_
#ifdef SMALLDGRAMS
// newspaperarchive machines need this smaller size
#define DGRAM_SIZE (1500-28)
#define DGRAM_SIZE (1500-28-10)
#else
// . here's the new size, 4/8/04, about 20x bigger
// . only use this for our machines
@ -30,10 +36,11 @@
// . let's see if smaller dgrams fix the ping spike problem on gk0c
// . this is in addition to lower the ack windows from 12 to 4
#define DGRAM_SIZE 16400
#endif
// . the 45k dgram doesn't travel well over the internet, and javier needs
// to do that for the "interface client" code
#define DGRAM_SIZE_INTERNET (1500-28)
#endif
#define DGRAM_SIZE_INTERNET (1500-28-10)
// i'd like to have less dgram to decrease interrupts and
// to decrease the MAX_DGRAMS define which decrease UdpSlot size
@ -76,10 +83,11 @@
// raised from 50MB to 80MB so Msg13 compression proxy can send back big replies > 5MB
// raised from 80MB to 180MB since we could be sending back a Msg95Reply
// which is a list of QueryChanges. 3/29/13.
#define MAX_DGRAMS (((180*1024*1024) / DGRAM_SIZE_LB) + 1)
//#define MAX_DGRAMS (((180*1024*1024) / DGRAM_SIZE_LB) + 1)
#define MAX_DGRAMS (((80*1024*1024) / DGRAM_SIZE) + 1)
//#endif
#define MAX_ABSDOCLEN ((MAX_DGRAMS * DGRAM_SIZE_LB)-50000)
#define MAX_ABSDOCLEN ((MAX_DGRAMS * DGRAM_SIZE)-50000)
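Rough arithmetic on the limits with SMALLDGRAMS defined:

// DGRAM_SIZE    = 1500 - 28 - 10              =  1,462 bytes
// MAX_DGRAMS    = (80*1024*1024) / 1462 + 1   = 57,378 dgrams
// MAX_ABSDOCLEN = 57,378 * 1462 - 50,000      ~= 83.8 MB max doc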
// . the max size of an incoming request for a hot udp server
// . we cannot call malloc so it must fit in here

@ -66,15 +66,26 @@ static int utf8_sane[] = {
// how many bytes is char pointed to by p?
inline char getUtf8CharSize ( uint8_t *p ) {
return bytes_in_utf8_code[*p];
uint8_t c = *p;
if(c<128)
return 1;
else
return bytes_in_utf8_code[c];
}
inline char getUtf8CharSize ( char *p ) {
return bytes_in_utf8_code[*(uint8_t *)p];
uint8_t c = (uint8_t)*p;
if(c<128)
return 1;
else
return bytes_in_utf8_code[c];
}
inline char getUtf8CharSize ( uint8_t c ) {
return bytes_in_utf8_code[c];
if(c<128)
return 1;
else
return bytes_in_utf8_code[c];
}
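The added branch short-circuits the common case: any byte under 128 is a complete one-byte character, so only genuine lead bytes consult the table. Illustrative lead-byte sizes:

// 'a'  (0x61)                  -> 1 byte (ASCII fast path)
// 0xC3 (e.g. 'é' = C3 A9)      -> 2 bytes
// 0xE4 (e.g. '中' = E4 B8 AD)  -> 3 bytes
// 0xF0 (plane-1 chars, emoji)  -> 4 bytes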
inline char getUtf8CharSize2 ( uint8_t *p ) {

Url.cpp (252 changed lines)

@ -5,6 +5,8 @@
#include "Errno.h"
#include "HashTable.h"
#include "Speller.h"
#include "Punycode.h"
#include "Unicode.h"
static void print_string ( char *s , int32_t len );
@ -137,7 +139,7 @@ void Url::set (Url *baseUrl,char *s,int32_t len,bool addWWW,bool stripSessionId,
// . i know sun.com has urls like "http://sun.com/;$sessionid=123ABC$"
// . url should be ENCODED PROPERLY for this to work properly
void Url::set ( char *t , int32_t tlen , bool addWWW , bool stripSessionId ,
bool stripPound , bool stripCommonFile ,
bool stripPound , bool stripCommonFile ,
int32_t titleRecVersion ) {
reset();
// debug
@ -157,11 +159,163 @@ void Url::set ( char *t , int32_t tlen , bool addWWW , bool stripSessionId ,
while ( tlen > 0 && !is_alnum_a(*t) && *t!='-' && *t!='/'){t++;tlen--;}
// . stop t at first space or binary char
// . url should be in encoded form!
int32_t i ;
int32_t i = 0;
int32_t nonAsciiPos = -1;
for ( i = 0 ; i < tlen ; i++ ) {
if ( ! is_ascii(t[i]) ) break; // no non-ascii chars allowed
if ( is_wspace_a(t[i]) ) break; // no spaces allowed
if ( ! is_ascii(t[i]) ) {
// Sometimes the length with the null is passed in,
// so ignore nulls FIXME?
if( t[i] ) nonAsciiPos = i;
break; // no non-ascii chars allowed
}
}
if(nonAsciiPos != -1) {
// Try turning utf8 and latin1 encodings into punycode.
// All labels (between dots) in the domain are encoded
// separately. We don't support encoded tlds, but they are
// not widespread yet.
// If it is a non ascii domain it needs to take the form
// xn--<punycoded label>.xn--<punycoded label>.../
char tmp = t[tlen];
if(t[tlen]) t[tlen] = 0;
log(LOG_DEBUG, "build: attempting to decode unicode url %s pos at %"INT32, t, nonAsciiPos);
if(tmp) t[tlen] = tmp;
char encoded [ MAX_URL_LEN ];
size_t encodedLen = MAX_URL_LEN;
char *encodedDomStart = encoded;
char *p = t;
char *pend = t+tlen;
// Find the start of the domain
if(tlen > 7 && strncmp(p, "http://", 7) == 0) p += 7;
else if(tlen > 8 && strncmp(p, "https://", 8) == 0) p += 8;
gbmemcpy(encodedDomStart, t, p-t);
encodedDomStart += p-t;
while(p < pend && *p != '/') {
char *labelStart = p;
uint32_t tmpBuf[MAX_URL_LEN];
int32_t tmpLen = 0;
while(p < pend && *p != '.' && *p != '/') p++;
int32_t labelLen = p - labelStart;
bool tryLatin1 = false;
// For utf8 urls
p = labelStart;
bool labelIsAscii = true;
// Convert the domain to code points and copy it to
// tmpbuf to be punycoded
for(;p-labelStart<labelLen;
p += utf8Size(tmpBuf[tmpLen]), tmpLen++) {
labelIsAscii &= is_ascii(*p);
tmpBuf[tmpLen] = utf8Decode(p);
if(!tmpBuf[tmpLen]) { // invalid char?
tryLatin1 = true;
break;
}
}
if(labelIsAscii) {
if(labelStart[labelLen] == '.') {
labelLen++;
p++;
}
gbmemcpy(encodedDomStart, labelStart, labelLen);
encodedDomStart += labelLen;
continue;
}
if( tryLatin1 ) {
// For latin1 urls
tmpLen = 0;
for(;tmpLen<labelLen;tmpLen++) {
tmpBuf[tmpLen] = labelStart[tmpLen];
}
}
gbmemcpy(encodedDomStart, "xn--", 4);
encodedDomStart += 4;
punycode_status status ;
status = punycode_encode(tmpLen,
tmpBuf,
NULL,
&encodedLen,
encodedDomStart);
if ( status != 0 ) {
// Give up? try again?
log("build: Bad Engineer, failed to "
"punycode international url %s", t);
return;
}
// We should check that what we encoded are valid url
// characters: no spaces, etc.
// FIXME: should we exclude just the bad chars? I've
// seen plenty of urls with a newline in the middle.
// Just discard the whole chunk for now.
bool badUrlChars = false;
for(uint32_t i=0;i<encodedLen;i++) {
if(is_wspace_a(encodedDomStart[i])){
badUrlChars = true;
break;
}
}
if(encodedLen == 0 || badUrlChars) {
encodedDomStart -= 4; //don't need the xn--
p++;
} else {
encodedDomStart += encodedLen;
*encodedDomStart++ = *p++; // Copy in the . or the /
}
}
// p now points to the end of the domain
// encodedDomStart now points to the first free space in encoded string
// Now copy the rest of the url in. Watch out for non-ascii chars
// truncate the url, and keep it under max url length
uint32_t newUrlLen = encodedDomStart - encoded;
while(p < pend) {
if ( ! *p ) break; // null?
if(!is_ascii(*p)) {
//break;
// url encode utf8 characters now
char cs = getUtf8CharSize(p);
// bad utf8 char?
if ( cs <= 1 ) break;
// too long?
if ( newUrlLen + 12 >= MAX_URL_LEN )
break;
char stored = urlEncode ( &encoded[newUrlLen],
12 ,
p ,
cs );
p += cs;
newUrlLen += stored;
continue;
}
if(is_wspace_a(*p)) break;
if(newUrlLen >= MAX_URL_LEN) break;
encoded[newUrlLen++] = *p++;
}
//gbmemcpy(encodedDomStart, p, restOfUrlLen);
encoded[newUrlLen] = '\0';
return this->set(encoded, newUrlLen, addWWW, stripSessionId,
stripPound, stripCommonFile, titleRecVersion);
}
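// To make the mapping concrete: each non-ASCII label is punycoded on
// its own and prefixed with "xn--", while anything past the domain is
// percent-encoded as UTF-8 instead. Standard IDN example (hostname
// hypothetical, not from the test set below):
//   bücher.example/päge  ->  xn--bcher-kva.example/p%C3%A4ge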
// truncate length to the first occurrence of an unacceptable char
tlen = i;
// . decode characters that should not have been encoded
@ -955,6 +1109,10 @@ char *Url::getPathComponent ( int32_t num , int32_t *clen ) {
// return pc + pclen;
//}
bool Url::isHostWWW ( ) {
if ( m_hlen < 4 ) return false;
if ( m_host[0] != 'w' ) return false;
@ -2380,3 +2538,91 @@ bool Url::hasMediaExtension ( ) {
return false;
}
uint32_t Url::unitTests() {
char* urls[] = {
"http://www.fas.org/blog/ssp/2009/08/securing-venezuela\032s-arsenals.php",
"http://topbeskæring.dk/velkommen",
"www.Alliancefrançaise.nu",
"française.Alliance.nu",
"française.Alliance.nu/asdf",
"http://française.Alliance.nu/asdf",
"http://française.Alliance.nu/",
"幸运.龍.com",
"幸运.龍.com/asdf/运/abc",
"幸运.龍.com/asdf",
"http://幸运.龍.com/asdf",
"http://Беларуская.org/Акадэмічная",
"https://hi.Български.com",
"https://fakedomain.中文.org/asdf",
"https://gigablast.com/abc/文/efg",
"https://gigablast.com/?q=文",
"http://www.example.сайт",
"http://genocidearchiverwanda.org.rw/index.php/Category:Official_Communiqués",
"http://www.example.com/xn--fooled-you-into-trying-to-decode-this",
"http://www.example.сайт/xn--fooled-you-into-trying-to-decode-this",
"http://腕時計通販.jp/",
// Lets check some bad urls too:
"https://pypi.python\n\n\t\t\t\t.org/packages/source/p/pyramid/pyramid-1.5.tar.gz#md5=8747658dcbab709a9c491e43d3b0d58b"
};
StackBuf(sb);
uint32_t len = sizeof(urls) / sizeof(char*);
for(uint32_t i = 0; i < len; i++) {
Url u;
u.set(urls[i], strlen(urls[i]));
log("build:%s normalized to %s, printed to %s ",
urls[i], u.getUrl(), Url::getDisplayUrl(u.getUrl(), &sb));
sb.reset();
}
//FIXME: need to return an error if there is a problem
return 0;
}
char* Url::getDisplayUrl(char* url, SafeBuf* sb) {
char* found;
char* labelCursor = url;
if((found = strstr(labelCursor, "xn--"))) {
sb->safeMemcpy(url, found - url);
char* p = url;
char* pend = url + gbstrlen(url);
if(strncmp(p, "http://", 7) == 0) p += 7;
else if(strncmp(p, "https://", 8) == 0) p += 8;
while(p < pend && *p != '/') p++;
char* domEnd = p;
do {
if(found > domEnd) {
// Don't even look if it is past the domain
break;
}
char* encodedStart = found + 4;
uint32_t decoded [ MAX_URL_LEN];
size_t decodedLen = MAX_URL_LEN - 1 ;
char* labelEnd = encodedStart;
while( labelEnd < domEnd && *labelEnd != '/' && *labelEnd != '.' )
labelEnd++;
punycode_status status = punycode_decode(labelEnd - encodedStart,
encodedStart,
&decodedLen,
decoded, NULL);
if(status != 0) {
log("build: Bad Engineer, failed to depunycode international url %s", url);
sb->safePrintf("%s", url);
return url;
}
sb->utf32Encode(decoded, decodedLen);
//sb->pushChar(*labelEnd);
labelCursor = labelEnd;
} while((found = strstr(labelCursor, "xn--")));
}
// Copy in the rest
sb->safePrintf("%s", labelCursor);
sb->nullTerm();
return sb->getBufStart();
}
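A hypothetical round-trip through the display decoder (hostname from the encoding example above, not from the test set):

SafeBuf sb;
char u[] = "http://xn--bcher-kva.example/";
char *shown = Url::getDisplayUrl ( u , &sb );
// shown now reads "http://bücher.example/"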

Url.h (4 changed lines)

@ -232,6 +232,7 @@ public:
// this is private
bool isSpam ( char *s , int32_t slen ) ;
// . detects crazy repetitive urls like this:
// http://www.pittsburghlive.com:8000/x/tribune-review/opinion/
// steigerwald/letters/send/archive/letters/send/archive/bish/
@ -244,6 +245,9 @@ public:
// is probably more accurate than this function.
bool isLinkLoop();
static uint32_t unitTests();
static char* getDisplayUrl(char* url, SafeBuf* sb);
// private:
char m_url[MAX_URL_LEN]; // the normalized url

File diff suppressed because it is too large.

@ -475,7 +475,7 @@ class XmlDoc {
key_t *doledbKey ,
char *coll ,
class SafeBuf *pbuf ,
int32_t niceness ,
int32_t niceness ,
char *utf8Content = NULL ,
bool deleteFromIndex = false ,
int32_t forcedIp = 0 ,
@ -483,9 +483,11 @@ class XmlDoc {
uint32_t spideredTime = 0 , // time_t
bool contentHasMime = false ,
// for container docs, what is the separator of subdocs?
char *contentDelim = NULL,
char *metadata = NULL,
uint32_t metadataLen = 0) ;
char *contentDelim = NULL,
char *metadata = NULL,
uint32_t metadataLen = 0,
// for injected docs we have the recv buffer size; don't exceed that
int32_t payloadLen = -1) ;
// we now call this right away rather than at download time!
int32_t getSpideredTime();
@ -513,7 +515,9 @@ class XmlDoc {
bool indexDoc2 ( );
bool isContainerDoc ( );
bool indexContainerDoc ( );
bool indexWarcOrArc ( char ct ) ;
bool readMoreWarc();
bool indexWarcOrArc ( ) ;
key_t *getTitleRecKey() ;
//char *getSkipIndexing ( );
char *prepareToMakeTitleRec ( ) ;
@ -521,6 +525,7 @@ class XmlDoc {
bool setTitleRecBuf ( SafeBuf *buf , int64_t docId, int64_t uh48 );
// sets m_titleRecBuf/m_titleRecBufValid/m_titleRecKey[Valid]
SafeBuf *getTitleRecBuf ( );
bool appendNewMetaInfo ( SafeBuf *metaList , bool forDelete ) ;
SafeBuf *getSpiderStatusDocMetaList ( class SpiderReply *reply ,
bool forDelete ) ;
SafeBuf *getSpiderStatusDocMetaList2 ( class SpiderReply *reply ) ;
@ -705,7 +710,7 @@ class XmlDoc {
char **getExpandedUtf8Content ( ) ;
char **getUtf8Content ( ) ;
// we download large files to a file on disk, like warcs and arcs
BigFile *getUtf8ContentInFile ( int64_t *fileSizeArg );
FILE *getUtf8ContentInFile ( );
int32_t *getContentHash32 ( ) ;
int32_t *getContentHashJson32 ( ) ;
//int32_t *getTagHash32 ( ) ;
@ -768,6 +773,8 @@ class XmlDoc {
uint64_t m_ipStartTime;
uint64_t m_ipEndTime;
bool m_updatedMetaData;
void copyFromOldDoc ( class XmlDoc *od ) ;
class SpiderReply *getFakeSpiderReply ( );
@ -813,6 +820,7 @@ class XmlDoc {
int32_t getBoostFromSiteNumInlinks ( int32_t inlinks ) ;
bool hashSpiderReply (class SpiderReply *reply ,class HashTableX *tt) ;
bool hashMetaTags ( class HashTableX *table ) ;
bool hashMetaData ( class HashTableX *table ) ;
bool hashIsClean ( class HashTableX *table ) ;
bool hashZipCodes ( class HashTableX *table ) ;
bool hashMetaZip ( class HashTableX *table ) ;
@ -1067,6 +1075,7 @@ class XmlDoc {
int32_t m_addedSpiderRequestSize;
int32_t m_addedSpiderReplySize;
int32_t m_addedStatusDocSize;
int64_t m_addedStatusDocId;
SafeBuf m_metaList2;
SafeBuf m_zbuf;
@ -1084,12 +1093,16 @@ class XmlDoc {
int32_t m_warcError ;
int32_t m_arcError ;
bool m_doneInjectingWarc ;
bool m_doneInjectingArc ;
int64_t m_fileOff ;
int64_t m_bytesStreamed;
char *m_fileBuf ;
int32_t m_fileBufAllocSize;
bool m_registeredWgetReadCallback;
char *m_fptr ;
char *m_fptrEnd ;
FILE* m_pipe;
BigFile m_file;
int64_t m_fileSize;
FileState m_fileState;
@ -2401,7 +2414,6 @@ class XmlDoc {
bool m_setFromDocId;
bool m_freeLinkInfo1;
bool m_freeLinkInfo2;
bool m_contentInjected;
bool m_recycleContent;
@ -2470,7 +2482,8 @@ class XmlDoc {
// for container docs consisting of subdocs to inject
char *contentDelim = NULL,
char* metadata = NULL,
uint32_t metadataLen = 0);
uint32_t metadataLen = 0,
int32_t payloadLen = -1);
bool injectLinks ( HashTableX *linkDedupTable ,

@ -2515,7 +2515,7 @@ int32_t deserializeMsg ( int32_t baseSize ,
return baseSize + (p - stringBuf);//getStringBuf());
}
void deserializeMsg2 ( char **firstStrPtr , // ptr_url
bool deserializeMsg2 ( char **firstStrPtr , // ptr_url
int32_t *firstSizeParm ) { // size_url
int nptrs=((char *)firstSizeParm-(char *)firstStrPtr)/sizeof(char *);
// point to our string buffer
@ -2531,7 +2531,7 @@ void deserializeMsg2 ( char **firstStrPtr , // ptr_url
// make it NULL if size is 0 though
if ( *sizePtr == 0 ) *strPtr = NULL;
// sanity check
if ( *sizePtr < 0 ) { char *xx = NULL; *xx =0; }
if ( *sizePtr < 0 ) return false;//{ char *xx = NULL; *xx =0; }
// advance our destination ptr
p += *sizePtr;
// advance both ptrs to next string
@ -2540,6 +2540,7 @@ void deserializeMsg2 ( char **firstStrPtr , // ptr_url
}
// return how many bytes we processed
//return baseSize + (p - stringBuf);//getStringBuf());
return true;
}
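Since deserializeMsg2() now reports failure instead of crashing on a negative size, its callers presumably need to start checking the boolean return and treat false as a corrupt message.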
// print it to stdout for debugging Dates.cpp
@ -2618,4 +2619,3 @@ bool verifyUtf8 ( char *txt ) {
int32_t tlen = gbstrlen(txt);
return verifyUtf8(txt,tlen);
}

@ -237,7 +237,7 @@ bool saveTimeAdjustment ( ) ;
#define is_hspace_a(c) g_map_is_hspace[(unsigned char)c]
#define is_ascii(c) g_map_is_ascii[(unsigned char)c]
#define is_ascii9(c) g_map_is_ascii[(unsigned char)c]
#define is_ascii3(c) g_map_is_ascii3[(unsigned char)c]
#define is_ascii3(c) ((unsigned char)c<128 || g_map_is_ascii3[(unsigned char)c])
#define is_punct_a(c) g_map_is_punct[(unsigned char)c]
#define is_alnum_a(c) g_map_is_alnum[(unsigned char)c]
#define is_alpha_a(c) g_map_is_alpha[(unsigned char)c]
@ -627,6 +627,6 @@ int32_t deserializeMsg ( int32_t baseSize ,
char **firstStrPtr ,
char *stringBuf ) ;
void deserializeMsg2 ( char **firstStrPtr , int32_t *firstSizeParm );
bool deserializeMsg2 ( char **firstStrPtr , int32_t *firstSizeParm );
#endif

@ -236,7 +236,7 @@ int filterContent ( char *buf , int32_t n , int32_t mimeLen , char ctype , int32
//fprintf(stderr,"in=%s\n",in);
int fd = open ( in , O_CREAT | O_RDWR , S_IRWXU );
int fd = open ( in , O_CREAT | O_RDWR , S_IRWXU | S_IRWXG );
if ( fd < 0 ) {
fprintf(stderr,"gbfilter: open: %s\n",strerror(errno));
return -1;

main.cpp (144 changed lines)

@ -289,7 +289,7 @@ bool summaryTest1 ( char *rec, int32_t listSize, char *coll , int64_t docId ,
// time a big write, read and then seeks
bool thrutest ( char *testdir , int64_t fileSize ) ;
void seektest ( char *testdir , int32_t numThreads , int32_t maxReadSize ,
char *filename );
char *filename , bool doSeqWriteThread );
bool pingTest ( int32_t hid , uint16_t clientPort );
bool memTest();
@ -810,17 +810,21 @@ int main2 ( int argc , char *argv[] ) {
"parser speed tests\n\n"
*/
/*
"thrutest [dir] [fileSize]\n\tdisk write/read speed "
"test\n\n"
"thrutest [dir] [fileSize]\n\tdisk sequential "
"write then read speed tests.\n\n"
"seektest [dir] [numThreads] [maxReadSize] "
"[filename]\n"
"\tdisk seek speed test\n\n"
"\tdisk access speed test. (IOps)\n\n"
"rwtest [dir] [numThreads] [maxReadSize] "
"[filename]\n"
"\tdisk read access speed test while sequentially "
"writing. Simulates Gigablast while spidering and "
"querying nicely.\n\n"
"memtest\n"
"\t Test how much memory we can use\n\n"
*/
/*
// Quality Tests
@ -1390,7 +1394,20 @@ int main2 ( int argc , char *argv[] ) {
if ( cmdarg+2 < argc ) numThreads = atol(argv[cmdarg+2]);
if ( cmdarg+3 < argc ) maxReadSize = atoll1(argv[cmdarg+3]);
if ( cmdarg+4 < argc ) filename = argv[cmdarg+4];
seektest ( testdir , numThreads , maxReadSize , filename );
seektest ( testdir , numThreads , maxReadSize ,filename,false);
return 0;
}
// gb rwtest <testdir> <numThreads> <maxReadSize>
if ( strcmp ( cmd , "rwtest" ) == 0 ) {
char *testdir = "/tmp/";
int32_t numThreads = 20; //30;
int64_t maxReadSize = 20000;
char *filename = NULL;
if ( cmdarg+1 < argc ) testdir = argv[cmdarg+1];
if ( cmdarg+2 < argc ) numThreads = atol(argv[cmdarg+2]);
if ( cmdarg+3 < argc ) maxReadSize = atoll1(argv[cmdarg+3]);
if ( cmdarg+4 < argc ) filename = argv[cmdarg+4];
seektest ( testdir , numThreads , maxReadSize,filename,true);
return 0;
}
@ -2572,6 +2589,13 @@ int main2 ( int argc , char *argv[] ) {
false );// sendtoproxies
}
if ( strcmp ( cmd , "unittest" ) == 0 ) {
if ( cmdarg + 1 >= argc ) exit(1);
if(strcmp("url", argv[cmdarg+1]) == 0) {
exit(Url::unitTests());
}
}
// gb startclassifier coll ruleset [hostId]
/*
if ( strcmp ( cmd , "startclassifier" ) == 0 ) {
@ -4936,7 +4960,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
// ensure directory is there, if
// not then make it
"ssh %s 'mkdir %s' ; "
"scp -r %s %s:%s"
"scp -p -r %s %s:%s"
, ipStr
, h2->m_dir
@ -5022,7 +5046,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
if ( ! f.doesExist() ) target = "gb";
sprintf(tmp,
"scp -c arcfour " // blowfish is faster
"scp -p " // blowfish is faster
"%s%s "
"%s:%s/gb.installed%s",
dir,
@ -5058,7 +5082,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
// don't copy to ourselves
//if ( h2->m_hostId == h->m_hostId ) continue;
sprintf(tmp,
"scp "
"scp -p "
"%sgb.new "
"%s:%s/tmpgb.installed &",
dir,
@ -5071,7 +5095,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
// don't copy to ourselves
//if ( h2->m_hostId == h->m_hostId ) continue;
sprintf(tmp,
"scp %sgb.conf %shosts.conf %s:%s %s",
"scp -p %sgb.conf %shosts.conf %s:%s %s",
dir ,
dir ,
//h->m_hostId ,
@ -5453,7 +5477,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
}
*/
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/content.rdf.u8 "
"%s:%scatdb/content.rdf.u8",
dir,
@ -5462,7 +5486,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/structure.rdf.u8 "
"%s:%scatdb/structure.rdf.u8",
dir,
@ -5471,7 +5495,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/gbdmoz.structure.dat "
"%s:%scatdb/gbdmoz.structure.dat",
dir,
@ -5480,7 +5504,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/gbdmoz.content.dat "
"%s:%scatdb/gbdmoz.content.dat",
dir,
@ -5503,7 +5527,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
// don't copy to ourselves
if ( h2->m_hostId == 0 ) continue;
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/content.rdf.u8.new "
"%s:%scatdb/content.rdf.u8.new",
dir,
@ -5512,7 +5536,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/structure.rdf.u8.new "
"%s:%scatdb/structure.rdf.u8.new",
dir,
@ -5521,7 +5545,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/gbdmoz.structure.dat.new "
"%s:%scatdb/gbdmoz.structure.dat.new",
dir,
@ -5530,7 +5554,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/gbdmoz.content.dat.new "
"%s:%scatdb/gbdmoz.content.dat.new",
dir,
@ -5539,7 +5563,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"scp -p "
"%scatdb/gbdmoz.content.dat.new.diff "
"%s:%scatdb/gbdmoz.content.dat.new.diff",
dir,
@ -6384,6 +6408,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
bool justPrintSentences,
bool justPrintWords ) {
g_isDumpingRdbFromMain = 1;
if (!ucInit(g_hostdb.m_dir, true)) {
log("Unicode initialization failed!");
return;
@ -6903,6 +6928,8 @@ void dumpDoledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeTr
printf("\n");
// must be a request -- for now, for stats
if ( ! g_spiderdb.isSpiderRequest((key128_t *)srec) ) {
// error!
continue;
char *xx=NULL;*xx=0; }
// cast it
SpiderRequest *sreq = (SpiderRequest *)srec;
@ -11642,17 +11669,19 @@ static BigFile s_f;
static int32_t s_numThreads = 0;
static int64_t s_maxReadSize = 1;
static int64_t s_startTime = 0;
static bool s_doSeqWriteThread;
//#define MAX_READ_SIZE (2000000)
#include <sys/types.h>
#include <sys/wait.h>
void seektest ( char *testdir, int32_t numThreads, int32_t maxReadSize ,
char *filename ) {
char *filename , bool doSeqWriteThread ) {
g_loop.init();
g_threads.init();
s_numThreads = numThreads;
s_maxReadSize = maxReadSize;
s_doSeqWriteThread = doSeqWriteThread;
if ( s_maxReadSize <= 0 ) s_maxReadSize = 1;
//if ( s_maxReadSize > MAX_READ_SIZE ) s_maxReadSize = MAX_READ_SIZE;
@ -11689,7 +11718,7 @@ void seektest ( char *testdir, int32_t numThreads, int32_t maxReadSize ,
"exist. Use ./gb thrutest ... to create speedtest* files.");
return;
skip:
s_f.open ( O_RDONLY );
s_f.open ( O_RDWR );
s_filesize = s_f.getFileSize();
log ( LOG_INIT, "admin: file size = %"INT64".",s_filesize);
// always block
@ -11719,6 +11748,30 @@ skip:
//s_lock = 1;
//pthread_t tid1 ; //, tid2;
//g_conf.m_logDebugThread = 1;
// garbage collection on SSDs seems to be triggered by writes, so
// to keep writes from hurting read times, do this:
g_conf.m_flushWrites = 1;
// disable linux file cache
// system("echo 1 > /proc/sys/vm/drop_caches");
// -o sync TOTALLY WORKS!!!
// mount with -o sync to disable write page caching on linux
// disable on-disk write cache
// system("sudo hdparm -W 0 /dev/sda2");
// system("sudo hdparm -W 0 /dev/sdb1");
// system("sudo hdparm -W 0 /dev/sdc1");
// system("sudo hdparm -W 0 /dev/sdd1");
// disable read-ahead
// system("sudo hdparm -A 0 /dev/sda2");
// system("sudo hdparm -A 0 /dev/sdb1");
// system("sudo hdparm -A 0 /dev/sdc1");
// system("sudo hdparm -A 0 /dev/sdd1");
// set time
s_startTime = gettimeofdayInMilliseconds_force();
@ -11771,6 +11824,7 @@ void *startUp ( void *state , ThreadEntry *t ) {
// fprintf(stderr,"Threads::startUp: setpriority: failed\n");
// exit(-1);
//}
// read buf
//char buf [ MAX_READ_SIZE ];
#undef malloc
@ -11782,13 +11836,25 @@ void *startUp ( void *state , ThreadEntry *t ) {
}
// we got ourselves
s_launched++;
char *s = "reads";
if ( id == 0 && s_doSeqWriteThread )
s = "writes";
// msg
fprintf(stderr,"id=%"INT32" launched. Performing 100000 reads.\n",id);
fprintf(stderr,"threadid=%"INT32" launched. "
"Performing 100000 %s.\n",id,s);
// #undef sleep
// if ( id == 0 ) sleep(1000);
// #define sleep(a) { char *xx=NULL;*xx=0; }
// wait for lock to be unleashed
//while ( s_launched != s_numThreads ) usleep(10);
// now do a stupid loop
//int32_t j, off , size;
int64_t off , size;
int64_t seqOff = 0;
for ( int32_t i = 0 ; i < 100000 ; i++ ) {
uint64_t r = rand();
r <<= 32 ;
@ -11802,7 +11868,13 @@ void *startUp ( void *state , ThreadEntry *t ) {
int64_t start = gettimeofdayInMilliseconds_force();
//fprintf(stderr,"%"INT32") i=%"INT32" start\n",id,i );
//pread ( s_fd1 , buf , size , off );
s_f.read ( buf , size , off );
if ( id == 0 && s_doSeqWriteThread )
s_f.write ( buf , size , seqOff );
else
s_f.read ( buf , size , off );
seqOff += size;
if ( seqOff + size > s_filesize )
seqOff = 0;
//fprintf(stderr,"%"INT32") i=%"INT32" done\n",id,i );
int64_t now = gettimeofdayInMilliseconds_force();
#undef usleep
@ -11811,13 +11883,25 @@ void *startUp ( void *state , ThreadEntry *t ) {
s_count++;
float sps = (float)((float)s_count * 1000.0) /
(float)(now - s_startTime);
fprintf(stderr,"count=%"INT32" off=%012"INT64" size=%"INT32" time=%"INT32"ms "
"(%.2f seeks/sec)\n",
int64_t poff = off;
char *str = "seeks";
if ( id == 0 && s_doSeqWriteThread ) {
poff = seqOff;
str = "writes";
}
fprintf(stderr,"threadid=%i "
"count=%"INT32" "
"off=%012"INT64" "
"size=%"INT32" "
"time=%"INT32"ms "
"(%.2f %s/sec)\n",
(int)id,
(int32_t)s_count,
(int64_t)off,
(int64_t)poff,
(int32_t)size,
(int32_t)(now - start) ,
sps );
sps ,
str );
}
@ -16849,7 +16933,7 @@ void dumpCachedRecs (char *coll,int32_t startFileNum,int32_t numFiles,bool inclu
int32_t filenum = 0;
char filename[64];
sprintf(filename, "%s-%"INT32".ddmp", coll, filenum);
int FD = open(filename, O_CREAT|O_WRONLY, S_IROTH);
//int FD = open(filename, O_CREAT|O_WRONLY, S_IROTH);
int32_t numDumped = 0;
uint32_t bytesDumped = 0;
loop:
@ -17016,7 +17100,7 @@ void dumpCachedRecs (char *coll,int32_t startFileNum,int32_t numFiles,bool inclu
filenum++;
sprintf(filename, "%s-%"INT32".ddmp", coll, filenum);
//close(FD);
FD = open(filename, O_CREAT|O_WRONLY, S_IROTH);
//FD = open(filename, O_CREAT|O_WRONLY, S_IROTH);
bytesDumped = 0;
fprintf(stderr, "Started new file: %s. starts at docId: %"INT64".\n",filename, lastDocId);
}

qa.cpp (43 changed lines)

@ -248,10 +248,10 @@ void makeQADir ( ) {
char dir[1024];
snprintf(dir,1000,"%sqa",g_hostdb.m_dir);
log("mkdir mkdir %s",dir);
int32_t status = ::mkdir ( dir ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH );
int32_t status = ::mkdir ( dir ,getDirCreationFlags() );
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH );
if ( status == -1 && errno != EEXIST && errno )
log("qa: Failed to make directory %s: %s.",
dir,mstrerror(errno));
@ -1459,6 +1459,13 @@ bool qaTimeAxis ( ) {
"format=xml&u=");
sb.urlEncode ( s_urlPtrs[s_flags[URL_COUNTER]]);
sb.safePrintf("&hasmime=1");
// add some meta data now, the current time stamp so we can
// make sure the meta data is updated even if it's EDOCUNCHANGED
sb.safePrintf("&metadata=");
static int32_t s_count9 = 0;
SafeBuf tmp;
tmp.safePrintf("{\"qatesttime\":%"INT32"}\n",s_count9++);
sb.urlEncode ( tmp.getBufStart(), tmp.getLength() );
sb.safePrintf("&content=");
sb.urlEncode(s_contentPtrs[contentIndex]);
@ -1494,13 +1501,17 @@ bool qaTimeAxis ( ) {
return false;
}
// if ( ! s_flags[EXAMINE_RESULTS] ) {
// s_flags[16] = true;
// if ( ! getUrl ( "/search?c=qatest123&qa=1&q=%2Bthe"
// "&dsrt=500",
// 702467314 ) )
// return false;
// }
// this doc should have qatesttime:197 and qatesttime:198
// since it had an EDOCUNCHANGED error the 2nd time around but
// different metadata.
if ( ! s_flags[EXAMINE_RESULTS1] ) {
s_flags[EXAMINE_RESULTS1] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&"
"format=json&"
"q=qatesttime:197",
702467314 ) )
return false;
}
return true;
}
@ -1534,6 +1545,8 @@ bool qaWarcFiles ( ) {
"&obeyRobots=0"
// This is what we are testing
"&usetimeaxis=1"
// we are indexing warc files
"&indexwarcs=1"
,
// checksum of reply expected
0 ) )
@ -1638,7 +1651,7 @@ bool qaInjectMetadata ( ) {
char* metadata = "{\"testtest\":42,\"a-hyphenated-name\":5, "
"\"a-string-value\":\"can we search for this\", "
"an array:['a','b', 'c', 1,2,3], "
"\"an array\":[\"a\",\"b\", \"c\", 1,2,3], "
"\"a field with spaces\":6, \"compound\":{\"field\":7}}";
s_flags[ADD_INITIAL_URLS]++;
@ -3401,9 +3414,9 @@ static QATest s_qatests[] = {
"when content has changed, even if the url is the same. "},
{qaWarcFiles,
"indexWarcFiles",
"Ensure the spider handles arc.gz and warc.gz file formats."},
// {qaWarcFiles,
// "indexWarcFiles",
// "Ensure the spider handles arc.gz and warc.gz file formats."},
{qaInjectMetadata,
"injectMetadata",

@ -11,13 +11,14 @@ import sqlite3
import datetime
import sys
import time
import flask
# import flask
import signal, os
import random
from itertools import repeat
staleTime = datetime.timedelta(90,0,0) # three months for now
app = flask.Flask(__name__)
app.secret_key = 'oaisj84alwsdkjhf9238u'
staleTime = datetime.timedelta(7,0,0) # one week for now
# app = flask.Flask(__name__)
# app.secret_key = 'oaisj84alwsdkjhf9238u'
def getDb(makeDates=True):
if makeDates:
@ -33,6 +34,9 @@ def handler(signum, frame):
#Generate environment with:
#pex -r requests -r multiprocessing -e inject:main -o warc-inject -s '.' --no-wheel
#pex -r requests -r multiprocessing -o warc-inject
# see the Makefile
# TODO: add argument parser
# import argparse
# parser = argparse.ArgumentParser()
# parser.add_argument('--foo', help='foo help')
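# one possible shape for it, mirroring the sys.argv checks in main()
# (hypothetical, not wired up):
# sub = parser.add_subparsers(dest='command')
# run = sub.add_parser('run')
# run.add_argument('threads', type=int)
# run.add_argument('query', nargs='?', default='')
# test = sub.add_parser('test')
# test.add_argument('query', nargs='?', default='')
# args = parser.parse_args()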
@ -63,13 +67,16 @@ def reallyExecuteMany(c, query, qargs):
def injectItem(item, db, mode):
itemStart = time.time()
c = db.cursor()
res = reallyExecute(c, 'select * from items where item = ?', (item,)).fetchone()
db.commit()
itemId = None
if res:
if res[1] > (datetime.datetime.now() - staleTime):
print 'skipping %s because we checked recently' % item
return 0 # We checked recently
return time.time() - itemStart # We checked recently
itemId = res[0]
@ -83,7 +90,7 @@ def injectItem(item, db, mode):
except Exception, e:
print 'error: metadata feed went down (%s) for: %s' % (e, item)
time.sleep(10)
if itemId is None:
reallyExecute(c, "insert INTO items VALUES (?,?)", (item, datetime.datetime.now()))
@ -91,11 +98,12 @@ def injectItem(item, db, mode):
db.commit()
if 'files' not in md:
return
return time.time() - itemStart
res = None
res = reallyExecute(c, "select fileName, updated, status, took from files where itemId = ?",
(itemId,)).fetchall()
db.commit()
lastUpdate = {}
for fileName, updated, status, took in res:
@ -105,22 +113,31 @@ def injectItem(item, db, mode):
dbUpdates = []
skipped = 0
for ff in md['files']:
if not ff['name'].endswith('arc.gz'): continue
warcs = filter(lambda x: 'name' in x and x['name'].endswith and x['name'].endswith('arc.gz'), md['files'])
collectionName = md['metadata'].get('archiveit-collection-name', '')
for ii, ff in enumerate(warcs):
#if not ff['name'].endswith('arc.gz'): continue
itemMetadata = {'mtime':ff['mtime']}
updateTime = datetime.datetime.fromtimestamp(float(ff['mtime']))
if ff['name'] in lastUpdate and updateTime <= lastUpdate[ff['name']]:
if mode != 'force' and ff['name'] in lastUpdate and updateTime <= lastUpdate[ff['name']]:
print "skip {0} because it is up to date".format(ff['name'])
skipped += 1
requests.post('http://localhost:10008/progress',
json={'item':item, 'total':len(warcs), 'done':ii+1,
'collection-name':collectionName})
continue
itemMetadata.update(md['metadata'])
postVars = {'url':'http://archive.org/download/%s/%s' %
(item,ff['name']),
'metadata':json.dumps(itemMetadata),
'c':'ait'}
'c':'ait',
'spiderlinks':0}
start = time.time()
if mode == 'production':
if mode == 'testing':
time.sleep(random.randint(1,4))
statusCode = 999
else:
try:
rp = requests.post("http://localhost:8000/admin/inject", postVars)
statusCode = rp.status_code
@ -129,49 +146,60 @@ def injectItem(item, db, mode):
print 'error: gb inject', postVars['url'], e
statusCode = -1
#print postVars['url'], rp.status_code
else:
time.sleep(random.randint(1,4))
statusCode = 999
took = time.time() - start
print "sent", ff['name'],'to gb, took', took
sys.stdout.flush()
dbUpdates.append((itemId, ff['name'], updateTime, statusCode, took))
requests.post('http://localhost:10008/progress',
json={'item':item, 'total':len(warcs), 'done':ii+1,
'collection-name':collectionName})
reallyExecuteMany(c, "DELETE FROM files where fileName = ? ", zip(lastUpdate.iterkeys()))
reallyExecuteMany(c, "INSERT INTO files VALUES (?,?,?,?,?)",
dbUpdates)
db.commit()
if len(dbUpdates):
reallyExecuteMany(c, "DELETE FROM files where fileName = ? ", zip(lastUpdate.iterkeys()))
reallyExecuteMany(c, "INSERT INTO files VALUES (?,?,?,?,?)",
dbUpdates)
db.commit()
print 'completed %s with %s items injected and %s skipped' % (item, len(dbUpdates), skipped)
return time.time() - itemStart
def getPage(zippedArgs):
page, mode = zippedArgs
page, mode, resultsPerPage, extraQuery = zippedArgs
query = 'collection%3Aarchiveitdigitalcollection+' + extraQuery
#r = requests.get('https://archive.org/advancedsearch.php?q=collection%3Aarchiveitdigitalcollection&fl%5B%5D=identifier&rows=1&page={0}&output=json&save=yes'.format(page))
r = requests.get('https://archive.org/advancedsearch.php?q=collection%3Aarchiveitdigitalcollection&fl%5B%5D=identifier&sort[]=date+desc&rows=100&page={0}&output=json&save=yes'.format(page))
if r.status_code != 200:
return 0
url = 'https://archive.org/advancedsearch.php?q={1}&fl%5B%5D=identifier&sort[]=date+asc&rows={2}&page={0}&output=json'.format(page, query, resultsPerPage)
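# e.g. page=2 with rows=100 and no extra query expands to
# https://archive.org/advancedsearch.php?q=collection%3Aarchiveitdigitalcollection+&fl%5B%5D=identifier&sort[]=date+asc&rows=100&page=2&output=json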
try:
r = requests.get(url)
if r.status_code != 200:
return 0
contents = r.content
jsonContents = json.loads(contents)
items = [x['identifier'] for x in jsonContents['response']['docs']]
numFound = jsonContents['response']['numFound']
if len(items) == 0:
print 'got 0 items for search page', page
return 0
print 'loading %s items, %s - %s of %s' % (len(items), items[0], items[-1], numFound)
db = getDb()
for item in items:
injectItem(item, db, mode)
db.close()
return len(items)
contents = r.content
jsonContents = json.loads(contents)
items = [x['identifier'] for x in jsonContents['response']['docs']]
numFound = jsonContents['response']['numFound']
if len(items) == 0:
requests.post('http://localhost:10008/progress', json={'total':numFound, 'completed':'', 'query':extraQuery})
print 'got 0 items for search page', page
return 0
print 'loading %s items, %s - %s of %s' % (len(items), items[0], items[-1], numFound)
for item in items:
db = getDb()
took = injectItem(item, db, mode)
db.close()
requests.post('http://localhost:10008/progress', json={'total':numFound,
'completed':item,
'query':extraQuery,
'took':took})
return len(items)
except Exception, e:
print 'Caught', e, 'sleep and retry', url
time.sleep(60)
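# retries the same page indefinitely via recursion, backing off a
# minute per failed attempt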
return getPage(zippedArgs)
def dumpDb():
@ -197,6 +225,10 @@ def showItems():
def nuke(lastPid, fromOrbit=False):
try:
requests.post('http://localhost:10008/shutdown', {})
except:
pass
sig = signal.SIGTERM
if fromOrbit:
sig = signal.SIGKILL
@ -209,7 +241,7 @@ def nuke(lastPid, fromOrbit=False):
except:
pass
killed = subprocess.Popen("""kill `ps auxx |grep warc-inject|awk -e '{print $2}'`""" % sys.argv[0],
killed = subprocess.Popen("""kill `ps auxx |grep warc-inject|grep -v grep|awk -e '{print $2}'`""",
shell=True,stdout=subprocess.PIPE).communicate()[0]
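# the added grep -v grep keeps the pipeline's own grep process out of
# the kill list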
if killed == 'Terminated':
@ -219,13 +251,47 @@ def nuke(lastPid, fromOrbit=False):
def main():
try:
lastPid = open('running.pid', 'r').read()
except:
lastPid = None
global staleTime
print 'arguments were', sys.argv, 'pid is', os.getpid()
open('running.pid', 'w').write(str(os.getpid()))
if sys.argv[1] != 'monitor':
try:
lastPid = open('running.pid', 'r').read()
except:
lastPid = None
open('running.pid', 'w').write(str(os.getpid()))
# p = multiprocessing.Process(target=serveForever)
# p.start()
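# subcommands handled below: test [query], run <threads> [query],
# monitor, init, force <item>, injectfile <file> <threads>,
# forcefile <file> <threads>, injectitems <file> <threads>, and others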
if sys.argv[1] == 'test':
query = ''
if len(sys.argv) == 3:
query = sys.argv[2]
#subprocess.Popen(['python','inject', 'monitor'])
mode = 'testing'
runInjects(10, 'testing', query)
if sys.argv[1] == 'run':
query = ''
if len(sys.argv) == 4:
query = sys.argv[3]
#subprocess.Popen(['./warc-inject','monitor'])
threads = int(sys.argv[2])
runInjects(threads, 'production', query)
print "done running"
if len(sys.argv) == 2:
if sys.argv[1] == 'monitor':
import monitor
monitor.main()
if sys.argv[1] == 'init':
init()
print 'initialized'
@ -247,6 +313,8 @@ def main():
nuke(lastPid, fromOrbit=True)
if sys.argv[1] == 'test':
subprocess.Popen(['./warc-inject','monitor'])
mode = 'testing'
runInjects(10, 'testing')
@ -308,33 +376,106 @@ def main():
signal.alarm(0) # Disable the alarm
if sys.argv[1] == 'serve':
serveForever()
# if sys.argv[1] == 'serve':
# serveForever()
if len(sys.argv) == 3:
if sys.argv[1] == 'force':
itemName = sys.argv[2]
db = getDb()
injectItem(itemName, db, 'production')
sys.exit(0)
if len(sys.argv) == 4:
if sys.argv[1] == 'injectfile':
staleTime = datetime.timedelta(0,0,0)
from multiprocessing.pool import ThreadPool
fileName = sys.argv[2]
items = filter(lambda x: x, open(fileName, 'r').read().split('\n'))
threads = int(sys.argv[3])
pool = ThreadPool(processes=threads)
#print zip(files, repeat(getDb(), len(files)), repeat('production', len(files)))
def injectItemTupleWrapper(itemName):
db = getDb()
ret = injectItem(itemName, db, 'production')
db.close()
return ret
answer = pool.map(injectItemTupleWrapper, items)
print 'finished: ', answer
sys.exit(0)
if sys.argv[1] == 'forcefile':
staleTime = datetime.timedelta(0,0,0)
from multiprocessing.pool import ThreadPool
fileName = sys.argv[2]
items = filter(lambda x: x, open(fileName, 'r').read().split('\n'))
threads = int(sys.argv[3])
pool = ThreadPool(processes=threads)
#print zip(files, repeat(getDb(), len(files)), repeat('production', len(files)))
def injectItemTupleWrapper(itemName):
db = getDb()
ret = injectItem(itemName, db, 'force')
db.close()
return ret
answer = pool.map(injectItemTupleWrapper, items)
print 'finished: ', answer
sys.exit(0)
if sys.argv[1] == 'injectitems':
from multiprocessing.pool import ThreadPool
fileName = sys.argv[2]
items = filter(lambda x: x, open(fileName, 'r').read().split('\n'))
threads = int(sys.argv[3])
pool = ThreadPool(processes=threads)
#print zip(files, repeat(getDb(), len(files)), repeat('production', len(files)))
def injectItemTupleWrapper(itemName):
db = getDb()
ret = injectItem(itemName, db, 'production')
db.close()
return ret
answer = pool.map(injectItemTupleWrapper, items)
sys.exit(0)
def getNumResults(query):
query = 'collection%3Aarchiveitdigitalcollection+' + query
r = requests.get('https://archive.org/advancedsearch.php?q={0}&fl%5B%5D=identifier&sort[]=date+asc&rows=1&page=0&output=json'.format(query))
if r.status_code != 200:
return 0
contents = r.content
jsonContents = json.loads(contents)
numFound = jsonContents['response']['numFound']
return numFound
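# e.g. getNumResults('') returns the total item count for
# archiveitdigitalcollection, or 0 if the search endpoint errors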
if sys.argv[1] == 'run':
threads = int(sys.argv[2])
runInjects(threads)
# else:
# #getPage(3)
# from multiprocessing.pool import ThreadPool
# pool = ThreadPool(processes=150)
# pool.map(getPage, xrange(1,1300))
def runInjects(threads, mode='production'):
def runInjects(threads, mode='production', query=''):
from multiprocessing.pool import ThreadPool
import math
pool = ThreadPool(processes=threads)
try:
from itertools import repeat
maxPages = 1300
answer = pool.map(getPage, zip(xrange(1,maxPages), repeat(mode, maxPages)))
totalResults = getNumResults(query)
resultsPerPage = 100
maxPages = int(math.ceil(totalResults / float(resultsPerPage)))
if maxPages < threads:
maxPages = threads
resultsPerPage = int(math.ceil(totalResults / float(maxPages)))
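# e.g. 250 results at 100/page -> 3 pages; with 8 threads that becomes
# 8 pages of ceil(250/8) = 32 results each, so every thread gets a page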
print threads, ' threads,', totalResults, 'total,', maxPages, 'pages', resultsPerPage, 'results per page'
answer = pool.map(getPage, zip(xrange(1,maxPages+1),
repeat(mode, maxPages),
repeat(resultsPerPage, maxPages),
repeat(query, maxPages)))
print "finished item pass", answer
except (KeyboardInterrupt, SystemExit):
print 'ok, caught'
raise
requests.post('http://localhost:10008/shutdown', {})
sys.exit(0)
#raise
def init():
@ -351,73 +492,67 @@ def init():
db.close()
def serveForever():
@app.route('/',
methods=['GET', 'POST'], endpoint='home')
def home():
db = getDb(makeDates=False)
res = db.execute('select * from items limit 10')
for item, checked in res.fetchall():
print item
try:
metadata = subprocess.Popen(['./ia','metadata', item],
stdout=subprocess.PIPE).communicate()[0]
# def serveForever():
# @app.route('/',
# methods=['GET', 'POST'], endpoint='home')
# def home():
# db = getDb(makeDates=False)
# res = db.execute('select * from items limit 10')
# for item, checked in res.fetchall():
# print item
# try:
# metadata = subprocess.Popen(['./ia','metadata', item],
# stdout=subprocess.PIPE).communicate()[0]
break
except Exception, e:
pass
db.close()
# break
# except Exception, e:
# pass
# db.close()
# return flask.make_response(metadata)
# @app.route('/progress',
# methods=['GET', 'POST'], endpoint='progress')
# def progress():
# r = requests.get('https://archive.org/advancedsearch.php?q=collection%3Aarchiveitdigitalcollection&fl%5B%5D=identifier&sort[]=date+desc&rows=1&page=1&output=json')
# if r.status_code != 200:
# return flask.make_response(json.dumps({error:'ia search feed is down'}),
# 'application/json')
# contents = r.content
# jsonContents = json.loads(contents)
# numFound = jsonContents['response']['numFound']
# db = getDb()
# examinedItems = db.execute('select count(*) from items').fetchone()
# itemsWithWarc = db.execute('select count(*) from items where ROWID in (select itemId from files where files.status = 200)').fetchone()
# return flask.make_response(json.dumps({'totalItems':numFound,
# 'examinedItems':examinedItems,
# 'itemsWithWarc':itemsWithWarc
# }, indent=4), 'application/json')
# @app.route('/items',
# methods=['GET', 'POST'], endpoint='items')
# def items():
# db = getDb(makeDates=False)
# c = db.cursor()
# res = c.execute("select item, checked from items")
# out = []
# for item, checked in res.fetchall():
# out.append({'item':item, 'checked':checked})
# db.close()
return flask.make_response('hihih' + metadata)
# return flask.make_response(json.dumps(out), 'application/json')
@app.route('/progress',
methods=['GET', 'POST'], endpoint='progress')
def progress():
r = requests.get('https://archive.org/advancedsearch.php?q=collection%3Aarchiveitdigitalcollection&fl%5B%5D=identifier&sort[]=date+desc&rows=1&page=1&output=json')
if r.status_code != 200:
return flask.make_response(json.dumps({error:'ia search feed is down'}),
'application/json')
contents = r.content
jsonContents = json.loads(contents)
numFound = jsonContents['response']['numFound']
db = getDb()
examinedItems = db.execute('select count(*) from items').fetchone()
itemsWithWarc = db.execute('select count(*) from items where ROWID in (select itemId from files where files.status = 200)').fetchone()
return flask.make_response(json.dumps({'totalItems':numFound,
'examinedItems':examinedItems,
'itemsWithWarc':itemsWithWarc
}, indent=4), 'application/json')
@app.route('/items',
methods=['GET', 'POST'], endpoint='items')
def items():
db = getDb(makeDates=False)
c = db.cursor()
res = c.execute("select item, checked from items")
out = []
for item, checked in res.fetchall():
out.append({'item':item, 'checked':checked})
db.close()
return flask.make_response(json.dumps(out), 'application/json')
app.run('0.0.0.0',
port=7999,
debug=True,
use_reloader=True,
use_debugger=True)
# app.run('0.0.0.0',
# port=7999,
# debug=False,
# use_reloader=False,
# use_debugger=False)
if __name__ == '__main__':

script/inject/monitor.py Normal file

File diff suppressed because one or more lines are too long

@ -100,7 +100,7 @@ def getSplitTime():
def copyToTwins(fname):
def copyToTwins(fname, backToFront=False):
fh = open(fname, 'r')
ret = {}
hosts = []
@ -117,23 +117,25 @@ def copyToTwins(fname):
continue
#print directory, ip1, note
step = len(hosts)/2
hostPlex = {}
someIp = None
cmds = []
for hostId, dnsPort, httpsPort, httpPort, udbPort,ip1, ip2, directory, note in hosts[:step]:
if ip1 not in hostPlex:
hostPlex[ip1] = []
someIp = ip1
hostPlex[ip1].append('scp -r %s:%s* %s:%s. ' % (ip1, directory, (hosts[hostId + step][5]), (hosts[hostId + step][7])))
backHostId, backDnsPort, backHttpsPort, backHttpPort, backUdbPort,backIp1, backIp2, backDirectory, backNote = hosts[hostId + step]
if note != directory:
print 'oh looks like you overlooked host %s' % hostId
if backNote != backDirectory:
print 'oh looks like you overlooked host %s' % backHostId
if backToFront:
cmd = 'scp -r %s:%s* %s:%s. &' % (backIp1, backDirectory, ip1, directory )
else:
cmd = 'scp -r %s:%s* %s:%s. &' % (ip1, directory, backIp1, backDirectory)
cmds.append(cmd)
#print 'scp -r %s:%s* %s:%s. &' % (ip1, directory, (hosts[hostId + step][5]), (hosts[hostId + step][7]))
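# a produced command looks like (ips and dirs hypothetical):
# scp -r 10.5.0.2:/home/gb/host2* 10.5.0.18:/home/gb/host18. &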
while len(hostPlex[someIp]) > 0:
cmd = []
for cmd in cmds:
print cmd
for ip in hostPlex.iterkeys():
cmd.append(hostPlex[ip].pop())
#print hostPlex[ip].pop()
print '&\n'.join(cmd), ';'
def testDiskSpeed(host, directory):

Binary file not shown.