merge from master branch to diffbot-kevin

Commit 2eb106aaf5

Changed files:
BigFile.cpp, Collectiondb.cpp, Collectiondb.h, File.cpp, File.h, HashTableX.h, Hostdb.cpp, Hostdb.h, HttpServer.cpp, Json.cpp, Json.h, Linkdb.cpp, Linkdb.h, Loop.cpp, Makefile, Msg13.cpp, Msg3.cpp, Msg39.cpp, Msg39.h, Msg3a.cpp, Msg40.cpp, Msg40.h, PageCrawlBot.cpp, PageGet.cpp, PageHosts.cpp, PageReindex.cpp, PageResults.cpp, PageResults.h, PageRoot.cpp, PageSockets.cpp, Pages.cpp, Parms.cpp, Parms.h, PingServer.cpp, Posdb.cpp, Posdb.h, Process.cpp, Query.cpp, Query.h, README.md, Rdb.cpp, RdbBase.cpp, RdbBuckets.cpp, RdbBuckets.h, RdbDump.cpp, Repair.cpp, SearchInput.cpp, Spider.cpp, Spider.h, Tagdb.cpp, Test.cpp, UdpServer.cpp, UdpServer.h, UdpSlot.cpp, UdpSlot.h, Url.h, Xml.cpp, Xml.h, XmlDoc.cpp, XmlDoc.h, XmlNode.cpp, XmlNode.h
html/: blog.html, faq.html, news.html, ss_filters.png, ss_filters_thumb.png, ss_hosts.png, ss_hosts_thumb.png, ss_settings.png, ss_settings_thumb.png
main.cpp, mysynonyms.txt, qa.cpp

BigFile.cpp
@@ -386,7 +386,9 @@ bool BigFile::readwrite ( void *buf ,
// had negative offsets, bad engineer
if ( offset < 0 ) {
log(LOG_LOGIC,"disk: readwrite() offset is %"INT64" "
"< 0. dumping core.",offset);
"< 0. filename=%s/%s. dumping core. try deleting "
"the .map file for it and restarting.",offset,
m_dir,m_baseFilename);
char *xx = NULL; *xx = 0;
}
// if we're not blocking use a fake fstate
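The `char *xx = NULL; *xx = 0;` line above is the codebase's usual way of forcing an immediate segfault so the process leaves a core dump at the exact failure point. A minimal sketch of that log-then-crash idiom (function name and message are illustrative, not from the source):

#include <cstdio>
#include <cstdint>

// Log a fatal condition, then dereference NULL on purpose so the OS
// writes a core file right here instead of letting corrupt state spread.
static void fatalAt ( const char *msg , int64_t offset ) {
        fprintf ( stderr , "disk: %s offset=%lld. dumping core.\n" ,
                  msg , (long long)offset );
        char *xx = NULL; *xx = 0; // intentional crash for the core dump
}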
Collectiondb.cpp (153)
@@ -191,7 +191,8 @@ bool Collectiondb::cleanTrees ( ) {
|
||||
//r = g_indexdb.getRdb();
|
||||
//r->m_tree.cleanTree ((char **)r->m_bases);
|
||||
r = g_posdb.getRdb();
|
||||
r->m_tree.cleanTree ();//(char **)r->m_bases);
|
||||
//r->m_tree.cleanTree ();//(char **)r->m_bases);
|
||||
r->m_buckets.cleanBuckets();
|
||||
//r = g_datedb.getRdb();
|
||||
//r->m_tree.cleanTree ((char **)r->m_bases);
|
||||
|
||||
@ -284,6 +285,10 @@ bool Collectiondb::addExistingColl ( char *coll, collnum_t collnum ) {
|
||||
|
||||
if ( ! registerCollRec ( cr , false ) ) return false;
|
||||
|
||||
// always index spider status docs now for custom crawls
|
||||
if ( cr->m_isCustomCrawl )
|
||||
cr->m_indexSpiderReplies = true;
|
||||
|
||||
// we need to compile the regular expressions or update the url
|
||||
// filters with new logic that maps crawlbot parms to url filters
|
||||
return cr->rebuildUrlFilters ( );
|
||||
@ -476,6 +481,8 @@ bool Collectiondb::addNewColl ( char *coll ,
|
||||
|
||||
|
||||
if ( customCrawl ) {
|
||||
// always index spider status docs now
|
||||
cr->m_indexSpiderReplies = true;
|
||||
// remember the token
|
||||
cr->m_diffbotToken.set ( token );
|
||||
cr->m_diffbotCrawlName.set ( crawl );
|
||||
@ -1702,6 +1709,8 @@ CollectionRec::CollectionRec() {
|
||||
// m_spiderQuotas[i] = -1;
|
||||
memset( m_spiderPriorities, 0,
|
||||
MAX_FILTERS*sizeof(*m_spiderPriorities) );
|
||||
memset ( m_harvestLinks,0,MAX_FILTERS);
|
||||
memset ( m_forceDelete,0,MAX_FILTERS);
|
||||
//memset( m_rulesets, 0, MAX_FILTERS*sizeof(*m_rulesets) );
|
||||
//for ( int i = 0; i < MAX_SEARCH_PASSWORDS; i++ ) {
|
||||
// *(m_searchPwds[i]) = '\0';
|
||||
@ -2071,6 +2080,11 @@ bool CollectionRec::countEvents ( ) {
|
||||
*/
|
||||
|
||||
bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
|
||||
// tell spider loop to update active list
|
||||
g_spiderLoop.m_activeListValid = false;
|
||||
|
||||
|
||||
bool rebuild = true;
|
||||
if ( m_numRegExs == 0 )
|
||||
rebuild = true;
|
||||
@ -2106,9 +2120,6 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
// addDefault = true;
|
||||
if ( ! rebuild ) return true;
|
||||
|
||||
// tell spider loop to update active list
|
||||
g_spiderLoop.m_activeListValid = false;
|
||||
|
||||
|
||||
if ( !strcmp(s,"shallow" ) )
|
||||
return rebuildShallowRules();
|
||||
@ -2177,7 +2188,8 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = -3; // delete!
|
||||
m_spiderPriorities [n] = 100; // delete!
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
// if not in the site list then nuke it
|
||||
@ -2187,7 +2199,8 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = -3; // delete!
|
||||
m_spiderPriorities [n] = 100;
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=3 && hastmperror");
|
||||
@ -2196,7 +2209,8 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_maxSpidersPerRule [n] = 1; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 3;
|
||||
m_spiderPriorities [n] = 100;
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=1 && hastmperror");
|
||||
@ -2221,6 +2235,32 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
// 20+ unique c block parent request urls means it is important!
|
||||
m_regExs[n].set("numinlinks>7 && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 52;
|
||||
if ( ! strcmp(s,"news") )
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
// 20+ unique c block parent request urls means it is important!
|
||||
m_regExs[n].set("numinlinks>7");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 51;
|
||||
if ( ! strcmp(s,"news") )
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
|
||||
|
||||
m_regExs[n].set("hopcount==0 && iswww && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
@ -2265,6 +2305,55 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
|
||||
m_regExs[n].set("isparentrss && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 45;
|
||||
if ( ! strcmp(s,"news") )
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("isparentsitemap && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 44;
|
||||
if ( ! strcmp(s,"news") )
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
|
||||
m_regExs[n].set("isparentrss");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 43;
|
||||
if ( ! strcmp(s,"news") )
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("isparentsitemap");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 7; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 42;
|
||||
if ( ! strcmp(s,"news") )
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
|
||||
|
||||
|
||||
m_regExs[n].set("hopcount==1 && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0;
|
||||
@ -2379,6 +2468,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_numRegExs5 = n;
|
||||
m_numRegExs6 = n;
|
||||
m_numRegExs8 = n;
|
||||
m_numRegExs7 = n;
|
||||
|
||||
// more rules
|
||||
|
||||
@ -2414,7 +2504,8 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = -3; // delete!
|
||||
m_spiderPriorities [n] = 100; // delete!
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
// if not in the site list then nuke it
|
||||
@ -2424,7 +2515,8 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = -3; // delete!
|
||||
m_spiderPriorities [n] = 100; // delete!
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=3 && hastmperror");
|
||||
@ -2433,7 +2525,8 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_maxSpidersPerRule [n] = 1; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 3;
|
||||
m_spiderPriorities [n] = 100;
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=1 && hastmperror");
|
||||
@ -2794,6 +2887,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_numRegExs5 = n;
|
||||
m_numRegExs6 = n;
|
||||
m_numRegExs8 = n;
|
||||
m_numRegExs7 = n;
|
||||
|
||||
// done rebuilding CHINESE rules
|
||||
return true;
|
||||
@ -2818,7 +2912,8 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = -3; // delete!
|
||||
m_spiderPriorities [n] = 100; // delete!
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
// if not in the site list then nuke it
|
||||
@ -2828,7 +2923,8 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = -3; // delete!
|
||||
m_spiderPriorities [n] = 100; // delete!
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=3 && hastmperror");
|
||||
@ -2837,7 +2933,8 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_maxSpidersPerRule [n] = 1; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 3;
|
||||
m_spiderPriorities [n] = 100;
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=1 && hastmperror");
|
||||
@ -3012,6 +3109,7 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_numRegExs5 = n;
|
||||
m_numRegExs6 = n;
|
||||
m_numRegExs8 = n;
|
||||
m_numRegExs7 = n;
|
||||
|
||||
// done rebuilding SHALLOW rules
|
||||
return true;
|
||||
@ -3388,6 +3486,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
m_spiderFreqs [i] = respiderFreq;
|
||||
//m_spiderDiffbotApiUrl[i].purge();
|
||||
m_harvestLinks[i] = true;
|
||||
m_forceDelete [i] = false;
|
||||
}
|
||||
|
||||
int32_t i = 0;
|
||||
@ -3400,7 +3499,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
|
||||
// 2nd default url
|
||||
m_regExs[i].set("ismedia && !ismanualadd");
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
m_maxSpidersPerRule [i] = 0;
|
||||
m_spiderPriorities [i] = 100; // delete!
|
||||
m_forceDelete [i] = 1;
|
||||
i++;
|
||||
|
||||
// hopcount filter if asked for
|
||||
@ -3418,7 +3519,10 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
m_regExs[i].set(hopcountStr);
|
||||
|
||||
// means DELETE :
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
|
||||
|
||||
// just don't spider
|
||||
m_maxSpidersPerRule[i] = 0;
|
||||
|
||||
// compatibility with m_spiderRoundStartTime:
|
||||
m_spiderFreqs[i] = 0.0;
|
||||
@ -3439,7 +3543,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
// MDW: even if they supplied a crawl pattern let's restrict to seed
|
||||
// domains 12/15/14
|
||||
m_regExs[i].set("!isonsamedomain && !ismanualadd");
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
m_maxSpidersPerRule [i] = 0;
|
||||
m_spiderPriorities [i] = 100; // delete!
|
||||
m_forceDelete [i] = 1;
|
||||
i++;
|
||||
//}
|
||||
|
||||
@ -3452,7 +3558,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
// only negative patterns then restrict to domains of seeds
|
||||
if ( ucp && ! ucpHasPositive && ! m_hasucr ) {
|
||||
m_regExs[i].set("!isonsamedomain && !ismanualadd");
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
m_maxSpidersPerRule [i] = 0;
|
||||
m_spiderPriorities [i] = 100; // delete!
|
||||
m_forceDelete [i] = 1;
|
||||
i++;
|
||||
}
|
||||
|
||||
@ -3478,7 +3586,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
|
||||
// excessive errors? (tcp/dns timed out, etc.) retry once per month?
|
||||
m_regExs[i].set("errorcount>=3 && hastmperror");
|
||||
m_spiderPriorities [i] = 30;
|
||||
m_spiderPriorities [i] = 3;
|
||||
m_spiderFreqs [i] = 30; // 30 days
|
||||
// if bulk job, do not download a url more than 3 times
|
||||
if ( m_isCustomCrawl == 2 ) m_maxSpidersPerRule [i] = 0;
|
||||
@ -3556,7 +3664,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
i++;
|
||||
// do not crawl anything else
|
||||
m_regExs[i].set("default");
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
|
||||
// don't spider
|
||||
m_maxSpidersPerRule[i] = 0;
|
||||
// this needs to be zero so &spiderRoundStart=0
|
||||
// functionality which sets m_spiderRoundStartTime
|
||||
// to the current time works
|
||||
@ -3576,7 +3686,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
i++;
|
||||
// do not crawl anything else
|
||||
m_regExs[i].set("default");
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
|
||||
// don't delete, just don't spider
|
||||
m_maxSpidersPerRule[i] = 0;
|
||||
// this needs to be zero so &spiderRoundStart=0
|
||||
// functionality which sets m_spiderRoundStartTime
|
||||
// to the current time works
|
||||
@ -3630,6 +3742,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
m_numRegExs6 = i;
|
||||
//m_numRegExs7 = i;
|
||||
m_numRegExs8 = i;
|
||||
m_numRegExs7 = i;
|
||||
//m_numRegExs11 = i;
|
||||
|
||||
|
||||
|
Collectiondb.h
@@ -814,6 +814,9 @@ class CollectionRec {
int32_t m_numRegExs8;
char m_harvestLinks [ MAX_FILTERS ];

int32_t m_numRegExs7;
char m_forceDelete [ MAX_FILTERS ];

// dummy?
int32_t m_numRegExs9;
File.cpp (96)
@@ -74,6 +74,8 @@ File::File ( ) {
|
||||
// threaded unlink sets this to true before spawning thread so we
|
||||
// do not try to open it!
|
||||
//m_gone = 0;
|
||||
m_nextActive = NULL;
|
||||
m_prevActive = NULL;
|
||||
}
|
||||
|
||||
|
||||
@ -129,6 +131,52 @@ bool File::rename ( char *newFilename ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
static File *s_activeHead = NULL;
|
||||
static File *s_activeTail = NULL;
|
||||
|
||||
void rmFileFromLinkedList ( File *f ) {
|
||||
// excise from linked list of active files
|
||||
if ( s_activeHead == f )
|
||||
s_activeHead = f->m_nextActive;
|
||||
if ( s_activeTail == f )
|
||||
s_activeTail = f->m_prevActive;
|
||||
if ( f->m_prevActive )
|
||||
f->m_prevActive->m_nextActive = f->m_nextActive;
|
||||
if ( f->m_nextActive )
|
||||
f->m_nextActive->m_prevActive = f->m_prevActive;
|
||||
// and so we do not try to re-excise it
|
||||
f->m_prevActive = NULL;
|
||||
f->m_nextActive = NULL;
|
||||
}
|
||||
|
||||
void addFileToLinkedList ( File *f ) {
|
||||
// must not be in there already, lest we double add it
|
||||
if ( f->m_nextActive ) return;
|
||||
if ( f->m_prevActive ) return;
|
||||
if ( s_activeHead == f ) return;
|
||||
|
||||
f->m_nextActive = NULL;
|
||||
f->m_prevActive = NULL;
|
||||
if ( ! s_activeTail ) {
|
||||
s_activeHead = f;
|
||||
s_activeTail = f;
|
||||
return;
|
||||
}
|
||||
// insert at end of linked list otherwise
|
||||
s_activeTail->m_nextActive = f;
|
||||
f->m_prevActive = s_activeTail;
|
||||
s_activeTail = f;
|
||||
}
|
||||
|
||||
// update linked list
|
||||
void promoteInLinkedList ( File *f ) {
|
||||
rmFileFromLinkedList ( f );
|
||||
addFileToLinkedList ( f );
|
||||
}
|
||||
*/
|
||||
|
||||
// . open the file
|
||||
// . only call once per File after calling set()
|
||||
bool File::open ( int flags , int permissions ) {
|
||||
@ -200,6 +248,8 @@ int File::write ( void *buf ,
|
||||
else n = pwrite ( fd , buf , numBytesToWrite , offset );
|
||||
// valgrind
|
||||
if ( n < 0 && errno == EINTR ) goto retry21;
|
||||
// update linked list
|
||||
//promoteInLinkedList ( this );
|
||||
// copy errno to g_errno
|
||||
if ( n < 0 ) g_errno = errno;
|
||||
// cancel blocking errors - not really errors
|
||||
@ -228,6 +278,8 @@ int File::read ( void *buf ,
|
||||
else n = pread ( fd , buf , numBytesToRead , offset );
|
||||
// valgrind
|
||||
if ( n < 0 && errno == EINTR ) goto retry9;
|
||||
// update linked list
|
||||
//promoteInLinkedList ( this );
|
||||
// copy errno to g_errno
|
||||
if ( n < 0 ) g_errno = errno;
|
||||
// cancel blocking errors - not really errors
|
||||
@ -340,6 +392,8 @@ void File::close2 ( ) {
|
||||
"This should never happen. vfd=%i fd=%i.", m_vfd,fd);
|
||||
return;
|
||||
}
|
||||
// excise from linked list of active files
|
||||
//rmFileFromLinkedList ( this );
|
||||
// mark this virtual file descriptor as available.
|
||||
s_fds [ m_vfd ] = -2;
|
||||
// no more virtual file descriptor
|
||||
@ -407,6 +461,8 @@ bool File::close ( ) {
|
||||
}
|
||||
// otherwise decrease the # of open files
|
||||
s_numOpenFiles--;
|
||||
// excise from linked list of active files
|
||||
//rmFileFromLinkedList ( this );
|
||||
// return true blue
|
||||
return true;
|
||||
}
|
||||
@ -524,6 +580,8 @@ int File::getfd () {
|
||||
s_unlinking [ m_vfd ] = 0;
|
||||
// update the time stamp
|
||||
s_timestamps [ m_vfd ] = gettimeofdayInMillisecondsLocal();
|
||||
// add file to linked list of active files
|
||||
//addFileToLinkedList ( this );
|
||||
return fd;
|
||||
}
|
||||
|
||||
@ -535,6 +593,31 @@ bool File::closeLeastUsed () {
|
||||
int mini = -1;
|
||||
int64_t now = gettimeofdayInMillisecondsLocal();
|
||||
|
||||
/*
|
||||
// use the new linked list of active file descriptors
|
||||
// . file at tail is the most active
|
||||
File *f = s_activeHead;
|
||||
|
||||
// if nothing to do return true
|
||||
if ( ! f ) return true;
|
||||
|
||||
// close the head if not writing
|
||||
for ( ; f ; f = f->m_nextActive ) {
|
||||
mini = f->m_vfd;
|
||||
// how can this be?
|
||||
if ( s_fds [ mini ] < 0 ) { char *xx=NULL;*xx=0; }
|
||||
if ( s_writing [ mini ] ) continue;
|
||||
if ( s_unlinking [ mini ] ) continue;
|
||||
// when we got like 1000 reads queued up, it uses a *lot* of
|
||||
// memory and we can end up never being able to complete a
|
||||
// read because the descriptors are always getting closed on us
|
||||
// so do a hack fix and do not close descriptors that are
|
||||
// about .5 seconds old on avg.
|
||||
if ( s_timestamps [ mini ] >= now - 1 ) continue;
|
||||
break;
|
||||
}
|
||||
*/
|
||||
|
||||
// get the least used of all the actively opened file descriptors.
|
||||
// we can't get files that were opened for writing!!!
|
||||
int i;
|
||||
@ -603,8 +686,19 @@ bool File::closeLeastUsed () {
|
||||
// we're just conserving file descriptors
|
||||
s_fds [ mini ] = -1;
|
||||
|
||||
|
||||
|
||||
// if the real close was successful then decrement the # of open files
|
||||
if ( status == 0 ) s_numOpenFiles--;
|
||||
if ( status == 0 ) {
|
||||
s_numOpenFiles--;
|
||||
// excise from linked list of active files
|
||||
//rmFileFromLinkedList ( f );
|
||||
// getfd() may not execute in time to ince the closeCount
|
||||
// so do it here. test by setting the max open files to like
|
||||
// 10 or so and spidering heavily.
|
||||
//s_closeCounts [ fd ]++;
|
||||
}
|
||||
|
||||
|
||||
if ( status == -1 )
|
||||
return log("disk: close(%i) : %s", fd , strerror(errno));
|
||||
|
File.h (6)
@@ -22,7 +22,8 @@
// . max # of VIRTUAL file descriptors
// . man, chris has 958 files, lets crank it up from 2k to 5k
// . boost up to 50,000 since we are hitting this limit with crawlbot
#define MAX_NUM_VFDS (50*1024)
// . we are hitting again with crawlbot, boost to 200k from 50k
#define MAX_NUM_VFDS (200*1024)

#include <sys/types.h> // for open/lseek
#include <sys/stat.h> // for open
@@ -182,6 +183,9 @@ class File {
time_t m_st_mtime; // file last mod date
int32_t m_st_size; // file size
time_t getLastModifiedDate ( ) ;

class File *m_nextActive;
class File *m_prevActive;
};
HashTableX.h (18)
@@ -144,13 +144,19 @@ class HashTableX {

bool addTerm144 ( key144_t *kp , int32_t score = 1 ) {

/*
// debug XmlDoc.cpp's hash table
//int64_t termId = ((key144_t *)kp)->n2 >> 16;
//if ( termId == 59194288760543LL ) {
// log("got it");
// char *xx=NULL;*xx=0;
//}

int64_t termId = ((key144_t *)kp)->n2 >> 16;
uint64_t d = 0LL;
d = ((unsigned char *)kp)[11];
d <<= 32;
d |= *(uint32_t *)(((unsigned char *)kp)+7);
d >>= 2;
if ( d==110324895284 && termId == 39206941907955LL ) {
log("got it");
char *xx=NULL;*xx=0;
}
*/
// grow it!
if ( (m_numSlots < 20 || 4 * m_numSlotsUsed >= m_numSlots) &&
m_numSlots < m_maxSlots ) {
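The growth check above (`m_numSlots < 20 || 4 * m_numSlotsUsed >= m_numSlots`) grows the table once it is roughly a quarter full, keeping probe chains short for the large posdb keys. A minimal sketch of that load-factor test, using hypothetical names rather than the real HashTableX API:

#include <cstdint>

// Grow when the table is tiny or at least 25% occupied, but never past maxSlots.
static bool needsGrowth ( int32_t numSlots , int32_t numSlotsUsed , int32_t maxSlots ) {
        if ( numSlots >= maxSlots ) return false;   // already at the cap
        if ( numSlots < 20 ) return true;           // tiny tables always grow
        return 4 * numSlotsUsed >= numSlots;        // load factor >= 1/4
}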
Hostdb.cpp
@@ -66,6 +66,7 @@ Hostdb::Hostdb ( ) {
m_crcValid = false;
m_crc = 0;
m_created = false;
m_myHost = NULL;
}

Hostdb::~Hostdb () {
@@ -108,6 +109,7 @@ bool Hostdb::init ( int32_t hostIdArg , char *netName ,
m_myIp = 0;
m_myIpShotgun = 0;
m_myPort = 0;
m_myHost = NULL;
//m_myPort2 = 0;
m_numHosts = 0;
m_numHostsPerShard = 0;
@@ -1833,7 +1835,7 @@ bool Hostdb::replaceHost ( int32_t origHostId, int32_t spareHostId ) {
oldHost->m_emailCode = 0;
oldHost->m_wasAlive = false;
oldHost->m_pingInfo.m_etryagains = 0;
oldHost->m_pingInfo.m_udpSlotsInUse = 0;
oldHost->m_pingInfo.m_udpSlotsInUseIncoming = 0;
oldHost->m_pingInfo.m_totalResends = 0;
oldHost->m_errorReplies = 0;
oldHost->m_dgramsTo = 0;
Hostdb.h (4)
@@ -106,7 +106,7 @@ class PingInfo {
int32_t m_totalResends;
int32_t m_etryagains;

int32_t m_udpSlotsInUse;
int32_t m_udpSlotsInUseIncoming;
int32_t m_tcpSocketsInUse;

int16_t m_currentSpiders;
@@ -115,7 +115,7 @@ class PingInfo {
char m_gbVersionStr[21];
char m_repairMode;
char m_kernelErrors;

uint8_t m_recoveryLevel;
};

class Host {
HttpServer.cpp
@@ -1026,6 +1026,7 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
// "GET /crawlbot/downloadpages"
if ( strncmp ( path , "/crawlbot/download/" ,19 ) == 0 ||
// add 4 to length of needle to account for /vXX.
// GET /v3/crawl/download/
(pathLen >= 20 && strnstr(path, "/crawl/download/", 20)) ||
(pathLen >= 19 && strnstr(path, "/bulk/download/", 19)) )
return sendBackDump ( s , r );
@@ -1243,8 +1244,8 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
return sendPagePretty ( s , r , "about.html","about" );

// decorate the plain html page, news.html, with our nav chrome
if ( ! strncmp ( path ,"/news.html", pathLen ) )
return sendPagePretty ( s , r , "news.html", "news");
if ( ! strncmp ( path ,"/blog.html", pathLen ) )
return sendPagePretty ( s , r , "blog.html", "blog");

// decorate the plain html page with our nav chrome
if ( ! strncmp ( path ,"/searchfeed.html", pathLen ) )
@@ -2340,7 +2341,7 @@ int32_t getMsgSize ( char *buf, int32_t bufSize, TcpSocket *s ) {
// /admin/basic etc
if ( pp + 7 < ppend && strncmp ( pp ,"/admin/",7)==0)
max = 0x7fffffff;
// bulk job. /v2/bulk
// bulk job. /v2/bulk or /v3/crawl/download/token-name...
if ( pp + 4 < ppend && strncmp ( pp ,"/v",2)==0 &&
// /v2/bulk
( ( pp[4] == 'b' && pp[5] == 'u' ) ||
Json.cpp (30)
@@ -233,6 +233,7 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , int32_t niceness ) {
// json must start with { or [ i guess
// otherwise getFirstItem() won't work!
if ( m_sb.m_length==0 ) {
log("json: length is 0");
g_errno = EBADJSONPARSER;
return NULL;
}
@@ -294,10 +295,12 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , int32_t niceness ) {
// what is the length of it?
int32_t slen = 4;
ji->m_valueLong = 1;
ji->m_value64 = 1;
ji->m_valueDouble = 1.0;
if ( *p == 'f' ) {
slen = 5;
ji->m_valueLong = 0;
ji->m_value64 = 0;
ji->m_valueDouble = 0;
}
// store decoded string right after jsonitem
@@ -342,6 +345,7 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , int32_t niceness ) {
//char c = str[slen];
//str[slen] = '\0';
ji->m_valueLong = atol(str);
ji->m_value64 = atoll(str);
ji->m_valueDouble = atof(str);
// copy the number as a string as well
int32_t curr = m_sb.length();
@@ -367,7 +371,11 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , int32_t niceness ) {

// for testing if we realloc
char *memEnd = m_sb.getBufStart();
if ( mem != memEnd ) { char *xx=NULL;*xx=0; }

// bitch if we had to do a realloc. should never happen but i
// saw it happen once, so do not core on that.
if ( mem != memEnd )
log("json: json parser reallocated buffer. inefficient.");

return (JsonItem *)m_sb.getBufStart();
}
@@ -465,14 +473,26 @@ char *JsonItem::getValueAsString ( int32_t *valueLen ) {
}

// numbers...
static char s_numBuf[64];
// seems like when this overflowed when it was 64 bytes
// it went into s_vbuf in Version.cpp
static char s_numBuf[256];
if ( (float)m_valueLong == m_valueDouble ) {
*valueLen = sprintf ( s_numBuf,"%"INT32"", m_valueLong );
*valueLen = snprintf ( s_numBuf,255,"%"INT32"", m_valueLong );
return s_numBuf;
}

*valueLen = sprintf ( s_numBuf,"%f", m_valueDouble );
return s_numBuf;
if ( (double)m_value64 == m_valueDouble ) {
*valueLen = snprintf ( s_numBuf,255,"%"INT64"", m_value64 );
return s_numBuf;
}

// otherwise return the number as it was written in the json
// because it might have too many digits for printing as a double
*valueLen = m_valueLen;
return (char *)this + sizeof(JsonItem);

// *valueLen = snprintf ( s_numBuf,255,"%f", m_valueDouble );
// return s_numBuf;
}

bool endsInCurly ( char *s , int32_t slen ) {
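The getValueAsString() change above prints the number from the stored 64-bit integer when it round-trips exactly through the double, and otherwise echoes the digits as they appeared in the JSON so no precision is lost. A small self-contained sketch of that decision, with hypothetical names standing in for the real JsonItem API:

#include <cstdio>
#include <cstdint>

// Decide how to print a parsed JSON number: as an integer if the 64-bit
// value is exact, otherwise fall back to the original text from the input.
static const char *numberToText ( int64_t v64 , double vd ,
                                  const char *rawText , char *buf , size_t bufLen ) {
        if ( (double)v64 == vd ) {                 // integer representable exactly
                snprintf ( buf , bufLen , "%lld" , (long long)v64 );
                return buf;
        }
        return rawText;                            // keep every digit the author wrote
}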
Json.h (1)
@@ -34,6 +34,7 @@ class JsonItem {

// for JT_NUMBER
int32_t m_valueLong;
int64_t m_value64;
// for JT_NUMBER
double m_valueDouble;
Linkdb.cpp (59)
@@ -5068,27 +5068,51 @@ bool Links::set ( bool useRelNoFollow ,
|
||||
// . continue if this tag ain't an <a href> tag
|
||||
// . atom feeds have a <link href=""> field in them
|
||||
int32_t id = xml->getNodeId ( i );
|
||||
|
||||
int32_t slen;
|
||||
char *s ;
|
||||
// reset
|
||||
linkflags_t flags = 0;
|
||||
|
||||
/*
|
||||
MDW: now we set m_nodeId properly to TAG_LINK even in
|
||||
pure xml docs
|
||||
if ( xml->m_pureXml ) {
|
||||
// if it's a back tag continue
|
||||
if ( xml->isBackTag ( i ) ) continue;
|
||||
// must be a <> tag not innerhtml of tag
|
||||
if ( xml->m_nodes[i].m_nodeId != TAG_XMLTAG ) continue;
|
||||
// must be <link> i guess
|
||||
if ( xml->m_nodes[i].m_tagNameLen != 4 ) continue;
|
||||
if ( strncmp ( xml->m_nodes[i].m_tagName , "link" , 4))
|
||||
continue;
|
||||
// pure xml does not have ids like this so force it
|
||||
id = TAG_LINK;
|
||||
goto gotOne;
|
||||
}
|
||||
*/
|
||||
|
||||
if ( id != TAG_A &&
|
||||
id != TAG_LINK &&
|
||||
id != TAG_LINK && // rss feed url
|
||||
id != TAG_LOC && // sitemap.xml url
|
||||
id != TAG_AREA &&
|
||||
id != TAG_ENCLOSURE &&
|
||||
id != TAG_WEBLOG &&
|
||||
id != TAG_URLFROM && // <UrlFrom> for ahrefs.com
|
||||
id != TAG_FBORIGLINK )
|
||||
continue;
|
||||
|
||||
//gotOne:
|
||||
|
||||
urlattr = "href";
|
||||
if ( id == TAG_WEBLOG ) urlattr ="url";
|
||||
if ( id == TAG_FBORIGLINK ) m_isFeedBurner = true;
|
||||
|
||||
// if it's a back tag continue
|
||||
if ( xml->isBackTag ( i ) ) continue;
|
||||
// reset
|
||||
linkflags_t flags = 0;
|
||||
// . if it has rel=nofollow then ignore it
|
||||
// . for old titleRecs we should skip this part so that the
|
||||
// link: terms are indexed/hashed the same way in XmlDoc.cpp
|
||||
int32_t slen;
|
||||
char *s ;
|
||||
if ( useRelNoFollow ) s = xml->getString ( i , "rel", &slen ) ;
|
||||
if ( useRelNoFollow &&
|
||||
slen==8 && // ASCII
|
||||
@ -5112,6 +5136,7 @@ bool Links::set ( bool useRelNoFollow ,
|
||||
// follow, like in an rss feed.
|
||||
if ( linkLen==0 &&
|
||||
(id == TAG_LINK ||
|
||||
id == TAG_LOC || // sitemap.xml urls
|
||||
id == TAG_URLFROM ||
|
||||
id == TAG_FBORIGLINK) ) {
|
||||
// the the <link> node
|
||||
@ -5343,6 +5368,30 @@ bool Links::set ( char *buf , int32_t niceness ) { //char *coll,int32_t nicenes
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Links::print ( SafeBuf *sb ) {
|
||||
sb->safePrintf(
|
||||
"<table cellpadding=3 border=1>\n"
|
||||
"<tr>"
|
||||
"<td>#</td>"
|
||||
"<td colspan=40>"
|
||||
// table header row
|
||||
"Outlink"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
);
|
||||
// find the link point to our url
|
||||
int32_t i;
|
||||
for ( i = 0 ; i < m_numLinks ; i++ ) {
|
||||
char *link = getLinkPtr(i);
|
||||
int32_t linkLen = getLinkLen(i);
|
||||
sb->safePrintf("<tr><td>%"INT32"</td><td>",i);
|
||||
sb->safeMemcpy(link,linkLen);
|
||||
sb->safePrintf("</td></tr>\n");
|
||||
}
|
||||
sb->safePrintf("</table>\n<br>\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
// . the blogroll must consist of 2 outlinks to two different external blogs
|
||||
// in order to be a valid blogroll
|
||||
// . add the all the site root outlinks in the valid blogroll into the
|
||||
|
Linkdb.h (2)
@@ -1183,6 +1183,8 @@ public:
// set from a simple text buffer
bool set ( char *buf , int32_t niceness ) ;

bool print ( SafeBuf *sb ) ;

// Link in ascii text
bool addLink(char *link,int32_t linkLen,int32_t nodeNum,bool setLinkHashes,
int32_t titleRecVersion, int32_t niceness , bool isRSS ,
Loop.cpp (14)
@@ -1193,6 +1193,8 @@ void sigvtalrmHandler ( int x , siginfo_t *info , void *y ) {

}

float g_cpuUsage = 0.0;

void sigalrmHandler ( int x , siginfo_t *info , void *y ) {

#ifdef PTHREADS
@@ -1206,6 +1208,17 @@ void sigalrmHandler ( int x , siginfo_t *info , void *y ) {

// stats
g_numAlarms++;

if ( ! g_inWaitState )
g_cpuUsage = .99 * g_cpuUsage + .01 * 100;
else
g_cpuUsage = .99 * g_cpuUsage + .01 * 000;

if ( g_profiler.m_realTimeProfilerRunning )
g_profiler.getStackFrame(0);

return;
/*
// . see where we are in the code
// . for computing cpu usage
// . if idling we will be in sigtimedwait() at the lowest level
@@ -1224,6 +1237,7 @@ void sigalrmHandler ( int x , siginfo_t *info , void *y ) {

if ( g_profiler.m_realTimeProfilerRunning )
g_profiler.getStackFrame(0);
*/
}

/*
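The g_cpuUsage update above is an exponential moving average sampled from the SIGALRM handler: each tick blends in 100 when the process was busy and 0 when it was idle, so the value settles on the recent busy percentage. A tiny standalone sketch of the same smoothing (the sampling hook and names are illustrative):

// Exponentially weighted CPU-busy estimate: new = 0.99*old + 0.01*sample,
// where the sample is 100 for a busy tick and 0 for an idle tick.
static float g_cpuBusyPct = 0.0f;

static void onTimerTick ( bool busy ) {
        float sample = busy ? 100.0f : 0.0f;
        g_cpuBusyPct = 0.99f * g_cpuBusyPct + 0.01f * sample;
}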
Makefile (5)
@@ -177,6 +177,11 @@ vclean:
@echo ""
@echo "sudo yum install gcc-c++"
@echo ""
@echo ""
@echo "If make fails on CentOS then first run:"
@echo ""
@echo "sudo yum install gcc-c++ openssl-devel"
@echo ""
@echo "*****"
@echo ""
Msg13.cpp
@@ -1950,6 +1950,13 @@ void gotHttpReply2 ( void *state ,
if ( --count > 0 && ! err ) {
copy = (char *)mdup(reply,replySize,"msg13d");
copyAllocSize = replySize;
// oom doing the mdup? i've seen this core us so fix it
// because calling sendreply_ass with a NULL
// 'copy' cores it.
if ( reply && ! copy ) {
copyAllocSize = 0;
err = ENOMEM;
}
}
// this is not freeable
if ( copy == g_fakeReply ) copyAllocSize = 0;
Msg3.cpp (7)
@@ -782,6 +782,13 @@ bool Msg3::doneScanning ( ) {
}
}

// if shutting down gb then limit to 20 so we can shutdown because
// it can't shutdown until all threads are out of the queue i think
if ( g_process.m_mode == EXIT_MODE && max < 0 ) {
log("msg3: forcing retries to 0 because shutting down");
max = 0;
}

// get base, returns NULL and sets g_errno to ENOCOLLREC on error
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_collnum))) return true;
Msg39.cpp (47)
@@ -1452,6 +1452,9 @@ void Msg39::estimateHitsAndSendReply ( ) {
|
||||
need += 4;
|
||||
// then buckets. keys and counts
|
||||
need += (4+sizeof(FacetEntry)) * used;
|
||||
// for # of ALL docs that have this facet, even if
|
||||
// not in search results
|
||||
need += sizeof(int64_t);
|
||||
}
|
||||
// allocate
|
||||
SafeBuf tmp;
|
||||
@ -1523,6 +1526,12 @@ void Msg39::estimateHitsAndSendReply ( ) {
|
||||
//
|
||||
/////////////
|
||||
|
||||
// how many docs IN TOTAL had the facet, including all docs
|
||||
// that did not match the query.
|
||||
// it's 1-1 with the query terms.
|
||||
mr.ptr_numDocsThatHaveFacetList = NULL;
|
||||
mr.size_numDocsThatHaveFacetList = nqt * sizeof(int64_t);
|
||||
|
||||
|
||||
// . that is pretty much it,so serialize it into buffer,"reply"
|
||||
// . mr.ptr_docIds, etc., will point into the buffer so we can
|
||||
@ -1548,6 +1557,44 @@ void Msg39::estimateHitsAndSendReply ( ) {
|
||||
topDocIds = (int64_t *) mr.ptr_docIds;
|
||||
topScores = (double *) mr.ptr_scores;
|
||||
topRecs = (key_t *) mr.ptr_clusterRecs;
|
||||
|
||||
// sanity
|
||||
if ( nqt != m_msg2.m_numLists )
|
||||
log("query: nqt mismatch for q=%s",m_tmpq.m_orig);
|
||||
int64_t *facetCounts=(int64_t*)mr.ptr_numDocsThatHaveFacetList;
|
||||
for ( int32_t i = 0 ; i < nqt ; i++ ) {
|
||||
QueryTerm *qt = &m_tmpq.m_qterms[i];
|
||||
// default is 0 for non-facet termlists
|
||||
facetCounts[i] = qt->m_numDocsThatHaveFacet;
|
||||
}
|
||||
/*
|
||||
MDW - no, because some docs have the same facet field
|
||||
multiple times and we want a doc count. so do it in Posdb.cpp
|
||||
// fill these in now too
|
||||
int64_t *facetCounts=(int64_t*)mr.ptr_numDocsThatHaveFacetList;
|
||||
for ( int32_t i = 0 ; i < nqt ; i++ ) {
|
||||
// default is 0 for non-facet termlists
|
||||
facetCounts[i] = 0;
|
||||
QueryTerm *qt = &m_tmpq.m_qterms[i];
|
||||
// skip if not facet term
|
||||
bool isFacetTerm = false;
|
||||
if ( qt->m_fieldCode == FIELD_GBFACETSTR )
|
||||
isFacetTerm = true;
|
||||
if ( qt->m_fieldCode == FIELD_GBFACETINT )
|
||||
isFacetTerm = true;
|
||||
if ( qt->m_fieldCode == FIELD_GBFACETFLOAT )
|
||||
isFacetTerm = true;
|
||||
if ( ! isFacetTerm )
|
||||
continue;
|
||||
RdbList *list = &m_lists[i];
|
||||
// they should be all 12 bytes except first rec which
|
||||
// is 18 bytes.
|
||||
int64_t count = list->m_listSize;
|
||||
count -= 6;
|
||||
count /= 12;
|
||||
facetCounts[i] = count;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
int32_t docCount = 0;
|
||||
|
Msg39.h (2)
@@ -188,6 +188,7 @@ public:
char *ptr_scoreInfo ; // transparency info
char *ptr_pairScoreBuf ; // transparency info
char *ptr_singleScoreBuf ; // transparency info
char *ptr_numDocsThatHaveFacetList ;
// this is now 1-1 with # of query terms!
char *ptr_facetHashList ; // list of all the facet values in serps
char *ptr_clusterRecs ; // key_t (might be empty)
@@ -199,6 +200,7 @@ public:
int32_t size_scoreInfo;
int32_t size_pairScoreBuf ;
int32_t size_singleScoreBuf;
int32_t size_numDocsThatHaveFacetList ;
int32_t size_facetHashList;
int32_t size_clusterRecs;
Msg3a.cpp (28)
@@ -25,6 +25,7 @@ void Msg3a::constructor ( ) {
|
||||
m_numDocIds = 0;
|
||||
m_collnums = NULL;
|
||||
m_inUse = false;
|
||||
m_q = NULL;
|
||||
|
||||
// need to call all safebuf constructors now to set m_label
|
||||
m_rbuf2.constructor();
|
||||
@ -143,6 +144,7 @@ bool Msg3a::getDocIds ( Msg39Request *r ,
|
||||
reset();
|
||||
// remember ALL the stuff
|
||||
m_r = r;
|
||||
// this should be &SearchInput::m_q
|
||||
m_q = q;
|
||||
m_callback = callback;
|
||||
m_state = state;
|
||||
@ -761,6 +763,16 @@ bool Msg3a::gotAllShardReplies ( ) {
|
||||
// of posdb...
|
||||
m_numTotalEstimatedHits += mr->m_estimatedHits;
|
||||
|
||||
// accumulate total facet count from all shards for each term
|
||||
int64_t *facetCounts;
|
||||
facetCounts = (int64_t*)mr->ptr_numDocsThatHaveFacetList;
|
||||
for ( int32_t k = 0 ; k < mr->m_nqt ; k++ ) {
|
||||
QueryTerm *qt = &m_q->m_qterms[k];
|
||||
// sanity. this should never happen.
|
||||
if ( k >= m_q->m_numTerms ) break;
|
||||
qt->m_numDocsThatHaveFacet += facetCounts[k];
|
||||
}
|
||||
|
||||
// debug log stuff
|
||||
if ( ! m_debug ) continue;
|
||||
// cast these for printing out
|
||||
@ -771,7 +783,8 @@ bool Msg3a::gotAllShardReplies ( ) {
|
||||
// print out score_t
|
||||
logf( LOG_DEBUG,
|
||||
"query: msg3a: [%"PTRFMT"] %03"INT32") "
|
||||
"shard=%"INT32" docId=%012"UINT64" domHash=0x%02"XINT32" "
|
||||
"shard=%"INT32" docId=%012"UINT64" "
|
||||
"domHash=0x%02"XINT32" "
|
||||
"score=%f" ,
|
||||
(PTRTYPE)this ,
|
||||
j ,
|
||||
@ -1063,13 +1076,21 @@ bool Msg3a::mergeLists ( ) {
|
||||
// and Msg40.cpp ultimately.
|
||||
HashTableX *ht = &qt->m_facetHashTable;
|
||||
// we have to manually call this because Query::constructor()
|
||||
// might have been called explicitly
|
||||
ht->constructor();
|
||||
// might have been called explicitly. not now because
|
||||
// i added a call the Query::constructor() to call
|
||||
// QueryTerm::constructor() for each QueryTerm in
|
||||
// Query::m_qterms[]. this was causing a mem leak of
|
||||
// 'fhtqt' too beacause we were re-using the query for each
|
||||
// coll in the federated loop search.
|
||||
//ht->constructor();
|
||||
// 4 byte key, 4 byte score for counting facet values
|
||||
if ( ! ht->set(4,sizeof(FacetEntry),
|
||||
128,NULL,0,false,
|
||||
m_r->m_niceness,"fhtqt"))
|
||||
return true;
|
||||
// debug note
|
||||
// log("results: alloc fhtqt of %"PTRFMT" for st0=%"PTRFMT,
|
||||
// (PTRTYPE)ht->m_buf,(PTRTYPE)m_q->m_st0Ptr);
|
||||
// sanity
|
||||
if ( ! ht->m_isWritable ) {char *xx=NULL;*xx=0;}
|
||||
}
|
||||
@ -1186,7 +1207,6 @@ bool Msg3a::mergeLists ( ) {
|
||||
if ( ! sortFacetEntries() )
|
||||
return true;
|
||||
|
||||
|
||||
//if ( m_r->m_getSectionStats ) return true;
|
||||
//
|
||||
// HACK: END section stats merge
|
||||
|
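The Msg3a.cpp hunk above accumulates, per query term, the facet document counts returned by each shard, so the merged result reports a collection-wide total. A compact sketch of that merge step, with simplified containers standing in for the real Msg39Reply/QueryTerm structures:

#include <cstdint>
#include <vector>

// Each shard reply carries one count per query term; summing them per term
// yields the global number of docs that have that facet.
static void mergeFacetCounts ( std::vector<int64_t> &totalPerTerm ,
                               const std::vector<int64_t> &shardCounts ) {
        size_t n = totalPerTerm.size() < shardCounts.size()
                 ? totalPerTerm.size() : shardCounts.size();
        for ( size_t k = 0 ; k < n ; k++ )
                totalPerTerm[k] += shardCounts[k];
}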
Msg40.cpp (70)
@@ -108,6 +108,7 @@ Msg40::Msg40() {
|
||||
m_omitCount = 0;
|
||||
m_printCount = 0;
|
||||
//m_numGigabitInfos = 0;
|
||||
m_numCollsToSearch = 0;
|
||||
}
|
||||
|
||||
#define MAX2 50
|
||||
@ -140,6 +141,14 @@ void Msg40::resetBuf2 ( ) {
|
||||
}
|
||||
|
||||
Msg40::~Msg40() {
|
||||
// free tmp msg3as now
|
||||
for ( int32_t i = 0 ; i < m_numCollsToSearch ; i++ ) {
|
||||
if ( ! m_msg3aPtrs[i] ) continue;
|
||||
if ( m_msg3aPtrs[i] == &m_msg3a ) continue;
|
||||
mdelete ( m_msg3aPtrs[i] , sizeof(Msg3a), "tmsg3a");
|
||||
delete ( m_msg3aPtrs[i] );
|
||||
m_msg3aPtrs[i] = NULL;
|
||||
}
|
||||
if ( m_buf ) mfree ( m_buf , m_bufMaxSize , "Msg40" );
|
||||
m_buf = NULL;
|
||||
resetBuf2();
|
||||
@ -2108,7 +2117,8 @@ bool Msg40::gotSummary ( ) {
|
||||
// socket but rather calls doneSendingWrapper() which can call
|
||||
// this function again to send another chunk
|
||||
// . when we are truly done sending all the data, then we set lastChunk
|
||||
// to true and TcpServer.cpp will destroy m_socket when done
|
||||
// to true and TcpServer.cpp will destroy m_socket when done.
|
||||
// no, actually we just set m_streamingMode to false i guess above
|
||||
if ( sb->length() &&
|
||||
// did client browser close the socket on us midstream?
|
||||
! m_socketHadError &&
|
||||
@ -5774,6 +5784,7 @@ bool printHttpMime ( State0 *st ) {
|
||||
//
|
||||
/////////////////
|
||||
|
||||
/*
|
||||
// return 1 if a should be before b
|
||||
static int csvPtrCmp ( const void *a, const void *b ) {
|
||||
//JsonItem *ja = (JsonItem **)a;
|
||||
@ -5791,6 +5802,7 @@ static int csvPtrCmp ( const void *a, const void *b ) {
|
||||
int val = strcmp(pa,pb);
|
||||
return val;
|
||||
}
|
||||
*/
|
||||
|
||||
#include "Json.h"
|
||||
|
||||
@ -5802,12 +5814,10 @@ bool Msg40::printCSVHeaderRow ( SafeBuf *sb ) {
|
||||
//Msg40 *msg40 = &st->m_msg40;
|
||||
//int32_t numResults = msg40->getNumResults();
|
||||
|
||||
/*
|
||||
char tmp1[1024];
|
||||
SafeBuf tmpBuf (tmp1 , 1024);
|
||||
|
||||
char tmp2[1024];
|
||||
SafeBuf nameBuf (tmp2, 1024);
|
||||
|
||||
char nbuf[27000];
|
||||
HashTableX nameTable;
|
||||
if ( ! nameTable.set ( 8,4,2048,nbuf,27000,false,0,"ntbuf") )
|
||||
@ -5905,9 +5915,8 @@ bool Msg40::printCSVHeaderRow ( SafeBuf *sb ) {
|
||||
}
|
||||
|
||||
// sort them
|
||||
qsort ( ptrs , numPtrs , 4 , csvPtrCmp );
|
||||
qsort ( ptrs , numPtrs , sizeof(char *) , csvPtrCmp );
|
||||
|
||||
// set up table to map field name to column for printing the json items
|
||||
HashTableX *columnTable = &m_columnTable;
|
||||
if ( ! columnTable->set ( 8,4, numPtrs * 4,NULL,0,false,0,"coltbl" ) )
|
||||
return false;
|
||||
@ -5922,6 +5931,37 @@ bool Msg40::printCSVHeaderRow ( SafeBuf *sb ) {
|
||||
if ( ! columnTable->addKey ( &h64 , &i ) )
|
||||
return false;
|
||||
}
|
||||
*/
|
||||
|
||||
Msg20 *msg20s[100];
|
||||
int32_t i;
|
||||
for ( i = 0 ; i < m_needFirstReplies && i < 100 ; i++ ) {
|
||||
Msg20 *m20 = getCompletedSummary(i);
|
||||
if ( ! m20 ) break;
|
||||
msg20s[i] = m20;
|
||||
}
|
||||
|
||||
int32_t numPtrs = 0;
|
||||
|
||||
char tmp2[1024];
|
||||
SafeBuf nameBuf (tmp2, 1024);
|
||||
|
||||
int32_t ct = 0;
|
||||
if ( msg20s[0] ) ct = msg20s[0]->m_r->m_contentType;
|
||||
|
||||
CollectionRec *cr =g_collectiondb.getRec(m_firstCollnum);
|
||||
|
||||
// . set up table to map field name to col for printing the json items
|
||||
// . call this from PageResults.cpp
|
||||
printCSVHeaderRow2 ( sb ,
|
||||
ct ,
|
||||
cr ,
|
||||
&nameBuf ,
|
||||
&m_columnTable ,
|
||||
msg20s ,
|
||||
i , // numResults ,
|
||||
&numPtrs
|
||||
);
|
||||
|
||||
m_numCSVColumns = numPtrs;
|
||||
|
||||
@ -6016,6 +6056,8 @@ bool Msg40::printJsonItemInCSV ( State0 *st , int32_t ix ) {
|
||||
|
||||
// sanity
|
||||
if ( column == -1 ) {//>= numCSVColumns ) {
|
||||
// don't show it any more...
|
||||
continue;
|
||||
// add a new column...
|
||||
int32_t newColnum = numCSVColumns + 1;
|
||||
// silently drop it if we already have too many cols
|
||||
@ -6467,9 +6509,12 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
|
||||
if ( format == FORMAT_XML ) {
|
||||
sb->safePrintf("\t<facet>\n"
|
||||
"\t\t<field>%s</field>\n"
|
||||
"\t\t<value>"
|
||||
, term
|
||||
);
|
||||
, term );
|
||||
sb->safePrintf("\t\t<totalDocsWithField>%"INT64""
|
||||
"</totalDocsWithField>\n"
|
||||
, qt->m_numDocsThatHaveFacet );
|
||||
sb->safePrintf("\t\t<value>");
|
||||
|
||||
if ( isString )
|
||||
sb->safePrintf("<![CDATA[%"UINT32",",
|
||||
(uint32_t)*fvh);
|
||||
@ -6569,9 +6614,12 @@ bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
|
||||
if ( format == FORMAT_JSON ) {
|
||||
sb->safePrintf("{\n"
|
||||
"\t\"field\":\"%s\",\n"
|
||||
"\t\"value\":\""
|
||||
, term
|
||||
, term
|
||||
);
|
||||
sb->safePrintf("\t\"totalDocsWithField\":%"INT64""
|
||||
",\n", qt->m_numDocsThatHaveFacet );
|
||||
sb->safePrintf("\t\"value\":\"");
|
||||
|
||||
if ( isString )
|
||||
sb->safePrintf("%"UINT32","
|
||||
, (uint32_t)*fvh);
|
||||
|
Msg40.h (1)
@@ -109,6 +109,7 @@ class Msg40 {
//void (* callback)(class Msg40 *THIS, void *state));
void (* callback)(void *state));

void makeCallback();
bool gotCacheReply();
// a continuation function of getResults() above
bool prepareToGetDocIds ( );
@ -244,6 +244,7 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
|
||||
, dr
|
||||
, cr->m_coll
|
||||
);
|
||||
log("crawlbot: %s",sb2.getBufStart());
|
||||
HttpRequest hr2;
|
||||
hr2.set ( sb2.getBufStart() , sb2.length() , sock );
|
||||
return sendPageResults ( sock , &hr2 );
|
||||
@ -283,6 +284,59 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
|
||||
, dr
|
||||
, cr->m_coll
|
||||
);
|
||||
log("crawlbot: %s",sb2.getBufStart());
|
||||
HttpRequest hr2;
|
||||
hr2.set ( sb2.getBufStart() , sb2.length() , sock );
|
||||
return sendPageResults ( sock , &hr2 );
|
||||
}
|
||||
|
||||
// . now the urls.csv is also a query on gbss files
|
||||
// . make an httprequest on stack and call it
|
||||
// . only do this for version 3
|
||||
// i.e. GET /v3/crawl/download/token-collectionname_urls.csv
|
||||
if ( fmt == FORMAT_CSV &&
|
||||
rdbId == RDB_SPIDERDB &&
|
||||
path[0] == '/' &&
|
||||
path[1] == 'v' &&
|
||||
path[2] == '3' ) {
|
||||
char tmp2[5000];
|
||||
SafeBuf sb2(tmp2,5000);
|
||||
// never dedup
|
||||
int32_t dr = 0;
|
||||
// do not dedup for crawls either it is too confusing!!!!
|
||||
// ppl wonder where the results are!
|
||||
dr = 0;
|
||||
sb2.safePrintf("GET /search?"
|
||||
// this is not necessary
|
||||
//"icc=1&"
|
||||
"format=csv&"
|
||||
// no site clustering
|
||||
"sc=0&"
|
||||
// never dedup.
|
||||
"dr=0&"
|
||||
"c=%s&"
|
||||
"n=10000000&"
|
||||
// stream it now
|
||||
// can't stream until we fix headers be printed
|
||||
// in Msg40.cpp. so gbssUrl->Url etc.
|
||||
// mdw: ok should work now
|
||||
"stream=1&"
|
||||
//"stream=0&"
|
||||
// no summary similarity dedup, only exact
|
||||
// doc content hash. otherwise too slow!!
|
||||
"pss=0&"
|
||||
// no gigabits
|
||||
"dsrt=0&"
|
||||
// do not compute summary. 0 lines.
|
||||
//"ns=0&"
|
||||
"q=gbrevsortbyint%%3AgbssSpiderTime+"
|
||||
"gbssIsDiffbotObject%%3A0"
|
||||
"&"
|
||||
//"prepend=type%%3Ajson"
|
||||
"\r\n\r\n"
|
||||
, cr->m_coll
|
||||
);
|
||||
log("crawlbot: %s",sb2.getBufStart());
|
||||
HttpRequest hr2;
|
||||
hr2.set ( sb2.getBufStart() , sb2.length() , sock );
|
||||
return sendPageResults ( sock , &hr2 );
|
||||
@ -768,7 +822,7 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
|
||||
lastSpidered = 0;
|
||||
|
||||
bool isProcessed = false;
|
||||
if ( srep ) isProcessed = srep->m_sentToDiffbot;
|
||||
if ( srep ) isProcessed = srep->m_sentToDiffbotThisTime;
|
||||
|
||||
if ( srep && srep->m_hadDiffbotError )
|
||||
isProcessed = false;
|
||||
@ -848,8 +902,10 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
|
||||
// lastspidertime>={roundstart} --> spiders disabled rule
|
||||
// so that we do not spider a url twice in the same round
|
||||
if ( ufn >= 0 && //! cr->m_spidersEnabled[ufn] ) {
|
||||
cr->m_regExs[ufn].length() &&
|
||||
// we set this to 0 instead of using the checkbox
|
||||
cr->m_maxSpidersPerRule[ufn] <= 0 ) {
|
||||
strstr(cr->m_regExs[ufn].getBufStart(),"round") ) {
|
||||
//cr->m_maxSpidersPerRule[ufn] <= 0 ) {
|
||||
priority = -5;
|
||||
}
|
||||
|
||||
@ -935,10 +991,12 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
|
||||
//, iptoa(sreq->m_firstIp)
|
||||
);
|
||||
// print priority
|
||||
if ( priority == SPIDER_PRIORITY_FILTERED )
|
||||
//if ( priority == SPIDER_PRIORITY_FILTERED )
|
||||
// we just turn off the spiders now
|
||||
if ( ufn >= 0 && cr->m_maxSpidersPerRule[ufn] <= 0 )
|
||||
sb->safePrintf("url ignored");
|
||||
else if ( priority == SPIDER_PRIORITY_BANNED )
|
||||
sb->safePrintf("url banned");
|
||||
//else if ( priority == SPIDER_PRIORITY_BANNED )
|
||||
// sb->safePrintf("url banned");
|
||||
else if ( priority == -4 )
|
||||
sb->safePrintf("error");
|
||||
else if ( priority == -5 )
|
||||
@ -4254,7 +4312,7 @@ bool getSpiderRequestMetaList ( char *doc ,
|
||||
sreq.m_hostHash32 = url.getHostHash32();
|
||||
sreq.m_domHash32 = url.getDomainHash32();
|
||||
sreq.m_siteHash32 = url.getHostHash32();
|
||||
sreq.m_probDocId = probDocId;
|
||||
//sreq.m_probDocId = probDocId;
|
||||
sreq.m_hopCount = 0; // we're a seed
|
||||
sreq.m_hopCountValid = true;
|
||||
sreq.m_addedTime = now;
|
||||
|
PageGet.cpp (15)
@@ -407,6 +407,10 @@ bool processLoop ( void *state ) {
|
||||
if ( format == FORMAT_XML ) sb->reset();
|
||||
if ( format == FORMAT_JSON ) sb->reset();
|
||||
|
||||
if ( xd->m_contentType == CT_JSON ) sb->reset();
|
||||
if ( xd->m_contentType == CT_XML ) sb->reset();
|
||||
if ( xd->m_contentType == CT_STATUS ) sb->reset();
|
||||
|
||||
// for undoing the stuff below
|
||||
int32_t startLen2 = sb->length();//p;
|
||||
|
||||
@ -431,6 +435,9 @@ bool processLoop ( void *state ) {
|
||||
if ( xd->m_contentType == CT_JSON )
|
||||
printDisclaimer = false;
|
||||
|
||||
if ( xd->m_contentType == CT_STATUS )
|
||||
printDisclaimer = false;
|
||||
|
||||
if ( format == FORMAT_XML ) printDisclaimer = false;
|
||||
if ( format == FORMAT_JSON ) printDisclaimer = false;
|
||||
|
||||
@ -624,6 +631,8 @@ bool processLoop ( void *state ) {
|
||||
includeHeader = false;
|
||||
if ( xd->m_contentType == CT_XML )
|
||||
includeHeader = false;
|
||||
if ( xd->m_contentType == CT_STATUS )
|
||||
includeHeader = false;
|
||||
|
||||
if ( format == FORMAT_XML ) includeHeader = false;
|
||||
if ( format == FORMAT_JSON ) includeHeader = false;
|
||||
@ -679,6 +688,7 @@ bool processLoop ( void *state ) {
|
||||
// do not calc title or print it if doc is xml or json
|
||||
if ( ctype == CT_XML ) sbend = sbstart;
|
||||
if ( ctype == CT_JSON ) sbend = sbstart;
|
||||
if ( ctype == CT_STATUS ) sbend = sbstart;
|
||||
|
||||
for ( char *t = sbstart ; t < sbend ; t++ ) {
|
||||
// title tag?
|
||||
@ -813,6 +823,8 @@ bool processLoop ( void *state ) {
|
||||
// do not do term highlighting if json
|
||||
if ( xd->m_contentType == CT_JSON )
|
||||
queryHighlighting = false;
|
||||
if ( xd->m_contentType == CT_STATUS )
|
||||
queryHighlighting = false;
|
||||
|
||||
SafeBuf tmp;
|
||||
SafeBuf *xb = sb;
|
||||
@ -917,6 +929,9 @@ bool processLoop ( void *state ) {
|
||||
if ( xd->m_contentType == CT_JSON )
|
||||
contentType = "application/json";
|
||||
|
||||
if ( xd->m_contentType == CT_STATUS )
|
||||
contentType = "application/json";
|
||||
|
||||
if ( xd->m_contentType == CT_XML )
|
||||
contentType = "test/xml";
|
||||
|
||||
|
@ -521,9 +521,18 @@ skipReplaceHost:
|
||||
}
|
||||
|
||||
// recovery mode? reocvered from coring?
|
||||
if ((flags & PFLAG_RECOVERYMODE)&& format == FORMAT_HTML )
|
||||
if ((flags & PFLAG_RECOVERYMODE)&& format == FORMAT_HTML ) {
|
||||
fb.safePrintf("<b title=\"Recovered from core"
|
||||
"\">x</b>");
|
||||
// this is only 8-bits at the moment so it's capped
|
||||
// at 255. this level is 1 the first time we core
|
||||
// and are restarted.
|
||||
if ( h->m_pingInfo.m_recoveryLevel > 1 )
|
||||
fb.safePrintf("<sup>%"INT32"</sup>",
|
||||
(int32_t)
|
||||
h->m_pingInfo.m_recoveryLevel);
|
||||
}
|
||||
|
||||
if ((flags & PFLAG_RECOVERYMODE)&& format != FORMAT_HTML )
|
||||
fb.safePrintf("Recovered from core");
|
||||
|
||||
@ -553,14 +562,15 @@ skipReplaceHost:
|
||||
,h->m_pingInfo.m_currentSpiders
|
||||
);
|
||||
|
||||
if ( format == FORMAT_HTML && h->m_pingInfo.m_udpSlotsInUse ) {
|
||||
if ( format == FORMAT_HTML &&
|
||||
h->m_pingInfo.m_udpSlotsInUseIncoming ) {
|
||||
char *f1 = "";
|
||||
char *f2 = "";
|
||||
if ( h->m_pingInfo.m_udpSlotsInUse >= 200 ) {
|
||||
if ( h->m_pingInfo.m_udpSlotsInUseIncoming >= 200 ) {
|
||||
f1 = "<b>";
|
||||
f2 = "</b>";
|
||||
}
|
||||
if ( h->m_pingInfo.m_udpSlotsInUse >= 400 ) {
|
||||
if ( h->m_pingInfo.m_udpSlotsInUseIncoming >= 400 ) {
|
||||
f1 = "<b><font color=red>";
|
||||
f2 = "</font></b>";
|
||||
}
|
||||
@ -571,7 +581,7 @@ skipReplaceHost:
|
||||
"%s"
|
||||
"</span>"
|
||||
,f1
|
||||
,h->m_pingInfo.m_udpSlotsInUse
|
||||
,h->m_pingInfo.m_udpSlotsInUseIncoming
|
||||
,f2
|
||||
);
|
||||
}
|
||||
@ -679,7 +689,7 @@ skipReplaceHost:
|
||||
|
||||
sb.safePrintf("\t\t<udpSlotsInUse>%"INT32""
|
||||
"</udpSlotsInUse>\n",
|
||||
h->m_pingInfo.m_udpSlotsInUse);
|
||||
h->m_pingInfo.m_udpSlotsInUseIncoming);
|
||||
|
||||
sb.safePrintf("\t\t<tcpSocketsInUse>%"INT32""
|
||||
"</tcpSocketsInUse>\n",
|
||||
@ -791,7 +801,7 @@ skipReplaceHost:
|
||||
sb.safePrintf("\t\t\"errorTryAgains\":%"INT32",\n",
|
||||
h->m_pingInfo.m_etryagains);
|
||||
sb.safePrintf("\t\t\"udpSlotsInUse\":%"INT32",\n",
|
||||
h->m_pingInfo.m_udpSlotsInUse);
|
||||
h->m_pingInfo.m_udpSlotsInUseIncoming);
|
||||
sb.safePrintf("\t\t\"tcpSocketsInUse\":%"INT32",\n",
|
||||
h->m_pingInfo.m_tcpSocketsInUse);
|
||||
|
||||
@ -1463,7 +1473,8 @@ skipReplaceHost:
|
||||
"<td>x (status flag)</td>"
|
||||
"<td>Indicates host has abruptly exited due to a fatal "
|
||||
"error (cored) and "
|
||||
"restarted itself."
|
||||
"restarted itself. The exponent is how many times it has "
|
||||
"done this. If no exponent, it only did it once."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
@ -1498,7 +1509,8 @@ skipReplaceHost:
|
||||
"<tr class=poo>"
|
||||
"<td><nobr>U (status flag)</nobr></td>"
|
||||
"<td>Indicates the number of active UDP transactions "
|
||||
"which are either outgoing or incoming requests."
|
||||
"which are incoming requests. These will pile up if a "
|
||||
"host can't handle them fast enough."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
|
PageReindex.cpp
@@ -449,9 +449,10 @@ bool Msg1c::gotList ( ) {
sr.m_urlIsDocId = 1;
sr.m_fakeFirstIp = 1;
// for msg12 locking
sr.m_probDocId = docId;
//sr.m_probDocId = docId;
// use test-parser not test-spider
sr.m_useTestSpiderDir = 0;
//sr.m_useTestSpiderDir = 0;
sr.m_parentIsSiteMap = 0;
// now you can recycle content instead of re-downloading it
// for every docid
sr.m_recycleContent = gr->m_recycleContent;
PageResults.cpp (405)
@@ -42,7 +42,7 @@ bool replaceParm2 ( char *cgi , SafeBuf *newUrl ,
|
||||
char *oldUrl , int32_t oldUrlLen ) ;
|
||||
|
||||
|
||||
bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) ;
|
||||
bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) ;
|
||||
|
||||
bool printJsonItemInCSV ( char *json , SafeBuf *sb , class State0 *st ) ;
|
||||
|
||||
@ -128,6 +128,8 @@ bool sendReply ( State0 *st , char *reply ) {
|
||||
|
||||
g_stats.logAvgQueryTime(st->m_startTime);
|
||||
|
||||
//log("results: debug: in sendReply deleting st=%"PTRFMT,(PTRTYPE)st);
|
||||
|
||||
if ( ! savedErr ) { // g_errno ) {
|
||||
g_stats.m_numSuccess++;
|
||||
// . one hour cache time... no 1000 hours, basically infinite
|
||||
@ -543,10 +545,6 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
|
||||
// save this count so we know if TcpServer.cpp calls destroySocket(s)
|
||||
st->m_numDestroys = s->m_numDestroys;
|
||||
|
||||
// you have to say "&header=1" to get back the header for json now.
|
||||
// later on maybe it will default to on.
|
||||
st->m_header = hr->getLong("header",0);
|
||||
|
||||
// . parse it up
|
||||
// . this returns false and sets g_errno and, maybe, g_msg on error
|
||||
SearchInput *si = &st->m_si;
|
||||
@ -563,6 +561,9 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
|
||||
return sendReply ( st, NULL );
|
||||
}
|
||||
|
||||
// for debug
|
||||
si->m_q.m_st0Ptr = (char *)st;
|
||||
|
||||
int32_t codeLen = 0;
|
||||
char *code = hr->getString("code", &codeLen, NULL);
|
||||
// allow up to 1000 results per query for paying clients
|
||||
@ -572,9 +573,15 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
|
||||
if ( cr ) st->m_collnum = cr->m_collnum;
|
||||
else st->m_collnum = -1;
|
||||
|
||||
// turn this on for json output, unless diffbot collection
|
||||
if ( format == FORMAT_JSON && ! cr->m_isCustomCrawl )
|
||||
st->m_header = 1;
|
||||
int32_t defHdr = 1;
|
||||
|
||||
// default is no header for diffbot only
|
||||
if ( cr->m_isCustomCrawl || strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 )
|
||||
defHdr = 0;
|
||||
|
||||
// you have to say "&header=1" to get back the header for json now.
|
||||
// later on maybe it will default to on.
|
||||
st->m_header = hr->getLong("header",defHdr);
|
||||
|
||||
// take this out here as well!
|
||||
// limit here
|
||||
@ -635,7 +642,13 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
|
||||
return sendReply(st,NULL);
|
||||
}
|
||||
|
||||
|
||||
// filter that one query causing the memleak for now
|
||||
// if ( strstr(si->m_q.m_orig,
|
||||
// "type:json AND ((((query=humanLanguage:en") ) {
|
||||
// g_errno = EQUERYINGDISABLED;
|
||||
// return sendReply(st,NULL);
|
||||
// }
|
||||
|
||||
// LAUNCH ADS
|
||||
// . now get the ad space for this query
|
||||
// . don't get ads if we're not on the first page of results
|
||||
@ -692,6 +705,8 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
|
||||
// save error
|
||||
st->m_errno = g_errno;
|
||||
|
||||
//log("results: debug: new state=%"PTRFMT,(PTRTYPE)st);
|
||||
|
||||
// wait for ads and spellcheck and results?
|
||||
if ( !st->m_gotAds || !st->m_gotSpell || !st->m_gotResults )
|
||||
return false;
|
||||
@ -1128,6 +1143,7 @@ bool gotResults ( void *state ) {
|
||||
// record that
|
||||
st->m_took = took;
|
||||
|
||||
//log("results: debug: in gotResults state=%"PTRFMT,(PTRTYPE)st);
|
||||
|
||||
// grab the query
|
||||
Msg40 *msg40 = &(st->m_msg40);
|
||||
@ -1153,10 +1169,12 @@ bool gotResults ( void *state ) {
|
||||
log("res: socket still in streaming mode. wtf?");
|
||||
st->m_socket->m_streamingMode = false;
|
||||
}
|
||||
log("msg40: done streaming. nuking state=%"PTRFMT" q=%s. "
|
||||
log("msg40: done streaming. nuking state=0x%"PTRFMT" "
|
||||
"msg40=0x%"PTRFMT" q=%s. "
|
||||
"msg20sin=%i msg20sout=%i sendsin=%i sendsout=%i "
|
||||
"numrequests=%i numreplies=%i "
|
||||
,(PTRTYPE)st
|
||||
,(PTRTYPE)msg40
|
||||
,si->m_q.m_orig
|
||||
|
||||
, msg40->m_numMsg20sIn
|
||||
@ -1167,6 +1185,15 @@ bool gotResults ( void *state ) {
|
||||
, msg40->m_numReplies
|
||||
|
||||
);
|
||||
|
||||
// for some reason the socket still exists and will time out
|
||||
//g_tcpServer.destroySocket ( st->m_socket );
|
||||
|
||||
// just let tcpserver nuke it, but don't double call
|
||||
// the callback, doneSendingWrapper9()... because msg40
|
||||
// will have been deleted!
|
||||
st->m_socket->m_callback = NULL;
|
||||
|
||||
mdelete(st, sizeof(State0), "PageResults2");
|
||||
delete st;
|
||||
return true;
|
||||
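The comments above spell out the ordering: once streaming is done the State0 (and its Msg40) get deleted, so the socket's callback has to be nulled first or TcpServer could fire doneSendingWrapper9() on freed memory. A hedged sketch of that pattern with stand-in types; FakeSocket and FakeState are not the real classes, just enough structure to show the order of operations.

#include <cstdio>

// hypothetical stand-ins for TcpSocket / State0, just to show the ordering
struct FakeState  { const char *m_q; };
struct FakeSocket { void (*m_callback)(void *); void *m_state; };

static void doneSendingWrapper ( void *state ) {
	printf ( "callback ran on %s\n" , ((FakeState *)state)->m_q );
}

// what the server layer does later: invoke the callback only if still set
static void serverDestroySocket ( FakeSocket *s ) {
	if ( s->m_callback ) s->m_callback ( s->m_state );
	else                 printf ( "no callback, socket nuked quietly\n" );
}

int main ( ) {
	FakeState *st = new FakeState();
	st->m_q = "q=test";
	FakeSocket sock = { doneSendingWrapper , st };
	// done streaming: clear the callback BEFORE deleting the state it
	// points at, so the server can't call into freed memory
	sock.m_callback = NULL;
	delete st;
	serverDestroySocket ( &sock );
	return 0;
}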
@ -1729,6 +1756,8 @@ bool printLeftNavColumn ( SafeBuf &sb, State0 *st ) {
|
||||
// MDW: support gigabits in xml/json format again
|
||||
//if ( format != FORMAT_HTML ) numGigabits = 0;
|
||||
|
||||
if ( ! st->m_header )
|
||||
numGigabits = 0;
|
||||
|
||||
// print gigabits
|
||||
Gigabit *gigabits = (Gigabit *)gbuf->getBufStart();
|
||||
@ -2131,6 +2160,7 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
// print first [ for json
|
||||
if ( si->m_format == FORMAT_JSON ) {
|
||||
if ( st->m_header ) sb->safePrintf("{\n");
|
||||
// this is just for diffbot really...
|
||||
else sb->safePrintf("[\n");
|
||||
}
|
||||
|
||||
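Per the hunk above and the matching tail hunk further down, JSON output is framed as a full object holding a "results" array when the header is requested, and as a bare array (the diffbot-style list) when it is not. A small sketch of that framing decision, using std::string in place of the real SafeBuf.

#include <string>
#include <cstdio>

// emit the JSON frame the hunks describe: full object with a "results"
// array when header==true, bare array otherwise
static std::string frameResults ( bool header , const std::string &items ) {
	std::string out;
	if ( header ) out += "{\n\"results\":[\n";
	else          out += "[\n";
	out += items;
	out += "]\n";
	if ( header ) out += "}\n";
	return out;
}

int main ( ) {
	printf ( "%s--\n%s" ,
	         frameResults ( true  , "{\"title\":\"a\"}\n" ).c_str() ,
	         frameResults ( false , "{\"title\":\"a\"}\n" ).c_str() );
	return 0;
}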
@ -2626,7 +2656,8 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
|
||||
|
||||
// when streaming results we lookup the facets last
|
||||
if ( si->m_format != FORMAT_HTML && ! si->m_streamResults )
|
||||
if ( si->m_format != FORMAT_HTML && ! si->m_streamResults &&
|
||||
st->m_header )
|
||||
msg40->printFacetTables ( sb );
|
||||
|
||||
// now print gigabits if we are xml/json
|
||||
@ -2647,8 +2678,7 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if ( si->m_format == FORMAT_JSON &&
|
||||
! cr->m_isCustomCrawl ) {
|
||||
if ( si->m_format == FORMAT_JSON && st->m_header ) {
|
||||
sb->safePrintf("\"results\":[\n");
|
||||
return true;
|
||||
}
|
||||
@ -3170,7 +3200,7 @@ bool printSearchResultsTail ( State0 *st ) {
|
||||
sb->m_length -= 2;
|
||||
sb->safePrintf("\n");
|
||||
}
|
||||
// print ending ] for json
|
||||
// print ending ] for json search results
|
||||
sb->safePrintf("]\n");
|
||||
|
||||
// when streaming results we lookup the facets last
|
||||
@ -3903,13 +3933,14 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
// ptr_content is set in the msg20reply.
|
||||
if ( si->m_format == FORMAT_CSV &&
|
||||
mr->ptr_content &&
|
||||
mr->m_contentType == CT_JSON ) {
|
||||
// spider STATUS docs are json
|
||||
(mr->m_contentType == CT_JSON || mr->m_contentType == CT_STATUS)){
|
||||
// parse it up
|
||||
char *json = mr->ptr_content;
|
||||
// only print header row once, so pass in that flag
|
||||
if ( ! st->m_printedHeaderRow ) {
|
||||
sb->reset();
|
||||
printCSVHeaderRow ( sb , st );
|
||||
printCSVHeaderRow ( sb , st , mr->m_contentType );
|
||||
st->m_printedHeaderRow = true;
|
||||
}
|
||||
printJsonItemInCSV ( json , sb , st );
|
||||
@ -4026,6 +4057,83 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
sb->safePrintf("\",\n");
|
||||
}
|
||||
|
||||
// print spider status pages special
|
||||
if ( mr->ptr_content &&
|
||||
si->m_format == FORMAT_HTML &&
|
||||
mr->m_contentType == CT_STATUS ) {
|
||||
if ( *numPrintedSoFar )
|
||||
sb->safePrintf("<br><hr><br>\n");
|
||||
// skip to gbssurl
|
||||
char *s = strstr ( mr->ptr_content,"\"gbssUrl\":");
|
||||
if ( ! s ) {
|
||||
log("results: missing gbssUrl");
|
||||
goto badformat;
|
||||
}
|
||||
// then do two columns after the two urls
|
||||
char *e = strstr ( s , "\"gbssStatusCode\":" );
|
||||
if ( ! e ) {
|
||||
log("results: missing gbssStatusCode");
|
||||
goto badformat;
|
||||
}
|
||||
char *m = strstr ( e , "\"gbssConsecutiveErrors\":");
|
||||
if ( ! m ) {
|
||||
log("results: missing gbssConsecutiveErrors");
|
||||
goto badformat;
|
||||
}
|
||||
// exclude \0
|
||||
char *end = mr->ptr_content + mr->size_content - 1;
|
||||
// use a table with 2 columns
|
||||
// so we can use \n to separate lines and don't have to add brs
|
||||
// and boldify just the main url, not the redir url!
|
||||
sb->safePrintf("<pre style=display:inline;>"
|
||||
"\"gbssUrl\":\""
|
||||
"<b style=color:blue;><a href=/get?"
|
||||
"c=%s&"
|
||||
"d=%"INT64">"
|
||||
, cr->m_coll
|
||||
, mr->m_docId
|
||||
);
|
||||
char *s2 = strstr ( s , "\"gbssFinalRedirectUrl\":");
|
||||
char *bend = e - 3;
|
||||
if ( s2 ) bend = s2 - 3;
|
||||
sb->safeMemcpy ( s+11 , bend - (s+11));
|
||||
sb->safePrintf("</a></b></pre>\",<br>");
|
||||
// now print redir url if there
|
||||
if ( s2 ) {
|
||||
sb->safePrintf("<pre style=display:inline;>");
|
||||
sb->safeMemcpy ( s2 , e-s2 );
|
||||
sb->removeLastChar('\n');
|
||||
sb->safePrintf("</pre>");
|
||||
}
|
||||
sb->safePrintf("<table border=0 cellpadding=0 cellspacing=0>"
|
||||
"<tr><td>");
|
||||
sb->safePrintf("<pre>");
|
||||
//int32_t off = sb->length();
|
||||
sb->safeMemcpy ( e , m - e );
|
||||
sb->safePrintf("</pre>");
|
||||
sb->safePrintf("</td><td>");
|
||||
sb->safePrintf("<pre>");
|
||||
sb->safeMemcpy ( m , end - m );
|
||||
// remove last \n
|
||||
sb->removeLastChar('\n');
|
||||
sb->removeLastChar('}');
|
||||
sb->removeLastChar('\n');
|
||||
sb->safePrintf("</pre>\n");
|
||||
sb->safePrintf("</td></tr></table>");
|
||||
// replace \n with <br>
|
||||
// sb->safeReplace2 ( "\n" , 1 ,
|
||||
// "<br>" , 4 ,
|
||||
// 0,//niceness ,
|
||||
// off );
|
||||
// inc it
|
||||
*numPrintedSoFar = *numPrintedSoFar + 1;
|
||||
// just in case
|
||||
sb->nullTerm();
|
||||
return true;
|
||||
}
|
||||
|
||||
badformat:
|
||||
|
||||
Highlight hi;
|
||||
|
||||
// get the url
|
||||
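The spider-status branch above renders the raw JSON by locating the "gbssUrl", "gbssStatusCode" and "gbssConsecutiveErrors" markers with strstr() and copying the spans between them into the url line and the two table columns. A reduced sketch of that slicing on a made-up status doc; only the three field names come from the code above, everything else is illustrative.

#include <cstdio>
#include <cstring>

int main ( ) {
	const char *json =
		"{\"gbssUrl\":\"http://abc.com/\",\n"
		"\"gbssStatusCode\":0,\n"
		"\"gbssIp\":\"1.2.3.4\",\n"
		"\"gbssConsecutiveErrors\":0,\n"
		"\"gbssHopCount\":1}";
	// locate the three markers the page code keys off of
	const char *s = strstr ( json , "\"gbssUrl\":" );
	const char *e = strstr ( json , "\"gbssStatusCode\":" );
	const char *m = strstr ( json , "\"gbssConsecutiveErrors\":" );
	if ( ! s || ! e || ! m ) { printf ( "badformat\n" ); return 1; }
	// url span first, then the two column spans, like the two-column table
	printf ( "url part : %.*s\n" , (int)(e - s) , s );
	printf ( "col 1    : %.*s\n" , (int)(m - e) , e );
	printf ( "col 2    : %s\n"   , m );
	return 0;
}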
@ -4359,7 +4467,6 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
//
|
||||
///////
|
||||
|
||||
|
||||
// the a href tag
|
||||
if ( si->m_format == FORMAT_HTML ) {
|
||||
sb->safePrintf ( "<a href=" );
|
||||
@ -4887,6 +4994,9 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
// . docId for possible cached link
|
||||
// . might have merged a bunch together
|
||||
sb->safePrintf("\t\t<docId>%"INT64"</docId>\n",mr->m_docId );
|
||||
}
|
||||
|
||||
if ( si->m_format == FORMAT_XML && mr->m_contentType != CT_STATUS ) {
|
||||
// . show the site root
|
||||
// . for hompages.com/users/fred/mypage.html this will be
|
||||
// homepages.com/users/fred/
|
||||
@ -4934,6 +5044,9 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
// . docId for possible cached link
|
||||
// . might have merged a bunch together
|
||||
sb->safePrintf("\t\t\"docId\":%"INT64",\n",mr->m_docId );
|
||||
}
|
||||
|
||||
if ( si->m_format == FORMAT_JSON && mr->m_contentType != CT_STATUS ) {
|
||||
// . show the site root
|
||||
// . for hompages.com/users/fred/mypage.html this will be
|
||||
// homepages.com/users/fred/
|
||||
@ -5182,10 +5295,12 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
sb->safePrintf (" - "
|
||||
"<a style=color:blue; "
|
||||
"href=\"/search?sb=1&c=%s&"
|
||||
"q=url2%%3A"
|
||||
//"q=url2%%3A"
|
||||
"q=gbfieldmatch%%3AgbssUrl%%3A"
|
||||
, coll
|
||||
);
|
||||
sb->urlEncode ( url , gbstrlen(url) , false );
|
||||
// do not include ending \0
|
||||
sb->urlEncode ( mr->ptr_ubuf , mr->size_ubuf-1 , false );
|
||||
sb->safePrintf ( "\">"
|
||||
"spider info</a>\n"
|
||||
);
|
||||
@ -7810,28 +7925,44 @@ int csvPtrCmp ( const void *a, const void *b ) {
|
||||
if ( strcmp(pb,"product.title") == 0 ) return 1;
|
||||
if ( strcmp(pa,"title") == 0 ) return -1;
|
||||
if ( strcmp(pb,"title") == 0 ) return 1;
|
||||
|
||||
// this is now taken care of from the 'supps[]' array below
|
||||
// by prepending two digits before each field name
|
||||
|
||||
// put url first for spider status docs
|
||||
// if ( strcmp(pa,"gbssUrl") == 0 ) return -1;
|
||||
// if ( strcmp(pb,"gbssUrl") == 0 ) return 1;
|
||||
|
||||
// if ( strcmp(pa,"gbssStatusMsg") == 0 ) return -1;
|
||||
// if ( strcmp(pb,"gbssStatusMsg") == 0 ) return 1;
|
||||
|
||||
// if ( strcmp(pa,"gbssStatusCode") == 0 ) return -1;
|
||||
// if ( strcmp(pb,"gbssStatusCode") == 0 ) return 1;
|
||||
|
||||
|
||||
// otherwise string compare
|
||||
int val = strcmp(pa,pb);
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
|
||||
#include "Json.h"
|
||||
|
||||
//
|
||||
// print header row in csv
|
||||
//
|
||||
bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) {
|
||||
bool printCSVHeaderRow2 ( SafeBuf *sb ,
|
||||
int32_t ct ,
|
||||
CollectionRec *cr ,
|
||||
SafeBuf *nameBuf ,
|
||||
HashTableX *columnTable ,
|
||||
Msg20 **msg20s ,
|
||||
int32_t numMsg20s ,
|
||||
int32_t *numPtrsArg ) {
|
||||
|
||||
Msg40 *msg40 = &st->m_msg40;
|
||||
int32_t numResults = msg40->getNumResults();
|
||||
*numPtrsArg = 0;
|
||||
|
||||
char tmp1[1024];
|
||||
SafeBuf tmpBuf (tmp1 , 1024);
|
||||
|
||||
char tmp2[1024];
|
||||
SafeBuf nameBuf (tmp2, 1024);
|
||||
|
||||
char nbuf[27000];
|
||||
HashTableX nameTable;
|
||||
if ( ! nameTable.set ( 8,4,2048,nbuf,27000,false,0,"ntbuf") )
|
||||
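As the comment above notes, the pinned spider-status columns now carry a two-digit prefix in supps[] ("00gbssUrl", "01gbssDocId", ...) so csvPtrCmp's plain strcmp() falls into a fixed order, and the digits are stripped again before the header is printed. A small sketch of that ordering trick; the column names are taken from supps[], the comparator and loop are a simplified stand-in.

#include <cstdio>
#include <cstring>
#include <cstdlib>
#include <cctype>

// same idea as csvPtrCmp's fallback: a plain string compare, which the
// two leading digits turn into an explicit column order
static int cmp ( const void *a , const void *b ) {
	return strcmp ( *(const char *const *)a , *(const char *const *)b );
}

int main ( ) {
	const char *cols[] = { "gbssIp" , "01gbssDocId" , "gbssLanguage" , "00gbssUrl" };
	int n = sizeof(cols) / sizeof(cols[0]);
	qsort ( cols , n , sizeof(char *) , cmp );
	for ( int i = 0 ; i < n ; i++ ) {
		const char *hdr = cols[i];
		// strip the two order digits before printing, like the header loop
		if ( isdigit((unsigned char)hdr[0]) && isdigit((unsigned char)hdr[1]) )
			hdr += 2;
		printf ( "%s%s" , i ? "," : "" , hdr );
	}
	printf ( "\n" );
	return 0;
}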
@ -7839,16 +7970,86 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) {
|
||||
|
||||
int32_t niceness = 0;
|
||||
|
||||
// if doing spider status docs not all will have dupofdocid field
|
||||
char *supps [] = {
|
||||
"00gbssUrl",
|
||||
"01gbssDocId",
|
||||
"02gbssDiscoveredTime",
|
||||
"03gbssSpiderTime",
|
||||
"06gbssContentLen",
|
||||
"07gbssDupOfDocId" ,
|
||||
"08gbssNumRedirects",
|
||||
"09gbssFinalRedirectUrl",
|
||||
"10gbssCrawlDelayMS",
|
||||
"11gbssCrawlRound",
|
||||
"12gbssPrevTotalNumIndexAttempts",
|
||||
"13gbssHopCount",
|
||||
"14gbssStatusMsg",
|
||||
"15gbssSentToDiffbotThisTime",
|
||||
"16gbssDiffbotReplyMsg",
|
||||
|
||||
"gbssIp",
|
||||
"gbssPercentContentChanged",
|
||||
"gbssDownloadStartTime",
|
||||
"gbssDownloadEndTime",
|
||||
"gbssContentType",
|
||||
"gbssHttpStatus",
|
||||
"gbssWasIndexed",
|
||||
"gbssAgeInIndex",
|
||||
"gbssPrevTotalNumIndexSuccesses",
|
||||
"gbssPrevTotalNumIndexFailures",
|
||||
"gbssDownloadStartTimeMS",
|
||||
"gbssDownloadEndTimeMS",
|
||||
"gbssDownloadDurationMS",
|
||||
"gbssIpLookupTimeMS",
|
||||
"gbssSiteNumInlinks",
|
||||
"gbssSiteRank",
|
||||
"gbssLanguage",
|
||||
"gbssDiffbotReplyCode",
|
||||
"gbssDiffbotLen",
|
||||
"gbssDiffbotReplyResponseTimeMS",
|
||||
"gbssDiffbotReplyRetries",
|
||||
NULL };
|
||||
|
||||
for ( int32_t i = 0 ; supps[i] ; i++ ) {
|
||||
// don't add these column headers to non spider status docs
|
||||
if ( ct != CT_STATUS ) break;
|
||||
char *skip = supps[i];
|
||||
// if custom crawl only show fields in supps with digits
|
||||
if ( cr->m_isCustomCrawl && ! is_digit(skip[0]) ) continue;
|
||||
// skip over the two order digits
|
||||
if ( is_digit(skip[0]) ) skip += 2;
|
||||
// don't include the order digits in the hash
|
||||
int64_t h64 = hash64n ( skip );
|
||||
if ( nameTable.isInTable ( &h64 ) ) continue;
|
||||
// only show diffbot column headers for custom (diffbot) crawls
|
||||
if ( strncmp(skip,"gbssDiffbot",11) == 0 &&
|
||||
( ! cr || ! cr->m_isCustomCrawl ) )
|
||||
break;
|
||||
// record offset of the name for our hash table
|
||||
int32_t nameBufOffset = nameBuf->length();
|
||||
// store the name in our name buffer
|
||||
if ( ! nameBuf->safeStrcpy (supps[i])) return false;
|
||||
if ( ! nameBuf->pushChar ( '\0' ) ) return false;
|
||||
// it's new. add it
|
||||
if ( ! nameTable.addKey ( &h64 ,&nameBufOffset)) return false;
|
||||
}
|
||||
|
||||
// . scan every fucking json item in the search results.
|
||||
// . we still need to deal with the case when there are so many
|
||||
// search results we have to dump each msg20 reply to disk in
|
||||
// order. then we'll have to update this code to scan that file.
|
||||
|
||||
for ( int32_t i = 0 ; i < numResults ; i++ ) {
|
||||
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) { // numResults
|
||||
|
||||
// if custom crawl urls.csv only show the supps[] from above
|
||||
if ( ct == CT_STATUS && cr->m_isCustomCrawl )
|
||||
break;
|
||||
|
||||
// get the msg20 reply for search result #i
|
||||
Msg20 *m20 = msg40->m_msg20[i];
|
||||
Msg20Reply *mr = m20->m_r;
|
||||
//Msg20 *m20 = msg40->m_msg20[i];
|
||||
//Msg20Reply *mr = m20->m_r;
|
||||
Msg20Reply *mr = msg20s[i]->m_r;
|
||||
|
||||
if ( ! mr ) {
|
||||
log("results: missing msg20 reply for result #%"INT32"",i);
|
||||
@ -7889,6 +8090,13 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) {
|
||||
strcmp(ji->m_name,"html")==0)
|
||||
continue;
|
||||
|
||||
// for spider status docs skip these
|
||||
if ( ct == CT_STATUS && ji->m_name ) {
|
||||
if (!strcmp(ji->m_name,"") )
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// reset length of buf to 0
|
||||
tmpBuf.reset();
|
||||
|
||||
@ -7902,12 +8110,12 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) {
|
||||
if ( nameTable.isInTable ( &h64 ) ) continue;
|
||||
|
||||
// record offset of the name for our hash table
|
||||
int32_t nameBufOffset = nameBuf.length();
|
||||
int32_t nameBufOffset = nameBuf->length();
|
||||
|
||||
// store the name in our name buffer
|
||||
if ( ! nameBuf.safeStrcpy ( tmpBuf.getBufStart() ) )
|
||||
if ( ! nameBuf->safeStrcpy ( tmpBuf.getBufStart() ) )
|
||||
return false;
|
||||
if ( ! nameBuf.pushChar ( '\0' ) )
|
||||
if ( ! nameBuf->pushChar ( '\0' ) )
|
||||
return false;
|
||||
|
||||
// it's new. add it
|
||||
@ -7923,30 +8131,129 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) {
|
||||
for ( int32_t i = 0 ; i < nameTable.m_numSlots ; i++ ) {
|
||||
if ( ! nameTable.m_flags[i] ) continue;
|
||||
int32_t off = *(int32_t *)nameTable.getValueFromSlot(i);
|
||||
char *p = nameBuf.getBufStart() + off;
|
||||
char *p = nameBuf->getBufStart() + off;
|
||||
ptrs[numPtrs++] = p;
|
||||
if ( numPtrs >= 1024 ) break;
|
||||
}
|
||||
|
||||
// pass back to caller
|
||||
*numPtrsArg = numPtrs;
|
||||
|
||||
// sort them
|
||||
qsort ( ptrs , numPtrs , sizeof(char *) , csvPtrCmp );
|
||||
|
||||
// set up table to map field name to column for printing the json items
|
||||
HashTableX *columnTable = &st->m_columnTable;
|
||||
//HashTableX *columnTable = &st->m_columnTable;
|
||||
if ( ! columnTable->set ( 8,4, numPtrs * 4,NULL,0,false,0,"coltbl" ) )
|
||||
return false;
|
||||
|
||||
// now print them out as the header row
|
||||
for ( int32_t i = 0 ; i < numPtrs ; i++ ) {
|
||||
|
||||
char *hdr = ptrs[i];
|
||||
|
||||
if ( i > 0 && ! sb->pushChar(',') ) return false;
|
||||
if ( ! sb->safeStrcpy ( ptrs[i] ) ) return false;
|
||||
|
||||
// skip the two order digits
|
||||
if ( ct == CT_STATUS && is_digit(hdr[0]) ) hdr += 2;
|
||||
|
||||
// save it
|
||||
char *skip = hdr;
|
||||
|
||||
// now transform the hdr from gbss* into the old way
|
||||
if ( ! cr->m_isCustomCrawl )
|
||||
goto skipTransform;
|
||||
|
||||
if ( ! strcmp(hdr,"gbssUrl") )
|
||||
hdr = "Url";
|
||||
if ( ! strcmp(hdr,"gbssDocId") )
|
||||
hdr = "Doc ID";
|
||||
// when url was first discovered
|
||||
if ( ! strcmp(hdr,"gbssDiscoveredTime") ) // need this!
|
||||
hdr = "Url Discovered Time";
|
||||
// when it was crawled this time
|
||||
if ( ! strcmp(hdr,"gbssSpiderTime" ) )
|
||||
hdr = "Crawled Time";
|
||||
if ( ! strcmp(hdr,"gbssContentLen") )
|
||||
hdr = "Content Length";
|
||||
if ( ! strcmp(hdr,"gbssDupOfDocId") )
|
||||
hdr = "Duplicate Of";
|
||||
if ( ! strcmp(hdr,"gbssNumRedirects") )
|
||||
hdr = "Redirects";
|
||||
if ( ! strcmp(hdr,"gbssFinalRedirectUrl") )
|
||||
hdr = "Redirected To";
|
||||
if ( ! strcmp(hdr,"gbssCrawlDelayMS") )
|
||||
hdr = "Robots.txt Crawl Delay (ms)";
|
||||
if ( ! strcmp(hdr,"gbssPercentContentChanged") )
|
||||
hdr = "Percent Changed";
|
||||
if ( ! strcmp(hdr,"gbssCrawlRound") )
|
||||
hdr = "Crawl Round";
|
||||
if ( ! strcmp(hdr,"gbssPrevTotalNumIndexAttempts") )
|
||||
hdr = "Crawl Try #";
|
||||
if ( ! strcmp(hdr,"gbssHopCount") )
|
||||
hdr = "Hop Count";
|
||||
if ( ! strcmp(hdr,"gbssIp") )
|
||||
hdr = "IP";
|
||||
if ( ! strcmp(hdr,"gbssSentToDiffbotThisTime") )
|
||||
hdr = "Process Attempted";
|
||||
if ( ! strcmp(hdr,"gbssDiffbotReplyMsg") )
|
||||
hdr = "Process Response";
|
||||
if ( ! strcmp(hdr,"gbssStatusMsg") )
|
||||
hdr = "Crawl Status";
|
||||
|
||||
//if ( ! strcmp(hdr,"gbssMatchingUrlFilter") )
|
||||
// hdr = "Matching Expression";
|
||||
// value is 'url ignored', 'will spider next round', 'error' or
|
||||
// a numeric priority
|
||||
// if ( ! strcmp(hdr,"gbssSpiderPriority") )
|
||||
// hdr = "Matching Action";
|
||||
|
||||
// new columns
|
||||
// if ( ! strcmp(hdr,"gbssAgeInIndex") )
|
||||
// hdr = "Age in Index";
|
||||
|
||||
// if not transformed, then do not print it out
|
||||
if ( ! strncmp(hdr,"gbss",4) )
|
||||
continue;
|
||||
|
||||
skipTransform:
|
||||
if ( ! sb->safeStrcpy ( hdr ) ) return false;
|
||||
|
||||
// record the hash of each one for printing out further json
|
||||
// objects in the same order so columns are aligned!
|
||||
int64_t h64 = hash64n ( ptrs[i] );
|
||||
int64_t h64 = hash64n ( skip ); // ptrs[i] );
|
||||
if ( ! columnTable->addKey ( &h64 , &i ) )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// print header row in csv
|
||||
//
|
||||
bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
|
||||
|
||||
Msg40 *msg40 = &st->m_msg40;
|
||||
int32_t numResults = msg40->getNumResults();
|
||||
|
||||
char tmp2[1024];
|
||||
SafeBuf nameBuf (tmp2, 1024);
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( st->m_collnum );
|
||||
|
||||
int32_t numPtrs = 0;
|
||||
|
||||
printCSVHeaderRow2 ( sb ,
|
||||
ct ,
|
||||
cr ,
|
||||
&nameBuf ,
|
||||
&st->m_columnTable ,
|
||||
msg40->m_msg20 ,
|
||||
numResults ,
|
||||
&numPtrs
|
||||
);
|
||||
|
||||
st->m_numCSVColumns = numPtrs;
|
||||
|
||||
if ( ! sb->pushChar('\n') )
|
||||
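printCSVHeaderRow2 above records hash(field name) -> column index in columnTable so printJsonItemInCSV can later drop each JSON value into the right column even when items carry their fields in a different order. A sketch of that alignment step, with std::unordered_map standing in for HashTableX and hash64n; the field names follow the spider-status fields, the rest is invented.

#include <cstdio>
#include <string>
#include <vector>
#include <utility>
#include <unordered_map>

int main ( ) {
	// the header row fixes the column order once
	std::vector<std::string> headers = { "gbssUrl" , "gbssStatusMsg" , "gbssHopCount" };
	std::unordered_map<std::string,int> columnOf;            // stand-in for columnTable
	for ( int i = 0 ; i < (int)headers.size() ; i++ ) columnOf[headers[i]] = i;

	// one "json item" whose fields arrive out of order
	std::vector<std::pair<std::string,std::string>> item =
		{ { "gbssHopCount" , "2" } , { "gbssUrl" , "http://abc.com/" } };

	// place each value by looked-up column, leave unknown columns empty
	std::vector<std::string> row ( headers.size() );
	for ( size_t k = 0 ; k < item.size() ; k++ ) {
		auto it = columnOf.find ( item[k].first );
		if ( it == columnOf.end() ) continue;             // not in header: skip, don't core
		row[it->second] = item[k].second;
	}
	for ( size_t i = 0 ; i < row.size() ; i++ )
		printf ( "%s%s" , i ? "," : "" , row[i].c_str() );
	printf ( "\n" );
	return 0;
}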
@ -7960,6 +8267,8 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) {
|
||||
// returns false and sets g_errno on error
|
||||
bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( st->m_collnum );
|
||||
|
||||
int32_t niceness = 0;
|
||||
|
||||
// parse the json
|
||||
@ -8018,6 +8327,9 @@ bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
|
||||
int32_t slot = columnTable->getSlot ( &h64 ) ;
|
||||
// MUST be in there
|
||||
if ( slot < 0 ) {
|
||||
// we do not transform all gbss fields any more for
|
||||
// diffbot to avoid overpopulating the csv
|
||||
if ( cr && cr->m_isCustomCrawl ) continue;
|
||||
// do not core on this anymore...
|
||||
log("serps: json column not in table : %s",ji->m_name);
|
||||
continue;
|
||||
@ -9039,6 +9351,12 @@ bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
s_mi[n].m_icon = NULL;
|
||||
n++;
|
||||
|
||||
s_mi[n].m_menuNum = 5;
|
||||
s_mi[n].m_title = "Output CSV";
|
||||
s_mi[n].m_cgi = "format=csv";
|
||||
s_mi[n].m_icon = NULL;
|
||||
n++;
|
||||
|
||||
// show/hide banned
|
||||
s_mi[n].m_menuNum = 6;
|
||||
s_mi[n].m_title = "Hide banned results";
|
||||
@ -9116,19 +9434,19 @@ bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
|
||||
s_mi[n].m_menuNum = 11;
|
||||
s_mi[n].m_title = "Respider all results";
|
||||
s_mi[n].m_cgi = "/admin/reindex";
|
||||
s_mi[n].m_cgi = "";//"/admin/reindex";
|
||||
s_mi[n].m_icon = NULL;
|
||||
n++;
|
||||
|
||||
s_mi[n].m_menuNum = 11;
|
||||
s_mi[n].m_title = "Delete all results";
|
||||
s_mi[n].m_cgi = "/admin/reindex";
|
||||
s_mi[n].m_cgi = "";//"/admin/reindex";
|
||||
s_mi[n].m_icon = NULL;
|
||||
n++;
|
||||
|
||||
s_mi[n].m_menuNum = 11;
|
||||
s_mi[n].m_title = "Scrape from google/bing";
|
||||
s_mi[n].m_cgi = "/admin/inject";
|
||||
s_mi[n].m_cgi = "";//"/admin/inject";
|
||||
s_mi[n].m_icon = NULL;
|
||||
n++;
|
||||
|
||||
@ -9355,7 +9673,7 @@ bool printMenu ( SafeBuf *sb , int32_t menuNum , HttpRequest *hr ) {
|
||||
}
|
||||
|
||||
bool replaceParm ( char *cgi , SafeBuf *newUrl , HttpRequest *hr ) {
|
||||
|
||||
if ( ! cgi[0] ) return true;
|
||||
// get original request url. this is not \0 terminated
|
||||
char *src = hr->m_origUrlRequest;
|
||||
int32_t srcLen = hr->m_origUrlRequestLen;
|
||||
@ -9371,7 +9689,8 @@ bool replaceParm2 ( char *cgi , SafeBuf *newUrl ,
|
||||
char *srcEnd = src + srcLen;
|
||||
|
||||
char *equal = strstr(cgi,"=");
|
||||
if ( ! equal ) return log("results: %s has no equal sign",cgi);
|
||||
if ( ! equal )
|
||||
return log("results: %s has no equal sign",cgi);
|
||||
int32_t cgiLen = equal - cgi;
|
||||
|
||||
char *found = NULL;
|
||||
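replaceParm2 above splits the cgi argument at '=' and then looks for that parameter name in the original request URL so the new value can be swapped in. A simplified sketch of the same substitution on std::string; the boundary matching here is naive and the helper name is made up, the real code works on the non-terminated request buffer.

#include <cstdio>
#include <string>

// replace (or append) one name=value pair in a query string
static std::string replaceParm ( const std::string &cgi , std::string url ) {
	size_t eq = cgi.find ( '=' );
	if ( eq == std::string::npos ) return url;           // "no equal sign"
	std::string name = cgi.substr ( 0 , eq + 1 );        // keep the '='
	size_t pos = url.find ( name );                      // naive substring match
	if ( pos == std::string::npos )
		return url + ( url.find('?') == std::string::npos ? "?" : "&" ) + cgi;
	size_t end = url.find ( '&' , pos );
	if ( end == std::string::npos ) end = url.size();
	return url.substr ( 0 , pos ) + cgi + url.substr ( end );
}

int main ( ) {
	printf ( "%s\n" , replaceParm ( "format=csv" ,
	         "/search?q=test&format=json&n=10" ).c_str() );
	printf ( "%s\n" , replaceParm ( "header=1" ,
	         "/search?q=test" ).c_str() );
	return 0;
}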
PageResults.h
@ -13,6 +13,14 @@
|
||||
#define PADDING 8
|
||||
#define SCROLLBAR_WIDTH 20
|
||||
|
||||
bool printCSVHeaderRow2 ( class SafeBuf *sb ,
|
||||
int32_t ct ,
|
||||
class CollectionRec *cr ,
|
||||
class SafeBuf *nameBuf ,
|
||||
class HashTableX *columnTable ,
|
||||
class Msg20 **msg20s ,
|
||||
int32_t numMsg20s ,
|
||||
int32_t *numPtrsArg ) ;
|
||||
|
||||
class State0 {
|
||||
public:
|
||||
|
46
PageRoot.cpp
@ -666,7 +666,7 @@ bool printLeftColumnRocketAndTabs ( SafeBuf *sb ,
|
||||
{"SYNTAX","/syntax.html"},
|
||||
{"USERS","/users.html"},
|
||||
{"ABOUT","/about.html"},
|
||||
{"NEWS","/news.html"},
|
||||
{"BLOG","/blog.html"},
|
||||
// take this out for now
|
||||
//{"FEED","/searchfeed.html"},
|
||||
{"FAQ","/faq.html"},
|
||||
@ -1202,11 +1202,7 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
|
||||
if ( printRedBox2 ( &sb , sock , r ) ) // true ) )
|
||||
sb.safePrintf("<br>\n");
|
||||
|
||||
/*
|
||||
|
||||
do not show table for open source installs
|
||||
|
||||
sb.safePrintf("<table cellpadding=3>\n");
|
||||
sb.safePrintf("<br><center><table cellpadding=3>\n");
|
||||
sb.safePrintf("\n");
|
||||
|
||||
char *root = "";
|
||||
@ -1216,16 +1212,42 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
|
||||
sb.safePrintf("<tr valign=top>\n");
|
||||
|
||||
//sb.safePrintf("<td align=center><div style=width:50px;height:50px;display:inline-block;background-color:red;></div></td>\n");
|
||||
sb.safePrintf("<td align=center><img height=71px width=50px "
|
||||
sb.safePrintf("<td width=10%% "
|
||||
"align=center><img style=padding-right:10px; "
|
||||
"height=71px width=50px "
|
||||
"src=%s/opensource.png></td>\n"
|
||||
, root );
|
||||
|
||||
sb.safePrintf("<td><font size=+1><b>Open Source!</b>"
|
||||
"</font><br>\n");
|
||||
sb.brify2("Gigablast is now available as an <a href=https://github.com/gigablast/open-source-search-engine>open source search engine</a> on github.com. Download it today. Finally a robust, scalable search solution in C/C++ that has been in development and used commercially since 2000. <a href=http://www.gigablast.com/faq.html#features>Features</a>. Limited support available for free."
|
||||
,80);
|
||||
sb.safePrintf("<td width=45%%><font size=+1><b>Open Source!</b>"
|
||||
"</font><br><br>\n");
|
||||
sb.brify2("Gigablast is now available as an <a href=https://github.com/gigablast/open-source-search-engine>open source search engine</a> on github.com. Download it today. Finally a robust, scalable search solution in C/C++ that has been in development and used commercially since 2000. <a href=http://www.gigablast.com/faq.html#features>Features</a>."
|
||||
,40);
|
||||
//sb.safePrintf("<br><br>");
|
||||
sb.safePrintf("</td>");
|
||||
|
||||
sb.safePrintf("<td><font size=+1><b>ScreenShots</b>"
|
||||
"</font><br><br>\n");
|
||||
|
||||
sb.safePrintf("<a href=/ss_settings.png><img width=150 height=81 src=ss_settings_thumb.png></a>");
|
||||
|
||||
sb.safePrintf("<br><br>");
|
||||
sb.safePrintf("</td></tr>\n");
|
||||
|
||||
sb.safePrintf("<a href=/ss_hosts.png><img width=150 height=81 src=ss_hosts_thumb.png></a>");
|
||||
|
||||
sb.safePrintf("<br><br>");
|
||||
|
||||
sb.safePrintf("<a href=/ss_filters.png><img width=150 height=81 src=ss_filters_thumb.png></a>");
|
||||
|
||||
sb.safePrintf("</td>");
|
||||
|
||||
|
||||
sb.safePrintf("</tr>\n");
|
||||
|
||||
sb.safePrintf("</table></center>\n");
|
||||
|
||||
/*
|
||||
|
||||
do not show table for open source installs
|
||||
|
||||
|
||||
// donate with paypal
|
||||
PageSockets.cpp
@ -349,11 +349,16 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
|
||||
"<td><b>hostname</b></td>";
|
||||
}
|
||||
|
||||
UdpSlot *slot = server->m_head3;
|
||||
int32_t callbackReadyCount = 0;
|
||||
for ( ; slot ; slot = slot->m_next3 , callbackReadyCount++ );
|
||||
|
||||
p->safePrintf ( "<table %s>"
|
||||
"<tr class=hdrow><td colspan=19>"
|
||||
"<center>"
|
||||
//"<font size=+1>"
|
||||
"<b>%s</b> (%"INT32" transactions)"
|
||||
"(%"INT32" reads ready)"
|
||||
//"</font>"
|
||||
"</td></tr>"
|
||||
"<tr bgcolor=#%s>"
|
||||
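The new callbackReadyCount above is just a walk of the UdpServer's m_head3 list via m_next3 to report how many slots have reads ready for their callbacks. The idiom in isolation, with a hypothetical slot node in place of UdpSlot.

#include <cstdio>

// hypothetical slot node mirroring UdpSlot's m_next3 chaining
struct Slot { Slot *m_next3; };

static int countReadsReady ( Slot *head ) {
	int n = 0;
	for ( Slot *s = head ; s ; s = s->m_next3 ) n++;
	return n;
}

int main ( ) {
	Slot c = { NULL } , b = { &c } , a = { &b };
	printf ( "%d reads ready\n" , countReadsReady ( &a ) );   // prints 3
	return 0;
}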
@ -380,6 +385,7 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
|
||||
"</tr>\n" ,
|
||||
TABLE_STYLE,
|
||||
title , server->getNumUsedSlots() ,
|
||||
callbackReadyCount ,
|
||||
DARK_BLUE ,
|
||||
dd );
|
||||
|
||||
Pages.cpp
@ -4092,7 +4092,7 @@ bool printRedBox ( SafeBuf *mb , TcpSocket *sock , HttpRequest *hr ) {
|
||||
for ( int32_t i = 1 ; i < g_hostdb.getNumHosts() ; i++ ) {
|
||||
Host *h = &g_hostdb.m_hosts[i];
|
||||
if ( g_hostdb.isDead( h ) ) continue;
|
||||
if ( h->m_pingInfo.m_udpSlotsInUse >= 400 ) jammedHosts++;
|
||||
if ( h->m_pingInfo.m_udpSlotsInUseIncoming>= 400)jammedHosts++;
|
||||
}
|
||||
if ( jammedHosts > 0 ) {
|
||||
if ( adds ) mb->safePrintf("<br>");
|
||||
@ -4101,8 +4101,8 @@ bool printRedBox ( SafeBuf *mb , TcpSocket *sock , HttpRequest *hr ) {
|
||||
if ( out == 1 ) s = " is";
|
||||
mb->safePrintf("%s",box);
|
||||
mb->safePrintf("%"INT32" host%s jammed with "
|
||||
"over %"INT32" outstanding "
|
||||
"udp transactions. "
|
||||
"over %"INT32" unhandled "
|
||||
"incoming udp requests. "
|
||||
"See <a href=/admin/sockets?c=%s>sockets</a>"
|
||||
" table.",jammedHosts,s,400,coll);
|
||||
mb->safePrintf("%s",boxEnd);
|
||||
|
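printRedBox now flags a host as jammed when its incoming (not total) UDP slot count reaches 400, since only unhandled incoming requests mean the host can't keep up. A standalone sketch of that scan; the host struct is stubbed and the counts are invented.

#include <cstdio>

// stubbed-down host record; only the fields the check needs
struct FakeHost { int m_udpSlotsInUseIncoming; bool m_dead; };

int main ( ) {
	FakeHost hosts[] = { { 12 , false } , { 450 , false } , { 900 , true } };
	int jammed = 0;
	for ( int i = 0 ; i < 3 ; i++ ) {
		if ( hosts[i].m_dead ) continue;                 // dead hosts are skipped
		if ( hosts[i].m_udpSlotsInUseIncoming >= 400 ) jammed++;
	}
	if ( jammed > 0 )
		printf ( "%d host%s jammed with over %d unhandled incoming udp requests\n" ,
		         jammed , jammed == 1 ? " is" : "s are" , 400 );
	return 0;
}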
121
Parms.cpp
@ -1625,6 +1625,11 @@ bool printDropDown ( int32_t n , SafeBuf* sb, char *name, int32_t select,
|
||||
// . by default, minus 2 includes minus 3, the new "FILTERED" priority
|
||||
// . it is like "BANNED" but does not mean the url is low quality necessarily
|
||||
if ( includeMinusTwo ) i = -3;
|
||||
|
||||
// no more DELETE, etc.
|
||||
i = 0;
|
||||
if ( select < 0 ) select = 0;
|
||||
|
||||
for ( ; i < n ; i++ ) {
|
||||
if ( i == select ) s = " selected";
|
||||
else s = "";
|
||||
@ -3446,8 +3451,11 @@ bool Parms::setFromFile ( void *THIS ,
|
||||
Xml xml;
|
||||
//char buf [ MAX_XML_CONF ];
|
||||
SafeBuf sb;
|
||||
if ( filename&&!setXmlFromFile(&xml,filename,&sb))//buf,MAX_XML_CONF) )
|
||||
if ( filename&&!setXmlFromFile(&xml,filename,&sb)){//buf,MAX_XML_CONF))
|
||||
log("parms: error setting from file %s: %s",filename,
|
||||
mstrerror(g_errno));
|
||||
return false;
|
||||
}
|
||||
|
||||
// . all the collectionRecs have the same default file in
|
||||
// the workingDir/collections/default.conf
|
||||
@ -3499,7 +3507,7 @@ bool Parms::setFromFile ( void *THIS ,
|
||||
if ( m->m_type == TYPE_CONSTANT ) continue;
|
||||
// these are special commands really
|
||||
if ( m->m_type == TYPE_BOOL2 ) continue;
|
||||
//if ( strcmp ( m->m_xml , "users" ) == 0 )
|
||||
//if ( strcmp ( m->m_xml , "forceDeleteUrls" ) == 0 )
|
||||
// log("got it");
|
||||
// we did not get one from first xml file yet
|
||||
bool first = true;
|
||||
@ -12985,11 +12993,15 @@ void Parms::init ( ) {
|
||||
"expressions. "
|
||||
"Use the <i>&&</i> operator to string multiple expressions "
|
||||
"together in the same expression text box. "
|
||||
"A <i>spider priority</i> of "
|
||||
"If you check the <i>delete</i> checkbox then urls matching "
|
||||
"that row will be deleted if already indexed, otherwise, "
|
||||
"they just won't be indexed."
|
||||
//"A <i>spider priority</i> of "
|
||||
//"<i>FILTERED</i> or <i>BANNED</i> "
|
||||
"<i>DELETE</i> "
|
||||
"will cause the URL to not be spidered, or if it has already "
|
||||
"been indexed, it will be deleted when it is respidered."
|
||||
// "<i>DELETE</i> "
|
||||
// "will cause the URL to not be spidered, "
|
||||
// "or if it has already "
|
||||
// "been indexed, it will be deleted when it is respidered."
|
||||
"<br><br>";
|
||||
|
||||
/*
|
||||
@ -13159,6 +13171,19 @@ void Parms::init ( ) {
|
||||
m++;
|
||||
*/
|
||||
|
||||
m->m_title = "delete";
|
||||
m->m_cgi = "fdu";
|
||||
m->m_xml = "forceDeleteUrls";
|
||||
m->m_max = MAX_FILTERS;
|
||||
m->m_off = (char *)cr.m_forceDelete - x;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "0";
|
||||
m->m_page = PAGE_FILTERS;
|
||||
m->m_rowid = 1;
|
||||
m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
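The forceDeleteUrls parm above is a per-row checkbox (one slot per url filter, up to MAX_FILTERS), and the page description earlier says a checked row deletes matching urls that are already indexed and simply refuses to index the rest. A hedged sketch of consulting such a per-row flag; the MAX_FILTERS value and the helper are assumptions, not the real spider logic.

#include <cstdio>

#define MAX_FILTERS 96   // assumed cap for the sketch

// per-row "delete" checkboxes, like cr.m_forceDelete in the parm block above
static char s_forceDelete [ MAX_FILTERS ] = { 0 , 1 , 0 };

// decide what to do with a url that matched url-filter row "row":
// delete it if already indexed, otherwise just refuse to index it
static const char *actionForMatch ( int row , bool alreadyIndexed ) {
	if ( ! s_forceDelete[row] ) return "index normally";
	return alreadyIndexed ? "delete from index" : "do not index";
}

int main ( ) {
	printf ( "%s\n" , actionForMatch ( 1 , true  ) );
	printf ( "%s\n" , actionForMatch ( 1 , false ) );
	printf ( "%s\n" , actionForMatch ( 0 , true  ) );
	return 0;
}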
m->m_title = "spider priority";
|
||||
m->m_cgi = "fsp";
|
||||
m->m_xml = "filterPriority";
|
||||
@ -17754,7 +17779,8 @@ void Parms::init ( ) {
|
||||
// and we add gbdocspidertime and gbdocindextime terms so you
|
||||
// can use those to sort regular docs and not have spider reply
|
||||
// status docs in the serps.
|
||||
m->m_def = "0";
|
||||
// back on 4/21/2015 seems pretty stable.
|
||||
m->m_def = "1";
|
||||
m->m_page = PAGE_SPIDER;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m->m_flags = PF_CLONE;
|
||||
@ -22006,6 +22032,41 @@ bool Parms::updateParm ( char *rec , WaitEntry *we ) {
|
||||
cr->m_localCrawlInfo.m_lastSpiderAttempt = 0;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// if user changed the crawl/process max then reset here so
|
||||
// spiders will resume
|
||||
//
|
||||
if ( base == cr &&
|
||||
dst == (char *)&cr->m_maxToCrawl &&
|
||||
cr->m_spiderStatus == SP_MAXTOCRAWL ) {
|
||||
// reset this for rebuilding of active spider collections
|
||||
// so this collection can be in the linked list again
|
||||
cr->m_spiderStatus = SP_INPROGRESS;
|
||||
// rebuild list of active spider collections then
|
||||
g_spiderLoop.m_activeListValid = false;
|
||||
}
|
||||
|
||||
if ( base == cr &&
|
||||
dst == (char *)&cr->m_maxToProcess &&
|
||||
cr->m_spiderStatus == SP_MAXTOPROCESS ) {
|
||||
// reset this for rebuilding of active spider collections
|
||||
// so this collection can be in the linked list again
|
||||
cr->m_spiderStatus = SP_INPROGRESS;
|
||||
// rebuild list of active spider collections then
|
||||
g_spiderLoop.m_activeListValid = false;
|
||||
}
|
||||
|
||||
if ( base == cr &&
|
||||
dst == (char *)&cr->m_maxCrawlRounds &&
|
||||
cr->m_spiderStatus == SP_MAXROUNDS ) {
|
||||
// reset this for rebuilding of active spider collections
|
||||
// so this collection can be in the linked list again
|
||||
cr->m_spiderStatus = SP_INPROGRESS;
|
||||
// rebuild list of active spider collections then
|
||||
g_spiderLoop.m_activeListValid = false;
|
||||
}
|
||||
|
||||
//
|
||||
// END HACK
|
||||
//
|
||||
@ -22287,11 +22348,18 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
|
||||
|
||||
|
||||
"<tr class=poo><td>isrss | !isrss</td>"
|
||||
"<td>Matches if document is an rss feed. "
|
||||
"When harvesting outlinks we <i>guess</i> if they "
|
||||
"are an rss feed by seeing if their file extension "
|
||||
"is xml, rss or rdf. Or if they are in an "
|
||||
"alternative link tag.</td></tr>"
|
||||
"<td>Matches if document is an RSS feed. Will "
|
||||
"only match this rule if the document has been "
|
||||
"successfully spidered before, because it requires "
|
||||
"downloading the document content to see if it "
|
||||
"truly is an RSS feed.."
|
||||
"</td></tr>"
|
||||
|
||||
"<tr class=poo><td>isrssext | !isrssext</td>"
|
||||
"<td>Matches if url ends in .xml .rss or .atom. "
|
||||
"TODO: Or if the link was in an "
|
||||
"alternative link tag."
|
||||
"</td></tr>"
|
||||
|
||||
//"<tr class=poo><td>!isrss</td>"
|
||||
//"<td>Matches if document is NOT an rss feed."
|
||||
@ -22452,6 +22520,13 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
|
||||
"then this will be matched."
|
||||
"</td></tr>"
|
||||
|
||||
"<tr class=poo><td>isparentsitemap | "
|
||||
"!isparentsitemap</td>"
|
||||
"<td>"
|
||||
"If a parent of the URL was a sitemap.xml page "
|
||||
"then this will be matched."
|
||||
"</td></tr>"
|
||||
|
||||
/*
|
||||
"<tr class=poo><td>parentisnew | !parentisnew</td>"
|
||||
"<td>"
|
||||
@ -22518,6 +22593,20 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
|
||||
"Can use <, >, <=, >=, ==, != comparison operators. "
|
||||
"</td></tr>"
|
||||
|
||||
|
||||
"<tr class=poo><td>numinlinks>20</td>"
|
||||
"<td>"
|
||||
"How many inlinks does the URL itself have? "
|
||||
"We only count one link per unique C-Class IP "
|
||||
"address "
|
||||
"so that a webmaster who owns an entire C-Class "
|
||||
"of IP addresses will only have her inlinks counted "
|
||||
"once."
|
||||
"Can use <, >, <=, >=, ==, != comparison operators. "
|
||||
"This is useful for spidering popular URLs quickly."
|
||||
"</td></tr>"
|
||||
|
||||
|
||||
"<tr class=poo><td>httpstatus==404</td>"
|
||||
"<td>"
|
||||
"For matching the URL based on the http status "
|
||||
@ -22649,6 +22738,14 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
|
||||
"<i>foo.somesite.com</i> would NOT match."
|
||||
"</td></tr>"
|
||||
|
||||
|
||||
"<tr class=poo><td>isroot | !isroot</td>"
|
||||
"<td>Matches if the URL is a root URL. Like if "
|
||||
"its path is just '/'. Example: http://www.abc.com "
|
||||
"is a root ur but http://www.abc.com/foo is not. "
|
||||
"</td></tr>"
|
||||
|
||||
|
||||
"<tr class=poo><td>isonsamedomain | !isonsamedomain</td>"
|
||||
"<td>"
|
||||
"This is true if the url is from the same "
|
||||
|
8
Parms.h
@ -29,10 +29,10 @@ void handleRequest3f ( UdpSlot *slot , int32_t niceness ) ;
|
||||
|
||||
// special priorities for the priority drop down
|
||||
// in the url filters table
|
||||
enum {
|
||||
SPIDER_PRIORITY_FILTERED = -3 ,
|
||||
SPIDER_PRIORITY_BANNED = -2 ,
|
||||
SPIDER_PRIORITY_UNDEFINED = -1 };
|
||||
//enum {
|
||||
// SPIDER_PRIORITY_FILTERED = -3 ,
|
||||
// SPIDER_PRIORITY_BANNED = -2 ,
|
||||
// SPIDER_PRIORITY_UNDEFINED = -1 };
|
||||
|
||||
enum {
|
||||
OBJ_CONF = 1 ,
|
||||
PingServer.cpp
@ -28,6 +28,7 @@ int32_t klogctl( int, char *,int ) { return 0; }
|
||||
|
||||
// from main.cpp. when keepalive script restarts us this is true
|
||||
extern bool g_recoveryMode;
|
||||
extern int32_t g_recoveryLevel;
|
||||
|
||||
// a global class extern'd in .h file
|
||||
PingServer g_pingServer;
|
||||
@ -281,6 +282,9 @@ void PingServer::sendPingsToAll ( ) {
|
||||
|
||||
// };
|
||||
|
||||
// from Loop.cpp
|
||||
extern float g_cpuUsage;
|
||||
|
||||
// ping host #i
|
||||
void PingServer::pingHost ( Host *h , uint32_t ip , uint16_t port ) {
|
||||
// don't ping on interface machines
|
||||
@ -491,6 +495,10 @@ void PingServer::pingHost ( Host *h , uint32_t ip , uint16_t port ) {
|
||||
flags |= PFLAG_MERGEMODE0OR6;
|
||||
if ( ! isClockInSync() ) flags |= PFLAG_OUTOFSYNC;
|
||||
|
||||
uint8_t rv8 = (uint8_t)g_recoveryLevel;
|
||||
if ( g_recoveryLevel > 255 ) rv8 = 255;
|
||||
pi->m_recoveryLevel = rv8;
|
||||
|
||||
//*(int32_t *)p = flags; p += 4; // 4 bytes
|
||||
pi->m_flags = flags;
|
||||
|
||||
@ -504,10 +512,13 @@ void PingServer::pingHost ( Host *h , uint32_t ip , uint16_t port ) {
|
||||
|
||||
pi->m_localHostTimeMS = gettimeofdayInMillisecondsLocal();
|
||||
|
||||
pi->m_udpSlotsInUse = g_udpServer.getNumUsedSlots();
|
||||
pi->m_udpSlotsInUseIncoming = g_udpServer.getNumUsedSlotsIncoming();
|
||||
|
||||
pi->m_tcpSocketsInUse = g_httpServer.m_tcp.m_numUsed;
|
||||
|
||||
// from Loop.cpp
|
||||
pi->m_cpuUsage = g_cpuUsage;
|
||||
|
||||
// store hd temps
|
||||
// gbmemcpy ( p , me->m_hdtemps , 4 * 2 );
|
||||
// p += 4 * 2;
|
||||
|
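pingHost above now snapshots both the total and the incoming-only UDP slot counts, plus TCP sockets in use and CPU load, into the PingInfo it broadcasts. A sketch of filling such a snapshot from stubbed gauges; MiniPingInfo and the gauge functions are stand-ins, not the real g_udpServer / g_httpServer API.

#include <cstdio>
#include <cstdint>

// reduced stand-in for the PingInfo fields touched in the hunk above
struct MiniPingInfo {
	int32_t m_udpSlotsInUse;
	int32_t m_udpSlotsInUseIncoming;
	int32_t m_tcpSocketsInUse;
	float   m_cpuUsage;
};

// stubbed gauges standing in for the server globals
static int32_t usedSlots ( )         { return 37; }
static int32_t usedSlotsIncoming ( ) { return 12; }
static int32_t tcpUsed ( )           { return  5; }
static float   cpuUsage ( )          { return 48.5f; }

int main ( ) {
	MiniPingInfo pi;
	pi.m_udpSlotsInUse         = usedSlots();
	pi.m_udpSlotsInUseIncoming = usedSlotsIncoming();
	pi.m_tcpSocketsInUse       = tcpUsed();
	pi.m_cpuUsage              = cpuUsage();
	printf ( "udp=%d incoming=%d tcp=%d cpu=%.1f%%\n" ,
	         (int)pi.m_udpSlotsInUse , (int)pi.m_udpSlotsInUseIncoming ,
	         (int)pi.m_tcpSocketsInUse , pi.m_cpuUsage );
	return 0;
}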
61
Posdb.cpp
@ -686,8 +686,10 @@ PosdbTable::~PosdbTable() {
|
||||
}
|
||||
|
||||
void PosdbTable::reset() {
|
||||
// we can't reset this because we don't recall allocTopTree()
|
||||
// again when computing search results in docid ranges.
|
||||
//m_hasFacetTerm = false;
|
||||
// has init() been called?
|
||||
m_hasFacetTerm = false;
|
||||
m_initialized = false;
|
||||
m_estimatedTotalHits = -1;
|
||||
m_errno = 0;
|
||||
@ -4365,6 +4367,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
qti->m_qtermNum = i;
|
||||
// and vice versa
|
||||
qt->m_queryTermInfoNum = nrg;
|
||||
// now we count the total # of docs that have a facet
|
||||
// for doing tf/idf type things
|
||||
//qti->m_numDocsThatHaveFacet = 0;
|
||||
// this is not good enough, we need to count
|
||||
// non-whitespace punct as 2 units not 1 unit
|
||||
// otherwise qdist gets thrown off and our phrasing fails.
|
||||
@ -4960,10 +4965,36 @@ inline bool isInRange2 ( char *recPtr , char *subListEnd, QueryTerm *qt ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// for a facet
|
||||
int64_t PosdbTable::countUniqueDocids( QueryTermInfo *qti ) {
|
||||
// get that sublist. facets should only have one sublist since
|
||||
// they have no synonyms.
|
||||
char *start = qti->m_subLists[0]->getList();
|
||||
register char *recPtr = start;
|
||||
register char *subListEnd = qti->m_subLists[0]->getListEnd();
|
||||
int64_t count = 0;
|
||||
loop:
|
||||
if ( recPtr >= subListEnd ) {
|
||||
if ( m_debug )
|
||||
log(LOG_DEBUG,"posdb: term list size of %"
|
||||
INT32" has %"INT64" unique docids"
|
||||
, (int32_t)(subListEnd-start),count);
|
||||
return count;
|
||||
}
|
||||
// skip that docid record in our termlist. it MUST have been
|
||||
// 12 bytes, a docid heading record.
|
||||
recPtr += 12;
|
||||
count++;
|
||||
// skip any following keys that are 6 bytes, that means they
|
||||
// share the same docid
|
||||
for ( ; recPtr < subListEnd && ((*recPtr)&0x04); recPtr += 6 );
|
||||
goto loop;
|
||||
}
|
||||
|
||||
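countUniqueDocids above leans on the posdb list layout: every docid opens with a 12-byte key, and any following 6-byte keys (their first byte has the 0x04 bit set) belong to the same docid. A toy walk over a fabricated list that applies the same counting rule; the key bytes are dummies, only the sizes and the 0x04 bit matter here.

#include <cstdio>

int main ( ) {
	// fake termlist: 12-byte head key, two 6-byte follow-on keys (0x04 set),
	// then another 12-byte head key => 2 unique docids
	unsigned char list[] = {
		0x02,0,0,0,0,0,0,0,0,0,0,0,        // docid #1 head (12 bytes)
		0x06,0,0,0,0,0,                    // same docid     (6 bytes, 0x04 set)
		0x06,0,0,0,0,0,                    // same docid
		0x02,0,0,0,0,0,0,0,0,0,0,0 };      // docid #2 head
	unsigned char *p   = list;
	unsigned char *end = list + sizeof(list);
	long count = 0;
	while ( p < end ) {
		p += 12;                           // the docid-heading record is 12 bytes
		count++;
		// skip 6-byte keys that share the same docid
		while ( p < end && ( *p & 0x04 ) ) p += 6;
	}
	printf ( "%ld unique docids\n" , count );  // prints 2
	return 0;
}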
// . add a QueryTermInfo for a term (synonym lists,etc) to the docid vote buf
|
||||
// "m_docIdVoteBuf"
|
||||
// . this is how we intersect all the docids to end up with the winners
|
||||
void PosdbTable::addDocIdVotes ( QueryTermInfo *qti , int32_t listGroupNum ) {
|
||||
void PosdbTable::addDocIdVotes ( QueryTermInfo *qti , int32_t listGroupNum) {
|
||||
|
||||
// sanity check, we store this in a single byte below for voting
|
||||
if ( listGroupNum >= 256 ) { char *xx=NULL;*xx=0; }
|
||||
@ -5006,7 +5037,7 @@ void PosdbTable::addDocIdVotes ( QueryTermInfo *qti , int32_t listGroupNum ) {
|
||||
// the docid vote buf. that is, if the query is "jump car" we
|
||||
// just add all the docids for "jump" and then intersect with the
|
||||
// docids for "car".
|
||||
for ( int32_t i = 0 ; i < qti->m_numSubLists && listGroupNum > 0 ; i++ ) {
|
||||
for ( int32_t i = 0 ; i < qti->m_numSubLists && listGroupNum > 0; i++){
|
||||
// get that sublist
|
||||
recPtr = qti->m_subLists[i]->getList();
|
||||
subListEnd = qti->m_subLists[i]->getListEnd();
|
||||
@ -5049,6 +5080,7 @@ void PosdbTable::addDocIdVotes ( QueryTermInfo *qti , int32_t listGroupNum ) {
|
||||
dp[5] = listGroupNum;
|
||||
// skip it
|
||||
dp += 6;
|
||||
|
||||
// advance recPtr now
|
||||
break;
|
||||
}
|
||||
@ -5121,7 +5153,7 @@ void PosdbTable::addDocIdVotes ( QueryTermInfo *qti , int32_t listGroupNum ) {
|
||||
for ( int32_t i = 0 ; i < qti->m_numSubLists ; i++ ) {
|
||||
// skip if exhausted
|
||||
if ( ! cursor[i] ) continue;
|
||||
// int16_tcut
|
||||
// shortcut
|
||||
recPtr = cursor[i];
|
||||
// get the min docid
|
||||
if ( ! minRecPtr ) {
|
||||
@ -5628,6 +5660,23 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
//if ( s_special == 2836 )
|
||||
// log("hey");
|
||||
|
||||
// point to our array of query term infos set in setQueryTermInfos()
|
||||
QueryTermInfo *qip = (QueryTermInfo *)m_qiBuf.getBufStart();
|
||||
|
||||
// if a query term is for a facet (ie gbfacetstr:gbtagsite)
|
||||
// then count how many unique docids are in it. we were trying to
|
||||
// do this in addDocIdVotes() but it wasn't in the right place i guess.
|
||||
for ( int32_t i = 0 ; i < m_numQueryTermInfos ; i++ ) {
|
||||
QueryTermInfo *qti = &qip[i];
|
||||
QueryTerm *qt = qti->m_qt;
|
||||
bool isFacetTerm = false;
|
||||
if ( qt->m_fieldCode == FIELD_GBFACETSTR ) isFacetTerm = true;
|
||||
if ( qt->m_fieldCode == FIELD_GBFACETINT ) isFacetTerm = true;
|
||||
if ( qt->m_fieldCode == FIELD_GBFACETFLOAT ) isFacetTerm =true;
|
||||
if ( ! isFacetTerm ) continue;
|
||||
qt->m_numDocsThatHaveFacet += countUniqueDocids ( qti );
|
||||
}
|
||||
|
||||
|
||||
// setQueryTermInfos() should have set how many we have
|
||||
if ( m_numQueryTermInfos == 0 ) {
|
||||
@ -5662,8 +5711,6 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
|
||||
int32_t listGroupNum = 0;
|
||||
|
||||
// point to our array of query term infos set in setQueryTermInfos()
|
||||
QueryTermInfo *qip = (QueryTermInfo *)m_qiBuf.getBufStart();
|
||||
|
||||
// if all non-negative query terms are in the same wikiphrase then
|
||||
// we can apply the WIKI_WEIGHT in getMaxPossibleScore() which
|
||||
@ -5705,8 +5752,6 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
goto skip3;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// . create "m_docIdVoteBuf" filled with just the docids from the
|
||||
// smallest group of sublists
|
||||
// . m_minListi is the queryterminfo that had the smallest total
|
||||
|
2
Posdb.h
@ -711,6 +711,8 @@ class PosdbTable {
|
||||
|
||||
void shrinkSubLists ( class QueryTermInfo *qti );
|
||||
|
||||
int64_t countUniqueDocids( QueryTermInfo *qti ) ;
|
||||
|
||||
// for intersecting docids
|
||||
void addDocIdVotes ( class QueryTermInfo *qti , int32_t listGroupNum );
|
||||
|
||||
Process.cpp
@ -1471,6 +1471,11 @@ bool Process::shutdown2 ( ) {
|
||||
else
|
||||
log(LOG_INFO,"gb: Shutting down. Try #%"INT32".",m_try++);
|
||||
|
||||
|
||||
// switch to urgent if having problems
|
||||
if ( m_try >= 10 )
|
||||
m_urgent = true;
|
||||
|
||||
// turn off statsdb so it does not try to add records for these writes
|
||||
g_statsdb.m_disabled = true;
|
||||
|
||||
@ -1861,7 +1866,7 @@ bool Process::saveBlockingFiles1 ( ) {
|
||||
if ( g_conf.m_readOnlyMode ) return true;
|
||||
|
||||
// save user accounting files. 3 of them.
|
||||
if ( g_hostdb.m_myHost->m_isProxy )
|
||||
if ( g_hostdb.m_myHost && g_hostdb.m_myHost->m_isProxy )
|
||||
g_proxy.saveUserBufs();
|
||||
|
||||
// save the Conf file now
|
||||
|
421
Query.cpp
@ -32,6 +32,11 @@ void Query::constructor ( ) {
|
||||
m_qwords = NULL;
|
||||
m_numTerms = 0;
|
||||
m_containingParent = NULL;
|
||||
m_st0Ptr = NULL;
|
||||
// we have to manually call this because Query::constructor()
|
||||
// might have been called explicitly
|
||||
for ( int32_t i = 0 ; i < MAX_QUERY_TERMS ; i++ )
|
||||
m_qterms[i].constructor();
|
||||
//m_expressions = NULL;
|
||||
reset ( );
|
||||
}
|
||||
@ -48,10 +53,15 @@ void Query::reset ( ) {
|
||||
|
||||
// if Query::constructor() was called explicitly then we have to
|
||||
// call destructors explicitly as well...
|
||||
// essentially call QueryTerm::reset() on each query term
|
||||
for ( long i = 0 ; i < m_numTerms ; i++ ) {
|
||||
// get it
|
||||
QueryTerm *qt = &m_qterms[i];
|
||||
HashTableX *ht = &qt->m_facetHashTable;
|
||||
// debug note
|
||||
// log("results: free fhtqt of %"PTRFMT" for q=%"PTRFMT
|
||||
// " st0=%"PTRFMT,
|
||||
// (PTRTYPE)ht->m_buf,(PTRTYPE)this,(PTRTYPE)m_st0Ptr);
|
||||
ht->reset();
|
||||
qt->m_facetIndexBuf.purge();
|
||||
}
|
||||
@ -1285,6 +1295,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
qt->m_isPhrase = false ;
|
||||
qt->m_isUORed = false;
|
||||
qt->m_UORedTerm = NULL;
|
||||
qt->m_langIdBits = 0;
|
||||
// synonym of this term...
|
||||
qt->m_synonymOf = origTerm;
|
||||
// nuke this crap since it was done above and we
|
||||
@ -2570,6 +2581,11 @@ bool Query::setQWords ( char boolFlag ,
|
||||
qw->m_ignoreWordInBoolQuery = true;
|
||||
}
|
||||
|
||||
// this seems case sensitive now, gbfacetstr:humanLang
|
||||
if ( fieldCode == FIELD_GBFACETSTR ) {
|
||||
wid = hash64 ( w , wlen , 0LL );
|
||||
}
|
||||
|
||||
if ( fieldCode == FIELD_GBFIELDMATCH ) {
|
||||
// hash the json field name. (i.e. tag.uri)
|
||||
// make it case sensitive as
|
||||
@ -4105,8 +4121,10 @@ struct QueryField g_fields[] = {
|
||||
false,
|
||||
"gbdocspiderdate:1400081479",
|
||||
"Matches documents that have "
|
||||
"that spider date timestamp (UTC). Does not include the "
|
||||
"special spider status documents. This is the time the document "
|
||||
"that spider date timestamp (UTC). "
|
||||
//"Does not include the "
|
||||
//"special spider status documents. "
|
||||
"This is the time the document "
|
||||
"completed downloading.",
|
||||
"Date Related Query Operators",
|
||||
QTF_BEGINNEWTABLE},
|
||||
@ -4116,7 +4134,8 @@ struct QueryField g_fields[] = {
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbspiderdate:1400081479",
|
||||
"Like above, but DOES include the special spider status documents.",
|
||||
"Like above.",
|
||||
//, but DOES include the special spider status documents.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
@ -4126,8 +4145,8 @@ struct QueryField g_fields[] = {
|
||||
"gbdocindexdate:1400081479",
|
||||
"Like above, but is the time the document was last indexed. "
|
||||
"This time is "
|
||||
"slightly greater than or equal to the spider date. Does not "
|
||||
"include the special spider status documents.",
|
||||
"slightly greater than or equal to the spider date.",//Does not "
|
||||
//"include the special spider status documents.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
@ -4136,8 +4155,8 @@ struct QueryField g_fields[] = {
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbindexdate:1400081479",
|
||||
"Like above, but it does include the special spider status "
|
||||
"documents.",
|
||||
"Like above.",//, but it does include the special spider status "
|
||||
//"documents.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
@ -4251,6 +4270,384 @@ struct QueryField g_fields[] = {
|
||||
//
|
||||
// spider status docs queries
|
||||
//
|
||||
|
||||
{"gbssUrl",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssUrl:com",
|
||||
"Query the url of a spider status document.",
|
||||
"Spider Status Documents", // title
|
||||
QTF_BEGINNEWTABLE},
|
||||
|
||||
|
||||
{"gbssFinalRedirectUrl",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssFinalRedirectUrl:abc.com/page2.html",
|
||||
"Query on the last url redirect to, if any.",
|
||||
NULL, // title
|
||||
0},
|
||||
|
||||
{"gbssStatusCode",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssStatusCode:0",
|
||||
"Query on the status code of the index attempt. 0 means no error.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssStatusMsg",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssStatusMsg:\"Tcp timed\"",
|
||||
"Like gbssStatusCode but a textual representation.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssHttpStatus",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssHttpStatus:200",
|
||||
"Query on the HTTP status returned from the web server.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssWasIndexed",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssWasIndexed:0",
|
||||
"Was the document in the index before attempting to index? Use 0 "
|
||||
" or 1 to find all documents that were not or were, respectively.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssIsDiffbotObject",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssIsDiffbotObject:1",
|
||||
"This field is only present if the document was an object from "
|
||||
"a diffbot reply. Use gbssIsDiffbotObject:0 to find the non-diffbot "
|
||||
"objects.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssAgeInIndex",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortby:gbssAgeInIndex",
|
||||
"If the document was in the index at the time we attempted to "
|
||||
"reindex it, how long has it been since it was last indexed?",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDomain",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssDomain:yahoo.com",
|
||||
"Query on the domain of the url.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssSubdomain",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssSubdomain:www.yahoo.com",
|
||||
"Query on the subdomain of the url.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssNumRedirects",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssNumRedirects",
|
||||
"Query on the number of times the url redirect when attempting to "
|
||||
"index it.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDocId",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssDocId:1234567",
|
||||
"Show all the spider status docs for the document with this docId.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssHopCount",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssHopCount",
|
||||
"Query on the hop count of the document.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssCrawlRound",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssCrawlRound",
|
||||
"Query on the crawl round number.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDupOfDocId",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssDupOfDocId:123456",
|
||||
"Show all the documents that were considered dups of this docId.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssPrevTotalNumIndexAttempts",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssPrevTotalNumIndexAttempts:1",
|
||||
"Before this index attempt, how many attempts were there?",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssPrevTotalNumIndexSuccesses",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssPrevTotalNumIndexSuccesses:1",
|
||||
"Before this index attempt, how many successful attempts were there?",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssPrevTotalNumIndexFailures",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssPrevTotalNumIndexFailures:1",
|
||||
"Before this index attempt, how many failed attempts were there?",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssFirstIndexed",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbrevsortbyint:gbssFirsIndexed",
|
||||
"The date in utc that the document was first indexed.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssContentHash32",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssContentHash32",
|
||||
"The hash of the document content, excluding dates and times. Used "
|
||||
"internally for deduping.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDownloadDurationMS",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortbyint:gbssDownloadDurationMS",
|
||||
"How long it took in millisecons to download the document.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDownloadStartTime",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortbyint:gbssDownloadStartTime",
|
||||
"When the download started, in seconds since the epoch, UTC.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDownloadEndTime",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortbyint:gbssDownloadEndTime",
|
||||
"When the download ended, in seconds since the epoch, UTC.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssUsedRobotsTxt",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssUsedRobotsTxt",
|
||||
"This is 0 or 1 depending on if robots.txt was not obeyed or obeyed, "
|
||||
"respectively.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssConsecutiveErrors",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssConsecutiveErrors",
|
||||
"For the last set of indexing attempts how many were errors?",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssIp",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssIp:1.2.3.4",
|
||||
"The IP address of the document being indexed. Is 0.0.0.0 "
|
||||
"if unknown.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssIpLookupTimeMS",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortby:gbssIpLookupTimeMS",
|
||||
"How long it took to lookup the IP of the document. Might have been "
|
||||
"in the cache.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssSiteNumInlinks",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortby:gbssSiteNumInlinks",
|
||||
"How many good inlinks the document's site had.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssSiteRank",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortby:gbssSiteRank",
|
||||
"The site rank of the document. Based directly "
|
||||
"on the number of inlinks the site had.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssContentInjected",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssContentInjected",
|
||||
"This is 0 or 1 if the content was not injected or injected, "
|
||||
"respectively.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssPercentContentChanged",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetfloat:gbssPercentContentChanged",
|
||||
"A float between 0 and 100, inclusive. Represents how much "
|
||||
"the document has changed since the last time we indexed it. This is "
|
||||
"only valid if the document was successfully indexed this time."
|
||||
"respectively.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssSpiderPriority",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssSpiderPriority",
|
||||
"The spider priority, from 0 to 127, inclusive, of the document "
|
||||
"according to the url filters table.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssMatchingUrlFilter",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetstr:gbssMatchingUrlFilter",
|
||||
"The url filter expression the document matched.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssLanguage",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetstr:gbssLanguage",
|
||||
"The language of the document. If document was empty or not "
|
||||
"downloaded then this will not be present. Uses xx to mean "
|
||||
"unknown language. Uses the language abbreviations found at the "
|
||||
"bottom of the url filters page.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssContentType",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetstr:gbssContentType",
|
||||
"The content type of the document. Like html, xml, json, pdf, etc. "
|
||||
"This field is not present if unknown.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssContentLen",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortbyint:gbssContentLen",
|
||||
"The content length of the document. 0 if empty or not downloaded.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssCrawlDelayMS",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssCrawlDelay",
|
||||
"The crawl delay according to the robots.txt of the document. "
|
||||
"This is -1 if not specified in the robots.txt or not found.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssSentToDiffbot",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssSentToDiffbot:1",
|
||||
"Was the document's url sent to diffbot for processing?",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDiffbotReplyCode",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbssDiffbotReplyCode:0",
|
||||
"The reply received from diffbot. 0 means success, otherwise, it "
|
||||
"indicates an error code.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDiffbotReplyMsg",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetstr:gbssDiffbotReplyMsg:0",
|
||||
"The reply received from diffbot represented in text.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDiffbotReplyLen",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortbyint:gbssDiffbotReplyLen",
|
||||
"The length of the reply received from diffbot.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDiffbotReplyResponseTimeMS",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbsortbyint:gbssDiffbotReplyResponseTimeMS",
|
||||
"The time in milliseconds it took to get a reply from diffbot.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDiffbotReplyRetries",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssDiffbotReplyRetries",
|
||||
"The number of times we had to resend the request to diffbot "
|
||||
"because diffbot returned a 504 gateway timed out error.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
{"gbssDiffbotReplyNumObjects",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
"gbfacetint:gbssDiffbotReplyNumObjects",
|
||||
"The number of JSON objects diffbot excavated from the provided url.",
|
||||
NULL,
|
||||
0},
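Each entry above follows the positional QueryField layout used throughout g_fields[]. A minimal sketch of that layout for orientation only (the member names and the meaning of the third, sixth and seventh values are assumptions; the real definition lives in Query.h):

// sketch only -- member names are assumed, not the actual Query.h struct
struct QueryFieldSketch {
        const char *m_text;     // indexed field name, e.g. "gbssIp"
        char        m_type;     // field code, e.g. FIELD_GENERIC
        bool        m_flag;     // third positional value (purpose assumed)
        const char *m_example;  // sample query, e.g. "gbssIp:1.2.3.4"
        const char *m_desc;     // help text shown on the query-syntax page
        const char *m_aliases;  // NULL in the entries above (meaning assumed)
        char        m_flags;    // 0 in the entries above (meaning assumed)
};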
|
||||
|
||||
|
||||
/*
|
||||
{"gbstatus",
|
||||
FIELD_GENERIC,
|
||||
false,
|
||||
@ -4362,7 +4759,7 @@ struct QueryField g_fields[] = {
|
||||
"spider status documents.",
|
||||
NULL,
|
||||
0},
|
||||
|
||||
*/
|
||||
|
||||
|
||||
// they don't need to know about this
|
||||
@ -5038,6 +5435,14 @@ bool Query::isSplit() {
|
||||
return false;
|
||||
}
|
||||
|
||||
void QueryTerm::constructor ( ) {
|
||||
m_facetHashTable.constructor(); // hashtablex
|
||||
m_facetIndexBuf.constructor(); // safebuf
|
||||
m_langIdBits = 0;
|
||||
m_langIdBitsValid = false;
|
||||
m_numDocsThatHaveFacet = 0;
|
||||
}
|
||||
|
||||
bool QueryTerm::isSplit() {
|
||||
if(!m_fieldCode) return true;
|
||||
if(m_fieldCode == FIELD_QUOTA) return false;
|
||||
|
9
Query.h
@ -397,6 +397,11 @@ class QueryWord {
|
||||
class QueryTerm {
|
||||
|
||||
public:
|
||||
|
||||
//QueryTerm ( ) { constructor(); };
|
||||
|
||||
void constructor ( ) ;
|
||||
|
||||
// the query word we were derived from
|
||||
QueryWord *m_qword;
|
||||
// . are we a phrase termid or single word termid from that QueryWord?
|
||||
@ -557,6 +562,7 @@ class QueryTerm {
|
||||
int64_t m_hash64d;
|
||||
int32_t m_popWeight;
|
||||
|
||||
uint64_t m_numDocsThatHaveFacet;
|
||||
};
|
||||
|
||||
//#define MAX_OPSLOTS 256
|
||||
@ -871,6 +877,9 @@ class Query {
|
||||
return NULL;
|
||||
};
|
||||
|
||||
// for debugging fhtqt mem leak
|
||||
char *m_st0Ptr;
|
||||
|
||||
// silly little functions that support the BIG HACK
|
||||
//int32_t getNumNonFieldedSingletonTerms() { return m_numTermsSpecial; };
|
||||
//int32_t getTermsFound ( Query *q , char *foundTermVector ) ;
|
||||
|
@ -6,7 +6,8 @@ An open source web and enterprise search engine and spider/crawler. As can be se
|
||||
RUNNING GIGABLAST
|
||||
-----------------
|
||||
|
||||
See html/faq.html for all administrative documentation including
|
||||
See <a href=html/faq.html>html/faq.html</a>
|
||||
for all administrative documentation including
|
||||
the quick start instructions.
|
||||
|
||||
Alternatively, visit http://www.gigablast.com/faq.html
|
||||
@ -16,7 +17,8 @@ Alternatively, visit http://www.gigablast.com/faq.html
|
||||
CODE ARCHITECTURE
|
||||
-----------------
|
||||
|
||||
See html/developer.html for all code documentation.
|
||||
See <a href=html/developer.html>html/developer.html</a>
|
||||
for all code documentation.
|
||||
|
||||
Alternatively, visit http://www.gigablast.com/developer.html
|
||||
|
||||
|
11
Rdb.cpp
@ -2324,11 +2324,13 @@ bool Rdb::addRecord ( collnum_t collnum,
|
||||
SpiderRequest *sreq = (SpiderRequest *)data;
|
||||
logf(LOG_DEBUG,"spider: added doledb key "
|
||||
"for pri=%"INT32" time=%"UINT32" "
|
||||
"uh48=%"UINT64" docid=%"INT64" u=%s",
|
||||
"uh48=%"UINT64" "
|
||||
//"docid=%"INT64" "
|
||||
"u=%s",
|
||||
(int32_t)g_doledb.getPriority(&doleKey),
|
||||
(uint32_t)g_doledb.getSpiderTime(&doleKey),
|
||||
g_doledb.getUrlHash48(&doleKey),
|
||||
sreq->m_probDocId,
|
||||
//sreq->m_probDocId,
|
||||
sreq->m_url);
|
||||
}
|
||||
}
|
||||
@ -3042,7 +3044,10 @@ char getKeySizeFromRdbId ( uint8_t rdbId ) {
|
||||
}
|
||||
}
|
||||
// sanity check
|
||||
if ( s_table1[rdbId] == 0 ) { char *xx=NULL;*xx=0; }
|
||||
if ( s_table1[rdbId] == 0 ) {
|
||||
log("rdb: bad lookup rdbid of %i",(int)rdbId);
|
||||
char *xx=NULL;*xx=0;
|
||||
}
|
||||
return s_table1[rdbId];
|
||||
}
|
||||
|
||||
|
@ -815,6 +815,7 @@ int32_t RdbBase::addFile ( int32_t id , bool isNew , int32_t mergeNum , int32_t
|
||||
ff->getFilename() ,
|
||||
(int64_t)ff->getFileSize(),
|
||||
(int64_t)MAX_PART_SIZE);
|
||||
exit(0);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@ -2480,6 +2481,14 @@ bool RdbBase::verifyFileSharding ( ) {
|
||||
// not re-verify file sharding! only do at startup
|
||||
if ( g_loop.m_isDoingLoop ) return true;
|
||||
|
||||
// skip for now to speed up startup
|
||||
static int32_t s_count = 0;
|
||||
s_count++;
|
||||
if ( s_count == 50 )
|
||||
log("db: skipping shard verification for remaining files");
|
||||
if ( s_count >= 50 )
|
||||
return true;
|
||||
|
||||
g_threads.disableThreads();
|
||||
|
||||
Msg5 msg5;
|
||||
|
@ -885,6 +885,13 @@ bool RdbBuckets::addBucket (RdbBucket* newBucket, int32_t i) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// void RdbBuckets::deleteBucket ( int32_t i ) {
|
||||
// int32_t moveSize = (m_numBuckets - i)*sizeof(RdbBuckets*);
|
||||
// if(moveSize > 0)
|
||||
// memmove(&m_buckets[i+1], &m_buckets[i], moveSize);
|
||||
// m_numBuckets--;
|
||||
// }
|
||||
|
||||
bool RdbBuckets::getList ( collnum_t collnum ,
|
||||
char *startKey, char *endKey, int32_t minRecSizes ,
|
||||
RdbList *list , int32_t *numPosRecs ,
|
||||
@ -1768,6 +1775,66 @@ bool RdbBucket::deleteList(RdbList *list) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// remove keys from any non-existent collection
|
||||
void RdbBuckets::cleanBuckets ( ) {
|
||||
|
||||
// what buckets have -1 rdbid???
|
||||
if ( m_rdbId < 0 ) return;
|
||||
|
||||
// the liberation count
|
||||
int32_t count = 0;
|
||||
|
||||
/*
|
||||
char buf[50000];
|
||||
RdbList list;
|
||||
list.set ( NULL,
|
||||
0,
|
||||
buf,
|
||||
50000,
|
||||
0, // fixeddatasize
|
||||
false, // own data? should rdblist free it
|
||||
false, // usehalfkeys
|
||||
m_ks);
|
||||
*/
|
||||
|
||||
top:
|
||||
|
||||
for ( int32_t i = 0; i < m_numBuckets; i++ ) {
|
||||
RdbBucket *b = m_buckets[i];
|
||||
collnum_t collnum = b->getCollnum();
|
||||
CollectionRec *cr = g_collectiondb.m_recs[collnum];
|
||||
if ( cr ) continue;
|
||||
// count # deleted
|
||||
count += b->getNumKeys();
|
||||
// delete that coll
|
||||
delColl ( collnum );
|
||||
// restart
|
||||
goto top;
|
||||
/*
|
||||
int32_t nk = b->getNumKeys();
|
||||
for (int32_t j = 0 ; j < nk ; j++ ) {
|
||||
char *kp = b->m_keys + j*m_ks;
|
||||
// add into list. should just be a gbmemcpy()
|
||||
list.addKey ( kp , 0 , NULL );
|
||||
*/
|
||||
//deleteBucket ( i );
|
||||
}
|
||||
|
||||
// print it
|
||||
if ( count == 0 ) return;
|
||||
log(LOG_LOGIC,"db: Removed %"INT32" records from %s buckets "
|
||||
"for invalid collection numbers.",count,m_dbname);
|
||||
//log(LOG_LOGIC,"db: Records not actually removed for safety. Except "
|
||||
// "for those with negative colnums.");
|
||||
// static bool s_print = true;
|
||||
// if ( ! s_print ) return;
|
||||
// s_print = false;
|
||||
// log (LOG_LOGIC,"db: This is bad. Did you remove a collection "
|
||||
// "subdirectory? Don't do that, you should use the \"delete "
|
||||
// "collections\" interface because it also removes records from "
|
||||
// "memory, too.");
|
||||
}
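The scan-delete-restart idiom used by cleanBuckets() above, shown in isolation (the container and predicate are placeholders, not the real RdbBuckets API): deleting a collection can reshuffle the bucket array, so the loop index is thrown away and the scan restarts from zero.

#include <vector>

// sketch: remove entries matching isDead(); restart after each erase
// because erase() shifts the positions the loop index was based on
static void removeDeadEntries ( std::vector<int> &entries ,
                                bool (*isDead)(int) ) {
 top:
        for ( size_t i = 0 ; i < entries.size() ; i++ ) {
                if ( ! isDead ( entries[i] ) ) continue;
                entries.erase ( entries.begin() + i );
                goto top;
        }
}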
|
||||
|
||||
|
||||
bool RdbBuckets::delColl(collnum_t collnum) {
|
||||
|
||||
@ -1783,7 +1850,8 @@ bool RdbBuckets::delColl(collnum_t collnum) {
|
||||
minRecSizes /= 2;
|
||||
continue;
|
||||
} else {
|
||||
log("db: buckets could not delete collection: %s.",
|
||||
log("db: buckets could not delete "
|
||||
"collection: %s.",
|
||||
mstrerror(errno));
|
||||
return false;
|
||||
}
|
||||
@ -1791,6 +1859,8 @@ bool RdbBuckets::delColl(collnum_t collnum) {
|
||||
if(list.isEmpty()) break;
|
||||
deleteList(collnum, &list);
|
||||
}
|
||||
|
||||
log("buckets: deleted all keys for collnum %"INT32,(int32_t)collnum);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -168,6 +168,7 @@ class RdbBuckets {
|
||||
|
||||
int32_t getNumNegativeKeys ( );
|
||||
int32_t getNumPositiveKeys ( );
|
||||
void cleanBuckets ( );
|
||||
bool delColl ( collnum_t collnum );
|
||||
|
||||
//just for this collection
|
||||
|
24
RdbDump.cpp
@ -215,7 +215,7 @@ void RdbDump::doneDumping ( ) {
|
||||
// . map verify
|
||||
// . if continueDumping called us with no collectionrec, it got
|
||||
// deleted so RdbBase::m_map is nuked too i guess
|
||||
if ( saved != ENOCOLLREC )
|
||||
if ( saved != ENOCOLLREC && m_map )
|
||||
log("db: map # pos=%"INT64" neg=%"INT64"",
|
||||
m_map->getNumPositiveRecs(),
|
||||
m_map->getNumNegativeRecs()
|
||||
@ -230,11 +230,11 @@ void RdbDump::doneDumping ( ) {
|
||||
if ( saved == ENOCOLLREC ) return;
|
||||
|
||||
// save the map to disk
|
||||
m_map->writeMap();
|
||||
if ( m_map ) m_map->writeMap();
|
||||
#ifdef GBSANITYCHECK
|
||||
// sanity check
|
||||
log("DOING SANITY CHECK FOR MAP -- REMOVE ME");
|
||||
if ( ! m_map->verifyMap ( m_file ) ) {
|
||||
if ( m_map && ! m_map->verifyMap ( m_file ) ) {
|
||||
char *xx = NULL; *xx = 0; }
|
||||
// now check the whole file for consistency
|
||||
if ( m_ks == 18 ) { // map->m_rdbId == RDB_POSDB ) {
|
||||
@ -495,7 +495,7 @@ bool RdbDump::dumpList ( RdbList *list , int32_t niceness , bool recall ) {
|
||||
|
||||
// . SANITY CHECK
|
||||
// . ensure first key is >= last key added to the map map
|
||||
if ( m_offset > 0 ) {
|
||||
if ( m_offset > 0 && m_map ) {
|
||||
//key_t k = m_list->getCurrentKey();
|
||||
char k[MAX_KEY_BYTES];
|
||||
m_list->getCurrentKey(k);
|
||||
@ -748,6 +748,22 @@ void doneReadingForVerifyWrapper ( void *state ) {
|
||||
}
|
||||
|
||||
bool RdbDump::doneReadingForVerify ( ) {
|
||||
|
||||
// if someone reset/deleted the collection we were dumping...
|
||||
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
|
||||
// . do not do this for statsdb/catdb which always use collnum of 0
|
||||
// . RdbMerge also calls us but gives a NULL m_rdb so we can't
|
||||
// set m_isCollectionless to false
|
||||
if ( ! cr && m_doCollCheck ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
// m_file is invalid if collrec got nuked because so did
|
||||
// the Rdbbase which has the files
|
||||
log("db: lost collection while dumping to disk. making "
|
||||
"map null so we can stop.");
|
||||
m_map = NULL;
|
||||
}
|
||||
|
||||
|
||||
// see if what we wrote is the same as what we read back
|
||||
if ( m_verifyBuf && memcmp(m_verifyBuf,m_buf,m_bytesToWrite) != 0 &&
|
||||
! g_errno ) {
|
||||
|
16
Repair.cpp
@ -686,14 +686,26 @@ void Repair::initScan ( ) {
|
||||
|
||||
|
||||
// init secondary rdbs
|
||||
if ( m_rebuildTitledb )
|
||||
if ( m_rebuildTitledb ) {
|
||||
if ( ! g_titledb2.init2 ( titledbMem ) ) goto hadError;
|
||||
// clean tree in case loaded from saved file
|
||||
Rdb *r = g_titledb2.getRdb();
|
||||
if ( r ) r->m_tree.cleanTree();
|
||||
}
|
||||
|
||||
//if ( m_rebuildTfndb )
|
||||
// if ( ! g_tfndb2.init2 ( tfndbMem ) ) goto hadError;
|
||||
//if ( m_rebuildIndexdb )
|
||||
// if ( ! g_indexdb2.init2 ( indexdbMem ) ) goto hadError;
|
||||
if ( m_rebuildPosdb )
|
||||
if ( m_rebuildPosdb ) {
|
||||
if ( ! g_posdb2.init2 ( posdbMem ) ) goto hadError;
|
||||
// clean tree in case loaded from saved file
|
||||
Rdb *r = g_posdb2.getRdb();
|
||||
if ( r ) r->m_buckets.cleanBuckets();
|
||||
}
|
||||
|
||||
|
||||
|
||||
//if ( m_rebuildDatedb )
|
||||
// if ( ! g_datedb2.init2 ( datedbMem ) ) goto hadError;
|
||||
if ( m_rebuildClusterdb )
|
||||
|
@ -29,7 +29,7 @@ void SearchInput::clear ( int32_t niceness ) {
|
||||
reset();
|
||||
// set all to 0 just to avoid any inconsistencies
|
||||
int32_t size = (char *)&m_END_TEST - (char *)&m_START;
|
||||
memset ( this , 0x00 , size );
|
||||
memset ( &m_START , 0x00 , size );
|
||||
m_sbuf1.reset();
|
||||
m_sbuf2.reset();
|
||||
m_sbuf3.reset();
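The one-line fix above matters because size is measured from m_START, so the memset must start there too; clearing from "this" would wipe whatever precedes m_START and stop short of m_END_TEST. A standalone sketch of the bracketed-member reset pattern (class and member names here are illustrative, not SearchInput's):

#include <string.h>
#include <stdint.h>

// sketch of the m_START/m_END bracketing idiom
class BracketedInput {
public:
        void clear ( ) {
                // wipe only the members between the two markers
                int32_t size = (char *)&m_END - (char *)&m_START;
                memset ( &m_START , 0x00 , size );
        }
        char    m_START;
        int32_t m_niceness;
        int32_t m_numResults;
        char    m_END;
};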
|
||||
@ -185,6 +185,8 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
|
||||
// store list of collection #'s to search here. usually just one.
|
||||
m_collnumBuf.reset();
|
||||
|
||||
m_q.reset();
|
||||
|
||||
// zero out everything, set niceness to 0
|
||||
clear ( 0 ) ;
|
||||
|
||||
@ -339,10 +341,11 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
|
||||
|
||||
|
||||
if ( m_streamResults &&
|
||||
tmpFormat != FORMAT_XML &&
|
||||
tmpFormat != FORMAT_XML &&
|
||||
tmpFormat != FORMAT_CSV &&
|
||||
tmpFormat != FORMAT_JSON ) {
|
||||
log("si: streamResults only supported for "
|
||||
"json/html. disabling");
|
||||
"xml/csv/json. disabling");
|
||||
m_streamResults = false;
|
||||
}
|
||||
|
||||
|
412
Spider.cpp
@ -127,7 +127,8 @@ int32_t SpiderRequest::print ( SafeBuf *sbarg ) {
|
||||
strftime ( time , 256 , "%b %e %T %Y UTC", timeStruct );
|
||||
sb->safePrintf("addedTime=%s(%"UINT32") ",time,(uint32_t)m_addedTime );
|
||||
|
||||
sb->safePrintf("parentFirstIp=%s ",iptoa(m_parentFirstIp) );
|
||||
//sb->safePrintf("parentFirstIp=%s ",iptoa(m_parentFirstIp) );
|
||||
sb->safePrintf("pageNumInlinks=%i ",(int)m_pageNumInlinks);
|
||||
sb->safePrintf("parentHostHash32=0x%"XINT32" ",m_parentHostHash32 );
|
||||
sb->safePrintf("parentDomHash32=0x%"XINT32" ",m_parentDomHash32 );
|
||||
sb->safePrintf("parentSiteHash32=0x%"XINT32" ",m_parentSiteHash32 );
|
||||
@ -174,6 +175,7 @@ int32_t SpiderRequest::print ( SafeBuf *sbarg ) {
|
||||
if ( m_parentIsRSS ) sb->safePrintf("PARENTISRSS ");
|
||||
if ( m_parentIsPermalink ) sb->safePrintf("PARENTISPERMALINK ");
|
||||
if ( m_parentIsPingServer ) sb->safePrintf("PARENTISPINGSERVER ");
|
||||
if ( m_parentIsSiteMap ) sb->safePrintf("PARENTISSITEMAP ");
|
||||
if ( m_isMenuOutlink ) sb->safePrintf("MENUOUTLINK ");
|
||||
|
||||
if ( m_parentHasAddress ) sb->safePrintf("PARENTHASADDRESS ");
|
||||
@ -355,7 +357,7 @@ int32_t SpiderRequest::printToTable ( SafeBuf *sb , char *status ,
|
||||
|
||||
//sb->safePrintf(" <td>%s(%"UINT32")</td>\n",mstrerror(m_errCode),m_errCode);
|
||||
//sb->safePrintf(" <td>%"INT32"ms</td>\n",m_crawlDelay );
|
||||
sb->safePrintf(" <td>%s</td>\n",iptoa(m_parentFirstIp) );
|
||||
sb->safePrintf(" <td>%i</td>\n",(int)m_pageNumInlinks);
|
||||
sb->safePrintf(" <td>%"UINT64"</td>\n",getParentDocId() );
|
||||
|
||||
//sb->safePrintf(" <td>0x%"XINT32"</td>\n",m_parentHostHash32);
|
||||
@ -387,6 +389,7 @@ int32_t SpiderRequest::printToTable ( SafeBuf *sb , char *status ,
|
||||
if ( m_parentIsRSS ) sb->safePrintf("PARENTISRSS ");
|
||||
if ( m_parentIsPermalink ) sb->safePrintf("PARENTISPERMALINK ");
|
||||
if ( m_parentIsPingServer ) sb->safePrintf("PARENTISPINGSERVER ");
|
||||
if ( m_parentIsSiteMap ) sb->safePrintf("PARENTISSITEMAP ");
|
||||
if ( m_isMenuOutlink ) sb->safePrintf("MENUOUTLINK ");
|
||||
|
||||
if ( m_parentHasAddress ) sb->safePrintf("PARENTHASADDRESS ");
|
||||
@ -1209,6 +1212,7 @@ CollectionRec *SpiderColl::getCollectionRec ( ) {
|
||||
SpiderColl::SpiderColl () {
|
||||
m_overflowList = NULL;
|
||||
m_lastOverflowFirstIp = 0;
|
||||
m_lastPrinted = 0;
|
||||
m_deleteMyself = false;
|
||||
m_isLoading = false;
|
||||
m_gettingList1 = false;
|
||||
@ -1798,6 +1802,9 @@ void SpiderColl::clearLocks ( ) {
|
||||
|
||||
void SpiderColl::reset ( ) {
|
||||
|
||||
m_numSuccessReplies = 0;
|
||||
m_numFailedReplies = 0;
|
||||
|
||||
// reset these for SpiderLoop;
|
||||
m_nextDoledbKey.setMin();
|
||||
//m_didRound = false;
|
||||
@ -2309,14 +2316,16 @@ bool SpiderColl::addSpiderRequest ( SpiderRequest *sreq ,
|
||||
if ( priority >= MAX_SPIDER_PRIORITIES) {char *xx=NULL;*xx=0;}
|
||||
|
||||
// do not add to doledb if bad
|
||||
if ( priority == SPIDER_PRIORITY_FILTERED ) {
|
||||
//if ( priority == SPIDER_PRIORITY_FILTERED ) {
|
||||
if ( m_cr->m_forceDelete[ufn] ) {
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: request %s is filtered ufn=%"INT32"",
|
||||
sreq->m_url,ufn);
|
||||
return true;
|
||||
}
|
||||
|
||||
if ( priority == SPIDER_PRIORITY_BANNED ) {
|
||||
//if ( priority == SPIDER_PRIORITY_BANNED ) {
|
||||
if ( m_cr->m_forceDelete[ufn] ) {
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: request %s is banned ufn=%"INT32"",
|
||||
sreq->m_url,ufn);
|
||||
@ -2370,7 +2379,7 @@ bool SpiderColl::addSpiderRequest ( SpiderRequest *sreq ,
|
||||
"spider: %s request to waiting tree %s "
|
||||
"uh48=%"UINT64" "
|
||||
"firstIp=%s "
|
||||
"parentFirstIp=%"UINT32" "
|
||||
"pageNumInlinks=%"UINT32" "
|
||||
"parentdocid=%"UINT64" "
|
||||
"isinjecting=%"INT32" "
|
||||
"ispagereindex=%"INT32" "
|
||||
@ -2383,7 +2392,7 @@ bool SpiderColl::addSpiderRequest ( SpiderRequest *sreq ,
|
||||
sreq->m_url,
|
||||
sreq->getUrlHash48(),
|
||||
iptoa(sreq->m_firstIp),
|
||||
(uint32_t)sreq->m_parentFirstIp,
|
||||
(uint32_t)sreq->m_pageNumInlinks,//(uint32_t)sreq->m_parentFirstIp
|
||||
sreq->getParentDocId(),
|
||||
(int32_t)(bool)sreq->m_isInjecting,
|
||||
(int32_t)(bool)sreq->m_isPageReindex,
|
||||
@ -2787,6 +2796,27 @@ int32_t SpiderColl::getNextIpFromWaitingTree ( ) {
|
||||
return firstIp;
|
||||
}
|
||||
|
||||
uint64_t SpiderColl::getNextSpiderTimeFromWaitingTree ( ) {
|
||||
// if nothing to scan, bail
|
||||
if ( m_waitingTree.isEmpty() ) return 0LL;
|
||||
// the key
|
||||
key_t mink; mink.setMin();
|
||||
// set node from wait tree key. this way we can resume from a prev key
|
||||
int32_t node = m_waitingTree.getNextNode (0,(char *)&mink );
|
||||
// if empty, stop
|
||||
if ( node < 0 ) return 0LL;
|
||||
// get the key
|
||||
key_t *wk = (key_t *)m_waitingTree.getKey ( node );
|
||||
// time from that
|
||||
uint64_t spiderTimeMS = (wk->n1);
|
||||
spiderTimeMS <<= 32;
|
||||
spiderTimeMS |= ((wk->n0) >> 32);
|
||||
// stop if need to wait for this one
|
||||
return spiderTimeMS;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void gotSpiderdbListWrapper2( void *state , RdbList *list,Msg5 *msg5) {
|
||||
|
||||
SpiderColl *THIS = (SpiderColl *)state;
|
||||
@ -3535,6 +3565,11 @@ bool SpiderColl::evalIpLoop ( ) {
|
||||
m_didRead = true;
|
||||
// reset some stuff
|
||||
m_lastScanningIp = 0;
|
||||
|
||||
// reset these that need to keep track of requests for
|
||||
// the same url that might span two spiderdb lists or more
|
||||
m_lastSreqUh48 = 0LL;
|
||||
|
||||
// do a read. if it blocks it will recall this loop
|
||||
if ( ! readListFromSpiderdb () ) return false;
|
||||
}
|
||||
@ -3941,8 +3976,20 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
}
|
||||
// if its a SpiderReply set it for an upcoming requests
|
||||
if ( ! g_spiderdb.isSpiderRequest ( (key128_t *)rec ) ) {
|
||||
|
||||
// see if this is the most recent one
|
||||
SpiderReply *tmp = (SpiderReply *)rec;
|
||||
|
||||
// reset reply stats if beginning a new url
|
||||
if ( srepUh48 != tmp->getUrlHash48() ) {
|
||||
m_numSuccessReplies = 0;
|
||||
m_numFailedReplies = 0;
|
||||
}
|
||||
|
||||
// inc stats
|
||||
if ( tmp->m_errCode == 0 ) m_numSuccessReplies++;
|
||||
else m_numFailedReplies ++;
|
||||
|
||||
// if we have a more recent reply already, skip this
|
||||
if ( srep &&
|
||||
srep->getUrlHash48() == tmp->getUrlHash48() &&
|
||||
@ -3962,6 +4009,12 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
|
||||
int64_t uh48 = sreq->getUrlHash48();
|
||||
|
||||
// reset reply stats if beginning a new url
|
||||
if ( ! srep ) {
|
||||
m_numSuccessReplies = 0;
|
||||
m_numFailedReplies = 0;
|
||||
}
|
||||
|
||||
// . skip if our twin should add it to doledb
|
||||
// . waiting tree only has firstIps assigned to us so
|
||||
// this should not be necessary
|
||||
@ -4000,7 +4053,58 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
! sreq->m_fakeFirstIp )
|
||||
m_totalNewSpiderRequests++;
|
||||
|
||||
//int32_t ipdom ( int32_t ip ) { return ip & 0x00ffffff; };
|
||||
int32_t cblock = ipdom ( sreq->m_firstIp );
|
||||
|
||||
bool countIt = true;
|
||||
|
||||
// reset page inlink count on url request change
|
||||
if ( m_lastSreqUh48 != uh48 ) {
|
||||
m_pageNumInlinks = 0;
|
||||
m_lastCBlockIp = 0;
|
||||
}
|
||||
|
||||
//if ( uh48 != m_lastSreqUh48 )
|
||||
// countIt = false;
|
||||
|
||||
if ( cblock == m_lastCBlockIp )
|
||||
countIt = false;
|
||||
|
||||
// do not count manually added spider requests
|
||||
if ( (sreq->m_isAddUrl || sreq->m_isInjecting) )
|
||||
countIt = false;
|
||||
|
||||
// 20 is good enough
|
||||
if ( m_pageNumInlinks >= 20 )
|
||||
countIt = false;
|
||||
|
||||
if ( countIt ) {
|
||||
int32_t ca;
|
||||
for ( ca = 0 ; ca < m_pageNumInlinks ; ca++ )
|
||||
if ( m_cblocks[ca] == cblock ) break;
|
||||
// if found in our list, do not count it, already did
|
||||
if ( ca < m_pageNumInlinks )
|
||||
countIt = false;
|
||||
}
|
||||
|
||||
if ( countIt ) {
|
||||
m_cblocks[m_pageNumInlinks] = cblock;
|
||||
m_pageNumInlinks++;
|
||||
if ( m_pageNumInlinks > 20 ) { char *xx=NULL;*xx=0;}
|
||||
}
|
||||
|
||||
// set this now. it does increase with each request. so
|
||||
// initial requests will not see the full # of inlinks.
|
||||
sreq->m_pageNumInlinks = (uint8_t)m_pageNumInlinks;
|
||||
|
||||
// put these in the spiderequest in doledb so we can
|
||||
// show in the json spider status docs in
|
||||
// XmlDoc::getSpiderStatusDocMetaList2()
|
||||
sreq->m_reservedc1 = m_numSuccessReplies;
|
||||
sreq->m_reservedc2 = m_numFailedReplies;
|
||||
|
||||
m_lastSreqUh48 = uh48;
|
||||
m_lastCBlockIp = cblock;
|
||||
|
||||
// only add firstip if manually added and not fake
|
||||
|
||||
@ -4198,8 +4302,11 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
}
|
||||
// set the priority (might be the same as old)
|
||||
int32_t priority = m_cr->m_spiderPriorities[ufn];
|
||||
// now get rid of negative priorities since we added a
|
||||
// separate force delete checkbox in the url filters
|
||||
if ( priority < 0 ) priority = 0;
|
||||
// sanity checks
|
||||
if ( priority == -1 ) { char *xx=NULL;*xx=0; }
|
||||
//if ( priority == -1 ) { char *xx=NULL;*xx=0; }
|
||||
if ( priority >= MAX_SPIDER_PRIORITIES) {char *xx=NULL;*xx=0;}
|
||||
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
@ -4214,26 +4321,37 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
//if ( ! m_cr->m_spidersEnabled[ufn] ) continue;
|
||||
if ( m_cr->m_maxSpidersPerRule[ufn] <= 0 ) continue;
|
||||
|
||||
// skip if banned
|
||||
if ( priority == SPIDER_PRIORITY_FILTERED ) continue;
|
||||
if ( priority == SPIDER_PRIORITY_BANNED ) continue;
|
||||
// skip if banned (unless need to delete from index)
|
||||
bool skip = false;
|
||||
// if ( priority == SPIDER_PRIORITY_FILTERED ) skip = true;
|
||||
// if ( priority == SPIDER_PRIORITY_BANNED ) skip = true;
|
||||
if ( m_cr->m_forceDelete[ufn] ) skip = true;
|
||||
// but if it is currently indexed we have to delete it
|
||||
if ( skip && srep && srep->m_isIndexed ) skip = false;
|
||||
if ( skip ) continue;
|
||||
|
||||
// temp debug
|
||||
//char *xx=NULL;*xx=0;
|
||||
|
||||
if ( m_cr->m_forceDelete[ufn] )
|
||||
// force it to a delete
|
||||
sreq->m_forceDelete = true;
|
||||
|
||||
int64_t spiderTimeMS;
|
||||
spiderTimeMS = getSpiderTimeMS ( sreq,ufn,srep,nowGlobalMS );
|
||||
// how many outstanding spiders on a single IP?
|
||||
//int32_t maxSpidersPerIp = m_cr->m_spiderIpMaxSpiders[ufn];
|
||||
// sanity
|
||||
if ( (int64_t)spiderTimeMS < 0 ) {
|
||||
log("spider: got corrupt 2 spiderRequest in scan (cn=%"INT32")",
|
||||
log("spider: got corrupt 2 spiderRequest in "
|
||||
"scan (cn=%"INT32")",
|
||||
(int32_t)m_collnum);
|
||||
continue;
|
||||
}
|
||||
// more corruption detection
|
||||
if ( sreq->m_hopCount < -1 ) {
|
||||
log("spider: got corrupt 5 spiderRequest in scan (cn=%"INT32")",
|
||||
log("spider: got corrupt 5 spiderRequest in "
|
||||
"scan (cn=%"INT32")",
|
||||
(int32_t)m_collnum);
|
||||
continue;
|
||||
}
|
||||
@ -4245,8 +4363,8 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
|
||||
// if it is in future, skip it and just set m_futureTime and
|
||||
// and we will update the waiting tree
|
||||
// with an entry based on that future time if the winnerTree turns
|
||||
// out to be empty after we've completed our scan
|
||||
// with an entry based on that future time if the winnerTree
|
||||
// turns out to be empty after we've completed our scan
|
||||
if ( spiderTimeMS > nowGlobalMS ) {
|
||||
// if futuretime is zero set it to this time
|
||||
if ( ! m_minFutureTimeMS )
|
||||
@ -4422,8 +4540,17 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
wsreq->m_hopCount = sreq->m_hopCount;
|
||||
if ( wsreq->m_hopCount < sreq->m_hopCount )
|
||||
sreq->m_hopCount = wsreq->m_hopCount;
|
||||
// and the min added time as well!
|
||||
// get the oldest timestamp so
|
||||
// gbssDiscoveryTime will be accurate.
|
||||
if ( sreq->m_addedTime < wsreq->m_addedTime )
|
||||
wsreq->m_addedTime = sreq->m_addedTime;
|
||||
if ( wsreq->m_addedTime < sreq->m_addedTime )
|
||||
sreq->m_addedTime = wsreq->m_addedTime;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// are we lower priority? (or equal)
|
||||
// smaller keys are HIGHER priority.
|
||||
if(KEYCMP((char *)&wk,(char *)oldwk,
|
||||
@ -4770,9 +4897,11 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
log("spider: Checked list of %"INT32" spiderdb "
|
||||
"bytes (%"INT32" recs) "
|
||||
"for winners "
|
||||
"for firstip=%s. winnerTreeUsedNodes=%"INT32"",
|
||||
list->getListSize(),recCount,iptoa(m_scanningIp),
|
||||
m_winnerTree.getNumUsedNodes());
|
||||
"for firstip=%s. winnerTreeUsedNodes=%"INT32" #newreqs=%"
|
||||
INT64
|
||||
,list->getListSize(),recCount,iptoa(m_scanningIp),
|
||||
m_winnerTree.getNumUsedNodes(),
|
||||
m_totalNewSpiderRequests);
|
||||
// reset any errno cuz we're just a cache
|
||||
g_errno = 0;
|
||||
|
||||
@ -6126,6 +6255,9 @@ void SpiderLoop::spiderDoledUrls ( ) {
|
||||
if ( ! m_activeListValid ) {
|
||||
buildActiveList();
|
||||
m_crx = m_activeList;
|
||||
// recompute every 3 seconds, it seems kinda buggy!!
|
||||
m_recalcTime = nowGlobal + 3;
|
||||
m_recalcTimeValid = true;
|
||||
}
|
||||
|
||||
// start again at head
|
||||
@ -7896,23 +8028,25 @@ bool SpiderLoop::indexedDoc ( XmlDoc *xd ) {
|
||||
// care of g_errno now by clearing it and adding an error spider
|
||||
// reply to release the lock!!
|
||||
if ( g_errno ) {
|
||||
log("spider: ----CRITICAL CRITICAL CRITICAL----");
|
||||
log("spider: ----CRITICAL CRITICAL CRITICAL----");
|
||||
log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
// log("spider: ----CRITICAL CRITICAL CRITICAL----");
|
||||
// log("spider: ----CRITICAL CRITICAL CRITICAL----");
|
||||
// log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
// log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
// log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
log("spider: spidering %s has error: %s. uh48=%"INT64". "
|
||||
"Respidering "
|
||||
"in %"INT32" seconds. MAX_LOCK_AGE when lock expires.",
|
||||
//"Respidering "
|
||||
//"in %"INT32" seconds. MAX_LOCK_AGE when lock expires. "
|
||||
"cn=%"INT32"",
|
||||
xd->m_firstUrl.m_url,
|
||||
mstrerror(g_errno),
|
||||
xd->getFirstUrlHash48(),
|
||||
(int32_t)MAX_LOCK_AGE);
|
||||
log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
log("spider: ----CRITICAL CRITICAL CRITICAL----");
|
||||
log("spider: ----CRITICAL CRITICAL CRITICAL----");
|
||||
//(int32_t)MAX_LOCK_AGE,
|
||||
(int32_t)collnum);
|
||||
// log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
// log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
// log("spider: ------ *** LOCAL ERROR *** ------");
|
||||
// log("spider: ----CRITICAL CRITICAL CRITICAL----");
|
||||
// log("spider: ----CRITICAL CRITICAL CRITICAL----");
|
||||
// don't release the lock on it right now. just let the
|
||||
// lock expire on it after MAX_LOCK_AGE seconds. then it will
|
||||
// be retried. we need to debug gb so these things never
|
||||
@ -10782,6 +10916,10 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
HashTableX *quotaTable ,
|
||||
int32_t langIdArg ) {
|
||||
|
||||
if ( ! sreq ) {
|
||||
log("spider: sreq is NULL!");
|
||||
}
|
||||
|
||||
int32_t langId = langIdArg;
|
||||
if ( srep ) langId = srep->m_langId;
|
||||
|
||||
@ -11264,6 +11402,57 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
if ( strncmp(p,"isparentsitemap",15) == 0 ) {
|
||||
// skip for msg20
|
||||
if ( isForMsg20 ) continue;
|
||||
// if no match continue
|
||||
if ( (bool)sreq->m_parentIsSiteMap == val) continue;
|
||||
// skip
|
||||
p += 15;
|
||||
// skip to next constraint
|
||||
p = strstr(p, "&&");
|
||||
// all done?
|
||||
if ( ! p ) return i;
|
||||
p += 2;
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
// does it have an rss inlink? we want to expedite indexing
|
||||
// of such pages. i.e. that we gather from an rss feed that
|
||||
// we got from a pingserver...
|
||||
if ( strncmp(p,"isroot",6) == 0 ) {
|
||||
// skip for msg20
|
||||
//if ( isForMsg20 ) continue;
|
||||
// this is a docid only url, no actual url, so skip
|
||||
if ( sreq->m_isPageReindex ) continue;
|
||||
// a fast check
|
||||
char *u = sreq->m_url;
|
||||
// skip http
|
||||
u += 4;
|
||||
// then optional s for https
|
||||
if ( *u == 's' ) u++;
|
||||
// then ://
|
||||
u += 3;
|
||||
// scan until \0 or /
|
||||
for ( ; *u && *u !='/' ; u++ );
|
||||
// if \0 we are root
|
||||
bool isRoot = true;
|
||||
if ( *u == '/' ) {
|
||||
u++;
|
||||
if ( *u ) isRoot = false;
|
||||
}
|
||||
// if we are not root
|
||||
if ( isRoot == val ) continue;
|
||||
// skip
|
||||
p += 6;
|
||||
// skip to next constraint
|
||||
p = strstr(p, "&&");
|
||||
// all done?
|
||||
if ( ! p ) return i;
|
||||
p += 2;
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
/*
|
||||
if ( strncmp(p,"isparentindexed",16) == 0 ) {
|
||||
// skip for msg20
|
||||
@ -11506,6 +11695,21 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
// check for "isrss" aka "rss"
|
||||
if ( strncmp(p,"isrssext",8) == 0 ) {
|
||||
// if we are not rss, we do not match this rule
|
||||
if ( (bool)sreq->m_isRSSExt == val ) continue;
|
||||
// skip it
|
||||
p += 8;
|
||||
// check for &&
|
||||
p = strstr(p, "&&");
|
||||
// if nothing, else then it is a match
|
||||
if ( ! p ) return i;
|
||||
// skip the '&&' and go to next rule
|
||||
p += 2;
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
// check for permalinks. for new outlinks we *guess* if its
|
||||
// a permalink by calling isPermalink() function.
|
||||
if (!strncmp(p,"ispermalink",11) ) {
|
||||
@ -11602,10 +11806,9 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
}
|
||||
// iswww, means url is like www.xyz.com/...
|
||||
if ( strncmp(p,"iswww", 5) == 0 ) {
|
||||
// now this is a bit
|
||||
if ( (bool)sreq->m_isWWWSubdomain == (bool)val )
|
||||
continue;
|
||||
/*
|
||||
// now this is a bit - doesn't seem to be working yet
|
||||
//if ( (bool)sreq->m_isWWWSubdomain == (bool)val )
|
||||
// continue;
|
||||
// skip "iswww"
|
||||
p += 5;
|
||||
// skip over http:// or https://
|
||||
@ -11619,7 +11822,6 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
u[2] == 'w' ) isWWW = 1;
|
||||
// skip if no match
|
||||
if ( isWWW == val ) continue;
|
||||
*/
|
||||
// TODO: fix www.knightstown.skepter.com
|
||||
// maybe just have a bit in the spider request
|
||||
// another rule?
|
||||
@ -12141,12 +12343,37 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
if ( *p == 'n' && strncmp(p,"numinlinks",10) == 0 ) {
|
||||
// skip for msg20
|
||||
if ( isForMsg20 ) continue;
|
||||
// these are -1 if they are NOT valid
|
||||
int32_t a = sreq->m_pageNumInlinks;
|
||||
// make it point to the priority
|
||||
int32_t b = atoi(s);
|
||||
// compare
|
||||
if ( sign == SIGN_EQ && a != b ) continue;
|
||||
if ( sign == SIGN_NE && a == b ) continue;
|
||||
if ( sign == SIGN_GT && a <= b ) continue;
|
||||
if ( sign == SIGN_LT && a >= b ) continue;
|
||||
if ( sign == SIGN_GE && a < b ) continue;
|
||||
if ( sign == SIGN_LE && a > b ) continue;
|
||||
// skip fast
|
||||
p += 10;
|
||||
p = strstr(s, "&&");
|
||||
//if nothing, else then it is a match
|
||||
if ( ! p ) return i;
|
||||
//skip the '&&' and go to next rule
|
||||
p += 2;
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
// siteNumInlinks >= 300 [&&]
|
||||
if ( *p=='s' && strncmp(p, "sitenuminlinks", 14) == 0){
|
||||
// these are -1 if they are NOT valid
|
||||
int32_t a1 = sreq->m_siteNumInlinks;
|
||||
// only assign if valid
|
||||
int32_t a2 = -1; if ( srep ) a2 = srep->m_siteNumInlinks;
|
||||
int32_t a2 = -1;
|
||||
if ( srep ) a2 = srep->m_siteNumInlinks;
|
||||
// assume a1 is the best
|
||||
int32_t a ;
|
||||
// assign to the first valid one
|
||||
@ -12720,18 +12947,21 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
|
||||
}
|
||||
|
||||
// try to kinda grab the min hop count as well
|
||||
if ( sreq->m_hopCountValid && oldReq->m_hopCountValid ) {
|
||||
if ( oldReq->m_hopCount < sreq->m_hopCount )
|
||||
sreq->m_hopCount = oldReq->m_hopCount;
|
||||
else
|
||||
oldReq->m_hopCount = sreq->m_hopCount;
|
||||
}
|
||||
// do not alter spiderdb!
|
||||
// if ( sreq->m_hopCountValid && oldReq->m_hopCountValid ) {
|
||||
// if ( oldReq->m_hopCount < sreq->m_hopCount )
|
||||
// sreq->m_hopCount = oldReq->m_hopCount;
|
||||
// else
|
||||
// oldReq->m_hopCount = sreq->m_hopCount;
|
||||
// }
|
||||
|
||||
// if he's essentially different input parms but for the
|
||||
// same url, we want to keep him because he might map the
|
||||
// url to a different url priority!
|
||||
if ( oldReq->m_siteHash32 != sreq->m_siteHash32 ||
|
||||
oldReq->m_isNewOutlink != sreq->m_isNewOutlink ||
|
||||
// use hopcount now too!
|
||||
oldReq->m_hopCount != sreq->m_hopCount ||
|
||||
// makes a difference as far a m_minPubDate goes, because
|
||||
// we want to make sure not to delete that request that
|
||||
// has m_parentPrevSpiderTime
|
||||
@ -12748,7 +12978,8 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
|
||||
goto addIt;
|
||||
// . if the same check who has the most recent added time
|
||||
// . if we are not the most recent, just do not add us
|
||||
if ( sreq->m_addedTime <= oldReq->m_addedTime ) continue;
|
||||
// . no, now i want the oldest so we can do gbssDiscoveryTime
|
||||
if ( sreq->m_addedTime >= oldReq->m_addedTime ) continue;
|
||||
// otherwise, erase over him
|
||||
dst = restorePoint;
|
||||
lastKey = prevLastKey;
|
||||
@ -13342,6 +13573,8 @@ void handleRequestc1 ( UdpSlot *slot , int32_t niceness ) {
|
||||
|
||||
uint32_t now = (uint32_t)getTimeGlobalNoCore();
|
||||
|
||||
uint64_t nowMS = gettimeofdayInMillisecondsGlobalNoCore();
|
||||
|
||||
//SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
|
||||
|
||||
for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
|
||||
@ -13395,6 +13628,44 @@ void handleRequestc1 ( UdpSlot *slot , int32_t niceness ) {
|
||||
//g_conf.m_spideringEnabled &&
|
||||
ci->m_lastSpiderAttempt - ci->m_lastSpiderCouldLaunch >
|
||||
spiderDoneTimer ) {
|
||||
|
||||
// break it here for our collnum to see if
|
||||
// doledb was just lagging or not.
|
||||
bool printIt = true;
|
||||
if ( now < sc->m_lastPrinted ) printIt = false;
|
||||
if ( printIt ) sc->m_lastPrinted = now + 5;
|
||||
|
||||
// doledb must be empty
|
||||
if ( ! sc->m_doleIpTable.isEmpty() ) {
|
||||
if ( printIt )
|
||||
log("spider: not ending crawl because "
|
||||
"doledb not empty for coll=%s",cr->m_coll);
|
||||
goto doNotEnd;
|
||||
}
|
||||
|
||||
uint64_t nextTimeMS ;
|
||||
nextTimeMS = sc->getNextSpiderTimeFromWaitingTree ( );
|
||||
|
||||
// and no ips awaiting scans to get into doledb
|
||||
// except for ips needing scans 60+ seconds from now
|
||||
if ( nextTimeMS && nextTimeMS < nowMS + 60000 ) {
|
||||
if ( printIt )
|
||||
log("spider: not ending crawl because "
|
||||
"waiting tree key is ready for scan "
|
||||
"%"INT64" ms from now for coll=%s",
|
||||
nextTimeMS - nowMS,cr->m_coll );
|
||||
goto doNotEnd;
|
||||
}
|
||||
|
||||
// maybe wait for waiting tree population to finish
|
||||
if ( sc->m_waitingTreeNeedsRebuild ) {
|
||||
if ( printIt )
|
||||
log("spider: not ending crawl because "
|
||||
"waiting tree is building for coll=%s",
|
||||
cr->m_coll );
|
||||
goto doNotEnd;
|
||||
}
|
||||
|
||||
// this is the MOST IMPORTANT variable so note it
|
||||
log(LOG_INFO,
|
||||
"spider: coll %s has no more urls to spider",
|
||||
@ -13407,6 +13678,7 @@ void handleRequestc1 ( UdpSlot *slot , int32_t niceness ) {
|
||||
cr->m_needsSave = true;
|
||||
}
|
||||
|
||||
doNotEnd:
|
||||
|
||||
int32_t hostId = slot->m_host->m_hostId;
|
||||
|
||||
@ -13454,39 +13726,64 @@ void handleRequestc1 ( UdpSlot *slot , int32_t niceness ) {
|
||||
|
||||
bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
|
||||
|
||||
if ( ! g_conf.m_spideringEnabled && ! cx->m_isCustomCrawl )
|
||||
if ( ! g_conf.m_spideringEnabled && ! cx->m_isCustomCrawl ) {
|
||||
*status = SP_ADMIN_PAUSED;
|
||||
return msg->safePrintf("Spidering disabled in "
|
||||
"master controls. You can turn it "
|
||||
"back on there.");
|
||||
}
|
||||
|
||||
if ( g_conf.m_readOnlyMode )
|
||||
if ( g_conf.m_readOnlyMode ) {
|
||||
*status = SP_ADMIN_PAUSED;
|
||||
return msg->safePrintf("In read-only mode. Spidering off.");
|
||||
}
|
||||
|
||||
if ( g_dailyMerge.m_mergeMode )
|
||||
if ( g_dailyMerge.m_mergeMode ) {
|
||||
*status = SP_ADMIN_PAUSED;
|
||||
return msg->safePrintf("Daily merge engaged, spidering "
|
||||
"paused.");
|
||||
}
|
||||
|
||||
if ( g_udpServer.getNumUsedSlots() >= 1300 )
|
||||
if ( g_udpServer.getNumUsedSlots() >= 1300 ) {
|
||||
*status = SP_ADMIN_PAUSED;
|
||||
return msg->safePrintf("Too many UDP slots in use, "
|
||||
"spidering paused.");
|
||||
}
|
||||
|
||||
if ( g_repairMode )
|
||||
if ( g_repairMode ) {
|
||||
*status = SP_ADMIN_PAUSED;
|
||||
return msg->safePrintf("In repair mode, spidering paused.");
|
||||
}
|
||||
|
||||
// do not spider until collections/parms in sync with host #0
|
||||
if ( ! g_parms.m_inSyncWithHost0 )
|
||||
if ( ! g_parms.m_inSyncWithHost0 ) {
|
||||
*status = SP_ADMIN_PAUSED;
|
||||
return msg->safePrintf("Parms not in sync with host #0, "
|
||||
"spidering paused");
|
||||
}
|
||||
|
||||
// don't spider if not all hosts are up, or they do not all
|
||||
// have the same hosts.conf.
|
||||
if ( g_pingServer.m_hostsConfInDisagreement )
|
||||
if ( g_pingServer.m_hostsConfInDisagreement ) {
|
||||
*status = SP_ADMIN_PAUSED;
|
||||
return msg->safePrintf("Hosts.conf discrepancy, "
|
||||
"spidering paused.");
|
||||
|
||||
}
|
||||
|
||||
uint32_t now = (uint32_t)getTimeGlobal();
|
||||
|
||||
// try to fix crawlbot nightly test complaining about job status
|
||||
// for TestRepeatCrawlWithMaxToCrawl
|
||||
if ( (cx->m_spiderStatus == SP_MAXTOCRAWL ||
|
||||
cx->m_spiderStatus == SP_MAXTOPROCESS ) &&
|
||||
cx->m_collectiveRespiderFrequency > 0.0 &&
|
||||
now < cx->m_spiderRoundStartTime &&
|
||||
cx->m_spiderRoundNum >= cx->m_maxCrawlRounds ) {
|
||||
*status = SP_MAXROUNDS;
|
||||
return msg->safePrintf ( "Job has reached maxRounds "
|
||||
"limit." );
|
||||
}
|
||||
|
||||
// . 0 means not to RE-crawl
|
||||
// . indicate if we are WAITING for next round...
|
||||
if ( cx->m_spiderStatus == SP_MAXTOCRAWL &&
|
||||
@ -13587,6 +13884,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
|
||||
if ( ! cx->m_isCustomCrawl &&
|
||||
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
|
||||
//*status = SP_COMPLETED;
|
||||
*status = SP_INPROGRESS;
|
||||
return msg->safePrintf ( "Nothing currently "
|
||||
"available to spider. "
|
||||
"Change your url filters, try "
|
||||
@ -13783,7 +14081,7 @@ bool SpiderRequest::setFromAddUrl ( char *url ) {
|
||||
m_isAddUrl = 1;
|
||||
m_addedTime = (uint32_t)getTimeGlobal();//now;
|
||||
m_fakeFirstIp = 1;
|
||||
m_probDocId = probDocId;
|
||||
//m_probDocId = probDocId;
|
||||
m_firstIp = firstIp;
|
||||
m_hopCount = 0;
|
||||
|
||||
@ -13893,10 +14191,12 @@ void SpiderLoop::buildActiveList ( ) {
|
||||
//
|
||||
if ( nowGlobal < cr->m_spiderRoundStartTime ) {
|
||||
active = false;
|
||||
if ( cr->m_spiderRoundStartTime < m_recalcTime ) {
|
||||
m_recalcTime = cr->m_spiderRoundStartTime;
|
||||
m_recalcTimeValid = true;
|
||||
}
|
||||
// no need to do this now since we recalc every
|
||||
// 3 seconds anyway...
|
||||
// if ( cr->m_spiderRoundStartTime < m_recalcTime ) {
|
||||
// m_recalcTime = cr->m_spiderRoundStartTime;
|
||||
// m_recalcTimeValid = true;
|
||||
// }
|
||||
}
|
||||
|
||||
if ( ! active ) continue;
|
||||
|
30
Spider.h
@ -509,15 +509,24 @@ class SpiderRequest {
|
||||
// spidered (when m_url was not an outlink on its parent page)
|
||||
uint32_t m_parentPrevSpiderTime; // time_t
|
||||
|
||||
//int32_t m_parentFirstIp;
|
||||
// # of spider requests from different c-blocks. capped at 255.
|
||||
// taken from the # of SpiderRequests.
|
||||
uint8_t m_pageNumInlinks;
|
||||
uint8_t m_reservedb2;
|
||||
uint8_t m_reservedb3;
|
||||
uint8_t m_reservedb4;
|
||||
|
||||
// info on the page we were harvested from
|
||||
int32_t m_parentFirstIp;
|
||||
int32_t m_parentHostHash32;
|
||||
int32_t m_parentDomHash32;
|
||||
int32_t m_parentSiteHash32;
|
||||
|
||||
// the PROBABLE DOCID. if there is a collision with another docid
|
||||
// then we increment the last 8 bits or so. see Msg22.cpp.
|
||||
int64_t m_probDocId;
|
||||
//int64_t m_probDocId;
|
||||
int32_t m_reservedc1;
|
||||
int32_t m_reservedc2;
|
||||
|
||||
//int32_t m_parentPubDate;
|
||||
|
||||
@ -583,11 +592,12 @@ class SpiderRequest {
|
||||
// or from PageParser.cpp directly
|
||||
int32_t m_isPageParser:1;
|
||||
// should we use the test-spider-dir for caching test coll requests?
|
||||
int32_t m_useTestSpiderDir:1;
|
||||
//int32_t m_useTestSpiderDir:1;
|
||||
int32_t m_parentIsSiteMap:1;
|
||||
// . is the url a docid (not an actual url)
|
||||
// . could be a "query reindex"
|
||||
int32_t m_urlIsDocId:1;
|
||||
// does m_url end in .rss? or a related rss file extension?
|
||||
// does m_url end in .rss .xml .atom? or a related rss file extension?
|
||||
int32_t m_isRSSExt:1;
|
||||
// is url in a format known to be a permalink format?
|
||||
int32_t m_isUrlPermalinkFormat:1;
|
||||
@ -921,7 +931,7 @@ class SpiderReply {
|
||||
// was the request an injection request
|
||||
int32_t m_fromInjectionRequest :1;
|
||||
// did we TRY to send it to the diffbot backend filter? might be err?
|
||||
int32_t m_sentToDiffbot :1;
|
||||
int32_t m_sentToDiffbotThisTime :1;
|
||||
int32_t m_hadDiffbotError :1;
|
||||
// . was it in the index when we started?
|
||||
// . we use this with m_isIndexed above to adjust quota counts for
|
||||
@ -1145,6 +1155,9 @@ class SpiderColl {
|
||||
int32_t m_tailHopCount;
|
||||
int64_t m_minFutureTimeMS;
|
||||
|
||||
int32_t m_numSuccessReplies;
|
||||
int32_t m_numFailedReplies;
|
||||
|
||||
// . do not re-send CrawlInfoLocal for a coll if not update
|
||||
// . we store the flags in here as true if we should send our
|
||||
// CrawlInfoLocal for this coll to this hostId
|
||||
@ -1212,6 +1225,7 @@ class SpiderColl {
|
||||
int32_t m_numAdded;
|
||||
int64_t m_numBytesScanned;
|
||||
int64_t m_lastPrintCount;
|
||||
int64_t m_lastPrinted;
|
||||
|
||||
// used by SpiderLoop.cpp
|
||||
int32_t m_spidersOut;
|
||||
@ -1253,6 +1267,7 @@ class SpiderColl {
|
||||
bool addToWaitingTree ( uint64_t spiderTime , int32_t firstIp ,
|
||||
bool callForScan );
|
||||
int32_t getNextIpFromWaitingTree ( );
|
||||
uint64_t getNextSpiderTimeFromWaitingTree ( ) ;
|
||||
void populateDoledbFromWaitingTree ( );
|
||||
|
||||
//bool scanSpiderdb ( bool needList );
|
||||
@ -1305,6 +1320,11 @@ class SpiderColl {
|
||||
int32_t *m_overflowList;
|
||||
int64_t m_totalNewSpiderRequests;
|
||||
int64_t m_lastSreqUh48;
|
||||
|
||||
int32_t m_cblocks[20];
|
||||
int32_t m_pageNumInlinks;
|
||||
int32_t m_lastCBlockIp;
|
||||
|
||||
int32_t m_lastOverflowFirstIp;
|
||||
|
||||
private:
|
||||
|
30
Tagdb.cpp
@ -4873,7 +4873,19 @@ bool isTagTypeUnique ( int32_t tt ) {
|
||||
// make sure table is valid
|
||||
if ( ! s_initialized ) g_tagdb.setHashTable();
|
||||
// look up in hash table
|
||||
TagDesc *td = *(TagDesc **)s_ht.getValue ( &tt );
|
||||
TagDesc **tdp = (TagDesc **)s_ht.getValue ( &tt );
|
||||
if ( ! tdp ) {
|
||||
log("tagdb: tag desc is NULL for tag type %"INT32" assuming "
|
||||
"not indexable",tt);
|
||||
return false;
|
||||
}
|
||||
// do not core for now
|
||||
TagDesc *td = *tdp;
|
||||
if ( ! td ) {
|
||||
log("tagdb: got unknown tag type %"INT32" assuming "
|
||||
"unique",tt);
|
||||
return true;
|
||||
}
|
||||
// if none, that is crazy
|
||||
if ( ! td ) { char *xx=NULL;*xx=0; }
|
||||
// return
|
||||
@ -4887,8 +4899,20 @@ bool isTagTypeIndexable ( int32_t tt ) {
|
||||
// make sure table is valid
|
||||
if ( ! s_initialized ) g_tagdb.setHashTable();
|
||||
// look up in hash table
|
||||
TagDesc *td = *(TagDesc **)s_ht.getValue ( &tt );
|
||||
// if none, that is crazy
|
||||
TagDesc **tdp = (TagDesc **)s_ht.getValue ( &tt );
|
||||
// do not core for now
|
||||
if ( ! tdp ) {
|
||||
log("tagdb: got unknown tag type %"INT32" assuming "
|
||||
"not indexable",tt);
|
||||
return false;
|
||||
}
|
||||
TagDesc *td = *tdp;
|
||||
if ( ! td ) {
|
||||
log("tagdb: tag desc is NULL for tag type %"INT32" assuming "
|
||||
"not indexable",tt);
|
||||
return false;
|
||||
}
|
||||
// if none, that is crazy MDW coring here:
|
||||
if ( ! td ) { char *xx=NULL;*xx=0; }
|
||||
// return false if we should not index it
|
||||
if ( td->m_flags & TDF_NOINDEX ) return false;
|
||||
|
5
Test.cpp
@ -932,11 +932,12 @@ bool Test::injectLoop ( ) {
|
||||
m_sreq.m_domHash32 = fakeIp;
|
||||
m_sreq.m_hostHash32 = fakeIp;
|
||||
m_sreq.m_siteHash32 = fakeIp;
|
||||
m_sreq.m_probDocId = g_titledb.getProbableDocId( m_sreq.m_url );
|
||||
//m_sreq.m_probDocId = g_titledb.getProbableDocId( m_sreq.m_url );
|
||||
// this crap is fake
|
||||
m_sreq.m_isInjecting = 1;
|
||||
// use test-spider subdir for storing pages and spider times?
|
||||
if ( g_conf.m_testSpiderEnabled ) m_sreq.m_useTestSpiderDir = 1;
|
||||
// MDW: this was replaced by m_isParentSiteMap bit.
|
||||
//if ( g_conf.m_testSpiderEnabled ) m_sreq.m_useTestSpiderDir = 1;
|
||||
// use this later
|
||||
m_sreq.m_hasContent = 0;
|
||||
// injected requests use this as the spider time i guess
|
||||
|
210
UdpServer.cpp
@ -251,7 +251,7 @@ bool UdpServer::init ( uint16_t port, UdpProtocol *proto, int32_t niceness,
|
||||
m_head2 = NULL;
|
||||
m_tail2 = NULL;
|
||||
// linked list of callback candidates
|
||||
//m_head3 = NULL;
|
||||
m_head3 = NULL;
|
||||
// . set up hash table that converts key (ip/port/transId) to a slot
|
||||
// . m_numBuckets must be power of 2
|
||||
m_numBuckets = getHighestLitBitValue ( m_maxSlots * 6 );
|
||||
@ -267,6 +267,7 @@ bool UdpServer::init ( uint16_t port, UdpProtocol *proto, int32_t niceness,
|
||||
log(LOG_DEBUG,"udp: Allocated %"INT32" bytes for table.",m_bufSize);
|
||||
|
||||
m_numUsedSlots = 0;
|
||||
m_numUsedSlotsIncoming = 0;
|
||||
// clear this
|
||||
m_isShuttingDown = false;
|
||||
// and this
|
||||
@ -555,7 +556,7 @@ bool UdpServer::sendRequest ( char *msg ,
|
||||
|
||||
// . create a new slot to control the transmission of this request
|
||||
// . should set g_errno on failure
|
||||
UdpSlot *slot = getEmptyUdpSlot_ass ( key );
|
||||
UdpSlot *slot = getEmptyUdpSlot_ass ( key , false );
|
||||
if ( ! slot ) {
|
||||
if ( flipped ) interruptsOn();
|
||||
return log("udp: All %"INT32" slots are in use.",m_maxSlots);
|
||||
@ -601,6 +602,8 @@ bool UdpServer::sendRequest ( char *msg ,
|
||||
return log("udp: Failed to initialize udp socket for "
|
||||
"sending req: %s",mstrerror(g_errno));
|
||||
}
|
||||
|
||||
if ( slot->m_next3 || slot->m_prev3 ) { char *xx=NULL;*xx=0; }
|
||||
// set this
|
||||
slot->m_maxResends = maxResends;
|
||||
// keep sending dgrams until we have no more or hit ACK_WINDOW limit
|
||||
@ -675,6 +678,9 @@ void UdpServer::sendReply_ass ( char *msg ,
|
||||
log(LOG_LOGIC,"udp: sendReply_ass: Callback is non-NULL.");
|
||||
return;
|
||||
}
|
||||
if ( ! msg && msgSize > 0 )
|
||||
log("udp: calling sendreply with null send buffer and "
|
||||
"positive size! will probably core.");
|
||||
// record some statistics on how long these msg handlers are taking
|
||||
int64_t now = gettimeofdayInMillisecondsLocal();
|
||||
// m_queuedTime should have been set before m_handlers[] was called
|
||||
@ -1069,6 +1075,8 @@ void UdpServer::process_ass ( int64_t now , int32_t maxNiceness) {
|
||||
// bail if no main sock
|
||||
if ( m_sock < 0 ) return ;
|
||||
|
||||
//log("process_ass");
|
||||
|
||||
// if we call this while in the sighandler it crashes since
|
||||
// gettimeofdayInMillisecondsLocal() is not async safe
|
||||
int64_t startTimer;
|
||||
@ -1099,7 +1107,16 @@ void UdpServer::process_ass ( int64_t now , int32_t maxNiceness) {
|
||||
// if no slot was set, it was a slotless read so keep looping
|
||||
if ( ! slot ) { g_errno = 0; goto readAgain; }
|
||||
// if there was a read error let makeCallback() know about it
|
||||
if ( status == -1 ) slot->m_errno = g_errno;
|
||||
if ( status == -1 ) {
|
||||
slot->m_errno = g_errno;
|
||||
// prepare to call the callback by adding it to this
|
||||
// special linked list
|
||||
if ( g_errno )
|
||||
addToCallbackLinkedList ( slot );
|
||||
// sanity
|
||||
if ( ! g_errno )
|
||||
log("udp: missing g_errno from read error");
|
||||
}
|
||||
// we read something
|
||||
something = true;
|
||||
// try sending an ACK on the slot we read something from
|
||||
@ -1108,6 +1125,7 @@ void UdpServer::process_ass ( int64_t now , int32_t maxNiceness) {
|
||||
// if we read something, try for more
|
||||
if ( something ) {
|
||||
//if ( slot->m_errno || slot->isTransactionComplete())
|
||||
//log("got something");
|
||||
needCallback = true;
|
||||
goto loop;
|
||||
}
|
||||
@ -1131,6 +1149,8 @@ void UdpServer::process_ass ( int64_t now , int32_t maxNiceness) {
|
||||
if ( makeCallbacks_ass ( /*niceness level*/ 0 ) ) {
|
||||
// set flag to call low priority callbacks
|
||||
m_needBottom = true;
|
||||
// note it
|
||||
//log("made callback");
|
||||
// but not now, only when we don't call any high priorities
|
||||
goto bigloop;
|
||||
}
|
||||
@ -1140,17 +1160,19 @@ void UdpServer::process_ass ( int64_t now , int32_t maxNiceness) {
|
||||
// gettimeofdayInMillisecondsLocal() is not async safe
|
||||
int64_t elapsed = 0;
|
||||
if ( ! g_inSigHandler )
|
||||
elapsed = gettimeofdayInMillisecondsLocal() - startTimer;
|
||||
elapsed = gettimeofdayInMillisecondsLocal() - startTimer;
|
||||
if(elapsed < 10) {
|
||||
// we did not call any, so resort to nice callbacks
|
||||
makeCallbacks_ass ( /*niceness level*/ 1 ) ;
|
||||
// . only go to bigloop if we called a callback
|
||||
if ( makeCallbacks_ass ( /*niceness level*/ 1 ) )
|
||||
goto bigloop;
|
||||
// no longer need to be called
|
||||
// if we did anything loop back up
|
||||
// . but only if we haven't been looping forever,
|
||||
// . if so we need to relinquish control to loop.
|
||||
// log(LOG_WARN, "udp: give back control. after %"INT64"",
|
||||
// elapsed);
|
||||
goto bigloop;
|
||||
//goto bigloop;
|
||||
}
|
||||
else {
|
||||
m_needBottom = true;
|
||||
@ -1239,12 +1261,19 @@ int32_t UdpServer::readSock_ass ( UdpSlot **slotPtr , int64_t now ) {
|
||||
log("loop: readsock_ass: peekSize=%i m_sock/fd=%i",
|
||||
peekSize,m_sock);
|
||||
|
||||
//static int s_ss = 0;
|
||||
|
||||
// cancel silly g_errnos and return 0 since we blocked
|
||||
if ( peekSize < 0 ) {
|
||||
g_errno = errno;
|
||||
if ( flipped ) interruptsOn();
|
||||
if ( g_errno == EAGAIN || g_errno == 0 ) { g_errno = 0; return 0; }
|
||||
if ( g_errno == EILSEQ ) { g_errno = 0; return 0; }
|
||||
if ( g_errno == EAGAIN || g_errno == 0 ) {
|
||||
// if ( s_ss++ == 100 ) {
|
||||
// log("foo");char *xx=NULL;*xx=0; }
|
||||
// log("udp: EAGAIN");
|
||||
g_errno = 0; return 0; }
|
||||
if ( g_errno == EILSEQ ) {
|
||||
g_errno = 0; return 0; }
|
||||
// Interrupted system call (4) (from valgrind)
|
||||
#ifdef _VALGRIND_
|
||||
if ( g_errno == 4 ) { g_errno = 0; return 0;}
|
||||
@ -1592,7 +1621,7 @@ int32_t UdpServer::readSock_ass ( UdpSlot **slotPtr , int64_t now ) {
|
||||
|
||||
if ( getSlot )
|
||||
// get a new UdpSlot
|
||||
slot = getEmptyUdpSlot_ass ( key );
|
||||
slot = getEmptyUdpSlot_ass ( key , true );
|
||||
// return -1 on failure
|
||||
if ( ! slot ) {
|
||||
// return -1
|
||||
@ -1693,8 +1722,25 @@ int32_t UdpServer::readSock_ass ( UdpSlot **slotPtr , int64_t now ) {
|
||||
// we we could not allocate a read buffer to hold the request/reply
|
||||
// just send a cancel ack so the send will call its callback with
|
||||
// g_errno set
|
||||
// MDW: it won't make it into the m_head3 callback linked list with
|
||||
// this logic.... maybe it just times out or resends later...
|
||||
if ( ! status && g_errno == ENOMEM ) goto cancelTrans;
|
||||
|
||||
// if it is now a complete REPLY, callback will need to be called
|
||||
// so insert into the callback linked list, m_head3.
|
||||
// we have to put slots with NULL callbacks in here since they
|
||||
// are incoming requests to handle.
|
||||
if ( //slot->m_callback &&
|
||||
// if we got an error reading the reply (or sending req?) then
|
||||
// consider it completed too?
|
||||
// ( slot->isTransactionComplete() || slot->m_errno ) &&
|
||||
( slot->isDoneReading() || slot->m_errno ) ) {
|
||||
// prepare to call the callback by adding it to this
|
||||
// special linked list
|
||||
addToCallbackLinkedList ( slot );
|
||||
}
|
||||
|
||||
|
||||
// if(g_conf.m_sequentialProfiling) {
|
||||
// if(slot->isDoneReading())
|
||||
// log(LOG_TIMING, "admin: read last dgram: "
|
||||
@ -1705,6 +1751,7 @@ int32_t UdpServer::readSock_ass ( UdpSlot **slotPtr , int64_t now ) {
|
||||
// discard if we should
|
||||
if ( discard ) {
|
||||
readSize=recvfrom(m_sock,tmpbuf,DGRAM_SIZE_CEILING,0,NULL,NULL);
|
||||
//log("udp: recvfrom3 = %i",(int)readSize);
|
||||
}
|
||||
// . update stats, just put them all in g_udpServer
|
||||
// . do not count acks
|
||||
@ -1886,13 +1933,18 @@ void UdpServer::resume ( ) {
|
||||
// . the problem is when we call this with niceness 1 and we convert
|
||||
// a niceness 1 callback to 0...
|
||||
bool UdpServer::makeCallbacks_ass ( int32_t niceness ) {
|
||||
if ( g_conf.m_logDebugUdp )
|
||||
|
||||
// if nothing to call, forget it
|
||||
if ( ! m_head3 ) return false;
|
||||
|
||||
//if ( g_conf.m_logDebugUdp )
|
||||
log(LOG_DEBUG,"udp: makeCallbacks_ass: start. nice=%"INT32" "
|
||||
"inquickpoll=%"INT32"",
|
||||
niceness,(int32_t)g_loop.m_inQuickPoll);
|
||||
// bail if suspended
|
||||
if ( m_isSuspended ) return false;
|
||||
|
||||
|
||||
// . if there are active high priority threads, do not
|
||||
// call low priority callbacks. in that case
|
||||
// . This seems to block things up to much?
|
||||
@ -1938,9 +1990,13 @@ bool UdpServer::makeCallbacks_ass ( int32_t niceness ) {
|
||||
|
||||
nextPass:
|
||||
|
||||
UdpSlot *nextSlot = NULL;
|
||||
|
||||
// only scan those slots that are ready
|
||||
//for ( UdpSlot *slot = m_head3 ; slot ; slot = slot->m_next3 )
|
||||
for ( UdpSlot *slot = m_head2 ; slot ; slot = slot->m_next2 ) {
|
||||
//for ( UdpSlot *slot = m_head2 ; slot ; slot = slot->m_next2 ) {
|
||||
for ( UdpSlot *slot = m_head3 ; slot ; slot = nextSlot ) {
|
||||
// because makeCallback_ass() can delete the slot, use this
|
||||
nextSlot = slot->m_next3;
|
||||
// call quick handlers in pass 0, they do not take any time
|
||||
// and if they do not get called right away they can cause this host
|
||||
// to bottleneck many hosts
|
||||
@ -2097,12 +2153,15 @@ bool UdpServer::makeCallbacks_ass ( int32_t niceness ) {
|
||||
//UdpSlot *next3 = slot->m_next2;
|
||||
|
||||
// . crap, this can alter the linked list we are scanning
|
||||
// if it deletes the slot!
|
||||
// if it deletes the slot! yes, but now we use "nextSlot"
|
||||
// . return false on error and sets g_errno, true otherwise
|
||||
// . return true if we called one
|
||||
// . skip to next slot if did not call callback/handler
|
||||
if ( ! makeCallback_ass ( slot ) ) continue;
|
||||
|
||||
// remove it from the callback list to avoid re-call
|
||||
removeFromCallbackLinkedList ( slot );
|
||||
|
||||
int64_t took = 0;
|
||||
if ( logIt )
|
||||
took = gettimeofdayInMillisecondsLocal()-start2;
|
||||
@ -2245,9 +2304,18 @@ bool UdpServer::makeCallback_ass ( UdpSlot *slot ) {
|
||||
start = gettimeofdayInMillisecondsLocal();
|
||||
// callback is non-NULL if we initiated the transaction
|
||||
if ( slot->m_callback ) {
|
||||
|
||||
// assume the slot's error when making callback
|
||||
// like EUDPTIMEDOUT
|
||||
if ( ! g_errno ) g_errno = slot->m_errno;
|
||||
|
||||
// . if transaction has not fully completed, bail
|
||||
// . unless there was an error
|
||||
if ( ! g_errno && ! slot->isTransactionComplete())return false;
|
||||
// . g_errno could be ECANCELLED
|
||||
if ( ! g_errno && ! slot->isTransactionComplete()) {
|
||||
log("udp: why calling callback when not ready???");
|
||||
return false;
|
||||
}
|
||||
/*
|
||||
#ifdef _UDPDEBUG_
|
||||
// if we had the token, give it up so others can send with it
|
||||
@ -2276,7 +2344,8 @@ bool UdpServer::makeCallback_ass ( UdpSlot *slot ) {
|
||||
"niceness=%"INT32" "
|
||||
"callback=%08"PTRFMT" "
|
||||
"took %"INT64" ms (%"INT32" Mbps).",
|
||||
slot->m_transId,msgType,mstrerror(g_errno),
|
||||
slot->m_transId,msgType,
|
||||
mstrerror(g_errno),
|
||||
slot->m_niceness,
|
||||
(PTRTYPE)slot->m_callback ,
|
||||
took , Mbps );
|
||||
@ -2389,7 +2458,14 @@ bool UdpServer::makeCallback_ass ( UdpSlot *slot ) {
|
||||
if ( slot->m_calledHandler ) {
|
||||
// . if transaction has not fully completed, keep sending
|
||||
// . unless there was an error
|
||||
if ( ! g_errno && ! slot->isTransactionComplete())return false;
|
||||
if ( ! g_errno &&
|
||||
! slot->isTransactionComplete() &&
|
||||
! slot->m_errno ) {
|
||||
if ( g_conf.m_logDebugUdp )
|
||||
log("udp: why calling handler "
|
||||
"when not ready?");
|
||||
return false;
|
||||
}
|
||||
// we should not destroy the slot here on ENOMEM error,
|
||||
// because handler might be referencing the slot's read buffer
|
||||
// still. that is what Msg20 does... the first dgram was
|
||||
@ -2468,6 +2544,7 @@ bool UdpServer::makeCallback_ass ( UdpSlot *slot ) {
|
||||
if ( g_inSigHandler ) goto queueSig;
|
||||
// nuke the slot, we gave them a reply...
|
||||
destroySlot ( slot );
|
||||
//log("udp: why double calling handler?");
|
||||
// this kind of callback doesn't count
|
||||
return false;
|
||||
}
|
||||
@ -2882,6 +2959,9 @@ bool UdpServer::readTimeoutPoll ( int64_t now ) {
|
||||
// . set slot's m_errno field
|
||||
// . makeCallbacks_ass() should call its callback
|
||||
slot->m_errno = EUDPTIMEDOUT;
|
||||
// prepare to call the callback by adding it to this
|
||||
// special linked list
|
||||
addToCallbackLinkedList ( slot );
|
||||
// let caller know we did something
|
||||
something = true;
|
||||
// keep going
|
||||
@ -2987,6 +3067,9 @@ bool UdpServer::readTimeoutPoll ( int64_t now ) {
|
||||
slot->m_callback ) {
|
||||
// should this be ENOACK or something?
|
||||
slot->m_errno = EUDPTIMEDOUT;
|
||||
// prepare to call the callback by adding it to this
|
||||
// special linked list
|
||||
addToCallbackLinkedList ( slot );
|
||||
// let caller know we did something
|
||||
something = true;
|
||||
// note it
|
||||
@ -3126,7 +3209,7 @@ bool UdpServer::shutdown ( bool urgent ) {
|
||||
time_t now = getTime();
|
||||
int32_t count = 0;
|
||||
if(!urgent) {
|
||||
//if ( m_head && m_head2->m_next2 ) return false;
|
||||
//if ( m_head && m_head2->m_next2 ) return false;
|
||||
for ( UdpSlot *slot = m_head2 ; slot ; slot = slot->m_next2 ) {
|
||||
// if we initiated, then don't count it
|
||||
if ( slot->m_callback ) continue;
|
||||
@ -3206,7 +3289,7 @@ bool UdpServer::timeoutDeadHosts ( Host *h ) {
|
||||
}
|
||||
|
||||
// verified that this is not interruptible
|
||||
UdpSlot *UdpServer::getEmptyUdpSlot_ass ( key_t k ) {
|
||||
UdpSlot *UdpServer::getEmptyUdpSlot_ass ( key_t k , bool incoming ) {
|
||||
// turn em off
|
||||
bool flipped = interruptsOff();
|
||||
// tmp debug
|
||||
@ -3244,14 +3327,19 @@ UdpSlot *UdpServer::getEmptyUdpSlot_ass ( key_t k ) {
|
||||
m_tail2 = slot;
|
||||
}
|
||||
// also to callback candidates if we should
|
||||
//if ( hasCallback ) {
|
||||
// slot->m_next3 = m_head3;
|
||||
// slot->m_prev3 = NULL;
|
||||
// if ( m_head3 ) m_head3->m_prev3 = slot;
|
||||
// m_head3 = slot;
|
||||
//}
|
||||
// if ( hasCallback ) {
|
||||
// slot->m_next3 = m_head3;
|
||||
// slot->m_prev3 = NULL;
|
||||
// if ( m_head3 ) m_head3->m_prev3 = slot;
|
||||
// m_head3 = slot;
|
||||
// }
|
||||
// count it
|
||||
m_numUsedSlots++;
|
||||
|
||||
if ( incoming ) m_numUsedSlotsIncoming++;
|
||||
|
||||
slot->m_incoming = incoming;
|
||||
|
||||
// now store ptr in hash table
|
||||
slot->m_key = k;
|
||||
addKey ( k , slot );
|
||||
@ -3281,6 +3369,71 @@ UdpSlot *UdpServer::getUdpSlot ( key_t k ) {
|
||||
return m_ptrs[i];
|
||||
}
|
||||
|
||||
void UdpServer::addToCallbackLinkedList ( UdpSlot *slot ) {
|
||||
// debug log
|
||||
if ( g_conf.m_logDebugUdp && slot->m_errno )
|
||||
log("udp: adding slot with err = %s to callback list"
|
||||
, mstrerror(slot->m_errno) );
|
||||
if ( g_conf.m_logDebugUdp )
|
||||
log("udp: adding slot=%"PTRFMT" to callback list"
|
||||
,(PTRTYPE)slot);
|
||||
// must not be in there already, lest we double add it
|
||||
if ( isInCallbackLinkedList ( slot ) ) {
|
||||
if ( g_conf.m_logDebugUdp )
|
||||
log("udp: avoided double add slot=%"PTRFMT
|
||||
,(PTRTYPE)slot);
|
||||
return;
|
||||
}
|
||||
slot->m_next3 = NULL;
|
||||
slot->m_prev3 = NULL;
|
||||
if ( ! m_tail3 ) {
|
||||
m_head3 = slot;
|
||||
m_tail3 = slot;
|
||||
}
|
||||
else {
|
||||
// insert at end of linked list otherwise
|
||||
m_tail3->m_next3 = slot;
|
||||
slot->m_prev3 = m_tail3;
|
||||
m_tail3 = slot;
|
||||
}
|
||||
}
|
||||
|
||||
bool UdpServer::isInCallbackLinkedList ( UdpSlot *slot ) {
|
||||
// return if not in the linked list
|
||||
if ( slot->m_prev3 ) return true;
|
||||
if ( slot->m_next3 ) return true;
|
||||
if ( m_head3 == slot ) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
void UdpServer::removeFromCallbackLinkedList ( UdpSlot *slot ) {
|
||||
|
||||
if ( g_conf.m_logDebugUdp )
|
||||
log("udp: removing slot=%"PTRFMT" from callback list"
|
||||
,(PTRTYPE)slot);
|
||||
|
||||
// return if not in the linked list
|
||||
if ( slot->m_prev3 == NULL &&
|
||||
slot->m_next3 == NULL &&
|
||||
m_head3 != slot )
|
||||
return;
|
||||
|
||||
// excise from linked list otherwise
|
||||
if ( m_head3 == slot )
|
||||
m_head3 = slot->m_next3;
|
||||
if ( m_tail3 == slot )
|
||||
m_tail3 = slot->m_prev3;
|
||||
|
||||
if ( slot->m_prev3 )
|
||||
slot->m_prev3->m_next3 = slot->m_next3;
|
||||
if ( slot->m_next3 )
|
||||
slot->m_next3->m_prev3 = slot->m_prev3;
|
||||
|
||||
// and so we do not try to re-excise it
|
||||
slot->m_prev3 = NULL;
|
||||
slot->m_next3 = NULL;
|
||||
}
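The three functions above implement a small intrusive doubly-linked list threaded through UdpSlot::m_next3/m_prev3, with m_head3/m_tail3 as the list anchors. As a reference, here is a self-contained toy version of the same pattern, including the save-the-next-pointer walk that makeCallbacks_ass() uses; the struct names and the main() driver are illustrative only, not from the codebase.

// Toy version of the intrusive m_head3/m_tail3 callback list pattern.
#include <cstdio>

struct Slot {
        Slot *m_next3;
        Slot *m_prev3;
        int   m_id;
};

struct Server {
        Slot *m_head3;
        Slot *m_tail3;

        void add ( Slot *s ) {
                // refuse a double add, like isInCallbackLinkedList()
                if ( s->m_prev3 || s->m_next3 || m_head3 == s ) return;
                s->m_next3 = NULL;
                s->m_prev3 = m_tail3;
                if ( m_tail3 ) m_tail3->m_next3 = s;
                else           m_head3          = s;
                m_tail3 = s;
        }

        void remove ( Slot *s ) {
                // not in the list? nothing to excise
                if ( ! s->m_prev3 && ! s->m_next3 && m_head3 != s ) return;
                if ( m_head3 == s ) m_head3 = s->m_next3;
                if ( m_tail3 == s ) m_tail3 = s->m_prev3;
                if ( s->m_prev3 ) s->m_prev3->m_next3 = s->m_next3;
                if ( s->m_next3 ) s->m_next3->m_prev3 = s->m_prev3;
                // so we do not try to re-excise it
                s->m_prev3 = NULL;
                s->m_next3 = NULL;
        }
};

int main ( ) {
        Server srv = { NULL , NULL };
        Slot a = { NULL , NULL , 1 };
        Slot b = { NULL , NULL , 2 };
        srv.add ( &a );
        srv.add ( &b );
        // walk like makeCallbacks_ass(): save m_next3 first because the
        // "callback" may remove (or destroy) the current slot
        Slot *next = NULL;
        for ( Slot *s = srv.m_head3 ; s ; s = next ) {
                next = s->m_next3;
                printf ( "calling back slot %d\n" , s->m_id );
                srv.remove ( s );
        }
        return 0;
}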
|
||||
|
||||
// verified that this is not interruptible
|
||||
void UdpServer::freeUdpSlot_ass ( UdpSlot *slot ) {
|
||||
bool flipped = interruptsOff();
|
||||
@ -3291,13 +3444,12 @@ void UdpServer::freeUdpSlot_ass ( UdpSlot *slot ) {
|
||||
if ( slot->m_prev2 ) slot->m_prev2->m_next2 = slot->m_next2;
|
||||
if ( slot->m_next2 ) slot->m_next2->m_prev2 = slot->m_prev2;
|
||||
// also from callback candidates if we should
|
||||
//if ( slot->m_callback ) {
|
||||
// if ( slot->m_prev3 ) slot->m_prev3->m_next3 = slot->m_next3;
|
||||
// else m_head3 = slot->m_next3;
|
||||
// if ( slot->m_next3 ) slot->m_next3->m_prev3 = slot->m_prev3;
|
||||
//}
|
||||
removeFromCallbackLinkedList ( slot );
|
||||
// discount it
|
||||
m_numUsedSlots--;
|
||||
|
||||
if ( slot->m_incoming ) m_numUsedSlotsIncoming--;
|
||||
|
||||
// add to linked list of available slots
|
||||
slot->m_next = m_head;
|
||||
m_head = slot;
|
||||
|
13
UdpServer.h
@ -170,6 +170,8 @@ class UdpServer {
|
||||
// an estimation as well
|
||||
//int32_t getNumUsedSlots () { return m_topUsedSlot + 1; };
|
||||
int32_t getNumUsedSlots () { return m_numUsedSlots; };
|
||||
|
||||
int32_t getNumUsedSlotsIncoming () { return m_numUsedSlotsIncoming; };
|
||||
|
||||
|
||||
// . when a request/msg of type "msgType" is received we call the
|
||||
@ -282,6 +284,11 @@ class UdpServer {
|
||||
|
||||
UdpSlot *getActiveHead ( ) { return m_head2; };
|
||||
|
||||
// callback linked list functions (m_head3)
|
||||
void addToCallbackLinkedList ( UdpSlot *slot ) ;
|
||||
bool isInCallbackLinkedList ( UdpSlot *slot );
|
||||
void removeFromCallbackLinkedList ( UdpSlot *slot ) ;
|
||||
|
||||
// cancel a transaction
|
||||
void cancel ( void *state , unsigned char msgType ) ;
|
||||
|
||||
@ -409,7 +416,7 @@ class UdpServer {
|
||||
int32_t m_maxSlots;
|
||||
|
||||
// routines
|
||||
UdpSlot *getEmptyUdpSlot_ass ( key_t k );
|
||||
UdpSlot *getEmptyUdpSlot_ass ( key_t k , bool incoming );
|
||||
void freeUdpSlot_ass ( UdpSlot *slot );
|
||||
|
||||
void addKey ( key_t key , UdpSlot *ptr ) ;
|
||||
@ -434,9 +441,11 @@ class UdpServer {
|
||||
UdpSlot *m_head2;
|
||||
UdpSlot *m_tail2;
|
||||
// linked list of callback candidates
|
||||
//UdpSlot *m_head3;
|
||||
UdpSlot *m_head3;
|
||||
UdpSlot *m_tail3;
|
||||
|
||||
int32_t m_numUsedSlots;
|
||||
int32_t m_numUsedSlotsIncoming;
|
||||
|
||||
// stats
|
||||
public:
|
||||
|
@ -1502,6 +1502,9 @@ bool UdpSlot::readDatagramOrAck ( int sock ,
|
||||
// if it's a msg 0x0c reply from a proxy over roadrunner wireless
|
||||
// they tend to damage our packets for some reason so i repeat
|
||||
// the ip for a total of an 8 byte reply
|
||||
/*
|
||||
MDW: this seems to be causing problems on local networks
|
||||
so taking it out. 4/7/2015.
|
||||
if ( m_msgType == 0x0c && msgSize == 12 && peekSize == 24 &&
|
||||
// must be reply! not request.
|
||||
m_callback ) {
|
||||
@ -1528,6 +1531,7 @@ bool UdpSlot::readDatagramOrAck ( int sock ,
|
||||
return true;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
// we're doing the call to recvfrom() for sure now
|
||||
*discard = false;
|
||||
@ -1572,6 +1576,7 @@ bool UdpSlot::readDatagramOrAck ( int sock ,
|
||||
0 ,
|
||||
NULL ,
|
||||
NULL );
|
||||
//log("udp: recvfrom1 = %i",(int)numRead);
|
||||
// let caller know how much we read for stats purposes
|
||||
*readSize = numRead;
|
||||
// restore what was at the header before we stored it there
|
||||
@ -1614,6 +1619,7 @@ bool UdpSlot::readDatagramOrAck ( int sock ,
|
||||
0 ,
|
||||
NULL ,
|
||||
NULL );
|
||||
//log("udp: recvfrom2 = %i",(int)dgramSize);
|
||||
// bail on error, how could this happen?
|
||||
if ( dgramSize < 0 ) {
|
||||
// valgrind
|
||||
|
13
UdpSlot.h
@ -412,6 +412,13 @@ class UdpSlot {
|
||||
// save cpu by not having to call memset() on m_sentBits et al
|
||||
int32_t m_numBitsInitialized;
|
||||
|
||||
// and for doubly linked list of callback candidates
|
||||
class UdpSlot *m_next3;
|
||||
class UdpSlot *m_prev3;
|
||||
|
||||
// memset clears from here and above. so put anything that needs to
|
||||
// be set to zero above this line.
|
||||
|
||||
// . i've discarded the window since msg size is limited
|
||||
// . this way is faster
|
||||
// . these bits determine what dgrams we've sent/read/sentAck/readAck
|
||||
@ -425,9 +432,7 @@ class UdpSlot {
|
||||
// and for doubly linked list of used slots
|
||||
class UdpSlot *m_next2;
|
||||
class UdpSlot *m_prev2;
|
||||
// and for doubly linked list of callback candidates
|
||||
//class UdpSlot *m_next3;
|
||||
//class UdpSlot *m_prev3;
|
||||
|
||||
// store the key so when returning slot we can remove from hash table
|
||||
key_t m_key;
|
||||
|
||||
@ -435,6 +440,8 @@ class UdpSlot {
|
||||
|
||||
char m_maxResends;
|
||||
|
||||
char m_incoming;
|
||||
|
||||
// . for the hot udp server, we cannot call malloc in the sig handler
|
||||
// so we set m_readBuf to this to read in int16_t requests
|
||||
// . caller should pre-allocated m_readBuf when calling sendRequest()
|
||||
|
2
Url.h
2
Url.h
@ -202,7 +202,7 @@ public:
|
||||
char *getShorthandUrl ( bool rmWWW , int32_t *len );
|
||||
|
||||
// count the path components (root url as 0 path components)
|
||||
int32_t getPathDepth ( bool countFilename = false );
|
||||
int32_t getPathDepth ( bool countFilename ); // = false );
|
||||
|
||||
// get path component #num. starts at 0.
|
||||
char *getPathComponent ( int32_t num , int32_t *clen );
|
||||
|
17
Xml.cpp
@ -287,10 +287,19 @@ bool Xml::set ( char *s ,
|
||||
return true;
|
||||
}
|
||||
|
||||
// override
|
||||
// override. no don't it hurts when parsing CT_XML docs!!
|
||||
// we need XmlNode.cpp's setNodeInfo() to identify xml tags in
|
||||
// an rss feed. No, this was here for XmlDoc::hashXml() i think
|
||||
// so let's just fix Links.cpp to get links from pure xml.
|
||||
// we can't do this any more. it's easier to fix xmldoc::hashxml()
|
||||
// some other way... because Links.cpp and Xml::isRSSFeed()
|
||||
// depend on having regular tagids. but without this here
|
||||
// then XmlDoc::hashXml() breaks.
|
||||
if ( contentType == CT_XML )
|
||||
pureXml = true;
|
||||
pureXml = true;
|
||||
|
||||
// is it an xml conf file?
|
||||
m_pureXml = pureXml;
|
||||
|
||||
QUICKPOLL((niceness));
|
||||
int32_t i;
|
||||
@ -372,8 +381,12 @@ bool Xml::set ( char *s ,
|
||||
bool endsInSlash = false;
|
||||
if ( xi->m_node[xi->m_nodeLen-2] == '/' ) endsInSlash = true;
|
||||
if ( xi->m_node[xi->m_nodeLen-2] == '?' ) endsInSlash = true;
|
||||
// disregard </> in the conf files
|
||||
if ( xi->m_nodeLen==3 && endsInSlash ) endsInSlash = false;
|
||||
|
||||
// if not text node then he's the new parent
|
||||
// if we don't do this for xhtml then we don't pop the parent
|
||||
// and run out of parent stack space very quickly.
|
||||
if ( pureXml &&
|
||||
xi->m_nodeId &&
|
||||
xi->m_nodeId != TAG_COMMENT &&
|
||||
|
2
Xml.h
@ -230,6 +230,8 @@ class Xml {
|
||||
int32_t m_numNodes;
|
||||
int32_t m_maxNumNodes;
|
||||
|
||||
bool m_pureXml;
|
||||
|
||||
char *m_xml;
|
||||
int32_t m_xmlLen;
|
||||
|
||||
|
823
XmlDoc.cpp
File diff suppressed because it is too large
29
XmlDoc.h
@ -506,7 +506,8 @@ class XmlDoc {
|
||||
bool setTitleRecBuf ( SafeBuf *buf , int64_t docId, int64_t uh48 );
|
||||
// sets m_titleRecBuf/m_titleRecBufValid/m_titleRecKey[Valid]
|
||||
SafeBuf *getTitleRecBuf ( );
|
||||
SafeBuf *getSpiderStatusDocMetaList ( class SpiderReply *reply ) ;
|
||||
SafeBuf *getSpiderStatusDocMetaList ( class SpiderReply *reply ,
|
||||
bool forDelete ) ;
|
||||
SafeBuf *getSpiderStatusDocMetaList2 ( class SpiderReply *reply ) ;
|
||||
SafeBuf m_spiderStatusDocMetaList;
|
||||
char *getIsAdult ( ) ;
|
||||
@ -532,6 +533,7 @@ class XmlDoc {
|
||||
char *getIsPermalink ( ) ;
|
||||
char *getIsUrlPermalinkFormat ( ) ;
|
||||
char *getIsRSS ( ) ;
|
||||
char *getIsSiteMap ( ) ;
|
||||
class Xml *getXml ( ) ;
|
||||
uint8_t *getLangVector ( ) ;
|
||||
uint8_t *getLangId ( ) ;
|
||||
@ -734,6 +736,18 @@ class XmlDoc {
|
||||
|
||||
char *getDiffbotParentUrl( char *myUrl );
|
||||
|
||||
int64_t m_diffbotReplyEndTime;
|
||||
int64_t m_diffbotReplyStartTime;
|
||||
int32_t m_diffbotReplyRetries;
|
||||
|
||||
bool m_sentToDiffbotThisTime;
|
||||
|
||||
uint64_t m_downloadStartTime;
|
||||
//uint64_t m_downloadEndTime;
|
||||
|
||||
uint64_t m_ipStartTime;
|
||||
uint64_t m_ipEndTime;
|
||||
|
||||
void copyFromOldDoc ( class XmlDoc *od ) ;
|
||||
|
||||
class SpiderReply *getFakeSpiderReply ( );
|
||||
@ -785,8 +799,8 @@ class XmlDoc {
|
||||
bool hashContentType ( class HashTableX *table ) ;
|
||||
bool hashDMOZCategories ( class HashTableX *table ) ;
|
||||
bool hashLinks ( class HashTableX *table ) ;
|
||||
bool hashUrl ( class HashTableX *table , bool isStatusDoc = false ) ;
|
||||
bool hashDateNumbers ( class HashTableX *tt , bool isStatusDoc=false) ;
|
||||
bool hashUrl ( class HashTableX *table );
|
||||
bool hashDateNumbers ( class HashTableX *tt );
|
||||
bool hashSections ( class HashTableX *table ) ;
|
||||
bool hashIncomingLinkText ( class HashTableX *table ,
|
||||
bool hashAnomalies ,
|
||||
@ -1148,6 +1162,7 @@ class XmlDoc {
|
||||
char m_addedSpiderRequestSizeValid;
|
||||
char m_addedSpiderReplySizeValid;
|
||||
char m_addedStatusDocSizeValid;
|
||||
char m_downloadStartTimeValid;
|
||||
//char m_docQualityValid;
|
||||
char m_siteValid;
|
||||
char m_startTimeValid;
|
||||
@ -1215,6 +1230,7 @@ class XmlDoc {
|
||||
char m_rootLangIdValid;
|
||||
char m_datedbDateValid;
|
||||
char m_isRSSValid;
|
||||
char m_isSiteMapValid;
|
||||
char m_spiderLinksArgValid;
|
||||
char m_isContentTruncatedValid;
|
||||
char m_xmlValid;
|
||||
@ -1436,6 +1452,8 @@ class XmlDoc {
|
||||
bool m_looseContentHash64Valid;
|
||||
bool m_jpValid;
|
||||
|
||||
char m_isSiteMap;
|
||||
|
||||
// shadows
|
||||
char m_isRSS2;
|
||||
char m_isPermalink2;
|
||||
@ -1634,7 +1652,7 @@ class XmlDoc {
|
||||
//class LinkInfo *m_linkInfo1Ptr;
|
||||
char *m_linkInfoColl;
|
||||
//char m_injectedReply;
|
||||
int32_t m_minInlinkerHopCount;
|
||||
//int32_t m_minInlinkerHopCount;
|
||||
//class LinkInfo *m_linkInfo2Ptr;
|
||||
SiteGetter m_siteGetter;
|
||||
int64_t m_siteHash64;
|
||||
@ -1712,6 +1730,9 @@ class XmlDoc {
|
||||
bool doesPageContentMatchDiffbotProcessPattern() ;
|
||||
int32_t *getDiffbotTitleHashes ( int32_t *numHashes ) ;
|
||||
char *hashJSONFields ( HashTableX *table );
|
||||
char *hashJSONFields2 ( HashTableX *table , HashInfo *hi , Json *jp ,
|
||||
bool hashWithoutFieldNames ) ;
|
||||
|
||||
char *hashXMLFields ( HashTableX *table );
|
||||
int32_t *reindexJSONObjects ( int32_t *newTitleHashes ,
|
||||
int32_t numNewHashes ) ;
|
||||
|
@ -194,8 +194,10 @@ NodeType g_nodes[] = {
|
||||
|
||||
{"scriptText",0, 1, 0, 0,0, TAG_SCRIPTTEXT,0 },
|
||||
{"BUTTON" , 1, 1, 1, 0,0, TAG_BUTTON ,0},
|
||||
{"UrlFrom", 0, 1, 1, 0,0, TAG_URLFROM ,1}
|
||||
{"UrlFrom", 0, 1, 1, 0,0, TAG_URLFROM ,1},
|
||||
|
||||
// for sitemap.xml
|
||||
{"LOC" , 0, 1, 1, 0,0, TAG_LOC,0}
|
||||
//{"BUTTON" , 1, 1, 1, 2, 122,0},
|
||||
//{"BDO" , 1, 1, 1, 2, 123,0},
|
||||
//{"LABEL" , 1, 1, 1, 2, 124,0},
|
||||
@ -312,7 +314,9 @@ int32_t XmlNode::set ( char *node , bool pureXml , int32_t version ) {
|
||||
m_hasBackTag = true;
|
||||
m_isBreaking = true;
|
||||
m_isVisible = true;
|
||||
m_nodeId = TAG_XMLTAG;//1;
|
||||
//m_nodeId = TAG_XMLTAG;//1;
|
||||
// this returns 1 if tag is not in the list
|
||||
m_nodeId = setNodeInfo ( m_hash );//&m_hasBackTag ,
|
||||
}
|
||||
// . determine if the nodeId for this node
|
||||
// . determine if it breaks lines (for phrasing purposes)
|
||||
|
@ -322,13 +322,15 @@ enum {
|
||||
TAG_BUTTON,
|
||||
TAG_URLFROM, // for ahrefs.com
|
||||
|
||||
// support sitemap.xml
|
||||
TAG_LOC,
|
||||
|
||||
//
|
||||
// fake tags below here
|
||||
//
|
||||
// a fake tag used by Sections.cpp
|
||||
TAG_SENTENCE,
|
||||
|
||||
|
||||
LAST_TAG
|
||||
};
|
||||
#endif
|
||||
|
406
html/blog.html
@ -1,12 +1,406 @@
|
||||
<html>
|
||||
<title>Gigablast - Blog</title>
|
||||
|
||||
<div style=max-width:700px;>
|
||||
|
||||
<br>
|
||||
<br><br>
|
||||
|
||||
<a name=comparetool></a>
|
||||
<font size=+1><b>Compare Tool</b></font><br>
|
||||
<i>Aug 17, 2013</i><br><br>
|
||||
This week I have begun constructing a tool that allows you to compare Gigablast with Solr and Elasticsearch, which are two of the more popular open-source search engines on the market today. Both of those are based on Lucene. So essentially I will be comparing everything I think is noteworthy; if I leave something out, drop me an email.
|
||||
<a name=revival></a>
|
||||
<font size=+1><b>15 Year Anniversary</b></font><br>
|
||||
<i>September 1, 2014</i><br><br>
|
||||
It's been 15 years since I first started Gigablast. It's taken some interesting directions as of late. Most notably being open source. I've decided to revive the old blog entries that you can find below and continue working on top of those.
|
||||
|
||||
|
||||
|
||||
|
||||
<br><br><br><br>
|
||||
|
||||
|
||||
|
||||
|
||||
<a name=gigabits></a>
|
||||
<font size=+1><b>Giga Bits Introduced</b></font><br>
|
||||
<i>Jan 31, 2004</i><br><br>
|
||||
Gigablast now generates related concepts for your query. I call them Giga Bits. I believe it is the best concept generator in the industry, but if you don't think so please <a href="/contact.html">drop me a note</a> explaining why not, so I can improve it.
|
||||
<br><br>
|
||||
You can also ask Gigablast a simple question like <a href="/search?q=Who+is+President+of+Russia%3F">"Who is President of Russia?"</a> and it often comes up with the correct answer in the Giga Bits section. How do you think it does that?
|
||||
<br><br>
|
||||
In other news, the spider speed ups I rolled a few weeks ago are tremendously successful. I can easily burn all my bandwidth quota with insignificant load on my servers. I could not be happier with this.
|
||||
<br><br>
|
||||
Now I'm planning on turning Gigablast into a default AND engine. Why? Because it will decrease query latency by several times, believe it or not. That should put Gigablast on par with the fastest engines in the world, even though it only runs on 8 desktop machines. But don't worry, I will still leave the default OR functionality intact.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=update></a>
|
||||
<font size=+1><b>January Update Rolled</b></font><br>
|
||||
<i>Jan 8, 2004</i><br><br>
|
||||
Gigablast now has a more professional, but still recognizable, logo, and a new catch phrase, "Information Acceleration". Lots of changes on the back end. You should notice significantly higher quality searches. The spider algorithm was sped up several times. Gigablast should be able to index several million documents per day, but that still remains to be tested. <knock on wood>. Site clustering was sped up. I added the ability to force all query terms to be required by using the &rat=1 cgi parm. Now Gigablast will automatically regenerate some of its databases when they are missing. And I think I wasted two weeks working like a dog on code that I'm not going to end up using! I hate when that happens...
|
||||
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=traffic></a>
|
||||
<font size=+1><b>An Easy way to Slash Motor Vehicle Emissions</b></font><br>
|
||||
<i>Dec 11, 2003</i><br><br>
|
||||
Blanket the whole city with wi-fi access. (like <a href="/?redir=http://story.news.yahoo.com/news?tmpl=story&ncid=1293&e=2&u=/ap/20031211/ap_on_hi_te/wi_fi_city&sid=95573418">Cerritos, California</a>) When you want to travel from point A
|
||||
to point B, tell the central traffic computer. It will then give you a time
|
||||
window in which to begin your voyage and, most importantly, it will ensure that
|
||||
as long as you stay within the window you will always hit green lights.
|
||||
<br><br>
|
||||
If you stray from your path, you'll be able to get a new window via the wi-fi network.
|
||||
If everyone's car has gps and is connected to the wi-fi network,
|
||||
the central computer will also be able to monitor the flow of traffic and
|
||||
make adjustments to your itinerary in real-time.
|
||||
Essentially, the traffic computer will be solving a large system of linear,
|
||||
and possibly non-linear, constraints in real-time. Lots of fun... and think of
|
||||
how much more efficient travel will be!! If someone wants to secure funding,
|
||||
count me in.
|
||||
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=spellchecker></a>
|
||||
<font size=+1><b>Spellchecker Finally Finished</b></font><br>
|
||||
<i>Nov 18, 2003</i><br><br>
|
||||
After a large, countable number of interruptions, I've finally completed the spellchecker. I tested the word '<b>dooty</b>' on several search engines to see how they handled that misspelling. Here's what I got:
|
||||
<br><br>
|
||||
<table>
|
||||
<tr><td><b>Source</b></td><td><b>Result</b></td></tr>
|
||||
<tr><td>Alltheweb</td><td><a href="http://www.alltheweb.com/search?query=dooty">booty</a><td></tr>
|
||||
<tr><td>Altavista</td><td><a href="http://search01.altavista.com/web/results?q=dooty">dhooti</a></td></tr>
|
||||
<tr><td>Gigablast</td><td><a href="http://www.gigablast.com/search?q=dooty">door</a></td></tr>
|
||||
<tr><td>Google</td><td><a href="http://www.google.com/search?q=dooty">doody</a></td></tr>
|
||||
<tr><td>Microsoft Word</td><td>Doty</td></tr>
|
||||
<tr><td>Teoma</td><td><a href="http://s.teoma.com/search?q=dooty">doty</a></td></tr>
|
||||
<tr><td>Wisenut</td><td>N/A (no spellchecker)</td></tr>
|
||||
</table>
|
||||
<br>
|
||||
So there is no one way to code a spellchecker. It's a guessing game. And, hey Wisenut, want to license a good spellchecker for cheap? <a href="/contact.html">Let me know</a>.
|
||||
|
||||
<br><br>
|
||||
Gigablast uses its cached web pages to generate its dictionary instead of the query logs. When a word or phrase is not found in the dictionary, Gigablast replaces it with the closest match in the dictionary. If multiple words or phrases are equally close, then Gigablast resorts to a popularity ranking.
|
||||
<br><br>
|
||||
One interesting thing I noticed is that in Google's spellchecker you must at least get the first letter of the word correct, otherwise, Google will not be able to recommend the correct spelling. I made Gigablast this way too, because it really cuts down on the number of words it has to search to come up with a recommendation. This also allows you to have an extremely large dictionary distributed amongst several machines, where each machine is responsible for a letter.
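A minimal, self-contained sketch of the lookup described above, assuming a dictionary bucketed by first letter with a popularity count per entry (illustrative only, not Gigablast's actual implementation): search just the bucket for the misspelled word's first letter, take the entry with the smallest edit distance, and break ties by popularity.

#include <algorithm>
#include <map>
#include <string>
#include <vector>

struct DictEntry { std::string word; int popularity; };

// classic Levenshtein edit distance, two-row dynamic programming
static int editDistance ( const std::string &a , const std::string &b ) {
        std::vector<int> prev ( b.size() + 1 ) , cur ( b.size() + 1 );
        for ( size_t j = 0 ; j <= b.size() ; j++ ) prev[j] = (int)j;
        for ( size_t i = 1 ; i <= a.size() ; i++ ) {
                cur[0] = (int)i;
                for ( size_t j = 1 ; j <= b.size() ; j++ ) {
                        int sub = prev[j-1] + ( a[i-1] == b[j-1] ? 0 : 1 );
                        cur[j] = std::min ( std::min ( prev[j] + 1 , cur[j-1] + 1 ) , sub );
                }
                std::swap ( prev , cur );
        }
        return prev[b.size()];
}

// only the bucket for the word's first letter is searched, so each
// bucket could live on its own machine
static std::string suggest ( const std::map< char , std::vector<DictEntry> > &dict ,
                             const std::string &misspelled ) {
        if ( misspelled.empty() ) return misspelled;
        std::map< char , std::vector<DictEntry> >::const_iterator it =
                dict.find ( misspelled[0] );
        if ( it == dict.end() ) return misspelled;
        std::string best = misspelled;
        int bestDist = 1 << 30 , bestPop = -1;
        for ( size_t i = 0 ; i < it->second.size() ; i++ ) {
                const DictEntry &e = it->second[i];
                int d = editDistance ( misspelled , e.word );
                // closest match wins; popularity breaks ties
                if ( d < bestDist || ( d == bestDist && e.popularity > bestPop ) ) {
                        bestDist = d; bestPop = e.popularity; best = e.word;
                }
        }
        return best;
}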
|
||||
<br><br>
|
||||
Also of note: I am planning on purchasing the hardware required for achieving a 5 billion document index capable of serving hundreds of queries per second within the next 12 months. Wish me luck... and thanks for using Gigablast.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=onagain></a>
|
||||
<font size=+1><b>Spiders On Again</b></font><br>
|
||||
<i>Nov 10, 2003</i><br><br>
|
||||
After updating the spider code I've reactivated the spiders. Gigablast should be able to spider at a faster rate with even less impact on query response time than before. So add your urls now while the addings good.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=speed></a>
|
||||
<font size=+1><b>Going For Speed</b></font><br>
|
||||
<i>Nov 3, 2003</i><br><br>
|
||||
I've finally got around to working on Gigablast's distributed caches. It was not doing a lot of caching before. The new cache class I rigged up has no memory fragmentation and minimal record overhead. It is vurhy nice.<br><br>
|
||||
I've stopped spidering just for a bit so I can dedicate all Gigablast's RAM to the multi-level cache system I have in place now and see how much I can reduce query latency. Disks are still my main point of contention by far so the caching helps out a lot. But I could still use more memory.<br><br>
|
||||
Take Gigablast for a <a href="/">spin</a>. See how fast it is.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=metas></a>
|
||||
<font size=+1><b>Bring Me Your Meta Tags</b></font><br>
|
||||
<i>Oct 11, 2003</i><br><br>
|
||||
As of now Gigablast supports the indexing, searching and displaying of generic meta tags. You name them I fame them. For instance, if you have a tag like <i><meta name="foo" content="bar baz"></i> in your document, then you will be able to do a search like <i><a href="/search?q=foo%3Abar&dt=foo">foo:bar</a></i> or <i><a href="/search?q=foo%3A%22bar+baz%22&dt=foo">foo:"bar baz"</a></i> and Gigablast will find your document.
|
||||
<br><br>
|
||||
You can tell Gigablast to display the contents of arbitrary meta tags in the search results, like <a href="/search?q=gigablast&s=10&dt=author+keywords%3A32">this</a>. Note that you must assign the <i>dt</i> cgi parameter to a space-separated list of the names of the meta tags you want to display. You can limit the number of returned characters of each tag to X characters by appending a <i>:X</i> to the name of the meta tag supplied to the <i>dt</i> parameter. In the link above, I limited the displayed keywords to 32 characters.
|
||||
<br><br>
|
||||
Why use generic metas? Because it is very powerful. It allows you to embed custom data in your documents, search for it and retrieve it. Originally I wanted to do something like this in XML, but now my gut instincts are that XML is not catching on because it is ugly and bloated. Meta tags are pretty and slick.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=verisignstopped></a>
|
||||
<font size=+1><b>Verisign Stops Destroying the Internet</b></font><br>
|
||||
<i>Oct 11, 2003</i><br><br>
|
||||
Ok, they actually stopped about a week ago, but I didn't get around to posting it until now. They really ought to lose their privileged position so this does not happen again. Please do not stop your boycott. They have not learned from their mistakes.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=moreverisign></a>
|
||||
<font size=+1><b>Verisign Continues to Damage Gigablast's Index</b></font><br>
|
||||
<i>September 30, 2003</i><br><br>
|
||||
When the Gigablast spider tries to download a page from a domain it first gets the associated robots.txt file for that domain. When the domain does not exist it ends up downloading a robots.txt file from verisign. There are two major problems with this. The first is that verisign's servers may be slow which will slow down Gigablast's indexing. Secondly, and this has been happening for a while now, Gigablast will still index any incoming link text for that domain, thinking that the domain still exists, but just that spider permission was denied by the robots.txt file.
|
||||
<br>
|
||||
<br>
|
||||
So, hats off to you verisign, thanks for enhancing my index with your fantastic "service". I hope your company is around for many years so you can continue providing me with your great "services".
|
||||
<br>
|
||||
<br>
|
||||
If you have been hurt because of verisign's greed you might want to consider joining the <a href="/?redir=http://www.geek.com/news/geeknews/2003Sep/gee20030929021965.htm">class-action lawsuit</a> announced Friday, September 26th, by the <a href="/?redir=http://www.techfirm.com/">Ira Rothken law firm</a>.
|
||||
<br>
|
||||
<br>
|
||||
Want to learn more about how the internet is run? Check out <a href="/?redir=http://www.paradigm.nu/icann/">the ICANN movie page</a>. Movie #1 portrays verisign's CEO, Stratton Sclavos, quite well in my opinion.
|
||||
<br>
|
||||
<br>
|
||||
<b>(10/01/03) Update #5:</b> verisign <a href="/?redir=http://www.pcworld.com/news/article/0,aid,112712,00.asp">comes under further scrutiny</a>.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=verisign></a>
|
||||
<font size=+1><b>Verisign Redesigns the Internet for their Own Profit</b></font><br>
|
||||
<i>September 24, 2003</i><br><br>
|
||||
My spiders expect to get "not found" messages when they look up a domain that does not have an IP. When verisign uses their privileged position to change the underlying fundamentals of the internet just to line their own greedy pockets it really, really perturbs me. Now, rather than get the "not found" message, my spiders get back a valid IP, the IP of verisign's commercial servers. That causes my spiders to then proceed to download the robots.txt from that domain. This can take forever if their servers are slow. What a pain. Now I have to fix my freakin' code. And that's just one of many problems this company has caused.
|
||||
<br>
|
||||
<br>
|
||||
Please join me in boycott. I'm going to discourage everyone I know from supporting this abusive, monopolistic entity.
|
||||
<br>
|
||||
<br>
|
||||
<b>(9/22/03) Update #1:</b> verisign <a href="/?redir=http://www.icann.org/correspondence/lewis-to-twomey-21sep03.htm">responded</a> to ICANN's request that they stop. <a href="/?redir=http://slashdot.org/articles/03/09/22/2255202.shtml?tid=126&tid=95&tid=99">See what the slashdot community has to say about this response.</a>
|
||||
<br>
|
||||
<br>
|
||||
<b>(9/22/03) Update #2:</b> ICANN has now posted some complaints in this <a href="/?redir=http://forum.icann.org/alac-forum/redirect/">forum</a>.
|
||||
<br>
|
||||
<br>
|
||||
<b>(9/24/03) Update #3:</b> Slashdot has more <a href="/?redir=http://yro.slashdot.org/yro/03/09/24/0134256.shtml?tid=126&tid=95&tid=98&tid=99">coverage</a>.
|
||||
<br>
|
||||
<br>
|
||||
<b>(9/24/03) Update #4:</b> Please sign the <a href="/?redir=http://www.whois.sc/verisign-dns/">petition</a> to stop verisign.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=geotags></a>
|
||||
<font size=+1><b>Geo-Sensitive Search</b></font><br>
|
||||
<i>September 18, 2003</i><br><br>
|
||||
Gigablast now supports some special new meta tags that allow for constraining a search to a particular zipcode, city, state or country. Support was also added for the standard author, language and classification meta tags. This <a href="/tagsdemo.html">page</a> explains more. These meta tags should be standard, everyone should use them (but not abuse them!) and things will be easier for everybody.
|
||||
<br><br>
|
||||
Secondly, I have declared jihad against stale indexes. I am planning a significantly faster update cycle, not to mention growing the index to about 400 million pages, all hopefully in the next few months.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=turing></a>
|
||||
<font size=+1><b>Foiling the Addurl Scripts</b></font><br>
|
||||
<i>September 6, 2003</i><br><br>
|
||||
The new pseudo-Turing test on the <a href="/addurl">addurl page</a> should prevent most automated scripts from submitting boatloads of URLs. If someone actually takes the time to code a way around it then I'll just have to take it a step further. I would rather work on other things, though, so please quit abusing my free service and discontinue your scripts. Thanks.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=boolean></a>
|
||||
<font size=+1><b>Boolean is Here</b></font><br>
|
||||
<i>September 1, 2003</i><br><br>
|
||||
I just rolled out the new boolean logic code. You should be able to do nested boolean queries using the traditional AND, OR and NOT boolean operators. See the updated <a href="/help.html#boolean">help page</a> for more detail.
|
||||
<br><br>
|
||||
I have declared jihad against swapping and am now running the 2.4.21-rc6-rmap15j Linux kernel with swap tuned to zero using the /proc/sys/vm/pagecache knobs. So far no machines have swapped, which is great, but I'm unsure of this kernel's stability.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=swap></a>
|
||||
<font size=+1><b>All Swapped Out</b></font><br>
|
||||
<i>August 29, 2003</i><br><br>
|
||||
I no longer recommend turning the swap off, at least not on linux 2.4.22. A kernel panicked on me and froze a server. Not good. If anyone has any ideas for how I can prevent my app from being swapped out, please let me know. I've tried mlockall() within my app but that makes its memory usage explode for some reason. I've also tried Rik van Riel's 2.4.21-rc6-rmap15j.txt patch on the 2.4.21 kernel, but it still does unnecessary swapping (although, strangely, only when spidering). If you know how to fix this problem, please help!!! <a href="vmstat.html">Here</a> is the output from the vmstat command on one of my production machines running 2.4.22. And <a href="vmstatrik.html">here</a> is the output from my test machine running 2.4.21-rc6-rmap15j.txt.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=kernel></a>
|
||||
<font size=+1><b>Kernel Update</b></font><br>
|
||||
<i>August 28, 2003</i><br><br>
|
||||
I updated the Linux kernel to 2.4.22, which was just released a few days ago on <a href="/?redir=http://www.kernel.org/">kernel.org</a>. Now my gigabit cards are working, yay! I finally had to turn off swap using the swapoff command. When an application runs out of memory the swapper is supposed to write infrequently used memory to disk so it can give that memory to the application that needs it. Unfortunately, the Linux virtual memory manager enjoys swapping out an application's memory for no good reason. This can often make an application disastrously slow, especially when the application ends up blocking on code that it doesn't expect to! And, furthermore, when the application uses the disk intensely it has to wait even longer for memory to get swapped back in from disk. I recommend that anyone who needs high performance turn off the swap and just make sure their program does not use more physical memory than is available.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=gang></a>
|
||||
<font size=+1><b>The Gang's All Here</b></font><br>
|
||||
<i>August 17, 2003</i><br><br>
|
||||
I decided to add PostScript (<a href="/search?q=type:ps">.ps</a>) , PowerPoint (<a href="/search?q=type:ppt">.ppt</a>), Excel SpreadSheet (<a href="/search?q=type:xls">.xls</a>) and Microsoft Word (<a href="/search?q=type:doc">.doc</a>) support in addition to the PDF support. Woo-hoo.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=pdf></a>
|
||||
<font size=+1><b>PDF Support</b></font><br>
|
||||
<i>August 14, 2003</i><br><br>
|
||||
Gigablast now indexes PDF documents. Try the search <a href="/search?q=type:pdf"><i>type:pdf</i></a> to see some PDF results. <i>type</i> is a new search field. It also support the text type, <a href="/search?q=type:text"><i>type:text</i></a>, and will support other file types in the future.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=codeupdate3></a>
|
||||
<font size=+1><b>Minor Code Updates</b></font><br>
|
||||
<i>July 17, 2003</i><br><br>
|
||||
I've cleaned up the keyword highlight routines so they don't highlight isolated stop words. Gigablast now displays a <a href="/superRecall.html">blue bar</a> above returned search results that do not have <b>all</b> of your query terms. When returning a page of search results Gigablast lets you know how long ago that page was cached by displaying a small message at the bottom of that page. NOTE: This small message is at the bottom of the page containing the search results, not at the bottom of any pages from the web page cache, that is a different cache entirely. Numerous updates to less user-visible things on the back end. Many bugs fixed, but still more to go. Thanks a bunch to Bruce Perens for writing the <a href="/?redir=http://www.perens.com/FreeSoftware/">Electric Fence</a> debug utility.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=codeupdate2></a>
|
||||
<font size=+1><b>Gigablast 2.0</b></font><br>
|
||||
<i>June 20, 2003</i><br><br>
|
||||
I've recently released Gigablast 2.0. Right now Gigablast can do about twice as many queries per second as before. When I take care of a few more things that rate should double again.
|
||||
<br><br>
|
||||
The ranking algorithm now treats phrase weights much better. If you search for something like <i><a href="/search?q=boots+in+the+uk">boots in the uk</a></i> you won't get a bunch of results that have that exact phrase in them, but rather you will get UK sites about boots (theoretically). And when you do a search like <i><a href="/search?q=all+the+king%27s+men">all the king's men</a></i> you will get results that have that exact phrase. If you find any queries for which Gigablast is especially bad, but a competing search engine is good, please <a href="/contact.html">let me know</a>; I am very interested.
|
||||
<br><br>
|
||||
2.0 also introduced a new index format. The new index is half the size of the old one. This allows my current setup to index over 400 million pages with dual redundancy. Before it was only able to index about 300 million pages. The decreased index size also speeds up the query process since only half as much data needs to be read from disk to satisfy a query.
|
||||
<br><br>
|
||||
I've also started a full index refresh, starting with top level pages that haven't been spidered in a while. This is especially nice because a lot of pages that were indexed before all my anti-spam algorithms were 100% in place are just now getting filtered appropriately. I've manually removed over 100,000 spam pages so far, too.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=grub></a>
|
||||
<font size=+1><b>My Take on Looksmart's Grub</b></font><br>
|
||||
<i>Apr 19, 2003</i><br><br>
|
||||
There's been some press about Grub, a program from Looksmart which you install on your machine to help Looksmart spider the web. Looksmart is only using Grub to save on their bandwidth. Essentially Grub just compresses web pages before sending them to Looksmart's indexer thus reducing the bandwidth they have to pay for by a factor of 5 or so. The same thing could be accomplished through a proxy which compresses web pages. Eventually, once the HTTP mime standard for requesting compressed web pages is better supported by web servers, Grub will not be necessary.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=codeupdate></a>
|
||||
<font size=+1><b>Code Update</b></font><br>
|
||||
<i>Mar 25, 2003</i><br><br>
|
||||
I just rolled some significant updates to Gigablast's back-end. Gigablast now has a uniformly-distributed, unreplicated search results cache. This means that if someone has done your search within the last several hours then you will get results back very fast. This also means that Gigablast can handle a lot more queries per second.
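A uniformly-distributed, unreplicated cache of this kind presumably boils down to hashing the normalized query to pick the single host that owns its cache entry, so each result set is stored exactly once across the cluster. A minimal sketch of that idea, with made-up names (the real code surely differs):

#include <cstdint>
#include <string>

// Toy illustration of a uniformly-distributed, unreplicated cache:
// each normalized query maps to exactly one host, so no entry is
// stored twice and cache capacity grows with the number of hosts.
static uint64_t hashQuery ( const std::string &q ) {
        uint64_t h = 1469598103934665603ULL;          // FNV-1a offset basis
        for ( size_t i = 0 ; i < q.size() ; i++ ) {
                h ^= (unsigned char)q[i];
                h *= 1099511628211ULL;                // FNV-1a prime
        }
        return h;
}

static int32_t pickCacheHost ( const std::string &q , int32_t numHosts ) {
        // the host that both stores and serves this query's cached results
        return (int32_t)( hashQuery ( q ) % (uint64_t)numHosts );
}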
|
||||
<br>
|
||||
<br>
|
||||
I also added lots of debug and timing messages that can be turned on and off via the Gigablast admin page. This allows me to quickly isolate problems and identify bottlenecks.
|
||||
<br>
|
||||
<br>
|
||||
Gigablast now synchronizes the clocks on all machines on the network so the instant add-url should be more "instant". Before I made this change, one machine would tell another to spider a new url "now", where "now" was actually a few minutes into the future on the spider machine. But since everyone's currently synchronized, this will not be a problem anymore.
|
||||
<br>
|
||||
<br>
|
||||
There were about 100 other changes and bug fixes, minor and major, that I made, too, that should result in significant performance gains. My next big set of changes should make searches at least 5 times faster, but it will probably take several months until completed. I will keep you posted.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=downtime></a>
|
||||
<font size=+1><b>Downtime</b></font><br>
|
||||
<i>Feb 20, 2003</i><br><br>
|
||||
To combat downtime I wrote a monitoring program. It will send me a text message on my cellphone if gigablast ever stops responding to queries. This should prevent extended periods of downtime by alerting me to the problem so I can promptly fix it.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=uunet></a>
|
||||
<font size=+1><b>Connectivity Problems. Bah!</b></font><br>
|
||||
<i>Feb 14, 2003</i><br><br>
|
||||
I had to turn off the main refresh spiders a few weeks ago because of internet connectivity problems. Lots of pages were inaccessible or were timing out to the point that spider performance was suffering too much.
|
||||
<br><br>
|
||||
After running tcpdump in combination with wget I noticed that the FIN packets of some web page transfers were being lost or delayed for over a minute. The TCP FIN packet is typically the last TCP packet sent to your browser when it retrieves a web page. It tells your browser to close the connection. Once it is received the little spinning logo in the upper right corner of your browser window should stop spinning.
|
||||
<br><br>
|
||||
The most significant problem was, however, that the initial incoming data packet for some URLs was being lost or excessively delayed. You can get by without receiving FIN packets but you absolutely need these TCP "P" packets. I've tested my equipment and my ISP has tested their equipment and we have both concluded that the problem is upstream. Yesterday my ISP submitted a ticket to Worldcom/UUNet. Worldcom's techs have verified the problem and thought it was... "interesting".
|
||||
<br><br>
|
||||
I personally think it is a bug in some filtering or monitoring software installed at one of Worldcom's NAPs (Network Access Points). NAPs are where the big internet providers interface with each other. The most popular NAPs are in big cities, the Tier-1 cities, as they're called. There are also companies that host NAP sites where the big carriers like Worldcom can install their equipment. The big carriers then set up Peering Agreements with each other. Peering Agreements state the conditions under which two or more carriers will exchange internet traffic.
|
||||
<br><br>
|
||||
Once you have a peering agreement in place with another carrier then you must pay them based on how much data you transfer from your network to their network across a NAP. This means that downloading a file is much cheaper than uploading a file. When you send a request to retrieve some information, that request is small compared to the amount of data it retrieves. Therefore, the carrier that hosted the server from which you got the data will end up paying more. Doh! I got off the topic. I hope they fix the problem soon!
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=ads></a>
|
||||
<font size=+1><b>Considering Advertisements</b></font><br>
|
||||
<i>Jan 10, 2003</i><br><br>
|
||||
I'm now looking into serving text advertisements on top of the search results page so I can continue to fund my information retrieval research. I am also exploring the possibility of injecting ads into some of my xml-based search feeds. If you're interested in a search feed I should be able to give you an even better deal provided you can display the ads I feed you, in addition to any other ads you might want to add. If anyone has any good advice concerning what ad company I should use, I'd love to hear it.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=codeupdate></a>
|
||||
<font size=+1><b>Code Update</b></font><br>
|
||||
<i>Dec 27, 2002</i><br><br>
|
||||
After a brief hiatus I've restarted the Gigablast spiders. The problem was they were having a negative impact on the query engine's performance, but now, all spider processing yields computer resources much better to the query traffic. The result is that the spidering process only runs in the space between queries. This actually involved a lot of work. I had to insert code to suspend spider-related, network transactions and cancel disk-read and disk-write threads.<br><br>
|
||||
I've also launched my <a href="/gigaboost.html">Gigaboost</a> campaign. This rewards pages that link to gigablast.com with a boost in the search results rankings. The boost is only utilized to resolve ties in ranking scores so it does not taint the quality of the index.<br><br>
|
||||
Gigablast.nu, in Scandinavia, now has a news index built from news sources in the Scandinavian region. It is not publicly available just yet because there are still a few details we are working out.
|
||||
I've also added better duplicate detection and removal. It won't be very noticeable until the index refresh cycle completes.
|
||||
In addition Gigablast now removes session ids from urls, but, this only applies to new links and will be back pedaled to fix urls already in the index at a later date.
|
||||
There is also a new summary generator installed. It's over ten times faster than the old one. If you notice any problems with it please contact me. As always, I appreciate any constructive input you have to give.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=corruption></a>
|
||||
<font size=+1><b>Data Corruption Mysteries</b></font><br>
|
||||
<i>Dec 20, 2002</i><br><br>
|
||||
I've been having problems with my hard drives. I have a bunch of Maxtor 160GB drives (Model # = 4G160J8) running on Linux 2.4.17 with the <a href="/ide.2.4.17.02152002.patch.bz2">48-bit LBA patch</a>. Each machine has 4 of these drives on them, 2 on each IDE slot. I've had about 160 gigabytes of data on one before so I know the patch seems to do the job. But every now and then a drive will mess up a write. I do a lot of writing and it usually takes tens of gigabytes of writing before a drive does this. It writes out about 8 bytes that don't match what should have been written. This causes index corruption and I've had to install work-arounds in my code to detect and patch it.
|
||||
<br>
|
||||
<br>
|
||||
I'm not sure if the problem is with the hard drive itself or with Linux. I've made sure that the problem wasn't in my code by doing a read after each write to verify. I thought it might be my motherboard or CPU. I use AMDs and Giga-byte motherboards. But gigablast.nu in Sweden has the same problem and it uses a Pentium 3. Furthermore, gigablast.nu uses a RAID of 160GB Maxtors, whereas gigablast.com does not. Gigablast.nu uses version 2.4.19 of Linux with the 48-bit LBA patch. So the problem seems to be with Linux, the LBA patch or the hard drive itself.
|
||||
<br>
|
||||
<br>
|
||||
On top of all this mess, about 1 Maxtor, out of the 32 I have, completely fails on me every 4 months. The drive just gives I/O errors to the kernel and brings the whole system down. Luckily, gigablast.com implements a redundant architecture so the failing server will be replaced by his backup. So far Maxtor has replaced the drives I had fail. If you give them your credit card number they'll even send the replacements out in advance. But I believe the failure problem is an indicator that the data corruption problem is hard drive related, not Linux related. If anyone has any insight into this problem please let me know, you could quite easily be my hero.
|
||||
<br>
|
||||
<br>
|
||||
If you're still reading this you're pretty hard core so <a href="/output.html">here's</a> what /var/log/messages says when the 4G160J8 completely fails.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=pvr></a>
|
||||
<font size=+1><b>Personal Video Recorders (PVRs)</b></font><br>
|
||||
<i>Dec 20, 2002</i><br><br>
|
||||
Boy, these things are great. I bought a Tivo last year for my wife and she loved it. At first, though, she wasn't that enthusiastic because she wasn't very familiar with it. But now we rarely rent any more video tapes from Blockbuster or Hollywood video because there's always something interesting to watch on the Tivo. You just let it know what shows you like and it will record them anytime they come on. We always have an overflow of Simpsons and Seinfeld episodes on there.
|
||||
<br>
|
||||
<br>
|
||||
In the future though I don't think Tivo is going to make it. The reason? Home networking. Because I'm a professional computer person, we already have a home network installed. If the TV had an ethernet jack it would be in our network. 100Mbps is fast enough to send it a high-quality video stream from the computers already on the network. I have a cable modem which, in the future, should allow the computer using it to rip signals from the cable station, as well. For now though, you could split your cable and plug the new end into a tuner card on your PC. So once someone comes out with a small device for the television that converts an ethernet-based mpeg stream to a video signal we can use our home PC to act as the TIVO. This device should be pretty cheap, I'd imagine around $30 or so. The only thing you'd need then is a way to allow the remote control to talk to your PC.
|
||||
<br>
|
||||
<br>
|
||||
Now I read about the EFF suing "Hollywood" in order to clarify consumer rights of fair use. Specifically, the EFF was said to be representing Replay TV. Hey! Isn't Replay TV owned in part by Disney (aka Hollywood)... hmmmm... Seems like Disney might have pretty good control over the outcome of this case. I think it's a conflict of interest when such an important trial, which would set a precedent for many cases to come, has the same plaintiff as defendant.
|
||||
<br>
|
||||
<br>
|
||||
This makes me wonder about when Disney's Go.com division got sued by Overture (then known as Goto.com) for logo infringement. Disney had to pay around 20 million to Overture. I wonder what kind of ties Disney had to Overture. Ok, maybe I'm being a conspiracy theorist, so I'll stop now.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=ecs></a>
|
||||
<font size=+1><b>ECS K7S5A Motherboard Mayhem</b></font><br>
|
||||
<i>Dec 20, 2002</i><br><br>
|
||||
I pinch pennies. When I bought my 8 servers I got the cheapest motherboards I could get for my AMD 1.4GHz Athlon T-Birds. At the time, in late January 2002, they turned out to be the K7S5A's. While running my search engine on them I experienced lots of segmentation faults. I spent a couple of days poring over the code wondering if I was tripping out. It wasn't until I ran memtest86 at boot time (run by lilo) that I found memory was being corrupted. I even tried new memory sticks to no avail. Fortunately I found some pages on the web that addressed the problem. It was the motherboard. It took me many hours to replace them on all 8 servers. I don't recommend ECS. I've been very happy with the Giga-byte motherboards I have now.
|
||||
|
||||
|
||||
<br><br><br>
|
||||
<br><br><br>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -270,7 +270,7 @@ For RedHat do a <b>yum install gcc-c++ glibc-static libstdc++-static openssl-sta
|
||||
</ul>
|
||||
-->
|
||||
|
||||
<b>1.0</b> For <u>Ubuntu 12.02 or 14.04</u>: do <b>sudo apt-get update ; apt-get install make g++ libssl-dev binutils</b>
|
||||
<b>1.0</b> For <u>Ubuntu 12.02 or 14.04</u>: do <b>sudo apt-get update ; sudo apt-get install make g++ libssl-dev binutils</b>
|
||||
|
||||
<br><br>
|
||||
|
||||
|
406
html/news.html
@ -1,406 +0,0 @@
|
||||
|
||||
<div style=max-width:700px;>
|
||||
|
||||
<br>
|
||||
<br><br>
|
||||
|
||||
<a name=revival></a>
|
||||
<font size=+1><b>15 Year Anniversary</b></font><br>
|
||||
<i>September 1, 2014</i><br><br>
|
||||
It's been 15 years since I first started Gigablast. It's taken some interesting directions as of late. Most notably being open source. I've decided to revive the old blog entries that you can find below and continue working on top of those.
|
||||
|
||||
|
||||
|
||||
|
||||
<br><br><br><br>
|
||||
|
||||
|
||||
|
||||
|
||||
<a name=gigabits></a>
|
||||
<font size=+1><b>Giga Bits Introduced</b></font><br>
|
||||
<i>Jan 31, 2004</i><br><br>
|
||||
Gigablast now generates related concepts for your query. I call them Giga Bits. I believe it is the best concept generator in the industry, but if you don't think so please <a href="/contact.html">drop me a note</a> explaining why not, so I can improve it.
|
||||
<br><br>
|
||||
You can also ask Gigablast a simple question like <a href="/search?q=Who+is+President+of+Russia%3F">"Who is President of Russia?"</a> and it often comes up with the correct answer in the Giga Bits section. How do you think it does that?
|
||||
<br><br>
|
||||
In other news, the spider speed ups I rolled a few weeks ago are tremendously successful. I can easily burn all my bandwidth quota with insignificant load on my servers. I could not be happier with this.
|
||||
<br><br>
|
||||
Now I'm planning on turning Gigablast into a default AND engine. Why? Because it will decrease query latency by several times, believe it or not. That should put Gigablast on par with the fastest engines in the world, even though it only runs on 8 desktop machines. But don't worry, I will still leave the OR functionality intact.
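<br><br>
To make the latency claim concrete, here is a minimal sketch (my own illustration, not Gigablast's actual code) of why default AND is cheaper than OR: intersecting sorted docid lists never scans past the shortest list, while OR has to merge every posting of every term.
<pre>
// hypothetical illustration: AND = intersection of sorted docid lists
#include <cstdint>
#include <vector>

std::vector<int64_t> intersectDocids ( const std::vector<int64_t> &a ,
                                       const std::vector<int64_t> &b ) {
	std::vector<int64_t> out;
	size_t i = 0, j = 0;
	// advance whichever cursor is behind; only equal docids survive
	while ( i < a.size() && j < b.size() ) {
		if      ( a[i] < b[j] ) i++;
		else if ( b[j] < a[i] ) j++;
		else { out.push_back(a[i]); i++; j++; }
	}
	return out;
}
</pre>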
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=update></a>
|
||||
<font size=+1><b>January Update Rolled</b></font><br>
|
||||
<i>Jan 8, 2004</i><br><br>
|
||||
Gigablast now has a more professional, but still recognizable, logo, and a new catch phrase, "Information Acceleration". Lots of changes on the back end. You should notice significantly higher quality searches. The spider algorithm was sped up several times. Gigablast should be able to index several million documents per day, but that still remains to be tested. <knock on wood>. Site clustering was sped up. I added the ability to force all query terms to be required by using the &rat=1 cgi parm. Now Gigablast will automatically regenerate some of its databases when they are missing. And I think I wasted two weeks working like a dog on code that I'm not going to end up using! I hate when that happens...
|
||||
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=traffic></a>
|
||||
<font size=+1><b>An Easy way to Slash Motor Vehicle Emissions</b></font><br>
|
||||
<i>Dec 11, 2003</i><br><br>
|
||||
Blanket the whole city with wi-fi access (like <a href="/?redir=http://story.news.yahoo.com/news?tmpl=story&ncid=1293&e=2&u=/ap/20031211/ap_on_hi_te/wi_fi_city&sid=95573418">Cerritos, California</a>). When you want to travel from point A to point B, tell the central traffic computer. It will then give you a time window in which to begin your voyage and, most importantly, it will ensure that as long as you stay within the window you will always hit green lights.
<br><br>
If you stray from your path, you'll be able to get a new window via the wi-fi network. If everyone's car has gps and is connected to the wi-fi network, the central computer will also be able to monitor the flow of traffic and make adjustments to your itinerary in real-time. Essentially, the traffic computer will be solving a large system of linear, and possibly non-linear, constraints in real-time. Lots of fun... and think of how much more efficient travel will be!! If someone wants to secure funding, count me in.
|
||||
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=spellchecker></a>
|
||||
<font size=+1><b>Spellchecker Finally Finished</b></font><br>
|
||||
<i>Nov 18, 2003</i><br><br>
|
||||
After a large, countable number of interruptions, I've finally completed the spellchecker. I tested the word '<b>dooty</b>' on several search engines to see how they handled that misspelling. Here's what I got:
|
||||
<br><br>
|
||||
<table>
|
||||
<tr><td><b>Source</b></td><td><b>Result</b></td></tr>
|
||||
<tr><td>Alltheweb</td><td><a href="http://www.alltheweb.com/search?query=dooty">booty</a><td></tr>
|
||||
<tr><td>Altavista</td><td><a href="http://search01.altavista.com/web/results?q=dooty">dhooti</a></td></tr>
|
||||
<tr><td>Gigablast</td><td><a href="http://www.gigablast.com/search?q=dooty">door</a></td></tr>
|
||||
<tr><td>Google</td><td><a href="http://www.google.com/search?q=dooty">doody</a></td></tr>
|
||||
<tr><td>Microsoft Word</td><td>Doty</td></tr>
|
||||
<tr><td>Teoma</td><td><a href="http://s.teoma.com/search?q=dooty">doty</a></td></tr>
|
||||
<tr><td>Wisenut</td><td>N/A (no spellchecker)</td></tr>
|
||||
</table>
|
||||
<br>
|
||||
So there is no one way to code a spellchecker. It's a guessing game. And, hey Wisenut, want to license a good spellchecker for cheap? <a href="/contact.html">Let me know</a>.
|
||||
|
||||
<br><br>
|
||||
Gigablast uses its cached web pages to generate its dictionary instead of the query logs. When a word or phrase is not found in the dictionary, Gigablast replaces it with the closest match in the dictionary. If multiple words or phrases are equally close, then Gigablast resorts to a popularity ranking.
|
||||
<br><br>
|
||||
One interesting thing I noticed is that in Google's spellchecker you must at least get the first letter of the word correct; otherwise Google will not be able to recommend the correct spelling. I made Gigablast this way too, because it really cuts down on the number of words it has to search to come up with a recommendation. This also allows you to have an extremely large dictionary distributed amongst several machines, where each machine is responsible for a letter.
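<br><br>
A minimal sketch of the recommendation step as described above (the names and structures are my own, not the real Gigablast spellchecker): only dictionary words sharing the misspelling's first letter are considered, the closest match by edit distance wins, and popularity breaks ties.
<pre>
// hypothetical sketch: first-letter-constrained spelling suggestion
#include <algorithm>
#include <string>
#include <vector>

struct DictEntry { std::string word; long popularity; };

static int editDistance ( const std::string &a , const std::string &b ) {
	std::vector<int> prev(b.size()+1), cur(b.size()+1);
	for ( size_t j = 0; j <= b.size(); j++ ) prev[j] = (int)j;
	for ( size_t i = 1; i <= a.size(); i++ ) {
		cur[0] = (int)i;
		for ( size_t j = 1; j <= b.size(); j++ )
			cur[j] = std::min ( { prev[j] + 1, cur[j-1] + 1,
			                      prev[j-1] + (a[i-1] != b[j-1]) } );
		prev = cur;
	}
	return prev[b.size()];
}

std::string suggest ( const std::string &misspelled ,
                      const std::vector<DictEntry> &dict ) {
	std::string best; int bestDist = 1<<30; long bestPop = -1;
	for ( const DictEntry &e : dict ) {
		// the first-letter rule: skip words starting with a different letter
		if ( e.word.empty() || e.word[0] != misspelled[0] ) continue;
		int d = editDistance ( misspelled , e.word );
		if ( d < bestDist || ( d == bestDist && e.popularity > bestPop ) ) {
			bestDist = d; best = e.word; bestPop = e.popularity;
		}
	}
	return best; // empty string means no candidate shared the first letter
}
</pre>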
<br><br>
|
||||
Also of note: I am planning on purchasing the hardware required for achieving a 5 billion document index capable of serving hundreds of queries per second within the next 12 months. Wish me luck... and thanks for using Gigablast.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=onagain></a>
|
||||
<font size=+1><b>Spiders On Again</b></font><br>
|
||||
<i>Nov 10, 2003</i><br><br>
|
||||
After updating the spider code I've reactivated the spiders. Gigablast should be able to spider at a faster rate with even less impact on query response time than before. So add your urls now while the adding's good.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=speed></a>
|
||||
<font size=+1><b>Going For Speed</b></font><br>
|
||||
<i>Nov 3, 2003</i><br><br>
|
||||
I've finally got around to working on Gigablast's distributed caches. It was not doing a lot of caching before. The new cache class I rigged up has no memory fragmentation and minimal record overhead. It is very nice.<br><br>
|
||||
I've stopped spidering just for a bit so I can dedicate all Gigablast's RAM to the multi-level cache system I have in place now and see how much I can reduce query latency. Disks are still my main point of contention by far so the caching helps out a lot. But I could still use more memory.<br><br>
|
||||
Take Gigablast for a <a href="/">spin</a>. See how fast it is.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=metas></a>
|
||||
<font size=+1><b>Bring Me Your Meta Tags</b></font><br>
|
||||
<i>Oct 11, 2003</i><br><br>
|
||||
As of now Gigablast supports the indexing, searching and displaying of generic meta tags. You name them I fame them. For instance, if you have a tag like <i><meta name="foo" content="bar baz"></i> in your document, then you will be able to do a search like <i><a href="/search?q=foo%3Abar&dt=foo">foo:bar</a></i> or <i><a href="/search?q=foo%3A%22bar+baz%22&dt=foo">foo:"bar baz"</a></i> and Gigablast will find your document.
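<br><br>
As a rough illustration of how a field-restricted search like this can be indexed (an assumption about the general technique, not Gigablast's actual index format), each word of the meta tag can be hashed together with the tag's name so that <i>foo:bar</i> becomes its own term:
<pre>
// hypothetical sketch: field-prefixed terms for generic meta tags
#include <cstdio>
#include <functional>
#include <string>

int main ( ) {
	std::string field = "foo";
	std::string words[] = { "bar", "baz" }; // from content="bar baz"
	for ( const std::string &w : words ) {
		// one term per (field,word) pair; a query for foo:bar hashes the same way
		size_t termId = std::hash<std::string>{}( field + ":" + w );
		std::printf ( "term %s:%s -> 0x%zx\n", field.c_str(), w.c_str(), termId );
	}
	return 0;
}
</pre>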
<br><br>
|
||||
You can tell Gigablast to display the contents of arbitrary meta tags in the search results, like <a href="/search?q=gigablast&s=10&dt=author+keywords%3A32">this</a>. Note that you must assign the <i>dt</i> cgi parameter to a space-separated list of the names of the meta tags you want to display. You can limit the number of returned characters of each tag to X characters by appending a <i>:X</i> to the name of the meta tag supplied to the <i>dt</i> parameter. In the link above, I limited the displayed keywords to 32 characters.
|
||||
<br><br>
|
||||
Why use generic metas? Because it is very powerful. It allows you to embed custom data in your documents, search for it and retrieve it. Originally I wanted to do something like this in XML, but now my gut instincts are that XML is not catching on because it is ugly and bloated. Meta tags are pretty and slick.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=verisignstopped></a>
|
||||
<font size=+1><b>Verisign Stops Destroying the Internet</b></font><br>
|
||||
<i>Oct 11, 2003</i><br><br>
|
||||
Ok, they actually stopped about a week ago, but I didn't get around to posting it until now. They really ought to lose their privileged position so this does not happen again. Please do not stop your boycott. They have not learned from their mistakes.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=moreverisign></a>
|
||||
<font size=+1><b>Verisign Continues to Damage Gigablast's Index</b></font><br>
|
||||
<i>September 30, 2003</i><br><br>
|
||||
When the Gigablast spider tries to download a page from a domain it first gets the associated robots.txt file for that domain. When the domain does not exist it ends up downloading a robots.txt file from verisign. There are two major problems with this. The first is that verisign's servers may be slow which will slow down Gigablast's indexing. Secondly, and this has been happening for a while now, Gigablast will still index any incoming link text for that domain, thinking that the domain still exists, but just that spider permission was denied by the robots.txt file.
|
||||
<br>
|
||||
<br>
|
||||
So, hats off to you verisign, thanks for enhancing my index with your fantastic "service". I hope your company is around for many years so you can continue providing me with your great "services".
|
||||
<br>
|
||||
<br>
|
||||
If you have been hurt because of verisign's greed you might want to consider joining the <a href="/?redir=http://www.geek.com/news/geeknews/2003Sep/gee20030929021965.htm">class-action lawsuit</a> announced Friday, September 26th, by the <a href="/?redir=http://www.techfirm.com/">Ira Rothken law firm</a>.
|
||||
<br>
|
||||
<br>
|
||||
Want to learn more about how the internet is run? Check out <a href="/?redir=http://www.paradigm.nu/icann/">the ICANN movie page</a>. Movie #1 portrays verisign's CEO, Stratton Sclavos, quite well in my opinion.
|
||||
<br>
|
||||
<br>
|
||||
<b>(10/01/03) Update #5:</b> verisign <a href="/?redir=http://www.pcworld.com/news/article/0,aid,112712,00.asp">comes under further scrutiny</a>.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=verisign></a>
|
||||
<font size=+1><b>Verisign Redesigns the Internet for their Own Profit</b></font><br>
|
||||
<i>September 24, 2003</i><br><br>
|
||||
My spiders expect to get "not found" messages when they look up a domain that does not have an IP. When verisign uses their privileged position to change the underlying fundamentals of the internet just to line their own greedy pockets it really, really perturbs me. Now, rather than get the "not found" message, my spiders get back a valid IP, the IP of verisign's commercial servers. That causes my spiders to then proceed to download the robots.txt from that domain. This can take forever if their servers are slow. What a pain. Now I have to fix my freakin' code. And that's just one of many problems this company has caused.
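<br><br>
One common workaround (my own sketch of the general idea, not necessarily the fix Gigablast shipped) is to resolve a random hostname that cannot possibly exist under the same TLD; if it resolves anyway, the "found" answer for the real domain is a wildcard hit and should be treated as "not found":
<pre>
// hypothetical sketch: detect a wildcarded TLD before trusting a DNS answer
#include <netdb.h>
#include <sys/socket.h>
#include <cstring>
#include <string>

// return the first A record, or 0 if the name does not resolve
static in_addr_t lookupIp ( const std::string &host ) {
	struct hostent *h = gethostbyname ( host.c_str() );
	if ( ! h || h->h_addrtype != AF_INET || ! h->h_addr_list[0] ) return 0;
	in_addr_t ip; std::memcpy ( &ip , h->h_addr_list[0] , sizeof(ip) );
	return ip;
}

bool isWildcardHit ( const std::string &domain ) {
	in_addr_t real = lookupIp ( domain );
	if ( ! real ) return false; // genuinely not found
	// a gibberish sibling name under the same TLD should NOT resolve
	size_t dot = domain.find ( '.' );
	std::string tld = ( dot == std::string::npos ) ? domain : domain.substr(dot+1);
	in_addr_t bogus = lookupIp ( "no-such-host-xq7z." + tld );
	return bogus != 0 && bogus == real; // same IP => wildcard, not a real site
}
</pre>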
<br>
|
||||
<br>
|
||||
Please join me in boycott. I'm going to discourage everyone I know from supporting this abusive, monopolistic entity.
|
||||
<br>
|
||||
<br>
|
||||
<b>(9/22/03) Update #1:</b> verisign <a href="/?redir=http://www.icann.org/correspondence/lewis-to-twomey-21sep03.htm">responded</a> to ICANN's request that they stop. <a href="/?redir=http://slashdot.org/articles/03/09/22/2255202.shtml?tid=126&tid=95&tid=99">See what the slashdot community has to say about this response.</a>
|
||||
<br>
|
||||
<br>
|
||||
<b>(9/22/03) Update #2:</b> ICANN has now posted some complaints in this <a href="/?redir=http://forum.icann.org/alac-forum/redirect/">forum</a>.
|
||||
<br>
|
||||
<br>
|
||||
<b>(9/24/03) Update #3:</b> Slashdot has more <a href="/?redir=http://yro.slashdot.org/yro/03/09/24/0134256.shtml?tid=126&tid=95&tid=98&tid=99">coverage</a>.
|
||||
<br>
|
||||
<br>
|
||||
<b>(9/24/03) Update #4:</b> Please sign the <a href="/?redir=http://www.whois.sc/verisign-dns/">petition</a> to stop verisign.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=geotags></a>
|
||||
<font size=+1><b>Geo-Sensitive Search</b></font><br>
|
||||
<i>September 18, 2003</i><br><br>
|
||||
Gigablast now supports some special new meta tags that allow for constraining a search to a particular zipcode, city, state or country. Support was also added for the standard author, language and classification meta tags. This <a href="/tagsdemo.html">page</a> explains more. These meta tags should be standard, everyone should use them (but not abuse them!) and things will be easier for everybody.
|
||||
<br><br>
|
||||
Secondly, I have declared jihad against stale indexes. I am planning a significantly faster update cycle, not to mention growing the index to about 400 million pages, all hopefully in the next few months.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=turing></a>
|
||||
<font size=+1><b>Foiling the Addurl Scripts</b></font><br>
|
||||
<i>September 6, 2003</i><br><br>
|
||||
The new pseudo-Turing test on the <a href="/addurl">addurl page</a> should prevent most automated scripts from submitting boatloads of URLs. If someone actually takes the time to code a way around it then I'll just have to take it a step further. I would rather work on other things, though, so please quit abusing my free service and discontinue your scripts. Thanks.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=boolean></a>
|
||||
<font size=+1><b>Boolean is Here</b></font><br>
|
||||
<i>September 1, 2003</i><br><br>
|
||||
I just rolled out the new boolean logic code. You should be able to do nested boolean queries using the traditional AND, OR and NOT boolean operators. See the updated <a href="/help.html#boolean">help page</a> for more detail.
|
||||
<br><br>
|
||||
I have declared jihad against swapping and am now running the 2.4.21-rc6-rmap15j Linux kernel with swap tuned to zero using the /proc/sys/vm/pagecache knobs. So far no machines have swapped, which is great, but I'm unsure of this kernel's stability.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=swap></a>
|
||||
<font size=+1><b>All Swapped Out</b></font><br>
|
||||
<i>August 29, 2003</i><br><br>
|
||||
I no longer recommend turning the swap off, at least not on linux 2.4.22. A kernel panicked on me and froze a server. Not good. If anyone has any ideas for how I can prevent my app from being swapped out, please let me know. I've tried mlockall() within my app but that makes its memory usage explode for some reason. I've also tried Rik van Riel's 2.4.21-rc6-rmap15j.txt patch on the 2.4.21 kernel, but it still does unnecessary swapping (although, strangely, only when spidering). If you know how to fix this problem, please help!!! <a href="vmstat.html">Here</a> is the output from the vmstat command on one of my production machines running 2.4.22. And <a href="vmstatrik.html">here</a> is the output from my test machine running 2.4.21-rc6-rmap15j.txt.
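<br><br>
For reference, here is the call in isolation (a minimal standalone sketch, not the gb integration). A plausible explanation for the memory blow-up, though only a guess on my part, is that MCL_FUTURE forces every later allocation to be backed by real pages immediately; it also usually requires root or a raised RLIMIT_MEMLOCK.
<pre>
// minimal standalone mlockall() example
#include <sys/mman.h>
#include <cstdio>

int main ( ) {
	// MCL_CURRENT pins pages already mapped; MCL_FUTURE also pins anything
	// mapped later (heap growth, thread stacks, mmaps)
	if ( mlockall ( MCL_CURRENT | MCL_FUTURE ) != 0 ) {
		std::perror ( "mlockall" );
		return 1;
	}
	std::puts ( "all current and future pages are locked in RAM" );
	return 0;
}
</pre>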
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=kernel></a>
|
||||
<font size=+1><b>Kernel Update</b></font><br>
|
||||
<i>August 28, 2003</i><br><br>
|
||||
I updated the Linux kernel to 2.4.22, which was just released a few days ago on <a href="/?redir=http://www.kernel.org/">kernel.org</a>. Now my gigabit cards are working, yay! I finally had to turn off swap using the swapoff command. When an application runs out of memory the swapper is supposed to write infrequently used memory to disk so it can give that memory to the application that needs it. Unfortunately, the Linux virtual memory manager enjoys swapping out an application's memory for no good reason. This can often make an application disastrously slow, especially when the application ends up blocking on code that it doesn't expect to! And, furthermore, when the application uses the disk intensely it has to wait even longer for memory to get swapped back in from disk. I recommend that anyone who needs high performance turn off the swap and just make sure their program does not use more physical memory than is available.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=gang></a>
|
||||
<font size=+1><b>The Gang's All Here</b></font><br>
|
||||
<i>August 17, 2003</i><br><br>
|
||||
I decided to add PostScript (<a href="/search?q=type:ps">.ps</a>) , PowerPoint (<a href="/search?q=type:ppt">.ppt</a>), Excel SpreadSheet (<a href="/search?q=type:xls">.xls</a>) and Microsoft Word (<a href="/search?q=type:doc">.doc</a>) support in addition to the PDF support. Woo-hoo.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=pdf></a>
|
||||
<font size=+1><b>PDF Support</b></font><br>
|
||||
<i>August 14, 2003</i><br><br>
|
||||
Gigablast now indexes PDF documents. Try the search <a href="/search?q=type:pdf"><i>type:pdf</i></a> to see some PDF results. <i>type</i> is a new search field. It also supports the text type, <a href="/search?q=type:text"><i>type:text</i></a>, and will support other file types in the future.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=codeupdate3></a>
|
||||
<font size=+1><b>Minor Code Updates</b></font><br>
|
||||
<i>July 17, 2003</i><br><br>
|
||||
I've cleaned up the keyword highlight routines so they don't highlight isolated stop words. Gigablast now displays a <a href="/superRecall.html">blue bar</a> above returned search results that do not have <b>all</b> of your query terms. When returning a page of search results Gigablast lets you know how long ago that page was cached by displaying a small message at the bottom of that page. NOTE: This small message is at the bottom of the page containing the search results, not at the bottom of any pages from the web page cache, that is a different cache entirely. Numerous updates to less user-visible things on the back end. Many bugs fixed, but still more to go. Thanks a bunch to Bruce Perens for writing the <a href="/?redir=http://www.perens.com/FreeSoftware/">Electric Fence</a> debug utility.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=codeupdate2></a>
|
||||
<font size=+1><b>Gigablast 2.0</b></font><br>
|
||||
<i>June 20, 2003</i><br><br>
|
||||
I've recently released Gigablast 2.0. Right now Gigablast can do about twice as many queries per second as before. When I take care of a few more things that rate should double again.
|
||||
<br><br>
|
||||
The ranking algorithm now treats phrase weights much better. If you search for something like <i><a href="/search?q=boots+in+the+uk">boots in the uk</a></i> you won't get a bunch of results that have that exact phrase in them, but rather you will get UK sites about boots (theoretically). And when you do a search like <i><a href="/search?q=all+the+king%27s+men">all the king's men</a></i> you will get results that have that exact phrase. If you find any queries for which Gigablast is especially bad, but a competing search engine is good, please <a href="/contact.html">let me know</a>; I am very interested.
|
||||
<br><br>
|
||||
2.0 also introduced a new index format. The new index is half the size of the old one. This allows my current setup to index over 400 million pages with dual redundancy. Before it was only able to index about 300 million pages. The decreased index size also speeds up the query process since only half as much data needs to be read from disk to satisfy a query.
|
||||
<br><br>
|
||||
I've also started a full index refresh, starting with top level pages that haven't been spidered in a while. This is especially nice because a lot of pages that were indexed before all my anti-spam algorithms were 100% in place are just now getting filtered appropriately. I've manually removed over 100,000 spam pages so far, too.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=grub></a>
|
||||
<font size=+1><b>My Take on Looksmart's Grub</b></font><br>
|
||||
<i>Apr 19, 2003</i><br><br>
|
||||
There's been some press about Grub, a program from Looksmart which you install on your machine to help Looksmart spider the web. Looksmart is only using Grub to save on their bandwidth. Essentially Grub just compresses web pages before sending them to Looksmart's indexer thus reducing the bandwidth they have to pay for by a factor of 5 or so. The same thing could be accomplished through a proxy which compresses web pages. Eventually, once the HTTP mime standard for requesting compressed web pages is better supported by web servers, Grub will not be necessary.
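<br><br>
For what it's worth, the standard mechanism alluded to above already exists in HTTP/1.1: the client advertises <i>Accept-Encoding: gzip</i> and a cooperating server replies with <i>Content-Encoding: gzip</i>. A sketch of the request (just the header construction; sockets omitted, host name is a placeholder):
<pre>
// hypothetical sketch: asking a web server for a compressed page
#include <cstdio>
#include <string>

int main ( ) {
	std::string req =
		"GET /index.html HTTP/1.1\r\n"
		"Host: www.example.com\r\n"      // placeholder host
		"Accept-Encoding: gzip\r\n"      // server may compress the body
		"Connection: close\r\n"
		"\r\n";
	std::fputs ( req.c_str() , stdout ); // would be written to the TCP socket
	return 0;
}
</pre>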
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=codeupdate></a>
|
||||
<font size=+1><b>Code Update</b></font><br>
|
||||
<i>Mar 25, 2003</i><br><br>
|
||||
I just rolled some significant updates to Gigablast's back-end. Gigablast now has a uniformly-distributed, unreplicated search results cache. This means that if someone has done your search within the last several hours then you will get results back very fast. This also means that Gigablast can handle a lot more queries per second.
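<br><br>
The "uniformly-distributed, unreplicated" part boils down to hashing each normalized query to exactly one host, so every host caches a disjoint slice of the query space and nothing is stored twice. A sketch under my own naming (not the actual Gigablast code):
<pre>
// hypothetical sketch: map a query to the single host that caches it
#include <cstdint>
#include <functional>
#include <string>

int32_t pickCacheHost ( const std::string &normalizedQuery , int32_t numHosts ) {
	uint64_t h = std::hash<std::string>{}( normalizedQuery );
	return (int32_t)( h % (uint64_t)numHosts ); // same query -> same host
}
</pre>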
<br>
|
||||
<br>
|
||||
I also added lots of debug and timing messages that can be turned on and off via the Gigablast admin page. This allows me to quickly isolate problems and identify bottlenecks.
|
||||
<br>
|
||||
<br>
|
||||
Gigablast now synchronizes the clocks on all machines on the network so the instant add-url should be more "instant". Before I made this change, one machine would tell another to spider a new url "now", where "now" was actually a few minutes into the future on the spider machine. But since everyone's currently synchronized, this will not be a problem anymore.
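<br><br>
A minimal sketch of the idea (invented names, not the actual implementation): each host keeps an offset from a designated master clock and schedules spider times with the adjusted clock instead of its raw local time.
<pre>
// hypothetical sketch: schedule with a master-adjusted clock
#include <cstdint>
#include <ctime>

static int64_t s_offsetFromMaster = 0; // learned during a sync exchange

int64_t syncedNow ( ) {
	return (int64_t)std::time(NULL) + s_offsetFromMaster;
}
</pre>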
<br>
|
||||
<br>
|
||||
I also made about 100 other changes and bug fixes, minor and major, that should result in significant performance gains. My next big set of changes should make searches at least 5 times faster, but it will probably take several months to complete. I will keep you posted.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=downtime></a>
|
||||
<font size=+1><b>Downtime</b></font><br>
|
||||
<i>Feb 20, 2003</i><br><br>
|
||||
To combat downtime I wrote a monitoring program. It will send me a text message on my cellphone if gigablast ever stops responding to queries. This should prevent extended periods of downtime by alerting me to the problem so I can promptly fix it.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=uunet></a>
|
||||
<font size=+1><b>Connectivity Problems. Bah!</b></font><br>
|
||||
<i>Feb 14, 2003</i><br><br>
|
||||
I had to turn off the main refresh spiders a few weeks ago because of internet connectivity problems. Lots of pages were inaccessible or were timing out to the point that spider performance was suffering too much.
|
||||
<br><br>
|
||||
After running tcpdump in combination with wget I noticed that the FIN packets of some web page transfers were being lost or delayed for over a minute. The TCP FIN packet is typically the last TCP packet sent to your browser when it retrieves a web page. It tells your browser to close the connection. Once it is received the little spinning logo in the upper right corner of your browser window should stop spinning.
|
||||
<br><br>
|
||||
The most significant problem was, however, that the initial incoming data packet for some URLs was being lost or excessively delayed. You can get by without receiving FIN packets but you absolutely need these TCP "P" (PSH) packets. I've tested my equipment and my ISP has tested their equipment and we have both concluded that the problem is upstream. Yesterday my ISP submitted a ticket to Worldcom/UUNet. Worldcom's techs have verified the problem and thought it was... "interesting".
|
||||
<br><br>
|
||||
I personally think it is a bug in some filtering or monitoring software installed at one of Worldcom's NAPs (Network Access Points). NAPs are where the big internet providers interface with each other. The most popular NAPs are in big cities, the Tier-1 cities, as they're called. There are also companies that host NAP sites where the big carriers like Worldcom can install their equipment. The big carriers then set up Peering Agreements with each other. Peering Agreements state the conditions under which two or more carriers will exchange internet traffic.
|
||||
<br><br>
|
||||
Once you have a peering agreement in place with another carrier then you must pay them based on how much data you transfer from your network to their network across a NAP. This means that downloading a file is much cheaper than uploading a file. When you send a request to retrieve some information, that request is small compared to the amount of data it retrieves. Therefore, the carrier that hosted the server from which you got the data will end up paying more. Doh! I got off the topic. I hope they fix the problem soon!
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=ads></a>
|
||||
<font size=+1><b>Considering Advertisements</b></font><br>
|
||||
<i>Jan 10, 2003</i><br><br>
|
||||
I'm now looking into serving text advertisements on top of the search results page so I can continue to fund my information retrieval research. I am also exploring the possibility of injecting ads into some of my xml-based search feeds. If you're interested in a search feed I should be able to give you an even better deal provided you can display the ads I feed you, in addition to any other ads you might want to add. If anyone has any good advice concerning what ad company I should use, I'd love to hear it.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=codeupdate></a>
|
||||
<font size=+1><b>Code Update</b></font><br>
|
||||
<i>Dec 27, 2002</i><br><br>
|
||||
After a brief hiatus I've restarted the Gigablast spiders. The problem was they were having a negative impact on the query engine's performance, but now, all spider processing yields computer resources much better to the query traffic. The result is that the spidering process only runs in the space between queries. This actually involved a lot of work. I had to insert code to suspend spider-related, network transactions and cancel disk-read and disk-write threads.<br><br>
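<br><br>
The "only runs in the space between queries" behavior can be pictured as a simple gate checked before each unit of spider work (the names here are invented for illustration; the real change also suspends network transactions and cancels disk threads as described):
<pre>
// hypothetical sketch: spider work yields to in-flight query traffic
#include <atomic>

static std::atomic<int> s_activeQueries(0); // ++ when a query starts, -- when done

bool spiderMayProceed ( ) {
	// back off completely while any query is being served
	return s_activeQueries.load() == 0;
}
</pre>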
I've also launched my <a href="/gigaboost.html">Gigaboost</a> campaign. This rewards pages that link to gigablast.com with a boost in the search results rankings. The boost is only utilized to resolve ties in ranking scores so it does not taint the quality of the index.<br><br>
|
||||
Gigablast.nu, in Scandinavia, now has a news index built from news sources in the Scandinavian region. It is not publicly available just yet because there are still a few details we are working out.
|
||||
I've also added better duplicate detection and removal. It won't be very noticeable until the index refresh cycle completes.
|
||||
In addition, Gigablast now removes session ids from urls, but this only applies to new links; urls already in the index will be fixed at a later date.
|
||||
There is also a new summary generator installed. It's over ten times faster than the old one. If you notice any problems with it please contact me. As always, I appreciate any constructive input you have to give.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
|
||||
<a name=corruption></a>
|
||||
<font size=+1><b>Data Corruption Mysteries</b></font><br>
|
||||
<i>Dec 20, 2002</i><br><br>
|
||||
I've been having problems with my hard drives. I have a bunch of Maxtor 160GB drives (Model # = 4G160J8) running on Linux 2.4.17 with the <a href="/ide.2.4.17.02152002.patch.bz2">48-bit LBA patch</a>. Each machine has 4 of these drives on them, 2 on each IDE slot. I've had about 160 gigabytes of data on one before so I know the patch seems to do the job. But every now and then a drive will mess up a write. I do a lot of writing and it usually takes tens of gigabytes of writing before a drive does this. It writes out about 8 bytes that don't match what should have been written. This causes index corruption and I've had to install work-arounds in my code to detect and patch it.
|
||||
<br>
|
||||
<br>
|
||||
I'm not sure if the problem is with the hard drive itself or with Linux. I've made sure that the problem wasn't in my code by doing a read after each write to verify. I thought it might be my motherboard or CPU. I use AMDs and Giga-byte motherboards. But gigablast.nu in Sweden has the same problem and it uses a Pentium 3. Furthermore, gigablast.nu uses a RAID of 160GB Maxtors, whereas gigablast.com does not. Gigablast.nu uses version 2.4.19 of Linux with the 48-bit LBA patch. So the problem seems to be with Linux, the LBA patch or the hard drive itself.
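<br><br>
The read-after-write check mentioned above looks roughly like this (my own sketch, not the actual work-around code): write the block, read it straight back at the same offset, and flag the write as corrupt if the bytes differ. Note that without O_DIRECT this may simply re-read the page cache, so it mainly rules out bugs in the writing code rather than the drive itself.
<pre>
// hypothetical sketch: verified write used while chasing silent corruption
#include <unistd.h>
#include <cstring>
#include <vector>

bool writeVerified ( int fd , const void *buf , size_t len , off_t off ) {
	if ( pwrite ( fd , buf , len , off ) != (ssize_t)len ) return false;
	std::vector<char> check ( len );
	if ( pread ( fd , check.data() , len , off ) != (ssize_t)len ) return false;
	// a mismatch would look like the ~8-byte corruption described above
	return std::memcmp ( buf , check.data() , len ) == 0;
}
</pre>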
<br>
|
||||
<br>
|
||||
On top of all this mess, about 1 Maxtor, out of the 32 I have, completely fails on me every 4 months. The drive just gives I/O errors to the kernel and brings the whole system down. Luckily, gigablast.com implements a redundant architecture so the failing server will be replaced by his backup. So far Maxtor has replaced the drives I had fail. If you give them your credit card number they'll even send the replacements out in advance. But I believe the failure problem is an indicator that the data corruption problem is hard drive related, not Linux related. If anyone has any insight into this problem please let me know, you could quite easily be my hero.
|
||||
<br>
|
||||
<br>
|
||||
If you're still reading this you're pretty hard core so <a href="/output.html">here's</a> what /var/log/messages says when the 4G160J8 completely fails.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=pvr></a>
|
||||
<font size=+1><b>Personal Video Recorders (PVRs)</b></font><br>
|
||||
<i>Dec 20, 2002</i><br><br>
|
||||
Boy, these things are great. I bought a Tivo last year for my wife and she loved it. At first though she wasn't that enthusiastic because she wasn't very familiar with it. But now we rarely rent any more video tapes from Blockbuster or Hollywood video because there's always something interesting to watch on the Tivo. You just let it know what shows you like and it will record them anytime they come on. We always have an overflow of Simpsons and Seinfeld episodes on there.
|
||||
<br>
|
||||
<br>
|
||||
In the future though I don't think Tivo is going to make it. The reason? Home networking. Because I'm a professional computer person, we already have a home network installed. If the TV had an ethernet jack it would be in our network. 100Mbps is fast enough to send it a high-quality video stream from the computers already on the network. I have a cable modem which, in the future, should allow the computer using it to rip signals from the cable station, as well. For now though, you could split your cable and plug the new end into a tuner card on your PC. So once someone comes out with a small device for the television that converts an ethernet-based mpeg stream to a video signal we can use our home PC to act as the TIVO. This device should be pretty cheap, I'd imagine around $30 or so. The only thing you'd need then is a way to allow the remote control to talk to your PC.
|
||||
<br>
|
||||
<br>
|
||||
Now I read about the EFF suing "Hollywood" in order to clarify consumer rights of fair use. Specifically, the EFF was said to be representing Replay TV. Hey! Isn't Replay TV owned in part by Disney (aka Hollywood)... hmmmm... Seems like Disney might have pretty good control over the outcome of this case. I think it's a conflict of interest when such an important trial, which would set a precedent for many cases to come, has the same plaintiff as defendant.
|
||||
<br>
|
||||
<br>
|
||||
This makes me wonder about when Disney's Go.com division got sued by Overture (then known as Goto.com) for logo infringement. Disney had to pay around 20 million to Overture. I wonder what kind of ties Disney had to Overture. Ok, maybe I'm being a conspiracy theorist, so I'll stop now.
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<a name=ecs></a>
|
||||
<font size=+1><b>ECS K7S5A Motherboard Mayhem</b></font><br>
|
||||
<i>Dec 20, 2002</i><br><br>
|
||||
I pinch pennies. When I bought my 8 servers I got the cheapest motherboards I could get for my AMD 1.4GHz Athlon T-Birds. At the time, in late January 2002, they turned out to be the K7S5A's. While running my search engine on them I experienced lots of segmentation faults. I spent a couple of days poring over the code wondering if I was tripping out. It wasn't until I ran memtest86 at boot time (run by lilo) that I found memory was being corrupted. I even tried new memory sticks to no avail. Fortunately I found some pages on the web that addressed the problem. It was the motherboard. It took me many hours to replace them on all 8 servers. I don't recommend ECS. I've been very happy with the Giga-byte motherboards I have now.
<br><br><br>
<br><br><br>
</div>
BIN html/ss_filters.png (new binary file, 216 KiB; not shown)
BIN html/ss_filters_thumb.png (new binary file, 14 KiB; not shown)
BIN html/ss_hosts.png (new binary file, 214 KiB; not shown)
BIN html/ss_hosts_thumb.png (new binary file, 19 KiB; not shown)
BIN html/ss_settings.png (new binary file, 196 KiB; not shown)
BIN html/ss_settings_thumb.png (new binary file, 11 KiB; not shown)
24 main.cpp
@ -195,6 +195,8 @@ void dumpLinkdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeT
|
||||
void exitWrapper ( void *state ) { exit(0); };
|
||||
|
||||
bool g_recoveryMode = false;
|
||||
|
||||
int32_t g_recoveryLevel = 0;
|
||||
|
||||
bool isRecoveryFutile ( ) ;
|
||||
|
||||
@ -1116,8 +1118,15 @@ int main2 ( int argc , char *argv[] ) {
|
||||
//send an email on startup for -r, like if we are recovering from an
|
||||
//unclean shutdown.
|
||||
g_recoveryMode = false;
|
||||
if ( strcmp ( cmd , "-r" ) == 0 ) g_recoveryMode = true;
|
||||
if ( strcmp ( cmd2 , "-r" ) == 0 ) g_recoveryMode = true;
|
||||
char *cc = NULL;
|
||||
if ( strncmp ( cmd , "-r" ,2 ) == 0 ) cc = cmd;
|
||||
if ( strncmp ( cmd2 , "-r",2 ) == 0 ) cc = cmd2;
|
||||
if ( cc ) {
|
||||
g_recoveryMode = true;
|
||||
g_recoveryLevel = 1;
|
||||
if ( cc[2] ) g_recoveryLevel = atoi(cc+2);
|
||||
if ( g_recoveryLevel < 0 ) g_recoveryLevel = 0;
|
||||
}
|
||||
|
||||
// run as daemon? then we have to fork
|
||||
if ( strcmp ( cmd , "-d" ) == 0 ) g_conf.m_runAsDaemon = true;
|
||||
@ -3092,6 +3101,10 @@ int main2 ( int argc , char *argv[] ) {
|
||||
pid_t pid, sid;
|
||||
pid = fork();
|
||||
if ( pid < 0 ) exit(EXIT_FAILURE);
|
||||
// seems like we core unless parent sets this to NULL.
|
||||
// it does not affect the child.
|
||||
//if ( pid > 0 ) g_hostdb.m_myHost = NULL;
|
||||
// child gets a 0, parent gets the child's pid, so exit
|
||||
if ( pid > 0 ) exit(EXIT_SUCCESS);
|
||||
// change file mode mask
|
||||
umask(0);
|
||||
@ -4103,6 +4116,9 @@ bool doCmd ( const char *cmd , int32_t hostId , char *filename ,
|
||||
// doCmdAll()'s call to convertHttpRequestToParmList
|
||||
sock.m_ip = atoip("127.0.0.1");
|
||||
s_r.set ( s_buffer , gbstrlen ( s_buffer ) , &sock );
|
||||
// do not do sig alarms! for now just set this to null so
|
||||
// the sigalarmhandler doesn't core
|
||||
//g_hostdb.m_myHost = NULL;
|
||||
// run the loop
|
||||
if ( ! g_loop.runLoop() )
|
||||
return log("INJECT: loop run failed.");
|
||||
@ -5170,6 +5186,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
|
||||
"export MALLOC_CHECK_=0;"
|
||||
"cp -f gb gb.oldsave ; "
|
||||
"ADDARGS='' "
|
||||
"INC=1 "
|
||||
"EXITSTATUS=1 ; "
|
||||
"while [ \\$EXITSTATUS != 0 ]; do "
|
||||
"{ "
|
||||
@ -5191,7 +5208,8 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
|
||||
" ;"
|
||||
|
||||
"EXITSTATUS=\\$? ; "
|
||||
"ADDARGS='-r' ; "
|
||||
"ADDARGS='-r'\\$INC ; "
|
||||
"INC=\\$((INC+1));"
|
||||
"} "
|
||||
"done >& /dev/null & \" %s",
|
||||
//"\" %s",
|
||||
|
@ -15,6 +15,10 @@ en|nos
|
||||
en|at
|
||||
en|ats
|
||||
|
||||
# it's not good to do love <-> like, as wiktionary does. override it here.
|
||||
en|love,loved,loving,loves
|
||||
en|like,likes,liked,liking
|
||||
|
||||
|
||||
en|sunday,sundays,sun
|
||||
en|mon,monday,mondays
|
||||
@ -179,13 +183,7 @@ en|porno,pornographic,pornography
|
||||
en|tech,technology,technologies,technological
|
||||
en|car,auto,automobile,cars,autos,automobiles
|
||||
en|electric,electrical
|
||||
en|dinner,dining,dined,dines,food
|
||||
en|lunch,food
|
||||
en|breakfast,food
|
||||
en|restaurant,restaurants,restarant,restarants,diner,cafe,cafes,caf\xc3\xa9,caf\xc3\xa9s,caf\xc3\xa8,caf\xc3\xa8s
|
||||
en|breakfast,food
|
||||
en|lunch,food
|
||||
en|dine,dines,dined,dining,eat,eats,eating,ate
|
||||
|
||||
en|website,web site,websites,web sites,webpage,web page,webpages,web pages,homepage,home page,homepages,home pages,site,sites
|
||||
|
||||
@ -208,17 +206,12 @@ en|calendar,schedule,calendars,schedules
|
||||
# added tv
|
||||
en|tv,television,televisions,tvs
|
||||
|
||||
en|food,foods,nourishment,nourishments
|
||||
|
||||
# flower shop san francisco should match 'flower store'
|
||||
en|shop,shops,shopping,store,stores
|
||||
|
||||
# trigram
|
||||
en|nfl,national football league,football leauge
|
||||
|
||||
en|celebration,celebrations,bash,bashes,party,parties,partying
|
||||
en|rave,raves,party,parties,partying
|
||||
|
||||
en|rock groups,rock group,band,bands
|
||||
en|music groups,music group,band,bands
|
||||
|
||||
|
73 qa.cpp
@ -1,6 +1,7 @@
|
||||
#include <string.h>
|
||||
#include "SafeBuf.h"
|
||||
#include "HttpServer.h"
|
||||
#include "Posdb.h"
|
||||
|
||||
TcpSocket *g_qaSock = NULL;
|
||||
SafeBuf g_qaOutput;
|
||||
@ -183,6 +184,7 @@ void processReply ( char *reply , int32_t replyLen ) {
|
||||
markOut ( content , "spider is done (");
|
||||
markOut ( content , "spider is paused (");
|
||||
markOut ( content , "spider queue empty (");
|
||||
markOut ( content , "spider is active (");
|
||||
|
||||
markOut ( content , "<totalShards>");
|
||||
|
||||
@ -200,6 +202,23 @@ void processReply ( char *reply , int32_t replyLen ) {
|
||||
markOut ( content , "\"responseTimeMS\":");
|
||||
markOut ( content , "\"docsInCollection\":");
|
||||
|
||||
// if the results are in json, then status doc is encoded json
|
||||
markOut ( content , "\\\"gbssDownloadStartTime\\\":");
|
||||
markOut ( content , "\\\"gbssDownloadEndTime\\\":");
|
||||
markOut ( content , "\\\"gbssDownloadStartTimeMS\\\":");
|
||||
markOut ( content , "\\\"gbssDownloadEndTimeMS\\\":");
|
||||
markOut ( content , "\\\"gbssDownloadDurationMS\\\":");
|
||||
markOut ( content , "\\\"gbssAgeInIndex\\\":");
|
||||
|
||||
// if the results are in xml, then the status doc is xml encoded
|
||||
markOut ( content , "\"gbssDownloadStartTime\":");
|
||||
markOut ( content , "\"gbssDownloadEndTime\":");
|
||||
markOut ( content , "\"gbssDownloadStartTimeMS\":");
|
||||
markOut ( content , "\"gbssDownloadEndTimeMS\":");
|
||||
markOut ( content , "\"gbssDownloadDurationMS\":");
|
||||
markOut ( content , "\"gbssAgeInIndex\":");
|
||||
|
||||
|
||||
// for xml
|
||||
markOut ( content , "<currentTimeUTC>" );
|
||||
markOut ( content , "<responseTimeMS>");
|
||||
@ -776,6 +795,16 @@ bool qainject1 ( ) {
|
||||
log("qa: failed qa test of posdb0001.dat. "
|
||||
"has %i bytes of positive keys! coring.",
|
||||
(int)list.m_listSize);
|
||||
char rec [ 64];
|
||||
for ( list.getCurrentKey ( rec ) ;
|
||||
! list.isExhausted() ;
|
||||
list.skipCurrentRecord() ) {
|
||||
// parse it up
|
||||
int64_t tid = g_posdb.getTermId ( rec );
|
||||
int64_t d = g_posdb.getDocId ( rec ) ;
|
||||
log("qa: termid=%"INT64" docid=%"INT64,
|
||||
tid,d);
|
||||
}
|
||||
//char *xx=NULL;*xx=0;
|
||||
exit(0);
|
||||
}
|
||||
@ -980,7 +1009,8 @@ bool qainject2 ( ) {
|
||||
if ( ! s_flags[33] ) {
|
||||
s_flags[33] = true;
|
||||
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q="
|
||||
"url2%3Axyz.com%2F-13737921970569011262&xml=1"
|
||||
"gbssUrl%3Axyz.com%2F-13737921970569011262&"
|
||||
"xml=1"
|
||||
,-1405546537 ) )
|
||||
return false;
|
||||
}
|
||||
@ -1164,17 +1194,17 @@ bool qaSyntax ( ) {
|
||||
"gbpermalink:1",
|
||||
"gbdocid:123456",
|
||||
|
||||
"gbstatus:0",
|
||||
"gbstatusmsg:tcp",
|
||||
"url2:www.abc.com/page.html",
|
||||
"site2:mysite.com",
|
||||
"ip2:1.2.3.4",
|
||||
"inurl2:dog",
|
||||
"gbpathdepth2:2",
|
||||
"gbhopcount2:3",
|
||||
"gbhasfilename2:1",
|
||||
"gbiscgi2:1",
|
||||
"gbhasext2:1",
|
||||
"gbssStatusCode:0",
|
||||
"gbssStatusmsg:tcp",
|
||||
"gbssUrl:www.abc.com/page.html",
|
||||
"gbssDomain:mysite.com",
|
||||
"gbssIp:1.2.3.4",
|
||||
"gbssUrl:dog",
|
||||
//"gbpathdepth:2",
|
||||
"gbssHopcount:3",
|
||||
//"gbhasfilename2:1",
|
||||
//"gbiscgi2:1",
|
||||
//"gbhasext2:1",
|
||||
|
||||
"cat AND dog",
|
||||
"cat OR dog",
|
||||
@ -1553,6 +1583,7 @@ bool qareindex() {
|
||||
"ufp=custom&"
|
||||
// zero spiders if not isreindex
|
||||
"fe1=default&hspl1=0&hspl1=1&fsf1=1.000000&"
|
||||
"fdu1=0&"
|
||||
"mspr1=0&mspi1=0&xg1=1000&fsp1=45&"
|
||||
);
|
||||
if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
|
||||
@ -1776,15 +1807,15 @@ bool qaspider1 ( ) {
|
||||
// make it the custom filter
|
||||
"ufp=custom&"
|
||||
|
||||
"fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"
|
||||
"fdu=0&fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"
|
||||
|
||||
// take out hopcount for now, just test quotas
|
||||
// "fe1=tag%%3Ashallow+%%26%%26+hopcount%%3C%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=3&"
|
||||
|
||||
// just one spider out allowed for consistency
|
||||
"fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
|
||||
"fdu1=0&fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
|
||||
|
||||
"fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"
|
||||
"fdu2=0&fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"
|
||||
|
||||
);
|
||||
if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
|
||||
@ -1935,8 +1966,8 @@ bool qaspider1 ( ) {
|
||||
if ( ! s_flags[17] ) {
|
||||
s_flags[17] = true;
|
||||
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
|
||||
"q=site2%3Awww.walmart.com+"
|
||||
"gbsortby%3Agbspiderdate",
|
||||
"q=gbssSubdomain%3Awww.walmart.com+"
|
||||
"gbsortbyint%3AgbssDownloadStartTime",
|
||||
999 ) )
|
||||
return false;
|
||||
}
|
||||
@ -2039,7 +2070,7 @@ bool qaspider2 ( ) {
|
||||
// make it the custom filter
|
||||
"ufp=custom&"
|
||||
|
||||
"fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"
|
||||
"fdu=0&fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"
|
||||
|
||||
// take out hopcount for now, just test quotas
|
||||
// "fe1=tag%%3Ashallow+%%26%%26+hopcount%%3C%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=3&"
|
||||
@ -2047,9 +2078,9 @@ bool qaspider2 ( ) {
|
||||
// sitepages is a little fuzzy so take it
|
||||
// out for this test and use hopcount!!!
|
||||
//"fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
|
||||
"fe1=tag%%3Ashallow+%%26%%26+hopcount<%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
|
||||
"fdu1=0&fe1=tag%%3Ashallow+%%26%%26+hopcount<%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
|
||||
|
||||
"fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"
|
||||
"fdu2=0&fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"
|
||||
|
||||
);
|
||||
if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
|
||||
@ -2450,7 +2481,7 @@ bool qajson ( ) {
|
||||
if ( ! s_flags[12] ) {
|
||||
s_flags[12] = true;
|
||||
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&"
|
||||
"q=inurl2%3Aquirksmode.org%2Fm%2F",
|
||||
"q=gbssUrl%3Aquirksmode.org%2Fm%2F",
|
||||
-1310551262 ) )
|
||||
return false;
|
||||
}
|
||||