Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot

2013-10-24 17:59:22 -07:00 · 2013-10-24 17:59:22 -07:00 · 129937168d
commit 129937168d
parent 896fbc2570 242873b272
8 changed files with 227 additions and 133 deletions
--- a/Highlight.cpp
+++ b/Highlight.cpp
@ -73,8 +73,9 @@ char s_termList[1024];
 // . content must be NULL terminated
 // . if "useAnchors" is true we do click and scroll
 // . if "isQueryTerms" is true, we do typical anchors in a special way
-long Highlight::set ( char        *buf          ,
-		      long         bufLen       ,
+long Highlight::set ( SafeBuf *sb,
+		      //char        *buf          ,
+		      //long         bufLen       ,
 		      char        *content      ,
 		      long         contentLen   ,
 		      // primary language of the document (for synonyms)
@ -119,8 +120,9 @@ long Highlight::set ( char        *buf          ,
 	// store
 	m_numMatches = matches.getNumMatches();

-	return set ( buf         ,
-		     bufLen      , 
+	return set ( sb , 
+		     //buf         ,
+		     //bufLen      , 
 		     &words      ,
 		     &matches    ,
 		     doStemming  ,
@ -133,8 +135,9 @@ long Highlight::set ( char        *buf          ,
 }

 // New version
-long Highlight::set ( char        *buf        ,
-		      long         bufLen     ,
+long Highlight::set ( SafeBuf *sb ,
+		      //char        *buf        ,
+		      //long         bufLen     ,
 		      Words       *words      ,
 		      Matches     *matches    ,
 		      bool         doStemming ,
@ -162,18 +165,20 @@ long Highlight::set ( char        *buf        ,
 	if ( m_frontTag ) m_frontTagLen = gbstrlen ( frontTag );
 	if ( m_backTag  ) m_backTagLen  = gbstrlen ( backTag  );
 	// point to buffer to store highlighted text into
-	m_buf    = buf;
-	m_bufLen = bufLen;
-	m_bufPtr = buf;
+	//m_buf    = buf;
+	//m_bufLen = bufLen;
+	//m_bufPtr = buf;
+	m_sb = sb;
 	// save room for terminating \0
-	m_bufEnd = m_buf + m_bufLen - 1;
+	//m_bufEnd = m_buf + m_bufLen - 1;

 	if ( ! highlightWords ( words, matches, q ) ) return 0;

 	// null terminate
-	*m_bufPtr = '\0';
+	//*m_bufPtr = '\0';
+	m_sb->nullTerm();
 	// return the length
-	return m_bufPtr - m_buf;
+	return m_sb->length();//m_bufPtr - m_buf;
 }

 bool Highlight::highlightWords ( Words *words , Matches *m, Query *q ) {
@ -228,6 +233,7 @@ bool Highlight::highlightWords ( Words *words , Matches *m, Query *q ) {
 		endHead = false;
 		endHtml = false;
 		// bail now if out of room
+		/*
 		if ( m_bufPtr + MAX_URL_LEN + 1024 + wlen >= m_bufEnd ) {
 			// don't spam the logs
 			static long long s_lastTime = 0;
@ -238,6 +244,7 @@ bool Highlight::highlightWords ( Words *words , Matches *m, Query *q ) {
 			s_lastTime = now;
 			return true;
 		}
+		*/
 		if ( (words->getTagId(i) ) == TAG_TITLE ) { //<TITLE>
 			if ( words->isBackTag(i) ) inTitle = false;
 			else inTitle = true;
@ -282,8 +289,9 @@ bool Highlight::highlightWords ( Words *words , Matches *m, Query *q ) {
 				//else frontTag = s_frontTags [ p[i] % 10];
 				else frontTag =s_frontTags[mat->m_colorNum%10];
 				// OK...this is UTF-8 output, and ASCII Text
-				strcpy ( m_bufPtr , frontTag );
-				m_bufPtr += frontTagLen;
+				//strcpy ( m_bufPtr , frontTag );
+				//m_bufPtr += frontTagLen;
+				m_sb->safeStrcpy ( (char *)frontTag );
 				//log(LOG_DEBUG, 
 				//    "Highlight: starting phrase %d at word %d\n",
 				//    p[i], i);
@ -296,8 +304,9 @@ bool Highlight::highlightWords ( Words *words , Matches *m, Query *q ) {
 		else if ( endHead ) {
 			// include the tags style sheet immediately before
 			// the closing </TITLE> tag
-			memcpy( m_bufPtr, s_styleSheet, s_styleSheetLen );
-			m_bufPtr += s_styleSheetLen;
+			//memcpy( m_bufPtr, s_styleSheet, s_styleSheetLen );
+			m_sb->safeMemcpy( s_styleSheet , s_styleSheetLen );
+			//m_bufPtr += s_styleSheetLen;
 		}
 		//else if ( endHtml ) {
 		//	;
@ -326,14 +335,16 @@ bool Highlight::highlightWords ( Words *words , Matches *m, Query *q ) {
 		// write the alnum word
 		//m_bufPtr +=latin1ToUtf8(m_bufPtr, m_bufEnd-m_bufPtr,w, wlen);
 		// everything is utf8 now
-		memcpy ( m_bufPtr, w , wlen );
-		m_bufPtr += wlen;
+		//memcpy ( m_bufPtr, w , wlen );
+		//m_bufPtr += wlen;
+		m_sb->safeMemcpy ( w , wlen );

 		// back tag
 		if ( i == backTagi-1 ) {
 			// store the back tag
-			memcpy ( m_bufPtr , backTag , backTagLen );
-			m_bufPtr += backTagLen ;
+			//memcpy ( m_bufPtr , backTag , backTagLen );
+			//m_bufPtr += backTagLen ;
+			m_sb->safeMemcpy ( (char *)backTag , backTagLen );
 			//log(LOG_DEBUG, 
 			//    "Highlight: ending phrase %d at word %d\n",
 			//    p[i], i);
--- a/Highlight.h
+++ b/Highlight.h
@ -19,8 +19,9 @@ class Highlight {
 	// . we highlight Query "q" in "xml" as best as we can
 	// . store highlighted text into "buf"
 	// . return length stored into "buf"
-	long set ( char        *buf          ,
-		   long         bufLen       ,
+	long set ( //char        *buf          ,
+		   //long         bufLen       ,
+		  SafeBuf *sb,
 		   char        *content      ,
 		   long         contentLen   , 
 		   char         docLangId    ,
@ -33,8 +34,9 @@ class Highlight {
 		   long         fieldCode    , // = 0     ,
 		   long         niceness    ) ;
 	
-	long set ( char        *buf        ,
-		   long         bufLen     ,
+	long set ( //char        *buf        ,
+		  //long         bufLen     ,
+		  SafeBuf *sb ,
 		   Words       *words      ,
 		   Matches     *matches    ,
 		   bool         doStemming ,
@ -52,10 +54,11 @@ class Highlight {
 	bool highlightWords ( Words *words , Matches *m , Query *q=NULL );

 	// null terminate and store the highlighted content in m_buf
-	char    *m_buf ;
-	long     m_bufLen;
-	char    *m_bufPtr;
-	char    *m_bufEnd;
+	//char    *m_buf ;
+	//long     m_bufLen;
+	//char    *m_bufPtr;
+	//char    *m_bufEnd;
+	class SafeBuf *m_sb;

 	//Words    m_words;
 	Matches  m_matches;
--- a/Hostdb.cpp
+++ b/Hostdb.cpp
@ -194,6 +194,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
 			// skip known directives
 			if ( ! strncmp(p,"port-offset:",12) ||
 			     ! strncmp(p,"index-splits:",13) ||
+			     ! strncmp(p,"num-mirrors:",12) ||
 			     ! strncmp(p,"working-dir:",12) )
 				p = p;
 			// check if this is a spare host
@ -243,13 +244,14 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
 	if ( ! m_hosts ) return log(
 				    "conf: Memory allocation failed.");

-	unsigned long maxShard = 0;
+	//unsigned long maxShard = 0;
+	long numGrunts = 0;

 	// now fill up m_hosts
 	p = m_buf;
 	i = 0;
 	long line = 1;
-	unsigned long lastShard = 0;
+	//unsigned long lastShard = 0;
 	long proxyNum = 0;

 	// assume defaults
@ -257,6 +259,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
 	long indexSplits = 0;
 	char *wdir2 = NULL;
 	long  wdirlen2 = 0;
+	long numMirrors = -1;

 	for ( ; *p ; p++ , line++ ) {
 		if ( is_wspace_a (*p) ) continue;
@ -273,6 +276,15 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
 			continue; 
 		}

+		if ( ! strncmp(p,"num-mirrors:",12) ) {
+			p += 12;
+			// skip spaces after the colon
+			while (  is_wspace_a(*p) ) p++;			
+			numMirrors = atol(p);
+			while ( *p && *p != '\n' ) p++; 
+			continue; 
+		}
+
 		// does the line say "working-dir: xxxx" ?
 		if ( ! strncmp(p,"working-dir:",12) ) {
 			p += 12;
@ -351,13 +363,6 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
 		// skip numeric hostid or "proxy" keyword
 		while ( ! is_wspace_a(*p) ) p++;

-		if ( indexSplits == 0 ) {
-			g_errno = EBADENGINEER;
-			log("admin: need index-splits: xxx directive "
-			    "in hosts.conf");
-			return false;
-		}
-
 		// read in switch id
 		//h->m_switchId = atoi(p);

@ -590,7 +595,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
 		// our group is based on our split!
 		//h->m_group = i % g_hostdb.m_indexSplits; // # grps
 		//h->m_group = i % indexSplits; // # grps
-		h->m_shardNum = i % indexSplits;
+		//h->m_shardNum = i % indexSplits;
 		// i guess proxy and spares don't count
 		if ( h->m_type != HT_GRUNT ) h->m_shardNum = 0;
 		
@ -665,9 +670,12 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
 		h->m_externalHttpsPort = h->m_httpsPort;

 		// get max group number
-		if ( h->m_shardNum > maxShard && h->m_type==HT_GRUNT )
-			maxShard = h->m_shardNum;
+		//if ( h->m_shardNum > maxShard && h->m_type==HT_GRUNT )
+		//	maxShard = h->m_shardNum;
+		if ( h->m_type == HT_GRUNT )
+			numGrunts++;

+		/*
 		if ( h->m_shardNum <= lastShard && h->m_shardNum != 0 
 		     && !(h->m_type&(HT_ALL_PROXIES)) ) {
 		      g_errno = EBADENGINEER;
@ -678,6 +686,7 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
 				 filename,line);
 		}
 		lastShard = h->m_shardNum;
+		*/

 		// skip line now
 		while ( *p && *p != '\n' )
@ -742,10 +751,46 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
 	//m_numHosts = i;
 	m_numTotalHosts = i;
 	// how many shards are we configured for?
-	m_numShards = maxShard + 1; // g_conf.m_numGroups;
+	//m_numShards = maxShard + 1; // g_conf.m_numGroups;
+
+
+	// # of mirrors is zero if no mirrors,
+	// if it is 1 then each host has ONE MIRROR host
+	if ( numMirrors == 0 )
+		indexSplits = numGrunts;
+	if ( numMirrors > 0 )
+		indexSplits = numGrunts / (numMirrors+1);
+
+	if ( indexSplits == 0 ) {
+		g_errno = EBADENGINEER;
+		log("admin: need num-mirrors: xxx or "
+		    "index-splits: xxx directive "
+		    "in hosts.conf");
+		return false;
+	}
+
+	numMirrors = (numGrunts / indexSplits) - 1 ;
+
+	if ( numMirrors < 0 ) {
+		g_errno = EBADENGINEER;
+		log("admin: need num-mirrors: xxx or "
+		    "index-splits: xxx directive "
+		    "in hosts.conf (2)");
+		return false;
+	}

 	m_indexSplits = indexSplits;

+	m_numShards = numGrunts / (numMirrors+1);
+
+	//
+	// set Host::m_shardNum
+	//
+	for ( long i = 0 ; i < numGrunts ; i++ ) {
+		Host *h = &m_hosts[i];
+		h->m_shardNum = i % indexSplits;
+	}
+
 	// assign spare hosts
 	if ( m_numSpareHosts > MAX_SPARES ) {
 		log ( "conf: Number of spares (%li) exceeds max of %i, "
--- a/PageGet.cpp
+++ b/PageGet.cpp
@ -305,23 +305,24 @@ bool processLoop ( void *state ) {
 	SafeBuf sb;

 	// alloc buffer now
-	char *buf = NULL;
-	long  bufMaxSize = 0;
+	//char *buf = NULL;
+	//long  bufMaxSize = 0;
 	//bufMaxSize = len + ( 32 * 1024 ) ;
-	bufMaxSize = contentLen + ( 32 * 1024 ) ;
-	buf        = (char *)mmalloc ( bufMaxSize , "PageGet2" );
-	char *p          = buf;
-	char *bufEnd     = buf + bufMaxSize;
-	if ( ! buf ) {
-		return sendErrorReply ( st , g_errno );
-	}
+	//bufMaxSize = contentLen + ( 32 * 1024 ) ;
+	//buf        = (char *)mmalloc ( bufMaxSize , "PageGet2" );
+	//char *p          = buf;
+	//char *bufEnd     = buf + bufMaxSize;
+	//if ( ! buf ) {
+	//	return sendErrorReply ( st , g_errno );
+	//}

 	// for undoing the header
-	char *start1 = p;
+	//char *start1 = p;
+	long startLen1 = sb.length();

 	// we are always utf8
 	if ( strip != 2 )
-		p += sprintf(p, "<meta http-equiv=\"Content-Type\" "
+		sb.safePrintf( "<meta http-equiv=\"Content-Type\" "
 			     "content=\"text/html;charset=utf8\">\n");

 	// base href
@ -332,20 +333,21 @@ bool processLoop ( void *state ) {
 	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
 	//Url *redir = *xd->getRedirUrl();
 	if ( strip != 2 ) {
-		sprintf ( p , "<BASE HREF=\"%s\">" , base );
-		p += gbstrlen ( p );
+		sb.safePrintf ( "<BASE HREF=\"%s\">" , base );
+		//p += gbstrlen ( p );
 	}

 	// default colors in case css files missing
 	if ( strip != 2 ) {
-		sprintf ( p , "\n<style type=\"text/css\">\n"
+		sb.safePrintf( "\n<style type=\"text/css\">\n"
 			  "body{background-color:white;color:black;}\n"
 			  "</style>\n");
-		p += gbstrlen ( p );
+		//p += gbstrlen ( p );
 	}


-	char *start2 = p;
+	// for undoing the stuff below
+	long startLen2 = sb.length();//p;

 	// query should be NULL terminated
 	char *q    = st->m_q;
@ -376,7 +378,7 @@ bool processLoop ( void *state ) {
 	// CNS: if ( ! st->m_clickNScroll ) {
 	if ( printDisclaimer ) {

-		sprintf ( p , 
+		sb.safePrintf(//sprintf ( p , 
 			  //"<BASE HREF=\"%s\">"
 			  //"<table border=1 width=100%%>"
 			  //"<tr><td>"
@ -394,28 +396,31 @@ bool processLoop ( void *state ) {
 			  "<a href=\"%s\" style=\"%s\">%s</a>"
 			  "" , styleTitle, f->getUrl(), styleLink,
 			  f->getUrl() );
-		p += gbstrlen ( p );
+		//p += gbstrlen ( p );
 		// then the rest
-		sprintf(p , 
+		//sprintf(p , 
+		sb.safePrintf(
 			"<span style=\"%s\">. "
 			"Gigablast is not responsible for the content of "
 			"this page.</span>", styleTitle );
-		p += gbstrlen ( p );
+		//p += gbstrlen ( p );

-		sprintf ( p , "<br/><span style=\"%s\">"
+		sb.safePrintf ( "<br/><span style=\"%s\">"
 			  "Cached: </span>"
 			  "<span style=\"%s\">",
 			  styleTitle, styleText );
-		p += gbstrlen ( p );
+		//p += gbstrlen ( p );

 		// then the spider date in GMT
 		time_t lastSpiderDate = xd->m_spideredTime;
 		struct tm *timeStruct = gmtime ( &lastSpiderDate );
-		strftime ( p, 100,"%b %d, %Y UTC", timeStruct);
-		p += gbstrlen ( p );
+		char tbuf[100];
+		strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
+		//p += gbstrlen ( p );
+		sb.safeStrcpy(tbuf);

 		// Moved over from PageResults.cpp
-		p += sprintf (p, "</span> - <a href=\""
+		sb.safePrintf( "</span> - <a href=\""
 			      "/get?"
 			      "q=%s&amp;c=%s&amp;rtq=%li&amp;"
 			      "d=%lli&amp;strip=1\""
@ -427,7 +432,7 @@ bool processLoop ( void *state ) {

 		// a link to alexa
 		if ( f->getUrlLen() > 5 ) {
-			p += sprintf (p, " - <a href=\"http:"
+			sb.safePrintf( " - <a href=\"http:"
 					 "//web.archive.org/web/*/%s\""
 					 " style=\"%s\">"
 					 "[older copies]</a>" ,
@ -435,29 +440,29 @@ bool processLoop ( void *state ) {
 		}

 		if (st->m_noArchive){
-			p += sprintf(p, " - <span style=\"%s\"><b>"
+			sb.safePrintf( " - <span style=\"%s\"><b>"
 				     "[NOARCHIVE]</b></span>",
 				     styleTell );
 		}
 		if (st->m_isBanned){
-			p += sprintf(p, " - <span style=\"%s\"><b>"
+			sb.safePrintf(" - <span style=\"%s\"><b>"
 				     "[BANNED]</b></span>",
 				     styleTell );
 		}

 		// only print this if we got a query
 		if ( qlen > 0 ) {
-			sprintf (p,"<br/><br/><span style=\"%s\"> "
+			sb.safePrintf("<br/><br/><span style=\"%s\"> "
 				   "These search terms have been "
 				   "highlighted:  ",
 				   styleText );
-			p += gbstrlen ( p );
+			//p += gbstrlen ( p );
 		}
 		
 	}

 	// how much space left in p?
-	long avail = bufEnd - p;
+	//long avail = bufEnd - p;
 	// . make the url that we're outputting for (like in PageResults.cpp)
 	// . "thisUrl" is the baseUrl for click & scroll
 	char thisUrl[MAX_URL_LEN];
@ -487,7 +492,7 @@ bool processLoop ( void *state ) {
 	//sprintf ( x, "&seq=%li&rtq=%lid=%lli",
 	//	  (long)st->m_seq,(long)st->m_rtq,st->m_msg22.getDocId());
 	sprintf ( x, "&d=%lli",st->m_docId );
-	x += gbstrlen(p);		
+	x += gbstrlen(x);		
 	// set our query for highlighting
 	Query qq;
 	qq.set2 ( q, st->m_langId , true );
@ -523,16 +528,18 @@ bool processLoop ( void *state ) {
 	// CNS: if ( ! st->m_clickNScroll ) {
 	// and highlight the matches
 	if ( printDisclaimer ) {
-		hilen = hi.set ( p       ,
-				 avail   ,
+		hilen = hi.set ( //p       ,
+				 //avail   ,
+				&sb ,
 				 &qw     , // words to highlight
 				 &m      , // matches relative to qw
 				 false   , // doSteming
 				 false   , // st->m_clickAndScroll , 
 				 (char *)thisUrl );// base url for ClcknScrll
-		p += hilen;
+		//p += hilen;
 		// now an hr
-		memcpy ( p , "</span></table></table>\n" , 24 );   p += 24;
+		//memcpy ( p , "</span></table></table>\n" , 24 );   p += 24;
+		sb.safeStrcpy("</span></table></table>\n");
 	}


@ -547,8 +554,8 @@ bool processLoop ( void *state ) {
 	if ( ! includeHeader ) {
 		// including base href is off by default when not including
 		// the header, so the caller must explicitly turn it back on
-		if ( st->m_includeBaseHref ) p = start2;
-		else                         p = start1;
+		if ( st->m_includeBaseHref ) sb.m_length=startLen2;//p=start2;
+		else                         sb.m_length=startLen1;//p=start1;
 	}

 	// identify start of <title> tag we wrote out
@ -596,7 +603,8 @@ bool processLoop ( void *state ) {
 		char *ebuf = st->m_r.getString("eb");
 		if ( ! ebuf ) ebuf = "";

-		p += sprintf ( p , 
+		//p += sprintf ( p , 
+		sb.safePrintf(
 			       "<table border=1 "
 			       "cellpadding=10 "
 			       "cellspacing=0 "
@ -606,7 +614,7 @@ bool processLoop ( void *state ) {
 		long printLinks = st->m_r.getLong("links",0);

 		if ( ! printDisclaimer && printLinks )
-			p += sprintf ( p , 
+			sb.safePrintf(//p += sprintf ( p , 
 				       // first put cached and live link
 				       "<tr>"
 				       "<td bgcolor=lightyellow>"
@ -637,7 +645,7 @@ bool processLoop ( void *state ) {
 				       );

 		if ( printLinks ) {
-			p += sprintf ( p ,
+			sb.safePrintf(//p += sprintf ( p ,
 				       "<tr><td bgcolor=pink>"
 				       "<span style=\"font-size:18px;"
 				       "font-weight:600;"
@ -646,11 +654,11 @@ bool processLoop ( void *state ) {
 				       "<b>PAGE TITLE:</b> "
 				       );
 			long tlen = titleEnd - titleStart;
-			memcpy ( p , titleStart , tlen ); p += tlen;
-			p += sprintf ( p , "</span></td></tr>" );
+			sb.safeMemcpy ( titleStart , tlen );
+			sb.safePrintf ( "</span></td></tr>" );
 		}

-		p += sprintf ( p , "</table><br>\n" );
+		sb.safePrintf( "</table><br>\n" );

 	}

@ -661,9 +669,9 @@ bool processLoop ( void *state ) {
 	if ( ctype == CT_DOC  ) pre = true ; // filtered msword
 	if ( ctype == CT_PS   ) pre = true ; // filtered postscript
 	// if it is content-type text, add a <pre>
-	if ( p + 5 < bufEnd && pre ) {
-		memcpy ( p , "<pre>" , 5 );
-		p += 5;
+	if ( pre ) {//p + 5 < bufEnd && pre ) {
+		sb.safePrintf("<pre>");
+		//p += 5;
 	}

 	if ( st->m_strip == 1 )
@ -671,7 +679,7 @@ bool processLoop ( void *state ) {
 					(long)xd->m_version, st->m_strip );
 	// it returns -1 and sets g_errno on error, like OOM
 	if ( contentLen == -1 ) {
-		if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );	
+		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );	
 		return sendErrorReply ( st , g_errno );
 	}

@ -688,8 +696,8 @@ bool processLoop ( void *state ) {
 	

 	if ( ! queryHighlighting ) {
-		memcpy ( p , content , contentLen );
-		p += contentLen ;
+		sb.safeMemcpy ( content , contentLen );
+		//p += contentLen ;
 	}
 	else {
 		// get the content as xhtml (should be NULL terminated)
@ -697,37 +705,38 @@ bool processLoop ( void *state ) {
 		if ( ! xml.set ( content , contentLen , false ,
 				 0 , false , TITLEREC_CURRENT_VERSION ,
 				 false , 0 ) ) { // niceness is 0
-			if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
+			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
 			return sendErrorReply ( st , g_errno );
 		}			
 		if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
-			if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
+			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
 			return sendErrorReply ( st , g_errno );
 		}
 		// sanity check
 		//if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
 		// how much space left in p?
-		avail = bufEnd - p;
+		//avail = bufEnd - p;

 		Matches m;
 		m.setQuery ( &qq );
 		m.addMatches ( &ww );
-		hilen = hi.set ( p , avail , &ww , &m ,
+		hilen = hi.set ( &sb , // p , avail , 
+				 &ww , &m ,
 				 false /*doStemming?*/ ,  
 				 st->m_clickAndScroll , 
 				 thisUrl /*base url for click & scroll*/);
-		p += hilen;
+		//p += hilen;
 		log(LOG_DEBUG, "query: Done highlighting cached page content");
 	}

 	// if it is content-type text, add a </pre>
-	if ( p + 6 < bufEnd && pre ) {
-		memcpy ( p , "</pre>" , 6 );
-		p += 6;
+	if ( pre ) { // p + 6 < bufEnd && pre ) {
+		sb.safeMemcpy ( "</pre>" , 6 );
+		//p += 6;
 	}

 	// calculate bufLen
-	long bufLen = p - buf;
+	//long bufLen = p - buf;

 	long ct = xd->m_contentType;

@ -737,16 +746,19 @@ bool processLoop ( void *state ) {

 	if ( ct == CT_XML ) {
 		// encode the xml tags into &lt;tagname&gt; sequences
-		if ( !newbuf.htmlEncodeXmlTags ( buf , p-buf,0)){// niceness=0
-			if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
+		if ( !newbuf.htmlEncodeXmlTags ( sb.getBufStart() ,
+						 sb.getLength(),
+						 0)){// niceness=0
+			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
 			return sendErrorReply ( st , g_errno );
 		}
 		// free out buffer that we alloc'd before returning since this
 		// should have copied it into another buffer
-		if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );	
+		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );	
 		// reassign
-		buf    = newbuf.getBufStart();
-		bufLen = newbuf.length();
+		//buf    = newbuf.getBufStart();
+		//bufLen = newbuf.length();
+		sb.stealBuf ( &newbuf );
 	}

 	// now encapsulate it in html head/tail and send it off
@ -763,14 +775,18 @@ bool processLoop ( void *state ) {
 	mdelete ( st , sizeof(State2) , "PageGet1" );
 	delete (st);

-	bool status = g_httpServer.sendDynamicPage (s,buf,bufLen,-1,false,
+	bool status = g_httpServer.sendDynamicPage (s,
+						    //buf,bufLen,
+						    sb.getBufStart(),
+						    sb.getLength(),
+						    -1,false,
 						    contentType,
 						     -1, NULL, "utf8" );
 	// free out buffer that we alloc'd before returning since this
 	// should have copied it into another buffer

-	if      ( ct == CT_XML ) newbuf.purge();
-	else if ( buf          ) mfree ( buf , bufMaxSize , "PageGet2" );
+	//if      ( ct == CT_XML ) newbuf.purge();
+	//else if ( buf          ) mfree ( buf , bufMaxSize , "PageGet2" );
 	
 	// and convey the status
 	return status;
--- a/PageResults.cpp
+++ b/PageResults.cpp
@ -1515,8 +1515,8 @@ bool printInlinkText ( SafeBuf &sb , Msg20Reply *mr , SearchInput *si ,
 		if ( ! si->m_doQueryHighlighting && ! si->m_xml ) continue;
 		char *str   = k-> ptr_linkText;
 		long strLen = k->size_linkText;
-		char tt[1024*3];
-		char *ttend = tt + 1024*3;
+		//char tt[1024*3];
+		//char *ttend = tt + 1024*3;
 		char *frontTag = 
 		     "<font style=\"color:black;background-color:yellow\">" ;
 		char *backTag = "</font>";
@ -1525,8 +1525,9 @@ bool printInlinkText ( SafeBuf &sb , Msg20Reply *mr , SearchInput *si ,
 			backTag  = "</b>";
 		}
 		Highlight hi;
-		long hlen = hi.set ( tt , 
-				ttend - tt , 
+		SafeBuf hb;
+		long hlen = hi.set ( &hb,//tt , 
+				     //ttend - tt , 
 				str, 
 				strLen , 
 				mr->m_language, // docLangId
@ -1573,7 +1574,10 @@ bool printInlinkText ( SafeBuf &sb , Msg20Reply *mr , SearchInput *si ,
 			// inc it
 			inlinkId++;
 			// encode it for xml
-			if ( !sb.htmlEncode ( tt,hlen,false)) return false;
+			if ( !sb.htmlEncode ( hb.getBufStart(),
+					      hb.length(),
+					      false)) 
+				return false;
 			sb.safePrintf("\"/>\n");
 			continue;
 		}
@ -1604,7 +1608,7 @@ bool printInlinkText ( SafeBuf &sb , Msg20Reply *mr , SearchInput *si ,
 			      //k->ptr_urlBuf);
 			      ,si->m_cr->m_coll
 			      ,k->m_docId);
-		if ( ! sb.safeMemcpy(tt,hlen) ) return false;
+		if ( ! sb.safeMemcpy(&hb) ) return false;
 		long hostLen = 0;
 		char *host = getHostFast(k->ptr_urlBuf,&hostLen,NULL);
 		sb.safePrintf("</td><td>");
@ -1871,8 +1875,8 @@ static int printResult ( SafeBuf &sb,

 	long hlen;
 	//copy all summary and title excerpts for this result into here
-	char tt[1024*32];
-	char *ttend = tt + 1024*32;
+	//char tt[1024*32];
+	//char *ttend = tt + 1024*32;
 	char *frontTag = 
 		"<font style=\"color:black;background-color:yellow\">" ;
 	char *backTag = "</font>";
@ -1882,9 +1886,11 @@ static int printResult ( SafeBuf &sb,
 	}
 	long cols = 80;
 	if ( si->m_xml ) sb.safePrintf("\t\t<title><![CDATA[");
+	SafeBuf hb;
 	if ( str && strLen && si->m_doQueryHighlighting ) {
-		hlen = hi.set ( tt , 
-				ttend - tt , 
+		hlen = hi.set ( &hb,
+				//tt , 
+				//ttend - tt , 
 				str, 
 				strLen , 
 				mr->m_language, // docLangId
@ -1897,7 +1903,10 @@ static int printResult ( SafeBuf &sb,
 				0,
 				0 ); // niceness
 		//if (!sb.utf8Encode2(tt, hlen)) return false;
-		if ( ! sb.brify ( tt,hlen,0,cols) ) return false;
+		if ( ! sb.brify ( hb.getBufStart(),
+				  hb.getLength(),
+				  0,
+				  cols) ) return false;
 	}
 	else if ( str && strLen ) {
 		// determine if TiTle wraps, if it does add a <br> count for
--- a/SafeBuf.cpp
+++ b/SafeBuf.cpp
@ -2514,6 +2514,9 @@ bool SafeBuf::decodeJSON ( long niceness ) {

 // . this should support the case when the src and dst buffers are the same!
 //   so decodeJSONToUtf8() function below will work
+// . this is used by xmldoc.cpp to PARTIALLY decode a json buf so we do not
+//   index letters in escapes like \n \r \f \t \uxxxx \\ \/
+// . SO we do keep \" 
 bool SafeBuf::safeDecodeJSONToUtf8 ( char *json, long jsonLen, long niceness) {

 	// how much space to reserve for the copy?
@ -2687,11 +2690,13 @@ bool SafeBuf::safeStrcpyPrettyJSON ( char *decodedJson ) {
 			*dst++ = '\\';
 			continue;
 		}
-		//if ( *src == '\/' ) {
-		//	*dst++ = '\\';
-		//	*dst++ = '/';
-		//	continue;
-		//}
+		// mdw: why was this commented out?
+		// we converted '\/' above to a single / so we must undo here
+		if ( *src == '/' ) {
+			*dst++ = '\\';
+			*dst++ = '/';
+			continue;
+		}

 		*dst++ = *src;

--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -26441,11 +26441,13 @@ char *XmlDoc::getHighlightedSummary ( ) {

 	if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }

-	char tt[5000];
+	//char tt[5000];
 	Highlight hi;
+	SafeBuf hb;
 	// highlight the query in it
-	long hlen = hi.set ( tt , 
-			     4999 ,
+	hi.set ( &hb,
+			     //tt , 
+			     //4999 ,
 			     sum, 
 			     sumLen,
 			     m_langId,
@ -26459,8 +26461,9 @@ char *XmlDoc::getHighlightedSummary ( ) {
 			     m_niceness );

 	// store into our safebuf then
-	m_finalSummaryBuf.safeMemcpy ( tt , hlen + 1 );
+	m_finalSummaryBuf.safeMemcpy ( &hb );//tt , hlen + 1 );
 	m_finalSummaryBufValid = true;
+	m_finalSummaryBuf.nullTerm();

 	char *fsum = m_finalSummaryBuf.getBufStart();
 	if ( ! fsum ) fsum = (char *)0x01;
--- a/hosts.conf
+++ b/hosts.conf
@ -2,12 +2,14 @@
 # Tells us what hosts are participating in the distributed search engine.


-# This is how many pieces you want the index split into.
-# So if you have 64 machines, and you want a unique piece of index on
-# each machine, then make this 64. But if you have 64 machines and you
-# want one level of redundancy then make this 32.
+# How many mirrors do you want? If this is 0 then your data
+# will NOT be replicated. If it is 1 then each host listed
+# below will have one host that mirrors it, thereby decreasing
+# total index capacity, but increasing redundancy. If this is
+# 1 then the first half of hosts will be replicated by the
+# second half of the hosts listed below.

-index-splits: 1
+num-mirrors: 0