Merge branch 'ia' into ia-zak

2015-07-22 12:02:19 -06:00
parent dead58329e 090e1b35d5
commit e9f86f362e
29 changed files with 414 additions and 107 deletions
--- a/Collectiondb.cpp
+++ b/Collectiondb.cpp
@ -3579,7 +3579,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {

 	// lower from 7 to 1 since we have so many collections now
 	// ok, now we have much less colls so raise back to 7
-	int32_t diffbotipms = 7;// 1; // 7
+	int32_t diffbotipms = 7;//1; // 7

 	// make the gigablast regex table just "default" so it does not
 	// filtering, but accepts all urls. we will add code to pass the urls
@ -3599,8 +3599,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
 		m_spiderIpWaits     [i] = wait;
 		m_spiderIpMaxSpiders[i] = diffbotipms; // keep it respectful
 		// ethan wants some speed
-		if ( isEthan )
-			m_spiderIpMaxSpiders[i] = 30;
+		// if ( isEthan )
+		// 	m_spiderIpMaxSpiders[i] = 30;
 		//m_spidersEnabled    [i] = 1;
 		m_spiderFreqs       [i] = respiderFreq;
 		//m_spiderDiffbotApiUrl[i].purge();
@ -3623,6 +3623,18 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
 	m_forceDelete        [i] = 1;
 	i++;

+	// de-prioritize fakefirstip urls so we don't give the impression our
+	// spiders are slow. e.g. if someone adds a bulk job with 100,000 urls
+	// we would otherwise sit there looking up their ips and adding real
+	// spider requests (if they fall onto the same shard) before we do
+	// any real spidering. so keep the priority here low.
+	m_regExs[i].set("isfakeip");
+	m_maxSpidersPerRule  [i] = 7;
+	m_spiderIpMaxSpiders [i] = 7;
+	m_spiderPriorities   [i] = 20;
+	m_spiderIpWaits      [i] = 0;
+	i++;
+
 	// hopcount filter if asked for
 	if( m_diffbotMaxHops >= 0 ) {

--- a/HashTableX.cpp
+++ b/HashTableX.cpp
@ -18,6 +18,8 @@ void HashTableX::constructor() {
 	m_useKeyMagic = false;
 	m_ks = 0;
 	m_allowGrowth = true;
+	m_numSlots = 0;
+	m_numSlotsUsed = 0;
 }

 void HashTableX::destructor() {
--- a/HttpServer.cpp
+++ b/HttpServer.cpp
@ -1514,6 +1514,10 @@ bool HttpServer::sendReply ( TcpSocket  *s , HttpRequest *r , bool isAdmin) {
 	//   is recycled/destroyed
 	// . this will call getMsgPiece() to fill up sendBuf from file
 	int32_t totalToSend = mimeLen + bytesToSend;
+
+	//s->m_state = NULL; // do we need this? yes, cuz s is NULL for cleanUp
+	if ( s && s->m_state == f ) s->m_state = NULL;
+
 	//if ( ! m_tcp.sendMsg ( s           , 
 	if (  ! tcp->sendMsg ( s           , 
 			       sendBuf     ,
@ -1542,7 +1546,6 @@ bool HttpServer::sendReply ( TcpSocket  *s , HttpRequest *r , bool isAdmin) {
 	if ( ! f->isOpen() ) f->open( O_RDONLY );
 	int fd = f->getfd();
 	cleanUp ( f , NULL/*TcpSocket */ );
-	s->m_state = NULL; // do we need this? yes, cuz s is NULL for cleanUp
 	// . AND we need to do this ourselves here
 	// . do it SILENTLY so not message is logged if fd not registered
 	if (tcp->m_useSSL)
--- a/Matches.cpp
+++ b/Matches.cpp
@ -30,6 +30,18 @@ Matches::Matches ( ) {
 }
 Matches::~Matches( ) { reset(); }
 void Matches::reset   ( ) { 
+	reset2(); // clears match counts/bit arrays; also used alone by set()
+	if ( m_qwordFlags && m_qwordFlags != (mf_t *)m_tmpBuf ) {
+		mfree ( m_qwordFlags , m_qwordAllocSize , "mmqw" );
+		m_qwordFlags = NULL;
+	} // skipped when the flags point at m_tmpBuf
+	//m_explicitsMatched = 0;
+	//m_matchableRequiredBits = 0;
+	//m_hasAllQueryTerms = false;
+	//m_matchesQuery = false;
+}
+
+void Matches::reset2() {
 	m_numMatches = 0;
 	//m_maxNQT     = -1;
 	m_numAlnums  = 0;
@ -41,14 +53,6 @@ void Matches::reset   ( ) {
 		m_bitsArray    [i].reset();
 	}
 	m_numMatchGroups = 0;
-	if ( m_qwordFlags && m_qwordFlags != (mf_t *)m_tmpBuf ) {
-		mfree ( m_qwordFlags , m_qwordAllocSize , "mmqw" );
-		m_qwordFlags = NULL;
-	}
-	//m_explicitsMatched = 0;
-	//m_matchableRequiredBits = 0;
-	//m_hasAllQueryTerms = false;
-	//m_matchesQuery = false;
 }

 bool Matches::isMatchableTerm ( QueryTerm *qt ) { // , int32_t i ) {
@ -298,7 +302,7 @@ bool Matches::set ( XmlDoc   *xd         ,
 		    int32_t      niceness   ) {

 	// don't reset query info!
-	reset();
+	reset2();

 	// sanity check
 	if ( ! xd->m_docIdValid ) { char *xx=NULL;*xx=0; }
--- a/Matches.h
+++ b/Matches.h
@ -142,6 +142,7 @@ class Matches {
 	Matches ( ) ;
 	~Matches( ) ;
 	void reset ( ) ;
+	void reset2 ( ) ;

 	// BIG HACK support
 	//int32_t getTermsFound ( bool *hadPhrases , bool *hadWords );
--- a/Mem.h
+++ b/Mem.h
@ -280,6 +280,20 @@ inline int32_t getNumBitsOn64 ( uint64_t bits ) {
 		g_a [ *((unsigned char *)(&bits) + 7)  ] ;
 }

+// Count the bits that are on in the slen bytes starting at s.
+// Returns 0 when slen <= 0.
+// s is only read, never modified.
+// Every length is handled by summing per-byte counts: the bit count of
+// a 2- or 4-byte value is just the sum of its bytes' counts, so this
+// gives the same totals as reading s through uint16_t/uint32_t
+// pointers, but without requiring s to be 2- or 4-byte aligned.
+inline int32_t getNumBitsOnX ( unsigned char *s , int32_t slen ) {
+	int32_t total = 0;
+	for ( int32_t i = 0 ; i < slen ; i++ )
+		total += getNumBitsOn8 ( s[i] );
+	return total;
+}
+
 // assume only one bit is set for this (used by Address.cpp)
 inline int32_t getBitPosLL ( uint8_t *bit ) {
 	// which int32_t is it in?
--- a/Msg39.cpp
+++ b/Msg39.cpp
@ -34,6 +34,10 @@ Msg39::Msg39 () {
 	reset();
 }

+Msg39::~Msg39 () {
+	reset(); // NOTE: reset() deliberately cores if m_inUse is still set
+}
+
 void Msg39::reset() {
 	if ( m_inUse ) { char *xx=NULL;*xx=0; }
 	m_allocedTree = false;
@ -46,8 +50,14 @@ void Msg39::reset() {

 void Msg39::reset2() {
 	// reset lists
-	for ( int32_t j = 0 ; j < m_msg2.m_numLists && m_lists ; j++ ) 
-		m_lists[j].freeList();
+	int32_t nqt = m_stackBuf.getLength() / sizeof(RdbList);
+	//for ( int32_t j = 0 ; j < m_msg2.m_numLists && m_lists ; j++ ) {
+	for ( int32_t j = 0 ; j < nqt && m_lists ; j++ ) {
+		//m_lists[j].freeList();
+		//log("msg39: destroy list @ 0x%"PTRFMT,(PTRTYPE)&m_lists[j]);
+		// same thing but more generic
+		m_lists[j].destructor();
+	}
 	m_stackBuf.purge();
 	m_lists = NULL;
 	m_msg2.reset();
@ -207,7 +217,8 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
 	if ( ! m_tmpq.set2 ( m_r->ptr_query  , 
 			     m_r->m_language ,
 			     m_r->m_queryExpansion ,
-			     m_r->m_useQueryStopWords ) ) {
+			     m_r->m_useQueryStopWords ,
+			     m_r->m_maxQueryTerms ) ) {
 		log("query: msg39: setQuery: %s." , 
 		    mstrerror(g_errno) );
 		sendReply ( m_slot , this , NULL , 0 , 0 , true );
@ -225,11 +236,14 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
 	if ( m_tmpq.getNumTerms() != m_r->m_nqt ) {
 		g_errno = EBADENGINEER;
 		log("query: Query parsing inconsistency for q=%s. "
+		    "%i != %i. "
 		    "langid=%"INT32". Check langids and m_queryExpansion parms "
 		    "which are the only parms that could be different in "
 		    "Query::set2(). You probably have different mysynoyms.txt "
 		    "files on two different hosts! check that!!"
 		    ,m_tmpq.m_orig
+		    ,(int)m_tmpq.getNumTerms()
+		    ,(int)m_r->m_nqt
 		    ,(int32_t)m_r->m_language
 		    );
 		sendReply ( m_slot , this , NULL , 0 , 0 , true );
@ -767,11 +781,15 @@ bool Msg39::getLists () {


 	int32_t nqt = m_tmpq.getNumTerms();
-	if ( ! m_stackBuf.reserve ( sizeof(RdbList) * nqt ) ) return true;
+	int32_t need = sizeof(RdbList) * nqt ;
 	m_stackBuf.setLabel("stkbuf2");
+	if ( ! m_stackBuf.reserve ( need ) ) return true;
 	m_lists = (IndexList *)m_stackBuf.getBufStart();
-	for ( int32_t i = 0 ; i < nqt ; i++ )
+	m_stackBuf.setLength ( need );
+	for ( int32_t i = 0 ; i < nqt ; i++ ) {
 		m_lists[i].constructor();
+		//log("msg39: constructlist @ 0x%"PTRFMT,(PTRTYPE)&m_lists[i]);
+	}

 	// call msg2
 	if ( ! m_msg2.getLists ( rdbId                      ,
--- a/Msg39.h
+++ b/Msg39.h
@ -216,6 +216,7 @@ class Msg39 {
 public:

 	Msg39();
+	~Msg39();
 	void reset();
 	void reset2();
 	// register our request handler for Msg39's
--- a/Msg40.cpp
+++ b/Msg40.cpp
@ -666,7 +666,7 @@ bool Msg40::federatedLoop ( ) {
 	mr.size_whiteList              = slen;
 	mr.m_timeout                   = -1; // auto-determine based on #terms
 	// make sure query term counts match in msg39
-	mr.m_maxQueryTerms             = m_si->m_maxQueryTerms; 
+	//mr.m_maxQueryTerms             = m_si->m_maxQueryTerms; 
 	mr.m_realMaxTop                = m_si->m_realMaxTop;

 	mr.m_minSerpDocId              = m_si->m_minSerpDocId;
@ -699,6 +699,9 @@ bool Msg40::federatedLoop ( ) {
 	//if ( numDocIdSplits < 5 ) numDocIdSplits = 5;
 	//}

+	if ( cr ) mr.m_maxQueryTerms = cr->m_maxQueryTerms; 
+	else      mr.m_maxQueryTerms = 100;
+
 	// special oom hack fix
 	if ( cr && cr->m_isCustomCrawl && numDocIdSplits < 4 ) 
 		numDocIdSplits = 4;
--- a/PageResults.cpp
+++ b/PageResults.cpp
@ -2529,6 +2529,22 @@ bool printSearchResultsHeader ( State0 *st ) {
 			       , getLanguageString(si->m_queryLangId) );
 		// print query words we ignored, like stop words
 		printIgnoredWords ( sb , si );
+
+		sb->safePrintf("\t\t<queryNumTermsTotal>"
+			       "%"INT32
+			       "</queryNumTermsTotal>\n"
+			       , q->m_numTermsUntruncated );
+		sb->safePrintf("\t\t<queryNumTermsUsed>"
+			       "%"INT32
+			       "</queryNumTermsUsed>\n"
+			       , q->m_numTerms );
+		int32_t tval = 0;
+		if ( q->m_numTerms < q->m_numTermsUntruncated ) tval = 1;
+		sb->safePrintf("\t\t<queryWasTruncated>"
+			       "%"INT32
+			       "</queryWasTruncated>\n"
+			       , tval );
+
 		for ( int i = 0 ; i < q->m_numTerms ; i++ ) {
 			sb->safePrintf("\t\t<term>\n");
 			QueryTerm *qt = &q->m_qterms[i];
@ -2605,6 +2621,19 @@ bool printSearchResultsHeader ( State0 *st ) {
 		sb->safePrintf("\",\n");
 		// print query words we ignored, like stop words
 		printIgnoredWords ( sb , si );
+
+		sb->safePrintf("\t\"queryNumTermsTotal\":"
+			       "%"INT32",\n"
+			       , q->m_numTermsUntruncated );
+		sb->safePrintf("\t\"queryNumTermsUsed\":"
+			       "%"INT32",\n"
+			       , q->m_numTerms );
+		int32_t tval = 0;
+		if ( q->m_numTerms < q->m_numTermsUntruncated ) tval = 1;
+		sb->safePrintf("\t\"queryWasTruncated\":"
+			       "%"INT32",\n"
+			       , tval );
+			
 		sb->safePrintf("\t\"terms\":[\n");
 		for ( int i = 0 ; i < q->m_numTerms ; i++ ) {
 			sb->safePrintf("\t\t{\n");
@ -8263,8 +8292,11 @@ bool printCSVHeaderRow2 ( SafeBuf *sb ,
 			hdr = "Hop Count";
 		if ( ! strcmp(hdr,"gbssIp") ) 
 			hdr = "IP";
-		if ( ! strcmp(hdr,"gbssDiffbotUri" ) )
-			hdr = "Diffbot URI";
+		// the csv report lists regular urls, not diffbot object urls,
+		// and a regular url does not have exactly one diffboturi: it
+		// could have 0 or multiple diffboturis
+		//if ( ! strcmp(hdr,"gbssDiffbotUri" ) )
+		//	hdr = "Diffbot URI";
 		if ( ! strcmp(hdr,"gbssSentToDiffbotThisTime") ) 
 			hdr = "Process Attempted";
 		if ( ! strcmp(hdr,"gbssDiffbotReplyMsg") )
--- a/Pages.cpp
+++ b/Pages.cpp
@ -3857,6 +3857,25 @@ bool printApiForPage ( SafeBuf *sb , int32_t PAGENUM , CollectionRec *cr ) {
 			       "</b>");


+		sb->brify2 ( 
+			"\t\t# List of space separated words in the "
+			"query that were ignored for the most part. "
+			"Because they were common words for the "
+			"query language they are in.\n"
+			, cols , "\n\t\t# " , false );
+		sb->safePrintf("<b>\t\t\"ignoredWords\":\"to the\",\n\n"
+			       "</b>");
+
+		sb->brify2 ( 
+			"\t\t# There is a maximum limit placed on the "
+			"number of query terms we search on to keep things "
+			"fast. This can "
+			"be changed in the search controls.\n"
+			, cols , "\n\t\t# " , false );
+		sb->safePrintf("<b>\t\t\"queryNumTermsTotal\":52,\n</b>");
+		sb->safePrintf("<b>\t\t\"queryNumTermsUsed\":20,\n</b>");
+		sb->safePrintf("<b>\t\t\"queryWasTruncated\":1,\n\n</b>");
+
 		sb->brify2 ( 
 			"\t\t# The start of the terms array. Each query "
 			"is broken down into a list of terms. Each "
--- a/Parms.cpp
+++ b/Parms.cpp
@ -7879,17 +7879,19 @@ void Parms::init ( ) {
 	m->m_obj   = OBJ_COLL;
 	m++;

-	//m->m_title = "max query terms";
-	//m->m_desc  = "Do not allow more than this many query terms. Will "
-	//	"return error in XML feed error tag if breeched.";
-	//m->m_cgi   = "mqt";
-	//m->m_off   = (char *)&cr.m_maxQueryTerms - x;
+	m->m_title = "max query terms";
+	m->m_desc  = "Do not allow more than this many query terms. Helps "
+		"prevent big queries from resource hogging.";
+	m->m_cgi   = "mqt";
+	m->m_off   = (char *)&cr.m_maxQueryTerms - x;
 	//m->m_soff  = (char *)&si.m_maxQueryTerms - y;
-	//m->m_type  = TYPE_LONG;
-	//m->m_def   = "20"; // 20 for testing, normally 16
-	//m->m_sparm = 1;
-	//m->m_spriv = 1;
-	//m++;
+	m->m_type  = TYPE_LONG;
+	m->m_def   = "999999"; // now we got synonyms... etc
+	m->m_group = 0;
+	m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE; 
+	m->m_page  = PAGE_SEARCH;
+	m->m_obj   = OBJ_COLL;
+	m++;

 	/*
 	m->m_title = "dictionary site";
@ -15283,7 +15285,7 @@ void Parms::init ( ) {
 	m->m_type  = TYPE_CHARPTR;
 	m->m_page  = PAGE_REINDEX;
 	m->m_obj   = OBJ_GBREQUEST;
-	m->m_def   = "xx";
+	m->m_def   = "en";
 	m->m_flags = PF_API ;
 	m++;

--- a/Posdb.cpp
+++ b/Posdb.cpp
@ -759,7 +759,6 @@ void PosdbTable::init ( Query     *q               ,
 	// set this now
 	//m_collnum = cr->m_collnum;

-
 	// save it
 	m_topTree = topTree;
 	// a ptr for debugging i guess
@ -773,6 +772,9 @@ void PosdbTable::init ( Query     *q               ,
 	m_realMaxTop = r->m_realMaxTop;
 	if ( m_realMaxTop > MAX_TOP ) m_realMaxTop = MAX_TOP;

+	m_siteRankMultiplier = SITERANKMULTIPLIER;
+	if ( m_q->m_isBoolean ) m_siteRankMultiplier = 0.0;
+
 	// seo.cpp supplies a NULL msg2 because it already sets
 	// QueryTerm::m_posdbListPtrs
 	if ( ! msg2 ) return;
@ -6304,12 +6306,7 @@ void PosdbTable::intersectLists10_r ( ) {
 	}

 	if ( m_q->m_isBoolean ) {
-		minScore = 1.0;
-		// since we are jumping, we need to set m_docId here
-		//m_docId = *(uint32_t *)(docIdPtr+1);
-		//m_docId <<= 8;
-		//m_docId |= (unsigned char)docIdPtr[0];
-		//m_docId >>= 2;
+		//minScore = 1.0;
 		// we can't jump over setting of miniMergeList. do that.
 		goto boolJump1;
 	}
@ -6521,6 +6518,30 @@ void PosdbTable::intersectLists10_r ( ) {

 boolJump1:

+	if ( m_q->m_isBoolean ) {
+		//minScore = 1.0;
+		// this is somewhat wasteful since it is set below again
+		m_docId = *(uint32_t *)(docIdPtr+1);
+		m_docId <<= 8;
+		m_docId |= (unsigned char)docIdPtr[0];
+		m_docId >>= 2;
+		// add one point for each term matched in the bool query
+		// this is really just for when the terms are from different
+		// fields. if we have unfielded boolean terms we should
+		// do proximity matching.
+		int32_t slot = m_bt.getSlot ( &m_docId );
+		if ( slot >= 0 ) {
+			uint8_t *bv = (uint8_t *)m_bt.getValueFromSlot(slot);
+			// then a score based on the # of terms that matched
+			int16_t bitsOn = getNumBitsOnX ( bv , m_vecSize );
+			// but store in hashtable now
+			minScore = (float)bitsOn;
+		}
+		else {
+			minScore = 1.0;
+		}
+	}
+
 	// we need to do this for seo hacks to merge the synonyms together
 	// into one list
 seoHackSkip2:
@ -7226,7 +7247,7 @@ void PosdbTable::intersectLists10_r ( ) {
 boolJump2:

 	// try dividing it by 3! (or multiply by .33333 faster)
-	score = minScore * (((float)siteRank)*SITERANKMULTIPLIER+1.0);
+	score = minScore * (((float)siteRank)*m_siteRankMultiplier+1.0);

 	// . not foreign language? give a huge boost
 	// . use "qlang" parm to set the language. i.e. "&qlang=fr"
@ -7896,7 +7917,7 @@ float PosdbTable::getMaxPossibleScore ( QueryTermInfo *qti ,
 		score *= WIKI_BIGRAM_WEIGHT;
 	}
 	//score *= perfectWordSpamWeight * perfectWordSpamWeight;
-	score *= (((float)siteRank)*SITERANKMULTIPLIER+1.0);
+	score *= (((float)siteRank)*m_siteRankMultiplier+1.0);

 	// language boost if same language (or no lang specified)
 	if ( m_r->m_language == docLang ||
@ -8187,13 +8208,15 @@ bool PosdbTable::makeDocIdVoteBufForBoolQuery_r ( ) {
 			// a 6 byte key means you pass
 			gbmemcpy ( dst , &docId , 6 );
 			// test it
-			int64_t d2;
-			d2 = *(uint32_t *)(dst+1);
-			d2 <<= 8;
-			d2 |= (unsigned char)dst[0];
-			d2 >>= 2;
-			docId >>= 2;
-			if ( d2 != docId ) { char *xx=NULL;*xx=0; }
+			if ( m_debug ) {
+				int64_t d2;
+				d2 = *(uint32_t *)(dst+1);
+				d2 <<= 8;
+				d2 |= (unsigned char)dst[0];
+				d2 >>= 2;
+				docId >>= 2;
+				if ( d2 != docId ) { char *xx=NULL;*xx=0; }
+			}
 			// end test
 			dst += 6;
 		}
--- a/Posdb.h
+++ b/Posdb.h
@ -604,6 +604,8 @@ class PosdbTable {
 	float m_finalScore;
 	float m_preFinalScore;

+	float m_siteRankMultiplier;
+
 	// how long to add the last batch of lists
 	int64_t       m_addListsTime;
 	int64_t       m_t1 ;
--- a/Query.cpp
+++ b/Query.cpp
@ -74,6 +74,9 @@ void Query::reset ( ) {
 		qw->destructor();
 	}

+	m_stackBuf.purge();
+	m_qterms = NULL;
+
 	m_sb.purge();
 	m_osb.purge();
 	m_docIdRestriction = 0LL;
@ -140,14 +143,16 @@ bool Query::set2 ( char *query        ,
 		   // need language for doing synonyms
 		   uint8_t  langId ,
 		   char     queryExpansion ,
-		   bool     useQueryStopWords ) {
-		  //int32_t  maxQueryTerms  ) {
+		   bool     useQueryStopWords ,
+		   int32_t  maxQueryTerms  ) {

 	m_langId = langId;
 	m_useQueryStopWords = useQueryStopWords;
 	// fix summary rerank and highlighting.
 	bool keepAllSingles = true;

+	m_maxQueryTerms = maxQueryTerms;
+
 	// assume  boolean auto-detect.
 	char boolFlag = 2;

@ -159,7 +164,7 @@ bool Query::set2 ( char *query        ,
 	if ( ! query ) return true;

 	// set to 256 for synonyms?
-	m_maxQueryTerms = 256;
+	//m_maxQueryTerms = 256;
 	m_queryExpansion = queryExpansion;

 	int32_t queryLen = gbstrlen(query);
@ -601,7 +606,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
 	int32_t max = (int32_t)MAX_EXPLICIT_BITS;
 	if ( max > m_maxQueryTerms ) max = m_maxQueryTerms;

-	// count them first for allocating
+	// count phrases first for allocating
 	int32_t nqt = 0;
 	for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
 		QueryWord *qw  = &m_qwords[i];
@ -653,6 +658,10 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
 			continue;
 		// skip if ignored like a stopword (stop to->too)
 		//if ( qw->m_ignoreWord ) continue;
+		// ignore title: etc. words, they are field names
+		if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) continue;
+		// ignore boolean operators
+		if ( qw->m_ignoreWord ) continue;// IGNORE_BOOLOP
 		// no, hurts 'Greencastle IN economic development'
 		if ( qw->m_wordId == to ) continue;
 		// single letters...
@ -673,7 +682,9 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
 		nqt += naids;
 	}

+	m_numTermsUntruncated = nqt;

+	if ( nqt > m_maxQueryTerms ) nqt = m_maxQueryTerms;

 	// allocate the stack buf
 	if ( nqt ) {
@ -719,6 +730,11 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
 			    "limit of %"INT32"",(int32_t)ABS_MAX_QUERY_TERMS );
 			break;
 		}
+		if ( n >= m_maxQueryTerms ) {
+			log("query: lost query phrase terms to max term cr "
+			    "limit of %"INT32"",(int32_t)m_maxQueryTerms);
+			break;
+		}

 		QueryTerm *qt = &m_qterms[n];
 		qt->m_qword     = qw ;
@ -877,6 +893,11 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
 			    "limit of %"INT32"",(int32_t)ABS_MAX_QUERY_TERMS );
 			break;
 		}
+		if ( n >= m_maxQueryTerms ) {
+			log("query: lost query terms to max term cr "
+			    "limit of %"INT32"",(int32_t)m_maxQueryTerms);
+			break;
+		}

 		QueryTerm *qt = &m_qterms[n];
 		qt->m_qword     = qw ;
@ -1389,6 +1410,10 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
 			continue;
 		// skip if ignored like a stopword (stop to->too)
 		//if ( qw->m_ignoreWord ) continue;
+		// ignore title: etc. words, they are field names
+		if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) continue;
+		// ignore boolean operators
+		if ( qw->m_ignoreWord ) continue;// IGNORE_BOOLOP
 		// no, hurts 'Greencastle IN economic development'
 		if ( qw->m_wordId == to ) continue;
 		// single letters...
@ -1424,6 +1449,14 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
 			}
 			// this happens for 'da da da'
 			if ( ! origTerm ) continue;
+
+			if ( n >= m_maxQueryTerms ) {
+				log("query: lost synonyms due to max cr term "
+				    "limit of %"INT32"",
+				    (int32_t)m_maxQueryTerms);
+				break;
+			}
+
 			// add that query term
 			QueryTerm *qt   = &m_qterms[n];
 			qt->m_qword     = qw; // NULL;
@ -2483,12 +2516,14 @@ bool Query::setQWords ( char boolFlag ,
 		// in quotes which is silly, so undo it. But we should
 		// still inherit any quoteSign, however. Be sure to also
 		// set m_inQuotes to false so Matches.cpp::matchWord() works.
-		if ( i == quoteStart ) { // + 1 ) {
-			if ( i + 1 >= numWords || words.getNumQuotes(i+1)>0 ) {
-				qw->m_quoteStart = -1;
-				qw->m_inQuotes   = false;
-			}
-		}
+		// MDW: don't undo it because we do not want to get synonyms
+		// of terms in quotes. 7/15/2015
+		// if ( i == quoteStart ) { // + 1 ) {
+		// 	if ( i + 1 >= numWords || words.getNumQuotes(i+1)>0 ) {
+		// 		qw->m_quoteStart = -1;
+		// 		qw->m_inQuotes   = false;
+		// 	}
+		// }
 		// . get prefix hash of collection name and field
 		// . but first convert field to lower case
 		uint64_t ph;
--- a/Query.h
+++ b/Query.h
@ -635,10 +635,10 @@ class Query {
 		    //int32_t  collLen  ,
 		    uint8_t  langId ,
 		    char     queryExpansion ,
-		    bool     useQueryStopWords = true );
-		   //char  boolFlag = 2 , // auto-detect if boolean query
-		   //bool  keepAllSingles = false ,
-		   //int32_t  maxQueryTerms = 0x7fffffff );
+		    bool     useQueryStopWords = true ,
+		    //char  boolFlag = 2 , // auto-detect if boolean query
+		    //bool  keepAllSingles = false ,
+		    int32_t  maxQueryTerms = 0x7fffffff );

 	// serialize/deserialize ourselves so we don't have to pass the
 	// unmodified string around and reparse it every time
@ -941,6 +941,8 @@ class Query {
 	int32_t      m_numTerms;
 	int32_t      m_numTermsSpecial;

+	int32_t m_numTermsUntruncated;
+
 	// separate vectors for easier interfacing, 1-1 with m_qterms
 	//int64_t m_termFreqs      [ MAX_QUERY_TERMS ];
 	//int64_t m_termIds        [ MAX_QUERY_TERMS ];
--- a/RdbDump.cpp
+++ b/RdbDump.cpp
@ -373,12 +373,12 @@ bool RdbDump::dumpTree ( bool recall ) {
 		//if ( removeNegRecs )
 		//	m_list.removeNegRecs();

-// 		if(!m_list->checkList_r ( false , // removeNegRecs?
-// 					 false , // sleep on problem?
-// 					 m_rdb->m_rdbId )) {
-// 			log("db: list to dump is not sane!");
-//			char *xx=NULL;*xx=0;
-// 		}
+ 		// if(!m_list->checkList_r ( false , // removeNegRecs?
+ 		// 			 false , // sleep on problem?
+ 		// 			 m_rdb->m_rdbId )) {
+ 		// 	log("db: list to dump is not sane!");
+		// 	char *xx=NULL;*xx=0;
+ 		// }


 	skip:
@ -781,6 +781,10 @@ bool RdbDump::doneReadingForVerify ( ) {
 	if ( m_addToMap ) t = gettimeofdayInMilliseconds();
 	// sanity check
 	if ( m_list->m_ks != m_ks ) { char *xx = NULL; *xx = 0; }
+
+	bool triedToFix = false;
+
+ tryAgain:
 	// . register this with the map now
 	// . only register AFTER it's ALL on disk so we don't get partial
 	//   record reads and we don't read stuff on disk that's also in tree
@ -788,6 +792,16 @@ bool RdbDump::doneReadingForVerify ( ) {
 	// . we don't have maps when we do unordered dumps
 	// . careful, map is NULL if we're doing unordered dump
 	if ( m_addToMap && m_map && ! m_map->addList ( m_list ) ) {
+		// keys  out of order in list from tree?
+		if ( g_errno == ECORRUPTDATA ) {
+			log("db: trying to fix tree or buckets");
+			if ( m_tree ) m_tree->fixTree();
+			//if ( m_buckets ) m_buckets->fixBuckets();
+			if ( m_buckets ) { char *xx=NULL;*xx=0; }
+			if ( triedToFix ) { char *xx=NULL;*xx=0; }
+			triedToFix = true;
+			goto tryAgain;
+		}
 		g_errno = ENOMEM; 
 		log("db: Failed to add data to map.");
 		// undo the offset update, the write failed, the parent
--- a/RdbList.cpp
+++ b/RdbList.cpp
@ -624,7 +624,8 @@ bool RdbList::growList ( int32_t newSize ) {
 	// don't shrink list
 	if ( newSize <= m_allocSize ) return true;
 	// debug msg
-	//log("RdbList::growList from %"INT32" to %"INT32"",m_allocSize , newSize );
+	// log("RdbList::growList 0x%"PTRFMT "from %"INT32" to %"INT32"",
+	//     (PTRTYPE)this,m_allocSize , newSize );
 	// make a new buffer
 	char *tmp =(char *) mrealloc ( m_alloc,m_allocSize,newSize,"RdbList");
 	//if ( (int32_t)tmp == 0x904dbd0 )
--- a/RdbMap.cpp
+++ b/RdbMap.cpp
@ -323,8 +323,9 @@ bool RdbMap::verifyMap2 ( ) {
 			KEYSET(lastKey,k,m_ks); continue; }
 		// just bitch for now
 		log(
-		    "db: Key out of order in map file %s%s. "
-		    "page = %"INT32". key offset = %"INT64". Map or data file is "
+		    "db: Key out of order in map file %s/%s. "
+		    "page = %"INT32". key offset = %"INT64". "
+		    "Map or data file is "
 		    "corrupt, but it is probably the data file. Please "
 		    "delete the map file and restart.", 
 		    m_file.m_dir,m_file.getFilename() ,
@ -337,6 +338,15 @@ bool RdbMap::verifyMap2 ( ) {
 		    KEY1(lastKey,m_ks),KEY0(lastKey));
 		log("db:    k.n1=%016"XINT64" n0=%016"XINT64"",KEY1(k,m_ks),KEY0(k));
 		log("db: m_numPages = %"INT32"",m_numPages);
+
+		SafeBuf cmd;
+		cmd.safePrintf("mv %s/%s %s/trash/",
+			       m_file.m_dir,
+			       m_file.getFilename(),
+			       g_hostdb.m_dir);
+		log("db: %s",cmd.getBufStart() );
+		gbsystem ( cmd.getBufStart() );
+
 		exit(0);
 		//char *xx=NULL;*xx=0;
 		// was k too small?
@ -543,7 +553,8 @@ bool RdbMap::addRecord ( char *key, char *rec , int32_t recSize ) {
 		m_lastLogTime = getTime();
 		//pageNum > 0 && getKey(pageNum-1) > getKey(pageNum) ) {
 		log(LOG_LOGIC,"build: RdbMap: added key out of order. "
-		    "count=%"INT64".",m_badKeys);
+		    "count=%"INT64" file=%s/%s.",m_badKeys,
+		    m_file.m_dir,m_file.getFilename());
 		//log(LOG_LOGIC,"build: k.n1=%"XINT32" %"XINT64"  lastKey.n1=%"XINT32" %"XINT64"",
 		//    key.n1,key.n0,m_lastKey.n1,m_lastKey.n0 );
 		log(LOG_LOGIC,"build: offset=%"INT64"",
@ -556,7 +567,10 @@ bool RdbMap::addRecord ( char *key, char *rec , int32_t recSize ) {
 			g_errno = ECORRUPTDATA;
 			return false;
 		}
-		char *xx=NULL;*xx=0;
+		// if being called from RdbDump.cpp...
+		g_errno = ECORRUPTDATA;
+		return false;
+		//char *xx=NULL;*xx=0;
 		// . during a merge, corruption can happen, so let's core
 		//   here until we figure out how to fix it.
 		// . any why wasn't the corruption discovered and patched
@ -719,7 +733,10 @@ bool RdbMap::addList ( RdbList *list ) {
 	if ( ! addRecord ( key , rec , recSize ) ) {
 		log("db: Failed to add record to map: %s.",
 		    mstrerror(g_errno));
-		char *xx = NULL; *xx = 0;
+		// allow caller to try to fix the tree in the case of dumping
+		// a tree to a file on disk
+		return false;
+		//char *xx = NULL; *xx = 0;
 	}
 	if ( list->skipCurrentRecord() ) goto top2;

--- a/RdbTree.cpp
+++ b/RdbTree.cpp
@ -1283,19 +1283,26 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
 		if ( m_right[i] >= 0 && m_parents[m_right[i]] != i ) 
 			return log(
 				   "db: Tree right kid and parent disagree.");
-		/*
+		// MDW: why did i comment out the order checking?
 		// check order
-		if ( m_left[i] >= 0 ) {
+		if ( m_left[i] >= 0 &&
+		     m_collnums[i] == m_collnums[m_left[i]] ) {
 			char *key = &m_keys[i*m_ks];
 			char *left = &m_keys[m_left[i]*m_ks];
-			if ( KEYCMP(key,left,m_ks)<0) {char *xx=NULL;*xx=0;}
+			if ( KEYCMP(key,left,m_ks)<0) 
+				return log("db: Tree left kid > parent %i",i);
+			
 		}
-		if ( m_right[i] >= 0 ) {
+		if ( m_right[i] >= 0 &&
+		     m_collnums[i] == m_collnums[m_right[i]] ) {
 			char *key = &m_keys[i*m_ks];
 			char *right = &m_keys[m_right[i]*m_ks];
-			if ( KEYCMP(key,right,m_ks)>0) {char *xx=NULL;*xx=0;}
+			if ( KEYCMP(key,right,m_ks)>0) 
+				return log("db: Tree right kid < parent %i "
+					   "%s < %s",i,
+					   KEYSTR(right,m_ks),
+					   KEYSTR(key,m_ks) );
 		}
-		*/
 		//g_loop.quickPoll(1, __PRETTY_FUNCTION__, __LINE__);
 	}
 	if ( hkp > 0 ) 
--- a/SearchInput.cpp
+++ b/SearchInput.cpp
@ -470,14 +470,16 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
 		log("query: qlang of \"%s\" is NOT SUPPORTED. using "
 		    "langUnknown, \"xx\".",langAbbr);

+	int32_t maxQueryTerms = cr->m_maxQueryTerms;
+
 	// . the query to use for highlighting... can be overriden with "hq"
 	// . we need the language id for doing synonyms
 	if ( m_prepend && m_prepend[0] )
-		m_hqq.set2 ( m_prepend , m_queryLangId , true );
+		m_hqq.set2 ( m_prepend , m_queryLangId , true ,maxQueryTerms);
 	else if ( m_highlightQuery && m_highlightQuery[0] )
-		m_hqq.set2 ( m_highlightQuery , m_queryLangId , true );
+		m_hqq.set2 (m_highlightQuery,m_queryLangId,true,maxQueryTerms);
 	else if ( m_query && m_query[0] )
-		m_hqq.set2 ( m_query , m_queryLangId , true );
+		m_hqq.set2 ( m_query , m_queryLangId , true,maxQueryTerms);

 	// log it here
 	log(LOG_INFO,
@ -489,7 +491,9 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
 	// . returns false and sets g_errno on error (ETOOMANYOPERANDS)
 	if ( ! m_q.set2 ( m_sbuf1.getBufStart(), 
 			  m_queryLangId , 
-			  m_queryExpansion ) ) {
+			  m_queryExpansion ,
+			  true , // use QUERY stopwords?
+			  maxQueryTerms ) ) {
 		g_msg = " (error: query has too many operands)";
 		return false;
 	}
--- a/Spider.cpp
+++ b/Spider.cpp
@ -11641,6 +11641,18 @@ int32_t getUrlFilterNum2 ( SpiderRequest *sreq       ,
 			goto checkNextRule;
 		}

+		if ( strncmp ( p , "isfakeip",8 ) == 0 ) {
+			// skip for msg20
+			if ( isForMsg20 ) continue;
+			// if no match continue
+			if ( (bool)sreq->m_fakeFirstIp == val ) continue;
+			p += 8;
+			p = strstr(p, "&&");
+			if ( ! p ) return i;
+			p += 2;
+			goto checkNextRule;
+		}
+
 		if ( strncmp ( p , "isonsamedomain",14 ) == 0 ) {
 			// skip for msg20
 			if ( isForMsg20 ) continue;
@ -13993,6 +14005,17 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , int32_t *status ) {
 		return msg->safePrintf("Job is initializing.");
 	}

+	// if we had seeds and none were successfully crawled, do not just
+	// print that the crawl completed.
+	if ( cx->m_collectiveRespiderFrequency <= 0.0 &&
+	     cx->m_isCustomCrawl &&
+	     ! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider &&
+	     cx->m_globalCrawlInfo.m_pageDownloadAttempts > 0 &&
+	     cx->m_globalCrawlInfo.m_pageDownloadSuccesses == 0 ) {
+		*status = SP_SEEDSERROR;
+		return msg->safePrintf("Failed to crawl any seed.");
+	}
+
 	// if we sent an email simply because no urls
 	// were left and we are not recrawling!
 	if ( cx->m_collectiveRespiderFrequency <= 0.0 &&
--- a/Spider.h
+++ b/Spider.h
@ -39,6 +39,7 @@
 #define SP_INPROGRESS   7 // it is going on!
 #define SP_ADMIN_PAUSED 8 // g_conf.m_spideringEnabled = false
 #define SP_COMPLETED    9 // crawl is done, and no repeatCrawl is scheduled
+#define SP_SEEDSERROR  10 // all seeds had an error preventing crawling

 bool tryToDeleteSpiderColl ( SpiderColl *sc , char *msg ) ;
 void spiderRoundIncremented ( class CollectionRec *cr ) ;
--- a/Summary.cpp
+++ b/Summary.cpp
@ -13,6 +13,7 @@ Summary::Summary()
 	m_bitScoresBuf = NULL;
 	m_bitScoresBufSize = 0;
 	m_wordWeights = NULL;
+	m_buf4 = NULL;
 	reset();
 }

@ -42,9 +43,10 @@ void Summary::reset() {
 		m_wordWeights = NULL;
 	}
 	m_wordWeights = NULL;
-	if ( m_buf && m_buf != m_tmpBuf2 ) 
-		mfree ( m_buf , m_bufSize , "ssstkb" );
-	m_buf = NULL;
+	if ( m_buf4 && m_buf4 != m_tmpBuf4 ) {
+		mfree ( m_buf4 , m_buf4Size , "ssstkb" );
+		m_buf4 = NULL;
+	}
 }


@ -248,13 +250,13 @@ bool Summary::set2 ( Xml      *xml                ,
 	m_numExcerpts = 0;

 	int32_t need2 = (1+1+1) * m_q->m_numWords;
-	m_bufSize = need2;
+	m_buf4Size = need2;
 	if ( need2 < 128 )
-		m_buf = m_tmpBuf2;
+		m_buf4 = m_tmpBuf4;
 	else
-		m_buf = (char *)mmalloc ( need2 , "stkbuf" );
-	if ( ! m_buf ) return false;
-	char *x = m_buf;
+		m_buf4 = (char *)mmalloc ( need2 , "stkbuf" );
+	if ( ! m_buf4 ) return false;
+	char *x = m_buf4;
 	char *retired = x;
 	x += m_q->m_numWords;
 	char *maxGotIt = x;
@ -591,9 +593,10 @@ bool Summary::set2 ( Xml      *xml                ,
 	}

 	// free the mem we used if we allocated it
-	if ( m_buf && m_buf != m_tmpBuf2 ) 
-		mfree ( m_buf , m_bufSize , "ssstkb" );
-	m_buf = NULL;
+	if ( m_buf4 && m_buf4 != m_tmpBuf4 ) {
+		mfree ( m_buf4 , m_buf4Size , "ssstkb" );
+		m_buf4 = NULL;
+	}


 	// If we still didn't find a summary, get the default summary
--- a/Summary.h
+++ b/Summary.h
@ -271,9 +271,9 @@ class Summary {
 	int32_t m_wordWeightSize;
 	char m_tmpBuf[128];

-	char *m_buf;
-	int32_t m_bufSize;
-	char m_tmpBuf2[128];
+	char *m_buf4;
+	int32_t m_buf4Size;
+	char m_tmpBuf4[128];

 	char    m_summaryLocBuf[MAX_SUMMARY_LOCS*sizeof(uint64_t)];
 	SafeBuf m_summaryLocs;
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -2569,11 +2569,10 @@ bool XmlDoc::indexDoc ( ) {
 			SafeBuf *ssDocMetaList = NULL;
 			// save this
 			int32_t saved = m_indexCode;
-			// and make it the real reason for the spider status doc
+			// make it the real reason for the spider status doc
 			m_indexCode = EDNSERROR;
-			// get the spiderreply ready to be added
-			
-			ssDocMetaList = getSpiderStatusDocMetaList(NULL ,false);//del
+			// get the spiderreply ready to be added. false=del
+			ssDocMetaList =getSpiderStatusDocMetaList(NULL ,false);
 			// revert
 			m_indexCode = saved;
 			// error?
@ -2590,8 +2589,11 @@ bool XmlDoc::indexDoc ( ) {

 			char *url = "unknown";
 			if ( m_sreqValid ) url = m_sreq.m_url;
-			log("build: error2 getting real firstip of %"INT32" for "
-			    "%s. Not adding new spider req", (int32_t)*fip,url);
+			log("build: error2 getting real firstip of "
+			    "%"INT32" for "
+			    "%s. Not adding new spider req. "
+			    "spiderstatusdocsize=%"INT32, (int32_t)*fip,url,
+			    m_addedStatusDocSize);
 			// also count it as a crawl attempt
 			cr->m_localCrawlInfo.m_pageDownloadAttempts++;
 			cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
@ -3134,8 +3136,9 @@ bool isRobotsTxtFile ( char *u , int32_t ulen ) {
 bool XmlDoc::isContainerDoc ( ) {
 	if ( m_firstUrlValid && m_firstUrl.isWarc() ) return true;
 	if ( m_firstUrlValid && m_firstUrl.isArc () ) return true;
-	if ( ! m_contentDelimValid ) { char *xx=NULL;*xx=0; }
-	if ( m_contentDelim ) return true;
+	//if ( ! m_contentDelimValid ) { char *xx=NULL;*xx=0; }
+	//if ( m_contentDelim ) return true;
+	if ( m_contentDelimValid && m_contentDelim ) return true; // a not-yet-valid delim no longer forces a crash; it just means "not a container"
 	return false;
 }

@ -28695,6 +28698,11 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
 		jd.safePrintf("\"gbssHttpStatus\":%"INT32",\n",
 			      (int32_t)m_httpStatus);

+	// do not index gbssIsSeedUrl:0 because there will be too many usually
+	bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
+	if ( isSeed )
+		jd.safePrintf("\"gbssIsSeedUrl\":1,\n");
+
 	if ( od )
 		jd.safePrintf("\"gbssWasIndexed\":1,\n");
 	else
@ -28719,6 +28727,18 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
 		else
 			jd.safePrintf("\"gbssDiffbotUri\":"
 				      "\"none\",\n");
+		// show the type as gbssDiffbotType:"article" etc.
+		JsonItem *dti = NULL;
+		if ( jp1 ) 
+			dti = jp1->getItem("type");
+		if ( dti ) {
+			jd.safePrintf("\"gbssDiffbotType\":\"");
+			int32_t vlen;
+			char *val = dti->getValueAsString( &vlen );
+			if ( val ) jd.jsonEncode ( val , vlen );
+			jd.safePrintf("\",\n");
+		}
+
 	}
 	else { // if ( cr->m_isCustomCrawl ) {
 		jd.safePrintf("\"gbssIsDiffbotObject\":0,\n");
--- a/hash.cpp
+++ b/hash.cpp
@ -232,6 +232,17 @@ uint64_t hash64d ( char *p, int32_t plen ) {
 		char    ncs = utf8Encode ( x , (char *)tmp );
 		// sanity check
 		if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
+		// utf8Encode() has been seen to return 0 for a 4 byte char =
+		// -16,-112,-51,-125  which has x=66371 and y=66371.
+		// when that happens fall back to hashing the original bytes.
+		if ( ncs == 0 ) {
+			// let's just hash it as-is then. copy only the bytes
+			// that belong to this char (assumes cs is the byte
+			// length of the char at p) so we never read past
+			// p[cs-1], which may be the end of the buffer.
+			tmp[0] = p[0];
+			if ( cs >= 2 ) tmp[1] = p[1];
+			if ( cs >= 3 ) tmp[2] = p[2];
+			if ( cs >= 4 ) tmp[3] = p[3];
+			ncs = cs;
+		}
 		// hash it up
 		h ^= g_hashtab [i++][tmp[0]];
 		if ( ncs == 1 ) continue;
--- a/hash.h
+++ b/hash.h
@ -250,6 +250,17 @@ inline uint64_t hash64Lower_utf8_nospaces ( char *p, int32_t len  ) {
 		char ncs = utf8Encode ( y , tmp );
 		// sanity check
 		if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
+		// utf8Encode() has been seen to return 0 for a 4 byte char =
+		// -16,-112,-51,-125  which has x=66371 and y=66371.
+		// when that happens fall back to hashing the original bytes.
+		if ( ncs == 0 ) {
+			// let's just hash it as-is then. copy only the bytes
+			// that belong to this char (assumes cs is the byte
+			// length of the char at p) so we never read past
+			// p[cs-1], which may be the end of the buffer.
+			tmp[0] = p[0];
+			if ( cs >= 2 ) tmp[1] = p[1];
+			if ( cs >= 3 ) tmp[2] = p[2];
+			if ( cs >= 4 ) tmp[3] = p[3];
+			ncs = cs;
+		}
 		// hash it up
 		h ^= g_hashtab [i++][(uint8_t)tmp[0]];
 		if ( ncs == 1 ) continue;
@ -301,6 +312,17 @@ inline uint64_t hash64Lower_utf8_cont ( char *p,
 		char ncs = utf8Encode ( y , tmp );
 		// sanity check
 		if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
+		// utf8Encode() has been seen to return 0 for a 4 byte char =
+		// -16,-112,-51,-125  which has x=66371 and y=66371.
+		// when that happens fall back to hashing the original bytes.
+		if ( ncs == 0 ) {
+			// let's just hash it as-is then. copy only the bytes
+			// that belong to this char (assumes cs is the byte
+			// length of the char at p) so we never read past
+			// p[cs-1], which may be the end of the buffer.
+			tmp[0] = p[0];
+			if ( cs >= 2 ) tmp[1] = p[1];
+			if ( cs >= 3 ) tmp[2] = p[2];
+			if ( cs >= 4 ) tmp[3] = p[3];
+			ncs = cs;
+		}
 		// hash it up
 		h ^= g_hashtab [i++][(uint8_t)tmp[0]];
 		if ( ncs == 1 ) continue;
@ -376,6 +398,17 @@ inline uint64_t hash64Lower_utf8 ( char *p ) {
 		char ncs = utf8Encode ( y , (char *)tmp );
 		// sanity check
 		if ( ncs > 4 ) { char *xx=NULL;*xx=0; }
+		// utf8Encode() has been seen to return 0 for a 4 byte char =
+		// -16,-112,-51,-125  which has x=66371 and y=66371.
+		// when that happens fall back to hashing the original bytes.
+		if ( ncs == 0 ) {
+			// let's just hash it as-is then. copy only the bytes
+			// that belong to this char (assumes cs is the byte
+			// length of the char at p) so we never read past
+			// p[cs-1], which may be the string's terminator.
+			tmp[0] = p[0];
+			if ( cs >= 2 ) tmp[1] = p[1];
+			if ( cs >= 3 ) tmp[2] = p[2];
+			if ( cs >= 4 ) tmp[3] = p[3];
+			ncs = cs;
+		}
 		// hash it up
 		h ^= g_hashtab [i++][(uint8_t)tmp[0]];
 		if ( ncs == 1 ) continue;
--- a/main.cpp
+++ b/main.cpp
@ -4998,7 +4998,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
 			if ( ! f.doesExist() ) target = "gb";

 			sprintf(tmp,
-				"scp -c blowfish " // blowfish is faster
+				"scp -c arcfour " // arcfour cipher (was blowfish, which had been picked for speed)
 				"%s%s "
 				"%s:%s/gb.installed%s",
 				dir,