Merge branch 'diffbot-testing' into diffbot-matt

Matt Wells 2014-05-02 14:34:04 -07:00
commit 0daced51df
13 changed files with 257 additions and 168 deletions

@@ -297,8 +297,8 @@ bool Conf::init ( char *dir ) { // , long hostId ) {
log(LOG_INFO,"db: Split is FULL");
*/
// sanity check
if ( g_hostdb.m_indexSplits > MAX_INDEXDB_SPLIT ) {
log("db: Increase MAX_INDEXDB_SPLIT");
if ( g_hostdb.m_indexSplits > MAX_SHARDS ) {
log("db: Increase MAX_SHARDS");
char *xx = NULL; *xx = 0;
}
// and always keep a decent site quality cache of at least 3M

@@ -33,7 +33,7 @@
//#define INDEXDB_SPLIT 8
//#define DOCID_OFFSET_MASK (INDEXDB_SPLIT-1)
#define DOCID_OFFSET_MASK (g_conf.m_indexdbSplit-1)
#define MAX_INDEXDB_SPLIT 128
#define MAX_SHARDS 128
class Indexdb {
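
The change above is a mechanical rename of MAX_INDEXDB_SPLIT to MAX_SHARDS. A minimal compatibility sketch (not part of this commit; the alias is hypothetical) that would keep any straggling code compiling while the rename propagates:

#define MAX_SHARDS 128
// transitional alias, illustrative only: lets code that still says
// MAX_INDEXDB_SPLIT build until it is converted to MAX_SHARDS
#define MAX_INDEXDB_SPLIT MAX_SHARDS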

@@ -31,7 +31,7 @@ void Msg0::constructor ( ) {
m_msg5b = NULL;
//#ifdef SPLIT_INDEXDB
//for ( long i = 0; i < INDEXDB_SPLIT; i++ )
//for ( long i = 0; i < MAX_INDEXDB_SPLIT; i++ )
//for ( long i = 0; i < MAX_SHARDS; i++ )
// m_mcast[i].constructor();
m_mcast.constructor();
m_mcasts = NULL;
@@ -726,8 +726,8 @@ void Msg0::gotSplitReply ( ) {
char *xx=NULL;*xx=0;
// get all the split lists
long totalSize = 0;
RdbList lists[MAX_INDEXDB_SPLIT];
RdbList *listPtrs[MAX_INDEXDB_SPLIT];
RdbList lists[MAX_SHARDS];
RdbList *listPtrs[MAX_SHARDS];
for ( long i = 0; i < m_numSplit; i++ ) {
listPtrs[i] = &lists[i];
long replySize;

Msg0.h: 2 changed lines

@@ -216,7 +216,7 @@ class Msg0 {
// used for multicasting the request
//#ifdef SPLIT_INDEXDB
//Multicast m_mcast[INDEXDB_SPLIT];
//Multicast m_mcast[MAX_INDEXDB_SPLIT];
//Multicast m_mcast[MAX_SHARDS];
// casting to multiple splits is obsolete, but for PageIndexdb.cpp
// we still need to do it, but we alloc for it
Multicast m_mcast;

@@ -65,12 +65,12 @@ class Msg36 {
//#else
// char m_reply[8];
//#endif
char m_reply[8*MAX_INDEXDB_SPLIT];
char m_reply[8*MAX_SHARDS];
// for sending the request
//#ifdef SPLIT_INDEXDB
//Multicast m_mcast[INDEXDB_SPLIT];
Multicast m_mcast[1];//MAX_INDEXDB_SPLIT];
Multicast m_mcast[1];//MAX_SHARDS];
long m_numRequests;
long m_numReplies;
long m_errno;
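
Msg36 reserves one 8-byte slot per shard in m_reply (8*MAX_SHARDS bytes). A rough sketch of how such a buffer is typically read back, assuming each slot holds one 64-bit value per shard; the helper name and signature are invented for illustration:

#include <string.h>

// illustrative only: read shard i's 8-byte slot out of the raw reply buffer.
// memcpy avoids making any alignment assumptions about the char array.
static long long getShardSlot ( char *reply , long shard ) {
	long long v;
	memcpy ( &v , reply + 8*shard , 8 );
	return v;
}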

Msg3a.cpp: 202 changed lines

@@ -26,17 +26,17 @@ void Msg3a::constructor ( ) {
m_rbuf2.constructor();
// NULLify all the reply buffer ptrs
for ( long j = 0; j < MAX_INDEXDB_SPLIT; j++ )
for ( long j = 0; j < MAX_SHARDS; j++ )
m_reply[j] = NULL;
m_rbufPtr = NULL;
for ( long j = 0; j < MAX_INDEXDB_SPLIT; j++ )
for ( long j = 0; j < MAX_SHARDS; j++ )
m_mcast[j].constructor();
m_seoCacheList.constructor();
}
Msg3a::~Msg3a ( ) {
reset();
for ( long j = 0; j < MAX_INDEXDB_SPLIT; j++ )
for ( long j = 0; j < MAX_SHARDS; j++ )
m_mcast[j].destructor();
m_seoCacheList.freeList();
}
@@ -48,12 +48,12 @@ void Msg3a::reset ( ) {
m_siteHashes26 = NULL;
// . NULLify all the reply buffer ptrs
// . have to count DOWN with "i" because of the m_reply[i-1][j] check
for ( long j = 0; j < MAX_INDEXDB_SPLIT; j++ ) {
for ( long j = 0; j < MAX_SHARDS; j++ ) {
if ( ! m_reply[j] ) continue;
mfree(m_reply[j],m_replyMaxSize[j], "Msg3aR");
m_reply[j] = NULL;
}
for ( long j = 0; j < MAX_INDEXDB_SPLIT; j++ )
for ( long j = 0; j < MAX_SHARDS; j++ )
m_mcast[j].reset();
// and the buffer that holds the final docids, etc.
if ( m_finalBuf )
@@ -89,7 +89,7 @@ static void gotCacheReplyWrapper ( void *state ) {
// . sets g_errno on error
// . "query/coll" should NOT be on the stack in case we block
// . uses Msg36 to retrieve term frequencies for each termId in query
// . sends Msg39 request to get docids from each indexdb split
// . sends Msg39 request to get docids from each indexdb shard
// . merges replies together
// . we print out debug info if debug is true
// . "maxAge"/"addToCache" is talking about the clusterdb cache as well
@@ -337,7 +337,7 @@ bool Msg3a::gotCacheReply ( ) {
}
}
// time how long to get each split's docids
// time how long to get each shard's docids
if ( m_debug )
m_startTime = gettimeofdayInMilliseconds();
@@ -483,7 +483,7 @@ bool Msg3a::gotCacheReply ( ) {
Multicast *m = &m_mcast[i];
// clear it for transmit
m->reset();
// . send out a msg39 request to each split
// . send out a msg39 request to each shard
// . multicasts to a host in group "groupId"
// . we always block waiting for the reply with a multicast
// . returns false and sets g_errno on error
@@ -532,10 +532,10 @@ bool Msg3a::gotCacheReply ( ) {
if ( m_numReplies < m_numHosts ) return false;//indexdbSplit )
// . otherwise, we did not block... error?
// . it must have been an error or just no new lists available!!
// . if we call gotAllSplitReplies() here, and we were called by
// . if we call gotAllShardReplies() here, and we were called by
// mergeLists() we end up calling mergeLists() again... bad. so
// just return true in that case.
//return gotAllSplitReplies();
//return gotAllShardReplies();
return true;
}
@@ -553,7 +553,7 @@ void gotReplyWrapper3a ( void *state , void *state2 ) {
" err=%s", (long)THIS, THIS->m_numReplies ,
mstrerror(g_errno) );
// if one split times out, ignore it!
// if one shard times out, ignore it!
if ( g_errno == EQUERYTRUNCATED ||
g_errno == EUDPTIMEDOUT )
g_errno = 0;
@@ -576,7 +576,7 @@ void gotReplyWrapper3a ( void *state , void *state2 ) {
// . sanity check
// . ntpd can screw with our local time and make this negative
if ( delta >= 0 ) {
// count the split
// count the shards
h->m_splitsDone++;
// accumulate the times so we can do an average display
// in PageHosts.cpp.
@@ -587,8 +587,8 @@ void gotReplyWrapper3a ( void *state , void *state2 ) {
THIS->m_numReplies++;
// bail if still awaiting more replies
if ( THIS->m_numReplies < THIS->m_numHosts ) return;
// return if gotAllSplitReplies() blocked
if ( ! THIS->gotAllSplitReplies( ) ) return;
// return if gotAllShardReplies() blocked
if ( ! THIS->gotAllShardReplies( ) ) return;
// set g_errno i guess so parent knows
if ( THIS->m_errno ) g_errno = THIS->m_errno;
// call callback if we did not block, since we're here. all done.
@@ -603,9 +603,9 @@ static void gotSerpdbReplyWrapper ( void *state ) {
THIS->m_callback ( THIS->m_state );
}
bool Msg3a::gotAllSplitReplies ( ) {
bool Msg3a::gotAllShardReplies ( ) {
// if any of the split requests had an error, give up and set m_errno
// if any of the shard requests had an error, give up and set m_errno
// but don't set it for non-critical errors like query truncation
if ( m_errno ) {
g_errno = m_errno;
@@ -705,23 +705,23 @@ bool Msg3a::gotAllSplitReplies ( ) {
if ( mr->m_nqt != m_q->getNumTerms() ) {
g_errno = EBADREPLY;
m_errno = EBADREPLY;
log("query: msg3a: Split reply qterms=%li != %li.",
log("query: msg3a: Shard reply qterms=%li != %li.",
(long)mr->m_nqt,(long)m_q->getNumTerms() );
return true;
}
// return if split had an error, but not for a non-critical
// return if shard had an error, but not for a non-critical
// error like query truncation
if ( mr->m_errno && mr->m_errno != EQUERYTRUNCATED ) {
g_errno = mr->m_errno;
m_errno = mr->m_errno;
log("query: msg3a: Split had error: %s",
log("query: msg3a: Shard had error: %s",
mstrerror(g_errno));
return true;
}
// skip down here if reply was already set
//skip:
// add of the total hits from each split, this is how many
// total results the lastest split is estimated to be able to
// add of the total hits from each shard, this is how many
// total results the lastest shard is estimated to be able to
// return
// . THIS should now be exact since we read all termlists
// of posdb...
@@ -732,12 +732,12 @@ bool Msg3a::gotAllSplitReplies ( ) {
// cast these for printing out
long long *docIds = (long long *)mr->ptr_docIds;
double *scores = (double *)mr->ptr_scores;
// print out every docid in this split reply
// print out every docid in this shard reply
for ( long j = 0; j < mr->m_numDocIds ; j++ ) {
// print out score_t
logf( LOG_DEBUG,
"query: msg3a: [%lu] %03li) "
"split=%li docId=%012llu domHash=0x%02lx "
"shard=%li docId=%012llu domHash=0x%02lx "
"score=%f" ,
(unsigned long)this ,
j ,
@@ -849,13 +849,13 @@ bool Msg3a::mergeLists ( ) {
// shortcut
//long numSplits = m_numHosts;//indexdbSplit;
// . point to the various docids, etc. in each split reply
// . point to the various docids, etc. in each shard reply
// . tcPtr = term count. how many required query terms does the doc
// have? formerly called topExplicits in IndexTable2.cpp
long long *diPtr [MAX_INDEXDB_SPLIT];
double *rsPtr [MAX_INDEXDB_SPLIT];
key_t *ksPtr [MAX_INDEXDB_SPLIT];
long long *diEnd [MAX_INDEXDB_SPLIT];
long long *diPtr [MAX_SHARDS];
double *rsPtr [MAX_SHARDS];
key_t *ksPtr [MAX_SHARDS];
long long *diEnd [MAX_SHARDS];
for ( long j = 0; j < m_numHosts ; j++ ) {
Msg39Reply *mr =m_reply[j];
// if we have gbdocid:| in query this could be NULL
@@ -953,7 +953,7 @@ bool Msg3a::mergeLists ( ) {
return true;
//
// ***MERGE ALL SPLITS INTO m_docIds[], etc.***
// ***MERGE ALL SHARDS INTO m_docIds[], etc.***
//
// . merge all lists in m_replyDocIds[splitNum]
// . we may be re-called later after m_docsToGet is increased
@@ -966,7 +966,7 @@ bool Msg3a::mergeLists ( ) {
//Msg39Reply *mr;
long hslot;
// get the next highest-scoring docids from all split lists
// get the next highest-scoring docids from all shard termlists
for ( long j = 0; j < m_numHosts; j++ ) {
// . skip exhausted lists
// . these both should be NULL if reply was skipped because
@@ -1026,82 +1026,84 @@ bool Msg3a::mergeLists ( ) {
// . only add it to the final list if the docid is "unique"
// . BUT since different event ids share the same docid, exception!
if ( hslot < 0 ) {
// always inc this
//m_totalDocCount++;
// only do this if we need more
if ( m_numDocIds < m_docsToGet ) {
// get DocIdScore class for this docid
Msg39Reply *mr = m_reply[maxj];
// point to the array of DocIdScores
DocIdScore *ds = (DocIdScore *)mr->ptr_scoreInfo;
long nds = mr->size_scoreInfo/sizeof(DocIdScore);
DocIdScore *dp = NULL;
for ( long i = 0 ; i < nds ; i++ ) {
if ( ds[i].m_docId != *diPtr[maxj] ) continue;
dp = &ds[i];
break;
}
// add the max to the final merged lists
m_docIds [m_numDocIds] = *diPtr[maxj];
if ( hslot >= 0 ) goto skip; // < 0 ) {
// wtf?
if ( ! dp ) {
// this is empty if no scoring info
// supplied!
if ( m_r->m_getDocIdScoringInfo )
log("msg3a: CRAP! got empty score "
"info for "
"d=%lli",
m_docIds[m_numDocIds]);
//char *xx=NULL; *xx=0; 261561804684
// qry = www.yahoo
}
// point to the single DocIdScore for this docid
m_scoreInfos[m_numDocIds] = dp;
// always inc this
//m_totalDocCount++;
// only do this if we need more
if ( m_numDocIds < m_docsToGet ) {
// get DocIdScore class for this docid
Msg39Reply *mr = m_reply[maxj];
// point to the array of DocIdScores
DocIdScore *ds = (DocIdScore *)mr->ptr_scoreInfo;
long nds = mr->size_scoreInfo/sizeof(DocIdScore);
DocIdScore *dp = NULL;
for ( long i = 0 ; i < nds ; i++ ) {
if ( ds[i].m_docId != *diPtr[maxj] ) continue;
dp = &ds[i];
break;
}
// add the max to the final merged lists
m_docIds [m_numDocIds] = *diPtr[maxj];
// reset this just in case
if ( dp ) {
dp->m_singleScores = NULL;
dp->m_pairScores = NULL;
}
// wtf?
if ( ! dp ) {
// this is empty if no scoring info
// supplied!
if ( m_r->m_getDocIdScoringInfo )
log("msg3a: CRAP! got empty score "
"info for "
"d=%lli",
m_docIds[m_numDocIds]);
//char *xx=NULL; *xx=0; 261561804684
// qry = www.yahoo
}
// point to the single DocIdScore for this docid
m_scoreInfos[m_numDocIds] = dp;
// now fix DocIdScore::m_pairScores and m_singleScores
// ptrs so they reference into the
// Msg39Reply::ptr_pairScoreBuf and ptr_singleSingleBuf
// like they should. it seems we do not free the
// Msg39Replies so we should be ok referencing them.
if ( dp && dp->m_singlesOffset >= 0 )
dp->m_singleScores =
(SingleScore *)(mr->ptr_singleScoreBuf+
dp->m_singlesOffset) ;
if ( dp && dp->m_pairsOffset >= 0 )
dp->m_pairScores =
(PairScore *)(mr->ptr_pairScoreBuf +
dp->m_pairsOffset );
// reset this just in case
if ( dp ) {
dp->m_singleScores = NULL;
dp->m_pairScores = NULL;
}
// now fix DocIdScore::m_pairScores and m_singleScores
// ptrs so they reference into the
// Msg39Reply::ptr_pairScoreBuf and ptr_singleSingleBuf
// like they should. it seems we do not free the
// Msg39Replies so we should be ok referencing them.
if ( dp && dp->m_singlesOffset >= 0 )
dp->m_singleScores =
(SingleScore *)(mr->ptr_singleScoreBuf+
dp->m_singlesOffset) ;
if ( dp && dp->m_pairsOffset >= 0 )
dp->m_pairScores =
(PairScore *)(mr->ptr_pairScoreBuf +
dp->m_pairsOffset );
// turn it into a float, that is what rscore_t is.
// we do this to make it easier for PostQueryRerank.cpp
m_scores [m_numDocIds]=(double)*rsPtr[maxj];
if ( m_r->m_doSiteClustering )
m_clusterRecs[m_numDocIds]= *ksPtr[maxj];
// clear this out
//m_eventIdBits[m_numDocIds].clear();
// set this for use below
hslot = m_numDocIds;
// point to next available slot to add to
m_numDocIds++;
}
// if it has ALL the required query terms, count it
//if ( *bsPtr[maxj] & 0x60 ) m_numAbove++;
// . add it, this should be pre-allocated!
// . returns false and sets g_errno on error
if ( ! htable.addKey(*diPtr[maxj],1) ) return true;
// turn it into a float, that is what rscore_t is.
// we do this to make it easier for PostQueryRerank.cpp
m_scores [m_numDocIds]=(double)*rsPtr[maxj];
if ( m_r->m_doSiteClustering )
m_clusterRecs[m_numDocIds]= *ksPtr[maxj];
// clear this out
//m_eventIdBits[m_numDocIds].clear();
// set this for use below
hslot = m_numDocIds;
// point to next available slot to add to
m_numDocIds++;
}
// if it has ALL the required query terms, count it
//if ( *bsPtr[maxj] & 0x60 ) m_numAbove++;
// . add it, this should be pre-allocated!
// . returns false and sets g_errno on error
if ( ! htable.addKey(*diPtr[maxj],1) ) return true;
skip:
// increment the split pointers from which we took the max
// increment the shard pointers from which we took the max
rsPtr[maxj]++;
diPtr[maxj]++;
ksPtr[maxj]++;
@@ -1113,7 +1115,7 @@ bool Msg3a::mergeLists ( ) {
if ( m_debug ) {
// show how long it took
logf( LOG_DEBUG,"query: msg3a: [%lu] merged %li docs from %li "
"splits in %llu ms. "
"shards in %llu ms. "
,
(unsigned long)this,
m_numDocIds, (long)m_numHosts,
@@ -1138,7 +1140,7 @@ bool Msg3a::mergeLists ( ) {
}
// if we had a full split, we should have gotten the cluster recs
// from each split already
// from each shard already
memset ( m_clusterLevels , CR_OK , m_numDocIds );
return true;
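
mergeLists() above walks one docid/score/key cursor per shard (diPtr, rsPtr, ksPtr) and repeatedly pulls whichever shard's current head scores highest, deduping on docid. A stripped-down sketch of that selection step, using simplified hypothetical types rather than the real Msg39Reply plumbing:

// illustrative k-way selection over per-shard result cursors
struct ShardCursor {
	long long *docIds;  // this shard's docids, sorted by descending score
	double    *scores;  // parallel array of scores
	long       count;   // how many results the shard returned
	long       pos;     // next unconsumed entry
};

// return the shard whose current head has the best score, or -1 if all
// cursors are exhausted; the caller consumes that entry and advances pos
static long pickBestShard ( ShardCursor *c , long numShards ) {
	long best = -1;
	for ( long j = 0 ; j < numShards ; j++ ) {
		if ( c[j].pos >= c[j].count ) continue;
		if ( best == -1 ||
		     c[j].scores[c[j].pos] > c[best].scores[c[best].pos] )
			best = j;
	}
	return best;
}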

Msg3a.h: 10 changed lines

@@ -80,7 +80,7 @@ public:
return m_numTotalEstimatedHits; };
// called when we got a reply of docIds
bool gotAllSplitReplies ( );
bool gotAllShardReplies ( );
bool gotCacheReply ( );
@@ -135,13 +135,13 @@ public:
float m_termFreqWeights[MAX_QUERY_TERMS];
// a multicast class to send the request, one for each split
Multicast m_mcast[MAX_INDEXDB_SPLIT];
Multicast m_mcast[MAX_SHARDS];
// for timing how long things take
long long m_startTime;
// this buffer should be big enough to hold all requests
//char m_request [MAX_MSG39_REQUEST_SIZE * MAX_INDEXDB_SPLIT];
//char m_request [MAX_MSG39_REQUEST_SIZE * MAX_SHARDS];
long m_numReplies;
// . # estimated total hits
@@ -157,8 +157,8 @@ public:
SafeBuf m_rbuf2;
// each split gives us a reply
class Msg39Reply *m_reply [MAX_INDEXDB_SPLIT];
long m_replyMaxSize[MAX_INDEXDB_SPLIT];
class Msg39Reply *m_reply [MAX_SHARDS];
long m_replyMaxSize[MAX_SHARDS];
char m_debug;

Msg40.cpp: 108 changed lines

@@ -1592,6 +1592,7 @@ bool Msg40::gotSummary ( ) {
for ( ; m_si && m_si->m_streamResults&&m_printi<m_msg3a.m_numDocIds ;
m_printi++){
// if we are waiting on our previous send to complete... wait..
if ( m_sendsOut > m_sendsIn ) break;
@@ -1661,13 +1662,23 @@ bool Msg40::gotSummary ( ) {
mr->m_contentHash32 &&
m_dedupTable.isInTable ( &mr->m_contentHash32 ) ) {
//if ( g_conf.m_logDebugQuery )
log("msg40: dup sum #%li (%lu)",m_printi,
mr->m_contentHash32);
log("msg40: dup sum #%li (%lu)(d=%lli)",m_printi,
mr->m_contentHash32,mr->m_docId);
// make it available to be reused
m20->reset();
continue;
}
// static long s_bs = 0;
// if ( (s_bs++ % 5) != 0 ) {
// log("msg40: FAKE dup sum #%li (%lu)(d=%lli)",m_printi,
// mr->m_contentHash32,mr->m_docId);
// // make it available to be reused
// m20->reset();
// continue;
// }
// return true with g_errno set on error
if ( m_si && m_si->m_doDupContentRemoval && // &dr=1
mr->m_contentHash32 &&
@@ -1679,22 +1690,25 @@ bool Msg40::gotSummary ( ) {
// assume we show this to the user
m_numDisplayed++;
//log("msg40: numdisplayed=%li",m_numDisplayed);
// do not print it if before the &s=X start position though
if ( m_si && m_numDisplayed <= m_si->m_firstResultNum ){
log("msg40: hiding #%li (%lu)",
m_printi,mr->m_contentHash32);
log("msg40: hiding #%li (%lu)(d=%lli)",
m_printi,mr->m_contentHash32,mr->m_docId);
m20->reset();
continue;
}
log("msg40: printing #%li (%lu)",m_printi,mr->m_contentHash32);
log("msg40: printing #%li (%lu)(d=%lli)",
m_printi,mr->m_contentHash32,mr->m_docId);
// . ok, we got it, so print it and stream it
// . this might set m_hadPrintError to true
printSearchResult9 ( m_printi , m_numPrintedSoFar );
m_numPrintedSoFar++;
//log("msg40: printedsofar=%li",m_numPrintedSoFar);
// now free the reply to save memory since we could be
// streaming back 1M+. we call reset below, no need for this.
@@ -1709,6 +1723,56 @@ bool Msg40::gotSummary ( ) {
if ( m_si->m_streamResults )
st->m_socket->m_streamingMode = true;
// if streaming results, and too many results were clustered or
// deduped then try to get more by merging the docid lists that
// we already have from the shards. if this still does not provide
// enough docids then we will need to issue a new msg39 request to
// each shard to get even more docids from each shard.
if ( m_si && m_si->m_streamResults &&
// must have no streamed chunk sends out
m_sendsOut == m_sendsIn &&
// if we did not ask for enough docids and they were mostly
// dups so they got deduped, then ask for more.
// m_numDisplayed includes results before the &s=X parm.
// and so does m_docsToGetVisible, so we can compare them.
m_numDisplayed < m_docsToGetVisible &&
// wait for us to have exhausted the docids we have merged
m_printi >= m_msg3a.m_numDocIds &&
// wait for us to have available msg20s to get summaries
m_numReplies == m_numRequests &&
// this is true if we can get more docids from merging
// more of the termlists from the shards together.
// otherwise, we will have to ask each shard for a
// higher number of docids.
m_msg3a.m_moreDocIdsAvail &&
// do not do this if client closed connection
! m_socketHadError ) { //&&
// doesn't work on multi-coll just yet, it cores.
// MAKE it.
//m_numCollsToSearch == 1 ) {
// can it cover us?
long need = m_msg3a.m_docsToGet + 20;
// note it
log("msg40: too many summaries deduped. "
"getting more "
"docids from msg3a merge and getting summaries. "
"%li are visible, need %li. "
"changing docsToGet from %li to %li. "
"numReplies=%li numRequests=%li",
m_numDisplayed,
m_docsToGetVisible,
m_msg3a.m_docsToGet,
need,
m_numReplies,
m_numRequests);
// merge more docids from the shards' termlists
m_msg3a.m_docsToGet = need;
// this should increase m_msg3a.m_numDocIds
m_msg3a.mergeLists();
}
// . wrap it up with Next 10 etc.
// . this is in PageResults.cpp
if ( m_si && m_si->m_streamResults && ! m_printedTail &&
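
The new block above bumps m_docsToGet and re-runs the Msg3a merge when dedup and clustering ate too many results during streaming, before paying for a fresh Msg39 round trip to every shard. The gating logic, reduced to a self-contained sketch with hypothetical parameter names:

// illustrative only: should a streaming search re-merge the docid lists it
// already holds before asking every shard for more results?
static bool shouldRemerge ( long displayed , long wanted ,
			    bool moreDocIdsAvail , bool socketHadError ) {
	if ( displayed >= wanted ) return false; // enough visible results
	if ( ! moreDocIdsAvail   ) return false; // merge cannot yield more
	if ( socketHadError      ) return false; // client went away
	return true;                             // cheap re-merge is worth it
}
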
@@ -5196,27 +5260,23 @@ bool Msg40::printSearchResult9 ( long ix , long numPrintedSoFar ) {
// then print each result
// don't display more than docsWanted results
if ( m_numPrinted >= msg40->getDocsWanted() ) return true;
// prints in xml or html
if ( m_numPrinted < msg40->getDocsWanted() ) {
if ( m_si->m_format == FORMAT_CSV ) {
printJsonItemInCSV ( st , ix );
//log("print: printing #%li csv",(long)ix);
}
// print that out into st->m_sb safebuf
else if ( ! printResult ( st , ix , numPrintedSoFar ) ) {
// oom?
if ( ! g_errno ) g_errno = EBADENGINEER;
log("query: had error: %s",mstrerror(g_errno));
m_hadPrintError = true;
}
// count it
m_numPrinted++;
if ( m_si->m_format == FORMAT_CSV ) {
printJsonItemInCSV ( st , ix );
//log("print: printing #%li csv",(long)ix);
}
// print that out into st->m_sb safebuf
else if ( ! printResult ( st , ix , numPrintedSoFar ) ) {
// oom?
if ( ! g_errno ) g_errno = EBADENGINEER;
log("query: had error: %s",mstrerror(g_errno));
m_hadPrintError = true;
}
// count it
m_numPrinted++;
return true;
}

@@ -1537,7 +1537,9 @@ void Msg5::repairLists_r ( ) {
if ( i < nn && base ) {
long fn = m_msg3.m_fileNums[i];
BigFile *bf = base->getFile ( fn );
log("db: Corrupt filename is %s.",bf->getFilename());
log("db: Corrupt filename is %s in collnum %li."
,bf->getFilename()
,(long)m_collnum);
//key_t sk = m_listPtrs[i]->getStartKey();
//key_t ek = m_listPtrs[i]->getEndKey ();
//log("db: "
@@ -1553,10 +1555,10 @@ void Msg5::repairLists_r ( ) {
}
// . remove the bad eggs from the list
// . TODO: support non-fixed data sizes
if ( m_listPtrs[i]->getFixedDataSize() >= 0 )
m_listPtrs[i]->removeBadData_r();
else
m_listPtrs[i]->reset();
//if ( m_listPtrs[i]->getFixedDataSize() >= 0 )
m_listPtrs[i]->removeBadData_r();
//else
//m_listPtrs[i]->reset();
// otherwise we have a patchable error
m_hadCorruption = true;
// don't add a list with errors to cache, please

@@ -1091,7 +1091,14 @@ bool RdbList::removeBadData_r ( ) {
// . if not fixed size, remove all the data for now
// . TODO: make this better, man
if ( m_fixedDataSize == -1 ) {
reset();
// don't call reset because it sets m_ks back to 12
//reset();
m_listSize = 0;
m_list = NULL;
m_listPtr = NULL;
m_listEnd = NULL;
m_mergeMinListSize = -1;
m_lastKeyIsValid = false;
return true;
}
//key_t oldk;
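
The variable-width branch above now clears the list fields by hand because reset() would also knock the key size (m_ks) back to 12 bytes. The same idea as a hypothetical helper, not present in the source, that empties an RdbList while leaving its geometry alone:

// illustrative sketch: drop the list contents but keep m_ks and other
// layout settings intact, unlike RdbList::reset()
void RdbList::clearData ( ) {
	m_listSize         = 0;
	m_list             = NULL;
	m_listPtr          = NULL;
	m_listEnd          = NULL;
	m_mergeMinListSize = -1;
	m_lastKeyIsValid   = false;
}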

@@ -927,11 +927,11 @@ if (! cr->hasSearchPermission ( sock, encapIp ) ) {
if(m_firstResultNum < 0) m_firstResultNum = 0;
// DEBUG: temp hack
static bool first = true;
if ( first ) {
first = false;
m_firstResultNum = 1;
}
//static bool first = true;
//if ( first ) {
// first = false;
// m_firstResultNum = 1;
//}
// if useCache is -1 then pick a default value
if ( m_useCache == -1 ) {

@@ -14666,7 +14666,22 @@ char **XmlDoc::gotHttpReply ( ) {
// sanity check -- check after bailing on corruption because
// corrupted replies do not end in NULLs
if ( m_httpReplySize > 0 && m_httpReply[m_httpReplySize-1] ) {
char *xx=NULL;*xx=0; }
log("http: httpReplySize=%li http reply does not end in \\0 "
"for %s in collnum=%li. blanking out reply."
,m_httpReplySize
,m_firstUrl.m_url
,(long)m_collnum
);
// free it i guess
mfree ( m_httpReply, m_httpReplyAllocSize, "XmlDocHR" );
// and reset it
m_httpReplySize = 0;
m_httpReply = NULL;
m_httpReplyAllocSize = 0;
// call it data corruption i guess for now
g_errno = ECORRUPTDATA;
//char *xx=NULL;*xx=0;
}
// if its a bad gzip reply, a compressed http reply, then
// make the whole thing empty? some websites return compressed replies
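
Instead of the old hard abort (*xx=0), a reply that does not end in a NUL byte is now logged, freed and reported as ECORRUPTDATA so the spider can carry on. The recover-instead-of-crash shape, boiled down to a self-contained sketch with invented names:

// illustrative only: flag a non-NUL-terminated HTTP reply as corrupt data
// rather than asserting, so one bad document cannot take the process down
static bool replyLooksValid ( const char *reply , long size ) {
	if ( size > 0 && reply[size-1] != '\0' ) return false;
	return true;
}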

@@ -4650,28 +4650,31 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
// don't copy to ourselves
//if ( h2->m_hostId == h->m_hostId ) continue;
sprintf(tmp,
"rcp %sgb.conf %s:%sgb.conf &",
"scp %sgb.conf %shosts.conf %s:%s %s",
dir ,
dir ,
//h->m_hostId ,
iptoa(h2->m_ip),
h2->m_dir);
h2->m_dir,
//h2->m_hostId);
amp);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"rcp %shosts.conf %s:%shosts.conf &",
dir ,
iptoa(h2->m_ip),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"rcp %shosts2.conf %s:%shosts2.conf &",
dir ,
iptoa(h2->m_ip),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
// sprintf(tmp,
// "scp %shosts.conf %s:%shosts.conf &",
// dir ,
// iptoa(h2->m_ip),
// h2->m_dir);
// log(LOG_INIT,"admin: %s", tmp);
// system ( tmp );
// sprintf(tmp,
// "scp %shosts2.conf %s:%shosts2.conf &",
// dir ,
// iptoa(h2->m_ip),
// h2->m_dir);
// log(LOG_INIT,"admin: %s", tmp);
// system ( tmp );
}
else if ( installFlag == ifk_start ) {
// . save old log now, too
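
The install path now copies gb.conf and hosts.conf to each target host with one scp command instead of three background rcp calls (hosts2.conf is no longer pushed). Roughly what the new sprintf produces for one host; the directory and IP below are invented for illustration:

#include <stdio.h>

// e.g. "scp /a/gb.conf /a/hosts.conf 10.5.0.2:/a/ &"
int main ( ) {
	char tmp[1024];
	snprintf ( tmp , sizeof(tmp) ,
		   "scp %sgb.conf %shosts.conf %s:%s %s" ,
		   "/a/" , "/a/" , "10.5.0.2" , "/a/" , "&" );
	printf ( "%s\n" , tmp );
	return 0;
}
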
@@ -16886,8 +16889,8 @@ bool isRecoveryFutile ( ) {
// get time stamp
long timestamp = ff.getLastModifiedTime ( );
// skip if not iwthin last minute
if ( timestamp < now - 60 ) continue;
// skip if not within 2 minutes
if ( timestamp < now - 2*60 ) continue;
// open it up to see if ends with sighandle
long toRead = 3000;