#include "gb-include.h"
|
|
|
|
#include "Msg51.h"
|
|
//#include "Msg24.h"
|
|
#include "Query.h"
|
|
#include "Msg20.h"
|
|
//#include "TermTable.h"
|
|
#include "Words.h"
|
|
#include "Speller.h"
|
|
#include <math.h>
|
|
#include "StopWords.h"
|
|
#include "HashTable.h"
|
|
#include "Clusterdb.h"
|
|
#include "Scores.h"
|
|
#include "Stats.h"
|
|
#include "Words.h"

// here's the knobs:

// sample radius in chars around each query term : 600  (line 212)
// max sample size, all excerpts, per document   : 100k (line 213)
// map from distance to query term in words to score:   (line 855)
// map from popularity to score weight :                (lines 950 et al)
// the comments above are way out of date (aac, Jan 2008)
//
// QPOP multiplier params
#define QPOP_ZONE_0 10
#define QPOP_ZONE_1 30
#define QPOP_ZONE_2 80
#define QPOP_ZONE_3 100
#define QPOP_ZONE_4 300
#define QPOP_MULT_0 10
#define QPOP_MULT_1 8
#define QPOP_MULT_2 6
#define QPOP_MULT_3 4
#define QPOP_MULT_4 2

// QTR scoring params
#define MAX_SCORE_MULTIPLIER 3000  // orig: 3000
#define ALT_MAX_SCORE       12000  // orig: 12000
#define ALT_START_SCORE      1000
#define QTR_ZONE_0  4
#define QTR_ZONE_1  8
#define QTR_ZONE_2 12
#define QTR_ZONE_3 20
#define QTR_BONUS_0 1000
#define QTR_BONUS_1  800
#define QTR_BONUS_2  500
#define QTR_BONUS_3  200
#define QTR_BONUS_CW   1
#define MULTIPLE_HIT_BOOST 1000    // orig: 1000

// gigabit phrase scoring params
#define SPARSE_MARK    0.34
#define SPARSE_PENALTY 1000
#define FWC_PENALTY     500 // penalty for beginning with a common word
#define POP_ZONE_0 0.00001
#define POP_ZONE_1 0.0001
#define POP_ZONE_2 0.001
#define POP_ZONE_3 0.01
#define POP_BOOST_0 3.0
#define POP_BOOST_1 1.5
#define POP_BOOST_2 1.0
#define POP_BOOST_3 0.3
#define POP_BOOST_4 0.1
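
// A sketch of how the tables above are presumably applied (the scoring
// code that reads them is not in this file): the *_ZONE_* defines look
// like ascending thresholds and the parallel *_MULT_*/*_BOOST_* defines
// the multiplier for each zone. E.g. a term whose popularity fraction
// falls below POP_ZONE_0 would presumably get the POP_BOOST_0 (3.0)
// weight and one above POP_ZONE_3 the POP_BOOST_4 (0.1) weight, so
// rare terms are boosted and very common ones are damped.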

//static bool onSamePages ( int32_t i , int32_t j , int32_t *slots ,
//			    int32_t *heads , int32_t *pages );

static void handleRequest24 ( UdpSlot *slot , int32_t netnice ) ;

static void setRepeatScores ( char    *repeatScores ,
			      int64_t *wids ,
			      int32_t  nw ,
			      char    *repeatTable ,
			      int32_t  repeatTableNumSlots ,
			      Words   *words ) ;

Msg24::Msg24 ( ) {
	m_numTopics = 0;
	m_request   = NULL;
	m_reply     = NULL;

	m_topicPtrs      = NULL;
	m_topicLens      = NULL;
	m_topicScores    = NULL;
	m_topicGids      = NULL;
	m_topicPops      = NULL;
	m_topicDocIds    = NULL;
	m_topicNumDocIds = NULL;
	m_isUnicode      = false;
}

Msg24::~Msg24 ( ) { reset(); }

void Msg24::reset ( ) {
	if ( m_request && m_request != m_requestBuf )
		mfree ( m_request , m_requestSize , "Msg24" );
	m_request = NULL;
	// free reply if we should
	if ( m_reply ) mfree ( m_reply , m_replySize , "Msg24" );
	m_reply = NULL;
	m_isUnicode = false;
}

bool Msg24::registerHandler ( ) {
	// . register ourselves with the udp server
	// . it calls our callback when it receives a msg of type 0x24
	if ( ! g_udpServer.registerHandler ( 0x24, handleRequest24 ))
		return false;
	return true;
}

static void gotReplyWrapper24 ( void *state1 , void *state2 ) ;

bool Msg24::generateTopics ( char       *coll ,
			     int32_t     collLen ,
			     char       *query ,
			     int32_t     queryLen ,
			     //float     termFreqWeights ,
			     //float     phraseAffWeights ,
			     int64_t    *docIds ,
			     char       *clusterLevels ,
			     int32_t     numDocIds ,
			     TopicGroup *topicGroups ,
			     int32_t     numTopicGroups ,
			     //int32_t   docsToScanForTopics ,
			     //int32_t   minTopicScore ,
			     //int32_t   maxTopics ,
			     //int32_t   maxWordsPerPhrase ,
			     int32_t     maxCacheAge ,
			     bool        addToCache ,
			     bool        returnDocIdCount ,
			     bool        returnDocIds ,
			     bool        returnPops ,
			     void       *state ,
			     void      (* callback) (void *state ),
			     int32_t     niceness ) {
	// force it to be true, since hi bit is set in pops if topic is unicode
	returnPops = true;
	// warning
	if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg24.");
	// force it
	returnDocIdCount = true;
	// if we don't get docids, then deserialize doesn't work because it
	// expects the docids to be valid.
	returnDocIds = true;
	// reset
	m_numTopics = 0;
	//m_docsToScanForTopics = docsToScanForTopics;
	//m_minTopicScore       = minTopicScore;
	//m_maxTopics           = maxTopics;
	m_numDocIds        = numDocIds;
	m_coll             = coll;
	m_collLen          = collLen;
	m_returnDocIdCount = returnDocIdCount;
	m_returnDocIds     = returnDocIds;
	m_returnPops       = returnPops;
	// bail if no operations to do
	if ( numTopicGroups <= 0 ) return true;
	if ( numDocIds      <= 0 ) return true;

	int32_t numTopicsToGen = topicGroups->m_numTopics;
	// get the min we have to scan
	int32_t docsToScanForTopics = topicGroups[0].m_docsToScanForTopics;

	for ( int32_t i = 1 ; i < numTopicGroups ; i++ ) {
		int32_t x = topicGroups[i].m_docsToScanForTopics ;
		if ( x > docsToScanForTopics ) docsToScanForTopics = x;

		if ( topicGroups[i].m_numTopics > numTopicsToGen )
			numTopicsToGen = topicGroups[i].m_numTopics;
	}
	// bail if none
	if ( docsToScanForTopics <= 0 ) return true;
	if ( numTopicsToGen      == 0 ) return true;

	m_state    = state;
	m_callback = callback;

	m_startTime = gettimeofdayInMilliseconds();

	// save, caller should not delete this!
	m_topicGroups    = topicGroups;
	m_numTopicGroups = numTopicGroups;
	// truncate
	//if ( maxTopics > MAX_TOPICS ) maxTopics = MAX_TOPICS;
	// truncate
	//if ( numDocIds > MAX_DOCIDS_TO_SCAN )
	//	numDocIds = MAX_DOCIDS_TO_SCAN ;
	if ( numDocIds > docsToScanForTopics )
		numDocIds = docsToScanForTopics ;

	int32_t size = sizeof(TopicGroup) * numTopicGroups ;
	if ( queryLen > MAX_QUERY_LEN ) queryLen = MAX_QUERY_LEN;

	// how much space do we need?
	int32_t need = 4+4+4+size+
		queryLen+1+
		numDocIds*8 +
		numDocIds +collLen+1 + sizeof(niceness);
	m_requestSize = need;

	// make enough room for the request
	if ( need < MSG24_REQUEST_SIZE ) m_request = m_requestBuf;
	else {
		m_request = (char *)mmalloc ( need , "Msg24a" );
		if ( ! m_request ) {
			log("topics: Failed to allocate %"INT32" bytes.",need);
			return true;
		}
	}

	char *p = m_request;
	// store the cache parms
	*(int32_t *)p = maxCacheAge      ; p += 4;
	*(char    *)p = addToCache       ; p += 1;
	*(char    *)p = returnDocIdCount ; p += 1;
	*(char    *)p = returnDocIds     ; p += 1;
	*(char    *)p = returnPops       ; p += 1;
	*(int32_t *)p = niceness         ; p += sizeof(int32_t);
	// store minTopicScore
	//*(int32_t *)p = minTopicScore     ; p += 4;
	//*(int32_t *)p = maxTopics         ; p += 4;
	//*(int32_t *)p = maxWordsPerPhrase ; p += 4;
	// store topic group information
	*(int32_t *)p = numTopicGroups; p += 4;
	gbmemcpy ( p , topicGroups , size ); p += size;
	// then coll
	gbmemcpy ( p , coll , collLen ); p += collLen ;
	*p++ = '\0';
	// then query
	gbmemcpy ( p , query , queryLen ); p += queryLen;
	*p++ = '\0';
	// then docids
	gbmemcpy ( p , docIds , numDocIds * 8 ); p += numDocIds * 8;
	// then cluster levels
	gbmemcpy ( p , clusterLevels , numDocIds ); p += numDocIds ;
	// how big is it?
	//m_requestSize = p - m_request;
	// sanity check
	//if ( m_requestSize > 5+MAX_QUERY_LEN + 1 + MAX_DOCIDS_TO_SCAN * 9){
	//	char *xx = NULL ; *xx = 0; }
	if ( p - m_request != m_requestSize ) {
		log("Bad msg24 request size");
		char *xx = NULL ; *xx = 0;
	}
	// . the groupId to handle... just pick randomly
	int32_t groupId = ((uint32_t)docIds[0]) & g_hostdb.m_groupMask;
	// . returns false and sets g_errno on error
	// . reply should be stored in UdpSlot::m_tmpBuf
	if ( ! m_mcast.send ( m_request ,
			      m_requestSize ,
			      0x24               , // msgType 0x24
			      false              , // does m_mcast own m_request?
			      groupId            , // send to group (groupKey)
			      false              , // send to whole group?
			      (int32_t)docIds[0] , // key is lower bits of docId
			      this               , // state data
			      NULL               , // state data
			      gotReplyWrapper24  ,
			      30                 , // 30 second timeout
			      niceness           , // niceness
			      false              , // realtime?
			      -1                 , // first hostid
			      NULL               , //m_reply, store reply in here
			      0                  , //MAX_REPLY_LEN, how big it can be
			      false              , // free reply buf?
			      false              , // do disk load balancing?
			      0                  , // maxCacheAge
			      (key_t)0           , // cacheKey
			      RDB_NONE           , // TITLEDB // rdbId of titledb
			      0                  ) ){ // minRecSizes avg
		log("topics: Had error sending request for topics to host in "
		    "group #%"INT32": %s.",groupId,mstrerror(g_errno));
		return true;
	}
	// otherwise, we blocked and gotReplyWrapper will be called
	return false;
}
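
// For reference, the serialized 0x24 request built above is laid out as:
//   int32_t maxCacheAge , char addToCache , char returnDocIdCount ,
//   char returnDocIds , char returnPops , int32_t niceness ,
//   int32_t numTopicGroups , TopicGroup[numTopicGroups] ,
//   coll '\0' , query '\0' (at most MAX_QUERY_LEN bytes) ,
//   int64_t docIds[numDocIds] , char clusterLevels[numDocIds]
// handleRequest24() below recovers numDocIds from the trailing bytes as
// (requestEnd - p) / 9, since each docid contributes 8 bytes plus one
// cluster level byte.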

void gotReplyWrapper24 ( void *state1 , void *state2 ) {
	Msg24 *THIS = (Msg24 *)state1;
	THIS->gotReply();
	THIS->m_callback ( THIS->m_state );
}

void Msg24::gotReply ( ) {
	// bail on error, multicast will free the reply buffer if it should
	if ( g_errno ) {
		log("topics: Had error getting topics: %s.",
		    mstrerror(g_errno));
		return;
	}
	// get the reply
	int32_t maxSize ;
	bool    freeIt  ;
	m_reply = m_mcast.getBestReply (&m_replySize, &maxSize, &freeIt);
	relabel( m_reply, m_replySize, "Msg24-GBR" );
	// sanity check
	//if ( reply != m_reply ) { char *xx = NULL ; *xx = 0 ; }
	// . parse the reply, it should be our m_reply buffer
	// . topics are NULL terminated
	deserialize ( m_reply , m_replySize );

	int64_t now = gettimeofdayInMilliseconds();
	g_stats.addStat_r ( 0 ,
			    m_startTime ,
			    now ,
			    "get_gigabits" ,
			    0x00d1e1ff ,
			    STAT_QUERY );
	/*
	int32_t i = 0;
	while ( p < pend && i < MAX_TOPICS ) {
		m_topicScores[i] = *(int32_t *)p ; p += 4;
		m_topicLens  [i] = *(int32_t *)p ; p += 4;
		m_topicGids  [i] = *(char    *)p ; p += 1;
		m_topicPtrs  [i] = p ; p += m_topicLens[i] + 1;
		i++;
	}
	m_numTopics = i;
	*/
}

// if this is too big we can run out of sockets to use to launch
#define MAX_OUTSTANDING 50
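
// Note: each reply that arrives via gotSampleWrapper() below bumps
// m_numReplies and calls launchMsg20s() again, so the number of Msg20
// requests in flight stays capped at MAX_OUTSTANDING until all docids
// have been requested.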

State24::State24 ( ) {
	m_msg20  = NULL;
	m_mem    = NULL;
	m_memPtr = NULL;
	m_memEnd = NULL;
}

State24::~State24 ( ) {
	// note: if m_msg20 is the static m_buf20 we return before freeing
	// m_mem below; this appears to rely on m_mem being freed elsewhere
	// (getTopics()'s error paths do free it)
	if ( m_msg20 == m_buf20 ) return;
	for ( int32_t i = 0 ; i < m_numDocIds ; i++ ) m_msg20[i].destructor();
	mfree ( m_msg20 , sizeof(Msg20) * m_numDocIds , "Msg24" );
	m_msg20 = NULL;
	if ( m_mem ) {
		mfree ( m_mem, m_memEnd - m_mem, "Msg24" );
		m_mem    = NULL;
		m_memEnd = NULL;
		m_memPtr = NULL;
	}
}

static void launchMsg20s     ( State24 *st, bool callsample, int32_t sampleSize );
static void gotSampleWrapper ( void *state ) ;

void handleRequest24 ( UdpSlot *slot , int32_t netnice ) {
	// if niceness is 0, use the higher priority udpServer
	UdpServer *us = &g_udpServer;
	//if ( niceness == 0 ) us = &g_udpServer2;
	// make the state
	State24 *st ;
	try { st = new (State24); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("topics: Could not allocate %i bytes for generating "
		    "topics. Replying with error.",(int)sizeof(State24));
		us->sendErrorReply ( slot , EBADREQUESTSIZE );
		return;
	}
	mnew ( st , sizeof(State24) , "Msg24b" );
	// get the request
	char   *request     = slot->m_readBuf;
	int32_t requestSize = slot->m_readBufSize;
	char   *requestEnd  = request + requestSize;
	// parse the request
	char *p = request;
	// get cache parms
	//int32_t maxCacheAge = *(int32_t *)p ; p += 4;
	//char    addToCache  = *(char    *)p ; p += 1;
	st->m_maxCacheAge      = *(int32_t *)p ; p += 4;
	st->m_addToCache       = *(char    *)p ; p += 1;
	st->m_returnDocIdCount = *(char    *)p ; p += 1;
	st->m_returnDocIds     = *(char    *)p ; p += 1;
	st->m_returnPops       = *(char    *)p ; p += 1;
	st->m_niceness         = *(int32_t *)p ; p += sizeof(int32_t);
	// first is minTopicScore
	//int32_t minTopicScore = *(int32_t *)p ; p += 4;
	// until we roll to all hosts, let's keep the protocol standard
	//int32_t maxTopics         = *(int32_t *)p ; p += 4;
	//int32_t maxWordsPerPhrase = *(int32_t *)p ; p += 4;
	//int32_t maxTopics         = 100;
	//int32_t maxWordsPerPhrase = 6;
	//st->m_minTopicScore     = minTopicScore;
	//st->m_maxTopics         = maxTopics;
	//st->m_maxWordsPerPhrase = maxWordsPerPhrase;
	// get topic group information
	st->m_numTopicGroups = *(int32_t *)p ; p += 4;
	int32_t size = sizeof(TopicGroup) * st->m_numTopicGroups ;
	gbmemcpy ( st->m_topicGroups , p , size ); p += size;
	// then coll
	st->m_coll = p; p += strlen(p) + 1;
	// . then the query, a NULL terminated string
	// . store it in state
	int32_t qlen = strlen ( p );
	if ( qlen > MAX_QUERY_LEN ) qlen = MAX_QUERY_LEN;
	gbmemcpy ( st->m_query , p , qlen );
	st->m_query [ qlen ] = '\0';
	st->m_queryLen = qlen;
	p += qlen + 1;
	// then the docids
	//int64_t *docIds   = (int64_t *)p;
	//int32_t  numDocIds = (requestEnd - p) / 9;
	//p += numDocIds * 8;
	// cluster levels
	//char *clusterLevels = p;
	st->m_docIds    = (int64_t *)p;
	st->m_numDocIds = (requestEnd - p) / 9;
	p += st->m_numDocIds * 8;
	// cluster levels
	st->m_clusterLevels = p;

	// truncate
	//if ( numDocIds > MAX_DOCIDS_TO_SCAN )
	//	numDocIds = MAX_DOCIDS_TO_SCAN ;
	// see if anyone blocks at all
	//bool noBlock = true;
	// we haven't got any responses as of yet or sent any requests
	st->m_slot = slot;
	//st->m_niceness = 0; // niceness;
	st->m_numReplies  = 0;
	st->m_numRequests = 0;

	// allocate enough msg20s
	if ( st->m_numDocIds <= 50 )
		st->m_msg20 = st->m_buf20;
	else {
		st->m_msg20 = (Msg20 *)mmalloc ( sizeof(Msg20) *
						 st->m_numDocIds , "Msg24c" );
		if ( ! st->m_msg20 ) {
			log("Msg24: alloc of msg20s for %"INT32" bytes failed",
			    (int32_t)sizeof(Msg20)*st->m_numDocIds);
			// prevent a core dump in Msg24::~Msg24
			st->m_numDocIds = 0;
			mdelete ( st , sizeof(State24) , "Msg24" );
			delete ( st );
			us->sendErrorReply ( slot , g_errno );
			return;
		}
		for ( int32_t i = 0 ; i < st->m_numDocIds ; i++ )
			st->m_msg20[i].constructor();
	}

	// set query if need be
	//Query qq;
	st->m_qq.set ( st->m_query , st->m_queryLen , NULL , 0 , 2 , true );
	// make a display metas string to get content for our TopicGroups
	//char dbuf[1024];
	p = st->m_dbuf;
	char *pend = st->m_dbuf + 1024;
	for ( int32_t i = 0 ; i < st->m_numTopicGroups ; i++ ) {
		TopicGroup *t = &st->m_topicGroups [ i ];
		int32_t tlen = strlen ( t->m_meta );
		if ( p + tlen + 1 >= pend ) break;
		if ( i > 0 ) *p++ = ' ';
		gbmemcpy ( p , t->m_meta , tlen );
		p += tlen;
	}
	//int32_t dbufLen = p - dbuf;
	st->m_dbufLen = p - st->m_dbuf;
	*p = '\0';
	st->m_n = 0;
	st->m_i = 0;
	launchMsg20s ( st , true , st->m_topicGroups[0].m_topicSampleSize );
}

void launchMsg20s ( State24 *st , bool callsample , int32_t sampleSize ) {
	// launch all the msg20s to get big samples of each doc
	//int32_t n = 0;
	for ( ; st->m_i < st->m_numDocIds ; st->m_i++ ) {
		// skip if clustered out
		if ( st->m_clusterLevels[st->m_i] != CR_OK )
			continue;
		// wait for later if too many outstanding
		if ( st->m_numRequests - st->m_numReplies >=
		     MAX_OUTSTANDING ) return;
		// use the jth slot if we should
		//if ( j >= 0 ) n = j;
		// save the msg index
		//st->m_msg20[n].m_n      = n;
		//st->m_msg20[n].m_parent = st;
		// supply the display metas as the meta in our TopicGroups
		// . start up a Msg20 to get the relevant doc text
		// . this will return false if it blocks
		// . a 32k sample takes 11ms to hash in hashSample() and
		//   most samples are below 5k anyway...
		Msg20 *mm = &st->m_msg20[st->m_n++];
		// set the summary request then get it!
		Msg20Request req;
		Query *q = &st->m_qq;
		//int32_t nt = q->m_numTerms;
		req.ptr_qbuf  = q->getQuery();
		req.size_qbuf = q->getQueryLen()+1;
		//req.ptr_termFreqs   = (char *)m_msg3a.m_termFreqs;
		//req.size_termFreqs  = 8 * nt;
		//req.ptr_affWeights  = (char *)m_msg3a.m_affWeights;
		//req.size_affWeights = 4 * nt; // 4 = sizeof(float)
		req.ptr_coll  = st->m_coll;
		req.size_coll = strlen(st->m_coll)+1;
		if ( st->m_dbufLen > 0 ) {
			req.ptr_displayMetas  = st->m_dbuf ;
			req.size_displayMetas = st->m_dbufLen+1;
		}
		req.m_docId           = st->m_docIds[st->m_i];
		req.m_numSummaryLines = 0;
		req.m_maxCacheAge     = st->m_maxCacheAge;
		req.m_wcache          = st->m_addToCache;
		req.m_state           = st;
		req.m_callback        = gotSampleWrapper;
		req.m_niceness        = st->m_niceness;
		//req.m_summaryMode   = m_si->m_summaryMode;
		req.m_boolFlag        = q->m_isBoolean; // 2 means auto?
		//req.m_allowPunctInPhrase = m_si->m_allowPunctInPhrase;
		//req.m_showBanned         = m_si->m_showBanned;
		//req.m_excludeLinkText    = m_si->m_excludeLinkText ;
		//req.m_hackFixWords       = m_si->m_hackFixWords ;
		//req.m_hackFixPhrases     = m_si->m_hackFixPhrases ;
		//req.m_includeCachedCopy  = m_si->m_includeCachedCopy;//bigsm
		req.m_bigSampleRadius = 100;
		req.m_bigSampleMaxLen = sampleSize;
		if ( ! mm->getSummary ( &req ) ) {
			st->m_numRequests++;
			continue;
		}
#ifdef _OLDMSG20_
		if ( ! mm->getSummary ( &st->m_qq ,
					NULL ,                 // term freqs
					NULL ,                 // aff weights
					st->m_docIds[st->m_i] ,
					1 ,                    // clusterLevel
					0 ,                    // # sum lines
					st->m_maxCacheAge ,
					st->m_addToCache ,
					st->m_coll ,           // coll
					strlen(st->m_coll) ,
					st ,                   // state
					gotSampleWrapper ,
					st->m_niceness ,
					false ,                // root?
					st->m_dbuf ,           // dt metas
					st->m_dbufLen ,        // dtmetalen
					100 ,                  // smpl radius
					sampleSize )){         // smpl max
			st->m_numRequests++;
			// if just launching one, bail if this blocked
			//if ( j >= 0 ) return;
			continue;
		}
#endif
		// deal with an error
		if ( g_errno ) {
			// log it
			log("topics: Received error when getting "
			    "document with docId %"INT64": %s. Document will "
			    "not contribute to the topics generation.",
			    st->m_docIds[st->m_i],mstrerror(g_errno));
			// reset g_errno
			g_errno = 0;
		}
		// . otherwise we got the summary without blocking
		// . increment # of replies (instant reply) and results
		st->m_numReplies++;
		st->m_numRequests++;
		// if we were just launching one and it did not block, return
		//if ( j >= 0 ) return;
	}
	// did anyone block? if so, return for now
	if ( st->m_numReplies < st->m_numRequests ) return ;
	// . otherwise, we got everyone, so go right to the merge routine
	// . returns false if not all replies have been received
	// . returns true if done
	// . sets g_errno on error
	if ( callsample ) gotSampleWrapper ( st );
}
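
// hashSample() (declared below; its definition is further along in this
// file) hashes the words and phrases of one sample into the "master"
// TermTable, accumulating per-term scores, popularity and docid links.
// "vecs"/"numVecs" hold per-sample vectors used to dedup samples that
// are too similar, and "repeatTable" lets repeated fragments across
// samples be down-weighted (see setRepeatScores() above).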
static bool hashSample ( Query *q , char *sample , int32_t sampleLen ,
			 TermTable *master , int32_t *nqiPtr ,
			 TopicGroup *t ,
			 State24 *st ,
			 int64_t docId ,
			 char *vecs , int32_t *numVecs ,
			 class Words *wordsPtr , class Scores *scoresPtr ,
			 bool isUnicode ,
			 char *repeatTable , int32_t repeatTableNumSlots ,
			 char language );

void gotSampleWrapper ( void *state ) {
	// get ptr to our state 24 class
	State24 *st = (State24 *)state;
	// if niceness is 0, use the higher priority udpServer
	UdpServer *us = &g_udpServer;
	//if ( st->m_niceness == 0 ) us = &g_udpServer2;
	//else                       us = &g_udpServer ;
	UdpSlot *slot = st->m_slot;
	// just bitch if there was an error, then ignore it
	if ( g_errno ) {
		log("topics: Had error getting document: %s. Document will "
		    "not contribute to the topics generation.",
		    mstrerror(g_errno));
		g_errno = 0;
	}
	// we got one
	st->m_numReplies++;
	// launch another request if we can
	// return if all done
	launchMsg20s ( st , false , st->m_topicGroups[0].m_topicSampleSize ) ;
	// wait for all replies to get here
	if ( st->m_numReplies < st->m_numRequests ) return;
	// get time now
	//int64_t now = gettimeofdayInMilliseconds();
	// . add the stat
	// . use purple for time to get all summaries
	//g_stats.addStat_r ( 0 ,
	//		      m_startTime ,
	//		      now ,
	//		      0x008220ff );
	// timestamp log
	//int64_t startTime = gettimeofdayInMilliseconds();
	log(LOG_DEBUG,"topics: msg24: Got %"INT32" titleRecs.",
	    st->m_numReplies ); //, now - m_startTime );

	// set query
	//Query q;
	//q.set ( st->m_query , st->m_queryLen , NULL , 0 , 2/*auto*/, true);

	// . init table for up to about 5k total distinct pronouns & phrases
	// . it automatically grows by like 20% if it runs out of space
	// . only alloc space for linked lists if docid info is wanted
	TermTable master;
	if ( ! master.set ( 20000 , true , true ,
			    st->m_returnDocIdCount | st->m_returnDocIds ,
			    st->m_returnPops , true, false, NULL ) ) {
		mdelete ( st , sizeof(State24) , "Msg24" );
		delete ( st );
		log("topics: Could not allocate memory for topic generation.");
		us->sendErrorReply ( slot , ENOMEM );
		return ;
	}

	// timestamp log
	int64_t startTime = gettimeofdayInMilliseconds();

	// debug
	//char *pp = (char *)mmalloc ( 4 , "foo");
	//*(int32_t *)pp = 0;
	//us->sendReply_ass ( pp , 4 , pp , 4 , slot );
	//delete(st);
	//return;

	// store all topics (scores/gids) in this buffer
	//char  buf [ 128*1024 ];
	//char *p    = buf;
	//char *pend = buf + 128*1024;
	char *buf = NULL;
	int32_t bufSize = 0;
	//for ( int32_t yyy = 0 ; yyy < 100 ; yyy++ ) { master.clear();//mdw
	// loop over all topic groups
	for ( int32_t i = 0 ; i < st->m_numTopicGroups ; i++ ) {
		// get the ith topic group descriptor
		TopicGroup *t = &st->m_topicGroups[i];
		// . generate topics for this topic group
		// . serialize them into "p"
		// . getTopics will realloc() this "buf" to exactly the size
		//   it needs
		getTopics ( st , t , &master , &st->m_qq , i ,
			    // getTopics will realloc this buffer
			    &buf , &bufSize , NULL , NULL , NULL );
		// clear the master table each time
		if ( i + 1 < st->m_numTopicGroups ) master.clear();
	}
	//}

	// free mem now to avoid fragmentation
	master.reset();

	// if small enough, copy into slot's tmp buffer
	char *reply = buf;
	int32_t replySize = bufSize;
	// launch it
	us->sendReply_ass ( reply , replySize , reply , replySize , slot );
	mdelete ( st , sizeof(State24) , "Msg24" );
	delete ( st );

	// . on host0, this is 21.3 ms with a std.dev. of 17.5 using dsrt=30
	//   measured on log[b-d] with the limit of 4 words per "giga bit".
	// . now time with our new 6 word phrase maximum:
	//   sum = 1294.0 avg = 16.0 sdev = 10.8 ... our rewrite was faster!!
	//if ( g_conf.m_timingDebugEnabled )
	// timing debug
	int64_t took = gettimeofdayInMilliseconds() - startTime ;
	log(LOG_TIMING,"topics: Took %"INT64" ms to parse out topics.", took);
}

class DocIdLink {
public:
	int64_t m_docId;
	int32_t m_next; // offset into st->m_mem to the next DocIdLink
};
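
// Each term in the master TermTable can carry a singly linked list of
// the docids it was seen in. Links live in the st->m_mem pool and are
// addressed by byte offset rather than raw pointer, presumably so the
// pool can be grown with realloc without invalidating the lists.
// master->getHeads() gives the offset of the head link for each term,
// and an offset of -1 marks the end of a list, which is why the
// traversals below loop while (char *)link >= st->m_mem.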

// returns false and sets g_errno on error, true otherwise
bool getTopics ( State24    *st ,
		 TopicGroup *t ,
		 TermTable  *master ,
		 Query      *q ,
		 char        gid ,
		 char      **buf ,
		 int32_t    *bufSize ,
		 // these ptrs are supplied by the spider when trying to
		 // generate the gigabit vector for a document it is indexing
		 class Words  *wordsPtr ,
		 class Scores *scoresPtr ,
		 int32_t      *hashes ,
		 unsigned char language ,
		 int32_t       niceness ,
		 LinkInfo     *linkInfo ,
		 LinkInfo     *linkInfo2 ) {

	////////////////////////////////////////////
	//
	// GENERATE THE TOPICS
	//
	////////////////////////////////////////////

	//int64_t start = gettimeofdayInMilliseconds();

	// only allow one vote per ip
	HashTable iptable;
	// return false and set g_errno if this alloc fails
	if ( t->m_ipRestrict && ! iptable.set ( st->m_numRequests * 4 ) )
		return false;

	// space for all vectors for deduping samples that are 80% similar
	char vbuf [ 64*1024 ];
	char *vecs = vbuf;
	int32_t numVecs = 0;
	int32_t vneed = st->m_numRequests * SAMPLE_VECTOR_SIZE;
	if ( t->m_dedupSamplePercent >= 0 && vneed > 64*1024 )
		vecs = (char *)mmalloc ( vneed , "Msg24d" );
	if ( ! vecs ) return false;

	// hack, if words supplied, treat as one request
	if ( wordsPtr ) st->m_numRequests = 1;

	//
	// . make the hash table used for repeated fragment detection
	// . one slot per word, over all samples
	//

	// for every sample estimate the number of words so we know how big
	// to make our repeat hash table
	int32_t maxWords = 0;
	Words tmpw;
	// if getting a gigabit vector for a single doc, we know the # of words
	if ( wordsPtr ) maxWords += wordsPtr->getNumWords();
	// otherwise, get max # of words for each big sample via Msg20
	int32_t numMsg20Used = 0;
	for ( int32_t i = 0 ; ! wordsPtr && i < st->m_numRequests ; i++ ) {
		Msg20 *thisMsg20 = NULL;
		if      ( wordsPtr    ) {}
		else if ( st->m_msg20 ) thisMsg20 = &st->m_msg20[i];
		else {
			thisMsg20 = st->m_msg20Ptrs[i];
			if ( st->m_clusterLevels[i] != CR_OK ) continue;
		}
		// continue if we've gotten no content
		if ( ! wordsPtr &&
		     ( ! thisMsg20 || ( thisMsg20 && thisMsg20->m_errno ) ) )
			continue;
		// make sure the summary is not in a foreign language (aac)
		if ( thisMsg20 ) {
			unsigned char sLang;
			sLang = thisMsg20->m_r->m_summaryLanguage;
			if ( language != langUnknown && sLang != language )
				continue;
		}
		// get the ith big sample
		char   *sample = NULL;
		int32_t slen   = 0;
		// but if doing metas, get the display content
		char *next = NULL;
		if ( thisMsg20 ) next = thisMsg20->getDisplayBuf();
		if ( t->m_meta[0] && next )
			sample = thisMsg20->getNextDisplayBuf(&slen,&next);
		// XmlDoc::getGigabitVector() provides us with the Words/Scores
		// classes for the whole document. that is the "sample"
		else {
			sample = thisMsg20->getBigSampleBuf();
			slen   = thisMsg20->getBigSampleLen();
		}
		// are we unicode?
		bool isUnicode = thisMsg20->isUnicode();
		// set parser vars
		char *p    = sample;
		char *pend = sample + slen;
		// each sample consists of multiple \0 terminated excerpts
		int32_t sampleWords = 0;
#ifdef DEBUG_MSG24
		int32_t numExcerpts = 0;
#endif
		while ( p < pend ) {
			int32_t plen ;
			if ( isUnicode ) plen = ucStrNLen (p,pend-p);
			else             plen = strlen    (p);
			if ( isUnicode ) sampleWords += countWords((UChar *)p,plen);
			else             sampleWords += countWords(         p,plen);
			// advance to next excerpt
			p += plen + 1;
#ifdef DEBUG_MSG24
			numExcerpts++;
#endif
		}
#ifdef DEBUG_MSG24
		if ( sampleWords > 2048 ) {
			char *dbgBuf = NULL;
			log("topics: Unusually long sample in Msg24: "
			    "sampleWords=%"INT32" numExcerpts=%"INT32"",
			    sampleWords, numExcerpts);
			if ( (dbgBuf = (char *)mmalloc(slen+1, "DEBUG_MSG24")) ) {
				int jjStep = 1;
				if ( isUnicode ) jjStep = 2;
				int kk = 0;
				for ( int jj = 0 ; jj < slen ; jj += jjStep ) {
					if ( sample[jj] ) dbgBuf[kk++] = sample[jj];
					else              dbgBuf[kk++] = '#';
				}
				dbgBuf[kk++] = '\0';
				log("topics: \tsample was: %s", dbgBuf);
			}
		}
		else {
			log("topics: Reasonable sample in Msg24: "
			    "sampleWords=%"INT32" numExcerpts=%"INT32"",
			    sampleWords, numExcerpts);
		}
#endif
		if ( maxWords + sampleWords > 0x08000000 ) {
			log("topics: too many words in samples. "
			    "Discarding the remaining samples "
			    "(maxWords=%"INT32")", maxWords);
			break;
		}
		else {
			maxWords += sampleWords;
			numMsg20Used++;
		}
	}
	// make it big enough so there are gaps, so chains are not too long
	int32_t minBuckets = (int32_t)(maxWords * 1.5);
	if ( minBuckets < 512 ) minBuckets = 512;
	int32_t numSlots = 2 * getHighestLitBitValue ( minBuckets ) ;
	int32_t need2 = numSlots * (8+4);
	char *rbuf = NULL;
	char tmpBuf2[13000];
	// sanity check
	if ( need2 < 0 ) {
		g_errno = EBADENGINEER;
		return log("query: bad engineer in Msg24.cpp. need2=%"INT32" "
			   "numSlots=%"INT32" maxWords=%"INT32" q=%s",
			   need2,numSlots,maxWords,q->m_orig);
	}
	if ( need2 < 13000 ) rbuf = tmpBuf2;
	else rbuf = (char *)mmalloc ( need2 , "WeightsSet3");
	if ( ! rbuf ) return false;
	// sanity check
	if ( numSlots * 8 > need2 || numSlots * 8 < 0 ) {
		g_errno = EBADENGINEER;
		return log("query: bad engineer in Msg24.cpp. need2=%"INT32" "
			   "numSlots=%"INT32" q=%s", need2,numSlots,q->m_orig);
	}
	// clear the keys in the hash table (empty it out)
	memset ( rbuf , 0 , numSlots * 8 );
	// set the member var to this
	char *repeatTable = rbuf;
	int32_t repeatTableNumSlots = numSlots;
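	// (getHighestLitBitValue() appears to return the value of the
	// highest set bit, so numSlots comes out a power of two greater
	// than minBuckets, i.e. at least ~1.5x the estimated word count;
	// the extra slack keeps hash chains short, per the comment above.)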

	//
	// end making the hash table for repeated fragment detection
	//

	// now combine all the pronouns and pronoun phrases into one big hash
	// table and collect the top 10 topics
	int32_t nqi    = 0; // how many query terms actually used? for normalizing.
	int32_t tcount = 0; // how many title recs did we process?
	QUICKPOLL(niceness);

	for ( int32_t i = 0 ; i < numMsg20Used ; i++ ) {
		Msg20 *thisMsg20 = NULL;
		if      ( wordsPtr    ) {}
		else if ( st->m_msg20 ) thisMsg20 = &st->m_msg20[i];
		else {
			thisMsg20 = st->m_msg20Ptrs[i];
			if ( st->m_clusterLevels[i] != CR_OK ) continue;
		}
		// make sure the summary is not in a foreign language (aac)
		if ( thisMsg20 ) {
			unsigned char sLang;
			sLang = thisMsg20->m_r->m_summaryLanguage;
			if ( language != langUnknown && sLang != language )
				continue;
		}
		// continue if we've gotten no content
		if ( ! wordsPtr &&
		     ( ! thisMsg20 || ( thisMsg20 && thisMsg20->m_errno ) ) )
			continue;
		// skip if from an ip we already did
		if ( t->m_ipRestrict ) {
			int32_t ipd = ipdom ( thisMsg20->getIp() );
			// zero is invalid!
			if ( ! ipd ) continue;
			//log("url=%s",thisMsg20->getUrl());
			if ( iptable.getValue(ipd) ) {
				//log("dup=%s",thisMsg20->getUrl());
				continue;
			}
			// now we also check domain
			Url uu;
			uu.set ( thisMsg20->getUrl() ,
				 thisMsg20->getUrlLen() );
			// "mid dom" is the "ibm" part of ibm.com or ibm.de
			char   *dom  = uu.getMidDomain();
			int32_t dlen = uu.getMidDomainLen();
			if ( dom && dlen > 0 ) {
				int32_t h = hash32 ( dom , dlen );
				if ( iptable.getValue(h) ) continue;
				iptable.addKey (h,1);
			}
			// add ip
			iptable.addKey (ipd,1);
		}
		// get the ith big sample
		char   *bigSampleBuf = NULL;
		int32_t bigSampleLen = 0;
		// but if doing metas, get the display content
		char *next = NULL;
		if ( thisMsg20 ) next = thisMsg20->getDisplayBuf();
		if ( t->m_meta[0] && next ) {
			bigSampleBuf =
				thisMsg20->getNextDisplayBuf(&bigSampleLen,&next);
		}
		// XmlDoc::getGigabitVector() provides us with the Words/Scores
		// classes for the whole document. that is the "sample"
		else if ( ! wordsPtr ) {
			bigSampleBuf = thisMsg20->getBigSampleBuf();
			bigSampleLen = thisMsg20->getBigSampleLen();
		}
		// skip if empty
		if ( ! wordsPtr && ( bigSampleLen <= 0 || ! bigSampleBuf ) )
			continue;
		// otherwise count it
		tcount++;
		// the docid
		int64_t docId = 0;
		if ( ! wordsPtr ) docId = thisMsg20->getDocId();
		// are we unicode?
		bool isUnicode;
		if ( ! wordsPtr ) isUnicode = thisMsg20->isUnicode();
		else              isUnicode = wordsPtr->isUnicode();
		unsigned char lang = language;
		if ( ! wordsPtr ) lang = thisMsg20->getLanguage();
		// continue; // mdw
		QUICKPOLL(niceness);
		// . hash it into the master table
		// . this may alloc st->m_mem, so be sure to free below
		hashSample ( q, bigSampleBuf, bigSampleLen, master, &nqi , t ,
			     st, docId ,
			     vecs , &numVecs ,
			     wordsPtr , scoresPtr , isUnicode ,
			     repeatTable , repeatTableNumSlots , lang );
		// ignore errors
		g_errno = 0;

		// hash the inlink texts and neighborhoods
		for ( Inlink *k = NULL ;
		      linkInfo && (k = linkInfo->getNextInlink(k)) ; ) {
			char   *s   = k->ptr_linkText;
			int32_t len = k->size_linkText - 1;
			hashSample ( q, s, len, master, &nqi , t ,
				     st, docId , // 0
				     vecs , &numVecs ,
				     NULL , NULL , k->m_isUnicode ,
				     repeatTable , repeatTableNumSlots ,
				     lang );
			// and surrounding text
			s   = k->ptr_surroundingText;
			len = k->size_surroundingText - 1;
			hashSample ( q, s, len, master, &nqi , t ,
				     st, docId , // 0
				     vecs , &numVecs ,
				     NULL , NULL , k->m_isUnicode ,
				     repeatTable , repeatTableNumSlots ,
				     lang );
		}
		for ( Inlink *k = NULL ;
		      linkInfo2 && (k = linkInfo2->getNextInlink(k)) ; ) {
			char   *s   = k->ptr_linkText;
			int32_t len = k->size_linkText - 1;
			hashSample ( q, s, len, master, &nqi, t ,
				     st, docId , // docId
				     vecs , &numVecs ,
				     NULL , NULL, isUnicode ,
				     repeatTable, repeatTableNumSlots,
				     lang );
		}
		// ignore errors
		g_errno = 0;
	}

	// hash meta keywords and meta description when generating the
	// gigabit vector, mainly useful for docs which have all of their
	// content in frames
	if ( st->m_dbufLen > 0 && wordsPtr ) {
		hashSample ( q, st->m_dbuf, st->m_dbufLen, master, &nqi , t ,
			     st, 0/*docId*/ ,
			     vecs , &numVecs ,
			     NULL , NULL , wordsPtr->isUnicode() ,
			     repeatTable , repeatTableNumSlots , language );
	}

	//log("did samples in %"INT64" ",gettimeofdayInMilliseconds()-start);

	int32_t nt = master->getNumTerms();

	// debug msg
	/*
	for ( int32_t i = 0 ; i < nt ; i++ ) {
		int32_t score = master->getScoreFromTermNum(i) ;
		if ( ! score ) continue;
		char   *ptr = master->getTermPtr(i) ;
		int32_t len = master->getTermLen(i);
		char ff[1024];
		if ( len > 1020 ) len = 1020;
		gbmemcpy ( ff , ptr , len );
		ff[len] = '\0';
		// we can have html entities in here now
		//if ( ! is_alnum(ff[0]) ) { char *xx = NULL; *xx = 0; }
		log("%08"INT32" %s",score,ff);
	}
	*/

	// how many do we need?
	int32_t need = t->m_maxTopics ;
	// get this many winners
	int32_t maxWinners = need;
	// double it in case some get deduped
	if ( t->m_dedup ) maxWinners *= 2; // mdw
	// count how many get removed, might have to recompute
	int32_t removed ;
	int32_t got = 0;

	// now get the top MAX_TOPICS or maxWinners pronouns or pronoun phrases
	//int32_t        scores [ MAX_TOPICS ];
	//char          *ptrs   [ MAX_TOPICS ];
	//unsigned char  lens   [ MAX_TOPICS ];
	int32_t  *scores = NULL;
	char    **ptrs   = NULL;
	int32_t  *lens   = NULL;
	char     *isunis = NULL;
	int32_t  *slots  = NULL;
	int32_t  *pages  = NULL;
	// these vars are used below
	//char *ptrs2  [ MAX_TOPICS ];
	//int32_t lens2 [ MAX_TOPICS ];
	char    **ptrs2  = NULL;
	int32_t  *lens2  = NULL;

	char   *tmpBuf  = NULL;
	int32_t tmpSize = 0;
	//bool triedLinkInfo = false;
 redo:
	// ensure maxWinners not too big
	//if ( maxWinners > MAX_TOPICS ) maxWinners = MAX_TOPICS;

	// allocate enough space
	int32_t newSize = maxWinners *
		(sizeof(char *)+4+4+4+4+sizeof(char *)+4+1);
	char *newBuf = (char *)mrealloc ( tmpBuf , tmpSize , newSize ,
					  "Msg24e" );
	if ( ! newBuf ) {
		if ( tmpBuf ) mfree ( tmpBuf , tmpSize , "Msg24" );
		// free the links in the linked list, if any
		if ( st->m_mem ) {
			mfree ( st->m_mem, st->m_memEnd - st->m_mem, "Msg24" );
			st->m_mem    = NULL;
			st->m_memEnd = NULL;
			st->m_memPtr = NULL;
		}
		if ( vecs != vbuf ) mfree ( vecs , vneed , "Msg24" );
		return log("topics: realloc to %"INT32" failed.",newSize);
	}
	tmpBuf  = newBuf;
	tmpSize = newSize;
	char *pp = tmpBuf;
	ptrs   = (char   **)pp ; pp += sizeof(char *) * maxWinners;
	scores = (int32_t *)pp ; pp += 4 * maxWinners;
	lens   = (int32_t *)pp ; pp += 4 * maxWinners;
	isunis =            pp ; pp += maxWinners;
	slots  = (int32_t *)pp ; pp += 4 * maxWinners;
	pages  = (int32_t *)pp ; pp += 4 * maxWinners;
	ptrs2  = (char   **)pp ; pp += sizeof(char *) * maxWinners;
	lens2  = (int32_t *)pp ; pp += 4 * maxWinners;

	int32_t *pops = master->m_pops;

	QUICKPOLL(niceness);

	int32_t np = 0;
	int32_t minScore = 0x7fffffff;
	int32_t minj = -1;
	int32_t i ;
	int32_t *heads = master->getHeads();
	bool callRedo = true;
	// total # of pages sampled
	int32_t sampled = numMsg20Used;
	for ( i = 0 ; i < nt && np < maxWinners ; i++ ) {
		// skip term #i from "table" if it has 0 score
		int32_t score = master->m_scores[i]; // getScoreFromTermNum(i)
		if ( ! score ) continue;

		// . make it higher the more popular a term is
		// . these are based on a MAXPOP of 10000
		int32_t mdc = (int32_t)((((double)sampled * 3.0 *
			(double)(pops[i]&0x7fffffff))+0.5)/MAXPOP);
		if ( mdc < t->m_minDocCount ) mdc = t->m_minDocCount;

		// skip if it does not meet the min doc count
		int32_t count = 0;
		//if ( mdc > 1 || st->m_returnDocIds ) {
		if ( t->m_minDocCount > 1 || st->m_returnDocIds ) {
			DocIdLink *link = (DocIdLink *)(st->m_mem+heads[i]);
			while ( (char *)link >= st->m_mem ) {
				count++;
				link = (DocIdLink*)(st->m_mem + link->m_next);
			}
			if ( count < mdc ) continue;
		}

		// set the min of all in our list
		if ( score < minScore ) { minScore = score; minj = np; }
		// i've seen this become NULL at line 753 on gb1 below for
		// /search?code=mammaXbG&uip=12.41.126.39&n=15&raw=8&q=
		// manhattan,+ny
		// so let's try it again and try to find out why maybe
		if ( master->m_termLens[i] <= 0 ) {
			char *orig = "";
			if ( q ) orig = q->m_orig;
			log (LOG_LOGIC,"query: Got 0 length gigabit. q=%s",
			     orig);
			continue;
		}
		// recalc the score
		//double frac1 = ((MAXPOP-(pops[i]&0x7fffffff))*100.0)/MAXPOP;
		//double frac2 = ((double)count * 100.0) / (double)sampled;
		//score = (int32_t)((frac1 * frac2) / 100.0);
		// we got a winner
		scores [ np ] = score;
		ptrs   [ np ] = master->m_termPtrs[i]; // getTermPtr(i)
		lens   [ np ] = master->m_termLens[i]; // getTermLen(i)
		isunis [ np ] = master->m_isunis[i];
		slots  [ np ] = i;
		pages  [ np ] = count;
		np++;
	}

	QUICKPOLL(niceness);
	// if not enough no matter what, do not redo
	if ( np < maxWinners ) callRedo = false;
	// now do the rest
	for ( ; i < nt ; i++ ) {
		// skip term #i from "table" if it has 0 score
		int32_t score = master->m_scores[i]; // getScoreFromTermNum(i)
		// bail if empty
		if ( score <= 0 ) continue;
		// ignore if not a winner
		if ( score <= minScore ) continue;
		// . make it higher the more popular a term is
		// . these are based on a MAXPOP of 10000
		int32_t mdc = (int32_t)((((double)sampled * 3.0 *
			(double)(pops[i]&0x7fffffff))+0.5)/MAXPOP);
		if ( mdc < t->m_minDocCount ) mdc = t->m_minDocCount;

		// skip if it does not meet the min doc count
		int32_t count = 0;
		if ( t->m_minDocCount > 1 || st->m_returnDocIds ) {
			DocIdLink *link = (DocIdLink *)(st->m_mem+heads[i]);
			// m_next is -1 to indicate end
			while ( (char *)link >= st->m_mem ) {
				count++;
				link = (DocIdLink *)(st->m_mem + link->m_next);
			}
			if ( count < mdc ) continue;
		}
		// find the score it will replace, the min one
		//int32_t j ;
		//for ( j = 0 ; j < np ; j++ )
		//	if ( scores [ j ] == minScore ) break;
		// bad engineer?
		//if ( j == np ) { char *xx = NULL; *xx = 0; }
		// recalc the score
		//double frac1 = ((MAXPOP-(pops[i]&0x7fffffff))*100.0)/MAXPOP;
		//double frac2 = ((double)count * 100.0) / (double)sampled;
		//int32_t newScore = (int32_t)((frac1 * frac2) / 100.0);
		//int32_t oldminj = minj;
		// replace the min (jth) guy
		scores [ minj ] = score;
		ptrs   [ minj ] = master->m_termPtrs[i]; // getTermPtr(i)
		lens   [ minj ] = master->m_termLens[i]; // getTermLen(i)
		isunis [ minj ] = master->m_isunis[i];
		pages  [ minj ] = count;
		slots  [ minj ] = i;
		//log("ptrs[%"INT32"]=%"XINT32"",j,ptrs[j]);
		// hopefully we increased the min score in our top set now
		minScore = 0x7fffffff;
		for ( int32_t j = 0 ; j < np ; j++ ) {
			if ( scores[j] < minScore ) {
				minScore = scores[j];
				minj     = j;
			}
		}
		//scores [oldminj] = newScore;
	}

	// bubble sort the top winners
 again:
	bool flag = false;
	for ( int32_t i = 1 ; i < np ; i++ ) {
		if ( scores[i-1] >= scores[i] ) continue;
		int32_t ts = scores[i];
		char   *tp = ptrs  [i];
		int32_t tl = lens  [i];
		char    tu = isunis[i];
		int32_t tc = pages [i];
		int32_t tt = slots [i];
		scores [i  ] = scores[i-1];
		ptrs   [i  ] = ptrs  [i-1];
		lens   [i  ] = lens  [i-1];
		isunis [i  ] = isunis[i-1];
		pages  [i  ] = pages [i-1];
		slots  [i  ] = slots [i-1];
		scores [i-1] = ts;
		ptrs   [i-1] = tp;
		lens   [i-1] = tl;
		isunis [i-1] = tu;
		pages  [i-1] = tc;
		slots  [i-1] = tt;
		flag = true;
	}
	if ( flag ) goto again;

	QUICKPOLL(niceness);

	// . normalize all scores
	// . assume 20000 points per query term per page
	// . a topic term will get 20000 points for each query term it is
	//   close to
	int32_t max = nqi * tcount * MAX_SCORE_MULTIPLIER ; //10000;
	if ( nqi == 0 ) max = tcount * ALT_MAX_SCORE;
	if ( max == 0 ) max = 1;
	for ( i = 0 ; i < np ; i++ ) {
		// skip if length is 0, it was a dup from above
		//if ( lens[i] <= 0 ) continue;
		scores[i] = (scores[i] * 100) / max;
		if ( scores[i] <=   0 ) scores[i] = 1;
		if ( scores[i] >= 100 ) scores[i] = 100; // add a log statement here? (aac)
	}
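	// e.g. with nqi=2 query terms and tcount=30 samples processed,
	// max = 2 * 30 * MAX_SCORE_MULTIPLIER = 180000 raw points, so a
	// raw score of 90000 normalizes to 50; results are then clamped
	// to the 1..100 range.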

	// . now set ptrs2/lens2 to point to the comparison string in each
	//   topic
	// . skip it over stop words, don't compare those
	// . this way we can do a more flexible strcasestr and ignore common
	//   words when comparing, they don't add much beyond repetition
	// . "super bowl" + "the super bowl" --> "super bowl"
	//char *ptrs2  [ MAX_TOPICS ];
	//int32_t lens2 [ MAX_TOPICS ];
	for ( i = 0 ; i < np ; i++ ) {
		/*
		Words w;
		w.set ( false , ptrs[i] , lens[i] , false );
		int32_t nw = w.getNumWords();
		// skip if none
		if ( nw <= 0 ) continue;
		*/
		// establish our new ptrs
		ptrs2 [ i ] = ptrs[i];
		lens2 [ i ] = lens[i];
		// skip initial common words
		//----> not if capitalized!! leave those intact. like
		//      Michael Jackson's "Beat It"
		/*
		int32_t h;
		int32_t j = 0;
		if ( w.isPunct(j) ) j++;
		for ( ; j < nw ; j += 2 ) {
			char   *ww    = w.getWord    (j);
			int32_t wwlen = w.getWordLen (j);
			// if capitalized, leave it
			if ( is_upper(ww[0]) ) break;
			// single letter lower case is a common word
			if ( wwlen <= 1 && is_alpha(ww[0]) ) goto gotone;
			// leave it if not common
			h = hash64d(w.getWord(j),w.getWordLen(j));
			if ( ! isCommonWord ( h ) ) break;
			// otherwise, scrub it off
		gotone:
			ptrs2 [i] = w.getWord(j+2);
		}
		// skip trailing common words
		int32_t k = nw - 1 ;
		if ( w.isPunct(k) ) k--;
		for ( ; k >= j ; k -= 2 ) {
			char   *ww    = w.getWord    (k);
			int32_t wwlen = w.getWordLen (k);
			// if capitalized, leave it
			if ( is_upper(ww[k]) ) break;
			// single letter lower case is a common word
			if ( wwlen <= 1 && is_alpha(ww[0]) ) goto gotone;
			// left off here!!
			if ( w.getWordLen(j) <= 1 && is_alpha(w.getWord(j)[0]) )
				continue;
			h = hash64d(w.getWord(j),w.getWordLen(j));
			if ( ! isCommonWord ( h ) ) break;
		}
		// set new length
		char *end2 = w.getWord(k) + w.getWordLen(k);
		lens2[i] = end2 - ptrs2[i];
		*/
	}

	if ( ! t->m_dedup ) goto skipdedup;
	//goto skipdedup; // mdw

	removed = 0;
	// now remove similar terms from the top topics
	for ( int32_t i = 0 ; i < np - 1 ; i++ ) {
		// skip if nuked already
		if ( lens[i] == 0 ) continue;
		// scan down to this score, but not below
		//int32_t minScore = (scores[i] * 75) / 100 ;
		int32_t minScore = scores[i] - 25;
		// if we get replaced by a longer guy, remember him
		int32_t replacerj = -1;
		// . a longer term that encapsulates us can eliminate us
		// . or, if we're the longer, we eliminate the shorter
		for ( int32_t j = i + 1 ; j < np ; j++ ) {
			// skip if nuked already
			if ( lens[j] == 0 ) continue;
			// null term both
			char c1 = ptrs2[i][lens2[i]];
			char c2 = ptrs2[j][lens2[j]];
			ptrs2[i][lens2[i]] = '\0';
			ptrs2[j][lens2[j]] = '\0';
			// if we are the shorter, and the longer contains us
			// then it nukes us... unless his score is too low
			if ( lens2[i] < lens2[j] ) {
				// is the shorter contained?
				char *s;
				if      (isunis[j] == 0 && isunis[i] == 0)
					s = gb_strcasestr (ptrs2[j],ptrs2[i]) ;
				else if (isunis[j] == 0 && isunis[i] == 1)
					s = ucStrNCaseStr(
						ptrs2[j],
						(UChar*)ptrs2[i], lens2[i]>>1);
				else if (isunis[j] == 1 && isunis[i] == 0)
					s = (char*)ucStrNCaseStr(
						(UChar*)ptrs2[j], lens2[j]>>1,
						ptrs2[i]);
				else
					s = (char*)ucStrNCaseStr(
						(UChar*)ptrs2[j], lens2[j]>>1,
						(UChar*)ptrs2[i], lens2[i]>>1);
				// un-null term both
				ptrs2[i][lens2[i]] = c1;
				ptrs2[j][lens2[j]] = c2;
				// even if he's longer, if his score is too
				// low then he cannot nuke us
				if ( scores[j] < minScore ) continue;
				// if we were NOT contained by someone below...
				if ( ! s ) continue;
				// he's gotta be on all of our pages, too
				//if ( ! onSamePages(i,j,slots,heads,pages) )
				//	continue;
				// shorter gets our score (we need to sort)
				// not yet! let him finish, then replace him!!
				replacerj = j;
				// see if we can nuke other guys at least
				continue;
			}
			// . otherwise, we are the longer
			// . we can nuke any shorter below us, all scores
			char *s;
			if      (isunis[i] == 0 && isunis[j] == 0)
				s = gb_strcasestr (ptrs2[i],ptrs2[j]) ;
			else if (isunis[i] == 0 && isunis[j] == 1)
				s = ucStrNCaseStr(
					ptrs2[i],
					(UChar*)ptrs2[j], lens2[j]>>1);
			else if (isunis[i] == 1 && isunis[j] == 0)
				s = (char*)ucStrNCaseStr(
					(UChar*)ptrs2[i], lens2[i]>>1,
					ptrs2[j]);
			else
				s = (char*)ucStrNCaseStr(
					(UChar*)ptrs2[i], lens2[i]>>1,
					(UChar*)ptrs2[j], lens2[j]>>1);
			// un-null term both
			ptrs2[i][lens2[i]] = c1;
			ptrs2[j][lens2[j]] = c2;

			QUICKPOLL(niceness);

			// keep going if no match
			if ( ! s ) continue;
			// remove him if we contain him
			lens[j] = 0;
			// count him
			removed++;
			// the redo flag
			//rflag = 1;
		}
		// if we got replaced by a longer guy, he replaces us
		// and takes our score
		if ( replacerj >= 0 ) {
			ptrs  [i] = ptrs  [replacerj];
			lens  [i] = lens  [replacerj];
			pages [i] = pages [replacerj];
			slots [i] = slots [replacerj];
			ptrs2 [i] = ptrs2 [replacerj];
			lens2 [i] = lens2 [replacerj];
			//scores[i] = scores[replacerj];
			lens [replacerj] = 0;
			i--;
			// count him
			removed++;
			// the redo flag
			//rflag = 1;
		}
	}
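	// (Recap of the pass above: a lower-ranked topic j whose comparison
	// string is contained in topic i's is nuked outright; if instead a
	// longer j contains i and scores within 25 points of it, j is moved
	// up into i's slot, so the longer phrase inherits the better
	// position and score.)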

	// . PROBLEM #2: often a phrase and the next phrase, +1, are in
	//   there... how to fix? the higher scoring one should swallow
	//   up the lower scoring one, even if only 3 of the 4 words match
	//   (do not count common words)

	// . #3 or when all non-query, non-common terms match... pick the
	//   longer and remove the common words, but keep query words.

	// again2:
	//char rflag = 0;
	// if two terms are close in score, and one is a longer version
	// of the other, choose it and remove the shorter
	for ( int32_t i = 0 ; i < np - 1 ; i++ ) {
		// skip if nuked already
		if ( lens[i] == 0 ) continue;
		// scan down to this score, but not below
		//int32_t minScore = (scores[i] * 75) / 100 ;
		int32_t minScore = scores[i] - 15;
		// if we get replaced by a longer guy, remember him
		int32_t replacerj = -1;
		// . a longer term that encapsulates us can eliminate us
		// . or, if we're the longer, we eliminate the shorter
		for ( int32_t j = i + 1 ; j < np ; j++ ) {
			// skip if nuked already
			if ( lens[j] == 0 ) continue;
			// null term both
			char c1 = ptrs[i][lens[i]];
			char c2 = ptrs[j][lens[j]];
			ptrs[i][lens[i]] = '\0';
			ptrs[j][lens[j]] = '\0';
			// if we are the shorter, and the longer contains us
			// then it nukes us... unless his score is too low
			if ( lens[i] < lens[j] ) {
				// is the shorter contained?
				char *s;
				if      (isunis[j] == 0 && isunis[i] == 0)
					s = gb_strcasestr (ptrs2[j],ptrs2[i]) ;
				else if (isunis[j] == 0 && isunis[i] == 1)
					s = ucStrNCaseStr(
						ptrs2[j],
						(UChar*)ptrs2[i], lens2[i]>>1);
				else if (isunis[j] == 1 && isunis[i] == 0)
					s = (char*)ucStrNCaseStr(
						(UChar*)ptrs2[j], lens2[j]>>1,
						ptrs2[i]);
				else
					s = (char*)ucStrNCaseStr(
						(UChar*)ptrs2[j], lens2[j]>>1,
						(UChar*)ptrs2[i], lens2[i]>>1);
				// un-null term both
				ptrs[i][lens[i]] = c1;
				ptrs[j][lens[j]] = c2;
				// even if he's longer, if his score is too
				// low then he cannot nuke us
				if ( scores[j] < minScore ) continue;
				// if we were NOT contained by someone below...
				if ( ! s ) continue;
				// if we are not on the same pages as the
				// shorter one, then we cannot absorb him
				//if ( ! onSamePages(i,j,slots,heads,pages))
				//	continue;
				// shorter gets our score (we need to sort)
				// not yet! let him finish, then replace him!!
				replacerj = j;
				// see if we can nuke other guys at least
				continue;
			}
			// . otherwise, we are the longer
			// . we can nuke any shorter below us, all scores
			char *s;
			if      (isunis[i] == 0 && isunis[j] == 0)
				s = gb_strcasestr (ptrs2[i],ptrs2[j]) ;
			else if (isunis[i] == 0 && isunis[j] == 1)
				s = ucStrNCaseStr(
					ptrs2[i],
					(UChar*)ptrs2[j], lens2[j]>>1);
			else if (isunis[i] == 1 && isunis[j] == 0)
				s = (char*)ucStrNCaseStr(
					(UChar*)ptrs2[i], lens2[i]>>1,
					ptrs2[j]);
			else
				s = (char*)ucStrNCaseStr(
					(UChar*)ptrs2[i], lens2[i]>>1,
					(UChar*)ptrs2[j], lens2[j]>>1);
			// un-null term both
			ptrs[i][lens[i]] = c1;
			ptrs[j][lens[j]] = c2;

			QUICKPOLL(niceness);

			// keep going if no match
			if ( ! s ) continue;
			// if we are not on the same pages as the
			// shorter one, then we cannot absorb him
			//if ( ! onSamePages(i,j,slots,heads,pages))
			//	continue;
			// remove him if we contain him
			lens[j] = 0;
			// count him
			removed++;
			// the redo flag
			//rflag = 1;
		}
		// if we got replaced by a longer guy, he replaces us
		// and takes our score
		if ( replacerj >= 0 ) {
			ptrs  [i] = ptrs  [replacerj];
			lens  [i] = lens  [replacerj];
			pages [i] = pages [replacerj];
			slots [i] = slots [replacerj];
			//scores[i] = scores[replacerj];
			lens [replacerj] = 0;
			i--;
			// count him
			removed++;
			// the redo flag
			//rflag = 1;
		}
	}
	// if someone got replaced, loop more
	//if ( rflag ) goto again2;

	// remove common phrases
	for ( int32_t i = 0 ; i < np ; i++ ) {
		// skip if nuked already
		if ( lens[i] == 0 ) continue;
		// compare
		bool remove = false;
		if ( isunis[i] == 0 ) { //com org dom xhtml html dtd
			if (!strncasecmp(ptrs[i], "all rights reserved" ,lens[i]) ||
			    !strncasecmp(ptrs[i], "rights reserved"     ,lens[i]) ||
			    !strncasecmp(ptrs[i], "in addition"         ,lens[i]) ||
			    !strncasecmp(ptrs[i], "for example"         ,lens[i]) ||
			    !strncasecmp(ptrs[i], "in order"            ,lens[i]) ||
			    !strncasecmp(ptrs[i], "in fact"             ,lens[i]) ||
			    !strncasecmp(ptrs[i], "in general"          ,lens[i]) ||
			    !strncasecmp(ptrs[i], "contact us"          ,lens[i]) ||
			    !strncasecmp(ptrs[i], "at the same time"    ,lens[i]) ||
			    !strncasecmp(ptrs[i], "http"                ,lens[i]) ||
			    !strncasecmp(ptrs[i], "html"                ,lens[i]) ||
			    !strncasecmp(ptrs[i], "s "                  ,lens[i]) ||
			    !strncasecmp(ptrs[i], "for more information",lens[i]))
				remove = true;
		}
		else {
			if ( !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "all rights reserved", 19) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "rights reserved", 15) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "in addition", 11) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "for example", 11) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "in order", 8) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "in fact", 7) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "in general", 10) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "contact us", 10) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "at the same time", 16) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "http", 4) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "s ", 2) ||
			     !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
					    "for more information", 20) )
				remove = true;
		}
		if ( remove ) {
			lens[i] = 0;
			// count him
			removed++;
		}
	}
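	// (Note: the 8-bit and UCS-2 stoplists above are presumably meant
	// to match, but the UCS-2 branch checks one fewer phrase -- "html"
	// is only filtered in the 8-bit case.)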
	QUICKPOLL(niceness);
	// now after longer topics replaced the shorter topics which they
	// contained, remove the longer topics if they have too many words
	// or bad punctuation
	for ( int32_t i = 0 ; i < np ; i++ ) {
		// skip if nuked already
		if ( lens[i] == 0 ) continue;
		if ( ! ptrs[i]   ) continue;

		Words w;
		w.set ( false , false, ptrs[i] , lens[i] ,
			TITLEREC_CURRENT_VERSION, false, false, niceness );
		int32_t nw = w.getNumWords();
		// . does it have a comma? or other punct besides an apostrophe?
		// . we allow gigabit phrases to incorporate a long stretch
		//   of punct... only before the LAST word in the phrase,
		//   that way our overlap removal still works well.
		bool hasPunct = false;
		for ( int32_t k = 0 ; k < lens[i] ; k++ ) {
			if ( ! is_punct(ptrs[i][k]) ) continue;
			// apostrophe is ok as long as alnum follows
			if ( ptrs[i][k] == '\'' &&
			     is_alnum(ptrs[i][k+1]) ) continue;
			// . period ok, as long as space or alnum follows
			// . if space follows, then an alnum must follow that
			// . same goes for colon
			QUICKPOLL(niceness);

			// . for now, until we get abbreviations working,
			//   alnum must follow the period
			if ( (ptrs[i][k] == '.' || ptrs[i][k] == ':' ) &&
			     ( is_alnum(ptrs[i][k+1]) ||
			       // accept single initial before the period, too
			       (ptrs[i][k+1] == ' ' && is_alnum(ptrs[i][k+2])
				&& k >= 2 && ptrs[i][k-2] == ' ')))
				continue;
			// comma is ok if surrounded by digits
			if ( (ptrs[i][k] == ',' &&
			      is_digit(ptrs[i][k-1]) &&
			      is_digit(ptrs[i][k+1]) )) continue;
			// percent is ok
			if ( ptrs[i][k] == '%' ) continue;
			if ( ptrs[i][k] == '&' ) continue;
			if ( ptrs[i][k] == '@' ) continue;
			if ( ptrs[i][k] == '-' ) continue;
			//if ( ptrs[i][k] == '(' ) continue;
			//if ( ptrs[i][k] == ')' ) continue;
			hasPunct = true;
			break;
		}
		// keep it if words are under the limit
		// and it has no bad punctuation
		if ( nw <= 2*t->m_maxWordsPerTopic - 1 && ! hasPunct )
			continue;
		lens[i] = 0;
		removed++;
	}

	QUICKPOLL(niceness);
	// if we removed enough to fall below what we need, redo
	got = np - removed;
	if ( got >= need ) goto skipdedup;
	// if we already did all from "master", no more left!
	if ( np >= master->getNumTermsUsed() ) goto skipdedup;
	// if we didn't have enough raw results, do not redo it
	if ( ! callRedo ) goto skipdedup;
	// or if we already hit MAX_TOPICS
	//if ( maxWinners >= MAX_TOPICS ) goto skipdedup; mdw
	if ( got == 0 ) maxWinners = maxWinners * 2;
	else            maxWinners = ((int64_t)maxWinners *
				      (int64_t)need * 110LL) /
				     ((int64_t)got * 100LL) + 10;
	goto redo; // mdw
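	// (The redo path regrows maxWinners proportionally: if dedup kept
	// only "got" of the "need" topics, the next pass asks for roughly
	// maxWinners * need / got with 10% slack plus 10 extra, so a pass
	// or two is normally enough.)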

 skipdedup:

	// free the repeat table if it allocated mem
	if ( repeatTable != tmpBuf2 ) {
		mfree ( repeatTable , need2 , "Msg24" );
		repeatTable = NULL;
	}

	// how much space do we need for the reply?
	int32_t size = 0;
	// 4 bytes for number of topics
	size += 4;
	// then how much for each topic?
	int32_t ntp = 0;
	for ( i = 0 ; i < np ; i++ ) {
		// cutoff at min score
		if ( scores[i] < t->m_minTopicScore ) continue;
		// skip if length is 0, it was a dup from above
		if ( lens[i] <= 0 ) continue;
		// we always get the count now
		if ( st->m_returnDocIds ) {
			int32_t count = 0;
			DocIdLink *link = (DocIdLink *)(st->m_mem+heads[slots[i]]);
			while ( (char *)link >= st->m_mem ) {
				count++;
				link = (DocIdLink *)(st->m_mem + link->m_next);
			}
			// space for the docids if they want them
			size += 8 * count;
			// sanity check
			if ( count != pages[i] ) { char *xx = NULL; *xx = 0; }
		}
		// ptr, score, length, gid and the topic string itself
		// (include \0 for null termination)
		size += 4 + 4 + 4 + 1 + lens[i] + 1;
		// . do we send back docid info?
		// . each termId can have a linked list of docids
		// . how many are in that list? (0 if none)
		size += 4;
		// 4 bytes for the dummy place holder. each one of these
		// can be a ptr to the list of docids, but it will be NULL
		// if we do not have a list of docids for this gigabit.
		size += 4;
		// the popularity... topic pop
		size += 4;
		// count the number of topics we'll store
		ntp++;
	}
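	// illustrative sketch, not from the original source, of the reply
	// layout built below for ntp topics:
	//   int32_t ntp
	//   int32_t pptrs  [ntp]  (offsets into the text area)
	//   int32_t pscores[ntp]
	//   int32_t plens  [ntp]
	//   int32_t ndocids[ntp]
	//   int32_t dptrs  [ntp]  (NULL placeholders, fixed up on deserialize)
	//   int32_t ppops  [ntp]
	//   char    pgids  [ntp]
	//   char    text   []     (ntp \0-terminated topic strings)
	//   int64_t docids []     (ndocids[j] docids per topic, if returned)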

	// realloc reply
	newSize = *bufSize + size;
	char *s = (char *) mrealloc ( *buf , *bufSize , newSize , "Msg24f" );
	if ( ! s ) {
		if ( tmpBuf ) mfree ( tmpBuf , tmpSize , "Msg24" );
		if ( *buf   ) mfree ( *buf   , *bufSize , "Msg24" );
		*buf     = NULL;
		*bufSize = 0;
		// free the links in the linked list, if any
		if ( st->m_mem ) {
			mfree ( st->m_mem, st->m_memEnd - st->m_mem, "Msg24" );
			st->m_mem    = NULL;
			st->m_memEnd = NULL;
			st->m_memPtr = NULL;
		}
		if ( vecs != vbuf ) mfree ( vecs , vneed , "Msg24" );
		return log("topics: Realloc reply buf to %"INT32" failed.",newSize);
	}
	// we realloc'd successfully, use it
	*buf = s;
	// copy into reply after previous topic groups
	char *p = *buf + *bufSize;
	// serialize ourselves into the buffer
	//serialize2 ( p , ptrs , scores , lens , gids );
	// store number of topics first
	*(int32_t *)p = ntp; p += 4;
	// arrays first
	char    **pptrs   = (char    **)p; p += ntp * 4;
	int32_t  *pscores = (int32_t  *)p; p += ntp * 4;
	int32_t  *plens   = (int32_t  *)p; p += ntp * 4;
	int32_t  *ndocids = (int32_t  *)p; p += ntp * 4;
	int64_t **dptrs   = (int64_t **)p; p += ntp * 4; // place holder
	int32_t  *ppops   = (int32_t  *)p; p += ntp * 4;
	char     *pgids   = (char     *)p; p += ntp ;
	char     *ptext   = p;
	int32_t j = 0;
	for ( i = 0 ; i < np ; i++ ) {
		// cutoff at min score
		if ( scores[i] < t->m_minTopicScore ) continue;
		// skip if length is 0, it was a dup from above
		if ( lens[i] <= 0 ) continue;
		// store it. the ptr is an offset into the text area so the
		// reply stays position-independent.
		pptrs   [j] = (char *)(ptext - p);
		pscores [j] = scores [i];
		plens   [j] = lens   [i];
		pgids   [j] = gid;
		if ( pops ) ppops [j] = pops[slots[i]];
		else        ppops [j] = 0;
		ndocids [j] = 0;
		dptrs   [j] = NULL; // dummy placeholder
		gbmemcpy ( ptext , ptrs[i] , lens[i] ); ptext += lens[i];
		//if ( hashes && j < GIGABITS_IN_VECTOR )
		//	hashes[j] = hash32Lower (ptrs[i],lens[i]);
		*ptext++ = '\0';
		j++;
	}

	QUICKPOLL(niceness);

	// fill in docid info
	if ( st->m_returnDocIdCount || st->m_returnDocIds ) {
		// reset j for this repeat loop
		j = 0;
		// this loop header is the same as above
		for ( i = 0 ; i < np ; i++ ) {
			// cutoff at min score
			if ( scores[i] < t->m_minTopicScore ) continue;
			// skip if length is 0, it was a dup from above
			if ( lens[i] <= 0 ) continue;
			// count em
			int32_t count = 0;
			DocIdLink *link = (DocIdLink *)(st->m_mem+heads[slots[i]]);
			while ( (char *)link >= st->m_mem ) {
				count++;
				if ( st->m_returnDocIds ) {
					*(int64_t *)ptext = link->m_docId;
					ptext += 8;
				}
				link = (DocIdLink *)(st->m_mem + link->m_next);
			}
			ndocids[j] = count;
			j++;
		}
	}
	//skipd:
	// update buf parms for re-calls
	*bufSize = newSize;

	// free tmp buf
	mfree ( tmpBuf , tmpSize , "Msg24" );
	// free the links in the linked list, if any
	if ( st->m_mem ) {
		mfree ( st->m_mem , st->m_memEnd - st->m_mem , "Msg24" );
		st->m_mem    = NULL;
		st->m_memEnd = NULL;
		st->m_memPtr = NULL;
	}
	if ( vecs != vbuf ) mfree ( vecs , vneed , "Msg24" );
	// copy into reply topic buf
	//char *start = slot->m_tmpBuf;
	//char *p     = slot->m_tmpBuf;
	//char *pend  = p + TMPBUFSIZE;
	/*
	for ( i = 0 ; i < np ; i++ ) {
		// cutoff at min score
		if ( scores[i] < t->m_minTopicScore ) continue;
		// skip if length is 0, it was a dup from above
		if ( lens[i] <= 0 ) continue;
		if ( p + lens[i] + 9 >= pend ) break;
		*(int32_t *)p = scores[i]; p += 4;
		*(int32_t *)p = lens  [i]; p += 4;
		*(char    *)p = gid     ; p += 1;
		gbmemcpy ( p , ptrs[i] , lens[i] ); p += lens[i];
		*p++ = '\0';
	}
	*/
	return true;
}

/*
bool onSamePages ( int32_t i, int32_t j, int32_t *slots, int32_t *heads,
		   int32_t *pages ) {
	if ( pages[i] != pages[j] ) return false;
	DocIdLink *link1 = (DocIdLink *)(st->m_mem+heads[slots[i]]);
	DocIdLink *link2 = (DocIdLink *)(st->m_mem+heads[slots[j]]);
	while ( (char *)link1 >= st->m_mem ) {
		if ( link1->m_docId != link2->m_docId ) return false;
		link1 = (DocIdLink *)(st->m_mem + link1->m_next);
		link2 = (DocIdLink *)(st->m_mem + link2->m_next);
	}
	return true;
}
*/

void hashExcerpt ( Query *q , uint64_t *qids , int32_t *qpops ,
		   int32_t nqi , TermTable *tt , char *buf , int32_t bufLen ,
		   Words *w , TopicGroup *t , Scores *scoresPtr ,
		   bool isUnicode , char *repeatTable ,
		   int32_t repeatTableNumSlots , char language );

// . returns false and sets g_errno on error
// . here's the tricky part
// . *nqiPtr is how many query terms we used - so caller can normalize scores
bool hashSample ( Query *q, char *bigSampleBuf , int32_t bigSampleLen ,
		  TermTable *master, int32_t *nqiPtr , TopicGroup *t ,
		  State24 *st, int64_t docId ,
		  char *vecs , int32_t *numVecs ,
		  Words *wordsPtr , Scores *scoresPtr , bool isUnicode ,
		  char *repeatTable , int32_t repeatTableNumSlots ,
		  char language ) {
	// numTerms must be less than this
	//if ( q && q->getNumTerms() > MAX_QUERY_TERMS ) (aac)
	if ( q && q->m_numWords > MAX_QUERY_TERMS )
		return log("topics: Too many query terms for "
			   "topic generation.");

	//bool returnDocIdCount = st->m_returnDocIdCount;
	//bool returnDocIds     = st->m_returnDocIds;
	bool returnPops = st->m_returnPops;

	// this is the pure content now
	char *content      = bigSampleBuf;
	int32_t contentLen = bigSampleLen;
	// we could truncate it to 50k, that's enough
	//if ( contentLen > 50*1024 ) contentLen = 50*1024;
	// bail if empty!
	if ( ! wordsPtr && (! content || contentLen <= 0) ) {
		log("topics: Got empty document for topic generation.");
		return true;
	}
	// make buf point to the available space
	char *buf = content;
	// get length of the buffer
	int32_t bufLen = contentLen;

#ifdef DEBUG_MSG24
	if (q) {
		log("topics: Query stats in hashSample");
		int32_t numQT = q->getNumTerms();
		int32_t numQW = q->m_numWords;
		log("topics: \tnumQueryTerms = %"INT32"", numQT);
		log("topics: \tnumQueryWords = %"INT32"", numQW);
		char *thisQT, *thisQW, iCode, tmpBuf[1024];
		int32_t qtLen, qwLen, i, j, k;
		for (i = 0; i < numQT; i++) {
			thisQT = q->getTerm(i);
			qtLen  = q->getTermLen(i);
			k = 0;
			for (j = 0; j < qtLen && k < 1023; j++) {
				if (thisQT[j]) tmpBuf[k++] = thisQT[j];
			};
			tmpBuf[k] = '\0';
			log ("topics: \tQT[%"INT32"] = %s", i, &tmpBuf[0]);
		};
		for (i = 0; i < numQW; i++) {
			thisQW = q->m_qwords[i].m_word;
			qwLen  = q->m_qwords[i].m_wordLen;
			iCode  = q->m_qwords[i].m_ignoreWord;
			k = 0;
			for (j = 0; j < qwLen && k < 1023; j++) {
				if (thisQW[j]) tmpBuf[k++] = thisQW[j];
			};
			tmpBuf[k] = '\0';
			log ("topics: \tQW[%"INT32"] = %s,\tignore = %i",
			     i, &tmpBuf[0], iCode);
		};
	};
#endif

	// get query hashes/ids, 64 bit, skip phrases
	uint64_t qids [MAX_QUERY_TERMS];
	int32_t  qpops[MAX_QUERY_TERMS];
	int32_t  nqi = 0;
	//for ( int32_t i=0 ; q && i<q->getNumTerms() && nqi<MAX_QUERY_TERMS; i++){ (aac)
	for ( int32_t i=0 ; q && i < q->m_numWords && nqi<MAX_QUERY_TERMS; i++){
		//if ( q->isPhrase (i) ) continue; (aac)
		//if ( q->isQueryStopWord(i) ) continue; (aac)
		char ignCode = q->m_qwords[i].m_ignoreWord;
		if ( ignCode && ignCode != 8 ) continue;
		char *s      = q->m_qwords[i].m_word;    // q->getTerm(i); (aac)
		int32_t slen = q->m_qwords[i].m_wordLen; // q->getTermLen(i); (aac)
		int32_t qpop;
		int32_t encodeType = csISOLatin1;
		if ( q->isUnicode() ) encodeType = csUTF16;
		qids[nqi] = hash64d(s, slen, encodeType);
		qpop = g_speller.getPhrasePopularity(s, qids[nqi], true,
						     language);
		// map the raw popularity into a score multiplier: the rarer
		// the query word, the bigger the multiplier
		if      ( qpop < QPOP_ZONE_0 ) qpop = QPOP_MULT_0;
		else if ( qpop < QPOP_ZONE_1 ) qpop = QPOP_MULT_1;
		else if ( qpop < QPOP_ZONE_2 ) qpop = QPOP_MULT_2;
		else if ( qpop < QPOP_ZONE_3 ) qpop = QPOP_MULT_3;
		else if ( qpop < QPOP_ZONE_4 ) qpop = QPOP_MULT_4;
		else                           qpop = 1;
		// qpop = 1; // this makes no sense here (aac)
		qpops[nqi] = qpop;
		nqi++;
	}
	// tell caller how many query terms we used so he can normalize scores
	*nqiPtr = nqi;
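	// illustrative note, not from the original source: the zone mapping
	// above gives rarer query words bigger multipliers, e.g. a word with
	// raw phrase popularity 5 falls under QPOP_ZONE_0 and gets the 10x
	// QPOP_MULT_0, popularity 200 falls under QPOP_ZONE_4 and gets only
	// 2x, and anything at 300 or above gets no boost at all (1x).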

	//int64_t start = gettimeofdayInMilliseconds();

	TermTable tt;
	if ( ! tt.set(20000,true,true, false , returnPops, false, false,NULL)){
		log("topics: Had error allocating a table for topic "
		    "generation: %s.",mstrerror(g_errno));
		//mfree ( buf , bufMaxLen , "Msg24" );
		return true;
	}

	Words w;

	// ---> TODO: a word next to both query terms should not be beaten
	//      by a word next to just one....
	// ---> TODO: weight by query popularity too!

	//log("******** hashing doc *********");

	// hash each excerpt
	char *p = buf;
	// most samples are under 5k, i've seen a 32k sample take 11ms!
	char *pend = buf + bufLen;
	while ( p < pend ) {
		// debug
		//log("docId=%"INT64" EXCERPT=%s",docId,p);
		int32_t plen ;
		if ( isUnicode ) plen = ucStrNLen(p,pend-p);
		else             plen = strlen(p);
		// p is only non-NULL if we are doing it the old way
		hashExcerpt ( q, qids, qpops, nqi, &tt, p, plen, &w, t , NULL,
			      isUnicode , repeatTable , repeatTableNumSlots ,
			      language );
		// advance to next excerpt
		if ( isUnicode ) p += plen + 2;
		else             p += plen + 1;
	}

	// hash the provided wordsPtr as one excerpt if there
	if ( wordsPtr )
		hashExcerpt ( q, qids, qpops, nqi, &tt, NULL,0, wordsPtr, t ,
			      scoresPtr , isUnicode ,
			      repeatTable , repeatTableNumSlots ,
			      language );

	// . compute the fingerprint/similarityVector from this table
	//   the same way we do for documents for deduping them at query time
	// . or we could just wait for our dedup algo to kick in... (mdw)
	//   then comment this stuff out ...
	if ( t->m_dedupSamplePercent >= 0 ) {
		char *v1 = vecs + (*numVecs * SAMPLE_VECTOR_SIZE);
		g_clusterdb.getSampleVector ( v1 , &tt );
		// compare to others done so far
		char *v2 = vecs ;
		for ( int32_t i = 0 ; i < *numVecs ; i++,v2+=SAMPLE_VECTOR_SIZE){
			char ss = g_clusterdb.getSampleSimilarity(v1,v2,
							SAMPLE_VECTOR_SIZE);
			// return true if too similar to another sample we did
			if ( ss >= t->m_dedupSamplePercent ) { // 80 ) {
				log(LOG_DEBUG,"topics: removed dup sample.");
				return true;
			}
		}
		// we have another vector to contend with for next time
		*numVecs = *numVecs + 1;
	}

	//log("TOOK %"INT64" ms plen=%"INT32"",gettimeofdayInMilliseconds()-start,
	//    bufLen);

	// . this termtable carries two special buckets per slot in order
	//   to hold a linked list of docids with each termid in the hash table
	// . heads is NULL if returnDocIdCount and returnDocIds are false
	int32_t *heads = master->getHeads();
	// . now hash the entries of this table, tt, into the master
	// . the master contains entries from all the other tables
	//log("have %"INT32" terms in termtable. adding to master.",
	//    tt.getNumTermsUsed());
	int32_t nt  = tt.getNumTerms();
	int32_t pop = 0 ;
	for ( int32_t i = 0 ; i < nt ; i++ ) {
		// skip empty buckets (use the raw score array for speed)
		//if ( ! tt.getScoreFromTermNum(i) ) continue;
		if ( ! tt.m_scores[i] ) continue;
		//int32_t ii = (int32_t)tt.getTermPtr(i);
		// then divide by that
		int32_t score = tt.getScoreFromTermNum(i) ;
		// watch out for 0
		if ( score <= 0 ) continue;
		// . get the bucket
		// . may or may not be full (score is 0 if empty)
		int32_t n = master->getTermNum ( tt.getTermId(i) );
		// skip if 0, i've seen this happen before
		if ( tt.getTermId(i) == 0 ) continue;
		// . but now we add one more thing to the termtable,
		//   a linked list field for keeping track of the docids
		//   of the documents that contain each termid
		// . grab some mem for the link
		// . "heads" is NULL if we should not do this...
		if ( heads ) {
			if ( st->m_memPtr + sizeof(DocIdLink) > st->m_memEnd ) {
				int32_t oldSize = st->m_memEnd - st->m_mem;
				int32_t newSize = oldSize + 256*1024;
				char *s = (char *)mrealloc(st->m_mem,oldSize,
							   newSize,"Msg24g");
				if ( !s )
					return log("Msg24: realloc failed.");
				int32_t off = st->m_memPtr - st->m_mem;
				st->m_mem    = s;
				st->m_memEnd = s + newSize;
				st->m_memPtr = s + off;
			}
			DocIdLink *link = (DocIdLink *)st->m_memPtr;
			st->m_memPtr += sizeof(DocIdLink);
			link->m_docId = docId;
			// if empty... make new head
			if ( master->m_scores[n] == 0 ) {
				link->m_next = -1;
				master->m_heads[n] = (char *)link - st->m_mem;
			}
			// otherwise, prepend the link as the new head of
			// this bucket's chain
			else {
				link->m_next = master->m_heads[n];
				master->m_heads[n] = (char *)link - st->m_mem;
			}
		}
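			// illustrative note, not from the original source:
			// the docid list is threaded through the st->m_mem
			// arena by byte offset rather than by raw pointer, so
			// the arena can be mrealloc'd above without fixing up
			// every link; m_next == -1 resolves to an address
			// below st->m_mem and so terminates the
			// "(char *)link >= st->m_mem" walks used elsewhere.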
		if ( returnPops ) pop = tt.m_pops[i];
		// set hi bit of "pop" if in unicode
		if ( isUnicode ) pop |= 0x80000000;
		else             pop &= 0x7fffffff;
		// . add term to master table
		// . don't keep filling it up if we failed to alloc more space
		//   because that causes getTermNum() above to crash if the
		//   table is 100% full.
		if ( ! master->addTerm ( tt.getTermId(i) ,
					 // divide by the AVG score used
					 //tt.getScoreFromTermNum(i)+30000/pop,
					 score ,
					 //tt.getScoreFromTermNum(i)+30000,
					 0x7fffffff ,
					 false ,
					 TITLEREC_CURRENT_VERSION ,
					 tt.getTermPtr(i) ,
					 tt.getTermLen(i) ,
					 n ,// termNum
					 NULL ,// dummy(char *)link
					 pop,
					 isUnicode ) )
			break;
		// debug msg
		if ( g_conf.m_logDebugQuery ) {
			char *ww      = tt.getTermPtr(i);
			int32_t wwlen = tt.getTermLen(i);
			char c = ww[wwlen];
			ww[wwlen]='\0';
			log(LOG_DEBUG,"topics: master termId=%"UINT32" "
			    "score=%"INT32" cumscore=%"INT32" len=%"INT32" term=%s\n",
			    (int32_t)tt.getTermId(i),
			    score,master->getScoreFromTermId(tt.getTermId(i)),
			    wwlen,ww);
			ww[wwlen]=c;
		}
	}

	//log("master has %"INT32" terms",master->getNumTermsUsed());
	// clear any error
	if ( g_errno ) {
		log("topics: Had error getting topic candidates from document: "
		    "%s.",mstrerror(g_errno));
		g_errno = 0;
	}
	//mfree ( buf , bufMaxLen , "Msg24" );
	return true;
}


void hashExcerpt ( Query *q , uint64_t *qids , int32_t *qpops, int32_t nqi,
		   TermTable *tt , char *buf , int32_t bufLen ,
		   Words *w , TopicGroup *t , Scores *scoresPtr ,
		   bool isUnicode , char *repeatTable ,
		   int32_t repeatTableNumSlots , char language ) {
	// . bring it out
	// . allow one more word per gigabit, then remove gigabits that
	//   are that length. this fixes the problem of having the same
	//   sentence repeated in different documents, which are fairly
	//   different as a whole, but have the same repeated sentence or
	//   paragraph.
	// . by only adding one, if the next word is a common word then
	//   we would fail to make a larger gigabit, that's why i added
	//   the maxjend code below this.
	int32_t maxWordsPerPhrase = t->m_maxWordsPerTopic ;
	if ( t->m_topicRemoveOverlaps ) maxWordsPerPhrase += 2;
	char enforceQueryRadius = ! t->m_meta[0];
	char delimeter          = t->m_delimeter; // 0 means none (default)
	char idf                = t->m_useIdfForTopics;
	// or if no query, no query radius
	if ( ! q || q->getNumNonFieldedSingletonTerms() == 0 )
		enforceQueryRadius = false;
	// . now all the data is in buf/bufLen
	// . parse it up into Words
	// . now XmlDoc::getGigabitVector() calls us and it already has the
	//   Words passed up, so it will use a NULL buf
	if ( buf ) w->set ( isUnicode , // isUnicode?
			    false ,    // isNormalized?
			    buf ,
			    bufLen ,
			    TITLEREC_CURRENT_VERSION,
			    true ,     // compute word ids?
			    true );    // has html entities?
	int32_t nw = w->getNumWords();
	// don't breach our arrays man
	if ( nw > 10000 ) nw = 10000;
	void *lrgBuf;
	int32_t lrgBufSize = 0;
	lrgBufSize += 1002 * MAX_QUERY_TERMS * sizeof(int32_t);
	lrgBufSize += 2 * nw * sizeof(int32_t);
	lrgBufSize += 3 * nw * sizeof(char);
	lrgBufSize += nw * sizeof(uint64_t);
	lrgBuf = (char *)mmalloc(lrgBufSize, "hashExcerpt (Msg24)");
	if (! lrgBuf) {
		// on failure, retry with a quarter of the words
		nw >>= 2;
		lrgBufSize = 0;
		lrgBufSize += 1002 * MAX_QUERY_TERMS * sizeof(int32_t);
		lrgBufSize += 2 * nw * sizeof(int32_t);
		lrgBufSize += 3 * nw * sizeof(char);
		lrgBufSize += nw * sizeof(uint64_t);
		lrgBuf = (char *)mmalloc(lrgBufSize, "hashExcerpt (Msg24)");
	};
	if (! lrgBuf) {
		log("topics: could not allocate local buffer "
		    "(%"INT32" bytes required)", lrgBufSize);
		return;
	};
	char *lrgBufPtr = (char *)lrgBuf;
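	// illustrative note, not from the original source: lrgBuf is carved
	// up below into parallel per-word arrays (pops, iqt, icw, qtrs, wids
	// and repeatScores) plus the per-query-term position lists (pos,
	// posLen, posPtr); those pieces sum to exactly the lrgBufSize
	// computed above.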

	// . the popularity of word #i is pops[i]
	// . but we only set it below if we need to
	int32_t *pops = (int32_t *) lrgBufPtr; // popularity, 1-1 with the
	lrgBufPtr += nw * sizeof(int32_t);     // first 10000 words
	char *iqt = lrgBufPtr; // is query term? 1-1 with words
	lrgBufPtr += nw * sizeof(char);
	char *icw = lrgBufPtr; // do not let frags end in these words
	lrgBufPtr += nw * sizeof(char);
	int32_t *qtrs = (int32_t *)lrgBufPtr; // the raw QTR scores (aac)
	lrgBufPtr += nw * sizeof(int32_t);

	// record list of word positions for each query term
	int32_t *pos = (int32_t *)lrgBufPtr;
	lrgBufPtr += MAX_QUERY_TERMS * 1000 * sizeof(int32_t);
	int32_t *posLen = (int32_t *)lrgBufPtr;
	lrgBufPtr += MAX_QUERY_TERMS * sizeof(int32_t);
	int32_t *posPtr = (int32_t *)lrgBufPtr;
	lrgBufPtr += MAX_QUERY_TERMS * sizeof(int32_t);
	//for ( int32_t i = 0 ; q && i < q->getNumTerms() ; i++ ) { (aac)
	for (int32_t i = 0; q && i < q->m_numWords && i < MAX_QUERY_TERMS; i++) {
		posLen[i] = 0; posPtr[i] = 0; }

	// skip punct
	int32_t i = 0;
	if ( i < nw && w->isPunct(i) ) i++;
	qtrs[i] = 0;
	uint64_t *wids = (uint64_t *)lrgBufPtr;
	lrgBufPtr += nw * sizeof(uint64_t);
	// record the positions of all query words
	char **wp       = w->m_words;
	int32_t *wlen   = w->m_wordLens;
	int32_t step    = 2;
	int64_t *rwids  = w->getWordIds();
	int32_t *scores = NULL;

	// . now we keep a hash table to zero out repeated fragments
	// . it uses a sliding window of 5 words
	// . it stores the hash of those 5 words in the hash table
	// . it sees how many 5-word matches it gets in a row
	// . the more matches it gets, the more it demotes the word scores
	// . these are stored in the weights class
	// . a repeatScore of 0 means to demote it out completely, 100 means
	//   it is not repeated at all
	// . multiply the final gigabit score by the repeatScore/100.
	char *repeatScores = lrgBufPtr;
	lrgBufPtr += nw * sizeof(char);
	setRepeatScores ( repeatScores , rwids , nw , repeatTable ,
			  repeatTableNumSlots , w );

	QUICKPOLL(0);
	// single char length in bytes, etc.
	char oneChar    = 1;
	char twoChars   = 2;
	char threeChars = 3;
	if ( isUnicode ) {
		oneChar    = 2;
		twoChars   = 4;
		threeChars = 6;
	}
	// . advance one word at a time if doing it the new way
	// . also, the word ids will already be set, so use those to see what
	//   is indexable and what isn't
	if ( ! buf ) {
		step   = 1;
		scores = scoresPtr->m_scores;
	}
	// loop over the words in our sample
	//for ( ; i < nw ; i += 2 ) {
	for ( ; i < nw ; i += step ) {
		qtrs[i] = 0;
		// do we have pre-supplied words and scores from XmlDoc.cpp?
		//if ( rwids ) {
		// skip if not indexable
		if ( ! rwids[i] ) continue;
		// or if score is <= 0
		if ( scores && scores[i] <= 0 ) continue;
		// or repeated too much
		if ( repeatScores[i] <= 20 ) continue;
		//}
		// reset popularity
		if ( idf ) pops[i] = -1;
		else       pops[i] = 1; // assume all same if not using idf
		// reset "is query term" array
		iqt[i] = 0;
		// store the id
		int32_t encodeType = csISOLatin1;
		if ( isUnicode ) encodeType = csUTF16;
		wids[i] = hash64d(wp[i], wlen[i], encodeType);
		// . is it a common word?
		// . it is if it is just one letter
		// . what about X-windows coming up for a 'windows' query?
		//   or e-mail coming up for a query?
		// . METALINCS likes to have 1 digit topics
		if ( wlen[i] <= oneChar && is_lower(wp[i][0]) ) icw[i] = 1;
		// unicode ~equivalent
		//if ( isUnicode && wlen[i] == 2 ) icw[i] = 1;
		// 2004 is common here but if it makes it in, don't remove it
		// in the top topics list... no. loses 'atari 2600' then!
		//else if ( is_digit(w->getWord(i)[0]) )
		//	icw[i] = 1;
#ifndef _METALINCS_
		else icw[i] = isCommonWord ( (int32_t)rwids[i] );
#else
		// always allow gigabits that start with numbers for metalincs
		else if ( ! is_digit(wp[i][0]))
			icw[i] = isCommonWord ( (int32_t)rwids[i] );
		else
			icw[i] = 0;
#endif
		// debug msg
		/*
		char *s = w->getWord(i);
		int32_t slen = w->getWordLen(i);
		char c = s[slen];
		s[slen]='\0';
		log("icw=%"INT32" %s",icw[i],s);
		s[slen]=c;
		*/
		// is it a query term? if so, record its word # in "pos" array
		for ( int32_t j = 0 ; j < nqi ; j++ ) {
			if ( wids[i] != qids[j] ) continue;
			if ( posLen[j] >= 1000 ) continue;
			pos    [ 1000 * j + posLen[j] ] = i;
			posLen [ j ]++;
			// mark this word so if a phrase only has
			// all query terms we do not hash it
			iqt[i] = 1;
			break;
		}
	}

	QUICKPOLL(0);
	// max score -- ONE max scoring hit per doc
	int32_t maxScore = nqi * MAX_SCORE_MULTIPLIER;
	// this happens when generating the gigabit vector for a single doc
	// so don't hamper it with such a small ceiling
	if ( nqi == 0 ) maxScore = ALT_MAX_SCORE;

	// skip punct
	i = 0;
	if ( i < nw && w->isPunct(i) ) i++;
	// score each word based on distance to query terms
	int32_t score;
	// loop through all the words
	//for ( ; i < nw ; i += 2 ) {x
	for ( ; i < nw ; i += step ) {
		// debug point
		//if ( strncasecmp( wp[i],"Microsoft",9) == 0 )
		//	log("hey");
		// do we have pre-supplied words and scores from XmlDoc.cpp?
		//if ( rwids ) {
		// skip if not indexable
		if ( ! rwids[i] ) continue;
		// or if score is <= 0
		if ( scores && scores[i] <= 0 ) continue;
		//}
		// skip if in a repeat chunk of doc
		if ( repeatScores[i] <= 20 ) continue;
		// protect against misspelled html entities (aac)
		if ( (wp[i][-oneChar] == '&' && is_alnum(wp[i][0])) ||
		     (wp[i][0] == '&' && is_alnum(wp[i][oneChar])) ) continue;
		// no more one or two letter gigabits (aac)
		if ( wlen[i] < threeChars && (! is_digit(wp[i][0])) ) continue;
		//continue; //mdw
		// if we had a delimeter, previous word must have it
		// or be the first punct word
		if ( delimeter && i >= 2 && ! w->hasChar(i-1,delimeter) )
			continue;
		// skip if a query term, it's ineligible
		//if ( w->getWordLen(i) == 0 ) continue;
		// if query is NULL, assume we are restricting to meta tags
		// and query is not necessary
		if ( enforceQueryRadius ) score = 0;
		else                      score = ALT_START_SCORE;
		int32_t j ;
		int32_t nm = 0; // number of matches
		for ( j = 0 ; j < nqi ; j++ ) {
			// skip if no query terms in doc for query term #j
			if ( posLen[j] <= 0 ) continue;
			// get distance in words
			int32_t d1 = i - pos[ 1000 * j + posPtr[j] ] ;
			if ( d1 < 0 ) d1 = d1 * -1;
			if ( posPtr[j] + 1 >= posLen[j] ) {
				if (d1 >= QTR_ZONE_3) continue;
				if (iqt[i] || icw[i] ||
				    wlen[i] <= threeChars) {
					// common words, query terms and short
					// words are all second class citizens
					// when it comes to scoring: they get
					// a small bonus, to ensure that they
					// are considered in the next stage,
					// but do not benefit from QPOP and
					// multiple hit bonuses (aac)
					score += QTR_BONUS_CW;
					continue;
				};
				if (d1 < QTR_ZONE_0)
					score += QTR_BONUS_0;
				else if (d1 < QTR_ZONE_1)
					score += QTR_BONUS_1;
				else if (d1 < QTR_ZONE_2)
					score += QTR_BONUS_2;
				else
					score += QTR_BONUS_3;
				nm++;
				score *= qpops[j];
				continue;
			}
			int32_t d2 = pos[ 1000 * j + posPtr[j] + 1 ] - i ;
			if ( d2 < 0 ) d2 = d2 * -1;
			if ( d2 > d1 ) {
				// if ( d1 >=20 ) continue;
				// if ( d1 < 4 ) score += 1000;
				// else if ( d1 < 8 ) score += 800;
				// else if ( d1 < 12 ) score += 500;
				// else score += 200;
				// nm++;
				// score *= qpops[j];
				// continue;
				if (d1 >= QTR_ZONE_3) continue;
				if (iqt[i] || icw[i] ||
				    wlen[i] <= threeChars) {
					// common words, query terms and short
					// words are all second class citizens
					// when it comes to scoring: they get
					// a small bonus, to ensure that they
					// are considered in the next stage,
					// but do not benefit from QPOP and
					// multiple hit bonuses (aac)
					score += QTR_BONUS_CW;
					continue;
				};
				if (d1 < QTR_ZONE_0)
					score += QTR_BONUS_0;
				else if (d1 < QTR_ZONE_1)
					score += QTR_BONUS_1;
				else if (d1 < QTR_ZONE_2)
					score += QTR_BONUS_2;
				else
					score += QTR_BONUS_3;
				nm++;
				score *= qpops[j];
				continue;
			}
			// if ( d2 >=20 ) { posPtr[j]++; continue; }
			// if ( d2 < 4 ) score += 1000;
			// else if ( d2 < 8 ) score += 800;
			// else if ( d2 < 12 ) score += 500;
			// else score += 200;
			// nm++;
			// score *= qpops[j];
			if (d2 >= QTR_ZONE_3) { posPtr[j]++; continue; };
			if (iqt[i] || icw[i] || wlen[i] <= threeChars) {
				// common words, query terms and short words
				// are all second class citizens when it
				// comes to scoring: they get a small
				// bonus, to ensure that they are
				// considered in the next stage, but do not
				// benefit from QPOP and multiple hit
				// bonuses (aac)
				score += QTR_BONUS_CW;
				continue;
			};
			if      (d2 < QTR_ZONE_0) score += QTR_BONUS_0;
			else if (d2 < QTR_ZONE_1) score += QTR_BONUS_1;
			else if (d2 < QTR_ZONE_2) score += QTR_BONUS_2;
			else                      score += QTR_BONUS_3;
			nm++;
			score *= qpops[j];
			continue;
			// note: unreachable, the continue above always fires
			posPtr[j]++;
		}

		// skip if too far away from all query terms
		if ( score <= 0 ) continue;

		// no longer count closeness to query terms for score,
		// just use # times topic is in doc(s) and popularity
		//score = 1000;

		// set pop if it is -1
		if ( pops[i] == -1 ) {
			pops[i] = g_speller.
				getPhrasePopularity( wp[i],wids[i], true,
						     language );
			// decrease popularity by half if
			// capitalized so Jack does not have
			// same pop as "jack"
			if ( is_upper (wp[i][0]) ) pops[i] >>= 1;
			if ( pops[i] == 0 ) pops[i] = 1;
			QUICKPOLL(0);
		}

		// give a boost for multiple hits
		// the more terms in range, the bigger the boost
		if ( nm > 1 ) {
			//log("nm=%"INT32"",nm);
			score += MULTIPLE_HIT_BOOST * nm;
		};

		// save the raw QTR score
		qtrs[i] = score;
	};
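	// illustrative note, not from the original source: with the default
	// zones a word 3 positions from the nearest occurrence of query term
	// j earns QTR_BONUS_0 (1000), at distance 10 it earns QTR_BONUS_2
	// (500), and at QTR_ZONE_3 (20) or more it earns nothing; the running
	// score is then multiplied by qpops[j], so proximity to rare query
	// terms dominates the raw QTR score.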

	QUICKPOLL(0);
	int32_t mm = 0;
	// skip punct
	i = 0;
	if ( i < nw && w->isPunct(i) ) i++;
	for ( ; i < nw ; i += step ) {
		float pop;
		int32_t score;
		int32_t bonus;
		// must start with a QTR-scoring word
		if (qtrs[i] <= 0) continue;
		// add it to table
		// init for debug here
		char *ww;
		int32_t wwlen;
		//char c;
		int32_t ss;
		ww    = wp  [i]; // w->getWord(i);
		wwlen = wlen[i]; // w->getWordLen(i);
		if ( icw[i] ) {
			// . skip this and all phrases if we're "to"
			// . avoid "to use..." "to do..." "to make..." annoying
			// . "to" has score 1, "and" has score 2, "of" is 3,
			// . "the" is 4, "this" is 5
			if ( icw[i] <= 5 ) continue;
			// cannot start with any common word, unless capitalized
			if ( is_lower(wp[i][0]) ) continue;
		}
		// if a hyphen is immediately before us, we cannot start
		// a phrase... fu-ture, preven-tion
		if ( i > 0 && wp[i][-oneChar]=='-' ) continue;
		// same for colon
		if ( i > 0 && wp[i][-oneChar]==':' ) continue;
		// . if a "'s " is before us, we cannot start either
		// . "valentine's day cards"
		if ( i >= 3 &&
		     wp[i][-threeChars]=='\'' &&
		     wp[i][-twoChars  ]=='s'  &&
		     is_space(wp[i][-oneChar]) ) continue;
		// or if our first char is a digit and a "digit," is before us
		// because we don't want to break numbers with commas in them
		if ( is_digit(wp[i][0]) && i >= 2 && wp[i][-oneChar]==',' &&
		     is_digit(wp[i][-twoChars]) ) continue;
		// set initial popularity
		if (pops[i] > 0) {
			pop = ((float) pops[i]) / MAXPOP;
		}
		else {
			pop = 1.0 / MAXPOP;
		};
		// set initial score and bonus
		score = qtrs[i];
		bonus = 0;
		uint64_t h = wids[i]; // hash value
		// if first letter is upper case, double the score
		//if ( is_upper (w->getWord(i)[0]) ) score <<= 1;

		// . loop through all phrases that start with this word
		// . up to 6 real words per phrase
		// . 'j' counts our 'words', which count a run of punct as
		//   a word
		int32_t jend = i + maxWordsPerPhrase * 2; // 12;
		int32_t maxjend = jend ;
		if ( t->m_topicRemoveOverlaps ) maxjend += 8;
		if ( jend    > nw ) jend    = nw;
		if ( maxjend > nw ) maxjend = nw;

		QUICKPOLL(0);

		int32_t count = 0;
		int32_t nqc = 0; // # common/query words in our phrase
		int32_t nhw = 0; // # of "hot words" (contribute to score)
		if ( scores ) mm = scores[i];
		//for ( int32_t j = i ; j < jend ; j += 2 ) {
		for ( int32_t j = i ; j < jend ; j += step ) {
			// skip if not indexable
			if ( ! rwids[j] ) continue;
			// or if score is <= 0
			if ( scores && scores[j] <= 0 ) continue;
			if ( repeatScores[j] <= 20 ) continue;
			// no ending in "ing" unless capitalized
			if ( wlen[j] > threeChars &&
			     wp[j][wlen[j]-oneChar   ]=='g' &&
			     wp[j][wlen[j]-twoChars  ]=='n' &&
			     wp[j][wlen[j]-threeChars]=='i' &&
			     is_lower(wp[j][0]) )
				continue;
			if (j == i) {
				if (icw[j] || wlen[j] < threeChars)
					bonus -= FWC_PENALTY;
				// if word is 4 letters or more and ends in
				// "ed", do not allow it to be its own gigabit
				if ( wlen[j] > threeChars &&
				     wp[j][wlen[j]-oneChar ]=='d' &&
				     wp[j][wlen[j]-twoChars]=='e' )
					continue;
				// no more "com" gigabits, please! (aac)
				if ( wlen[j] == threeChars &&
				     wp[j][0       ]=='c' &&
				     wp[j][oneChar ]=='o' &&
				     wp[j][twoChars]=='m') continue;
			};
			// let's generalize even more! do not allow common
			// single words as gigabits, with 250+ pop
			//if ( pop > 100 && j == i && is_lower(wp[j][0]) ) continue;
			// the above assumes a MAX_POP of 10k (sanity check)
			//if ( MAXPOP != 10000 ) { char *xx = NULL; *xx = 0; }
			// are we past the first word in the phrase?
			if ( j > i ) {
				// advance phrase length
				wwlen += wlen[j-1] + wlen[j];
				// . cut phrase short if too much punct between
				//   the current word, j, and the last one, j-2
				// . but allow for abbreviations or initials
				//   of single letters, like 'harry s. truman'.
				//   we do not want to break before 's.'
				// . because the phrase "s. doesn't stand for
				//   anything." was unable to form. we only
				//   got "s." and "doesn't stand for anything."
				//   as possible gigabit candidates.
				//if ( wlen[j-1] > 1 ) {
				//	if ( wlen[j-1] != 2 ) break;
				//	if ( wp  [j-1][0] != '.' ) break;
				//	if ( wlen[j-2] > 1 ) break;
				//}
				// . we now allow most punct since it is
				//   filtered out above w/ hasPunct variable
				// . this a little more than doubles the
				//   processing overhead going from 1 to 3
				// . going from 1 to 2 we see that we take 60ms
				//   instead of 50ms *when removing overlaps*
				// . at 1 we take about 48/45ms, not much
				//   different when removing overlaps
				// . increasing this totally wipes out our
				//   overlap problem, but it is very expensive,
				//   so now i just halt after jumping one big
				//   string of punct below, and filter out
				//   those gigabits above with hasPunct.
				// . i'd really like to NOT have this here
				//   because we get much better gigabits, but
				//   we need it as a speed saver...
				if (wlen[j-1]>t->m_topicMaxPunctLen) break;
				// no phrasing across commas, etc.
				/*
				if ( wlen[j-1] == 2 ) {
					// only allow " " or ": " or ". "
					if ( wp[j-1][1]!=' ' ) break;
					if ( wp[j-1][0]!=' '  &&
					     wp[j-1][0]!=':'  &&
					     wp[j-1][0]!='\'' && // beatles'
					     // allow commas here, but we
					     // remove any gigabits with commas
					     // because we just use them to
					     // cancel out bad gigabits.
					     wp[j-1][0]!=','  &&
					     wp[j-1][0]!='.' ) break;
					// . TODO: add in sgt. col. so that
					//   stuff can be in a gigabit
					// . only allow ". " if prev word was
					//   abbreviation.
					if ( wp[j-1][0]=='.' &&
					     j >= 2 &&
					     wlen[j-2] > 3) break; // != 1
				}
				*/
				// or if we just skipped the delimeter,
				// we are not allowed to phrase across that
				// if one was provided
				if ( delimeter && w->hasChar(j-1,delimeter))
					break;
				// make sure we could phrase across last word
				//if ( wlen[j-1] > 1 &&
				//     bits.getPunctuationBits(wp[j-1],wlen[j-1])
				//     == 0 ) break;
				// accumulate the phrase's hash
				h = hash64 ( h , wids[j] );
				// set pop if it is -1
				if ( pops[j] == -1 ) {
					pops[j]= g_speller.
						getPhrasePopularity( wp[j],
						     wids[j], true, language );
					// decrease popularity by half if
					// capitalized so Jack does not have
					// same pop as "jack"
					if ( is_upper (wp[j][0]) )
						pops[j] >>= 1;
					// why was this in there?
					if ( pops[j] <= 0 ) pops[j] = 1;
					QUICKPOLL(0);
				}
				// adjust popularity
				pop = (pop * pops[j])/MAXPOP;
				// watch out for underflow
				if ( pop <= 0.0 ) pop = 1.0/MAXPOP;
				// keep the highest of the word scores
				if ( scores && scores[j] > mm ) mm = scores[j];
			}

			// keep track of words
			count++;
			if ( iqt[j] || icw[j] ) {
				nqc++; // increment number of query/commoners
			}
			else if (qtrs[j] > 0) {
				score += qtrs[j];
				nhw++; // increment "hot word" counter
			};
			// keep phrasing until next punct word is delimeter
			// or the end
			if ( delimeter ) {
				// if we end on a punct word, then hash
				// our phrase, otherwise, only hash it if
				// the next word has the delimeter
				if ( j+2 < jend && ! w->hasChar(j+1,delimeter))
					continue;
			}
			// otherwise, ensure phrase is not ALL query terms
			else {
				// if phrase is all commoners & query skip it
				if ( nqc == count ) {
#ifdef DEBUG_MSG24
					char saveChar = ww[wwlen];
					ww[wwlen] = '\0';
					log("topics: phrase is all QT or CW; "
					    "skipping phrase %s", ww);
					ww[wwlen] = saveChar;
#endif
					continue;
				};
			}
			// . skip if we're common, pair across common words
			// . BUT it is common for a meta tag to end in ".com"
			//   so we should not count that one as common
			if ( icw[j] ) {
				// allow for more words only for purposes
				// of fixing the ABCD and BCDE overlap bug
				// without having to raise jend for all cases
				if ( jend < maxjend ) jend++;
				continue;
			}
			// do not stop if - . or @ follows us right b4 alnum
			if ( j+1 < nw && is_alnum(wp[j+1][oneChar]) ) {
				if ( wp[j+1][0]=='-' ) continue;
				if ( wp[j+1][0]=='.' ) continue;
				if ( wp[j+1][0]=='\'') continue;
				if ( wp[j+1][0]=='@' ) continue;
				// . do not split phrases between capitalized
				//   words
				// . this should fix the Costa Rica, Costa Blah
				//   bug
				// . it may decrease score of Belkin for query
				//   'Belkin Omni Cube' but that's ok because if
				//   Belkin is important it will be used
				//   independently.
				if ( is_upper(wp[j][0]) &&
				     j + 2 < nw &&
				     wp[j+1][0]==' ' &&
				     is_upper(wp[j+2][0]) &&
				     wlen[j+1] == oneChar &&
				     t->m_maxWordsPerTopic > 1 )
					continue;
			}
			// do not mix caps
			if ( is_upper(wp[i][0]) != is_upper(wp[j][0]) )
				continue;
			// . do not stop on a single capital letter
			// . so we don't stop on "George W->" (george w. bush)
			// . i added the " && j > i" so METALINCS can have
			//   single digit gigabits
			if ( wlen[j] == oneChar && j > i ) continue;
			// . do not split after Mr. or St. or Ms. or Mt. ...
			// . fixes 'st. valentines day'
			if ( wlen[j] == twoChars && is_upper(wp[j][0]) &&
			     wp[j][twoChars]=='.' ) continue;
			// sgt. or col.
			if ( wlen[j] == threeChars && wp[j][threeChars]=='.' ){
				if ( to_lower(wp[j][0       ])=='s' &&
				     to_lower(wp[j][oneChar ])=='g' &&
				     to_lower(wp[j][twoChars])=='t' ) continue;
				if ( to_lower(wp[j][0       ])=='c' &&
				     to_lower(wp[j][oneChar ])=='o' &&
				     to_lower(wp[j][twoChars])=='l' ) continue;
				if ( to_lower(wp[j][0       ])=='m' &&
				     to_lower(wp[j][oneChar ])=='r' &&
				     to_lower(wp[j][twoChars])=='s' ) continue;
			}
			// . do not split commas in numbers
			// . like 1,000,000,000
			if ( j >= 2 &&
			     wp[j][-oneChar ]==',' &&
			     is_digit(wp[j][-twoChars]) &&
			     wp[j][wlen[j]]==',' &&
			     is_digit(wp[j][wlen[j]+oneChar]))
				continue;
			/*
			if      ( pop <  1 ) ;
			else if ( pop <  2 ) ss = (score * 90) / 100;
			else if ( pop <  5 ) ss = (score * 85) / 100;
			else if ( pop < 10 ) ss = (score * 80) / 100;
			else if ( pop < 20 ) ss = (score * 75) / 100;
			else if ( pop < 30 ) ss = (score * 70) / 100;
			else if ( pop < 40 ) ss = (score * 65) / 100;
			else if ( pop < 50 ) ss = (score * 60) / 100;
			else                 ss = (score * 40) / 100;
			*/

			//if ( tt->getScoreFromTermId((int64_t)h) > 0 )
			//	continue;
			// debug msg
			//char c = ww[wwlen];
			//ww[wwlen]='\0';
			//fprintf(stderr,"tid=%"UINT32" score=%"INT32" "
			//	"pop=%"INT32" len=%"INT32" "
			//	"repeat=%"INT32" term=%s\n",h,ss,pop,wwlen,
			//	repeatScores[i],ww);
			//ww[wwlen]=c;
			// include any ending or starting ( or )
			if ( i > 0 && ww[-oneChar] == '(' ) {
				// ensure we got a ')' somewhere before adding (
				for ( int32_t r = 0 ; r <= wwlen ; r++ )
					if ( ww[r]==')' ) {
						ww--; wwlen++; break; }
			}
			if ( i < nw && ww[wwlen] == ')' ) {
				// we need a '(' somewhere before adding the )
				for ( int32_t r = 0 ; r <= wwlen ; r++ )
					if ( ww[r]=='(' ) {
						wwlen++; break; }
			}
			// now remove ('s if we begin AND end in them
			if ( ww[0] == '(' && ww[wwlen-oneChar] == ')' ) {
				ww++; wwlen -= twoChars; }
			// now double score if capitalized, we need more
			// proper nouns for topic clustering to work better,
			// but it doesn't count if start of a sentence, so
			// there must be some alnum word right before it.
			//if (is_upper(ww[0]) && !isUnicode && wwlen>=2 &&
			// note: ss is overwritten by "ss = score" below, so
			// this doubling currently has no effect
			if ( is_upper(ww[0]) && wwlen>=twoChars &&
			     is_alnum(ww[-twoChars]))
				ss <<= 1; // 1;
			// adjust the gigabit score using the new scores array
			//if ( scores && mm != NORM_WORD_SCORE )
			//	ss = (ss * mm) / NORM_WORD_SCORE;
			// only count the highest scoring guy once per page
			//int32_t tn = tt->getTermNum((int64_t)h);
			//maxScore = ss;
			//if ( tn >= 0 ) {
			//	int32_t sc = tt->getScoreFromTermNum(tn);
			//	if ( sc > maxScore ) maxScore = sc;
			//}
			// . add it
			// . now store the popularity, too, so we can display
			//   it for the winning gigabits
			//if ( ! tt->addTerm ((int64_t)h,ss,maxScore,false,
			//	ww,wwlen,tn,NULL,pop) )
			// . weight score by pop
			// . lets try weighting more popular phrases more!
			ss = score;
			if (nhw > 0) ss /= nhw;
			ss += bonus;
			float boost;
			if ( ((float)nhw) / count < SPARSE_MARK)
				ss -= SPARSE_PENALTY;
			if      (pop < POP_ZONE_0) boost = POP_BOOST_0;
			else if (pop < POP_ZONE_1) boost = POP_BOOST_1;
			else if (pop < POP_ZONE_2) boost = POP_BOOST_2;
			else if (pop < POP_ZONE_3) boost = POP_BOOST_3;
			else                       boost = POP_BOOST_4;
			ss = (int32_t)(boost *ss);
			if ( ss <= 0 ) ss = 1;
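			// illustrative note, not from the original source:
			// ss starts as the summed QTR score divided by the
			// hot-word count, takes the FWC/sparse penalties, then
			// gets the pop boost: e.g. a phrase pop of 0.00005
			// falls under POP_ZONE_1 for a 1.5x boost, while a
			// very common phrase (pop >= POP_ZONE_3) is cut to
			// 0.1x via POP_BOOST_4.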
			// store it
			int32_t ipop = (int32_t)(pop * MAXPOP);
			if ( ! tt->addTerm ((int64_t)h,ss,maxScore,false,
					    TITLEREC_CURRENT_VERSION ,
					    ww,wwlen,-1,NULL,ipop) ) {
				log("topics: No memory to grow table.");
				return;
			}

			// stop after indexing a word after a long string of
			// punct, this is the overlap bug fix without taking
			// a performance hit. hasPunct above will remove it.
			if ( j > i && wlen[j-1] > twoChars ) break;
		}
	}
	// clear any error
	if ( g_errno ) {
		log("topics: Had error getting topic candidates from "
		    "document: %s.",mstrerror(g_errno));
		g_errno = 0;
	}
	mfree(lrgBuf, lrgBufSize, "hashExcerpt (Msg24)");
}


// taken from Weights.cpp's set3() function
void setRepeatScores ( char *repeatScores ,
		       int64_t *wids ,
		       int32_t nw ,
		       char *repeatTable ,
		       int32_t repeatTableNumSlots ,
		       Words *words ) {
	// if no words, nothing to do
	if ( nw == 0 ) return;

	char *ptr = repeatTable;
	int32_t numSlots = repeatTableNumSlots;
	int64_t *hashes = (int64_t *)ptr; ptr += numSlots * 8;
	int32_t *vals   = (int32_t *)ptr; ptr += numSlots * 4;

	int64_t ringWids [ 5 ];
	int32_t ringPos  [ 5 ];
	int32_t ringi = 0;
	int32_t count = 0;
	int64_t h     = 0;

	// make the mask (assumes numSlots is a power of two)
	uint32_t mask = numSlots - 1;

	// clear ring of hashes
	memset ( ringWids , 0 , 5 * sizeof(int64_t) );

	// for sanity check
	//int32_t lastStart = -1;

	// count how many 5-word sequences we match in a row
	int32_t matched    = 0;
	int32_t matchStart = -1;

	// reset
	memset ( repeatScores , 100 , nw );

	// return until we fix the infinite loop bug
	//return;

	// . hash EVERY 5-word sequence in the document
	// . if we get a match look and see what sequences it matches
	// . we allow multiple instances of the same hash to be stored in
	//   the hash table, so keep checking for a matching hash until you
	//   chain to a 0 hash, indicating the chain ends
	// . check each matching hash to see if more than 5 words match
	// . get the max words that matched from all of the candidates
	// . demote the word and phrase weights based on the total/max
	//   number of words matching
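	// illustrative example, not from the original source: if a 20-word
	// run repeats an earlier passage verbatim, its successive 5-word
	// hashes keep hitting the table; once "matched" reaches 3, the whole
	// matching stretch gets repeatScores of 0, and hashExcerpt() above
	// ignores any word with a repeatScore <= 20.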
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// skip if not alnum word
		if ( ! wids[i] ) continue;
		// reset
		//repeatScores[i] = 100;
		// add new to the 5 word hash
		h ^= wids[i];
		// . remove old from 5 word hash before adding new...
		// . initial ring wids are 0, so should be benign at startup
		h ^= ringWids[ringi];
		// add to ring
		ringWids[ringi] = wids[i];
		// save our position
		ringPos[ringi] = i;
		// wrap the ring ptr if we need to, that is why we are a ring
		if ( ++ringi >= 5 ) ringi = 0;
		// this 5-word sequence starts with word # "start"
		int32_t start = ringPos[ringi];
		// need at least 5 words in the ring buffer to do analysis
		if ( ++count < 5 ) continue;
		// sanity check
		//if ( start <= lastStart ) { char *xx = NULL; *xx = 0; }
		// look up in the hash table
		int32_t n = h & mask;
		// stop at new york times - debug
		/*
		if ( words->m_words[i][0] == 'A' &&
		     words->m_words[i][1] == 's' &&
		     words->m_words[i][2] == 'k' &&
		     words->m_words[i][3] == 'e' &&
		     words->m_words[i][4] == 'd' &&
		     words->m_words[i][5] == ' ' &&
		     words->m_words[i][6] == 'Q' &&
		     words->m_words[i][7] == 'u' )
			log("hey");
		*/
	loop:
		// all done if empty
		if ( ! hashes[n] ) {
			// add ourselves to the hash table now
			hashes[n] = h;
			// this is where the 5-word sequence starts
			vals  [n] = matchStart+1;
			// do not demote any words if fewer than 3 matched
			if ( matched < 3 ) { matched = 0; continue; }
			// reset
			matched = 0;
			// . how much should we demote?
			// . 10 matching words pretty much means 0 weights
			//float demote = 1.0 - ((matched-5)*.10);
			//if ( demote >= 1.0 ) continue;
			//if ( demote <  0.0 ) demote = 0.0;
			// demote the words involved
			for ( int32_t j = matchStart ; j < i ; j++ )
				repeatScores[j] = 0;
			// get next word
			continue;
		}
		// get next in chain if hash does not match
		if ( hashes[n] != h ) {
			// wrap around the hash table if we hit the end
			if ( ++n >= numSlots ) n = 0;
			// check out bucket #n now
			goto loop;
		}
		// save start of matching sequence for demote loop
		if ( matched == 0 ) matchStart = start;
		// inc the match count
		matched++;
	}
	// if we ended without nulling out some matches
	if ( matched < 3 ) return;
	for ( int32_t j = matchStart ; j < nw ; j++ ) repeatScores[j] = 0;

}


/*
// is it a stop phrase?
char isCommonPhrase ( int32_t h ) {
	static TermTable s_table;
	static bool s_isInitialized = false;
	// . these have the stop words above plus some foreign stop words
	// . these aren't
	// . i shrunk this list a lot
	// . see backups for the old list
	static char *s_stopPhrases[] = {
		"all rights reserved" ,
		"in addition" ,
		"for example" ,
		"for more information"
	};
	// include a bunch of foreign prepositions so they don't get required
	// by the bitScores in IndexTable.cpp
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( sizeof(s_stopPhrases) * 2 ) )
			return log("Msg24::isCommonPhrase: error set table");
		// now add in all the stop words
		int32_t n = (int32_t)sizeof(s_stopPhrases)/ sizeof(char *);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			// set the phrases
			char *sw = s_stopPhrases[i];
			int32_t swlen = strlen ( sw );
			Words w;
			w->set ( false , sw , swlen );
			int32_t h = hash64d ( w->getWord (0),
					      w->getWordLen(0));
			for ( int32_t j = 1 ; j < w->getNumWords() ; j++ )
				int32_t h2 =

			int32_t swh = hash64d ( sw , swlen );
			s_table.addTerm ((int32_t)swh,i+1,0x7fffffff,true);
		}
		s_isInitialized = true;
	}

	// . all 1 char letter words are stop words
	// . good for initials and some contractions
	//if ( len == 1 && is_alpha(*s) ) return true;

	// get from table
	return (char)s_table.getScoreFromTermId ( h );
}
*/


int32_t Msg24::getStoredSize ( ) {
	// store number of topics into 4 bytes
	int32_t size = 4;
	// store number of topics we have
	// all related topics that have scores >= m_minTopicScore
	for ( int32_t i = 0 ; i < m_numTopics ; i++ ) {
		// get group info
		//TopicGroup *t = &m_topicGroups[m_topicGids[i]];
		// break if buf is too small
		//if ( size + m_topicLens[i] + 2 + 8 > MAX_REPLY_LEN ) break;
		// include \0 and 4 byte score and 4 byte topic length
		size +=
			4 +                     // topic ptr
			4 +                     // topicScore
			4 +                     // topicLen
			4 +                     // numDocIds
			4 +                     // ptr to docids
			4 +                     // topic pop
			1 +                     // topic gid
			m_topicLens[i] + 1 +    // topic string with \0
			m_topicNumDocIds[i]*8;  // actual docids
	}
	return size;
}
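
// illustrative note, not from the original source: getStoredSize() must stay
// in lockstep with serialize() below -- per topic it accounts for six 4-byte
// array slots (ptr, score, len, numDocIds, docid-ptr placeholder, pop), one
// gid byte, the \0-terminated topic text, and 8 bytes per docid.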


// . serialize ourselves for the cache
// . returns bytes written
// . returns -1 and sets g_errno on error
// . just like serializing the reply
int32_t Msg24::serialize ( char *buf , int32_t bufLen ) {
	char *p = buf;
	// store number of topics
	*(int32_t *)p = m_numTopics; p += 4;
	// if no topics, bail
	if ( m_numTopics <= 0 ) return 4;
	// then the ptrs, with offset relative to m_topicPtrs[0] so
	// deserialize works
	char *base = m_topicPtrs[0];
	for ( int32_t i = 0 ; i < m_numTopics ; i++ ) {
		*(int32_t *)p = m_topicPtrs[i] - base; p += 4; }
	// then the scores
	gbmemcpy ( p , m_topicScores   , m_numTopics * 4 ); p += m_numTopics * 4;
	gbmemcpy ( p , m_topicLens     , m_numTopics * 4 ); p += m_numTopics * 4;
	gbmemcpy ( p , m_topicNumDocIds, m_numTopics * 4 ); p += m_numTopics * 4;
	// these m_topicDocIds are essentially just placeholders for ptrs
	// to the docids, just like the topic ptrs above, but these can all
	// be NULL if we didn't get back the list of docids for each gigabit
	p += m_numTopics * 4;
	// then the popularity rating of each topic
	gbmemcpy ( p , m_topicPops , m_numTopics * 4 ); p += m_numTopics * 4;
	gbmemcpy ( p , m_topicGids , m_numTopics     ); p += m_numTopics;
	// then the text
	for ( int32_t i = 0 ; i < m_numTopics ; i++ ) {
		gbmemcpy ( p , m_topicPtrs[i] , m_topicLens[i] ) ;
		p += m_topicLens[i];
		*p++ = '\0';
	}
	// and one array of docids per topic
	for ( int32_t i = 0 ; i < m_numTopics ; i++ ) {
		gbmemcpy ( p , m_topicDocIds[i] , m_topicNumDocIds[i] * 8 );
		p += m_topicNumDocIds[i] * 8;
		// sanity check
		//for ( int32_t k = 0 ; k < m_topicNumDocIds[i] ; k++ )
		//	if ( m_topicDocIds[i][k] & ~((int64_t)DOCID_MASK) ) {
		//		log("query: Msg24 bad docid in serialize.");
		//		char *xx = NULL; *xx = 0;
		//	}
	}
	// debug msg
	//log("in nt=%"INT32"",*nt);
	if ( p - buf > bufLen ) {
		log("query: Msg24 serialize overflow.");
		char *xx = NULL; *xx = 0;
	}
	return p - buf;
}
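
// illustrative note, not from the original source: serialize() stores each
// topic ptr as an offset from m_topicPtrs[0] so the buffer is position
// independent, and deserialize() below rebases those offsets onto wherever
// the text area lands in the new buffer. this assumes the topic strings are
// laid out contiguously, each followed by its \0, which is how getTopics()
// and serialize() itself write them.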


// . deserialize ourselves from the cache
// . returns bytes read
// . returns -1 and sets g_errno on error
// . Msg40 owns the buffer, so we can reference it without having to copy
int32_t Msg24::deserialize ( char *buf , int32_t bufLen ) {
	// sanity check, i've seen this happen before when the handler of
	// the Msg24 runs out of memory at a certain place and ends up
	// sending back a 0 length reply
	if ( bufLen < 4 ) {
		g_errno = EBADREPLY;
		log("query: Msg24::deserialize: bad reply.");
		return -1;
	}
	char *p = buf;
	m_numTopics = *(int32_t *)p; p += 4;
	// another sanity check, just in case
	if ( bufLen < m_numTopics * (6*4+1) ) {
		g_errno = EBADREPLY;
		log("query: Msg24::deserialize: bad reply 2.");
		return -1;
	}
	m_topicPtrs      = (char   **)p; p += m_numTopics * 4;
	m_topicScores    = (int32_t *)p; p += m_numTopics * 4;
	m_topicLens      = (int32_t *)p; p += m_numTopics * 4;
	m_topicNumDocIds = (int32_t *)p; p += m_numTopics * 4; //voters
	m_topicDocIds    = (int64_t**)p; p += m_numTopics * 4; //placehldrs
	m_topicPops      = (int32_t *)p; p += m_numTopics * 4;
	m_topicGids      = p;            p += m_numTopics;
	// . make ptrs to topic text
	// . we were just provided with offsets to make it portable
	char *off = p;
	for ( int32_t i = 0 ; i < m_numTopics ; i++ ) {
		m_topicPtrs[i] = (int32_t)m_topicPtrs[i] + off;
		p += m_topicLens[i] + 1;
	}
	// now for the array of docids per topic
	for ( int32_t i = 0 ; i < m_numTopics ; i++ ) {
		m_topicDocIds[i] = (int64_t *)p;
		p += m_topicNumDocIds[i] * 8;
		// sanity check
		//for ( int32_t k = 0 ; k < m_topicNumDocIds[i] ; k++ )
		//	if ( m_topicDocIds[i][k] & ~((int64_t)DOCID_MASK) ) {
		//		log("query: Msg24 bad docid in deserialize.");
		//		char *xx = NULL; *xx = 0;
		//	}
	}
	if ( p - buf > bufLen ) {
		log("query: Msg24 deserialize overflow.");
		char *xx = NULL; *xx = 0;
	}
	return p - buf;
}


// if we already have the msg20s, just generate the gigabits from those.
bool Msg24::generateTopicsLocal ( char *coll ,
				  int32_t collLen ,
				  char *query ,
				  int32_t queryLen ,
				  Msg20** msg20Ptrs ,
				  int32_t numMsg20s ,
				  char *clusterLevels ,
				  TopicGroup *topicGroups ,
				  int32_t numTopicGroups ,
				  unsigned char lang ) { // (aac)
	// force it to be true, since hi bit is set in pops if topic is unicode
	m_returnPops = true;
	// warning
	if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg24.");
	// force it
	m_returnDocIdCount = true;
	// if we don't get docids, then deserialize doesn't work because it
	// expects the docids to be valid.
	m_returnDocIds = true;
	// reset
	m_numTopics = 0;
	//m_docsToScanForTopics = docsToScanForTopics;
	//m_minTopicScore       = minTopicScore;
	//m_maxTopics           = maxTopics;
	m_numDocIds = 0;
	m_coll      = coll;
	m_collLen   = collLen;

	// bail if no operations to do
	int32_t numTopicsToGen = topicGroups->m_numTopics;
	// get the most we have to scan over all the topic groups
	int32_t docsToScanForTopics = topicGroups[0].m_docsToScanForTopics;

	for ( int32_t i = 1 ; i < numTopicGroups ; i++ ) {
		int32_t x = topicGroups[i].m_docsToScanForTopics ;
		if ( x > docsToScanForTopics ) docsToScanForTopics = x;

		if ( topicGroups[i].m_numTopics > numTopicsToGen )
			numTopicsToGen = topicGroups[i].m_numTopics;
	}
	// bail if none
	if ( docsToScanForTopics <= 0 ) return true;
	if ( numTopicsToGen == 0 ) return true;

	m_startTime = gettimeofdayInMilliseconds();

	// save, caller should not delete this!
	m_topicGroups    = topicGroups;
	m_numTopicGroups = numTopicGroups;
	// truncate
	//if ( maxTopics > MAX_TOPICS ) maxTopics = MAX_TOPICS;
	// truncate
	//if ( numDocIds > MAX_DOCIDS_TO_SCAN )
	//	numDocIds = MAX_DOCIDS_TO_SCAN ;
	//if ( numDocIds > docsToScanForTopics )
	//	numDocIds = docsToScanForTopics ;

	State24 st;
	st.m_slot        = NULL;
	st.m_niceness    = 0;
	st.m_numRequests = numMsg20s;
	st.m_numReplies  = numMsg20s;

	gbmemcpy ( st.m_query , query , queryLen );
	st.m_query [ queryLen ] = '\0';
	st.m_queryLen = queryLen;
	st.m_qq.set ( st.m_query , st.m_queryLen , NULL , 0, 2 , true );

	st.m_numTopicGroups = m_numTopicGroups;
	gbmemcpy(st.m_topicGroups, m_topicGroups,
		 sizeof(TopicGroup) * m_numTopicGroups);
	st.m_maxCacheAge      = 0;
	st.m_addToCache       = false;
	st.m_returnDocIdCount = m_returnDocIdCount;
	st.m_returnDocIds     = m_returnDocIds;
	st.m_returnPops       = true; // ??? use this in dedup vector?
	st.m_docIds           = NULL;
	st.m_numDocIds        = 0;
	st.m_clusterLevels    = clusterLevels;
	st.m_n                = 0;
	st.m_i                = 0;
	st.m_coll             = coll;
	st.m_msg20Ptrs        = msg20Ptrs;
	st.m_msg20            = NULL;

	TermTable master;
	if ( ! master.set ( 20000 , true , true ,
			    st.m_returnDocIdCount | st.m_returnDocIds ,
			    st.m_returnPops , true, false, NULL ) ) {
		log("topics: Could not allocate memory for topic generation.");
		return true;
	}

	char *buf = NULL;
	int32_t bufSize = 0;
	for ( int32_t i = 0 ; i < st.m_numTopicGroups ; i++ ) {
		// get ith topic group descriptor
		TopicGroup *t = &st.m_topicGroups[i];
		// . generate topics for this topic group
		// . serialize them into "p"
		// . getTopics will realloc() this "buf" to exactly the size
		//   it needs
		getTopics ( &st , t , &master , &st.m_qq , i ,
			    // getTopics will realloc this buffer
			    &buf , &bufSize , NULL , NULL , NULL, lang ); // (aac)
		// clear master table each time
		if ( i + 1 < st.m_numTopicGroups ) master.clear();
	}
	//}

	// free mem now to avoid fragmentation
	master.reset();
	deserialize ( buf , bufSize );

	// we are pointing into buf, but we want to make sure it gets freed
	// when we are done with it, so we make it our m_reply
	m_reply     = buf;
	m_replySize = bufSize;
	g_stats.addStat_r ( 0 ,
			    m_startTime ,
			    gettimeofdayInMilliseconds(),
			    "get_gigabits",
			    0x00d1e1ff ,
			    STAT_QUERY );
	return true;
}